2 Copyright (C) 2003-2021 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com> and
4 Ira Rosen <irar@il.ibm.com>
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
24 #include "coretypes.h"
31 #include "tree-pass.h"
33 #include "optabs-tree.h"
34 #include "diagnostic-core.h"
35 #include "fold-const.h"
36 #include "stor-layout.h"
39 #include "gimple-iterator.h"
40 #include "gimplify-me.h"
41 #include "tree-ssa-loop-ivopts.h"
42 #include "tree-ssa-loop-manip.h"
43 #include "tree-ssa-loop-niter.h"
44 #include "tree-ssa-loop.h"
46 #include "tree-scalar-evolution.h"
47 #include "tree-vectorizer.h"
48 #include "gimple-fold.h"
51 #include "tree-if-conv.h"
52 #include "internal-fn.h"
53 #include "tree-vector-builder.h"
54 #include "vec-perm-indices.h"
57 /* Loop Vectorization Pass.
59 This pass tries to vectorize loops.
61 For example, the vectorizer transforms the following simple loop:
63 short a[N]; short b[N]; short c[N]; int i;
69 as if it was manually vectorized by rewriting the source code into:
71 typedef int __attribute__((mode(V8HI))) v8hi;
72 short a[N]; short b[N]; short c[N]; int i;
73 v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
76 for (i=0; i<N/8; i++){
83 The main entry to this pass is vectorize_loops(), in which
84 the vectorizer applies a set of analyses on a given set of loops,
85 followed by the actual vectorization transformation for the loops that
86 had successfully passed the analysis phase.
87 Throughout this pass we make a distinction between two types of
88 data: scalars (which are represented by SSA_NAMES), and memory references
89 ("data-refs"). These two types of data require different handling both
90 during analysis and transformation. The types of data-refs that the
91 vectorizer currently supports are ARRAY_REFS which base is an array DECL
92 (not a pointer), and INDIRECT_REFS through pointers; both array and pointer
93 accesses are required to have a simple (consecutive) access pattern.
97 The driver for the analysis phase is vect_analyze_loop().
98 It applies a set of analyses, some of which rely on the scalar evolution
99 analyzer (scev) developed by Sebastian Pop.
101 During the analysis phase the vectorizer records some information
102 per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
103 loop, as well as general information about the loop as a whole, which is
104 recorded in a "loop_vec_info" struct attached to each loop.
106 Transformation phase:
107 =====================
108 The loop transformation phase scans all the stmts in the loop, and
109 creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
110 the loop that needs to be vectorized. It inserts the vector code sequence
111 just before the scalar stmt S, and records a pointer to the vector code
112 in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
113 attached to S). This pointer will be used for the vectorization of following
114 stmts which use the def of stmt S. Stmt S is removed if it writes to memory;
115 otherwise, we rely on dead code elimination for removing it.
117 For example, say stmt S1 was vectorized into stmt VS1:
120 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
123 To vectorize stmt S2, the vectorizer first finds the stmt that defines
124 the operand 'b' (S1), and gets the relevant vector def 'vb' from the
125 vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)). The
126 resulting sequence would be:
129 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
131 S2: a = b; STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2
133 Operands that are not SSA_NAMEs, are data-refs that appear in
134 load/store operations (like 'x[i]' in S1), and are handled differently.
138 Currently the only target specific information that is used is the
139 size of the vector (in bytes) - "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".
140 Targets that can support different sizes of vectors, for now will need
141 to specify one value for "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD". More
142 flexibility will be added in the future.
144 Since we only vectorize operations which vector form can be
145 expressed using existing tree codes, to verify that an operation is
146 supported, the vectorizer checks the relevant optab at the relevant
147 machine_mode (e.g, optab_handler (add_optab, V8HImode)). If
148 the value found is CODE_FOR_nothing, then there's no target support, and
149 we can't vectorize the stmt.
151 For additional information on this project see:
152 http://gcc.gnu.org/projects/tree-ssa/vectorization.html
155 static void vect_estimate_min_profitable_iters (loop_vec_info
, int *, int *);
156 static stmt_vec_info
vect_is_simple_reduction (loop_vec_info
, stmt_vec_info
,
159 /* Subroutine of vect_determine_vf_for_stmt that handles only one
160 statement. VECTYPE_MAYBE_SET_P is true if STMT_VINFO_VECTYPE
161 may already be set for general statements (not just data refs). */
164 vect_determine_vf_for_stmt_1 (vec_info
*vinfo
, stmt_vec_info stmt_info
,
165 bool vectype_maybe_set_p
,
168 gimple
*stmt
= stmt_info
->stmt
;
170 if ((!STMT_VINFO_RELEVANT_P (stmt_info
)
171 && !STMT_VINFO_LIVE_P (stmt_info
))
172 || gimple_clobber_p (stmt
))
174 if (dump_enabled_p ())
175 dump_printf_loc (MSG_NOTE
, vect_location
, "skip.\n");
176 return opt_result::success ();
179 tree stmt_vectype
, nunits_vectype
;
180 opt_result res
= vect_get_vector_types_for_stmt (vinfo
, stmt_info
,
188 if (STMT_VINFO_VECTYPE (stmt_info
))
189 /* The only case when a vectype had been already set is for stmts
190 that contain a data ref, or for "pattern-stmts" (stmts generated
191 by the vectorizer to represent/replace a certain idiom). */
192 gcc_assert ((STMT_VINFO_DATA_REF (stmt_info
)
193 || vectype_maybe_set_p
)
194 && STMT_VINFO_VECTYPE (stmt_info
) == stmt_vectype
);
196 STMT_VINFO_VECTYPE (stmt_info
) = stmt_vectype
;
200 vect_update_max_nunits (vf
, nunits_vectype
);
202 return opt_result::success ();
205 /* Subroutine of vect_determine_vectorization_factor. Set the vector
206 types of STMT_INFO and all attached pattern statements and update
207 the vectorization factor VF accordingly. Return true on success
208 or false if something prevented vectorization. */
211 vect_determine_vf_for_stmt (vec_info
*vinfo
,
212 stmt_vec_info stmt_info
, poly_uint64
*vf
)
214 if (dump_enabled_p ())
215 dump_printf_loc (MSG_NOTE
, vect_location
, "==> examining statement: %G",
217 opt_result res
= vect_determine_vf_for_stmt_1 (vinfo
, stmt_info
, false, vf
);
221 if (STMT_VINFO_IN_PATTERN_P (stmt_info
)
222 && STMT_VINFO_RELATED_STMT (stmt_info
))
224 gimple
*pattern_def_seq
= STMT_VINFO_PATTERN_DEF_SEQ (stmt_info
);
225 stmt_info
= STMT_VINFO_RELATED_STMT (stmt_info
);
227 /* If a pattern statement has def stmts, analyze them too. */
228 for (gimple_stmt_iterator si
= gsi_start (pattern_def_seq
);
229 !gsi_end_p (si
); gsi_next (&si
))
231 stmt_vec_info def_stmt_info
= vinfo
->lookup_stmt (gsi_stmt (si
));
232 if (dump_enabled_p ())
233 dump_printf_loc (MSG_NOTE
, vect_location
,
234 "==> examining pattern def stmt: %G",
235 def_stmt_info
->stmt
);
236 res
= vect_determine_vf_for_stmt_1 (vinfo
, def_stmt_info
, true, vf
);
241 if (dump_enabled_p ())
242 dump_printf_loc (MSG_NOTE
, vect_location
,
243 "==> examining pattern statement: %G",
245 res
= vect_determine_vf_for_stmt_1 (vinfo
, stmt_info
, true, vf
);
250 return opt_result::success ();
253 /* Function vect_determine_vectorization_factor
255 Determine the vectorization factor (VF). VF is the number of data elements
256 that are operated upon in parallel in a single iteration of the vectorized
257 loop. For example, when vectorizing a loop that operates on 4byte elements,
258 on a target with vector size (VS) 16byte, the VF is set to 4, since 4
259 elements can fit in a single vector register.
261 We currently support vectorization of loops in which all types operated upon
262 are of the same size. Therefore this function currently sets VF according to
263 the size of the types operated upon, and fails if there are multiple sizes
266 VF is also the factor by which the loop iterations are strip-mined, e.g.:
273 for (i=0; i<N; i+=VF){
274 a[i:VF] = b[i:VF] + c[i:VF];
279 vect_determine_vectorization_factor (loop_vec_info loop_vinfo
)
281 class loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
282 basic_block
*bbs
= LOOP_VINFO_BBS (loop_vinfo
);
283 unsigned nbbs
= loop
->num_nodes
;
284 poly_uint64 vectorization_factor
= 1;
285 tree scalar_type
= NULL_TREE
;
288 stmt_vec_info stmt_info
;
291 DUMP_VECT_SCOPE ("vect_determine_vectorization_factor");
293 for (i
= 0; i
< nbbs
; i
++)
295 basic_block bb
= bbs
[i
];
297 for (gphi_iterator si
= gsi_start_phis (bb
); !gsi_end_p (si
);
301 stmt_info
= loop_vinfo
->lookup_stmt (phi
);
302 if (dump_enabled_p ())
303 dump_printf_loc (MSG_NOTE
, vect_location
, "==> examining phi: %G",
306 gcc_assert (stmt_info
);
308 if (STMT_VINFO_RELEVANT_P (stmt_info
)
309 || STMT_VINFO_LIVE_P (stmt_info
))
311 gcc_assert (!STMT_VINFO_VECTYPE (stmt_info
));
312 scalar_type
= TREE_TYPE (PHI_RESULT (phi
));
314 if (dump_enabled_p ())
315 dump_printf_loc (MSG_NOTE
, vect_location
,
316 "get vectype for scalar type: %T\n",
319 vectype
= get_vectype_for_scalar_type (loop_vinfo
, scalar_type
);
321 return opt_result::failure_at (phi
,
322 "not vectorized: unsupported "
325 STMT_VINFO_VECTYPE (stmt_info
) = vectype
;
327 if (dump_enabled_p ())
328 dump_printf_loc (MSG_NOTE
, vect_location
, "vectype: %T\n",
331 if (dump_enabled_p ())
333 dump_printf_loc (MSG_NOTE
, vect_location
, "nunits = ");
334 dump_dec (MSG_NOTE
, TYPE_VECTOR_SUBPARTS (vectype
));
335 dump_printf (MSG_NOTE
, "\n");
338 vect_update_max_nunits (&vectorization_factor
, vectype
);
342 for (gimple_stmt_iterator si
= gsi_start_bb (bb
); !gsi_end_p (si
);
345 if (is_gimple_debug (gsi_stmt (si
)))
347 stmt_info
= loop_vinfo
->lookup_stmt (gsi_stmt (si
));
349 = vect_determine_vf_for_stmt (loop_vinfo
,
350 stmt_info
, &vectorization_factor
);
356 /* TODO: Analyze cost. Decide if worth while to vectorize. */
357 if (dump_enabled_p ())
359 dump_printf_loc (MSG_NOTE
, vect_location
, "vectorization factor = ");
360 dump_dec (MSG_NOTE
, vectorization_factor
);
361 dump_printf (MSG_NOTE
, "\n");
364 if (known_le (vectorization_factor
, 1U))
365 return opt_result::failure_at (vect_location
,
366 "not vectorized: unsupported data-type\n");
367 LOOP_VINFO_VECT_FACTOR (loop_vinfo
) = vectorization_factor
;
368 return opt_result::success ();
372 /* Function vect_is_simple_iv_evolution.
374 FORNOW: A simple evolution of an induction variables in the loop is
375 considered a polynomial evolution. */
378 vect_is_simple_iv_evolution (unsigned loop_nb
, tree access_fn
, tree
* init
,
383 tree evolution_part
= evolution_part_in_loop_num (access_fn
, loop_nb
);
386 /* When there is no evolution in this loop, the evolution function
388 if (evolution_part
== NULL_TREE
)
391 /* When the evolution is a polynomial of degree >= 2
392 the evolution function is not "simple". */
393 if (tree_is_chrec (evolution_part
))
396 step_expr
= evolution_part
;
397 init_expr
= unshare_expr (initial_condition_in_loop_num (access_fn
, loop_nb
));
399 if (dump_enabled_p ())
400 dump_printf_loc (MSG_NOTE
, vect_location
, "step: %T, init: %T\n",
401 step_expr
, init_expr
);
406 if (TREE_CODE (step_expr
) != INTEGER_CST
407 && (TREE_CODE (step_expr
) != SSA_NAME
408 || ((bb
= gimple_bb (SSA_NAME_DEF_STMT (step_expr
)))
409 && flow_bb_inside_loop_p (get_loop (cfun
, loop_nb
), bb
))
410 || (!INTEGRAL_TYPE_P (TREE_TYPE (step_expr
))
411 && (!SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr
))
412 || !flag_associative_math
)))
413 && (TREE_CODE (step_expr
) != REAL_CST
414 || !flag_associative_math
))
416 if (dump_enabled_p ())
417 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
425 /* Return true if PHI, described by STMT_INFO, is the inner PHI in
426 what we are assuming is a double reduction. For example, given
427 a structure like this:
430 x_1 = PHI <x_4(outer2), ...>;
434 x_2 = PHI <x_1(outer1), ...>;
440 x_4 = PHI <x_3(inner)>;
443 outer loop analysis would treat x_1 as a double reduction phi and
444 this function would then return true for x_2. */
447 vect_inner_phi_in_double_reduction_p (loop_vec_info loop_vinfo
, gphi
*phi
)
451 FOR_EACH_PHI_ARG (use_p
, phi
, op_iter
, SSA_OP_USE
)
452 if (stmt_vec_info def_info
= loop_vinfo
->lookup_def (USE_FROM_PTR (use_p
)))
453 if (STMT_VINFO_DEF_TYPE (def_info
) == vect_double_reduction_def
)
458 /* Function vect_analyze_scalar_cycles_1.
460 Examine the cross iteration def-use cycles of scalar variables
461 in LOOP. LOOP_VINFO represents the loop that is now being
462 considered for vectorization (can be LOOP, or an outer-loop
466 vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo
, class loop
*loop
)
468 basic_block bb
= loop
->header
;
470 auto_vec
<stmt_vec_info
, 64> worklist
;
472 bool double_reduc
, reduc_chain
;
474 DUMP_VECT_SCOPE ("vect_analyze_scalar_cycles");
476 /* First - identify all inductions. Reduction detection assumes that all the
477 inductions have been identified, therefore, this order must not be
479 for (gsi
= gsi_start_phis (bb
); !gsi_end_p (gsi
); gsi_next (&gsi
))
481 gphi
*phi
= gsi
.phi ();
482 tree access_fn
= NULL
;
483 tree def
= PHI_RESULT (phi
);
484 stmt_vec_info stmt_vinfo
= loop_vinfo
->lookup_stmt (phi
);
486 if (dump_enabled_p ())
487 dump_printf_loc (MSG_NOTE
, vect_location
, "Analyze phi: %G", phi
);
489 /* Skip virtual phi's. The data dependences that are associated with
490 virtual defs/uses (i.e., memory accesses) are analyzed elsewhere. */
491 if (virtual_operand_p (def
))
494 STMT_VINFO_DEF_TYPE (stmt_vinfo
) = vect_unknown_def_type
;
496 /* Analyze the evolution function. */
497 access_fn
= analyze_scalar_evolution (loop
, def
);
500 STRIP_NOPS (access_fn
);
501 if (dump_enabled_p ())
502 dump_printf_loc (MSG_NOTE
, vect_location
,
503 "Access function of PHI: %T\n", access_fn
);
504 STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo
)
505 = initial_condition_in_loop_num (access_fn
, loop
->num
);
506 STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo
)
507 = evolution_part_in_loop_num (access_fn
, loop
->num
);
511 || vect_inner_phi_in_double_reduction_p (loop_vinfo
, phi
)
512 || !vect_is_simple_iv_evolution (loop
->num
, access_fn
, &init
, &step
)
513 || (LOOP_VINFO_LOOP (loop_vinfo
) != loop
514 && TREE_CODE (step
) != INTEGER_CST
))
516 worklist
.safe_push (stmt_vinfo
);
520 gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo
)
522 gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo
) != NULL_TREE
);
524 if (dump_enabled_p ())
525 dump_printf_loc (MSG_NOTE
, vect_location
, "Detected induction.\n");
526 STMT_VINFO_DEF_TYPE (stmt_vinfo
) = vect_induction_def
;
530 /* Second - identify all reductions and nested cycles. */
531 while (worklist
.length () > 0)
533 stmt_vec_info stmt_vinfo
= worklist
.pop ();
534 gphi
*phi
= as_a
<gphi
*> (stmt_vinfo
->stmt
);
535 tree def
= PHI_RESULT (phi
);
537 if (dump_enabled_p ())
538 dump_printf_loc (MSG_NOTE
, vect_location
, "Analyze phi: %G", phi
);
540 gcc_assert (!virtual_operand_p (def
)
541 && STMT_VINFO_DEF_TYPE (stmt_vinfo
) == vect_unknown_def_type
);
543 stmt_vec_info reduc_stmt_info
544 = vect_is_simple_reduction (loop_vinfo
, stmt_vinfo
, &double_reduc
,
548 STMT_VINFO_REDUC_DEF (stmt_vinfo
) = reduc_stmt_info
;
549 STMT_VINFO_REDUC_DEF (reduc_stmt_info
) = stmt_vinfo
;
552 if (dump_enabled_p ())
553 dump_printf_loc (MSG_NOTE
, vect_location
,
554 "Detected double reduction.\n");
556 STMT_VINFO_DEF_TYPE (stmt_vinfo
) = vect_double_reduction_def
;
557 STMT_VINFO_DEF_TYPE (reduc_stmt_info
) = vect_double_reduction_def
;
561 if (loop
!= LOOP_VINFO_LOOP (loop_vinfo
))
563 if (dump_enabled_p ())
564 dump_printf_loc (MSG_NOTE
, vect_location
,
565 "Detected vectorizable nested cycle.\n");
567 STMT_VINFO_DEF_TYPE (stmt_vinfo
) = vect_nested_cycle
;
571 if (dump_enabled_p ())
572 dump_printf_loc (MSG_NOTE
, vect_location
,
573 "Detected reduction.\n");
575 STMT_VINFO_DEF_TYPE (stmt_vinfo
) = vect_reduction_def
;
576 STMT_VINFO_DEF_TYPE (reduc_stmt_info
) = vect_reduction_def
;
577 /* Store the reduction cycles for possible vectorization in
578 loop-aware SLP if it was not detected as reduction
581 LOOP_VINFO_REDUCTIONS (loop_vinfo
).safe_push
587 if (dump_enabled_p ())
588 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
589 "Unknown def-use cycle pattern.\n");
594 /* Function vect_analyze_scalar_cycles.
596 Examine the cross iteration def-use cycles of scalar variables, by
597 analyzing the loop-header PHIs of scalar variables. Classify each
598 cycle as one of the following: invariant, induction, reduction, unknown.
599 We do that for the loop represented by LOOP_VINFO, and also to its
600 inner-loop, if exists.
601 Examples for scalar cycles:
616 vect_analyze_scalar_cycles (loop_vec_info loop_vinfo
)
618 class loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
620 vect_analyze_scalar_cycles_1 (loop_vinfo
, loop
);
622 /* When vectorizing an outer-loop, the inner-loop is executed sequentially.
623 Reductions in such inner-loop therefore have different properties than
624 the reductions in the nest that gets vectorized:
625 1. When vectorized, they are executed in the same order as in the original
626 scalar loop, so we can't change the order of computation when
628 2. FIXME: Inner-loop reductions can be used in the inner-loop, so the
629 current checks are too strict. */
632 vect_analyze_scalar_cycles_1 (loop_vinfo
, loop
->inner
);
635 /* Transfer group and reduction information from STMT_INFO to its
639 vect_fixup_reduc_chain (stmt_vec_info stmt_info
)
641 stmt_vec_info firstp
= STMT_VINFO_RELATED_STMT (stmt_info
);
643 gcc_assert (!REDUC_GROUP_FIRST_ELEMENT (firstp
)
644 && REDUC_GROUP_FIRST_ELEMENT (stmt_info
));
645 REDUC_GROUP_SIZE (firstp
) = REDUC_GROUP_SIZE (stmt_info
);
648 stmtp
= STMT_VINFO_RELATED_STMT (stmt_info
);
649 gcc_checking_assert (STMT_VINFO_DEF_TYPE (stmtp
)
650 == STMT_VINFO_DEF_TYPE (stmt_info
));
651 REDUC_GROUP_FIRST_ELEMENT (stmtp
) = firstp
;
652 stmt_info
= REDUC_GROUP_NEXT_ELEMENT (stmt_info
);
654 REDUC_GROUP_NEXT_ELEMENT (stmtp
)
655 = STMT_VINFO_RELATED_STMT (stmt_info
);
660 /* Fixup scalar cycles that now have their stmts detected as patterns. */
663 vect_fixup_scalar_cycles_with_patterns (loop_vec_info loop_vinfo
)
668 FOR_EACH_VEC_ELT (LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo
), i
, first
)
670 stmt_vec_info next
= REDUC_GROUP_NEXT_ELEMENT (first
);
673 if ((STMT_VINFO_IN_PATTERN_P (next
)
674 != STMT_VINFO_IN_PATTERN_P (first
))
675 || STMT_VINFO_REDUC_IDX (vect_stmt_to_vectorize (next
)) == -1)
677 next
= REDUC_GROUP_NEXT_ELEMENT (next
);
679 /* If all reduction chain members are well-formed patterns adjust
680 the group to group the pattern stmts instead. */
682 && STMT_VINFO_REDUC_IDX (vect_stmt_to_vectorize (first
)) != -1)
684 if (STMT_VINFO_IN_PATTERN_P (first
))
686 vect_fixup_reduc_chain (first
);
687 LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo
)[i
]
688 = STMT_VINFO_RELATED_STMT (first
);
691 /* If not all stmt in the chain are patterns or if we failed
692 to update STMT_VINFO_REDUC_IDX dissolve the chain and handle
693 it as regular reduction instead. */
696 stmt_vec_info vinfo
= first
;
697 stmt_vec_info last
= NULL
;
700 next
= REDUC_GROUP_NEXT_ELEMENT (vinfo
);
701 REDUC_GROUP_FIRST_ELEMENT (vinfo
) = NULL
;
702 REDUC_GROUP_NEXT_ELEMENT (vinfo
) = NULL
;
706 STMT_VINFO_DEF_TYPE (vect_stmt_to_vectorize (first
))
708 loop_vinfo
->reductions
.safe_push (vect_stmt_to_vectorize (last
));
709 LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo
).unordered_remove (i
);
715 /* Function vect_get_loop_niters.
717 Determine how many iterations the loop is executed and place it
718 in NUMBER_OF_ITERATIONS. Place the number of latch iterations
719 in NUMBER_OF_ITERATIONSM1. Place the condition under which the
720 niter information holds in ASSUMPTIONS.
722 Return the loop exit condition. */
726 vect_get_loop_niters (class loop
*loop
, tree
*assumptions
,
727 tree
*number_of_iterations
, tree
*number_of_iterationsm1
)
729 edge exit
= single_exit (loop
);
730 class tree_niter_desc niter_desc
;
731 tree niter_assumptions
, niter
, may_be_zero
;
732 gcond
*cond
= get_loop_exit_condition (loop
);
734 *assumptions
= boolean_true_node
;
735 *number_of_iterationsm1
= chrec_dont_know
;
736 *number_of_iterations
= chrec_dont_know
;
737 DUMP_VECT_SCOPE ("get_loop_niters");
742 may_be_zero
= NULL_TREE
;
743 if (!number_of_iterations_exit_assumptions (loop
, exit
, &niter_desc
, NULL
)
744 || chrec_contains_undetermined (niter_desc
.niter
))
747 niter_assumptions
= niter_desc
.assumptions
;
748 may_be_zero
= niter_desc
.may_be_zero
;
749 niter
= niter_desc
.niter
;
751 if (may_be_zero
&& integer_zerop (may_be_zero
))
752 may_be_zero
= NULL_TREE
;
756 if (COMPARISON_CLASS_P (may_be_zero
))
758 /* Try to combine may_be_zero with assumptions, this can simplify
759 computation of niter expression. */
760 if (niter_assumptions
&& !integer_nonzerop (niter_assumptions
))
761 niter_assumptions
= fold_build2 (TRUTH_AND_EXPR
, boolean_type_node
,
763 fold_build1 (TRUTH_NOT_EXPR
,
767 niter
= fold_build3 (COND_EXPR
, TREE_TYPE (niter
), may_be_zero
,
768 build_int_cst (TREE_TYPE (niter
), 0),
769 rewrite_to_non_trapping_overflow (niter
));
771 may_be_zero
= NULL_TREE
;
773 else if (integer_nonzerop (may_be_zero
))
775 *number_of_iterationsm1
= build_int_cst (TREE_TYPE (niter
), 0);
776 *number_of_iterations
= build_int_cst (TREE_TYPE (niter
), 1);
783 *assumptions
= niter_assumptions
;
784 *number_of_iterationsm1
= niter
;
786 /* We want the number of loop header executions which is the number
787 of latch executions plus one.
788 ??? For UINT_MAX latch executions this number overflows to zero
789 for loops like do { n++; } while (n != 0); */
790 if (niter
&& !chrec_contains_undetermined (niter
))
791 niter
= fold_build2 (PLUS_EXPR
, TREE_TYPE (niter
), unshare_expr (niter
),
792 build_int_cst (TREE_TYPE (niter
), 1));
793 *number_of_iterations
= niter
;
798 /* Function bb_in_loop_p
800 Used as predicate for dfs order traversal of the loop bbs. */
803 bb_in_loop_p (const_basic_block bb
, const void *data
)
805 const class loop
*const loop
= (const class loop
*)data
;
806 if (flow_bb_inside_loop_p (loop
, bb
))
812 /* Create and initialize a new loop_vec_info struct for LOOP_IN, as well as
813 stmt_vec_info structs for all the stmts in LOOP_IN. */
815 _loop_vec_info::_loop_vec_info (class loop
*loop_in
, vec_info_shared
*shared
)
816 : vec_info (vec_info::loop
, init_cost (loop_in
), shared
),
818 bbs (XCNEWVEC (basic_block
, loop
->num_nodes
)),
819 num_itersm1 (NULL_TREE
),
820 num_iters (NULL_TREE
),
821 num_iters_unchanged (NULL_TREE
),
822 num_iters_assumptions (NULL_TREE
),
824 versioning_threshold (0),
825 vectorization_factor (0),
826 max_vectorization_factor (0),
827 mask_skip_niters (NULL_TREE
),
828 rgroup_compare_type (NULL_TREE
),
829 simd_if_cond (NULL_TREE
),
831 peeling_for_alignment (0),
835 slp_unrolling_factor (1),
836 single_scalar_iteration_cost (0),
837 vec_outside_cost (0),
839 vectorizable (false),
840 can_use_partial_vectors_p (param_vect_partial_vector_usage
!= 0),
841 using_partial_vectors_p (false),
842 epil_using_partial_vectors_p (false),
843 peeling_for_gaps (false),
844 peeling_for_niter (false),
845 no_data_dependencies (false),
846 has_mask_store (false),
847 scalar_loop_scaling (profile_probability::uninitialized ()),
849 orig_loop_info (NULL
)
851 /* CHECKME: We want to visit all BBs before their successors (except for
852 latch blocks, for which this assertion wouldn't hold). In the simple
853 case of the loop forms we allow, a dfs order of the BBs would the same
854 as reversed postorder traversal, so we are safe. */
856 unsigned int nbbs
= dfs_enumerate_from (loop
->header
, 0, bb_in_loop_p
,
857 bbs
, loop
->num_nodes
, loop
);
858 gcc_assert (nbbs
== loop
->num_nodes
);
860 for (unsigned int i
= 0; i
< nbbs
; i
++)
862 basic_block bb
= bbs
[i
];
863 gimple_stmt_iterator si
;
865 for (si
= gsi_start_phis (bb
); !gsi_end_p (si
); gsi_next (&si
))
867 gimple
*phi
= gsi_stmt (si
);
868 gimple_set_uid (phi
, 0);
872 for (si
= gsi_start_bb (bb
); !gsi_end_p (si
); gsi_next (&si
))
874 gimple
*stmt
= gsi_stmt (si
);
875 gimple_set_uid (stmt
, 0);
876 if (is_gimple_debug (stmt
))
879 /* If .GOMP_SIMD_LANE call for the current loop has 3 arguments, the
880 third argument is the #pragma omp simd if (x) condition, when 0,
881 loop shouldn't be vectorized, when non-zero constant, it should
882 be vectorized normally, otherwise versioned with vectorized loop
883 done if the condition is non-zero at runtime. */
885 && is_gimple_call (stmt
)
886 && gimple_call_internal_p (stmt
)
887 && gimple_call_internal_fn (stmt
) == IFN_GOMP_SIMD_LANE
888 && gimple_call_num_args (stmt
) >= 3
889 && TREE_CODE (gimple_call_arg (stmt
, 0)) == SSA_NAME
891 == SSA_NAME_VAR (gimple_call_arg (stmt
, 0))))
893 tree arg
= gimple_call_arg (stmt
, 2);
894 if (integer_zerop (arg
) || TREE_CODE (arg
) == SSA_NAME
)
897 gcc_assert (integer_nonzerop (arg
));
902 epilogue_vinfos
.create (6);
905 /* Free all levels of rgroup CONTROLS. */
908 release_vec_loop_controls (vec
<rgroup_controls
> *controls
)
910 rgroup_controls
*rgc
;
912 FOR_EACH_VEC_ELT (*controls
, i
, rgc
)
913 rgc
->controls
.release ();
914 controls
->release ();
917 /* Free all memory used by the _loop_vec_info, as well as all the
918 stmt_vec_info structs of all the stmts in the loop. */
920 _loop_vec_info::~_loop_vec_info ()
924 release_vec_loop_controls (&masks
);
925 release_vec_loop_controls (&lens
);
928 epilogue_vinfos
.release ();
930 /* When we release an epiloge vinfo that we do not intend to use
931 avoid clearing AUX of the main loop which should continue to
932 point to the main loop vinfo since otherwise we'll leak that. */
933 if (loop
->aux
== this)
937 /* Return an invariant or register for EXPR and emit necessary
938 computations in the LOOP_VINFO loop preheader. */
941 cse_and_gimplify_to_preheader (loop_vec_info loop_vinfo
, tree expr
)
943 if (is_gimple_reg (expr
)
944 || is_gimple_min_invariant (expr
))
947 if (! loop_vinfo
->ivexpr_map
)
948 loop_vinfo
->ivexpr_map
= new hash_map
<tree_operand_hash
, tree
>;
949 tree
&cached
= loop_vinfo
->ivexpr_map
->get_or_insert (expr
);
952 gimple_seq stmts
= NULL
;
953 cached
= force_gimple_operand (unshare_expr (expr
),
954 &stmts
, true, NULL_TREE
);
957 edge e
= loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo
));
958 gsi_insert_seq_on_edge_immediate (e
, stmts
);
964 /* Return true if we can use CMP_TYPE as the comparison type to produce
965 all masks required to mask LOOP_VINFO. */
968 can_produce_all_loop_masks_p (loop_vec_info loop_vinfo
, tree cmp_type
)
970 rgroup_controls
*rgm
;
972 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo
), i
, rgm
)
973 if (rgm
->type
!= NULL_TREE
974 && !direct_internal_fn_supported_p (IFN_WHILE_ULT
,
981 /* Calculate the maximum number of scalars per iteration for every
982 rgroup in LOOP_VINFO. */
985 vect_get_max_nscalars_per_iter (loop_vec_info loop_vinfo
)
987 unsigned int res
= 1;
989 rgroup_controls
*rgm
;
990 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo
), i
, rgm
)
991 res
= MAX (res
, rgm
->max_nscalars_per_iter
);
995 /* Calculate the minimum precision necessary to represent:
999 as an unsigned integer, where MAX_NITERS is the maximum number of
1000 loop header iterations for the original scalar form of LOOP_VINFO. */
1003 vect_min_prec_for_max_niters (loop_vec_info loop_vinfo
, unsigned int factor
)
1005 class loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
1007 /* Get the maximum number of iterations that is representable
1008 in the counter type. */
1009 tree ni_type
= TREE_TYPE (LOOP_VINFO_NITERSM1 (loop_vinfo
));
1010 widest_int max_ni
= wi::to_widest (TYPE_MAX_VALUE (ni_type
)) + 1;
1012 /* Get a more refined estimate for the number of iterations. */
1013 widest_int max_back_edges
;
1014 if (max_loop_iterations (loop
, &max_back_edges
))
1015 max_ni
= wi::smin (max_ni
, max_back_edges
+ 1);
1017 /* Work out how many bits we need to represent the limit. */
1018 return wi::min_precision (max_ni
* factor
, UNSIGNED
);
1021 /* True if the loop needs peeling or partial vectors when vectorized. */
1024 vect_need_peeling_or_partial_vectors_p (loop_vec_info loop_vinfo
)
1026 unsigned HOST_WIDE_INT const_vf
;
1027 HOST_WIDE_INT max_niter
1028 = likely_max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo
));
1030 unsigned th
= LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo
);
1031 if (!th
&& LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo
))
1032 th
= LOOP_VINFO_COST_MODEL_THRESHOLD (LOOP_VINFO_ORIG_LOOP_INFO
1035 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo
)
1036 && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo
) >= 0)
1038 /* Work out the (constant) number of iterations that need to be
1039 peeled for reasons other than niters. */
1040 unsigned int peel_niter
= LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo
);
1041 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo
))
1043 if (!multiple_p (LOOP_VINFO_INT_NITERS (loop_vinfo
) - peel_niter
,
1044 LOOP_VINFO_VECT_FACTOR (loop_vinfo
)))
1047 else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo
)
1048 /* ??? When peeling for gaps but not alignment, we could
1049 try to check whether the (variable) niters is known to be
1050 VF * N + 1. That's something of a niche case though. */
1051 || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo
)
1052 || !LOOP_VINFO_VECT_FACTOR (loop_vinfo
).is_constant (&const_vf
)
1053 || ((tree_ctz (LOOP_VINFO_NITERS (loop_vinfo
))
1054 < (unsigned) exact_log2 (const_vf
))
1055 /* In case of versioning, check if the maximum number of
1056 iterations is greater than th. If they are identical,
1057 the epilogue is unnecessary. */
1058 && (!LOOP_REQUIRES_VERSIONING (loop_vinfo
)
1059 || ((unsigned HOST_WIDE_INT
) max_niter
1060 > (th
/ const_vf
) * const_vf
))))
1066 /* Each statement in LOOP_VINFO can be masked where necessary. Check
1067 whether we can actually generate the masks required. Return true if so,
1068 storing the type of the scalar IV in LOOP_VINFO_RGROUP_COMPARE_TYPE. */
1071 vect_verify_full_masking (loop_vec_info loop_vinfo
)
1073 unsigned int min_ni_width
;
1074 unsigned int max_nscalars_per_iter
1075 = vect_get_max_nscalars_per_iter (loop_vinfo
);
1077 /* Use a normal loop if there are no statements that need masking.
1078 This only happens in rare degenerate cases: it means that the loop
1079 has no loads, no stores, and no live-out values. */
1080 if (LOOP_VINFO_MASKS (loop_vinfo
).is_empty ())
1083 /* Work out how many bits we need to represent the limit. */
1085 = vect_min_prec_for_max_niters (loop_vinfo
, max_nscalars_per_iter
);
1087 /* Find a scalar mode for which WHILE_ULT is supported. */
1088 opt_scalar_int_mode cmp_mode_iter
;
1089 tree cmp_type
= NULL_TREE
;
1090 tree iv_type
= NULL_TREE
;
1091 widest_int iv_limit
= vect_iv_limit_for_partial_vectors (loop_vinfo
);
1092 unsigned int iv_precision
= UINT_MAX
;
1095 iv_precision
= wi::min_precision (iv_limit
* max_nscalars_per_iter
,
1098 FOR_EACH_MODE_IN_CLASS (cmp_mode_iter
, MODE_INT
)
1100 unsigned int cmp_bits
= GET_MODE_BITSIZE (cmp_mode_iter
.require ());
1101 if (cmp_bits
>= min_ni_width
1102 && targetm
.scalar_mode_supported_p (cmp_mode_iter
.require ()))
1104 tree this_type
= build_nonstandard_integer_type (cmp_bits
, true);
1106 && can_produce_all_loop_masks_p (loop_vinfo
, this_type
))
1108 /* Although we could stop as soon as we find a valid mode,
1109 there are at least two reasons why that's not always the
1112 - An IV that's Pmode or wider is more likely to be reusable
1113 in address calculations than an IV that's narrower than
1116 - Doing the comparison in IV_PRECISION or wider allows
1117 a natural 0-based IV, whereas using a narrower comparison
1118 type requires mitigations against wrap-around.
1120 Conversely, if the IV limit is variable, doing the comparison
1121 in a wider type than the original type can introduce
1122 unnecessary extensions, so picking the widest valid mode
1123 is not always a good choice either.
1125 Here we prefer the first IV type that's Pmode or wider,
1126 and the first comparison type that's IV_PRECISION or wider.
1127 (The comparison type must be no wider than the IV type,
1128 to avoid extensions in the vector loop.)
1130 ??? We might want to try continuing beyond Pmode for ILP32
1131 targets if CMP_BITS < IV_PRECISION. */
1132 iv_type
= this_type
;
1133 if (!cmp_type
|| iv_precision
> TYPE_PRECISION (cmp_type
))
1134 cmp_type
= this_type
;
1135 if (cmp_bits
>= GET_MODE_BITSIZE (Pmode
))
1144 LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo
) = cmp_type
;
1145 LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo
) = iv_type
;
1149 /* Check whether we can use vector access with length based on precison
1150 comparison. So far, to keep it simple, we only allow the case that the
1151 precision of the target supported length is larger than the precision
1152 required by loop niters. */
1155 vect_verify_loop_lens (loop_vec_info loop_vinfo
)
1157 if (LOOP_VINFO_LENS (loop_vinfo
).is_empty ())
1160 unsigned int max_nitems_per_iter
= 1;
1162 rgroup_controls
*rgl
;
1163 /* Find the maximum number of items per iteration for every rgroup. */
1164 FOR_EACH_VEC_ELT (LOOP_VINFO_LENS (loop_vinfo
), i
, rgl
)
1166 unsigned nitems_per_iter
= rgl
->max_nscalars_per_iter
* rgl
->factor
;
1167 max_nitems_per_iter
= MAX (max_nitems_per_iter
, nitems_per_iter
);
1170 /* Work out how many bits we need to represent the length limit. */
1171 unsigned int min_ni_prec
1172 = vect_min_prec_for_max_niters (loop_vinfo
, max_nitems_per_iter
);
1174 /* Now use the maximum of below precisions for one suitable IV type:
1175 - the IV's natural precision
1176 - the precision needed to hold: the maximum number of scalar
1177 iterations multiplied by the scale factor (min_ni_prec above)
1178 - the Pmode precision
1180 If min_ni_prec is less than the precision of the current niters,
1181 we perfer to still use the niters type. Prefer to use Pmode and
1182 wider IV to avoid narrow conversions. */
1184 unsigned int ni_prec
1185 = TYPE_PRECISION (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo
)));
1186 min_ni_prec
= MAX (min_ni_prec
, ni_prec
);
1187 min_ni_prec
= MAX (min_ni_prec
, GET_MODE_BITSIZE (Pmode
));
1189 tree iv_type
= NULL_TREE
;
1190 opt_scalar_int_mode tmode_iter
;
1191 FOR_EACH_MODE_IN_CLASS (tmode_iter
, MODE_INT
)
1193 scalar_mode tmode
= tmode_iter
.require ();
1194 unsigned int tbits
= GET_MODE_BITSIZE (tmode
);
1196 /* ??? Do we really want to construct one IV whose precision exceeds
1198 if (tbits
> BITS_PER_WORD
)
1201 /* Find the first available standard integral type. */
1202 if (tbits
>= min_ni_prec
&& targetm
.scalar_mode_supported_p (tmode
))
1204 iv_type
= build_nonstandard_integer_type (tbits
, true);
1211 if (dump_enabled_p ())
1212 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1213 "can't vectorize with length-based partial vectors"
1214 " because there is no suitable iv type.\n");
1218 LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo
) = iv_type
;
1219 LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo
) = iv_type
;
1224 /* Calculate the cost of one scalar iteration of the loop. */
1226 vect_compute_single_scalar_iteration_cost (loop_vec_info loop_vinfo
)
1228 class loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
1229 basic_block
*bbs
= LOOP_VINFO_BBS (loop_vinfo
);
1230 int nbbs
= loop
->num_nodes
, factor
;
1231 int innerloop_iters
, i
;
1233 DUMP_VECT_SCOPE ("vect_compute_single_scalar_iteration_cost");
1235 /* Gather costs for statements in the scalar loop. */
1238 innerloop_iters
= 1;
1240 innerloop_iters
= 50; /* FIXME */
1242 for (i
= 0; i
< nbbs
; i
++)
1244 gimple_stmt_iterator si
;
1245 basic_block bb
= bbs
[i
];
1247 if (bb
->loop_father
== loop
->inner
)
1248 factor
= innerloop_iters
;
1252 for (si
= gsi_start_bb (bb
); !gsi_end_p (si
); gsi_next (&si
))
1254 gimple
*stmt
= gsi_stmt (si
);
1255 stmt_vec_info stmt_info
= loop_vinfo
->lookup_stmt (stmt
);
1257 if (!is_gimple_assign (stmt
) && !is_gimple_call (stmt
))
1260 /* Skip stmts that are not vectorized inside the loop. */
1261 stmt_vec_info vstmt_info
= vect_stmt_to_vectorize (stmt_info
);
1262 if (!STMT_VINFO_RELEVANT_P (vstmt_info
)
1263 && (!STMT_VINFO_LIVE_P (vstmt_info
)
1264 || !VECTORIZABLE_CYCLE_DEF
1265 (STMT_VINFO_DEF_TYPE (vstmt_info
))))
1268 vect_cost_for_stmt kind
;
1269 if (STMT_VINFO_DATA_REF (stmt_info
))
1271 if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info
)))
1274 kind
= scalar_store
;
1276 else if (vect_nop_conversion_p (stmt_info
))
1281 record_stmt_cost (&LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo
),
1282 factor
, kind
, stmt_info
, 0, vect_prologue
);
1286 /* Now accumulate cost. */
1287 void *target_cost_data
= init_cost (loop
);
1288 stmt_info_for_cost
*si
;
1290 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo
),
1292 (void) add_stmt_cost (loop_vinfo
, target_cost_data
, si
->count
,
1293 si
->kind
, si
->stmt_info
, si
->vectype
,
1294 si
->misalign
, vect_body
);
1295 unsigned dummy
, body_cost
= 0;
1296 finish_cost (target_cost_data
, &dummy
, &body_cost
, &dummy
);
1297 destroy_cost_data (target_cost_data
);
1298 LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo
) = body_cost
;
1302 /* Function vect_analyze_loop_form_1.
1304 Verify that certain CFG restrictions hold, including:
1305 - the loop has a pre-header
1306 - the loop has a single entry and exit
1307 - the loop exit condition is simple enough
1308 - the number of iterations can be analyzed, i.e, a countable loop. The
1309 niter could be analyzed under some assumptions. */
1312 vect_analyze_loop_form_1 (class loop
*loop
, gcond
**loop_cond
,
1313 tree
*assumptions
, tree
*number_of_iterationsm1
,
1314 tree
*number_of_iterations
, gcond
**inner_loop_cond
)
1316 DUMP_VECT_SCOPE ("vect_analyze_loop_form");
1318 /* Different restrictions apply when we are considering an inner-most loop,
1319 vs. an outer (nested) loop.
1320 (FORNOW. May want to relax some of these restrictions in the future). */
1324 /* Inner-most loop. We currently require that the number of BBs is
1325 exactly 2 (the header and latch). Vectorizable inner-most loops
1336 if (loop
->num_nodes
!= 2)
1337 return opt_result::failure_at (vect_location
,
1339 " control flow in loop.\n");
1341 if (empty_block_p (loop
->header
))
1342 return opt_result::failure_at (vect_location
,
1343 "not vectorized: empty loop.\n");
1347 class loop
*innerloop
= loop
->inner
;
1350 /* Nested loop. We currently require that the loop is doubly-nested,
1351 contains a single inner loop, and the number of BBs is exactly 5.
1352 Vectorizable outer-loops look like this:
1364 The inner-loop has the properties expected of inner-most loops
1365 as described above. */
1367 if ((loop
->inner
)->inner
|| (loop
->inner
)->next
)
1368 return opt_result::failure_at (vect_location
,
1370 " multiple nested loops.\n");
1372 if (loop
->num_nodes
!= 5)
1373 return opt_result::failure_at (vect_location
,
1375 " control flow in loop.\n");
1377 entryedge
= loop_preheader_edge (innerloop
);
1378 if (entryedge
->src
!= loop
->header
1379 || !single_exit (innerloop
)
1380 || single_exit (innerloop
)->dest
!= EDGE_PRED (loop
->latch
, 0)->src
)
1381 return opt_result::failure_at (vect_location
,
1383 " unsupported outerloop form.\n");
1385 /* Analyze the inner-loop. */
1386 tree inner_niterm1
, inner_niter
, inner_assumptions
;
1388 = vect_analyze_loop_form_1 (loop
->inner
, inner_loop_cond
,
1389 &inner_assumptions
, &inner_niterm1
,
1390 &inner_niter
, NULL
);
1393 if (dump_enabled_p ())
1394 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1395 "not vectorized: Bad inner loop.\n");
1399 /* Don't support analyzing niter under assumptions for inner
1401 if (!integer_onep (inner_assumptions
))
1402 return opt_result::failure_at (vect_location
,
1403 "not vectorized: Bad inner loop.\n");
1405 if (!expr_invariant_in_loop_p (loop
, inner_niter
))
1406 return opt_result::failure_at (vect_location
,
1407 "not vectorized: inner-loop count not"
1410 if (dump_enabled_p ())
1411 dump_printf_loc (MSG_NOTE
, vect_location
,
1412 "Considering outer-loop vectorization.\n");
1415 if (!single_exit (loop
))
1416 return opt_result::failure_at (vect_location
,
1417 "not vectorized: multiple exits.\n");
1418 if (EDGE_COUNT (loop
->header
->preds
) != 2)
1419 return opt_result::failure_at (vect_location
,
1421 " too many incoming edges.\n");
1423 /* We assume that the loop exit condition is at the end of the loop. i.e,
1424 that the loop is represented as a do-while (with a proper if-guard
1425 before the loop if needed), where the loop header contains all the
1426 executable statements, and the latch is empty. */
1427 if (!empty_block_p (loop
->latch
)
1428 || !gimple_seq_empty_p (phi_nodes (loop
->latch
)))
1429 return opt_result::failure_at (vect_location
,
1430 "not vectorized: latch block not empty.\n");
1432 /* Make sure the exit is not abnormal. */
1433 edge e
= single_exit (loop
);
1434 if (e
->flags
& EDGE_ABNORMAL
)
1435 return opt_result::failure_at (vect_location
,
1437 " abnormal loop exit edge.\n");
1439 *loop_cond
= vect_get_loop_niters (loop
, assumptions
, number_of_iterations
,
1440 number_of_iterationsm1
);
1442 return opt_result::failure_at
1444 "not vectorized: complicated exit condition.\n");
1446 if (integer_zerop (*assumptions
)
1447 || !*number_of_iterations
1448 || chrec_contains_undetermined (*number_of_iterations
))
1449 return opt_result::failure_at
1451 "not vectorized: number of iterations cannot be computed.\n");
1453 if (integer_zerop (*number_of_iterations
))
1454 return opt_result::failure_at
1456 "not vectorized: number of iterations = 0.\n");
1458 return opt_result::success ();
1461 /* Analyze LOOP form and return a loop_vec_info if it is of suitable form. */
1464 vect_analyze_loop_form (class loop
*loop
, vec_info_shared
*shared
)
1466 tree assumptions
, number_of_iterations
, number_of_iterationsm1
;
1467 gcond
*loop_cond
, *inner_loop_cond
= NULL
;
1470 = vect_analyze_loop_form_1 (loop
, &loop_cond
,
1471 &assumptions
, &number_of_iterationsm1
,
1472 &number_of_iterations
, &inner_loop_cond
);
1474 return opt_loop_vec_info::propagate_failure (res
);
1476 loop_vec_info loop_vinfo
= new _loop_vec_info (loop
, shared
);
1477 LOOP_VINFO_NITERSM1 (loop_vinfo
) = number_of_iterationsm1
;
1478 LOOP_VINFO_NITERS (loop_vinfo
) = number_of_iterations
;
1479 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo
) = number_of_iterations
;
1480 if (!integer_onep (assumptions
))
1482 /* We consider to vectorize this loop by versioning it under
1483 some assumptions. In order to do this, we need to clear
1484 existing information computed by scev and niter analyzer. */
1486 free_numbers_of_iterations_estimates (loop
);
1487 /* Also set flag for this loop so that following scev and niter
1488 analysis are done under the assumptions. */
1489 loop_constraint_set (loop
, LOOP_C_FINITE
);
1490 /* Also record the assumptions for versioning. */
1491 LOOP_VINFO_NITERS_ASSUMPTIONS (loop_vinfo
) = assumptions
;
1494 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo
))
1496 if (dump_enabled_p ())
1498 dump_printf_loc (MSG_NOTE
, vect_location
,
1499 "Symbolic number of iterations is ");
1500 dump_generic_expr (MSG_NOTE
, TDF_DETAILS
, number_of_iterations
);
1501 dump_printf (MSG_NOTE
, "\n");
1505 stmt_vec_info loop_cond_info
= loop_vinfo
->lookup_stmt (loop_cond
);
1506 STMT_VINFO_TYPE (loop_cond_info
) = loop_exit_ctrl_vec_info_type
;
1507 if (inner_loop_cond
)
1509 stmt_vec_info inner_loop_cond_info
1510 = loop_vinfo
->lookup_stmt (inner_loop_cond
);
1511 STMT_VINFO_TYPE (inner_loop_cond_info
) = loop_exit_ctrl_vec_info_type
;
1514 gcc_assert (!loop
->aux
);
1515 loop
->aux
= loop_vinfo
;
1516 return opt_loop_vec_info::success (loop_vinfo
);
1521 /* Scan the loop stmts and dependent on whether there are any (non-)SLP
1522 statements update the vectorization factor. */
1525 vect_update_vf_for_slp (loop_vec_info loop_vinfo
)
1527 class loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
1528 basic_block
*bbs
= LOOP_VINFO_BBS (loop_vinfo
);
1529 int nbbs
= loop
->num_nodes
;
1530 poly_uint64 vectorization_factor
;
1533 DUMP_VECT_SCOPE ("vect_update_vf_for_slp");
1535 vectorization_factor
= LOOP_VINFO_VECT_FACTOR (loop_vinfo
);
1536 gcc_assert (known_ne (vectorization_factor
, 0U));
1538 /* If all the stmts in the loop can be SLPed, we perform only SLP, and
1539 vectorization factor of the loop is the unrolling factor required by
1540 the SLP instances. If that unrolling factor is 1, we say, that we
1541 perform pure SLP on loop - cross iteration parallelism is not
1543 bool only_slp_in_loop
= true;
1544 for (i
= 0; i
< nbbs
; i
++)
1546 basic_block bb
= bbs
[i
];
1547 for (gphi_iterator si
= gsi_start_phis (bb
); !gsi_end_p (si
);
1550 stmt_vec_info stmt_info
= loop_vinfo
->lookup_stmt (si
.phi ());
1553 if ((STMT_VINFO_RELEVANT_P (stmt_info
)
1554 || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info
)))
1555 && !PURE_SLP_STMT (stmt_info
))
1556 /* STMT needs both SLP and loop-based vectorization. */
1557 only_slp_in_loop
= false;
1559 for (gimple_stmt_iterator si
= gsi_start_bb (bb
); !gsi_end_p (si
);
1562 if (is_gimple_debug (gsi_stmt (si
)))
1564 stmt_vec_info stmt_info
= loop_vinfo
->lookup_stmt (gsi_stmt (si
));
1565 stmt_info
= vect_stmt_to_vectorize (stmt_info
);
1566 if ((STMT_VINFO_RELEVANT_P (stmt_info
)
1567 || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info
)))
1568 && !PURE_SLP_STMT (stmt_info
))
1569 /* STMT needs both SLP and loop-based vectorization. */
1570 only_slp_in_loop
= false;
1574 if (only_slp_in_loop
)
1576 if (dump_enabled_p ())
1577 dump_printf_loc (MSG_NOTE
, vect_location
,
1578 "Loop contains only SLP stmts\n");
1579 vectorization_factor
= LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo
);
1583 if (dump_enabled_p ())
1584 dump_printf_loc (MSG_NOTE
, vect_location
,
1585 "Loop contains SLP and non-SLP stmts\n");
1586 /* Both the vectorization factor and unroll factor have the form
1587 GET_MODE_SIZE (loop_vinfo->vector_mode) * X for some rational X,
1588 so they must have a common multiple. */
1589 vectorization_factor
1590 = force_common_multiple (vectorization_factor
,
1591 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo
));
1594 LOOP_VINFO_VECT_FACTOR (loop_vinfo
) = vectorization_factor
;
1595 if (dump_enabled_p ())
1597 dump_printf_loc (MSG_NOTE
, vect_location
,
1598 "Updating vectorization factor to ");
1599 dump_dec (MSG_NOTE
, vectorization_factor
);
1600 dump_printf (MSG_NOTE
, ".\n");
1604 /* Return true if STMT_INFO describes a double reduction phi and if
1605 the other phi in the reduction is also relevant for vectorization.
1606 This rejects cases such as:
1609 x_1 = PHI <x_3(outer2), ...>;
1617 x_3 = PHI <x_2(inner)>;
1619 if nothing in x_2 or elsewhere makes x_1 relevant. */
1622 vect_active_double_reduction_p (stmt_vec_info stmt_info
)
1624 if (STMT_VINFO_DEF_TYPE (stmt_info
) != vect_double_reduction_def
)
1627 return STMT_VINFO_RELEVANT_P (STMT_VINFO_REDUC_DEF (stmt_info
));
1630 /* Function vect_analyze_loop_operations.
1632 Scan the loop stmts and make sure they are all vectorizable. */
1635 vect_analyze_loop_operations (loop_vec_info loop_vinfo
)
1637 class loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
1638 basic_block
*bbs
= LOOP_VINFO_BBS (loop_vinfo
);
1639 int nbbs
= loop
->num_nodes
;
1641 stmt_vec_info stmt_info
;
1642 bool need_to_vectorize
= false;
1645 DUMP_VECT_SCOPE ("vect_analyze_loop_operations");
1647 auto_vec
<stmt_info_for_cost
> cost_vec
;
1649 for (i
= 0; i
< nbbs
; i
++)
1651 basic_block bb
= bbs
[i
];
1653 for (gphi_iterator si
= gsi_start_phis (bb
); !gsi_end_p (si
);
1656 gphi
*phi
= si
.phi ();
1659 stmt_info
= loop_vinfo
->lookup_stmt (phi
);
1660 if (dump_enabled_p ())
1661 dump_printf_loc (MSG_NOTE
, vect_location
, "examining phi: %G", phi
);
1662 if (virtual_operand_p (gimple_phi_result (phi
)))
1665 /* Inner-loop loop-closed exit phi in outer-loop vectorization
1666 (i.e., a phi in the tail of the outer-loop). */
1667 if (! is_loop_header_bb_p (bb
))
1669 /* FORNOW: we currently don't support the case that these phis
1670 are not used in the outerloop (unless it is double reduction,
1671 i.e., this phi is vect_reduction_def), cause this case
1672 requires to actually do something here. */
1673 if (STMT_VINFO_LIVE_P (stmt_info
)
1674 && !vect_active_double_reduction_p (stmt_info
))
1675 return opt_result::failure_at (phi
,
1676 "Unsupported loop-closed phi"
1677 " in outer-loop.\n");
1679 /* If PHI is used in the outer loop, we check that its operand
1680 is defined in the inner loop. */
1681 if (STMT_VINFO_RELEVANT_P (stmt_info
))
1685 if (gimple_phi_num_args (phi
) != 1)
1686 return opt_result::failure_at (phi
, "unsupported phi");
1688 phi_op
= PHI_ARG_DEF (phi
, 0);
1689 stmt_vec_info op_def_info
= loop_vinfo
->lookup_def (phi_op
);
1691 return opt_result::failure_at (phi
, "unsupported phi\n");
1693 if (STMT_VINFO_RELEVANT (op_def_info
) != vect_used_in_outer
1694 && (STMT_VINFO_RELEVANT (op_def_info
)
1695 != vect_used_in_outer_by_reduction
))
1696 return opt_result::failure_at (phi
, "unsupported phi\n");
1698 if ((STMT_VINFO_DEF_TYPE (stmt_info
) == vect_internal_def
1699 || (STMT_VINFO_DEF_TYPE (stmt_info
)
1700 == vect_double_reduction_def
))
1701 && !vectorizable_lc_phi (loop_vinfo
,
1702 stmt_info
, NULL
, NULL
))
1703 return opt_result::failure_at (phi
, "unsupported phi\n");
1709 gcc_assert (stmt_info
);
1711 if ((STMT_VINFO_RELEVANT (stmt_info
) == vect_used_in_scope
1712 || STMT_VINFO_LIVE_P (stmt_info
))
1713 && STMT_VINFO_DEF_TYPE (stmt_info
) != vect_induction_def
)
1714 /* A scalar-dependence cycle that we don't support. */
1715 return opt_result::failure_at (phi
,
1717 " scalar dependence cycle.\n");
1719 if (STMT_VINFO_RELEVANT_P (stmt_info
))
1721 need_to_vectorize
= true;
1722 if (STMT_VINFO_DEF_TYPE (stmt_info
) == vect_induction_def
1723 && ! PURE_SLP_STMT (stmt_info
))
1724 ok
= vectorizable_induction (loop_vinfo
,
1725 stmt_info
, NULL
, NULL
,
1727 else if ((STMT_VINFO_DEF_TYPE (stmt_info
) == vect_reduction_def
1728 || (STMT_VINFO_DEF_TYPE (stmt_info
)
1729 == vect_double_reduction_def
)
1730 || STMT_VINFO_DEF_TYPE (stmt_info
) == vect_nested_cycle
)
1731 && ! PURE_SLP_STMT (stmt_info
))
1732 ok
= vectorizable_reduction (loop_vinfo
,
1733 stmt_info
, NULL
, NULL
, &cost_vec
);
1736 /* SLP PHIs are tested by vect_slp_analyze_node_operations. */
1738 && STMT_VINFO_LIVE_P (stmt_info
)
1739 && !PURE_SLP_STMT (stmt_info
))
1740 ok
= vectorizable_live_operation (loop_vinfo
,
1741 stmt_info
, NULL
, NULL
, NULL
,
1742 -1, false, &cost_vec
);
1745 return opt_result::failure_at (phi
,
1746 "not vectorized: relevant phi not "
1748 static_cast <gimple
*> (phi
));
1751 for (gimple_stmt_iterator si
= gsi_start_bb (bb
); !gsi_end_p (si
);
1754 gimple
*stmt
= gsi_stmt (si
);
1755 if (!gimple_clobber_p (stmt
)
1756 && !is_gimple_debug (stmt
))
1759 = vect_analyze_stmt (loop_vinfo
,
1760 loop_vinfo
->lookup_stmt (stmt
),
1762 NULL
, NULL
, &cost_vec
);
1769 add_stmt_costs (loop_vinfo
, loop_vinfo
->target_cost_data
, &cost_vec
);
1771 /* All operations in the loop are either irrelevant (deal with loop
1772 control, or dead), or only used outside the loop and can be moved
1773 out of the loop (e.g. invariants, inductions). The loop can be
1774 optimized away by scalar optimizations. We're better off not
1775 touching this loop. */
1776 if (!need_to_vectorize
)
1778 if (dump_enabled_p ())
1779 dump_printf_loc (MSG_NOTE
, vect_location
,
1780 "All the computation can be taken out of the loop.\n");
1781 return opt_result::failure_at
1783 "not vectorized: redundant loop. no profit to vectorize.\n");
1786 return opt_result::success ();
1789 /* Return true if we know that the iteration count is smaller than the
1790 vectorization factor. Return false if it isn't, or if we can't be sure
1794 vect_known_niters_smaller_than_vf (loop_vec_info loop_vinfo
)
1796 unsigned int assumed_vf
= vect_vf_for_cost (loop_vinfo
);
1798 HOST_WIDE_INT max_niter
;
1799 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo
))
1800 max_niter
= LOOP_VINFO_INT_NITERS (loop_vinfo
);
1802 max_niter
= max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo
));
1804 if (max_niter
!= -1 && (unsigned HOST_WIDE_INT
) max_niter
< assumed_vf
)
1810 /* Analyze the cost of the loop described by LOOP_VINFO. Decide if it
1811 is worthwhile to vectorize. Return 1 if definitely yes, 0 if
1812 definitely no, or -1 if it's worth retrying. */
1815 vect_analyze_loop_costing (loop_vec_info loop_vinfo
)
1817 class loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
1818 unsigned int assumed_vf
= vect_vf_for_cost (loop_vinfo
);
1820 /* Only loops that can handle partially-populated vectors can have iteration
1821 counts less than the vectorization factor. */
1822 if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo
))
1824 if (vect_known_niters_smaller_than_vf (loop_vinfo
))
1826 if (dump_enabled_p ())
1827 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1828 "not vectorized: iteration count smaller than "
1829 "vectorization factor.\n");
1834 /* If using the "very cheap" model. reject cases in which we'd keep
1835 a copy of the scalar code (even if we might be able to vectorize it). */
1836 if (flag_vect_cost_model
== VECT_COST_MODEL_VERY_CHEAP
1837 && (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo
)
1838 || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo
)
1839 || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo
)))
1841 if (dump_enabled_p ())
1842 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1843 "some scalar iterations would need to be peeled\n");
1847 int min_profitable_iters
, min_profitable_estimate
;
1848 vect_estimate_min_profitable_iters (loop_vinfo
, &min_profitable_iters
,
1849 &min_profitable_estimate
);
1851 if (min_profitable_iters
< 0)
1853 if (dump_enabled_p ())
1854 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1855 "not vectorized: vectorization not profitable.\n");
1856 if (dump_enabled_p ())
1857 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1858 "not vectorized: vector version will never be "
1863 int min_scalar_loop_bound
= (param_min_vect_loop_bound
1866 /* Use the cost model only if it is more conservative than user specified
1868 unsigned int th
= (unsigned) MAX (min_scalar_loop_bound
,
1869 min_profitable_iters
);
1871 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo
) = th
;
1873 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo
)
1874 && LOOP_VINFO_INT_NITERS (loop_vinfo
) < th
)
1876 if (dump_enabled_p ())
1877 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1878 "not vectorized: vectorization not profitable.\n");
1879 if (dump_enabled_p ())
1880 dump_printf_loc (MSG_NOTE
, vect_location
,
1881 "not vectorized: iteration count smaller than user "
1882 "specified loop bound parameter or minimum profitable "
1883 "iterations (whichever is more conservative).\n");
1887 /* The static profitablity threshold min_profitable_estimate includes
1888 the cost of having to check at runtime whether the scalar loop
1889 should be used instead. If it turns out that we don't need or want
1890 such a check, the threshold we should use for the static estimate
1891 is simply the point at which the vector loop becomes more profitable
1892 than the scalar loop. */
1893 if (min_profitable_estimate
> min_profitable_iters
1894 && !LOOP_REQUIRES_VERSIONING (loop_vinfo
)
1895 && !LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo
)
1896 && !LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo
)
1897 && !vect_apply_runtime_profitability_check_p (loop_vinfo
))
1899 if (dump_enabled_p ())
1900 dump_printf_loc (MSG_NOTE
, vect_location
, "no need for a runtime"
1901 " choice between the scalar and vector loops\n");
1902 min_profitable_estimate
= min_profitable_iters
;
1905 /* If the vector loop needs multiple iterations to be beneficial then
1906 things are probably too close to call, and the conservative thing
1907 would be to stick with the scalar code. */
1908 if (flag_vect_cost_model
== VECT_COST_MODEL_VERY_CHEAP
1909 && min_profitable_estimate
> (int) vect_vf_for_cost (loop_vinfo
))
1911 if (dump_enabled_p ())
1912 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1913 "one iteration of the vector loop would be"
1914 " more expensive than the equivalent number of"
1915 " iterations of the scalar loop\n");
1919 HOST_WIDE_INT estimated_niter
;
1921 /* If we are vectorizing an epilogue then we know the maximum number of
1922 scalar iterations it will cover is at least one lower than the
1923 vectorization factor of the main loop. */
1924 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo
))
1926 = vect_vf_for_cost (LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo
)) - 1;
1929 estimated_niter
= estimated_stmt_executions_int (loop
);
1930 if (estimated_niter
== -1)
1931 estimated_niter
= likely_max_stmt_executions_int (loop
);
1933 if (estimated_niter
!= -1
1934 && ((unsigned HOST_WIDE_INT
) estimated_niter
1935 < MAX (th
, (unsigned) min_profitable_estimate
)))
1937 if (dump_enabled_p ())
1938 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1939 "not vectorized: estimated iteration count too "
1941 if (dump_enabled_p ())
1942 dump_printf_loc (MSG_NOTE
, vect_location
,
1943 "not vectorized: estimated iteration count smaller "
1944 "than specified loop bound parameter or minimum "
1945 "profitable iterations (whichever is more "
1946 "conservative).\n");
1954 vect_get_datarefs_in_loop (loop_p loop
, basic_block
*bbs
,
1955 vec
<data_reference_p
> *datarefs
,
1956 unsigned int *n_stmts
)
1959 for (unsigned i
= 0; i
< loop
->num_nodes
; i
++)
1960 for (gimple_stmt_iterator gsi
= gsi_start_bb (bbs
[i
]);
1961 !gsi_end_p (gsi
); gsi_next (&gsi
))
1963 gimple
*stmt
= gsi_stmt (gsi
);
1964 if (is_gimple_debug (stmt
))
1967 opt_result res
= vect_find_stmt_data_reference (loop
, stmt
, datarefs
,
1971 if (is_gimple_call (stmt
) && loop
->safelen
)
1973 tree fndecl
= gimple_call_fndecl (stmt
), op
;
1974 if (fndecl
!= NULL_TREE
)
1976 cgraph_node
*node
= cgraph_node::get (fndecl
);
1977 if (node
!= NULL
&& node
->simd_clones
!= NULL
)
1979 unsigned int j
, n
= gimple_call_num_args (stmt
);
1980 for (j
= 0; j
< n
; j
++)
1982 op
= gimple_call_arg (stmt
, j
);
1984 || (REFERENCE_CLASS_P (op
)
1985 && get_base_address (op
)))
1988 op
= gimple_call_lhs (stmt
);
1989 /* Ignore #pragma omp declare simd functions
1990 if they don't have data references in the
1991 call stmt itself. */
1995 || (REFERENCE_CLASS_P (op
)
1996 && get_base_address (op
)))))
2003 /* If dependence analysis will give up due to the limit on the
2004 number of datarefs stop here and fail fatally. */
2005 if (datarefs
->length ()
2006 > (unsigned)param_loop_max_datarefs_for_datadeps
)
2007 return opt_result::failure_at (stmt
, "exceeded param "
2008 "loop-max-datarefs-for-datadeps\n");
2010 return opt_result::success ();
2013 /* Look for SLP-only access groups and turn each individual access into its own
2016 vect_dissolve_slp_only_groups (loop_vec_info loop_vinfo
)
2019 struct data_reference
*dr
;
2021 DUMP_VECT_SCOPE ("vect_dissolve_slp_only_groups");
2023 vec
<data_reference_p
> datarefs
= LOOP_VINFO_DATAREFS (loop_vinfo
);
2024 FOR_EACH_VEC_ELT (datarefs
, i
, dr
)
2026 gcc_assert (DR_REF (dr
));
2027 stmt_vec_info stmt_info
= loop_vinfo
->lookup_stmt (DR_STMT (dr
));
2029 /* Check if the load is a part of an interleaving chain. */
2030 if (STMT_VINFO_GROUPED_ACCESS (stmt_info
))
2032 stmt_vec_info first_element
= DR_GROUP_FIRST_ELEMENT (stmt_info
);
2033 unsigned int group_size
= DR_GROUP_SIZE (first_element
);
2035 /* Check if SLP-only groups. */
2036 if (!STMT_SLP_TYPE (stmt_info
)
2037 && STMT_VINFO_SLP_VECT_ONLY (first_element
))
2039 /* Dissolve the group. */
2040 STMT_VINFO_SLP_VECT_ONLY (first_element
) = false;
2042 stmt_vec_info vinfo
= first_element
;
2045 stmt_vec_info next
= DR_GROUP_NEXT_ELEMENT (vinfo
);
2046 DR_GROUP_FIRST_ELEMENT (vinfo
) = vinfo
;
2047 DR_GROUP_NEXT_ELEMENT (vinfo
) = NULL
;
2048 DR_GROUP_SIZE (vinfo
) = 1;
2049 if (STMT_VINFO_STRIDED_P (first_element
))
2050 DR_GROUP_GAP (vinfo
) = 0;
2052 DR_GROUP_GAP (vinfo
) = group_size
- 1;
2060 /* Determine if operating on full vectors for LOOP_VINFO might leave
2061 some scalar iterations still to do. If so, decide how we should
2062 handle those scalar iterations. The possibilities are:
2064 (1) Make LOOP_VINFO operate on partial vectors instead of full vectors.
2067 LOOP_VINFO_USING_PARTIAL_VECTORS_P == true
2068 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P == false
2069 LOOP_VINFO_PEELING_FOR_NITER == false
2071 (2) Make LOOP_VINFO operate on full vectors and use an epilogue loop
2072 to handle the remaining scalar iterations. In this case:
2074 LOOP_VINFO_USING_PARTIAL_VECTORS_P == false
2075 LOOP_VINFO_PEELING_FOR_NITER == true
2077 There are two choices:
2079 (2a) Consider vectorizing the epilogue loop at the same VF as the
2080 main loop, but using partial vectors instead of full vectors.
2083 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P == true
2085 (2b) Consider vectorizing the epilogue loop at lower VFs only.
2088 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P == false
2090 When FOR_EPILOGUE_P is true, make this determination based on the
2091 assumption that LOOP_VINFO is an epilogue loop, otherwise make it
2092 based on the assumption that LOOP_VINFO is the main loop. The caller
2093 has made sure that the number of iterations is set appropriately for
2094 this value of FOR_EPILOGUE_P. */
/* NOTE(review): this excerpt is garbled — each statement is split across
   physical lines, stray original line numbers are embedded in the text,
   and several original lines (opening braces, `else` keywords) are
   missing from this view.  Only comments have been added; every code
   token is unchanged.  Restore from the upstream file before building.

   Purpose (from the visible logic): decide whether LOOP_VINFO should
   operate on partial vectors, whether partial vectors should instead be
   pushed to an epilogue loop, and whether scalar-iteration peeling is
   required; record the decisions in the LOOP_VINFO_* flags and return an
   opt_result (failure when the known iteration count is too small).  */
2097 vect_determine_partial_vectors_and_peeling (loop_vec_info loop_vinfo
,
2098 bool for_epilogue_p
)
2100 /* Determine whether there would be any scalar iterations left over. */
2101 bool need_peeling_or_partial_vectors_p
2102 = vect_need_peeling_or_partial_vectors_p (loop_vinfo
);
2104 /* Decide whether to vectorize the loop with partial vectors. */
2105 LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo
) = false;
2106 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P (loop_vinfo
) = false;
2107 if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo
)
2108 && need_peeling_or_partial_vectors_p
)
2110 /* For partial-vector-usage=1, try to push the handling of partial
2111 vectors to the epilogue, with the main loop continuing to operate
2114 ??? We could then end up failing to use partial vectors if we
2115 decide to peel iterations into a prologue, and if the main loop
2116 then ends up processing fewer than VF iterations. */
2117 if (param_vect_partial_vector_usage
== 1
2118 && !LOOP_VINFO_EPILOGUE_P (loop_vinfo
)
2119 && !vect_known_niters_smaller_than_vf (loop_vinfo
))
2120 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P (loop_vinfo
) = true;
/* NOTE(review): an `else` branch appears to be missing from this excerpt
   between the two assignments (original lines 2121/2123-2124 absent).  */
2122 LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo
) = true;
/* Report the partial/full-vector decision in the dump file.  */
2125 if (dump_enabled_p ())
2127 if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo
))
2128 dump_printf_loc (MSG_NOTE
, vect_location
,
2129 "operating on partial vectors%s.\n",
2130 for_epilogue_p
? " for epilogue loop" : "");
2132 dump_printf_loc (MSG_NOTE
, vect_location
,
2133 "operating only on full vectors%s.\n",
2134 for_epilogue_p
? " for epilogue loop" : "");
/* Epilogue sanity check: an epilogue not using partial vectors must have
   a strictly smaller VF than the main (original) loop.  */
2139 loop_vec_info orig_loop_vinfo
= LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo
);
2140 gcc_assert (orig_loop_vinfo
);
2141 if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo
))
2142 gcc_assert (known_lt (LOOP_VINFO_VECT_FACTOR (loop_vinfo
),
2143 LOOP_VINFO_VECT_FACTOR (orig_loop_vinfo
)));
/* With a known iteration count and no partial vectors, the loop must be
   able to execute at least one full vector iteration.  */
2146 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo
)
2147 && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo
))
2149 /* Check that the loop processes at least one full vector. */
2150 poly_uint64 vf
= LOOP_VINFO_VECT_FACTOR (loop_vinfo
);
2151 tree scalar_niters
= LOOP_VINFO_NITERS (loop_vinfo
);
2152 if (known_lt (wi::to_widest (scalar_niters
), vf
))
2153 return opt_result::failure_at (vect_location
,
2154 "loop does not have enough iterations"
2155 " to support vectorization.\n");
2157 /* If we need to peel an extra epilogue iteration to handle data
2158 accesses with gaps, check that there are enough scalar iterations
2161 The check above is redundant with this one when peeling for gaps,
2162 but the distinction is useful for diagnostics. */
2163 tree scalar_nitersm1
= LOOP_VINFO_NITERSM1 (loop_vinfo
);
2164 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo
)
2165 && known_lt (wi::to_widest (scalar_nitersm1
), vf
))
2166 return opt_result::failure_at (vect_location
,
2167 "loop does not have enough iterations"
2168 " to support peeling for gaps.\n");
/* Peel for leftover scalar iterations only when partial vectors are not
   handling them.  */
2171 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo
)
2172 = (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo
)
2173 && need_peeling_or_partial_vectors_p
);
2175 return opt_result::success ();
2178 /* Function vect_analyze_loop_2.
2180 Apply a set of analyses on LOOP, and create a loop_vec_info struct
2181 for it. The different analyses will record information in the
2182 loop_vec_info struct. */
/* NOTE(review): this excerpt is garbled — statements are split across
   physical lines, stray original line numbers are embedded, and many
   original lines (braces, `else`/`goto` statements, `if (!ok)` guards,
   and the trailing `goto start_over;` retry tail) are missing from this
   view.  Only comments have been added; every code token is unchanged.

   Purpose (from the visible call sequence): the main per-mode analysis
   driver.  It runs, in order: data-reference gathering, data-ref
   analysis, scalar-cycle classification, pattern recognition, access
   analysis, dependence analysis (bounding max_vf), vectorization-factor
   determination, SLP discovery/decision/optimization, alignment
   analysis and enhancement, SLP and non-SLP operation checks, the
   partial-vectors decision, costing, epilogue-creation checks, and the
   versioning-threshold computation.  On a recoverable SLP failure it
   rolls all SLP state back (the block after the "re-trying with SLP
   disabled" dump) so analysis can restart without SLP.  */
2184 vect_analyze_loop_2 (loop_vec_info loop_vinfo
, bool &fatal
, unsigned *n_stmts
)
2186 opt_result ok
= opt_result::success ();
2188 unsigned int max_vf
= MAX_VECTORIZATION_FACTOR
;
2189 poly_uint64 min_vf
= 2;
2190 loop_vec_info orig_loop_vinfo
= NULL
;
2192 /* If we are dealing with an epilogue then orig_loop_vinfo points to the
2193 loop_vec_info of the first vectorized loop. */
2194 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo
))
2195 orig_loop_vinfo
= LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo
);
/* NOTE(review): the `else` before this assignment is missing from this
   excerpt (original line 2196 absent).  */
2197 orig_loop_vinfo
= loop_vinfo
;
2198 gcc_assert (orig_loop_vinfo
);
2200 /* The first group of checks is independent of the vector size. */
2203 if (LOOP_VINFO_SIMD_IF_COND (loop_vinfo
)
2204 && integer_zerop (LOOP_VINFO_SIMD_IF_COND (loop_vinfo
)))
2205 return opt_result::failure_at (vect_location
,
2206 "not vectorized: simd if(0)\n");
2208 /* Find all data references in the loop (which correspond to vdefs/vuses)
2209 and analyze their evolution in the loop. */
2211 loop_p loop
= LOOP_VINFO_LOOP (loop_vinfo
);
2213 /* Gather the data references and count stmts in the loop. */
2214 if (!LOOP_VINFO_DATAREFS (loop_vinfo
).exists ())
2217 = vect_get_datarefs_in_loop (loop
, LOOP_VINFO_BBS (loop_vinfo
),
2218 &LOOP_VINFO_DATAREFS (loop_vinfo
),
2222 if (dump_enabled_p ())
2223 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2224 "not vectorized: loop contains function "
2225 "calls or data references that cannot "
2229 loop_vinfo
->shared
->save_datarefs ();
/* NOTE(review): the `else` branch head is missing here; check_datarefs
   is presumably the re-analysis path — confirm against upstream.  */
2232 loop_vinfo
->shared
->check_datarefs ();
2234 /* Analyze the data references and also adjust the minimal
2235 vectorization factor according to the loads and stores. */
2237 ok
= vect_analyze_data_refs (loop_vinfo
, &min_vf
, &fatal
);
2240 if (dump_enabled_p ())
2241 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2242 "bad data references.\n");
2246 /* Classify all cross-iteration scalar data-flow cycles.
2247 Cross-iteration cycles caused by virtual phis are analyzed separately. */
2248 vect_analyze_scalar_cycles (loop_vinfo
);
2250 vect_pattern_recog (loop_vinfo
);
2252 vect_fixup_scalar_cycles_with_patterns (loop_vinfo
);
2254 /* Analyze the access patterns of the data-refs in the loop (consecutive,
2255 complex, etc.). FORNOW: Only handle consecutive access pattern. */
2257 ok
= vect_analyze_data_ref_accesses (loop_vinfo
, NULL
);
2260 if (dump_enabled_p ())
2261 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2262 "bad data access.\n");
2266 /* Data-flow analysis to detect stmts that do not need to be vectorized. */
2268 ok
= vect_mark_stmts_to_be_vectorized (loop_vinfo
, &fatal
);
2271 if (dump_enabled_p ())
2272 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2273 "unexpected pattern.\n");
2277 /* While the rest of the analysis below depends on it in some way. */
2280 /* Analyze data dependences between the data-refs in the loop
2281 and adjust the maximum vectorization factor according to
2283 FORNOW: fail at the first data dependence that we encounter. */
2285 ok
= vect_analyze_data_ref_dependences (loop_vinfo
, &max_vf
);
2288 if (dump_enabled_p ())
2289 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2290 "bad data dependence.\n");
/* Fail if dependences cap the VF below the minimum the data-refs need.  */
2293 if (max_vf
!= MAX_VECTORIZATION_FACTOR
2294 && maybe_lt (max_vf
, min_vf
))
2295 return opt_result::failure_at (vect_location
, "bad data dependence.\n");
2296 LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo
) = max_vf
;
2298 ok
= vect_determine_vectorization_factor (loop_vinfo
);
2301 if (dump_enabled_p ())
2302 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2303 "can't determine vectorization factor.\n");
/* Re-check the dependence cap against the now-chosen VF.  */
2306 if (max_vf
!= MAX_VECTORIZATION_FACTOR
2307 && maybe_lt (max_vf
, LOOP_VINFO_VECT_FACTOR (loop_vinfo
)))
2308 return opt_result::failure_at (vect_location
, "bad data dependence.\n");
2310 /* Compute the scalar iteration cost. */
2311 vect_compute_single_scalar_iteration_cost (loop_vinfo
);
/* Saved so the SLP rollback path below can restore the pre-SLP VF.  */
2313 poly_uint64 saved_vectorization_factor
= LOOP_VINFO_VECT_FACTOR (loop_vinfo
);
2315 /* Check the SLP opportunities in the loop, analyze and build SLP trees. */
2316 ok
= vect_analyze_slp (loop_vinfo
, *n_stmts
);
2320 /* If there are any SLP instances mark them as pure_slp. */
2321 bool slp
= vect_make_slp_decision (loop_vinfo
);
2324 /* Find stmts that need to be both vectorized and SLPed. */
2325 vect_detect_hybrid_slp (loop_vinfo
);
2327 /* Update the vectorization factor based on the SLP decision. */
2328 vect_update_vf_for_slp (loop_vinfo
);
2330 /* Optimize the SLP graph with the vectorization factor fixed. */
2331 vect_optimize_slp (loop_vinfo
);
2333 /* Gather the loads reachable from the SLP graph entries. */
2334 vect_gather_slp_loads (loop_vinfo
);
/* Saved so the rollback path can restore the partial-vectors capability.  */
2337 bool saved_can_use_partial_vectors_p
2338 = LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo
);
2340 /* We don't expect to have to roll back to anything other than an empty
2342 gcc_assert (LOOP_VINFO_MASKS (loop_vinfo
).is_empty ());
2344 /* This is the point where we can re-start analysis with SLP forced off. */
/* NOTE(review): the `start_over:` label itself is missing from this
   excerpt (original lines 2345-2346 absent).  */
2347 /* Now the vectorization factor is final. */
2348 poly_uint64 vectorization_factor
= LOOP_VINFO_VECT_FACTOR (loop_vinfo
);
2349 gcc_assert (known_ne (vectorization_factor
, 0U));
2351 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo
) && dump_enabled_p ())
2353 dump_printf_loc (MSG_NOTE
, vect_location
,
2354 "vectorization_factor = ");
2355 dump_dec (MSG_NOTE
, vectorization_factor
);
2356 dump_printf (MSG_NOTE
, ", niters = %wd\n",
2357 LOOP_VINFO_INT_NITERS (loop_vinfo
));
2360 /* Analyze the alignment of the data-refs in the loop.
2361 Fail if a data reference is found that cannot be vectorized. */
2363 ok
= vect_analyze_data_refs_alignment (loop_vinfo
);
2366 if (dump_enabled_p ())
2367 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2368 "bad data alignment.\n");
2372 /* Prune the list of ddrs to be tested at run-time by versioning for alias.
2373 It is important to call pruning after vect_analyze_data_ref_accesses,
2374 since we use grouping information gathered by interleaving analysis. */
2375 ok
= vect_prune_runtime_alias_test_list (loop_vinfo
);
2379 /* Do not invoke vect_enhance_data_refs_alignment for epilogue
2380 vectorization, since we do not want to add extra peeling or
2381 add versioning for alignment. */
2382 if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo
))
2383 /* This pass will decide on using loop versioning and/or loop peeling in
2384 order to enhance the alignment of data references in the loop. */
2385 ok
= vect_enhance_data_refs_alignment (loop_vinfo
);
/* SLP-specific checks; the guarding `if (slp)` is not visible in this
   excerpt (original lines 2387-2390 absent).  */
2391 /* Analyze operations in the SLP instances. Note this may
2392 remove unsupported SLP instances which makes the above
2393 SLP kind detection invalid. */
2394 unsigned old_size
= LOOP_VINFO_SLP_INSTANCES (loop_vinfo
).length ();
2395 vect_slp_analyze_operations (loop_vinfo
);
2396 if (LOOP_VINFO_SLP_INSTANCES (loop_vinfo
).length () != old_size
)
2398 ok
= opt_result::failure_at (vect_location
,
2399 "unsupported SLP instances\n");
2403 /* Check whether any load in ALL SLP instances is possibly permuted. */
2404 slp_tree load_node
, slp_root
;
2406 slp_instance instance
;
2407 bool can_use_lanes
= true;
2408 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo
), x
, instance
)
2410 slp_root
= SLP_INSTANCE_TREE (instance
);
2411 int group_size
= SLP_TREE_LANES (slp_root
);
2412 tree vectype
= SLP_TREE_VECTYPE (slp_root
);
2413 bool loads_permuted
= false;
2414 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance
), i
, load_node
)
2416 if (!SLP_TREE_LOAD_PERMUTATION (load_node
).exists ())
2419 stmt_vec_info load_info
;
2420 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (load_node
), j
, load_info
)
2421 if (SLP_TREE_LOAD_PERMUTATION (load_node
)[j
] != j
)
2423 loads_permuted
= true;
2428 /* If the loads and stores can be handled with load/store-lane
2429 instructions record it and move on to the next instance. */
2431 && SLP_INSTANCE_KIND (instance
) == slp_inst_kind_store
2432 && vect_store_lanes_supported (vectype
, group_size
, false))
2434 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance
), i
, load_node
)
2436 stmt_vec_info stmt_vinfo
= DR_GROUP_FIRST_ELEMENT
2437 (SLP_TREE_SCALAR_STMTS (load_node
)[0]);
2438 /* Use SLP for strided accesses (or if we can't
2440 if (STMT_VINFO_STRIDED_P (stmt_vinfo
)
2441 || ! vect_load_lanes_supported
2442 (STMT_VINFO_VECTYPE (stmt_vinfo
),
2443 DR_GROUP_SIZE (stmt_vinfo
), false))
/* can_use_lanes stays true only if every load in the instance
   survived the loop above (i is the loop's exit index).  */
2448 = can_use_lanes
&& i
== SLP_INSTANCE_LOADS (instance
).length ();
2450 if (can_use_lanes
&& dump_enabled_p ())
2451 dump_printf_loc (MSG_NOTE
, vect_location
,
2452 "SLP instance %p can use load/store-lanes\n",
2457 can_use_lanes
= false;
2462 /* If all SLP instances can use load/store-lanes abort SLP and try again
2463 with SLP disabled. */
2466 ok
= opt_result::failure_at (vect_location
,
2467 "Built SLP cancelled: can use "
2468 "load/store-lanes\n");
2469 if (dump_enabled_p ())
2470 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2471 "Built SLP cancelled: all SLP instances support "
2472 "load/store-lanes\n");
2477 /* Dissolve SLP-only groups. */
2478 vect_dissolve_slp_only_groups (loop_vinfo
);
2480 /* Scan all the remaining operations in the loop that are not subject
2481 to SLP and make sure they are vectorizable. */
2482 ok
= vect_analyze_loop_operations (loop_vinfo
);
2485 if (dump_enabled_p ())
2486 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2487 "bad operation or unsupported loop bound.\n");
2491 /* For now, we don't expect to mix both masking and length approaches for one
2492 loop, disable it if both are recorded. */
2493 if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo
)
2494 && !LOOP_VINFO_MASKS (loop_vinfo
).is_empty ()
2495 && !LOOP_VINFO_LENS (loop_vinfo
).is_empty ())
2497 if (dump_enabled_p ())
2498 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2499 "can't vectorize a loop with partial vectors"
2500 " because we don't expect to mix different"
2501 " approaches with partial vectors for the"
2503 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo
) = false;
2506 /* If we still have the option of using partial vectors,
2507 check whether we can generate the necessary loop controls. */
2508 if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo
)
2509 && !vect_verify_full_masking (loop_vinfo
)
2510 && !vect_verify_loop_lens (loop_vinfo
))
2511 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo
) = false;
2513 /* If we're vectorizing an epilogue loop, the vectorized loop either needs
2514 to be able to handle fewer than VF scalars, or needs to have a lower VF
2515 than the main loop. */
2516 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo
)
2517 && !LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo
)
2518 && maybe_ge (LOOP_VINFO_VECT_FACTOR (loop_vinfo
),
2519 LOOP_VINFO_VECT_FACTOR (orig_loop_vinfo
)))
2520 return opt_result::failure_at (vect_location
,
2521 "Vectorization factor too high for"
2522 " epilogue loop.\n");
2524 /* Decide whether this loop_vinfo should use partial vectors or peeling,
2525 assuming that the loop will be used as a main loop. We will redo
2526 this analysis later if we instead decide to use the loop as an
2528 ok
= vect_determine_partial_vectors_and_peeling (loop_vinfo
, false);
2532 /* Check the costings of the loop make vectorizing worthwhile. */
2533 res
= vect_analyze_loop_costing (loop_vinfo
);
2536 ok
= opt_result::failure_at (vect_location
,
2537 "Loop costings may not be worthwhile.\n");
2541 return opt_result::failure_at (vect_location
,
2542 "Loop costings not worthwhile.\n");
2544 /* If an epilogue loop is required make sure we can create one. */
2545 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo
)
2546 || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo
))
2548 if (dump_enabled_p ())
2549 dump_printf_loc (MSG_NOTE
, vect_location
, "epilog loop required\n");
2550 if (!vect_can_advance_ivs_p (loop_vinfo
)
2551 || !slpeel_can_duplicate_loop_p (LOOP_VINFO_LOOP (loop_vinfo
),
2552 single_exit (LOOP_VINFO_LOOP
2555 ok
= opt_result::failure_at (vect_location
,
2556 "not vectorized: can't create required "
2562 /* During peeling, we need to check if number of loop iterations is
2563 enough for both peeled prolog loop and vector loop. This check
2564 can be merged along with threshold check of loop versioning, so
2565 increase threshold for this case if necessary.
2567 If we are analyzing an epilogue we still want to check what its
2568 versioning threshold would be. If we decide to vectorize the epilogues we
2569 will want to use the lowest versioning threshold of all epilogues and main
2570 loop. This will enable us to enter a vectorized epilogue even when
2571 versioning the loop. We can't simply check whether the epilogue requires
2572 versioning though since we may have skipped some versioning checks when
2573 analyzing the epilogue. For instance, checks for alias versioning will be
2574 skipped when dealing with epilogues as we assume we already checked them
2575 for the main loop. So instead we always check the 'orig_loop_vinfo'. */
2576 if (LOOP_REQUIRES_VERSIONING (orig_loop_vinfo
))
2578 poly_uint64 niters_th
= 0;
2579 unsigned int th
= LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo
);
2581 if (!vect_use_loop_mask_for_alignment_p (loop_vinfo
))
2583 /* Niters for peeled prolog loop. */
2584 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo
) < 0)
2586 dr_vec_info
*dr_info
= LOOP_VINFO_UNALIGNED_DR (loop_vinfo
);
2587 tree vectype
= STMT_VINFO_VECTYPE (dr_info
->stmt
);
2588 niters_th
+= TYPE_VECTOR_SUBPARTS (vectype
) - 1;
/* NOTE(review): the `else` for the fixed peeling amount is missing from
   this excerpt (original lines 2589-2590 absent).  */
2591 niters_th
+= LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo
);
2594 /* Niters for at least one iteration of vectorized loop. */
2595 if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo
))
2596 niters_th
+= LOOP_VINFO_VECT_FACTOR (loop_vinfo
);
2597 /* One additional iteration because of peeling for gap. */
2598 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo
))
2601 /* Use the same condition as vect_transform_loop to decide when to use
2602 the cost to determine a versioning threshold. */
2603 if (vect_apply_runtime_profitability_check_p (loop_vinfo
)
2604 && ordered_p (th
, niters_th
))
2605 niters_th
= ordered_max (poly_uint64 (th
), niters_th
);
2607 LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo
) = niters_th
;
2610 gcc_assert (known_eq (vectorization_factor
,
2611 LOOP_VINFO_VECT_FACTOR (loop_vinfo
)));
2613 /* Ok to vectorize! */
2614 return opt_result::success ();
/* Failure/retry path: everything below runs only when some analysis
   above failed; it decides whether a retry without SLP is worthwhile
   and, if so, rolls the SLP state back.  The `again:` label and several
   early-out `return ok;` statements are missing from this excerpt.  */
2617 /* Ensure that "ok" is false (with an opt_problem if dumping is enabled). */
2620 /* Try again with SLP forced off but if we didn't do any SLP there is
2621 no point in re-trying. */
2625 /* If there are reduction chains re-trying will fail anyway. */
2626 if (! LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo
).is_empty ())
2629 /* Likewise if the grouped loads or stores in the SLP cannot be handled
2630 via interleaving or lane instructions. */
2631 slp_instance instance
;
2634 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo
), i
, instance
)
2636 stmt_vec_info vinfo
;
2637 vinfo
= SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (instance
))[0];
2638 if (! STMT_VINFO_GROUPED_ACCESS (vinfo
))
2640 vinfo
= DR_GROUP_FIRST_ELEMENT (vinfo
);
2641 unsigned int size
= DR_GROUP_SIZE (vinfo
);
2642 tree vectype
= STMT_VINFO_VECTYPE (vinfo
);
2643 if (! vect_store_lanes_supported (vectype
, size
, false)
2644 && ! known_eq (TYPE_VECTOR_SUBPARTS (vectype
), 1U)
2645 && ! vect_grouped_store_supported (vectype
, size
))
2646 return opt_result::failure_at (vinfo
->stmt
,
2647 "unsupported grouped store\n");
2648 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance
), j
, node
)
2650 vinfo
= SLP_TREE_SCALAR_STMTS (node
)[0];
2651 vinfo
= DR_GROUP_FIRST_ELEMENT (vinfo
);
2652 bool single_element_p
= !DR_GROUP_NEXT_ELEMENT (vinfo
);
2653 size
= DR_GROUP_SIZE (vinfo
);
2654 vectype
= STMT_VINFO_VECTYPE (vinfo
);
2655 if (! vect_load_lanes_supported (vectype
, size
, false)
2656 && ! vect_grouped_load_supported (vectype
, single_element_p
,
2658 return opt_result::failure_at (vinfo
->stmt
,
2659 "unsupported grouped load\n");
2663 if (dump_enabled_p ())
2664 dump_printf_loc (MSG_NOTE
, vect_location
,
2665 "re-trying with SLP disabled\n");
2667 /* Roll back state appropriately. No SLP this time. */
2669 /* Restore vectorization factor as it were without SLP. */
2670 LOOP_VINFO_VECT_FACTOR (loop_vinfo
) = saved_vectorization_factor
;
2671 /* Free the SLP instances. */
2672 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo
), j
, instance
)
2673 vect_free_slp_instance (instance
);
2674 LOOP_VINFO_SLP_INSTANCES (loop_vinfo
).release ();
2675 /* Reset SLP type to loop_vect on all stmts. */
2676 for (i
= 0; i
< LOOP_VINFO_LOOP (loop_vinfo
)->num_nodes
; ++i
)
2678 basic_block bb
= LOOP_VINFO_BBS (loop_vinfo
)[i
];
2679 for (gimple_stmt_iterator si
= gsi_start_phis (bb
);
2680 !gsi_end_p (si
); gsi_next (&si
))
2682 stmt_vec_info stmt_info
= loop_vinfo
->lookup_stmt (gsi_stmt (si
));
2683 STMT_SLP_TYPE (stmt_info
) = loop_vect
;
2684 if (STMT_VINFO_DEF_TYPE (stmt_info
) == vect_reduction_def
2685 || STMT_VINFO_DEF_TYPE (stmt_info
) == vect_double_reduction_def
)
2687 /* vectorizable_reduction adjusts reduction stmt def-types,
2688 restore them to that of the PHI. */
2689 STMT_VINFO_DEF_TYPE (STMT_VINFO_REDUC_DEF (stmt_info
))
2690 = STMT_VINFO_DEF_TYPE (stmt_info
);
2691 STMT_VINFO_DEF_TYPE (vect_stmt_to_vectorize
2692 (STMT_VINFO_REDUC_DEF (stmt_info
)))
2693 = STMT_VINFO_DEF_TYPE (stmt_info
);
2696 for (gimple_stmt_iterator si
= gsi_start_bb (bb
);
2697 !gsi_end_p (si
); gsi_next (&si
))
2699 if (is_gimple_debug (gsi_stmt (si
)))
2701 stmt_vec_info stmt_info
= loop_vinfo
->lookup_stmt (gsi_stmt (si
));
2702 STMT_SLP_TYPE (stmt_info
) = loop_vect
;
2703 if (STMT_VINFO_IN_PATTERN_P (stmt_info
))
2705 stmt_vec_info pattern_stmt_info
2706 = STMT_VINFO_RELATED_STMT (stmt_info
);
2707 if (STMT_VINFO_SLP_VECT_ONLY_PATTERN (pattern_stmt_info
))
2708 STMT_VINFO_IN_PATTERN_P (stmt_info
) = false;
2710 gimple
*pattern_def_seq
= STMT_VINFO_PATTERN_DEF_SEQ (stmt_info
);
2711 STMT_SLP_TYPE (pattern_stmt_info
) = loop_vect
;
2712 for (gimple_stmt_iterator pi
= gsi_start (pattern_def_seq
);
2713 !gsi_end_p (pi
); gsi_next (&pi
))
2714 STMT_SLP_TYPE (loop_vinfo
->lookup_stmt (gsi_stmt (pi
)))
2719 /* Free optimized alias test DDRS. */
2720 LOOP_VINFO_LOWER_BOUNDS (loop_vinfo
).truncate (0);
2721 LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo
).release ();
2722 LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo
).release ();
2723 /* Reset target cost data. */
2724 destroy_cost_data (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo
));
2725 LOOP_VINFO_TARGET_COST_DATA (loop_vinfo
)
2726 = init_cost (LOOP_VINFO_LOOP (loop_vinfo
));
2727 /* Reset accumulated rgroup information. */
2728 release_vec_loop_controls (&LOOP_VINFO_MASKS (loop_vinfo
));
2729 release_vec_loop_controls (&LOOP_VINFO_LENS (loop_vinfo
));
2730 /* Reset assorted flags. */
2731 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo
) = false;
2732 LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo
) = false;
2733 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo
) = 0;
2734 LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo
) = 0;
2735 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo
)
2736 = saved_can_use_partial_vectors_p
;
2741 /* Return true if vectorizing a loop using NEW_LOOP_VINFO appears
2742 to be better than vectorizing it using OLD_LOOP_VINFO. Assume that
2743 OLD_LOOP_VINFO is better unless something specifically indicates
2746 Note that this deliberately isn't a partial order. */
/* NOTE(review): this excerpt is garbled — statements are split across
   physical lines and several `return` statements / braces are missing
   from this view (e.g. after the min/max comparisons).  Only comments
   have been added; the code text is unchanged.

   Purpose (from the visible logic): compare two candidate loop_vec_infos
   for the same loop and decide whether NEW_LOOP_VINFO is better than
   OLD_LOOP_VINFO, using (1) a loop->simdlen match, then (2) the
   estimated cost per scalar iteration (inside cost scaled by the other
   candidate's VF), then (3) outside (prologue/epilogue) costs.  */
2749 vect_better_loop_vinfo_p (loop_vec_info new_loop_vinfo
,
2750 loop_vec_info old_loop_vinfo
)
2752 struct loop
*loop
= LOOP_VINFO_LOOP (new_loop_vinfo
);
2753 gcc_assert (LOOP_VINFO_LOOP (old_loop_vinfo
) == loop
);
2755 poly_int64 new_vf
= LOOP_VINFO_VECT_FACTOR (new_loop_vinfo
);
2756 poly_int64 old_vf
= LOOP_VINFO_VECT_FACTOR (old_loop_vinfo
);
2758 /* Always prefer a VF of loop->simdlen over any other VF. */
2761 bool new_simdlen_p
= known_eq (new_vf
, loop
->simdlen
);
2762 bool old_simdlen_p
= known_eq (old_vf
, loop
->simdlen
);
2763 if (new_simdlen_p
!= old_simdlen_p
)
2764 return new_simdlen_p
;
2767 /* Limit the VFs to what is likely to be the maximum number of iterations,
2768 to handle cases in which at least one loop_vinfo is fully-masked. */
2769 HOST_WIDE_INT estimated_max_niter
= likely_max_stmt_executions_int (loop
);
2770 if (estimated_max_niter
!= -1)
2772 if (known_le (estimated_max_niter
, new_vf
))
2773 new_vf
= estimated_max_niter
;
2774 if (known_le (estimated_max_niter
, old_vf
))
2775 old_vf
= estimated_max_niter
;
2778 /* Check whether the (fractional) cost per scalar iteration is lower
2779 or higher: new_inside_cost / new_vf vs. old_inside_cost / old_vf. */
2780 poly_int64 rel_new
= new_loop_vinfo
->vec_inside_cost
* old_vf
;
2781 poly_int64 rel_old
= old_loop_vinfo
->vec_inside_cost
* new_vf
;
2783 HOST_WIDE_INT est_rel_new_min
2784 = estimated_poly_value (rel_new
, POLY_VALUE_MIN
);
2785 HOST_WIDE_INT est_rel_new_max
2786 = estimated_poly_value (rel_new
, POLY_VALUE_MAX
);
2788 HOST_WIDE_INT est_rel_old_min
2789 = estimated_poly_value (rel_old
, POLY_VALUE_MIN
);
2790 HOST_WIDE_INT est_rel_old_max
2791 = estimated_poly_value (rel_old
, POLY_VALUE_MAX
);
2793 /* Check first if we can make out an unambiguous total order from the minimum
2794 and maximum estimates. */
/* NOTE(review): the `return true;` / `return false;` bodies of these two
   branches are missing from this excerpt (original lines 2797/2800).  */
2795 if (est_rel_new_min
< est_rel_old_min
2796 && est_rel_new_max
< est_rel_old_max
)
2798 else if (est_rel_old_min
< est_rel_new_min
2799 && est_rel_old_max
< est_rel_new_max
)
2801 /* When old_loop_vinfo uses a variable vectorization factor,
2802 we know that it has a lower cost for at least one runtime VF.
2803 However, we don't know how likely that VF is.
2805 One option would be to compare the costs for the estimated VFs.
2806 The problem is that that can put too much pressure on the cost
2807 model. E.g. if the estimated VF is also the lowest possible VF,
2808 and if old_loop_vinfo is 1 unit worse than new_loop_vinfo
2809 for the estimated VF, we'd then choose new_loop_vinfo even
2810 though (a) new_loop_vinfo might not actually be better than
2811 old_loop_vinfo for that VF and (b) it would be significantly
2812 worse at larger VFs.
2814 Here we go for a hacky compromise: pick new_loop_vinfo if it is
2815 no more expensive than old_loop_vinfo even after doubling the
2816 estimated old_loop_vinfo VF. For all but trivial loops, this
2817 ensures that we only pick new_loop_vinfo if it is significantly
2818 better than old_loop_vinfo at the estimated VF. */
2820 if (est_rel_old_min
!= est_rel_new_min
2821 || est_rel_old_max
!= est_rel_new_max
)
2823 HOST_WIDE_INT est_rel_new_likely
2824 = estimated_poly_value (rel_new
, POLY_VALUE_LIKELY
);
2825 HOST_WIDE_INT est_rel_old_likely
2826 = estimated_poly_value (rel_old
, POLY_VALUE_LIKELY
);
2828 return est_rel_new_likely
* 2 <= est_rel_old_likely
;
2831 /* If there's nothing to choose between the loop bodies, see whether
2832 there's a difference in the prologue and epilogue costs. */
2833 if (new_loop_vinfo
->vec_outside_cost
!= old_loop_vinfo
->vec_outside_cost
)
2834 return new_loop_vinfo
->vec_outside_cost
< old_loop_vinfo
->vec_outside_cost
;
2839 /* Decide whether to replace OLD_LOOP_VINFO with NEW_LOOP_VINFO. Return
2840 true if we should. */
/* NOTE(review): garbled excerpt — the `return false;`/`return true;`
   statements and enclosing braces are missing from this view; only
   comments have been added, the code text is unchanged.

   Purpose (from the visible logic): thin wrapper around
   vect_better_loop_vinfo_p that also dumps which vector mode won the
   comparison when NEW_LOOP_VINFO is preferred.  */
2843 vect_joust_loop_vinfos (loop_vec_info new_loop_vinfo
,
2844 loop_vec_info old_loop_vinfo
)
2846 if (!vect_better_loop_vinfo_p (new_loop_vinfo
, old_loop_vinfo
))
2849 if (dump_enabled_p ())
2850 dump_printf_loc (MSG_NOTE
, vect_location
,
2851 "***** Preferring vector mode %s to vector mode %s\n",
2852 GET_MODE_NAME (new_loop_vinfo
->vector_mode
),
2853 GET_MODE_NAME (old_loop_vinfo
->vector_mode
));
2857 /* If LOOP_VINFO is already a main loop, return it unmodified. Otherwise
2858 try to reanalyze it as a main loop. Return the loop_vinfo on success
2859 and null on failure. */
/* NOTE(review): garbled excerpt — statements are split across physical
   lines and several lines are missing (e.g. the early `return
   loop_vinfo;` for non-epilogues, the declaration of `fatal`, and the
   `if (!res)` guard around the failure dump).  Only comments have been
   added; the code text is unchanged.

   Purpose (from the visible logic): given an epilogue LOOP_VINFO,
   re-run loop-form analysis and vect_analyze_loop_2 on the same loop
   with the same vector mode, treating it as a main loop; on success
   mark the new loop_vec_info vectorizable and return it, on failure
   delete it.  */
2861 static loop_vec_info
2862 vect_reanalyze_as_main_loop (loop_vec_info loop_vinfo
, unsigned int *n_stmts
)
2864 if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo
))
2867 if (dump_enabled_p ())
2868 dump_printf_loc (MSG_NOTE
, vect_location
,
2869 "***** Reanalyzing as a main loop with vector mode %s\n",
2870 GET_MODE_NAME (loop_vinfo
->vector_mode
));
2872 struct loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
2873 vec_info_shared
*shared
= loop_vinfo
->shared
;
2874 opt_loop_vec_info main_loop_vinfo
= vect_analyze_loop_form (loop
, shared
);
2875 gcc_assert (main_loop_vinfo
);
/* Reuse the epilogue's chosen vector mode for the main-loop analysis.  */
2877 main_loop_vinfo
->vector_mode
= loop_vinfo
->vector_mode
;
2880 bool res
= vect_analyze_loop_2 (main_loop_vinfo
, fatal
, n_stmts
);
2884 if (dump_enabled_p ())
2885 dump_printf_loc (MSG_NOTE
, vect_location
,
2886 "***** Failed to analyze main loop with vector"
2888 GET_MODE_NAME (loop_vinfo
->vector_mode
));
2889 delete main_loop_vinfo
;
2892 LOOP_VINFO_VECTORIZABLE_P (main_loop_vinfo
) = 1;
2893 return main_loop_vinfo
;
2896 /* Function vect_analyze_loop.
2898 Apply a set of analyses on LOOP, and create a loop_vec_info struct
2899 for it. The different analyses will record information in the
2900 loop_vec_info struct. */
2902 vect_analyze_loop (class loop
*loop
, vec_info_shared
*shared
)
2904 auto_vector_modes vector_modes
;
2906 /* Autodetect first vector size we try. */
2907 unsigned int autovec_flags
2908 = targetm
.vectorize
.autovectorize_vector_modes (&vector_modes
,
2909 loop
->simdlen
!= 0);
2910 unsigned int mode_i
= 0;
2912 DUMP_VECT_SCOPE ("analyze_loop_nest");
2914 if (loop_outer (loop
)
2915 && loop_vec_info_for_loop (loop_outer (loop
))
2916 && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop
))))
2917 return opt_loop_vec_info::failure_at (vect_location
,
2918 "outer-loop already vectorized.\n");
2920 if (!find_loop_nest (loop
, &shared
->loop_nest
))
2921 return opt_loop_vec_info::failure_at
2923 "not vectorized: loop nest containing two or more consecutive inner"
2924 " loops cannot be vectorized\n");
2926 unsigned n_stmts
= 0;
2927 machine_mode autodetected_vector_mode
= VOIDmode
;
2928 opt_loop_vec_info first_loop_vinfo
= opt_loop_vec_info::success (NULL
);
2929 machine_mode next_vector_mode
= VOIDmode
;
2930 poly_uint64 lowest_th
= 0;
2931 unsigned vectorized_loops
= 0;
2932 bool pick_lowest_cost_p
= ((autovec_flags
& VECT_COMPARE_COSTS
)
2933 && !unlimited_cost_model (loop
));
2935 bool vect_epilogues
= false;
2936 opt_result res
= opt_result::success ();
2937 unsigned HOST_WIDE_INT simdlen
= loop
->simdlen
;
2940 /* Check the CFG characteristics of the loop (nesting, entry/exit). */
2941 opt_loop_vec_info loop_vinfo
= vect_analyze_loop_form (loop
, shared
);
2944 if (dump_enabled_p ())
2945 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2946 "bad loop form.\n");
2947 gcc_checking_assert (first_loop_vinfo
== NULL
);
2950 loop_vinfo
->vector_mode
= next_vector_mode
;
2954 /* When pick_lowest_cost_p is true, we should in principle iterate
2955 over all the loop_vec_infos that LOOP_VINFO could replace and
2956 try to vectorize LOOP_VINFO under the same conditions.
2957 E.g. when trying to replace an epilogue loop, we should vectorize
2958 LOOP_VINFO as an epilogue loop with the same VF limit. When trying
2959 to replace the main loop, we should vectorize LOOP_VINFO as a main
2962 However, autovectorize_vector_modes is usually sorted as follows:
2964 - Modes that naturally produce lower VFs usually follow modes that
2965 naturally produce higher VFs.
2967 - When modes naturally produce the same VF, maskable modes
2968 usually follow unmaskable ones, so that the maskable mode
2969 can be used to vectorize the epilogue of the unmaskable mode.
2971 This order is preferred because it leads to the maximum
2972 epilogue vectorization opportunities. Targets should only use
2973 a different order if they want to make wide modes available while
2974 disparaging them relative to earlier, smaller modes. The assumption
2975 in that case is that the wider modes are more expensive in some
2976 way that isn't reflected directly in the costs.
2978 There should therefore be few interesting cases in which
2979 LOOP_VINFO fails when treated as an epilogue loop, succeeds when
2980 treated as a standalone loop, and ends up being genuinely cheaper
2981 than FIRST_LOOP_VINFO. */
2983 LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo
) = first_loop_vinfo
;
2985 res
= vect_analyze_loop_2 (loop_vinfo
, fatal
, &n_stmts
);
2987 autodetected_vector_mode
= loop_vinfo
->vector_mode
;
2988 if (dump_enabled_p ())
2991 dump_printf_loc (MSG_NOTE
, vect_location
,
2992 "***** Analysis succeeded with vector mode %s\n",
2993 GET_MODE_NAME (loop_vinfo
->vector_mode
));
2995 dump_printf_loc (MSG_NOTE
, vect_location
,
2996 "***** Analysis failed with vector mode %s\n",
2997 GET_MODE_NAME (loop_vinfo
->vector_mode
));
3003 while (mode_i
< vector_modes
.length ()
3004 && vect_chooses_same_modes_p (loop_vinfo
, vector_modes
[mode_i
]))
3006 if (dump_enabled_p ())
3007 dump_printf_loc (MSG_NOTE
, vect_location
,
3008 "***** The result for vector mode %s would"
3010 GET_MODE_NAME (vector_modes
[mode_i
]));
3016 LOOP_VINFO_VECTORIZABLE_P (loop_vinfo
) = 1;
3019 /* Once we hit the desired simdlen for the first time,
3020 discard any previous attempts. */
3022 && known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo
), simdlen
))
3024 delete first_loop_vinfo
;
3025 first_loop_vinfo
= opt_loop_vec_info::success (NULL
);
3026 LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo
) = NULL
;
3029 else if (pick_lowest_cost_p
&& first_loop_vinfo
)
3031 /* Keep trying to roll back vectorization attempts while the
3032 loop_vec_infos they produced were worse than this one. */
3033 vec
<loop_vec_info
> &vinfos
= first_loop_vinfo
->epilogue_vinfos
;
3034 while (!vinfos
.is_empty ()
3035 && vect_joust_loop_vinfos (loop_vinfo
, vinfos
.last ()))
3037 gcc_assert (vect_epilogues
);
3038 delete vinfos
.pop ();
3040 if (vinfos
.is_empty ()
3041 && vect_joust_loop_vinfos (loop_vinfo
, first_loop_vinfo
))
3043 loop_vec_info main_loop_vinfo
3044 = vect_reanalyze_as_main_loop (loop_vinfo
, &n_stmts
);
3045 if (main_loop_vinfo
== loop_vinfo
)
3047 delete first_loop_vinfo
;
3048 first_loop_vinfo
= opt_loop_vec_info::success (NULL
);
3050 else if (main_loop_vinfo
3051 && vect_joust_loop_vinfos (main_loop_vinfo
,
3054 delete first_loop_vinfo
;
3055 first_loop_vinfo
= opt_loop_vec_info::success (NULL
);
3058 = opt_loop_vec_info::success (main_loop_vinfo
);
3061 delete main_loop_vinfo
;
3065 if (first_loop_vinfo
== NULL
)
3067 first_loop_vinfo
= loop_vinfo
;
3068 lowest_th
= LOOP_VINFO_VERSIONING_THRESHOLD (first_loop_vinfo
);
3070 else if (vect_epilogues
3071 /* For now only allow one epilogue loop. */
3072 && first_loop_vinfo
->epilogue_vinfos
.is_empty ())
3074 first_loop_vinfo
->epilogue_vinfos
.safe_push (loop_vinfo
);
3075 poly_uint64 th
= LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo
);
3076 gcc_assert (!LOOP_REQUIRES_VERSIONING (loop_vinfo
)
3077 || maybe_ne (lowest_th
, 0U));
3078 /* Keep track of the known smallest versioning
3080 if (ordered_p (lowest_th
, th
))
3081 lowest_th
= ordered_min (lowest_th
, th
);
3086 loop_vinfo
= opt_loop_vec_info::success (NULL
);
3089 /* Only vectorize epilogues if PARAM_VECT_EPILOGUES_NOMASK is
3090 enabled, SIMDUID is not set, it is the innermost loop and we have
3091 either already found the loop's SIMDLEN or there was no SIMDLEN to
3093 TODO: Enable epilogue vectorization for loops with SIMDUID set. */
3094 vect_epilogues
= (!simdlen
3095 && loop
->inner
== NULL
3096 && param_vect_epilogues_nomask
3097 && LOOP_VINFO_PEELING_FOR_NITER (first_loop_vinfo
)
3099 /* For now only allow one epilogue loop, but allow
3100 pick_lowest_cost_p to replace it. */
3101 && (first_loop_vinfo
->epilogue_vinfos
.is_empty ()
3102 || pick_lowest_cost_p
));
3104 /* Commit to first_loop_vinfo if we have no reason to try
3106 if (!simdlen
&& !vect_epilogues
&& !pick_lowest_cost_p
)
3112 loop_vinfo
= opt_loop_vec_info::success (NULL
);
3115 gcc_checking_assert (first_loop_vinfo
== NULL
);
3120 /* Handle the case that the original loop can use partial
3121 vectorization, but want to only adopt it for the epilogue.
3122 The retry should be in the same mode as original. */
3125 && LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P (loop_vinfo
))
3127 gcc_assert (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo
)
3128 && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo
));
3129 if (dump_enabled_p ())
3130 dump_printf_loc (MSG_NOTE
, vect_location
,
3131 "***** Re-trying analysis with same vector mode"
3132 " %s for epilogue with partial vectors.\n",
3133 GET_MODE_NAME (loop_vinfo
->vector_mode
));
3137 if (mode_i
< vector_modes
.length ()
3138 && VECTOR_MODE_P (autodetected_vector_mode
)
3139 && (related_vector_mode (vector_modes
[mode_i
],
3140 GET_MODE_INNER (autodetected_vector_mode
))
3141 == autodetected_vector_mode
)
3142 && (related_vector_mode (autodetected_vector_mode
,
3143 GET_MODE_INNER (vector_modes
[mode_i
]))
3144 == vector_modes
[mode_i
]))
3146 if (dump_enabled_p ())
3147 dump_printf_loc (MSG_NOTE
, vect_location
,
3148 "***** Skipping vector mode %s, which would"
3149 " repeat the analysis for %s\n",
3150 GET_MODE_NAME (vector_modes
[mode_i
]),
3151 GET_MODE_NAME (autodetected_vector_mode
));
3155 if (mode_i
== vector_modes
.length ()
3156 || autodetected_vector_mode
== VOIDmode
)
3159 /* Try the next biggest vector size. */
3160 next_vector_mode
= vector_modes
[mode_i
++];
3161 if (dump_enabled_p ())
3162 dump_printf_loc (MSG_NOTE
, vect_location
,
3163 "***** Re-trying analysis with vector mode %s\n",
3164 GET_MODE_NAME (next_vector_mode
));
3167 if (first_loop_vinfo
)
3169 loop
->aux
= (loop_vec_info
) first_loop_vinfo
;
3170 if (dump_enabled_p ())
3171 dump_printf_loc (MSG_NOTE
, vect_location
,
3172 "***** Choosing vector mode %s\n",
3173 GET_MODE_NAME (first_loop_vinfo
->vector_mode
));
3174 LOOP_VINFO_VERSIONING_THRESHOLD (first_loop_vinfo
) = lowest_th
;
3175 return first_loop_vinfo
;
3178 return opt_loop_vec_info::propagate_failure (res
);
3181 /* Return true if there is an in-order reduction function for CODE, storing
3182 it in *REDUC_FN if so. */
/* NOTE(review): this extract is elided (original lines are missing between
   the numbered fragments); only the assignment mapping to IFN_FOLD_LEFT_PLUS
   is visible -- confirm the surrounding switch/cases against the full
   source.  */
3185 fold_left_reduction_fn (tree_code code
, internal_fn
*reduc_fn
)
/* Fold-left (strictly in-order) summation is done via the
   IFN_FOLD_LEFT_PLUS internal function; presumably this is the PLUS_EXPR
   case of an elided switch -- TODO confirm.  */
3190 *reduc_fn
= IFN_FOLD_LEFT_PLUS
;
3198 /* Function reduction_fn_for_scalar_code
3201 CODE - tree_code of a reduction operations.
3204 REDUC_FN - the corresponding internal function to be used to reduce the
3205 vector of partial results into a single scalar result, or IFN_LAST
3206 if the operation is a supported reduction operation, but does not have
3207 such an internal function.
3209 Return FALSE if CODE currently cannot be vectorized as reduction. */
/* NOTE(review): the switch labels between the assignments below are elided
   in this extract; the mapping appears to be the usual tree-code ->
   IFN_REDUC_* correspondence (e.g. MAX_EXPR -> IFN_REDUC_MAX) -- confirm
   against the full source.  */
3212 reduction_fn_for_scalar_code (enum tree_code code
, internal_fn
*reduc_fn
)
/* Maximum reduction.  */
3217 *reduc_fn
= IFN_REDUC_MAX
;
/* Minimum reduction.  */
3221 *reduc_fn
= IFN_REDUC_MIN
;
/* Additive reduction.  */
3225 *reduc_fn
= IFN_REDUC_PLUS
;
/* Bitwise-AND reduction.  */
3229 *reduc_fn
= IFN_REDUC_AND
;
/* Bitwise-OR reduction.  */
3233 *reduc_fn
= IFN_REDUC_IOR
;
/* Bitwise-XOR reduction.  */
3237 *reduc_fn
= IFN_REDUC_XOR
;
/* Supported as a reduction, but with no dedicated internal function
   (see the function comment above).  */
3242 *reduc_fn
= IFN_LAST
;
3250 /* If there is a neutral value X such that SLP reduction NODE would not
3251 be affected by the introduction of additional X elements, return that X,
3252 otherwise return null. CODE is the code of the reduction and VECTOR_TYPE
3253 is the vector type that would hold element X. REDUC_CHAIN is true if
3254 the SLP statements perform a single reduction, false if each statement
3255 performs an independent reduction. */
/* NOTE(review): the switch labels between the returns below are elided in
   this extract -- confirm which tree codes select each neutral value
   against the full source.  */
3258 neutral_op_for_slp_reduction (slp_tree slp_node
, tree vector_type
,
3259 tree_code code
, bool reduc_chain
)
/* Representative scalar statement of the SLP node; its containing loop is
   needed for the MIN/MAX preheader lookup below.  */
3261 vec
<stmt_vec_info
> stmts
= SLP_TREE_SCALAR_STMTS (slp_node
);
3262 stmt_vec_info stmt_vinfo
= stmts
[0];
3263 tree scalar_type
= TREE_TYPE (vector_type
);
3264 class loop
*loop
= gimple_bb (stmt_vinfo
->stmt
)->loop_father
;
3269 case WIDEN_SUM_EXPR
:
/* Neutral element 0 (additive-style reductions).  */
3276 return build_zero_cst (scalar_type
);
/* Neutral element 1 (presumably the multiplicative case -- TODO confirm
   elided case label).  */
3279 return build_one_cst (scalar_type
);
/* Neutral all-ones element (presumably the bitwise-AND case -- TODO
   confirm elided case label).  */
3282 return build_all_ones_cst (scalar_type
);
3286 /* For MIN/MAX the initial values are neutral. A reduction chain
3287 has only a single initial value, so that value is neutral for
/* Return the initial value coming into the reduction PHI from the loop
   preheader edge.  */
3290 return PHI_ARG_DEF_FROM_EDGE (stmt_vinfo
->stmt
,
3291 loop_preheader_edge (loop
));
3299 /* Error reporting helper for vect_is_simple_reduction below. GIMPLE statement
3300 STMT is printed with a message MSG. */
/* Thin wrapper over dump_printf_loc: emits MSG followed by STMT ("%G"
   prints a gimple statement) under the dump category MSG_TYPE at the
   current vectorizer location.  */
3303 report_vect_op (dump_flags_t msg_type
, gimple
*stmt
, const char *msg
)
3305 dump_printf_loc (msg_type
, vect_location
, "%s%G", msg
, stmt
);
3308 /* Return true if we need an in-order reduction for operation CODE
3309 on type TYPE. NEED_WRAPPING_INTEGRAL_OVERFLOW is true if integer
3310 overflow must wrap. */
/* NOTE(review): the NEED_WRAPPING_INTEGRAL_OVERFLOW sentence above looks
   stale -- the visible signature takes only TYPE and CODE; confirm against
   the full source and drop the stale sentence there if so.  */
3313 needs_fold_left_reduction_p (tree type
, tree_code code
)
3315 /* CHECKME: check for !flag_finite_math_only too? */
/* Floating point: reassociation changes results, so an in-order reduction
   is required unless -fassociative-math is in effect.  */
3316 if (SCALAR_FLOAT_TYPE_P (type
))
3324 return !flag_associative_math
;
/* Integral types: the deciding factor (per the call below) is whether the
   operation can trap on overflow; elided lines presumably return based on
   that test -- TODO confirm.  */
3327 if (INTEGRAL_TYPE_P (type
))
3329 if (!operation_no_trapping_overflow (type
, code
))
/* Saturating fixed-point arithmetic is not reassociable either.  */
3334 if (SAT_FIXED_POINT_TYPE_P (type
))
3340 /* Return true if the reduction PHI in LOOP with latch arg LOOP_ARG and
3341 has a handled computation expression. Store the main reduction
3342 operation in *CODE. */
/* NOTE(review): this extract is elided (original lines missing between the
   numbered fragments); control-flow statements such as braces and some
   loop/return bodies are not visible -- verify structure against the full
   source before modifying.  */
3345 check_reduction_path (dump_user_location_t loc
, loop_p loop
, gphi
*phi
,
3346 tree loop_arg
, enum tree_code
*code
,
3347 vec
<std::pair
<ssa_op_iter
, use_operand_p
> > &path
)
/* VISITED guards against walking the same SSA name twice during the
   depth-first search below.  */
3349 auto_bitmap visited
;
3350 tree lookfor
= PHI_RESULT (phi
);
/* Position the PHI use iterator on the latch argument LOOP_ARG; the
   search starts from that operand.  */
3352 use_operand_p curr
= op_iter_init_phiuse (&curri
, phi
, SSA_OP_USE
);
3353 while (USE_FROM_PTR (curr
) != loop_arg
)
3354 curr
= op_iter_next_use (&curri
);
3355 curri
.i
= curri
.numops
;
/* Depth-first walk of the use-def chain from the latch value back toward
   the PHI result; PATH records (iterator, use) pairs so the walk can
   backtrack.  */
3358 path
.safe_push (std::make_pair (curri
, curr
));
3359 tree use
= USE_FROM_PTR (curr
);
3362 gimple
*def
= SSA_NAME_DEF_STMT (use
);
/* Stop this branch if the definition is outside the loop (or has no
   defining statement).  */
3363 if (gimple_nop_p (def
)
3364 || ! flow_bb_inside_loop_p (loop
, gimple_bb (def
)))
/* Backtrack: pop the last path entry and advance to the next unvisited
   SSA operand.  */
3369 std::pair
<ssa_op_iter
, use_operand_p
> x
= path
.pop ();
3373 curr
= op_iter_next_use (&curri
);
3374 /* Skip already visited or non-SSA operands (from iterating
3376 while (curr
!= NULL_USE_OPERAND_P
3377 && (TREE_CODE (USE_FROM_PTR (curr
)) != SSA_NAME
3378 || ! bitmap_set_bit (visited
,
3380 (USE_FROM_PTR (curr
)))));
3382 while (curr
== NULL_USE_OPERAND_P
&& ! path
.is_empty ());
/* Search exhausted without closing the cycle.  */
3383 if (curr
== NULL_USE_OPERAND_P
)
/* Descend into DEF's operands; PHIs need the dedicated iterator.  */
3388 if (gimple_code (def
) == GIMPLE_PHI
)
3389 curr
= op_iter_init_phiuse (&curri
, as_a
<gphi
*>(def
), SSA_OP_USE
);
3391 curr
= op_iter_init_use (&curri
, def
, SSA_OP_USE
);
3392 while (curr
!= NULL_USE_OPERAND_P
3393 && (TREE_CODE (USE_FROM_PTR (curr
)) != SSA_NAME
3394 || ! bitmap_set_bit (visited
,
3396 (USE_FROM_PTR (curr
)))))
3397 curr
= op_iter_next_use (&curri
);
3398 if (curr
== NULL_USE_OPERAND_P
)
/* Detailed dump of the discovered path.  */
3403 if (dump_file
&& (dump_flags
& TDF_DETAILS
))
3405 dump_printf_loc (MSG_NOTE
, loc
, "reduction path: ");
3407 std::pair
<ssa_op_iter
, use_operand_p
> *x
;
3408 FOR_EACH_VEC_ELT (path
, i
, x
)
3409 dump_printf (MSG_NOTE
, "%T ", USE_FROM_PTR (x
->second
));
3410 dump_printf (MSG_NOTE
, "\n");
3413 /* Check whether the reduction path detected is valid. */
3414 bool fail
= path
.length () == 0;
/* Validation pass over each statement on the path (entry 0 is the PHI
   itself, hence I starts at 1).  */
3418 for (unsigned i
= 1; i
< path
.length (); ++i
)
3420 gimple
*use_stmt
= USE_STMT (path
[i
].second
);
3421 tree op
= USE_FROM_PTR (path
[i
].second
);
3422 if (! is_gimple_assign (use_stmt
)
3423 /* The following make sure we can compute the operand index
3424 easily plus it mostly disallows chaining via COND_EXPR condition
/* The used operand must be rhs1, rhs2 or rhs3 of the assignment.  */
3426 || (gimple_assign_rhs1_ptr (use_stmt
) != path
[i
].second
->use
3427 && (gimple_num_ops (use_stmt
) <= 2
3428 || gimple_assign_rhs2_ptr (use_stmt
) != path
[i
].second
->use
)
3429 && (gimple_num_ops (use_stmt
) <= 3
3430 || gimple_assign_rhs3_ptr (use_stmt
) != path
[i
].second
->use
)))
/* Canonicalize MINUS to PLUS so a +=/-= mix is still recognized as one
   reduction operation.  */
3435 tree_code use_code
= gimple_assign_rhs_code (use_stmt
);
3436 if (use_code
== MINUS_EXPR
)
3438 use_code
= PLUS_EXPR
;
3439 /* Track whether we negate the reduction value each iteration. */
3440 if (gimple_assign_rhs2 (use_stmt
) == op
)
/* No-op conversions on the path are tolerated.  */
3443 if (CONVERT_EXPR_CODE_P (use_code
)
3444 && tree_nop_conversion_p (TREE_TYPE (gimple_assign_lhs (use_stmt
)),
3445 TREE_TYPE (gimple_assign_rhs1 (use_stmt
))))
/* First real operation on the path: record it (and its signedness, used
   for the MIN/MAX consistency check below).  */
3447 else if (*code
== ERROR_MARK
)
3450 sign
= TYPE_SIGN (TREE_TYPE (gimple_assign_lhs (use_stmt
)));
/* All operations on the path must agree.  */
3452 else if (use_code
!= *code
)
/* MIN/MAX additionally require consistent signedness.  */
3457 else if ((use_code
== MIN_EXPR
3458 || use_code
== MAX_EXPR
)
3459 && sign
!= TYPE_SIGN (TREE_TYPE (gimple_assign_lhs (use_stmt
))))
3464 /* Check there's only a single stmt the op is used on. For the
3465 not value-changing tail and the last stmt allow out-of-loop uses.
3466 ??? We could relax this and handle arbitrary live stmts by
3467 forcing a scalar epilogue for example. */
3468 imm_use_iterator imm_iter
;
3469 gimple
*op_use_stmt
;
3471 FOR_EACH_IMM_USE_STMT (op_use_stmt
, imm_iter
, op
)
3472 if (!is_gimple_debug (op_use_stmt
)
3473 && (*code
!= ERROR_MARK
3474 || flow_bb_inside_loop_p (loop
, gimple_bb (op_use_stmt
))))
3476 /* We want to allow x + x but not x < 1 ? x : 2. */
3477 if (is_gimple_assign (op_use_stmt
)
3478 && gimple_assign_rhs_code (op_use_stmt
) == COND_EXPR
)
3480 use_operand_p use_p
;
3481 FOR_EACH_IMM_USE_ON_STMT (use_p
, imm_iter
)
/* Valid only if the path validated, the value is not negated each
   iteration, and a single reduction operation was identified.  */
3493 return ! fail
&& ! neg
&& *code
!= ERROR_MARK
;
/* Overload of check_reduction_path above for callers that only want to
   know whether the reduction uses a specific operation CODE; the PATH and
   discovered code are kept in locals.  The tail of the return expression
   (presumably comparing CODE_ against CODE) is elided in this extract --
   TODO confirm.  */
3497 check_reduction_path (dump_user_location_t loc
, loop_p loop
, gphi
*phi
,
3498 tree loop_arg
, enum tree_code code
)
3500 auto_vec
<std::pair
<ssa_op_iter
, use_operand_p
> > path
;
3501 enum tree_code code_
;
3502 return (check_reduction_path (loc
, loop
, phi
, loop_arg
, &code_
, path
)
3508 /* Function vect_is_simple_reduction
3510 (1) Detect a cross-iteration def-use cycle that represents a simple
3511 reduction computation. We look for the following pattern:
3516 a2 = operation (a3, a1)
3523 a2 = operation (a3, a1)
3526 1. operation is commutative and associative and it is safe to
3527 change the order of the computation
3528 2. no uses for a2 in the loop (a2 is used out of the loop)
3529 3. no uses of a1 in the loop besides the reduction operation
3530 4. no uses of a1 outside the loop.
3532 Conditions 1,4 are tested here.
3533 Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.
3535 (2) Detect a cross-iteration def-use cycle in nested loops, i.e.,
3538 (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double
3542 inner loop (def of a3)
3545 (4) Detect condition expressions, ie:
3546 for (int i = 0; i < N; i++)
/* NOTE(review): this extract is elided (original lines missing between the
   numbered fragments); braces and several statements are not visible --
   verify structure against the full source before modifying.  */
3552 static stmt_vec_info
3553 vect_is_simple_reduction (loop_vec_info loop_info
, stmt_vec_info phi_info
,
3554 bool *double_reduc
, bool *reduc_chain_p
)
3556 gphi
*phi
= as_a
<gphi
*> (phi_info
->stmt
);
3557 gimple
*phi_use_stmt
= NULL
;
3558 imm_use_iterator imm_iter
;
3559 use_operand_p use_p
;
/* Assume the simple case until proven otherwise.  */
3561 *double_reduc
= false;
3562 *reduc_chain_p
= false;
3563 STMT_VINFO_REDUC_TYPE (phi_info
) = TREE_CODE_REDUCTION
;
3565 tree phi_name
= PHI_RESULT (phi
);
3566 /* ??? If there are no uses of the PHI result the inner loop reduction
3567 won't be detected as possibly double-reduction by vectorizable_reduction
3568 because that tries to walk the PHI arg from the preheader edge which
3569 can be constant. See PR60382. */
3570 if (has_zero_uses (phi_name
))
/* Count in-loop uses of the PHI result; a use outside the loop
   disqualifies the candidate (dump below).  */
3572 class loop
*loop
= (gimple_bb (phi
))->loop_father
;
3573 unsigned nphi_def_loop_uses
= 0;
3574 FOR_EACH_IMM_USE_FAST (use_p
, imm_iter
, phi_name
)
3576 gimple
*use_stmt
= USE_STMT (use_p
);
3577 if (is_gimple_debug (use_stmt
))
3580 if (!flow_bb_inside_loop_p (loop
, gimple_bb (use_stmt
)))
3582 if (dump_enabled_p ())
3583 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3584 "intermediate value used outside loop.\n");
3589 nphi_def_loop_uses
++;
3590 phi_use_stmt
= use_stmt
;
/* The value flowing back around the cycle must be an SSA name defined
   inside the loop.  */
3593 tree latch_def
= PHI_ARG_DEF_FROM_EDGE (phi
, loop_latch_edge (loop
));
3594 if (TREE_CODE (latch_def
) != SSA_NAME
)
3596 if (dump_enabled_p ())
3597 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3598 "reduction: not ssa_name: %T\n", latch_def
);
3602 stmt_vec_info def_stmt_info
= loop_info
->lookup_def (latch_def
);
3604 || !flow_bb_inside_loop_p (loop
, gimple_bb (def_stmt_info
->stmt
)))
/* Scan uses of the latch value: count in-loop uses, collect loop-closed
   PHIs, and detect whether this is the inner loop of a double
   reduction.  */
3607 bool nested_in_vect_loop
3608 = flow_loop_nested_p (LOOP_VINFO_LOOP (loop_info
), loop
);
3609 unsigned nlatch_def_loop_uses
= 0;
3610 auto_vec
<gphi
*, 3> lcphis
;
3611 bool inner_loop_of_double_reduc
= false;
3612 FOR_EACH_IMM_USE_FAST (use_p
, imm_iter
, latch_def
)
3614 gimple
*use_stmt
= USE_STMT (use_p
);
3615 if (is_gimple_debug (use_stmt
))
3617 if (flow_bb_inside_loop_p (loop
, gimple_bb (use_stmt
)))
3618 nlatch_def_loop_uses
++;
3621 /* We can have more than one loop-closed PHI. */
3622 lcphis
.safe_push (as_a
<gphi
*> (use_stmt
));
3623 if (nested_in_vect_loop
3624 && (STMT_VINFO_DEF_TYPE (loop_info
->lookup_stmt (use_stmt
))
3625 == vect_double_reduction_def
))
3626 inner_loop_of_double_reduc
= true;
3630 /* If we are vectorizing an inner reduction we are executing that
3631 in the original order only in case we are not dealing with a
3632 double reduction. */
3633 if (nested_in_vect_loop
&& !inner_loop_of_double_reduc
)
3635 if (dump_enabled_p ())
3636 report_vect_op (MSG_NOTE
, def_stmt_info
->stmt
,
3637 "detected nested cycle: ");
3638 return def_stmt_info
;
3641 /* If this isn't a nested cycle or if the nested cycle reduction value
3642 is used outside of the inner loop we cannot handle uses of the reduction
3644 if (nlatch_def_loop_uses
> 1 || nphi_def_loop_uses
> 1)
3646 if (dump_enabled_p ())
3647 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3648 "reduction used in loop.\n");
3652 /* If DEF_STMT is a phi node itself, we expect it to have a single argument
3653 defined in the inner loop. */
3654 if (gphi
*def_stmt
= dyn_cast
<gphi
*> (def_stmt_info
->stmt
))
3656 tree op1
= PHI_ARG_DEF (def_stmt
, 0);
3657 if (gimple_phi_num_args (def_stmt
) != 1
3658 || TREE_CODE (op1
) != SSA_NAME
)
3660 if (dump_enabled_p ())
3661 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3662 "unsupported phi node definition.\n");
/* Double-reduction pattern: the inner PHI's argument is defined by an
   assignment in the inner loop and the outer PHI's use is an inner-loop
   PHI.  */
3667 gimple
*def1
= SSA_NAME_DEF_STMT (op1
);
3668 if (gimple_bb (def1
)
3669 && flow_bb_inside_loop_p (loop
, gimple_bb (def_stmt
))
3671 && flow_bb_inside_loop_p (loop
->inner
, gimple_bb (def1
))
3672 && is_gimple_assign (def1
)
3673 && is_a
<gphi
*> (phi_use_stmt
)
3674 && flow_bb_inside_loop_p (loop
->inner
, gimple_bb (phi_use_stmt
)))
3676 if (dump_enabled_p ())
3677 report_vect_op (MSG_NOTE
, def_stmt
,
3678 "detected double reduction: ");
3680 *double_reduc
= true;
3681 return def_stmt_info
;
3687 /* Look for the expression computing latch_def from the loop PHI result. */
3688 auto_vec
<std::pair
<ssa_op_iter
, use_operand_p
> > path
;
3689 enum tree_code code
;
3690 if (check_reduction_path (vect_location
, loop
, phi
, latch_def
, &code
,
3693 STMT_VINFO_REDUC_CODE (phi_info
) = code
;
3694 if (code
== COND_EXPR
&& !nested_in_vect_loop
)
3695 STMT_VINFO_REDUC_TYPE (phi_info
) = COND_REDUCTION
;
3697 /* Fill in STMT_VINFO_REDUC_IDX and gather stmts for an SLP
3698 reduction chain for which the additional restriction is that
3699 all operations in the chain are the same. */
3700 auto_vec
<stmt_vec_info
, 8> reduc_chain
;
3702 bool is_slp_reduc
= !nested_in_vect_loop
&& code
!= COND_EXPR
;
/* Walk the path from latch back to PHI (entry 0 is the PHI itself).  */
3703 for (i
= path
.length () - 1; i
>= 1; --i
)
3705 gimple
*stmt
= USE_STMT (path
[i
].second
);
3706 stmt_vec_info stmt_info
= loop_info
->lookup_stmt (stmt
);
/* Operand index (0/1/2) of the reduction input within STMT.  */
3707 STMT_VINFO_REDUC_IDX (stmt_info
)
3708 = path
[i
].second
->use
- gimple_assign_rhs1_ptr (stmt
);
3709 enum tree_code stmt_code
= gimple_assign_rhs_code (stmt
);
3710 bool leading_conversion
= (CONVERT_EXPR_CODE_P (stmt_code
)
3711 && (i
== 1 || i
== path
.length () - 1));
3712 if ((stmt_code
!= code
&& !leading_conversion
)
3713 /* We can only handle the final value in epilogue
3714 generation for reduction chains. */
3715 || (i
!= 1 && !has_single_use (gimple_assign_lhs (stmt
))))
3716 is_slp_reduc
= false;
3717 /* For reduction chains we support a trailing/leading
3718 conversions. We do not store those in the actual chain. */
3719 if (leading_conversion
)
3721 reduc_chain
.safe_push (stmt_info
);
/* Link the collected statements into a REDUC_GROUP chain and register
   it for SLP detection.  */
3723 if (is_slp_reduc
&& reduc_chain
.length () > 1)
3725 for (unsigned i
= 0; i
< reduc_chain
.length () - 1; ++i
)
3727 REDUC_GROUP_FIRST_ELEMENT (reduc_chain
[i
]) = reduc_chain
[0];
3728 REDUC_GROUP_NEXT_ELEMENT (reduc_chain
[i
]) = reduc_chain
[i
+1];
3730 REDUC_GROUP_FIRST_ELEMENT (reduc_chain
.last ()) = reduc_chain
[0];
3731 REDUC_GROUP_NEXT_ELEMENT (reduc_chain
.last ()) = NULL
;
3733 /* Save the chain for further analysis in SLP detection. */
3734 LOOP_VINFO_REDUCTION_CHAINS (loop_info
).safe_push (reduc_chain
[0]);
3735 REDUC_GROUP_SIZE (reduc_chain
[0]) = reduc_chain
.length ();
3737 *reduc_chain_p
= true;
3738 if (dump_enabled_p ())
3739 dump_printf_loc (MSG_NOTE
, vect_location
,
3740 "reduction: detected reduction chain\n");
3742 else if (dump_enabled_p ())
3743 dump_printf_loc (MSG_NOTE
, vect_location
,
3744 "reduction: detected reduction\n");
3746 return def_stmt_info
;
/* Fallthrough: no supported reduction pattern was recognized.  */
3749 if (dump_enabled_p ())
3750 dump_printf_loc (MSG_NOTE
, vect_location
,
3751 "reduction: unknown pattern\n");
3756 /* Estimate the number of peeled epilogue iterations for LOOP_VINFO.
3757 PEEL_ITERS_PROLOGUE is the number of peeled prologue iterations,
3758 or -1 if not known. */
3761 vect_get_peel_iters_epilogue (loop_vec_info loop_vinfo
, int peel_iters_prologue
)
3763 int assumed_vf
= vect_vf_for_cost (loop_vinfo
);
/* With unknown iteration count (or unknown prologue peeling) fall back to
   the vf/2 heuristic used throughout the cost model.  */
3764 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo
) || peel_iters_prologue
== -1)
3766 if (dump_enabled_p ())
3767 dump_printf_loc (MSG_NOTE
, vect_location
,
3768 "cost model: epilogue peel iters set to vf/2 "
3769 "because loop iterations are unknown .\n");
3770 return assumed_vf
/ 2;
/* Known iteration count: the epilogue runs the remainder of the
   iterations left after the prologue, modulo the vectorization factor.  */
3774 int niters
= LOOP_VINFO_INT_NITERS (loop_vinfo
);
3775 peel_iters_prologue
= MIN (niters
, peel_iters_prologue
);
3776 int peel_iters_epilogue
= (niters
- peel_iters_prologue
) % assumed_vf
;
3777 /* If we need to peel for gaps, but no peeling is required, we have to
3778 peel VF iterations. */
3779 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo
) && !peel_iters_epilogue
)
3780 peel_iters_epilogue
= assumed_vf
;
3781 return peel_iters_epilogue
;
3785 /* Calculate cost of peeling the loop PEEL_ITERS_PROLOGUE times. */
/* Computes *PEEL_ITERS_EPILOGUE and accumulates, into PROLOGUE_COST_VEC
   and EPILOGUE_COST_VEC, the scalar cost of executing the peeled prologue
   and epilogue iterations (SCALAR_COST_VEC holds the per-iteration scalar
   statement costs).  NOTE(review): this extract is elided; the declaration
   of RETVAL and the trailing record_stmt_cost arguments are not visible --
   confirm against the full source.  */
3787 vect_get_known_peeling_cost (loop_vec_info loop_vinfo
, int peel_iters_prologue
,
3788 int *peel_iters_epilogue
,
3789 stmt_vector_for_cost
*scalar_cost_vec
,
3790 stmt_vector_for_cost
*prologue_cost_vec
,
3791 stmt_vector_for_cost
*epilogue_cost_vec
)
3795 *peel_iters_epilogue
3796 = vect_get_peel_iters_epilogue (loop_vinfo
, peel_iters_prologue
);
3798 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo
))
3800 /* If peeled iterations are known but number of scalar loop
3801 iterations are unknown, count a taken branch per peeled loop. */
3802 if (peel_iters_prologue
> 0)
3803 retval
= record_stmt_cost (prologue_cost_vec
, 1, cond_branch_taken
,
3804 NULL
, NULL_TREE
, 0, vect_prologue
);
3805 if (*peel_iters_epilogue
> 0)
3806 retval
+= record_stmt_cost (epilogue_cost_vec
, 1, cond_branch_taken
,
3807 NULL
, NULL_TREE
, 0, vect_epilogue
);
/* Scale each per-iteration scalar statement cost by the number of
   peeled prologue/epilogue iterations.  */
3810 stmt_info_for_cost
*si
;
3812 if (peel_iters_prologue
)
3813 FOR_EACH_VEC_ELT (*scalar_cost_vec
, j
, si
)
3814 retval
+= record_stmt_cost (prologue_cost_vec
,
3815 si
->count
* peel_iters_prologue
,
3816 si
->kind
, si
->stmt_info
, si
->misalign
,
3818 if (*peel_iters_epilogue
)
3819 FOR_EACH_VEC_ELT (*scalar_cost_vec
, j
, si
)
3820 retval
+= record_stmt_cost (epilogue_cost_vec
,
3821 si
->count
* *peel_iters_epilogue
,
3822 si
->kind
, si
->stmt_info
, si
->misalign
,
3828 /* Function vect_estimate_min_profitable_iters
3830 Return the number of iterations required for the vector version of the
3831 loop to be profitable relative to the cost of the scalar version of the
3834 *RET_MIN_PROFITABLE_NITERS is a cost model profitability threshold
3835 of iterations for vectorization. -1 value means loop vectorization
3836 is not profitable. This returned value may be used for dynamic
3837 profitability check.
3839 *RET_MIN_PROFITABLE_ESTIMATE is a profitability threshold to be used
3840 for static check against estimated number of iterations. */
3843 vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo
,
3844 int *ret_min_profitable_niters
,
3845 int *ret_min_profitable_estimate
)
3847 int min_profitable_iters
;
3848 int min_profitable_estimate
;
3849 int peel_iters_prologue
;
3850 int peel_iters_epilogue
;
3851 unsigned vec_inside_cost
= 0;
3852 int vec_outside_cost
= 0;
3853 unsigned vec_prologue_cost
= 0;
3854 unsigned vec_epilogue_cost
= 0;
3855 int scalar_single_iter_cost
= 0;
3856 int scalar_outside_cost
= 0;
3857 int assumed_vf
= vect_vf_for_cost (loop_vinfo
);
3858 int npeel
= LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo
);
3859 void *target_cost_data
= LOOP_VINFO_TARGET_COST_DATA (loop_vinfo
);
3861 /* Cost model disabled. */
3862 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo
)))
3864 if (dump_enabled_p ())
3865 dump_printf_loc (MSG_NOTE
, vect_location
, "cost model disabled.\n");
3866 *ret_min_profitable_niters
= 0;
3867 *ret_min_profitable_estimate
= 0;
3871 /* Requires loop versioning tests to handle misalignment. */
3872 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo
))
3874 /* FIXME: Make cost depend on complexity of individual check. */
3875 unsigned len
= LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo
).length ();
3876 (void) add_stmt_cost (loop_vinfo
, target_cost_data
, len
, vector_stmt
,
3877 NULL
, NULL_TREE
, 0, vect_prologue
);
3878 if (dump_enabled_p ())
3879 dump_printf (MSG_NOTE
,
3880 "cost model: Adding cost of checks for loop "
3881 "versioning to treat misalignment.\n");
3884 /* Requires loop versioning with alias checks. */
3885 if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo
))
3887 /* FIXME: Make cost depend on complexity of individual check. */
3888 unsigned len
= LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo
).length ();
3889 (void) add_stmt_cost (loop_vinfo
, target_cost_data
, len
, vector_stmt
,
3890 NULL
, NULL_TREE
, 0, vect_prologue
);
3891 len
= LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo
).length ();
3893 /* Count LEN - 1 ANDs and LEN comparisons. */
3894 (void) add_stmt_cost (loop_vinfo
, target_cost_data
, len
* 2 - 1,
3895 scalar_stmt
, NULL
, NULL_TREE
, 0, vect_prologue
);
3896 len
= LOOP_VINFO_LOWER_BOUNDS (loop_vinfo
).length ();
3899 /* Count LEN - 1 ANDs and LEN comparisons. */
3900 unsigned int nstmts
= len
* 2 - 1;
3901 /* +1 for each bias that needs adding. */
3902 for (unsigned int i
= 0; i
< len
; ++i
)
3903 if (!LOOP_VINFO_LOWER_BOUNDS (loop_vinfo
)[i
].unsigned_p
)
3905 (void) add_stmt_cost (loop_vinfo
, target_cost_data
, nstmts
,
3906 scalar_stmt
, NULL
, NULL_TREE
, 0, vect_prologue
);
3908 if (dump_enabled_p ())
3909 dump_printf (MSG_NOTE
,
3910 "cost model: Adding cost of checks for loop "
3911 "versioning aliasing.\n");
3914 /* Requires loop versioning with niter checks. */
3915 if (LOOP_REQUIRES_VERSIONING_FOR_NITERS (loop_vinfo
))
3917 /* FIXME: Make cost depend on complexity of individual check. */
3918 (void) add_stmt_cost (loop_vinfo
, target_cost_data
, 1, vector_stmt
,
3919 NULL
, NULL_TREE
, 0, vect_prologue
);
3920 if (dump_enabled_p ())
3921 dump_printf (MSG_NOTE
,
3922 "cost model: Adding cost of checks for loop "
3923 "versioning niters.\n");
3926 if (LOOP_REQUIRES_VERSIONING (loop_vinfo
))
3927 (void) add_stmt_cost (loop_vinfo
, target_cost_data
, 1, cond_branch_taken
,
3928 NULL
, NULL_TREE
, 0, vect_prologue
);
3930 /* Count statements in scalar loop. Using this as scalar cost for a single
3933 TODO: Add outer loop support.
3935 TODO: Consider assigning different costs to different scalar
3938 scalar_single_iter_cost
3939 = LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo
);
3941 /* Add additional cost for the peeled instructions in prologue and epilogue
3942 loop. (For fully-masked loops there will be no peeling.)
3944 FORNOW: If we don't know the value of peel_iters for prologue or epilogue
3945 at compile-time - we assume it's vf/2 (the worst would be vf-1).
3947 TODO: Build an expression that represents peel_iters for prologue and
3948 epilogue to be used in a run-time test. */
3950 bool prologue_need_br_taken_cost
= false;
3951 bool prologue_need_br_not_taken_cost
= false;
3953 /* Calculate peel_iters_prologue. */
3954 if (vect_use_loop_mask_for_alignment_p (loop_vinfo
))
3955 peel_iters_prologue
= 0;
3958 peel_iters_prologue
= assumed_vf
/ 2;
3959 if (dump_enabled_p ())
3960 dump_printf (MSG_NOTE
, "cost model: "
3961 "prologue peel iters set to vf/2.\n");
3963 /* If peeled iterations are unknown, count a taken branch and a not taken
3964 branch per peeled loop. Even if scalar loop iterations are known,
3965 vector iterations are not known since peeled prologue iterations are
3966 not known. Hence guards remain the same. */
3967 prologue_need_br_taken_cost
= true;
3968 prologue_need_br_not_taken_cost
= true;
3972 peel_iters_prologue
= npeel
;
3973 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo
) && peel_iters_prologue
> 0)
3974 /* If peeled iterations are known but number of scalar loop
3975 iterations are unknown, count a taken branch per peeled loop. */
3976 prologue_need_br_taken_cost
= true;
3979 bool epilogue_need_br_taken_cost
= false;
3980 bool epilogue_need_br_not_taken_cost
= false;
3982 /* Calculate peel_iters_epilogue. */
3983 if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo
))
3984 /* We need to peel exactly one iteration for gaps. */
3985 peel_iters_epilogue
= LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo
) ? 1 : 0;
3988 /* If peeling for alignment is unknown, loop bound of main loop
3990 peel_iters_epilogue
= assumed_vf
/ 2;
3991 if (dump_enabled_p ())
3992 dump_printf (MSG_NOTE
, "cost model: "
3993 "epilogue peel iters set to vf/2 because "
3994 "peeling for alignment is unknown.\n");
3996 /* See the same reason above in peel_iters_prologue calculation. */
3997 epilogue_need_br_taken_cost
= true;
3998 epilogue_need_br_not_taken_cost
= true;
4002 peel_iters_epilogue
= vect_get_peel_iters_epilogue (loop_vinfo
, npeel
);
4003 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo
) && peel_iters_epilogue
> 0)
4004 /* If peeled iterations are known but number of scalar loop
4005 iterations are unknown, count a taken branch per peeled loop. */
4006 epilogue_need_br_taken_cost
= true;
4009 stmt_info_for_cost
*si
;
4011 /* Add costs associated with peel_iters_prologue. */
4012 if (peel_iters_prologue
)
4013 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo
), j
, si
)
4015 (void) add_stmt_cost (loop_vinfo
, target_cost_data
,
4016 si
->count
* peel_iters_prologue
, si
->kind
,
4017 si
->stmt_info
, si
->vectype
, si
->misalign
,
4021 /* Add costs associated with peel_iters_epilogue. */
4022 if (peel_iters_epilogue
)
4023 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo
), j
, si
)
4025 (void) add_stmt_cost (loop_vinfo
, target_cost_data
,
4026 si
->count
* peel_iters_epilogue
, si
->kind
,
4027 si
->stmt_info
, si
->vectype
, si
->misalign
,
4031 /* Add possible cond_branch_taken/cond_branch_not_taken cost. */
4033 if (prologue_need_br_taken_cost
)
4034 (void) add_stmt_cost (loop_vinfo
, target_cost_data
, 1, cond_branch_taken
,
4035 NULL
, NULL_TREE
, 0, vect_prologue
);
4037 if (prologue_need_br_not_taken_cost
)
4038 (void) add_stmt_cost (loop_vinfo
, target_cost_data
, 1,
4039 cond_branch_not_taken
, NULL
, NULL_TREE
, 0,
4042 if (epilogue_need_br_taken_cost
)
4043 (void) add_stmt_cost (loop_vinfo
, target_cost_data
, 1, cond_branch_taken
,
4044 NULL
, NULL_TREE
, 0, vect_epilogue
);
4046 if (epilogue_need_br_not_taken_cost
)
4047 (void) add_stmt_cost (loop_vinfo
, target_cost_data
, 1,
4048 cond_branch_not_taken
, NULL
, NULL_TREE
, 0,
4051 /* Take care of special costs for rgroup controls of partial vectors. */
4052 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo
))
4054 /* Calculate how many masks we need to generate. */
4055 unsigned int num_masks
= 0;
4056 rgroup_controls
*rgm
;
4057 unsigned int num_vectors_m1
;
4058 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo
), num_vectors_m1
, rgm
)
4060 num_masks
+= num_vectors_m1
+ 1;
4061 gcc_assert (num_masks
> 0);
4063 /* In the worst case, we need to generate each mask in the prologue
4064 and in the loop body. One of the loop body mask instructions
4065 replaces the comparison in the scalar loop, and since we don't
4066 count the scalar comparison against the scalar body, we shouldn't
4067 count that vector instruction against the vector body either.
4069 Sometimes we can use unpacks instead of generating prologue
4070 masks and sometimes the prologue mask will fold to a constant,
4071 so the actual prologue cost might be smaller. However, it's
4072 simpler and safer to use the worst-case cost; if this ends up
4073 being the tie-breaker between vectorizing or not, then it's
4074 probably better not to vectorize. */
4075 (void) add_stmt_cost (loop_vinfo
, target_cost_data
, num_masks
,
4076 vector_stmt
, NULL
, NULL_TREE
, 0, vect_prologue
);
4077 (void) add_stmt_cost (loop_vinfo
, target_cost_data
, num_masks
- 1,
4078 vector_stmt
, NULL
, NULL_TREE
, 0, vect_body
);
4080 else if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo
))
4082 /* Referring to the functions vect_set_loop_condition_partial_vectors
4083 and vect_set_loop_controls_directly, we need to generate each
4084 length in the prologue and in the loop body if required. Although
4085 there are some possible optimizations, we consider the worst case
4088 bool niters_known_p
= LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo
);
4090 = (!LOOP_VINFO_EPILOGUE_P (loop_vinfo
)
4091 && !vect_known_niters_smaller_than_vf (loop_vinfo
));
4093 /* Calculate how many statements to be added. */
4094 unsigned int prologue_stmts
= 0;
4095 unsigned int body_stmts
= 0;
4097 rgroup_controls
*rgc
;
4098 unsigned int num_vectors_m1
;
4099 FOR_EACH_VEC_ELT (LOOP_VINFO_LENS (loop_vinfo
), num_vectors_m1
, rgc
)
4102 /* May need one SHIFT for nitems_total computation. */
4103 unsigned nitems
= rgc
->max_nscalars_per_iter
* rgc
->factor
;
4104 if (nitems
!= 1 && !niters_known_p
)
4105 prologue_stmts
+= 1;
4107 /* May need one MAX and one MINUS for wrap around. */
4108 if (vect_rgroup_iv_might_wrap_p (loop_vinfo
, rgc
))
4109 prologue_stmts
+= 2;
4111 /* Need one MAX and one MINUS for each batch limit excepting for
4113 prologue_stmts
+= num_vectors_m1
* 2;
4115 unsigned int num_vectors
= num_vectors_m1
+ 1;
4117 /* Need to set up lengths in prologue, only one MIN required
4118 for each since start index is zero. */
4119 prologue_stmts
+= num_vectors
;
4121 /* Each may need two MINs and one MINUS to update lengths in body
4122 for next iteration. */
4124 body_stmts
+= 3 * num_vectors
;
4127 (void) add_stmt_cost (loop_vinfo
, target_cost_data
, prologue_stmts
,
4128 scalar_stmt
, NULL
, NULL_TREE
, 0, vect_prologue
);
4129 (void) add_stmt_cost (loop_vinfo
, target_cost_data
, body_stmts
,
4130 scalar_stmt
, NULL
, NULL_TREE
, 0, vect_body
);
4133 /* FORNOW: The scalar outside cost is incremented in one of the
4136 1. The vectorizer checks for alignment and aliasing and generates
4137 a condition that allows dynamic vectorization. A cost model
4138 check is ANDED with the versioning condition. Hence scalar code
4139 path now has the added cost of the versioning check.
4141 if (cost > th & versioning_check)
4144 Hence run-time scalar is incremented by not-taken branch cost.
4146 2. The vectorizer then checks if a prologue is required. If the
4147 cost model check was not done before during versioning, it has to
4148 be done before the prologue check.
4151 prologue = scalar_iters
4156 if (prologue == num_iters)
4159 Hence the run-time scalar cost is incremented by a taken branch,
4160 plus a not-taken branch, plus a taken branch cost.
4162 3. The vectorizer then checks if an epilogue is required. If the
4163 cost model check was not done before during prologue check, it
4164 has to be done with the epilogue check.
4170 if (prologue == num_iters)
4173 if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
4176 Hence the run-time scalar cost should be incremented by 2 taken
4179 TODO: The back end may reorder the BBS's differently and reverse
4180 conditions/branch directions. Change the estimates below to
4181 something more reasonable. */
4183 /* If the number of iterations is known and we do not do versioning, we can
4184 decide whether to vectorize at compile time. Hence the scalar version
4185 do not carry cost model guard costs. */
4186 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo
)
4187 || LOOP_REQUIRES_VERSIONING (loop_vinfo
))
4189 /* Cost model check occurs at versioning. */
4190 if (LOOP_REQUIRES_VERSIONING (loop_vinfo
))
4191 scalar_outside_cost
+= vect_get_stmt_cost (cond_branch_not_taken
);
4194 /* Cost model check occurs at prologue generation. */
4195 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo
) < 0)
4196 scalar_outside_cost
+= 2 * vect_get_stmt_cost (cond_branch_taken
)
4197 + vect_get_stmt_cost (cond_branch_not_taken
);
4198 /* Cost model check occurs at epilogue generation. */
4200 scalar_outside_cost
+= 2 * vect_get_stmt_cost (cond_branch_taken
);
4204 /* Complete the target-specific cost calculations. */
4205 finish_cost (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo
), &vec_prologue_cost
,
4206 &vec_inside_cost
, &vec_epilogue_cost
);
4208 vec_outside_cost
= (int)(vec_prologue_cost
+ vec_epilogue_cost
);
4210 /* Stash the costs so that we can compare two loop_vec_infos. */
4211 loop_vinfo
->vec_inside_cost
= vec_inside_cost
;
4212 loop_vinfo
->vec_outside_cost
= vec_outside_cost
;
4214 if (dump_enabled_p ())
4216 dump_printf_loc (MSG_NOTE
, vect_location
, "Cost model analysis: \n");
4217 dump_printf (MSG_NOTE
, " Vector inside of loop cost: %d\n",
4219 dump_printf (MSG_NOTE
, " Vector prologue cost: %d\n",
4221 dump_printf (MSG_NOTE
, " Vector epilogue cost: %d\n",
4223 dump_printf (MSG_NOTE
, " Scalar iteration cost: %d\n",
4224 scalar_single_iter_cost
);
4225 dump_printf (MSG_NOTE
, " Scalar outside cost: %d\n",
4226 scalar_outside_cost
);
4227 dump_printf (MSG_NOTE
, " Vector outside cost: %d\n",
4229 dump_printf (MSG_NOTE
, " prologue iterations: %d\n",
4230 peel_iters_prologue
);
4231 dump_printf (MSG_NOTE
, " epilogue iterations: %d\n",
4232 peel_iters_epilogue
);
4235 /* Calculate number of iterations required to make the vector version
4236 profitable, relative to the loop bodies only. The following condition
4238 SIC * niters + SOC > VIC * ((niters - NPEEL) / VF) + VOC
4240 SIC = scalar iteration cost, VIC = vector iteration cost,
4241 VOC = vector outside cost, VF = vectorization factor,
4242 NPEEL = prologue iterations + epilogue iterations,
4243 SOC = scalar outside cost for run time cost model check. */
4245 int saving_per_viter
= (scalar_single_iter_cost
* assumed_vf
4247 if (saving_per_viter
<= 0)
4249 if (LOOP_VINFO_LOOP (loop_vinfo
)->force_vectorize
)
4250 warning_at (vect_location
.get_location_t (), OPT_Wopenmp_simd
,
4251 "vectorization did not happen for a simd loop");
4253 if (dump_enabled_p ())
4254 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
4255 "cost model: the vector iteration cost = %d "
4256 "divided by the scalar iteration cost = %d "
4257 "is greater or equal to the vectorization factor = %d"
4259 vec_inside_cost
, scalar_single_iter_cost
, assumed_vf
);
4260 *ret_min_profitable_niters
= -1;
4261 *ret_min_profitable_estimate
= -1;
4265 /* ??? The "if" arm is written to handle all cases; see below for what
4266 we would do for !LOOP_VINFO_USING_PARTIAL_VECTORS_P. */
4267 if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo
))
4269 /* Rewriting the condition above in terms of the number of
4270 vector iterations (vniters) rather than the number of
4271 scalar iterations (niters) gives:
4273 SIC * (vniters * VF + NPEEL) + SOC > VIC * vniters + VOC
4275 <==> vniters * (SIC * VF - VIC) > VOC - SIC * NPEEL - SOC
4277 For integer N, X and Y when X > 0:
4279 N * X > Y <==> N >= (Y /[floor] X) + 1. */
4280 int outside_overhead
= (vec_outside_cost
4281 - scalar_single_iter_cost
* peel_iters_prologue
4282 - scalar_single_iter_cost
* peel_iters_epilogue
4283 - scalar_outside_cost
);
4284 /* We're only interested in cases that require at least one
4285 vector iteration. */
4286 int min_vec_niters
= 1;
4287 if (outside_overhead
> 0)
4288 min_vec_niters
= outside_overhead
/ saving_per_viter
+ 1;
4290 if (dump_enabled_p ())
4291 dump_printf (MSG_NOTE
, " Minimum number of vector iterations: %d\n",
4294 if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo
))
4296 /* Now that we know the minimum number of vector iterations,
4297 find the minimum niters for which the scalar cost is larger:
4299 SIC * niters > VIC * vniters + VOC - SOC
4301 We know that the minimum niters is no more than
4302 vniters * VF + NPEEL, but it might be (and often is) less
4303 than that if a partial vector iteration is cheaper than the
4304 equivalent scalar code. */
4305 int threshold
= (vec_inside_cost
* min_vec_niters
4307 - scalar_outside_cost
);
4309 min_profitable_iters
= 1;
4311 min_profitable_iters
= threshold
/ scalar_single_iter_cost
+ 1;
4314 /* Convert the number of vector iterations into a number of
4315 scalar iterations. */
4316 min_profitable_iters
= (min_vec_niters
* assumed_vf
4317 + peel_iters_prologue
4318 + peel_iters_epilogue
);
4322 min_profitable_iters
= ((vec_outside_cost
- scalar_outside_cost
)
4324 - vec_inside_cost
* peel_iters_prologue
4325 - vec_inside_cost
* peel_iters_epilogue
);
4326 if (min_profitable_iters
<= 0)
4327 min_profitable_iters
= 0;
4330 min_profitable_iters
/= saving_per_viter
;
4332 if ((scalar_single_iter_cost
* assumed_vf
* min_profitable_iters
)
4333 <= (((int) vec_inside_cost
* min_profitable_iters
)
4334 + (((int) vec_outside_cost
- scalar_outside_cost
)
4336 min_profitable_iters
++;
4340 if (dump_enabled_p ())
4341 dump_printf (MSG_NOTE
,
4342 " Calculated minimum iters for profitability: %d\n",
4343 min_profitable_iters
);
4345 if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo
)
4346 && min_profitable_iters
< (assumed_vf
+ peel_iters_prologue
))
4347 /* We want the vectorized loop to execute at least once. */
4348 min_profitable_iters
= assumed_vf
+ peel_iters_prologue
;
4349 else if (min_profitable_iters
< peel_iters_prologue
)
4350 /* For LOOP_VINFO_USING_PARTIAL_VECTORS_P, we need to ensure the
4351 vectorized loop executes at least once. */
4352 min_profitable_iters
= peel_iters_prologue
;
4354 if (dump_enabled_p ())
4355 dump_printf_loc (MSG_NOTE
, vect_location
,
4356 " Runtime profitability threshold = %d\n",
4357 min_profitable_iters
);
4359 *ret_min_profitable_niters
= min_profitable_iters
;
4361 /* Calculate number of iterations required to make the vector version
4362 profitable, relative to the loop bodies only.
4364 Non-vectorized variant is SIC * niters and it must win over vector
4365 variant on the expected loop trip count. The following condition must hold true:
4366 SIC * niters > VIC * ((niters - NPEEL) / VF) + VOC + SOC */
4368 if (vec_outside_cost
<= 0)
4369 min_profitable_estimate
= 0;
4370 /* ??? This "else if" arm is written to handle all cases; see below for
4371 what we would do for !LOOP_VINFO_USING_PARTIAL_VECTORS_P. */
4372 else if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo
))
4374 /* This is a repeat of the code above, but with + SOC rather
4376 int outside_overhead
= (vec_outside_cost
4377 - scalar_single_iter_cost
* peel_iters_prologue
4378 - scalar_single_iter_cost
* peel_iters_epilogue
4379 + scalar_outside_cost
);
4380 int min_vec_niters
= 1;
4381 if (outside_overhead
> 0)
4382 min_vec_niters
= outside_overhead
/ saving_per_viter
+ 1;
4384 if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo
))
4386 int threshold
= (vec_inside_cost
* min_vec_niters
4388 + scalar_outside_cost
);
4389 min_profitable_estimate
= threshold
/ scalar_single_iter_cost
+ 1;
4392 min_profitable_estimate
= (min_vec_niters
* assumed_vf
4393 + peel_iters_prologue
4394 + peel_iters_epilogue
);
4398 min_profitable_estimate
= ((vec_outside_cost
+ scalar_outside_cost
)
4400 - vec_inside_cost
* peel_iters_prologue
4401 - vec_inside_cost
* peel_iters_epilogue
)
4402 / ((scalar_single_iter_cost
* assumed_vf
)
4405 min_profitable_estimate
= MAX (min_profitable_estimate
, min_profitable_iters
);
4406 if (dump_enabled_p ())
4407 dump_printf_loc (MSG_NOTE
, vect_location
,
4408 " Static estimate profitability threshold = %d\n",
4409 min_profitable_estimate
);
4411 *ret_min_profitable_estimate
= min_profitable_estimate
;
4414 /* Writes into SEL a mask for a vec_perm, equivalent to a vec_shr by OFFSET
4415 vector elements (not bits) for a vector with NELT elements. */
4417 calc_vec_perm_mask_for_shift (unsigned int offset
, unsigned int nelt
,
4418 vec_perm_builder
*sel
)
4420 /* The encoding is a single stepped pattern. Any wrap-around is handled
4421 by vec_perm_indices. */
4422 sel
->new_vector (nelt
, 1, 3);
4423 for (unsigned int i
= 0; i
< 3; i
++)
4424 sel
->quick_push (i
+ offset
);
4427 /* Checks whether the target supports whole-vector shifts for vectors of mode
4428 MODE. This is the case if _either_ the platform handles vec_shr_optab, _or_
4429 it supports vec_perm_const with masks for all necessary shift amounts. */
4431 have_whole_vector_shift (machine_mode mode
)
4433 if (optab_handler (vec_shr_optab
, mode
) != CODE_FOR_nothing
)
4436 /* Variable-length vectors should be handled via the optab. */
4438 if (!GET_MODE_NUNITS (mode
).is_constant (&nelt
))
4441 vec_perm_builder sel
;
4442 vec_perm_indices indices
;
4443 for (unsigned int i
= nelt
/ 2; i
>= 1; i
/= 2)
4445 calc_vec_perm_mask_for_shift (i
, nelt
, &sel
);
4446 indices
.new_vector (sel
, 2, nelt
);
4447 if (!can_vec_perm_const_p (mode
, indices
, false))
4453 /* TODO: Close dependency between vect_model_*_cost and vectorizable_*
4454 functions. Design better to avoid maintenance issues. */
4456 /* Function vect_model_reduction_cost.
4458 Models cost for a reduction operation, including the vector ops
4459 generated within the strip-mine loop in some cases, the initial
4460 definition before the loop, and the epilogue code that must be generated. */
4463 vect_model_reduction_cost (loop_vec_info loop_vinfo
,
4464 stmt_vec_info stmt_info
, internal_fn reduc_fn
,
4465 vect_reduction_type reduction_type
,
4466 int ncopies
, stmt_vector_for_cost
*cost_vec
)
4468 int prologue_cost
= 0, epilogue_cost
= 0, inside_cost
;
4469 enum tree_code code
;
4473 class loop
*loop
= NULL
;
4476 loop
= LOOP_VINFO_LOOP (loop_vinfo
);
4478 /* Condition reductions generate two reductions in the loop. */
4479 if (reduction_type
== COND_REDUCTION
)
4482 vectype
= STMT_VINFO_VECTYPE (stmt_info
);
4483 mode
= TYPE_MODE (vectype
);
4484 stmt_vec_info orig_stmt_info
= vect_orig_stmt (stmt_info
);
4486 code
= gimple_assign_rhs_code (orig_stmt_info
->stmt
);
4488 if (reduction_type
== EXTRACT_LAST_REDUCTION
)
4489 /* No extra instructions are needed in the prologue. The loop body
4490 operations are costed in vectorizable_condition. */
4492 else if (reduction_type
== FOLD_LEFT_REDUCTION
)
4494 /* No extra instructions needed in the prologue. */
4497 if (reduc_fn
!= IFN_LAST
)
4498 /* Count one reduction-like operation per vector. */
4499 inside_cost
= record_stmt_cost (cost_vec
, ncopies
, vec_to_scalar
,
4500 stmt_info
, 0, vect_body
);
4503 /* Use NELEMENTS extracts and NELEMENTS scalar ops. */
4504 unsigned int nelements
= ncopies
* vect_nunits_for_cost (vectype
);
4505 inside_cost
= record_stmt_cost (cost_vec
, nelements
,
4506 vec_to_scalar
, stmt_info
, 0,
4508 inside_cost
+= record_stmt_cost (cost_vec
, nelements
,
4509 scalar_stmt
, stmt_info
, 0,
4515 /* Add in cost for initial definition.
4516 For cond reduction we have four vectors: initial index, step,
4517 initial result of the data reduction, initial value of the index
4519 int prologue_stmts
= reduction_type
== COND_REDUCTION
? 4 : 1;
4520 prologue_cost
+= record_stmt_cost (cost_vec
, prologue_stmts
,
4521 scalar_to_vec
, stmt_info
, 0,
4525 /* Determine cost of epilogue code.
4527 We have a reduction operator that will reduce the vector in one statement.
4528 Also requires scalar extract. */
4530 if (!loop
|| !nested_in_vect_loop_p (loop
, orig_stmt_info
))
4532 if (reduc_fn
!= IFN_LAST
)
4534 if (reduction_type
== COND_REDUCTION
)
4536 /* An EQ stmt and an COND_EXPR stmt. */
4537 epilogue_cost
+= record_stmt_cost (cost_vec
, 2,
4538 vector_stmt
, stmt_info
, 0,
4540 /* Reduction of the max index and a reduction of the found
4542 epilogue_cost
+= record_stmt_cost (cost_vec
, 2,
4543 vec_to_scalar
, stmt_info
, 0,
4545 /* A broadcast of the max value. */
4546 epilogue_cost
+= record_stmt_cost (cost_vec
, 1,
4547 scalar_to_vec
, stmt_info
, 0,
4552 epilogue_cost
+= record_stmt_cost (cost_vec
, 1, vector_stmt
,
4553 stmt_info
, 0, vect_epilogue
);
4554 epilogue_cost
+= record_stmt_cost (cost_vec
, 1,
4555 vec_to_scalar
, stmt_info
, 0,
4559 else if (reduction_type
== COND_REDUCTION
)
4561 unsigned estimated_nunits
= vect_nunits_for_cost (vectype
);
4562 /* Extraction of scalar elements. */
4563 epilogue_cost
+= record_stmt_cost (cost_vec
,
4564 2 * estimated_nunits
,
4565 vec_to_scalar
, stmt_info
, 0,
4567 /* Scalar max reductions via COND_EXPR / MAX_EXPR. */
4568 epilogue_cost
+= record_stmt_cost (cost_vec
,
4569 2 * estimated_nunits
- 3,
4570 scalar_stmt
, stmt_info
, 0,
4573 else if (reduction_type
== EXTRACT_LAST_REDUCTION
4574 || reduction_type
== FOLD_LEFT_REDUCTION
)
4575 /* No extra instructions need in the epilogue. */
4579 int vec_size_in_bits
= tree_to_uhwi (TYPE_SIZE (vectype
));
4581 TYPE_SIZE (TREE_TYPE (gimple_assign_lhs (orig_stmt_info
->stmt
)));
4582 int element_bitsize
= tree_to_uhwi (bitsize
);
4583 int nelements
= vec_size_in_bits
/ element_bitsize
;
4585 if (code
== COND_EXPR
)
4588 optab
= optab_for_tree_code (code
, vectype
, optab_default
);
4590 /* We have a whole vector shift available. */
4591 if (optab
!= unknown_optab
4592 && VECTOR_MODE_P (mode
)
4593 && optab_handler (optab
, mode
) != CODE_FOR_nothing
4594 && have_whole_vector_shift (mode
))
4596 /* Final reduction via vector shifts and the reduction operator.
4597 Also requires scalar extract. */
4598 epilogue_cost
+= record_stmt_cost (cost_vec
,
4599 exact_log2 (nelements
) * 2,
4600 vector_stmt
, stmt_info
, 0,
4602 epilogue_cost
+= record_stmt_cost (cost_vec
, 1,
4603 vec_to_scalar
, stmt_info
, 0,
4607 /* Use extracts and reduction op for final reduction. For N
4608 elements, we have N extracts and N-1 reduction ops. */
4609 epilogue_cost
+= record_stmt_cost (cost_vec
,
4610 nelements
+ nelements
- 1,
4611 vector_stmt
, stmt_info
, 0,
4616 if (dump_enabled_p ())
4617 dump_printf (MSG_NOTE
,
4618 "vect_model_reduction_cost: inside_cost = %d, "
4619 "prologue_cost = %d, epilogue_cost = %d .\n", inside_cost
,
4620 prologue_cost
, epilogue_cost
);
4625 /* Function get_initial_def_for_reduction
4628 STMT_VINFO - a stmt that performs a reduction operation in the loop.
4629 INIT_VAL - the initial value of the reduction variable
4632 ADJUSTMENT_DEF - a tree that holds a value to be added to the final result
4633 of the reduction (used for adjusting the epilog - see below).
4634 Return a vector variable, initialized according to the operation that
4635 STMT_VINFO performs. This vector will be used as the initial value
4636 of the vector of partial results.
4638 Option1 (adjust in epilog): Initialize the vector as follows:
4639 add/bit or/xor: [0,0,...,0,0]
4640 mult/bit and: [1,1,...,1,1]
4641 min/max/cond_expr: [init_val,init_val,..,init_val,init_val]
4642 and when necessary (e.g. add/mult case) let the caller know
4643 that it needs to adjust the result by init_val.
4645 Option2: Initialize the vector as follows:
4646 add/bit or/xor: [init_val,0,0,...,0]
4647 mult/bit and: [init_val,1,1,...,1]
4648 min/max/cond_expr: [init_val,init_val,...,init_val]
4649 and no adjustments are needed.
4651 For example, for the following code:
4657 STMT_VINFO is 's = s + a[i]', and the reduction variable is 's'.
4658 For a vector of 4 units, we want to return either [0,0,0,init_val],
4659 or [0,0,0,0] and let the caller know that it needs to adjust
4660 the result at the end by 'init_val'.
4662 FORNOW, we are using the 'adjust in epilog' scheme, because this way the
4663 initialization vector is simpler (same element in all entries), if
4664 ADJUSTMENT_DEF is not NULL, and Option2 otherwise.
4666 A cost model should help decide between these two schemes. */
4669 get_initial_def_for_reduction (loop_vec_info loop_vinfo
,
4670 stmt_vec_info stmt_vinfo
,
4671 enum tree_code code
, tree init_val
,
4672 tree
*adjustment_def
)
4674 class loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
4675 tree scalar_type
= TREE_TYPE (init_val
);
4676 tree vectype
= get_vectype_for_scalar_type (loop_vinfo
, scalar_type
);
4679 REAL_VALUE_TYPE real_init_val
= dconst0
;
4680 int int_init_val
= 0;
4681 gimple_seq stmts
= NULL
;
4683 gcc_assert (vectype
);
4685 gcc_assert (POINTER_TYPE_P (scalar_type
) || INTEGRAL_TYPE_P (scalar_type
)
4686 || SCALAR_FLOAT_TYPE_P (scalar_type
));
4688 gcc_assert (nested_in_vect_loop_p (loop
, stmt_vinfo
)
4689 || loop
== (gimple_bb (stmt_vinfo
->stmt
))->loop_father
);
4691 /* ADJUSTMENT_DEF is NULL when called from
4692 vect_create_epilog_for_reduction to vectorize double reduction. */
4694 *adjustment_def
= NULL
;
4698 case WIDEN_SUM_EXPR
:
4708 if (code
== MULT_EXPR
)
4710 real_init_val
= dconst1
;
4714 if (code
== BIT_AND_EXPR
)
4717 if (SCALAR_FLOAT_TYPE_P (scalar_type
))
4718 def_for_init
= build_real (scalar_type
, real_init_val
);
4720 def_for_init
= build_int_cst (scalar_type
, int_init_val
);
4722 if (adjustment_def
|| operand_equal_p (def_for_init
, init_val
, 0))
4724 /* Option1: the first element is '0' or '1' as well. */
4725 if (!operand_equal_p (def_for_init
, init_val
, 0))
4726 *adjustment_def
= init_val
;
4727 init_def
= gimple_build_vector_from_val (&stmts
, vectype
,
4730 else if (!TYPE_VECTOR_SUBPARTS (vectype
).is_constant ())
4732 /* Option2 (variable length): the first element is INIT_VAL. */
4733 init_def
= gimple_build_vector_from_val (&stmts
, vectype
,
4735 init_def
= gimple_build (&stmts
, CFN_VEC_SHL_INSERT
,
4736 vectype
, init_def
, init_val
);
4740 /* Option2: the first element is INIT_VAL. */
4741 tree_vector_builder
elts (vectype
, 1, 2);
4742 elts
.quick_push (init_val
);
4743 elts
.quick_push (def_for_init
);
4744 init_def
= gimple_build_vector (&stmts
, &elts
);
4753 init_val
= gimple_convert (&stmts
, TREE_TYPE (vectype
), init_val
);
4754 init_def
= gimple_build_vector_from_val (&stmts
, vectype
, init_val
);
4763 gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop
), stmts
);
4767 /* Get at the initial defs for the reduction PHIs in SLP_NODE.
4768 NUMBER_OF_VECTORS is the number of vector defs to create.
4769 If NEUTRAL_OP is nonnull, introducing extra elements of that
4770 value will not change the result. */
4773 get_initial_defs_for_reduction (vec_info
*vinfo
,
4775 vec
<tree
> *vec_oprnds
,
4776 unsigned int number_of_vectors
,
4777 bool reduc_chain
, tree neutral_op
)
4779 vec
<stmt_vec_info
> stmts
= SLP_TREE_SCALAR_STMTS (slp_node
);
4780 stmt_vec_info stmt_vinfo
= stmts
[0];
4781 unsigned HOST_WIDE_INT nunits
;
4782 unsigned j
, number_of_places_left_in_vector
;
4784 unsigned int group_size
= stmts
.length ();
4788 vector_type
= STMT_VINFO_VECTYPE (stmt_vinfo
);
4790 gcc_assert (STMT_VINFO_DEF_TYPE (stmt_vinfo
) == vect_reduction_def
);
4792 loop
= (gimple_bb (stmt_vinfo
->stmt
))->loop_father
;
4794 edge pe
= loop_preheader_edge (loop
);
4796 gcc_assert (!reduc_chain
|| neutral_op
);
4798 /* NUMBER_OF_COPIES is the number of times we need to use the same values in
4799 created vectors. It is greater than 1 if unrolling is performed.
4801 For example, we have two scalar operands, s1 and s2 (e.g., group of
4802 strided accesses of size two), while NUNITS is four (i.e., four scalars
4803 of this type can be packed in a vector). The output vector will contain
4804 two copies of each scalar operand: {s1, s2, s1, s2}. (NUMBER_OF_COPIES
4807 If REDUC_GROUP_SIZE > NUNITS, the scalars will be split into several
4808 vectors containing the operands.
4810 For example, NUNITS is four as before, and the group size is 8
4811 (s1, s2, ..., s8). We will create two vectors {s1, s2, s3, s4} and
4812 {s5, s6, s7, s8}. */
4814 if (!TYPE_VECTOR_SUBPARTS (vector_type
).is_constant (&nunits
))
4815 nunits
= group_size
;
4817 number_of_places_left_in_vector
= nunits
;
4818 bool constant_p
= true;
4819 tree_vector_builder
elts (vector_type
, nunits
, 1);
4820 elts
.quick_grow (nunits
);
4821 gimple_seq ctor_seq
= NULL
;
4822 for (j
= 0; j
< nunits
* number_of_vectors
; ++j
)
4826 stmt_vinfo
= stmts
[i
];
4828 /* Get the def before the loop. In reduction chain we have only
4829 one initial value. Else we have as many as PHIs in the group. */
4831 op
= j
!= 0 ? neutral_op
: PHI_ARG_DEF_FROM_EDGE (stmt_vinfo
->stmt
, pe
);
4832 else if (((vec_oprnds
->length () + 1) * nunits
4833 - number_of_places_left_in_vector
>= group_size
)
4837 op
= PHI_ARG_DEF_FROM_EDGE (stmt_vinfo
->stmt
, pe
);
4839 /* Create 'vect_ = {op0,op1,...,opn}'. */
4840 number_of_places_left_in_vector
--;
4841 elts
[nunits
- number_of_places_left_in_vector
- 1] = op
;
4842 if (!CONSTANT_CLASS_P (op
))
4845 if (number_of_places_left_in_vector
== 0)
4848 if (constant_p
&& !neutral_op
4849 ? multiple_p (TYPE_VECTOR_SUBPARTS (vector_type
), nunits
)
4850 : known_eq (TYPE_VECTOR_SUBPARTS (vector_type
), nunits
))
4851 /* Build the vector directly from ELTS. */
4852 init
= gimple_build_vector (&ctor_seq
, &elts
);
4853 else if (neutral_op
)
4855 /* Build a vector of the neutral value and shift the
4856 other elements into place. */
4857 init
= gimple_build_vector_from_val (&ctor_seq
, vector_type
,
4860 while (k
> 0 && elts
[k
- 1] == neutral_op
)
4865 init
= gimple_build (&ctor_seq
, CFN_VEC_SHL_INSERT
,
4866 vector_type
, init
, elts
[k
]);
4871 /* First time round, duplicate ELTS to fill the
4872 required number of vectors. */
4873 duplicate_and_interleave (vinfo
, &ctor_seq
, vector_type
, elts
,
4874 number_of_vectors
, *vec_oprnds
);
4877 vec_oprnds
->quick_push (init
);
4879 number_of_places_left_in_vector
= nunits
;
4880 elts
.new_vector (vector_type
, nunits
, 1);
4881 elts
.quick_grow (nunits
);
4885 if (ctor_seq
!= NULL
)
4886 gsi_insert_seq_on_edge_immediate (pe
, ctor_seq
);
4889 /* For a statement STMT_INFO taking part in a reduction operation return
4890 the stmt_vec_info the meta information is stored on. */
4893 info_for_reduction (vec_info
*vinfo
, stmt_vec_info stmt_info
)
4895 stmt_info
= vect_orig_stmt (stmt_info
);
4896 gcc_assert (STMT_VINFO_REDUC_DEF (stmt_info
));
4897 if (!is_a
<gphi
*> (stmt_info
->stmt
)
4898 || !VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info
)))
4899 stmt_info
= STMT_VINFO_REDUC_DEF (stmt_info
);
4900 gphi
*phi
= as_a
<gphi
*> (stmt_info
->stmt
);
4901 if (STMT_VINFO_DEF_TYPE (stmt_info
) == vect_double_reduction_def
)
4903 if (gimple_phi_num_args (phi
) == 1)
4904 stmt_info
= STMT_VINFO_REDUC_DEF (stmt_info
);
4906 else if (STMT_VINFO_DEF_TYPE (stmt_info
) == vect_nested_cycle
)
4908 edge pe
= loop_preheader_edge (gimple_bb (phi
)->loop_father
);
4910 = vinfo
->lookup_def (PHI_ARG_DEF_FROM_EDGE (phi
, pe
));
4911 if (info
&& STMT_VINFO_DEF_TYPE (info
) == vect_double_reduction_def
)
4917 /* Function vect_create_epilog_for_reduction
4919 Create code at the loop-epilog to finalize the result of a reduction
4922 STMT_INFO is the scalar reduction stmt that is being vectorized.
4923 SLP_NODE is an SLP node containing a group of reduction statements. The
4924 first one in this group is STMT_INFO.
4925 SLP_NODE_INSTANCE is the SLP node instance containing SLP_NODE
4926 REDUC_INDEX says which rhs operand of the STMT_INFO is the reduction phi
4930 1. Completes the reduction def-use cycles.
4931 2. "Reduces" each vector of partial results VECT_DEFS into a single result,
4932 by calling the function specified by REDUC_FN if available, or by
4933 other means (whole-vector shifts or a scalar loop).
4934 The function also creates a new phi node at the loop exit to preserve
4935 loop-closed form, as illustrated below.
4937 The flow at the entry to this function:
4940 vec_def = phi <vec_init, null> # REDUCTION_PHI
4941 VECT_DEF = vector_stmt # vectorized form of STMT_INFO
4942 s_loop = scalar_stmt # (scalar) STMT_INFO
4944 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
4948 The above is transformed by this function into:
4951 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
4952 VECT_DEF = vector_stmt # vectorized form of STMT_INFO
4953 s_loop = scalar_stmt # (scalar) STMT_INFO
4955 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
4956 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
4957 v_out2 = reduce <v_out1>
4958 s_out3 = extract_field <v_out2, 0>
4959 s_out4 = adjust_result <s_out3>
4965 vect_create_epilog_for_reduction (loop_vec_info loop_vinfo
,
4966 stmt_vec_info stmt_info
,
4968 slp_instance slp_node_instance
)
4970 stmt_vec_info reduc_info
= info_for_reduction (loop_vinfo
, stmt_info
);
4971 gcc_assert (reduc_info
->is_reduc_info
);
4972 /* For double reductions we need to get at the inner loop reduction
4973 stmt which has the meta info attached. Our stmt_info is that of the
4974 loop-closed PHI of the inner loop which we remember as
4975 def for the reduction PHI generation. */
4976 bool double_reduc
= false;
4977 stmt_vec_info rdef_info
= stmt_info
;
4978 if (STMT_VINFO_DEF_TYPE (stmt_info
) == vect_double_reduction_def
)
4980 gcc_assert (!slp_node
);
4981 double_reduc
= true;
4982 stmt_info
= loop_vinfo
->lookup_def (gimple_phi_arg_def
4983 (stmt_info
->stmt
, 0));
4984 stmt_info
= vect_stmt_to_vectorize (stmt_info
);
4986 gphi
*reduc_def_stmt
4987 = as_a
<gphi
*> (STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info
))->stmt
);
4988 enum tree_code code
= STMT_VINFO_REDUC_CODE (reduc_info
);
4989 internal_fn reduc_fn
= STMT_VINFO_REDUC_FN (reduc_info
);
4992 class loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
), *outer_loop
= NULL
;
4993 basic_block exit_bb
;
4996 gimple
*new_phi
= NULL
, *phi
;
4997 gimple_stmt_iterator exit_gsi
;
4998 tree new_temp
= NULL_TREE
, new_name
, new_scalar_dest
;
4999 gimple
*epilog_stmt
= NULL
;
5003 tree orig_name
, scalar_result
;
5004 imm_use_iterator imm_iter
, phi_imm_iter
;
5005 use_operand_p use_p
, phi_use_p
;
5007 bool nested_in_vect_loop
= false;
5008 auto_vec
<gimple
*> new_phis
;
5010 auto_vec
<tree
> scalar_results
;
5011 unsigned int group_size
= 1, k
;
5012 auto_vec
<gimple
*> phis
;
5013 bool slp_reduc
= false;
5014 bool direct_slp_reduc
;
5015 tree new_phi_result
;
5016 tree induction_index
= NULL_TREE
;
5019 group_size
= SLP_TREE_LANES (slp_node
);
5021 if (nested_in_vect_loop_p (loop
, stmt_info
))
5025 nested_in_vect_loop
= true;
5026 gcc_assert (!slp_node
);
5028 gcc_assert (!nested_in_vect_loop
|| double_reduc
);
5030 vectype
= STMT_VINFO_REDUC_VECTYPE (reduc_info
);
5031 gcc_assert (vectype
);
5032 mode
= TYPE_MODE (vectype
);
5034 tree initial_def
= NULL
;
5035 tree induc_val
= NULL_TREE
;
5036 tree adjustment_def
= NULL
;
5041 /* Get at the scalar def before the loop, that defines the initial value
5042 of the reduction variable. */
5043 initial_def
= PHI_ARG_DEF_FROM_EDGE (reduc_def_stmt
,
5044 loop_preheader_edge (loop
));
5045 /* Optimize: for induction condition reduction, if we can't use zero
5046 for induc_val, use initial_def. */
5047 if (STMT_VINFO_REDUC_TYPE (reduc_info
) == INTEGER_INDUC_COND_REDUCTION
)
5048 induc_val
= STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info
);
5049 else if (double_reduc
)
5051 else if (nested_in_vect_loop
)
5054 adjustment_def
= STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info
);
5061 vec_num
= SLP_TREE_VEC_STMTS (slp_node_instance
->reduc_phis
).length ();
5066 stmt_vec_info reduc_info
= loop_vinfo
->lookup_stmt (reduc_def_stmt
);
5068 ncopies
= STMT_VINFO_VEC_STMTS (reduc_info
).length ();
5071 /* For cond reductions we want to create a new vector (INDEX_COND_EXPR)
5072 which is updated with the current index of the loop for every match of
5073 the original loop's cond_expr (VEC_STMT). This results in a vector
5074 containing the last time the condition passed for that vector lane.
5075 The first match will be a 1 to allow 0 to be used for non-matching
5076 indexes. If there are no matches at all then the vector will be all
5079 PR92772: This algorithm is broken for architectures that support
5080 masked vectors, but do not provide fold_extract_last. */
5081 if (STMT_VINFO_REDUC_TYPE (reduc_info
) == COND_REDUCTION
)
5083 auto_vec
<std::pair
<tree
, bool>, 2> ccompares
;
5084 stmt_vec_info cond_info
= STMT_VINFO_REDUC_DEF (reduc_info
);
5085 cond_info
= vect_stmt_to_vectorize (cond_info
);
5086 while (cond_info
!= reduc_info
)
5088 if (gimple_assign_rhs_code (cond_info
->stmt
) == COND_EXPR
)
5090 gimple
*vec_stmt
= STMT_VINFO_VEC_STMTS (cond_info
)[0];
5091 gcc_assert (gimple_assign_rhs_code (vec_stmt
) == VEC_COND_EXPR
);
5093 (std::make_pair (unshare_expr (gimple_assign_rhs1 (vec_stmt
)),
5094 STMT_VINFO_REDUC_IDX (cond_info
) == 2));
5097 = loop_vinfo
->lookup_def (gimple_op (cond_info
->stmt
,
5098 1 + STMT_VINFO_REDUC_IDX
5100 cond_info
= vect_stmt_to_vectorize (cond_info
);
5102 gcc_assert (ccompares
.length () != 0);
5104 tree indx_before_incr
, indx_after_incr
;
5105 poly_uint64 nunits_out
= TYPE_VECTOR_SUBPARTS (vectype
);
5106 int scalar_precision
5107 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (TREE_TYPE (vectype
)));
5108 tree cr_index_scalar_type
= make_unsigned_type (scalar_precision
);
5109 tree cr_index_vector_type
= get_related_vectype_for_scalar_type
5110 (TYPE_MODE (vectype
), cr_index_scalar_type
,
5111 TYPE_VECTOR_SUBPARTS (vectype
));
5113 /* First we create a simple vector induction variable which starts
5114 with the values {1,2,3,...} (SERIES_VECT) and increments by the
5115 vector size (STEP). */
5117 /* Create a {1,2,3,...} vector. */
5118 tree series_vect
= build_index_vector (cr_index_vector_type
, 1, 1);
5120 /* Create a vector of the step value. */
5121 tree step
= build_int_cst (cr_index_scalar_type
, nunits_out
);
5122 tree vec_step
= build_vector_from_val (cr_index_vector_type
, step
);
5124 /* Create an induction variable. */
5125 gimple_stmt_iterator incr_gsi
;
5127 standard_iv_increment_position (loop
, &incr_gsi
, &insert_after
);
5128 create_iv (series_vect
, vec_step
, NULL_TREE
, loop
, &incr_gsi
,
5129 insert_after
, &indx_before_incr
, &indx_after_incr
);
5131 /* Next create a new phi node vector (NEW_PHI_TREE) which starts
5132 filled with zeros (VEC_ZERO). */
5134 /* Create a vector of 0s. */
5135 tree zero
= build_zero_cst (cr_index_scalar_type
);
5136 tree vec_zero
= build_vector_from_val (cr_index_vector_type
, zero
);
5138 /* Create a vector phi node. */
5139 tree new_phi_tree
= make_ssa_name (cr_index_vector_type
);
5140 new_phi
= create_phi_node (new_phi_tree
, loop
->header
);
5141 add_phi_arg (as_a
<gphi
*> (new_phi
), vec_zero
,
5142 loop_preheader_edge (loop
), UNKNOWN_LOCATION
);
5144 /* Now take the condition from the loops original cond_exprs
5145 and produce a new cond_exprs (INDEX_COND_EXPR) which for
5146 every match uses values from the induction variable
5147 (INDEX_BEFORE_INCR) otherwise uses values from the phi node
5149 Finally, we update the phi (NEW_PHI_TREE) to take the value of
5150 the new cond_expr (INDEX_COND_EXPR). */
5151 gimple_seq stmts
= NULL
;
5152 for (int i
= ccompares
.length () - 1; i
!= -1; --i
)
5154 tree ccompare
= ccompares
[i
].first
;
5155 if (ccompares
[i
].second
)
5156 new_phi_tree
= gimple_build (&stmts
, VEC_COND_EXPR
,
5157 cr_index_vector_type
,
5159 indx_before_incr
, new_phi_tree
);
5161 new_phi_tree
= gimple_build (&stmts
, VEC_COND_EXPR
,
5162 cr_index_vector_type
,
5164 new_phi_tree
, indx_before_incr
);
5166 gsi_insert_seq_before (&incr_gsi
, stmts
, GSI_SAME_STMT
);
5168 /* Update the phi with the vec cond. */
5169 induction_index
= new_phi_tree
;
5170 add_phi_arg (as_a
<gphi
*> (new_phi
), induction_index
,
5171 loop_latch_edge (loop
), UNKNOWN_LOCATION
);
5174 /* 2. Create epilog code.
5175 The reduction epilog code operates across the elements of the vector
5176 of partial results computed by the vectorized loop.
5177 The reduction epilog code consists of:
5179 step 1: compute the scalar result in a vector (v_out2)
5180 step 2: extract the scalar result (s_out3) from the vector (v_out2)
5181 step 3: adjust the scalar result (s_out3) if needed.
5183 Step 1 can be accomplished using one the following three schemes:
5184 (scheme 1) using reduc_fn, if available.
5185 (scheme 2) using whole-vector shifts, if available.
5186 (scheme 3) using a scalar loop. In this case steps 1+2 above are
5189 The overall epilog code looks like this:
5191 s_out0 = phi <s_loop> # original EXIT_PHI
5192 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
5193 v_out2 = reduce <v_out1> # step 1
5194 s_out3 = extract_field <v_out2, 0> # step 2
5195 s_out4 = adjust_result <s_out3> # step 3
5197 (step 3 is optional, and steps 1 and 2 may be combined).
5198 Lastly, the uses of s_out0 are replaced by s_out4. */
5201 /* 2.1 Create new loop-exit-phis to preserve loop-closed form:
5202 v_out1 = phi <VECT_DEF>
5203 Store them in NEW_PHIS. */
5206 exit_bb
= single_exit (loop
)->dest
;
5207 new_phis
.create (slp_node
? vec_num
: ncopies
);
5208 for (unsigned i
= 0; i
< vec_num
; i
++)
5211 def
= vect_get_slp_vect_def (slp_node
, i
);
5213 def
= gimple_get_lhs (STMT_VINFO_VEC_STMTS (rdef_info
)[0]);
5214 for (j
= 0; j
< ncopies
; j
++)
5216 tree new_def
= copy_ssa_name (def
);
5217 phi
= create_phi_node (new_def
, exit_bb
);
5219 new_phis
.quick_push (phi
);
5222 def
= gimple_get_lhs (STMT_VINFO_VEC_STMTS (rdef_info
)[j
]);
5223 new_phis
.quick_push (phi
);
5226 SET_PHI_ARG_DEF (phi
, single_exit (loop
)->dest_idx
, def
);
5230 exit_gsi
= gsi_after_labels (exit_bb
);
5232 /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
5233 (i.e. when reduc_fn is not available) and in the final adjustment
5234 code (if needed). Also get the original scalar reduction variable as
5235 defined in the loop. In case STMT is a "pattern-stmt" (i.e. - it
5236 represents a reduction pattern), the tree-code and scalar-def are
5237 taken from the original stmt that the pattern-stmt (STMT) replaces.
5238 Otherwise (it is a regular reduction) - the tree-code and scalar-def
5239 are taken from STMT. */
5241 stmt_vec_info orig_stmt_info
= vect_orig_stmt (stmt_info
);
5242 if (orig_stmt_info
!= stmt_info
)
5244 /* Reduction pattern */
5245 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info
));
5246 gcc_assert (STMT_VINFO_RELATED_STMT (orig_stmt_info
) == stmt_info
);
5249 scalar_dest
= gimple_assign_lhs (orig_stmt_info
->stmt
);
5250 scalar_type
= TREE_TYPE (scalar_dest
);
5251 scalar_results
.create (group_size
);
5252 new_scalar_dest
= vect_create_destination_var (scalar_dest
, NULL
);
5253 bitsize
= TYPE_SIZE (scalar_type
);
5255 /* SLP reduction without reduction chain, e.g.,
5259 b2 = operation (b1) */
5260 slp_reduc
= (slp_node
&& !REDUC_GROUP_FIRST_ELEMENT (stmt_info
));
5262 /* True if we should implement SLP_REDUC using native reduction operations
5263 instead of scalar operations. */
5264 direct_slp_reduc
= (reduc_fn
!= IFN_LAST
5266 && !TYPE_VECTOR_SUBPARTS (vectype
).is_constant ());
5268 /* In case of reduction chain, e.g.,
5271 a3 = operation (a2),
5273 we may end up with more than one vector result. Here we reduce them to
5275 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info
) || direct_slp_reduc
)
5277 gimple_seq stmts
= NULL
;
5278 tree first_vect
= PHI_RESULT (new_phis
[0]);
5279 first_vect
= gimple_convert (&stmts
, vectype
, first_vect
);
5280 for (k
= 1; k
< new_phis
.length (); k
++)
5282 gimple
*next_phi
= new_phis
[k
];
5283 tree second_vect
= PHI_RESULT (next_phi
);
5284 second_vect
= gimple_convert (&stmts
, vectype
, second_vect
);
5285 first_vect
= gimple_build (&stmts
, code
, vectype
,
5286 first_vect
, second_vect
);
5288 gsi_insert_seq_before (&exit_gsi
, stmts
, GSI_SAME_STMT
);
5290 new_phi_result
= first_vect
;
5291 new_phis
.truncate (0);
5292 new_phis
.safe_push (SSA_NAME_DEF_STMT (first_vect
));
5294 /* Likewise if we couldn't use a single defuse cycle. */
5295 else if (ncopies
> 1)
5297 gimple_seq stmts
= NULL
;
5298 tree first_vect
= PHI_RESULT (new_phis
[0]);
5299 first_vect
= gimple_convert (&stmts
, vectype
, first_vect
);
5300 for (int k
= 1; k
< ncopies
; ++k
)
5302 tree second_vect
= PHI_RESULT (new_phis
[k
]);
5303 second_vect
= gimple_convert (&stmts
, vectype
, second_vect
);
5304 first_vect
= gimple_build (&stmts
, code
, vectype
,
5305 first_vect
, second_vect
);
5307 gsi_insert_seq_before (&exit_gsi
, stmts
, GSI_SAME_STMT
);
5308 new_phi_result
= first_vect
;
5309 new_phis
.truncate (0);
5310 new_phis
.safe_push (SSA_NAME_DEF_STMT (first_vect
));
5313 new_phi_result
= PHI_RESULT (new_phis
[0]);
5315 if (STMT_VINFO_REDUC_TYPE (reduc_info
) == COND_REDUCTION
5316 && reduc_fn
!= IFN_LAST
)
5318 /* For condition reductions, we have a vector (NEW_PHI_RESULT) containing
5319 various data values where the condition matched and another vector
5320 (INDUCTION_INDEX) containing all the indexes of those matches. We
5321 need to extract the last matching index (which will be the index with
5322 highest value) and use this to index into the data vector.
5323 For the case where there were no matches, the data vector will contain
5324 all default values and the index vector will be all zeros. */
5326 /* Get various versions of the type of the vector of indexes. */
5327 tree index_vec_type
= TREE_TYPE (induction_index
);
5328 gcc_checking_assert (TYPE_UNSIGNED (index_vec_type
));
5329 tree index_scalar_type
= TREE_TYPE (index_vec_type
);
5330 tree index_vec_cmp_type
= truth_type_for (index_vec_type
);
5332 /* Get an unsigned integer version of the type of the data vector. */
5333 int scalar_precision
5334 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type
));
5335 tree scalar_type_unsigned
= make_unsigned_type (scalar_precision
);
5336 tree vectype_unsigned
= get_same_sized_vectype (scalar_type_unsigned
,
5339 /* First we need to create a vector (ZERO_VEC) of zeros and another
5340 vector (MAX_INDEX_VEC) filled with the last matching index, which we
5341 can create using a MAX reduction and then expanding.
5342 In the case where the loop never made any matches, the max index will
5345 /* Vector of {0, 0, 0,...}. */
5346 tree zero_vec
= build_zero_cst (vectype
);
5348 gimple_seq stmts
= NULL
;
5349 new_phi_result
= gimple_convert (&stmts
, vectype
, new_phi_result
);
5350 gsi_insert_seq_before (&exit_gsi
, stmts
, GSI_SAME_STMT
);
5352 /* Find maximum value from the vector of found indexes. */
5353 tree max_index
= make_ssa_name (index_scalar_type
);
5354 gcall
*max_index_stmt
= gimple_build_call_internal (IFN_REDUC_MAX
,
5355 1, induction_index
);
5356 gimple_call_set_lhs (max_index_stmt
, max_index
);
5357 gsi_insert_before (&exit_gsi
, max_index_stmt
, GSI_SAME_STMT
);
5359 /* Vector of {max_index, max_index, max_index,...}. */
5360 tree max_index_vec
= make_ssa_name (index_vec_type
);
5361 tree max_index_vec_rhs
= build_vector_from_val (index_vec_type
,
5363 gimple
*max_index_vec_stmt
= gimple_build_assign (max_index_vec
,
5365 gsi_insert_before (&exit_gsi
, max_index_vec_stmt
, GSI_SAME_STMT
);
5367 /* Next we compare the new vector (MAX_INDEX_VEC) full of max indexes
5368 with the vector (INDUCTION_INDEX) of found indexes, choosing values
5369 from the data vector (NEW_PHI_RESULT) for matches, 0 (ZERO_VEC)
5370 otherwise. Only one value should match, resulting in a vector
5371 (VEC_COND) with one data value and the rest zeros.
5372 In the case where the loop never made any matches, every index will
5373 match, resulting in a vector with all data values (which will all be
5374 the default value). */
5376 /* Compare the max index vector to the vector of found indexes to find
5377 the position of the max value. */
5378 tree vec_compare
= make_ssa_name (index_vec_cmp_type
);
5379 gimple
*vec_compare_stmt
= gimple_build_assign (vec_compare
, EQ_EXPR
,
5382 gsi_insert_before (&exit_gsi
, vec_compare_stmt
, GSI_SAME_STMT
);
5384 /* Use the compare to choose either values from the data vector or
5386 tree vec_cond
= make_ssa_name (vectype
);
5387 gimple
*vec_cond_stmt
= gimple_build_assign (vec_cond
, VEC_COND_EXPR
,
5388 vec_compare
, new_phi_result
,
5390 gsi_insert_before (&exit_gsi
, vec_cond_stmt
, GSI_SAME_STMT
);
5392 /* Finally we need to extract the data value from the vector (VEC_COND)
5393 into a scalar (MATCHED_DATA_REDUC). Logically we want to do a OR
5394 reduction, but because this doesn't exist, we can use a MAX reduction
5395 instead. The data value might be signed or a float so we need to cast
5397 In the case where the loop never made any matches, the data values are
5398 all identical, and so will reduce down correctly. */
5400 /* Make the matched data values unsigned. */
5401 tree vec_cond_cast
= make_ssa_name (vectype_unsigned
);
5402 tree vec_cond_cast_rhs
= build1 (VIEW_CONVERT_EXPR
, vectype_unsigned
,
5404 gimple
*vec_cond_cast_stmt
= gimple_build_assign (vec_cond_cast
,
5407 gsi_insert_before (&exit_gsi
, vec_cond_cast_stmt
, GSI_SAME_STMT
);
5409 /* Reduce down to a scalar value. */
5410 tree data_reduc
= make_ssa_name (scalar_type_unsigned
);
5411 gcall
*data_reduc_stmt
= gimple_build_call_internal (IFN_REDUC_MAX
,
5413 gimple_call_set_lhs (data_reduc_stmt
, data_reduc
);
5414 gsi_insert_before (&exit_gsi
, data_reduc_stmt
, GSI_SAME_STMT
);
5416 /* Convert the reduced value back to the result type and set as the
5419 new_temp
= gimple_build (&stmts
, VIEW_CONVERT_EXPR
, scalar_type
,
5421 gsi_insert_seq_before (&exit_gsi
, stmts
, GSI_SAME_STMT
);
5422 scalar_results
.safe_push (new_temp
);
5424 else if (STMT_VINFO_REDUC_TYPE (reduc_info
) == COND_REDUCTION
5425 && reduc_fn
== IFN_LAST
)
5427 /* Condition reduction without supported IFN_REDUC_MAX. Generate
5429 idx_val = induction_index[0];
5430 val = data_reduc[0];
5431 for (idx = 0, val = init, i = 0; i < nelts; ++i)
5432 if (induction_index[i] > idx_val)
5433 val = data_reduc[i], idx_val = induction_index[i];
5436 tree data_eltype
= TREE_TYPE (TREE_TYPE (new_phi_result
));
5437 tree idx_eltype
= TREE_TYPE (TREE_TYPE (induction_index
));
5438 unsigned HOST_WIDE_INT el_size
= tree_to_uhwi (TYPE_SIZE (idx_eltype
));
5439 poly_uint64 nunits
= TYPE_VECTOR_SUBPARTS (TREE_TYPE (induction_index
));
5440 /* Enforced by vectorizable_reduction, which ensures we have target
5441 support before allowing a conditional reduction on variable-length
5443 unsigned HOST_WIDE_INT v_size
= el_size
* nunits
.to_constant ();
5444 tree idx_val
= NULL_TREE
, val
= NULL_TREE
;
5445 for (unsigned HOST_WIDE_INT off
= 0; off
< v_size
; off
+= el_size
)
5447 tree old_idx_val
= idx_val
;
5449 idx_val
= make_ssa_name (idx_eltype
);
5450 epilog_stmt
= gimple_build_assign (idx_val
, BIT_FIELD_REF
,
5451 build3 (BIT_FIELD_REF
, idx_eltype
,
5453 bitsize_int (el_size
),
5454 bitsize_int (off
)));
5455 gsi_insert_before (&exit_gsi
, epilog_stmt
, GSI_SAME_STMT
);
5456 val
= make_ssa_name (data_eltype
);
5457 epilog_stmt
= gimple_build_assign (val
, BIT_FIELD_REF
,
5458 build3 (BIT_FIELD_REF
,
5461 bitsize_int (el_size
),
5462 bitsize_int (off
)));
5463 gsi_insert_before (&exit_gsi
, epilog_stmt
, GSI_SAME_STMT
);
5466 tree new_idx_val
= idx_val
;
5467 if (off
!= v_size
- el_size
)
5469 new_idx_val
= make_ssa_name (idx_eltype
);
5470 epilog_stmt
= gimple_build_assign (new_idx_val
,
5473 gsi_insert_before (&exit_gsi
, epilog_stmt
, GSI_SAME_STMT
);
5475 tree new_val
= make_ssa_name (data_eltype
);
5476 epilog_stmt
= gimple_build_assign (new_val
,
5483 gsi_insert_before (&exit_gsi
, epilog_stmt
, GSI_SAME_STMT
);
5484 idx_val
= new_idx_val
;
5488 /* Convert the reduced value back to the result type and set as the
5490 gimple_seq stmts
= NULL
;
5491 val
= gimple_convert (&stmts
, scalar_type
, val
);
5492 gsi_insert_seq_before (&exit_gsi
, stmts
, GSI_SAME_STMT
);
5493 scalar_results
.safe_push (val
);
5496 /* 2.3 Create the reduction code, using one of the three schemes described
5497 above. In SLP we simply need to extract all the elements from the
5498 vector (without reducing them), so we use scalar shifts. */
5499 else if (reduc_fn
!= IFN_LAST
&& !slp_reduc
)
5505 v_out2 = reduc_expr <v_out1> */
5507 if (dump_enabled_p ())
5508 dump_printf_loc (MSG_NOTE
, vect_location
,
5509 "Reduce using direct vector reduction.\n");
5511 gimple_seq stmts
= NULL
;
5512 new_phi_result
= gimple_convert (&stmts
, vectype
, new_phi_result
);
5513 vec_elem_type
= TREE_TYPE (TREE_TYPE (new_phi_result
));
5514 new_temp
= gimple_build (&stmts
, as_combined_fn (reduc_fn
),
5515 vec_elem_type
, new_phi_result
);
5516 new_temp
= gimple_convert (&stmts
, scalar_type
, new_temp
);
5517 gsi_insert_seq_before (&exit_gsi
, stmts
, GSI_SAME_STMT
);
5519 if ((STMT_VINFO_REDUC_TYPE (reduc_info
) == INTEGER_INDUC_COND_REDUCTION
)
5522 /* Earlier we set the initial value to be a vector if induc_val
5523 values. Check the result and if it is induc_val then replace
5524 with the original initial value, unless induc_val is
5525 the same as initial_def already. */
5526 tree zcompare
= build2 (EQ_EXPR
, boolean_type_node
, new_temp
,
5529 tmp
= make_ssa_name (new_scalar_dest
);
5530 epilog_stmt
= gimple_build_assign (tmp
, COND_EXPR
, zcompare
,
5531 initial_def
, new_temp
);
5532 gsi_insert_before (&exit_gsi
, epilog_stmt
, GSI_SAME_STMT
);
5536 scalar_results
.safe_push (new_temp
);
5538 else if (direct_slp_reduc
)
5540 /* Here we create one vector for each of the REDUC_GROUP_SIZE results,
5541 with the elements for other SLP statements replaced with the
5542 neutral value. We can then do a normal reduction on each vector. */
5544 /* Enforced by vectorizable_reduction. */
5545 gcc_assert (new_phis
.length () == 1);
5546 gcc_assert (pow2p_hwi (group_size
));
5548 slp_tree orig_phis_slp_node
= slp_node_instance
->reduc_phis
;
5549 vec
<stmt_vec_info
> orig_phis
5550 = SLP_TREE_SCALAR_STMTS (orig_phis_slp_node
);
5551 gimple_seq seq
= NULL
;
5553 /* Build a vector {0, 1, 2, ...}, with the same number of elements
5554 and the same element size as VECTYPE. */
5555 tree index
= build_index_vector (vectype
, 0, 1);
5556 tree index_type
= TREE_TYPE (index
);
5557 tree index_elt_type
= TREE_TYPE (index_type
);
5558 tree mask_type
= truth_type_for (index_type
);
5560 /* Create a vector that, for each element, identifies which of
5561 the REDUC_GROUP_SIZE results should use it. */
5562 tree index_mask
= build_int_cst (index_elt_type
, group_size
- 1);
5563 index
= gimple_build (&seq
, BIT_AND_EXPR
, index_type
, index
,
5564 build_vector_from_val (index_type
, index_mask
));
5566 /* Get a neutral vector value. This is simply a splat of the neutral
5567 scalar value if we have one, otherwise the initial scalar value
5568 is itself a neutral value. */
5569 tree vector_identity
= NULL_TREE
;
5570 tree neutral_op
= NULL_TREE
;
5573 stmt_vec_info first
= REDUC_GROUP_FIRST_ELEMENT (stmt_info
);
5575 = neutral_op_for_slp_reduction (slp_node_instance
->reduc_phis
,
5576 vectype
, code
, first
!= NULL
);
5579 vector_identity
= gimple_build_vector_from_val (&seq
, vectype
,
5581 for (unsigned int i
= 0; i
< group_size
; ++i
)
5583 /* If there's no univeral neutral value, we can use the
5584 initial scalar value from the original PHI. This is used
5585 for MIN and MAX reduction, for example. */
5589 = PHI_ARG_DEF_FROM_EDGE (orig_phis
[i
]->stmt
,
5590 loop_preheader_edge (loop
));
5591 scalar_value
= gimple_convert (&seq
, TREE_TYPE (vectype
),
5593 vector_identity
= gimple_build_vector_from_val (&seq
, vectype
,
5597 /* Calculate the equivalent of:
5599 sel[j] = (index[j] == i);
5601 which selects the elements of NEW_PHI_RESULT that should
5602 be included in the result. */
5603 tree compare_val
= build_int_cst (index_elt_type
, i
);
5604 compare_val
= build_vector_from_val (index_type
, compare_val
);
5605 tree sel
= gimple_build (&seq
, EQ_EXPR
, mask_type
,
5606 index
, compare_val
);
5608 /* Calculate the equivalent of:
5610 vec = seq ? new_phi_result : vector_identity;
5612 VEC is now suitable for a full vector reduction. */
5613 tree vec
= gimple_build (&seq
, VEC_COND_EXPR
, vectype
,
5614 sel
, new_phi_result
, vector_identity
);
5616 /* Do the reduction and convert it to the appropriate type. */
5617 tree scalar
= gimple_build (&seq
, as_combined_fn (reduc_fn
),
5618 TREE_TYPE (vectype
), vec
);
5619 scalar
= gimple_convert (&seq
, scalar_type
, scalar
);
5620 scalar_results
.safe_push (scalar
);
5622 gsi_insert_seq_before (&exit_gsi
, seq
, GSI_SAME_STMT
);
5626 bool reduce_with_shift
;
5629 gcc_assert (slp_reduc
|| new_phis
.length () == 1);
5631 /* See if the target wants to do the final (shift) reduction
5632 in a vector mode of smaller size and first reduce upper/lower
5633 halves against each other. */
5634 enum machine_mode mode1
= mode
;
5635 tree stype
= TREE_TYPE (vectype
);
5636 unsigned nunits
= TYPE_VECTOR_SUBPARTS (vectype
).to_constant ();
5637 unsigned nunits1
= nunits
;
5638 if ((mode1
= targetm
.vectorize
.split_reduction (mode
)) != mode
5639 && new_phis
.length () == 1)
5641 nunits1
= GET_MODE_NUNITS (mode1
).to_constant ();
5642 /* For SLP reductions we have to make sure lanes match up, but
5643 since we're doing individual element final reduction reducing
5644 vector width here is even more important.
5645 ??? We can also separate lanes with permutes, for the common
5646 case of power-of-two group-size odd/even extracts would work. */
5647 if (slp_reduc
&& nunits
!= nunits1
)
5649 nunits1
= least_common_multiple (nunits1
, group_size
);
5650 gcc_assert (exact_log2 (nunits1
) != -1 && nunits1
<= nunits
);
5654 && (mode1
= targetm
.vectorize
.split_reduction (mode
)) != mode
)
5655 nunits1
= GET_MODE_NUNITS (mode1
).to_constant ();
5657 tree vectype1
= get_related_vectype_for_scalar_type (TYPE_MODE (vectype
),
5659 reduce_with_shift
= have_whole_vector_shift (mode1
);
5660 if (!VECTOR_MODE_P (mode1
))
5661 reduce_with_shift
= false;
5664 optab optab
= optab_for_tree_code (code
, vectype1
, optab_default
);
5665 if (optab_handler (optab
, mode1
) == CODE_FOR_nothing
)
5666 reduce_with_shift
= false;
5669 /* First reduce the vector to the desired vector size we should
5670 do shift reduction on by combining upper and lower halves. */
5671 new_temp
= new_phi_result
;
5672 while (nunits
> nunits1
)
5675 vectype1
= get_related_vectype_for_scalar_type (TYPE_MODE (vectype
),
5677 unsigned int bitsize
= tree_to_uhwi (TYPE_SIZE (vectype1
));
5679 /* The target has to make sure we support lowpart/highpart
5680 extraction, either via direct vector extract or through
5681 an integer mode punning. */
5683 if (convert_optab_handler (vec_extract_optab
,
5684 TYPE_MODE (TREE_TYPE (new_temp
)),
5685 TYPE_MODE (vectype1
))
5686 != CODE_FOR_nothing
)
5688 /* Extract sub-vectors directly once vec_extract becomes
5689 a conversion optab. */
5690 dst1
= make_ssa_name (vectype1
);
5692 = gimple_build_assign (dst1
, BIT_FIELD_REF
,
5693 build3 (BIT_FIELD_REF
, vectype1
,
5694 new_temp
, TYPE_SIZE (vectype1
),
5696 gsi_insert_before (&exit_gsi
, epilog_stmt
, GSI_SAME_STMT
);
5697 dst2
= make_ssa_name (vectype1
);
5699 = gimple_build_assign (dst2
, BIT_FIELD_REF
,
5700 build3 (BIT_FIELD_REF
, vectype1
,
5701 new_temp
, TYPE_SIZE (vectype1
),
5702 bitsize_int (bitsize
)));
5703 gsi_insert_before (&exit_gsi
, epilog_stmt
, GSI_SAME_STMT
);
5707 /* Extract via punning to appropriately sized integer mode
5709 tree eltype
= build_nonstandard_integer_type (bitsize
, 1);
5710 tree etype
= build_vector_type (eltype
, 2);
5711 gcc_assert (convert_optab_handler (vec_extract_optab
,
5714 != CODE_FOR_nothing
);
5715 tree tem
= make_ssa_name (etype
);
5716 epilog_stmt
= gimple_build_assign (tem
, VIEW_CONVERT_EXPR
,
5717 build1 (VIEW_CONVERT_EXPR
,
5719 gsi_insert_before (&exit_gsi
, epilog_stmt
, GSI_SAME_STMT
);
5721 tem
= make_ssa_name (eltype
);
5723 = gimple_build_assign (tem
, BIT_FIELD_REF
,
5724 build3 (BIT_FIELD_REF
, eltype
,
5725 new_temp
, TYPE_SIZE (eltype
),
5727 gsi_insert_before (&exit_gsi
, epilog_stmt
, GSI_SAME_STMT
);
5728 dst1
= make_ssa_name (vectype1
);
5729 epilog_stmt
= gimple_build_assign (dst1
, VIEW_CONVERT_EXPR
,
5730 build1 (VIEW_CONVERT_EXPR
,
5732 gsi_insert_before (&exit_gsi
, epilog_stmt
, GSI_SAME_STMT
);
5733 tem
= make_ssa_name (eltype
);
5735 = gimple_build_assign (tem
, BIT_FIELD_REF
,
5736 build3 (BIT_FIELD_REF
, eltype
,
5737 new_temp
, TYPE_SIZE (eltype
),
5738 bitsize_int (bitsize
)));
5739 gsi_insert_before (&exit_gsi
, epilog_stmt
, GSI_SAME_STMT
);
5740 dst2
= make_ssa_name (vectype1
);
5741 epilog_stmt
= gimple_build_assign (dst2
, VIEW_CONVERT_EXPR
,
5742 build1 (VIEW_CONVERT_EXPR
,
5744 gsi_insert_before (&exit_gsi
, epilog_stmt
, GSI_SAME_STMT
);
5747 new_temp
= make_ssa_name (vectype1
);
5748 epilog_stmt
= gimple_build_assign (new_temp
, code
, dst1
, dst2
);
5749 gsi_insert_before (&exit_gsi
, epilog_stmt
, GSI_SAME_STMT
);
5750 new_phis
[0] = epilog_stmt
;
5753 if (reduce_with_shift
&& !slp_reduc
)
5755 int element_bitsize
= tree_to_uhwi (bitsize
);
5756 /* Enforced by vectorizable_reduction, which disallows SLP reductions
5757 for variable-length vectors and also requires direct target support
5758 for loop reductions. */
5759 int vec_size_in_bits
= tree_to_uhwi (TYPE_SIZE (vectype1
));
5760 int nelements
= vec_size_in_bits
/ element_bitsize
;
5761 vec_perm_builder sel
;
5762 vec_perm_indices indices
;
5766 tree zero_vec
= build_zero_cst (vectype1
);
5768 for (offset = nelements/2; offset >= 1; offset/=2)
5770 Create: va' = vec_shift <va, offset>
5771 Create: va = vop <va, va'>
5776 if (dump_enabled_p ())
5777 dump_printf_loc (MSG_NOTE
, vect_location
,
5778 "Reduce using vector shifts\n");
5780 gimple_seq stmts
= NULL
;
5781 new_temp
= gimple_convert (&stmts
, vectype1
, new_temp
);
5782 for (elt_offset
= nelements
/ 2;
5786 calc_vec_perm_mask_for_shift (elt_offset
, nelements
, &sel
);
5787 indices
.new_vector (sel
, 2, nelements
);
5788 tree mask
= vect_gen_perm_mask_any (vectype1
, indices
);
5789 new_name
= gimple_build (&stmts
, VEC_PERM_EXPR
, vectype1
,
5790 new_temp
, zero_vec
, mask
);
5791 new_temp
= gimple_build (&stmts
, code
,
5792 vectype1
, new_name
, new_temp
);
5794 gsi_insert_seq_before (&exit_gsi
, stmts
, GSI_SAME_STMT
);
5796 /* 2.4 Extract the final scalar result. Create:
5797 s_out3 = extract_field <v_out2, bitpos> */
5799 if (dump_enabled_p ())
5800 dump_printf_loc (MSG_NOTE
, vect_location
,
5801 "extract scalar result\n");
5803 rhs
= build3 (BIT_FIELD_REF
, scalar_type
, new_temp
,
5804 bitsize
, bitsize_zero_node
);
5805 epilog_stmt
= gimple_build_assign (new_scalar_dest
, rhs
);
5806 new_temp
= make_ssa_name (new_scalar_dest
, epilog_stmt
);
5807 gimple_assign_set_lhs (epilog_stmt
, new_temp
);
5808 gsi_insert_before (&exit_gsi
, epilog_stmt
, GSI_SAME_STMT
);
5809 scalar_results
.safe_push (new_temp
);
5814 s = extract_field <v_out2, 0>
5815 for (offset = element_size;
5816 offset < vector_size;
5817 offset += element_size;)
5819 Create: s' = extract_field <v_out2, offset>
5820 Create: s = op <s, s'> // For non SLP cases
5823 if (dump_enabled_p ())
5824 dump_printf_loc (MSG_NOTE
, vect_location
,
5825 "Reduce using scalar code.\n");
5827 int vec_size_in_bits
= tree_to_uhwi (TYPE_SIZE (vectype1
));
5828 int element_bitsize
= tree_to_uhwi (bitsize
);
5829 tree compute_type
= TREE_TYPE (vectype
);
5830 gimple_seq stmts
= NULL
;
5831 FOR_EACH_VEC_ELT (new_phis
, i
, new_phi
)
5834 if (gimple_code (new_phi
) == GIMPLE_PHI
)
5835 vec_temp
= PHI_RESULT (new_phi
);
5837 vec_temp
= gimple_assign_lhs (new_phi
);
5838 new_temp
= gimple_build (&stmts
, BIT_FIELD_REF
, compute_type
,
5839 vec_temp
, bitsize
, bitsize_zero_node
);
5841 /* In SLP we don't need to apply reduction operation, so we just
5842 collect s' values in SCALAR_RESULTS. */
5844 scalar_results
.safe_push (new_temp
);
5846 for (bit_offset
= element_bitsize
;
5847 bit_offset
< vec_size_in_bits
;
5848 bit_offset
+= element_bitsize
)
5850 tree bitpos
= bitsize_int (bit_offset
);
5851 new_name
= gimple_build (&stmts
, BIT_FIELD_REF
,
5852 compute_type
, vec_temp
,
5856 /* In SLP we don't need to apply reduction operation, so
5857 we just collect s' values in SCALAR_RESULTS. */
5858 new_temp
= new_name
;
5859 scalar_results
.safe_push (new_name
);
5862 new_temp
= gimple_build (&stmts
, code
, compute_type
,
5863 new_name
, new_temp
);
5867 /* The only case where we need to reduce scalar results in SLP, is
5868 unrolling. If the size of SCALAR_RESULTS is greater than
5869 REDUC_GROUP_SIZE, we reduce them combining elements modulo
5870 REDUC_GROUP_SIZE. */
5873 tree res
, first_res
, new_res
;
5875 /* Reduce multiple scalar results in case of SLP unrolling. */
5876 for (j
= group_size
; scalar_results
.iterate (j
, &res
);
5879 first_res
= scalar_results
[j
% group_size
];
5880 new_res
= gimple_build (&stmts
, code
, compute_type
,
5882 scalar_results
[j
% group_size
] = new_res
;
5884 for (k
= 0; k
< group_size
; k
++)
5885 scalar_results
[k
] = gimple_convert (&stmts
, scalar_type
,
5890 /* Not SLP - we have one scalar to keep in SCALAR_RESULTS. */
5891 new_temp
= gimple_convert (&stmts
, scalar_type
, new_temp
);
5892 scalar_results
.safe_push (new_temp
);
5895 gsi_insert_seq_before (&exit_gsi
, stmts
, GSI_SAME_STMT
);
5898 if ((STMT_VINFO_REDUC_TYPE (reduc_info
) == INTEGER_INDUC_COND_REDUCTION
)
5901 /* Earlier we set the initial value to be a vector if induc_val
5902 values. Check the result and if it is induc_val then replace
5903 with the original initial value, unless induc_val is
5904 the same as initial_def already. */
5905 tree zcompare
= build2 (EQ_EXPR
, boolean_type_node
, new_temp
,
5908 tree tmp
= make_ssa_name (new_scalar_dest
);
5909 epilog_stmt
= gimple_build_assign (tmp
, COND_EXPR
, zcompare
,
5910 initial_def
, new_temp
);
5911 gsi_insert_before (&exit_gsi
, epilog_stmt
, GSI_SAME_STMT
);
5912 scalar_results
[0] = tmp
;
5916 /* 2.5 Adjust the final result by the initial value of the reduction
5917 variable. (When such adjustment is not needed, then
5918 'adjustment_def' is zero). For example, if code is PLUS we create:
5919 new_temp = loop_exit_def + adjustment_def */
5923 gcc_assert (!slp_reduc
);
5924 gimple_seq stmts
= NULL
;
5925 if (nested_in_vect_loop
)
5927 new_phi
= new_phis
[0];
5928 gcc_assert (VECTOR_TYPE_P (TREE_TYPE (adjustment_def
)));
5929 adjustment_def
= gimple_convert (&stmts
, vectype
, adjustment_def
);
5930 new_temp
= gimple_build (&stmts
, code
, vectype
,
5931 PHI_RESULT (new_phi
), adjustment_def
);
5935 new_temp
= scalar_results
[0];
5936 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def
)) != VECTOR_TYPE
);
5937 adjustment_def
= gimple_convert (&stmts
, scalar_type
, adjustment_def
);
5938 new_temp
= gimple_build (&stmts
, code
, scalar_type
,
5939 new_temp
, adjustment_def
);
5942 epilog_stmt
= gimple_seq_last_stmt (stmts
);
5943 gsi_insert_seq_before (&exit_gsi
, stmts
, GSI_SAME_STMT
);
5944 if (nested_in_vect_loop
)
5947 scalar_results
.quick_push (new_temp
);
5949 scalar_results
[0] = new_temp
;
5952 scalar_results
[0] = new_temp
;
5954 new_phis
[0] = epilog_stmt
;
5960 /* 2.6 Handle the loop-exit phis. Replace the uses of scalar loop-exit
5961 phis with new adjusted scalar results, i.e., replace use <s_out0>
5966 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
5967 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
5968 v_out2 = reduce <v_out1>
5969 s_out3 = extract_field <v_out2, 0>
5970 s_out4 = adjust_result <s_out3>
5977 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
5978 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
5979 v_out2 = reduce <v_out1>
5980 s_out3 = extract_field <v_out2, 0>
5981 s_out4 = adjust_result <s_out3>
5986 /* In SLP reduction chain we reduce vector results into one vector if
5987 necessary, hence we set here REDUC_GROUP_SIZE to 1. SCALAR_DEST is the
5988 LHS of the last stmt in the reduction chain, since we are looking for
5989 the loop exit phi node. */
5990 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info
))
5992 stmt_vec_info dest_stmt_info
5993 = vect_orig_stmt (SLP_TREE_SCALAR_STMTS (slp_node
)[group_size
- 1]);
5994 scalar_dest
= gimple_assign_lhs (dest_stmt_info
->stmt
);
5998 /* In SLP we may have several statements in NEW_PHIS and REDUCTION_PHIS (in
5999 case that REDUC_GROUP_SIZE is greater than vectorization factor).
6000 Therefore, we need to match SCALAR_RESULTS with corresponding statements.
6001 The first (REDUC_GROUP_SIZE / number of new vector stmts) scalar results
6002 correspond to the first vector stmt, etc.
6003 (RATIO is equal to (REDUC_GROUP_SIZE / number of new vector stmts)). */
6004 if (group_size
> new_phis
.length ())
6005 gcc_assert (!(group_size
% new_phis
.length ()));
6007 for (k
= 0; k
< group_size
; k
++)
6011 stmt_vec_info scalar_stmt_info
= SLP_TREE_SCALAR_STMTS (slp_node
)[k
];
6013 orig_stmt_info
= STMT_VINFO_RELATED_STMT (scalar_stmt_info
);
6014 /* SLP statements can't participate in patterns. */
6015 gcc_assert (!orig_stmt_info
);
6016 scalar_dest
= gimple_assign_lhs (scalar_stmt_info
->stmt
);
6019 if (nested_in_vect_loop
)
6028 /* Find the loop-closed-use at the loop exit of the original scalar
6029 result. (The reduction result is expected to have two immediate uses,
6030 one at the latch block, and one at the loop exit). For double
6031 reductions we are looking for exit phis of the outer loop. */
6032 FOR_EACH_IMM_USE_FAST (use_p
, imm_iter
, scalar_dest
)
6034 if (!flow_bb_inside_loop_p (loop
, gimple_bb (USE_STMT (use_p
))))
6036 if (!is_gimple_debug (USE_STMT (use_p
)))
6037 phis
.safe_push (USE_STMT (use_p
));
6041 if (double_reduc
&& gimple_code (USE_STMT (use_p
)) == GIMPLE_PHI
)
6043 tree phi_res
= PHI_RESULT (USE_STMT (use_p
));
6045 FOR_EACH_IMM_USE_FAST (phi_use_p
, phi_imm_iter
, phi_res
)
6047 if (!flow_bb_inside_loop_p (loop
,
6048 gimple_bb (USE_STMT (phi_use_p
)))
6049 && !is_gimple_debug (USE_STMT (phi_use_p
)))
6050 phis
.safe_push (USE_STMT (phi_use_p
));
6056 FOR_EACH_VEC_ELT (phis
, i
, exit_phi
)
6058 /* Replace the uses: */
6059 orig_name
= PHI_RESULT (exit_phi
);
6060 scalar_result
= scalar_results
[k
];
6061 FOR_EACH_IMM_USE_STMT (use_stmt
, imm_iter
, orig_name
)
6063 FOR_EACH_IMM_USE_ON_STMT (use_p
, imm_iter
)
6064 SET_USE (use_p
, scalar_result
);
6065 update_stmt (use_stmt
);
6073 /* Return a vector of type VECTYPE that is equal to the vector select
6074 operation "MASK ? VEC : IDENTITY". Insert the select statements
6078 merge_with_identity (gimple_stmt_iterator
*gsi
, tree mask
, tree vectype
,
6079 tree vec
, tree identity
)
6081 tree cond
= make_temp_ssa_name (vectype
, NULL
, "cond");
6082 gimple
*new_stmt
= gimple_build_assign (cond
, VEC_COND_EXPR
,
6083 mask
, vec
, identity
);
6084 gsi_insert_before (gsi
, new_stmt
, GSI_SAME_STMT
);
6088 /* Successively apply CODE to each element of VECTOR_RHS, in left-to-right
6089 order, starting with LHS. Insert the extraction statements before GSI and
6090 associate the new scalar SSA names with variable SCALAR_DEST.
6091 Return the SSA name for the result. */
6094 vect_expand_fold_left (gimple_stmt_iterator
*gsi
, tree scalar_dest
,
6095 tree_code code
, tree lhs
, tree vector_rhs
)
6097 tree vectype
= TREE_TYPE (vector_rhs
);
6098 tree scalar_type
= TREE_TYPE (vectype
);
6099 tree bitsize
= TYPE_SIZE (scalar_type
);
6100 unsigned HOST_WIDE_INT vec_size_in_bits
= tree_to_uhwi (TYPE_SIZE (vectype
));
6101 unsigned HOST_WIDE_INT element_bitsize
= tree_to_uhwi (bitsize
);
6103 for (unsigned HOST_WIDE_INT bit_offset
= 0;
6104 bit_offset
< vec_size_in_bits
;
6105 bit_offset
+= element_bitsize
)
6107 tree bitpos
= bitsize_int (bit_offset
);
6108 tree rhs
= build3 (BIT_FIELD_REF
, scalar_type
, vector_rhs
,
6111 gassign
*stmt
= gimple_build_assign (scalar_dest
, rhs
);
6112 rhs
= make_ssa_name (scalar_dest
, stmt
);
6113 gimple_assign_set_lhs (stmt
, rhs
);
6114 gsi_insert_before (gsi
, stmt
, GSI_SAME_STMT
);
6116 stmt
= gimple_build_assign (scalar_dest
, code
, lhs
, rhs
);
6117 tree new_name
= make_ssa_name (scalar_dest
, stmt
);
6118 gimple_assign_set_lhs (stmt
, new_name
);
6119 gsi_insert_before (gsi
, stmt
, GSI_SAME_STMT
);
6125 /* Get a masked internal function equivalent to REDUC_FN. VECTYPE_IN is the
6126 type of the vector input. */
6129 get_masked_reduction_fn (internal_fn reduc_fn
, tree vectype_in
)
6131 internal_fn mask_reduc_fn
;
6135 case IFN_FOLD_LEFT_PLUS
:
6136 mask_reduc_fn
= IFN_MASK_FOLD_LEFT_PLUS
;
6143 if (direct_internal_fn_supported_p (mask_reduc_fn
, vectype_in
,
6144 OPTIMIZE_FOR_SPEED
))
6145 return mask_reduc_fn
;
6149 /* Perform an in-order reduction (FOLD_LEFT_REDUCTION). STMT_INFO is the
6150 statement that sets the live-out value. REDUC_DEF_STMT is the phi
6151 statement. CODE is the operation performed by STMT_INFO and OPS are
6152 its scalar operands. REDUC_INDEX is the index of the operand in
6153 OPS that is set by REDUC_DEF_STMT. REDUC_FN is the function that
6154 implements in-order reduction, or IFN_LAST if we should open-code it.
6155 VECTYPE_IN is the type of the vector input. MASKS specifies the masks
6156 that should be used to control the operation in a fully-masked loop. */
6159 vectorize_fold_left_reduction (loop_vec_info loop_vinfo
,
6160 stmt_vec_info stmt_info
,
6161 gimple_stmt_iterator
*gsi
,
6162 gimple
**vec_stmt
, slp_tree slp_node
,
6163 gimple
*reduc_def_stmt
,
6164 tree_code code
, internal_fn reduc_fn
,
6165 tree ops
[3], tree vectype_in
,
6166 int reduc_index
, vec_loop_masks
*masks
)
6168 class loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
6169 tree vectype_out
= STMT_VINFO_VECTYPE (stmt_info
);
6170 internal_fn mask_reduc_fn
= get_masked_reduction_fn (reduc_fn
, vectype_in
);
6176 ncopies
= vect_get_num_copies (loop_vinfo
, vectype_in
);
6178 gcc_assert (!nested_in_vect_loop_p (loop
, stmt_info
));
6179 gcc_assert (ncopies
== 1);
6180 gcc_assert (TREE_CODE_LENGTH (code
) == binary_op
);
6183 gcc_assert (known_eq (TYPE_VECTOR_SUBPARTS (vectype_out
),
6184 TYPE_VECTOR_SUBPARTS (vectype_in
)));
6186 tree op0
= ops
[1 - reduc_index
];
6189 stmt_vec_info scalar_dest_def_info
;
6190 auto_vec
<tree
> vec_oprnds0
;
6193 auto_vec
<vec
<tree
> > vec_defs (2);
6194 vect_get_slp_defs (loop_vinfo
, slp_node
, &vec_defs
);
6195 vec_oprnds0
.safe_splice (vec_defs
[1 - reduc_index
]);
6196 vec_defs
[0].release ();
6197 vec_defs
[1].release ();
6198 group_size
= SLP_TREE_SCALAR_STMTS (slp_node
).length ();
6199 scalar_dest_def_info
= SLP_TREE_SCALAR_STMTS (slp_node
)[group_size
- 1];
6203 vect_get_vec_defs_for_operand (loop_vinfo
, stmt_info
, 1,
6205 scalar_dest_def_info
= stmt_info
;
6208 tree scalar_dest
= gimple_assign_lhs (scalar_dest_def_info
->stmt
);
6209 tree scalar_type
= TREE_TYPE (scalar_dest
);
6210 tree reduc_var
= gimple_phi_result (reduc_def_stmt
);
6212 int vec_num
= vec_oprnds0
.length ();
6213 gcc_assert (vec_num
== 1 || slp_node
);
6214 tree vec_elem_type
= TREE_TYPE (vectype_out
);
6215 gcc_checking_assert (useless_type_conversion_p (scalar_type
, vec_elem_type
));
6217 tree vector_identity
= NULL_TREE
;
6218 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo
))
6219 vector_identity
= build_zero_cst (vectype_out
);
6221 tree scalar_dest_var
= vect_create_destination_var (scalar_dest
, NULL
);
6224 FOR_EACH_VEC_ELT (vec_oprnds0
, i
, def0
)
6227 tree mask
= NULL_TREE
;
6228 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo
))
6229 mask
= vect_get_loop_mask (gsi
, masks
, vec_num
, vectype_in
, i
);
6231 /* Handle MINUS by adding the negative. */
6232 if (reduc_fn
!= IFN_LAST
&& code
== MINUS_EXPR
)
6234 tree negated
= make_ssa_name (vectype_out
);
6235 new_stmt
= gimple_build_assign (negated
, NEGATE_EXPR
, def0
);
6236 gsi_insert_before (gsi
, new_stmt
, GSI_SAME_STMT
);
6240 if (mask
&& mask_reduc_fn
== IFN_LAST
)
6241 def0
= merge_with_identity (gsi
, mask
, vectype_out
, def0
,
6244 /* On the first iteration the input is simply the scalar phi
6245 result, and for subsequent iterations it is the output of
6246 the preceding operation. */
6247 if (reduc_fn
!= IFN_LAST
|| (mask
&& mask_reduc_fn
!= IFN_LAST
))
6249 if (mask
&& mask_reduc_fn
!= IFN_LAST
)
6250 new_stmt
= gimple_build_call_internal (mask_reduc_fn
, 3, reduc_var
,
6253 new_stmt
= gimple_build_call_internal (reduc_fn
, 2, reduc_var
,
6255 /* For chained SLP reductions the output of the previous reduction
6256 operation serves as the input of the next. For the final statement
6257 the output cannot be a temporary - we reuse the original
6258 scalar destination of the last statement. */
6259 if (i
!= vec_num
- 1)
6261 gimple_set_lhs (new_stmt
, scalar_dest_var
);
6262 reduc_var
= make_ssa_name (scalar_dest_var
, new_stmt
);
6263 gimple_set_lhs (new_stmt
, reduc_var
);
6268 reduc_var
= vect_expand_fold_left (gsi
, scalar_dest_var
, code
,
6270 new_stmt
= SSA_NAME_DEF_STMT (reduc_var
);
6271 /* Remove the statement, so that we can use the same code paths
6272 as for statements that we've just created. */
6273 gimple_stmt_iterator tmp_gsi
= gsi_for_stmt (new_stmt
);
6274 gsi_remove (&tmp_gsi
, true);
6277 if (i
== vec_num
- 1)
6279 gimple_set_lhs (new_stmt
, scalar_dest
);
6280 vect_finish_replace_stmt (loop_vinfo
,
6281 scalar_dest_def_info
,
6285 vect_finish_stmt_generation (loop_vinfo
,
6286 scalar_dest_def_info
,
6290 SLP_TREE_VEC_STMTS (slp_node
).quick_push (new_stmt
);
6293 STMT_VINFO_VEC_STMTS (stmt_info
).safe_push (new_stmt
);
6294 *vec_stmt
= new_stmt
;
6301 /* Function is_nonwrapping_integer_induction.
6303 Check if STMT_VINO (which is part of loop LOOP) both increments and
6304 does not cause overflow. */
6307 is_nonwrapping_integer_induction (stmt_vec_info stmt_vinfo
, class loop
*loop
)
6309 gphi
*phi
= as_a
<gphi
*> (stmt_vinfo
->stmt
);
6310 tree base
= STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo
);
6311 tree step
= STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo
);
6312 tree lhs_type
= TREE_TYPE (gimple_phi_result (phi
));
6313 widest_int ni
, max_loop_value
, lhs_max
;
6314 wi::overflow_type overflow
= wi::OVF_NONE
;
6316 /* Make sure the loop is integer based. */
6317 if (TREE_CODE (base
) != INTEGER_CST
6318 || TREE_CODE (step
) != INTEGER_CST
)
6321 /* Check that the max size of the loop will not wrap. */
6323 if (TYPE_OVERFLOW_UNDEFINED (lhs_type
))
6326 if (! max_stmt_executions (loop
, &ni
))
6329 max_loop_value
= wi::mul (wi::to_widest (step
), ni
, TYPE_SIGN (lhs_type
),
6334 max_loop_value
= wi::add (wi::to_widest (base
), max_loop_value
,
6335 TYPE_SIGN (lhs_type
), &overflow
);
6339 return (wi::min_precision (max_loop_value
, TYPE_SIGN (lhs_type
))
6340 <= TYPE_PRECISION (lhs_type
));
6343 /* Check if masking can be supported by inserting a conditional expression.
6344 CODE is the code for the operation. COND_FN is the conditional internal
6345 function, if it exists. VECTYPE_IN is the type of the vector input. */
6347 use_mask_by_cond_expr_p (enum tree_code code
, internal_fn cond_fn
,
6350 if (cond_fn
!= IFN_LAST
6351 && direct_internal_fn_supported_p (cond_fn
, vectype_in
,
6352 OPTIMIZE_FOR_SPEED
))
6366 /* Insert a conditional expression to enable masked vectorization. CODE is the
6367 code for the operation. VOP is the array of operands. MASK is the loop
6368 mask. GSI is a statement iterator used to place the new conditional
6371 build_vect_cond_expr (enum tree_code code
, tree vop
[3], tree mask
,
6372 gimple_stmt_iterator
*gsi
)
6378 tree vectype
= TREE_TYPE (vop
[1]);
6379 tree zero
= build_zero_cst (vectype
);
6380 tree masked_op1
= make_temp_ssa_name (vectype
, NULL
, "masked_op1");
6381 gassign
*select
= gimple_build_assign (masked_op1
, VEC_COND_EXPR
,
6382 mask
, vop
[1], zero
);
6383 gsi_insert_before (gsi
, select
, GSI_SAME_STMT
);
6384 vop
[1] = masked_op1
;
6390 tree vectype
= TREE_TYPE (vop
[1]);
6391 tree masked_op1
= make_temp_ssa_name (vectype
, NULL
, "masked_op1");
6392 gassign
*select
= gimple_build_assign (masked_op1
, VEC_COND_EXPR
,
6393 mask
, vop
[1], vop
[0]);
6394 gsi_insert_before (gsi
, select
, GSI_SAME_STMT
);
6395 vop
[1] = masked_op1
;
6404 /* Function vectorizable_reduction.
6406 Check if STMT_INFO performs a reduction operation that can be vectorized.
6407 If VEC_STMT is also passed, vectorize STMT_INFO: create a vectorized
6408 stmt to replace it, put it in VEC_STMT, and insert it at GSI.
6409 Return true if STMT_INFO is vectorizable in this way.
6411 This function also handles reduction idioms (patterns) that have been
6412 recognized in advance during vect_pattern_recog. In this case, STMT_INFO
6413 may be of this form:
6414 X = pattern_expr (arg0, arg1, ..., X)
6415 and its STMT_VINFO_RELATED_STMT points to the last stmt in the original
6416 sequence that had been detected and replaced by the pattern-stmt
6419 This function also handles reduction of condition expressions, for example:
6420 for (int i = 0; i < N; i++)
6423 This is handled by vectorising the loop and creating an additional vector
6424 containing the loop indexes for which "a[i] < value" was true. In the
6425 function epilogue this is reduced to a single max value and then used to
6426 index into the vector of results.
6428 In some cases of reduction patterns, the type of the reduction variable X is
6429 different than the type of the other arguments of STMT_INFO.
6430 In such cases, the vectype that is used when transforming STMT_INFO into
6431 a vector stmt is different than the vectype that is used to determine the
6432 vectorization factor, because it consists of a different number of elements
6433 than the actual number of elements that are being operated upon in parallel.
6435 For example, consider an accumulation of shorts into an int accumulator.
6436 On some targets it's possible to vectorize this pattern operating on 8
6437 shorts at a time (hence, the vectype for purposes of determining the
6438 vectorization factor should be V8HI); on the other hand, the vectype that
6439 is used to create the vector form is actually V4SI (the type of the result).
6441 Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
6442 indicates what is the actual level of parallelism (V8HI in the example), so
6443 that the right vectorization factor would be derived. This vectype
6444 corresponds to the type of arguments to the reduction stmt, and should *NOT*
6445 be used to create the vectorized stmt. The right vectype for the vectorized
6446 stmt is obtained from the type of the result X:
6447 get_vectype_for_scalar_type (vinfo, TREE_TYPE (X))
6449 This means that, contrary to "regular" reductions (or "regular" stmts in
6450 general), the following equation:
6451 STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (vinfo, TREE_TYPE (X))
6452 does *NOT* necessarily hold for reduction patterns. */
6455 vectorizable_reduction (loop_vec_info loop_vinfo
,
6456 stmt_vec_info stmt_info
, slp_tree slp_node
,
6457 slp_instance slp_node_instance
,
6458 stmt_vector_for_cost
*cost_vec
)
6461 tree vectype_in
= NULL_TREE
;
6462 class loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
6463 enum vect_def_type cond_reduc_dt
= vect_unknown_def_type
;
6464 stmt_vec_info cond_stmt_vinfo
= NULL
;
6468 bool single_defuse_cycle
= false;
6469 bool nested_cycle
= false;
6470 bool double_reduc
= false;
6473 tree cr_index_scalar_type
= NULL_TREE
, cr_index_vector_type
= NULL_TREE
;
6474 tree cond_reduc_val
= NULL_TREE
;
6476 /* Make sure it was already recognized as a reduction computation. */
6477 if (STMT_VINFO_DEF_TYPE (stmt_info
) != vect_reduction_def
6478 && STMT_VINFO_DEF_TYPE (stmt_info
) != vect_double_reduction_def
6479 && STMT_VINFO_DEF_TYPE (stmt_info
) != vect_nested_cycle
)
6482 /* The stmt we store reduction analysis meta on. */
6483 stmt_vec_info reduc_info
= info_for_reduction (loop_vinfo
, stmt_info
);
6484 reduc_info
->is_reduc_info
= true;
6486 if (STMT_VINFO_DEF_TYPE (stmt_info
) == vect_nested_cycle
)
6488 if (is_a
<gphi
*> (stmt_info
->stmt
))
6492 /* We eventually need to set a vector type on invariant
6496 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node
), j
, child
)
6497 if (!vect_maybe_update_slp_op_vectype
6498 (child
, SLP_TREE_VECTYPE (slp_node
)))
6500 if (dump_enabled_p ())
6501 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
6502 "incompatible vector types for "
6507 /* Analysis for double-reduction is done on the outer
6508 loop PHI, nested cycles have no further restrictions. */
6509 STMT_VINFO_TYPE (stmt_info
) = cycle_phi_info_type
;
6512 STMT_VINFO_TYPE (stmt_info
) = reduc_vec_info_type
;
6516 stmt_vec_info orig_stmt_of_analysis
= stmt_info
;
6517 stmt_vec_info phi_info
= stmt_info
;
6518 if (STMT_VINFO_DEF_TYPE (stmt_info
) == vect_reduction_def
6519 || STMT_VINFO_DEF_TYPE (stmt_info
) == vect_double_reduction_def
)
6521 if (!is_a
<gphi
*> (stmt_info
->stmt
))
6523 STMT_VINFO_TYPE (stmt_info
) = reduc_vec_info_type
;
6528 slp_node_instance
->reduc_phis
= slp_node
;
6529 /* ??? We're leaving slp_node to point to the PHIs, we only
6530 need it to get at the number of vector stmts which wasn't
6531 yet initialized for the instance root. */
6533 if (STMT_VINFO_DEF_TYPE (stmt_info
) == vect_reduction_def
)
6534 stmt_info
= vect_stmt_to_vectorize (STMT_VINFO_REDUC_DEF (stmt_info
));
6535 else /* STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def */
6537 use_operand_p use_p
;
6539 bool res
= single_imm_use (gimple_phi_result (stmt_info
->stmt
),
6542 phi_info
= loop_vinfo
->lookup_stmt (use_stmt
);
6543 stmt_info
= vect_stmt_to_vectorize (STMT_VINFO_REDUC_DEF (phi_info
));
6547 /* PHIs should not participate in patterns. */
6548 gcc_assert (!STMT_VINFO_RELATED_STMT (phi_info
));
6549 gphi
*reduc_def_phi
= as_a
<gphi
*> (phi_info
->stmt
);
6551 /* Verify following REDUC_IDX from the latch def leads us back to the PHI
6552 and compute the reduction chain length. Discover the real
6553 reduction operation stmt on the way (stmt_info and slp_for_stmt_info). */
6555 = PHI_ARG_DEF_FROM_EDGE (reduc_def_phi
,
6557 (gimple_bb (reduc_def_phi
)->loop_father
));
6558 unsigned reduc_chain_length
= 0;
6559 bool only_slp_reduc_chain
= true;
6561 slp_tree slp_for_stmt_info
= slp_node
? slp_node_instance
->root
: NULL
;
6562 while (reduc_def
!= PHI_RESULT (reduc_def_phi
))
6564 stmt_vec_info def
= loop_vinfo
->lookup_def (reduc_def
);
6565 stmt_vec_info vdef
= vect_stmt_to_vectorize (def
);
6566 if (STMT_VINFO_REDUC_IDX (vdef
) == -1)
6568 if (dump_enabled_p ())
6569 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
6570 "reduction chain broken by patterns.\n");
6573 if (!REDUC_GROUP_FIRST_ELEMENT (vdef
))
6574 only_slp_reduc_chain
= false;
6575 /* ??? For epilogue generation live members of the chain need
6576 to point back to the PHI via their original stmt for
6577 info_for_reduction to work. */
6578 if (STMT_VINFO_LIVE_P (vdef
))
6579 STMT_VINFO_REDUC_DEF (def
) = phi_info
;
6580 gassign
*assign
= dyn_cast
<gassign
*> (vdef
->stmt
);
6583 if (dump_enabled_p ())
6584 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
6585 "reduction chain includes calls.\n");
6588 if (CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (assign
)))
6590 if (!tree_nop_conversion_p (TREE_TYPE (gimple_assign_lhs (assign
)),
6591 TREE_TYPE (gimple_assign_rhs1 (assign
))))
6593 if (dump_enabled_p ())
6594 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
6595 "conversion in the reduction chain.\n");
6599 else if (!stmt_info
)
6600 /* First non-conversion stmt. */
6602 reduc_def
= gimple_op (vdef
->stmt
, 1 + STMT_VINFO_REDUC_IDX (vdef
));
6603 reduc_chain_length
++;
6604 if (!stmt_info
&& slp_node
)
6605 slp_for_stmt_info
= SLP_TREE_CHILDREN (slp_for_stmt_info
)[0];
6607 /* PHIs should not participate in patterns. */
6608 gcc_assert (!STMT_VINFO_RELATED_STMT (phi_info
));
6610 if (nested_in_vect_loop_p (loop
, stmt_info
))
6613 nested_cycle
= true;
6616 /* STMT_VINFO_REDUC_DEF doesn't point to the first but the last
6618 if (slp_node
&& REDUC_GROUP_FIRST_ELEMENT (stmt_info
))
6620 gcc_assert (!REDUC_GROUP_NEXT_ELEMENT (stmt_info
));
6621 stmt_info
= REDUC_GROUP_FIRST_ELEMENT (stmt_info
);
6623 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info
))
6624 gcc_assert (slp_node
6625 && REDUC_GROUP_FIRST_ELEMENT (stmt_info
) == stmt_info
);
6627 /* 1. Is vectorizable reduction? */
6628 /* Not supportable if the reduction variable is used in the loop, unless
6629 it's a reduction chain. */
6630 if (STMT_VINFO_RELEVANT (stmt_info
) > vect_used_in_outer
6631 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info
))
6634 /* Reductions that are not used even in an enclosing outer-loop,
6635 are expected to be "live" (used out of the loop). */
6636 if (STMT_VINFO_RELEVANT (stmt_info
) == vect_unused_in_scope
6637 && !STMT_VINFO_LIVE_P (stmt_info
))
6640 /* 2. Has this been recognized as a reduction pattern?
6642 Check if STMT represents a pattern that has been recognized
6643 in earlier analysis stages. For stmts that represent a pattern,
6644 the STMT_VINFO_RELATED_STMT field records the last stmt in
6645 the original sequence that constitutes the pattern. */
6647 stmt_vec_info orig_stmt_info
= STMT_VINFO_RELATED_STMT (stmt_info
);
6650 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info
));
6651 gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info
));
6654 /* 3. Check the operands of the operation. The first operands are defined
6655 inside the loop body. The last operand is the reduction variable,
6656 which is defined by the loop-header-phi. */
6658 tree vectype_out
= STMT_VINFO_VECTYPE (stmt_info
);
6659 STMT_VINFO_REDUC_VECTYPE (reduc_info
) = vectype_out
;
6660 gassign
*stmt
= as_a
<gassign
*> (stmt_info
->stmt
);
6661 enum tree_code code
= gimple_assign_rhs_code (stmt
);
6662 bool lane_reduc_code_p
6663 = (code
== DOT_PROD_EXPR
|| code
== WIDEN_SUM_EXPR
|| code
== SAD_EXPR
);
6664 int op_type
= TREE_CODE_LENGTH (code
);
6666 scalar_dest
= gimple_assign_lhs (stmt
);
6667 scalar_type
= TREE_TYPE (scalar_dest
);
6668 if (!POINTER_TYPE_P (scalar_type
) && !INTEGRAL_TYPE_P (scalar_type
)
6669 && !SCALAR_FLOAT_TYPE_P (scalar_type
))
6672 /* Do not try to vectorize bit-precision reductions. */
6673 if (!type_has_mode_precision_p (scalar_type
))
6676 /* For lane-reducing ops we're reducing the number of reduction PHIs
6677 which means the only use of that may be in the lane-reducing operation. */
6678 if (lane_reduc_code_p
6679 && reduc_chain_length
!= 1
6680 && !only_slp_reduc_chain
)
6682 if (dump_enabled_p ())
6683 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
6684 "lane-reducing reduction with extra stmts.\n");
6688 /* All uses but the last are expected to be defined in the loop.
6689 The last use is the reduction variable. In case of nested cycle this
6690 assumption is not true: we use reduc_index to record the index of the
6691 reduction variable. */
6692 slp_tree
*slp_op
= XALLOCAVEC (slp_tree
, op_type
);
6693 /* We need to skip an extra operand for COND_EXPRs with embedded
6695 unsigned opno_adjust
= 0;
6696 if (code
== COND_EXPR
6697 && COMPARISON_CLASS_P (gimple_assign_rhs1 (stmt
)))
6699 for (i
= 0; i
< op_type
; i
++)
6701 /* The condition of COND_EXPR is checked in vectorizable_condition(). */
6702 if (i
== 0 && code
== COND_EXPR
)
6705 stmt_vec_info def_stmt_info
;
6706 enum vect_def_type dt
;
6708 if (!vect_is_simple_use (loop_vinfo
, stmt_info
, slp_for_stmt_info
,
6709 i
+ opno_adjust
, &op
, &slp_op
[i
], &dt
, &tem
,
6712 if (dump_enabled_p ())
6713 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
6714 "use not simple.\n");
6717 if (i
== STMT_VINFO_REDUC_IDX (stmt_info
))
6720 /* There should be only one cycle def in the stmt, the one
6721 leading to reduc_def. */
6722 if (VECTORIZABLE_CYCLE_DEF (dt
))
6725 /* To properly compute ncopies we are interested in the widest
6726 non-reduction input type in case we're looking at a widening
6727 accumulation that we later handle in vect_transform_reduction. */
6728 if (lane_reduc_code_p
6731 || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in
)))
6732 < GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (tem
))))))
6735 if (code
== COND_EXPR
)
6737 /* Record how the non-reduction-def value of COND_EXPR is defined. */
6738 if (dt
== vect_constant_def
)
6741 cond_reduc_val
= op
;
6743 if (dt
== vect_induction_def
6745 && is_nonwrapping_integer_induction (def_stmt_info
, loop
))
6748 cond_stmt_vinfo
= def_stmt_info
;
6753 vectype_in
= STMT_VINFO_VECTYPE (phi_info
);
6754 STMT_VINFO_REDUC_VECTYPE_IN (reduc_info
) = vectype_in
;
6756 enum vect_reduction_type v_reduc_type
= STMT_VINFO_REDUC_TYPE (phi_info
);
6757 STMT_VINFO_REDUC_TYPE (reduc_info
) = v_reduc_type
;
6758 /* If we have a condition reduction, see if we can simplify it further. */
6759 if (v_reduc_type
== COND_REDUCTION
)
6764 /* When the condition uses the reduction value in the condition, fail. */
6765 if (STMT_VINFO_REDUC_IDX (stmt_info
) == 0)
6767 if (dump_enabled_p ())
6768 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
6769 "condition depends on previous iteration\n");
6773 if (reduc_chain_length
== 1
6774 && direct_internal_fn_supported_p (IFN_FOLD_EXTRACT_LAST
,
6775 vectype_in
, OPTIMIZE_FOR_SPEED
))
6777 if (dump_enabled_p ())
6778 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
6779 "optimizing condition reduction with"
6780 " FOLD_EXTRACT_LAST.\n");
6781 STMT_VINFO_REDUC_TYPE (reduc_info
) = EXTRACT_LAST_REDUCTION
;
6783 else if (cond_reduc_dt
== vect_induction_def
)
6786 = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (cond_stmt_vinfo
);
6787 tree step
= STMT_VINFO_LOOP_PHI_EVOLUTION_PART (cond_stmt_vinfo
);
6789 gcc_assert (TREE_CODE (base
) == INTEGER_CST
6790 && TREE_CODE (step
) == INTEGER_CST
);
6791 cond_reduc_val
= NULL_TREE
;
6792 enum tree_code cond_reduc_op_code
= ERROR_MARK
;
6793 tree res
= PHI_RESULT (STMT_VINFO_STMT (cond_stmt_vinfo
));
6794 if (!types_compatible_p (TREE_TYPE (res
), TREE_TYPE (base
)))
6796 /* Find a suitable value, for MAX_EXPR below base, for MIN_EXPR
6797 above base; punt if base is the minimum value of the type for
6798 MAX_EXPR or maximum value of the type for MIN_EXPR for now. */
6799 else if (tree_int_cst_sgn (step
) == -1)
6801 cond_reduc_op_code
= MIN_EXPR
;
6802 if (tree_int_cst_sgn (base
) == -1)
6803 cond_reduc_val
= build_int_cst (TREE_TYPE (base
), 0);
6804 else if (tree_int_cst_lt (base
,
6805 TYPE_MAX_VALUE (TREE_TYPE (base
))))
6807 = int_const_binop (PLUS_EXPR
, base
, integer_one_node
);
6811 cond_reduc_op_code
= MAX_EXPR
;
6812 if (tree_int_cst_sgn (base
) == 1)
6813 cond_reduc_val
= build_int_cst (TREE_TYPE (base
), 0);
6814 else if (tree_int_cst_lt (TYPE_MIN_VALUE (TREE_TYPE (base
)),
6817 = int_const_binop (MINUS_EXPR
, base
, integer_one_node
);
6821 if (dump_enabled_p ())
6822 dump_printf_loc (MSG_NOTE
, vect_location
,
6823 "condition expression based on "
6824 "integer induction.\n");
6825 STMT_VINFO_REDUC_CODE (reduc_info
) = cond_reduc_op_code
;
6826 STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info
)
6828 STMT_VINFO_REDUC_TYPE (reduc_info
) = INTEGER_INDUC_COND_REDUCTION
;
6831 else if (cond_reduc_dt
== vect_constant_def
)
6833 enum vect_def_type cond_initial_dt
;
6834 tree cond_initial_val
6835 = PHI_ARG_DEF_FROM_EDGE (reduc_def_phi
, loop_preheader_edge (loop
));
6837 gcc_assert (cond_reduc_val
!= NULL_TREE
);
6838 vect_is_simple_use (cond_initial_val
, loop_vinfo
, &cond_initial_dt
);
6839 if (cond_initial_dt
== vect_constant_def
6840 && types_compatible_p (TREE_TYPE (cond_initial_val
),
6841 TREE_TYPE (cond_reduc_val
)))
6843 tree e
= fold_binary (LE_EXPR
, boolean_type_node
,
6844 cond_initial_val
, cond_reduc_val
);
6845 if (e
&& (integer_onep (e
) || integer_zerop (e
)))
6847 if (dump_enabled_p ())
6848 dump_printf_loc (MSG_NOTE
, vect_location
,
6849 "condition expression based on "
6850 "compile time constant.\n");
6851 /* Record reduction code at analysis stage. */
6852 STMT_VINFO_REDUC_CODE (reduc_info
)
6853 = integer_onep (e
) ? MAX_EXPR
: MIN_EXPR
;
6854 STMT_VINFO_REDUC_TYPE (reduc_info
) = CONST_COND_REDUCTION
;
6860 if (STMT_VINFO_LIVE_P (phi_info
))
6866 ncopies
= vect_get_num_copies (loop_vinfo
, vectype_in
);
6868 gcc_assert (ncopies
>= 1);
6870 poly_uint64 nunits_out
= TYPE_VECTOR_SUBPARTS (vectype_out
);
6874 gcc_assert (STMT_VINFO_DEF_TYPE (reduc_info
)
6875 == vect_double_reduction_def
);
6876 double_reduc
= true;
6879 /* 4.2. Check support for the epilog operation.
6881 If STMT represents a reduction pattern, then the type of the
6882 reduction variable may be different than the type of the rest
6883 of the arguments. For example, consider the case of accumulation
6884 of shorts into an int accumulator; The original code:
6885 S1: int_a = (int) short_a;
6886 orig_stmt-> S2: int_acc = plus <int_a ,int_acc>;
6889 STMT: int_acc = widen_sum <short_a, int_acc>
6892 1. The tree-code that is used to create the vector operation in the
6893 epilog code (that reduces the partial results) is not the
6894 tree-code of STMT, but is rather the tree-code of the original
6895 stmt from the pattern that STMT is replacing. I.e, in the example
6896 above we want to use 'widen_sum' in the loop, but 'plus' in the
6898 2. The type (mode) we use to check available target support
6899 for the vector operation to be created in the *epilog*, is
6900 determined by the type of the reduction variable (in the example
6901 above we'd check this: optab_handler (plus_optab, vect_int_mode])).
6902 However the type (mode) we use to check available target support
6903 for the vector operation to be created *inside the loop*, is
6904 determined by the type of the other arguments to STMT (in the
6905 example we'd check this: optab_handler (widen_sum_optab,
6908 This is contrary to "regular" reductions, in which the types of all
6909 the arguments are the same as the type of the reduction variable.
6910 For "regular" reductions we can therefore use the same vector type
6911 (and also the same tree-code) when generating the epilog code and
6912 when generating the code inside the loop. */
6914 enum tree_code orig_code
= STMT_VINFO_REDUC_CODE (phi_info
);
6915 STMT_VINFO_REDUC_CODE (reduc_info
) = orig_code
;
6917 vect_reduction_type reduction_type
= STMT_VINFO_REDUC_TYPE (reduc_info
);
6918 if (reduction_type
== TREE_CODE_REDUCTION
)
6920 /* Check whether it's ok to change the order of the computation.
6921 Generally, when vectorizing a reduction we change the order of the
6922 computation. This may change the behavior of the program in some
6923 cases, so we need to check that this is ok. One exception is when
6924 vectorizing an outer-loop: the inner-loop is executed sequentially,
6925 and therefore vectorizing reductions in the inner-loop during
6926 outer-loop vectorization is safe. Likewise when we are vectorizing
6927 a series of reductions using SLP and the VF is one the reductions
6928 are performed in scalar order. */
6930 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info
)
6931 && known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo
), 1u))
6933 else if (needs_fold_left_reduction_p (scalar_type
, orig_code
))
6935 /* When vectorizing a reduction chain w/o SLP the reduction PHI
6936 is not directy used in stmt. */
6937 if (!only_slp_reduc_chain
6938 && reduc_chain_length
!= 1)
6940 if (dump_enabled_p ())
6941 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
6942 "in-order reduction chain without SLP.\n");
6945 STMT_VINFO_REDUC_TYPE (reduc_info
)
6946 = reduction_type
= FOLD_LEFT_REDUCTION
;
6948 else if (!commutative_tree_code (orig_code
)
6949 || !associative_tree_code (orig_code
))
6951 if (dump_enabled_p ())
6952 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
6953 "reduction: not commutative/associative");
6958 if ((double_reduc
|| reduction_type
!= TREE_CODE_REDUCTION
)
6961 if (dump_enabled_p ())
6962 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
6963 "multiple types in double reduction or condition "
6964 "reduction or fold-left reduction.\n");
6968 internal_fn reduc_fn
= IFN_LAST
;
6969 if (reduction_type
== TREE_CODE_REDUCTION
6970 || reduction_type
== FOLD_LEFT_REDUCTION
6971 || reduction_type
== INTEGER_INDUC_COND_REDUCTION
6972 || reduction_type
== CONST_COND_REDUCTION
)
6974 if (reduction_type
== FOLD_LEFT_REDUCTION
6975 ? fold_left_reduction_fn (orig_code
, &reduc_fn
)
6976 : reduction_fn_for_scalar_code (orig_code
, &reduc_fn
))
6978 if (reduc_fn
!= IFN_LAST
6979 && !direct_internal_fn_supported_p (reduc_fn
, vectype_out
,
6980 OPTIMIZE_FOR_SPEED
))
6982 if (dump_enabled_p ())
6983 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
6984 "reduc op not supported by target.\n");
6986 reduc_fn
= IFN_LAST
;
6991 if (!nested_cycle
|| double_reduc
)
6993 if (dump_enabled_p ())
6994 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
6995 "no reduc code for scalar code.\n");
7001 else if (reduction_type
== COND_REDUCTION
)
7003 int scalar_precision
7004 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type
));
7005 cr_index_scalar_type
= make_unsigned_type (scalar_precision
);
7006 cr_index_vector_type
= get_same_sized_vectype (cr_index_scalar_type
,
7009 if (direct_internal_fn_supported_p (IFN_REDUC_MAX
, cr_index_vector_type
,
7010 OPTIMIZE_FOR_SPEED
))
7011 reduc_fn
= IFN_REDUC_MAX
;
7013 STMT_VINFO_REDUC_FN (reduc_info
) = reduc_fn
;
7015 if (reduction_type
!= EXTRACT_LAST_REDUCTION
7016 && (!nested_cycle
|| double_reduc
)
7017 && reduc_fn
== IFN_LAST
7018 && !nunits_out
.is_constant ())
7020 if (dump_enabled_p ())
7021 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
7022 "missing target support for reduction on"
7023 " variable-length vectors.\n");
7027 /* For SLP reductions, see if there is a neutral value we can use. */
7028 tree neutral_op
= NULL_TREE
;
7030 neutral_op
= neutral_op_for_slp_reduction
7031 (slp_node_instance
->reduc_phis
, vectype_out
, orig_code
,
7032 REDUC_GROUP_FIRST_ELEMENT (stmt_info
) != NULL
);
7034 if (double_reduc
&& reduction_type
== FOLD_LEFT_REDUCTION
)
7036 /* We can't support in-order reductions of code such as this:
7038 for (int i = 0; i < n1; ++i)
7039 for (int j = 0; j < n2; ++j)
7042 since GCC effectively transforms the loop when vectorizing:
7044 for (int i = 0; i < n1 / VF; ++i)
7045 for (int j = 0; j < n2; ++j)
7046 for (int k = 0; k < VF; ++k)
7049 which is a reassociation of the original operation. */
7050 if (dump_enabled_p ())
7051 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
7052 "in-order double reduction not supported.\n");
7057 if (reduction_type
== FOLD_LEFT_REDUCTION
7059 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info
))
7061 /* We cannot use in-order reductions in this case because there is
7062 an implicit reassociation of the operations involved. */
7063 if (dump_enabled_p ())
7064 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
7065 "in-order unchained SLP reductions not supported.\n");
7069 /* For double reductions, and for SLP reductions with a neutral value,
7070 we construct a variable-length initial vector by loading a vector
7071 full of the neutral value and then shift-and-inserting the start
7072 values into the low-numbered elements. */
7073 if ((double_reduc
|| neutral_op
)
7074 && !nunits_out
.is_constant ()
7075 && !direct_internal_fn_supported_p (IFN_VEC_SHL_INSERT
,
7076 vectype_out
, OPTIMIZE_FOR_SPEED
))
7078 if (dump_enabled_p ())
7079 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
7080 "reduction on variable-length vectors requires"
7081 " target support for a vector-shift-and-insert"
7086 /* Check extra constraints for variable-length unchained SLP reductions. */
7087 if (STMT_SLP_TYPE (stmt_info
)
7088 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info
)
7089 && !nunits_out
.is_constant ())
7091 /* We checked above that we could build the initial vector when
7092 there's a neutral element value. Check here for the case in
7093 which each SLP statement has its own initial value and in which
7094 that value needs to be repeated for every instance of the
7095 statement within the initial vector. */
7096 unsigned int group_size
= SLP_TREE_LANES (slp_node
);
7098 && !can_duplicate_and_interleave_p (loop_vinfo
, group_size
,
7099 TREE_TYPE (vectype_out
)))
7101 if (dump_enabled_p ())
7102 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
7103 "unsupported form of SLP reduction for"
7104 " variable-length vectors: cannot build"
7105 " initial vector.\n");
7108 /* The epilogue code relies on the number of elements being a multiple
7109 of the group size. The duplicate-and-interleave approach to setting
7110 up the initial vector does too. */
7111 if (!multiple_p (nunits_out
, group_size
))
7113 if (dump_enabled_p ())
7114 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
7115 "unsupported form of SLP reduction for"
7116 " variable-length vectors: the vector size"
7117 " is not a multiple of the number of results.\n");
7122 if (reduction_type
== COND_REDUCTION
)
7126 if (! max_loop_iterations (loop
, &ni
))
7128 if (dump_enabled_p ())
7129 dump_printf_loc (MSG_NOTE
, vect_location
,
7130 "loop count not known, cannot create cond "
7134 /* Convert backedges to iterations. */
7137 /* The additional index will be the same type as the condition. Check
7138 that the loop can fit into this less one (because we'll use up the
7139 zero slot for when there are no matches). */
7140 tree max_index
= TYPE_MAX_VALUE (cr_index_scalar_type
);
7141 if (wi::geu_p (ni
, wi::to_widest (max_index
)))
7143 if (dump_enabled_p ())
7144 dump_printf_loc (MSG_NOTE
, vect_location
,
7145 "loop size is greater than data size.\n");
7150 /* In case the vectorization factor (VF) is bigger than the number
7151 of elements that we can fit in a vectype (nunits), we have to generate
7152 more than one vector stmt - i.e - we need to "unroll" the
7153 vector stmt by a factor VF/nunits. For more details see documentation
7154 in vectorizable_operation. */
7156 /* If the reduction is used in an outer loop we need to generate
7157 VF intermediate results, like so (e.g. for ncopies=2):
7162 (i.e. we generate VF results in 2 registers).
7163 In this case we have a separate def-use cycle for each copy, and therefore
7164 for each copy we get the vector def for the reduction variable from the
7165 respective phi node created for this copy.
7167 Otherwise (the reduction is unused in the loop nest), we can combine
7168 together intermediate results, like so (e.g. for ncopies=2):
7172 (i.e. we generate VF/2 results in a single register).
7173 In this case for each copy we get the vector def for the reduction variable
7174 from the vectorized reduction operation generated in the previous iteration.
7176 This only works when we see both the reduction PHI and its only consumer
7177 in vectorizable_reduction and there are no intermediate stmts
7180 && (STMT_VINFO_RELEVANT (stmt_info
) <= vect_used_only_live
)
7181 && reduc_chain_length
== 1)
7182 single_defuse_cycle
= true;
7184 if (single_defuse_cycle
|| lane_reduc_code_p
)
7186 gcc_assert (code
!= COND_EXPR
);
7188 /* 4. Supportable by target? */
7191 /* 4.1. check support for the operation in the loop */
7192 optab optab
= optab_for_tree_code (code
, vectype_in
, optab_vector
);
7195 if (dump_enabled_p ())
7196 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
7201 machine_mode vec_mode
= TYPE_MODE (vectype_in
);
7202 if (ok
&& optab_handler (optab
, vec_mode
) == CODE_FOR_nothing
)
7204 if (dump_enabled_p ())
7205 dump_printf (MSG_NOTE
, "op not supported by target.\n");
7206 if (maybe_ne (GET_MODE_SIZE (vec_mode
), UNITS_PER_WORD
)
7207 || !vect_worthwhile_without_simd_p (loop_vinfo
, code
))
7210 if (dump_enabled_p ())
7211 dump_printf (MSG_NOTE
, "proceeding using word mode.\n");
7214 /* Worthwhile without SIMD support? */
7216 && !VECTOR_MODE_P (TYPE_MODE (vectype_in
))
7217 && !vect_worthwhile_without_simd_p (loop_vinfo
, code
))
7219 if (dump_enabled_p ())
7220 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
7221 "not worthwhile without SIMD support.\n");
7225 /* lane-reducing operations have to go through vect_transform_reduction.
7226 For the other cases try without the single cycle optimization. */
7229 if (lane_reduc_code_p
)
7232 single_defuse_cycle
= false;
7235 STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info
) = single_defuse_cycle
;
7237 /* If the reduction stmt is one of the patterns that have lane
7238 reduction embedded we cannot handle the case of ! single_defuse_cycle. */
7239 if ((ncopies
> 1 && ! single_defuse_cycle
)
7240 && lane_reduc_code_p
)
7242 if (dump_enabled_p ())
7243 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
7244 "multi def-use cycle not possible for lane-reducing "
7245 "reduction operation\n");
7250 && !(!single_defuse_cycle
7251 && code
!= DOT_PROD_EXPR
7252 && code
!= WIDEN_SUM_EXPR
7254 && reduction_type
!= FOLD_LEFT_REDUCTION
))
7255 for (i
= 0; i
< op_type
; i
++)
7256 if (!vect_maybe_update_slp_op_vectype (slp_op
[i
], vectype_in
))
7258 if (dump_enabled_p ())
7259 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
7260 "incompatible vector types for invariants\n");
7265 vec_num
= SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node
);
7269 vect_model_reduction_cost (loop_vinfo
, stmt_info
, reduc_fn
,
7270 reduction_type
, ncopies
, cost_vec
);
7271 /* Cost the reduction op inside the loop if transformed via
7272 vect_transform_reduction. Otherwise this is costed by the
7273 separate vectorizable_* routines. */
7274 if (single_defuse_cycle
7275 || code
== DOT_PROD_EXPR
7276 || code
== WIDEN_SUM_EXPR
7277 || code
== SAD_EXPR
)
7278 record_stmt_cost (cost_vec
, ncopies
, vector_stmt
, stmt_info
, 0, vect_body
);
7280 if (dump_enabled_p ()
7281 && reduction_type
== FOLD_LEFT_REDUCTION
)
7282 dump_printf_loc (MSG_NOTE
, vect_location
,
7283 "using an in-order (fold-left) reduction.\n");
7284 STMT_VINFO_TYPE (orig_stmt_of_analysis
) = cycle_phi_info_type
;
7285 /* All but single defuse-cycle optimized, lane-reducing and fold-left
7286 reductions go through their own vectorizable_* routines. */
7287 if (!single_defuse_cycle
7288 && code
!= DOT_PROD_EXPR
7289 && code
!= WIDEN_SUM_EXPR
7291 && reduction_type
!= FOLD_LEFT_REDUCTION
)
7294 = vect_stmt_to_vectorize (STMT_VINFO_REDUC_DEF (phi_info
));
7295 if (slp_node
&& REDUC_GROUP_FIRST_ELEMENT (tem
))
7297 gcc_assert (!REDUC_GROUP_NEXT_ELEMENT (tem
));
7298 tem
= REDUC_GROUP_FIRST_ELEMENT (tem
);
7300 STMT_VINFO_DEF_TYPE (vect_orig_stmt (tem
)) = vect_internal_def
;
7301 STMT_VINFO_DEF_TYPE (tem
) = vect_internal_def
;
7303 else if (loop_vinfo
&& LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo
))
7305 vec_loop_masks
*masks
= &LOOP_VINFO_MASKS (loop_vinfo
);
7306 internal_fn cond_fn
= get_conditional_internal_fn (code
);
7308 if (reduction_type
!= FOLD_LEFT_REDUCTION
7309 && !use_mask_by_cond_expr_p (code
, cond_fn
, vectype_in
)
7310 && (cond_fn
== IFN_LAST
7311 || !direct_internal_fn_supported_p (cond_fn
, vectype_in
,
7312 OPTIMIZE_FOR_SPEED
)))
7314 if (dump_enabled_p ())
7315 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
7316 "can't operate on partial vectors because"
7317 " no conditional operation is available.\n");
7318 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo
) = false;
7320 else if (reduction_type
== FOLD_LEFT_REDUCTION
7321 && reduc_fn
== IFN_LAST
7322 && !expand_vec_cond_expr_p (vectype_in
,
7323 truth_type_for (vectype_in
),
7326 if (dump_enabled_p ())
7327 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
7328 "can't operate on partial vectors because"
7329 " no conditional operation is available.\n");
7330 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo
) = false;
7333 vect_record_loop_mask (loop_vinfo
, masks
, ncopies
* vec_num
,
7339 /* Transform the definition stmt STMT_INFO of a reduction PHI backedge
7343 vect_transform_reduction (loop_vec_info loop_vinfo
,
7344 stmt_vec_info stmt_info
, gimple_stmt_iterator
*gsi
,
7345 gimple
**vec_stmt
, slp_tree slp_node
)
7347 tree vectype_out
= STMT_VINFO_VECTYPE (stmt_info
);
7348 class loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
7353 stmt_vec_info reduc_info
= info_for_reduction (loop_vinfo
, stmt_info
);
7354 gcc_assert (reduc_info
->is_reduc_info
);
7356 if (nested_in_vect_loop_p (loop
, stmt_info
))
7359 gcc_assert (STMT_VINFO_DEF_TYPE (reduc_info
) == vect_double_reduction_def
);
7362 gassign
*stmt
= as_a
<gassign
*> (stmt_info
->stmt
);
7363 enum tree_code code
= gimple_assign_rhs_code (stmt
);
7364 int op_type
= TREE_CODE_LENGTH (code
);
7368 switch (get_gimple_rhs_class (code
))
7370 case GIMPLE_TERNARY_RHS
:
7371 ops
[2] = gimple_assign_rhs3 (stmt
);
7373 case GIMPLE_BINARY_RHS
:
7374 ops
[0] = gimple_assign_rhs1 (stmt
);
7375 ops
[1] = gimple_assign_rhs2 (stmt
);
7381 /* All uses but the last are expected to be defined in the loop.
7382 The last use is the reduction variable. In case of nested cycle this
7383 assumption is not true: we use reduc_index to record the index of the
7384 reduction variable. */
7385 stmt_vec_info phi_info
= STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info
));
7386 gphi
*reduc_def_phi
= as_a
<gphi
*> (phi_info
->stmt
);
7387 int reduc_index
= STMT_VINFO_REDUC_IDX (stmt_info
);
7388 tree vectype_in
= STMT_VINFO_REDUC_VECTYPE_IN (reduc_info
);
7393 vec_num
= SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node
);
7397 ncopies
= vect_get_num_copies (loop_vinfo
, vectype_in
);
7401 internal_fn cond_fn
= get_conditional_internal_fn (code
);
7402 vec_loop_masks
*masks
= &LOOP_VINFO_MASKS (loop_vinfo
);
7403 bool mask_by_cond_expr
= use_mask_by_cond_expr_p (code
, cond_fn
, vectype_in
);
7406 tree new_temp
= NULL_TREE
;
7407 auto_vec
<tree
> vec_oprnds0
;
7408 auto_vec
<tree
> vec_oprnds1
;
7409 auto_vec
<tree
> vec_oprnds2
;
7412 if (dump_enabled_p ())
7413 dump_printf_loc (MSG_NOTE
, vect_location
, "transform reduction.\n");
7415 /* FORNOW: Multiple types are not supported for condition. */
7416 if (code
== COND_EXPR
)
7417 gcc_assert (ncopies
== 1);
7419 bool masked_loop_p
= LOOP_VINFO_FULLY_MASKED_P (loop_vinfo
);
7421 vect_reduction_type reduction_type
= STMT_VINFO_REDUC_TYPE (reduc_info
);
7422 if (reduction_type
== FOLD_LEFT_REDUCTION
)
7424 internal_fn reduc_fn
= STMT_VINFO_REDUC_FN (reduc_info
);
7425 return vectorize_fold_left_reduction
7426 (loop_vinfo
, stmt_info
, gsi
, vec_stmt
, slp_node
, reduc_def_phi
, code
,
7427 reduc_fn
, ops
, vectype_in
, reduc_index
, masks
);
7430 bool single_defuse_cycle
= STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info
);
7431 gcc_assert (single_defuse_cycle
7432 || code
== DOT_PROD_EXPR
7433 || code
== WIDEN_SUM_EXPR
7434 || code
== SAD_EXPR
);
7436 /* Create the destination vector */
7437 tree scalar_dest
= gimple_assign_lhs (stmt
);
7438 tree vec_dest
= vect_create_destination_var (scalar_dest
, vectype_out
);
7440 vect_get_vec_defs (loop_vinfo
, stmt_info
, slp_node
, ncopies
,
7441 single_defuse_cycle
&& reduc_index
== 0
7442 ? NULL_TREE
: ops
[0], &vec_oprnds0
,
7443 single_defuse_cycle
&& reduc_index
== 1
7444 ? NULL_TREE
: ops
[1], &vec_oprnds1
,
7445 op_type
== ternary_op
7446 && !(single_defuse_cycle
&& reduc_index
== 2)
7447 ? ops
[2] : NULL_TREE
, &vec_oprnds2
);
7448 if (single_defuse_cycle
)
7450 gcc_assert (!slp_node
);
7451 vect_get_vec_defs_for_operand (loop_vinfo
, stmt_info
, 1,
7453 reduc_index
== 0 ? &vec_oprnds0
7454 : (reduc_index
== 1 ? &vec_oprnds1
7458 FOR_EACH_VEC_ELT (vec_oprnds0
, i
, def0
)
7461 tree vop
[3] = { def0
, vec_oprnds1
[i
], NULL_TREE
};
7462 if (masked_loop_p
&& !mask_by_cond_expr
)
7464 /* Make sure that the reduction accumulator is vop[0]. */
7465 if (reduc_index
== 1)
7467 gcc_assert (commutative_tree_code (code
));
7468 std::swap (vop
[0], vop
[1]);
7470 tree mask
= vect_get_loop_mask (gsi
, masks
, vec_num
* ncopies
,
7472 gcall
*call
= gimple_build_call_internal (cond_fn
, 4, mask
,
7473 vop
[0], vop
[1], vop
[0]);
7474 new_temp
= make_ssa_name (vec_dest
, call
);
7475 gimple_call_set_lhs (call
, new_temp
);
7476 gimple_call_set_nothrow (call
, true);
7477 vect_finish_stmt_generation (loop_vinfo
, stmt_info
, call
, gsi
);
7482 if (op_type
== ternary_op
)
7483 vop
[2] = vec_oprnds2
[i
];
7485 if (masked_loop_p
&& mask_by_cond_expr
)
7487 tree mask
= vect_get_loop_mask (gsi
, masks
, vec_num
* ncopies
,
7489 build_vect_cond_expr (code
, vop
, mask
, gsi
);
7492 new_stmt
= gimple_build_assign (vec_dest
, code
,
7493 vop
[0], vop
[1], vop
[2]);
7494 new_temp
= make_ssa_name (vec_dest
, new_stmt
);
7495 gimple_assign_set_lhs (new_stmt
, new_temp
);
7496 vect_finish_stmt_generation (loop_vinfo
, stmt_info
, new_stmt
, gsi
);
7500 SLP_TREE_VEC_STMTS (slp_node
).quick_push (new_stmt
);
7501 else if (single_defuse_cycle
7504 if (reduc_index
== 0)
7505 vec_oprnds0
.safe_push (gimple_get_lhs (new_stmt
));
7506 else if (reduc_index
== 1)
7507 vec_oprnds1
.safe_push (gimple_get_lhs (new_stmt
));
7508 else if (reduc_index
== 2)
7509 vec_oprnds2
.safe_push (gimple_get_lhs (new_stmt
));
7512 STMT_VINFO_VEC_STMTS (stmt_info
).safe_push (new_stmt
);
7516 *vec_stmt
= STMT_VINFO_VEC_STMTS (stmt_info
)[0];
7521 /* Transform phase of a cycle PHI. */
/* Create the vector reduction PHI node(s) for STMT_INFO and compute their
   loop-entry (initial-value) arguments.  The loop-latch arguments are
   filled in later, during epilogue processing.
   NOTE(review): this extract is missing some interior lines (the embedded
   original line numbers jump); comments describe only the visible code.  */
7524 vect_transform_cycle_phi (loop_vec_info loop_vinfo
,
7525 stmt_vec_info stmt_info
, gimple
**vec_stmt
,
7526 slp_tree slp_node
, slp_instance slp_node_instance
)
7528 tree vectype_out
= STMT_VINFO_VECTYPE (stmt_info
);
7529 class loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
7533 bool nested_cycle
= false;
7536 if (nested_in_vect_loop_p (loop
, stmt_info
))
7539 nested_cycle
= true;
7542 stmt_vec_info reduc_stmt_info
= STMT_VINFO_REDUC_DEF (stmt_info
);
7543 reduc_stmt_info
= vect_stmt_to_vectorize (reduc_stmt_info
);
7544 stmt_vec_info reduc_info
= info_for_reduction (loop_vinfo
, stmt_info
);
7545 gcc_assert (reduc_info
->is_reduc_info
);
/* EXTRACT_LAST and FOLD_LEFT reductions keep the original scalar PHI.  */
7547 if (STMT_VINFO_REDUC_TYPE (reduc_info
) == EXTRACT_LAST_REDUCTION
7548 || STMT_VINFO_REDUC_TYPE (reduc_info
) == FOLD_LEFT_REDUCTION
)
7549 /* Leave the scalar phi in place. */
7552 tree vectype_in
= STMT_VINFO_REDUC_VECTYPE_IN (reduc_info
);
7553 /* For a nested cycle we do not fill the above. */
7555 vectype_in
= STMT_VINFO_VECTYPE (stmt_info
);
7556 gcc_assert (vectype_in
);
7560 /* The size vect_schedule_slp_instance computes is off for us. */
7561 vec_num
= vect_get_num_vectors (LOOP_VINFO_VECT_FACTOR (loop_vinfo
)
7562 * SLP_TREE_LANES (slp_node
), vectype_in
);
7568 ncopies
= vect_get_num_copies (loop_vinfo
, vectype_in
);
7571 /* Check whether we should use a single PHI node and accumulate
7572 vectors to one before the backedge. */
7573 if (STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info
))
7576 /* Create the destination vector */
7577 gphi
*phi
= as_a
<gphi
*> (stmt_info
->stmt
);
7578 tree vec_dest
= vect_create_destination_var (gimple_phi_result (phi
),
7581 /* Get the loop-entry arguments. */
7582 tree vec_initial_def
;
7583 auto_vec
<tree
> vec_initial_defs
;
7586 vec_initial_defs
.reserve (vec_num
);
/* SLP path: fetch the initial defs from the preheader child of the
   reduc_phis SLP node, using a neutral value where one exists.  */
7589 unsigned phi_idx
= loop_preheader_edge (loop
)->dest_idx
;
7590 vect_get_slp_defs (SLP_TREE_CHILDREN (slp_node
)[phi_idx
],
7595 gcc_assert (slp_node
== slp_node_instance
->reduc_phis
);
7596 stmt_vec_info first
= REDUC_GROUP_FIRST_ELEMENT (reduc_stmt_info
);
7598 = neutral_op_for_slp_reduction (slp_node
, vectype_out
,
7599 STMT_VINFO_REDUC_CODE (reduc_info
),
7601 get_initial_defs_for_reduction (loop_vinfo
, slp_node_instance
->reduc_phis
,
7602 &vec_initial_defs
, vec_num
,
7603 first
!= NULL
, neutral_op
);
7608 /* Get at the scalar def before the loop, that defines the initial
7609 value of the reduction variable. */
7610 tree initial_def
= PHI_ARG_DEF_FROM_EDGE (phi
,
7611 loop_preheader_edge (loop
));
7612 /* Optimize: if initial_def is for REDUC_MAX smaller than the base
7613 and we can't use zero for induc_val, use initial_def. Similarly
7614 for REDUC_MIN and initial_def larger than the base. */
7615 if (STMT_VINFO_REDUC_TYPE (reduc_info
) == INTEGER_INDUC_COND_REDUCTION
)
7617 tree induc_val
= STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info
);
7618 if (TREE_CODE (initial_def
) == INTEGER_CST
7619 && !integer_zerop (induc_val
)
7620 && ((STMT_VINFO_REDUC_CODE (reduc_info
) == MAX_EXPR
7621 && tree_int_cst_lt (initial_def
, induc_val
))
7622 || (STMT_VINFO_REDUC_CODE (reduc_info
) == MIN_EXPR
7623 && tree_int_cst_lt (induc_val
, initial_def
))))
7625 induc_val
= initial_def
;
7626 /* Communicate we used the initial_def to epilouge
7628 STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info
) = NULL_TREE
;
7630 vec_initial_def
= build_vector_from_val (vectype_out
, induc_val
);
7631 vec_initial_defs
.create (ncopies
);
7632 for (i
= 0; i
< ncopies
; ++i
)
7633 vec_initial_defs
.quick_push (vec_initial_def
);
7635 else if (nested_cycle
)
7637 /* Do not use an adjustment def as that case is not supported
7638 correctly if ncopies is not one. */
7639 vect_get_vec_defs_for_operand (loop_vinfo
, reduc_stmt_info
,
7640 ncopies
, initial_def
,
/* General non-SLP case: build the initial def, possibly recording an
   epilogue adjustment (suppressed for double reductions).  */
7645 tree adjustment_def
= NULL_TREE
;
7646 tree
*adjustment_defp
= &adjustment_def
;
7647 enum tree_code code
= STMT_VINFO_REDUC_CODE (reduc_info
);
7648 if (STMT_VINFO_DEF_TYPE (stmt_info
) == vect_double_reduction_def
)
7649 adjustment_defp
= NULL
;
7651 = get_initial_def_for_reduction (loop_vinfo
, reduc_stmt_info
, code
,
7652 initial_def
, adjustment_defp
);
7653 STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info
) = adjustment_def
;
7654 vec_initial_defs
.create (ncopies
);
7655 for (i
= 0; i
< ncopies
; ++i
)
7656 vec_initial_defs
.quick_push (vec_initial_def
);
7660 /* Generate the reduction PHIs upfront. */
7661 for (i
= 0; i
< vec_num
; i
++)
7663 tree vec_init_def
= vec_initial_defs
[i
];
7664 for (j
= 0; j
< ncopies
; j
++)
7666 /* Create the reduction-phi that defines the reduction
7668 gphi
*new_phi
= create_phi_node (vec_dest
, loop
->header
);
7670 /* Set the loop-entry arg of the reduction-phi. */
7671 if (j
!= 0 && nested_cycle
)
7672 vec_init_def
= vec_initial_defs
[j
];
7673 add_phi_arg (new_phi
, vec_init_def
, loop_preheader_edge (loop
),
7676 /* The loop-latch arg is set in epilogue processing. */
7679 SLP_TREE_VEC_STMTS (slp_node
).quick_push (new_phi
);
7683 *vec_stmt
= new_phi
;
7684 STMT_VINFO_VEC_STMTS (stmt_info
).safe_push (new_phi
);
7692 /* Vectorizes LC PHIs. */
/* Analyze (when VEC_STMT is null) or transform a single-argument
   loop-closed PHI node: one vectorized PHI is created per vector copy,
   each fed from the corresponding vector def of the sole argument.
   NOTE(review): some interior lines are missing from this extract.  */
7695 vectorizable_lc_phi (loop_vec_info loop_vinfo
,
7696 stmt_vec_info stmt_info
, gimple
**vec_stmt
,
/* Only single-argument PHIs qualify as LC PHIs here.  */
7700 || !is_a
<gphi
*> (stmt_info
->stmt
)
7701 || gimple_phi_num_args (stmt_info
->stmt
) != 1)
7704 if (STMT_VINFO_DEF_TYPE (stmt_info
) != vect_internal_def
7705 && STMT_VINFO_DEF_TYPE (stmt_info
) != vect_double_reduction_def
)
7708 if (!vec_stmt
) /* transformation not required. */
7710 /* Deal with copies from externs or constants that disguise as
7711 loop-closed PHI nodes (PR97886). */
7713 && !vect_maybe_update_slp_op_vectype (SLP_TREE_CHILDREN (slp_node
)[0],
7714 SLP_TREE_VECTYPE (slp_node
)))
7716 if (dump_enabled_p ())
7717 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
7718 "incompatible vector types for invariants\n");
7721 STMT_VINFO_TYPE (stmt_info
) = lc_phi_info_type
;
/* Transform phase: create one vector PHI per vector def of the
   single PHI argument.  */
7725 tree vectype
= STMT_VINFO_VECTYPE (stmt_info
);
7726 tree scalar_dest
= gimple_phi_result (stmt_info
->stmt
);
7727 basic_block bb
= gimple_bb (stmt_info
->stmt
);
7728 edge e
= single_pred_edge (bb
);
7729 tree vec_dest
= vect_create_destination_var (scalar_dest
, vectype
);
7730 auto_vec
<tree
> vec_oprnds
;
7731 vect_get_vec_defs (loop_vinfo
, stmt_info
, slp_node
,
7732 !slp_node
? vect_get_num_copies (loop_vinfo
, vectype
) : 1,
7733 gimple_phi_arg_def (stmt_info
->stmt
, 0), &vec_oprnds
);
7734 for (unsigned i
= 0; i
< vec_oprnds
.length (); i
++)
7736 /* Create the vectorized LC PHI node. */
7737 gphi
*new_phi
= create_phi_node (vec_dest
, bb
);
7738 add_phi_arg (new_phi
, vec_oprnds
[i
], e
, UNKNOWN_LOCATION
);
7740 SLP_TREE_VEC_STMTS (slp_node
).quick_push (new_phi
);
7742 STMT_VINFO_VEC_STMTS (stmt_info
).safe_push (new_phi
);
7745 *vec_stmt
= STMT_VINFO_VEC_STMTS (stmt_info
)[0];
7750 /* Vectorizes PHIs. */
/* Analyze (when VEC_STMT is null) or transform a general SLP PHI node.
   At transform time the vector PHIs are created first and their
   arguments filled in per incoming edge, skipping not-yet-vectorized
   (backedge) children which are handled later.
   NOTE(review): some interior lines are missing from this extract.  */
7753 vectorizable_phi (vec_info
*,
7754 stmt_vec_info stmt_info
, gimple
**vec_stmt
,
7755 slp_tree slp_node
, stmt_vector_for_cost
*cost_vec
)
7757 if (!is_a
<gphi
*> (stmt_info
->stmt
) || !slp_node
)
7760 if (STMT_VINFO_DEF_TYPE (stmt_info
) != vect_internal_def
)
7763 tree vectype
= SLP_TREE_VECTYPE (slp_node
);
7765 if (!vec_stmt
) /* transformation not required. */
/* Analysis: every child must either be vectorizable or be an
   invariant whose vector type we can set here.  */
7769 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node
), i
, child
)
7772 if (dump_enabled_p ())
7773 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
7774 "PHI node with unvectorized backedge def\n");
7777 else if (!vect_maybe_update_slp_op_vectype (child
, vectype
))
7779 if (dump_enabled_p ())
7780 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
7781 "incompatible vector types for invariants\n");
7784 /* For single-argument PHIs assume coalescing which means zero cost
7785 for the scalar and the vector PHIs. This avoids artificially
7786 favoring the vector path (but may pessimize it in some cases). */
7787 if (gimple_phi_num_args (as_a
<gphi
*> (stmt_info
->stmt
)) > 1)
7788 record_stmt_cost (cost_vec
, SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node
),
7789 vector_stmt
, stmt_info
, vectype
, 0, vect_body
);
7790 STMT_VINFO_TYPE (stmt_info
) = phi_info_type
;
/* Transform phase.  */
7794 tree scalar_dest
= gimple_phi_result (stmt_info
->stmt
);
7795 basic_block bb
= gimple_bb (stmt_info
->stmt
);
7796 tree vec_dest
= vect_create_destination_var (scalar_dest
, vectype
);
7797 auto_vec
<gphi
*> new_phis
;
7798 for (unsigned i
= 0; i
< gimple_phi_num_args (stmt_info
->stmt
); ++i
)
7800 slp_tree child
= SLP_TREE_CHILDREN (slp_node
)[i
];
7802 /* Skip not yet vectorized defs. */
7803 if (SLP_TREE_DEF_TYPE (child
) == vect_internal_def
7804 && SLP_TREE_VEC_STMTS (child
).is_empty ())
7807 auto_vec
<tree
> vec_oprnds
;
7808 vect_get_slp_defs (SLP_TREE_CHILDREN (slp_node
)[i
], &vec_oprnds
);
/* Lazily create the vector PHIs on the first vectorized child.  */
7809 if (!new_phis
.exists ())
7811 new_phis
.create (vec_oprnds
.length ());
7812 for (unsigned j
= 0; j
< vec_oprnds
.length (); j
++)
7814 /* Create the vectorized LC PHI node. */
7815 new_phis
.quick_push (create_phi_node (vec_dest
, bb
));
7816 SLP_TREE_VEC_STMTS (slp_node
).quick_push (new_phis
[j
]);
7819 edge e
= gimple_phi_arg_edge (as_a
<gphi
*> (stmt_info
->stmt
), i
);
7820 for (unsigned j
= 0; j
< vec_oprnds
.length (); j
++)
7821 add_phi_arg (new_phis
[j
], vec_oprnds
[j
], e
, UNKNOWN_LOCATION
);
7823 /* We should have at least one already vectorized child. */
7824 gcc_assert (new_phis
.exists ());
7830 /* Function vect_min_worthwhile_factor.
7832 For a loop where we could vectorize the operation indicated by CODE,
7833 return the minimum vectorization factor that makes it worthwhile
7834 to use generic vectors. */
/* NOTE(review): the body of this function is entirely missing from this
   extract (original lines 7837-7855 are elided); only the signature is
   visible here.  */
7836 vect_min_worthwhile_factor (enum tree_code code
)
7856 /* Return true if VINFO indicates we are doing loop vectorization and if
7857 it is worth decomposing CODE operations into scalar operations for
7858 that loop's vectorization factor. */
7861 vect_worthwhile_without_simd_p (vec_info
*vinfo
, tree_code code
)
/* Only loop vectorization qualifies; dyn_cast yields NULL for BB vinfo.  */
7863 loop_vec_info loop_vinfo
= dyn_cast
<loop_vec_info
> (vinfo
);
7864 unsigned HOST_WIDE_INT value
;
/* NOTE(review): the opening of the return expression (original line 7865)
   is missing from this extract; the visible conjuncts require a constant
   vectorization factor at least vect_min_worthwhile_factor (CODE).  */
7866 && LOOP_VINFO_VECT_FACTOR (loop_vinfo
).is_constant (&value
)
7867 && value
>= vect_min_worthwhile_factor (code
));
7870 /* Function vectorizable_induction
7872 Check if STMT_INFO performs an induction computation that can be vectorized.
7873 If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
7874 phi to replace it, put it in VEC_STMT, and add it to the same basic block.
7875 Return true if STMT_INFO is vectorizable in this way. */
7878 vectorizable_induction (loop_vec_info loop_vinfo
,
7879 stmt_vec_info stmt_info
,
7880 gimple
**vec_stmt
, slp_tree slp_node
,
7881 stmt_vector_for_cost
*cost_vec
)
7883 class loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
7885 bool nested_in_vect_loop
= false;
7886 class loop
*iv_loop
;
7888 edge pe
= loop_preheader_edge (loop
);
7890 tree new_vec
, vec_init
, vec_step
, t
;
7893 gphi
*induction_phi
;
7894 tree induc_def
, vec_dest
;
7895 tree init_expr
, step_expr
;
7896 poly_uint64 vf
= LOOP_VINFO_VECT_FACTOR (loop_vinfo
);
7899 gimple_stmt_iterator si
;
7901 gphi
*phi
= dyn_cast
<gphi
*> (stmt_info
->stmt
);
7905 if (!STMT_VINFO_RELEVANT_P (stmt_info
))
7908 /* Make sure it was recognized as induction computation. */
7909 if (STMT_VINFO_DEF_TYPE (stmt_info
) != vect_induction_def
)
7912 tree vectype
= STMT_VINFO_VECTYPE (stmt_info
);
7913 poly_uint64 nunits
= TYPE_VECTOR_SUBPARTS (vectype
);
7918 ncopies
= vect_get_num_copies (loop_vinfo
, vectype
);
7919 gcc_assert (ncopies
>= 1);
7921 /* FORNOW. These restrictions should be relaxed. */
7922 if (nested_in_vect_loop_p (loop
, stmt_info
))
7924 imm_use_iterator imm_iter
;
7925 use_operand_p use_p
;
7932 if (dump_enabled_p ())
7933 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
7934 "multiple types in nested loop.\n");
7939 latch_e
= loop_latch_edge (loop
->inner
);
7940 loop_arg
= PHI_ARG_DEF_FROM_EDGE (phi
, latch_e
);
7941 FOR_EACH_IMM_USE_FAST (use_p
, imm_iter
, loop_arg
)
7943 gimple
*use_stmt
= USE_STMT (use_p
);
7944 if (is_gimple_debug (use_stmt
))
7947 if (!flow_bb_inside_loop_p (loop
->inner
, gimple_bb (use_stmt
)))
7949 exit_phi
= use_stmt
;
7955 stmt_vec_info exit_phi_vinfo
= loop_vinfo
->lookup_stmt (exit_phi
);
7956 if (!(STMT_VINFO_RELEVANT_P (exit_phi_vinfo
)
7957 && !STMT_VINFO_LIVE_P (exit_phi_vinfo
)))
7959 if (dump_enabled_p ())
7960 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
7961 "inner-loop induction only used outside "
7962 "of the outer vectorized loop.\n");
7967 nested_in_vect_loop
= true;
7968 iv_loop
= loop
->inner
;
7972 gcc_assert (iv_loop
== (gimple_bb (phi
))->loop_father
);
7974 if (slp_node
&& !nunits
.is_constant ())
7976 /* The current SLP code creates the step value element-by-element. */
7977 if (dump_enabled_p ())
7978 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
7979 "SLP induction not supported for variable-length"
7984 if (!vec_stmt
) /* transformation not required. */
7986 unsigned inside_cost
= 0, prologue_cost
= 0;
7989 /* We eventually need to set a vector type on invariant
7993 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node
), j
, child
)
7994 if (!vect_maybe_update_slp_op_vectype
7995 (child
, SLP_TREE_VECTYPE (slp_node
)))
7997 if (dump_enabled_p ())
7998 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
7999 "incompatible vector types for "
8003 /* loop cost for vec_loop. */
8005 = record_stmt_cost (cost_vec
,
8006 SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node
),
8007 vector_stmt
, stmt_info
, 0, vect_body
);
8008 /* prologue cost for vec_init (if not nested) and step. */
8009 prologue_cost
= record_stmt_cost (cost_vec
, 1 + !nested_in_vect_loop
,
8011 stmt_info
, 0, vect_prologue
);
8013 else /* if (!slp_node) */
8015 /* loop cost for vec_loop. */
8016 inside_cost
= record_stmt_cost (cost_vec
, ncopies
, vector_stmt
,
8017 stmt_info
, 0, vect_body
);
8018 /* prologue cost for vec_init and vec_step. */
8019 prologue_cost
= record_stmt_cost (cost_vec
, 2, scalar_to_vec
,
8020 stmt_info
, 0, vect_prologue
);
8022 if (dump_enabled_p ())
8023 dump_printf_loc (MSG_NOTE
, vect_location
,
8024 "vect_model_induction_cost: inside_cost = %d, "
8025 "prologue_cost = %d .\n", inside_cost
,
8028 STMT_VINFO_TYPE (stmt_info
) = induc_vec_info_type
;
8029 DUMP_VECT_SCOPE ("vectorizable_induction");
8035 /* Compute a vector variable, initialized with the first VF values of
8036 the induction variable. E.g., for an iv with IV_PHI='X' and
8037 evolution S, for a vector of 4 units, we want to compute:
8038 [X, X + S, X + 2*S, X + 3*S]. */
8040 if (dump_enabled_p ())
8041 dump_printf_loc (MSG_NOTE
, vect_location
, "transform induction phi.\n");
8043 step_expr
= STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info
);
8044 gcc_assert (step_expr
!= NULL_TREE
);
8045 tree step_vectype
= get_same_sized_vectype (TREE_TYPE (step_expr
), vectype
);
8047 pe
= loop_preheader_edge (iv_loop
);
8048 /* Find the first insertion point in the BB. */
8049 basic_block bb
= gimple_bb (phi
);
8050 si
= gsi_after_labels (bb
);
8052 /* For SLP induction we have to generate several IVs as for example
8053 with group size 3 we need
8054 [i0, i1, i2, i0 + S0] [i1 + S1, i2 + S2, i0 + 2*S0, i1 + 2*S1]
8055 [i2 + 2*S2, i0 + 3*S0, i1 + 3*S1, i2 + 3*S2]. */
8058 /* Enforced above. */
8059 unsigned int const_nunits
= nunits
.to_constant ();
8061 /* The initial values are vectorized, but any lanes > group_size
8064 = SLP_TREE_CHILDREN (slp_node
)[pe
->dest_idx
];
8066 /* Gather steps. Since we do not vectorize inductions as
8067 cycles we have to reconstruct the step from SCEV data. */
8068 unsigned group_size
= SLP_TREE_LANES (slp_node
);
8069 tree
*steps
= XALLOCAVEC (tree
, group_size
);
8070 tree
*inits
= XALLOCAVEC (tree
, group_size
);
8071 stmt_vec_info phi_info
;
8072 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (slp_node
), i
, phi_info
)
8074 steps
[i
] = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (phi_info
);
8076 inits
[i
] = gimple_phi_arg_def (as_a
<gphi
*> (phi_info
->stmt
),
8080 /* Now generate the IVs. */
8081 unsigned nvects
= SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node
);
8082 gcc_assert ((const_nunits
* nvects
) % group_size
== 0);
8084 if (nested_in_vect_loop
)
8088 /* Compute the number of distinct IVs we need. First reduce
8089 group_size if it is a multiple of const_nunits so we get
8090 one IV for a group_size of 4 but const_nunits 2. */
8091 unsigned group_sizep
= group_size
;
8092 if (group_sizep
% const_nunits
== 0)
8093 group_sizep
= group_sizep
/ const_nunits
;
8094 nivs
= least_common_multiple (group_sizep
,
8095 const_nunits
) / const_nunits
;
8097 tree stept
= TREE_TYPE (step_vectype
);
8098 tree lupdate_mul
= NULL_TREE
;
8099 if (!nested_in_vect_loop
)
8101 /* The number of iterations covered in one vector iteration. */
8102 unsigned lup_mul
= (nvects
* const_nunits
) / group_size
;
8104 = build_vector_from_val (step_vectype
,
8105 SCALAR_FLOAT_TYPE_P (stept
)
8106 ? build_real_from_wide (stept
, lup_mul
,
8108 : build_int_cstu (stept
, lup_mul
));
8110 tree peel_mul
= NULL_TREE
;
8111 gimple_seq init_stmts
= NULL
;
8112 if (LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo
))
8114 if (SCALAR_FLOAT_TYPE_P (stept
))
8115 peel_mul
= gimple_build (&init_stmts
, FLOAT_EXPR
, stept
,
8116 LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo
));
8118 peel_mul
= gimple_convert (&init_stmts
, stept
,
8119 LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo
));
8120 peel_mul
= gimple_build_vector_from_val (&init_stmts
,
8121 step_vectype
, peel_mul
);
8124 auto_vec
<tree
> vec_steps
;
8125 for (ivn
= 0; ivn
< nivs
; ++ivn
)
8127 tree_vector_builder
step_elts (step_vectype
, const_nunits
, 1);
8128 tree_vector_builder
init_elts (vectype
, const_nunits
, 1);
8129 tree_vector_builder
mul_elts (step_vectype
, const_nunits
, 1);
8130 for (unsigned eltn
= 0; eltn
< const_nunits
; ++eltn
)
8132 /* The scalar steps of the IVs. */
8133 tree elt
= steps
[(ivn
*const_nunits
+ eltn
) % group_size
];
8134 elt
= gimple_convert (&init_stmts
, TREE_TYPE (step_vectype
), elt
);
8135 step_elts
.quick_push (elt
);
8138 /* The scalar inits of the IVs if not vectorized. */
8139 elt
= inits
[(ivn
*const_nunits
+ eltn
) % group_size
];
8140 if (!useless_type_conversion_p (TREE_TYPE (vectype
),
8142 elt
= gimple_build (&init_stmts
, VIEW_CONVERT_EXPR
,
8143 TREE_TYPE (vectype
), elt
);
8144 init_elts
.quick_push (elt
);
8146 /* The number of steps to add to the initial values. */
8147 unsigned mul_elt
= (ivn
*const_nunits
+ eltn
) / group_size
;
8148 mul_elts
.quick_push (SCALAR_FLOAT_TYPE_P (stept
)
8149 ? build_real_from_wide (stept
,
8151 : build_int_cstu (stept
, mul_elt
));
8153 vec_step
= gimple_build_vector (&init_stmts
, &step_elts
);
8154 vec_steps
.safe_push (vec_step
);
8155 tree step_mul
= gimple_build_vector (&init_stmts
, &mul_elts
);
8157 step_mul
= gimple_build (&init_stmts
, PLUS_EXPR
, step_vectype
,
8158 step_mul
, peel_mul
);
8160 vec_init
= gimple_build_vector (&init_stmts
, &init_elts
);
8162 /* Create the induction-phi that defines the induction-operand. */
8163 vec_dest
= vect_get_new_vect_var (vectype
, vect_simple_var
,
8165 induction_phi
= create_phi_node (vec_dest
, iv_loop
->header
);
8166 induc_def
= PHI_RESULT (induction_phi
);
8168 /* Create the iv update inside the loop */
8171 up
= gimple_build (&init_stmts
, MULT_EXPR
, step_vectype
,
8172 vec_step
, lupdate_mul
);
8173 gimple_seq stmts
= NULL
;
8174 vec_def
= gimple_convert (&stmts
, step_vectype
, induc_def
);
8175 vec_def
= gimple_build (&stmts
,
8176 PLUS_EXPR
, step_vectype
, vec_def
, up
);
8177 vec_def
= gimple_convert (&stmts
, vectype
, vec_def
);
8178 gsi_insert_seq_before (&si
, stmts
, GSI_SAME_STMT
);
8179 add_phi_arg (induction_phi
, vec_def
, loop_latch_edge (iv_loop
),
8183 vec_init
= vect_get_slp_vect_def (init_node
, ivn
);
8184 if (!nested_in_vect_loop
8185 && !integer_zerop (step_mul
))
8187 vec_def
= gimple_convert (&init_stmts
, step_vectype
, vec_init
);
8188 up
= gimple_build (&init_stmts
, MULT_EXPR
, step_vectype
,
8189 vec_step
, step_mul
);
8190 vec_def
= gimple_build (&init_stmts
, PLUS_EXPR
, step_vectype
,
8192 vec_init
= gimple_convert (&init_stmts
, vectype
, vec_def
);
8195 /* Set the arguments of the phi node: */
8196 add_phi_arg (induction_phi
, vec_init
, pe
, UNKNOWN_LOCATION
);
8198 SLP_TREE_VEC_STMTS (slp_node
).quick_push (induction_phi
);
8200 if (!nested_in_vect_loop
)
8202 /* Fill up to the number of vectors we need for the whole group. */
8203 nivs
= least_common_multiple (group_size
,
8204 const_nunits
) / const_nunits
;
8205 for (; ivn
< nivs
; ++ivn
)
8207 SLP_TREE_VEC_STMTS (slp_node
)
8208 .quick_push (SLP_TREE_VEC_STMTS (slp_node
)[0]);
8209 vec_steps
.safe_push (vec_steps
[0]);
8213 /* Re-use IVs when we can. We are generating further vector
8214 stmts by adding VF' * stride to the IVs generated above. */
8218 = least_common_multiple (group_size
, const_nunits
) / group_size
;
8220 = build_vector_from_val (step_vectype
,
8221 SCALAR_FLOAT_TYPE_P (stept
)
8222 ? build_real_from_wide (stept
,
8224 : build_int_cstu (stept
, vfp
));
8225 for (; ivn
< nvects
; ++ivn
)
8227 gimple
*iv
= SLP_TREE_VEC_STMTS (slp_node
)[ivn
- nivs
];
8228 tree def
= gimple_get_lhs (iv
);
8230 vec_steps
[ivn
- nivs
]
8231 = gimple_build (&init_stmts
, MULT_EXPR
, step_vectype
,
8232 vec_steps
[ivn
- nivs
], lupdate_mul
);
8233 gimple_seq stmts
= NULL
;
8234 def
= gimple_convert (&stmts
, step_vectype
, def
);
8235 def
= gimple_build (&stmts
, PLUS_EXPR
, step_vectype
,
8236 def
, vec_steps
[ivn
% nivs
]);
8237 def
= gimple_convert (&stmts
, vectype
, def
);
8238 if (gimple_code (iv
) == GIMPLE_PHI
)
8239 gsi_insert_seq_before (&si
, stmts
, GSI_SAME_STMT
);
8242 gimple_stmt_iterator tgsi
= gsi_for_stmt (iv
);
8243 gsi_insert_seq_after (&tgsi
, stmts
, GSI_CONTINUE_LINKING
);
8245 SLP_TREE_VEC_STMTS (slp_node
)
8246 .quick_push (SSA_NAME_DEF_STMT (def
));
8250 new_bb
= gsi_insert_seq_on_edge_immediate (pe
, init_stmts
);
8251 gcc_assert (!new_bb
);
8256 init_expr
= PHI_ARG_DEF_FROM_EDGE (phi
,
8257 loop_preheader_edge (iv_loop
));
8259 gimple_seq stmts
= NULL
;
8260 if (!nested_in_vect_loop
)
8262 /* Convert the initial value to the IV update type. */
8263 tree new_type
= TREE_TYPE (step_expr
);
8264 init_expr
= gimple_convert (&stmts
, new_type
, init_expr
);
8266 /* If we are using the loop mask to "peel" for alignment then we need
8267 to adjust the start value here. */
8268 tree skip_niters
= LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo
);
8269 if (skip_niters
!= NULL_TREE
)
8271 if (FLOAT_TYPE_P (vectype
))
8272 skip_niters
= gimple_build (&stmts
, FLOAT_EXPR
, new_type
,
8275 skip_niters
= gimple_convert (&stmts
, new_type
, skip_niters
);
8276 tree skip_step
= gimple_build (&stmts
, MULT_EXPR
, new_type
,
8277 skip_niters
, step_expr
);
8278 init_expr
= gimple_build (&stmts
, MINUS_EXPR
, new_type
,
8279 init_expr
, skip_step
);
8285 new_bb
= gsi_insert_seq_on_edge_immediate (pe
, stmts
);
8286 gcc_assert (!new_bb
);
8289 /* Create the vector that holds the initial_value of the induction. */
8290 if (nested_in_vect_loop
)
8292 /* iv_loop is nested in the loop to be vectorized. init_expr had already
8293 been created during vectorization of previous stmts. We obtain it
8294 from the STMT_VINFO_VEC_STMT of the defining stmt. */
8295 auto_vec
<tree
> vec_inits
;
8296 vect_get_vec_defs_for_operand (loop_vinfo
, stmt_info
, 1,
8297 init_expr
, &vec_inits
);
8298 vec_init
= vec_inits
[0];
8299 /* If the initial value is not of proper type, convert it. */
8300 if (!useless_type_conversion_p (vectype
, TREE_TYPE (vec_init
)))
8303 = gimple_build_assign (vect_get_new_ssa_name (vectype
,
8307 build1 (VIEW_CONVERT_EXPR
, vectype
,
8309 vec_init
= gimple_assign_lhs (new_stmt
);
8310 new_bb
= gsi_insert_on_edge_immediate (loop_preheader_edge (iv_loop
),
8312 gcc_assert (!new_bb
);
8317 /* iv_loop is the loop to be vectorized. Create:
8318 vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr) */
8320 new_name
= gimple_convert (&stmts
, TREE_TYPE (step_expr
), init_expr
);
8322 unsigned HOST_WIDE_INT const_nunits
;
8323 if (nunits
.is_constant (&const_nunits
))
8325 tree_vector_builder
elts (step_vectype
, const_nunits
, 1);
8326 elts
.quick_push (new_name
);
8327 for (i
= 1; i
< const_nunits
; i
++)
8329 /* Create: new_name_i = new_name + step_expr */
8330 new_name
= gimple_build (&stmts
, PLUS_EXPR
, TREE_TYPE (new_name
),
8331 new_name
, step_expr
);
8332 elts
.quick_push (new_name
);
8334 /* Create a vector from [new_name_0, new_name_1, ...,
8335 new_name_nunits-1] */
8336 vec_init
= gimple_build_vector (&stmts
, &elts
);
8338 else if (INTEGRAL_TYPE_P (TREE_TYPE (step_expr
)))
8339 /* Build the initial value directly from a VEC_SERIES_EXPR. */
8340 vec_init
= gimple_build (&stmts
, VEC_SERIES_EXPR
, step_vectype
,
8341 new_name
, step_expr
);
8345 [base, base, base, ...]
8346 + (vectype) [0, 1, 2, ...] * [step, step, step, ...]. */
8347 gcc_assert (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr
)));
8348 gcc_assert (flag_associative_math
);
8349 tree index
= build_index_vector (step_vectype
, 0, 1);
8350 tree base_vec
= gimple_build_vector_from_val (&stmts
, step_vectype
,
8352 tree step_vec
= gimple_build_vector_from_val (&stmts
, step_vectype
,
8354 vec_init
= gimple_build (&stmts
, FLOAT_EXPR
, step_vectype
, index
);
8355 vec_init
= gimple_build (&stmts
, MULT_EXPR
, step_vectype
,
8356 vec_init
, step_vec
);
8357 vec_init
= gimple_build (&stmts
, PLUS_EXPR
, step_vectype
,
8358 vec_init
, base_vec
);
8360 vec_init
= gimple_convert (&stmts
, vectype
, vec_init
);
8364 new_bb
= gsi_insert_seq_on_edge_immediate (pe
, stmts
);
8365 gcc_assert (!new_bb
);
8370 /* Create the vector that holds the step of the induction. */
8371 if (nested_in_vect_loop
)
8372 /* iv_loop is nested in the loop to be vectorized. Generate:
8373 vec_step = [S, S, S, S] */
8374 new_name
= step_expr
;
8377 /* iv_loop is the loop to be vectorized. Generate:
8378 vec_step = [VF*S, VF*S, VF*S, VF*S] */
8379 gimple_seq seq
= NULL
;
8380 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr
)))
8382 expr
= build_int_cst (integer_type_node
, vf
);
8383 expr
= gimple_build (&seq
, FLOAT_EXPR
, TREE_TYPE (step_expr
), expr
);
8386 expr
= build_int_cst (TREE_TYPE (step_expr
), vf
);
8387 new_name
= gimple_build (&seq
, MULT_EXPR
, TREE_TYPE (step_expr
),
8391 new_bb
= gsi_insert_seq_on_edge_immediate (pe
, seq
);
8392 gcc_assert (!new_bb
);
8396 t
= unshare_expr (new_name
);
8397 gcc_assert (CONSTANT_CLASS_P (new_name
)
8398 || TREE_CODE (new_name
) == SSA_NAME
);
8399 new_vec
= build_vector_from_val (step_vectype
, t
);
8400 vec_step
= vect_init_vector (loop_vinfo
, stmt_info
,
8401 new_vec
, step_vectype
, NULL
);
8404 /* Create the following def-use cycle:
8409 vec_iv = PHI <vec_init, vec_loop>
8413 vec_loop = vec_iv + vec_step; */
8415 /* Create the induction-phi that defines the induction-operand. */
8416 vec_dest
= vect_get_new_vect_var (vectype
, vect_simple_var
, "vec_iv_");
8417 induction_phi
= create_phi_node (vec_dest
, iv_loop
->header
);
8418 induc_def
= PHI_RESULT (induction_phi
);
8420 /* Create the iv update inside the loop */
8422 vec_def
= gimple_convert (&stmts
, step_vectype
, induc_def
);
8423 vec_def
= gimple_build (&stmts
, PLUS_EXPR
, step_vectype
, vec_def
, vec_step
);
8424 vec_def
= gimple_convert (&stmts
, vectype
, vec_def
);
8425 gsi_insert_seq_before (&si
, stmts
, GSI_SAME_STMT
);
8426 new_stmt
= SSA_NAME_DEF_STMT (vec_def
);
8428 /* Set the arguments of the phi node: */
8429 add_phi_arg (induction_phi
, vec_init
, pe
, UNKNOWN_LOCATION
);
8430 add_phi_arg (induction_phi
, vec_def
, loop_latch_edge (iv_loop
),
8433 STMT_VINFO_VEC_STMTS (stmt_info
).safe_push (induction_phi
);
8434 *vec_stmt
= induction_phi
;
8436 /* In case that vectorization factor (VF) is bigger than the number
8437 of elements that we can fit in a vectype (nunits), we have to generate
8438 more than one vector stmt - i.e - we need to "unroll" the
8439 vector stmt by a factor VF/nunits. For more details see documentation
8440 in vectorizable_operation. */
8444 gimple_seq seq
= NULL
;
8445 /* FORNOW. This restriction should be relaxed. */
8446 gcc_assert (!nested_in_vect_loop
);
8448 /* Create the vector that holds the step of the induction. */
8449 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr
)))
8451 expr
= build_int_cst (integer_type_node
, nunits
);
8452 expr
= gimple_build (&seq
, FLOAT_EXPR
, TREE_TYPE (step_expr
), expr
);
8455 expr
= build_int_cst (TREE_TYPE (step_expr
), nunits
);
8456 new_name
= gimple_build (&seq
, MULT_EXPR
, TREE_TYPE (step_expr
),
8460 new_bb
= gsi_insert_seq_on_edge_immediate (pe
, seq
);
8461 gcc_assert (!new_bb
);
8464 t
= unshare_expr (new_name
);
8465 gcc_assert (CONSTANT_CLASS_P (new_name
)
8466 || TREE_CODE (new_name
) == SSA_NAME
);
8467 new_vec
= build_vector_from_val (step_vectype
, t
);
8468 vec_step
= vect_init_vector (loop_vinfo
, stmt_info
,
8469 new_vec
, step_vectype
, NULL
);
8471 vec_def
= induc_def
;
8472 for (i
= 1; i
< ncopies
; i
++)
8474 /* vec_i = vec_prev + vec_step */
8475 gimple_seq stmts
= NULL
;
8476 vec_def
= gimple_convert (&stmts
, step_vectype
, vec_def
);
8477 vec_def
= gimple_build (&stmts
,
8478 PLUS_EXPR
, step_vectype
, vec_def
, vec_step
);
8479 vec_def
= gimple_convert (&stmts
, vectype
, vec_def
);
8481 gsi_insert_seq_before (&si
, stmts
, GSI_SAME_STMT
);
8482 new_stmt
= SSA_NAME_DEF_STMT (vec_def
);
8483 STMT_VINFO_VEC_STMTS (stmt_info
).safe_push (new_stmt
);
8487 if (dump_enabled_p ())
8488 dump_printf_loc (MSG_NOTE
, vect_location
,
8489 "transform induction: created def-use cycle: %G%G",
8490 induction_phi
, SSA_NAME_DEF_STMT (vec_def
));
8495 /* Function vectorizable_live_operation.
8497 STMT_INFO computes a value that is used outside the loop. Check if
8498 it can be supported. */
8501 vectorizable_live_operation (vec_info
*vinfo
,
8502 stmt_vec_info stmt_info
,
8503 gimple_stmt_iterator
*gsi
,
8504 slp_tree slp_node
, slp_instance slp_node_instance
,
8505 int slp_index
, bool vec_stmt_p
,
8506 stmt_vector_for_cost
*cost_vec
)
8508 loop_vec_info loop_vinfo
= dyn_cast
<loop_vec_info
> (vinfo
);
8509 imm_use_iterator imm_iter
;
8510 tree lhs
, lhs_type
, bitsize
;
8511 tree vectype
= (slp_node
8512 ? SLP_TREE_VECTYPE (slp_node
)
8513 : STMT_VINFO_VECTYPE (stmt_info
));
8514 poly_uint64 nunits
= TYPE_VECTOR_SUBPARTS (vectype
);
8517 auto_vec
<tree
> vec_oprnds
;
8519 poly_uint64 vec_index
= 0;
8521 gcc_assert (STMT_VINFO_LIVE_P (stmt_info
));
8523 /* If a stmt of a reduction is live, vectorize it via
8524 vect_create_epilog_for_reduction. vectorizable_reduction assessed
8525 validity so just trigger the transform here. */
8526 if (STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info
)))
8532 /* For reduction chains the meta-info is attached to
8533 the group leader. */
8534 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info
))
8535 stmt_info
= REDUC_GROUP_FIRST_ELEMENT (stmt_info
);
8536 /* For SLP reductions we vectorize the epilogue for
8537 all involved stmts together. */
8538 else if (slp_index
!= 0)
8541 /* For SLP reductions the meta-info is attached to
8542 the representative. */
8543 stmt_info
= SLP_TREE_REPRESENTATIVE (slp_node
);
8545 stmt_vec_info reduc_info
= info_for_reduction (loop_vinfo
, stmt_info
);
8546 gcc_assert (reduc_info
->is_reduc_info
);
8547 if (STMT_VINFO_REDUC_TYPE (reduc_info
) == FOLD_LEFT_REDUCTION
8548 || STMT_VINFO_REDUC_TYPE (reduc_info
) == EXTRACT_LAST_REDUCTION
)
8550 vect_create_epilog_for_reduction (loop_vinfo
, stmt_info
, slp_node
,
8555 /* If STMT is not relevant and it is a simple assignment and its inputs are
8556 invariant then it can remain in place, unvectorized. The original last
8557 scalar value that it computes will be used. */
8558 if (!STMT_VINFO_RELEVANT_P (stmt_info
))
8560 gcc_assert (is_simple_and_all_uses_invariant (stmt_info
, loop_vinfo
));
8561 if (dump_enabled_p ())
8562 dump_printf_loc (MSG_NOTE
, vect_location
,
8563 "statement is simple and uses invariant. Leaving in "
8571 ncopies
= vect_get_num_copies (loop_vinfo
, vectype
);
8575 gcc_assert (slp_index
>= 0);
8577 /* Get the last occurrence of the scalar index from the concatenation of
8578 all the slp vectors. Calculate which slp vector it is and the index
8580 int num_scalar
= SLP_TREE_LANES (slp_node
);
8581 int num_vec
= SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node
);
8582 poly_uint64 pos
= (num_vec
* nunits
) - num_scalar
+ slp_index
;
8584 /* Calculate which vector contains the result, and which lane of
8585 that vector we need. */
8586 if (!can_div_trunc_p (pos
, nunits
, &vec_entry
, &vec_index
))
8588 if (dump_enabled_p ())
8589 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
8590 "Cannot determine which vector holds the"
8591 " final result.\n");
8598 /* No transformation required. */
8599 if (loop_vinfo
&& LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo
))
8601 if (!direct_internal_fn_supported_p (IFN_EXTRACT_LAST
, vectype
,
8602 OPTIMIZE_FOR_SPEED
))
8604 if (dump_enabled_p ())
8605 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
8606 "can't operate on partial vectors "
8607 "because the target doesn't support extract "
8608 "last reduction.\n");
8609 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo
) = false;
8613 if (dump_enabled_p ())
8614 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
8615 "can't operate on partial vectors "
8616 "because an SLP statement is live after "
8618 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo
) = false;
8620 else if (ncopies
> 1)
8622 if (dump_enabled_p ())
8623 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
8624 "can't operate on partial vectors "
8625 "because ncopies is greater than 1.\n");
8626 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo
) = false;
8630 gcc_assert (ncopies
== 1 && !slp_node
);
8631 vect_record_loop_mask (loop_vinfo
,
8632 &LOOP_VINFO_MASKS (loop_vinfo
),
8636 /* ??? Enable for loop costing as well. */
8638 record_stmt_cost (cost_vec
, 1, vec_to_scalar
, stmt_info
, NULL_TREE
,
8643 /* Use the lhs of the original scalar statement. */
8644 gimple
*stmt
= vect_orig_stmt (stmt_info
)->stmt
;
8645 if (dump_enabled_p ())
8646 dump_printf_loc (MSG_NOTE
, vect_location
, "extracting lane for live "
8649 lhs
= gimple_get_lhs (stmt
);
8650 lhs_type
= TREE_TYPE (lhs
);
8652 bitsize
= vector_element_bits_tree (vectype
);
8654 /* Get the vectorized lhs of STMT and the lane to use (counted in bits). */
8655 tree vec_lhs
, bitstart
;
8659 gcc_assert (!loop_vinfo
|| !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo
));
8661 /* Get the correct slp vectorized stmt. */
8662 vec_stmt
= SLP_TREE_VEC_STMTS (slp_node
)[vec_entry
];
8663 vec_lhs
= gimple_get_lhs (vec_stmt
);
8665 /* Get entry to use. */
8666 bitstart
= bitsize_int (vec_index
);
8667 bitstart
= int_const_binop (MULT_EXPR
, bitsize
, bitstart
);
8671 /* For multiple copies, get the last copy. */
8672 vec_stmt
= STMT_VINFO_VEC_STMTS (stmt_info
).last ();
8673 vec_lhs
= gimple_get_lhs (vec_stmt
);
8675 /* Get the last lane in the vector. */
8676 bitstart
= int_const_binop (MULT_EXPR
, bitsize
, bitsize_int (nunits
- 1));
8681 /* Ensure the VEC_LHS for lane extraction stmts satisfy loop-closed PHI
8682 requirement, insert one phi node for it. It looks like:
8689 # vec_lhs' = PHI <vec_lhs>
8690 new_tree = lane_extract <vec_lhs', ...>;
8693 class loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
8694 basic_block exit_bb
= single_exit (loop
)->dest
;
8695 gcc_assert (single_pred_p (exit_bb
));
8697 tree vec_lhs_phi
= copy_ssa_name (vec_lhs
);
8698 gimple
*phi
= create_phi_node (vec_lhs_phi
, exit_bb
);
8699 SET_PHI_ARG_DEF (phi
, single_exit (loop
)->dest_idx
, vec_lhs
);
8701 gimple_seq stmts
= NULL
;
8703 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo
))
8707 SCALAR_RES = EXTRACT_LAST <VEC_LHS, MASK>
8709 where VEC_LHS is the vectorized live-out result and MASK is
8710 the loop mask for the final iteration. */
8711 gcc_assert (ncopies
== 1 && !slp_node
);
8712 tree scalar_type
= TREE_TYPE (STMT_VINFO_VECTYPE (stmt_info
));
8713 tree mask
= vect_get_loop_mask (gsi
, &LOOP_VINFO_MASKS (loop_vinfo
),
8715 tree scalar_res
= gimple_build (&stmts
, CFN_EXTRACT_LAST
, scalar_type
,
8718 /* Convert the extracted vector element to the scalar type. */
8719 new_tree
= gimple_convert (&stmts
, lhs_type
, scalar_res
);
8723 tree bftype
= TREE_TYPE (vectype
);
8724 if (VECTOR_BOOLEAN_TYPE_P (vectype
))
8725 bftype
= build_nonstandard_integer_type (tree_to_uhwi (bitsize
), 1);
8726 new_tree
= build3 (BIT_FIELD_REF
, bftype
,
8727 vec_lhs_phi
, bitsize
, bitstart
);
8728 new_tree
= force_gimple_operand (fold_convert (lhs_type
, new_tree
),
8729 &stmts
, true, NULL_TREE
);
8734 gimple_stmt_iterator exit_gsi
= gsi_after_labels (exit_bb
);
8735 gsi_insert_seq_before (&exit_gsi
, stmts
, GSI_SAME_STMT
);
8737 /* Remove existing phi from lhs and create one copy from new_tree. */
8738 tree lhs_phi
= NULL_TREE
;
8739 gimple_stmt_iterator gsi
;
8740 for (gsi
= gsi_start_phis (exit_bb
);
8741 !gsi_end_p (gsi
); gsi_next (&gsi
))
8743 gimple
*phi
= gsi_stmt (gsi
);
8744 if ((gimple_phi_arg_def (phi
, 0) == lhs
))
8746 remove_phi_node (&gsi
, false);
8747 lhs_phi
= gimple_phi_result (phi
);
8748 gimple
*copy
= gimple_build_assign (lhs_phi
, new_tree
);
8749 gsi_insert_before (&exit_gsi
, copy
, GSI_SAME_STMT
);
8755 /* Replace use of lhs with newly computed result. If the use stmt is a
8756 single arg PHI, just replace all uses of PHI result. It's necessary
8757 because lcssa PHI defining lhs may be before newly inserted stmt. */
8758 use_operand_p use_p
;
8759 FOR_EACH_IMM_USE_STMT (use_stmt
, imm_iter
, lhs
)
8760 if (!flow_bb_inside_loop_p (loop
, gimple_bb (use_stmt
))
8761 && !is_gimple_debug (use_stmt
))
8763 if (gimple_code (use_stmt
) == GIMPLE_PHI
8764 && gimple_phi_num_args (use_stmt
) == 1)
8766 replace_uses_by (gimple_phi_result (use_stmt
), new_tree
);
8770 FOR_EACH_IMM_USE_ON_STMT (use_p
, imm_iter
)
8771 SET_USE (use_p
, new_tree
);
8773 update_stmt (use_stmt
);
8778 /* For basic-block vectorization simply insert the lane-extraction. */
8779 tree bftype
= TREE_TYPE (vectype
);
8780 if (VECTOR_BOOLEAN_TYPE_P (vectype
))
8781 bftype
= build_nonstandard_integer_type (tree_to_uhwi (bitsize
), 1);
8782 tree new_tree
= build3 (BIT_FIELD_REF
, bftype
,
8783 vec_lhs
, bitsize
, bitstart
);
8784 gimple_seq stmts
= NULL
;
8785 new_tree
= force_gimple_operand (fold_convert (lhs_type
, new_tree
),
8786 &stmts
, true, NULL_TREE
);
8787 if (TREE_CODE (new_tree
) == SSA_NAME
8788 && SSA_NAME_OCCURS_IN_ABNORMAL_PHI (lhs
))
8789 SSA_NAME_OCCURS_IN_ABNORMAL_PHI (new_tree
) = 1;
8790 if (is_a
<gphi
*> (vec_stmt
))
8792 gimple_stmt_iterator si
= gsi_after_labels (gimple_bb (vec_stmt
));
8793 gsi_insert_seq_before (&si
, stmts
, GSI_SAME_STMT
);
8797 gimple_stmt_iterator si
= gsi_for_stmt (vec_stmt
);
8798 gsi_insert_seq_after (&si
, stmts
, GSI_SAME_STMT
);
8801 /* Replace use of lhs with newly computed result. If the use stmt is a
8802 single arg PHI, just replace all uses of PHI result. It's necessary
8803 because lcssa PHI defining lhs may be before newly inserted stmt. */
8804 use_operand_p use_p
;
8805 stmt_vec_info use_stmt_info
;
8806 FOR_EACH_IMM_USE_STMT (use_stmt
, imm_iter
, lhs
)
8807 if (!is_gimple_debug (use_stmt
)
8808 && (!(use_stmt_info
= vinfo
->lookup_stmt (use_stmt
))
8809 || !PURE_SLP_STMT (vect_stmt_to_vectorize (use_stmt_info
))))
8811 /* ??? This can happen when the live lane ends up being
8812 used in a vector construction code-generated by an
8813 external SLP node (and code-generation for that already
8814 happened). See gcc.dg/vect/bb-slp-47.c.
8815 Doing this is what would happen if that vector CTOR
8816 were not code-generated yet so it is not too bad.
8817 ??? In fact we'd likely want to avoid this situation
8818 in the first place. */
8819 if (TREE_CODE (new_tree
) == SSA_NAME
8820 && !SSA_NAME_IS_DEFAULT_DEF (new_tree
)
8821 && gimple_code (use_stmt
) != GIMPLE_PHI
8822 && !vect_stmt_dominates_stmt_p (SSA_NAME_DEF_STMT (new_tree
),
8825 enum tree_code code
= gimple_assign_rhs_code (use_stmt
);
8826 gcc_assert (code
== CONSTRUCTOR
8827 || code
== VIEW_CONVERT_EXPR
8828 || CONVERT_EXPR_CODE_P (code
));
8829 if (dump_enabled_p ())
8830 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
8831 "Using original scalar computation for "
8832 "live lane because use preceeds vector "
8836 /* ??? It can also happen that we end up pulling a def into
8837 a loop where replacing out-of-loop uses would require
8838 a new LC SSA PHI node. Retain the original scalar in
8839 those cases as well. PR98064. */
8840 if (TREE_CODE (new_tree
) == SSA_NAME
8841 && !SSA_NAME_IS_DEFAULT_DEF (new_tree
)
8842 && (gimple_bb (use_stmt
)->loop_father
8843 != gimple_bb (vec_stmt
)->loop_father
)
8844 && !flow_loop_nested_p (gimple_bb (vec_stmt
)->loop_father
,
8845 gimple_bb (use_stmt
)->loop_father
))
8847 if (dump_enabled_p ())
8848 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
8849 "Using original scalar computation for "
8850 "live lane because there is an out-of-loop "
8851 "definition for it\n");
8854 FOR_EACH_IMM_USE_ON_STMT (use_p
, imm_iter
)
8855 SET_USE (use_p
, new_tree
);
8856 update_stmt (use_stmt
);
8863 /* Kill any debug uses outside LOOP of SSA names defined in STMT_INFO. */
8866 vect_loop_kill_debug_uses (class loop
*loop
, stmt_vec_info stmt_info
)
8868 ssa_op_iter op_iter
;
8869 imm_use_iterator imm_iter
;
8870 def_operand_p def_p
;
8873 FOR_EACH_PHI_OR_STMT_DEF (def_p
, stmt_info
->stmt
, op_iter
, SSA_OP_DEF
)
8875 FOR_EACH_IMM_USE_STMT (ustmt
, imm_iter
, DEF_FROM_PTR (def_p
))
8879 if (!is_gimple_debug (ustmt
))
8882 bb
= gimple_bb (ustmt
);
8884 if (!flow_bb_inside_loop_p (loop
, bb
))
8886 if (gimple_debug_bind_p (ustmt
))
8888 if (dump_enabled_p ())
8889 dump_printf_loc (MSG_NOTE
, vect_location
,
8890 "killing debug use\n");
8892 gimple_debug_bind_reset_value (ustmt
);
8893 update_stmt (ustmt
);
8902 /* Given loop represented by LOOP_VINFO, return true if computation of
8903 LOOP_VINFO_NITERS (= LOOP_VINFO_NITERSM1 + 1) doesn't overflow, false
8907 loop_niters_no_overflow (loop_vec_info loop_vinfo
)
8909 /* Constant case. */
8910 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo
))
8912 tree cst_niters
= LOOP_VINFO_NITERS (loop_vinfo
);
8913 tree cst_nitersm1
= LOOP_VINFO_NITERSM1 (loop_vinfo
);
8915 gcc_assert (TREE_CODE (cst_niters
) == INTEGER_CST
);
8916 gcc_assert (TREE_CODE (cst_nitersm1
) == INTEGER_CST
);
8917 if (wi::to_widest (cst_nitersm1
) < wi::to_widest (cst_niters
))
8922 class loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
8923 /* Check the upper bound of loop niters. */
8924 if (get_max_loop_iterations (loop
, &max
))
8926 tree type
= TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo
));
8927 signop sgn
= TYPE_SIGN (type
);
8928 widest_int type_max
= widest_int::from (wi::max_value (type
), sgn
);
8935 /* Return a mask type with half the number of elements as OLD_TYPE,
8936 given that it should have mode NEW_MODE. */
8939 vect_halve_mask_nunits (tree old_type
, machine_mode new_mode
)
8941 poly_uint64 nunits
= exact_div (TYPE_VECTOR_SUBPARTS (old_type
), 2);
8942 return build_truth_vector_type_for_mode (nunits
, new_mode
);
8945 /* Return a mask type with twice as many elements as OLD_TYPE,
8946 given that it should have mode NEW_MODE. */
8949 vect_double_mask_nunits (tree old_type
, machine_mode new_mode
)
8951 poly_uint64 nunits
= TYPE_VECTOR_SUBPARTS (old_type
) * 2;
8952 return build_truth_vector_type_for_mode (nunits
, new_mode
);
8955 /* Record that a fully-masked version of LOOP_VINFO would need MASKS to
8956 contain a sequence of NVECTORS masks that each control a vector of type
8957 VECTYPE. If SCALAR_MASK is nonnull, the fully-masked loop would AND
8958 these vector masks with the vector version of SCALAR_MASK. */
8961 vect_record_loop_mask (loop_vec_info loop_vinfo
, vec_loop_masks
*masks
,
8962 unsigned int nvectors
, tree vectype
, tree scalar_mask
)
8964 gcc_assert (nvectors
!= 0);
8965 if (masks
->length () < nvectors
)
8966 masks
->safe_grow_cleared (nvectors
, true);
8967 rgroup_controls
*rgm
= &(*masks
)[nvectors
- 1];
8968 /* The number of scalars per iteration and the number of vectors are
8969 both compile-time constants. */
8970 unsigned int nscalars_per_iter
8971 = exact_div (nvectors
* TYPE_VECTOR_SUBPARTS (vectype
),
8972 LOOP_VINFO_VECT_FACTOR (loop_vinfo
)).to_constant ();
8976 scalar_cond_masked_key
cond (scalar_mask
, nvectors
);
8977 loop_vinfo
->scalar_cond_masked_set
.add (cond
);
8980 if (rgm
->max_nscalars_per_iter
< nscalars_per_iter
)
8982 rgm
->max_nscalars_per_iter
= nscalars_per_iter
;
8983 rgm
->type
= truth_type_for (vectype
);
8988 /* Given a complete set of masks MASKS, extract mask number INDEX
8989 for an rgroup that operates on NVECTORS vectors of type VECTYPE,
8990 where 0 <= INDEX < NVECTORS. Insert any set-up statements before GSI.
8992 See the comment above vec_loop_masks for more details about the mask
8996 vect_get_loop_mask (gimple_stmt_iterator
*gsi
, vec_loop_masks
*masks
,
8997 unsigned int nvectors
, tree vectype
, unsigned int index
)
8999 rgroup_controls
*rgm
= &(*masks
)[nvectors
- 1];
9000 tree mask_type
= rgm
->type
;
9002 /* Populate the rgroup's mask array, if this is the first time we've
9004 if (rgm
->controls
.is_empty ())
9006 rgm
->controls
.safe_grow_cleared (nvectors
, true);
9007 for (unsigned int i
= 0; i
< nvectors
; ++i
)
9009 tree mask
= make_temp_ssa_name (mask_type
, NULL
, "loop_mask");
9010 /* Provide a dummy definition until the real one is available. */
9011 SSA_NAME_DEF_STMT (mask
) = gimple_build_nop ();
9012 rgm
->controls
[i
] = mask
;
9016 tree mask
= rgm
->controls
[index
];
9017 if (maybe_ne (TYPE_VECTOR_SUBPARTS (mask_type
),
9018 TYPE_VECTOR_SUBPARTS (vectype
)))
9020 /* A loop mask for data type X can be reused for data type Y
9021 if X has N times more elements than Y and if Y's elements
9022 are N times bigger than X's. In this case each sequence
9023 of N elements in the loop mask will be all-zero or all-one.
9024 We can then view-convert the mask so that each sequence of
9025 N elements is replaced by a single element. */
9026 gcc_assert (multiple_p (TYPE_VECTOR_SUBPARTS (mask_type
),
9027 TYPE_VECTOR_SUBPARTS (vectype
)));
9028 gimple_seq seq
= NULL
;
9029 mask_type
= truth_type_for (vectype
);
9030 mask
= gimple_build (&seq
, VIEW_CONVERT_EXPR
, mask_type
, mask
);
9032 gsi_insert_seq_before (gsi
, seq
, GSI_SAME_STMT
);
9037 /* Record that LOOP_VINFO would need LENS to contain a sequence of NVECTORS
9038 lengths for controlling an operation on VECTYPE. The operation splits
9039 each element of VECTYPE into FACTOR separate subelements, measuring the
9040 length as a number of these subelements. */
9043 vect_record_loop_len (loop_vec_info loop_vinfo
, vec_loop_lens
*lens
,
9044 unsigned int nvectors
, tree vectype
, unsigned int factor
)
9046 gcc_assert (nvectors
!= 0);
9047 if (lens
->length () < nvectors
)
9048 lens
->safe_grow_cleared (nvectors
, true);
9049 rgroup_controls
*rgl
= &(*lens
)[nvectors
- 1];
9051 /* The number of scalars per iteration, scalar occupied bytes and
9052 the number of vectors are both compile-time constants. */
9053 unsigned int nscalars_per_iter
9054 = exact_div (nvectors
* TYPE_VECTOR_SUBPARTS (vectype
),
9055 LOOP_VINFO_VECT_FACTOR (loop_vinfo
)).to_constant ();
9057 if (rgl
->max_nscalars_per_iter
< nscalars_per_iter
)
9059 /* For now, we only support cases in which all loads and stores fall back
9060 to VnQI or none do. */
9061 gcc_assert (!rgl
->max_nscalars_per_iter
9062 || (rgl
->factor
== 1 && factor
== 1)
9063 || (rgl
->max_nscalars_per_iter
* rgl
->factor
9064 == nscalars_per_iter
* factor
));
9065 rgl
->max_nscalars_per_iter
= nscalars_per_iter
;
9066 rgl
->type
= vectype
;
9067 rgl
->factor
= factor
;
9071 /* Given a complete set of length LENS, extract length number INDEX for an
9072 rgroup that operates on NVECTORS vectors, where 0 <= INDEX < NVECTORS. */
9075 vect_get_loop_len (loop_vec_info loop_vinfo
, vec_loop_lens
*lens
,
9076 unsigned int nvectors
, unsigned int index
)
9078 rgroup_controls
*rgl
= &(*lens
)[nvectors
- 1];
9080 /* Populate the rgroup's len array, if this is the first time we've
9082 if (rgl
->controls
.is_empty ())
9084 rgl
->controls
.safe_grow_cleared (nvectors
, true);
9085 for (unsigned int i
= 0; i
< nvectors
; ++i
)
9087 tree len_type
= LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo
);
9088 gcc_assert (len_type
!= NULL_TREE
);
9089 tree len
= make_temp_ssa_name (len_type
, NULL
, "loop_len");
9091 /* Provide a dummy definition until the real one is available. */
9092 SSA_NAME_DEF_STMT (len
) = gimple_build_nop ();
9093 rgl
->controls
[i
] = len
;
9097 return rgl
->controls
[index
];
9100 /* Scale profiling counters by estimation for LOOP which is vectorized
9104 scale_profile_for_vect_loop (class loop
*loop
, unsigned vf
)
9106 edge preheader
= loop_preheader_edge (loop
);
9107 /* Reduce loop iterations by the vectorization factor. */
9108 gcov_type new_est_niter
= niter_for_unrolled_loop (loop
, vf
);
9109 profile_count freq_h
= loop
->header
->count
, freq_e
= preheader
->count ();
9111 if (freq_h
.nonzero_p ())
9113 profile_probability p
;
9115 /* Avoid dropping loop body profile counter to 0 because of zero count
9116 in loop's preheader. */
9117 if (!(freq_e
== profile_count::zero ()))
9118 freq_e
= freq_e
.force_nonzero ();
9119 p
= freq_e
.apply_scale (new_est_niter
+ 1, 1).probability_in (freq_h
);
9120 scale_loop_frequencies (loop
, p
);
9123 edge exit_e
= single_exit (loop
);
9124 exit_e
->probability
= profile_probability::always ()
9125 .apply_scale (1, new_est_niter
+ 1);
9127 edge exit_l
= single_pred_edge (loop
->latch
);
9128 profile_probability prob
= exit_l
->probability
;
9129 exit_l
->probability
= exit_e
->probability
.invert ();
9130 if (prob
.initialized_p () && exit_l
->probability
.initialized_p ())
9131 scale_bbs_frequencies (&loop
->latch
, 1, exit_l
->probability
/ prob
);
9134 /* For a vectorized stmt DEF_STMT_INFO adjust all vectorized PHI
9135 latch edge values originally defined by it. */
/* NOTE(review): some original source lines (return type, braces, a return
   statement) are missing from this extracted view.  */
9138 maybe_set_vectorized_backedge_value (loop_vec_info loop_vinfo
,
9139 stmt_vec_info def_stmt_info
)
9141 tree def
= gimple_get_lhs (vect_orig_stmt (def_stmt_info
)->stmt
);
/* Only SSA_NAME defs can appear as PHI backedge arguments.  */
9142 if (!def
|| TREE_CODE (def
) != SSA_NAME
)
9144 stmt_vec_info phi_info
;
9145 imm_use_iterator iter
;
9146 use_operand_p use_p
;
/* Walk all immediate uses of DEF looking for loop-header PHIs of
   vectorizable cycles (inductions/reductions) that take DEF on the
   latch edge.  */
9147 FOR_EACH_IMM_USE_FAST (use_p
, iter
, def
)
9148 if (gphi
*phi
= dyn_cast
<gphi
*> (USE_STMT (use_p
)))
9149 if (gimple_bb (phi
)->loop_father
->header
== gimple_bb (phi
)
9150 && (phi_info
= loop_vinfo
->lookup_stmt (phi
))
9151 && VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (phi_info
))
9152 && STMT_VINFO_REDUC_TYPE (phi_info
) != FOLD_LEFT_REDUCTION
9153 && STMT_VINFO_REDUC_TYPE (phi_info
) != EXTRACT_LAST_REDUCTION
)
9155 loop_p loop
= gimple_bb (phi
)->loop_father
;
9156 edge e
= loop_latch_edge (loop
);
9157 if (PHI_ARG_DEF_FROM_EDGE (phi
, e
) == def
)
/* Add the vectorized latch definitions as backedge arguments of the
   corresponding vectorized PHIs, one per copy.  */
9159 vec
<gimple
*> &phi_defs
= STMT_VINFO_VEC_STMTS (phi_info
);
9160 vec
<gimple
*> &latch_defs
= STMT_VINFO_VEC_STMTS (def_stmt_info
);
9161 gcc_assert (phi_defs
.length () == latch_defs
.length ());
9162 for (unsigned i
= 0; i
< phi_defs
.length (); ++i
)
9163 add_phi_arg (as_a
<gphi
*> (phi_defs
[i
]),
9164 gimple_get_lhs (latch_defs
[i
]), e
,
9165 gimple_phi_arg_location (phi
, e
->dest_idx
));
9170 /* Vectorize STMT_INFO if relevant, inserting any new instructions before GSI.
9171 When vectorizing STMT_INFO as a store, set *SEEN_STORE to its
   stmt_vec_info.  */
/* NOTE(review): some original source lines (return type, braces, early
   returns) are missing from this extracted view.  */
9175 vect_transform_loop_stmt (loop_vec_info loop_vinfo
, stmt_vec_info stmt_info
,
9176 gimple_stmt_iterator
*gsi
, stmt_vec_info
*seen_store
)
9178 class loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
9179 poly_uint64 vf
= LOOP_VINFO_VECT_FACTOR (loop_vinfo
);
9181 if (dump_enabled_p ())
9182 dump_printf_loc (MSG_NOTE
, vect_location
,
9183 "------>vectorizing statement: %G", stmt_info
->stmt
);
/* Dead (non-live) statements need no debug binds after vectorization.  */
9185 if (MAY_HAVE_DEBUG_BIND_STMTS
&& !STMT_VINFO_LIVE_P (stmt_info
))
9186 vect_loop_kill_debug_uses (loop
, stmt_info
);
/* Nothing to do for statements that are neither relevant nor live.  */
9188 if (!STMT_VINFO_RELEVANT_P (stmt_info
)
9189 && !STMT_VINFO_LIVE_P (stmt_info
))
9192 if (STMT_VINFO_VECTYPE (stmt_info
))
9195 = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info
));
9196 if (!STMT_SLP_TYPE (stmt_info
)
9197 && maybe_ne (nunits
, vf
)
9198 && dump_enabled_p ())
9199 /* For SLP VF is set according to unrolling factor, and not
9200 to vector size, hence for SLP this print is not valid. */
9201 dump_printf_loc (MSG_NOTE
, vect_location
, "multiple-types.\n");
9204 /* Pure SLP statements have already been vectorized. We still need
9205 to apply loop vectorization to hybrid SLP statements. */
9206 if (PURE_SLP_STMT (stmt_info
))
9209 if (dump_enabled_p ())
9210 dump_printf_loc (MSG_NOTE
, vect_location
, "transform statement.\n");
/* vect_transform_stmt returns true for stores; record the store for the
   caller so it can remove interleaving chains afterwards.  */
9212 if (vect_transform_stmt (loop_vinfo
, stmt_info
, gsi
, NULL
, NULL
))
9213 *seen_store
= stmt_info
;
9218 /* Helper function to pass to simplify_replace_tree to enable replacing tree's
9219 in the hash_map with its corresponding values. */
/* CONTEXT is really a hash_map<tree, tree> *; return the mapped value for T
   if present, otherwise T itself unchanged.  */
9222 find_in_mapping (tree t
, void *context
)
9224 hash_map
<tree
,tree
>* mapping
= (hash_map
<tree
, tree
>*) context
;
9226 tree
*value
= mapping
->get (t
);
9227 return value
? *value
: t
;
9230 /* Update EPILOGUE's loop_vec_info. EPILOGUE was constructed as a copy of the
9231 original loop that has now been vectorized.
9233 The inits of the data_references need to be advanced with the number of
9234 iterations of the main loop. This has been computed in vect_do_peeling and
9235 is stored in parameter ADVANCE. We first restore the data_references
9236 initial offset with the values recorded in ORIG_DRS_INIT.
9238 Since the loop_vec_info of this EPILOGUE was constructed for the original
9239 loop, its stmt_vec_infos all point to the original statements. These need
9240 to be updated to point to their corresponding copies as well as the SSA_NAMES
9241 in their PATTERN_DEF_SEQs and RELATED_STMTs.
9243 The data_reference's connections also need to be updated. Their
9244 corresponding dr_vec_info need to be reconnected to the EPILOGUE's
9245 stmt_vec_infos, their statements need to point to their corresponding copy,
9246 if they are gather loads or scatter stores then their reference needs to be
9247 updated to point to its corresponding copy and finally we set
9248 'base_misaligned' to false as we have already peeled for alignment in the
9249 prologue of the main loop. */
/* NOTE(review): some original source lines (return type, braces, a few
   statements) are missing from this extracted view.  */
9252 update_epilogue_loop_vinfo (class loop
*epilogue
, tree advance
)
9254 loop_vec_info epilogue_vinfo
= loop_vec_info_for_loop (epilogue
);
9255 auto_vec
<gimple
*> stmt_worklist
;
9256 hash_map
<tree
,tree
> mapping
;
9257 gimple
*orig_stmt
, *new_stmt
;
9258 gimple_stmt_iterator epilogue_gsi
;
9259 gphi_iterator epilogue_phi_gsi
;
9260 stmt_vec_info stmt_vinfo
= NULL
, related_vinfo
;
9261 basic_block
*epilogue_bbs
= get_loop_body (epilogue
);
/* Replace the stale BB array (pointing at the original loop) with the
   epilogue's own body.  */
9264 free (LOOP_VINFO_BBS (epilogue_vinfo
));
9265 LOOP_VINFO_BBS (epilogue_vinfo
) = epilogue_bbs
;
9267 /* Advance data_reference's with the number of iterations of the previous
9268 loop and its prologue. */
9269 vect_update_inits_of_drs (epilogue_vinfo
, advance
, PLUS_EXPR
);
9272 /* The EPILOGUE loop is a copy of the original loop so they share the same
9273 gimple UIDs. In this loop we update the loop_vec_info of the EPILOGUE to
9274 point to the copied statements. We also create a mapping of all LHS' in
9275 the original loop and all the LHS' in the EPILOGUE and create worklists to
9276 update the STMT_VINFO_PATTERN_DEF_SEQs and STMT_VINFO_RELATED_STMTs. */
9277 for (unsigned i
= 0; i
< epilogue
->num_nodes
; ++i
)
/* First pass over the PHIs of the block.  */
9279 for (epilogue_phi_gsi
= gsi_start_phis (epilogue_bbs
[i
]);
9280 !gsi_end_p (epilogue_phi_gsi
); gsi_next (&epilogue_phi_gsi
))
9282 new_stmt
= epilogue_phi_gsi
.phi ();
9284 gcc_assert (gimple_uid (new_stmt
) > 0);
9286 = epilogue_vinfo
->stmt_vec_infos
[gimple_uid (new_stmt
) - 1];
9288 orig_stmt
= STMT_VINFO_STMT (stmt_vinfo
);
9289 STMT_VINFO_STMT (stmt_vinfo
) = new_stmt
;
/* Record original-result -> copied-result so later SSA uses can be
   remapped.  */
9291 mapping
.put (gimple_phi_result (orig_stmt
),
9292 gimple_phi_result (new_stmt
));
9293 /* PHI nodes can not have patterns or related statements. */
9294 gcc_assert (STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo
) == NULL
9295 && STMT_VINFO_RELATED_STMT (stmt_vinfo
) == NULL
);
/* Second pass over the ordinary statements of the block.  */
9298 for (epilogue_gsi
= gsi_start_bb (epilogue_bbs
[i
]);
9299 !gsi_end_p (epilogue_gsi
); gsi_next (&epilogue_gsi
))
9301 new_stmt
= gsi_stmt (epilogue_gsi
);
9302 if (is_gimple_debug (new_stmt
))
9305 gcc_assert (gimple_uid (new_stmt
) > 0);
9307 = epilogue_vinfo
->stmt_vec_infos
[gimple_uid (new_stmt
) - 1];
9309 orig_stmt
= STMT_VINFO_STMT (stmt_vinfo
);
9310 STMT_VINFO_STMT (stmt_vinfo
) = new_stmt
;
9312 if (tree old_lhs
= gimple_get_lhs (orig_stmt
))
9313 mapping
.put (old_lhs
, gimple_get_lhs (new_stmt
));
/* Queue pattern definition sequences for SSA remapping below.  */
9315 if (STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo
))
9317 gimple_seq seq
= STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo
);
9318 for (gimple_stmt_iterator gsi
= gsi_start (seq
);
9319 !gsi_end_p (gsi
); gsi_next (&gsi
))
9320 stmt_worklist
.safe_push (gsi_stmt (gsi
));
/* Likewise queue the related (pattern main) statement.  */
9323 related_vinfo
= STMT_VINFO_RELATED_STMT (stmt_vinfo
);
9324 if (related_vinfo
!= NULL
&& related_vinfo
!= stmt_vinfo
)
9326 gimple
*stmt
= STMT_VINFO_STMT (related_vinfo
);
9327 stmt_worklist
.safe_push (stmt
);
9328 /* Set BB such that the assert in
9329 'get_initial_def_for_reduction' is able to determine that
9330 the BB of the related stmt is inside this loop. */
9331 gimple_set_bb (stmt
,
9332 gimple_bb (new_stmt
));
9333 related_vinfo
= STMT_VINFO_RELATED_STMT (related_vinfo
);
9334 gcc_assert (related_vinfo
== NULL
9335 || related_vinfo
== stmt_vinfo
);
9340 /* The PATTERN_DEF_SEQs and RELATED_STMTs in the epilogue were constructed
9341 using the original main loop and thus need to be updated to refer to the
9342 cloned variables used in the epilogue. */
9343 for (unsigned i
= 0; i
< stmt_worklist
.length (); ++i
)
9345 gimple
*stmt
= stmt_worklist
[i
];
/* Remap every operand (operand 0 is the code/fn, hence j starts at 1).  */
9348 for (unsigned j
= 1; j
< gimple_num_ops (stmt
); ++j
)
9350 tree op
= gimple_op (stmt
, j
);
9351 if ((new_op
= mapping
.get(op
)))
9352 gimple_set_op (stmt
, j
, *new_op
);
9355 /* PR92429: The last argument of simplify_replace_tree disables
9356 folding when replacing arguments. This is required as
9357 otherwise you might end up with different statements than the
9358 ones analyzed in vect_loop_analyze, leading to different
   vectorization.  */
9360 op
= simplify_replace_tree (op
, NULL_TREE
, NULL_TREE
,
9361 &find_in_mapping
, &mapping
, false);
9362 gimple_set_op (stmt
, j
, op
);
/* Reconnect the data references to the epilogue's statements.  */
9367 struct data_reference
*dr
;
9368 vec
<data_reference_p
> datarefs
= LOOP_VINFO_DATAREFS (epilogue_vinfo
);
9369 FOR_EACH_VEC_ELT (datarefs
, i
, dr
)
9371 orig_stmt
= DR_STMT (dr
);
9372 gcc_assert (gimple_uid (orig_stmt
) > 0);
9373 stmt_vinfo
= epilogue_vinfo
->stmt_vec_infos
[gimple_uid (orig_stmt
) - 1];
9374 /* Data references for gather loads and scatter stores do not use the
9375 updated offset we set using ADVANCE. Instead we have to make sure the
9376 reference in the data references point to the corresponding copy of
9377 the original in the epilogue. */
9378 if (STMT_VINFO_MEMORY_ACCESS_TYPE (vect_stmt_to_vectorize (stmt_vinfo
))
9379 == VMAT_GATHER_SCATTER
)
9382 = simplify_replace_tree (DR_REF (dr
), NULL_TREE
, NULL_TREE
,
9383 &find_in_mapping
, &mapping
);
9384 DR_BASE_ADDRESS (dr
)
9385 = simplify_replace_tree (DR_BASE_ADDRESS (dr
), NULL_TREE
, NULL_TREE
,
9386 &find_in_mapping
, &mapping
);
9388 DR_STMT (dr
) = STMT_VINFO_STMT (stmt_vinfo
);
9389 stmt_vinfo
->dr_aux
.stmt
= stmt_vinfo
;
9390 /* The vector size of the epilogue is smaller than that of the main loop
9391 so the alignment is either the same or lower. This means the dr will
9392 thus by definition be aligned. */
9393 STMT_VINFO_DR_INFO (stmt_vinfo
)->base_misaligned
= false;
/* The cached datarefs copy is stale now; rebuild it.  */
9396 epilogue_vinfo
->shared
->datarefs_copy
.release ();
9397 epilogue_vinfo
->shared
->save_datarefs ();
9400 /* Function vect_transform_loop.
9402 The analysis phase has determined that the loop is vectorizable.
9403 Vectorize the loop - created vectorized stmts to replace the scalar
9404 stmts in the loop, and update the loop exit condition.
9405 Returns scalar epilogue loop if any. */
/* NOTE(review): a number of original source lines (return type, braces,
   several statements and closing parts of comments) are missing from this
   extracted view; all visible tokens are kept byte-identical.  */
9408 vect_transform_loop (loop_vec_info loop_vinfo
, gimple
*loop_vectorized_call
)
9410 class loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
9411 class loop
*epilogue
= NULL
;
9412 basic_block
*bbs
= LOOP_VINFO_BBS (loop_vinfo
);
9413 int nbbs
= loop
->num_nodes
;
9415 tree niters_vector
= NULL_TREE
;
9416 tree step_vector
= NULL_TREE
;
9417 tree niters_vector_mult_vf
= NULL_TREE
;
9418 poly_uint64 vf
= LOOP_VINFO_VECT_FACTOR (loop_vinfo
);
9419 unsigned int lowest_vf
= constant_lower_bound (vf
);
9421 bool check_profitability
= false;
9424 DUMP_VECT_SCOPE ("vec_transform_loop");
9426 loop_vinfo
->shared
->check_datarefs ();
9428 /* Use the more conservative vectorization threshold. If the number
9429 of iterations is constant assume the cost check has been performed
9430 by our caller. If the threshold makes all loops profitable that
9431 run at least the (estimated) vectorization factor number of times
9432 checking is pointless, too. */
9433 th
= LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo
);
9434 if (vect_apply_runtime_profitability_check_p (loop_vinfo
))
9436 if (dump_enabled_p ())
9437 dump_printf_loc (MSG_NOTE
, vect_location
,
9438 "Profitability threshold is %d loop iterations.\n",
9440 check_profitability
= true;
9443 /* Make sure there exists a single-predecessor exit bb. Do this before
   versioning.  */
9445 edge e
= single_exit (loop
);
9446 if (! single_pred_p (e
->dest
))
9448 split_loop_exit_edge (e
, true);
9449 if (dump_enabled_p ())
9450 dump_printf (MSG_NOTE
, "split exit edge\n");
9453 /* Version the loop first, if required, so the profitability check
   comes first.  */
9456 if (LOOP_REQUIRES_VERSIONING (loop_vinfo
))
9459 = vect_loop_versioning (loop_vinfo
, loop_vectorized_call
);
9460 sloop
->force_vectorize
= false;
9461 check_profitability
= false;
9464 /* Make sure there exists a single-predecessor exit bb also on the
9465 scalar loop copy. Do this after versioning but before peeling
9466 so CFG structure is fine for both scalar and if-converted loop
9467 to make slpeel_duplicate_current_defs_from_edges face matched
9468 loop closed PHI nodes on the exit. */
9469 if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo
))
9471 e
= single_exit (LOOP_VINFO_SCALAR_LOOP (loop_vinfo
));
9472 if (! single_pred_p (e
->dest
))
9474 split_loop_exit_edge (e
, true);
9475 if (dump_enabled_p ())
9476 dump_printf (MSG_NOTE
, "split exit edge of scalar loop\n");
/* Compute the number of iterations and peel prologue/epilogue loops.  */
9480 tree niters
= vect_build_loop_niters (loop_vinfo
);
9481 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo
) = niters
;
9482 tree nitersm1
= unshare_expr (LOOP_VINFO_NITERSM1 (loop_vinfo
));
9483 bool niters_no_overflow
= loop_niters_no_overflow (loop_vinfo
);
9485 drs_init_vec orig_drs_init
;
9487 epilogue
= vect_do_peeling (loop_vinfo
, niters
, nitersm1
, &niters_vector
,
9488 &step_vector
, &niters_vector_mult_vf
, th
,
9489 check_profitability
, niters_no_overflow
,
9492 if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo
)
9493 && LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo
).initialized_p ())
9494 scale_loop_frequencies (LOOP_VINFO_SCALAR_LOOP (loop_vinfo
),
9495 LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo
));
/* If peeling did not produce the vector iteration count, compute it.  */
9497 if (niters_vector
== NULL_TREE
)
9499 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo
)
9500 && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo
)
9501 && known_eq (lowest_vf
, vf
))
9504 = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo
)),
9505 LOOP_VINFO_INT_NITERS (loop_vinfo
) / lowest_vf
);
9506 step_vector
= build_one_cst (TREE_TYPE (niters
));
9508 else if (vect_use_loop_mask_for_alignment_p (loop_vinfo
))
9509 vect_gen_vector_loop_niters (loop_vinfo
, niters
, &niters_vector
,
9510 &step_vector
, niters_no_overflow
);
9512 /* vect_do_peeling subtracted the number of peeled prologue
9513 iterations from LOOP_VINFO_NITERS. */
9514 vect_gen_vector_loop_niters (loop_vinfo
, LOOP_VINFO_NITERS (loop_vinfo
),
9515 &niters_vector
, &step_vector
,
9516 niters_no_overflow
);
9519 /* 1) Make sure the loop header has exactly two entries
9520 2) Make sure we have a preheader basic block. */
9522 gcc_assert (EDGE_COUNT (loop
->header
->preds
) == 2);
9524 split_edge (loop_preheader_edge (loop
));
9526 if (vect_use_loop_mask_for_alignment_p (loop_vinfo
))
9527 /* This will deal with any possible peeling. */
9528 vect_prepare_for_masked_peels (loop_vinfo
);
9530 /* Schedule the SLP instances first, then handle loop vectorization
   aspects.  */
9532 if (!loop_vinfo
->slp_instances
.is_empty ())
9534 DUMP_VECT_SCOPE ("scheduling SLP instances");
9535 vect_schedule_slp (loop_vinfo
, LOOP_VINFO_SLP_INSTANCES (loop_vinfo
));
9538 /* FORNOW: the vectorizer supports only loops which body consist
9539 of one basic block (header + empty latch). When the vectorizer will
9540 support more involved loop forms, the order by which the BBs are
9541 traversed need to be reconsidered. */
9543 for (i
= 0; i
< nbbs
; i
++)
9545 basic_block bb
= bbs
[i
];
9546 stmt_vec_info stmt_info
;
/* First transform relevant PHIs (inductions, reductions, cycles).  */
9548 for (gphi_iterator si
= gsi_start_phis (bb
); !gsi_end_p (si
);
9551 gphi
*phi
= si
.phi ();
9552 if (dump_enabled_p ())
9553 dump_printf_loc (MSG_NOTE
, vect_location
,
9554 "------>vectorizing phi: %G", phi
);
9555 stmt_info
= loop_vinfo
->lookup_stmt (phi
);
9559 if (MAY_HAVE_DEBUG_BIND_STMTS
&& !STMT_VINFO_LIVE_P (stmt_info
))
9560 vect_loop_kill_debug_uses (loop
, stmt_info
);
9562 if (!STMT_VINFO_RELEVANT_P (stmt_info
)
9563 && !STMT_VINFO_LIVE_P (stmt_info
))
9566 if (STMT_VINFO_VECTYPE (stmt_info
)
9568 (TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info
)), vf
))
9569 && dump_enabled_p ())
9570 dump_printf_loc (MSG_NOTE
, vect_location
, "multiple-types.\n");
9572 if ((STMT_VINFO_DEF_TYPE (stmt_info
) == vect_induction_def
9573 || STMT_VINFO_DEF_TYPE (stmt_info
) == vect_reduction_def
9574 || STMT_VINFO_DEF_TYPE (stmt_info
) == vect_double_reduction_def
9575 || STMT_VINFO_DEF_TYPE (stmt_info
) == vect_nested_cycle
9576 || STMT_VINFO_DEF_TYPE (stmt_info
) == vect_internal_def
)
9577 && ! PURE_SLP_STMT (stmt_info
))
9579 if (dump_enabled_p ())
9580 dump_printf_loc (MSG_NOTE
, vect_location
, "transform phi.\n");
9581 vect_transform_stmt (loop_vinfo
, stmt_info
, NULL
, NULL
, NULL
);
/* Second pass over the PHIs: fill in backedge arguments of the
   vectorized PHIs now that latch definitions exist.  */
9585 for (gphi_iterator si
= gsi_start_phis (bb
); !gsi_end_p (si
);
9588 gphi
*phi
= si
.phi ();
9589 stmt_info
= loop_vinfo
->lookup_stmt (phi
);
9593 if (!STMT_VINFO_RELEVANT_P (stmt_info
)
9594 && !STMT_VINFO_LIVE_P (stmt_info
))
9597 if ((STMT_VINFO_DEF_TYPE (stmt_info
) == vect_induction_def
9598 || STMT_VINFO_DEF_TYPE (stmt_info
) == vect_reduction_def
9599 || STMT_VINFO_DEF_TYPE (stmt_info
) == vect_double_reduction_def
9600 || STMT_VINFO_DEF_TYPE (stmt_info
) == vect_nested_cycle
9601 || STMT_VINFO_DEF_TYPE (stmt_info
) == vect_internal_def
)
9602 && ! PURE_SLP_STMT (stmt_info
))
9603 maybe_set_vectorized_backedge_value (loop_vinfo
, stmt_info
);
/* Now transform the ordinary statements of the block.  */
9606 for (gimple_stmt_iterator si
= gsi_start_bb (bb
);
9609 stmt
= gsi_stmt (si
);
9610 /* During vectorization remove existing clobber stmts. */
9611 if (gimple_clobber_p (stmt
))
9613 unlink_stmt_vdef (stmt
);
9614 gsi_remove (&si
, true);
9615 release_defs (stmt
);
9619 /* Ignore vector stmts created in the outer loop. */
9620 stmt_info
= loop_vinfo
->lookup_stmt (stmt
);
9622 /* vector stmts created in the outer-loop during vectorization of
9623 stmts in an inner-loop may not have a stmt_info, and do not
9624 need to be vectorized. */
9625 stmt_vec_info seen_store
= NULL
;
/* Pattern statements: transform the pattern def sequence and the
   pattern main statement instead of the original one.  */
9628 if (STMT_VINFO_IN_PATTERN_P (stmt_info
))
9630 gimple
*def_seq
= STMT_VINFO_PATTERN_DEF_SEQ (stmt_info
);
9631 for (gimple_stmt_iterator subsi
= gsi_start (def_seq
);
9632 !gsi_end_p (subsi
); gsi_next (&subsi
))
9634 stmt_vec_info pat_stmt_info
9635 = loop_vinfo
->lookup_stmt (gsi_stmt (subsi
));
9636 vect_transform_loop_stmt (loop_vinfo
, pat_stmt_info
,
9639 stmt_vec_info pat_stmt_info
9640 = STMT_VINFO_RELATED_STMT (stmt_info
);
9641 if (vect_transform_loop_stmt (loop_vinfo
, pat_stmt_info
,
9643 maybe_set_vectorized_backedge_value (loop_vinfo
,
9648 if (vect_transform_loop_stmt (loop_vinfo
, stmt_info
, &si
,
9650 maybe_set_vectorized_backedge_value (loop_vinfo
,
/* After a vectorized store, clean up its interleaving chain.  */
9657 if (STMT_VINFO_GROUPED_ACCESS (seen_store
))
9658 /* Interleaving. If IS_STORE is TRUE, the
9659 vectorization of the interleaving chain was
9660 completed - free all the stores in the chain. */
9661 vect_remove_stores (loop_vinfo
,
9662 DR_GROUP_FIRST_ELEMENT (seen_store
));
9664 /* Free the attached stmt_vec_info and remove the stmt. */
9665 loop_vinfo
->remove_stmt (stmt_info
);
9670 /* Stub out scalar statements that must not survive vectorization.
9671 Doing this here helps with grouped statements, or statements that
9672 are involved in patterns. */
9673 for (gimple_stmt_iterator gsi
= gsi_start_bb (bb
);
9674 !gsi_end_p (gsi
); gsi_next (&gsi
))
9676 gcall
*call
= dyn_cast
<gcall
*> (gsi_stmt (gsi
));
9677 if (call
&& gimple_call_internal_p (call
, IFN_MASK_LOAD
))
9679 tree lhs
= gimple_get_lhs (call
);
9680 if (!VECTOR_TYPE_P (TREE_TYPE (lhs
)))
/* Replace a leftover scalar MASK_LOAD with 'lhs = 0'.  */
9682 tree zero
= build_zero_cst (TREE_TYPE (lhs
));
9683 gimple
*new_stmt
= gimple_build_assign (lhs
, zero
);
9684 gsi_replace (&gsi
, new_stmt
, true);
9690 /* The vectorization factor is always > 1, so if we use an IV increment of 1,
9691 a zero NITERS becomes a nonzero NITERS_VECTOR. */
9692 if (integer_onep (step_vector
))
9693 niters_no_overflow
= true;
9694 vect_set_loop_condition (loop
, loop_vinfo
, niters_vector
, step_vector
,
9695 niters_vector_mult_vf
, !niters_no_overflow
);
/* Rescale the profile to reflect the reduced iteration count.  */
9697 unsigned int assumed_vf
= vect_vf_for_cost (loop_vinfo
);
9698 scale_profile_for_vect_loop (loop
, assumed_vf
);
9700 /* True if the final iteration might not handle a full vector's
9701 worth of scalar iterations. */
9702 bool final_iter_may_be_partial
9703 = LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo
);
9704 /* The minimum number of iterations performed by the epilogue. This
9705 is 1 when peeling for gaps because we always need a final scalar
   iteration.  */
9707 int min_epilogue_iters
= LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo
) ? 1 : 0;
9708 /* +1 to convert latch counts to loop iteration counts,
9709 -min_epilogue_iters to remove iterations that cannot be performed
9710 by the vector code. */
9711 int bias_for_lowest
= 1 - min_epilogue_iters
;
9712 int bias_for_assumed
= bias_for_lowest
;
9713 int alignment_npeels
= LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo
);
9714 if (alignment_npeels
&& LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo
))
9716 /* When the amount of peeling is known at compile time, the first
9717 iteration will have exactly alignment_npeels active elements.
9718 In the worst case it will have at least one. */
9719 int min_first_active
= (alignment_npeels
> 0 ? alignment_npeels
: 1);
9720 bias_for_lowest
+= lowest_vf
- min_first_active
;
9721 bias_for_assumed
+= assumed_vf
- min_first_active
;
9723 /* In these calculations the "- 1" converts loop iteration counts
9724 back to latch counts. */
9725 if (loop
->any_upper_bound
)
9726 loop
->nb_iterations_upper_bound
9727 = (final_iter_may_be_partial
9728 ? wi::udiv_ceil (loop
->nb_iterations_upper_bound
+ bias_for_lowest
,
9730 : wi::udiv_floor (loop
->nb_iterations_upper_bound
+ bias_for_lowest
,
9732 if (loop
->any_likely_upper_bound
)
9733 loop
->nb_iterations_likely_upper_bound
9734 = (final_iter_may_be_partial
9735 ? wi::udiv_ceil (loop
->nb_iterations_likely_upper_bound
9736 + bias_for_lowest
, lowest_vf
) - 1
9737 : wi::udiv_floor (loop
->nb_iterations_likely_upper_bound
9738 + bias_for_lowest
, lowest_vf
) - 1);
9739 if (loop
->any_estimate
)
9740 loop
->nb_iterations_estimate
9741 = (final_iter_may_be_partial
9742 ? wi::udiv_ceil (loop
->nb_iterations_estimate
+ bias_for_assumed
,
9744 : wi::udiv_floor (loop
->nb_iterations_estimate
+ bias_for_assumed
,
9747 if (dump_enabled_p ())
9749 if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo
))
9751 dump_printf_loc (MSG_NOTE
, vect_location
,
9752 "LOOP VECTORIZED\n");
9754 dump_printf_loc (MSG_NOTE
, vect_location
,
9755 "OUTER LOOP VECTORIZED\n");
9756 dump_printf (MSG_NOTE
, "\n");
9759 dump_printf_loc (MSG_NOTE
, vect_location
,
9760 "LOOP EPILOGUE VECTORIZED (MODE=%s)\n",
9761 GET_MODE_NAME (loop_vinfo
->vector_mode
));
9764 /* Loops vectorized with a variable factor won't benefit from
9765 unrolling/peeling. */
9766 if (!vf
.is_constant ())
9769 if (dump_enabled_p ())
9770 dump_printf_loc (MSG_NOTE
, vect_location
, "Disabling unrolling due to"
9771 " variable-length vectorization factor\n");
9773 /* Free SLP instances here because otherwise stmt reference counting
   won't work.  */
9775 slp_instance instance
;
9776 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo
), i
, instance
)
9777 vect_free_slp_instance (instance
);
9778 LOOP_VINFO_SLP_INSTANCES (loop_vinfo
).release ();
9779 /* Clear-up safelen field since its value is invalid after vectorization
9780 since vectorized loop can have loop-carried dependencies. */
/* Prepare the epilogue loop (if any) for its own vectorization attempt.  */
9785 update_epilogue_loop_vinfo (epilogue
, advance
);
9787 epilogue
->simduid
= loop
->simduid
;
9788 epilogue
->force_vectorize
= loop
->force_vectorize
;
9789 epilogue
->dont_vectorize
= false;
9795 /* The code below is trying to perform simple optimization - revert
9796 if-conversion for masked stores, i.e. if the mask of a store is zero
9797 do not perform it and all stored value producers also if possible.
9805 this transformation will produce the following semi-hammock:
9807 if (!mask__ifc__42.18_165 == { 0, 0, 0, 0, 0, 0, 0, 0 })
9809 vect__11.19_170 = MASK_LOAD (vectp_p1.20_168, 0B, mask__ifc__42.18_165);
9810 vect__12.22_172 = vect__11.19_170 + vect_cst__171;
9811 MASK_STORE (vectp_p1.23_175, 0B, mask__ifc__42.18_165, vect__12.22_172);
9812 vect__18.25_182 = MASK_LOAD (vectp_p3.26_180, 0B, mask__ifc__42.18_165);
9813 vect__19.28_184 = vect__18.25_182 + vect_cst__183;
9814 MASK_STORE (vectp_p2.29_187, 0B, mask__ifc__42.18_165, vect__19.28_184);
   */
/* NOTE(review): some original source lines (return type, braces, several
   statements) are missing from this extracted view.  */
9819 optimize_mask_stores (class loop
*loop
)
9821 basic_block
*bbs
= get_loop_body (loop
);
9822 unsigned nbbs
= loop
->num_nodes
;
9825 class loop
*bb_loop
;
9826 gimple_stmt_iterator gsi
;
9828 auto_vec
<gimple
*> worklist
;
9829 auto_purge_vect_location sentinel
;
9831 vect_location
= find_loop_location (loop
);
9832 /* Pick up all masked stores in loop if any. */
9833 for (i
= 0; i
< nbbs
; i
++)
9836 for (gsi
= gsi_start_bb (bb
); !gsi_end_p (gsi
);
9839 stmt
= gsi_stmt (gsi
);
9840 if (gimple_call_internal_p (stmt
, IFN_MASK_STORE
))
9841 worklist
.safe_push (stmt
);
9846 if (worklist
.is_empty ())
9849 /* Loop has masked stores. */
9850 while (!worklist
.is_empty ())
9852 gimple
*last
, *last_store
;
9855 basic_block store_bb
, join_bb
;
9856 gimple_stmt_iterator gsi_to
;
9857 tree vdef
, new_vdef
;
/* MASK is argument 2 of the IFN_MASK_STORE internal call.  */
9862 last
= worklist
.pop ();
9863 mask
= gimple_call_arg (last
, 2);
9864 bb
= gimple_bb (last
);
9865 /* Create then_bb and if-then structure in CFG, then_bb belongs to
9866 the same loop as if_bb. It could be different to LOOP when two
9867 level loop-nest is vectorized and mask_store belongs to the inner
   one.  */
9869 e
= split_block (bb
, last
);
9870 bb_loop
= bb
->loop_father
;
9871 gcc_assert (loop
== bb_loop
|| flow_loop_nested_p (loop
, bb_loop
));
9873 store_bb
= create_empty_bb (bb
);
9874 add_bb_to_loop (store_bb
, bb_loop
);
9875 e
->flags
= EDGE_TRUE_VALUE
;
9876 efalse
= make_edge (bb
, store_bb
, EDGE_FALSE_VALUE
);
9877 /* Put STORE_BB to likely part. */
9878 efalse
->probability
= profile_probability::unlikely ();
9879 store_bb
->count
= efalse
->count ();
9880 make_single_succ_edge (store_bb
, join_bb
, EDGE_FALLTHRU
);
9881 if (dom_info_available_p (CDI_DOMINATORS
))
9882 set_immediate_dominator (CDI_DOMINATORS
, store_bb
, bb
);
9883 if (dump_enabled_p ())
9884 dump_printf_loc (MSG_NOTE
, vect_location
,
9885 "Create new block %d to sink mask stores.",
9887 /* Create vector comparison with boolean result. */
9888 vectype
= TREE_TYPE (mask
);
9889 zero
= build_zero_cst (vectype
);
/* Guard: skip STORE_BB entirely when the whole mask is zero.  */
9890 stmt
= gimple_build_cond (EQ_EXPR
, mask
, zero
, NULL_TREE
, NULL_TREE
);
9891 gsi
= gsi_last_bb (bb
);
9892 gsi_insert_after (&gsi
, stmt
, GSI_SAME_STMT
);
9893 /* Create new PHI node for vdef of the last masked store:
9894 .MEM_2 = VDEF <.MEM_1>
9895 will be converted to
9896 .MEM.3 = VDEF <.MEM_1>
9897 and new PHI node will be created in join bb
9898 .MEM_2 = PHI <.MEM_1, .MEM_3>
   */
9900 vdef
= gimple_vdef (last
);
9901 new_vdef
= make_ssa_name (gimple_vop (cfun
), last
);
9902 gimple_set_vdef (last
, new_vdef
);
9903 phi
= create_phi_node (vdef
, join_bb
);
9904 add_phi_arg (phi
, new_vdef
, EDGE_SUCC (store_bb
, 0), UNKNOWN_LOCATION
);
9906 /* Put all masked stores with the same mask to STORE_BB if possible. */
9909 gimple_stmt_iterator gsi_from
;
9910 gimple
*stmt1
= NULL
;
9912 /* Move masked store to STORE_BB. */
9914 gsi
= gsi_for_stmt (last
);
9916 /* Shift GSI to the previous stmt for further traversal. */
9918 gsi_to
= gsi_start_bb (store_bb
);
9919 gsi_move_before (&gsi_from
, &gsi_to
);
9920 /* Setup GSI_TO to the non-empty block start. */
9921 gsi_to
= gsi_start_bb (store_bb
);
9922 if (dump_enabled_p ())
9923 dump_printf_loc (MSG_NOTE
, vect_location
,
9924 "Move stmt to created bb\n%G", last
);
9925 /* Move all stored value producers if possible. */
9926 while (!gsi_end_p (gsi
))
9929 imm_use_iterator imm_iter
;
9930 use_operand_p use_p
;
9933 /* Skip debug statements. */
9934 if (is_gimple_debug (gsi_stmt (gsi
)))
9939 stmt1
= gsi_stmt (gsi
);
9940 /* Do not consider statements writing to memory or having
9941 volatile operand. */
9942 if (gimple_vdef (stmt1
)
9943 || gimple_has_volatile_ops (stmt1
))
9947 lhs
= gimple_get_lhs (stmt1
);
9951 /* LHS of vectorized stmt must be SSA_NAME. */
9952 if (TREE_CODE (lhs
) != SSA_NAME
)
9955 if (!VECTOR_TYPE_P (TREE_TYPE (lhs
)))
9957 /* Remove dead scalar statement. */
9958 if (has_zero_uses (lhs
))
9960 gsi_remove (&gsi_from
, true);
9965 /* Check that LHS does not have uses outside of STORE_BB. */
9967 FOR_EACH_IMM_USE_FAST (use_p
, imm_iter
, lhs
)
9970 use_stmt
= USE_STMT (use_p
);
9971 if (is_gimple_debug (use_stmt
))
9973 if (gimple_bb (use_stmt
) != store_bb
)
/* A producer reading memory may only move if it reads the same
   version the last moved store wrote.  */
9982 if (gimple_vuse (stmt1
)
9983 && gimple_vuse (stmt1
) != gimple_vuse (last_store
))
9986 /* Can move STMT1 to STORE_BB. */
9987 if (dump_enabled_p ())
9988 dump_printf_loc (MSG_NOTE
, vect_location
,
9989 "Move stmt to created bb\n%G", stmt1
);
9990 gsi_move_before (&gsi_from
, &gsi_to
);
9991 /* Shift GSI_TO for further insertion. */
9994 /* Put other masked stores with the same mask to STORE_BB. */
9995 if (worklist
.is_empty ()
9996 || gimple_call_arg (worklist
.last (), 2) != mask
9997 || worklist
.last () != stmt1
)
9999 last
= worklist
.pop ();
/* Connect the fall-through memory state on the skip edge.  */
10001 add_phi_arg (phi
, gimple_vuse (last_store
), e
, UNKNOWN_LOCATION
);
10005 /* Decide whether it is possible to use a zero-based induction variable
10006 when vectorizing LOOP_VINFO with partial vectors. If it is, return
10007 the value that the induction variable must be able to hold in order
10008 to ensure that the rgroups eventually have no active vector elements.
10009 Return -1 otherwise. */
/* NOTE(review): some original source lines (return type, braces, the final
   return statement) are missing from this extracted view.  */
10012 vect_iv_limit_for_partial_vectors (loop_vec_info loop_vinfo
)
10014 tree niters_skip
= LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo
);
10015 class loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
10016 unsigned HOST_WIDE_INT max_vf
= vect_max_vf (loop_vinfo
);
10018 /* Calculate the value that the induction variable must be able
10019 to hit in order to ensure that we end the loop with an all-false mask.
10020 This involves adding the maximum number of inactive trailing scalar
   iterations.  */
10022 widest_int iv_limit
= -1;
10023 if (max_loop_iterations (loop
, &iv_limit
))
10027 /* Add the maximum number of skipped iterations to the
10028 maximum iteration count. */
10029 if (TREE_CODE (niters_skip
) == INTEGER_CST
)
10030 iv_limit
+= wi::to_widest (niters_skip
);
10032 iv_limit
+= max_vf
- 1;
10034 else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo
))
10035 /* Make a conservatively-correct assumption. */
10036 iv_limit
+= max_vf
- 1;
10038 /* IV_LIMIT is the maximum number of latch iterations, which is also
10039 the maximum in-range IV value. Round this value down to the previous
10040 vector alignment boundary and then add an extra full iteration. */
10041 poly_uint64 vf
= LOOP_VINFO_VECT_FACTOR (loop_vinfo
);
10042 iv_limit
= (iv_limit
& -(int) known_alignment (vf
)) + max_vf
;
10047 /* For the given rgroup_controls RGC, check whether an induction variable
10048 would ever hit a value that produces a set of all-false masks or zero
10049 lengths before wrapping around. Return true if it's possible to wrap
10050 around before hitting the desirable value, otherwise return false. */
10053 vect_rgroup_iv_might_wrap_p (loop_vec_info loop_vinfo
, rgroup_controls
*rgc
)
10055 widest_int iv_limit
= vect_iv_limit_for_partial_vectors (loop_vinfo
);
10057 if (iv_limit
== -1)
10060 tree compare_type
= LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo
);
10061 unsigned int compare_precision
= TYPE_PRECISION (compare_type
);
10062 unsigned nitems
= rgc
->max_nscalars_per_iter
* rgc
->factor
;
10064 if (wi::min_precision (iv_limit
* nitems
, UNSIGNED
) > compare_precision
)