/* Loop Vectorization
   Copyright (C) 2003-2022 Free Software Foundation, Inc.
   Contributed by Dorit Naishlos <dorit@il.ibm.com> and
   Ira Rosen <irar@il.ibm.com>

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free
Software Foundation; either version 3, or (at your option) any later
version.

GCC is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received a copy of the GNU General Public License
along with GCC; see the file COPYING3.  If not see
<http://www.gnu.org/licenses/>.  */
#define INCLUDE_ALGORITHM
#include "coretypes.h"
#include "tree-pass.h"
#include "optabs-tree.h"
#include "diagnostic-core.h"
#include "fold-const.h"
#include "stor-layout.h"
#include "gimple-iterator.h"
#include "gimplify-me.h"
#include "tree-ssa-loop-ivopts.h"
#include "tree-ssa-loop-manip.h"
#include "tree-ssa-loop-niter.h"
#include "tree-ssa-loop.h"
#include "tree-scalar-evolution.h"
#include "tree-vectorizer.h"
#include "gimple-fold.h"
#include "tree-if-conv.h"
#include "internal-fn.h"
#include "tree-vector-builder.h"
#include "vec-perm-indices.h"
#include "case-cfn-macros.h"
/* Loop Vectorization Pass.

   This pass tries to vectorize loops.

   For example, the vectorizer transforms the following simple loop:

	short a[N]; short b[N]; short c[N]; int i;

	for (i=0; i<N; i++){
	  a[i] = b[i] + c[i];
	}

   as if it was manually vectorized by rewriting the source code into:

	typedef int __attribute__((mode(V8HI))) v8hi;
	short a[N]; short b[N]; short c[N]; int i;
	v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
	v8hi va, vb, vc;

	for (i=0; i<N/8; i++){
	  vb = pb[i];
	  vc = pc[i];
	  va = vb + vc;
	  pa[i] = va;
	}
	The main entry to this pass is vectorize_loops(), in which
   the vectorizer applies a set of analyses on a given set of loops,
   followed by the actual vectorization transformation for the loops that
   had successfully passed the analysis phase.
	Throughout this pass we make a distinction between two types of
   data: scalars (which are represented by SSA_NAMES), and memory references
   ("data-refs").  These two types of data require different handling both
   during analysis and transformation.  The types of data-refs that the
   vectorizer currently supports are ARRAY_REFS whose base is an array DECL
   (not a pointer), and INDIRECT_REFS through pointers; both array and pointer
   accesses are required to have a simple (consecutive) access pattern.
   Analysis phase:
   ===============
	The driver for the analysis phase is vect_analyze_loop().
   It applies a set of analyses, some of which rely on the scalar evolution
   analyzer (scev) developed by Sebastian Pop.

	During the analysis phase the vectorizer records some information
   per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
   loop, as well as general information about the loop as a whole, which is
   recorded in a "loop_vec_info" struct attached to each loop.
   Transformation phase:
   =====================
	The loop transformation phase scans all the stmts in the loop, and
   creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
   the loop that needs to be vectorized.  It inserts the vector code sequence
   just before the scalar stmt S, and records a pointer to the vector code
   in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
   attached to S).  This pointer will be used for the vectorization of
   following stmts which use the def of stmt S.  Stmt S is removed if it
   writes to memory; otherwise, we rely on dead code elimination for
   removing it.
	For example, say stmt S1 was vectorized into stmt VS1:

   VS1: vb = px[i];
   S1:	b = x[i];	STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
   S2:	a = b;

   To vectorize stmt S2, the vectorizer first finds the stmt that defines
   the operand 'b' (S1), and gets the relevant vector def 'vb' from the
   vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)).  The
   resulting sequence would be:

   VS1: vb = px[i];
   S1:	b = x[i];	STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
   VS2: va = vb;
   S2:	a = b;		STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2

	Operands that are not SSA_NAMEs, are data-refs that appear in
   load/store operations (like 'x[i]' in S1), and are handled differently.
   Target modeling:
   =================
	Currently the only target specific information that is used is the
   size of the vector (in bytes) - "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".
   Targets that can support different sizes of vectors, for now will need
   to specify one value for "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".  More
   flexibility will be added in the future.

	Since we only vectorize operations whose vector form can be
   expressed using existing tree codes, to verify that an operation is
   supported, the vectorizer checks the relevant optab at the relevant
   machine_mode (e.g, optab_handler (add_optab, V8HImode)).  If
   the value found is CODE_FOR_nothing, then there's no target support, and
   we can't vectorize the stmt.

   For additional information on this project see:
   http://gcc.gnu.org/projects/tree-ssa/vectorization.html
*/
static void vect_estimate_min_profitable_iters (loop_vec_info, int *, int *,
						unsigned *);
static stmt_vec_info vect_is_simple_reduction (loop_vec_info, stmt_vec_info,
					       bool *, bool *, bool);
/* Subroutine of vect_determine_vf_for_stmt that handles only one
   statement.  VECTYPE_MAYBE_SET_P is true if STMT_VINFO_VECTYPE
   may already be set for general statements (not just data refs).  */

static opt_result
vect_determine_vf_for_stmt_1 (vec_info *vinfo, stmt_vec_info stmt_info,
			      bool vectype_maybe_set_p,
			      poly_uint64 *vf)
{
  gimple *stmt = stmt_info->stmt;

  if ((!STMT_VINFO_RELEVANT_P (stmt_info)
       && !STMT_VINFO_LIVE_P (stmt_info))
      || gimple_clobber_p (stmt))
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location, "skip.\n");
      return opt_result::success ();
    }

  tree stmt_vectype, nunits_vectype;
  opt_result res = vect_get_vector_types_for_stmt (vinfo, stmt_info,
						    &stmt_vectype,
						    &nunits_vectype, *vf);
  if (!res)
    return res;

  if (stmt_vectype)
    {
      if (STMT_VINFO_VECTYPE (stmt_info))
	/* The only case when a vectype had been already set is for stmts
	   that contain a data ref, or for "pattern-stmts" (stmts generated
	   by the vectorizer to represent/replace a certain idiom).  */
	gcc_assert ((STMT_VINFO_DATA_REF (stmt_info)
		     || vectype_maybe_set_p)
		    && STMT_VINFO_VECTYPE (stmt_info) == stmt_vectype);
      else
	STMT_VINFO_VECTYPE (stmt_info) = stmt_vectype;
    }

  if (nunits_vectype)
    vect_update_max_nunits (vf, nunits_vectype);

  return opt_result::success ();
}
/* Subroutine of vect_determine_vectorization_factor.  Set the vector
   types of STMT_INFO and all attached pattern statements and update
   the vectorization factor VF accordingly.  Return true on success
   or false if something prevented vectorization.  */

static opt_result
vect_determine_vf_for_stmt (vec_info *vinfo,
			    stmt_vec_info stmt_info, poly_uint64 *vf)
{
  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location, "==> examining statement: %G",
		     stmt_info->stmt);
  opt_result res = vect_determine_vf_for_stmt_1 (vinfo, stmt_info, false, vf);
  if (!res)
    return res;

  if (STMT_VINFO_IN_PATTERN_P (stmt_info)
      && STMT_VINFO_RELATED_STMT (stmt_info))
    {
      gimple *pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
      stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);

      /* If a pattern statement has def stmts, analyze them too.  */
      for (gimple_stmt_iterator si = gsi_start (pattern_def_seq);
	   !gsi_end_p (si); gsi_next (&si))
	{
	  stmt_vec_info def_stmt_info = vinfo->lookup_stmt (gsi_stmt (si));
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_NOTE, vect_location,
			     "==> examining pattern def stmt: %G",
			     def_stmt_info->stmt);
	  res = vect_determine_vf_for_stmt_1 (vinfo, def_stmt_info, true, vf);
	  if (!res)
	    return res;
	}

      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location,
			 "==> examining pattern statement: %G",
			 stmt_info->stmt);
      res = vect_determine_vf_for_stmt_1 (vinfo, stmt_info, true, vf);
      if (!res)
	return res;
    }

  return opt_result::success ();
}
/* Function vect_determine_vectorization_factor

   Determine the vectorization factor (VF).  VF is the number of data elements
   that are operated upon in parallel in a single iteration of the vectorized
   loop.  For example, when vectorizing a loop that operates on 4byte elements,
   on a target with vector size (VS) 16byte, the VF is set to 4, since 4
   elements can fit in a single vector register.

   We currently support vectorization of loops in which all types operated upon
   are of the same size.  Therefore this function currently sets VF according to
   the size of the types operated upon, and fails if there are multiple sizes
   in the loop.

   VF is also the factor by which the loop iterations are strip-mined, e.g.:
   original loop:
	for (i=0; i<N; i++){
	  a[i] = b[i] + c[i];
	}

   vectorized loop:
	for (i=0; i<N; i+=VF){
	  a[i:VF] = b[i:VF] + c[i:VF];
	}
*/
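/* Illustrative sketch only (not part of this pass), assuming VF == 4 and a
   scalar epilogue rather than masking; the strip-mined form above roughly
   corresponds to:

	int i;
	for (i = 0; i + 4 <= N; i += 4)
	  a[i:4] = b[i:4] + c[i:4];	// one vector add per 4 elements
	for (; i < N; i++)		// scalar epilogue for the remainder
	  a[i] = b[i] + c[i];  */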
static opt_result
vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
{
  class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
  unsigned nbbs = loop->num_nodes;
  poly_uint64 vectorization_factor = 1;
  tree scalar_type = NULL_TREE;
  tree vectype;
  stmt_vec_info stmt_info;
  unsigned i;

  DUMP_VECT_SCOPE ("vect_determine_vectorization_factor");

  for (i = 0; i < nbbs; i++)
    {
      basic_block bb = bbs[i];

      for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
	   gsi_next (&si))
	{
	  gphi *phi = si.phi ();
	  stmt_info = loop_vinfo->lookup_stmt (phi);
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_NOTE, vect_location, "==> examining phi: %G",
			     (gimple *) phi);

	  gcc_assert (stmt_info);

	  if (STMT_VINFO_RELEVANT_P (stmt_info)
	      || STMT_VINFO_LIVE_P (stmt_info))
	    {
	      gcc_assert (!STMT_VINFO_VECTYPE (stmt_info));
	      scalar_type = TREE_TYPE (PHI_RESULT (phi));

	      if (dump_enabled_p ())
		dump_printf_loc (MSG_NOTE, vect_location,
				 "get vectype for scalar type: %T\n",
				 scalar_type);

	      vectype = get_vectype_for_scalar_type (loop_vinfo, scalar_type);
	      if (!vectype)
		return opt_result::failure_at (phi,
					       "not vectorized: unsupported "
					       "data-type %T\n",
					       scalar_type);
	      STMT_VINFO_VECTYPE (stmt_info) = vectype;

	      if (dump_enabled_p ())
		dump_printf_loc (MSG_NOTE, vect_location, "vectype: %T\n",
				 vectype);

	      if (dump_enabled_p ())
		{
		  dump_printf_loc (MSG_NOTE, vect_location, "nunits = ");
		  dump_dec (MSG_NOTE, TYPE_VECTOR_SUBPARTS (vectype));
		  dump_printf (MSG_NOTE, "\n");
		}

	      vect_update_max_nunits (&vectorization_factor, vectype);
	    }
	}

      for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
	   gsi_next (&si))
	{
	  if (is_gimple_debug (gsi_stmt (si)))
	    continue;
	  stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
	  opt_result res
	    = vect_determine_vf_for_stmt (loop_vinfo,
					  stmt_info, &vectorization_factor);
	  if (!res)
	    return res;
	}
    }

  /* TODO: Analyze cost.  Decide if worth while to vectorize.  */
  if (dump_enabled_p ())
    {
      dump_printf_loc (MSG_NOTE, vect_location, "vectorization factor = ");
      dump_dec (MSG_NOTE, vectorization_factor);
      dump_printf (MSG_NOTE, "\n");
    }

  if (known_le (vectorization_factor, 1U))
    return opt_result::failure_at (vect_location,
				   "not vectorized: unsupported data-type\n");
  LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
  return opt_result::success ();
}
/* Function vect_is_simple_iv_evolution.

   FORNOW: A simple evolution of an induction variables in the loop is
   considered a polynomial evolution.  */
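/* For illustration only: given

	for (i = 0; i < n; i++)
	  p = p + 4;

   scev describes 'p' as the polynomial chrec {p_0, +, 4}_loop, so INIT is
   p_0 and STEP is 4, which this function accepts.  An update such as
   p = p * 2 has no affine evolution and is rejected here (it may instead
   be recognized as a nonlinear induction below).  */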
static bool
vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree *init,
			     tree *step)
{
  tree init_expr;
  tree step_expr;
  tree evolution_part = evolution_part_in_loop_num (access_fn, loop_nb);
  basic_block bb;

  /* When there is no evolution in this loop, the evolution function
     is not "simple".  */
  if (evolution_part == NULL_TREE)
    return false;

  /* When the evolution is a polynomial of degree >= 2
     the evolution function is not "simple".  */
  if (tree_is_chrec (evolution_part))
    return false;

  step_expr = evolution_part;
  init_expr = unshare_expr (initial_condition_in_loop_num (access_fn, loop_nb));

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location, "step: %T,  init: %T\n",
		     step_expr, init_expr);

  *init = init_expr;
  *step = step_expr;

  if (TREE_CODE (step_expr) != INTEGER_CST
      && (TREE_CODE (step_expr) != SSA_NAME
	  || ((bb = gimple_bb (SSA_NAME_DEF_STMT (step_expr)))
	      && flow_bb_inside_loop_p (get_loop (cfun, loop_nb), bb))
	  || (!INTEGRAL_TYPE_P (TREE_TYPE (step_expr))
	      && (!SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr))
		  || !flag_associative_math)))
      && (TREE_CODE (step_expr) != REAL_CST
	  || !flag_associative_math))
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			 "step unknown.\n");
      return false;
    }

  return true;
}
/* Function vect_is_nonlinear_iv_evolution

   Only support nonlinear induction for integer type
   1. neg
   2. mul by constant
   3. lshift/rshift by constant.

   For neg induction, return a fake step as integer -1.  */
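/* Source-level examples of the supported nonlinear inductions (sketch,
   not taken from this file); in each case 'x' is an integer loop PHI:

	for (...) x = -x;	// vect_step_op_neg, fake step -1
	for (...) x = x * 3;	// vect_step_op_mul, step 3
	for (...) x = x << 1;	// vect_step_op_shl, step 1
	for (...) x = x >> 1;	// vect_step_op_shr, step 1  */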
static bool
vect_is_nonlinear_iv_evolution (class loop *loop, stmt_vec_info stmt_info,
				gphi *loop_phi_node, tree *init, tree *step)
{
  tree init_expr, ev_expr, result, op1, op2;
  gimple *def;

  if (gimple_phi_num_args (loop_phi_node) != 2)
    return false;

  init_expr = PHI_ARG_DEF_FROM_EDGE (loop_phi_node, loop_preheader_edge (loop));
  ev_expr = PHI_ARG_DEF_FROM_EDGE (loop_phi_node, loop_latch_edge (loop));

  /* Support nonlinear induction only for integer type.  */
  if (!INTEGRAL_TYPE_P (TREE_TYPE (init_expr)))
    return false;

  *init = init_expr;
  result = PHI_RESULT (loop_phi_node);

  if (TREE_CODE (ev_expr) != SSA_NAME
      || ((def = SSA_NAME_DEF_STMT (ev_expr)), false)
      || !is_gimple_assign (def))
    return false;

  enum tree_code t_code = gimple_assign_rhs_code (def);
  switch (t_code)
    {
    case NEGATE_EXPR:
      if (gimple_assign_rhs1 (def) != result)
	return false;
      *step = build_int_cst (TREE_TYPE (init_expr), -1);
      STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info) = vect_step_op_neg;
      break;

    case RSHIFT_EXPR:
    case LSHIFT_EXPR:
    case MULT_EXPR:
      op1 = gimple_assign_rhs1 (def);
      op2 = gimple_assign_rhs2 (def);
      if (TREE_CODE (op2) != INTEGER_CST
	  || op1 != result)
	return false;
      *step = op2;
      if (t_code == LSHIFT_EXPR)
	STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info) = vect_step_op_shl;
      else if (t_code == RSHIFT_EXPR)
	STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info) = vect_step_op_shr;
      /* NEGATE_EXPR and MULT_EXPR are both vect_step_op_mul.  */
      else
	STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info) = vect_step_op_mul;
      break;

    default:
      return false;
    }

  STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_info) = *init;
  STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info) = *step;

  return true;
}
/* Return true if PHI, described by STMT_INFO, is the inner PHI in
   what we are assuming is a double reduction.  For example, given
   a structure like this:

      outer1:
	x_1 = PHI <x_4(outer2), ...>;
	...

      inner:
	x_2 = PHI <x_1(outer1), ...>;
	...
	x_3 = ...;

      outer2:
	x_4 = PHI <x_3(inner)>;
	...

   outer loop analysis would treat x_1 as a double reduction phi and
   this function would then return true for x_2.  */
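/* Source-level sketch of the double-reduction shape described above
   (illustrative only):

	int sum = 0;			// the x_1/x_4 cycle in the outer loop
	for (int j = 0; j < M; j++)
	  for (int i = 0; i < N; i++)	// the x_2/x_3 cycle in the inner loop
	    sum += a[j][i];

   The inner-loop header PHI for 'sum' plays the role of x_2, for which
   this predicate returns true.  */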
static bool
vect_inner_phi_in_double_reduction_p (loop_vec_info loop_vinfo, gphi *phi)
{
  use_operand_p use_p;
  ssa_op_iter op_iter;
  FOR_EACH_PHI_ARG (use_p, phi, op_iter, SSA_OP_USE)
    if (stmt_vec_info def_info = loop_vinfo->lookup_def (USE_FROM_PTR (use_p)))
      if (STMT_VINFO_DEF_TYPE (def_info) == vect_double_reduction_def)
	return true;
  return false;
}
/* Function vect_analyze_scalar_cycles_1.

   Examine the cross iteration def-use cycles of scalar variables
   in LOOP.  LOOP_VINFO represents the loop that is now being
   considered for vectorization (can be LOOP, or an outer-loop
   enclosing LOOP).  SLP indicates whether subsequent SLP analysis
   will be performed.  */
static void
vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, class loop *loop,
			      bool slp)
{
  basic_block bb = loop->header;
  tree init, step;
  auto_vec<stmt_vec_info, 64> worklist;
  gphi_iterator gsi;
  bool double_reduc, reduc_chain;

  DUMP_VECT_SCOPE ("vect_analyze_scalar_cycles");

  /* First - identify all inductions.  Reduction detection assumes that all the
     inductions have been identified, therefore, this order must not be
     changed.  */
  for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
    {
      gphi *phi = gsi.phi ();
      tree access_fn = NULL;
      tree def = PHI_RESULT (phi);
      stmt_vec_info stmt_vinfo = loop_vinfo->lookup_stmt (phi);

      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G",
			 (gimple *) phi);

      /* Skip virtual phi's.  The data dependences that are associated with
	 virtual defs/uses (i.e., memory accesses) are analyzed elsewhere.  */
      if (virtual_operand_p (def))
	continue;

      STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_unknown_def_type;

      /* Analyze the evolution function.  */
      access_fn = analyze_scalar_evolution (loop, def);
      if (access_fn)
	{
	  STRIP_NOPS (access_fn);
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_NOTE, vect_location,
			     "Access function of PHI: %T\n", access_fn);
	  STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
	    = initial_condition_in_loop_num (access_fn, loop->num);
	  STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo)
	    = evolution_part_in_loop_num (access_fn, loop->num);
	}

      if ((!access_fn
	   || vect_inner_phi_in_double_reduction_p (loop_vinfo, phi)
	   || !vect_is_simple_iv_evolution (loop->num, access_fn,
					    &init, &step)
	   || (LOOP_VINFO_LOOP (loop_vinfo) != loop
	       && TREE_CODE (step) != INTEGER_CST))
	  /* Only handle nonlinear iv for same loop.  */
	  && (LOOP_VINFO_LOOP (loop_vinfo) != loop
	      || !vect_is_nonlinear_iv_evolution (loop, stmt_vinfo,
						  phi, &init, &step)))
	{
	  worklist.safe_push (stmt_vinfo);
	  continue;
	}

      gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
		  != NULL_TREE);
      gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo) != NULL_TREE);

      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location, "Detected induction.\n");
      STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_induction_def;
    }
  /* Second - identify all reductions and nested cycles.  */
  while (worklist.length () > 0)
    {
      stmt_vec_info stmt_vinfo = worklist.pop ();
      gphi *phi = as_a <gphi *> (stmt_vinfo->stmt);
      tree def = PHI_RESULT (phi);

      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G",
			 (gimple *) phi);

      gcc_assert (!virtual_operand_p (def)
		  && STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_unknown_def_type);

      stmt_vec_info reduc_stmt_info
	= vect_is_simple_reduction (loop_vinfo, stmt_vinfo, &double_reduc,
				    &reduc_chain, slp);
      if (reduc_stmt_info)
	{
	  STMT_VINFO_REDUC_DEF (stmt_vinfo) = reduc_stmt_info;
	  STMT_VINFO_REDUC_DEF (reduc_stmt_info) = stmt_vinfo;
	  if (double_reduc)
	    {
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_NOTE, vect_location,
				 "Detected double reduction.\n");

	      STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_double_reduction_def;
	      STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_double_reduction_def;
	    }
	  else
	    {
	      if (loop != LOOP_VINFO_LOOP (loop_vinfo))
		{
		  if (dump_enabled_p ())
		    dump_printf_loc (MSG_NOTE, vect_location,
				     "Detected vectorizable nested cycle.\n");

		  STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_nested_cycle;
		}
	      else
		{
		  if (dump_enabled_p ())
		    dump_printf_loc (MSG_NOTE, vect_location,
				     "Detected reduction.\n");

		  STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def;
		  STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_reduction_def;
		  /* Store the reduction cycles for possible vectorization in
		     loop-aware SLP if it was not detected as reduction
		     chain.  */
		  if (! reduc_chain)
		    LOOP_VINFO_REDUCTIONS (loop_vinfo).safe_push
		      (reduc_stmt_info);
		}
	    }
	}
      else
	if (dump_enabled_p ())
	  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			   "Unknown def-use cycle pattern.\n");
    }
}
/* Function vect_analyze_scalar_cycles.

   Examine the cross iteration def-use cycles of scalar variables, by
   analyzing the loop-header PHIs of scalar variables.  Classify each
   cycle as one of the following: invariant, induction, reduction, unknown.
   We do that for the loop represented by LOOP_VINFO, and also to its
   inner-loop, if it exists.

   Examples for scalar cycles:

   Example1: reduction:

	      loop1:
	      for (i=0; i<N; i++)
		 sum += a[i];

   Example2: induction:

	      loop2:
	      for (i=0; i<N; i++)
		 a[i] = i;  */
static void
vect_analyze_scalar_cycles (loop_vec_info loop_vinfo, bool slp)
{
  class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);

  vect_analyze_scalar_cycles_1 (loop_vinfo, loop, slp);

  /* When vectorizing an outer-loop, the inner-loop is executed sequentially.
     Reductions in such inner-loop therefore have different properties than
     the reductions in the nest that gets vectorized:
     1. When vectorized, they are executed in the same order as in the original
	scalar loop, so we can't change the order of computation when
	vectorizing them.
     2. FIXME: Inner-loop reductions can be used in the inner-loop, so the
	current checks are too strict.  */

  if (loop->inner)
    vect_analyze_scalar_cycles_1 (loop_vinfo, loop->inner, slp);
}
/* Transfer group and reduction information from STMT_INFO to its
   pattern stmt.  */

static void
vect_fixup_reduc_chain (stmt_vec_info stmt_info)
{
  stmt_vec_info firstp = STMT_VINFO_RELATED_STMT (stmt_info);
  stmt_vec_info stmtp;
  gcc_assert (!REDUC_GROUP_FIRST_ELEMENT (firstp)
	      && REDUC_GROUP_FIRST_ELEMENT (stmt_info));
  REDUC_GROUP_SIZE (firstp) = REDUC_GROUP_SIZE (stmt_info);
  do
    {
      stmtp = STMT_VINFO_RELATED_STMT (stmt_info);
      gcc_checking_assert (STMT_VINFO_DEF_TYPE (stmtp)
			   == STMT_VINFO_DEF_TYPE (stmt_info));
      REDUC_GROUP_FIRST_ELEMENT (stmtp) = firstp;
      stmt_info = REDUC_GROUP_NEXT_ELEMENT (stmt_info);
      if (stmt_info)
	REDUC_GROUP_NEXT_ELEMENT (stmtp)
	  = STMT_VINFO_RELATED_STMT (stmt_info);
    }
  while (stmt_info);
}
/* Fixup scalar cycles that now have their stmts detected as patterns.  */

static void
vect_fixup_scalar_cycles_with_patterns (loop_vec_info loop_vinfo)
{
  stmt_vec_info first;
  unsigned i;

  FOR_EACH_VEC_ELT (LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo), i, first)
    {
      stmt_vec_info next = REDUC_GROUP_NEXT_ELEMENT (first);
      while (next)
	{
	  if ((STMT_VINFO_IN_PATTERN_P (next)
	       != STMT_VINFO_IN_PATTERN_P (first))
	      || STMT_VINFO_REDUC_IDX (vect_stmt_to_vectorize (next)) == -1)
	    break;
	  next = REDUC_GROUP_NEXT_ELEMENT (next);
	}
      /* If all reduction chain members are well-formed patterns adjust
	 the group to group the pattern stmts instead.  */
      if (! next
	  && STMT_VINFO_REDUC_IDX (vect_stmt_to_vectorize (first)) != -1)
	{
	  if (STMT_VINFO_IN_PATTERN_P (first))
	    {
	      vect_fixup_reduc_chain (first);
	      LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo)[i]
		= STMT_VINFO_RELATED_STMT (first);
	    }
	}
      /* If not all stmt in the chain are patterns or if we failed
	 to update STMT_VINFO_REDUC_IDX dissolve the chain and handle
	 it as regular reduction instead.  */
      else
	{
	  stmt_vec_info vinfo = first;
	  stmt_vec_info last = NULL;
	  while (vinfo)
	    {
	      next = REDUC_GROUP_NEXT_ELEMENT (vinfo);
	      REDUC_GROUP_FIRST_ELEMENT (vinfo) = NULL;
	      REDUC_GROUP_NEXT_ELEMENT (vinfo) = NULL;
	      last = vinfo;
	      vinfo = next;
	    }
	  STMT_VINFO_DEF_TYPE (vect_stmt_to_vectorize (first))
	    = vect_internal_def;
	  loop_vinfo->reductions.safe_push (vect_stmt_to_vectorize (last));
	  LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).unordered_remove (i);
	  --i;
	}
    }
}
/* Function vect_get_loop_niters.

   Determine how many iterations the loop is executed and place it
   in NUMBER_OF_ITERATIONS.  Place the number of latch iterations
   in NUMBER_OF_ITERATIONSM1.  Place the condition under which the
   niter information holds in ASSUMPTIONS.

   Return the loop exit condition.  */
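/* Worked example (illustrative): for

	i = 0;
	do { ...; i++; } while (i < n);

   with n >= 1, the latch runs n - 1 times, so NUMBER_OF_ITERATIONSM1 is
   n - 1 and NUMBER_OF_ITERATIONS (header executions) is n, i.e. the latch
   count plus one, matching the PLUS_EXPR added at the end of this
   function.  */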
static gcond *
vect_get_loop_niters (class loop *loop, tree *assumptions,
		      tree *number_of_iterations, tree *number_of_iterationsm1)
{
  edge exit = single_exit (loop);
  class tree_niter_desc niter_desc;
  tree niter_assumptions, niter, may_be_zero;
  gcond *cond = get_loop_exit_condition (loop);

  *assumptions = boolean_true_node;
  *number_of_iterationsm1 = chrec_dont_know;
  *number_of_iterations = chrec_dont_know;
  DUMP_VECT_SCOPE ("get_loop_niters");

  if (!exit)
    return cond;

  may_be_zero = NULL_TREE;
  if (!number_of_iterations_exit_assumptions (loop, exit, &niter_desc, NULL)
      || chrec_contains_undetermined (niter_desc.niter))
    return cond;

  niter_assumptions = niter_desc.assumptions;
  may_be_zero = niter_desc.may_be_zero;
  niter = niter_desc.niter;

  if (may_be_zero && integer_zerop (may_be_zero))
    may_be_zero = NULL_TREE;

  if (may_be_zero)
    {
      if (COMPARISON_CLASS_P (may_be_zero))
	{
	  /* Try to combine may_be_zero with assumptions, this can simplify
	     computation of niter expression.  */
	  if (niter_assumptions && !integer_nonzerop (niter_assumptions))
	    niter_assumptions
	      = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
			     niter_assumptions,
			     fold_build1 (TRUTH_NOT_EXPR, boolean_type_node,
					  may_be_zero));
	  else
	    niter = fold_build3 (COND_EXPR, TREE_TYPE (niter), may_be_zero,
				 build_int_cst (TREE_TYPE (niter), 0),
				 rewrite_to_non_trapping_overflow (niter));

	  may_be_zero = NULL_TREE;
	}
      else if (integer_nonzerop (may_be_zero))
	{
	  *number_of_iterationsm1 = build_int_cst (TREE_TYPE (niter), 0);
	  *number_of_iterations = build_int_cst (TREE_TYPE (niter), 1);
	  return cond;
	}
      else
	return cond;
    }

  *assumptions = niter_assumptions;
  *number_of_iterationsm1 = niter;

  /* We want the number of loop header executions which is the number
     of latch executions plus one.
     ??? For UINT_MAX latch executions this number overflows to zero
     for loops like do { n++; } while (n != 0);  */
  if (niter && !chrec_contains_undetermined (niter))
    niter = fold_build2 (PLUS_EXPR, TREE_TYPE (niter), unshare_expr (niter),
			 build_int_cst (TREE_TYPE (niter), 1));
  *number_of_iterations = niter;

  return cond;
}
/* Function bb_in_loop_p

   Used as predicate for dfs order traversal of the loop bbs.  */

static bool
bb_in_loop_p (const_basic_block bb, const void *data)
{
  const class loop *const loop = (const class loop *)data;
  if (flow_bb_inside_loop_p (loop, bb))
    return true;
  return false;
}
/* Create and initialize a new loop_vec_info struct for LOOP_IN, as well as
   stmt_vec_info structs for all the stmts in LOOP_IN.  */

_loop_vec_info::_loop_vec_info (class loop *loop_in, vec_info_shared *shared)
  : vec_info (vec_info::loop, shared),
    loop (loop_in),
    bbs (XCNEWVEC (basic_block, loop->num_nodes)),
    num_itersm1 (NULL_TREE),
    num_iters (NULL_TREE),
    num_iters_unchanged (NULL_TREE),
    num_iters_assumptions (NULL_TREE),
    vector_costs (nullptr),
    scalar_costs (nullptr),
    versioning_threshold (0),
    vectorization_factor (0),
    main_loop_edge (nullptr),
    skip_main_loop_edge (nullptr),
    skip_this_loop_edge (nullptr),
    reusable_accumulators (),
    suggested_unroll_factor (1),
    max_vectorization_factor (0),
    mask_skip_niters (NULL_TREE),
    rgroup_compare_type (NULL_TREE),
    simd_if_cond (NULL_TREE),
    peeling_for_alignment (0),
    slp_unrolling_factor (1),
    inner_loop_cost_factor (param_vect_inner_loop_cost_factor),
    vectorizable (false),
    can_use_partial_vectors_p (param_vect_partial_vector_usage != 0),
    using_partial_vectors_p (false),
    epil_using_partial_vectors_p (false),
    partial_load_store_bias (0),
    peeling_for_gaps (false),
    peeling_for_niter (false),
    no_data_dependencies (false),
    has_mask_store (false),
    scalar_loop_scaling (profile_probability::uninitialized ()),
    orig_loop_info (NULL)
{
  /* CHECKME: We want to visit all BBs before their successors (except for
     latch blocks, for which this assertion wouldn't hold).  In the simple
     case of the loop forms we allow, a dfs order of the BBs would be the
     same as reversed postorder traversal, so we are safe.  */

  unsigned int nbbs = dfs_enumerate_from (loop->header, 0, bb_in_loop_p,
					  bbs, loop->num_nodes, loop);
  gcc_assert (nbbs == loop->num_nodes);

  for (unsigned int i = 0; i < nbbs; i++)
    {
      basic_block bb = bbs[i];
      gimple_stmt_iterator si;

      for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
	{
	  gimple *phi = gsi_stmt (si);
	  gimple_set_uid (phi, 0);
	  add_stmt (phi);
	}

      for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
	{
	  gimple *stmt = gsi_stmt (si);
	  gimple_set_uid (stmt, 0);
	  if (is_gimple_debug (stmt))
	    continue;
	  add_stmt (stmt);
	  /* If .GOMP_SIMD_LANE call for the current loop has 3 arguments, the
	     third argument is the #pragma omp simd if (x) condition, when 0,
	     loop shouldn't be vectorized, when non-zero constant, it should
	     be vectorized normally, otherwise versioned with vectorized loop
	     done if the condition is non-zero at runtime.  */
	  if (loop_in->simduid
	      && is_gimple_call (stmt)
	      && gimple_call_internal_p (stmt)
	      && gimple_call_internal_fn (stmt) == IFN_GOMP_SIMD_LANE
	      && gimple_call_num_args (stmt) >= 3
	      && TREE_CODE (gimple_call_arg (stmt, 0)) == SSA_NAME
	      && (loop_in->simduid
		  == SSA_NAME_VAR (gimple_call_arg (stmt, 0))))
	    {
	      tree arg = gimple_call_arg (stmt, 2);
	      if (integer_zerop (arg) || TREE_CODE (arg) == SSA_NAME)
		simd_if_cond = arg;
	      else
		gcc_assert (integer_nonzerop (arg));
	    }
	}
    }

  epilogue_vinfos.create (6);
}
/* Free all levels of rgroup CONTROLS.  */

static void
release_vec_loop_controls (vec<rgroup_controls> *controls)
{
  rgroup_controls *rgc;
  unsigned int i;
  FOR_EACH_VEC_ELT (*controls, i, rgc)
    rgc->controls.release ();
  controls->release ();
}
/* Free all memory used by the _loop_vec_info, as well as all the
   stmt_vec_info structs of all the stmts in the loop.  */

_loop_vec_info::~_loop_vec_info ()
{
  release_vec_loop_controls (&masks);
  release_vec_loop_controls (&lens);
  epilogue_vinfos.release ();
  delete scalar_costs;
  delete vector_costs;

  /* When we release an epilogue vinfo that we do not intend to use
     avoid clearing AUX of the main loop which should continue to
     point to the main loop vinfo since otherwise we'll leak that.  */
  if (loop->aux == this)
    loop->aux = NULL;
}
/* Return an invariant or register for EXPR and emit necessary
   computations in the LOOP_VINFO loop preheader.  */

tree
cse_and_gimplify_to_preheader (loop_vec_info loop_vinfo, tree expr)
{
  if (is_gimple_reg (expr)
      || is_gimple_min_invariant (expr))
    return expr;

  if (! loop_vinfo->ivexpr_map)
    loop_vinfo->ivexpr_map = new hash_map<tree_operand_hash, tree>;
  tree &cached = loop_vinfo->ivexpr_map->get_or_insert (expr);
  if (! cached)
    {
      gimple_seq stmts = NULL;
      cached = force_gimple_operand (unshare_expr (expr),
				     &stmts, true, NULL_TREE);
      if (stmts)
	{
	  edge e = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
	  gsi_insert_seq_on_edge_immediate (e, stmts);
	}
    }
  return cached;
}
/* Return true if we can use CMP_TYPE as the comparison type to produce
   all masks required to mask LOOP_VINFO.  */

static bool
can_produce_all_loop_masks_p (loop_vec_info loop_vinfo, tree cmp_type)
{
  rgroup_controls *rgm;
  unsigned int i;
  FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), i, rgm)
    if (rgm->type != NULL_TREE
	&& !direct_internal_fn_supported_p (IFN_WHILE_ULT,
					    cmp_type, rgm->type,
					    OPTIMIZE_FOR_SPEED))
      return false;
  return true;
}
/* Calculate the maximum number of scalars per iteration for every
   rgroup in LOOP_VINFO.  */

static unsigned int
vect_get_max_nscalars_per_iter (loop_vec_info loop_vinfo)
{
  unsigned int res = 1;
  unsigned int i;
  rgroup_controls *rgm;
  FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), i, rgm)
    res = MAX (res, rgm->max_nscalars_per_iter);
  return res;
}
/* Calculate the minimum precision necessary to represent:

      MAX_NITERS * FACTOR

   as an unsigned integer, where MAX_NITERS is the maximum number of
   loop header iterations for the original scalar form of LOOP_VINFO.  */
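/* Worked example (illustrative): if the niter type is 32 bits wide but
   max_loop_iterations proves at most 999 latch iterations, MAX_NITERS is
   1000; with FACTOR == 2 the function must represent 2000 and therefore
   returns wi::min_precision (2000, UNSIGNED), i.e. 11 bits.  */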
static unsigned
vect_min_prec_for_max_niters (loop_vec_info loop_vinfo, unsigned int factor)
{
  class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);

  /* Get the maximum number of iterations that is representable
     in the counter type.  */
  tree ni_type = TREE_TYPE (LOOP_VINFO_NITERSM1 (loop_vinfo));
  widest_int max_ni = wi::to_widest (TYPE_MAX_VALUE (ni_type)) + 1;

  /* Get a more refined estimate for the number of iterations.  */
  widest_int max_back_edges;
  if (max_loop_iterations (loop, &max_back_edges))
    max_ni = wi::smin (max_ni, max_back_edges + 1);

  /* Work out how many bits we need to represent the limit.  */
  return wi::min_precision (max_ni * factor, UNSIGNED);
}
/* True if the loop needs peeling or partial vectors when vectorized.  */
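/* Worked example (illustrative): with a known iteration count of 10, no
   peeling for alignment or gaps, and a constant VF of 4, 10 is not a
   multiple of 4, so two scalar iterations are left over and the function
   returns true; with an iteration count of 12 it would return false.  */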
static bool
vect_need_peeling_or_partial_vectors_p (loop_vec_info loop_vinfo)
{
  unsigned HOST_WIDE_INT const_vf;
  HOST_WIDE_INT max_niter
    = likely_max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));

  unsigned th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
  if (!th && LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo))
    th = LOOP_VINFO_COST_MODEL_THRESHOLD (LOOP_VINFO_ORIG_LOOP_INFO
					  (loop_vinfo));

  if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
      && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) >= 0)
    {
      /* Work out the (constant) number of iterations that need to be
	 peeled for reasons other than niters.  */
      unsigned int peel_niter = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
      if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
	peel_niter += 1;
      if (!multiple_p (LOOP_VINFO_INT_NITERS (loop_vinfo) - peel_niter,
		       LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
	return true;
    }
  else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
	   /* ??? When peeling for gaps but not alignment, we could
	      try to check whether the (variable) niters is known to be
	      VF * N + 1.  That's something of a niche case though.  */
	   || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
	   || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&const_vf)
	   || ((tree_ctz (LOOP_VINFO_NITERS (loop_vinfo))
		< (unsigned) exact_log2 (const_vf))
	       /* In case of versioning, check if the maximum number of
		  iterations is greater than th.  If they are identical,
		  the epilogue is unnecessary.  */
	       && (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
		   || ((unsigned HOST_WIDE_INT) max_niter
		       > (th / const_vf) * const_vf))))
    return true;

  return false;
}
/* Each statement in LOOP_VINFO can be masked where necessary.  Check
   whether we can actually generate the masks required.  Return true if so,
   storing the type of the scalar IV in LOOP_VINFO_RGROUP_COMPARE_TYPE.  */
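/* Source-level sketch of what full masking produces (illustrative only),
   assuming VF == 4 and a single WHILE_ULT-style rgroup:

	for (i = 0; i < n; i += 4)
	  {
	    mask = { i+0 < n, i+1 < n, i+2 < n, i+3 < n };
	    a[i:4] = b[i:4] + c[i:4];	// loads/stores only touch lanes
	  }				// whose mask bit is set

   The comparison type picked below is the scalar type of the 'i < n'
   comparisons that feed the mask.  */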
static bool
vect_verify_full_masking (loop_vec_info loop_vinfo)
{
  unsigned int min_ni_width;
  unsigned int max_nscalars_per_iter
    = vect_get_max_nscalars_per_iter (loop_vinfo);

  /* Use a normal loop if there are no statements that need masking.
     This only happens in rare degenerate cases: it means that the loop
     has no loads, no stores, and no live-out values.  */
  if (LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
    return false;

  /* Work out how many bits we need to represent the limit.  */
  min_ni_width
    = vect_min_prec_for_max_niters (loop_vinfo, max_nscalars_per_iter);

  /* Find a scalar mode for which WHILE_ULT is supported.  */
  opt_scalar_int_mode cmp_mode_iter;
  tree cmp_type = NULL_TREE;
  tree iv_type = NULL_TREE;
  widest_int iv_limit = vect_iv_limit_for_partial_vectors (loop_vinfo);
  unsigned int iv_precision = UINT_MAX;

  if (iv_limit != -1)
    iv_precision = wi::min_precision (iv_limit * max_nscalars_per_iter,
				      UNSIGNED);

  FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)
    {
      unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ());
      if (cmp_bits >= min_ni_width
	  && targetm.scalar_mode_supported_p (cmp_mode_iter.require ()))
	{
	  tree this_type = build_nonstandard_integer_type (cmp_bits, true);
	  if (this_type
	      && can_produce_all_loop_masks_p (loop_vinfo, this_type))
	    {
	      /* Although we could stop as soon as we find a valid mode,
		 there are at least two reasons why that's not always the
		 best choice:

		 - An IV that's Pmode or wider is more likely to be reusable
		   in address calculations than an IV that's narrower than
		   Pmode.

		 - Doing the comparison in IV_PRECISION or wider allows
		   a natural 0-based IV, whereas using a narrower comparison
		   type requires mitigations against wrap-around.

		 Conversely, if the IV limit is variable, doing the comparison
		 in a wider type than the original type can introduce
		 unnecessary extensions, so picking the widest valid mode
		 is not always a good choice either.

		 Here we prefer the first IV type that's Pmode or wider,
		 and the first comparison type that's IV_PRECISION or wider.
		 (The comparison type must be no wider than the IV type,
		 to avoid extensions in the vector loop.)

		 ??? We might want to try continuing beyond Pmode for ILP32
		 targets if CMP_BITS < IV_PRECISION.  */
	      iv_type = this_type;
	      if (!cmp_type || iv_precision > TYPE_PRECISION (cmp_type))
		cmp_type = this_type;
	      if (cmp_bits >= GET_MODE_BITSIZE (Pmode))
		break;
	    }
	}
    }

  if (!cmp_type)
    return false;

  LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo) = cmp_type;
  LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo) = iv_type;
  return true;
}
/* Check whether we can use vector access with length based on precision
   comparison.  So far, to keep it simple, we only allow the case that the
   precision of the target supported length is larger than the precision
   required by loop niters.  */
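/* Source-level sketch of length-based partial vectors (illustrative only),
   assuming VF == 4 and a zero length bias:

	for (i = 0; i < n; i += 4)
	  {
	    len = MIN (n - i, 4);		// active elements this iteration
	    a[i:len] = b[i:len] + c[i:len];	// LEN_LOAD/LEN_STORE with LEN
	  }  */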
static bool
vect_verify_loop_lens (loop_vec_info loop_vinfo)
{
  if (LOOP_VINFO_LENS (loop_vinfo).is_empty ())
    return false;

  machine_mode len_load_mode = get_len_load_store_mode
    (loop_vinfo->vector_mode, true).require ();
  machine_mode len_store_mode = get_len_load_store_mode
    (loop_vinfo->vector_mode, false).require ();

  signed char partial_load_bias = internal_len_load_store_bias
    (IFN_LEN_LOAD, len_load_mode);

  signed char partial_store_bias = internal_len_load_store_bias
    (IFN_LEN_STORE, len_store_mode);

  gcc_assert (partial_load_bias == partial_store_bias);

  if (partial_load_bias == VECT_PARTIAL_BIAS_UNSUPPORTED)
    return false;

  /* If the backend requires a bias of -1 for LEN_LOAD, we must not emit
     len_loads with a length of zero.  In order to avoid that we prohibit
     more than one loop length here.  */
  if (partial_load_bias == -1
      && LOOP_VINFO_LENS (loop_vinfo).length () > 1)
    return false;

  LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo) = partial_load_bias;

  unsigned int max_nitems_per_iter = 1;
  unsigned int i;
  rgroup_controls *rgl;
  /* Find the maximum number of items per iteration for every rgroup.  */
  FOR_EACH_VEC_ELT (LOOP_VINFO_LENS (loop_vinfo), i, rgl)
    {
      unsigned nitems_per_iter = rgl->max_nscalars_per_iter * rgl->factor;
      max_nitems_per_iter = MAX (max_nitems_per_iter, nitems_per_iter);
    }

  /* Work out how many bits we need to represent the length limit.  */
  unsigned int min_ni_prec
    = vect_min_prec_for_max_niters (loop_vinfo, max_nitems_per_iter);

  /* Now use the maximum of below precisions for one suitable IV type:
     - the IV's natural precision
     - the precision needed to hold: the maximum number of scalar
       iterations multiplied by the scale factor (min_ni_prec above)
     - the Pmode precision

     If min_ni_prec is less than the precision of the current niters,
     we prefer to still use the niters type.  Prefer to use Pmode and
     wider IV to avoid narrow conversions.  */

  unsigned int ni_prec
    = TYPE_PRECISION (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)));
  min_ni_prec = MAX (min_ni_prec, ni_prec);
  min_ni_prec = MAX (min_ni_prec, GET_MODE_BITSIZE (Pmode));

  tree iv_type = NULL_TREE;
  opt_scalar_int_mode tmode_iter;
  FOR_EACH_MODE_IN_CLASS (tmode_iter, MODE_INT)
    {
      scalar_mode tmode = tmode_iter.require ();
      unsigned int tbits = GET_MODE_BITSIZE (tmode);

      /* ??? Do we really want to construct one IV whose precision exceeds
	 BITS_PER_WORD?  */
      if (tbits > BITS_PER_WORD)
	break;

      /* Find the first available standard integral type.  */
      if (tbits >= min_ni_prec && targetm.scalar_mode_supported_p (tmode))
	{
	  iv_type = build_nonstandard_integer_type (tbits, true);
	  break;
	}
    }

  if (!iv_type)
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			 "can't vectorize with length-based partial vectors"
			 " because there is no suitable iv type.\n");
      return false;
    }

  LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo) = iv_type;
  LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo) = iv_type;

  return true;
}
/* Calculate the cost of one scalar iteration of the loop.  */
static void
vect_compute_single_scalar_iteration_cost (loop_vec_info loop_vinfo)
{
  class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
  int nbbs = loop->num_nodes, factor;
  int innerloop_iters, i;

  DUMP_VECT_SCOPE ("vect_compute_single_scalar_iteration_cost");

  /* Gather costs for statements in the scalar loop.  */

  /* FORNOW.  */
  innerloop_iters = 1;
  if (loop->inner)
    innerloop_iters = LOOP_VINFO_INNER_LOOP_COST_FACTOR (loop_vinfo);

  for (i = 0; i < nbbs; i++)
    {
      gimple_stmt_iterator si;
      basic_block bb = bbs[i];

      if (bb->loop_father == loop->inner)
	factor = innerloop_iters;
      else
	factor = 1;

      for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
	{
	  gimple *stmt = gsi_stmt (si);
	  stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (stmt);

	  if (!is_gimple_assign (stmt) && !is_gimple_call (stmt))
	    continue;

	  /* Skip stmts that are not vectorized inside the loop.  */
	  stmt_vec_info vstmt_info = vect_stmt_to_vectorize (stmt_info);
	  if (!STMT_VINFO_RELEVANT_P (vstmt_info)
	      && (!STMT_VINFO_LIVE_P (vstmt_info)
		  || !VECTORIZABLE_CYCLE_DEF
			(STMT_VINFO_DEF_TYPE (vstmt_info))))
	    continue;

	  vect_cost_for_stmt kind;
	  if (STMT_VINFO_DATA_REF (stmt_info))
	    {
	      if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
		kind = scalar_load;
	      else
		kind = scalar_store;
	    }
	  else if (vect_nop_conversion_p (stmt_info))
	    continue;
	  else
	    kind = scalar_stmt;

	  /* We are using vect_prologue here to avoid scaling twice
	     by the inner loop factor.  */
	  record_stmt_cost (&LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
			    factor, kind, stmt_info, 0, vect_prologue);
	}
    }

  /* Now accumulate cost.  */
  loop_vinfo->scalar_costs = init_cost (loop_vinfo, true);
  add_stmt_costs (loop_vinfo->scalar_costs,
		  &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo));
  loop_vinfo->scalar_costs->finish_cost (nullptr);
}
/* Function vect_analyze_loop_form.

   Verify that certain CFG restrictions hold, including:
   - the loop has a pre-header
   - the loop has a single entry and exit
   - the loop exit condition is simple enough
   - the number of iterations can be analyzed, i.e, a countable loop.  The
     niter could be analyzed under some assumptions.  */
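/* Illustrative example (not from this file) of the accepted inner-most
   form: a two-block loop whose header holds all executable statements and
   ends in the exit test, with an empty latch:

	header:  i_1 = PHI <0 (preheader), i_2 (latch)>
		 ...loop body...
		 i_2 = i_1 + 1;
		 if (i_2 < n_3) goto latch; else goto exit;
	latch:	 goto header;  */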
opt_result
vect_analyze_loop_form (class loop *loop, vect_loop_form_info *info)
{
  DUMP_VECT_SCOPE ("vect_analyze_loop_form");

  /* Different restrictions apply when we are considering an inner-most loop,
     vs. an outer (nested) loop.
     (FORNOW. May want to relax some of these restrictions in the future).  */

  info->inner_loop_cond = NULL;
  if (!loop->inner)
    {
      /* Inner-most loop.  We currently require that the number of BBs is
	 exactly 2 (the header and latch).  Vectorizable inner-most loops
	 look like this:

			(pre-header)
			   |
			  header <--------+
			   | |            |
			   | +--> latch --+
			   |
			(exit-bb)  */

      if (loop->num_nodes != 2)
	return opt_result::failure_at (vect_location,
				       "not vectorized:"
				       " control flow in loop.\n");

      if (empty_block_p (loop->header))
	return opt_result::failure_at (vect_location,
				       "not vectorized: empty loop.\n");
    }
  else
    {
      class loop *innerloop = loop->inner;
      edge entryedge;

      /* Nested loop.  We currently require that the loop is doubly-nested,
	 contains a single inner loop, and the number of BBs is exactly 5.
	 Vectorizable outer-loops look like this:

			(pre-header)
			   |
			  header <---+
			   |         |
			  inner-loop |
			   |         |
			  tail ------+
			   |
			(exit-bb)

	 The inner-loop has the properties expected of inner-most loops
	 as described above.  */

      if ((loop->inner)->inner || (loop->inner)->next)
	return opt_result::failure_at (vect_location,
				       "not vectorized:"
				       " multiple nested loops.\n");

      if (loop->num_nodes != 5)
	return opt_result::failure_at (vect_location,
				       "not vectorized:"
				       " control flow in loop.\n");

      entryedge = loop_preheader_edge (innerloop);
      if (entryedge->src != loop->header
	  || !single_exit (innerloop)
	  || single_exit (innerloop)->dest != EDGE_PRED (loop->latch, 0)->src)
	return opt_result::failure_at (vect_location,
				       "not vectorized:"
				       " unsupported outerloop form.\n");

      /* Analyze the inner-loop.  */
      vect_loop_form_info inner;
      opt_result res = vect_analyze_loop_form (loop->inner, &inner);
      if (!res)
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			     "not vectorized: Bad inner loop.\n");
	  return res;
	}

      /* Don't support analyzing niter under assumptions for inner
	 loop.  */
      if (!integer_onep (inner.assumptions))
	return opt_result::failure_at (vect_location,
				       "not vectorized: Bad inner loop.\n");

      if (!expr_invariant_in_loop_p (loop, inner.number_of_iterations))
	return opt_result::failure_at (vect_location,
				       "not vectorized: inner-loop count not"
				       " invariant.\n");

      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location,
			 "Considering outer-loop vectorization.\n");
      info->inner_loop_cond = inner.loop_cond;
    }

  if (!single_exit (loop))
    return opt_result::failure_at (vect_location,
				   "not vectorized: multiple exits.\n");
  if (EDGE_COUNT (loop->header->preds) != 2)
    return opt_result::failure_at (vect_location,
				   "not vectorized:"
				   " too many incoming edges.\n");

  /* We assume that the loop exit condition is at the end of the loop. i.e,
     that the loop is represented as a do-while (with a proper if-guard
     before the loop if needed), where the loop header contains all the
     executable statements, and the latch is empty.  */
  if (!empty_block_p (loop->latch)
      || !gimple_seq_empty_p (phi_nodes (loop->latch)))
    return opt_result::failure_at (vect_location,
				   "not vectorized: latch block not empty.\n");

  /* Make sure the exit is not abnormal.  */
  edge e = single_exit (loop);
  if (e->flags & EDGE_ABNORMAL)
    return opt_result::failure_at (vect_location,
				   "not vectorized:"
				   " abnormal loop exit edge.\n");

  info->loop_cond
    = vect_get_loop_niters (loop, &info->assumptions,
			    &info->number_of_iterations,
			    &info->number_of_iterationsm1);
  if (!info->loop_cond)
    return opt_result::failure_at
      (vect_location,
       "not vectorized: complicated exit condition.\n");

  if (integer_zerop (info->assumptions)
      || !info->number_of_iterations
      || chrec_contains_undetermined (info->number_of_iterations))
    return opt_result::failure_at
      (info->loop_cond,
       "not vectorized: number of iterations cannot be computed.\n");

  if (integer_zerop (info->number_of_iterations))
    return opt_result::failure_at
      (info->loop_cond,
       "not vectorized: number of iterations = 0.\n");

  if (!(tree_fits_shwi_p (info->number_of_iterations)
	&& tree_to_shwi (info->number_of_iterations) > 0))
    {
      if (dump_enabled_p ())
	{
	  dump_printf_loc (MSG_NOTE, vect_location,
			   "Symbolic number of iterations is ");
	  dump_generic_expr (MSG_NOTE, TDF_DETAILS,
			     info->number_of_iterations);
	  dump_printf (MSG_NOTE, "\n");
	}
    }

  return opt_result::success ();
}
/* Create a loop_vec_info for LOOP with SHARED and the
   vect_analyze_loop_form result.  */

static loop_vec_info
vect_create_loop_vinfo (class loop *loop, vec_info_shared *shared,
			const vect_loop_form_info *info,
			loop_vec_info main_loop_info)
{
  loop_vec_info loop_vinfo = new _loop_vec_info (loop, shared);
  LOOP_VINFO_NITERSM1 (loop_vinfo) = info->number_of_iterationsm1;
  LOOP_VINFO_NITERS (loop_vinfo) = info->number_of_iterations;
  LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = info->number_of_iterations;
  LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = main_loop_info;
  /* Also record the assumptions for versioning.  */
  if (!integer_onep (info->assumptions) && !main_loop_info)
    LOOP_VINFO_NITERS_ASSUMPTIONS (loop_vinfo) = info->assumptions;

  stmt_vec_info loop_cond_info = loop_vinfo->lookup_stmt (info->loop_cond);
  STMT_VINFO_TYPE (loop_cond_info) = loop_exit_ctrl_vec_info_type;
  if (info->inner_loop_cond)
    {
      stmt_vec_info inner_loop_cond_info
	= loop_vinfo->lookup_stmt (info->inner_loop_cond);
      STMT_VINFO_TYPE (inner_loop_cond_info) = loop_exit_ctrl_vec_info_type;
      /* If we have an estimate on the number of iterations of the inner
	 loop use that to limit the scale for costing, otherwise use
	 --param vect-inner-loop-cost-factor literally.  */
      widest_int nit;
      if (estimated_stmt_executions (loop->inner, &nit))
	LOOP_VINFO_INNER_LOOP_COST_FACTOR (loop_vinfo)
	  = wi::smin (nit, param_vect_inner_loop_cost_factor).to_uhwi ();
    }

  return loop_vinfo;
}
/* Scan the loop stmts and dependent on whether there are any (non-)SLP
   statements update the vectorization factor.  */
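/* Worked example (illustrative): if the loop-based VF computed so far is 4
   and the SLP instances require an unrolling factor of 2,
   force_common_multiple yields 4; an SLP unrolling factor of 8 would raise
   the VF to 8.  If every statement is pure SLP, the VF simply becomes the
   SLP unrolling factor.  */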
static void
vect_update_vf_for_slp (loop_vec_info loop_vinfo)
{
  class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
  int nbbs = loop->num_nodes;
  poly_uint64 vectorization_factor;
  int i;

  DUMP_VECT_SCOPE ("vect_update_vf_for_slp");

  vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
  gcc_assert (known_ne (vectorization_factor, 0U));

  /* If all the stmts in the loop can be SLPed, we perform only SLP, and
     vectorization factor of the loop is the unrolling factor required by
     the SLP instances.  If that unrolling factor is 1, we say, that we
     perform pure SLP on loop - cross iteration parallelism is not
     exploited.  */
  bool only_slp_in_loop = true;
  for (i = 0; i < nbbs; i++)
    {
      basic_block bb = bbs[i];
      for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
	   gsi_next (&si))
	{
	  stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (si.phi ());
	  if (!stmt_info)
	    continue;
	  if ((STMT_VINFO_RELEVANT_P (stmt_info)
	       || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
	      && !PURE_SLP_STMT (stmt_info))
	    /* STMT needs both SLP and loop-based vectorization.  */
	    only_slp_in_loop = false;
	}
      for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
	   gsi_next (&si))
	{
	  if (is_gimple_debug (gsi_stmt (si)))
	    continue;
	  stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
	  stmt_info = vect_stmt_to_vectorize (stmt_info);
	  if ((STMT_VINFO_RELEVANT_P (stmt_info)
	       || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
	      && !PURE_SLP_STMT (stmt_info))
	    /* STMT needs both SLP and loop-based vectorization.  */
	    only_slp_in_loop = false;
	}
    }

  if (only_slp_in_loop)
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location,
			 "Loop contains only SLP stmts\n");
      vectorization_factor = LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo);
    }
  else
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location,
			 "Loop contains SLP and non-SLP stmts\n");
      /* Both the vectorization factor and unroll factor have the form
	 GET_MODE_SIZE (loop_vinfo->vector_mode) * X for some rational X,
	 so they must have a common multiple.  */
      vectorization_factor
	= force_common_multiple (vectorization_factor,
				 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo));
    }

  LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
  if (dump_enabled_p ())
    {
      dump_printf_loc (MSG_NOTE, vect_location,
		       "Updating vectorization factor to ");
      dump_dec (MSG_NOTE, vectorization_factor);
      dump_printf (MSG_NOTE, ".\n");
    }
}
/* Return true if STMT_INFO describes a double reduction phi and if
   the other phi in the reduction is also relevant for vectorization.
   This rejects cases such as:

      outer1:
	x_1 = PHI <x_3(outer2), ...>;
	...

      inner:
	x_2 = ...;
	...

      outer2:
	x_3 = PHI <x_2(inner)>;

   if nothing in x_2 or elsewhere makes x_1 relevant.  */

static bool
vect_active_double_reduction_p (stmt_vec_info stmt_info)
{
  if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
    return false;

  return STMT_VINFO_RELEVANT_P (STMT_VINFO_REDUC_DEF (stmt_info));
}
/* Function vect_analyze_loop_operations.

   Scan the loop stmts and make sure they are all vectorizable.  */

static opt_result
vect_analyze_loop_operations (loop_vec_info loop_vinfo)
{
  class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
  int nbbs = loop->num_nodes;
  int i;
  stmt_vec_info stmt_info;
  bool need_to_vectorize = false;
  bool ok;

  DUMP_VECT_SCOPE ("vect_analyze_loop_operations");

  auto_vec<stmt_info_for_cost> cost_vec;

  for (i = 0; i < nbbs; i++)
    {
      basic_block bb = bbs[i];

      for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
	   gsi_next (&si))
	{
	  gphi *phi = si.phi ();
	  ok = true;

	  stmt_info = loop_vinfo->lookup_stmt (phi);
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_NOTE, vect_location, "examining phi: %G",
			     (gimple *) phi);
	  if (virtual_operand_p (gimple_phi_result (phi)))
	    continue;

	  /* Inner-loop loop-closed exit phi in outer-loop vectorization
	     (i.e., a phi in the tail of the outer-loop).  */
	  if (! is_loop_header_bb_p (bb))
	    {
	      /* FORNOW: we currently don't support the case that these phis
		 are not used in the outerloop (unless it is double reduction,
		 i.e., this phi is vect_reduction_def), cause this case
		 requires to actually do something here.  */
	      if (STMT_VINFO_LIVE_P (stmt_info)
		  && !vect_active_double_reduction_p (stmt_info))
		return opt_result::failure_at (phi,
					       "Unsupported loop-closed phi"
					       " in outer-loop.\n");

	      /* If PHI is used in the outer loop, we check that its operand
		 is defined in the inner loop.  */
	      if (STMT_VINFO_RELEVANT_P (stmt_info))
		{
		  tree phi_op;

		  if (gimple_phi_num_args (phi) != 1)
		    return opt_result::failure_at (phi, "unsupported phi");

		  phi_op = PHI_ARG_DEF (phi, 0);
		  stmt_vec_info op_def_info = loop_vinfo->lookup_def (phi_op);
		  if (!op_def_info)
		    return opt_result::failure_at (phi, "unsupported phi\n");

		  if (STMT_VINFO_RELEVANT (op_def_info) != vect_used_in_outer
		      && (STMT_VINFO_RELEVANT (op_def_info)
			  != vect_used_in_outer_by_reduction))
		    return opt_result::failure_at (phi, "unsupported phi\n");

		  if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def
		       || (STMT_VINFO_DEF_TYPE (stmt_info)
			   == vect_double_reduction_def))
		      && !vectorizable_lc_phi (loop_vinfo,
					       stmt_info, NULL, NULL))
		    return opt_result::failure_at (phi, "unsupported phi\n");
		}

	      continue;
	    }

	  gcc_assert (stmt_info);

	  if ((STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope
	       || STMT_VINFO_LIVE_P (stmt_info))
	      && STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
	    /* A scalar-dependence cycle that we don't support.  */
	    return opt_result::failure_at (phi,
					   "not vectorized:"
					   " scalar dependence cycle.\n");

	  if (STMT_VINFO_RELEVANT_P (stmt_info))
	    {
	      need_to_vectorize = true;
	      if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
		  && ! PURE_SLP_STMT (stmt_info))
		ok = vectorizable_induction (loop_vinfo,
					     stmt_info, NULL, NULL,
					     &cost_vec);
	      else if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
			|| (STMT_VINFO_DEF_TYPE (stmt_info)
			    == vect_double_reduction_def)
			|| STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
		       && ! PURE_SLP_STMT (stmt_info))
		ok = vectorizable_reduction (loop_vinfo,
					     stmt_info, NULL, NULL, &cost_vec);
	    }

	  /* SLP PHIs are tested by vect_slp_analyze_node_operations.  */
	  if (ok
	      && STMT_VINFO_LIVE_P (stmt_info)
	      && !PURE_SLP_STMT (stmt_info))
	    ok = vectorizable_live_operation (loop_vinfo,
					      stmt_info, NULL, NULL, NULL,
					      -1, false, &cost_vec);

	  if (!ok)
	    return opt_result::failure_at (phi,
					   "not vectorized: relevant phi not "
					   "supported: %G",
					   static_cast <gimple *> (phi));
	}

      for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
	   gsi_next (&si))
	{
	  gimple *stmt = gsi_stmt (si);
	  if (!gimple_clobber_p (stmt)
	      && !is_gimple_debug (stmt))
	    {
	      opt_result res
		= vect_analyze_stmt (loop_vinfo,
				     loop_vinfo->lookup_stmt (stmt),
				     &need_to_vectorize,
				     NULL, NULL, &cost_vec);
	      if (!res)
		return res;
	    }
	}
    } /* bbs */

  add_stmt_costs (loop_vinfo->vector_costs, &cost_vec);

  /* All operations in the loop are either irrelevant (deal with loop
     control, or dead), or only used outside the loop and can be moved
     out of the loop (e.g. invariants, inductions).  The loop can be
     optimized away by scalar optimizations.  We're better off not
     touching this loop.  */
  if (!need_to_vectorize)
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location,
			 "All the computation can be taken out of the loop.\n");
      return opt_result::failure_at
	(vect_location,
	 "not vectorized: redundant loop. no profit to vectorize.\n");
    }

  return opt_result::success ();
}
/* Return true if we know that the iteration count is smaller than the
   vectorization factor.  Return false if it isn't, or if we can't be sure
   either way.  */

static bool
vect_known_niters_smaller_than_vf (loop_vec_info loop_vinfo)
{
  unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);

  HOST_WIDE_INT max_niter;
  if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
    max_niter = LOOP_VINFO_INT_NITERS (loop_vinfo);
  else
    max_niter = max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));

  if (max_niter != -1 && (unsigned HOST_WIDE_INT) max_niter < assumed_vf)
    return true;

  return false;
}
/* Analyze the cost of the loop described by LOOP_VINFO.  Decide if it
   is worthwhile to vectorize.  Return 1 if definitely yes, 0 if
   definitely no, or -1 if it's worth retrying.  */
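/* Worked example (illustrative): if the target cost model reports
   min_profitable_iters == 7 and --param min-vect-loop-bound is 0, the
   runtime threshold becomes 7; a loop known to run only 5 iterations is
   then rejected as unprofitable, whereas an unknown trip count leads to a
   runtime check against that threshold.  */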
static int
vect_analyze_loop_costing (loop_vec_info loop_vinfo,
			   unsigned *suggested_unroll_factor)
{
  class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);

  /* Only loops that can handle partially-populated vectors can have iteration
     counts less than the vectorization factor.  */
  if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
    {
      if (vect_known_niters_smaller_than_vf (loop_vinfo))
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			     "not vectorized: iteration count smaller than "
			     "vectorization factor.\n");
	  return 0;
	}
    }

  /* If using the "very cheap" model, reject cases in which we'd keep
     a copy of the scalar code (even if we might be able to vectorize it).  */
  if (loop_cost_model (loop) == VECT_COST_MODEL_VERY_CHEAP
      && (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
	  || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
	  || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)))
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			 "some scalar iterations would need to be peeled\n");
      return 0;
    }

  int min_profitable_iters, min_profitable_estimate;
  vect_estimate_min_profitable_iters (loop_vinfo, &min_profitable_iters,
				      &min_profitable_estimate,
				      suggested_unroll_factor);

  if (min_profitable_iters < 0)
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			 "not vectorized: vectorization not profitable.\n");
      if (dump_enabled_p ())
	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			 "not vectorized: vector version will never be "
			 "profitable.\n");
      return -1;
    }

  int min_scalar_loop_bound = (param_min_vect_loop_bound
			       * assumed_vf);

  /* Use the cost model only if it is more conservative than user specified
     threshold.  */
  unsigned int th = (unsigned) MAX (min_scalar_loop_bound,
				    min_profitable_iters);

  LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = th;

  if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
      && LOOP_VINFO_INT_NITERS (loop_vinfo) < th)
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			 "not vectorized: vectorization not profitable.\n");
      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location,
			 "not vectorized: iteration count smaller than user "
			 "specified loop bound parameter or minimum profitable "
			 "iterations (whichever is more conservative).\n");
      return 0;
    }

  /* The static profitability threshold min_profitable_estimate includes
     the cost of having to check at runtime whether the scalar loop
     should be used instead.  If it turns out that we don't need or want
     such a check, the threshold we should use for the static estimate
     is simply the point at which the vector loop becomes more profitable
     than the scalar loop.  */
  if (min_profitable_estimate > min_profitable_iters
      && !LOOP_REQUIRES_VERSIONING (loop_vinfo)
      && !LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
      && !LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
      && !vect_apply_runtime_profitability_check_p (loop_vinfo))
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location, "no need for a runtime"
			 " choice between the scalar and vector loops\n");
      min_profitable_estimate = min_profitable_iters;
    }

  /* If the vector loop needs multiple iterations to be beneficial then
     things are probably too close to call, and the conservative thing
     would be to stick with the scalar code.  */
  if (loop_cost_model (loop) == VECT_COST_MODEL_VERY_CHEAP
      && min_profitable_estimate > (int) vect_vf_for_cost (loop_vinfo))
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			 "one iteration of the vector loop would be"
			 " more expensive than the equivalent number of"
			 " iterations of the scalar loop\n");
      return 0;
    }

  HOST_WIDE_INT estimated_niter;

  /* If we are vectorizing an epilogue then we know the maximum number of
     scalar iterations it will cover is at least one lower than the
     vectorization factor of the main loop.  */
  if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
    estimated_niter
      = vect_vf_for_cost (LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo)) - 1;
  else
    {
      estimated_niter = estimated_stmt_executions_int (loop);
      if (estimated_niter == -1)
	estimated_niter = likely_max_stmt_executions_int (loop);
);
2035 if (estimated_niter
!= -1
2036 && ((unsigned HOST_WIDE_INT
) estimated_niter
2037 < MAX (th
, (unsigned) min_profitable_estimate
)))
2039 if (dump_enabled_p ())
2040 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2041 "not vectorized: estimated iteration count too "
2043 if (dump_enabled_p ())
2044 dump_printf_loc (MSG_NOTE
, vect_location
,
2045 "not vectorized: estimated iteration count smaller "
2046 "than specified loop bound parameter or minimum "
2047 "profitable iterations (whichever is more "
2048 "conservative).\n");
static opt_result
vect_get_datarefs_in_loop (loop_p loop, basic_block *bbs,
			   vec<data_reference_p> *datarefs,
			   unsigned int *n_stmts)
{
  *n_stmts = 0;
  for (unsigned i = 0; i < loop->num_nodes; i++)
    for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
	 !gsi_end_p (gsi); gsi_next (&gsi))
      {
	gimple *stmt = gsi_stmt (gsi);
	if (is_gimple_debug (stmt))
	  continue;
	++(*n_stmts);
	opt_result res = vect_find_stmt_data_reference (loop, stmt, datarefs,
							*n_stmts);
	if (!res)
	  {
	    if (is_gimple_call (stmt) && loop->safelen)
	      {
		tree fndecl = gimple_call_fndecl (stmt), op;
		if (fndecl != NULL_TREE)
		  {
		    cgraph_node *node = cgraph_node::get (fndecl);
		    if (node != NULL && node->simd_clones != NULL)
		      {
			unsigned int j, n = gimple_call_num_args (stmt);
			for (j = 0; j < n; j++)
			  {
			    op = gimple_call_arg (stmt, j);
			    if (DECL_P (op)
				|| (REFERENCE_CLASS_P (op)
				    && get_base_address (op)))
			      break;
			  }
			op = gimple_call_lhs (stmt);
			/* Ignore #pragma omp declare simd functions
			   if they don't have data references in the
			   call stmt itself.  */
			if (j == n
			    && !(op
				 && (DECL_P (op)
				     || (REFERENCE_CLASS_P (op)
					 && get_base_address (op)))))
			  continue;
		      }
		  }
	      }
	    return res;
	  }
	/* If dependence analysis will give up due to the limit on the
	   number of datarefs stop here and fail fatally.  */
	if (datarefs->length ()
	    > (unsigned)param_loop_max_datarefs_for_datadeps)
	  return opt_result::failure_at (stmt, "exceeded param "
					 "loop-max-datarefs-for-datadeps\n");
      }
  return opt_result::success ();
}
/* Look for SLP-only access groups and turn each individual access into its own
   group.  */

static void
vect_dissolve_slp_only_groups (loop_vec_info loop_vinfo)
{
  unsigned int i;
  struct data_reference *dr;

  DUMP_VECT_SCOPE ("vect_dissolve_slp_only_groups");

  vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
  FOR_EACH_VEC_ELT (datarefs, i, dr)
    {
      gcc_assert (DR_REF (dr));
      stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (DR_STMT (dr));

      /* Check if the load is a part of an interleaving chain.  */
      if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
	{
	  stmt_vec_info first_element = DR_GROUP_FIRST_ELEMENT (stmt_info);
	  dr_vec_info *dr_info = STMT_VINFO_DR_INFO (first_element);
	  unsigned int group_size = DR_GROUP_SIZE (first_element);

	  /* Check if SLP-only groups.  */
	  if (!STMT_SLP_TYPE (stmt_info)
	      && STMT_VINFO_SLP_VECT_ONLY (first_element))
	    {
	      /* Dissolve the group.  */
	      STMT_VINFO_SLP_VECT_ONLY (first_element) = false;

	      stmt_vec_info vinfo = first_element;
	      while (vinfo)
		{
		  stmt_vec_info next = DR_GROUP_NEXT_ELEMENT (vinfo);
		  DR_GROUP_FIRST_ELEMENT (vinfo) = vinfo;
		  DR_GROUP_NEXT_ELEMENT (vinfo) = NULL;
		  DR_GROUP_SIZE (vinfo) = 1;
		  if (STMT_VINFO_STRIDED_P (first_element))
		    DR_GROUP_GAP (vinfo) = 0;
		  else
		    DR_GROUP_GAP (vinfo) = group_size - 1;
		  /* Duplicate and adjust alignment info, it needs to
		     be present on each group leader, see dr_misalignment.  */
		  if (vinfo != first_element)
		    {
		      dr_vec_info *dr_info2 = STMT_VINFO_DR_INFO (vinfo);
		      dr_info2->target_alignment = dr_info->target_alignment;
		      int misalignment = dr_info->misalignment;
		      if (misalignment != DR_MISALIGNMENT_UNKNOWN)
			{
			  HOST_WIDE_INT diff
			    = (TREE_INT_CST_LOW (DR_INIT (dr_info2->dr))
			       - TREE_INT_CST_LOW (DR_INIT (dr_info->dr)));
			  unsigned HOST_WIDE_INT align_c
			    = dr_info->target_alignment.to_constant ();
			  misalignment = (misalignment + diff) % align_c;
			}
		      dr_info2->misalignment = misalignment;
		    }
		  vinfo = next;
		}
	    }
	}
    }
}
/* Determine if operating on full vectors for LOOP_VINFO might leave
   some scalar iterations still to do.  If so, decide how we should
   handle those scalar iterations.  The possibilities are:

   (1) Make LOOP_VINFO operate on partial vectors instead of full vectors.
       In this case:

	 LOOP_VINFO_USING_PARTIAL_VECTORS_P == true
	 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P == false
	 LOOP_VINFO_PEELING_FOR_NITER == false

   (2) Make LOOP_VINFO operate on full vectors and use an epilogue loop
       to handle the remaining scalar iterations.  In this case:

	 LOOP_VINFO_USING_PARTIAL_VECTORS_P == false
	 LOOP_VINFO_PEELING_FOR_NITER == true

       There are two choices:

       (2a) Consider vectorizing the epilogue loop at the same VF as the
	    main loop, but using partial vectors instead of full vectors.
	    In this case:

	      LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P == true

       (2b) Consider vectorizing the epilogue loop at lower VFs only.
	    In this case:

	      LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P == false

   When FOR_EPILOGUE_P is true, make this determination based on the
   assumption that LOOP_VINFO is an epilogue loop, otherwise make it
   based on the assumption that LOOP_VINFO is the main loop.  The caller
   has made sure that the number of iterations is set appropriately for
   this value of FOR_EPILOGUE_P.  */
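/* As a concrete example, take a loop with 100 scalar iterations and a
   vectorization factor of 16.  Choice (1) runs 7 vector iterations, the
   last one masked down to 4 active lanes, whereas choice (2) runs 6 full
   vector iterations and leaves 4 scalar iterations for the epilogue
   loop.  */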
opt_result
vect_determine_partial_vectors_and_peeling (loop_vec_info loop_vinfo,
					    bool for_epilogue_p)
{
  /* Determine whether there would be any scalar iterations left over.  */
  bool need_peeling_or_partial_vectors_p
    = vect_need_peeling_or_partial_vectors_p (loop_vinfo);

  /* Decide whether to vectorize the loop with partial vectors.  */
  LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = false;
  LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P (loop_vinfo) = false;
  if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
      && need_peeling_or_partial_vectors_p)
    {
      /* For partial-vector-usage=1, try to push the handling of partial
	 vectors to the epilogue, with the main loop continuing to operate
	 on full vectors.

	 If we are unrolling we also do not want to use partial vectors.  This
	 is to avoid the overhead of generating multiple masks and also to
	 avoid having to execute entire iterations of FALSE masked instructions
	 when dealing with one or less full iterations.

	 ???  We could then end up failing to use partial vectors if we
	 decide to peel iterations into a prologue, and if the main loop
	 then ends up processing fewer than VF iterations.  */
      if ((param_vect_partial_vector_usage == 1
	   || loop_vinfo->suggested_unroll_factor > 1)
	  && !LOOP_VINFO_EPILOGUE_P (loop_vinfo)
	  && !vect_known_niters_smaller_than_vf (loop_vinfo))
	LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P (loop_vinfo) = true;
      else
	LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = true;
    }

  if (dump_enabled_p ())
    {
      if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
	dump_printf_loc (MSG_NOTE, vect_location,
			 "operating on partial vectors%s.\n",
			 for_epilogue_p ? " for epilogue loop" : "");
      else
	dump_printf_loc (MSG_NOTE, vect_location,
			 "operating only on full vectors%s.\n",
			 for_epilogue_p ? " for epilogue loop" : "");
    }

  if (for_epilogue_p)
    {
      loop_vec_info orig_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
      gcc_assert (orig_loop_vinfo);
      if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
	gcc_assert (known_lt (LOOP_VINFO_VECT_FACTOR (loop_vinfo),
			      LOOP_VINFO_VECT_FACTOR (orig_loop_vinfo)));
    }

  if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
      && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
    {
      /* Check that the loop processes at least one full vector.  */
      poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
      tree scalar_niters = LOOP_VINFO_NITERS (loop_vinfo);
      if (known_lt (wi::to_widest (scalar_niters), vf))
	return opt_result::failure_at (vect_location,
				       "loop does not have enough iterations"
				       " to support vectorization.\n");

      /* If we need to peel an extra epilogue iteration to handle data
	 accesses with gaps, check that there are enough scalar iterations
	 available.

	 The check above is redundant with this one when peeling for gaps,
	 but the distinction is useful for diagnostics.  */
      tree scalar_nitersm1 = LOOP_VINFO_NITERSM1 (loop_vinfo);
      if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
	  && known_lt (wi::to_widest (scalar_nitersm1), vf))
	return opt_result::failure_at (vect_location,
				       "loop does not have enough iterations"
				       " to support peeling for gaps.\n");
    }

  LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
    = (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
       && need_peeling_or_partial_vectors_p);

  return opt_result::success ();
}
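/* Note that vect_analyze_loop_2 first runs the analysis above with
   FOR_EPILOGUE_P false; if the loop is later reused as a vectorized
   epilogue, the decision is redone with FOR_EPILOGUE_P true once the
   epilogue's iteration count is known.  */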
/* Function vect_analyze_loop_2.

   Apply a set of analyses on LOOP specified by LOOP_VINFO, the different
   analyses will record information in some members of LOOP_VINFO.  FATAL
   indicates if some analysis meets fatal error.  If a non-NULL pointer
   SUGGESTED_UNROLL_FACTOR is provided, it is filled with the suggested
   unroll factor worked out during this analysis, while a NULL pointer
   means the suggested unroll factor is being applied.  SLP_DONE_FOR_SUGGESTED_UF
   is to hold the slp decision when the suggested unroll factor is worked
   out.  */
static opt_result
vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal,
		     unsigned *suggested_unroll_factor,
		     bool& slp_done_for_suggested_uf)
{
  opt_result ok = opt_result::success ();
  int res;
  unsigned int max_vf = MAX_VECTORIZATION_FACTOR;
  poly_uint64 min_vf = 2;
  loop_vec_info orig_loop_vinfo = NULL;

  /* If we are dealing with an epilogue then orig_loop_vinfo points to the
     loop_vec_info of the first vectorized loop.  */
  if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
    orig_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
  else
    orig_loop_vinfo = loop_vinfo;
  gcc_assert (orig_loop_vinfo);

  /* The first group of checks is independent of the vector size.  */
  fatal = true;

  if (LOOP_VINFO_SIMD_IF_COND (loop_vinfo)
      && integer_zerop (LOOP_VINFO_SIMD_IF_COND (loop_vinfo)))
    return opt_result::failure_at (vect_location,
				   "not vectorized: simd if(0)\n");

  /* Find all data references in the loop (which correspond to vdefs/vuses)
     and analyze their evolution in the loop.  */

  loop_p loop = LOOP_VINFO_LOOP (loop_vinfo);

  /* Gather the data references and count stmts in the loop.  */
  if (!LOOP_VINFO_DATAREFS (loop_vinfo).exists ())
    {
      opt_result res
	= vect_get_datarefs_in_loop (loop, LOOP_VINFO_BBS (loop_vinfo),
				     &LOOP_VINFO_DATAREFS (loop_vinfo),
				     &LOOP_VINFO_N_STMTS (loop_vinfo));
      if (!res)
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			     "not vectorized: loop contains function "
			     "calls or data references that cannot "
			     "be analyzed\n");
	  return res;
	}
      loop_vinfo->shared->save_datarefs ();
    }
  else
    loop_vinfo->shared->check_datarefs ();

  /* Analyze the data references and also adjust the minimal
     vectorization factor according to the loads and stores.  */

  ok = vect_analyze_data_refs (loop_vinfo, &min_vf, &fatal);
  if (!ok)
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			 "bad data references.\n");
      return ok;
    }

  /* Check if we are applying unroll factor now.  */
  bool applying_suggested_uf = loop_vinfo->suggested_unroll_factor > 1;
  gcc_assert (!applying_suggested_uf || !suggested_unroll_factor);

  /* If the slp decision is false when suggested unroll factor is worked
     out, and we are applying suggested unroll factor, we can simply skip
     all slp related analyses this time.  */
  bool slp = !applying_suggested_uf || slp_done_for_suggested_uf;

  /* Classify all cross-iteration scalar data-flow cycles.
     Cross-iteration cycles caused by virtual phis are analyzed separately.  */
  vect_analyze_scalar_cycles (loop_vinfo, slp);

  vect_pattern_recog (loop_vinfo);

  vect_fixup_scalar_cycles_with_patterns (loop_vinfo);

  /* Analyze the access patterns of the data-refs in the loop (consecutive,
     complex, etc.).  FORNOW: Only handle consecutive access pattern.  */

  ok = vect_analyze_data_ref_accesses (loop_vinfo, NULL);
  if (!ok)
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			 "bad data access.\n");
      return ok;
    }

  /* Data-flow analysis to detect stmts that do not need to be vectorized.  */

  ok = vect_mark_stmts_to_be_vectorized (loop_vinfo, &fatal);
  if (!ok)
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			 "unexpected pattern.\n");
      return ok;
    }
  /* While the rest of the analysis below depends on it in some way.  */
  fatal = false;

  /* Analyze data dependences between the data-refs in the loop
     and adjust the maximum vectorization factor according to
     the dependences.
     FORNOW: fail at the first data dependence that we encounter.  */

  ok = vect_analyze_data_ref_dependences (loop_vinfo, &max_vf);
  if (!ok)
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			 "bad data dependence.\n");
      return ok;
    }
  if (max_vf != MAX_VECTORIZATION_FACTOR
      && maybe_lt (max_vf, min_vf))
    return opt_result::failure_at (vect_location, "bad data dependence.\n");
  LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo) = max_vf;

  ok = vect_determine_vectorization_factor (loop_vinfo);
  if (!ok)
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			 "can't determine vectorization factor.\n");
      return ok;
    }
  if (max_vf != MAX_VECTORIZATION_FACTOR
      && maybe_lt (max_vf, LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
    return opt_result::failure_at (vect_location, "bad data dependence.\n");

  /* Compute the scalar iteration cost.  */
  vect_compute_single_scalar_iteration_cost (loop_vinfo);

  poly_uint64 saved_vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);

  if (slp)
    {
      /* Check the SLP opportunities in the loop, analyze and build
	 SLP trees.  */
      ok = vect_analyze_slp (loop_vinfo, LOOP_VINFO_N_STMTS (loop_vinfo));
      if (!ok)
	return ok;

      /* If there are any SLP instances mark them as pure_slp.  */
      slp = vect_make_slp_decision (loop_vinfo);
      if (slp)
	{
	  /* Find stmts that need to be both vectorized and SLPed.  */
	  vect_detect_hybrid_slp (loop_vinfo);

	  /* Update the vectorization factor based on the SLP decision.  */
	  vect_update_vf_for_slp (loop_vinfo);

	  /* Optimize the SLP graph with the vectorization factor fixed.  */
	  vect_optimize_slp (loop_vinfo);

	  /* Gather the loads reachable from the SLP graph entries.  */
	  vect_gather_slp_loads (loop_vinfo);
	}
    }

  bool saved_can_use_partial_vectors_p
    = LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo);

  /* We don't expect to have to roll back to anything other than an empty
     set of rgroups.  */
  gcc_assert (LOOP_VINFO_MASKS (loop_vinfo).is_empty ());

  /* This is the point where we can re-start analysis with SLP forced off.  */
start_over:

  /* Apply the suggested unrolling factor, this was determined by the backend
     during finish_cost the first time we ran the analysis for this
     vector mode.  */
  if (applying_suggested_uf)
    LOOP_VINFO_VECT_FACTOR (loop_vinfo) *= loop_vinfo->suggested_unroll_factor;

  /* Now the vectorization factor is final.  */
  poly_uint64 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
  gcc_assert (known_ne (vectorization_factor, 0U));

  if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && dump_enabled_p ())
    {
      dump_printf_loc (MSG_NOTE, vect_location,
		       "vectorization_factor = ");
      dump_dec (MSG_NOTE, vectorization_factor);
      dump_printf (MSG_NOTE, ", niters = %wd\n",
		   LOOP_VINFO_INT_NITERS (loop_vinfo));
    }

  loop_vinfo->vector_costs = init_cost (loop_vinfo, false);
  /* Analyze the alignment of the data-refs in the loop.
     Fail if a data reference is found that cannot be vectorized.  */

  ok = vect_analyze_data_refs_alignment (loop_vinfo);
  if (!ok)
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			 "bad data alignment.\n");
      return ok;
    }

  /* Prune the list of ddrs to be tested at run-time by versioning for alias.
     It is important to call pruning after vect_analyze_data_ref_accesses,
     since we use grouping information gathered by interleaving analysis.  */
  ok = vect_prune_runtime_alias_test_list (loop_vinfo);
  if (!ok)
    return ok;

  /* Do not invoke vect_enhance_data_refs_alignment for epilogue
     vectorization, since we do not want to add extra peeling or
     add versioning for alignment.  */
  if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
    /* This pass will decide on using loop versioning and/or loop peeling in
       order to enhance the alignment of data references in the loop.  */
    ok = vect_enhance_data_refs_alignment (loop_vinfo);
  if (!ok)
    return ok;
  if (slp)
    {
      /* Analyze operations in the SLP instances.  Note this may
	 remove unsupported SLP instances which makes the above
	 SLP kind detection invalid.  */
      unsigned old_size = LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length ();
      vect_slp_analyze_operations (loop_vinfo);
      if (LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length () != old_size)
	{
	  ok = opt_result::failure_at (vect_location,
				       "unsupported SLP instances\n");
	  goto again;
	}

      /* Check whether any load in ALL SLP instances is possibly permuted.  */
      slp_tree load_node, slp_root;
      unsigned i, x;
      slp_instance instance;
      bool can_use_lanes = true;
      FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), x, instance)
	{
	  slp_root = SLP_INSTANCE_TREE (instance);
	  int group_size = SLP_TREE_LANES (slp_root);
	  tree vectype = SLP_TREE_VECTYPE (slp_root);
	  bool loads_permuted = false;
	  FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load_node)
	    {
	      if (!SLP_TREE_LOAD_PERMUTATION (load_node).exists ())
		continue;
	      unsigned j;
	      stmt_vec_info load_info;
	      FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (load_node), j, load_info)
		if (SLP_TREE_LOAD_PERMUTATION (load_node)[j] != j)
		  {
		    loads_permuted = true;
		    break;
		  }
	    }

	  /* If the loads and stores can be handled with load/store-lane
	     instructions record it and move on to the next instance.  */
	  if (loads_permuted
	      && SLP_INSTANCE_KIND (instance) == slp_inst_kind_store
	      && vect_store_lanes_supported (vectype, group_size, false))
	    {
	      FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load_node)
		{
		  stmt_vec_info stmt_vinfo = DR_GROUP_FIRST_ELEMENT
		      (SLP_TREE_SCALAR_STMTS (load_node)[0]);
		  /* Use SLP for strided accesses (or if we can't
		     load-lanes).  */
		  if (STMT_VINFO_STRIDED_P (stmt_vinfo)
		      || ! vect_load_lanes_supported
			    (STMT_VINFO_VECTYPE (stmt_vinfo),
			     DR_GROUP_SIZE (stmt_vinfo), false))
		    break;
		}

	      can_use_lanes
		= can_use_lanes && i == SLP_INSTANCE_LOADS (instance).length ();

	      if (can_use_lanes && dump_enabled_p ())
		dump_printf_loc (MSG_NOTE, vect_location,
				 "SLP instance %p can use load/store-lanes\n",
				 (void *) instance);
	    }
	  else
	    {
	      can_use_lanes = false;
	      break;
	    }
	}

      /* If all SLP instances can use load/store-lanes abort SLP and try again
	 with SLP disabled.  */
      if (can_use_lanes)
	{
	  ok = opt_result::failure_at (vect_location,
				       "Built SLP cancelled: can use "
				       "load/store-lanes\n");
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			     "Built SLP cancelled: all SLP instances support "
			     "load/store-lanes\n");
	  goto again;
	}
    }
  /* Dissolve SLP-only groups.  */
  vect_dissolve_slp_only_groups (loop_vinfo);

  /* Scan all the remaining operations in the loop that are not subject
     to SLP and make sure they are vectorizable.  */
  ok = vect_analyze_loop_operations (loop_vinfo);
  if (!ok)
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			 "bad operation or unsupported loop bound.\n");
      goto again;
    }

  /* For now, we don't expect to mix both masking and length approaches for one
     loop, disable it if both are recorded.  */
  if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
      && !LOOP_VINFO_MASKS (loop_vinfo).is_empty ()
      && !LOOP_VINFO_LENS (loop_vinfo).is_empty ())
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			 "can't vectorize a loop with partial vectors"
			 " because we don't expect to mix different"
			 " approaches with partial vectors for the"
			 " same loop.\n");
      LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
    }

  /* If we still have the option of using partial vectors,
     check whether we can generate the necessary loop controls.  */
  if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
      && !vect_verify_full_masking (loop_vinfo)
      && !vect_verify_loop_lens (loop_vinfo))
    LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;

  /* If we're vectorizing an epilogue loop, the vectorized loop either needs
     to be able to handle fewer than VF scalars, or needs to have a lower VF
     than the main loop.  */
  if (LOOP_VINFO_EPILOGUE_P (loop_vinfo)
      && !LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
      && maybe_ge (LOOP_VINFO_VECT_FACTOR (loop_vinfo),
		   LOOP_VINFO_VECT_FACTOR (orig_loop_vinfo)))
    return opt_result::failure_at (vect_location,
				   "Vectorization factor too high for"
				   " epilogue loop.\n");

  /* Decide whether this loop_vinfo should use partial vectors or peeling,
     assuming that the loop will be used as a main loop.  We will redo
     this analysis later if we instead decide to use the loop as an
     epilogue loop.  */
  ok = vect_determine_partial_vectors_and_peeling (loop_vinfo, false);
  if (!ok)
    return ok;

  /* Check the costings of the loop make vectorizing worthwhile.  */
  res = vect_analyze_loop_costing (loop_vinfo, suggested_unroll_factor);
  if (res < 0)
    {
      ok = opt_result::failure_at (vect_location,
				   "Loop costings may not be worthwhile.\n");
      goto again;
    }
  if (!res)
    return opt_result::failure_at (vect_location,
				   "Loop costings not worthwhile.\n");

  /* If an epilogue loop is required make sure we can create one.  */
  if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
      || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo))
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location, "epilog loop required\n");
      if (!vect_can_advance_ivs_p (loop_vinfo)
	  || !slpeel_can_duplicate_loop_p (LOOP_VINFO_LOOP (loop_vinfo),
					   single_exit (LOOP_VINFO_LOOP
							 (loop_vinfo))))
	{
	  ok = opt_result::failure_at (vect_location,
				       "not vectorized: can't create required "
				       "epilog loop\n");
	  goto again;
	}
    }

  /* During peeling, we need to check if number of loop iterations is
     enough for both peeled prolog loop and vector loop.  This check
     can be merged along with threshold check of loop versioning, so
     increase threshold for this case if necessary.

     If we are analyzing an epilogue we still want to check what its
     versioning threshold would be.  If we decide to vectorize the epilogues we
     will want to use the lowest versioning threshold of all epilogues and main
     loop.  This will enable us to enter a vectorized epilogue even when
     versioning the loop.  We can't simply check whether the epilogue requires
     versioning though since we may have skipped some versioning checks when
     analyzing the epilogue.  For instance, checks for alias versioning will be
     skipped when dealing with epilogues as we assume we already checked them
     for the main loop.  So instead we always check the 'orig_loop_vinfo'.  */
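  /* In other words, the threshold computed below is folded into the loop
     versioning condition, so a single runtime guard covers both the
     correctness checks and the profitability check.  */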
  if (LOOP_REQUIRES_VERSIONING (orig_loop_vinfo))
    {
      poly_uint64 niters_th = 0;
      unsigned int th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);

      if (!vect_use_loop_mask_for_alignment_p (loop_vinfo))
	{
	  /* Niters for peeled prolog loop.  */
	  if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
	    {
	      dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
	      tree vectype = STMT_VINFO_VECTYPE (dr_info->stmt);
	      niters_th += TYPE_VECTOR_SUBPARTS (vectype) - 1;
	    }
	  else
	    niters_th += LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
	}

      /* Niters for at least one iteration of vectorized loop.  */
      if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
	niters_th += LOOP_VINFO_VECT_FACTOR (loop_vinfo);
      /* One additional iteration because of peeling for gap.  */
      if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
	niters_th += 1;

      /* Use the same condition as vect_transform_loop to decide when to use
	 the cost to determine a versioning threshold.  */
      if (vect_apply_runtime_profitability_check_p (loop_vinfo)
	  && ordered_p (th, niters_th))
	niters_th = ordered_max (poly_uint64 (th), niters_th);

      LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = niters_th;
    }

  gcc_assert (known_eq (vectorization_factor,
			LOOP_VINFO_VECT_FACTOR (loop_vinfo)));

  slp_done_for_suggested_uf = slp;

  /* Ok to vectorize!  */
  LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1;
  return opt_result::success ();
2775 /* Ensure that "ok" is false (with an opt_problem if dumping is enabled). */
2778 /* Try again with SLP forced off but if we didn't do any SLP there is
2779 no point in re-trying. */
2783 /* If the slp decision is true when suggested unroll factor is worked
2784 out, and we are applying suggested unroll factor, we don't need to
2786 if (applying_suggested_uf
&& slp_done_for_suggested_uf
)
2789 /* If there are reduction chains re-trying will fail anyway. */
2790 if (! LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo
).is_empty ())
2793 /* Likewise if the grouped loads or stores in the SLP cannot be handled
2794 via interleaving or lane instructions. */
2795 slp_instance instance
;
2798 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo
), i
, instance
)
2800 stmt_vec_info vinfo
;
2801 vinfo
= SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (instance
))[0];
2802 if (! STMT_VINFO_GROUPED_ACCESS (vinfo
))
2804 vinfo
= DR_GROUP_FIRST_ELEMENT (vinfo
);
2805 unsigned int size
= DR_GROUP_SIZE (vinfo
);
2806 tree vectype
= STMT_VINFO_VECTYPE (vinfo
);
2807 if (! vect_store_lanes_supported (vectype
, size
, false)
2808 && ! known_eq (TYPE_VECTOR_SUBPARTS (vectype
), 1U)
2809 && ! vect_grouped_store_supported (vectype
, size
))
2810 return opt_result::failure_at (vinfo
->stmt
,
2811 "unsupported grouped store\n");
2812 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance
), j
, node
)
2814 vinfo
= SLP_TREE_SCALAR_STMTS (node
)[0];
2815 vinfo
= DR_GROUP_FIRST_ELEMENT (vinfo
);
2816 bool single_element_p
= !DR_GROUP_NEXT_ELEMENT (vinfo
);
2817 size
= DR_GROUP_SIZE (vinfo
);
2818 vectype
= STMT_VINFO_VECTYPE (vinfo
);
2819 if (! vect_load_lanes_supported (vectype
, size
, false)
2820 && ! vect_grouped_load_supported (vectype
, single_element_p
,
2822 return opt_result::failure_at (vinfo
->stmt
,
2823 "unsupported grouped load\n");
2827 if (dump_enabled_p ())
2828 dump_printf_loc (MSG_NOTE
, vect_location
,
2829 "re-trying with SLP disabled\n");
2831 /* Roll back state appropriately. No SLP this time. */
2833 /* Restore vectorization factor as it were without SLP. */
2834 LOOP_VINFO_VECT_FACTOR (loop_vinfo
) = saved_vectorization_factor
;
2835 /* Free the SLP instances. */
2836 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo
), j
, instance
)
2837 vect_free_slp_instance (instance
);
2838 LOOP_VINFO_SLP_INSTANCES (loop_vinfo
).release ();
2839 /* Reset SLP type to loop_vect on all stmts. */
2840 for (i
= 0; i
< LOOP_VINFO_LOOP (loop_vinfo
)->num_nodes
; ++i
)
2842 basic_block bb
= LOOP_VINFO_BBS (loop_vinfo
)[i
];
2843 for (gimple_stmt_iterator si
= gsi_start_phis (bb
);
2844 !gsi_end_p (si
); gsi_next (&si
))
2846 stmt_vec_info stmt_info
= loop_vinfo
->lookup_stmt (gsi_stmt (si
));
2847 STMT_SLP_TYPE (stmt_info
) = loop_vect
;
2848 if (STMT_VINFO_DEF_TYPE (stmt_info
) == vect_reduction_def
2849 || STMT_VINFO_DEF_TYPE (stmt_info
) == vect_double_reduction_def
)
2851 /* vectorizable_reduction adjusts reduction stmt def-types,
2852 restore them to that of the PHI. */
2853 STMT_VINFO_DEF_TYPE (STMT_VINFO_REDUC_DEF (stmt_info
))
2854 = STMT_VINFO_DEF_TYPE (stmt_info
);
2855 STMT_VINFO_DEF_TYPE (vect_stmt_to_vectorize
2856 (STMT_VINFO_REDUC_DEF (stmt_info
)))
2857 = STMT_VINFO_DEF_TYPE (stmt_info
);
2860 for (gimple_stmt_iterator si
= gsi_start_bb (bb
);
2861 !gsi_end_p (si
); gsi_next (&si
))
2863 if (is_gimple_debug (gsi_stmt (si
)))
2865 stmt_vec_info stmt_info
= loop_vinfo
->lookup_stmt (gsi_stmt (si
));
2866 STMT_SLP_TYPE (stmt_info
) = loop_vect
;
2867 if (STMT_VINFO_IN_PATTERN_P (stmt_info
))
2869 stmt_vec_info pattern_stmt_info
2870 = STMT_VINFO_RELATED_STMT (stmt_info
);
2871 if (STMT_VINFO_SLP_VECT_ONLY_PATTERN (pattern_stmt_info
))
2872 STMT_VINFO_IN_PATTERN_P (stmt_info
) = false;
2874 gimple
*pattern_def_seq
= STMT_VINFO_PATTERN_DEF_SEQ (stmt_info
);
2875 STMT_SLP_TYPE (pattern_stmt_info
) = loop_vect
;
2876 for (gimple_stmt_iterator pi
= gsi_start (pattern_def_seq
);
2877 !gsi_end_p (pi
); gsi_next (&pi
))
2878 STMT_SLP_TYPE (loop_vinfo
->lookup_stmt (gsi_stmt (pi
)))
2883 /* Free optimized alias test DDRS. */
2884 LOOP_VINFO_LOWER_BOUNDS (loop_vinfo
).truncate (0);
2885 LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo
).release ();
2886 LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo
).release ();
2887 /* Reset target cost data. */
2888 delete loop_vinfo
->vector_costs
;
2889 loop_vinfo
->vector_costs
= nullptr;
2890 /* Reset accumulated rgroup information. */
2891 release_vec_loop_controls (&LOOP_VINFO_MASKS (loop_vinfo
));
2892 release_vec_loop_controls (&LOOP_VINFO_LENS (loop_vinfo
));
2893 /* Reset assorted flags. */
2894 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo
) = false;
2895 LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo
) = false;
2896 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo
) = 0;
2897 LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo
) = 0;
2898 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo
)
2899 = saved_can_use_partial_vectors_p
;
/* Return true if vectorizing a loop using NEW_LOOP_VINFO appears
   to be better than vectorizing it using OLD_LOOP_VINFO.  Assume that
   OLD_LOOP_VINFO is better unless something specifically indicates
   otherwise.

   Note that this deliberately isn't a partial order.  */

static bool
vect_better_loop_vinfo_p (loop_vec_info new_loop_vinfo,
			  loop_vec_info old_loop_vinfo)
{
  struct loop *loop = LOOP_VINFO_LOOP (new_loop_vinfo);
  gcc_assert (LOOP_VINFO_LOOP (old_loop_vinfo) == loop);

  poly_int64 new_vf = LOOP_VINFO_VECT_FACTOR (new_loop_vinfo);
  poly_int64 old_vf = LOOP_VINFO_VECT_FACTOR (old_loop_vinfo);

  /* Always prefer a VF of loop->simdlen over any other VF.  */
  if (loop->simdlen)
    {
      bool new_simdlen_p = known_eq (new_vf, loop->simdlen);
      bool old_simdlen_p = known_eq (old_vf, loop->simdlen);
      if (new_simdlen_p != old_simdlen_p)
	return new_simdlen_p;
    }

  const auto *old_costs = old_loop_vinfo->vector_costs;
  const auto *new_costs = new_loop_vinfo->vector_costs;
  if (loop_vec_info main_loop = LOOP_VINFO_ORIG_LOOP_INFO (old_loop_vinfo))
    return new_costs->better_epilogue_loop_than_p (old_costs, main_loop);

  return new_costs->better_main_loop_than_p (old_costs);
}
/* Decide whether to replace OLD_LOOP_VINFO with NEW_LOOP_VINFO.  Return
   true if we should.  */

static bool
vect_joust_loop_vinfos (loop_vec_info new_loop_vinfo,
			loop_vec_info old_loop_vinfo)
{
  if (!vect_better_loop_vinfo_p (new_loop_vinfo, old_loop_vinfo))
    return false;

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
		     "***** Preferring vector mode %s to vector mode %s\n",
		     GET_MODE_NAME (new_loop_vinfo->vector_mode),
		     GET_MODE_NAME (old_loop_vinfo->vector_mode));
  return true;
}
/* Analyze LOOP with VECTOR_MODES[MODE_I] and as epilogue if MAIN_LOOP_VINFO is
   not NULL.  Set AUTODETECTED_VECTOR_MODE if VOIDmode and advance
   MODE_I to the next mode useful to analyze.
   Return the loop_vinfo on success and wrapped null on failure.  */

static opt_loop_vec_info
vect_analyze_loop_1 (class loop *loop, vec_info_shared *shared,
		     const vect_loop_form_info *loop_form_info,
		     loop_vec_info main_loop_vinfo,
		     const vector_modes &vector_modes, unsigned &mode_i,
		     machine_mode &autodetected_vector_mode,
		     bool &fatal)
{
  loop_vec_info loop_vinfo
    = vect_create_loop_vinfo (loop, shared, loop_form_info, main_loop_vinfo);

  machine_mode vector_mode = vector_modes[mode_i];
  loop_vinfo->vector_mode = vector_mode;
  unsigned int suggested_unroll_factor = 1;
  bool slp_done_for_suggested_uf;

  /* Run the main analysis.  */
  opt_result res = vect_analyze_loop_2 (loop_vinfo, fatal,
					&suggested_unroll_factor,
					slp_done_for_suggested_uf);
  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
		     "***** Analysis %s with vector mode %s\n",
		     res ? "succeeded" : " failed",
		     GET_MODE_NAME (loop_vinfo->vector_mode));

  if (!main_loop_vinfo && suggested_unroll_factor > 1)
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location,
			 "***** Re-trying analysis for unrolling"
			 " with unroll factor %d and slp %s.\n",
			 suggested_unroll_factor,
			 slp_done_for_suggested_uf ? "on" : "off");
      loop_vec_info unroll_vinfo
	= vect_create_loop_vinfo (loop, shared, loop_form_info, main_loop_vinfo);
      unroll_vinfo->vector_mode = vector_mode;
      unroll_vinfo->suggested_unroll_factor = suggested_unroll_factor;
      opt_result new_res = vect_analyze_loop_2 (unroll_vinfo, fatal, NULL,
						slp_done_for_suggested_uf);
      if (new_res)
	{
	  delete loop_vinfo;
	  loop_vinfo = unroll_vinfo;
	}
      else
	delete unroll_vinfo;
    }

  /* Remember the autodetected vector mode.  */
  if (vector_mode == VOIDmode)
    autodetected_vector_mode = loop_vinfo->vector_mode;

  /* Advance mode_i, first skipping modes that would result in the
     same analysis result.  */
  while (mode_i + 1 < vector_modes.length ()
	 && vect_chooses_same_modes_p (loop_vinfo,
				       vector_modes[mode_i + 1]))
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location,
			 "***** The result for vector mode %s would"
			 " be the same\n",
			 GET_MODE_NAME (vector_modes[mode_i + 1]));
      mode_i += 1;
    }
  if (mode_i + 1 < vector_modes.length ()
      && VECTOR_MODE_P (autodetected_vector_mode)
      && (related_vector_mode (vector_modes[mode_i + 1],
			       GET_MODE_INNER (autodetected_vector_mode))
	  == autodetected_vector_mode)
      && (related_vector_mode (autodetected_vector_mode,
			       GET_MODE_INNER (vector_modes[mode_i + 1]))
	  == vector_modes[mode_i + 1]))
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location,
			 "***** Skipping vector mode %s, which would"
			 " repeat the analysis for %s\n",
			 GET_MODE_NAME (vector_modes[mode_i + 1]),
			 GET_MODE_NAME (autodetected_vector_mode));
      mode_i += 1;
    }
  mode_i++;

  if (!res)
    {
      delete loop_vinfo;
      if (fatal)
	gcc_checking_assert (main_loop_vinfo == NULL);
      return opt_loop_vec_info::propagate_failure (res);
    }

  return opt_loop_vec_info::success (loop_vinfo);
}
/* Function vect_analyze_loop.

   Apply a set of analyses on LOOP, and create a loop_vec_info struct
   for it.  The different analyses will record information in the
   loop_vec_info struct.  */
opt_loop_vec_info
vect_analyze_loop (class loop *loop, vec_info_shared *shared)
{
  DUMP_VECT_SCOPE ("analyze_loop_nest");

  if (loop_outer (loop)
      && loop_vec_info_for_loop (loop_outer (loop))
      && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop))))
    return opt_loop_vec_info::failure_at (vect_location,
					  "outer-loop already vectorized.\n");

  if (!find_loop_nest (loop, &shared->loop_nest))
    return opt_loop_vec_info::failure_at
	(vect_location,
	 "not vectorized: loop nest containing two or more consecutive inner"
	 " loops cannot be vectorized\n");

  /* Analyze the loop form.  */
  vect_loop_form_info loop_form_info;
  opt_result res = vect_analyze_loop_form (loop, &loop_form_info);
  if (!res)
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			 "bad loop form.\n");
      return opt_loop_vec_info::propagate_failure (res);
    }
  if (!integer_onep (loop_form_info.assumptions))
    {
      /* We consider to vectorize this loop by versioning it under
	 some assumptions.  In order to do this, we need to clear
	 existing information computed by scev and niter analyzer.  */
      free_numbers_of_iterations_estimates (loop);
      /* Also set flag for this loop so that following scev and niter
	 analysis are done under the assumptions.  */
      loop_constraint_set (loop, LOOP_C_FINITE);
    }

  auto_vector_modes vector_modes;
  /* Autodetect first vector size we try.  */
  vector_modes.safe_push (VOIDmode);
  unsigned int autovec_flags
    = targetm.vectorize.autovectorize_vector_modes (&vector_modes,
						    loop->simdlen != 0);
  bool pick_lowest_cost_p = ((autovec_flags & VECT_COMPARE_COSTS)
			     && !unlimited_cost_model (loop));
  machine_mode autodetected_vector_mode = VOIDmode;
  opt_loop_vec_info first_loop_vinfo = opt_loop_vec_info::success (NULL);
  unsigned int mode_i = 0;
  unsigned HOST_WIDE_INT simdlen = loop->simdlen;

  /* Keep track of the VF for each mode.  Initialize all to 0 which indicates
     a mode has not been analyzed.  */
  auto_vec<poly_uint64, 8> cached_vf_per_mode;
  for (unsigned i = 0; i < vector_modes.length (); ++i)
    cached_vf_per_mode.safe_push (0);

  /* First determine the main loop vectorization mode, either the first
     one that works, starting with auto-detecting the vector mode and then
     following the targets order of preference, or the one with the
     lowest cost if pick_lowest_cost_p.  */
  while (1)
    {
      bool fatal;
      unsigned int last_mode_i = mode_i;
      /* Set cached VF to -1 prior to analysis, which indicates a mode has
	 failed.  */
      cached_vf_per_mode[last_mode_i] = -1;
      opt_loop_vec_info loop_vinfo
	= vect_analyze_loop_1 (loop, shared, &loop_form_info,
			       NULL, vector_modes, mode_i,
			       autodetected_vector_mode, fatal);
      if (fatal)
	break;

      if (loop_vinfo)
	{
	  /* Analysis has been successful so update the VF value.  The
	     VF should always be a multiple of unroll_factor and we want to
	     capture the original VF here.  */
	  cached_vf_per_mode[last_mode_i]
	    = exact_div (LOOP_VINFO_VECT_FACTOR (loop_vinfo),
			 loop_vinfo->suggested_unroll_factor);
	  /* Once we hit the desired simdlen for the first time,
	     discard any previous attempts.  */
	  if (simdlen
	      && known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), simdlen))
	    {
	      delete first_loop_vinfo;
	      first_loop_vinfo = opt_loop_vec_info::success (NULL);
	      simdlen = 0;
	    }
	  else if (pick_lowest_cost_p
		   && first_loop_vinfo
		   && vect_joust_loop_vinfos (loop_vinfo, first_loop_vinfo))
	    {
	      /* Pick loop_vinfo over first_loop_vinfo.  */
	      delete first_loop_vinfo;
	      first_loop_vinfo = opt_loop_vec_info::success (NULL);
	    }
	  if (first_loop_vinfo == NULL)
	    first_loop_vinfo = loop_vinfo;
	  else
	    {
	      delete loop_vinfo;
	      loop_vinfo = opt_loop_vec_info::success (NULL);
	    }

	  /* Commit to first_loop_vinfo if we have no reason to try
	     alternatives.  */
	  if (!simdlen && !pick_lowest_cost_p)
	    break;
	}
      if (mode_i == vector_modes.length ()
	  || autodetected_vector_mode == VOIDmode)
	break;

      /* Try the next biggest vector size.  */
      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location,
			 "***** Re-trying analysis with vector mode %s\n",
			 GET_MODE_NAME (vector_modes[mode_i]));
    }

  if (!first_loop_vinfo)
    return opt_loop_vec_info::propagate_failure (res);

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
		     "***** Choosing vector mode %s\n",
		     GET_MODE_NAME (first_loop_vinfo->vector_mode));

  /* Only vectorize epilogues if PARAM_VECT_EPILOGUES_NOMASK is
     enabled, SIMDUID is not set, it is the innermost loop and we have
     either already found the loop's SIMDLEN or there was no SIMDLEN to
     begin with.
     TODO: Enable epilogue vectorization for loops with SIMDUID set.  */
  bool vect_epilogues = (!simdlen
			 && loop->inner == NULL
			 && param_vect_epilogues_nomask
			 && LOOP_VINFO_PEELING_FOR_NITER (first_loop_vinfo)
			 && !loop->simduid);
  if (!vect_epilogues)
    return first_loop_vinfo;

  /* Now analyze first_loop_vinfo for epilogue vectorization.  */
  poly_uint64 lowest_th = LOOP_VINFO_VERSIONING_THRESHOLD (first_loop_vinfo);

  /* For epilogues start the analysis from the first mode.  The motivation
     behind starting from the beginning comes from cases where the VECTOR_MODES
     array may contain length-agnostic and length-specific modes.  Their
     ordering is not guaranteed, so we could end up picking a mode for the main
     loop that is after the epilogue's optimal mode.  */
  vector_modes[0] = autodetected_vector_mode;
  mode_i = 0;

  bool supports_partial_vectors =
    partial_vectors_supported_p () && param_vect_partial_vector_usage != 0;
  poly_uint64 first_vinfo_vf = LOOP_VINFO_VECT_FACTOR (first_loop_vinfo);

  while (1)
    {
      /* If the target does not support partial vectors we can shorten the
	 number of modes to analyze for the epilogue as we know we can't pick a
	 mode that would lead to a VF at least as big as the
	 FIRST_VINFO_VF.  */
      if (!supports_partial_vectors
	  && maybe_ge (cached_vf_per_mode[mode_i], first_vinfo_vf))
	{
	  mode_i++;
	  if (mode_i == vector_modes.length ())
	    break;
	  continue;
	}

      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location,
			 "***** Re-trying epilogue analysis with vector "
			 "mode %s\n", GET_MODE_NAME (vector_modes[mode_i]));

      bool fatal;
      opt_loop_vec_info loop_vinfo
	= vect_analyze_loop_1 (loop, shared, &loop_form_info,
			       first_loop_vinfo,
			       vector_modes, mode_i,
			       autodetected_vector_mode, fatal);
      if (fatal)
	break;

      if (loop_vinfo)
	{
	  if (pick_lowest_cost_p)
	    {
	      /* Keep trying to roll back vectorization attempts while the
		 loop_vec_infos they produced were worse than this one.  */
	      vec<loop_vec_info> &vinfos = first_loop_vinfo->epilogue_vinfos;
	      while (!vinfos.is_empty ()
		     && vect_joust_loop_vinfos (loop_vinfo, vinfos.last ()))
		{
		  gcc_assert (vect_epilogues);
		  delete vinfos.pop ();
		}
	    }
	  /* For now only allow one epilogue loop.  */
	  if (first_loop_vinfo->epilogue_vinfos.is_empty ())
	    {
	      first_loop_vinfo->epilogue_vinfos.safe_push (loop_vinfo);
	      poly_uint64 th = LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo);
	      gcc_assert (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
			  || maybe_ne (lowest_th, 0U));
	      /* Keep track of the known smallest versioning
		 threshold.  */
	      if (ordered_p (lowest_th, th))
		lowest_th = ordered_min (lowest_th, th);
	    }
	  else
	    {
	      delete loop_vinfo;
	      loop_vinfo = opt_loop_vec_info::success (NULL);
	    }

	  /* For now only allow one epilogue loop, but allow
	     pick_lowest_cost_p to replace it, so commit to the
	     first epilogue if we have no reason to try alternatives.  */
	  if (!pick_lowest_cost_p)
	    break;
	}

      if (mode_i == vector_modes.length ())
	break;
    }

  if (!first_loop_vinfo->epilogue_vinfos.is_empty ())
    {
      LOOP_VINFO_VERSIONING_THRESHOLD (first_loop_vinfo) = lowest_th;
      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location,
			 "***** Choosing epilogue vector mode %s\n",
			 GET_MODE_NAME
			   (first_loop_vinfo->epilogue_vinfos[0]->vector_mode));
    }

  return first_loop_vinfo;
}
/* Return true if there is an in-order reduction function for CODE, storing
   it in *REDUC_FN if so.  */

static bool
fold_left_reduction_fn (code_helper code, internal_fn *reduc_fn)
{
  if (code == PLUS_EXPR)
    {
      *reduc_fn = IFN_FOLD_LEFT_PLUS;
      return true;
    }
  return false;
}
/* Function reduction_fn_for_scalar_code

   Input:
   CODE - tree_code of a reduction operation.

   Output:
   REDUC_FN - the corresponding internal function to be used to reduce the
      vector of partial results into a single scalar result, or IFN_LAST
      if the operation is a supported reduction operation, but does not have
      such an internal function.

   Return FALSE if CODE currently cannot be vectorized as reduction.  */

bool
reduction_fn_for_scalar_code (code_helper code, internal_fn *reduc_fn)
{
  if (code.is_tree_code ())
    switch (tree_code (code))
      {
      case MAX_EXPR:
	*reduc_fn = IFN_REDUC_MAX;
	return true;

      case MIN_EXPR:
	*reduc_fn = IFN_REDUC_MIN;
	return true;

      case PLUS_EXPR:
	*reduc_fn = IFN_REDUC_PLUS;
	return true;

      case BIT_AND_EXPR:
	*reduc_fn = IFN_REDUC_AND;
	return true;

      case BIT_IOR_EXPR:
	*reduc_fn = IFN_REDUC_IOR;
	return true;

      case BIT_XOR_EXPR:
	*reduc_fn = IFN_REDUC_XOR;
	return true;

      case MULT_EXPR:
      case MINUS_EXPR:
	*reduc_fn = IFN_LAST;
	return true;

      default:
	return false;
      }
  else
    switch (combined_fn (code))
      {
      CASE_CFN_FMAX:
	*reduc_fn = IFN_REDUC_FMAX;
	return true;

      CASE_CFN_FMIN:
	*reduc_fn = IFN_REDUC_FMIN;
	return true;

      default:
	return false;
      }
}
/* If there is a neutral value X such that a reduction would not be affected
   by the introduction of additional X elements, return that X, otherwise
   return null.  CODE is the code of the reduction and SCALAR_TYPE is type
   of the scalar elements.  If the reduction has just a single initial value
   then INITIAL_VALUE is that value, otherwise it is null.  */

tree
neutral_op_for_reduction (tree scalar_type, code_helper code,
			  tree initial_value)
{
  if (code.is_tree_code ())
    switch (tree_code (code))
      {
      case WIDEN_SUM_EXPR:
      case DOT_PROD_EXPR:
      case SAD_EXPR:
      case PLUS_EXPR:
      case MINUS_EXPR:
      case BIT_IOR_EXPR:
      case BIT_XOR_EXPR:
	return build_zero_cst (scalar_type);

      case MULT_EXPR:
	return build_one_cst (scalar_type);

      case BIT_AND_EXPR:
	return build_all_ones_cst (scalar_type);

      case MAX_EXPR:
      case MIN_EXPR:
	return initial_value;

      default:
	return NULL_TREE;
      }
  else
    switch (combined_fn (code))
      {
      CASE_CFN_FMIN:
      CASE_CFN_FMAX:
	return initial_value;

      default:
	return NULL_TREE;
      }
}
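/* The neutral values above follow from the operations themselves: appending
   zeros does not change a sum, appending ones does not change a product,
   appending all-ones words does not change a bitwise AND, and appending
   copies of the initial value does not change a minimum or maximum.  */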
/* Error reporting helper for vect_is_simple_reduction below.  GIMPLE statement
   STMT is printed with a message MSG.  */

static void
report_vect_op (dump_flags_t msg_type, gimple *stmt, const char *msg)
{
  dump_printf_loc (msg_type, vect_location, "%s%G", msg, stmt);
}
/* Return true if we need an in-order reduction for operation CODE
   on type TYPE.  NEED_WRAPPING_INTEGRAL_OVERFLOW is true if integer
   overflow must wrap.  */

bool
needs_fold_left_reduction_p (tree type, code_helper code)
{
  /* CHECKME: check for !flag_finite_math_only too?  */
  if (SCALAR_FLOAT_TYPE_P (type))
    {
      if (code.is_tree_code ())
	switch (tree_code (code))
	  {
	  case MIN_EXPR:
	  case MAX_EXPR:
	    return false;

	  default:
	    return !flag_associative_math;
	  }
      else
	switch (combined_fn (code))
	  {
	  CASE_CFN_FMIN:
	  CASE_CFN_FMAX:
	    return false;

	  default:
	    return !flag_associative_math;
	  }
    }

  if (INTEGRAL_TYPE_P (type))
    return (!code.is_tree_code ()
	    || !operation_no_trapping_overflow (type, tree_code (code)));

  if (SAT_FIXED_POINT_TYPE_P (type))
    return true;

  return false;
}
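/* For example, a plain 'double' accumulation must be vectorized as an
   in-order (fold-left) reduction unless -fassociative-math is in effect,
   because reordering the additions can change the rounded result.  */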
/* Return true if the reduction PHI in LOOP with latch arg LOOP_ARG and
   has a handled computation expression.  Store the main reduction
   operation in *CODE.  */

static bool
check_reduction_path (dump_user_location_t loc, loop_p loop, gphi *phi,
		      tree loop_arg, code_helper *code,
		      vec<std::pair<ssa_op_iter, use_operand_p> > &path)
{
  auto_bitmap visited;
  tree lookfor = PHI_RESULT (phi);
  ssa_op_iter curri;
  use_operand_p curr = op_iter_init_phiuse (&curri, phi, SSA_OP_USE);
  while (USE_FROM_PTR (curr) != loop_arg)
    curr = op_iter_next_use (&curri);
  curri.i = curri.numops;
  do
    {
      path.safe_push (std::make_pair (curri, curr));
      tree use = USE_FROM_PTR (curr);
      if (use == lookfor)
	break;
      gimple *def = SSA_NAME_DEF_STMT (use);
      if (gimple_nop_p (def)
	  || ! flow_bb_inside_loop_p (loop, gimple_bb (def)))
	{
pop:
	  do
	    {
	      std::pair<ssa_op_iter, use_operand_p> x = path.pop ();
	      curri = x.first;
	      curr = x.second;
	      do
		curr = op_iter_next_use (&curri);
	      /* Skip already visited or non-SSA operands (from iterating
		 over PHI args).  */
	      while (curr != NULL_USE_OPERAND_P
		     && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
			 || ! bitmap_set_bit (visited,
					      SSA_NAME_VERSION
						(USE_FROM_PTR (curr)))));
	    }
	  while (curr == NULL_USE_OPERAND_P && ! path.is_empty ());
	  if (curr == NULL_USE_OPERAND_P)
	    break;
	}
      else
	{
	  if (gimple_code (def) == GIMPLE_PHI)
	    curr = op_iter_init_phiuse (&curri, as_a <gphi *>(def), SSA_OP_USE);
	  else
	    curr = op_iter_init_use (&curri, def, SSA_OP_USE);
	  while (curr != NULL_USE_OPERAND_P
		 && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
		     || ! bitmap_set_bit (visited,
					  SSA_NAME_VERSION
					    (USE_FROM_PTR (curr)))))
	    curr = op_iter_next_use (&curri);
	  if (curr == NULL_USE_OPERAND_P)
	    goto pop;
	}
    }
  while (1);
  if (dump_file && (dump_flags & TDF_DETAILS))
    {
      dump_printf_loc (MSG_NOTE, loc, "reduction path: ");
      unsigned i;
      std::pair<ssa_op_iter, use_operand_p> *x;
      FOR_EACH_VEC_ELT (path, i, x)
	dump_printf (MSG_NOTE, "%T ", USE_FROM_PTR (x->second));
      dump_printf (MSG_NOTE, "\n");
    }

  /* Check whether the reduction path detected is valid.  */
  bool fail = path.length () == 0;
  bool neg = false;
  int sign = -1;
  *code = ERROR_MARK;
  for (unsigned i = 1; i < path.length (); ++i)
    {
      gimple *use_stmt = USE_STMT (path[i].second);
      gimple_match_op op;
      if (!gimple_extract_op (use_stmt, &op))
	{
	  fail = true;
	  break;
	}
      unsigned int opi = op.num_ops;
      if (gassign *assign = dyn_cast <gassign *> (use_stmt))
	{
	  /* The following make sure we can compute the operand index
	     easily plus it mostly disallows chaining via COND_EXPR condition
	     operands.  */
	  for (opi = 0; opi < op.num_ops; ++opi)
	    if (gimple_assign_rhs1_ptr (assign) + opi == path[i].second->use)
	      break;
	}
      else if (gcall *call = dyn_cast <gcall *> (use_stmt))
	{
	  for (opi = 0; opi < op.num_ops; ++opi)
	    if (gimple_call_arg_ptr (call, opi) == path[i].second->use)
	      break;
	}
      if (opi == op.num_ops)
	{
	  fail = true;
	  break;
	}
      op.code = canonicalize_code (op.code, op.type);
      if (op.code == MINUS_EXPR)
	{
	  op.code = PLUS_EXPR;
	  /* Track whether we negate the reduction value each iteration.  */
	  if (op.ops[1] == op.ops[opi])
	    neg = ! neg;
	}
      if (CONVERT_EXPR_CODE_P (op.code)
	  && tree_nop_conversion_p (op.type, TREE_TYPE (op.ops[0])))
	;
      else if (*code == ERROR_MARK)
	{
	  *code = op.code;
	  sign = TYPE_SIGN (op.type);
	}
      else if (op.code != *code)
	{
	  fail = true;
	  break;
	}
      else if ((op.code == MIN_EXPR
		|| op.code == MAX_EXPR)
	       && sign != TYPE_SIGN (op.type))
	{
	  fail = true;
	  break;
	}
      /* Check there's only a single stmt the op is used on.  For the
	 not value-changing tail and the last stmt allow out-of-loop uses.
	 ???  We could relax this and handle arbitrary live stmts by
	 forcing a scalar epilogue for example.  */
      imm_use_iterator imm_iter;
      gimple *op_use_stmt;
      unsigned cnt = 0;
      FOR_EACH_IMM_USE_STMT (op_use_stmt, imm_iter, op.ops[opi])
	if (!is_gimple_debug (op_use_stmt)
	    && (*code != ERROR_MARK
		|| flow_bb_inside_loop_p (loop, gimple_bb (op_use_stmt))))
	  {
	    /* We want to allow x + x but not x < 1 ? x : 2.  */
	    if (is_gimple_assign (op_use_stmt)
		&& gimple_assign_rhs_code (op_use_stmt) == COND_EXPR)
	      {
		use_operand_p use_p;
		FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
		  cnt++;
	      }
	    else
	      cnt++;
	  }
      if (cnt != 1)
	{
	  fail = true;
	  break;
	}
    }
  return ! fail && ! neg && *code != ERROR_MARK;
}
bool
check_reduction_path (dump_user_location_t loc, loop_p loop, gphi *phi,
		      tree loop_arg, enum tree_code code)
{
  auto_vec<std::pair<ssa_op_iter, use_operand_p> > path;
  code_helper code_;
  return (check_reduction_path (loc, loop, phi, loop_arg, &code_, path)
	  && code_ == code);
}
/* Function vect_is_simple_reduction

   (1) Detect a cross-iteration def-use cycle that represents a simple
   reduction computation.  We look for the following pattern:

   loop_header:
     a1 = phi < a0, a2 >
     a3 = ...
     a2 = operation (a3, a1)

   or

   a3 = ...
   loop_header:
     a1 = phi < a0, a2 >
     a2 = operation (a3, a1)

   such that:
   1. operation is commutative and associative and it is safe to
      change the order of the computation
   2. no uses for a2 in the loop (a2 is used out of the loop)
   3. no uses of a1 in the loop besides the reduction operation
   4. no uses of a1 outside the loop.

   Conditions 1,4 are tested here.
   Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.

   (2) Detect a cross-iteration def-use cycle in nested loops, i.e.,
   nested cycles.

   (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double
   reductions:

     a1 = phi < a0, a2 >
     inner loop (def of a3)
     a2 = phi < a3 >

   (4) Detect condition expressions, ie:
     for (int i = 0; i < N; i++)
       if (a[i] < val)
	ret_val = a[i];

*/
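/* For instance, pattern (1) above matches the scalar loop

     s = 0;
     for (i = 0; i < N; i++)
       s += a[i];

   where the loop PHI for s and the addition form the cross-iteration
   def-use cycle.  */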
3711 static stmt_vec_info
3712 vect_is_simple_reduction (loop_vec_info loop_info
, stmt_vec_info phi_info
,
3713 bool *double_reduc
, bool *reduc_chain_p
, bool slp
)
3715 gphi
*phi
= as_a
<gphi
*> (phi_info
->stmt
);
3716 gimple
*phi_use_stmt
= NULL
;
3717 imm_use_iterator imm_iter
;
3718 use_operand_p use_p
;
3720 *double_reduc
= false;
3721 *reduc_chain_p
= false;
3722 STMT_VINFO_REDUC_TYPE (phi_info
) = TREE_CODE_REDUCTION
;
3724 tree phi_name
= PHI_RESULT (phi
);
3725 /* ??? If there are no uses of the PHI result the inner loop reduction
3726 won't be detected as possibly double-reduction by vectorizable_reduction
3727 because that tries to walk the PHI arg from the preheader edge which
3728 can be constant. See PR60382. */
3729 if (has_zero_uses (phi_name
))
3731 class loop
*loop
= (gimple_bb (phi
))->loop_father
;
3732 unsigned nphi_def_loop_uses
= 0;
3733 FOR_EACH_IMM_USE_FAST (use_p
, imm_iter
, phi_name
)
3735 gimple
*use_stmt
= USE_STMT (use_p
);
3736 if (is_gimple_debug (use_stmt
))
3739 if (!flow_bb_inside_loop_p (loop
, gimple_bb (use_stmt
)))
3741 if (dump_enabled_p ())
3742 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3743 "intermediate value used outside loop.\n");
3748 nphi_def_loop_uses
++;
3749 phi_use_stmt
= use_stmt
;
3752 tree latch_def
= PHI_ARG_DEF_FROM_EDGE (phi
, loop_latch_edge (loop
));
3753 if (TREE_CODE (latch_def
) != SSA_NAME
)
3755 if (dump_enabled_p ())
3756 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3757 "reduction: not ssa_name: %T\n", latch_def
);
3761 stmt_vec_info def_stmt_info = loop_info->lookup_def (latch_def);
3763 || !flow_bb_inside_loop_p (loop, gimple_bb (def_stmt_info->stmt)))
3766 bool nested_in_vect_loop
3767 = flow_loop_nested_p (LOOP_VINFO_LOOP (loop_info), loop);
3768 unsigned nlatch_def_loop_uses = 0;
3769 auto_vec<gphi *, 3> lcphis;
3770 bool inner_loop_of_double_reduc = false;
3771 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, latch_def)
3773 gimple *use_stmt = USE_STMT (use_p);
3774 if (is_gimple_debug (use_stmt))
3776 if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
3777 nlatch_def_loop_uses++;
3780 /* We can have more than one loop-closed PHI. */
3781 lcphis.safe_push (as_a <gphi *> (use_stmt));
3782 if (nested_in_vect_loop
3783 && (STMT_VINFO_DEF_TYPE (loop_info->lookup_stmt (use_stmt))
3784 == vect_double_reduction_def))
3785 inner_loop_of_double_reduc = true;
3789 /* If we are vectorizing an inner reduction we are executing that
3790 in the original order only in case we are not dealing with a
3791 double reduction. */
3792 if (nested_in_vect_loop && !inner_loop_of_double_reduc)
3794 if (dump_enabled_p ())
3795 report_vect_op (MSG_NOTE, def_stmt_info->stmt,
3796 "detected nested cycle: ");
3797 return def_stmt_info;
3800 /* When the inner loop of a double reduction ends up with more than
3801 one loop-closed PHI we have failed to classify alternate such
3802 PHIs as double reduction, leading to wrong code. See PR103237. */
3803 if (inner_loop_of_double_reduc && lcphis.length () != 1)
3805 if (dump_enabled_p ())
3806 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3807 "unhandled double reduction\n");
3811 /* If this isn't a nested cycle or if the nested cycle reduction value
3812 is used outside of the inner loop we cannot handle uses of the reduction
3814 if (nlatch_def_loop_uses > 1 || nphi_def_loop_uses > 1)
3816 if (dump_enabled_p ())
3817 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3818 "reduction used in loop.\n");
3822 /* If DEF_STMT is a phi node itself, we expect it to have a single argument
3823 defined in the inner loop. */
3824 if (gphi *def_stmt = dyn_cast <gphi *> (def_stmt_info->stmt))
3826 tree op1 = PHI_ARG_DEF (def_stmt, 0);
3827 if (gimple_phi_num_args (def_stmt) != 1
3828 || TREE_CODE (op1) != SSA_NAME)
3830 if (dump_enabled_p ())
3831 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3832 "unsupported phi node definition.\n");
3837 gimple *def1 = SSA_NAME_DEF_STMT (op1);
3838 if (gimple_bb (def1)
3839 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
3841 && flow_bb_inside_loop_p (loop->inner, gimple_bb (def1))
3842 && (is_gimple_assign (def1) || is_gimple_call (def1))
3843 && is_a <gphi *> (phi_use_stmt)
3844 && flow_bb_inside_loop_p (loop->inner, gimple_bb (phi_use_stmt)))
3846 if (dump_enabled_p ())
3847 report_vect_op (MSG_NOTE, def_stmt,
3848 "detected double reduction: ");
3850 *double_reduc = true;
3851 return def_stmt_info;
3857 /* Look for the expression computing latch_def from the loop PHI result. */
3858 auto_vec<std::pair<ssa_op_iter, use_operand_p> > path;
3860 if (check_reduction_path (vect_location, loop, phi, latch_def, &code,
3863 STMT_VINFO_REDUC_CODE (phi_info) = code;
3864 if (code == COND_EXPR && !nested_in_vect_loop)
3865 STMT_VINFO_REDUC_TYPE (phi_info) = COND_REDUCTION;
3867 /* Fill in STMT_VINFO_REDUC_IDX and gather stmts for an SLP
3868 reduction chain for which the additional restriction is that
3869 all operations in the chain are the same. */
3870 auto_vec<stmt_vec_info, 8> reduc_chain;
3872 bool is_slp_reduc = !nested_in_vect_loop && code != COND_EXPR;
3873 for (i = path.length () - 1; i >= 1; --i)
3875 gimple *stmt = USE_STMT (path[i].second);
3876 stmt_vec_info stmt_info = loop_info->lookup_stmt (stmt);
3878 if (!gimple_extract_op (stmt, &op))
3880 if (gassign *assign = dyn_cast <gassign *> (stmt))
3881 STMT_VINFO_REDUC_IDX (stmt_info)
3882 = path[i].second->use - gimple_assign_rhs1_ptr (assign);
3885 gcall *call = as_a <gcall *> (stmt);
3886 STMT_VINFO_REDUC_IDX (stmt_info)
3887 = path[i].second->use - gimple_call_arg_ptr (call, 0);
3889 bool leading_conversion = (CONVERT_EXPR_CODE_P (op.code)
3890 && (i == 1 || i == path.length () - 1));
3891 if ((op.code != code && !leading_conversion)
3892 /* We can only handle the final value in epilogue
3893 generation for reduction chains. */
3894 || (i != 1 && !has_single_use (gimple_get_lhs (stmt))))
3895 is_slp_reduc = false;
3896 /* For reduction chains we support trailing/leading
3897 conversions. We do not store those in the actual chain. */
3898 if (leading_conversion)
3900 reduc_chain.safe_push (stmt_info);
3902 if (slp && is_slp_reduc && reduc_chain.length () > 1)
3904 for (unsigned i = 0; i < reduc_chain.length () - 1; ++i)
3906 REDUC_GROUP_FIRST_ELEMENT (reduc_chain[i]) = reduc_chain[0];
3907 REDUC_GROUP_NEXT_ELEMENT (reduc_chain[i]) = reduc_chain[i+1];
3909 REDUC_GROUP_FIRST_ELEMENT (reduc_chain.last ()) = reduc_chain[0];
3910 REDUC_GROUP_NEXT_ELEMENT (reduc_chain.last ()) = NULL;
3912 /* Save the chain for further analysis in SLP detection. */
3913 LOOP_VINFO_REDUCTION_CHAINS (loop_info).safe_push (reduc_chain[0]);
3914 REDUC_GROUP_SIZE (reduc_chain[0]) = reduc_chain.length ();
3916 *reduc_chain_p = true;
3917 if (dump_enabled_p ())
3918 dump_printf_loc (MSG_NOTE, vect_location,
3919 "reduction: detected reduction chain\n");
3921 else if (dump_enabled_p ())
3922 dump_printf_loc (MSG_NOTE, vect_location,
3923 "reduction: detected reduction\n");
3925 return def_stmt_info;
3928 if (dump_enabled_p ())
3929 dump_printf_loc (MSG_NOTE, vect_location,
3930 "reduction: unknown pattern\n");
3935 /* Estimate the number of peeled epilogue iterations for LOOP_VINFO.
3936 PEEL_ITERS_PROLOGUE is the number of peeled prologue iterations,
3937 or -1 if not known. */
3940 vect_get_peel_iters_epilogue (loop_vec_info loop_vinfo, int peel_iters_prologue)
3942 int assumed_vf = vect_vf_for_cost (loop_vinfo);
3943 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) || peel_iters_prologue == -1)
3945 if (dump_enabled_p ())
3946 dump_printf_loc (MSG_NOTE, vect_location,
3947 "cost model: epilogue peel iters set to vf/2 "
3948 "because loop iterations are unknown.\n");
3949 return assumed_vf / 2;
3953 int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
3954 peel_iters_prologue = MIN (niters, peel_iters_prologue);
3955 int peel_iters_epilogue = (niters - peel_iters_prologue) % assumed_vf;
3956 /* If we need to peel for gaps, but no peeling is required, we have to
3957 peel VF iterations. */
3958 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) && !peel_iters_epilogue)
3959 peel_iters_epilogue = assumed_vf;
3960 return peel_iters_epilogue;
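/* For example, with an assumed VF of 8, a known trip count of 100 and 3
   prologue iterations peeled for alignment, the epilogue would peel
   (100 - 3) % 8 = 1 iteration; if the loop also needed peeling for gaps
   and that remainder had been 0, a full VF (8 iterations) would be
   peeled instead.  (Illustrative figures only.)  */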
3964 /* Calculate cost of peeling the loop PEEL_ITERS_PROLOGUE times. */
3966 vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue,
3967 int *peel_iters_epilogue,
3968 stmt_vector_for_cost *scalar_cost_vec,
3969 stmt_vector_for_cost *prologue_cost_vec,
3970 stmt_vector_for_cost *epilogue_cost_vec)
3974 *peel_iters_epilogue
3975 = vect_get_peel_iters_epilogue (loop_vinfo, peel_iters_prologue);
3977 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
3979 /* If peeled iterations are known but number of scalar loop
3980 iterations are unknown, count a taken branch per peeled loop. */
3981 if (peel_iters_prologue > 0)
3982 retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
3984 if (*peel_iters_epilogue > 0)
3985 retval += record_stmt_cost (epilogue_cost_vec, 1, cond_branch_taken,
3989 stmt_info_for_cost *si;
3991 if (peel_iters_prologue)
3992 FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
3993 retval += record_stmt_cost (prologue_cost_vec,
3994 si->count * peel_iters_prologue,
3995 si->kind, si->stmt_info, si->misalign,
3997 if (*peel_iters_epilogue)
3998 FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
3999 retval += record_stmt_cost (epilogue_cost_vec,
4000 si->count * *peel_iters_epilogue,
4001 si->kind, si->stmt_info, si->misalign,
4007 /* Function vect_estimate_min_profitable_iters
4009 Return the number of iterations required for the vector version of the
4010 loop to be profitable relative to the cost of the scalar version of the
4013 *RET_MIN_PROFITABLE_NITERS is a cost model profitability threshold
4014 of iterations for vectorization. -1 value means loop vectorization
4015 is not profitable. This returned value may be used for dynamic
4016 profitability check.
4018 *RET_MIN_PROFITABLE_ESTIMATE is a profitability threshold to be used
4019 for static check against estimated number of iterations. */
4022 vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
4023 int *ret_min_profitable_niters,
4024 int *ret_min_profitable_estimate,
4025 unsigned *suggested_unroll_factor)
4027 int min_profitable_iters;
4028 int min_profitable_estimate;
4029 int peel_iters_prologue;
4030 int peel_iters_epilogue;
4031 unsigned vec_inside_cost = 0;
4032 int vec_outside_cost = 0;
4033 unsigned vec_prologue_cost = 0;
4034 unsigned vec_epilogue_cost = 0;
4035 int scalar_single_iter_cost = 0;
4036 int scalar_outside_cost = 0;
4037 int assumed_vf = vect_vf_for_cost (loop_vinfo);
4038 int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
4039 vector_costs *target_cost_data = loop_vinfo->vector_costs;
4041 /* Cost model disabled. */
4042 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
4044 if (dump_enabled_p ())
4045 dump_printf_loc (MSG_NOTE, vect_location, "cost model disabled.\n");
4046 *ret_min_profitable_niters = 0;
4047 *ret_min_profitable_estimate = 0;
4051 /* Requires loop versioning tests to handle misalignment. */
4052 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
4054 /* FIXME: Make cost depend on complexity of individual check. */
4055 unsigned len = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ();
4056 (void) add_stmt_cost (target_cost_data, len, scalar_stmt, vect_prologue);
4057 if (dump_enabled_p ())
4058 dump_printf (MSG_NOTE,
4059 "cost model: Adding cost of checks for loop "
4060 "versioning to treat misalignment.\n");
4063 /* Requires loop versioning with alias checks. */
4064 if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
4066 /* FIXME: Make cost depend on complexity of individual check. */
4067 unsigned len = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).length ();
4068 (void) add_stmt_cost (target_cost_data, len, scalar_stmt, vect_prologue);
4069 len = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).length ();
4071 /* Count LEN - 1 ANDs and LEN comparisons. */
4072 (void) add_stmt_cost (target_cost_data, len * 2 - 1,
4073 scalar_stmt, vect_prologue);
4074 len = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).length ();
4077 /* Count LEN - 1 ANDs and LEN comparisons. */
4078 unsigned int nstmts = len * 2 - 1;
4079 /* +1 for each bias that needs adding. */
4080 for (unsigned int i = 0; i < len; ++i)
4081 if (!LOOP_VINFO_LOWER_BOUNDS (loop_vinfo)[i].unsigned_p)
4083 (void) add_stmt_cost (target_cost_data, nstmts,
4084 scalar_stmt, vect_prologue);
4086 if (dump_enabled_p ())
4087 dump_printf (MSG_NOTE,
4088 "cost model: Adding cost of checks for loop "
4089 "versioning aliasing.\n");
4092 /* Requires loop versioning with niter checks. */
4093 if (LOOP_REQUIRES_VERSIONING_FOR_NITERS (loop_vinfo))
4095 /* FIXME: Make cost depend on complexity of individual check. */
4096 (void) add_stmt_cost (target_cost_data, 1, vector_stmt,
4097 NULL, NULL, NULL_TREE, 0, vect_prologue);
4098 if (dump_enabled_p ())
4099 dump_printf (MSG_NOTE,
4100 "cost model: Adding cost of checks for loop "
4101 "versioning niters.\n");
4104 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
4105 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
4108 /* Count statements in scalar loop. Using this as scalar cost for a single
4111 TODO: Add outer loop support.
4113 TODO: Consider assigning different costs to different scalar
4116 scalar_single_iter_cost = loop_vinfo->scalar_costs->total_cost ();
4118 /* Add additional cost for the peeled instructions in prologue and epilogue
4119 loop. (For fully-masked loops there will be no peeling.)
4121 FORNOW: If we don't know the value of peel_iters for prologue or epilogue
4122 at compile-time - we assume it's vf/2 (the worst would be vf-1).
4124 TODO: Build an expression that represents peel_iters for prologue and
4125 epilogue to be used in a run-time test. */
4127 bool prologue_need_br_taken_cost = false;
4128 bool prologue_need_br_not_taken_cost = false;
4130 /* Calculate peel_iters_prologue. */
4131 if (vect_use_loop_mask_for_alignment_p (loop_vinfo))
4132 peel_iters_prologue = 0;
4135 peel_iters_prologue = assumed_vf / 2;
4136 if (dump_enabled_p ())
4137 dump_printf (MSG_NOTE, "cost model: "
4138 "prologue peel iters set to vf/2.\n");
4140 /* If peeled iterations are unknown, count a taken branch and a not taken
4141 branch per peeled loop. Even if scalar loop iterations are known,
4142 vector iterations are not known since peeled prologue iterations are
4143 not known. Hence guards remain the same. */
4144 prologue_need_br_taken_cost = true;
4145 prologue_need_br_not_taken_cost = true;
4149 peel_iters_prologue = npeel;
4150 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && peel_iters_prologue > 0)
4151 /* If peeled iterations are known but number of scalar loop
4152 iterations are unknown, count a taken branch per peeled loop. */
4153 prologue_need_br_taken_cost = true;
4156 bool epilogue_need_br_taken_cost = false;
4157 bool epilogue_need_br_not_taken_cost = false;
4159 /* Calculate peel_iters_epilogue. */
4160 if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
4161 /* We need to peel exactly one iteration for gaps. */
4162 peel_iters_epilogue = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0;
4165 /* If peeling for alignment is unknown, loop bound of main loop
4167 peel_iters_epilogue = assumed_vf / 2;
4168 if (dump_enabled_p ())
4169 dump_printf (MSG_NOTE, "cost model: "
4170 "epilogue peel iters set to vf/2 because "
4171 "peeling for alignment is unknown.\n");
4173 /* See the same reason above in peel_iters_prologue calculation. */
4174 epilogue_need_br_taken_cost = true;
4175 epilogue_need_br_not_taken_cost = true;
4179 peel_iters_epilogue = vect_get_peel_iters_epilogue (loop_vinfo, npeel);
4180 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && peel_iters_epilogue > 0)
4181 /* If peeled iterations are known but number of scalar loop
4182 iterations are unknown, count a taken branch per peeled loop. */
4183 epilogue_need_br_taken_cost = true;
4186 stmt_info_for_cost *si;
4188 /* Add costs associated with peel_iters_prologue. */
4189 if (peel_iters_prologue)
4190 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
4192 (void) add_stmt_cost (target_cost_data,
4193 si->count * peel_iters_prologue, si->kind,
4194 si->stmt_info, si->node, si->vectype,
4195 si->misalign, vect_prologue);
4198 /* Add costs associated with peel_iters_epilogue. */
4199 if (peel_iters_epilogue)
4200 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
4202 (void) add_stmt_cost (target_cost_data,
4203 si->count * peel_iters_epilogue, si->kind,
4204 si->stmt_info, si->node, si->vectype,
4205 si->misalign, vect_epilogue);
4208 /* Add possible cond_branch_taken/cond_branch_not_taken cost. */
4210 if (prologue_need_br_taken_cost)
4211 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
4214 if (prologue_need_br_not_taken_cost)
4215 (void) add_stmt_cost (target_cost_data, 1,
4216 cond_branch_not_taken, vect_prologue);
4218 if (epilogue_need_br_taken_cost)
4219 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
4222 if (epilogue_need_br_not_taken_cost)
4223 (void) add_stmt_cost (target_cost_data, 1,
4224 cond_branch_not_taken, vect_epilogue);
4226 /* Take care of special costs for rgroup controls of partial vectors. */
4227 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
4229 /* Calculate how many masks we need to generate. */
4230 unsigned int num_masks = 0;
4231 rgroup_controls *rgm;
4232 unsigned int num_vectors_m1;
4233 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), num_vectors_m1, rgm)
4235 num_masks += num_vectors_m1 + 1;
4236 gcc_assert (num_masks > 0);
4238 /* In the worst case, we need to generate each mask in the prologue
4239 and in the loop body. One of the loop body mask instructions
4240 replaces the comparison in the scalar loop, and since we don't
4241 count the scalar comparison against the scalar body, we shouldn't
4242 count that vector instruction against the vector body either.
4244 Sometimes we can use unpacks instead of generating prologue
4245 masks and sometimes the prologue mask will fold to a constant,
4246 so the actual prologue cost might be smaller. However, it's
4247 simpler and safer to use the worst-case cost; if this ends up
4248 being the tie-breaker between vectorizing or not, then it's
4249 probably better not to vectorize. */
4250 (void) add_stmt_cost (target_cost_data, num_masks,
4251 vector_stmt, NULL, NULL, NULL_TREE, 0,
4253 (void) add_stmt_cost (target_cost_data, num_masks - 1,
4254 vector_stmt, NULL, NULL, NULL_TREE, 0,
4257 else if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
4259 /* Referring to the functions vect_set_loop_condition_partial_vectors
4260 and vect_set_loop_controls_directly, we need to generate each
4261 length in the prologue and in the loop body if required. Although
4262 there are some possible optimizations, we consider the worst case
4265 bool niters_known_p = LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo);
4266 signed char partial_load_store_bias
4267 = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
4269 = (!LOOP_VINFO_EPILOGUE_P (loop_vinfo)
4270 && !vect_known_niters_smaller_than_vf (loop_vinfo));
4272 /* Calculate how many statements to be added. */
4273 unsigned int prologue_stmts = 0;
4274 unsigned int body_stmts = 0;
4276 rgroup_controls *rgc;
4277 unsigned int num_vectors_m1;
4278 FOR_EACH_VEC_ELT (LOOP_VINFO_LENS (loop_vinfo), num_vectors_m1, rgc)
4281 /* May need one SHIFT for nitems_total computation. */
4282 unsigned nitems = rgc->max_nscalars_per_iter * rgc->factor;
4283 if (nitems != 1 && !niters_known_p)
4284 prologue_stmts += 1;
4286 /* May need one MAX and one MINUS for wrap around. */
4287 if (vect_rgroup_iv_might_wrap_p (loop_vinfo, rgc))
4288 prologue_stmts += 2;
4290 /* Need one MAX and one MINUS for each batch limit excepting for
4292 prologue_stmts += num_vectors_m1 * 2;
4294 unsigned int num_vectors = num_vectors_m1 + 1;
4296 /* Need to set up lengths in prologue, only one MIN required
4297 for each since start index is zero. */
4298 prologue_stmts += num_vectors;
4300 /* If we have a non-zero partial load bias, we need one PLUS
4301 to adjust the load length. */
4302 if (partial_load_store_bias != 0)
4305 /* Each may need two MINs and one MINUS to update lengths in body
4306 for next iteration. */
4308 body_stmts += 3 * num_vectors;
4311 (void) add_stmt_cost (target_cost_data, prologue_stmts,
4312 scalar_stmt, vect_prologue);
4313 (void) add_stmt_cost (target_cost_data, body_stmts,
4314 scalar_stmt, vect_body);
4317 /* FORNOW: The scalar outside cost is incremented in one of the
4320 1. The vectorizer checks for alignment and aliasing and generates
4321 a condition that allows dynamic vectorization. A cost model
4322 check is ANDED with the versioning condition. Hence scalar code
4323 path now has the added cost of the versioning check.
4325 if (cost > th & versioning_check)
4328 Hence run-time scalar is incremented by not-taken branch cost.
4330 2. The vectorizer then checks if a prologue is required. If the
4331 cost model check was not done before during versioning, it has to
4332 be done before the prologue check.
4335 prologue = scalar_iters
4340 if (prologue == num_iters)
4343 Hence the run-time scalar cost is incremented by a taken branch,
4344 plus a not-taken branch, plus a taken branch cost.
4346 3. The vectorizer then checks if an epilogue is required. If the
4347 cost model check was not done before during prologue check, it
4348 has to be done with the epilogue check.
4354 if (prologue == num_iters)
4357 if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
4360 Hence the run-time scalar cost should be incremented by 2 taken
4363 TODO: The back end may reorder the BBS's differently and reverse
4364 conditions/branch directions. Change the estimates below to
4365 something more reasonable. */
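/* As a rough illustration, if a taken branch were costed at 3 and a
   not-taken branch at 1 (hypothetical target values), a loop that emits
   all three checks would accumulate 1 + (2*3 + 1) + 2*3 = 14 units of
   scalar outside cost in the code below.  */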
4367 /* If the number of iterations is known and we do not do versioning, we can
4368 decide whether to vectorize at compile time. Hence the scalar version
4369 does not carry cost model guard costs. */
4370 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
4371 || LOOP_REQUIRES_VERSIONING (loop_vinfo))
4373 /* Cost model check occurs at versioning. */
4374 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
4375 scalar_outside_cost += vect_get_stmt_cost (cond_branch_not_taken);
4378 /* Cost model check occurs at prologue generation. */
4379 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
4380 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken)
4381 + vect_get_stmt_cost (cond_branch_not_taken);
4382 /* Cost model check occurs at epilogue generation. */
4384 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken);
4388 /* Complete the target-specific cost calculations. */
4389 finish_cost (loop_vinfo->vector_costs, loop_vinfo->scalar_costs,
4390 &vec_prologue_cost, &vec_inside_cost, &vec_epilogue_cost,
4391 suggested_unroll_factor);
4393 if (suggested_unroll_factor && *suggested_unroll_factor > 1
4394 && LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo) != MAX_VECTORIZATION_FACTOR
4395 && !known_le (LOOP_VINFO_VECT_FACTOR (loop_vinfo) *
4396 *suggested_unroll_factor,
4397 LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo)))
4399 if (dump_enabled_p ())
4400 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4401 "can't unroll as unrolled vectorization factor larger"
4402 " than maximum vectorization factor: "
4403 HOST_WIDE_INT_PRINT_UNSIGNED "\n",
4404 LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo));
4405 *suggested_unroll_factor = 1;
4408 vec_outside_cost = (int)(vec_prologue_cost + vec_epilogue_cost);
4410 if (dump_enabled_p ())
4412 dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
4413 dump_printf (MSG_NOTE, " Vector inside of loop cost: %d\n",
4415 dump_printf (MSG_NOTE, " Vector prologue cost: %d\n",
4417 dump_printf (MSG_NOTE, " Vector epilogue cost: %d\n",
4419 dump_printf (MSG_NOTE, " Scalar iteration cost: %d\n",
4420 scalar_single_iter_cost);
4421 dump_printf (MSG_NOTE, " Scalar outside cost: %d\n",
4422 scalar_outside_cost);
4423 dump_printf (MSG_NOTE, " Vector outside cost: %d\n",
4425 dump_printf (MSG_NOTE, " prologue iterations: %d\n",
4426 peel_iters_prologue);
4427 dump_printf (MSG_NOTE, " epilogue iterations: %d\n",
4428 peel_iters_epilogue);
4431 /* Calculate number of iterations required to make the vector version
4432 profitable, relative to the loop bodies only. The following condition
4434 SIC * niters + SOC > VIC * ((niters - NPEEL) / VF) + VOC
4436 SIC = scalar iteration cost, VIC = vector iteration cost,
4437 VOC = vector outside cost, VF = vectorization factor,
4438 NPEEL = prologue iterations + epilogue iterations,
4439 SOC = scalar outside cost for run time cost model check. */
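/* As a worked example with illustrative costs: if SIC = 4, VIC = 6,
   VF = 8, NPEEL = 3, VOC = 40 and SOC = 0, the condition
   4 * niters > 6 * ((niters - 3) / 8) + 40 first holds at niters = 12,
   so roughly a dozen scalar iterations are needed before the vector
   version pays off.  */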
4441 int saving_per_viter = (scalar_single_iter_cost * assumed_vf
4443 if (saving_per_viter <= 0)
4445 if (LOOP_VINFO_LOOP (loop_vinfo)->force_vectorize)
4446 warning_at (vect_location.get_location_t (), OPT_Wopenmp_simd,
4447 "vectorization did not happen for a simd loop");
4449 if (dump_enabled_p ())
4450 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4451 "cost model: the vector iteration cost = %d "
4452 "divided by the scalar iteration cost = %d "
4453 "is greater or equal to the vectorization factor = %d"
4455 vec_inside_cost, scalar_single_iter_cost, assumed_vf);
4456 *ret_min_profitable_niters = -1;
4457 *ret_min_profitable_estimate = -1;
4461 /* ??? The "if" arm is written to handle all cases; see below for what
4462 we would do for !LOOP_VINFO_USING_PARTIAL_VECTORS_P. */
4463 if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
4465 /* Rewriting the condition above in terms of the number of
4466 vector iterations (vniters) rather than the number of
4467 scalar iterations (niters) gives:
4469 SIC * (vniters * VF + NPEEL) + SOC > VIC * vniters + VOC
4471 <==> vniters * (SIC * VF - VIC) > VOC - SIC * NPEEL - SOC
4473 For integer N, X and Y when X > 0:
4475 N * X > Y <==> N >= (Y /[floor] X) + 1. */
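/* For example, with VOC - SIC * NPEEL - SOC = 10 and SIC * VF - VIC = 3
   (illustrative figures), the rule gives vniters >= 10 / 3 + 1 = 4,
   i.e. at least four vector iterations are needed to amortize the
   outside overhead.  */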
4476 int outside_overhead = (vec_outside_cost
4477 - scalar_single_iter_cost * peel_iters_prologue
4478 - scalar_single_iter_cost * peel_iters_epilogue
4479 - scalar_outside_cost);
4480 /* We're only interested in cases that require at least one
4481 vector iteration. */
4482 int min_vec_niters = 1;
4483 if (outside_overhead > 0)
4484 min_vec_niters = outside_overhead / saving_per_viter + 1;
4486 if (dump_enabled_p ())
4487 dump_printf (MSG_NOTE, " Minimum number of vector iterations: %d\n",
4490 if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
4492 /* Now that we know the minimum number of vector iterations,
4493 find the minimum niters for which the scalar cost is larger:
4495 SIC * niters > VIC * vniters + VOC - SOC
4497 We know that the minimum niters is no more than
4498 vniters * VF + NPEEL, but it might be (and often is) less
4499 than that if a partial vector iteration is cheaper than the
4500 equivalent scalar code. */
4501 int threshold = (vec_inside_cost * min_vec_niters
4503 - scalar_outside_cost);
4505 min_profitable_iters = 1;
4507 min_profitable_iters = threshold / scalar_single_iter_cost + 1;
4510 /* Convert the number of vector iterations into a number of
4511 scalar iterations. */
4512 min_profitable_iters = (min_vec_niters * assumed_vf
4513 + peel_iters_prologue
4514 + peel_iters_epilogue);
4518 min_profitable_iters = ((vec_outside_cost - scalar_outside_cost)
4520 - vec_inside_cost * peel_iters_prologue
4521 - vec_inside_cost * peel_iters_epilogue);
4522 if (min_profitable_iters <= 0)
4523 min_profitable_iters = 0;
4526 min_profitable_iters /= saving_per_viter;
4528 if ((scalar_single_iter_cost * assumed_vf * min_profitable_iters)
4529 <= (((int) vec_inside_cost * min_profitable_iters)
4530 + (((int) vec_outside_cost - scalar_outside_cost)
4532 min_profitable_iters++;
4536 if (dump_enabled_p ())
4537 dump_printf (MSG_NOTE,
4538 " Calculated minimum iters for profitability: %d\n",
4539 min_profitable_iters);
4541 if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
4542 && min_profitable_iters < (assumed_vf + peel_iters_prologue))
4543 /* We want the vectorized loop to execute at least once. */
4544 min_profitable_iters = assumed_vf + peel_iters_prologue;
4545 else if (min_profitable_iters < peel_iters_prologue)
4546 /* For LOOP_VINFO_USING_PARTIAL_VECTORS_P, we need to ensure the
4547 vectorized loop executes at least once. */
4548 min_profitable_iters = peel_iters_prologue;
4550 if (dump_enabled_p ())
4551 dump_printf_loc (MSG_NOTE, vect_location,
4552 " Runtime profitability threshold = %d\n",
4553 min_profitable_iters);
4555 *ret_min_profitable_niters = min_profitable_iters;
4557 /* Calculate number of iterations required to make the vector version
4558 profitable, relative to the loop bodies only.
4560 Non-vectorized variant is SIC * niters and it must win over vector
4561 variant on the expected loop trip count. The following condition must hold true:
4562 SIC * niters > VIC * ((niters - NPEEL) / VF) + VOC + SOC */
4564 if (vec_outside_cost <= 0)
4565 min_profitable_estimate = 0;
4566 /* ??? This "else if" arm is written to handle all cases; see below for
4567 what we would do for !LOOP_VINFO_USING_PARTIAL_VECTORS_P. */
4568 else if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
4570 /* This is a repeat of the code above, but with + SOC rather
4572 int outside_overhead = (vec_outside_cost
4573 - scalar_single_iter_cost * peel_iters_prologue
4574 - scalar_single_iter_cost * peel_iters_epilogue
4575 + scalar_outside_cost);
4576 int min_vec_niters = 1;
4577 if (outside_overhead > 0)
4578 min_vec_niters = outside_overhead / saving_per_viter + 1;
4580 if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
4582 int threshold = (vec_inside_cost * min_vec_niters
4584 + scalar_outside_cost);
4585 min_profitable_estimate = threshold / scalar_single_iter_cost + 1;
4588 min_profitable_estimate = (min_vec_niters * assumed_vf
4589 + peel_iters_prologue
4590 + peel_iters_epilogue);
4594 min_profitable_estimate = ((vec_outside_cost + scalar_outside_cost)
4596 - vec_inside_cost * peel_iters_prologue
4597 - vec_inside_cost * peel_iters_epilogue)
4598 / ((scalar_single_iter_cost * assumed_vf)
4601 min_profitable_estimate = MAX (min_profitable_estimate, min_profitable_iters);
4602 if (dump_enabled_p ())
4603 dump_printf_loc (MSG_NOTE, vect_location,
4604 " Static estimate profitability threshold = %d\n",
4605 min_profitable_estimate);
4607 *ret_min_profitable_estimate = min_profitable_estimate;
4610 /* Writes into SEL a mask for a vec_perm, equivalent to a vec_shr by OFFSET
4611 vector elements (not bits) for a vector with NELT elements. */
4613 calc_vec_perm_mask_for_shift (unsigned int offset, unsigned int nelt,
4614 vec_perm_builder *sel)
4616 /* The encoding is a single stepped pattern. Any wrap-around is handled
4617 by vec_perm_indices. */
4618 sel->new_vector (nelt, 1, 3);
4619 for (unsigned int i = 0; i < 3; i++)
4620 sel->quick_push (i + offset);
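/* For instance, OFFSET = 2 and NELT = 8 push the stepped pattern
   {2, 3, 4}, which vec_perm_indices extends to the selector
   {2, 3, 4, 5, 6, 7, 8, 9}; applied to two concatenated input vectors
   this acts like a whole-vector shift by two elements
   (illustrative values).  */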
4623 /* Checks whether the target supports whole-vector shifts for vectors of mode
4624 MODE. This is the case if _either_ the platform handles vec_shr_optab, _or_
4625 it supports vec_perm_const with masks for all necessary shift amounts. */
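/* E.g. for an 8-element vector the loop below probes element shifts of
   4, 2 and 1, which are exactly the shift amounts a log2-style
   reduction epilogue would emit.  */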
4627 have_whole_vector_shift (machine_mode mode)
4629 if (optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
4632 /* Variable-length vectors should be handled via the optab. */
4634 if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
4637 vec_perm_builder sel;
4638 vec_perm_indices indices;
4639 for (unsigned int i = nelt / 2; i >= 1; i /= 2)
4641 calc_vec_perm_mask_for_shift (i, nelt, &sel);
4642 indices.new_vector (sel, 2, nelt);
4643 if (!can_vec_perm_const_p (mode, mode, indices, false))
4649 /* Return true if (a) STMT_INFO is a DOT_PROD_EXPR reduction whose
4650 multiplication operands have differing signs and (b) we intend
4651 to emulate the operation using a series of signed DOT_PROD_EXPRs.
4652 See vect_emulate_mixed_dot_prod for the actual sequence used. */
4655 vect_is_emulated_mixed_dot_prod (loop_vec_info loop_vinfo,
4656 stmt_vec_info stmt_info)
4658 gassign *assign = dyn_cast <gassign *> (stmt_info->stmt);
4659 if (!assign || gimple_assign_rhs_code (assign) != DOT_PROD_EXPR)
4662 tree rhs1 = gimple_assign_rhs1 (assign);
4663 tree rhs2 = gimple_assign_rhs2 (assign);
4664 if (TYPE_SIGN (TREE_TYPE (rhs1)) == TYPE_SIGN (TREE_TYPE (rhs2)))
4667 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
4668 gcc_assert (reduc_info->is_reduc_info);
4669 return !directly_supported_p (DOT_PROD_EXPR,
4670 STMT_VINFO_REDUC_VECTYPE_IN (reduc_info),
4671 optab_vector_mixed_sign);
4674 /* TODO: Close dependency between vect_model_*_cost and vectorizable_*
4675 functions. Design better to avoid maintenance issues. */
4677 /* Function vect_model_reduction_cost.
4679 Models cost for a reduction operation, including the vector ops
4680 generated within the strip-mine loop in some cases, the initial
4681 definition before the loop, and the epilogue code that must be generated. */
4684 vect_model_reduction_cost (loop_vec_info loop_vinfo,
4685 stmt_vec_info stmt_info, internal_fn reduc_fn,
4686 vect_reduction_type reduction_type,
4687 int ncopies, stmt_vector_for_cost *cost_vec)
4689 int prologue_cost = 0, epilogue_cost = 0, inside_cost = 0;
4692 class loop *loop = NULL;
4695 loop = LOOP_VINFO_LOOP (loop_vinfo);
4697 /* Condition reductions generate two reductions in the loop. */
4698 if (reduction_type == COND_REDUCTION)
4701 vectype = STMT_VINFO_VECTYPE (stmt_info);
4702 mode = TYPE_MODE (vectype);
4703 stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
4706 if (!gimple_extract_op (orig_stmt_info->stmt, &op))
4709 bool emulated_mixed_dot_prod
4710 = vect_is_emulated_mixed_dot_prod (loop_vinfo, stmt_info);
4711 if (reduction_type == EXTRACT_LAST_REDUCTION)
4712 /* No extra instructions are needed in the prologue. The loop body
4713 operations are costed in vectorizable_condition. */
4715 else if (reduction_type == FOLD_LEFT_REDUCTION)
4717 /* No extra instructions needed in the prologue. */
4720 if (reduc_fn != IFN_LAST)
4721 /* Count one reduction-like operation per vector. */
4722 inside_cost = record_stmt_cost (cost_vec, ncopies, vec_to_scalar,
4723 stmt_info, 0, vect_body);
4726 /* Use NELEMENTS extracts and NELEMENTS scalar ops. */
4727 unsigned int nelements = ncopies * vect_nunits_for_cost (vectype);
4728 inside_cost = record_stmt_cost (cost_vec, nelements,
4729 vec_to_scalar, stmt_info, 0,
4731 inside_cost += record_stmt_cost (cost_vec, nelements,
4732 scalar_stmt, stmt_info, 0,
4738 /* Add in the cost of the initial definitions. */
4740 if (reduction_type == COND_REDUCTION)
4741 /* For cond reductions we have four vectors: initial index, step,
4742 initial result of the data reduction, initial value of the index
4745 else if (emulated_mixed_dot_prod)
4746 /* We need the initial reduction value and two invariants:
4747 one that contains the minimum signed value and one that
4748 contains half of its negative. */
4752 prologue_cost += record_stmt_cost (cost_vec, prologue_stmts,
4753 scalar_to_vec, stmt_info, 0,
4757 /* Determine cost of epilogue code.
4759 We have a reduction operator that will reduce the vector in one statement.
4760 Also requires scalar extract. */
4762 if (!loop || !nested_in_vect_loop_p (loop, orig_stmt_info))
4764 if (reduc_fn != IFN_LAST)
4766 if (reduction_type == COND_REDUCTION)
4768 /* An EQ stmt and a COND_EXPR stmt. */
4769 epilogue_cost += record_stmt_cost (cost_vec, 2,
4770 vector_stmt, stmt_info, 0,
4772 /* Reduction of the max index and a reduction of the found
4774 epilogue_cost += record_stmt_cost (cost_vec, 2,
4775 vec_to_scalar, stmt_info, 0,
4777 /* A broadcast of the max value. */
4778 epilogue_cost += record_stmt_cost (cost_vec, 1,
4779 scalar_to_vec, stmt_info, 0,
4784 epilogue_cost += record_stmt_cost (cost_vec, 1, vector_stmt,
4785 stmt_info, 0, vect_epilogue);
4786 epilogue_cost += record_stmt_cost (cost_vec, 1,
4787 vec_to_scalar, stmt_info, 0,
4791 else if (reduction_type == COND_REDUCTION)
4793 unsigned estimated_nunits = vect_nunits_for_cost (vectype);
4794 /* Extraction of scalar elements. */
4795 epilogue_cost += record_stmt_cost (cost_vec,
4796 2 * estimated_nunits,
4797 vec_to_scalar, stmt_info, 0,
4799 /* Scalar max reductions via COND_EXPR / MAX_EXPR. */
4800 epilogue_cost += record_stmt_cost (cost_vec,
4801 2 * estimated_nunits - 3,
4802 scalar_stmt, stmt_info, 0,
4805 else if (reduction_type == EXTRACT_LAST_REDUCTION
4806 || reduction_type == FOLD_LEFT_REDUCTION)
4807 /* No extra instructions needed in the epilogue. */
4811 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
4812 tree bitsize = TYPE_SIZE (op.type);
4813 int element_bitsize = tree_to_uhwi (bitsize);
4814 int nelements = vec_size_in_bits / element_bitsize;
4816 if (op.code == COND_EXPR)
4819 /* We have a whole vector shift available. */
4820 if (VECTOR_MODE_P (mode)
4821 && directly_supported_p (op.code, vectype)
4822 && have_whole_vector_shift (mode))
4824 /* Final reduction via vector shifts and the reduction operator.
4825 Also requires scalar extract. */
4826 epilogue_cost += record_stmt_cost (cost_vec,
4827 exact_log2 (nelements) * 2,
4828 vector_stmt, stmt_info, 0,
4830 epilogue_cost += record_stmt_cost (cost_vec, 1,
4831 vec_to_scalar, stmt_info, 0,
4835 /* Use extracts and reduction op for final reduction. For N
4836 elements, we have N extracts and N-1 reduction ops. */
4837 epilogue_cost += record_stmt_cost (cost_vec,
4838 nelements + nelements - 1,
4839 vector_stmt, stmt_info, 0,
4844 if (dump_enabled_p ())
4845 dump_printf (MSG_NOTE,
4846 "vect_model_reduction_cost: inside_cost = %d, "
4847 "prologue_cost = %d, epilogue_cost = %d .\n", inside_cost,
4848 prologue_cost, epilogue_cost);
4851 /* SEQ is a sequence of instructions that initialize the reduction
4852 described by REDUC_INFO. Emit them in the appropriate place. */
4855 vect_emit_reduction_init_stmts (loop_vec_info loop_vinfo,
4856 stmt_vec_info reduc_info, gimple *seq)
4858 if (reduc_info->reused_accumulator)
4860 /* When reusing an accumulator from the main loop, we only need
4861 initialization instructions if the main loop can be skipped.
4862 In that case, emit the initialization instructions at the end
4863 of the guard block that does the skip. */
4864 edge skip_edge = loop_vinfo->skip_main_loop_edge;
4865 gcc_assert (skip_edge);
4866 gimple_stmt_iterator gsi = gsi_last_bb (skip_edge->src);
4867 gsi_insert_seq_before (&gsi, seq, GSI_SAME_STMT);
4871 /* The normal case: emit the initialization instructions on the
4873 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4874 gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), seq);
4878 /* Function get_initial_def_for_reduction
4881 REDUC_INFO - the info_for_reduction
4882 INIT_VAL - the initial value of the reduction variable
4883 NEUTRAL_OP - a value that has no effect on the reduction, as per
4884 neutral_op_for_reduction
4887 Return a vector variable, initialized according to the operation that
4888 STMT_VINFO performs. This vector will be used as the initial value
4889 of the vector of partial results.
4891 The value we need is a vector in which element 0 has value INIT_VAL
4892 and every other element has value NEUTRAL_OP. */
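/* For example, for a summation whose scalar initial value is 5 and
   whose neutral element is 0, a 4-lane vector would be initialized to
   {5, 0, 0, 0}; for a min-reduction the neutral element is the initial
   value itself, giving {init, init, init, init}
   (illustrative values).  */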
4895 get_initial_def_for_reduction (loop_vec_info loop_vinfo,
4896 stmt_vec_info reduc_info,
4897 tree init_val, tree neutral_op)
4899 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4900 tree scalar_type = TREE_TYPE (init_val);
4901 tree vectype = get_vectype_for_scalar_type (loop_vinfo, scalar_type);
4903 gimple_seq stmts = NULL;
4905 gcc_assert (vectype);
4907 gcc_assert (POINTER_TYPE_P (scalar_type) || INTEGRAL_TYPE_P (scalar_type)
4908 || SCALAR_FLOAT_TYPE_P (scalar_type));
4910 gcc_assert (nested_in_vect_loop_p (loop, reduc_info)
4911 || loop == (gimple_bb (reduc_info->stmt))->loop_father);
4913 if (operand_equal_p (init_val, neutral_op))
4915 /* If both elements are equal then the vector described above is
4917 neutral_op = gimple_convert (&stmts, TREE_TYPE (vectype), neutral_op);
4918 init_def = gimple_build_vector_from_val (&stmts, vectype, neutral_op);
4922 neutral_op = gimple_convert (&stmts, TREE_TYPE (vectype), neutral_op);
4923 init_val = gimple_convert (&stmts, TREE_TYPE (vectype), init_val);
4924 if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant ())
4926 /* Construct a splat of NEUTRAL_OP and insert INIT_VAL into
4928 init_def = gimple_build_vector_from_val (&stmts, vectype,
4930 init_def = gimple_build (&stmts, CFN_VEC_SHL_INSERT,
4931 vectype, init_def, init_val);
4935 /* Build {INIT_VAL, NEUTRAL_OP, NEUTRAL_OP, ...}. */
4936 tree_vector_builder elts (vectype, 1, 2);
4937 elts.quick_push (init_val);
4938 elts.quick_push (neutral_op);
4939 init_def = gimple_build_vector (&stmts, &elts);
4944 vect_emit_reduction_init_stmts (loop_vinfo, reduc_info, stmts);
4948 /* Get at the initial defs for the reduction PHIs for REDUC_INFO,
4949 which performs a reduction involving GROUP_SIZE scalar statements.
4950 NUMBER_OF_VECTORS is the number of vector defs to create. If NEUTRAL_OP
4951 is nonnull, introducing extra elements of that value will not change the
4955 get_initial_defs_for_reduction (loop_vec_info loop_vinfo,
4956 stmt_vec_info reduc_info,
4957 vec<tree> *vec_oprnds,
4958 unsigned int number_of_vectors,
4959 unsigned int group_size, tree neutral_op)
4961 vec<tree> &initial_values = reduc_info->reduc_initial_values;
4962 unsigned HOST_WIDE_INT nunits;
4963 unsigned j, number_of_places_left_in_vector;
4964 tree vector_type = STMT_VINFO_VECTYPE (reduc_info);
4967 gcc_assert (group_size == initial_values.length () || neutral_op);
4969 /* NUMBER_OF_COPIES is the number of times we need to use the same values in
4970 created vectors. It is greater than 1 if unrolling is performed.
4972 For example, we have two scalar operands, s1 and s2 (e.g., group of
4973 strided accesses of size two), while NUNITS is four (i.e., four scalars
4974 of this type can be packed in a vector). The output vector will contain
4975 two copies of each scalar operand: {s1, s2, s1, s2}. (NUMBER_OF_COPIES
4978 If REDUC_GROUP_SIZE > NUNITS, the scalars will be split into several
4979 vectors containing the operands.
4981 For example, NUNITS is four as before, and the group size is 8
4982 (s1, s2, ..., s8). We will create two vectors {s1, s2, s3, s4} and
4983 {s5, s6, s7, s8}. */
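/* If NEUTRAL_OP is available, only the first copy of each initial value
   is kept; e.g. with two initial values s1 and s2, NUNITS of four and a
   neutral value of 0 for a sum, the single vector built would be
   {s1, s2, 0, 0} (illustrative values).  */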
4985 if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits))
4986 nunits = group_size;
4988 number_of_places_left_in_vector = nunits;
4989 bool constant_p = true;
4990 tree_vector_builder elts (vector_type, nunits, 1);
4991 elts.quick_grow (nunits);
4992 gimple_seq ctor_seq = NULL;
4993 for (j = 0; j < nunits * number_of_vectors; ++j)
4998 /* Get the def before the loop. In reduction chain we have only
4999 one initial value. Else we have as many as PHIs in the group. */
5000 if (i >= initial_values.length () || (j > i && neutral_op))
5003 op = initial_values[i];
5005 /* Create 'vect_ = {op0,op1,...,opn}'. */
5006 number_of_places_left_in_vector--;
5007 elts[nunits - number_of_places_left_in_vector - 1] = op;
5008 if (!CONSTANT_CLASS_P (op))
5011 if (number_of_places_left_in_vector == 0)
5014 if (constant_p && !neutral_op
5015 ? multiple_p (TYPE_VECTOR_SUBPARTS (vector_type), nunits)
5016 : known_eq (TYPE_VECTOR_SUBPARTS (vector_type), nunits))
5017 /* Build the vector directly from ELTS. */
5018 init = gimple_build_vector (&ctor_seq, &elts);
5019 else if (neutral_op)
5021 /* Build a vector of the neutral value and shift the
5022 other elements into place. */
5023 init = gimple_build_vector_from_val (&ctor_seq, vector_type,
5026 while (k > 0 && elts[k - 1] == neutral_op)
5031 init = gimple_build (&ctor_seq, CFN_VEC_SHL_INSERT,
5032 vector_type, init, elts[k]);
5037 /* First time round, duplicate ELTS to fill the
5038 required number of vectors. */
5039 duplicate_and_interleave (loop_vinfo, &ctor_seq, vector_type,
5040 elts, number_of_vectors, *vec_oprnds);
5043 vec_oprnds->quick_push (init);
5045 number_of_places_left_in_vector = nunits;
5046 elts.new_vector (vector_type, nunits, 1);
5047 elts.quick_grow (nunits);
5051 if (ctor_seq != NULL)
5052 vect_emit_reduction_init_stmts (loop_vinfo, reduc_info, ctor_seq);
5055 /* For a statement STMT_INFO taking part in a reduction operation return
5056 the stmt_vec_info the meta information is stored on. */
5059 info_for_reduction (vec_info *vinfo, stmt_vec_info stmt_info)
5061 stmt_info = vect_orig_stmt (stmt_info);
5062 gcc_assert (STMT_VINFO_REDUC_DEF (stmt_info));
5063 if (!is_a <gphi *> (stmt_info->stmt)
5064 || !VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
5065 stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
5066 gphi *phi = as_a <gphi *> (stmt_info->stmt);
5067 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
5069 if (gimple_phi_num_args (phi) == 1)
5070 stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
5072 else if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
5074 stmt_vec_info info = vinfo->lookup_def (vect_phi_initial_value (phi));
5075 if (info && STMT_VINFO_DEF_TYPE (info) == vect_double_reduction_def)
5081 /* See if LOOP_VINFO is an epilogue loop whose main loop had a reduction that
5082 REDUC_INFO can build on. Adjust REDUC_INFO and return true if so, otherwise
5086 vect_find_reusable_accumulator (loop_vec_info loop_vinfo,
5087 stmt_vec_info reduc_info)
5089 loop_vec_info main_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
5090 if (!main_loop_vinfo)
5093 if (STMT_VINFO_REDUC_TYPE (reduc_info) != TREE_CODE_REDUCTION)
5096 unsigned int num_phis = reduc_info->reduc_initial_values.length ();
5097 auto_vec<tree, 16> main_loop_results (num_phis);
5098 auto_vec<tree, 16> initial_values (num_phis);
5099 if (edge main_loop_edge = loop_vinfo->main_loop_edge)
5101 /* The epilogue loop can be entered either from the main loop or
5102 from an earlier guard block. */
5103 edge skip_edge = loop_vinfo->skip_main_loop_edge;
5104 for (tree incoming_value : reduc_info->reduc_initial_values)
5108 INCOMING_VALUE = phi<MAIN_LOOP_RESULT(main loop),
5109 INITIAL_VALUE(guard block)>. */
5110 gcc_assert (TREE_CODE (incoming_value) == SSA_NAME);
5112 gphi *phi = as_a <gphi *> (SSA_NAME_DEF_STMT (incoming_value));
5113 gcc_assert (gimple_bb (phi) == main_loop_edge->dest);
5115 tree from_main_loop = PHI_ARG_DEF_FROM_EDGE (phi, main_loop_edge);
5116 tree from_skip = PHI_ARG_DEF_FROM_EDGE (phi, skip_edge);
5118 main_loop_results.quick_push (from_main_loop);
5119 initial_values.quick_push (from_skip);
5123 /* The main loop dominates the epilogue loop. */
5124 main_loop_results.splice (reduc_info->reduc_initial_values);
5126 /* See if the main loop has the kind of accumulator we need. */
5127 vect_reusable_accumulator *accumulator
5128 = main_loop_vinfo->reusable_accumulators.get (main_loop_results[0]);
5130 || num_phis != accumulator->reduc_info->reduc_scalar_results.length ()
5131 || !std::equal (main_loop_results.begin (), main_loop_results.end (),
5132 accumulator->reduc_info->reduc_scalar_results.begin ()))
5135 /* Handle the case where we can reduce wider vectors to narrower ones. */
5136 tree vectype = STMT_VINFO_VECTYPE (reduc_info);
5137 tree old_vectype = TREE_TYPE (accumulator->reduc_input);
5138 unsigned HOST_WIDE_INT m;
5139 if (!constant_multiple_p (TYPE_VECTOR_SUBPARTS (old_vectype),
5140 TYPE_VECTOR_SUBPARTS (vectype), &m))
5142 /* Check the intermediate vector types and operations are available. */
5143 tree prev_vectype = old_vectype;
5144 poly_uint64 intermediate_nunits = TYPE_VECTOR_SUBPARTS (old_vectype);
5145 while (known_gt (intermediate_nunits, TYPE_VECTOR_SUBPARTS (vectype)))
5147 intermediate_nunits = exact_div (intermediate_nunits, 2);
5148 tree intermediate_vectype = get_related_vectype_for_scalar_type
5149 (TYPE_MODE (vectype), TREE_TYPE (vectype), intermediate_nunits);
5150 if (!intermediate_vectype
5151 || !directly_supported_p (STMT_VINFO_REDUC_CODE (reduc_info),
5152 intermediate_vectype)
5153 || !can_vec_extract (TYPE_MODE (prev_vectype),
5154 TYPE_MODE (intermediate_vectype)))
5156 prev_vectype = intermediate_vectype;
5159 /* Non-SLP reductions might apply an adjustment after the reduction
5160 operation, in order to simplify the initialization of the accumulator.
5161 If the epilogue loop carries on from where the main loop left off,
5162 it should apply the same adjustment to the final reduction result.
5164 If the epilogue loop can also be entered directly (rather than via
5165 the main loop), we need to be able to handle that case in the same way,
5166 with the same adjustment. (In principle we could add a PHI node
5167 to select the correct adjustment, but in practice that shouldn't be
5169 tree main_adjustment
5170 = STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (accumulator->reduc_info);
5171 if (loop_vinfo->main_loop_edge && main_adjustment)
5173 gcc_assert (num_phis == 1);
5174 tree initial_value = initial_values[0];
5175 /* Check that we can use INITIAL_VALUE as the adjustment and
5176 initialize the accumulator with a neutral value instead. */
5177 if (!operand_equal_p (initial_value, main_adjustment))
5179 code_helper code = STMT_VINFO_REDUC_CODE (reduc_info);
5180 initial_values[0] = neutral_op_for_reduction (TREE_TYPE (initial_value),
5181 code, initial_value);
5183 STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info) = main_adjustment;
5184 reduc_info->reduc_initial_values.truncate (0);
5185 reduc_info->reduc_initial_values.splice (initial_values);
5186 reduc_info->reused_accumulator = accumulator;
5190 /* Reduce the vector VEC_DEF down to VECTYPE with reduction operation
5191 CODE emitting stmts before GSI. Returns a vector def of VECTYPE. */
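/* For example, reducing a 4-lane accumulator to a 2-lane VECTYPE with a
   PLUS_EXPR code splits the input into its low and high halves and adds
   them, i.e. {a0, a1, a2, a3} becomes {a0 + a2, a1 + a3}; the halving
   repeats until the requested width is reached (illustrative shapes).  */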
5194 vect_create_partial_epilog (tree vec_def, tree vectype, code_helper code,
5197 unsigned nunits = TYPE_VECTOR_SUBPARTS (TREE_TYPE (vec_def)).to_constant ();
5198 unsigned nunits1 = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
5199 tree stype = TREE_TYPE (vectype);
5200 tree new_temp = vec_def;
5201 while (nunits > nunits1)
5204 tree vectype1 = get_related_vectype_for_scalar_type (TYPE_MODE (vectype),
5206 unsigned int bitsize = tree_to_uhwi (TYPE_SIZE (vectype1));
5208 /* The target has to make sure we support lowpart/highpart
5209 extraction, either via direct vector extract or through
5210 an integer mode punning. */
5212 gimple *epilog_stmt;
5213 if (convert_optab_handler (vec_extract_optab,
5214 TYPE_MODE (TREE_TYPE (new_temp)),
5215 TYPE_MODE (vectype1))
5216 != CODE_FOR_nothing)
5218 /* Extract sub-vectors directly once vec_extract becomes
5219 a conversion optab. */
5220 dst1 = make_ssa_name (vectype1);
5222 = gimple_build_assign (dst1, BIT_FIELD_REF,
5223 build3 (BIT_FIELD_REF, vectype1,
5224 new_temp, TYPE_SIZE (vectype1),
5226 gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5227 dst2 = make_ssa_name (vectype1);
5229 = gimple_build_assign (dst2, BIT_FIELD_REF,
5230 build3 (BIT_FIELD_REF, vectype1,
5231 new_temp, TYPE_SIZE (vectype1),
5232 bitsize_int (bitsize)));
5233 gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5237 /* Extract via punning to appropriately sized integer mode
5239 tree eltype = build_nonstandard_integer_type (bitsize, 1);
5240 tree etype = build_vector_type (eltype, 2);
5241 gcc_assert (convert_optab_handler (vec_extract_optab,
5244 != CODE_FOR_nothing);
5245 tree tem = make_ssa_name (etype);
5246 epilog_stmt = gimple_build_assign (tem, VIEW_CONVERT_EXPR,
5247 build1 (VIEW_CONVERT_EXPR,
5249 gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5251 tem = make_ssa_name (eltype);
5253 = gimple_build_assign (tem, BIT_FIELD_REF,
5254 build3 (BIT_FIELD_REF, eltype,
5255 new_temp, TYPE_SIZE (eltype),
5257 gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5258 dst1 = make_ssa_name (vectype1);
5259 epilog_stmt = gimple_build_assign (dst1, VIEW_CONVERT_EXPR,
5260 build1 (VIEW_CONVERT_EXPR,
5262 gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5263 tem = make_ssa_name (eltype);
5265 = gimple_build_assign (tem, BIT_FIELD_REF,
5266 build3 (BIT_FIELD_REF, eltype,
5267 new_temp, TYPE_SIZE (eltype),
5268 bitsize_int (bitsize)));
5269 gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5270 dst2 = make_ssa_name (vectype1);
5271 epilog_stmt = gimple_build_assign (dst2, VIEW_CONVERT_EXPR,
5272 build1 (VIEW_CONVERT_EXPR,
5274 gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5277 new_temp = gimple_build (seq, code, vectype1, dst1, dst2);
5283 /* Function vect_create_epilog_for_reduction
5285 Create code at the loop-epilog to finalize the result of a reduction
5288 STMT_INFO is the scalar reduction stmt that is being vectorized.
5289 SLP_NODE is an SLP node containing a group of reduction statements. The
5290 first one in this group is STMT_INFO.
5291 SLP_NODE_INSTANCE is the SLP node instance containing SLP_NODE
5292 REDUC_INDEX says which rhs operand of the STMT_INFO is the reduction phi
5296 1. Completes the reduction def-use cycles.
5297 2. "Reduces" each vector of partial results VECT_DEFS into a single result,
5298 by calling the function specified by REDUC_FN if available, or by
5299 other means (whole-vector shifts or a scalar loop).
5300 The function also creates a new phi node at the loop exit to preserve
5301 loop-closed form, as illustrated below.
5303 The flow at the entry to this function:
5306 vec_def = phi <vec_init, null> # REDUCTION_PHI
5307 VECT_DEF = vector_stmt # vectorized form of STMT_INFO
5308 s_loop = scalar_stmt # (scalar) STMT_INFO
5310 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
5314 The above is transformed by this function into:
5317 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
5318 VECT_DEF = vector_stmt # vectorized form of STMT_INFO
5319 s_loop = scalar_stmt # (scalar) STMT_INFO
5321 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
5322 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
5323 v_out2 = reduce <v_out1>
5324 s_out3 = extract_field <v_out2, 0>
5325 s_out4 = adjust_result <s_out3>
5331 vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
5332 stmt_vec_info stmt_info,
5334 slp_instance slp_node_instance)
5336 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
5337 gcc_assert (reduc_info->is_reduc_info);
5338 /* For double reductions we need to get at the inner loop reduction
5339 stmt which has the meta info attached. Our stmt_info is that of the
5340 loop-closed PHI of the inner loop which we remember as
5341 def for the reduction PHI generation. */
5342 bool double_reduc = false;
5343 stmt_vec_info rdef_info = stmt_info;
5344 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
5346 gcc_assert (!slp_node);
5347 double_reduc = true;
5348 stmt_info = loop_vinfo->lookup_def (gimple_phi_arg_def
5349 (stmt_info->stmt, 0));
5350 stmt_info = vect_stmt_to_vectorize (stmt_info);
5352 gphi *reduc_def_stmt
5353 = as_a <gphi *> (STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info))->stmt);
5354 code_helper code = STMT_VINFO_REDUC_CODE (reduc_info);
5355 internal_fn reduc_fn = STMT_VINFO_REDUC_FN (reduc_info);
5358 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *outer_loop = NULL;
5359 basic_block exit_bb;
5362 gimple *new_phi = NULL, *phi;
5363 gimple_stmt_iterator exit_gsi;
5364 tree new_temp = NULL_TREE, new_name, new_scalar_dest;
5365 gimple *epilog_stmt = NULL;
5369 tree orig_name, scalar_result;
5370 imm_use_iterator imm_iter, phi_imm_iter;
5371 use_operand_p use_p, phi_use_p;
5373 auto_vec<tree> reduc_inputs;
5375 vec<tree> &scalar_results = reduc_info->reduc_scalar_results;
5376 unsigned int group_size = 1, k;
5377 auto_vec<gimple *> phis;
5378 /* SLP reduction without reduction chain, e.g.,
5382 b2 = operation (b1) */
5383 bool slp_reduc = (slp_node && !REDUC_GROUP_FIRST_ELEMENT (stmt_info));
5384 bool direct_slp_reduc;
5385 tree induction_index = NULL_TREE;
5388 group_size = SLP_TREE_LANES (slp_node);
5390 if (nested_in_vect_loop_p (loop, stmt_info))
5394 gcc_assert (!slp_node && double_reduc);
5397 vectype = STMT_VINFO_REDUC_VECTYPE (reduc_info);
5398 gcc_assert (vectype);
5399 mode = TYPE_MODE (vectype);
5401 tree induc_val = NULL_TREE;
5402 tree adjustment_def = NULL;
5407 /* Optimize: for induction condition reduction, if we can't use zero
5408 for induc_val, use initial_def. */
5409 if (STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
5410 induc_val = STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info);
5411 else if (double_reduc)
5414 adjustment_def = STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info);
5417 stmt_vec_info single_live_out_stmt[] = { stmt_info };
5418 array_slice<const stmt_vec_info> live_out_stmts = single_live_out_stmt;
5420 /* All statements produce live-out values. */
5421 live_out_stmts = SLP_TREE_SCALAR_STMTS (slp_node);
5424 /* The last statement in the reduction chain produces the live-out
5425 value. Note SLP optimization can shuffle scalar stmts to
5426 optimize permutations so we have to search for the last stmt. */
5427 for (k = 0; k < group_size; ++k)
5428 if (!REDUC_GROUP_NEXT_ELEMENT (SLP_TREE_SCALAR_STMTS (slp_node)[k]))
5430 single_live_out_stmt[0] = SLP_TREE_SCALAR_STMTS (slp_node)[k];
5439 vec_num = SLP_TREE_VEC_STMTS (slp_node_instance->reduc_phis).length ();
5444 stmt_vec_info reduc_info = loop_vinfo->lookup_stmt (reduc_def_stmt);
5446 ncopies = STMT_VINFO_VEC_STMTS (reduc_info).length ();
5449 /* For cond reductions we want to create a new vector (INDEX_COND_EXPR)
5450 which is updated with the current index of the loop for every match of
5451 the original loop's cond_expr (VEC_STMT). This results in a vector
5452 containing the last time the condition passed for that vector lane.
5453 The first match will be a 1 to allow 0 to be used for non-matching
5454 indexes. If there are no matches at all then the vector will be all
5457 PR92772: This algorithm is broken for architectures that support
5458 masked vectors, but do not provide fold_extract_last. */
5459 if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION)
5461 auto_vec<std::pair<tree, bool>, 2> ccompares;
5462 stmt_vec_info cond_info = STMT_VINFO_REDUC_DEF (reduc_info);
5463 cond_info = vect_stmt_to_vectorize (cond_info);
5464 while (cond_info != reduc_info)
5466 if (gimple_assign_rhs_code (cond_info->stmt) == COND_EXPR)
5468 gimple *vec_stmt = STMT_VINFO_VEC_STMTS (cond_info)[0];
5469 gcc_assert (gimple_assign_rhs_code (vec_stmt) == VEC_COND_EXPR);
5471 (std::make_pair (unshare_expr (gimple_assign_rhs1 (vec_stmt)),
5472 STMT_VINFO_REDUC_IDX (cond_info) == 2));
5475 = loop_vinfo->lookup_def (gimple_op (cond_info->stmt,
5476 1 + STMT_VINFO_REDUC_IDX
5478 cond_info = vect_stmt_to_vectorize (cond_info);
5480 gcc_assert (ccompares.length () != 0);
5482 tree indx_before_incr, indx_after_incr;
5483 poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype);
5484 int scalar_precision
5485 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (TREE_TYPE (vectype)));
5486 tree cr_index_scalar_type = make_unsigned_type (scalar_precision);
5487 tree cr_index_vector_type = get_related_vectype_for_scalar_type
5488 (TYPE_MODE (vectype), cr_index_scalar_type,
5489 TYPE_VECTOR_SUBPARTS (vectype));
      /* First we create a simple vector induction variable which starts
	 with the values {1,2,3,...} (SERIES_VECT) and increments by the
	 vector size (STEP).  */

      /* Create a {1,2,3,...} vector.  */
      tree series_vect = build_index_vector (cr_index_vector_type, 1, 1);
5498 /* Create a vector of the step value. */
5499 tree step
= build_int_cst (cr_index_scalar_type
, nunits_out
);
5500 tree vec_step
= build_vector_from_val (cr_index_vector_type
, step
);
5502 /* Create an induction variable. */
5503 gimple_stmt_iterator incr_gsi
;
5505 standard_iv_increment_position (loop
, &incr_gsi
, &insert_after
);
5506 create_iv (series_vect
, vec_step
, NULL_TREE
, loop
, &incr_gsi
,
5507 insert_after
, &indx_before_incr
, &indx_after_incr
);
5509 /* Next create a new phi node vector (NEW_PHI_TREE) which starts
5510 filled with zeros (VEC_ZERO). */
5512 /* Create a vector of 0s. */
5513 tree zero
= build_zero_cst (cr_index_scalar_type
);
5514 tree vec_zero
= build_vector_from_val (cr_index_vector_type
, zero
);
5516 /* Create a vector phi node. */
5517 tree new_phi_tree
= make_ssa_name (cr_index_vector_type
);
5518 new_phi
= create_phi_node (new_phi_tree
, loop
->header
);
5519 add_phi_arg (as_a
<gphi
*> (new_phi
), vec_zero
,
5520 loop_preheader_edge (loop
), UNKNOWN_LOCATION
);
      /* Now take the condition from the loop's original cond_exprs
	 and produce a new cond_exprs (INDEX_COND_EXPR) which for
	 every match uses values from the induction variable
	 (INDEX_BEFORE_INCR) otherwise uses values from the phi node
	 (NEW_PHI_TREE).
	 Finally, we update the phi (NEW_PHI_TREE) to take the value of
	 the new cond_expr (INDEX_COND_EXPR).  */
5529 gimple_seq stmts
= NULL
;
5530 for (int i
= ccompares
.length () - 1; i
!= -1; --i
)
5532 tree ccompare
= ccompares
[i
].first
;
5533 if (ccompares
[i
].second
)
5534 new_phi_tree
= gimple_build (&stmts
, VEC_COND_EXPR
,
5535 cr_index_vector_type
,
5537 indx_before_incr
, new_phi_tree
);
5539 new_phi_tree
= gimple_build (&stmts
, VEC_COND_EXPR
,
5540 cr_index_vector_type
,
5542 new_phi_tree
, indx_before_incr
);
5544 gsi_insert_seq_before (&incr_gsi
, stmts
, GSI_SAME_STMT
);
5546 /* Update the phi with the vec cond. */
5547 induction_index
= new_phi_tree
;
5548 add_phi_arg (as_a
<gphi
*> (new_phi
), induction_index
,
5549 loop_latch_edge (loop
), UNKNOWN_LOCATION
);
  /* 2. Create epilog code.
	The reduction epilog code operates across the elements of the vector
	of partial results computed by the vectorized loop.
	The reduction epilog code consists of:

	step 1: compute the scalar result in a vector (v_out2)
	step 2: extract the scalar result (s_out3) from the vector (v_out2)
	step 3: adjust the scalar result (s_out3) if needed.

	Step 1 can be accomplished using one of the following three schemes:
	  (scheme 1) using reduc_fn, if available.
	  (scheme 2) using whole-vector shifts, if available.
	  (scheme 3) using a scalar loop.  In this case steps 1+2 above are
		     combined.

	The overall epilog code looks like this:

	  s_out0 = phi <s_loop>			# original EXIT_PHI
	  v_out1 = phi <VECT_DEF>		# NEW_EXIT_PHI
	  v_out2 = reduce <v_out1>		# step 1
	  s_out3 = extract_field <v_out2, 0>	# step 2
	  s_out4 = adjust_result <s_out3>	# step 3

	(step 3 is optional, and steps 1 and 2 may be combined).
	Lastly, the uses of s_out0 are replaced by s_out4.  */
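  /* As a concrete, purely illustrative example: for a simple sum reduced
     into a single V4SI accumulator the epilog is conceptually

	 v_out1 = phi <vect_sum>		# per-lane partial sums
	 s_out3 = REDUC_PLUS <v_out1>		# steps 1+2 via reduc_fn
	 s_out4 = s_out3 + adjustment		# step 3, only if needed

     where "adjustment" stands for a scalar initial value that was not
     folded into the vector accumulator.  */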
  /* 2.1 Create new loop-exit-phis to preserve loop-closed form:
	 v_out1 = phi <VECT_DEF>
	 Store them in NEW_PHIS.  */

  exit_bb = single_exit (loop)->dest;
  exit_gsi = gsi_after_labels (exit_bb);
  reduc_inputs.create (slp_node ? vec_num : ncopies);
5587 for (unsigned i
= 0; i
< vec_num
; i
++)
5589 gimple_seq stmts
= NULL
;
5591 def
= vect_get_slp_vect_def (slp_node
, i
);
5593 def
= gimple_get_lhs (STMT_VINFO_VEC_STMTS (rdef_info
)[0]);
5594 for (j
= 0; j
< ncopies
; j
++)
5596 tree new_def
= copy_ssa_name (def
);
5597 phi
= create_phi_node (new_def
, exit_bb
);
5599 def
= gimple_get_lhs (STMT_VINFO_VEC_STMTS (rdef_info
)[j
]);
5600 SET_PHI_ARG_DEF (phi
, single_exit (loop
)->dest_idx
, def
);
5601 new_def
= gimple_convert (&stmts
, vectype
, new_def
);
5602 reduc_inputs
.quick_push (new_def
);
5604 gsi_insert_seq_before (&exit_gsi
, stmts
, GSI_SAME_STMT
);
5607 /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
5608 (i.e. when reduc_fn is not available) and in the final adjustment
5609 code (if needed). Also get the original scalar reduction variable as
5610 defined in the loop. In case STMT is a "pattern-stmt" (i.e. - it
5611 represents a reduction pattern), the tree-code and scalar-def are
5612 taken from the original stmt that the pattern-stmt (STMT) replaces.
5613 Otherwise (it is a regular reduction) - the tree-code and scalar-def
5614 are taken from STMT. */
5616 stmt_vec_info orig_stmt_info
= vect_orig_stmt (stmt_info
);
5617 if (orig_stmt_info
!= stmt_info
)
5619 /* Reduction pattern */
5620 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info
));
5621 gcc_assert (STMT_VINFO_RELATED_STMT (orig_stmt_info
) == stmt_info
);
5624 scalar_dest
= gimple_get_lhs (orig_stmt_info
->stmt
);
5625 scalar_type
= TREE_TYPE (scalar_dest
);
5626 scalar_results
.truncate (0);
5627 scalar_results
.reserve_exact (group_size
);
5628 new_scalar_dest
= vect_create_destination_var (scalar_dest
, NULL
);
5629 bitsize
= TYPE_SIZE (scalar_type
);
5631 /* True if we should implement SLP_REDUC using native reduction operations
5632 instead of scalar operations. */
5633 direct_slp_reduc
= (reduc_fn
!= IFN_LAST
5635 && !TYPE_VECTOR_SUBPARTS (vectype
).is_constant ());
5637 /* In case of reduction chain, e.g.,
5640 a3 = operation (a2),
5642 we may end up with more than one vector result. Here we reduce them
5645 The same is true if we couldn't use a single defuse cycle. */
5646 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info
)
5650 gimple_seq stmts
= NULL
;
5651 tree single_input
= reduc_inputs
[0];
5652 for (k
= 1; k
< reduc_inputs
.length (); k
++)
5653 single_input
= gimple_build (&stmts
, code
, vectype
,
5654 single_input
, reduc_inputs
[k
]);
5655 gsi_insert_seq_before (&exit_gsi
, stmts
, GSI_SAME_STMT
);
5657 reduc_inputs
.truncate (0);
5658 reduc_inputs
.safe_push (single_input
);
5661 tree orig_reduc_input
= reduc_inputs
[0];
  /* If this loop is an epilogue loop that can be skipped after the
     main loop, we can only share a reduction operation between the
     main loop and the epilogue if we put it at the target of the
     skip edge.

     We can still reuse accumulators if this check fails.  Doing so has
     the minor(?) benefit of making the epilogue loop's scalar result
     independent of the main loop's scalar result.  */
  bool unify_with_main_loop_p = false;
5672 if (reduc_info
->reused_accumulator
5673 && loop_vinfo
->skip_this_loop_edge
5674 && single_succ_p (exit_bb
)
5675 && single_succ (exit_bb
) == loop_vinfo
->skip_this_loop_edge
->dest
)
5677 unify_with_main_loop_p
= true;
5679 basic_block reduc_block
= loop_vinfo
->skip_this_loop_edge
->dest
;
5680 reduc_inputs
[0] = make_ssa_name (vectype
);
5681 gphi
*new_phi
= create_phi_node (reduc_inputs
[0], reduc_block
);
5682 add_phi_arg (new_phi
, orig_reduc_input
, single_succ_edge (exit_bb
),
5684 add_phi_arg (new_phi
, reduc_info
->reused_accumulator
->reduc_input
,
5685 loop_vinfo
->skip_this_loop_edge
, UNKNOWN_LOCATION
);
5686 exit_gsi
= gsi_after_labels (reduc_block
);
5689 /* Shouldn't be used beyond this point. */
5692 if (STMT_VINFO_REDUC_TYPE (reduc_info
) == COND_REDUCTION
5693 && reduc_fn
!= IFN_LAST
)
      /* For condition reductions, we have a vector (REDUC_INPUTS 0) containing
	 various data values where the condition matched and another vector
	 (INDUCTION_INDEX) containing all the indexes of those matches.  We
	 need to extract the last matching index (which will be the index with
	 highest value) and use this to index into the data vector.
	 For the case where there were no matches, the data vector will contain
	 all default values and the index vector will be all zeros.  */
5703 /* Get various versions of the type of the vector of indexes. */
5704 tree index_vec_type
= TREE_TYPE (induction_index
);
5705 gcc_checking_assert (TYPE_UNSIGNED (index_vec_type
));
5706 tree index_scalar_type
= TREE_TYPE (index_vec_type
);
5707 tree index_vec_cmp_type
= truth_type_for (index_vec_type
);
5709 /* Get an unsigned integer version of the type of the data vector. */
5710 int scalar_precision
5711 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type
));
5712 tree scalar_type_unsigned
= make_unsigned_type (scalar_precision
);
5713 tree vectype_unsigned
= get_same_sized_vectype (scalar_type_unsigned
,
      /* First we need to create a vector (ZERO_VEC) of zeros and another
	 vector (MAX_INDEX_VEC) filled with the last matching index, which we
	 can create using a MAX reduction and then expanding.
	 In the case where the loop never made any matches, the max index will
	 be zero.  */
5722 /* Vector of {0, 0, 0,...}. */
5723 tree zero_vec
= build_zero_cst (vectype
);
5725 /* Find maximum value from the vector of found indexes. */
5726 tree max_index
= make_ssa_name (index_scalar_type
);
5727 gcall
*max_index_stmt
= gimple_build_call_internal (IFN_REDUC_MAX
,
5728 1, induction_index
);
5729 gimple_call_set_lhs (max_index_stmt
, max_index
);
5730 gsi_insert_before (&exit_gsi
, max_index_stmt
, GSI_SAME_STMT
);
5732 /* Vector of {max_index, max_index, max_index,...}. */
5733 tree max_index_vec
= make_ssa_name (index_vec_type
);
5734 tree max_index_vec_rhs
= build_vector_from_val (index_vec_type
,
5736 gimple
*max_index_vec_stmt
= gimple_build_assign (max_index_vec
,
5738 gsi_insert_before (&exit_gsi
, max_index_vec_stmt
, GSI_SAME_STMT
);
5740 /* Next we compare the new vector (MAX_INDEX_VEC) full of max indexes
5741 with the vector (INDUCTION_INDEX) of found indexes, choosing values
5742 from the data vector (REDUC_INPUTS 0) for matches, 0 (ZERO_VEC)
5743 otherwise. Only one value should match, resulting in a vector
5744 (VEC_COND) with one data value and the rest zeros.
5745 In the case where the loop never made any matches, every index will
5746 match, resulting in a vector with all data values (which will all be
5747 the default value). */
5749 /* Compare the max index vector to the vector of found indexes to find
5750 the position of the max value. */
5751 tree vec_compare
= make_ssa_name (index_vec_cmp_type
);
5752 gimple
*vec_compare_stmt
= gimple_build_assign (vec_compare
, EQ_EXPR
,
5755 gsi_insert_before (&exit_gsi
, vec_compare_stmt
, GSI_SAME_STMT
);
5757 /* Use the compare to choose either values from the data vector or
5759 tree vec_cond
= make_ssa_name (vectype
);
5760 gimple
*vec_cond_stmt
= gimple_build_assign (vec_cond
, VEC_COND_EXPR
,
5764 gsi_insert_before (&exit_gsi
, vec_cond_stmt
, GSI_SAME_STMT
);
      /* Finally we need to extract the data value from the vector (VEC_COND)
	 into a scalar (MATCHED_DATA_REDUC).  Logically we want to do an OR
	 reduction, but because this doesn't exist, we can use a MAX reduction
	 instead.  The data value might be signed or a float so we need to cast
	 it first.
	 In the case where the loop never made any matches, the data values are
	 all identical, and so will reduce down correctly.  */
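      /* Illustrative sketch (not the emitted IL): with exactly one lane
	 selected the selected vector looks like {0, ..., 0, x, 0, ..., 0}
	 when reinterpreted as unsigned, so

	     data_reduc = REDUC_MAX (VIEW_CONVERT <unsigned_vec> (vec_cond));

	 recovers x, since the unsigned MAX of x and a set of zeros is x.
	 With no matches every lane holds the same default value, and MAX
	 of identical values is that value.  */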
5774 /* Make the matched data values unsigned. */
5775 tree vec_cond_cast
= make_ssa_name (vectype_unsigned
);
5776 tree vec_cond_cast_rhs
= build1 (VIEW_CONVERT_EXPR
, vectype_unsigned
,
5778 gimple
*vec_cond_cast_stmt
= gimple_build_assign (vec_cond_cast
,
5781 gsi_insert_before (&exit_gsi
, vec_cond_cast_stmt
, GSI_SAME_STMT
);
5783 /* Reduce down to a scalar value. */
5784 tree data_reduc
= make_ssa_name (scalar_type_unsigned
);
5785 gcall
*data_reduc_stmt
= gimple_build_call_internal (IFN_REDUC_MAX
,
5787 gimple_call_set_lhs (data_reduc_stmt
, data_reduc
);
5788 gsi_insert_before (&exit_gsi
, data_reduc_stmt
, GSI_SAME_STMT
);
5790 /* Convert the reduced value back to the result type and set as the
5792 gimple_seq stmts
= NULL
;
5793 new_temp
= gimple_build (&stmts
, VIEW_CONVERT_EXPR
, scalar_type
,
5795 gsi_insert_seq_before (&exit_gsi
, stmts
, GSI_SAME_STMT
);
5796 scalar_results
.safe_push (new_temp
);
5798 else if (STMT_VINFO_REDUC_TYPE (reduc_info
) == COND_REDUCTION
5799 && reduc_fn
== IFN_LAST
)
5801 /* Condition reduction without supported IFN_REDUC_MAX. Generate
5803 idx_val = induction_index[0];
5804 val = data_reduc[0];
5805 for (idx = 0, val = init, i = 0; i < nelts; ++i)
5806 if (induction_index[i] > idx_val)
5807 val = data_reduc[i], idx_val = induction_index[i];
5810 tree data_eltype
= TREE_TYPE (vectype
);
5811 tree idx_eltype
= TREE_TYPE (TREE_TYPE (induction_index
));
5812 unsigned HOST_WIDE_INT el_size
= tree_to_uhwi (TYPE_SIZE (idx_eltype
));
5813 poly_uint64 nunits
= TYPE_VECTOR_SUBPARTS (TREE_TYPE (induction_index
));
5814 /* Enforced by vectorizable_reduction, which ensures we have target
5815 support before allowing a conditional reduction on variable-length
5817 unsigned HOST_WIDE_INT v_size
= el_size
* nunits
.to_constant ();
5818 tree idx_val
= NULL_TREE
, val
= NULL_TREE
;
5819 for (unsigned HOST_WIDE_INT off
= 0; off
< v_size
; off
+= el_size
)
5821 tree old_idx_val
= idx_val
;
5823 idx_val
= make_ssa_name (idx_eltype
);
5824 epilog_stmt
= gimple_build_assign (idx_val
, BIT_FIELD_REF
,
5825 build3 (BIT_FIELD_REF
, idx_eltype
,
5827 bitsize_int (el_size
),
5828 bitsize_int (off
)));
5829 gsi_insert_before (&exit_gsi
, epilog_stmt
, GSI_SAME_STMT
);
5830 val
= make_ssa_name (data_eltype
);
5831 epilog_stmt
= gimple_build_assign (val
, BIT_FIELD_REF
,
5832 build3 (BIT_FIELD_REF
,
5835 bitsize_int (el_size
),
5836 bitsize_int (off
)));
5837 gsi_insert_before (&exit_gsi
, epilog_stmt
, GSI_SAME_STMT
);
5840 tree new_idx_val
= idx_val
;
5841 if (off
!= v_size
- el_size
)
5843 new_idx_val
= make_ssa_name (idx_eltype
);
5844 epilog_stmt
= gimple_build_assign (new_idx_val
,
5847 gsi_insert_before (&exit_gsi
, epilog_stmt
, GSI_SAME_STMT
);
5849 tree cond
= make_ssa_name (boolean_type_node
);
5850 epilog_stmt
= gimple_build_assign (cond
, GT_EXPR
,
5851 idx_val
, old_idx_val
);
5852 gsi_insert_before (&exit_gsi
, epilog_stmt
, GSI_SAME_STMT
);
5853 tree new_val
= make_ssa_name (data_eltype
);
5854 epilog_stmt
= gimple_build_assign (new_val
, COND_EXPR
,
5855 cond
, val
, old_val
);
5856 gsi_insert_before (&exit_gsi
, epilog_stmt
, GSI_SAME_STMT
);
5857 idx_val
= new_idx_val
;
5861 /* Convert the reduced value back to the result type and set as the
5863 gimple_seq stmts
= NULL
;
5864 val
= gimple_convert (&stmts
, scalar_type
, val
);
5865 gsi_insert_seq_before (&exit_gsi
, stmts
, GSI_SAME_STMT
);
5866 scalar_results
.safe_push (val
);
5869 /* 2.3 Create the reduction code, using one of the three schemes described
5870 above. In SLP we simply need to extract all the elements from the
5871 vector (without reducing them), so we use scalar shifts. */
5872 else if (reduc_fn
!= IFN_LAST
&& !slp_reduc
)
5878 v_out2 = reduc_expr <v_out1> */
5880 if (dump_enabled_p ())
5881 dump_printf_loc (MSG_NOTE
, vect_location
,
5882 "Reduce using direct vector reduction.\n");
5884 gimple_seq stmts
= NULL
;
5885 vec_elem_type
= TREE_TYPE (vectype
);
5886 new_temp
= gimple_build (&stmts
, as_combined_fn (reduc_fn
),
5887 vec_elem_type
, reduc_inputs
[0]);
5888 new_temp
= gimple_convert (&stmts
, scalar_type
, new_temp
);
5889 gsi_insert_seq_before (&exit_gsi
, stmts
, GSI_SAME_STMT
);
5891 if ((STMT_VINFO_REDUC_TYPE (reduc_info
) == INTEGER_INDUC_COND_REDUCTION
)
      /* Earlier we set the initial value to be a vector of induc_val
	 values.  Check the result and if it is induc_val then replace
	 with the original initial value, unless induc_val is
	 the same as initial_def already.  */
5898 tree zcompare
= make_ssa_name (boolean_type_node
);
5899 epilog_stmt
= gimple_build_assign (zcompare
, EQ_EXPR
,
5900 new_temp
, induc_val
);
5901 gsi_insert_before (&exit_gsi
, epilog_stmt
, GSI_SAME_STMT
);
5902 tree initial_def
= reduc_info
->reduc_initial_values
[0];
5903 tmp
= make_ssa_name (new_scalar_dest
);
5904 epilog_stmt
= gimple_build_assign (tmp
, COND_EXPR
, zcompare
,
5905 initial_def
, new_temp
);
5906 gsi_insert_before (&exit_gsi
, epilog_stmt
, GSI_SAME_STMT
);
5910 scalar_results
.safe_push (new_temp
);
5912 else if (direct_slp_reduc
)
      /* Here we create one vector for each of the REDUC_GROUP_SIZE results,
	 with the elements for other SLP statements replaced with the
	 neutral value.  We can then do a normal reduction on each vector.  */
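      /* Illustrative sketch, assuming group_size == 2 and an add reduction:
	 the lanes of REDUC_INPUTS[0] alternate between the two SLP results,
	 so we conceptually compute

	     vec0 = (index & 1) == 0 ? reduc_inputs[0] : {0, 0, ...};
	     vec1 = (index & 1) == 1 ? reduc_inputs[0] : {0, 0, ...};
	     scalar_results = { REDUC_PLUS (vec0), REDUC_PLUS (vec1) };

	 where 0 is the neutral value for PLUS.  */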
5918 /* Enforced by vectorizable_reduction. */
5919 gcc_assert (reduc_inputs
.length () == 1);
5920 gcc_assert (pow2p_hwi (group_size
));
5922 gimple_seq seq
= NULL
;
5924 /* Build a vector {0, 1, 2, ...}, with the same number of elements
5925 and the same element size as VECTYPE. */
5926 tree index
= build_index_vector (vectype
, 0, 1);
5927 tree index_type
= TREE_TYPE (index
);
5928 tree index_elt_type
= TREE_TYPE (index_type
);
5929 tree mask_type
= truth_type_for (index_type
);
5931 /* Create a vector that, for each element, identifies which of
5932 the REDUC_GROUP_SIZE results should use it. */
5933 tree index_mask
= build_int_cst (index_elt_type
, group_size
- 1);
5934 index
= gimple_build (&seq
, BIT_AND_EXPR
, index_type
, index
,
5935 build_vector_from_val (index_type
, index_mask
));
5937 /* Get a neutral vector value. This is simply a splat of the neutral
5938 scalar value if we have one, otherwise the initial scalar value
5939 is itself a neutral value. */
5940 tree vector_identity
= NULL_TREE
;
5941 tree neutral_op
= NULL_TREE
;
5944 tree initial_value
= NULL_TREE
;
5945 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info
))
5946 initial_value
= reduc_info
->reduc_initial_values
[0];
5947 neutral_op
= neutral_op_for_reduction (TREE_TYPE (vectype
), code
,
5951 vector_identity
= gimple_build_vector_from_val (&seq
, vectype
,
5953 for (unsigned int i
= 0; i
< group_size
; ++i
)
	      /* If there's no universal neutral value, we can use the
		 initial scalar value from the original PHI.  This is used
		 for MIN and MAX reduction, for example.  */
5960 tree scalar_value
= reduc_info
->reduc_initial_values
[i
];
5961 scalar_value
= gimple_convert (&seq
, TREE_TYPE (vectype
),
5963 vector_identity
= gimple_build_vector_from_val (&seq
, vectype
,
5967 /* Calculate the equivalent of:
5969 sel[j] = (index[j] == i);
5971 which selects the elements of REDUC_INPUTS[0] that should
5972 be included in the result. */
5973 tree compare_val
= build_int_cst (index_elt_type
, i
);
5974 compare_val
= build_vector_from_val (index_type
, compare_val
);
5975 tree sel
= gimple_build (&seq
, EQ_EXPR
, mask_type
,
5976 index
, compare_val
);
5978 /* Calculate the equivalent of:
5980 vec = seq ? reduc_inputs[0] : vector_identity;
5982 VEC is now suitable for a full vector reduction. */
5983 tree vec
= gimple_build (&seq
, VEC_COND_EXPR
, vectype
,
5984 sel
, reduc_inputs
[0], vector_identity
);
5986 /* Do the reduction and convert it to the appropriate type. */
5987 tree scalar
= gimple_build (&seq
, as_combined_fn (reduc_fn
),
5988 TREE_TYPE (vectype
), vec
);
5989 scalar
= gimple_convert (&seq
, scalar_type
, scalar
);
5990 scalar_results
.safe_push (scalar
);
5992 gsi_insert_seq_before (&exit_gsi
, seq
, GSI_SAME_STMT
);
5996 bool reduce_with_shift
;
5999 gcc_assert (slp_reduc
|| reduc_inputs
.length () == 1);
6001 /* See if the target wants to do the final (shift) reduction
6002 in a vector mode of smaller size and first reduce upper/lower
6003 halves against each other. */
6004 enum machine_mode mode1
= mode
;
6005 tree stype
= TREE_TYPE (vectype
);
6006 unsigned nunits
= TYPE_VECTOR_SUBPARTS (vectype
).to_constant ();
6007 unsigned nunits1
= nunits
;
6008 if ((mode1
= targetm
.vectorize
.split_reduction (mode
)) != mode
6009 && reduc_inputs
.length () == 1)
6011 nunits1
= GET_MODE_NUNITS (mode1
).to_constant ();
6012 /* For SLP reductions we have to make sure lanes match up, but
6013 since we're doing individual element final reduction reducing
6014 vector width here is even more important.
6015 ??? We can also separate lanes with permutes, for the common
6016 case of power-of-two group-size odd/even extracts would work. */
6017 if (slp_reduc
&& nunits
!= nunits1
)
6019 nunits1
= least_common_multiple (nunits1
, group_size
);
6020 gcc_assert (exact_log2 (nunits1
) != -1 && nunits1
<= nunits
);
6024 && (mode1
= targetm
.vectorize
.split_reduction (mode
)) != mode
)
6025 nunits1
= GET_MODE_NUNITS (mode1
).to_constant ();
6027 tree vectype1
= get_related_vectype_for_scalar_type (TYPE_MODE (vectype
),
6029 reduce_with_shift
= have_whole_vector_shift (mode1
);
6030 if (!VECTOR_MODE_P (mode1
)
6031 || !directly_supported_p (code
, vectype1
))
6032 reduce_with_shift
= false;
6034 /* First reduce the vector to the desired vector size we should
6035 do shift reduction on by combining upper and lower halves. */
6036 gimple_seq stmts
= NULL
;
6037 new_temp
= vect_create_partial_epilog (reduc_inputs
[0], vectype1
,
6039 gsi_insert_seq_before (&exit_gsi
, stmts
, GSI_SAME_STMT
);
6040 reduc_inputs
[0] = new_temp
;
6042 if (reduce_with_shift
&& !slp_reduc
)
6044 int element_bitsize
= tree_to_uhwi (bitsize
);
6045 /* Enforced by vectorizable_reduction, which disallows SLP reductions
6046 for variable-length vectors and also requires direct target support
6047 for loop reductions. */
6048 int vec_size_in_bits
= tree_to_uhwi (TYPE_SIZE (vectype1
));
6049 int nelements
= vec_size_in_bits
/ element_bitsize
;
6050 vec_perm_builder sel
;
6051 vec_perm_indices indices
;
6055 tree zero_vec
= build_zero_cst (vectype1
);
6057 for (offset = nelements/2; offset >= 1; offset/=2)
6059 Create: va' = vec_shift <va, offset>
6060 Create: va = vop <va, va'>
6065 if (dump_enabled_p ())
6066 dump_printf_loc (MSG_NOTE
, vect_location
,
6067 "Reduce using vector shifts\n");
6069 gimple_seq stmts
= NULL
;
6070 new_temp
= gimple_convert (&stmts
, vectype1
, new_temp
);
6071 for (elt_offset
= nelements
/ 2;
6075 calc_vec_perm_mask_for_shift (elt_offset
, nelements
, &sel
);
6076 indices
.new_vector (sel
, 2, nelements
);
6077 tree mask
= vect_gen_perm_mask_any (vectype1
, indices
);
6078 new_name
= gimple_build (&stmts
, VEC_PERM_EXPR
, vectype1
,
6079 new_temp
, zero_vec
, mask
);
6080 new_temp
= gimple_build (&stmts
, code
,
6081 vectype1
, new_name
, new_temp
);
6083 gsi_insert_seq_before (&exit_gsi
, stmts
, GSI_SAME_STMT
);
6085 /* 2.4 Extract the final scalar result. Create:
6086 s_out3 = extract_field <v_out2, bitpos> */
6088 if (dump_enabled_p ())
6089 dump_printf_loc (MSG_NOTE
, vect_location
,
6090 "extract scalar result\n");
6092 rhs
= build3 (BIT_FIELD_REF
, scalar_type
, new_temp
,
6093 bitsize
, bitsize_zero_node
);
6094 epilog_stmt
= gimple_build_assign (new_scalar_dest
, rhs
);
6095 new_temp
= make_ssa_name (new_scalar_dest
, epilog_stmt
);
6096 gimple_assign_set_lhs (epilog_stmt
, new_temp
);
6097 gsi_insert_before (&exit_gsi
, epilog_stmt
, GSI_SAME_STMT
);
6098 scalar_results
.safe_push (new_temp
);
6103 s = extract_field <v_out2, 0>
6104 for (offset = element_size;
6105 offset < vector_size;
6106 offset += element_size;)
6108 Create: s' = extract_field <v_out2, offset>
6109 Create: s = op <s, s'> // For non SLP cases
6112 if (dump_enabled_p ())
6113 dump_printf_loc (MSG_NOTE
, vect_location
,
6114 "Reduce using scalar code.\n");
6116 int vec_size_in_bits
= tree_to_uhwi (TYPE_SIZE (vectype1
));
6117 int element_bitsize
= tree_to_uhwi (bitsize
);
6118 tree compute_type
= TREE_TYPE (vectype
);
6119 gimple_seq stmts
= NULL
;
6120 FOR_EACH_VEC_ELT (reduc_inputs
, i
, vec_temp
)
6123 new_temp
= gimple_build (&stmts
, BIT_FIELD_REF
, compute_type
,
6124 vec_temp
, bitsize
, bitsize_zero_node
);
	      /* In SLP we don't need to apply the reduction operation, so we
		 just collect the s' values in SCALAR_RESULTS.  */
6129 scalar_results
.safe_push (new_temp
);
6131 for (bit_offset
= element_bitsize
;
6132 bit_offset
< vec_size_in_bits
;
6133 bit_offset
+= element_bitsize
)
6135 tree bitpos
= bitsize_int (bit_offset
);
6136 new_name
= gimple_build (&stmts
, BIT_FIELD_REF
,
6137 compute_type
, vec_temp
,
		  /* In SLP we don't need to apply the reduction operation,
		     so we just collect the s' values in SCALAR_RESULTS.  */
6143 new_temp
= new_name
;
6144 scalar_results
.safe_push (new_name
);
6147 new_temp
= gimple_build (&stmts
, code
, compute_type
,
6148 new_name
, new_temp
);
	  /* The only case where we need to reduce scalar results in SLP is
	     unrolling.  If the size of SCALAR_RESULTS is greater than
	     REDUC_GROUP_SIZE, we reduce them by combining elements modulo
	     REDUC_GROUP_SIZE.  */
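	  /* Illustrative example: with group_size == 2 and four collected
	     scalars s0..s3 from an unrolled SLP reduction we form

		 scalar_results[0] = s0 op s2
		 scalar_results[1] = s1 op s3

	     i.e. element j is folded into slot j % group_size and the
	     vector of results is then truncated to group_size.  */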
6158 tree res
, first_res
, new_res
;
6160 /* Reduce multiple scalar results in case of SLP unrolling. */
6161 for (j
= group_size
; scalar_results
.iterate (j
, &res
);
6164 first_res
= scalar_results
[j
% group_size
];
6165 new_res
= gimple_build (&stmts
, code
, compute_type
,
6167 scalar_results
[j
% group_size
] = new_res
;
6169 scalar_results
.truncate (group_size
);
6170 for (k
= 0; k
< group_size
; k
++)
6171 scalar_results
[k
] = gimple_convert (&stmts
, scalar_type
,
6176 /* Not SLP - we have one scalar to keep in SCALAR_RESULTS. */
6177 new_temp
= gimple_convert (&stmts
, scalar_type
, new_temp
);
6178 scalar_results
.safe_push (new_temp
);
6181 gsi_insert_seq_before (&exit_gsi
, stmts
, GSI_SAME_STMT
);
6184 if ((STMT_VINFO_REDUC_TYPE (reduc_info
) == INTEGER_INDUC_COND_REDUCTION
)
      /* Earlier we set the initial value to be a vector of induc_val
	 values.  Check the result and if it is induc_val then replace
	 with the original initial value, unless induc_val is
	 the same as initial_def already.  */
6191 tree zcompare
= make_ssa_name (boolean_type_node
);
6192 epilog_stmt
= gimple_build_assign (zcompare
, EQ_EXPR
, new_temp
,
6194 gsi_insert_before (&exit_gsi
, epilog_stmt
, GSI_SAME_STMT
);
6195 tree initial_def
= reduc_info
->reduc_initial_values
[0];
6196 tree tmp
= make_ssa_name (new_scalar_dest
);
6197 epilog_stmt
= gimple_build_assign (tmp
, COND_EXPR
, zcompare
,
6198 initial_def
, new_temp
);
6199 gsi_insert_before (&exit_gsi
, epilog_stmt
, GSI_SAME_STMT
);
6200 scalar_results
[0] = tmp
;
  /* 2.5 Adjust the final result by the initial value of the reduction
	 variable.  (When such adjustment is not needed, then
	 'adjustment_def' is zero).  For example, if code is PLUS we create:
	 new_temp = loop_exit_def + adjustment_def  */
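  /* Illustrative example: if the scalar loop was  sum = 10; sum += a[i];
     and the vector accumulator was started at {0, ..., 0}, then
     adjustment_def is 10 and the statement built here is conceptually
     s_out4 = s_out3 + 10, folding the scalar initial value back in.  */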
6211 gcc_assert (!slp_reduc
);
6212 gimple_seq stmts
= NULL
;
6215 gcc_assert (VECTOR_TYPE_P (TREE_TYPE (adjustment_def
)));
6216 adjustment_def
= gimple_convert (&stmts
, vectype
, adjustment_def
);
6217 new_temp
= gimple_build (&stmts
, code
, vectype
,
6218 reduc_inputs
[0], adjustment_def
);
6222 new_temp
= scalar_results
[0];
6223 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def
)) != VECTOR_TYPE
);
6224 adjustment_def
= gimple_convert (&stmts
, scalar_type
, adjustment_def
);
6225 new_temp
= gimple_build (&stmts
, code
, scalar_type
,
6226 new_temp
, adjustment_def
);
6229 epilog_stmt
= gimple_seq_last_stmt (stmts
);
6230 gsi_insert_seq_before (&exit_gsi
, stmts
, GSI_SAME_STMT
);
6231 scalar_results
[0] = new_temp
;
6234 /* Record this operation if it could be reused by the epilogue loop. */
6235 if (STMT_VINFO_REDUC_TYPE (reduc_info
) == TREE_CODE_REDUCTION
)
6236 loop_vinfo
->reusable_accumulators
.put (scalar_results
[0],
6237 { orig_reduc_input
, reduc_info
});
6242 /* 2.6 Handle the loop-exit phis. Replace the uses of scalar loop-exit
6243 phis with new adjusted scalar results, i.e., replace use <s_out0>
6248 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
6249 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
6250 v_out2 = reduce <v_out1>
6251 s_out3 = extract_field <v_out2, 0>
6252 s_out4 = adjust_result <s_out3>
6259 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
6260 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
6261 v_out2 = reduce <v_out1>
6262 s_out3 = extract_field <v_out2, 0>
6263 s_out4 = adjust_result <s_out3>
6267 gcc_assert (live_out_stmts
.size () == scalar_results
.length ());
6268 for (k
= 0; k
< live_out_stmts
.size (); k
++)
6270 stmt_vec_info scalar_stmt_info
= vect_orig_stmt (live_out_stmts
[k
]);
6271 scalar_dest
= gimple_get_lhs (scalar_stmt_info
->stmt
);
6274 /* Find the loop-closed-use at the loop exit of the original scalar
6275 result. (The reduction result is expected to have two immediate uses,
6276 one at the latch block, and one at the loop exit). For double
6277 reductions we are looking for exit phis of the outer loop. */
6278 FOR_EACH_IMM_USE_FAST (use_p
, imm_iter
, scalar_dest
)
6280 if (!flow_bb_inside_loop_p (loop
, gimple_bb (USE_STMT (use_p
))))
6282 if (!is_gimple_debug (USE_STMT (use_p
)))
6283 phis
.safe_push (USE_STMT (use_p
));
6287 if (double_reduc
&& gimple_code (USE_STMT (use_p
)) == GIMPLE_PHI
)
6289 tree phi_res
= PHI_RESULT (USE_STMT (use_p
));
6291 FOR_EACH_IMM_USE_FAST (phi_use_p
, phi_imm_iter
, phi_res
)
6293 if (!flow_bb_inside_loop_p (loop
,
6294 gimple_bb (USE_STMT (phi_use_p
)))
6295 && !is_gimple_debug (USE_STMT (phi_use_p
)))
6296 phis
.safe_push (USE_STMT (phi_use_p
));
6302 FOR_EACH_VEC_ELT (phis
, i
, exit_phi
)
6304 /* Replace the uses: */
6305 orig_name
= PHI_RESULT (exit_phi
);
6307 /* Look for a single use at the target of the skip edge. */
6308 if (unify_with_main_loop_p
)
6310 use_operand_p use_p
;
6312 if (!single_imm_use (orig_name
, &use_p
, &user
))
6314 orig_name
= gimple_get_lhs (user
);
6317 scalar_result
= scalar_results
[k
];
6318 FOR_EACH_IMM_USE_STMT (use_stmt
, imm_iter
, orig_name
)
6320 FOR_EACH_IMM_USE_ON_STMT (use_p
, imm_iter
)
6321 SET_USE (use_p
, scalar_result
);
6322 update_stmt (use_stmt
);
/* Return a vector of type VECTYPE that is equal to the vector select
   operation "MASK ? VEC : IDENTITY".  Insert the select statements
   before GSI.  */
6335 merge_with_identity (gimple_stmt_iterator
*gsi
, tree mask
, tree vectype
,
6336 tree vec
, tree identity
)
6338 tree cond
= make_temp_ssa_name (vectype
, NULL
, "cond");
6339 gimple
*new_stmt
= gimple_build_assign (cond
, VEC_COND_EXPR
,
6340 mask
, vec
, identity
);
6341 gsi_insert_before (gsi
, new_stmt
, GSI_SAME_STMT
);
/* Successively apply CODE to each element of VECTOR_RHS, in left-to-right
   order, starting with LHS.  Insert the extraction statements before GSI and
   associate the new scalar SSA names with variable SCALAR_DEST.
   Return the SSA name for the result.  */
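/* For example (illustrative only), with a 4-element VECTOR_RHS and
   PLUS_EXPR the expansion produced below is the strictly ordered chain

       s1 = lhs + rhs[0];
       s2 = s1  + rhs[1];
       s3 = s2  + rhs[2];
       s4 = s3  + rhs[3];

   which preserves the scalar evaluation order required by in-order
   (fold-left) reductions.  */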
6351 vect_expand_fold_left (gimple_stmt_iterator
*gsi
, tree scalar_dest
,
6352 tree_code code
, tree lhs
, tree vector_rhs
)
6354 tree vectype
= TREE_TYPE (vector_rhs
);
6355 tree scalar_type
= TREE_TYPE (vectype
);
6356 tree bitsize
= TYPE_SIZE (scalar_type
);
6357 unsigned HOST_WIDE_INT vec_size_in_bits
= tree_to_uhwi (TYPE_SIZE (vectype
));
6358 unsigned HOST_WIDE_INT element_bitsize
= tree_to_uhwi (bitsize
);
6360 for (unsigned HOST_WIDE_INT bit_offset
= 0;
6361 bit_offset
< vec_size_in_bits
;
6362 bit_offset
+= element_bitsize
)
6364 tree bitpos
= bitsize_int (bit_offset
);
6365 tree rhs
= build3 (BIT_FIELD_REF
, scalar_type
, vector_rhs
,
6368 gassign
*stmt
= gimple_build_assign (scalar_dest
, rhs
);
6369 rhs
= make_ssa_name (scalar_dest
, stmt
);
6370 gimple_assign_set_lhs (stmt
, rhs
);
6371 gsi_insert_before (gsi
, stmt
, GSI_SAME_STMT
);
6373 stmt
= gimple_build_assign (scalar_dest
, code
, lhs
, rhs
);
6374 tree new_name
= make_ssa_name (scalar_dest
, stmt
);
6375 gimple_assign_set_lhs (stmt
, new_name
);
6376 gsi_insert_before (gsi
, stmt
, GSI_SAME_STMT
);
6382 /* Get a masked internal function equivalent to REDUC_FN. VECTYPE_IN is the
6383 type of the vector input. */
6386 get_masked_reduction_fn (internal_fn reduc_fn
, tree vectype_in
)
6388 internal_fn mask_reduc_fn
;
6392 case IFN_FOLD_LEFT_PLUS
:
6393 mask_reduc_fn
= IFN_MASK_FOLD_LEFT_PLUS
;
6400 if (direct_internal_fn_supported_p (mask_reduc_fn
, vectype_in
,
6401 OPTIMIZE_FOR_SPEED
))
6402 return mask_reduc_fn
;
6406 /* Perform an in-order reduction (FOLD_LEFT_REDUCTION). STMT_INFO is the
6407 statement that sets the live-out value. REDUC_DEF_STMT is the phi
6408 statement. CODE is the operation performed by STMT_INFO and OPS are
6409 its scalar operands. REDUC_INDEX is the index of the operand in
6410 OPS that is set by REDUC_DEF_STMT. REDUC_FN is the function that
6411 implements in-order reduction, or IFN_LAST if we should open-code it.
6412 VECTYPE_IN is the type of the vector input. MASKS specifies the masks
6413 that should be used to control the operation in a fully-masked loop. */
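/* Conceptually (an illustration, not the exact emitted IL): for an
   in-order floating-point sum  res += a[i]  the vector loop keeps a
   single scalar accumulator and folds each vector of loaded elements
   into it left-to-right, e.g.

       res = MASK_FOLD_LEFT_PLUS (res, vec_a, loop_mask);

   rather than keeping per-lane partial sums, so the result matches the
   scalar evaluation order exactly.  */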
6416 vectorize_fold_left_reduction (loop_vec_info loop_vinfo
,
6417 stmt_vec_info stmt_info
,
6418 gimple_stmt_iterator
*gsi
,
6419 gimple
**vec_stmt
, slp_tree slp_node
,
6420 gimple
*reduc_def_stmt
,
6421 tree_code code
, internal_fn reduc_fn
,
6422 tree ops
[3], tree vectype_in
,
6423 int reduc_index
, vec_loop_masks
*masks
)
6425 class loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
6426 tree vectype_out
= STMT_VINFO_VECTYPE (stmt_info
);
6427 internal_fn mask_reduc_fn
= get_masked_reduction_fn (reduc_fn
, vectype_in
);
6433 ncopies
= vect_get_num_copies (loop_vinfo
, vectype_in
);
6435 gcc_assert (!nested_in_vect_loop_p (loop
, stmt_info
));
6436 gcc_assert (ncopies
== 1);
6437 gcc_assert (TREE_CODE_LENGTH (code
) == binary_op
);
6440 gcc_assert (known_eq (TYPE_VECTOR_SUBPARTS (vectype_out
),
6441 TYPE_VECTOR_SUBPARTS (vectype_in
)));
6443 tree op0
= ops
[1 - reduc_index
];
6446 stmt_vec_info scalar_dest_def_info
;
6447 auto_vec
<tree
> vec_oprnds0
;
6450 auto_vec
<vec
<tree
> > vec_defs (2);
6451 vect_get_slp_defs (loop_vinfo
, slp_node
, &vec_defs
);
6452 vec_oprnds0
.safe_splice (vec_defs
[1 - reduc_index
]);
6453 vec_defs
[0].release ();
6454 vec_defs
[1].release ();
6455 group_size
= SLP_TREE_SCALAR_STMTS (slp_node
).length ();
6456 scalar_dest_def_info
= SLP_TREE_SCALAR_STMTS (slp_node
)[group_size
- 1];
6460 vect_get_vec_defs_for_operand (loop_vinfo
, stmt_info
, 1,
6462 scalar_dest_def_info
= stmt_info
;
6465 tree scalar_dest
= gimple_assign_lhs (scalar_dest_def_info
->stmt
);
6466 tree scalar_type
= TREE_TYPE (scalar_dest
);
6467 tree reduc_var
= gimple_phi_result (reduc_def_stmt
);
6469 int vec_num
= vec_oprnds0
.length ();
6470 gcc_assert (vec_num
== 1 || slp_node
);
6471 tree vec_elem_type
= TREE_TYPE (vectype_out
);
6472 gcc_checking_assert (useless_type_conversion_p (scalar_type
, vec_elem_type
));
6474 tree vector_identity
= NULL_TREE
;
6475 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo
))
6476 vector_identity
= build_zero_cst (vectype_out
);
6478 tree scalar_dest_var
= vect_create_destination_var (scalar_dest
, NULL
);
6481 FOR_EACH_VEC_ELT (vec_oprnds0
, i
, def0
)
6484 tree mask
= NULL_TREE
;
6485 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo
))
6486 mask
= vect_get_loop_mask (gsi
, masks
, vec_num
, vectype_in
, i
);
6488 /* Handle MINUS by adding the negative. */
6489 if (reduc_fn
!= IFN_LAST
&& code
== MINUS_EXPR
)
6491 tree negated
= make_ssa_name (vectype_out
);
6492 new_stmt
= gimple_build_assign (negated
, NEGATE_EXPR
, def0
);
6493 gsi_insert_before (gsi
, new_stmt
, GSI_SAME_STMT
);
6497 if (mask
&& mask_reduc_fn
== IFN_LAST
)
6498 def0
= merge_with_identity (gsi
, mask
, vectype_out
, def0
,
6501 /* On the first iteration the input is simply the scalar phi
6502 result, and for subsequent iterations it is the output of
6503 the preceding operation. */
6504 if (reduc_fn
!= IFN_LAST
|| (mask
&& mask_reduc_fn
!= IFN_LAST
))
6506 if (mask
&& mask_reduc_fn
!= IFN_LAST
)
6507 new_stmt
= gimple_build_call_internal (mask_reduc_fn
, 3, reduc_var
,
6510 new_stmt
= gimple_build_call_internal (reduc_fn
, 2, reduc_var
,
6512 /* For chained SLP reductions the output of the previous reduction
6513 operation serves as the input of the next. For the final statement
6514 the output cannot be a temporary - we reuse the original
6515 scalar destination of the last statement. */
6516 if (i
!= vec_num
- 1)
6518 gimple_set_lhs (new_stmt
, scalar_dest_var
);
6519 reduc_var
= make_ssa_name (scalar_dest_var
, new_stmt
);
6520 gimple_set_lhs (new_stmt
, reduc_var
);
6525 reduc_var
= vect_expand_fold_left (gsi
, scalar_dest_var
, code
,
6527 new_stmt
= SSA_NAME_DEF_STMT (reduc_var
);
6528 /* Remove the statement, so that we can use the same code paths
6529 as for statements that we've just created. */
6530 gimple_stmt_iterator tmp_gsi
= gsi_for_stmt (new_stmt
);
6531 gsi_remove (&tmp_gsi
, true);
6534 if (i
== vec_num
- 1)
6536 gimple_set_lhs (new_stmt
, scalar_dest
);
6537 vect_finish_replace_stmt (loop_vinfo
,
6538 scalar_dest_def_info
,
6542 vect_finish_stmt_generation (loop_vinfo
,
6543 scalar_dest_def_info
,
6547 SLP_TREE_VEC_STMTS (slp_node
).quick_push (new_stmt
);
6550 STMT_VINFO_VEC_STMTS (stmt_info
).safe_push (new_stmt
);
6551 *vec_stmt
= new_stmt
;
/* Function is_nonwrapping_integer_induction.

   Check that STMT_VINFO (which is part of loop LOOP) describes an
   induction that both increments and does not cause overflow.  */
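/* Informally, a sketch of the check performed below (no additional
   semantics intended): given BASE, STEP and an upper bound NI on the
   number of iterations, the induction is considered non-wrapping iff

       BASE + STEP * NI

   still fits in the precision of the PHI result type; the arithmetic is
   done in widest_int so the computation of the bound itself cannot
   overflow.  */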
6564 is_nonwrapping_integer_induction (stmt_vec_info stmt_vinfo
, class loop
*loop
)
6566 gphi
*phi
= as_a
<gphi
*> (stmt_vinfo
->stmt
);
6567 tree base
= STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo
);
6568 tree step
= STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo
);
6569 tree lhs_type
= TREE_TYPE (gimple_phi_result (phi
));
6570 widest_int ni
, max_loop_value
, lhs_max
;
6571 wi::overflow_type overflow
= wi::OVF_NONE
;
6573 /* Make sure the loop is integer based. */
6574 if (TREE_CODE (base
) != INTEGER_CST
6575 || TREE_CODE (step
) != INTEGER_CST
)
6578 /* Check that the max size of the loop will not wrap. */
6580 if (TYPE_OVERFLOW_UNDEFINED (lhs_type
))
6583 if (! max_stmt_executions (loop
, &ni
))
6586 max_loop_value
= wi::mul (wi::to_widest (step
), ni
, TYPE_SIGN (lhs_type
),
6591 max_loop_value
= wi::add (wi::to_widest (base
), max_loop_value
,
6592 TYPE_SIGN (lhs_type
), &overflow
);
6596 return (wi::min_precision (max_loop_value
, TYPE_SIGN (lhs_type
))
6597 <= TYPE_PRECISION (lhs_type
));
6600 /* Check if masking can be supported by inserting a conditional expression.
6601 CODE is the code for the operation. COND_FN is the conditional internal
6602 function, if it exists. VECTYPE_IN is the type of the vector input. */
6604 use_mask_by_cond_expr_p (code_helper code
, internal_fn cond_fn
,
6607 if (cond_fn
!= IFN_LAST
6608 && direct_internal_fn_supported_p (cond_fn
, vectype_in
,
6609 OPTIMIZE_FOR_SPEED
))
6612 if (code
.is_tree_code ())
6613 switch (tree_code (code
))
/* Insert a conditional expression to enable masked vectorization.  CODE is
   the code for the operation.  VOP is the array of operands.  MASK is the
   loop mask.  GSI is a statement iterator used to place the new conditional
   expression.  */
6630 build_vect_cond_expr (code_helper code
, tree vop
[3], tree mask
,
6631 gimple_stmt_iterator
*gsi
)
6633 switch (tree_code (code
))
6637 tree vectype
= TREE_TYPE (vop
[1]);
6638 tree zero
= build_zero_cst (vectype
);
6639 tree masked_op1
= make_temp_ssa_name (vectype
, NULL
, "masked_op1");
6640 gassign
*select
= gimple_build_assign (masked_op1
, VEC_COND_EXPR
,
6641 mask
, vop
[1], zero
);
6642 gsi_insert_before (gsi
, select
, GSI_SAME_STMT
);
6643 vop
[1] = masked_op1
;
6649 tree vectype
= TREE_TYPE (vop
[1]);
6650 tree masked_op1
= make_temp_ssa_name (vectype
, NULL
, "masked_op1");
6651 gassign
*select
= gimple_build_assign (masked_op1
, VEC_COND_EXPR
,
6652 mask
, vop
[1], vop
[0]);
6653 gsi_insert_before (gsi
, select
, GSI_SAME_STMT
);
6654 vop
[1] = masked_op1
;
6663 /* Function vectorizable_reduction.
6665 Check if STMT_INFO performs a reduction operation that can be vectorized.
6666 If VEC_STMT is also passed, vectorize STMT_INFO: create a vectorized
6667 stmt to replace it, put it in VEC_STMT, and insert it at GSI.
6668 Return true if STMT_INFO is vectorizable in this way.
6670 This function also handles reduction idioms (patterns) that have been
6671 recognized in advance during vect_pattern_recog. In this case, STMT_INFO
6672 may be of this form:
6673 X = pattern_expr (arg0, arg1, ..., X)
6674 and its STMT_VINFO_RELATED_STMT points to the last stmt in the original
6675 sequence that had been detected and replaced by the pattern-stmt
6678 This function also handles reduction of condition expressions, for example:
6679 for (int i = 0; i < N; i++)
6682 This is handled by vectorising the loop and creating an additional vector
6683 containing the loop indexes for which "a[i] < value" was true. In the
6684 function epilogue this is reduced to a single max value and then used to
6685 index into the vector of results.
6687 In some cases of reduction patterns, the type of the reduction variable X is
6688 different than the type of the other arguments of STMT_INFO.
6689 In such cases, the vectype that is used when transforming STMT_INFO into
6690 a vector stmt is different than the vectype that is used to determine the
6691 vectorization factor, because it consists of a different number of elements
6692 than the actual number of elements that are being operated upon in parallel.
6694 For example, consider an accumulation of shorts into an int accumulator.
6695 On some targets it's possible to vectorize this pattern operating on 8
6696 shorts at a time (hence, the vectype for purposes of determining the
6697 vectorization factor should be V8HI); on the other hand, the vectype that
6698 is used to create the vector form is actually V4SI (the type of the result).
6700 Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
6701 indicates what is the actual level of parallelism (V8HI in the example), so
6702 that the right vectorization factor would be derived. This vectype
6703 corresponds to the type of arguments to the reduction stmt, and should *NOT*
6704 be used to create the vectorized stmt. The right vectype for the vectorized
6705 stmt is obtained from the type of the result X:
6706 get_vectype_for_scalar_type (vinfo, TREE_TYPE (X))
6708 This means that, contrary to "regular" reductions (or "regular" stmts in
6709 general), the following equation:
6710 STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (vinfo, TREE_TYPE (X))
6711 does *NOT* necessarily hold for reduction patterns. */
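/* A compact illustration of the distinction above (the types are just an
   example): for an accumulation of shorts into an int,

       short a[N];  int acc;
       acc = acc + (int) a[i];

   STMT_VINFO_VECTYPE is the "short" vector type (e.g. V8HI), which fixes
   the vectorization factor, while the vectorized statement itself is
   built with the vector type of the result X (e.g. V4SI).  */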
6714 vectorizable_reduction (loop_vec_info loop_vinfo
,
6715 stmt_vec_info stmt_info
, slp_tree slp_node
,
6716 slp_instance slp_node_instance
,
6717 stmt_vector_for_cost
*cost_vec
)
6719 tree vectype_in
= NULL_TREE
;
6720 class loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
6721 enum vect_def_type cond_reduc_dt
= vect_unknown_def_type
;
6722 stmt_vec_info cond_stmt_vinfo
= NULL
;
6725 bool single_defuse_cycle
= false;
6726 bool nested_cycle
= false;
6727 bool double_reduc
= false;
6730 tree cr_index_scalar_type
= NULL_TREE
, cr_index_vector_type
= NULL_TREE
;
6731 tree cond_reduc_val
= NULL_TREE
;
6733 /* Make sure it was already recognized as a reduction computation. */
6734 if (STMT_VINFO_DEF_TYPE (stmt_info
) != vect_reduction_def
6735 && STMT_VINFO_DEF_TYPE (stmt_info
) != vect_double_reduction_def
6736 && STMT_VINFO_DEF_TYPE (stmt_info
) != vect_nested_cycle
)
6739 /* The stmt we store reduction analysis meta on. */
6740 stmt_vec_info reduc_info
= info_for_reduction (loop_vinfo
, stmt_info
);
6741 reduc_info
->is_reduc_info
= true;
6743 if (STMT_VINFO_DEF_TYPE (stmt_info
) == vect_nested_cycle
)
6745 if (is_a
<gphi
*> (stmt_info
->stmt
))
6749 /* We eventually need to set a vector type on invariant
6753 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node
), j
, child
)
6754 if (!vect_maybe_update_slp_op_vectype
6755 (child
, SLP_TREE_VECTYPE (slp_node
)))
6757 if (dump_enabled_p ())
6758 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
6759 "incompatible vector types for "
6764 /* Analysis for double-reduction is done on the outer
6765 loop PHI, nested cycles have no further restrictions. */
6766 STMT_VINFO_TYPE (stmt_info
) = cycle_phi_info_type
;
6769 STMT_VINFO_TYPE (stmt_info
) = reduc_vec_info_type
;
6773 stmt_vec_info orig_stmt_of_analysis
= stmt_info
;
6774 stmt_vec_info phi_info
= stmt_info
;
6775 if (!is_a
<gphi
*> (stmt_info
->stmt
))
6777 STMT_VINFO_TYPE (stmt_info
) = reduc_vec_info_type
;
6782 slp_node_instance
->reduc_phis
= slp_node
;
6783 /* ??? We're leaving slp_node to point to the PHIs, we only
6784 need it to get at the number of vector stmts which wasn't
6785 yet initialized for the instance root. */
6787 if (STMT_VINFO_DEF_TYPE (stmt_info
) == vect_double_reduction_def
)
6789 use_operand_p use_p
;
6791 bool res
= single_imm_use (gimple_phi_result (stmt_info
->stmt
),
6794 phi_info
= loop_vinfo
->lookup_stmt (use_stmt
);
6797 /* PHIs should not participate in patterns. */
6798 gcc_assert (!STMT_VINFO_RELATED_STMT (phi_info
));
6799 gphi
*reduc_def_phi
= as_a
<gphi
*> (phi_info
->stmt
);
  /* Verify that following REDUC_IDX from the latch def leads us back to the
     PHI, and compute the reduction chain length.  Discover the real
     reduction operation stmt on the way (stmt_info and slp_for_stmt_info).  */
6805 = PHI_ARG_DEF_FROM_EDGE (reduc_def_phi
,
6807 (gimple_bb (reduc_def_phi
)->loop_father
));
6808 unsigned reduc_chain_length
= 0;
6809 bool only_slp_reduc_chain
= true;
6811 slp_tree slp_for_stmt_info
= slp_node
? slp_node_instance
->root
: NULL
;
6812 while (reduc_def
!= PHI_RESULT (reduc_def_phi
))
6814 stmt_vec_info def
= loop_vinfo
->lookup_def (reduc_def
);
6815 stmt_vec_info vdef
= vect_stmt_to_vectorize (def
);
6816 if (STMT_VINFO_REDUC_IDX (vdef
) == -1)
6818 if (dump_enabled_p ())
6819 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
6820 "reduction chain broken by patterns.\n");
6823 if (!REDUC_GROUP_FIRST_ELEMENT (vdef
))
6824 only_slp_reduc_chain
= false;
6825 /* ??? For epilogue generation live members of the chain need
6826 to point back to the PHI via their original stmt for
6827 info_for_reduction to work. */
6828 if (STMT_VINFO_LIVE_P (vdef
))
6829 STMT_VINFO_REDUC_DEF (def
) = phi_info
;
6831 if (!gimple_extract_op (vdef
->stmt
, &op
))
6833 if (dump_enabled_p ())
6834 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
6835 "reduction chain includes unsupported"
6836 " statement type.\n");
6839 if (CONVERT_EXPR_CODE_P (op
.code
))
6841 if (!tree_nop_conversion_p (op
.type
, TREE_TYPE (op
.ops
[0])))
6843 if (dump_enabled_p ())
6844 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
6845 "conversion in the reduction chain.\n");
6849 else if (!stmt_info
)
6850 /* First non-conversion stmt. */
6852 reduc_def
= op
.ops
[STMT_VINFO_REDUC_IDX (vdef
)];
6853 reduc_chain_length
++;
6854 if (!stmt_info
&& slp_node
)
6855 slp_for_stmt_info
= SLP_TREE_CHILDREN (slp_for_stmt_info
)[0];
6857 /* PHIs should not participate in patterns. */
6858 gcc_assert (!STMT_VINFO_RELATED_STMT (phi_info
));
6860 if (nested_in_vect_loop_p (loop
, stmt_info
))
6863 nested_cycle
= true;
6866 /* STMT_VINFO_REDUC_DEF doesn't point to the first but the last
6868 if (slp_node
&& REDUC_GROUP_FIRST_ELEMENT (stmt_info
))
6870 gcc_assert (!REDUC_GROUP_NEXT_ELEMENT (stmt_info
));
6871 stmt_info
= REDUC_GROUP_FIRST_ELEMENT (stmt_info
);
6873 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info
))
6874 gcc_assert (slp_node
6875 && REDUC_GROUP_FIRST_ELEMENT (stmt_info
) == stmt_info
);
6877 /* 1. Is vectorizable reduction? */
6878 /* Not supportable if the reduction variable is used in the loop, unless
6879 it's a reduction chain. */
6880 if (STMT_VINFO_RELEVANT (stmt_info
) > vect_used_in_outer
6881 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info
))
6884 /* Reductions that are not used even in an enclosing outer-loop,
6885 are expected to be "live" (used out of the loop). */
6886 if (STMT_VINFO_RELEVANT (stmt_info
) == vect_unused_in_scope
6887 && !STMT_VINFO_LIVE_P (stmt_info
))
6890 /* 2. Has this been recognized as a reduction pattern?
6892 Check if STMT represents a pattern that has been recognized
6893 in earlier analysis stages. For stmts that represent a pattern,
6894 the STMT_VINFO_RELATED_STMT field records the last stmt in
6895 the original sequence that constitutes the pattern. */
6897 stmt_vec_info orig_stmt_info
= STMT_VINFO_RELATED_STMT (stmt_info
);
6900 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info
));
6901 gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info
));
6904 /* 3. Check the operands of the operation. The first operands are defined
6905 inside the loop body. The last operand is the reduction variable,
6906 which is defined by the loop-header-phi. */
6908 tree vectype_out
= STMT_VINFO_VECTYPE (stmt_info
);
6909 STMT_VINFO_REDUC_VECTYPE (reduc_info
) = vectype_out
;
6911 if (!gimple_extract_op (stmt_info
->stmt
, &op
))
6913 bool lane_reduc_code_p
= (op
.code
== DOT_PROD_EXPR
6914 || op
.code
== WIDEN_SUM_EXPR
6915 || op
.code
== SAD_EXPR
);
6917 if (!POINTER_TYPE_P (op
.type
) && !INTEGRAL_TYPE_P (op
.type
)
6918 && !SCALAR_FLOAT_TYPE_P (op
.type
))
6921 /* Do not try to vectorize bit-precision reductions. */
6922 if (!type_has_mode_precision_p (op
.type
))
  /* For lane-reducing ops we're reducing the number of reduction PHIs,
     which means the only use of the PHI result may be in the lane-reducing
     operation.  */
6927 if (lane_reduc_code_p
6928 && reduc_chain_length
!= 1
6929 && !only_slp_reduc_chain
)
6931 if (dump_enabled_p ())
6932 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
6933 "lane-reducing reduction with extra stmts.\n");
6937 /* All uses but the last are expected to be defined in the loop.
6938 The last use is the reduction variable. In case of nested cycle this
6939 assumption is not true: we use reduc_index to record the index of the
6940 reduction variable. */
6941 slp_tree
*slp_op
= XALLOCAVEC (slp_tree
, op
.num_ops
);
6942 /* We need to skip an extra operand for COND_EXPRs with embedded
6944 unsigned opno_adjust
= 0;
6945 if (op
.code
== COND_EXPR
&& COMPARISON_CLASS_P (op
.ops
[0]))
6947 for (i
= 0; i
< (int) op
.num_ops
; i
++)
6949 /* The condition of COND_EXPR is checked in vectorizable_condition(). */
6950 if (i
== 0 && op
.code
== COND_EXPR
)
6953 stmt_vec_info def_stmt_info
;
6954 enum vect_def_type dt
;
6955 if (!vect_is_simple_use (loop_vinfo
, stmt_info
, slp_for_stmt_info
,
6956 i
+ opno_adjust
, &op
.ops
[i
], &slp_op
[i
], &dt
,
6957 &tem
, &def_stmt_info
))
6959 if (dump_enabled_p ())
6960 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
6961 "use not simple.\n");
6964 if (i
== STMT_VINFO_REDUC_IDX (stmt_info
))
6967 /* There should be only one cycle def in the stmt, the one
6968 leading to reduc_def. */
6969 if (VECTORIZABLE_CYCLE_DEF (dt
))
6972 /* To properly compute ncopies we are interested in the widest
6973 non-reduction input type in case we're looking at a widening
6974 accumulation that we later handle in vect_transform_reduction. */
6975 if (lane_reduc_code_p
6978 || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in
)))
6979 < GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (tem
))))))
6982 if (op
.code
== COND_EXPR
)
6984 /* Record how the non-reduction-def value of COND_EXPR is defined. */
6985 if (dt
== vect_constant_def
)
6988 cond_reduc_val
= op
.ops
[i
];
6990 if (dt
== vect_induction_def
6992 && is_nonwrapping_integer_induction (def_stmt_info
, loop
))
6995 cond_stmt_vinfo
= def_stmt_info
;
7000 vectype_in
= STMT_VINFO_VECTYPE (phi_info
);
7001 STMT_VINFO_REDUC_VECTYPE_IN (reduc_info
) = vectype_in
;
7003 enum vect_reduction_type v_reduc_type
= STMT_VINFO_REDUC_TYPE (phi_info
);
7004 STMT_VINFO_REDUC_TYPE (reduc_info
) = v_reduc_type
;
7005 /* If we have a condition reduction, see if we can simplify it further. */
7006 if (v_reduc_type
== COND_REDUCTION
)
7011 /* When the condition uses the reduction value in the condition, fail. */
7012 if (STMT_VINFO_REDUC_IDX (stmt_info
) == 0)
7014 if (dump_enabled_p ())
7015 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
7016 "condition depends on previous iteration\n");
7020 if (reduc_chain_length
== 1
7021 && direct_internal_fn_supported_p (IFN_FOLD_EXTRACT_LAST
,
7022 vectype_in
, OPTIMIZE_FOR_SPEED
))
7024 if (dump_enabled_p ())
7025 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
7026 "optimizing condition reduction with"
7027 " FOLD_EXTRACT_LAST.\n");
7028 STMT_VINFO_REDUC_TYPE (reduc_info
) = EXTRACT_LAST_REDUCTION
;
7030 else if (cond_reduc_dt
== vect_induction_def
)
7033 = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (cond_stmt_vinfo
);
7034 tree step
= STMT_VINFO_LOOP_PHI_EVOLUTION_PART (cond_stmt_vinfo
);
7036 gcc_assert (TREE_CODE (base
) == INTEGER_CST
7037 && TREE_CODE (step
) == INTEGER_CST
);
7038 cond_reduc_val
= NULL_TREE
;
7039 enum tree_code cond_reduc_op_code
= ERROR_MARK
;
7040 tree res
= PHI_RESULT (STMT_VINFO_STMT (cond_stmt_vinfo
));
7041 if (!types_compatible_p (TREE_TYPE (res
), TREE_TYPE (base
)))
7043 /* Find a suitable value, for MAX_EXPR below base, for MIN_EXPR
7044 above base; punt if base is the minimum value of the type for
7045 MAX_EXPR or maximum value of the type for MIN_EXPR for now. */
7046 else if (tree_int_cst_sgn (step
) == -1)
7048 cond_reduc_op_code
= MIN_EXPR
;
7049 if (tree_int_cst_sgn (base
) == -1)
7050 cond_reduc_val
= build_int_cst (TREE_TYPE (base
), 0);
7051 else if (tree_int_cst_lt (base
,
7052 TYPE_MAX_VALUE (TREE_TYPE (base
))))
7054 = int_const_binop (PLUS_EXPR
, base
, integer_one_node
);
7058 cond_reduc_op_code
= MAX_EXPR
;
7059 if (tree_int_cst_sgn (base
) == 1)
7060 cond_reduc_val
= build_int_cst (TREE_TYPE (base
), 0);
7061 else if (tree_int_cst_lt (TYPE_MIN_VALUE (TREE_TYPE (base
)),
7064 = int_const_binop (MINUS_EXPR
, base
, integer_one_node
);
7068 if (dump_enabled_p ())
7069 dump_printf_loc (MSG_NOTE
, vect_location
,
7070 "condition expression based on "
7071 "integer induction.\n");
7072 STMT_VINFO_REDUC_CODE (reduc_info
) = cond_reduc_op_code
;
7073 STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info
)
7075 STMT_VINFO_REDUC_TYPE (reduc_info
) = INTEGER_INDUC_COND_REDUCTION
;
7078 else if (cond_reduc_dt
== vect_constant_def
)
7080 enum vect_def_type cond_initial_dt
;
7081 tree cond_initial_val
= vect_phi_initial_value (reduc_def_phi
);
7082 vect_is_simple_use (cond_initial_val
, loop_vinfo
, &cond_initial_dt
);
7083 if (cond_initial_dt
== vect_constant_def
7084 && types_compatible_p (TREE_TYPE (cond_initial_val
),
7085 TREE_TYPE (cond_reduc_val
)))
7087 tree e
= fold_binary (LE_EXPR
, boolean_type_node
,
7088 cond_initial_val
, cond_reduc_val
);
7089 if (e
&& (integer_onep (e
) || integer_zerop (e
)))
7091 if (dump_enabled_p ())
7092 dump_printf_loc (MSG_NOTE
, vect_location
,
7093 "condition expression based on "
7094 "compile time constant.\n");
7095 /* Record reduction code at analysis stage. */
7096 STMT_VINFO_REDUC_CODE (reduc_info
)
7097 = integer_onep (e
) ? MAX_EXPR
: MIN_EXPR
;
7098 STMT_VINFO_REDUC_TYPE (reduc_info
) = CONST_COND_REDUCTION
;
7104 if (STMT_VINFO_LIVE_P (phi_info
))
7110 ncopies
= vect_get_num_copies (loop_vinfo
, vectype_in
);
7112 gcc_assert (ncopies
>= 1);
7114 poly_uint64 nunits_out
= TYPE_VECTOR_SUBPARTS (vectype_out
);
7118 gcc_assert (STMT_VINFO_DEF_TYPE (reduc_info
)
7119 == vect_double_reduction_def
);
7120 double_reduc
= true;
  /* 4.2. Check support for the epilog operation.

	  If STMT represents a reduction pattern, then the type of the
	  reduction variable may be different than the type of the rest
	  of the arguments.  For example, consider the case of accumulation
	  of shorts into an int accumulator; the original code:
			S1: int_a = (int) short_a;
	  orig_stmt->	S2: int_acc = plus <int_a ,int_acc>;

	  was replaced with:
			STMT: int_acc = widen_sum <short_a, int_acc>

	  This means that:
	  1. The tree-code that is used to create the vector operation in the
	     epilog code (that reduces the partial results) is not the
	     tree-code of STMT, but is rather the tree-code of the original
	     stmt from the pattern that STMT is replacing.  I.e., in the example
	     above we want to use 'widen_sum' in the loop, but 'plus' in the
	     epilog.
	  2. The type (mode) we use to check available target support
	     for the vector operation to be created in the *epilog*, is
	     determined by the type of the reduction variable (in the example
	     above we'd check this: optab_handler (plus_optab, vect_int_mode)).
	     However the type (mode) we use to check available target support
	     for the vector operation to be created *inside the loop*, is
	     determined by the type of the other arguments to STMT (in the
	     example we'd check this: optab_handler (widen_sum_optab,
	     vect_short_mode)).

	  This is contrary to "regular" reductions, in which the types of all
	  the arguments are the same as the type of the reduction variable.
	  For "regular" reductions we can therefore use the same vector type
	  (and also the same tree-code) when generating the epilog code and
	  when generating the code inside the loop.  */
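  /* As a concrete illustration (the vector modes here are only assumptions
     for the example): with 16-bit shorts in V8HI vectors and a V4SI int
     accumulator, the in-loop check asks whether the target supports a V8HI
     widening sum, whereas the epilog check asks whether it supports reducing
     a V4SI accumulator with plain addition.  */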
7158 code_helper orig_code
= STMT_VINFO_REDUC_CODE (phi_info
);
7159 STMT_VINFO_REDUC_CODE (reduc_info
) = orig_code
;
7161 vect_reduction_type reduction_type
= STMT_VINFO_REDUC_TYPE (reduc_info
);
7162 if (reduction_type
== TREE_CODE_REDUCTION
)
      /* Check whether it's ok to change the order of the computation.
	 Generally, when vectorizing a reduction we change the order of the
	 computation.  This may change the behavior of the program in some
	 cases, so we need to check that this is ok.  One exception is when
	 vectorizing an outer-loop: the inner-loop is executed sequentially,
	 and therefore vectorizing reductions in the inner-loop during
	 outer-loop vectorization is safe.  Likewise, when we are vectorizing
	 a series of reductions using SLP and the VF is one, the reductions
	 are performed in scalar order.  */
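      /* A small illustration (array and VF chosen arbitrarily): a scalar sum
	 evaluated as ((a[0] + a[1]) + a[2]) + a[3] effectively becomes, with
	 VF == 2, (a[0] + a[2]) + (a[1] + a[3]).  For integer addition the
	 result is unchanged, but for floating-point addition the rounding
	 points differ, which is what the checks below guard against.  */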
7174 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info
)
7175 && known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo
), 1u))
7177 else if (needs_fold_left_reduction_p (op
.type
, orig_code
))
      /* When vectorizing a reduction chain w/o SLP the reduction PHI
	 is not directly used in stmt.  */
7181 if (!only_slp_reduc_chain
7182 && reduc_chain_length
!= 1)
7184 if (dump_enabled_p ())
7185 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
7186 "in-order reduction chain without SLP.\n");
7189 STMT_VINFO_REDUC_TYPE (reduc_info
)
7190 = reduction_type
= FOLD_LEFT_REDUCTION
;
7192 else if (!commutative_binary_op_p (orig_code
, op
.type
)
7193 || !associative_binary_op_p (orig_code
, op
.type
))
7195 if (dump_enabled_p ())
7196 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
7197 "reduction: not commutative/associative");
7202 if ((double_reduc
|| reduction_type
!= TREE_CODE_REDUCTION
)
7205 if (dump_enabled_p ())
7206 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
7207 "multiple types in double reduction or condition "
7208 "reduction or fold-left reduction.\n");
7212 internal_fn reduc_fn
= IFN_LAST
;
7213 if (reduction_type
== TREE_CODE_REDUCTION
7214 || reduction_type
== FOLD_LEFT_REDUCTION
7215 || reduction_type
== INTEGER_INDUC_COND_REDUCTION
7216 || reduction_type
== CONST_COND_REDUCTION
)
7218 if (reduction_type
== FOLD_LEFT_REDUCTION
7219 ? fold_left_reduction_fn (orig_code
, &reduc_fn
)
7220 : reduction_fn_for_scalar_code (orig_code
, &reduc_fn
))
7222 if (reduc_fn
!= IFN_LAST
7223 && !direct_internal_fn_supported_p (reduc_fn
, vectype_out
,
7224 OPTIMIZE_FOR_SPEED
))
7226 if (dump_enabled_p ())
7227 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
7228 "reduc op not supported by target.\n");
7230 reduc_fn
= IFN_LAST
;
7235 if (!nested_cycle
|| double_reduc
)
7237 if (dump_enabled_p ())
7238 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
7239 "no reduc code for scalar code.\n");
7245 else if (reduction_type
== COND_REDUCTION
)
7247 int scalar_precision
7248 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (op
.type
));
7249 cr_index_scalar_type
= make_unsigned_type (scalar_precision
);
7250 cr_index_vector_type
= get_same_sized_vectype (cr_index_scalar_type
,
7253 if (direct_internal_fn_supported_p (IFN_REDUC_MAX
, cr_index_vector_type
,
7254 OPTIMIZE_FOR_SPEED
))
7255 reduc_fn
= IFN_REDUC_MAX
;
7257 STMT_VINFO_REDUC_FN (reduc_info
) = reduc_fn
;
7259 if (reduction_type
!= EXTRACT_LAST_REDUCTION
7260 && (!nested_cycle
|| double_reduc
)
7261 && reduc_fn
== IFN_LAST
7262 && !nunits_out
.is_constant ())
7264 if (dump_enabled_p ())
7265 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
7266 "missing target support for reduction on"
7267 " variable-length vectors.\n");
7271 /* For SLP reductions, see if there is a neutral value we can use. */
7272 tree neutral_op
= NULL_TREE
;
7275 tree initial_value
= NULL_TREE
;
7276 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info
) != NULL
)
7277 initial_value
= vect_phi_initial_value (reduc_def_phi
);
7278 neutral_op
= neutral_op_for_reduction (TREE_TYPE (vectype_out
),
7279 orig_code
, initial_value
);
7282 if (double_reduc
&& reduction_type
== FOLD_LEFT_REDUCTION
)
      /* We can't support in-order reductions of code such as this:

	   for (int i = 0; i < n1; ++i)
	     for (int j = 0; j < n2; ++j)
	       l += a[i][j];

	 since GCC effectively transforms the loop when vectorizing:

	   for (int i = 0; i < n1 / VF; ++i)
	     for (int j = 0; j < n2; ++j)
	       for (int k = 0; k < VF; ++k)
		 l += a[i * VF + k][j];

	 which is a reassociation of the original operation.  */
7298 if (dump_enabled_p ())
7299 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
7300 "in-order double reduction not supported.\n");
7305 if (reduction_type
== FOLD_LEFT_REDUCTION
7307 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info
))
7309 /* We cannot use in-order reductions in this case because there is
7310 an implicit reassociation of the operations involved. */
7311 if (dump_enabled_p ())
7312 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
7313 "in-order unchained SLP reductions not supported.\n");
  /* For double reductions, and for SLP reductions with a neutral value,
     we construct a variable-length initial vector by loading a vector
     full of the neutral value and then shift-and-inserting the start
     values into the low-numbered elements.  */
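  /* Rough illustration (values assumed): for a variable-length PLUS
     reduction with neutral value 0 and a single start value 5, the initial
     vector is a splat of 0 into which 5 is shifted via IFN_VEC_SHL_INSERT,
     yielding { 5, 0, 0, ... }; hence the target-support check below.  */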
7321 if ((double_reduc
|| neutral_op
)
7322 && !nunits_out
.is_constant ()
7323 && !direct_internal_fn_supported_p (IFN_VEC_SHL_INSERT
,
7324 vectype_out
, OPTIMIZE_FOR_SPEED
))
7326 if (dump_enabled_p ())
7327 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
7328 "reduction on variable-length vectors requires"
7329 " target support for a vector-shift-and-insert"
7334 /* Check extra constraints for variable-length unchained SLP reductions. */
7335 if (STMT_SLP_TYPE (stmt_info
)
7336 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info
)
7337 && !nunits_out
.is_constant ())
7339 /* We checked above that we could build the initial vector when
7340 there's a neutral element value. Check here for the case in
7341 which each SLP statement has its own initial value and in which
7342 that value needs to be repeated for every instance of the
7343 statement within the initial vector. */
7344 unsigned int group_size
= SLP_TREE_LANES (slp_node
);
7346 && !can_duplicate_and_interleave_p (loop_vinfo
, group_size
,
7347 TREE_TYPE (vectype_out
)))
7349 if (dump_enabled_p ())
7350 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
7351 "unsupported form of SLP reduction for"
7352 " variable-length vectors: cannot build"
7353 " initial vector.\n");
      /* The epilogue code relies on the number of elements being a multiple
	 of the group size.  The duplicate-and-interleave approach to setting
	 up the initial vector does too.  */
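      /* E.g. (sizes assumed for illustration): with an SLP group of 3
	 reduction lanes, nunits_out must be a multiple of 3; a 4-element
	 vector type would split a group across vectors and is rejected
	 just below.  */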
7359 if (!multiple_p (nunits_out
, group_size
))
7361 if (dump_enabled_p ())
7362 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
7363 "unsupported form of SLP reduction for"
7364 " variable-length vectors: the vector size"
7365 " is not a multiple of the number of results.\n");
7370 if (reduction_type
== COND_REDUCTION
)
7374 if (! max_loop_iterations (loop
, &ni
))
7376 if (dump_enabled_p ())
7377 dump_printf_loc (MSG_NOTE
, vect_location
,
7378 "loop count not known, cannot create cond "
7382 /* Convert backedges to iterations. */
	  /* The additional index will be the same type as the condition.  Check
	     that the loop can fit into this less one (because we'll use up the
	     zero slot for when there are no matches).  */
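	  /* Worked example (the 8-bit type is only an illustrative
	     assumption): if the condition type is unsigned char then
	     max_index is 255 and index 0 is reserved for "no match", so a
	     COND_REDUCTION over a loop with 255 or more iterations is
	     rejected by the check below.  */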
7388 tree max_index
= TYPE_MAX_VALUE (cr_index_scalar_type
);
7389 if (wi::geu_p (ni
, wi::to_widest (max_index
)))
7391 if (dump_enabled_p ())
7392 dump_printf_loc (MSG_NOTE
, vect_location
,
7393 "loop size is greater than data size.\n");
  /* In case the vectorization factor (VF) is bigger than the number
     of elements that we can fit in a vectype (nunits), we have to generate
     more than one vector stmt - i.e - we need to "unroll" the
     vector stmt by a factor VF/nunits.  For more details see documentation
     in vectorizable_operation.  */

  /* If the reduction is used in an outer loop we need to generate
     VF intermediate results, like so (e.g. for ncopies=2):
	r0 = phi (init, r0)
	r1 = phi (init, r1)
	r0 = x0 + r0;
	r1 = x1 + r1;
    (i.e. we generate VF results in 2 registers).
    In this case we have a separate def-use cycle for each copy, and therefore
    for each copy we get the vector def for the reduction variable from the
    respective phi node created for this copy.

    Otherwise (the reduction is unused in the loop nest), we can combine
    together intermediate results, like so (e.g. for ncopies=2):
	r = phi (init, r)
	r = x0 + r;
	r = x1 + r;
    (i.e. we generate VF/2 results in a single register).
    In this case for each copy we get the vector def for the reduction variable
    from the vectorized reduction operation generated in the previous iteration.

    This only works when we see both the reduction PHI and its only consumer
    in vectorizable_reduction and there are no intermediate stmts
    participating.  When unrolling we want each unrolled iteration to have its
    own reduction accumulator since one of the main goals of unrolling a
    reduction is to reduce the aggregate loop-carried latency.  */
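  /* As a numeric illustration (the sizes are assumptions for the example):
     with VF == 8 and a 4-element vectype, ncopies == VF/nunits == 2, i.e.
     each scalar reduction stmt expands to two vector stmts, and the code
     below decides whether those two copies share a single accumulator
     (single def-use cycle) or each keep their own.  */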
7430 && (STMT_VINFO_RELEVANT (stmt_info
) <= vect_used_only_live
)
7431 && reduc_chain_length
== 1
7432 && loop_vinfo
->suggested_unroll_factor
== 1)
7433 single_defuse_cycle
= true;
7435 if (single_defuse_cycle
|| lane_reduc_code_p
)
7437 gcc_assert (op
.code
!= COND_EXPR
);
  /* 4. Supportable by target?  */

  /* 4.1. check support for the operation in the loop

     This isn't necessary for the lane reduction codes, since they
     can only be produced by pattern matching, and it's up to the
     pattern matcher to test for support.  The main reason for
     specifically skipping this step is to avoid rechecking whether
     mixed-sign dot-products can be implemented using signed
     dot-products.  */
  machine_mode vec_mode = TYPE_MODE (vectype_in);
7451 if (!lane_reduc_code_p
7452 && !directly_supported_p (op
.code
, vectype_in
, optab_vector
))
7454 if (dump_enabled_p ())
7455 dump_printf (MSG_NOTE
, "op not supported by target.\n");
7456 if (maybe_ne (GET_MODE_SIZE (vec_mode
), UNITS_PER_WORD
)
7457 || !vect_can_vectorize_without_simd_p (op
.code
))
7460 if (dump_enabled_p ())
7461 dump_printf (MSG_NOTE
, "proceeding using word mode.\n");
7464 if (vect_emulated_vector_p (vectype_in
)
7465 && !vect_can_vectorize_without_simd_p (op
.code
))
7467 if (dump_enabled_p ())
7468 dump_printf (MSG_NOTE
, "using word mode not possible.\n");
7472 /* lane-reducing operations have to go through vect_transform_reduction.
7473 For the other cases try without the single cycle optimization. */
7476 if (lane_reduc_code_p
)
7479 single_defuse_cycle
= false;
7482 STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info
) = single_defuse_cycle
;
7484 /* If the reduction stmt is one of the patterns that have lane
7485 reduction embedded we cannot handle the case of ! single_defuse_cycle. */
7486 if ((ncopies
> 1 && ! single_defuse_cycle
)
7487 && lane_reduc_code_p
)
7489 if (dump_enabled_p ())
7490 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
7491 "multi def-use cycle not possible for lane-reducing "
7492 "reduction operation\n");
7497 && !(!single_defuse_cycle
7498 && !lane_reduc_code_p
7499 && reduction_type
!= FOLD_LEFT_REDUCTION
))
7500 for (i
= 0; i
< (int) op
.num_ops
; i
++)
7501 if (!vect_maybe_update_slp_op_vectype (slp_op
[i
], vectype_in
))
7503 if (dump_enabled_p ())
7504 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
7505 "incompatible vector types for invariants\n");
7510 vec_num
= SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node
);
7514 vect_model_reduction_cost (loop_vinfo
, stmt_info
, reduc_fn
,
7515 reduction_type
, ncopies
, cost_vec
);
7516 /* Cost the reduction op inside the loop if transformed via
7517 vect_transform_reduction. Otherwise this is costed by the
7518 separate vectorizable_* routines. */
7519 if (single_defuse_cycle
|| lane_reduc_code_p
)
7522 if (vect_is_emulated_mixed_dot_prod (loop_vinfo
, stmt_info
))
7523 /* Three dot-products and a subtraction. */
7525 record_stmt_cost (cost_vec
, ncopies
* factor
, vector_stmt
,
7526 stmt_info
, 0, vect_body
);
7529 if (dump_enabled_p ()
7530 && reduction_type
== FOLD_LEFT_REDUCTION
)
7531 dump_printf_loc (MSG_NOTE
, vect_location
,
7532 "using an in-order (fold-left) reduction.\n");
7533 STMT_VINFO_TYPE (orig_stmt_of_analysis
) = cycle_phi_info_type
;
7534 /* All but single defuse-cycle optimized, lane-reducing and fold-left
7535 reductions go through their own vectorizable_* routines. */
7536 if (!single_defuse_cycle
7537 && !lane_reduc_code_p
7538 && reduction_type
!= FOLD_LEFT_REDUCTION
)
7541 = vect_stmt_to_vectorize (STMT_VINFO_REDUC_DEF (phi_info
));
7542 if (slp_node
&& REDUC_GROUP_FIRST_ELEMENT (tem
))
7544 gcc_assert (!REDUC_GROUP_NEXT_ELEMENT (tem
));
7545 tem
= REDUC_GROUP_FIRST_ELEMENT (tem
);
7547 STMT_VINFO_DEF_TYPE (vect_orig_stmt (tem
)) = vect_internal_def
;
7548 STMT_VINFO_DEF_TYPE (tem
) = vect_internal_def
;
7550 else if (loop_vinfo
&& LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo
))
7552 vec_loop_masks
*masks
= &LOOP_VINFO_MASKS (loop_vinfo
);
7553 internal_fn cond_fn
= get_conditional_internal_fn (op
.code
, op
.type
);
7555 if (reduction_type
!= FOLD_LEFT_REDUCTION
7556 && !use_mask_by_cond_expr_p (op
.code
, cond_fn
, vectype_in
)
7557 && (cond_fn
== IFN_LAST
7558 || !direct_internal_fn_supported_p (cond_fn
, vectype_in
,
7559 OPTIMIZE_FOR_SPEED
)))
7561 if (dump_enabled_p ())
7562 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
7563 "can't operate on partial vectors because"
7564 " no conditional operation is available.\n");
7565 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo
) = false;
7567 else if (reduction_type
== FOLD_LEFT_REDUCTION
7568 && reduc_fn
== IFN_LAST
7569 && !expand_vec_cond_expr_p (vectype_in
,
7570 truth_type_for (vectype_in
),
7573 if (dump_enabled_p ())
7574 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
7575 "can't operate on partial vectors because"
7576 " no conditional operation is available.\n");
7577 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo
) = false;
7580 vect_record_loop_mask (loop_vinfo
, masks
, ncopies
* vec_num
,
/* STMT_INFO is a dot-product reduction whose multiplication operands
   have different signs.  Emit a sequence to emulate the operation
   using a series of signed DOT_PROD_EXPRs and return the last
   statement generated.  VEC_DEST is the result of the vector operation
   and VOP lists its inputs.  */
7593 vect_emulate_mixed_dot_prod (loop_vec_info loop_vinfo
, stmt_vec_info stmt_info
,
7594 gimple_stmt_iterator
*gsi
, tree vec_dest
,
7597 tree wide_vectype
= signed_type_for (TREE_TYPE (vec_dest
));
7598 tree narrow_vectype
= signed_type_for (TREE_TYPE (vop
[0]));
7599 tree narrow_elttype
= TREE_TYPE (narrow_vectype
);
7602 /* Make VOP[0] the unsigned operand VOP[1] the signed operand. */
7603 if (!TYPE_UNSIGNED (TREE_TYPE (vop
[0])))
7604 std::swap (vop
[0], vop
[1]);
7606 /* Convert all inputs to signed types. */
7607 for (int i
= 0; i
< 3; ++i
)
7608 if (TYPE_UNSIGNED (TREE_TYPE (vop
[i
])))
7610 tree tmp
= make_ssa_name (signed_type_for (TREE_TYPE (vop
[i
])));
7611 new_stmt
= gimple_build_assign (tmp
, NOP_EXPR
, vop
[i
]);
7612 vect_finish_stmt_generation (loop_vinfo
, stmt_info
, new_stmt
, gsi
);
7616 /* In the comments below we assume 8-bit inputs for simplicity,
7617 but the approach works for any full integer type. */
7619 /* Create a vector of -128. */
7620 tree min_narrow_elttype
= TYPE_MIN_VALUE (narrow_elttype
);
7621 tree min_narrow
= build_vector_from_val (narrow_vectype
,
7622 min_narrow_elttype
);
7624 /* Create a vector of 64. */
7625 auto half_wi
= wi::lrshift (wi::to_wide (min_narrow_elttype
), 1);
7626 tree half_narrow
= wide_int_to_tree (narrow_elttype
, half_wi
);
7627 half_narrow
= build_vector_from_val (narrow_vectype
, half_narrow
);
7629 /* Emit: SUB_RES = VOP[0] - 128. */
7630 tree sub_res
= make_ssa_name (narrow_vectype
);
7631 new_stmt
= gimple_build_assign (sub_res
, PLUS_EXPR
, vop
[0], min_narrow
);
7632 vect_finish_stmt_generation (loop_vinfo
, stmt_info
, new_stmt
, gsi
);
  /* Emit:

       STAGE1 = DOT_PROD_EXPR <VOP[1], 64, VOP[2]>;
       STAGE2 = DOT_PROD_EXPR <VOP[1], 64, STAGE1>;
       STAGE3 = DOT_PROD_EXPR <SUB_RES, VOP[1], STAGE2>;

     on the basis that x * y == (x - 128) * y + 64 * y + 64 * y.
     Doing the two 64 * y steps first allows more time to compute x.  */
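  /* Worked example with assumed values (purely illustrative): for the
     unsigned operand x = 200 and the signed operand y = -3 we get
     (200 - 128) * -3 + 64 * -3 + 64 * -3 = -216 - 192 - 192 = -600,
     which matches 200 * -3, and every partial product now fits the input
     range of a signed 8-bit DOT_PROD_EXPR.  */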
7642 tree stage1
= make_ssa_name (wide_vectype
);
7643 new_stmt
= gimple_build_assign (stage1
, DOT_PROD_EXPR
,
7644 vop
[1], half_narrow
, vop
[2]);
7645 vect_finish_stmt_generation (loop_vinfo
, stmt_info
, new_stmt
, gsi
);
7647 tree stage2
= make_ssa_name (wide_vectype
);
7648 new_stmt
= gimple_build_assign (stage2
, DOT_PROD_EXPR
,
7649 vop
[1], half_narrow
, stage1
);
7650 vect_finish_stmt_generation (loop_vinfo
, stmt_info
, new_stmt
, gsi
);
7652 tree stage3
= make_ssa_name (wide_vectype
);
7653 new_stmt
= gimple_build_assign (stage3
, DOT_PROD_EXPR
,
7654 sub_res
, vop
[1], stage2
);
7655 vect_finish_stmt_generation (loop_vinfo
, stmt_info
, new_stmt
, gsi
);
7657 /* Convert STAGE3 to the reduction type. */
7658 return gimple_build_assign (vec_dest
, CONVERT_EXPR
, stage3
);
/* Transform the definition stmt STMT_INFO of a reduction PHI backedge
   value.  */
7665 vect_transform_reduction (loop_vec_info loop_vinfo
,
7666 stmt_vec_info stmt_info
, gimple_stmt_iterator
*gsi
,
7667 gimple
**vec_stmt
, slp_tree slp_node
)
7669 tree vectype_out
= STMT_VINFO_VECTYPE (stmt_info
);
7670 class loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
7675 stmt_vec_info reduc_info
= info_for_reduction (loop_vinfo
, stmt_info
);
7676 gcc_assert (reduc_info
->is_reduc_info
);
7678 if (nested_in_vect_loop_p (loop
, stmt_info
))
7681 gcc_assert (STMT_VINFO_DEF_TYPE (reduc_info
) == vect_double_reduction_def
);
7685 if (!gimple_extract_op (stmt_info
->stmt
, &op
))
7687 gcc_assert (op
.code
.is_tree_code ());
7688 auto code
= tree_code (op
.code
);
  /* All uses but the last are expected to be defined in the loop.
     The last use is the reduction variable.  In case of nested cycle this
     assumption is not true: we use reduc_index to record the index of the
     reduction variable.  */
7694 stmt_vec_info phi_info
= STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info
));
7695 gphi
*reduc_def_phi
= as_a
<gphi
*> (phi_info
->stmt
);
7696 int reduc_index
= STMT_VINFO_REDUC_IDX (stmt_info
);
7697 tree vectype_in
= STMT_VINFO_REDUC_VECTYPE_IN (reduc_info
);
7702 vec_num
= SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node
);
7706 ncopies
= vect_get_num_copies (loop_vinfo
, vectype_in
);
7710 internal_fn cond_fn
= get_conditional_internal_fn (code
);
7711 vec_loop_masks
*masks
= &LOOP_VINFO_MASKS (loop_vinfo
);
7712 bool mask_by_cond_expr
= use_mask_by_cond_expr_p (code
, cond_fn
, vectype_in
);
7715 tree new_temp
= NULL_TREE
;
7716 auto_vec
<tree
> vec_oprnds0
;
7717 auto_vec
<tree
> vec_oprnds1
;
7718 auto_vec
<tree
> vec_oprnds2
;
7721 if (dump_enabled_p ())
7722 dump_printf_loc (MSG_NOTE
, vect_location
, "transform reduction.\n");
7724 /* FORNOW: Multiple types are not supported for condition. */
7725 if (code
== COND_EXPR
)
7726 gcc_assert (ncopies
== 1);
7728 bool masked_loop_p
= LOOP_VINFO_FULLY_MASKED_P (loop_vinfo
);
7730 vect_reduction_type reduction_type
= STMT_VINFO_REDUC_TYPE (reduc_info
);
7731 if (reduction_type
== FOLD_LEFT_REDUCTION
)
7733 internal_fn reduc_fn
= STMT_VINFO_REDUC_FN (reduc_info
);
7734 return vectorize_fold_left_reduction
7735 (loop_vinfo
, stmt_info
, gsi
, vec_stmt
, slp_node
, reduc_def_phi
, code
,
7736 reduc_fn
, op
.ops
, vectype_in
, reduc_index
, masks
);
7739 bool single_defuse_cycle
= STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info
);
7740 gcc_assert (single_defuse_cycle
7741 || code
== DOT_PROD_EXPR
7742 || code
== WIDEN_SUM_EXPR
7743 || code
== SAD_EXPR
);
7745 /* Create the destination vector */
7746 tree scalar_dest
= gimple_assign_lhs (stmt_info
->stmt
);
7747 tree vec_dest
= vect_create_destination_var (scalar_dest
, vectype_out
);
7749 vect_get_vec_defs (loop_vinfo
, stmt_info
, slp_node
, ncopies
,
7750 single_defuse_cycle
&& reduc_index
== 0
7751 ? NULL_TREE
: op
.ops
[0], &vec_oprnds0
,
7752 single_defuse_cycle
&& reduc_index
== 1
7753 ? NULL_TREE
: op
.ops
[1], &vec_oprnds1
,
7755 && !(single_defuse_cycle
&& reduc_index
== 2)
7756 ? op
.ops
[2] : NULL_TREE
, &vec_oprnds2
);
7757 if (single_defuse_cycle
)
7759 gcc_assert (!slp_node
);
7760 vect_get_vec_defs_for_operand (loop_vinfo
, stmt_info
, 1,
7761 op
.ops
[reduc_index
],
7762 reduc_index
== 0 ? &vec_oprnds0
7763 : (reduc_index
== 1 ? &vec_oprnds1
7767 bool emulated_mixed_dot_prod
7768 = vect_is_emulated_mixed_dot_prod (loop_vinfo
, stmt_info
);
7769 FOR_EACH_VEC_ELT (vec_oprnds0
, i
, def0
)
7772 tree vop
[3] = { def0
, vec_oprnds1
[i
], NULL_TREE
};
7773 if (masked_loop_p
&& !mask_by_cond_expr
)
7775 /* No conditional ifns have been defined for dot-product yet. */
7776 gcc_assert (code
!= DOT_PROD_EXPR
);
7778 /* Make sure that the reduction accumulator is vop[0]. */
7779 if (reduc_index
== 1)
7781 gcc_assert (commutative_tree_code (code
));
7782 std::swap (vop
[0], vop
[1]);
7784 tree mask
= vect_get_loop_mask (gsi
, masks
, vec_num
* ncopies
,
7786 gcall
*call
= gimple_build_call_internal (cond_fn
, 4, mask
,
7787 vop
[0], vop
[1], vop
[0]);
7788 new_temp
= make_ssa_name (vec_dest
, call
);
7789 gimple_call_set_lhs (call
, new_temp
);
7790 gimple_call_set_nothrow (call
, true);
7791 vect_finish_stmt_generation (loop_vinfo
, stmt_info
, call
, gsi
);
7796 if (op
.num_ops
== 3)
7797 vop
[2] = vec_oprnds2
[i
];
7799 if (masked_loop_p
&& mask_by_cond_expr
)
7801 tree mask
= vect_get_loop_mask (gsi
, masks
, vec_num
* ncopies
,
7803 build_vect_cond_expr (code
, vop
, mask
, gsi
);
7806 if (emulated_mixed_dot_prod
)
7807 new_stmt
= vect_emulate_mixed_dot_prod (loop_vinfo
, stmt_info
, gsi
,
7810 new_stmt
= gimple_build_assign (vec_dest
, code
,
7811 vop
[0], vop
[1], vop
[2]);
7812 new_temp
= make_ssa_name (vec_dest
, new_stmt
);
7813 gimple_assign_set_lhs (new_stmt
, new_temp
);
7814 vect_finish_stmt_generation (loop_vinfo
, stmt_info
, new_stmt
, gsi
);
7818 SLP_TREE_VEC_STMTS (slp_node
).quick_push (new_stmt
);
7819 else if (single_defuse_cycle
7822 if (reduc_index
== 0)
7823 vec_oprnds0
.safe_push (gimple_get_lhs (new_stmt
));
7824 else if (reduc_index
== 1)
7825 vec_oprnds1
.safe_push (gimple_get_lhs (new_stmt
));
7826 else if (reduc_index
== 2)
7827 vec_oprnds2
.safe_push (gimple_get_lhs (new_stmt
));
7830 STMT_VINFO_VEC_STMTS (stmt_info
).safe_push (new_stmt
);
7834 *vec_stmt
= STMT_VINFO_VEC_STMTS (stmt_info
)[0];
7839 /* Transform phase of a cycle PHI. */
7842 vect_transform_cycle_phi (loop_vec_info loop_vinfo
,
7843 stmt_vec_info stmt_info
, gimple
**vec_stmt
,
7844 slp_tree slp_node
, slp_instance slp_node_instance
)
7846 tree vectype_out
= STMT_VINFO_VECTYPE (stmt_info
);
7847 class loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
7851 bool nested_cycle
= false;
7854 if (nested_in_vect_loop_p (loop
, stmt_info
))
7857 nested_cycle
= true;
7860 stmt_vec_info reduc_stmt_info
= STMT_VINFO_REDUC_DEF (stmt_info
);
7861 reduc_stmt_info
= vect_stmt_to_vectorize (reduc_stmt_info
);
7862 stmt_vec_info reduc_info
= info_for_reduction (loop_vinfo
, stmt_info
);
7863 gcc_assert (reduc_info
->is_reduc_info
);
7865 if (STMT_VINFO_REDUC_TYPE (reduc_info
) == EXTRACT_LAST_REDUCTION
7866 || STMT_VINFO_REDUC_TYPE (reduc_info
) == FOLD_LEFT_REDUCTION
)
7867 /* Leave the scalar phi in place. */
7870 tree vectype_in
= STMT_VINFO_REDUC_VECTYPE_IN (reduc_info
);
7871 /* For a nested cycle we do not fill the above. */
7873 vectype_in
= STMT_VINFO_VECTYPE (stmt_info
);
7874 gcc_assert (vectype_in
);
7878 /* The size vect_schedule_slp_instance computes is off for us. */
7879 vec_num
= vect_get_num_vectors (LOOP_VINFO_VECT_FACTOR (loop_vinfo
)
7880 * SLP_TREE_LANES (slp_node
), vectype_in
);
7886 ncopies
= vect_get_num_copies (loop_vinfo
, vectype_in
);
7889 /* Check whether we should use a single PHI node and accumulate
7890 vectors to one before the backedge. */
7891 if (STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info
))
7894 /* Create the destination vector */
7895 gphi
*phi
= as_a
<gphi
*> (stmt_info
->stmt
);
7896 tree vec_dest
= vect_create_destination_var (gimple_phi_result (phi
),
7899 /* Get the loop-entry arguments. */
7900 tree vec_initial_def
= NULL_TREE
;
7901 auto_vec
<tree
> vec_initial_defs
;
7904 vec_initial_defs
.reserve (vec_num
);
7907 unsigned phi_idx
= loop_preheader_edge (loop
)->dest_idx
;
7908 vect_get_slp_defs (SLP_TREE_CHILDREN (slp_node
)[phi_idx
],
7913 gcc_assert (slp_node
== slp_node_instance
->reduc_phis
);
7914 vec
<tree
> &initial_values
= reduc_info
->reduc_initial_values
;
7915 vec
<stmt_vec_info
> &stmts
= SLP_TREE_SCALAR_STMTS (slp_node
);
7917 unsigned int num_phis
= stmts
.length ();
7918 if (REDUC_GROUP_FIRST_ELEMENT (reduc_stmt_info
))
7920 initial_values
.reserve (num_phis
);
7921 for (unsigned int i
= 0; i
< num_phis
; ++i
)
7923 gphi
*this_phi
= as_a
<gphi
*> (stmts
[i
]->stmt
);
7924 initial_values
.quick_push (vect_phi_initial_value (this_phi
));
7927 vect_find_reusable_accumulator (loop_vinfo
, reduc_info
);
7928 if (!initial_values
.is_empty ())
7931 = (num_phis
== 1 ? initial_values
[0] : NULL_TREE
);
7932 code_helper code
= STMT_VINFO_REDUC_CODE (reduc_info
);
7934 = neutral_op_for_reduction (TREE_TYPE (vectype_out
),
7935 code
, initial_value
);
7936 get_initial_defs_for_reduction (loop_vinfo
, reduc_info
,
7937 &vec_initial_defs
, vec_num
,
7938 stmts
.length (), neutral_op
);
7944 /* Get at the scalar def before the loop, that defines the initial
7945 value of the reduction variable. */
7946 tree initial_def
= vect_phi_initial_value (phi
);
7947 reduc_info
->reduc_initial_values
.safe_push (initial_def
);
      /* Optimize: if initial_def is for REDUC_MAX smaller than the base
	 and we can't use zero for induc_val, use initial_def.  Similarly
	 for REDUC_MIN and initial_def larger than the base.  */
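      /* Hypothetical numbers for the check below: with REDUC_MAX,
	 induc_val = 11 and a constant scalar initial value of 5,
	 tree_int_cst_lt (initial_def, induc_val) holds, so induc_val is
	 replaced by the initial value and the epilogue is informed by
	 clearing STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL.  */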
7951 if (STMT_VINFO_REDUC_TYPE (reduc_info
) == INTEGER_INDUC_COND_REDUCTION
)
7953 tree induc_val
= STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info
);
7954 if (TREE_CODE (initial_def
) == INTEGER_CST
7955 && !integer_zerop (induc_val
)
7956 && ((STMT_VINFO_REDUC_CODE (reduc_info
) == MAX_EXPR
7957 && tree_int_cst_lt (initial_def
, induc_val
))
7958 || (STMT_VINFO_REDUC_CODE (reduc_info
) == MIN_EXPR
7959 && tree_int_cst_lt (induc_val
, initial_def
))))
7961 induc_val
= initial_def
;
	      /* Communicate we used the initial_def to epilogue
		 generation.  */
7964 STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info
) = NULL_TREE
;
7966 vec_initial_def
= build_vector_from_val (vectype_out
, induc_val
);
7968 else if (nested_cycle
)
7970 /* Do not use an adjustment def as that case is not supported
7971 correctly if ncopies is not one. */
7972 vect_get_vec_defs_for_operand (loop_vinfo
, reduc_stmt_info
,
7973 ncopies
, initial_def
,
7976 else if (STMT_VINFO_REDUC_TYPE (reduc_info
) == CONST_COND_REDUCTION
7977 || STMT_VINFO_REDUC_TYPE (reduc_info
) == COND_REDUCTION
)
7978 /* Fill the initial vector with the initial scalar value. */
7980 = get_initial_def_for_reduction (loop_vinfo
, reduc_stmt_info
,
7981 initial_def
, initial_def
);
7985 vect_find_reusable_accumulator (loop_vinfo
, reduc_info
);
7986 if (!reduc_info
->reduc_initial_values
.is_empty ())
7988 initial_def
= reduc_info
->reduc_initial_values
[0];
7989 code_helper code
= STMT_VINFO_REDUC_CODE (reduc_info
);
7991 = neutral_op_for_reduction (TREE_TYPE (initial_def
),
7993 gcc_assert (neutral_op
);
7994 /* Try to simplify the vector initialization by applying an
7995 adjustment after the reduction has been performed. */
7996 if (!reduc_info
->reused_accumulator
7997 && STMT_VINFO_DEF_TYPE (stmt_info
) == vect_reduction_def
7998 && !operand_equal_p (neutral_op
, initial_def
))
8000 STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info
)
8002 initial_def
= neutral_op
;
8005 = get_initial_def_for_reduction (loop_vinfo
, reduc_info
,
8006 initial_def
, neutral_op
);
8011 if (vec_initial_def
)
8013 vec_initial_defs
.create (ncopies
);
8014 for (i
= 0; i
< ncopies
; ++i
)
8015 vec_initial_defs
.quick_push (vec_initial_def
);
8018 if (auto *accumulator
= reduc_info
->reused_accumulator
)
8020 tree def
= accumulator
->reduc_input
;
8021 if (!useless_type_conversion_p (vectype_out
, TREE_TYPE (def
)))
8023 unsigned int nreduc
;
8024 bool res
= constant_multiple_p (TYPE_VECTOR_SUBPARTS
8026 TYPE_VECTOR_SUBPARTS (vectype_out
),
8029 gimple_seq stmts
= NULL
;
8030 /* Reduce the single vector to a smaller one. */
8033 /* Perform the reduction in the appropriate type. */
8034 tree rvectype
= vectype_out
;
8035 if (!useless_type_conversion_p (TREE_TYPE (vectype_out
),
8036 TREE_TYPE (TREE_TYPE (def
))))
8037 rvectype
= build_vector_type (TREE_TYPE (TREE_TYPE (def
)),
8038 TYPE_VECTOR_SUBPARTS
8040 def
= vect_create_partial_epilog (def
, rvectype
,
8041 STMT_VINFO_REDUC_CODE
8045 /* The epilogue loop might use a different vector mode, like
8047 if (TYPE_MODE (vectype_out
) != TYPE_MODE (TREE_TYPE (def
)))
8049 tree reduc_type
= build_vector_type_for_mode
8050 (TREE_TYPE (TREE_TYPE (def
)), TYPE_MODE (vectype_out
));
8051 def
= gimple_convert (&stmts
, reduc_type
, def
);
8053 /* Adjust the input so we pick up the partially reduced value
8054 for the skip edge in vect_create_epilog_for_reduction. */
8055 accumulator
->reduc_input
= def
;
8056 /* And the reduction could be carried out using a different sign. */
8057 if (!useless_type_conversion_p (vectype_out
, TREE_TYPE (def
)))
8058 def
= gimple_convert (&stmts
, vectype_out
, def
);
8059 if (loop_vinfo
->main_loop_edge
)
8061 /* While we'd like to insert on the edge this will split
8062 blocks and disturb bookkeeping, we also will eventually
8063 need this on the skip edge. Rely on sinking to
8064 fixup optimal placement and insert in the pred. */
8065 gimple_stmt_iterator gsi
8066 = gsi_last_bb (loop_vinfo
->main_loop_edge
->src
);
8067 /* Insert before a cond that eventually skips the
8069 if (!gsi_end_p (gsi
) && stmt_ends_bb_p (gsi_stmt (gsi
)))
8071 gsi_insert_seq_after (&gsi
, stmts
, GSI_CONTINUE_LINKING
);
8074 gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop
),
8077 if (loop_vinfo
->main_loop_edge
)
8079 = vect_get_main_loop_result (loop_vinfo
, def
,
8080 vec_initial_defs
[0]);
8082 vec_initial_defs
.safe_push (def
);
8085 /* Generate the reduction PHIs upfront. */
8086 for (i
= 0; i
< vec_num
; i
++)
8088 tree vec_init_def
= vec_initial_defs
[i
];
8089 for (j
= 0; j
< ncopies
; j
++)
8091 /* Create the reduction-phi that defines the reduction
8093 gphi
*new_phi
= create_phi_node (vec_dest
, loop
->header
);
8095 /* Set the loop-entry arg of the reduction-phi. */
8096 if (j
!= 0 && nested_cycle
)
8097 vec_init_def
= vec_initial_defs
[j
];
8098 add_phi_arg (new_phi
, vec_init_def
, loop_preheader_edge (loop
),
8101 /* The loop-latch arg is set in epilogue processing. */
8104 SLP_TREE_VEC_STMTS (slp_node
).quick_push (new_phi
);
8108 *vec_stmt
= new_phi
;
8109 STMT_VINFO_VEC_STMTS (stmt_info
).safe_push (new_phi
);
8117 /* Vectorizes LC PHIs. */
8120 vectorizable_lc_phi (loop_vec_info loop_vinfo
,
8121 stmt_vec_info stmt_info
, gimple
**vec_stmt
,
8125 || !is_a
<gphi
*> (stmt_info
->stmt
)
8126 || gimple_phi_num_args (stmt_info
->stmt
) != 1)
8129 if (STMT_VINFO_DEF_TYPE (stmt_info
) != vect_internal_def
8130 && STMT_VINFO_DEF_TYPE (stmt_info
) != vect_double_reduction_def
)
8133 if (!vec_stmt
) /* transformation not required. */
8135 /* Deal with copies from externs or constants that disguise as
8136 loop-closed PHI nodes (PR97886). */
8138 && !vect_maybe_update_slp_op_vectype (SLP_TREE_CHILDREN (slp_node
)[0],
8139 SLP_TREE_VECTYPE (slp_node
)))
8141 if (dump_enabled_p ())
8142 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
8143 "incompatible vector types for invariants\n");
8146 STMT_VINFO_TYPE (stmt_info
) = lc_phi_info_type
;
8150 tree vectype
= STMT_VINFO_VECTYPE (stmt_info
);
8151 tree scalar_dest
= gimple_phi_result (stmt_info
->stmt
);
8152 basic_block bb
= gimple_bb (stmt_info
->stmt
);
8153 edge e
= single_pred_edge (bb
);
8154 tree vec_dest
= vect_create_destination_var (scalar_dest
, vectype
);
8155 auto_vec
<tree
> vec_oprnds
;
8156 vect_get_vec_defs (loop_vinfo
, stmt_info
, slp_node
,
8157 !slp_node
? vect_get_num_copies (loop_vinfo
, vectype
) : 1,
8158 gimple_phi_arg_def (stmt_info
->stmt
, 0), &vec_oprnds
);
8159 for (unsigned i
= 0; i
< vec_oprnds
.length (); i
++)
8161 /* Create the vectorized LC PHI node. */
8162 gphi
*new_phi
= create_phi_node (vec_dest
, bb
);
8163 add_phi_arg (new_phi
, vec_oprnds
[i
], e
, UNKNOWN_LOCATION
);
8165 SLP_TREE_VEC_STMTS (slp_node
).quick_push (new_phi
);
8167 STMT_VINFO_VEC_STMTS (stmt_info
).safe_push (new_phi
);
8170 *vec_stmt
= STMT_VINFO_VEC_STMTS (stmt_info
)[0];
8175 /* Vectorizes PHIs. */
8178 vectorizable_phi (vec_info
*,
8179 stmt_vec_info stmt_info
, gimple
**vec_stmt
,
8180 slp_tree slp_node
, stmt_vector_for_cost
*cost_vec
)
8182 if (!is_a
<gphi
*> (stmt_info
->stmt
) || !slp_node
)
8185 if (STMT_VINFO_DEF_TYPE (stmt_info
) != vect_internal_def
)
8188 tree vectype
= SLP_TREE_VECTYPE (slp_node
);
8190 if (!vec_stmt
) /* transformation not required. */
8194 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node
), i
, child
)
8197 if (dump_enabled_p ())
8198 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
8199 "PHI node with unvectorized backedge def\n");
8202 else if (!vect_maybe_update_slp_op_vectype (child
, vectype
))
8204 if (dump_enabled_p ())
8205 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
8206 "incompatible vector types for invariants\n");
8209 else if (SLP_TREE_DEF_TYPE (child
) == vect_internal_def
8210 && !useless_type_conversion_p (vectype
,
8211 SLP_TREE_VECTYPE (child
)))
	  /* With bools we can have mask and non-mask precision vectors
	     or different non-mask precisions.  While pattern recog is
	     supposed to guarantee consistency here, bugs in it can cause
	     mismatches (PR103489 and PR103800 for example).
	     Deal with them here instead of ICEing later.  */
8218 if (dump_enabled_p ())
8219 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
8220 "incompatible vector type setup from "
8221 "bool pattern detection\n");
8225 /* For single-argument PHIs assume coalescing which means zero cost
8226 for the scalar and the vector PHIs. This avoids artificially
8227 favoring the vector path (but may pessimize it in some cases). */
8228 if (gimple_phi_num_args (as_a
<gphi
*> (stmt_info
->stmt
)) > 1)
8229 record_stmt_cost (cost_vec
, SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node
),
8230 vector_stmt
, stmt_info
, vectype
, 0, vect_body
);
8231 STMT_VINFO_TYPE (stmt_info
) = phi_info_type
;
8235 tree scalar_dest
= gimple_phi_result (stmt_info
->stmt
);
8236 basic_block bb
= gimple_bb (stmt_info
->stmt
);
8237 tree vec_dest
= vect_create_destination_var (scalar_dest
, vectype
);
8238 auto_vec
<gphi
*> new_phis
;
8239 for (unsigned i
= 0; i
< gimple_phi_num_args (stmt_info
->stmt
); ++i
)
8241 slp_tree child
= SLP_TREE_CHILDREN (slp_node
)[i
];
8243 /* Skip not yet vectorized defs. */
8244 if (SLP_TREE_DEF_TYPE (child
) == vect_internal_def
8245 && SLP_TREE_VEC_STMTS (child
).is_empty ())
8248 auto_vec
<tree
> vec_oprnds
;
8249 vect_get_slp_defs (SLP_TREE_CHILDREN (slp_node
)[i
], &vec_oprnds
);
8250 if (!new_phis
.exists ())
8252 new_phis
.create (vec_oprnds
.length ());
8253 for (unsigned j
= 0; j
< vec_oprnds
.length (); j
++)
8255 /* Create the vectorized LC PHI node. */
8256 new_phis
.quick_push (create_phi_node (vec_dest
, bb
));
8257 SLP_TREE_VEC_STMTS (slp_node
).quick_push (new_phis
[j
]);
8260 edge e
= gimple_phi_arg_edge (as_a
<gphi
*> (stmt_info
->stmt
), i
);
8261 for (unsigned j
= 0; j
< vec_oprnds
.length (); j
++)
8262 add_phi_arg (new_phis
[j
], vec_oprnds
[j
], e
, UNKNOWN_LOCATION
);
8264 /* We should have at least one already vectorized child. */
8265 gcc_assert (new_phis
.exists ());
8270 /* Return true if VECTYPE represents a vector that requires lowering
8271 by the vector lowering pass. */
8274 vect_emulated_vector_p (tree vectype
)
8276 return (!VECTOR_MODE_P (TYPE_MODE (vectype
))
8277 && (!VECTOR_BOOLEAN_TYPE_P (vectype
)
8278 || TYPE_PRECISION (TREE_TYPE (vectype
)) != 1));
/* Return true if we can emulate CODE on an integer mode representation
   of vectors.  */
8285 vect_can_vectorize_without_simd_p (tree_code code
)
8303 /* Likewise, but taking a code_helper. */
8306 vect_can_vectorize_without_simd_p (code_helper code
)
8308 return (code
.is_tree_code ()
8309 && vect_can_vectorize_without_simd_p (tree_code (code
)));
8312 /* Create vector init for vectorized iv. */
8314 vect_create_nonlinear_iv_init (gimple_seq
* stmts
, tree init_expr
,
8315 tree step_expr
, poly_uint64 nunits
,
8317 enum vect_induction_op_type induction_type
)
8319 unsigned HOST_WIDE_INT const_nunits
;
8320 tree vec_shift
, vec_init
, new_name
;
8322 tree itype
= TREE_TYPE (vectype
);
  /* iv_loop is the loop to be vectorized.  Create:
     vec_init = [X, X+S, X+2*S, X+3*S]  (S = step_expr, X = init_expr).  */
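  /* For the nonlinear kinds handled below, the analogous lane values are,
     informally (same X and S):
       shr/shl:  [X >> 0*S, X >> 1*S, X >> 2*S, X >> 3*S]  (or << for shl)
       mul:      [X, X*S, X*S^2, X*S^3]
       neg:      [X, -X, X, -X]
     E.g. with X = 16 and S = 1 a shr IV starts as [16, 8, 4, 2]; the
     concrete numbers are only an illustration.  */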
8326 new_name
= gimple_convert (stmts
, itype
, init_expr
);
8327 switch (induction_type
)
8329 case vect_step_op_shr
:
8330 case vect_step_op_shl
:
8331 /* Build the Initial value from shift_expr. */
8332 vec_init
= gimple_build_vector_from_val (stmts
,
8335 vec_shift
= gimple_build (stmts
, VEC_SERIES_EXPR
, vectype
,
8336 build_zero_cst (itype
), step_expr
);
8337 vec_init
= gimple_build (stmts
,
8338 (induction_type
== vect_step_op_shr
8339 ? RSHIFT_EXPR
: LSHIFT_EXPR
),
8340 vectype
, vec_init
, vec_shift
);
8343 case vect_step_op_neg
:
8345 vec_init
= gimple_build_vector_from_val (stmts
,
8348 tree vec_neg
= gimple_build (stmts
, NEGATE_EXPR
,
8350 /* The encoding has 2 interleaved stepped patterns. */
8351 vec_perm_builder
sel (nunits
, 2, 3);
8353 for (i
= 0; i
< 3; i
++)
8356 sel
[2 * i
+ 1] = i
+ nunits
;
8358 vec_perm_indices
indices (sel
, 2, nunits
);
8360 = vect_gen_perm_mask_checked (vectype
, indices
);
8361 vec_init
= gimple_build (stmts
, VEC_PERM_EXPR
,
8368 case vect_step_op_mul
:
8370 /* Use unsigned mult to avoid UD integer overflow. */
8371 gcc_assert (nunits
.is_constant (&const_nunits
));
8372 tree utype
= unsigned_type_for (itype
);
8373 tree uvectype
= build_vector_type (utype
,
8374 TYPE_VECTOR_SUBPARTS (vectype
));
8375 new_name
= gimple_convert (stmts
, utype
, new_name
);
8376 vec_init
= gimple_build_vector_from_val (stmts
,
8379 tree_vector_builder
elts (uvectype
, const_nunits
, 1);
8380 tree elt_step
= build_one_cst (utype
);
8382 elts
.quick_push (elt_step
);
8383 for (i
= 1; i
< const_nunits
; i
++)
8385 /* Create: new_name_i = new_name + step_expr. */
8386 elt_step
= gimple_build (stmts
, MULT_EXPR
,
8387 utype
, elt_step
, step_expr
);
8388 elts
.quick_push (elt_step
);
8390 /* Create a vector from [new_name_0, new_name_1, ...,
8391 new_name_nunits-1]. */
8392 tree vec_mul
= gimple_build_vector (stmts
, &elts
);
8393 vec_init
= gimple_build (stmts
, MULT_EXPR
, uvectype
,
8395 vec_init
= gimple_convert (stmts
, vectype
, vec_init
);
/* Peel init_expr by skip_niters for induction_type.  */
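/* Informal illustration of the cases below (numbers are assumptions):
   skipping k = skip_niters scalar iterations turns the start value X into
     neg:      X if k is even, -X if k is odd
     shr/shl:  X >> (S*k) resp. X << (S*k), with the result pinned once the
	       total shift reaches the type precision
     mul:      X * S^k
   e.g. for an a >>= 1 IV with X = 64 and k = 3 the peeled start value is 8.  */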
8408 vect_peel_nonlinear_iv_init (gimple_seq
* stmts
, tree init_expr
,
8409 tree skip_niters
, tree step_expr
,
8410 enum vect_induction_op_type induction_type
)
8412 gcc_assert (TREE_CODE (skip_niters
) == INTEGER_CST
);
8413 tree type
= TREE_TYPE (init_expr
);
8414 unsigned prec
= TYPE_PRECISION (type
);
8415 switch (induction_type
)
8417 case vect_step_op_neg
:
8418 if (TREE_INT_CST_LOW (skip_niters
) % 2)
8419 init_expr
= gimple_build (stmts
, NEGATE_EXPR
, type
, init_expr
);
8420 /* else no change. */
8423 case vect_step_op_shr
:
8424 case vect_step_op_shl
:
8425 skip_niters
= gimple_convert (stmts
, type
, skip_niters
);
8426 step_expr
= gimple_build (stmts
, MULT_EXPR
, type
, step_expr
, skip_niters
);
	  /* When the shift amount >= precision, we need to avoid undefined
	     behavior.  In the original loop there is no UB, and according to
	     the semantics init_expr should be 0 for lshr and ashl, and
	     >>= (prec - 1) for ashr.  */
8430 if (!tree_fits_uhwi_p (step_expr
)
8431 || tree_to_uhwi (step_expr
) >= prec
)
8433 if (induction_type
== vect_step_op_shl
8434 || TYPE_UNSIGNED (type
))
8435 init_expr
= build_zero_cst (type
);
8437 init_expr
= gimple_build (stmts
, RSHIFT_EXPR
, type
,
8439 wide_int_to_tree (type
, prec
- 1));
8442 init_expr
= gimple_build (stmts
, (induction_type
== vect_step_op_shr
8443 ? RSHIFT_EXPR
: LSHIFT_EXPR
),
8444 type
, init_expr
, step_expr
);
8447 case vect_step_op_mul
:
8449 tree utype
= unsigned_type_for (type
);
8450 init_expr
= gimple_convert (stmts
, utype
, init_expr
);
8451 unsigned skipn
= TREE_INT_CST_LOW (skip_niters
);
8452 wide_int begin
= wi::to_wide (step_expr
);
8453 for (unsigned i
= 0; i
!= skipn
- 1; i
++)
8454 begin
= wi::mul (begin
, wi::to_wide (step_expr
));
8455 tree mult_expr
= wide_int_to_tree (utype
, begin
);
8456 init_expr
= gimple_build (stmts
, MULT_EXPR
, utype
, init_expr
, mult_expr
);
8457 init_expr
= gimple_convert (stmts
, type
, init_expr
);
8468 /* Create vector step for vectorized iv. */
8470 vect_create_nonlinear_iv_step (gimple_seq
* stmts
, tree step_expr
,
8472 enum vect_induction_op_type induction_type
)
8474 tree expr
= build_int_cst (TREE_TYPE (step_expr
), vf
);
8475 tree new_name
= NULL
;
  /* Step should be pow (step, vf) for mult induction.  */
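  /* E.g. (illustrative numbers): for a mult IV with scalar step 3 and
     vf = 4 the vector step is 3^4 = 81, since advancing a lane by one
     vector iteration multiplies it by the scalar step vf times.  */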
8477 if (induction_type
== vect_step_op_mul
)
8479 gcc_assert (vf
.is_constant ());
8480 wide_int begin
= wi::to_wide (step_expr
);
8482 for (unsigned i
= 0; i
!= vf
.to_constant () - 1; i
++)
8483 begin
= wi::mul (begin
, wi::to_wide (step_expr
));
8485 new_name
= wide_int_to_tree (TREE_TYPE (step_expr
), begin
);
8487 else if (induction_type
== vect_step_op_neg
)
8491 new_name
= gimple_build (stmts
, MULT_EXPR
, TREE_TYPE (step_expr
),
8497 vect_create_nonlinear_iv_vec_step (loop_vec_info loop_vinfo
,
8498 stmt_vec_info stmt_info
,
8499 tree new_name
, tree vectype
,
8500 enum vect_induction_op_type induction_type
)
8502 /* No step is needed for neg induction. */
8503 if (induction_type
== vect_step_op_neg
)
8506 tree t
= unshare_expr (new_name
);
8507 gcc_assert (CONSTANT_CLASS_P (new_name
)
8508 || TREE_CODE (new_name
) == SSA_NAME
);
8509 tree new_vec
= build_vector_from_val (vectype
, t
);
8510 tree vec_step
= vect_init_vector (loop_vinfo
, stmt_info
,
8511 new_vec
, vectype
, NULL
);
8515 /* Update vectorized iv with vect_step, induc_def is init. */
8517 vect_update_nonlinear_iv (gimple_seq
* stmts
, tree vectype
,
8518 tree induc_def
, tree vec_step
,
8519 enum vect_induction_op_type induction_type
)
8521 tree vec_def
= induc_def
;
8522 switch (induction_type
)
8524 case vect_step_op_mul
:
8526 /* Use unsigned mult to avoid UD integer overflow. */
8528 = build_vector_type (unsigned_type_for (TREE_TYPE (vectype
)),
8529 TYPE_VECTOR_SUBPARTS (vectype
));
8530 vec_def
= gimple_convert (stmts
, uvectype
, vec_def
);
8531 vec_step
= gimple_convert (stmts
, uvectype
, vec_step
);
8532 vec_def
= gimple_build (stmts
, MULT_EXPR
, uvectype
,
8534 vec_def
= gimple_convert (stmts
, vectype
, vec_def
);
8538 case vect_step_op_shr
:
8539 vec_def
= gimple_build (stmts
, RSHIFT_EXPR
, vectype
,
8543 case vect_step_op_shl
:
8544 vec_def
= gimple_build (stmts
, LSHIFT_EXPR
, vectype
,
8547 case vect_step_op_neg
:
8548 vec_def
= induc_def
;
/* Function vectorizable_nonlinear_induction

   Check if STMT_INFO performs a nonlinear induction computation that can be
   vectorized.  If VEC_STMT is also passed, vectorize the induction PHI: create
   a vectorized phi to replace it, put it in VEC_STMT, and add it to the same
   basic block.
   Return true if STMT_INFO is vectorizable in this way.  */
8567 vectorizable_nonlinear_induction (loop_vec_info loop_vinfo
,
8568 stmt_vec_info stmt_info
,
8569 gimple
**vec_stmt
, slp_tree slp_node
,
8570 stmt_vector_for_cost
*cost_vec
)
8572 class loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
8574 bool nested_in_vect_loop
= false;
8575 class loop
*iv_loop
;
8577 edge pe
= loop_preheader_edge (loop
);
8579 tree vec_init
, vec_step
;
8582 gphi
*induction_phi
;
8583 tree induc_def
, vec_dest
;
8584 tree init_expr
, step_expr
;
8586 poly_uint64 vf
= LOOP_VINFO_VECT_FACTOR (loop_vinfo
);
8588 gimple_stmt_iterator si
;
8590 gphi
*phi
= dyn_cast
<gphi
*> (stmt_info
->stmt
);
8592 tree vectype
= STMT_VINFO_VECTYPE (stmt_info
);
8593 poly_uint64 nunits
= TYPE_VECTOR_SUBPARTS (vectype
);
8594 enum vect_induction_op_type induction_type
8595 = STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info
);
8597 gcc_assert (induction_type
> vect_step_op_add
);
8602 ncopies
= vect_get_num_copies (loop_vinfo
, vectype
);
8603 gcc_assert (ncopies
>= 1);
8605 /* FORNOW. Only handle nonlinear induction in the same loop. */
8606 if (nested_in_vect_loop_p (loop
, stmt_info
))
8608 if (dump_enabled_p ())
8609 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
8610 "nonlinear induction in nested loop.\n");
8615 gcc_assert (iv_loop
== (gimple_bb (phi
))->loop_father
);
8617 /* TODO: Support slp for nonlinear iv. There should be separate vector iv
8618 update for each iv and a permutation to generate wanted vector iv. */
8621 if (dump_enabled_p ())
8622 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
8623 "SLP induction not supported for nonlinear"
  /* Init_expr will be updated by vect_update_ivs_after_vectorizer
     if niters is unknown:
     For shift, when the shift amount >= precision, there would be UB.
     For mult, we don't know how to generate
     init_expr * pow (step, niters) for variable niters.
     For neg, it should be ok, since niters of the vectorized main loop
     will always be a multiple of 2.  */
8635 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo
)
8636 && induction_type
!= vect_step_op_neg
)
8638 if (dump_enabled_p ())
8639 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
8640 "Peeling for epilogue is not supported"
8641 " for nonlinear induction except neg"
8642 " when iteration count is unknown.\n");
  /* Also we don't support peeling for neg when niters is variable.
     ??? generate something like niter_expr & 1 ? init_expr : -init_expr?  */
8648 niters_skip
= LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo
);
8649 if (niters_skip
!= NULL_TREE
8650 && TREE_CODE (niters_skip
) != INTEGER_CST
)
8652 if (dump_enabled_p ())
8653 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
			 "Peeling for alignment is not supported"
			 " for nonlinear induction when niters_skip"
			 " is not constant.\n");
8660 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo
)
8661 && induction_type
== vect_step_op_mul
)
8662 if (!INTEGRAL_TYPE_P (TREE_TYPE (vectype
)))
8664 if (dump_enabled_p ())
8665 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
8666 "floating point nonlinear induction vectorization"
8667 " not supported.\n");
8671 step_expr
= STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info
);
8672 init_expr
= vect_phi_initial_value (phi
);
8673 gcc_assert (step_expr
!= NULL_TREE
&& init_expr
!= NULL
8674 && TREE_CODE (step_expr
) == INTEGER_CST
);
  /* step_expr should be aligned with init_expr,
     i.e. for uint64 a >> 1 the step is int, but a vector<uint64> shift is used.  */
8677 step_expr
= fold_convert (TREE_TYPE (vectype
), step_expr
);
8679 if (TREE_CODE (init_expr
) == INTEGER_CST
)
8680 init_expr
= fold_convert (TREE_TYPE (vectype
), init_expr
);
8682 gcc_assert (tree_nop_conversion_p (TREE_TYPE (vectype
),
8683 TREE_TYPE (init_expr
)));
8685 switch (induction_type
)
8687 case vect_step_op_neg
:
8688 if (TREE_CODE (init_expr
) != INTEGER_CST
8689 && TREE_CODE (init_expr
) != REAL_CST
)
8691 /* Check for backend support of NEGATE_EXPR and vec_perm. */
8692 if (!directly_supported_p (NEGATE_EXPR
, vectype
))
8695 /* The encoding has 2 interleaved stepped patterns. */
8696 vec_perm_builder
sel (nunits
, 2, 3);
8697 machine_mode mode
= TYPE_MODE (vectype
);
8699 for (i
= 0; i
< 3; i
++)
8702 sel
[i
* 2 + 1] = i
+ nunits
;
8704 vec_perm_indices
indices (sel
, 2, nunits
);
8705 if (!can_vec_perm_const_p (mode
, mode
, indices
))
8710 case vect_step_op_mul
:
8712 /* Check for backend support of MULT_EXPR. */
8713 if (!directly_supported_p (MULT_EXPR
, vectype
))
      /* ?? How to construct the vector step for a variable-length vector:
	 [ 1, step, pow (step, 2), pow (step, 3), .. ].  */
8718 if (!vf
.is_constant ())
8723 case vect_step_op_shr
:
8724 /* Check for backend support of RSHIFT_EXPR. */
8725 if (!directly_supported_p (RSHIFT_EXPR
, vectype
, optab_vector
))
8728 /* Don't shift more than type precision to avoid UD. */
8729 if (!tree_fits_uhwi_p (step_expr
)
8730 || maybe_ge (nunits
* tree_to_uhwi (step_expr
),
8731 TYPE_PRECISION (TREE_TYPE (init_expr
))))
8735 case vect_step_op_shl
:
      /* Check for backend support of LSHIFT_EXPR.  */
8737 if (!directly_supported_p (LSHIFT_EXPR
, vectype
, optab_vector
))
8740 /* Don't shift more than type precision to avoid UD. */
8741 if (!tree_fits_uhwi_p (step_expr
)
8742 || maybe_ge (nunits
* tree_to_uhwi (step_expr
),
8743 TYPE_PRECISION (TREE_TYPE (init_expr
))))
8752 if (!vec_stmt
) /* transformation not required. */
8754 unsigned inside_cost
= 0, prologue_cost
= 0;
8755 /* loop cost for vec_loop. Neg induction doesn't have any
8757 inside_cost
= record_stmt_cost (cost_vec
, ncopies
, vector_stmt
,
8758 stmt_info
, 0, vect_body
);
8760 /* loop cost for vec_loop. Neg induction doesn't have any
8762 if (induction_type
== vect_step_op_neg
)
8765 /* prologue cost for vec_init and vec_step. */
8766 prologue_cost
= record_stmt_cost (cost_vec
, 2, scalar_to_vec
,
8767 stmt_info
, 0, vect_prologue
);
8769 if (dump_enabled_p ())
8770 dump_printf_loc (MSG_NOTE
, vect_location
,
8771 "vect_model_induction_cost: inside_cost = %d, "
8772 "prologue_cost = %d. \n", inside_cost
,
8775 STMT_VINFO_TYPE (stmt_info
) = induc_vec_info_type
;
8776 DUMP_VECT_SCOPE ("vectorizable_nonlinear_induction");
8782 /* Compute a vector variable, initialized with the first VF values of
8783 the induction variable. E.g., for an iv with IV_PHI='X' and
8784 evolution S, for a vector of 4 units, we want to compute:
8785 [X, X + S, X + 2*S, X + 3*S]. */
8787 if (dump_enabled_p ())
8788 dump_printf_loc (MSG_NOTE
, vect_location
, "transform induction phi.\n");
8790 pe
= loop_preheader_edge (iv_loop
);
8791 /* Find the first insertion point in the BB. */
8792 basic_block bb
= gimple_bb (phi
);
8793 si
= gsi_after_labels (bb
);
8795 gimple_seq stmts
= NULL
;
8797 /* If we are using the loop mask to "peel" for alignment then we need
8798 to adjust the start value here. */
8799 if (niters_skip
!= NULL_TREE
)
8800 init_expr
= vect_peel_nonlinear_iv_init (&stmts
, init_expr
, niters_skip
,
8801 step_expr
, induction_type
);
8803 vec_init
= vect_create_nonlinear_iv_init (&stmts
, init_expr
,
8804 step_expr
, nunits
, vectype
,
8808 new_bb
= gsi_insert_seq_on_edge_immediate (pe
, stmts
);
8809 gcc_assert (!new_bb
);
8813 new_name
= vect_create_nonlinear_iv_step (&stmts
, step_expr
,
8814 vf
, induction_type
);
8817 new_bb
= gsi_insert_seq_on_edge_immediate (pe
, stmts
);
8818 gcc_assert (!new_bb
);
8821 vec_step
= vect_create_nonlinear_iv_vec_step (loop_vinfo
, stmt_info
,
8824 /* Create the following def-use cycle:
8829 vec_iv = PHI <vec_init, vec_loop>
8833 vec_loop = vec_iv + vec_step; */
8835 /* Create the induction-phi that defines the induction-operand. */
8836 vec_dest
= vect_get_new_vect_var (vectype
, vect_simple_var
, "vec_iv_");
8837 induction_phi
= create_phi_node (vec_dest
, iv_loop
->header
);
8838 induc_def
= PHI_RESULT (induction_phi
);
8840 /* Create the iv update inside the loop. */
8842 vec_def
= vect_update_nonlinear_iv (&stmts
, vectype
,
8843 induc_def
, vec_step
,
8846 gsi_insert_seq_before (&si
, stmts
, GSI_SAME_STMT
);
8847 new_stmt
= SSA_NAME_DEF_STMT (vec_def
);
8849 /* Set the arguments of the phi node: */
8850 add_phi_arg (induction_phi
, vec_init
, pe
, UNKNOWN_LOCATION
);
8851 add_phi_arg (induction_phi
, vec_def
, loop_latch_edge (iv_loop
),
8854 STMT_VINFO_VEC_STMTS (stmt_info
).safe_push (induction_phi
);
8855 *vec_stmt
= induction_phi
;
8857 /* In case that vectorization factor (VF) is bigger than the number
8858 of elements that we can fit in a vectype (nunits), we have to generate
8859 more than one vector stmt - i.e - we need to "unroll" the
8860 vector stmt by a factor VF/nunits. For more details see documentation
8861 in vectorizable_operation. */
8866 /* FORNOW. This restriction should be relaxed. */
8867 gcc_assert (!nested_in_vect_loop
);
8869 new_name
= vect_create_nonlinear_iv_step (&stmts
, step_expr
,
8870 nunits
, induction_type
);
8872 vec_step
= vect_create_nonlinear_iv_vec_step (loop_vinfo
, stmt_info
,
8875 vec_def
= induc_def
;
8876 for (i
= 1; i
< ncopies
; i
++)
8878 /* vec_i = vec_prev + vec_step. */
8880 vec_def
= vect_update_nonlinear_iv (&stmts
, vectype
,
8883 gsi_insert_seq_before (&si
, stmts
, GSI_SAME_STMT
);
8884 new_stmt
= SSA_NAME_DEF_STMT (vec_def
);
8885 STMT_VINFO_VEC_STMTS (stmt_info
).safe_push (new_stmt
);
8889 if (dump_enabled_p ())
8890 dump_printf_loc (MSG_NOTE
, vect_location
,
8891 "transform induction: created def-use cycle: %G%G",
8892 (gimple
*) induction_phi
, SSA_NAME_DEF_STMT (vec_def
));
8897 /* Function vectorizable_induction
8899 Check if STMT_INFO performs an induction computation that can be vectorized.
8900 If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
8901 phi to replace it, put it in VEC_STMT, and add it to the same basic block.
8902 Return true if STMT_INFO is vectorizable in this way. */
8905 vectorizable_induction (loop_vec_info loop_vinfo
,
8906 stmt_vec_info stmt_info
,
8907 gimple
**vec_stmt
, slp_tree slp_node
,
8908 stmt_vector_for_cost
*cost_vec
)
8910 class loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
8912 bool nested_in_vect_loop
= false;
8913 class loop
*iv_loop
;
8915 edge pe
= loop_preheader_edge (loop
);
8917 tree new_vec
, vec_init
, vec_step
, t
;
8920 gphi
*induction_phi
;
8921 tree induc_def
, vec_dest
;
8922 tree init_expr
, step_expr
;
8923 poly_uint64 vf
= LOOP_VINFO_VECT_FACTOR (loop_vinfo
);
8926 gimple_stmt_iterator si
;
8927 enum vect_induction_op_type induction_type
8928 = STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info
);
8930 gphi
*phi
= dyn_cast
<gphi
*> (stmt_info
->stmt
);
8934 if (!STMT_VINFO_RELEVANT_P (stmt_info
))
8937 /* Make sure it was recognized as induction computation. */
8938 if (STMT_VINFO_DEF_TYPE (stmt_info
) != vect_induction_def
)
8941 /* Handle nonlinear induction in a separate place. */
8942 if (induction_type
!= vect_step_op_add
)
8943 return vectorizable_nonlinear_induction (loop_vinfo
, stmt_info
,
8944 vec_stmt
, slp_node
, cost_vec
);
8946 tree vectype
= STMT_VINFO_VECTYPE (stmt_info
);
8947 poly_uint64 nunits
= TYPE_VECTOR_SUBPARTS (vectype
);
8952 ncopies
= vect_get_num_copies (loop_vinfo
, vectype
);
8953 gcc_assert (ncopies
>= 1);
8955 /* FORNOW. These restrictions should be relaxed. */
8956 if (nested_in_vect_loop_p (loop
, stmt_info
))
8958 imm_use_iterator imm_iter
;
8959 use_operand_p use_p
;
8966 if (dump_enabled_p ())
8967 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
8968 "multiple types in nested loop.\n");
8973 latch_e
= loop_latch_edge (loop
->inner
);
8974 loop_arg
= PHI_ARG_DEF_FROM_EDGE (phi
, latch_e
);
8975 FOR_EACH_IMM_USE_FAST (use_p
, imm_iter
, loop_arg
)
8977 gimple
*use_stmt
= USE_STMT (use_p
);
8978 if (is_gimple_debug (use_stmt
))
8981 if (!flow_bb_inside_loop_p (loop
->inner
, gimple_bb (use_stmt
)))
8983 exit_phi
= use_stmt
;
8989 stmt_vec_info exit_phi_vinfo
= loop_vinfo
->lookup_stmt (exit_phi
);
8990 if (!(STMT_VINFO_RELEVANT_P (exit_phi_vinfo
)
8991 && !STMT_VINFO_LIVE_P (exit_phi_vinfo
)))
8993 if (dump_enabled_p ())
8994 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
8995 "inner-loop induction only used outside "
8996 "of the outer vectorized loop.\n");
9001 nested_in_vect_loop
= true;
9002 iv_loop
= loop
->inner
;
9006 gcc_assert (iv_loop
== (gimple_bb (phi
))->loop_father
);
9008 if (slp_node
&& !nunits
.is_constant ())
9010 /* The current SLP code creates the step value element-by-element. */
9011 if (dump_enabled_p ())
9012 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
9013 "SLP induction not supported for variable-length"
9018 if (FLOAT_TYPE_P (vectype
) && !param_vect_induction_float
)
9020 if (dump_enabled_p ())
9021 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
9022 "floating point induction vectorization disabled\n");
9026 step_expr
= STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info
);
9027 gcc_assert (step_expr
!= NULL_TREE
);
9028 tree step_vectype
= get_same_sized_vectype (TREE_TYPE (step_expr
), vectype
);
9030 /* Check for backend support of PLUS/MINUS_EXPR. */
9031 if (!directly_supported_p (PLUS_EXPR
, step_vectype
)
9032 || !directly_supported_p (MINUS_EXPR
, step_vectype
))
9035 if (!vec_stmt
) /* transformation not required. */
9037 unsigned inside_cost
= 0, prologue_cost
= 0;
9040 /* We eventually need to set a vector type on invariant
9044 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node
), j
, child
)
9045 if (!vect_maybe_update_slp_op_vectype
9046 (child
, SLP_TREE_VECTYPE (slp_node
)))
9048 if (dump_enabled_p ())
9049 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
9050 "incompatible vector types for "
9054 /* loop cost for vec_loop. */
9056 = record_stmt_cost (cost_vec
,
9057 SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node
),
9058 vector_stmt
, stmt_info
, 0, vect_body
);
9059 /* prologue cost for vec_init (if not nested) and step. */
9060 prologue_cost
= record_stmt_cost (cost_vec
, 1 + !nested_in_vect_loop
,
9062 stmt_info
, 0, vect_prologue
);
9064 else /* if (!slp_node) */
9066 /* loop cost for vec_loop. */
9067 inside_cost
= record_stmt_cost (cost_vec
, ncopies
, vector_stmt
,
9068 stmt_info
, 0, vect_body
);
9069 /* prologue cost for vec_init and vec_step. */
9070 prologue_cost
= record_stmt_cost (cost_vec
, 2, scalar_to_vec
,
9071 stmt_info
, 0, vect_prologue
);
9073 if (dump_enabled_p ())
9074 dump_printf_loc (MSG_NOTE
, vect_location
,
9075 "vect_model_induction_cost: inside_cost = %d, "
9076 "prologue_cost = %d .\n", inside_cost
,
9079 STMT_VINFO_TYPE (stmt_info
) = induc_vec_info_type
;
9080 DUMP_VECT_SCOPE ("vectorizable_induction");
9086 /* Compute a vector variable, initialized with the first VF values of
9087 the induction variable. E.g., for an iv with IV_PHI='X' and
9088 evolution S, for a vector of 4 units, we want to compute:
9089 [X, X + S, X + 2*S, X + 3*S]. */
9091 if (dump_enabled_p ())
9092 dump_printf_loc (MSG_NOTE
, vect_location
, "transform induction phi.\n");
9094 pe
= loop_preheader_edge (iv_loop
);
9095 /* Find the first insertion point in the BB. */
9096 basic_block bb
= gimple_bb (phi
);
9097 si
= gsi_after_labels (bb
);
9099 /* For SLP induction we have to generate several IVs as for example
9100 with group size 3 we need
9101 [i0, i1, i2, i0 + S0] [i1 + S1, i2 + S2, i0 + 2*S0, i1 + 2*S1]
9102 [i2 + 2*S2, i0 + 3*S0, i1 + 3*S1, i2 + 3*S2]. */
9105 /* Enforced above. */
9106 unsigned int const_nunits
= nunits
.to_constant ();
9108 /* The initial values are vectorized, but any lanes > group_size
9111 = SLP_TREE_CHILDREN (slp_node
)[pe
->dest_idx
];
9113 /* Gather steps. Since we do not vectorize inductions as
9114 cycles we have to reconstruct the step from SCEV data. */
9115 unsigned group_size
= SLP_TREE_LANES (slp_node
);
9116 tree
*steps
= XALLOCAVEC (tree
, group_size
);
9117 tree
*inits
= XALLOCAVEC (tree
, group_size
);
9118 stmt_vec_info phi_info
;
9119 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (slp_node
), i
, phi_info
)
9121 steps
[i
] = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (phi_info
);
9123 inits
[i
] = gimple_phi_arg_def (as_a
<gphi
*> (phi_info
->stmt
),
9127 /* Now generate the IVs. */
9128 unsigned nvects
= SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node
);
9129 gcc_assert ((const_nunits
* nvects
) % group_size
== 0);
9131 if (nested_in_vect_loop
)
9135 /* Compute the number of distinct IVs we need. First reduce
9136 group_size if it is a multiple of const_nunits so we get
9137 one IV for a group_size of 4 but const_nunits 2. */
9138 unsigned group_sizep
= group_size
;
9139 if (group_sizep
% const_nunits
== 0)
9140 group_sizep
= group_sizep
/ const_nunits
;
9141 nivs
= least_common_multiple (group_sizep
,
9142 const_nunits
) / const_nunits
;
9144 tree stept
= TREE_TYPE (step_vectype
);
9145 tree lupdate_mul
= NULL_TREE
;
9146 if (!nested_in_vect_loop
)
9148 /* The number of iterations covered in one vector iteration. */
9149 unsigned lup_mul
= (nvects
* const_nunits
) / group_size
;
9151 = build_vector_from_val (step_vectype
,
9152 SCALAR_FLOAT_TYPE_P (stept
)
9153 ? build_real_from_wide (stept
, lup_mul
,
9155 : build_int_cstu (stept
, lup_mul
));
9157 tree peel_mul
= NULL_TREE
;
9158 gimple_seq init_stmts
= NULL
;
9159 if (LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo
))
9161 if (SCALAR_FLOAT_TYPE_P (stept
))
9162 peel_mul
= gimple_build (&init_stmts
, FLOAT_EXPR
, stept
,
9163 LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo
));
9165 peel_mul
= gimple_convert (&init_stmts
, stept
,
9166 LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo
));
9167 peel_mul
= gimple_build_vector_from_val (&init_stmts
,
9168 step_vectype
, peel_mul
);
9171 auto_vec
<tree
> vec_steps
;
9172 for (ivn
= 0; ivn
< nivs
; ++ivn
)
9174 tree_vector_builder
step_elts (step_vectype
, const_nunits
, 1);
9175 tree_vector_builder
init_elts (vectype
, const_nunits
, 1);
9176 tree_vector_builder
mul_elts (step_vectype
, const_nunits
, 1);
9177 for (unsigned eltn
= 0; eltn
< const_nunits
; ++eltn
)
9179 /* The scalar steps of the IVs. */
9180 tree elt
= steps
[(ivn
*const_nunits
+ eltn
) % group_size
];
9181 elt
= gimple_convert (&init_stmts
, TREE_TYPE (step_vectype
), elt
);
9182 step_elts
.quick_push (elt
);
9185 /* The scalar inits of the IVs if not vectorized. */
9186 elt
= inits
[(ivn
*const_nunits
+ eltn
) % group_size
];
9187 if (!useless_type_conversion_p (TREE_TYPE (vectype
),
9189 elt
= gimple_build (&init_stmts
, VIEW_CONVERT_EXPR
,
9190 TREE_TYPE (vectype
), elt
);
9191 init_elts
.quick_push (elt
);
9193 /* The number of steps to add to the initial values. */
9194 unsigned mul_elt
= (ivn
*const_nunits
+ eltn
) / group_size
;
9195 mul_elts
.quick_push (SCALAR_FLOAT_TYPE_P (stept
)
9196 ? build_real_from_wide (stept
,
9198 : build_int_cstu (stept
, mul_elt
));
9200 vec_step
= gimple_build_vector (&init_stmts
, &step_elts
);
9201 vec_steps
.safe_push (vec_step
);
9202 tree step_mul
= gimple_build_vector (&init_stmts
, &mul_elts
);
9204 step_mul
= gimple_build (&init_stmts
, PLUS_EXPR
, step_vectype
,
9205 step_mul
, peel_mul
);
9207 vec_init
= gimple_build_vector (&init_stmts
, &init_elts
);
9209 /* Create the induction-phi that defines the induction-operand. */
9210 vec_dest
= vect_get_new_vect_var (vectype
, vect_simple_var
,
9212 induction_phi
= create_phi_node (vec_dest
, iv_loop
->header
);
9213 induc_def
= PHI_RESULT (induction_phi
);
9215 /* Create the iv update inside the loop */
9218 up
= gimple_build (&init_stmts
, MULT_EXPR
, step_vectype
,
9219 vec_step
, lupdate_mul
);
9220 gimple_seq stmts
= NULL
;
9221 vec_def
= gimple_convert (&stmts
, step_vectype
, induc_def
);
9222 vec_def
= gimple_build (&stmts
,
9223 PLUS_EXPR
, step_vectype
, vec_def
, up
);
9224 vec_def
= gimple_convert (&stmts
, vectype
, vec_def
);
9225 gsi_insert_seq_before (&si
, stmts
, GSI_SAME_STMT
);
9226 add_phi_arg (induction_phi
, vec_def
, loop_latch_edge (iv_loop
),
9230 vec_init
= vect_get_slp_vect_def (init_node
, ivn
);
9231 if (!nested_in_vect_loop
9232 && !integer_zerop (step_mul
))
9234 vec_def
= gimple_convert (&init_stmts
, step_vectype
, vec_init
);
9235 up
= gimple_build (&init_stmts
, MULT_EXPR
, step_vectype
,
9236 vec_step
, step_mul
);
9237 vec_def
= gimple_build (&init_stmts
, PLUS_EXPR
, step_vectype
,
9239 vec_init
= gimple_convert (&init_stmts
, vectype
, vec_def
);
9242 /* Set the arguments of the phi node: */
9243 add_phi_arg (induction_phi
, vec_init
, pe
, UNKNOWN_LOCATION
);
9245 SLP_TREE_VEC_STMTS (slp_node
).quick_push (induction_phi
);
9247 if (!nested_in_vect_loop
)
9249 /* Fill up to the number of vectors we need for the whole group. */
9250 nivs
= least_common_multiple (group_size
,
9251 const_nunits
) / const_nunits
;
9252 vec_steps
.reserve (nivs
-ivn
);
9253 for (; ivn
< nivs
; ++ivn
)
9255 SLP_TREE_VEC_STMTS (slp_node
)
9256 .quick_push (SLP_TREE_VEC_STMTS (slp_node
)[0]);
9257 vec_steps
.quick_push (vec_steps
[0]);
9261 /* Re-use IVs when we can. We are generating further vector
9262 stmts by adding VF' * stride to the IVs generated above. */
9266 = least_common_multiple (group_size
, const_nunits
) / group_size
;
9268 = build_vector_from_val (step_vectype
,
9269 SCALAR_FLOAT_TYPE_P (stept
)
9270 ? build_real_from_wide (stept
,
9272 : build_int_cstu (stept
, vfp
));
9273 for (; ivn
< nvects
; ++ivn
)
9275 gimple
*iv
= SLP_TREE_VEC_STMTS (slp_node
)[ivn
- nivs
];
9276 tree def
= gimple_get_lhs (iv
);
9278 vec_steps
[ivn
- nivs
]
9279 = gimple_build (&init_stmts
, MULT_EXPR
, step_vectype
,
9280 vec_steps
[ivn
- nivs
], lupdate_mul
);
9281 gimple_seq stmts
= NULL
;
9282 def
= gimple_convert (&stmts
, step_vectype
, def
);
9283 def
= gimple_build (&stmts
, PLUS_EXPR
, step_vectype
,
9284 def
, vec_steps
[ivn
% nivs
]);
9285 def
= gimple_convert (&stmts
, vectype
, def
);
9286 if (gimple_code (iv
) == GIMPLE_PHI
)
9287 gsi_insert_seq_before (&si
, stmts
, GSI_SAME_STMT
);
9290 gimple_stmt_iterator tgsi
= gsi_for_stmt (iv
);
9291 gsi_insert_seq_after (&tgsi
, stmts
, GSI_CONTINUE_LINKING
);
9293 SLP_TREE_VEC_STMTS (slp_node
)
9294 .quick_push (SSA_NAME_DEF_STMT (def
));
9298 new_bb
= gsi_insert_seq_on_edge_immediate (pe
, init_stmts
);
9299 gcc_assert (!new_bb
);
9304 init_expr
= vect_phi_initial_value (phi
);
9306 gimple_seq stmts
= NULL
;
9307 if (!nested_in_vect_loop
)
9309 /* Convert the initial value to the IV update type. */
9310 tree new_type
= TREE_TYPE (step_expr
);
9311 init_expr
= gimple_convert (&stmts
, new_type
, init_expr
);
9313 /* If we are using the loop mask to "peel" for alignment then we need
9314 to adjust the start value here. */
9315 tree skip_niters
= LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo
);
9316 if (skip_niters
!= NULL_TREE
)
9318 if (FLOAT_TYPE_P (vectype
))
9319 skip_niters
= gimple_build (&stmts
, FLOAT_EXPR
, new_type
,
9322 skip_niters
= gimple_convert (&stmts
, new_type
, skip_niters
);
9323 tree skip_step
= gimple_build (&stmts
, MULT_EXPR
, new_type
,
9324 skip_niters
, step_expr
);
9325 init_expr
= gimple_build (&stmts
, MINUS_EXPR
, new_type
,
9326 init_expr
, skip_step
);
9332 new_bb
= gsi_insert_seq_on_edge_immediate (pe
, stmts
);
9333 gcc_assert (!new_bb
);
9336 /* Create the vector that holds the initial_value of the induction. */
9337 if (nested_in_vect_loop
)
9339 /* iv_loop is nested in the loop to be vectorized. init_expr had already
9340 been created during vectorization of previous stmts. We obtain it
9341 from the STMT_VINFO_VEC_STMT of the defining stmt. */
9342 auto_vec
<tree
> vec_inits
;
9343 vect_get_vec_defs_for_operand (loop_vinfo
, stmt_info
, 1,
9344 init_expr
, &vec_inits
);
9345 vec_init
= vec_inits
[0];
9346 /* If the initial value is not of proper type, convert it. */
9347 if (!useless_type_conversion_p (vectype
, TREE_TYPE (vec_init
)))
9350 = gimple_build_assign (vect_get_new_ssa_name (vectype
,
9354 build1 (VIEW_CONVERT_EXPR
, vectype
,
9356 vec_init
= gimple_assign_lhs (new_stmt
);
9357 new_bb
= gsi_insert_on_edge_immediate (loop_preheader_edge (iv_loop
),
9359 gcc_assert (!new_bb
);
9364 /* iv_loop is the loop to be vectorized. Create:
9365 vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr) */
9367 new_name
= gimple_convert (&stmts
, TREE_TYPE (step_expr
), init_expr
);
9369 unsigned HOST_WIDE_INT const_nunits
;
9370 if (nunits
.is_constant (&const_nunits
))
9372 tree_vector_builder
elts (step_vectype
, const_nunits
, 1);
9373 elts
.quick_push (new_name
);
9374 for (i
= 1; i
< const_nunits
; i
++)
9376 /* Create: new_name_i = new_name + step_expr */
9377 new_name
= gimple_build (&stmts
, PLUS_EXPR
, TREE_TYPE (new_name
),
9378 new_name
, step_expr
);
9379 elts
.quick_push (new_name
);
9381 /* Create a vector from [new_name_0, new_name_1, ...,
9382 new_name_nunits-1] */
9383 vec_init
= gimple_build_vector (&stmts
, &elts
);
9385 else if (INTEGRAL_TYPE_P (TREE_TYPE (step_expr
)))
9386 /* Build the initial value directly from a VEC_SERIES_EXPR. */
9387 vec_init
= gimple_build (&stmts
, VEC_SERIES_EXPR
, step_vectype
,
9388 new_name
, step_expr
);
9392 [base, base, base, ...]
9393 + (vectype) [0, 1, 2, ...] * [step, step, step, ...]. */
9394 gcc_assert (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr
)));
9395 gcc_assert (flag_associative_math
);
9396 tree index
= build_index_vector (step_vectype
, 0, 1);
9397 tree base_vec
= gimple_build_vector_from_val (&stmts
, step_vectype
,
9399 tree step_vec
= gimple_build_vector_from_val (&stmts
, step_vectype
,
9401 vec_init
= gimple_build (&stmts
, FLOAT_EXPR
, step_vectype
, index
);
9402 vec_init
= gimple_build (&stmts
, MULT_EXPR
, step_vectype
,
9403 vec_init
, step_vec
);
9404 vec_init
= gimple_build (&stmts
, PLUS_EXPR
, step_vectype
,
9405 vec_init
, base_vec
);
9407 vec_init
= gimple_convert (&stmts
, vectype
, vec_init
);
9411 new_bb
= gsi_insert_seq_on_edge_immediate (pe
, stmts
);
9412 gcc_assert (!new_bb
);
9417 /* Create the vector that holds the step of the induction. */
9418 if (nested_in_vect_loop
)
9419 /* iv_loop is nested in the loop to be vectorized. Generate:
9420 vec_step = [S, S, S, S] */
9421 new_name
= step_expr
;
9424 /* iv_loop is the loop to be vectorized. Generate:
9425 vec_step = [VF*S, VF*S, VF*S, VF*S] */
9426 gimple_seq seq
= NULL
;
9427 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr
)))
9429 expr
= build_int_cst (integer_type_node
, vf
);
9430 expr
= gimple_build (&seq
, FLOAT_EXPR
, TREE_TYPE (step_expr
), expr
);
9433 expr
= build_int_cst (TREE_TYPE (step_expr
), vf
);
9434 new_name
= gimple_build (&seq
, MULT_EXPR
, TREE_TYPE (step_expr
),
9438 new_bb
= gsi_insert_seq_on_edge_immediate (pe
, seq
);
9439 gcc_assert (!new_bb
);
9443 t
= unshare_expr (new_name
);
9444 gcc_assert (CONSTANT_CLASS_P (new_name
)
9445 || TREE_CODE (new_name
) == SSA_NAME
);
9446 new_vec
= build_vector_from_val (step_vectype
, t
);
9447 vec_step
= vect_init_vector (loop_vinfo
, stmt_info
,
9448 new_vec
, step_vectype
, NULL
);
9451 /* Create the following def-use cycle:
9456 vec_iv = PHI <vec_init, vec_loop>
9460 vec_loop = vec_iv + vec_step; */
9462 /* Create the induction-phi that defines the induction-operand. */
9463 vec_dest
= vect_get_new_vect_var (vectype
, vect_simple_var
, "vec_iv_");
9464 induction_phi
= create_phi_node (vec_dest
, iv_loop
->header
);
9465 induc_def
= PHI_RESULT (induction_phi
);
9467 /* Create the iv update inside the loop */
9469 vec_def
= gimple_convert (&stmts
, step_vectype
, induc_def
);
9470 vec_def
= gimple_build (&stmts
, PLUS_EXPR
, step_vectype
, vec_def
, vec_step
);
9471 vec_def
= gimple_convert (&stmts
, vectype
, vec_def
);
9472 gsi_insert_seq_before (&si
, stmts
, GSI_SAME_STMT
);
9473 new_stmt
= SSA_NAME_DEF_STMT (vec_def
);
9475 /* Set the arguments of the phi node: */
9476 add_phi_arg (induction_phi
, vec_init
, pe
, UNKNOWN_LOCATION
);
9477 add_phi_arg (induction_phi
, vec_def
, loop_latch_edge (iv_loop
),
9480 STMT_VINFO_VEC_STMTS (stmt_info
).safe_push (induction_phi
);
9481 *vec_stmt
= induction_phi
;
9483 /* In case that vectorization factor (VF) is bigger than the number
9484 of elements that we can fit in a vectype (nunits), we have to generate
9485 more than one vector stmt - i.e - we need to "unroll" the
9486 vector stmt by a factor VF/nunits. For more details see documentation
9487 in vectorizable_operation. */
9491 gimple_seq seq
= NULL
;
9492 /* FORNOW. This restriction should be relaxed. */
9493 gcc_assert (!nested_in_vect_loop
);
9495 /* Create the vector that holds the step of the induction. */
9496 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr
)))
9498 expr
= build_int_cst (integer_type_node
, nunits
);
9499 expr
= gimple_build (&seq
, FLOAT_EXPR
, TREE_TYPE (step_expr
), expr
);
9502 expr
= build_int_cst (TREE_TYPE (step_expr
), nunits
);
9503 new_name
= gimple_build (&seq
, MULT_EXPR
, TREE_TYPE (step_expr
),
9507 new_bb
= gsi_insert_seq_on_edge_immediate (pe
, seq
);
9508 gcc_assert (!new_bb
);
9511 t
= unshare_expr (new_name
);
9512 gcc_assert (CONSTANT_CLASS_P (new_name
)
9513 || TREE_CODE (new_name
) == SSA_NAME
);
9514 new_vec
= build_vector_from_val (step_vectype
, t
);
9515 vec_step
= vect_init_vector (loop_vinfo
, stmt_info
,
9516 new_vec
, step_vectype
, NULL
);
9518 vec_def
= induc_def
;
9519 for (i
= 1; i
< ncopies
; i
++)
9521 /* vec_i = vec_prev + vec_step */
9522 gimple_seq stmts
= NULL
;
9523 vec_def
= gimple_convert (&stmts
, step_vectype
, vec_def
);
9524 vec_def
= gimple_build (&stmts
,
9525 PLUS_EXPR
, step_vectype
, vec_def
, vec_step
);
9526 vec_def
= gimple_convert (&stmts
, vectype
, vec_def
);
9528 gsi_insert_seq_before (&si
, stmts
, GSI_SAME_STMT
);
9529 new_stmt
= SSA_NAME_DEF_STMT (vec_def
);
9530 STMT_VINFO_VEC_STMTS (stmt_info
).safe_push (new_stmt
);
9534 if (dump_enabled_p ())
9535 dump_printf_loc (MSG_NOTE
, vect_location
,
9536 "transform induction: created def-use cycle: %G%G",
9537 (gimple
*) induction_phi
, SSA_NAME_DEF_STMT (vec_def
));
9542 /* Function vectorizable_live_operation.
9544 STMT_INFO computes a value that is used outside the loop. Check if
9545 it can be supported. */
9548 vectorizable_live_operation (vec_info
*vinfo
,
9549 stmt_vec_info stmt_info
,
9550 gimple_stmt_iterator
*gsi
,
9551 slp_tree slp_node
, slp_instance slp_node_instance
,
9552 int slp_index
, bool vec_stmt_p
,
9553 stmt_vector_for_cost
*cost_vec
)
9555 loop_vec_info loop_vinfo
= dyn_cast
<loop_vec_info
> (vinfo
);
9556 imm_use_iterator imm_iter
;
9557 tree lhs
, lhs_type
, bitsize
;
9558 tree vectype
= (slp_node
9559 ? SLP_TREE_VECTYPE (slp_node
)
9560 : STMT_VINFO_VECTYPE (stmt_info
));
9561 poly_uint64 nunits
= TYPE_VECTOR_SUBPARTS (vectype
);
9564 auto_vec
<tree
> vec_oprnds
;
9566 poly_uint64 vec_index
= 0;
9568 gcc_assert (STMT_VINFO_LIVE_P (stmt_info
));
9570 /* If a stmt of a reduction is live, vectorize it via
9571 vect_create_epilog_for_reduction. vectorizable_reduction assessed
9572 validity so just trigger the transform here. */
9573 if (STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info
)))
9579 /* For reduction chains the meta-info is attached to
9580 the group leader. */
9581 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info
))
9582 stmt_info
= REDUC_GROUP_FIRST_ELEMENT (stmt_info
);
9583 /* For SLP reductions we vectorize the epilogue for
9584 all involved stmts together. */
9585 else if (slp_index
!= 0)
9588 /* For SLP reductions the meta-info is attached to
9589 the representative. */
9590 stmt_info
= SLP_TREE_REPRESENTATIVE (slp_node
);
9592 stmt_vec_info reduc_info
= info_for_reduction (loop_vinfo
, stmt_info
);
9593 gcc_assert (reduc_info
->is_reduc_info
);
9594 if (STMT_VINFO_REDUC_TYPE (reduc_info
) == FOLD_LEFT_REDUCTION
9595 || STMT_VINFO_REDUC_TYPE (reduc_info
) == EXTRACT_LAST_REDUCTION
)
9597 vect_create_epilog_for_reduction (loop_vinfo
, stmt_info
, slp_node
,
9602 /* If STMT is not relevant and it is a simple assignment and its inputs are
9603 invariant then it can remain in place, unvectorized. The original last
9604 scalar value that it computes will be used. */
9605 if (!STMT_VINFO_RELEVANT_P (stmt_info
))
9607 gcc_assert (is_simple_and_all_uses_invariant (stmt_info
, loop_vinfo
));
9608 if (dump_enabled_p ())
9609 dump_printf_loc (MSG_NOTE
, vect_location
,
9610 "statement is simple and uses invariant. Leaving in "
9618 ncopies
= vect_get_num_copies (loop_vinfo
, vectype
);
9622 gcc_assert (slp_index
>= 0);
9624 /* Get the last occurrence of the scalar index from the concatenation of
9625 all the slp vectors. Calculate which slp vector it is and the index
9627 int num_scalar
= SLP_TREE_LANES (slp_node
);
9628 int num_vec
= SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node
);
9629 poly_uint64 pos
= (num_vec
* nunits
) - num_scalar
+ slp_index
;
9631 /* Calculate which vector contains the result, and which lane of
9632 that vector we need. */
9633 if (!can_div_trunc_p (pos
, nunits
, &vec_entry
, &vec_index
))
9635 if (dump_enabled_p ())
9636 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
9637 "Cannot determine which vector holds the"
9638 " final result.\n");
9645 /* No transformation required. */
9646 if (loop_vinfo
&& LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo
))
9648 if (!direct_internal_fn_supported_p (IFN_EXTRACT_LAST
, vectype
,
9649 OPTIMIZE_FOR_SPEED
))
9651 if (dump_enabled_p ())
9652 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
9653 "can't operate on partial vectors "
9654 "because the target doesn't support extract "
9655 "last reduction.\n");
9656 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo
) = false;
9660 if (dump_enabled_p ())
9661 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
9662 "can't operate on partial vectors "
9663 "because an SLP statement is live after "
9665 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo
) = false;
9667 else if (ncopies
> 1)
9669 if (dump_enabled_p ())
9670 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
9671 "can't operate on partial vectors "
9672 "because ncopies is greater than 1.\n");
9673 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo
) = false;
9677 gcc_assert (ncopies
== 1 && !slp_node
);
9678 vect_record_loop_mask (loop_vinfo
,
9679 &LOOP_VINFO_MASKS (loop_vinfo
),
9683 /* ??? Enable for loop costing as well. */
9685 record_stmt_cost (cost_vec
, 1, vec_to_scalar
, stmt_info
, NULL_TREE
,
9690 /* Use the lhs of the original scalar statement. */
9691 gimple
*stmt
= vect_orig_stmt (stmt_info
)->stmt
;
9692 if (dump_enabled_p ())
9693 dump_printf_loc (MSG_NOTE
, vect_location
, "extracting lane for live "
9696 lhs
= gimple_get_lhs (stmt
);
9697 lhs_type
= TREE_TYPE (lhs
);
9699 bitsize
= vector_element_bits_tree (vectype
);
9701 /* Get the vectorized lhs of STMT and the lane to use (counted in bits). */
9702 tree vec_lhs
, bitstart
;
9706 gcc_assert (!loop_vinfo
|| !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo
));
9708 /* Get the correct slp vectorized stmt. */
9709 vec_stmt
= SLP_TREE_VEC_STMTS (slp_node
)[vec_entry
];
9710 vec_lhs
= gimple_get_lhs (vec_stmt
);
9712 /* Get entry to use. */
9713 bitstart
= bitsize_int (vec_index
);
9714 bitstart
= int_const_binop (MULT_EXPR
, bitsize
, bitstart
);
9718 /* For multiple copies, get the last copy. */
9719 vec_stmt
= STMT_VINFO_VEC_STMTS (stmt_info
).last ();
9720 vec_lhs
= gimple_get_lhs (vec_stmt
);
9722 /* Get the last lane in the vector. */
9723 bitstart
= int_const_binop (MULT_EXPR
, bitsize
, bitsize_int (nunits
- 1));
9728 /* Ensure the VEC_LHS for lane extraction stmts satisfy loop-closed PHI
9729 requirement, insert one phi node for it. It looks like:
9736 # vec_lhs' = PHI <vec_lhs>
9737 new_tree = lane_extract <vec_lhs', ...>;
9740 class loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
9741 basic_block exit_bb
= single_exit (loop
)->dest
;
9742 gcc_assert (single_pred_p (exit_bb
));
9744 tree vec_lhs_phi
= copy_ssa_name (vec_lhs
);
9745 gimple
*phi
= create_phi_node (vec_lhs_phi
, exit_bb
);
9746 SET_PHI_ARG_DEF (phi
, single_exit (loop
)->dest_idx
, vec_lhs
);
9748 gimple_seq stmts
= NULL
;
9750 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo
))
9754 SCALAR_RES = EXTRACT_LAST <VEC_LHS, MASK>
9756 where VEC_LHS is the vectorized live-out result and MASK is
9757 the loop mask for the final iteration. */
9758 gcc_assert (ncopies
== 1 && !slp_node
);
9759 tree scalar_type
= TREE_TYPE (STMT_VINFO_VECTYPE (stmt_info
));
9760 tree mask
= vect_get_loop_mask (gsi
, &LOOP_VINFO_MASKS (loop_vinfo
),
9762 tree scalar_res
= gimple_build (&stmts
, CFN_EXTRACT_LAST
, scalar_type
,
9765 /* Convert the extracted vector element to the scalar type. */
9766 new_tree
= gimple_convert (&stmts
, lhs_type
, scalar_res
);
9770 tree bftype
= TREE_TYPE (vectype
);
9771 if (VECTOR_BOOLEAN_TYPE_P (vectype
))
9772 bftype
= build_nonstandard_integer_type (tree_to_uhwi (bitsize
), 1);
9773 new_tree
= build3 (BIT_FIELD_REF
, bftype
,
9774 vec_lhs_phi
, bitsize
, bitstart
);
9775 new_tree
= force_gimple_operand (fold_convert (lhs_type
, new_tree
),
9776 &stmts
, true, NULL_TREE
);
9781 gimple_stmt_iterator exit_gsi
= gsi_after_labels (exit_bb
);
9782 gsi_insert_seq_before (&exit_gsi
, stmts
, GSI_SAME_STMT
);
9784 /* Remove existing phi from lhs and create one copy from new_tree. */
9785 tree lhs_phi
= NULL_TREE
;
9786 gimple_stmt_iterator gsi
;
9787 for (gsi
= gsi_start_phis (exit_bb
);
9788 !gsi_end_p (gsi
); gsi_next (&gsi
))
9790 gimple
*phi
= gsi_stmt (gsi
);
9791 if ((gimple_phi_arg_def (phi
, 0) == lhs
))
9793 remove_phi_node (&gsi
, false);
9794 lhs_phi
= gimple_phi_result (phi
);
9795 gimple
*copy
= gimple_build_assign (lhs_phi
, new_tree
);
9796 gsi_insert_before (&exit_gsi
, copy
, GSI_SAME_STMT
);
9802 /* Replace use of lhs with newly computed result. If the use stmt is a
9803 single arg PHI, just replace all uses of PHI result. It's necessary
9804 because lcssa PHI defining lhs may be before newly inserted stmt. */
9805 use_operand_p use_p
;
9806 FOR_EACH_IMM_USE_STMT (use_stmt
, imm_iter
, lhs
)
9807 if (!flow_bb_inside_loop_p (loop
, gimple_bb (use_stmt
))
9808 && !is_gimple_debug (use_stmt
))
9810 if (gimple_code (use_stmt
) == GIMPLE_PHI
9811 && gimple_phi_num_args (use_stmt
) == 1)
9813 replace_uses_by (gimple_phi_result (use_stmt
), new_tree
);
9817 FOR_EACH_IMM_USE_ON_STMT (use_p
, imm_iter
)
9818 SET_USE (use_p
, new_tree
);
9820 update_stmt (use_stmt
);
9825 /* For basic-block vectorization simply insert the lane-extraction. */
9826 tree bftype
= TREE_TYPE (vectype
);
9827 if (VECTOR_BOOLEAN_TYPE_P (vectype
))
9828 bftype
= build_nonstandard_integer_type (tree_to_uhwi (bitsize
), 1);
9829 tree new_tree
= build3 (BIT_FIELD_REF
, bftype
,
9830 vec_lhs
, bitsize
, bitstart
);
9831 gimple_seq stmts
= NULL
;
9832 new_tree
= force_gimple_operand (fold_convert (lhs_type
, new_tree
),
9833 &stmts
, true, NULL_TREE
);
9834 if (TREE_CODE (new_tree
) == SSA_NAME
9835 && SSA_NAME_OCCURS_IN_ABNORMAL_PHI (lhs
))
9836 SSA_NAME_OCCURS_IN_ABNORMAL_PHI (new_tree
) = 1;
9837 if (is_a
<gphi
*> (vec_stmt
))
9839 gimple_stmt_iterator si
= gsi_after_labels (gimple_bb (vec_stmt
));
9840 gsi_insert_seq_before (&si
, stmts
, GSI_SAME_STMT
);
9844 gimple_stmt_iterator si
= gsi_for_stmt (vec_stmt
);
9845 gsi_insert_seq_after (&si
, stmts
, GSI_SAME_STMT
);
9848 /* Replace use of lhs with newly computed result. If the use stmt is a
9849 single arg PHI, just replace all uses of PHI result. It's necessary
9850 because lcssa PHI defining lhs may be before newly inserted stmt. */
9851 use_operand_p use_p
;
9852 stmt_vec_info use_stmt_info
;
9853 FOR_EACH_IMM_USE_STMT (use_stmt
, imm_iter
, lhs
)
9854 if (!is_gimple_debug (use_stmt
)
9855 && (!(use_stmt_info
= vinfo
->lookup_stmt (use_stmt
))
9856 || !PURE_SLP_STMT (vect_stmt_to_vectorize (use_stmt_info
))))
9858 /* ??? This can happen when the live lane ends up being
9859 used in a vector construction code-generated by an
9860 external SLP node (and code-generation for that already
9861 happened). See gcc.dg/vect/bb-slp-47.c.
9862 Doing this is what would happen if that vector CTOR
9863 were not code-generated yet so it is not too bad.
9864 ??? In fact we'd likely want to avoid this situation
9865 in the first place. */
9866 if (TREE_CODE (new_tree
) == SSA_NAME
9867 && !SSA_NAME_IS_DEFAULT_DEF (new_tree
)
9868 && gimple_code (use_stmt
) != GIMPLE_PHI
9869 && !vect_stmt_dominates_stmt_p (SSA_NAME_DEF_STMT (new_tree
),
9872 enum tree_code code
= gimple_assign_rhs_code (use_stmt
);
9873 gcc_assert (code
== CONSTRUCTOR
9874 || code
== VIEW_CONVERT_EXPR
9875 || CONVERT_EXPR_CODE_P (code
));
9876 if (dump_enabled_p ())
9877 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
9878 "Using original scalar computation for "
9879 "live lane because use preceeds vector "
9883 /* ??? It can also happen that we end up pulling a def into
9884 a loop where replacing out-of-loop uses would require
9885 a new LC SSA PHI node. Retain the original scalar in
9886 those cases as well. PR98064. */
9887 if (TREE_CODE (new_tree
) == SSA_NAME
9888 && !SSA_NAME_IS_DEFAULT_DEF (new_tree
)
9889 && (gimple_bb (use_stmt
)->loop_father
9890 != gimple_bb (vec_stmt
)->loop_father
)
9891 && !flow_loop_nested_p (gimple_bb (vec_stmt
)->loop_father
,
9892 gimple_bb (use_stmt
)->loop_father
))
9894 if (dump_enabled_p ())
9895 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
9896 "Using original scalar computation for "
9897 "live lane because there is an out-of-loop "
9898 "definition for it\n");
9901 FOR_EACH_IMM_USE_ON_STMT (use_p
, imm_iter
)
9902 SET_USE (use_p
, new_tree
);
9903 update_stmt (use_stmt
);
9910 /* Kill any debug uses outside LOOP of SSA names defined in STMT_INFO. */
9913 vect_loop_kill_debug_uses (class loop
*loop
, stmt_vec_info stmt_info
)
9915 ssa_op_iter op_iter
;
9916 imm_use_iterator imm_iter
;
9917 def_operand_p def_p
;
9920 FOR_EACH_PHI_OR_STMT_DEF (def_p
, stmt_info
->stmt
, op_iter
, SSA_OP_DEF
)
9922 FOR_EACH_IMM_USE_STMT (ustmt
, imm_iter
, DEF_FROM_PTR (def_p
))
9926 if (!is_gimple_debug (ustmt
))
9929 bb
= gimple_bb (ustmt
);
9931 if (!flow_bb_inside_loop_p (loop
, bb
))
9933 if (gimple_debug_bind_p (ustmt
))
9935 if (dump_enabled_p ())
9936 dump_printf_loc (MSG_NOTE
, vect_location
,
9937 "killing debug use\n");
9939 gimple_debug_bind_reset_value (ustmt
);
9940 update_stmt (ustmt
);
9949 /* Given loop represented by LOOP_VINFO, return true if computation of
9950 LOOP_VINFO_NITERS (= LOOP_VINFO_NITERSM1 + 1) doesn't overflow, false
9954 loop_niters_no_overflow (loop_vec_info loop_vinfo
)
9956 /* Constant case. */
9957 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo
))
9959 tree cst_niters
= LOOP_VINFO_NITERS (loop_vinfo
);
9960 tree cst_nitersm1
= LOOP_VINFO_NITERSM1 (loop_vinfo
);
9962 gcc_assert (TREE_CODE (cst_niters
) == INTEGER_CST
);
9963 gcc_assert (TREE_CODE (cst_nitersm1
) == INTEGER_CST
);
9964 if (wi::to_widest (cst_nitersm1
) < wi::to_widest (cst_niters
))
9969 class loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
9970 /* Check the upper bound of loop niters. */
9971 if (get_max_loop_iterations (loop
, &max
))
9973 tree type
= TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo
));
9974 signop sgn
= TYPE_SIGN (type
);
9975 widest_int type_max
= widest_int::from (wi::max_value (type
), sgn
);
9982 /* Return a mask type with half the number of elements as OLD_TYPE,
9983 given that it should have mode NEW_MODE. */
9986 vect_halve_mask_nunits (tree old_type
, machine_mode new_mode
)
9988 poly_uint64 nunits
= exact_div (TYPE_VECTOR_SUBPARTS (old_type
), 2);
9989 return build_truth_vector_type_for_mode (nunits
, new_mode
);
9992 /* Return a mask type with twice as many elements as OLD_TYPE,
9993 given that it should have mode NEW_MODE. */
9996 vect_double_mask_nunits (tree old_type
, machine_mode new_mode
)
9998 poly_uint64 nunits
= TYPE_VECTOR_SUBPARTS (old_type
) * 2;
9999 return build_truth_vector_type_for_mode (nunits
, new_mode
);
10002 /* Record that a fully-masked version of LOOP_VINFO would need MASKS to
10003 contain a sequence of NVECTORS masks that each control a vector of type
10004 VECTYPE. If SCALAR_MASK is nonnull, the fully-masked loop would AND
10005 these vector masks with the vector version of SCALAR_MASK. */
10008 vect_record_loop_mask (loop_vec_info loop_vinfo
, vec_loop_masks
*masks
,
10009 unsigned int nvectors
, tree vectype
, tree scalar_mask
)
10011 gcc_assert (nvectors
!= 0);
10012 if (masks
->length () < nvectors
)
10013 masks
->safe_grow_cleared (nvectors
, true);
10014 rgroup_controls
*rgm
= &(*masks
)[nvectors
- 1];
10015 /* The number of scalars per iteration and the number of vectors are
10016 both compile-time constants. */
10017 unsigned int nscalars_per_iter
10018 = exact_div (nvectors
* TYPE_VECTOR_SUBPARTS (vectype
),
10019 LOOP_VINFO_VECT_FACTOR (loop_vinfo
)).to_constant ();
10023 scalar_cond_masked_key
cond (scalar_mask
, nvectors
);
10024 loop_vinfo
->scalar_cond_masked_set
.add (cond
);
10027 if (rgm
->max_nscalars_per_iter
< nscalars_per_iter
)
10029 rgm
->max_nscalars_per_iter
= nscalars_per_iter
;
10030 rgm
->type
= truth_type_for (vectype
);
10035 /* Given a complete set of masks MASKS, extract mask number INDEX
10036 for an rgroup that operates on NVECTORS vectors of type VECTYPE,
10037 where 0 <= INDEX < NVECTORS. Insert any set-up statements before GSI.
10039 See the comment above vec_loop_masks for more details about the mask
10043 vect_get_loop_mask (gimple_stmt_iterator
*gsi
, vec_loop_masks
*masks
,
10044 unsigned int nvectors
, tree vectype
, unsigned int index
)
10046 rgroup_controls
*rgm
= &(*masks
)[nvectors
- 1];
10047 tree mask_type
= rgm
->type
;
10049 /* Populate the rgroup's mask array, if this is the first time we've
10051 if (rgm
->controls
.is_empty ())
10053 rgm
->controls
.safe_grow_cleared (nvectors
, true);
10054 for (unsigned int i
= 0; i
< nvectors
; ++i
)
10056 tree mask
= make_temp_ssa_name (mask_type
, NULL
, "loop_mask");
10057 /* Provide a dummy definition until the real one is available. */
10058 SSA_NAME_DEF_STMT (mask
) = gimple_build_nop ();
10059 rgm
->controls
[i
] = mask
;
10063 tree mask
= rgm
->controls
[index
];
10064 if (maybe_ne (TYPE_VECTOR_SUBPARTS (mask_type
),
10065 TYPE_VECTOR_SUBPARTS (vectype
)))
10067 /* A loop mask for data type X can be reused for data type Y
10068 if X has N times more elements than Y and if Y's elements
10069 are N times bigger than X's. In this case each sequence
10070 of N elements in the loop mask will be all-zero or all-one.
10071 We can then view-convert the mask so that each sequence of
10072 N elements is replaced by a single element. */
10073 gcc_assert (multiple_p (TYPE_VECTOR_SUBPARTS (mask_type
),
10074 TYPE_VECTOR_SUBPARTS (vectype
)));
10075 gimple_seq seq
= NULL
;
10076 mask_type
= truth_type_for (vectype
);
10077 mask
= gimple_build (&seq
, VIEW_CONVERT_EXPR
, mask_type
, mask
);
10079 gsi_insert_seq_before (gsi
, seq
, GSI_SAME_STMT
);
10084 /* Record that LOOP_VINFO would need LENS to contain a sequence of NVECTORS
10085 lengths for controlling an operation on VECTYPE. The operation splits
10086 each element of VECTYPE into FACTOR separate subelements, measuring the
10087 length as a number of these subelements. */
10090 vect_record_loop_len (loop_vec_info loop_vinfo
, vec_loop_lens
*lens
,
10091 unsigned int nvectors
, tree vectype
, unsigned int factor
)
10093 gcc_assert (nvectors
!= 0);
10094 if (lens
->length () < nvectors
)
10095 lens
->safe_grow_cleared (nvectors
, true);
10096 rgroup_controls
*rgl
= &(*lens
)[nvectors
- 1];
10098 /* The number of scalars per iteration, scalar occupied bytes and
10099 the number of vectors are both compile-time constants. */
10100 unsigned int nscalars_per_iter
10101 = exact_div (nvectors
* TYPE_VECTOR_SUBPARTS (vectype
),
10102 LOOP_VINFO_VECT_FACTOR (loop_vinfo
)).to_constant ();
10104 if (rgl
->max_nscalars_per_iter
< nscalars_per_iter
)
10106 /* For now, we only support cases in which all loads and stores fall back
10107 to VnQI or none do. */
10108 gcc_assert (!rgl
->max_nscalars_per_iter
10109 || (rgl
->factor
== 1 && factor
== 1)
10110 || (rgl
->max_nscalars_per_iter
* rgl
->factor
10111 == nscalars_per_iter
* factor
));
10112 rgl
->max_nscalars_per_iter
= nscalars_per_iter
;
10113 rgl
->type
= vectype
;
10114 rgl
->factor
= factor
;
10118 /* Given a complete set of length LENS, extract length number INDEX for an
10119 rgroup that operates on NVECTORS vectors, where 0 <= INDEX < NVECTORS. */
10122 vect_get_loop_len (loop_vec_info loop_vinfo
, vec_loop_lens
*lens
,
10123 unsigned int nvectors
, unsigned int index
)
10125 rgroup_controls
*rgl
= &(*lens
)[nvectors
- 1];
10126 bool use_bias_adjusted_len
=
10127 LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo
) != 0;
10129 /* Populate the rgroup's len array, if this is the first time we've
10131 if (rgl
->controls
.is_empty ())
10133 rgl
->controls
.safe_grow_cleared (nvectors
, true);
10134 for (unsigned int i
= 0; i
< nvectors
; ++i
)
10136 tree len_type
= LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo
);
10137 gcc_assert (len_type
!= NULL_TREE
);
10139 tree len
= make_temp_ssa_name (len_type
, NULL
, "loop_len");
10141 /* Provide a dummy definition until the real one is available. */
10142 SSA_NAME_DEF_STMT (len
) = gimple_build_nop ();
10143 rgl
->controls
[i
] = len
;
10145 if (use_bias_adjusted_len
)
10147 gcc_assert (i
== 0);
10148 tree adjusted_len
=
10149 make_temp_ssa_name (len_type
, NULL
, "adjusted_loop_len");
10150 SSA_NAME_DEF_STMT (adjusted_len
) = gimple_build_nop ();
10151 rgl
->bias_adjusted_ctrl
= adjusted_len
;
10156 if (use_bias_adjusted_len
)
10157 return rgl
->bias_adjusted_ctrl
;
10159 return rgl
->controls
[index
];
10162 /* Scale profiling counters by estimation for LOOP which is vectorized
10166 scale_profile_for_vect_loop (class loop
*loop
, unsigned vf
)
10168 edge preheader
= loop_preheader_edge (loop
);
10169 /* Reduce loop iterations by the vectorization factor. */
10170 gcov_type new_est_niter
= niter_for_unrolled_loop (loop
, vf
);
10171 profile_count freq_h
= loop
->header
->count
, freq_e
= preheader
->count ();
10173 if (freq_h
.nonzero_p ())
10175 profile_probability p
;
10177 /* Avoid dropping loop body profile counter to 0 because of zero count
10178 in loop's preheader. */
10179 if (!(freq_e
== profile_count::zero ()))
10180 freq_e
= freq_e
.force_nonzero ();
10181 p
= (freq_e
* (new_est_niter
+ 1)).probability_in (freq_h
);
10182 scale_loop_frequencies (loop
, p
);
10185 edge exit_e
= single_exit (loop
);
10186 exit_e
->probability
= profile_probability::always () / (new_est_niter
+ 1);
10188 edge exit_l
= single_pred_edge (loop
->latch
);
10189 profile_probability prob
= exit_l
->probability
;
10190 exit_l
->probability
= exit_e
->probability
.invert ();
10191 if (prob
.initialized_p () && exit_l
->probability
.initialized_p ())
10192 scale_bbs_frequencies (&loop
->latch
, 1, exit_l
->probability
/ prob
);
10195 /* For a vectorized stmt DEF_STMT_INFO adjust all vectorized PHI
10196 latch edge values originally defined by it. */
10199 maybe_set_vectorized_backedge_value (loop_vec_info loop_vinfo
,
10200 stmt_vec_info def_stmt_info
)
10202 tree def
= gimple_get_lhs (vect_orig_stmt (def_stmt_info
)->stmt
);
10203 if (!def
|| TREE_CODE (def
) != SSA_NAME
)
10205 stmt_vec_info phi_info
;
10206 imm_use_iterator iter
;
10207 use_operand_p use_p
;
10208 FOR_EACH_IMM_USE_FAST (use_p
, iter
, def
)
10209 if (gphi
*phi
= dyn_cast
<gphi
*> (USE_STMT (use_p
)))
10210 if (gimple_bb (phi
)->loop_father
->header
== gimple_bb (phi
)
10211 && (phi_info
= loop_vinfo
->lookup_stmt (phi
))
10212 && STMT_VINFO_RELEVANT_P (phi_info
)
10213 && VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (phi_info
))
10214 && STMT_VINFO_REDUC_TYPE (phi_info
) != FOLD_LEFT_REDUCTION
10215 && STMT_VINFO_REDUC_TYPE (phi_info
) != EXTRACT_LAST_REDUCTION
)
10217 loop_p loop
= gimple_bb (phi
)->loop_father
;
10218 edge e
= loop_latch_edge (loop
);
10219 if (PHI_ARG_DEF_FROM_EDGE (phi
, e
) == def
)
10221 vec
<gimple
*> &phi_defs
= STMT_VINFO_VEC_STMTS (phi_info
);
10222 vec
<gimple
*> &latch_defs
= STMT_VINFO_VEC_STMTS (def_stmt_info
);
10223 gcc_assert (phi_defs
.length () == latch_defs
.length ());
10224 for (unsigned i
= 0; i
< phi_defs
.length (); ++i
)
10225 add_phi_arg (as_a
<gphi
*> (phi_defs
[i
]),
10226 gimple_get_lhs (latch_defs
[i
]), e
,
10227 gimple_phi_arg_location (phi
, e
->dest_idx
));
10232 /* Vectorize STMT_INFO if relevant, inserting any new instructions before GSI.
10233 When vectorizing STMT_INFO as a store, set *SEEN_STORE to its
10237 vect_transform_loop_stmt (loop_vec_info loop_vinfo
, stmt_vec_info stmt_info
,
10238 gimple_stmt_iterator
*gsi
, stmt_vec_info
*seen_store
)
10240 class loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
10241 poly_uint64 vf
= LOOP_VINFO_VECT_FACTOR (loop_vinfo
);
10243 if (dump_enabled_p ())
10244 dump_printf_loc (MSG_NOTE
, vect_location
,
10245 "------>vectorizing statement: %G", stmt_info
->stmt
);
10247 if (MAY_HAVE_DEBUG_BIND_STMTS
&& !STMT_VINFO_LIVE_P (stmt_info
))
10248 vect_loop_kill_debug_uses (loop
, stmt_info
);
10250 if (!STMT_VINFO_RELEVANT_P (stmt_info
)
10251 && !STMT_VINFO_LIVE_P (stmt_info
))
10254 if (STMT_VINFO_VECTYPE (stmt_info
))
10257 = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info
));
10258 if (!STMT_SLP_TYPE (stmt_info
)
10259 && maybe_ne (nunits
, vf
)
10260 && dump_enabled_p ())
10261 /* For SLP VF is set according to unrolling factor, and not
10262 to vector size, hence for SLP this print is not valid. */
10263 dump_printf_loc (MSG_NOTE
, vect_location
, "multiple-types.\n");
10266 /* Pure SLP statements have already been vectorized. We still need
10267 to apply loop vectorization to hybrid SLP statements. */
10268 if (PURE_SLP_STMT (stmt_info
))
10271 if (dump_enabled_p ())
10272 dump_printf_loc (MSG_NOTE
, vect_location
, "transform statement.\n");
10274 if (vect_transform_stmt (loop_vinfo
, stmt_info
, gsi
, NULL
, NULL
))
10275 *seen_store
= stmt_info
;
10280 /* Helper function to pass to simplify_replace_tree to enable replacing tree's
10281 in the hash_map with its corresponding values. */
10284 find_in_mapping (tree t
, void *context
)
10286 hash_map
<tree
,tree
>* mapping
= (hash_map
<tree
, tree
>*) context
;
10288 tree
*value
= mapping
->get (t
);
10289 return value
? *value
: t
;
10292 /* Update EPILOGUE's loop_vec_info. EPILOGUE was constructed as a copy of the
10293 original loop that has now been vectorized.
10295 The inits of the data_references need to be advanced with the number of
10296 iterations of the main loop. This has been computed in vect_do_peeling and
10297 is stored in parameter ADVANCE. We first restore the data_references
10298 initial offset with the values recored in ORIG_DRS_INIT.
10300 Since the loop_vec_info of this EPILOGUE was constructed for the original
10301 loop, its stmt_vec_infos all point to the original statements. These need
10302 to be updated to point to their corresponding copies as well as the SSA_NAMES
10303 in their PATTERN_DEF_SEQs and RELATED_STMTs.
10305 The data_reference's connections also need to be updated. Their
10306 corresponding dr_vec_info need to be reconnected to the EPILOGUE's
10307 stmt_vec_infos, their statements need to point to their corresponding copy,
10308 if they are gather loads or scatter stores then their reference needs to be
10309 updated to point to its corresponding copy and finally we set
10310 'base_misaligned' to false as we have already peeled for alignment in the
10311 prologue of the main loop. */
10314 update_epilogue_loop_vinfo (class loop
*epilogue
, tree advance
)
10316 loop_vec_info epilogue_vinfo
= loop_vec_info_for_loop (epilogue
);
10317 auto_vec
<gimple
*> stmt_worklist
;
10318 hash_map
<tree
,tree
> mapping
;
10319 gimple
*orig_stmt
, *new_stmt
;
10320 gimple_stmt_iterator epilogue_gsi
;
10321 gphi_iterator epilogue_phi_gsi
;
10322 stmt_vec_info stmt_vinfo
= NULL
, related_vinfo
;
10323 basic_block
*epilogue_bbs
= get_loop_body (epilogue
);
10326 free (LOOP_VINFO_BBS (epilogue_vinfo
));
10327 LOOP_VINFO_BBS (epilogue_vinfo
) = epilogue_bbs
;
10329 /* Advance data_reference's with the number of iterations of the previous
10330 loop and its prologue. */
10331 vect_update_inits_of_drs (epilogue_vinfo
, advance
, PLUS_EXPR
);
10334 /* The EPILOGUE loop is a copy of the original loop so they share the same
10335 gimple UIDs. In this loop we update the loop_vec_info of the EPILOGUE to
10336 point to the copied statements. We also create a mapping of all LHS' in
10337 the original loop and all the LHS' in the EPILOGUE and create worklists to
10338 update teh STMT_VINFO_PATTERN_DEF_SEQs and STMT_VINFO_RELATED_STMTs. */
10339 for (unsigned i
= 0; i
< epilogue
->num_nodes
; ++i
)
10341 for (epilogue_phi_gsi
= gsi_start_phis (epilogue_bbs
[i
]);
10342 !gsi_end_p (epilogue_phi_gsi
); gsi_next (&epilogue_phi_gsi
))
10344 new_stmt
= epilogue_phi_gsi
.phi ();
10346 gcc_assert (gimple_uid (new_stmt
) > 0);
10348 = epilogue_vinfo
->stmt_vec_infos
[gimple_uid (new_stmt
) - 1];
10350 orig_stmt
= STMT_VINFO_STMT (stmt_vinfo
);
10351 STMT_VINFO_STMT (stmt_vinfo
) = new_stmt
;
10353 mapping
.put (gimple_phi_result (orig_stmt
),
10354 gimple_phi_result (new_stmt
));
10355 /* PHI nodes can not have patterns or related statements. */
10356 gcc_assert (STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo
) == NULL
10357 && STMT_VINFO_RELATED_STMT (stmt_vinfo
) == NULL
);
10360 for (epilogue_gsi
= gsi_start_bb (epilogue_bbs
[i
]);
10361 !gsi_end_p (epilogue_gsi
); gsi_next (&epilogue_gsi
))
10363 new_stmt
= gsi_stmt (epilogue_gsi
);
10364 if (is_gimple_debug (new_stmt
))
10367 gcc_assert (gimple_uid (new_stmt
) > 0);
10369 = epilogue_vinfo
->stmt_vec_infos
[gimple_uid (new_stmt
) - 1];
10371 orig_stmt
= STMT_VINFO_STMT (stmt_vinfo
);
10372 STMT_VINFO_STMT (stmt_vinfo
) = new_stmt
;
10374 if (tree old_lhs
= gimple_get_lhs (orig_stmt
))
10375 mapping
.put (old_lhs
, gimple_get_lhs (new_stmt
));
10377 if (STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo
))
10379 gimple_seq seq
= STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo
);
10380 for (gimple_stmt_iterator gsi
= gsi_start (seq
);
10381 !gsi_end_p (gsi
); gsi_next (&gsi
))
10382 stmt_worklist
.safe_push (gsi_stmt (gsi
));
10385 related_vinfo
= STMT_VINFO_RELATED_STMT (stmt_vinfo
);
10386 if (related_vinfo
!= NULL
&& related_vinfo
!= stmt_vinfo
)
10388 gimple
*stmt
= STMT_VINFO_STMT (related_vinfo
);
10389 stmt_worklist
.safe_push (stmt
);
10390 /* Set BB such that the assert in
10391 'get_initial_def_for_reduction' is able to determine that
10392 the BB of the related stmt is inside this loop. */
10393 gimple_set_bb (stmt
,
10394 gimple_bb (new_stmt
));
10395 related_vinfo
= STMT_VINFO_RELATED_STMT (related_vinfo
);
10396 gcc_assert (related_vinfo
== NULL
10397 || related_vinfo
== stmt_vinfo
);
10402 /* The PATTERN_DEF_SEQs and RELATED_STMTs in the epilogue were constructed
10403 using the original main loop and thus need to be updated to refer to the
10404 cloned variables used in the epilogue. */
10405 for (unsigned i
= 0; i
< stmt_worklist
.length (); ++i
)
10407 gimple
*stmt
= stmt_worklist
[i
];
10410 for (unsigned j
= 1; j
< gimple_num_ops (stmt
); ++j
)
10412 tree op
= gimple_op (stmt
, j
);
10413 if ((new_op
= mapping
.get(op
)))
10414 gimple_set_op (stmt
, j
, *new_op
);
10417 /* PR92429: The last argument of simplify_replace_tree disables
10418 folding when replacing arguments. This is required as
10419 otherwise you might end up with different statements than the
10420 ones analyzed in vect_loop_analyze, leading to different
10422 op
= simplify_replace_tree (op
, NULL_TREE
, NULL_TREE
,
10423 &find_in_mapping
, &mapping
, false);
10424 gimple_set_op (stmt
, j
, op
);
10429 struct data_reference
*dr
;
10430 vec
<data_reference_p
> datarefs
= LOOP_VINFO_DATAREFS (epilogue_vinfo
);
10431 FOR_EACH_VEC_ELT (datarefs
, i
, dr
)
10433 orig_stmt
= DR_STMT (dr
);
10434 gcc_assert (gimple_uid (orig_stmt
) > 0);
10435 stmt_vinfo
= epilogue_vinfo
->stmt_vec_infos
[gimple_uid (orig_stmt
) - 1];
10436 /* Data references for gather loads and scatter stores do not use the
10437 updated offset we set using ADVANCE. Instead we have to make sure the
10438 reference in the data references point to the corresponding copy of
10439 the original in the epilogue. */
10440 if (STMT_VINFO_MEMORY_ACCESS_TYPE (vect_stmt_to_vectorize (stmt_vinfo
))
10441 == VMAT_GATHER_SCATTER
)
10444 = simplify_replace_tree (DR_REF (dr
), NULL_TREE
, NULL_TREE
,
10445 &find_in_mapping
, &mapping
);
10446 DR_BASE_ADDRESS (dr
)
10447 = simplify_replace_tree (DR_BASE_ADDRESS (dr
), NULL_TREE
, NULL_TREE
,
10448 &find_in_mapping
, &mapping
);
10450 DR_STMT (dr
) = STMT_VINFO_STMT (stmt_vinfo
);
10451 stmt_vinfo
->dr_aux
.stmt
= stmt_vinfo
;
10452 /* The vector size of the epilogue is smaller than that of the main loop
10453 so the alignment is either the same or lower. This means the dr will
10454 thus by definition be aligned. */
10455 STMT_VINFO_DR_INFO (stmt_vinfo
)->base_misaligned
= false;
10458 epilogue_vinfo
->shared
->datarefs_copy
.release ();
10459 epilogue_vinfo
->shared
->save_datarefs ();
10462 /* Function vect_transform_loop.
10464 The analysis phase has determined that the loop is vectorizable.
10465 Vectorize the loop - created vectorized stmts to replace the scalar
10466 stmts in the loop, and update the loop exit condition.
10467 Returns scalar epilogue loop if any. */
10470 vect_transform_loop (loop_vec_info loop_vinfo
, gimple
*loop_vectorized_call
)
10472 class loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
10473 class loop
*epilogue
= NULL
;
10474 basic_block
*bbs
= LOOP_VINFO_BBS (loop_vinfo
);
10475 int nbbs
= loop
->num_nodes
;
10477 tree niters_vector
= NULL_TREE
;
10478 tree step_vector
= NULL_TREE
;
10479 tree niters_vector_mult_vf
= NULL_TREE
;
10480 poly_uint64 vf
= LOOP_VINFO_VECT_FACTOR (loop_vinfo
);
10481 unsigned int lowest_vf
= constant_lower_bound (vf
);
10483 bool check_profitability
= false;
10486 DUMP_VECT_SCOPE ("vec_transform_loop");
10488 loop_vinfo
->shared
->check_datarefs ();
10490 /* Use the more conservative vectorization threshold. If the number
10491 of iterations is constant assume the cost check has been performed
10492 by our caller. If the threshold makes all loops profitable that
10493 run at least the (estimated) vectorization factor number of times
10494 checking is pointless, too. */
10495 th
= LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo
);
10496 if (vect_apply_runtime_profitability_check_p (loop_vinfo
))
10498 if (dump_enabled_p ())
10499 dump_printf_loc (MSG_NOTE
, vect_location
,
10500 "Profitability threshold is %d loop iterations.\n",
10502 check_profitability
= true;
10505 /* Make sure there exists a single-predecessor exit bb. Do this before
10507 edge e
= single_exit (loop
);
10508 if (! single_pred_p (e
->dest
))
10510 split_loop_exit_edge (e
, true);
10511 if (dump_enabled_p ())
10512 dump_printf (MSG_NOTE
, "split exit edge\n");
10515 /* Version the loop first, if required, so the profitability check
10518 if (LOOP_REQUIRES_VERSIONING (loop_vinfo
))
10521 = vect_loop_versioning (loop_vinfo
, loop_vectorized_call
);
10522 sloop
->force_vectorize
= false;
10523 check_profitability
= false;
10526 /* Make sure there exists a single-predecessor exit bb also on the
10527 scalar loop copy. Do this after versioning but before peeling
10528 so CFG structure is fine for both scalar and if-converted loop
10529 to make slpeel_duplicate_current_defs_from_edges face matched
10530 loop closed PHI nodes on the exit. */
10531 if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo
))
10533 e
= single_exit (LOOP_VINFO_SCALAR_LOOP (loop_vinfo
));
10534 if (! single_pred_p (e
->dest
))
10536 split_loop_exit_edge (e
, true);
10537 if (dump_enabled_p ())
10538 dump_printf (MSG_NOTE
, "split exit edge of scalar loop\n");
10542 tree niters
= vect_build_loop_niters (loop_vinfo
);
10543 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo
) = niters
;
10544 tree nitersm1
= unshare_expr (LOOP_VINFO_NITERSM1 (loop_vinfo
));
10545 bool niters_no_overflow
= loop_niters_no_overflow (loop_vinfo
);
10547 drs_init_vec orig_drs_init
;
10549 epilogue
= vect_do_peeling (loop_vinfo
, niters
, nitersm1
, &niters_vector
,
10550 &step_vector
, &niters_vector_mult_vf
, th
,
10551 check_profitability
, niters_no_overflow
,
10554 if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo
)
10555 && LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo
).initialized_p ())
10556 scale_loop_frequencies (LOOP_VINFO_SCALAR_LOOP (loop_vinfo
),
10557 LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo
));
10559 if (niters_vector
== NULL_TREE
)
10561 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo
)
10562 && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo
)
10563 && known_eq (lowest_vf
, vf
))
10566 = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo
)),
10567 LOOP_VINFO_INT_NITERS (loop_vinfo
) / lowest_vf
);
10568 step_vector
= build_one_cst (TREE_TYPE (niters
));
10570 else if (vect_use_loop_mask_for_alignment_p (loop_vinfo
))
10571 vect_gen_vector_loop_niters (loop_vinfo
, niters
, &niters_vector
,
10572 &step_vector
, niters_no_overflow
);
10574 /* vect_do_peeling subtracted the number of peeled prologue
10575 iterations from LOOP_VINFO_NITERS. */
10576 vect_gen_vector_loop_niters (loop_vinfo
, LOOP_VINFO_NITERS (loop_vinfo
),
10577 &niters_vector
, &step_vector
,
10578 niters_no_overflow
);
10581 /* 1) Make sure the loop header has exactly two entries
10582 2) Make sure we have a preheader basic block. */
10584 gcc_assert (EDGE_COUNT (loop
->header
->preds
) == 2);
10586 split_edge (loop_preheader_edge (loop
));
10588 if (vect_use_loop_mask_for_alignment_p (loop_vinfo
))
10589 /* This will deal with any possible peeling. */
10590 vect_prepare_for_masked_peels (loop_vinfo
);
10592 /* Schedule the SLP instances first, then handle loop vectorization
10594 if (!loop_vinfo
->slp_instances
.is_empty ())
10596 DUMP_VECT_SCOPE ("scheduling SLP instances");
10597 vect_schedule_slp (loop_vinfo
, LOOP_VINFO_SLP_INSTANCES (loop_vinfo
));
10600 /* FORNOW: the vectorizer supports only loops which body consist
10601 of one basic block (header + empty latch). When the vectorizer will
10602 support more involved loop forms, the order by which the BBs are
10603 traversed need to be reconsidered. */
10605 for (i
= 0; i
< nbbs
; i
++)
10607 basic_block bb
= bbs
[i
];
10608 stmt_vec_info stmt_info
;
10610 for (gphi_iterator si
= gsi_start_phis (bb
); !gsi_end_p (si
);
10613 gphi
*phi
= si
.phi ();
10614 if (dump_enabled_p ())
10615 dump_printf_loc (MSG_NOTE
, vect_location
,
10616 "------>vectorizing phi: %G", (gimple
*) phi
);
10617 stmt_info
= loop_vinfo
->lookup_stmt (phi
);
10621 if (MAY_HAVE_DEBUG_BIND_STMTS
&& !STMT_VINFO_LIVE_P (stmt_info
))
10622 vect_loop_kill_debug_uses (loop
, stmt_info
);
10624 if (!STMT_VINFO_RELEVANT_P (stmt_info
)
10625 && !STMT_VINFO_LIVE_P (stmt_info
))
10628 if (STMT_VINFO_VECTYPE (stmt_info
)
10630 (TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info
)), vf
))
10631 && dump_enabled_p ())
10632 dump_printf_loc (MSG_NOTE
, vect_location
, "multiple-types.\n");
10634 if ((STMT_VINFO_DEF_TYPE (stmt_info
) == vect_induction_def
10635 || STMT_VINFO_DEF_TYPE (stmt_info
) == vect_reduction_def
10636 || STMT_VINFO_DEF_TYPE (stmt_info
) == vect_double_reduction_def
10637 || STMT_VINFO_DEF_TYPE (stmt_info
) == vect_nested_cycle
10638 || STMT_VINFO_DEF_TYPE (stmt_info
) == vect_internal_def
)
10639 && ! PURE_SLP_STMT (stmt_info
))
10641 if (dump_enabled_p ())
10642 dump_printf_loc (MSG_NOTE
, vect_location
, "transform phi.\n");
10643 vect_transform_stmt (loop_vinfo
, stmt_info
, NULL
, NULL
, NULL
);
10647 for (gphi_iterator si
= gsi_start_phis (bb
); !gsi_end_p (si
);
10650 gphi
*phi
= si
.phi ();
10651 stmt_info
= loop_vinfo
->lookup_stmt (phi
);
10655 if (!STMT_VINFO_RELEVANT_P (stmt_info
)
10656 && !STMT_VINFO_LIVE_P (stmt_info
))
10659 if ((STMT_VINFO_DEF_TYPE (stmt_info
) == vect_induction_def
10660 || STMT_VINFO_DEF_TYPE (stmt_info
) == vect_reduction_def
10661 || STMT_VINFO_DEF_TYPE (stmt_info
) == vect_double_reduction_def
10662 || STMT_VINFO_DEF_TYPE (stmt_info
) == vect_nested_cycle
10663 || STMT_VINFO_DEF_TYPE (stmt_info
) == vect_internal_def
)
10664 && ! PURE_SLP_STMT (stmt_info
))
10665 maybe_set_vectorized_backedge_value (loop_vinfo
, stmt_info
);
10668 for (gimple_stmt_iterator si
= gsi_start_bb (bb
);
10671 stmt
= gsi_stmt (si
);
10672 /* During vectorization remove existing clobber stmts. */
10673 if (gimple_clobber_p (stmt
))
10675 unlink_stmt_vdef (stmt
);
10676 gsi_remove (&si
, true);
10677 release_defs (stmt
);
10681 /* Ignore vector stmts created in the outer loop. */
10682 stmt_info
= loop_vinfo
->lookup_stmt (stmt
);
10684 /* vector stmts created in the outer-loop during vectorization of
10685 stmts in an inner-loop may not have a stmt_info, and do not
10686 need to be vectorized. */
10687 stmt_vec_info seen_store
= NULL
;
10690 if (STMT_VINFO_IN_PATTERN_P (stmt_info
))
10692 gimple
*def_seq
= STMT_VINFO_PATTERN_DEF_SEQ (stmt_info
);
10693 for (gimple_stmt_iterator subsi
= gsi_start (def_seq
);
10694 !gsi_end_p (subsi
); gsi_next (&subsi
))
10696 stmt_vec_info pat_stmt_info
10697 = loop_vinfo
->lookup_stmt (gsi_stmt (subsi
));
10698 vect_transform_loop_stmt (loop_vinfo
, pat_stmt_info
,
10701 stmt_vec_info pat_stmt_info
10702 = STMT_VINFO_RELATED_STMT (stmt_info
);
10703 if (vect_transform_loop_stmt (loop_vinfo
, pat_stmt_info
,
10705 maybe_set_vectorized_backedge_value (loop_vinfo
,
10710 if (vect_transform_loop_stmt (loop_vinfo
, stmt_info
, &si
,
10712 maybe_set_vectorized_backedge_value (loop_vinfo
,
10719 if (STMT_VINFO_GROUPED_ACCESS (seen_store
))
10720 /* Interleaving. If IS_STORE is TRUE, the
10721 vectorization of the interleaving chain was
10722 completed - free all the stores in the chain. */
10723 vect_remove_stores (loop_vinfo
,
10724 DR_GROUP_FIRST_ELEMENT (seen_store
));
10726 /* Free the attached stmt_vec_info and remove the stmt. */
10727 loop_vinfo
->remove_stmt (stmt_info
);
      /* Stub out scalar statements that must not survive vectorization.
         Doing this here helps with grouped statements, or statements that
         are involved in patterns.  */
      for (gimple_stmt_iterator gsi = gsi_start_bb (bb);
           !gsi_end_p (gsi); gsi_next (&gsi))
        {
          gcall *call = dyn_cast <gcall *> (gsi_stmt (gsi));
          if (!call || !gimple_call_internal_p (call))
            continue;
          internal_fn ifn = gimple_call_internal_fn (call);
          if (ifn == IFN_MASK_LOAD)
            {
              tree lhs = gimple_get_lhs (call);
              if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
                {
                  tree zero = build_zero_cst (TREE_TYPE (lhs));
                  gimple *new_stmt = gimple_build_assign (lhs, zero);
                  gsi_replace (&gsi, new_stmt, true);
                }
            }
          else if (conditional_internal_fn_code (ifn) != ERROR_MARK)
            {
              tree lhs = gimple_get_lhs (call);
              if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
                {
                  tree else_arg
                    = gimple_call_arg (call, gimple_call_num_args (call) - 1);
                  gimple *new_stmt = gimple_build_assign (lhs, else_arg);
                  gsi_replace (&gsi, new_stmt, true);
                }
            }
        }
    }  /* BBs in loop */
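
  /* Schematically, for the stub-out loop above (SSA names invented):
     a scalar
       _5 = .MASK_LOAD (ptr_3, 32B, mask_4);
     whose lhs is not a vector type is replaced by  _5 = 0;  and a scalar
       _6 = .COND_ADD (mask_4, _1, _2, else_7);
     is replaced by  _6 = else_7;  so that no scalar masked operation
     survives the transformation.  */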
  /* The vectorization factor is always > 1, so if we use an IV increment
     of 1, a zero NITERS becomes a nonzero NITERS_VECTOR.  */
  if (integer_onep (step_vector))
    niters_no_overflow = true;
  vect_set_loop_condition (loop, loop_vinfo, niters_vector, step_vector,
                           niters_vector_mult_vf, !niters_no_overflow);

  unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
  scale_profile_for_vect_loop (loop, assumed_vf);
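
  /* For example (factor chosen arbitrarily): with assumed_vf == 4 the
     vector loop body is expected to execute roughly a quarter as often as
     the scalar body, and scale_profile_for_vect_loop adjusts the profile
     counts inside the loop accordingly.  */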
  /* True if the final iteration might not handle a full vector's
     worth of scalar iterations.  */
  bool final_iter_may_be_partial
    = LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo);
  /* The minimum number of iterations performed by the epilogue.  This
     is 1 when peeling for gaps because we always need a final scalar
     iteration.  */
  int min_epilogue_iters = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0;
  /* +1 to convert latch counts to loop iteration counts,
     -min_epilogue_iters to remove iterations that cannot be performed
     by the vector code.  */
  int bias_for_lowest = 1 - min_epilogue_iters;
  int bias_for_assumed = bias_for_lowest;
  int alignment_npeels = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
  if (alignment_npeels && LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
    {
      /* When the amount of peeling is known at compile time, the first
         iteration will have exactly alignment_npeels active elements.
         In the worst case it will have at least one.  */
      int min_first_active = (alignment_npeels > 0 ? alignment_npeels : 1);
      bias_for_lowest += lowest_vf - min_first_active;
      bias_for_assumed += assumed_vf - min_first_active;
    }
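
  /* Worked example (numbers arbitrary): without peeling for gaps,
     min_epilogue_iters == 0 and bias_for_lowest starts at 1.  If partial
     vectors are used with alignment_npeels == 3 and lowest_vf == 8, then
     min_first_active == 3 and bias_for_lowest becomes 1 + 8 - 3 == 6.  */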
  /* In these calculations the "- 1" converts loop iteration counts
     back to latch counts.  */
  if (loop->any_upper_bound)
    {
      loop_vec_info main_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
      loop->nb_iterations_upper_bound
        = (final_iter_may_be_partial
           ? wi::udiv_ceil (loop->nb_iterations_upper_bound + bias_for_lowest,
                            lowest_vf) - 1
           : wi::udiv_floor (loop->nb_iterations_upper_bound + bias_for_lowest,
                             lowest_vf) - 1);
      if (main_vinfo
          /* Both peeling for alignment and peeling for gaps can end up
             with the scalar epilogue running for more than VF-1
             iterations.  */
          && !main_vinfo->peeling_for_alignment
          && !main_vinfo->peeling_for_gaps)
        {
          unsigned int bound;
          poly_uint64 main_iters
            = upper_bound (LOOP_VINFO_VECT_FACTOR (main_vinfo),
                           LOOP_VINFO_COST_MODEL_THRESHOLD (main_vinfo));
          main_iters
            = upper_bound (main_iters,
                           LOOP_VINFO_VERSIONING_THRESHOLD (main_vinfo));
          if (can_div_away_from_zero_p (main_iters,
                                        LOOP_VINFO_VECT_FACTOR (loop_vinfo),
                                        &bound))
            loop->nb_iterations_upper_bound
              = wi::umin ((widest_int) (bound - 1),
                          loop->nb_iterations_upper_bound);
        }
    }
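
  /* Worked example (numbers arbitrary): a scalar latch bound of 11 (at most
     12 iterations) with lowest_vf == 4, bias_for_lowest == 1 and no partial
     vectors gives udiv_floor (11 + 1, 4) - 1 == 2, i.e. at most three
     iterations of the vector loop.  */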
  if (loop->any_likely_upper_bound)
    loop->nb_iterations_likely_upper_bound
      = (final_iter_may_be_partial
         ? wi::udiv_ceil (loop->nb_iterations_likely_upper_bound
                          + bias_for_lowest, lowest_vf) - 1
         : wi::udiv_floor (loop->nb_iterations_likely_upper_bound
                           + bias_for_lowest, lowest_vf) - 1);
  if (loop->any_estimate)
    loop->nb_iterations_estimate
      = (final_iter_may_be_partial
         ? wi::udiv_ceil (loop->nb_iterations_estimate + bias_for_assumed,
                          assumed_vf) - 1
         : wi::udiv_floor (loop->nb_iterations_estimate + bias_for_assumed,
                           assumed_vf) - 1);
  if (dump_enabled_p ())
    {
      if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
        {
          dump_printf_loc (MSG_NOTE, vect_location,
                           "LOOP VECTORIZED\n");
          if (loop->inner)
            dump_printf_loc (MSG_NOTE, vect_location,
                             "OUTER LOOP VECTORIZED\n");
          dump_printf (MSG_NOTE, "\n");
        }
      else
        dump_printf_loc (MSG_NOTE, vect_location,
                         "LOOP EPILOGUE VECTORIZED (MODE=%s)\n",
                         GET_MODE_NAME (loop_vinfo->vector_mode));
    }
  /* Loops vectorized with a variable factor won't benefit from
     unrolling/peeling.  */
  if (!vf.is_constant ())
    {
      loop->unroll = 1;
      if (dump_enabled_p ())
        dump_printf_loc (MSG_NOTE, vect_location, "Disabling unrolling due to"
                         " variable-length vectorization factor\n");
    }
  /* Free SLP instances here because otherwise stmt reference counting
     won't work.  */
  slp_instance instance;
  FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
    vect_free_slp_instance (instance);
  LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
  /* Clear the safelen field since its value is invalid after vectorization:
     the vectorized loop can have loop-carried dependencies.  */
  loop->safelen = 0;
  if (epilogue)
    {
      update_epilogue_loop_vinfo (epilogue, advance);

      epilogue->simduid = loop->simduid;
      epilogue->force_vectorize = loop->force_vectorize;
      epilogue->dont_vectorize = false;
    }

  return epilogue;
}
/* The code below performs a simple optimization: it reverts if-conversion
   for masked stores, i.e. if the mask of a store is zero, the store is not
   performed and, where possible, neither are the statements that produce
   the stored values.  For example,

     for (i=0; i<n; i++)
       if (c[i])
	 {
	   p1[i] += 1;
	   p2[i] = p3[i] + 2;
	 }

   this transformation will produce the following semi-hammock:

   if (!mask__ifc__42.18_165 == { 0, 0, 0, 0, 0, 0, 0, 0 })
     {
       vect__11.19_170 = MASK_LOAD (vectp_p1.20_168, 0B, mask__ifc__42.18_165);
       vect__12.22_172 = vect__11.19_170 + vect_cst__171;
       MASK_STORE (vectp_p1.23_175, 0B, mask__ifc__42.18_165, vect__12.22_172);
       vect__18.25_182 = MASK_LOAD (vectp_p3.26_180, 0B, mask__ifc__42.18_165);
       vect__19.28_184 = vect__18.25_182 + vect_cst__183;
       MASK_STORE (vectp_p2.29_187, 0B, mask__ifc__42.18_165, vect__19.28_184);
     }
*/
void
optimize_mask_stores (class loop *loop)
{
  basic_block *bbs = get_loop_body (loop);
  unsigned nbbs = loop->num_nodes;
  unsigned i;
  basic_block bb;
  class loop *bb_loop;
  gimple_stmt_iterator gsi;
  gimple *stmt;
  auto_vec<gimple *> worklist;
  auto_purge_vect_location sentinel;

  vect_location = find_loop_location (loop);
  /* Pick up all masked stores in loop if any.  */
  for (i = 0; i < nbbs; i++)
    {
      bb = bbs[i];
      for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
           gsi_next (&gsi))
        {
          stmt = gsi_stmt (gsi);
          if (gimple_call_internal_p (stmt, IFN_MASK_STORE))
            worklist.safe_push (stmt);
        }
    }

  free (bbs);
  if (worklist.is_empty ())
    return;

  /* Loop has masked stores.  */
  while (!worklist.is_empty ())
    {
      gimple *last, *last_store;
      edge e, efalse;
      tree mask;
      basic_block store_bb, join_bb;
      gimple_stmt_iterator gsi_to;
      tree vdef, new_vdef;
      gphi *phi;
      tree vectype;
      tree zero;

      last = worklist.pop ();
      mask = gimple_call_arg (last, 2);
      bb = gimple_bb (last);
      /* Create then_bb and the if-then structure in the CFG; then_bb belongs
         to the same loop as if_bb.  That loop can be different from LOOP when
         a two-level loop nest is vectorized and the mask store belongs to the
         inner one.  */
      e = split_block (bb, last);
      bb_loop = bb->loop_father;
      gcc_assert (loop == bb_loop || flow_loop_nested_p (loop, bb_loop));
      join_bb = e->dest;
      store_bb = create_empty_bb (bb);
      add_bb_to_loop (store_bb, bb_loop);
      e->flags = EDGE_TRUE_VALUE;
      efalse = make_edge (bb, store_bb, EDGE_FALSE_VALUE);
      /* Put STORE_BB to likely part.  */
      efalse->probability = profile_probability::unlikely ();
      store_bb->count = efalse->count ();
      make_single_succ_edge (store_bb, join_bb, EDGE_FALLTHRU);
      if (dom_info_available_p (CDI_DOMINATORS))
        set_immediate_dominator (CDI_DOMINATORS, store_bb, bb);
      if (dump_enabled_p ())
        dump_printf_loc (MSG_NOTE, vect_location,
                         "Create new block %d to sink mask stores.",
                         store_bb->index);
      /* Create vector comparison with boolean result.  */
      vectype = TREE_TYPE (mask);
      zero = build_zero_cst (vectype);
      stmt = gimple_build_cond (EQ_EXPR, mask, zero, NULL_TREE, NULL_TREE);
      gsi = gsi_last_bb (bb);
      gsi_insert_after (&gsi, stmt, GSI_SAME_STMT);
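
      /* The result is schematically (names invented):
           if (mask_18 == { 0, ... })
             goto JOIN_BB;    <-- EDGE_TRUE_VALUE, the stores are skipped
           else
             goto STORE_BB;   <-- EDGE_FALSE_VALUE
         with STORE_BB falling through to JOIN_BB.  */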
      /* Create new PHI node for vdef of the last masked store:
         .MEM_2 = VDEF <.MEM_1>
         will be converted to
         .MEM.3 = VDEF <.MEM_1>
         and new PHI node will be created in join bb
         .MEM_2 = PHI <.MEM_1, .MEM_3>
      */
      vdef = gimple_vdef (last);
      new_vdef = make_ssa_name (gimple_vop (cfun), last);
      gimple_set_vdef (last, new_vdef);
      phi = create_phi_node (vdef, join_bb);
      add_phi_arg (phi, new_vdef, EDGE_SUCC (store_bb, 0), UNKNOWN_LOCATION);
      /* Put all masked stores with the same mask to STORE_BB if possible.  */
      while (true)
        {
          gimple_stmt_iterator gsi_from;
          gimple *stmt1 = NULL;

          /* Move masked store to STORE_BB.  */
          last_store = last;
          gsi = gsi_for_stmt (last);
          gsi_from = gsi;
          /* Shift GSI to the previous stmt for further traversal.  */
          gsi_prev (&gsi);
          gsi_to = gsi_start_bb (store_bb);
          gsi_move_before (&gsi_from, &gsi_to);
          /* Setup GSI_TO to the non-empty block start.  */
          gsi_to = gsi_start_bb (store_bb);
          if (dump_enabled_p ())
            dump_printf_loc (MSG_NOTE, vect_location,
                             "Move stmt to created bb\n%G", last);
          /* Move all stored value producers if possible.  */
          while (!gsi_end_p (gsi))
            {
              tree lhs;
              imm_use_iterator imm_iter;
              use_operand_p use_p;
              bool res;

              /* Skip debug statements.  */
              if (is_gimple_debug (gsi_stmt (gsi)))
                {
                  gsi_prev (&gsi);
                  continue;
                }
              stmt1 = gsi_stmt (gsi);
              /* Do not consider statements writing to memory or having
                 volatile operand.  */
              if (gimple_vdef (stmt1)
                  || gimple_has_volatile_ops (stmt1))
                break;
              gsi_from = gsi;
              gsi_prev (&gsi);
              lhs = gimple_get_lhs (stmt1);
              if (!lhs)
                break;

              /* LHS of vectorized stmt must be SSA_NAME.  */
              if (TREE_CODE (lhs) != SSA_NAME)
                break;

              if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
                {
                  /* Remove dead scalar statement.  */
                  if (has_zero_uses (lhs))
                    {
                      gsi_remove (&gsi_from, true);
                      continue;
                    }
                  break;
                }
              /* Check that LHS does not have uses outside of STORE_BB.  */
              res = true;
              FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
                {
                  gimple *use_stmt;
                  use_stmt = USE_STMT (use_p);
                  if (is_gimple_debug (use_stmt))
                    continue;
                  if (gimple_bb (use_stmt) != store_bb)
                    {
                      res = false;
                      break;
                    }
                }
              if (!res)
                break;

              if (gimple_vuse (stmt1)
                  && gimple_vuse (stmt1) != gimple_vuse (last_store))
                break;

              /* Can move STMT1 to STORE_BB.  */
              if (dump_enabled_p ())
                dump_printf_loc (MSG_NOTE, vect_location,
                                 "Move stmt to created bb\n%G", stmt1);
              gsi_move_before (&gsi_from, &gsi_to);
              /* Shift GSI_TO for further insertion.  */
              gsi_prev (&gsi_to);
            }
          /* Put other masked stores with the same mask to STORE_BB.  */
          if (worklist.is_empty ()
              || gimple_call_arg (worklist.last (), 2) != mask
              || worklist.last () != stmt1)
            break;
          last = worklist.pop ();
        }
      add_phi_arg (phi, gimple_vuse (last_store), e, UNKNOWN_LOCATION);
    }
}
/* Decide whether it is possible to use a zero-based induction variable
   when vectorizing LOOP_VINFO with partial vectors.  If it is, return
   the value that the induction variable must be able to hold in order
   to ensure that the rgroups eventually have no active vector elements.
   Return -1 otherwise.  */

widest_int
vect_iv_limit_for_partial_vectors (loop_vec_info loop_vinfo)
{
  tree niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
  class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  unsigned HOST_WIDE_INT max_vf = vect_max_vf (loop_vinfo);
  /* Calculate the value that the induction variable must be able
     to hit in order to ensure that we end the loop with an all-false mask.
     This involves adding the maximum number of inactive trailing scalar
     iterations.  */
  widest_int iv_limit = -1;
  if (max_loop_iterations (loop, &iv_limit))
    {
      if (niters_skip)
        {
          /* Add the maximum number of skipped iterations to the
             maximum iteration count.  */
          if (TREE_CODE (niters_skip) == INTEGER_CST)
            iv_limit += wi::to_widest (niters_skip);
          else
            iv_limit += max_vf - 1;
        }
      else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo))
        /* Make a conservatively-correct assumption.  */
        iv_limit += max_vf - 1;

      /* IV_LIMIT is the maximum number of latch iterations, which is also
         the maximum in-range IV value.  Round this value down to the previous
         vector alignment boundary and then add an extra full iteration.  */
      poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
      iv_limit = (iv_limit & -(int) known_alignment (vf)) + max_vf;
    }
  return iv_limit;
}
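
/* A worked example for the function above (numbers arbitrary): with a
   constant VF of 8 (so known_alignment (vf) == 8), max_vf == 8, no skipped
   iterations and a maximum latch count of 21, the rounding step yields
   iv_limit = (21 & -8) + 8 = 16 + 8 = 24.  */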
/* For the given rgroup_controls RGC, check whether an induction variable
   would ever hit a value that produces a set of all-false masks or zero
   lengths before wrapping around.  Return true if it's possible to wrap
   around before hitting the desirable value, otherwise return false.  */

bool
vect_rgroup_iv_might_wrap_p (loop_vec_info loop_vinfo, rgroup_controls *rgc)
{
  widest_int iv_limit = vect_iv_limit_for_partial_vectors (loop_vinfo);

  if (iv_limit == -1)
    return true;

  tree compare_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
  unsigned int compare_precision = TYPE_PRECISION (compare_type);
  unsigned nitems = rgc->max_nscalars_per_iter * rgc->factor;

  if (wi::min_precision (iv_limit * nitems, UNSIGNED) > compare_precision)
    return true;

  return false;
}
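
/* A worked example for vect_rgroup_iv_might_wrap_p (numbers arbitrary):
   with iv_limit == 24 and an rgroup where max_nscalars_per_iter == 2 and
   factor == 1, nitems == 2 and iv_limit * nitems == 48, which needs only
   6 bits; a compare type with at least that precision cannot wrap, so the
   function returns false.  */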