2 Copyright (C) 2003-2024 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com> and
4 Ira Rosen <irar@il.ibm.com>
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
version.
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
for more details.
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
22 #define INCLUDE_ALGORITHM
25 #include "coretypes.h"
32 #include "tree-pass.h"
34 #include "optabs-tree.h"
37 #include "diagnostic-core.h"
38 #include "fold-const.h"
39 #include "stor-layout.h"
42 #include "gimple-iterator.h"
43 #include "gimplify-me.h"
44 #include "tree-ssa-loop-ivopts.h"
45 #include "tree-ssa-loop-manip.h"
46 #include "tree-ssa-loop-niter.h"
47 #include "tree-ssa-loop.h"
49 #include "tree-scalar-evolution.h"
50 #include "tree-vectorizer.h"
51 #include "gimple-fold.h"
54 #include "tree-if-conv.h"
55 #include "internal-fn.h"
56 #include "tree-vector-builder.h"
57 #include "vec-perm-indices.h"
59 #include "case-cfn-macros.h"
60 #include "langhooks.h"
62 /* Loop Vectorization Pass.
64 This pass tries to vectorize loops.
66 For example, the vectorizer transforms the following simple loop:
68 short a[N]; short b[N]; short c[N]; int i;
74 as if it was manually vectorized by rewriting the source code into:
76 typedef int __attribute__((mode(V8HI))) v8hi;
77 short a[N]; short b[N]; short c[N]; int i;
78 v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
81 for (i=0; i<N/8; i++){
88 The main entry to this pass is vectorize_loops(), in which
89 the vectorizer applies a set of analyses on a given set of loops,
90 followed by the actual vectorization transformation for the loops that
91 had successfully passed the analysis phase.
92 Throughout this pass we make a distinction between two types of
93 data: scalars (which are represented by SSA_NAMES), and memory references
94 ("data-refs"). These two types of data require different handling both
95 during analysis and transformation. The types of data-refs that the
96 vectorizer currently supports are ARRAY_REFS which base is an array DECL
97 (not a pointer), and INDIRECT_REFS through pointers; both array and pointer
98 accesses are required to have a simple (consecutive) access pattern.
102 The driver for the analysis phase is vect_analyze_loop().
103 It applies a set of analyses, some of which rely on the scalar evolution
104 analyzer (scev) developed by Sebastian Pop.
106 During the analysis phase the vectorizer records some information
107 per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
108 loop, as well as general information about the loop as a whole, which is
109 recorded in a "loop_vec_info" struct attached to each loop.
111 Transformation phase:
112 =====================
113 The loop transformation phase scans all the stmts in the loop, and
114 creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
115 the loop that needs to be vectorized. It inserts the vector code sequence
116 just before the scalar stmt S, and records a pointer to the vector code
117 in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
118 attached to S). This pointer will be used for the vectorization of following
119 stmts which use the def of stmt S. Stmt S is removed if it writes to memory;
120 otherwise, we rely on dead code elimination for removing it.
122 For example, say stmt S1 was vectorized into stmt VS1:
125 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
128 To vectorize stmt S2, the vectorizer first finds the stmt that defines
129 the operand 'b' (S1), and gets the relevant vector def 'vb' from the
130 vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)). The
131 resulting sequence would be:
134 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
136 S2: a = b; STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2
138 Operands that are not SSA_NAMEs, are data-refs that appear in
139 load/store operations (like 'x[i]' in S1), and are handled differently.
143 Currently the only target specific information that is used is the
144 size of the vector (in bytes) - "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".
145 Targets that can support different sizes of vectors, for now will need
146 to specify one value for "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD". More
147 flexibility will be added in the future.
149 Since we only vectorize operations which vector form can be
150 expressed using existing tree codes, to verify that an operation is
151 supported, the vectorizer checks the relevant optab at the relevant
152 machine_mode (e.g, optab_handler (add_optab, V8HImode)). If
153 the value found is CODE_FOR_nothing, then there's no target support, and
154 we can't vectorize the stmt.
156 For additional information on this project see:
157 http://gcc.gnu.org/projects/tree-ssa/vectorization.html
160 static void vect_estimate_min_profitable_iters (loop_vec_info
, int *, int *,
162 static stmt_vec_info
vect_is_simple_reduction (loop_vec_info
, stmt_vec_info
,
163 bool *, bool *, bool);
165 /* Subroutine of vect_determine_vf_for_stmt that handles only one
166 statement. VECTYPE_MAYBE_SET_P is true if STMT_VINFO_VECTYPE
167 may already be set for general statements (not just data refs). */
170 vect_determine_vf_for_stmt_1 (vec_info
*vinfo
, stmt_vec_info stmt_info
,
171 bool vectype_maybe_set_p
,
174 gimple
*stmt
= stmt_info
->stmt
;
176 if ((!STMT_VINFO_RELEVANT_P (stmt_info
)
177 && !STMT_VINFO_LIVE_P (stmt_info
))
178 || gimple_clobber_p (stmt
))
180 if (dump_enabled_p ())
181 dump_printf_loc (MSG_NOTE
, vect_location
, "skip.\n");
182 return opt_result::success ();
185 tree stmt_vectype
, nunits_vectype
;
186 opt_result res
= vect_get_vector_types_for_stmt (vinfo
, stmt_info
,
194 if (STMT_VINFO_VECTYPE (stmt_info
))
195 /* The only case when a vectype had been already set is for stmts
196 that contain a data ref, or for "pattern-stmts" (stmts generated
197 by the vectorizer to represent/replace a certain idiom). */
198 gcc_assert ((STMT_VINFO_DATA_REF (stmt_info
)
199 || vectype_maybe_set_p
)
200 && STMT_VINFO_VECTYPE (stmt_info
) == stmt_vectype
);
202 STMT_VINFO_VECTYPE (stmt_info
) = stmt_vectype
;
206 vect_update_max_nunits (vf
, nunits_vectype
);
208 return opt_result::success ();
211 /* Subroutine of vect_determine_vectorization_factor. Set the vector
212 types of STMT_INFO and all attached pattern statements and update
213 the vectorization factor VF accordingly. Return true on success
214 or false if something prevented vectorization. */
217 vect_determine_vf_for_stmt (vec_info
*vinfo
,
218 stmt_vec_info stmt_info
, poly_uint64
*vf
)
220 if (dump_enabled_p ())
221 dump_printf_loc (MSG_NOTE
, vect_location
, "==> examining statement: %G",
223 opt_result res
= vect_determine_vf_for_stmt_1 (vinfo
, stmt_info
, false, vf
);
227 if (STMT_VINFO_IN_PATTERN_P (stmt_info
)
228 && STMT_VINFO_RELATED_STMT (stmt_info
))
230 gimple
*pattern_def_seq
= STMT_VINFO_PATTERN_DEF_SEQ (stmt_info
);
231 stmt_info
= STMT_VINFO_RELATED_STMT (stmt_info
);
233 /* If a pattern statement has def stmts, analyze them too. */
234 for (gimple_stmt_iterator si
= gsi_start (pattern_def_seq
);
235 !gsi_end_p (si
); gsi_next (&si
))
237 stmt_vec_info def_stmt_info
= vinfo
->lookup_stmt (gsi_stmt (si
));
238 if (dump_enabled_p ())
239 dump_printf_loc (MSG_NOTE
, vect_location
,
240 "==> examining pattern def stmt: %G",
241 def_stmt_info
->stmt
);
242 res
= vect_determine_vf_for_stmt_1 (vinfo
, def_stmt_info
, true, vf
);
247 if (dump_enabled_p ())
248 dump_printf_loc (MSG_NOTE
, vect_location
,
249 "==> examining pattern statement: %G",
251 res
= vect_determine_vf_for_stmt_1 (vinfo
, stmt_info
, true, vf
);
256 return opt_result::success ();
259 /* Function vect_determine_vectorization_factor
261 Determine the vectorization factor (VF). VF is the number of data elements
262 that are operated upon in parallel in a single iteration of the vectorized
263 loop. For example, when vectorizing a loop that operates on 4byte elements,
264 on a target with vector size (VS) 16byte, the VF is set to 4, since 4
265 elements can fit in a single vector register.
267 We currently support vectorization of loops in which all types operated upon
268 are of the same size. Therefore this function currently sets VF according to
269 the size of the types operated upon, and fails if there are multiple sizes
272 VF is also the factor by which the loop iterations are strip-mined, e.g.:
279 for (i=0; i<N; i+=VF){
280 a[i:VF] = b[i:VF] + c[i:VF];
285 vect_determine_vectorization_factor (loop_vec_info loop_vinfo
)
287 class loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
288 basic_block
*bbs
= LOOP_VINFO_BBS (loop_vinfo
);
289 unsigned nbbs
= loop
->num_nodes
;
290 poly_uint64 vectorization_factor
= 1;
291 tree scalar_type
= NULL_TREE
;
294 stmt_vec_info stmt_info
;
297 DUMP_VECT_SCOPE ("vect_determine_vectorization_factor");
299 for (i
= 0; i
< nbbs
; i
++)
301 basic_block bb
= bbs
[i
];
303 for (gphi_iterator si
= gsi_start_phis (bb
); !gsi_end_p (si
);
307 stmt_info
= loop_vinfo
->lookup_stmt (phi
);
308 if (dump_enabled_p ())
309 dump_printf_loc (MSG_NOTE
, vect_location
, "==> examining phi: %G",
312 gcc_assert (stmt_info
);
314 if (STMT_VINFO_RELEVANT_P (stmt_info
)
315 || STMT_VINFO_LIVE_P (stmt_info
))
317 gcc_assert (!STMT_VINFO_VECTYPE (stmt_info
));
318 scalar_type
= TREE_TYPE (PHI_RESULT (phi
));
320 if (dump_enabled_p ())
321 dump_printf_loc (MSG_NOTE
, vect_location
,
322 "get vectype for scalar type: %T\n",
325 vectype
= get_vectype_for_scalar_type (loop_vinfo
, scalar_type
);
327 return opt_result::failure_at (phi
,
328 "not vectorized: unsupported "
331 STMT_VINFO_VECTYPE (stmt_info
) = vectype
;
333 if (dump_enabled_p ())
334 dump_printf_loc (MSG_NOTE
, vect_location
, "vectype: %T\n",
337 if (dump_enabled_p ())
339 dump_printf_loc (MSG_NOTE
, vect_location
, "nunits = ");
340 dump_dec (MSG_NOTE
, TYPE_VECTOR_SUBPARTS (vectype
));
341 dump_printf (MSG_NOTE
, "\n");
344 vect_update_max_nunits (&vectorization_factor
, vectype
);
348 for (gimple_stmt_iterator si
= gsi_start_bb (bb
); !gsi_end_p (si
);
351 if (is_gimple_debug (gsi_stmt (si
)))
353 stmt_info
= loop_vinfo
->lookup_stmt (gsi_stmt (si
));
355 = vect_determine_vf_for_stmt (loop_vinfo
,
356 stmt_info
, &vectorization_factor
);
362 /* TODO: Analyze cost. Decide if worth while to vectorize. */
363 if (dump_enabled_p ())
365 dump_printf_loc (MSG_NOTE
, vect_location
, "vectorization factor = ");
366 dump_dec (MSG_NOTE
, vectorization_factor
);
367 dump_printf (MSG_NOTE
, "\n");
370 if (known_le (vectorization_factor
, 1U))
371 return opt_result::failure_at (vect_location
,
372 "not vectorized: unsupported data-type\n");
373 LOOP_VINFO_VECT_FACTOR (loop_vinfo
) = vectorization_factor
;
374 return opt_result::success ();
378 /* Function vect_is_simple_iv_evolution.
380 FORNOW: A simple evolution of an induction variables in the loop is
381 considered a polynomial evolution. */
384 vect_is_simple_iv_evolution (unsigned loop_nb
, tree access_fn
, tree
* init
,
389 tree evolution_part
= evolution_part_in_loop_num (access_fn
, loop_nb
);
392 /* When there is no evolution in this loop, the evolution function
394 if (evolution_part
== NULL_TREE
)
397 /* When the evolution is a polynomial of degree >= 2
398 the evolution function is not "simple". */
399 if (tree_is_chrec (evolution_part
))
402 step_expr
= evolution_part
;
403 init_expr
= unshare_expr (initial_condition_in_loop_num (access_fn
, loop_nb
));
405 if (dump_enabled_p ())
406 dump_printf_loc (MSG_NOTE
, vect_location
, "step: %T, init: %T\n",
407 step_expr
, init_expr
);
412 if (TREE_CODE (step_expr
) != INTEGER_CST
413 && (TREE_CODE (step_expr
) != SSA_NAME
414 || ((bb
= gimple_bb (SSA_NAME_DEF_STMT (step_expr
)))
415 && flow_bb_inside_loop_p (get_loop (cfun
, loop_nb
), bb
))
416 || (!INTEGRAL_TYPE_P (TREE_TYPE (step_expr
))
417 && (!SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr
))
418 || !flag_associative_math
)))
419 && (TREE_CODE (step_expr
) != REAL_CST
420 || !flag_associative_math
))
422 if (dump_enabled_p ())
423 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
431 /* Function vect_is_nonlinear_iv_evolution
433 Only support nonlinear induction for integer type
436 3. lshift/rshift by constant.
438 For neg induction, return a fake step as integer -1. */
440 vect_is_nonlinear_iv_evolution (class loop
* loop
, stmt_vec_info stmt_info
,
441 gphi
* loop_phi_node
, tree
*init
, tree
*step
)
443 tree init_expr
, ev_expr
, result
, op1
, op2
;
446 if (gimple_phi_num_args (loop_phi_node
) != 2)
449 init_expr
= PHI_ARG_DEF_FROM_EDGE (loop_phi_node
, loop_preheader_edge (loop
));
450 ev_expr
= PHI_ARG_DEF_FROM_EDGE (loop_phi_node
, loop_latch_edge (loop
));
452 /* Support nonlinear induction only for integer type. */
453 if (!INTEGRAL_TYPE_P (TREE_TYPE (init_expr
)))
457 result
= PHI_RESULT (loop_phi_node
);
459 if (TREE_CODE (ev_expr
) != SSA_NAME
460 || ((def
= SSA_NAME_DEF_STMT (ev_expr
)), false)
461 || !is_gimple_assign (def
))
464 enum tree_code t_code
= gimple_assign_rhs_code (def
);
468 if (gimple_assign_rhs1 (def
) != result
)
470 *step
= build_int_cst (TREE_TYPE (init_expr
), -1);
471 STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info
) = vect_step_op_neg
;
477 op1
= gimple_assign_rhs1 (def
);
478 op2
= gimple_assign_rhs2 (def
);
479 if (TREE_CODE (op2
) != INTEGER_CST
483 if (t_code
== LSHIFT_EXPR
)
484 STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info
) = vect_step_op_shl
;
485 else if (t_code
== RSHIFT_EXPR
)
486 STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info
) = vect_step_op_shr
;
487 /* NEGATE_EXPR and MULT_EXPR are both vect_step_op_mul. */
489 STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info
) = vect_step_op_mul
;
496 STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_info
) = *init
;
497 STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info
) = *step
;
502 /* Return true if PHI, described by STMT_INFO, is the inner PHI in
503 what we are assuming is a double reduction. For example, given
504 a structure like this:
507 x_1 = PHI <x_4(outer2), ...>;
511 x_2 = PHI <x_1(outer1), ...>;
517 x_4 = PHI <x_3(inner)>;
520 outer loop analysis would treat x_1 as a double reduction phi and
521 this function would then return true for x_2. */
524 vect_inner_phi_in_double_reduction_p (loop_vec_info loop_vinfo
, gphi
*phi
)
528 FOR_EACH_PHI_ARG (use_p
, phi
, op_iter
, SSA_OP_USE
)
529 if (stmt_vec_info def_info
= loop_vinfo
->lookup_def (USE_FROM_PTR (use_p
)))
530 if (STMT_VINFO_DEF_TYPE (def_info
) == vect_double_reduction_def
)
535 /* Returns true if Phi is a first-order recurrence. A first-order
536 recurrence is a non-reduction recurrence relation in which the value of
537 the recurrence in the current loop iteration equals a value defined in
538 the previous iteration. */
541 vect_phi_first_order_recurrence_p (loop_vec_info loop_vinfo
, class loop
*loop
,
544 /* A nested cycle isn't vectorizable as first order recurrence. */
545 if (LOOP_VINFO_LOOP (loop_vinfo
) != loop
)
548 /* Ensure the loop latch definition is from within the loop. */
549 edge latch
= loop_latch_edge (loop
);
550 tree ldef
= PHI_ARG_DEF_FROM_EDGE (phi
, latch
);
551 if (TREE_CODE (ldef
) != SSA_NAME
552 || SSA_NAME_IS_DEFAULT_DEF (ldef
)
553 || is_a
<gphi
*> (SSA_NAME_DEF_STMT (ldef
))
554 || !flow_bb_inside_loop_p (loop
, gimple_bb (SSA_NAME_DEF_STMT (ldef
))))
557 tree def
= gimple_phi_result (phi
);
559 /* Ensure every use_stmt of the phi node is dominated by the latch
561 imm_use_iterator imm_iter
;
563 FOR_EACH_IMM_USE_FAST (use_p
, imm_iter
, def
)
564 if (!is_gimple_debug (USE_STMT (use_p
))
565 && (SSA_NAME_DEF_STMT (ldef
) == USE_STMT (use_p
)
566 || !vect_stmt_dominates_stmt_p (SSA_NAME_DEF_STMT (ldef
),
570 /* First-order recurrence autovectorization needs shuffle vector. */
571 tree scalar_type
= TREE_TYPE (def
);
572 tree vectype
= get_vectype_for_scalar_type (loop_vinfo
, scalar_type
);
579 /* Function vect_analyze_scalar_cycles_1.
581 Examine the cross iteration def-use cycles of scalar variables
582 in LOOP. LOOP_VINFO represents the loop that is now being
583 considered for vectorization (can be LOOP, or an outer-loop
584 enclosing LOOP). SLP indicates there will be some subsequent
585 slp analyses or not. */
588 vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo
, class loop
*loop
,
591 basic_block bb
= loop
->header
;
593 auto_vec
<stmt_vec_info
, 64> worklist
;
595 bool double_reduc
, reduc_chain
;
597 DUMP_VECT_SCOPE ("vect_analyze_scalar_cycles");
599 /* First - identify all inductions. Reduction detection assumes that all the
600 inductions have been identified, therefore, this order must not be
602 for (gsi
= gsi_start_phis (bb
); !gsi_end_p (gsi
); gsi_next (&gsi
))
604 gphi
*phi
= gsi
.phi ();
605 tree access_fn
= NULL
;
606 tree def
= PHI_RESULT (phi
);
607 stmt_vec_info stmt_vinfo
= loop_vinfo
->lookup_stmt (phi
);
609 if (dump_enabled_p ())
610 dump_printf_loc (MSG_NOTE
, vect_location
, "Analyze phi: %G",
613 /* Skip virtual phi's. The data dependences that are associated with
614 virtual defs/uses (i.e., memory accesses) are analyzed elsewhere. */
615 if (virtual_operand_p (def
))
618 STMT_VINFO_DEF_TYPE (stmt_vinfo
) = vect_unknown_def_type
;
620 /* Analyze the evolution function. */
621 access_fn
= analyze_scalar_evolution (loop
, def
);
624 STRIP_NOPS (access_fn
);
625 if (dump_enabled_p ())
626 dump_printf_loc (MSG_NOTE
, vect_location
,
627 "Access function of PHI: %T\n", access_fn
);
628 STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo
)
629 = initial_condition_in_loop_num (access_fn
, loop
->num
);
630 STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo
)
631 = evolution_part_in_loop_num (access_fn
, loop
->num
);
635 || vect_inner_phi_in_double_reduction_p (loop_vinfo
, phi
)
636 || !vect_is_simple_iv_evolution (loop
->num
, access_fn
,
638 || (LOOP_VINFO_LOOP (loop_vinfo
) != loop
639 && TREE_CODE (step
) != INTEGER_CST
))
640 /* Only handle nonlinear iv for same loop. */
641 && (LOOP_VINFO_LOOP (loop_vinfo
) != loop
642 || !vect_is_nonlinear_iv_evolution (loop
, stmt_vinfo
,
645 worklist
.safe_push (stmt_vinfo
);
649 gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo
)
651 gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo
) != NULL_TREE
);
653 if (dump_enabled_p ())
654 dump_printf_loc (MSG_NOTE
, vect_location
, "Detected induction.\n");
655 STMT_VINFO_DEF_TYPE (stmt_vinfo
) = vect_induction_def
;
659 /* Second - identify all reductions and nested cycles. */
660 while (worklist
.length () > 0)
662 stmt_vec_info stmt_vinfo
= worklist
.pop ();
663 gphi
*phi
= as_a
<gphi
*> (stmt_vinfo
->stmt
);
664 tree def
= PHI_RESULT (phi
);
666 if (dump_enabled_p ())
667 dump_printf_loc (MSG_NOTE
, vect_location
, "Analyze phi: %G",
670 gcc_assert (!virtual_operand_p (def
)
671 && STMT_VINFO_DEF_TYPE (stmt_vinfo
) == vect_unknown_def_type
);
673 stmt_vec_info reduc_stmt_info
674 = vect_is_simple_reduction (loop_vinfo
, stmt_vinfo
, &double_reduc
,
678 STMT_VINFO_REDUC_DEF (stmt_vinfo
) = reduc_stmt_info
;
679 STMT_VINFO_REDUC_DEF (reduc_stmt_info
) = stmt_vinfo
;
682 if (dump_enabled_p ())
683 dump_printf_loc (MSG_NOTE
, vect_location
,
684 "Detected double reduction.\n");
686 STMT_VINFO_DEF_TYPE (stmt_vinfo
) = vect_double_reduction_def
;
687 STMT_VINFO_DEF_TYPE (reduc_stmt_info
) = vect_double_reduction_def
;
691 if (loop
!= LOOP_VINFO_LOOP (loop_vinfo
))
693 if (dump_enabled_p ())
694 dump_printf_loc (MSG_NOTE
, vect_location
,
695 "Detected vectorizable nested cycle.\n");
697 STMT_VINFO_DEF_TYPE (stmt_vinfo
) = vect_nested_cycle
;
701 if (dump_enabled_p ())
702 dump_printf_loc (MSG_NOTE
, vect_location
,
703 "Detected reduction.\n");
705 STMT_VINFO_DEF_TYPE (stmt_vinfo
) = vect_reduction_def
;
706 STMT_VINFO_DEF_TYPE (reduc_stmt_info
) = vect_reduction_def
;
707 /* Store the reduction cycles for possible vectorization in
708 loop-aware SLP if it was not detected as reduction
711 LOOP_VINFO_REDUCTIONS (loop_vinfo
).safe_push
716 else if (vect_phi_first_order_recurrence_p (loop_vinfo
, loop
, phi
))
717 STMT_VINFO_DEF_TYPE (stmt_vinfo
) = vect_first_order_recurrence
;
719 if (dump_enabled_p ())
720 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
721 "Unknown def-use cycle pattern.\n");
726 /* Function vect_analyze_scalar_cycles.
728 Examine the cross iteration def-use cycles of scalar variables, by
729 analyzing the loop-header PHIs of scalar variables. Classify each
730 cycle as one of the following: invariant, induction, reduction, unknown.
731 We do that for the loop represented by LOOP_VINFO, and also to its
732 inner-loop, if exists.
733 Examples for scalar cycles:
748 vect_analyze_scalar_cycles (loop_vec_info loop_vinfo
, bool slp
)
750 class loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
752 vect_analyze_scalar_cycles_1 (loop_vinfo
, loop
, slp
);
754 /* When vectorizing an outer-loop, the inner-loop is executed sequentially.
755 Reductions in such inner-loop therefore have different properties than
756 the reductions in the nest that gets vectorized:
757 1. When vectorized, they are executed in the same order as in the original
758 scalar loop, so we can't change the order of computation when
760 2. FIXME: Inner-loop reductions can be used in the inner-loop, so the
761 current checks are too strict. */
764 vect_analyze_scalar_cycles_1 (loop_vinfo
, loop
->inner
, slp
);
767 /* Transfer group and reduction information from STMT_INFO to its
771 vect_fixup_reduc_chain (stmt_vec_info stmt_info
)
773 stmt_vec_info firstp
= STMT_VINFO_RELATED_STMT (stmt_info
);
775 gcc_assert (!REDUC_GROUP_FIRST_ELEMENT (firstp
)
776 && REDUC_GROUP_FIRST_ELEMENT (stmt_info
));
777 REDUC_GROUP_SIZE (firstp
) = REDUC_GROUP_SIZE (stmt_info
);
780 stmtp
= STMT_VINFO_RELATED_STMT (stmt_info
);
781 gcc_checking_assert (STMT_VINFO_DEF_TYPE (stmtp
)
782 == STMT_VINFO_DEF_TYPE (stmt_info
));
783 REDUC_GROUP_FIRST_ELEMENT (stmtp
) = firstp
;
784 stmt_info
= REDUC_GROUP_NEXT_ELEMENT (stmt_info
);
786 REDUC_GROUP_NEXT_ELEMENT (stmtp
)
787 = STMT_VINFO_RELATED_STMT (stmt_info
);
792 /* Fixup scalar cycles that now have their stmts detected as patterns. */
795 vect_fixup_scalar_cycles_with_patterns (loop_vec_info loop_vinfo
)
800 FOR_EACH_VEC_ELT (LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo
), i
, first
)
802 stmt_vec_info next
= REDUC_GROUP_NEXT_ELEMENT (first
);
805 if ((STMT_VINFO_IN_PATTERN_P (next
)
806 != STMT_VINFO_IN_PATTERN_P (first
))
807 || STMT_VINFO_REDUC_IDX (vect_stmt_to_vectorize (next
)) == -1)
809 next
= REDUC_GROUP_NEXT_ELEMENT (next
);
811 /* If all reduction chain members are well-formed patterns adjust
812 the group to group the pattern stmts instead. */
814 && STMT_VINFO_REDUC_IDX (vect_stmt_to_vectorize (first
)) != -1)
816 if (STMT_VINFO_IN_PATTERN_P (first
))
818 vect_fixup_reduc_chain (first
);
819 LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo
)[i
]
820 = STMT_VINFO_RELATED_STMT (first
);
823 /* If not all stmt in the chain are patterns or if we failed
824 to update STMT_VINFO_REDUC_IDX dissolve the chain and handle
825 it as regular reduction instead. */
828 stmt_vec_info vinfo
= first
;
829 stmt_vec_info last
= NULL
;
832 next
= REDUC_GROUP_NEXT_ELEMENT (vinfo
);
833 REDUC_GROUP_FIRST_ELEMENT (vinfo
) = NULL
;
834 REDUC_GROUP_NEXT_ELEMENT (vinfo
) = NULL
;
838 STMT_VINFO_DEF_TYPE (vect_stmt_to_vectorize (first
))
840 loop_vinfo
->reductions
.safe_push (vect_stmt_to_vectorize (last
));
841 LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo
).unordered_remove (i
);
847 /* Function vect_get_loop_niters.
849 Determine how many iterations the loop is executed and place it
850 in NUMBER_OF_ITERATIONS. Place the number of latch iterations
851 in NUMBER_OF_ITERATIONSM1. Place the condition under which the
852 niter information holds in ASSUMPTIONS.
854 Return the loop exit conditions. */
858 vect_get_loop_niters (class loop
*loop
, const_edge main_exit
, tree
*assumptions
,
859 tree
*number_of_iterations
, tree
*number_of_iterationsm1
)
861 auto_vec
<edge
> exits
= get_loop_exit_edges (loop
);
863 conds
.create (exits
.length ());
864 class tree_niter_desc niter_desc
;
865 tree niter_assumptions
, niter
, may_be_zero
;
867 *assumptions
= boolean_true_node
;
868 *number_of_iterationsm1
= chrec_dont_know
;
869 *number_of_iterations
= chrec_dont_know
;
871 DUMP_VECT_SCOPE ("get_loop_niters");
873 if (exits
.is_empty ())
876 if (dump_enabled_p ())
877 dump_printf_loc (MSG_NOTE
, vect_location
, "Loop has %d exits.\n",
882 FOR_EACH_VEC_ELT (exits
, i
, exit
)
884 gcond
*cond
= get_loop_exit_condition (exit
);
886 conds
.safe_push (cond
);
888 if (dump_enabled_p ())
889 dump_printf_loc (MSG_NOTE
, vect_location
, "Analyzing exit %d...\n", i
);
891 if (exit
!= main_exit
)
894 may_be_zero
= NULL_TREE
;
895 if (!number_of_iterations_exit_assumptions (loop
, exit
, &niter_desc
, NULL
)
896 || chrec_contains_undetermined (niter_desc
.niter
))
899 niter_assumptions
= niter_desc
.assumptions
;
900 may_be_zero
= niter_desc
.may_be_zero
;
901 niter
= niter_desc
.niter
;
903 if (may_be_zero
&& integer_zerop (may_be_zero
))
904 may_be_zero
= NULL_TREE
;
908 if (COMPARISON_CLASS_P (may_be_zero
))
910 /* Try to combine may_be_zero with assumptions, this can simplify
911 computation of niter expression. */
912 if (niter_assumptions
&& !integer_nonzerop (niter_assumptions
))
913 niter_assumptions
= fold_build2 (TRUTH_AND_EXPR
, boolean_type_node
,
915 fold_build1 (TRUTH_NOT_EXPR
,
919 niter
= fold_build3 (COND_EXPR
, TREE_TYPE (niter
), may_be_zero
,
920 build_int_cst (TREE_TYPE (niter
), 0),
921 rewrite_to_non_trapping_overflow (niter
));
923 may_be_zero
= NULL_TREE
;
925 else if (integer_nonzerop (may_be_zero
))
927 *number_of_iterationsm1
= build_int_cst (TREE_TYPE (niter
), 0);
928 *number_of_iterations
= build_int_cst (TREE_TYPE (niter
), 1);
935 /* Loop assumptions are based off the normal exit. */
936 *assumptions
= niter_assumptions
;
937 *number_of_iterationsm1
= niter
;
939 /* We want the number of loop header executions which is the number
940 of latch executions plus one.
941 ??? For UINT_MAX latch executions this number overflows to zero
942 for loops like do { n++; } while (n != 0); */
943 if (niter
&& !chrec_contains_undetermined (niter
))
945 niter
= fold_build2 (PLUS_EXPR
, TREE_TYPE (niter
),
946 unshare_expr (niter
),
947 build_int_cst (TREE_TYPE (niter
), 1));
948 if (TREE_CODE (niter
) == INTEGER_CST
949 && TREE_CODE (*number_of_iterationsm1
) != INTEGER_CST
)
951 /* If we manage to fold niter + 1 into INTEGER_CST even when
952 niter is some complex expression, ensure back
953 *number_of_iterationsm1 is an INTEGER_CST as well. See
955 *number_of_iterationsm1
956 = fold_build2 (PLUS_EXPR
, TREE_TYPE (niter
), niter
,
957 build_minus_one_cst (TREE_TYPE (niter
)));
960 *number_of_iterations
= niter
;
963 if (dump_enabled_p ())
964 dump_printf_loc (MSG_NOTE
, vect_location
, "All loop exits successfully analyzed.\n");
969 /* Determine the main loop exit for the vectorizer. */
972 vec_init_loop_exit_info (class loop
*loop
)
974 /* Before we begin we must first determine which exit is the main one and
975 which are auxilary exits. */
976 auto_vec
<edge
> exits
= get_loop_exit_edges (loop
);
977 if (exits
.length () == 1)
980 /* If we have multiple exits we only support counting IV at the moment.
981 Analyze all exits and return the last one we can analyze. */
982 class tree_niter_desc niter_desc
;
983 edge candidate
= NULL
;
984 for (edge exit
: exits
)
986 if (!get_loop_exit_condition (exit
))
989 if (number_of_iterations_exit_assumptions (loop
, exit
, &niter_desc
, NULL
)
990 && !chrec_contains_undetermined (niter_desc
.niter
))
992 tree may_be_zero
= niter_desc
.may_be_zero
;
993 if ((integer_zerop (may_be_zero
)
994 /* As we are handling may_be_zero that's not false by
995 rewriting niter to may_be_zero ? 0 : niter we require
997 || (single_pred_p (loop
->latch
)
998 && exit
->src
== single_pred (loop
->latch
)
999 && (integer_nonzerop (may_be_zero
)
1000 || COMPARISON_CLASS_P (may_be_zero
))))
1002 || dominated_by_p (CDI_DOMINATORS
, exit
->src
,
1011 /* Function bb_in_loop_p
1013 Used as predicate for dfs order traversal of the loop bbs. */
1016 bb_in_loop_p (const_basic_block bb
, const void *data
)
1018 const class loop
*const loop
= (const class loop
*)data
;
1019 if (flow_bb_inside_loop_p (loop
, bb
))
1025 /* Create and initialize a new loop_vec_info struct for LOOP_IN, as well as
1026 stmt_vec_info structs for all the stmts in LOOP_IN. */
1028 _loop_vec_info::_loop_vec_info (class loop
*loop_in
, vec_info_shared
*shared
)
1029 : vec_info (vec_info::loop
, shared
),
1031 bbs (XCNEWVEC (basic_block
, loop
->num_nodes
)),
1032 num_itersm1 (NULL_TREE
),
1033 num_iters (NULL_TREE
),
1034 num_iters_unchanged (NULL_TREE
),
1035 num_iters_assumptions (NULL_TREE
),
1036 vector_costs (nullptr),
1037 scalar_costs (nullptr),
1039 versioning_threshold (0),
1040 vectorization_factor (0),
1041 main_loop_edge (nullptr),
1042 skip_main_loop_edge (nullptr),
1043 skip_this_loop_edge (nullptr),
1044 reusable_accumulators (),
1045 suggested_unroll_factor (1),
1046 max_vectorization_factor (0),
1047 mask_skip_niters (NULL_TREE
),
1048 rgroup_compare_type (NULL_TREE
),
1049 simd_if_cond (NULL_TREE
),
1050 partial_vector_style (vect_partial_vectors_none
),
1051 unaligned_dr (NULL
),
1052 peeling_for_alignment (0),
1056 slp_unrolling_factor (1),
1057 inner_loop_cost_factor (param_vect_inner_loop_cost_factor
),
1058 vectorizable (false),
1059 can_use_partial_vectors_p (param_vect_partial_vector_usage
!= 0),
1060 using_partial_vectors_p (false),
1061 using_decrementing_iv_p (false),
1062 using_select_vl_p (false),
1063 epil_using_partial_vectors_p (false),
1064 partial_load_store_bias (0),
1065 peeling_for_gaps (false),
1066 peeling_for_niter (false),
1067 early_breaks (false),
1068 no_data_dependencies (false),
1069 has_mask_store (false),
1070 scalar_loop_scaling (profile_probability::uninitialized ()),
1072 orig_loop_info (NULL
),
1073 vec_loop_iv_exit (NULL
),
1074 vec_epilogue_loop_iv_exit (NULL
),
1075 scalar_loop_iv_exit (NULL
)
1077 /* CHECKME: We want to visit all BBs before their successors (except for
1078 latch blocks, for which this assertion wouldn't hold). In the simple
1079 case of the loop forms we allow, a dfs order of the BBs would the same
1080 as reversed postorder traversal, so we are safe. */
1082 unsigned int nbbs
= dfs_enumerate_from (loop
->header
, 0, bb_in_loop_p
,
1083 bbs
, loop
->num_nodes
, loop
);
1084 gcc_assert (nbbs
== loop
->num_nodes
);
1086 for (unsigned int i
= 0; i
< nbbs
; i
++)
1088 basic_block bb
= bbs
[i
];
1089 gimple_stmt_iterator si
;
1091 for (si
= gsi_start_phis (bb
); !gsi_end_p (si
); gsi_next (&si
))
1093 gimple
*phi
= gsi_stmt (si
);
1094 gimple_set_uid (phi
, 0);
1098 for (si
= gsi_start_bb (bb
); !gsi_end_p (si
); gsi_next (&si
))
1100 gimple
*stmt
= gsi_stmt (si
);
1101 gimple_set_uid (stmt
, 0);
1102 if (is_gimple_debug (stmt
))
1105 /* If .GOMP_SIMD_LANE call for the current loop has 3 arguments, the
1106 third argument is the #pragma omp simd if (x) condition, when 0,
1107 loop shouldn't be vectorized, when non-zero constant, it should
1108 be vectorized normally, otherwise versioned with vectorized loop
1109 done if the condition is non-zero at runtime. */
1110 if (loop_in
->simduid
1111 && is_gimple_call (stmt
)
1112 && gimple_call_internal_p (stmt
)
1113 && gimple_call_internal_fn (stmt
) == IFN_GOMP_SIMD_LANE
1114 && gimple_call_num_args (stmt
) >= 3
1115 && TREE_CODE (gimple_call_arg (stmt
, 0)) == SSA_NAME
1116 && (loop_in
->simduid
1117 == SSA_NAME_VAR (gimple_call_arg (stmt
, 0))))
1119 tree arg
= gimple_call_arg (stmt
, 2);
1120 if (integer_zerop (arg
) || TREE_CODE (arg
) == SSA_NAME
)
1123 gcc_assert (integer_nonzerop (arg
));
1128 epilogue_vinfos
.create (6);
1131 /* Free all levels of rgroup CONTROLS. */
1134 release_vec_loop_controls (vec
<rgroup_controls
> *controls
)
1136 rgroup_controls
*rgc
;
1138 FOR_EACH_VEC_ELT (*controls
, i
, rgc
)
1139 rgc
->controls
.release ();
1140 controls
->release ();
1143 /* Free all memory used by the _loop_vec_info, as well as all the
1144 stmt_vec_info structs of all the stmts in the loop. */
1146 _loop_vec_info::~_loop_vec_info ()
/* Drop the rgroup controls recorded for masked and length-based
   partial vectorization.  */
1150 release_vec_loop_controls (&masks
.rgc_vec
);
1151 release_vec_loop_controls (&lens
);
/* Only the container of epilogue vinfo pointers is released here;
   the vinfos themselves are presumably owned elsewhere — confirm
   against vect_analyze_loop.  */
1154 epilogue_vinfos
.release ();
1155 delete scalar_costs
;
1156 delete vector_costs
;
1158 /* When we release an epilogue vinfo that we do not intend to use
1159 avoid clearing AUX of the main loop which should continue to
1160 point to the main loop vinfo since otherwise we'll leak that. */
1161 if (loop
->aux
== this)
1165 /* Return an invariant or register for EXPR and emit necessary
1166 computations in the LOOP_VINFO loop preheader. */
1169 cse_and_gimplify_to_preheader (loop_vec_info loop_vinfo
, tree expr
)
1171 if (is_gimple_reg (expr
)
1172 || is_gimple_min_invariant (expr
))
1175 if (! loop_vinfo
->ivexpr_map
)
1176 loop_vinfo
->ivexpr_map
= new hash_map
<tree_operand_hash
, tree
>;
1177 tree
&cached
= loop_vinfo
->ivexpr_map
->get_or_insert (expr
);
1180 gimple_seq stmts
= NULL
;
1181 cached
= force_gimple_operand (unshare_expr (expr
),
1182 &stmts
, true, NULL_TREE
);
1185 edge e
= loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo
));
1186 gsi_insert_seq_on_edge_immediate (e
, stmts
);
1192 /* Return true if we can use CMP_TYPE as the comparison type to produce
1193 all masks required to mask LOOP_VINFO. */
1196 can_produce_all_loop_masks_p (loop_vec_info loop_vinfo
, tree cmp_type
)
1198 rgroup_controls
*rgm
;
1200 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo
).rgc_vec
, i
, rgm
)
1201 if (rgm
->type
!= NULL_TREE
1202 && !direct_internal_fn_supported_p (IFN_WHILE_ULT
,
1203 cmp_type
, rgm
->type
,
1204 OPTIMIZE_FOR_SPEED
))
1209 /* Calculate the maximum number of scalars per iteration for every
1210 rgroup in LOOP_VINFO. */
1213 vect_get_max_nscalars_per_iter (loop_vec_info loop_vinfo
)
1215 unsigned int res
= 1;
1217 rgroup_controls
*rgm
;
1218 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo
).rgc_vec
, i
, rgm
)
1219 res
= MAX (res
, rgm
->max_nscalars_per_iter
);
1223 /* Calculate the minimum precision necessary to represent:
1227 as an unsigned integer, where MAX_NITERS is the maximum number of
1228 loop header iterations for the original scalar form of LOOP_VINFO. */
1231 vect_min_prec_for_max_niters (loop_vec_info loop_vinfo
, unsigned int factor
)
1233 class loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
1235 /* Get the maximum number of iterations that is representable
1236 in the counter type. */
1237 tree ni_type
= TREE_TYPE (LOOP_VINFO_NITERSM1 (loop_vinfo
));
1238 widest_int max_ni
= wi::to_widest (TYPE_MAX_VALUE (ni_type
)) + 1;
1240 /* Get a more refined estimate for the number of iterations. */
1241 widest_int max_back_edges
;
1242 if (max_loop_iterations (loop
, &max_back_edges
))
1243 max_ni
= wi::smin (max_ni
, max_back_edges
+ 1);
1245 /* Work out how many bits we need to represent the limit. */
1246 return wi::min_precision (max_ni
* factor
, UNSIGNED
);
1249 /* True if the loop needs peeling or partial vectors when vectorized. */
1252 vect_need_peeling_or_partial_vectors_p (loop_vec_info loop_vinfo
)
/* CONST_VF is only meaningful when the vectorization factor is a
   compile-time constant; see the is_constant call below.  MAX_NITER
   is only consulted under LOOP_REQUIRES_VERSIONING.  */
1254 unsigned HOST_WIDE_INT const_vf
;
1255 HOST_WIDE_INT max_niter
1256 = likely_max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo
));
/* The cost-model threshold; for an epilogue loop fall back to the
   threshold of the main (original) loop.  */
1258 unsigned th
= LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo
);
1259 if (!th
&& LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo
))
1260 th
= LOOP_VINFO_COST_MODEL_THRESHOLD (LOOP_VINFO_ORIG_LOOP_INFO
/* When the iteration count is known and prologue peeling is fixed,
   peeling/partial vectors are needed iff the remaining iterations do
   not divide evenly by the vectorization factor.  */
1263 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo
)
1264 && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo
) >= 0)
1266 /* Work out the (constant) number of iterations that need to be
1267 peeled for reasons other than niters. */
1268 unsigned int peel_niter
= LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo
);
1269 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo
))
1271 if (!multiple_p (LOOP_VINFO_INT_NITERS (loop_vinfo
) - peel_niter
,
1272 LOOP_VINFO_VECT_FACTOR (loop_vinfo
)))
1275 else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo
)
1276 /* ??? When peeling for gaps but not alignment, we could
1277 try to check whether the (variable) niters is known to be
1278 VF * N + 1. That's something of a niche case though. */
1279 || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo
)
1280 || !LOOP_VINFO_VECT_FACTOR (loop_vinfo
).is_constant (&const_vf
)
1281 || ((tree_ctz (LOOP_VINFO_NITERS (loop_vinfo
))
1282 < (unsigned) exact_log2 (const_vf
))
1283 /* In case of versioning, check if the maximum number of
1284 iterations is greater than th. If they are identical,
1285 the epilogue is unnecessary. */
1286 && (!LOOP_REQUIRES_VERSIONING (loop_vinfo
)
1287 || ((unsigned HOST_WIDE_INT
) max_niter
1288 /* We'd like to use LOOP_VINFO_VERSIONING_THRESHOLD
1289 but that's only computed later based on our result.
1290 The following is the most conservative approximation. */
1291 > (std::max ((unsigned HOST_WIDE_INT
) th
,
1292 const_vf
) / const_vf
) * const_vf
))))
1298 /* Each statement in LOOP_VINFO can be masked where necessary. Check
1299 whether we can actually generate the masks required. Return true if so,
1300 storing the type of the scalar IV in LOOP_VINFO_RGROUP_COMPARE_TYPE. */
1303 vect_verify_full_masking (loop_vec_info loop_vinfo
)
1305 unsigned int min_ni_width
;
1307 /* Use a normal loop if there are no statements that need masking.
1308 This only happens in rare degenerate cases: it means that the loop
1309 has no loads, no stores, and no live-out values. */
1310 if (LOOP_VINFO_MASKS (loop_vinfo
).is_empty ())
1313 /* Produce the rgroup controls. */
1314 for (auto mask
: LOOP_VINFO_MASKS (loop_vinfo
).mask_set
)
1316 vec_loop_masks
*masks
= &LOOP_VINFO_MASKS (loop_vinfo
);
1317 tree vectype
= mask
.first
;
1318 unsigned nvectors
= mask
.second
;
/* The rgroup vector is indexed by NVECTORS - 1; grow it on demand.  */
1320 if (masks
->rgc_vec
.length () < nvectors
)
1321 masks
->rgc_vec
.safe_grow_cleared (nvectors
, true);
1322 rgroup_controls
*rgm
= &(*masks
).rgc_vec
[nvectors
- 1];
1323 /* The number of scalars per iteration and the number of vectors are
1324 both compile-time constants. */
1325 unsigned int nscalars_per_iter
1326 = exact_div (nvectors
* TYPE_VECTOR_SUBPARTS (vectype
),
1327 LOOP_VINFO_VECT_FACTOR (loop_vinfo
)).to_constant ();
1329 if (rgm
->max_nscalars_per_iter
< nscalars_per_iter
)
1331 rgm
->max_nscalars_per_iter
= nscalars_per_iter
;
1332 rgm
->type
= truth_type_for (vectype
);
1337 unsigned int max_nscalars_per_iter
1338 = vect_get_max_nscalars_per_iter (loop_vinfo
);
1340 /* Work out how many bits we need to represent the limit. */
1342 = vect_min_prec_for_max_niters (loop_vinfo
, max_nscalars_per_iter
);
1344 /* Find a scalar mode for which WHILE_ULT is supported. */
1345 opt_scalar_int_mode cmp_mode_iter
;
1346 tree cmp_type
= NULL_TREE
;
1347 tree iv_type
= NULL_TREE
;
1348 widest_int iv_limit
= vect_iv_limit_for_partial_vectors (loop_vinfo
);
1349 unsigned int iv_precision
= UINT_MAX
;
1352 iv_precision
= wi::min_precision (iv_limit
* max_nscalars_per_iter
,
/* Walk the integer modes (presumably narrowest first — confirm against
   machmode.def ordering) looking for a usable comparison/IV type.  */
1355 FOR_EACH_MODE_IN_CLASS (cmp_mode_iter
, MODE_INT
)
1357 unsigned int cmp_bits
= GET_MODE_BITSIZE (cmp_mode_iter
.require ());
1358 if (cmp_bits
>= min_ni_width
1359 && targetm
.scalar_mode_supported_p (cmp_mode_iter
.require ()))
1361 tree this_type
= build_nonstandard_integer_type (cmp_bits
, true);
1363 && can_produce_all_loop_masks_p (loop_vinfo
, this_type
))
1365 /* Although we could stop as soon as we find a valid mode,
1366 there are at least two reasons why that's not always the
1369 - An IV that's Pmode or wider is more likely to be reusable
1370 in address calculations than an IV that's narrower than
1373 - Doing the comparison in IV_PRECISION or wider allows
1374 a natural 0-based IV, whereas using a narrower comparison
1375 type requires mitigations against wrap-around.
1377 Conversely, if the IV limit is variable, doing the comparison
1378 in a wider type than the original type can introduce
1379 unnecessary extensions, so picking the widest valid mode
1380 is not always a good choice either.
1382 Here we prefer the first IV type that's Pmode or wider,
1383 and the first comparison type that's IV_PRECISION or wider.
1384 (The comparison type must be no wider than the IV type,
1385 to avoid extensions in the vector loop.)
1387 ??? We might want to try continuing beyond Pmode for ILP32
1388 targets if CMP_BITS < IV_PRECISION. */
1389 iv_type
= this_type
;
1390 if (!cmp_type
|| iv_precision
> TYPE_PRECISION (cmp_type
))
1391 cmp_type
= this_type
;
1392 if (cmp_bits
>= GET_MODE_BITSIZE (Pmode
))
/* No usable comparison type: drop the rgroup controls built above.  */
1400 LOOP_VINFO_MASKS (loop_vinfo
).rgc_vec
.release ();
/* Success: record the chosen types and the WHILE_ULT style.  */
1404 LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo
) = cmp_type
;
1405 LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo
) = iv_type
;
1406 LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo
) = vect_partial_vectors_while_ult
;
1410 /* Each statement in LOOP_VINFO can be masked where necessary. Check
1411 whether we can actually generate AVX512 style masks. Return true if so,
1412 storing the type of the scalar IV in LOOP_VINFO_RGROUP_IV_TYPE. */
1415 vect_verify_full_masking_avx512 (loop_vec_info loop_vinfo
)
1417 /* Produce differently organized rgc_vec and differently check
1418 we can produce masks. */
1420 /* Use a normal loop if there are no statements that need masking.
1421 This only happens in rare degenerate cases: it means that the loop
1422 has no loads, no stores, and no live-out values. */
1423 if (LOOP_VINFO_MASKS (loop_vinfo
).is_empty ())
1426 /* For the decrementing IV we need to represent all values in
1427 [0, niter + niter_skip] where niter_skip is the elements we
1428 skip in the first iteration for prologue peeling. */
1429 tree iv_type
= NULL_TREE
;
1430 widest_int iv_limit
= vect_iv_limit_for_partial_vectors (loop_vinfo
);
1431 unsigned int iv_precision
= UINT_MAX
;
1433 iv_precision
= wi::min_precision (iv_limit
, UNSIGNED
);
1435 /* First compute the type for the IV we use to track the remaining
1436 scalar iterations. */
1437 opt_scalar_int_mode cmp_mode_iter
;
1438 FOR_EACH_MODE_IN_CLASS (cmp_mode_iter
, MODE_INT
)
1440 unsigned int cmp_bits
= GET_MODE_BITSIZE (cmp_mode_iter
.require ());
1441 if (cmp_bits
>= iv_precision
1442 && targetm
.scalar_mode_supported_p (cmp_mode_iter
.require ()))
1444 iv_type
= build_nonstandard_integer_type (cmp_bits
, true);
1452 /* Produce the rgroup controls. */
1453 for (auto const &mask
: LOOP_VINFO_MASKS (loop_vinfo
).mask_set
)
1455 vec_loop_masks
*masks
= &LOOP_VINFO_MASKS (loop_vinfo
);
1456 tree vectype
= mask
.first
;
1457 unsigned nvectors
= mask
.second
;
1459 /* The number of scalars per iteration and the number of vectors are
1460 both compile-time constants. */
1461 unsigned int nscalars_per_iter
1462 = exact_div (nvectors
* TYPE_VECTOR_SUBPARTS (vectype
),
1463 LOOP_VINFO_VECT_FACTOR (loop_vinfo
)).to_constant ();
1465 /* We index the rgroup_controls vector with nscalars_per_iter
1466 which we keep constant and instead have a varying nvectors,
1467 remembering the vector mask with the fewest nV. */
1468 if (masks
->rgc_vec
.length () < nscalars_per_iter
)
1469 masks
->rgc_vec
.safe_grow_cleared (nscalars_per_iter
, true);
1470 rgroup_controls
*rgm
= &(*masks
).rgc_vec
[nscalars_per_iter
- 1];
1472 if (!rgm
->type
|| rgm
->factor
> nvectors
)
1474 rgm
->type
= truth_type_for (vectype
);
1475 rgm
->compare_type
= NULL_TREE
;
1476 rgm
->max_nscalars_per_iter
= nscalars_per_iter
;
1477 rgm
->factor
= nvectors
;
1478 rgm
->bias_adjusted_ctrl
= NULL_TREE
;
1482 /* There is no fixed compare type we are going to use but we have to
1483 be able to get at one for each mask group. */
1484 unsigned int min_ni_width
1485 = wi::min_precision (vect_max_vf (loop_vinfo
), UNSIGNED
);
/* Find a vector comparison type for each mask group.  */
1488 for (auto &rgc
: LOOP_VINFO_MASKS (loop_vinfo
).rgc_vec
)
1490 tree mask_type
= rgc
.type
;
1494 /* For now vect_get_loop_mask only supports integer mode masks
1495 when we need to split it. */
1496 if (GET_MODE_CLASS (TYPE_MODE (mask_type
)) != MODE_INT
1497 || TYPE_PRECISION (TREE_TYPE (mask_type
)) != 1)
1503 /* If iv_type is usable as compare type use that - we can elide the
1504 saturation in that case. */
1505 if (TYPE_PRECISION (iv_type
) >= min_ni_width
)
1508 = build_vector_type (iv_type
, TYPE_VECTOR_SUBPARTS (mask_type
));
1509 if (expand_vec_cmp_expr_p (cmp_vectype
, mask_type
, LT_EXPR
))
1510 rgc
.compare_type
= cmp_vectype
;
/* Otherwise search the integer modes for a workable compare type.  */
1512 if (!rgc
.compare_type
)
1513 FOR_EACH_MODE_IN_CLASS (cmp_mode_iter
, MODE_INT
)
1515 unsigned int cmp_bits
= GET_MODE_BITSIZE (cmp_mode_iter
.require ());
1516 if (cmp_bits
>= min_ni_width
1517 && targetm
.scalar_mode_supported_p (cmp_mode_iter
.require ()))
1519 tree cmp_type
= build_nonstandard_integer_type (cmp_bits
, true);
1523 /* Check whether we can produce the mask with cmp_type. */
1525 = build_vector_type (cmp_type
, TYPE_VECTOR_SUBPARTS (mask_type
));
1526 if (expand_vec_cmp_expr_p (cmp_vectype
, mask_type
, LT_EXPR
))
1528 rgc
.compare_type
= cmp_vectype
;
/* Some group has no usable compare type: release the controls built
   above and bail out of AVX512-style masking.  */
1533 if (!rgc
.compare_type
)
1541 release_vec_loop_controls (&LOOP_VINFO_MASKS (loop_vinfo
).rgc_vec
);
/* Success: per-group compare types were recorded above, so the global
   compare type is deliberately left invalid.  */
1545 LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo
) = error_mark_node
;
1546 LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo
) = iv_type
;
1547 LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo
) = vect_partial_vectors_avx512
;
1551 /* Check whether we can use vector access with length based on precision
1552 comparison. So far, to keep it simple, we only allow the case that the
1553 precision of the target supported length is larger than the precision
1554 required by loop niters. */
1557 vect_verify_loop_lens (loop_vec_info loop_vinfo
)
1559 if (LOOP_VINFO_LENS (loop_vinfo
).is_empty ())
/* The target must support length-controlled loads and stores for the
   chosen vector mode.  */
1562 machine_mode len_load_mode
, len_store_mode
;
1563 if (!get_len_load_store_mode (loop_vinfo
->vector_mode
, true)
1564 .exists (&len_load_mode
))
1566 if (!get_len_load_store_mode (loop_vinfo
->vector_mode
, false)
1567 .exists (&len_store_mode
))
1570 signed char partial_load_bias
= internal_len_load_store_bias
1571 (IFN_LEN_LOAD
, len_load_mode
);
1573 signed char partial_store_bias
= internal_len_load_store_bias
1574 (IFN_LEN_STORE
, len_store_mode
);
1576 gcc_assert (partial_load_bias
== partial_store_bias
);
1578 if (partial_load_bias
== VECT_PARTIAL_BIAS_UNSUPPORTED
)
1581 /* If the backend requires a bias of -1 for LEN_LOAD, we must not emit
1582 len_loads with a length of zero. In order to avoid that we prohibit
1583 more than one loop length here. */
1584 if (partial_load_bias
== -1
1585 && LOOP_VINFO_LENS (loop_vinfo
).length () > 1)
1588 LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo
) = partial_load_bias
;
1590 unsigned int max_nitems_per_iter
= 1;
1592 rgroup_controls
*rgl
;
1593 /* Find the maximum number of items per iteration for every rgroup. */
1594 FOR_EACH_VEC_ELT (LOOP_VINFO_LENS (loop_vinfo
), i
, rgl
)
1596 unsigned nitems_per_iter
= rgl
->max_nscalars_per_iter
* rgl
->factor
;
1597 max_nitems_per_iter
= MAX (max_nitems_per_iter
, nitems_per_iter
);
1600 /* Work out how many bits we need to represent the length limit. */
1601 unsigned int min_ni_prec
1602 = vect_min_prec_for_max_niters (loop_vinfo
, max_nitems_per_iter
);
1604 /* Now use the maximum of below precisions for one suitable IV type:
1605 - the IV's natural precision
1606 - the precision needed to hold: the maximum number of scalar
1607 iterations multiplied by the scale factor (min_ni_prec above)
1608 - the Pmode precision
1610 If min_ni_prec is less than the precision of the current niters,
1611 we prefer to still use the niters type. Prefer to use Pmode and
1612 wider IV to avoid narrow conversions. */
1614 unsigned int ni_prec
1615 = TYPE_PRECISION (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo
)));
1616 min_ni_prec
= MAX (min_ni_prec
, ni_prec
);
1617 min_ni_prec
= MAX (min_ni_prec
, GET_MODE_BITSIZE (Pmode
));
1619 tree iv_type
= NULL_TREE
;
1620 opt_scalar_int_mode tmode_iter
;
1621 FOR_EACH_MODE_IN_CLASS (tmode_iter
, MODE_INT
)
1623 scalar_mode tmode
= tmode_iter
.require ();
1624 unsigned int tbits
= GET_MODE_BITSIZE (tmode
);
1626 /* ??? Do we really want to construct one IV whose precision exceeds
1628 if (tbits
> BITS_PER_WORD
)
1631 /* Find the first available standard integral type. */
1632 if (tbits
>= min_ni_prec
&& targetm
.scalar_mode_supported_p (tmode
))
1634 iv_type
= build_nonstandard_integer_type (tbits
, true);
1641 if (dump_enabled_p ())
1642 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1643 "can't vectorize with length-based partial vectors"
1644 " because there is no suitable iv type.\n");
/* Success: the IV type doubles as the compare type for lengths.  */
1648 LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo
) = iv_type
;
1649 LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo
) = iv_type
;
1650 LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo
) = vect_partial_vectors_len
;
1655 /* Calculate the cost of one scalar iteration of the loop. */
1657 vect_compute_single_scalar_iteration_cost (loop_vec_info loop_vinfo
)
1659 class loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
1660 basic_block
*bbs
= LOOP_VINFO_BBS (loop_vinfo
);
1661 int nbbs
= loop
->num_nodes
, factor
;
1662 int innerloop_iters
, i
;
1664 DUMP_VECT_SCOPE ("vect_compute_single_scalar_iteration_cost");
1666 /* Gather costs for statements in the scalar loop. */
1669 innerloop_iters
= 1;
1671 innerloop_iters
= LOOP_VINFO_INNER_LOOP_COST_FACTOR (loop_vinfo
);
1673 for (i
= 0; i
< nbbs
; i
++)
1675 gimple_stmt_iterator si
;
1676 basic_block bb
= bbs
[i
];
/* Statements in the inner loop are weighted by the inner-loop cost
   factor so one outer iteration accounts for all inner iterations.  */
1678 if (bb
->loop_father
== loop
->inner
)
1679 factor
= innerloop_iters
;
1683 for (si
= gsi_start_bb (bb
); !gsi_end_p (si
); gsi_next (&si
))
1685 gimple
*stmt
= gsi_stmt (si
);
1686 stmt_vec_info stmt_info
= loop_vinfo
->lookup_stmt (stmt
);
1688 if (!is_gimple_assign (stmt
) && !is_gimple_call (stmt
))
1691 /* Skip stmts that are not vectorized inside the loop. */
1692 stmt_vec_info vstmt_info
= vect_stmt_to_vectorize (stmt_info
);
1693 if (!STMT_VINFO_RELEVANT_P (vstmt_info
)
1694 && (!STMT_VINFO_LIVE_P (vstmt_info
)
1695 || !VECTORIZABLE_CYCLE_DEF
1696 (STMT_VINFO_DEF_TYPE (vstmt_info
))))
/* Classify the statement for the scalar cost model.  */
1699 vect_cost_for_stmt kind
;
1700 if (STMT_VINFO_DATA_REF (stmt_info
))
1702 if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info
)))
1705 kind
= scalar_store
;
1707 else if (vect_nop_conversion_p (stmt_info
))
1712 /* We are using vect_prologue here to avoid scaling twice
1713 by the inner loop factor. */
1714 record_stmt_cost (&LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo
),
1715 factor
, kind
, stmt_info
, 0, vect_prologue
);
1719 /* Now accumulate cost. */
1720 loop_vinfo
->scalar_costs
= init_cost (loop_vinfo
, true);
1721 add_stmt_costs (loop_vinfo
->scalar_costs
,
1722 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo
));
1723 loop_vinfo
->scalar_costs
->finish_cost (nullptr);
1726 /* Function vect_analyze_loop_form.
1728 Verify that certain CFG restrictions hold, including:
1729 - the loop has a pre-header
1730 - the loop has a single entry
1731 - nested loops can have only a single exit.
1732 - the loop exit condition is simple enough
1733 - the number of iterations can be analyzed, i.e., a countable loop. The
1734 niter could be analyzed under some assumptions. */
1737 vect_analyze_loop_form (class loop
*loop
, vect_loop_form_info
*info
)
1739 DUMP_VECT_SCOPE ("vect_analyze_loop_form");
/* Determine the main loop exit; with early-break vectorization the
   loop may have several exits.  */
1741 edge exit_e
= vec_init_loop_exit_info (loop
);
1743 return opt_result::failure_at (vect_location
,
1745 " could not determine main exit from"
1746 " loop with multiple exits.\n");
1747 info
->loop_exit
= exit_e
;
1748 if (dump_enabled_p ())
1749 dump_printf_loc (MSG_NOTE
, vect_location
,
1750 "using as main loop exit: %d -> %d [AUX: %p]\n",
1751 exit_e
->src
->index
, exit_e
->dest
->index
, exit_e
->aux
);
1753 /* Check if we have any control flow that doesn't leave the loop. */
1754 class loop
*v_loop
= loop
->inner
? loop
->inner
: loop
;
1755 basic_block
*bbs
= get_loop_body (v_loop
);
1756 for (unsigned i
= 0; i
< v_loop
->num_nodes
; i
++)
1757 if (EDGE_COUNT (bbs
[i
]->succs
) != 1
1758 && (EDGE_COUNT (bbs
[i
]->succs
) != 2
1759 || !loop_exits_from_bb_p (bbs
[i
]->loop_father
, bbs
[i
])))
1762 return opt_result::failure_at (vect_location
,
1764 " unsupported control flow in loop.\n");
1768 /* Different restrictions apply when we are considering an inner-most loop,
1769 vs. an outer (nested) loop.
1770 (FORNOW. May want to relax some of these restrictions in the future). */
1772 info
->inner_loop_cond
= NULL
;
1775 /* Inner-most loop. */
1777 if (empty_block_p (loop
->header
))
1778 return opt_result::failure_at (vect_location
,
1779 "not vectorized: empty loop.\n");
1783 class loop
*innerloop
= loop
->inner
;
1786 /* Nested loop. We currently require that the loop is doubly-nested,
1787 contains a single inner loop with a single exit to the block
1788 with the single exit condition in the outer loop.
1789 Vectorizable outer-loops look like this:
1801 The inner-loop also has the properties expected of inner-most loops
1802 as described above. */
1804 if ((loop
->inner
)->inner
|| (loop
->inner
)->next
)
1805 return opt_result::failure_at (vect_location
,
1807 " multiple nested loops.\n");
1809 entryedge
= loop_preheader_edge (innerloop
);
1810 if (entryedge
->src
!= loop
->header
1811 || !single_exit (innerloop
)
1812 || single_exit (innerloop
)->dest
!= EDGE_PRED (loop
->latch
, 0)->src
)
1813 return opt_result::failure_at (vect_location
,
1815 " unsupported outerloop form.\n");
1817 /* Analyze the inner-loop. */
1818 vect_loop_form_info inner
;
1819 opt_result res
= vect_analyze_loop_form (loop
->inner
, &inner
);
1822 if (dump_enabled_p ())
1823 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1824 "not vectorized: Bad inner loop.\n");
1828 /* Don't support analyzing niter under assumptions for inner
1830 if (!integer_onep (inner
.assumptions
))
1831 return opt_result::failure_at (vect_location
,
1832 "not vectorized: Bad inner loop.\n");
/* The inner-loop iteration count must not vary in the outer loop.  */
1834 if (!expr_invariant_in_loop_p (loop
, inner
.number_of_iterations
))
1835 return opt_result::failure_at (vect_location
,
1836 "not vectorized: inner-loop count not"
1839 if (dump_enabled_p ())
1840 dump_printf_loc (MSG_NOTE
, vect_location
,
1841 "Considering outer-loop vectorization.\n");
1842 info
->inner_loop_cond
= inner
.conds
[0];
1845 if (EDGE_COUNT (loop
->header
->preds
) != 2)
1846 return opt_result::failure_at (vect_location
,
1848 " too many incoming edges.\n");
1850 /* We assume that the latch is empty. */
1851 if (!empty_block_p (loop
->latch
)
1852 || !gimple_seq_empty_p (phi_nodes (loop
->latch
)))
1853 return opt_result::failure_at (vect_location
,
1854 "not vectorized: latch block not empty.\n");
1856 /* Make sure there is no abnormal exit. */
1857 auto_vec
<edge
> exits
= get_loop_exit_edges (loop
);
1858 for (edge e
: exits
)
1860 if (e
->flags
& EDGE_ABNORMAL
)
1861 return opt_result::failure_at (vect_location
,
1863 " abnormal loop exit edge.\n");
/* Analyze the iteration count of the loop relative to the main exit.  */
1867 = vect_get_loop_niters (loop
, exit_e
, &info
->assumptions
,
1868 &info
->number_of_iterations
,
1869 &info
->number_of_iterationsm1
);
1870 if (info
->conds
.is_empty ())
1871 return opt_result::failure_at
1873 "not vectorized: complicated exit condition.\n");
1875 /* Determine what the primary and alternate exit conds are. */
1876 for (unsigned i
= 0; i
< info
->conds
.length (); i
++)
1878 gcond
*cond
= info
->conds
[i
];
1879 if (exit_e
->src
== gimple_bb (cond
))
1880 std::swap (info
->conds
[0], info
->conds
[i
]);
1883 if (integer_zerop (info
->assumptions
)
1884 || !info
->number_of_iterations
1885 || chrec_contains_undetermined (info
->number_of_iterations
))
1886 return opt_result::failure_at
1888 "not vectorized: number of iterations cannot be computed.\n");
1890 if (integer_zerop (info
->number_of_iterations
))
1891 return opt_result::failure_at
1893 "not vectorized: number of iterations = 0.\n");
1895 if (!(tree_fits_shwi_p (info
->number_of_iterations
)
1896 && tree_to_shwi (info
->number_of_iterations
) > 0))
1898 if (dump_enabled_p ())
1900 dump_printf_loc (MSG_NOTE
, vect_location
,
1901 "Symbolic number of iterations is ");
1902 dump_generic_expr (MSG_NOTE
, TDF_DETAILS
, info
->number_of_iterations
);
1903 dump_printf (MSG_NOTE
, "\n");
1907 return opt_result::success ();
1910 /* Create a loop_vec_info for LOOP with SHARED and the
1911 vect_analyze_loop_form result. */
1914 vect_create_loop_vinfo (class loop
*loop
, vec_info_shared
*shared
,
1915 const vect_loop_form_info
*info
,
1916 loop_vec_info main_loop_info
)
1918 loop_vec_info loop_vinfo
= new _loop_vec_info (loop
, shared
);
/* Transfer the niter analysis results from INFO.  */
1919 LOOP_VINFO_NITERSM1 (loop_vinfo
) = info
->number_of_iterationsm1
;
1920 LOOP_VINFO_NITERS (loop_vinfo
) = info
->number_of_iterations
;
1921 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo
) = info
->number_of_iterations
;
1922 LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo
) = main_loop_info
;
1923 /* Also record the assumptions for versioning. */
1924 if (!integer_onep (info
->assumptions
) && !main_loop_info
)
1925 LOOP_VINFO_NITERS_ASSUMPTIONS (loop_vinfo
) = info
->assumptions
;
1927 for (gcond
*cond
: info
->conds
)
1929 stmt_vec_info loop_cond_info
= loop_vinfo
->lookup_stmt (cond
);
1930 STMT_VINFO_TYPE (loop_cond_info
) = loop_exit_ctrl_vec_info_type
;
1931 /* Mark the statement as a condition. */
1932 STMT_VINFO_DEF_TYPE (loop_cond_info
) = vect_condition_def
;
/* conds[0] is the main IV exit condition; any remaining conds are
   alternate (early-break) exits.  */
1935 for (unsigned i
= 1; i
< info
->conds
.length (); i
++)
1936 LOOP_VINFO_LOOP_CONDS (loop_vinfo
).safe_push (info
->conds
[i
]);
1937 LOOP_VINFO_LOOP_IV_COND (loop_vinfo
) = info
->conds
[0];
1939 LOOP_VINFO_IV_EXIT (loop_vinfo
) = info
->loop_exit
;
1941 /* Check to see if we're vectorizing multiple exits. */
1942 LOOP_VINFO_EARLY_BREAKS (loop_vinfo
)
1943 = !LOOP_VINFO_LOOP_CONDS (loop_vinfo
).is_empty ();
1945 if (info
->inner_loop_cond
)
1947 stmt_vec_info inner_loop_cond_info
1948 = loop_vinfo
->lookup_stmt (info
->inner_loop_cond
);
1949 STMT_VINFO_TYPE (inner_loop_cond_info
) = loop_exit_ctrl_vec_info_type
;
1950 /* If we have an estimate on the number of iterations of the inner
1951 loop use that to limit the scale for costing, otherwise use
1952 --param vect-inner-loop-cost-factor literally. */
1954 if (estimated_stmt_executions (loop
->inner
, &nit
))
1955 LOOP_VINFO_INNER_LOOP_COST_FACTOR (loop_vinfo
)
1956 = wi::smin (nit
, param_vect_inner_loop_cost_factor
).to_uhwi ();
1964 /* Scan the loop stmts and dependent on whether there are any (non-)SLP
1965 statements update the vectorization factor. */
1968 vect_update_vf_for_slp (loop_vec_info loop_vinfo
)
1970 class loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
1971 basic_block
*bbs
= LOOP_VINFO_BBS (loop_vinfo
);
1972 int nbbs
= loop
->num_nodes
;
1973 poly_uint64 vectorization_factor
;
1976 DUMP_VECT_SCOPE ("vect_update_vf_for_slp");
1978 vectorization_factor
= LOOP_VINFO_VECT_FACTOR (loop_vinfo
);
1979 gcc_assert (known_ne (vectorization_factor
, 0U));
1981 /* If all the stmts in the loop can be SLPed, we perform only SLP, and
1982 vectorization factor of the loop is the unrolling factor required by
1983 the SLP instances. If that unrolling factor is 1, we say, that we
1984 perform pure SLP on loop - cross iteration parallelism is not
1986 bool only_slp_in_loop
= true;
/* Scan both phis and ordinary statements for anything that is relevant
   (or a vectorizable cycle) yet not pure-SLP.  */
1987 for (i
= 0; i
< nbbs
; i
++)
1989 basic_block bb
= bbs
[i
];
1990 for (gphi_iterator si
= gsi_start_phis (bb
); !gsi_end_p (si
);
1993 stmt_vec_info stmt_info
= loop_vinfo
->lookup_stmt (si
.phi ());
1996 if ((STMT_VINFO_RELEVANT_P (stmt_info
)
1997 || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info
)))
1998 && !PURE_SLP_STMT (stmt_info
))
1999 /* STMT needs both SLP and loop-based vectorization. */
2000 only_slp_in_loop
= false;
2002 for (gimple_stmt_iterator si
= gsi_start_bb (bb
); !gsi_end_p (si
);
2005 if (is_gimple_debug (gsi_stmt (si
)))
2007 stmt_vec_info stmt_info
= loop_vinfo
->lookup_stmt (gsi_stmt (si
));
2008 stmt_info
= vect_stmt_to_vectorize (stmt_info
);
2009 if ((STMT_VINFO_RELEVANT_P (stmt_info
)
2010 || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info
)))
2011 && !PURE_SLP_STMT (stmt_info
))
2012 /* STMT needs both SLP and loop-based vectorization. */
2013 only_slp_in_loop
= false;
2017 if (only_slp_in_loop
)
2019 if (dump_enabled_p ())
2020 dump_printf_loc (MSG_NOTE
, vect_location
,
2021 "Loop contains only SLP stmts\n");
2022 vectorization_factor
= LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo
);
2026 if (dump_enabled_p ())
2027 dump_printf_loc (MSG_NOTE
, vect_location
,
2028 "Loop contains SLP and non-SLP stmts\n");
2029 /* Both the vectorization factor and unroll factor have the form
2030 GET_MODE_SIZE (loop_vinfo->vector_mode) * X for some rational X,
2031 so they must have a common multiple. */
2032 vectorization_factor
2033 = force_common_multiple (vectorization_factor
,
2034 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo
));
2037 LOOP_VINFO_VECT_FACTOR (loop_vinfo
) = vectorization_factor
;
2038 if (dump_enabled_p ())
2040 dump_printf_loc (MSG_NOTE
, vect_location
,
2041 "Updating vectorization factor to ");
2042 dump_dec (MSG_NOTE
, vectorization_factor
);
2043 dump_printf (MSG_NOTE
, ".\n");
2047 /* Return true if STMT_INFO describes a double reduction phi and if
2048 the other phi in the reduction is also relevant for vectorization.
2049 This rejects cases such as:
2052 x_1 = PHI <x_3(outer2), ...>;
2060 x_3 = PHI <x_2(inner)>;
2062 if nothing in x_2 or elsewhere makes x_1 relevant. */
2065 vect_active_double_reduction_p (stmt_vec_info stmt_info
)
2067 if (STMT_VINFO_DEF_TYPE (stmt_info
) != vect_double_reduction_def
)
2070 return STMT_VINFO_RELEVANT_P (STMT_VINFO_REDUC_DEF (stmt_info
));
2073 /* Function vect_analyze_loop_operations.
2075 Scan the loop stmts and make sure they are all vectorizable. */
2078 vect_analyze_loop_operations (loop_vec_info loop_vinfo
)
2080 class loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
2081 basic_block
*bbs
= LOOP_VINFO_BBS (loop_vinfo
);
2082 int nbbs
= loop
->num_nodes
;
2084 stmt_vec_info stmt_info
;
2085 bool need_to_vectorize
= false;
2088 DUMP_VECT_SCOPE ("vect_analyze_loop_operations");
2090 auto_vec
<stmt_info_for_cost
> cost_vec
;
2092 for (i
= 0; i
< nbbs
; i
++)
2094 basic_block bb
= bbs
[i
];
2096 for (gphi_iterator si
= gsi_start_phis (bb
); !gsi_end_p (si
);
2099 gphi
*phi
= si
.phi ();
2102 stmt_info
= loop_vinfo
->lookup_stmt (phi
);
2103 if (dump_enabled_p ())
2104 dump_printf_loc (MSG_NOTE
, vect_location
, "examining phi: %G",
2106 if (virtual_operand_p (gimple_phi_result (phi
)))
2109 /* Inner-loop loop-closed exit phi in outer-loop vectorization
2110 (i.e., a phi in the tail of the outer-loop). */
2111 if (! is_loop_header_bb_p (bb
))
2113 /* FORNOW: we currently don't support the case that these phis
2114 are not used in the outerloop (unless it is double reduction,
2115 i.e., this phi is vect_reduction_def), cause this case
2116 requires to actually do something here. */
2117 if (STMT_VINFO_LIVE_P (stmt_info
)
2118 && !vect_active_double_reduction_p (stmt_info
))
2119 return opt_result::failure_at (phi
,
2120 "Unsupported loop-closed phi"
2121 " in outer-loop.\n");
2123 /* If PHI is used in the outer loop, we check that its operand
2124 is defined in the inner loop. */
2125 if (STMT_VINFO_RELEVANT_P (stmt_info
))
2129 if (gimple_phi_num_args (phi
) != 1)
2130 return opt_result::failure_at (phi
, "unsupported phi");
2132 phi_op
= PHI_ARG_DEF (phi
, 0);
2133 stmt_vec_info op_def_info
= loop_vinfo
->lookup_def (phi_op
);
2135 return opt_result::failure_at (phi
, "unsupported phi\n");
2137 if (STMT_VINFO_RELEVANT (op_def_info
) != vect_used_in_outer
2138 && (STMT_VINFO_RELEVANT (op_def_info
)
2139 != vect_used_in_outer_by_reduction
))
2140 return opt_result::failure_at (phi
, "unsupported phi\n");
2142 if ((STMT_VINFO_DEF_TYPE (stmt_info
) == vect_internal_def
2143 || (STMT_VINFO_DEF_TYPE (stmt_info
)
2144 == vect_double_reduction_def
))
2145 && !vectorizable_lc_phi (loop_vinfo
,
2146 stmt_info
, NULL
, NULL
))
2147 return opt_result::failure_at (phi
, "unsupported phi\n");
2153 gcc_assert (stmt_info
);
2155 if ((STMT_VINFO_RELEVANT (stmt_info
) == vect_used_in_scope
2156 || STMT_VINFO_LIVE_P (stmt_info
))
2157 && STMT_VINFO_DEF_TYPE (stmt_info
) != vect_induction_def
2158 && STMT_VINFO_DEF_TYPE (stmt_info
) != vect_first_order_recurrence
)
2159 /* A scalar-dependence cycle that we don't support. */
2160 return opt_result::failure_at (phi
,
2162 " scalar dependence cycle.\n");
2164 if (STMT_VINFO_RELEVANT_P (stmt_info
))
2166 need_to_vectorize
= true;
2167 if (STMT_VINFO_DEF_TYPE (stmt_info
) == vect_induction_def
2168 && ! PURE_SLP_STMT (stmt_info
))
2169 ok
= vectorizable_induction (loop_vinfo
,
2170 stmt_info
, NULL
, NULL
,
2172 else if ((STMT_VINFO_DEF_TYPE (stmt_info
) == vect_reduction_def
2173 || (STMT_VINFO_DEF_TYPE (stmt_info
)
2174 == vect_double_reduction_def
)
2175 || STMT_VINFO_DEF_TYPE (stmt_info
) == vect_nested_cycle
)
2176 && ! PURE_SLP_STMT (stmt_info
))
2177 ok
= vectorizable_reduction (loop_vinfo
,
2178 stmt_info
, NULL
, NULL
, &cost_vec
);
2179 else if ((STMT_VINFO_DEF_TYPE (stmt_info
)
2180 == vect_first_order_recurrence
)
2181 && ! PURE_SLP_STMT (stmt_info
))
2182 ok
= vectorizable_recurr (loop_vinfo
, stmt_info
, NULL
, NULL
,
2186 /* SLP PHIs are tested by vect_slp_analyze_node_operations. */
2188 && STMT_VINFO_LIVE_P (stmt_info
)
2189 && !PURE_SLP_STMT (stmt_info
))
2190 ok
= vectorizable_live_operation (loop_vinfo
, stmt_info
, NULL
, NULL
,
2191 -1, false, &cost_vec
);
2194 return opt_result::failure_at (phi
,
2195 "not vectorized: relevant phi not "
2197 static_cast <gimple
*> (phi
));
2200 for (gimple_stmt_iterator si
= gsi_start_bb (bb
); !gsi_end_p (si
);
2203 gimple
*stmt
= gsi_stmt (si
);
2204 if (!gimple_clobber_p (stmt
)
2205 && !is_gimple_debug (stmt
))
2208 = vect_analyze_stmt (loop_vinfo
,
2209 loop_vinfo
->lookup_stmt (stmt
),
2211 NULL
, NULL
, &cost_vec
);
2218 add_stmt_costs (loop_vinfo
->vector_costs
, &cost_vec
);
2220 /* All operations in the loop are either irrelevant (deal with loop
2221 control, or dead), or only used outside the loop and can be moved
2222 out of the loop (e.g. invariants, inductions). The loop can be
2223 optimized away by scalar optimizations. We're better off not
2224 touching this loop. */
2225 if (!need_to_vectorize
)
2227 if (dump_enabled_p ())
2228 dump_printf_loc (MSG_NOTE
, vect_location
,
2229 "All the computation can be taken out of the loop.\n");
2230 return opt_result::failure_at
2232 "not vectorized: redundant loop. no profit to vectorize.\n");
2235 return opt_result::success ();
/* NOTE(review): this whole chunk is a garbled extraction — the original
   file's line numbers (2238, 2243, ...) are fused into the code text and
   several source lines were dropped (here: the `static bool` return-type
   line, the `else` before orig line 2251, and the `return true;` /
   `return false;` lines after orig line 2253).  Restore from the upstream
   file (presumably gcc/tree-vect-loop.cc) before attempting to compile —
   TODO confirm against upstream.  */
/* Predicate: do we KNOW the loop's iteration count is smaller than the
   (assumed) vectorization factor?  Uses the exact niter count when known,
   otherwise falls back to the conservative max_stmt_executions_int bound. */
2238 /* Return true if we know that the iteration count is smaller than the
2239 vectorization factor. Return false if it isn't, or if we can't be sure
2243 vect_known_niters_smaller_than_vf (loop_vec_info loop_vinfo
)
/* VF used for costing purposes (a constant lower bound for variable VFs).  */
2245 unsigned int assumed_vf
= vect_vf_for_cost (loop_vinfo
);
2247 HOST_WIDE_INT max_niter
;
/* Prefer the exact iteration count when the niter analysis proved it.  */
2248 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo
))
2249 max_niter
= LOOP_VINFO_INT_NITERS (loop_vinfo
);
/* NOTE(review): an `else` (orig line 2250) was dropped by the extraction
   before this fallback to the statically-derived upper bound.  */
2251 max_niter
= max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo
));
/* max_stmt_executions_int returns -1 when no bound is known; only a known
   bound strictly below the VF lets us answer "yes".  */
2253 if (max_niter
!= -1 && (unsigned HOST_WIDE_INT
) max_niter
< assumed_vf
)
/* NOTE(review): garbled extraction — original line numbers are fused into
   the text and interior lines were dropped (e.g. the early `return 0;`
   bodies after the dump_printf_loc calls, braces, and the lines between
   orig 2300 and 2304).  Do not edit logic here without restoring the block
   from the upstream file first — TODO confirm against upstream.  */
/* Cost-model driver for one analyzed loop: applies a sequence of
   rejection checks (iteration count vs. VF, single-iteration loops,
   peeling under the very-cheap model, profitability thresholds) and,
   from what is visible, computes the runtime threshold stored in
   LOOP_VINFO_COST_MODEL_THRESHOLD.  */
2259 /* Analyze the cost of the loop described by LOOP_VINFO. Decide if it
2260 is worthwhile to vectorize. Return 1 if definitely yes, 0 if
2261 definitely no, or -1 if it's worth retrying. */
2264 vect_analyze_loop_costing (loop_vec_info loop_vinfo
,
2265 unsigned *suggested_unroll_factor
)
2267 class loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
2268 unsigned int assumed_vf
= vect_vf_for_cost (loop_vinfo
);
/* Check 1: without partial-vector support, a loop known to iterate fewer
   than VF times cannot execute even one full vector iteration.  */
2270 /* Only loops that can handle partially-populated vectors can have iteration
2271 counts less than the vectorization factor. */
2272 if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo
)
2273 && vect_known_niters_smaller_than_vf (loop_vinfo
))
2275 if (dump_enabled_p ())
2276 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2277 "not vectorized: iteration count smaller than "
2278 "vectorization factor.\n");
/* Check 2 (known niters): compute how many scalar iterations actually
   reach this loop.  For an epilogue loop that means subtracting the
   iterations consumed by the main vector loop and any prologue peeling.  */
2282 /* If we know the number of iterations we can do better, for the
2283 epilogue we can also decide whether the main loop leaves us
2284 with enough iterations, prefering a smaller vector epilog then
2285 also possibly used for the case we skip the vector loop. */
2286 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo
))
2288 widest_int scalar_niters
2289 = wi::to_widest (LOOP_VINFO_NITERSM1 (loop_vinfo
)) + 1;
2290 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo
))
2292 loop_vec_info orig_loop_vinfo
2293 = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo
);
/* NOTE(review): the declaration receiving this initializer (orig line
   2294, presumably the main loop's constant VF lower bound) was dropped.  */
2295 = constant_lower_bound (LOOP_VINFO_VECT_FACTOR (orig_loop_vinfo
));
2296 int prolog_peeling
= 0;
2297 if (!vect_use_loop_mask_for_alignment_p (loop_vinfo
))
2298 prolog_peeling
= LOOP_VINFO_PEELING_FOR_ALIGNMENT (orig_loop_vinfo
);
2299 if (prolog_peeling
>= 0
2300 && known_eq (LOOP_VINFO_VECT_FACTOR (orig_loop_vinfo
),
/* NOTE(review): lines between orig 2300 and 2304 were dropped, including
   the declaration of the `gap` variable used just below.  */
2304 = LOOP_VINFO_PEELING_FOR_GAPS (orig_loop_vinfo
) ? 1 : 0;
2305 scalar_niters
= ((scalar_niters
- gap
- prolog_peeling
)
/* Check 3: a loop that performs at most one scalar iteration (after
   accounting for gap peeling) is never worth vectorizing.  */
2309 /* Reject vectorizing for a single scalar iteration, even if
2310 we could in principle implement that using partial vectors. */
2311 unsigned peeling_gap
= LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo
);
2312 if (scalar_niters
<= peeling_gap
+ 1)
2314 if (dump_enabled_p ())
2315 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2316 "not vectorized: loop only has a single "
2317 "scalar iteration.\n");
/* Check 4: full-vector loops must execute at least one full vector's
   worth of scalar iterations (strictly more when peeling for gaps).  */
2321 if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo
))
2323 /* Check that the loop processes at least one full vector. */
2324 poly_uint64 vf
= LOOP_VINFO_VECT_FACTOR (loop_vinfo
);
2325 if (known_lt (scalar_niters
, vf
))
2327 if (dump_enabled_p ())
2328 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2329 "loop does not have enough iterations "
2330 "to support vectorization.\n");
2334 /* If we need to peel an extra epilogue iteration to handle data
2335 accesses with gaps, check that there are enough scalar iterations
2338 The check above is redundant with this one when peeling for gaps,
2339 but the distinction is useful for diagnostics. */
2340 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo
)
2341 && known_le (scalar_niters
, vf
))
2343 if (dump_enabled_p ())
2344 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2345 "loop does not have enough iterations "
2346 "to support peeling for gaps.\n");
/* Check 5: the very-cheap cost model refuses any form of peeling, since
   peeling keeps a scalar copy of the loop around.  */
2352 /* If using the "very cheap" model. reject cases in which we'd keep
2353 a copy of the scalar code (even if we might be able to vectorize it). */
2354 if (loop_cost_model (loop
) == VECT_COST_MODEL_VERY_CHEAP
2355 && (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo
)
2356 || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo
)
2357 || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo
)))
2359 if (dump_enabled_p ())
2360 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2361 "some scalar iterations would need to be peeled\n");
/* Target cost-model query: fills in the minimum iteration counts at which
   the vector loop beats the scalar loop (runtime and static estimates),
   and optionally a suggested unroll factor.  */
2365 int min_profitable_iters
, min_profitable_estimate
;
2366 vect_estimate_min_profitable_iters (loop_vinfo
, &min_profitable_iters
,
2367 &min_profitable_estimate
,
2368 suggested_unroll_factor
);
/* A negative answer means the vector version can never be profitable.  */
2370 if (min_profitable_iters
< 0)
2372 if (dump_enabled_p ())
2373 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2374 "not vectorized: vectorization not profitable.\n");
2375 if (dump_enabled_p ())
2376 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2377 "not vectorized: vector version will never be "
/* Runtime threshold: the larger of the user's --param min-vect-loop-bound
   and the cost model's break-even point; recorded for the versioning/guard
   code generated later.  */
2382 int min_scalar_loop_bound
= (param_min_vect_loop_bound
2385 /* Use the cost model only if it is more conservative than user specified
2387 unsigned int th
= (unsigned) MAX (min_scalar_loop_bound
,
2388 min_profitable_iters
);
2390 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo
) = th
;
2392 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo
)
2393 && LOOP_VINFO_INT_NITERS (loop_vinfo
) < th
)
2395 if (dump_enabled_p ())
2396 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2397 "not vectorized: vectorization not profitable.\n");
2398 if (dump_enabled_p ())
2399 dump_printf_loc (MSG_NOTE
, vect_location
,
2400 "not vectorized: iteration count smaller than user "
2401 "specified loop bound parameter or minimum profitable "
2402 "iterations (whichever is more conservative).\n");
/* If no runtime scalar-vs-vector check will be emitted, the static
   estimate need not include that check's cost.  */
2406 /* The static profitablity threshold min_profitable_estimate includes
2407 the cost of having to check at runtime whether the scalar loop
2408 should be used instead. If it turns out that we don't need or want
2409 such a check, the threshold we should use for the static estimate
2410 is simply the point at which the vector loop becomes more profitable
2411 than the scalar loop. */
2412 if (min_profitable_estimate
> min_profitable_iters
2413 && !LOOP_REQUIRES_VERSIONING (loop_vinfo
)
2414 && !LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo
)
2415 && !LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo
)
2416 && !vect_apply_runtime_profitability_check_p (loop_vinfo
))
2418 if (dump_enabled_p ())
2419 dump_printf_loc (MSG_NOTE
, vect_location
, "no need for a runtime"
2420 " choice between the scalar and vector loops\n");
2421 min_profitable_estimate
= min_profitable_iters
;
/* Very-cheap model again: bail out when even the break-even point lies
   beyond a single vector iteration.  */
2424 /* If the vector loop needs multiple iterations to be beneficial then
2425 things are probably too close to call, and the conservative thing
2426 would be to stick with the scalar code. */
2427 if (loop_cost_model (loop
) == VECT_COST_MODEL_VERY_CHEAP
2428 && min_profitable_estimate
> (int) vect_vf_for_cost (loop_vinfo
))
2430 if (dump_enabled_p ())
2431 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2432 "one iteration of the vector loop would be"
2433 " more expensive than the equivalent number of"
2434 " iterations of the scalar loop\n");
/* Final check: compare the ESTIMATED (profile-based) iteration count
   against the chosen thresholds.  */
2438 HOST_WIDE_INT estimated_niter
;
2440 /* If we are vectorizing an epilogue then we know the maximum number of
2441 scalar iterations it will cover is at least one lower than the
2442 vectorization factor of the main loop. */
2443 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo
))
/* NOTE(review): the lvalue receiving this (orig line 2444, presumably
   `estimated_niter`) was dropped by the extraction.  */
2445 = vect_vf_for_cost (LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo
)) - 1;
2448 estimated_niter
= estimated_stmt_executions_int (loop
);
2449 if (estimated_niter
== -1)
2450 estimated_niter
= likely_max_stmt_executions_int (loop
);
2452 if (estimated_niter
!= -1
2453 && ((unsigned HOST_WIDE_INT
) estimated_niter
2454 < MAX (th
, (unsigned) min_profitable_estimate
)))
2456 if (dump_enabled_p ())
2457 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2458 "not vectorized: estimated iteration count too "
2460 if (dump_enabled_p ())
2461 dump_printf_loc (MSG_NOTE
, vect_location
,
2462 "not vectorized: estimated iteration count smaller "
2463 "than specified loop bound parameter or minimum "
2464 "profitable iterations (whichever is more "
2465 "conservative).\n");
/* NOTE(review): garbled extraction — original line numbers fused into the
   text; dropped lines include the function's return-type line, braces,
   the `continue;` bodies (orig 2484–2485, 2487–2489), and the condition
   heads paired with the `||` fragments at orig 2511 and 2522.  Restore
   from upstream before editing — TODO confirm.  */
/* Walk every statement of every basic block in LOOP, collecting data
   references into *DATAREFS via vect_find_stmt_data_reference.  For calls
   inside a `safelen` loop it additionally inspects simd-clone candidates
   (including IFN_MASK_CALL wrappers) — presumably so that calls whose
   arguments/lhs carry no memory references can be tolerated; verify
   against the dropped lines.  Fails fatally when the dataref count
   exceeds --param loop-max-datarefs-for-datadeps.  */
2473 vect_get_datarefs_in_loop (loop_p loop
, basic_block
*bbs
,
2474 vec
<data_reference_p
> *datarefs
,
2475 unsigned int *n_stmts
)
/* Iterate over all blocks of the loop body.  */
2478 for (unsigned i
= 0; i
< loop
->num_nodes
; i
++)
2479 for (gimple_stmt_iterator gsi
= gsi_start_bb (bbs
[i
]);
2480 !gsi_end_p (gsi
); gsi_next (&gsi
))
2482 gimple
*stmt
= gsi_stmt (gsi
);
/* Debug statements carry no data references.  */
2483 if (is_gimple_debug (stmt
))
2486 opt_result res
= vect_find_stmt_data_reference (loop
, stmt
, datarefs
,
/* Special-casing for calls in loops annotated with `safelen` (e.g. from
   #pragma omp simd): look through IFN_MASK_CALL to the real fndecl.  */
2490 if (is_gimple_call (stmt
) && loop
->safelen
)
2492 tree fndecl
= gimple_call_fndecl (stmt
), op
;
2493 if (fndecl
== NULL_TREE
2494 && gimple_call_internal_p (stmt
, IFN_MASK_CALL
))
/* For IFN_MASK_CALL the callee is argument 0, wrapped in an ADDR_EXPR.  */
2496 fndecl
= gimple_call_arg (stmt
, 0);
2497 gcc_checking_assert (TREE_CODE (fndecl
) == ADDR_EXPR
);
2498 fndecl
= TREE_OPERAND (fndecl
, 0);
2499 gcc_checking_assert (TREE_CODE (fndecl
) == FUNCTION_DECL
);
/* Only calls to functions that have simd clones get the lenient path.  */
2501 if (fndecl
!= NULL_TREE
)
2503 cgraph_node
*node
= cgraph_node::get (fndecl
);
2504 if (node
!= NULL
&& node
->simd_clones
!= NULL
)
/* Scan all call arguments (and below, the lhs) for memory references.  */
2506 unsigned int j
, n
= gimple_call_num_args (stmt
);
2507 for (j
= 0; j
< n
; j
++)
2509 op
= gimple_call_arg (stmt
, j
);
/* NOTE(review): the first operand of these `||` conditions (orig 2510
   and 2519–2521) was dropped by the extraction.  */
2511 || (REFERENCE_CLASS_P (op
)
2512 && get_base_address (op
)))
2515 op
= gimple_call_lhs (stmt
);
2516 /* Ignore #pragma omp declare simd functions
2517 if they don't have data references in the
2518 call stmt itself. */
2522 || (REFERENCE_CLASS_P (op
)
2523 && get_base_address (op
)))))
/* Hard limit so dependence analysis does not blow up later.  */
2530 /* If dependence analysis will give up due to the limit on the
2531 number of datarefs stop here and fail fatally. */
2532 if (datarefs
->length ()
2533 > (unsigned)param_loop_max_datarefs_for_datadeps
)
2534 return opt_result::failure_at (stmt
, "exceeded param "
2535 "loop-max-datarefs-for-datadeps\n");
2537 return opt_result::success ();
/* NOTE(review): garbled extraction — original line numbers fused into the
   text; dropped lines include the function signature's return type, the
   inner `while`/`for` header advancing `vinfo` to `next` (orig 2572–2573
   and around 2604), the `else` before orig 2586, and closing braces.
   Restore from upstream before editing — TODO confirm.  */
/* For each grouped data reference whose group was marked SLP-only
   (STMT_VINFO_SLP_VECT_ONLY) but whose statements did NOT end up pure-SLP,
   dissolve the interleaving group: every member becomes its own group of
   size 1, with gap/strided-ness and alignment info fixed up per element.  */
2540 /* Look for SLP-only access groups and turn each individual access into its own
2543 vect_dissolve_slp_only_groups (loop_vec_info loop_vinfo
)
2546 struct data_reference
*dr
;
2548 DUMP_VECT_SCOPE ("vect_dissolve_slp_only_groups");
2550 vec
<data_reference_p
> datarefs
= LOOP_VINFO_DATAREFS (loop_vinfo
);
2551 FOR_EACH_VEC_ELT (datarefs
, i
, dr
)
2553 gcc_assert (DR_REF (dr
));
/* Map the dataref's statement to its (pattern-adjusted) stmt_vec_info.  */
2554 stmt_vec_info stmt_info
2555 = vect_stmt_to_vectorize (loop_vinfo
->lookup_stmt (DR_STMT (dr
)));
2557 /* Check if the load is a part of an interleaving chain. */
2558 if (STMT_VINFO_GROUPED_ACCESS (stmt_info
))
2560 stmt_vec_info first_element
= DR_GROUP_FIRST_ELEMENT (stmt_info
);
2561 dr_vec_info
*dr_info
= STMT_VINFO_DR_INFO (first_element
);
2562 unsigned int group_size
= DR_GROUP_SIZE (first_element
);
/* Only dissolve groups that were usable exclusively under SLP and for
   which this statement is not being SLP-vectorized.  */
2564 /* Check if SLP-only groups. */
2565 if (!STMT_SLP_TYPE (stmt_info
)
2566 && STMT_VINFO_SLP_VECT_ONLY (first_element
))
2568 /* Dissolve the group. */
2569 STMT_VINFO_SLP_VECT_ONLY (first_element
) = false;
/* Walk the group chain, turning every member into a singleton group.
   NOTE(review): the loop header advancing `vinfo` (orig 2572–2573) was
   dropped; `next` below is presumably the iteration variable.  */
2571 stmt_vec_info vinfo
= first_element
;
2574 stmt_vec_info next
= DR_GROUP_NEXT_ELEMENT (vinfo
);
2575 DR_GROUP_FIRST_ELEMENT (vinfo
) = vinfo
;
2576 DR_GROUP_NEXT_ELEMENT (vinfo
) = NULL
;
2577 DR_GROUP_SIZE (vinfo
) = 1;
/* A dissolved member either becomes strided (strided original group, or
   a store — stores with gaps are unsupported) or keeps a gap of
   group_size - 1 to step over the other former members.  */
2578 if (STMT_VINFO_STRIDED_P (first_element
)
2579 /* We cannot handle stores with gaps. */
2580 || DR_IS_WRITE (dr_info
->dr
))
2582 STMT_VINFO_STRIDED_P (vinfo
) = true;
2583 DR_GROUP_GAP (vinfo
) = 0;
/* NOTE(review): the `else` branch head (orig 2584–2585) was dropped.  */
2586 DR_GROUP_GAP (vinfo
) = group_size
- 1;
2587 /* Duplicate and adjust alignment info, it needs to
2588 be present on each group leader, see dr_misalignment. */
2589 if (vinfo
!= first_element
)
2591 dr_vec_info
*dr_info2
= STMT_VINFO_DR_INFO (vinfo
);
/* Copy the old leader's target alignment, then re-derive this member's
   misalignment from the byte offset between the two DR_INITs.  */
2592 dr_info2
->target_alignment
= dr_info
->target_alignment
;
2593 int misalignment
= dr_info
->misalignment
;
2594 if (misalignment
!= DR_MISALIGNMENT_UNKNOWN
)
/* NOTE(review): the declaration receiving this difference (orig
   2595–2596, presumably `diff`) was dropped.  */
2597 = (TREE_INT_CST_LOW (DR_INIT (dr_info2
->dr
))
2598 - TREE_INT_CST_LOW (DR_INIT (dr_info
->dr
)));
2599 unsigned HOST_WIDE_INT align_c
2600 = dr_info
->target_alignment
.to_constant ();
2601 misalignment
= (misalignment
+ diff
) % align_c
;
2603 dr_info2
->misalignment
= misalignment
;
/* NOTE(review): garbled extraction — original line numbers fused into the
   text; dropped lines include the function's return-type line, the `else`
   between orig 2672 and 2674, and braces.  Restore from upstream before
   editing — TODO confirm.  */
/* Decide, for a loop that may leave leftover scalar iterations, between
   (1) partial vectors in the main loop, (2a) full vectors + a same-VF
   partial-vector epilogue, or (2b) full vectors + lower-VF epilogues,
   recording the choice in the LOOP_VINFO_*_PARTIAL_VECTORS_P and
   LOOP_VINFO_PEELING_FOR_NITER flags; clears USING_SELECT_VL_P when the
   loop ends up not using partial vectors.  */
2612 /* Determine if operating on full vectors for LOOP_VINFO might leave
2613 some scalar iterations still to do. If so, decide how we should
2614 handle those scalar iterations. The possibilities are:
2616 (1) Make LOOP_VINFO operate on partial vectors instead of full vectors.
2619 LOOP_VINFO_USING_PARTIAL_VECTORS_P == true
2620 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P == false
2621 LOOP_VINFO_PEELING_FOR_NITER == false
2623 (2) Make LOOP_VINFO operate on full vectors and use an epilogue loop
2624 to handle the remaining scalar iterations. In this case:
2626 LOOP_VINFO_USING_PARTIAL_VECTORS_P == false
2627 LOOP_VINFO_PEELING_FOR_NITER == true
2629 There are two choices:
2631 (2a) Consider vectorizing the epilogue loop at the same VF as the
2632 main loop, but using partial vectors instead of full vectors.
2635 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P == true
2637 (2b) Consider vectorizing the epilogue loop at lower VFs only.
2640 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P == false
2644 vect_determine_partial_vectors_and_peeling (loop_vec_info loop_vinfo
)
2646 /* Determine whether there would be any scalar iterations left over. */
2647 bool need_peeling_or_partial_vectors_p
2648 = vect_need_peeling_or_partial_vectors_p (loop_vinfo
);
/* Default: full vectors everywhere; only upgrade below when the target
   proved partial vectors usable AND there are leftover iterations.  */
2650 /* Decide whether to vectorize the loop with partial vectors. */
2651 LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo
) = false;
2652 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P (loop_vinfo
) = false;
2653 if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo
)
2654 && need_peeling_or_partial_vectors_p
)
2656 /* For partial-vector-usage=1, try to push the handling of partial
2657 vectors to the epilogue, with the main loop continuing to operate
2660 If we are unrolling we also do not want to use partial vectors. This
2661 is to avoid the overhead of generating multiple masks and also to
2662 avoid having to execute entire iterations of FALSE masked instructions
2663 when dealing with one or less full iterations.
2665 ??? We could then end up failing to use partial vectors if we
2666 decide to peel iterations into a prologue, and if the main loop
2667 then ends up processing fewer than VF iterations. */
2668 if ((param_vect_partial_vector_usage
== 1
2669 || loop_vinfo
->suggested_unroll_factor
> 1)
2670 && !LOOP_VINFO_EPILOGUE_P (loop_vinfo
)
2671 && !vect_known_niters_smaller_than_vf (loop_vinfo
))
/* Case (2a): full-vector main loop, partial-vector epilogue.  */
2672 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P (loop_vinfo
) = true;
/* NOTE(review): an `else` (orig 2673) was dropped before this case (1)
   assignment — partial vectors in the main loop itself.  */
2674 LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo
) = true;
2677 if (dump_enabled_p ())
2678 dump_printf_loc (MSG_NOTE
, vect_location
,
2679 "operating on %s vectors%s.\n",
2680 LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo
)
2681 ? "partial" : "full",
2682 LOOP_VINFO_EPILOGUE_P (loop_vinfo
)
2683 ? " for epilogue loop" : "");
/* Peeling for niters is needed exactly when leftovers exist but are NOT
   absorbed by partial vectors.  */
2685 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo
)
2686 = (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo
)
2687 && need_peeling_or_partial_vectors_p
);
2689 /* We set LOOP_VINFO_USING_SELECT_VL_P as true before loop vectorization
2690 analysis that we don't know whether the loop is vectorized by partial
2691 vectors (More details see tree-vect-loop-manip.cc).
2693 However, SELECT_VL vectorizaton style should only applied on partial
2694 vectorization since SELECT_VL is the GIMPLE IR that calculates the
2695 number of elements to be process for each iteration.
2697 After loop vectorization analysis, Clear LOOP_VINFO_USING_SELECT_VL_P
2698 if it is not partial vectorized loop. */
2699 if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo
))
2700 LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo
) = false;
2702 return opt_result::success ();
2705 /* Function vect_analyze_loop_2.
2707 Apply a set of analyses on LOOP specified by LOOP_VINFO, the different
2708 analyses will record information in some members of LOOP_VINFO. FATAL
2709 indicates if some analysis meets fatal error. If one non-NULL pointer
2710 SUGGESTED_UNROLL_FACTOR is provided, it's intent to be filled with one
2711 worked out suggested unroll factor, while one NULL pointer shows it's
2712 going to apply the suggested unroll factor. SLP_DONE_FOR_SUGGESTED_UF
2713 is to hold the slp decision when the suggested unroll factor is worked
2716 vect_analyze_loop_2 (loop_vec_info loop_vinfo
, bool &fatal
,
2717 unsigned *suggested_unroll_factor
,
2718 bool& slp_done_for_suggested_uf
)
2720 opt_result ok
= opt_result::success ();
2722 unsigned int max_vf
= MAX_VECTORIZATION_FACTOR
;
2723 poly_uint64 min_vf
= 2;
2724 loop_vec_info orig_loop_vinfo
= NULL
;
2726 /* If we are dealing with an epilogue then orig_loop_vinfo points to the
2727 loop_vec_info of the first vectorized loop. */
2728 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo
))
2729 orig_loop_vinfo
= LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo
);
2731 orig_loop_vinfo
= loop_vinfo
;
2732 gcc_assert (orig_loop_vinfo
);
2734 /* The first group of checks is independent of the vector size. */
2737 if (LOOP_VINFO_SIMD_IF_COND (loop_vinfo
)
2738 && integer_zerop (LOOP_VINFO_SIMD_IF_COND (loop_vinfo
)))
2739 return opt_result::failure_at (vect_location
,
2740 "not vectorized: simd if(0)\n");
2742 /* Find all data references in the loop (which correspond to vdefs/vuses)
2743 and analyze their evolution in the loop. */
2745 loop_p loop
= LOOP_VINFO_LOOP (loop_vinfo
);
2747 /* Gather the data references and count stmts in the loop. */
2748 if (!LOOP_VINFO_DATAREFS (loop_vinfo
).exists ())
2751 = vect_get_datarefs_in_loop (loop
, LOOP_VINFO_BBS (loop_vinfo
),
2752 &LOOP_VINFO_DATAREFS (loop_vinfo
),
2753 &LOOP_VINFO_N_STMTS (loop_vinfo
));
2756 if (dump_enabled_p ())
2757 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2758 "not vectorized: loop contains function "
2759 "calls or data references that cannot "
2763 loop_vinfo
->shared
->save_datarefs ();
2766 loop_vinfo
->shared
->check_datarefs ();
2768 /* Analyze the data references and also adjust the minimal
2769 vectorization factor according to the loads and stores. */
2771 ok
= vect_analyze_data_refs (loop_vinfo
, &min_vf
, &fatal
);
2774 if (dump_enabled_p ())
2775 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2776 "bad data references.\n");
2780 /* Check if we are applying unroll factor now. */
2781 bool applying_suggested_uf
= loop_vinfo
->suggested_unroll_factor
> 1;
2782 gcc_assert (!applying_suggested_uf
|| !suggested_unroll_factor
);
2784 /* If the slp decision is false when suggested unroll factor is worked
2785 out, and we are applying suggested unroll factor, we can simply skip
2786 all slp related analyses this time. */
2787 bool slp
= !applying_suggested_uf
|| slp_done_for_suggested_uf
;
2789 /* Classify all cross-iteration scalar data-flow cycles.
2790 Cross-iteration cycles caused by virtual phis are analyzed separately. */
2791 vect_analyze_scalar_cycles (loop_vinfo
, slp
);
2793 vect_pattern_recog (loop_vinfo
);
2795 vect_fixup_scalar_cycles_with_patterns (loop_vinfo
);
2797 /* Analyze the access patterns of the data-refs in the loop (consecutive,
2798 complex, etc.). FORNOW: Only handle consecutive access pattern. */
2800 ok
= vect_analyze_data_ref_accesses (loop_vinfo
, NULL
);
2803 if (dump_enabled_p ())
2804 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2805 "bad data access.\n");
2809 /* Data-flow analysis to detect stmts that do not need to be vectorized. */
2811 ok
= vect_mark_stmts_to_be_vectorized (loop_vinfo
, &fatal
);
2814 if (dump_enabled_p ())
2815 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2816 "unexpected pattern.\n");
2820 /* While the rest of the analysis below depends on it in some way. */
2823 /* Analyze data dependences between the data-refs in the loop
2824 and adjust the maximum vectorization factor according to
2826 FORNOW: fail at the first data dependence that we encounter. */
2828 ok
= vect_analyze_data_ref_dependences (loop_vinfo
, &max_vf
);
2831 if (dump_enabled_p ())
2832 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2833 "bad data dependence.\n");
2836 if (max_vf
!= MAX_VECTORIZATION_FACTOR
2837 && maybe_lt (max_vf
, min_vf
))
2838 return opt_result::failure_at (vect_location
, "bad data dependence.\n");
2839 LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo
) = max_vf
;
2841 ok
= vect_determine_vectorization_factor (loop_vinfo
);
2844 if (dump_enabled_p ())
2845 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2846 "can't determine vectorization factor.\n");
2850 /* Compute the scalar iteration cost. */
2851 vect_compute_single_scalar_iteration_cost (loop_vinfo
);
2853 poly_uint64 saved_vectorization_factor
= LOOP_VINFO_VECT_FACTOR (loop_vinfo
);
2857 /* Check the SLP opportunities in the loop, analyze and build
2859 ok
= vect_analyze_slp (loop_vinfo
, LOOP_VINFO_N_STMTS (loop_vinfo
));
2863 /* If there are any SLP instances mark them as pure_slp. */
2864 slp
= vect_make_slp_decision (loop_vinfo
);
2867 /* Find stmts that need to be both vectorized and SLPed. */
2868 vect_detect_hybrid_slp (loop_vinfo
);
2870 /* Update the vectorization factor based on the SLP decision. */
2871 vect_update_vf_for_slp (loop_vinfo
);
2873 /* Optimize the SLP graph with the vectorization factor fixed. */
2874 vect_optimize_slp (loop_vinfo
);
2876 /* Gather the loads reachable from the SLP graph entries. */
2877 vect_gather_slp_loads (loop_vinfo
);
2881 bool saved_can_use_partial_vectors_p
2882 = LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo
);
2884 /* We don't expect to have to roll back to anything other than an empty
2886 gcc_assert (LOOP_VINFO_MASKS (loop_vinfo
).is_empty ());
2888 /* This is the point where we can re-start analysis with SLP forced off. */
2891 /* Apply the suggested unrolling factor, this was determined by the backend
2892 during finish_cost the first time we ran the analyzis for this
2894 if (applying_suggested_uf
)
2895 LOOP_VINFO_VECT_FACTOR (loop_vinfo
) *= loop_vinfo
->suggested_unroll_factor
;
2897 /* Now the vectorization factor is final. */
2898 poly_uint64 vectorization_factor
= LOOP_VINFO_VECT_FACTOR (loop_vinfo
);
2899 gcc_assert (known_ne (vectorization_factor
, 0U));
2901 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo
) && dump_enabled_p ())
2903 dump_printf_loc (MSG_NOTE
, vect_location
,
2904 "vectorization_factor = ");
2905 dump_dec (MSG_NOTE
, vectorization_factor
);
2906 dump_printf (MSG_NOTE
, ", niters = %wd\n",
2907 LOOP_VINFO_INT_NITERS (loop_vinfo
));
2910 if (max_vf
!= MAX_VECTORIZATION_FACTOR
2911 && maybe_lt (max_vf
, LOOP_VINFO_VECT_FACTOR (loop_vinfo
)))
2912 return opt_result::failure_at (vect_location
, "bad data dependence.\n");
2914 loop_vinfo
->vector_costs
= init_cost (loop_vinfo
, false);
2916 /* Analyze the alignment of the data-refs in the loop.
2917 Fail if a data reference is found that cannot be vectorized. */
2919 ok
= vect_analyze_data_refs_alignment (loop_vinfo
);
2922 if (dump_enabled_p ())
2923 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2924 "bad data alignment.\n");
2928 /* Prune the list of ddrs to be tested at run-time by versioning for alias.
2929 It is important to call pruning after vect_analyze_data_ref_accesses,
2930 since we use grouping information gathered by interleaving analysis. */
2931 ok
= vect_prune_runtime_alias_test_list (loop_vinfo
);
2935 /* Do not invoke vect_enhance_data_refs_alignment for epilogue
2936 vectorization, since we do not want to add extra peeling or
2937 add versioning for alignment. */
2938 if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo
))
2939 /* This pass will decide on using loop versioning and/or loop peeling in
2940 order to enhance the alignment of data references in the loop. */
2941 ok
= vect_enhance_data_refs_alignment (loop_vinfo
);
2947 /* Analyze operations in the SLP instances. Note this may
2948 remove unsupported SLP instances which makes the above
2949 SLP kind detection invalid. */
2950 unsigned old_size
= LOOP_VINFO_SLP_INSTANCES (loop_vinfo
).length ();
2951 vect_slp_analyze_operations (loop_vinfo
);
2952 if (LOOP_VINFO_SLP_INSTANCES (loop_vinfo
).length () != old_size
)
2954 ok
= opt_result::failure_at (vect_location
,
2955 "unsupported SLP instances\n");
2959 /* Check whether any load in ALL SLP instances is possibly permuted. */
2960 slp_tree load_node
, slp_root
;
2962 slp_instance instance
;
2963 bool can_use_lanes
= true;
2964 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo
), x
, instance
)
2966 slp_root
= SLP_INSTANCE_TREE (instance
);
2967 int group_size
= SLP_TREE_LANES (slp_root
);
2968 tree vectype
= SLP_TREE_VECTYPE (slp_root
);
2969 bool loads_permuted
= false;
2970 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance
), i
, load_node
)
2972 if (!SLP_TREE_LOAD_PERMUTATION (load_node
).exists ())
2975 stmt_vec_info load_info
;
2976 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (load_node
), j
, load_info
)
2977 if (SLP_TREE_LOAD_PERMUTATION (load_node
)[j
] != j
)
2979 loads_permuted
= true;
2984 /* If the loads and stores can be handled with load/store-lane
2985 instructions record it and move on to the next instance. */
2987 && SLP_INSTANCE_KIND (instance
) == slp_inst_kind_store
2988 && vect_store_lanes_supported (vectype
, group_size
, false)
2991 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance
), i
, load_node
)
2992 if (STMT_VINFO_GROUPED_ACCESS
2993 (SLP_TREE_REPRESENTATIVE (load_node
)))
2995 stmt_vec_info stmt_vinfo
= DR_GROUP_FIRST_ELEMENT
2996 (SLP_TREE_REPRESENTATIVE (load_node
));
2997 /* Use SLP for strided accesses (or if we can't
2999 if (STMT_VINFO_STRIDED_P (stmt_vinfo
)
3000 || vect_load_lanes_supported
3001 (STMT_VINFO_VECTYPE (stmt_vinfo
),
3002 DR_GROUP_SIZE (stmt_vinfo
), false) == IFN_LAST
)
3007 = can_use_lanes
&& i
== SLP_INSTANCE_LOADS (instance
).length ();
3009 if (can_use_lanes
&& dump_enabled_p ())
3010 dump_printf_loc (MSG_NOTE
, vect_location
,
3011 "SLP instance %p can use load/store-lanes\n",
3016 can_use_lanes
= false;
3021 /* If all SLP instances can use load/store-lanes abort SLP and try again
3022 with SLP disabled. */
3025 ok
= opt_result::failure_at (vect_location
,
3026 "Built SLP cancelled: can use "
3027 "load/store-lanes\n");
3028 if (dump_enabled_p ())
3029 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3030 "Built SLP cancelled: all SLP instances support "
3031 "load/store-lanes\n");
3036 /* Dissolve SLP-only groups. */
3037 vect_dissolve_slp_only_groups (loop_vinfo
);
3039 /* Scan all the remaining operations in the loop that are not subject
3040 to SLP and make sure they are vectorizable. */
3041 ok
= vect_analyze_loop_operations (loop_vinfo
);
3044 if (dump_enabled_p ())
3045 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3046 "bad operation or unsupported loop bound.\n");
3050 /* For now, we don't expect to mix both masking and length approaches for one
3051 loop, disable it if both are recorded. */
3052 if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo
)
3053 && !LOOP_VINFO_MASKS (loop_vinfo
).is_empty ()
3054 && !LOOP_VINFO_LENS (loop_vinfo
).is_empty ())
3056 if (dump_enabled_p ())
3057 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3058 "can't vectorize a loop with partial vectors"
3059 " because we don't expect to mix different"
3060 " approaches with partial vectors for the"
3062 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo
) = false;
3065 /* If we still have the option of using partial vectors,
3066 check whether we can generate the necessary loop controls. */
3067 if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo
))
3069 if (!LOOP_VINFO_MASKS (loop_vinfo
).is_empty ())
3071 if (!vect_verify_full_masking (loop_vinfo
)
3072 && !vect_verify_full_masking_avx512 (loop_vinfo
))
3073 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo
) = false;
3075 else /* !LOOP_VINFO_LENS (loop_vinfo).is_empty () */
3076 if (!vect_verify_loop_lens (loop_vinfo
))
3077 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo
) = false;
3080 /* If we're vectorizing a loop that uses length "controls" and
3081 can iterate more than once, we apply decrementing IV approach
3083 if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo
)
3084 && LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo
) == vect_partial_vectors_len
3085 && LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo
) == 0
3086 && !(LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo
)
3087 && known_le (LOOP_VINFO_INT_NITERS (loop_vinfo
),
3088 LOOP_VINFO_VECT_FACTOR (loop_vinfo
))))
3089 LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo
) = true;
3091 /* If a loop uses length controls and has a decrementing loop control IV,
3092 we will normally pass that IV through a MIN_EXPR to calcaluate the
3093 basis for the length controls. E.g. in a loop that processes one
3094 element per scalar iteration, the number of elements would be
3095 MIN_EXPR <N, VF>, where N is the number of scalar iterations left.
3097 This MIN_EXPR approach allows us to use pointer IVs with an invariant
3098 step, since only the final iteration of the vector loop can have
3101 However, some targets have a dedicated instruction for calculating the
3102 preferred length, given the total number of elements that still need to
3103 be processed. This is encapsulated in the SELECT_VL internal function.
3105 If the target supports SELECT_VL, we can use it instead of MIN_EXPR
3106 to determine the basis for the length controls. However, unlike the
3107 MIN_EXPR calculation, the SELECT_VL calculation can decide to make
3108 lanes inactive in any iteration of the vector loop, not just the last
3109 iteration. This SELECT_VL approach therefore requires us to use pointer
3110 IVs with variable steps.
3112 Once we've decided how many elements should be processed by one
3113 iteration of the vector loop, we need to populate the rgroup controls.
3114 If a loop has multiple rgroups, we need to make sure that those rgroups
3115 "line up" (that is, they must be consistent about which elements are
3116 active and which aren't). This is done by vect_adjust_loop_lens_control.
3118 In principle, it would be possible to use vect_adjust_loop_lens_control
3119 on either the result of a MIN_EXPR or the result of a SELECT_VL.
3122 (1) In practice, it only makes sense to use SELECT_VL when a vector
3123 operation will be controlled directly by the result. It is not
3124 worth using SELECT_VL if it would only be the input to other
3127 (2) If we use SELECT_VL for an rgroup that has N controls, each associated
3128 pointer IV will need N updates by a variable amount (N-1 updates
3129 within the iteration and 1 update to move to the next iteration).
3131 Because of this, we prefer to use the MIN_EXPR approach whenever there
3132 is more than one length control.
3134 In addition, SELECT_VL always operates to a granularity of 1 unit.
3135 If we wanted to use it to control an SLP operation on N consecutive
3136 elements, we would need to make the SELECT_VL inputs measure scalar
3137 iterations (rather than elements) and then multiply the SELECT_VL
3138 result by N. But using SELECT_VL this way is inefficient because
3141 2. We don't apply SELECT_VL on single-rgroup when both (1) and (2) are
3144 (1). LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) is true.
3145 (2). LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant () is true.
3147 Since SELECT_VL (variable step) will make SCEV analysis failed and then
3148 we will fail to gain benefits of following unroll optimizations. We prefer
3149 using the MIN_EXPR approach in this situation. */
3150 if (LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo
))
3152 tree iv_type
= LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo
);
3153 if (direct_internal_fn_supported_p (IFN_SELECT_VL
, iv_type
,
3155 && LOOP_VINFO_LENS (loop_vinfo
).length () == 1
3156 && LOOP_VINFO_LENS (loop_vinfo
)[0].factor
== 1 && !slp
3157 && (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo
)
3158 || !LOOP_VINFO_VECT_FACTOR (loop_vinfo
).is_constant ()))
3159 LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo
) = true;
3162 /* Decide whether this loop_vinfo should use partial vectors or peeling,
3163 assuming that the loop will be used as a main loop. We will redo
3164 this analysis later if we instead decide to use the loop as an
3166 ok
= vect_determine_partial_vectors_and_peeling (loop_vinfo
);
3170 /* If we're vectorizing an epilogue loop, the vectorized loop either needs
3171 to be able to handle fewer than VF scalars, or needs to have a lower VF
3172 than the main loop. */
3173 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo
)
3174 && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo
))
3176 poly_uint64 unscaled_vf
3177 = exact_div (LOOP_VINFO_VECT_FACTOR (orig_loop_vinfo
),
3178 orig_loop_vinfo
->suggested_unroll_factor
);
3179 if (maybe_ge (LOOP_VINFO_VECT_FACTOR (loop_vinfo
), unscaled_vf
))
3180 return opt_result::failure_at (vect_location
,
3181 "Vectorization factor too high for"
3182 " epilogue loop.\n");
3185 /* Check the costings of the loop make vectorizing worthwhile. */
3186 res
= vect_analyze_loop_costing (loop_vinfo
, suggested_unroll_factor
);
3189 ok
= opt_result::failure_at (vect_location
,
3190 "Loop costings may not be worthwhile.\n");
3194 return opt_result::failure_at (vect_location
,
3195 "Loop costings not worthwhile.\n");
3197 /* If an epilogue loop is required make sure we can create one. */
3198 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo
)
3199 || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo
)
3200 || LOOP_VINFO_EARLY_BREAKS (loop_vinfo
))
3202 if (dump_enabled_p ())
3203 dump_printf_loc (MSG_NOTE
, vect_location
, "epilog loop required\n");
3204 if (!vect_can_advance_ivs_p (loop_vinfo
)
3205 || !slpeel_can_duplicate_loop_p (loop
,
3206 LOOP_VINFO_IV_EXIT (loop_vinfo
),
3207 LOOP_VINFO_IV_EXIT (loop_vinfo
)))
3209 ok
= opt_result::failure_at (vect_location
,
3210 "not vectorized: can't create required "
3216 /* During peeling, we need to check if number of loop iterations is
3217 enough for both peeled prolog loop and vector loop. This check
3218 can be merged along with threshold check of loop versioning, so
3219 increase threshold for this case if necessary.
3221 If we are analyzing an epilogue we still want to check what its
3222 versioning threshold would be. If we decide to vectorize the epilogues we
3223 will want to use the lowest versioning threshold of all epilogues and main
3224 loop. This will enable us to enter a vectorized epilogue even when
3225 versioning the loop. We can't simply check whether the epilogue requires
3226 versioning though since we may have skipped some versioning checks when
3227 analyzing the epilogue. For instance, checks for alias versioning will be
3228 skipped when dealing with epilogues as we assume we already checked them
3229 for the main loop. So instead we always check the 'orig_loop_vinfo'. */
3230 if (LOOP_REQUIRES_VERSIONING (orig_loop_vinfo
))
3232 poly_uint64 niters_th
= 0;
3233 unsigned int th
= LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo
);
3235 if (!vect_use_loop_mask_for_alignment_p (loop_vinfo
))
3237 /* Niters for peeled prolog loop. */
3238 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo
) < 0)
3240 dr_vec_info
*dr_info
= LOOP_VINFO_UNALIGNED_DR (loop_vinfo
);
3241 tree vectype
= STMT_VINFO_VECTYPE (dr_info
->stmt
);
3242 niters_th
+= TYPE_VECTOR_SUBPARTS (vectype
) - 1;
3245 niters_th
+= LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo
);
3248 /* Niters for at least one iteration of vectorized loop. */
3249 if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo
))
3250 niters_th
+= LOOP_VINFO_VECT_FACTOR (loop_vinfo
);
3251 /* One additional iteration because of peeling for gap. */
3252 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo
))
3255 /* Use the same condition as vect_transform_loop to decide when to use
3256 the cost to determine a versioning threshold. */
3257 if (vect_apply_runtime_profitability_check_p (loop_vinfo
)
3258 && ordered_p (th
, niters_th
))
3259 niters_th
= ordered_max (poly_uint64 (th
), niters_th
);
3261 LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo
) = niters_th
;
3264 gcc_assert (known_eq (vectorization_factor
,
3265 LOOP_VINFO_VECT_FACTOR (loop_vinfo
)));
3267 slp_done_for_suggested_uf
= slp
;
3269 /* Ok to vectorize! */
3270 LOOP_VINFO_VECTORIZABLE_P (loop_vinfo
) = 1;
3271 return opt_result::success ();
3274 /* Ensure that "ok" is false (with an opt_problem if dumping is enabled). */
3277 /* Try again with SLP forced off but if we didn't do any SLP there is
3278 no point in re-trying. */
3282 /* If the slp decision is true when suggested unroll factor is worked
3283 out, and we are applying suggested unroll factor, we don't need to
3285 if (applying_suggested_uf
&& slp_done_for_suggested_uf
)
3288 /* If there are reduction chains re-trying will fail anyway. */
3289 if (! LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo
).is_empty ())
3292 /* Likewise if the grouped loads or stores in the SLP cannot be handled
3293 via interleaving or lane instructions. */
3294 slp_instance instance
;
3297 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo
), i
, instance
)
3299 stmt_vec_info vinfo
;
3300 vinfo
= SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (instance
))[0];
3301 if (! STMT_VINFO_GROUPED_ACCESS (vinfo
))
3303 vinfo
= DR_GROUP_FIRST_ELEMENT (vinfo
);
3304 unsigned int size
= DR_GROUP_SIZE (vinfo
);
3305 tree vectype
= STMT_VINFO_VECTYPE (vinfo
);
3306 if (vect_store_lanes_supported (vectype
, size
, false) == IFN_LAST
3307 && ! known_eq (TYPE_VECTOR_SUBPARTS (vectype
), 1U)
3308 && ! vect_grouped_store_supported (vectype
, size
))
3309 return opt_result::failure_at (vinfo
->stmt
,
3310 "unsupported grouped store\n");
3311 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance
), j
, node
)
3313 vinfo
= SLP_TREE_REPRESENTATIVE (node
);
3314 if (STMT_VINFO_GROUPED_ACCESS (vinfo
))
3316 vinfo
= DR_GROUP_FIRST_ELEMENT (vinfo
);
3317 bool single_element_p
= !DR_GROUP_NEXT_ELEMENT (vinfo
);
3318 size
= DR_GROUP_SIZE (vinfo
);
3319 vectype
= STMT_VINFO_VECTYPE (vinfo
);
3320 if (vect_load_lanes_supported (vectype
, size
, false) == IFN_LAST
3321 && ! vect_grouped_load_supported (vectype
, single_element_p
,
3323 return opt_result::failure_at (vinfo
->stmt
,
3324 "unsupported grouped load\n");
3329 if (dump_enabled_p ())
3330 dump_printf_loc (MSG_NOTE
, vect_location
,
3331 "re-trying with SLP disabled\n");
3333 /* Roll back state appropriately. No SLP this time. */
3335 /* Restore vectorization factor as it were without SLP. */
3336 LOOP_VINFO_VECT_FACTOR (loop_vinfo
) = saved_vectorization_factor
;
3337 /* Free the SLP instances. */
3338 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo
), j
, instance
)
3339 vect_free_slp_instance (instance
);
3340 LOOP_VINFO_SLP_INSTANCES (loop_vinfo
).release ();
3341 /* Reset SLP type to loop_vect on all stmts. */
3342 for (i
= 0; i
< LOOP_VINFO_LOOP (loop_vinfo
)->num_nodes
; ++i
)
3344 basic_block bb
= LOOP_VINFO_BBS (loop_vinfo
)[i
];
3345 for (gimple_stmt_iterator si
= gsi_start_phis (bb
);
3346 !gsi_end_p (si
); gsi_next (&si
))
3348 stmt_vec_info stmt_info
= loop_vinfo
->lookup_stmt (gsi_stmt (si
));
3349 STMT_SLP_TYPE (stmt_info
) = loop_vect
;
3350 if (STMT_VINFO_DEF_TYPE (stmt_info
) == vect_reduction_def
3351 || STMT_VINFO_DEF_TYPE (stmt_info
) == vect_double_reduction_def
)
3353 /* vectorizable_reduction adjusts reduction stmt def-types,
3354 restore them to that of the PHI. */
3355 STMT_VINFO_DEF_TYPE (STMT_VINFO_REDUC_DEF (stmt_info
))
3356 = STMT_VINFO_DEF_TYPE (stmt_info
);
3357 STMT_VINFO_DEF_TYPE (vect_stmt_to_vectorize
3358 (STMT_VINFO_REDUC_DEF (stmt_info
)))
3359 = STMT_VINFO_DEF_TYPE (stmt_info
);
3362 for (gimple_stmt_iterator si
= gsi_start_bb (bb
);
3363 !gsi_end_p (si
); gsi_next (&si
))
3365 if (is_gimple_debug (gsi_stmt (si
)))
3367 stmt_vec_info stmt_info
= loop_vinfo
->lookup_stmt (gsi_stmt (si
));
3368 STMT_SLP_TYPE (stmt_info
) = loop_vect
;
3369 if (STMT_VINFO_IN_PATTERN_P (stmt_info
))
3371 stmt_vec_info pattern_stmt_info
3372 = STMT_VINFO_RELATED_STMT (stmt_info
);
3373 if (STMT_VINFO_SLP_VECT_ONLY_PATTERN (pattern_stmt_info
))
3374 STMT_VINFO_IN_PATTERN_P (stmt_info
) = false;
3376 gimple
*pattern_def_seq
= STMT_VINFO_PATTERN_DEF_SEQ (stmt_info
);
3377 STMT_SLP_TYPE (pattern_stmt_info
) = loop_vect
;
3378 for (gimple_stmt_iterator pi
= gsi_start (pattern_def_seq
);
3379 !gsi_end_p (pi
); gsi_next (&pi
))
3380 STMT_SLP_TYPE (loop_vinfo
->lookup_stmt (gsi_stmt (pi
)))
3385 /* Free optimized alias test DDRS. */
3386 LOOP_VINFO_LOWER_BOUNDS (loop_vinfo
).truncate (0);
3387 LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo
).release ();
3388 LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo
).release ();
3389 /* Reset target cost data. */
3390 delete loop_vinfo
->vector_costs
;
3391 loop_vinfo
->vector_costs
= nullptr;
3392 /* Reset accumulated rgroup information. */
3393 LOOP_VINFO_MASKS (loop_vinfo
).mask_set
.empty ();
3394 release_vec_loop_controls (&LOOP_VINFO_MASKS (loop_vinfo
).rgc_vec
);
3395 release_vec_loop_controls (&LOOP_VINFO_LENS (loop_vinfo
));
3396 /* Reset assorted flags. */
3397 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo
) = false;
3398 LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo
) = false;
3399 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo
) = 0;
3400 LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo
) = 0;
3401 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo
)
3402 = saved_can_use_partial_vectors_p
;
3407 /* Return true if vectorizing a loop using NEW_LOOP_VINFO appears
3408 to be better than vectorizing it using OLD_LOOP_VINFO. Assume that
3409 OLD_LOOP_VINFO is better unless something specifically indicates
3412 Note that this deliberately isn't a partial order. */
3415 vect_better_loop_vinfo_p (loop_vec_info new_loop_vinfo
,
3416 loop_vec_info old_loop_vinfo
)
3418 struct loop
*loop
= LOOP_VINFO_LOOP (new_loop_vinfo
);
3419 gcc_assert (LOOP_VINFO_LOOP (old_loop_vinfo
) == loop
);
3421 poly_int64 new_vf
= LOOP_VINFO_VECT_FACTOR (new_loop_vinfo
);
3422 poly_int64 old_vf
= LOOP_VINFO_VECT_FACTOR (old_loop_vinfo
);
3424 /* Always prefer a VF of loop->simdlen over any other VF. */
3427 bool new_simdlen_p
= known_eq (new_vf
, loop
->simdlen
);
3428 bool old_simdlen_p
= known_eq (old_vf
, loop
->simdlen
);
3429 if (new_simdlen_p
!= old_simdlen_p
)
3430 return new_simdlen_p
;
3433 const auto *old_costs
= old_loop_vinfo
->vector_costs
;
3434 const auto *new_costs
= new_loop_vinfo
->vector_costs
;
3435 if (loop_vec_info main_loop
= LOOP_VINFO_ORIG_LOOP_INFO (old_loop_vinfo
))
3436 return new_costs
->better_epilogue_loop_than_p (old_costs
, main_loop
);
3438 return new_costs
->better_main_loop_than_p (old_costs
);
3441 /* Decide whether to replace OLD_LOOP_VINFO with NEW_LOOP_VINFO. Return
3442 true if we should. */
3445 vect_joust_loop_vinfos (loop_vec_info new_loop_vinfo
,
3446 loop_vec_info old_loop_vinfo
)
3448 if (!vect_better_loop_vinfo_p (new_loop_vinfo
, old_loop_vinfo
))
3451 if (dump_enabled_p ())
3452 dump_printf_loc (MSG_NOTE
, vect_location
,
3453 "***** Preferring vector mode %s to vector mode %s\n",
3454 GET_MODE_NAME (new_loop_vinfo
->vector_mode
),
3455 GET_MODE_NAME (old_loop_vinfo
->vector_mode
));
3459 /* Analyze LOOP with VECTOR_MODES[MODE_I] and as epilogue if MAIN_LOOP_VINFO is
3460 not NULL. Set AUTODETECTED_VECTOR_MODE if VOIDmode and advance
3461 MODE_I to the next mode useful to analyze.
3462 Return the loop_vinfo on success and wrapped null on failure. */
3464 static opt_loop_vec_info
3465 vect_analyze_loop_1 (class loop
*loop
, vec_info_shared
*shared
,
3466 const vect_loop_form_info
*loop_form_info
,
3467 loop_vec_info main_loop_vinfo
,
3468 const vector_modes
&vector_modes
, unsigned &mode_i
,
3469 machine_mode
&autodetected_vector_mode
,
3472 loop_vec_info loop_vinfo
3473 = vect_create_loop_vinfo (loop
, shared
, loop_form_info
, main_loop_vinfo
);
3475 machine_mode vector_mode
= vector_modes
[mode_i
];
3476 loop_vinfo
->vector_mode
= vector_mode
;
3477 unsigned int suggested_unroll_factor
= 1;
3478 bool slp_done_for_suggested_uf
= false;
3480 /* Run the main analysis. */
3481 opt_result res
= vect_analyze_loop_2 (loop_vinfo
, fatal
,
3482 &suggested_unroll_factor
,
3483 slp_done_for_suggested_uf
);
3484 if (dump_enabled_p ())
3485 dump_printf_loc (MSG_NOTE
, vect_location
,
3486 "***** Analysis %s with vector mode %s\n",
3487 res
? "succeeded" : " failed",
3488 GET_MODE_NAME (loop_vinfo
->vector_mode
));
3490 if (res
&& !main_loop_vinfo
&& suggested_unroll_factor
> 1)
3492 if (dump_enabled_p ())
3493 dump_printf_loc (MSG_NOTE
, vect_location
,
3494 "***** Re-trying analysis for unrolling"
3495 " with unroll factor %d and slp %s.\n",
3496 suggested_unroll_factor
,
3497 slp_done_for_suggested_uf
? "on" : "off");
3498 loop_vec_info unroll_vinfo
3499 = vect_create_loop_vinfo (loop
, shared
, loop_form_info
, main_loop_vinfo
);
3500 unroll_vinfo
->vector_mode
= vector_mode
;
3501 unroll_vinfo
->suggested_unroll_factor
= suggested_unroll_factor
;
3502 opt_result new_res
= vect_analyze_loop_2 (unroll_vinfo
, fatal
, NULL
,
3503 slp_done_for_suggested_uf
);
3507 loop_vinfo
= unroll_vinfo
;
3510 delete unroll_vinfo
;
3513 /* Remember the autodetected vector mode. */
3514 if (vector_mode
== VOIDmode
)
3515 autodetected_vector_mode
= loop_vinfo
->vector_mode
;
3517 /* Advance mode_i, first skipping modes that would result in the
3518 same analysis result. */
3519 while (mode_i
+ 1 < vector_modes
.length ()
3520 && vect_chooses_same_modes_p (loop_vinfo
,
3521 vector_modes
[mode_i
+ 1]))
3523 if (dump_enabled_p ())
3524 dump_printf_loc (MSG_NOTE
, vect_location
,
3525 "***** The result for vector mode %s would"
3527 GET_MODE_NAME (vector_modes
[mode_i
+ 1]));
3530 if (mode_i
+ 1 < vector_modes
.length ()
3531 && VECTOR_MODE_P (autodetected_vector_mode
)
3532 && (related_vector_mode (vector_modes
[mode_i
+ 1],
3533 GET_MODE_INNER (autodetected_vector_mode
))
3534 == autodetected_vector_mode
)
3535 && (related_vector_mode (autodetected_vector_mode
,
3536 GET_MODE_INNER (vector_modes
[mode_i
+ 1]))
3537 == vector_modes
[mode_i
+ 1]))
3539 if (dump_enabled_p ())
3540 dump_printf_loc (MSG_NOTE
, vect_location
,
3541 "***** Skipping vector mode %s, which would"
3542 " repeat the analysis for %s\n",
3543 GET_MODE_NAME (vector_modes
[mode_i
+ 1]),
3544 GET_MODE_NAME (autodetected_vector_mode
));
3553 gcc_checking_assert (main_loop_vinfo
== NULL
);
3554 return opt_loop_vec_info::propagate_failure (res
);
3557 return opt_loop_vec_info::success (loop_vinfo
);
3560 /* Function vect_analyze_loop.
3562 Apply a set of analyses on LOOP, and create a loop_vec_info struct
3563 for it. The different analyses will record information in the
3564 loop_vec_info struct. */
3566 vect_analyze_loop (class loop
*loop
, vec_info_shared
*shared
)
3568 DUMP_VECT_SCOPE ("analyze_loop_nest");
3570 if (loop_outer (loop
)
3571 && loop_vec_info_for_loop (loop_outer (loop
))
3572 && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop
))))
3573 return opt_loop_vec_info::failure_at (vect_location
,
3574 "outer-loop already vectorized.\n");
3576 if (!find_loop_nest (loop
, &shared
->loop_nest
))
3577 return opt_loop_vec_info::failure_at
3579 "not vectorized: loop nest containing two or more consecutive inner"
3580 " loops cannot be vectorized\n");
3582 /* Analyze the loop form. */
3583 vect_loop_form_info loop_form_info
;
3584 opt_result res
= vect_analyze_loop_form (loop
, &loop_form_info
);
3587 if (dump_enabled_p ())
3588 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3589 "bad loop form.\n");
3590 return opt_loop_vec_info::propagate_failure (res
);
3592 if (!integer_onep (loop_form_info
.assumptions
))
3594 /* We consider to vectorize this loop by versioning it under
3595 some assumptions. In order to do this, we need to clear
3596 existing information computed by scev and niter analyzer. */
3598 free_numbers_of_iterations_estimates (loop
);
3599 /* Also set flag for this loop so that following scev and niter
3600 analysis are done under the assumptions. */
3601 loop_constraint_set (loop
, LOOP_C_FINITE
);
3604 /* Clear the existing niter information to make sure the nonwrapping flag
3605 will be calculated and set propriately. */
3606 free_numbers_of_iterations_estimates (loop
);
3608 auto_vector_modes vector_modes
;
3609 /* Autodetect first vector size we try. */
3610 vector_modes
.safe_push (VOIDmode
);
3611 unsigned int autovec_flags
3612 = targetm
.vectorize
.autovectorize_vector_modes (&vector_modes
,
3613 loop
->simdlen
!= 0);
3614 bool pick_lowest_cost_p
= ((autovec_flags
& VECT_COMPARE_COSTS
)
3615 && !unlimited_cost_model (loop
));
3616 machine_mode autodetected_vector_mode
= VOIDmode
;
3617 opt_loop_vec_info first_loop_vinfo
= opt_loop_vec_info::success (NULL
);
3618 unsigned int mode_i
= 0;
3619 unsigned HOST_WIDE_INT simdlen
= loop
->simdlen
;
3621 /* Keep track of the VF for each mode. Initialize all to 0 which indicates
3622 a mode has not been analyzed. */
3623 auto_vec
<poly_uint64
, 8> cached_vf_per_mode
;
3624 for (unsigned i
= 0; i
< vector_modes
.length (); ++i
)
3625 cached_vf_per_mode
.safe_push (0);
3627 /* First determine the main loop vectorization mode, either the first
3628 one that works, starting with auto-detecting the vector mode and then
3629 following the targets order of preference, or the one with the
3630 lowest cost if pick_lowest_cost_p. */
3634 unsigned int last_mode_i
= mode_i
;
3635 /* Set cached VF to -1 prior to analysis, which indicates a mode has
3637 cached_vf_per_mode
[last_mode_i
] = -1;
3638 opt_loop_vec_info loop_vinfo
3639 = vect_analyze_loop_1 (loop
, shared
, &loop_form_info
,
3640 NULL
, vector_modes
, mode_i
,
3641 autodetected_vector_mode
, fatal
);
3647 /* Analyzis has been successful so update the VF value. The
3648 VF should always be a multiple of unroll_factor and we want to
3649 capture the original VF here. */
3650 cached_vf_per_mode
[last_mode_i
]
3651 = exact_div (LOOP_VINFO_VECT_FACTOR (loop_vinfo
),
3652 loop_vinfo
->suggested_unroll_factor
);
3653 /* Once we hit the desired simdlen for the first time,
3654 discard any previous attempts. */
3656 && known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo
), simdlen
))
3658 delete first_loop_vinfo
;
3659 first_loop_vinfo
= opt_loop_vec_info::success (NULL
);
3662 else if (pick_lowest_cost_p
3664 && vect_joust_loop_vinfos (loop_vinfo
, first_loop_vinfo
))
3666 /* Pick loop_vinfo over first_loop_vinfo. */
3667 delete first_loop_vinfo
;
3668 first_loop_vinfo
= opt_loop_vec_info::success (NULL
);
3670 if (first_loop_vinfo
== NULL
)
3671 first_loop_vinfo
= loop_vinfo
;
3675 loop_vinfo
= opt_loop_vec_info::success (NULL
);
3678 /* Commit to first_loop_vinfo if we have no reason to try
3680 if (!simdlen
&& !pick_lowest_cost_p
)
3683 if (mode_i
== vector_modes
.length ()
3684 || autodetected_vector_mode
== VOIDmode
)
3687 /* Try the next biggest vector size. */
3688 if (dump_enabled_p ())
3689 dump_printf_loc (MSG_NOTE
, vect_location
,
3690 "***** Re-trying analysis with vector mode %s\n",
3691 GET_MODE_NAME (vector_modes
[mode_i
]));
3693 if (!first_loop_vinfo
)
3694 return opt_loop_vec_info::propagate_failure (res
);
3696 if (dump_enabled_p ())
3697 dump_printf_loc (MSG_NOTE
, vect_location
,
3698 "***** Choosing vector mode %s\n",
3699 GET_MODE_NAME (first_loop_vinfo
->vector_mode
));
3701 /* Only vectorize epilogues if PARAM_VECT_EPILOGUES_NOMASK is
3702 enabled, SIMDUID is not set, it is the innermost loop and we have
3703 either already found the loop's SIMDLEN or there was no SIMDLEN to
3705 TODO: Enable epilogue vectorization for loops with SIMDUID set. */
3706 bool vect_epilogues
= (!simdlen
3707 && loop
->inner
== NULL
3708 && param_vect_epilogues_nomask
3709 && LOOP_VINFO_PEELING_FOR_NITER (first_loop_vinfo
)
3710 /* No code motion support for multiple epilogues so for now
3711 not supported when multiple exits. */
3712 && !LOOP_VINFO_EARLY_BREAKS (first_loop_vinfo
)
3714 if (!vect_epilogues
)
3715 return first_loop_vinfo
;
3717 /* Now analyze first_loop_vinfo for epilogue vectorization. */
3718 poly_uint64 lowest_th
= LOOP_VINFO_VERSIONING_THRESHOLD (first_loop_vinfo
);
3720 /* For epilogues start the analysis from the first mode. The motivation
3721 behind starting from the beginning comes from cases where the VECTOR_MODES
3722 array may contain length-agnostic and length-specific modes. Their
3723 ordering is not guaranteed, so we could end up picking a mode for the main
3724 loop that is after the epilogue's optimal mode. */
3725 vector_modes
[0] = autodetected_vector_mode
;
3728 bool supports_partial_vectors
=
3729 partial_vectors_supported_p () && param_vect_partial_vector_usage
!= 0;
3730 poly_uint64 first_vinfo_vf
= LOOP_VINFO_VECT_FACTOR (first_loop_vinfo
);
3734 /* If the target does not support partial vectors we can shorten the
3735 number of modes to analyze for the epilogue as we know we can't pick a
3736 mode that would lead to a VF at least as big as the
3738 if (!supports_partial_vectors
3739 && maybe_ge (cached_vf_per_mode
[mode_i
], first_vinfo_vf
))
3742 if (mode_i
== vector_modes
.length ())
3747 if (dump_enabled_p ())
3748 dump_printf_loc (MSG_NOTE
, vect_location
,
3749 "***** Re-trying epilogue analysis with vector "
3750 "mode %s\n", GET_MODE_NAME (vector_modes
[mode_i
]));
3753 opt_loop_vec_info loop_vinfo
3754 = vect_analyze_loop_1 (loop
, shared
, &loop_form_info
,
3756 vector_modes
, mode_i
,
3757 autodetected_vector_mode
, fatal
);
3763 if (pick_lowest_cost_p
)
3765 /* Keep trying to roll back vectorization attempts while the
3766 loop_vec_infos they produced were worse than this one. */
3767 vec
<loop_vec_info
> &vinfos
= first_loop_vinfo
->epilogue_vinfos
;
3768 while (!vinfos
.is_empty ()
3769 && vect_joust_loop_vinfos (loop_vinfo
, vinfos
.last ()))
3771 gcc_assert (vect_epilogues
);
3772 delete vinfos
.pop ();
3775 /* For now only allow one epilogue loop. */
3776 if (first_loop_vinfo
->epilogue_vinfos
.is_empty ())
3778 first_loop_vinfo
->epilogue_vinfos
.safe_push (loop_vinfo
);
3779 poly_uint64 th
= LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo
);
3780 gcc_assert (!LOOP_REQUIRES_VERSIONING (loop_vinfo
)
3781 || maybe_ne (lowest_th
, 0U));
3782 /* Keep track of the known smallest versioning
3784 if (ordered_p (lowest_th
, th
))
3785 lowest_th
= ordered_min (lowest_th
, th
);
3790 loop_vinfo
= opt_loop_vec_info::success (NULL
);
3793 /* For now only allow one epilogue loop, but allow
3794 pick_lowest_cost_p to replace it, so commit to the
3795 first epilogue if we have no reason to try alternatives. */
3796 if (!pick_lowest_cost_p
)
3800 if (mode_i
== vector_modes
.length ())
3805 if (!first_loop_vinfo
->epilogue_vinfos
.is_empty ())
3807 LOOP_VINFO_VERSIONING_THRESHOLD (first_loop_vinfo
) = lowest_th
;
3808 if (dump_enabled_p ())
3809 dump_printf_loc (MSG_NOTE
, vect_location
,
3810 "***** Choosing epilogue vector mode %s\n",
3812 (first_loop_vinfo
->epilogue_vinfos
[0]->vector_mode
));
3815 return first_loop_vinfo
;
3818 /* Return true if there is an in-order reduction function for CODE, storing
3819 it in *REDUC_FN if so. */
3822 fold_left_reduction_fn (code_helper code
, internal_fn
*reduc_fn
)
3824 /* We support MINUS_EXPR by negating the operand. This also preserves an
3825 initial -0.0 since -0.0 - 0.0 (neutral op for MINUS_EXPR) == -0.0 +
3827 if (code
== PLUS_EXPR
|| code
== MINUS_EXPR
)
3829 *reduc_fn
= IFN_FOLD_LEFT_PLUS
;
3835 /* Function reduction_fn_for_scalar_code
3838 CODE - tree_code of a reduction operations.
3841 REDUC_FN - the corresponding internal function to be used to reduce the
3842 vector of partial results into a single scalar result, or IFN_LAST
3843 if the operation is a supported reduction operation, but does not have
3844 such an internal function.
3846 Return FALSE if CODE currently cannot be vectorized as reduction. */
3849 reduction_fn_for_scalar_code (code_helper code
, internal_fn
*reduc_fn
)
3851 if (code
.is_tree_code ())
3852 switch (tree_code (code
))
3855 *reduc_fn
= IFN_REDUC_MAX
;
3859 *reduc_fn
= IFN_REDUC_MIN
;
3863 *reduc_fn
= IFN_REDUC_PLUS
;
3867 *reduc_fn
= IFN_REDUC_AND
;
3871 *reduc_fn
= IFN_REDUC_IOR
;
3875 *reduc_fn
= IFN_REDUC_XOR
;
3880 *reduc_fn
= IFN_LAST
;
3887 switch (combined_fn (code
))
3890 *reduc_fn
= IFN_REDUC_FMAX
;
3894 *reduc_fn
= IFN_REDUC_FMIN
;
3902 /* If there is a neutral value X such that a reduction would not be affected
3903 by the introduction of additional X elements, return that X, otherwise
3904 return null. CODE is the code of the reduction and SCALAR_TYPE is type
3905 of the scalar elements. If the reduction has just a single initial value
3906 then INITIAL_VALUE is that value, otherwise it is null.
3907 If AS_INITIAL is TRUE the value is supposed to be used as initial value.
3908 In that case no signed zero is returned. */
3911 neutral_op_for_reduction (tree scalar_type
, code_helper code
,
3912 tree initial_value
, bool as_initial
)
3914 if (code
.is_tree_code ())
3915 switch (tree_code (code
))
3922 return build_zero_cst (scalar_type
);
3923 case WIDEN_SUM_EXPR
:
3925 if (!as_initial
&& HONOR_SIGNED_ZEROS (scalar_type
))
3926 return build_real (scalar_type
, dconstm0
);
3928 return build_zero_cst (scalar_type
);
3931 return build_one_cst (scalar_type
);
3934 return build_all_ones_cst (scalar_type
);
3938 return initial_value
;
3944 switch (combined_fn (code
))
3948 return initial_value
;
3955 /* Error reporting helper for vect_is_simple_reduction below. GIMPLE statement
3956 STMT is printed with a message MSG. */
3959 report_vect_op (dump_flags_t msg_type
, gimple
*stmt
, const char *msg
)
3961 dump_printf_loc (msg_type
, vect_location
, "%s%G", msg
, stmt
);
3964 /* Return true if we need an in-order reduction for operation CODE
3965 on type TYPE. NEED_WRAPPING_INTEGRAL_OVERFLOW is true if integer
3966 overflow must wrap. */
3969 needs_fold_left_reduction_p (tree type
, code_helper code
)
3971 /* CHECKME: check for !flag_finite_math_only too? */
3972 if (SCALAR_FLOAT_TYPE_P (type
))
3974 if (code
.is_tree_code ())
3975 switch (tree_code (code
))
3982 return !flag_associative_math
;
3985 switch (combined_fn (code
))
3992 return !flag_associative_math
;
3996 if (INTEGRAL_TYPE_P (type
))
3997 return (!code
.is_tree_code ()
3998 || !operation_no_trapping_overflow (type
, tree_code (code
)));
4000 if (SAT_FIXED_POINT_TYPE_P (type
))
4006 /* Return true if the reduction PHI in LOOP with latch arg LOOP_ARG and
4007 has a handled computation expression. Store the main reduction
4008 operation in *CODE. */
4011 check_reduction_path (dump_user_location_t loc
, loop_p loop
, gphi
*phi
,
4012 tree loop_arg
, code_helper
*code
,
4013 vec
<std::pair
<ssa_op_iter
, use_operand_p
> > &path
)
4015 auto_bitmap visited
;
4016 tree lookfor
= PHI_RESULT (phi
);
4018 use_operand_p curr
= op_iter_init_phiuse (&curri
, phi
, SSA_OP_USE
);
4019 while (USE_FROM_PTR (curr
) != loop_arg
)
4020 curr
= op_iter_next_use (&curri
);
4021 curri
.i
= curri
.numops
;
4024 path
.safe_push (std::make_pair (curri
, curr
));
4025 tree use
= USE_FROM_PTR (curr
);
4028 gimple
*def
= SSA_NAME_DEF_STMT (use
);
4029 if (gimple_nop_p (def
)
4030 || ! flow_bb_inside_loop_p (loop
, gimple_bb (def
)))
4035 std::pair
<ssa_op_iter
, use_operand_p
> x
= path
.pop ();
4039 curr
= op_iter_next_use (&curri
);
4040 /* Skip already visited or non-SSA operands (from iterating
4042 while (curr
!= NULL_USE_OPERAND_P
4043 && (TREE_CODE (USE_FROM_PTR (curr
)) != SSA_NAME
4044 || ! bitmap_set_bit (visited
,
4046 (USE_FROM_PTR (curr
)))));
4048 while (curr
== NULL_USE_OPERAND_P
&& ! path
.is_empty ());
4049 if (curr
== NULL_USE_OPERAND_P
)
4054 if (gimple_code (def
) == GIMPLE_PHI
)
4055 curr
= op_iter_init_phiuse (&curri
, as_a
<gphi
*>(def
), SSA_OP_USE
);
4057 curr
= op_iter_init_use (&curri
, def
, SSA_OP_USE
);
4058 while (curr
!= NULL_USE_OPERAND_P
4059 && (TREE_CODE (USE_FROM_PTR (curr
)) != SSA_NAME
4060 || ! bitmap_set_bit (visited
,
4062 (USE_FROM_PTR (curr
)))))
4063 curr
= op_iter_next_use (&curri
);
4064 if (curr
== NULL_USE_OPERAND_P
)
4069 if (dump_file
&& (dump_flags
& TDF_DETAILS
))
4071 dump_printf_loc (MSG_NOTE
, loc
, "reduction path: ");
4073 std::pair
<ssa_op_iter
, use_operand_p
> *x
;
4074 FOR_EACH_VEC_ELT (path
, i
, x
)
4075 dump_printf (MSG_NOTE
, "%T ", USE_FROM_PTR (x
->second
));
4076 dump_printf (MSG_NOTE
, "\n");
4079 /* Check whether the reduction path detected is valid. */
4080 bool fail
= path
.length () == 0;
4084 for (unsigned i
= 1; i
< path
.length (); ++i
)
4086 gimple
*use_stmt
= USE_STMT (path
[i
].second
);
4088 if (!gimple_extract_op (use_stmt
, &op
))
4093 unsigned int opi
= op
.num_ops
;
4094 if (gassign
*assign
= dyn_cast
<gassign
*> (use_stmt
))
4096 /* The following make sure we can compute the operand index
4097 easily plus it mostly disallows chaining via COND_EXPR condition
4099 for (opi
= 0; opi
< op
.num_ops
; ++opi
)
4100 if (gimple_assign_rhs1_ptr (assign
) + opi
== path
[i
].second
->use
)
4103 else if (gcall
*call
= dyn_cast
<gcall
*> (use_stmt
))
4105 for (opi
= 0; opi
< op
.num_ops
; ++opi
)
4106 if (gimple_call_arg_ptr (call
, opi
) == path
[i
].second
->use
)
4109 if (opi
== op
.num_ops
)
4114 op
.code
= canonicalize_code (op
.code
, op
.type
);
4115 if (op
.code
== MINUS_EXPR
)
4117 op
.code
= PLUS_EXPR
;
4118 /* Track whether we negate the reduction value each iteration. */
4119 if (op
.ops
[1] == op
.ops
[opi
])
4122 else if (op
.code
== IFN_COND_SUB
)
4124 op
.code
= IFN_COND_ADD
;
4125 /* Track whether we negate the reduction value each iteration. */
4126 if (op
.ops
[2] == op
.ops
[opi
])
4129 if (CONVERT_EXPR_CODE_P (op
.code
)
4130 && tree_nop_conversion_p (op
.type
, TREE_TYPE (op
.ops
[0])))
4132 else if (*code
== ERROR_MARK
)
4135 sign
= TYPE_SIGN (op
.type
);
4137 else if (op
.code
!= *code
)
4142 else if ((op
.code
== MIN_EXPR
4143 || op
.code
== MAX_EXPR
)
4144 && sign
!= TYPE_SIGN (op
.type
))
4149 /* Check there's only a single stmt the op is used on. For the
4150 not value-changing tail and the last stmt allow out-of-loop uses.
4151 ??? We could relax this and handle arbitrary live stmts by
4152 forcing a scalar epilogue for example. */
4153 imm_use_iterator imm_iter
;
4154 use_operand_p use_p
;
4155 gimple
*op_use_stmt
;
4157 bool cond_fn_p
= op
.code
.is_internal_fn ()
4158 && (conditional_internal_fn_code (internal_fn (op
.code
))
4161 FOR_EACH_IMM_USE_STMT (op_use_stmt
, imm_iter
, op
.ops
[opi
])
4163 /* In case of a COND_OP (mask, op1, op2, op1) reduction we might have
4164 op1 twice (once as definition, once as else) in the same operation.
4166 if (cond_fn_p
&& op_use_stmt
== use_stmt
)
4168 gcall
*call
= as_a
<gcall
*> (use_stmt
);
4170 = internal_fn_else_index (internal_fn (op
.code
));
4172 for (unsigned int j
= 0; j
< gimple_call_num_args (call
); ++j
)
4176 if (gimple_call_arg (call
, j
) == op
.ops
[opi
])
4180 else if (!is_gimple_debug (op_use_stmt
)
4181 && (*code
!= ERROR_MARK
4182 || flow_bb_inside_loop_p (loop
,
4183 gimple_bb (op_use_stmt
))))
4184 FOR_EACH_IMM_USE_ON_STMT (use_p
, imm_iter
)
4194 return ! fail
&& ! neg
&& *code
!= ERROR_MARK
;
4198 check_reduction_path (dump_user_location_t loc
, loop_p loop
, gphi
*phi
,
4199 tree loop_arg
, enum tree_code code
)
4201 auto_vec
<std::pair
<ssa_op_iter
, use_operand_p
> > path
;
4203 return (check_reduction_path (loc
, loop
, phi
, loop_arg
, &code_
, path
)
4209 /* Function vect_is_simple_reduction
4211 (1) Detect a cross-iteration def-use cycle that represents a simple
4212 reduction computation. We look for the following pattern:
4217 a2 = operation (a3, a1)
4224 a2 = operation (a3, a1)
4227 1. operation is commutative and associative and it is safe to
4228 change the order of the computation
4229 2. no uses for a2 in the loop (a2 is used out of the loop)
4230 3. no uses of a1 in the loop besides the reduction operation
4231 4. no uses of a1 outside the loop.
4233 Conditions 1,4 are tested here.
4234 Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.
4236 (2) Detect a cross-iteration def-use cycle in nested loops, i.e.,
4239 (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double
4243 inner loop (def of a3)
4246 (4) Detect condition expressions, ie:
4247 for (int i = 0; i < N; i++)
4253 static stmt_vec_info
4254 vect_is_simple_reduction (loop_vec_info loop_info
, stmt_vec_info phi_info
,
4255 bool *double_reduc
, bool *reduc_chain_p
, bool slp
)
4257 gphi
*phi
= as_a
<gphi
*> (phi_info
->stmt
);
4258 gimple
*phi_use_stmt
= NULL
;
4259 imm_use_iterator imm_iter
;
4260 use_operand_p use_p
;
4262 *double_reduc
= false;
4263 *reduc_chain_p
= false;
4264 STMT_VINFO_REDUC_TYPE (phi_info
) = TREE_CODE_REDUCTION
;
4266 tree phi_name
= PHI_RESULT (phi
);
4267 /* ??? If there are no uses of the PHI result the inner loop reduction
4268 won't be detected as possibly double-reduction by vectorizable_reduction
4269 because that tries to walk the PHI arg from the preheader edge which
4270 can be constant. See PR60382. */
4271 if (has_zero_uses (phi_name
))
4273 class loop
*loop
= (gimple_bb (phi
))->loop_father
;
4274 unsigned nphi_def_loop_uses
= 0;
4275 FOR_EACH_IMM_USE_FAST (use_p
, imm_iter
, phi_name
)
4277 gimple
*use_stmt
= USE_STMT (use_p
);
4278 if (is_gimple_debug (use_stmt
))
4281 if (!flow_bb_inside_loop_p (loop
, gimple_bb (use_stmt
)))
4283 if (dump_enabled_p ())
4284 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
4285 "intermediate value used outside loop.\n");
4290 /* In case of a COND_OP (mask, op1, op2, op1) reduction we might have
4291 op1 twice (once as definition, once as else) in the same operation.
4292 Only count it as one. */
4293 if (use_stmt
!= phi_use_stmt
)
4295 nphi_def_loop_uses
++;
4296 phi_use_stmt
= use_stmt
;
4300 tree latch_def
= PHI_ARG_DEF_FROM_EDGE (phi
, loop_latch_edge (loop
));
4301 if (TREE_CODE (latch_def
) != SSA_NAME
)
4303 if (dump_enabled_p ())
4304 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
4305 "reduction: not ssa_name: %T\n", latch_def
);
4309 stmt_vec_info def_stmt_info
= loop_info
->lookup_def (latch_def
);
4311 || !flow_bb_inside_loop_p (loop
, gimple_bb (def_stmt_info
->stmt
)))
4314 bool nested_in_vect_loop
4315 = flow_loop_nested_p (LOOP_VINFO_LOOP (loop_info
), loop
);
4316 unsigned nlatch_def_loop_uses
= 0;
4317 auto_vec
<gphi
*, 3> lcphis
;
4318 bool inner_loop_of_double_reduc
= false;
4319 FOR_EACH_IMM_USE_FAST (use_p
, imm_iter
, latch_def
)
4321 gimple
*use_stmt
= USE_STMT (use_p
);
4322 if (is_gimple_debug (use_stmt
))
4324 if (flow_bb_inside_loop_p (loop
, gimple_bb (use_stmt
)))
4325 nlatch_def_loop_uses
++;
4328 /* We can have more than one loop-closed PHI. */
4329 lcphis
.safe_push (as_a
<gphi
*> (use_stmt
));
4330 if (nested_in_vect_loop
4331 && (STMT_VINFO_DEF_TYPE (loop_info
->lookup_stmt (use_stmt
))
4332 == vect_double_reduction_def
))
4333 inner_loop_of_double_reduc
= true;
4337 /* If we are vectorizing an inner reduction we are executing that
4338 in the original order only in case we are not dealing with a
4339 double reduction. */
4340 if (nested_in_vect_loop
&& !inner_loop_of_double_reduc
)
4342 if (dump_enabled_p ())
4343 report_vect_op (MSG_NOTE
, def_stmt_info
->stmt
,
4344 "detected nested cycle: ");
4345 return def_stmt_info
;
4348 /* When the inner loop of a double reduction ends up with more than
4349 one loop-closed PHI we have failed to classify alternate such
4350 PHIs as double reduction, leading to wrong code. See PR103237. */
4351 if (inner_loop_of_double_reduc
&& lcphis
.length () != 1)
4353 if (dump_enabled_p ())
4354 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
4355 "unhandle double reduction\n");
4359 /* If this isn't a nested cycle or if the nested cycle reduction value
4360 is used ouside of the inner loop we cannot handle uses of the reduction
4362 if (nlatch_def_loop_uses
> 1 || nphi_def_loop_uses
> 1)
4364 if (dump_enabled_p ())
4365 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
4366 "reduction used in loop.\n");
4370 /* If DEF_STMT is a phi node itself, we expect it to have a single argument
4371 defined in the inner loop. */
4372 if (gphi
*def_stmt
= dyn_cast
<gphi
*> (def_stmt_info
->stmt
))
4374 tree op1
= PHI_ARG_DEF (def_stmt
, 0);
4375 if (gimple_phi_num_args (def_stmt
) != 1
4376 || TREE_CODE (op1
) != SSA_NAME
)
4378 if (dump_enabled_p ())
4379 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
4380 "unsupported phi node definition.\n");
4385 /* Verify there is an inner cycle composed of the PHI phi_use_stmt
4386 and the latch definition op1. */
4387 gimple
*def1
= SSA_NAME_DEF_STMT (op1
);
4388 if (gimple_bb (def1
)
4389 && flow_bb_inside_loop_p (loop
, gimple_bb (def_stmt
))
4391 && flow_bb_inside_loop_p (loop
->inner
, gimple_bb (def1
))
4392 && (is_gimple_assign (def1
) || is_gimple_call (def1
))
4393 && is_a
<gphi
*> (phi_use_stmt
)
4394 && flow_bb_inside_loop_p (loop
->inner
, gimple_bb (phi_use_stmt
))
4395 && (op1
== PHI_ARG_DEF_FROM_EDGE (phi_use_stmt
,
4396 loop_latch_edge (loop
->inner
))))
4398 if (dump_enabled_p ())
4399 report_vect_op (MSG_NOTE
, def_stmt
,
4400 "detected double reduction: ");
4402 *double_reduc
= true;
4403 return def_stmt_info
;
4409 /* Look for the expression computing latch_def from then loop PHI result. */
4410 auto_vec
<std::pair
<ssa_op_iter
, use_operand_p
> > path
;
4412 if (check_reduction_path (vect_location
, loop
, phi
, latch_def
, &code
,
4415 STMT_VINFO_REDUC_CODE (phi_info
) = code
;
4416 if (code
== COND_EXPR
&& !nested_in_vect_loop
)
4417 STMT_VINFO_REDUC_TYPE (phi_info
) = COND_REDUCTION
;
4419 /* Fill in STMT_VINFO_REDUC_IDX and gather stmts for an SLP
4420 reduction chain for which the additional restriction is that
4421 all operations in the chain are the same. */
4422 auto_vec
<stmt_vec_info
, 8> reduc_chain
;
4424 bool is_slp_reduc
= !nested_in_vect_loop
&& code
!= COND_EXPR
;
4425 for (i
= path
.length () - 1; i
>= 1; --i
)
4427 gimple
*stmt
= USE_STMT (path
[i
].second
);
4428 stmt_vec_info stmt_info
= loop_info
->lookup_stmt (stmt
);
4430 if (!gimple_extract_op (stmt
, &op
))
4432 if (gassign
*assign
= dyn_cast
<gassign
*> (stmt
))
4433 STMT_VINFO_REDUC_IDX (stmt_info
)
4434 = path
[i
].second
->use
- gimple_assign_rhs1_ptr (assign
);
4437 gcall
*call
= as_a
<gcall
*> (stmt
);
4438 STMT_VINFO_REDUC_IDX (stmt_info
)
4439 = path
[i
].second
->use
- gimple_call_arg_ptr (call
, 0);
4441 bool leading_conversion
= (CONVERT_EXPR_CODE_P (op
.code
)
4442 && (i
== 1 || i
== path
.length () - 1));
4443 if ((op
.code
!= code
&& !leading_conversion
)
4444 /* We can only handle the final value in epilogue
4445 generation for reduction chains. */
4446 || (i
!= 1 && !has_single_use (gimple_get_lhs (stmt
))))
4447 is_slp_reduc
= false;
4448 /* For reduction chains we support a trailing/leading
4449 conversions. We do not store those in the actual chain. */
4450 if (leading_conversion
)
4452 reduc_chain
.safe_push (stmt_info
);
4454 if (slp
&& is_slp_reduc
&& reduc_chain
.length () > 1)
4456 for (unsigned i
= 0; i
< reduc_chain
.length () - 1; ++i
)
4458 REDUC_GROUP_FIRST_ELEMENT (reduc_chain
[i
]) = reduc_chain
[0];
4459 REDUC_GROUP_NEXT_ELEMENT (reduc_chain
[i
]) = reduc_chain
[i
+1];
4461 REDUC_GROUP_FIRST_ELEMENT (reduc_chain
.last ()) = reduc_chain
[0];
4462 REDUC_GROUP_NEXT_ELEMENT (reduc_chain
.last ()) = NULL
;
4464 /* Save the chain for further analysis in SLP detection. */
4465 LOOP_VINFO_REDUCTION_CHAINS (loop_info
).safe_push (reduc_chain
[0]);
4466 REDUC_GROUP_SIZE (reduc_chain
[0]) = reduc_chain
.length ();
4468 *reduc_chain_p
= true;
4469 if (dump_enabled_p ())
4470 dump_printf_loc (MSG_NOTE
, vect_location
,
4471 "reduction: detected reduction chain\n");
4473 else if (dump_enabled_p ())
4474 dump_printf_loc (MSG_NOTE
, vect_location
,
4475 "reduction: detected reduction\n");
4477 return def_stmt_info
;
4480 if (dump_enabled_p ())
4481 dump_printf_loc (MSG_NOTE
, vect_location
,
4482 "reduction: unknown pattern\n");
4487 /* Estimate the number of peeled epilogue iterations for LOOP_VINFO.
4488 PEEL_ITERS_PROLOGUE is the number of peeled prologue iterations,
4489 or -1 if not known. */
4492 vect_get_peel_iters_epilogue (loop_vec_info loop_vinfo
, int peel_iters_prologue
)
4494 int assumed_vf
= vect_vf_for_cost (loop_vinfo
);
4495 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo
) || peel_iters_prologue
== -1)
4497 if (dump_enabled_p ())
4498 dump_printf_loc (MSG_NOTE
, vect_location
,
4499 "cost model: epilogue peel iters set to vf/2 "
4500 "because loop iterations are unknown .\n");
4501 return assumed_vf
/ 2;
4505 int niters
= LOOP_VINFO_INT_NITERS (loop_vinfo
);
4506 peel_iters_prologue
= MIN (niters
, peel_iters_prologue
);
4507 int peel_iters_epilogue
= (niters
- peel_iters_prologue
) % assumed_vf
;
4508 /* If we need to peel for gaps, but no peeling is required, we have to
4509 peel VF iterations. */
4510 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo
) && !peel_iters_epilogue
)
4511 peel_iters_epilogue
= assumed_vf
;
4512 return peel_iters_epilogue
;
4516 /* Calculate cost of peeling the loop PEEL_ITERS_PROLOGUE times. */
4518 vect_get_known_peeling_cost (loop_vec_info loop_vinfo
, int peel_iters_prologue
,
4519 int *peel_iters_epilogue
,
4520 stmt_vector_for_cost
*scalar_cost_vec
,
4521 stmt_vector_for_cost
*prologue_cost_vec
,
4522 stmt_vector_for_cost
*epilogue_cost_vec
)
4526 *peel_iters_epilogue
4527 = vect_get_peel_iters_epilogue (loop_vinfo
, peel_iters_prologue
);
4529 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo
))
4531 /* If peeled iterations are known but number of scalar loop
4532 iterations are unknown, count a taken branch per peeled loop. */
4533 if (peel_iters_prologue
> 0)
4534 retval
= record_stmt_cost (prologue_cost_vec
, 1, cond_branch_taken
,
4536 if (*peel_iters_epilogue
> 0)
4537 retval
+= record_stmt_cost (epilogue_cost_vec
, 1, cond_branch_taken
,
4541 stmt_info_for_cost
*si
;
4543 if (peel_iters_prologue
)
4544 FOR_EACH_VEC_ELT (*scalar_cost_vec
, j
, si
)
4545 retval
+= record_stmt_cost (prologue_cost_vec
,
4546 si
->count
* peel_iters_prologue
,
4547 si
->kind
, si
->stmt_info
, si
->misalign
,
4549 if (*peel_iters_epilogue
)
4550 FOR_EACH_VEC_ELT (*scalar_cost_vec
, j
, si
)
4551 retval
+= record_stmt_cost (epilogue_cost_vec
,
4552 si
->count
* *peel_iters_epilogue
,
4553 si
->kind
, si
->stmt_info
, si
->misalign
,
4559 /* Function vect_estimate_min_profitable_iters
4561 Return the number of iterations required for the vector version of the
4562 loop to be profitable relative to the cost of the scalar version of the
4565 *RET_MIN_PROFITABLE_NITERS is a cost model profitability threshold
4566 of iterations for vectorization. -1 value means loop vectorization
4567 is not profitable. This returned value may be used for dynamic
4568 profitability check.
4570 *RET_MIN_PROFITABLE_ESTIMATE is a profitability threshold to be used
4571 for static check against estimated number of iterations. */
4574 vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo
,
4575 int *ret_min_profitable_niters
,
4576 int *ret_min_profitable_estimate
,
4577 unsigned *suggested_unroll_factor
)
4579 int min_profitable_iters
;
4580 int min_profitable_estimate
;
4581 int peel_iters_prologue
;
4582 int peel_iters_epilogue
;
4583 unsigned vec_inside_cost
= 0;
4584 int vec_outside_cost
= 0;
4585 unsigned vec_prologue_cost
= 0;
4586 unsigned vec_epilogue_cost
= 0;
4587 int scalar_single_iter_cost
= 0;
4588 int scalar_outside_cost
= 0;
4589 int assumed_vf
= vect_vf_for_cost (loop_vinfo
);
4590 int npeel
= LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo
);
4591 vector_costs
*target_cost_data
= loop_vinfo
->vector_costs
;
4593 /* Cost model disabled. */
4594 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo
)))
4596 if (dump_enabled_p ())
4597 dump_printf_loc (MSG_NOTE
, vect_location
, "cost model disabled.\n");
4598 *ret_min_profitable_niters
= 0;
4599 *ret_min_profitable_estimate
= 0;
4603 /* Requires loop versioning tests to handle misalignment. */
4604 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo
))
4606 /* FIXME: Make cost depend on complexity of individual check. */
4607 unsigned len
= LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo
).length ();
4608 (void) add_stmt_cost (target_cost_data
, len
, scalar_stmt
, vect_prologue
);
4609 if (dump_enabled_p ())
4610 dump_printf (MSG_NOTE
,
4611 "cost model: Adding cost of checks for loop "
4612 "versioning to treat misalignment.\n");
4615 /* Requires loop versioning with alias checks. */
4616 if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo
))
4618 /* FIXME: Make cost depend on complexity of individual check. */
4619 unsigned len
= LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo
).length ();
4620 (void) add_stmt_cost (target_cost_data
, len
, scalar_stmt
, vect_prologue
);
4621 len
= LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo
).length ();
4623 /* Count LEN - 1 ANDs and LEN comparisons. */
4624 (void) add_stmt_cost (target_cost_data
, len
* 2 - 1,
4625 scalar_stmt
, vect_prologue
);
4626 len
= LOOP_VINFO_LOWER_BOUNDS (loop_vinfo
).length ();
4629 /* Count LEN - 1 ANDs and LEN comparisons. */
4630 unsigned int nstmts
= len
* 2 - 1;
4631 /* +1 for each bias that needs adding. */
4632 for (unsigned int i
= 0; i
< len
; ++i
)
4633 if (!LOOP_VINFO_LOWER_BOUNDS (loop_vinfo
)[i
].unsigned_p
)
4635 (void) add_stmt_cost (target_cost_data
, nstmts
,
4636 scalar_stmt
, vect_prologue
);
4638 if (dump_enabled_p ())
4639 dump_printf (MSG_NOTE
,
4640 "cost model: Adding cost of checks for loop "
4641 "versioning aliasing.\n");
4644 /* Requires loop versioning with niter checks. */
4645 if (LOOP_REQUIRES_VERSIONING_FOR_NITERS (loop_vinfo
))
4647 /* FIXME: Make cost depend on complexity of individual check. */
4648 (void) add_stmt_cost (target_cost_data
, 1, vector_stmt
,
4649 NULL
, NULL
, NULL_TREE
, 0, vect_prologue
);
4650 if (dump_enabled_p ())
4651 dump_printf (MSG_NOTE
,
4652 "cost model: Adding cost of checks for loop "
4653 "versioning niters.\n");
4656 if (LOOP_REQUIRES_VERSIONING (loop_vinfo
))
4657 (void) add_stmt_cost (target_cost_data
, 1, cond_branch_taken
,
4660 /* Count statements in scalar loop. Using this as scalar cost for a single
4663 TODO: Add outer loop support.
4665 TODO: Consider assigning different costs to different scalar
4668 scalar_single_iter_cost
= loop_vinfo
->scalar_costs
->total_cost ();
4670 /* Add additional cost for the peeled instructions in prologue and epilogue
4671 loop. (For fully-masked loops there will be no peeling.)
4673 FORNOW: If we don't know the value of peel_iters for prologue or epilogue
4674 at compile-time - we assume it's vf/2 (the worst would be vf-1).
4676 TODO: Build an expression that represents peel_iters for prologue and
4677 epilogue to be used in a run-time test. */
4679 bool prologue_need_br_taken_cost
= false;
4680 bool prologue_need_br_not_taken_cost
= false;
4682 /* Calculate peel_iters_prologue. */
4683 if (vect_use_loop_mask_for_alignment_p (loop_vinfo
))
4684 peel_iters_prologue
= 0;
4687 peel_iters_prologue
= assumed_vf
/ 2;
4688 if (dump_enabled_p ())
4689 dump_printf (MSG_NOTE
, "cost model: "
4690 "prologue peel iters set to vf/2.\n");
4692 /* If peeled iterations are unknown, count a taken branch and a not taken
4693 branch per peeled loop. Even if scalar loop iterations are known,
4694 vector iterations are not known since peeled prologue iterations are
4695 not known. Hence guards remain the same. */
4696 prologue_need_br_taken_cost
= true;
4697 prologue_need_br_not_taken_cost
= true;
4701 peel_iters_prologue
= npeel
;
4702 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo
) && peel_iters_prologue
> 0)
4703 /* If peeled iterations are known but number of scalar loop
4704 iterations are unknown, count a taken branch per peeled loop. */
4705 prologue_need_br_taken_cost
= true;
4708 bool epilogue_need_br_taken_cost
= false;
4709 bool epilogue_need_br_not_taken_cost
= false;
4711 /* Calculate peel_iters_epilogue. */
4712 if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo
))
4713 /* We need to peel exactly one iteration for gaps. */
4714 peel_iters_epilogue
= LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo
) ? 1 : 0;
4717 /* If peeling for alignment is unknown, loop bound of main loop
4719 peel_iters_epilogue
= assumed_vf
/ 2;
4720 if (dump_enabled_p ())
4721 dump_printf (MSG_NOTE
, "cost model: "
4722 "epilogue peel iters set to vf/2 because "
4723 "peeling for alignment is unknown.\n");
4725 /* See the same reason above in peel_iters_prologue calculation. */
4726 epilogue_need_br_taken_cost
= true;
4727 epilogue_need_br_not_taken_cost
= true;
4731 peel_iters_epilogue
= vect_get_peel_iters_epilogue (loop_vinfo
, npeel
);
4732 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo
) && peel_iters_epilogue
> 0)
4733 /* If peeled iterations are known but number of scalar loop
4734 iterations are unknown, count a taken branch per peeled loop. */
4735 epilogue_need_br_taken_cost
= true;
4738 stmt_info_for_cost
*si
;
4740 /* Add costs associated with peel_iters_prologue. */
4741 if (peel_iters_prologue
)
4742 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo
), j
, si
)
4744 (void) add_stmt_cost (target_cost_data
,
4745 si
->count
* peel_iters_prologue
, si
->kind
,
4746 si
->stmt_info
, si
->node
, si
->vectype
,
4747 si
->misalign
, vect_prologue
);
4750 /* Add costs associated with peel_iters_epilogue. */
4751 if (peel_iters_epilogue
)
4752 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo
), j
, si
)
4754 (void) add_stmt_cost (target_cost_data
,
4755 si
->count
* peel_iters_epilogue
, si
->kind
,
4756 si
->stmt_info
, si
->node
, si
->vectype
,
4757 si
->misalign
, vect_epilogue
);
4760 /* Add possible cond_branch_taken/cond_branch_not_taken cost. */
4762 if (prologue_need_br_taken_cost
)
4763 (void) add_stmt_cost (target_cost_data
, 1, cond_branch_taken
,
4766 if (prologue_need_br_not_taken_cost
)
4767 (void) add_stmt_cost (target_cost_data
, 1,
4768 cond_branch_not_taken
, vect_prologue
);
4770 if (epilogue_need_br_taken_cost
)
4771 (void) add_stmt_cost (target_cost_data
, 1, cond_branch_taken
,
4774 if (epilogue_need_br_not_taken_cost
)
4775 (void) add_stmt_cost (target_cost_data
, 1,
4776 cond_branch_not_taken
, vect_epilogue
);
4778 /* Take care of special costs for rgroup controls of partial vectors. */
4779 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo
)
4780 && (LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo
)
4781 == vect_partial_vectors_avx512
))
4783 /* Calculate how many masks we need to generate. */
4784 unsigned int num_masks
= 0;
4785 bool need_saturation
= false;
4786 for (auto rgm
: LOOP_VINFO_MASKS (loop_vinfo
).rgc_vec
)
4789 unsigned nvectors
= rgm
.factor
;
4790 num_masks
+= nvectors
;
4791 if (TYPE_PRECISION (TREE_TYPE (rgm
.compare_type
))
4792 < TYPE_PRECISION (LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo
)))
4793 need_saturation
= true;
4796 /* ??? The target isn't able to identify the costs below as
4797 producing masks so it cannot penaltize cases where we'd run
4798 out of mask registers for example. */
4800 /* ??? We are also failing to account for smaller vector masks
4801 we generate by splitting larger masks in vect_get_loop_mask. */
4803 /* In the worst case, we need to generate each mask in the prologue
4804 and in the loop body. We need one splat per group and one
4807 Sometimes the prologue mask will fold to a constant,
4808 so the actual prologue cost might be smaller. However, it's
4809 simpler and safer to use the worst-case cost; if this ends up
4810 being the tie-breaker between vectorizing or not, then it's
4811 probably better not to vectorize. */
4812 (void) add_stmt_cost (target_cost_data
,
4814 + LOOP_VINFO_MASKS (loop_vinfo
).rgc_vec
.length (),
4815 vector_stmt
, NULL
, NULL
, NULL_TREE
, 0,
4817 (void) add_stmt_cost (target_cost_data
,
4819 + LOOP_VINFO_MASKS (loop_vinfo
).rgc_vec
.length (),
4820 vector_stmt
, NULL
, NULL
, NULL_TREE
, 0, vect_body
);
4822 /* When we need saturation we need it both in the prologue and
4824 if (need_saturation
)
4826 (void) add_stmt_cost (target_cost_data
, 1, scalar_stmt
,
4827 NULL
, NULL
, NULL_TREE
, 0, vect_prologue
);
4828 (void) add_stmt_cost (target_cost_data
, 1, scalar_stmt
,
4829 NULL
, NULL
, NULL_TREE
, 0, vect_body
);
4832 else if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo
)
4833 && (LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo
)
4834 == vect_partial_vectors_while_ult
))
4836 /* Calculate how many masks we need to generate. */
4837 unsigned int num_masks
= 0;
4838 rgroup_controls
*rgm
;
4839 unsigned int num_vectors_m1
;
4840 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo
).rgc_vec
,
4841 num_vectors_m1
, rgm
)
4843 num_masks
+= num_vectors_m1
+ 1;
4844 gcc_assert (num_masks
> 0);
4846 /* In the worst case, we need to generate each mask in the prologue
4847 and in the loop body. One of the loop body mask instructions
4848 replaces the comparison in the scalar loop, and since we don't
4849 count the scalar comparison against the scalar body, we shouldn't
4850 count that vector instruction against the vector body either.
4852 Sometimes we can use unpacks instead of generating prologue
4853 masks and sometimes the prologue mask will fold to a constant,
4854 so the actual prologue cost might be smaller. However, it's
4855 simpler and safer to use the worst-case cost; if this ends up
4856 being the tie-breaker between vectorizing or not, then it's
4857 probably better not to vectorize. */
4858 (void) add_stmt_cost (target_cost_data
, num_masks
,
4859 vector_stmt
, NULL
, NULL
, NULL_TREE
, 0,
4861 (void) add_stmt_cost (target_cost_data
, num_masks
- 1,
4862 vector_stmt
, NULL
, NULL
, NULL_TREE
, 0,
4865 else if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo
))
4867 /* Referring to the functions vect_set_loop_condition_partial_vectors
4868 and vect_set_loop_controls_directly, we need to generate each
4869 length in the prologue and in the loop body if required. Although
4870 there are some possible optimizations, we consider the worst case
4873 bool niters_known_p
= LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo
);
4874 signed char partial_load_store_bias
4875 = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo
);
4877 = (!LOOP_VINFO_EPILOGUE_P (loop_vinfo
)
4878 && !vect_known_niters_smaller_than_vf (loop_vinfo
));
4880 /* Calculate how many statements to be added. */
4881 unsigned int prologue_stmts
= 0;
4882 unsigned int body_stmts
= 0;
4884 rgroup_controls
*rgc
;
4885 unsigned int num_vectors_m1
;
4886 FOR_EACH_VEC_ELT (LOOP_VINFO_LENS (loop_vinfo
), num_vectors_m1
, rgc
)
4889 /* May need one SHIFT for nitems_total computation. */
4890 unsigned nitems
= rgc
->max_nscalars_per_iter
* rgc
->factor
;
4891 if (nitems
!= 1 && !niters_known_p
)
4892 prologue_stmts
+= 1;
4894 /* May need one MAX and one MINUS for wrap around. */
4895 if (vect_rgroup_iv_might_wrap_p (loop_vinfo
, rgc
))
4896 prologue_stmts
+= 2;
4898 /* Need one MAX and one MINUS for each batch limit excepting for
4900 prologue_stmts
+= num_vectors_m1
* 2;
4902 unsigned int num_vectors
= num_vectors_m1
+ 1;
4904 /* Need to set up lengths in prologue, only one MIN required
4905 for each since start index is zero. */
4906 prologue_stmts
+= num_vectors
;
4908 /* If we have a non-zero partial load bias, we need one PLUS
4909 to adjust the load length. */
4910 if (partial_load_store_bias
!= 0)
4913 unsigned int length_update_cost
= 0;
4914 if (LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo
))
4915 /* For decrement IV style, Each only need a single SELECT_VL
4916 or MIN since beginning to calculate the number of elements
4917 need to be processed in current iteration. */
4918 length_update_cost
= 1;
4920 /* For increment IV stype, Each may need two MINs and one MINUS to
4921 update lengths in body for next iteration. */
4922 length_update_cost
= 3;
4925 body_stmts
+= length_update_cost
* num_vectors
;
4928 (void) add_stmt_cost (target_cost_data
, prologue_stmts
,
4929 scalar_stmt
, vect_prologue
);
4930 (void) add_stmt_cost (target_cost_data
, body_stmts
,
4931 scalar_stmt
, vect_body
);
4934 /* FORNOW: The scalar outside cost is incremented in one of the
4937 1. The vectorizer checks for alignment and aliasing and generates
4938 a condition that allows dynamic vectorization. A cost model
4939 check is ANDED with the versioning condition. Hence scalar code
4940 path now has the added cost of the versioning check.
4942 if (cost > th & versioning_check)
4945 Hence run-time scalar is incremented by not-taken branch cost.
4947 2. The vectorizer then checks if a prologue is required. If the
4948 cost model check was not done before during versioning, it has to
4949 be done before the prologue check.
4952 prologue = scalar_iters
4957 if (prologue == num_iters)
4960 Hence the run-time scalar cost is incremented by a taken branch,
4961 plus a not-taken branch, plus a taken branch cost.
4963 3. The vectorizer then checks if an epilogue is required. If the
4964 cost model check was not done before during prologue check, it
4965 has to be done with the epilogue check.
4971 if (prologue == num_iters)
4974 if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
4977 Hence the run-time scalar cost should be incremented by 2 taken
4980 TODO: The back end may reorder the BBS's differently and reverse
4981 conditions/branch directions. Change the estimates below to
4982 something more reasonable. */
4984 /* If the number of iterations is known and we do not do versioning, we can
4985 decide whether to vectorize at compile time. Hence the scalar version
4986 do not carry cost model guard costs. */
4987 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo
)
4988 || LOOP_REQUIRES_VERSIONING (loop_vinfo
))
4990 /* Cost model check occurs at versioning. */
4991 if (LOOP_REQUIRES_VERSIONING (loop_vinfo
))
4992 scalar_outside_cost
+= vect_get_stmt_cost (cond_branch_not_taken
);
4995 /* Cost model check occurs at prologue generation. */
4996 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo
) < 0)
4997 scalar_outside_cost
+= 2 * vect_get_stmt_cost (cond_branch_taken
)
4998 + vect_get_stmt_cost (cond_branch_not_taken
);
4999 /* Cost model check occurs at epilogue generation. */
5001 scalar_outside_cost
+= 2 * vect_get_stmt_cost (cond_branch_taken
);
5005 /* Complete the target-specific cost calculations. */
5006 finish_cost (loop_vinfo
->vector_costs
, loop_vinfo
->scalar_costs
,
5007 &vec_prologue_cost
, &vec_inside_cost
, &vec_epilogue_cost
,
5008 suggested_unroll_factor
);
5010 if (suggested_unroll_factor
&& *suggested_unroll_factor
> 1
5011 && LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo
) != MAX_VECTORIZATION_FACTOR
5012 && !known_le (LOOP_VINFO_VECT_FACTOR (loop_vinfo
) *
5013 *suggested_unroll_factor
,
5014 LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo
)))
5016 if (dump_enabled_p ())
5017 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
5018 "can't unroll as unrolled vectorization factor larger"
5019 " than maximum vectorization factor: "
5020 HOST_WIDE_INT_PRINT_UNSIGNED
"\n",
5021 LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo
));
5022 *suggested_unroll_factor
= 1;
5025 vec_outside_cost
= (int)(vec_prologue_cost
+ vec_epilogue_cost
);
5027 if (dump_enabled_p ())
5029 dump_printf_loc (MSG_NOTE
, vect_location
, "Cost model analysis: \n");
5030 dump_printf (MSG_NOTE
, " Vector inside of loop cost: %d\n",
5032 dump_printf (MSG_NOTE
, " Vector prologue cost: %d\n",
5034 dump_printf (MSG_NOTE
, " Vector epilogue cost: %d\n",
5036 dump_printf (MSG_NOTE
, " Scalar iteration cost: %d\n",
5037 scalar_single_iter_cost
);
5038 dump_printf (MSG_NOTE
, " Scalar outside cost: %d\n",
5039 scalar_outside_cost
);
5040 dump_printf (MSG_NOTE
, " Vector outside cost: %d\n",
5042 dump_printf (MSG_NOTE
, " prologue iterations: %d\n",
5043 peel_iters_prologue
);
5044 dump_printf (MSG_NOTE
, " epilogue iterations: %d\n",
5045 peel_iters_epilogue
);
5048 /* Calculate number of iterations required to make the vector version
5049 profitable, relative to the loop bodies only. The following condition
5051 SIC * niters + SOC > VIC * ((niters - NPEEL) / VF) + VOC
5053 SIC = scalar iteration cost, VIC = vector iteration cost,
5054 VOC = vector outside cost, VF = vectorization factor,
5055 NPEEL = prologue iterations + epilogue iterations,
5056 SOC = scalar outside cost for run time cost model check. */
5058 int saving_per_viter
= (scalar_single_iter_cost
* assumed_vf
5060 if (saving_per_viter
<= 0)
5062 if (LOOP_VINFO_LOOP (loop_vinfo
)->force_vectorize
)
5063 warning_at (vect_location
.get_location_t (), OPT_Wopenmp_simd
,
5064 "vectorization did not happen for a simd loop");
5066 if (dump_enabled_p ())
5067 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
5068 "cost model: the vector iteration cost = %d "
5069 "divided by the scalar iteration cost = %d "
5070 "is greater or equal to the vectorization factor = %d"
5072 vec_inside_cost
, scalar_single_iter_cost
, assumed_vf
);
5073 *ret_min_profitable_niters
= -1;
5074 *ret_min_profitable_estimate
= -1;
5078 /* ??? The "if" arm is written to handle all cases; see below for what
5079 we would do for !LOOP_VINFO_USING_PARTIAL_VECTORS_P. */
5080 if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo
))
5082 /* Rewriting the condition above in terms of the number of
5083 vector iterations (vniters) rather than the number of
5084 scalar iterations (niters) gives:
5086 SIC * (vniters * VF + NPEEL) + SOC > VIC * vniters + VOC
5088 <==> vniters * (SIC * VF - VIC) > VOC - SIC * NPEEL - SOC
5090 For integer N, X and Y when X > 0:
5092 N * X > Y <==> N >= (Y /[floor] X) + 1. */
5093 int outside_overhead
= (vec_outside_cost
5094 - scalar_single_iter_cost
* peel_iters_prologue
5095 - scalar_single_iter_cost
* peel_iters_epilogue
5096 - scalar_outside_cost
);
5097 /* We're only interested in cases that require at least one
5098 vector iteration. */
5099 int min_vec_niters
= 1;
5100 if (outside_overhead
> 0)
5101 min_vec_niters
= outside_overhead
/ saving_per_viter
+ 1;
5103 if (dump_enabled_p ())
5104 dump_printf (MSG_NOTE
, " Minimum number of vector iterations: %d\n",
5107 if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo
))
5109 /* Now that we know the minimum number of vector iterations,
5110 find the minimum niters for which the scalar cost is larger:
5112 SIC * niters > VIC * vniters + VOC - SOC
5114 We know that the minimum niters is no more than
5115 vniters * VF + NPEEL, but it might be (and often is) less
5116 than that if a partial vector iteration is cheaper than the
5117 equivalent scalar code. */
5118 int threshold
= (vec_inside_cost
* min_vec_niters
5120 - scalar_outside_cost
);
5122 min_profitable_iters
= 1;
5124 min_profitable_iters
= threshold
/ scalar_single_iter_cost
+ 1;
5127 /* Convert the number of vector iterations into a number of
5128 scalar iterations. */
5129 min_profitable_iters
= (min_vec_niters
* assumed_vf
5130 + peel_iters_prologue
5131 + peel_iters_epilogue
);
5135 min_profitable_iters
= ((vec_outside_cost
- scalar_outside_cost
)
5137 - vec_inside_cost
* peel_iters_prologue
5138 - vec_inside_cost
* peel_iters_epilogue
);
5139 if (min_profitable_iters
<= 0)
5140 min_profitable_iters
= 0;
5143 min_profitable_iters
/= saving_per_viter
;
5145 if ((scalar_single_iter_cost
* assumed_vf
* min_profitable_iters
)
5146 <= (((int) vec_inside_cost
* min_profitable_iters
)
5147 + (((int) vec_outside_cost
- scalar_outside_cost
)
5149 min_profitable_iters
++;
5153 if (dump_enabled_p ())
5154 dump_printf (MSG_NOTE
,
5155 " Calculated minimum iters for profitability: %d\n",
5156 min_profitable_iters
);
5158 if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo
)
5159 && min_profitable_iters
< (assumed_vf
+ peel_iters_prologue
))
5160 /* We want the vectorized loop to execute at least once. */
5161 min_profitable_iters
= assumed_vf
+ peel_iters_prologue
;
5162 else if (min_profitable_iters
< peel_iters_prologue
)
5163 /* For LOOP_VINFO_USING_PARTIAL_VECTORS_P, we need to ensure the
5164 vectorized loop executes at least once. */
5165 min_profitable_iters
= peel_iters_prologue
;
5167 if (dump_enabled_p ())
5168 dump_printf_loc (MSG_NOTE
, vect_location
,
5169 " Runtime profitability threshold = %d\n",
5170 min_profitable_iters
);
5172 *ret_min_profitable_niters
= min_profitable_iters
;
5174 /* Calculate number of iterations required to make the vector version
5175 profitable, relative to the loop bodies only.
5177 Non-vectorized variant is SIC * niters and it must win over vector
5178 variant on the expected loop trip count. The following condition must hold true:
5179 SIC * niters > VIC * ((niters - NPEEL) / VF) + VOC + SOC */
5181 if (vec_outside_cost
<= 0)
5182 min_profitable_estimate
= 0;
5183 /* ??? This "else if" arm is written to handle all cases; see below for
5184 what we would do for !LOOP_VINFO_USING_PARTIAL_VECTORS_P. */
5185 else if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo
))
5187 /* This is a repeat of the code above, but with + SOC rather
5189 int outside_overhead
= (vec_outside_cost
5190 - scalar_single_iter_cost
* peel_iters_prologue
5191 - scalar_single_iter_cost
* peel_iters_epilogue
5192 + scalar_outside_cost
);
5193 int min_vec_niters
= 1;
5194 if (outside_overhead
> 0)
5195 min_vec_niters
= outside_overhead
/ saving_per_viter
+ 1;
5197 if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo
))
5199 int threshold
= (vec_inside_cost
* min_vec_niters
5201 + scalar_outside_cost
);
5202 min_profitable_estimate
= threshold
/ scalar_single_iter_cost
+ 1;
5205 min_profitable_estimate
= (min_vec_niters
* assumed_vf
5206 + peel_iters_prologue
5207 + peel_iters_epilogue
);
5211 min_profitable_estimate
= ((vec_outside_cost
+ scalar_outside_cost
)
5213 - vec_inside_cost
* peel_iters_prologue
5214 - vec_inside_cost
* peel_iters_epilogue
)
5215 / ((scalar_single_iter_cost
* assumed_vf
)
5218 min_profitable_estimate
= MAX (min_profitable_estimate
, min_profitable_iters
);
5219 if (dump_enabled_p ())
5220 dump_printf_loc (MSG_NOTE
, vect_location
,
5221 " Static estimate profitability threshold = %d\n",
5222 min_profitable_estimate
);
5224 *ret_min_profitable_estimate
= min_profitable_estimate
;
5227 /* Writes into SEL a mask for a vec_perm, equivalent to a vec_shr by OFFSET
5228 vector elements (not bits) for a vector with NELT elements. */
5230 calc_vec_perm_mask_for_shift (unsigned int offset
, unsigned int nelt
,
5231 vec_perm_builder
*sel
)
5233 /* The encoding is a single stepped pattern. Any wrap-around is handled
5234 by vec_perm_indices. */
5235 sel
->new_vector (nelt
, 1, 3);
5236 for (unsigned int i
= 0; i
< 3; i
++)
5237 sel
->quick_push (i
+ offset
);
5240 /* Checks whether the target supports whole-vector shifts for vectors of mode
5241 MODE. This is the case if _either_ the platform handles vec_shr_optab, _or_
5242 it supports vec_perm_const with masks for all necessary shift amounts. */
5244 have_whole_vector_shift (machine_mode mode
)
5246 if (optab_handler (vec_shr_optab
, mode
) != CODE_FOR_nothing
)
5249 /* Variable-length vectors should be handled via the optab. */
5251 if (!GET_MODE_NUNITS (mode
).is_constant (&nelt
))
5254 vec_perm_builder sel
;
5255 vec_perm_indices indices
;
5256 for (unsigned int i
= nelt
/ 2; i
>= 1; i
/= 2)
5258 calc_vec_perm_mask_for_shift (i
, nelt
, &sel
);
5259 indices
.new_vector (sel
, 2, nelt
);
5260 if (!can_vec_perm_const_p (mode
, mode
, indices
, false))
5266 /* Return true if (a) STMT_INFO is a DOT_PROD_EXPR reduction whose
5267 multiplication operands have differing signs and (b) we intend
5268 to emulate the operation using a series of signed DOT_PROD_EXPRs.
5269 See vect_emulate_mixed_dot_prod for the actual sequence used. */
5272 vect_is_emulated_mixed_dot_prod (loop_vec_info loop_vinfo
,
5273 stmt_vec_info stmt_info
)
5275 gassign
*assign
= dyn_cast
<gassign
*> (stmt_info
->stmt
);
5276 if (!assign
|| gimple_assign_rhs_code (assign
) != DOT_PROD_EXPR
)
5279 tree rhs1
= gimple_assign_rhs1 (assign
);
5280 tree rhs2
= gimple_assign_rhs2 (assign
);
5281 if (TYPE_SIGN (TREE_TYPE (rhs1
)) == TYPE_SIGN (TREE_TYPE (rhs2
)))
5284 stmt_vec_info reduc_info
= info_for_reduction (loop_vinfo
, stmt_info
);
5285 gcc_assert (reduc_info
->is_reduc_info
);
5286 return !directly_supported_p (DOT_PROD_EXPR
,
5287 STMT_VINFO_REDUC_VECTYPE_IN (reduc_info
),
5288 optab_vector_mixed_sign
);
5291 /* TODO: Close dependency between vect_model_*_cost and vectorizable_*
5292 functions. Design better to avoid maintenance issues. */
5294 /* Function vect_model_reduction_cost.
5296 Models cost for a reduction operation, including the vector ops
5297 generated within the strip-mine loop in some cases, the initial
5298 definition before the loop, and the epilogue code that must be generated. */
5301 vect_model_reduction_cost (loop_vec_info loop_vinfo
,
5302 stmt_vec_info stmt_info
, internal_fn reduc_fn
,
5303 vect_reduction_type reduction_type
,
5304 int ncopies
, stmt_vector_for_cost
*cost_vec
)
5306 int prologue_cost
= 0, epilogue_cost
= 0, inside_cost
= 0;
5309 class loop
*loop
= NULL
;
5312 loop
= LOOP_VINFO_LOOP (loop_vinfo
);
5314 /* Condition reductions generate two reductions in the loop. */
5315 if (reduction_type
== COND_REDUCTION
)
5318 vectype
= STMT_VINFO_VECTYPE (stmt_info
);
5319 mode
= TYPE_MODE (vectype
);
5320 stmt_vec_info orig_stmt_info
= vect_orig_stmt (stmt_info
);
5323 if (!gimple_extract_op (orig_stmt_info
->stmt
, &op
))
5326 bool emulated_mixed_dot_prod
5327 = vect_is_emulated_mixed_dot_prod (loop_vinfo
, stmt_info
);
5328 if (reduction_type
== EXTRACT_LAST_REDUCTION
)
5329 /* No extra instructions are needed in the prologue. The loop body
5330 operations are costed in vectorizable_condition. */
5332 else if (reduction_type
== FOLD_LEFT_REDUCTION
)
5334 /* No extra instructions needed in the prologue. */
5337 if (reduc_fn
!= IFN_LAST
)
5338 /* Count one reduction-like operation per vector. */
5339 inside_cost
= record_stmt_cost (cost_vec
, ncopies
, vec_to_scalar
,
5340 stmt_info
, 0, vect_body
);
5343 /* Use NELEMENTS extracts and NELEMENTS scalar ops. */
5344 unsigned int nelements
= ncopies
* vect_nunits_for_cost (vectype
);
5345 inside_cost
= record_stmt_cost (cost_vec
, nelements
,
5346 vec_to_scalar
, stmt_info
, 0,
5348 inside_cost
+= record_stmt_cost (cost_vec
, nelements
,
5349 scalar_stmt
, stmt_info
, 0,
5355 /* Add in the cost of the initial definitions. */
5357 if (reduction_type
== COND_REDUCTION
)
5358 /* For cond reductions we have four vectors: initial index, step,
5359 initial result of the data reduction, initial value of the index
5362 else if (emulated_mixed_dot_prod
)
5363 /* We need the initial reduction value and two invariants:
5364 one that contains the minimum signed value and one that
5365 contains half of its negative. */
5369 prologue_cost
+= record_stmt_cost (cost_vec
, prologue_stmts
,
5370 scalar_to_vec
, stmt_info
, 0,
5374 /* Determine cost of epilogue code.
5376 We have a reduction operator that will reduce the vector in one statement.
5377 Also requires scalar extract. */
5379 if (!loop
|| !nested_in_vect_loop_p (loop
, orig_stmt_info
))
5381 if (reduc_fn
!= IFN_LAST
)
5383 if (reduction_type
== COND_REDUCTION
)
5385 /* An EQ stmt and an COND_EXPR stmt. */
5386 epilogue_cost
+= record_stmt_cost (cost_vec
, 2,
5387 vector_stmt
, stmt_info
, 0,
5389 /* Reduction of the max index and a reduction of the found
5391 epilogue_cost
+= record_stmt_cost (cost_vec
, 2,
5392 vec_to_scalar
, stmt_info
, 0,
5394 /* A broadcast of the max value. */
5395 epilogue_cost
+= record_stmt_cost (cost_vec
, 1,
5396 scalar_to_vec
, stmt_info
, 0,
5401 epilogue_cost
+= record_stmt_cost (cost_vec
, 1, vector_stmt
,
5402 stmt_info
, 0, vect_epilogue
);
5403 epilogue_cost
+= record_stmt_cost (cost_vec
, 1,
5404 vec_to_scalar
, stmt_info
, 0,
5408 else if (reduction_type
== COND_REDUCTION
)
5410 unsigned estimated_nunits
= vect_nunits_for_cost (vectype
);
5411 /* Extraction of scalar elements. */
5412 epilogue_cost
+= record_stmt_cost (cost_vec
,
5413 2 * estimated_nunits
,
5414 vec_to_scalar
, stmt_info
, 0,
5416 /* Scalar max reductions via COND_EXPR / MAX_EXPR. */
5417 epilogue_cost
+= record_stmt_cost (cost_vec
,
5418 2 * estimated_nunits
- 3,
5419 scalar_stmt
, stmt_info
, 0,
5422 else if (reduction_type
== EXTRACT_LAST_REDUCTION
5423 || reduction_type
== FOLD_LEFT_REDUCTION
)
5424 /* No extra instructions need in the epilogue. */
5428 int vec_size_in_bits
= tree_to_uhwi (TYPE_SIZE (vectype
));
5429 tree bitsize
= TYPE_SIZE (op
.type
);
5430 int element_bitsize
= tree_to_uhwi (bitsize
);
5431 int nelements
= vec_size_in_bits
/ element_bitsize
;
5433 if (op
.code
== COND_EXPR
)
5436 /* We have a whole vector shift available. */
5437 if (VECTOR_MODE_P (mode
)
5438 && directly_supported_p (op
.code
, vectype
)
5439 && have_whole_vector_shift (mode
))
5441 /* Final reduction via vector shifts and the reduction operator.
5442 Also requires scalar extract. */
5443 epilogue_cost
+= record_stmt_cost (cost_vec
,
5444 exact_log2 (nelements
) * 2,
5445 vector_stmt
, stmt_info
, 0,
5447 epilogue_cost
+= record_stmt_cost (cost_vec
, 1,
5448 vec_to_scalar
, stmt_info
, 0,
5452 /* Use extracts and reduction op for final reduction. For N
5453 elements, we have N extracts and N-1 reduction ops. */
5454 epilogue_cost
+= record_stmt_cost (cost_vec
,
5455 nelements
+ nelements
- 1,
5456 vector_stmt
, stmt_info
, 0,
5461 if (dump_enabled_p ())
5462 dump_printf (MSG_NOTE
,
5463 "vect_model_reduction_cost: inside_cost = %d, "
5464 "prologue_cost = %d, epilogue_cost = %d .\n", inside_cost
,
5465 prologue_cost
, epilogue_cost
);
5468 /* SEQ is a sequence of instructions that initialize the reduction
5469 described by REDUC_INFO. Emit them in the appropriate place. */
5472 vect_emit_reduction_init_stmts (loop_vec_info loop_vinfo
,
5473 stmt_vec_info reduc_info
, gimple
*seq
)
5475 if (reduc_info
->reused_accumulator
)
5477 /* When reusing an accumulator from the main loop, we only need
5478 initialization instructions if the main loop can be skipped.
5479 In that case, emit the initialization instructions at the end
5480 of the guard block that does the skip. */
5481 edge skip_edge
= loop_vinfo
->skip_main_loop_edge
;
5482 gcc_assert (skip_edge
);
5483 gimple_stmt_iterator gsi
= gsi_last_bb (skip_edge
->src
);
5484 gsi_insert_seq_before (&gsi
, seq
, GSI_SAME_STMT
);
5488 /* The normal case: emit the initialization instructions on the
5490 class loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
5491 gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop
), seq
);
5495 /* Function get_initial_def_for_reduction
5498 REDUC_INFO - the info_for_reduction
5499 INIT_VAL - the initial value of the reduction variable
5500 NEUTRAL_OP - a value that has no effect on the reduction, as per
5501 neutral_op_for_reduction
5504 Return a vector variable, initialized according to the operation that
5505 STMT_VINFO performs. This vector will be used as the initial value
5506 of the vector of partial results.
5508 The value we need is a vector in which element 0 has value INIT_VAL
5509 and every other element has value NEUTRAL_OP. */
5512 get_initial_def_for_reduction (loop_vec_info loop_vinfo
,
5513 stmt_vec_info reduc_info
,
5514 tree init_val
, tree neutral_op
)
5516 class loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
5517 tree scalar_type
= TREE_TYPE (init_val
);
5518 tree vectype
= get_vectype_for_scalar_type (loop_vinfo
, scalar_type
);
5520 gimple_seq stmts
= NULL
;
5522 gcc_assert (vectype
);
5524 gcc_assert (POINTER_TYPE_P (scalar_type
) || INTEGRAL_TYPE_P (scalar_type
)
5525 || SCALAR_FLOAT_TYPE_P (scalar_type
));
5527 gcc_assert (nested_in_vect_loop_p (loop
, reduc_info
)
5528 || loop
== (gimple_bb (reduc_info
->stmt
))->loop_father
);
5530 if (operand_equal_p (init_val
, neutral_op
))
5532 /* If both elements are equal then the vector described above is
5534 neutral_op
= gimple_convert (&stmts
, TREE_TYPE (vectype
), neutral_op
);
5535 init_def
= gimple_build_vector_from_val (&stmts
, vectype
, neutral_op
);
5539 neutral_op
= gimple_convert (&stmts
, TREE_TYPE (vectype
), neutral_op
);
5540 init_val
= gimple_convert (&stmts
, TREE_TYPE (vectype
), init_val
);
5541 if (!TYPE_VECTOR_SUBPARTS (vectype
).is_constant ())
5543 /* Construct a splat of NEUTRAL_OP and insert INIT_VAL into
5545 init_def
= gimple_build_vector_from_val (&stmts
, vectype
,
5547 init_def
= gimple_build (&stmts
, CFN_VEC_SHL_INSERT
,
5548 vectype
, init_def
, init_val
);
5552 /* Build {INIT_VAL, NEUTRAL_OP, NEUTRAL_OP, ...}. */
5553 tree_vector_builder
elts (vectype
, 1, 2);
5554 elts
.quick_push (init_val
);
5555 elts
.quick_push (neutral_op
);
5556 init_def
= gimple_build_vector (&stmts
, &elts
);
5561 vect_emit_reduction_init_stmts (loop_vinfo
, reduc_info
, stmts
);
5565 /* Get at the initial defs for the reduction PHIs for REDUC_INFO,
5566 which performs a reduction involving GROUP_SIZE scalar statements.
5567 NUMBER_OF_VECTORS is the number of vector defs to create. If NEUTRAL_OP
5568 is nonnull, introducing extra elements of that value will not change the
5572 get_initial_defs_for_reduction (loop_vec_info loop_vinfo
,
5573 stmt_vec_info reduc_info
,
5574 vec
<tree
> *vec_oprnds
,
5575 unsigned int number_of_vectors
,
5576 unsigned int group_size
, tree neutral_op
)
5578 vec
<tree
> &initial_values
= reduc_info
->reduc_initial_values
;
5579 unsigned HOST_WIDE_INT nunits
;
5580 unsigned j
, number_of_places_left_in_vector
;
5581 tree vector_type
= STMT_VINFO_VECTYPE (reduc_info
);
5584 gcc_assert (group_size
== initial_values
.length () || neutral_op
);
5586 /* NUMBER_OF_COPIES is the number of times we need to use the same values in
5587 created vectors. It is greater than 1 if unrolling is performed.
5589 For example, we have two scalar operands, s1 and s2 (e.g., group of
5590 strided accesses of size two), while NUNITS is four (i.e., four scalars
5591 of this type can be packed in a vector). The output vector will contain
5592 two copies of each scalar operand: {s1, s2, s1, s2}. (NUMBER_OF_COPIES
5595 If REDUC_GROUP_SIZE > NUNITS, the scalars will be split into several
5596 vectors containing the operands.
5598 For example, NUNITS is four as before, and the group size is 8
5599 (s1, s2, ..., s8). We will create two vectors {s1, s2, s3, s4} and
5600 {s5, s6, s7, s8}. */
5602 if (!TYPE_VECTOR_SUBPARTS (vector_type
).is_constant (&nunits
))
5603 nunits
= group_size
;
5605 number_of_places_left_in_vector
= nunits
;
5606 bool constant_p
= true;
5607 tree_vector_builder
elts (vector_type
, nunits
, 1);
5608 elts
.quick_grow (nunits
);
5609 gimple_seq ctor_seq
= NULL
;
5610 for (j
= 0; j
< nunits
* number_of_vectors
; ++j
)
5615 /* Get the def before the loop. In reduction chain we have only
5616 one initial value. Else we have as many as PHIs in the group. */
5617 if (i
>= initial_values
.length () || (j
> i
&& neutral_op
))
5620 op
= initial_values
[i
];
5622 /* Create 'vect_ = {op0,op1,...,opn}'. */
5623 number_of_places_left_in_vector
--;
5624 elts
[nunits
- number_of_places_left_in_vector
- 1] = op
;
5625 if (!CONSTANT_CLASS_P (op
))
5628 if (number_of_places_left_in_vector
== 0)
5631 if (constant_p
&& !neutral_op
5632 ? multiple_p (TYPE_VECTOR_SUBPARTS (vector_type
), nunits
)
5633 : known_eq (TYPE_VECTOR_SUBPARTS (vector_type
), nunits
))
5634 /* Build the vector directly from ELTS. */
5635 init
= gimple_build_vector (&ctor_seq
, &elts
);
5636 else if (neutral_op
)
5638 /* Build a vector of the neutral value and shift the
5639 other elements into place. */
5640 init
= gimple_build_vector_from_val (&ctor_seq
, vector_type
,
5643 while (k
> 0 && elts
[k
- 1] == neutral_op
)
5648 init
= gimple_build (&ctor_seq
, CFN_VEC_SHL_INSERT
,
5649 vector_type
, init
, elts
[k
]);
5654 /* First time round, duplicate ELTS to fill the
5655 required number of vectors. */
5656 duplicate_and_interleave (loop_vinfo
, &ctor_seq
, vector_type
,
5657 elts
, number_of_vectors
, *vec_oprnds
);
5660 vec_oprnds
->quick_push (init
);
5662 number_of_places_left_in_vector
= nunits
;
5663 elts
.new_vector (vector_type
, nunits
, 1);
5664 elts
.quick_grow (nunits
);
5668 if (ctor_seq
!= NULL
)
5669 vect_emit_reduction_init_stmts (loop_vinfo
, reduc_info
, ctor_seq
);
5672 /* For a statement STMT_INFO taking part in a reduction operation return
5673 the stmt_vec_info the meta information is stored on. */
5676 info_for_reduction (vec_info
*vinfo
, stmt_vec_info stmt_info
)
5678 stmt_info
= vect_orig_stmt (stmt_info
);
5679 gcc_assert (STMT_VINFO_REDUC_DEF (stmt_info
));
5680 if (!is_a
<gphi
*> (stmt_info
->stmt
)
5681 || !VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info
)))
5682 stmt_info
= STMT_VINFO_REDUC_DEF (stmt_info
);
5683 gphi
*phi
= as_a
<gphi
*> (stmt_info
->stmt
);
5684 if (STMT_VINFO_DEF_TYPE (stmt_info
) == vect_double_reduction_def
)
5686 if (gimple_phi_num_args (phi
) == 1)
5687 stmt_info
= STMT_VINFO_REDUC_DEF (stmt_info
);
5689 else if (STMT_VINFO_DEF_TYPE (stmt_info
) == vect_nested_cycle
)
5691 stmt_vec_info info
= vinfo
->lookup_def (vect_phi_initial_value (phi
));
5692 if (info
&& STMT_VINFO_DEF_TYPE (info
) == vect_double_reduction_def
)
5698 /* See if LOOP_VINFO is an epilogue loop whose main loop had a reduction that
5699 REDUC_INFO can build on. Adjust REDUC_INFO and return true if so, otherwise
5703 vect_find_reusable_accumulator (loop_vec_info loop_vinfo
,
5704 stmt_vec_info reduc_info
)
5706 loop_vec_info main_loop_vinfo
= LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo
);
5707 if (!main_loop_vinfo
)
5710 if (STMT_VINFO_REDUC_TYPE (reduc_info
) != TREE_CODE_REDUCTION
)
5713 unsigned int num_phis
= reduc_info
->reduc_initial_values
.length ();
5714 auto_vec
<tree
, 16> main_loop_results (num_phis
);
5715 auto_vec
<tree
, 16> initial_values (num_phis
);
5716 if (edge main_loop_edge
= loop_vinfo
->main_loop_edge
)
5718 /* The epilogue loop can be entered either from the main loop or
5719 from an earlier guard block. */
5720 edge skip_edge
= loop_vinfo
->skip_main_loop_edge
;
5721 for (tree incoming_value
: reduc_info
->reduc_initial_values
)
5725 INCOMING_VALUE = phi<MAIN_LOOP_RESULT(main loop),
5726 INITIAL_VALUE(guard block)>. */
5727 gcc_assert (TREE_CODE (incoming_value
) == SSA_NAME
);
5729 gphi
*phi
= as_a
<gphi
*> (SSA_NAME_DEF_STMT (incoming_value
));
5730 gcc_assert (gimple_bb (phi
) == main_loop_edge
->dest
);
5732 tree from_main_loop
= PHI_ARG_DEF_FROM_EDGE (phi
, main_loop_edge
);
5733 tree from_skip
= PHI_ARG_DEF_FROM_EDGE (phi
, skip_edge
);
5735 main_loop_results
.quick_push (from_main_loop
);
5736 initial_values
.quick_push (from_skip
);
5740 /* The main loop dominates the epilogue loop. */
5741 main_loop_results
.splice (reduc_info
->reduc_initial_values
);
5743 /* See if the main loop has the kind of accumulator we need. */
5744 vect_reusable_accumulator
*accumulator
5745 = main_loop_vinfo
->reusable_accumulators
.get (main_loop_results
[0]);
5747 || num_phis
!= accumulator
->reduc_info
->reduc_scalar_results
.length ()
5748 || !std::equal (main_loop_results
.begin (), main_loop_results
.end (),
5749 accumulator
->reduc_info
->reduc_scalar_results
.begin ()))
5752 /* Handle the case where we can reduce wider vectors to narrower ones. */
5753 tree vectype
= STMT_VINFO_VECTYPE (reduc_info
);
5754 tree old_vectype
= TREE_TYPE (accumulator
->reduc_input
);
5755 unsigned HOST_WIDE_INT m
;
5756 if (!constant_multiple_p (TYPE_VECTOR_SUBPARTS (old_vectype
),
5757 TYPE_VECTOR_SUBPARTS (vectype
), &m
))
5759 /* Check the intermediate vector types and operations are available. */
5760 tree prev_vectype
= old_vectype
;
5761 poly_uint64 intermediate_nunits
= TYPE_VECTOR_SUBPARTS (old_vectype
);
5762 while (known_gt (intermediate_nunits
, TYPE_VECTOR_SUBPARTS (vectype
)))
5764 intermediate_nunits
= exact_div (intermediate_nunits
, 2);
5765 tree intermediate_vectype
= get_related_vectype_for_scalar_type
5766 (TYPE_MODE (vectype
), TREE_TYPE (vectype
), intermediate_nunits
);
5767 if (!intermediate_vectype
5768 || !directly_supported_p (STMT_VINFO_REDUC_CODE (reduc_info
),
5769 intermediate_vectype
)
5770 || !can_vec_extract (TYPE_MODE (prev_vectype
),
5771 TYPE_MODE (intermediate_vectype
)))
5773 prev_vectype
= intermediate_vectype
;
5776 /* Non-SLP reductions might apply an adjustment after the reduction
5777 operation, in order to simplify the initialization of the accumulator.
5778 If the epilogue loop carries on from where the main loop left off,
5779 it should apply the same adjustment to the final reduction result.
5781 If the epilogue loop can also be entered directly (rather than via
5782 the main loop), we need to be able to handle that case in the same way,
5783 with the same adjustment. (In principle we could add a PHI node
5784 to select the correct adjustment, but in practice that shouldn't be
5786 tree main_adjustment
5787 = STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (accumulator
->reduc_info
);
5788 if (loop_vinfo
->main_loop_edge
&& main_adjustment
)
5790 gcc_assert (num_phis
== 1);
5791 tree initial_value
= initial_values
[0];
5792 /* Check that we can use INITIAL_VALUE as the adjustment and
5793 initialize the accumulator with a neutral value instead. */
5794 if (!operand_equal_p (initial_value
, main_adjustment
))
5796 code_helper code
= STMT_VINFO_REDUC_CODE (reduc_info
);
5797 initial_values
[0] = neutral_op_for_reduction (TREE_TYPE (initial_value
),
5798 code
, initial_value
);
5800 STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info
) = main_adjustment
;
5801 reduc_info
->reduc_initial_values
.truncate (0);
5802 reduc_info
->reduc_initial_values
.splice (initial_values
);
5803 reduc_info
->reused_accumulator
= accumulator
;
5807 /* Reduce the vector VEC_DEF down to VECTYPE with reduction operation
5808 CODE emitting stmts before GSI. Returns a vector def of VECTYPE. */
5811 vect_create_partial_epilog (tree vec_def
, tree vectype
, code_helper code
,
5814 unsigned nunits
= TYPE_VECTOR_SUBPARTS (TREE_TYPE (vec_def
)).to_constant ();
5815 unsigned nunits1
= TYPE_VECTOR_SUBPARTS (vectype
).to_constant ();
5816 tree stype
= TREE_TYPE (vectype
);
5817 tree new_temp
= vec_def
;
5818 while (nunits
> nunits1
)
5821 tree vectype1
= get_related_vectype_for_scalar_type (TYPE_MODE (vectype
),
5823 unsigned int bitsize
= tree_to_uhwi (TYPE_SIZE (vectype1
));
5825 /* The target has to make sure we support lowpart/highpart
5826 extraction, either via direct vector extract or through
5827 an integer mode punning. */
5829 gimple
*epilog_stmt
;
5830 if (convert_optab_handler (vec_extract_optab
,
5831 TYPE_MODE (TREE_TYPE (new_temp
)),
5832 TYPE_MODE (vectype1
))
5833 != CODE_FOR_nothing
)
5835 /* Extract sub-vectors directly once vec_extract becomes
5836 a conversion optab. */
5837 dst1
= make_ssa_name (vectype1
);
5839 = gimple_build_assign (dst1
, BIT_FIELD_REF
,
5840 build3 (BIT_FIELD_REF
, vectype1
,
5841 new_temp
, TYPE_SIZE (vectype1
),
5843 gimple_seq_add_stmt_without_update (seq
, epilog_stmt
);
5844 dst2
= make_ssa_name (vectype1
);
5846 = gimple_build_assign (dst2
, BIT_FIELD_REF
,
5847 build3 (BIT_FIELD_REF
, vectype1
,
5848 new_temp
, TYPE_SIZE (vectype1
),
5849 bitsize_int (bitsize
)));
5850 gimple_seq_add_stmt_without_update (seq
, epilog_stmt
);
5854 /* Extract via punning to appropriately sized integer mode
5856 tree eltype
= build_nonstandard_integer_type (bitsize
, 1);
5857 tree etype
= build_vector_type (eltype
, 2);
5858 gcc_assert (convert_optab_handler (vec_extract_optab
,
5861 != CODE_FOR_nothing
);
5862 tree tem
= make_ssa_name (etype
);
5863 epilog_stmt
= gimple_build_assign (tem
, VIEW_CONVERT_EXPR
,
5864 build1 (VIEW_CONVERT_EXPR
,
5866 gimple_seq_add_stmt_without_update (seq
, epilog_stmt
);
5868 tem
= make_ssa_name (eltype
);
5870 = gimple_build_assign (tem
, BIT_FIELD_REF
,
5871 build3 (BIT_FIELD_REF
, eltype
,
5872 new_temp
, TYPE_SIZE (eltype
),
5874 gimple_seq_add_stmt_without_update (seq
, epilog_stmt
);
5875 dst1
= make_ssa_name (vectype1
);
5876 epilog_stmt
= gimple_build_assign (dst1
, VIEW_CONVERT_EXPR
,
5877 build1 (VIEW_CONVERT_EXPR
,
5879 gimple_seq_add_stmt_without_update (seq
, epilog_stmt
);
5880 tem
= make_ssa_name (eltype
);
5882 = gimple_build_assign (tem
, BIT_FIELD_REF
,
5883 build3 (BIT_FIELD_REF
, eltype
,
5884 new_temp
, TYPE_SIZE (eltype
),
5885 bitsize_int (bitsize
)));
5886 gimple_seq_add_stmt_without_update (seq
, epilog_stmt
);
5887 dst2
= make_ssa_name (vectype1
);
5888 epilog_stmt
= gimple_build_assign (dst2
, VIEW_CONVERT_EXPR
,
5889 build1 (VIEW_CONVERT_EXPR
,
5891 gimple_seq_add_stmt_without_update (seq
, epilog_stmt
);
5894 new_temp
= gimple_build (seq
, code
, vectype1
, dst1
, dst2
);
5900 /* Retrieves the definining statement to be used for a reduction.
5901 For LAST_VAL_REDUC_P we use the current VEC_STMTs which correspond to the
5902 final value after vectorization and otherwise we look at the reduction
5903 definitions to get the first. */
5906 vect_get_vect_def (stmt_vec_info reduc_info
, slp_tree slp_node
,
5907 slp_instance slp_node_instance
, bool last_val_reduc_p
,
5908 unsigned i
, vec
<gimple
*> &vec_stmts
)
5914 if (!last_val_reduc_p
)
5915 slp_node
= slp_node_instance
->reduc_phis
;
5916 def
= vect_get_slp_vect_def (slp_node
, i
);
5920 if (!last_val_reduc_p
)
5921 reduc_info
= STMT_VINFO_REDUC_DEF (vect_orig_stmt (reduc_info
));
5922 vec_stmts
= STMT_VINFO_VEC_STMTS (reduc_info
);
5923 def
= gimple_get_lhs (vec_stmts
[0]);
5929 /* Function vect_create_epilog_for_reduction
5931 Create code at the loop-epilog to finalize the result of a reduction
5934 STMT_INFO is the scalar reduction stmt that is being vectorized.
5935 SLP_NODE is an SLP node containing a group of reduction statements. The
5936 first one in this group is STMT_INFO.
5937 SLP_NODE_INSTANCE is the SLP node instance containing SLP_NODE
5938 REDUC_INDEX says which rhs operand of the STMT_INFO is the reduction phi
5940 LOOP_EXIT is the edge to update in the merge block. In the case of a single
5941 exit this edge is always the main loop exit.
5944 1. Completes the reduction def-use cycles.
5945 2. "Reduces" each vector of partial results VECT_DEFS into a single result,
5946 by calling the function specified by REDUC_FN if available, or by
5947 other means (whole-vector shifts or a scalar loop).
5948 The function also creates a new phi node at the loop exit to preserve
5949 loop-closed form, as illustrated below.
5951 The flow at the entry to this function:
5954 vec_def = phi <vec_init, null> # REDUCTION_PHI
5955 VECT_DEF = vector_stmt # vectorized form of STMT_INFO
5956 s_loop = scalar_stmt # (scalar) STMT_INFO
5958 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
5962 The above is transformed by this function into:
5965 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
5966 VECT_DEF = vector_stmt # vectorized form of STMT_INFO
5967 s_loop = scalar_stmt # (scalar) STMT_INFO
5969 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
5970 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
5971 v_out2 = reduce <v_out1>
5972 s_out3 = extract_field <v_out2, 0>
5973 s_out4 = adjust_result <s_out3>
5979 vect_create_epilog_for_reduction (loop_vec_info loop_vinfo
,
5980 stmt_vec_info stmt_info
,
5982 slp_instance slp_node_instance
,
5985 stmt_vec_info reduc_info
= info_for_reduction (loop_vinfo
, stmt_info
);
5986 gcc_assert (reduc_info
->is_reduc_info
);
5987 /* For double reductions we need to get at the inner loop reduction
5988 stmt which has the meta info attached. Our stmt_info is that of the
5989 loop-closed PHI of the inner loop which we remember as
5990 def for the reduction PHI generation. */
5991 bool double_reduc
= false;
5992 bool last_val_reduc_p
= LOOP_VINFO_IV_EXIT (loop_vinfo
) == loop_exit
5993 && !LOOP_VINFO_EARLY_BREAKS_VECT_PEELED (loop_vinfo
);
5994 stmt_vec_info rdef_info
= stmt_info
;
5995 if (STMT_VINFO_DEF_TYPE (stmt_info
) == vect_double_reduction_def
)
5997 gcc_assert (!slp_node
);
5998 double_reduc
= true;
5999 stmt_info
= loop_vinfo
->lookup_def (gimple_phi_arg_def
6000 (stmt_info
->stmt
, 0));
6001 stmt_info
= vect_stmt_to_vectorize (stmt_info
);
6003 gphi
*reduc_def_stmt
6004 = as_a
<gphi
*> (STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info
))->stmt
);
6005 code_helper code
= STMT_VINFO_REDUC_CODE (reduc_info
);
6006 internal_fn reduc_fn
= STMT_VINFO_REDUC_FN (reduc_info
);
6009 class loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
), *outer_loop
= NULL
;
6010 basic_block exit_bb
;
6013 gimple
*new_phi
= NULL
, *phi
= NULL
;
6014 gimple_stmt_iterator exit_gsi
;
6015 tree new_temp
= NULL_TREE
, new_name
, new_scalar_dest
;
6016 gimple
*epilog_stmt
= NULL
;
6020 tree orig_name
, scalar_result
;
6021 imm_use_iterator imm_iter
, phi_imm_iter
;
6022 use_operand_p use_p
, phi_use_p
;
6024 auto_vec
<tree
> reduc_inputs
;
6026 vec
<tree
> &scalar_results
= reduc_info
->reduc_scalar_results
;
6027 unsigned int group_size
= 1, k
;
6028 /* SLP reduction without reduction chain, e.g.,
6032 b2 = operation (b1) */
6033 bool slp_reduc
= (slp_node
&& !REDUC_GROUP_FIRST_ELEMENT (stmt_info
));
6034 bool direct_slp_reduc
;
6035 tree induction_index
= NULL_TREE
;
6038 group_size
= SLP_TREE_LANES (slp_node
);
6040 if (nested_in_vect_loop_p (loop
, stmt_info
))
6044 gcc_assert (!slp_node
&& double_reduc
);
6047 vectype
= STMT_VINFO_REDUC_VECTYPE (reduc_info
);
6048 gcc_assert (vectype
);
6049 mode
= TYPE_MODE (vectype
);
6051 tree induc_val
= NULL_TREE
;
6052 tree adjustment_def
= NULL
;
6057 /* Optimize: for induction condition reduction, if we can't use zero
6058 for induc_val, use initial_def. */
6059 if (STMT_VINFO_REDUC_TYPE (reduc_info
) == INTEGER_INDUC_COND_REDUCTION
)
6060 induc_val
= STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info
);
6061 else if (double_reduc
)
6064 adjustment_def
= STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info
);
6067 stmt_vec_info single_live_out_stmt
[] = { stmt_info
};
6068 array_slice
<const stmt_vec_info
> live_out_stmts
= single_live_out_stmt
;
6069 if (LOOP_VINFO_EARLY_BREAKS (loop_vinfo
)
6070 && loop_exit
!= LOOP_VINFO_IV_EXIT (loop_vinfo
)
6071 /* ??? We should fend this off earlier. For conversions we create
6072 multiple epilogues, one dead. */
6073 && stmt_info
== reduc_info
->reduc_def
)
6075 gcc_assert (!slp_node
);
6076 single_live_out_stmt
[0] = reduc_info
;
6081 /* All statements produce live-out values. */
6082 live_out_stmts
= SLP_TREE_SCALAR_STMTS (slp_node
);
6085 /* The last statement in the reduction chain produces the live-out
6086 value. Note SLP optimization can shuffle scalar stmts to
6087 optimize permutations so we have to search for the last stmt. */
6088 for (k
= 0; k
< group_size
; ++k
)
6089 if (!REDUC_GROUP_NEXT_ELEMENT (SLP_TREE_SCALAR_STMTS (slp_node
)[k
]))
6091 single_live_out_stmt
[0] = SLP_TREE_SCALAR_STMTS (slp_node
)[k
];
6101 vec_num
= SLP_TREE_VEC_DEFS (slp_node_instance
->reduc_phis
).length ();
6106 stmt_vec_info reduc_info
= loop_vinfo
->lookup_stmt (reduc_def_stmt
);
6108 ncopies
= STMT_VINFO_VEC_STMTS (reduc_info
).length ();
6111 /* For cond reductions we want to create a new vector (INDEX_COND_EXPR)
6112 which is updated with the current index of the loop for every match of
6113 the original loop's cond_expr (VEC_STMT). This results in a vector
6114 containing the last time the condition passed for that vector lane.
6115 The first match will be a 1 to allow 0 to be used for non-matching
6116 indexes. If there are no matches at all then the vector will be all
6119 PR92772: This algorithm is broken for architectures that support
6120 masked vectors, but do not provide fold_extract_last. */
6121 if (STMT_VINFO_REDUC_TYPE (reduc_info
) == COND_REDUCTION
)
6123 auto_vec
<std::pair
<tree
, bool>, 2> ccompares
;
6124 stmt_vec_info cond_info
= STMT_VINFO_REDUC_DEF (reduc_info
);
6125 cond_info
= vect_stmt_to_vectorize (cond_info
);
6126 while (cond_info
!= reduc_info
)
6128 if (gimple_assign_rhs_code (cond_info
->stmt
) == COND_EXPR
)
6130 gimple
*vec_stmt
= STMT_VINFO_VEC_STMTS (cond_info
)[0];
6131 gcc_assert (gimple_assign_rhs_code (vec_stmt
) == VEC_COND_EXPR
);
6133 (std::make_pair (unshare_expr (gimple_assign_rhs1 (vec_stmt
)),
6134 STMT_VINFO_REDUC_IDX (cond_info
) == 2));
6137 = loop_vinfo
->lookup_def (gimple_op (cond_info
->stmt
,
6138 1 + STMT_VINFO_REDUC_IDX
6140 cond_info
= vect_stmt_to_vectorize (cond_info
);
6142 gcc_assert (ccompares
.length () != 0);
6144 tree indx_before_incr
, indx_after_incr
;
6145 poly_uint64 nunits_out
= TYPE_VECTOR_SUBPARTS (vectype
);
6146 int scalar_precision
6147 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (TREE_TYPE (vectype
)));
6148 tree cr_index_scalar_type
= make_unsigned_type (scalar_precision
);
6149 tree cr_index_vector_type
= get_related_vectype_for_scalar_type
6150 (TYPE_MODE (vectype
), cr_index_scalar_type
,
6151 TYPE_VECTOR_SUBPARTS (vectype
));
6153 /* First we create a simple vector induction variable which starts
6154 with the values {1,2,3,...} (SERIES_VECT) and increments by the
6155 vector size (STEP). */
6157 /* Create a {1,2,3,...} vector. */
6158 tree series_vect
= build_index_vector (cr_index_vector_type
, 1, 1);
6160 /* Create a vector of the step value. */
6161 tree step
= build_int_cst (cr_index_scalar_type
, nunits_out
);
6162 tree vec_step
= build_vector_from_val (cr_index_vector_type
, step
);
6164 /* Create an induction variable. */
6165 gimple_stmt_iterator incr_gsi
;
6167 vect_iv_increment_position (loop_exit
, &incr_gsi
, &insert_after
);
6168 create_iv (series_vect
, PLUS_EXPR
, vec_step
, NULL_TREE
, loop
, &incr_gsi
,
6169 insert_after
, &indx_before_incr
, &indx_after_incr
);
6171 /* Next create a new phi node vector (NEW_PHI_TREE) which starts
6172 filled with zeros (VEC_ZERO). */
6174 /* Create a vector of 0s. */
6175 tree zero
= build_zero_cst (cr_index_scalar_type
);
6176 tree vec_zero
= build_vector_from_val (cr_index_vector_type
, zero
);
6178 /* Create a vector phi node. */
6179 tree new_phi_tree
= make_ssa_name (cr_index_vector_type
);
6180 new_phi
= create_phi_node (new_phi_tree
, loop
->header
);
6181 add_phi_arg (as_a
<gphi
*> (new_phi
), vec_zero
,
6182 loop_preheader_edge (loop
), UNKNOWN_LOCATION
);
6184 /* Now take the condition from the loops original cond_exprs
6185 and produce a new cond_exprs (INDEX_COND_EXPR) which for
6186 every match uses values from the induction variable
6187 (INDEX_BEFORE_INCR) otherwise uses values from the phi node
6189 Finally, we update the phi (NEW_PHI_TREE) to take the value of
6190 the new cond_expr (INDEX_COND_EXPR). */
6191 gimple_seq stmts
= NULL
;
6192 for (int i
= ccompares
.length () - 1; i
!= -1; --i
)
6194 tree ccompare
= ccompares
[i
].first
;
6195 if (ccompares
[i
].second
)
6196 new_phi_tree
= gimple_build (&stmts
, VEC_COND_EXPR
,
6197 cr_index_vector_type
,
6199 indx_before_incr
, new_phi_tree
);
6201 new_phi_tree
= gimple_build (&stmts
, VEC_COND_EXPR
,
6202 cr_index_vector_type
,
6204 new_phi_tree
, indx_before_incr
);
6206 gsi_insert_seq_before (&incr_gsi
, stmts
, GSI_SAME_STMT
);
6208 /* Update the phi with the vec cond. */
6209 induction_index
= new_phi_tree
;
6210 add_phi_arg (as_a
<gphi
*> (new_phi
), induction_index
,
6211 loop_latch_edge (loop
), UNKNOWN_LOCATION
);
6214 /* 2. Create epilog code.
6215 The reduction epilog code operates across the elements of the vector
6216 of partial results computed by the vectorized loop.
6217 The reduction epilog code consists of:
6219 step 1: compute the scalar result in a vector (v_out2)
6220 step 2: extract the scalar result (s_out3) from the vector (v_out2)
6221 step 3: adjust the scalar result (s_out3) if needed.
6223 Step 1 can be accomplished using one the following three schemes:
6224 (scheme 1) using reduc_fn, if available.
6225 (scheme 2) using whole-vector shifts, if available.
6226 (scheme 3) using a scalar loop. In this case steps 1+2 above are
6229 The overall epilog code looks like this:
6231 s_out0 = phi <s_loop> # original EXIT_PHI
6232 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
6233 v_out2 = reduce <v_out1> # step 1
6234 s_out3 = extract_field <v_out2, 0> # step 2
6235 s_out4 = adjust_result <s_out3> # step 3
6237 (step 3 is optional, and steps 1 and 2 may be combined).
6238 Lastly, the uses of s_out0 are replaced by s_out4. */
6241 /* 2.1 Create new loop-exit-phis to preserve loop-closed form:
6242 v_out1 = phi <VECT_DEF>
6243 Store them in NEW_PHIS. */
6246 /* We need to reduce values in all exits. */
6247 exit_bb
= loop_exit
->dest
;
6248 exit_gsi
= gsi_after_labels (exit_bb
);
6249 reduc_inputs
.create (slp_node
? vec_num
: ncopies
);
6250 vec
<gimple
*> vec_stmts
= vNULL
;
6251 for (unsigned i
= 0; i
< vec_num
; i
++)
6253 gimple_seq stmts
= NULL
;
6254 def
= vect_get_vect_def (rdef_info
, slp_node
, slp_node_instance
,
6255 last_val_reduc_p
, i
, vec_stmts
);
6256 for (j
= 0; j
< ncopies
; j
++)
6258 tree new_def
= copy_ssa_name (def
);
6259 phi
= create_phi_node (new_def
, exit_bb
);
6261 def
= gimple_get_lhs (vec_stmts
[j
]);
6262 if (LOOP_VINFO_IV_EXIT (loop_vinfo
) == loop_exit
)
6263 SET_PHI_ARG_DEF (phi
, loop_exit
->dest_idx
, def
);
6266 for (unsigned k
= 0; k
< gimple_phi_num_args (phi
); k
++)
6267 SET_PHI_ARG_DEF (phi
, k
, def
);
6269 new_def
= gimple_convert (&stmts
, vectype
, new_def
);
6270 reduc_inputs
.quick_push (new_def
);
6272 gsi_insert_seq_before (&exit_gsi
, stmts
, GSI_SAME_STMT
);
6275 /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
6276 (i.e. when reduc_fn is not available) and in the final adjustment
6277 code (if needed). Also get the original scalar reduction variable as
6278 defined in the loop. In case STMT is a "pattern-stmt" (i.e. - it
6279 represents a reduction pattern), the tree-code and scalar-def are
6280 taken from the original stmt that the pattern-stmt (STMT) replaces.
6281 Otherwise (it is a regular reduction) - the tree-code and scalar-def
6282 are taken from STMT. */
6284 stmt_vec_info orig_stmt_info
= vect_orig_stmt (stmt_info
);
6285 if (orig_stmt_info
!= stmt_info
)
6287 /* Reduction pattern */
6288 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info
));
6289 gcc_assert (STMT_VINFO_RELATED_STMT (orig_stmt_info
) == stmt_info
);
6292 scalar_dest
= gimple_get_lhs (orig_stmt_info
->stmt
);
6293 scalar_type
= TREE_TYPE (scalar_dest
);
6294 scalar_results
.truncate (0);
6295 scalar_results
.reserve_exact (group_size
);
6296 new_scalar_dest
= vect_create_destination_var (scalar_dest
, NULL
);
6297 bitsize
= TYPE_SIZE (scalar_type
);
6299 /* True if we should implement SLP_REDUC using native reduction operations
6300 instead of scalar operations. */
6301 direct_slp_reduc
= (reduc_fn
!= IFN_LAST
6303 && !TYPE_VECTOR_SUBPARTS (vectype
).is_constant ());
6305 /* In case of reduction chain, e.g.,
6308 a3 = operation (a2),
6310 we may end up with more than one vector result. Here we reduce them
6313 The same is true for a SLP reduction, e.g.,
6317 b2 = operation (a2),
6319 where we can end up with more than one vector as well. We can
6320 easily accumulate vectors when the number of vector elements is
6321 a multiple of the SLP group size.
6323 The same is true if we couldn't use a single defuse cycle. */
6324 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info
)
6327 && constant_multiple_p (TYPE_VECTOR_SUBPARTS (vectype
), group_size
))
6330 gimple_seq stmts
= NULL
;
6331 tree single_input
= reduc_inputs
[0];
6332 for (k
= 1; k
< reduc_inputs
.length (); k
++)
6333 single_input
= gimple_build (&stmts
, code
, vectype
,
6334 single_input
, reduc_inputs
[k
]);
6335 gsi_insert_seq_before (&exit_gsi
, stmts
, GSI_SAME_STMT
);
6337 reduc_inputs
.truncate (0);
6338 reduc_inputs
.safe_push (single_input
);
6341 tree orig_reduc_input
= reduc_inputs
[0];
6343 /* If this loop is an epilogue loop that can be skipped after the
6344 main loop, we can only share a reduction operation between the
6345 main loop and the epilogue if we put it at the target of the
6348 We can still reuse accumulators if this check fails. Doing so has
6349 the minor(?) benefit of making the epilogue loop's scalar result
6350 independent of the main loop's scalar result. */
6351 bool unify_with_main_loop_p
= false;
6352 if (reduc_info
->reused_accumulator
6353 && loop_vinfo
->skip_this_loop_edge
6354 && single_succ_p (exit_bb
)
6355 && single_succ (exit_bb
) == loop_vinfo
->skip_this_loop_edge
->dest
)
6357 unify_with_main_loop_p
= true;
6359 basic_block reduc_block
= loop_vinfo
->skip_this_loop_edge
->dest
;
6360 reduc_inputs
[0] = make_ssa_name (vectype
);
6361 gphi
*new_phi
= create_phi_node (reduc_inputs
[0], reduc_block
);
6362 add_phi_arg (new_phi
, orig_reduc_input
, single_succ_edge (exit_bb
),
6364 add_phi_arg (new_phi
, reduc_info
->reused_accumulator
->reduc_input
,
6365 loop_vinfo
->skip_this_loop_edge
, UNKNOWN_LOCATION
);
6366 exit_gsi
= gsi_after_labels (reduc_block
);
6369 /* Shouldn't be used beyond this point. */
6372 if (STMT_VINFO_REDUC_TYPE (reduc_info
) == COND_REDUCTION
6373 && reduc_fn
!= IFN_LAST
)
6375 /* For condition reductions, we have a vector (REDUC_INPUTS 0) containing
6376 various data values where the condition matched and another vector
6377 (INDUCTION_INDEX) containing all the indexes of those matches. We
6378 need to extract the last matching index (which will be the index with
6379 highest value) and use this to index into the data vector.
6380 For the case where there were no matches, the data vector will contain
6381 all default values and the index vector will be all zeros. */
6383 /* Get various versions of the type of the vector of indexes. */
6384 tree index_vec_type
= TREE_TYPE (induction_index
);
6385 gcc_checking_assert (TYPE_UNSIGNED (index_vec_type
));
6386 tree index_scalar_type
= TREE_TYPE (index_vec_type
);
6387 tree index_vec_cmp_type
= truth_type_for (index_vec_type
);
6389 /* Get an unsigned integer version of the type of the data vector. */
6390 int scalar_precision
6391 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type
));
6392 tree scalar_type_unsigned
= make_unsigned_type (scalar_precision
);
6393 tree vectype_unsigned
= get_same_sized_vectype (scalar_type_unsigned
,
6396 /* First we need to create a vector (ZERO_VEC) of zeros and another
6397 vector (MAX_INDEX_VEC) filled with the last matching index, which we
6398 can create using a MAX reduction and then expanding.
6399 In the case where the loop never made any matches, the max index will
6402 /* Vector of {0, 0, 0,...}. */
6403 tree zero_vec
= build_zero_cst (vectype
);
6405 /* Find maximum value from the vector of found indexes. */
6406 tree max_index
= make_ssa_name (index_scalar_type
);
6407 gcall
*max_index_stmt
= gimple_build_call_internal (IFN_REDUC_MAX
,
6408 1, induction_index
);
6409 gimple_call_set_lhs (max_index_stmt
, max_index
);
6410 gsi_insert_before (&exit_gsi
, max_index_stmt
, GSI_SAME_STMT
);
6412 /* Vector of {max_index, max_index, max_index,...}. */
6413 tree max_index_vec
= make_ssa_name (index_vec_type
);
6414 tree max_index_vec_rhs
= build_vector_from_val (index_vec_type
,
6416 gimple
*max_index_vec_stmt
= gimple_build_assign (max_index_vec
,
6418 gsi_insert_before (&exit_gsi
, max_index_vec_stmt
, GSI_SAME_STMT
);
6420 /* Next we compare the new vector (MAX_INDEX_VEC) full of max indexes
6421 with the vector (INDUCTION_INDEX) of found indexes, choosing values
6422 from the data vector (REDUC_INPUTS 0) for matches, 0 (ZERO_VEC)
6423 otherwise. Only one value should match, resulting in a vector
6424 (VEC_COND) with one data value and the rest zeros.
6425 In the case where the loop never made any matches, every index will
6426 match, resulting in a vector with all data values (which will all be
6427 the default value). */
6429 /* Compare the max index vector to the vector of found indexes to find
6430 the position of the max value. */
6431 tree vec_compare
= make_ssa_name (index_vec_cmp_type
);
6432 gimple
*vec_compare_stmt
= gimple_build_assign (vec_compare
, EQ_EXPR
,
6435 gsi_insert_before (&exit_gsi
, vec_compare_stmt
, GSI_SAME_STMT
);
6437 /* Use the compare to choose either values from the data vector or
6439 tree vec_cond
= make_ssa_name (vectype
);
6440 gimple
*vec_cond_stmt
= gimple_build_assign (vec_cond
, VEC_COND_EXPR
,
6444 gsi_insert_before (&exit_gsi
, vec_cond_stmt
, GSI_SAME_STMT
);
6446 /* Finally we need to extract the data value from the vector (VEC_COND)
6447 into a scalar (MATCHED_DATA_REDUC). Logically we want to do a OR
6448 reduction, but because this doesn't exist, we can use a MAX reduction
6449 instead. The data value might be signed or a float so we need to cast
6451 In the case where the loop never made any matches, the data values are
6452 all identical, and so will reduce down correctly. */
6454 /* Make the matched data values unsigned. */
6455 tree vec_cond_cast
= make_ssa_name (vectype_unsigned
);
6456 tree vec_cond_cast_rhs
= build1 (VIEW_CONVERT_EXPR
, vectype_unsigned
,
6458 gimple
*vec_cond_cast_stmt
= gimple_build_assign (vec_cond_cast
,
6461 gsi_insert_before (&exit_gsi
, vec_cond_cast_stmt
, GSI_SAME_STMT
);
6463 /* Reduce down to a scalar value. */
6464 tree data_reduc
= make_ssa_name (scalar_type_unsigned
);
6465 gcall
*data_reduc_stmt
= gimple_build_call_internal (IFN_REDUC_MAX
,
6467 gimple_call_set_lhs (data_reduc_stmt
, data_reduc
);
6468 gsi_insert_before (&exit_gsi
, data_reduc_stmt
, GSI_SAME_STMT
);
6470 /* Convert the reduced value back to the result type and set as the
6472 gimple_seq stmts
= NULL
;
6473 new_temp
= gimple_build (&stmts
, VIEW_CONVERT_EXPR
, scalar_type
,
6475 gsi_insert_seq_before (&exit_gsi
, stmts
, GSI_SAME_STMT
);
6476 scalar_results
.safe_push (new_temp
);
6478 else if (STMT_VINFO_REDUC_TYPE (reduc_info
) == COND_REDUCTION
6479 && reduc_fn
== IFN_LAST
)
6481 /* Condition reduction without supported IFN_REDUC_MAX. Generate
6483 idx_val = induction_index[0];
6484 val = data_reduc[0];
6485 for (idx = 0, val = init, i = 0; i < nelts; ++i)
6486 if (induction_index[i] > idx_val)
6487 val = data_reduc[i], idx_val = induction_index[i];
6490 tree data_eltype
= TREE_TYPE (vectype
);
6491 tree idx_eltype
= TREE_TYPE (TREE_TYPE (induction_index
));
6492 unsigned HOST_WIDE_INT el_size
= tree_to_uhwi (TYPE_SIZE (idx_eltype
));
6493 poly_uint64 nunits
= TYPE_VECTOR_SUBPARTS (TREE_TYPE (induction_index
));
6494 /* Enforced by vectorizable_reduction, which ensures we have target
6495 support before allowing a conditional reduction on variable-length
6497 unsigned HOST_WIDE_INT v_size
= el_size
* nunits
.to_constant ();
6498 tree idx_val
= NULL_TREE
, val
= NULL_TREE
;
6499 for (unsigned HOST_WIDE_INT off
= 0; off
< v_size
; off
+= el_size
)
6501 tree old_idx_val
= idx_val
;
6503 idx_val
= make_ssa_name (idx_eltype
);
6504 epilog_stmt
= gimple_build_assign (idx_val
, BIT_FIELD_REF
,
6505 build3 (BIT_FIELD_REF
, idx_eltype
,
6507 bitsize_int (el_size
),
6508 bitsize_int (off
)));
6509 gsi_insert_before (&exit_gsi
, epilog_stmt
, GSI_SAME_STMT
);
6510 val
= make_ssa_name (data_eltype
);
6511 epilog_stmt
= gimple_build_assign (val
, BIT_FIELD_REF
,
6512 build3 (BIT_FIELD_REF
,
6515 bitsize_int (el_size
),
6516 bitsize_int (off
)));
6517 gsi_insert_before (&exit_gsi
, epilog_stmt
, GSI_SAME_STMT
);
6520 tree new_idx_val
= idx_val
;
6521 if (off
!= v_size
- el_size
)
6523 new_idx_val
= make_ssa_name (idx_eltype
);
6524 epilog_stmt
= gimple_build_assign (new_idx_val
,
6527 gsi_insert_before (&exit_gsi
, epilog_stmt
, GSI_SAME_STMT
);
6529 tree cond
= make_ssa_name (boolean_type_node
);
6530 epilog_stmt
= gimple_build_assign (cond
, GT_EXPR
,
6531 idx_val
, old_idx_val
);
6532 gsi_insert_before (&exit_gsi
, epilog_stmt
, GSI_SAME_STMT
);
6533 tree new_val
= make_ssa_name (data_eltype
);
6534 epilog_stmt
= gimple_build_assign (new_val
, COND_EXPR
,
6535 cond
, val
, old_val
);
6536 gsi_insert_before (&exit_gsi
, epilog_stmt
, GSI_SAME_STMT
);
6537 idx_val
= new_idx_val
;
6541 /* Convert the reduced value back to the result type and set as the
6543 gimple_seq stmts
= NULL
;
6544 val
= gimple_convert (&stmts
, scalar_type
, val
);
6545 gsi_insert_seq_before (&exit_gsi
, stmts
, GSI_SAME_STMT
);
6546 scalar_results
.safe_push (val
);
6549 /* 2.3 Create the reduction code, using one of the three schemes described
6550 above. In SLP we simply need to extract all the elements from the
6551 vector (without reducing them), so we use scalar shifts. */
6552 else if (reduc_fn
!= IFN_LAST
&& !slp_reduc
)
6558 v_out2 = reduc_expr <v_out1> */
6560 if (dump_enabled_p ())
6561 dump_printf_loc (MSG_NOTE
, vect_location
,
6562 "Reduce using direct vector reduction.\n");
6564 gimple_seq stmts
= NULL
;
6565 vec_elem_type
= TREE_TYPE (vectype
);
6566 new_temp
= gimple_build (&stmts
, as_combined_fn (reduc_fn
),
6567 vec_elem_type
, reduc_inputs
[0]);
6568 new_temp
= gimple_convert (&stmts
, scalar_type
, new_temp
);
6569 gsi_insert_seq_before (&exit_gsi
, stmts
, GSI_SAME_STMT
);
6571 if ((STMT_VINFO_REDUC_TYPE (reduc_info
) == INTEGER_INDUC_COND_REDUCTION
)
6574 /* Earlier we set the initial value to be a vector if induc_val
6575 values. Check the result and if it is induc_val then replace
6576 with the original initial value, unless induc_val is
6577 the same as initial_def already. */
6578 tree zcompare
= make_ssa_name (boolean_type_node
);
6579 epilog_stmt
= gimple_build_assign (zcompare
, EQ_EXPR
,
6580 new_temp
, induc_val
);
6581 gsi_insert_before (&exit_gsi
, epilog_stmt
, GSI_SAME_STMT
);
6582 tree initial_def
= reduc_info
->reduc_initial_values
[0];
6583 tmp
= make_ssa_name (new_scalar_dest
);
6584 epilog_stmt
= gimple_build_assign (tmp
, COND_EXPR
, zcompare
,
6585 initial_def
, new_temp
);
6586 gsi_insert_before (&exit_gsi
, epilog_stmt
, GSI_SAME_STMT
);
6590 scalar_results
.safe_push (new_temp
);
6592 else if (direct_slp_reduc
)
6594 /* Here we create one vector for each of the REDUC_GROUP_SIZE results,
6595 with the elements for other SLP statements replaced with the
6596 neutral value. We can then do a normal reduction on each vector. */
6598 /* Enforced by vectorizable_reduction. */
6599 gcc_assert (reduc_inputs
.length () == 1);
6600 gcc_assert (pow2p_hwi (group_size
));
6602 gimple_seq seq
= NULL
;
6604 /* Build a vector {0, 1, 2, ...}, with the same number of elements
6605 and the same element size as VECTYPE. */
6606 tree index
= build_index_vector (vectype
, 0, 1);
6607 tree index_type
= TREE_TYPE (index
);
6608 tree index_elt_type
= TREE_TYPE (index_type
);
6609 tree mask_type
= truth_type_for (index_type
);
6611 /* Create a vector that, for each element, identifies which of
6612 the REDUC_GROUP_SIZE results should use it. */
6613 tree index_mask
= build_int_cst (index_elt_type
, group_size
- 1);
6614 index
= gimple_build (&seq
, BIT_AND_EXPR
, index_type
, index
,
6615 build_vector_from_val (index_type
, index_mask
));
6617 /* Get a neutral vector value. This is simply a splat of the neutral
6618 scalar value if we have one, otherwise the initial scalar value
6619 is itself a neutral value. */
6620 tree vector_identity
= NULL_TREE
;
6621 tree neutral_op
= NULL_TREE
;
6624 tree initial_value
= NULL_TREE
;
6625 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info
))
6626 initial_value
= reduc_info
->reduc_initial_values
[0];
6627 neutral_op
= neutral_op_for_reduction (TREE_TYPE (vectype
), code
,
6628 initial_value
, false);
6631 vector_identity
= gimple_build_vector_from_val (&seq
, vectype
,
6633 for (unsigned int i
= 0; i
< group_size
; ++i
)
6635 /* If there's no univeral neutral value, we can use the
6636 initial scalar value from the original PHI. This is used
6637 for MIN and MAX reduction, for example. */
6640 tree scalar_value
= reduc_info
->reduc_initial_values
[i
];
6641 scalar_value
= gimple_convert (&seq
, TREE_TYPE (vectype
),
6643 vector_identity
= gimple_build_vector_from_val (&seq
, vectype
,
6647 /* Calculate the equivalent of:
6649 sel[j] = (index[j] == i);
6651 which selects the elements of REDUC_INPUTS[0] that should
6652 be included in the result. */
6653 tree compare_val
= build_int_cst (index_elt_type
, i
);
6654 compare_val
= build_vector_from_val (index_type
, compare_val
);
6655 tree sel
= gimple_build (&seq
, EQ_EXPR
, mask_type
,
6656 index
, compare_val
);
6658 /* Calculate the equivalent of:
6660 vec = seq ? reduc_inputs[0] : vector_identity;
6662 VEC is now suitable for a full vector reduction. */
6663 tree vec
= gimple_build (&seq
, VEC_COND_EXPR
, vectype
,
6664 sel
, reduc_inputs
[0], vector_identity
);
6666 /* Do the reduction and convert it to the appropriate type. */
6667 tree scalar
= gimple_build (&seq
, as_combined_fn (reduc_fn
),
6668 TREE_TYPE (vectype
), vec
);
6669 scalar
= gimple_convert (&seq
, scalar_type
, scalar
);
6670 scalar_results
.safe_push (scalar
);
6672 gsi_insert_seq_before (&exit_gsi
, seq
, GSI_SAME_STMT
);
6676 bool reduce_with_shift
;
6679 gcc_assert (slp_reduc
|| reduc_inputs
.length () == 1);
6681 /* See if the target wants to do the final (shift) reduction
6682 in a vector mode of smaller size and first reduce upper/lower
6683 halves against each other. */
6684 enum machine_mode mode1
= mode
;
6685 tree stype
= TREE_TYPE (vectype
);
6686 unsigned nunits
= TYPE_VECTOR_SUBPARTS (vectype
).to_constant ();
6687 unsigned nunits1
= nunits
;
6688 if ((mode1
= targetm
.vectorize
.split_reduction (mode
)) != mode
6689 && reduc_inputs
.length () == 1)
6691 nunits1
= GET_MODE_NUNITS (mode1
).to_constant ();
6692 /* For SLP reductions we have to make sure lanes match up, but
6693 since we're doing individual element final reduction reducing
6694 vector width here is even more important.
6695 ??? We can also separate lanes with permutes, for the common
6696 case of power-of-two group-size odd/even extracts would work. */
6697 if (slp_reduc
&& nunits
!= nunits1
)
6699 nunits1
= least_common_multiple (nunits1
, group_size
);
6700 gcc_assert (exact_log2 (nunits1
) != -1 && nunits1
<= nunits
);
6704 && (mode1
= targetm
.vectorize
.split_reduction (mode
)) != mode
)
6705 nunits1
= GET_MODE_NUNITS (mode1
).to_constant ();
6707 tree vectype1
= get_related_vectype_for_scalar_type (TYPE_MODE (vectype
),
6709 reduce_with_shift
= have_whole_vector_shift (mode1
);
6710 if (!VECTOR_MODE_P (mode1
)
6711 || !directly_supported_p (code
, vectype1
))
6712 reduce_with_shift
= false;
6714 /* First reduce the vector to the desired vector size we should
6715 do shift reduction on by combining upper and lower halves. */
6716 gimple_seq stmts
= NULL
;
6717 new_temp
= vect_create_partial_epilog (reduc_inputs
[0], vectype1
,
6719 gsi_insert_seq_before (&exit_gsi
, stmts
, GSI_SAME_STMT
);
6720 reduc_inputs
[0] = new_temp
;
6722 if (reduce_with_shift
&& !slp_reduc
)
6724 int element_bitsize
= tree_to_uhwi (bitsize
);
6725 /* Enforced by vectorizable_reduction, which disallows SLP reductions
6726 for variable-length vectors and also requires direct target support
6727 for loop reductions. */
6728 int vec_size_in_bits
= tree_to_uhwi (TYPE_SIZE (vectype1
));
6729 int nelements
= vec_size_in_bits
/ element_bitsize
;
6730 vec_perm_builder sel
;
6731 vec_perm_indices indices
;
6735 tree zero_vec
= build_zero_cst (vectype1
);
6737 for (offset = nelements/2; offset >= 1; offset/=2)
6739 Create: va' = vec_shift <va, offset>
6740 Create: va = vop <va, va'>
6745 if (dump_enabled_p ())
6746 dump_printf_loc (MSG_NOTE
, vect_location
,
6747 "Reduce using vector shifts\n");
6749 gimple_seq stmts
= NULL
;
6750 new_temp
= gimple_convert (&stmts
, vectype1
, new_temp
);
6751 for (elt_offset
= nelements
/ 2;
6755 calc_vec_perm_mask_for_shift (elt_offset
, nelements
, &sel
);
6756 indices
.new_vector (sel
, 2, nelements
);
6757 tree mask
= vect_gen_perm_mask_any (vectype1
, indices
);
6758 new_name
= gimple_build (&stmts
, VEC_PERM_EXPR
, vectype1
,
6759 new_temp
, zero_vec
, mask
);
6760 new_temp
= gimple_build (&stmts
, code
,
6761 vectype1
, new_name
, new_temp
);
6763 gsi_insert_seq_before (&exit_gsi
, stmts
, GSI_SAME_STMT
);
6765 /* 2.4 Extract the final scalar result. Create:
6766 s_out3 = extract_field <v_out2, bitpos> */
6768 if (dump_enabled_p ())
6769 dump_printf_loc (MSG_NOTE
, vect_location
,
6770 "extract scalar result\n");
6772 rhs
= build3 (BIT_FIELD_REF
, scalar_type
, new_temp
,
6773 bitsize
, bitsize_zero_node
);
6774 epilog_stmt
= gimple_build_assign (new_scalar_dest
, rhs
);
6775 new_temp
= make_ssa_name (new_scalar_dest
, epilog_stmt
);
6776 gimple_assign_set_lhs (epilog_stmt
, new_temp
);
6777 gsi_insert_before (&exit_gsi
, epilog_stmt
, GSI_SAME_STMT
);
6778 scalar_results
.safe_push (new_temp
);
6783 s = extract_field <v_out2, 0>
6784 for (offset = element_size;
6785 offset < vector_size;
6786 offset += element_size;)
6788 Create: s' = extract_field <v_out2, offset>
6789 Create: s = op <s, s'> // For non SLP cases
6792 if (dump_enabled_p ())
6793 dump_printf_loc (MSG_NOTE
, vect_location
,
6794 "Reduce using scalar code.\n");
6796 int vec_size_in_bits
= tree_to_uhwi (TYPE_SIZE (vectype1
));
6797 int element_bitsize
= tree_to_uhwi (bitsize
);
6798 tree compute_type
= TREE_TYPE (vectype
);
6799 gimple_seq stmts
= NULL
;
6800 FOR_EACH_VEC_ELT (reduc_inputs
, i
, vec_temp
)
6803 new_temp
= gimple_build (&stmts
, BIT_FIELD_REF
, compute_type
,
6804 vec_temp
, bitsize
, bitsize_zero_node
);
6806 /* In SLP we don't need to apply reduction operation, so we just
6807 collect s' values in SCALAR_RESULTS. */
6809 scalar_results
.safe_push (new_temp
);
6811 for (bit_offset
= element_bitsize
;
6812 bit_offset
< vec_size_in_bits
;
6813 bit_offset
+= element_bitsize
)
6815 tree bitpos
= bitsize_int (bit_offset
);
6816 new_name
= gimple_build (&stmts
, BIT_FIELD_REF
,
6817 compute_type
, vec_temp
,
6821 /* In SLP we don't need to apply reduction operation, so
6822 we just collect s' values in SCALAR_RESULTS. */
6823 new_temp
= new_name
;
6824 scalar_results
.safe_push (new_name
);
6827 new_temp
= gimple_build (&stmts
, code
, compute_type
,
6828 new_name
, new_temp
);
6832 /* The only case where we need to reduce scalar results in SLP, is
6833 unrolling. If the size of SCALAR_RESULTS is greater than
6834 REDUC_GROUP_SIZE, we reduce them combining elements modulo
6835 REDUC_GROUP_SIZE. */
6838 tree res
, first_res
, new_res
;
6840 /* Reduce multiple scalar results in case of SLP unrolling. */
6841 for (j
= group_size
; scalar_results
.iterate (j
, &res
);
6844 first_res
= scalar_results
[j
% group_size
];
6845 new_res
= gimple_build (&stmts
, code
, compute_type
,
6847 scalar_results
[j
% group_size
] = new_res
;
6849 scalar_results
.truncate (group_size
);
6850 for (k
= 0; k
< group_size
; k
++)
6851 scalar_results
[k
] = gimple_convert (&stmts
, scalar_type
,
6856 /* Not SLP - we have one scalar to keep in SCALAR_RESULTS. */
6857 new_temp
= gimple_convert (&stmts
, scalar_type
, new_temp
);
6858 scalar_results
.safe_push (new_temp
);
6861 gsi_insert_seq_before (&exit_gsi
, stmts
, GSI_SAME_STMT
);
6864 if ((STMT_VINFO_REDUC_TYPE (reduc_info
) == INTEGER_INDUC_COND_REDUCTION
)
6867 /* Earlier we set the initial value to be a vector if induc_val
6868 values. Check the result and if it is induc_val then replace
6869 with the original initial value, unless induc_val is
6870 the same as initial_def already. */
6871 tree zcompare
= make_ssa_name (boolean_type_node
);
6872 epilog_stmt
= gimple_build_assign (zcompare
, EQ_EXPR
, new_temp
,
6874 gsi_insert_before (&exit_gsi
, epilog_stmt
, GSI_SAME_STMT
);
6875 tree initial_def
= reduc_info
->reduc_initial_values
[0];
6876 tree tmp
= make_ssa_name (new_scalar_dest
);
6877 epilog_stmt
= gimple_build_assign (tmp
, COND_EXPR
, zcompare
,
6878 initial_def
, new_temp
);
6879 gsi_insert_before (&exit_gsi
, epilog_stmt
, GSI_SAME_STMT
);
6880 scalar_results
[0] = tmp
;
6884 /* 2.5 Adjust the final result by the initial value of the reduction
6885 variable. (When such adjustment is not needed, then
6886 'adjustment_def' is zero). For example, if code is PLUS we create:
6887 new_temp = loop_exit_def + adjustment_def */
6891 gcc_assert (!slp_reduc
);
6892 gimple_seq stmts
= NULL
;
6895 gcc_assert (VECTOR_TYPE_P (TREE_TYPE (adjustment_def
)));
6896 adjustment_def
= gimple_convert (&stmts
, vectype
, adjustment_def
);
6897 new_temp
= gimple_build (&stmts
, code
, vectype
,
6898 reduc_inputs
[0], adjustment_def
);
6902 new_temp
= scalar_results
[0];
6903 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def
)) != VECTOR_TYPE
);
6904 adjustment_def
= gimple_convert (&stmts
, TREE_TYPE (vectype
),
6906 new_temp
= gimple_convert (&stmts
, TREE_TYPE (vectype
), new_temp
);
6907 new_temp
= gimple_build (&stmts
, code
, TREE_TYPE (vectype
),
6908 new_temp
, adjustment_def
);
6909 new_temp
= gimple_convert (&stmts
, scalar_type
, new_temp
);
6912 epilog_stmt
= gimple_seq_last_stmt (stmts
);
6913 gsi_insert_seq_before (&exit_gsi
, stmts
, GSI_SAME_STMT
);
6914 scalar_results
[0] = new_temp
;
6917 /* Record this operation if it could be reused by the epilogue loop. */
6918 if (STMT_VINFO_REDUC_TYPE (reduc_info
) == TREE_CODE_REDUCTION
6919 && reduc_inputs
.length () == 1)
6920 loop_vinfo
->reusable_accumulators
.put (scalar_results
[0],
6921 { orig_reduc_input
, reduc_info
});
6926 /* 2.6 Handle the loop-exit phis. Replace the uses of scalar loop-exit
6927 phis with new adjusted scalar results, i.e., replace use <s_out0>
6932 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
6933 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
6934 v_out2 = reduce <v_out1>
6935 s_out3 = extract_field <v_out2, 0>
6936 s_out4 = adjust_result <s_out3>
6943 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
6944 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
6945 v_out2 = reduce <v_out1>
6946 s_out3 = extract_field <v_out2, 0>
6947 s_out4 = adjust_result <s_out3>
6951 gcc_assert (live_out_stmts
.size () == scalar_results
.length ());
6952 auto_vec
<gimple
*> phis
;
6953 for (k
= 0; k
< live_out_stmts
.size (); k
++)
6955 stmt_vec_info scalar_stmt_info
= vect_orig_stmt (live_out_stmts
[k
]);
6956 scalar_dest
= gimple_get_lhs (scalar_stmt_info
->stmt
);
6958 /* Find the loop-closed-use at the loop exit of the original scalar
6959 result. (The reduction result is expected to have two immediate uses,
6960 one at the latch block, and one at the loop exit). For double
6961 reductions we are looking for exit phis of the outer loop. */
6962 FOR_EACH_IMM_USE_FAST (use_p
, imm_iter
, scalar_dest
)
6964 if (!flow_bb_inside_loop_p (loop
, gimple_bb (USE_STMT (use_p
))))
6966 if (!is_gimple_debug (USE_STMT (use_p
)))
6967 phis
.safe_push (USE_STMT (use_p
));
6971 if (double_reduc
&& gimple_code (USE_STMT (use_p
)) == GIMPLE_PHI
)
6973 tree phi_res
= PHI_RESULT (USE_STMT (use_p
));
6975 FOR_EACH_IMM_USE_FAST (phi_use_p
, phi_imm_iter
, phi_res
)
6977 if (!flow_bb_inside_loop_p (loop
,
6978 gimple_bb (USE_STMT (phi_use_p
)))
6979 && !is_gimple_debug (USE_STMT (phi_use_p
)))
6980 phis
.safe_push (USE_STMT (phi_use_p
));
6986 FOR_EACH_VEC_ELT (phis
, i
, exit_phi
)
6988 /* Replace the uses: */
6989 orig_name
= PHI_RESULT (exit_phi
);
6991 /* Look for a single use at the target of the skip edge. */
6992 if (unify_with_main_loop_p
)
6994 use_operand_p use_p
;
6996 if (!single_imm_use (orig_name
, &use_p
, &user
))
6998 orig_name
= gimple_get_lhs (user
);
7001 scalar_result
= scalar_results
[k
];
7002 FOR_EACH_IMM_USE_STMT (use_stmt
, imm_iter
, orig_name
)
7004 FOR_EACH_IMM_USE_ON_STMT (use_p
, imm_iter
)
7005 SET_USE (use_p
, scalar_result
);
7006 update_stmt (use_stmt
);
7014 /* Return a vector of type VECTYPE that is equal to the vector select
7015 operation "MASK ? VEC : IDENTITY". Insert the select statements
7019 merge_with_identity (gimple_stmt_iterator
*gsi
, tree mask
, tree vectype
,
7020 tree vec
, tree identity
)
7022 tree cond
= make_temp_ssa_name (vectype
, NULL
, "cond");
7023 gimple
*new_stmt
= gimple_build_assign (cond
, VEC_COND_EXPR
,
7024 mask
, vec
, identity
);
7025 gsi_insert_before (gsi
, new_stmt
, GSI_SAME_STMT
);
7029 /* Successively apply CODE to each element of VECTOR_RHS, in left-to-right
7030 order, starting with LHS. Insert the extraction statements before GSI and
7031 associate the new scalar SSA names with variable SCALAR_DEST.
7032 If MASK is nonzero mask the input and then operate on it unconditionally.
7033 Return the SSA name for the result. */
7036 vect_expand_fold_left (gimple_stmt_iterator
*gsi
, tree scalar_dest
,
7037 tree_code code
, tree lhs
, tree vector_rhs
,
7040 tree vectype
= TREE_TYPE (vector_rhs
);
7041 tree scalar_type
= TREE_TYPE (vectype
);
7042 tree bitsize
= TYPE_SIZE (scalar_type
);
7043 unsigned HOST_WIDE_INT vec_size_in_bits
= tree_to_uhwi (TYPE_SIZE (vectype
));
7044 unsigned HOST_WIDE_INT element_bitsize
= tree_to_uhwi (bitsize
);
7046 /* Re-create a VEC_COND_EXPR to mask the input here in order to be able
7047 to perform an unconditional element-wise reduction of it. */
7050 tree masked_vector_rhs
= make_temp_ssa_name (vectype
, NULL
,
7051 "masked_vector_rhs");
7052 tree neutral_op
= neutral_op_for_reduction (scalar_type
, code
, NULL_TREE
,
7054 tree vector_identity
= build_vector_from_val (vectype
, neutral_op
);
7055 gassign
*select
= gimple_build_assign (masked_vector_rhs
, VEC_COND_EXPR
,
7056 mask
, vector_rhs
, vector_identity
);
7057 gsi_insert_before (gsi
, select
, GSI_SAME_STMT
);
7058 vector_rhs
= masked_vector_rhs
;
7061 for (unsigned HOST_WIDE_INT bit_offset
= 0;
7062 bit_offset
< vec_size_in_bits
;
7063 bit_offset
+= element_bitsize
)
7065 tree bitpos
= bitsize_int (bit_offset
);
7066 tree rhs
= build3 (BIT_FIELD_REF
, scalar_type
, vector_rhs
,
7069 gassign
*stmt
= gimple_build_assign (scalar_dest
, rhs
);
7070 rhs
= make_ssa_name (scalar_dest
, stmt
);
7071 gimple_assign_set_lhs (stmt
, rhs
);
7072 gsi_insert_before (gsi
, stmt
, GSI_SAME_STMT
);
7074 stmt
= gimple_build_assign (scalar_dest
, code
, lhs
, rhs
);
7075 tree new_name
= make_ssa_name (scalar_dest
, stmt
);
7076 gimple_assign_set_lhs (stmt
, new_name
);
7077 gsi_insert_before (gsi
, stmt
, GSI_SAME_STMT
);
7083 /* Get a masked internal function equivalent to REDUC_FN. VECTYPE_IN is the
7084 type of the vector input. */
7087 get_masked_reduction_fn (internal_fn reduc_fn
, tree vectype_in
)
7089 internal_fn mask_reduc_fn
;
7090 internal_fn mask_len_reduc_fn
;
7094 case IFN_FOLD_LEFT_PLUS
:
7095 mask_reduc_fn
= IFN_MASK_FOLD_LEFT_PLUS
;
7096 mask_len_reduc_fn
= IFN_MASK_LEN_FOLD_LEFT_PLUS
;
7103 if (direct_internal_fn_supported_p (mask_reduc_fn
, vectype_in
,
7104 OPTIMIZE_FOR_SPEED
))
7105 return mask_reduc_fn
;
7106 if (direct_internal_fn_supported_p (mask_len_reduc_fn
, vectype_in
,
7107 OPTIMIZE_FOR_SPEED
))
7108 return mask_len_reduc_fn
;
7112 /* Perform an in-order reduction (FOLD_LEFT_REDUCTION). STMT_INFO is the
7113 statement that sets the live-out value. REDUC_DEF_STMT is the phi
7114 statement. CODE is the operation performed by STMT_INFO and OPS are
7115 its scalar operands. REDUC_INDEX is the index of the operand in
7116 OPS that is set by REDUC_DEF_STMT. REDUC_FN is the function that
7117 implements in-order reduction, or IFN_LAST if we should open-code it.
7118 VECTYPE_IN is the type of the vector input. MASKS specifies the masks
7119 that should be used to control the operation in a fully-masked loop. */
7122 vectorize_fold_left_reduction (loop_vec_info loop_vinfo
,
7123 stmt_vec_info stmt_info
,
7124 gimple_stmt_iterator
*gsi
,
7125 gimple
**vec_stmt
, slp_tree slp_node
,
7126 gimple
*reduc_def_stmt
,
7127 code_helper code
, internal_fn reduc_fn
,
7128 tree
*ops
, int num_ops
, tree vectype_in
,
7129 int reduc_index
, vec_loop_masks
*masks
,
7130 vec_loop_lens
*lens
)
7132 class loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
7133 tree vectype_out
= STMT_VINFO_VECTYPE (stmt_info
);
7134 internal_fn mask_reduc_fn
= get_masked_reduction_fn (reduc_fn
, vectype_in
);
7140 ncopies
= vect_get_num_copies (loop_vinfo
, vectype_in
);
7142 gcc_assert (!nested_in_vect_loop_p (loop
, stmt_info
));
7143 gcc_assert (ncopies
== 1);
7145 bool is_cond_op
= false;
7146 if (!code
.is_tree_code ())
7148 code
= conditional_internal_fn_code (internal_fn (code
));
7149 gcc_assert (code
!= ERROR_MARK
);
7153 gcc_assert (TREE_CODE_LENGTH (tree_code (code
)) == binary_op
);
7159 if (dump_enabled_p ())
7160 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
7161 "fold-left reduction on SLP not supported.\n");
7165 gcc_assert (known_eq (TYPE_VECTOR_SUBPARTS (vectype_out
),
7166 TYPE_VECTOR_SUBPARTS (vectype_in
)));
7169 /* The operands either come from a binary operation or an IFN_COND operation.
7170 The former is a gimple assign with binary rhs and the latter is a
7171 gimple call with four arguments. */
7172 gcc_assert (num_ops
== 2 || num_ops
== 4);
7175 op0
= ops
[1 - reduc_index
];
7178 op0
= ops
[2 + (1 - reduc_index
)];
7180 gcc_assert (!slp_node
);
7184 stmt_vec_info scalar_dest_def_info
;
7185 auto_vec
<tree
> vec_oprnds0
, vec_opmask
;
7188 auto_vec
<vec
<tree
> > vec_defs (2);
7189 vect_get_slp_defs (loop_vinfo
, slp_node
, &vec_defs
);
7190 vec_oprnds0
.safe_splice (vec_defs
[1 - reduc_index
]);
7191 vec_defs
[0].release ();
7192 vec_defs
[1].release ();
7193 group_size
= SLP_TREE_SCALAR_STMTS (slp_node
).length ();
7194 scalar_dest_def_info
= SLP_TREE_SCALAR_STMTS (slp_node
)[group_size
- 1];
7198 vect_get_vec_defs_for_operand (loop_vinfo
, stmt_info
, 1,
7200 scalar_dest_def_info
= stmt_info
;
7202 /* For an IFN_COND_OP we also need the vector mask operand. */
7204 vect_get_vec_defs_for_operand (loop_vinfo
, stmt_info
, 1,
7205 opmask
, &vec_opmask
);
7208 gimple
*sdef
= vect_orig_stmt (scalar_dest_def_info
)->stmt
;
7209 tree scalar_dest
= gimple_get_lhs (sdef
);
7210 tree scalar_type
= TREE_TYPE (scalar_dest
);
7211 tree reduc_var
= gimple_phi_result (reduc_def_stmt
);
7213 int vec_num
= vec_oprnds0
.length ();
7214 gcc_assert (vec_num
== 1 || slp_node
);
7215 tree vec_elem_type
= TREE_TYPE (vectype_out
);
7216 gcc_checking_assert (useless_type_conversion_p (scalar_type
, vec_elem_type
));
7218 tree vector_identity
= NULL_TREE
;
7219 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo
))
7221 vector_identity
= build_zero_cst (vectype_out
);
7222 if (!HONOR_SIGNED_ZEROS (vectype_out
))
7226 gcc_assert (!HONOR_SIGN_DEPENDENT_ROUNDING (vectype_out
));
7227 vector_identity
= const_unop (NEGATE_EXPR
, vectype_out
,
7232 tree scalar_dest_var
= vect_create_destination_var (scalar_dest
, NULL
);
7235 FOR_EACH_VEC_ELT (vec_oprnds0
, i
, def0
)
7238 tree mask
= NULL_TREE
;
7239 tree len
= NULL_TREE
;
7240 tree bias
= NULL_TREE
;
7241 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo
))
7242 mask
= vect_get_loop_mask (loop_vinfo
, gsi
, masks
, vec_num
, vectype_in
, i
);
7243 else if (is_cond_op
)
7244 mask
= vec_opmask
[0];
7245 if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo
))
7247 len
= vect_get_loop_len (loop_vinfo
, gsi
, lens
, vec_num
, vectype_in
,
7249 signed char biasval
= LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo
);
7250 bias
= build_int_cst (intQI_type_node
, biasval
);
7252 mask
= build_minus_one_cst (truth_type_for (vectype_in
));
7255 /* Handle MINUS by adding the negative. */
7256 if (reduc_fn
!= IFN_LAST
&& code
== MINUS_EXPR
)
7258 tree negated
= make_ssa_name (vectype_out
);
7259 new_stmt
= gimple_build_assign (negated
, NEGATE_EXPR
, def0
);
7260 gsi_insert_before (gsi
, new_stmt
, GSI_SAME_STMT
);
7264 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo
)
7265 && mask
&& mask_reduc_fn
== IFN_LAST
)
7266 def0
= merge_with_identity (gsi
, mask
, vectype_out
, def0
,
7269 /* On the first iteration the input is simply the scalar phi
7270 result, and for subsequent iterations it is the output of
7271 the preceding operation. */
7272 if (reduc_fn
!= IFN_LAST
|| (mask
&& mask_reduc_fn
!= IFN_LAST
))
7274 if (mask
&& len
&& mask_reduc_fn
== IFN_MASK_LEN_FOLD_LEFT_PLUS
)
7275 new_stmt
= gimple_build_call_internal (mask_reduc_fn
, 5, reduc_var
,
7276 def0
, mask
, len
, bias
);
7277 else if (mask
&& mask_reduc_fn
== IFN_MASK_FOLD_LEFT_PLUS
)
7278 new_stmt
= gimple_build_call_internal (mask_reduc_fn
, 3, reduc_var
,
7281 new_stmt
= gimple_build_call_internal (reduc_fn
, 2, reduc_var
,
7283 /* For chained SLP reductions the output of the previous reduction
7284 operation serves as the input of the next. For the final statement
7285 the output cannot be a temporary - we reuse the original
7286 scalar destination of the last statement. */
7287 if (i
!= vec_num
- 1)
7289 gimple_set_lhs (new_stmt
, scalar_dest_var
);
7290 reduc_var
= make_ssa_name (scalar_dest_var
, new_stmt
);
7291 gimple_set_lhs (new_stmt
, reduc_var
);
7296 reduc_var
= vect_expand_fold_left (gsi
, scalar_dest_var
,
7297 tree_code (code
), reduc_var
, def0
,
7299 new_stmt
= SSA_NAME_DEF_STMT (reduc_var
);
7300 /* Remove the statement, so that we can use the same code paths
7301 as for statements that we've just created. */
7302 gimple_stmt_iterator tmp_gsi
= gsi_for_stmt (new_stmt
);
7303 gsi_remove (&tmp_gsi
, true);
7306 if (i
== vec_num
- 1)
7308 gimple_set_lhs (new_stmt
, scalar_dest
);
7309 vect_finish_replace_stmt (loop_vinfo
,
7310 scalar_dest_def_info
,
7314 vect_finish_stmt_generation (loop_vinfo
,
7315 scalar_dest_def_info
,
7319 slp_node
->push_vec_def (new_stmt
);
7322 STMT_VINFO_VEC_STMTS (stmt_info
).safe_push (new_stmt
);
7323 *vec_stmt
= new_stmt
;
7330 /* Function is_nonwrapping_integer_induction.
7332 Check if STMT_VINO (which is part of loop LOOP) both increments and
7333 does not cause overflow. */
7336 is_nonwrapping_integer_induction (stmt_vec_info stmt_vinfo
, class loop
*loop
)
7338 gphi
*phi
= as_a
<gphi
*> (stmt_vinfo
->stmt
);
7339 tree base
= STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo
);
7340 tree step
= STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo
);
7341 tree lhs_type
= TREE_TYPE (gimple_phi_result (phi
));
7342 widest_int ni
, max_loop_value
, lhs_max
;
7343 wi::overflow_type overflow
= wi::OVF_NONE
;
7345 /* Make sure the loop is integer based. */
7346 if (TREE_CODE (base
) != INTEGER_CST
7347 || TREE_CODE (step
) != INTEGER_CST
)
7350 /* Check that the max size of the loop will not wrap. */
7352 if (TYPE_OVERFLOW_UNDEFINED (lhs_type
))
7355 if (! max_stmt_executions (loop
, &ni
))
7358 max_loop_value
= wi::mul (wi::to_widest (step
), ni
, TYPE_SIGN (lhs_type
),
7363 max_loop_value
= wi::add (wi::to_widest (base
), max_loop_value
,
7364 TYPE_SIGN (lhs_type
), &overflow
);
7368 return (wi::min_precision (max_loop_value
, TYPE_SIGN (lhs_type
))
7369 <= TYPE_PRECISION (lhs_type
));
7372 /* Check if masking can be supported by inserting a conditional expression.
7373 CODE is the code for the operation. COND_FN is the conditional internal
7374 function, if it exists. VECTYPE_IN is the type of the vector input. */
7376 use_mask_by_cond_expr_p (code_helper code
, internal_fn cond_fn
,
7379 if (cond_fn
!= IFN_LAST
7380 && direct_internal_fn_supported_p (cond_fn
, vectype_in
,
7381 OPTIMIZE_FOR_SPEED
))
7384 if (code
.is_tree_code ())
7385 switch (tree_code (code
))
7397 /* Insert a conditional expression to enable masked vectorization. CODE is the
7398 code for the operation. VOP is the array of operands. MASK is the loop
7399 mask. GSI is a statement iterator used to place the new conditional
7402 build_vect_cond_expr (code_helper code
, tree vop
[3], tree mask
,
7403 gimple_stmt_iterator
*gsi
)
7405 switch (tree_code (code
))
7409 tree vectype
= TREE_TYPE (vop
[1]);
7410 tree zero
= build_zero_cst (vectype
);
7411 tree masked_op1
= make_temp_ssa_name (vectype
, NULL
, "masked_op1");
7412 gassign
*select
= gimple_build_assign (masked_op1
, VEC_COND_EXPR
,
7413 mask
, vop
[1], zero
);
7414 gsi_insert_before (gsi
, select
, GSI_SAME_STMT
);
7415 vop
[1] = masked_op1
;
7421 tree vectype
= TREE_TYPE (vop
[1]);
7422 tree masked_op1
= make_temp_ssa_name (vectype
, NULL
, "masked_op1");
7423 gassign
*select
= gimple_build_assign (masked_op1
, VEC_COND_EXPR
,
7424 mask
, vop
[1], vop
[0]);
7425 gsi_insert_before (gsi
, select
, GSI_SAME_STMT
);
7426 vop
[1] = masked_op1
;
7435 /* Function vectorizable_reduction.
7437 Check if STMT_INFO performs a reduction operation that can be vectorized.
7438 If VEC_STMT is also passed, vectorize STMT_INFO: create a vectorized
7439 stmt to replace it, put it in VEC_STMT, and insert it at GSI.
7440 Return true if STMT_INFO is vectorizable in this way.
7442 This function also handles reduction idioms (patterns) that have been
7443 recognized in advance during vect_pattern_recog. In this case, STMT_INFO
7444 may be of this form:
7445 X = pattern_expr (arg0, arg1, ..., X)
7446 and its STMT_VINFO_RELATED_STMT points to the last stmt in the original
7447 sequence that had been detected and replaced by the pattern-stmt
7450 This function also handles reduction of condition expressions, for example:
7451 for (int i = 0; i < N; i++)
7454 This is handled by vectorising the loop and creating an additional vector
7455 containing the loop indexes for which "a[i] < value" was true. In the
7456 function epilogue this is reduced to a single max value and then used to
7457 index into the vector of results.
7459 In some cases of reduction patterns, the type of the reduction variable X is
7460 different than the type of the other arguments of STMT_INFO.
7461 In such cases, the vectype that is used when transforming STMT_INFO into
7462 a vector stmt is different than the vectype that is used to determine the
7463 vectorization factor, because it consists of a different number of elements
7464 than the actual number of elements that are being operated upon in parallel.
7466 For example, consider an accumulation of shorts into an int accumulator.
7467 On some targets it's possible to vectorize this pattern operating on 8
7468 shorts at a time (hence, the vectype for purposes of determining the
7469 vectorization factor should be V8HI); on the other hand, the vectype that
7470 is used to create the vector form is actually V4SI (the type of the result).
7472 Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
7473 indicates what is the actual level of parallelism (V8HI in the example), so
7474 that the right vectorization factor would be derived. This vectype
7475 corresponds to the type of arguments to the reduction stmt, and should *NOT*
7476 be used to create the vectorized stmt. The right vectype for the vectorized
7477 stmt is obtained from the type of the result X:
7478 get_vectype_for_scalar_type (vinfo, TREE_TYPE (X))
7480 This means that, contrary to "regular" reductions (or "regular" stmts in
7481 general), the following equation:
7482 STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (vinfo, TREE_TYPE (X))
7483 does *NOT* necessarily hold for reduction patterns. */
7486 vectorizable_reduction (loop_vec_info loop_vinfo
,
7487 stmt_vec_info stmt_info
, slp_tree slp_node
,
7488 slp_instance slp_node_instance
,
7489 stmt_vector_for_cost
*cost_vec
)
7491 tree vectype_in
= NULL_TREE
;
7492 class loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
7493 enum vect_def_type cond_reduc_dt
= vect_unknown_def_type
;
7494 stmt_vec_info cond_stmt_vinfo
= NULL
;
7497 bool single_defuse_cycle
= false;
7498 bool nested_cycle
= false;
7499 bool double_reduc
= false;
7501 tree cr_index_scalar_type
= NULL_TREE
, cr_index_vector_type
= NULL_TREE
;
7502 tree cond_reduc_val
= NULL_TREE
;
7504 /* Make sure it was already recognized as a reduction computation. */
7505 if (STMT_VINFO_DEF_TYPE (stmt_info
) != vect_reduction_def
7506 && STMT_VINFO_DEF_TYPE (stmt_info
) != vect_double_reduction_def
7507 && STMT_VINFO_DEF_TYPE (stmt_info
) != vect_nested_cycle
)
7510 /* The stmt we store reduction analysis meta on. */
7511 stmt_vec_info reduc_info
= info_for_reduction (loop_vinfo
, stmt_info
);
7512 reduc_info
->is_reduc_info
= true;
7514 if (STMT_VINFO_DEF_TYPE (stmt_info
) == vect_nested_cycle
)
7516 if (is_a
<gphi
*> (stmt_info
->stmt
))
7520 /* We eventually need to set a vector type on invariant
7524 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node
), j
, child
)
7525 if (!vect_maybe_update_slp_op_vectype
7526 (child
, SLP_TREE_VECTYPE (slp_node
)))
7528 if (dump_enabled_p ())
7529 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
7530 "incompatible vector types for "
7535 /* Analysis for double-reduction is done on the outer
7536 loop PHI, nested cycles have no further restrictions. */
7537 STMT_VINFO_TYPE (stmt_info
) = cycle_phi_info_type
;
7540 STMT_VINFO_TYPE (stmt_info
) = reduc_vec_info_type
;
7544 stmt_vec_info orig_stmt_of_analysis
= stmt_info
;
7545 stmt_vec_info phi_info
= stmt_info
;
7546 if (!is_a
<gphi
*> (stmt_info
->stmt
))
7548 STMT_VINFO_TYPE (stmt_info
) = reduc_vec_info_type
;
7553 slp_node_instance
->reduc_phis
= slp_node
;
7554 /* ??? We're leaving slp_node to point to the PHIs, we only
7555 need it to get at the number of vector stmts which wasn't
7556 yet initialized for the instance root. */
7558 if (STMT_VINFO_DEF_TYPE (stmt_info
) == vect_double_reduction_def
)
7560 use_operand_p use_p
;
7562 bool res
= single_imm_use (gimple_phi_result (stmt_info
->stmt
),
7565 phi_info
= loop_vinfo
->lookup_stmt (use_stmt
);
7568 /* PHIs should not participate in patterns. */
7569 gcc_assert (!STMT_VINFO_RELATED_STMT (phi_info
));
7570 gphi
*reduc_def_phi
= as_a
<gphi
*> (phi_info
->stmt
);
7572 /* Verify following REDUC_IDX from the latch def leads us back to the PHI
7573 and compute the reduction chain length. Discover the real
7574 reduction operation stmt on the way (stmt_info and slp_for_stmt_info). */
7576 = PHI_ARG_DEF_FROM_EDGE (reduc_def_phi
,
7578 (gimple_bb (reduc_def_phi
)->loop_father
));
7579 unsigned reduc_chain_length
= 0;
7580 bool only_slp_reduc_chain
= true;
7582 slp_tree slp_for_stmt_info
= slp_node
? slp_node_instance
->root
: NULL
;
7583 while (reduc_def
!= PHI_RESULT (reduc_def_phi
))
7585 stmt_vec_info def
= loop_vinfo
->lookup_def (reduc_def
);
7586 stmt_vec_info vdef
= vect_stmt_to_vectorize (def
);
7587 if (STMT_VINFO_REDUC_IDX (vdef
) == -1)
7589 if (dump_enabled_p ())
7590 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
7591 "reduction chain broken by patterns.\n");
7594 if (!REDUC_GROUP_FIRST_ELEMENT (vdef
))
7595 only_slp_reduc_chain
= false;
7596 /* For epilogue generation live members of the chain need
7597 to point back to the PHI via their original stmt for
7598 info_for_reduction to work. For SLP we need to look at
7599 all lanes here - even though we only will vectorize from
7600 the SLP node with live lane zero the other live lanes also
7601 need to be identified as part of a reduction to be able
7602 to skip code generation for them. */
7603 if (slp_for_stmt_info
)
7605 for (auto s
: SLP_TREE_SCALAR_STMTS (slp_for_stmt_info
))
7606 if (STMT_VINFO_LIVE_P (s
))
7607 STMT_VINFO_REDUC_DEF (vect_orig_stmt (s
)) = phi_info
;
7609 else if (STMT_VINFO_LIVE_P (vdef
))
7610 STMT_VINFO_REDUC_DEF (def
) = phi_info
;
7612 if (!gimple_extract_op (vdef
->stmt
, &op
))
7614 if (dump_enabled_p ())
7615 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
7616 "reduction chain includes unsupported"
7617 " statement type.\n");
7620 if (CONVERT_EXPR_CODE_P (op
.code
))
7622 if (!tree_nop_conversion_p (op
.type
, TREE_TYPE (op
.ops
[0])))
7624 if (dump_enabled_p ())
7625 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
7626 "conversion in the reduction chain.\n");
7630 else if (!stmt_info
)
7631 /* First non-conversion stmt. */
7633 reduc_def
= op
.ops
[STMT_VINFO_REDUC_IDX (vdef
)];
7634 reduc_chain_length
++;
7635 if (!stmt_info
&& slp_node
)
7636 slp_for_stmt_info
= SLP_TREE_CHILDREN (slp_for_stmt_info
)[0];
7638 /* PHIs should not participate in patterns. */
7639 gcc_assert (!STMT_VINFO_RELATED_STMT (phi_info
));
7641 if (nested_in_vect_loop_p (loop
, stmt_info
))
7644 nested_cycle
= true;
7647 /* STMT_VINFO_REDUC_DEF doesn't point to the first but the last
7649 if (slp_node
&& REDUC_GROUP_FIRST_ELEMENT (stmt_info
))
7651 gcc_assert (!REDUC_GROUP_NEXT_ELEMENT (stmt_info
));
7652 stmt_info
= REDUC_GROUP_FIRST_ELEMENT (stmt_info
);
7654 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info
))
7655 gcc_assert (slp_node
7656 && REDUC_GROUP_FIRST_ELEMENT (stmt_info
) == stmt_info
);
7658 /* 1. Is vectorizable reduction? */
7659 /* Not supportable if the reduction variable is used in the loop, unless
7660 it's a reduction chain. */
7661 if (STMT_VINFO_RELEVANT (stmt_info
) > vect_used_in_outer
7662 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info
))
7665 /* Reductions that are not used even in an enclosing outer-loop,
7666 are expected to be "live" (used out of the loop). */
7667 if (STMT_VINFO_RELEVANT (stmt_info
) == vect_unused_in_scope
7668 && !STMT_VINFO_LIVE_P (stmt_info
))
7671 /* 2. Has this been recognized as a reduction pattern?
7673 Check if STMT represents a pattern that has been recognized
7674 in earlier analysis stages. For stmts that represent a pattern,
7675 the STMT_VINFO_RELATED_STMT field records the last stmt in
7676 the original sequence that constitutes the pattern. */
7678 stmt_vec_info orig_stmt_info
= STMT_VINFO_RELATED_STMT (stmt_info
);
7681 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info
));
7682 gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info
));
7685 /* 3. Check the operands of the operation. The first operands are defined
7686 inside the loop body. The last operand is the reduction variable,
7687 which is defined by the loop-header-phi. */
7689 tree vectype_out
= STMT_VINFO_VECTYPE (stmt_info
);
7690 STMT_VINFO_REDUC_VECTYPE (reduc_info
) = vectype_out
;
7692 if (!gimple_extract_op (stmt_info
->stmt
, &op
))
7694 bool lane_reduc_code_p
= (op
.code
== DOT_PROD_EXPR
7695 || op
.code
== WIDEN_SUM_EXPR
7696 || op
.code
== SAD_EXPR
);
7698 if (!POINTER_TYPE_P (op
.type
) && !INTEGRAL_TYPE_P (op
.type
)
7699 && !SCALAR_FLOAT_TYPE_P (op
.type
))
7702 /* Do not try to vectorize bit-precision reductions. */
7703 if (!type_has_mode_precision_p (op
.type
))
7706 /* For lane-reducing ops we're reducing the number of reduction PHIs
7707 which means the only use of that may be in the lane-reducing operation. */
7708 if (lane_reduc_code_p
7709 && reduc_chain_length
!= 1
7710 && !only_slp_reduc_chain
)
7712 if (dump_enabled_p ())
7713 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
7714 "lane-reducing reduction with extra stmts.\n");
7718 /* All uses but the last are expected to be defined in the loop.
7719 The last use is the reduction variable. In case of nested cycle this
7720 assumption is not true: we use reduc_index to record the index of the
7721 reduction variable. */
7722 slp_tree
*slp_op
= XALLOCAVEC (slp_tree
, op
.num_ops
);
7723 tree
*vectype_op
= XALLOCAVEC (tree
, op
.num_ops
);
7724 /* We need to skip an extra operand for COND_EXPRs with embedded
7726 unsigned opno_adjust
= 0;
7727 if (op
.code
== COND_EXPR
&& COMPARISON_CLASS_P (op
.ops
[0]))
7729 for (i
= 0; i
< (int) op
.num_ops
; i
++)
7731 /* The condition of COND_EXPR is checked in vectorizable_condition(). */
7732 if (i
== 0 && op
.code
== COND_EXPR
)
7735 stmt_vec_info def_stmt_info
;
7736 enum vect_def_type dt
;
7737 if (!vect_is_simple_use (loop_vinfo
, stmt_info
, slp_for_stmt_info
,
7738 i
+ opno_adjust
, &op
.ops
[i
], &slp_op
[i
], &dt
,
7739 &vectype_op
[i
], &def_stmt_info
))
7741 if (dump_enabled_p ())
7742 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
7743 "use not simple.\n");
7746 if (i
== STMT_VINFO_REDUC_IDX (stmt_info
))
7749 /* For an IFN_COND_OP we might hit the reduction definition operand
7750 twice (once as definition, once as else). */
7751 if (op
.ops
[i
] == op
.ops
[STMT_VINFO_REDUC_IDX (stmt_info
)])
7754 /* There should be only one cycle def in the stmt, the one
7755 leading to reduc_def. */
7756 if (VECTORIZABLE_CYCLE_DEF (dt
))
7761 = get_vectype_for_scalar_type (loop_vinfo
,
7762 TREE_TYPE (op
.ops
[i
]), slp_op
[i
]);
7764 /* To properly compute ncopies we are interested in the widest
7765 non-reduction input type in case we're looking at a widening
7766 accumulation that we later handle in vect_transform_reduction. */
7767 if (lane_reduc_code_p
7770 || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in
)))
7771 < GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_op
[i
]))))))
7772 vectype_in
= vectype_op
[i
];
7774 /* Record how the non-reduction-def value of COND_EXPR is defined.
7775 ??? For a chain of multiple CONDs we'd have to match them up all. */
7776 if (op
.code
== COND_EXPR
&& reduc_chain_length
== 1)
7778 if (dt
== vect_constant_def
)
7781 cond_reduc_val
= op
.ops
[i
];
7783 else if (dt
== vect_induction_def
7785 && is_nonwrapping_integer_induction (def_stmt_info
, loop
))
7788 cond_stmt_vinfo
= def_stmt_info
;
7793 vectype_in
= STMT_VINFO_VECTYPE (phi_info
);
7794 STMT_VINFO_REDUC_VECTYPE_IN (reduc_info
) = vectype_in
;
7796 enum vect_reduction_type v_reduc_type
= STMT_VINFO_REDUC_TYPE (phi_info
);
7797 STMT_VINFO_REDUC_TYPE (reduc_info
) = v_reduc_type
;
7798 /* If we have a condition reduction, see if we can simplify it further. */
7799 if (v_reduc_type
== COND_REDUCTION
)
7804 /* When the condition uses the reduction value in the condition, fail. */
7805 if (STMT_VINFO_REDUC_IDX (stmt_info
) == 0)
7807 if (dump_enabled_p ())
7808 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
7809 "condition depends on previous iteration\n");
7813 if (reduc_chain_length
== 1
7814 && (direct_internal_fn_supported_p (IFN_FOLD_EXTRACT_LAST
, vectype_in
,
7816 || direct_internal_fn_supported_p (IFN_LEN_FOLD_EXTRACT_LAST
,
7818 OPTIMIZE_FOR_SPEED
)))
7820 if (dump_enabled_p ())
7821 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
7822 "optimizing condition reduction with"
7823 " FOLD_EXTRACT_LAST.\n");
7824 STMT_VINFO_REDUC_TYPE (reduc_info
) = EXTRACT_LAST_REDUCTION
;
7826 else if (cond_reduc_dt
== vect_induction_def
)
7829 = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (cond_stmt_vinfo
);
7830 tree step
= STMT_VINFO_LOOP_PHI_EVOLUTION_PART (cond_stmt_vinfo
);
7832 gcc_assert (TREE_CODE (base
) == INTEGER_CST
7833 && TREE_CODE (step
) == INTEGER_CST
);
7834 cond_reduc_val
= NULL_TREE
;
7835 enum tree_code cond_reduc_op_code
= ERROR_MARK
;
7836 tree res
= PHI_RESULT (STMT_VINFO_STMT (cond_stmt_vinfo
));
7837 if (!types_compatible_p (TREE_TYPE (res
), TREE_TYPE (base
)))
7839 /* Find a suitable value, for MAX_EXPR below base, for MIN_EXPR
7840 above base; punt if base is the minimum value of the type for
7841 MAX_EXPR or maximum value of the type for MIN_EXPR for now. */
7842 else if (tree_int_cst_sgn (step
) == -1)
7844 cond_reduc_op_code
= MIN_EXPR
;
7845 if (tree_int_cst_sgn (base
) == -1)
7846 cond_reduc_val
= build_int_cst (TREE_TYPE (base
), 0);
7847 else if (tree_int_cst_lt (base
,
7848 TYPE_MAX_VALUE (TREE_TYPE (base
))))
7850 = int_const_binop (PLUS_EXPR
, base
, integer_one_node
);
7854 cond_reduc_op_code
= MAX_EXPR
;
7855 if (tree_int_cst_sgn (base
) == 1)
7856 cond_reduc_val
= build_int_cst (TREE_TYPE (base
), 0);
7857 else if (tree_int_cst_lt (TYPE_MIN_VALUE (TREE_TYPE (base
)),
7860 = int_const_binop (MINUS_EXPR
, base
, integer_one_node
);
7864 if (dump_enabled_p ())
7865 dump_printf_loc (MSG_NOTE
, vect_location
,
7866 "condition expression based on "
7867 "integer induction.\n");
7868 STMT_VINFO_REDUC_CODE (reduc_info
) = cond_reduc_op_code
;
7869 STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info
)
7871 STMT_VINFO_REDUC_TYPE (reduc_info
) = INTEGER_INDUC_COND_REDUCTION
;
7874 else if (cond_reduc_dt
== vect_constant_def
)
7876 enum vect_def_type cond_initial_dt
;
7877 tree cond_initial_val
= vect_phi_initial_value (reduc_def_phi
);
7878 vect_is_simple_use (cond_initial_val
, loop_vinfo
, &cond_initial_dt
);
7879 if (cond_initial_dt
== vect_constant_def
7880 && types_compatible_p (TREE_TYPE (cond_initial_val
),
7881 TREE_TYPE (cond_reduc_val
)))
7883 tree e
= fold_binary (LE_EXPR
, boolean_type_node
,
7884 cond_initial_val
, cond_reduc_val
);
7885 if (e
&& (integer_onep (e
) || integer_zerop (e
)))
7887 if (dump_enabled_p ())
7888 dump_printf_loc (MSG_NOTE
, vect_location
,
7889 "condition expression based on "
7890 "compile time constant.\n");
7891 /* Record reduction code at analysis stage. */
7892 STMT_VINFO_REDUC_CODE (reduc_info
)
7893 = integer_onep (e
) ? MAX_EXPR
: MIN_EXPR
;
7894 STMT_VINFO_REDUC_TYPE (reduc_info
) = CONST_COND_REDUCTION
;
7900 if (STMT_VINFO_LIVE_P (phi_info
))
7906 ncopies
= vect_get_num_copies (loop_vinfo
, vectype_in
);
7908 gcc_assert (ncopies
>= 1);
7910 poly_uint64 nunits_out
= TYPE_VECTOR_SUBPARTS (vectype_out
);
7914 gcc_assert (STMT_VINFO_DEF_TYPE (reduc_info
)
7915 == vect_double_reduction_def
);
7916 double_reduc
= true;
7919 /* 4.2. Check support for the epilog operation.
7921 If STMT represents a reduction pattern, then the type of the
7922 reduction variable may be different than the type of the rest
7923 of the arguments. For example, consider the case of accumulation
7924 of shorts into an int accumulator; The original code:
7925 S1: int_a = (int) short_a;
7926 orig_stmt-> S2: int_acc = plus <int_a ,int_acc>;
7929 STMT: int_acc = widen_sum <short_a, int_acc>
7932 1. The tree-code that is used to create the vector operation in the
7933 epilog code (that reduces the partial results) is not the
7934 tree-code of STMT, but is rather the tree-code of the original
7935 stmt from the pattern that STMT is replacing. I.e, in the example
7936 above we want to use 'widen_sum' in the loop, but 'plus' in the
7938 2. The type (mode) we use to check available target support
7939 for the vector operation to be created in the *epilog*, is
7940 determined by the type of the reduction variable (in the example
7941 above we'd check this: optab_handler (plus_optab, vect_int_mode])).
7942 However the type (mode) we use to check available target support
7943 for the vector operation to be created *inside the loop*, is
7944 determined by the type of the other arguments to STMT (in the
7945 example we'd check this: optab_handler (widen_sum_optab,
7948 This is contrary to "regular" reductions, in which the types of all
7949 the arguments are the same as the type of the reduction variable.
7950 For "regular" reductions we can therefore use the same vector type
7951 (and also the same tree-code) when generating the epilog code and
7952 when generating the code inside the loop. */
7954 code_helper orig_code
= STMT_VINFO_REDUC_CODE (phi_info
);
7956 /* If conversion might have created a conditional operation like
7957 IFN_COND_ADD already. Use the internal code for the following checks. */
7958 if (orig_code
.is_internal_fn ())
7960 tree_code new_code
= conditional_internal_fn_code (internal_fn (orig_code
));
7961 orig_code
= new_code
!= ERROR_MARK
? new_code
: orig_code
;
7964 STMT_VINFO_REDUC_CODE (reduc_info
) = orig_code
;
7966 vect_reduction_type reduction_type
= STMT_VINFO_REDUC_TYPE (reduc_info
);
7967 if (reduction_type
== TREE_CODE_REDUCTION
)
7969 /* Check whether it's ok to change the order of the computation.
7970 Generally, when vectorizing a reduction we change the order of the
7971 computation. This may change the behavior of the program in some
7972 cases, so we need to check that this is ok. One exception is when
7973 vectorizing an outer-loop: the inner-loop is executed sequentially,
7974 and therefore vectorizing reductions in the inner-loop during
7975 outer-loop vectorization is safe. Likewise when we are vectorizing
7976 a series of reductions using SLP and the VF is one the reductions
7977 are performed in scalar order. */
7979 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info
)
7980 && known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo
), 1u))
7982 else if (needs_fold_left_reduction_p (op
.type
, orig_code
))
7984 /* When vectorizing a reduction chain w/o SLP the reduction PHI
7985 is not directy used in stmt. */
7986 if (!only_slp_reduc_chain
7987 && reduc_chain_length
!= 1)
7989 if (dump_enabled_p ())
7990 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
7991 "in-order reduction chain without SLP.\n");
7994 STMT_VINFO_REDUC_TYPE (reduc_info
)
7995 = reduction_type
= FOLD_LEFT_REDUCTION
;
7997 else if (!commutative_binary_op_p (orig_code
, op
.type
)
7998 || !associative_binary_op_p (orig_code
, op
.type
))
8000 if (dump_enabled_p ())
8001 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
8002 "reduction: not commutative/associative\n");
8007 if ((double_reduc
|| reduction_type
!= TREE_CODE_REDUCTION
)
8010 if (dump_enabled_p ())
8011 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
8012 "multiple types in double reduction or condition "
8013 "reduction or fold-left reduction.\n");
8017 internal_fn reduc_fn
= IFN_LAST
;
8018 if (reduction_type
== TREE_CODE_REDUCTION
8019 || reduction_type
== FOLD_LEFT_REDUCTION
8020 || reduction_type
== INTEGER_INDUC_COND_REDUCTION
8021 || reduction_type
== CONST_COND_REDUCTION
)
8023 if (reduction_type
== FOLD_LEFT_REDUCTION
8024 ? fold_left_reduction_fn (orig_code
, &reduc_fn
)
8025 : reduction_fn_for_scalar_code (orig_code
, &reduc_fn
))
8027 if (reduc_fn
!= IFN_LAST
8028 && !direct_internal_fn_supported_p (reduc_fn
, vectype_out
,
8029 OPTIMIZE_FOR_SPEED
))
8031 if (dump_enabled_p ())
8032 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
8033 "reduc op not supported by target.\n");
8035 reduc_fn
= IFN_LAST
;
8040 if (!nested_cycle
|| double_reduc
)
8042 if (dump_enabled_p ())
8043 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
8044 "no reduc code for scalar code.\n");
8050 else if (reduction_type
== COND_REDUCTION
)
8052 int scalar_precision
8053 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (op
.type
));
8054 cr_index_scalar_type
= make_unsigned_type (scalar_precision
);
8055 cr_index_vector_type
= get_same_sized_vectype (cr_index_scalar_type
,
8058 if (direct_internal_fn_supported_p (IFN_REDUC_MAX
, cr_index_vector_type
,
8059 OPTIMIZE_FOR_SPEED
))
8060 reduc_fn
= IFN_REDUC_MAX
;
8062 STMT_VINFO_REDUC_FN (reduc_info
) = reduc_fn
;
8064 if (reduction_type
!= EXTRACT_LAST_REDUCTION
8065 && (!nested_cycle
|| double_reduc
)
8066 && reduc_fn
== IFN_LAST
8067 && !nunits_out
.is_constant ())
8069 if (dump_enabled_p ())
8070 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
8071 "missing target support for reduction on"
8072 " variable-length vectors.\n");
8076 /* For SLP reductions, see if there is a neutral value we can use. */
8077 tree neutral_op
= NULL_TREE
;
8080 tree initial_value
= NULL_TREE
;
8081 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info
) != NULL
)
8082 initial_value
= vect_phi_initial_value (reduc_def_phi
);
8083 neutral_op
= neutral_op_for_reduction (TREE_TYPE (vectype_out
),
8084 orig_code
, initial_value
);
8087 if (double_reduc
&& reduction_type
== FOLD_LEFT_REDUCTION
)
8089 /* We can't support in-order reductions of code such as this:
8091 for (int i = 0; i < n1; ++i)
8092 for (int j = 0; j < n2; ++j)
8095 since GCC effectively transforms the loop when vectorizing:
8097 for (int i = 0; i < n1 / VF; ++i)
8098 for (int j = 0; j < n2; ++j)
8099 for (int k = 0; k < VF; ++k)
8102 which is a reassociation of the original operation. */
8103 if (dump_enabled_p ())
8104 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
8105 "in-order double reduction not supported.\n");
8110 if (reduction_type
== FOLD_LEFT_REDUCTION
8112 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info
))
8114 /* We cannot use in-order reductions in this case because there is
8115 an implicit reassociation of the operations involved. */
8116 if (dump_enabled_p ())
8117 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
8118 "in-order unchained SLP reductions not supported.\n");
8122 /* For double reductions, and for SLP reductions with a neutral value,
8123 we construct a variable-length initial vector by loading a vector
8124 full of the neutral value and then shift-and-inserting the start
8125 values into the low-numbered elements. */
8126 if ((double_reduc
|| neutral_op
)
8127 && !nunits_out
.is_constant ()
8128 && !direct_internal_fn_supported_p (IFN_VEC_SHL_INSERT
,
8129 vectype_out
, OPTIMIZE_FOR_SPEED
))
8131 if (dump_enabled_p ())
8132 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
8133 "reduction on variable-length vectors requires"
8134 " target support for a vector-shift-and-insert"
8139 /* Check extra constraints for variable-length unchained SLP reductions. */
8141 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info
)
8142 && !nunits_out
.is_constant ())
8144 /* We checked above that we could build the initial vector when
8145 there's a neutral element value. Check here for the case in
8146 which each SLP statement has its own initial value and in which
8147 that value needs to be repeated for every instance of the
8148 statement within the initial vector. */
8149 unsigned int group_size
= SLP_TREE_LANES (slp_node
);
8151 && !can_duplicate_and_interleave_p (loop_vinfo
, group_size
,
8152 TREE_TYPE (vectype_out
)))
8154 if (dump_enabled_p ())
8155 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
8156 "unsupported form of SLP reduction for"
8157 " variable-length vectors: cannot build"
8158 " initial vector.\n");
8161 /* The epilogue code relies on the number of elements being a multiple
8162 of the group size. The duplicate-and-interleave approach to setting
8163 up the initial vector does too. */
8164 if (!multiple_p (nunits_out
, group_size
))
8166 if (dump_enabled_p ())
8167 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
8168 "unsupported form of SLP reduction for"
8169 " variable-length vectors: the vector size"
8170 " is not a multiple of the number of results.\n");
8175 if (reduction_type
== COND_REDUCTION
)
8179 if (! max_loop_iterations (loop
, &ni
))
8181 if (dump_enabled_p ())
8182 dump_printf_loc (MSG_NOTE
, vect_location
,
8183 "loop count not known, cannot create cond "
8187 /* Convert backedges to iterations. */
8190 /* The additional index will be the same type as the condition. Check
8191 that the loop can fit into this less one (because we'll use up the
8192 zero slot for when there are no matches). */
8193 tree max_index
= TYPE_MAX_VALUE (cr_index_scalar_type
);
8194 if (wi::geu_p (ni
, wi::to_widest (max_index
)))
8196 if (dump_enabled_p ())
8197 dump_printf_loc (MSG_NOTE
, vect_location
,
8198 "loop size is greater than data size.\n");
8203 /* In case the vectorization factor (VF) is bigger than the number
8204 of elements that we can fit in a vectype (nunits), we have to generate
8205 more than one vector stmt - i.e - we need to "unroll" the
8206 vector stmt by a factor VF/nunits. For more details see documentation
8207 in vectorizable_operation. */
8209 /* If the reduction is used in an outer loop we need to generate
8210 VF intermediate results, like so (e.g. for ncopies=2):
8215 (i.e. we generate VF results in 2 registers).
8216 In this case we have a separate def-use cycle for each copy, and therefore
8217 for each copy we get the vector def for the reduction variable from the
8218 respective phi node created for this copy.
8220 Otherwise (the reduction is unused in the loop nest), we can combine
8221 together intermediate results, like so (e.g. for ncopies=2):
8225 (i.e. we generate VF/2 results in a single register).
8226 In this case for each copy we get the vector def for the reduction variable
8227 from the vectorized reduction operation generated in the previous iteration.
8229 This only works when we see both the reduction PHI and its only consumer
8230 in vectorizable_reduction and there are no intermediate stmts
8231 participating. When unrolling we want each unrolled iteration to have its
8232 own reduction accumulator since one of the main goals of unrolling a
8233 reduction is to reduce the aggregate loop-carried latency. */
8235 && (STMT_VINFO_RELEVANT (stmt_info
) <= vect_used_only_live
)
8236 && reduc_chain_length
== 1
8237 && loop_vinfo
->suggested_unroll_factor
== 1)
8238 single_defuse_cycle
= true;
8240 if (single_defuse_cycle
|| lane_reduc_code_p
)
8242 gcc_assert (op
.code
!= COND_EXPR
);
8244 /* 4. Supportable by target? */
8247 /* 4.1. check support for the operation in the loop
8249 This isn't necessary for the lane reduction codes, since they
8250 can only be produced by pattern matching, and it's up to the
8251 pattern matcher to test for support. The main reason for
8252 specifically skipping this step is to avoid rechecking whether
8253 mixed-sign dot-products can be implemented using signed
8255 machine_mode vec_mode
= TYPE_MODE (vectype_in
);
8256 if (!lane_reduc_code_p
8257 && !directly_supported_p (op
.code
, vectype_in
, optab_vector
))
8259 if (dump_enabled_p ())
8260 dump_printf (MSG_NOTE
, "op not supported by target.\n");
8261 if (maybe_ne (GET_MODE_SIZE (vec_mode
), UNITS_PER_WORD
)
8262 || !vect_can_vectorize_without_simd_p (op
.code
))
8265 if (dump_enabled_p ())
8266 dump_printf (MSG_NOTE
, "proceeding using word mode.\n");
8269 if (vect_emulated_vector_p (vectype_in
)
8270 && !vect_can_vectorize_without_simd_p (op
.code
))
8272 if (dump_enabled_p ())
8273 dump_printf (MSG_NOTE
, "using word mode not possible.\n");
8277 /* lane-reducing operations have to go through vect_transform_reduction.
8278 For the other cases try without the single cycle optimization. */
8281 if (lane_reduc_code_p
)
8284 single_defuse_cycle
= false;
8287 STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info
) = single_defuse_cycle
;
8289 /* If the reduction stmt is one of the patterns that have lane
8290 reduction embedded we cannot handle the case of ! single_defuse_cycle. */
8291 if ((ncopies
> 1 && ! single_defuse_cycle
)
8292 && lane_reduc_code_p
)
8294 if (dump_enabled_p ())
8295 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
8296 "multi def-use cycle not possible for lane-reducing "
8297 "reduction operation\n");
8302 && !(!single_defuse_cycle
8303 && !lane_reduc_code_p
8304 && reduction_type
!= FOLD_LEFT_REDUCTION
))
8305 for (i
= 0; i
< (int) op
.num_ops
; i
++)
8306 if (!vect_maybe_update_slp_op_vectype (slp_op
[i
], vectype_op
[i
]))
8308 if (dump_enabled_p ())
8309 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
8310 "incompatible vector types for invariants\n");
8315 vec_num
= SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node
);
8319 vect_model_reduction_cost (loop_vinfo
, stmt_info
, reduc_fn
,
8320 reduction_type
, ncopies
, cost_vec
);
8321 /* Cost the reduction op inside the loop if transformed via
8322 vect_transform_reduction. Otherwise this is costed by the
8323 separate vectorizable_* routines. */
8324 if (single_defuse_cycle
|| lane_reduc_code_p
)
8327 if (vect_is_emulated_mixed_dot_prod (loop_vinfo
, stmt_info
))
8328 /* Three dot-products and a subtraction. */
8330 record_stmt_cost (cost_vec
, ncopies
* factor
, vector_stmt
,
8331 stmt_info
, 0, vect_body
);
8334 if (dump_enabled_p ()
8335 && reduction_type
== FOLD_LEFT_REDUCTION
)
8336 dump_printf_loc (MSG_NOTE
, vect_location
,
8337 "using an in-order (fold-left) reduction.\n");
8338 STMT_VINFO_TYPE (orig_stmt_of_analysis
) = cycle_phi_info_type
;
8339 /* All but single defuse-cycle optimized, lane-reducing and fold-left
8340 reductions go through their own vectorizable_* routines. */
8341 if (!single_defuse_cycle
8342 && !lane_reduc_code_p
8343 && reduction_type
!= FOLD_LEFT_REDUCTION
)
8346 = vect_stmt_to_vectorize (STMT_VINFO_REDUC_DEF (phi_info
));
8347 if (slp_node
&& REDUC_GROUP_FIRST_ELEMENT (tem
))
8349 gcc_assert (!REDUC_GROUP_NEXT_ELEMENT (tem
));
8350 tem
= REDUC_GROUP_FIRST_ELEMENT (tem
);
8352 STMT_VINFO_DEF_TYPE (vect_orig_stmt (tem
)) = vect_internal_def
;
8353 STMT_VINFO_DEF_TYPE (tem
) = vect_internal_def
;
8355 else if (loop_vinfo
&& LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo
))
8357 vec_loop_masks
*masks
= &LOOP_VINFO_MASKS (loop_vinfo
);
8358 vec_loop_lens
*lens
= &LOOP_VINFO_LENS (loop_vinfo
);
8359 internal_fn cond_fn
= get_conditional_internal_fn (op
.code
, op
.type
);
8361 if (reduction_type
!= FOLD_LEFT_REDUCTION
8362 && !use_mask_by_cond_expr_p (op
.code
, cond_fn
, vectype_in
)
8363 && (cond_fn
== IFN_LAST
8364 || !direct_internal_fn_supported_p (cond_fn
, vectype_in
,
8365 OPTIMIZE_FOR_SPEED
)))
8367 if (dump_enabled_p ())
8368 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
8369 "can't operate on partial vectors because"
8370 " no conditional operation is available.\n");
8371 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo
) = false;
8373 else if (reduction_type
== FOLD_LEFT_REDUCTION
8374 && reduc_fn
== IFN_LAST
8375 && !expand_vec_cond_expr_p (vectype_in
,
8376 truth_type_for (vectype_in
),
8379 if (dump_enabled_p ())
8380 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
8381 "can't operate on partial vectors because"
8382 " no conditional operation is available.\n");
8383 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo
) = false;
8385 else if (reduction_type
== FOLD_LEFT_REDUCTION
8386 && internal_fn_mask_index (reduc_fn
) == -1
8387 && FLOAT_TYPE_P (vectype_in
)
8388 && HONOR_SIGN_DEPENDENT_ROUNDING (vectype_in
))
8390 if (dump_enabled_p ())
8391 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
8392 "can't operate on partial vectors because"
8393 " signed zeros cannot be preserved.\n");
8394 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo
) = false;
8398 internal_fn mask_reduc_fn
8399 = get_masked_reduction_fn (reduc_fn
, vectype_in
);
8401 if (mask_reduc_fn
== IFN_MASK_LEN_FOLD_LEFT_PLUS
)
8402 vect_record_loop_len (loop_vinfo
, lens
, ncopies
* vec_num
,
8405 vect_record_loop_mask (loop_vinfo
, masks
, ncopies
* vec_num
,
8412 /* STMT_INFO is a dot-product reduction whose multiplication operands
8413 have different signs. Emit a sequence to emulate the operation
8414 using a series of signed DOT_PROD_EXPRs and return the last
8415 statement generated. VEC_DEST is the result of the vector operation
8416 and VOP lists its inputs. */
/* NOTE(review): this extraction has gaps in the embedded original line
   numbering, so some statements, braces and declarations are not visible
   here; the comments below describe only what is shown.  */
8419 vect_emulate_mixed_dot_prod (loop_vec_info loop_vinfo
, stmt_vec_info stmt_info
,
8420 gimple_stmt_iterator
*gsi
, tree vec_dest
,
/* Work in the signed variants of the wide (accumulator) and narrow
   (multiplicand) vector types, since the emulation below emits signed
   DOT_PROD_EXPRs only.  */
8423 tree wide_vectype
= signed_type_for (TREE_TYPE (vec_dest
));
8424 tree narrow_vectype
= signed_type_for (TREE_TYPE (vop
[0]));
8425 tree narrow_elttype
= TREE_TYPE (narrow_vectype
);
8428 /* Make VOP[0] the unsigned operand VOP[1] the signed operand. */
8429 if (!TYPE_UNSIGNED (TREE_TYPE (vop
[0])))
8430 std::swap (vop
[0], vop
[1]);
8432 /* Convert all inputs to signed types. */
8433 for (int i
= 0; i
< 3; ++i
)
8434 if (TYPE_UNSIGNED (TREE_TYPE (vop
[i
])))
8436 tree tmp
= make_ssa_name (signed_type_for (TREE_TYPE (vop
[i
])));
8437 new_stmt
= gimple_build_assign (tmp
, NOP_EXPR
, vop
[i
]);
8438 vect_finish_stmt_generation (loop_vinfo
, stmt_info
, new_stmt
, gsi
);
8442 /* In the comments below we assume 8-bit inputs for simplicity,
8443 but the approach works for any full integer type. */
8445 /* Create a vector of -128. */
/* TYPE_MIN_VALUE of the narrow element type, splat over the narrow
   vector type.  */
8446 tree min_narrow_elttype
= TYPE_MIN_VALUE (narrow_elttype
);
8447 tree min_narrow
= build_vector_from_val (narrow_vectype
,
8448 min_narrow_elttype
);
8450 /* Create a vector of 64. */
/* 64 is the minimum value logically shifted right by one; splat it over
   the narrow vector type as well.  */
8451 auto half_wi
= wi::lrshift (wi::to_wide (min_narrow_elttype
), 1);
8452 tree half_narrow
= wide_int_to_tree (narrow_elttype
, half_wi
);
8453 half_narrow
= build_vector_from_val (narrow_vectype
, half_narrow
);
8455 /* Emit: SUB_RES = VOP[0] - 128. */
/* Adding the (negative) minimum value implements the subtraction.  */
8456 tree sub_res
= make_ssa_name (narrow_vectype
);
8457 new_stmt
= gimple_build_assign (sub_res
, PLUS_EXPR
, vop
[0], min_narrow
);
8458 vect_finish_stmt_generation (loop_vinfo
, stmt_info
, new_stmt
, gsi
);
8462 STAGE1 = DOT_PROD_EXPR <VOP[1], 64, VOP[2]>;
8463 STAGE2 = DOT_PROD_EXPR <VOP[1], 64, STAGE1>;
8464 STAGE3 = DOT_PROD_EXPR <SUB_RES, -128, STAGE2>;
8466 on the basis that x * y == (x - 128) * y + 64 * y + 64 * y
8467 Doing the two 64 * y steps first allows more time to compute x. */
8468 tree stage1
= make_ssa_name (wide_vectype
);
8469 new_stmt
= gimple_build_assign (stage1
, DOT_PROD_EXPR
,
8470 vop
[1], half_narrow
, vop
[2]);
8471 vect_finish_stmt_generation (loop_vinfo
, stmt_info
, new_stmt
, gsi
);
8473 tree stage2
= make_ssa_name (wide_vectype
);
8474 new_stmt
= gimple_build_assign (stage2
, DOT_PROD_EXPR
,
8475 vop
[1], half_narrow
, stage1
);
8476 vect_finish_stmt_generation (loop_vinfo
, stmt_info
, new_stmt
, gsi
);
8478 tree stage3
= make_ssa_name (wide_vectype
);
8479 new_stmt
= gimple_build_assign (stage3
, DOT_PROD_EXPR
,
8480 sub_res
, vop
[1], stage2
);
8481 vect_finish_stmt_generation (loop_vinfo
, stmt_info
, new_stmt
, gsi
);
8483 /* Convert STAGE3 to the reduction type. */
8484 return gimple_build_assign (vec_dest
, CONVERT_EXPR
, stage3
);
8487 /* Transform the definition stmt STMT_INFO of a reduction PHI backedge
/* NOTE(review): the remainder of this header comment, and many interior
   source lines, are missing from this extraction (gaps in the embedded
   line numbering).  The comments below annotate only the visible code.  */
8491 vect_transform_reduction (loop_vec_info loop_vinfo
,
8492 stmt_vec_info stmt_info
, gimple_stmt_iterator
*gsi
,
8493 gimple
**vec_stmt
, slp_tree slp_node
)
8495 tree vectype_out
= STMT_VINFO_VECTYPE (stmt_info
);
8496 class loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
8501 stmt_vec_info reduc_info
= info_for_reduction (loop_vinfo
, stmt_info
);
8502 gcc_assert (reduc_info
->is_reduc_info
);
8504 if (nested_in_vect_loop_p (loop
, stmt_info
))
8507 gcc_assert (STMT_VINFO_DEF_TYPE (reduc_info
) == vect_double_reduction_def
);
8511 if (!gimple_extract_op (stmt_info
->stmt
, &op
))
8514 /* All uses but the last are expected to be defined in the loop.
8515 The last use is the reduction variable. In case of nested cycle this
8516 assumption is not true: we use reduc_index to record the index of the
8517 reduction variable. */
8518 stmt_vec_info phi_info
= STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info
));
8519 gphi
*reduc_def_phi
= as_a
<gphi
*> (phi_info
->stmt
);
8520 int reduc_index
= STMT_VINFO_REDUC_IDX (stmt_info
);
8521 tree vectype_in
= STMT_VINFO_REDUC_VECTYPE_IN (reduc_info
);
8526 vec_num
= SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node
);
8530 ncopies
= vect_get_num_copies (loop_vinfo
, vectype_in
);
/* Canonicalize the scalar reduction code and look up the conditional
   internal function used when the loop body has to be masked.  */
8534 code_helper code
= canonicalize_code (op
.code
, op
.type
);
8535 internal_fn cond_fn
= get_conditional_internal_fn (code
, op
.type
);
8537 vec_loop_masks
*masks
= &LOOP_VINFO_MASKS (loop_vinfo
);
8538 vec_loop_lens
*lens
= &LOOP_VINFO_LENS (loop_vinfo
);
8539 bool mask_by_cond_expr
= use_mask_by_cond_expr_p (code
, cond_fn
, vectype_in
);
8542 tree new_temp
= NULL_TREE
;
8543 auto_vec
<tree
> vec_oprnds0
;
8544 auto_vec
<tree
> vec_oprnds1
;
8545 auto_vec
<tree
> vec_oprnds2
;
8548 if (dump_enabled_p ())
8549 dump_printf_loc (MSG_NOTE
, vect_location
, "transform reduction.\n");
8551 /* FORNOW: Multiple types are not supported for condition. */
8552 if (code
== COND_EXPR
)
8553 gcc_assert (ncopies
== 1);
8555 /* A binary COND_OP reduction must have the same definition and else
/* A COND_OP here is an IFN_COND_* internal function created by
   if-conversion; detect it so the asserts below can check its shape.  */
8557 bool cond_fn_p
= code
.is_internal_fn ()
8558 && conditional_internal_fn_code (internal_fn (code
)) != ERROR_MARK
;
8561 gcc_assert (code
== IFN_COND_ADD
|| code
== IFN_COND_SUB
8562 || code
== IFN_COND_MUL
|| code
== IFN_COND_AND
8563 || code
== IFN_COND_IOR
|| code
== IFN_COND_XOR
);
8564 gcc_assert (op
.num_ops
== 4
8565 && (op
.ops
[reduc_index
]
8566 == op
.ops
[internal_fn_else_index ((internal_fn
) code
)]));
8569 bool masked_loop_p
= LOOP_VINFO_FULLY_MASKED_P (loop_vinfo
);
/* In-order (fold-left) reductions are vectorized entirely by a
   dedicated routine; dispatch to it and return its result.  */
8571 vect_reduction_type reduction_type
= STMT_VINFO_REDUC_TYPE (reduc_info
);
8572 if (reduction_type
== FOLD_LEFT_REDUCTION
)
8574 internal_fn reduc_fn
= STMT_VINFO_REDUC_FN (reduc_info
);
8575 gcc_assert (code
.is_tree_code () || cond_fn_p
);
8576 return vectorize_fold_left_reduction
8577 (loop_vinfo
, stmt_info
, gsi
, vec_stmt
, slp_node
, reduc_def_phi
,
8578 code
, reduc_fn
, op
.ops
, op
.num_ops
, vectype_in
,
8579 reduc_index
, masks
, lens
);
8582 bool single_defuse_cycle
= STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info
);
8583 gcc_assert (single_defuse_cycle
8584 || code
== DOT_PROD_EXPR
8585 || code
== WIDEN_SUM_EXPR
8586 || code
== SAD_EXPR
);
8588 /* Create the destination vector */
8589 tree scalar_dest
= gimple_get_lhs (stmt_info
->stmt
);
8590 tree vec_dest
= vect_create_destination_var (scalar_dest
, vectype_out
);
8592 /* Get NCOPIES vector definitions for all operands except the reduction
/* For a single def-use cycle the reduction operand's defs are not
   fetched here (NULL_TREE is passed in its slot) -- they come from the
   PHI and from each generated copy below.  */
8596 vect_get_vec_defs (loop_vinfo
, stmt_info
, slp_node
, ncopies
,
8597 single_defuse_cycle
&& reduc_index
== 0
8598 ? NULL_TREE
: op
.ops
[0], &vec_oprnds0
,
8599 single_defuse_cycle
&& reduc_index
== 1
8600 ? NULL_TREE
: op
.ops
[1], &vec_oprnds1
,
8602 && !(single_defuse_cycle
&& reduc_index
== 2)
8603 ? op
.ops
[2] : NULL_TREE
, &vec_oprnds2
);
8607 /* For a conditional operation pass the truth type as mask
8609 gcc_assert (single_defuse_cycle
8610 && (reduc_index
== 1 || reduc_index
== 2));
8611 vect_get_vec_defs (loop_vinfo
, stmt_info
, slp_node
, ncopies
,
8612 op
.ops
[0], truth_type_for (vectype_in
), &vec_oprnds0
,
8613 reduc_index
== 1 ? NULL_TREE
: op
.ops
[1],
8614 NULL_TREE
, &vec_oprnds1
,
8615 reduc_index
== 2 ? NULL_TREE
: op
.ops
[2],
8616 NULL_TREE
, &vec_oprnds2
);
8619 /* For single def-use cycles get one copy of the vectorized reduction
8621 if (single_defuse_cycle
)
8623 gcc_assert (!slp_node
);
8624 vect_get_vec_defs_for_operand (loop_vinfo
, stmt_info
, 1,
8625 op
.ops
[reduc_index
],
8626 reduc_index
== 0 ? &vec_oprnds0
8627 : (reduc_index
== 1 ? &vec_oprnds1
/* Emit one vector reduction statement per vector definition.  */
8631 bool emulated_mixed_dot_prod
8632 = vect_is_emulated_mixed_dot_prod (loop_vinfo
, stmt_info
);
8633 FOR_EACH_VEC_ELT (vec_oprnds0
, i
, def0
)
8636 tree vop
[3] = { def0
, vec_oprnds1
[i
], NULL_TREE
};
8637 if (masked_loop_p
&& !mask_by_cond_expr
)
8639 /* No conditional ifns have been defined for dot-product yet. */
8640 gcc_assert (code
!= DOT_PROD_EXPR
);
8642 /* Make sure that the reduction accumulator is vop[0]. */
8643 if (reduc_index
== 1)
8645 gcc_assert (commutative_binary_op_p (code
, op
.type
));
8646 std::swap (vop
[0], vop
[1]);
8648 tree mask
= vect_get_loop_mask (loop_vinfo
, gsi
, masks
,
8649 vec_num
* ncopies
, vectype_in
, i
);
/* Emit COND_FN (mask, vop[0], vop[1], vop[0]): inactive lanes keep
   the accumulator value.  */
8650 gcall
*call
= gimple_build_call_internal (cond_fn
, 4, mask
,
8651 vop
[0], vop
[1], vop
[0]);
8652 new_temp
= make_ssa_name (vec_dest
, call
);
8653 gimple_call_set_lhs (call
, new_temp
);
8654 gimple_call_set_nothrow (call
, true);
8655 vect_finish_stmt_generation (loop_vinfo
, stmt_info
, call
, gsi
);
8660 if (op
.num_ops
>= 3)
8661 vop
[2] = vec_oprnds2
[i
];
8663 if (masked_loop_p
&& mask_by_cond_expr
)
8665 tree mask
= vect_get_loop_mask (loop_vinfo
, gsi
, masks
,
8666 vec_num
* ncopies
, vectype_in
, i
);
8667 build_vect_cond_expr (code
, vop
, mask
, gsi
);
8670 if (emulated_mixed_dot_prod
)
8671 new_stmt
= vect_emulate_mixed_dot_prod (loop_vinfo
, stmt_info
, gsi
,
8674 else if (code
.is_internal_fn () && !cond_fn_p
)
8675 new_stmt
= gimple_build_call_internal (internal_fn (code
),
8677 vop
[0], vop
[1], vop
[2]);
8678 else if (code
.is_internal_fn () && cond_fn_p
)
8679 new_stmt
= gimple_build_call_internal (internal_fn (code
),
8681 vop
[0], vop
[1], vop
[2],
8684 new_stmt
= gimple_build_assign (vec_dest
, tree_code (op
.code
),
8685 vop
[0], vop
[1], vop
[2]);
8686 new_temp
= make_ssa_name (vec_dest
, new_stmt
);
8687 gimple_set_lhs (new_stmt
, new_temp
);
8688 vect_finish_stmt_generation (loop_vinfo
, stmt_info
, new_stmt
, gsi
);
8692 slp_node
->push_vec_def (new_stmt
);
/* In a single def-use cycle the result of this copy becomes the
   reduction operand of the next copy.  */
8693 else if (single_defuse_cycle
8696 if (reduc_index
== 0)
8697 vec_oprnds0
.safe_push (gimple_get_lhs (new_stmt
));
8698 else if (reduc_index
== 1)
8699 vec_oprnds1
.safe_push (gimple_get_lhs (new_stmt
));
8700 else if (reduc_index
== 2)
8701 vec_oprnds2
.safe_push (gimple_get_lhs (new_stmt
));
8704 STMT_VINFO_VEC_STMTS (stmt_info
).safe_push (new_stmt
);
8708 *vec_stmt
= STMT_VINFO_VEC_STMTS (stmt_info
)[0];
8713 /* Transform phase of a cycle PHI. */
/* NOTE(review): many interior source lines are missing from this
   extraction (gaps in the embedded line numbering); the comments below
   annotate only the visible code.  Visible behavior: compute the initial
   vector defs for the reduction PHI and create the reduction PHI nodes
   in the loop header (the latch argument is filled in later).  */
8716 vect_transform_cycle_phi (loop_vec_info loop_vinfo
,
8717 stmt_vec_info stmt_info
, gimple
**vec_stmt
,
8718 slp_tree slp_node
, slp_instance slp_node_instance
)
8720 tree vectype_out
= STMT_VINFO_VECTYPE (stmt_info
);
8721 class loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
8725 bool nested_cycle
= false;
8728 if (nested_in_vect_loop_p (loop
, stmt_info
))
8731 nested_cycle
= true;
8734 stmt_vec_info reduc_stmt_info
= STMT_VINFO_REDUC_DEF (stmt_info
);
8735 reduc_stmt_info
= vect_stmt_to_vectorize (reduc_stmt_info
);
8736 stmt_vec_info reduc_info
= info_for_reduction (loop_vinfo
, stmt_info
);
8737 gcc_assert (reduc_info
->is_reduc_info
);
8739 if (STMT_VINFO_REDUC_TYPE (reduc_info
) == EXTRACT_LAST_REDUCTION
8740 || STMT_VINFO_REDUC_TYPE (reduc_info
) == FOLD_LEFT_REDUCTION
)
8741 /* Leave the scalar phi in place. */
8744 tree vectype_in
= STMT_VINFO_REDUC_VECTYPE_IN (reduc_info
);
8745 /* For a nested cycle we do not fill the above. */
8747 vectype_in
= STMT_VINFO_VECTYPE (stmt_info
);
8748 gcc_assert (vectype_in
);
8752 /* The size vect_schedule_slp_instance computes is off for us. */
8753 vec_num
= vect_get_num_vectors (LOOP_VINFO_VECT_FACTOR (loop_vinfo
)
8754 * SLP_TREE_LANES (slp_node
), vectype_in
);
8760 ncopies
= vect_get_num_copies (loop_vinfo
, vectype_in
);
8763 /* Check whether we should use a single PHI node and accumulate
8764 vectors to one before the backedge. */
8765 if (STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info
))
8768 /* Create the destination vector */
8769 gphi
*phi
= as_a
<gphi
*> (stmt_info
->stmt
);
8770 tree vec_dest
= vect_create_destination_var (gimple_phi_result (phi
),
8773 /* Get the loop-entry arguments. */
8774 tree vec_initial_def
= NULL_TREE
;
8775 auto_vec
<tree
> vec_initial_defs
;
8778 vec_initial_defs
.reserve (vec_num
);
8781 unsigned phi_idx
= loop_preheader_edge (loop
)->dest_idx
;
8782 vect_get_slp_defs (SLP_TREE_CHILDREN (slp_node
)[phi_idx
],
8787 gcc_assert (slp_node
== slp_node_instance
->reduc_phis
);
8788 vec
<tree
> &initial_values
= reduc_info
->reduc_initial_values
;
8789 vec
<stmt_vec_info
> &stmts
= SLP_TREE_SCALAR_STMTS (slp_node
);
8791 unsigned int num_phis
= stmts
.length ();
8792 if (REDUC_GROUP_FIRST_ELEMENT (reduc_stmt_info
))
/* Collect the scalar initial value of each SLP reduction PHI.  */
8794 initial_values
.reserve (num_phis
)
;
8795 for (unsigned int i
= 0; i
< num_phis
; ++i
)
8797 gphi
*this_phi
= as_a
<gphi
*> (stmts
[i
]->stmt
);
8798 initial_values
.quick_push (vect_phi_initial_value (this_phi
));
8801 vect_find_reusable_accumulator (loop_vinfo
, reduc_info
);
8802 if (!initial_values
.is_empty ())
8805 = (num_phis
== 1 ? initial_values
[0] : NULL_TREE
);
8806 code_helper code
= STMT_VINFO_REDUC_CODE (reduc_info
);
8808 = neutral_op_for_reduction (TREE_TYPE (vectype_out
),
8809 code
, initial_value
);
8810 get_initial_defs_for_reduction (loop_vinfo
, reduc_info
,
8811 &vec_initial_defs
, vec_num
,
8812 stmts
.length (), neutral_op
);
8818 /* Get at the scalar def before the loop, that defines the initial
8819 value of the reduction variable. */
8820 tree initial_def
= vect_phi_initial_value (phi
);
8821 reduc_info
->reduc_initial_values
.safe_push (initial_def
);
8822 /* Optimize: if initial_def is for REDUC_MAX smaller than the base
8823 and we can't use zero for induc_val, use initial_def. Similarly
8824 for REDUC_MIN and initial_def larger than the base. */
8825 if (STMT_VINFO_REDUC_TYPE (reduc_info
) == INTEGER_INDUC_COND_REDUCTION
)
8827 tree induc_val
= STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info
);
8828 if (TREE_CODE (initial_def
) == INTEGER_CST
8829 && !integer_zerop (induc_val
)
8830 && ((STMT_VINFO_REDUC_CODE (reduc_info
) == MAX_EXPR
8831 && tree_int_cst_lt (initial_def
, induc_val
))
8832 || (STMT_VINFO_REDUC_CODE (reduc_info
) == MIN_EXPR
8833 && tree_int_cst_lt (induc_val
, initial_def
))))
8835 induc_val
= initial_def
;
8836 /* Communicate we used the initial_def to epilogue
8838 STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info
) = NULL_TREE
;
8840 vec_initial_def
= build_vector_from_val (vectype_out
, induc_val
);
8842 else if (nested_cycle
)
8844 /* Do not use an adjustment def as that case is not supported
8845 correctly if ncopies is not one. */
8846 vect_get_vec_defs_for_operand (loop_vinfo
, reduc_stmt_info
,
8847 ncopies
, initial_def
,
8850 else if (STMT_VINFO_REDUC_TYPE (reduc_info
) == CONST_COND_REDUCTION
8851 || STMT_VINFO_REDUC_TYPE (reduc_info
) == COND_REDUCTION
)
8852 /* Fill the initial vector with the initial scalar value. */
8854 = get_initial_def_for_reduction (loop_vinfo
, reduc_stmt_info
,
8855 initial_def
, initial_def
);
8859 vect_find_reusable_accumulator (loop_vinfo
, reduc_info
);
8860 if (!reduc_info
->reduc_initial_values
.is_empty ())
8862 initial_def
= reduc_info
->reduc_initial_values
[0];
8863 code_helper code
= STMT_VINFO_REDUC_CODE (reduc_info
);
8865 = neutral_op_for_reduction (TREE_TYPE (initial_def
),
8867 gcc_assert (neutral_op
);
8868 /* Try to simplify the vector initialization by applying an
8869 adjustment after the reduction has been performed. */
8870 if (!reduc_info
->reused_accumulator
8871 && STMT_VINFO_DEF_TYPE (stmt_info
) == vect_reduction_def
8872 && !operand_equal_p (neutral_op
, initial_def
))
8874 STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info
)
8876 initial_def
= neutral_op
;
8879 = get_initial_def_for_reduction (loop_vinfo
, reduc_info
,
8880 initial_def
, neutral_op
);
8885 if (vec_initial_def
)
8887 vec_initial_defs
.create (ncopies
);
8888 for (i
= 0; i
< ncopies
; ++i
)
8889 vec_initial_defs
.quick_push (vec_initial_def
);
/* When reusing an accumulator computed by a previous (main) loop,
   adapt it to this loop's vector type and mode before using it as the
   loop-entry value.  */
8892 if (auto *accumulator
= reduc_info
->reused_accumulator
)
8894 tree def
= accumulator
->reduc_input
;
8895 if (!useless_type_conversion_p (vectype_out
, TREE_TYPE (def
)))
8897 unsigned int nreduc
;
8898 bool res
= constant_multiple_p (TYPE_VECTOR_SUBPARTS
8900 TYPE_VECTOR_SUBPARTS (vectype_out
),
8903 gimple_seq stmts
= NULL
;
8904 /* Reduce the single vector to a smaller one. */
8907 /* Perform the reduction in the appropriate type. */
8908 tree rvectype
= vectype_out
;
8909 if (!useless_type_conversion_p (TREE_TYPE (vectype_out
),
8910 TREE_TYPE (TREE_TYPE (def
))))
8911 rvectype
= build_vector_type (TREE_TYPE (TREE_TYPE (def
)),
8912 TYPE_VECTOR_SUBPARTS
8914 def
= vect_create_partial_epilog (def
, rvectype
,
8915 STMT_VINFO_REDUC_CODE
8919 /* The epilogue loop might use a different vector mode, like
8921 if (TYPE_MODE (vectype_out
) != TYPE_MODE (TREE_TYPE (def
)))
8923 tree reduc_type
= build_vector_type_for_mode
8924 (TREE_TYPE (TREE_TYPE (def
)), TYPE_MODE (vectype_out
));
8925 def
= gimple_convert (&stmts
, reduc_type
, def
);
8927 /* Adjust the input so we pick up the partially reduced value
8928 for the skip edge in vect_create_epilog_for_reduction. */
8929 accumulator
->reduc_input
= def
;
8930 /* And the reduction could be carried out using a different sign. */
8931 if (!useless_type_conversion_p (vectype_out
, TREE_TYPE (def
)))
8932 def
= gimple_convert (&stmts
, vectype_out
, def
);
8933 if (loop_vinfo
->main_loop_edge
)
8935 /* While we'd like to insert on the edge this will split
8936 blocks and disturb bookkeeping, we also will eventually
8937 need this on the skip edge. Rely on sinking to
8938 fixup optimal placement and insert in the pred. */
8939 gimple_stmt_iterator gsi
8940 = gsi_last_bb (loop_vinfo
->main_loop_edge
->src
);
8941 /* Insert before a cond that eventually skips the
8943 if (!gsi_end_p (gsi
) && stmt_ends_bb_p (gsi_stmt (gsi
)))
8945 gsi_insert_seq_after (&gsi
, stmts
, GSI_CONTINUE_LINKING
);
8948 gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop
),
8951 if (loop_vinfo
->main_loop_edge
)
8953 = vect_get_main_loop_result (loop_vinfo
, def
,
8954 vec_initial_defs
[0]);
8956 vec_initial_defs
.safe_push (def
);
8959 /* Generate the reduction PHIs upfront. */
8960 for (i
= 0; i
< vec_num
; i
++)
8962 tree vec_init_def
= vec_initial_defs
[i
];
8963 for (j
= 0; j
< ncopies
; j
++)
8965 /* Create the reduction-phi that defines the reduction
8967 gphi
*new_phi
= create_phi_node (vec_dest
, loop
->header
);
8969 /* Set the loop-entry arg of the reduction-phi. */
8970 if (j
!= 0 && nested_cycle
)
8971 vec_init_def
= vec_initial_defs
[j
];
8972 add_phi_arg (new_phi
, vec_init_def
, loop_preheader_edge (loop
),
8975 /* The loop-latch arg is set in epilogue processing. */
8978 slp_node
->push_vec_def (new_phi
);
8982 *vec_stmt
= new_phi
;
8983 STMT_VINFO_VEC_STMTS (stmt_info
).safe_push (new_phi
);
8991 /* Vectorizes LC PHIs. */
8994 vectorizable_lc_phi (loop_vec_info loop_vinfo
,
8995 stmt_vec_info stmt_info
, gimple
**vec_stmt
,
8999 || !is_a
<gphi
*> (stmt_info
->stmt
)
9000 || gimple_phi_num_args (stmt_info
->stmt
) != 1)
9003 if (STMT_VINFO_DEF_TYPE (stmt_info
) != vect_internal_def
9004 && STMT_VINFO_DEF_TYPE (stmt_info
) != vect_double_reduction_def
)
9007 if (!vec_stmt
) /* transformation not required. */
9009 /* Deal with copies from externs or constants that disguise as
9010 loop-closed PHI nodes (PR97886). */
9012 && !vect_maybe_update_slp_op_vectype (SLP_TREE_CHILDREN (slp_node
)[0],
9013 SLP_TREE_VECTYPE (slp_node
)))
9015 if (dump_enabled_p ())
9016 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
9017 "incompatible vector types for invariants\n");
9020 STMT_VINFO_TYPE (stmt_info
) = lc_phi_info_type
;
9024 tree vectype
= STMT_VINFO_VECTYPE (stmt_info
);
9025 tree scalar_dest
= gimple_phi_result (stmt_info
->stmt
);
9026 basic_block bb
= gimple_bb (stmt_info
->stmt
);
9027 edge e
= single_pred_edge (bb
);
9028 tree vec_dest
= vect_create_destination_var (scalar_dest
, vectype
);
9029 auto_vec
<tree
> vec_oprnds
;
9030 vect_get_vec_defs (loop_vinfo
, stmt_info
, slp_node
,
9031 !slp_node
? vect_get_num_copies (loop_vinfo
, vectype
) : 1,
9032 gimple_phi_arg_def (stmt_info
->stmt
, 0), &vec_oprnds
);
9033 for (unsigned i
= 0; i
< vec_oprnds
.length (); i
++)
9035 /* Create the vectorized LC PHI node. */
9036 gphi
*new_phi
= create_phi_node (vec_dest
, bb
);
9037 add_phi_arg (new_phi
, vec_oprnds
[i
], e
, UNKNOWN_LOCATION
);
9039 slp_node
->push_vec_def (new_phi
);
9041 STMT_VINFO_VEC_STMTS (stmt_info
).safe_push (new_phi
);
9044 *vec_stmt
= STMT_VINFO_VEC_STMTS (stmt_info
)[0];
9049 /* Vectorizes PHIs. */
9052 vectorizable_phi (vec_info
*,
9053 stmt_vec_info stmt_info
, gimple
**vec_stmt
,
9054 slp_tree slp_node
, stmt_vector_for_cost
*cost_vec
)
9056 if (!is_a
<gphi
*> (stmt_info
->stmt
) || !slp_node
)
9059 if (STMT_VINFO_DEF_TYPE (stmt_info
) != vect_internal_def
)
9062 tree vectype
= SLP_TREE_VECTYPE (slp_node
);
9064 if (!vec_stmt
) /* transformation not required. */
9068 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node
), i
, child
)
9071 if (dump_enabled_p ())
9072 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
9073 "PHI node with unvectorized backedge def\n");
9076 else if (!vect_maybe_update_slp_op_vectype (child
, vectype
))
9078 if (dump_enabled_p ())
9079 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
9080 "incompatible vector types for invariants\n");
9083 else if (SLP_TREE_DEF_TYPE (child
) == vect_internal_def
9084 && !useless_type_conversion_p (vectype
,
9085 SLP_TREE_VECTYPE (child
)))
9087 /* With bools we can have mask and non-mask precision vectors
9088 or different non-mask precisions. while pattern recog is
9089 supposed to guarantee consistency here bugs in it can cause
9090 mismatches (PR103489 and PR103800 for example).
9091 Deal with them here instead of ICEing later. */
9092 if (dump_enabled_p ())
9093 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
9094 "incompatible vector type setup from "
9095 "bool pattern detection\n");
9099 /* For single-argument PHIs assume coalescing which means zero cost
9100 for the scalar and the vector PHIs. This avoids artificially
9101 favoring the vector path (but may pessimize it in some cases). */
9102 if (gimple_phi_num_args (as_a
<gphi
*> (stmt_info
->stmt
)) > 1)
9103 record_stmt_cost (cost_vec
, SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node
),
9104 vector_stmt
, stmt_info
, vectype
, 0, vect_body
);
9105 STMT_VINFO_TYPE (stmt_info
) = phi_info_type
;
9109 tree scalar_dest
= gimple_phi_result (stmt_info
->stmt
);
9110 basic_block bb
= gimple_bb (stmt_info
->stmt
);
9111 tree vec_dest
= vect_create_destination_var (scalar_dest
, vectype
);
9112 auto_vec
<gphi
*> new_phis
;
9113 for (unsigned i
= 0; i
< gimple_phi_num_args (stmt_info
->stmt
); ++i
)
9115 slp_tree child
= SLP_TREE_CHILDREN (slp_node
)[i
];
9117 /* Skip not yet vectorized defs. */
9118 if (SLP_TREE_DEF_TYPE (child
) == vect_internal_def
9119 && SLP_TREE_VEC_DEFS (child
).is_empty ())
9122 auto_vec
<tree
> vec_oprnds
;
9123 vect_get_slp_defs (SLP_TREE_CHILDREN (slp_node
)[i
], &vec_oprnds
);
9124 if (!new_phis
.exists ())
9126 new_phis
.create (vec_oprnds
.length ());
9127 for (unsigned j
= 0; j
< vec_oprnds
.length (); j
++)
9129 /* Create the vectorized LC PHI node. */
9130 new_phis
.quick_push (create_phi_node (vec_dest
, bb
));
9131 slp_node
->push_vec_def (new_phis
[j
]);
9134 edge e
= gimple_phi_arg_edge (as_a
<gphi
*> (stmt_info
->stmt
), i
);
9135 for (unsigned j
= 0; j
< vec_oprnds
.length (); j
++)
9136 add_phi_arg (new_phis
[j
], vec_oprnds
[j
], e
, UNKNOWN_LOCATION
);
9138 /* We should have at least one already vectorized child. */
9139 gcc_assert (new_phis
.exists ());
9144 /* Vectorizes first order recurrences. An overview of the transformation
9145 is described below. Suppose we have the following loop.
9148 for (int i = 0; i < n; ++i)
9154 There is a first-order recurrence on 'a'. For this loop, the scalar IR
9155 looks (simplified) like:
9161 i = PHI <0(scalar.preheader), i+1(scalar.body)>
9162 _2 = PHI <(init(scalar.preheader), <_1(scalar.body)>
9165 if (i < n) goto scalar.body
9167 In this example, _2 is a recurrence because it's value depends on the
9168 previous iteration. We vectorize this as (VF = 4)
9171 vect_init = vect_cst(..., ..., ..., 0)
9174 i = PHI <0(vector.preheader), i+4(vector.body)>
9175 vect_1 = PHI <vect_init(vector.preheader), v2(vector.body)>
9176 vect_2 = a[i, i+1, i+2, i+3];
9177 vect_3 = vec_perm (vect_1, vect_2, { 3, 4, 5, 6 })
9178 b[i, i+1, i+2, i+3] = vect_2 - vect_3
9179 if (..) goto vector.body
9181 In this function, vectorizable_recurr, we code generate both the
9182 vector PHI node and the permute since those together compute the
9183 vectorized value of the scalar PHI. We do not yet have the
9184 backedge value to fill in there nor into the vec_perm. Those
9185 are filled in maybe_set_vectorized_backedge_value and
9188 TODO: Since the scalar loop does not have a use of the recurrence
9189 outside of the loop the natural way to implement peeling via
9190 vectorizing the live value doesn't work. For now peeling of loops
9191 with a recurrence is not implemented. For SLP the supported cases
9192 are restricted to those requiring a single vector recurrence PHI. */
9195 vectorizable_recurr (loop_vec_info loop_vinfo
, stmt_vec_info stmt_info
,
9196 gimple
**vec_stmt
, slp_tree slp_node
,
9197 stmt_vector_for_cost
*cost_vec
)
9199 if (!loop_vinfo
|| !is_a
<gphi
*> (stmt_info
->stmt
))
9202 gphi
*phi
= as_a
<gphi
*> (stmt_info
->stmt
);
9204 /* So far we only support first-order recurrence auto-vectorization. */
9205 if (STMT_VINFO_DEF_TYPE (stmt_info
) != vect_first_order_recurrence
)
9208 tree vectype
= STMT_VINFO_VECTYPE (stmt_info
);
9211 ncopies
= SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node
);
9213 ncopies
= vect_get_num_copies (loop_vinfo
, vectype
);
9214 poly_int64 nunits
= TYPE_VECTOR_SUBPARTS (vectype
);
9215 unsigned dist
= slp_node
? SLP_TREE_LANES (slp_node
) : 1;
9216 /* We need to be able to make progress with a single vector. */
9217 if (maybe_gt (dist
* 2, nunits
))
9219 if (dump_enabled_p ())
9220 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
9221 "first order recurrence exceeds half of "
9226 /* First-order recurrence autovectorization needs to handle permutation
9227 with indices = [nunits-1, nunits, nunits+1, ...]. */
9228 vec_perm_builder
sel (nunits
, 1, 3);
9229 for (int i
= 0; i
< 3; ++i
)
9230 sel
.quick_push (nunits
- dist
+ i
);
9231 vec_perm_indices
indices (sel
, 2, nunits
);
9233 if (!vec_stmt
) /* transformation not required. */
9235 if (!can_vec_perm_const_p (TYPE_MODE (vectype
), TYPE_MODE (vectype
),
9241 /* We eventually need to set a vector type on invariant
9245 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node
), j
, child
)
9246 if (!vect_maybe_update_slp_op_vectype
9247 (child
, SLP_TREE_VECTYPE (slp_node
)))
9249 if (dump_enabled_p ())
9250 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
9251 "incompatible vector types for "
9256 /* The recurrence costs the initialization vector and one permute
9258 unsigned prologue_cost
= record_stmt_cost (cost_vec
, 1, scalar_to_vec
,
9259 stmt_info
, 0, vect_prologue
);
9260 unsigned inside_cost
= record_stmt_cost (cost_vec
, ncopies
, vector_stmt
,
9261 stmt_info
, 0, vect_body
);
9262 if (dump_enabled_p ())
9263 dump_printf_loc (MSG_NOTE
, vect_location
,
9264 "vectorizable_recurr: inside_cost = %d, "
9265 "prologue_cost = %d .\n", inside_cost
,
9268 STMT_VINFO_TYPE (stmt_info
) = recurr_info_type
;
9272 edge pe
= loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo
));
9273 basic_block bb
= gimple_bb (phi
);
9274 tree preheader
= PHI_ARG_DEF_FROM_EDGE (phi
, pe
);
9275 if (!useless_type_conversion_p (TREE_TYPE (vectype
), TREE_TYPE (preheader
)))
9277 gimple_seq stmts
= NULL
;
9278 preheader
= gimple_convert (&stmts
, TREE_TYPE (vectype
), preheader
);
9279 gsi_insert_seq_on_edge_immediate (pe
, stmts
);
9281 tree vec_init
= build_vector_from_val (vectype
, preheader
);
9282 vec_init
= vect_init_vector (loop_vinfo
, stmt_info
, vec_init
, vectype
, NULL
);
9284 /* Create the vectorized first-order PHI node. */
9285 tree vec_dest
= vect_get_new_vect_var (vectype
,
9286 vect_simple_var
, "vec_recur_");
9287 gphi
*new_phi
= create_phi_node (vec_dest
, bb
);
9288 add_phi_arg (new_phi
, vec_init
, pe
, UNKNOWN_LOCATION
);
9290 /* Insert shuffles the first-order recurrence autovectorization.
9291 result = VEC_PERM <vec_recur, vect_1, index[nunits-1, nunits, ...]>. */
9292 tree perm
= vect_gen_perm_mask_checked (vectype
, indices
);
9294 /* Insert the required permute after the latch definition. The
9295 second and later operands are tentative and will be updated when we have
9296 vectorized the latch definition. */
9297 edge le
= loop_latch_edge (LOOP_VINFO_LOOP (loop_vinfo
));
9298 gimple
*latch_def
= SSA_NAME_DEF_STMT (PHI_ARG_DEF_FROM_EDGE (phi
, le
));
9299 gimple_stmt_iterator gsi2
= gsi_for_stmt (latch_def
);
9302 for (unsigned i
= 0; i
< ncopies
; ++i
)
9304 vec_dest
= make_ssa_name (vectype
);
9306 = gimple_build_assign (vec_dest
, VEC_PERM_EXPR
,
9307 i
== 0 ? gimple_phi_result (new_phi
) : NULL
,
9309 vect_finish_stmt_generation (loop_vinfo
, stmt_info
, vperm
, &gsi2
);
9312 slp_node
->push_vec_def (vperm
);
9314 STMT_VINFO_VEC_STMTS (stmt_info
).safe_push (vperm
);
9318 *vec_stmt
= STMT_VINFO_VEC_STMTS (stmt_info
)[0];
9322 /* Return true if VECTYPE represents a vector that requires lowering
9323 by the vector lowering pass. */
9326 vect_emulated_vector_p (tree vectype
)
9328 return (!VECTOR_MODE_P (TYPE_MODE (vectype
))
9329 && (!VECTOR_BOOLEAN_TYPE_P (vectype
)
9330 || TYPE_PRECISION (TREE_TYPE (vectype
)) != 1));
9333 /* Return true if we can emulate CODE on an integer mode representation
9337 vect_can_vectorize_without_simd_p (tree_code code
)
9355 /* Likewise, but taking a code_helper. */
9358 vect_can_vectorize_without_simd_p (code_helper code
)
9360 return (code
.is_tree_code ()
9361 && vect_can_vectorize_without_simd_p (tree_code (code
)));
9364 /* Create vector init for vectorized iv. */
9366 vect_create_nonlinear_iv_init (gimple_seq
* stmts
, tree init_expr
,
9367 tree step_expr
, poly_uint64 nunits
,
9369 enum vect_induction_op_type induction_type
)
9371 unsigned HOST_WIDE_INT const_nunits
;
9372 tree vec_shift
, vec_init
, new_name
;
9374 tree itype
= TREE_TYPE (vectype
);
9376 /* iv_loop is the loop to be vectorized. Create:
9377 vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr). */
9378 new_name
= gimple_convert (stmts
, itype
, init_expr
);
9379 switch (induction_type
)
9381 case vect_step_op_shr
:
9382 case vect_step_op_shl
:
9383 /* Build the Initial value from shift_expr. */
9384 vec_init
= gimple_build_vector_from_val (stmts
,
9387 vec_shift
= gimple_build (stmts
, VEC_SERIES_EXPR
, vectype
,
9388 build_zero_cst (itype
), step_expr
);
9389 vec_init
= gimple_build (stmts
,
9390 (induction_type
== vect_step_op_shr
9391 ? RSHIFT_EXPR
: LSHIFT_EXPR
),
9392 vectype
, vec_init
, vec_shift
);
9395 case vect_step_op_neg
:
9397 vec_init
= gimple_build_vector_from_val (stmts
,
9400 tree vec_neg
= gimple_build (stmts
, NEGATE_EXPR
,
9402 /* The encoding has 2 interleaved stepped patterns. */
9403 vec_perm_builder
sel (nunits
, 2, 3);
9405 for (i
= 0; i
< 3; i
++)
9408 sel
[2 * i
+ 1] = i
+ nunits
;
9410 vec_perm_indices
indices (sel
, 2, nunits
);
9411 /* Don't use vect_gen_perm_mask_checked since can_vec_perm_const_p may
9412 fail when vec_init is const vector. In that situation vec_perm is not
9415 = vect_gen_perm_mask_any (vectype
, indices
);
9416 vec_init
= gimple_build (stmts
, VEC_PERM_EXPR
,
9423 case vect_step_op_mul
:
9425 /* Use unsigned mult to avoid UD integer overflow. */
9426 gcc_assert (nunits
.is_constant (&const_nunits
));
9427 tree utype
= unsigned_type_for (itype
);
9428 tree uvectype
= build_vector_type (utype
,
9429 TYPE_VECTOR_SUBPARTS (vectype
));
9430 new_name
= gimple_convert (stmts
, utype
, new_name
);
9431 vec_init
= gimple_build_vector_from_val (stmts
,
9434 tree_vector_builder
elts (uvectype
, const_nunits
, 1);
9435 tree elt_step
= build_one_cst (utype
);
9437 elts
.quick_push (elt_step
);
9438 for (i
= 1; i
< const_nunits
; i
++)
9440 /* Create: new_name_i = new_name + step_expr. */
9441 elt_step
= gimple_build (stmts
, MULT_EXPR
,
9442 utype
, elt_step
, step_expr
);
9443 elts
.quick_push (elt_step
);
9445 /* Create a vector from [new_name_0, new_name_1, ...,
9446 new_name_nunits-1]. */
9447 tree vec_mul
= gimple_build_vector (stmts
, &elts
);
9448 vec_init
= gimple_build (stmts
, MULT_EXPR
, uvectype
,
9450 vec_init
= gimple_convert (stmts
, vectype
, vec_init
);
9461 /* Peel init_expr by skip_niter for induction_type. */
9463 vect_peel_nonlinear_iv_init (gimple_seq
* stmts
, tree init_expr
,
9464 tree skip_niters
, tree step_expr
,
9465 enum vect_induction_op_type induction_type
)
9467 gcc_assert (TREE_CODE (skip_niters
) == INTEGER_CST
);
9468 tree type
= TREE_TYPE (init_expr
);
9469 unsigned prec
= TYPE_PRECISION (type
);
9470 switch (induction_type
)
9472 case vect_step_op_neg
:
9473 if (TREE_INT_CST_LOW (skip_niters
) % 2)
9474 init_expr
= gimple_build (stmts
, NEGATE_EXPR
, type
, init_expr
);
9475 /* else no change. */
9478 case vect_step_op_shr
:
9479 case vect_step_op_shl
:
9480 skip_niters
= gimple_convert (stmts
, type
, skip_niters
);
9481 step_expr
= gimple_build (stmts
, MULT_EXPR
, type
, step_expr
, skip_niters
);
9482 /* When shift mount >= precision, need to avoid UD.
9483 In the original loop, there's no UD, and according to semantic,
9484 init_expr should be 0 for lshr, ashl, and >>= (prec - 1) for ashr. */
9485 if (!tree_fits_uhwi_p (step_expr
)
9486 || tree_to_uhwi (step_expr
) >= prec
)
9488 if (induction_type
== vect_step_op_shl
9489 || TYPE_UNSIGNED (type
))
9490 init_expr
= build_zero_cst (type
);
9492 init_expr
= gimple_build (stmts
, RSHIFT_EXPR
, type
,
9494 wide_int_to_tree (type
, prec
- 1));
9497 init_expr
= gimple_build (stmts
, (induction_type
== vect_step_op_shr
9498 ? RSHIFT_EXPR
: LSHIFT_EXPR
),
9499 type
, init_expr
, step_expr
);
9502 case vect_step_op_mul
:
9504 tree utype
= unsigned_type_for (type
);
9505 init_expr
= gimple_convert (stmts
, utype
, init_expr
);
9506 wide_int skipn
= wi::to_wide (skip_niters
);
9507 wide_int begin
= wi::to_wide (step_expr
);
9508 auto_mpz base
, exp
, mod
, res
;
9509 wi::to_mpz (begin
, base
, TYPE_SIGN (type
));
9510 wi::to_mpz (skipn
, exp
, UNSIGNED
);
9511 mpz_ui_pow_ui (mod
, 2, TYPE_PRECISION (type
));
9512 mpz_powm (res
, base
, exp
, mod
);
9513 begin
= wi::from_mpz (type
, res
, TYPE_SIGN (type
));
9514 tree mult_expr
= wide_int_to_tree (utype
, begin
);
9515 init_expr
= gimple_build (stmts
, MULT_EXPR
, utype
,
9516 init_expr
, mult_expr
);
9517 init_expr
= gimple_convert (stmts
, type
, init_expr
);
9528 /* Create vector step for vectorized iv. */
9530 vect_create_nonlinear_iv_step (gimple_seq
* stmts
, tree step_expr
,
9532 enum vect_induction_op_type induction_type
)
9534 tree expr
= build_int_cst (TREE_TYPE (step_expr
), vf
);
9535 tree new_name
= NULL
;
9536 /* Step should be pow (step, vf) for mult induction. */
9537 if (induction_type
== vect_step_op_mul
)
9539 gcc_assert (vf
.is_constant ());
9540 wide_int begin
= wi::to_wide (step_expr
);
9542 for (unsigned i
= 0; i
!= vf
.to_constant () - 1; i
++)
9543 begin
= wi::mul (begin
, wi::to_wide (step_expr
));
9545 new_name
= wide_int_to_tree (TREE_TYPE (step_expr
), begin
);
9547 else if (induction_type
== vect_step_op_neg
)
9551 new_name
= gimple_build (stmts
, MULT_EXPR
, TREE_TYPE (step_expr
),
9557 vect_create_nonlinear_iv_vec_step (loop_vec_info loop_vinfo
,
9558 stmt_vec_info stmt_info
,
9559 tree new_name
, tree vectype
,
9560 enum vect_induction_op_type induction_type
)
9562 /* No step is needed for neg induction. */
9563 if (induction_type
== vect_step_op_neg
)
9566 tree t
= unshare_expr (new_name
);
9567 gcc_assert (CONSTANT_CLASS_P (new_name
)
9568 || TREE_CODE (new_name
) == SSA_NAME
);
9569 tree new_vec
= build_vector_from_val (vectype
, t
);
9570 tree vec_step
= vect_init_vector (loop_vinfo
, stmt_info
,
9571 new_vec
, vectype
, NULL
);
9575 /* Update vectorized iv with vect_step, induc_def is init. */
9577 vect_update_nonlinear_iv (gimple_seq
* stmts
, tree vectype
,
9578 tree induc_def
, tree vec_step
,
9579 enum vect_induction_op_type induction_type
)
9581 tree vec_def
= induc_def
;
9582 switch (induction_type
)
9584 case vect_step_op_mul
:
9586 /* Use unsigned mult to avoid UD integer overflow. */
9588 = build_vector_type (unsigned_type_for (TREE_TYPE (vectype
)),
9589 TYPE_VECTOR_SUBPARTS (vectype
));
9590 vec_def
= gimple_convert (stmts
, uvectype
, vec_def
);
9591 vec_step
= gimple_convert (stmts
, uvectype
, vec_step
);
9592 vec_def
= gimple_build (stmts
, MULT_EXPR
, uvectype
,
9594 vec_def
= gimple_convert (stmts
, vectype
, vec_def
);
9598 case vect_step_op_shr
:
9599 vec_def
= gimple_build (stmts
, RSHIFT_EXPR
, vectype
,
9603 case vect_step_op_shl
:
9604 vec_def
= gimple_build (stmts
, LSHIFT_EXPR
, vectype
,
9607 case vect_step_op_neg
:
9608 vec_def
= induc_def
;
9619 /* Function vectorizable_induction
9621 Check if STMT_INFO performs an nonlinear induction computation that can be
9622 vectorized. If VEC_STMT is also passed, vectorize the induction PHI: create
9623 a vectorized phi to replace it, put it in VEC_STMT, and add it to the same
9625 Return true if STMT_INFO is vectorizable in this way. */
9628 vectorizable_nonlinear_induction (loop_vec_info loop_vinfo
,
9629 stmt_vec_info stmt_info
,
9630 gimple
**vec_stmt
, slp_tree slp_node
,
9631 stmt_vector_for_cost
*cost_vec
)
9633 class loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
9635 bool nested_in_vect_loop
= false;
9636 class loop
*iv_loop
;
9638 edge pe
= loop_preheader_edge (loop
);
9640 tree vec_init
, vec_step
;
9643 gphi
*induction_phi
;
9644 tree induc_def
, vec_dest
;
9645 tree init_expr
, step_expr
;
9647 poly_uint64 vf
= LOOP_VINFO_VECT_FACTOR (loop_vinfo
);
9649 gimple_stmt_iterator si
;
9651 gphi
*phi
= dyn_cast
<gphi
*> (stmt_info
->stmt
);
9653 tree vectype
= STMT_VINFO_VECTYPE (stmt_info
);
9654 poly_uint64 nunits
= TYPE_VECTOR_SUBPARTS (vectype
);
9655 enum vect_induction_op_type induction_type
9656 = STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info
);
9658 gcc_assert (induction_type
> vect_step_op_add
);
9663 ncopies
= vect_get_num_copies (loop_vinfo
, vectype
);
9664 gcc_assert (ncopies
>= 1);
9666 /* FORNOW. Only handle nonlinear induction in the same loop. */
9667 if (nested_in_vect_loop_p (loop
, stmt_info
))
9669 if (dump_enabled_p ())
9670 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
9671 "nonlinear induction in nested loop.\n");
9676 gcc_assert (iv_loop
== (gimple_bb (phi
))->loop_father
);
9678 /* TODO: Support slp for nonlinear iv. There should be separate vector iv
9679 update for each iv and a permutation to generate wanted vector iv. */
9682 if (dump_enabled_p ())
9683 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
9684 "SLP induction not supported for nonlinear"
9689 if (!INTEGRAL_TYPE_P (TREE_TYPE (vectype
)))
9691 if (dump_enabled_p ())
9692 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
9693 "floating point nonlinear induction vectorization"
9694 " not supported.\n");
9698 step_expr
= STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info
);
9699 init_expr
= vect_phi_initial_value (phi
);
9700 gcc_assert (step_expr
!= NULL_TREE
&& init_expr
!= NULL
9701 && TREE_CODE (step_expr
) == INTEGER_CST
);
9702 /* step_expr should be aligned with init_expr,
9703 .i.e. uint64 a >> 1, step is int, but vector<uint64> shift is used. */
9704 step_expr
= fold_convert (TREE_TYPE (vectype
), step_expr
);
9706 if (TREE_CODE (init_expr
) == INTEGER_CST
)
9707 init_expr
= fold_convert (TREE_TYPE (vectype
), init_expr
);
9708 else if (!tree_nop_conversion_p (TREE_TYPE (vectype
), TREE_TYPE (init_expr
)))
9710 /* INIT_EXPR could be a bit_field, bail out for such case. */
9711 if (dump_enabled_p ())
9712 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
9713 "nonlinear induction vectorization failed:"
9714 " component type of vectype is not a nop conversion"
9715 " from type of init_expr.\n");
9719 switch (induction_type
)
9721 case vect_step_op_neg
:
9722 if (TREE_CODE (init_expr
) != INTEGER_CST
9723 && TREE_CODE (init_expr
) != REAL_CST
)
9725 /* Check for backend support of NEGATE_EXPR and vec_perm. */
9726 if (!directly_supported_p (NEGATE_EXPR
, vectype
))
9729 /* The encoding has 2 interleaved stepped patterns. */
9730 vec_perm_builder
sel (nunits
, 2, 3);
9731 machine_mode mode
= TYPE_MODE (vectype
);
9733 for (i
= 0; i
< 3; i
++)
9736 sel
[i
* 2 + 1] = i
+ nunits
;
9738 vec_perm_indices
indices (sel
, 2, nunits
);
9739 if (!can_vec_perm_const_p (mode
, mode
, indices
))
9744 case vect_step_op_mul
:
9746 /* Check for backend support of MULT_EXPR. */
9747 if (!directly_supported_p (MULT_EXPR
, vectype
))
9750 /* ?? How to construct vector step for variable number vector.
9751 [ 1, step, pow (step, 2), pow (step, 4), .. ]. */
9752 if (!vf
.is_constant ())
9757 case vect_step_op_shr
:
9758 /* Check for backend support of RSHIFT_EXPR. */
9759 if (!directly_supported_p (RSHIFT_EXPR
, vectype
, optab_vector
))
9762 /* Don't shift more than type precision to avoid UD. */
9763 if (!tree_fits_uhwi_p (step_expr
)
9764 || maybe_ge (nunits
* tree_to_uhwi (step_expr
),
9765 TYPE_PRECISION (TREE_TYPE (init_expr
))))
9769 case vect_step_op_shl
:
9770 /* Check for backend support of RSHIFT_EXPR. */
9771 if (!directly_supported_p (LSHIFT_EXPR
, vectype
, optab_vector
))
9774 /* Don't shift more than type precision to avoid UD. */
9775 if (!tree_fits_uhwi_p (step_expr
)
9776 || maybe_ge (nunits
* tree_to_uhwi (step_expr
),
9777 TYPE_PRECISION (TREE_TYPE (init_expr
))))
9786 if (!vec_stmt
) /* transformation not required. */
9788 unsigned inside_cost
= 0, prologue_cost
= 0;
9789 /* loop cost for vec_loop. Neg induction doesn't have any
9791 inside_cost
= record_stmt_cost (cost_vec
, ncopies
, vector_stmt
,
9792 stmt_info
, 0, vect_body
);
9794 /* loop cost for vec_loop. Neg induction doesn't have any
9796 if (induction_type
== vect_step_op_neg
)
9799 /* prologue cost for vec_init and vec_step. */
9800 prologue_cost
= record_stmt_cost (cost_vec
, 2, scalar_to_vec
,
9801 stmt_info
, 0, vect_prologue
);
9803 if (dump_enabled_p ())
9804 dump_printf_loc (MSG_NOTE
, vect_location
,
9805 "vect_model_induction_cost: inside_cost = %d, "
9806 "prologue_cost = %d. \n", inside_cost
,
9809 STMT_VINFO_TYPE (stmt_info
) = induc_vec_info_type
;
9810 DUMP_VECT_SCOPE ("vectorizable_nonlinear_induction");
9816 /* Compute a vector variable, initialized with the first VF values of
9817 the induction variable. E.g., for an iv with IV_PHI='X' and
9818 evolution S, for a vector of 4 units, we want to compute:
9819 [X, X + S, X + 2*S, X + 3*S]. */
9821 if (dump_enabled_p ())
9822 dump_printf_loc (MSG_NOTE
, vect_location
, "transform induction phi.\n");
9824 pe
= loop_preheader_edge (iv_loop
);
9825 /* Find the first insertion point in the BB. */
9826 basic_block bb
= gimple_bb (phi
);
9827 si
= gsi_after_labels (bb
);
9829 gimple_seq stmts
= NULL
;
9831 niters_skip
= LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo
);
9832 /* If we are using the loop mask to "peel" for alignment then we need
9833 to adjust the start value here. */
9834 if (niters_skip
!= NULL_TREE
)
9835 init_expr
= vect_peel_nonlinear_iv_init (&stmts
, init_expr
, niters_skip
,
9836 step_expr
, induction_type
);
9838 vec_init
= vect_create_nonlinear_iv_init (&stmts
, init_expr
,
9839 step_expr
, nunits
, vectype
,
9843 new_bb
= gsi_insert_seq_on_edge_immediate (pe
, stmts
);
9844 gcc_assert (!new_bb
);
9848 new_name
= vect_create_nonlinear_iv_step (&stmts
, step_expr
,
9849 vf
, induction_type
);
9852 new_bb
= gsi_insert_seq_on_edge_immediate (pe
, stmts
);
9853 gcc_assert (!new_bb
);
9856 vec_step
= vect_create_nonlinear_iv_vec_step (loop_vinfo
, stmt_info
,
9859 /* Create the following def-use cycle:
9864 vec_iv = PHI <vec_init, vec_loop>
9868 vec_loop = vec_iv + vec_step; */
9870 /* Create the induction-phi that defines the induction-operand. */
9871 vec_dest
= vect_get_new_vect_var (vectype
, vect_simple_var
, "vec_iv_");
9872 induction_phi
= create_phi_node (vec_dest
, iv_loop
->header
);
9873 induc_def
= PHI_RESULT (induction_phi
);
9875 /* Create the iv update inside the loop. */
9877 vec_def
= vect_update_nonlinear_iv (&stmts
, vectype
,
9878 induc_def
, vec_step
,
9881 gsi_insert_seq_before (&si
, stmts
, GSI_SAME_STMT
);
9882 new_stmt
= SSA_NAME_DEF_STMT (vec_def
);
9884 /* Set the arguments of the phi node: */
9885 add_phi_arg (induction_phi
, vec_init
, pe
, UNKNOWN_LOCATION
);
9886 add_phi_arg (induction_phi
, vec_def
, loop_latch_edge (iv_loop
),
9889 STMT_VINFO_VEC_STMTS (stmt_info
).safe_push (induction_phi
);
9890 *vec_stmt
= induction_phi
;
9892 /* In case that vectorization factor (VF) is bigger than the number
9893 of elements that we can fit in a vectype (nunits), we have to generate
9894 more than one vector stmt - i.e - we need to "unroll" the
9895 vector stmt by a factor VF/nunits. For more details see documentation
9896 in vectorizable_operation. */
9901 /* FORNOW. This restriction should be relaxed. */
9902 gcc_assert (!nested_in_vect_loop
);
9904 new_name
= vect_create_nonlinear_iv_step (&stmts
, step_expr
,
9905 nunits
, induction_type
);
9907 vec_step
= vect_create_nonlinear_iv_vec_step (loop_vinfo
, stmt_info
,
9910 vec_def
= induc_def
;
9911 for (i
= 1; i
< ncopies
; i
++)
9913 /* vec_i = vec_prev + vec_step. */
9915 vec_def
= vect_update_nonlinear_iv (&stmts
, vectype
,
9918 gsi_insert_seq_before (&si
, stmts
, GSI_SAME_STMT
);
9919 new_stmt
= SSA_NAME_DEF_STMT (vec_def
);
9920 STMT_VINFO_VEC_STMTS (stmt_info
).safe_push (new_stmt
);
9924 if (dump_enabled_p ())
9925 dump_printf_loc (MSG_NOTE
, vect_location
,
9926 "transform induction: created def-use cycle: %G%G",
9927 (gimple
*) induction_phi
, SSA_NAME_DEF_STMT (vec_def
));
9932 /* Function vectorizable_induction
9934 Check if STMT_INFO performs an induction computation that can be vectorized.
9935 If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
9936 phi to replace it, put it in VEC_STMT, and add it to the same basic block.
9937 Return true if STMT_INFO is vectorizable in this way. */
9940 vectorizable_induction (loop_vec_info loop_vinfo
,
9941 stmt_vec_info stmt_info
,
9942 gimple
**vec_stmt
, slp_tree slp_node
,
9943 stmt_vector_for_cost
*cost_vec
)
9945 class loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
9947 bool nested_in_vect_loop
= false;
9948 class loop
*iv_loop
;
9950 edge pe
= loop_preheader_edge (loop
);
9952 tree new_vec
, vec_init
, vec_step
, t
;
9955 gphi
*induction_phi
;
9956 tree induc_def
, vec_dest
;
9957 tree init_expr
, step_expr
;
9958 poly_uint64 vf
= LOOP_VINFO_VECT_FACTOR (loop_vinfo
);
9961 gimple_stmt_iterator si
;
9962 enum vect_induction_op_type induction_type
9963 = STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info
);
9965 gphi
*phi
= dyn_cast
<gphi
*> (stmt_info
->stmt
);
9969 if (!STMT_VINFO_RELEVANT_P (stmt_info
))
9972 /* Make sure it was recognized as induction computation. */
9973 if (STMT_VINFO_DEF_TYPE (stmt_info
) != vect_induction_def
)
9976 /* Handle nonlinear induction in a separate place. */
9977 if (induction_type
!= vect_step_op_add
)
9978 return vectorizable_nonlinear_induction (loop_vinfo
, stmt_info
,
9979 vec_stmt
, slp_node
, cost_vec
);
9981 tree vectype
= STMT_VINFO_VECTYPE (stmt_info
);
9982 poly_uint64 nunits
= TYPE_VECTOR_SUBPARTS (vectype
);
9987 ncopies
= vect_get_num_copies (loop_vinfo
, vectype
);
9988 gcc_assert (ncopies
>= 1);
9990 /* FORNOW. These restrictions should be relaxed. */
9991 if (nested_in_vect_loop_p (loop
, stmt_info
))
9993 imm_use_iterator imm_iter
;
9994 use_operand_p use_p
;
10001 if (dump_enabled_p ())
10002 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
10003 "multiple types in nested loop.\n");
10008 latch_e
= loop_latch_edge (loop
->inner
);
10009 loop_arg
= PHI_ARG_DEF_FROM_EDGE (phi
, latch_e
);
10010 FOR_EACH_IMM_USE_FAST (use_p
, imm_iter
, loop_arg
)
10012 gimple
*use_stmt
= USE_STMT (use_p
);
10013 if (is_gimple_debug (use_stmt
))
10016 if (!flow_bb_inside_loop_p (loop
->inner
, gimple_bb (use_stmt
)))
10018 exit_phi
= use_stmt
;
10024 stmt_vec_info exit_phi_vinfo
= loop_vinfo
->lookup_stmt (exit_phi
);
10025 if (!(STMT_VINFO_RELEVANT_P (exit_phi_vinfo
)
10026 && !STMT_VINFO_LIVE_P (exit_phi_vinfo
)))
10028 if (dump_enabled_p ())
10029 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
10030 "inner-loop induction only used outside "
10031 "of the outer vectorized loop.\n");
10036 nested_in_vect_loop
= true;
10037 iv_loop
= loop
->inner
;
10041 gcc_assert (iv_loop
== (gimple_bb (phi
))->loop_father
);
10043 if (slp_node
&& !nunits
.is_constant ())
10045 /* The current SLP code creates the step value element-by-element. */
10046 if (dump_enabled_p ())
10047 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
10048 "SLP induction not supported for variable-length"
10053 if (FLOAT_TYPE_P (vectype
) && !param_vect_induction_float
)
10055 if (dump_enabled_p ())
10056 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
10057 "floating point induction vectorization disabled\n");
10061 step_expr
= STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info
);
10062 gcc_assert (step_expr
!= NULL_TREE
);
10063 if (INTEGRAL_TYPE_P (TREE_TYPE (step_expr
))
10064 && !type_has_mode_precision_p (TREE_TYPE (step_expr
)))
10066 if (dump_enabled_p ())
10067 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
10068 "bit-precision induction vectorization not "
10072 tree step_vectype
= get_same_sized_vectype (TREE_TYPE (step_expr
), vectype
);
10074 /* Check for backend support of PLUS/MINUS_EXPR. */
10075 if (!directly_supported_p (PLUS_EXPR
, step_vectype
)
10076 || !directly_supported_p (MINUS_EXPR
, step_vectype
))
10079 if (!vec_stmt
) /* transformation not required. */
10081 unsigned inside_cost
= 0, prologue_cost
= 0;
10084 /* We eventually need to set a vector type on invariant
10088 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node
), j
, child
)
10089 if (!vect_maybe_update_slp_op_vectype
10090 (child
, SLP_TREE_VECTYPE (slp_node
)))
10092 if (dump_enabled_p ())
10093 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
10094 "incompatible vector types for "
10098 /* loop cost for vec_loop. */
10100 = record_stmt_cost (cost_vec
,
10101 SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node
),
10102 vector_stmt
, stmt_info
, 0, vect_body
);
10103 /* prologue cost for vec_init (if not nested) and step. */
10104 prologue_cost
= record_stmt_cost (cost_vec
, 1 + !nested_in_vect_loop
,
10106 stmt_info
, 0, vect_prologue
);
10108 else /* if (!slp_node) */
10110 /* loop cost for vec_loop. */
10111 inside_cost
= record_stmt_cost (cost_vec
, ncopies
, vector_stmt
,
10112 stmt_info
, 0, vect_body
);
10113 /* prologue cost for vec_init and vec_step. */
10114 prologue_cost
= record_stmt_cost (cost_vec
, 2, scalar_to_vec
,
10115 stmt_info
, 0, vect_prologue
);
10117 if (dump_enabled_p ())
10118 dump_printf_loc (MSG_NOTE
, vect_location
,
10119 "vect_model_induction_cost: inside_cost = %d, "
10120 "prologue_cost = %d .\n", inside_cost
,
10123 STMT_VINFO_TYPE (stmt_info
) = induc_vec_info_type
;
10124 DUMP_VECT_SCOPE ("vectorizable_induction");
10130 /* Compute a vector variable, initialized with the first VF values of
10131 the induction variable. E.g., for an iv with IV_PHI='X' and
10132 evolution S, for a vector of 4 units, we want to compute:
10133 [X, X + S, X + 2*S, X + 3*S]. */
10135 if (dump_enabled_p ())
10136 dump_printf_loc (MSG_NOTE
, vect_location
, "transform induction phi.\n");
10138 pe
= loop_preheader_edge (iv_loop
);
10139 /* Find the first insertion point in the BB. */
10140 basic_block bb
= gimple_bb (phi
);
10141 si
= gsi_after_labels (bb
);
10143 /* For SLP induction we have to generate several IVs as for example
10144 with group size 3 we need
10145 [i0, i1, i2, i0 + S0] [i1 + S1, i2 + S2, i0 + 2*S0, i1 + 2*S1]
10146 [i2 + 2*S2, i0 + 3*S0, i1 + 3*S1, i2 + 3*S2]. */
10149 /* Enforced above. */
10150 unsigned int const_nunits
= nunits
.to_constant ();
10152 /* The initial values are vectorized, but any lanes > group_size
10153 need adjustment. */
10155 = SLP_TREE_CHILDREN (slp_node
)[pe
->dest_idx
];
10157 /* Gather steps. Since we do not vectorize inductions as
10158 cycles we have to reconstruct the step from SCEV data. */
10159 unsigned group_size
= SLP_TREE_LANES (slp_node
);
10160 tree
*steps
= XALLOCAVEC (tree
, group_size
);
10161 tree
*inits
= XALLOCAVEC (tree
, group_size
);
10162 stmt_vec_info phi_info
;
10163 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (slp_node
), i
, phi_info
)
10165 steps
[i
] = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (phi_info
);
10167 inits
[i
] = gimple_phi_arg_def (as_a
<gphi
*> (phi_info
->stmt
),
10171 /* Now generate the IVs. */
10172 unsigned nvects
= SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node
);
10173 gcc_assert ((const_nunits
* nvects
) % group_size
== 0);
10175 if (nested_in_vect_loop
)
10179 /* Compute the number of distinct IVs we need. First reduce
10180 group_size if it is a multiple of const_nunits so we get
10181 one IV for a group_size of 4 but const_nunits 2. */
10182 unsigned group_sizep
= group_size
;
10183 if (group_sizep
% const_nunits
== 0)
10184 group_sizep
= group_sizep
/ const_nunits
;
10185 nivs
= least_common_multiple (group_sizep
,
10186 const_nunits
) / const_nunits
;
10188 tree stept
= TREE_TYPE (step_vectype
);
10189 tree lupdate_mul
= NULL_TREE
;
10190 if (!nested_in_vect_loop
)
10192 /* The number of iterations covered in one vector iteration. */
10193 unsigned lup_mul
= (nvects
* const_nunits
) / group_size
;
10195 = build_vector_from_val (step_vectype
,
10196 SCALAR_FLOAT_TYPE_P (stept
)
10197 ? build_real_from_wide (stept
, lup_mul
,
10199 : build_int_cstu (stept
, lup_mul
));
10201 tree peel_mul
= NULL_TREE
;
10202 gimple_seq init_stmts
= NULL
;
10203 if (LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo
))
10205 if (SCALAR_FLOAT_TYPE_P (stept
))
10206 peel_mul
= gimple_build (&init_stmts
, FLOAT_EXPR
, stept
,
10207 LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo
));
10209 peel_mul
= gimple_convert (&init_stmts
, stept
,
10210 LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo
));
10211 peel_mul
= gimple_build_vector_from_val (&init_stmts
,
10212 step_vectype
, peel_mul
);
10215 auto_vec
<tree
> vec_steps
;
10216 for (ivn
= 0; ivn
< nivs
; ++ivn
)
10218 tree_vector_builder
step_elts (step_vectype
, const_nunits
, 1);
10219 tree_vector_builder
init_elts (vectype
, const_nunits
, 1);
10220 tree_vector_builder
mul_elts (step_vectype
, const_nunits
, 1);
10221 for (unsigned eltn
= 0; eltn
< const_nunits
; ++eltn
)
10223 /* The scalar steps of the IVs. */
10224 tree elt
= steps
[(ivn
*const_nunits
+ eltn
) % group_size
];
10225 elt
= gimple_convert (&init_stmts
, TREE_TYPE (step_vectype
), elt
);
10226 step_elts
.quick_push (elt
);
10229 /* The scalar inits of the IVs if not vectorized. */
10230 elt
= inits
[(ivn
*const_nunits
+ eltn
) % group_size
];
10231 if (!useless_type_conversion_p (TREE_TYPE (vectype
),
10233 elt
= gimple_build (&init_stmts
, VIEW_CONVERT_EXPR
,
10234 TREE_TYPE (vectype
), elt
);
10235 init_elts
.quick_push (elt
);
10237 /* The number of steps to add to the initial values. */
10238 unsigned mul_elt
= (ivn
*const_nunits
+ eltn
) / group_size
;
10239 mul_elts
.quick_push (SCALAR_FLOAT_TYPE_P (stept
)
10240 ? build_real_from_wide (stept
,
10242 : build_int_cstu (stept
, mul_elt
));
10244 vec_step
= gimple_build_vector (&init_stmts
, &step_elts
);
10245 vec_steps
.safe_push (vec_step
);
10246 tree step_mul
= gimple_build_vector (&init_stmts
, &mul_elts
);
10248 step_mul
= gimple_build (&init_stmts
, PLUS_EXPR
, step_vectype
,
10249 step_mul
, peel_mul
);
10251 vec_init
= gimple_build_vector (&init_stmts
, &init_elts
);
10253 /* Create the induction-phi that defines the induction-operand. */
10254 vec_dest
= vect_get_new_vect_var (vectype
, vect_simple_var
,
10256 induction_phi
= create_phi_node (vec_dest
, iv_loop
->header
);
10257 induc_def
= PHI_RESULT (induction_phi
);
10259 /* Create the iv update inside the loop */
10260 tree up
= vec_step
;
10262 up
= gimple_build (&init_stmts
, MULT_EXPR
, step_vectype
,
10263 vec_step
, lupdate_mul
);
10264 gimple_seq stmts
= NULL
;
10265 vec_def
= gimple_convert (&stmts
, step_vectype
, induc_def
);
10266 vec_def
= gimple_build (&stmts
,
10267 PLUS_EXPR
, step_vectype
, vec_def
, up
);
10268 vec_def
= gimple_convert (&stmts
, vectype
, vec_def
);
10269 gsi_insert_seq_before (&si
, stmts
, GSI_SAME_STMT
);
10270 add_phi_arg (induction_phi
, vec_def
, loop_latch_edge (iv_loop
),
10274 vec_init
= vect_get_slp_vect_def (init_node
, ivn
);
10275 if (!nested_in_vect_loop
10276 && !integer_zerop (step_mul
))
10278 vec_def
= gimple_convert (&init_stmts
, step_vectype
, vec_init
);
10279 up
= gimple_build (&init_stmts
, MULT_EXPR
, step_vectype
,
10280 vec_step
, step_mul
);
10281 vec_def
= gimple_build (&init_stmts
, PLUS_EXPR
, step_vectype
,
10283 vec_init
= gimple_convert (&init_stmts
, vectype
, vec_def
);
10286 /* Set the arguments of the phi node: */
10287 add_phi_arg (induction_phi
, vec_init
, pe
, UNKNOWN_LOCATION
);
10289 slp_node
->push_vec_def (induction_phi
);
10291 if (!nested_in_vect_loop
)
10293 /* Fill up to the number of vectors we need for the whole group. */
10294 nivs
= least_common_multiple (group_size
,
10295 const_nunits
) / const_nunits
;
10296 vec_steps
.reserve (nivs
-ivn
);
10297 for (; ivn
< nivs
; ++ivn
)
10299 slp_node
->push_vec_def (SLP_TREE_VEC_DEFS (slp_node
)[0]);
10300 vec_steps
.quick_push (vec_steps
[0]);
10304 /* Re-use IVs when we can. We are generating further vector
10305 stmts by adding VF' * stride to the IVs generated above. */
10309 = least_common_multiple (group_size
, const_nunits
) / group_size
;
10311 = build_vector_from_val (step_vectype
,
10312 SCALAR_FLOAT_TYPE_P (stept
)
10313 ? build_real_from_wide (stept
,
10315 : build_int_cstu (stept
, vfp
));
10316 for (; ivn
< nvects
; ++ivn
)
10319 = SSA_NAME_DEF_STMT (SLP_TREE_VEC_DEFS (slp_node
)[ivn
- nivs
]);
10320 tree def
= gimple_get_lhs (iv
);
10322 vec_steps
[ivn
- nivs
]
10323 = gimple_build (&init_stmts
, MULT_EXPR
, step_vectype
,
10324 vec_steps
[ivn
- nivs
], lupdate_mul
);
10325 gimple_seq stmts
= NULL
;
10326 def
= gimple_convert (&stmts
, step_vectype
, def
);
10327 def
= gimple_build (&stmts
, PLUS_EXPR
, step_vectype
,
10328 def
, vec_steps
[ivn
% nivs
]);
10329 def
= gimple_convert (&stmts
, vectype
, def
);
10330 if (gimple_code (iv
) == GIMPLE_PHI
)
10331 gsi_insert_seq_before (&si
, stmts
, GSI_SAME_STMT
);
10334 gimple_stmt_iterator tgsi
= gsi_for_stmt (iv
);
10335 gsi_insert_seq_after (&tgsi
, stmts
, GSI_CONTINUE_LINKING
);
10337 slp_node
->push_vec_def (def
);
10341 new_bb
= gsi_insert_seq_on_edge_immediate (pe
, init_stmts
);
10342 gcc_assert (!new_bb
);
10347 init_expr
= vect_phi_initial_value (phi
);
10349 gimple_seq stmts
= NULL
;
10350 if (!nested_in_vect_loop
)
10352 /* Convert the initial value to the IV update type. */
10353 tree new_type
= TREE_TYPE (step_expr
);
10354 init_expr
= gimple_convert (&stmts
, new_type
, init_expr
);
10356 /* If we are using the loop mask to "peel" for alignment then we need
10357 to adjust the start value here. */
10358 tree skip_niters
= LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo
);
10359 if (skip_niters
!= NULL_TREE
)
10361 if (FLOAT_TYPE_P (vectype
))
10362 skip_niters
= gimple_build (&stmts
, FLOAT_EXPR
, new_type
,
10365 skip_niters
= gimple_convert (&stmts
, new_type
, skip_niters
);
10366 tree skip_step
= gimple_build (&stmts
, MULT_EXPR
, new_type
,
10367 skip_niters
, step_expr
);
10368 init_expr
= gimple_build (&stmts
, MINUS_EXPR
, new_type
,
10369 init_expr
, skip_step
);
10375 new_bb
= gsi_insert_seq_on_edge_immediate (pe
, stmts
);
10376 gcc_assert (!new_bb
);
10379 /* Create the vector that holds the initial_value of the induction. */
10380 if (nested_in_vect_loop
)
10382 /* iv_loop is nested in the loop to be vectorized. init_expr had already
10383 been created during vectorization of previous stmts. We obtain it
10384 from the STMT_VINFO_VEC_STMT of the defining stmt. */
10385 auto_vec
<tree
> vec_inits
;
10386 vect_get_vec_defs_for_operand (loop_vinfo
, stmt_info
, 1,
10387 init_expr
, &vec_inits
);
10388 vec_init
= vec_inits
[0];
10389 /* If the initial value is not of proper type, convert it. */
10390 if (!useless_type_conversion_p (vectype
, TREE_TYPE (vec_init
)))
10393 = gimple_build_assign (vect_get_new_ssa_name (vectype
,
10397 build1 (VIEW_CONVERT_EXPR
, vectype
,
10399 vec_init
= gimple_assign_lhs (new_stmt
);
10400 new_bb
= gsi_insert_on_edge_immediate (loop_preheader_edge (iv_loop
),
10402 gcc_assert (!new_bb
);
10407 /* iv_loop is the loop to be vectorized. Create:
10408 vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr) */
10410 new_name
= gimple_convert (&stmts
, TREE_TYPE (step_expr
), init_expr
);
10412 unsigned HOST_WIDE_INT const_nunits
;
10413 if (nunits
.is_constant (&const_nunits
))
10415 tree_vector_builder
elts (step_vectype
, const_nunits
, 1);
10416 elts
.quick_push (new_name
);
10417 for (i
= 1; i
< const_nunits
; i
++)
10419 /* Create: new_name_i = new_name + step_expr */
10420 new_name
= gimple_build (&stmts
, PLUS_EXPR
, TREE_TYPE (new_name
),
10421 new_name
, step_expr
);
10422 elts
.quick_push (new_name
);
10424 /* Create a vector from [new_name_0, new_name_1, ...,
10425 new_name_nunits-1] */
10426 vec_init
= gimple_build_vector (&stmts
, &elts
);
10428 else if (INTEGRAL_TYPE_P (TREE_TYPE (step_expr
)))
10429 /* Build the initial value directly from a VEC_SERIES_EXPR. */
10430 vec_init
= gimple_build (&stmts
, VEC_SERIES_EXPR
, step_vectype
,
10431 new_name
, step_expr
);
10435 [base, base, base, ...]
10436 + (vectype) [0, 1, 2, ...] * [step, step, step, ...]. */
10437 gcc_assert (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr
)));
10438 gcc_assert (flag_associative_math
);
10439 tree index
= build_index_vector (step_vectype
, 0, 1);
10440 tree base_vec
= gimple_build_vector_from_val (&stmts
, step_vectype
,
10442 tree step_vec
= gimple_build_vector_from_val (&stmts
, step_vectype
,
10444 vec_init
= gimple_build (&stmts
, FLOAT_EXPR
, step_vectype
, index
);
10445 vec_init
= gimple_build (&stmts
, MULT_EXPR
, step_vectype
,
10446 vec_init
, step_vec
);
10447 vec_init
= gimple_build (&stmts
, PLUS_EXPR
, step_vectype
,
10448 vec_init
, base_vec
);
10450 vec_init
= gimple_convert (&stmts
, vectype
, vec_init
);
10454 new_bb
= gsi_insert_seq_on_edge_immediate (pe
, stmts
);
10455 gcc_assert (!new_bb
);
10460 /* Create the vector that holds the step of the induction. */
10461 gimple_stmt_iterator
*step_iv_si
= NULL
;
10462 if (nested_in_vect_loop
)
10463 /* iv_loop is nested in the loop to be vectorized. Generate:
10464 vec_step = [S, S, S, S] */
10465 new_name
= step_expr
;
10466 else if (LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo
))
10468 /* When we're using loop_len produced by SELEC_VL, the non-final
10469 iterations are not always processing VF elements. So vectorize
10470 induction variable instead of
10472 _21 = vect_vec_iv_.6_22 + { VF, ... };
10474 We should generate:
10476 _35 = .SELECT_VL (ivtmp_33, VF);
10477 vect_cst__22 = [vec_duplicate_expr] _35;
10478 _21 = vect_vec_iv_.6_22 + vect_cst__22; */
10479 gcc_assert (!slp_node
);
10480 gimple_seq seq
= NULL
;
10481 vec_loop_lens
*lens
= &LOOP_VINFO_LENS (loop_vinfo
);
10482 tree len
= vect_get_loop_len (loop_vinfo
, NULL
, lens
, 1, vectype
, 0, 0);
10483 expr
= force_gimple_operand (fold_convert (TREE_TYPE (step_expr
),
10484 unshare_expr (len
)),
10485 &seq
, true, NULL_TREE
);
10486 new_name
= gimple_build (&seq
, MULT_EXPR
, TREE_TYPE (step_expr
), expr
,
10488 gsi_insert_seq_before (&si
, seq
, GSI_SAME_STMT
);
10493 /* iv_loop is the loop to be vectorized. Generate:
10494 vec_step = [VF*S, VF*S, VF*S, VF*S] */
10495 gimple_seq seq
= NULL
;
10496 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr
)))
10498 expr
= build_int_cst (integer_type_node
, vf
);
10499 expr
= gimple_build (&seq
, FLOAT_EXPR
, TREE_TYPE (step_expr
), expr
);
10502 expr
= build_int_cst (TREE_TYPE (step_expr
), vf
);
10503 new_name
= gimple_build (&seq
, MULT_EXPR
, TREE_TYPE (step_expr
),
10507 new_bb
= gsi_insert_seq_on_edge_immediate (pe
, seq
);
10508 gcc_assert (!new_bb
);
10512 t
= unshare_expr (new_name
);
10513 gcc_assert (CONSTANT_CLASS_P (new_name
)
10514 || TREE_CODE (new_name
) == SSA_NAME
);
10515 new_vec
= build_vector_from_val (step_vectype
, t
);
10516 vec_step
= vect_init_vector (loop_vinfo
, stmt_info
,
10517 new_vec
, step_vectype
, step_iv_si
);
10520 /* Create the following def-use cycle:
10525 vec_iv = PHI <vec_init, vec_loop>
10529 vec_loop = vec_iv + vec_step; */
10531 /* Create the induction-phi that defines the induction-operand. */
10532 vec_dest
= vect_get_new_vect_var (vectype
, vect_simple_var
, "vec_iv_");
10533 induction_phi
= create_phi_node (vec_dest
, iv_loop
->header
);
10534 induc_def
= PHI_RESULT (induction_phi
);
10536 /* Create the iv update inside the loop */
10538 vec_def
= gimple_convert (&stmts
, step_vectype
, induc_def
);
10539 vec_def
= gimple_build (&stmts
, PLUS_EXPR
, step_vectype
, vec_def
, vec_step
);
10540 vec_def
= gimple_convert (&stmts
, vectype
, vec_def
);
10541 gsi_insert_seq_before (&si
, stmts
, GSI_SAME_STMT
);
10542 new_stmt
= SSA_NAME_DEF_STMT (vec_def
);
10544 /* Set the arguments of the phi node: */
10545 add_phi_arg (induction_phi
, vec_init
, pe
, UNKNOWN_LOCATION
);
10546 add_phi_arg (induction_phi
, vec_def
, loop_latch_edge (iv_loop
),
10549 STMT_VINFO_VEC_STMTS (stmt_info
).safe_push (induction_phi
);
10550 *vec_stmt
= induction_phi
;
10552 /* In case that vectorization factor (VF) is bigger than the number
10553 of elements that we can fit in a vectype (nunits), we have to generate
10554 more than one vector stmt - i.e - we need to "unroll" the
10555 vector stmt by a factor VF/nunits. For more details see documentation
10556 in vectorizable_operation. */
10560 gimple_seq seq
= NULL
;
10561 /* FORNOW. This restriction should be relaxed. */
10562 gcc_assert (!nested_in_vect_loop
);
10563 /* We expect LOOP_VINFO_USING_SELECT_VL_P to be false if ncopies > 1. */
10564 gcc_assert (!LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo
));
10566 /* Create the vector that holds the step of the induction. */
10567 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr
)))
10569 expr
= build_int_cst (integer_type_node
, nunits
);
10570 expr
= gimple_build (&seq
, FLOAT_EXPR
, TREE_TYPE (step_expr
), expr
);
10573 expr
= build_int_cst (TREE_TYPE (step_expr
), nunits
);
10574 new_name
= gimple_build (&seq
, MULT_EXPR
, TREE_TYPE (step_expr
),
10578 new_bb
= gsi_insert_seq_on_edge_immediate (pe
, seq
);
10579 gcc_assert (!new_bb
);
10582 t
= unshare_expr (new_name
);
10583 gcc_assert (CONSTANT_CLASS_P (new_name
)
10584 || TREE_CODE (new_name
) == SSA_NAME
);
10585 new_vec
= build_vector_from_val (step_vectype
, t
);
10586 vec_step
= vect_init_vector (loop_vinfo
, stmt_info
,
10587 new_vec
, step_vectype
, NULL
);
10589 vec_def
= induc_def
;
10590 for (i
= 1; i
< ncopies
+ 1; i
++)
10592 /* vec_i = vec_prev + vec_step */
10593 gimple_seq stmts
= NULL
;
10594 vec_def
= gimple_convert (&stmts
, step_vectype
, vec_def
);
10595 vec_def
= gimple_build (&stmts
,
10596 PLUS_EXPR
, step_vectype
, vec_def
, vec_step
);
10597 vec_def
= gimple_convert (&stmts
, vectype
, vec_def
);
10599 gsi_insert_seq_before (&si
, stmts
, GSI_SAME_STMT
);
10602 new_stmt
= SSA_NAME_DEF_STMT (vec_def
);
10603 STMT_VINFO_VEC_STMTS (stmt_info
).safe_push (new_stmt
);
10607 /* vec_1 = vec_iv + (VF/n * S)
10608 vec_2 = vec_1 + (VF/n * S)
10610 vec_n = vec_prev + (VF/n * S) = vec_iv + VF * S = vec_loop
10612 vec_n is used as vec_loop to save the large step register and
10613 related operations. */
10614 add_phi_arg (induction_phi
, vec_def
, loop_latch_edge (iv_loop
),
10620 if (dump_enabled_p ())
10621 dump_printf_loc (MSG_NOTE
, vect_location
,
10622 "transform induction: created def-use cycle: %G%G",
10623 (gimple
*) induction_phi
, SSA_NAME_DEF_STMT (vec_def
));
10628 /* Function vectorizable_live_operation_1.
10630 helper function for vectorizable_live_operation. */
/* NOTE(review): this region appears extraction-mangled — statements are
   split across physical lines and brace/return-only source lines seem to
   be missing (original line numbers jump, e.g. 10638/10640/10645/10651
   are absent).  The comments added below describe only what the visible
   tokens establish; restore formatting against the upstream file before
   editing any logic here.  */
/* Materializes, in EXIT_BB, the scalar value of a live-out lane of the
   vectorized result VEC_LHS: builds a loop-closed PHI for VEC_LHS, then
   extracts one element (BIT_FIELD_REF, VEC_EXTRACT, or EXTRACT_LAST
   depending on the partial-vector scheme) and converts it to LHS_TYPE,
   returning the insertion point through *EXIT_GSI.  */
10633 vectorizable_live_operation_1 (loop_vec_info loop_vinfo
,
10634 stmt_vec_info stmt_info
, basic_block exit_bb
,
10635 tree vectype
, int ncopies
, slp_tree slp_node
,
10636 tree bitsize
, tree bitstart
, tree vec_lhs
,
10637 tree lhs_type
, gimple_stmt_iterator
*exit_gsi
)
/* EXIT_BB must have a single predecessor unless early-break
   vectorization (multiple exits) is in effect.  */
10639 gcc_assert (single_pred_p (exit_bb
) || LOOP_VINFO_EARLY_BREAKS (loop_vinfo
));
/* Build a loop-closed PHI for VEC_LHS in EXIT_BB; every incoming edge
   receives the same VEC_LHS value.  */
10641 tree vec_lhs_phi
= copy_ssa_name (vec_lhs
);
10642 gimple
*phi
= create_phi_node (vec_lhs_phi
, exit_bb
);
10643 for (unsigned i
= 0; i
< gimple_phi_num_args (phi
); i
++)
10644 SET_PHI_ARG_DEF (phi
, i
, vec_lhs
);
/* Statements that perform the extraction are accumulated here and
   inserted into EXIT_BB at the end of the function.  */
10646 gimple_seq stmts
= NULL
;
10649 /* If bitstart is 0 then we can use a BIT_FIELD_REF */
10650 if (integer_zerop (bitstart
))
10652 tree scalar_res
= gimple_build (&stmts
, BIT_FIELD_REF
, TREE_TYPE (vectype
),
10653 vec_lhs_phi
, bitsize
, bitstart
);
10655 /* Convert the extracted vector element to the scalar type. */
10656 new_tree
= gimple_convert (&stmts
, lhs_type
, scalar_res
);
/* Length-controlled partial vectors: the live lane is at LEN + BIAS - 1
   (see the comment below).  */
10658 else if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo
))
10662 SCALAR_RES = VEC_EXTRACT <VEC_LHS, LEN + BIAS - 1>
10664 where VEC_LHS is the vectorized live-out result and MASK is
10665 the loop mask for the final iteration. */
10666 gcc_assert (ncopies
== 1 && !slp_node
);
10667 gimple_seq tem
= NULL
;
10668 gimple_stmt_iterator gsi
= gsi_last (tem
);
10669 tree len
= vect_get_loop_len (loop_vinfo
, &gsi
,
10670 &LOOP_VINFO_LENS (loop_vinfo
),
/* BIAS_MINUS_ONE = BIAS - 1, folded as a constant since the bias is a
   compile-time signed char.  */
10674 signed char biasval
= LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo
);
10675 tree bias_minus_one
10676 = int_const_binop (MINUS_EXPR
,
10677 build_int_cst (TREE_TYPE (len
), biasval
),
10678 build_one_cst (TREE_TYPE (len
)));
10680 /* LAST_INDEX = LEN + (BIAS - 1). */
10681 tree last_index
= gimple_build (&stmts
, PLUS_EXPR
, TREE_TYPE (len
),
10682 len
, bias_minus_one
);
10684 /* SCALAR_RES = VEC_EXTRACT <VEC_LHS, LEN + BIAS - 1>. */
10686 = gimple_build (&stmts
, CFN_VEC_EXTRACT
, TREE_TYPE (vectype
),
10687 vec_lhs_phi
, last_index
);
10689 /* Convert the extracted vector element to the scalar type. */
10690 new_tree
= gimple_convert (&stmts
, lhs_type
, scalar_res
);
/* Mask-controlled partial vectors: use EXTRACT_LAST with the final
   iteration's loop mask (see the comment below).  */
10692 else if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo
))
10696 SCALAR_RES = EXTRACT_LAST <VEC_LHS, MASK>
10698 where VEC_LHS is the vectorized live-out result and MASK is
10699 the loop mask for the final iteration. */
10700 gcc_assert (!slp_node
);
10701 tree scalar_type
= TREE_TYPE (STMT_VINFO_VECTYPE (stmt_info
));
10702 gimple_seq tem
= NULL
;
10703 gimple_stmt_iterator gsi
= gsi_last (tem
);
10704 tree mask
= vect_get_loop_mask (loop_vinfo
, &gsi
,
10705 &LOOP_VINFO_MASKS (loop_vinfo
),
/* Append the mask-computation statements ahead of the extraction.  */
10708 gimple_seq_add_seq (&stmts
, tem
);
10710 scalar_res
= gimple_build (&stmts
, CFN_EXTRACT_LAST
, scalar_type
,
10711 mask
, vec_lhs_phi
);
10713 /* Convert the extracted vector element to the scalar type. */
10714 new_tree
= gimple_convert (&stmts
, lhs_type
, scalar_res
);
/* Default case: extract the lane at BITSTART with a BIT_FIELD_REF,
   using an unsigned integer view for boolean vectors (BIT_FIELD_REF
   cannot operate on vector-boolean types directly).  */
10718 tree bftype
= TREE_TYPE (vectype
);
10719 if (VECTOR_BOOLEAN_TYPE_P (vectype
))
10720 bftype
= build_nonstandard_integer_type (tree_to_uhwi (bitsize
), 1);
10721 new_tree
= build3 (BIT_FIELD_REF
, bftype
, vec_lhs_phi
, bitsize
, bitstart
);
10722 new_tree
= force_gimple_operand (fold_convert (lhs_type
, new_tree
),
10723 &stmts
, true, NULL_TREE
);
/* Emit the accumulated extraction statements at the start of EXIT_BB
   and hand the insertion point back to the caller.  */
10726 *exit_gsi
= gsi_after_labels (exit_bb
);
10728 gsi_insert_seq_before (exit_gsi
, stmts
, GSI_SAME_STMT
);
10733 /* Function vectorizable_live_operation.
10735 STMT_INFO computes a value that is used outside the loop. Check if
10736 it can be supported. */
/* NOTE(review): as with the helper above, this region appears
   extraction-mangled — statements are split across lines and several
   original source lines (braces, returns, argument tails) are missing.
   Added comments annotate only the visible structure; reconcile with the
   upstream file before changing logic.  */
/* Analysis phase (VEC_STMT_P false): validates that the live value can
   be extracted, possibly disabling partial vectors, and records a
   vec_to_scalar cost.  Transform phase (VEC_STMT_P true): extracts the
   final lane via vectorizable_live_operation_1 (loop case) or a direct
   BIT_FIELD_REF (basic-block case) and redirects out-of-loop uses.  */
10739 vectorizable_live_operation (vec_info
*vinfo
, stmt_vec_info stmt_info
,
10740 slp_tree slp_node
, slp_instance slp_node_instance
,
10741 int slp_index
, bool vec_stmt_p
,
10742 stmt_vector_for_cost
*cost_vec
)
/* The dyn_cast yields null when VINFO is not a loop_vec_info, i.e. for
   basic-block vectorization; later code branches on that.  */
10744 loop_vec_info loop_vinfo
= dyn_cast
<loop_vec_info
> (vinfo
);
10745 imm_use_iterator imm_iter
;
10746 tree lhs
, lhs_type
, bitsize
;
10747 tree vectype
= (slp_node
10748 ? SLP_TREE_VECTYPE (slp_node
)
10749 : STMT_VINFO_VECTYPE (stmt_info
));
10750 poly_uint64 nunits
= TYPE_VECTOR_SUBPARTS (vectype
);
10753 use_operand_p use_p
;
10754 auto_vec
<tree
> vec_oprnds
;
10756 poly_uint64 vec_index
= 0;
/* Only live stmts — or any stmt in an early-break loop — reach here.  */
10758 gcc_assert (STMT_VINFO_LIVE_P (stmt_info
)
10759 || LOOP_VINFO_EARLY_BREAKS (loop_vinfo
));
10761 /* If a stmt of a reduction is live, vectorize it via
10762 vect_create_epilog_for_reduction. vectorizable_reduction assessed
10763 validity so just trigger the transform here. */
10764 if (STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info
)))
10770 /* For reduction chains the meta-info is attached to
10771 the group leader. */
10772 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info
))
10773 stmt_info
= REDUC_GROUP_FIRST_ELEMENT (stmt_info
);
10774 /* For SLP reductions we vectorize the epilogue for
10775 all involved stmts together. */
10776 else if (slp_index
!= 0)
10779 stmt_vec_info reduc_info
= info_for_reduction (loop_vinfo
, stmt_info
);
10780 gcc_assert (reduc_info
->is_reduc_info
);
10781 if (STMT_VINFO_REDUC_TYPE (reduc_info
) == FOLD_LEFT_REDUCTION
10782 || STMT_VINFO_REDUC_TYPE (reduc_info
) == EXTRACT_LAST_REDUCTION
)
10785 vect_create_epilog_for_reduction (loop_vinfo
, stmt_info
, slp_node
,
10787 LOOP_VINFO_IV_EXIT (loop_vinfo
));
10789 /* If early break we only have to materialize the reduction on the merge
10790 block, but we have to find an alternate exit first. */
10791 if (LOOP_VINFO_EARLY_BREAKS (loop_vinfo
))
10793 for (auto exit
: get_loop_exit_edges (LOOP_VINFO_LOOP (loop_vinfo
)))
10794 if (exit
!= LOOP_VINFO_IV_EXIT (loop_vinfo
))
10796 vect_create_epilog_for_reduction (loop_vinfo
, stmt_info
,
10797 slp_node
, slp_node_instance
,
10806 /* If STMT is not relevant and it is a simple assignment and its inputs are
10807 invariant then it can remain in place, unvectorized. The original last
10808 scalar value that it computes will be used. */
10809 if (!STMT_VINFO_RELEVANT_P (stmt_info
))
10811 gcc_assert (is_simple_and_all_uses_invariant (stmt_info
, loop_vinfo
));
10812 if (dump_enabled_p ())
10813 dump_printf_loc (MSG_NOTE
, vect_location
,
10814 "statement is simple and uses invariant. Leaving in "
/* Number of vector stmts needed to cover VECTYPE for this loop.  */
10822 ncopies
= vect_get_num_copies (loop_vinfo
, vectype
)
;
10826 gcc_assert (slp_index
>= 0);
10828 /* Get the last occurrence of the scalar index from the concatenation of
10829 all the slp vectors. Calculate which slp vector it is and the index
10831 int num_scalar
= SLP_TREE_LANES (slp_node
);
10832 int num_vec
= SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node
);
10833 poly_uint64 pos
= (num_vec
* nunits
) - num_scalar
+ slp_index
;
10835 /* Calculate which vector contains the result, and which lane of
10836 that vector we need. */
10837 if (!can_div_trunc_p (pos
, nunits
, &vec_entry
, &vec_index
))
10839 if (dump_enabled_p ())
10840 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
10841 "Cannot determine which vector holds the"
10842 " final result.\n");
10849 /* No transformation required. */
/* Analysis-only branch: decide whether partial vectors remain usable
   with this live operation, disabling them where unsupported.  */
10850 if (loop_vinfo
&& LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo
))
10854 if (dump_enabled_p ())
10855 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
10856 "can't operate on partial vectors "
10857 "because an SLP statement is live after "
10859 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo
) = false;
10861 else if (ncopies
> 1)
10863 if (dump_enabled_p ())
10864 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
10865 "can't operate on partial vectors "
10866 "because ncopies is greater than 1.\n");
10867 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo
) = false;
10871 gcc_assert (ncopies
== 1 && !slp_node
);
/* Prefer EXTRACT_LAST with a loop mask; fall back to variable-index
   VEC_EXTRACT with a loop length; otherwise give up on partial
   vectors.  */
10872 if (direct_internal_fn_supported_p (IFN_EXTRACT_LAST
, vectype
,
10873 OPTIMIZE_FOR_SPEED
))
10874 vect_record_loop_mask (loop_vinfo
,
10875 &LOOP_VINFO_MASKS (loop_vinfo
),
10877 else if (can_vec_extract_var_idx_p (
10878 TYPE_MODE (vectype
), TYPE_MODE (TREE_TYPE (vectype
))))
10879 vect_record_loop_len (loop_vinfo
,
10880 &LOOP_VINFO_LENS (loop_vinfo
),
10884 if (dump_enabled_p ())
10886 MSG_MISSED_OPTIMIZATION
, vect_location
,
10887 "can't operate on partial vectors "
10888 "because the target doesn't support extract "
10889 "last reduction.\n");
10890 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo
) = false;
10894 /* ??? Enable for loop costing as well. */
10896 record_stmt_cost (cost_vec
, 1, vec_to_scalar
, stmt_info
, NULL_TREE
,
/* Transform phase starts here.  */
10901 /* Use the lhs of the original scalar statement. */
10902 gimple
*stmt
= vect_orig_stmt (stmt_info
)->stmt
;
10903 if (dump_enabled_p ())
10904 dump_printf_loc (MSG_NOTE
, vect_location
, "extracting lane for live "
10907 lhs
= gimple_get_lhs (stmt
);
10908 lhs_type
= TREE_TYPE (lhs
);
10910 bitsize
= vector_element_bits_tree (vectype
);
10912 /* Get the vectorized lhs of STMT and the lane to use (counted in bits). */
10913 tree vec_lhs
, vec_lhs0
, bitstart
;
10914 gimple
*vec_stmt
, *vec_stmt0
;
/* Per this assert, the SLP path is not reached for fully-masked or
   fully-with-length loops.  */
10917 gcc_assert (!loop_vinfo
10918 || (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo
)
10919 && !LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo
)));
10921 /* Get the correct slp vectorized stmt. */
10922 vec_lhs
= SLP_TREE_VEC_DEFS (slp_node
)[vec_entry
];
10923 vec_stmt
= SSA_NAME_DEF_STMT (vec_lhs
);
10925 /* In case we need to early break vectorize also get the first stmt. */
10926 vec_lhs0
= SLP_TREE_VEC_DEFS (slp_node
)[0];
10927 vec_stmt0
= SSA_NAME_DEF_STMT (vec_lhs0
);
10929 /* Get entry to use. */
10930 bitstart
= bitsize_int (vec_index
);
10931 bitstart
= int_const_binop (MULT_EXPR
, bitsize
, bitstart
);
10935 /* For multiple copies, get the last copy. */
10936 vec_stmt
= STMT_VINFO_VEC_STMTS (stmt_info
).last ();
10937 vec_lhs
= gimple_get_lhs (vec_stmt
);
10939 /* In case we need to early break vectorize also get the first stmt. */
10940 vec_stmt0
= STMT_VINFO_VEC_STMTS (stmt_info
)[0];
10941 vec_lhs0
= gimple_get_lhs (vec_stmt0
);
10943 /* Get the last lane in the vector. */
10944 bitstart
= int_const_binop (MULT_EXPR
, bitsize
, bitsize_int (nunits
- 1));
10949 /* Ensure the VEC_LHS for lane extraction stmts satisfy loop-closed PHI
10950 requirement, insert one phi node for it. It looks like:
10957 # vec_lhs' = PHI <vec_lhs>
10958 new_tree = lane_extract <vec_lhs', ...>;
10959 lhs' = new_tree; */
10961 class loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
10962 /* Check if we have a loop where the chosen exit is not the main exit,
10963 in these cases for an early break we restart the iteration the vector code
10964 did. For the live values we want the value at the start of the iteration
10965 rather than at the end. */
10966 edge main_e
= LOOP_VINFO_IV_EXIT (loop_vinfo
);
10967 bool all_exits_as_early_p
= LOOP_VINFO_EARLY_BREAKS_VECT_PEELED (loop_vinfo
);
/* Walk every out-of-loop (LC-SSA PHI) use of the scalar LHS and replace
   it with the extracted lane, materialized in that exit's destination
   block.  */
10968 FOR_EACH_IMM_USE_STMT (use_stmt
, imm_iter
, lhs
)
10969 if (!is_gimple_debug (use_stmt
)
10970 && !flow_bb_inside_loop_p (loop
, gimple_bb (use_stmt
)))
10971 FOR_EACH_IMM_USE_ON_STMT (use_p
, imm_iter
)
10973 edge e
= gimple_phi_arg_edge (as_a
<gphi
*> (use_stmt
),
10974 phi_arg_index_from_use (use_p
));
10975 gcc_assert (loop_exit_edge_p (loop
, e
));
10976 bool main_exit_edge
= e
== main_e
;
10977 tree tmp_vec_lhs
= vec_lhs
;
10978 tree tmp_bitstart
= bitstart
;
10980 /* For early exit where the exit is not in the BB that leads
10981 to the latch then we're restarting the iteration in the
10982 scalar loop. So get the first live value. */
10983 if ((all_exits_as_early_p
|| !main_exit_edge
)
10984 && STMT_VINFO_DEF_TYPE (stmt_info
) == vect_induction_def
)
10986 tmp_vec_lhs
= vec_lhs0
;
10987 tmp_bitstart
= build_zero_cst (TREE_TYPE (bitstart
));
10990 gimple_stmt_iterator exit_gsi
;
10992 = vectorizable_live_operation_1 (loop_vinfo
, stmt_info
,
10993 e
->dest
, vectype
, ncopies
,
10995 tmp_bitstart
, tmp_vec_lhs
,
10996 lhs_type
, &exit_gsi
);
/* Replace the LC-SSA PHI with an assignment of the extracted value to
   the PHI result.  */
10998 auto gsi
= gsi_for_stmt (use_stmt
);
10999 remove_phi_node (&gsi
, false);
11000 tree lhs_phi
= gimple_phi_result (use_stmt
);
11001 gimple
*copy
= gimple_build_assign (lhs_phi
, new_tree
);
11002 gsi_insert_before (&exit_gsi
, copy
, GSI_SAME_STMT
);
11006 /* There a no further out-of-loop uses of lhs by LC-SSA construction. */
11007 FOR_EACH_IMM_USE_STMT (use_stmt
, imm_iter
, lhs
)
11008 gcc_assert (flow_bb_inside_loop_p (loop
, gimple_bb (use_stmt
)));
11012 /* For basic-block vectorization simply insert the lane-extraction. */
11013 tree bftype
= TREE_TYPE (vectype
);
11014 if (VECTOR_BOOLEAN_TYPE_P (vectype
))
11015 bftype
= build_nonstandard_integer_type (tree_to_uhwi (bitsize
), 1);
11016 tree new_tree
= build3 (BIT_FIELD_REF
, bftype
,
11017 vec_lhs
, bitsize
, bitstart
);
11018 gimple_seq stmts
= NULL
;
11019 new_tree
= force_gimple_operand (fold_convert (lhs_type
, new_tree
),
11020 &stmts
, true, NULL_TREE
);
/* Propagate the abnormal-PHI marker so SSA verification stays happy.  */
11021 if (TREE_CODE (new_tree
) == SSA_NAME
11022 && SSA_NAME_OCCURS_IN_ABNORMAL_PHI (lhs
))
11023 SSA_NAME_OCCURS_IN_ABNORMAL_PHI (new_tree
) = 1;
/* Insert after the defining vector stmt — before the first non-label
   stmt of its block when it is a PHI, directly after it otherwise.  */
11024 if (is_a
<gphi
*> (vec_stmt
))
11026 gimple_stmt_iterator si
= gsi_after_labels (gimple_bb (vec_stmt
));
11027 gsi_insert_seq_before (&si
, stmts
, GSI_SAME_STMT
);
11031 gimple_stmt_iterator si
= gsi_for_stmt (vec_stmt
);
11032 gsi_insert_seq_after (&si
, stmts
, GSI_SAME_STMT
);
11035 /* Replace use of lhs with newly computed result. If the use stmt is a
11036 single arg PHI, just replace all uses of PHI result. It's necessary
11037 because lcssa PHI defining lhs may be before newly inserted stmt. */
11038 use_operand_p use_p
;
11039 stmt_vec_info use_stmt_info
;
11040 FOR_EACH_IMM_USE_STMT (use_stmt
, imm_iter
, lhs
)
11041 if (!is_gimple_debug (use_stmt
)
11042 && (!(use_stmt_info
= vinfo
->lookup_stmt (use_stmt
))
11043 || !PURE_SLP_STMT (vect_stmt_to_vectorize (use_stmt_info
))))
11045 /* ??? This can happen when the live lane ends up being
11046 rooted in a vector construction code-generated by an
11047 external SLP node (and code-generation for that already
11048 happened). See gcc.dg/vect/bb-slp-47.c.
11049 Doing this is what would happen if that vector CTOR
11050 were not code-generated yet so it is not too bad.
11051 ??? In fact we'd likely want to avoid this situation
11052 in the first place. */
11053 if (TREE_CODE (new_tree
) == SSA_NAME
11054 && !SSA_NAME_IS_DEFAULT_DEF (new_tree
)
11055 && gimple_code (use_stmt
) != GIMPLE_PHI
11056 && !vect_stmt_dominates_stmt_p (SSA_NAME_DEF_STMT (new_tree
),
11059 if (dump_enabled_p ())
11060 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
11061 "Using original scalar computation for "
11062 "live lane because use preceeds vector "
11066 /* ??? It can also happen that we end up pulling a def into
11067 a loop where replacing out-of-loop uses would require
11068 a new LC SSA PHI node. Retain the original scalar in
11069 those cases as well. PR98064. */
11070 if (TREE_CODE (new_tree
) == SSA_NAME
11071 && !SSA_NAME_IS_DEFAULT_DEF (new_tree
)
11072 && (gimple_bb (use_stmt
)->loop_father
11073 != gimple_bb (vec_stmt
)->loop_father
)
11074 && !flow_loop_nested_p (gimple_bb (vec_stmt
)->loop_father
,
11075 gimple_bb (use_stmt
)->loop_father
))
11077 if (dump_enabled_p ())
11078 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
11079 "Using original scalar computation for "
11080 "live lane because there is an out-of-loop "
11081 "definition for it\n");
11084 FOR_EACH_IMM_USE_ON_STMT (use_p
, imm_iter
)
11085 SET_USE (use_p
, new_tree
);
11086 update_stmt (use_stmt
);
/* Kill any debug uses outside LOOP of SSA names defined in STMT_INFO.
   Debug binds that refer to a vectorized scalar definition from outside
   the loop would otherwise keep the scalar computation alive (or refer
   to a removed definition), so reset their values.  */

static void
vect_loop_kill_debug_uses (class loop *loop, stmt_vec_info stmt_info)
{
  ssa_op_iter op_iter;
  imm_use_iterator imm_iter;
  def_operand_p def_p;
  gimple *ustmt;

  FOR_EACH_PHI_OR_STMT_DEF (def_p, stmt_info->stmt, op_iter, SSA_OP_DEF)
    {
      FOR_EACH_IMM_USE_STMT (ustmt, imm_iter, DEF_FROM_PTR (def_p))
	{
	  basic_block bb;

	  /* Only debug statements are of interest here; real uses
	     outside the loop are handled elsewhere (LC SSA).  */
	  if (!is_gimple_debug (ustmt))
	    continue;

	  bb = gimple_bb (ustmt);

	  if (!flow_bb_inside_loop_p (loop, bb))
	    {
	      if (gimple_debug_bind_p (ustmt))
		{
		  if (dump_enabled_p ())
		    dump_printf_loc (MSG_NOTE, vect_location,
				     "killing debug use\n");

		  /* Reset the bind value rather than removing the bind,
		     so debug info degrades gracefully.  */
		  gimple_debug_bind_reset_value (ustmt);
		  update_stmt (ustmt);
		}
	      else
		gcc_unreachable ();
	    }
	}
    }
}
/* Given loop represented by LOOP_VINFO, return true if computation of
   LOOP_VINFO_NITERS (= LOOP_VINFO_NITERSM1 + 1) doesn't overflow, false
   otherwise.  */

static bool
loop_niters_no_overflow (loop_vec_info loop_vinfo)
{
  /* Constant case.  */
  if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
    {
      tree cst_niters = LOOP_VINFO_NITERS (loop_vinfo);
      tree cst_nitersm1 = LOOP_VINFO_NITERSM1 (loop_vinfo);

      gcc_assert (TREE_CODE (cst_niters) == INTEGER_CST);
      gcc_assert (TREE_CODE (cst_nitersm1) == INTEGER_CST);
      /* NITERSM1 + 1 overflowed iff NITERS wrapped around to a value
	 no bigger than NITERSM1; the strict inequality proves it didn't.  */
      if (wi::to_widest (cst_nitersm1) < wi::to_widest (cst_niters))
	return true;
    }

  widest_int max;
  class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  /* Check the upper bound of loop niters.  If the maximum iteration
     count is strictly below the type's maximum, the +1 cannot wrap.  */
  if (get_max_loop_iterations (loop, &max))
    {
      tree type = TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo));
      signop sgn = TYPE_SIGN (type);
      widest_int type_max = widest_int::from (wi::max_value (type), sgn);
      if (max < type_max)
	return true;
    }
  return false;
}
11165 /* Return a mask type with half the number of elements as OLD_TYPE,
11166 given that it should have mode NEW_MODE. */
11169 vect_halve_mask_nunits (tree old_type
, machine_mode new_mode
)
11171 poly_uint64 nunits
= exact_div (TYPE_VECTOR_SUBPARTS (old_type
), 2);
11172 return build_truth_vector_type_for_mode (nunits
, new_mode
);
11175 /* Return a mask type with twice as many elements as OLD_TYPE,
11176 given that it should have mode NEW_MODE. */
11179 vect_double_mask_nunits (tree old_type
, machine_mode new_mode
)
11181 poly_uint64 nunits
= TYPE_VECTOR_SUBPARTS (old_type
) * 2;
11182 return build_truth_vector_type_for_mode (nunits
, new_mode
);
/* Record that a fully-masked version of LOOP_VINFO would need MASKS to
   contain a sequence of NVECTORS masks that each control a vector of type
   VECTYPE.  If SCALAR_MASK is nonnull, the fully-masked loop would AND
   these vector masks with the vector version of SCALAR_MASK.  */

void
vect_record_loop_mask (loop_vec_info loop_vinfo, vec_loop_masks *masks,
		       unsigned int nvectors, tree vectype, tree scalar_mask)
{
  /* An rgroup must control at least one vector.  */
  gcc_assert (nvectors != 0);

  if (scalar_mask)
    {
      /* Remember that this scalar condition is used under a loop mask,
	 so later queries can detect masked condition reuse.  */
      scalar_cond_masked_key cond (scalar_mask, nvectors);
      loop_vinfo->scalar_cond_masked_set.add (cond);
    }

  /* The actual mask SSA names are created lazily in vect_get_loop_mask;
     here we only record the (type, count) requirement.  */
  masks->mask_set.add (std::make_pair (vectype, nvectors));
}
/* Given a complete set of masks MASKS, extract mask number INDEX
   for an rgroup that operates on NVECTORS vectors of type VECTYPE,
   where 0 <= INDEX < NVECTORS.  Insert any set-up statements before GSI.

   See the comment above vec_loop_masks for more details about the mask
   arrangement.  */

tree
vect_get_loop_mask (loop_vec_info loop_vinfo,
		    gimple_stmt_iterator *gsi, vec_loop_masks *masks,
		    unsigned int nvectors, tree vectype, unsigned int index)
{
  if (LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo)
      == vect_partial_vectors_while_ult)
    {
      /* Rgroups are indexed by the number of vectors they control.  */
      rgroup_controls *rgm = &(masks->rgc_vec)[nvectors - 1];
      tree mask_type = rgm->type;

      /* Populate the rgroup's mask array, if this is the first time we've
	 used it.  */
      if (rgm->controls.is_empty ())
	{
	  rgm->controls.safe_grow_cleared (nvectors, true);
	  for (unsigned int i = 0; i < nvectors; ++i)
	    {
	      tree mask = make_temp_ssa_name (mask_type, NULL, "loop_mask");
	      /* Provide a dummy definition until the real one is available.  */
	      SSA_NAME_DEF_STMT (mask) = gimple_build_nop ();
	      rgm->controls[i] = mask;
	    }
	}

      tree mask = rgm->controls[index];
      if (maybe_ne (TYPE_VECTOR_SUBPARTS (mask_type),
		    TYPE_VECTOR_SUBPARTS (vectype)))
	{
	  /* A loop mask for data type X can be reused for data type Y
	     if X has N times more elements than Y and if Y's elements
	     are N times bigger than X's.  In this case each sequence
	     of N elements in the loop mask will be all-zero or all-one.
	     We can then view-convert the mask so that each sequence of
	     N elements is replaced by a single element.  */
	  gcc_assert (multiple_p (TYPE_VECTOR_SUBPARTS (mask_type),
				  TYPE_VECTOR_SUBPARTS (vectype)));
	  gimple_seq seq = NULL;
	  mask_type = truth_type_for (vectype);
	  mask = gimple_build (&seq, VIEW_CONVERT_EXPR, mask_type, mask);
	  if (seq)
	    gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
	}
      return mask;
    }
  else if (LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo)
	   == vect_partial_vectors_avx512)
    {
      /* The number of scalars per iteration and the number of vectors are
	 both compile-time constants.  */
      unsigned int nscalars_per_iter
	= exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
		     LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();

      /* For this style rgroups are indexed by scalars per iteration,
	 not by vector count.  */
      rgroup_controls *rgm = &masks->rgc_vec[nscalars_per_iter - 1];

      /* The stored nV is dependent on the mask type produced.  */
      gcc_assert (exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
			     TYPE_VECTOR_SUBPARTS (rgm->type)).to_constant ()
		  == rgm->factor);
      nvectors = rgm->factor;

      /* Populate the rgroup's mask array, if this is the first time we've
	 used it.  */
      if (rgm->controls.is_empty ())
	{
	  rgm->controls.safe_grow_cleared (nvectors, true);
	  for (unsigned int i = 0; i < nvectors; ++i)
	    {
	      tree mask = make_temp_ssa_name (rgm->type, NULL, "loop_mask");
	      /* Provide a dummy definition until the real one is available.  */
	      SSA_NAME_DEF_STMT (mask) = gimple_build_nop ();
	      rgm->controls[i] = mask;
	    }
	}
      /* Fast path: the stored mask already has the wanted element count.  */
      if (known_eq (TYPE_VECTOR_SUBPARTS (rgm->type),
		    TYPE_VECTOR_SUBPARTS (vectype)))
	return rgm->controls[index];

      /* Split the vector if needed.  Since we are dealing with integer mode
	 masks with AVX512 we can operate on the integer representation
	 performing the whole vector shifting.  */
      unsigned HOST_WIDE_INT factor;
      bool ok = constant_multiple_p (TYPE_VECTOR_SUBPARTS (rgm->type),
				     TYPE_VECTOR_SUBPARTS (vectype), &factor);
      gcc_assert (ok);
      gcc_assert (GET_MODE_CLASS (TYPE_MODE (rgm->type)) == MODE_INT);
      tree mask_type = truth_type_for (vectype);
      gcc_assert (GET_MODE_CLASS (TYPE_MODE (mask_type)) == MODE_INT);
      /* INDEX selects sub-mask VPART within stored mask VI.  */
      unsigned vi = index / factor;
      unsigned vpart = index % factor;
      tree vec = rgm->controls[vi];
      gimple_seq seq = NULL;
      vec = gimple_build (&seq, VIEW_CONVERT_EXPR,
			  lang_hooks.types.type_for_mode
				(TYPE_MODE (rgm->type), 1), vec);
      /* For integer mode masks simply shift the right bits into position.  */
      if (vpart != 0)
	vec = gimple_build (&seq, RSHIFT_EXPR, TREE_TYPE (vec), vec,
			    build_int_cst (integer_type_node,
					   (TYPE_VECTOR_SUBPARTS (vectype)
					    * vpart)));
      vec = gimple_convert (&seq, lang_hooks.types.type_for_mode
				    (TYPE_MODE (mask_type), 1), vec);
      vec = gimple_build (&seq, VIEW_CONVERT_EXPR, mask_type, vec);
      if (seq)
	gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
      return vec;
    }
  else
    gcc_unreachable ();
}
/* Record that LOOP_VINFO would need LENS to contain a sequence of NVECTORS
   lengths for controlling an operation on VECTYPE.  The operation splits
   each element of VECTYPE into FACTOR separate subelements, measuring the
   length as a number of these subelements.  */

void
vect_record_loop_len (loop_vec_info loop_vinfo, vec_loop_lens *lens,
		      unsigned int nvectors, tree vectype, unsigned int factor)
{
  gcc_assert (nvectors != 0);
  if (lens->length () < nvectors)
    lens->safe_grow_cleared (nvectors, true);
  /* Rgroups are indexed by the number of vectors they control.  */
  rgroup_controls *rgl = &(*lens)[nvectors - 1];

  /* The number of scalars per iteration, scalar occupied bytes and
     the number of vectors are both compile-time constants.  */
  unsigned int nscalars_per_iter
    = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
		 LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();

  /* Keep the maximum over all recorded uses of this rgroup.  */
  if (rgl->max_nscalars_per_iter < nscalars_per_iter)
    {
      /* For now, we only support cases in which all loads and stores fall back
	 to VnQI or none do.  */
      gcc_assert (!rgl->max_nscalars_per_iter
		  || (rgl->factor == 1 && factor == 1)
		  || (rgl->max_nscalars_per_iter * rgl->factor
		      == nscalars_per_iter * factor));
      rgl->max_nscalars_per_iter = nscalars_per_iter;
      rgl->type = vectype;
      rgl->factor = factor;
    }
}
/* Given a complete set of lengths LENS, extract length number INDEX
   for an rgroup that operates on NVECTORS vectors of type VECTYPE,
   where 0 <= INDEX < NVECTORS.  Return a value that contains FACTOR
   multipled by the number of elements that should be processed.
   Insert any set-up statements before GSI.  */

tree
vect_get_loop_len (loop_vec_info loop_vinfo, gimple_stmt_iterator *gsi,
		   vec_loop_lens *lens, unsigned int nvectors, tree vectype,
		   unsigned int index, unsigned int factor)
{
  rgroup_controls *rgl = &(*lens)[nvectors - 1];
  /* A nonzero target bias means loads/stores consume a biased length.  */
  bool use_bias_adjusted_len =
    LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo) != 0;

  /* Populate the rgroup's len array, if this is the first time we've
     used it.  */
  if (rgl->controls.is_empty ())
    {
      rgl->controls.safe_grow_cleared (nvectors, true);
      for (unsigned int i = 0; i < nvectors; ++i)
	{
	  tree len_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
	  gcc_assert (len_type != NULL_TREE);

	  tree len = make_temp_ssa_name (len_type, NULL, "loop_len");

	  /* Provide a dummy definition until the real one is available.  */
	  SSA_NAME_DEF_STMT (len) = gimple_build_nop ();
	  rgl->controls[i] = len;

	  if (use_bias_adjusted_len)
	    {
	      /* The biased variant only exists for single-control
		 rgroups.  */
	      gcc_assert (i == 0);
	      tree adjusted_len =
		make_temp_ssa_name (len_type, NULL, "adjusted_loop_len");
	      SSA_NAME_DEF_STMT (adjusted_len) = gimple_build_nop ();
	      rgl->bias_adjusted_ctrl = adjusted_len;
	    }
	}
    }

  if (use_bias_adjusted_len)
    return rgl->bias_adjusted_ctrl;

  tree loop_len = rgl->controls[index];
  if (rgl->factor == 1 && factor == 1)
    {
      poly_int64 nunits1 = TYPE_VECTOR_SUBPARTS (rgl->type);
      poly_int64 nunits2 = TYPE_VECTOR_SUBPARTS (vectype);
      if (maybe_ne (nunits1, nunits2))
	{
	  /* A loop len for data type X can be reused for data type Y
	     if X has N times more elements than Y and if Y's elements
	     are N times bigger than X's.  */
	  gcc_assert (multiple_p (nunits1, nunits2));
	  factor = exact_div (nunits1, nunits2).to_constant ();
	  tree iv_type = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo);
	  gimple_seq seq = NULL;
	  loop_len = gimple_build (&seq, RDIV_EXPR, iv_type, loop_len,
				   build_int_cst (iv_type, factor));
	  if (seq)
	    gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
	}
    }

  return loop_len;
}
11427 /* Scale profiling counters by estimation for LOOP which is vectorized
11429 If FLAT is true, the loop we started with had unrealistically flat
11433 scale_profile_for_vect_loop (class loop
*loop
, edge exit_e
, unsigned vf
, bool flat
)
11435 /* For flat profiles do not scale down proportionally by VF and only
11436 cap by known iteration count bounds. */
11439 if (dump_file
&& (dump_flags
& TDF_DETAILS
))
11440 fprintf (dump_file
,
11441 "Vectorized loop profile seems flat; not scaling iteration "
11442 "count down by the vectorization factor %i\n", vf
);
11443 scale_loop_profile (loop
, profile_probability::always (),
11444 get_likely_max_loop_iterations_int (loop
));
11447 /* Loop body executes VF fewer times and exit increases VF times. */
11448 profile_count entry_count
= loop_preheader_edge (loop
)->count ();
11450 /* If we have unreliable loop profile avoid dropping entry
11451 count bellow header count. This can happen since loops
11452 has unrealistically low trip counts. */
11454 && loop
->header
->count
> entry_count
11455 && loop
->header
->count
< entry_count
* vf
)
11457 if (dump_file
&& (dump_flags
& TDF_DETAILS
))
11458 fprintf (dump_file
,
11459 "Vectorization factor %i seems too large for profile "
11460 "prevoiusly believed to be consistent; reducing.\n", vf
);
11464 if (entry_count
.nonzero_p ())
11465 set_edge_probability_and_rescale_others
11467 entry_count
.probability_in (loop
->header
->count
/ vf
));
11468 /* Avoid producing very large exit probability when we do not have
11469 sensible profile. */
11470 else if (exit_e
->probability
< profile_probability::always () / (vf
* 2))
11471 set_edge_probability_and_rescale_others (exit_e
, exit_e
->probability
* vf
);
11472 loop
->latch
->count
= single_pred_edge (loop
->latch
)->count ();
11474 scale_loop_profile (loop
, profile_probability::always () / vf
,
11475 get_likely_max_loop_iterations_int (loop
));
/* For a vectorized stmt DEF_STMT_INFO adjust all vectorized PHI
   latch edge values originally defined by it.  */

static void
maybe_set_vectorized_backedge_value (loop_vec_info loop_vinfo,
				     stmt_vec_info def_stmt_info)
{
  tree def = gimple_get_lhs (vect_orig_stmt (def_stmt_info)->stmt);
  if (!def || TREE_CODE (def) != SSA_NAME)
    return;
  stmt_vec_info phi_info;
  imm_use_iterator iter;
  use_operand_p use_p;
  FOR_EACH_IMM_USE_FAST (use_p, iter, def)
    {
      gphi *phi = dyn_cast <gphi *> (USE_STMT (use_p));
      if (!phi)
	continue;
      /* Only consider relevant header PHIs of the loop DEF feeds.  */
      if (!(gimple_bb (phi)->loop_father->header == gimple_bb (phi)
	    && (phi_info = loop_vinfo->lookup_stmt (phi))
	    && STMT_VINFO_RELEVANT_P (phi_info)))
	continue;
      loop_p loop = gimple_bb (phi)->loop_father;
      edge e = loop_latch_edge (loop);
      /* DEF must be the value flowing in over the backedge.  */
      if (PHI_ARG_DEF_FROM_EDGE (phi, e) != def)
	continue;

      if (VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (phi_info))
	  && STMT_VINFO_REDUC_TYPE (phi_info) != FOLD_LEFT_REDUCTION
	  && STMT_VINFO_REDUC_TYPE (phi_info) != EXTRACT_LAST_REDUCTION)
	{
	  /* Fill in the missing latch argument of each vectorized PHI
	     with the corresponding vectorized latch definition.  */
	  vec<gimple *> &phi_defs = STMT_VINFO_VEC_STMTS (phi_info);
	  vec<gimple *> &latch_defs = STMT_VINFO_VEC_STMTS (def_stmt_info);
	  gcc_assert (phi_defs.length () == latch_defs.length ());
	  for (unsigned i = 0; i < phi_defs.length (); ++i)
	    add_phi_arg (as_a <gphi *> (phi_defs[i]),
			 gimple_get_lhs (latch_defs[i]), e,
			 gimple_phi_arg_location (phi, e->dest_idx));
	}
      else if (STMT_VINFO_DEF_TYPE (phi_info) == vect_first_order_recurrence)
	{
	  /* For first order recurrences we have to update both uses of
	     the latch definition, the one in the PHI node and the one
	     in the generated VEC_PERM_EXPR.  */
	  vec<gimple *> &phi_defs = STMT_VINFO_VEC_STMTS (phi_info);
	  vec<gimple *> &latch_defs = STMT_VINFO_VEC_STMTS (def_stmt_info);
	  gcc_assert (phi_defs.length () == latch_defs.length ());
	  tree phidef = gimple_assign_rhs1 (phi_defs[0]);
	  gphi *vphi = as_a <gphi *> (SSA_NAME_DEF_STMT (phidef));
	  for (unsigned i = 0; i < phi_defs.length (); ++i)
	    {
	      gassign *perm = as_a <gassign *> (phi_defs[i]);
	      /* The first permute's rhs1 comes from the vector PHI and
		 stays as-is; later ones chain the previous latch def.  */
	      if (i > 0)
		gimple_assign_set_rhs1 (perm, gimple_get_lhs (latch_defs[i-1]));
	      gimple_assign_set_rhs2 (perm, gimple_get_lhs (latch_defs[i]));
	      update_stmt (perm);
	    }
	  add_phi_arg (vphi, gimple_get_lhs (latch_defs.last ()), e,
		       gimple_phi_arg_location (phi, e->dest_idx));
	}
    }
}
/* Vectorize STMT_INFO if relevant, inserting any new instructions before GSI.
   When vectorizing STMT_INFO as a store, set *SEEN_STORE to its
   stmt_vec_info.  */

static void
vect_transform_loop_stmt (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
			  gimple_stmt_iterator *gsi, stmt_vec_info *seen_store)
{
  class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
		     "------>vectorizing statement: %G", stmt_info->stmt);

  /* Debug binds referencing non-live scalar defs must not survive
     vectorization; reset them first.  */
  if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
    vect_loop_kill_debug_uses (loop, stmt_info);

  if (!STMT_VINFO_RELEVANT_P (stmt_info)
      && !STMT_VINFO_LIVE_P (stmt_info))
    {
      if (is_gimple_call (stmt_info->stmt)
	  && gimple_call_internal_p (stmt_info->stmt, IFN_MASK_CALL))
	{
	  /* IFN_MASK_CALL stmts are removed after vectorization; record
	     them like a store so the caller refreshes its iterators.  */
	  gcc_assert (!gimple_call_lhs (stmt_info->stmt));
	  *seen_store = stmt_info;
	  return;
	}
      return;
    }

  if (STMT_VINFO_VECTYPE (stmt_info))
    {
      poly_uint64 nunits
	= TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
      if (!STMT_SLP_TYPE (stmt_info)
	  && maybe_ne (nunits, vf)
	  && dump_enabled_p ())
	/* For SLP VF is set according to unrolling factor, and not
	   to vector size, hence for SLP this print is not valid.  */
	dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
    }

  /* Pure SLP statements have already been vectorized.  We still need
     to apply loop vectorization to hybrid SLP statements.  */
  if (PURE_SLP_STMT (stmt_info))
    return;

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location, "transform statement.\n");

  if (vect_transform_stmt (loop_vinfo, stmt_info, gsi, NULL, NULL))
    *seen_store = stmt_info;
}
11598 /* Helper function to pass to simplify_replace_tree to enable replacing tree's
11599 in the hash_map with its corresponding values. */
11602 find_in_mapping (tree t
, void *context
)
11604 hash_map
<tree
,tree
>* mapping
= (hash_map
<tree
, tree
>*) context
;
11606 tree
*value
= mapping
->get (t
);
11607 return value
? *value
: t
;
/* Update EPILOGUE's loop_vec_info.  EPILOGUE was constructed as a copy of the
   original loop that has now been vectorized.

   The inits of the data_references need to be advanced with the number of
   iterations of the main loop.  This has been computed in vect_do_peeling and
   is stored in parameter ADVANCE.  We first restore the data_references
   initial offset with the values recored in ORIG_DRS_INIT.

   Since the loop_vec_info of this EPILOGUE was constructed for the original
   loop, its stmt_vec_infos all point to the original statements.  These need
   to be updated to point to their corresponding copies as well as the SSA_NAMES
   in their PATTERN_DEF_SEQs and RELATED_STMTs.

   The data_reference's connections also need to be updated.  Their
   corresponding dr_vec_info need to be reconnected to the EPILOGUE's
   stmt_vec_infos, their statements need to point to their corresponding copy,
   if they are gather loads or scatter stores then their reference needs to be
   updated to point to its corresponding copy and finally we set
   'base_misaligned' to false as we have already peeled for alignment in the
   prologue of the main loop.  */

static void
update_epilogue_loop_vinfo (class loop *epilogue, tree advance)
{
  loop_vec_info epilogue_vinfo = loop_vec_info_for_loop (epilogue);
  auto_vec<gimple *> stmt_worklist;
  hash_map<tree,tree> mapping;
  gimple *orig_stmt, *new_stmt;
  gimple_stmt_iterator epilogue_gsi;
  gphi_iterator epilogue_phi_gsi;
  stmt_vec_info stmt_vinfo = NULL, related_vinfo;
  basic_block *epilogue_bbs = get_loop_body (epilogue);
  unsigned i;

  free (LOOP_VINFO_BBS (epilogue_vinfo));
  LOOP_VINFO_BBS (epilogue_vinfo) = epilogue_bbs;

  /* Advance data_reference's with the number of iterations of the previous
     loop and its prologue.  */
  vect_update_inits_of_drs (epilogue_vinfo, advance, PLUS_EXPR);

  /* The EPILOGUE loop is a copy of the original loop so they share the same
     gimple UIDs.  In this loop we update the loop_vec_info of the EPILOGUE to
     point to the copied statements.  We also create a mapping of all LHS' in
     the original loop and all the LHS' in the EPILOGUE and create worklists to
     update the STMT_VINFO_PATTERN_DEF_SEQs and STMT_VINFO_RELATED_STMTs.  */
  for (unsigned i = 0; i < epilogue->num_nodes; ++i)
    {
      for (epilogue_phi_gsi = gsi_start_phis (epilogue_bbs[i]);
	   !gsi_end_p (epilogue_phi_gsi); gsi_next (&epilogue_phi_gsi))
	{
	  new_stmt = epilogue_phi_gsi.phi ();

	  /* UIDs are 1-based; 0 would mean an unknown statement.  */
	  gcc_assert (gimple_uid (new_stmt) > 0);
	  stmt_vinfo
	    = epilogue_vinfo->stmt_vec_infos[gimple_uid (new_stmt) - 1];

	  orig_stmt = STMT_VINFO_STMT (stmt_vinfo);
	  STMT_VINFO_STMT (stmt_vinfo) = new_stmt;

	  mapping.put (gimple_phi_result (orig_stmt),
		       gimple_phi_result (new_stmt));
	  /* PHI nodes can not have patterns or related statements.  */
	  gcc_assert (STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo) == NULL
		      && STMT_VINFO_RELATED_STMT (stmt_vinfo) == NULL);
	}

      for (epilogue_gsi = gsi_start_bb (epilogue_bbs[i]);
	   !gsi_end_p (epilogue_gsi); gsi_next (&epilogue_gsi))
	{
	  new_stmt = gsi_stmt (epilogue_gsi);
	  if (is_gimple_debug (new_stmt))
	    continue;

	  gcc_assert (gimple_uid (new_stmt) > 0);
	  stmt_vinfo
	    = epilogue_vinfo->stmt_vec_infos[gimple_uid (new_stmt) - 1];

	  orig_stmt = STMT_VINFO_STMT (stmt_vinfo);
	  STMT_VINFO_STMT (stmt_vinfo) = new_stmt;

	  if (tree old_lhs = gimple_get_lhs (orig_stmt))
	    mapping.put (old_lhs, gimple_get_lhs (new_stmt));

	  if (STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo))
	    {
	      gimple_seq seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo);
	      for (gimple_stmt_iterator gsi = gsi_start (seq);
		   !gsi_end_p (gsi); gsi_next (&gsi))
		stmt_worklist.safe_push (gsi_stmt (gsi));
	    }

	  related_vinfo = STMT_VINFO_RELATED_STMT (stmt_vinfo);
	  if (related_vinfo != NULL && related_vinfo != stmt_vinfo)
	    {
	      gimple *stmt = STMT_VINFO_STMT (related_vinfo);
	      stmt_worklist.safe_push (stmt);
	      /* Set BB such that the assert in
		 'get_initial_def_for_reduction' is able to determine that
		 the BB of the related stmt is inside this loop.  */
	      gimple_set_bb (stmt,
			     gimple_bb (new_stmt));
	      related_vinfo = STMT_VINFO_RELATED_STMT (related_vinfo);
	      gcc_assert (related_vinfo == NULL
			  || related_vinfo == stmt_vinfo);
	    }
	}
    }

  /* The PATTERN_DEF_SEQs and RELATED_STMTs in the epilogue were constructed
     using the original main loop and thus need to be updated to refer to the
     cloned variables used in the epilogue.  */
  for (unsigned i = 0; i < stmt_worklist.length (); ++i)
    {
      gimple *stmt = stmt_worklist[i];
      tree *new_op;

      for (unsigned j = 1; j < gimple_num_ops (stmt); ++j)
	{
	  tree op = gimple_op (stmt, j);
	  if ((new_op = mapping.get(op)))
	    gimple_set_op (stmt, j, *new_op);
	  else
	    {
	      /* PR92429: The last argument of simplify_replace_tree disables
		 folding when replacing arguments.  This is required as
		 otherwise you might end up with different statements than the
		 ones analyzed in vect_loop_analyze, leading to different
		 vectorization.  */
	      op = simplify_replace_tree (op, NULL_TREE, NULL_TREE,
					  &find_in_mapping, &mapping, false);
	      gimple_set_op (stmt, j, op);
	    }
	}
    }

  struct data_reference *dr;
  vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (epilogue_vinfo);
  FOR_EACH_VEC_ELT (datarefs, i, dr)
    {
      orig_stmt = DR_STMT (dr);
      gcc_assert (gimple_uid (orig_stmt) > 0);
      stmt_vinfo = epilogue_vinfo->stmt_vec_infos[gimple_uid (orig_stmt) - 1];
      /* Data references for gather loads and scatter stores do not use the
	 updated offset we set using ADVANCE.  Instead we have to make sure the
	 reference in the data references point to the corresponding copy of
	 the original in the epilogue.  Make sure to update both
	 gather/scatters recognized by dataref analysis and also other
	 refs that get_load_store_type classified as VMAT_GATHER_SCATTER.  */
      auto vstmt_vinfo = vect_stmt_to_vectorize (stmt_vinfo);
      if (STMT_VINFO_MEMORY_ACCESS_TYPE (vstmt_vinfo) == VMAT_GATHER_SCATTER
	  || STMT_VINFO_GATHER_SCATTER_P (vstmt_vinfo))
	{
	  DR_REF (dr)
	    = simplify_replace_tree (DR_REF (dr), NULL_TREE, NULL_TREE,
				     &find_in_mapping, &mapping);
	  DR_BASE_ADDRESS (dr)
	    = simplify_replace_tree (DR_BASE_ADDRESS (dr), NULL_TREE, NULL_TREE,
				     &find_in_mapping, &mapping);
	}
      DR_STMT (dr) = STMT_VINFO_STMT (stmt_vinfo);
      stmt_vinfo->dr_aux.stmt = stmt_vinfo;
      /* The vector size of the epilogue is smaller than that of the main loop
	 so the alignment is either the same or lower.  This means the dr will
	 thus by definition be aligned.  */
      STMT_VINFO_DR_INFO (stmt_vinfo)->base_misaligned = false;
    }

  epilogue_vinfo->shared->datarefs_copy.release ();
  epilogue_vinfo->shared->save_datarefs ();
}
/* When vectorizing early break statements instructions that happen before
   the early break in the current BB need to be moved to after the early
   break.  This function deals with that and assumes that any validity
   checks has already been performed.

   While moving the instructions if it encounters a VUSE or VDEF it then
   corrects the VUSES as it moves the statements along.  The destination
   block is LOOP_VINFO_EARLY_BRK_DEST_BB.  */

static void
move_early_exit_stmts (loop_vec_info loop_vinfo)
{
  DUMP_VECT_SCOPE ("move_early_exit_stmts");

  if (LOOP_VINFO_EARLY_BRK_STORES (loop_vinfo).is_empty ())
    return;

  /* Move all stmts that need moving.  */
  basic_block dest_bb = LOOP_VINFO_EARLY_BRK_DEST_BB (loop_vinfo);
  gimple_stmt_iterator dest_gsi = gsi_after_labels (dest_bb);

  tree last_seen_vuse = NULL_TREE;
  for (gimple *stmt : LOOP_VINFO_EARLY_BRK_STORES (loop_vinfo))
    {
      /* We have to update crossed degenerate virtual PHIs.  Simply
	 elide them.  */
      if (gphi *vphi = dyn_cast <gphi *> (stmt))
	{
	  tree vdef = gimple_phi_result (vphi);
	  tree vuse = gimple_phi_arg_def (vphi, 0);
	  imm_use_iterator iter;
	  use_operand_p use_p;
	  gimple *use_stmt;
	  /* Redirect all uses of the PHI result to its single incoming
	     virtual operand before removing the PHI.  */
	  FOR_EACH_IMM_USE_STMT (use_stmt, iter, vdef)
	    {
	      FOR_EACH_IMM_USE_ON_STMT (use_p, iter)
		SET_USE (use_p, vuse);
	    }
	  auto gsi = gsi_for_stmt (stmt);
	  remove_phi_node (&gsi, true);
	  last_seen_vuse = vuse;
	  continue;
	}

      /* Check to see if statement is still required for vect or has been
	 elided.  */
      auto stmt_info = loop_vinfo->lookup_stmt (stmt);
      if (!stmt_info)
	continue;

      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location, "moving stmt %G", stmt);

      gimple_stmt_iterator stmt_gsi = gsi_for_stmt (stmt);
      gsi_move_before (&stmt_gsi, &dest_gsi, GSI_NEW_STMT);
      last_seen_vuse = gimple_vuse (stmt);
    }

  /* Update all the stmts with their new reaching VUSES.  */
  for (auto p : LOOP_VINFO_EARLY_BRK_VUSES (loop_vinfo))
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location,
			 "updating vuse to %T for load %G",
			 last_seen_vuse, p);
      gimple_set_vuse (p, last_seen_vuse);
      update_stmt (p);
    }

  /* And update the LC PHIs on exits.  */
  for (edge e : get_loop_exit_edges (LOOP_VINFO_LOOP (loop_vinfo)))
    if (!dominated_by_p (CDI_DOMINATORS, e->src, dest_bb))
      if (gphi *phi = get_virtual_phi (e->dest))
	SET_PHI_ARG_DEF_ON_EDGE (phi, e, last_seen_vuse);
}
11859 /* Function vect_transform_loop.
11861 The analysis phase has determined that the loop is vectorizable.
11862 Vectorize the loop - created vectorized stmts to replace the scalar
11863 stmts in the loop, and update the loop exit condition.
11864 Returns scalar epilogue loop if any. */
11867 vect_transform_loop (loop_vec_info loop_vinfo
, gimple
*loop_vectorized_call
)
11869 class loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
11870 class loop
*epilogue
= NULL
;
11871 basic_block
*bbs
= LOOP_VINFO_BBS (loop_vinfo
);
11872 int nbbs
= loop
->num_nodes
;
11874 tree niters_vector
= NULL_TREE
;
11875 tree step_vector
= NULL_TREE
;
11876 tree niters_vector_mult_vf
= NULL_TREE
;
11877 poly_uint64 vf
= LOOP_VINFO_VECT_FACTOR (loop_vinfo
);
11878 unsigned int lowest_vf
= constant_lower_bound (vf
);
11880 bool check_profitability
= false;
11882 bool flat
= maybe_flat_loop_profile (loop
);
11884 DUMP_VECT_SCOPE ("vec_transform_loop");
11886 loop_vinfo
->shared
->check_datarefs ();
11888 /* Use the more conservative vectorization threshold. If the number
11889 of iterations is constant assume the cost check has been performed
11890 by our caller. If the threshold makes all loops profitable that
11891 run at least the (estimated) vectorization factor number of times
11892 checking is pointless, too. */
11893 th
= LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo
);
11894 if (vect_apply_runtime_profitability_check_p (loop_vinfo
))
11896 if (dump_enabled_p ())
11897 dump_printf_loc (MSG_NOTE
, vect_location
,
11898 "Profitability threshold is %d loop iterations.\n",
11900 check_profitability
= true;
11903 /* Make sure there exists a single-predecessor exit bb. Do this before
11905 edge e
= LOOP_VINFO_IV_EXIT (loop_vinfo
);
11906 if (! single_pred_p (e
->dest
) && !LOOP_VINFO_EARLY_BREAKS (loop_vinfo
))
11908 split_loop_exit_edge (e
, true);
11909 if (dump_enabled_p ())
11910 dump_printf (MSG_NOTE
, "split exit edge\n");
11913 /* Version the loop first, if required, so the profitability check
11916 if (LOOP_REQUIRES_VERSIONING (loop_vinfo
))
11919 = vect_loop_versioning (loop_vinfo
, loop_vectorized_call
);
11920 sloop
->force_vectorize
= false;
11921 check_profitability
= false;
11924 /* Make sure there exists a single-predecessor exit bb also on the
11925 scalar loop copy. Do this after versioning but before peeling
11926 so CFG structure is fine for both scalar and if-converted loop
11927 to make slpeel_duplicate_current_defs_from_edges face matched
11928 loop closed PHI nodes on the exit. */
11929 if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo
))
11931 e
= LOOP_VINFO_SCALAR_IV_EXIT (loop_vinfo
);
11932 if (! single_pred_p (e
->dest
))
11934 split_loop_exit_edge (e
, true);
11935 if (dump_enabled_p ())
11936 dump_printf (MSG_NOTE
, "split exit edge of scalar loop\n");
11940 tree niters
= vect_build_loop_niters (loop_vinfo
);
11941 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo
) = niters
;
11942 tree nitersm1
= unshare_expr (LOOP_VINFO_NITERSM1 (loop_vinfo
));
11943 bool niters_no_overflow
= loop_niters_no_overflow (loop_vinfo
);
11945 drs_init_vec orig_drs_init
;
11947 epilogue
= vect_do_peeling (loop_vinfo
, niters
, nitersm1
, &niters_vector
,
11948 &step_vector
, &niters_vector_mult_vf
, th
,
11949 check_profitability
, niters_no_overflow
,
11951 if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo
)
11952 && LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo
).initialized_p ())
11954 /* Ifcvt duplicates loop preheader, loop body and produces an basic
11955 block after loop exit. We need to scale all that. */
11956 basic_block preheader
11957 = loop_preheader_edge (LOOP_VINFO_SCALAR_LOOP (loop_vinfo
))->src
;
11959 = preheader
->count
.apply_probability
11960 (LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo
));
11961 scale_loop_frequencies (LOOP_VINFO_SCALAR_LOOP (loop_vinfo
),
11962 LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo
));
11963 LOOP_VINFO_SCALAR_IV_EXIT (loop_vinfo
)->dest
->count
= preheader
->count
;
11966 if (niters_vector
== NULL_TREE
)
11968 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo
)
11969 && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo
)
11970 && known_eq (lowest_vf
, vf
))
11973 = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo
)),
11974 LOOP_VINFO_INT_NITERS (loop_vinfo
) / lowest_vf
);
11975 step_vector
= build_one_cst (TREE_TYPE (niters
));
11977 else if (vect_use_loop_mask_for_alignment_p (loop_vinfo
))
11978 vect_gen_vector_loop_niters (loop_vinfo
, niters
, &niters_vector
,
11979 &step_vector
, niters_no_overflow
);
11981 /* vect_do_peeling subtracted the number of peeled prologue
11982 iterations from LOOP_VINFO_NITERS. */
11983 vect_gen_vector_loop_niters (loop_vinfo
, LOOP_VINFO_NITERS (loop_vinfo
),
11984 &niters_vector
, &step_vector
,
11985 niters_no_overflow
);
11988 /* 1) Make sure the loop header has exactly two entries
11989 2) Make sure we have a preheader basic block. */
11991 gcc_assert (EDGE_COUNT (loop
->header
->preds
) == 2);
11993 split_edge (loop_preheader_edge (loop
));
11995 if (vect_use_loop_mask_for_alignment_p (loop_vinfo
))
11996 /* This will deal with any possible peeling. */
11997 vect_prepare_for_masked_peels (loop_vinfo
);
11999 /* Handle any code motion that we need to for early-break vectorization after
12000 we've done peeling but just before we start vectorizing. */
12001 if (LOOP_VINFO_EARLY_BREAKS (loop_vinfo
))
12002 move_early_exit_stmts (loop_vinfo
);
12004 /* Schedule the SLP instances first, then handle loop vectorization
12006 if (!loop_vinfo
->slp_instances
.is_empty ())
12008 DUMP_VECT_SCOPE ("scheduling SLP instances");
12009 vect_schedule_slp (loop_vinfo
, LOOP_VINFO_SLP_INSTANCES (loop_vinfo
));
12012 /* FORNOW: the vectorizer supports only loops which body consist
12013 of one basic block (header + empty latch). When the vectorizer will
12014 support more involved loop forms, the order by which the BBs are
12015 traversed need to be reconsidered. */
12017 for (i
= 0; i
< nbbs
; i
++)
12019 basic_block bb
= bbs
[i
];
12020 stmt_vec_info stmt_info
;
12022 for (gphi_iterator si
= gsi_start_phis (bb
); !gsi_end_p (si
);
12025 gphi
*phi
= si
.phi ();
12026 if (dump_enabled_p ())
12027 dump_printf_loc (MSG_NOTE
, vect_location
,
12028 "------>vectorizing phi: %G", (gimple
*) phi
);
12029 stmt_info
= loop_vinfo
->lookup_stmt (phi
);
12033 if (MAY_HAVE_DEBUG_BIND_STMTS
&& !STMT_VINFO_LIVE_P (stmt_info
))
12034 vect_loop_kill_debug_uses (loop
, stmt_info
);
12036 if (!STMT_VINFO_RELEVANT_P (stmt_info
)
12037 && !STMT_VINFO_LIVE_P (stmt_info
))
12040 if (STMT_VINFO_VECTYPE (stmt_info
)
12042 (TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info
)), vf
))
12043 && dump_enabled_p ())
12044 dump_printf_loc (MSG_NOTE
, vect_location
, "multiple-types.\n");
12046 if ((STMT_VINFO_DEF_TYPE (stmt_info
) == vect_induction_def
12047 || STMT_VINFO_DEF_TYPE (stmt_info
) == vect_reduction_def
12048 || STMT_VINFO_DEF_TYPE (stmt_info
) == vect_double_reduction_def
12049 || STMT_VINFO_DEF_TYPE (stmt_info
) == vect_nested_cycle
12050 || STMT_VINFO_DEF_TYPE (stmt_info
) == vect_first_order_recurrence
12051 || STMT_VINFO_DEF_TYPE (stmt_info
) == vect_internal_def
)
12052 && ! PURE_SLP_STMT (stmt_info
))
12054 if (dump_enabled_p ())
12055 dump_printf_loc (MSG_NOTE
, vect_location
, "transform phi.\n");
12056 vect_transform_stmt (loop_vinfo
, stmt_info
, NULL
, NULL
, NULL
);
12060 for (gphi_iterator si
= gsi_start_phis (bb
); !gsi_end_p (si
);
12063 gphi
*phi
= si
.phi ();
12064 stmt_info
= loop_vinfo
->lookup_stmt (phi
);
12068 if (!STMT_VINFO_RELEVANT_P (stmt_info
)
12069 && !STMT_VINFO_LIVE_P (stmt_info
))
12072 if ((STMT_VINFO_DEF_TYPE (stmt_info
) == vect_induction_def
12073 || STMT_VINFO_DEF_TYPE (stmt_info
) == vect_reduction_def
12074 || STMT_VINFO_DEF_TYPE (stmt_info
) == vect_double_reduction_def
12075 || STMT_VINFO_DEF_TYPE (stmt_info
) == vect_nested_cycle
12076 || STMT_VINFO_DEF_TYPE (stmt_info
) == vect_internal_def
12077 || STMT_VINFO_DEF_TYPE (stmt_info
) == vect_first_order_recurrence
)
12078 && ! PURE_SLP_STMT (stmt_info
))
12079 maybe_set_vectorized_backedge_value (loop_vinfo
, stmt_info
);
12082 for (gimple_stmt_iterator si
= gsi_start_bb (bb
);
12085 stmt
= gsi_stmt (si
);
12086 /* During vectorization remove existing clobber stmts. */
12087 if (gimple_clobber_p (stmt
))
12089 unlink_stmt_vdef (stmt
);
12090 gsi_remove (&si
, true);
12091 release_defs (stmt
);
12095 /* Ignore vector stmts created in the outer loop. */
12096 stmt_info
= loop_vinfo
->lookup_stmt (stmt
);
12098 /* vector stmts created in the outer-loop during vectorization of
12099 stmts in an inner-loop may not have a stmt_info, and do not
12100 need to be vectorized. */
12101 stmt_vec_info seen_store
= NULL
;
12104 if (STMT_VINFO_IN_PATTERN_P (stmt_info
))
12106 gimple
*def_seq
= STMT_VINFO_PATTERN_DEF_SEQ (stmt_info
);
12107 for (gimple_stmt_iterator subsi
= gsi_start (def_seq
);
12108 !gsi_end_p (subsi
); gsi_next (&subsi
))
12110 stmt_vec_info pat_stmt_info
12111 = loop_vinfo
->lookup_stmt (gsi_stmt (subsi
));
12112 vect_transform_loop_stmt (loop_vinfo
, pat_stmt_info
,
12115 stmt_vec_info pat_stmt_info
12116 = STMT_VINFO_RELATED_STMT (stmt_info
);
12117 if (vect_transform_loop_stmt (loop_vinfo
, pat_stmt_info
,
12119 maybe_set_vectorized_backedge_value (loop_vinfo
,
12124 if (vect_transform_loop_stmt (loop_vinfo
, stmt_info
, &si
,
12126 maybe_set_vectorized_backedge_value (loop_vinfo
,
12133 if (STMT_VINFO_GROUPED_ACCESS (seen_store
))
12134 /* Interleaving. If IS_STORE is TRUE, the
12135 vectorization of the interleaving chain was
12136 completed - free all the stores in the chain. */
12137 vect_remove_stores (loop_vinfo
,
12138 DR_GROUP_FIRST_ELEMENT (seen_store
));
12140 /* Free the attached stmt_vec_info and remove the stmt. */
12141 loop_vinfo
->remove_stmt (stmt_info
);
12146 /* Stub out scalar statements that must not survive vectorization.
12147 Doing this here helps with grouped statements, or statements that
12148 are involved in patterns. */
12149 for (gimple_stmt_iterator gsi
= gsi_start_bb (bb
);
12150 !gsi_end_p (gsi
); gsi_next (&gsi
))
12152 gcall
*call
= dyn_cast
<gcall
*> (gsi_stmt (gsi
));
12153 if (!call
|| !gimple_call_internal_p (call
))
12155 internal_fn ifn
= gimple_call_internal_fn (call
);
12156 if (ifn
== IFN_MASK_LOAD
)
12158 tree lhs
= gimple_get_lhs (call
);
12159 if (!VECTOR_TYPE_P (TREE_TYPE (lhs
)))
12161 tree zero
= build_zero_cst (TREE_TYPE (lhs
));
12162 gimple
*new_stmt
= gimple_build_assign (lhs
, zero
);
12163 gsi_replace (&gsi
, new_stmt
, true);
12166 else if (conditional_internal_fn_code (ifn
) != ERROR_MARK
)
12168 tree lhs
= gimple_get_lhs (call
);
12169 if (!VECTOR_TYPE_P (TREE_TYPE (lhs
)))
12172 = gimple_call_arg (call
, gimple_call_num_args (call
) - 1);
12173 gimple
*new_stmt
= gimple_build_assign (lhs
, else_arg
);
12174 gsi_replace (&gsi
, new_stmt
, true);
12178 } /* BBs in loop */
12180 /* The vectorization factor is always > 1, so if we use an IV increment of 1.
12181 a zero NITERS becomes a nonzero NITERS_VECTOR. */
12182 if (integer_onep (step_vector
))
12183 niters_no_overflow
= true;
12184 vect_set_loop_condition (loop
, LOOP_VINFO_IV_EXIT (loop_vinfo
), loop_vinfo
,
12185 niters_vector
, step_vector
, niters_vector_mult_vf
,
12186 !niters_no_overflow
);
12188 unsigned int assumed_vf
= vect_vf_for_cost (loop_vinfo
);
12190 /* True if the final iteration might not handle a full vector's
12191 worth of scalar iterations. */
12192 bool final_iter_may_be_partial
12193 = LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo
)
12194 || LOOP_VINFO_EARLY_BREAKS (loop_vinfo
);
12195 /* The minimum number of iterations performed by the epilogue. This
12196 is 1 when peeling for gaps because we always need a final scalar
12198 int min_epilogue_iters
= LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo
) ? 1 : 0;
12199 /* +1 to convert latch counts to loop iteration counts,
12200 -min_epilogue_iters to remove iterations that cannot be performed
12201 by the vector code. */
12202 int bias_for_lowest
= 1 - min_epilogue_iters
;
12203 int bias_for_assumed
= bias_for_lowest
;
12204 int alignment_npeels
= LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo
);
12205 if (alignment_npeels
&& LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo
))
12207 /* When the amount of peeling is known at compile time, the first
12208 iteration will have exactly alignment_npeels active elements.
12209 In the worst case it will have at least one. */
12210 int min_first_active
= (alignment_npeels
> 0 ? alignment_npeels
: 1);
12211 bias_for_lowest
+= lowest_vf
- min_first_active
;
12212 bias_for_assumed
+= assumed_vf
- min_first_active
;
12214 /* In these calculations the "- 1" converts loop iteration counts
12215 back to latch counts. */
12216 if (loop
->any_upper_bound
)
12218 loop_vec_info main_vinfo
= LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo
);
12219 loop
->nb_iterations_upper_bound
12220 = (final_iter_may_be_partial
12221 ? wi::udiv_ceil (loop
->nb_iterations_upper_bound
+ bias_for_lowest
,
12223 : wi::udiv_floor (loop
->nb_iterations_upper_bound
+ bias_for_lowest
,
12226 /* Both peeling for alignment and peeling for gaps can end up
12227 with the scalar epilogue running for more than VF-1 iterations. */
12228 && !main_vinfo
->peeling_for_alignment
12229 && !main_vinfo
->peeling_for_gaps
)
12231 unsigned int bound
;
12232 poly_uint64 main_iters
12233 = upper_bound (LOOP_VINFO_VECT_FACTOR (main_vinfo
),
12234 LOOP_VINFO_COST_MODEL_THRESHOLD (main_vinfo
));
12236 = upper_bound (main_iters
,
12237 LOOP_VINFO_VERSIONING_THRESHOLD (main_vinfo
));
12238 if (can_div_away_from_zero_p (main_iters
,
12239 LOOP_VINFO_VECT_FACTOR (loop_vinfo
),
12241 loop
->nb_iterations_upper_bound
12242 = wi::umin ((bound_wide_int
) (bound
- 1),
12243 loop
->nb_iterations_upper_bound
);
12246 if (loop
->any_likely_upper_bound
)
12247 loop
->nb_iterations_likely_upper_bound
12248 = (final_iter_may_be_partial
12249 ? wi::udiv_ceil (loop
->nb_iterations_likely_upper_bound
12250 + bias_for_lowest
, lowest_vf
) - 1
12251 : wi::udiv_floor (loop
->nb_iterations_likely_upper_bound
12252 + bias_for_lowest
, lowest_vf
) - 1);
12253 if (loop
->any_estimate
)
12254 loop
->nb_iterations_estimate
12255 = (final_iter_may_be_partial
12256 ? wi::udiv_ceil (loop
->nb_iterations_estimate
+ bias_for_assumed
,
12258 : wi::udiv_floor (loop
->nb_iterations_estimate
+ bias_for_assumed
,
12260 scale_profile_for_vect_loop (loop
, LOOP_VINFO_IV_EXIT (loop_vinfo
),
12263 if (dump_enabled_p ())
12265 if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo
))
12267 dump_printf_loc (MSG_NOTE
, vect_location
,
12268 "LOOP VECTORIZED\n");
12270 dump_printf_loc (MSG_NOTE
, vect_location
,
12271 "OUTER LOOP VECTORIZED\n");
12272 dump_printf (MSG_NOTE
, "\n");
12275 dump_printf_loc (MSG_NOTE
, vect_location
,
12276 "LOOP EPILOGUE VECTORIZED (MODE=%s)\n",
12277 GET_MODE_NAME (loop_vinfo
->vector_mode
));
12280 /* Loops vectorized with a variable factor won't benefit from
12281 unrolling/peeling. */
12282 if (!vf
.is_constant ())
12285 if (dump_enabled_p ())
12286 dump_printf_loc (MSG_NOTE
, vect_location
, "Disabling unrolling due to"
12287 " variable-length vectorization factor\n");
12289 /* Free SLP instances here because otherwise stmt reference counting
12291 slp_instance instance
;
12292 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo
), i
, instance
)
12293 vect_free_slp_instance (instance
);
12294 LOOP_VINFO_SLP_INSTANCES (loop_vinfo
).release ();
12295 /* Clear-up safelen field since its value is invalid after vectorization
12296 since vectorized loop can have loop-carried dependencies. */
12301 update_epilogue_loop_vinfo (epilogue
, advance
);
12303 epilogue
->simduid
= loop
->simduid
;
12304 epilogue
->force_vectorize
= loop
->force_vectorize
;
12305 epilogue
->dont_vectorize
= false;
12311 /* The code below is trying to perform simple optimization - revert
12312 if-conversion for masked stores, i.e. if the mask of a store is zero
12313 do not perform it and all stored value producers also if possible.
12315 for (i=0; i<n; i++)
12321 this transformation will produce the following semi-hammock:
12323 if (!mask__ifc__42.18_165 == { 0, 0, 0, 0, 0, 0, 0, 0 })
12325 vect__11.19_170 = MASK_LOAD (vectp_p1.20_168, 0B, mask__ifc__42.18_165);
12326 vect__12.22_172 = vect__11.19_170 + vect_cst__171;
12327 MASK_STORE (vectp_p1.23_175, 0B, mask__ifc__42.18_165, vect__12.22_172);
12328 vect__18.25_182 = MASK_LOAD (vectp_p3.26_180, 0B, mask__ifc__42.18_165);
12329 vect__19.28_184 = vect__18.25_182 + vect_cst__183;
12330 MASK_STORE (vectp_p2.29_187, 0B, mask__ifc__42.18_165, vect__19.28_184);
12335 optimize_mask_stores (class loop
*loop
)
12337 basic_block
*bbs
= get_loop_body (loop
);
12338 unsigned nbbs
= loop
->num_nodes
;
12341 class loop
*bb_loop
;
12342 gimple_stmt_iterator gsi
;
12344 auto_vec
<gimple
*> worklist
;
12345 auto_purge_vect_location sentinel
;
12347 vect_location
= find_loop_location (loop
);
12348 /* Pick up all masked stores in loop if any. */
12349 for (i
= 0; i
< nbbs
; i
++)
12352 for (gsi
= gsi_start_bb (bb
); !gsi_end_p (gsi
);
12355 stmt
= gsi_stmt (gsi
);
12356 if (gimple_call_internal_p (stmt
, IFN_MASK_STORE
))
12357 worklist
.safe_push (stmt
);
12362 if (worklist
.is_empty ())
12365 /* Loop has masked stores. */
12366 while (!worklist
.is_empty ())
12368 gimple
*last
, *last_store
;
12371 basic_block store_bb
, join_bb
;
12372 gimple_stmt_iterator gsi_to
;
12373 tree vdef
, new_vdef
;
12378 last
= worklist
.pop ();
12379 mask
= gimple_call_arg (last
, 2);
12380 bb
= gimple_bb (last
);
12381 /* Create then_bb and if-then structure in CFG, then_bb belongs to
12382 the same loop as if_bb. It could be different to LOOP when two
12383 level loop-nest is vectorized and mask_store belongs to the inner
12385 e
= split_block (bb
, last
);
12386 bb_loop
= bb
->loop_father
;
12387 gcc_assert (loop
== bb_loop
|| flow_loop_nested_p (loop
, bb_loop
));
12389 store_bb
= create_empty_bb (bb
);
12390 add_bb_to_loop (store_bb
, bb_loop
);
12391 e
->flags
= EDGE_TRUE_VALUE
;
12392 efalse
= make_edge (bb
, store_bb
, EDGE_FALSE_VALUE
);
12393 /* Put STORE_BB to likely part. */
12394 efalse
->probability
= profile_probability::likely ();
12395 e
->probability
= efalse
->probability
.invert ();
12396 store_bb
->count
= efalse
->count ();
12397 make_single_succ_edge (store_bb
, join_bb
, EDGE_FALLTHRU
);
12398 if (dom_info_available_p (CDI_DOMINATORS
))
12399 set_immediate_dominator (CDI_DOMINATORS
, store_bb
, bb
);
12400 if (dump_enabled_p ())
12401 dump_printf_loc (MSG_NOTE
, vect_location
,
12402 "Create new block %d to sink mask stores.",
12404 /* Create vector comparison with boolean result. */
12405 vectype
= TREE_TYPE (mask
);
12406 zero
= build_zero_cst (vectype
);
12407 stmt
= gimple_build_cond (EQ_EXPR
, mask
, zero
, NULL_TREE
, NULL_TREE
);
12408 gsi
= gsi_last_bb (bb
);
12409 gsi_insert_after (&gsi
, stmt
, GSI_SAME_STMT
);
12410 /* Create new PHI node for vdef of the last masked store:
12411 .MEM_2 = VDEF <.MEM_1>
12412 will be converted to
12413 .MEM.3 = VDEF <.MEM_1>
12414 and new PHI node will be created in join bb
12415 .MEM_2 = PHI <.MEM_1, .MEM_3>
12417 vdef
= gimple_vdef (last
);
12418 new_vdef
= make_ssa_name (gimple_vop (cfun
), last
);
12419 gimple_set_vdef (last
, new_vdef
);
12420 phi
= create_phi_node (vdef
, join_bb
);
12421 add_phi_arg (phi
, new_vdef
, EDGE_SUCC (store_bb
, 0), UNKNOWN_LOCATION
);
12423 /* Put all masked stores with the same mask to STORE_BB if possible. */
12426 gimple_stmt_iterator gsi_from
;
12427 gimple
*stmt1
= NULL
;
12429 /* Move masked store to STORE_BB. */
12431 gsi
= gsi_for_stmt (last
);
12433 /* Shift GSI to the previous stmt for further traversal. */
12435 gsi_to
= gsi_start_bb (store_bb
);
12436 gsi_move_before (&gsi_from
, &gsi_to
);
12437 /* Setup GSI_TO to the non-empty block start. */
12438 gsi_to
= gsi_start_bb (store_bb
);
12439 if (dump_enabled_p ())
12440 dump_printf_loc (MSG_NOTE
, vect_location
,
12441 "Move stmt to created bb\n%G", last
);
12442 /* Move all stored value producers if possible. */
12443 while (!gsi_end_p (gsi
))
12446 imm_use_iterator imm_iter
;
12447 use_operand_p use_p
;
12450 /* Skip debug statements. */
12451 if (is_gimple_debug (gsi_stmt (gsi
)))
12456 stmt1
= gsi_stmt (gsi
);
12457 /* Do not consider statements writing to memory or having
12458 volatile operand. */
12459 if (gimple_vdef (stmt1
)
12460 || gimple_has_volatile_ops (stmt1
))
12464 lhs
= gimple_get_lhs (stmt1
);
12468 /* LHS of vectorized stmt must be SSA_NAME. */
12469 if (TREE_CODE (lhs
) != SSA_NAME
)
12472 if (!VECTOR_TYPE_P (TREE_TYPE (lhs
)))
12474 /* Remove dead scalar statement. */
12475 if (has_zero_uses (lhs
))
12477 gsi_remove (&gsi_from
, true);
12482 /* Check that LHS does not have uses outside of STORE_BB. */
12484 FOR_EACH_IMM_USE_FAST (use_p
, imm_iter
, lhs
)
12487 use_stmt
= USE_STMT (use_p
);
12488 if (is_gimple_debug (use_stmt
))
12490 if (gimple_bb (use_stmt
) != store_bb
)
12499 if (gimple_vuse (stmt1
)
12500 && gimple_vuse (stmt1
) != gimple_vuse (last_store
))
12503 /* Can move STMT1 to STORE_BB. */
12504 if (dump_enabled_p ())
12505 dump_printf_loc (MSG_NOTE
, vect_location
,
12506 "Move stmt to created bb\n%G", stmt1
);
12507 gsi_move_before (&gsi_from
, &gsi_to
);
12508 /* Shift GSI_TO for further insertion. */
12509 gsi_prev (&gsi_to
);
12511 /* Put other masked stores with the same mask to STORE_BB. */
12512 if (worklist
.is_empty ()
12513 || gimple_call_arg (worklist
.last (), 2) != mask
12514 || worklist
.last () != stmt1
)
12516 last
= worklist
.pop ();
12518 add_phi_arg (phi
, gimple_vuse (last_store
), e
, UNKNOWN_LOCATION
);
12522 /* Decide whether it is possible to use a zero-based induction variable
12523 when vectorizing LOOP_VINFO with partial vectors. If it is, return
12524 the value that the induction variable must be able to hold in order
12525 to ensure that the rgroups eventually have no active vector elements.
12526 Return -1 otherwise. */
12529 vect_iv_limit_for_partial_vectors (loop_vec_info loop_vinfo
)
12531 tree niters_skip
= LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo
);
12532 class loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
12533 unsigned HOST_WIDE_INT max_vf
= vect_max_vf (loop_vinfo
);
12535 /* Calculate the value that the induction variable must be able
12536 to hit in order to ensure that we end the loop with an all-false mask.
12537 This involves adding the maximum number of inactive trailing scalar
12539 widest_int iv_limit
= -1;
12540 if (max_loop_iterations (loop
, &iv_limit
))
12544 /* Add the maximum number of skipped iterations to the
12545 maximum iteration count. */
12546 if (TREE_CODE (niters_skip
) == INTEGER_CST
)
12547 iv_limit
+= wi::to_widest (niters_skip
);
12549 iv_limit
+= max_vf
- 1;
12551 else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo
))
12552 /* Make a conservatively-correct assumption. */
12553 iv_limit
+= max_vf
- 1;
12555 /* IV_LIMIT is the maximum number of latch iterations, which is also
12556 the maximum in-range IV value. Round this value down to the previous
12557 vector alignment boundary and then add an extra full iteration. */
12558 poly_uint64 vf
= LOOP_VINFO_VECT_FACTOR (loop_vinfo
);
12559 iv_limit
= (iv_limit
& -(int) known_alignment (vf
)) + max_vf
;
12564 /* For the given rgroup_controls RGC, check whether an induction variable
12565 would ever hit a value that produces a set of all-false masks or zero
12566 lengths before wrapping around. Return true if it's possible to wrap
12567 around before hitting the desirable value, otherwise return false. */
12570 vect_rgroup_iv_might_wrap_p (loop_vec_info loop_vinfo
, rgroup_controls
*rgc
)
12572 widest_int iv_limit
= vect_iv_limit_for_partial_vectors (loop_vinfo
);
12574 if (iv_limit
== -1)
12577 tree compare_type
= LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo
);
12578 unsigned int compare_precision
= TYPE_PRECISION (compare_type
);
12579 unsigned nitems
= rgc
->max_nscalars_per_iter
* rgc
->factor
;
12581 if (wi::min_precision (iv_limit
* nitems
, UNSIGNED
) > compare_precision
)