/* Loop Vectorization
   Copyright (C) 2003-2022 Free Software Foundation, Inc.
   Contributed by Dorit Naishlos <dorit@il.ibm.com> and
   Ira Rosen <irar@il.ibm.com>

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free
Software Foundation; either version 3, or (at your option) any later
version.

GCC is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received a copy of the GNU General Public License
along with GCC; see the file COPYING3.  If not see
<http://www.gnu.org/licenses/>.  */

#define INCLUDE_ALGORITHM
#include "coretypes.h"
#include "tree-pass.h"
#include "optabs-tree.h"
#include "diagnostic-core.h"
#include "fold-const.h"
#include "stor-layout.h"
#include "gimple-iterator.h"
#include "gimplify-me.h"
#include "tree-ssa-loop-ivopts.h"
#include "tree-ssa-loop-manip.h"
#include "tree-ssa-loop-niter.h"
#include "tree-ssa-loop.h"
#include "tree-scalar-evolution.h"
#include "tree-vectorizer.h"
#include "gimple-fold.h"
#include "tree-if-conv.h"
#include "internal-fn.h"
#include "tree-vector-builder.h"
#include "vec-perm-indices.h"
#include "case-cfn-macros.h"
/* Loop Vectorization Pass.

   This pass tries to vectorize loops.

   For example, the vectorizer transforms the following simple loop:

	short a[N]; short b[N]; short c[N]; int i;

	for (i=0; i<N; i++){
	  a[i] = b[i] + c[i];
	}

   as if it had been manually vectorized by rewriting the source code into:

	typedef int __attribute__((mode(V8HI))) v8hi;
	short a[N]; short b[N]; short c[N]; int i;
	v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
	v8hi va, vb, vc;

	for (i=0; i<N/8; i++){
	  vb = pb[i];
	  vc = pc[i];
	  va = vb + vc;
	  pa[i] = va;
	}
   The main entry to this pass is vectorize_loops(), in which
   the vectorizer applies a set of analyses on a given set of loops,
   followed by the actual vectorization transformation for the loops that
   had successfully passed the analysis phase.
   Throughout this pass we make a distinction between two types of
   data: scalars (which are represented by SSA_NAMEs), and memory references
   ("data-refs").  These two types of data require different handling both
   during analysis and transformation.  The types of data-refs that the
   vectorizer currently supports are ARRAY_REFs whose base is an array DECL
   (not a pointer), and INDIRECT_REFs through pointers; both array and pointer
   accesses are required to have a simple (consecutive) access pattern.
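   For instance (an illustrative example only, not an exhaustive list),
   the access a[i] in

	for (i=0; i<N; i++)
	  sum += a[i];

   is a consecutive (stride-1) data-ref and matches this model, whereas an
   access such as a[2*i] advances by more than one element per iteration
   and does not fit the simple consecutive pattern described here.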
   Analysis phase:
   ===============
   The driver for the analysis phase is vect_analyze_loop().
   It applies a set of analyses, some of which rely on the scalar evolution
   analyzer (scev) developed by Sebastian Pop.

   During the analysis phase the vectorizer records some information
   per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
   loop, as well as general information about the loop as a whole, which is
   recorded in a "loop_vec_info" struct attached to each loop.
   Transformation phase:
   =====================
   The loop transformation phase scans all the stmts in the loop, and
   creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
   the loop that needs to be vectorized.  It inserts the vector code sequence
   just before the scalar stmt S, and records a pointer to the vector code
   in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
   attached to S).  This pointer will be used for the vectorization of
   following stmts which use the def of stmt S.  Stmt S is removed if it
   writes to memory; otherwise, we rely on dead code elimination for
   removing it.
   For example, say stmt S1 was vectorized into stmt VS1:

	VS1: vb = px[i];
	S1:  b = x[i];	STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
	S2:  a = b;

   To vectorize stmt S2, the vectorizer first finds the stmt that defines
   the operand 'b' (S1), and gets the relevant vector def 'vb' from the
   vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)).  The
   resulting sequence would be:

	VS1: vb = px[i];
	S1:  b = x[i];	STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
	VS2: va = vb;
	S2:  a = b;	STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2

   Operands that are not SSA_NAMEs are data-refs that appear in
   load/store operations (like 'x[i]' in S1), and are handled differently.
   Target modeling:
   =================
   Currently the only target specific information that is used is the
   size of the vector (in bytes) - "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".
   Targets that can support different sizes of vectors will, for now, need
   to specify one value for "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".  More
   flexibility will be added in the future.

   Since we only vectorize operations whose vector form can be
   expressed using existing tree codes, to verify that an operation is
   supported, the vectorizer checks the relevant optab at the relevant
   machine_mode (e.g., optab_handler (add_optab, V8HImode)).  If
   the value found is CODE_FOR_nothing, then there's no target support, and
   we can't vectorize the stmt.
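   As a minimal illustration (a sketch, not the pass's actual control flow),
   such a support check amounts to:

	if (optab_handler (add_optab, V8HImode) == CODE_FOR_nothing)
	  /* No V8HImode vector add on this target; give up on this stmt.  */
	  return false;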
   For additional information on this project see:
   http://gcc.gnu.org/projects/tree-ssa/vectorization.html
*/
static void vect_estimate_min_profitable_iters (loop_vec_info, int *, int *,
						 unsigned *);
static stmt_vec_info vect_is_simple_reduction (loop_vec_info, stmt_vec_info,
					       bool *, bool *);
/* Subroutine of vect_determine_vf_for_stmt that handles only one
   statement.  VECTYPE_MAYBE_SET_P is true if STMT_VINFO_VECTYPE
   may already be set for general statements (not just data refs).  */

static opt_result
vect_determine_vf_for_stmt_1 (vec_info *vinfo, stmt_vec_info stmt_info,
			      bool vectype_maybe_set_p,
			      poly_uint64 *vf)
{
  gimple *stmt = stmt_info->stmt;

  if ((!STMT_VINFO_RELEVANT_P (stmt_info)
       && !STMT_VINFO_LIVE_P (stmt_info))
      || gimple_clobber_p (stmt))
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location, "skip.\n");
      return opt_result::success ();
    }

  tree stmt_vectype, nunits_vectype;
  opt_result res = vect_get_vector_types_for_stmt (vinfo, stmt_info,
						    &stmt_vectype,
						    &nunits_vectype);
  if (!res)
    return res;

  if (stmt_vectype)
    {
      if (STMT_VINFO_VECTYPE (stmt_info))
	/* The only case when a vectype had been already set is for stmts
	   that contain a data ref, or for "pattern-stmts" (stmts generated
	   by the vectorizer to represent/replace a certain idiom).  */
	gcc_assert ((STMT_VINFO_DATA_REF (stmt_info)
		     || vectype_maybe_set_p)
		    && STMT_VINFO_VECTYPE (stmt_info) == stmt_vectype);
      else
	STMT_VINFO_VECTYPE (stmt_info) = stmt_vectype;
    }

  if (nunits_vectype)
    vect_update_max_nunits (vf, nunits_vectype);

  return opt_result::success ();
}
/* Subroutine of vect_determine_vectorization_factor.  Set the vector
   types of STMT_INFO and all attached pattern statements and update
   the vectorization factor VF accordingly.  Return true on success
   or false if something prevented vectorization.  */

static opt_result
vect_determine_vf_for_stmt (vec_info *vinfo,
			    stmt_vec_info stmt_info, poly_uint64 *vf)
{
  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location, "==> examining statement: %G",
		     stmt_info->stmt);
  opt_result res = vect_determine_vf_for_stmt_1 (vinfo, stmt_info, false, vf);
  if (!res)
    return res;

  if (STMT_VINFO_IN_PATTERN_P (stmt_info)
      && STMT_VINFO_RELATED_STMT (stmt_info))
    {
      gimple *pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
      stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);

      /* If a pattern statement has def stmts, analyze them too.  */
      for (gimple_stmt_iterator si = gsi_start (pattern_def_seq);
	   !gsi_end_p (si); gsi_next (&si))
	{
	  stmt_vec_info def_stmt_info = vinfo->lookup_stmt (gsi_stmt (si));
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_NOTE, vect_location,
			     "==> examining pattern def stmt: %G",
			     def_stmt_info->stmt);
	  res = vect_determine_vf_for_stmt_1 (vinfo, def_stmt_info, true, vf);
	  if (!res)
	    return res;
	}

      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location,
			 "==> examining pattern statement: %G",
			 stmt_info->stmt);
      res = vect_determine_vf_for_stmt_1 (vinfo, stmt_info, true, vf);
      if (!res)
	return res;
    }

  return opt_result::success ();
}
/* Function vect_determine_vectorization_factor

   Determine the vectorization factor (VF).  VF is the number of data elements
   that are operated upon in parallel in a single iteration of the vectorized
   loop.  For example, when vectorizing a loop that operates on 4byte elements,
   on a target with vector size (VS) 16byte, the VF is set to 4, since 4
   elements can fit in a single vector register.

   We currently support vectorization of loops in which all types operated upon
   are of the same size.  Therefore this function currently sets VF according to
   the size of the types operated upon, and fails if there are multiple sizes
   in the loop.

   VF is also the factor by which the loop iterations are strip-mined, e.g.:
   original loop:
	for (i=0; i<N; i++){
	  a[i] = b[i] + c[i];
	}

   vectorized loop:
	for (i=0; i<N; i+=VF){
	  a[i:VF] = b[i:VF] + c[i:VF];
	}
*/
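/* Illustrative note (a sketch, not this function's job): when N is not a
   multiple of VF, the remaining N % VF iterations cannot form a full vector
   iteration.  Conceptually they are handled by a scalar epilogue:

	for (i = 0; i + VF <= N; i += VF)
	  a[i:VF] = b[i:VF] + c[i:VF];
	for (; i < N; i++)
	  a[i] = b[i] + c[i];

   The pass may instead use peeling or partially-populated vectors for this
   remainder; see the PEELING_FOR_NITER and partial-vector handling later in
   this file.  */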
static opt_result
vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
{
  class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
  unsigned nbbs = loop->num_nodes;
  poly_uint64 vectorization_factor = 1;
  tree scalar_type = NULL_TREE;
  gphi *phi;
  tree vectype;
  stmt_vec_info stmt_info;
  unsigned i;

  DUMP_VECT_SCOPE ("vect_determine_vectorization_factor");

  for (i = 0; i < nbbs; i++)
    {
      basic_block bb = bbs[i];

      for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
	   gsi_next (&si))
	{
	  phi = si.phi ();
	  stmt_info = loop_vinfo->lookup_stmt (phi);
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_NOTE, vect_location, "==> examining phi: %G",
			     (gimple *) phi);

	  gcc_assert (stmt_info);

	  if (STMT_VINFO_RELEVANT_P (stmt_info)
	      || STMT_VINFO_LIVE_P (stmt_info))
	    {
	      gcc_assert (!STMT_VINFO_VECTYPE (stmt_info));
	      scalar_type = TREE_TYPE (PHI_RESULT (phi));

	      if (dump_enabled_p ())
		dump_printf_loc (MSG_NOTE, vect_location,
				 "get vectype for scalar type: %T\n",
				 scalar_type);

	      vectype = get_vectype_for_scalar_type (loop_vinfo, scalar_type);
	      if (!vectype)
		return opt_result::failure_at (phi,
					       "not vectorized: unsupported "
					       "data-type %T\n",
					       scalar_type);
	      STMT_VINFO_VECTYPE (stmt_info) = vectype;

	      if (dump_enabled_p ())
		dump_printf_loc (MSG_NOTE, vect_location, "vectype: %T\n",
				 vectype);

	      if (dump_enabled_p ())
		{
		  dump_printf_loc (MSG_NOTE, vect_location, "nunits = ");
		  dump_dec (MSG_NOTE, TYPE_VECTOR_SUBPARTS (vectype));
		  dump_printf (MSG_NOTE, "\n");
		}

	      vect_update_max_nunits (&vectorization_factor, vectype);
	    }
	}

      for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
	   gsi_next (&si))
	{
	  if (is_gimple_debug (gsi_stmt (si)))
	    continue;
	  stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
	  opt_result res
	    = vect_determine_vf_for_stmt (loop_vinfo,
					  stmt_info, &vectorization_factor);
	  if (!res)
	    return res;
	}
    }

  /* TODO: Analyze cost.  Decide if worth while to vectorize.  */
  if (dump_enabled_p ())
    {
      dump_printf_loc (MSG_NOTE, vect_location, "vectorization factor = ");
      dump_dec (MSG_NOTE, vectorization_factor);
      dump_printf (MSG_NOTE, "\n");
    }

  if (known_le (vectorization_factor, 1U))
    return opt_result::failure_at (vect_location,
				   "not vectorized: unsupported data-type\n");
  LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
  return opt_result::success ();
}
/* Function vect_is_simple_iv_evolution.

   FORNOW: A simple evolution of an induction variable in the loop is
   considered a polynomial evolution.  */
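/* For example (illustrative only): for the IV of "for (i = 0; i < n; i++)"
   scev reports the access function {0, +, 1}_1 - initial value 0, step 1 -
   which is a simple evolution.  A chrec whose step is itself a chrec, such
   as {0, +, {1, +, 1}_1}_1, has degree >= 2 and is rejected below.  */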
static bool
vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree *init,
			     tree *step)
{
  tree init_expr;
  tree step_expr;
  tree evolution_part = evolution_part_in_loop_num (access_fn, loop_nb);
  basic_block bb;

  /* When there is no evolution in this loop, the evolution function
     is not "simple".  */
  if (evolution_part == NULL_TREE)
    return false;

  /* When the evolution is a polynomial of degree >= 2
     the evolution function is not "simple".  */
  if (tree_is_chrec (evolution_part))
    return false;

  step_expr = evolution_part;
  init_expr = unshare_expr (initial_condition_in_loop_num (access_fn, loop_nb));

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location, "step: %T,  init: %T\n",
		     step_expr, init_expr);

  *init = init_expr;
  *step = step_expr;

  if (TREE_CODE (step_expr) != INTEGER_CST
      && (TREE_CODE (step_expr) != SSA_NAME
	  || ((bb = gimple_bb (SSA_NAME_DEF_STMT (step_expr)))
	      && flow_bb_inside_loop_p (get_loop (cfun, loop_nb), bb))
	  || (!INTEGRAL_TYPE_P (TREE_TYPE (step_expr))
	      && (!SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr))
		  || !flag_associative_math)))
      && (TREE_CODE (step_expr) != REAL_CST
	  || !flag_associative_math))
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			 "step unknown.\n");
      return false;
    }

  return true;
}
/* Return true if PHI, described by STMT_INFO, is the inner PHI in
   what we are assuming is a double reduction.  For example, given
   a structure like this:

      outer1:
	x_1 = PHI <x_4(outer2), ...>;
	...

      inner:
	x_2 = PHI <x_1(outer1), ...>;
	...
	x_3 = ...;

      outer2:
	x_4 = PHI <x_3(inner)>;
	...

   outer loop analysis would treat x_1 as a double reduction phi and
   this function would then return true for x_2.  */
static bool
vect_inner_phi_in_double_reduction_p (loop_vec_info loop_vinfo, gphi *phi)
{
  use_operand_p use_p;
  ssa_op_iter op_iter;
  FOR_EACH_PHI_ARG (use_p, phi, op_iter, SSA_OP_USE)
    if (stmt_vec_info def_info = loop_vinfo->lookup_def (USE_FROM_PTR (use_p)))
      if (STMT_VINFO_DEF_TYPE (def_info) == vect_double_reduction_def)
	return true;
  return false;
}
/* Function vect_analyze_scalar_cycles_1.

   Examine the cross iteration def-use cycles of scalar variables
   in LOOP.  LOOP_VINFO represents the loop that is now being
   considered for vectorization (can be LOOP, or an outer-loop
   enclosing LOOP).  */
static void
vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, class loop *loop)
{
  basic_block bb = loop->header;
  tree init, step;
  auto_vec<stmt_vec_info, 64> worklist;
  gphi_iterator gsi;
  bool double_reduc, reduc_chain;

  DUMP_VECT_SCOPE ("vect_analyze_scalar_cycles");

  /* First - identify all inductions.  Reduction detection assumes that all the
     inductions have been identified, therefore, this order must not be
     changed.  */
  for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
    {
      gphi *phi = gsi.phi ();
      tree access_fn = NULL;
      tree def = PHI_RESULT (phi);
      stmt_vec_info stmt_vinfo = loop_vinfo->lookup_stmt (phi);

      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G", phi);

      /* Skip virtual phi's.  The data dependences that are associated with
	 virtual defs/uses (i.e., memory accesses) are analyzed elsewhere.  */
      if (virtual_operand_p (def))
	continue;

      STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_unknown_def_type;

      /* Analyze the evolution function.  */
      access_fn = analyze_scalar_evolution (loop, def);
      if (access_fn)
	{
	  STRIP_NOPS (access_fn);
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_NOTE, vect_location,
			     "Access function of PHI: %T\n", access_fn);
	  STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
	    = initial_condition_in_loop_num (access_fn, loop->num);
	  STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo)
	    = evolution_part_in_loop_num (access_fn, loop->num);
	}

      if (!access_fn
	  || vect_inner_phi_in_double_reduction_p (loop_vinfo, phi)
	  || !vect_is_simple_iv_evolution (loop->num, access_fn, &init, &step)
	  || (LOOP_VINFO_LOOP (loop_vinfo) != loop
	      && TREE_CODE (step) != INTEGER_CST))
	{
	  worklist.safe_push (stmt_vinfo);
	  continue;
	}

      gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
		  != NULL_TREE);
      gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo) != NULL_TREE);

      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location, "Detected induction.\n");
      STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_induction_def;
    }


  /* Second - identify all reductions and nested cycles.  */
  while (worklist.length () > 0)
    {
      stmt_vec_info stmt_vinfo = worklist.pop ();
      gphi *phi = as_a <gphi *> (stmt_vinfo->stmt);
      tree def = PHI_RESULT (phi);

      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G", phi);

      gcc_assert (!virtual_operand_p (def)
		  && STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_unknown_def_type);

      stmt_vec_info reduc_stmt_info
	= vect_is_simple_reduction (loop_vinfo, stmt_vinfo, &double_reduc,
				    &reduc_chain);
      if (reduc_stmt_info)
	{
	  STMT_VINFO_REDUC_DEF (stmt_vinfo) = reduc_stmt_info;
	  STMT_VINFO_REDUC_DEF (reduc_stmt_info) = stmt_vinfo;
	  if (double_reduc)
	    {
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_NOTE, vect_location,
				 "Detected double reduction.\n");

	      STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_double_reduction_def;
	      STMT_VINFO_DEF_TYPE (reduc_stmt_info)
		= vect_double_reduction_def;
	    }
	  else if (loop != LOOP_VINFO_LOOP (loop_vinfo))
	    {
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_NOTE, vect_location,
				 "Detected vectorizable nested cycle.\n");

	      STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_nested_cycle;
	    }
	  else
	    {
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_NOTE, vect_location,
				 "Detected reduction.\n");

	      STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def;
	      STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_reduction_def;
	      /* Store the reduction cycles for possible vectorization in
		 loop-aware SLP if it was not detected as reduction
		 chain.  */
	      if (! reduc_chain)
		LOOP_VINFO_REDUCTIONS (loop_vinfo).safe_push
		  (reduc_stmt_info);
	    }
	}
      else
	if (dump_enabled_p ())
	  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			   "Unknown def-use cycle pattern.\n");
    }
}
/* Function vect_analyze_scalar_cycles.

   Examine the cross iteration def-use cycles of scalar variables, by
   analyzing the loop-header PHIs of scalar variables.  Classify each
   cycle as one of the following: invariant, induction, reduction, unknown.
   We do that for the loop represented by LOOP_VINFO, and also to its
   inner-loop, if it exists.

   Examples of scalar cycles:

   Example1: reduction:

	      loop1:
	      for (i=0; i<N; i++)
		 sum += a[i];

   Example2: induction:

	      loop2:
	      for (i=0; i<N; i++)
		 a[i] = i;  */
static void
vect_analyze_scalar_cycles (loop_vec_info loop_vinfo)
{
  class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);

  vect_analyze_scalar_cycles_1 (loop_vinfo, loop);

  /* When vectorizing an outer-loop, the inner-loop is executed sequentially.
     Reductions in such inner-loop therefore have different properties than
     the reductions in the nest that gets vectorized:
     1. When vectorized, they are executed in the same order as in the original
	scalar loop, so we can't change the order of computation when
	vectorizing them.
     2. FIXME: Inner-loop reductions can be used in the inner-loop, so the
	current checks are too strict.  */

  if (loop->inner)
    vect_analyze_scalar_cycles_1 (loop_vinfo, loop->inner);
}
/* Transfer group and reduction information from STMT_INFO to its
   pattern stmt.  */

static void
vect_fixup_reduc_chain (stmt_vec_info stmt_info)
{
  stmt_vec_info firstp = STMT_VINFO_RELATED_STMT (stmt_info);
  stmt_vec_info stmtp;
  gcc_assert (!REDUC_GROUP_FIRST_ELEMENT (firstp)
	      && REDUC_GROUP_FIRST_ELEMENT (stmt_info));
  REDUC_GROUP_SIZE (firstp) = REDUC_GROUP_SIZE (stmt_info);
  do
    {
      stmtp = STMT_VINFO_RELATED_STMT (stmt_info);
      gcc_checking_assert (STMT_VINFO_DEF_TYPE (stmtp)
			   == STMT_VINFO_DEF_TYPE (stmt_info));
      REDUC_GROUP_FIRST_ELEMENT (stmtp) = firstp;
      stmt_info = REDUC_GROUP_NEXT_ELEMENT (stmt_info);
      if (stmt_info)
	REDUC_GROUP_NEXT_ELEMENT (stmtp)
	  = STMT_VINFO_RELATED_STMT (stmt_info);
    }
  while (stmt_info);
}
/* Fixup scalar cycles that now have their stmts detected as patterns.  */

static void
vect_fixup_scalar_cycles_with_patterns (loop_vec_info loop_vinfo)
{
  stmt_vec_info first;
  unsigned i;

  FOR_EACH_VEC_ELT (LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo), i, first)
    {
      stmt_vec_info next = REDUC_GROUP_NEXT_ELEMENT (first);
      while (next)
	{
	  if ((STMT_VINFO_IN_PATTERN_P (next)
	       != STMT_VINFO_IN_PATTERN_P (first))
	      || STMT_VINFO_REDUC_IDX (vect_stmt_to_vectorize (next)) == -1)
	    break;
	  next = REDUC_GROUP_NEXT_ELEMENT (next);
	}
      /* If all reduction chain members are well-formed patterns adjust
	 the group to group the pattern stmts instead.  */
      if (! next
	  && STMT_VINFO_REDUC_IDX (vect_stmt_to_vectorize (first)) != -1)
	{
	  if (STMT_VINFO_IN_PATTERN_P (first))
	    {
	      vect_fixup_reduc_chain (first);
	      LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo)[i]
		= STMT_VINFO_RELATED_STMT (first);
	    }
	}
      /* If not all stmts in the chain are patterns or if we failed
	 to update STMT_VINFO_REDUC_IDX dissolve the chain and handle
	 it as regular reduction instead.  */
      else
	{
	  stmt_vec_info vinfo = first;
	  stmt_vec_info last = NULL;
	  while (vinfo)
	    {
	      next = REDUC_GROUP_NEXT_ELEMENT (vinfo);
	      REDUC_GROUP_FIRST_ELEMENT (vinfo) = NULL;
	      REDUC_GROUP_NEXT_ELEMENT (vinfo) = NULL;
	      last = vinfo;
	      vinfo = next;
	    }
	  STMT_VINFO_DEF_TYPE (vect_stmt_to_vectorize (first))
	    = vect_internal_def;
	  loop_vinfo->reductions.safe_push (vect_stmt_to_vectorize (last));
	  LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).unordered_remove (i);
	  --i;
	}
    }
}
/* Function vect_get_loop_niters.

   Determine how many iterations the loop is executed and place it
   in NUMBER_OF_ITERATIONS.  Place the number of latch iterations
   in NUMBER_OF_ITERATIONSM1.  Place the condition under which the
   niter information holds in ASSUMPTIONS.

   Return the loop exit condition.  */
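/* Illustrative example (not part of the interface): for a loop of the form
   "for (i = 0; i < n; i++)" with n >= 1, the latch executes n - 1 times, so
   NUMBER_OF_ITERATIONSM1 is n - 1 while NUMBER_OF_ITERATIONS (the number of
   loop header executions) is n.  */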
static gcond *
vect_get_loop_niters (class loop *loop, tree *assumptions,
		      tree *number_of_iterations, tree *number_of_iterationsm1)
{
  edge exit = single_exit (loop);
  class tree_niter_desc niter_desc;
  tree niter_assumptions, niter, may_be_zero;
  gcond *cond = get_loop_exit_condition (loop);

  *assumptions = boolean_true_node;
  *number_of_iterationsm1 = chrec_dont_know;
  *number_of_iterations = chrec_dont_know;
  DUMP_VECT_SCOPE ("get_loop_niters");

  if (!exit)
    return cond;

  may_be_zero = NULL_TREE;
  if (!number_of_iterations_exit_assumptions (loop, exit, &niter_desc, NULL)
      || chrec_contains_undetermined (niter_desc.niter))
    return cond;

  niter_assumptions = niter_desc.assumptions;
  may_be_zero = niter_desc.may_be_zero;
  niter = niter_desc.niter;

  if (may_be_zero && integer_zerop (may_be_zero))
    may_be_zero = NULL_TREE;

  if (may_be_zero)
    {
      if (COMPARISON_CLASS_P (may_be_zero))
	{
	  /* Try to combine may_be_zero with assumptions, this can simplify
	     computation of niter expression.  */
	  if (niter_assumptions && !integer_nonzerop (niter_assumptions))
	    niter_assumptions
	      = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
			     niter_assumptions,
			     fold_build1 (TRUTH_NOT_EXPR, boolean_type_node,
					  may_be_zero));
	  else
	    niter = fold_build3 (COND_EXPR, TREE_TYPE (niter), may_be_zero,
				 build_int_cst (TREE_TYPE (niter), 0),
				 rewrite_to_non_trapping_overflow (niter));

	  may_be_zero = NULL_TREE;
	}
      else if (integer_nonzerop (may_be_zero))
	{
	  *number_of_iterationsm1 = build_int_cst (TREE_TYPE (niter), 0);
	  *number_of_iterations = build_int_cst (TREE_TYPE (niter), 1);
	  return cond;
	}
      else
	return cond;
    }

  *assumptions = niter_assumptions;
  *number_of_iterationsm1 = niter;

  /* We want the number of loop header executions which is the number
     of latch executions plus one.
     ???  For UINT_MAX latch executions this number overflows to zero
     for loops like do { n++; } while (n != 0);  */
  if (niter && !chrec_contains_undetermined (niter))
    niter = fold_build2 (PLUS_EXPR, TREE_TYPE (niter), unshare_expr (niter),
			 build_int_cst (TREE_TYPE (niter), 1));
  *number_of_iterations = niter;

  return cond;
}
/* Function bb_in_loop_p

   Used as predicate for dfs order traversal of the loop bbs.  */

static bool
bb_in_loop_p (const_basic_block bb, const void *data)
{
  const class loop *const loop = (const class loop *)data;
  if (flow_bb_inside_loop_p (loop, bb))
    return true;
  return false;
}
/* Create and initialize a new loop_vec_info struct for LOOP_IN, as well as
   stmt_vec_info structs for all the stmts in LOOP_IN.  */

_loop_vec_info::_loop_vec_info (class loop *loop_in, vec_info_shared *shared)
  : vec_info (vec_info::loop, shared),
    loop (loop_in),
    bbs (XCNEWVEC (basic_block, loop->num_nodes)),
    num_itersm1 (NULL_TREE),
    num_iters (NULL_TREE),
    num_iters_unchanged (NULL_TREE),
    num_iters_assumptions (NULL_TREE),
    vector_costs (nullptr),
    scalar_costs (nullptr),
    versioning_threshold (0),
    vectorization_factor (0),
    main_loop_edge (nullptr),
    skip_main_loop_edge (nullptr),
    skip_this_loop_edge (nullptr),
    reusable_accumulators (),
    suggested_unroll_factor (1),
    max_vectorization_factor (0),
    mask_skip_niters (NULL_TREE),
    rgroup_compare_type (NULL_TREE),
    simd_if_cond (NULL_TREE),
    peeling_for_alignment (0),
    slp_unrolling_factor (1),
    inner_loop_cost_factor (param_vect_inner_loop_cost_factor),
    vectorizable (false),
    can_use_partial_vectors_p (param_vect_partial_vector_usage != 0),
    using_partial_vectors_p (false),
    epil_using_partial_vectors_p (false),
    partial_load_store_bias (0),
    peeling_for_gaps (false),
    peeling_for_niter (false),
    no_data_dependencies (false),
    has_mask_store (false),
    scalar_loop_scaling (profile_probability::uninitialized ()),
    orig_loop_info (NULL)
{
  /* CHECKME: We want to visit all BBs before their successors (except for
     latch blocks, for which this assertion wouldn't hold).  In the simple
     case of the loop forms we allow, a dfs order of the BBs would be the same
     as reversed postorder traversal, so we are safe.  */

  unsigned int nbbs = dfs_enumerate_from (loop->header, 0, bb_in_loop_p,
					  bbs, loop->num_nodes, loop);
  gcc_assert (nbbs == loop->num_nodes);

  for (unsigned int i = 0; i < nbbs; i++)
    {
      basic_block bb = bbs[i];
      gimple_stmt_iterator si;

      for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
	{
	  gimple *phi = gsi_stmt (si);
	  gimple_set_uid (phi, 0);
	  add_stmt (phi);
	}

      for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
	{
	  gimple *stmt = gsi_stmt (si);
	  gimple_set_uid (stmt, 0);
	  if (is_gimple_debug (stmt))
	    continue;
	  add_stmt (stmt);
	  /* If .GOMP_SIMD_LANE call for the current loop has 3 arguments, the
	     third argument is the #pragma omp simd if (x) condition, when 0,
	     loop shouldn't be vectorized, when non-zero constant, it should
	     be vectorized normally, otherwise versioned with vectorized loop
	     done if the condition is non-zero at runtime.  */
	  if (loop_in->simduid
	      && is_gimple_call (stmt)
	      && gimple_call_internal_p (stmt)
	      && gimple_call_internal_fn (stmt) == IFN_GOMP_SIMD_LANE
	      && gimple_call_num_args (stmt) >= 3
	      && TREE_CODE (gimple_call_arg (stmt, 0)) == SSA_NAME
	      && (loop_in->simduid
		  == SSA_NAME_VAR (gimple_call_arg (stmt, 0))))
	    {
	      tree arg = gimple_call_arg (stmt, 2);
	      if (integer_zerop (arg) || TREE_CODE (arg) == SSA_NAME)
		simd_if_cond = arg;
	      else
		gcc_assert (integer_nonzerop (arg));
	    }
	}
    }

  epilogue_vinfos.create (6);
}
/* Free all levels of rgroup CONTROLS.  */

void
release_vec_loop_controls (vec<rgroup_controls> *controls)
{
  rgroup_controls *rgc;
  unsigned int i;
  FOR_EACH_VEC_ELT (*controls, i, rgc)
    rgc->controls.release ();
  controls->release ();
}
/* Free all memory used by the _loop_vec_info, as well as all the
   stmt_vec_info structs of all the stmts in the loop.  */

_loop_vec_info::~_loop_vec_info ()
{
  free (bbs);

  release_vec_loop_controls (&masks);
  release_vec_loop_controls (&lens);
  delete ivexpr_map;
  delete scan_map;
  epilogue_vinfos.release ();
  delete scalar_costs;
  delete vector_costs;

  /* When we release an epilogue vinfo that we do not intend to use
     avoid clearing AUX of the main loop which should continue to
     point to the main loop vinfo since otherwise we'll leak that.  */
  if (loop->aux == this)
    loop->aux = NULL;
}
/* Return an invariant or register for EXPR and emit necessary
   computations in the LOOP_VINFO loop preheader.  */

tree
cse_and_gimplify_to_preheader (loop_vec_info loop_vinfo, tree expr)
{
  if (is_gimple_reg (expr)
      || is_gimple_min_invariant (expr))
    return expr;

  if (! loop_vinfo->ivexpr_map)
    loop_vinfo->ivexpr_map = new hash_map<tree_operand_hash, tree>;
  tree &cached = loop_vinfo->ivexpr_map->get_or_insert (expr);
  if (! cached)
    {
      gimple_seq stmts = NULL;
      cached = force_gimple_operand (unshare_expr (expr),
				     &stmts, true, NULL_TREE);
      if (stmts)
	{
	  edge e = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
	  gsi_insert_seq_on_edge_immediate (e, stmts);
	}
    }
  return cached;
}
/* Return true if we can use CMP_TYPE as the comparison type to produce
   all masks required to mask LOOP_VINFO.  */

static bool
can_produce_all_loop_masks_p (loop_vec_info loop_vinfo, tree cmp_type)
{
  rgroup_controls *rgm;
  unsigned int i;
  FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), i, rgm)
    if (rgm->type != NULL_TREE
	&& !direct_internal_fn_supported_p (IFN_WHILE_ULT,
					    cmp_type, rgm->type,
					    OPTIMIZE_FOR_SPEED))
      return false;
  return true;
}
/* Calculate the maximum number of scalars per iteration for every
   rgroup in LOOP_VINFO.  */

static unsigned int
vect_get_max_nscalars_per_iter (loop_vec_info loop_vinfo)
{
  unsigned int res = 1;
  unsigned int i;
  rgroup_controls *rgm;
  FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), i, rgm)
    res = MAX (res, rgm->max_nscalars_per_iter);
  return res;
}
/* Calculate the minimum precision necessary to represent:

      MAX_NITERS * FACTOR

   as an unsigned integer, where MAX_NITERS is the maximum number of
   loop header iterations for the original scalar form of LOOP_VINFO.  */

static unsigned
vect_min_prec_for_max_niters (loop_vec_info loop_vinfo, unsigned int factor)
{
  class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);

  /* Get the maximum number of iterations that is representable
     in the counter type.  */
  tree ni_type = TREE_TYPE (LOOP_VINFO_NITERSM1 (loop_vinfo));
  widest_int max_ni = wi::to_widest (TYPE_MAX_VALUE (ni_type)) + 1;

  /* Get a more refined estimate for the number of iterations.  */
  widest_int max_back_edges;
  if (max_loop_iterations (loop, &max_back_edges))
    max_ni = wi::smin (max_ni, max_back_edges + 1);

  /* Work out how many bits we need to represent the limit.  */
  return wi::min_precision (max_ni * factor, UNSIGNED);
}
/* True if the loop needs peeling or partial vectors when vectorized.  */

static bool
vect_need_peeling_or_partial_vectors_p (loop_vec_info loop_vinfo)
{
  unsigned HOST_WIDE_INT const_vf;
  HOST_WIDE_INT max_niter
    = likely_max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));

  unsigned th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
  if (!th && LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo))
    th = LOOP_VINFO_COST_MODEL_THRESHOLD (LOOP_VINFO_ORIG_LOOP_INFO
					  (loop_vinfo));

  if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
      && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) >= 0)
    {
      /* Work out the (constant) number of iterations that need to be
	 peeled for reasons other than niters.  */
      unsigned int peel_niter = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
      if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
	peel_niter += 1;
      if (!multiple_p (LOOP_VINFO_INT_NITERS (loop_vinfo) - peel_niter,
		       LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
	return true;
    }
  else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
	   /* ??? When peeling for gaps but not alignment, we could
	      try to check whether the (variable) niters is known to be
	      VF * N + 1.  That's something of a niche case though.  */
	   || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
	   || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&const_vf)
	   || ((tree_ctz (LOOP_VINFO_NITERS (loop_vinfo))
		< (unsigned) exact_log2 (const_vf))
	       /* In case of versioning, check if the maximum number of
		  iterations is greater than th.  If they are identical,
		  the epilogue is unnecessary.  */
	       && (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
		   || ((unsigned HOST_WIDE_INT) max_niter
		       > (th / const_vf) * const_vf))))
    return true;

  return false;
}
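/* Quick illustration (made-up numbers): with a known iteration count of 10,
   no peeling for alignment or gaps, and VF == 4, 10 is not a multiple of 4,
   so the function above returns true - the two leftover iterations need
   either an epilogue (peeling for niters) or partial vectors.  */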
/* Each statement in LOOP_VINFO can be masked where necessary.  Check
   whether we can actually generate the masks required.  Return true if so,
   storing the type of the scalar IV in LOOP_VINFO_RGROUP_COMPARE_TYPE.  */

static bool
vect_verify_full_masking (loop_vec_info loop_vinfo)
{
  unsigned int min_ni_width;
  unsigned int max_nscalars_per_iter
    = vect_get_max_nscalars_per_iter (loop_vinfo);

  /* Use a normal loop if there are no statements that need masking.
     This only happens in rare degenerate cases: it means that the loop
     has no loads, no stores, and no live-out values.  */
  if (LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
    return false;

  /* Work out how many bits we need to represent the limit.  */
  min_ni_width
    = vect_min_prec_for_max_niters (loop_vinfo, max_nscalars_per_iter);

  /* Find a scalar mode for which WHILE_ULT is supported.  */
  opt_scalar_int_mode cmp_mode_iter;
  tree cmp_type = NULL_TREE;
  tree iv_type = NULL_TREE;
  widest_int iv_limit = vect_iv_limit_for_partial_vectors (loop_vinfo);
  unsigned int iv_precision = UINT_MAX;

  if (iv_limit != -1)
    iv_precision = wi::min_precision (iv_limit * max_nscalars_per_iter,
				      UNSIGNED);

  FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)
    {
      unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ());
      if (cmp_bits >= min_ni_width
	  && targetm.scalar_mode_supported_p (cmp_mode_iter.require ()))
	{
	  tree this_type = build_nonstandard_integer_type (cmp_bits, true);
	  if (this_type
	      && can_produce_all_loop_masks_p (loop_vinfo, this_type))
	    {
	      /* Although we could stop as soon as we find a valid mode,
		 there are at least two reasons why that's not always the
		 best choice:

		 - An IV that's Pmode or wider is more likely to be reusable
		   in address calculations than an IV that's narrower than
		   Pmode.

		 - Doing the comparison in IV_PRECISION or wider allows
		   a natural 0-based IV, whereas using a narrower comparison
		   type requires mitigations against wrap-around.

		 Conversely, if the IV limit is variable, doing the comparison
		 in a wider type than the original type can introduce
		 unnecessary extensions, so picking the widest valid mode
		 is not always a good choice either.

		 Here we prefer the first IV type that's Pmode or wider,
		 and the first comparison type that's IV_PRECISION or wider.
		 (The comparison type must be no wider than the IV type,
		 to avoid extensions in the vector loop.)

		 ??? We might want to try continuing beyond Pmode for ILP32
		 targets if CMP_BITS < IV_PRECISION.  */
	      iv_type = this_type;
	      if (!cmp_type || iv_precision > TYPE_PRECISION (cmp_type))
		cmp_type = this_type;
	      if (cmp_bits >= GET_MODE_BITSIZE (Pmode))
		break;
	    }
	}
    }

  if (!cmp_type)
    return false;

  LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo) = cmp_type;
  LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo) = iv_type;
  return true;
}
/* Check whether we can use vector access with length based on precision
   comparison.  So far, to keep it simple, we only allow the case that the
   precision of the target supported length is larger than the precision
   required by loop niters.  */

static bool
vect_verify_loop_lens (loop_vec_info loop_vinfo)
{
  if (LOOP_VINFO_LENS (loop_vinfo).is_empty ())
    return false;

  machine_mode len_load_mode = get_len_load_store_mode
    (loop_vinfo->vector_mode, true).require ();
  machine_mode len_store_mode = get_len_load_store_mode
    (loop_vinfo->vector_mode, false).require ();

  signed char partial_load_bias = internal_len_load_store_bias
    (IFN_LEN_LOAD, len_load_mode);

  signed char partial_store_bias = internal_len_load_store_bias
    (IFN_LEN_STORE, len_store_mode);

  gcc_assert (partial_load_bias == partial_store_bias);

  if (partial_load_bias == VECT_PARTIAL_BIAS_UNSUPPORTED)
    return false;

  /* If the backend requires a bias of -1 for LEN_LOAD, we must not emit
     len_loads with a length of zero.  In order to avoid that we prohibit
     more than one loop length here.  */
  if (partial_load_bias == -1
      && LOOP_VINFO_LENS (loop_vinfo).length () > 1)
    return false;

  LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo) = partial_load_bias;

  unsigned int max_nitems_per_iter = 1;
  unsigned int i;
  rgroup_controls *rgl;
  /* Find the maximum number of items per iteration for every rgroup.  */
  FOR_EACH_VEC_ELT (LOOP_VINFO_LENS (loop_vinfo), i, rgl)
    {
      unsigned nitems_per_iter = rgl->max_nscalars_per_iter * rgl->factor;
      max_nitems_per_iter = MAX (max_nitems_per_iter, nitems_per_iter);
    }

  /* Work out how many bits we need to represent the length limit.  */
  unsigned int min_ni_prec
    = vect_min_prec_for_max_niters (loop_vinfo, max_nitems_per_iter);

  /* Now use the maximum of below precisions for one suitable IV type:
     - the IV's natural precision
     - the precision needed to hold: the maximum number of scalar
       iterations multiplied by the scale factor (min_ni_prec above)
     - the Pmode precision

     If min_ni_prec is less than the precision of the current niters,
     we prefer to still use the niters type.  Prefer to use Pmode and
     wider IV to avoid narrow conversions.  */

  unsigned int ni_prec
    = TYPE_PRECISION (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)));
  min_ni_prec = MAX (min_ni_prec, ni_prec);
  min_ni_prec = MAX (min_ni_prec, GET_MODE_BITSIZE (Pmode));

  tree iv_type = NULL_TREE;
  opt_scalar_int_mode tmode_iter;
  FOR_EACH_MODE_IN_CLASS (tmode_iter, MODE_INT)
    {
      scalar_mode tmode = tmode_iter.require ();
      unsigned int tbits = GET_MODE_BITSIZE (tmode);

      /* ??? Do we really want to construct one IV whose precision exceeds
	 BITS_PER_WORD?  */
      if (tbits > BITS_PER_WORD)
	break;

      /* Find the first available standard integral type.  */
      if (tbits >= min_ni_prec && targetm.scalar_mode_supported_p (tmode))
	{
	  iv_type = build_nonstandard_integer_type (tbits, true);
	  break;
	}
    }

  if (!iv_type)
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			 "can't vectorize with length-based partial vectors"
			 " because there is no suitable iv type.\n");
      return false;
    }

  LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo) = iv_type;
  LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo) = iv_type;

  return true;
}
/* Calculate the cost of one scalar iteration of the loop.  */
static void
vect_compute_single_scalar_iteration_cost (loop_vec_info loop_vinfo)
{
  class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
  int nbbs = loop->num_nodes, factor;
  int innerloop_iters, i;

  DUMP_VECT_SCOPE ("vect_compute_single_scalar_iteration_cost");

  /* Gather costs for statements in the scalar loop.  */

  /* FORNOW.  */
  innerloop_iters = 1;
  if (loop->inner)
    innerloop_iters = LOOP_VINFO_INNER_LOOP_COST_FACTOR (loop_vinfo);

  for (i = 0; i < nbbs; i++)
    {
      gimple_stmt_iterator si;
      basic_block bb = bbs[i];

      if (bb->loop_father == loop->inner)
	factor = innerloop_iters;
      else
	factor = 1;

      for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
	{
	  gimple *stmt = gsi_stmt (si);
	  stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (stmt);

	  if (!is_gimple_assign (stmt) && !is_gimple_call (stmt))
	    continue;

	  /* Skip stmts that are not vectorized inside the loop.  */
	  stmt_vec_info vstmt_info = vect_stmt_to_vectorize (stmt_info);
	  if (!STMT_VINFO_RELEVANT_P (vstmt_info)
	      && (!STMT_VINFO_LIVE_P (vstmt_info)
		  || !VECTORIZABLE_CYCLE_DEF
			(STMT_VINFO_DEF_TYPE (vstmt_info))))
	    continue;

	  vect_cost_for_stmt kind;
	  if (STMT_VINFO_DATA_REF (stmt_info))
	    {
	      if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
		kind = scalar_load;
	      else
		kind = scalar_store;
	    }
	  else if (vect_nop_conversion_p (stmt_info))
	    continue;
	  else
	    kind = scalar_stmt;

	  /* We are using vect_prologue here to avoid scaling twice
	     by the inner loop factor.  */
	  record_stmt_cost (&LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
			    factor, kind, stmt_info, 0, vect_prologue);
	}
    }

  /* Now accumulate cost.  */
  loop_vinfo->scalar_costs = init_cost (loop_vinfo, true);
  add_stmt_costs (loop_vinfo->scalar_costs,
		  &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo));
  loop_vinfo->scalar_costs->finish_cost (nullptr);
}
/* Function vect_analyze_loop_form.

   Verify that certain CFG restrictions hold, including:
   - the loop has a pre-header
   - the loop has a single entry and exit
   - the loop exit condition is simple enough
   - the number of iterations can be analyzed, i.e, a countable loop.  The
     niter could be analyzed under some assumptions.  */
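/* Illustrative example (a sketch only) of a loop in the form this analysis
   accepts - a countable do-while with all executable statements in the
   header and an empty latch:

	i = 0;
	do
	  {
	    a[i] = b[i] + c[i];
	    i = i + 1;
	  }
	while (i < n);

   Such a loop has a single exit, two incoming header edges (preheader and
   latch), and a niter expression the analysis below can compute.  */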
opt_result
vect_analyze_loop_form (class loop *loop, vect_loop_form_info *info)
{
  DUMP_VECT_SCOPE ("vect_analyze_loop_form");

  /* Different restrictions apply when we are considering an inner-most loop,
     vs. an outer (nested) loop.
     (FORNOW.  May want to relax some of these restrictions in the future).  */

  info->inner_loop_cond = NULL;
  if (!loop->inner)
    {
      /* Inner-most loop.  We currently require that the number of BBs is
	 exactly 2 (the header and latch).  Vectorizable inner-most loops
	 look like this:

			(pre-header)
			   |
			  header <--------+
			   | |            |
			   | +--> latch --+
			   |
			(exit-bb)  */

      if (loop->num_nodes != 2)
	return opt_result::failure_at (vect_location,
				       "not vectorized:"
				       " control flow in loop.\n");

      if (empty_block_p (loop->header))
	return opt_result::failure_at (vect_location,
				       "not vectorized: empty loop.\n");
    }
  else
    {
      class loop *innerloop = loop->inner;
      edge entryedge;

      /* Nested loop.  We currently require that the loop is doubly-nested,
	 contains a single inner loop, and the number of BBs is exactly 5.
	 Vectorizable outer-loops look like this:

			(pre-header)
			   |
			  header <---+
			   |         |
			  inner-loop |
			   |         |
			  tail ------+
			   |
			(exit-bb)

	 The inner-loop has the properties expected of inner-most loops
	 as described above.  */

      if ((loop->inner)->inner || (loop->inner)->next)
	return opt_result::failure_at (vect_location,
				       "not vectorized:"
				       " multiple nested loops.\n");

      if (loop->num_nodes != 5)
	return opt_result::failure_at (vect_location,
				       "not vectorized:"
				       " control flow in loop.\n");

      entryedge = loop_preheader_edge (innerloop);
      if (entryedge->src != loop->header
	  || !single_exit (innerloop)
	  || single_exit (innerloop)->dest != EDGE_PRED (loop->latch, 0)->src)
	return opt_result::failure_at (vect_location,
				       "not vectorized:"
				       " unsupported outerloop form.\n");

      /* Analyze the inner-loop.  */
      vect_loop_form_info inner;
      opt_result res = vect_analyze_loop_form (loop->inner, &inner);
      if (!res)
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			     "not vectorized: Bad inner loop.\n");
	  return res;
	}

      /* Don't support analyzing niter under assumptions for inner
	 loop.  */
      if (!integer_onep (inner.assumptions))
	return opt_result::failure_at (vect_location,
				       "not vectorized: Bad inner loop.\n");

      if (!expr_invariant_in_loop_p (loop, inner.number_of_iterations))
	return opt_result::failure_at (vect_location,
				       "not vectorized: inner-loop count not"
				       " invariant.\n");

      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location,
			 "Considering outer-loop vectorization.\n");
      info->inner_loop_cond = inner.loop_cond;
    }

  if (!single_exit (loop))
    return opt_result::failure_at (vect_location,
				   "not vectorized: multiple exits.\n");
  if (EDGE_COUNT (loop->header->preds) != 2)
    return opt_result::failure_at (vect_location,
				   "not vectorized:"
				   " too many incoming edges.\n");

  /* We assume that the loop exit condition is at the end of the loop.  i.e,
     that the loop is represented as a do-while (with a proper if-guard
     before the loop if needed), where the loop header contains all the
     executable statements, and the latch is empty.  */
  if (!empty_block_p (loop->latch)
      || !gimple_seq_empty_p (phi_nodes (loop->latch)))
    return opt_result::failure_at (vect_location,
				   "not vectorized: latch block not empty.\n");

  /* Make sure the exit is not abnormal.  */
  edge e = single_exit (loop);
  if (e->flags & EDGE_ABNORMAL)
    return opt_result::failure_at (vect_location,
				   "not vectorized:"
				   " abnormal loop exit edge.\n");

  info->loop_cond
    = vect_get_loop_niters (loop, &info->assumptions,
			    &info->number_of_iterations,
			    &info->number_of_iterationsm1);
  if (!info->loop_cond)
    return opt_result::failure_at
      (vect_location,
       "not vectorized: complicated exit condition.\n");

  if (integer_zerop (info->assumptions)
      || !info->number_of_iterations
      || chrec_contains_undetermined (info->number_of_iterations))
    return opt_result::failure_at
      (info->loop_cond,
       "not vectorized: number of iterations cannot be computed.\n");

  if (integer_zerop (info->number_of_iterations))
    return opt_result::failure_at
      (info->loop_cond,
       "not vectorized: number of iterations = 0.\n");

  if (!(tree_fits_shwi_p (info->number_of_iterations)
	&& tree_to_shwi (info->number_of_iterations) > 0))
    {
      if (dump_enabled_p ())
	{
	  dump_printf_loc (MSG_NOTE, vect_location,
			   "Symbolic number of iterations is ");
	  dump_generic_expr (MSG_NOTE, TDF_DETAILS,
			     info->number_of_iterations);
	  dump_printf (MSG_NOTE, "\n");
	}
    }

  return opt_result::success ();
}
/* Create a loop_vec_info for LOOP with SHARED and the
   vect_analyze_loop_form result.  */

loop_vec_info
vect_create_loop_vinfo (class loop *loop, vec_info_shared *shared,
			const vect_loop_form_info *info,
			loop_vec_info main_loop_info)
{
  loop_vec_info loop_vinfo = new _loop_vec_info (loop, shared);
  LOOP_VINFO_NITERSM1 (loop_vinfo) = info->number_of_iterationsm1;
  LOOP_VINFO_NITERS (loop_vinfo) = info->number_of_iterations;
  LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = info->number_of_iterations;
  LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = main_loop_info;
  /* Also record the assumptions for versioning.  */
  if (!integer_onep (info->assumptions) && !main_loop_info)
    LOOP_VINFO_NITERS_ASSUMPTIONS (loop_vinfo) = info->assumptions;

  stmt_vec_info loop_cond_info = loop_vinfo->lookup_stmt (info->loop_cond);
  STMT_VINFO_TYPE (loop_cond_info) = loop_exit_ctrl_vec_info_type;
  if (info->inner_loop_cond)
    {
      stmt_vec_info inner_loop_cond_info
	= loop_vinfo->lookup_stmt (info->inner_loop_cond);
      STMT_VINFO_TYPE (inner_loop_cond_info) = loop_exit_ctrl_vec_info_type;
      /* If we have an estimate on the number of iterations of the inner
	 loop use that to limit the scale for costing, otherwise use
	 --param vect-inner-loop-cost-factor literally.  */
      widest_int nit;
      if (estimated_stmt_executions (loop->inner, &nit))
	LOOP_VINFO_INNER_LOOP_COST_FACTOR (loop_vinfo)
	  = wi::smin (nit, param_vect_inner_loop_cost_factor).to_uhwi ();
    }

  return loop_vinfo;
}
/* Scan the loop stmts and dependent on whether there are any (non-)SLP
   statements update the vectorization factor.  */

static void
vect_update_vf_for_slp (loop_vec_info loop_vinfo)
{
  class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
  int nbbs = loop->num_nodes;
  poly_uint64 vectorization_factor;
  int i;

  DUMP_VECT_SCOPE ("vect_update_vf_for_slp");

  vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
  gcc_assert (known_ne (vectorization_factor, 0U));

  /* If all the stmts in the loop can be SLPed, we perform only SLP, and
     vectorization factor of the loop is the unrolling factor required by
     the SLP instances.  If that unrolling factor is 1, we say, that we
     perform pure SLP on loop - cross iteration parallelism is not
     exploited.  */
  bool only_slp_in_loop = true;
  for (i = 0; i < nbbs; i++)
    {
      basic_block bb = bbs[i];
      for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
	   gsi_next (&si))
	{
	  stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (si.phi ());
	  if (!stmt_info)
	    continue;
	  if ((STMT_VINFO_RELEVANT_P (stmt_info)
	       || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
	      && !PURE_SLP_STMT (stmt_info))
	    /* STMT needs both SLP and loop-based vectorization.  */
	    only_slp_in_loop = false;
	}
      for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
	   gsi_next (&si))
	{
	  if (is_gimple_debug (gsi_stmt (si)))
	    continue;
	  stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
	  stmt_info = vect_stmt_to_vectorize (stmt_info);
	  if ((STMT_VINFO_RELEVANT_P (stmt_info)
	       || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
	      && !PURE_SLP_STMT (stmt_info))
	    /* STMT needs both SLP and loop-based vectorization.  */
	    only_slp_in_loop = false;
	}
    }

  if (only_slp_in_loop)
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location,
			 "Loop contains only SLP stmts\n");
      vectorization_factor = LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo);
    }
  else
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location,
			 "Loop contains SLP and non-SLP stmts\n");
      /* Both the vectorization factor and unroll factor have the form
	 GET_MODE_SIZE (loop_vinfo->vector_mode) * X for some rational X,
	 so they must have a common multiple.  */
      vectorization_factor
	= force_common_multiple (vectorization_factor,
				 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo));
    }

  LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
  if (dump_enabled_p ())
    {
      dump_printf_loc (MSG_NOTE, vect_location,
		       "Updating vectorization factor to ");
      dump_dec (MSG_NOTE, vectorization_factor);
      dump_printf (MSG_NOTE, ".\n");
    }
}
/* Return true if STMT_INFO describes a double reduction phi and if
   the other phi in the reduction is also relevant for vectorization.
   This rejects cases such as:

      outer1:
	x_1 = PHI <x_3(outer2), ...>;
	...

      inner:
	x_2 = ...;
	...

      outer2:
	x_3 = PHI <x_2(inner)>;

   if nothing in x_2 or elsewhere makes x_1 relevant.  */

static bool
vect_active_double_reduction_p (stmt_vec_info stmt_info)
{
  if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
    return false;

  return STMT_VINFO_RELEVANT_P (STMT_VINFO_REDUC_DEF (stmt_info));
}
/* Function vect_analyze_loop_operations.

   Scan the loop stmts and make sure they are all vectorizable.  */

static opt_result
vect_analyze_loop_operations (loop_vec_info loop_vinfo)
{
  class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
  int nbbs = loop->num_nodes;
  int i;
  stmt_vec_info stmt_info;
  bool need_to_vectorize = false;
  bool ok;

  DUMP_VECT_SCOPE ("vect_analyze_loop_operations");

  auto_vec<stmt_info_for_cost> cost_vec;

  for (i = 0; i < nbbs; i++)
    {
      basic_block bb = bbs[i];

      for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
	   gsi_next (&si))
	{
	  gphi *phi = si.phi ();
	  ok = true;

	  stmt_info = loop_vinfo->lookup_stmt (phi);
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_NOTE, vect_location, "examining phi: %G",
			     (gimple *) phi);
	  if (virtual_operand_p (gimple_phi_result (phi)))
	    continue;

	  /* Inner-loop loop-closed exit phi in outer-loop vectorization
	     (i.e., a phi in the tail of the outer-loop).  */
	  if (! is_loop_header_bb_p (bb))
	    {
	      /* FORNOW: we currently don't support the case that these phis
		 are not used in the outerloop (unless it is double reduction,
		 i.e., this phi is vect_reduction_def), cause this case
		 requires to actually do something here.  */
	      if (STMT_VINFO_LIVE_P (stmt_info)
		  && !vect_active_double_reduction_p (stmt_info))
		return opt_result::failure_at (phi,
					       "Unsupported loop-closed phi"
					       " in outer-loop.\n");

	      /* If PHI is used in the outer loop, we check that its operand
		 is defined in the inner loop.  */
	      if (STMT_VINFO_RELEVANT_P (stmt_info))
		{
		  tree phi_op;

		  if (gimple_phi_num_args (phi) != 1)
		    return opt_result::failure_at (phi, "unsupported phi");

		  phi_op = PHI_ARG_DEF (phi, 0);
		  stmt_vec_info op_def_info = loop_vinfo->lookup_def (phi_op);
		  if (!op_def_info)
		    return opt_result::failure_at (phi, "unsupported phi\n");

		  if (STMT_VINFO_RELEVANT (op_def_info) != vect_used_in_outer
		      && (STMT_VINFO_RELEVANT (op_def_info)
			  != vect_used_in_outer_by_reduction))
		    return opt_result::failure_at (phi, "unsupported phi\n");

		  if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def
		       || (STMT_VINFO_DEF_TYPE (stmt_info)
			   == vect_double_reduction_def))
		      && !vectorizable_lc_phi (loop_vinfo,
					       stmt_info, NULL, NULL))
		    return opt_result::failure_at (phi, "unsupported phi\n");
		}

	      continue;
	    }

	  gcc_assert (stmt_info);

	  if ((STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope
	       || STMT_VINFO_LIVE_P (stmt_info))
	      && STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
	    /* A scalar-dependence cycle that we don't support.  */
	    return opt_result::failure_at (phi,
					   "not vectorized:"
					   " scalar dependence cycle.\n");

	  if (STMT_VINFO_RELEVANT_P (stmt_info))
	    {
	      need_to_vectorize = true;
	      if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
		  && ! PURE_SLP_STMT (stmt_info))
		ok = vectorizable_induction (loop_vinfo,
					     stmt_info, NULL, NULL,
					     &cost_vec);
	      else if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
			|| (STMT_VINFO_DEF_TYPE (stmt_info)
			    == vect_double_reduction_def)
			|| STMT_VINFO_DEF_TYPE (stmt_info)
			   == vect_nested_cycle)
		       && ! PURE_SLP_STMT (stmt_info))
		ok = vectorizable_reduction (loop_vinfo,
					     stmt_info, NULL, NULL, &cost_vec);
	    }

	  /* SLP PHIs are tested by vect_slp_analyze_node_operations.  */
	  if (ok
	      && STMT_VINFO_LIVE_P (stmt_info)
	      && !PURE_SLP_STMT (stmt_info))
	    ok = vectorizable_live_operation (loop_vinfo,
					      stmt_info, NULL, NULL, NULL,
					      -1, false, &cost_vec);

	  if (!ok)
	    return opt_result::failure_at (phi,
					   "not vectorized: relevant phi not "
					   "supported: %G",
					   static_cast <gimple *> (phi));
	}

      for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
	   gsi_next (&si))
	{
	  gimple *stmt = gsi_stmt (si);
	  if (!gimple_clobber_p (stmt)
	      && !is_gimple_debug (stmt))
	    {
	      opt_result res
		= vect_analyze_stmt (loop_vinfo,
				     loop_vinfo->lookup_stmt (stmt),
				     &need_to_vectorize,
				     NULL, NULL, &cost_vec);
	      if (!res)
		return res;
	    }
	}
    } /* bbs */

  add_stmt_costs (loop_vinfo->vector_costs, &cost_vec);

  /* All operations in the loop are either irrelevant (deal with loop
     control, or dead), or only used outside the loop and can be moved
     out of the loop (e.g. invariants, inductions).  The loop can be
     optimized away by scalar optimizations.  We're better off not
     touching this loop.  */
  if (!need_to_vectorize)
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location,
			 "All the computation can be taken out of the loop.\n");
      return opt_result::failure_at
	(vect_location,
	 "not vectorized: redundant loop. no profit to vectorize.\n");
    }

  return opt_result::success ();
}
/* Return true if we know that the iteration count is smaller than the
   vectorization factor.  Return false if it isn't, or if we can't be sure
   either way.  */

static bool
vect_known_niters_smaller_than_vf (loop_vec_info loop_vinfo)
{
  unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);

  HOST_WIDE_INT max_niter;
  if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
    max_niter = LOOP_VINFO_INT_NITERS (loop_vinfo);
  else
    max_niter = max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));

  if (max_niter != -1 && (unsigned HOST_WIDE_INT) max_niter < assumed_vf)
    return true;

  return false;
}
/* Analyze the cost of the loop described by LOOP_VINFO.  Decide if it
   is worthwhile to vectorize.  Return 1 if definitely yes, 0 if
   definitely no, or -1 if it's worth retrying.  */

static int
vect_analyze_loop_costing (loop_vec_info loop_vinfo,
			   unsigned *suggested_unroll_factor)
{
  class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);

  /* Only loops that can handle partially-populated vectors can have iteration
     counts less than the vectorization factor.  */
  if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
    {
      if (vect_known_niters_smaller_than_vf (loop_vinfo))
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			     "not vectorized: iteration count smaller than "
			     "vectorization factor.\n");
	  return 0;
	}
    }

  /* If using the "very cheap" model, reject cases in which we'd keep
     a copy of the scalar code (even if we might be able to vectorize it).  */
  if (loop_cost_model (loop) == VECT_COST_MODEL_VERY_CHEAP
      && (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
	  || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
	  || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)))
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			 "some scalar iterations would need to be peeled\n");
      return 0;
    }

  int min_profitable_iters, min_profitable_estimate;
  vect_estimate_min_profitable_iters (loop_vinfo, &min_profitable_iters,
				      &min_profitable_estimate,
				      suggested_unroll_factor);

  if (min_profitable_iters < 0)
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			 "not vectorized: vectorization not profitable.\n");
      if (dump_enabled_p ())
	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			 "not vectorized: vector version will never be "
			 "profitable.\n");
      return -1;
    }

  int min_scalar_loop_bound = (param_min_vect_loop_bound
			       * assumed_vf);

  /* Use the cost model only if it is more conservative than user specified
     threshold.  */
  unsigned int th = (unsigned) MAX (min_scalar_loop_bound,
				    min_profitable_iters);

  LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = th;

  if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
      && LOOP_VINFO_INT_NITERS (loop_vinfo) < th)
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			 "not vectorized: vectorization not profitable.\n");
      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location,
			 "not vectorized: iteration count smaller than user "
			 "specified loop bound parameter or minimum profitable "
			 "iterations (whichever is more conservative).\n");
      return 0;
    }

  /* The static profitability threshold min_profitable_estimate includes
     the cost of having to check at runtime whether the scalar loop
     should be used instead.  If it turns out that we don't need or want
     such a check, the threshold we should use for the static estimate
     is simply the point at which the vector loop becomes more profitable
     than the scalar loop.  */
  if (min_profitable_estimate > min_profitable_iters
      && !LOOP_REQUIRES_VERSIONING (loop_vinfo)
      && !LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
      && !LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
      && !vect_apply_runtime_profitability_check_p (loop_vinfo))
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location, "no need for a runtime"
			 " choice between the scalar and vector loops\n");
      min_profitable_estimate = min_profitable_iters;
    }

  /* If the vector loop needs multiple iterations to be beneficial then
     things are probably too close to call, and the conservative thing
     would be to stick with the scalar code.  */
  if (loop_cost_model (loop) == VECT_COST_MODEL_VERY_CHEAP
      && min_profitable_estimate > (int) vect_vf_for_cost (loop_vinfo))
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			 "one iteration of the vector loop would be"
			 " more expensive than the equivalent number of"
			 " iterations of the scalar loop\n");
      return 0;
    }

  HOST_WIDE_INT estimated_niter;

  /* If we are vectorizing an epilogue then we know the maximum number of
     scalar iterations it will cover is at least one lower than the
     vectorization factor of the main loop.  */
  if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
    estimated_niter
      = vect_vf_for_cost (LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo)) - 1;
  else
    {
      estimated_niter = estimated_stmt_executions_int (loop);
      if (estimated_niter == -1)
	estimated_niter = likely_max_stmt_executions_int (loop);
    }
  if (estimated_niter != -1
      && ((unsigned HOST_WIDE_INT) estimated_niter
	  < MAX (th, (unsigned) min_profitable_estimate)))
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			 "not vectorized: estimated iteration count too "
			 "small.\n");
      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location,
			 "not vectorized: estimated iteration count smaller "
			 "than specified loop bound parameter or minimum "
			 "profitable iterations (whichever is more "
			 "conservative).\n");
      return -1;
    }

  return 1;
}
static opt_result
vect_get_datarefs_in_loop (loop_p loop, basic_block *bbs,
                           vec<data_reference_p> *datarefs,
                           unsigned int *n_stmts)
{
  for (unsigned i = 0; i < loop->num_nodes; i++)
    for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
         !gsi_end_p (gsi); gsi_next (&gsi))
      {
        gimple *stmt = gsi_stmt (gsi);
        if (is_gimple_debug (stmt))
          continue;
        ++(*n_stmts);
        opt_result res = vect_find_stmt_data_reference (loop, stmt, datarefs,
                                                        NULL, 0);
        if (!res)
          {
            if (is_gimple_call (stmt) && loop->safelen)
              {
                tree fndecl = gimple_call_fndecl (stmt), op;
                if (fndecl != NULL_TREE)
                  {
                    cgraph_node *node = cgraph_node::get (fndecl);
                    if (node != NULL && node->simd_clones != NULL)
                      {
                        unsigned int j, n = gimple_call_num_args (stmt);
                        for (j = 0; j < n; j++)
                          {
                            op = gimple_call_arg (stmt, j);
                            if (DECL_P (op)
                                || (REFERENCE_CLASS_P (op)
                                    && get_base_address (op)))
                              break;
                          }
                        op = gimple_call_lhs (stmt);
                        /* Ignore #pragma omp declare simd functions
                           if they don't have data references in the
                           call stmt itself.  */
                        if (j == n
                            && !(op
                                 && (DECL_P (op)
                                     || (REFERENCE_CLASS_P (op)
                                         && get_base_address (op)))))
                          continue;
                      }
                  }
              }
            return res;
          }
        /* If dependence analysis will give up due to the limit on the
           number of datarefs stop here and fail fatally.  */
        if (datarefs->length ()
            > (unsigned)param_loop_max_datarefs_for_datadeps)
          return opt_result::failure_at (stmt, "exceeded param "
                                         "loop-max-datarefs-for-datadeps\n");
      }
  return opt_result::success ();
}
/* Look for SLP-only access groups and turn each individual access into its own
   group.  */
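/* As an illustrative (hypothetical) example: accesses a[2*i] and a[2*i+1]
   may have been recorded as one interleaved group because SLP could use
   them together, but if the stmts end up not being SLP vectorized the
   group is broken up below into two single-element accesses, each made
   its own group leader with DR_GROUP_SIZE 1 and a gap covering the other
   former group members.  */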
static void
vect_dissolve_slp_only_groups (loop_vec_info loop_vinfo)
{
  unsigned int i;
  struct data_reference *dr;

  DUMP_VECT_SCOPE ("vect_dissolve_slp_only_groups");

  vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
  FOR_EACH_VEC_ELT (datarefs, i, dr)
    {
      gcc_assert (DR_REF (dr));
      stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (DR_STMT (dr));

      /* Check if the load is a part of an interleaving chain.  */
      if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
        {
          stmt_vec_info first_element = DR_GROUP_FIRST_ELEMENT (stmt_info);
          dr_vec_info *dr_info = STMT_VINFO_DR_INFO (first_element);
          unsigned int group_size = DR_GROUP_SIZE (first_element);

          /* Check if SLP-only groups.  */
          if (!STMT_SLP_TYPE (stmt_info)
              && STMT_VINFO_SLP_VECT_ONLY (first_element))
            {
              /* Dissolve the group.  */
              STMT_VINFO_SLP_VECT_ONLY (first_element) = false;

              stmt_vec_info vinfo = first_element;
              while (vinfo)
                {
                  stmt_vec_info next = DR_GROUP_NEXT_ELEMENT (vinfo);
                  DR_GROUP_FIRST_ELEMENT (vinfo) = vinfo;
                  DR_GROUP_NEXT_ELEMENT (vinfo) = NULL;
                  DR_GROUP_SIZE (vinfo) = 1;
                  if (STMT_VINFO_STRIDED_P (first_element))
                    DR_GROUP_GAP (vinfo) = 0;
                  else
                    DR_GROUP_GAP (vinfo) = group_size - 1;
                  /* Duplicate and adjust alignment info, it needs to
                     be present on each group leader, see dr_misalignment.  */
                  if (vinfo != first_element)
                    {
                      dr_vec_info *dr_info2 = STMT_VINFO_DR_INFO (vinfo);
                      dr_info2->target_alignment = dr_info->target_alignment;
                      int misalignment = dr_info->misalignment;
                      if (misalignment != DR_MISALIGNMENT_UNKNOWN)
                        {
                          HOST_WIDE_INT diff
                            = (TREE_INT_CST_LOW (DR_INIT (dr_info2->dr))
                               - TREE_INT_CST_LOW (DR_INIT (dr_info->dr)));
                          unsigned HOST_WIDE_INT align_c
                            = dr_info->target_alignment.to_constant ();
                          misalignment = (misalignment + diff) % align_c;
                        }
                      dr_info2->misalignment = misalignment;
                    }
                  vinfo = next;
                }
            }
        }
    }
}
/* Determine if operating on full vectors for LOOP_VINFO might leave
   some scalar iterations still to do.  If so, decide how we should
   handle those scalar iterations.  The possibilities are:

   (1) Make LOOP_VINFO operate on partial vectors instead of full vectors.
       In this case:

         LOOP_VINFO_USING_PARTIAL_VECTORS_P == true
         LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P == false
         LOOP_VINFO_PEELING_FOR_NITER == false

   (2) Make LOOP_VINFO operate on full vectors and use an epilogue loop
       to handle the remaining scalar iterations.  In this case:

         LOOP_VINFO_USING_PARTIAL_VECTORS_P == false
         LOOP_VINFO_PEELING_FOR_NITER == true

       There are two choices:

       (2a) Consider vectorizing the epilogue loop at the same VF as the
            main loop, but using partial vectors instead of full vectors.
            In this case:

              LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P == true

       (2b) Consider vectorizing the epilogue loop at lower VFs only.
            In this case:

              LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P == false

   When FOR_EPILOGUE_P is true, make this determination based on the
   assumption that LOOP_VINFO is an epilogue loop, otherwise make it
   based on the assumption that LOOP_VINFO is the main loop.  The caller
   has made sure that the number of iterations is set appropriately for
   this value of FOR_EPILOGUE_P.  */
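/* For a concrete (hypothetical) feel for these options, consider a loop
   with 10 scalar iterations and a vectorization factor of 4:

   (1) emits a single vector loop that runs 3 times, with the last
       iteration operating on a partial vector of 2 active lanes
       (e.g. under a loop mask or length control);

   (2) emits a full-vector loop that runs twice (covering 8 iterations)
       plus an epilogue for the remaining 2 iterations, where the
       epilogue is either a partial-vector loop at the same VF (2a) or
       a loop vectorized at a lower VF, possibly followed by scalar
       code (2b).  */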
opt_result
vect_determine_partial_vectors_and_peeling (loop_vec_info loop_vinfo,
                                            bool for_epilogue_p)
{
  /* Determine whether there would be any scalar iterations left over.  */
  bool need_peeling_or_partial_vectors_p
    = vect_need_peeling_or_partial_vectors_p (loop_vinfo);

  /* Decide whether to vectorize the loop with partial vectors.  */
  LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = false;
  LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P (loop_vinfo) = false;
  if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
      && need_peeling_or_partial_vectors_p)
    {
      /* For partial-vector-usage=1, try to push the handling of partial
         vectors to the epilogue, with the main loop continuing to operate
         on full vectors.

         If we are unrolling we also do not want to use partial vectors.  This
         is to avoid the overhead of generating multiple masks and also to
         avoid having to execute entire iterations of FALSE masked instructions
         when dealing with one or fewer full iterations.

         ??? We could then end up failing to use partial vectors if we
         decide to peel iterations into a prologue, and if the main loop
         then ends up processing fewer than VF iterations.  */
      if ((param_vect_partial_vector_usage == 1
           || loop_vinfo->suggested_unroll_factor > 1)
          && !LOOP_VINFO_EPILOGUE_P (loop_vinfo)
          && !vect_known_niters_smaller_than_vf (loop_vinfo))
        LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P (loop_vinfo) = true;
      else
        LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = true;
    }

  if (dump_enabled_p ())
    {
      if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
        dump_printf_loc (MSG_NOTE, vect_location,
                         "operating on partial vectors%s.\n",
                         for_epilogue_p ? " for epilogue loop" : "");
      else
        dump_printf_loc (MSG_NOTE, vect_location,
                         "operating only on full vectors%s.\n",
                         for_epilogue_p ? " for epilogue loop" : "");
    }

  if (for_epilogue_p)
    {
      loop_vec_info orig_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
      gcc_assert (orig_loop_vinfo);
      if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
        gcc_assert (known_lt (LOOP_VINFO_VECT_FACTOR (loop_vinfo),
                              LOOP_VINFO_VECT_FACTOR (orig_loop_vinfo)));
    }

  if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
      && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
    {
      /* Check that the loop processes at least one full vector.  */
      poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
      tree scalar_niters = LOOP_VINFO_NITERS (loop_vinfo);
      if (known_lt (wi::to_widest (scalar_niters), vf))
        return opt_result::failure_at (vect_location,
                                       "loop does not have enough iterations"
                                       " to support vectorization.\n");

      /* If we need to peel an extra epilogue iteration to handle data
         accesses with gaps, check that there are enough scalar iterations
         available.

         The check above is redundant with this one when peeling for gaps,
         but the distinction is useful for diagnostics.  */
      tree scalar_nitersm1 = LOOP_VINFO_NITERSM1 (loop_vinfo);
      if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
          && known_lt (wi::to_widest (scalar_nitersm1), vf))
        return opt_result::failure_at (vect_location,
                                       "loop does not have enough iterations"
                                       " to support peeling for gaps.\n");
    }

  LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
    = (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
       && need_peeling_or_partial_vectors_p);

  return opt_result::success ();
}
2224 /* Function vect_analyze_loop_2.
2226 Apply a set of analyses on LOOP, and create a loop_vec_info struct
2227 for it. The different analyses will record information in the
2228 loop_vec_info struct. */
2230 vect_analyze_loop_2 (loop_vec_info loop_vinfo
, bool &fatal
,
2231 unsigned *suggested_unroll_factor
)
2233 opt_result ok
= opt_result::success ();
2235 unsigned int max_vf
= MAX_VECTORIZATION_FACTOR
;
2236 poly_uint64 min_vf
= 2;
2237 loop_vec_info orig_loop_vinfo
= NULL
;
2239 /* If we are dealing with an epilogue then orig_loop_vinfo points to the
2240 loop_vec_info of the first vectorized loop. */
2241 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo
))
2242 orig_loop_vinfo
= LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo
);
2244 orig_loop_vinfo
= loop_vinfo
;
2245 gcc_assert (orig_loop_vinfo
);
2247 /* The first group of checks is independent of the vector size. */
2250 if (LOOP_VINFO_SIMD_IF_COND (loop_vinfo
)
2251 && integer_zerop (LOOP_VINFO_SIMD_IF_COND (loop_vinfo
)))
2252 return opt_result::failure_at (vect_location
,
2253 "not vectorized: simd if(0)\n");
2255 /* Find all data references in the loop (which correspond to vdefs/vuses)
2256 and analyze their evolution in the loop. */
2258 loop_p loop
= LOOP_VINFO_LOOP (loop_vinfo
);
2260 /* Gather the data references and count stmts in the loop. */
2261 if (!LOOP_VINFO_DATAREFS (loop_vinfo
).exists ())
2264 = vect_get_datarefs_in_loop (loop
, LOOP_VINFO_BBS (loop_vinfo
),
2265 &LOOP_VINFO_DATAREFS (loop_vinfo
),
2266 &LOOP_VINFO_N_STMTS (loop_vinfo
));
2269 if (dump_enabled_p ())
2270 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2271 "not vectorized: loop contains function "
2272 "calls or data references that cannot "
2276 loop_vinfo
->shared
->save_datarefs ();
2279 loop_vinfo
->shared
->check_datarefs ();
2281 /* Analyze the data references and also adjust the minimal
2282 vectorization factor according to the loads and stores. */
2284 ok
= vect_analyze_data_refs (loop_vinfo
, &min_vf
, &fatal
);
2287 if (dump_enabled_p ())
2288 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2289 "bad data references.\n");
2293 /* Classify all cross-iteration scalar data-flow cycles.
2294 Cross-iteration cycles caused by virtual phis are analyzed separately. */
2295 vect_analyze_scalar_cycles (loop_vinfo
);
2297 vect_pattern_recog (loop_vinfo
);
2299 vect_fixup_scalar_cycles_with_patterns (loop_vinfo
);
2301 /* Analyze the access patterns of the data-refs in the loop (consecutive,
2302 complex, etc.). FORNOW: Only handle consecutive access pattern. */
2304 ok
= vect_analyze_data_ref_accesses (loop_vinfo
, NULL
);
2307 if (dump_enabled_p ())
2308 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2309 "bad data access.\n");
2313 /* Data-flow analysis to detect stmts that do not need to be vectorized. */
2315 ok
= vect_mark_stmts_to_be_vectorized (loop_vinfo
, &fatal
);
2318 if (dump_enabled_p ())
2319 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2320 "unexpected pattern.\n");
2324 /* While the rest of the analysis below depends on it in some way. */
2327 /* Analyze data dependences between the data-refs in the loop
2328 and adjust the maximum vectorization factor according to
2330 FORNOW: fail at the first data dependence that we encounter. */
2332 ok
= vect_analyze_data_ref_dependences (loop_vinfo
, &max_vf
);
2335 if (dump_enabled_p ())
2336 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2337 "bad data dependence.\n");
2340 if (max_vf
!= MAX_VECTORIZATION_FACTOR
2341 && maybe_lt (max_vf
, min_vf
))
2342 return opt_result::failure_at (vect_location
, "bad data dependence.\n");
2343 LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo
) = max_vf
;
2345 ok
= vect_determine_vectorization_factor (loop_vinfo
);
2348 if (dump_enabled_p ())
2349 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2350 "can't determine vectorization factor.\n");
2353 if (max_vf
!= MAX_VECTORIZATION_FACTOR
2354 && maybe_lt (max_vf
, LOOP_VINFO_VECT_FACTOR (loop_vinfo
)))
2355 return opt_result::failure_at (vect_location
, "bad data dependence.\n");
2357 /* Compute the scalar iteration cost. */
2358 vect_compute_single_scalar_iteration_cost (loop_vinfo
);
2360 poly_uint64 saved_vectorization_factor
= LOOP_VINFO_VECT_FACTOR (loop_vinfo
);
2362 /* Check the SLP opportunities in the loop, analyze and build SLP trees. */
2363 ok
= vect_analyze_slp (loop_vinfo
, LOOP_VINFO_N_STMTS (loop_vinfo
));
2367 /* If there are any SLP instances mark them as pure_slp. */
2368 bool slp
= vect_make_slp_decision (loop_vinfo
);
2371 /* Find stmts that need to be both vectorized and SLPed. */
2372 vect_detect_hybrid_slp (loop_vinfo
);
2374 /* Update the vectorization factor based on the SLP decision. */
2375 vect_update_vf_for_slp (loop_vinfo
);
2377 /* Optimize the SLP graph with the vectorization factor fixed. */
2378 vect_optimize_slp (loop_vinfo
);
2380 /* Gather the loads reachable from the SLP graph entries. */
2381 vect_gather_slp_loads (loop_vinfo
);
2384 bool saved_can_use_partial_vectors_p
2385 = LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo
);
2387 /* We don't expect to have to roll back to anything other than an empty
2389 gcc_assert (LOOP_VINFO_MASKS (loop_vinfo
).is_empty ());
2391 /* Apply the suggested unrolling factor, this was determined by the backend
during finish_cost the first time we ran the analysis for this
2394 if (loop_vinfo
->suggested_unroll_factor
> 1)
2395 LOOP_VINFO_VECT_FACTOR (loop_vinfo
) *= loop_vinfo
->suggested_unroll_factor
;
2397 /* This is the point where we can re-start analysis with SLP forced off. */
2400 /* Now the vectorization factor is final. */
2401 poly_uint64 vectorization_factor
= LOOP_VINFO_VECT_FACTOR (loop_vinfo
);
2402 gcc_assert (known_ne (vectorization_factor
, 0U));
2404 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo
) && dump_enabled_p ())
2406 dump_printf_loc (MSG_NOTE
, vect_location
,
2407 "vectorization_factor = ");
2408 dump_dec (MSG_NOTE
, vectorization_factor
);
2409 dump_printf (MSG_NOTE
, ", niters = %wd\n",
2410 LOOP_VINFO_INT_NITERS (loop_vinfo
));
2413 loop_vinfo
->vector_costs
= init_cost (loop_vinfo
, false);
2415 /* Analyze the alignment of the data-refs in the loop.
2416 Fail if a data reference is found that cannot be vectorized. */
2418 ok
= vect_analyze_data_refs_alignment (loop_vinfo
);
2421 if (dump_enabled_p ())
2422 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2423 "bad data alignment.\n");
2427 /* Prune the list of ddrs to be tested at run-time by versioning for alias.
2428 It is important to call pruning after vect_analyze_data_ref_accesses,
2429 since we use grouping information gathered by interleaving analysis. */
2430 ok
= vect_prune_runtime_alias_test_list (loop_vinfo
);
2434 /* Do not invoke vect_enhance_data_refs_alignment for epilogue
2435 vectorization, since we do not want to add extra peeling or
2436 add versioning for alignment. */
2437 if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo
))
2438 /* This pass will decide on using loop versioning and/or loop peeling in
2439 order to enhance the alignment of data references in the loop. */
2440 ok
= vect_enhance_data_refs_alignment (loop_vinfo
);
2446 /* Analyze operations in the SLP instances. Note this may
2447 remove unsupported SLP instances which makes the above
2448 SLP kind detection invalid. */
2449 unsigned old_size
= LOOP_VINFO_SLP_INSTANCES (loop_vinfo
).length ();
2450 vect_slp_analyze_operations (loop_vinfo
);
2451 if (LOOP_VINFO_SLP_INSTANCES (loop_vinfo
).length () != old_size
)
2453 ok
= opt_result::failure_at (vect_location
,
2454 "unsupported SLP instances\n");
2458 /* Check whether any load in ALL SLP instances is possibly permuted. */
2459 slp_tree load_node
, slp_root
;
2461 slp_instance instance
;
2462 bool can_use_lanes
= true;
2463 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo
), x
, instance
)
2465 slp_root
= SLP_INSTANCE_TREE (instance
);
2466 int group_size
= SLP_TREE_LANES (slp_root
);
2467 tree vectype
= SLP_TREE_VECTYPE (slp_root
);
2468 bool loads_permuted
= false;
2469 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance
), i
, load_node
)
2471 if (!SLP_TREE_LOAD_PERMUTATION (load_node
).exists ())
2474 stmt_vec_info load_info
;
2475 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (load_node
), j
, load_info
)
2476 if (SLP_TREE_LOAD_PERMUTATION (load_node
)[j
] != j
)
2478 loads_permuted
= true;
2483 /* If the loads and stores can be handled with load/store-lane
2484 instructions record it and move on to the next instance. */
2486 && SLP_INSTANCE_KIND (instance
) == slp_inst_kind_store
2487 && vect_store_lanes_supported (vectype
, group_size
, false))
2489 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance
), i
, load_node
)
2491 stmt_vec_info stmt_vinfo
= DR_GROUP_FIRST_ELEMENT
2492 (SLP_TREE_SCALAR_STMTS (load_node
)[0]);
2493 /* Use SLP for strided accesses (or if we can't
2495 if (STMT_VINFO_STRIDED_P (stmt_vinfo
)
2496 || ! vect_load_lanes_supported
2497 (STMT_VINFO_VECTYPE (stmt_vinfo
),
2498 DR_GROUP_SIZE (stmt_vinfo
), false))
2503 = can_use_lanes
&& i
== SLP_INSTANCE_LOADS (instance
).length ();
2505 if (can_use_lanes
&& dump_enabled_p ())
2506 dump_printf_loc (MSG_NOTE
, vect_location
,
2507 "SLP instance %p can use load/store-lanes\n",
2512 can_use_lanes
= false;
2517 /* If all SLP instances can use load/store-lanes abort SLP and try again
2518 with SLP disabled. */
2521 ok
= opt_result::failure_at (vect_location
,
2522 "Built SLP cancelled: can use "
2523 "load/store-lanes\n");
2524 if (dump_enabled_p ())
2525 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2526 "Built SLP cancelled: all SLP instances support "
2527 "load/store-lanes\n");
2532 /* Dissolve SLP-only groups. */
2533 vect_dissolve_slp_only_groups (loop_vinfo
);
2535 /* Scan all the remaining operations in the loop that are not subject
2536 to SLP and make sure they are vectorizable. */
2537 ok
= vect_analyze_loop_operations (loop_vinfo
);
2540 if (dump_enabled_p ())
2541 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2542 "bad operation or unsupported loop bound.\n");
2546 /* For now, we don't expect to mix both masking and length approaches for one
2547 loop, disable it if both are recorded. */
2548 if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo
)
2549 && !LOOP_VINFO_MASKS (loop_vinfo
).is_empty ()
2550 && !LOOP_VINFO_LENS (loop_vinfo
).is_empty ())
2552 if (dump_enabled_p ())
2553 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2554 "can't vectorize a loop with partial vectors"
2555 " because we don't expect to mix different"
2556 " approaches with partial vectors for the"
2558 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo
) = false;
2561 /* If we still have the option of using partial vectors,
2562 check whether we can generate the necessary loop controls. */
2563 if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo
)
2564 && !vect_verify_full_masking (loop_vinfo
)
2565 && !vect_verify_loop_lens (loop_vinfo
))
2566 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo
) = false;
2568 /* If we're vectorizing an epilogue loop, the vectorized loop either needs
2569 to be able to handle fewer than VF scalars, or needs to have a lower VF
2570 than the main loop. */
2571 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo
)
2572 && !LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo
)
2573 && maybe_ge (LOOP_VINFO_VECT_FACTOR (loop_vinfo
),
2574 LOOP_VINFO_VECT_FACTOR (orig_loop_vinfo
)))
2575 return opt_result::failure_at (vect_location
,
2576 "Vectorization factor too high for"
2577 " epilogue loop.\n");
2579 /* Decide whether this loop_vinfo should use partial vectors or peeling,
2580 assuming that the loop will be used as a main loop. We will redo
2581 this analysis later if we instead decide to use the loop as an
2583 ok
= vect_determine_partial_vectors_and_peeling (loop_vinfo
, false);
2587 /* Check the costings of the loop make vectorizing worthwhile. */
2588 res
= vect_analyze_loop_costing (loop_vinfo
, suggested_unroll_factor
);
2591 ok
= opt_result::failure_at (vect_location
,
2592 "Loop costings may not be worthwhile.\n");
2596 return opt_result::failure_at (vect_location
,
2597 "Loop costings not worthwhile.\n");
2599 /* If an epilogue loop is required make sure we can create one. */
2600 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo
)
2601 || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo
))
2603 if (dump_enabled_p ())
2604 dump_printf_loc (MSG_NOTE
, vect_location
, "epilog loop required\n");
2605 if (!vect_can_advance_ivs_p (loop_vinfo
)
2606 || !slpeel_can_duplicate_loop_p (LOOP_VINFO_LOOP (loop_vinfo
),
2607 single_exit (LOOP_VINFO_LOOP
2610 ok
= opt_result::failure_at (vect_location
,
2611 "not vectorized: can't create required "
2617 /* During peeling, we need to check if number of loop iterations is
2618 enough for both peeled prolog loop and vector loop. This check
2619 can be merged along with threshold check of loop versioning, so
2620 increase threshold for this case if necessary.
2622 If we are analyzing an epilogue we still want to check what its
2623 versioning threshold would be. If we decide to vectorize the epilogues we
2624 will want to use the lowest versioning threshold of all epilogues and main
2625 loop. This will enable us to enter a vectorized epilogue even when
2626 versioning the loop. We can't simply check whether the epilogue requires
2627 versioning though since we may have skipped some versioning checks when
2628 analyzing the epilogue. For instance, checks for alias versioning will be
2629 skipped when dealing with epilogues as we assume we already checked them
2630 for the main loop. So instead we always check the 'orig_loop_vinfo'. */
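/* A hypothetical example of the computation below: with a prologue
   peeling count of 3, a vectorization factor of 8 and peeling for gaps,
   niters_th accumulates to 3 + 8 + 1 = 12; if a runtime profitability
   check also applies with a cost-model threshold th = 20, the versioning
   threshold is raised to 20.  */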
2631 if (LOOP_REQUIRES_VERSIONING (orig_loop_vinfo
))
2633 poly_uint64 niters_th
= 0;
2634 unsigned int th
= LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo
);
2636 if (!vect_use_loop_mask_for_alignment_p (loop_vinfo
))
2638 /* Niters for peeled prolog loop. */
2639 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo
) < 0)
2641 dr_vec_info
*dr_info
= LOOP_VINFO_UNALIGNED_DR (loop_vinfo
);
2642 tree vectype
= STMT_VINFO_VECTYPE (dr_info
->stmt
);
2643 niters_th
+= TYPE_VECTOR_SUBPARTS (vectype
) - 1;
2646 niters_th
+= LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo
);
2649 /* Niters for at least one iteration of vectorized loop. */
2650 if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo
))
2651 niters_th
+= LOOP_VINFO_VECT_FACTOR (loop_vinfo
);
2652 /* One additional iteration because of peeling for gap. */
2653 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo
))
2656 /* Use the same condition as vect_transform_loop to decide when to use
2657 the cost to determine a versioning threshold. */
2658 if (vect_apply_runtime_profitability_check_p (loop_vinfo
)
2659 && ordered_p (th
, niters_th
))
2660 niters_th
= ordered_max (poly_uint64 (th
), niters_th
);
2662 LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo
) = niters_th
;
2665 gcc_assert (known_eq (vectorization_factor
,
2666 LOOP_VINFO_VECT_FACTOR (loop_vinfo
)));
2668 /* Ok to vectorize! */
2669 LOOP_VINFO_VECTORIZABLE_P (loop_vinfo
) = 1;
2670 return opt_result::success ();
2673 /* Ensure that "ok" is false (with an opt_problem if dumping is enabled). */
2676 /* Try again with SLP forced off but if we didn't do any SLP there is
2677 no point in re-trying. */
2681 /* If there are reduction chains re-trying will fail anyway. */
2682 if (! LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo
).is_empty ())
2685 /* Likewise if the grouped loads or stores in the SLP cannot be handled
2686 via interleaving or lane instructions. */
2687 slp_instance instance
;
2690 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo
), i
, instance
)
2692 stmt_vec_info vinfo
;
2693 vinfo
= SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (instance
))[0];
2694 if (! STMT_VINFO_GROUPED_ACCESS (vinfo
))
2696 vinfo
= DR_GROUP_FIRST_ELEMENT (vinfo
);
2697 unsigned int size
= DR_GROUP_SIZE (vinfo
);
2698 tree vectype
= STMT_VINFO_VECTYPE (vinfo
);
2699 if (! vect_store_lanes_supported (vectype
, size
, false)
2700 && ! known_eq (TYPE_VECTOR_SUBPARTS (vectype
), 1U)
2701 && ! vect_grouped_store_supported (vectype
, size
))
2702 return opt_result::failure_at (vinfo
->stmt
,
2703 "unsupported grouped store\n");
2704 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance
), j
, node
)
2706 vinfo
= SLP_TREE_SCALAR_STMTS (node
)[0];
2707 vinfo
= DR_GROUP_FIRST_ELEMENT (vinfo
);
2708 bool single_element_p
= !DR_GROUP_NEXT_ELEMENT (vinfo
);
2709 size
= DR_GROUP_SIZE (vinfo
);
2710 vectype
= STMT_VINFO_VECTYPE (vinfo
);
2711 if (! vect_load_lanes_supported (vectype
, size
, false)
2712 && ! vect_grouped_load_supported (vectype
, single_element_p
,
2714 return opt_result::failure_at (vinfo
->stmt
,
2715 "unsupported grouped load\n");
2719 if (dump_enabled_p ())
2720 dump_printf_loc (MSG_NOTE
, vect_location
,
2721 "re-trying with SLP disabled\n");
2723 /* Roll back state appropriately. No SLP this time. */
2725 /* Restore vectorization factor as it were without SLP. */
2726 LOOP_VINFO_VECT_FACTOR (loop_vinfo
) = saved_vectorization_factor
;
2727 /* Free the SLP instances. */
2728 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo
), j
, instance
)
2729 vect_free_slp_instance (instance
);
2730 LOOP_VINFO_SLP_INSTANCES (loop_vinfo
).release ();
2731 /* Reset SLP type to loop_vect on all stmts. */
2732 for (i
= 0; i
< LOOP_VINFO_LOOP (loop_vinfo
)->num_nodes
; ++i
)
2734 basic_block bb
= LOOP_VINFO_BBS (loop_vinfo
)[i
];
2735 for (gimple_stmt_iterator si
= gsi_start_phis (bb
);
2736 !gsi_end_p (si
); gsi_next (&si
))
2738 stmt_vec_info stmt_info
= loop_vinfo
->lookup_stmt (gsi_stmt (si
));
2739 STMT_SLP_TYPE (stmt_info
) = loop_vect
;
2740 if (STMT_VINFO_DEF_TYPE (stmt_info
) == vect_reduction_def
2741 || STMT_VINFO_DEF_TYPE (stmt_info
) == vect_double_reduction_def
)
2743 /* vectorizable_reduction adjusts reduction stmt def-types,
2744 restore them to that of the PHI. */
2745 STMT_VINFO_DEF_TYPE (STMT_VINFO_REDUC_DEF (stmt_info
))
2746 = STMT_VINFO_DEF_TYPE (stmt_info
);
2747 STMT_VINFO_DEF_TYPE (vect_stmt_to_vectorize
2748 (STMT_VINFO_REDUC_DEF (stmt_info
)))
2749 = STMT_VINFO_DEF_TYPE (stmt_info
);
2752 for (gimple_stmt_iterator si
= gsi_start_bb (bb
);
2753 !gsi_end_p (si
); gsi_next (&si
))
2755 if (is_gimple_debug (gsi_stmt (si
)))
2757 stmt_vec_info stmt_info
= loop_vinfo
->lookup_stmt (gsi_stmt (si
));
2758 STMT_SLP_TYPE (stmt_info
) = loop_vect
;
2759 if (STMT_VINFO_IN_PATTERN_P (stmt_info
))
2761 stmt_vec_info pattern_stmt_info
2762 = STMT_VINFO_RELATED_STMT (stmt_info
);
2763 if (STMT_VINFO_SLP_VECT_ONLY_PATTERN (pattern_stmt_info
))
2764 STMT_VINFO_IN_PATTERN_P (stmt_info
) = false;
2766 gimple
*pattern_def_seq
= STMT_VINFO_PATTERN_DEF_SEQ (stmt_info
);
2767 STMT_SLP_TYPE (pattern_stmt_info
) = loop_vect
;
2768 for (gimple_stmt_iterator pi
= gsi_start (pattern_def_seq
);
2769 !gsi_end_p (pi
); gsi_next (&pi
))
2770 STMT_SLP_TYPE (loop_vinfo
->lookup_stmt (gsi_stmt (pi
)))
2775 /* Free optimized alias test DDRS. */
2776 LOOP_VINFO_LOWER_BOUNDS (loop_vinfo
).truncate (0);
2777 LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo
).release ();
2778 LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo
).release ();
2779 /* Reset target cost data. */
2780 delete loop_vinfo
->vector_costs
;
2781 loop_vinfo
->vector_costs
= nullptr;
2782 /* Reset accumulated rgroup information. */
2783 release_vec_loop_controls (&LOOP_VINFO_MASKS (loop_vinfo
));
2784 release_vec_loop_controls (&LOOP_VINFO_LENS (loop_vinfo
));
2785 /* Reset assorted flags. */
2786 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo
) = false;
2787 LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo
) = false;
2788 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo
) = 0;
2789 LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo
) = 0;
2790 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo
)
2791 = saved_can_use_partial_vectors_p
;
/* Return true if vectorizing a loop using NEW_LOOP_VINFO appears
   to be better than vectorizing it using OLD_LOOP_VINFO.  Assume that
   OLD_LOOP_VINFO is better unless something specifically indicates
   otherwise.

   Note that this deliberately isn't a partial order.  */
static bool
vect_better_loop_vinfo_p (loop_vec_info new_loop_vinfo,
                          loop_vec_info old_loop_vinfo)
{
  struct loop *loop = LOOP_VINFO_LOOP (new_loop_vinfo);
  gcc_assert (LOOP_VINFO_LOOP (old_loop_vinfo) == loop);

  poly_int64 new_vf = LOOP_VINFO_VECT_FACTOR (new_loop_vinfo);
  poly_int64 old_vf = LOOP_VINFO_VECT_FACTOR (old_loop_vinfo);

  /* Always prefer a VF of loop->simdlen over any other VF.  */
  if (loop->simdlen)
    {
      bool new_simdlen_p = known_eq (new_vf, loop->simdlen);
      bool old_simdlen_p = known_eq (old_vf, loop->simdlen);
      if (new_simdlen_p != old_simdlen_p)
        return new_simdlen_p;
    }

  const auto *old_costs = old_loop_vinfo->vector_costs;
  const auto *new_costs = new_loop_vinfo->vector_costs;
  if (loop_vec_info main_loop = LOOP_VINFO_ORIG_LOOP_INFO (old_loop_vinfo))
    return new_costs->better_epilogue_loop_than_p (old_costs, main_loop);

  return new_costs->better_main_loop_than_p (old_costs);
}
/* Decide whether to replace OLD_LOOP_VINFO with NEW_LOOP_VINFO.  Return
   true if we should.  */

static bool
vect_joust_loop_vinfos (loop_vec_info new_loop_vinfo,
                        loop_vec_info old_loop_vinfo)
{
  if (!vect_better_loop_vinfo_p (new_loop_vinfo, old_loop_vinfo))
    return false;

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
                     "***** Preferring vector mode %s to vector mode %s\n",
                     GET_MODE_NAME (new_loop_vinfo->vector_mode),
                     GET_MODE_NAME (old_loop_vinfo->vector_mode));

  return true;
}
2848 /* Analyze LOOP with VECTOR_MODES[MODE_I] and as epilogue if MAIN_LOOP_VINFO is
2849 not NULL. Set AUTODETECTED_VECTOR_MODE if VOIDmode and advance
2850 MODE_I to the next mode useful to analyze.
2851 Return the loop_vinfo on success and wrapped null on failure. */
2853 static opt_loop_vec_info
2854 vect_analyze_loop_1 (class loop
*loop
, vec_info_shared
*shared
,
2855 const vect_loop_form_info
*loop_form_info
,
2856 loop_vec_info main_loop_vinfo
,
2857 const vector_modes
&vector_modes
, unsigned &mode_i
,
2858 machine_mode
&autodetected_vector_mode
,
2861 loop_vec_info loop_vinfo
2862 = vect_create_loop_vinfo (loop
, shared
, loop_form_info
, main_loop_vinfo
);
2864 machine_mode vector_mode
= vector_modes
[mode_i
];
2865 loop_vinfo
->vector_mode
= vector_mode
;
2866 unsigned int suggested_unroll_factor
= 1;
2868 /* Run the main analysis. */
2869 opt_result res
= vect_analyze_loop_2 (loop_vinfo
, fatal
,
2870 &suggested_unroll_factor
);
2871 if (dump_enabled_p ())
2872 dump_printf_loc (MSG_NOTE
, vect_location
,
2873 "***** Analysis %s with vector mode %s\n",
2874 res
? "succeeded" : " failed",
2875 GET_MODE_NAME (loop_vinfo
->vector_mode
));
2877 if (!main_loop_vinfo
&& suggested_unroll_factor
> 1)
2879 if (dump_enabled_p ())
2880 dump_printf_loc (MSG_NOTE
, vect_location
,
2881 "***** Re-trying analysis for unrolling"
2882 " with unroll factor %d.\n",
2883 suggested_unroll_factor
);
2884 loop_vec_info unroll_vinfo
2885 = vect_create_loop_vinfo (loop
, shared
, loop_form_info
, main_loop_vinfo
);
2886 unroll_vinfo
->vector_mode
= vector_mode
;
2887 unroll_vinfo
->suggested_unroll_factor
= suggested_unroll_factor
;
2888 opt_result new_res
= vect_analyze_loop_2 (unroll_vinfo
, fatal
, NULL
);
2892 loop_vinfo
= unroll_vinfo
;
2895 delete unroll_vinfo
;
2898 /* Remember the autodetected vector mode. */
2899 if (vector_mode
== VOIDmode
)
2900 autodetected_vector_mode
= loop_vinfo
->vector_mode
;
2902 /* Advance mode_i, first skipping modes that would result in the
2903 same analysis result. */
2904 while (mode_i
+ 1 < vector_modes
.length ()
2905 && vect_chooses_same_modes_p (loop_vinfo
,
2906 vector_modes
[mode_i
+ 1]))
2908 if (dump_enabled_p ())
2909 dump_printf_loc (MSG_NOTE
, vect_location
,
2910 "***** The result for vector mode %s would"
2912 GET_MODE_NAME (vector_modes
[mode_i
+ 1]));
2915 if (mode_i
+ 1 < vector_modes
.length ()
2916 && VECTOR_MODE_P (autodetected_vector_mode
)
2917 && (related_vector_mode (vector_modes
[mode_i
+ 1],
2918 GET_MODE_INNER (autodetected_vector_mode
))
2919 == autodetected_vector_mode
)
2920 && (related_vector_mode (autodetected_vector_mode
,
2921 GET_MODE_INNER (vector_modes
[mode_i
+ 1]))
2922 == vector_modes
[mode_i
+ 1]))
2924 if (dump_enabled_p ())
2925 dump_printf_loc (MSG_NOTE
, vect_location
,
2926 "***** Skipping vector mode %s, which would"
2927 " repeat the analysis for %s\n",
2928 GET_MODE_NAME (vector_modes
[mode_i
+ 1]),
2929 GET_MODE_NAME (autodetected_vector_mode
));
2938 gcc_checking_assert (main_loop_vinfo
== NULL
);
2939 return opt_loop_vec_info::propagate_failure (res
);
2942 return opt_loop_vec_info::success (loop_vinfo
);
2945 /* Function vect_analyze_loop.
2947 Apply a set of analyses on LOOP, and create a loop_vec_info struct
2948 for it. The different analyses will record information in the
2949 loop_vec_info struct. */
2951 vect_analyze_loop (class loop
*loop
, vec_info_shared
*shared
)
2953 DUMP_VECT_SCOPE ("analyze_loop_nest");
2955 if (loop_outer (loop
)
2956 && loop_vec_info_for_loop (loop_outer (loop
))
2957 && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop
))))
2958 return opt_loop_vec_info::failure_at (vect_location
,
2959 "outer-loop already vectorized.\n");
2961 if (!find_loop_nest (loop
, &shared
->loop_nest
))
2962 return opt_loop_vec_info::failure_at
2964 "not vectorized: loop nest containing two or more consecutive inner"
2965 " loops cannot be vectorized\n");
2967 /* Analyze the loop form. */
2968 vect_loop_form_info loop_form_info
;
2969 opt_result res
= vect_analyze_loop_form (loop
, &loop_form_info
);
2972 if (dump_enabled_p ())
2973 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2974 "bad loop form.\n");
2975 return opt_loop_vec_info::propagate_failure (res
);
2977 if (!integer_onep (loop_form_info
.assumptions
))
2979 /* We consider to vectorize this loop by versioning it under
2980 some assumptions. In order to do this, we need to clear
2981 existing information computed by scev and niter analyzer. */
2983 free_numbers_of_iterations_estimates (loop
);
2984 /* Also set flag for this loop so that following scev and niter
2985 analysis are done under the assumptions. */
2986 loop_constraint_set (loop
, LOOP_C_FINITE
);
2989 auto_vector_modes vector_modes
;
2990 /* Autodetect first vector size we try. */
2991 vector_modes
.safe_push (VOIDmode
);
2992 unsigned int autovec_flags
2993 = targetm
.vectorize
.autovectorize_vector_modes (&vector_modes
,
2994 loop
->simdlen
!= 0);
2995 bool pick_lowest_cost_p
= ((autovec_flags
& VECT_COMPARE_COSTS
)
2996 && !unlimited_cost_model (loop
));
2997 machine_mode autodetected_vector_mode
= VOIDmode
;
2998 opt_loop_vec_info first_loop_vinfo
= opt_loop_vec_info::success (NULL
);
2999 unsigned int mode_i
= 0;
3000 unsigned HOST_WIDE_INT simdlen
= loop
->simdlen
;
3002 /* Keep track of the VF for each mode. Initialize all to 0 which indicates
3003 a mode has not been analyzed. */
3004 auto_vec
<poly_uint64
, 8> cached_vf_per_mode
;
3005 for (unsigned i
= 0; i
< vector_modes
.length (); ++i
)
3006 cached_vf_per_mode
.safe_push (0);
3008 /* First determine the main loop vectorization mode, either the first
3009 one that works, starting with auto-detecting the vector mode and then
3010 following the targets order of preference, or the one with the
3011 lowest cost if pick_lowest_cost_p. */
3015 unsigned int last_mode_i
= mode_i
;
3016 /* Set cached VF to -1 prior to analysis, which indicates a mode has
3018 cached_vf_per_mode
[last_mode_i
] = -1;
3019 opt_loop_vec_info loop_vinfo
3020 = vect_analyze_loop_1 (loop
, shared
, &loop_form_info
,
3021 NULL
, vector_modes
, mode_i
,
3022 autodetected_vector_mode
, fatal
);
/* Analysis has been successful so update the VF value.  The
3029 VF should always be a multiple of unroll_factor and we want to
3030 capture the original VF here. */
3031 cached_vf_per_mode
[last_mode_i
]
3032 = exact_div (LOOP_VINFO_VECT_FACTOR (loop_vinfo
),
3033 loop_vinfo
->suggested_unroll_factor
);
3034 /* Once we hit the desired simdlen for the first time,
3035 discard any previous attempts. */
3037 && known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo
), simdlen
))
3039 delete first_loop_vinfo
;
3040 first_loop_vinfo
= opt_loop_vec_info::success (NULL
);
3043 else if (pick_lowest_cost_p
3045 && vect_joust_loop_vinfos (loop_vinfo
, first_loop_vinfo
))
3047 /* Pick loop_vinfo over first_loop_vinfo. */
3048 delete first_loop_vinfo
;
3049 first_loop_vinfo
= opt_loop_vec_info::success (NULL
);
3051 if (first_loop_vinfo
== NULL
)
3052 first_loop_vinfo
= loop_vinfo
;
3056 loop_vinfo
= opt_loop_vec_info::success (NULL
);
3059 /* Commit to first_loop_vinfo if we have no reason to try
3061 if (!simdlen
&& !pick_lowest_cost_p
)
3064 if (mode_i
== vector_modes
.length ()
3065 || autodetected_vector_mode
== VOIDmode
)
3068 /* Try the next biggest vector size. */
3069 if (dump_enabled_p ())
3070 dump_printf_loc (MSG_NOTE
, vect_location
,
3071 "***** Re-trying analysis with vector mode %s\n",
3072 GET_MODE_NAME (vector_modes
[mode_i
]));
3074 if (!first_loop_vinfo
)
3075 return opt_loop_vec_info::propagate_failure (res
);
3077 if (dump_enabled_p ())
3078 dump_printf_loc (MSG_NOTE
, vect_location
,
3079 "***** Choosing vector mode %s\n",
3080 GET_MODE_NAME (first_loop_vinfo
->vector_mode
));
3082 /* Only vectorize epilogues if PARAM_VECT_EPILOGUES_NOMASK is
3083 enabled, SIMDUID is not set, it is the innermost loop and we have
3084 either already found the loop's SIMDLEN or there was no SIMDLEN to
3086 TODO: Enable epilogue vectorization for loops with SIMDUID set. */
3087 bool vect_epilogues
= (!simdlen
3088 && loop
->inner
== NULL
3089 && param_vect_epilogues_nomask
3090 && LOOP_VINFO_PEELING_FOR_NITER (first_loop_vinfo
)
3092 if (!vect_epilogues
)
3093 return first_loop_vinfo
;
3095 /* Now analyze first_loop_vinfo for epilogue vectorization. */
3096 poly_uint64 lowest_th
= LOOP_VINFO_VERSIONING_THRESHOLD (first_loop_vinfo
);
3098 /* For epilogues start the analysis from the first mode. The motivation
3099 behind starting from the beginning comes from cases where the VECTOR_MODES
3100 array may contain length-agnostic and length-specific modes. Their
3101 ordering is not guaranteed, so we could end up picking a mode for the main
3102 loop that is after the epilogue's optimal mode. */
3103 vector_modes
[0] = autodetected_vector_mode
;
3106 bool supports_partial_vectors
=
3107 partial_vectors_supported_p () && param_vect_partial_vector_usage
!= 0;
3108 poly_uint64 first_vinfo_vf
= LOOP_VINFO_VECT_FACTOR (first_loop_vinfo
);
3112 /* If the target does not support partial vectors we can shorten the
3113 number of modes to analyze for the epilogue as we know we can't pick a
3114 mode that would lead to a VF at least as big as the
3116 if (!supports_partial_vectors
3117 && maybe_ge (cached_vf_per_mode
[mode_i
], first_vinfo_vf
))
3120 if (mode_i
== vector_modes
.length ())
3125 if (dump_enabled_p ())
3126 dump_printf_loc (MSG_NOTE
, vect_location
,
3127 "***** Re-trying epilogue analysis with vector "
3128 "mode %s\n", GET_MODE_NAME (vector_modes
[mode_i
]));
3131 opt_loop_vec_info loop_vinfo
3132 = vect_analyze_loop_1 (loop
, shared
, &loop_form_info
,
3134 vector_modes
, mode_i
,
3135 autodetected_vector_mode
, fatal
);
3141 if (pick_lowest_cost_p
)
3143 /* Keep trying to roll back vectorization attempts while the
3144 loop_vec_infos they produced were worse than this one. */
3145 vec
<loop_vec_info
> &vinfos
= first_loop_vinfo
->epilogue_vinfos
;
3146 while (!vinfos
.is_empty ()
3147 && vect_joust_loop_vinfos (loop_vinfo
, vinfos
.last ()))
3149 gcc_assert (vect_epilogues
);
3150 delete vinfos
.pop ();
3153 /* For now only allow one epilogue loop. */
3154 if (first_loop_vinfo
->epilogue_vinfos
.is_empty ())
3156 first_loop_vinfo
->epilogue_vinfos
.safe_push (loop_vinfo
);
3157 poly_uint64 th
= LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo
);
3158 gcc_assert (!LOOP_REQUIRES_VERSIONING (loop_vinfo
)
3159 || maybe_ne (lowest_th
, 0U));
3160 /* Keep track of the known smallest versioning
3162 if (ordered_p (lowest_th
, th
))
3163 lowest_th
= ordered_min (lowest_th
, th
);
3168 loop_vinfo
= opt_loop_vec_info::success (NULL
);
3171 /* For now only allow one epilogue loop, but allow
3172 pick_lowest_cost_p to replace it, so commit to the
3173 first epilogue if we have no reason to try alternatives. */
3174 if (!pick_lowest_cost_p
)
3178 if (mode_i
== vector_modes
.length ())
3183 if (!first_loop_vinfo
->epilogue_vinfos
.is_empty ())
3185 LOOP_VINFO_VERSIONING_THRESHOLD (first_loop_vinfo
) = lowest_th
;
3186 if (dump_enabled_p ())
3187 dump_printf_loc (MSG_NOTE
, vect_location
,
3188 "***** Choosing epilogue vector mode %s\n",
3190 (first_loop_vinfo
->epilogue_vinfos
[0]->vector_mode
));
3193 return first_loop_vinfo
;
/* Return true if there is an in-order reduction function for CODE, storing
   it in *REDUC_FN if so.  */

static bool
fold_left_reduction_fn (code_helper code, internal_fn *reduc_fn)
{
  if (code == PLUS_EXPR)
    {
      *reduc_fn = IFN_FOLD_LEFT_PLUS;
      return true;
    }
  return false;
}
/* Function reduction_fn_for_scalar_code

   Input:
   CODE - tree_code of a reduction operation.

   Output:
   REDUC_FN - the corresponding internal function to be used to reduce the
      vector of partial results into a single scalar result, or IFN_LAST
      if the operation is a supported reduction operation, but does not have
      such an internal function.

   Return FALSE if CODE currently cannot be vectorized as reduction.  */
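/* For example (illustrative only), for a sum reduction the scalar
   PLUS_EXPR maps to IFN_REDUC_PLUS, which adds all lanes of the final
   vector accumulator into one scalar:

     int sum = 0;
     for (i = 0; i < N; i++)
       sum += a[i];      // lane-wise vector adds + IFN_REDUC_PLUS at the end

   whereas a MIN_EXPR reduction maps to IFN_REDUC_MIN, and so on.  */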
3224 reduction_fn_for_scalar_code (code_helper code
, internal_fn
*reduc_fn
)
3226 if (code
.is_tree_code ())
3227 switch (tree_code (code
))
3230 *reduc_fn
= IFN_REDUC_MAX
;
3234 *reduc_fn
= IFN_REDUC_MIN
;
3238 *reduc_fn
= IFN_REDUC_PLUS
;
3242 *reduc_fn
= IFN_REDUC_AND
;
3246 *reduc_fn
= IFN_REDUC_IOR
;
3250 *reduc_fn
= IFN_REDUC_XOR
;
3255 *reduc_fn
= IFN_LAST
;
3262 switch (combined_fn (code
))
3265 *reduc_fn
= IFN_REDUC_FMAX
;
3269 *reduc_fn
= IFN_REDUC_FMIN
;
/* If there is a neutral value X such that a reduction would not be affected
   by the introduction of additional X elements, return that X, otherwise
   return null.  CODE is the code of the reduction and SCALAR_TYPE is type
   of the scalar elements.  If the reduction has just a single initial value
   then INITIAL_VALUE is that value, otherwise it is null.  */
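/* Illustrative examples (matching the cases handled below): the neutral
   element is 0 for PLUS_EXPR (x + 0 == x), 1 for MULT_EXPR, and all-ones
   for BIT_AND_EXPR; for MIN_EXPR/MAX_EXPR there is no universal neutral
   value, so the single initial value itself is used when available.  */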
3284 neutral_op_for_reduction (tree scalar_type
, code_helper code
,
3287 if (code
.is_tree_code ())
3288 switch (tree_code (code
))
3290 case WIDEN_SUM_EXPR
:
3297 return build_zero_cst (scalar_type
);
3300 return build_one_cst (scalar_type
);
3303 return build_all_ones_cst (scalar_type
);
3307 return initial_value
;
3313 switch (combined_fn (code
))
3317 return initial_value
;
3324 /* Error reporting helper for vect_is_simple_reduction below. GIMPLE statement
3325 STMT is printed with a message MSG. */
3328 report_vect_op (dump_flags_t msg_type
, gimple
*stmt
, const char *msg
)
3330 dump_printf_loc (msg_type
, vect_location
, "%s%G", msg
, stmt
);
/* Return true if we need an in-order reduction for operation CODE
   on type TYPE.  */
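/* A hypothetical motivating example: in IEEE float arithmetic
   (x + 1e38f) + -1e38f is not the same as x + (1e38f + -1e38f), so a
   float sum reduction cannot be reassociated across vector lanes unless
   -fassociative-math is in effect; it has to be performed in-order
   (fold-left) instead.  */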
static bool
needs_fold_left_reduction_p (tree type, code_helper code)
{
  /* CHECKME: check for !flag_finite_math_only too?  */
  if (SCALAR_FLOAT_TYPE_P (type))
    {
      if (code.is_tree_code ())
        switch (tree_code (code))
          {
          case MIN_EXPR:
          case MAX_EXPR:
            return false;

          default:
            return !flag_associative_math;
          }
      else
        switch (combined_fn (code))
          {
          CASE_CFN_FMIN:
          CASE_CFN_FMAX:
            return false;

          default:
            return !flag_associative_math;
          }
    }

  if (INTEGRAL_TYPE_P (type))
    return (!code.is_tree_code ()
            || !operation_no_trapping_overflow (type, tree_code (code)));

  if (SAT_FIXED_POINT_TYPE_P (type))
    return true;

  return false;
}
3375 /* Return true if the reduction PHI in LOOP with latch arg LOOP_ARG and
3376 has a handled computation expression. Store the main reduction
3377 operation in *CODE. */
3380 check_reduction_path (dump_user_location_t loc
, loop_p loop
, gphi
*phi
,
3381 tree loop_arg
, code_helper
*code
,
3382 vec
<std::pair
<ssa_op_iter
, use_operand_p
> > &path
)
3384 auto_bitmap visited
;
3385 tree lookfor
= PHI_RESULT (phi
);
3387 use_operand_p curr
= op_iter_init_phiuse (&curri
, phi
, SSA_OP_USE
);
3388 while (USE_FROM_PTR (curr
) != loop_arg
)
3389 curr
= op_iter_next_use (&curri
);
3390 curri
.i
= curri
.numops
;
3393 path
.safe_push (std::make_pair (curri
, curr
));
3394 tree use
= USE_FROM_PTR (curr
);
3397 gimple
*def
= SSA_NAME_DEF_STMT (use
);
3398 if (gimple_nop_p (def
)
3399 || ! flow_bb_inside_loop_p (loop
, gimple_bb (def
)))
3404 std::pair
<ssa_op_iter
, use_operand_p
> x
= path
.pop ();
3408 curr
= op_iter_next_use (&curri
);
3409 /* Skip already visited or non-SSA operands (from iterating
3411 while (curr
!= NULL_USE_OPERAND_P
3412 && (TREE_CODE (USE_FROM_PTR (curr
)) != SSA_NAME
3413 || ! bitmap_set_bit (visited
,
3415 (USE_FROM_PTR (curr
)))));
3417 while (curr
== NULL_USE_OPERAND_P
&& ! path
.is_empty ());
3418 if (curr
== NULL_USE_OPERAND_P
)
3423 if (gimple_code (def
) == GIMPLE_PHI
)
3424 curr
= op_iter_init_phiuse (&curri
, as_a
<gphi
*>(def
), SSA_OP_USE
);
3426 curr
= op_iter_init_use (&curri
, def
, SSA_OP_USE
);
3427 while (curr
!= NULL_USE_OPERAND_P
3428 && (TREE_CODE (USE_FROM_PTR (curr
)) != SSA_NAME
3429 || ! bitmap_set_bit (visited
,
3431 (USE_FROM_PTR (curr
)))))
3432 curr
= op_iter_next_use (&curri
);
3433 if (curr
== NULL_USE_OPERAND_P
)
3438 if (dump_file
&& (dump_flags
& TDF_DETAILS
))
3440 dump_printf_loc (MSG_NOTE
, loc
, "reduction path: ");
3442 std::pair
<ssa_op_iter
, use_operand_p
> *x
;
3443 FOR_EACH_VEC_ELT (path
, i
, x
)
3444 dump_printf (MSG_NOTE
, "%T ", USE_FROM_PTR (x
->second
));
3445 dump_printf (MSG_NOTE
, "\n");
3448 /* Check whether the reduction path detected is valid. */
3449 bool fail
= path
.length () == 0;
3453 for (unsigned i
= 1; i
< path
.length (); ++i
)
3455 gimple
*use_stmt
= USE_STMT (path
[i
].second
);
3457 if (!gimple_extract_op (use_stmt
, &op
))
3462 unsigned int opi
= op
.num_ops
;
3463 if (gassign
*assign
= dyn_cast
<gassign
*> (use_stmt
))
3465 /* The following make sure we can compute the operand index
3466 easily plus it mostly disallows chaining via COND_EXPR condition
3468 for (opi
= 0; opi
< op
.num_ops
; ++opi
)
3469 if (gimple_assign_rhs1_ptr (assign
) + opi
== path
[i
].second
->use
)
3472 else if (gcall
*call
= dyn_cast
<gcall
*> (use_stmt
))
3474 for (opi
= 0; opi
< op
.num_ops
; ++opi
)
3475 if (gimple_call_arg_ptr (call
, opi
) == path
[i
].second
->use
)
3478 if (opi
== op
.num_ops
)
3483 op
.code
= canonicalize_code (op
.code
, op
.type
);
3484 if (op
.code
== MINUS_EXPR
)
3486 op
.code
= PLUS_EXPR
;
3487 /* Track whether we negate the reduction value each iteration. */
3488 if (op
.ops
[1] == op
.ops
[opi
])
3491 if (CONVERT_EXPR_CODE_P (op
.code
)
3492 && tree_nop_conversion_p (op
.type
, TREE_TYPE (op
.ops
[0])))
3494 else if (*code
== ERROR_MARK
)
3497 sign
= TYPE_SIGN (op
.type
);
3499 else if (op
.code
!= *code
)
3504 else if ((op
.code
== MIN_EXPR
3505 || op
.code
== MAX_EXPR
)
3506 && sign
!= TYPE_SIGN (op
.type
))
3511 /* Check there's only a single stmt the op is used on. For the
3512 not value-changing tail and the last stmt allow out-of-loop uses.
3513 ??? We could relax this and handle arbitrary live stmts by
3514 forcing a scalar epilogue for example. */
3515 imm_use_iterator imm_iter
;
3516 gimple
*op_use_stmt
;
3518 FOR_EACH_IMM_USE_STMT (op_use_stmt
, imm_iter
, op
.ops
[opi
])
3519 if (!is_gimple_debug (op_use_stmt
)
3520 && (*code
!= ERROR_MARK
3521 || flow_bb_inside_loop_p (loop
, gimple_bb (op_use_stmt
))))
3523 /* We want to allow x + x but not x < 1 ? x : 2. */
3524 if (is_gimple_assign (op_use_stmt
)
3525 && gimple_assign_rhs_code (op_use_stmt
) == COND_EXPR
)
3527 use_operand_p use_p
;
3528 FOR_EACH_IMM_USE_ON_STMT (use_p
, imm_iter
)
3540 return ! fail
&& ! neg
&& *code
!= ERROR_MARK
;
3544 check_reduction_path (dump_user_location_t loc
, loop_p loop
, gphi
*phi
,
3545 tree loop_arg
, enum tree_code code
)
3547 auto_vec
<std::pair
<ssa_op_iter
, use_operand_p
> > path
;
3549 return (check_reduction_path (loc
, loop
, phi
, loop_arg
, &code_
, path
)
/* Function vect_is_simple_reduction

   (1) Detect a cross-iteration def-use cycle that represents a simple
   reduction computation.  We look for the following pattern:

   loop_header:
     a1 = phi < a0, a2 >
     a3 = ...
     a2 = operation (a3, a1)

   or

   a3 = ...
   loop_header:
     a1 = phi < a0, a2 >
     a2 = operation (a3, a1)

   such that:
   1. operation is commutative and associative and it is safe to
      change the order of the computation
   2. no uses for a2 in the loop (a2 is used out of the loop)
   3. no uses of a1 in the loop besides the reduction operation
   4. no uses of a1 outside the loop.

   Conditions 1,4 are tested here.
   Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.

   (2) Detect a cross-iteration def-use cycle in nested loops, i.e.,
   nested cycles.

   (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double
   reductions:

     a1 = phi < a0, a2 >
     inner loop (def of a3)
     a2 = phi < a3 >

   (4) Detect condition expressions, ie:
     for (int i = 0; i < N; i++)
       if (a[i] < val)
         ret_val = a[i];

*/
3599 static stmt_vec_info
3600 vect_is_simple_reduction (loop_vec_info loop_info
, stmt_vec_info phi_info
,
3601 bool *double_reduc
, bool *reduc_chain_p
)
3603 gphi
*phi
= as_a
<gphi
*> (phi_info
->stmt
);
3604 gimple
*phi_use_stmt
= NULL
;
3605 imm_use_iterator imm_iter
;
3606 use_operand_p use_p
;
3608 *double_reduc
= false;
3609 *reduc_chain_p
= false;
3610 STMT_VINFO_REDUC_TYPE (phi_info
) = TREE_CODE_REDUCTION
;
3612 tree phi_name
= PHI_RESULT (phi
);
3613 /* ??? If there are no uses of the PHI result the inner loop reduction
3614 won't be detected as possibly double-reduction by vectorizable_reduction
3615 because that tries to walk the PHI arg from the preheader edge which
3616 can be constant. See PR60382. */
3617 if (has_zero_uses (phi_name
))
3619 class loop
*loop
= (gimple_bb (phi
))->loop_father
;
3620 unsigned nphi_def_loop_uses
= 0;
3621 FOR_EACH_IMM_USE_FAST (use_p
, imm_iter
, phi_name
)
3623 gimple
*use_stmt
= USE_STMT (use_p
);
3624 if (is_gimple_debug (use_stmt
))
3627 if (!flow_bb_inside_loop_p (loop
, gimple_bb (use_stmt
)))
3629 if (dump_enabled_p ())
3630 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3631 "intermediate value used outside loop.\n");
3636 nphi_def_loop_uses
++;
3637 phi_use_stmt
= use_stmt
;
3640 tree latch_def
= PHI_ARG_DEF_FROM_EDGE (phi
, loop_latch_edge (loop
));
3641 if (TREE_CODE (latch_def
) != SSA_NAME
)
3643 if (dump_enabled_p ())
3644 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3645 "reduction: not ssa_name: %T\n", latch_def
);
3649 stmt_vec_info def_stmt_info
= loop_info
->lookup_def (latch_def
);
3651 || !flow_bb_inside_loop_p (loop
, gimple_bb (def_stmt_info
->stmt
)))
3654 bool nested_in_vect_loop
3655 = flow_loop_nested_p (LOOP_VINFO_LOOP (loop_info
), loop
);
3656 unsigned nlatch_def_loop_uses
= 0;
3657 auto_vec
<gphi
*, 3> lcphis
;
3658 bool inner_loop_of_double_reduc
= false;
3659 FOR_EACH_IMM_USE_FAST (use_p
, imm_iter
, latch_def
)
3661 gimple
*use_stmt
= USE_STMT (use_p
);
3662 if (is_gimple_debug (use_stmt
))
3664 if (flow_bb_inside_loop_p (loop
, gimple_bb (use_stmt
)))
3665 nlatch_def_loop_uses
++;
3668 /* We can have more than one loop-closed PHI. */
3669 lcphis
.safe_push (as_a
<gphi
*> (use_stmt
));
3670 if (nested_in_vect_loop
3671 && (STMT_VINFO_DEF_TYPE (loop_info
->lookup_stmt (use_stmt
))
3672 == vect_double_reduction_def
))
3673 inner_loop_of_double_reduc
= true;
3677 /* If we are vectorizing an inner reduction we are executing that
3678 in the original order only in case we are not dealing with a
3679 double reduction. */
3680 if (nested_in_vect_loop
&& !inner_loop_of_double_reduc
)
3682 if (dump_enabled_p ())
3683 report_vect_op (MSG_NOTE
, def_stmt_info
->stmt
,
3684 "detected nested cycle: ");
3685 return def_stmt_info
;
3688 /* When the inner loop of a double reduction ends up with more than
3689 one loop-closed PHI we have failed to classify alternate such
3690 PHIs as double reduction, leading to wrong code. See PR103237. */
3691 if (inner_loop_of_double_reduc
&& lcphis
.length () != 1)
3693 if (dump_enabled_p ())
3694 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
"unhandled double reduction\n");
3699 /* If this isn't a nested cycle or if the nested cycle reduction value
is used outside of the inner loop we cannot handle uses of the reduction
3702 if (nlatch_def_loop_uses
> 1 || nphi_def_loop_uses
> 1)
3704 if (dump_enabled_p ())
3705 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3706 "reduction used in loop.\n");
3710 /* If DEF_STMT is a phi node itself, we expect it to have a single argument
3711 defined in the inner loop. */
3712 if (gphi
*def_stmt
= dyn_cast
<gphi
*> (def_stmt_info
->stmt
))
3714 tree op1
= PHI_ARG_DEF (def_stmt
, 0);
3715 if (gimple_phi_num_args (def_stmt
) != 1
3716 || TREE_CODE (op1
) != SSA_NAME
)
3718 if (dump_enabled_p ())
3719 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3720 "unsupported phi node definition.\n");
3725 gimple
*def1
= SSA_NAME_DEF_STMT (op1
);
3726 if (gimple_bb (def1
)
3727 && flow_bb_inside_loop_p (loop
, gimple_bb (def_stmt
))
3729 && flow_bb_inside_loop_p (loop
->inner
, gimple_bb (def1
))
3730 && (is_gimple_assign (def1
) || is_gimple_call (def1
))
3731 && is_a
<gphi
*> (phi_use_stmt
)
3732 && flow_bb_inside_loop_p (loop
->inner
, gimple_bb (phi_use_stmt
)))
3734 if (dump_enabled_p ())
3735 report_vect_op (MSG_NOTE
, def_stmt
,
3736 "detected double reduction: ");
3738 *double_reduc
= true;
3739 return def_stmt_info
;
/* Look for the expression computing latch_def from the loop PHI result.  */
3746 auto_vec
<std::pair
<ssa_op_iter
, use_operand_p
> > path
;
3748 if (check_reduction_path (vect_location
, loop
, phi
, latch_def
, &code
,
3751 STMT_VINFO_REDUC_CODE (phi_info
) = code
;
3752 if (code
== COND_EXPR
&& !nested_in_vect_loop
)
3753 STMT_VINFO_REDUC_TYPE (phi_info
) = COND_REDUCTION
;
3755 /* Fill in STMT_VINFO_REDUC_IDX and gather stmts for an SLP
3756 reduction chain for which the additional restriction is that
3757 all operations in the chain are the same. */
3758 auto_vec
<stmt_vec_info
, 8> reduc_chain
;
3760 bool is_slp_reduc
= !nested_in_vect_loop
&& code
!= COND_EXPR
;
3761 for (i
= path
.length () - 1; i
>= 1; --i
)
3763 gimple
*stmt
= USE_STMT (path
[i
].second
);
3764 stmt_vec_info stmt_info
= loop_info
->lookup_stmt (stmt
);
3766 if (!gimple_extract_op (stmt
, &op
))
3768 if (gassign
*assign
= dyn_cast
<gassign
*> (stmt
))
3769 STMT_VINFO_REDUC_IDX (stmt_info
)
3770 = path
[i
].second
->use
- gimple_assign_rhs1_ptr (assign
);
3773 gcall
*call
= as_a
<gcall
*> (stmt
);
3774 STMT_VINFO_REDUC_IDX (stmt_info
)
3775 = path
[i
].second
->use
- gimple_call_arg_ptr (call
, 0);
3777 bool leading_conversion
= (CONVERT_EXPR_CODE_P (op
.code
)
3778 && (i
== 1 || i
== path
.length () - 1));
3779 if ((op
.code
!= code
&& !leading_conversion
)
3780 /* We can only handle the final value in epilogue
3781 generation for reduction chains. */
3782 || (i
!= 1 && !has_single_use (gimple_get_lhs (stmt
))))
3783 is_slp_reduc
= false;
3784 /* For reduction chains we support a trailing/leading
3785 conversions. We do not store those in the actual chain. */
3786 if (leading_conversion
)
3788 reduc_chain
.safe_push (stmt_info
);
3790 if (is_slp_reduc
&& reduc_chain
.length () > 1)
3792 for (unsigned i
= 0; i
< reduc_chain
.length () - 1; ++i
)
3794 REDUC_GROUP_FIRST_ELEMENT (reduc_chain
[i
]) = reduc_chain
[0];
3795 REDUC_GROUP_NEXT_ELEMENT (reduc_chain
[i
]) = reduc_chain
[i
+1];
3797 REDUC_GROUP_FIRST_ELEMENT (reduc_chain
.last ()) = reduc_chain
[0];
3798 REDUC_GROUP_NEXT_ELEMENT (reduc_chain
.last ()) = NULL
;
3800 /* Save the chain for further analysis in SLP detection. */
3801 LOOP_VINFO_REDUCTION_CHAINS (loop_info
).safe_push (reduc_chain
[0]);
3802 REDUC_GROUP_SIZE (reduc_chain
[0]) = reduc_chain
.length ();
3804 *reduc_chain_p
= true;
3805 if (dump_enabled_p ())
3806 dump_printf_loc (MSG_NOTE
, vect_location
,
3807 "reduction: detected reduction chain\n");
3809 else if (dump_enabled_p ())
3810 dump_printf_loc (MSG_NOTE
, vect_location
,
3811 "reduction: detected reduction\n");
3813 return def_stmt_info
;
3816 if (dump_enabled_p ())
3817 dump_printf_loc (MSG_NOTE
, vect_location
,
3818 "reduction: unknown pattern\n");
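/* An illustrative example (made up here, not tied to any particular loop)
   of the reduction chain case detected above:

       s_1 = phi <s_0, s_3>
       s_2 = s_1 + a[i];
       s_3 = s_2 + b[i];

   Both adds use the same CODE and each intermediate result has a single
   use, so the two statements are linked via REDUC_GROUP_FIRST_ELEMENT/
   REDUC_GROUP_NEXT_ELEMENT and recorded in LOOP_VINFO_REDUCTION_CHAINS,
   whereas a lone  s_2 = s_1 + a[i]  is reported as a plain reduction.  */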
/* Estimate the number of peeled epilogue iterations for LOOP_VINFO.
   PEEL_ITERS_PROLOGUE is the number of peeled prologue iterations,
   or -1 if not known.  */

static int
vect_get_peel_iters_epilogue (loop_vec_info loop_vinfo, int peel_iters_prologue)
{
  int assumed_vf = vect_vf_for_cost (loop_vinfo);
  if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) || peel_iters_prologue == -1)
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_NOTE, vect_location,
                         "cost model: epilogue peel iters set to vf/2 "
                         "because loop iterations are unknown .\n");
      return assumed_vf / 2;
    }
  else
    {
      int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
      peel_iters_prologue = MIN (niters, peel_iters_prologue);
      int peel_iters_epilogue = (niters - peel_iters_prologue) % assumed_vf;
      /* If we need to peel for gaps, but no peeling is required, we have to
         peel VF iterations.  */
      if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) && !peel_iters_epilogue)
        peel_iters_epilogue = assumed_vf;
      return peel_iters_epilogue;
    }
}
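/* A worked example of the computation above, with made-up numbers:
   NITERS = 100, VF (assumed_vf) = 8 and PEEL_ITERS_PROLOGUE = 3 give
   (100 - 3) % 8 = 1 epilogue iteration; with PEELING_FOR_GAPS and a
   remainder of 0 the epilogue would instead be bumped to a full VF = 8
   iterations.  */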
/* Calculate cost of peeling the loop PEEL_ITERS_PROLOGUE times.  */

int
vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue,
                             int *peel_iters_epilogue,
                             stmt_vector_for_cost *scalar_cost_vec,
                             stmt_vector_for_cost *prologue_cost_vec,
                             stmt_vector_for_cost *epilogue_cost_vec)
{
  int retval = 0;

  *peel_iters_epilogue
    = vect_get_peel_iters_epilogue (loop_vinfo, peel_iters_prologue);

  if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
    {
      /* If peeled iterations are known but number of scalar loop
         iterations are unknown, count a taken branch per peeled loop.  */
      if (peel_iters_prologue > 0)
        retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
                                   vect_prologue);
      if (*peel_iters_epilogue > 0)
        retval += record_stmt_cost (epilogue_cost_vec, 1, cond_branch_taken,
                                    vect_epilogue);
    }

  stmt_info_for_cost *si;
  int j;
  if (peel_iters_prologue)
    FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
      retval += record_stmt_cost (prologue_cost_vec,
                                  si->count * peel_iters_prologue,
                                  si->kind, si->stmt_info, si->misalign,
                                  vect_prologue);
  if (*peel_iters_epilogue)
    FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
      retval += record_stmt_cost (epilogue_cost_vec,
                                  si->count * *peel_iters_epilogue,
                                  si->kind, si->stmt_info, si->misalign,
                                  vect_epilogue);

  return retval;
}
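/* For illustration only: with PEEL_ITERS_PROLOGUE = 2 and a scalar body
   recorded as three statements of count 1 each (say a load, an add and a
   store), the loops above record 2 * 3 = 6 prologue statement costs, and
   the epilogue side is handled symmetrically with *PEEL_ITERS_EPILOGUE.  */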
/* Function vect_estimate_min_profitable_iters

   Return the number of iterations required for the vector version of the
   loop to be profitable relative to the cost of the scalar version of the
   loop.

   *RET_MIN_PROFITABLE_NITERS is a cost model profitability threshold
   of iterations for vectorization.  -1 value means loop vectorization
   is not profitable.  This returned value may be used for dynamic
   profitability check.

   *RET_MIN_PROFITABLE_ESTIMATE is a profitability threshold to be used
   for static check against estimated number of iterations.  */

static void
vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
                                    int *ret_min_profitable_niters,
                                    int *ret_min_profitable_estimate,
                                    unsigned *suggested_unroll_factor)
{
  int min_profitable_iters;
  int min_profitable_estimate;
  int peel_iters_prologue;
  int peel_iters_epilogue;
  unsigned vec_inside_cost = 0;
  int vec_outside_cost = 0;
  unsigned vec_prologue_cost = 0;
  unsigned vec_epilogue_cost = 0;
  int scalar_single_iter_cost = 0;
  int scalar_outside_cost = 0;
  int assumed_vf = vect_vf_for_cost (loop_vinfo);
  int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
  vector_costs *target_cost_data = loop_vinfo->vector_costs;

  /* Cost model disabled.  */
  if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_NOTE, vect_location, "cost model disabled.\n");
      *ret_min_profitable_niters = 0;
      *ret_min_profitable_estimate = 0;
      return;
    }

  /* Requires loop versioning tests to handle misalignment.  */
  if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
    {
      /*  FIXME: Make cost depend on complexity of individual check.  */
      unsigned len = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ();
      (void) add_stmt_cost (target_cost_data, len, scalar_stmt, vect_prologue);
      if (dump_enabled_p ())
        dump_printf (MSG_NOTE,
                     "cost model: Adding cost of checks for loop "
                     "versioning to treat misalignment.\n");
    }

  /* Requires loop versioning with alias checks.  */
  if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
    {
      /*  FIXME: Make cost depend on complexity of individual check.  */
      unsigned len = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).length ();
      (void) add_stmt_cost (target_cost_data, len, scalar_stmt, vect_prologue);
      len = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).length ();
      if (len)
        /* Count LEN - 1 ANDs and LEN comparisons.  */
        (void) add_stmt_cost (target_cost_data, len * 2 - 1,
                              scalar_stmt, vect_prologue);
      len = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).length ();
      if (len)
        {
          /* Count LEN - 1 ANDs and LEN comparisons.  */
          unsigned int nstmts = len * 2 - 1;
          /* +1 for each bias that needs adding.  */
          for (unsigned int i = 0; i < len; ++i)
            if (!LOOP_VINFO_LOWER_BOUNDS (loop_vinfo)[i].unsigned_p)
              nstmts += 1;
          (void) add_stmt_cost (target_cost_data, nstmts,
                                scalar_stmt, vect_prologue);
        }
      if (dump_enabled_p ())
        dump_printf (MSG_NOTE,
                     "cost model: Adding cost of checks for loop "
                     "versioning aliasing.\n");
    }

  /* Requires loop versioning with niter checks.  */
  if (LOOP_REQUIRES_VERSIONING_FOR_NITERS (loop_vinfo))
    {
      /*  FIXME: Make cost depend on complexity of individual check.  */
      (void) add_stmt_cost (target_cost_data, 1, vector_stmt,
                            NULL, NULL, NULL_TREE, 0, vect_prologue);
      if (dump_enabled_p ())
        dump_printf (MSG_NOTE,
                     "cost model: Adding cost of checks for loop "
                     "versioning niters.\n");
    }

  if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
    (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
                          vect_prologue);

  /* Count statements in scalar loop.  Using this as scalar cost for a single
     iteration for now.

     TODO: Add outer loop support.

     TODO: Consider assigning different costs to different scalar
     statements.  */

  scalar_single_iter_cost = loop_vinfo->scalar_costs->total_cost ();

  /* Add additional cost for the peeled instructions in prologue and epilogue
     loop.  (For fully-masked loops there will be no peeling.)

     FORNOW: If we don't know the value of peel_iters for prologue or epilogue
     at compile-time - we assume it's vf/2 (the worst would be vf-1).

     TODO: Build an expression that represents peel_iters for prologue and
     epilogue to be used in a run-time test.  */

  bool prologue_need_br_taken_cost = false;
  bool prologue_need_br_not_taken_cost = false;

  /* Calculate peel_iters_prologue.  */
  if (vect_use_loop_mask_for_alignment_p (loop_vinfo))
    peel_iters_prologue = 0;
  else if (npeel < 0)
    {
      peel_iters_prologue = assumed_vf / 2;
      if (dump_enabled_p ())
        dump_printf (MSG_NOTE, "cost model: "
                     "prologue peel iters set to vf/2.\n");

      /* If peeled iterations are unknown, count a taken branch and a not taken
         branch per peeled loop.  Even if scalar loop iterations are known,
         vector iterations are not known since peeled prologue iterations are
         not known.  Hence guards remain the same.  */
      prologue_need_br_taken_cost = true;
      prologue_need_br_not_taken_cost = true;
    }
  else
    {
      peel_iters_prologue = npeel;
      if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && peel_iters_prologue > 0)
        /* If peeled iterations are known but number of scalar loop
           iterations are unknown, count a taken branch per peeled loop.  */
        prologue_need_br_taken_cost = true;
    }

  bool epilogue_need_br_taken_cost = false;
  bool epilogue_need_br_not_taken_cost = false;

  /* Calculate peel_iters_epilogue.  */
  if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
    /* We need to peel exactly one iteration for gaps.  */
    peel_iters_epilogue = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0;
  else if (npeel < 0)
    {
      /* If peeling for alignment is unknown, loop bound of main loop
         becomes unknown.  */
      peel_iters_epilogue = assumed_vf / 2;
      if (dump_enabled_p ())
        dump_printf (MSG_NOTE, "cost model: "
                     "epilogue peel iters set to vf/2 because "
                     "peeling for alignment is unknown.\n");

      /* See the same reason above in peel_iters_prologue calculation.  */
      epilogue_need_br_taken_cost = true;
      epilogue_need_br_not_taken_cost = true;
    }
  else
    {
      peel_iters_epilogue = vect_get_peel_iters_epilogue (loop_vinfo, npeel);
      if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && peel_iters_epilogue > 0)
        /* If peeled iterations are known but number of scalar loop
           iterations are unknown, count a taken branch per peeled loop.  */
        epilogue_need_br_taken_cost = true;
    }

  stmt_info_for_cost *si;
  int j;
  /* Add costs associated with peel_iters_prologue.  */
  if (peel_iters_prologue)
    FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
      {
        (void) add_stmt_cost (target_cost_data,
                              si->count * peel_iters_prologue, si->kind,
                              si->stmt_info, si->node, si->vectype,
                              si->misalign, vect_prologue);
      }

  /* Add costs associated with peel_iters_epilogue.  */
  if (peel_iters_epilogue)
    FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
      {
        (void) add_stmt_cost (target_cost_data,
                              si->count * peel_iters_epilogue, si->kind,
                              si->stmt_info, si->node, si->vectype,
                              si->misalign, vect_epilogue);
      }

  /* Add possible cond_branch_taken/cond_branch_not_taken cost.  */

  if (prologue_need_br_taken_cost)
    (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
                          vect_prologue);

  if (prologue_need_br_not_taken_cost)
    (void) add_stmt_cost (target_cost_data, 1,
                          cond_branch_not_taken, vect_prologue);

  if (epilogue_need_br_taken_cost)
    (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
                          vect_epilogue);

  if (epilogue_need_br_not_taken_cost)
    (void) add_stmt_cost (target_cost_data, 1,
                          cond_branch_not_taken, vect_epilogue);
  /* Take care of special costs for rgroup controls of partial vectors.  */
  if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
    {
      /* Calculate how many masks we need to generate.  */
      unsigned int num_masks = 0;
      rgroup_controls *rgm;
      unsigned int num_vectors_m1;
      FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), num_vectors_m1, rgm)
        if (rgm->type)
          num_masks += num_vectors_m1 + 1;
      gcc_assert (num_masks > 0);

      /* In the worst case, we need to generate each mask in the prologue
         and in the loop body.  One of the loop body mask instructions
         replaces the comparison in the scalar loop, and since we don't
         count the scalar comparison against the scalar body, we shouldn't
         count that vector instruction against the vector body either.

         Sometimes we can use unpacks instead of generating prologue
         masks and sometimes the prologue mask will fold to a constant,
         so the actual prologue cost might be smaller.  However, it's
         simpler and safer to use the worst-case cost; if this ends up
         being the tie-breaker between vectorizing or not, then it's
         probably better not to vectorize.  */
      (void) add_stmt_cost (target_cost_data, num_masks,
                            vector_stmt, NULL, NULL, NULL_TREE, 0,
                            vect_prologue);
      (void) add_stmt_cost (target_cost_data, num_masks - 1,
                            vector_stmt, NULL, NULL, NULL_TREE, 0,
                            vect_body);
    }
  else if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
    {
      /* Referring to the functions vect_set_loop_condition_partial_vectors
         and vect_set_loop_controls_directly, we need to generate each
         length in the prologue and in the loop body if required.  Although
         there are some possible optimizations, we consider the worst case
         here.  */

      bool niters_known_p = LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo);
      signed char partial_load_store_bias
        = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
      bool need_iterate_p
        = (!LOOP_VINFO_EPILOGUE_P (loop_vinfo)
           && !vect_known_niters_smaller_than_vf (loop_vinfo));

      /* Calculate how many statements to be added.  */
      unsigned int prologue_stmts = 0;
      unsigned int body_stmts = 0;

      rgroup_controls *rgc;
      unsigned int num_vectors_m1;
      FOR_EACH_VEC_ELT (LOOP_VINFO_LENS (loop_vinfo), num_vectors_m1, rgc)
        if (rgc->type)
          {
            /* May need one SHIFT for nitems_total computation.  */
            unsigned nitems = rgc->max_nscalars_per_iter * rgc->factor;
            if (nitems != 1 && !niters_known_p)
              prologue_stmts += 1;

            /* May need one MAX and one MINUS for wrap around.  */
            if (vect_rgroup_iv_might_wrap_p (loop_vinfo, rgc))
              prologue_stmts += 2;

            /* Need one MAX and one MINUS for each batch limit excepting for
               the 1st one.  */
            prologue_stmts += num_vectors_m1 * 2;

            unsigned int num_vectors = num_vectors_m1 + 1;

            /* Need to set up lengths in prologue, only one MIN required
               for each since start index is zero.  */
            prologue_stmts += num_vectors;

            /* If we have a non-zero partial load bias, we need one PLUS
               to adjust the load length.  */
            if (partial_load_store_bias != 0)
              body_stmts += 1;

            /* Each may need two MINs and one MINUS to update lengths in body
               for next iteration.  */
            if (need_iterate_p)
              body_stmts += 3 * num_vectors;
          }

      (void) add_stmt_cost (target_cost_data, prologue_stmts,
                            scalar_stmt, vect_prologue);
      (void) add_stmt_cost (target_cost_data, body_stmts,
                            scalar_stmt, vect_body);
    }

  /* FORNOW: The scalar outside cost is incremented in one of the
     following ways:

     1. The vectorizer checks for alignment and aliasing and generates
     a condition that allows dynamic vectorization.  A cost model
     check is ANDED with the versioning condition.  Hence scalar code
     path now has the added cost of the versioning check.

       if (cost > th & versioning_check)
         jmp to vector code

     Hence run-time scalar is incremented by not-taken branch cost.

     2. The vectorizer then checks if a prologue is required.  If the
     cost model check was not done before during versioning, it has to
     be done before the prologue check.

       if (cost <= th)
         prologue = scalar_iters
       if (prologue == 0)
         jmp to vector code
       else
         execute prologue
       if (prologue == num_iters)
         go to exit

     Hence the run-time scalar cost is incremented by a taken branch,
     plus a not-taken branch, plus a taken branch cost.

     3. The vectorizer then checks if an epilogue is required.  If the
     cost model check was not done before during prologue check, it
     has to be done with the epilogue check.

       if (prologue == 0)
         jmp to vector code
       else
         execute prologue
       if (prologue == num_iters)
         go to exit
       vector code:
         if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
           jmp to epilogue

     Hence the run-time scalar cost should be incremented by 2 taken
     branches.

     TODO: The back end may reorder the BBS's differently and reverse
     conditions/branch directions.  Change the estimates below to
     something more reasonable.  */

  /* If the number of iterations is known and we do not do versioning, we can
     decide whether to vectorize at compile time.  Hence the scalar version
     do not carry cost model guard costs.  */
  if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
      || LOOP_REQUIRES_VERSIONING (loop_vinfo))
    {
      /* Cost model check occurs at versioning.  */
      if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
        scalar_outside_cost += vect_get_stmt_cost (cond_branch_not_taken);
      else
        {
          /* Cost model check occurs at prologue generation.  */
          if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
            scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken)
              + vect_get_stmt_cost (cond_branch_not_taken);
          /* Cost model check occurs at epilogue generation.  */
          else
            scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken);
        }
    }
  /* Complete the target-specific cost calculations.  */
  finish_cost (loop_vinfo->vector_costs, loop_vinfo->scalar_costs,
               &vec_prologue_cost, &vec_inside_cost, &vec_epilogue_cost,
               suggested_unroll_factor);

  if (suggested_unroll_factor && *suggested_unroll_factor > 1
      && LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo) != MAX_VECTORIZATION_FACTOR
      && !known_le (LOOP_VINFO_VECT_FACTOR (loop_vinfo) *
                    *suggested_unroll_factor,
                    LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo)))
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "can't unroll as unrolled vectorization factor larger"
                         " than maximum vectorization factor: %d\n",
                         LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo));
      *suggested_unroll_factor = 1;
    }

  vec_outside_cost = (int)(vec_prologue_cost + vec_epilogue_cost);

  if (dump_enabled_p ())
    {
      dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
      dump_printf (MSG_NOTE, "  Vector inside of loop cost: %d\n",
                   vec_inside_cost);
      dump_printf (MSG_NOTE, "  Vector prologue cost: %d\n",
                   vec_prologue_cost);
      dump_printf (MSG_NOTE, "  Vector epilogue cost: %d\n",
                   vec_epilogue_cost);
      dump_printf (MSG_NOTE, "  Scalar iteration cost: %d\n",
                   scalar_single_iter_cost);
      dump_printf (MSG_NOTE, "  Scalar outside cost: %d\n",
                   scalar_outside_cost);
      dump_printf (MSG_NOTE, "  Vector outside cost: %d\n",
                   vec_outside_cost);
      dump_printf (MSG_NOTE, "  prologue iterations: %d\n",
                   peel_iters_prologue);
      dump_printf (MSG_NOTE, "  epilogue iterations: %d\n",
                   peel_iters_epilogue);
    }
  /* Calculate number of iterations required to make the vector version
     profitable, relative to the loop bodies only.  The following condition
     must hold true:
     SIC * niters + SOC > VIC * ((niters - NPEEL) / VF) + VOC
     where
     SIC = scalar iteration cost, VIC = vector iteration cost,
     VOC = vector outside cost, VF = vectorization factor,
     NPEEL = prologue iterations + epilogue iterations,
     SOC = scalar outside cost for run time cost model check.  */
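  /* A worked example of the condition above, using made-up costs:
     SIC = 4, VIC = 8, VF = 4, NPEEL = 0, VOC = 24, SOC = 0.  The scalar
     loop then costs 4 * niters while the vector version costs
     8 * (niters / 4) + 24 = 2 * niters + 24, so vectorization starts to
     pay off once 4 * niters > 2 * niters + 24, i.e. for niters > 12.  */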
  int saving_per_viter = (scalar_single_iter_cost * assumed_vf
                          - vec_inside_cost);
  if (saving_per_viter <= 0)
    {
      if (LOOP_VINFO_LOOP (loop_vinfo)->force_vectorize)
        warning_at (vect_location.get_location_t (), OPT_Wopenmp_simd,
                    "vectorization did not happen for a simd loop");

      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "cost model: the vector iteration cost = %d "
                         "divided by the scalar iteration cost = %d "
                         "is greater or equal to the vectorization factor = %d"
                         ".\n",
                         vec_inside_cost, scalar_single_iter_cost, assumed_vf);
      *ret_min_profitable_niters = -1;
      *ret_min_profitable_estimate = -1;
      return;
    }

  /* ??? The "if" arm is written to handle all cases; see below for what
     we would do for !LOOP_VINFO_USING_PARTIAL_VECTORS_P.  */
  if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
    {
      /* Rewriting the condition above in terms of the number of
         vector iterations (vniters) rather than the number of
         scalar iterations (niters) gives:

         SIC * (vniters * VF + NPEEL) + SOC > VIC * vniters + VOC

         <==> vniters * (SIC * VF - VIC) > VOC - SIC * NPEEL - SOC

         For integer N, X and Y when X > 0:

         N * X > Y <==> N >= (Y /[floor] X) + 1.  */
      int outside_overhead = (vec_outside_cost
                              - scalar_single_iter_cost * peel_iters_prologue
                              - scalar_single_iter_cost * peel_iters_epilogue
                              - scalar_outside_cost);
      /* We're only interested in cases that require at least one
         vector iteration.  */
      int min_vec_niters = 1;
      if (outside_overhead > 0)
        min_vec_niters = outside_overhead / saving_per_viter + 1;

      if (dump_enabled_p ())
        dump_printf (MSG_NOTE, "  Minimum number of vector iterations: %d\n",
                     min_vec_niters);

      if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
        {
          /* Now that we know the minimum number of vector iterations,
             find the minimum niters for which the scalar cost is larger:

             SIC * niters > VIC * vniters + VOC - SOC

             We know that the minimum niters is no more than
             vniters * VF + NPEEL, but it might be (and often is) less
             than that if a partial vector iteration is cheaper than the
             equivalent scalar code.  */
          int threshold = (vec_inside_cost * min_vec_niters
                           + vec_outside_cost
                           - scalar_outside_cost);
          if (threshold <= 0)
            min_profitable_iters = 1;
          else
            min_profitable_iters = threshold / scalar_single_iter_cost + 1;
        }
      else
        /* Convert the number of vector iterations into a number of
           scalar iterations.  */
        min_profitable_iters = (min_vec_niters * assumed_vf
                                + peel_iters_prologue
                                + peel_iters_epilogue);
    }
  else
    {
      min_profitable_iters = ((vec_outside_cost - scalar_outside_cost)
                              * assumed_vf
                              - vec_inside_cost * peel_iters_prologue
                              - vec_inside_cost * peel_iters_epilogue);
      if (min_profitable_iters <= 0)
        min_profitable_iters = 0;
      else
        {
          min_profitable_iters /= saving_per_viter;

          if ((scalar_single_iter_cost * assumed_vf * min_profitable_iters)
              <= (((int) vec_inside_cost * min_profitable_iters)
                  + (((int) vec_outside_cost - scalar_outside_cost)
                     * assumed_vf)))
            min_profitable_iters++;
        }
    }

  if (dump_enabled_p ())
    dump_printf (MSG_NOTE,
                 "  Calculated minimum iters for profitability: %d\n",
                 min_profitable_iters);

  if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
      && min_profitable_iters < (assumed_vf + peel_iters_prologue))
    /* We want the vectorized loop to execute at least once.  */
    min_profitable_iters = assumed_vf + peel_iters_prologue;
  else if (min_profitable_iters < peel_iters_prologue)
    /* For LOOP_VINFO_USING_PARTIAL_VECTORS_P, we need to ensure the
       vectorized loop executes at least once.  */
    min_profitable_iters = peel_iters_prologue;

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
                     "  Runtime profitability threshold = %d\n",
                     min_profitable_iters);

  *ret_min_profitable_niters = min_profitable_iters;
  /* Calculate number of iterations required to make the vector version
     profitable, relative to the loop bodies only.

     Non-vectorized variant is SIC * niters and it must win over vector
     variant on the expected loop trip count.  The following condition must hold true:
     SIC * niters > VIC * ((niters - NPEEL) / VF) + VOC + SOC  */

  if (vec_outside_cost <= 0)
    min_profitable_estimate = 0;
  /* ??? This "else if" arm is written to handle all cases; see below for
     what we would do for !LOOP_VINFO_USING_PARTIAL_VECTORS_P.  */
  else if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
    {
      /* This is a repeat of the code above, but with + SOC rather
         than - SOC.  */
      int outside_overhead = (vec_outside_cost
                              - scalar_single_iter_cost * peel_iters_prologue
                              - scalar_single_iter_cost * peel_iters_epilogue
                              + scalar_outside_cost);
      int min_vec_niters = 1;
      if (outside_overhead > 0)
        min_vec_niters = outside_overhead / saving_per_viter + 1;

      if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
        {
          int threshold = (vec_inside_cost * min_vec_niters
                           + vec_outside_cost
                           + scalar_outside_cost);
          min_profitable_estimate = threshold / scalar_single_iter_cost + 1;
        }
      else
        min_profitable_estimate = (min_vec_niters * assumed_vf
                                   + peel_iters_prologue
                                   + peel_iters_epilogue);
    }
  else
    {
      min_profitable_estimate = ((vec_outside_cost + scalar_outside_cost)
                                 * assumed_vf
                                 - vec_inside_cost * peel_iters_prologue
                                 - vec_inside_cost * peel_iters_epilogue)
                                / ((scalar_single_iter_cost * assumed_vf)
                                   - vec_inside_cost);
    }
  min_profitable_estimate = MAX (min_profitable_estimate, min_profitable_iters);
  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
                     "  Static estimate profitability threshold = %d\n",
                     min_profitable_estimate);

  *ret_min_profitable_estimate = min_profitable_estimate;
}
/* Writes into SEL a mask for a vec_perm, equivalent to a vec_shr by OFFSET
   vector elements (not bits) for a vector with NELT elements.  */
static void
calc_vec_perm_mask_for_shift (unsigned int offset, unsigned int nelt,
                              vec_perm_builder *sel)
{
  /* The encoding is a single stepped pattern.  Any wrap-around is handled
     by vec_perm_indices.  */
  sel->new_vector (nelt, 1, 3);
  for (unsigned int i = 0; i < 3; i++)
    sel->quick_push (i + offset);
}
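/* For example, for NELT = 4 and OFFSET = 2 the encoding above expands to
   the selector {2, 3, 4, 5}: the first two elements come from the input
   vector and the indices >= NELT wrap into the second vec_perm operand,
   which gives the effect of a whole-vector shift by two elements.  */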
/* Checks whether the target supports whole-vector shifts for vectors of mode
   MODE.  This is the case if _either_ the platform handles vec_shr_optab, _or_
   it supports vec_perm_const with masks for all necessary shift amounts.  */
static bool
have_whole_vector_shift (machine_mode mode)
{
  if (optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
    return true;

  /* Variable-length vectors should be handled via the optab.  */
  unsigned int nelt;
  if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
    return false;

  vec_perm_builder sel;
  vec_perm_indices indices;
  for (unsigned int i = nelt / 2; i >= 1; i /= 2)
    {
      calc_vec_perm_mask_for_shift (i, nelt, &sel);
      indices.new_vector (sel, 2, nelt);
      if (!can_vec_perm_const_p (mode, indices, false))
        return false;
    }
  return true;
}
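/* As an example, for a fixed-length mode with NELT = 8 the loop above
   checks vec_perm support for shift amounts 4, 2 and 1, which are exactly
   the offsets the reduction epilogue uses when reducing a vector by
   repeated halving.  */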
/* TODO: Close dependency between vect_model_*_cost and vectorizable_*
   functions.  Design better to avoid maintenance issues.  */

/* Function vect_model_reduction_cost.

   Models cost for a reduction operation, including the vector ops
   generated within the strip-mine loop in some cases, the initial
   definition before the loop, and the epilogue code that must be generated.  */

static void
vect_model_reduction_cost (loop_vec_info loop_vinfo,
                           stmt_vec_info stmt_info, internal_fn reduc_fn,
                           vect_reduction_type reduction_type,
                           int ncopies, stmt_vector_for_cost *cost_vec)
{
  int prologue_cost = 0, epilogue_cost = 0, inside_cost = 0;
  tree vectype;
  machine_mode mode;
  class loop *loop = NULL;

  if (loop_vinfo)
    loop = LOOP_VINFO_LOOP (loop_vinfo);

  /* Condition reductions generate two reductions in the loop.  */
  if (reduction_type == COND_REDUCTION)
    ncopies *= 2;

  vectype = STMT_VINFO_VECTYPE (stmt_info);
  mode = TYPE_MODE (vectype);
  stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);

  gimple_match_op op;
  if (!gimple_extract_op (orig_stmt_info->stmt, &op))
    gcc_unreachable ();

  if (reduction_type == EXTRACT_LAST_REDUCTION)
    /* No extra instructions are needed in the prologue.  The loop body
       operations are costed in vectorizable_condition.  */
    inside_cost = 0;
  else if (reduction_type == FOLD_LEFT_REDUCTION)
    {
      /* No extra instructions needed in the prologue.  */
      prologue_cost = 0;

      if (reduc_fn != IFN_LAST)
        /* Count one reduction-like operation per vector.  */
        inside_cost = record_stmt_cost (cost_vec, ncopies, vec_to_scalar,
                                        stmt_info, 0, vect_body);
      else
        {
          /* Use NELEMENTS extracts and NELEMENTS scalar ops.  */
          unsigned int nelements = ncopies * vect_nunits_for_cost (vectype);
          inside_cost = record_stmt_cost (cost_vec, nelements,
                                          vec_to_scalar, stmt_info, 0,
                                          vect_body);
          inside_cost += record_stmt_cost (cost_vec, nelements,
                                           scalar_stmt, stmt_info, 0,
                                           vect_body);
        }
    }
  else
    {
      /* Add in cost for initial definition.
         For cond reduction we have four vectors: initial index, step,
         initial result of the data reduction, initial value of the index
         reduction.  */
      int prologue_stmts = reduction_type == COND_REDUCTION ? 4 : 1;
      prologue_cost += record_stmt_cost (cost_vec, prologue_stmts,
                                         scalar_to_vec, stmt_info, 0,
                                         vect_prologue);
    }

  /* Determine cost of epilogue code.

     We have a reduction operator that will reduce the vector in one statement.
     Also requires scalar extract.  */

  if (!loop || !nested_in_vect_loop_p (loop, orig_stmt_info))
    {
      if (reduc_fn != IFN_LAST)
        {
          if (reduction_type == COND_REDUCTION)
            {
              /* An EQ stmt and an COND_EXPR stmt.  */
              epilogue_cost += record_stmt_cost (cost_vec, 2,
                                                 vector_stmt, stmt_info, 0,
                                                 vect_epilogue);
              /* Reduction of the max index and a reduction of the found
                 values.  */
              epilogue_cost += record_stmt_cost (cost_vec, 2,
                                                 vec_to_scalar, stmt_info, 0,
                                                 vect_epilogue);
              /* A broadcast of the max value.  */
              epilogue_cost += record_stmt_cost (cost_vec, 1,
                                                 scalar_to_vec, stmt_info, 0,
                                                 vect_epilogue);
            }
          else
            {
              epilogue_cost += record_stmt_cost (cost_vec, 1, vector_stmt,
                                                 stmt_info, 0, vect_epilogue);
              epilogue_cost += record_stmt_cost (cost_vec, 1,
                                                 vec_to_scalar, stmt_info, 0,
                                                 vect_epilogue);
            }
        }
      else if (reduction_type == COND_REDUCTION)
        {
          unsigned estimated_nunits = vect_nunits_for_cost (vectype);
          /* Extraction of scalar elements.  */
          epilogue_cost += record_stmt_cost (cost_vec,
                                             2 * estimated_nunits,
                                             vec_to_scalar, stmt_info, 0,
                                             vect_epilogue);
          /* Scalar max reductions via COND_EXPR / MAX_EXPR.  */
          epilogue_cost += record_stmt_cost (cost_vec,
                                             2 * estimated_nunits - 3,
                                             scalar_stmt, stmt_info, 0,
                                             vect_epilogue);
        }
      else if (reduction_type == EXTRACT_LAST_REDUCTION
               || reduction_type == FOLD_LEFT_REDUCTION)
        /* No extra instructions need in the epilogue.  */
        ;
      else
        {
          int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
          tree bitsize = TYPE_SIZE (op.type);
          int element_bitsize = tree_to_uhwi (bitsize);
          int nelements = vec_size_in_bits / element_bitsize;

          if (op.code == COND_EXPR)
            op.code = MAX_EXPR;

          /* We have a whole vector shift available.  */
          if (VECTOR_MODE_P (mode)
              && directly_supported_p (op.code, vectype)
              && have_whole_vector_shift (mode))
            {
              /* Final reduction via vector shifts and the reduction operator.
                 Also requires scalar extract.  */
              epilogue_cost += record_stmt_cost (cost_vec,
                                                 exact_log2 (nelements) * 2,
                                                 vector_stmt, stmt_info, 0,
                                                 vect_epilogue);
              epilogue_cost += record_stmt_cost (cost_vec, 1,
                                                 vec_to_scalar, stmt_info, 0,
                                                 vect_epilogue);
            }
          else
            /* Use extracts and reduction op for final reduction.  For N
               elements, we have N extracts and N-1 reduction ops.  */
            epilogue_cost += record_stmt_cost (cost_vec,
                                               nelements + nelements - 1,
                                               vector_stmt, stmt_info, 0,
                                               vect_epilogue);
        }
    }

  if (dump_enabled_p ())
    dump_printf (MSG_NOTE,
                 "vect_model_reduction_cost: inside_cost = %d, "
                 "prologue_cost = %d, epilogue_cost = %d .\n", inside_cost,
                 prologue_cost, epilogue_cost);
}
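/* As a rough illustration of the no-reduc_fn path above: for a V4SI add
   reduction with whole-vector shifts available, the epilogue is costed as
   exact_log2 (4) * 2 = 4 vector statements (two shift/add rounds) plus one
   vec_to_scalar extract; without the shifts it would instead be
   4 + 4 - 1 = 7 statements (four extracts and three reduction ops).  */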
/* SEQ is a sequence of instructions that initialize the reduction
   described by REDUC_INFO.  Emit them in the appropriate place.  */

static void
vect_emit_reduction_init_stmts (loop_vec_info loop_vinfo,
                                stmt_vec_info reduc_info, gimple *seq)
{
  if (reduc_info->reused_accumulator)
    {
      /* When reusing an accumulator from the main loop, we only need
         initialization instructions if the main loop can be skipped.
         In that case, emit the initialization instructions at the end
         of the guard block that does the skip.  */
      edge skip_edge = loop_vinfo->skip_main_loop_edge;
      gcc_assert (skip_edge);
      gimple_stmt_iterator gsi = gsi_last_bb (skip_edge->src);
      gsi_insert_seq_before (&gsi, seq, GSI_SAME_STMT);
    }
  else
    {
      /* The normal case: emit the initialization instructions on the
         preheader edge.  */
      class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
      gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), seq);
    }
}
/* Function get_initial_def_for_reduction

   Input:
   REDUC_INFO - the info_for_reduction
   INIT_VAL - the initial value of the reduction variable
   NEUTRAL_OP - a value that has no effect on the reduction, as per
                neutral_op_for_reduction

   Output:
   Return a vector variable, initialized according to the operation that
   STMT_VINFO performs.  This vector will be used as the initial value
   of the vector of partial results.

   The value we need is a vector in which element 0 has value INIT_VAL
   and every other element has value NEUTRAL_OP.  */

static tree
get_initial_def_for_reduction (loop_vec_info loop_vinfo,
                               stmt_vec_info reduc_info,
                               tree init_val, tree neutral_op)
{
  class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  tree scalar_type = TREE_TYPE (init_val);
  tree vectype = get_vectype_for_scalar_type (loop_vinfo, scalar_type);
  tree init_def;
  gimple_seq stmts = NULL;

  gcc_assert (vectype);

  gcc_assert (POINTER_TYPE_P (scalar_type) || INTEGRAL_TYPE_P (scalar_type)
              || SCALAR_FLOAT_TYPE_P (scalar_type));

  gcc_assert (nested_in_vect_loop_p (loop, reduc_info)
              || loop == (gimple_bb (reduc_info->stmt))->loop_father);

  if (operand_equal_p (init_val, neutral_op))
    {
      /* If both elements are equal then the vector described above is
         just a splat.  */
      neutral_op = gimple_convert (&stmts, TREE_TYPE (vectype), neutral_op);
      init_def = gimple_build_vector_from_val (&stmts, vectype, neutral_op);
    }
  else
    {
      neutral_op = gimple_convert (&stmts, TREE_TYPE (vectype), neutral_op);
      init_val = gimple_convert (&stmts, TREE_TYPE (vectype), init_val);
      if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant ())
        {
          /* Construct a splat of NEUTRAL_OP and insert INIT_VAL into
             element 0.  */
          init_def = gimple_build_vector_from_val (&stmts, vectype,
                                                   neutral_op);
          init_def = gimple_build (&stmts, CFN_VEC_SHL_INSERT,
                                   vectype, init_def, init_val);
        }
      else
        {
          /* Build {INIT_VAL, NEUTRAL_OP, NEUTRAL_OP, ...}.  */
          tree_vector_builder elts (vectype, 1, 2);
          elts.quick_push (init_val);
          elts.quick_push (neutral_op);
          init_def = gimple_build_vector (&stmts, &elts);
        }
    }

  if (stmts)
    vect_emit_reduction_init_stmts (loop_vinfo, reduc_info, stmts);
  return init_def;
}
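/* For example, for a V4SI add reduction with INIT_VAL 5 the neutral value
   is 0 and the routine above builds {5, 0, 0, 0}; for a MIN or MAX
   reduction the neutral value equals INIT_VAL, so the result is simply a
   splat of it.  */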
/* Get at the initial defs for the reduction PHIs for REDUC_INFO,
   which performs a reduction involving GROUP_SIZE scalar statements.
   NUMBER_OF_VECTORS is the number of vector defs to create.  If NEUTRAL_OP
   is nonnull, introducing extra elements of that value will not change the
   result.  */

static void
get_initial_defs_for_reduction (loop_vec_info loop_vinfo,
                                stmt_vec_info reduc_info,
                                vec<tree> *vec_oprnds,
                                unsigned int number_of_vectors,
                                unsigned int group_size, tree neutral_op)
{
  vec<tree> &initial_values = reduc_info->reduc_initial_values;
  unsigned HOST_WIDE_INT nunits;
  unsigned j, number_of_places_left_in_vector;
  tree vector_type = STMT_VINFO_VECTYPE (reduc_info);
  unsigned int i;

  gcc_assert (group_size == initial_values.length () || neutral_op);

  /* NUMBER_OF_COPIES is the number of times we need to use the same values in
     created vectors.  It is greater than 1 if unrolling is performed.

     For example, we have two scalar operands, s1 and s2 (e.g., group of
     strided accesses of size two), while NUNITS is four (i.e., four scalars
     of this type can be packed in a vector).  The output vector will contain
     two copies of each scalar operand: {s1, s2, s1, s2}.  (NUMBER_OF_COPIES
     will be 2).

     If REDUC_GROUP_SIZE > NUNITS, the scalars will be split into several
     vectors containing the operands.

     For example, NUNITS is four as before, and the group size is 8
     (s1, s2, ..., s8).  We will create two vectors {s1, s2, s3, s4} and
     {s5, s6, s7, s8}.  */

  if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits))
    nunits = group_size;

  number_of_places_left_in_vector = nunits;
  bool constant_p = true;
  tree_vector_builder elts (vector_type, nunits, 1);
  elts.quick_grow (nunits);
  gimple_seq ctor_seq = NULL;
  for (j = 0; j < nunits * number_of_vectors; ++j)
    {
      tree op;
      i = j % group_size;

      /* Get the def before the loop.  In reduction chain we have only
         one initial value.  Else we have as many as PHIs in the group.  */
      if (i >= initial_values.length () || (j > i && neutral_op))
        op = neutral_op;
      else
        op = initial_values[i];

      /* Create 'vect_ = {op0,op1,...,opn}'.  */
      number_of_places_left_in_vector--;
      elts[nunits - number_of_places_left_in_vector - 1] = op;
      if (!CONSTANT_CLASS_P (op))
        constant_p = false;

      if (number_of_places_left_in_vector == 0)
        {
          tree init;
          if (constant_p && !neutral_op
              ? multiple_p (TYPE_VECTOR_SUBPARTS (vector_type), nunits)
              : known_eq (TYPE_VECTOR_SUBPARTS (vector_type), nunits))
            /* Build the vector directly from ELTS.  */
            init = gimple_build_vector (&ctor_seq, &elts);
          else if (neutral_op)
            {
              /* Build a vector of the neutral value and shift the
                 other elements into place.  */
              init = gimple_build_vector_from_val (&ctor_seq, vector_type,
                                                   neutral_op);
              int k = nunits;
              while (k > 0 && elts[k - 1] == neutral_op)
                k -= 1;
              while (k > 0)
                {
                  k -= 1;
                  init = gimple_build (&ctor_seq, CFN_VEC_SHL_INSERT,
                                       vector_type, init, elts[k]);
                }
            }
          else
            {
              /* First time round, duplicate ELTS to fill the
                 required number of vectors.  */
              duplicate_and_interleave (loop_vinfo, &ctor_seq, vector_type,
                                        elts, number_of_vectors, *vec_oprnds);
              break;
            }
          vec_oprnds->quick_push (init);

          number_of_places_left_in_vector = nunits;
          elts.new_vector (vector_type, nunits, 1);
          elts.quick_grow (nunits);
          constant_p = true;
        }
    }

  if (ctor_seq != NULL)
    vect_emit_reduction_init_stmts (loop_vinfo, reduc_info, ctor_seq);
}
/* For a statement STMT_INFO taking part in a reduction operation return
   the stmt_vec_info the meta information is stored on.  */

stmt_vec_info
info_for_reduction (vec_info *vinfo, stmt_vec_info stmt_info)
{
  stmt_info = vect_orig_stmt (stmt_info);
  gcc_assert (STMT_VINFO_REDUC_DEF (stmt_info));
  if (!is_a <gphi *> (stmt_info->stmt)
      || !VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
    stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
  gphi *phi = as_a <gphi *> (stmt_info->stmt);
  if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
    {
      if (gimple_phi_num_args (phi) == 1)
        stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
    }
  else if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
    {
      stmt_vec_info info = vinfo->lookup_def (vect_phi_initial_value (phi));
      if (info && STMT_VINFO_DEF_TYPE (info) == vect_double_reduction_def)
        stmt_info = info;
    }
  return stmt_info;
}
/* See if LOOP_VINFO is an epilogue loop whose main loop had a reduction that
   REDUC_INFO can build on.  Adjust REDUC_INFO and return true if so, otherwise
   return false.  */

static bool
vect_find_reusable_accumulator (loop_vec_info loop_vinfo,
                                stmt_vec_info reduc_info)
{
  loop_vec_info main_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
  if (!main_loop_vinfo)
    return false;

  if (STMT_VINFO_REDUC_TYPE (reduc_info) != TREE_CODE_REDUCTION)
    return false;

  unsigned int num_phis = reduc_info->reduc_initial_values.length ();
  auto_vec<tree, 16> main_loop_results (num_phis);
  auto_vec<tree, 16> initial_values (num_phis);
  if (edge main_loop_edge = loop_vinfo->main_loop_edge)
    {
      /* The epilogue loop can be entered either from the main loop or
         from an earlier guard block.  */
      edge skip_edge = loop_vinfo->skip_main_loop_edge;
      for (tree incoming_value : reduc_info->reduc_initial_values)
        {
          /* Look for:

               INCOMING_VALUE = phi<MAIN_LOOP_RESULT(main loop),
                                    INITIAL_VALUE(guard block)>.  */
          gcc_assert (TREE_CODE (incoming_value) == SSA_NAME);

          gphi *phi = as_a <gphi *> (SSA_NAME_DEF_STMT (incoming_value));
          gcc_assert (gimple_bb (phi) == main_loop_edge->dest);

          tree from_main_loop = PHI_ARG_DEF_FROM_EDGE (phi, main_loop_edge);
          tree from_skip = PHI_ARG_DEF_FROM_EDGE (phi, skip_edge);

          main_loop_results.quick_push (from_main_loop);
          initial_values.quick_push (from_skip);
        }
    }
  else
    /* The main loop dominates the epilogue loop.  */
    main_loop_results.splice (reduc_info->reduc_initial_values);

  /* See if the main loop has the kind of accumulator we need.  */
  vect_reusable_accumulator *accumulator
    = main_loop_vinfo->reusable_accumulators.get (main_loop_results[0]);
  if (!accumulator
      || num_phis != accumulator->reduc_info->reduc_scalar_results.length ()
      || !std::equal (main_loop_results.begin (), main_loop_results.end (),
                      accumulator->reduc_info->reduc_scalar_results.begin ()))
    return false;

  /* Handle the case where we can reduce wider vectors to narrower ones.  */
  tree vectype = STMT_VINFO_VECTYPE (reduc_info);
  tree old_vectype = TREE_TYPE (accumulator->reduc_input);
  unsigned HOST_WIDE_INT m;
  if (!constant_multiple_p (TYPE_VECTOR_SUBPARTS (old_vectype),
                            TYPE_VECTOR_SUBPARTS (vectype), &m))
    return false;
  /* Check the intermediate vector types and operations are available.  */
  tree prev_vectype = old_vectype;
  poly_uint64 intermediate_nunits = TYPE_VECTOR_SUBPARTS (old_vectype);
  while (known_gt (intermediate_nunits, TYPE_VECTOR_SUBPARTS (vectype)))
    {
      intermediate_nunits = exact_div (intermediate_nunits, 2);
      tree intermediate_vectype = get_related_vectype_for_scalar_type
        (TYPE_MODE (vectype), TREE_TYPE (vectype), intermediate_nunits);
      if (!intermediate_vectype
          || !directly_supported_p (STMT_VINFO_REDUC_CODE (reduc_info),
                                    intermediate_vectype)
          || !can_vec_extract (TYPE_MODE (prev_vectype),
                               TYPE_MODE (intermediate_vectype)))
        return false;
      prev_vectype = intermediate_vectype;
    }

  /* Non-SLP reductions might apply an adjustment after the reduction
     operation, in order to simplify the initialization of the accumulator.
     If the epilogue loop carries on from where the main loop left off,
     it should apply the same adjustment to the final reduction result.

     If the epilogue loop can also be entered directly (rather than via
     the main loop), we need to be able to handle that case in the same way,
     with the same adjustment.  (In principle we could add a PHI node
     to select the correct adjustment, but in practice that shouldn't be
     necessary.)  */
  tree main_adjustment
    = STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (accumulator->reduc_info);
  if (loop_vinfo->main_loop_edge && main_adjustment)
    {
      gcc_assert (num_phis == 1);
      tree initial_value = initial_values[0];
      /* Check that we can use INITIAL_VALUE as the adjustment and
         initialize the accumulator with a neutral value instead.  */
      if (!operand_equal_p (initial_value, main_adjustment))
        return false;
      code_helper code = STMT_VINFO_REDUC_CODE (reduc_info);
      initial_values[0] = neutral_op_for_reduction (TREE_TYPE (initial_value),
                                                    code, initial_value);
    }
  STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info) = main_adjustment;
  reduc_info->reduc_initial_values.truncate (0);
  reduc_info->reduc_initial_values.splice (initial_values);
  reduc_info->reused_accumulator = accumulator;
  return true;
}
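/* An illustrative (target-independent) example of the widening check above:
   if the main loop accumulated into a V8SI vector and this epilogue loop
   uses V4SI, then m = 2 and the loop verifies that a V4SI reduction
   operation and a V8SI -> V4SI vec_extract are both supported before the
   accumulator is reused.  */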
/* Reduce the vector VEC_DEF down to VECTYPE with reduction operation
   CODE emitting stmts before GSI.  Returns a vector def of VECTYPE.  */

static tree
vect_create_partial_epilog (tree vec_def, tree vectype, code_helper code,
                            gimple_seq *seq)
{
  unsigned nunits = TYPE_VECTOR_SUBPARTS (TREE_TYPE (vec_def)).to_constant ();
  unsigned nunits1 = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
  tree stype = TREE_TYPE (vectype);
  tree new_temp = vec_def;
  while (nunits > nunits1)
    {
      nunits /= 2;
      tree vectype1 = get_related_vectype_for_scalar_type (TYPE_MODE (vectype),
                                                           stype, nunits);
      unsigned int bitsize = tree_to_uhwi (TYPE_SIZE (vectype1));

      /* The target has to make sure we support lowpart/highpart
         extraction, either via direct vector extract or through
         an integer mode punning.  */
      tree dst1, dst2;
      gimple *epilog_stmt;
      if (convert_optab_handler (vec_extract_optab,
                                 TYPE_MODE (TREE_TYPE (new_temp)),
                                 TYPE_MODE (vectype1))
          != CODE_FOR_nothing)
        {
          /* Extract sub-vectors directly once vec_extract becomes
             a conversion optab.  */
          dst1 = make_ssa_name (vectype1);
          epilog_stmt
              = gimple_build_assign (dst1, BIT_FIELD_REF,
                                     build3 (BIT_FIELD_REF, vectype1,
                                             new_temp, TYPE_SIZE (vectype1),
                                             bitsize_int (0)));
          gimple_seq_add_stmt_without_update (seq, epilog_stmt);
          dst2 = make_ssa_name (vectype1);
          epilog_stmt
              = gimple_build_assign (dst2, BIT_FIELD_REF,
                                     build3 (BIT_FIELD_REF, vectype1,
                                             new_temp, TYPE_SIZE (vectype1),
                                             bitsize_int (bitsize)));
          gimple_seq_add_stmt_without_update (seq, epilog_stmt);
        }
      else
        {
          /* Extract via punning to appropriately sized integer mode
             vector.  */
          tree eltype = build_nonstandard_integer_type (bitsize, 1);
          tree etype = build_vector_type (eltype, 2);
          gcc_assert (convert_optab_handler (vec_extract_optab,
                                             TYPE_MODE (etype),
                                             TYPE_MODE (eltype))
                      != CODE_FOR_nothing);
          tree tem = make_ssa_name (etype);
          epilog_stmt = gimple_build_assign (tem, VIEW_CONVERT_EXPR,
                                             build1 (VIEW_CONVERT_EXPR,
                                                     etype, new_temp));
          gimple_seq_add_stmt_without_update (seq, epilog_stmt);
          new_temp = tem;
          tem = make_ssa_name (eltype);
          epilog_stmt
              = gimple_build_assign (tem, BIT_FIELD_REF,
                                     build3 (BIT_FIELD_REF, eltype,
                                             new_temp, TYPE_SIZE (eltype),
                                             bitsize_int (0)));
          gimple_seq_add_stmt_without_update (seq, epilog_stmt);
          dst1 = make_ssa_name (vectype1);
          epilog_stmt = gimple_build_assign (dst1, VIEW_CONVERT_EXPR,
                                             build1 (VIEW_CONVERT_EXPR,
                                                     vectype1, tem));
          gimple_seq_add_stmt_without_update (seq, epilog_stmt);
          tem = make_ssa_name (eltype);
          epilog_stmt
              = gimple_build_assign (tem, BIT_FIELD_REF,
                                     build3 (BIT_FIELD_REF, eltype,
                                             new_temp, TYPE_SIZE (eltype),
                                             bitsize_int (bitsize)));
          gimple_seq_add_stmt_without_update (seq, epilog_stmt);
          dst2 = make_ssa_name (vectype1);
          epilog_stmt = gimple_build_assign (dst2, VIEW_CONVERT_EXPR,
                                             build1 (VIEW_CONVERT_EXPR,
                                                     vectype1, tem));
          gimple_seq_add_stmt_without_update (seq, epilog_stmt);
        }

      new_temp = gimple_build (seq, code, vectype1, dst1, dst2);
    }

  return new_temp;
}
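/* For example, reducing a V8SI partial result down to V4SI with PLUS_EXPR
   takes a single round of the loop above: the V8SI value is split into its
   low and high V4SI halves (directly via vec_extract, or via the integer
   punning fallback) and the two halves are added to form the V4SI def that
   is returned.  */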
/* Function vect_create_epilog_for_reduction

   Create code at the loop-epilog to finalize the result of a reduction
   computation.

   STMT_INFO is the scalar reduction stmt that is being vectorized.
   SLP_NODE is an SLP node containing a group of reduction statements. The
     first one in this group is STMT_INFO.
   SLP_NODE_INSTANCE is the SLP node instance containing SLP_NODE
   REDUC_INDEX says which rhs operand of the STMT_INFO is the reduction phi
     (counting from 0)

   This function:
   1. Completes the reduction def-use cycles.
   2. "Reduces" each vector of partial results VECT_DEFS into a single result,
      by calling the function specified by REDUC_FN if available, or by
      other means (whole-vector shifts or a scalar loop).
      The function also creates a new phi node at the loop exit to preserve
      loop-closed form, as illustrated below.

     The flow at the entry to this function:

        loop:
          vec_def = phi <vec_init, null> # REDUCTION_PHI
          VECT_DEF = vector_stmt         # vectorized form of STMT_INFO
          s_loop = scalar_stmt           # (scalar) STMT_INFO
        loop_exit:
          s_out0 = phi <s_loop>          # (scalar) EXIT_PHI
          use <s_out0>
          use <s_out0>

     The above is transformed by this function into:

        loop:
          vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
          VECT_DEF = vector_stmt             # vectorized form of STMT_INFO
          s_loop = scalar_stmt               # (scalar) STMT_INFO
        loop_exit:
          s_out0 = phi <s_loop>              # (scalar) EXIT_PHI
          v_out1 = phi <VECT_DEF>            # NEW_EXIT_PHI
          v_out2 = reduce <v_out1>
          s_out3 = extract_field <v_out2, 0>
          s_out4 = adjust_result <s_out3>
          use <s_out4>
          use <s_out4>  */

static void
vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
                                  stmt_vec_info stmt_info,
                                  slp_tree slp_node,
                                  slp_instance slp_node_instance)
{
  stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
  gcc_assert (reduc_info->is_reduc_info);
  /* For double reductions we need to get at the inner loop reduction
     stmt which has the meta info attached.  Our stmt_info is that of the
     loop-closed PHI of the inner loop which we remember as
     def for the reduction PHI generation.  */
  bool double_reduc = false;
  stmt_vec_info rdef_info = stmt_info;
  if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
    {
      gcc_assert (!slp_node);
      double_reduc = true;
      stmt_info = loop_vinfo->lookup_def (gimple_phi_arg_def
                                            (stmt_info->stmt, 0));
      stmt_info = vect_stmt_to_vectorize (stmt_info);
    }
  gphi *reduc_def_stmt
    = as_a <gphi *> (STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info))->stmt);
  code_helper code = STMT_VINFO_REDUC_CODE (reduc_info);
  internal_fn reduc_fn = STMT_VINFO_REDUC_FN (reduc_info);
  tree vectype;
  machine_mode mode;
  class loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *outer_loop = NULL;
  basic_block exit_bb;
  tree scalar_dest;
  tree scalar_type;
  gimple *new_phi = NULL, *phi;
  gimple_stmt_iterator exit_gsi;
  tree new_temp = NULL_TREE, new_name, new_scalar_dest;
  gimple *epilog_stmt = NULL;
  tree bitsize;
  tree def;
  tree orig_name, scalar_result;
  imm_use_iterator imm_iter, phi_imm_iter;
  use_operand_p use_p, phi_use_p;
  auto_vec<tree> reduc_inputs;
  int j, i;
  vec<tree> &scalar_results = reduc_info->reduc_scalar_results;
  unsigned int group_size = 1, k;
  auto_vec<gimple *> phis;
  /* SLP reduction without reduction chain, e.g.,
     # a1 = phi <a2, a0>
     # b1 = phi <b2, b0>
     a2 = operation (a1)
     b2 = operation (b1)  */
  bool slp_reduc = (slp_node && !REDUC_GROUP_FIRST_ELEMENT (stmt_info));
  bool direct_slp_reduc;
  tree induction_index = NULL_TREE;

  if (slp_node)
    group_size = SLP_TREE_LANES (slp_node);

  if (nested_in_vect_loop_p (loop, stmt_info))
    {
      outer_loop = loop;
      loop = loop->inner;
      gcc_assert (!slp_node && double_reduc);
    }

  vectype = STMT_VINFO_REDUC_VECTYPE (reduc_info);
  gcc_assert (vectype);
  mode = TYPE_MODE (vectype);

  tree induc_val = NULL_TREE;
  tree adjustment_def = NULL;
  if (slp_node)
    ;
  else
    {
      /* Optimize: for induction condition reduction, if we can't use zero
         for induc_val, use initial_def.  */
      if (STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
        induc_val = STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info);
      else if (double_reduc)
        ;
      else
        adjustment_def = STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info);
    }

  stmt_vec_info single_live_out_stmt[] = { stmt_info };
  array_slice<const stmt_vec_info> live_out_stmts = single_live_out_stmt;
  if (slp_reduc)
    /* All statements produce live-out values.  */
    live_out_stmts = SLP_TREE_SCALAR_STMTS (slp_node);
  else if (slp_node)
    /* The last statement in the reduction chain produces the live-out
       value.  */
    single_live_out_stmt[0] = SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1];

  unsigned vec_num;
  int ncopies;
  if (slp_node)
    {
      vec_num = SLP_TREE_VEC_STMTS (slp_node_instance->reduc_phis).length ();
      ncopies = 1;
    }
  else
    {
      stmt_vec_info reduc_info = loop_vinfo->lookup_stmt (reduc_def_stmt);
      vec_num = 1;
      ncopies = STMT_VINFO_VEC_STMTS (reduc_info).length ();
    }

  /* For cond reductions we want to create a new vector (INDEX_COND_EXPR)
     which is updated with the current index of the loop for every match of
     the original loop's cond_expr (VEC_STMT).  This results in a vector
     containing the last time the condition passed for that vector lane.
     The first match will be a 1 to allow 0 to be used for non-matching
     indexes.  If there are no matches at all then the vector will be all
     zeroes.

     PR92772: This algorithm is broken for architectures that support
     masked vectors, but do not provide fold_extract_last.  */
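  /* As a hypothetical illustration of the scheme described above, with
     VF = 4 the index IV takes the values {1, 2, 3, 4}, {5, 6, 7, 8}, ...
     If the condition matches in lane 1 of the first vector iteration and
     in lane 0 of the second one, the final INDUCTION_INDEX is {5, 2, 0, 0}:
     each lane records the (1-based) iteration number of its last match and
     0 means the lane never matched.  */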
  if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION)
    {
      auto_vec<std::pair<tree, bool>, 2> ccompares;
      stmt_vec_info cond_info = STMT_VINFO_REDUC_DEF (reduc_info);
      cond_info = vect_stmt_to_vectorize (cond_info);
      while (cond_info != reduc_info)
        {
          if (gimple_assign_rhs_code (cond_info->stmt) == COND_EXPR)
            {
              gimple *vec_stmt = STMT_VINFO_VEC_STMTS (cond_info)[0];
              gcc_assert (gimple_assign_rhs_code (vec_stmt) == VEC_COND_EXPR);
              ccompares.safe_push
                (std::make_pair (unshare_expr (gimple_assign_rhs1 (vec_stmt)),
                                 STMT_VINFO_REDUC_IDX (cond_info) == 2));
            }
          cond_info
            = loop_vinfo->lookup_def (gimple_op (cond_info->stmt,
                                                 1 + STMT_VINFO_REDUC_IDX
                                                        (cond_info)));
          cond_info = vect_stmt_to_vectorize (cond_info);
        }
      gcc_assert (ccompares.length () != 0);

      tree indx_before_incr, indx_after_incr;
      poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype);
      int scalar_precision
        = GET_MODE_PRECISION (SCALAR_TYPE_MODE (TREE_TYPE (vectype)));
      tree cr_index_scalar_type = make_unsigned_type (scalar_precision);
      tree cr_index_vector_type = get_related_vectype_for_scalar_type
        (TYPE_MODE (vectype), cr_index_scalar_type,
         TYPE_VECTOR_SUBPARTS (vectype));

      /* First we create a simple vector induction variable which starts
         with the values {1,2,3,...} (SERIES_VECT) and increments by the
         vector size (STEP).  */

      /* Create a {1,2,3,...} vector.  */
      tree series_vect = build_index_vector (cr_index_vector_type, 1, 1);

      /* Create a vector of the step value.  */
      tree step = build_int_cst (cr_index_scalar_type, nunits_out);
      tree vec_step = build_vector_from_val (cr_index_vector_type, step);

      /* Create an induction variable.  */
      gimple_stmt_iterator incr_gsi;
      bool insert_after;
      standard_iv_increment_position (loop, &incr_gsi, &insert_after);
      create_iv (series_vect, vec_step, NULL_TREE, loop, &incr_gsi,
                 insert_after, &indx_before_incr, &indx_after_incr);

      /* Next create a new phi node vector (NEW_PHI_TREE) which starts
         filled with zeros (VEC_ZERO).  */

      /* Create a vector of 0s.  */
      tree zero = build_zero_cst (cr_index_scalar_type);
      tree vec_zero = build_vector_from_val (cr_index_vector_type, zero);

      /* Create a vector phi node.  */
      tree new_phi_tree = make_ssa_name (cr_index_vector_type);
      new_phi = create_phi_node (new_phi_tree, loop->header);
      add_phi_arg (as_a <gphi *> (new_phi), vec_zero,
                   loop_preheader_edge (loop), UNKNOWN_LOCATION);

      /* Now take the condition from the loops original cond_exprs
         and produce a new cond_exprs (INDEX_COND_EXPR) which for
         every match uses values from the induction variable
         (INDEX_BEFORE_INCR) otherwise uses values from the phi node
         (NEW_PHI_TREE).
         Finally, we update the phi (NEW_PHI_TREE) to take the value of
         the new cond_expr (INDEX_COND_EXPR).  */
      gimple_seq stmts = NULL;
      for (int i = ccompares.length () - 1; i != -1; --i)
        {
          tree ccompare = ccompares[i].first;
          if (ccompares[i].second)
            new_phi_tree = gimple_build (&stmts, VEC_COND_EXPR,
                                         cr_index_vector_type,
                                         ccompare,
                                         indx_before_incr, new_phi_tree);
          else
            new_phi_tree = gimple_build (&stmts, VEC_COND_EXPR,
                                         cr_index_vector_type,
                                         ccompare,
                                         new_phi_tree, indx_before_incr);
        }
      gsi_insert_seq_before (&incr_gsi, stmts, GSI_SAME_STMT);

      /* Update the phi with the vec cond.  */
      induction_index = new_phi_tree;
      add_phi_arg (as_a <gphi *> (new_phi), induction_index,
                   loop_latch_edge (loop), UNKNOWN_LOCATION);
    }

  /* 2. Create epilog code.
     The reduction epilog code operates across the elements of the vector
     of partial results computed by the vectorized loop.
     The reduction epilog code consists of:

     step 1: compute the scalar result in a vector (v_out2)
     step 2: extract the scalar result (s_out3) from the vector (v_out2)
     step 3: adjust the scalar result (s_out3) if needed.

     Step 1 can be accomplished using one the following three schemes:
          (scheme 1) using reduc_fn, if available.
          (scheme 2) using whole-vector shifts, if available.
          (scheme 3) using a scalar loop. In this case steps 1+2 above are
                     combined.

     The overall epilog code looks like this:

          s_out0 = phi <s_loop>                 # original EXIT_PHI
          v_out1 = phi <VECT_DEF>               # NEW_EXIT_PHI
          v_out2 = reduce <v_out1>              # step 1
          s_out3 = extract_field <v_out2, 0>    # step 2
          s_out4 = adjust_result <s_out3>       # step 3

          (step 3 is optional, and steps 1 and 2 may be combined).
          Lastly, the uses of s_out0 are replaced by s_out4.  */
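  /* Scheme 2 above, sketched for a four-element add reduction (assuming
     the target provides whole-vector shifts):

          v_out2 = v_out1 + vec_shr <v_out1, 2 elements>
          v_out2 = v_out2 + vec_shr <v_out2, 1 element>
          s_out3 = extract_field <v_out2, 0>

     after which element 0 holds the sum of all four lanes.  */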
  /* 2.1 Create new loop-exit-phis to preserve loop-closed form:
	 v_out1 = phi <VECT_DEF>
	 Store them in NEW_PHIS.  */
  if (double_reduc)
    loop = outer_loop;
  exit_bb = single_exit (loop)->dest;
  exit_gsi = gsi_after_labels (exit_bb);
  reduc_inputs.create (slp_node ? vec_num : ncopies);
  for (unsigned i = 0; i < vec_num; i++)
    {
      gimple_seq stmts = NULL;
      if (slp_node)
	def = vect_get_slp_vect_def (slp_node, i);
      else
	def = gimple_get_lhs (STMT_VINFO_VEC_STMTS (rdef_info)[0]);
      for (j = 0; j < ncopies; j++)
	{
	  tree new_def = copy_ssa_name (def);
	  phi = create_phi_node (new_def, exit_bb);
	  if (j)
	    def = gimple_get_lhs (STMT_VINFO_VEC_STMTS (rdef_info)[j]);
	  SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, def);
	  new_def = gimple_convert (&stmts, vectype, new_def);
	  reduc_inputs.quick_push (new_def);
	}
      gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
    }
  /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
	 (i.e. when reduc_fn is not available) and in the final adjustment
	 code (if needed).  Also get the original scalar reduction variable as
	 defined in the loop.  In case STMT is a "pattern-stmt" (i.e. - it
	 represents a reduction pattern), the tree-code and scalar-def are
	 taken from the original stmt that the pattern-stmt (STMT) replaces.
	 Otherwise (it is a regular reduction) - the tree-code and scalar-def
	 are taken from STMT.  */

  stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
  if (orig_stmt_info != stmt_info)
    {
      /* Reduction pattern  */
      gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
      gcc_assert (STMT_VINFO_RELATED_STMT (orig_stmt_info) == stmt_info);
    }

  scalar_dest = gimple_get_lhs (orig_stmt_info->stmt);
  scalar_type = TREE_TYPE (scalar_dest);
  scalar_results.create (group_size);
  new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
  bitsize = TYPE_SIZE (scalar_type);

  /* True if we should implement SLP_REDUC using native reduction operations
     instead of scalar operations.  */
  direct_slp_reduc = (reduc_fn != IFN_LAST
		      && slp_reduc
		      && !TYPE_VECTOR_SUBPARTS (vectype).is_constant ());
  /* In case of reduction chain, e.g.,
     # a1 = phi <a3, a0>
     a2 = operation (a1)
     a3 = operation (a2),

     we may end up with more than one vector result.  Here we reduce them
     to one vector.

     The same is true if we couldn't use a single defuse cycle.  */
  if (REDUC_GROUP_FIRST_ELEMENT (stmt_info)
      || direct_slp_reduc
      || ncopies > 1)
    {
      gimple_seq stmts = NULL;
      tree single_input = reduc_inputs[0];
      for (k = 1; k < reduc_inputs.length (); k++)
	single_input = gimple_build (&stmts, code, vectype,
				     single_input, reduc_inputs[k]);
      gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);

      reduc_inputs.truncate (0);
      reduc_inputs.safe_push (single_input);
    }
  tree orig_reduc_input = reduc_inputs[0];

  /* If this loop is an epilogue loop that can be skipped after the
     main loop, we can only share a reduction operation between the
     main loop and the epilogue if we put it at the target of the
     skip edge.

     We can still reuse accumulators if this check fails.  Doing so has
     the minor(?) benefit of making the epilogue loop's scalar result
     independent of the main loop's scalar result.  */
  bool unify_with_main_loop_p = false;
  if (reduc_info->reused_accumulator
      && loop_vinfo->skip_this_loop_edge
      && single_succ_p (exit_bb)
      && single_succ (exit_bb) == loop_vinfo->skip_this_loop_edge->dest)
    {
      unify_with_main_loop_p = true;

      basic_block reduc_block = loop_vinfo->skip_this_loop_edge->dest;
      reduc_inputs[0] = make_ssa_name (vectype);
      gphi *new_phi = create_phi_node (reduc_inputs[0], reduc_block);
      add_phi_arg (new_phi, orig_reduc_input, single_succ_edge (exit_bb),
		   UNKNOWN_LOCATION);
      add_phi_arg (new_phi, reduc_info->reused_accumulator->reduc_input,
		   loop_vinfo->skip_this_loop_edge, UNKNOWN_LOCATION);
      exit_gsi = gsi_after_labels (reduc_block);
    }

  /* Shouldn't be used beyond this point.  */
  exit_bb = nullptr;
  if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION
      && reduc_fn != IFN_LAST)
    {
      /* For condition reductions, we have a vector (REDUC_INPUTS 0) containing
	 various data values where the condition matched and another vector
	 (INDUCTION_INDEX) containing all the indexes of those matches.  We
	 need to extract the last matching index (which will be the index with
	 highest value) and use this to index into the data vector.
	 For the case where there were no matches, the data vector will contain
	 all default values and the index vector will be all zeros.  */
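      /* A scalar sketch of what the code below computes (illustrative only;
	 the generated code works on whole vectors, and the index vector is
	 all zeros when nothing matched):

	   unsigned max_idx = 0;
	   for (unsigned i = 0; i < nunits; ++i)
	     if (induction_index[i] > max_idx)
	       max_idx = induction_index[i];
	   for (unsigned i = 0; i < nunits; ++i)
	     if (induction_index[i] == max_idx)
	       scalar_result = data[i];

	 so when no lane matched, every lane compares equal and the (default)
	 data value is returned.  */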
      /* Get various versions of the type of the vector of indexes.  */
      tree index_vec_type = TREE_TYPE (induction_index);
      gcc_checking_assert (TYPE_UNSIGNED (index_vec_type));
      tree index_scalar_type = TREE_TYPE (index_vec_type);
      tree index_vec_cmp_type = truth_type_for (index_vec_type);

      /* Get an unsigned integer version of the type of the data vector.  */
      int scalar_precision
	= GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
      tree scalar_type_unsigned = make_unsigned_type (scalar_precision);
      tree vectype_unsigned = get_same_sized_vectype (scalar_type_unsigned,
						      vectype);

      /* First we need to create a vector (ZERO_VEC) of zeros and another
	 vector (MAX_INDEX_VEC) filled with the last matching index, which we
	 can create using a MAX reduction and then expanding.
	 In the case where the loop never made any matches, the max index will
	 be zero.  */

      /* Vector of {0, 0, 0,...}.  */
      tree zero_vec = build_zero_cst (vectype);

      /* Find maximum value from the vector of found indexes.  */
      tree max_index = make_ssa_name (index_scalar_type);
      gcall *max_index_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
							  1, induction_index);
      gimple_call_set_lhs (max_index_stmt, max_index);
      gsi_insert_before (&exit_gsi, max_index_stmt, GSI_SAME_STMT);

      /* Vector of {max_index, max_index, max_index,...}.  */
      tree max_index_vec = make_ssa_name (index_vec_type);
      tree max_index_vec_rhs = build_vector_from_val (index_vec_type,
						      max_index);
      gimple *max_index_vec_stmt = gimple_build_assign (max_index_vec,
							max_index_vec_rhs);
      gsi_insert_before (&exit_gsi, max_index_vec_stmt, GSI_SAME_STMT);

      /* Next we compare the new vector (MAX_INDEX_VEC) full of max indexes
	 with the vector (INDUCTION_INDEX) of found indexes, choosing values
	 from the data vector (REDUC_INPUTS 0) for matches, 0 (ZERO_VEC)
	 otherwise.  Only one value should match, resulting in a vector
	 (VEC_COND) with one data value and the rest zeros.
	 In the case where the loop never made any matches, every index will
	 match, resulting in a vector with all data values (which will all be
	 the default value).  */

      /* Compare the max index vector to the vector of found indexes to find
	 the position of the max value.  */
      tree vec_compare = make_ssa_name (index_vec_cmp_type);
      gimple *vec_compare_stmt = gimple_build_assign (vec_compare, EQ_EXPR,
						      induction_index,
						      max_index_vec);
      gsi_insert_before (&exit_gsi, vec_compare_stmt, GSI_SAME_STMT);

      /* Use the compare to choose either values from the data vector or
	 zero.  */
      tree vec_cond = make_ssa_name (vectype);
      gimple *vec_cond_stmt = gimple_build_assign (vec_cond, VEC_COND_EXPR,
						   vec_compare,
						   reduc_inputs[0],
						   zero_vec);
      gsi_insert_before (&exit_gsi, vec_cond_stmt, GSI_SAME_STMT);

      /* Finally we need to extract the data value from the vector (VEC_COND)
	 into a scalar (MATCHED_DATA_REDUC).  Logically we want to do a OR
	 reduction, but because this doesn't exist, we can use a MAX reduction
	 instead.  The data value might be signed or a float so we need to cast
	 it first.
	 In the case where the loop never made any matches, the data values are
	 all identical, and so will reduce down correctly.  */

      /* Make the matched data values unsigned.  */
      tree vec_cond_cast = make_ssa_name (vectype_unsigned);
      tree vec_cond_cast_rhs = build1 (VIEW_CONVERT_EXPR, vectype_unsigned,
				       vec_cond);
      gimple *vec_cond_cast_stmt = gimple_build_assign (vec_cond_cast,
							vec_cond_cast_rhs);
      gsi_insert_before (&exit_gsi, vec_cond_cast_stmt, GSI_SAME_STMT);

      /* Reduce down to a scalar value.  */
      tree data_reduc = make_ssa_name (scalar_type_unsigned);
      gcall *data_reduc_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
							   1, vec_cond_cast);
      gimple_call_set_lhs (data_reduc_stmt, data_reduc);
      gsi_insert_before (&exit_gsi, data_reduc_stmt, GSI_SAME_STMT);

      /* Convert the reduced value back to the result type and set as the
	 result.  */
      gimple_seq stmts = NULL;
      new_temp = gimple_build (&stmts, VIEW_CONVERT_EXPR, scalar_type,
			       data_reduc);
      gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
      scalar_results.safe_push (new_temp);
    }
  else if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION
	   && reduc_fn == IFN_LAST)
    {
      /* Condition reduction without supported IFN_REDUC_MAX.  Generate
	 idx_val = induction_index[0];
	 val = data_reduc[0];
	 for (idx = 0, val = init, i = 0; i < nelts; ++i)
	   if (induction_index[i] > idx_val)
	     val = data_reduc[i], idx_val = induction_index[i];
	 return val;  */
      tree data_eltype = TREE_TYPE (vectype);
      tree idx_eltype = TREE_TYPE (TREE_TYPE (induction_index));
      unsigned HOST_WIDE_INT el_size = tree_to_uhwi (TYPE_SIZE (idx_eltype));
      poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (TREE_TYPE (induction_index));
      /* Enforced by vectorizable_reduction, which ensures we have target
	 support before allowing a conditional reduction on variable-length
	 vectors.  */
      unsigned HOST_WIDE_INT v_size = el_size * nunits.to_constant ();
      tree idx_val = NULL_TREE, val = NULL_TREE;
      for (unsigned HOST_WIDE_INT off = 0; off < v_size; off += el_size)
	{
	  tree old_idx_val = idx_val;
	  tree old_val = val;
	  idx_val = make_ssa_name (idx_eltype);
	  epilog_stmt = gimple_build_assign (idx_val, BIT_FIELD_REF,
					     build3 (BIT_FIELD_REF, idx_eltype,
						     induction_index,
						     bitsize_int (el_size),
						     bitsize_int (off)));
	  gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
	  val = make_ssa_name (data_eltype);
	  epilog_stmt = gimple_build_assign (val, BIT_FIELD_REF,
					     build3 (BIT_FIELD_REF,
						     data_eltype,
						     reduc_inputs[0],
						     bitsize_int (el_size),
						     bitsize_int (off)));
	  gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
	  if (off != 0)
	    {
	      tree new_idx_val = idx_val;
	      if (off != v_size - el_size)
		{
		  new_idx_val = make_ssa_name (idx_eltype);
		  epilog_stmt = gimple_build_assign (new_idx_val,
						     MAX_EXPR, idx_val,
						     old_idx_val);
		  gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
		}
	      tree new_val = make_ssa_name (data_eltype);
	      epilog_stmt = gimple_build_assign (new_val,
						 COND_EXPR,
						 build2 (GT_EXPR,
							 boolean_type_node,
							 idx_val,
							 old_idx_val),
						 val, old_val);
	      gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
	      idx_val = new_idx_val;
	      val = new_val;
	    }
	}
      /* Convert the reduced value back to the result type and set as the
	 result.  */
      gimple_seq stmts = NULL;
      val = gimple_convert (&stmts, scalar_type, val);
      gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
      scalar_results.safe_push (val);
    }
  /* 2.3 Create the reduction code, using one of the three schemes described
	 above.  In SLP we simply need to extract all the elements from the
	 vector (without reducing them), so we use scalar shifts.  */
  else if (reduc_fn != IFN_LAST && !slp_reduc)
    {
      /* Case 1:  Create:
	 v_out2 = reduc_expr <v_out1>  */

      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location,
			 "Reduce using direct vector reduction.\n");

      gimple_seq stmts = NULL;
      vec_elem_type = TREE_TYPE (vectype);
      new_temp = gimple_build (&stmts, as_combined_fn (reduc_fn),
			       vec_elem_type, reduc_inputs[0]);
      new_temp = gimple_convert (&stmts, scalar_type, new_temp);
      gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);

      if ((STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
	  && induc_val)
	{
	  /* Earlier we set the initial value to be a vector of induc_val
	     values.  Check the result and if it is induc_val then replace
	     with the original initial value, unless induc_val is
	     the same as initial_def already.  */
	  tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp,
				  induc_val);
	  tree initial_def = reduc_info->reduc_initial_values[0];

	  tmp = make_ssa_name (new_scalar_dest);
	  epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
					     initial_def, new_temp);
	  gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
	  new_temp = tmp;
	}

      scalar_results.safe_push (new_temp);
    }
  else if (direct_slp_reduc)
    {
      /* Here we create one vector for each of the REDUC_GROUP_SIZE results,
	 with the elements for other SLP statements replaced with the
	 neutral value.  We can then do a normal reduction on each vector.  */
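      /* For example (an illustrative sketch only, with made-up lane values):
	 with REDUC_GROUP_SIZE == 2 and a partial-result vector
	 { a0, b0, a1, b1 }, the code below builds
	   { a0, id, a1, id }  for SLP result 0 and
	   { id, b0, id, b1 }  for SLP result 1,
	 where "id" is the neutral value, and then reduces each of the two
	 vectors with reduc_fn.  */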
      /* Enforced by vectorizable_reduction.  */
      gcc_assert (reduc_inputs.length () == 1);
      gcc_assert (pow2p_hwi (group_size));

      gimple_seq seq = NULL;

      /* Build a vector {0, 1, 2, ...}, with the same number of elements
	 and the same element size as VECTYPE.  */
      tree index = build_index_vector (vectype, 0, 1);
      tree index_type = TREE_TYPE (index);
      tree index_elt_type = TREE_TYPE (index_type);
      tree mask_type = truth_type_for (index_type);

      /* Create a vector that, for each element, identifies which of
	 the REDUC_GROUP_SIZE results should use it.  */
      tree index_mask = build_int_cst (index_elt_type, group_size - 1);
      index = gimple_build (&seq, BIT_AND_EXPR, index_type, index,
			    build_vector_from_val (index_type, index_mask));

      /* Get a neutral vector value.  This is simply a splat of the neutral
	 scalar value if we have one, otherwise the initial scalar value
	 is itself a neutral value.  */
      tree vector_identity = NULL_TREE;
      tree neutral_op = NULL_TREE;
      if (slp_node)
	{
	  tree initial_value = NULL_TREE;
	  if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
	    initial_value = reduc_info->reduc_initial_values[0];
	  neutral_op = neutral_op_for_reduction (TREE_TYPE (vectype), code,
						 initial_value);
	}
      if (neutral_op)
	vector_identity = gimple_build_vector_from_val (&seq, vectype,
							neutral_op);
      for (unsigned int i = 0; i < group_size; ++i)
	{
	  /* If there's no universal neutral value, we can use the
	     initial scalar value from the original PHI.  This is used
	     for MIN and MAX reduction, for example.  */
	  if (!neutral_op)
	    {
	      tree scalar_value = reduc_info->reduc_initial_values[i];
	      scalar_value = gimple_convert (&seq, TREE_TYPE (vectype),
					     scalar_value);
	      vector_identity = gimple_build_vector_from_val (&seq, vectype,
							      scalar_value);
	    }

	  /* Calculate the equivalent of:

	     sel[j] = (index[j] == i);

	     which selects the elements of REDUC_INPUTS[0] that should
	     be included in the result.  */
	  tree compare_val = build_int_cst (index_elt_type, i);
	  compare_val = build_vector_from_val (index_type, compare_val);
	  tree sel = gimple_build (&seq, EQ_EXPR, mask_type,
				   index, compare_val);

	  /* Calculate the equivalent of:

	     vec = seq ? reduc_inputs[0] : vector_identity;

	     VEC is now suitable for a full vector reduction.  */
	  tree vec = gimple_build (&seq, VEC_COND_EXPR, vectype,
				   sel, reduc_inputs[0], vector_identity);

	  /* Do the reduction and convert it to the appropriate type.  */
	  tree scalar = gimple_build (&seq, as_combined_fn (reduc_fn),
				      TREE_TYPE (vectype), vec);
	  scalar = gimple_convert (&seq, scalar_type, scalar);
	  scalar_results.safe_push (scalar);
	}
      gsi_insert_seq_before (&exit_gsi, seq, GSI_SAME_STMT);
    }
  else
    {
      bool reduce_with_shift;
      tree vec_temp;

      gcc_assert (slp_reduc || reduc_inputs.length () == 1);

      /* See if the target wants to do the final (shift) reduction
	 in a vector mode of smaller size and first reduce upper/lower
	 halves against each other.  */
      enum machine_mode mode1 = mode;
      tree stype = TREE_TYPE (vectype);
      unsigned nunits = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
      unsigned nunits1 = nunits;
      if ((mode1 = targetm.vectorize.split_reduction (mode)) != mode
	  && reduc_inputs.length () == 1)
	{
	  nunits1 = GET_MODE_NUNITS (mode1).to_constant ();
	  /* For SLP reductions we have to make sure lanes match up, but
	     since we're doing individual element final reduction reducing
	     vector width here is even more important.
	     ??? We can also separate lanes with permutes, for the common
	     case of power-of-two group-size odd/even extracts would work.  */
	  if (slp_reduc && nunits != nunits1)
	    {
	      nunits1 = least_common_multiple (nunits1, group_size);
	      gcc_assert (exact_log2 (nunits1) != -1 && nunits1 <= nunits);
	    }
	}
      else if (!slp_reduc
	       && (mode1 = targetm.vectorize.split_reduction (mode)) != mode)
	nunits1 = GET_MODE_NUNITS (mode1).to_constant ();

      tree vectype1 = get_related_vectype_for_scalar_type (TYPE_MODE (vectype),
							   stype, nunits1);
      reduce_with_shift = have_whole_vector_shift (mode1);
      if (!VECTOR_MODE_P (mode1)
	  || !directly_supported_p (code, vectype1))
	reduce_with_shift = false;

      /* First reduce the vector to the desired vector size we should
	 do shift reduction on by combining upper and lower halves.  */
      gimple_seq stmts = NULL;
      new_temp = vect_create_partial_epilog (reduc_inputs[0], vectype1,
					     code, &stmts);
      gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
      reduc_inputs[0] = new_temp;
      if (reduce_with_shift && !slp_reduc)
	{
	  int element_bitsize = tree_to_uhwi (bitsize);
	  /* Enforced by vectorizable_reduction, which disallows SLP reductions
	     for variable-length vectors and also requires direct target support
	     for loop reductions.  */
	  int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
	  int nelements = vec_size_in_bits / element_bitsize;
	  vec_perm_builder sel;
	  vec_perm_indices indices;

	  int elt_offset;
	  tree rhs;

	  tree zero_vec = build_zero_cst (vectype1);
	  /* Case 2: Create:
	     for (offset = nelements/2; offset >= 1; offset/=2)
		{
		  Create:  va' = vec_shift <va, offset>
		  Create:  va = vop <va, va'>
		}  */

	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_NOTE, vect_location,
			     "Reduce using vector shifts\n");

	  gimple_seq stmts = NULL;
	  new_temp = gimple_convert (&stmts, vectype1, new_temp);
	  for (elt_offset = nelements / 2;
	       elt_offset >= 1;
	       elt_offset /= 2)
	    {
	      calc_vec_perm_mask_for_shift (elt_offset, nelements, &sel);
	      indices.new_vector (sel, 2, nelements);
	      tree mask = vect_gen_perm_mask_any (vectype1, indices);
	      new_name = gimple_build (&stmts, VEC_PERM_EXPR, vectype1,
				       new_temp, zero_vec, mask);
	      new_temp = gimple_build (&stmts, code,
				       vectype1, new_name, new_temp);
	    }
	  gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);

	  /* 2.4  Extract the final scalar result.  Create:
	     s_out3 = extract_field <v_out2, bitpos>  */

	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_NOTE, vect_location,
			     "extract scalar result\n");

	  rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp,
			bitsize, bitsize_zero_node);
	  epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
	  new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
	  gimple_assign_set_lhs (epilog_stmt, new_temp);
	  gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
	  scalar_results.safe_push (new_temp);
	}
      else
	{
	  /* Case 3: Create:
	     s = extract_field <v_out2, 0>
	     for (offset = element_size;
		  offset < vector_size;
		  offset += element_size;)
	       {
		 Create:  s' = extract_field <v_out2, offset>
		 Create:  s = op <s, s'>  // For non SLP cases
	       }  */
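	  /* Illustrative sketch only: for a hypothetical 4-element vector
	     { a0, a1, a2, a3 } and a PLUS reduction this expands to
	       s = a0;  s = s + a1;  s = s + a2;  s = s + a3;
	     whereas in the SLP case the extracted elements are collected in
	     SCALAR_RESULTS without being combined.  */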
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_NOTE, vect_location,
			     "Reduce using scalar code.\n");

	  int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
	  int element_bitsize = tree_to_uhwi (bitsize);
	  tree compute_type = TREE_TYPE (vectype);
	  gimple_seq stmts = NULL;
	  FOR_EACH_VEC_ELT (reduc_inputs, i, vec_temp)
	    {
	      int bit_offset;
	      new_temp = gimple_build (&stmts, BIT_FIELD_REF, compute_type,
				       vec_temp, bitsize, bitsize_zero_node);

	      /* In SLP we don't need to apply reduction operation, so we just
		 collect s' values in SCALAR_RESULTS.  */
	      if (slp_reduc)
		scalar_results.safe_push (new_temp);

	      for (bit_offset = element_bitsize;
		   bit_offset < vec_size_in_bits;
		   bit_offset += element_bitsize)
		{
		  tree bitpos = bitsize_int (bit_offset);
		  new_name = gimple_build (&stmts, BIT_FIELD_REF,
					   compute_type, vec_temp,
					   bitsize, bitpos);

		  if (slp_reduc)
		    {
		      /* In SLP we don't need to apply reduction operation, so
			 we just collect s' values in SCALAR_RESULTS.  */
		      new_temp = new_name;
		      scalar_results.safe_push (new_name);
		    }
		  else
		    new_temp = gimple_build (&stmts, code, compute_type,
					     new_name, new_temp);
		}
	    }

	  /* The only case where we need to reduce scalar results in SLP, is
	     unrolling.  If the size of SCALAR_RESULTS is greater than
	     REDUC_GROUP_SIZE, we reduce them combining elements modulo
	     REDUC_GROUP_SIZE.  */
	  if (slp_reduc)
	    {
	      tree res, first_res, new_res;

	      /* Reduce multiple scalar results in case of SLP unrolling.  */
	      for (j = group_size; scalar_results.iterate (j, &res);
		   j++)
		{
		  first_res = scalar_results[j % group_size];
		  new_res = gimple_build (&stmts, code, compute_type,
					  first_res, res);
		  scalar_results[j % group_size] = new_res;
		}
	      scalar_results.truncate (group_size);
	      for (k = 0; k < group_size; k++)
		scalar_results[k] = gimple_convert (&stmts, scalar_type,
						    scalar_results[k]);
	    }
	  else
	    {
	      /* Not SLP - we have one scalar to keep in SCALAR_RESULTS.  */
	      new_temp = gimple_convert (&stmts, scalar_type, new_temp);
	      scalar_results.safe_push (new_temp);
	    }

	  gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
	}
      if ((STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
	  && induc_val)
	{
	  /* Earlier we set the initial value to be a vector of induc_val
	     values.  Check the result and if it is induc_val then replace
	     with the original initial value, unless induc_val is
	     the same as initial_def already.  */
	  tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp,
				  induc_val);
	  tree initial_def = reduc_info->reduc_initial_values[0];

	  tree tmp = make_ssa_name (new_scalar_dest);
	  epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
					     initial_def, new_temp);
	  gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
	  scalar_results[0] = tmp;
	}
    }
  /* 2.5 Adjust the final result by the initial value of the reduction
	 variable. (When such adjustment is not needed, then
	 'adjustment_def' is zero).  For example, if code is PLUS we create:
	 new_temp = loop_exit_def + adjustment_def  */

  if (adjustment_def)
    {
      gcc_assert (!slp_reduc);
      gimple_seq stmts = NULL;
      if (double_reduc)
	{
	  gcc_assert (VECTOR_TYPE_P (TREE_TYPE (adjustment_def)));
	  adjustment_def = gimple_convert (&stmts, vectype, adjustment_def);
	  new_temp = gimple_build (&stmts, code, vectype,
				   reduc_inputs[0], adjustment_def);
	}
      else
	{
	  new_temp = scalar_results[0];
	  gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
	  adjustment_def = gimple_convert (&stmts, scalar_type,
					   adjustment_def);
	  new_temp = gimple_build (&stmts, code, scalar_type,
				   new_temp, adjustment_def);
	}

      epilog_stmt = gimple_seq_last_stmt (stmts);
      gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
      scalar_results[0] = new_temp;
    }

  /* Record this operation if it could be reused by the epilogue loop.  */
  if (STMT_VINFO_REDUC_TYPE (reduc_info) == TREE_CODE_REDUCTION)
    loop_vinfo->reusable_accumulators.put (scalar_results[0],
					   { orig_reduc_input, reduc_info });
  /* 2.6  Handle the loop-exit phis.  Replace the uses of scalar loop-exit
	  phis with new adjusted scalar results, i.e., replace use <s_out0>
	  with use <s_out4>.

     Transform:
	loop_exit:
	  s_out0 = phi <s_loop>			# (scalar) EXIT_PHI
	  v_out1 = phi <VECT_DEF>		# NEW_EXIT_PHI
	  v_out2 = reduce <v_out1>
	  s_out3 = extract_field <v_out2, 0>
	  s_out4 = adjust_result <s_out3>
	  use <s_out0>
	  use <s_out0>

     into:

	loop_exit:
	  s_out0 = phi <s_loop>			# (scalar) EXIT_PHI
	  v_out1 = phi <VECT_DEF>		# NEW_EXIT_PHI
	  v_out2 = reduce <v_out1>
	  s_out3 = extract_field <v_out2, 0>
	  s_out4 = adjust_result <s_out3>
	  use <s_out4>
	  use <s_out4>  */

  gcc_assert (live_out_stmts.size () == scalar_results.length ());
  for (k = 0; k < live_out_stmts.size (); k++)
    {
      stmt_vec_info scalar_stmt_info = vect_orig_stmt (live_out_stmts[k]);
      scalar_dest = gimple_get_lhs (scalar_stmt_info->stmt);

      phis.create (3);
      /* Find the loop-closed-use at the loop exit of the original scalar
	 result.  (The reduction result is expected to have two immediate uses,
	 one at the latch block, and one at the loop exit).  For double
	 reductions we are looking for exit phis of the outer loop.  */
      FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
	{
	  if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
	    {
	      if (!is_gimple_debug (USE_STMT (use_p)))
		phis.safe_push (USE_STMT (use_p));
	    }
	  else
	    {
	      if (double_reduc && gimple_code (USE_STMT (use_p)) == GIMPLE_PHI)
		{
		  tree phi_res = PHI_RESULT (USE_STMT (use_p));

		  FOR_EACH_IMM_USE_FAST (phi_use_p, phi_imm_iter, phi_res)
		    {
		      if (!flow_bb_inside_loop_p (loop,
						  gimple_bb (USE_STMT (phi_use_p)))
			  && !is_gimple_debug (USE_STMT (phi_use_p)))
			phis.safe_push (USE_STMT (phi_use_p));
		    }
		}
	    }
	}

      FOR_EACH_VEC_ELT (phis, i, exit_phi)
	{
	  /* Replace the uses:  */
	  orig_name = PHI_RESULT (exit_phi);

	  /* Look for a single use at the target of the skip edge.  */
	  if (unify_with_main_loop_p)
	    {
	      use_operand_p use_p;
	      gimple *user;
	      if (!single_imm_use (orig_name, &use_p, &user))
		gcc_unreachable ();
	      orig_name = gimple_get_lhs (user);
	    }

	  scalar_result = scalar_results[k];
	  FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
	    {
	      FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
		SET_USE (use_p, scalar_result);
	      update_stmt (use_stmt);
	    }
	}

      phis.release ();
    }
}
/* Return a vector of type VECTYPE that is equal to the vector select
   operation "MASK ? VEC : IDENTITY".  Insert the select statements
   before GSI.  */

static tree
merge_with_identity (gimple_stmt_iterator *gsi, tree mask, tree vectype,
		     tree vec, tree identity)
{
  tree cond = make_temp_ssa_name (vectype, NULL, "cond");
  gimple *new_stmt = gimple_build_assign (cond, VEC_COND_EXPR,
					  mask, vec, identity);
  gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
  return cond;
}
/* Successively apply CODE to each element of VECTOR_RHS, in left-to-right
   order, starting with LHS.  Insert the extraction statements before GSI and
   associate the new scalar SSA names with variable SCALAR_DEST.
   Return the SSA name for the result.  */
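/* For example (illustrative only): for a 4-element VECTOR_RHS { v0, v1, v2,
   v3 }, CODE == PLUS_EXPR and an incoming LHS "acc" this emits

     acc = acc + v0;
     acc = acc + v1;
     acc = acc + v2;
     acc = acc + v3;

   keeping the strict left-to-right order that in-order (fold-left)
   reductions require.  */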
static tree
vect_expand_fold_left (gimple_stmt_iterator *gsi, tree scalar_dest,
		       tree_code code, tree lhs, tree vector_rhs)
{
  tree vectype = TREE_TYPE (vector_rhs);
  tree scalar_type = TREE_TYPE (vectype);
  tree bitsize = TYPE_SIZE (scalar_type);
  unsigned HOST_WIDE_INT vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
  unsigned HOST_WIDE_INT element_bitsize = tree_to_uhwi (bitsize);

  for (unsigned HOST_WIDE_INT bit_offset = 0;
       bit_offset < vec_size_in_bits;
       bit_offset += element_bitsize)
    {
      tree bitpos = bitsize_int (bit_offset);
      tree rhs = build3 (BIT_FIELD_REF, scalar_type, vector_rhs,
			 bitsize, bitpos);

      gassign *stmt = gimple_build_assign (scalar_dest, rhs);
      rhs = make_ssa_name (scalar_dest, stmt);
      gimple_assign_set_lhs (stmt, rhs);
      gsi_insert_before (gsi, stmt, GSI_SAME_STMT);

      stmt = gimple_build_assign (scalar_dest, code, lhs, rhs);
      tree new_name = make_ssa_name (scalar_dest, stmt);
      gimple_assign_set_lhs (stmt, new_name);
      gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
      lhs = new_name;
    }
  return lhs;
}
/* Get a masked internal function equivalent to REDUC_FN.  VECTYPE_IN is the
   type of the vector input.  */

static internal_fn
get_masked_reduction_fn (internal_fn reduc_fn, tree vectype_in)
{
  internal_fn mask_reduc_fn;

  switch (reduc_fn)
    {
    case IFN_FOLD_LEFT_PLUS:
      mask_reduc_fn = IFN_MASK_FOLD_LEFT_PLUS;
      break;

    default:
      return IFN_LAST;
    }

  if (direct_internal_fn_supported_p (mask_reduc_fn, vectype_in,
				      OPTIMIZE_FOR_SPEED))
    return mask_reduc_fn;
  return IFN_LAST;
}
/* Perform an in-order reduction (FOLD_LEFT_REDUCTION).  STMT_INFO is the
   statement that sets the live-out value.  REDUC_DEF_STMT is the phi
   statement.  CODE is the operation performed by STMT_INFO and OPS are
   its scalar operands.  REDUC_INDEX is the index of the operand in
   OPS that is set by REDUC_DEF_STMT.  REDUC_FN is the function that
   implements in-order reduction, or IFN_LAST if we should open-code it.
   VECTYPE_IN is the type of the vector input.  MASKS specifies the masks
   that should be used to control the operation in a fully-masked loop.  */
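/* A minimal scalar sketch of the semantics being preserved (illustrative
   only): an in-order reduction of a float array must be evaluated as

     float res = init;
     for (int i = 0; i < n; ++i)
       res = res + a[i];

   i.e. strictly left to right, because floating-point addition is not
   associative; the partial-sum reassociation used by the other reduction
   schemes would not be valid here.  */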
static bool
vectorize_fold_left_reduction (loop_vec_info loop_vinfo,
			       stmt_vec_info stmt_info,
			       gimple_stmt_iterator *gsi,
			       gimple **vec_stmt, slp_tree slp_node,
			       gimple *reduc_def_stmt,
			       tree_code code, internal_fn reduc_fn,
			       tree ops[3], tree vectype_in,
			       int reduc_index, vec_loop_masks *masks)
{
  class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
  internal_fn mask_reduc_fn = get_masked_reduction_fn (reduc_fn, vectype_in);

  int ncopies;
  if (slp_node)
    ncopies = 1;
  else
    ncopies = vect_get_num_copies (loop_vinfo, vectype_in);

  gcc_assert (!nested_in_vect_loop_p (loop, stmt_info));
  gcc_assert (ncopies == 1);
  gcc_assert (TREE_CODE_LENGTH (code) == binary_op);

  if (slp_node)
    gcc_assert (known_eq (TYPE_VECTOR_SUBPARTS (vectype_out),
			  TYPE_VECTOR_SUBPARTS (vectype_in)));

  tree op0 = ops[1 - reduc_index];

  int group_size = 1;
  stmt_vec_info scalar_dest_def_info;
  auto_vec<tree> vec_oprnds0;
  if (slp_node)
    {
      auto_vec<vec<tree> > vec_defs (2);
      vect_get_slp_defs (loop_vinfo, slp_node, &vec_defs);
      vec_oprnds0.safe_splice (vec_defs[1 - reduc_index]);
      vec_defs[0].release ();
      vec_defs[1].release ();
      group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
      scalar_dest_def_info = SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1];
    }
  else
    {
      vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
				     op0, &vec_oprnds0);
      scalar_dest_def_info = stmt_info;
    }

  tree scalar_dest = gimple_assign_lhs (scalar_dest_def_info->stmt);
  tree scalar_type = TREE_TYPE (scalar_dest);
  tree reduc_var = gimple_phi_result (reduc_def_stmt);

  int vec_num = vec_oprnds0.length ();
  gcc_assert (vec_num == 1 || slp_node);
  tree vec_elem_type = TREE_TYPE (vectype_out);
  gcc_checking_assert (useless_type_conversion_p (scalar_type, vec_elem_type));

  tree vector_identity = NULL_TREE;
  if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
    vector_identity = build_zero_cst (vectype_out);

  tree scalar_dest_var = vect_create_destination_var (scalar_dest, NULL);
  int i;
  tree def0;
  FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
    {
      gimple *new_stmt;
      tree mask = NULL_TREE;
      if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
	mask = vect_get_loop_mask (gsi, masks, vec_num, vectype_in, i);

      /* Handle MINUS by adding the negative.  */
      if (reduc_fn != IFN_LAST && code == MINUS_EXPR)
	{
	  tree negated = make_ssa_name (vectype_out);
	  new_stmt = gimple_build_assign (negated, NEGATE_EXPR, def0);
	  gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
	  def0 = negated;
	}

      if (mask && mask_reduc_fn == IFN_LAST)
	def0 = merge_with_identity (gsi, mask, vectype_out, def0,
				    vector_identity);

      /* On the first iteration the input is simply the scalar phi
	 result, and for subsequent iterations it is the output of
	 the preceding operation.  */
      if (reduc_fn != IFN_LAST || (mask && mask_reduc_fn != IFN_LAST))
	{
	  if (mask && mask_reduc_fn != IFN_LAST)
	    new_stmt = gimple_build_call_internal (mask_reduc_fn, 3, reduc_var,
						   def0, mask);
	  else
	    new_stmt = gimple_build_call_internal (reduc_fn, 2, reduc_var,
						   def0);
	  /* For chained SLP reductions the output of the previous reduction
	     operation serves as the input of the next. For the final statement
	     the output cannot be a temporary - we reuse the original
	     scalar destination of the last statement.  */
	  if (i != vec_num - 1)
	    {
	      gimple_set_lhs (new_stmt, scalar_dest_var);
	      reduc_var = make_ssa_name (scalar_dest_var, new_stmt);
	      gimple_set_lhs (new_stmt, reduc_var);
	    }
	}
      else
	{
	  reduc_var = vect_expand_fold_left (gsi, scalar_dest_var, code,
					     reduc_var, def0);
	  new_stmt = SSA_NAME_DEF_STMT (reduc_var);
	  /* Remove the statement, so that we can use the same code paths
	     as for statements that we've just created.  */
	  gimple_stmt_iterator tmp_gsi = gsi_for_stmt (new_stmt);
	  gsi_remove (&tmp_gsi, true);
	}

      if (i == vec_num - 1)
	{
	  gimple_set_lhs (new_stmt, scalar_dest);
	  vect_finish_replace_stmt (loop_vinfo,
				    scalar_dest_def_info,
				    new_stmt);
	}
      else
	vect_finish_stmt_generation (loop_vinfo,
				     scalar_dest_def_info,
				     new_stmt, gsi);

      if (slp_node)
	SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
      else
	{
	  STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
	  *vec_stmt = new_stmt;
	}
    }

  return true;
}
/* Function is_nonwrapping_integer_induction.

   Check if STMT_VINFO (which is part of loop LOOP) both increments and
   does not cause overflow.  */
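/* For example (a sketch with made-up numbers): for an induction with base 0
   and step 4 in a loop that executes at most 100 times, the largest value the
   IV can reach is 0 + 4 * 100 = 400, so the precision check below succeeds
   for a 16-bit IV type but fails for an 8-bit one.  */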
static bool
is_nonwrapping_integer_induction (stmt_vec_info stmt_vinfo, class loop *loop)
{
  gphi *phi = as_a <gphi *> (stmt_vinfo->stmt);
  tree base = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo);
  tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo);
  tree lhs_type = TREE_TYPE (gimple_phi_result (phi));
  widest_int ni, max_loop_value, lhs_max;
  wi::overflow_type overflow = wi::OVF_NONE;

  /* Make sure the loop is integer based.  */
  if (TREE_CODE (base) != INTEGER_CST
      || TREE_CODE (step) != INTEGER_CST)
    return false;

  /* Check that the max size of the loop will not wrap.  */

  if (TYPE_OVERFLOW_UNDEFINED (lhs_type))
    return true;

  if (! max_stmt_executions (loop, &ni))
    return false;

  max_loop_value = wi::mul (wi::to_widest (step), ni, TYPE_SIGN (lhs_type),
			    &overflow);
  if (overflow)
    return false;

  max_loop_value = wi::add (wi::to_widest (base), max_loop_value,
			    TYPE_SIGN (lhs_type), &overflow);
  if (overflow)
    return false;

  return (wi::min_precision (max_loop_value, TYPE_SIGN (lhs_type))
	  <= TYPE_PRECISION (lhs_type));
}
/* Check if masking can be supported by inserting a conditional expression.
   CODE is the code for the operation.  COND_FN is the conditional internal
   function, if it exists.  VECTYPE_IN is the type of the vector input.  */

static bool
use_mask_by_cond_expr_p (code_helper code, internal_fn cond_fn,
			 tree vectype_in)
{
  if (cond_fn != IFN_LAST
      && direct_internal_fn_supported_p (cond_fn, vectype_in,
					 OPTIMIZE_FOR_SPEED))
    return false;

  if (code.is_tree_code ())
    switch (tree_code (code))
      {
      case DOT_PROD_EXPR:
      case SAD_EXPR:
	return true;

      default:
	break;
      }
  return false;
}
/* Insert a conditional expression to enable masked vectorization.  CODE is the
   code for the operation.  VOP is the array of operands.  MASK is the loop
   mask.  GSI is a statement iterator used to place the new conditional
   expression.  */
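/* As an illustrative scalar sketch (not the emitted GIMPLE): masking one
   multiplicand of a dot-product this way corresponds to

     acc += a[i] * (mask ? b[i] : 0);

   so inactive lanes contribute the operation's neutral value instead of
   whatever the inactive vector elements happen to contain.  */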
static void
build_vect_cond_expr (code_helper code, tree vop[3], tree mask,
		      gimple_stmt_iterator *gsi)
{
  switch (tree_code (code))
    {
    case DOT_PROD_EXPR:
      {
	tree vectype = TREE_TYPE (vop[1]);
	tree zero = build_zero_cst (vectype);
	tree masked_op1 = make_temp_ssa_name (vectype, NULL, "masked_op1");
	gassign *select = gimple_build_assign (masked_op1, VEC_COND_EXPR,
					       mask, vop[1], zero);
	gsi_insert_before (gsi, select, GSI_SAME_STMT);
	vop[1] = masked_op1;
	break;
      }

    case SAD_EXPR:
      {
	tree vectype = TREE_TYPE (vop[1]);
	tree masked_op1 = make_temp_ssa_name (vectype, NULL, "masked_op1");
	gassign *select = gimple_build_assign (masked_op1, VEC_COND_EXPR,
					       mask, vop[1], vop[0]);
	gsi_insert_before (gsi, select, GSI_SAME_STMT);
	vop[1] = masked_op1;
	break;
      }

    default:
      gcc_unreachable ();
    }
}
/* Function vectorizable_reduction.

   Check if STMT_INFO performs a reduction operation that can be vectorized.
   If VEC_STMT is also passed, vectorize STMT_INFO: create a vectorized
   stmt to replace it, put it in VEC_STMT, and insert it at GSI.
   Return true if STMT_INFO is vectorizable in this way.

   This function also handles reduction idioms (patterns) that have been
   recognized in advance during vect_pattern_recog.  In this case, STMT_INFO
   may be of this form:
     X = pattern_expr (arg0, arg1, ..., X)
   and its STMT_VINFO_RELATED_STMT points to the last stmt in the original
   sequence that had been detected and replaced by the pattern-stmt
   (STMT_INFO).

   This function also handles reduction of condition expressions, for example:
     for (int i = 0; i < N; i++)
       if (a[i] < value)
	 last = i;
   This is handled by vectorising the loop and creating an additional vector
   containing the loop indexes for which "a[i] < value" was true.  In the
   function epilogue this is reduced to a single max value and then used to
   index into the vector of results.

   In some cases of reduction patterns, the type of the reduction variable X is
   different than the type of the other arguments of STMT_INFO.
   In such cases, the vectype that is used when transforming STMT_INFO into
   a vector stmt is different than the vectype that is used to determine the
   vectorization factor, because it consists of a different number of elements
   than the actual number of elements that are being operated upon in parallel.

   For example, consider an accumulation of shorts into an int accumulator.
   On some targets it's possible to vectorize this pattern operating on 8
   shorts at a time (hence, the vectype for purposes of determining the
   vectorization factor should be V8HI); on the other hand, the vectype that
   is used to create the vector form is actually V4SI (the type of the result).

   Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
   indicates what is the actual level of parallelism (V8HI in the example), so
   that the right vectorization factor would be derived.  This vectype
   corresponds to the type of arguments to the reduction stmt, and should *NOT*
   be used to create the vectorized stmt.  The right vectype for the vectorized
   stmt is obtained from the type of the result X:
     get_vectype_for_scalar_type (vinfo, TREE_TYPE (X))

   This means that, contrary to "regular" reductions (or "regular" stmts in
   general), the following equation:
     STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (vinfo, TREE_TYPE (X))
   does *NOT* necessarily hold for reduction patterns.  */
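/* For example (illustrative source only), the accumulation-of-shorts case
   described above comes from code like:

     short a[N];
     int sum = 0;
     for (int i = 0; i < N; i++)
       sum += a[i];

   which vect_pattern_recog may rewrite into a widen_sum pattern stmt on
   targets that support it.  */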
bool
vectorizable_reduction (loop_vec_info loop_vinfo,
			stmt_vec_info stmt_info, slp_tree slp_node,
			slp_instance slp_node_instance,
			stmt_vector_for_cost *cost_vec)
{
  tree vectype_in = NULL_TREE;
  class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  enum vect_def_type cond_reduc_dt = vect_unknown_def_type;
  stmt_vec_info cond_stmt_vinfo = NULL;
  bool single_defuse_cycle = false;
  bool nested_cycle = false;
  bool double_reduc = false;
  tree cr_index_scalar_type = NULL_TREE, cr_index_vector_type = NULL_TREE;
  tree cond_reduc_val = NULL_TREE;

  /* Make sure it was already recognized as a reduction computation.  */
  if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def
      && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def
      && STMT_VINFO_DEF_TYPE (stmt_info) != vect_nested_cycle)
    return false;
6581 stmt_vec_info reduc_info
= info_for_reduction (loop_vinfo
, stmt_info
);
6582 reduc_info
->is_reduc_info
= true;
6584 if (STMT_VINFO_DEF_TYPE (stmt_info
) == vect_nested_cycle
)
6586 if (is_a
<gphi
*> (stmt_info
->stmt
))
6590 /* We eventually need to set a vector type on invariant
6594 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node
), j
, child
)
6595 if (!vect_maybe_update_slp_op_vectype
6596 (child
, SLP_TREE_VECTYPE (slp_node
)))
6598 if (dump_enabled_p ())
6599 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
6600 "incompatible vector types for "
6605 /* Analysis for double-reduction is done on the outer
6606 loop PHI, nested cycles have no further restrictions. */
6607 STMT_VINFO_TYPE (stmt_info
) = cycle_phi_info_type
;
6610 STMT_VINFO_TYPE (stmt_info
) = reduc_vec_info_type
;
6614 stmt_vec_info orig_stmt_of_analysis
= stmt_info
;
6615 stmt_vec_info phi_info
= stmt_info
;
6616 if (!is_a
<gphi
*> (stmt_info
->stmt
))
6618 STMT_VINFO_TYPE (stmt_info
) = reduc_vec_info_type
;
6623 slp_node_instance
->reduc_phis
= slp_node
;
6624 /* ??? We're leaving slp_node to point to the PHIs, we only
6625 need it to get at the number of vector stmts which wasn't
6626 yet initialized for the instance root. */
6628 if (STMT_VINFO_DEF_TYPE (stmt_info
) == vect_reduction_def
)
6629 stmt_info
= vect_stmt_to_vectorize (STMT_VINFO_REDUC_DEF (stmt_info
));
6632 gcc_assert (STMT_VINFO_DEF_TYPE (stmt_info
)
6633 == vect_double_reduction_def
);
6634 use_operand_p use_p
;
6636 bool res
= single_imm_use (gimple_phi_result (stmt_info
->stmt
),
6639 phi_info
= loop_vinfo
->lookup_stmt (use_stmt
);
6640 stmt_info
= vect_stmt_to_vectorize (STMT_VINFO_REDUC_DEF (phi_info
));
6643 /* PHIs should not participate in patterns. */
6644 gcc_assert (!STMT_VINFO_RELATED_STMT (phi_info
));
6645 gphi
*reduc_def_phi
= as_a
<gphi
*> (phi_info
->stmt
);
  /* Verify following REDUC_IDX from the latch def leads us back to the PHI
     and compute the reduction chain length.  Discover the real
     reduction operation stmt on the way (stmt_info and slp_for_stmt_info).  */
6651 = PHI_ARG_DEF_FROM_EDGE (reduc_def_phi
,
6653 (gimple_bb (reduc_def_phi
)->loop_father
));
6654 unsigned reduc_chain_length
= 0;
6655 bool only_slp_reduc_chain
= true;
6657 slp_tree slp_for_stmt_info
= slp_node
? slp_node_instance
->root
: NULL
;
6658 while (reduc_def
!= PHI_RESULT (reduc_def_phi
))
6660 stmt_vec_info def
= loop_vinfo
->lookup_def (reduc_def
);
6661 stmt_vec_info vdef
= vect_stmt_to_vectorize (def
);
6662 if (STMT_VINFO_REDUC_IDX (vdef
) == -1)
6664 if (dump_enabled_p ())
6665 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
6666 "reduction chain broken by patterns.\n");
6669 if (!REDUC_GROUP_FIRST_ELEMENT (vdef
))
6670 only_slp_reduc_chain
= false;
6671 /* ??? For epilogue generation live members of the chain need
6672 to point back to the PHI via their original stmt for
6673 info_for_reduction to work. */
6674 if (STMT_VINFO_LIVE_P (vdef
))
6675 STMT_VINFO_REDUC_DEF (def
) = phi_info
;
6677 if (!gimple_extract_op (vdef
->stmt
, &op
))
6679 if (dump_enabled_p ())
6680 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
6681 "reduction chain includes unsupported"
6682 " statement type.\n");
6685 if (CONVERT_EXPR_CODE_P (op
.code
))
6687 if (!tree_nop_conversion_p (op
.type
, TREE_TYPE (op
.ops
[0])))
6689 if (dump_enabled_p ())
6690 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
6691 "conversion in the reduction chain.\n");
6695 else if (!stmt_info
)
6696 /* First non-conversion stmt. */
6698 reduc_def
= op
.ops
[STMT_VINFO_REDUC_IDX (vdef
)];
6699 reduc_chain_length
++;
6700 if (!stmt_info
&& slp_node
)
6701 slp_for_stmt_info
= SLP_TREE_CHILDREN (slp_for_stmt_info
)[0];
6703 /* PHIs should not participate in patterns. */
6704 gcc_assert (!STMT_VINFO_RELATED_STMT (phi_info
));
6706 if (nested_in_vect_loop_p (loop
, stmt_info
))
6709 nested_cycle
= true;
6712 /* STMT_VINFO_REDUC_DEF doesn't point to the first but the last
6714 if (slp_node
&& REDUC_GROUP_FIRST_ELEMENT (stmt_info
))
6716 gcc_assert (!REDUC_GROUP_NEXT_ELEMENT (stmt_info
));
6717 stmt_info
= REDUC_GROUP_FIRST_ELEMENT (stmt_info
);
6719 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info
))
6720 gcc_assert (slp_node
6721 && REDUC_GROUP_FIRST_ELEMENT (stmt_info
) == stmt_info
);
  /* 1. Is vectorizable reduction?  */
  /* Not supportable if the reduction variable is used in the loop, unless
     it's a reduction chain.  */
6726 if (STMT_VINFO_RELEVANT (stmt_info
) > vect_used_in_outer
6727 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info
))
  /* Reductions that are not used even in an enclosing outer-loop,
     are expected to be "live" (used out of the loop).  */
6732 if (STMT_VINFO_RELEVANT (stmt_info
) == vect_unused_in_scope
6733 && !STMT_VINFO_LIVE_P (stmt_info
))
  /* 2. Has this been recognized as a reduction pattern?

     Check if STMT represents a pattern that has been recognized
     in earlier analysis stages.  For stmts that represent a pattern,
     the STMT_VINFO_RELATED_STMT field records the last stmt in
     the original sequence that constitutes the pattern.  */
6743 stmt_vec_info orig_stmt_info
= STMT_VINFO_RELATED_STMT (stmt_info
);
6746 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info
));
6747 gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info
));
  /* 3. Check the operands of the operation.  The first operands are defined
	inside the loop body.  The last operand is the reduction variable,
	which is defined by the loop-header-phi.  */
6754 tree vectype_out
= STMT_VINFO_VECTYPE (stmt_info
);
6755 STMT_VINFO_REDUC_VECTYPE (reduc_info
) = vectype_out
;
6757 if (!gimple_extract_op (stmt_info
->stmt
, &op
))
6759 bool lane_reduc_code_p
= (op
.code
== DOT_PROD_EXPR
6760 || op
.code
== WIDEN_SUM_EXPR
6761 || op
.code
== SAD_EXPR
);
6762 enum optab_subtype optab_query_kind
= optab_vector
;
6763 if (op
.code
== DOT_PROD_EXPR
6764 && (TYPE_SIGN (TREE_TYPE (op
.ops
[0]))
6765 != TYPE_SIGN (TREE_TYPE (op
.ops
[1]))))
6766 optab_query_kind
= optab_vector_mixed_sign
;
6768 if (!POINTER_TYPE_P (op
.type
) && !INTEGRAL_TYPE_P (op
.type
)
6769 && !SCALAR_FLOAT_TYPE_P (op
.type
))
6772 /* Do not try to vectorize bit-precision reductions. */
6773 if (!type_has_mode_precision_p (op
.type
))
6776 /* For lane-reducing ops we're reducing the number of reduction PHIs
6777 which means the only use of that may be in the lane-reducing operation. */
6778 if (lane_reduc_code_p
6779 && reduc_chain_length
!= 1
6780 && !only_slp_reduc_chain
)
6782 if (dump_enabled_p ())
6783 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
6784 "lane-reducing reduction with extra stmts.\n");
  /* All uses but the last are expected to be defined in the loop.
     The last use is the reduction variable.  In case of nested cycle this
     assumption is not true: we use reduc_index to record the index of the
     reduction variable.  */
6792 slp_tree
*slp_op
= XALLOCAVEC (slp_tree
, op
.num_ops
);
6793 /* We need to skip an extra operand for COND_EXPRs with embedded
6795 unsigned opno_adjust
= 0;
6796 if (op
.code
== COND_EXPR
&& COMPARISON_CLASS_P (op
.ops
[0]))
6798 for (i
= 0; i
< (int) op
.num_ops
; i
++)
6800 /* The condition of COND_EXPR is checked in vectorizable_condition(). */
6801 if (i
== 0 && op
.code
== COND_EXPR
)
6804 stmt_vec_info def_stmt_info
;
6805 enum vect_def_type dt
;
6806 if (!vect_is_simple_use (loop_vinfo
, stmt_info
, slp_for_stmt_info
,
6807 i
+ opno_adjust
, &op
.ops
[i
], &slp_op
[i
], &dt
,
6808 &tem
, &def_stmt_info
))
6810 if (dump_enabled_p ())
6811 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
6812 "use not simple.\n");
6815 if (i
== STMT_VINFO_REDUC_IDX (stmt_info
))
6818 /* There should be only one cycle def in the stmt, the one
6819 leading to reduc_def. */
6820 if (VECTORIZABLE_CYCLE_DEF (dt
))
6823 /* To properly compute ncopies we are interested in the widest
6824 non-reduction input type in case we're looking at a widening
6825 accumulation that we later handle in vect_transform_reduction. */
6826 if (lane_reduc_code_p
6829 || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in
)))
6830 < GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (tem
))))))
6833 if (op
.code
== COND_EXPR
)
6835 /* Record how the non-reduction-def value of COND_EXPR is defined. */
6836 if (dt
== vect_constant_def
)
6839 cond_reduc_val
= op
.ops
[i
];
6841 if (dt
== vect_induction_def
6843 && is_nonwrapping_integer_induction (def_stmt_info
, loop
))
6846 cond_stmt_vinfo
= def_stmt_info
;
6851 vectype_in
= STMT_VINFO_VECTYPE (phi_info
);
6852 STMT_VINFO_REDUC_VECTYPE_IN (reduc_info
) = vectype_in
;
6854 enum vect_reduction_type v_reduc_type
= STMT_VINFO_REDUC_TYPE (phi_info
);
6855 STMT_VINFO_REDUC_TYPE (reduc_info
) = v_reduc_type
;
6856 /* If we have a condition reduction, see if we can simplify it further. */
6857 if (v_reduc_type
== COND_REDUCTION
)
6862 /* When the condition uses the reduction value in the condition, fail. */
6863 if (STMT_VINFO_REDUC_IDX (stmt_info
) == 0)
6865 if (dump_enabled_p ())
6866 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
6867 "condition depends on previous iteration\n");
6871 if (reduc_chain_length
== 1
6872 && direct_internal_fn_supported_p (IFN_FOLD_EXTRACT_LAST
,
6873 vectype_in
, OPTIMIZE_FOR_SPEED
))
6875 if (dump_enabled_p ())
6876 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
6877 "optimizing condition reduction with"
6878 " FOLD_EXTRACT_LAST.\n");
6879 STMT_VINFO_REDUC_TYPE (reduc_info
) = EXTRACT_LAST_REDUCTION
;
6881 else if (cond_reduc_dt
== vect_induction_def
)
6884 = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (cond_stmt_vinfo
);
6885 tree step
= STMT_VINFO_LOOP_PHI_EVOLUTION_PART (cond_stmt_vinfo
);
6887 gcc_assert (TREE_CODE (base
) == INTEGER_CST
6888 && TREE_CODE (step
) == INTEGER_CST
);
6889 cond_reduc_val
= NULL_TREE
;
6890 enum tree_code cond_reduc_op_code
= ERROR_MARK
;
6891 tree res
= PHI_RESULT (STMT_VINFO_STMT (cond_stmt_vinfo
));
6892 if (!types_compatible_p (TREE_TYPE (res
), TREE_TYPE (base
)))
6894 /* Find a suitable value, for MAX_EXPR below base, for MIN_EXPR
6895 above base; punt if base is the minimum value of the type for
6896 MAX_EXPR or maximum value of the type for MIN_EXPR for now. */
6897 else if (tree_int_cst_sgn (step
) == -1)
6899 cond_reduc_op_code
= MIN_EXPR
;
6900 if (tree_int_cst_sgn (base
) == -1)
6901 cond_reduc_val
= build_int_cst (TREE_TYPE (base
), 0);
6902 else if (tree_int_cst_lt (base
,
6903 TYPE_MAX_VALUE (TREE_TYPE (base
))))
6905 = int_const_binop (PLUS_EXPR
, base
, integer_one_node
);
6909 cond_reduc_op_code
= MAX_EXPR
;
6910 if (tree_int_cst_sgn (base
) == 1)
6911 cond_reduc_val
= build_int_cst (TREE_TYPE (base
), 0);
6912 else if (tree_int_cst_lt (TYPE_MIN_VALUE (TREE_TYPE (base
)),
6915 = int_const_binop (MINUS_EXPR
, base
, integer_one_node
);
6919 if (dump_enabled_p ())
6920 dump_printf_loc (MSG_NOTE
, vect_location
,
6921 "condition expression based on "
6922 "integer induction.\n");
6923 STMT_VINFO_REDUC_CODE (reduc_info
) = cond_reduc_op_code
;
6924 STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info
)
6926 STMT_VINFO_REDUC_TYPE (reduc_info
) = INTEGER_INDUC_COND_REDUCTION
;
6929 else if (cond_reduc_dt
== vect_constant_def
)
6931 enum vect_def_type cond_initial_dt
;
6932 tree cond_initial_val
= vect_phi_initial_value (reduc_def_phi
);
6933 vect_is_simple_use (cond_initial_val
, loop_vinfo
, &cond_initial_dt
);
6934 if (cond_initial_dt
== vect_constant_def
6935 && types_compatible_p (TREE_TYPE (cond_initial_val
),
6936 TREE_TYPE (cond_reduc_val
)))
6938 tree e
= fold_binary (LE_EXPR
, boolean_type_node
,
6939 cond_initial_val
, cond_reduc_val
);
6940 if (e
&& (integer_onep (e
) || integer_zerop (e
)))
6942 if (dump_enabled_p ())
6943 dump_printf_loc (MSG_NOTE
, vect_location
,
6944 "condition expression based on "
6945 "compile time constant.\n");
6946 /* Record reduction code at analysis stage. */
6947 STMT_VINFO_REDUC_CODE (reduc_info
)
6948 = integer_onep (e
) ? MAX_EXPR
: MIN_EXPR
;
6949 STMT_VINFO_REDUC_TYPE (reduc_info
) = CONST_COND_REDUCTION
;
6955 if (STMT_VINFO_LIVE_P (phi_info
))
6961 ncopies
= vect_get_num_copies (loop_vinfo
, vectype_in
);
6963 gcc_assert (ncopies
>= 1);
6965 poly_uint64 nunits_out
= TYPE_VECTOR_SUBPARTS (vectype_out
);
6969 gcc_assert (STMT_VINFO_DEF_TYPE (reduc_info
)
6970 == vect_double_reduction_def
);
6971 double_reduc
= true;
  /* 4.2. Check support for the epilog operation.

	  If STMT represents a reduction pattern, then the type of the
	  reduction variable may be different than the type of the rest
	  of the arguments.  For example, consider the case of accumulation
	  of shorts into an int accumulator; The original code:
			S1: int_a = (int) short_a;
	  orig_stmt->	S2: int_acc = plus <int_a ,int_acc>;

	  was replaced with:
			STMT: int_acc = widen_sum <short_a, int_acc>

	  This means that:
	  1. The tree-code that is used to create the vector operation in the
	     epilog code (that reduces the partial results) is not the
	     tree-code of STMT, but is rather the tree-code of the original
	     stmt from the pattern that STMT is replacing.  I.e, in the example
	     above we want to use 'widen_sum' in the loop, but 'plus' in the
	     epilog.
	  2. The type (mode) we use to check available target support
	     for the vector operation to be created in the *epilog*, is
	     determined by the type of the reduction variable (in the example
	     above we'd check this: optab_handler (plus_optab, vect_int_mode])).
	     However the type (mode) we use to check available target support
	     for the vector operation to be created *inside the loop*, is
	     determined by the type of the other arguments to STMT (in the
	     example we'd check this: optab_handler (widen_sum_optab,
	     vect_short_mode)).

	  This is contrary to "regular" reductions, in which the types of all
	  the arguments are the same as the type of the reduction variable.
	  For "regular" reductions we can therefore use the same vector type
	  (and also the same tree-code) when generating the epilog code and
	  when generating the code inside the loop.  */
7009 code_helper orig_code
= STMT_VINFO_REDUC_CODE (phi_info
);
7010 STMT_VINFO_REDUC_CODE (reduc_info
) = orig_code
;
7012 vect_reduction_type reduction_type
= STMT_VINFO_REDUC_TYPE (reduc_info
);
7013 if (reduction_type
== TREE_CODE_REDUCTION
)
      /* Check whether it's ok to change the order of the computation.
	 Generally, when vectorizing a reduction we change the order of the
	 computation.  This may change the behavior of the program in some
	 cases, so we need to check that this is ok.  One exception is when
	 vectorizing an outer-loop: the inner-loop is executed sequentially,
	 and therefore vectorizing reductions in the inner-loop during
	 outer-loop vectorization is safe.  Likewise when we are vectorizing
	 a series of reductions using SLP and the VF is one the reductions
	 are performed in scalar order.  */
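      /* Illustrative example of why this check matters: with floats,
	 (x + y) + z need not equal x + (y + z); e.g. with x = 1e20f,
	 y = -1e20f and z = 1.0f the first form yields 1.0f and the second
	 0.0f, so reassociating a float sum into vector partial sums can
	 change the result.  */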
7025 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info
)
7026 && known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo
), 1u))
7028 else if (needs_fold_left_reduction_p (op
.type
, orig_code
))
7030 /* When vectorizing a reduction chain w/o SLP the reduction PHI
7031 is not directy used in stmt. */
7032 if (!only_slp_reduc_chain
7033 && reduc_chain_length
!= 1)
7035 if (dump_enabled_p ())
7036 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
7037 "in-order reduction chain without SLP.\n");
7040 STMT_VINFO_REDUC_TYPE (reduc_info
)
7041 = reduction_type
= FOLD_LEFT_REDUCTION
;
7043 else if (!commutative_binary_op_p (orig_code
, op
.type
)
7044 || !associative_binary_op_p (orig_code
, op
.type
))
7046 if (dump_enabled_p ())
7047 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
7048 "reduction: not commutative/associative");
7053 if ((double_reduc
|| reduction_type
!= TREE_CODE_REDUCTION
)
7056 if (dump_enabled_p ())
7057 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
7058 "multiple types in double reduction or condition "
7059 "reduction or fold-left reduction.\n");
7063 internal_fn reduc_fn
= IFN_LAST
;
7064 if (reduction_type
== TREE_CODE_REDUCTION
7065 || reduction_type
== FOLD_LEFT_REDUCTION
7066 || reduction_type
== INTEGER_INDUC_COND_REDUCTION
7067 || reduction_type
== CONST_COND_REDUCTION
)
7069 if (reduction_type
== FOLD_LEFT_REDUCTION
7070 ? fold_left_reduction_fn (orig_code
, &reduc_fn
)
7071 : reduction_fn_for_scalar_code (orig_code
, &reduc_fn
))
7073 if (reduc_fn
!= IFN_LAST
7074 && !direct_internal_fn_supported_p (reduc_fn
, vectype_out
,
7075 OPTIMIZE_FOR_SPEED
))
7077 if (dump_enabled_p ())
7078 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
7079 "reduc op not supported by target.\n");
7081 reduc_fn
= IFN_LAST
;
7086 if (!nested_cycle
|| double_reduc
)
7088 if (dump_enabled_p ())
7089 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
7090 "no reduc code for scalar code.\n");
7096 else if (reduction_type
== COND_REDUCTION
)
7098 int scalar_precision
7099 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (op
.type
));
7100 cr_index_scalar_type
= make_unsigned_type (scalar_precision
);
7101 cr_index_vector_type
= get_same_sized_vectype (cr_index_scalar_type
,
7104 if (direct_internal_fn_supported_p (IFN_REDUC_MAX
, cr_index_vector_type
,
7105 OPTIMIZE_FOR_SPEED
))
7106 reduc_fn
= IFN_REDUC_MAX
;
7108 STMT_VINFO_REDUC_FN (reduc_info
) = reduc_fn
;
7110 if (reduction_type
!= EXTRACT_LAST_REDUCTION
7111 && (!nested_cycle
|| double_reduc
)
7112 && reduc_fn
== IFN_LAST
7113 && !nunits_out
.is_constant ())
7115 if (dump_enabled_p ())
7116 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
7117 "missing target support for reduction on"
7118 " variable-length vectors.\n");
7122 /* For SLP reductions, see if there is a neutral value we can use. */
7123 tree neutral_op
= NULL_TREE
;
7126 tree initial_value
= NULL_TREE
;
7127 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info
) != NULL
)
7128 initial_value
= vect_phi_initial_value (reduc_def_phi
);
7129 neutral_op
= neutral_op_for_reduction (TREE_TYPE (vectype_out
),
7130 orig_code
, initial_value
);
7133 if (double_reduc
&& reduction_type
== FOLD_LEFT_REDUCTION
)
      /* We can't support in-order reductions of code such as this:

	   for (int i = 0; i < n1; ++i)
	     for (int j = 0; j < n2; ++j)
	       l += a[j];

	 since GCC effectively transforms the loop when vectorizing:

	   for (int i = 0; i < n1 / VF; ++i)
	     for (int j = 0; j < n2; ++j)
	       for (int k = 0; k < VF; ++k)
		 l += a[j];

	 which is a reassociation of the original operation.  */
7149 if (dump_enabled_p ())
7150 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
7151 "in-order double reduction not supported.\n");
  if (reduction_type == FOLD_LEFT_REDUCTION
      && slp_node
      && !REDUC_GROUP_FIRST_ELEMENT (stmt_info))
    {
      /* We cannot use in-order reductions in this case because there is
         an implicit reassociation of the operations involved.  */
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "in-order unchained SLP reductions not supported.\n");
      return false;
    }
  /* For double reductions, and for SLP reductions with a neutral value,
     we construct a variable-length initial vector by loading a vector
     full of the neutral value and then shift-and-inserting the start
     values into the low-numbered elements.  */
  if ((double_reduc || neutral_op)
      && !nunits_out.is_constant ()
      && !direct_internal_fn_supported_p (IFN_VEC_SHL_INSERT,
                                          vectype_out, OPTIMIZE_FOR_SPEED))
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "reduction on variable-length vectors requires"
                         " target support for a vector-shift-and-insert"
                         " operation.\n");
      return false;
    }
7185 /* Check extra constraints for variable-length unchained SLP reductions. */
7186 if (STMT_SLP_TYPE (stmt_info
)
7187 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info
)
7188 && !nunits_out
.is_constant ())
7190 /* We checked above that we could build the initial vector when
7191 there's a neutral element value. Check here for the case in
7192 which each SLP statement has its own initial value and in which
7193 that value needs to be repeated for every instance of the
7194 statement within the initial vector. */
7195 unsigned int group_size
= SLP_TREE_LANES (slp_node
);
7197 && !can_duplicate_and_interleave_p (loop_vinfo
, group_size
,
7198 TREE_TYPE (vectype_out
)))
7200 if (dump_enabled_p ())
7201 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
7202 "unsupported form of SLP reduction for"
7203 " variable-length vectors: cannot build"
7204 " initial vector.\n");
7207 /* The epilogue code relies on the number of elements being a multiple
7208 of the group size. The duplicate-and-interleave approach to setting
7209 up the initial vector does too. */
7210 if (!multiple_p (nunits_out
, group_size
))
7212 if (dump_enabled_p ())
7213 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
7214 "unsupported form of SLP reduction for"
7215 " variable-length vectors: the vector size"
7216 " is not a multiple of the number of results.\n");
7221 if (reduction_type
== COND_REDUCTION
)
7225 if (! max_loop_iterations (loop
, &ni
))
7227 if (dump_enabled_p ())
7228 dump_printf_loc (MSG_NOTE
, vect_location
,
7229 "loop count not known, cannot create cond "
7233 /* Convert backedges to iterations. */
7236 /* The additional index will be the same type as the condition. Check
7237 that the loop can fit into this less one (because we'll use up the
7238 zero slot for when there are no matches). */
7239 tree max_index
= TYPE_MAX_VALUE (cr_index_scalar_type
);
7240 if (wi::geu_p (ni
, wi::to_widest (max_index
)))
7242 if (dump_enabled_p ())
7243 dump_printf_loc (MSG_NOTE
, vect_location
,
7244 "loop size is greater than data size.\n");
7249 /* In case the vectorization factor (VF) is bigger than the number
7250 of elements that we can fit in a vectype (nunits), we have to generate
7251 more than one vector stmt - i.e - we need to "unroll" the
7252 vector stmt by a factor VF/nunits. For more details see documentation
7253 in vectorizable_operation. */
7255 /* If the reduction is used in an outer loop we need to generate
7256 VF intermediate results, like so (e.g. for ncopies=2):
7261 (i.e. we generate VF results in 2 registers).
7262 In this case we have a separate def-use cycle for each copy, and therefore
7263 for each copy we get the vector def for the reduction variable from the
7264 respective phi node created for this copy.
7266 Otherwise (the reduction is unused in the loop nest), we can combine
7267 together intermediate results, like so (e.g. for ncopies=2):
7271 (i.e. we generate VF/2 results in a single register).
7272 In this case for each copy we get the vector def for the reduction variable
7273 from the vectorized reduction operation generated in the previous iteration.
7275 This only works when we see both the reduction PHI and its only consumer
7276 in vectorizable_reduction and there are no intermediate stmts
7277 participating. When unrolling we want each unrolled iteration to have its
7278 own reduction accumulator since one of the main goals of unrolling a
7279 reduction is to reduce the aggregate loop-carried latency. */
7281 && (STMT_VINFO_RELEVANT (stmt_info
) <= vect_used_only_live
)
7282 && reduc_chain_length
== 1
7283 && loop_vinfo
->suggested_unroll_factor
== 1)
7284 single_defuse_cycle
= true;
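  /* Schematic comparison of the two schemes described above (not literal
     GIMPLE).  Without the single def-use cycle and ncopies == 2 there are
     two independent accumulators

         suma_1 = PHI <inita, suma_2>        sumb_1 = PHI <initb, sumb_2>
         suma_2 = suma_1 + dataa;            sumb_2 = sumb_1 + datab;

     combined only in the epilogue, hiding loop-carried latency.  With the
     single def-use cycle the second copy feeds on the first

         sum_2 = sum_1 + dataa;
         sum_3 = sum_2 + datab;

     needing a single PHI but serializing the operations.  */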
7286 if (single_defuse_cycle
|| lane_reduc_code_p
)
7288 gcc_assert (op
.code
!= COND_EXPR
);
7290 /* 4. Supportable by target? */
7293 /* 4.1. check support for the operation in the loop */
7294 machine_mode vec_mode
= TYPE_MODE (vectype_in
);
7295 if (!directly_supported_p (op
.code
, vectype_in
, optab_query_kind
))
7297 if (dump_enabled_p ())
7298 dump_printf (MSG_NOTE
, "op not supported by target.\n");
7299 if (maybe_ne (GET_MODE_SIZE (vec_mode
), UNITS_PER_WORD
)
7300 || !vect_can_vectorize_without_simd_p (op
.code
))
7303 if (dump_enabled_p ())
7304 dump_printf (MSG_NOTE
, "proceeding using word mode.\n");
7307 if (vect_emulated_vector_p (vectype_in
)
7308 && !vect_can_vectorize_without_simd_p (op
.code
))
7310 if (dump_enabled_p ())
7311 dump_printf (MSG_NOTE
, "using word mode not possible.\n");
7315 /* lane-reducing operations have to go through vect_transform_reduction.
7316 For the other cases try without the single cycle optimization. */
7319 if (lane_reduc_code_p
)
7322 single_defuse_cycle
= false;
7325 STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info
) = single_defuse_cycle
;
7327 /* If the reduction stmt is one of the patterns that have lane
7328 reduction embedded we cannot handle the case of ! single_defuse_cycle. */
7329 if ((ncopies
> 1 && ! single_defuse_cycle
)
7330 && lane_reduc_code_p
)
7332 if (dump_enabled_p ())
7333 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
7334 "multi def-use cycle not possible for lane-reducing "
7335 "reduction operation\n");
7340 && !(!single_defuse_cycle
7341 && !lane_reduc_code_p
7342 && reduction_type
!= FOLD_LEFT_REDUCTION
))
7343 for (i
= 0; i
< (int) op
.num_ops
; i
++)
7344 if (!vect_maybe_update_slp_op_vectype (slp_op
[i
], vectype_in
))
7346 if (dump_enabled_p ())
7347 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
7348 "incompatible vector types for invariants\n");
7353 vec_num
= SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node
);
7357 vect_model_reduction_cost (loop_vinfo
, stmt_info
, reduc_fn
,
7358 reduction_type
, ncopies
, cost_vec
);
7359 /* Cost the reduction op inside the loop if transformed via
7360 vect_transform_reduction. Otherwise this is costed by the
7361 separate vectorizable_* routines. */
7362 if (single_defuse_cycle
|| lane_reduc_code_p
)
7363 record_stmt_cost (cost_vec
, ncopies
, vector_stmt
, stmt_info
, 0, vect_body
);
7365 if (dump_enabled_p ()
7366 && reduction_type
== FOLD_LEFT_REDUCTION
)
7367 dump_printf_loc (MSG_NOTE
, vect_location
,
7368 "using an in-order (fold-left) reduction.\n");
7369 STMT_VINFO_TYPE (orig_stmt_of_analysis
) = cycle_phi_info_type
;
7370 /* All but single defuse-cycle optimized, lane-reducing and fold-left
7371 reductions go through their own vectorizable_* routines. */
7372 if (!single_defuse_cycle
7373 && !lane_reduc_code_p
7374 && reduction_type
!= FOLD_LEFT_REDUCTION
)
7377 = vect_stmt_to_vectorize (STMT_VINFO_REDUC_DEF (phi_info
));
7378 if (slp_node
&& REDUC_GROUP_FIRST_ELEMENT (tem
))
7380 gcc_assert (!REDUC_GROUP_NEXT_ELEMENT (tem
));
7381 tem
= REDUC_GROUP_FIRST_ELEMENT (tem
);
7383 STMT_VINFO_DEF_TYPE (vect_orig_stmt (tem
)) = vect_internal_def
;
7384 STMT_VINFO_DEF_TYPE (tem
) = vect_internal_def
;
7386 else if (loop_vinfo
&& LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo
))
7388 vec_loop_masks
*masks
= &LOOP_VINFO_MASKS (loop_vinfo
);
7389 internal_fn cond_fn
= get_conditional_internal_fn (op
.code
, op
.type
);
7391 if (reduction_type
!= FOLD_LEFT_REDUCTION
7392 && !use_mask_by_cond_expr_p (op
.code
, cond_fn
, vectype_in
)
7393 && (cond_fn
== IFN_LAST
7394 || !direct_internal_fn_supported_p (cond_fn
, vectype_in
,
7395 OPTIMIZE_FOR_SPEED
)))
7397 if (dump_enabled_p ())
7398 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
7399 "can't operate on partial vectors because"
7400 " no conditional operation is available.\n");
7401 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo
) = false;
7403 else if (reduction_type
== FOLD_LEFT_REDUCTION
7404 && reduc_fn
== IFN_LAST
7405 && !expand_vec_cond_expr_p (vectype_in
,
7406 truth_type_for (vectype_in
),
7409 if (dump_enabled_p ())
7410 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
7411 "can't operate on partial vectors because"
7412 " no conditional operation is available.\n");
7413 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo
) = false;
7416 vect_record_loop_mask (loop_vinfo
, masks
, ncopies
* vec_num
,
/* Transform the definition stmt STMT_INFO of a reduction PHI backedge
   value.  */

bool
vect_transform_reduction (loop_vec_info loop_vinfo,
                          stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
                          gimple **vec_stmt, slp_tree slp_node)
{
7430 tree vectype_out
= STMT_VINFO_VECTYPE (stmt_info
);
7431 class loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
7436 stmt_vec_info reduc_info
= info_for_reduction (loop_vinfo
, stmt_info
);
7437 gcc_assert (reduc_info
->is_reduc_info
);
7439 if (nested_in_vect_loop_p (loop
, stmt_info
))
7442 gcc_assert (STMT_VINFO_DEF_TYPE (reduc_info
) == vect_double_reduction_def
);
7446 if (!gimple_extract_op (stmt_info
->stmt
, &op
))
7448 gcc_assert (op
.code
.is_tree_code ());
7449 auto code
= tree_code (op
.code
);
7451 /* All uses but the last are expected to be defined in the loop.
7452 The last use is the reduction variable. In case of nested cycle this
7453 assumption is not true: we use reduc_index to record the index of the
7454 reduction variable. */
7455 stmt_vec_info phi_info
= STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info
));
7456 gphi
*reduc_def_phi
= as_a
<gphi
*> (phi_info
->stmt
);
7457 int reduc_index
= STMT_VINFO_REDUC_IDX (stmt_info
);
7458 tree vectype_in
= STMT_VINFO_REDUC_VECTYPE_IN (reduc_info
);
7463 vec_num
= SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node
);
7467 ncopies
= vect_get_num_copies (loop_vinfo
, vectype_in
);
7471 internal_fn cond_fn
= get_conditional_internal_fn (code
);
7472 vec_loop_masks
*masks
= &LOOP_VINFO_MASKS (loop_vinfo
);
7473 bool mask_by_cond_expr
= use_mask_by_cond_expr_p (code
, cond_fn
, vectype_in
);
7476 tree new_temp
= NULL_TREE
;
7477 auto_vec
<tree
> vec_oprnds0
;
7478 auto_vec
<tree
> vec_oprnds1
;
7479 auto_vec
<tree
> vec_oprnds2
;
7482 if (dump_enabled_p ())
7483 dump_printf_loc (MSG_NOTE
, vect_location
, "transform reduction.\n");
7485 /* FORNOW: Multiple types are not supported for condition. */
7486 if (code
== COND_EXPR
)
7487 gcc_assert (ncopies
== 1);
7489 bool masked_loop_p
= LOOP_VINFO_FULLY_MASKED_P (loop_vinfo
);
7491 vect_reduction_type reduction_type
= STMT_VINFO_REDUC_TYPE (reduc_info
);
7492 if (reduction_type
== FOLD_LEFT_REDUCTION
)
7494 internal_fn reduc_fn
= STMT_VINFO_REDUC_FN (reduc_info
);
7495 return vectorize_fold_left_reduction
7496 (loop_vinfo
, stmt_info
, gsi
, vec_stmt
, slp_node
, reduc_def_phi
, code
,
7497 reduc_fn
, op
.ops
, vectype_in
, reduc_index
, masks
);
7500 bool single_defuse_cycle
= STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info
);
7501 gcc_assert (single_defuse_cycle
7502 || code
== DOT_PROD_EXPR
7503 || code
== WIDEN_SUM_EXPR
7504 || code
== SAD_EXPR
);
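  /* Note on the lane-reducing codes allowed by the assert above:
     DOT_PROD_EXPR, WIDEN_SUM_EXPR and SAD_EXPR consume full input vectors
     but update an accumulator with fewer, wider lanes, e.g. a V16QI
     dot-product updating a V4SI accumulator, which is why they are always
     transformed here rather than by the generic vectorizable_* routines.  */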
7506 /* Create the destination vector */
7507 tree scalar_dest
= gimple_assign_lhs (stmt_info
->stmt
);
7508 tree vec_dest
= vect_create_destination_var (scalar_dest
, vectype_out
);
7510 vect_get_vec_defs (loop_vinfo
, stmt_info
, slp_node
, ncopies
,
7511 single_defuse_cycle
&& reduc_index
== 0
7512 ? NULL_TREE
: op
.ops
[0], &vec_oprnds0
,
7513 single_defuse_cycle
&& reduc_index
== 1
7514 ? NULL_TREE
: op
.ops
[1], &vec_oprnds1
,
7516 && !(single_defuse_cycle
&& reduc_index
== 2)
7517 ? op
.ops
[2] : NULL_TREE
, &vec_oprnds2
);
7518 if (single_defuse_cycle
)
7520 gcc_assert (!slp_node
);
7521 vect_get_vec_defs_for_operand (loop_vinfo
, stmt_info
, 1,
7522 op
.ops
[reduc_index
],
7523 reduc_index
== 0 ? &vec_oprnds0
7524 : (reduc_index
== 1 ? &vec_oprnds1
7528 FOR_EACH_VEC_ELT (vec_oprnds0
, i
, def0
)
7531 tree vop
[3] = { def0
, vec_oprnds1
[i
], NULL_TREE
};
7532 if (masked_loop_p
&& !mask_by_cond_expr
)
7534 /* Make sure that the reduction accumulator is vop[0]. */
7535 if (reduc_index
== 1)
7537 gcc_assert (commutative_tree_code (code
));
7538 std::swap (vop
[0], vop
[1]);
7540 tree mask
= vect_get_loop_mask (gsi
, masks
, vec_num
* ncopies
,
7542 gcall
*call
= gimple_build_call_internal (cond_fn
, 4, mask
,
7543 vop
[0], vop
[1], vop
[0]);
7544 new_temp
= make_ssa_name (vec_dest
, call
);
7545 gimple_call_set_lhs (call
, new_temp
);
7546 gimple_call_set_nothrow (call
, true);
7547 vect_finish_stmt_generation (loop_vinfo
, stmt_info
, call
, gsi
);
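          /* The call built above has the form (illustration, assuming an
             add reduction whose cond_fn is IFN_COND_ADD):

                 new_temp = .COND_ADD (mask, vop[0], vop[1], vop[0]);

             inactive lanes pass the accumulator vop[0] through unchanged,
             so masked-off iterations do not disturb the reduction.  */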
7552 if (op
.num_ops
== 3)
7553 vop
[2] = vec_oprnds2
[i
];
7555 if (masked_loop_p
&& mask_by_cond_expr
)
7557 tree mask
= vect_get_loop_mask (gsi
, masks
, vec_num
* ncopies
,
7559 build_vect_cond_expr (code
, vop
, mask
, gsi
);
7562 new_stmt
= gimple_build_assign (vec_dest
, code
,
7563 vop
[0], vop
[1], vop
[2]);
7564 new_temp
= make_ssa_name (vec_dest
, new_stmt
);
7565 gimple_assign_set_lhs (new_stmt
, new_temp
);
7566 vect_finish_stmt_generation (loop_vinfo
, stmt_info
, new_stmt
, gsi
);
7570 SLP_TREE_VEC_STMTS (slp_node
).quick_push (new_stmt
);
7571 else if (single_defuse_cycle
7574 if (reduc_index
== 0)
7575 vec_oprnds0
.safe_push (gimple_get_lhs (new_stmt
));
7576 else if (reduc_index
== 1)
7577 vec_oprnds1
.safe_push (gimple_get_lhs (new_stmt
));
7578 else if (reduc_index
== 2)
7579 vec_oprnds2
.safe_push (gimple_get_lhs (new_stmt
));
7582 STMT_VINFO_VEC_STMTS (stmt_info
).safe_push (new_stmt
);
7586 *vec_stmt
= STMT_VINFO_VEC_STMTS (stmt_info
)[0];
/* Transform phase of a cycle PHI.  */

bool
vect_transform_cycle_phi (loop_vec_info loop_vinfo,
                          stmt_vec_info stmt_info, gimple **vec_stmt,
                          slp_tree slp_node, slp_instance slp_node_instance)
{
7598 tree vectype_out
= STMT_VINFO_VECTYPE (stmt_info
);
7599 class loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
7603 bool nested_cycle
= false;
7606 if (nested_in_vect_loop_p (loop
, stmt_info
))
7609 nested_cycle
= true;
7612 stmt_vec_info reduc_stmt_info
= STMT_VINFO_REDUC_DEF (stmt_info
);
7613 reduc_stmt_info
= vect_stmt_to_vectorize (reduc_stmt_info
);
7614 stmt_vec_info reduc_info
= info_for_reduction (loop_vinfo
, stmt_info
);
7615 gcc_assert (reduc_info
->is_reduc_info
);
7617 if (STMT_VINFO_REDUC_TYPE (reduc_info
) == EXTRACT_LAST_REDUCTION
7618 || STMT_VINFO_REDUC_TYPE (reduc_info
) == FOLD_LEFT_REDUCTION
)
7619 /* Leave the scalar phi in place. */
7622 tree vectype_in
= STMT_VINFO_REDUC_VECTYPE_IN (reduc_info
);
7623 /* For a nested cycle we do not fill the above. */
7625 vectype_in
= STMT_VINFO_VECTYPE (stmt_info
);
7626 gcc_assert (vectype_in
);
7630 /* The size vect_schedule_slp_instance computes is off for us. */
7631 vec_num
= vect_get_num_vectors (LOOP_VINFO_VECT_FACTOR (loop_vinfo
)
7632 * SLP_TREE_LANES (slp_node
), vectype_in
);
7638 ncopies
= vect_get_num_copies (loop_vinfo
, vectype_in
);
7641 /* Check whether we should use a single PHI node and accumulate
7642 vectors to one before the backedge. */
7643 if (STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info
))
7646 /* Create the destination vector */
7647 gphi
*phi
= as_a
<gphi
*> (stmt_info
->stmt
);
7648 tree vec_dest
= vect_create_destination_var (gimple_phi_result (phi
),
7651 /* Get the loop-entry arguments. */
7652 tree vec_initial_def
= NULL_TREE
;
7653 auto_vec
<tree
> vec_initial_defs
;
7656 vec_initial_defs
.reserve (vec_num
);
7659 unsigned phi_idx
= loop_preheader_edge (loop
)->dest_idx
;
7660 vect_get_slp_defs (SLP_TREE_CHILDREN (slp_node
)[phi_idx
],
7665 gcc_assert (slp_node
== slp_node_instance
->reduc_phis
);
7666 vec
<tree
> &initial_values
= reduc_info
->reduc_initial_values
;
7667 vec
<stmt_vec_info
> &stmts
= SLP_TREE_SCALAR_STMTS (slp_node
);
7669 unsigned int num_phis
= stmts
.length ();
7670 if (REDUC_GROUP_FIRST_ELEMENT (reduc_stmt_info
))
7672 initial_values
.reserve (num_phis
);
7673 for (unsigned int i
= 0; i
< num_phis
; ++i
)
7675 gphi
*this_phi
= as_a
<gphi
*> (stmts
[i
]->stmt
);
7676 initial_values
.quick_push (vect_phi_initial_value (this_phi
));
7679 vect_find_reusable_accumulator (loop_vinfo
, reduc_info
);
7680 if (!initial_values
.is_empty ())
7683 = (num_phis
== 1 ? initial_values
[0] : NULL_TREE
);
7684 code_helper code
= STMT_VINFO_REDUC_CODE (reduc_info
);
7686 = neutral_op_for_reduction (TREE_TYPE (vectype_out
),
7687 code
, initial_value
);
7688 get_initial_defs_for_reduction (loop_vinfo
, reduc_info
,
7689 &vec_initial_defs
, vec_num
,
7690 stmts
.length (), neutral_op
);
7696 /* Get at the scalar def before the loop, that defines the initial
7697 value of the reduction variable. */
7698 tree initial_def
= vect_phi_initial_value (phi
);
7699 reduc_info
->reduc_initial_values
.safe_push (initial_def
);
7700 /* Optimize: if initial_def is for REDUC_MAX smaller than the base
7701 and we can't use zero for induc_val, use initial_def. Similarly
7702 for REDUC_MIN and initial_def larger than the base. */
7703 if (STMT_VINFO_REDUC_TYPE (reduc_info
) == INTEGER_INDUC_COND_REDUCTION
)
7705 tree induc_val
= STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info
);
7706 if (TREE_CODE (initial_def
) == INTEGER_CST
7707 && !integer_zerop (induc_val
)
7708 && ((STMT_VINFO_REDUC_CODE (reduc_info
) == MAX_EXPR
7709 && tree_int_cst_lt (initial_def
, induc_val
))
7710 || (STMT_VINFO_REDUC_CODE (reduc_info
) == MIN_EXPR
7711 && tree_int_cst_lt (induc_val
, initial_def
))))
7713 induc_val
= initial_def
;
              /* Communicate we used the initial_def to epilogue
                 generation.  */
7716 STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info
) = NULL_TREE
;
7718 vec_initial_def
= build_vector_from_val (vectype_out
, induc_val
);
7720 else if (nested_cycle
)
7722 /* Do not use an adjustment def as that case is not supported
7723 correctly if ncopies is not one. */
7724 vect_get_vec_defs_for_operand (loop_vinfo
, reduc_stmt_info
,
7725 ncopies
, initial_def
,
7728 else if (STMT_VINFO_REDUC_TYPE (reduc_info
) == CONST_COND_REDUCTION
7729 || STMT_VINFO_REDUC_TYPE (reduc_info
) == COND_REDUCTION
)
7730 /* Fill the initial vector with the initial scalar value. */
7732 = get_initial_def_for_reduction (loop_vinfo
, reduc_stmt_info
,
7733 initial_def
, initial_def
);
7737 vect_find_reusable_accumulator (loop_vinfo
, reduc_info
);
7738 if (!reduc_info
->reduc_initial_values
.is_empty ())
7740 initial_def
= reduc_info
->reduc_initial_values
[0];
7741 code_helper code
= STMT_VINFO_REDUC_CODE (reduc_info
);
7743 = neutral_op_for_reduction (TREE_TYPE (initial_def
),
7745 gcc_assert (neutral_op
);
7746 /* Try to simplify the vector initialization by applying an
7747 adjustment after the reduction has been performed. */
7748 if (!reduc_info
->reused_accumulator
7749 && STMT_VINFO_DEF_TYPE (stmt_info
) == vect_reduction_def
7750 && !operand_equal_p (neutral_op
, initial_def
))
7752 STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info
)
7754 initial_def
= neutral_op
;
7757 = get_initial_def_for_reduction (loop_vinfo
, reduc_info
,
7758 initial_def
, neutral_op
);
7763 if (vec_initial_def
)
7765 vec_initial_defs
.create (ncopies
);
7766 for (i
= 0; i
< ncopies
; ++i
)
7767 vec_initial_defs
.quick_push (vec_initial_def
);
7770 if (auto *accumulator
= reduc_info
->reused_accumulator
)
7772 tree def
= accumulator
->reduc_input
;
7773 if (!useless_type_conversion_p (vectype_out
, TREE_TYPE (def
)))
7775 unsigned int nreduc
;
7776 bool res
= constant_multiple_p (TYPE_VECTOR_SUBPARTS
7778 TYPE_VECTOR_SUBPARTS (vectype_out
),
7781 gimple_seq stmts
= NULL
;
7782 /* Reduce the single vector to a smaller one. */
7785 /* Perform the reduction in the appropriate type. */
7786 tree rvectype
= vectype_out
;
7787 if (!useless_type_conversion_p (TREE_TYPE (vectype_out
),
7788 TREE_TYPE (TREE_TYPE (def
))))
7789 rvectype
= build_vector_type (TREE_TYPE (TREE_TYPE (def
)),
7790 TYPE_VECTOR_SUBPARTS
7792 def
= vect_create_partial_epilog (def
, rvectype
,
7793 STMT_VINFO_REDUC_CODE
7797 /* The epilogue loop might use a different vector mode, like
7799 if (TYPE_MODE (vectype_out
) != TYPE_MODE (TREE_TYPE (def
)))
7801 tree reduc_type
= build_vector_type_for_mode
7802 (TREE_TYPE (TREE_TYPE (def
)), TYPE_MODE (vectype_out
));
7803 def
= gimple_convert (&stmts
, reduc_type
, def
);
7805 /* Adjust the input so we pick up the partially reduced value
7806 for the skip edge in vect_create_epilog_for_reduction. */
7807 accumulator
->reduc_input
= def
;
7808 /* And the reduction could be carried out using a different sign. */
7809 if (!useless_type_conversion_p (vectype_out
, TREE_TYPE (def
)))
7810 def
= gimple_convert (&stmts
, vectype_out
, def
);
7811 if (loop_vinfo
->main_loop_edge
)
7813 /* While we'd like to insert on the edge this will split
7814 blocks and disturb bookkeeping, we also will eventually
7815 need this on the skip edge. Rely on sinking to
7816 fixup optimal placement and insert in the pred. */
7817 gimple_stmt_iterator gsi
7818 = gsi_last_bb (loop_vinfo
->main_loop_edge
->src
);
7819 /* Insert before a cond that eventually skips the
7821 if (!gsi_end_p (gsi
) && stmt_ends_bb_p (gsi_stmt (gsi
)))
7823 gsi_insert_seq_after (&gsi
, stmts
, GSI_CONTINUE_LINKING
);
7826 gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop
),
7829 if (loop_vinfo
->main_loop_edge
)
7831 = vect_get_main_loop_result (loop_vinfo
, def
,
7832 vec_initial_defs
[0]);
7834 vec_initial_defs
.safe_push (def
);
7837 /* Generate the reduction PHIs upfront. */
7838 for (i
= 0; i
< vec_num
; i
++)
7840 tree vec_init_def
= vec_initial_defs
[i
];
7841 for (j
= 0; j
< ncopies
; j
++)
7843 /* Create the reduction-phi that defines the reduction
7845 gphi
*new_phi
= create_phi_node (vec_dest
, loop
->header
);
7847 /* Set the loop-entry arg of the reduction-phi. */
7848 if (j
!= 0 && nested_cycle
)
7849 vec_init_def
= vec_initial_defs
[j
];
7850 add_phi_arg (new_phi
, vec_init_def
, loop_preheader_edge (loop
),
7853 /* The loop-latch arg is set in epilogue processing. */
7856 SLP_TREE_VEC_STMTS (slp_node
).quick_push (new_phi
);
7860 *vec_stmt
= new_phi
;
7861 STMT_VINFO_VEC_STMTS (stmt_info
).safe_push (new_phi
);
/* Vectorizes LC PHIs.  */

bool
vectorizable_lc_phi (loop_vec_info loop_vinfo,
                     stmt_vec_info stmt_info, gimple **vec_stmt,
                     slp_tree slp_node)
{
  if (!loop_vinfo
      || !is_a <gphi *> (stmt_info->stmt)
      || gimple_phi_num_args (stmt_info->stmt) != 1)
    return false;

  if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_internal_def
      && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
    return false;
7885 if (!vec_stmt
) /* transformation not required. */
7887 /* Deal with copies from externs or constants that disguise as
7888 loop-closed PHI nodes (PR97886). */
7890 && !vect_maybe_update_slp_op_vectype (SLP_TREE_CHILDREN (slp_node
)[0],
7891 SLP_TREE_VECTYPE (slp_node
)))
7893 if (dump_enabled_p ())
7894 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
7895 "incompatible vector types for invariants\n");
7898 STMT_VINFO_TYPE (stmt_info
) = lc_phi_info_type
;
7902 tree vectype
= STMT_VINFO_VECTYPE (stmt_info
);
7903 tree scalar_dest
= gimple_phi_result (stmt_info
->stmt
);
7904 basic_block bb
= gimple_bb (stmt_info
->stmt
);
7905 edge e
= single_pred_edge (bb
);
7906 tree vec_dest
= vect_create_destination_var (scalar_dest
, vectype
);
7907 auto_vec
<tree
> vec_oprnds
;
7908 vect_get_vec_defs (loop_vinfo
, stmt_info
, slp_node
,
7909 !slp_node
? vect_get_num_copies (loop_vinfo
, vectype
) : 1,
7910 gimple_phi_arg_def (stmt_info
->stmt
, 0), &vec_oprnds
);
7911 for (unsigned i
= 0; i
< vec_oprnds
.length (); i
++)
7913 /* Create the vectorized LC PHI node. */
7914 gphi
*new_phi
= create_phi_node (vec_dest
, bb
);
7915 add_phi_arg (new_phi
, vec_oprnds
[i
], e
, UNKNOWN_LOCATION
);
7917 SLP_TREE_VEC_STMTS (slp_node
).quick_push (new_phi
);
7919 STMT_VINFO_VEC_STMTS (stmt_info
).safe_push (new_phi
);
7922 *vec_stmt
= STMT_VINFO_VEC_STMTS (stmt_info
)[0];
/* Vectorizes PHIs.  */

bool
vectorizable_phi (vec_info *,
                  stmt_vec_info stmt_info, gimple **vec_stmt,
                  slp_tree slp_node, stmt_vector_for_cost *cost_vec)
{
  if (!is_a <gphi *> (stmt_info->stmt) || !slp_node)
    return false;

  if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_internal_def)
    return false;

  tree vectype = SLP_TREE_VECTYPE (slp_node);
7942 if (!vec_stmt
) /* transformation not required. */
7946 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node
), i
, child
)
7949 if (dump_enabled_p ())
7950 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
7951 "PHI node with unvectorized backedge def\n");
7954 else if (!vect_maybe_update_slp_op_vectype (child
, vectype
))
7956 if (dump_enabled_p ())
7957 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
7958 "incompatible vector types for invariants\n");
7961 else if (SLP_TREE_DEF_TYPE (child
) == vect_internal_def
7962 && !useless_type_conversion_p (vectype
,
7963 SLP_TREE_VECTYPE (child
)))
          /* With bools we can have mask and non-mask precision vectors
             or different non-mask precisions.  While pattern recog is
             supposed to guarantee consistency here, bugs in it can cause
             mismatches (PR103489 and PR103800 for example).
             Deal with them here instead of ICEing later.  */
7970 if (dump_enabled_p ())
7971 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
7972 "incompatible vector type setup from "
7973 "bool pattern detection\n");
7977 /* For single-argument PHIs assume coalescing which means zero cost
7978 for the scalar and the vector PHIs. This avoids artificially
7979 favoring the vector path (but may pessimize it in some cases). */
7980 if (gimple_phi_num_args (as_a
<gphi
*> (stmt_info
->stmt
)) > 1)
7981 record_stmt_cost (cost_vec
, SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node
),
7982 vector_stmt
, stmt_info
, vectype
, 0, vect_body
);
7983 STMT_VINFO_TYPE (stmt_info
) = phi_info_type
;
7987 tree scalar_dest
= gimple_phi_result (stmt_info
->stmt
);
7988 basic_block bb
= gimple_bb (stmt_info
->stmt
);
7989 tree vec_dest
= vect_create_destination_var (scalar_dest
, vectype
);
7990 auto_vec
<gphi
*> new_phis
;
7991 for (unsigned i
= 0; i
< gimple_phi_num_args (stmt_info
->stmt
); ++i
)
7993 slp_tree child
= SLP_TREE_CHILDREN (slp_node
)[i
];
7995 /* Skip not yet vectorized defs. */
7996 if (SLP_TREE_DEF_TYPE (child
) == vect_internal_def
7997 && SLP_TREE_VEC_STMTS (child
).is_empty ())
8000 auto_vec
<tree
> vec_oprnds
;
8001 vect_get_slp_defs (SLP_TREE_CHILDREN (slp_node
)[i
], &vec_oprnds
);
8002 if (!new_phis
.exists ())
8004 new_phis
.create (vec_oprnds
.length ());
8005 for (unsigned j
= 0; j
< vec_oprnds
.length (); j
++)
8007 /* Create the vectorized LC PHI node. */
8008 new_phis
.quick_push (create_phi_node (vec_dest
, bb
));
8009 SLP_TREE_VEC_STMTS (slp_node
).quick_push (new_phis
[j
]);
8012 edge e
= gimple_phi_arg_edge (as_a
<gphi
*> (stmt_info
->stmt
), i
);
8013 for (unsigned j
= 0; j
< vec_oprnds
.length (); j
++)
8014 add_phi_arg (new_phis
[j
], vec_oprnds
[j
], e
, UNKNOWN_LOCATION
);
8016 /* We should have at least one already vectorized child. */
8017 gcc_assert (new_phis
.exists ());
/* Return true if VECTYPE represents a vector that requires lowering
   by the vector lowering pass.  */

bool
vect_emulated_vector_p (tree vectype)
{
  return (!VECTOR_MODE_P (TYPE_MODE (vectype))
          && (!VECTOR_BOOLEAN_TYPE_P (vectype)
              || TYPE_PRECISION (TREE_TYPE (vectype)) != 1));
}
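/* Example of what the predicate above is meant to catch (illustrative):
   a V4HI vector type on a target without a matching vector mode ends up
   with a non-vector (integer or BLK) mode, so its operations have to be
   open-coded by the generic vector lowering pass.  Single-bit boolean
   vectors are excluded because such mask types are deliberately carried
   in integer modes.  */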
/* Return true if we can emulate CODE on an integer mode representation
   of a vector.  */

bool
vect_can_vectorize_without_simd_p (tree_code code)
{
  switch (code)
    {
    case PLUS_EXPR:
    case MINUS_EXPR:
    case NEGATE_EXPR:
    case BIT_AND_EXPR:
    case BIT_IOR_EXPR:
    case BIT_XOR_EXPR:
    case BIT_NOT_EXPR:
      return true;

    default:
      return false;
    }
}

/* Likewise, but taking a code_helper.  */

bool
vect_can_vectorize_without_simd_p (code_helper code)
{
  return (code.is_tree_code ()
          && vect_can_vectorize_without_simd_p (tree_code (code)));
}
/* Function vectorizable_induction

   Check if STMT_INFO performs an induction computation that can be vectorized.
   If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
   phi to replace it, put it in VEC_STMT, and add it to the same basic block.
   Return true if STMT_INFO is vectorizable in this way.  */

bool
vectorizable_induction (loop_vec_info loop_vinfo,
                        stmt_vec_info stmt_info,
                        gimple **vec_stmt, slp_tree slp_node,
                        stmt_vector_for_cost *cost_vec)
{
8077 class loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
8079 bool nested_in_vect_loop
= false;
8080 class loop
*iv_loop
;
8082 edge pe
= loop_preheader_edge (loop
);
8084 tree new_vec
, vec_init
, vec_step
, t
;
8087 gphi
*induction_phi
;
8088 tree induc_def
, vec_dest
;
8089 tree init_expr
, step_expr
;
8090 poly_uint64 vf
= LOOP_VINFO_VECT_FACTOR (loop_vinfo
);
8093 gimple_stmt_iterator si
;
8095 gphi
*phi
= dyn_cast
<gphi
*> (stmt_info
->stmt
);
8099 if (!STMT_VINFO_RELEVANT_P (stmt_info
))
8102 /* Make sure it was recognized as induction computation. */
8103 if (STMT_VINFO_DEF_TYPE (stmt_info
) != vect_induction_def
)
8106 tree vectype
= STMT_VINFO_VECTYPE (stmt_info
);
8107 poly_uint64 nunits
= TYPE_VECTOR_SUBPARTS (vectype
);
8112 ncopies
= vect_get_num_copies (loop_vinfo
, vectype
);
8113 gcc_assert (ncopies
>= 1);
8115 /* FORNOW. These restrictions should be relaxed. */
8116 if (nested_in_vect_loop_p (loop
, stmt_info
))
8118 imm_use_iterator imm_iter
;
8119 use_operand_p use_p
;
8126 if (dump_enabled_p ())
8127 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
8128 "multiple types in nested loop.\n");
8133 latch_e
= loop_latch_edge (loop
->inner
);
8134 loop_arg
= PHI_ARG_DEF_FROM_EDGE (phi
, latch_e
);
8135 FOR_EACH_IMM_USE_FAST (use_p
, imm_iter
, loop_arg
)
8137 gimple
*use_stmt
= USE_STMT (use_p
);
8138 if (is_gimple_debug (use_stmt
))
8141 if (!flow_bb_inside_loop_p (loop
->inner
, gimple_bb (use_stmt
)))
8143 exit_phi
= use_stmt
;
8149 stmt_vec_info exit_phi_vinfo
= loop_vinfo
->lookup_stmt (exit_phi
);
8150 if (!(STMT_VINFO_RELEVANT_P (exit_phi_vinfo
)
8151 && !STMT_VINFO_LIVE_P (exit_phi_vinfo
)))
8153 if (dump_enabled_p ())
8154 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
8155 "inner-loop induction only used outside "
8156 "of the outer vectorized loop.\n");
8161 nested_in_vect_loop
= true;
8162 iv_loop
= loop
->inner
;
8166 gcc_assert (iv_loop
== (gimple_bb (phi
))->loop_father
);
8168 if (slp_node
&& !nunits
.is_constant ())
8170 /* The current SLP code creates the step value element-by-element. */
8171 if (dump_enabled_p ())
8172 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
8173 "SLP induction not supported for variable-length"
8178 if (FLOAT_TYPE_P (vectype
) && !param_vect_induction_float
)
8180 if (dump_enabled_p ())
8181 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
8182 "floating point induction vectorization disabled\n");
8186 step_expr
= STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info
);
8187 gcc_assert (step_expr
!= NULL_TREE
);
8188 tree step_vectype
= get_same_sized_vectype (TREE_TYPE (step_expr
), vectype
);
8190 /* Check for backend support of PLUS/MINUS_EXPR. */
8191 if (!directly_supported_p (PLUS_EXPR
, step_vectype
)
8192 || !directly_supported_p (MINUS_EXPR
, step_vectype
))
8195 if (!vec_stmt
) /* transformation not required. */
8197 unsigned inside_cost
= 0, prologue_cost
= 0;
8200 /* We eventually need to set a vector type on invariant
8204 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node
), j
, child
)
8205 if (!vect_maybe_update_slp_op_vectype
8206 (child
, SLP_TREE_VECTYPE (slp_node
)))
8208 if (dump_enabled_p ())
8209 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
8210 "incompatible vector types for "
8214 /* loop cost for vec_loop. */
8216 = record_stmt_cost (cost_vec
,
8217 SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node
),
8218 vector_stmt
, stmt_info
, 0, vect_body
);
8219 /* prologue cost for vec_init (if not nested) and step. */
8220 prologue_cost
= record_stmt_cost (cost_vec
, 1 + !nested_in_vect_loop
,
8222 stmt_info
, 0, vect_prologue
);
8224 else /* if (!slp_node) */
8226 /* loop cost for vec_loop. */
8227 inside_cost
= record_stmt_cost (cost_vec
, ncopies
, vector_stmt
,
8228 stmt_info
, 0, vect_body
);
8229 /* prologue cost for vec_init and vec_step. */
8230 prologue_cost
= record_stmt_cost (cost_vec
, 2, scalar_to_vec
,
8231 stmt_info
, 0, vect_prologue
);
8233 if (dump_enabled_p ())
8234 dump_printf_loc (MSG_NOTE
, vect_location
,
8235 "vect_model_induction_cost: inside_cost = %d, "
8236 "prologue_cost = %d .\n", inside_cost
,
8239 STMT_VINFO_TYPE (stmt_info
) = induc_vec_info_type
;
8240 DUMP_VECT_SCOPE ("vectorizable_induction");
8246 /* Compute a vector variable, initialized with the first VF values of
8247 the induction variable. E.g., for an iv with IV_PHI='X' and
8248 evolution S, for a vector of 4 units, we want to compute:
8249 [X, X + S, X + 2*S, X + 3*S]. */
8251 if (dump_enabled_p ())
8252 dump_printf_loc (MSG_NOTE
, vect_location
, "transform induction phi.\n");
8254 pe
= loop_preheader_edge (iv_loop
);
8255 /* Find the first insertion point in the BB. */
8256 basic_block bb
= gimple_bb (phi
);
8257 si
= gsi_after_labels (bb
);
8259 /* For SLP induction we have to generate several IVs as for example
8260 with group size 3 we need
8261 [i0, i1, i2, i0 + S0] [i1 + S1, i2 + S2, i0 + 2*S0, i1 + 2*S1]
8262 [i2 + 2*S2, i0 + 3*S0, i1 + 3*S1, i2 + 3*S2]. */
8265 /* Enforced above. */
8266 unsigned int const_nunits
= nunits
.to_constant ();
8268 /* The initial values are vectorized, but any lanes > group_size
8271 = SLP_TREE_CHILDREN (slp_node
)[pe
->dest_idx
];
8273 /* Gather steps. Since we do not vectorize inductions as
8274 cycles we have to reconstruct the step from SCEV data. */
8275 unsigned group_size
= SLP_TREE_LANES (slp_node
);
8276 tree
*steps
= XALLOCAVEC (tree
, group_size
);
8277 tree
*inits
= XALLOCAVEC (tree
, group_size
);
8278 stmt_vec_info phi_info
;
8279 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (slp_node
), i
, phi_info
)
8281 steps
[i
] = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (phi_info
);
8283 inits
[i
] = gimple_phi_arg_def (as_a
<gphi
*> (phi_info
->stmt
),
8287 /* Now generate the IVs. */
8288 unsigned nvects
= SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node
);
8289 gcc_assert ((const_nunits
* nvects
) % group_size
== 0);
8291 if (nested_in_vect_loop
)
8295 /* Compute the number of distinct IVs we need. First reduce
8296 group_size if it is a multiple of const_nunits so we get
8297 one IV for a group_size of 4 but const_nunits 2. */
8298 unsigned group_sizep
= group_size
;
8299 if (group_sizep
% const_nunits
== 0)
8300 group_sizep
= group_sizep
/ const_nunits
;
8301 nivs
= least_common_multiple (group_sizep
,
8302 const_nunits
) / const_nunits
;
8304 tree stept
= TREE_TYPE (step_vectype
);
8305 tree lupdate_mul
= NULL_TREE
;
8306 if (!nested_in_vect_loop
)
8308 /* The number of iterations covered in one vector iteration. */
8309 unsigned lup_mul
= (nvects
* const_nunits
) / group_size
;
8311 = build_vector_from_val (step_vectype
,
8312 SCALAR_FLOAT_TYPE_P (stept
)
8313 ? build_real_from_wide (stept
, lup_mul
,
8315 : build_int_cstu (stept
, lup_mul
));
8317 tree peel_mul
= NULL_TREE
;
8318 gimple_seq init_stmts
= NULL
;
8319 if (LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo
))
8321 if (SCALAR_FLOAT_TYPE_P (stept
))
8322 peel_mul
= gimple_build (&init_stmts
, FLOAT_EXPR
, stept
,
8323 LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo
));
8325 peel_mul
= gimple_convert (&init_stmts
, stept
,
8326 LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo
));
8327 peel_mul
= gimple_build_vector_from_val (&init_stmts
,
8328 step_vectype
, peel_mul
);
8331 auto_vec
<tree
> vec_steps
;
8332 for (ivn
= 0; ivn
< nivs
; ++ivn
)
8334 tree_vector_builder
step_elts (step_vectype
, const_nunits
, 1);
8335 tree_vector_builder
init_elts (vectype
, const_nunits
, 1);
8336 tree_vector_builder
mul_elts (step_vectype
, const_nunits
, 1);
8337 for (unsigned eltn
= 0; eltn
< const_nunits
; ++eltn
)
8339 /* The scalar steps of the IVs. */
8340 tree elt
= steps
[(ivn
*const_nunits
+ eltn
) % group_size
];
8341 elt
= gimple_convert (&init_stmts
, TREE_TYPE (step_vectype
), elt
);
8342 step_elts
.quick_push (elt
);
8345 /* The scalar inits of the IVs if not vectorized. */
8346 elt
= inits
[(ivn
*const_nunits
+ eltn
) % group_size
];
8347 if (!useless_type_conversion_p (TREE_TYPE (vectype
),
8349 elt
= gimple_build (&init_stmts
, VIEW_CONVERT_EXPR
,
8350 TREE_TYPE (vectype
), elt
);
8351 init_elts
.quick_push (elt
);
8353 /* The number of steps to add to the initial values. */
8354 unsigned mul_elt
= (ivn
*const_nunits
+ eltn
) / group_size
;
8355 mul_elts
.quick_push (SCALAR_FLOAT_TYPE_P (stept
)
8356 ? build_real_from_wide (stept
,
8358 : build_int_cstu (stept
, mul_elt
));
8360 vec_step
= gimple_build_vector (&init_stmts
, &step_elts
);
8361 vec_steps
.safe_push (vec_step
);
8362 tree step_mul
= gimple_build_vector (&init_stmts
, &mul_elts
);
8364 step_mul
= gimple_build (&init_stmts
, PLUS_EXPR
, step_vectype
,
8365 step_mul
, peel_mul
);
8367 vec_init
= gimple_build_vector (&init_stmts
, &init_elts
);
8369 /* Create the induction-phi that defines the induction-operand. */
8370 vec_dest
= vect_get_new_vect_var (vectype
, vect_simple_var
,
8372 induction_phi
= create_phi_node (vec_dest
, iv_loop
->header
);
8373 induc_def
= PHI_RESULT (induction_phi
);
8375 /* Create the iv update inside the loop */
8378 up
= gimple_build (&init_stmts
, MULT_EXPR
, step_vectype
,
8379 vec_step
, lupdate_mul
);
8380 gimple_seq stmts
= NULL
;
8381 vec_def
= gimple_convert (&stmts
, step_vectype
, induc_def
);
8382 vec_def
= gimple_build (&stmts
,
8383 PLUS_EXPR
, step_vectype
, vec_def
, up
);
8384 vec_def
= gimple_convert (&stmts
, vectype
, vec_def
);
8385 gsi_insert_seq_before (&si
, stmts
, GSI_SAME_STMT
);
8386 add_phi_arg (induction_phi
, vec_def
, loop_latch_edge (iv_loop
),
8390 vec_init
= vect_get_slp_vect_def (init_node
, ivn
);
8391 if (!nested_in_vect_loop
8392 && !integer_zerop (step_mul
))
8394 vec_def
= gimple_convert (&init_stmts
, step_vectype
, vec_init
);
8395 up
= gimple_build (&init_stmts
, MULT_EXPR
, step_vectype
,
8396 vec_step
, step_mul
);
8397 vec_def
= gimple_build (&init_stmts
, PLUS_EXPR
, step_vectype
,
8399 vec_init
= gimple_convert (&init_stmts
, vectype
, vec_def
);
8402 /* Set the arguments of the phi node: */
8403 add_phi_arg (induction_phi
, vec_init
, pe
, UNKNOWN_LOCATION
);
8405 SLP_TREE_VEC_STMTS (slp_node
).quick_push (induction_phi
);
8407 if (!nested_in_vect_loop
)
8409 /* Fill up to the number of vectors we need for the whole group. */
8410 nivs
= least_common_multiple (group_size
,
8411 const_nunits
) / const_nunits
;
8412 vec_steps
.reserve (nivs
-ivn
);
8413 for (; ivn
< nivs
; ++ivn
)
8415 SLP_TREE_VEC_STMTS (slp_node
)
8416 .quick_push (SLP_TREE_VEC_STMTS (slp_node
)[0]);
8417 vec_steps
.quick_push (vec_steps
[0]);
8421 /* Re-use IVs when we can. We are generating further vector
8422 stmts by adding VF' * stride to the IVs generated above. */
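          /* Illustration of the re-use (schematic): if the IVs built above
             cover the first nivs vectors, every further vector is derived as

                 vec[ivn] = vec[ivn - nivs] + VF' * step

             inserted right after the statement defining vec[ivn - nivs],
             so no additional PHI nodes are required.  */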
8426 = least_common_multiple (group_size
, const_nunits
) / group_size
;
8428 = build_vector_from_val (step_vectype
,
8429 SCALAR_FLOAT_TYPE_P (stept
)
8430 ? build_real_from_wide (stept
,
8432 : build_int_cstu (stept
, vfp
));
8433 for (; ivn
< nvects
; ++ivn
)
8435 gimple
*iv
= SLP_TREE_VEC_STMTS (slp_node
)[ivn
- nivs
];
8436 tree def
= gimple_get_lhs (iv
);
8438 vec_steps
[ivn
- nivs
]
8439 = gimple_build (&init_stmts
, MULT_EXPR
, step_vectype
,
8440 vec_steps
[ivn
- nivs
], lupdate_mul
);
8441 gimple_seq stmts
= NULL
;
8442 def
= gimple_convert (&stmts
, step_vectype
, def
);
8443 def
= gimple_build (&stmts
, PLUS_EXPR
, step_vectype
,
8444 def
, vec_steps
[ivn
% nivs
]);
8445 def
= gimple_convert (&stmts
, vectype
, def
);
8446 if (gimple_code (iv
) == GIMPLE_PHI
)
8447 gsi_insert_seq_before (&si
, stmts
, GSI_SAME_STMT
);
8450 gimple_stmt_iterator tgsi
= gsi_for_stmt (iv
);
8451 gsi_insert_seq_after (&tgsi
, stmts
, GSI_CONTINUE_LINKING
);
8453 SLP_TREE_VEC_STMTS (slp_node
)
8454 .quick_push (SSA_NAME_DEF_STMT (def
));
8458 new_bb
= gsi_insert_seq_on_edge_immediate (pe
, init_stmts
);
8459 gcc_assert (!new_bb
);
8464 init_expr
= vect_phi_initial_value (phi
);
8466 gimple_seq stmts
= NULL
;
8467 if (!nested_in_vect_loop
)
8469 /* Convert the initial value to the IV update type. */
8470 tree new_type
= TREE_TYPE (step_expr
);
8471 init_expr
= gimple_convert (&stmts
, new_type
, init_expr
);
8473 /* If we are using the loop mask to "peel" for alignment then we need
8474 to adjust the start value here. */
8475 tree skip_niters
= LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo
);
8476 if (skip_niters
!= NULL_TREE
)
8478 if (FLOAT_TYPE_P (vectype
))
8479 skip_niters
= gimple_build (&stmts
, FLOAT_EXPR
, new_type
,
8482 skip_niters
= gimple_convert (&stmts
, new_type
, skip_niters
);
8483 tree skip_step
= gimple_build (&stmts
, MULT_EXPR
, new_type
,
8484 skip_niters
, step_expr
);
8485 init_expr
= gimple_build (&stmts
, MINUS_EXPR
, new_type
,
8486 init_expr
, skip_step
);
8492 new_bb
= gsi_insert_seq_on_edge_immediate (pe
, stmts
);
8493 gcc_assert (!new_bb
);
8496 /* Create the vector that holds the initial_value of the induction. */
8497 if (nested_in_vect_loop
)
8499 /* iv_loop is nested in the loop to be vectorized. init_expr had already
8500 been created during vectorization of previous stmts. We obtain it
8501 from the STMT_VINFO_VEC_STMT of the defining stmt. */
8502 auto_vec
<tree
> vec_inits
;
8503 vect_get_vec_defs_for_operand (loop_vinfo
, stmt_info
, 1,
8504 init_expr
, &vec_inits
);
8505 vec_init
= vec_inits
[0];
8506 /* If the initial value is not of proper type, convert it. */
8507 if (!useless_type_conversion_p (vectype
, TREE_TYPE (vec_init
)))
8510 = gimple_build_assign (vect_get_new_ssa_name (vectype
,
8514 build1 (VIEW_CONVERT_EXPR
, vectype
,
8516 vec_init
= gimple_assign_lhs (new_stmt
);
8517 new_bb
= gsi_insert_on_edge_immediate (loop_preheader_edge (iv_loop
),
8519 gcc_assert (!new_bb
);
8524 /* iv_loop is the loop to be vectorized. Create:
8525 vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr) */
8527 new_name
= gimple_convert (&stmts
, TREE_TYPE (step_expr
), init_expr
);
8529 unsigned HOST_WIDE_INT const_nunits
;
8530 if (nunits
.is_constant (&const_nunits
))
8532 tree_vector_builder
elts (step_vectype
, const_nunits
, 1);
8533 elts
.quick_push (new_name
);
8534 for (i
= 1; i
< const_nunits
; i
++)
8536 /* Create: new_name_i = new_name + step_expr */
8537 new_name
= gimple_build (&stmts
, PLUS_EXPR
, TREE_TYPE (new_name
),
8538 new_name
, step_expr
);
8539 elts
.quick_push (new_name
);
8541 /* Create a vector from [new_name_0, new_name_1, ...,
8542 new_name_nunits-1] */
8543 vec_init
= gimple_build_vector (&stmts
, &elts
);
8545 else if (INTEGRAL_TYPE_P (TREE_TYPE (step_expr
)))
8546 /* Build the initial value directly from a VEC_SERIES_EXPR. */
8547 vec_init
= gimple_build (&stmts
, VEC_SERIES_EXPR
, step_vectype
,
8548 new_name
, step_expr
);
8552 [base, base, base, ...]
8553 + (vectype) [0, 1, 2, ...] * [step, step, step, ...]. */
8554 gcc_assert (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr
)));
8555 gcc_assert (flag_associative_math
);
8556 tree index
= build_index_vector (step_vectype
, 0, 1);
8557 tree base_vec
= gimple_build_vector_from_val (&stmts
, step_vectype
,
8559 tree step_vec
= gimple_build_vector_from_val (&stmts
, step_vectype
,
8561 vec_init
= gimple_build (&stmts
, FLOAT_EXPR
, step_vectype
, index
);
8562 vec_init
= gimple_build (&stmts
, MULT_EXPR
, step_vectype
,
8563 vec_init
, step_vec
);
8564 vec_init
= gimple_build (&stmts
, PLUS_EXPR
, step_vectype
,
8565 vec_init
, base_vec
);
8567 vec_init
= gimple_convert (&stmts
, vectype
, vec_init
);
8571 new_bb
= gsi_insert_seq_on_edge_immediate (pe
, stmts
);
8572 gcc_assert (!new_bb
);
8577 /* Create the vector that holds the step of the induction. */
8578 if (nested_in_vect_loop
)
8579 /* iv_loop is nested in the loop to be vectorized. Generate:
8580 vec_step = [S, S, S, S] */
8581 new_name
= step_expr
;
8584 /* iv_loop is the loop to be vectorized. Generate:
8585 vec_step = [VF*S, VF*S, VF*S, VF*S] */
8586 gimple_seq seq
= NULL
;
8587 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr
)))
8589 expr
= build_int_cst (integer_type_node
, vf
);
8590 expr
= gimple_build (&seq
, FLOAT_EXPR
, TREE_TYPE (step_expr
), expr
);
8593 expr
= build_int_cst (TREE_TYPE (step_expr
), vf
);
8594 new_name
= gimple_build (&seq
, MULT_EXPR
, TREE_TYPE (step_expr
),
8598 new_bb
= gsi_insert_seq_on_edge_immediate (pe
, seq
);
8599 gcc_assert (!new_bb
);
8603 t
= unshare_expr (new_name
);
8604 gcc_assert (CONSTANT_CLASS_P (new_name
)
8605 || TREE_CODE (new_name
) == SSA_NAME
);
8606 new_vec
= build_vector_from_val (step_vectype
, t
);
8607 vec_step
= vect_init_vector (loop_vinfo
, stmt_info
,
8608 new_vec
, step_vectype
, NULL
);
8611 /* Create the following def-use cycle:
8616 vec_iv = PHI <vec_init, vec_loop>
8620 vec_loop = vec_iv + vec_step; */
8622 /* Create the induction-phi that defines the induction-operand. */
8623 vec_dest
= vect_get_new_vect_var (vectype
, vect_simple_var
, "vec_iv_");
8624 induction_phi
= create_phi_node (vec_dest
, iv_loop
->header
);
8625 induc_def
= PHI_RESULT (induction_phi
);
8627 /* Create the iv update inside the loop */
8629 vec_def
= gimple_convert (&stmts
, step_vectype
, induc_def
);
8630 vec_def
= gimple_build (&stmts
, PLUS_EXPR
, step_vectype
, vec_def
, vec_step
);
8631 vec_def
= gimple_convert (&stmts
, vectype
, vec_def
);
8632 gsi_insert_seq_before (&si
, stmts
, GSI_SAME_STMT
);
8633 new_stmt
= SSA_NAME_DEF_STMT (vec_def
);
8635 /* Set the arguments of the phi node: */
8636 add_phi_arg (induction_phi
, vec_init
, pe
, UNKNOWN_LOCATION
);
8637 add_phi_arg (induction_phi
, vec_def
, loop_latch_edge (iv_loop
),
8640 STMT_VINFO_VEC_STMTS (stmt_info
).safe_push (induction_phi
);
8641 *vec_stmt
= induction_phi
;
8643 /* In case that vectorization factor (VF) is bigger than the number
8644 of elements that we can fit in a vectype (nunits), we have to generate
8645 more than one vector stmt - i.e - we need to "unroll" the
8646 vector stmt by a factor VF/nunits. For more details see documentation
8647 in vectorizable_operation. */
8651 gimple_seq seq
= NULL
;
8652 /* FORNOW. This restriction should be relaxed. */
8653 gcc_assert (!nested_in_vect_loop
);
8655 /* Create the vector that holds the step of the induction. */
8656 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr
)))
8658 expr
= build_int_cst (integer_type_node
, nunits
);
8659 expr
= gimple_build (&seq
, FLOAT_EXPR
, TREE_TYPE (step_expr
), expr
);
8662 expr
= build_int_cst (TREE_TYPE (step_expr
), nunits
);
8663 new_name
= gimple_build (&seq
, MULT_EXPR
, TREE_TYPE (step_expr
),
8667 new_bb
= gsi_insert_seq_on_edge_immediate (pe
, seq
);
8668 gcc_assert (!new_bb
);
8671 t
= unshare_expr (new_name
);
8672 gcc_assert (CONSTANT_CLASS_P (new_name
)
8673 || TREE_CODE (new_name
) == SSA_NAME
);
8674 new_vec
= build_vector_from_val (step_vectype
, t
);
8675 vec_step
= vect_init_vector (loop_vinfo
, stmt_info
,
8676 new_vec
, step_vectype
, NULL
);
8678 vec_def
= induc_def
;
8679 for (i
= 1; i
< ncopies
; i
++)
8681 /* vec_i = vec_prev + vec_step */
8682 gimple_seq stmts
= NULL
;
8683 vec_def
= gimple_convert (&stmts
, step_vectype
, vec_def
);
8684 vec_def
= gimple_build (&stmts
,
8685 PLUS_EXPR
, step_vectype
, vec_def
, vec_step
);
8686 vec_def
= gimple_convert (&stmts
, vectype
, vec_def
);
8688 gsi_insert_seq_before (&si
, stmts
, GSI_SAME_STMT
);
8689 new_stmt
= SSA_NAME_DEF_STMT (vec_def
);
8690 STMT_VINFO_VEC_STMTS (stmt_info
).safe_push (new_stmt
);
8694 if (dump_enabled_p ())
8695 dump_printf_loc (MSG_NOTE
, vect_location
,
8696 "transform induction: created def-use cycle: %G%G",
8697 induction_phi
, SSA_NAME_DEF_STMT (vec_def
));
/* Function vectorizable_live_operation.

   STMT_INFO computes a value that is used outside the loop.  Check if
   it can be supported.  */

bool
vectorizable_live_operation (vec_info *vinfo, stmt_vec_info stmt_info,
                             gimple_stmt_iterator *gsi,
                             slp_tree slp_node, slp_instance slp_node_instance,
                             int slp_index, bool vec_stmt_p,
                             stmt_vector_for_cost *cost_vec)
{
8715 loop_vec_info loop_vinfo
= dyn_cast
<loop_vec_info
> (vinfo
);
8716 imm_use_iterator imm_iter
;
8717 tree lhs
, lhs_type
, bitsize
;
8718 tree vectype
= (slp_node
8719 ? SLP_TREE_VECTYPE (slp_node
)
8720 : STMT_VINFO_VECTYPE (stmt_info
));
8721 poly_uint64 nunits
= TYPE_VECTOR_SUBPARTS (vectype
);
8724 auto_vec
<tree
> vec_oprnds
;
8726 poly_uint64 vec_index
= 0;
8728 gcc_assert (STMT_VINFO_LIVE_P (stmt_info
));
8730 /* If a stmt of a reduction is live, vectorize it via
8731 vect_create_epilog_for_reduction. vectorizable_reduction assessed
8732 validity so just trigger the transform here. */
8733 if (STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info
)))
8739 /* For reduction chains the meta-info is attached to
8740 the group leader. */
8741 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info
))
8742 stmt_info
= REDUC_GROUP_FIRST_ELEMENT (stmt_info
);
8743 /* For SLP reductions we vectorize the epilogue for
8744 all involved stmts together. */
8745 else if (slp_index
!= 0)
8748 /* For SLP reductions the meta-info is attached to
8749 the representative. */
8750 stmt_info
= SLP_TREE_REPRESENTATIVE (slp_node
);
8752 stmt_vec_info reduc_info
= info_for_reduction (loop_vinfo
, stmt_info
);
8753 gcc_assert (reduc_info
->is_reduc_info
);
8754 if (STMT_VINFO_REDUC_TYPE (reduc_info
) == FOLD_LEFT_REDUCTION
8755 || STMT_VINFO_REDUC_TYPE (reduc_info
) == EXTRACT_LAST_REDUCTION
)
8757 vect_create_epilog_for_reduction (loop_vinfo
, stmt_info
, slp_node
,
8762 /* If STMT is not relevant and it is a simple assignment and its inputs are
8763 invariant then it can remain in place, unvectorized. The original last
8764 scalar value that it computes will be used. */
8765 if (!STMT_VINFO_RELEVANT_P (stmt_info
))
8767 gcc_assert (is_simple_and_all_uses_invariant (stmt_info
, loop_vinfo
));
8768 if (dump_enabled_p ())
8769 dump_printf_loc (MSG_NOTE
, vect_location
,
8770 "statement is simple and uses invariant. Leaving in "
8778 ncopies
= vect_get_num_copies (loop_vinfo
, vectype
);
8782 gcc_assert (slp_index
>= 0);
8784 /* Get the last occurrence of the scalar index from the concatenation of
8785 all the slp vectors. Calculate which slp vector it is and the index
8787 int num_scalar
= SLP_TREE_LANES (slp_node
);
8788 int num_vec
= SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node
);
8789 poly_uint64 pos
= (num_vec
* nunits
) - num_scalar
+ slp_index
;
8791 /* Calculate which vector contains the result, and which lane of
8792 that vector we need. */
8793 if (!can_div_trunc_p (pos
, nunits
, &vec_entry
, &vec_index
))
8795 if (dump_enabled_p ())
8796 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
8797 "Cannot determine which vector holds the"
8798 " final result.\n");
8805 /* No transformation required. */
8806 if (loop_vinfo
&& LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo
))
8808 if (!direct_internal_fn_supported_p (IFN_EXTRACT_LAST
, vectype
,
8809 OPTIMIZE_FOR_SPEED
))
8811 if (dump_enabled_p ())
8812 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
8813 "can't operate on partial vectors "
8814 "because the target doesn't support extract "
8815 "last reduction.\n");
8816 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo
) = false;
8820 if (dump_enabled_p ())
8821 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
8822 "can't operate on partial vectors "
8823 "because an SLP statement is live after "
8825 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo
) = false;
8827 else if (ncopies
> 1)
8829 if (dump_enabled_p ())
8830 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
8831 "can't operate on partial vectors "
8832 "because ncopies is greater than 1.\n");
8833 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo
) = false;
8837 gcc_assert (ncopies
== 1 && !slp_node
);
8838 vect_record_loop_mask (loop_vinfo
,
8839 &LOOP_VINFO_MASKS (loop_vinfo
),
8843 /* ??? Enable for loop costing as well. */
8845 record_stmt_cost (cost_vec
, 1, vec_to_scalar
, stmt_info
, NULL_TREE
,
8850 /* Use the lhs of the original scalar statement. */
8851 gimple
*stmt
= vect_orig_stmt (stmt_info
)->stmt
;
8852 if (dump_enabled_p ())
8853 dump_printf_loc (MSG_NOTE
, vect_location
, "extracting lane for live "
8856 lhs
= gimple_get_lhs (stmt
);
8857 lhs_type
= TREE_TYPE (lhs
);
8859 bitsize
= vector_element_bits_tree (vectype
);
8861 /* Get the vectorized lhs of STMT and the lane to use (counted in bits). */
8862 tree vec_lhs
, bitstart
;
8866 gcc_assert (!loop_vinfo
|| !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo
));
8868 /* Get the correct slp vectorized stmt. */
8869 vec_stmt
= SLP_TREE_VEC_STMTS (slp_node
)[vec_entry
];
8870 vec_lhs
= gimple_get_lhs (vec_stmt
);
8872 /* Get entry to use. */
8873 bitstart
= bitsize_int (vec_index
);
8874 bitstart
= int_const_binop (MULT_EXPR
, bitsize
, bitstart
);
8878 /* For multiple copies, get the last copy. */
8879 vec_stmt
= STMT_VINFO_VEC_STMTS (stmt_info
).last ();
8880 vec_lhs
= gimple_get_lhs (vec_stmt
);
8882 /* Get the last lane in the vector. */
8883 bitstart
= int_const_binop (MULT_EXPR
, bitsize
, bitsize_int (nunits
- 1));
8888 /* Ensure the VEC_LHS for lane extraction stmts satisfy loop-closed PHI
8889 requirement, insert one phi node for it. It looks like:
8896 # vec_lhs' = PHI <vec_lhs>
8897 new_tree = lane_extract <vec_lhs', ...>;
8900 class loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
8901 basic_block exit_bb
= single_exit (loop
)->dest
;
8902 gcc_assert (single_pred_p (exit_bb
));
8904 tree vec_lhs_phi
= copy_ssa_name (vec_lhs
);
8905 gimple
*phi
= create_phi_node (vec_lhs_phi
, exit_bb
);
8906 SET_PHI_ARG_DEF (phi
, single_exit (loop
)->dest_idx
, vec_lhs
);
8908 gimple_seq stmts
= NULL
;
8910 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo
))
8914 SCALAR_RES = EXTRACT_LAST <VEC_LHS, MASK>
8916 where VEC_LHS is the vectorized live-out result and MASK is
8917 the loop mask for the final iteration. */
8918 gcc_assert (ncopies
== 1 && !slp_node
);
8919 tree scalar_type
= TREE_TYPE (STMT_VINFO_VECTYPE (stmt_info
));
8920 tree mask
= vect_get_loop_mask (gsi
, &LOOP_VINFO_MASKS (loop_vinfo
),
8922 tree scalar_res
= gimple_build (&stmts
, CFN_EXTRACT_LAST
, scalar_type
,
8925 /* Convert the extracted vector element to the scalar type. */
8926 new_tree
= gimple_convert (&stmts
, lhs_type
, scalar_res
);
8930 tree bftype
= TREE_TYPE (vectype
);
8931 if (VECTOR_BOOLEAN_TYPE_P (vectype
))
8932 bftype
= build_nonstandard_integer_type (tree_to_uhwi (bitsize
), 1);
8933 new_tree
= build3 (BIT_FIELD_REF
, bftype
,
8934 vec_lhs_phi
, bitsize
, bitstart
);
8935 new_tree
= force_gimple_operand (fold_convert (lhs_type
, new_tree
),
8936 &stmts
, true, NULL_TREE
);
8941 gimple_stmt_iterator exit_gsi
= gsi_after_labels (exit_bb
);
8942 gsi_insert_seq_before (&exit_gsi
, stmts
, GSI_SAME_STMT
);
8944 /* Remove existing phi from lhs and create one copy from new_tree. */
8945 tree lhs_phi
= NULL_TREE
;
8946 gimple_stmt_iterator gsi
;
8947 for (gsi
= gsi_start_phis (exit_bb
);
8948 !gsi_end_p (gsi
); gsi_next (&gsi
))
8950 gimple
*phi
= gsi_stmt (gsi
);
8951 if ((gimple_phi_arg_def (phi
, 0) == lhs
))
8953 remove_phi_node (&gsi
, false);
8954 lhs_phi
= gimple_phi_result (phi
);
8955 gimple
*copy
= gimple_build_assign (lhs_phi
, new_tree
);
8956 gsi_insert_before (&exit_gsi
, copy
, GSI_SAME_STMT
);
8962 /* Replace use of lhs with newly computed result. If the use stmt is a
8963 single arg PHI, just replace all uses of PHI result. It's necessary
8964 because lcssa PHI defining lhs may be before newly inserted stmt. */
8965 use_operand_p use_p
;
8966 FOR_EACH_IMM_USE_STMT (use_stmt
, imm_iter
, lhs
)
8967 if (!flow_bb_inside_loop_p (loop
, gimple_bb (use_stmt
))
8968 && !is_gimple_debug (use_stmt
))
8970 if (gimple_code (use_stmt
) == GIMPLE_PHI
8971 && gimple_phi_num_args (use_stmt
) == 1)
8973 replace_uses_by (gimple_phi_result (use_stmt
), new_tree
);
8977 FOR_EACH_IMM_USE_ON_STMT (use_p
, imm_iter
)
8978 SET_USE (use_p
, new_tree
);
8980 update_stmt (use_stmt
);
  else
    {
      /* For basic-block vectorization simply insert the lane-extraction.  */
      tree bftype = TREE_TYPE (vectype);
      if (VECTOR_BOOLEAN_TYPE_P (vectype))
	bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1);
      tree new_tree = build3 (BIT_FIELD_REF, bftype,
			      vec_lhs, bitsize, bitstart);
      gimple_seq stmts = NULL;
      new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree),
				       &stmts, true, NULL_TREE);
      if (TREE_CODE (new_tree) == SSA_NAME
	  && SSA_NAME_OCCURS_IN_ABNORMAL_PHI (lhs))
	SSA_NAME_OCCURS_IN_ABNORMAL_PHI (new_tree) = 1;
      if (is_a <gphi *> (vec_stmt))
	{
	  gimple_stmt_iterator si = gsi_after_labels (gimple_bb (vec_stmt));
	  gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
	}
      else
	{
	  gimple_stmt_iterator si = gsi_for_stmt (vec_stmt);
	  gsi_insert_seq_after (&si, stmts, GSI_SAME_STMT);
	}
      /* Replace use of lhs with newly computed result.  If the use stmt is a
	 single arg PHI, just replace all uses of PHI result.  It's necessary
	 because lcssa PHI defining lhs may be before newly inserted stmt.  */
      use_operand_p use_p;
      stmt_vec_info use_stmt_info;
      FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
	if (!is_gimple_debug (use_stmt)
	    && (!(use_stmt_info = vinfo->lookup_stmt (use_stmt))
		|| !PURE_SLP_STMT (vect_stmt_to_vectorize (use_stmt_info))))
	  {
	    /* ??? This can happen when the live lane ends up being
	       used in a vector construction code-generated by an
	       external SLP node (and code-generation for that already
	       happened).  See gcc.dg/vect/bb-slp-47.c.
	       Doing this is what would happen if that vector CTOR
	       were not code-generated yet so it is not too bad.
	       ??? In fact we'd likely want to avoid this situation
	       in the first place.  */
	    if (TREE_CODE (new_tree) == SSA_NAME
		&& !SSA_NAME_IS_DEFAULT_DEF (new_tree)
		&& gimple_code (use_stmt) != GIMPLE_PHI
		&& !vect_stmt_dominates_stmt_p (SSA_NAME_DEF_STMT (new_tree),
						use_stmt))
	      {
		enum tree_code code = gimple_assign_rhs_code (use_stmt);
		gcc_assert (code == CONSTRUCTOR
			    || code == VIEW_CONVERT_EXPR
			    || CONVERT_EXPR_CODE_P (code));
		if (dump_enabled_p ())
		  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				   "Using original scalar computation for "
				   "live lane because use precedes vector "
				   "def\n");
		continue;
	      }
	    /* ??? It can also happen that we end up pulling a def into
	       a loop where replacing out-of-loop uses would require
	       a new LC SSA PHI node.  Retain the original scalar in
	       those cases as well.  PR98064.  */
	    if (TREE_CODE (new_tree) == SSA_NAME
		&& !SSA_NAME_IS_DEFAULT_DEF (new_tree)
		&& (gimple_bb (use_stmt)->loop_father
		    != gimple_bb (vec_stmt)->loop_father)
		&& !flow_loop_nested_p (gimple_bb (vec_stmt)->loop_father,
					gimple_bb (use_stmt)->loop_father))
	      {
		if (dump_enabled_p ())
		  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				   "Using original scalar computation for "
				   "live lane because there is an out-of-loop "
				   "definition for it\n");
		continue;
	      }
	    FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
	      SET_USE (use_p, new_tree);
	    update_stmt (use_stmt);
	  }
    }

  return true;
}
/* Kill any debug uses outside LOOP of SSA names defined in STMT_INFO.  */

static void
vect_loop_kill_debug_uses (class loop *loop, stmt_vec_info stmt_info)
{
  ssa_op_iter op_iter;
  imm_use_iterator imm_iter;
  def_operand_p def_p;
  gimple *ustmt;

  FOR_EACH_PHI_OR_STMT_DEF (def_p, stmt_info->stmt, op_iter, SSA_OP_DEF)
    {
      FOR_EACH_IMM_USE_STMT (ustmt, imm_iter, DEF_FROM_PTR (def_p))
	{
	  basic_block bb;

	  if (!is_gimple_debug (ustmt))
	    continue;

	  bb = gimple_bb (ustmt);

	  if (!flow_bb_inside_loop_p (loop, bb))
	    {
	      if (gimple_debug_bind_p (ustmt))
		{
		  if (dump_enabled_p ())
		    dump_printf_loc (MSG_NOTE, vect_location,
				     "killing debug use\n");

		  gimple_debug_bind_reset_value (ustmt);
		  update_stmt (ustmt);
		}
	      else
		gcc_unreachable ();
	    }
	}
    }
}
/* Given loop represented by LOOP_VINFO, return true if computation of
   LOOP_VINFO_NITERS (= LOOP_VINFO_NITERSM1 + 1) doesn't overflow, false
   otherwise.  */

static bool
loop_niters_no_overflow (loop_vec_info loop_vinfo)
{
  /* Constant case.  */
  if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
    {
      tree cst_niters = LOOP_VINFO_NITERS (loop_vinfo);
      tree cst_nitersm1 = LOOP_VINFO_NITERSM1 (loop_vinfo);

      gcc_assert (TREE_CODE (cst_niters) == INTEGER_CST);
      gcc_assert (TREE_CODE (cst_nitersm1) == INTEGER_CST);
      if (wi::to_widest (cst_nitersm1) < wi::to_widest (cst_niters))
	return true;
      return false;
    }

  widest_int max;
  class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  /* Check the upper bound of loop niters.  */
  if (get_max_loop_iterations (loop, &max))
    {
      tree type = TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo));
      signop sgn = TYPE_SIGN (type);
      widest_int type_max = widest_int::from (wi::max_value (type), sgn);
      if (max < type_max)
	return true;
    }
  return false;
}
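/* As a concrete illustration: if the loop counter has an 8-bit unsigned type
   and NITERSM1 is 255, then NITERS = NITERSM1 + 1 wraps around to 0 and the
   function above returns false; with NITERSM1 = 254 the addition fits and it
   returns true.  (The 8-bit width is only an example.)  */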
/* Return a mask type with half the number of elements as OLD_TYPE,
   given that it should have mode NEW_MODE.  */

tree
vect_halve_mask_nunits (tree old_type, machine_mode new_mode)
{
  poly_uint64 nunits = exact_div (TYPE_VECTOR_SUBPARTS (old_type), 2);
  return build_truth_vector_type_for_mode (nunits, new_mode);
}

/* Return a mask type with twice as many elements as OLD_TYPE,
   given that it should have mode NEW_MODE.  */

tree
vect_double_mask_nunits (tree old_type, machine_mode new_mode)
{
  poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (old_type) * 2;
  return build_truth_vector_type_for_mode (nunits, new_mode);
}
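/* Illustrative example: starting from a 16-element mask type, the first
   function above yields an 8-element mask type and the second a 32-element
   mask type, both in the requested NEW_MODE; these helpers are typically
   used when packing or unpacking rgroup masks between vector widths.
   (The element counts here are only an example.)  */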
/* Record that a fully-masked version of LOOP_VINFO would need MASKS to
   contain a sequence of NVECTORS masks that each control a vector of type
   VECTYPE.  If SCALAR_MASK is nonnull, the fully-masked loop would AND
   these vector masks with the vector version of SCALAR_MASK.  */

void
vect_record_loop_mask (loop_vec_info loop_vinfo, vec_loop_masks *masks,
		       unsigned int nvectors, tree vectype, tree scalar_mask)
{
  gcc_assert (nvectors != 0);
  if (masks->length () < nvectors)
    masks->safe_grow_cleared (nvectors, true);
  rgroup_controls *rgm = &(*masks)[nvectors - 1];
  /* The number of scalars per iteration and the number of vectors are
     both compile-time constants.  */
  unsigned int nscalars_per_iter
    = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
		 LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();

  if (scalar_mask)
    {
      scalar_cond_masked_key cond (scalar_mask, nvectors);
      loop_vinfo->scalar_cond_masked_set.add (cond);
    }

  if (rgm->max_nscalars_per_iter < nscalars_per_iter)
    {
      rgm->max_nscalars_per_iter = nscalars_per_iter;
      rgm->type = truth_type_for (vectype);
    }
}
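/* Worked example of the computation above: an rgroup that needs
   NVECTORS == 2 masks for vectors with TYPE_VECTOR_SUBPARTS == 8 in a loop
   with vectorization factor 8 controls 2 * 8 / 8 == 2 scalars per iteration
   (e.g. a two-element interleaved access), so max_nscalars_per_iter would be
   raised to 2 if it was smaller.  (The concrete numbers are illustrative.)  */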
/* Given a complete set of masks MASKS, extract mask number INDEX
   for an rgroup that operates on NVECTORS vectors of type VECTYPE,
   where 0 <= INDEX < NVECTORS.  Insert any set-up statements before GSI.

   See the comment above vec_loop_masks for more details about the mask
   arrangement.  */

tree
vect_get_loop_mask (gimple_stmt_iterator *gsi, vec_loop_masks *masks,
		    unsigned int nvectors, tree vectype, unsigned int index)
{
  rgroup_controls *rgm = &(*masks)[nvectors - 1];
  tree mask_type = rgm->type;

  /* Populate the rgroup's mask array, if this is the first time we've
     used it.  */
  if (rgm->controls.is_empty ())
    {
      rgm->controls.safe_grow_cleared (nvectors, true);
      for (unsigned int i = 0; i < nvectors; ++i)
	{
	  tree mask = make_temp_ssa_name (mask_type, NULL, "loop_mask");
	  /* Provide a dummy definition until the real one is available.  */
	  SSA_NAME_DEF_STMT (mask) = gimple_build_nop ();
	  rgm->controls[i] = mask;
	}
    }

  tree mask = rgm->controls[index];
  if (maybe_ne (TYPE_VECTOR_SUBPARTS (mask_type),
		TYPE_VECTOR_SUBPARTS (vectype)))
    {
      /* A loop mask for data type X can be reused for data type Y
	 if X has N times more elements than Y and if Y's elements
	 are N times bigger than X's.  In this case each sequence
	 of N elements in the loop mask will be all-zero or all-one.
	 We can then view-convert the mask so that each sequence of
	 N elements is replaced by a single element.  */
      gcc_assert (multiple_p (TYPE_VECTOR_SUBPARTS (mask_type),
			      TYPE_VECTOR_SUBPARTS (vectype)));
      gimple_seq seq = NULL;
      mask_type = truth_type_for (vectype);
      mask = gimple_build (&seq, VIEW_CONVERT_EXPR, mask_type, mask);
      if (seq)
	gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
    }
  return mask;
}
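/* For instance, a loop mask created for eight 16-bit elements can be reused
   for four 32-bit elements: each consecutive pair of mask elements is known
   to be all-zero or all-one, so the VIEW_CONVERT_EXPR above collapses the
   8-element mask into the 4-element mask the caller asked for.  (The 8/4
   split merely illustrates the N-to-1 rule described in the comment.)  */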
/* Record that LOOP_VINFO would need LENS to contain a sequence of NVECTORS
   lengths for controlling an operation on VECTYPE.  The operation splits
   each element of VECTYPE into FACTOR separate subelements, measuring the
   length as a number of these subelements.  */

void
vect_record_loop_len (loop_vec_info loop_vinfo, vec_loop_lens *lens,
		      unsigned int nvectors, tree vectype, unsigned int factor)
{
  gcc_assert (nvectors != 0);
  if (lens->length () < nvectors)
    lens->safe_grow_cleared (nvectors, true);
  rgroup_controls *rgl = &(*lens)[nvectors - 1];

  /* The number of scalars per iteration, the number of scalar occupied
     bytes and the number of vectors are all compile-time constants.  */
  unsigned int nscalars_per_iter
    = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
		 LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();

  if (rgl->max_nscalars_per_iter < nscalars_per_iter)
    {
      /* For now, we only support cases in which all loads and stores fall back
	 to VnQI or none do.  */
      gcc_assert (!rgl->max_nscalars_per_iter
		  || (rgl->factor == 1 && factor == 1)
		  || (rgl->max_nscalars_per_iter * rgl->factor
		      == nscalars_per_iter * factor));
      rgl->max_nscalars_per_iter = nscalars_per_iter;
      rgl->type = vectype;
      rgl->factor = factor;
    }
}
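/* As an illustration of FACTOR: if a load of N 16-bit elements has to be
   emitted as a length-controlled load of 2*N bytes (a VnQI-style fallback),
   it would be recorded with FACTOR == 2 and the recorded length is then
   measured in byte subelements rather than in 16-bit elements.  (The element
   width is just an example of the splitting described above.)  */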
/* Given a complete set of lengths LENS, extract length number INDEX for an
   rgroup that operates on NVECTORS vectors, where 0 <= INDEX < NVECTORS.  */

tree
vect_get_loop_len (loop_vec_info loop_vinfo, vec_loop_lens *lens,
		   unsigned int nvectors, unsigned int index)
{
  rgroup_controls *rgl = &(*lens)[nvectors - 1];
  bool use_bias_adjusted_len =
    LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo) != 0;

  /* Populate the rgroup's len array, if this is the first time we've
     used it.  */
  if (rgl->controls.is_empty ())
    {
      rgl->controls.safe_grow_cleared (nvectors, true);
      for (unsigned int i = 0; i < nvectors; ++i)
	{
	  tree len_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
	  gcc_assert (len_type != NULL_TREE);

	  tree len = make_temp_ssa_name (len_type, NULL, "loop_len");

	  /* Provide a dummy definition until the real one is available.  */
	  SSA_NAME_DEF_STMT (len) = gimple_build_nop ();
	  rgl->controls[i] = len;

	  if (use_bias_adjusted_len)
	    {
	      gcc_assert (i == 0);
	      tree adjusted_len
		= make_temp_ssa_name (len_type, NULL, "adjusted_loop_len");
	      SSA_NAME_DEF_STMT (adjusted_len) = gimple_build_nop ();
	      rgl->bias_adjusted_ctrl = adjusted_len;
	    }
	}
    }

  if (use_bias_adjusted_len)
    return rgl->bias_adjusted_ctrl;

  return rgl->controls[index];
}
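/* For example, on a target that registers a nonzero partial load/store bias,
   the control actually used by the vectorized accesses is the
   "adjusted_loop_len" SSA name created above (conceptually loop_len + bias)
   rather than the raw "loop_len"; with a zero bias the plain per-rgroup
   control is returned instead.  (The bias value itself is entirely
   target-specific.)  */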
/* Scale profiling counters by estimation for LOOP which is vectorized
   by factor VF.  */

static void
scale_profile_for_vect_loop (class loop *loop, unsigned vf)
{
  edge preheader = loop_preheader_edge (loop);
  /* Reduce loop iterations by the vectorization factor.  */
  gcov_type new_est_niter = niter_for_unrolled_loop (loop, vf);
  profile_count freq_h = loop->header->count, freq_e = preheader->count ();

  if (freq_h.nonzero_p ())
    {
      profile_probability p;

      /* Avoid dropping loop body profile counter to 0 because of zero count
	 in loop's preheader.  */
      if (!(freq_e == profile_count::zero ()))
	freq_e = freq_e.force_nonzero ();
      p = freq_e.apply_scale (new_est_niter + 1, 1).probability_in (freq_h);
      scale_loop_frequencies (loop, p);
    }

  edge exit_e = single_exit (loop);
  exit_e->probability
    = profile_probability::always ().apply_scale (1, new_est_niter + 1);

  edge exit_l = single_pred_edge (loop->latch);
  profile_probability prob = exit_l->probability;
  exit_l->probability = exit_e->probability.invert ();
  if (prob.initialized_p () && exit_l->probability.initialized_p ())
    scale_bbs_frequencies (&loop->latch, 1, exit_l->probability / prob);
}
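/* Rough example of the scaling above: with VF == 4 and an estimated 99 latch
   iterations, new_est_niter is about 24, so the loop body counts are scaled
   to roughly 25 executions per loop entry and the exit edge probability
   becomes about 1/25.  (The numbers are illustrative; niter_for_unrolled_loop
   applies its own capping.)  */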
/* For a vectorized stmt DEF_STMT_INFO adjust all vectorized PHI
   latch edge values originally defined by it.  */

static void
maybe_set_vectorized_backedge_value (loop_vec_info loop_vinfo,
				     stmt_vec_info def_stmt_info)
{
  tree def = gimple_get_lhs (vect_orig_stmt (def_stmt_info)->stmt);
  if (!def || TREE_CODE (def) != SSA_NAME)
    return;
  stmt_vec_info phi_info;
  imm_use_iterator iter;
  use_operand_p use_p;
  FOR_EACH_IMM_USE_FAST (use_p, iter, def)
    if (gphi *phi = dyn_cast <gphi *> (USE_STMT (use_p)))
      if (gimple_bb (phi)->loop_father->header == gimple_bb (phi)
	  && (phi_info = loop_vinfo->lookup_stmt (phi))
	  && STMT_VINFO_RELEVANT_P (phi_info)
	  && VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (phi_info))
	  && STMT_VINFO_REDUC_TYPE (phi_info) != FOLD_LEFT_REDUCTION
	  && STMT_VINFO_REDUC_TYPE (phi_info) != EXTRACT_LAST_REDUCTION)
	{
	  loop_p loop = gimple_bb (phi)->loop_father;
	  edge e = loop_latch_edge (loop);
	  if (PHI_ARG_DEF_FROM_EDGE (phi, e) == def)
	    {
	      vec<gimple *> &phi_defs = STMT_VINFO_VEC_STMTS (phi_info);
	      vec<gimple *> &latch_defs = STMT_VINFO_VEC_STMTS (def_stmt_info);
	      gcc_assert (phi_defs.length () == latch_defs.length ());
	      for (unsigned i = 0; i < phi_defs.length (); ++i)
		add_phi_arg (as_a <gphi *> (phi_defs[i]),
			     gimple_get_lhs (latch_defs[i]), e,
			     gimple_phi_arg_location (phi, e->dest_idx));
	    }
	}
}
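/* Sketch of the effect: given a vectorized cycle PHI whose latch argument is
   still missing, e.g.
     vect_sum_1.7 = PHI <vect_init_1.6(preheader), (?)(latch)>
   once the latch definition has been vectorized this function fills in
     vect_sum_1.7 = PHI <vect_init_1.6(preheader), vect_sum_2.8(latch)>
   one vector PHI per copy.  (The SSA names are illustrative only.)  */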
/* Vectorize STMT_INFO if relevant, inserting any new instructions before GSI.
   When vectorizing STMT_INFO as a store, set *SEEN_STORE to its
   stmt_vec_info.  */

static bool
vect_transform_loop_stmt (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
			  gimple_stmt_iterator *gsi, stmt_vec_info *seen_store)
{
  class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
		     "------>vectorizing statement: %G", stmt_info->stmt);

  if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
    vect_loop_kill_debug_uses (loop, stmt_info);

  if (!STMT_VINFO_RELEVANT_P (stmt_info)
      && !STMT_VINFO_LIVE_P (stmt_info))
    return false;

  if (STMT_VINFO_VECTYPE (stmt_info))
    {
      poly_uint64 nunits
	= TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
      if (!STMT_SLP_TYPE (stmt_info)
	  && maybe_ne (nunits, vf)
	  && dump_enabled_p ())
	/* For SLP VF is set according to unrolling factor, and not
	   to vector size, hence for SLP this print is not valid.  */
	dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
    }

  /* Pure SLP statements have already been vectorized.  We still need
     to apply loop vectorization to hybrid SLP statements.  */
  if (PURE_SLP_STMT (stmt_info))
    return false;

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location, "transform statement.\n");

  if (vect_transform_stmt (loop_vinfo, stmt_info, gsi, NULL, NULL))
    *seen_store = stmt_info;

  return true;
}
/* Helper function to pass to simplify_replace_tree to enable replacing trees
   that are recorded in the hash_map with their corresponding values.  */

static tree
find_in_mapping (tree t, void *context)
{
  hash_map<tree,tree>* mapping = (hash_map<tree, tree>*) context;

  tree *value = mapping->get (t);
  return value ? *value : t;
}
/* Update EPILOGUE's loop_vec_info.  EPILOGUE was constructed as a copy of the
   original loop that has now been vectorized.

   The inits of the data_references need to be advanced with the number of
   iterations of the main loop.  This has been computed in vect_do_peeling and
   is stored in parameter ADVANCE.  We first restore the data_references
   initial offset with the values recorded in ORIG_DRS_INIT.

   Since the loop_vec_info of this EPILOGUE was constructed for the original
   loop, its stmt_vec_infos all point to the original statements.  These need
   to be updated to point to their corresponding copies as well as the SSA_NAMES
   in their PATTERN_DEF_SEQs and RELATED_STMTs.

   The data_reference's connections also need to be updated.  Their
   corresponding dr_vec_info need to be reconnected to the EPILOGUE's
   stmt_vec_infos, their statements need to point to their corresponding copy,
   if they are gather loads or scatter stores then their reference needs to be
   updated to point to its corresponding copy and finally we set
   'base_misaligned' to false as we have already peeled for alignment in the
   prologue of the main loop.  */
static void
update_epilogue_loop_vinfo (class loop *epilogue, tree advance)
{
  loop_vec_info epilogue_vinfo = loop_vec_info_for_loop (epilogue);
  auto_vec<gimple *> stmt_worklist;
  hash_map<tree,tree> mapping;
  gimple *orig_stmt, *new_stmt;
  gimple_stmt_iterator epilogue_gsi;
  gphi_iterator epilogue_phi_gsi;
  stmt_vec_info stmt_vinfo = NULL, related_vinfo;
  basic_block *epilogue_bbs = get_loop_body (epilogue);
  unsigned i;

  free (LOOP_VINFO_BBS (epilogue_vinfo));
  LOOP_VINFO_BBS (epilogue_vinfo) = epilogue_bbs;

  /* Advance data_reference's with the number of iterations of the previous
     loop and its prologue.  */
  vect_update_inits_of_drs (epilogue_vinfo, advance, PLUS_EXPR);

  /* The EPILOGUE loop is a copy of the original loop so they share the same
     gimple UIDs.  In this loop we update the loop_vec_info of the EPILOGUE to
     point to the copied statements.  We also create a mapping of all LHS' in
     the original loop and all the LHS' in the EPILOGUE and create worklists to
     update the STMT_VINFO_PATTERN_DEF_SEQs and STMT_VINFO_RELATED_STMTs.  */
  for (unsigned i = 0; i < epilogue->num_nodes; ++i)
    {
      for (epilogue_phi_gsi = gsi_start_phis (epilogue_bbs[i]);
	   !gsi_end_p (epilogue_phi_gsi); gsi_next (&epilogue_phi_gsi))
	{
	  new_stmt = epilogue_phi_gsi.phi ();

	  gcc_assert (gimple_uid (new_stmt) > 0);
	  stmt_vinfo
	    = epilogue_vinfo->stmt_vec_infos[gimple_uid (new_stmt) - 1];

	  orig_stmt = STMT_VINFO_STMT (stmt_vinfo);
	  STMT_VINFO_STMT (stmt_vinfo) = new_stmt;

	  mapping.put (gimple_phi_result (orig_stmt),
		       gimple_phi_result (new_stmt));
	  /* PHI nodes can not have patterns or related statements.  */
	  gcc_assert (STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo) == NULL
		      && STMT_VINFO_RELATED_STMT (stmt_vinfo) == NULL);
	}

      for (epilogue_gsi = gsi_start_bb (epilogue_bbs[i]);
	   !gsi_end_p (epilogue_gsi); gsi_next (&epilogue_gsi))
	{
	  new_stmt = gsi_stmt (epilogue_gsi);
	  if (is_gimple_debug (new_stmt))
	    continue;

	  gcc_assert (gimple_uid (new_stmt) > 0);
	  stmt_vinfo
	    = epilogue_vinfo->stmt_vec_infos[gimple_uid (new_stmt) - 1];

	  orig_stmt = STMT_VINFO_STMT (stmt_vinfo);
	  STMT_VINFO_STMT (stmt_vinfo) = new_stmt;

	  if (tree old_lhs = gimple_get_lhs (orig_stmt))
	    mapping.put (old_lhs, gimple_get_lhs (new_stmt));

	  if (STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo))
	    {
	      gimple_seq seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo);
	      for (gimple_stmt_iterator gsi = gsi_start (seq);
		   !gsi_end_p (gsi); gsi_next (&gsi))
		stmt_worklist.safe_push (gsi_stmt (gsi));
	    }

	  related_vinfo = STMT_VINFO_RELATED_STMT (stmt_vinfo);
	  if (related_vinfo != NULL && related_vinfo != stmt_vinfo)
	    {
	      gimple *stmt = STMT_VINFO_STMT (related_vinfo);
	      stmt_worklist.safe_push (stmt);
	      /* Set BB such that the assert in
		 'get_initial_def_for_reduction' is able to determine that
		 the BB of the related stmt is inside this loop.  */
	      gimple_set_bb (stmt, gimple_bb (new_stmt));
	      related_vinfo = STMT_VINFO_RELATED_STMT (related_vinfo);
	      gcc_assert (related_vinfo == NULL
			  || related_vinfo == stmt_vinfo);
	    }
	}
    }
  /* The PATTERN_DEF_SEQs and RELATED_STMTs in the epilogue were constructed
     using the original main loop and thus need to be updated to refer to the
     cloned variables used in the epilogue.  */
  for (unsigned i = 0; i < stmt_worklist.length (); ++i)
    {
      gimple *stmt = stmt_worklist[i];
      tree *new_op;

      for (unsigned j = 1; j < gimple_num_ops (stmt); ++j)
	{
	  tree op = gimple_op (stmt, j);
	  if ((new_op = mapping.get (op)))
	    gimple_set_op (stmt, j, *new_op);
	  else
	    {
	      /* PR92429: The last argument of simplify_replace_tree disables
		 folding when replacing arguments.  This is required as
		 otherwise you might end up with different statements than the
		 ones analyzed in vect_loop_analyze, leading to different
		 vectorization.  */
	      op = simplify_replace_tree (op, NULL_TREE, NULL_TREE,
					  &find_in_mapping, &mapping, false);
	      gimple_set_op (stmt, j, op);
	    }
	}
    }
  struct data_reference *dr;
  vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (epilogue_vinfo);
  FOR_EACH_VEC_ELT (datarefs, i, dr)
    {
      orig_stmt = DR_STMT (dr);
      gcc_assert (gimple_uid (orig_stmt) > 0);
      stmt_vinfo = epilogue_vinfo->stmt_vec_infos[gimple_uid (orig_stmt) - 1];
      /* Data references for gather loads and scatter stores do not use the
	 updated offset we set using ADVANCE.  Instead we have to make sure the
	 reference in the data references points to the corresponding copy of
	 the original in the epilogue.  */
      if (STMT_VINFO_MEMORY_ACCESS_TYPE (vect_stmt_to_vectorize (stmt_vinfo))
	  == VMAT_GATHER_SCATTER)
	{
	  DR_REF (dr)
	    = simplify_replace_tree (DR_REF (dr), NULL_TREE, NULL_TREE,
				     &find_in_mapping, &mapping);
	  DR_BASE_ADDRESS (dr)
	    = simplify_replace_tree (DR_BASE_ADDRESS (dr), NULL_TREE, NULL_TREE,
				     &find_in_mapping, &mapping);
	}
      DR_STMT (dr) = STMT_VINFO_STMT (stmt_vinfo);
      stmt_vinfo->dr_aux.stmt = stmt_vinfo;
      /* The vector size of the epilogue is smaller than that of the main loop
	 so the alignment is either the same or lower.  This means the dr will
	 by definition be aligned.  */
      STMT_VINFO_DR_INFO (stmt_vinfo)->base_misaligned = false;
    }

  epilogue_vinfo->shared->datarefs_copy.release ();
  epilogue_vinfo->shared->save_datarefs ();
}
/* Function vect_transform_loop.

   The analysis phase has determined that the loop is vectorizable.
   Vectorize the loop - create vectorized stmts to replace the scalar
   stmts in the loop, and update the loop exit condition.
   Returns scalar epilogue loop if any.  */

class loop *
vect_transform_loop (loop_vec_info loop_vinfo, gimple *loop_vectorized_call)
{
  class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  class loop *epilogue = NULL;
  basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
  int nbbs = loop->num_nodes;
  int i;
  tree niters_vector = NULL_TREE;
  tree step_vector = NULL_TREE;
  tree niters_vector_mult_vf = NULL_TREE;
  poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
  unsigned int lowest_vf = constant_lower_bound (vf);
  gimple *stmt;
  bool check_profitability = false;
  unsigned int th;

  DUMP_VECT_SCOPE ("vec_transform_loop");

  loop_vinfo->shared->check_datarefs ();
  /* Use the more conservative vectorization threshold.  If the number
     of iterations is constant assume the cost check has been performed
     by our caller.  If the threshold makes all loops profitable that
     run at least the (estimated) vectorization factor number of times
     checking is pointless, too.  */
  th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
  if (vect_apply_runtime_profitability_check_p (loop_vinfo))
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location,
			 "Profitability threshold is %d loop iterations.\n",
			 th);
      check_profitability = true;
    }

  /* Make sure there exists a single-predecessor exit bb.  Do this before
     versioning.  */
  edge e = single_exit (loop);
  if (! single_pred_p (e->dest))
    {
      split_loop_exit_edge (e, true);
      if (dump_enabled_p ())
	dump_printf (MSG_NOTE, "split exit edge\n");
    }
  /* Version the loop first, if required, so the profitability check
     comes first.  */

  if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
    {
      class loop *sloop
	= vect_loop_versioning (loop_vinfo, loop_vectorized_call);
      sloop->force_vectorize = false;
      check_profitability = false;
    }

  /* Make sure there exists a single-predecessor exit bb also on the
     scalar loop copy.  Do this after versioning but before peeling
     so CFG structure is fine for both scalar and if-converted loop
     to make slpeel_duplicate_current_defs_from_edges face matched
     loop closed PHI nodes on the exit.  */
  if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))
    {
      e = single_exit (LOOP_VINFO_SCALAR_LOOP (loop_vinfo));
      if (! single_pred_p (e->dest))
	{
	  split_loop_exit_edge (e, true);
	  if (dump_enabled_p ())
	    dump_printf (MSG_NOTE, "split exit edge of scalar loop\n");
	}
    }
  tree niters = vect_build_loop_niters (loop_vinfo);
  LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = niters;
  tree nitersm1 = unshare_expr (LOOP_VINFO_NITERSM1 (loop_vinfo));
  bool niters_no_overflow = loop_niters_no_overflow (loop_vinfo);
  tree advance;
  drs_init_vec orig_drs_init;

  epilogue = vect_do_peeling (loop_vinfo, niters, nitersm1, &niters_vector,
			      &step_vector, &niters_vector_mult_vf, th,
			      check_profitability, niters_no_overflow,
			      &advance);

  if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo)
      && LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo).initialized_p ())
    scale_loop_frequencies (LOOP_VINFO_SCALAR_LOOP (loop_vinfo),
			    LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo));
  if (niters_vector == NULL_TREE)
    {
      if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
	  && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
	  && known_eq (lowest_vf, vf))
	{
	  niters_vector
	    = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
			     LOOP_VINFO_INT_NITERS (loop_vinfo) / lowest_vf);
	  step_vector = build_one_cst (TREE_TYPE (niters));
	}
      else if (vect_use_loop_mask_for_alignment_p (loop_vinfo))
	vect_gen_vector_loop_niters (loop_vinfo, niters, &niters_vector,
				     &step_vector, niters_no_overflow);
      else
	/* vect_do_peeling subtracted the number of peeled prologue
	   iterations from LOOP_VINFO_NITERS.  */
	vect_gen_vector_loop_niters (loop_vinfo, LOOP_VINFO_NITERS (loop_vinfo),
				     &niters_vector, &step_vector,
				     niters_no_overflow);
    }
  /* 1) Make sure the loop header has exactly two entries
     2) Make sure we have a preheader basic block.  */

  gcc_assert (EDGE_COUNT (loop->header->preds) == 2);

  split_edge (loop_preheader_edge (loop));

  if (vect_use_loop_mask_for_alignment_p (loop_vinfo))
    /* This will deal with any possible peeling.  */
    vect_prepare_for_masked_peels (loop_vinfo);
  /* Schedule the SLP instances first, then handle loop vectorization
     below.  */
  if (!loop_vinfo->slp_instances.is_empty ())
    {
      DUMP_VECT_SCOPE ("scheduling SLP instances");
      vect_schedule_slp (loop_vinfo, LOOP_VINFO_SLP_INSTANCES (loop_vinfo));
    }

  /* FORNOW: the vectorizer supports only loops whose body consists of one
     basic block (header + empty latch).  When the vectorizer supports more
     involved loop forms, the order in which the BBs are traversed will need
     to be reconsidered.  */
  for (i = 0; i < nbbs; i++)
    {
      basic_block bb = bbs[i];
      stmt_vec_info stmt_info;

      for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
	   gsi_next (&si))
	{
	  gphi *phi = si.phi ();
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_NOTE, vect_location,
			     "------>vectorizing phi: %G", phi);
	  stmt_info = loop_vinfo->lookup_stmt (phi);
	  if (!stmt_info)
	    continue;

	  if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
	    vect_loop_kill_debug_uses (loop, stmt_info);

	  if (!STMT_VINFO_RELEVANT_P (stmt_info)
	      && !STMT_VINFO_LIVE_P (stmt_info))
	    continue;

	  if (STMT_VINFO_VECTYPE (stmt_info)
	      && (maybe_ne
		  (TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info)), vf))
	      && dump_enabled_p ())
	    dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");

	  if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
	       || STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
	       || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def
	       || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle
	       || STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def)
	      && ! PURE_SLP_STMT (stmt_info))
	    {
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_NOTE, vect_location, "transform phi.\n");
	      vect_transform_stmt (loop_vinfo, stmt_info, NULL, NULL, NULL);
	    }
	}
      for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
	   gsi_next (&si))
	{
	  gphi *phi = si.phi ();
	  stmt_info = loop_vinfo->lookup_stmt (phi);
	  if (!stmt_info)
	    continue;

	  if (!STMT_VINFO_RELEVANT_P (stmt_info)
	      && !STMT_VINFO_LIVE_P (stmt_info))
	    continue;

	  if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
	       || STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
	       || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def
	       || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle
	       || STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def)
	      && ! PURE_SLP_STMT (stmt_info))
	    maybe_set_vectorized_backedge_value (loop_vinfo, stmt_info);
	}
      for (gimple_stmt_iterator si = gsi_start_bb (bb);
	   !gsi_end_p (si);)
	{
	  stmt = gsi_stmt (si);
	  /* During vectorization remove existing clobber stmts.  */
	  if (gimple_clobber_p (stmt))
	    {
	      unlink_stmt_vdef (stmt);
	      gsi_remove (&si, true);
	      release_defs (stmt);
	    }
	  else
	    {
	      /* Ignore vector stmts created in the outer loop.  */
	      stmt_info = loop_vinfo->lookup_stmt (stmt);

	      /* vector stmts created in the outer-loop during vectorization of
		 stmts in an inner-loop may not have a stmt_info, and do not
		 need to be vectorized.  */
	      stmt_vec_info seen_store = NULL;
	      if (stmt_info)
		{
		  if (STMT_VINFO_IN_PATTERN_P (stmt_info))
		    {
		      gimple *def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
		      for (gimple_stmt_iterator subsi = gsi_start (def_seq);
			   !gsi_end_p (subsi); gsi_next (&subsi))
			{
			  stmt_vec_info pat_stmt_info
			    = loop_vinfo->lookup_stmt (gsi_stmt (subsi));
			  vect_transform_loop_stmt (loop_vinfo, pat_stmt_info,
						    &si, &seen_store);
			}
		      stmt_vec_info pat_stmt_info
			= STMT_VINFO_RELATED_STMT (stmt_info);
		      if (vect_transform_loop_stmt (loop_vinfo, pat_stmt_info,
						    &si, &seen_store))
			maybe_set_vectorized_backedge_value (loop_vinfo,
							     pat_stmt_info);
		    }
		  else
		    {
		      if (vect_transform_loop_stmt (loop_vinfo, stmt_info, &si,
						    &seen_store))
			maybe_set_vectorized_backedge_value (loop_vinfo,
							     stmt_info);
		    }
		}
	      gsi_next (&si);
	      if (seen_store)
		{
		  if (STMT_VINFO_GROUPED_ACCESS (seen_store))
		    /* Interleaving.  If IS_STORE is TRUE, the
		       vectorization of the interleaving chain was
		       completed - free all the stores in the chain.  */
		    vect_remove_stores (loop_vinfo,
					DR_GROUP_FIRST_ELEMENT (seen_store));
		  else
		    /* Free the attached stmt_vec_info and remove the stmt.  */
		    loop_vinfo->remove_stmt (stmt_info);
		}
	    }
	}
      /* Stub out scalar statements that must not survive vectorization.
	 Doing this here helps with grouped statements, or statements that
	 are involved in patterns.  */
      for (gimple_stmt_iterator gsi = gsi_start_bb (bb);
	   !gsi_end_p (gsi); gsi_next (&gsi))
	{
	  gcall *call = dyn_cast <gcall *> (gsi_stmt (gsi));
	  if (!call || !gimple_call_internal_p (call))
	    continue;
	  internal_fn ifn = gimple_call_internal_fn (call);
	  if (ifn == IFN_MASK_LOAD)
	    {
	      tree lhs = gimple_get_lhs (call);
	      if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
		{
		  tree zero = build_zero_cst (TREE_TYPE (lhs));
		  gimple *new_stmt = gimple_build_assign (lhs, zero);
		  gsi_replace (&gsi, new_stmt, true);
		}
	    }
	  else if (conditional_internal_fn_code (ifn) != ERROR_MARK)
	    {
	      tree lhs = gimple_get_lhs (call);
	      if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
		{
		  tree else_arg
		    = gimple_call_arg (call, gimple_call_num_args (call) - 1);
		  gimple *new_stmt = gimple_build_assign (lhs, else_arg);
		  gsi_replace (&gsi, new_stmt, true);
		}
	    }
	}
    }
  /* The vectorization factor is always > 1, so if we use an IV increment of 1,
     a zero NITERS becomes a nonzero NITERS_VECTOR.  */
  if (integer_onep (step_vector))
    niters_no_overflow = true;
  vect_set_loop_condition (loop, loop_vinfo, niters_vector, step_vector,
			   niters_vector_mult_vf, !niters_no_overflow);

  unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
  scale_profile_for_vect_loop (loop, assumed_vf);
  /* True if the final iteration might not handle a full vector's
     worth of scalar iterations.  */
  bool final_iter_may_be_partial
    = LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo);
  /* The minimum number of iterations performed by the epilogue.  This
     is 1 when peeling for gaps because we always need a final scalar
     iteration.  */
  int min_epilogue_iters = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0;
  /* +1 to convert latch counts to loop iteration counts,
     -min_epilogue_iters to remove iterations that cannot be performed
     by the vector code.  */
  int bias_for_lowest = 1 - min_epilogue_iters;
  int bias_for_assumed = bias_for_lowest;
  int alignment_npeels = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
  if (alignment_npeels && LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
    {
      /* When the amount of peeling is known at compile time, the first
	 iteration will have exactly alignment_npeels active elements.
	 In the worst case it will have at least one.  */
      int min_first_active = (alignment_npeels > 0 ? alignment_npeels : 1);
      bias_for_lowest += lowest_vf - min_first_active;
      bias_for_assumed += assumed_vf - min_first_active;
    }
  /* In these calculations the "- 1" converts loop iteration counts
     back to latch counts.  */
  if (loop->any_upper_bound)
    {
      loop_vec_info main_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
      loop->nb_iterations_upper_bound
	= (final_iter_may_be_partial
	   ? wi::udiv_ceil (loop->nb_iterations_upper_bound + bias_for_lowest,
			    lowest_vf) - 1
	   : wi::udiv_floor (loop->nb_iterations_upper_bound + bias_for_lowest,
			     lowest_vf) - 1);
      if (main_vinfo)
	{
	  unsigned int bound;
	  poly_uint64 main_iters
	    = upper_bound (LOOP_VINFO_VECT_FACTOR (main_vinfo),
			   LOOP_VINFO_COST_MODEL_THRESHOLD (main_vinfo));
	  main_iters
	    = upper_bound (main_iters,
			   LOOP_VINFO_VERSIONING_THRESHOLD (main_vinfo));
	  if (can_div_away_from_zero_p (main_iters,
					LOOP_VINFO_VECT_FACTOR (loop_vinfo),
					&bound))
	    loop->nb_iterations_upper_bound
	      = wi::umin ((widest_int) (bound - 1),
			  loop->nb_iterations_upper_bound);
	}
    }
  if (loop->any_likely_upper_bound)
    loop->nb_iterations_likely_upper_bound
      = (final_iter_may_be_partial
	 ? wi::udiv_ceil (loop->nb_iterations_likely_upper_bound
			  + bias_for_lowest, lowest_vf) - 1
	 : wi::udiv_floor (loop->nb_iterations_likely_upper_bound
			   + bias_for_lowest, lowest_vf) - 1);
  if (loop->any_estimate)
    loop->nb_iterations_estimate
      = (final_iter_may_be_partial
	 ? wi::udiv_ceil (loop->nb_iterations_estimate + bias_for_assumed,
			  assumed_vf) - 1
	 : wi::udiv_floor (loop->nb_iterations_estimate + bias_for_assumed,
			   assumed_vf) - 1);
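  /* Worked example of the bias arithmetic above: with lowest_vf == 4, no
     epilogue peeling (bias_for_lowest == 1) and an original latch bound of
     102 (i.e. 103 iterations), the new latch bound is
     floor ((102 + 1) / 4) - 1 == 24, i.e. at most 25 vector iterations.
     (The concrete numbers are purely illustrative.)  */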
  if (dump_enabled_p ())
    {
      if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
	{
	  dump_printf_loc (MSG_NOTE, vect_location,
			   "LOOP VECTORIZED\n");
	  if (loop->inner)
	    dump_printf_loc (MSG_NOTE, vect_location,
			     "OUTER LOOP VECTORIZED\n");
	  dump_printf (MSG_NOTE, "\n");
	}
      else
	dump_printf_loc (MSG_NOTE, vect_location,
			 "LOOP EPILOGUE VECTORIZED (MODE=%s)\n",
			 GET_MODE_NAME (loop_vinfo->vector_mode));
    }
  /* Loops vectorized with a variable factor won't benefit from
     unrolling/peeling.  */
  if (!vf.is_constant ())
    {
      loop->unroll = 1;
      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location, "Disabling unrolling due to"
			 " variable-length vectorization factor\n");
    }
  /* Free SLP instances here because otherwise stmt reference counting
     won't work.  */
  slp_instance instance;
  FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
    vect_free_slp_instance (instance);
  LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
  /* Clear the safelen field since its value is invalid after vectorization:
     the vectorized loop can have loop-carried dependencies.  */
  loop->safelen = 0;

  if (epilogue)
    {
      update_epilogue_loop_vinfo (epilogue, advance);

      epilogue->simduid = loop->simduid;
      epilogue->force_vectorize = loop->force_vectorize;
      epilogue->dont_vectorize = false;
    }

  return epilogue;
}
/* The code below performs a simple optimization: it reverts if-conversion
   for masked stores, i.e. if the mask of a store is zero, the store is not
   performed and, where possible, neither are the producers of the stored
   values.
   For example,
     for (i=0; i<n; i++)
       if (c[i])
	{
	  p1[i] += 1;
	  p2[i] = p3[i] +2;
	}
   this transformation will produce the following semi-hammock:

   if (!mask__ifc__42.18_165 == { 0, 0, 0, 0, 0, 0, 0, 0 })
     {
       vect__11.19_170 = MASK_LOAD (vectp_p1.20_168, 0B, mask__ifc__42.18_165);
       vect__12.22_172 = vect__11.19_170 + vect_cst__171;
       MASK_STORE (vectp_p1.23_175, 0B, mask__ifc__42.18_165, vect__12.22_172);
       vect__18.25_182 = MASK_LOAD (vectp_p3.26_180, 0B, mask__ifc__42.18_165);
       vect__19.28_184 = vect__18.25_182 + vect_cst__183;
       MASK_STORE (vectp_p2.29_187, 0B, mask__ifc__42.18_165, vect__19.28_184);
     }
*/
void
optimize_mask_stores (class loop *loop)
{
  basic_block *bbs = get_loop_body (loop);
  unsigned nbbs = loop->num_nodes;
  unsigned i;
  basic_block bb;
  class loop *bb_loop;
  gimple_stmt_iterator gsi;
  gimple *stmt;
  auto_vec<gimple *> worklist;
  auto_purge_vect_location sentinel;

  vect_location = find_loop_location (loop);
  /* Pick up all masked stores in loop if any.  */
  for (i = 0; i < nbbs; i++)
    {
      bb = bbs[i];
      for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
	   gsi_next (&gsi))
	{
	  stmt = gsi_stmt (gsi);
	  if (gimple_call_internal_p (stmt, IFN_MASK_STORE))
	    worklist.safe_push (stmt);
	}
    }

  if (worklist.is_empty ())
    return;
  /* Loop has masked stores.  */
  while (!worklist.is_empty ())
    {
      gimple *last, *last_store;
      edge e, efalse;
      tree mask;
      basic_block store_bb, join_bb;
      gimple_stmt_iterator gsi_to;
      tree vdef, new_vdef;
      gphi *phi;
      tree vectype;
      tree zero;

      last = worklist.pop ();
      mask = gimple_call_arg (last, 2);
      bb = gimple_bb (last);
      /* Create then_bb and if-then structure in CFG, then_bb belongs to
	 the same loop as if_bb.  It could be different to LOOP when two
	 level loop-nest is vectorized and mask_store belongs to the inner
	 one.  */
      e = split_block (bb, last);
      bb_loop = bb->loop_father;
      gcc_assert (loop == bb_loop || flow_loop_nested_p (loop, bb_loop));
      join_bb = e->dest;
      store_bb = create_empty_bb (bb);
      add_bb_to_loop (store_bb, bb_loop);
      e->flags = EDGE_TRUE_VALUE;
      efalse = make_edge (bb, store_bb, EDGE_FALSE_VALUE);
      /* Put STORE_BB to likely part.  */
      efalse->probability = profile_probability::unlikely ();
      store_bb->count = efalse->count ();
      make_single_succ_edge (store_bb, join_bb, EDGE_FALLTHRU);
      if (dom_info_available_p (CDI_DOMINATORS))
	set_immediate_dominator (CDI_DOMINATORS, store_bb, bb);
      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location,
			 "Create new block %d to sink mask stores.",
			 store_bb->index);
      /* Create vector comparison with boolean result.  */
      vectype = TREE_TYPE (mask);
      zero = build_zero_cst (vectype);
      stmt = gimple_build_cond (EQ_EXPR, mask, zero, NULL_TREE, NULL_TREE);
      gsi = gsi_last_bb (bb);
      gsi_insert_after (&gsi, stmt, GSI_SAME_STMT);
      /* Create new PHI node for vdef of the last masked store:
	 .MEM_2 = VDEF <.MEM_1>
	 will be converted to
	 .MEM.3 = VDEF <.MEM_1>
	 and new PHI node will be created in join bb
	 .MEM_2 = PHI <.MEM_1, .MEM_3>  */
      vdef = gimple_vdef (last);
      new_vdef = make_ssa_name (gimple_vop (cfun), last);
      gimple_set_vdef (last, new_vdef);
      phi = create_phi_node (vdef, join_bb);
      add_phi_arg (phi, new_vdef, EDGE_SUCC (store_bb, 0), UNKNOWN_LOCATION);
      /* Put all masked stores with the same mask to STORE_BB if possible.  */
      while (true)
	{
	  gimple_stmt_iterator gsi_from;
	  gimple *stmt1 = NULL;

	  /* Move masked store to STORE_BB.  */
	  last_store = last;
	  gsi = gsi_for_stmt (last);
	  gsi_from = gsi;
	  /* Shift GSI to the previous stmt for further traversal.  */
	  gsi_prev (&gsi);
	  gsi_to = gsi_start_bb (store_bb);
	  gsi_move_before (&gsi_from, &gsi_to);
	  /* Setup GSI_TO to the non-empty block start.  */
	  gsi_to = gsi_start_bb (store_bb);
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_NOTE, vect_location,
			     "Move stmt to created bb\n%G", last);
	  /* Move all stored value producers if possible.  */
	  while (!gsi_end_p (gsi))
	    {
	      tree lhs;
	      imm_use_iterator imm_iter;
	      use_operand_p use_p;
	      bool res;

	      /* Skip debug statements.  */
	      if (is_gimple_debug (gsi_stmt (gsi)))
		{
		  gsi_prev (&gsi);
		  continue;
		}
	      stmt1 = gsi_stmt (gsi);
	      /* Do not consider statements writing to memory or having
		 volatile operand.  */
	      if (gimple_vdef (stmt1)
		  || gimple_has_volatile_ops (stmt1))
		break;
	      gsi_from = gsi;
	      gsi_prev (&gsi);
	      lhs = gimple_get_lhs (stmt1);
	      if (!lhs)
		break;

	      /* LHS of vectorized stmt must be SSA_NAME.  */
	      if (TREE_CODE (lhs) != SSA_NAME)
		break;

	      if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
		{
		  /* Remove dead scalar statement.  */
		  if (has_zero_uses (lhs))
		    {
		      gsi_remove (&gsi_from, true);
		      continue;
		    }
		  break;
		}

	      /* Check that LHS does not have uses outside of STORE_BB.  */
	      res = true;
	      FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
		{
		  gimple *use_stmt;
		  use_stmt = USE_STMT (use_p);
		  if (is_gimple_debug (use_stmt))
		    continue;
		  if (gimple_bb (use_stmt) != store_bb)
		    {
		      res = false;
		      break;
		    }
		}
	      if (!res)
		break;

	      if (gimple_vuse (stmt1)
		  && gimple_vuse (stmt1) != gimple_vuse (last_store))
		break;

	      /* Can move STMT1 to STORE_BB.  */
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_NOTE, vect_location,
				 "Move stmt to created bb\n%G", stmt1);
	      gsi_move_before (&gsi_from, &gsi_to);
	      /* Shift GSI_TO for further insertion.  */
	      gsi_prev (&gsi_to);
	    }
	  /* Put other masked stores with the same mask to STORE_BB.  */
	  if (worklist.is_empty ()
	      || gimple_call_arg (worklist.last (), 2) != mask
	      || worklist.last () != stmt1)
	    break;
	  last = worklist.pop ();
	}
      add_phi_arg (phi, gimple_vuse (last_store), e, UNKNOWN_LOCATION);
    }
}
/* Decide whether it is possible to use a zero-based induction variable
   when vectorizing LOOP_VINFO with partial vectors.  If it is, return
   the value that the induction variable must be able to hold in order
   to ensure that the rgroups eventually have no active vector elements.
   Return -1 otherwise.  */

widest_int
vect_iv_limit_for_partial_vectors (loop_vec_info loop_vinfo)
{
  tree niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
  class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  unsigned HOST_WIDE_INT max_vf = vect_max_vf (loop_vinfo);

  /* Calculate the value that the induction variable must be able
     to hit in order to ensure that we end the loop with an all-false mask.
     This involves adding the maximum number of inactive trailing scalar
     iterations.  */
  widest_int iv_limit = -1;
  if (max_loop_iterations (loop, &iv_limit))
    {
      if (niters_skip)
	{
	  /* Add the maximum number of skipped iterations to the
	     maximum iteration count.  */
	  if (TREE_CODE (niters_skip) == INTEGER_CST)
	    iv_limit += wi::to_widest (niters_skip);
	  else
	    iv_limit += max_vf - 1;
	}
      else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo))
	/* Make a conservatively-correct assumption.  */
	iv_limit += max_vf - 1;

      /* IV_LIMIT is the maximum number of latch iterations, which is also
	 the maximum in-range IV value.  Round this value down to the previous
	 vector alignment boundary and then add an extra full iteration.  */
      poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
      iv_limit = (iv_limit & -(int) known_alignment (vf)) + max_vf;
    }
  return iv_limit;
}
/* For the given rgroup_controls RGC, check whether an induction variable
   would ever hit a value that produces a set of all-false masks or zero
   lengths before wrapping around.  Return true if it's possible to wrap
   around before hitting the desirable value, otherwise return false.  */

bool
vect_rgroup_iv_might_wrap_p (loop_vec_info loop_vinfo, rgroup_controls *rgc)
{
  widest_int iv_limit = vect_iv_limit_for_partial_vectors (loop_vinfo);

  if (iv_limit == -1)
    return true;

  tree compare_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
  unsigned int compare_precision = TYPE_PRECISION (compare_type);
  unsigned nitems = rgc->max_nscalars_per_iter * rgc->factor;

  if (wi::min_precision (iv_limit * nitems, UNSIGNED) > compare_precision)
    return true;
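  /* For instance, with compare_precision == 16, nitems == 4 and an iv_limit
     of 20000, the product 80000 needs 17 bits, so the IV could wrap before
     every lane becomes inactive and the check above returns true.  (The
     figures are illustrative only.)  */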