/* Loop Vectorization
   Copyright (C) 2003-2018 Free Software Foundation, Inc.
   Contributed by Dorit Naishlos <dorit@il.ibm.com> and
   Ira Rosen <irar@il.ibm.com>

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free
Software Foundation; either version 3, or (at your option) any later
version.

GCC is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received a copy of the GNU General Public License
along with GCC; see the file COPYING3.  If not see
<http://www.gnu.org/licenses/>.  */
#include "coretypes.h"
#include "tree-pass.h"
#include "optabs-tree.h"
#include "diagnostic-core.h"
#include "fold-const.h"
#include "stor-layout.h"
#include "gimple-iterator.h"
#include "gimplify-me.h"
#include "tree-ssa-loop-ivopts.h"
#include "tree-ssa-loop-manip.h"
#include "tree-ssa-loop-niter.h"
#include "tree-ssa-loop.h"
#include "tree-scalar-evolution.h"
#include "tree-vectorizer.h"
#include "gimple-fold.h"
#include "tree-if-conv.h"
#include "internal-fn.h"
#include "tree-vector-builder.h"
#include "vec-perm-indices.h"
/* Loop Vectorization Pass.

   This pass tries to vectorize loops.

   For example, the vectorizer transforms the following simple loop:

        short a[N]; short b[N]; short c[N]; int i;

        for (i=0; i<N; i++){
          a[i] = b[i] + c[i];
        }

   as if it had been manually vectorized by rewriting the source code into:

        typedef int __attribute__((mode(V8HI))) v8hi;
        short a[N]; short b[N]; short c[N]; int i;
        v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
        v8hi va, vb, vc;

        for (i=0; i<N/8; i++){
          vb = pb[i];
          vc = pc[i];
          va = vb + vc;
          pa[i] = va;
        }
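
   As a further self-contained illustration of the same idea (an added
   sketch using GCC's generic vector extension rather than code from this
   pass; the v8hi name and the 16-byte vector size are assumptions chosen
   to match the example above), including the scalar epilogue needed when
   N is not a multiple of 8:

        typedef short v8hi __attribute__ ((vector_size (16)));

        void
        add_arrays (short *a, const short *b, const short *c, int n)
        {
          int i;
          // Vector body: eight 16-bit elements per iteration.
          for (i = 0; i + 8 <= n; i += 8)
            {
              v8hi va, vb, vc;
              __builtin_memcpy (&vb, b + i, sizeof vb);
              __builtin_memcpy (&vc, c + i, sizeof vc);
              va = vb + vc;
              __builtin_memcpy (a + i, &va, sizeof va);
            }
          // Scalar epilogue for the remaining n % 8 iterations.
          for (; i < n; i++)
            a[i] = b[i] + c[i];
        }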
   The main entry to this pass is vectorize_loops(), in which
   the vectorizer applies a set of analyses on a given set of loops,
   followed by the actual vectorization transformation for the loops that
   had successfully passed the analysis phase.
   Throughout this pass we make a distinction between two types of
   data: scalars (which are represented by SSA_NAMEs), and memory references
   ("data-refs").  These two types of data require different handling both
   during analysis and transformation.  The types of data-refs that the
   vectorizer currently supports are ARRAY_REFs whose base is an array DECL
   (not a pointer), and INDIRECT_REFs through pointers; both array and pointer
   accesses are required to have a simple (consecutive) access pattern.
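
   For example (an added illustration of this constraint), accesses such as
   'a[i]' or '*(p+i)' have the simple consecutive pattern handled here,
   whereas strided or indirect accesses such as 'a[2*i]' or 'a[b[i]]' do not.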
   Analysis phase:
   ===============
   The driver for the analysis phase is vect_analyze_loop().
   It applies a set of analyses, some of which rely on the scalar evolution
   analyzer (scev) developed by Sebastian Pop.

   During the analysis phase the vectorizer records some information
   per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
   loop, as well as general information about the loop as a whole, which is
   recorded in a "loop_vec_info" struct attached to each loop.
   Transformation phase:
   =====================
   The loop transformation phase scans all the stmts in the loop, and
   creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
   the loop that needs to be vectorized.  It inserts the vector code sequence
   just before the scalar stmt S, and records a pointer to the vector code
   in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
   attached to S).  This pointer will be used for the vectorization of following
   stmts which use the def of stmt S.  Stmt S is removed if it writes to memory;
   otherwise, we rely on dead code elimination for removing it.
   For example, say stmt S1 was vectorized into stmt VS1:

   VS1: vb = px[i];
   S1:  b = x[i];       STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
   S2:  a = b;

   To vectorize stmt S2, the vectorizer first finds the stmt that defines
   the operand 'b' (S1), and gets the relevant vector def 'vb' from the
   vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)).  The
   resulting sequence would be:

   VS1: vb = px[i];
   S1:  b = x[i];       STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
   VS2: va = vb;
   S2:  a = b;          STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2

   Operands that are not SSA_NAMEs are data-refs that appear in
   load/store operations (like 'x[i]' in S1), and are handled differently.
   Target modeling:
   =================
   Currently the only target specific information that is used is the
   size of the vector (in bytes) - "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".
   Targets that can support different sizes of vectors will, for now, need
   to specify one value for "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".  More
   flexibility will be added in the future.

   Since we only vectorize operations whose vector form can be
   expressed using existing tree codes, to verify that an operation is
   supported, the vectorizer checks the relevant optab at the relevant
   machine_mode (e.g., optab_handler (add_optab, V8HImode)).  If
   the value found is CODE_FOR_nothing, then there's no target support, and
   we can't vectorize the stmt.

   For additional information on this project see:
   http://gcc.gnu.org/projects/tree-ssa/vectorization.html
*/
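
/* As an added illustration of the optab check described above (a sketch,
   not code used by this pass; the optab and mode are the ones from the
   example), target support for a V8HImode addition would be queried as:

     if (optab_handler (add_optab, V8HImode) == CODE_FOR_nothing)
       return false;   // no target support, the stmt cannot be vectorized
*/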
static void vect_estimate_min_profitable_iters (loop_vec_info, int *, int *);
/* Subroutine of vect_determine_vf_for_stmt that handles only one
   statement.  VECTYPE_MAYBE_SET_P is true if STMT_VINFO_VECTYPE
   may already be set for general statements (not just data refs).  */

static bool
vect_determine_vf_for_stmt_1 (stmt_vec_info stmt_info,
                              bool vectype_maybe_set_p,
                              poly_uint64 *vf,
                              vec<stmt_vec_info> *mask_producers)
{
  gimple *stmt = stmt_info->stmt;

  if ((!STMT_VINFO_RELEVANT_P (stmt_info)
       && !STMT_VINFO_LIVE_P (stmt_info))
      || gimple_clobber_p (stmt))
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_NOTE, vect_location, "skip.\n");
      return true;
    }

  tree stmt_vectype, nunits_vectype;
  if (!vect_get_vector_types_for_stmt (stmt_info, &stmt_vectype,
                                       &nunits_vectype))
    return false;

  if (stmt_vectype)
    {
      if (STMT_VINFO_VECTYPE (stmt_info))
        /* The only case when a vectype had been already set is for stmts
           that contain a data ref, or for "pattern-stmts" (stmts generated
           by the vectorizer to represent/replace a certain idiom).  */
        gcc_assert ((STMT_VINFO_DATA_REF (stmt_info)
                     || vectype_maybe_set_p)
                    && STMT_VINFO_VECTYPE (stmt_info) == stmt_vectype);
      else if (stmt_vectype == boolean_type_node)
        mask_producers->safe_push (stmt_info);
      else
        STMT_VINFO_VECTYPE (stmt_info) = stmt_vectype;
    }

  if (nunits_vectype)
    vect_update_max_nunits (vf, nunits_vectype);

  return true;
}
/* Subroutine of vect_determine_vectorization_factor.  Set the vector
   types of STMT_INFO and all attached pattern statements and update
   the vectorization factor VF accordingly.  If some of the statements
   produce a mask result whose vector type can only be calculated later,
   add them to MASK_PRODUCERS.  Return true on success or false if
   something prevented vectorization.  */

static bool
vect_determine_vf_for_stmt (stmt_vec_info stmt_info, poly_uint64 *vf,
                            vec<stmt_vec_info> *mask_producers)
{
  if (dump_enabled_p ())
    {
      dump_printf_loc (MSG_NOTE, vect_location,
                       "==> examining statement: ");
      dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt_info->stmt, 0);
    }
  if (!vect_determine_vf_for_stmt_1 (stmt_info, false, vf, mask_producers))
    return false;

  if (STMT_VINFO_IN_PATTERN_P (stmt_info)
      && STMT_VINFO_RELATED_STMT (stmt_info))
    {
      stmt_info = vinfo_for_stmt (STMT_VINFO_RELATED_STMT (stmt_info));

      /* If a pattern statement has def stmts, analyze them too.  */
      gimple *pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
      for (gimple_stmt_iterator si = gsi_start (pattern_def_seq);
           !gsi_end_p (si); gsi_next (&si))
        {
          stmt_vec_info def_stmt_info = vinfo_for_stmt (gsi_stmt (si));
          if (dump_enabled_p ())
            {
              dump_printf_loc (MSG_NOTE, vect_location,
                               "==> examining pattern def stmt: ");
              dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
                                def_stmt_info->stmt, 0);
            }
          if (!vect_determine_vf_for_stmt_1 (def_stmt_info, true,
                                             vf, mask_producers))
            return false;
        }

      if (dump_enabled_p ())
        {
          dump_printf_loc (MSG_NOTE, vect_location,
                           "==> examining pattern statement: ");
          dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt_info->stmt, 0);
        }
      if (!vect_determine_vf_for_stmt_1 (stmt_info, true, vf, mask_producers))
        return false;
    }

  return true;
}
/* Function vect_determine_vectorization_factor

   Determine the vectorization factor (VF).  VF is the number of data elements
   that are operated upon in parallel in a single iteration of the vectorized
   loop.  For example, when vectorizing a loop that operates on 4-byte elements,
   on a target with vector size (VS) of 16 bytes, the VF is set to 4, since 4
   elements can fit in a single vector register.

   We currently support vectorization of loops in which all types operated upon
   are of the same size.  Therefore this function currently sets VF according to
   the size of the types operated upon, and fails if there are multiple sizes
   in the loop.

   VF is also the factor by which the loop iterations are strip-mined, e.g.:
   original loop:
        for (i=0; i<N; i++){
          a[i] = b[i] + c[i];
        }

   vectorized loop:
        for (i=0; i<N; i+=VF){
          a[i:VF] = b[i:VF] + c[i:VF];
        }
*/
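
/* Concretely (an added illustration of the comment above, not code from
   this pass): with 4-byte elements and a 16-byte vector size, VF = 16 / 4
   = 4, and the strip-mined form corresponds to:

     for (i=0; i<(N/4)*4; i+=4)
       a[i:4] = b[i:4] + c[i:4];    <-- one vector stmt per 4 scalar iterations
     for (; i<N; i++)
       a[i] = b[i] + c[i];          <-- scalar epilogue for the remainder
*/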
static bool
vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
{
288 struct loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
289 basic_block
*bbs
= LOOP_VINFO_BBS (loop_vinfo
);
290 unsigned nbbs
= loop
->num_nodes
;
291 poly_uint64 vectorization_factor
= 1;
292 tree scalar_type
= NULL_TREE
;
295 stmt_vec_info stmt_info
;
297 auto_vec
<stmt_vec_info
> mask_producers
;
299 if (dump_enabled_p ())
300 dump_printf_loc (MSG_NOTE
, vect_location
,
301 "=== vect_determine_vectorization_factor ===\n");
303 for (i
= 0; i
< nbbs
; i
++)
305 basic_block bb
= bbs
[i
];
307 for (gphi_iterator si
= gsi_start_phis (bb
); !gsi_end_p (si
);
311 stmt_info
= vinfo_for_stmt (phi
);
312 if (dump_enabled_p ())
314 dump_printf_loc (MSG_NOTE
, vect_location
, "==> examining phi: ");
315 dump_gimple_stmt (MSG_NOTE
, TDF_SLIM
, phi
, 0);
318 gcc_assert (stmt_info
);
320 if (STMT_VINFO_RELEVANT_P (stmt_info
)
321 || STMT_VINFO_LIVE_P (stmt_info
))
323 gcc_assert (!STMT_VINFO_VECTYPE (stmt_info
));
324 scalar_type
= TREE_TYPE (PHI_RESULT (phi
));
326 if (dump_enabled_p ())
328 dump_printf_loc (MSG_NOTE
, vect_location
,
329 "get vectype for scalar type: ");
330 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, scalar_type
);
331 dump_printf (MSG_NOTE
, "\n");
334 vectype
= get_vectype_for_scalar_type (scalar_type
);
337 if (dump_enabled_p ())
339 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
340 "not vectorized: unsupported "
342 dump_generic_expr (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
,
344 dump_printf (MSG_MISSED_OPTIMIZATION
, "\n");
348 STMT_VINFO_VECTYPE (stmt_info
) = vectype
;
350 if (dump_enabled_p ())
352 dump_printf_loc (MSG_NOTE
, vect_location
, "vectype: ");
353 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, vectype
);
354 dump_printf (MSG_NOTE
, "\n");
357 if (dump_enabled_p ())
359 dump_printf_loc (MSG_NOTE
, vect_location
, "nunits = ");
360 dump_dec (MSG_NOTE
, TYPE_VECTOR_SUBPARTS (vectype
));
361 dump_printf (MSG_NOTE
, "\n");
364 vect_update_max_nunits (&vectorization_factor
, vectype
);
368 for (gimple_stmt_iterator si
= gsi_start_bb (bb
); !gsi_end_p (si
);
371 stmt_info
= vinfo_for_stmt (gsi_stmt (si
));
372 if (!vect_determine_vf_for_stmt (stmt_info
, &vectorization_factor
,
378 /* TODO: Analyze cost. Decide if worth while to vectorize. */
379 if (dump_enabled_p ())
381 dump_printf_loc (MSG_NOTE
, vect_location
, "vectorization factor = ");
382 dump_dec (MSG_NOTE
, vectorization_factor
);
383 dump_printf (MSG_NOTE
, "\n");
386 if (known_le (vectorization_factor
, 1U))
388 if (dump_enabled_p ())
389 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
390 "not vectorized: unsupported data-type\n");
393 LOOP_VINFO_VECT_FACTOR (loop_vinfo
) = vectorization_factor
;
395 for (i
= 0; i
< mask_producers
.length (); i
++)
397 stmt_info
= mask_producers
[i
];
398 tree mask_type
= vect_get_mask_type_for_stmt (stmt_info
);
401 STMT_VINFO_VECTYPE (stmt_info
) = mask_type
;
/* Function vect_is_simple_iv_evolution.

   FORNOW: A simple evolution of an induction variable in the loop is
   considered a polynomial evolution.  */
414 vect_is_simple_iv_evolution (unsigned loop_nb
, tree access_fn
, tree
* init
,
419 tree evolution_part
= evolution_part_in_loop_num (access_fn
, loop_nb
);
422 /* When there is no evolution in this loop, the evolution function
424 if (evolution_part
== NULL_TREE
)
427 /* When the evolution is a polynomial of degree >= 2
428 the evolution function is not "simple". */
429 if (tree_is_chrec (evolution_part
))
432 step_expr
= evolution_part
;
433 init_expr
= unshare_expr (initial_condition_in_loop_num (access_fn
, loop_nb
));
435 if (dump_enabled_p ())
437 dump_printf_loc (MSG_NOTE
, vect_location
, "step: ");
438 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, step_expr
);
439 dump_printf (MSG_NOTE
, ", init: ");
440 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, init_expr
);
441 dump_printf (MSG_NOTE
, "\n");
447 if (TREE_CODE (step_expr
) != INTEGER_CST
448 && (TREE_CODE (step_expr
) != SSA_NAME
449 || ((bb
= gimple_bb (SSA_NAME_DEF_STMT (step_expr
)))
450 && flow_bb_inside_loop_p (get_loop (cfun
, loop_nb
), bb
))
451 || (!INTEGRAL_TYPE_P (TREE_TYPE (step_expr
))
452 && (!SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr
))
453 || !flag_associative_math
)))
454 && (TREE_CODE (step_expr
) != REAL_CST
455 || !flag_associative_math
))
457 if (dump_enabled_p ())
458 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
/* Function vect_analyze_scalar_cycles_1.

   Examine the cross iteration def-use cycles of scalar variables
   in LOOP.  LOOP_VINFO represents the loop that is now being
   considered for vectorization (can be LOOP, or an outer-loop
   enclosing LOOP).  */
474 vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo
, struct loop
*loop
)
476 basic_block bb
= loop
->header
;
478 auto_vec
<gimple
*, 64> worklist
;
482 if (dump_enabled_p ())
483 dump_printf_loc (MSG_NOTE
, vect_location
,
484 "=== vect_analyze_scalar_cycles ===\n");
486 /* First - identify all inductions. Reduction detection assumes that all the
487 inductions have been identified, therefore, this order must not be
489 for (gsi
= gsi_start_phis (bb
); !gsi_end_p (gsi
); gsi_next (&gsi
))
491 gphi
*phi
= gsi
.phi ();
492 tree access_fn
= NULL
;
493 tree def
= PHI_RESULT (phi
);
494 stmt_vec_info stmt_vinfo
= vinfo_for_stmt (phi
);
496 if (dump_enabled_p ())
498 dump_printf_loc (MSG_NOTE
, vect_location
, "Analyze phi: ");
499 dump_gimple_stmt (MSG_NOTE
, TDF_SLIM
, phi
, 0);
502 /* Skip virtual phi's. The data dependences that are associated with
503 virtual defs/uses (i.e., memory accesses) are analyzed elsewhere. */
504 if (virtual_operand_p (def
))
507 STMT_VINFO_DEF_TYPE (stmt_vinfo
) = vect_unknown_def_type
;
509 /* Analyze the evolution function. */
510 access_fn
= analyze_scalar_evolution (loop
, def
);
513 STRIP_NOPS (access_fn
);
514 if (dump_enabled_p ())
516 dump_printf_loc (MSG_NOTE
, vect_location
,
517 "Access function of PHI: ");
518 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, access_fn
);
519 dump_printf (MSG_NOTE
, "\n");
521 STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo
)
522 = initial_condition_in_loop_num (access_fn
, loop
->num
);
523 STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo
)
524 = evolution_part_in_loop_num (access_fn
, loop
->num
);
528 || !vect_is_simple_iv_evolution (loop
->num
, access_fn
, &init
, &step
)
529 || (LOOP_VINFO_LOOP (loop_vinfo
) != loop
530 && TREE_CODE (step
) != INTEGER_CST
))
532 worklist
.safe_push (phi
);
536 gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo
)
538 gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo
) != NULL_TREE
);
540 if (dump_enabled_p ())
541 dump_printf_loc (MSG_NOTE
, vect_location
, "Detected induction.\n");
542 STMT_VINFO_DEF_TYPE (stmt_vinfo
) = vect_induction_def
;
546 /* Second - identify all reductions and nested cycles. */
547 while (worklist
.length () > 0)
549 gimple
*phi
= worklist
.pop ();
550 tree def
= PHI_RESULT (phi
);
551 stmt_vec_info stmt_vinfo
= vinfo_for_stmt (phi
);
554 if (dump_enabled_p ())
556 dump_printf_loc (MSG_NOTE
, vect_location
, "Analyze phi: ");
557 dump_gimple_stmt (MSG_NOTE
, TDF_SLIM
, phi
, 0);
560 gcc_assert (!virtual_operand_p (def
)
561 && STMT_VINFO_DEF_TYPE (stmt_vinfo
) == vect_unknown_def_type
);
563 reduc_stmt
= vect_force_simple_reduction (loop_vinfo
, phi
,
564 &double_reduc
, false);
569 if (dump_enabled_p ())
570 dump_printf_loc (MSG_NOTE
, vect_location
,
571 "Detected double reduction.\n");
573 STMT_VINFO_DEF_TYPE (stmt_vinfo
) = vect_double_reduction_def
;
574 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt
)) =
575 vect_double_reduction_def
;
579 if (loop
!= LOOP_VINFO_LOOP (loop_vinfo
))
581 if (dump_enabled_p ())
582 dump_printf_loc (MSG_NOTE
, vect_location
,
583 "Detected vectorizable nested cycle.\n");
585 STMT_VINFO_DEF_TYPE (stmt_vinfo
) = vect_nested_cycle
;
586 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt
)) =
591 if (dump_enabled_p ())
592 dump_printf_loc (MSG_NOTE
, vect_location
,
593 "Detected reduction.\n");
595 STMT_VINFO_DEF_TYPE (stmt_vinfo
) = vect_reduction_def
;
596 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt
)) =
598 /* Store the reduction cycles for possible vectorization in
599 loop-aware SLP if it was not detected as reduction
601 if (! GROUP_FIRST_ELEMENT (vinfo_for_stmt (reduc_stmt
)))
602 LOOP_VINFO_REDUCTIONS (loop_vinfo
).safe_push (reduc_stmt
);
607 if (dump_enabled_p ())
608 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
609 "Unknown def-use cycle pattern.\n");
/* Function vect_analyze_scalar_cycles.

   Examine the cross iteration def-use cycles of scalar variables, by
   analyzing the loop-header PHIs of scalar variables.  Classify each
   cycle as one of the following: invariant, induction, reduction, unknown.
   We do that for the loop represented by LOOP_VINFO, and also for its
   inner-loop, if it exists.

   Examples of scalar cycles:

   Example1: reduction:

              loop1:
              for (i=0; i<N; i++)
                 sum += a[i];

   Example2: induction:

              loop2:
              for (i=0; i<N; i++)
                 a[i] = i;  */
636 vect_analyze_scalar_cycles (loop_vec_info loop_vinfo
)
638 struct loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
640 vect_analyze_scalar_cycles_1 (loop_vinfo
, loop
);
  /* When vectorizing an outer-loop, the inner-loop is executed sequentially.
     Reductions in such inner-loop therefore have different properties than
     the reductions in the nest that gets vectorized:
     1. When vectorized, they are executed in the same order as in the original
        scalar loop, so we can't change the order of computation when
        vectorizing them.
     2. FIXME: Inner-loop reductions can be used in the inner-loop, so the
        current checks are too strict.  */
652 vect_analyze_scalar_cycles_1 (loop_vinfo
, loop
->inner
);
655 /* Transfer group and reduction information from STMT to its pattern stmt. */
658 vect_fixup_reduc_chain (gimple
*stmt
)
660 gimple
*firstp
= STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt
));
662 gcc_assert (!GROUP_FIRST_ELEMENT (vinfo_for_stmt (firstp
))
663 && GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt
)));
664 GROUP_SIZE (vinfo_for_stmt (firstp
)) = GROUP_SIZE (vinfo_for_stmt (stmt
));
667 stmtp
= STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt
));
668 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmtp
)) = firstp
;
669 stmt
= GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmt
));
671 GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmtp
))
672 = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt
));
675 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmtp
)) = vect_reduction_def
;
678 /* Fixup scalar cycles that now have their stmts detected as patterns. */
681 vect_fixup_scalar_cycles_with_patterns (loop_vec_info loop_vinfo
)
686 FOR_EACH_VEC_ELT (LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo
), i
, first
)
687 if (STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (first
)))
689 gimple
*next
= GROUP_NEXT_ELEMENT (vinfo_for_stmt (first
));
692 if (! STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (next
)))
694 next
= GROUP_NEXT_ELEMENT (vinfo_for_stmt (next
));
696 /* If not all stmt in the chain are patterns try to handle
697 the chain without patterns. */
700 vect_fixup_reduc_chain (first
);
701 LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo
)[i
]
702 = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (first
));
/* Function vect_get_loop_niters.

   Determine how many iterations the loop is executed and place it
   in NUMBER_OF_ITERATIONS.  Place the number of latch iterations
   in NUMBER_OF_ITERATIONSM1.  Place the condition under which the
   niter information holds in ASSUMPTIONS.

   Return the loop exit condition.  */
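
/* A worked example of the two counts (an added illustration, not code from
   this pass): for

     for (i = 0; i < 4; i++)
       ...

   the latch executes 3 times and the header executes 4 times, so
   NUMBER_OF_ITERATIONSM1 is 3 and NUMBER_OF_ITERATIONS is 3 + 1 = 4,
   matching the "number of latch executions plus one" computation below.  */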
718 vect_get_loop_niters (struct loop
*loop
, tree
*assumptions
,
719 tree
*number_of_iterations
, tree
*number_of_iterationsm1
)
721 edge exit
= single_exit (loop
);
722 struct tree_niter_desc niter_desc
;
723 tree niter_assumptions
, niter
, may_be_zero
;
724 gcond
*cond
= get_loop_exit_condition (loop
);
726 *assumptions
= boolean_true_node
;
727 *number_of_iterationsm1
= chrec_dont_know
;
728 *number_of_iterations
= chrec_dont_know
;
729 if (dump_enabled_p ())
730 dump_printf_loc (MSG_NOTE
, vect_location
,
731 "=== get_loop_niters ===\n");
736 niter
= chrec_dont_know
;
737 may_be_zero
= NULL_TREE
;
738 niter_assumptions
= boolean_true_node
;
739 if (!number_of_iterations_exit_assumptions (loop
, exit
, &niter_desc
, NULL
)
740 || chrec_contains_undetermined (niter_desc
.niter
))
743 niter_assumptions
= niter_desc
.assumptions
;
744 may_be_zero
= niter_desc
.may_be_zero
;
745 niter
= niter_desc
.niter
;
747 if (may_be_zero
&& integer_zerop (may_be_zero
))
748 may_be_zero
= NULL_TREE
;
752 if (COMPARISON_CLASS_P (may_be_zero
))
754 /* Try to combine may_be_zero with assumptions, this can simplify
755 computation of niter expression. */
756 if (niter_assumptions
&& !integer_nonzerop (niter_assumptions
))
757 niter_assumptions
= fold_build2 (TRUTH_AND_EXPR
, boolean_type_node
,
759 fold_build1 (TRUTH_NOT_EXPR
,
763 niter
= fold_build3 (COND_EXPR
, TREE_TYPE (niter
), may_be_zero
,
764 build_int_cst (TREE_TYPE (niter
), 0),
765 rewrite_to_non_trapping_overflow (niter
));
767 may_be_zero
= NULL_TREE
;
769 else if (integer_nonzerop (may_be_zero
))
771 *number_of_iterationsm1
= build_int_cst (TREE_TYPE (niter
), 0);
772 *number_of_iterations
= build_int_cst (TREE_TYPE (niter
), 1);
779 *assumptions
= niter_assumptions
;
780 *number_of_iterationsm1
= niter
;
782 /* We want the number of loop header executions which is the number
783 of latch executions plus one.
784 ??? For UINT_MAX latch executions this number overflows to zero
785 for loops like do { n++; } while (n != 0); */
786 if (niter
&& !chrec_contains_undetermined (niter
))
787 niter
= fold_build2 (PLUS_EXPR
, TREE_TYPE (niter
), unshare_expr (niter
),
788 build_int_cst (TREE_TYPE (niter
), 1));
789 *number_of_iterations
= niter
;
/* Function bb_in_loop_p

   Used as predicate for dfs order traversal of the loop bbs.  */

static bool
bb_in_loop_p (const_basic_block bb, const void *data)
{
  const struct loop *const loop = (const struct loop *)data;
  if (flow_bb_inside_loop_p (loop, bb))
    return true;
  return false;
}
/* Create and initialize a new loop_vec_info struct for LOOP_IN, as well as
   stmt_vec_info structs for all the stmts in LOOP_IN.  */
811 _loop_vec_info::_loop_vec_info (struct loop
*loop_in
)
812 : vec_info (vec_info::loop
, init_cost (loop_in
)),
814 bbs (XCNEWVEC (basic_block
, loop
->num_nodes
)),
815 num_itersm1 (NULL_TREE
),
816 num_iters (NULL_TREE
),
817 num_iters_unchanged (NULL_TREE
),
818 num_iters_assumptions (NULL_TREE
),
820 versioning_threshold (0),
821 vectorization_factor (0),
822 max_vectorization_factor (0),
823 mask_skip_niters (NULL_TREE
),
824 mask_compare_type (NULL_TREE
),
826 peeling_for_alignment (0),
829 slp_unrolling_factor (1),
830 single_scalar_iteration_cost (0),
831 vectorizable (false),
832 can_fully_mask_p (true),
833 fully_masked_p (false),
834 peeling_for_gaps (false),
835 peeling_for_niter (false),
836 operands_swapped (false),
837 no_data_dependencies (false),
838 has_mask_store (false),
840 orig_loop_info (NULL
)
842 /* Create/Update stmt_info for all stmts in the loop. */
843 basic_block
*body
= get_loop_body (loop
);
844 for (unsigned int i
= 0; i
< loop
->num_nodes
; i
++)
846 basic_block bb
= body
[i
];
847 gimple_stmt_iterator si
;
849 for (si
= gsi_start_phis (bb
); !gsi_end_p (si
); gsi_next (&si
))
851 gimple
*phi
= gsi_stmt (si
);
852 gimple_set_uid (phi
, 0);
853 set_vinfo_for_stmt (phi
, new_stmt_vec_info (phi
, this));
856 for (si
= gsi_start_bb (bb
); !gsi_end_p (si
); gsi_next (&si
))
858 gimple
*stmt
= gsi_stmt (si
);
859 gimple_set_uid (stmt
, 0);
860 set_vinfo_for_stmt (stmt
, new_stmt_vec_info (stmt
, this));
865 /* CHECKME: We want to visit all BBs before their successors (except for
866 latch blocks, for which this assertion wouldn't hold). In the simple
867 case of the loop forms we allow, a dfs order of the BBs would the same
868 as reversed postorder traversal, so we are safe. */
870 unsigned int nbbs
= dfs_enumerate_from (loop
->header
, 0, bb_in_loop_p
,
871 bbs
, loop
->num_nodes
, loop
);
872 gcc_assert (nbbs
== loop
->num_nodes
);
/* Free all levels of MASKS.  */

void
release_vec_loop_masks (vec_loop_masks *masks)
{
  rgroup_masks *rgm;
  unsigned int i;
  FOR_EACH_VEC_ELT (*masks, i, rgm)
    rgm->masks.release ();
  masks->release ();
}
887 /* Free all memory used by the _loop_vec_info, as well as all the
888 stmt_vec_info structs of all the stmts in the loop. */
890 _loop_vec_info::~_loop_vec_info ()
893 gimple_stmt_iterator si
;
896 nbbs
= loop
->num_nodes
;
897 for (j
= 0; j
< nbbs
; j
++)
899 basic_block bb
= bbs
[j
];
900 for (si
= gsi_start_phis (bb
); !gsi_end_p (si
); gsi_next (&si
))
901 free_stmt_vec_info (gsi_stmt (si
));
903 for (si
= gsi_start_bb (bb
); !gsi_end_p (si
); )
905 gimple
*stmt
= gsi_stmt (si
);
907 /* We may have broken canonical form by moving a constant
908 into RHS1 of a commutative op. Fix such occurrences. */
909 if (operands_swapped
&& is_gimple_assign (stmt
))
911 enum tree_code code
= gimple_assign_rhs_code (stmt
);
913 if ((code
== PLUS_EXPR
914 || code
== POINTER_PLUS_EXPR
915 || code
== MULT_EXPR
)
916 && CONSTANT_CLASS_P (gimple_assign_rhs1 (stmt
)))
917 swap_ssa_operands (stmt
,
918 gimple_assign_rhs1_ptr (stmt
),
919 gimple_assign_rhs2_ptr (stmt
));
920 else if (code
== COND_EXPR
921 && CONSTANT_CLASS_P (gimple_assign_rhs2 (stmt
)))
923 tree cond_expr
= gimple_assign_rhs1 (stmt
);
924 enum tree_code cond_code
= TREE_CODE (cond_expr
);
926 if (TREE_CODE_CLASS (cond_code
) == tcc_comparison
)
928 bool honor_nans
= HONOR_NANS (TREE_OPERAND (cond_expr
,
930 cond_code
= invert_tree_comparison (cond_code
,
932 if (cond_code
!= ERROR_MARK
)
934 TREE_SET_CODE (cond_expr
, cond_code
);
935 swap_ssa_operands (stmt
,
936 gimple_assign_rhs2_ptr (stmt
),
937 gimple_assign_rhs3_ptr (stmt
));
943 /* Free stmt_vec_info. */
944 free_stmt_vec_info (stmt
);
951 release_vec_loop_masks (&masks
);
957 /* Return an invariant or register for EXPR and emit necessary
958 computations in the LOOP_VINFO loop preheader. */
961 cse_and_gimplify_to_preheader (loop_vec_info loop_vinfo
, tree expr
)
963 if (is_gimple_reg (expr
)
964 || is_gimple_min_invariant (expr
))
967 if (! loop_vinfo
->ivexpr_map
)
968 loop_vinfo
->ivexpr_map
= new hash_map
<tree_operand_hash
, tree
>;
969 tree
&cached
= loop_vinfo
->ivexpr_map
->get_or_insert (expr
);
972 gimple_seq stmts
= NULL
;
973 cached
= force_gimple_operand (unshare_expr (expr
),
974 &stmts
, true, NULL_TREE
);
977 edge e
= loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo
));
978 gsi_insert_seq_on_edge_immediate (e
, stmts
);
/* Return true if we can use CMP_TYPE as the comparison type to produce
   all masks required to mask LOOP_VINFO.  */

static bool
can_produce_all_loop_masks_p (loop_vec_info loop_vinfo, tree cmp_type)
{
  rgroup_masks *rgm;
  unsigned int i;
  FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), i, rgm)
    if (rgm->mask_type != NULL_TREE
        && !direct_internal_fn_supported_p (IFN_WHILE_ULT,
                                            cmp_type, rgm->mask_type,
                                            OPTIMIZE_FOR_SPEED))
      return false;
  return true;
}
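
/* For reference, the semantics assumed of IFN_WHILE_ULT above (a sketch;
   internal-fn.def has the authoritative definition): WHILE_ULT (A, B)
   produces a mask whose element I is (A + I < B).  For example, with an
   8-lane mask type, WHILE_ULT (96, 100) yields
   { 1, 1, 1, 1, 0, 0, 0, 0 }, i.e. only the first four lanes are active.  */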
/* Calculate the maximum number of scalars per iteration for every
   rgroup in LOOP_VINFO.  */

static unsigned int
vect_get_max_nscalars_per_iter (loop_vec_info loop_vinfo)
{
  unsigned int res = 1;
  unsigned int i;
  rgroup_masks *rgm;
  FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), i, rgm)
    res = MAX (res, rgm->max_nscalars_per_iter);
  return res;
}
/* Each statement in LOOP_VINFO can be masked where necessary.  Check
   whether we can actually generate the masks required.  Return true if so,
   storing the type of the scalar IV in LOOP_VINFO_MASK_COMPARE_TYPE.  */
1020 vect_verify_full_masking (loop_vec_info loop_vinfo
)
1022 struct loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
1023 unsigned int min_ni_width
;
1025 /* Use a normal loop if there are no statements that need masking.
1026 This only happens in rare degenerate cases: it means that the loop
1027 has no loads, no stores, and no live-out values. */
1028 if (LOOP_VINFO_MASKS (loop_vinfo
).is_empty ())
1031 /* Get the maximum number of iterations that is representable
1032 in the counter type. */
1033 tree ni_type
= TREE_TYPE (LOOP_VINFO_NITERSM1 (loop_vinfo
));
1034 widest_int max_ni
= wi::to_widest (TYPE_MAX_VALUE (ni_type
)) + 1;
1036 /* Get a more refined estimate for the number of iterations. */
1037 widest_int max_back_edges
;
1038 if (max_loop_iterations (loop
, &max_back_edges
))
1039 max_ni
= wi::smin (max_ni
, max_back_edges
+ 1);
1041 /* Account for rgroup masks, in which each bit is replicated N times. */
1042 max_ni
*= vect_get_max_nscalars_per_iter (loop_vinfo
);
1044 /* Work out how many bits we need to represent the limit. */
1045 min_ni_width
= wi::min_precision (max_ni
, UNSIGNED
);
1047 /* Find a scalar mode for which WHILE_ULT is supported. */
1048 opt_scalar_int_mode cmp_mode_iter
;
1049 tree cmp_type
= NULL_TREE
;
1050 FOR_EACH_MODE_IN_CLASS (cmp_mode_iter
, MODE_INT
)
1052 unsigned int cmp_bits
= GET_MODE_BITSIZE (cmp_mode_iter
.require ());
1053 if (cmp_bits
>= min_ni_width
1054 && targetm
.scalar_mode_supported_p (cmp_mode_iter
.require ()))
1056 tree this_type
= build_nonstandard_integer_type (cmp_bits
, true);
1058 && can_produce_all_loop_masks_p (loop_vinfo
, this_type
))
1060 /* Although we could stop as soon as we find a valid mode,
1061 it's often better to continue until we hit Pmode, since the
1062 operands to the WHILE are more likely to be reusable in
1063 address calculations. */
1064 cmp_type
= this_type
;
1065 if (cmp_bits
>= GET_MODE_BITSIZE (Pmode
))
1074 LOOP_VINFO_MASK_COMPARE_TYPE (loop_vinfo
) = cmp_type
;
1078 /* Calculate the cost of one scalar iteration of the loop. */
1080 vect_compute_single_scalar_iteration_cost (loop_vec_info loop_vinfo
)
1082 struct loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
1083 basic_block
*bbs
= LOOP_VINFO_BBS (loop_vinfo
);
1084 int nbbs
= loop
->num_nodes
, factor
;
1085 int innerloop_iters
, i
;
1087 /* Gather costs for statements in the scalar loop. */
1090 innerloop_iters
= 1;
1092 innerloop_iters
= 50; /* FIXME */
1094 for (i
= 0; i
< nbbs
; i
++)
1096 gimple_stmt_iterator si
;
1097 basic_block bb
= bbs
[i
];
1099 if (bb
->loop_father
== loop
->inner
)
1100 factor
= innerloop_iters
;
1104 for (si
= gsi_start_bb (bb
); !gsi_end_p (si
); gsi_next (&si
))
1106 gimple
*stmt
= gsi_stmt (si
);
1107 stmt_vec_info stmt_info
= vinfo_for_stmt (stmt
);
1109 if (!is_gimple_assign (stmt
) && !is_gimple_call (stmt
))
1112 /* Skip stmts that are not vectorized inside the loop. */
1114 && !STMT_VINFO_RELEVANT_P (stmt_info
)
1115 && (!STMT_VINFO_LIVE_P (stmt_info
)
1116 || !VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info
)))
1117 && !STMT_VINFO_IN_PATTERN_P (stmt_info
))
1120 vect_cost_for_stmt kind
;
1121 if (STMT_VINFO_DATA_REF (stmt_info
))
1123 if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info
)))
1126 kind
= scalar_store
;
1131 record_stmt_cost (&LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo
),
1132 factor
, kind
, stmt_info
, 0, vect_prologue
);
1136 /* Now accumulate cost. */
1137 void *target_cost_data
= init_cost (loop
);
1138 stmt_info_for_cost
*si
;
1140 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo
),
1143 struct _stmt_vec_info
*stmt_info
1144 = si
->stmt
? vinfo_for_stmt (si
->stmt
) : NULL
;
1145 (void) add_stmt_cost (target_cost_data
, si
->count
,
1146 si
->kind
, stmt_info
, si
->misalign
,
1149 unsigned dummy
, body_cost
= 0;
1150 finish_cost (target_cost_data
, &dummy
, &body_cost
, &dummy
);
1151 destroy_cost_data (target_cost_data
);
1152 LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo
) = body_cost
;
/* Function vect_analyze_loop_form_1.

   Verify that certain CFG restrictions hold, including:
   - the loop has a pre-header
   - the loop has a single entry and exit
   - the loop exit condition is simple enough
   - the number of iterations can be analyzed, i.e., a countable loop.  The
     niter could be analyzed under some assumptions.  */
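
/* The inner-most loop shape these checks accept (an added illustrative
   sketch, not code from this pass): a two-block loop, header plus empty
   latch, with the exit test at the end of the body, i.e. the GIMPLE
   equivalent of:

     i = 0;
     do
       {
         a[i] = b[i] + c[i];
         i = i + 1;
       }
     while (i < n);
*/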
1166 vect_analyze_loop_form_1 (struct loop
*loop
, gcond
**loop_cond
,
1167 tree
*assumptions
, tree
*number_of_iterationsm1
,
1168 tree
*number_of_iterations
, gcond
**inner_loop_cond
)
1170 if (dump_enabled_p ())
1171 dump_printf_loc (MSG_NOTE
, vect_location
,
1172 "=== vect_analyze_loop_form ===\n");
1174 /* Different restrictions apply when we are considering an inner-most loop,
1175 vs. an outer (nested) loop.
1176 (FORNOW. May want to relax some of these restrictions in the future). */
1180 /* Inner-most loop. We currently require that the number of BBs is
1181 exactly 2 (the header and latch). Vectorizable inner-most loops
1192 if (loop
->num_nodes
!= 2)
1194 if (dump_enabled_p ())
1195 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1196 "not vectorized: control flow in loop.\n");
1200 if (empty_block_p (loop
->header
))
1202 if (dump_enabled_p ())
1203 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1204 "not vectorized: empty loop.\n");
1210 struct loop
*innerloop
= loop
->inner
;
1213 /* Nested loop. We currently require that the loop is doubly-nested,
1214 contains a single inner loop, and the number of BBs is exactly 5.
1215 Vectorizable outer-loops look like this:
1227 The inner-loop has the properties expected of inner-most loops
1228 as described above. */
1230 if ((loop
->inner
)->inner
|| (loop
->inner
)->next
)
1232 if (dump_enabled_p ())
1233 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1234 "not vectorized: multiple nested loops.\n");
1238 if (loop
->num_nodes
!= 5)
1240 if (dump_enabled_p ())
1241 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1242 "not vectorized: control flow in loop.\n");
1246 entryedge
= loop_preheader_edge (innerloop
);
1247 if (entryedge
->src
!= loop
->header
1248 || !single_exit (innerloop
)
1249 || single_exit (innerloop
)->dest
!= EDGE_PRED (loop
->latch
, 0)->src
)
1251 if (dump_enabled_p ())
1252 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1253 "not vectorized: unsupported outerloop form.\n");
1257 /* Analyze the inner-loop. */
1258 tree inner_niterm1
, inner_niter
, inner_assumptions
;
1259 if (! vect_analyze_loop_form_1 (loop
->inner
, inner_loop_cond
,
1260 &inner_assumptions
, &inner_niterm1
,
1262 /* Don't support analyzing niter under assumptions for inner
1264 || !integer_onep (inner_assumptions
))
1266 if (dump_enabled_p ())
1267 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1268 "not vectorized: Bad inner loop.\n");
1272 if (!expr_invariant_in_loop_p (loop
, inner_niter
))
1274 if (dump_enabled_p ())
1275 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1276 "not vectorized: inner-loop count not"
1281 if (dump_enabled_p ())
1282 dump_printf_loc (MSG_NOTE
, vect_location
,
1283 "Considering outer-loop vectorization.\n");
1286 if (!single_exit (loop
)
1287 || EDGE_COUNT (loop
->header
->preds
) != 2)
1289 if (dump_enabled_p ())
1291 if (!single_exit (loop
))
1292 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1293 "not vectorized: multiple exits.\n");
1294 else if (EDGE_COUNT (loop
->header
->preds
) != 2)
1295 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1296 "not vectorized: too many incoming edges.\n");
  /* We assume that the loop exit condition is at the end of the loop, i.e.,
     that the loop is represented as a do-while (with a proper if-guard
     before the loop if needed), where the loop header contains all the
     executable statements, and the latch is empty.  */
1305 if (!empty_block_p (loop
->latch
)
1306 || !gimple_seq_empty_p (phi_nodes (loop
->latch
)))
1308 if (dump_enabled_p ())
1309 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1310 "not vectorized: latch block not empty.\n");
1314 /* Make sure the exit is not abnormal. */
1315 edge e
= single_exit (loop
);
1316 if (e
->flags
& EDGE_ABNORMAL
)
1318 if (dump_enabled_p ())
1319 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1320 "not vectorized: abnormal loop exit edge.\n");
1324 *loop_cond
= vect_get_loop_niters (loop
, assumptions
, number_of_iterations
,
1325 number_of_iterationsm1
);
1328 if (dump_enabled_p ())
1329 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1330 "not vectorized: complicated exit condition.\n");
1334 if (integer_zerop (*assumptions
)
1335 || !*number_of_iterations
1336 || chrec_contains_undetermined (*number_of_iterations
))
1338 if (dump_enabled_p ())
1339 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1340 "not vectorized: number of iterations cannot be "
1345 if (integer_zerop (*number_of_iterations
))
1347 if (dump_enabled_p ())
1348 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1349 "not vectorized: number of iterations = 0.\n");
1356 /* Analyze LOOP form and return a loop_vec_info if it is of suitable form. */
1359 vect_analyze_loop_form (struct loop
*loop
)
1361 tree assumptions
, number_of_iterations
, number_of_iterationsm1
;
1362 gcond
*loop_cond
, *inner_loop_cond
= NULL
;
1364 if (! vect_analyze_loop_form_1 (loop
, &loop_cond
,
1365 &assumptions
, &number_of_iterationsm1
,
1366 &number_of_iterations
, &inner_loop_cond
))
1369 loop_vec_info loop_vinfo
= new _loop_vec_info (loop
);
1370 LOOP_VINFO_NITERSM1 (loop_vinfo
) = number_of_iterationsm1
;
1371 LOOP_VINFO_NITERS (loop_vinfo
) = number_of_iterations
;
1372 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo
) = number_of_iterations
;
1373 if (!integer_onep (assumptions
))
1375 /* We consider to vectorize this loop by versioning it under
1376 some assumptions. In order to do this, we need to clear
1377 existing information computed by scev and niter analyzer. */
1379 free_numbers_of_iterations_estimates (loop
);
1380 /* Also set flag for this loop so that following scev and niter
1381 analysis are done under the assumptions. */
1382 loop_constraint_set (loop
, LOOP_C_FINITE
);
1383 /* Also record the assumptions for versioning. */
1384 LOOP_VINFO_NITERS_ASSUMPTIONS (loop_vinfo
) = assumptions
;
1387 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo
))
1389 if (dump_enabled_p ())
1391 dump_printf_loc (MSG_NOTE
, vect_location
,
1392 "Symbolic number of iterations is ");
1393 dump_generic_expr (MSG_NOTE
, TDF_DETAILS
, number_of_iterations
);
1394 dump_printf (MSG_NOTE
, "\n");
1398 STMT_VINFO_TYPE (vinfo_for_stmt (loop_cond
)) = loop_exit_ctrl_vec_info_type
;
1399 if (inner_loop_cond
)
1400 STMT_VINFO_TYPE (vinfo_for_stmt (inner_loop_cond
))
1401 = loop_exit_ctrl_vec_info_type
;
1403 gcc_assert (!loop
->aux
);
1404 loop
->aux
= loop_vinfo
;
/* Scan the loop stmts and, depending on whether there are any (non-)SLP
   statements, update the vectorization factor.  */
1414 vect_update_vf_for_slp (loop_vec_info loop_vinfo
)
1416 struct loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
1417 basic_block
*bbs
= LOOP_VINFO_BBS (loop_vinfo
);
1418 int nbbs
= loop
->num_nodes
;
1419 poly_uint64 vectorization_factor
;
1422 if (dump_enabled_p ())
1423 dump_printf_loc (MSG_NOTE
, vect_location
,
1424 "=== vect_update_vf_for_slp ===\n");
1426 vectorization_factor
= LOOP_VINFO_VECT_FACTOR (loop_vinfo
);
1427 gcc_assert (known_ne (vectorization_factor
, 0U));
1429 /* If all the stmts in the loop can be SLPed, we perform only SLP, and
1430 vectorization factor of the loop is the unrolling factor required by
1431 the SLP instances. If that unrolling factor is 1, we say, that we
1432 perform pure SLP on loop - cross iteration parallelism is not
1434 bool only_slp_in_loop
= true;
1435 for (i
= 0; i
< nbbs
; i
++)
1437 basic_block bb
= bbs
[i
];
1438 for (gimple_stmt_iterator si
= gsi_start_bb (bb
); !gsi_end_p (si
);
1441 gimple
*stmt
= gsi_stmt (si
);
1442 stmt_vec_info stmt_info
= vinfo_for_stmt (stmt
);
1443 if (STMT_VINFO_IN_PATTERN_P (stmt_info
)
1444 && STMT_VINFO_RELATED_STMT (stmt_info
))
1446 stmt
= STMT_VINFO_RELATED_STMT (stmt_info
);
1447 stmt_info
= vinfo_for_stmt (stmt
);
1449 if ((STMT_VINFO_RELEVANT_P (stmt_info
)
1450 || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info
)))
1451 && !PURE_SLP_STMT (stmt_info
))
1452 /* STMT needs both SLP and loop-based vectorization. */
1453 only_slp_in_loop
= false;
1457 if (only_slp_in_loop
)
1459 dump_printf_loc (MSG_NOTE
, vect_location
,
1460 "Loop contains only SLP stmts\n");
1461 vectorization_factor
= LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo
);
1465 dump_printf_loc (MSG_NOTE
, vect_location
,
1466 "Loop contains SLP and non-SLP stmts\n");
1467 /* Both the vectorization factor and unroll factor have the form
1468 current_vector_size * X for some rational X, so they must have
1469 a common multiple. */
1470 vectorization_factor
1471 = force_common_multiple (vectorization_factor
,
1472 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo
));
1475 LOOP_VINFO_VECT_FACTOR (loop_vinfo
) = vectorization_factor
;
1476 if (dump_enabled_p ())
1478 dump_printf_loc (MSG_NOTE
, vect_location
,
1479 "Updating vectorization factor to ");
1480 dump_dec (MSG_NOTE
, vectorization_factor
);
1481 dump_printf (MSG_NOTE
, ".\n");
/* Return true if STMT_INFO describes a double reduction phi and if
   the other phi in the reduction is also relevant for vectorization.
   This rejects cases such as:

      outer1:
        x_1 = PHI <x_3(outer2), ...>;
        ...

      inner:
        x_2 = ...;
        ...

      outer2:
        x_3 = PHI <x_2(inner)>;

   if nothing in x_2 or elsewhere makes x_1 relevant.  */

static bool
vect_active_double_reduction_p (stmt_vec_info stmt_info)
{
  if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
    return false;

  gimple *other_phi = STMT_VINFO_REDUC_DEF (stmt_info);
  return STMT_VINFO_RELEVANT_P (vinfo_for_stmt (other_phi));
}
/* Function vect_analyze_loop_operations.

   Scan the loop stmts and make sure they are all vectorizable.  */
1517 vect_analyze_loop_operations (loop_vec_info loop_vinfo
)
1519 struct loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
1520 basic_block
*bbs
= LOOP_VINFO_BBS (loop_vinfo
);
1521 int nbbs
= loop
->num_nodes
;
1523 stmt_vec_info stmt_info
;
1524 bool need_to_vectorize
= false;
1527 if (dump_enabled_p ())
1528 dump_printf_loc (MSG_NOTE
, vect_location
,
1529 "=== vect_analyze_loop_operations ===\n");
1531 stmt_vector_for_cost cost_vec
;
1532 cost_vec
.create (2);
1534 for (i
= 0; i
< nbbs
; i
++)
1536 basic_block bb
= bbs
[i
];
1538 for (gphi_iterator si
= gsi_start_phis (bb
); !gsi_end_p (si
);
1541 gphi
*phi
= si
.phi ();
1544 stmt_info
= vinfo_for_stmt (phi
);
1545 if (dump_enabled_p ())
1547 dump_printf_loc (MSG_NOTE
, vect_location
, "examining phi: ");
1548 dump_gimple_stmt (MSG_NOTE
, TDF_SLIM
, phi
, 0);
1550 if (virtual_operand_p (gimple_phi_result (phi
)))
1553 /* Inner-loop loop-closed exit phi in outer-loop vectorization
1554 (i.e., a phi in the tail of the outer-loop). */
1555 if (! is_loop_header_bb_p (bb
))
1557 /* FORNOW: we currently don't support the case that these phis
1558 are not used in the outerloop (unless it is double reduction,
1559 i.e., this phi is vect_reduction_def), cause this case
1560 requires to actually do something here. */
1561 if (STMT_VINFO_LIVE_P (stmt_info
)
1562 && !vect_active_double_reduction_p (stmt_info
))
1564 if (dump_enabled_p ())
1565 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1566 "Unsupported loop-closed phi in "
1571 /* If PHI is used in the outer loop, we check that its operand
1572 is defined in the inner loop. */
1573 if (STMT_VINFO_RELEVANT_P (stmt_info
))
1576 gimple
*op_def_stmt
;
1578 if (gimple_phi_num_args (phi
) != 1)
1581 phi_op
= PHI_ARG_DEF (phi
, 0);
1582 if (TREE_CODE (phi_op
) != SSA_NAME
)
1585 op_def_stmt
= SSA_NAME_DEF_STMT (phi_op
);
1586 if (gimple_nop_p (op_def_stmt
)
1587 || !flow_bb_inside_loop_p (loop
, gimple_bb (op_def_stmt
))
1588 || !vinfo_for_stmt (op_def_stmt
))
1591 if (STMT_VINFO_RELEVANT (vinfo_for_stmt (op_def_stmt
))
1592 != vect_used_in_outer
1593 && STMT_VINFO_RELEVANT (vinfo_for_stmt (op_def_stmt
))
1594 != vect_used_in_outer_by_reduction
)
1601 gcc_assert (stmt_info
);
1603 if ((STMT_VINFO_RELEVANT (stmt_info
) == vect_used_in_scope
1604 || STMT_VINFO_LIVE_P (stmt_info
))
1605 && STMT_VINFO_DEF_TYPE (stmt_info
) != vect_induction_def
)
1607 /* A scalar-dependence cycle that we don't support. */
1608 if (dump_enabled_p ())
1609 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1610 "not vectorized: scalar dependence cycle.\n");
1614 if (STMT_VINFO_RELEVANT_P (stmt_info
))
1616 need_to_vectorize
= true;
1617 if (STMT_VINFO_DEF_TYPE (stmt_info
) == vect_induction_def
1618 && ! PURE_SLP_STMT (stmt_info
))
1619 ok
= vectorizable_induction (phi
, NULL
, NULL
, NULL
, &cost_vec
);
1620 else if ((STMT_VINFO_DEF_TYPE (stmt_info
) == vect_reduction_def
1621 || STMT_VINFO_DEF_TYPE (stmt_info
) == vect_nested_cycle
)
1622 && ! PURE_SLP_STMT (stmt_info
))
1623 ok
= vectorizable_reduction (phi
, NULL
, NULL
, NULL
, NULL
,
1627 /* SLP PHIs are tested by vect_slp_analyze_node_operations. */
1629 && STMT_VINFO_LIVE_P (stmt_info
)
1630 && !PURE_SLP_STMT (stmt_info
))
1631 ok
= vectorizable_live_operation (phi
, NULL
, NULL
, -1, NULL
,
1636 if (dump_enabled_p ())
1638 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1639 "not vectorized: relevant phi not "
1641 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, phi
, 0);
1647 for (gimple_stmt_iterator si
= gsi_start_bb (bb
); !gsi_end_p (si
);
1650 gimple
*stmt
= gsi_stmt (si
);
1651 if (!gimple_clobber_p (stmt
)
1652 && !vect_analyze_stmt (stmt
, &need_to_vectorize
, NULL
, NULL
,
1658 add_stmt_costs (loop_vinfo
->target_cost_data
, &cost_vec
);
1659 cost_vec
.release ();
1661 /* All operations in the loop are either irrelevant (deal with loop
1662 control, or dead), or only used outside the loop and can be moved
1663 out of the loop (e.g. invariants, inductions). The loop can be
1664 optimized away by scalar optimizations. We're better off not
1665 touching this loop. */
1666 if (!need_to_vectorize
)
1668 if (dump_enabled_p ())
1669 dump_printf_loc (MSG_NOTE
, vect_location
,
1670 "All the computation can be taken out of the loop.\n");
1671 if (dump_enabled_p ())
1672 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1673 "not vectorized: redundant loop. no profit to "
/* Analyze the cost of the loop described by LOOP_VINFO.  Decide if it
   is worthwhile to vectorize.  Return 1 if definitely yes, 0 if
   definitely no, or -1 if it's worth retrying.  */
1686 vect_analyze_loop_costing (loop_vec_info loop_vinfo
)
1688 struct loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
1689 unsigned int assumed_vf
= vect_vf_for_cost (loop_vinfo
);
1691 /* Only fully-masked loops can have iteration counts less than the
1692 vectorization factor. */
1693 if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo
))
1695 HOST_WIDE_INT max_niter
;
1697 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo
))
1698 max_niter
= LOOP_VINFO_INT_NITERS (loop_vinfo
);
1700 max_niter
= max_stmt_executions_int (loop
);
1703 && (unsigned HOST_WIDE_INT
) max_niter
< assumed_vf
)
1705 if (dump_enabled_p ())
1706 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1707 "not vectorized: iteration count smaller than "
1708 "vectorization factor.\n");
1713 int min_profitable_iters
, min_profitable_estimate
;
1714 vect_estimate_min_profitable_iters (loop_vinfo
, &min_profitable_iters
,
1715 &min_profitable_estimate
);
1717 if (min_profitable_iters
< 0)
1719 if (dump_enabled_p ())
1720 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1721 "not vectorized: vectorization not profitable.\n");
1722 if (dump_enabled_p ())
1723 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1724 "not vectorized: vector version will never be "
1729 int min_scalar_loop_bound
= (PARAM_VALUE (PARAM_MIN_VECT_LOOP_BOUND
)
1732 /* Use the cost model only if it is more conservative than user specified
1734 unsigned int th
= (unsigned) MAX (min_scalar_loop_bound
,
1735 min_profitable_iters
);
1737 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo
) = th
;
1739 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo
)
1740 && LOOP_VINFO_INT_NITERS (loop_vinfo
) < th
)
1742 if (dump_enabled_p ())
1743 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1744 "not vectorized: vectorization not profitable.\n");
1745 if (dump_enabled_p ())
1746 dump_printf_loc (MSG_NOTE
, vect_location
,
1747 "not vectorized: iteration count smaller than user "
1748 "specified loop bound parameter or minimum profitable "
1749 "iterations (whichever is more conservative).\n");
1753 HOST_WIDE_INT estimated_niter
= estimated_stmt_executions_int (loop
);
1754 if (estimated_niter
== -1)
1755 estimated_niter
= likely_max_stmt_executions_int (loop
);
1756 if (estimated_niter
!= -1
1757 && ((unsigned HOST_WIDE_INT
) estimated_niter
1758 < MAX (th
, (unsigned) min_profitable_estimate
)))
1760 if (dump_enabled_p ())
1761 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1762 "not vectorized: estimated iteration count too "
1764 if (dump_enabled_p ())
1765 dump_printf_loc (MSG_NOTE
, vect_location
,
1766 "not vectorized: estimated iteration count smaller "
1767 "than specified loop bound parameter or minimum "
1768 "profitable iterations (whichever is more "
1769 "conservative).\n");
/* Function vect_analyze_loop_2.

   Apply a set of analyses on LOOP, and create a loop_vec_info struct
   for it.  The different analyses will record information in the
   loop_vec_info struct.  */
1783 vect_analyze_loop_2 (loop_vec_info loop_vinfo
, bool &fatal
)
1787 unsigned int max_vf
= MAX_VECTORIZATION_FACTOR
;
1788 poly_uint64 min_vf
= 2;
1789 unsigned int n_stmts
= 0;
1791 /* The first group of checks is independent of the vector size. */
1794 /* Find all data references in the loop (which correspond to vdefs/vuses)
1795 and analyze their evolution in the loop. */
1797 basic_block
*bbs
= LOOP_VINFO_BBS (loop_vinfo
);
1799 loop_p loop
= LOOP_VINFO_LOOP (loop_vinfo
);
1800 if (!find_loop_nest (loop
, &LOOP_VINFO_LOOP_NEST (loop_vinfo
)))
1802 if (dump_enabled_p ())
1803 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1804 "not vectorized: loop nest containing two "
1805 "or more consecutive inner loops cannot be "
1810 for (unsigned i
= 0; i
< loop
->num_nodes
; i
++)
1811 for (gimple_stmt_iterator gsi
= gsi_start_bb (bbs
[i
]);
1812 !gsi_end_p (gsi
); gsi_next (&gsi
))
1814 gimple
*stmt
= gsi_stmt (gsi
);
1815 if (is_gimple_debug (stmt
))
1818 if (!find_data_references_in_stmt (loop
, stmt
,
1819 &LOOP_VINFO_DATAREFS (loop_vinfo
)))
1821 if (is_gimple_call (stmt
) && loop
->safelen
)
1823 tree fndecl
= gimple_call_fndecl (stmt
), op
;
1824 if (fndecl
!= NULL_TREE
)
1826 cgraph_node
*node
= cgraph_node::get (fndecl
);
1827 if (node
!= NULL
&& node
->simd_clones
!= NULL
)
1829 unsigned int j
, n
= gimple_call_num_args (stmt
);
1830 for (j
= 0; j
< n
; j
++)
1832 op
= gimple_call_arg (stmt
, j
);
1834 || (REFERENCE_CLASS_P (op
)
1835 && get_base_address (op
)))
1838 op
= gimple_call_lhs (stmt
);
1839 /* Ignore #pragma omp declare simd functions
1840 if they don't have data references in the
1841 call stmt itself. */
1845 || (REFERENCE_CLASS_P (op
)
1846 && get_base_address (op
)))))
1851 if (dump_enabled_p ())
1852 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1853 "not vectorized: loop contains function "
1854 "calls or data references that cannot "
1860 /* Analyze the data references and also adjust the minimal
1861 vectorization factor according to the loads and stores. */
1863 ok
= vect_analyze_data_refs (loop_vinfo
, &min_vf
);
1866 if (dump_enabled_p ())
1867 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1868 "bad data references.\n");
1872 /* Classify all cross-iteration scalar data-flow cycles.
1873 Cross-iteration cycles caused by virtual phis are analyzed separately. */
1874 vect_analyze_scalar_cycles (loop_vinfo
);
1876 vect_pattern_recog (loop_vinfo
);
1878 vect_fixup_scalar_cycles_with_patterns (loop_vinfo
);
1880 /* Analyze the access patterns of the data-refs in the loop (consecutive,
1881 complex, etc.). FORNOW: Only handle consecutive access pattern. */
1883 ok
= vect_analyze_data_ref_accesses (loop_vinfo
);
1886 if (dump_enabled_p ())
1887 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1888 "bad data access.\n");
1892 /* Data-flow analysis to detect stmts that do not need to be vectorized. */
1894 ok
= vect_mark_stmts_to_be_vectorized (loop_vinfo
);
1897 if (dump_enabled_p ())
1898 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1899 "unexpected pattern.\n");
1903 /* While the rest of the analysis below depends on it in some way. */
1906 /* Analyze data dependences between the data-refs in the loop
1907 and adjust the maximum vectorization factor according to
1909 FORNOW: fail at the first data dependence that we encounter. */
1911 ok
= vect_analyze_data_ref_dependences (loop_vinfo
, &max_vf
);
1913 || (max_vf
!= MAX_VECTORIZATION_FACTOR
1914 && maybe_lt (max_vf
, min_vf
)))
1916 if (dump_enabled_p ())
1917 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1918 "bad data dependence.\n");
1921 LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo
) = max_vf
;
1923 ok
= vect_determine_vectorization_factor (loop_vinfo
);
1926 if (dump_enabled_p ())
1927 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1928 "can't determine vectorization factor.\n");
1931 if (max_vf
!= MAX_VECTORIZATION_FACTOR
1932 && maybe_lt (max_vf
, LOOP_VINFO_VECT_FACTOR (loop_vinfo
)))
1934 if (dump_enabled_p ())
1935 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1936 "bad data dependence.\n");
1940 /* Compute the scalar iteration cost. */
1941 vect_compute_single_scalar_iteration_cost (loop_vinfo
);
1943 poly_uint64 saved_vectorization_factor
= LOOP_VINFO_VECT_FACTOR (loop_vinfo
);
1946 /* Check the SLP opportunities in the loop, analyze and build SLP trees. */
1947 ok
= vect_analyze_slp (loop_vinfo
, n_stmts
);
1951 /* If there are any SLP instances mark them as pure_slp. */
1952 bool slp
= vect_make_slp_decision (loop_vinfo
);
1955 /* Find stmts that need to be both vectorized and SLPed. */
1956 vect_detect_hybrid_slp (loop_vinfo
);
1958 /* Update the vectorization factor based on the SLP decision. */
1959 vect_update_vf_for_slp (loop_vinfo
);
1962 bool saved_can_fully_mask_p
= LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo
);
1964 /* We don't expect to have to roll back to anything other than an empty
1966 gcc_assert (LOOP_VINFO_MASKS (loop_vinfo
).is_empty ());
1968 /* This is the point where we can re-start analysis with SLP forced off. */
1971 /* Now the vectorization factor is final. */
1972 poly_uint64 vectorization_factor
= LOOP_VINFO_VECT_FACTOR (loop_vinfo
);
1973 gcc_assert (known_ne (vectorization_factor
, 0U));
1975 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo
) && dump_enabled_p ())
1977 dump_printf_loc (MSG_NOTE
, vect_location
,
1978 "vectorization_factor = ");
1979 dump_dec (MSG_NOTE
, vectorization_factor
);
1980 dump_printf (MSG_NOTE
, ", niters = " HOST_WIDE_INT_PRINT_DEC
"\n",
1981 LOOP_VINFO_INT_NITERS (loop_vinfo
));
1984 HOST_WIDE_INT max_niter
1985 = likely_max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo
));
1987 /* Analyze the alignment of the data-refs in the loop.
1988 Fail if a data reference is found that cannot be vectorized. */
1990 ok
= vect_analyze_data_refs_alignment (loop_vinfo
);
1993 if (dump_enabled_p ())
1994 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1995 "bad data alignment.\n");
1999 /* Prune the list of ddrs to be tested at run-time by versioning for alias.
2000 It is important to call pruning after vect_analyze_data_ref_accesses,
2001 since we use grouping information gathered by interleaving analysis. */
2002 ok
= vect_prune_runtime_alias_test_list (loop_vinfo
);
  /* Do not invoke vect_enhance_data_refs_alignment for epilogue
     vectorization.  */
2008 if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo
))
2010 /* This pass will decide on using loop versioning and/or loop peeling in
2011 order to enhance the alignment of data references in the loop. */
2012 ok
= vect_enhance_data_refs_alignment (loop_vinfo
);
2015 if (dump_enabled_p ())
2016 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2017 "bad data alignment.\n");
2024 /* Analyze operations in the SLP instances. Note this may
2025 remove unsupported SLP instances which makes the above
2026 SLP kind detection invalid. */
2027 unsigned old_size
= LOOP_VINFO_SLP_INSTANCES (loop_vinfo
).length ();
2028 vect_slp_analyze_operations (loop_vinfo
);
2029 if (LOOP_VINFO_SLP_INSTANCES (loop_vinfo
).length () != old_size
)
2033 /* Scan all the remaining operations in the loop that are not subject
2034 to SLP and make sure they are vectorizable. */
2035 ok
= vect_analyze_loop_operations (loop_vinfo
);
2038 if (dump_enabled_p ())
2039 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2040 "bad operation or unsupported loop bound.\n");
2044 /* Decide whether to use a fully-masked loop for this vectorization
2046 LOOP_VINFO_FULLY_MASKED_P (loop_vinfo
)
2047 = (LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo
)
2048 && vect_verify_full_masking (loop_vinfo
));
2049 if (dump_enabled_p ())
2051 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo
))
2052 dump_printf_loc (MSG_NOTE
, vect_location
,
2053 "using a fully-masked loop.\n");
2055 dump_printf_loc (MSG_NOTE
, vect_location
,
2056 "not using a fully-masked loop.\n");
2059 /* If epilog loop is required because of data accesses with gaps,
2060 one additional iteration needs to be peeled. Check if there is
2061 enough iterations for vectorization. */
2062 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo
)
2063 && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo
)
2064 && !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo
))
2066 poly_uint64 vf
= LOOP_VINFO_VECT_FACTOR (loop_vinfo
);
2067 tree scalar_niters
= LOOP_VINFO_NITERSM1 (loop_vinfo
);
2069 if (known_lt (wi::to_widest (scalar_niters
), vf
))
2071 if (dump_enabled_p ())
2072 dump_printf_loc (MSG_NOTE
, vect_location
,
2073 "loop has no enough iterations to support"
2074 " peeling for gaps.\n");
2079 /* Check the costings of the loop make vectorizing worthwhile. */
2080 res
= vect_analyze_loop_costing (loop_vinfo
);
2085 if (dump_enabled_p ())
2086 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2087 "Loop costings not worthwhile.\n");
  /* Decide whether we need to create an epilogue loop to handle
     remaining scalar iterations.  */
  th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);

  unsigned HOST_WIDE_INT const_vf;
  if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
    /* The main loop handles all iterations.  */
    LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
  else if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
	   && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) > 0)
    {
      if (!multiple_p (LOOP_VINFO_INT_NITERS (loop_vinfo)
		       - LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo),
		       LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
	LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
    }
  else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
	   || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&const_vf)
	   || ((tree_ctz (LOOP_VINFO_NITERS (loop_vinfo))
		< (unsigned) exact_log2 (const_vf))
	       /* In case of versioning, check if the maximum number of
		  iterations is greater than th.  If they are identical,
		  the epilogue is unnecessary.  */
	       && (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
		   || ((unsigned HOST_WIDE_INT) max_niter
		       > (th / const_vf) * const_vf))))
    LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;

  /* If an epilogue loop is required make sure we can create one.  */
  if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
      || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo))
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location, "epilog loop required\n");
      if (!vect_can_advance_ivs_p (loop_vinfo)
	  || !slpeel_can_duplicate_loop_p (LOOP_VINFO_LOOP (loop_vinfo),
					   single_exit (LOOP_VINFO_LOOP
							(loop_vinfo))))
	if (dump_enabled_p ())
	  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			   "not vectorized: can't create required " /* ... */);
    }

  /* During peeling, we need to check if number of loop iterations is
     enough for both peeled prolog loop and vector loop.  This check
     can be merged along with threshold check of loop versioning, so
     increase threshold for this case if necessary.  */
  if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
    {
      poly_uint64 niters_th = 0;

      if (!vect_use_loop_mask_for_alignment_p (loop_vinfo))
	{
	  /* Niters for peeled prolog loop.  */
	  if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
	    {
	      struct data_reference *dr = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
	      tree vectype
		= STMT_VINFO_VECTYPE (vinfo_for_stmt (DR_STMT (dr)));
	      niters_th += TYPE_VECTOR_SUBPARTS (vectype) - 1;
	    }
	  else
	    niters_th += LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
	}

      /* Niters for at least one iteration of vectorized loop.  */
      if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
	niters_th += LOOP_VINFO_VECT_FACTOR (loop_vinfo);
      /* One additional iteration because of peeling for gap.  */
      if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
	/* ... */
      LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = niters_th;
    }

  gcc_assert (known_eq (vectorization_factor,
			LOOP_VINFO_VECT_FACTOR (loop_vinfo)));
  /* Ok to vectorize!  */

  /* Try again with SLP forced off but if we didn't do any SLP there is
     no point in re-trying.  */

  /* If there are reduction chains re-trying will fail anyway.  */
  if (! LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).is_empty ())
    /* ... */

  /* Likewise if the grouped loads or stores in the SLP cannot be handled
     via interleaving or lane instructions.  */
  slp_instance instance;
  FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
    {
      stmt_vec_info vinfo;
      vinfo = vinfo_for_stmt
	  (SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (instance))[0]);
      if (! STMT_VINFO_GROUPED_ACCESS (vinfo))
	continue;
      vinfo = vinfo_for_stmt (STMT_VINFO_GROUP_FIRST_ELEMENT (vinfo));
      unsigned int size = STMT_VINFO_GROUP_SIZE (vinfo);
      tree vectype = STMT_VINFO_VECTYPE (vinfo);
      if (! vect_store_lanes_supported (vectype, size, false)
	  && ! known_eq (TYPE_VECTOR_SUBPARTS (vectype), 1U)
	  && ! vect_grouped_store_supported (vectype, size))
	/* ... */
      FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), j, node)
	{
	  vinfo = vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (node)[0]);
	  vinfo = vinfo_for_stmt (STMT_VINFO_GROUP_FIRST_ELEMENT (vinfo));
	  bool single_element_p = !STMT_VINFO_GROUP_NEXT_ELEMENT (vinfo);
	  size = STMT_VINFO_GROUP_SIZE (vinfo);
	  vectype = STMT_VINFO_VECTYPE (vinfo);
	  if (! vect_load_lanes_supported (vectype, size, false)
	      && ! vect_grouped_load_supported (vectype, single_element_p,
						size))
	    /* ... */
	}
    }

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
		     "re-trying with SLP disabled\n");

  /* Roll back state appropriately.  No SLP this time.  */

  /* Restore vectorization factor as it was without SLP.  */
  LOOP_VINFO_VECT_FACTOR (loop_vinfo) = saved_vectorization_factor;
  /* Free the SLP instances.  */
  FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), j, instance)
    vect_free_slp_instance (instance);
  LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
  /* Reset SLP type to loop_vect on all stmts.  */
  for (i = 0; i < LOOP_VINFO_LOOP (loop_vinfo)->num_nodes; ++i)
    {
      basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
      for (gimple_stmt_iterator si = gsi_start_phis (bb);
	   !gsi_end_p (si); gsi_next (&si))
	{
	  stmt_vec_info stmt_info = vinfo_for_stmt (gsi_stmt (si));
	  STMT_SLP_TYPE (stmt_info) = loop_vect;
	}
      for (gimple_stmt_iterator si = gsi_start_bb (bb);
	   !gsi_end_p (si); gsi_next (&si))
	{
	  stmt_vec_info stmt_info = vinfo_for_stmt (gsi_stmt (si));
	  STMT_SLP_TYPE (stmt_info) = loop_vect;
	  if (STMT_VINFO_IN_PATTERN_P (stmt_info))
	    {
	      stmt_info = vinfo_for_stmt (STMT_VINFO_RELATED_STMT (stmt_info));
	      STMT_SLP_TYPE (stmt_info) = loop_vect;
	      for (gimple_stmt_iterator pi
		     = gsi_start (STMT_VINFO_PATTERN_DEF_SEQ (stmt_info));
		   !gsi_end_p (pi); gsi_next (&pi))
		{
		  gimple *pstmt = gsi_stmt (pi);
		  STMT_SLP_TYPE (vinfo_for_stmt (pstmt)) = loop_vect;
		}
	    }
	}
    }
  /* Free optimized alias test DDRS.  */
  LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).truncate (0);
  LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).release ();
  LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).release ();
  /* Reset target cost data.  */
  destroy_cost_data (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo));
  LOOP_VINFO_TARGET_COST_DATA (loop_vinfo)
    = init_cost (LOOP_VINFO_LOOP (loop_vinfo));
  /* Reset accumulated rgroup information.  */
  release_vec_loop_masks (&LOOP_VINFO_MASKS (loop_vinfo));
  /* Reset assorted flags.  */
  LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
  LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = false;
  LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = 0;
  LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = 0;
  LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = saved_can_fully_mask_p;
/* Function vect_analyze_loop.

   Apply a set of analyses on LOOP, and create a loop_vec_info struct
   for it.  The different analyses will record information in the
   loop_vec_info struct.  If ORIG_LOOP_VINFO is not NULL epilogue must
   ...  */

loop_vec_info
vect_analyze_loop (struct loop *loop, loop_vec_info orig_loop_vinfo)
{
  loop_vec_info loop_vinfo;
  auto_vector_sizes vector_sizes;

  /* Autodetect first vector size we try.  */
  current_vector_size = 0;
  targetm.vectorize.autovectorize_vector_sizes (&vector_sizes);
  unsigned int next_size = 0;

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
		     "===== analyze_loop_nest =====\n");

  if (loop_outer (loop)
      && loop_vec_info_for_loop (loop_outer (loop))
      && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop))))
    if (dump_enabled_p ())
      dump_printf_loc (MSG_NOTE, vect_location,
		       "outer-loop already vectorized.\n");

  poly_uint64 autodetected_vector_size = 0;

  /* Check the CFG characteristics of the loop (nesting, entry/exit).  */
  loop_vinfo = vect_analyze_loop_form (loop);
  if (dump_enabled_p ())
    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
		     "bad loop form.\n");

  if (orig_loop_vinfo)
    LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = orig_loop_vinfo;

  if (vect_analyze_loop_2 (loop_vinfo, fatal))
    LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1;

  autodetected_vector_size = current_vector_size;

  if (next_size < vector_sizes.length ()
      && known_eq (vector_sizes[next_size], autodetected_vector_size))
    /* ... */

  /* ... */
      || next_size == vector_sizes.length ()
      || known_eq (current_vector_size, 0U))
    /* ... */

  /* Try the next biggest vector size.  */
  current_vector_size = vector_sizes[next_size++];
  if (dump_enabled_p ())
    {
      dump_printf_loc (MSG_NOTE, vect_location,
		       "***** Re-trying analysis with " /* ... */);
      dump_dec (MSG_NOTE, current_vector_size);
      dump_printf (MSG_NOTE, "\n");
    }
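
/* As an illustration of the retry behaviour above (assumed target values,
   not taken from the original sources): on a target whose
   autovectorize_vector_sizes hook reports, say, sizes of 64, 32 and 16
   bytes, the first analysis runs with current_vector_size == 0
   (autodetect).  If that analysis fails, the entry equal to the
   autodetected size is skipped and the whole analysis is re-run with the
   next size from the list, until one size succeeds or the list is
   exhausted.  */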

/* Return true if there is an in-order reduction function for CODE, storing
   it in *REDUC_FN if so.  */

fold_left_reduction_fn (tree_code code, internal_fn *reduc_fn)
{
  /* ... */
      *reduc_fn = IFN_FOLD_LEFT_PLUS;
  /* ... */
}

/* Function reduction_fn_for_scalar_code

   Input:
   CODE - tree_code of a reduction operation.

   Output:
   REDUC_FN - the corresponding internal function to be used to reduce the
   vector of partial results into a single scalar result, or IFN_LAST
   if the operation is a supported reduction operation, but does not have
   such an internal function.

   Return FALSE if CODE currently cannot be vectorized as a reduction.  */

reduction_fn_for_scalar_code (enum tree_code code, internal_fn *reduc_fn)
{
  /* ... */
      *reduc_fn = IFN_REDUC_MAX;
  /* ... */
      *reduc_fn = IFN_REDUC_MIN;
  /* ... */
      *reduc_fn = IFN_REDUC_PLUS;
  /* ... */
      *reduc_fn = IFN_REDUC_AND;
  /* ... */
      *reduc_fn = IFN_REDUC_IOR;
  /* ... */
      *reduc_fn = IFN_REDUC_XOR;
  /* ... */
      *reduc_fn = IFN_LAST;
  /* ... */
}

/* If there is a neutral value X such that SLP reduction NODE would not
   be affected by the introduction of additional X elements, return that X,
   otherwise return null.  CODE is the code of the reduction.  REDUC_CHAIN
   is true if the SLP statements perform a single reduction, false if each
   statement performs an independent reduction.  */

neutral_op_for_slp_reduction (slp_tree slp_node, tree_code code,
			      bool reduc_chain)
{
  vec<gimple *> stmts = SLP_TREE_SCALAR_STMTS (slp_node);
  gimple *stmt = stmts[0];
  stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
  tree vector_type = STMT_VINFO_VECTYPE (stmt_vinfo);
  tree scalar_type = TREE_TYPE (vector_type);
  struct loop *loop = gimple_bb (stmt)->loop_father;

  switch (code)
    {
    case WIDEN_SUM_EXPR:
    /* ... */
      return build_zero_cst (scalar_type);

    /* ... */
      return build_one_cst (scalar_type);

    /* ... */
      return build_all_ones_cst (scalar_type);

    /* For MIN/MAX the initial values are neutral.  A reduction chain
       has only a single initial value, so that value is neutral for
       ...  */
      return PHI_ARG_DEF_FROM_EDGE (stmt, loop_preheader_edge (loop));

    /* ... */
    }
}

/* Error reporting helper for vect_is_simple_reduction below.  GIMPLE
   statement STMT is printed with a message MSG.  */

report_vect_op (dump_flags_t msg_type, gimple *stmt, const char *msg)
{
  dump_printf_loc (msg_type, vect_location, "%s", msg);
  dump_gimple_stmt (msg_type, TDF_SLIM, stmt, 0);
}

/* Detect SLP reduction of the form:
   ...

   PHI is the reduction phi node (#a1 = phi <a5, a0> above)
   FIRST_STMT is the first reduction stmt in the chain
   (a2 = operation (a1)).

   Return TRUE if a reduction chain was detected.  */
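
/* For instance, a chain matching the shape described above could look like
   this (illustrative sketch, not taken verbatim from the original comment):

     loop:
       # a1 = phi <a5, a0>
       a2 = operation (a1)
       a3 = operation (a2)
       a4 = operation (a3)
       a5 = operation (a4)

   Here PHI is "# a1 = phi <a5, a0>" and FIRST_STMT is "a2 = operation (a1)";
   each intermediate value is used exactly once by the next statement in the
   chain and a5 feeds back into the phi on the latch edge.  */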

vect_is_slp_reduction (loop_vec_info loop_info, gimple *phi,
		       gimple *first_stmt)
{
  struct loop *loop = (gimple_bb (phi))->loop_father;
  struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
  enum tree_code code;
  gimple *current_stmt = NULL, *loop_use_stmt = NULL, *first, *next_stmt;
  stmt_vec_info use_stmt_info, current_stmt_info;
  imm_use_iterator imm_iter;
  use_operand_p use_p;
  int nloop_uses, size = 0, n_out_of_loop_uses;

  if (loop != vect_loop)
    /* ... */

  lhs = PHI_RESULT (phi);
  code = gimple_assign_rhs_code (first_stmt);

  /* ... */
      n_out_of_loop_uses = 0;
      FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
	{
	  gimple *use_stmt = USE_STMT (use_p);
	  if (is_gimple_debug (use_stmt))
	    continue;

	  /* Check if we got back to the reduction phi.  */
	  if (use_stmt == phi)
	    {
	      loop_use_stmt = use_stmt;
	      /* ... */
	    }

	  if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
	    {
	      loop_use_stmt = use_stmt;
	      /* ... */
	    }
	  else
	    n_out_of_loop_uses++;

	  /* There can be either a single use in the loop or two uses in
	     ...  */
	  if (nloop_uses > 1 || (n_out_of_loop_uses && nloop_uses))
	    /* ... */
	}

      /* We reached a statement with no loop uses.  */
      if (nloop_uses == 0)
	/* ... */

      /* This is a loop exit phi, and we haven't reached the reduction phi.  */
      if (gimple_code (loop_use_stmt) == GIMPLE_PHI)
	/* ... */

      if (!is_gimple_assign (loop_use_stmt)
	  || code != gimple_assign_rhs_code (loop_use_stmt)
	  || !flow_bb_inside_loop_p (loop, gimple_bb (loop_use_stmt)))
	/* ... */

      /* Insert USE_STMT into reduction chain.  */
      use_stmt_info = vinfo_for_stmt (loop_use_stmt);
      /* ... */
	  current_stmt_info = vinfo_for_stmt (current_stmt);
	  GROUP_NEXT_ELEMENT (current_stmt_info) = loop_use_stmt;
	  GROUP_FIRST_ELEMENT (use_stmt_info)
	    = GROUP_FIRST_ELEMENT (current_stmt_info);
      /* ... */
	GROUP_FIRST_ELEMENT (use_stmt_info) = loop_use_stmt;

      lhs = gimple_assign_lhs (loop_use_stmt);
      current_stmt = loop_use_stmt;
  /* ... */

  if (!found || loop_use_stmt != phi || size < 2)
    /* ... */

  /* Swap the operands, if needed, to make the reduction operand be the second
     operand.  */
  lhs = PHI_RESULT (phi);
  next_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (current_stmt));
  /* ... */
      if (gimple_assign_rhs2 (next_stmt) == lhs)
	{
	  tree op = gimple_assign_rhs1 (next_stmt);
	  gimple *def_stmt = NULL;

	  if (TREE_CODE (op) == SSA_NAME)
	    def_stmt = SSA_NAME_DEF_STMT (op);

	  /* Check that the other def is either defined in the loop
	     ("vect_internal_def"), or it's an induction (defined by a
	     loop-header phi-node).  */
	  if (def_stmt
	      && gimple_bb (def_stmt)
	      && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
	      && (is_gimple_assign (def_stmt)
		  || is_gimple_call (def_stmt)
		  || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
		     == vect_induction_def
		  || (gimple_code (def_stmt) == GIMPLE_PHI
		      && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
			 == vect_internal_def
		      && !is_loop_header_bb_p (gimple_bb (def_stmt)))))
	    {
	      lhs = gimple_assign_lhs (next_stmt);
	      next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
	      /* ... */
	    }
	}
      /* ... */
	  tree op = gimple_assign_rhs2 (next_stmt);
	  gimple *def_stmt = NULL;

	  if (TREE_CODE (op) == SSA_NAME)
	    def_stmt = SSA_NAME_DEF_STMT (op);

	  /* Check that the other def is either defined in the loop
	     ("vect_internal_def"), or it's an induction (defined by a
	     loop-header phi-node).  */
	  if (def_stmt
	      && gimple_bb (def_stmt)
	      && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
	      && (is_gimple_assign (def_stmt)
		  || is_gimple_call (def_stmt)
		  || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
		     == vect_induction_def
		  || (gimple_code (def_stmt) == GIMPLE_PHI
		      && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
			 == vect_internal_def
		      && !is_loop_header_bb_p (gimple_bb (def_stmt)))))
	    {
	      if (dump_enabled_p ())
		{
		  dump_printf_loc (MSG_NOTE, vect_location,
				   "swapping oprnds: ");
		  dump_gimple_stmt (MSG_NOTE, TDF_SLIM, next_stmt, 0);
		}

	      swap_ssa_operands (next_stmt,
				 gimple_assign_rhs1_ptr (next_stmt),
				 gimple_assign_rhs2_ptr (next_stmt));
	      update_stmt (next_stmt);

	      if (CONSTANT_CLASS_P (gimple_assign_rhs1 (next_stmt)))
		LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true;
	    }
	  /* ... */

      lhs = gimple_assign_lhs (next_stmt);
      next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
  /* ... */

  /* Save the chain for further analysis in SLP detection.  */
  first = GROUP_FIRST_ELEMENT (vinfo_for_stmt (current_stmt));
  LOOP_VINFO_REDUCTION_CHAINS (loop_info).safe_push (first);
  GROUP_SIZE (vinfo_for_stmt (first)) = size;

/* Return true if we need an in-order reduction for operation CODE
   on type TYPE.  NEED_WRAPPING_INTEGRAL_OVERFLOW is true if integer
   overflow must wrap.  */

needs_fold_left_reduction_p (tree type, tree_code code,
			     bool need_wrapping_integral_overflow)
{
  /* CHECKME: check for !flag_finite_math_only too?  */
  if (SCALAR_FLOAT_TYPE_P (type))
    /* ... */
	return !flag_associative_math;

  if (INTEGRAL_TYPE_P (type))
    {
      if (!operation_no_trapping_overflow (type, code))
	/* ... */
      if (need_wrapping_integral_overflow
	  && !TYPE_OVERFLOW_WRAPS (type)
	  && operation_can_overflow (code))
	/* ... */
    }

  if (SAT_FIXED_POINT_TYPE_P (type))
    /* ... */
}
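
/* As a concrete illustration of the floating-point case above (an assumed
   example, not part of the original sources): with IEEE double,
   (1e16 + 1.0) + 1.0 evaluates to 1e16 whereas 1e16 + (1.0 + 1.0) evaluates
   to 10000000000000002.0, so reassociating a float add reduction can change
   the final value.  Unless -fassociative-math permits that, such reductions
   must therefore be done in order, i.e. as a fold-left reduction.  */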

/* Return true if the reduction PHI in LOOP with latch arg LOOP_ARG and
   reduction operation CODE has a handled computation expression.  */

check_reduction_path (location_t loc, loop_p loop, gphi *phi, tree loop_arg,
		      enum tree_code code)
{
  auto_vec<std::pair<ssa_op_iter, use_operand_p> > path;
  auto_bitmap visited;
  tree lookfor = PHI_RESULT (phi);

  use_operand_p curr = op_iter_init_phiuse (&curri, phi, SSA_OP_USE);
  while (USE_FROM_PTR (curr) != loop_arg)
    curr = op_iter_next_use (&curri);
  curri.i = curri.numops;

  /* ... */
      path.safe_push (std::make_pair (curri, curr));
      tree use = USE_FROM_PTR (curr);
      /* ... */
      gimple *def = SSA_NAME_DEF_STMT (use);
      if (gimple_nop_p (def)
	  || ! flow_bb_inside_loop_p (loop, gimple_bb (def)))
	{
	  /* ... */
	      std::pair<ssa_op_iter, use_operand_p> x = path.pop ();
	      /* ... */
	      curr = op_iter_next_use (&curri);
	      /* Skip already visited or non-SSA operands (from iterating
		 ...).  */
	      while (curr != NULL_USE_OPERAND_P
		     && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
			 || ! bitmap_set_bit (visited,
					      SSA_NAME_VERSION
						(USE_FROM_PTR (curr)))));
	  while (curr == NULL_USE_OPERAND_P && ! path.is_empty ());
	  if (curr == NULL_USE_OPERAND_P)
	    /* ... */
	}
      /* ... */
	  if (gimple_code (def) == GIMPLE_PHI)
	    curr = op_iter_init_phiuse (&curri, as_a <gphi *>(def), SSA_OP_USE);
	  else
	    curr = op_iter_init_use (&curri, def, SSA_OP_USE);
	  while (curr != NULL_USE_OPERAND_P
		 && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
		     || ! bitmap_set_bit (visited,
					  SSA_NAME_VERSION
					    (USE_FROM_PTR (curr)))))
	    curr = op_iter_next_use (&curri);
	  if (curr == NULL_USE_OPERAND_P)
	    /* ... */

  if (dump_file && (dump_flags & TDF_DETAILS))
    {
      dump_printf_loc (MSG_NOTE, loc, "reduction path: ");
      std::pair<ssa_op_iter, use_operand_p> *x;
      FOR_EACH_VEC_ELT (path, i, x)
	{
	  dump_generic_expr (MSG_NOTE, TDF_SLIM, USE_FROM_PTR (x->second));
	  dump_printf (MSG_NOTE, " ");
	}
      dump_printf (MSG_NOTE, "\n");
    }

  /* Check whether the reduction path detected is valid.  */
  bool fail = path.length () == 0;
  for (unsigned i = 1; i < path.length (); ++i)
    {
      gimple *use_stmt = USE_STMT (path[i].second);
      tree op = USE_FROM_PTR (path[i].second);
      if (! has_single_use (op)
	  || ! is_gimple_assign (use_stmt))
	/* ... */
      if (gimple_assign_rhs_code (use_stmt) != code)
	{
	  if (code == PLUS_EXPR
	      && gimple_assign_rhs_code (use_stmt) == MINUS_EXPR)
	    {
	      /* Track whether we negate the reduction value each iteration.  */
	      if (gimple_assign_rhs2 (use_stmt) == op)
		/* ... */
	    }
	  /* ... */
	}
    }
  return ! fail && ! neg;
}

/* Function vect_is_simple_reduction

   (1) Detect a cross-iteration def-use cycle that represents a simple
   reduction computation.  We look for the following pattern:

     ...
       a2 = operation (a3, a1)
     ...
       a2 = operation (a3, a1)

   1. operation is commutative and associative and it is safe to
      change the order of the computation
   2. no uses for a2 in the loop (a2 is used out of the loop)
   3. no uses of a1 in the loop besides the reduction operation
   4. no uses of a1 outside the loop.

   Conditions 1,4 are tested here.
   Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.

   (2) Detect a cross-iteration def-use cycle in nested loops, i.e.,
   ...

   (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double
   ...
       inner loop (def of a3)
   ...

   (4) Detect condition expressions, i.e.:
     for (int i = 0; i < N; i++)
       ...  */

vect_is_simple_reduction (loop_vec_info loop_info, gimple *phi,
			  bool *double_reduc,
			  bool need_wrapping_integral_overflow,
			  enum vect_reduction_type *v_reduc_type)
{
  struct loop *loop = (gimple_bb (phi))->loop_father;
  struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
  gimple *def_stmt, *def1 = NULL, *def2 = NULL, *phi_use_stmt = NULL;
  enum tree_code orig_code, code;
  tree op1, op2, op3 = NULL_TREE, op4 = NULL_TREE;
  imm_use_iterator imm_iter;
  use_operand_p use_p;

  *double_reduc = false;
  *v_reduc_type = TREE_CODE_REDUCTION;

  tree phi_name = PHI_RESULT (phi);
  /* ???  If there are no uses of the PHI result the inner loop reduction
     won't be detected as possibly double-reduction by vectorizable_reduction
     because that tries to walk the PHI arg from the preheader edge which
     can be constant.  See PR60382.  */
  if (has_zero_uses (phi_name))
    /* ... */

  FOR_EACH_IMM_USE_FAST (use_p, imm_iter, phi_name)
    {
      gimple *use_stmt = USE_STMT (use_p);
      if (is_gimple_debug (use_stmt))
	continue;

      if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
	if (dump_enabled_p ())
	  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			   "intermediate value used outside loop.\n");

      /* ... */
	if (dump_enabled_p ())
	  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			   "reduction value used in loop.\n");

      phi_use_stmt = use_stmt;
    }

  edge latch_e = loop_latch_edge (loop);
  tree loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
  if (TREE_CODE (loop_arg) != SSA_NAME)
    {
      if (dump_enabled_p ())
	{
	  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			   "reduction: not ssa_name: ");
	  dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, loop_arg);
	  dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
	}
      /* ... */
    }

  def_stmt = SSA_NAME_DEF_STMT (loop_arg);
  if (is_gimple_assign (def_stmt))
    name = gimple_assign_lhs (def_stmt);
  else if (gimple_code (def_stmt) == GIMPLE_PHI)
    name = PHI_RESULT (def_stmt);
  else
    {
      if (dump_enabled_p ())
	{
	  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			   "reduction: unhandled reduction operation: ");
	  dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, def_stmt, 0);
	}
      /* ... */
    }

  if (! flow_bb_inside_loop_p (loop, gimple_bb (def_stmt)))
    /* ... */

  auto_vec<gphi *, 3> lcphis;
  FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name)
    {
      gimple *use_stmt = USE_STMT (use_p);
      if (is_gimple_debug (use_stmt))
	continue;
      if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
	/* ... */
      else
	/* We can have more than one loop-closed PHI.  */
	lcphis.safe_push (as_a <gphi *> (use_stmt));

      /* ... */
	if (dump_enabled_p ())
	  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			   "reduction used in loop.\n");
    }

  /* If DEF_STMT is a phi node itself, we expect it to have a single argument
     defined in the inner loop.  */
  /* ... */
      op1 = PHI_ARG_DEF (def_stmt, 0);

      if (gimple_phi_num_args (def_stmt) != 1
	  || TREE_CODE (op1) != SSA_NAME)
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			     "unsupported phi node definition.\n");
	  /* ... */
	}

      def1 = SSA_NAME_DEF_STMT (op1);
      if (gimple_bb (def1)
	  && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
	  && flow_bb_inside_loop_p (loop->inner, gimple_bb (def1))
	  && is_gimple_assign (def1)
	  && flow_bb_inside_loop_p (loop->inner, gimple_bb (phi_use_stmt)))
	{
	  if (dump_enabled_p ())
	    report_vect_op (MSG_NOTE, def_stmt,
			    "detected double reduction: ");

	  *double_reduc = true;
	  /* ... */
	}

  /* If we are vectorizing an inner reduction we are executing that
     in the original order only in case we are not dealing with a
     double reduction.  */
  bool check_reduction = true;
  if (flow_loop_nested_p (vect_loop, loop))
    {
      /* ... */
      check_reduction = false;
      FOR_EACH_VEC_ELT (lcphis, i, lcphi)
	FOR_EACH_IMM_USE_FAST (use_p, imm_iter, gimple_phi_result (lcphi))
	  {
	    gimple *use_stmt = USE_STMT (use_p);
	    if (is_gimple_debug (use_stmt))
	      continue;
	    if (! flow_bb_inside_loop_p (vect_loop, gimple_bb (use_stmt)))
	      check_reduction = true;
	  }
    }

  bool nested_in_vect_loop = flow_loop_nested_p (vect_loop, loop);
  code = orig_code = gimple_assign_rhs_code (def_stmt);

  /* We can handle "res -= x[i]", which is non-associative by
     simply rewriting this into "res += -x[i]".  Avoid changing
     gimple instruction for the first simple tests and only do this
     if we're allowed to change code at all.  */
  if (code == MINUS_EXPR && gimple_assign_rhs2 (def_stmt) != phi_name)
    /* ... */

  if (code == COND_EXPR)
    {
      if (! nested_in_vect_loop)
	*v_reduc_type = COND_REDUCTION;

      op3 = gimple_assign_rhs1 (def_stmt);
      if (COMPARISON_CLASS_P (op3))
	{
	  op4 = TREE_OPERAND (op3, 1);
	  op3 = TREE_OPERAND (op3, 0);
	}
      if (op3 == phi_name || op4 == phi_name)
	{
	  if (dump_enabled_p ())
	    report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
			    "reduction: condition depends on previous"
			    /* ... */);
	  /* ... */
	}

      op1 = gimple_assign_rhs2 (def_stmt);
      op2 = gimple_assign_rhs3 (def_stmt);
    }
  else if (!commutative_tree_code (code) || !associative_tree_code (code))
    {
      if (dump_enabled_p ())
	report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
			"reduction: not commutative/associative: ");
      /* ... */
    }
  else if (get_gimple_rhs_class (code) == GIMPLE_BINARY_RHS)
    {
      op1 = gimple_assign_rhs1 (def_stmt);
      op2 = gimple_assign_rhs2 (def_stmt);
    }
  else
    {
      if (dump_enabled_p ())
	report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
			"reduction: not handled operation: ");
      /* ... */
    }

  if (TREE_CODE (op1) != SSA_NAME && TREE_CODE (op2) != SSA_NAME)
    {
      if (dump_enabled_p ())
	report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
			"reduction: both uses not ssa_names: ");
      /* ... */
    }

  type = TREE_TYPE (gimple_assign_lhs (def_stmt));
  if ((TREE_CODE (op1) == SSA_NAME
       && !types_compatible_p (type, TREE_TYPE (op1)))
      || (TREE_CODE (op2) == SSA_NAME
	  && !types_compatible_p (type, TREE_TYPE (op2)))
      || (op3 && TREE_CODE (op3) == SSA_NAME
	  && !types_compatible_p (type, TREE_TYPE (op3)))
      || (op4 && TREE_CODE (op4) == SSA_NAME
	  && !types_compatible_p (type, TREE_TYPE (op4))))
    {
      if (dump_enabled_p ())
	{
	  dump_printf_loc (MSG_NOTE, vect_location,
			   "reduction: multiple types: operation type: ");
	  dump_generic_expr (MSG_NOTE, TDF_SLIM, type);
	  dump_printf (MSG_NOTE, ", operands types: ");
	  dump_generic_expr (MSG_NOTE, TDF_SLIM, /* ... */);
	  dump_printf (MSG_NOTE, ",");
	  dump_generic_expr (MSG_NOTE, TDF_SLIM, /* ... */);
	  dump_printf (MSG_NOTE, ",");
	  dump_generic_expr (MSG_NOTE, TDF_SLIM, /* ... */);
	  dump_printf (MSG_NOTE, ",");
	  dump_generic_expr (MSG_NOTE, TDF_SLIM, /* ... */);
	  dump_printf (MSG_NOTE, "\n");
	}
      /* ... */
    }

  /* Check whether it's ok to change the order of the computation.
     Generally, when vectorizing a reduction we change the order of the
     computation.  This may change the behavior of the program in some
     cases, so we need to check that this is ok.  One exception is when
     vectorizing an outer-loop: the inner-loop is executed sequentially,
     and therefore vectorizing reductions in the inner-loop during
     outer-loop vectorization is safe.  */
  if (check_reduction
      && *v_reduc_type == TREE_CODE_REDUCTION
      && needs_fold_left_reduction_p (type, code,
				      need_wrapping_integral_overflow))
    *v_reduc_type = FOLD_LEFT_REDUCTION;

  /* Reduction is safe.  We're dealing with one of the following:
     1) integer arithmetic and no trapv
     2) floating point arithmetic, and special flags permit this optimization
     3) nested cycle (i.e., outer loop vectorization).  */
  if (TREE_CODE (op1) == SSA_NAME)
    def1 = SSA_NAME_DEF_STMT (op1);

  if (TREE_CODE (op2) == SSA_NAME)
    def2 = SSA_NAME_DEF_STMT (op2);

  if (code != COND_EXPR
      && ((!def1 || gimple_nop_p (def1)) && (!def2 || gimple_nop_p (def2))))
    {
      if (dump_enabled_p ())
	report_vect_op (MSG_NOTE, def_stmt, "reduction: no defs for operands: ");
      /* ... */
    }

  /* Check that one def is the reduction def, defined by PHI,
     the other def is either defined in the loop ("vect_internal_def"),
     or it's an induction (defined by a loop-header phi-node).  */

  if (def2 && def2 == phi
      && (code == COND_EXPR
	  || !def1 || gimple_nop_p (def1)
	  || !flow_bb_inside_loop_p (loop, gimple_bb (def1))
	  || (def1 && flow_bb_inside_loop_p (loop, gimple_bb (def1))
	      && (is_gimple_assign (def1)
		  || is_gimple_call (def1)
		  || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1))
		     == vect_induction_def
		  || (gimple_code (def1) == GIMPLE_PHI
		      && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1))
			 == vect_internal_def
		      && !is_loop_header_bb_p (gimple_bb (def1)))))))
    {
      if (dump_enabled_p ())
	report_vect_op (MSG_NOTE, def_stmt, "detected reduction: ");
      /* ... */
    }

  if (def1 && def1 == phi
      && (code == COND_EXPR
	  || !def2 || gimple_nop_p (def2)
	  || !flow_bb_inside_loop_p (loop, gimple_bb (def2))
	  || (def2 && flow_bb_inside_loop_p (loop, gimple_bb (def2))
	      && (is_gimple_assign (def2)
		  || is_gimple_call (def2)
		  || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2))
		     == vect_induction_def
		  || (gimple_code (def2) == GIMPLE_PHI
		      && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2))
			 == vect_internal_def
		      && !is_loop_header_bb_p (gimple_bb (def2)))))))
    {
      if (! nested_in_vect_loop && orig_code != MINUS_EXPR)
	{
	  /* Check if we can swap operands (just for simplicity - so that
	     the rest of the code can assume that the reduction variable
	     is always the last (second) argument).  */
	  if (code == COND_EXPR)
	    {
	      /* Swap cond_expr by inverting the condition.  */
	      tree cond_expr = gimple_assign_rhs1 (def_stmt);
	      enum tree_code invert_code = ERROR_MARK;
	      enum tree_code cond_code = TREE_CODE (cond_expr);

	      if (TREE_CODE_CLASS (cond_code) == tcc_comparison)
		{
		  bool honor_nans = HONOR_NANS (TREE_OPERAND (cond_expr, 0));
		  invert_code = invert_tree_comparison (cond_code, honor_nans);
		}
	      if (invert_code != ERROR_MARK)
		{
		  TREE_SET_CODE (cond_expr, invert_code);
		  swap_ssa_operands (def_stmt,
				     gimple_assign_rhs2_ptr (def_stmt),
				     gimple_assign_rhs3_ptr (def_stmt));
		}
	      else
		{
		  if (dump_enabled_p ())
		    report_vect_op (MSG_NOTE, def_stmt,
				    "detected reduction: cannot swap operands "
				    /* ... */);
		  /* ... */
		}
	    }
	  else
	    swap_ssa_operands (def_stmt, gimple_assign_rhs1_ptr (def_stmt),
			       gimple_assign_rhs2_ptr (def_stmt));

	  if (dump_enabled_p ())
	    report_vect_op (MSG_NOTE, def_stmt,
			    "detected reduction: need to swap operands: ");

	  if (CONSTANT_CLASS_P (gimple_assign_rhs1 (def_stmt)))
	    LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true;
	}
      else
	{
	  if (dump_enabled_p ())
	    report_vect_op (MSG_NOTE, def_stmt, "detected reduction: ");
	}
      /* ... */
    }

  /* Try to find SLP reduction chain.  */
  if (! nested_in_vect_loop
      && code != COND_EXPR
      && orig_code != MINUS_EXPR
      && vect_is_slp_reduction (loop_info, phi, def_stmt))
    {
      if (dump_enabled_p ())
	report_vect_op (MSG_NOTE, def_stmt,
			"reduction: detected reduction chain: ");
      /* ... */
    }

  /* Dissolve group eventually half-built by vect_is_slp_reduction.  */
  gimple *first = GROUP_FIRST_ELEMENT (vinfo_for_stmt (def_stmt));
  /* ... */
      gimple *next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (first));
      GROUP_FIRST_ELEMENT (vinfo_for_stmt (first)) = NULL;
      GROUP_NEXT_ELEMENT (vinfo_for_stmt (first)) = NULL;
  /* ... */

  /* Look for the expression computing loop_arg from loop PHI result.  */
  if (check_reduction_path (vect_location, loop, as_a <gphi *> (phi), loop_arg,
			    /* ... */))
    /* ... */

  if (dump_enabled_p ())
    report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
		    "reduction: unknown pattern: ");
}

/* Wrapper around vect_is_simple_reduction, which will modify code
   in-place if it enables detection of more reductions.  Arguments
   ...  */

vect_force_simple_reduction (loop_vec_info loop_info, gimple *phi,
			     bool *double_reduc,
			     bool need_wrapping_integral_overflow)
{
  enum vect_reduction_type v_reduc_type;
  gimple *def = vect_is_simple_reduction (loop_info, phi, double_reduc,
					  need_wrapping_integral_overflow,
					  &v_reduc_type);
  /* ... */
      stmt_vec_info reduc_def_info = vinfo_for_stmt (phi);
      STMT_VINFO_REDUC_TYPE (reduc_def_info) = v_reduc_type;
      STMT_VINFO_REDUC_DEF (reduc_def_info) = def;
      reduc_def_info = vinfo_for_stmt (def);
      STMT_VINFO_REDUC_TYPE (reduc_def_info) = v_reduc_type;
      STMT_VINFO_REDUC_DEF (reduc_def_info) = phi;
  /* ... */
}

/* Calculate cost of peeling the loop PEEL_ITERS_PROLOGUE times.  */

vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue,
			     int *peel_iters_epilogue,
			     stmt_vector_for_cost *scalar_cost_vec,
			     stmt_vector_for_cost *prologue_cost_vec,
			     stmt_vector_for_cost *epilogue_cost_vec)
{
  int assumed_vf = vect_vf_for_cost (loop_vinfo);

  if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
    {
      *peel_iters_epilogue = assumed_vf / 2;
      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location,
			 "cost model: epilogue peel iters set to vf/2 "
			 "because loop iterations are unknown.\n");

      /* If peeled iterations are known but the number of scalar loop
	 iterations is unknown, count a taken branch per peeled loop.  */
      retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
				 NULL, 0, vect_prologue);
      retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
				 NULL, 0, vect_epilogue);
    }
  else
    {
      int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
      peel_iters_prologue = niters < peel_iters_prologue
			    ? niters : peel_iters_prologue;
      *peel_iters_epilogue = (niters - peel_iters_prologue) % assumed_vf;
      /* If we need to peel for gaps, but no peeling is required, we have to
	 peel VF iterations.  */
      if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) && !*peel_iters_epilogue)
	*peel_iters_epilogue = assumed_vf;
    }

  stmt_info_for_cost *si;
  if (peel_iters_prologue)
    FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
      {
	stmt_vec_info stmt_info
	  = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
	retval += record_stmt_cost (prologue_cost_vec,
				    si->count * peel_iters_prologue,
				    si->kind, stmt_info, si->misalign,
				    vect_prologue);
      }
  if (*peel_iters_epilogue)
    FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
      {
	stmt_vec_info stmt_info
	  = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
	retval += record_stmt_cost (epilogue_cost_vec,
				    si->count * *peel_iters_epilogue,
				    si->kind, stmt_info, si->misalign,
				    vect_epilogue);
      }
  /* ... */
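
/* A quick worked example of the computation above (assumed numbers, not
   taken from the original sources): with NITERS == 100, a known prologue of
   PEEL_ITERS_PROLOGUE == 3 and ASSUMED_VF == 8, the epilogue gets
   (100 - 3) % 8 == 1 iteration; had peeling for gaps been required and the
   remainder been 0, the epilogue would instead be set to the full VF of 8
   iterations.  */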

/* Function vect_estimate_min_profitable_iters

   Return the number of iterations required for the vector version of the
   loop to be profitable relative to the cost of the scalar version of the
   loop.

   *RET_MIN_PROFITABLE_NITERS is a cost model profitability threshold
   of iterations for vectorization.  -1 value means loop vectorization
   is not profitable.  This returned value may be used for dynamic
   profitability check.

   *RET_MIN_PROFITABLE_ESTIMATE is a profitability threshold to be used
   for static check against estimated number of iterations.  */

vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
				    int *ret_min_profitable_niters,
				    int *ret_min_profitable_estimate)
{
  int min_profitable_iters;
  int min_profitable_estimate;
  int peel_iters_prologue;
  int peel_iters_epilogue;
  unsigned vec_inside_cost = 0;
  int vec_outside_cost = 0;
  unsigned vec_prologue_cost = 0;
  unsigned vec_epilogue_cost = 0;
  int scalar_single_iter_cost = 0;
  int scalar_outside_cost = 0;
  int assumed_vf = vect_vf_for_cost (loop_vinfo);
  int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
  void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);

  /* Cost model disabled.  */
  if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
    {
      dump_printf_loc (MSG_NOTE, vect_location, "cost model disabled.\n");
      *ret_min_profitable_niters = 0;
      *ret_min_profitable_estimate = 0;
      /* ... */
    }

  /* Requires loop versioning tests to handle misalignment.  */
  if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
    {
      /*  FIXME: Make cost depend on complexity of individual check.  */
      unsigned len = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ();
      (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
			    vect_prologue);
      dump_printf (MSG_NOTE,
		   "cost model: Adding cost of checks for loop "
		   "versioning to treat misalignment.\n");
    }

  /* Requires loop versioning with alias checks.  */
  if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
    {
      /*  FIXME: Make cost depend on complexity of individual check.  */
      unsigned len = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).length ();
      (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
			    vect_prologue);
      len = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).length ();
      /* Count LEN - 1 ANDs and LEN comparisons.  */
      (void) add_stmt_cost (target_cost_data, len * 2 - 1, scalar_stmt,
			    NULL, 0, vect_prologue);
      len = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).length ();
      /* ... */
	  /* Count LEN - 1 ANDs and LEN comparisons.  */
	  unsigned int nstmts = len * 2 - 1;
	  /* +1 for each bias that needs adding.  */
	  for (unsigned int i = 0; i < len; ++i)
	    if (!LOOP_VINFO_LOWER_BOUNDS (loop_vinfo)[i].unsigned_p)
	      /* ... */
	  (void) add_stmt_cost (target_cost_data, nstmts, scalar_stmt,
				NULL, 0, vect_prologue);
      /* ... */
      dump_printf (MSG_NOTE,
		   "cost model: Adding cost of checks for loop "
		   "versioning aliasing.\n");
    }

  /* Requires loop versioning with niter checks.  */
  if (LOOP_REQUIRES_VERSIONING_FOR_NITERS (loop_vinfo))
    {
      /*  FIXME: Make cost depend on complexity of individual check.  */
      (void) add_stmt_cost (target_cost_data, 1, vector_stmt, NULL, 0,
			    vect_prologue);
      dump_printf (MSG_NOTE,
		   "cost model: Adding cost of checks for loop "
		   "versioning niters.\n");
    }

  if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
    (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken, NULL, 0,
			  vect_prologue);

  /* Count statements in scalar loop.  Using this as scalar cost for a single
     iteration for now.

     TODO: Add outer loop support.

     TODO: Consider assigning different costs to different scalar
     statements.  */

  scalar_single_iter_cost
    = LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo);

  /* Add additional cost for the peeled instructions in prologue and epilogue
     loop.  (For fully-masked loops there will be no peeling.)

     FORNOW: If we don't know the value of peel_iters for prologue or epilogue
     at compile-time - we assume it's vf/2 (the worst would be vf-1).

     TODO: Build an expression that represents peel_iters for prologue and
     epilogue to be used in a run-time test.  */

  if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
    {
      peel_iters_prologue = 0;
      peel_iters_epilogue = 0;

      if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
	{
	  /* We need to peel exactly one iteration.  */
	  peel_iters_epilogue += 1;
	  stmt_info_for_cost *si;
	  FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
			    /* ... */)
	    {
	      struct _stmt_vec_info *stmt_info
		= si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
	      (void) add_stmt_cost (target_cost_data, si->count,
				    si->kind, stmt_info, si->misalign,
				    vect_epilogue);
	    }
	}
    }
  /* ... */
      peel_iters_prologue = assumed_vf / 2;
      dump_printf (MSG_NOTE, "cost model: "
		   "prologue peel iters set to vf/2.\n");

      /* If peeling for alignment is unknown, loop bound of main loop becomes
	 ...  */
      peel_iters_epilogue = assumed_vf / 2;
      dump_printf (MSG_NOTE, "cost model: "
		   "epilogue peel iters set to vf/2 because "
		   "peeling for alignment is unknown.\n");

      /* If peeled iterations are unknown, count a taken branch and a not taken
	 branch per peeled loop.  Even if scalar loop iterations are known,
	 vector iterations are not known since peeled prologue iterations are
	 not known.  Hence guards remain the same.  */
      (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
			    NULL, 0, vect_prologue);
      (void) add_stmt_cost (target_cost_data, 1, cond_branch_not_taken,
			    NULL, 0, vect_prologue);
      (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
			    NULL, 0, vect_epilogue);
      (void) add_stmt_cost (target_cost_data, 1, cond_branch_not_taken,
			    NULL, 0, vect_epilogue);
      stmt_info_for_cost *si;
      FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
	{
	  struct _stmt_vec_info *stmt_info
	    = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
	  (void) add_stmt_cost (target_cost_data,
				si->count * peel_iters_prologue,
				si->kind, stmt_info, si->misalign,
				vect_prologue);
	  (void) add_stmt_cost (target_cost_data,
				si->count * peel_iters_epilogue,
				si->kind, stmt_info, si->misalign,
				vect_epilogue);
	}
  /* ... */
      stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
      stmt_info_for_cost *si;
      void *data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);

      prologue_cost_vec.create (2);
      epilogue_cost_vec.create (2);
      peel_iters_prologue = npeel;

      (void) vect_get_known_peeling_cost (loop_vinfo, peel_iters_prologue,
					  &peel_iters_epilogue,
					  &LOOP_VINFO_SCALAR_ITERATION_COST
					    (loop_vinfo),
					  &prologue_cost_vec,
					  &epilogue_cost_vec);

      FOR_EACH_VEC_ELT (prologue_cost_vec, j, si)
	{
	  struct _stmt_vec_info *stmt_info
	    = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
	  (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
				si->misalign, vect_prologue);
	}

      FOR_EACH_VEC_ELT (epilogue_cost_vec, j, si)
	{
	  struct _stmt_vec_info *stmt_info
	    = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
	  (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
				si->misalign, vect_epilogue);
	}

      prologue_cost_vec.release ();
      epilogue_cost_vec.release ();

  /* FORNOW: The scalar outside cost is incremented in one of the
     following ways:

     1. The vectorizer checks for alignment and aliasing and generates
     a condition that allows dynamic vectorization.  A cost model
     check is ANDED with the versioning condition.  Hence scalar code
     path now has the added cost of the versioning check.

       if (cost > th & versioning_check)
	 ...

     Hence run-time scalar is incremented by not-taken branch cost.

     2. The vectorizer then checks if a prologue is required.  If the
     cost model check was not done before during versioning, it has to
     be done before the prologue check.

       ...
       prologue = scalar_iters
       ...
       if (prologue == num_iters)
	 ...

     Hence the run-time scalar cost is incremented by a taken branch,
     plus a not-taken branch, plus a taken branch cost.

     3. The vectorizer then checks if an epilogue is required.  If the
     cost model check was not done before during prologue check, it
     has to be done with the epilogue check.

       ...
       if (prologue == num_iters)
	 ...
       if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
	 ...

     Hence the run-time scalar cost should be incremented by 2 taken
     branches ...

     TODO: The back end may reorder the BBS's differently and reverse
     conditions/branch directions.  Change the estimates below to
     something more reasonable.  */

  /* If the number of iterations is known and we do not do versioning, we can
     decide whether to vectorize at compile time.  Hence the scalar version
     does not carry cost model guard costs.  */
  if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
      || LOOP_REQUIRES_VERSIONING (loop_vinfo))
    {
      /* Cost model check occurs at versioning.  */
      if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
	scalar_outside_cost += vect_get_stmt_cost (cond_branch_not_taken);

      /* Cost model check occurs at prologue generation.  */
      if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
	scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken)
	  + vect_get_stmt_cost (cond_branch_not_taken);
      /* Cost model check occurs at epilogue generation.  */
      else
	scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken);
    }

  /* Complete the target-specific cost calculations.  */
  finish_cost (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo), &vec_prologue_cost,
	       &vec_inside_cost, &vec_epilogue_cost);

  vec_outside_cost = (int)(vec_prologue_cost + vec_epilogue_cost);

  if (dump_enabled_p ())
    {
      dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
      dump_printf (MSG_NOTE, "  Vector inside of loop cost: %d\n",
		   vec_inside_cost);
      dump_printf (MSG_NOTE, "  Vector prologue cost: %d\n",
		   vec_prologue_cost);
      dump_printf (MSG_NOTE, "  Vector epilogue cost: %d\n",
		   vec_epilogue_cost);
      dump_printf (MSG_NOTE, "  Scalar iteration cost: %d\n",
		   scalar_single_iter_cost);
      dump_printf (MSG_NOTE, "  Scalar outside cost: %d\n",
		   scalar_outside_cost);
      dump_printf (MSG_NOTE, "  Vector outside cost: %d\n",
		   vec_outside_cost);
      dump_printf (MSG_NOTE, "  prologue iterations: %d\n",
		   peel_iters_prologue);
      dump_printf (MSG_NOTE, "  epilogue iterations: %d\n",
		   peel_iters_epilogue);
    }

  /* Calculate number of iterations required to make the vector version
     profitable, relative to the loop bodies only.  The following condition
     must hold true:
     SIC * niters + SOC > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC
     where
     SIC = scalar iteration cost, VIC = vector iteration cost,
     VOC = vector outside cost, VF = vectorization factor,
     PL_ITERS = prologue iterations, EP_ITERS= epilogue iterations
     SOC = scalar outside cost for run time cost model check.  */
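
  /* For instance (assumed costs, not taken from the original sources): with
     SIC == 4, VIC == 6, VOC == 20, SOC == 0, VF == 4 and no peeling, the
     condition 4 * niters > 6 * (niters / 4) + 20 simplifies to
     2.5 * niters > 20, i.e. the vector loop starts to win for more than 8
     iterations; the code below additionally raises the threshold so that
     the vectorized loop executes at least once.  */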

  if ((scalar_single_iter_cost * assumed_vf) > (int) vec_inside_cost)
    {
      min_profitable_iters = ((vec_outside_cost - scalar_outside_cost)
			      * assumed_vf
			      - vec_inside_cost * peel_iters_prologue
			      - vec_inside_cost * peel_iters_epilogue);
      if (min_profitable_iters <= 0)
	min_profitable_iters = 0;
      else
	{
	  min_profitable_iters /= ((scalar_single_iter_cost * assumed_vf)
				   - vec_inside_cost);

	  if ((scalar_single_iter_cost * assumed_vf * min_profitable_iters)
	      <= (((int) vec_inside_cost * min_profitable_iters)
		  + (((int) vec_outside_cost - scalar_outside_cost)
		     * assumed_vf)))
	    min_profitable_iters++;
	}
    }
  /* vector version will never be profitable.  */
  else
    {
      if (LOOP_VINFO_LOOP (loop_vinfo)->force_vectorize)
	warning_at (vect_location, OPT_Wopenmp_simd, "vectorization "
		    "did not happen for a simd loop");

      if (dump_enabled_p ())
	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			 "cost model: the vector iteration cost = %d "
			 "divided by the scalar iteration cost = %d "
			 "is greater or equal to the vectorization factor = %d"
			 /* ... */,
			 vec_inside_cost, scalar_single_iter_cost, assumed_vf);
      *ret_min_profitable_niters = -1;
      *ret_min_profitable_estimate = -1;
      /* ... */
    }

  dump_printf (MSG_NOTE,
	       "  Calculated minimum iters for profitability: %d\n",
	       min_profitable_iters);

  if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
      && min_profitable_iters < (assumed_vf + peel_iters_prologue))
    /* We want the vectorized loop to execute at least once.  */
    min_profitable_iters = assumed_vf + peel_iters_prologue;

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
		     "  Runtime profitability threshold = %d\n",
		     min_profitable_iters);

  *ret_min_profitable_niters = min_profitable_iters;

  /* Calculate number of iterations required to make the vector version
     profitable, relative to the loop bodies only.

     Non-vectorized variant is SIC * niters and it must win over vector
     variant on the expected loop trip count.  The following condition must hold true:
     SIC * niters > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC + SOC  */

  if (vec_outside_cost <= 0)
    min_profitable_estimate = 0;
  else
    {
      min_profitable_estimate = ((vec_outside_cost + scalar_outside_cost)
				 * assumed_vf
				 - vec_inside_cost * peel_iters_prologue
				 - vec_inside_cost * peel_iters_epilogue)
				 / ((scalar_single_iter_cost * assumed_vf)
				    - vec_inside_cost);
    }
  min_profitable_estimate = MAX (min_profitable_estimate, min_profitable_iters);
  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
		     "  Static estimate profitability threshold = %d\n",
		     min_profitable_estimate);

  *ret_min_profitable_estimate = min_profitable_estimate;

/* Writes into SEL a mask for a vec_perm, equivalent to a vec_shr by OFFSET
   vector elements (not bits) for a vector with NELT elements.  */

calc_vec_perm_mask_for_shift (unsigned int offset, unsigned int nelt,
			      vec_perm_builder *sel)
{
  /* The encoding is a single stepped pattern.  Any wrap-around is handled
     by vec_perm_indices.  */
  sel->new_vector (nelt, 1, 3);
  for (unsigned int i = 0; i < 3; i++)
    sel->quick_push (i + offset);
}
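
/* For example (illustrative values, not taken from a particular target):
   with OFFSET == 2 and NELT == 8 the stepped series pushed above starts
   { 2, 3, 4 } and vec_perm_indices expands it to { 2, 3, 4, 5, 6, 7, 8, 9 },
   i.e. a shift down by two elements where indices 8 and 9 select elements
   from the second input vector of the permutation.  */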

/* Checks whether the target supports whole-vector shifts for vectors of mode
   MODE.  This is the case if _either_ the platform handles vec_shr_optab, _or_
   it supports vec_perm_const with masks for all necessary shift amounts.  */

have_whole_vector_shift (machine_mode mode)
{
  if (optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
    return true;

  /* Variable-length vectors should be handled via the optab.  */
  unsigned int nelt;
  if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
    return false;

  vec_perm_builder sel;
  vec_perm_indices indices;
  for (unsigned int i = nelt / 2; i >= 1; i /= 2)
    {
      calc_vec_perm_mask_for_shift (i, nelt, &sel);
      indices.new_vector (sel, 2, nelt);
      if (!can_vec_perm_const_p (mode, indices, false))
	return false;
    }
  return true;
}
3841 /* TODO: Close dependency between vect_model_*_cost and vectorizable_*
3842 functions. Design better to avoid maintenance issues. */
3844 /* Function vect_model_reduction_cost.
3846 Models cost for a reduction operation, including the vector ops
3847 generated within the strip-mine loop, the initial definition before
3848 the loop, and the epilogue code that must be generated. */
3851 vect_model_reduction_cost (stmt_vec_info stmt_info
, internal_fn reduc_fn
,
3852 int ncopies
, stmt_vector_for_cost
*cost_vec
)
3854 int prologue_cost
= 0, epilogue_cost
= 0, inside_cost
;
3855 enum tree_code code
;
3860 loop_vec_info loop_vinfo
= STMT_VINFO_LOOP_VINFO (stmt_info
);
3861 struct loop
*loop
= NULL
;
3864 loop
= LOOP_VINFO_LOOP (loop_vinfo
);
3866 /* Condition reductions generate two reductions in the loop. */
3867 vect_reduction_type reduction_type
3868 = STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info
);
3869 if (reduction_type
== COND_REDUCTION
)
3872 vectype
= STMT_VINFO_VECTYPE (stmt_info
);
3873 mode
= TYPE_MODE (vectype
);
3874 orig_stmt
= STMT_VINFO_RELATED_STMT (stmt_info
);
3877 orig_stmt
= STMT_VINFO_STMT (stmt_info
);
3879 code
= gimple_assign_rhs_code (orig_stmt
);
3881 if (reduction_type
== EXTRACT_LAST_REDUCTION
3882 || reduction_type
== FOLD_LEFT_REDUCTION
)
3884 /* No extra instructions needed in the prologue. */
3887 if (reduction_type
== EXTRACT_LAST_REDUCTION
|| reduc_fn
!= IFN_LAST
)
3888 /* Count one reduction-like operation per vector. */
3889 inside_cost
= record_stmt_cost (cost_vec
					, ncopies, vec_to_scalar,
					stmt_info, 0, vect_body);
      else
	{
	  /* Use NELEMENTS extracts and NELEMENTS scalar ops.  */
	  unsigned int nelements = ncopies * vect_nunits_for_cost (vectype);
	  inside_cost = record_stmt_cost (cost_vec, nelements,
					  vec_to_scalar, stmt_info, 0,
					  vect_body);
	  inside_cost += record_stmt_cost (cost_vec, nelements,
					   scalar_stmt, stmt_info, 0,
					   vect_body);
	}
    }
  else
    {
      /* Add in cost for initial definition.
	 For cond reduction we have four vectors: initial index, step,
	 initial result of the data reduction, initial value of the index
	 reduction.  */
      int prologue_stmts = reduction_type == COND_REDUCTION ? 4 : 1;
      prologue_cost += record_stmt_cost (cost_vec, prologue_stmts,
					 scalar_to_vec, stmt_info, 0,
					 vect_prologue);

      /* Cost of reduction op inside loop.  */
      inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
				      stmt_info, 0, vect_body);
    }

  /* Determine cost of epilogue code.

     We have a reduction operator that will reduce the vector in one statement.
     Also requires scalar extract.  */

  if (!loop || !nested_in_vect_loop_p (loop, orig_stmt))
    {
      if (reduc_fn != IFN_LAST)
	{
	  if (reduction_type == COND_REDUCTION)
	    {
	      /* An EQ stmt and a COND_EXPR stmt.  */
	      epilogue_cost += record_stmt_cost (cost_vec, 2,
						 vector_stmt, stmt_info, 0,
						 vect_epilogue);
	      /* Reduction of the max index and a reduction of the found
		 values.  */
	      epilogue_cost += record_stmt_cost (cost_vec, 2,
						 vec_to_scalar, stmt_info, 0,
						 vect_epilogue);
	      /* A broadcast of the max value.  */
	      epilogue_cost += record_stmt_cost (cost_vec, 1,
						 scalar_to_vec, stmt_info, 0,
						 vect_epilogue);
	    }
	  else
	    {
	      epilogue_cost += record_stmt_cost (cost_vec, 1, vector_stmt,
						 stmt_info, 0, vect_epilogue);
	      epilogue_cost += record_stmt_cost (cost_vec, 1,
						 vec_to_scalar, stmt_info, 0,
						 vect_epilogue);
	    }
	}
      else if (reduction_type == COND_REDUCTION)
	{
	  unsigned estimated_nunits = vect_nunits_for_cost (vectype);
	  /* Extraction of scalar elements.  */
	  epilogue_cost += record_stmt_cost (cost_vec,
					     2 * estimated_nunits,
					     vec_to_scalar, stmt_info, 0,
					     vect_epilogue);
	  /* Scalar max reductions via COND_EXPR / MAX_EXPR.  */
	  epilogue_cost += record_stmt_cost (cost_vec,
					     2 * estimated_nunits - 3,
					     scalar_stmt, stmt_info, 0,
					     vect_epilogue);
	}
      else if (reduction_type == EXTRACT_LAST_REDUCTION
	       || reduction_type == FOLD_LEFT_REDUCTION)
	/* No extra instructions are needed in the epilogue.  */
	;
      else
	{
	  int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
	  tree bitsize
	    = TYPE_SIZE (TREE_TYPE (gimple_assign_lhs (orig_stmt)));
	  int element_bitsize = tree_to_uhwi (bitsize);
	  int nelements = vec_size_in_bits / element_bitsize;

	  if (code == COND_EXPR)
	    code = MAX_EXPR;

	  optab = optab_for_tree_code (code, vectype, optab_default);

	  /* We have a whole vector shift available.  */
	  if (optab != unknown_optab
	      && VECTOR_MODE_P (mode)
	      && optab_handler (optab, mode) != CODE_FOR_nothing
	      && have_whole_vector_shift (mode))
	    {
	      /* Final reduction via vector shifts and the reduction operator.
		 Also requires scalar extract.  */
	      epilogue_cost += record_stmt_cost (cost_vec,
						 exact_log2 (nelements) * 2,
						 vector_stmt, stmt_info, 0,
						 vect_epilogue);
	      epilogue_cost += record_stmt_cost (cost_vec, 1,
						 vec_to_scalar, stmt_info, 0,
						 vect_epilogue);
	    }
	  else
	    /* Use extracts and reduction op for final reduction.  For N
	       elements, we have N extracts and N-1 reduction ops.  */
	    epilogue_cost += record_stmt_cost (cost_vec,
					       nelements + nelements - 1,
					       vector_stmt, stmt_info, 0,
					       vect_epilogue);
	}
    }

  if (dump_enabled_p ())
    dump_printf (MSG_NOTE,
		 "vect_model_reduction_cost: inside_cost = %d, "
		 "prologue_cost = %d, epilogue_cost = %d .\n", inside_cost,
		 prologue_cost, epilogue_cost);
}
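
/* As a worked illustration of the epilogue costing above (an example only,
   not part of the cost model): assuming a vector with NELEMENTS == 8 and no
   REDUC_FN, the whole-vector-shift scheme is charged
   exact_log2 (8) * 2 == 6 vector_stmt operations plus one vec_to_scalar
   extract, whereas the extract-based fallback is charged
   8 + 8 - 1 == 15 vector_stmt operations.  The numbers actually reported
   still depend on the target's cost hooks via record_stmt_cost.  */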
/* Function vect_model_induction_cost.

   Models cost for induction operations.  */

static void
vect_model_induction_cost (stmt_vec_info stmt_info, int ncopies,
			   stmt_vector_for_cost *cost_vec)
{
  unsigned inside_cost, prologue_cost;

  if (PURE_SLP_STMT (stmt_info))
    return;

  /* loop cost for vec_loop.  */
  inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
				  stmt_info, 0, vect_body);

  /* prologue cost for vec_init and vec_step.  */
  prologue_cost = record_stmt_cost (cost_vec, 2, scalar_to_vec,
				    stmt_info, 0, vect_prologue);

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
		     "vect_model_induction_cost: inside_cost = %d, "
		     "prologue_cost = %d .\n", inside_cost, prologue_cost);
}
/* Function get_initial_def_for_reduction

   Input:
   STMT - a stmt that performs a reduction operation in the loop.
   INIT_VAL - the initial value of the reduction variable

   Output:
   ADJUSTMENT_DEF - a tree that holds a value to be added to the final result
	of the reduction (used for adjusting the epilog - see below).
   Return a vector variable, initialized according to the operation that STMT
	performs.  This vector will be used as the initial value of the
	vector of partial results.

   Option1 (adjust in epilog): Initialize the vector as follows:
     add/bit or/xor:    [0,0,...,0,0]
     mult/bit and:      [1,1,...,1,1]
     min/max/cond_expr: [init_val,init_val,..,init_val,init_val]
   and when necessary (e.g. add/mult case) let the caller know
   that it needs to adjust the result by init_val.

   Option2: Initialize the vector as follows:
     add/bit or/xor:    [init_val,0,0,...,0]
     mult/bit and:      [init_val,1,1,...,1]
     min/max/cond_expr: [init_val,init_val,...,init_val]
   and no adjustments are needed.

   For example, for the following code:

     s = init_val;
     for (i = 0; i < n; i++)
       s = s + a[i];

   STMT is 's = s + a[i]', and the reduction variable is 's'.
   For a vector of 4 units, we want to return either [0,0,0,init_val],
   or [0,0,0,0] and let the caller know that it needs to adjust
   the result at the end by 'init_val'.

   FORNOW, we are using the 'adjust in epilog' scheme, because this way the
   initialization vector is simpler (same element in all entries), if
   ADJUSTMENT_DEF is not NULL, and Option2 otherwise.

   A cost model should help decide between these two schemes.  */
static tree
get_initial_def_for_reduction (gimple *stmt, tree init_val,
			       tree *adjustment_def)
{
  stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  tree scalar_type = TREE_TYPE (init_val);
  tree vectype = get_vectype_for_scalar_type (scalar_type);
  enum tree_code code = gimple_assign_rhs_code (stmt);
  tree def_for_init;
  tree init_def;
  bool nested_in_vect_loop = false;
  REAL_VALUE_TYPE real_init_val = dconst0;
  int int_init_val = 0;
  gimple *def_stmt = NULL;
  gimple_seq stmts = NULL;

  gcc_assert (vectype);

  gcc_assert (POINTER_TYPE_P (scalar_type) || INTEGRAL_TYPE_P (scalar_type)
	      || SCALAR_FLOAT_TYPE_P (scalar_type));

  if (nested_in_vect_loop_p (loop, stmt))
    nested_in_vect_loop = true;

  gcc_assert (loop == (gimple_bb (stmt))->loop_father);

  /* In case of double reduction we only create a vector variable to be put
     in the reduction phi node.  The actual statement creation is done in
     vect_create_epilog_for_reduction.  */
  if (adjustment_def && nested_in_vect_loop
      && TREE_CODE (init_val) == SSA_NAME
      && (def_stmt = SSA_NAME_DEF_STMT (init_val))
      && gimple_code (def_stmt) == GIMPLE_PHI
      && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
      && vinfo_for_stmt (def_stmt)
      && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
	 == vect_double_reduction_def)
    {
      *adjustment_def = NULL;
      return vect_create_destination_var (init_val, vectype);
    }

  vect_reduction_type reduction_type
    = STMT_VINFO_VEC_REDUCTION_TYPE (stmt_vinfo);

  /* In case of a nested reduction do not use an adjustment def as
     that case is not supported by the epilogue generation correctly
     if ncopies is not one.  */
  if (adjustment_def && nested_in_vect_loop)
    {
      *adjustment_def = NULL;
      return vect_get_vec_def_for_operand (init_val, stmt);
    }

  switch (code)
    {
    case WIDEN_SUM_EXPR:
    case DOT_PROD_EXPR:
    case SAD_EXPR:
    case PLUS_EXPR:
    case MINUS_EXPR:
    case BIT_IOR_EXPR:
    case BIT_XOR_EXPR:
    case MULT_EXPR:
    case BIT_AND_EXPR:
      {
	/* ADJUSTMENT_DEF is NULL when called from
	   vect_create_epilog_for_reduction to vectorize double reduction.  */
	if (adjustment_def)
	  *adjustment_def = init_val;

	if (code == MULT_EXPR)
	  {
	    real_init_val = dconst1;
	    int_init_val = 1;
	  }

	if (code == BIT_AND_EXPR)
	  int_init_val = 1;

	if (SCALAR_FLOAT_TYPE_P (scalar_type))
	  def_for_init = build_real (scalar_type, real_init_val);
	else
	  def_for_init = build_int_cst (scalar_type, int_init_val);

	if (adjustment_def)
	  /* Option1: the first element is '0' or '1' as well.  */
	  init_def = gimple_build_vector_from_val (&stmts, vectype,
						   def_for_init);
	else if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant ())
	  {
	    /* Option2 (variable length): the first element is INIT_VAL.  */
	    init_def = gimple_build_vector_from_val (&stmts, vectype,
						     def_for_init);
	    init_def = gimple_build (&stmts, CFN_VEC_SHL_INSERT,
				     vectype, init_def, init_val);
	  }
	else
	  {
	    /* Option2: the first element is INIT_VAL.  */
	    tree_vector_builder elts (vectype, 1, 2);
	    elts.quick_push (init_val);
	    elts.quick_push (def_for_init);
	    init_def = gimple_build_vector (&stmts, &elts);
	  }
      }
      break;

    case MIN_EXPR:
    case MAX_EXPR:
    case COND_EXPR:
      {
	if (adjustment_def)
	  {
	    *adjustment_def = NULL_TREE;
	    if (reduction_type != COND_REDUCTION
		&& reduction_type != EXTRACT_LAST_REDUCTION)
	      {
		init_def = vect_get_vec_def_for_operand (init_val, stmt);
		break;
	      }
	  }
	init_val = gimple_convert (&stmts, TREE_TYPE (vectype), init_val);
	init_def = gimple_build_vector_from_val (&stmts, vectype, init_val);
      }
      break;

    default:
      gcc_unreachable ();
    }

  if (stmts)
    gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), stmts);

  return init_def;
}
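
/* For illustration (assuming a 4-element vector type): for a MULT_EXPR
   reduction with ADJUSTMENT_DEF non-null, Option1 above yields the vector
   {1, 1, 1, 1} and *ADJUSTMENT_DEF == init_val, so the epilogue later applies
   the reduction code to fold init_val back into the reduced result; with
   ADJUSTMENT_DEF null and a constant number of lanes, Option2 yields
   {init_val, 1, 1, 1} and no adjustment is needed.  */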
/* Get at the initial defs for the reduction PHIs in SLP_NODE.
   NUMBER_OF_VECTORS is the number of vector defs to create.
   If NEUTRAL_OP is nonnull, introducing extra elements of that
   value will not change the result.  */
static void
get_initial_defs_for_reduction (slp_tree slp_node,
				vec<tree> *vec_oprnds,
				unsigned int number_of_vectors,
				bool reduc_chain, tree neutral_op)
{
  vec<gimple *> stmts = SLP_TREE_SCALAR_STMTS (slp_node);
  gimple *stmt = stmts[0];
  stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
  unsigned HOST_WIDE_INT nunits;
  unsigned j, number_of_places_left_in_vector;
  tree vector_type;
  tree vop;
  int group_size = stmts.length ();
  unsigned int vec_num, i;
  unsigned number_of_copies = 1;
  vec<tree> voprnds;
  voprnds.create (number_of_vectors);
  struct loop *loop;
  auto_vec<tree, 16> permute_results;

  vector_type = STMT_VINFO_VECTYPE (stmt_vinfo);

  gcc_assert (STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_reduction_def);

  loop = (gimple_bb (stmt))->loop_father;
  gcc_assert (loop);
  edge pe = loop_preheader_edge (loop);

  gcc_assert (!reduc_chain || neutral_op);

  /* NUMBER_OF_COPIES is the number of times we need to use the same values in
     created vectors.  It is greater than 1 if unrolling is performed.

     For example, we have two scalar operands, s1 and s2 (e.g., group of
     strided accesses of size two), while NUNITS is four (i.e., four scalars
     of this type can be packed in a vector).  The output vector will contain
     two copies of each scalar operand: {s1, s2, s1, s2}.  (NUMBER_OF_COPIES
     will be 2).

     If GROUP_SIZE > NUNITS, the scalars will be split into several vectors
     containing the operands.

     For example, NUNITS is four as before, and the group size is 8
     (s1, s2, ..., s8).  We will create two vectors {s1, s2, s3, s4} and
     {s5, s6, s7, s8}.  */

  if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits))
    nunits = group_size;

  number_of_copies = nunits * number_of_vectors / group_size;

  number_of_places_left_in_vector = nunits;
  bool constant_p = true;
  tree_vector_builder elts (vector_type, nunits, 1);
  elts.quick_grow (nunits);
  for (j = 0; j < number_of_copies; j++)
    {
      for (i = group_size - 1; stmts.iterate (i, &stmt); i--)
        {
	  tree op;
	  /* Get the def before the loop.  In reduction chain we have only
	     one initial value.  */
	  if ((j != (number_of_copies - 1)
	       || (reduc_chain && i != 0))
	      && neutral_op)
	    op = neutral_op;
	  else
	    op = PHI_ARG_DEF_FROM_EDGE (stmt, pe);

	  /* Create 'vect_ = {op0,op1,...,opn}'.  */
	  number_of_places_left_in_vector--;
	  elts[number_of_places_left_in_vector] = op;
	  if (!CONSTANT_CLASS_P (op))
	    constant_p = false;

	  if (number_of_places_left_in_vector == 0)
	    {
	      gimple_seq ctor_seq = NULL;
	      tree init;
	      if (constant_p && !neutral_op
		  ? multiple_p (TYPE_VECTOR_SUBPARTS (vector_type), nunits)
		  : known_eq (TYPE_VECTOR_SUBPARTS (vector_type), nunits))
		/* Build the vector directly from ELTS.  */
		init = gimple_build_vector (&ctor_seq, &elts);
	      else if (neutral_op)
		{
		  /* Build a vector of the neutral value and shift the
		     other elements into place.  */
		  init = gimple_build_vector_from_val (&ctor_seq, vector_type,
						       neutral_op);
		  int k = nunits;
		  while (k > 0 && elts[k - 1] == neutral_op)
		    k -= 1;
		  while (k > 0)
		    {
		      k -= 1;
		      init = gimple_build (&ctor_seq, CFN_VEC_SHL_INSERT,
					   vector_type, init, elts[k]);
		    }
		}
	      else
		{
		  /* First time round, duplicate ELTS to fill the
		     required number of vectors, then cherry pick the
		     appropriate result for each iteration.  */
		  if (vec_oprnds->is_empty ())
		    duplicate_and_interleave (&ctor_seq, vector_type, elts,
					      number_of_vectors,
					      permute_results);
		  init = permute_results[number_of_vectors - j - 1];
		}
	      if (ctor_seq != NULL)
		gsi_insert_seq_on_edge_immediate (pe, ctor_seq);
	      voprnds.quick_push (init);

	      number_of_places_left_in_vector = nunits;
	      elts.new_vector (vector_type, nunits, 1);
	      elts.quick_grow (nunits);
	      constant_p = true;
	    }
        }
    }

  /* Since the vectors are created in the reverse order, we should invert
     them.  */
  vec_num = voprnds.length ();
  for (j = vec_num; j != 0; j--)
    {
      vop = voprnds[j - 1];
      vec_oprnds->quick_push (vop);
    }

  voprnds.release ();

  /* In case that VF is greater than the unrolling factor needed for the SLP
     group of stmts, NUMBER_OF_VECTORS to be created is greater than
     NUMBER_OF_SCALARS/NUNITS or NUNITS/NUMBER_OF_SCALARS, and hence we have
     to replicate the vectors.  */
  tree neutral_vec = NULL;
  while (number_of_vectors > vec_oprnds->length ())
    {
      if (neutral_op)
	{
	  if (!neutral_vec)
	    {
	      gimple_seq ctor_seq = NULL;
	      neutral_vec = gimple_build_vector_from_val
		(&ctor_seq, vector_type, neutral_op);
	      if (ctor_seq != NULL)
		gsi_insert_seq_on_edge_immediate (pe, ctor_seq);
	    }
	  vec_oprnds->quick_push (neutral_vec);
	}
      else
	{
	  for (i = 0; vec_oprnds->iterate (i, &vop) && i < vec_num; i++)
	    vec_oprnds->quick_push (vop);
	}
    }
}
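
/* For illustration (assuming a 4-lane vector, a PLUS reduction group
   {s1, s2} with neutral value 0 and NUMBER_OF_VECTORS == 1):
   NUMBER_OF_COPIES is 4 * 1 / 2 == 2, only the last copy uses the scalar
   initial values, and the single initial vector built above is
   { init(s1), init(s2), 0, 0 }.  */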
/* Function vect_create_epilog_for_reduction

   Create code at the loop-epilog to finalize the result of a reduction
   computation.

   VECT_DEFS is list of vector of partial results, i.e., the lhs's of vector
     reduction statements.
   STMT is the scalar reduction stmt that is being vectorized.
   NCOPIES is > 1 in case the vectorization factor (VF) is bigger than the
     number of elements that we can fit in a vectype (nunits).  In this case
     we have to generate more than one vector stmt - i.e - we need to "unroll"
     the vector stmt by a factor VF/nunits.  For more details see documentation
     in vectorizable_operation.
   REDUC_FN is the internal function for the epilog reduction.
   REDUCTION_PHIS is a list of the phi-nodes that carry the reduction
     computation.
   REDUC_INDEX is the index of the operand in the right hand side of the
     statement that is defined by REDUCTION_PHI.
   DOUBLE_REDUC is TRUE if double reduction phi nodes should be handled.
   SLP_NODE is an SLP node containing a group of reduction statements.  The
     first one in this group is STMT.
   INDUC_VAL is for INTEGER_INDUC_COND_REDUCTION the value to use for the case
     when the COND_EXPR is never true in the loop.  For MAX_EXPR, it needs to
     be smaller than any value of the IV in the loop, for MIN_EXPR larger than
     any value of the IV in the loop.
   INDUC_CODE is the code for epilog reduction if INTEGER_INDUC_COND_REDUCTION.
   NEUTRAL_OP is the value given by neutral_op_for_slp_reduction; it is
     null if this is not an SLP reduction

   This function:
   1. Creates the reduction def-use cycles: sets the arguments for
      REDUCTION_PHIS:
      The loop-entry argument is the vectorized initial-value of the reduction.
      The loop-latch argument is taken from VECT_DEFS - the vector of partial
      sums.
   2. "Reduces" each vector of partial results VECT_DEFS into a single result,
      by calling the function specified by REDUC_FN if available, or by
      other means (whole-vector shifts or a scalar loop).
      The function also creates a new phi node at the loop exit to preserve
      loop-closed form, as illustrated below.

     The flow at the entry to this function:

        loop:
          vec_def = phi <null, null>            # REDUCTION_PHI
          VECT_DEF = vector_stmt                # vectorized form of STMT
          s_loop = scalar_stmt                  # (scalar) STMT
        loop_exit:
          s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
          use <s_out0>
          use <s_out0>

     The above is transformed by this function into:

        loop:
          vec_def = phi <vec_init, VECT_DEF>    # REDUCTION_PHI
          VECT_DEF = vector_stmt                # vectorized form of STMT
          s_loop = scalar_stmt                  # (scalar) STMT
        loop_exit:
          s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
          v_out1 = phi <VECT_DEF>               # NEW_EXIT_PHI
          v_out2 = reduce <v_out1>
          s_out3 = extract_field <v_out2, 0>
          s_out4 = adjust_result <s_out3>
          use <s_out4>
          use <s_out4>
*/
static void
vect_create_epilog_for_reduction (vec<tree> vect_defs, gimple *stmt,
				  gimple *reduc_def_stmt,
				  int ncopies, internal_fn reduc_fn,
				  vec<gimple *> reduction_phis,
				  bool double_reduc,
				  slp_tree slp_node,
				  slp_instance slp_node_instance,
				  tree induc_val, enum tree_code induc_code,
				  tree neutral_op)
{
  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
  stmt_vec_info prev_phi_info;
  tree vectype;
  machine_mode mode;
  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *outer_loop = NULL;
  basic_block exit_bb;
  tree scalar_dest;
  tree scalar_type;
  gimple *new_phi = NULL, *phi;
  gimple_stmt_iterator exit_gsi;
  tree vec_dest;
  tree new_temp = NULL_TREE, new_dest, new_name, new_scalar_dest;
  gimple *epilog_stmt = NULL;
  enum tree_code code = gimple_assign_rhs_code (stmt);
  gimple *exit_phi;
  tree bitsize;
  tree adjustment_def = NULL;
  tree vec_initial_def = NULL;
  tree expr, def, initial_def = NULL;
  tree orig_name, scalar_result;
  imm_use_iterator imm_iter, phi_imm_iter;
  use_operand_p use_p, phi_use_p;
  gimple *use_stmt, *orig_stmt, *reduction_phi = NULL;
  bool nested_in_vect_loop = false;
  auto_vec<gimple *> new_phis;
  auto_vec<gimple *> inner_phis;
  enum vect_def_type dt = vect_unknown_def_type;
  int j, i;
  auto_vec<tree> scalar_results;
  unsigned int group_size = 1, k, ratio;
  auto_vec<tree> vec_initial_defs;
  auto_vec<gimple *> phis;
  bool slp_reduc = false;
  bool direct_slp_reduc;
  tree new_phi_result;
  gimple *inner_phi = NULL;
  tree induction_index = NULL_TREE;

  if (slp_node)
    group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();

  if (nested_in_vect_loop_p (loop, stmt))
    {
      outer_loop = loop;
      loop = loop->inner;
      nested_in_vect_loop = true;
      gcc_assert (!slp_node);
    }

  vectype = STMT_VINFO_VECTYPE (stmt_info);
  gcc_assert (vectype);
  mode = TYPE_MODE (vectype);
  /* 1. Create the reduction def-use cycle:
     Set the arguments of REDUCTION_PHIS, i.e., transform

        loop:
          vec_def = phi <null, null>            # REDUCTION_PHI
          VECT_DEF = vector_stmt                # vectorized form of STMT
          ...

     into:

        loop:
          vec_def = phi <vec_init, VECT_DEF>    # REDUCTION_PHI
          VECT_DEF = vector_stmt                # vectorized form of STMT
          ...

     (in case of SLP, do it for all the phis).  */

  /* Get the loop-entry arguments.  */
  enum vect_def_type initial_def_dt = vect_unknown_def_type;
  if (slp_node)
    {
      unsigned vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
      vec_initial_defs.reserve (vec_num);
      get_initial_defs_for_reduction (slp_node_instance->reduc_phis,
				      &vec_initial_defs, vec_num,
				      GROUP_FIRST_ELEMENT (stmt_info),
				      neutral_op);
    }
  else
    {
      /* Get at the scalar def before the loop, that defines the initial value
	 of the reduction variable.  */
      gimple *def_stmt;
      initial_def = PHI_ARG_DEF_FROM_EDGE (reduc_def_stmt,
					   loop_preheader_edge (loop));
      /* Optimize: if initial_def is for REDUC_MAX smaller than the base
	 and we can't use zero for induc_val, use initial_def.  Similarly
	 for REDUC_MIN and initial_def larger than the base.  */
      if (TREE_CODE (initial_def) == INTEGER_CST
	  && (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
	      == INTEGER_INDUC_COND_REDUCTION)
	  && !integer_zerop (induc_val)
	  && ((induc_code == MAX_EXPR
	       && tree_int_cst_lt (initial_def, induc_val))
	      || (induc_code == MIN_EXPR
		  && tree_int_cst_lt (induc_val, initial_def))))
	induc_val = initial_def;
      vect_is_simple_use (initial_def, loop_vinfo, &def_stmt, &initial_def_dt);
      vec_initial_def = get_initial_def_for_reduction (stmt, initial_def,
						       &adjustment_def);
      vec_initial_defs.create (1);
      vec_initial_defs.quick_push (vec_initial_def);
    }

  /* Set phi nodes arguments.  */
  FOR_EACH_VEC_ELT (reduction_phis, i, phi)
    {
      tree vec_init_def = vec_initial_defs[i];
      tree def = vect_defs[i];
      for (j = 0; j < ncopies; j++)
        {
	  if (j != 0)
	    {
	      phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi));
	      if (nested_in_vect_loop)
		vec_init_def
		  = vect_get_vec_def_for_stmt_copy (initial_def_dt,
						    vec_init_def);
	    }

	  /* Set the loop-entry arg of the reduction-phi.  */

	  if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
	      == INTEGER_INDUC_COND_REDUCTION)
	    {
	      /* Initialise the reduction phi to zero.  This prevents initial
		 values of non-zero interfering with the reduction op.  */
	      gcc_assert (ncopies == 1);
	      gcc_assert (i == 0);

	      tree vec_init_def_type = TREE_TYPE (vec_init_def);
	      tree induc_val_vec
		= build_vector_from_val (vec_init_def_type, induc_val);

	      add_phi_arg (as_a <gphi *> (phi), induc_val_vec,
			   loop_preheader_edge (loop), UNKNOWN_LOCATION);
	    }
	  else
	    add_phi_arg (as_a <gphi *> (phi), vec_init_def,
			 loop_preheader_edge (loop), UNKNOWN_LOCATION);

	  /* Set the loop-latch arg for the reduction-phi.  */
	  if (j > 0)
	    def = vect_get_vec_def_for_stmt_copy (vect_unknown_def_type, def);

	  add_phi_arg (as_a <gphi *> (phi), def, loop_latch_edge (loop),
		       UNKNOWN_LOCATION);

	  if (dump_enabled_p ())
	    {
	      dump_printf_loc (MSG_NOTE, vect_location,
			       "transform reduction: created def-use cycle: ");
	      dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
	      dump_gimple_stmt (MSG_NOTE, TDF_SLIM, SSA_NAME_DEF_STMT (def), 0);
	    }
        }
    }
  /* For cond reductions we want to create a new vector (INDEX_COND_EXPR)
     which is updated with the current index of the loop for every match of
     the original loop's cond_expr (VEC_STMT).  This results in a vector
     containing the last time the condition passed for that vector lane.
     The first match will be a 1 to allow 0 to be used for non-matching
     indexes.  If there are no matches at all then the vector will be all
     zeroes.  */
  if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION)
    {
      tree indx_before_incr, indx_after_incr;
      poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype);

      gimple *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info);
      gcc_assert (gimple_assign_rhs_code (vec_stmt) == VEC_COND_EXPR);

      int scalar_precision
	= GET_MODE_PRECISION (SCALAR_TYPE_MODE (TREE_TYPE (vectype)));
      tree cr_index_scalar_type = make_unsigned_type (scalar_precision);
      tree cr_index_vector_type = build_vector_type
	(cr_index_scalar_type, TYPE_VECTOR_SUBPARTS (vectype));

      /* First we create a simple vector induction variable which starts
	 with the values {1,2,3,...} (SERIES_VECT) and increments by the
	 vector size (STEP).  */

      /* Create a {1,2,3,...} vector.  */
      tree series_vect = build_index_vector (cr_index_vector_type, 1, 1);

      /* Create a vector of the step value.  */
      tree step = build_int_cst (cr_index_scalar_type, nunits_out);
      tree vec_step = build_vector_from_val (cr_index_vector_type, step);

      /* Create an induction variable.  */
      gimple_stmt_iterator incr_gsi;
      bool insert_after;
      standard_iv_increment_position (loop, &incr_gsi, &insert_after);
      create_iv (series_vect, vec_step, NULL_TREE, loop, &incr_gsi,
		 insert_after, &indx_before_incr, &indx_after_incr);

      /* Next create a new phi node vector (NEW_PHI_TREE) which starts
	 filled with zeros (VEC_ZERO).  */

      /* Create a vector of 0s.  */
      tree zero = build_zero_cst (cr_index_scalar_type);
      tree vec_zero = build_vector_from_val (cr_index_vector_type, zero);

      /* Create a vector phi node.  */
      tree new_phi_tree = make_ssa_name (cr_index_vector_type);
      new_phi = create_phi_node (new_phi_tree, loop->header);
      set_vinfo_for_stmt (new_phi,
			  new_stmt_vec_info (new_phi, loop_vinfo));
      add_phi_arg (as_a <gphi *> (new_phi), vec_zero,
		   loop_preheader_edge (loop), UNKNOWN_LOCATION);

      /* Now take the condition from the loop's original cond_expr
	 (VEC_STMT) and produce a new cond_expr (INDEX_COND_EXPR) which for
	 every match uses values from the induction variable
	 (INDEX_BEFORE_INCR) otherwise uses values from the phi node
	 (NEW_PHI_TREE).
	 Finally, we update the phi (NEW_PHI_TREE) to take the value of
	 the new cond_expr (INDEX_COND_EXPR).  */

      /* Duplicate the condition from vec_stmt.  */
      tree ccompare = unshare_expr (gimple_assign_rhs1 (vec_stmt));

      /* Create a conditional, where the condition is taken from vec_stmt
	 (CCOMPARE), then is the induction index (INDEX_BEFORE_INCR) and
	 else is the phi (NEW_PHI_TREE).  */
      tree index_cond_expr = build3 (VEC_COND_EXPR, cr_index_vector_type,
				     ccompare, indx_before_incr,
				     new_phi_tree);
      induction_index = make_ssa_name (cr_index_vector_type);
      gimple *index_condition = gimple_build_assign (induction_index,
						     index_cond_expr);
      gsi_insert_before (&incr_gsi, index_condition, GSI_SAME_STMT);
      stmt_vec_info index_vec_info = new_stmt_vec_info (index_condition,
							loop_vinfo);
      STMT_VINFO_VECTYPE (index_vec_info) = cr_index_vector_type;
      set_vinfo_for_stmt (index_condition, index_vec_info);

      /* Update the phi with the vec cond.  */
      add_phi_arg (as_a <gphi *> (new_phi), induction_index,
		   loop_latch_edge (loop), UNKNOWN_LOCATION);
    }
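
  /* For illustration (assuming a 4-lane vector and a loop whose condition
     matches lane 1 on iteration 0 and lane 3 on iteration 1): the index IV
     takes the values {1, 2, 3, 4} and then {5, 6, 7, 8}, and INDUCTION_INDEX
     ends up as {0, 2, 0, 8} - each lane records the 1-based index of its
     last match, with 0 meaning "never matched".  */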
  /* 2. Create epilog code.
	The reduction epilog code operates across the elements of the vector
	of partial results computed by the vectorized loop.
	The reduction epilog code consists of:

	step 1: compute the scalar result in a vector (v_out2)
	step 2: extract the scalar result (s_out3) from the vector (v_out2)
	step 3: adjust the scalar result (s_out3) if needed.

	Step 1 can be accomplished using one of the following three schemes:
		(scheme 1) using reduc_fn, if available.
		(scheme 2) using whole-vector shifts, if available.
		(scheme 3) using a scalar loop.  In this case steps 1+2 above
			   are combined.

	The overall epilog code looks like this:

	  s_out0 = phi <s_loop>             # original EXIT_PHI
	  v_out1 = phi <VECT_DEF>           # NEW_EXIT_PHI
	  v_out2 = reduce <v_out1>          # step 1
	  s_out3 = extract_field <v_out2, 0>    # step 2
	  s_out4 = adjust_result <s_out3>       # step 3

	  (step 3 is optional, and steps 1 and 2 may be combined).
	  Lastly, the uses of s_out0 are replaced by s_out4.  */


  /* 2.1 Create new loop-exit-phis to preserve loop-closed form:
	 v_out1 = phi <VECT_DEF>
	 Store them in NEW_PHIS.  */

  exit_bb = single_exit (loop)->dest;
  prev_phi_info = NULL;
  new_phis.create (vect_defs.length ());
  FOR_EACH_VEC_ELT (vect_defs, i, def)
    {
      for (j = 0; j < ncopies; j++)
        {
	  tree new_def = copy_ssa_name (def);
	  phi = create_phi_node (new_def, exit_bb);
	  set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, loop_vinfo));
	  if (j == 0)
	    new_phis.quick_push (phi);
	  else
	    {
	      def = vect_get_vec_def_for_stmt_copy (dt, def);
	      STMT_VINFO_RELATED_STMT (prev_phi_info) = phi;
	    }

	  SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, def);
	  prev_phi_info = vinfo_for_stmt (phi);
        }
    }

  /* The epilogue is created for the outer-loop, i.e., for the loop being
     vectorized.  Create exit phis for the outer loop.  */
  if (double_reduc)
    {
      loop = outer_loop;
      exit_bb = single_exit (loop)->dest;
      inner_phis.create (vect_defs.length ());
      FOR_EACH_VEC_ELT (new_phis, i, phi)
	{
	  tree new_result = copy_ssa_name (PHI_RESULT (phi));
	  gphi *outer_phi = create_phi_node (new_result, exit_bb);
	  SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx,
			   PHI_RESULT (phi));
	  set_vinfo_for_stmt (outer_phi, new_stmt_vec_info (outer_phi,
							    loop_vinfo));
	  inner_phis.quick_push (phi);
	  new_phis[i] = outer_phi;
	  prev_phi_info = vinfo_for_stmt (outer_phi);
	  while (STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi)))
	    {
	      phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi));
	      new_result = copy_ssa_name (PHI_RESULT (phi));
	      outer_phi = create_phi_node (new_result, exit_bb);
	      SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx,
			       PHI_RESULT (phi));
	      set_vinfo_for_stmt (outer_phi, new_stmt_vec_info (outer_phi,
								loop_vinfo));
	      STMT_VINFO_RELATED_STMT (prev_phi_info) = outer_phi;
	      prev_phi_info = vinfo_for_stmt (outer_phi);
	    }
	}
    }

  exit_gsi = gsi_after_labels (exit_bb);

  /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
	 (i.e. when reduc_fn is not available) and in the final adjustment
	 code (if needed).  Also get the original scalar reduction variable as
	 defined in the loop.  In case STMT is a "pattern-stmt" (i.e. - it
	 represents a reduction pattern), the tree-code and scalar-def are
	 taken from the original stmt that the pattern-stmt (STMT) replaces.
	 Otherwise (it is a regular reduction) - the tree-code and scalar-def
	 are taken from STMT.  */
  orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
  if (!orig_stmt)
    {
      /* Regular reduction  */
      orig_stmt = stmt;
    }
  else
    {
      /* Reduction pattern  */
      stmt_vec_info stmt_vinfo = vinfo_for_stmt (orig_stmt);
      gcc_assert (STMT_VINFO_IN_PATTERN_P (stmt_vinfo));
      gcc_assert (STMT_VINFO_RELATED_STMT (stmt_vinfo) == stmt);
    }

  code = gimple_assign_rhs_code (orig_stmt);
  /* For MINUS_EXPR the initial vector is [init_val,0,...,0], therefore,
     partial results are added and not subtracted.  */
  if (code == MINUS_EXPR)
    code = PLUS_EXPR;

  scalar_dest = gimple_assign_lhs (orig_stmt);
  scalar_type = TREE_TYPE (scalar_dest);
  scalar_results.create (group_size);
  new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
  bitsize = TYPE_SIZE (scalar_type);

  /* In case this is a reduction in an inner-loop while vectorizing an outer
     loop - we don't need to extract a single scalar result at the end of the
     inner-loop (unless it is double reduction, i.e., the use of reduction is
     outside the outer-loop).  The final vector of partial results will be used
     in the vectorized outer-loop, or reduced to a scalar result at the end of
     the outer-loop.  */
  if (nested_in_vect_loop && !double_reduc)
    goto vect_finalize_reduction;

  /* SLP reduction without reduction chain, e.g.,
     # a1 = phi <a2, a0>
     # b1 = phi <b2, b0>
     a2 = operation (a1)
     b2 = operation (b1)  */
  slp_reduc = (slp_node && !GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)));

  /* True if we should implement SLP_REDUC using native reduction operations
     instead of scalar operations.  */
  direct_slp_reduc = (reduc_fn != IFN_LAST
		      && slp_reduc
		      && !TYPE_VECTOR_SUBPARTS (vectype).is_constant ());

  /* In case of reduction chain, e.g.,
     # a1 = phi <a3, a0>
     a2 = operation (a1)
     a3 = operation (a2),

     we may end up with more than one vector result.  Here we reduce them to
     one vector.  */
  if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) || direct_slp_reduc)
    {
      tree first_vect = PHI_RESULT (new_phis[0]);
      gassign *new_vec_stmt = NULL;
      vec_dest = vect_create_destination_var (scalar_dest, vectype);
      for (k = 1; k < new_phis.length (); k++)
        {
	  gimple *next_phi = new_phis[k];
	  tree second_vect = PHI_RESULT (next_phi);
	  tree tem = make_ssa_name (vec_dest, new_vec_stmt);
	  new_vec_stmt = gimple_build_assign (tem, code,
					      first_vect, second_vect);
	  gsi_insert_before (&exit_gsi, new_vec_stmt, GSI_SAME_STMT);
	  first_vect = tem;
        }

      new_phi_result = first_vect;
      if (new_vec_stmt)
	{
	  new_phis.truncate (0);
	  new_phis.safe_push (new_vec_stmt);
	}
    }
  /* Likewise if we couldn't use a single def-use cycle.  */
  else if (ncopies > 1)
    {
      gcc_assert (new_phis.length () == 1);
      tree first_vect = PHI_RESULT (new_phis[0]);
      gassign *new_vec_stmt = NULL;
      vec_dest = vect_create_destination_var (scalar_dest, vectype);
      gimple *next_phi = new_phis[0];
      for (int k = 1; k < ncopies; ++k)
	{
	  next_phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (next_phi));
	  tree second_vect = PHI_RESULT (next_phi);
	  tree tem = make_ssa_name (vec_dest, new_vec_stmt);
	  new_vec_stmt = gimple_build_assign (tem, code,
					      first_vect, second_vect);
	  gsi_insert_before (&exit_gsi, new_vec_stmt, GSI_SAME_STMT);
	  first_vect = tem;
	}
      new_phi_result = first_vect;
      new_phis.truncate (0);
      new_phis.safe_push (new_vec_stmt);
    }
  else
    new_phi_result = PHI_RESULT (new_phis[0]);
  if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION
      && reduc_fn != IFN_LAST)
    {
      /* For condition reductions, we have a vector (NEW_PHI_RESULT) containing
	 various data values where the condition matched and another vector
	 (INDUCTION_INDEX) containing all the indexes of those matches.  We
	 need to extract the last matching index (which will be the index with
	 highest value) and use this to index into the data vector.
	 For the case where there were no matches, the data vector will contain
	 all default values and the index vector will be all zeros.  */

      /* Get various versions of the type of the vector of indexes.  */
      tree index_vec_type = TREE_TYPE (induction_index);
      gcc_checking_assert (TYPE_UNSIGNED (index_vec_type));
      tree index_scalar_type = TREE_TYPE (index_vec_type);
      tree index_vec_cmp_type = build_same_sized_truth_vector_type
	(index_vec_type);

      /* Get an unsigned integer version of the type of the data vector.  */
      int scalar_precision
	= GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
      tree scalar_type_unsigned = make_unsigned_type (scalar_precision);
      tree vectype_unsigned = build_vector_type
	(scalar_type_unsigned, TYPE_VECTOR_SUBPARTS (vectype));

      /* First we need to create a vector (ZERO_VEC) of zeros and another
	 vector (MAX_INDEX_VEC) filled with the last matching index, which we
	 can create using a MAX reduction and then expanding.
	 In the case where the loop never made any matches, the max index will
	 be zero.  */

      /* Vector of {0, 0, 0,...}.  */
      tree zero_vec = make_ssa_name (vectype);
      tree zero_vec_rhs = build_zero_cst (vectype);
      gimple *zero_vec_stmt = gimple_build_assign (zero_vec, zero_vec_rhs);
      gsi_insert_before (&exit_gsi, zero_vec_stmt, GSI_SAME_STMT);

      /* Find maximum value from the vector of found indexes.  */
      tree max_index = make_ssa_name (index_scalar_type);
      gcall *max_index_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
							  1, induction_index);
      gimple_call_set_lhs (max_index_stmt, max_index);
      gsi_insert_before (&exit_gsi, max_index_stmt, GSI_SAME_STMT);

      /* Vector of {max_index, max_index, max_index,...}.  */
      tree max_index_vec = make_ssa_name (index_vec_type);
      tree max_index_vec_rhs = build_vector_from_val (index_vec_type,
						      max_index);
      gimple *max_index_vec_stmt = gimple_build_assign (max_index_vec,
							max_index_vec_rhs);
      gsi_insert_before (&exit_gsi, max_index_vec_stmt, GSI_SAME_STMT);

      /* Next we compare the new vector (MAX_INDEX_VEC) full of max indexes
	 with the vector (INDUCTION_INDEX) of found indexes, choosing values
	 from the data vector (NEW_PHI_RESULT) for matches, 0 (ZERO_VEC)
	 otherwise.  Only one value should match, resulting in a vector
	 (VEC_COND) with one data value and the rest zeros.
	 In the case where the loop never made any matches, every index will
	 match, resulting in a vector with all data values (which will all be
	 the default value).  */

      /* Compare the max index vector to the vector of found indexes to find
	 the position of the max value.  */
      tree vec_compare = make_ssa_name (index_vec_cmp_type);
      gimple *vec_compare_stmt = gimple_build_assign (vec_compare, EQ_EXPR,
						      induction_index,
						      max_index_vec);
      gsi_insert_before (&exit_gsi, vec_compare_stmt, GSI_SAME_STMT);

      /* Use the compare to choose either values from the data vector or
	 zeros.  */
      tree vec_cond = make_ssa_name (vectype);
      gimple *vec_cond_stmt = gimple_build_assign (vec_cond, VEC_COND_EXPR,
						   vec_compare, new_phi_result,
						   zero_vec);
      gsi_insert_before (&exit_gsi, vec_cond_stmt, GSI_SAME_STMT);

      /* Finally we need to extract the data value from the vector (VEC_COND)
	 into a scalar (MATCHED_DATA_REDUC).  Logically we want to do an OR
	 reduction, but because this doesn't exist, we can use a MAX reduction
	 instead.  The data value might be signed or a float so we need to cast
	 it first.
	 In the case where the loop never made any matches, the data values are
	 all identical, and so will reduce down correctly.  */

      /* Make the matched data values unsigned.  */
      tree vec_cond_cast = make_ssa_name (vectype_unsigned);
      tree vec_cond_cast_rhs = build1 (VIEW_CONVERT_EXPR, vectype_unsigned,
				       vec_cond);
      gimple *vec_cond_cast_stmt = gimple_build_assign (vec_cond_cast,
							VIEW_CONVERT_EXPR,
							vec_cond_cast_rhs);
      gsi_insert_before (&exit_gsi, vec_cond_cast_stmt, GSI_SAME_STMT);

      /* Reduce down to a scalar value.  */
      tree data_reduc = make_ssa_name (scalar_type_unsigned);
      gcall *data_reduc_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
							   1, vec_cond_cast);
      gimple_call_set_lhs (data_reduc_stmt, data_reduc);
      gsi_insert_before (&exit_gsi, data_reduc_stmt, GSI_SAME_STMT);

      /* Convert the reduced value back to the result type and set as the
	 result.  */
      gimple_seq stmts = NULL;
      new_temp = gimple_build (&stmts, VIEW_CONVERT_EXPR, scalar_type,
			       data_reduc);
      gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
      scalar_results.safe_push (new_temp);
    }
  else if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION
	   && reduc_fn == IFN_LAST)
    {
      /* Condition reduction without supported IFN_REDUC_MAX.  Generate
	 idx = 0;
	 idx_val = induction_index[0];
	 val = data_reduc[0];
	 for (idx = 0, val = init, i = 0; i < nelts; ++i)
	   if (induction_index[i] > idx_val)
	     val = data_reduc[i], idx_val = induction_index[i];
	 return val;  */

      tree data_eltype = TREE_TYPE (TREE_TYPE (new_phi_result));
      tree idx_eltype = TREE_TYPE (TREE_TYPE (induction_index));
      unsigned HOST_WIDE_INT el_size = tree_to_uhwi (TYPE_SIZE (idx_eltype));
      poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (TREE_TYPE (induction_index));
      /* Enforced by vectorizable_reduction, which ensures we have target
	 support before allowing a conditional reduction on variable-length
	 vectors.  */
      unsigned HOST_WIDE_INT v_size = el_size * nunits.to_constant ();
      tree idx_val = NULL_TREE, val = NULL_TREE;
      for (unsigned HOST_WIDE_INT off = 0; off < v_size; off += el_size)
	{
	  tree old_idx_val = idx_val;
	  tree old_val = val;
	  idx_val = make_ssa_name (idx_eltype);
	  epilog_stmt = gimple_build_assign (idx_val, BIT_FIELD_REF,
					     build3 (BIT_FIELD_REF, idx_eltype,
						     induction_index,
						     bitsize_int (el_size),
						     bitsize_int (off)));
	  gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
	  val = make_ssa_name (data_eltype);
	  epilog_stmt = gimple_build_assign (val, BIT_FIELD_REF,
					     build3 (BIT_FIELD_REF,
						     data_eltype,
						     new_phi_result,
						     bitsize_int (el_size),
						     bitsize_int (off)));
	  gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
	  if (off != 0)
	    {
	      tree new_idx_val = idx_val;
	      tree new_val = val;
	      if (off != v_size - el_size)
		{
		  new_idx_val = make_ssa_name (idx_eltype);
		  epilog_stmt = gimple_build_assign (new_idx_val,
						     MAX_EXPR, idx_val,
						     old_idx_val);
		  gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
		}
	      new_val = make_ssa_name (data_eltype);
	      epilog_stmt = gimple_build_assign (new_val,
						 COND_EXPR,
						 build2 (GT_EXPR,
							 boolean_type_node,
							 idx_val,
							 old_idx_val),
						 val, old_val);
	      gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
	      idx_val = new_idx_val;
	      val = new_val;
	    }
	}
      /* Convert the reduced value back to the result type and set as the
	 result.  */
      gimple_seq stmts = NULL;
      val = gimple_convert (&stmts, scalar_type, val);
      gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
      scalar_results.safe_push (val);
    }
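
  /* For illustration (assuming 4 lanes): the generated epilogue above
     extracts idx[0..3] and data[0..3] with BIT_FIELD_REFs and folds them
     pairwise, e.g. idx_val = MAX (idx[1], idx[0]) and
     val = idx[1] > idx[0] ? data[1] : data[0], and so on for lanes 2 and 3,
     leaving VAL holding the data value of the highest recorded index.  */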
  /* 2.3 Create the reduction code, using one of the three schemes described
	 above.  In SLP we simply need to extract all the elements from the
	 vector (without reducing them), so we use scalar shifts.  */
  else if (reduc_fn != IFN_LAST && !slp_reduc)
    {
      tree tmp;
      tree vec_elem_type;

      /* Case 1:  Create:
	 v_out2 = reduc_expr <v_out1>  */

      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location,
			 "Reduce using direct vector reduction.\n");

      vec_elem_type = TREE_TYPE (TREE_TYPE (new_phi_result));
      if (!useless_type_conversion_p (scalar_type, vec_elem_type))
	{
	  tree tmp_dest
	    = vect_create_destination_var (scalar_dest, vec_elem_type);
	  epilog_stmt = gimple_build_call_internal (reduc_fn, 1,
						    new_phi_result);
	  gimple_set_lhs (epilog_stmt, tmp_dest);
	  new_temp = make_ssa_name (tmp_dest, epilog_stmt);
	  gimple_set_lhs (epilog_stmt, new_temp);
	  gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);

	  epilog_stmt = gimple_build_assign (new_scalar_dest, NOP_EXPR,
					     new_temp);
	}
      else
	{
	  epilog_stmt = gimple_build_call_internal (reduc_fn, 1,
						    new_phi_result);
	  gimple_set_lhs (epilog_stmt, new_scalar_dest);
	}

      new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
      gimple_set_lhs (epilog_stmt, new_temp);
      gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);

      if ((STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
	   == INTEGER_INDUC_COND_REDUCTION)
	  && !operand_equal_p (initial_def, induc_val, 0))
	{
	  /* Earlier we set the initial value to be a vector of induc_val
	     values.  Check the result and if it is induc_val then replace
	     with the original initial value, unless induc_val is
	     the same as initial_def already.  */
	  tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp,
				  induc_val);

	  tmp = make_ssa_name (new_scalar_dest);
	  epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
					     initial_def, new_temp);
	  gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
	  new_temp = tmp;
	}

      scalar_results.safe_push (new_temp);
    }
  else if (direct_slp_reduc)
    {
      /* Here we create one vector for each of the GROUP_SIZE results,
	 with the elements for other SLP statements replaced with the
	 neutral value.  We can then do a normal reduction on each vector.  */

      /* Enforced by vectorizable_reduction.  */
      gcc_assert (new_phis.length () == 1);
      gcc_assert (pow2p_hwi (group_size));

      slp_tree orig_phis_slp_node = slp_node_instance->reduc_phis;
      vec<gimple *> orig_phis = SLP_TREE_SCALAR_STMTS (orig_phis_slp_node);
      gimple_seq seq = NULL;

      /* Build a vector {0, 1, 2, ...}, with the same number of elements
	 and the same element size as VECTYPE.  */
      tree index = build_index_vector (vectype, 0, 1);
      tree index_type = TREE_TYPE (index);
      tree index_elt_type = TREE_TYPE (index_type);
      tree mask_type = build_same_sized_truth_vector_type (index_type);

      /* Create a vector that, for each element, identifies which of
	 the GROUP_SIZE results should use it.  */
      tree index_mask = build_int_cst (index_elt_type, group_size - 1);
      index = gimple_build (&seq, BIT_AND_EXPR, index_type, index,
			    build_vector_from_val (index_type, index_mask));

      /* Get a neutral vector value.  This is simply a splat of the neutral
	 scalar value if we have one, otherwise the initial scalar value
	 is itself a neutral value.  */
      tree vector_identity = NULL_TREE;
      if (neutral_op)
	vector_identity = gimple_build_vector_from_val (&seq, vectype,
							neutral_op);
      for (unsigned int i = 0; i < group_size; ++i)
	{
	  /* If there's no universal neutral value, we can use the
	     initial scalar value from the original PHI.  This is used
	     for MIN and MAX reduction, for example.  */
	  if (!neutral_op)
	    {
	      tree scalar_value
		= PHI_ARG_DEF_FROM_EDGE (orig_phis[i],
					 loop_preheader_edge (loop));
	      vector_identity = gimple_build_vector_from_val (&seq, vectype,
							      scalar_value);
	    }

	  /* Calculate the equivalent of:

	     sel[j] = (index[j] == i);

	     which selects the elements of NEW_PHI_RESULT that should
	     be included in the result.  */
	  tree compare_val = build_int_cst (index_elt_type, i);
	  compare_val = build_vector_from_val (index_type, compare_val);
	  tree sel = gimple_build (&seq, EQ_EXPR, mask_type,
				   index, compare_val);

	  /* Calculate the equivalent of:

	     vec = seq ? new_phi_result : vector_identity;

	     VEC is now suitable for a full vector reduction.  */
	  tree vec = gimple_build (&seq, VEC_COND_EXPR, vectype,
				   sel, new_phi_result, vector_identity);

	  /* Do the reduction and convert it to the appropriate type.  */
	  tree scalar = gimple_build (&seq, as_combined_fn (reduc_fn),
				      TREE_TYPE (vectype), vec);
	  scalar = gimple_convert (&seq, scalar_type, scalar);
	  scalar_results.safe_push (scalar);
	}
      gsi_insert_seq_before (&exit_gsi, seq, GSI_SAME_STMT);
    }
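
  /* For illustration (assuming GROUP_SIZE == 2 and a 4-lane vector): INDEX
     becomes {0, 1, 0, 1} after the BIT_AND above, so the i == 0 iteration
     selects lanes {0, 2} of NEW_PHI_RESULT (the other lanes are replaced by
     the neutral value) and the i == 1 iteration selects lanes {1, 3}; each
     masked vector is then reduced with REDUC_FN.  */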
  else
    {
      bool reduce_with_shift;
      tree vec_temp;

      /* COND reductions all do the final reduction with MAX_EXPR
	 or MIN_EXPR.  */
      if (code == COND_EXPR)
	{
	  if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
	      == INTEGER_INDUC_COND_REDUCTION)
	    code = induc_code;
	  else
	    code = MAX_EXPR;
	}

      /* See if the target wants to do the final (shift) reduction
	 in a vector mode of smaller size and first reduce upper/lower
	 halves against each other.  */
      enum machine_mode mode1 = mode;
      tree vectype1 = vectype;
      unsigned sz = tree_to_uhwi (TYPE_SIZE_UNIT (vectype));
      unsigned sz1 = sz;
      if (!slp_reduc
	  && (mode1 = targetm.vectorize.split_reduction (mode)) != mode)
	sz1 = GET_MODE_SIZE (mode1).to_constant ();

      vectype1 = get_vectype_for_scalar_type_and_size (scalar_type, sz1);
      reduce_with_shift = have_whole_vector_shift (mode1);
      if (!VECTOR_MODE_P (mode1))
	reduce_with_shift = false;
      else
	{
	  optab optab = optab_for_tree_code (code, vectype1, optab_default);
	  if (optab_handler (optab, mode1) == CODE_FOR_nothing)
	    reduce_with_shift = false;
	}

      /* First reduce the vector to the desired vector size we should
	 do shift reduction on by combining upper and lower halves.  */
      new_temp = new_phi_result;
      while (sz > sz1)
	{
	  gcc_assert (!slp_reduc);
	  sz /= 2;
	  vectype1 = get_vectype_for_scalar_type_and_size (scalar_type, sz);

	  /* The target has to make sure we support lowpart/highpart
	     extraction, either via direct vector extract or through
	     an integer mode punning.  */
	  tree dst1, dst2;
	  if (convert_optab_handler (vec_extract_optab,
				     TYPE_MODE (TREE_TYPE (new_temp)),
				     TYPE_MODE (vectype1))
	      != CODE_FOR_nothing)
	    {
	      /* Extract sub-vectors directly once vec_extract becomes
		 a conversion optab.  */
	      dst1 = make_ssa_name (vectype1);
	      epilog_stmt
		= gimple_build_assign (dst1, BIT_FIELD_REF,
				       build3 (BIT_FIELD_REF, vectype1,
					       new_temp, TYPE_SIZE (vectype1),
					       bitsize_int (0)));
	      gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
	      dst2 = make_ssa_name (vectype1);
	      epilog_stmt
		= gimple_build_assign (dst2, BIT_FIELD_REF,
				       build3 (BIT_FIELD_REF, vectype1,
					       new_temp, TYPE_SIZE (vectype1),
					       bitsize_int (sz * BITS_PER_UNIT)));
	      gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
	    }
	  else
	    {
	      /* Extract via punning to appropriately sized integer mode
		 vector.  */
	      tree eltype = build_nonstandard_integer_type (sz * BITS_PER_UNIT,
							    1);
	      tree etype = build_vector_type (eltype, 2);
	      gcc_assert (convert_optab_handler (vec_extract_optab,
						 TYPE_MODE (etype),
						 TYPE_MODE (eltype))
			  != CODE_FOR_nothing);
	      tree tem = make_ssa_name (etype);
	      epilog_stmt = gimple_build_assign (tem, VIEW_CONVERT_EXPR,
						 build1 (VIEW_CONVERT_EXPR,
							 etype, new_temp));
	      gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
	      new_temp = tem;
	      tem = make_ssa_name (eltype);
	      epilog_stmt
		= gimple_build_assign (tem, BIT_FIELD_REF,
				       build3 (BIT_FIELD_REF, eltype,
					       new_temp, TYPE_SIZE (eltype),
					       bitsize_int (0)));
	      gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
	      dst1 = make_ssa_name (vectype1);
	      epilog_stmt = gimple_build_assign (dst1, VIEW_CONVERT_EXPR,
						 build1 (VIEW_CONVERT_EXPR,
							 vectype1, tem));
	      gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
	      tem = make_ssa_name (eltype);
	      epilog_stmt
		= gimple_build_assign (tem, BIT_FIELD_REF,
				       build3 (BIT_FIELD_REF, eltype,
					       new_temp, TYPE_SIZE (eltype),
					       bitsize_int (sz * BITS_PER_UNIT)));
	      gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
	      dst2 = make_ssa_name (vectype1);
	      epilog_stmt = gimple_build_assign (dst2, VIEW_CONVERT_EXPR,
						 build1 (VIEW_CONVERT_EXPR,
							 vectype1, tem));
	      gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
	    }

	  new_temp = make_ssa_name (vectype1);
	  epilog_stmt = gimple_build_assign (new_temp, code, dst1, dst2);
	  gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
	}
      if (reduce_with_shift && !slp_reduc)
	{
	  int element_bitsize = tree_to_uhwi (bitsize);
	  /* Enforced by vectorizable_reduction, which disallows SLP reductions
	     for variable-length vectors and also requires direct target
	     support for loop reductions.  */
	  int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
	  int nelements = vec_size_in_bits / element_bitsize;
	  vec_perm_builder sel;
	  vec_perm_indices indices;

	  int elt_offset;

	  tree zero_vec = build_zero_cst (vectype1);
	  /* Case 2: Create:
	     for (offset = nelements/2; offset >= 1; offset/=2)
		{
		  Create:  va' = vec_shift <va, offset>
		  Create:  va = vop <va, va'>
		}  */

	  tree rhs;

	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_NOTE, vect_location,
			     "Reduce using vector shifts\n");

	  mode1 = TYPE_MODE (vectype1);
	  vec_dest = vect_create_destination_var (scalar_dest, vectype1);
	  for (elt_offset = nelements / 2;
	       elt_offset >= 1;
	       elt_offset /= 2)
	    {
	      calc_vec_perm_mask_for_shift (elt_offset, nelements, &sel);
	      indices.new_vector (sel, 2, nelements);
	      tree mask = vect_gen_perm_mask_any (vectype1, indices);
	      epilog_stmt = gimple_build_assign (vec_dest, VEC_PERM_EXPR,
						 new_temp, zero_vec, mask);
	      new_name = make_ssa_name (vec_dest, epilog_stmt);
	      gimple_assign_set_lhs (epilog_stmt, new_name);
	      gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);

	      epilog_stmt = gimple_build_assign (vec_dest, code, new_name,
						 new_temp);
	      new_temp = make_ssa_name (vec_dest, epilog_stmt);
	      gimple_assign_set_lhs (epilog_stmt, new_temp);
	      gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
	    }

	  /* 2.4  Extract the final scalar result.  Create:
	     s_out3 = extract_field <v_out2, bitpos>  */

	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_NOTE, vect_location,
			     "extract scalar result\n");

	  rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp,
			bitsize, bitsize_zero_node);
	  epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
	  new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
	  gimple_assign_set_lhs (epilog_stmt, new_temp);
	  gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
	  scalar_results.safe_push (new_temp);
	}
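
      /* For illustration (assuming a 4-element vector {a0, a1, a2, a3} and a
	 PLUS reduction): the first iteration shifts by 2 elements and adds,
	 giving {a0+a2, a1+a3, ...}; the second shifts by 1 and adds, giving
	 {a0+a1+a2+a3, ...}; the BIT_FIELD_REF above then extracts element 0
	 as the scalar result.  The shifted-in lanes come from ZERO_VEC, so
	 the upper (junk) lanes never contaminate element 0.  */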
      else
	{
	  /* Case 3: Create:
	     s = extract_field <v_out2, 0>
	     for (offset = element_size;
		  offset < vector_size;
		  offset += element_size;)
	       {
		 Create:  s' = extract_field <v_out2, offset>
		 Create:  s = op <s, s'>  // For non SLP cases
	       }  */

	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_NOTE, vect_location,
			     "Reduce using scalar code.\n");

	  int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
	  int element_bitsize = tree_to_uhwi (bitsize);
	  FOR_EACH_VEC_ELT (new_phis, i, new_phi)
	    {
	      int bit_offset;
	      if (gimple_code (new_phi) == GIMPLE_PHI)
		vec_temp = PHI_RESULT (new_phi);
	      else
		vec_temp = gimple_assign_lhs (new_phi);
	      tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, bitsize,
				 bitsize_zero_node);
	      epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
	      new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
	      gimple_assign_set_lhs (epilog_stmt, new_temp);
	      gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);

	      /* In SLP we don't need to apply reduction operation, so we just
		 collect s' values in SCALAR_RESULTS.  */
	      if (slp_reduc)
		scalar_results.safe_push (new_temp);

	      for (bit_offset = element_bitsize;
		   bit_offset < vec_size_in_bits;
		   bit_offset += element_bitsize)
		{
		  tree bitpos = bitsize_int (bit_offset);
		  tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp,
				     bitsize, bitpos);

		  epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
		  new_name = make_ssa_name (new_scalar_dest, epilog_stmt);
		  gimple_assign_set_lhs (epilog_stmt, new_name);
		  gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);

		  if (slp_reduc)
		    {
		      /* In SLP we don't need to apply reduction operation, so
			 we just collect s' values in SCALAR_RESULTS.  */
		      new_temp = new_name;
		      scalar_results.safe_push (new_name);
		    }
		  else
		    {
		      epilog_stmt = gimple_build_assign (new_scalar_dest, code,
							 new_name, new_temp);
		      new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
		      gimple_assign_set_lhs (epilog_stmt, new_temp);
		      gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
		    }
		}
	    }

	  /* The only case where we need to reduce scalar results in SLP, is
	     unrolling.  If the size of SCALAR_RESULTS is greater than
	     GROUP_SIZE, we reduce them combining elements modulo
	     GROUP_SIZE.  */
	  if (slp_reduc)
	    {
	      tree res, first_res, new_res;
	      gimple *new_stmt;

	      /* Reduce multiple scalar results in case of SLP unrolling.  */
	      for (j = group_size; scalar_results.iterate (j, &res);
		   j++)
		{
		  first_res = scalar_results[j % group_size];
		  new_stmt = gimple_build_assign (new_scalar_dest, code,
						  first_res, res);
		  new_res = make_ssa_name (new_scalar_dest, new_stmt);
		  gimple_assign_set_lhs (new_stmt, new_res);
		  gsi_insert_before (&exit_gsi, new_stmt, GSI_SAME_STMT);
		  scalar_results[j % group_size] = new_res;
		}
	    }
	  else
	    /* Not SLP - we have one scalar to keep in SCALAR_RESULTS.  */
	    scalar_results.safe_push (new_temp);
	}
      if ((STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
	   == INTEGER_INDUC_COND_REDUCTION)
	  && !operand_equal_p (initial_def, induc_val, 0))
	{
	  /* Earlier we set the initial value to be a vector of induc_val
	     values.  Check the result and if it is induc_val then replace
	     with the original initial value, unless induc_val is
	     the same as initial_def already.  */
	  tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp,
				  induc_val);

	  tree tmp = make_ssa_name (new_scalar_dest);
	  epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
					     initial_def, new_temp);
	  gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
	  scalar_results[0] = tmp;
	}
    }
vect_finalize_reduction:

  if (double_reduc)
    loop = loop->inner;

  /* 2.5 Adjust the final result by the initial value of the reduction
	 variable. (When such adjustment is not needed, then
	 'adjustment_def' is zero).  For example, if code is PLUS we create:
	 new_temp = loop_exit_def + adjustment_def  */

  if (adjustment_def)
    {
      gcc_assert (!slp_reduc);
      if (nested_in_vect_loop)
	{
	  new_phi = new_phis[0];
	  gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) == VECTOR_TYPE);
	  expr = build2 (code, vectype, PHI_RESULT (new_phi), adjustment_def);
	  new_dest = vect_create_destination_var (scalar_dest, vectype);
	}
      else
	{
	  new_temp = scalar_results[0];
	  gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
	  expr = build2 (code, scalar_type, new_temp, adjustment_def);
	  new_dest = vect_create_destination_var (scalar_dest, scalar_type);
	}

      epilog_stmt = gimple_build_assign (new_dest, expr);
      new_temp = make_ssa_name (new_dest, epilog_stmt);
      gimple_assign_set_lhs (epilog_stmt, new_temp);
      gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
      if (nested_in_vect_loop)
	{
	  set_vinfo_for_stmt (epilog_stmt,
			      new_stmt_vec_info (epilog_stmt, loop_vinfo));
	  STMT_VINFO_RELATED_STMT (vinfo_for_stmt (epilog_stmt)) =
	    STMT_VINFO_RELATED_STMT (vinfo_for_stmt (new_phi));

	  if (!double_reduc)
	    scalar_results.quick_push (new_temp);
	  else
	    scalar_results[0] = new_temp;
	}
      else
	scalar_results[0] = new_temp;

      new_phis[0] = epilog_stmt;
    }
  /* 2.6  Handle the loop-exit phis.  Replace the uses of scalar loop-exit
	  phis with new adjusted scalar results, i.e., replace use <s_out0>
	  with use <s_out4>.

     Transform:
	loop_exit:
	  s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
	  v_out1 = phi <VECT_DEF>               # NEW_EXIT_PHI
	  v_out2 = reduce <v_out1>
	  s_out3 = extract_field <v_out2, 0>
	  s_out4 = adjust_result <s_out3>
	  use <s_out0>
	  use <s_out0>

     into:

	loop_exit:
	  s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
	  v_out1 = phi <VECT_DEF>               # NEW_EXIT_PHI
	  v_out2 = reduce <v_out1>
	  s_out3 = extract_field <v_out2, 0>
	  s_out4 = adjust_result <s_out3>
	  use <s_out4>
	  use <s_out4> */


  /* In SLP reduction chain we reduce vector results into one vector if
     necessary, hence we set here GROUP_SIZE to 1.  SCALAR_DEST is the LHS of
     the last stmt in the reduction chain, since we are looking for the loop
     exit phi node.  */
  if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
    {
      gimple *dest_stmt = SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1];
      /* Handle reduction patterns.  */
      if (STMT_VINFO_RELATED_STMT (vinfo_for_stmt (dest_stmt)))
	dest_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (dest_stmt));

      scalar_dest = gimple_assign_lhs (dest_stmt);
      group_size = 1;
    }
  /* In SLP we may have several statements in NEW_PHIS and REDUCTION_PHIS (in
     case that GROUP_SIZE is greater than vectorization factor).  Therefore, we
     need to match SCALAR_RESULTS with corresponding statements.  The first
     (GROUP_SIZE / number of new vector stmts) scalar results correspond to
     the first vector stmt, etc.
     (RATIO is equal to (GROUP_SIZE / number of new vector stmts)).  */
  if (group_size > new_phis.length ())
    {
      ratio = group_size / new_phis.length ();
      gcc_assert (!(group_size % new_phis.length ()));
    }
  else
    ratio = 1;

  for (k = 0; k < group_size; k++)
    {
      if (k % ratio == 0)
	{
	  epilog_stmt = new_phis[k / ratio];
	  reduction_phi = reduction_phis[k / ratio];
	  if (double_reduc)
	    inner_phi = inner_phis[k / ratio];
	}

      if (slp_reduc)
	{
	  gimple *current_stmt = SLP_TREE_SCALAR_STMTS (slp_node)[k];

	  orig_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (current_stmt));
	  /* SLP statements can't participate in patterns.  */
	  gcc_assert (!orig_stmt);
	  scalar_dest = gimple_assign_lhs (current_stmt);
	}

      phis.create (3);
      /* Find the loop-closed-use at the loop exit of the original scalar
	 result.  (The reduction result is expected to have two immediate uses -
	 one at the latch block, and one at the loop exit).  */
      FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
	if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p)))
	    && !is_gimple_debug (USE_STMT (use_p)))
	  phis.safe_push (USE_STMT (use_p));

      /* While we expect to have found an exit_phi because of loop-closed-ssa
	 form we can end up without one if the scalar cycle is dead.  */

      FOR_EACH_VEC_ELT (phis, i, exit_phi)
	{
	  if (outer_loop)
	    {
	      stmt_vec_info exit_phi_vinfo = vinfo_for_stmt (exit_phi);
	      gphi *vect_phi;

	      /* FORNOW. Currently not supporting the case that an inner-loop
		 reduction is not used in the outer-loop (but only outside the
		 outer-loop), unless it is double reduction.  */
	      gcc_assert ((STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
			   && !STMT_VINFO_LIVE_P (exit_phi_vinfo))
			  || double_reduc);

	      if (double_reduc)
		STMT_VINFO_VEC_STMT (exit_phi_vinfo) = inner_phi;
	      else
		STMT_VINFO_VEC_STMT (exit_phi_vinfo) = epilog_stmt;
	      if (!double_reduc
		  || STMT_VINFO_DEF_TYPE (exit_phi_vinfo)
		       != vect_double_reduction_def)
		continue;
	      /* Handle double reduction:

		 stmt1: s1 = phi <s0, s2>  - double reduction phi (outer loop)
		 stmt2: s3 = phi <s1, s4>  - (regular) reduc phi (inner loop)
		 stmt3: s4 = use (s3)      - (regular) reduc stmt (inner loop)
		 stmt4: s2 = phi <s4>      - double reduction stmt (outer loop)

		 At that point the regular reduction (stmt2 and stmt3) is
		 already vectorized, as well as the exit phi node, stmt4.
		 Here we vectorize the phi node of double reduction, stmt1, and
		 update all relevant statements.  */

	      /* Go through all the uses of s2 to find double reduction phi
		 node, i.e., stmt1 above.  */
	      orig_name = PHI_RESULT (exit_phi);
	      FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
		{
		  stmt_vec_info use_stmt_vinfo;
		  stmt_vec_info new_phi_vinfo;
		  tree vect_phi_init, preheader_arg, vect_phi_res;
		  basic_block bb = gimple_bb (use_stmt);
		  gimple *use;

		  /* Check that USE_STMT is really double reduction phi
		     node.  */
		  if (gimple_code (use_stmt) != GIMPLE_PHI
		      || gimple_phi_num_args (use_stmt) != 2
		      || bb->loop_father != outer_loop)
		    continue;
		  use_stmt_vinfo = vinfo_for_stmt (use_stmt);
		  if (!use_stmt_vinfo
		      || STMT_VINFO_DEF_TYPE (use_stmt_vinfo)
			   != vect_double_reduction_def)
		    continue;

		  /* Create vector phi node for double reduction:
		     vs1 = phi <vs0, vs2>
		     vs1 was created previously in this function by a call to
		       vect_get_vec_def_for_operand and is stored in
		       vec_initial_def;
		     vs2 is defined by INNER_PHI, the vectorized EXIT_PHI;
		     vs0 is created here.  */

		  /* Create vector phi node.  */
		  vect_phi = create_phi_node (vec_initial_def, bb);
		  new_phi_vinfo = new_stmt_vec_info (vect_phi,
				    loop_vec_info_for_loop (outer_loop));
		  set_vinfo_for_stmt (vect_phi, new_phi_vinfo);

		  /* Create vs0 - initial def of the double reduction phi.  */
		  preheader_arg = PHI_ARG_DEF_FROM_EDGE (use_stmt,
					   loop_preheader_edge (outer_loop));
		  vect_phi_init = get_initial_def_for_reduction
		    (stmt, preheader_arg, NULL);

		  /* Update phi node arguments with vs0 and vs2.  */
		  add_phi_arg (vect_phi, vect_phi_init,
			       loop_preheader_edge (outer_loop),
			       UNKNOWN_LOCATION);
		  add_phi_arg (vect_phi, PHI_RESULT (inner_phi),
			       loop_latch_edge (outer_loop), UNKNOWN_LOCATION);
		  if (dump_enabled_p ())
		    {
		      dump_printf_loc (MSG_NOTE, vect_location,
				       "created double reduction phi node: ");
		      dump_gimple_stmt (MSG_NOTE, TDF_SLIM, vect_phi, 0);
		    }

		  vect_phi_res = PHI_RESULT (vect_phi);

		  /* Replace the use, i.e., set the correct vs1 in the regular
		     reduction phi node.  FORNOW, NCOPIES is always 1, so the
		     loop is redundant.  */
		  use = reduction_phi;
		  for (j = 0; j < ncopies; j++)
		    {
		      edge pr_edge = loop_preheader_edge (loop);
		      SET_PHI_ARG_DEF (use, pr_edge->dest_idx, vect_phi_res);
		      use = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (use));
		    }
		}
      if (nested_in_vect_loop)
        {
          if (double_reduc)
            loop = outer_loop;
          else
            continue;
        }

      /* Find the loop-closed-use at the loop exit of the original scalar
         result.  (The reduction result is expected to have two immediate uses,
         one at the latch block, and one at the loop exit).  For double
         reductions we are looking for exit phis of the outer loop.  */
      FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
        {
          if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
            {
              if (!is_gimple_debug (USE_STMT (use_p)))
                phis.safe_push (USE_STMT (use_p));
            }
          else
            {
              if (double_reduc && gimple_code (USE_STMT (use_p)) == GIMPLE_PHI)
                {
                  tree phi_res = PHI_RESULT (USE_STMT (use_p));

                  FOR_EACH_IMM_USE_FAST (phi_use_p, phi_imm_iter, phi_res)
                    {
                      if (!flow_bb_inside_loop_p (loop,
                                                  gimple_bb (USE_STMT (phi_use_p)))
                          && !is_gimple_debug (USE_STMT (phi_use_p)))
                        phis.safe_push (USE_STMT (phi_use_p));
                    }
                }
            }
        }

      FOR_EACH_VEC_ELT (phis, i, exit_phi)
        {
          /* Replace the uses:  */
          orig_name = PHI_RESULT (exit_phi);
          scalar_result = scalar_results[k];
          FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
            FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
              SET_USE (use_p, scalar_result);
        }
    }
}
/* Return a vector of type VECTYPE that is equal to the vector select
   operation "MASK ? VEC : IDENTITY".  Insert the select statements
   before GSI.  */

static tree
merge_with_identity (gimple_stmt_iterator *gsi, tree mask, tree vectype,
                     tree vec, tree identity)
{
  tree cond = make_temp_ssa_name (vectype, NULL, "cond");
  gimple *new_stmt = gimple_build_assign (cond, VEC_COND_EXPR,
                                          mask, vec, identity);
  gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
  return cond;
}

/* Successively apply CODE to each element of VECTOR_RHS, in left-to-right
   order, starting with LHS.  Insert the extraction statements before GSI and
   associate the new scalar SSA names with variable SCALAR_DEST.
   Return the SSA name for the result.  */
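/* For instance, with a four-element vector V and CODE == PLUS_EXPR the
   sequence built below is conceptually (sketch only, SSA names invented):

     s0 = BIT_FIELD_REF <V, bits, 0>;        t0 = lhs + s0;
     s1 = BIT_FIELD_REF <V, bits, bits>;     t1 = t0 + s1;
     s2 = BIT_FIELD_REF <V, bits, 2*bits>;   t2 = t1 + s2;
     s3 = BIT_FIELD_REF <V, bits, 3*bits>;   t3 = t2 + s3;

   and T3 is returned, preserving the strict left-to-right association.  */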
static tree
vect_expand_fold_left (gimple_stmt_iterator *gsi, tree scalar_dest,
                       tree_code code, tree lhs, tree vector_rhs)
{
  tree vectype = TREE_TYPE (vector_rhs);
  tree scalar_type = TREE_TYPE (vectype);
  tree bitsize = TYPE_SIZE (scalar_type);
  unsigned HOST_WIDE_INT vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
  unsigned HOST_WIDE_INT element_bitsize = tree_to_uhwi (bitsize);

  for (unsigned HOST_WIDE_INT bit_offset = 0;
       bit_offset < vec_size_in_bits;
       bit_offset += element_bitsize)
    {
      tree bitpos = bitsize_int (bit_offset);
      tree rhs = build3 (BIT_FIELD_REF, scalar_type, vector_rhs,
                         bitsize, bitpos);

      gassign *stmt = gimple_build_assign (scalar_dest, rhs);
      rhs = make_ssa_name (scalar_dest, stmt);
      gimple_assign_set_lhs (stmt, rhs);
      gsi_insert_before (gsi, stmt, GSI_SAME_STMT);

      stmt = gimple_build_assign (scalar_dest, code, lhs, rhs);
      tree new_name = make_ssa_name (scalar_dest, stmt);
      gimple_assign_set_lhs (stmt, new_name);
      gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
      lhs = new_name;
    }
  return lhs;
}
/* Perform an in-order reduction (FOLD_LEFT_REDUCTION).  STMT is the
   statement that sets the live-out value.  REDUC_DEF_STMT is the phi
   statement.  CODE is the operation performed by STMT and OPS are
   its scalar operands.  REDUC_INDEX is the index of the operand in
   OPS that is set by REDUC_DEF_STMT.  REDUC_FN is the function that
   implements in-order reduction, or IFN_LAST if we should open-code it.
   VECTYPE_IN is the type of the vector input.  MASKS specifies the masks
   that should be used to control the operation in a fully-masked loop.  */
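/* A scalar loop that needs this strategy is, for example:

     double s = 0.0;
     for (i = 0; i < n; i++)
       s += a[i];

   compiled without -ffast-math: the FP additions may not be reassociated,
   so each vector of A is folded into S in element order, either through the
   target's in-order reduction internal function or by the open-coded
   expansion in vect_expand_fold_left.  */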
static bool
vectorize_fold_left_reduction (gimple *stmt, gimple_stmt_iterator *gsi,
                               gimple **vec_stmt, slp_tree slp_node,
                               gimple *reduc_def_stmt,
                               tree_code code, internal_fn reduc_fn,
                               tree ops[3], tree vectype_in,
                               int reduc_index, vec_loop_masks *masks)
{
  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
  gimple *new_stmt = NULL;

  int ncopies = vect_get_num_copies (loop_vinfo, vectype_in);

  gcc_assert (!nested_in_vect_loop_p (loop, stmt));
  gcc_assert (ncopies == 1);
  gcc_assert (TREE_CODE_LENGTH (code) == binary_op);
  gcc_assert (reduc_index == (code == MINUS_EXPR ? 0 : 1));
  gcc_assert (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
              == FOLD_LEFT_REDUCTION);

  gcc_assert (known_eq (TYPE_VECTOR_SUBPARTS (vectype_out),
                        TYPE_VECTOR_SUBPARTS (vectype_in)));

  tree op0 = ops[1 - reduc_index];

  int group_size = 1;
  gimple *scalar_dest_def;
  auto_vec<tree> vec_oprnds0;
  if (slp_node)
    {
      vect_get_vec_defs (op0, NULL_TREE, stmt, &vec_oprnds0, NULL, slp_node);
      group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
      scalar_dest_def = SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1];
    }
  else
    {
      tree loop_vec_def0 = vect_get_vec_def_for_operand (op0, stmt);
      vec_oprnds0.create (1);
      vec_oprnds0.quick_push (loop_vec_def0);
      scalar_dest_def = stmt;
    }

  tree scalar_dest = gimple_assign_lhs (scalar_dest_def);
  tree scalar_type = TREE_TYPE (scalar_dest);
  tree reduc_var = gimple_phi_result (reduc_def_stmt);

  int vec_num = vec_oprnds0.length ();
  gcc_assert (vec_num == 1 || slp_node);
  tree vec_elem_type = TREE_TYPE (vectype_out);
  gcc_checking_assert (useless_type_conversion_p (scalar_type, vec_elem_type));

  tree vector_identity = NULL_TREE;
  if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
    vector_identity = build_zero_cst (vectype_out);

  tree scalar_dest_var = vect_create_destination_var (scalar_dest, NULL);
  int i;
  tree def0;
  FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
    {
      tree mask = NULL_TREE;
      if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
        mask = vect_get_loop_mask (gsi, masks, vec_num, vectype_in, i);

      /* Handle MINUS by adding the negative.  */
      if (reduc_fn != IFN_LAST && code == MINUS_EXPR)
        {
          tree negated = make_ssa_name (vectype_out);
          new_stmt = gimple_build_assign (negated, NEGATE_EXPR, def0);
          gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
          def0 = negated;
        }

      if (mask)
        def0 = merge_with_identity (gsi, mask, vectype_out, def0,
                                    vector_identity);

      /* On the first iteration the input is simply the scalar phi
         result, and for subsequent iterations it is the output of
         the preceding operation.  */
      if (reduc_fn != IFN_LAST)
        {
          new_stmt = gimple_build_call_internal (reduc_fn, 2, reduc_var, def0);
          /* For chained SLP reductions the output of the previous reduction
             operation serves as the input of the next.  For the final statement
             the output cannot be a temporary - we reuse the original
             scalar destination of the last statement.  */
          if (i != vec_num - 1)
            {
              gimple_set_lhs (new_stmt, scalar_dest_var);
              reduc_var = make_ssa_name (scalar_dest_var, new_stmt);
              gimple_set_lhs (new_stmt, reduc_var);
            }
        }
      else
        {
          reduc_var = vect_expand_fold_left (gsi, scalar_dest_var, code,
                                             reduc_var, def0);
          new_stmt = SSA_NAME_DEF_STMT (reduc_var);
          /* Remove the statement, so that we can use the same code paths
             as for statements that we've just created.  */
          gimple_stmt_iterator tmp_gsi = gsi_for_stmt (new_stmt);
          gsi_remove (&tmp_gsi, false);
        }

      if (i == vec_num - 1)
        {
          gimple_set_lhs (new_stmt, scalar_dest);
          vect_finish_replace_stmt (scalar_dest_def, new_stmt);
        }
      else
        vect_finish_stmt_generation (scalar_dest_def, new_stmt, gsi);

      if (slp_node)
        SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
    }

  if (!slp_node)
    STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;

  return true;
}
/* Function is_nonwrapping_integer_induction.

   Check if STMT (which is part of loop LOOP) both increments and
   does not cause overflow.  */
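/* For example, an induction with constant base 0 and step 4 in a loop that
   executes at most 1000 times reaches at most 4000, which fits comfortably
   in a 32-bit type, so the check below succeeds; if the base plus the
   product of the step and the maximum trip count needed more bits than the
   type provides, the function returns false.  */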
static bool
is_nonwrapping_integer_induction (gimple *stmt, struct loop *loop)
{
  stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
  tree base = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo);
  tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo);
  tree lhs_type = TREE_TYPE (gimple_phi_result (stmt));
  widest_int ni, max_loop_value, lhs_max;
  bool overflow = false;

  /* Make sure the loop is integer based.  */
  if (TREE_CODE (base) != INTEGER_CST
      || TREE_CODE (step) != INTEGER_CST)
    return false;

  /* Check that the max size of the loop will not wrap.  */

  if (TYPE_OVERFLOW_UNDEFINED (lhs_type))
    return true;

  if (! max_stmt_executions (loop, &ni))
    return false;

  max_loop_value = wi::mul (wi::to_widest (step), ni, TYPE_SIGN (lhs_type),
                            &overflow);
  if (overflow)
    return false;

  max_loop_value = wi::add (wi::to_widest (base), max_loop_value,
                            TYPE_SIGN (lhs_type), &overflow);
  if (overflow)
    return false;

  return (wi::min_precision (max_loop_value, TYPE_SIGN (lhs_type))
          <= TYPE_PRECISION (lhs_type));
}
/* Function vectorizable_reduction.

   Check if STMT performs a reduction operation that can be vectorized.
   If VEC_STMT is also passed, vectorize the STMT: create a vectorized
   stmt to replace it, put it in VEC_STMT, and insert it at GSI.
   Return FALSE if not a vectorizable STMT, TRUE otherwise.

   This function also handles reduction idioms (patterns) that have been
   recognized in advance during vect_pattern_recog.  In this case, STMT may be
   of this form:
     X = pattern_expr (arg0, arg1, ..., X)
   and its STMT_VINFO_RELATED_STMT points to the last stmt in the original
   sequence that had been detected and replaced by the pattern-stmt (STMT).

   This function also handles reduction of condition expressions, for example:
     for (int i = 0; i < N; i++)
       if (a[i] < value)
         last = i;
   This is handled by vectorising the loop and creating an additional vector
   containing the loop indexes for which "a[i] < value" was true.  In the
   function epilogue this is reduced to a single max value and then used to
   index into the vector of results.

   In some cases of reduction patterns, the type of the reduction variable X is
   different than the type of the other arguments of STMT.
   In such cases, the vectype that is used when transforming STMT into a vector
   stmt is different than the vectype that is used to determine the
   vectorization factor, because it consists of a different number of elements
   than the actual number of elements that are being operated upon in parallel.

   For example, consider an accumulation of shorts into an int accumulator.
   On some targets it's possible to vectorize this pattern operating on 8
   shorts at a time (hence, the vectype for purposes of determining the
   vectorization factor should be V8HI); on the other hand, the vectype that
   is used to create the vector form is actually V4SI (the type of the result).

   Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
   indicates what is the actual level of parallelism (V8HI in the example), so
   that the right vectorization factor would be derived.  This vectype
   corresponds to the type of arguments to the reduction stmt, and should *NOT*
   be used to create the vectorized stmt.  The right vectype for the vectorized
   stmt is obtained from the type of the result X:
      get_vectype_for_scalar_type (TREE_TYPE (X))

   This means that, contrary to "regular" reductions (or "regular" stmts in
   general), the following equation:
      STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (TREE_TYPE (X))
   does *NOT* necessarily hold for reduction patterns.  */
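/* As a concrete sketch of the short-into-int case described above:

     short a[N];
     int sum = 0;
     for (i = 0; i < N; i++)
       sum += a[i];

   may be recognized as a widening-sum pattern; STMT_VINFO_VECTYPE is then
   V8HI (eight shorts per iteration, which determines the VF) while the
   vectorized accumulator itself is built with V4SI, the vectype of the
   result type "int".  */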
static bool
vectorizable_reduction (gimple *stmt, gimple_stmt_iterator *gsi,
                        gimple **vec_stmt, slp_tree slp_node,
                        slp_instance slp_node_instance,
                        stmt_vector_for_cost *cost_vec)
{
  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
  tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
  tree vectype_in = NULL_TREE;
  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  enum tree_code code, orig_code;
  internal_fn reduc_fn;
  machine_mode vec_mode;
  tree new_temp = NULL_TREE;
  enum vect_def_type dt, cond_reduc_dt = vect_unknown_def_type;
  gimple *cond_reduc_def_stmt = NULL;
  enum tree_code cond_reduc_op_code = ERROR_MARK;
  stmt_vec_info orig_stmt_info = NULL;
  stmt_vec_info prev_stmt_info, prev_phi_info;
  bool single_defuse_cycle = false;
  gimple *new_stmt = NULL;
  enum vect_def_type dts[3];
  bool nested_cycle = false, found_nested_cycle_def = false;
  bool double_reduc = false;
  struct loop * def_stmt_loop, *outer_loop = NULL;
  gimple *def_arg_stmt;
  auto_vec<tree> vec_oprnds0;
  auto_vec<tree> vec_oprnds1;
  auto_vec<tree> vec_oprnds2;
  auto_vec<tree> vect_defs;
  auto_vec<gimple *> phis;
  bool first_p = true;
  tree cr_index_scalar_type = NULL_TREE, cr_index_vector_type = NULL_TREE;
  tree cond_reduc_val = NULL_TREE;

  /* Make sure it was already recognized as a reduction computation.  */
  if (STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmt)) != vect_reduction_def
      && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmt)) != vect_nested_cycle)
    return false;

  if (nested_in_vect_loop_p (loop, stmt))
    {
      outer_loop = loop;
      loop = loop->inner;
      nested_cycle = true;
    }
  /* In case of reduction chain we switch to the first stmt in the chain, but
     we don't update STMT_INFO, since only the last stmt is marked as reduction
     and has reduction properties.  */
  if (GROUP_FIRST_ELEMENT (stmt_info)
      && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
    {
      stmt = GROUP_FIRST_ELEMENT (stmt_info);
      first_p = false;
    }

  if (gimple_code (stmt) == GIMPLE_PHI)
    {
      /* Analysis is fully done on the reduction stmt invocation.  */
      if (! vec_stmt)
        {
          if (slp_node)
            slp_node_instance->reduc_phis = slp_node;

          STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
          return true;
        }

      if (STMT_VINFO_REDUC_TYPE (stmt_info) == FOLD_LEFT_REDUCTION)
        /* Leave the scalar phi in place.  Note that checking
           STMT_VINFO_VEC_REDUCTION_TYPE (as below) only works
           for reductions involving a single statement.  */
        return true;

      gimple *reduc_stmt = STMT_VINFO_REDUC_DEF (stmt_info);
      if (STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (reduc_stmt)))
        reduc_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (reduc_stmt));

      if (STMT_VINFO_VEC_REDUCTION_TYPE (vinfo_for_stmt (reduc_stmt))
          == EXTRACT_LAST_REDUCTION)
        /* Leave the scalar phi in place.  */
        return true;

      gcc_assert (is_gimple_assign (reduc_stmt));
      for (unsigned k = 1; k < gimple_num_ops (reduc_stmt); ++k)
        {
          tree op = gimple_op (reduc_stmt, k);
          if (op == gimple_phi_result (stmt))
            continue;
          if (k == 1
              && gimple_assign_rhs_code (reduc_stmt) == COND_EXPR)
            continue;
          if (!vectype_in
              || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in)))
                  < GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (op)))))
            vectype_in = get_vectype_for_scalar_type (TREE_TYPE (op));
        }
      gcc_assert (vectype_in);

      if (slp_node)
        ncopies = 1;
      else
        ncopies = vect_get_num_copies (loop_vinfo, vectype_in);

      use_operand_p use_p;
      gimple *use_stmt;
      if (ncopies > 1
          && (STMT_VINFO_RELEVANT (vinfo_for_stmt (reduc_stmt))
              <= vect_used_only_live)
          && single_imm_use (gimple_phi_result (stmt), &use_p, &use_stmt)
          && (use_stmt == reduc_stmt
              || (STMT_VINFO_RELATED_STMT (vinfo_for_stmt (use_stmt))
                  == reduc_stmt)))
        single_defuse_cycle = true;

      /* Create the destination vector  */
      scalar_dest = gimple_assign_lhs (reduc_stmt);
      vec_dest = vect_create_destination_var (scalar_dest, vectype_out);

      if (slp_node)
        /* The size vect_schedule_slp_instance computes is off for us.  */
        vec_num = vect_get_num_vectors
          (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
           * SLP_TREE_SCALAR_STMTS (slp_node).length (),
           vectype_out);
      else
        vec_num = 1;

      /* Generate the reduction PHIs upfront.  */
      prev_phi_info = NULL;
      for (j = 0; j < ncopies; j++)
        {
          if (j == 0 || !single_defuse_cycle)
            {
              for (i = 0; i < vec_num; i++)
                {
                  /* Create the reduction-phi that defines the reduction
                     operand.  */
                  gimple *new_phi = create_phi_node (vec_dest, loop->header);
                  set_vinfo_for_stmt (new_phi,
                                      new_stmt_vec_info (new_phi, loop_vinfo));

                  if (slp_node)
                    SLP_TREE_VEC_STMTS (slp_node).quick_push (new_phi);
                  else
                    {
                      if (j == 0)
                        STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_phi;
                      else
                        STMT_VINFO_RELATED_STMT (prev_phi_info) = new_phi;
                      prev_phi_info = vinfo_for_stmt (new_phi);
                    }
                }
            }
        }

      return true;
    }
  /* 1. Is vectorizable reduction?  */
  /* Not supportable if the reduction variable is used in the loop, unless
     it's a reduction chain.  */
  if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer
      && !GROUP_FIRST_ELEMENT (stmt_info))
    return false;

  /* Reductions that are not used even in an enclosing outer-loop,
     are expected to be "live" (used out of the loop).  */
  if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope
      && !STMT_VINFO_LIVE_P (stmt_info))
    return false;

  /* 2. Has this been recognized as a reduction pattern?

     Check if STMT represents a pattern that has been recognized
     in earlier analysis stages.  For stmts that represent a pattern,
     the STMT_VINFO_RELATED_STMT field records the last stmt in
     the original sequence that constitutes the pattern.  */

  orig_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt));
  if (orig_stmt)
    {
      orig_stmt_info = vinfo_for_stmt (orig_stmt);
      gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
      gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
    }
  /* 3. Check the operands of the operation.  The first operands are defined
        inside the loop body.  The last operand is the reduction variable,
        which is defined by the loop-header-phi.  */

  gcc_assert (is_gimple_assign (stmt));

  /* Flatten RHS.  */
  switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
    {
    case GIMPLE_BINARY_RHS:
      code = gimple_assign_rhs_code (stmt);
      op_type = TREE_CODE_LENGTH (code);
      gcc_assert (op_type == binary_op);
      ops[0] = gimple_assign_rhs1 (stmt);
      ops[1] = gimple_assign_rhs2 (stmt);
      break;

    case GIMPLE_TERNARY_RHS:
      code = gimple_assign_rhs_code (stmt);
      op_type = TREE_CODE_LENGTH (code);
      gcc_assert (op_type == ternary_op);
      ops[0] = gimple_assign_rhs1 (stmt);
      ops[1] = gimple_assign_rhs2 (stmt);
      ops[2] = gimple_assign_rhs3 (stmt);
      break;

    case GIMPLE_UNARY_RHS:
      return false;

    default:
      gcc_unreachable ();
    }

  if (code == COND_EXPR && slp_node)
    return false;

  scalar_dest = gimple_assign_lhs (stmt);
  scalar_type = TREE_TYPE (scalar_dest);
  if (!POINTER_TYPE_P (scalar_type) && !INTEGRAL_TYPE_P (scalar_type)
      && !SCALAR_FLOAT_TYPE_P (scalar_type))
    return false;

  /* Do not try to vectorize bit-precision reductions.  */
  if (!type_has_mode_precision_p (scalar_type))
    return false;

  /* All uses but the last are expected to be defined in the loop.
     The last use is the reduction variable.  In case of nested cycle this
     assumption is not true: we use reduc_index to record the index of the
     reduction variable.  */
  gimple *reduc_def_stmt = NULL;
  int reduc_index = -1;
  for (i = 0; i < op_type; i++)
    {
      /* The condition of COND_EXPR is checked in vectorizable_condition().  */
      if (i == 0 && code == COND_EXPR)
        continue;

      is_simple_use = vect_is_simple_use (ops[i], loop_vinfo,
                                          &def_stmt, &dts[i], &tem);
      dt = dts[i];
      gcc_assert (is_simple_use);
      if (dt == vect_reduction_def)
        {
          reduc_def_stmt = def_stmt;
          reduc_index = i;
          continue;
        }
      else if (tem)
        {
          /* To properly compute ncopies we are interested in the widest
             input type in case we're looking at a widening accumulation.  */
          if (!vectype_in
              || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in)))
                  < GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (tem)))))
            vectype_in = tem;
        }

      if (dt != vect_internal_def
          && dt != vect_external_def
          && dt != vect_constant_def
          && dt != vect_induction_def
          && !(dt == vect_nested_cycle && nested_cycle))
        return false;

      if (dt == vect_nested_cycle)
        {
          found_nested_cycle_def = true;
          reduc_def_stmt = def_stmt;
          reduc_index = i;
        }

      if (i == 1 && code == COND_EXPR)
        {
          /* Record how value of COND_EXPR is defined.  */
          if (dt == vect_constant_def)
            {
              cond_reduc_dt = dt;
              cond_reduc_val = ops[i];
            }
          if (dt == vect_induction_def
              && def_stmt != NULL
              && is_nonwrapping_integer_induction (def_stmt, loop))
            {
              cond_reduc_dt = dt;
              cond_reduc_def_stmt = def_stmt;
            }
        }
    }

  if (!vectype_in)
    vectype_in = vectype_out;
  /* When vectorizing a reduction chain w/o SLP the reduction PHI is not
     directly used in stmt.  */
  if (reduc_index == -1)
    {
      if (STMT_VINFO_REDUC_TYPE (stmt_info) == FOLD_LEFT_REDUCTION)
        {
          if (dump_enabled_p ())
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                             "in-order reduction chain without SLP.\n");
          return false;
        }

      if (orig_stmt)
        reduc_def_stmt = STMT_VINFO_REDUC_DEF (orig_stmt_info);
      else
        reduc_def_stmt = STMT_VINFO_REDUC_DEF (stmt_info);
    }

  if (! reduc_def_stmt || gimple_code (reduc_def_stmt) != GIMPLE_PHI)
    return false;

  if (!(reduc_index == -1
        || dts[reduc_index] == vect_reduction_def
        || dts[reduc_index] == vect_nested_cycle
        || ((dts[reduc_index] == vect_internal_def
             || dts[reduc_index] == vect_external_def
             || dts[reduc_index] == vect_constant_def
             || dts[reduc_index] == vect_induction_def)
            && nested_cycle && found_nested_cycle_def)))
    {
      /* For pattern recognized stmts, orig_stmt might be a reduction,
         but some helper statements for the pattern might not, or
         might be COND_EXPRs with reduction uses in the condition.  */
      gcc_assert (orig_stmt);
      return false;
    }

  stmt_vec_info reduc_def_info = vinfo_for_stmt (reduc_def_stmt);
  enum vect_reduction_type v_reduc_type
    = STMT_VINFO_REDUC_TYPE (reduc_def_info);
  gimple *tmp = STMT_VINFO_REDUC_DEF (reduc_def_info);

  STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) = v_reduc_type;
  /* If we have a condition reduction, see if we can simplify it further.  */
  if (v_reduc_type == COND_REDUCTION)
    {
      /* TODO: We can't yet handle reduction chains, since we need to treat
         each COND_EXPR in the chain specially, not just the last one.
         E.g. for:

            x_1 = PHI <x_3, ...>
            x_2 = a_2 ? ... : x_1;
            x_3 = a_3 ? ... : x_2;

         we're interested in the last element in x_3 for which a_2 || a_3
         is true, whereas the current reduction chain handling would
         vectorize x_2 as a normal VEC_COND_EXPR and only treat x_3
         as a reduction operation.  */
      if (reduc_index == -1)
        {
          if (dump_enabled_p ())
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                             "conditional reduction chains not supported\n");
          return false;
        }

      /* vect_is_simple_reduction ensured that operand 2 is the
         loop-carried operand.  */
      gcc_assert (reduc_index == 2);

      /* Loop peeling modifies initial value of reduction PHI, which
         makes the reduction stmt to be transformed different from the
         original stmt analyzed.  We need to record reduction code for
         CONST_COND_REDUCTION type reduction at analyzing stage, thus
         it can be used directly at transform stage.  */
      if (STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info) == MAX_EXPR
          || STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info) == MIN_EXPR)
        {
          /* Also set the reduction type to CONST_COND_REDUCTION.  */
          gcc_assert (cond_reduc_dt == vect_constant_def);
          STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) = CONST_COND_REDUCTION;
        }
      else if (direct_internal_fn_supported_p (IFN_FOLD_EXTRACT_LAST,
                                               vectype_in, OPTIMIZE_FOR_SPEED))
        {
          if (dump_enabled_p ())
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                             "optimizing condition reduction with"
                             " FOLD_EXTRACT_LAST.\n");
          STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) = EXTRACT_LAST_REDUCTION;
        }
      else if (cond_reduc_dt == vect_induction_def)
        {
          stmt_vec_info cond_stmt_vinfo = vinfo_for_stmt (cond_reduc_def_stmt);
          tree base
            = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (cond_stmt_vinfo);
          tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (cond_stmt_vinfo);

          gcc_assert (TREE_CODE (base) == INTEGER_CST
                      && TREE_CODE (step) == INTEGER_CST);
          cond_reduc_val = NULL_TREE;
          /* Find a suitable value, for MAX_EXPR below base, for MIN_EXPR
             above base; punt if base is the minimum value of the type for
             MAX_EXPR or maximum value of the type for MIN_EXPR for now.  */
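          /* For example (sketch): with a decreasing induction (step -1) and
             base 10, MIN_EXPR is chosen and 11 (just above the base) can act
             as the "no match" value; with an increasing induction and base
             10, MAX_EXPR is chosen and 0 (below the base) suffices.  */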
          if (tree_int_cst_sgn (step) == -1)
            {
              cond_reduc_op_code = MIN_EXPR;
              if (tree_int_cst_sgn (base) == -1)
                cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
              else if (tree_int_cst_lt (base,
                                        TYPE_MAX_VALUE (TREE_TYPE (base))))
                cond_reduc_val
                  = int_const_binop (PLUS_EXPR, base, integer_one_node);
            }
          else
            {
              cond_reduc_op_code = MAX_EXPR;
              if (tree_int_cst_sgn (base) == 1)
                cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
              else if (tree_int_cst_lt (TYPE_MIN_VALUE (TREE_TYPE (base)),
                                        base))
                cond_reduc_val
                  = int_const_binop (MINUS_EXPR, base, integer_one_node);
            }
          if (cond_reduc_val)
            {
              if (dump_enabled_p ())
                dump_printf_loc (MSG_NOTE, vect_location,
                                 "condition expression based on "
                                 "integer induction.\n");
              STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
                = INTEGER_INDUC_COND_REDUCTION;
            }
        }
      else if (cond_reduc_dt == vect_constant_def)
        {
          enum vect_def_type cond_initial_dt;
          gimple *def_stmt = SSA_NAME_DEF_STMT (ops[reduc_index]);
          tree cond_initial_val
            = PHI_ARG_DEF_FROM_EDGE (def_stmt, loop_preheader_edge (loop));

          gcc_assert (cond_reduc_val != NULL_TREE);
          vect_is_simple_use (cond_initial_val, loop_vinfo,
                              &def_stmt, &cond_initial_dt);
          if (cond_initial_dt == vect_constant_def
              && types_compatible_p (TREE_TYPE (cond_initial_val),
                                     TREE_TYPE (cond_reduc_val)))
            {
              tree e = fold_binary (LE_EXPR, boolean_type_node,
                                    cond_initial_val, cond_reduc_val);
              if (e && (integer_onep (e) || integer_zerop (e)))
                {
                  if (dump_enabled_p ())
                    dump_printf_loc (MSG_NOTE, vect_location,
                                     "condition expression based on "
                                     "compile time constant.\n");
                  /* Record reduction code at analysis stage.  */
                  STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info)
                    = integer_onep (e) ? MAX_EXPR : MIN_EXPR;
                  STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
                    = CONST_COND_REDUCTION;
                }
            }
        }
    }

  if (orig_stmt)
    gcc_assert (tmp == orig_stmt
                || GROUP_FIRST_ELEMENT (vinfo_for_stmt (tmp)) == orig_stmt);
  else
    /* We changed STMT to be the first stmt in reduction chain, hence we
       check that in this case the first element in the chain is STMT.  */
    gcc_assert (stmt == tmp
                || GROUP_FIRST_ELEMENT (vinfo_for_stmt (tmp)) == stmt);
  if (STMT_VINFO_LIVE_P (vinfo_for_stmt (reduc_def_stmt)))
    return false;

  if (slp_node)
    ncopies = 1;
  else
    ncopies = vect_get_num_copies (loop_vinfo, vectype_in);

  gcc_assert (ncopies >= 1);

  vec_mode = TYPE_MODE (vectype_in);
  poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);

  if (code == COND_EXPR)
    {
      /* Only call during the analysis stage, otherwise we'll lose
         STMT_VINFO_VEC_STMT.  */
      if (!vec_stmt && !vectorizable_condition (stmt, gsi, NULL,
                                                ops[reduc_index], 0, NULL,
                                                cost_vec))
        {
          if (dump_enabled_p ())
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                             "unsupported condition in reduction\n");
          return false;
        }
    }
  else
    {
      /* 4. Supportable by target?  */

      if (code == LSHIFT_EXPR || code == RSHIFT_EXPR
          || code == LROTATE_EXPR || code == RROTATE_EXPR)
        {
          /* Shifts and rotates are only supported by vectorizable_shifts,
             not vectorizable_reduction.  */
          if (dump_enabled_p ())
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                             "unsupported shift or rotation.\n");
          return false;
        }

      /* 4.1. check support for the operation in the loop  */
      optab = optab_for_tree_code (code, vectype_in, optab_default);
      if (!optab)
        {
          if (dump_enabled_p ())
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                             "no optab.\n");
          return false;
        }

      if (optab_handler (optab, vec_mode) == CODE_FOR_nothing)
        {
          if (dump_enabled_p ())
            dump_printf (MSG_NOTE, "op not supported by target.\n");

          if (maybe_ne (GET_MODE_SIZE (vec_mode), UNITS_PER_WORD)
              || !vect_worthwhile_without_simd_p (loop_vinfo, code))
            return false;

          if (dump_enabled_p ())
            dump_printf (MSG_NOTE, "proceeding using word mode.\n");
        }

      /* Worthwhile without SIMD support?  */
      if (!VECTOR_MODE_P (TYPE_MODE (vectype_in))
          && !vect_worthwhile_without_simd_p (loop_vinfo, code))
        {
          if (dump_enabled_p ())
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                             "not worthwhile without SIMD support.\n");
          return false;
        }
    }

  /* 4.2. Check support for the epilog operation.

          If STMT represents a reduction pattern, then the type of the
          reduction variable may be different than the type of the rest
          of the arguments.  For example, consider the case of accumulation
          of shorts into an int accumulator; The original code:
                        S1: int_a = (int) short_a;
          orig_stmt->   S2: int_acc = plus <int_a ,int_acc>;

          was replaced with:
                        STMT: int_acc = widen_sum <short_a, int_acc>

          This means that:
          1. The tree-code that is used to create the vector operation in the
             epilog code (that reduces the partial results) is not the
             tree-code of STMT, but is rather the tree-code of the original
             stmt from the pattern that STMT is replacing.  I.e, in the example
             above we want to use 'widen_sum' in the loop, but 'plus' in the
             epilog.
          2. The type (mode) we use to check available target support
             for the vector operation to be created in the *epilog*, is
             determined by the type of the reduction variable (in the example
             above we'd check this: optab_handler (plus_optab, vect_int_mode])).
             However the type (mode) we use to check available target support
             for the vector operation to be created *inside the loop*, is
             determined by the type of the other arguments to STMT (in the
             example we'd check this: optab_handler (widen_sum_optab,
             vect_short_mode)).

          This is contrary to "regular" reductions, in which the types of all
          the arguments are the same as the type of the reduction variable.
          For "regular" reductions we can therefore use the same vector type
          (and also the same tree-code) when generating the epilog code and
          when generating the code inside the loop.  */
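  /* Conceptually, for a plain V4SI integer sum the epilog is the same
     PLUS_EXPR on the same V4SI type as the loop body: the partial sums held
     in the vector accumulator are folded into one scalar, e.g. through an
     IFN_REDUC_PLUS call when the target provides it, or otherwise by a
     log2(nunits) sequence of vector shifts/permutes and adds followed by an
     element extraction.  */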
  vect_reduction_type reduction_type
    = STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info);
  if (orig_stmt
      && (reduction_type == TREE_CODE_REDUCTION
          || reduction_type == FOLD_LEFT_REDUCTION))
    {
      /* This is a reduction pattern: get the vectype from the type of the
         reduction variable, and get the tree-code from orig_stmt.  */
      orig_code = gimple_assign_rhs_code (orig_stmt);
      gcc_assert (vectype_out);
      vec_mode = TYPE_MODE (vectype_out);
    }
  else
    {
      /* Regular reduction: use the same vectype and tree-code as used for
         the vector code inside the loop can be used for the epilog code.  */
      orig_code = code;

      if (code == MINUS_EXPR)
        orig_code = PLUS_EXPR;

      /* For simple condition reductions, replace with the actual expression
         we want to base our reduction around.  */
      if (reduction_type == CONST_COND_REDUCTION)
        {
          orig_code = STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info);
          gcc_assert (orig_code == MAX_EXPR || orig_code == MIN_EXPR);
        }
      else if (reduction_type == INTEGER_INDUC_COND_REDUCTION)
        orig_code = cond_reduc_op_code;
    }

  if (nested_cycle)
    {
      def_bb = gimple_bb (reduc_def_stmt);
      def_stmt_loop = def_bb->loop_father;
      def_arg = PHI_ARG_DEF_FROM_EDGE (reduc_def_stmt,
                                       loop_preheader_edge (def_stmt_loop));
      if (TREE_CODE (def_arg) == SSA_NAME
          && (def_arg_stmt = SSA_NAME_DEF_STMT (def_arg))
          && gimple_code (def_arg_stmt) == GIMPLE_PHI
          && flow_bb_inside_loop_p (outer_loop, gimple_bb (def_arg_stmt))
          && vinfo_for_stmt (def_arg_stmt)
          && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_arg_stmt))
              == vect_double_reduction_def)
        double_reduc = true;
    }

  reduc_fn = IFN_LAST;

  if (reduction_type == TREE_CODE_REDUCTION
      || reduction_type == FOLD_LEFT_REDUCTION
      || reduction_type == INTEGER_INDUC_COND_REDUCTION
      || reduction_type == CONST_COND_REDUCTION)
    {
      if (reduction_type == FOLD_LEFT_REDUCTION
          ? fold_left_reduction_fn (orig_code, &reduc_fn)
          : reduction_fn_for_scalar_code (orig_code, &reduc_fn))
        {
          if (reduc_fn != IFN_LAST
              && !direct_internal_fn_supported_p (reduc_fn, vectype_out,
                                                  OPTIMIZE_FOR_SPEED))
            {
              if (dump_enabled_p ())
                dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                 "reduc op not supported by target.\n");

              reduc_fn = IFN_LAST;
            }
        }
      else
        {
          if (!nested_cycle || double_reduc)
            {
              if (dump_enabled_p ())
                dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                 "no reduc code for scalar code.\n");

              return false;
            }
        }
    }
  else if (reduction_type == COND_REDUCTION)
    {
      int scalar_precision
        = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
      cr_index_scalar_type = make_unsigned_type (scalar_precision);
      cr_index_vector_type = build_vector_type (cr_index_scalar_type,
                                                nunits_out);

      if (direct_internal_fn_supported_p (IFN_REDUC_MAX, cr_index_vector_type,
                                          OPTIMIZE_FOR_SPEED))
        reduc_fn = IFN_REDUC_MAX;
    }

  if (reduction_type != EXTRACT_LAST_REDUCTION
      && reduc_fn == IFN_LAST
      && !nunits_out.is_constant ())
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "missing target support for reduction on"
                         " variable-length vectors.\n");
      return false;
    }

  if ((double_reduc || reduction_type != TREE_CODE_REDUCTION)
      && ncopies > 1)
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "multiple types in double reduction or condition "
                         "reduction.\n");
      return false;
    }

  /* For SLP reductions, see if there is a neutral value we can use.  */
  tree neutral_op = NULL_TREE;
  if (slp_node)
    neutral_op
      = neutral_op_for_slp_reduction (slp_node_instance->reduc_phis, code,
                                      GROUP_FIRST_ELEMENT (stmt_info) != NULL);

  if (double_reduc && reduction_type == FOLD_LEFT_REDUCTION)
    {
      /* We can't support in-order reductions of code such as this:

           for (int i = 0; i < n1; ++i)
             for (int j = 0; j < n2; ++j)
               l += a[j];

         since GCC effectively transforms the loop when vectorizing:

           for (int i = 0; i < n1 / VF; ++i)
             for (int j = 0; j < n2; ++j)
               for (int k = 0; k < VF; ++k)
                 l += a[j];

         which is a reassociation of the original operation.  */
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "in-order double reduction not supported.\n");

      return false;
    }

  if (reduction_type == FOLD_LEFT_REDUCTION
      && slp_node
      && !GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
    {
      /* We cannot use in-order reductions in this case because there is
         an implicit reassociation of the operations involved.  */
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "in-order unchained SLP reductions not supported.\n");
      return false;
    }

  /* For double reductions, and for SLP reductions with a neutral value,
     we construct a variable-length initial vector by loading a vector
     full of the neutral value and then shift-and-inserting the start
     values into the low-numbered elements.  */
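  /* Sketch: for a two-statement SLP sum reduction with start values s0 and
     s1 and neutral value 0 on a variable-length vector, the initial vector
     is built as { 0, 0, 0, ... }, then shift-inserted with s1 and again
     with s0, giving { s0, s1, 0, 0, ... }.  */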
  if ((double_reduc || neutral_op)
      && !nunits_out.is_constant ()
      && !direct_internal_fn_supported_p (IFN_VEC_SHL_INSERT,
                                          vectype_out, OPTIMIZE_FOR_SPEED))
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "reduction on variable-length vectors requires"
                         " target support for a vector-shift-and-insert"
                         " operation.\n");
      return false;
    }

  /* Check extra constraints for variable-length unchained SLP reductions.  */
  if (STMT_SLP_TYPE (stmt_info)
      && !GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt))
      && !nunits_out.is_constant ())
    {
      /* We checked above that we could build the initial vector when
         there's a neutral element value.  Check here for the case in
         which each SLP statement has its own initial value and in which
         that value needs to be repeated for every instance of the
         statement within the initial vector.  */
      unsigned int group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
      scalar_mode elt_mode = SCALAR_TYPE_MODE (TREE_TYPE (vectype_out));
      if (!neutral_op
          && !can_duplicate_and_interleave_p (group_size, elt_mode))
        {
          if (dump_enabled_p ())
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                             "unsupported form of SLP reduction for"
                             " variable-length vectors: cannot build"
                             " initial vector.\n");
          return false;
        }
      /* The epilogue code relies on the number of elements being a multiple
         of the group size.  The duplicate-and-interleave approach to setting
         up the initial vector does too.  */
      if (!multiple_p (nunits_out, group_size))
        {
          if (dump_enabled_p ())
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                             "unsupported form of SLP reduction for"
                             " variable-length vectors: the vector size"
                             " is not a multiple of the number of results.\n");
          return false;
        }
    }

  /* In case of widening multiplication by a constant, we update the type
     of the constant to be the type of the other operand.  We check that the
     constant fits the type in the pattern recognition pass.  */
  if (code == DOT_PROD_EXPR
      && !types_compatible_p (TREE_TYPE (ops[0]), TREE_TYPE (ops[1])))
    {
      if (TREE_CODE (ops[0]) == INTEGER_CST)
        ops[0] = fold_convert (TREE_TYPE (ops[1]), ops[0]);
      else if (TREE_CODE (ops[1]) == INTEGER_CST)
        ops[1] = fold_convert (TREE_TYPE (ops[0]), ops[1]);
      else
        {
          if (dump_enabled_p ())
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                             "invalid types in dot-prod\n");

          return false;
        }
    }

  if (reduction_type == COND_REDUCTION)
    {
      widest_int ni;

      if (! max_loop_iterations (loop, &ni))
        {
          if (dump_enabled_p ())
            dump_printf_loc (MSG_NOTE, vect_location,
                             "loop count not known, cannot create cond "
                             "reduction.\n");
          return false;
        }
      /* Convert backedges to iterations.  */
      ni += 1;

      /* The additional index will be the same type as the condition.  Check
         that the loop can fit into this less one (because we'll use up the
         zero slot for when there are no matches).  */
      tree max_index = TYPE_MAX_VALUE (cr_index_scalar_type);
      if (wi::geu_p (ni, wi::to_widest (max_index)))
        {
          if (dump_enabled_p ())
            dump_printf_loc (MSG_NOTE, vect_location,
                             "loop size is greater than data size.\n");
          return false;
        }
    }

  /* In case the vectorization factor (VF) is bigger than the number
     of elements that we can fit in a vectype (nunits), we have to generate
     more than one vector stmt - i.e - we need to "unroll" the
     vector stmt by a factor VF/nunits.  For more details see documentation
     in vectorizable_operation.  */

  /* If the reduction is used in an outer loop we need to generate
     VF intermediate results, like so (e.g. for ncopies=2):
        r0 = phi (init, r0)
        r1 = phi (init, r1)
        r0 = x0 + r0;
        r1 = x1 + r1;
    (i.e. we generate VF results in 2 registers).
    In this case we have a separate def-use cycle for each copy, and therefore
    for each copy we get the vector def for the reduction variable from the
    respective phi node created for this copy.

    Otherwise (the reduction is unused in the loop nest), we can combine
    together intermediate results, like so (e.g. for ncopies=2):
        r = phi (init, r)
        r = x0 + r;
        r = x1 + r;
   (i.e. we generate VF/2 results in a single register).
   In this case for each copy we get the vector def for the reduction variable
   from the vectorized reduction operation generated in the previous iteration.

   This only works when we see both the reduction PHI and its only consumer
   in vectorizable_reduction and there are no intermediate stmts
   participating.  */
  use_operand_p use_p;
  gimple *use_stmt;
  if (ncopies > 1
      && (STMT_VINFO_RELEVANT (stmt_info) <= vect_used_only_live)
      && single_imm_use (gimple_phi_result (reduc_def_stmt), &use_p, &use_stmt)
      && (use_stmt == stmt
          || STMT_VINFO_RELATED_STMT (vinfo_for_stmt (use_stmt)) == stmt))
    {
      single_defuse_cycle = true;
      epilog_copies = 1;
    }
  else
    epilog_copies = ncopies;

  /* If the reduction stmt is one of the patterns that have lane
     reduction embedded we cannot handle the case of ! single_defuse_cycle.  */
  if ((ncopies > 1
       && ! single_defuse_cycle)
      && (code == DOT_PROD_EXPR
          || code == WIDEN_SUM_EXPR
          || code == SAD_EXPR))
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "multi def-use cycle not possible for lane-reducing "
                         "reduction operation\n");
      return false;
    }

  if (slp_node)
    vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
  else
    vec_num = 1;

  internal_fn cond_fn = get_conditional_internal_fn (code);
  vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);

  if (!vec_stmt) /* transformation not required.  */
    {
      if (first_p)
        vect_model_reduction_cost (stmt_info, reduc_fn, ncopies, cost_vec);
      if (loop_vinfo && LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo))
        {
          if (reduction_type != FOLD_LEFT_REDUCTION
              && (cond_fn == IFN_LAST
                  || !direct_internal_fn_supported_p (cond_fn, vectype_in,
                                                      OPTIMIZE_FOR_SPEED)))
            {
              if (dump_enabled_p ())
                dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                 "can't use a fully-masked loop because no"
                                 " conditional operation is available.\n");
              LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
            }
          else if (reduc_index == -1)
            {
              if (dump_enabled_p ())
                dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                 "can't use a fully-masked loop for chained"
                                 " reductions.\n");
              LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
            }
          else
            vect_record_loop_mask (loop_vinfo, masks, ncopies * vec_num,
                                   vectype_in);
        }
      if (dump_enabled_p ()
          && reduction_type == FOLD_LEFT_REDUCTION)
        dump_printf_loc (MSG_NOTE, vect_location,
                         "using an in-order (fold-left) reduction.\n");
      STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
      return true;
    }
  /* Transform.  */

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location, "transform reduction.\n");

  /* FORNOW: Multiple types are not supported for condition.  */
  if (code == COND_EXPR)
    gcc_assert (ncopies == 1);

  bool masked_loop_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);

  if (reduction_type == FOLD_LEFT_REDUCTION)
    return vectorize_fold_left_reduction
      (stmt, gsi, vec_stmt, slp_node, reduc_def_stmt, code,
       reduc_fn, ops, vectype_in, reduc_index, masks);

  if (reduction_type == EXTRACT_LAST_REDUCTION)
    {
      gcc_assert (!slp_node);
      return vectorizable_condition (stmt, gsi, vec_stmt,
                                     NULL, reduc_index, NULL, NULL);
    }

  /* Create the destination vector  */
  vec_dest = vect_create_destination_var (scalar_dest, vectype_out);

  prev_stmt_info = NULL;
  prev_phi_info = NULL;
  if (!slp_node)
    {
      vec_oprnds0.create (1);
      vec_oprnds1.create (1);
      if (op_type == ternary_op)
        vec_oprnds2.create (1);
    }

  phis.create (vec_num);
  vect_defs.create (vec_num);
  if (!slp_node)
    vect_defs.quick_push (NULL_TREE);

  if (slp_node)
    phis.splice (SLP_TREE_VEC_STMTS (slp_node_instance->reduc_phis));
  else
    phis.quick_push (STMT_VINFO_VEC_STMT (vinfo_for_stmt (reduc_def_stmt)));
  for (j = 0; j < ncopies; j++)
    {
      if (code == COND_EXPR)
        {
          gcc_assert (!slp_node);
          vectorizable_condition (stmt, gsi, vec_stmt,
                                  PHI_RESULT (phis[0]),
                                  reduc_index, NULL, NULL);
          /* Multiple types are not supported for condition.  */
          break;
        }

      /* Handle uses.  */
      if (j == 0)
        {
          if (slp_node)
            {
              /* Get vec defs for all the operands except the reduction index,
                 ensuring the ordering of the ops in the vector is kept.  */
              auto_vec<tree, 3> slp_ops;
              auto_vec<vec<tree>, 3> vec_defs;

              slp_ops.quick_push (ops[0]);
              slp_ops.quick_push (ops[1]);
              if (op_type == ternary_op)
                slp_ops.quick_push (ops[2]);

              vect_get_slp_defs (slp_ops, slp_node, &vec_defs);

              vec_oprnds0.safe_splice (vec_defs[0]);
              vec_defs[0].release ();
              vec_oprnds1.safe_splice (vec_defs[1]);
              vec_defs[1].release ();
              if (op_type == ternary_op)
                {
                  vec_oprnds2.safe_splice (vec_defs[2]);
                  vec_defs[2].release ();
                }
            }
          else
            {
              vec_oprnds0.quick_push
                (vect_get_vec_def_for_operand (ops[0], stmt));
              vec_oprnds1.quick_push
                (vect_get_vec_def_for_operand (ops[1], stmt));
              if (op_type == ternary_op)
                vec_oprnds2.quick_push
                  (vect_get_vec_def_for_operand (ops[2], stmt));
            }
        }
      else
        {
          if (!slp_node)
            {
              gcc_assert (reduc_index != -1 || ! single_defuse_cycle);

              if (single_defuse_cycle && reduc_index == 0)
                vec_oprnds0[0] = gimple_get_lhs (new_stmt);
              else
                vec_oprnds0[0]
                  = vect_get_vec_def_for_stmt_copy (dts[0], vec_oprnds0[0]);
              if (single_defuse_cycle && reduc_index == 1)
                vec_oprnds1[0] = gimple_get_lhs (new_stmt);
              else
                vec_oprnds1[0]
                  = vect_get_vec_def_for_stmt_copy (dts[1], vec_oprnds1[0]);
              if (op_type == ternary_op)
                {
                  if (single_defuse_cycle && reduc_index == 2)
                    vec_oprnds2[0] = gimple_get_lhs (new_stmt);
                  else
                    vec_oprnds2[0]
                      = vect_get_vec_def_for_stmt_copy (dts[2], vec_oprnds2[0]);
                }
            }
        }
      FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
        {
          tree vop[3] = { def0, vec_oprnds1[i], NULL_TREE };
          if (masked_loop_p)
            {
              /* Make sure that the reduction accumulator is vop[0].  */
              if (reduc_index == 1)
                {
                  gcc_assert (commutative_tree_code (code));
                  std::swap (vop[0], vop[1]);
                }
              tree mask = vect_get_loop_mask (gsi, masks, vec_num * ncopies,
                                              vectype_in, i * ncopies + j);
              gcall *call = gimple_build_call_internal (cond_fn, 3, mask,
                                                        vop[0], vop[1]);
              new_temp = make_ssa_name (vec_dest, call);
              gimple_call_set_lhs (call, new_temp);
              gimple_call_set_nothrow (call, true);
              new_stmt = call;
            }
          else
            {
              if (op_type == ternary_op)
                vop[2] = vec_oprnds2[i];

              new_temp = make_ssa_name (vec_dest, new_stmt);
              new_stmt = gimple_build_assign (new_temp, code,
                                              vop[0], vop[1], vop[2]);
            }
          vect_finish_stmt_generation (stmt, new_stmt, gsi);

          if (slp_node)
            {
              SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
              vect_defs.quick_push (new_temp);
            }
          else
            vect_defs[0] = new_temp;
        }

      if (slp_node)
        continue;

      if (j == 0)
        STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
      else
        STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;

      prev_stmt_info = vinfo_for_stmt (new_stmt);
    }

  /* Finalize the reduction-phi (set its arguments) and create the
     epilog reduction code.  */
  if ((!single_defuse_cycle || code == COND_EXPR) && !slp_node)
    vect_defs[0] = gimple_get_lhs (*vec_stmt);

  vect_create_epilog_for_reduction (vect_defs, stmt, reduc_def_stmt,
                                    epilog_copies, reduc_fn, phis,
                                    double_reduc, slp_node, slp_node_instance,
                                    cond_reduc_val, cond_reduc_op_code,
                                    neutral_op);

  return true;
}
/* Function vect_min_worthwhile_factor.

   For a loop where we could vectorize the operation indicated by CODE,
   return the minimum vectorization factor that makes it worthwhile
   to use generic vectors.  */
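/* For instance, on a target without real SIMD support several byte-wide
   additions can still be packed into a single word-mode operation, so an
   addition is only treated as worthwhile here once the vectorization factor
   reaches that kind of packing density; cheaper bitwise operations need an
   even smaller factor.  */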
static unsigned int
vect_min_worthwhile_factor (enum tree_code code)
{
  switch (code)
    {
    case PLUS_EXPR:
    case MINUS_EXPR:
    case NEGATE_EXPR:
      return 4;

    case BIT_AND_EXPR:
    case BIT_IOR_EXPR:
    case BIT_XOR_EXPR:
    case BIT_NOT_EXPR:
      return 2;

    default:
      return INT_MAX;
    }
}

/* Return true if VINFO indicates we are doing loop vectorization and if
   it is worth decomposing CODE operations into scalar operations for
   that loop's vectorization factor.  */

bool
vect_worthwhile_without_simd_p (vec_info *vinfo, tree_code code)
{
  loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
  unsigned HOST_WIDE_INT value;
  return (loop_vinfo
          && LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&value)
          && value >= vect_min_worthwhile_factor (code));
}
/* Function vectorizable_induction

   Check if PHI performs an induction computation that can be vectorized.
   If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
   phi to replace it, put it in VEC_STMT, and add it to the same basic block.
   Return FALSE if not a vectorizable STMT, TRUE otherwise.  */
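/* For example (sketch), vectorizing the induction J in:

     j = j0;
     for (i = 0; i < N; i++)
       {
         a[i] = j;
         j += 2;
       }

   with VF == 4 creates a vector phi whose initial value is
   { j0, j0+2, j0+4, j0+6 } and whose loop update adds { 8, 8, 8, 8 }
   (i.e. VF * step) on every iteration.  */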
bool
vectorizable_induction (gimple *phi,
                        gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
                        gimple **vec_stmt, slp_tree slp_node,
                        stmt_vector_for_cost *cost_vec)
{
  stmt_vec_info stmt_info = vinfo_for_stmt (phi);
  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  bool nested_in_vect_loop = false;
  struct loop *iv_loop;
  tree vec_def;
  edge pe = loop_preheader_edge (loop);
  basic_block new_bb;
  tree new_vec, vec_init, vec_step, t;
  tree new_name;
  gimple *new_stmt;
  gphi *induction_phi;
  tree induc_def, vec_dest;
  tree init_expr, step_expr;
  poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
  unsigned i;
  tree expr;
  gimple_seq stmts;
  imm_use_iterator imm_iter;
  use_operand_p use_p;
  gimple_stmt_iterator si;
  basic_block bb = gimple_bb (phi);
  int ncopies;

  if (gimple_code (phi) != GIMPLE_PHI)
    return false;

  if (!STMT_VINFO_RELEVANT_P (stmt_info))
    return false;

  /* Make sure it was recognized as induction computation.  */
  if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
    return false;

  tree vectype = STMT_VINFO_VECTYPE (stmt_info);
  poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);

  if (slp_node)
    ncopies = 1;
  else
    ncopies = vect_get_num_copies (loop_vinfo, vectype);
  gcc_assert (ncopies >= 1);

  /* FORNOW. These restrictions should be relaxed.  */
  if (nested_in_vect_loop_p (loop, phi))
    {
      imm_use_iterator imm_iter;
      use_operand_p use_p;
      gimple *exit_phi;
      edge latch_e;
      tree loop_arg;

      if (ncopies > 1)
        {
          if (dump_enabled_p ())
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                             "multiple types in nested loop.\n");
          return false;
        }

      /* FORNOW: outer loop induction with SLP not supported.  */
      if (STMT_SLP_TYPE (stmt_info))
        return false;

      exit_phi = NULL;
      latch_e = loop_latch_edge (loop->inner);
      loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
      FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
        {
          gimple *use_stmt = USE_STMT (use_p);
          if (is_gimple_debug (use_stmt))
            continue;

          if (!flow_bb_inside_loop_p (loop->inner, gimple_bb (use_stmt)))
            {
              exit_phi = use_stmt;
              break;
            }
        }
      if (exit_phi)
        {
          stmt_vec_info exit_phi_vinfo = vinfo_for_stmt (exit_phi);
          if (!(STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
                && !STMT_VINFO_LIVE_P (exit_phi_vinfo)))
            {
              if (dump_enabled_p ())
                dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                 "inner-loop induction only used outside "
                                 "of the outer vectorized loop.\n");
              return false;
            }
        }

      nested_in_vect_loop = true;
      iv_loop = loop->inner;
    }
  else
    iv_loop = loop;
  gcc_assert (iv_loop == (gimple_bb (phi))->loop_father);

  if (slp_node && !nunits.is_constant ())
    {
      /* The current SLP code creates the initial value element-by-element.  */
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "SLP induction not supported for variable-length"
                         " vectors.\n");
      return false;
    }

  if (!vec_stmt) /* transformation not required.  */
    {
      STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
      if (dump_enabled_p ())
        dump_printf_loc (MSG_NOTE, vect_location,
                         "=== vectorizable_induction ===\n");
      vect_model_induction_cost (stmt_info, ncopies, cost_vec);
      return true;
    }
  /* Transform.  */

  /* Compute a vector variable, initialized with the first VF values of
     the induction variable.  E.g., for an iv with IV_PHI='X' and
     evolution S, for a vector of 4 units, we want to compute:
     [X, X + S, X + 2*S, X + 3*S].  */

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n");

  latch_e = loop_latch_edge (iv_loop);
  loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);

  step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info);
  gcc_assert (step_expr != NULL_TREE);

  pe = loop_preheader_edge (iv_loop);
  init_expr = PHI_ARG_DEF_FROM_EDGE (phi,
                                     loop_preheader_edge (iv_loop));

  stmts = NULL;
  if (!nested_in_vect_loop)
    {
      /* Convert the initial value to the desired type.  */
      tree new_type = TREE_TYPE (vectype);
      init_expr = gimple_convert (&stmts, new_type, init_expr);

      /* If we are using the loop mask to "peel" for alignment then we need
         to adjust the start value here.  */
      tree skip_niters = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
      if (skip_niters != NULL_TREE)
        {
          if (FLOAT_TYPE_P (vectype))
            skip_niters = gimple_build (&stmts, FLOAT_EXPR, new_type,
                                        skip_niters);
          else
            skip_niters = gimple_convert (&stmts, new_type, skip_niters);
          tree skip_step = gimple_build (&stmts, MULT_EXPR, new_type,
                                         skip_niters, step_expr);
          init_expr = gimple_build (&stmts, MINUS_EXPR, new_type,
                                    init_expr, skip_step);
        }
    }

  /* Convert the step to the desired type.  */
  step_expr = gimple_convert (&stmts, TREE_TYPE (vectype), step_expr);

  if (stmts)
    {
      new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
      gcc_assert (!new_bb);
    }

  /* Find the first insertion point in the BB.  */
  si = gsi_after_labels (bb);

  /* For SLP induction we have to generate several IVs as for example
     with group size 3 we need [i, i, i, i + S] [i + S, i + S, i + 2*S, i + 2*S]
     [i + 2*S, i + 3*S, i + 3*S, i + 3*S].  The step is the same uniform
     [VF*S, VF*S, VF*S, VF*S] for all.  */
  if (slp_node)
    {
      /* Enforced above.  */
      unsigned int const_nunits = nunits.to_constant ();

      /* Generate [VF*S, VF*S, ... ].  */
      if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
        {
          expr = build_int_cst (integer_type_node, vf);
          expr = fold_convert (TREE_TYPE (step_expr), expr);
        }
      else
        expr = build_int_cst (TREE_TYPE (step_expr), vf);
      new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
                              expr, step_expr);
      if (! CONSTANT_CLASS_P (new_name))
        new_name = vect_init_vector (phi, new_name,
                                     TREE_TYPE (step_expr), NULL);
      new_vec = build_vector_from_val (vectype, new_name);
      vec_step = vect_init_vector (phi, new_vec, vectype, NULL);

      /* Now generate the IVs.  */
      unsigned group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
      unsigned nvects = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
      unsigned elts = const_nunits * nvects;
      unsigned nivs = least_common_multiple (group_size,
                                             const_nunits) / const_nunits;
      gcc_assert (elts % group_size == 0);
      tree elt = init_expr;
      unsigned ivn;
7543 for (ivn
= 0; ivn
< nivs
; ++ivn
)
7545 tree_vector_builder
elts (vectype
, const_nunits
, 1);
7547 for (unsigned eltn
= 0; eltn
< const_nunits
; ++eltn
)
7549 if (ivn
*const_nunits
+ eltn
>= group_size
7550 && (ivn
* const_nunits
+ eltn
) % group_size
== 0)
7551 elt
= gimple_build (&stmts
, PLUS_EXPR
, TREE_TYPE (elt
),
7553 elts
.quick_push (elt
);
7555 vec_init
= gimple_build_vector (&stmts
, &elts
);
7558 new_bb
= gsi_insert_seq_on_edge_immediate (pe
, stmts
);
7559 gcc_assert (!new_bb
);
7562 /* Create the induction-phi that defines the induction-operand. */
7563 vec_dest
= vect_get_new_vect_var (vectype
, vect_simple_var
, "vec_iv_");
7564 induction_phi
= create_phi_node (vec_dest
, iv_loop
->header
);
7565 set_vinfo_for_stmt (induction_phi
,
7566 new_stmt_vec_info (induction_phi
, loop_vinfo
));
7567 induc_def
= PHI_RESULT (induction_phi
);
7569 /* Create the iv update inside the loop */
7570 vec_def
= make_ssa_name (vec_dest
);
7571 new_stmt
= gimple_build_assign (vec_def
, PLUS_EXPR
, induc_def
, vec_step
);
7572 gsi_insert_before (&si
, new_stmt
, GSI_SAME_STMT
);
7573 set_vinfo_for_stmt (new_stmt
, new_stmt_vec_info (new_stmt
, loop_vinfo
));
7575 /* Set the arguments of the phi node: */
7576 add_phi_arg (induction_phi
, vec_init
, pe
, UNKNOWN_LOCATION
);
7577 add_phi_arg (induction_phi
, vec_def
, loop_latch_edge (iv_loop
),
7580 SLP_TREE_VEC_STMTS (slp_node
).quick_push (induction_phi
);
7583 /* Re-use IVs when we can. */
7587 = least_common_multiple (group_size
, const_nunits
) / group_size
;
7588 /* Generate [VF'*S, VF'*S, ... ]. */
7589 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr
)))
7591 expr
= build_int_cst (integer_type_node
, vfp
);
7592 expr
= fold_convert (TREE_TYPE (step_expr
), expr
);
7595 expr
= build_int_cst (TREE_TYPE (step_expr
), vfp
);
7596 new_name
= fold_build2 (MULT_EXPR
, TREE_TYPE (step_expr
),
7598 if (! CONSTANT_CLASS_P (new_name
))
7599 new_name
= vect_init_vector (phi
, new_name
,
7600 TREE_TYPE (step_expr
), NULL
);
7601 new_vec
= build_vector_from_val (vectype
, new_name
);
7602 vec_step
= vect_init_vector (phi
, new_vec
, vectype
, NULL
);
7603 for (; ivn
< nvects
; ++ivn
)
7605 gimple
*iv
= SLP_TREE_VEC_STMTS (slp_node
)[ivn
- nivs
];
7607 if (gimple_code (iv
) == GIMPLE_PHI
)
7608 def
= gimple_phi_result (iv
);
7610 def
= gimple_assign_lhs (iv
);
7611 new_stmt
= gimple_build_assign (make_ssa_name (vectype
),
7614 if (gimple_code (iv
) == GIMPLE_PHI
)
7615 gsi_insert_before (&si
, new_stmt
, GSI_SAME_STMT
);
7618 gimple_stmt_iterator tgsi
= gsi_for_stmt (iv
);
7619 gsi_insert_after (&tgsi
, new_stmt
, GSI_CONTINUE_LINKING
);
7621 set_vinfo_for_stmt (new_stmt
,
7622 new_stmt_vec_info (new_stmt
, loop_vinfo
));
7623 SLP_TREE_VEC_STMTS (slp_node
).quick_push (new_stmt
);
  /* Create the vector that holds the initial_value of the induction.  */
  if (nested_in_vect_loop)
    {
      /* iv_loop is nested in the loop to be vectorized.  init_expr had already
         been created during vectorization of previous stmts.  We obtain it
         from the STMT_VINFO_VEC_STMT of the defining stmt.  */
      vec_init = vect_get_vec_def_for_operand (init_expr, phi);
      /* If the initial value is not of proper type, convert it.  */
      if (!useless_type_conversion_p (vectype, TREE_TYPE (vec_init)))
        {
          new_stmt
            = gimple_build_assign (vect_get_new_ssa_name (vectype,
                                                          vect_simple_var,
                                                          "vec_iv_"),
                                   VIEW_CONVERT_EXPR,
                                   build1 (VIEW_CONVERT_EXPR, vectype,
                                           vec_init));
          vec_init = gimple_assign_lhs (new_stmt);
          new_bb = gsi_insert_on_edge_immediate (loop_preheader_edge (iv_loop),
                                                 new_stmt);
          gcc_assert (!new_bb);
          set_vinfo_for_stmt (new_stmt,
                              new_stmt_vec_info (new_stmt, loop_vinfo));
        }
    }
  else
    {
      /* iv_loop is the loop to be vectorized. Create:
         vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr)  */
      stmts = NULL;
      new_name = gimple_convert (&stmts, TREE_TYPE (vectype), init_expr);

      unsigned HOST_WIDE_INT const_nunits;
      if (nunits.is_constant (&const_nunits))
        {
          tree_vector_builder elts (vectype, const_nunits, 1);
          elts.quick_push (new_name);
          for (i = 1; i < const_nunits; i++)
            {
              /* Create: new_name_i = new_name + step_expr  */
              new_name = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (new_name),
                                       new_name, step_expr);
              elts.quick_push (new_name);
            }
          /* Create a vector from [new_name_0, new_name_1, ...,
             new_name_nunits-1]  */
          vec_init = gimple_build_vector (&stmts, &elts);
        }
      else if (INTEGRAL_TYPE_P (TREE_TYPE (step_expr)))
        /* Build the initial value directly from a VEC_SERIES_EXPR.  */
        vec_init = gimple_build (&stmts, VEC_SERIES_EXPR, vectype,
                                 new_name, step_expr);
      else
        {
          /* Build:
               [base, base, base, ...]
               + (vectype) [0, 1, 2, ...] * [step, step, step, ...].  */
          gcc_assert (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)));
          gcc_assert (flag_associative_math);
          tree index = build_index_vector (vectype, 0, 1);
          tree base_vec = gimple_build_vector_from_val (&stmts, vectype,
                                                        new_name);
          tree step_vec = gimple_build_vector_from_val (&stmts, vectype,
                                                        step_expr);
          vec_init = gimple_build (&stmts, FLOAT_EXPR, vectype, index);
          vec_init = gimple_build (&stmts, MULT_EXPR, vectype,
                                   vec_init, step_vec);
          vec_init = gimple_build (&stmts, PLUS_EXPR, vectype,
                                   vec_init, base_vec);
        }

      if (stmts)
        {
          new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
          gcc_assert (!new_bb);
        }
    }
  /* Create the vector that holds the step of the induction.  */
  if (nested_in_vect_loop)
    /* iv_loop is nested in the loop to be vectorized. Generate:
       vec_step = [S, S, S, S]  */
    new_name = step_expr;
  else
    {
      /* iv_loop is the loop to be vectorized. Generate:
         vec_step = [VF*S, VF*S, VF*S, VF*S]  */
      gimple_seq seq = NULL;
      if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
        {
          expr = build_int_cst (integer_type_node, vf);
          expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
        }
      else
        expr = build_int_cst (TREE_TYPE (step_expr), vf);
      new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
                               expr, step_expr);
      if (seq)
        {
          new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
          gcc_assert (!new_bb);
        }
    }

  t = unshare_expr (new_name);
  gcc_assert (CONSTANT_CLASS_P (new_name)
              || TREE_CODE (new_name) == SSA_NAME);
  new_vec = build_vector_from_val (vectype, t);
  vec_step = vect_init_vector (phi, new_vec, vectype, NULL);
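  /* Illustration only: for a nested (inner-loop) induction the per-vector
     step stays {S, S, S, S}, because the inner loop still runs scalarly
     relative to the vectorized outer loop.  Otherwise, with VF == 4 and
     S == 3 for example, every lane advances by VF*S per vector iteration,
     so vec_step == {12, 12, 12, 12}.  */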
  /* Create the following def-use cycle:
     loop prolog:
         vec_init = ...
         vec_step = ...
     loop:
         vec_iv = PHI <vec_init, vec_loop>
         ...
         STMT
         ...
         vec_loop = vec_iv + vec_step;  */

  /* Create the induction-phi that defines the induction-operand.  */
  vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
  induction_phi = create_phi_node (vec_dest, iv_loop->header);
  set_vinfo_for_stmt (induction_phi,
                      new_stmt_vec_info (induction_phi, loop_vinfo));
  induc_def = PHI_RESULT (induction_phi);

  /* Create the iv update inside the loop  */
  vec_def = make_ssa_name (vec_dest);
  new_stmt = gimple_build_assign (vec_def, PLUS_EXPR, induc_def, vec_step);
  gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
  set_vinfo_for_stmt (new_stmt, new_stmt_vec_info (new_stmt, loop_vinfo));

  /* Set the arguments of the phi node:  */
  add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
  add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
               UNKNOWN_LOCATION);

  STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = induction_phi;

  /* In case that vectorization factor (VF) is bigger than the number
     of elements that we can fit in a vectype (nunits), we have to generate
     more than one vector stmt - i.e - we need to "unroll" the
     vector stmt by a factor VF/nunits.  For more details see documentation
     in vectorizable_operation.  */

  if (ncopies > 1)
    {
      gimple_seq seq = NULL;
      stmt_vec_info prev_stmt_vinfo;
      /* FORNOW. This restriction should be relaxed.  */
      gcc_assert (!nested_in_vect_loop);

      /* Create the vector that holds the step of the induction.  */
      if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
        {
          expr = build_int_cst (integer_type_node, nunits);
          expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
        }
      else
        expr = build_int_cst (TREE_TYPE (step_expr), nunits);
      new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
                               expr, step_expr);
      if (seq)
        {
          new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
          gcc_assert (!new_bb);
        }

      t = unshare_expr (new_name);
      gcc_assert (CONSTANT_CLASS_P (new_name)
                  || TREE_CODE (new_name) == SSA_NAME);
      new_vec = build_vector_from_val (vectype, t);
      vec_step = vect_init_vector (phi, new_vec, vectype, NULL);

      vec_def = induc_def;
      prev_stmt_vinfo = vinfo_for_stmt (induction_phi);
      for (i = 1; i < ncopies; i++)
        {
          /* vec_i = vec_prev + vec_step  */
          new_stmt = gimple_build_assign (vec_dest, PLUS_EXPR,
                                          vec_def, vec_step);
          vec_def = make_ssa_name (vec_dest, new_stmt);
          gimple_assign_set_lhs (new_stmt, vec_def);

          gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
          set_vinfo_for_stmt (new_stmt,
                              new_stmt_vec_info (new_stmt, loop_vinfo));
          STMT_VINFO_RELATED_STMT (prev_stmt_vinfo) = new_stmt;
          prev_stmt_vinfo = vinfo_for_stmt (new_stmt);
        }
    }
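  /* Illustration of the unrolling above: with VF == 8 and a 4-element
     vectype, ncopies == 2.  The first copy is the PHI result itself; the
     second copy adds {nunits*S, ...} == {4*S, 4*S, 4*S, 4*S}, so the two
     vectors together cover all eight lanes of one vector iteration.  */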
  if (nested_in_vect_loop)
    {
      /* Find the loop-closed exit-phi of the induction, and record
         the final vector of induction results:  */
      exit_phi = NULL;
      FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
        {
          gimple *use_stmt = USE_STMT (use_p);
          if (is_gimple_debug (use_stmt))
            continue;

          if (!flow_bb_inside_loop_p (iv_loop, gimple_bb (use_stmt)))
            {
              exit_phi = use_stmt;
              break;
            }
        }
      if (exit_phi)
        {
          stmt_vec_info stmt_vinfo = vinfo_for_stmt (exit_phi);
          /* FORNOW. Currently not supporting the case that an inner-loop induction
             is not used in the outer-loop (i.e. only outside the outer-loop).  */
          gcc_assert (STMT_VINFO_RELEVANT_P (stmt_vinfo)
                      && !STMT_VINFO_LIVE_P (stmt_vinfo));

          STMT_VINFO_VEC_STMT (stmt_vinfo) = new_stmt;
          if (dump_enabled_p ())
            {
              dump_printf_loc (MSG_NOTE, vect_location,
                               "vector of inductions after inner-loop:");
              dump_gimple_stmt (MSG_NOTE, TDF_SLIM, new_stmt, 0);
            }
        }
    }

  if (dump_enabled_p ())
    {
      dump_printf_loc (MSG_NOTE, vect_location,
                       "transform induction: created def-use cycle: ");
      dump_gimple_stmt (MSG_NOTE, TDF_SLIM, induction_phi, 0);
      dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
                        SSA_NAME_DEF_STMT (vec_def), 0);
    }

  return true;
}
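/* Illustrative sketch, not part of the vectorizer: the scalar value that
   ends up in a given lane of the induction code above, written as plain
   arithmetic.  The function name and parameters are hypothetical.  Lane L
   of vector copy C in vector iteration J of a (non-SLP) IV with initial
   value INIT and step STEP holds INIT + (J * VF + C * NUNITS + L) * STEP.  */

static inline double
example_iv_lane_value (double init, double step, unsigned vf, unsigned nunits,
                       unsigned j, unsigned c, unsigned l)
{
  /* Scalar iteration index covered by this lane.  */
  unsigned scalar_iter = j * vf + c * nunits + l;
  return init + scalar_iter * step;
}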
/* Function vectorizable_live_operation.

   STMT computes a value that is used outside the loop.  Check if
   it can be supported.  */

bool
vectorizable_live_operation (gimple *stmt,
                             gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
                             slp_tree slp_node, int slp_index,
                             gimple **vec_stmt,
                             stmt_vector_for_cost *)
{
  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  imm_use_iterator imm_iter;
  tree lhs, lhs_type, bitsize, vec_bitsize;
  tree vectype = STMT_VINFO_VECTYPE (stmt_info);
  poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
  int ncopies;
  gimple *use_stmt;
  auto_vec<tree> vec_oprnds;
  int vec_entry = 0;
  poly_uint64 vec_index = 0;

  gcc_assert (STMT_VINFO_LIVE_P (stmt_info));

  if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def)
    return false;

  /* FORNOW.  CHECKME.  */
  if (nested_in_vect_loop_p (loop, stmt))
    return false;

  /* If STMT is not relevant and it is a simple assignment and its inputs are
     invariant then it can remain in place, unvectorized.  The original last
     scalar value that it computes will be used.  */
  if (!STMT_VINFO_RELEVANT_P (stmt_info))
    {
      gcc_assert (is_simple_and_all_uses_invariant (stmt, loop_vinfo));
      if (dump_enabled_p ())
        dump_printf_loc (MSG_NOTE, vect_location,
                         "statement is simple and uses invariant.  Leaving in "
                         "place.\n");
      return true;
    }

  if (slp_node)
    ncopies = 1;
  else
    ncopies = vect_get_num_copies (loop_vinfo, vectype);

  if (slp_node)
    {
      gcc_assert (slp_index >= 0);

      int num_scalar = SLP_TREE_SCALAR_STMTS (slp_node).length ();
      int num_vec = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);

      /* Get the last occurrence of the scalar index from the concatenation of
         all the slp vectors. Calculate which slp vector it is and the index
         within.  */
      poly_uint64 pos = (num_vec * nunits) - num_scalar + slp_index;

      /* Calculate which vector contains the result, and which lane of
         that vector we need.  */
      if (!can_div_trunc_p (pos, nunits, &vec_entry, &vec_index))
        {
          if (dump_enabled_p ())
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                             "Cannot determine which vector holds the"
                             " final result.\n");
          return false;
        }
    }
  if (!vec_stmt)
    {
      /* No transformation required.  */
      if (LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo))
        {
          if (!direct_internal_fn_supported_p (IFN_EXTRACT_LAST, vectype,
                                               OPTIMIZE_FOR_SPEED))
            {
              if (dump_enabled_p ())
                dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                 "can't use a fully-masked loop because "
                                 "the target doesn't support extract last "
                                 "reduction.\n");
              LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
            }
          else if (slp_node)
            {
              if (dump_enabled_p ())
                dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                 "can't use a fully-masked loop because an "
                                 "SLP statement is live after the loop.\n");
              LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
            }
          else if (ncopies > 1)
            {
              if (dump_enabled_p ())
                dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                 "can't use a fully-masked loop because"
                                 " ncopies is greater than 1.\n");
              LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
            }
          else
            {
              gcc_assert (ncopies == 1 && !slp_node);
              vect_record_loop_mask (loop_vinfo,
                                     &LOOP_VINFO_MASKS (loop_vinfo),
                                     1, vectype);
            }
        }
      return true;
    }

  /* If stmt has a related stmt, then use that for getting the lhs.  */
  if (is_pattern_stmt_p (stmt_info))
    stmt = STMT_VINFO_RELATED_STMT (stmt_info);

  lhs = (is_a <gphi *> (stmt)) ? gimple_phi_result (stmt)
        : gimple_get_lhs (stmt);
  lhs_type = TREE_TYPE (lhs);

  bitsize = (VECTOR_BOOLEAN_TYPE_P (vectype)
             ? bitsize_int (TYPE_PRECISION (TREE_TYPE (vectype)))
             : TYPE_SIZE (TREE_TYPE (vectype)));
  vec_bitsize = TYPE_SIZE (vectype);

  /* Get the vectorized lhs of STMT and the lane to use (counted in bits).  */
  tree vec_lhs, bitstart;
  if (slp_node)
    {
      gcc_assert (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo));

      /* Get the correct slp vectorized stmt.  */
      gimple *vec_stmt = SLP_TREE_VEC_STMTS (slp_node)[vec_entry];
      if (gphi *phi = dyn_cast <gphi *> (vec_stmt))
        vec_lhs = gimple_phi_result (phi);
      else
        vec_lhs = gimple_get_lhs (vec_stmt);

      /* Get entry to use.  */
      bitstart = bitsize_int (vec_index);
      bitstart = int_const_binop (MULT_EXPR, bitsize, bitstart);
    }
  else
    {
      enum vect_def_type dt = STMT_VINFO_DEF_TYPE (stmt_info);
      vec_lhs = vect_get_vec_def_for_operand_1 (stmt, dt);
      gcc_checking_assert (ncopies == 1
                           || !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo));

      /* For multiple copies, get the last copy.  */
      for (int i = 1; i < ncopies; ++i)
        vec_lhs = vect_get_vec_def_for_stmt_copy (vect_unknown_def_type,
                                                  vec_lhs);

      /* Get the last lane in the vector.  */
      bitstart = int_const_binop (MINUS_EXPR, vec_bitsize, bitsize);
    }
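  /* Illustration of the bit offsets above: for a V4SI vector, bitsize == 32
     and vec_bitsize == 128.  In the SLP case with vec_index == 2 the lane
     to extract starts at bit 2 * 32 == 64; in the non-SLP case the last
     lane is used, starting at bit 128 - 32 == 96.  */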
  gimple_seq stmts = NULL;
  tree new_tree;
  if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
    {
      /* Emit:

           SCALAR_RES = EXTRACT_LAST <VEC_LHS, MASK>

         where VEC_LHS is the vectorized live-out result and MASK is
         the loop mask for the final iteration.  */
      gcc_assert (ncopies == 1 && !slp_node);
      tree scalar_type = TREE_TYPE (STMT_VINFO_VECTYPE (stmt_info));
      tree mask = vect_get_loop_mask (gsi, &LOOP_VINFO_MASKS (loop_vinfo),
                                      1, vectype, 0);
      tree scalar_res = gimple_build (&stmts, CFN_EXTRACT_LAST,
                                      scalar_type, mask, vec_lhs);

      /* Convert the extracted vector element to the required scalar type.  */
      new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
    }
  else
    {
      tree bftype = TREE_TYPE (vectype);
      if (VECTOR_BOOLEAN_TYPE_P (vectype))
        bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1);
      new_tree = build3 (BIT_FIELD_REF, bftype, vec_lhs, bitsize, bitstart);
      new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree),
                                       &stmts, true, NULL_TREE);
    }

  if (stmts)
    gsi_insert_seq_on_edge_immediate (single_exit (loop), stmts);

  /* Replace use of lhs with newly computed result.  If the use stmt is a
     single arg PHI, just replace all uses of PHI result.  It's necessary
     because lcssa PHI defining lhs may be before newly inserted stmt.  */
  use_operand_p use_p;
  FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
    if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt))
        && !is_gimple_debug (use_stmt))
      {
        if (gimple_code (use_stmt) == GIMPLE_PHI
            && gimple_phi_num_args (use_stmt) == 1)
          {
            replace_uses_by (gimple_phi_result (use_stmt), new_tree);
          }
        else
          {
            FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
              SET_USE (use_p, new_tree);
          }
        update_stmt (use_stmt);
      }

  return true;
}
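/* Illustrative sketch, not part of GCC: the effect of the EXTRACT_LAST path
   above, modelled on plain arrays.  The function name and types are
   hypothetical; it returns the element of VEC that corresponds to the last
   set lane of MASK, which is what the fully-masked code extracts for the
   final, partial vector iteration.  */

static inline int
example_extract_last (const int *vec, const unsigned char *mask, unsigned n)
{
  int res = 0;
  for (unsigned i = 0; i < n; ++i)
    if (mask[i])
      res = vec[i];
  return res;
}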
/* Kill any debug uses outside LOOP of SSA names defined in STMT.  */

static void
vect_loop_kill_debug_uses (struct loop *loop, gimple *stmt)
{
  ssa_op_iter op_iter;
  imm_use_iterator imm_iter;
  def_operand_p def_p;
  gimple *ustmt;

  FOR_EACH_PHI_OR_STMT_DEF (def_p, stmt, op_iter, SSA_OP_DEF)
    {
      FOR_EACH_IMM_USE_STMT (ustmt, imm_iter, DEF_FROM_PTR (def_p))
        {
          basic_block bb;

          if (!is_gimple_debug (ustmt))
            continue;

          bb = gimple_bb (ustmt);

          if (!flow_bb_inside_loop_p (loop, bb))
            {
              if (gimple_debug_bind_p (ustmt))
                {
                  if (dump_enabled_p ())
                    dump_printf_loc (MSG_NOTE, vect_location,
                                     "killing debug use\n");

                  gimple_debug_bind_reset_value (ustmt);
                  update_stmt (ustmt);
                }
              else
                gcc_unreachable ();
            }
        }
    }
}

/* Given loop represented by LOOP_VINFO, return true if computation of
   LOOP_VINFO_NITERS (= LOOP_VINFO_NITERSM1 + 1) doesn't overflow, false
   otherwise.  */

static bool
loop_niters_no_overflow (loop_vec_info loop_vinfo)
{
  /* Constant case.  */
  if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
    {
      tree cst_niters = LOOP_VINFO_NITERS (loop_vinfo);
      tree cst_nitersm1 = LOOP_VINFO_NITERSM1 (loop_vinfo);

      gcc_assert (TREE_CODE (cst_niters) == INTEGER_CST);
      gcc_assert (TREE_CODE (cst_nitersm1) == INTEGER_CST);
      if (wi::to_widest (cst_nitersm1) < wi::to_widest (cst_niters))
        return true;
    }

  widest_int max;
  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  /* Check the upper bound of loop niters.  */
  if (get_max_loop_iterations (loop, &max))
    {
      tree type = TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo));
      signop sgn = TYPE_SIGN (type);
      widest_int type_max = widest_int::from (wi::max_value (type), sgn);
      if (max < type_max)
        return true;
    }
  return false;
}

/* Return a mask type with half the number of elements as TYPE.  */

tree
vect_halve_mask_nunits (tree type)
{
  poly_uint64 nunits = exact_div (TYPE_VECTOR_SUBPARTS (type), 2);
  return build_truth_vector_type (nunits, current_vector_size);
}

/* Return a mask type with twice as many elements as TYPE.  */

tree
vect_double_mask_nunits (tree type)
{
  poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (type) * 2;
  return build_truth_vector_type (nunits, current_vector_size);
}

/* Record that a fully-masked version of LOOP_VINFO would need MASKS to
   contain a sequence of NVECTORS masks that each control a vector of type
   VECTYPE.  */

void
vect_record_loop_mask (loop_vec_info loop_vinfo, vec_loop_masks *masks,
                       unsigned int nvectors, tree vectype)
{
  gcc_assert (nvectors != 0);
  if (masks->length () < nvectors)
    masks->safe_grow_cleared (nvectors);
  rgroup_masks *rgm = &(*masks)[nvectors - 1];
  /* The number of scalars per iteration and the number of vectors are
     both compile-time constants.  */
  unsigned int nscalars_per_iter
    = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
                 LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
  if (rgm->max_nscalars_per_iter < nscalars_per_iter)
    {
      rgm->max_nscalars_per_iter = nscalars_per_iter;
      rgm->mask_type = build_same_sized_truth_vector_type (vectype);
    }
}
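/* Illustration for vect_record_loop_mask (numbers are made up): an rgroup
   that needs NVECTORS == 2 masks of type V8HI in a loop with VF == 16
   controls 2 * 8 / 16 == 1 scalar per iteration.  A later request for two
   V16QI masks in the same loop would compute 2 * 16 / 16 == 2 and raise
   max_nscalars_per_iter of that rgroup accordingly, so the widest
   requirement recorded so far wins.  */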
/* Given a complete set of masks MASKS, extract mask number INDEX
   for an rgroup that operates on NVECTORS vectors of type VECTYPE,
   where 0 <= INDEX < NVECTORS.  Insert any set-up statements before GSI.

   See the comment above vec_loop_masks for more details about the mask
   arrangement.  */

tree
vect_get_loop_mask (gimple_stmt_iterator *gsi, vec_loop_masks *masks,
                    unsigned int nvectors, tree vectype, unsigned int index)
{
  rgroup_masks *rgm = &(*masks)[nvectors - 1];
  tree mask_type = rgm->mask_type;

  /* Populate the rgroup's mask array, if this is the first time we've
     used it.  */
  if (rgm->masks.is_empty ())
    {
      rgm->masks.safe_grow_cleared (nvectors);
      for (unsigned int i = 0; i < nvectors; ++i)
        {
          tree mask = make_temp_ssa_name (mask_type, NULL, "loop_mask");
          /* Provide a dummy definition until the real one is available.  */
          SSA_NAME_DEF_STMT (mask) = gimple_build_nop ();
          rgm->masks[i] = mask;
        }
    }

  tree mask = rgm->masks[index];
  if (maybe_ne (TYPE_VECTOR_SUBPARTS (mask_type),
                TYPE_VECTOR_SUBPARTS (vectype)))
    {
      /* A loop mask for data type X can be reused for data type Y
         if X has N times more elements than Y and if Y's elements
         are N times bigger than X's.  In this case each sequence
         of N elements in the loop mask will be all-zero or all-one.
         We can then view-convert the mask so that each sequence of
         N elements is replaced by a single element.  */
      gcc_assert (multiple_p (TYPE_VECTOR_SUBPARTS (mask_type),
                              TYPE_VECTOR_SUBPARTS (vectype)));
      gimple_seq seq = NULL;
      mask_type = build_same_sized_truth_vector_type (vectype);
      mask = gimple_build (&seq, VIEW_CONVERT_EXPR, mask_type, mask);
      if (seq)
        gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
    }
  return mask;
}
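/* Illustrative sketch, not part of GCC: the mask-reuse idea above, modelled
   on byte masks.  A mask with N times more (and N times narrower) lanes
   than the data vector is, by construction, all-zero or all-one within each
   group of N lanes, so collapsing every group to one representative lane
   yields an equivalent mask for the wider element type.  All names here are
   hypothetical.  */

static inline void
example_collapse_mask (const unsigned char *wide_mask, unsigned n_wide,
                       unsigned group, unsigned char *narrow_mask)
{
  for (unsigned i = 0; i < n_wide / group; ++i)
    /* Every lane in the group carries the same value, so the first lane of
       the group can stand for all of them.  */
    narrow_mask[i] = wide_mask[i * group];
}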
/* Scale profiling counters by estimation for LOOP which is vectorized
   by factor VF.  */

static void
scale_profile_for_vect_loop (struct loop *loop, unsigned vf)
{
  edge preheader = loop_preheader_edge (loop);
  /* Reduce loop iterations by the vectorization factor.  */
  gcov_type new_est_niter = niter_for_unrolled_loop (loop, vf);
  profile_count freq_h = loop->header->count, freq_e = preheader->count ();

  if (freq_h.nonzero_p ())
    {
      profile_probability p;

      /* Avoid dropping loop body profile counter to 0 because of zero count
         in loop's preheader.  */
      if (!(freq_e == profile_count::zero ()))
        freq_e = freq_e.force_nonzero ();
      p = freq_e.apply_scale (new_est_niter + 1, 1).probability_in (freq_h);
      scale_loop_frequencies (loop, p);
    }

  edge exit_e = single_exit (loop);
  exit_e->probability = profile_probability::always ()
                          .apply_scale (1, new_est_niter + 1);

  edge exit_l = single_pred_edge (loop->latch);
  profile_probability prob = exit_l->probability;
  exit_l->probability = exit_e->probability.invert ();
  if (prob.initialized_p () && exit_l->probability.initialized_p ())
    scale_bbs_frequencies (&loop->latch, 1, exit_l->probability / prob);
}
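/* Illustration for the scaling above: if the vector loop is estimated to
   iterate new_est_niter == 3 times, the exit edge gets probability
   1 / (3 + 1) == 25% and the latch edge its inverse (75%), while the body
   counts are rescaled so that the header count is roughly the preheader
   count times 4.  */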
/* Function vect_transform_loop.

   The analysis phase has determined that the loop is vectorizable.
   Vectorize the loop - created vectorized stmts to replace the scalar
   stmts in the loop, and update the loop exit condition.
   Returns scalar epilogue loop if any.  */

struct loop *
vect_transform_loop (loop_vec_info loop_vinfo)
{
  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  struct loop *epilogue = NULL;
  basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
  int nbbs = loop->num_nodes;
  int i;
  tree niters_vector = NULL_TREE;
  tree step_vector = NULL_TREE;
  tree niters_vector_mult_vf = NULL_TREE;
  poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
  unsigned int lowest_vf = constant_lower_bound (vf);
  bool grouped_store;
  bool slp_scheduled = false;
  gimple *stmt, *pattern_stmt;
  gimple_seq pattern_def_seq = NULL;
  gimple_stmt_iterator pattern_def_si = gsi_none ();
  bool transform_pattern_stmt = false;
  bool check_profitability = false;
  unsigned int th;

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location, "=== vec_transform_loop ===\n");

  /* Use the more conservative vectorization threshold.  If the number
     of iterations is constant assume the cost check has been performed
     by our caller.  If the threshold makes all loops profitable that
     run at least the (estimated) vectorization factor number of times
     checking is pointless, too.  */
  th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
  if (th >= vect_vf_for_cost (loop_vinfo)
      && !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_NOTE, vect_location,
                         "Profitability threshold is %d loop iterations.\n",
                         th);
      check_profitability = true;
    }

  /* Make sure there exists a single-predecessor exit bb.  Do this before
     versioning.  */
  edge e = single_exit (loop);
  if (! single_pred_p (e->dest))
    {
      split_loop_exit_edge (e);
      if (dump_enabled_p ())
        dump_printf (MSG_NOTE, "split exit edge\n");
    }

  /* Version the loop first, if required, so the profitability check
     comes first.  */

  if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
    {
      poly_uint64 versioning_threshold
        = LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo);
      if (check_profitability
          && ordered_p (poly_uint64 (th), versioning_threshold))
        {
          versioning_threshold = ordered_max (poly_uint64 (th),
                                              versioning_threshold);
          check_profitability = false;
        }
      vect_loop_versioning (loop_vinfo, th, check_profitability,
                            versioning_threshold);
      check_profitability = false;
    }

  /* Make sure there exists a single-predecessor exit bb also on the
     scalar loop copy.  Do this after versioning but before peeling
     so CFG structure is fine for both scalar and if-converted loop
     to make slpeel_duplicate_current_defs_from_edges face matched
     loop closed PHI nodes on the exit.  */
  if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))
    {
      e = single_exit (LOOP_VINFO_SCALAR_LOOP (loop_vinfo));
      if (! single_pred_p (e->dest))
        {
          split_loop_exit_edge (e);
          if (dump_enabled_p ())
            dump_printf (MSG_NOTE, "split exit edge of scalar loop\n");
        }
    }

  tree niters = vect_build_loop_niters (loop_vinfo);
  LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = niters;
  tree nitersm1 = unshare_expr (LOOP_VINFO_NITERSM1 (loop_vinfo));
  bool niters_no_overflow = loop_niters_no_overflow (loop_vinfo);
  epilogue = vect_do_peeling (loop_vinfo, niters, nitersm1, &niters_vector,
                              &step_vector, &niters_vector_mult_vf, th,
                              check_profitability, niters_no_overflow);

  if (niters_vector == NULL_TREE)
    {
      if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
          && !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
          && known_eq (lowest_vf, vf))
        {
          niters_vector
            = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
                             LOOP_VINFO_INT_NITERS (loop_vinfo) / lowest_vf);
          step_vector = build_one_cst (TREE_TYPE (niters));
        }
      else
        vect_gen_vector_loop_niters (loop_vinfo, niters, &niters_vector,
                                     &step_vector, niters_no_overflow);
    }
  /* 1) Make sure the loop header has exactly two entries
     2) Make sure we have a preheader basic block.  */

  gcc_assert (EDGE_COUNT (loop->header->preds) == 2);

  split_edge (loop_preheader_edge (loop));

  if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
      && vect_use_loop_mask_for_alignment_p (loop_vinfo))
    /* This will deal with any possible peeling.  */
    vect_prepare_for_masked_peels (loop_vinfo);

  /* FORNOW: the vectorizer supports only loops which body consist
     of one basic block (header + empty latch). When the vectorizer will
     support more involved loop forms, the order by which the BBs are
     traversed need to be reconsidered.  */

  for (i = 0; i < nbbs; i++)
    {
      basic_block bb = bbs[i];
      stmt_vec_info stmt_info;

      for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
           gsi_next (&si))
        {
          gphi *phi = si.phi ();
          if (dump_enabled_p ())
            {
              dump_printf_loc (MSG_NOTE, vect_location,
                               "------>vectorizing phi: ");
              dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
            }
          stmt_info = vinfo_for_stmt (phi);

          if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
            vect_loop_kill_debug_uses (loop, phi);

          if (!STMT_VINFO_RELEVANT_P (stmt_info)
              && !STMT_VINFO_LIVE_P (stmt_info))
            continue;

          if (STMT_VINFO_VECTYPE (stmt_info)
              && (maybe_ne
                  (TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info)), vf))
              && dump_enabled_p ())
            dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");

          if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
               || STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
               || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
              && ! PURE_SLP_STMT (stmt_info))
            {
              if (dump_enabled_p ())
                dump_printf_loc (MSG_NOTE, vect_location, "transform phi.\n");
              vect_transform_stmt (phi, NULL, NULL, NULL, NULL);
            }
        }
      pattern_stmt = NULL;
      for (gimple_stmt_iterator si = gsi_start_bb (bb);
           !gsi_end_p (si) || transform_pattern_stmt;)
        {
          bool is_store;

          if (transform_pattern_stmt)
            stmt = pattern_stmt;
          else
            {
              stmt = gsi_stmt (si);
              /* During vectorization remove existing clobber stmts.  */
              if (gimple_clobber_p (stmt))
                {
                  unlink_stmt_vdef (stmt);
                  gsi_remove (&si, true);
                  release_defs (stmt);
                  continue;
                }
            }

          if (dump_enabled_p ())
            {
              dump_printf_loc (MSG_NOTE, vect_location,
                               "------>vectorizing statement: ");
              dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
            }

          stmt_info = vinfo_for_stmt (stmt);

          /* vector stmts created in the outer-loop during vectorization of
             stmts in an inner-loop may not have a stmt_info, and do not
             need to be vectorized.  */
          if (!stmt_info)
            {
              gsi_next (&si);
              continue;
            }

          if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
            vect_loop_kill_debug_uses (loop, stmt);

          if (!STMT_VINFO_RELEVANT_P (stmt_info)
              && !STMT_VINFO_LIVE_P (stmt_info))
            {
              if (STMT_VINFO_IN_PATTERN_P (stmt_info)
                  && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
                  && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
                      || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
                {
                  stmt = pattern_stmt;
                  stmt_info = vinfo_for_stmt (stmt);
                }
              else
                {
                  gsi_next (&si);
                  continue;
                }
            }
          else if (STMT_VINFO_IN_PATTERN_P (stmt_info)
                   && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
                   && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
                       || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
            transform_pattern_stmt = true;

          /* If pattern statement has def stmts, vectorize them too.  */
          if (is_pattern_stmt_p (stmt_info))
            {
              if (pattern_def_seq == NULL)
                {
                  pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
                  pattern_def_si = gsi_start (pattern_def_seq);
                }
              else if (!gsi_end_p (pattern_def_si))
                gsi_next (&pattern_def_si);
              if (pattern_def_seq != NULL)
                {
                  gimple *pattern_def_stmt = NULL;
                  stmt_vec_info pattern_def_stmt_info = NULL;

                  while (!gsi_end_p (pattern_def_si))
                    {
                      pattern_def_stmt = gsi_stmt (pattern_def_si);
                      pattern_def_stmt_info
                        = vinfo_for_stmt (pattern_def_stmt);
                      if (STMT_VINFO_RELEVANT_P (pattern_def_stmt_info)
                          || STMT_VINFO_LIVE_P (pattern_def_stmt_info))
                        break;
                      gsi_next (&pattern_def_si);
                    }

                  if (!gsi_end_p (pattern_def_si))
                    {
                      if (dump_enabled_p ())
                        {
                          dump_printf_loc (MSG_NOTE, vect_location,
                                           "==> vectorizing pattern def "
                                           "stmt: ");
                          dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
                                            pattern_def_stmt, 0);
                        }

                      stmt = pattern_def_stmt;
                      stmt_info = pattern_def_stmt_info;
                    }
                  else
                    {
                      pattern_def_si = gsi_none ();
                      transform_pattern_stmt = false;
                    }
                }
              else
                transform_pattern_stmt = false;
            }

          if (STMT_VINFO_VECTYPE (stmt_info))
            {
              poly_uint64 nunits
                = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
              if (!STMT_SLP_TYPE (stmt_info)
                  && maybe_ne (nunits, vf)
                  && dump_enabled_p ())
                /* For SLP VF is set according to unrolling factor, and not
                   to vector size, hence for SLP this print is not valid.  */
                dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
            }

          /* SLP.  Schedule all the SLP instances when the first SLP stmt is
             reached.  */
          if (STMT_SLP_TYPE (stmt_info))
            {
              if (!slp_scheduled)
                {
                  slp_scheduled = true;

                  if (dump_enabled_p ())
                    dump_printf_loc (MSG_NOTE, vect_location,
                                     "=== scheduling SLP instances ===\n");

                  vect_schedule_slp (loop_vinfo);
                }

              /* Hybrid SLP stmts must be vectorized in addition to SLP.  */
              if (!vinfo_for_stmt (stmt) || PURE_SLP_STMT (stmt_info))
                {
                  if (!transform_pattern_stmt && gsi_end_p (pattern_def_si))
                    {
                      pattern_def_seq = NULL;
                      gsi_next (&si);
                    }
                  continue;
                }
            }

          /* -------- vectorize statement ------------ */
          if (dump_enabled_p ())
            dump_printf_loc (MSG_NOTE, vect_location, "transform statement.\n");

          grouped_store = false;
          is_store = vect_transform_stmt (stmt, &si, &grouped_store, NULL, NULL);
          if (is_store)
            {
              if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
                {
                  /* Interleaving. If IS_STORE is TRUE, the vectorization of the
                     interleaving chain was completed - free all the stores in
                     the chain.  */
                  gsi_next (&si);
                  vect_remove_stores (GROUP_FIRST_ELEMENT (stmt_info));
                }
              else
                {
                  /* Free the attached stmt_vec_info and remove the stmt.  */
                  gimple *store = gsi_stmt (si);
                  free_stmt_vec_info (store);
                  unlink_stmt_vdef (store);
                  gsi_remove (&si, true);
                  release_defs (store);
                }

              /* Stores can only appear at the end of pattern statements.  */
              gcc_assert (!transform_pattern_stmt);
              pattern_def_seq = NULL;
            }
          else if (!transform_pattern_stmt && gsi_end_p (pattern_def_si))
            {
              pattern_def_seq = NULL;
              gsi_next (&si);
            }
        }                       /* stmts in BB */
      /* Stub out scalar statements that must not survive vectorization.
         Doing this here helps with grouped statements, or statements that
         are involved in patterns.  */
      for (gimple_stmt_iterator gsi = gsi_start_bb (bb);
           !gsi_end_p (gsi); gsi_next (&gsi))
        {
          gcall *call = dyn_cast <gcall *> (gsi_stmt (gsi));
          if (call && gimple_call_internal_p (call, IFN_MASK_LOAD))
            {
              tree lhs = gimple_get_lhs (call);
              if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
                {
                  tree zero = build_zero_cst (TREE_TYPE (lhs));
                  gimple *new_stmt = gimple_build_assign (lhs, zero);
                  gsi_replace (&gsi, new_stmt, true);
                }
            }
        }
    }                           /* BBs in loop */
  /* The vectorization factor is always > 1, so if we use an IV increment of 1,
     a zero NITERS becomes a nonzero NITERS_VECTOR.  */
  if (integer_onep (step_vector))
    niters_no_overflow = true;
  vect_set_loop_condition (loop, loop_vinfo, niters_vector, step_vector,
                           niters_vector_mult_vf, !niters_no_overflow);

  unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
  scale_profile_for_vect_loop (loop, assumed_vf);
  /* True if the final iteration might not handle a full vector's
     worth of scalar iterations.  */
  bool final_iter_may_be_partial = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
  /* The minimum number of iterations performed by the epilogue.  This
     is 1 when peeling for gaps because we always need a final scalar
     iteration.  */
  int min_epilogue_iters = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0;
  /* +1 to convert latch counts to loop iteration counts,
     -min_epilogue_iters to remove iterations that cannot be performed
     by the vector code.  */
  int bias_for_lowest = 1 - min_epilogue_iters;
  int bias_for_assumed = bias_for_lowest;
  int alignment_npeels = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
  if (alignment_npeels && LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
    {
      /* When the amount of peeling is known at compile time, the first
         iteration will have exactly alignment_npeels active elements.
         In the worst case it will have at least one.  */
      int min_first_active = (alignment_npeels > 0 ? alignment_npeels : 1);
      bias_for_lowest += lowest_vf - min_first_active;
      bias_for_assumed += assumed_vf - min_first_active;
    }
  /* In these calculations the "- 1" converts loop iteration counts
     back to latch counts.  */
  if (loop->any_upper_bound)
    loop->nb_iterations_upper_bound
      = (final_iter_may_be_partial
         ? wi::udiv_ceil (loop->nb_iterations_upper_bound + bias_for_lowest,
                          lowest_vf) - 1
         : wi::udiv_floor (loop->nb_iterations_upper_bound + bias_for_lowest,
                           lowest_vf) - 1);
  if (loop->any_likely_upper_bound)
    loop->nb_iterations_likely_upper_bound
      = (final_iter_may_be_partial
         ? wi::udiv_ceil (loop->nb_iterations_likely_upper_bound
                          + bias_for_lowest, lowest_vf) - 1
         : wi::udiv_floor (loop->nb_iterations_likely_upper_bound
                           + bias_for_lowest, lowest_vf) - 1);
  if (loop->any_estimate)
    loop->nb_iterations_estimate
      = (final_iter_may_be_partial
         ? wi::udiv_ceil (loop->nb_iterations_estimate + bias_for_assumed,
                          assumed_vf) - 1
         : wi::udiv_floor (loop->nb_iterations_estimate + bias_for_assumed,
                           assumed_vf) - 1);
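  /* Worked illustration of the biasing above: without peeling for gaps and
     without full masking, min_epilogue_iters == 0 and bias_for_lowest == 1.
     A scalar latch bound of 11 (at most 12 iterations) with lowest_vf == 4
     then becomes floor ((11 + 1) / 4) - 1 == 2, i.e. at most 3 vector
     iterations.  With full masking the ceiling division is used instead,
     since the final, partial vector iteration still counts.  */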
  if (dump_enabled_p ())
    {
      if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
        {
          dump_printf_loc (MSG_NOTE, vect_location,
                           "LOOP VECTORIZED\n");
          if (loop->inner)
            dump_printf_loc (MSG_NOTE, vect_location,
                             "OUTER LOOP VECTORIZED\n");
          dump_printf (MSG_NOTE, "\n");
        }
      else
        {
          dump_printf_loc (MSG_NOTE, vect_location,
                           "LOOP EPILOGUE VECTORIZED (VS=");
          dump_dec (MSG_NOTE, current_vector_size);
          dump_printf (MSG_NOTE, ")\n");
        }
    }

  /* Free SLP instances here because otherwise stmt reference counting
     won't work.  */
  slp_instance instance;
  FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
    vect_free_slp_instance (instance);
  LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
  /* Clear-up safelen field since its value is invalid after vectorization
     since vectorized loop can have loop-carried dependencies.  */
  loop->safelen = 0;

  /* Don't vectorize epilogue for epilogue.  */
  if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
    epilogue = NULL;

  if (!PARAM_VALUE (PARAM_VECT_EPILOGUES_NOMASK))
    epilogue = NULL;

  if (epilogue)
    {
      auto_vector_sizes vector_sizes;
      targetm.vectorize.autovectorize_vector_sizes (&vector_sizes);
      unsigned int next_size = 0;

      if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
          && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) >= 0
          && known_eq (vf, lowest_vf))
        {
          unsigned int eiters
            = (LOOP_VINFO_INT_NITERS (loop_vinfo)
               - LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo));
          eiters = eiters % lowest_vf;
          epilogue->nb_iterations_upper_bound = eiters - 1;

          unsigned int ratio;
          while (next_size < vector_sizes.length ()
                 && !(constant_multiple_p (current_vector_size,
                                           vector_sizes[next_size], &ratio)
                      && eiters >= lowest_vf / ratio))
            next_size += 1;
        }
      else
        while (next_size < vector_sizes.length ()
               && maybe_lt (current_vector_size, vector_sizes[next_size]))
          next_size += 1;

      if (next_size == vector_sizes.length ())
        epilogue = NULL;
    }

  if (epilogue)
    {
      epilogue->force_vectorize = loop->force_vectorize;
      epilogue->safelen = loop->safelen;
      epilogue->dont_vectorize = false;

      /* We may need to if-convert epilogue to vectorize it.  */
      if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))
        tree_if_conversion (epilogue);
    }

  return epilogue;
}
/* The code below is trying to perform simple optimization - revert
   if-conversion for masked stores, i.e. if the mask of a store is zero
   do not perform it and all stored value producers also if possible.
   For example,
     for (i=0; i<n; i++)
       if (c[i])
        {
          p1[i] += 1;
          p2[i] = p3[i] +2;
        }
   this transformation will produce the following semi-hammock:

   if (!mask__ifc__42.18_165 == { 0, 0, 0, 0, 0, 0, 0, 0 })
     {
       vect__11.19_170 = MASK_LOAD (vectp_p1.20_168, 0B, mask__ifc__42.18_165);
       vect__12.22_172 = vect__11.19_170 + vect_cst__171;
       MASK_STORE (vectp_p1.23_175, 0B, mask__ifc__42.18_165, vect__12.22_172);
       vect__18.25_182 = MASK_LOAD (vectp_p3.26_180, 0B, mask__ifc__42.18_165);
       vect__19.28_184 = vect__18.25_182 + vect_cst__183;
       MASK_STORE (vectp_p2.29_187, 0B, mask__ifc__42.18_165, vect__19.28_184);
     }
*/

void
optimize_mask_stores (struct loop *loop)
{
  basic_block *bbs = get_loop_body (loop);
  unsigned nbbs = loop->num_nodes;
  unsigned i;
  basic_block bb;
  struct loop *bb_loop;
  gimple_stmt_iterator gsi;
  gimple *stmt;
  auto_vec<gimple *> worklist;

  vect_location = find_loop_location (loop);
  /* Pick up all masked stores in loop if any.  */
  for (i = 0; i < nbbs; i++)
    {
      bb = bbs[i];
      for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
           gsi_next (&gsi))
        {
          stmt = gsi_stmt (gsi);
          if (gimple_call_internal_p (stmt, IFN_MASK_STORE))
            worklist.safe_push (stmt);
        }
    }

  free (bbs);
  if (worklist.is_empty ())
    return;

  /* Loop has masked stores.  */
  while (!worklist.is_empty ())
    {
      gimple *last, *last_store;
      edge e, efalse;
      tree mask;
      basic_block store_bb, join_bb;
      gimple_stmt_iterator gsi_to;
      tree vdef, new_vdef;
      gphi *phi;
      tree vectype;
      tree zero;

      last = worklist.pop ();
      mask = gimple_call_arg (last, 2);
      bb = gimple_bb (last);
      /* Create then_bb and if-then structure in CFG, then_bb belongs to
         the same loop as if_bb.  It could be different to LOOP when two
         level loop-nest is vectorized and mask_store belongs to the inner
         one.  */
      e = split_block (bb, last);
      bb_loop = bb->loop_father;
      gcc_assert (loop == bb_loop || flow_loop_nested_p (loop, bb_loop));
      join_bb = e->dest;
      store_bb = create_empty_bb (bb);
      add_bb_to_loop (store_bb, bb_loop);
      e->flags = EDGE_TRUE_VALUE;
      efalse = make_edge (bb, store_bb, EDGE_FALSE_VALUE);
      /* Put STORE_BB to likely part.  */
      efalse->probability = profile_probability::unlikely ();
      store_bb->count = efalse->count ();
      make_single_succ_edge (store_bb, join_bb, EDGE_FALLTHRU);
      if (dom_info_available_p (CDI_DOMINATORS))
        set_immediate_dominator (CDI_DOMINATORS, store_bb, bb);
      if (dump_enabled_p ())
        dump_printf_loc (MSG_NOTE, vect_location,
                         "Create new block %d to sink mask stores.",
                         store_bb->index);
      /* Create vector comparison with boolean result.  */
      vectype = TREE_TYPE (mask);
      zero = build_zero_cst (vectype);
      stmt = gimple_build_cond (EQ_EXPR, mask, zero, NULL_TREE, NULL_TREE);
      gsi = gsi_last_bb (bb);
      gsi_insert_after (&gsi, stmt, GSI_SAME_STMT);
      /* Create new PHI node for vdef of the last masked store:
         .MEM_2 = VDEF <.MEM_1>
         will be converted to
         .MEM.3 = VDEF <.MEM_1>
         and new PHI node will be created in join bb
         .MEM_2 = PHI <.MEM_1, .MEM_3>
      */
      vdef = gimple_vdef (last);
      new_vdef = make_ssa_name (gimple_vop (cfun), last);
      gimple_set_vdef (last, new_vdef);
      phi = create_phi_node (vdef, join_bb);
      add_phi_arg (phi, new_vdef, EDGE_SUCC (store_bb, 0), UNKNOWN_LOCATION);
      /* Put all masked stores with the same mask to STORE_BB if possible.  */
      while (true)
        {
          gimple_stmt_iterator gsi_from;
          gimple *stmt1 = NULL;

          /* Move masked store to STORE_BB.  */
          last_store = last;
          gsi = gsi_for_stmt (last);
          gsi_from = gsi;
          /* Shift GSI to the previous stmt for further traversal.  */
          gsi_prev (&gsi);
          gsi_to = gsi_start_bb (store_bb);
          gsi_move_before (&gsi_from, &gsi_to);
          /* Setup GSI_TO to the non-empty block start.  */
          gsi_to = gsi_start_bb (store_bb);
          if (dump_enabled_p ())
            {
              dump_printf_loc (MSG_NOTE, vect_location,
                               "Move stmt to created bb\n");
              dump_gimple_stmt (MSG_NOTE, TDF_SLIM, last, 0);
            }
          /* Move all stored value producers if possible.  */
          while (!gsi_end_p (gsi))
            {
              tree lhs;
              imm_use_iterator imm_iter;
              use_operand_p use_p;
              bool res;

              /* Skip debug statements.  */
              if (is_gimple_debug (gsi_stmt (gsi)))
                {
                  gsi_prev (&gsi);
                  continue;
                }
              stmt1 = gsi_stmt (gsi);
              /* Do not consider statements writing to memory or having
                 volatile operand.  */
              if (gimple_vdef (stmt1)
                  || gimple_has_volatile_ops (stmt1))
                break;
              gsi_from = gsi;
              gsi_prev (&gsi);
              lhs = gimple_get_lhs (stmt1);
              if (!lhs)
                break;

              /* LHS of vectorized stmt must be SSA_NAME.  */
              if (TREE_CODE (lhs) != SSA_NAME)
                break;

              if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
                {
                  /* Remove dead scalar statement.  */
                  if (has_zero_uses (lhs))
                    {
                      gsi_remove (&gsi_from, true);
                      continue;
                    }
                  break;
                }

              /* Check that LHS does not have uses outside of STORE_BB.  */
              res = true;
              FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
                {
                  gimple *use_stmt;
                  use_stmt = USE_STMT (use_p);
                  if (is_gimple_debug (use_stmt))
                    continue;
                  if (gimple_bb (use_stmt) != store_bb)
                    {
                      res = false;
                      break;
                    }
                }
              if (!res)
                break;

              if (gimple_vuse (stmt1)
                  && gimple_vuse (stmt1) != gimple_vuse (last_store))
                break;

              /* Can move STMT1 to STORE_BB.  */
              if (dump_enabled_p ())
                {
                  dump_printf_loc (MSG_NOTE, vect_location,
                                   "Move stmt to created bb\n");
                  dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt1, 0);
                }
              gsi_move_before (&gsi_from, &gsi_to);
              /* Shift GSI_TO for further insertion.  */
              gsi_prev (&gsi_to);
            }
          /* Put other masked stores with the same mask to STORE_BB.  */
          if (worklist.is_empty ()
              || gimple_call_arg (worklist.last (), 2) != mask
              || worklist.last () != stmt1)
            break;
          last = worklist.pop ();
        }
      add_phi_arg (phi, gimple_vuse (last_store), e, UNKNOWN_LOCATION);