2 Copyright (C) 2003-2018 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com> and
4 Ira Rosen <irar@il.ibm.com>
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
24 #include "coretypes.h"
31 #include "tree-pass.h"
33 #include "optabs-tree.h"
34 #include "diagnostic-core.h"
35 #include "fold-const.h"
36 #include "stor-layout.h"
39 #include "gimple-iterator.h"
40 #include "gimplify-me.h"
41 #include "tree-ssa-loop-ivopts.h"
42 #include "tree-ssa-loop-manip.h"
43 #include "tree-ssa-loop-niter.h"
44 #include "tree-ssa-loop.h"
47 #include "tree-scalar-evolution.h"
48 #include "tree-vectorizer.h"
49 #include "gimple-fold.h"
52 #include "tree-if-conv.h"
53 #include "internal-fn.h"
54 #include "tree-vector-builder.h"
55 #include "vec-perm-indices.h"
58 /* Loop Vectorization Pass.
60 This pass tries to vectorize loops.
62 For example, the vectorizer transforms the following simple loop:
64 short a[N]; short b[N]; short c[N]; int i;
70 as if it was manually vectorized by rewriting the source code into:
72 typedef int __attribute__((mode(V8HI))) v8hi;
73 short a[N]; short b[N]; short c[N]; int i;
74 v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
77 for (i=0; i<N/8; i++){
84 The main entry to this pass is vectorize_loops(), in which
85 the vectorizer applies a set of analyses on a given set of loops,
86 followed by the actual vectorization transformation for the loops that
87 had successfully passed the analysis phase.
88 Throughout this pass we make a distinction between two types of
89 data: scalars (which are represented by SSA_NAMES), and memory references
90 ("data-refs"). These two types of data require different handling both
91 during analysis and transformation. The types of data-refs that the
92 vectorizer currently supports are ARRAY_REFS which base is an array DECL
93 (not a pointer), and INDIRECT_REFS through pointers; both array and pointer
94 accesses are required to have a simple (consecutive) access pattern.
98 The driver for the analysis phase is vect_analyze_loop().
99 It applies a set of analyses, some of which rely on the scalar evolution
100 analyzer (scev) developed by Sebastian Pop.
102 During the analysis phase the vectorizer records some information
103 per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
104 loop, as well as general information about the loop as a whole, which is
105 recorded in a "loop_vec_info" struct attached to each loop.
107 Transformation phase:
108 =====================
109 The loop transformation phase scans all the stmts in the loop, and
110 creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
111 the loop that needs to be vectorized. It inserts the vector code sequence
112 just before the scalar stmt S, and records a pointer to the vector code
113 in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
114 attached to S). This pointer will be used for the vectorization of following
115 stmts which use the def of stmt S. Stmt S is removed if it writes to memory;
116 otherwise, we rely on dead code elimination for removing it.
118 For example, say stmt S1 was vectorized into stmt VS1:
121 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
124 To vectorize stmt S2, the vectorizer first finds the stmt that defines
125 the operand 'b' (S1), and gets the relevant vector def 'vb' from the
126 vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)). The
127 resulting sequence would be:
130 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
132 S2: a = b; STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2
134 Operands that are not SSA_NAMEs, are data-refs that appear in
135 load/store operations (like 'x[i]' in S1), and are handled differently.
139 Currently the only target specific information that is used is the
140 size of the vector (in bytes) - "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".
141 Targets that can support different sizes of vectors, for now will need
142 to specify one value for "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD". More
143 flexibility will be added in the future.
   Since we only vectorize operations whose vector form can be
   expressed using existing tree codes, to verify that an operation is
147 supported, the vectorizer checks the relevant optab at the relevant
148 machine_mode (e.g, optab_handler (add_optab, V8HImode)). If
149 the value found is CODE_FOR_nothing, then there's no target support, and
150 we can't vectorize the stmt.
152 For additional information on this project see:
153 http://gcc.gnu.org/projects/tree-ssa/vectorization.html
156 static void vect_estimate_min_profitable_iters (loop_vec_info
, int *, int *);
158 /* Function vect_determine_vectorization_factor
160 Determine the vectorization factor (VF). VF is the number of data elements
161 that are operated upon in parallel in a single iteration of the vectorized
162 loop. For example, when vectorizing a loop that operates on 4byte elements,
163 on a target with vector size (VS) 16byte, the VF is set to 4, since 4
164 elements can fit in a single vector register.
166 We currently support vectorization of loops in which all types operated upon
167 are of the same size. Therefore this function currently sets VF according to
168 the size of the types operated upon, and fails if there are multiple sizes
171 VF is also the factor by which the loop iterations are strip-mined, e.g.:
178 for (i=0; i<N; i+=VF){
179 a[i:VF] = b[i:VF] + c[i:VF];
184 vect_determine_vectorization_factor (loop_vec_info loop_vinfo
)
186 struct loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
187 basic_block
*bbs
= LOOP_VINFO_BBS (loop_vinfo
);
188 unsigned nbbs
= loop
->num_nodes
;
189 poly_uint64 vectorization_factor
= 1;
190 tree scalar_type
= NULL_TREE
;
193 stmt_vec_info stmt_info
;
196 gimple
*stmt
, *pattern_stmt
= NULL
;
197 gimple_seq pattern_def_seq
= NULL
;
198 gimple_stmt_iterator pattern_def_si
= gsi_none ();
199 bool analyze_pattern_stmt
= false;
201 auto_vec
<stmt_vec_info
> mask_producers
;
203 if (dump_enabled_p ())
204 dump_printf_loc (MSG_NOTE
, vect_location
,
205 "=== vect_determine_vectorization_factor ===\n");
207 for (i
= 0; i
< nbbs
; i
++)
209 basic_block bb
= bbs
[i
];
211 for (gphi_iterator si
= gsi_start_phis (bb
); !gsi_end_p (si
);
215 stmt_info
= vinfo_for_stmt (phi
);
216 if (dump_enabled_p ())
218 dump_printf_loc (MSG_NOTE
, vect_location
, "==> examining phi: ");
219 dump_gimple_stmt (MSG_NOTE
, TDF_SLIM
, phi
, 0);
222 gcc_assert (stmt_info
);
224 if (STMT_VINFO_RELEVANT_P (stmt_info
)
225 || STMT_VINFO_LIVE_P (stmt_info
))
227 gcc_assert (!STMT_VINFO_VECTYPE (stmt_info
));
228 scalar_type
= TREE_TYPE (PHI_RESULT (phi
));
230 if (dump_enabled_p ())
232 dump_printf_loc (MSG_NOTE
, vect_location
,
233 "get vectype for scalar type: ");
234 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, scalar_type
);
235 dump_printf (MSG_NOTE
, "\n");
238 vectype
= get_vectype_for_scalar_type (scalar_type
);
241 if (dump_enabled_p ())
243 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
244 "not vectorized: unsupported "
246 dump_generic_expr (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
,
248 dump_printf (MSG_MISSED_OPTIMIZATION
, "\n");
252 STMT_VINFO_VECTYPE (stmt_info
) = vectype
;
254 if (dump_enabled_p ())
256 dump_printf_loc (MSG_NOTE
, vect_location
, "vectype: ");
257 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, vectype
);
258 dump_printf (MSG_NOTE
, "\n");
261 if (dump_enabled_p ())
263 dump_printf_loc (MSG_NOTE
, vect_location
, "nunits = ");
264 dump_dec (MSG_NOTE
, TYPE_VECTOR_SUBPARTS (vectype
));
265 dump_printf (MSG_NOTE
, "\n");
268 vect_update_max_nunits (&vectorization_factor
, vectype
);
272 for (gimple_stmt_iterator si
= gsi_start_bb (bb
);
273 !gsi_end_p (si
) || analyze_pattern_stmt
;)
277 if (analyze_pattern_stmt
)
280 stmt
= gsi_stmt (si
);
282 stmt_info
= vinfo_for_stmt (stmt
);
284 if (dump_enabled_p ())
286 dump_printf_loc (MSG_NOTE
, vect_location
,
287 "==> examining statement: ");
288 dump_gimple_stmt (MSG_NOTE
, TDF_SLIM
, stmt
, 0);
291 gcc_assert (stmt_info
);
293 /* Skip stmts which do not need to be vectorized. */
294 if ((!STMT_VINFO_RELEVANT_P (stmt_info
)
295 && !STMT_VINFO_LIVE_P (stmt_info
))
296 || gimple_clobber_p (stmt
))
298 if (STMT_VINFO_IN_PATTERN_P (stmt_info
)
299 && (pattern_stmt
= STMT_VINFO_RELATED_STMT (stmt_info
))
300 && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt
))
301 || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt
))))
304 stmt_info
= vinfo_for_stmt (pattern_stmt
);
305 if (dump_enabled_p ())
307 dump_printf_loc (MSG_NOTE
, vect_location
,
308 "==> examining pattern statement: ");
309 dump_gimple_stmt (MSG_NOTE
, TDF_SLIM
, stmt
, 0);
314 if (dump_enabled_p ())
315 dump_printf_loc (MSG_NOTE
, vect_location
, "skip.\n");
320 else if (STMT_VINFO_IN_PATTERN_P (stmt_info
)
321 && (pattern_stmt
= STMT_VINFO_RELATED_STMT (stmt_info
))
322 && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt
))
323 || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt
))))
324 analyze_pattern_stmt
= true;
326 /* If a pattern statement has def stmts, analyze them too. */
327 if (is_pattern_stmt_p (stmt_info
))
329 if (pattern_def_seq
== NULL
)
331 pattern_def_seq
= STMT_VINFO_PATTERN_DEF_SEQ (stmt_info
);
332 pattern_def_si
= gsi_start (pattern_def_seq
);
334 else if (!gsi_end_p (pattern_def_si
))
335 gsi_next (&pattern_def_si
);
336 if (pattern_def_seq
!= NULL
)
338 gimple
*pattern_def_stmt
= NULL
;
339 stmt_vec_info pattern_def_stmt_info
= NULL
;
341 while (!gsi_end_p (pattern_def_si
))
343 pattern_def_stmt
= gsi_stmt (pattern_def_si
);
344 pattern_def_stmt_info
345 = vinfo_for_stmt (pattern_def_stmt
);
346 if (STMT_VINFO_RELEVANT_P (pattern_def_stmt_info
)
347 || STMT_VINFO_LIVE_P (pattern_def_stmt_info
))
349 gsi_next (&pattern_def_si
);
352 if (!gsi_end_p (pattern_def_si
))
354 if (dump_enabled_p ())
356 dump_printf_loc (MSG_NOTE
, vect_location
,
357 "==> examining pattern def stmt: ");
358 dump_gimple_stmt (MSG_NOTE
, TDF_SLIM
,
359 pattern_def_stmt
, 0);
362 stmt
= pattern_def_stmt
;
363 stmt_info
= pattern_def_stmt_info
;
367 pattern_def_si
= gsi_none ();
368 analyze_pattern_stmt
= false;
372 analyze_pattern_stmt
= false;
375 if (gimple_get_lhs (stmt
) == NULL_TREE
376 /* MASK_STORE has no lhs, but is ok. */
377 && (!is_gimple_call (stmt
)
378 || !gimple_call_internal_p (stmt
)
379 || gimple_call_internal_fn (stmt
) != IFN_MASK_STORE
))
381 if (is_gimple_call (stmt
))
383 /* Ignore calls with no lhs. These must be calls to
384 #pragma omp simd functions, and what vectorization factor
385 it really needs can't be determined until
386 vectorizable_simd_clone_call. */
387 if (!analyze_pattern_stmt
&& gsi_end_p (pattern_def_si
))
389 pattern_def_seq
= NULL
;
394 if (dump_enabled_p ())
396 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
397 "not vectorized: irregular stmt.");
398 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, stmt
,
404 if (VECTOR_MODE_P (TYPE_MODE (gimple_expr_type (stmt
))))
406 if (dump_enabled_p ())
408 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
409 "not vectorized: vector stmt in loop:");
410 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, stmt
, 0);
417 if (STMT_VINFO_VECTYPE (stmt_info
))
419 /* The only case when a vectype had been already set is for stmts
420 that contain a dataref, or for "pattern-stmts" (stmts
421 generated by the vectorizer to represent/replace a certain
423 gcc_assert (STMT_VINFO_DATA_REF (stmt_info
)
424 || is_pattern_stmt_p (stmt_info
)
425 || !gsi_end_p (pattern_def_si
));
426 vectype
= STMT_VINFO_VECTYPE (stmt_info
);
430 gcc_assert (!STMT_VINFO_DATA_REF (stmt_info
));
431 if (gimple_call_internal_p (stmt
, IFN_MASK_STORE
))
432 scalar_type
= TREE_TYPE (gimple_call_arg (stmt
, 3));
434 scalar_type
= TREE_TYPE (gimple_get_lhs (stmt
));
436 /* Bool ops don't participate in vectorization factor
437 computation. For comparison use compared types to
439 if (VECT_SCALAR_BOOLEAN_TYPE_P (scalar_type
)
440 && is_gimple_assign (stmt
)
441 && gimple_assign_rhs_code (stmt
) != COND_EXPR
)
443 if (STMT_VINFO_RELEVANT_P (stmt_info
)
444 || STMT_VINFO_LIVE_P (stmt_info
))
445 mask_producers
.safe_push (stmt_info
);
448 if (TREE_CODE_CLASS (gimple_assign_rhs_code (stmt
))
450 && !VECT_SCALAR_BOOLEAN_TYPE_P
451 (TREE_TYPE (gimple_assign_rhs1 (stmt
))))
452 scalar_type
= TREE_TYPE (gimple_assign_rhs1 (stmt
));
455 if (!analyze_pattern_stmt
&& gsi_end_p (pattern_def_si
))
457 pattern_def_seq
= NULL
;
464 if (dump_enabled_p ())
466 dump_printf_loc (MSG_NOTE
, vect_location
,
467 "get vectype for scalar type: ");
468 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, scalar_type
);
469 dump_printf (MSG_NOTE
, "\n");
471 vectype
= get_vectype_for_scalar_type (scalar_type
);
474 if (dump_enabled_p ())
476 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
477 "not vectorized: unsupported "
479 dump_generic_expr (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
,
481 dump_printf (MSG_MISSED_OPTIMIZATION
, "\n");
487 STMT_VINFO_VECTYPE (stmt_info
) = vectype
;
489 if (dump_enabled_p ())
491 dump_printf_loc (MSG_NOTE
, vect_location
, "vectype: ");
492 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, vectype
);
493 dump_printf (MSG_NOTE
, "\n");
497 /* Don't try to compute VF out scalar types if we stmt
498 produces boolean vector. Use result vectype instead. */
499 if (VECTOR_BOOLEAN_TYPE_P (vectype
))
500 vf_vectype
= vectype
;
503 /* The vectorization factor is according to the smallest
504 scalar type (or the largest vector size, but we only
505 support one vector size per loop). */
507 scalar_type
= vect_get_smallest_scalar_type (stmt
, &dummy
,
509 if (dump_enabled_p ())
511 dump_printf_loc (MSG_NOTE
, vect_location
,
512 "get vectype for scalar type: ");
513 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, scalar_type
);
514 dump_printf (MSG_NOTE
, "\n");
516 vf_vectype
= get_vectype_for_scalar_type (scalar_type
);
520 if (dump_enabled_p ())
522 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
523 "not vectorized: unsupported data-type ");
524 dump_generic_expr (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
,
526 dump_printf (MSG_MISSED_OPTIMIZATION
, "\n");
531 if (maybe_ne (GET_MODE_SIZE (TYPE_MODE (vectype
)),
532 GET_MODE_SIZE (TYPE_MODE (vf_vectype
))))
534 if (dump_enabled_p ())
536 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
537 "not vectorized: different sized vector "
538 "types in statement, ");
539 dump_generic_expr (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
,
541 dump_printf (MSG_MISSED_OPTIMIZATION
, " and ");
542 dump_generic_expr (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
,
544 dump_printf (MSG_MISSED_OPTIMIZATION
, "\n");
549 if (dump_enabled_p ())
551 dump_printf_loc (MSG_NOTE
, vect_location
, "vectype: ");
552 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, vf_vectype
);
553 dump_printf (MSG_NOTE
, "\n");
556 if (dump_enabled_p ())
558 dump_printf_loc (MSG_NOTE
, vect_location
, "nunits = ");
559 dump_dec (MSG_NOTE
, TYPE_VECTOR_SUBPARTS (vf_vectype
));
560 dump_printf (MSG_NOTE
, "\n");
563 vect_update_max_nunits (&vectorization_factor
, vf_vectype
);
565 if (!analyze_pattern_stmt
&& gsi_end_p (pattern_def_si
))
567 pattern_def_seq
= NULL
;
573 /* TODO: Analyze cost. Decide if worth while to vectorize. */
574 if (dump_enabled_p ())
576 dump_printf_loc (MSG_NOTE
, vect_location
, "vectorization factor = ");
577 dump_dec (MSG_NOTE
, vectorization_factor
);
578 dump_printf (MSG_NOTE
, "\n");
581 if (known_le (vectorization_factor
, 1U))
583 if (dump_enabled_p ())
584 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
585 "not vectorized: unsupported data-type\n");
588 LOOP_VINFO_VECT_FACTOR (loop_vinfo
) = vectorization_factor
;
590 for (i
= 0; i
< mask_producers
.length (); i
++)
592 tree mask_type
= NULL
;
594 stmt
= STMT_VINFO_STMT (mask_producers
[i
]);
596 if (is_gimple_assign (stmt
)
597 && TREE_CODE_CLASS (gimple_assign_rhs_code (stmt
)) == tcc_comparison
598 && !VECT_SCALAR_BOOLEAN_TYPE_P
599 (TREE_TYPE (gimple_assign_rhs1 (stmt
))))
601 scalar_type
= TREE_TYPE (gimple_assign_rhs1 (stmt
));
602 mask_type
= get_mask_type_for_scalar_type (scalar_type
);
606 if (dump_enabled_p ())
607 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
608 "not vectorized: unsupported mask\n");
617 enum vect_def_type dt
;
619 FOR_EACH_SSA_TREE_OPERAND (rhs
, stmt
, iter
, SSA_OP_USE
)
621 if (!vect_is_simple_use (rhs
, mask_producers
[i
]->vinfo
,
622 &def_stmt
, &dt
, &vectype
))
624 if (dump_enabled_p ())
626 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
627 "not vectorized: can't compute mask type "
629 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, stmt
,
635 /* No vectype probably means external definition.
636 Allow it in case there is another operand which
637 allows to determine mask type. */
643 else if (maybe_ne (TYPE_VECTOR_SUBPARTS (mask_type
),
644 TYPE_VECTOR_SUBPARTS (vectype
)))
646 if (dump_enabled_p ())
648 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
649 "not vectorized: different sized masks "
650 "types in statement, ");
651 dump_generic_expr (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
,
653 dump_printf (MSG_MISSED_OPTIMIZATION
, " and ");
654 dump_generic_expr (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
,
656 dump_printf (MSG_MISSED_OPTIMIZATION
, "\n");
660 else if (VECTOR_BOOLEAN_TYPE_P (mask_type
)
661 != VECTOR_BOOLEAN_TYPE_P (vectype
))
663 if (dump_enabled_p ())
665 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
666 "not vectorized: mixed mask and "
667 "nonmask vector types in statement, ");
668 dump_generic_expr (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
,
670 dump_printf (MSG_MISSED_OPTIMIZATION
, " and ");
671 dump_generic_expr (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
,
673 dump_printf (MSG_MISSED_OPTIMIZATION
, "\n");
679 /* We may compare boolean value loaded as vector of integers.
680 Fix mask_type in such case. */
682 && !VECTOR_BOOLEAN_TYPE_P (mask_type
)
683 && gimple_code (stmt
) == GIMPLE_ASSIGN
684 && TREE_CODE_CLASS (gimple_assign_rhs_code (stmt
)) == tcc_comparison
)
685 mask_type
= build_same_sized_truth_vector_type (mask_type
);
688 /* No mask_type should mean loop invariant predicate.
689 This is probably a subject for optimization in
693 if (dump_enabled_p ())
695 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
696 "not vectorized: can't compute mask type "
698 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, stmt
,
704 STMT_VINFO_VECTYPE (mask_producers
[i
]) = mask_type
;
711 /* Function vect_is_simple_iv_evolution.
713 FORNOW: A simple evolution of an induction variables in the loop is
714 considered a polynomial evolution. */
717 vect_is_simple_iv_evolution (unsigned loop_nb
, tree access_fn
, tree
* init
,
722 tree evolution_part
= evolution_part_in_loop_num (access_fn
, loop_nb
);
725 /* When there is no evolution in this loop, the evolution function
727 if (evolution_part
== NULL_TREE
)
730 /* When the evolution is a polynomial of degree >= 2
731 the evolution function is not "simple". */
732 if (tree_is_chrec (evolution_part
))
735 step_expr
= evolution_part
;
736 init_expr
= unshare_expr (initial_condition_in_loop_num (access_fn
, loop_nb
));
738 if (dump_enabled_p ())
740 dump_printf_loc (MSG_NOTE
, vect_location
, "step: ");
741 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, step_expr
);
742 dump_printf (MSG_NOTE
, ", init: ");
743 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, init_expr
);
744 dump_printf (MSG_NOTE
, "\n");
750 if (TREE_CODE (step_expr
) != INTEGER_CST
751 && (TREE_CODE (step_expr
) != SSA_NAME
752 || ((bb
= gimple_bb (SSA_NAME_DEF_STMT (step_expr
)))
753 && flow_bb_inside_loop_p (get_loop (cfun
, loop_nb
), bb
))
754 || (!INTEGRAL_TYPE_P (TREE_TYPE (step_expr
))
755 && (!SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr
))
756 || !flag_associative_math
)))
757 && (TREE_CODE (step_expr
) != REAL_CST
758 || !flag_associative_math
))
760 if (dump_enabled_p ())
761 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
769 /* Function vect_analyze_scalar_cycles_1.
771 Examine the cross iteration def-use cycles of scalar variables
772 in LOOP. LOOP_VINFO represents the loop that is now being
773 considered for vectorization (can be LOOP, or an outer-loop
777 vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo
, struct loop
*loop
)
779 basic_block bb
= loop
->header
;
781 auto_vec
<gimple
*, 64> worklist
;
785 if (dump_enabled_p ())
786 dump_printf_loc (MSG_NOTE
, vect_location
,
787 "=== vect_analyze_scalar_cycles ===\n");
789 /* First - identify all inductions. Reduction detection assumes that all the
790 inductions have been identified, therefore, this order must not be
792 for (gsi
= gsi_start_phis (bb
); !gsi_end_p (gsi
); gsi_next (&gsi
))
794 gphi
*phi
= gsi
.phi ();
795 tree access_fn
= NULL
;
796 tree def
= PHI_RESULT (phi
);
797 stmt_vec_info stmt_vinfo
= vinfo_for_stmt (phi
);
799 if (dump_enabled_p ())
801 dump_printf_loc (MSG_NOTE
, vect_location
, "Analyze phi: ");
802 dump_gimple_stmt (MSG_NOTE
, TDF_SLIM
, phi
, 0);
805 /* Skip virtual phi's. The data dependences that are associated with
806 virtual defs/uses (i.e., memory accesses) are analyzed elsewhere. */
807 if (virtual_operand_p (def
))
810 STMT_VINFO_DEF_TYPE (stmt_vinfo
) = vect_unknown_def_type
;
812 /* Analyze the evolution function. */
813 access_fn
= analyze_scalar_evolution (loop
, def
);
816 STRIP_NOPS (access_fn
);
817 if (dump_enabled_p ())
819 dump_printf_loc (MSG_NOTE
, vect_location
,
820 "Access function of PHI: ");
821 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, access_fn
);
822 dump_printf (MSG_NOTE
, "\n");
824 STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo
)
825 = initial_condition_in_loop_num (access_fn
, loop
->num
);
826 STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo
)
827 = evolution_part_in_loop_num (access_fn
, loop
->num
);
831 || !vect_is_simple_iv_evolution (loop
->num
, access_fn
, &init
, &step
)
832 || (LOOP_VINFO_LOOP (loop_vinfo
) != loop
833 && TREE_CODE (step
) != INTEGER_CST
))
835 worklist
.safe_push (phi
);
839 gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo
)
841 gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo
) != NULL_TREE
);
843 if (dump_enabled_p ())
844 dump_printf_loc (MSG_NOTE
, vect_location
, "Detected induction.\n");
845 STMT_VINFO_DEF_TYPE (stmt_vinfo
) = vect_induction_def
;
849 /* Second - identify all reductions and nested cycles. */
850 while (worklist
.length () > 0)
852 gimple
*phi
= worklist
.pop ();
853 tree def
= PHI_RESULT (phi
);
854 stmt_vec_info stmt_vinfo
= vinfo_for_stmt (phi
);
857 if (dump_enabled_p ())
859 dump_printf_loc (MSG_NOTE
, vect_location
, "Analyze phi: ");
860 dump_gimple_stmt (MSG_NOTE
, TDF_SLIM
, phi
, 0);
863 gcc_assert (!virtual_operand_p (def
)
864 && STMT_VINFO_DEF_TYPE (stmt_vinfo
) == vect_unknown_def_type
);
866 reduc_stmt
= vect_force_simple_reduction (loop_vinfo
, phi
,
867 &double_reduc
, false);
872 if (dump_enabled_p ())
873 dump_printf_loc (MSG_NOTE
, vect_location
,
874 "Detected double reduction.\n");
876 STMT_VINFO_DEF_TYPE (stmt_vinfo
) = vect_double_reduction_def
;
877 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt
)) =
878 vect_double_reduction_def
;
882 if (loop
!= LOOP_VINFO_LOOP (loop_vinfo
))
884 if (dump_enabled_p ())
885 dump_printf_loc (MSG_NOTE
, vect_location
,
886 "Detected vectorizable nested cycle.\n");
888 STMT_VINFO_DEF_TYPE (stmt_vinfo
) = vect_nested_cycle
;
889 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt
)) =
894 if (dump_enabled_p ())
895 dump_printf_loc (MSG_NOTE
, vect_location
,
896 "Detected reduction.\n");
898 STMT_VINFO_DEF_TYPE (stmt_vinfo
) = vect_reduction_def
;
899 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt
)) =
901 /* Store the reduction cycles for possible vectorization in
902 loop-aware SLP if it was not detected as reduction
904 if (! GROUP_FIRST_ELEMENT (vinfo_for_stmt (reduc_stmt
)))
905 LOOP_VINFO_REDUCTIONS (loop_vinfo
).safe_push (reduc_stmt
);
910 if (dump_enabled_p ())
911 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
912 "Unknown def-use cycle pattern.\n");
917 /* Function vect_analyze_scalar_cycles.
919 Examine the cross iteration def-use cycles of scalar variables, by
920 analyzing the loop-header PHIs of scalar variables. Classify each
921 cycle as one of the following: invariant, induction, reduction, unknown.
922 We do that for the loop represented by LOOP_VINFO, and also to its
923 inner-loop, if exists.
924 Examples for scalar cycles:
939 vect_analyze_scalar_cycles (loop_vec_info loop_vinfo
)
941 struct loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
943 vect_analyze_scalar_cycles_1 (loop_vinfo
, loop
);
945 /* When vectorizing an outer-loop, the inner-loop is executed sequentially.
946 Reductions in such inner-loop therefore have different properties than
947 the reductions in the nest that gets vectorized:
948 1. When vectorized, they are executed in the same order as in the original
949 scalar loop, so we can't change the order of computation when
951 2. FIXME: Inner-loop reductions can be used in the inner-loop, so the
952 current checks are too strict. */
955 vect_analyze_scalar_cycles_1 (loop_vinfo
, loop
->inner
);
958 /* Transfer group and reduction information from STMT to its pattern stmt. */
961 vect_fixup_reduc_chain (gimple
*stmt
)
963 gimple
*firstp
= STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt
));
965 gcc_assert (!GROUP_FIRST_ELEMENT (vinfo_for_stmt (firstp
))
966 && GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt
)));
967 GROUP_SIZE (vinfo_for_stmt (firstp
)) = GROUP_SIZE (vinfo_for_stmt (stmt
));
970 stmtp
= STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt
));
971 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmtp
)) = firstp
;
972 stmt
= GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmt
));
974 GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmtp
))
975 = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt
));
978 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmtp
)) = vect_reduction_def
;
981 /* Fixup scalar cycles that now have their stmts detected as patterns. */
984 vect_fixup_scalar_cycles_with_patterns (loop_vec_info loop_vinfo
)
989 FOR_EACH_VEC_ELT (LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo
), i
, first
)
990 if (STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (first
)))
992 gimple
*next
= GROUP_NEXT_ELEMENT (vinfo_for_stmt (first
));
995 if (! STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (next
)))
997 next
= GROUP_NEXT_ELEMENT (vinfo_for_stmt (next
));
999 /* If not all stmt in the chain are patterns try to handle
1000 the chain without patterns. */
1003 vect_fixup_reduc_chain (first
);
1004 LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo
)[i
]
1005 = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (first
));
1010 /* Function vect_get_loop_niters.
1012 Determine how many iterations the loop is executed and place it
1013 in NUMBER_OF_ITERATIONS. Place the number of latch iterations
1014 in NUMBER_OF_ITERATIONSM1. Place the condition under which the
1015 niter information holds in ASSUMPTIONS.
1017 Return the loop exit condition. */
1021 vect_get_loop_niters (struct loop
*loop
, tree
*assumptions
,
1022 tree
*number_of_iterations
, tree
*number_of_iterationsm1
)
1024 edge exit
= single_exit (loop
);
1025 struct tree_niter_desc niter_desc
;
1026 tree niter_assumptions
, niter
, may_be_zero
;
1027 gcond
*cond
= get_loop_exit_condition (loop
);
1029 *assumptions
= boolean_true_node
;
1030 *number_of_iterationsm1
= chrec_dont_know
;
1031 *number_of_iterations
= chrec_dont_know
;
1032 if (dump_enabled_p ())
1033 dump_printf_loc (MSG_NOTE
, vect_location
,
1034 "=== get_loop_niters ===\n");
1039 niter
= chrec_dont_know
;
1040 may_be_zero
= NULL_TREE
;
1041 niter_assumptions
= boolean_true_node
;
1042 if (!number_of_iterations_exit_assumptions (loop
, exit
, &niter_desc
, NULL
)
1043 || chrec_contains_undetermined (niter_desc
.niter
))
1046 niter_assumptions
= niter_desc
.assumptions
;
1047 may_be_zero
= niter_desc
.may_be_zero
;
1048 niter
= niter_desc
.niter
;
1050 if (may_be_zero
&& integer_zerop (may_be_zero
))
1051 may_be_zero
= NULL_TREE
;
1055 if (COMPARISON_CLASS_P (may_be_zero
))
1057 /* Try to combine may_be_zero with assumptions, this can simplify
1058 computation of niter expression. */
1059 if (niter_assumptions
&& !integer_nonzerop (niter_assumptions
))
1060 niter_assumptions
= fold_build2 (TRUTH_AND_EXPR
, boolean_type_node
,
1062 fold_build1 (TRUTH_NOT_EXPR
,
1066 niter
= fold_build3 (COND_EXPR
, TREE_TYPE (niter
), may_be_zero
,
1067 build_int_cst (TREE_TYPE (niter
), 0),
1068 rewrite_to_non_trapping_overflow (niter
));
1070 may_be_zero
= NULL_TREE
;
1072 else if (integer_nonzerop (may_be_zero
))
1074 *number_of_iterationsm1
= build_int_cst (TREE_TYPE (niter
), 0);
1075 *number_of_iterations
= build_int_cst (TREE_TYPE (niter
), 1);
1082 *assumptions
= niter_assumptions
;
1083 *number_of_iterationsm1
= niter
;
1085 /* We want the number of loop header executions which is the number
1086 of latch executions plus one.
1087 ??? For UINT_MAX latch executions this number overflows to zero
1088 for loops like do { n++; } while (n != 0); */
1089 if (niter
&& !chrec_contains_undetermined (niter
))
1090 niter
= fold_build2 (PLUS_EXPR
, TREE_TYPE (niter
), unshare_expr (niter
),
1091 build_int_cst (TREE_TYPE (niter
), 1));
1092 *number_of_iterations
= niter
;
1097 /* Function bb_in_loop_p
1099 Used as predicate for dfs order traversal of the loop bbs. */
1102 bb_in_loop_p (const_basic_block bb
, const void *data
)
1104 const struct loop
*const loop
= (const struct loop
*)data
;
1105 if (flow_bb_inside_loop_p (loop
, bb
))
1111 /* Create and initialize a new loop_vec_info struct for LOOP_IN, as well as
1112 stmt_vec_info structs for all the stmts in LOOP_IN. */
1114 _loop_vec_info::_loop_vec_info (struct loop
*loop_in
)
1115 : vec_info (vec_info::loop
, init_cost (loop_in
)),
1117 bbs (XCNEWVEC (basic_block
, loop
->num_nodes
)),
1118 num_itersm1 (NULL_TREE
),
1119 num_iters (NULL_TREE
),
1120 num_iters_unchanged (NULL_TREE
),
1121 num_iters_assumptions (NULL_TREE
),
1123 versioning_threshold (0),
1124 vectorization_factor (0),
1125 max_vectorization_factor (0),
1126 mask_skip_niters (NULL_TREE
),
1127 mask_compare_type (NULL_TREE
),
1128 unaligned_dr (NULL
),
1129 peeling_for_alignment (0),
1131 slp_unrolling_factor (1),
1132 single_scalar_iteration_cost (0),
1133 vectorizable (false),
1134 can_fully_mask_p (true),
1135 fully_masked_p (false),
1136 peeling_for_gaps (false),
1137 peeling_for_niter (false),
1138 operands_swapped (false),
1139 no_data_dependencies (false),
1140 has_mask_store (false),
1142 orig_loop_info (NULL
)
1144 /* Create/Update stmt_info for all stmts in the loop. */
1145 basic_block
*body
= get_loop_body (loop
);
1146 for (unsigned int i
= 0; i
< loop
->num_nodes
; i
++)
1148 basic_block bb
= body
[i
];
1149 gimple_stmt_iterator si
;
1151 for (si
= gsi_start_phis (bb
); !gsi_end_p (si
); gsi_next (&si
))
1153 gimple
*phi
= gsi_stmt (si
);
1154 gimple_set_uid (phi
, 0);
1155 set_vinfo_for_stmt (phi
, new_stmt_vec_info (phi
, this));
1158 for (si
= gsi_start_bb (bb
); !gsi_end_p (si
); gsi_next (&si
))
1160 gimple
*stmt
= gsi_stmt (si
);
1161 gimple_set_uid (stmt
, 0);
1162 set_vinfo_for_stmt (stmt
, new_stmt_vec_info (stmt
, this));
1167 /* CHECKME: We want to visit all BBs before their successors (except for
1168 latch blocks, for which this assertion wouldn't hold). In the simple
1169 case of the loop forms we allow, a dfs order of the BBs would the same
1170 as reversed postorder traversal, so we are safe. */
1172 unsigned int nbbs
= dfs_enumerate_from (loop
->header
, 0, bb_in_loop_p
,
1173 bbs
, loop
->num_nodes
, loop
);
1174 gcc_assert (nbbs
== loop
->num_nodes
);
1177 /* Free all levels of MASKS. */
1180 release_vec_loop_masks (vec_loop_masks
*masks
)
1184 FOR_EACH_VEC_ELT (*masks
, i
, rgm
)
1185 rgm
->masks
.release ();
1189 /* Free all memory used by the _loop_vec_info, as well as all the
1190 stmt_vec_info structs of all the stmts in the loop. */
1192 _loop_vec_info::~_loop_vec_info ()
1195 gimple_stmt_iterator si
;
1198 nbbs
= loop
->num_nodes
;
1199 for (j
= 0; j
< nbbs
; j
++)
1201 basic_block bb
= bbs
[j
];
1202 for (si
= gsi_start_phis (bb
); !gsi_end_p (si
); gsi_next (&si
))
1203 free_stmt_vec_info (gsi_stmt (si
));
1205 for (si
= gsi_start_bb (bb
); !gsi_end_p (si
); )
1207 gimple
*stmt
= gsi_stmt (si
);
1209 /* We may have broken canonical form by moving a constant
1210 into RHS1 of a commutative op. Fix such occurrences. */
1211 if (operands_swapped
&& is_gimple_assign (stmt
))
1213 enum tree_code code
= gimple_assign_rhs_code (stmt
);
1215 if ((code
== PLUS_EXPR
1216 || code
== POINTER_PLUS_EXPR
1217 || code
== MULT_EXPR
)
1218 && CONSTANT_CLASS_P (gimple_assign_rhs1 (stmt
)))
1219 swap_ssa_operands (stmt
,
1220 gimple_assign_rhs1_ptr (stmt
),
1221 gimple_assign_rhs2_ptr (stmt
));
1222 else if (code
== COND_EXPR
1223 && CONSTANT_CLASS_P (gimple_assign_rhs2 (stmt
)))
1225 tree cond_expr
= gimple_assign_rhs1 (stmt
);
1226 enum tree_code cond_code
= TREE_CODE (cond_expr
);
1228 if (TREE_CODE_CLASS (cond_code
) == tcc_comparison
)
1230 bool honor_nans
= HONOR_NANS (TREE_OPERAND (cond_expr
,
1232 cond_code
= invert_tree_comparison (cond_code
,
1234 if (cond_code
!= ERROR_MARK
)
1236 TREE_SET_CODE (cond_expr
, cond_code
);
1237 swap_ssa_operands (stmt
,
1238 gimple_assign_rhs2_ptr (stmt
),
1239 gimple_assign_rhs3_ptr (stmt
));
1245 /* Free stmt_vec_info. */
1246 free_stmt_vec_info (stmt
);
1253 release_vec_loop_masks (&masks
);
1258 /* Return true if we can use CMP_TYPE as the comparison type to produce
1259 all masks required to mask LOOP_VINFO. */
1262 can_produce_all_loop_masks_p (loop_vec_info loop_vinfo
, tree cmp_type
)
1266 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo
), i
, rgm
)
1267 if (rgm
->mask_type
!= NULL_TREE
1268 && !direct_internal_fn_supported_p (IFN_WHILE_ULT
,
1269 cmp_type
, rgm
->mask_type
,
1270 OPTIMIZE_FOR_SPEED
))
1275 /* Calculate the maximum number of scalars per iteration for every
1276 rgroup in LOOP_VINFO. */
1279 vect_get_max_nscalars_per_iter (loop_vec_info loop_vinfo
)
1281 unsigned int res
= 1;
1284 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo
), i
, rgm
)
1285 res
= MAX (res
, rgm
->max_nscalars_per_iter
);
1289 /* Each statement in LOOP_VINFO can be masked where necessary. Check
1290 whether we can actually generate the masks required. Return true if so,
1291 storing the type of the scalar IV in LOOP_VINFO_MASK_COMPARE_TYPE. */
1294 vect_verify_full_masking (loop_vec_info loop_vinfo
)
1296 struct loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
1297 unsigned int min_ni_width
;
1299 /* Use a normal loop if there are no statements that need masking.
1300 This only happens in rare degenerate cases: it means that the loop
1301 has no loads, no stores, and no live-out values. */
1302 if (LOOP_VINFO_MASKS (loop_vinfo
).is_empty ())
1305 /* Get the maximum number of iterations that is representable
1306 in the counter type. */
1307 tree ni_type
= TREE_TYPE (LOOP_VINFO_NITERSM1 (loop_vinfo
));
1308 widest_int max_ni
= wi::to_widest (TYPE_MAX_VALUE (ni_type
)) + 1;
1310 /* Get a more refined estimate for the number of iterations. */
1311 widest_int max_back_edges
;
1312 if (max_loop_iterations (loop
, &max_back_edges
))
1313 max_ni
= wi::smin (max_ni
, max_back_edges
+ 1);
1315 /* Account for rgroup masks, in which each bit is replicated N times. */
1316 max_ni
*= vect_get_max_nscalars_per_iter (loop_vinfo
);
1318 /* Work out how many bits we need to represent the limit. */
1319 min_ni_width
= wi::min_precision (max_ni
, UNSIGNED
);
1321 /* Find a scalar mode for which WHILE_ULT is supported. */
1322 opt_scalar_int_mode cmp_mode_iter
;
1323 tree cmp_type
= NULL_TREE
;
1324 FOR_EACH_MODE_IN_CLASS (cmp_mode_iter
, MODE_INT
)
1326 unsigned int cmp_bits
= GET_MODE_BITSIZE (cmp_mode_iter
.require ());
1327 if (cmp_bits
>= min_ni_width
1328 && targetm
.scalar_mode_supported_p (cmp_mode_iter
.require ()))
1330 tree this_type
= build_nonstandard_integer_type (cmp_bits
, true);
1332 && can_produce_all_loop_masks_p (loop_vinfo
, this_type
))
1334 /* Although we could stop as soon as we find a valid mode,
1335 it's often better to continue until we hit Pmode, since the
1336 operands to the WHILE are more likely to be reusable in
1337 address calculations. */
1338 cmp_type
= this_type
;
1339 if (cmp_bits
>= GET_MODE_BITSIZE (Pmode
))
1348 LOOP_VINFO_MASK_COMPARE_TYPE (loop_vinfo
) = cmp_type
;
1352 /* Calculate the cost of one scalar iteration of the loop. */
1354 vect_compute_single_scalar_iteration_cost (loop_vec_info loop_vinfo
)
1356 struct loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
1357 basic_block
*bbs
= LOOP_VINFO_BBS (loop_vinfo
);
1358 int nbbs
= loop
->num_nodes
, factor
, scalar_single_iter_cost
= 0;
1359 int innerloop_iters
, i
;
1361 /* Count statements in scalar loop. Using this as scalar cost for a single
1364 TODO: Add outer loop support.
1366 TODO: Consider assigning different costs to different scalar
1370 innerloop_iters
= 1;
1372 innerloop_iters
= 50; /* FIXME */
1374 for (i
= 0; i
< nbbs
; i
++)
1376 gimple_stmt_iterator si
;
1377 basic_block bb
= bbs
[i
];
1379 if (bb
->loop_father
== loop
->inner
)
1380 factor
= innerloop_iters
;
1384 for (si
= gsi_start_bb (bb
); !gsi_end_p (si
); gsi_next (&si
))
1386 gimple
*stmt
= gsi_stmt (si
);
1387 stmt_vec_info stmt_info
= vinfo_for_stmt (stmt
);
1389 if (!is_gimple_assign (stmt
) && !is_gimple_call (stmt
))
1392 /* Skip stmts that are not vectorized inside the loop. */
1394 && !STMT_VINFO_RELEVANT_P (stmt_info
)
1395 && (!STMT_VINFO_LIVE_P (stmt_info
)
1396 || !VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info
)))
1397 && !STMT_VINFO_IN_PATTERN_P (stmt_info
))
1400 vect_cost_for_stmt kind
;
1401 if (STMT_VINFO_DATA_REF (stmt_info
))
1403 if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info
)))
1406 kind
= scalar_store
;
1411 scalar_single_iter_cost
1412 += record_stmt_cost (&LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo
),
1413 factor
, kind
, stmt_info
, 0, vect_prologue
);
1416 LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo
)
1417 = scalar_single_iter_cost
;
1421 /* Function vect_analyze_loop_form_1.
1423 Verify that certain CFG restrictions hold, including:
1424 - the loop has a pre-header
1425 - the loop has a single entry and exit
1426 - the loop exit condition is simple enough
1427 - the number of iterations can be analyzed, i.e, a countable loop. The
1428 niter could be analyzed under some assumptions. */
1431 vect_analyze_loop_form_1 (struct loop
*loop
, gcond
**loop_cond
,
1432 tree
*assumptions
, tree
*number_of_iterationsm1
,
1433 tree
*number_of_iterations
, gcond
**inner_loop_cond
)
1435 if (dump_enabled_p ())
1436 dump_printf_loc (MSG_NOTE
, vect_location
,
1437 "=== vect_analyze_loop_form ===\n");
1439 /* Different restrictions apply when we are considering an inner-most loop,
1440 vs. an outer (nested) loop.
1441 (FORNOW. May want to relax some of these restrictions in the future). */
1445 /* Inner-most loop. We currently require that the number of BBs is
1446 exactly 2 (the header and latch). Vectorizable inner-most loops
1457 if (loop
->num_nodes
!= 2)
1459 if (dump_enabled_p ())
1460 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1461 "not vectorized: control flow in loop.\n");
1465 if (empty_block_p (loop
->header
))
1467 if (dump_enabled_p ())
1468 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1469 "not vectorized: empty loop.\n");
1475 struct loop
*innerloop
= loop
->inner
;
1478 /* Nested loop. We currently require that the loop is doubly-nested,
1479 contains a single inner loop, and the number of BBs is exactly 5.
1480 Vectorizable outer-loops look like this:
1492 The inner-loop has the properties expected of inner-most loops
1493 as described above. */
1495 if ((loop
->inner
)->inner
|| (loop
->inner
)->next
)
1497 if (dump_enabled_p ())
1498 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1499 "not vectorized: multiple nested loops.\n");
1503 if (loop
->num_nodes
!= 5)
1505 if (dump_enabled_p ())
1506 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1507 "not vectorized: control flow in loop.\n");
1511 entryedge
= loop_preheader_edge (innerloop
);
1512 if (entryedge
->src
!= loop
->header
1513 || !single_exit (innerloop
)
1514 || single_exit (innerloop
)->dest
!= EDGE_PRED (loop
->latch
, 0)->src
)
1516 if (dump_enabled_p ())
1517 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1518 "not vectorized: unsupported outerloop form.\n");
1522 /* Analyze the inner-loop. */
1523 tree inner_niterm1
, inner_niter
, inner_assumptions
;
1524 if (! vect_analyze_loop_form_1 (loop
->inner
, inner_loop_cond
,
1525 &inner_assumptions
, &inner_niterm1
,
1527 /* Don't support analyzing niter under assumptions for inner
1529 || !integer_onep (inner_assumptions
))
1531 if (dump_enabled_p ())
1532 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1533 "not vectorized: Bad inner loop.\n");
1537 if (!expr_invariant_in_loop_p (loop
, inner_niter
))
1539 if (dump_enabled_p ())
1540 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1541 "not vectorized: inner-loop count not"
1546 if (dump_enabled_p ())
1547 dump_printf_loc (MSG_NOTE
, vect_location
,
1548 "Considering outer-loop vectorization.\n");
1551 if (!single_exit (loop
)
1552 || EDGE_COUNT (loop
->header
->preds
) != 2)
1554 if (dump_enabled_p ())
1556 if (!single_exit (loop
))
1557 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1558 "not vectorized: multiple exits.\n");
1559 else if (EDGE_COUNT (loop
->header
->preds
) != 2)
1560 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1561 "not vectorized: too many incoming edges.\n");
1566 /* We assume that the loop exit condition is at the end of the loop. i.e,
1567 that the loop is represented as a do-while (with a proper if-guard
1568 before the loop if needed), where the loop header contains all the
1569 executable statements, and the latch is empty. */
1570 if (!empty_block_p (loop
->latch
)
1571 || !gimple_seq_empty_p (phi_nodes (loop
->latch
)))
1573 if (dump_enabled_p ())
1574 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1575 "not vectorized: latch block not empty.\n");
1579 /* Make sure the exit is not abnormal. */
1580 edge e
= single_exit (loop
);
1581 if (e
->flags
& EDGE_ABNORMAL
)
1583 if (dump_enabled_p ())
1584 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1585 "not vectorized: abnormal loop exit edge.\n");
1589 *loop_cond
= vect_get_loop_niters (loop
, assumptions
, number_of_iterations
,
1590 number_of_iterationsm1
);
1593 if (dump_enabled_p ())
1594 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1595 "not vectorized: complicated exit condition.\n");
1599 if (integer_zerop (*assumptions
)
1600 || !*number_of_iterations
1601 || chrec_contains_undetermined (*number_of_iterations
))
1603 if (dump_enabled_p ())
1604 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1605 "not vectorized: number of iterations cannot be "
1610 if (integer_zerop (*number_of_iterations
))
1612 if (dump_enabled_p ())
1613 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1614 "not vectorized: number of iterations = 0.\n");
1621 /* Analyze LOOP form and return a loop_vec_info if it is of suitable form. */
1624 vect_analyze_loop_form (struct loop
*loop
)
1626 tree assumptions
, number_of_iterations
, number_of_iterationsm1
;
1627 gcond
*loop_cond
, *inner_loop_cond
= NULL
;
1629 if (! vect_analyze_loop_form_1 (loop
, &loop_cond
,
1630 &assumptions
, &number_of_iterationsm1
,
1631 &number_of_iterations
, &inner_loop_cond
))
1634 loop_vec_info loop_vinfo
= new _loop_vec_info (loop
);
1635 LOOP_VINFO_NITERSM1 (loop_vinfo
) = number_of_iterationsm1
;
1636 LOOP_VINFO_NITERS (loop_vinfo
) = number_of_iterations
;
1637 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo
) = number_of_iterations
;
1638 if (!integer_onep (assumptions
))
1640 /* We consider to vectorize this loop by versioning it under
1641 some assumptions. In order to do this, we need to clear
1642 existing information computed by scev and niter analyzer. */
1644 free_numbers_of_iterations_estimates (loop
);
1645 /* Also set flag for this loop so that following scev and niter
1646 analysis are done under the assumptions. */
1647 loop_constraint_set (loop
, LOOP_C_FINITE
);
1648 /* Also record the assumptions for versioning. */
1649 LOOP_VINFO_NITERS_ASSUMPTIONS (loop_vinfo
) = assumptions
;
1652 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo
))
1654 if (dump_enabled_p ())
1656 dump_printf_loc (MSG_NOTE
, vect_location
,
1657 "Symbolic number of iterations is ");
1658 dump_generic_expr (MSG_NOTE
, TDF_DETAILS
, number_of_iterations
);
1659 dump_printf (MSG_NOTE
, "\n");
1663 STMT_VINFO_TYPE (vinfo_for_stmt (loop_cond
)) = loop_exit_ctrl_vec_info_type
;
1664 if (inner_loop_cond
)
1665 STMT_VINFO_TYPE (vinfo_for_stmt (inner_loop_cond
))
1666 = loop_exit_ctrl_vec_info_type
;
1668 gcc_assert (!loop
->aux
);
1669 loop
->aux
= loop_vinfo
;
1675 /* Scan the loop stmts and dependent on whether there are any (non-)SLP
1676 statements update the vectorization factor. */
1679 vect_update_vf_for_slp (loop_vec_info loop_vinfo
)
1681 struct loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
1682 basic_block
*bbs
= LOOP_VINFO_BBS (loop_vinfo
);
1683 int nbbs
= loop
->num_nodes
;
1684 poly_uint64 vectorization_factor
;
1687 if (dump_enabled_p ())
1688 dump_printf_loc (MSG_NOTE
, vect_location
,
1689 "=== vect_update_vf_for_slp ===\n");
1691 vectorization_factor
= LOOP_VINFO_VECT_FACTOR (loop_vinfo
);
1692 gcc_assert (known_ne (vectorization_factor
, 0U));
1694 /* If all the stmts in the loop can be SLPed, we perform only SLP, and
1695 vectorization factor of the loop is the unrolling factor required by
1696 the SLP instances. If that unrolling factor is 1, we say, that we
1697 perform pure SLP on loop - cross iteration parallelism is not
1699 bool only_slp_in_loop
= true;
1700 for (i
= 0; i
< nbbs
; i
++)
1702 basic_block bb
= bbs
[i
];
1703 for (gimple_stmt_iterator si
= gsi_start_bb (bb
); !gsi_end_p (si
);
1706 gimple
*stmt
= gsi_stmt (si
);
1707 stmt_vec_info stmt_info
= vinfo_for_stmt (stmt
);
1708 if (STMT_VINFO_IN_PATTERN_P (stmt_info
)
1709 && STMT_VINFO_RELATED_STMT (stmt_info
))
1711 stmt
= STMT_VINFO_RELATED_STMT (stmt_info
);
1712 stmt_info
= vinfo_for_stmt (stmt
);
1714 if ((STMT_VINFO_RELEVANT_P (stmt_info
)
1715 || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info
)))
1716 && !PURE_SLP_STMT (stmt_info
))
1717 /* STMT needs both SLP and loop-based vectorization. */
1718 only_slp_in_loop
= false;
1722 if (only_slp_in_loop
)
1724 dump_printf_loc (MSG_NOTE
, vect_location
,
1725 "Loop contains only SLP stmts\n");
1726 vectorization_factor
= LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo
);
1730 dump_printf_loc (MSG_NOTE
, vect_location
,
1731 "Loop contains SLP and non-SLP stmts\n");
1732 /* Both the vectorization factor and unroll factor have the form
1733 current_vector_size * X for some rational X, so they must have
1734 a common multiple. */
1735 vectorization_factor
1736 = force_common_multiple (vectorization_factor
,
1737 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo
));
1740 LOOP_VINFO_VECT_FACTOR (loop_vinfo
) = vectorization_factor
;
1741 if (dump_enabled_p ())
1743 dump_printf_loc (MSG_NOTE
, vect_location
,
1744 "Updating vectorization factor to ");
1745 dump_dec (MSG_NOTE
, vectorization_factor
);
1746 dump_printf (MSG_NOTE
, ".\n");
1750 /* Return true if STMT_INFO describes a double reduction phi and if
1751 the other phi in the reduction is also relevant for vectorization.
1752 This rejects cases such as:
1755 x_1 = PHI <x_3(outer2), ...>;
1763 x_3 = PHI <x_2(inner)>;
1765 if nothing in x_2 or elsewhere makes x_1 relevant. */
1768 vect_active_double_reduction_p (stmt_vec_info stmt_info
)
1770 if (STMT_VINFO_DEF_TYPE (stmt_info
) != vect_double_reduction_def
)
1773 gimple
*other_phi
= STMT_VINFO_REDUC_DEF (stmt_info
);
1774 return STMT_VINFO_RELEVANT_P (vinfo_for_stmt (other_phi
));
1777 /* Function vect_analyze_loop_operations.
1779 Scan the loop stmts and make sure they are all vectorizable. */
1782 vect_analyze_loop_operations (loop_vec_info loop_vinfo
)
1784 struct loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
1785 basic_block
*bbs
= LOOP_VINFO_BBS (loop_vinfo
);
1786 int nbbs
= loop
->num_nodes
;
1788 stmt_vec_info stmt_info
;
1789 bool need_to_vectorize
= false;
1792 if (dump_enabled_p ())
1793 dump_printf_loc (MSG_NOTE
, vect_location
,
1794 "=== vect_analyze_loop_operations ===\n");
1796 for (i
= 0; i
< nbbs
; i
++)
1798 basic_block bb
= bbs
[i
];
1800 for (gphi_iterator si
= gsi_start_phis (bb
); !gsi_end_p (si
);
1803 gphi
*phi
= si
.phi ();
1806 stmt_info
= vinfo_for_stmt (phi
);
1807 if (dump_enabled_p ())
1809 dump_printf_loc (MSG_NOTE
, vect_location
, "examining phi: ");
1810 dump_gimple_stmt (MSG_NOTE
, TDF_SLIM
, phi
, 0);
1812 if (virtual_operand_p (gimple_phi_result (phi
)))
1815 /* Inner-loop loop-closed exit phi in outer-loop vectorization
1816 (i.e., a phi in the tail of the outer-loop). */
1817 if (! is_loop_header_bb_p (bb
))
1819 /* FORNOW: we currently don't support the case that these phis
1820 are not used in the outerloop (unless it is double reduction,
1821 i.e., this phi is vect_reduction_def), cause this case
1822 requires to actually do something here. */
1823 if (STMT_VINFO_LIVE_P (stmt_info
)
1824 && !vect_active_double_reduction_p (stmt_info
))
1826 if (dump_enabled_p ())
1827 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1828 "Unsupported loop-closed phi in "
1833 /* If PHI is used in the outer loop, we check that its operand
1834 is defined in the inner loop. */
1835 if (STMT_VINFO_RELEVANT_P (stmt_info
))
1838 gimple
*op_def_stmt
;
1840 if (gimple_phi_num_args (phi
) != 1)
1843 phi_op
= PHI_ARG_DEF (phi
, 0);
1844 if (TREE_CODE (phi_op
) != SSA_NAME
)
1847 op_def_stmt
= SSA_NAME_DEF_STMT (phi_op
);
1848 if (gimple_nop_p (op_def_stmt
)
1849 || !flow_bb_inside_loop_p (loop
, gimple_bb (op_def_stmt
))
1850 || !vinfo_for_stmt (op_def_stmt
))
1853 if (STMT_VINFO_RELEVANT (vinfo_for_stmt (op_def_stmt
))
1854 != vect_used_in_outer
1855 && STMT_VINFO_RELEVANT (vinfo_for_stmt (op_def_stmt
))
1856 != vect_used_in_outer_by_reduction
)
1863 gcc_assert (stmt_info
);
1865 if ((STMT_VINFO_RELEVANT (stmt_info
) == vect_used_in_scope
1866 || STMT_VINFO_LIVE_P (stmt_info
))
1867 && STMT_VINFO_DEF_TYPE (stmt_info
) != vect_induction_def
)
1869 /* A scalar-dependence cycle that we don't support. */
1870 if (dump_enabled_p ())
1871 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1872 "not vectorized: scalar dependence cycle.\n");
1876 if (STMT_VINFO_RELEVANT_P (stmt_info
))
1878 need_to_vectorize
= true;
1879 if (STMT_VINFO_DEF_TYPE (stmt_info
) == vect_induction_def
1880 && ! PURE_SLP_STMT (stmt_info
))
1881 ok
= vectorizable_induction (phi
, NULL
, NULL
, NULL
);
1882 else if ((STMT_VINFO_DEF_TYPE (stmt_info
) == vect_reduction_def
1883 || STMT_VINFO_DEF_TYPE (stmt_info
) == vect_nested_cycle
)
1884 && ! PURE_SLP_STMT (stmt_info
))
1885 ok
= vectorizable_reduction (phi
, NULL
, NULL
, NULL
, NULL
);
1888 /* SLP PHIs are tested by vect_slp_analyze_node_operations. */
1890 && STMT_VINFO_LIVE_P (stmt_info
)
1891 && !PURE_SLP_STMT (stmt_info
))
1892 ok
= vectorizable_live_operation (phi
, NULL
, NULL
, -1, NULL
);
1896 if (dump_enabled_p ())
1898 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1899 "not vectorized: relevant phi not "
1901 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, phi
, 0);
1907 for (gimple_stmt_iterator si
= gsi_start_bb (bb
); !gsi_end_p (si
);
1910 gimple
*stmt
= gsi_stmt (si
);
1911 if (!gimple_clobber_p (stmt
)
1912 && !vect_analyze_stmt (stmt
, &need_to_vectorize
, NULL
, NULL
))
1917 /* All operations in the loop are either irrelevant (deal with loop
1918 control, or dead), or only used outside the loop and can be moved
1919 out of the loop (e.g. invariants, inductions). The loop can be
1920 optimized away by scalar optimizations. We're better off not
1921 touching this loop. */
1922 if (!need_to_vectorize
)
1924 if (dump_enabled_p ())
1925 dump_printf_loc (MSG_NOTE
, vect_location
,
1926 "All the computation can be taken out of the loop.\n");
1927 if (dump_enabled_p ())
1928 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1929 "not vectorized: redundant loop. no profit to "
1937 /* Analyze the cost of the loop described by LOOP_VINFO. Decide if it
1938 is worthwhile to vectorize. Return 1 if definitely yes, 0 if
1939 definitely no, or -1 if it's worth retrying. */
1942 vect_analyze_loop_costing (loop_vec_info loop_vinfo
)
1944 struct loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
1945 unsigned int assumed_vf
= vect_vf_for_cost (loop_vinfo
);
1947 /* Only fully-masked loops can have iteration counts less than the
1948 vectorization factor. */
1949 if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo
))
1951 HOST_WIDE_INT max_niter
;
1953 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo
))
1954 max_niter
= LOOP_VINFO_INT_NITERS (loop_vinfo
);
1956 max_niter
= max_stmt_executions_int (loop
);
1959 && (unsigned HOST_WIDE_INT
) max_niter
< assumed_vf
)
1961 if (dump_enabled_p ())
1962 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1963 "not vectorized: iteration count smaller than "
1964 "vectorization factor.\n");
1969 int min_profitable_iters
, min_profitable_estimate
;
1970 vect_estimate_min_profitable_iters (loop_vinfo
, &min_profitable_iters
,
1971 &min_profitable_estimate
);
1973 if (min_profitable_iters
< 0)
1975 if (dump_enabled_p ())
1976 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1977 "not vectorized: vectorization not profitable.\n");
1978 if (dump_enabled_p ())
1979 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1980 "not vectorized: vector version will never be "
1985 int min_scalar_loop_bound
= (PARAM_VALUE (PARAM_MIN_VECT_LOOP_BOUND
)
1988 /* Use the cost model only if it is more conservative than user specified
1990 unsigned int th
= (unsigned) MAX (min_scalar_loop_bound
,
1991 min_profitable_iters
);
1993 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo
) = th
;
1995 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo
)
1996 && LOOP_VINFO_INT_NITERS (loop_vinfo
) < th
)
1998 if (dump_enabled_p ())
1999 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2000 "not vectorized: vectorization not profitable.\n");
2001 if (dump_enabled_p ())
2002 dump_printf_loc (MSG_NOTE
, vect_location
,
2003 "not vectorized: iteration count smaller than user "
2004 "specified loop bound parameter or minimum profitable "
2005 "iterations (whichever is more conservative).\n");
2009 HOST_WIDE_INT estimated_niter
= estimated_stmt_executions_int (loop
);
2010 if (estimated_niter
== -1)
2011 estimated_niter
= likely_max_stmt_executions_int (loop
);
2012 if (estimated_niter
!= -1
2013 && ((unsigned HOST_WIDE_INT
) estimated_niter
2014 < MAX (th
, (unsigned) min_profitable_estimate
)))
2016 if (dump_enabled_p ())
2017 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2018 "not vectorized: estimated iteration count too "
2020 if (dump_enabled_p ())
2021 dump_printf_loc (MSG_NOTE
, vect_location
,
2022 "not vectorized: estimated iteration count smaller "
2023 "than specified loop bound parameter or minimum "
2024 "profitable iterations (whichever is more "
2025 "conservative).\n");
2033 /* Function vect_analyze_loop_2.
2035 Apply a set of analyses on LOOP, and create a loop_vec_info struct
2036 for it. The different analyses will record information in the
2037 loop_vec_info struct. */
2039 vect_analyze_loop_2 (loop_vec_info loop_vinfo
, bool &fatal
)
2043 unsigned int max_vf
= MAX_VECTORIZATION_FACTOR
;
2044 poly_uint64 min_vf
= 2;
2045 unsigned int n_stmts
= 0;
2047 /* The first group of checks is independent of the vector size. */
2050 /* Find all data references in the loop (which correspond to vdefs/vuses)
2051 and analyze their evolution in the loop. */
2053 basic_block
*bbs
= LOOP_VINFO_BBS (loop_vinfo
);
2055 loop_p loop
= LOOP_VINFO_LOOP (loop_vinfo
);
2056 if (!find_loop_nest (loop
, &LOOP_VINFO_LOOP_NEST (loop_vinfo
)))
2058 if (dump_enabled_p ())
2059 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2060 "not vectorized: loop nest containing two "
2061 "or more consecutive inner loops cannot be "
2066 for (unsigned i
= 0; i
< loop
->num_nodes
; i
++)
2067 for (gimple_stmt_iterator gsi
= gsi_start_bb (bbs
[i
]);
2068 !gsi_end_p (gsi
); gsi_next (&gsi
))
2070 gimple
*stmt
= gsi_stmt (gsi
);
2071 if (is_gimple_debug (stmt
))
2074 if (!find_data_references_in_stmt (loop
, stmt
,
2075 &LOOP_VINFO_DATAREFS (loop_vinfo
)))
2077 if (is_gimple_call (stmt
) && loop
->safelen
)
2079 tree fndecl
= gimple_call_fndecl (stmt
), op
;
2080 if (fndecl
!= NULL_TREE
)
2082 cgraph_node
*node
= cgraph_node::get (fndecl
);
2083 if (node
!= NULL
&& node
->simd_clones
!= NULL
)
2085 unsigned int j
, n
= gimple_call_num_args (stmt
);
2086 for (j
= 0; j
< n
; j
++)
2088 op
= gimple_call_arg (stmt
, j
);
2090 || (REFERENCE_CLASS_P (op
)
2091 && get_base_address (op
)))
2094 op
= gimple_call_lhs (stmt
);
2095 /* Ignore #pragma omp declare simd functions
2096 if they don't have data references in the
2097 call stmt itself. */
2101 || (REFERENCE_CLASS_P (op
)
2102 && get_base_address (op
)))))
2107 if (dump_enabled_p ())
2108 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2109 "not vectorized: loop contains function "
2110 "calls or data references that cannot "
2116 /* Analyze the data references and also adjust the minimal
2117 vectorization factor according to the loads and stores. */
2119 ok
= vect_analyze_data_refs (loop_vinfo
, &min_vf
);
2122 if (dump_enabled_p ())
2123 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2124 "bad data references.\n");
2128 /* Classify all cross-iteration scalar data-flow cycles.
2129 Cross-iteration cycles caused by virtual phis are analyzed separately. */
2130 vect_analyze_scalar_cycles (loop_vinfo
);
2132 vect_pattern_recog (loop_vinfo
);
2134 vect_fixup_scalar_cycles_with_patterns (loop_vinfo
);
2136 /* Analyze the access patterns of the data-refs in the loop (consecutive,
2137 complex, etc.). FORNOW: Only handle consecutive access pattern. */
2139 ok
= vect_analyze_data_ref_accesses (loop_vinfo
);
2142 if (dump_enabled_p ())
2143 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2144 "bad data access.\n");
2148 /* Data-flow analysis to detect stmts that do not need to be vectorized. */
2150 ok
= vect_mark_stmts_to_be_vectorized (loop_vinfo
);
2153 if (dump_enabled_p ())
2154 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2155 "unexpected pattern.\n");
2159 /* While the rest of the analysis below depends on it in some way. */
2162 /* Analyze data dependences between the data-refs in the loop
2163 and adjust the maximum vectorization factor according to
2165 FORNOW: fail at the first data dependence that we encounter. */
2167 ok
= vect_analyze_data_ref_dependences (loop_vinfo
, &max_vf
);
2169 || (max_vf
!= MAX_VECTORIZATION_FACTOR
2170 && maybe_lt (max_vf
, min_vf
)))
2172 if (dump_enabled_p ())
2173 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2174 "bad data dependence.\n");
2177 LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo
) = max_vf
;
2179 ok
= vect_determine_vectorization_factor (loop_vinfo
);
2182 if (dump_enabled_p ())
2183 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2184 "can't determine vectorization factor.\n");
2187 if (max_vf
!= MAX_VECTORIZATION_FACTOR
2188 && maybe_lt (max_vf
, LOOP_VINFO_VECT_FACTOR (loop_vinfo
)))
2190 if (dump_enabled_p ())
2191 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2192 "bad data dependence.\n");
2196 /* Compute the scalar iteration cost. */
2197 vect_compute_single_scalar_iteration_cost (loop_vinfo
);
2199 poly_uint64 saved_vectorization_factor
= LOOP_VINFO_VECT_FACTOR (loop_vinfo
);
2202 /* Check the SLP opportunities in the loop, analyze and build SLP trees. */
2203 ok
= vect_analyze_slp (loop_vinfo
, n_stmts
);
2207 /* If there are any SLP instances mark them as pure_slp. */
2208 bool slp
= vect_make_slp_decision (loop_vinfo
);
2211 /* Find stmts that need to be both vectorized and SLPed. */
2212 vect_detect_hybrid_slp (loop_vinfo
);
2214 /* Update the vectorization factor based on the SLP decision. */
2215 vect_update_vf_for_slp (loop_vinfo
);
2218 bool saved_can_fully_mask_p
= LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo
);
2220 /* We don't expect to have to roll back to anything other than an empty
2222 gcc_assert (LOOP_VINFO_MASKS (loop_vinfo
).is_empty ());
2224 /* This is the point where we can re-start analysis with SLP forced off. */
2227 /* Now the vectorization factor is final. */
2228 poly_uint64 vectorization_factor
= LOOP_VINFO_VECT_FACTOR (loop_vinfo
);
2229 gcc_assert (known_ne (vectorization_factor
, 0U));
2231 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo
) && dump_enabled_p ())
2233 dump_printf_loc (MSG_NOTE
, vect_location
,
2234 "vectorization_factor = ");
2235 dump_dec (MSG_NOTE
, vectorization_factor
);
2236 dump_printf (MSG_NOTE
, ", niters = " HOST_WIDE_INT_PRINT_DEC
"\n",
2237 LOOP_VINFO_INT_NITERS (loop_vinfo
));
2240 HOST_WIDE_INT max_niter
2241 = likely_max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo
));
2243 /* Analyze the alignment of the data-refs in the loop.
2244 Fail if a data reference is found that cannot be vectorized. */
2246 ok
= vect_analyze_data_refs_alignment (loop_vinfo
);
2249 if (dump_enabled_p ())
2250 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2251 "bad data alignment.\n");
2255 /* Prune the list of ddrs to be tested at run-time by versioning for alias.
2256 It is important to call pruning after vect_analyze_data_ref_accesses,
2257 since we use grouping information gathered by interleaving analysis. */
2258 ok
= vect_prune_runtime_alias_test_list (loop_vinfo
);
2262 /* Do not invoke vect_enhance_data_refs_alignment for eplilogue
2264 if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo
))
2266 /* This pass will decide on using loop versioning and/or loop peeling in
2267 order to enhance the alignment of data references in the loop. */
2268 ok
= vect_enhance_data_refs_alignment (loop_vinfo
);
2271 if (dump_enabled_p ())
2272 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2273 "bad data alignment.\n");
2280 /* Analyze operations in the SLP instances. Note this may
2281 remove unsupported SLP instances which makes the above
2282 SLP kind detection invalid. */
2283 unsigned old_size
= LOOP_VINFO_SLP_INSTANCES (loop_vinfo
).length ();
2284 vect_slp_analyze_operations (loop_vinfo
);
2285 if (LOOP_VINFO_SLP_INSTANCES (loop_vinfo
).length () != old_size
)
2289 /* Scan all the remaining operations in the loop that are not subject
2290 to SLP and make sure they are vectorizable. */
2291 ok
= vect_analyze_loop_operations (loop_vinfo
);
2294 if (dump_enabled_p ())
2295 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2296 "bad operation or unsupported loop bound.\n");
2300 /* Decide whether to use a fully-masked loop for this vectorization
2302 LOOP_VINFO_FULLY_MASKED_P (loop_vinfo
)
2303 = (LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo
)
2304 && vect_verify_full_masking (loop_vinfo
));
2305 if (dump_enabled_p ())
2307 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo
))
2308 dump_printf_loc (MSG_NOTE
, vect_location
,
2309 "using a fully-masked loop.\n");
2311 dump_printf_loc (MSG_NOTE
, vect_location
,
2312 "not using a fully-masked loop.\n");
2315 /* If epilog loop is required because of data accesses with gaps,
2316 one additional iteration needs to be peeled. Check if there is
2317 enough iterations for vectorization. */
2318 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo
)
2319 && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo
)
2320 && !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo
))
2322 poly_uint64 vf
= LOOP_VINFO_VECT_FACTOR (loop_vinfo
);
2323 tree scalar_niters
= LOOP_VINFO_NITERSM1 (loop_vinfo
);
2325 if (known_lt (wi::to_widest (scalar_niters
), vf
))
2327 if (dump_enabled_p ())
2328 dump_printf_loc (MSG_NOTE
, vect_location
,
2329 "loop has no enough iterations to support"
2330 " peeling for gaps.\n");
2335 /* Check the costings of the loop make vectorizing worthwhile. */
2336 res
= vect_analyze_loop_costing (loop_vinfo
);
2341 if (dump_enabled_p ())
2342 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2343 "Loop costings not worthwhile.\n");
2347 /* Decide whether we need to create an epilogue loop to handle
2348 remaining scalar iterations. */
2349 th
= LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo
);
2351 unsigned HOST_WIDE_INT const_vf
;
2352 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo
))
2353 /* The main loop handles all iterations. */
2354 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo
) = false;
2355 else if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo
)
2356 && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo
) > 0)
2358 if (!multiple_p (LOOP_VINFO_INT_NITERS (loop_vinfo
)
2359 - LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo
),
2360 LOOP_VINFO_VECT_FACTOR (loop_vinfo
)))
2361 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo
) = true;
2363 else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo
)
2364 || !LOOP_VINFO_VECT_FACTOR (loop_vinfo
).is_constant (&const_vf
)
2365 || ((tree_ctz (LOOP_VINFO_NITERS (loop_vinfo
))
2366 < (unsigned) exact_log2 (const_vf
))
2367 /* In case of versioning, check if the maximum number of
2368 iterations is greater than th. If they are identical,
2369 the epilogue is unnecessary. */
2370 && (!LOOP_REQUIRES_VERSIONING (loop_vinfo
)
2371 || ((unsigned HOST_WIDE_INT
) max_niter
2372 > (th
/ const_vf
) * const_vf
))))
2373 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo
) = true;
2375 /* If an epilogue loop is required make sure we can create one. */
2376 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo
)
2377 || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo
))
2379 if (dump_enabled_p ())
2380 dump_printf_loc (MSG_NOTE
, vect_location
, "epilog loop required\n");
2381 if (!vect_can_advance_ivs_p (loop_vinfo
)
2382 || !slpeel_can_duplicate_loop_p (LOOP_VINFO_LOOP (loop_vinfo
),
2383 single_exit (LOOP_VINFO_LOOP
2386 if (dump_enabled_p ())
2387 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2388 "not vectorized: can't create required "
2394 /* During peeling, we need to check if number of loop iterations is
2395 enough for both peeled prolog loop and vector loop. This check
2396 can be merged along with threshold check of loop versioning, so
2397 increase threshold for this case if necessary. */
2398 if (LOOP_REQUIRES_VERSIONING (loop_vinfo
))
2400 poly_uint64 niters_th
= 0;
2402 if (!vect_use_loop_mask_for_alignment_p (loop_vinfo
))
2404 /* Niters for peeled prolog loop. */
2405 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo
) < 0)
2407 struct data_reference
*dr
= LOOP_VINFO_UNALIGNED_DR (loop_vinfo
);
2409 = STMT_VINFO_VECTYPE (vinfo_for_stmt (DR_STMT (dr
)));
2410 niters_th
+= TYPE_VECTOR_SUBPARTS (vectype
) - 1;
2413 niters_th
+= LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo
);
2416 /* Niters for at least one iteration of vectorized loop. */
2417 if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo
))
2418 niters_th
+= LOOP_VINFO_VECT_FACTOR (loop_vinfo
);
2419 /* One additional iteration because of peeling for gap. */
2420 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo
))
2422 LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo
) = niters_th
;
2425 gcc_assert (known_eq (vectorization_factor
,
2426 LOOP_VINFO_VECT_FACTOR (loop_vinfo
)));
2428 /* Ok to vectorize! */
2432 /* Try again with SLP forced off but if we didn't do any SLP there is
2433 no point in re-trying. */
2437 /* If there are reduction chains re-trying will fail anyway. */
2438 if (! LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo
).is_empty ())
2441 /* Likewise if the grouped loads or stores in the SLP cannot be handled
2442 via interleaving or lane instructions. */
2443 slp_instance instance
;
2446 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo
), i
, instance
)
2448 stmt_vec_info vinfo
;
2449 vinfo
= vinfo_for_stmt
2450 (SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (instance
))[0]);
2451 if (! STMT_VINFO_GROUPED_ACCESS (vinfo
))
2453 vinfo
= vinfo_for_stmt (STMT_VINFO_GROUP_FIRST_ELEMENT (vinfo
));
2454 unsigned int size
= STMT_VINFO_GROUP_SIZE (vinfo
);
2455 tree vectype
= STMT_VINFO_VECTYPE (vinfo
);
2456 if (! vect_store_lanes_supported (vectype
, size
, false)
2457 && ! vect_grouped_store_supported (vectype
, size
))
2459 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance
), j
, node
)
2461 vinfo
= vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (node
)[0]);
2462 vinfo
= vinfo_for_stmt (STMT_VINFO_GROUP_FIRST_ELEMENT (vinfo
));
2463 bool single_element_p
= !STMT_VINFO_GROUP_NEXT_ELEMENT (vinfo
);
2464 size
= STMT_VINFO_GROUP_SIZE (vinfo
);
2465 vectype
= STMT_VINFO_VECTYPE (vinfo
);
2466 if (! vect_load_lanes_supported (vectype
, size
, false)
2467 && ! vect_grouped_load_supported (vectype
, single_element_p
,
2473 if (dump_enabled_p ())
2474 dump_printf_loc (MSG_NOTE
, vect_location
,
2475 "re-trying with SLP disabled\n");
2477 /* Roll back state appropriately. No SLP this time. */
2479 /* Restore vectorization factor as it were without SLP. */
2480 LOOP_VINFO_VECT_FACTOR (loop_vinfo
) = saved_vectorization_factor
;
2481 /* Free the SLP instances. */
2482 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo
), j
, instance
)
2483 vect_free_slp_instance (instance
);
2484 LOOP_VINFO_SLP_INSTANCES (loop_vinfo
).release ();
2485 /* Reset SLP type to loop_vect on all stmts. */
2486 for (i
= 0; i
< LOOP_VINFO_LOOP (loop_vinfo
)->num_nodes
; ++i
)
2488 basic_block bb
= LOOP_VINFO_BBS (loop_vinfo
)[i
];
2489 for (gimple_stmt_iterator si
= gsi_start_phis (bb
);
2490 !gsi_end_p (si
); gsi_next (&si
))
2492 stmt_vec_info stmt_info
= vinfo_for_stmt (gsi_stmt (si
));
2493 STMT_SLP_TYPE (stmt_info
) = loop_vect
;
2495 for (gimple_stmt_iterator si
= gsi_start_bb (bb
);
2496 !gsi_end_p (si
); gsi_next (&si
))
2498 stmt_vec_info stmt_info
= vinfo_for_stmt (gsi_stmt (si
));
2499 STMT_SLP_TYPE (stmt_info
) = loop_vect
;
2500 if (STMT_VINFO_IN_PATTERN_P (stmt_info
))
2502 stmt_info
= vinfo_for_stmt (STMT_VINFO_RELATED_STMT (stmt_info
));
2503 STMT_SLP_TYPE (stmt_info
) = loop_vect
;
2504 for (gimple_stmt_iterator pi
2505 = gsi_start (STMT_VINFO_PATTERN_DEF_SEQ (stmt_info
));
2506 !gsi_end_p (pi
); gsi_next (&pi
))
2508 gimple
*pstmt
= gsi_stmt (pi
);
2509 STMT_SLP_TYPE (vinfo_for_stmt (pstmt
)) = loop_vect
;
2514 /* Free optimized alias test DDRS. */
2515 LOOP_VINFO_LOWER_BOUNDS (loop_vinfo
).truncate (0);
2516 LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo
).release ();
2517 LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo
).release ();
2518 /* Reset target cost data. */
2519 destroy_cost_data (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo
));
2520 LOOP_VINFO_TARGET_COST_DATA (loop_vinfo
)
2521 = init_cost (LOOP_VINFO_LOOP (loop_vinfo
));
2522 /* Reset accumulated rgroup information. */
2523 release_vec_loop_masks (&LOOP_VINFO_MASKS (loop_vinfo
));
2524 /* Reset assorted flags. */
2525 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo
) = false;
2526 LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo
) = false;
2527 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo
) = 0;
2528 LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo
) = 0;
2529 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo
) = saved_can_fully_mask_p
;
2534 /* Function vect_analyze_loop.
2536 Apply a set of analyses on LOOP, and create a loop_vec_info struct
2537 for it. The different analyses will record information in the
2538 loop_vec_info struct. If ORIG_LOOP_VINFO is not NULL epilogue must
2541 vect_analyze_loop (struct loop
*loop
, loop_vec_info orig_loop_vinfo
)
2543 loop_vec_info loop_vinfo
;
2544 auto_vector_sizes vector_sizes
;
2546 /* Autodetect first vector size we try. */
2547 current_vector_size
= 0;
2548 targetm
.vectorize
.autovectorize_vector_sizes (&vector_sizes
);
2549 unsigned int next_size
= 0;
2551 if (dump_enabled_p ())
2552 dump_printf_loc (MSG_NOTE
, vect_location
,
2553 "===== analyze_loop_nest =====\n");
2555 if (loop_outer (loop
)
2556 && loop_vec_info_for_loop (loop_outer (loop
))
2557 && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop
))))
2559 if (dump_enabled_p ())
2560 dump_printf_loc (MSG_NOTE
, vect_location
,
2561 "outer-loop already vectorized.\n");
2565 poly_uint64 autodetected_vector_size
= 0;
2568 /* Check the CFG characteristics of the loop (nesting, entry/exit). */
2569 loop_vinfo
= vect_analyze_loop_form (loop
);
2572 if (dump_enabled_p ())
2573 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2574 "bad loop form.\n");
2580 if (orig_loop_vinfo
)
2581 LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo
) = orig_loop_vinfo
;
2583 if (vect_analyze_loop_2 (loop_vinfo
, fatal
))
2585 LOOP_VINFO_VECTORIZABLE_P (loop_vinfo
) = 1;
2593 autodetected_vector_size
= current_vector_size
;
2595 if (next_size
< vector_sizes
.length ()
2596 && known_eq (vector_sizes
[next_size
], autodetected_vector_size
))
2600 || next_size
== vector_sizes
.length ()
2601 || known_eq (current_vector_size
, 0U))
2604 /* Try the next biggest vector size. */
2605 current_vector_size
= vector_sizes
[next_size
++];
2606 if (dump_enabled_p ())
2608 dump_printf_loc (MSG_NOTE
, vect_location
,
2609 "***** Re-trying analysis with "
2611 dump_dec (MSG_NOTE
, current_vector_size
);
2612 dump_printf (MSG_NOTE
, "\n");
2617 /* Return true if there is an in-order reduction function for CODE, storing
2618 it in *REDUC_FN if so. */
2621 fold_left_reduction_fn (tree_code code
, internal_fn
*reduc_fn
)
2626 *reduc_fn
= IFN_FOLD_LEFT_PLUS
;
2634 /* Function reduction_fn_for_scalar_code
2637 CODE - tree_code of a reduction operations.
2640 REDUC_FN - the corresponding internal function to be used to reduce the
2641 vector of partial results into a single scalar result, or IFN_LAST
2642 if the operation is a supported reduction operation, but does not have
2643 such an internal function.
2645 Return FALSE if CODE currently cannot be vectorized as reduction. */
2648 reduction_fn_for_scalar_code (enum tree_code code
, internal_fn
*reduc_fn
)
2653 *reduc_fn
= IFN_REDUC_MAX
;
2657 *reduc_fn
= IFN_REDUC_MIN
;
2661 *reduc_fn
= IFN_REDUC_PLUS
;
2665 *reduc_fn
= IFN_REDUC_AND
;
2669 *reduc_fn
= IFN_REDUC_IOR
;
2673 *reduc_fn
= IFN_REDUC_XOR
;
2678 *reduc_fn
= IFN_LAST
;
2686 /* If there is a neutral value X such that SLP reduction NODE would not
2687 be affected by the introduction of additional X elements, return that X,
2688 otherwise return null. CODE is the code of the reduction. REDUC_CHAIN
2689 is true if the SLP statements perform a single reduction, false if each
2690 statement performs an independent reduction. */
2693 neutral_op_for_slp_reduction (slp_tree slp_node
, tree_code code
,
2696 vec
<gimple
*> stmts
= SLP_TREE_SCALAR_STMTS (slp_node
);
2697 gimple
*stmt
= stmts
[0];
2698 stmt_vec_info stmt_vinfo
= vinfo_for_stmt (stmt
);
2699 tree vector_type
= STMT_VINFO_VECTYPE (stmt_vinfo
);
2700 tree scalar_type
= TREE_TYPE (vector_type
);
2701 struct loop
*loop
= gimple_bb (stmt
)->loop_father
;
2706 case WIDEN_SUM_EXPR
:
2713 return build_zero_cst (scalar_type
);
2716 return build_one_cst (scalar_type
);
2719 return build_all_ones_cst (scalar_type
);
2723 /* For MIN/MAX the initial values are neutral. A reduction chain
2724 has only a single initial value, so that value is neutral for
2727 return PHI_ARG_DEF_FROM_EDGE (stmt
, loop_preheader_edge (loop
));
2735 /* Error reporting helper for vect_is_simple_reduction below. GIMPLE statement
2736 STMT is printed with a message MSG. */
2739 report_vect_op (dump_flags_t msg_type
, gimple
*stmt
, const char *msg
)
2741 dump_printf_loc (msg_type
, vect_location
, "%s", msg
);
2742 dump_gimple_stmt (msg_type
, TDF_SLIM
, stmt
, 0);
2746 /* Detect SLP reduction of the form:
2756 PHI is the reduction phi node (#a1 = phi <a5, a0> above)
2757 FIRST_STMT is the first reduction stmt in the chain
2758 (a2 = operation (a1)).
2760 Return TRUE if a reduction chain was detected. */
2763 vect_is_slp_reduction (loop_vec_info loop_info
, gimple
*phi
,
2766 struct loop
*loop
= (gimple_bb (phi
))->loop_father
;
2767 struct loop
*vect_loop
= LOOP_VINFO_LOOP (loop_info
);
2768 enum tree_code code
;
2769 gimple
*current_stmt
= NULL
, *loop_use_stmt
= NULL
, *first
, *next_stmt
;
2770 stmt_vec_info use_stmt_info
, current_stmt_info
;
2772 imm_use_iterator imm_iter
;
2773 use_operand_p use_p
;
2774 int nloop_uses
, size
= 0, n_out_of_loop_uses
;
2777 if (loop
!= vect_loop
)
2780 lhs
= PHI_RESULT (phi
);
2781 code
= gimple_assign_rhs_code (first_stmt
);
2785 n_out_of_loop_uses
= 0;
2786 FOR_EACH_IMM_USE_FAST (use_p
, imm_iter
, lhs
)
2788 gimple
*use_stmt
= USE_STMT (use_p
);
2789 if (is_gimple_debug (use_stmt
))
2792 /* Check if we got back to the reduction phi. */
2793 if (use_stmt
== phi
)
2795 loop_use_stmt
= use_stmt
;
2800 if (flow_bb_inside_loop_p (loop
, gimple_bb (use_stmt
)))
2802 loop_use_stmt
= use_stmt
;
2806 n_out_of_loop_uses
++;
2808 /* There are can be either a single use in the loop or two uses in
2810 if (nloop_uses
> 1 || (n_out_of_loop_uses
&& nloop_uses
))
2817 /* We reached a statement with no loop uses. */
2818 if (nloop_uses
== 0)
2821 /* This is a loop exit phi, and we haven't reached the reduction phi. */
2822 if (gimple_code (loop_use_stmt
) == GIMPLE_PHI
)
2825 if (!is_gimple_assign (loop_use_stmt
)
2826 || code
!= gimple_assign_rhs_code (loop_use_stmt
)
2827 || !flow_bb_inside_loop_p (loop
, gimple_bb (loop_use_stmt
)))
2830 /* Insert USE_STMT into reduction chain. */
2831 use_stmt_info
= vinfo_for_stmt (loop_use_stmt
);
2834 current_stmt_info
= vinfo_for_stmt (current_stmt
);
2835 GROUP_NEXT_ELEMENT (current_stmt_info
) = loop_use_stmt
;
2836 GROUP_FIRST_ELEMENT (use_stmt_info
)
2837 = GROUP_FIRST_ELEMENT (current_stmt_info
);
2840 GROUP_FIRST_ELEMENT (use_stmt_info
) = loop_use_stmt
;
2842 lhs
= gimple_assign_lhs (loop_use_stmt
);
2843 current_stmt
= loop_use_stmt
;
2847 if (!found
|| loop_use_stmt
!= phi
|| size
< 2)
2850 /* Swap the operands, if needed, to make the reduction operand be the second
2852 lhs
= PHI_RESULT (phi
);
2853 next_stmt
= GROUP_FIRST_ELEMENT (vinfo_for_stmt (current_stmt
));
2856 if (gimple_assign_rhs2 (next_stmt
) == lhs
)
2858 tree op
= gimple_assign_rhs1 (next_stmt
);
2859 gimple
*def_stmt
= NULL
;
2861 if (TREE_CODE (op
) == SSA_NAME
)
2862 def_stmt
= SSA_NAME_DEF_STMT (op
);
2864 /* Check that the other def is either defined in the loop
2865 ("vect_internal_def"), or it's an induction (defined by a
2866 loop-header phi-node). */
2868 && gimple_bb (def_stmt
)
2869 && flow_bb_inside_loop_p (loop
, gimple_bb (def_stmt
))
2870 && (is_gimple_assign (def_stmt
)
2871 || is_gimple_call (def_stmt
)
2872 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt
))
2873 == vect_induction_def
2874 || (gimple_code (def_stmt
) == GIMPLE_PHI
2875 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt
))
2876 == vect_internal_def
2877 && !is_loop_header_bb_p (gimple_bb (def_stmt
)))))
2879 lhs
= gimple_assign_lhs (next_stmt
);
2880 next_stmt
= GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt
));
2888 tree op
= gimple_assign_rhs2 (next_stmt
);
2889 gimple
*def_stmt
= NULL
;
2891 if (TREE_CODE (op
) == SSA_NAME
)
2892 def_stmt
= SSA_NAME_DEF_STMT (op
);
2894 /* Check that the other def is either defined in the loop
2895 ("vect_internal_def"), or it's an induction (defined by a
2896 loop-header phi-node). */
2898 && gimple_bb (def_stmt
)
2899 && flow_bb_inside_loop_p (loop
, gimple_bb (def_stmt
))
2900 && (is_gimple_assign (def_stmt
)
2901 || is_gimple_call (def_stmt
)
2902 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt
))
2903 == vect_induction_def
2904 || (gimple_code (def_stmt
) == GIMPLE_PHI
2905 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt
))
2906 == vect_internal_def
2907 && !is_loop_header_bb_p (gimple_bb (def_stmt
)))))
2909 if (dump_enabled_p ())
2911 dump_printf_loc (MSG_NOTE
, vect_location
, "swapping oprnds: ");
2912 dump_gimple_stmt (MSG_NOTE
, TDF_SLIM
, next_stmt
, 0);
2915 swap_ssa_operands (next_stmt
,
2916 gimple_assign_rhs1_ptr (next_stmt
),
2917 gimple_assign_rhs2_ptr (next_stmt
));
2918 update_stmt (next_stmt
);
2920 if (CONSTANT_CLASS_P (gimple_assign_rhs1 (next_stmt
)))
2921 LOOP_VINFO_OPERANDS_SWAPPED (loop_info
) = true;
2927 lhs
= gimple_assign_lhs (next_stmt
);
2928 next_stmt
= GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt
));
2931 /* Save the chain for further analysis in SLP detection. */
2932 first
= GROUP_FIRST_ELEMENT (vinfo_for_stmt (current_stmt
));
2933 LOOP_VINFO_REDUCTION_CHAINS (loop_info
).safe_push (first
);
2934 GROUP_SIZE (vinfo_for_stmt (first
)) = size
;
2939 /* Return true if we need an in-order reduction for operation CODE
2940 on type TYPE. NEED_WRAPPING_INTEGRAL_OVERFLOW is true if integer
2941 overflow must wrap. */
2944 needs_fold_left_reduction_p (tree type
, tree_code code
,
2945 bool need_wrapping_integral_overflow
)
2947 /* CHECKME: check for !flag_finite_math_only too? */
2948 if (SCALAR_FLOAT_TYPE_P (type
))
2956 return !flag_associative_math
;
2959 if (INTEGRAL_TYPE_P (type
))
2961 if (!operation_no_trapping_overflow (type
, code
))
2963 if (need_wrapping_integral_overflow
2964 && !TYPE_OVERFLOW_WRAPS (type
)
2965 && operation_can_overflow (code
))
2970 if (SAT_FIXED_POINT_TYPE_P (type
))
2976 /* Return true if the reduction PHI in LOOP with latch arg LOOP_ARG and
2977 reduction operation CODE has a handled computation expression. */
2980 check_reduction_path (location_t loc
, loop_p loop
, gphi
*phi
, tree loop_arg
,
2981 enum tree_code code
)
2983 auto_vec
<std::pair
<ssa_op_iter
, use_operand_p
> > path
;
2984 auto_bitmap visited
;
2985 tree lookfor
= PHI_RESULT (phi
);
2987 use_operand_p curr
= op_iter_init_phiuse (&curri
, phi
, SSA_OP_USE
);
2988 while (USE_FROM_PTR (curr
) != loop_arg
)
2989 curr
= op_iter_next_use (&curri
);
2990 curri
.i
= curri
.numops
;
2993 path
.safe_push (std::make_pair (curri
, curr
));
2994 tree use
= USE_FROM_PTR (curr
);
2997 gimple
*def
= SSA_NAME_DEF_STMT (use
);
2998 if (gimple_nop_p (def
)
2999 || ! flow_bb_inside_loop_p (loop
, gimple_bb (def
)))
3004 std::pair
<ssa_op_iter
, use_operand_p
> x
= path
.pop ();
3008 curr
= op_iter_next_use (&curri
);
3009 /* Skip already visited or non-SSA operands (from iterating
3011 while (curr
!= NULL_USE_OPERAND_P
3012 && (TREE_CODE (USE_FROM_PTR (curr
)) != SSA_NAME
3013 || ! bitmap_set_bit (visited
,
3015 (USE_FROM_PTR (curr
)))));
3017 while (curr
== NULL_USE_OPERAND_P
&& ! path
.is_empty ());
3018 if (curr
== NULL_USE_OPERAND_P
)
3023 if (gimple_code (def
) == GIMPLE_PHI
)
3024 curr
= op_iter_init_phiuse (&curri
, as_a
<gphi
*>(def
), SSA_OP_USE
);
3026 curr
= op_iter_init_use (&curri
, def
, SSA_OP_USE
);
3027 while (curr
!= NULL_USE_OPERAND_P
3028 && (TREE_CODE (USE_FROM_PTR (curr
)) != SSA_NAME
3029 || ! bitmap_set_bit (visited
,
3031 (USE_FROM_PTR (curr
)))))
3032 curr
= op_iter_next_use (&curri
);
3033 if (curr
== NULL_USE_OPERAND_P
)
3038 if (dump_file
&& (dump_flags
& TDF_DETAILS
))
3040 dump_printf_loc (MSG_NOTE
, loc
, "reduction path: ");
3042 std::pair
<ssa_op_iter
, use_operand_p
> *x
;
3043 FOR_EACH_VEC_ELT (path
, i
, x
)
3045 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, USE_FROM_PTR (x
->second
));
3046 dump_printf (MSG_NOTE
, " ");
3048 dump_printf (MSG_NOTE
, "\n");
3051 /* Check whether the reduction path detected is valid. */
3052 bool fail
= path
.length () == 0;
3054 for (unsigned i
= 1; i
< path
.length (); ++i
)
3056 gimple
*use_stmt
= USE_STMT (path
[i
].second
);
3057 tree op
= USE_FROM_PTR (path
[i
].second
);
3058 if (! has_single_use (op
)
3059 || ! is_gimple_assign (use_stmt
))
3064 if (gimple_assign_rhs_code (use_stmt
) != code
)
3066 if (code
== PLUS_EXPR
3067 && gimple_assign_rhs_code (use_stmt
) == MINUS_EXPR
)
3069 /* Track whether we negate the reduction value each iteration. */
3070 if (gimple_assign_rhs2 (use_stmt
) == op
)
3080 return ! fail
&& ! neg
;
3084 /* Function vect_is_simple_reduction
3086 (1) Detect a cross-iteration def-use cycle that represents a simple
3087 reduction computation. We look for the following pattern:
3092 a2 = operation (a3, a1)
3099 a2 = operation (a3, a1)
3102 1. operation is commutative and associative and it is safe to
3103 change the order of the computation
3104 2. no uses for a2 in the loop (a2 is used out of the loop)
3105 3. no uses of a1 in the loop besides the reduction operation
3106 4. no uses of a1 outside the loop.
3108 Conditions 1,4 are tested here.
3109 Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.
3111 (2) Detect a cross-iteration def-use cycle in nested loops, i.e.,
3114 (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double
3118 inner loop (def of a3)
3121 (4) Detect condition expressions, ie:
3122 for (int i = 0; i < N; i++)
3129 vect_is_simple_reduction (loop_vec_info loop_info
, gimple
*phi
,
3131 bool need_wrapping_integral_overflow
,
3132 enum vect_reduction_type
*v_reduc_type
)
3134 struct loop
*loop
= (gimple_bb (phi
))->loop_father
;
3135 struct loop
*vect_loop
= LOOP_VINFO_LOOP (loop_info
);
3136 gimple
*def_stmt
, *def1
= NULL
, *def2
= NULL
, *phi_use_stmt
= NULL
;
3137 enum tree_code orig_code
, code
;
3138 tree op1
, op2
, op3
= NULL_TREE
, op4
= NULL_TREE
;
3142 imm_use_iterator imm_iter
;
3143 use_operand_p use_p
;
3146 *double_reduc
= false;
3147 *v_reduc_type
= TREE_CODE_REDUCTION
;
3149 tree phi_name
= PHI_RESULT (phi
);
3150 /* ??? If there are no uses of the PHI result the inner loop reduction
3151 won't be detected as possibly double-reduction by vectorizable_reduction
3152 because that tries to walk the PHI arg from the preheader edge which
3153 can be constant. See PR60382. */
3154 if (has_zero_uses (phi_name
))
3157 FOR_EACH_IMM_USE_FAST (use_p
, imm_iter
, phi_name
)
3159 gimple
*use_stmt
= USE_STMT (use_p
);
3160 if (is_gimple_debug (use_stmt
))
3163 if (!flow_bb_inside_loop_p (loop
, gimple_bb (use_stmt
)))
3165 if (dump_enabled_p ())
3166 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3167 "intermediate value used outside loop.\n");
3175 if (dump_enabled_p ())
3176 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3177 "reduction value used in loop.\n");
3181 phi_use_stmt
= use_stmt
;
3184 edge latch_e
= loop_latch_edge (loop
);
3185 tree loop_arg
= PHI_ARG_DEF_FROM_EDGE (phi
, latch_e
);
3186 if (TREE_CODE (loop_arg
) != SSA_NAME
)
3188 if (dump_enabled_p ())
3190 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3191 "reduction: not ssa_name: ");
3192 dump_generic_expr (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, loop_arg
);
3193 dump_printf (MSG_MISSED_OPTIMIZATION
, "\n");
3198 def_stmt
= SSA_NAME_DEF_STMT (loop_arg
);
3199 if (is_gimple_assign (def_stmt
))
3201 name
= gimple_assign_lhs (def_stmt
);
3204 else if (gimple_code (def_stmt
) == GIMPLE_PHI
)
3206 name
= PHI_RESULT (def_stmt
);
3211 if (dump_enabled_p ())
3213 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3214 "reduction: unhandled reduction operation: ");
3215 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, def_stmt
, 0);
3220 if (! flow_bb_inside_loop_p (loop
, gimple_bb (def_stmt
)))
3224 auto_vec
<gphi
*, 3> lcphis
;
3225 FOR_EACH_IMM_USE_FAST (use_p
, imm_iter
, name
)
3227 gimple
*use_stmt
= USE_STMT (use_p
);
3228 if (is_gimple_debug (use_stmt
))
3230 if (flow_bb_inside_loop_p (loop
, gimple_bb (use_stmt
)))
3233 /* We can have more than one loop-closed PHI. */
3234 lcphis
.safe_push (as_a
<gphi
*> (use_stmt
));
3237 if (dump_enabled_p ())
3238 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3239 "reduction used in loop.\n");
3244 /* If DEF_STMT is a phi node itself, we expect it to have a single argument
3245 defined in the inner loop. */
3248 op1
= PHI_ARG_DEF (def_stmt
, 0);
3250 if (gimple_phi_num_args (def_stmt
) != 1
3251 || TREE_CODE (op1
) != SSA_NAME
)
3253 if (dump_enabled_p ())
3254 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3255 "unsupported phi node definition.\n");
3260 def1
= SSA_NAME_DEF_STMT (op1
);
3261 if (gimple_bb (def1
)
3262 && flow_bb_inside_loop_p (loop
, gimple_bb (def_stmt
))
3264 && flow_bb_inside_loop_p (loop
->inner
, gimple_bb (def1
))
3265 && is_gimple_assign (def1
)
3266 && flow_bb_inside_loop_p (loop
->inner
, gimple_bb (phi_use_stmt
)))
3268 if (dump_enabled_p ())
3269 report_vect_op (MSG_NOTE
, def_stmt
,
3270 "detected double reduction: ");
3272 *double_reduc
= true;
3279 /* If we are vectorizing an inner reduction we are executing that
3280 in the original order only in case we are not dealing with a
3281 double reduction. */
3282 bool check_reduction
= true;
3283 if (flow_loop_nested_p (vect_loop
, loop
))
3287 check_reduction
= false;
3288 FOR_EACH_VEC_ELT (lcphis
, i
, lcphi
)
3289 FOR_EACH_IMM_USE_FAST (use_p
, imm_iter
, gimple_phi_result (lcphi
))
3291 gimple
*use_stmt
= USE_STMT (use_p
);
3292 if (is_gimple_debug (use_stmt
))
3294 if (! flow_bb_inside_loop_p (vect_loop
, gimple_bb (use_stmt
)))
3295 check_reduction
= true;
3299 bool nested_in_vect_loop
= flow_loop_nested_p (vect_loop
, loop
);
3300 code
= orig_code
= gimple_assign_rhs_code (def_stmt
);
3302 /* We can handle "res -= x[i]", which is non-associative by
3303 simply rewriting this into "res += -x[i]". Avoid changing
3304 gimple instruction for the first simple tests and only do this
3305 if we're allowed to change code at all. */
3306 if (code
== MINUS_EXPR
&& gimple_assign_rhs2 (def_stmt
) != phi_name
)
3309 if (code
== COND_EXPR
)
3311 if (! nested_in_vect_loop
)
3312 *v_reduc_type
= COND_REDUCTION
;
3314 op3
= gimple_assign_rhs1 (def_stmt
);
3315 if (COMPARISON_CLASS_P (op3
))
3317 op4
= TREE_OPERAND (op3
, 1);
3318 op3
= TREE_OPERAND (op3
, 0);
3320 if (op3
== phi_name
|| op4
== phi_name
)
3322 if (dump_enabled_p ())
3323 report_vect_op (MSG_MISSED_OPTIMIZATION
, def_stmt
,
3324 "reduction: condition depends on previous"
3329 op1
= gimple_assign_rhs2 (def_stmt
);
3330 op2
= gimple_assign_rhs3 (def_stmt
);
3332 else if (!commutative_tree_code (code
) || !associative_tree_code (code
))
3334 if (dump_enabled_p ())
3335 report_vect_op (MSG_MISSED_OPTIMIZATION
, def_stmt
,
3336 "reduction: not commutative/associative: ");
3339 else if (get_gimple_rhs_class (code
) == GIMPLE_BINARY_RHS
)
3341 op1
= gimple_assign_rhs1 (def_stmt
);
3342 op2
= gimple_assign_rhs2 (def_stmt
);
3346 if (dump_enabled_p ())
3347 report_vect_op (MSG_MISSED_OPTIMIZATION
, def_stmt
,
3348 "reduction: not handled operation: ");
3352 if (TREE_CODE (op1
) != SSA_NAME
&& TREE_CODE (op2
) != SSA_NAME
)
3354 if (dump_enabled_p ())
3355 report_vect_op (MSG_MISSED_OPTIMIZATION
, def_stmt
,
3356 "reduction: both uses not ssa_names: ");
3361 type
= TREE_TYPE (gimple_assign_lhs (def_stmt
));
3362 if ((TREE_CODE (op1
) == SSA_NAME
3363 && !types_compatible_p (type
,TREE_TYPE (op1
)))
3364 || (TREE_CODE (op2
) == SSA_NAME
3365 && !types_compatible_p (type
, TREE_TYPE (op2
)))
3366 || (op3
&& TREE_CODE (op3
) == SSA_NAME
3367 && !types_compatible_p (type
, TREE_TYPE (op3
)))
3368 || (op4
&& TREE_CODE (op4
) == SSA_NAME
3369 && !types_compatible_p (type
, TREE_TYPE (op4
))))
3371 if (dump_enabled_p ())
3373 dump_printf_loc (MSG_NOTE
, vect_location
,
3374 "reduction: multiple types: operation type: ");
3375 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, type
);
3376 dump_printf (MSG_NOTE
, ", operands types: ");
3377 dump_generic_expr (MSG_NOTE
, TDF_SLIM
,
3379 dump_printf (MSG_NOTE
, ",");
3380 dump_generic_expr (MSG_NOTE
, TDF_SLIM
,
3384 dump_printf (MSG_NOTE
, ",");
3385 dump_generic_expr (MSG_NOTE
, TDF_SLIM
,
3391 dump_printf (MSG_NOTE
, ",");
3392 dump_generic_expr (MSG_NOTE
, TDF_SLIM
,
3395 dump_printf (MSG_NOTE
, "\n");
3401 /* Check whether it's ok to change the order of the computation.
3402 Generally, when vectorizing a reduction we change the order of the
3403 computation. This may change the behavior of the program in some
3404 cases, so we need to check that this is ok. One exception is when
3405 vectorizing an outer-loop: the inner-loop is executed sequentially,
3406 and therefore vectorizing reductions in the inner-loop during
3407 outer-loop vectorization is safe. */
3409 && *v_reduc_type
== TREE_CODE_REDUCTION
3410 && needs_fold_left_reduction_p (type
, code
,
3411 need_wrapping_integral_overflow
))
3412 *v_reduc_type
= FOLD_LEFT_REDUCTION
;
3414 /* Reduction is safe. We're dealing with one of the following:
3415 1) integer arithmetic and no trapv
3416 2) floating point arithmetic, and special flags permit this optimization
3417 3) nested cycle (i.e., outer loop vectorization). */
3418 if (TREE_CODE (op1
) == SSA_NAME
)
3419 def1
= SSA_NAME_DEF_STMT (op1
);
3421 if (TREE_CODE (op2
) == SSA_NAME
)
3422 def2
= SSA_NAME_DEF_STMT (op2
);
3424 if (code
!= COND_EXPR
3425 && ((!def1
|| gimple_nop_p (def1
)) && (!def2
|| gimple_nop_p (def2
))))
3427 if (dump_enabled_p ())
3428 report_vect_op (MSG_NOTE
, def_stmt
, "reduction: no defs for operands: ");
3432 /* Check that one def is the reduction def, defined by PHI,
3433 the other def is either defined in the loop ("vect_internal_def"),
3434 or it's an induction (defined by a loop-header phi-node). */
3436 if (def2
&& def2
== phi
3437 && (code
== COND_EXPR
3438 || !def1
|| gimple_nop_p (def1
)
3439 || !flow_bb_inside_loop_p (loop
, gimple_bb (def1
))
3440 || (def1
&& flow_bb_inside_loop_p (loop
, gimple_bb (def1
))
3441 && (is_gimple_assign (def1
)
3442 || is_gimple_call (def1
)
3443 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1
))
3444 == vect_induction_def
3445 || (gimple_code (def1
) == GIMPLE_PHI
3446 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1
))
3447 == vect_internal_def
3448 && !is_loop_header_bb_p (gimple_bb (def1
)))))))
3450 if (dump_enabled_p ())
3451 report_vect_op (MSG_NOTE
, def_stmt
, "detected reduction: ");
3455 if (def1
&& def1
== phi
3456 && (code
== COND_EXPR
3457 || !def2
|| gimple_nop_p (def2
)
3458 || !flow_bb_inside_loop_p (loop
, gimple_bb (def2
))
3459 || (def2
&& flow_bb_inside_loop_p (loop
, gimple_bb (def2
))
3460 && (is_gimple_assign (def2
)
3461 || is_gimple_call (def2
)
3462 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2
))
3463 == vect_induction_def
3464 || (gimple_code (def2
) == GIMPLE_PHI
3465 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2
))
3466 == vect_internal_def
3467 && !is_loop_header_bb_p (gimple_bb (def2
)))))))
3469 if (! nested_in_vect_loop
&& orig_code
!= MINUS_EXPR
)
3471 /* Check if we can swap operands (just for simplicity - so that
3472 the rest of the code can assume that the reduction variable
3473 is always the last (second) argument). */
3474 if (code
== COND_EXPR
)
3476 /* Swap cond_expr by inverting the condition. */
3477 tree cond_expr
= gimple_assign_rhs1 (def_stmt
);
3478 enum tree_code invert_code
= ERROR_MARK
;
3479 enum tree_code cond_code
= TREE_CODE (cond_expr
);
3481 if (TREE_CODE_CLASS (cond_code
) == tcc_comparison
)
3483 bool honor_nans
= HONOR_NANS (TREE_OPERAND (cond_expr
, 0));
3484 invert_code
= invert_tree_comparison (cond_code
, honor_nans
);
3486 if (invert_code
!= ERROR_MARK
)
3488 TREE_SET_CODE (cond_expr
, invert_code
);
3489 swap_ssa_operands (def_stmt
,
3490 gimple_assign_rhs2_ptr (def_stmt
),
3491 gimple_assign_rhs3_ptr (def_stmt
));
3495 if (dump_enabled_p ())
3496 report_vect_op (MSG_NOTE
, def_stmt
,
3497 "detected reduction: cannot swap operands "
3503 swap_ssa_operands (def_stmt
, gimple_assign_rhs1_ptr (def_stmt
),
3504 gimple_assign_rhs2_ptr (def_stmt
));
3506 if (dump_enabled_p ())
3507 report_vect_op (MSG_NOTE
, def_stmt
,
3508 "detected reduction: need to swap operands: ");
3510 if (CONSTANT_CLASS_P (gimple_assign_rhs1 (def_stmt
)))
3511 LOOP_VINFO_OPERANDS_SWAPPED (loop_info
) = true;
3515 if (dump_enabled_p ())
3516 report_vect_op (MSG_NOTE
, def_stmt
, "detected reduction: ");
3522 /* Try to find SLP reduction chain. */
3523 if (! nested_in_vect_loop
3524 && code
!= COND_EXPR
3525 && orig_code
!= MINUS_EXPR
3526 && vect_is_slp_reduction (loop_info
, phi
, def_stmt
))
3528 if (dump_enabled_p ())
3529 report_vect_op (MSG_NOTE
, def_stmt
,
3530 "reduction: detected reduction chain: ");
3535 /* Dissolve group eventually half-built by vect_is_slp_reduction. */
3536 gimple
*first
= GROUP_FIRST_ELEMENT (vinfo_for_stmt (def_stmt
));
3539 gimple
*next
= GROUP_NEXT_ELEMENT (vinfo_for_stmt (first
));
3540 GROUP_FIRST_ELEMENT (vinfo_for_stmt (first
)) = NULL
;
3541 GROUP_NEXT_ELEMENT (vinfo_for_stmt (first
)) = NULL
;
3545 /* Look for the expression computing loop_arg from loop PHI result. */
3546 if (check_reduction_path (vect_location
, loop
, as_a
<gphi
*> (phi
), loop_arg
,
3550 if (dump_enabled_p ())
3552 report_vect_op (MSG_MISSED_OPTIMIZATION
, def_stmt
,
3553 "reduction: unknown pattern: ");
3559 /* Wrapper around vect_is_simple_reduction, which will modify code
3560 in-place if it enables detection of more reductions. Arguments
3564 vect_force_simple_reduction (loop_vec_info loop_info
, gimple
*phi
,
3566 bool need_wrapping_integral_overflow
)
3568 enum vect_reduction_type v_reduc_type
;
3569 gimple
*def
= vect_is_simple_reduction (loop_info
, phi
, double_reduc
,
3570 need_wrapping_integral_overflow
,
3574 stmt_vec_info reduc_def_info
= vinfo_for_stmt (phi
);
3575 STMT_VINFO_REDUC_TYPE (reduc_def_info
) = v_reduc_type
;
3576 STMT_VINFO_REDUC_DEF (reduc_def_info
) = def
;
3577 reduc_def_info
= vinfo_for_stmt (def
);
3578 STMT_VINFO_REDUC_TYPE (reduc_def_info
) = v_reduc_type
;
3579 STMT_VINFO_REDUC_DEF (reduc_def_info
) = phi
;
3584 /* Calculate cost of peeling the loop PEEL_ITERS_PROLOGUE times. */
3586 vect_get_known_peeling_cost (loop_vec_info loop_vinfo
, int peel_iters_prologue
,
3587 int *peel_iters_epilogue
,
3588 stmt_vector_for_cost
*scalar_cost_vec
,
3589 stmt_vector_for_cost
*prologue_cost_vec
,
3590 stmt_vector_for_cost
*epilogue_cost_vec
)
3593 int assumed_vf
= vect_vf_for_cost (loop_vinfo
);
3595 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo
))
3597 *peel_iters_epilogue
= assumed_vf
/ 2;
3598 if (dump_enabled_p ())
3599 dump_printf_loc (MSG_NOTE
, vect_location
,
3600 "cost model: epilogue peel iters set to vf/2 "
3601 "because loop iterations are unknown .\n");
3603 /* If peeled iterations are known but number of scalar loop
3604 iterations are unknown, count a taken branch per peeled loop. */
3605 retval
= record_stmt_cost (prologue_cost_vec
, 1, cond_branch_taken
,
3606 NULL
, 0, vect_prologue
);
3607 retval
= record_stmt_cost (prologue_cost_vec
, 1, cond_branch_taken
,
3608 NULL
, 0, vect_epilogue
);
3612 int niters
= LOOP_VINFO_INT_NITERS (loop_vinfo
);
3613 peel_iters_prologue
= niters
< peel_iters_prologue
?
3614 niters
: peel_iters_prologue
;
3615 *peel_iters_epilogue
= (niters
- peel_iters_prologue
) % assumed_vf
;
3616 /* If we need to peel for gaps, but no peeling is required, we have to
3617 peel VF iterations. */
3618 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo
) && !*peel_iters_epilogue
)
3619 *peel_iters_epilogue
= assumed_vf
;
3622 stmt_info_for_cost
*si
;
3624 if (peel_iters_prologue
)
3625 FOR_EACH_VEC_ELT (*scalar_cost_vec
, j
, si
)
3627 stmt_vec_info stmt_info
3628 = si
->stmt
? vinfo_for_stmt (si
->stmt
) : NULL
;
3629 retval
+= record_stmt_cost (prologue_cost_vec
,
3630 si
->count
* peel_iters_prologue
,
3631 si
->kind
, stmt_info
, si
->misalign
,
3634 if (*peel_iters_epilogue
)
3635 FOR_EACH_VEC_ELT (*scalar_cost_vec
, j
, si
)
3637 stmt_vec_info stmt_info
3638 = si
->stmt
? vinfo_for_stmt (si
->stmt
) : NULL
;
3639 retval
+= record_stmt_cost (epilogue_cost_vec
,
3640 si
->count
* *peel_iters_epilogue
,
3641 si
->kind
, stmt_info
, si
->misalign
,
3648 /* Function vect_estimate_min_profitable_iters
3650 Return the number of iterations required for the vector version of the
3651 loop to be profitable relative to the cost of the scalar version of the
3654 *RET_MIN_PROFITABLE_NITERS is a cost model profitability threshold
3655 of iterations for vectorization. -1 value means loop vectorization
3656 is not profitable. This returned value may be used for dynamic
3657 profitability check.
3659 *RET_MIN_PROFITABLE_ESTIMATE is a profitability threshold to be used
3660 for static check against estimated number of iterations. */
3663 vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo
,
3664 int *ret_min_profitable_niters
,
3665 int *ret_min_profitable_estimate
)
3667 int min_profitable_iters
;
3668 int min_profitable_estimate
;
3669 int peel_iters_prologue
;
3670 int peel_iters_epilogue
;
3671 unsigned vec_inside_cost
= 0;
3672 int vec_outside_cost
= 0;
3673 unsigned vec_prologue_cost
= 0;
3674 unsigned vec_epilogue_cost
= 0;
3675 int scalar_single_iter_cost
= 0;
3676 int scalar_outside_cost
= 0;
3677 int assumed_vf
= vect_vf_for_cost (loop_vinfo
);
3678 int npeel
= LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo
);
3679 void *target_cost_data
= LOOP_VINFO_TARGET_COST_DATA (loop_vinfo
);
3681 /* Cost model disabled. */
3682 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo
)))
3684 dump_printf_loc (MSG_NOTE
, vect_location
, "cost model disabled.\n");
3685 *ret_min_profitable_niters
= 0;
3686 *ret_min_profitable_estimate
= 0;
3690 /* Requires loop versioning tests to handle misalignment. */
3691 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo
))
3693 /* FIXME: Make cost depend on complexity of individual check. */
3694 unsigned len
= LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo
).length ();
3695 (void) add_stmt_cost (target_cost_data
, len
, vector_stmt
, NULL
, 0,
3697 dump_printf (MSG_NOTE
,
3698 "cost model: Adding cost of checks for loop "
3699 "versioning to treat misalignment.\n");
3702 /* Requires loop versioning with alias checks. */
3703 if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo
))
3705 /* FIXME: Make cost depend on complexity of individual check. */
3706 unsigned len
= LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo
).length ();
3707 (void) add_stmt_cost (target_cost_data
, len
, vector_stmt
, NULL
, 0,
3709 len
= LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo
).length ();
3711 /* Count LEN - 1 ANDs and LEN comparisons. */
3712 (void) add_stmt_cost (target_cost_data
, len
* 2 - 1, scalar_stmt
,
3713 NULL
, 0, vect_prologue
);
3714 len
= LOOP_VINFO_LOWER_BOUNDS (loop_vinfo
).length ();
3717 /* Count LEN - 1 ANDs and LEN comparisons. */
3718 unsigned int nstmts
= len
* 2 - 1;
3719 /* +1 for each bias that needs adding. */
3720 for (unsigned int i
= 0; i
< len
; ++i
)
3721 if (!LOOP_VINFO_LOWER_BOUNDS (loop_vinfo
)[i
].unsigned_p
)
3723 (void) add_stmt_cost (target_cost_data
, nstmts
, scalar_stmt
,
3724 NULL
, 0, vect_prologue
);
3726 dump_printf (MSG_NOTE
,
3727 "cost model: Adding cost of checks for loop "
3728 "versioning aliasing.\n");
3731 /* Requires loop versioning with niter checks. */
3732 if (LOOP_REQUIRES_VERSIONING_FOR_NITERS (loop_vinfo
))
3734 /* FIXME: Make cost depend on complexity of individual check. */
3735 (void) add_stmt_cost (target_cost_data
, 1, vector_stmt
, NULL
, 0,
3737 dump_printf (MSG_NOTE
,
3738 "cost model: Adding cost of checks for loop "
3739 "versioning niters.\n");
3742 if (LOOP_REQUIRES_VERSIONING (loop_vinfo
))
3743 (void) add_stmt_cost (target_cost_data
, 1, cond_branch_taken
, NULL
, 0,
3746 /* Count statements in scalar loop. Using this as scalar cost for a single
3749 TODO: Add outer loop support.
3751 TODO: Consider assigning different costs to different scalar
3754 scalar_single_iter_cost
3755 = LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo
);
3757 /* Add additional cost for the peeled instructions in prologue and epilogue
3758 loop. (For fully-masked loops there will be no peeling.)
3760 FORNOW: If we don't know the value of peel_iters for prologue or epilogue
3761 at compile-time - we assume it's vf/2 (the worst would be vf-1).
3763 TODO: Build an expression that represents peel_iters for prologue and
3764 epilogue to be used in a run-time test. */
3766 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo
))
3768 peel_iters_prologue
= 0;
3769 peel_iters_epilogue
= 0;
3771 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo
))
3773 /* We need to peel exactly one iteration. */
3774 peel_iters_epilogue
+= 1;
3775 stmt_info_for_cost
*si
;
3777 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo
),
3780 struct _stmt_vec_info
*stmt_info
3781 = si
->stmt
? vinfo_for_stmt (si
->stmt
) : NULL
;
3782 (void) add_stmt_cost (target_cost_data
, si
->count
,
3783 si
->kind
, stmt_info
, si
->misalign
,
3790 peel_iters_prologue
= assumed_vf
/ 2;
3791 dump_printf (MSG_NOTE
, "cost model: "
3792 "prologue peel iters set to vf/2.\n");
3794 /* If peeling for alignment is unknown, loop bound of main loop becomes
3796 peel_iters_epilogue
= assumed_vf
/ 2;
3797 dump_printf (MSG_NOTE
, "cost model: "
3798 "epilogue peel iters set to vf/2 because "
3799 "peeling for alignment is unknown.\n");
3801 /* If peeled iterations are unknown, count a taken branch and a not taken
3802 branch per peeled loop. Even if scalar loop iterations are known,
3803 vector iterations are not known since peeled prologue iterations are
3804 not known. Hence guards remain the same. */
3805 (void) add_stmt_cost (target_cost_data
, 1, cond_branch_taken
,
3806 NULL
, 0, vect_prologue
);
3807 (void) add_stmt_cost (target_cost_data
, 1, cond_branch_not_taken
,
3808 NULL
, 0, vect_prologue
);
3809 (void) add_stmt_cost (target_cost_data
, 1, cond_branch_taken
,
3810 NULL
, 0, vect_epilogue
);
3811 (void) add_stmt_cost (target_cost_data
, 1, cond_branch_not_taken
,
3812 NULL
, 0, vect_epilogue
);
3813 stmt_info_for_cost
*si
;
3815 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo
), j
, si
)
3817 struct _stmt_vec_info
*stmt_info
3818 = si
->stmt
? vinfo_for_stmt (si
->stmt
) : NULL
;
3819 (void) add_stmt_cost (target_cost_data
,
3820 si
->count
* peel_iters_prologue
,
3821 si
->kind
, stmt_info
, si
->misalign
,
3823 (void) add_stmt_cost (target_cost_data
,
3824 si
->count
* peel_iters_epilogue
,
3825 si
->kind
, stmt_info
, si
->misalign
,
3831 stmt_vector_for_cost prologue_cost_vec
, epilogue_cost_vec
;
3832 stmt_info_for_cost
*si
;
3834 void *data
= LOOP_VINFO_TARGET_COST_DATA (loop_vinfo
);
3836 prologue_cost_vec
.create (2);
3837 epilogue_cost_vec
.create (2);
3838 peel_iters_prologue
= npeel
;
3840 (void) vect_get_known_peeling_cost (loop_vinfo
, peel_iters_prologue
,
3841 &peel_iters_epilogue
,
3842 &LOOP_VINFO_SCALAR_ITERATION_COST
3845 &epilogue_cost_vec
);
3847 FOR_EACH_VEC_ELT (prologue_cost_vec
, j
, si
)
3849 struct _stmt_vec_info
*stmt_info
3850 = si
->stmt
? vinfo_for_stmt (si
->stmt
) : NULL
;
3851 (void) add_stmt_cost (data
, si
->count
, si
->kind
, stmt_info
,
3852 si
->misalign
, vect_prologue
);
3855 FOR_EACH_VEC_ELT (epilogue_cost_vec
, j
, si
)
3857 struct _stmt_vec_info
*stmt_info
3858 = si
->stmt
? vinfo_for_stmt (si
->stmt
) : NULL
;
3859 (void) add_stmt_cost (data
, si
->count
, si
->kind
, stmt_info
,
3860 si
->misalign
, vect_epilogue
);
3863 prologue_cost_vec
.release ();
3864 epilogue_cost_vec
.release ();
3867 /* FORNOW: The scalar outside cost is incremented in one of the
3870 1. The vectorizer checks for alignment and aliasing and generates
3871 a condition that allows dynamic vectorization. A cost model
3872 check is ANDED with the versioning condition. Hence scalar code
3873 path now has the added cost of the versioning check.
3875 if (cost > th & versioning_check)
3878 Hence run-time scalar is incremented by not-taken branch cost.
3880 2. The vectorizer then checks if a prologue is required. If the
3881 cost model check was not done before during versioning, it has to
3882 be done before the prologue check.
3885 prologue = scalar_iters
3890 if (prologue == num_iters)
3893 Hence the run-time scalar cost is incremented by a taken branch,
3894 plus a not-taken branch, plus a taken branch cost.
3896 3. The vectorizer then checks if an epilogue is required. If the
3897 cost model check was not done before during prologue check, it
3898 has to be done with the epilogue check.
3904 if (prologue == num_iters)
3907 if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
3910 Hence the run-time scalar cost should be incremented by 2 taken
3913 TODO: The back end may reorder the BBS's differently and reverse
3914 conditions/branch directions. Change the estimates below to
3915 something more reasonable. */
3917 /* If the number of iterations is known and we do not do versioning, we can
3918 decide whether to vectorize at compile time. Hence the scalar version
3919 do not carry cost model guard costs. */
3920 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo
)
3921 || LOOP_REQUIRES_VERSIONING (loop_vinfo
))
3923 /* Cost model check occurs at versioning. */
3924 if (LOOP_REQUIRES_VERSIONING (loop_vinfo
))
3925 scalar_outside_cost
+= vect_get_stmt_cost (cond_branch_not_taken
);
3928 /* Cost model check occurs at prologue generation. */
3929 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo
) < 0)
3930 scalar_outside_cost
+= 2 * vect_get_stmt_cost (cond_branch_taken
)
3931 + vect_get_stmt_cost (cond_branch_not_taken
);
3932 /* Cost model check occurs at epilogue generation. */
3934 scalar_outside_cost
+= 2 * vect_get_stmt_cost (cond_branch_taken
);
3938 /* Complete the target-specific cost calculations. */
3939 finish_cost (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo
), &vec_prologue_cost
,
3940 &vec_inside_cost
, &vec_epilogue_cost
);
3942 vec_outside_cost
= (int)(vec_prologue_cost
+ vec_epilogue_cost
);
3944 if (dump_enabled_p ())
3946 dump_printf_loc (MSG_NOTE
, vect_location
, "Cost model analysis: \n");
3947 dump_printf (MSG_NOTE
, " Vector inside of loop cost: %d\n",
3949 dump_printf (MSG_NOTE
, " Vector prologue cost: %d\n",
3951 dump_printf (MSG_NOTE
, " Vector epilogue cost: %d\n",
3953 dump_printf (MSG_NOTE
, " Scalar iteration cost: %d\n",
3954 scalar_single_iter_cost
);
3955 dump_printf (MSG_NOTE
, " Scalar outside cost: %d\n",
3956 scalar_outside_cost
);
3957 dump_printf (MSG_NOTE
, " Vector outside cost: %d\n",
3959 dump_printf (MSG_NOTE
, " prologue iterations: %d\n",
3960 peel_iters_prologue
);
3961 dump_printf (MSG_NOTE
, " epilogue iterations: %d\n",
3962 peel_iters_epilogue
);
3965 /* Calculate number of iterations required to make the vector version
3966 profitable, relative to the loop bodies only. The following condition
3968 SIC * niters + SOC > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC
3970 SIC = scalar iteration cost, VIC = vector iteration cost,
3971 VOC = vector outside cost, VF = vectorization factor,
3972 PL_ITERS = prologue iterations, EP_ITERS= epilogue iterations
3973 SOC = scalar outside cost for run time cost model check. */
3975 if ((scalar_single_iter_cost
* assumed_vf
) > (int) vec_inside_cost
)
3977 min_profitable_iters
= ((vec_outside_cost
- scalar_outside_cost
)
3979 - vec_inside_cost
* peel_iters_prologue
3980 - vec_inside_cost
* peel_iters_epilogue
);
3981 if (min_profitable_iters
<= 0)
3982 min_profitable_iters
= 0;
3985 min_profitable_iters
/= ((scalar_single_iter_cost
* assumed_vf
)
3988 if ((scalar_single_iter_cost
* assumed_vf
* min_profitable_iters
)
3989 <= (((int) vec_inside_cost
* min_profitable_iters
)
3990 + (((int) vec_outside_cost
- scalar_outside_cost
)
3992 min_profitable_iters
++;
3995 /* vector version will never be profitable. */
3998 if (LOOP_VINFO_LOOP (loop_vinfo
)->force_vectorize
)
3999 warning_at (vect_location
, OPT_Wopenmp_simd
, "vectorization "
4000 "did not happen for a simd loop");
4002 if (dump_enabled_p ())
4003 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
4004 "cost model: the vector iteration cost = %d "
4005 "divided by the scalar iteration cost = %d "
4006 "is greater or equal to the vectorization factor = %d"
4008 vec_inside_cost
, scalar_single_iter_cost
, assumed_vf
);
4009 *ret_min_profitable_niters
= -1;
4010 *ret_min_profitable_estimate
= -1;
4014 dump_printf (MSG_NOTE
,
4015 " Calculated minimum iters for profitability: %d\n",
4016 min_profitable_iters
);
4018 if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo
)
4019 && min_profitable_iters
< (assumed_vf
+ peel_iters_prologue
))
4020 /* We want the vectorized loop to execute at least once. */
4021 min_profitable_iters
= assumed_vf
+ peel_iters_prologue
;
4023 if (dump_enabled_p ())
4024 dump_printf_loc (MSG_NOTE
, vect_location
,
4025 " Runtime profitability threshold = %d\n",
4026 min_profitable_iters
);
4028 *ret_min_profitable_niters
= min_profitable_iters
;
4030 /* Calculate number of iterations required to make the vector version
4031 profitable, relative to the loop bodies only.
4033 Non-vectorized variant is SIC * niters and it must win over vector
4034 variant on the expected loop trip count. The following condition must hold true:
4035 SIC * niters > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC + SOC */
4037 if (vec_outside_cost
<= 0)
4038 min_profitable_estimate
= 0;
4041 min_profitable_estimate
= ((vec_outside_cost
+ scalar_outside_cost
)
4043 - vec_inside_cost
* peel_iters_prologue
4044 - vec_inside_cost
* peel_iters_epilogue
)
4045 / ((scalar_single_iter_cost
* assumed_vf
)
4048 min_profitable_estimate
= MAX (min_profitable_estimate
, min_profitable_iters
);
4049 if (dump_enabled_p ())
4050 dump_printf_loc (MSG_NOTE
, vect_location
,
4051 " Static estimate profitability threshold = %d\n",
4052 min_profitable_estimate
);
4054 *ret_min_profitable_estimate
= min_profitable_estimate
;
4057 /* Writes into SEL a mask for a vec_perm, equivalent to a vec_shr by OFFSET
4058 vector elements (not bits) for a vector with NELT elements. */
4060 calc_vec_perm_mask_for_shift (unsigned int offset
, unsigned int nelt
,
4061 vec_perm_builder
*sel
)
4063 /* The encoding is a single stepped pattern. Any wrap-around is handled
4064 by vec_perm_indices. */
4065 sel
->new_vector (nelt
, 1, 3);
4066 for (unsigned int i
= 0; i
< 3; i
++)
4067 sel
->quick_push (i
+ offset
);
4070 /* Checks whether the target supports whole-vector shifts for vectors of mode
4071 MODE. This is the case if _either_ the platform handles vec_shr_optab, _or_
4072 it supports vec_perm_const with masks for all necessary shift amounts. */
4074 have_whole_vector_shift (machine_mode mode
)
4076 if (optab_handler (vec_shr_optab
, mode
) != CODE_FOR_nothing
)
4079 /* Variable-length vectors should be handled via the optab. */
4081 if (!GET_MODE_NUNITS (mode
).is_constant (&nelt
))
4084 vec_perm_builder sel
;
4085 vec_perm_indices indices
;
4086 for (unsigned int i
= nelt
/ 2; i
>= 1; i
/= 2)
4088 calc_vec_perm_mask_for_shift (i
, nelt
, &sel
);
4089 indices
.new_vector (sel
, 2, nelt
);
4090 if (!can_vec_perm_const_p (mode
, indices
, false))
4096 /* TODO: Close dependency between vect_model_*_cost and vectorizable_*
4097 functions. Design better to avoid maintenance issues. */
4099 /* Function vect_model_reduction_cost.
4101 Models cost for a reduction operation, including the vector ops
4102 generated within the strip-mine loop, the initial definition before
4103 the loop, and the epilogue code that must be generated. */
4106 vect_model_reduction_cost (stmt_vec_info stmt_info
, internal_fn reduc_fn
,
4109 int prologue_cost
= 0, epilogue_cost
= 0, inside_cost
;
4110 enum tree_code code
;
4115 loop_vec_info loop_vinfo
= STMT_VINFO_LOOP_VINFO (stmt_info
);
4116 struct loop
*loop
= NULL
;
4117 void *target_cost_data
;
4121 loop
= LOOP_VINFO_LOOP (loop_vinfo
);
4122 target_cost_data
= LOOP_VINFO_TARGET_COST_DATA (loop_vinfo
);
4125 target_cost_data
= BB_VINFO_TARGET_COST_DATA (STMT_VINFO_BB_VINFO (stmt_info
));
4127 /* Condition reductions generate two reductions in the loop. */
4128 vect_reduction_type reduction_type
4129 = STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info
);
4130 if (reduction_type
== COND_REDUCTION
)
4133 vectype
= STMT_VINFO_VECTYPE (stmt_info
);
4134 mode
= TYPE_MODE (vectype
);
4135 orig_stmt
= STMT_VINFO_RELATED_STMT (stmt_info
);
4138 orig_stmt
= STMT_VINFO_STMT (stmt_info
);
4140 code
= gimple_assign_rhs_code (orig_stmt
);
4142 if (reduction_type
== EXTRACT_LAST_REDUCTION
4143 || reduction_type
== FOLD_LEFT_REDUCTION
)
4145 /* No extra instructions needed in the prologue. */
4148 if (reduction_type
== EXTRACT_LAST_REDUCTION
|| reduc_fn
!= IFN_LAST
)
4149 /* Count one reduction-like operation per vector. */
4150 inside_cost
= add_stmt_cost (target_cost_data
, ncopies
, vec_to_scalar
,
4151 stmt_info
, 0, vect_body
);
4154 /* Use NELEMENTS extracts and NELEMENTS scalar ops. */
4155 unsigned int nelements
= ncopies
* vect_nunits_for_cost (vectype
);
4156 inside_cost
= add_stmt_cost (target_cost_data
, nelements
,
4157 vec_to_scalar
, stmt_info
, 0,
4159 inside_cost
+= add_stmt_cost (target_cost_data
, nelements
,
4160 scalar_stmt
, stmt_info
, 0,
4166 /* Add in cost for initial definition.
4167 For cond reduction we have four vectors: initial index, step,
4168 initial result of the data reduction, initial value of the index
4170 int prologue_stmts
= reduction_type
== COND_REDUCTION
? 4 : 1;
4171 prologue_cost
+= add_stmt_cost (target_cost_data
, prologue_stmts
,
4172 scalar_to_vec
, stmt_info
, 0,
4175 /* Cost of reduction op inside loop. */
4176 inside_cost
= add_stmt_cost (target_cost_data
, ncopies
, vector_stmt
,
4177 stmt_info
, 0, vect_body
);
4180 /* Determine cost of epilogue code.
4182 We have a reduction operator that will reduce the vector in one statement.
4183 Also requires scalar extract. */
4185 if (!loop
|| !nested_in_vect_loop_p (loop
, orig_stmt
))
4187 if (reduc_fn
!= IFN_LAST
)
4189 if (reduction_type
== COND_REDUCTION
)
4191 /* An EQ stmt and an COND_EXPR stmt. */
4192 epilogue_cost
+= add_stmt_cost (target_cost_data
, 2,
4193 vector_stmt
, stmt_info
, 0,
4195 /* Reduction of the max index and a reduction of the found
4197 epilogue_cost
+= add_stmt_cost (target_cost_data
, 2,
4198 vec_to_scalar
, stmt_info
, 0,
4200 /* A broadcast of the max value. */
4201 epilogue_cost
+= add_stmt_cost (target_cost_data
, 1,
4202 scalar_to_vec
, stmt_info
, 0,
4207 epilogue_cost
+= add_stmt_cost (target_cost_data
, 1, vector_stmt
,
4208 stmt_info
, 0, vect_epilogue
);
4209 epilogue_cost
+= add_stmt_cost (target_cost_data
, 1,
4210 vec_to_scalar
, stmt_info
, 0,
4214 else if (reduction_type
== COND_REDUCTION
)
4216 unsigned estimated_nunits
= vect_nunits_for_cost (vectype
);
4217 /* Extraction of scalar elements. */
4218 epilogue_cost
+= add_stmt_cost (target_cost_data
,
4219 2 * estimated_nunits
,
4220 vec_to_scalar
, stmt_info
, 0,
4222 /* Scalar max reductions via COND_EXPR / MAX_EXPR. */
4223 epilogue_cost
+= add_stmt_cost (target_cost_data
,
4224 2 * estimated_nunits
- 3,
4225 scalar_stmt
, stmt_info
, 0,
4228 else if (reduction_type
== EXTRACT_LAST_REDUCTION
4229 || reduction_type
== FOLD_LEFT_REDUCTION
)
4230 /* No extra instructions need in the epilogue. */
4234 int vec_size_in_bits
= tree_to_uhwi (TYPE_SIZE (vectype
));
4236 TYPE_SIZE (TREE_TYPE (gimple_assign_lhs (orig_stmt
)));
4237 int element_bitsize
= tree_to_uhwi (bitsize
);
4238 int nelements
= vec_size_in_bits
/ element_bitsize
;
4240 if (code
== COND_EXPR
)
4243 optab
= optab_for_tree_code (code
, vectype
, optab_default
);
4245 /* We have a whole vector shift available. */
4246 if (optab
!= unknown_optab
4247 && VECTOR_MODE_P (mode
)
4248 && optab_handler (optab
, mode
) != CODE_FOR_nothing
4249 && have_whole_vector_shift (mode
))
4251 /* Final reduction via vector shifts and the reduction operator.
4252 Also requires scalar extract. */
4253 epilogue_cost
+= add_stmt_cost (target_cost_data
,
4254 exact_log2 (nelements
) * 2,
4255 vector_stmt
, stmt_info
, 0,
4257 epilogue_cost
+= add_stmt_cost (target_cost_data
, 1,
4258 vec_to_scalar
, stmt_info
, 0,
4262 /* Use extracts and reduction op for final reduction. For N
4263 elements, we have N extracts and N-1 reduction ops. */
4264 epilogue_cost
+= add_stmt_cost (target_cost_data
,
4265 nelements
+ nelements
- 1,
4266 vector_stmt
, stmt_info
, 0,
4271 if (dump_enabled_p ())
4272 dump_printf (MSG_NOTE
,
4273 "vect_model_reduction_cost: inside_cost = %d, "
4274 "prologue_cost = %d, epilogue_cost = %d .\n", inside_cost
,
4275 prologue_cost
, epilogue_cost
);
4279 /* Function vect_model_induction_cost.
4281 Models cost for induction operations. */
4284 vect_model_induction_cost (stmt_vec_info stmt_info
, int ncopies
)
4286 loop_vec_info loop_vinfo
= STMT_VINFO_LOOP_VINFO (stmt_info
);
4287 void *target_cost_data
= LOOP_VINFO_TARGET_COST_DATA (loop_vinfo
);
4288 unsigned inside_cost
, prologue_cost
;
4290 if (PURE_SLP_STMT (stmt_info
))
4293 /* loop cost for vec_loop. */
4294 inside_cost
= add_stmt_cost (target_cost_data
, ncopies
, vector_stmt
,
4295 stmt_info
, 0, vect_body
);
4297 /* prologue cost for vec_init and vec_step. */
4298 prologue_cost
= add_stmt_cost (target_cost_data
, 2, scalar_to_vec
,
4299 stmt_info
, 0, vect_prologue
);
4301 if (dump_enabled_p ())
4302 dump_printf_loc (MSG_NOTE
, vect_location
,
4303 "vect_model_induction_cost: inside_cost = %d, "
4304 "prologue_cost = %d .\n", inside_cost
, prologue_cost
);
4309 /* Function get_initial_def_for_reduction
4312 STMT - a stmt that performs a reduction operation in the loop.
4313 INIT_VAL - the initial value of the reduction variable
4316 ADJUSTMENT_DEF - a tree that holds a value to be added to the final result
4317 of the reduction (used for adjusting the epilog - see below).
4318 Return a vector variable, initialized according to the operation that STMT
4319 performs. This vector will be used as the initial value of the
4320 vector of partial results.
4322 Option1 (adjust in epilog): Initialize the vector as follows:
4323 add/bit or/xor: [0,0,...,0,0]
4324 mult/bit and: [1,1,...,1,1]
4325 min/max/cond_expr: [init_val,init_val,..,init_val,init_val]
4326 and when necessary (e.g. add/mult case) let the caller know
4327 that it needs to adjust the result by init_val.
4329 Option2: Initialize the vector as follows:
4330 add/bit or/xor: [init_val,0,0,...,0]
4331 mult/bit and: [init_val,1,1,...,1]
4332 min/max/cond_expr: [init_val,init_val,...,init_val]
4333 and no adjustments are needed.
4335 For example, for the following code:
4341 STMT is 's = s + a[i]', and the reduction variable is 's'.
4342 For a vector of 4 units, we want to return either [0,0,0,init_val],
4343 or [0,0,0,0] and let the caller know that it needs to adjust
4344 the result at the end by 'init_val'.
4346 FORNOW, we are using the 'adjust in epilog' scheme, because this way the
4347 initialization vector is simpler (same element in all entries), if
4348 ADJUSTMENT_DEF is not NULL, and Option2 otherwise.
4350 A cost model should help decide between these two schemes. */
4353 get_initial_def_for_reduction (gimple
*stmt
, tree init_val
,
4354 tree
*adjustment_def
)
4356 stmt_vec_info stmt_vinfo
= vinfo_for_stmt (stmt
);
4357 loop_vec_info loop_vinfo
= STMT_VINFO_LOOP_VINFO (stmt_vinfo
);
4358 struct loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
4359 tree scalar_type
= TREE_TYPE (init_val
);
4360 tree vectype
= get_vectype_for_scalar_type (scalar_type
);
4361 enum tree_code code
= gimple_assign_rhs_code (stmt
);
4364 bool nested_in_vect_loop
= false;
4365 REAL_VALUE_TYPE real_init_val
= dconst0
;
4366 int int_init_val
= 0;
4367 gimple
*def_stmt
= NULL
;
4368 gimple_seq stmts
= NULL
;
4370 gcc_assert (vectype
);
4372 gcc_assert (POINTER_TYPE_P (scalar_type
) || INTEGRAL_TYPE_P (scalar_type
)
4373 || SCALAR_FLOAT_TYPE_P (scalar_type
));
4375 if (nested_in_vect_loop_p (loop
, stmt
))
4376 nested_in_vect_loop
= true;
4378 gcc_assert (loop
== (gimple_bb (stmt
))->loop_father
);
4380 /* In case of double reduction we only create a vector variable to be put
4381 in the reduction phi node. The actual statement creation is done in
4382 vect_create_epilog_for_reduction. */
4383 if (adjustment_def
&& nested_in_vect_loop
4384 && TREE_CODE (init_val
) == SSA_NAME
4385 && (def_stmt
= SSA_NAME_DEF_STMT (init_val
))
4386 && gimple_code (def_stmt
) == GIMPLE_PHI
4387 && flow_bb_inside_loop_p (loop
, gimple_bb (def_stmt
))
4388 && vinfo_for_stmt (def_stmt
)
4389 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt
))
4390 == vect_double_reduction_def
)
4392 *adjustment_def
= NULL
;
4393 return vect_create_destination_var (init_val
, vectype
);
4396 vect_reduction_type reduction_type
4397 = STMT_VINFO_VEC_REDUCTION_TYPE (stmt_vinfo
);
4399 /* In case of a nested reduction do not use an adjustment def as
4400 that case is not supported by the epilogue generation correctly
4401 if ncopies is not one. */
4402 if (adjustment_def
&& nested_in_vect_loop
)
4404 *adjustment_def
= NULL
;
4405 return vect_get_vec_def_for_operand (init_val
, stmt
);
4410 case WIDEN_SUM_EXPR
:
4420 /* ADJUSTMENT_DEF is NULL when called from
4421 vect_create_epilog_for_reduction to vectorize double reduction. */
4423 *adjustment_def
= init_val
;
4425 if (code
== MULT_EXPR
)
4427 real_init_val
= dconst1
;
4431 if (code
== BIT_AND_EXPR
)
4434 if (SCALAR_FLOAT_TYPE_P (scalar_type
))
4435 def_for_init
= build_real (scalar_type
, real_init_val
);
4437 def_for_init
= build_int_cst (scalar_type
, int_init_val
);
4440 /* Option1: the first element is '0' or '1' as well. */
4441 init_def
= gimple_build_vector_from_val (&stmts
, vectype
,
4443 else if (!TYPE_VECTOR_SUBPARTS (vectype
).is_constant ())
4445 /* Option2 (variable length): the first element is INIT_VAL. */
4446 init_def
= build_vector_from_val (vectype
, def_for_init
);
4447 gcall
*call
= gimple_build_call_internal (IFN_VEC_SHL_INSERT
,
4448 2, init_def
, init_val
);
4449 init_def
= make_ssa_name (vectype
);
4450 gimple_call_set_lhs (call
, init_def
);
4451 gimple_seq_add_stmt (&stmts
, call
);
4455 /* Option2: the first element is INIT_VAL. */
4456 tree_vector_builder
elts (vectype
, 1, 2);
4457 elts
.quick_push (init_val
);
4458 elts
.quick_push (def_for_init
);
4459 init_def
= gimple_build_vector (&stmts
, &elts
);
4470 *adjustment_def
= NULL_TREE
;
4471 if (reduction_type
!= COND_REDUCTION
4472 && reduction_type
!= EXTRACT_LAST_REDUCTION
)
4474 init_def
= vect_get_vec_def_for_operand (init_val
, stmt
);
4478 init_val
= gimple_convert (&stmts
, TREE_TYPE (vectype
), init_val
);
4479 init_def
= gimple_build_vector_from_val (&stmts
, vectype
, init_val
);
4488 gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop
), stmts
);
4492 /* Get at the initial defs for the reduction PHIs in SLP_NODE.
4493 NUMBER_OF_VECTORS is the number of vector defs to create.
4494 If NEUTRAL_OP is nonnull, introducing extra elements of that
4495 value will not change the result. */
/* NOTE(review): the embedded original line numbers jump below (e.g.
   4507 -> 4510, 4560 -> 4565, 4640 -> 4646), so some source lines were
   lost in extraction; identifiers such as VOPRNDS, OP, K, INIT and VOP
   are used without visible declarations, and several guard clauses and
   braces are missing.  Consult the complete file before editing.  */
4498 get_initial_defs_for_reduction (slp_tree slp_node
,
4499 vec
<tree
> *vec_oprnds
,
4500 unsigned int number_of_vectors
,
4501 bool reduc_chain
, tree neutral_op
)
4503 vec
<gimple
*> stmts
= SLP_TREE_SCALAR_STMTS (slp_node
);
4504 gimple
*stmt
= stmts
[0];
4505 stmt_vec_info stmt_vinfo
= vinfo_for_stmt (stmt
);
4506 unsigned HOST_WIDE_INT nunits
;
4507 unsigned j
, number_of_places_left_in_vector
;
4510 int group_size
= stmts
.length ();
4511 unsigned int vec_num
, i
;
4512 unsigned number_of_copies
= 1;
4514 voprnds
.create (number_of_vectors
);
4516 auto_vec
<tree
, 16> permute_results
;
/* The vector type comes from the first scalar stmt of the SLP node;
   the assert below requires it to be part of a reduction.  */
4518 vector_type
= STMT_VINFO_VECTYPE (stmt_vinfo
);
4520 gcc_assert (STMT_VINFO_DEF_TYPE (stmt_vinfo
) == vect_reduction_def
);
4522 loop
= (gimple_bb (stmt
))->loop_father
;
4524 edge pe
= loop_preheader_edge (loop
);
4526 gcc_assert (!reduc_chain
|| neutral_op
);
4528 /* NUMBER_OF_COPIES is the number of times we need to use the same values in
4529 created vectors. It is greater than 1 if unrolling is performed.
4531 For example, we have two scalar operands, s1 and s2 (e.g., group of
4532 strided accesses of size two), while NUNITS is four (i.e., four scalars
4533 of this type can be packed in a vector). The output vector will contain
4534 two copies of each scalar operand: {s1, s2, s1, s2}. (NUMBER_OF_COPIES
4537 If GROUP_SIZE > NUNITS, the scalars will be split into several vectors
4538 containing the operands.
4540 For example, NUNITS is four as before, and the group size is 8
4541 (s1, s2, ..., s8). We will create two vectors {s1, s2, s3, s4} and
4542 {s5, s6, s7, s8}. */
/* For variable-length vector types fall back to GROUP_SIZE units, which
   makes NUMBER_OF_COPIES below evaluate to NUMBER_OF_VECTORS.  */
4544 if (!TYPE_VECTOR_SUBPARTS (vector_type
).is_constant (&nunits
))
4545 nunits
= group_size
;
4547 number_of_copies
= nunits
* number_of_vectors
/ group_size
;
4549 number_of_places_left_in_vector
= nunits
;
4550 bool constant_p
= true;
4551 tree_vector_builder
elts (vector_type
, nunits
, 1);
4552 elts
.quick_grow (nunits
);
/* Gather the scalar initial values into ELTS, walking the SLP scalar
   stmts in reverse so the vector is filled from its last element
   towards element 0.  */
4553 for (j
= 0; j
< number_of_copies
; j
++)
4555 for (i
= group_size
- 1; stmts
.iterate (i
, &stmt
); i
--)
4558 /* Get the def before the loop. In reduction chain we have only
4559 one initial value. */
4560 if ((j
!= (number_of_copies
- 1)
4561 || (reduc_chain
&& i
!= 0))
/* NOTE(review): original lines 4562-4564 are missing here; the branch
   selecting between NEUTRAL_OP and the PHI argument is incomplete.  */
4565 op
= PHI_ARG_DEF_FROM_EDGE (stmt
, pe
);
4567 /* Create 'vect_ = {op0,op1,...,opn}'. */
4568 number_of_places_left_in_vector
--;
4569 elts
[number_of_places_left_in_vector
] = op
;
/* Any non-constant element means the vector cannot be built as a
   VECTOR_CST; CONSTANT_P presumably gets cleared here (body of the
   `if' lost in extraction -- verify against the full file).  */
4570 if (!CONSTANT_CLASS_P (op
))
/* A full vector's worth of elements has been collected; emit the
   constructor for it.  */
4573 if (number_of_places_left_in_vector
== 0)
4575 gimple_seq ctor_seq
= NULL
;
4577 if (constant_p
&& !neutral_op
4578 ? multiple_p (TYPE_VECTOR_SUBPARTS (vector_type
), nunits
)
4579 : known_eq (TYPE_VECTOR_SUBPARTS (vector_type
), nunits
))
4580 /* Build the vector directly from ELTS. */
4581 init
= gimple_build_vector (&ctor_seq
, &elts
);
4582 else if (neutral_op
)
4584 /* Build a vector of the neutral value and shift the
4585 other elements into place. */
4586 init
= gimple_build_vector_from_val (&ctor_seq
, vector_type
,
/* Skip trailing elements that already equal the neutral value; each
   remaining element is shifted in via IFN_VEC_SHL_INSERT.  */
4589 while (k
> 0 && elts
[k
- 1] == neutral_op
)
4594 gcall
*call
= gimple_build_call_internal
4595 (IFN_VEC_SHL_INSERT
, 2, init
, elts
[k
]);
4596 init
= make_ssa_name (vector_type
);
4597 gimple_call_set_lhs (call
, init
);
4598 gimple_seq_add_stmt (&ctor_seq
, call
);
4603 /* First time round, duplicate ELTS to fill the
4604 required number of vectors, then cherry pick the
4605 appropriate result for each iteration. */
4606 if (vec_oprnds
->is_empty ())
4607 duplicate_and_interleave (&ctor_seq
, vector_type
, elts
,
4610 init
= permute_results
[number_of_vectors
- j
- 1];
/* Emit any statements needed to build the vector on the loop
   preheader edge.  */
4612 if (ctor_seq
!= NULL
)
4613 gsi_insert_seq_on_edge_immediate (pe
, ctor_seq
);
4614 voprnds
.quick_push (init
);
/* Reset the builder state for the next vector.  */
4616 number_of_places_left_in_vector
= nunits
;
4617 elts
.new_vector (vector_type
, nunits
, 1);
4618 elts
.quick_grow (nunits
);
4624 /* Since the vectors are created in the reverse order, we should invert
4626 vec_num
= voprnds
.length ();
4627 for (j
= vec_num
; j
!= 0; j
--)
4629 vop
= voprnds
[j
- 1];
4630 vec_oprnds
->quick_push (vop
);
4635 /* In case that VF is greater than the unrolling factor needed for the SLP
4636 group of stmts, NUMBER_OF_VECTORS to be created is greater than
4637 NUMBER_OF_SCALARS/NUNITS or NUNITS/NUMBER_OF_SCALARS, and hence we have
4638 to replicate the vectors. */
4639 tree neutral_vec
= NULL
;
/* Pad VEC_OPRNDS up to NUMBER_OF_VECTORS with a splat of NEUTRAL_OP;
   the statements building it are again inserted on the preheader edge.
   NOTE(review): lines 4641-4645 are missing, so the guard distinguishing
   this path from the recycling loop below is not visible.  */
4640 while (number_of_vectors
> vec_oprnds
->length ())
4646 gimple_seq ctor_seq
= NULL
;
4647 neutral_vec
= gimple_build_vector_from_val
4648 (&ctor_seq
, vector_type
, neutral_op
);
4649 if (ctor_seq
!= NULL
)
4650 gsi_insert_seq_on_edge_immediate (pe
, ctor_seq
);
4652 vec_oprnds
->quick_push (neutral_vec
);
/* presumably pads VEC_OPRNDS by recycling the already-created vectors
   when no neutral value is available -- the enclosing guard was lost in
   extraction; verify against the full file.  */
4656 for (i
= 0; vec_oprnds
->iterate (i
, &vop
) && i
< vec_num
; i
++)
4657 vec_oprnds
->quick_push (vop
);
4663 /* Function vect_create_epilog_for_reduction
4665 Create code at the loop-epilog to finalize the result of a reduction
4668 VECT_DEFS is list of vector of partial results, i.e., the lhs's of vector
4669 reduction statements.
4670 STMT is the scalar reduction stmt that is being vectorized.
4671 NCOPIES is > 1 in case the vectorization factor (VF) is bigger than the
4672 number of elements that we can fit in a vectype (nunits). In this case
4673 we have to generate more than one vector stmt - i.e - we need to "unroll"
4674 the vector stmt by a factor VF/nunits. For more details see documentation
4675 in vectorizable_operation.
4676 REDUC_FN is the internal function for the epilog reduction.
4677 REDUCTION_PHIS is a list of the phi-nodes that carry the reduction
4679 REDUC_INDEX is the index of the operand in the right hand side of the
4680 statement that is defined by REDUCTION_PHI.
4681 DOUBLE_REDUC is TRUE if double reduction phi nodes should be handled.
4682 SLP_NODE is an SLP node containing a group of reduction statements. The
4683 first one in this group is STMT.
4684 INDUC_VAL is for INTEGER_INDUC_COND_REDUCTION the value to use for the case
4685 when the COND_EXPR is never true in the loop. For MAX_EXPR, it needs to
4686 be smaller than any value of the IV in the loop, for MIN_EXPR larger than
4687 any value of the IV in the loop.
4688 INDUC_CODE is the code for epilog reduction if INTEGER_INDUC_COND_REDUCTION.
4689 NEUTRAL_OP is the value given by neutral_op_for_slp_reduction; it is
4690 null if this is not an SLP reduction
4693 1. Creates the reduction def-use cycles: sets the arguments for
4695 The loop-entry argument is the vectorized initial-value of the reduction.
4696 The loop-latch argument is taken from VECT_DEFS - the vector of partial
4698 2. "Reduces" each vector of partial results VECT_DEFS into a single result,
4699 by calling the function specified by REDUC_FN if available, or by
4700 other means (whole-vector shifts or a scalar loop).
4701 The function also creates a new phi node at the loop exit to preserve
4702 loop-closed form, as illustrated below.
4704 The flow at the entry to this function:
4707 vec_def = phi <null, null> # REDUCTION_PHI
4708 VECT_DEF = vector_stmt # vectorized form of STMT
4709 s_loop = scalar_stmt # (scalar) STMT
4711 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
4715 The above is transformed by this function into:
4718 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
4719 VECT_DEF = vector_stmt # vectorized form of STMT
4720 s_loop = scalar_stmt # (scalar) STMT
4722 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
4723 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
4724 v_out2 = reduce <v_out1>
4725 s_out3 = extract_field <v_out2, 0>
4726 s_out4 = adjust_result <s_out3>
4732 vect_create_epilog_for_reduction (vec
<tree
> vect_defs
, gimple
*stmt
,
4733 gimple
*reduc_def_stmt
,
4734 int ncopies
, internal_fn reduc_fn
,
4735 vec
<gimple
*> reduction_phis
,
4738 slp_instance slp_node_instance
,
4739 tree induc_val
, enum tree_code induc_code
,
4742 stmt_vec_info stmt_info
= vinfo_for_stmt (stmt
);
4743 stmt_vec_info prev_phi_info
;
4746 loop_vec_info loop_vinfo
= STMT_VINFO_LOOP_VINFO (stmt_info
);
4747 struct loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
), *outer_loop
= NULL
;
4748 basic_block exit_bb
;
4751 gimple
*new_phi
= NULL
, *phi
;
4752 gimple_stmt_iterator exit_gsi
;
4754 tree new_temp
= NULL_TREE
, new_dest
, new_name
, new_scalar_dest
;
4755 gimple
*epilog_stmt
= NULL
;
4756 enum tree_code code
= gimple_assign_rhs_code (stmt
);
4759 tree adjustment_def
= NULL
;
4760 tree vec_initial_def
= NULL
;
4761 tree expr
, def
, initial_def
= NULL
;
4762 tree orig_name
, scalar_result
;
4763 imm_use_iterator imm_iter
, phi_imm_iter
;
4764 use_operand_p use_p
, phi_use_p
;
4765 gimple
*use_stmt
, *orig_stmt
, *reduction_phi
= NULL
;
4766 bool nested_in_vect_loop
= false;
4767 auto_vec
<gimple
*> new_phis
;
4768 auto_vec
<gimple
*> inner_phis
;
4769 enum vect_def_type dt
= vect_unknown_def_type
;
4771 auto_vec
<tree
> scalar_results
;
4772 unsigned int group_size
= 1, k
, ratio
;
4773 auto_vec
<tree
> vec_initial_defs
;
4774 auto_vec
<gimple
*> phis
;
4775 bool slp_reduc
= false;
4776 bool direct_slp_reduc
;
4777 tree new_phi_result
;
4778 gimple
*inner_phi
= NULL
;
4779 tree induction_index
= NULL_TREE
;
4782 group_size
= SLP_TREE_SCALAR_STMTS (slp_node
).length ();
4784 if (nested_in_vect_loop_p (loop
, stmt
))
4788 nested_in_vect_loop
= true;
4789 gcc_assert (!slp_node
);
4792 vectype
= STMT_VINFO_VECTYPE (stmt_info
);
4793 gcc_assert (vectype
);
4794 mode
= TYPE_MODE (vectype
);
4796 /* 1. Create the reduction def-use cycle:
4797 Set the arguments of REDUCTION_PHIS, i.e., transform
4800 vec_def = phi <null, null> # REDUCTION_PHI
4801 VECT_DEF = vector_stmt # vectorized form of STMT
4807 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
4808 VECT_DEF = vector_stmt # vectorized form of STMT
4811 (in case of SLP, do it for all the phis). */
4813 /* Get the loop-entry arguments. */
4814 enum vect_def_type initial_def_dt
= vect_unknown_def_type
;
4817 unsigned vec_num
= SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node
);
4818 vec_initial_defs
.reserve (vec_num
);
4819 get_initial_defs_for_reduction (slp_node_instance
->reduc_phis
,
4820 &vec_initial_defs
, vec_num
,
4821 GROUP_FIRST_ELEMENT (stmt_info
),
4826 /* Get at the scalar def before the loop, that defines the initial value
4827 of the reduction variable. */
4829 initial_def
= PHI_ARG_DEF_FROM_EDGE (reduc_def_stmt
,
4830 loop_preheader_edge (loop
));
4831 /* Optimize: if initial_def is for REDUC_MAX smaller than the base
4832 and we can't use zero for induc_val, use initial_def. Similarly
4833 for REDUC_MIN and initial_def larger than the base. */
4834 if (TREE_CODE (initial_def
) == INTEGER_CST
4835 && (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info
)
4836 == INTEGER_INDUC_COND_REDUCTION
)
4837 && !integer_zerop (induc_val
)
4838 && ((induc_code
== MAX_EXPR
4839 && tree_int_cst_lt (initial_def
, induc_val
))
4840 || (induc_code
== MIN_EXPR
4841 && tree_int_cst_lt (induc_val
, initial_def
))))
4842 induc_val
= initial_def
;
4843 vect_is_simple_use (initial_def
, loop_vinfo
, &def_stmt
, &initial_def_dt
);
4844 vec_initial_def
= get_initial_def_for_reduction (stmt
, initial_def
,
4846 vec_initial_defs
.create (1);
4847 vec_initial_defs
.quick_push (vec_initial_def
);
4850 /* Set phi nodes arguments. */
4851 FOR_EACH_VEC_ELT (reduction_phis
, i
, phi
)
4853 tree vec_init_def
= vec_initial_defs
[i
];
4854 tree def
= vect_defs
[i
];
4855 for (j
= 0; j
< ncopies
; j
++)
4859 phi
= STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi
));
4860 if (nested_in_vect_loop
)
4862 = vect_get_vec_def_for_stmt_copy (initial_def_dt
,
4866 /* Set the loop-entry arg of the reduction-phi. */
4868 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info
)
4869 == INTEGER_INDUC_COND_REDUCTION
)
4871 /* Initialise the reduction phi to zero. This prevents initial
4872 values of non-zero interferring with the reduction op. */
4873 gcc_assert (ncopies
== 1);
4874 gcc_assert (i
== 0);
4876 tree vec_init_def_type
= TREE_TYPE (vec_init_def
);
4878 = build_vector_from_val (vec_init_def_type
, induc_val
);
4880 add_phi_arg (as_a
<gphi
*> (phi
), induc_val_vec
,
4881 loop_preheader_edge (loop
), UNKNOWN_LOCATION
);
4884 add_phi_arg (as_a
<gphi
*> (phi
), vec_init_def
,
4885 loop_preheader_edge (loop
), UNKNOWN_LOCATION
);
4887 /* Set the loop-latch arg for the reduction-phi. */
4889 def
= vect_get_vec_def_for_stmt_copy (vect_unknown_def_type
, def
);
4891 add_phi_arg (as_a
<gphi
*> (phi
), def
, loop_latch_edge (loop
),
4894 if (dump_enabled_p ())
4896 dump_printf_loc (MSG_NOTE
, vect_location
,
4897 "transform reduction: created def-use cycle: ");
4898 dump_gimple_stmt (MSG_NOTE
, TDF_SLIM
, phi
, 0);
4899 dump_gimple_stmt (MSG_NOTE
, TDF_SLIM
, SSA_NAME_DEF_STMT (def
), 0);
4904 /* For cond reductions we want to create a new vector (INDEX_COND_EXPR)
4905 which is updated with the current index of the loop for every match of
4906 the original loop's cond_expr (VEC_STMT). This results in a vector
4907 containing the last time the condition passed for that vector lane.
4908 The first match will be a 1 to allow 0 to be used for non-matching
4909 indexes. If there are no matches at all then the vector will be all
4911 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info
) == COND_REDUCTION
)
4913 tree indx_before_incr
, indx_after_incr
;
4914 poly_uint64 nunits_out
= TYPE_VECTOR_SUBPARTS (vectype
);
4916 gimple
*vec_stmt
= STMT_VINFO_VEC_STMT (stmt_info
);
4917 gcc_assert (gimple_assign_rhs_code (vec_stmt
) == VEC_COND_EXPR
);
4919 int scalar_precision
4920 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (TREE_TYPE (vectype
)));
4921 tree cr_index_scalar_type
= make_unsigned_type (scalar_precision
);
4922 tree cr_index_vector_type
= build_vector_type
4923 (cr_index_scalar_type
, TYPE_VECTOR_SUBPARTS (vectype
));
4925 /* First we create a simple vector induction variable which starts
4926 with the values {1,2,3,...} (SERIES_VECT) and increments by the
4927 vector size (STEP). */
4929 /* Create a {1,2,3,...} vector. */
4930 tree series_vect
= build_index_vector (cr_index_vector_type
, 1, 1);
4932 /* Create a vector of the step value. */
4933 tree step
= build_int_cst (cr_index_scalar_type
, nunits_out
);
4934 tree vec_step
= build_vector_from_val (cr_index_vector_type
, step
);
4936 /* Create an induction variable. */
4937 gimple_stmt_iterator incr_gsi
;
4939 standard_iv_increment_position (loop
, &incr_gsi
, &insert_after
);
4940 create_iv (series_vect
, vec_step
, NULL_TREE
, loop
, &incr_gsi
,
4941 insert_after
, &indx_before_incr
, &indx_after_incr
);
4943 /* Next create a new phi node vector (NEW_PHI_TREE) which starts
4944 filled with zeros (VEC_ZERO). */
4946 /* Create a vector of 0s. */
4947 tree zero
= build_zero_cst (cr_index_scalar_type
);
4948 tree vec_zero
= build_vector_from_val (cr_index_vector_type
, zero
);
4950 /* Create a vector phi node. */
4951 tree new_phi_tree
= make_ssa_name (cr_index_vector_type
);
4952 new_phi
= create_phi_node (new_phi_tree
, loop
->header
);
4953 set_vinfo_for_stmt (new_phi
,
4954 new_stmt_vec_info (new_phi
, loop_vinfo
));
4955 add_phi_arg (as_a
<gphi
*> (new_phi
), vec_zero
,
4956 loop_preheader_edge (loop
), UNKNOWN_LOCATION
);
4958 /* Now take the condition from the loops original cond_expr
4959 (VEC_STMT) and produce a new cond_expr (INDEX_COND_EXPR) which for
4960 every match uses values from the induction variable
4961 (INDEX_BEFORE_INCR) otherwise uses values from the phi node
4963 Finally, we update the phi (NEW_PHI_TREE) to take the value of
4964 the new cond_expr (INDEX_COND_EXPR). */
4966 /* Duplicate the condition from vec_stmt. */
4967 tree ccompare
= unshare_expr (gimple_assign_rhs1 (vec_stmt
));
4969 /* Create a conditional, where the condition is taken from vec_stmt
4970 (CCOMPARE), then is the induction index (INDEX_BEFORE_INCR) and
4971 else is the phi (NEW_PHI_TREE). */
4972 tree index_cond_expr
= build3 (VEC_COND_EXPR
, cr_index_vector_type
,
4973 ccompare
, indx_before_incr
,
4975 induction_index
= make_ssa_name (cr_index_vector_type
);
4976 gimple
*index_condition
= gimple_build_assign (induction_index
,
4978 gsi_insert_before (&incr_gsi
, index_condition
, GSI_SAME_STMT
);
4979 stmt_vec_info index_vec_info
= new_stmt_vec_info (index_condition
,
4981 STMT_VINFO_VECTYPE (index_vec_info
) = cr_index_vector_type
;
4982 set_vinfo_for_stmt (index_condition
, index_vec_info
);
4984 /* Update the phi with the vec cond. */
4985 add_phi_arg (as_a
<gphi
*> (new_phi
), induction_index
,
4986 loop_latch_edge (loop
), UNKNOWN_LOCATION
);
4989 /* 2. Create epilog code.
4990 The reduction epilog code operates across the elements of the vector
4991 of partial results computed by the vectorized loop.
4992 The reduction epilog code consists of:
4994 step 1: compute the scalar result in a vector (v_out2)
4995 step 2: extract the scalar result (s_out3) from the vector (v_out2)
4996 step 3: adjust the scalar result (s_out3) if needed.
4998 Step 1 can be accomplished using one the following three schemes:
4999 (scheme 1) using reduc_fn, if available.
5000 (scheme 2) using whole-vector shifts, if available.
5001 (scheme 3) using a scalar loop. In this case steps 1+2 above are
5004 The overall epilog code looks like this:
5006 s_out0 = phi <s_loop> # original EXIT_PHI
5007 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
5008 v_out2 = reduce <v_out1> # step 1
5009 s_out3 = extract_field <v_out2, 0> # step 2
5010 s_out4 = adjust_result <s_out3> # step 3
5012 (step 3 is optional, and steps 1 and 2 may be combined).
5013 Lastly, the uses of s_out0 are replaced by s_out4. */
5016 /* 2.1 Create new loop-exit-phis to preserve loop-closed form:
5017 v_out1 = phi <VECT_DEF>
5018 Store them in NEW_PHIS. */
5020 exit_bb
= single_exit (loop
)->dest
;
5021 prev_phi_info
= NULL
;
5022 new_phis
.create (vect_defs
.length ());
5023 FOR_EACH_VEC_ELT (vect_defs
, i
, def
)
5025 for (j
= 0; j
< ncopies
; j
++)
5027 tree new_def
= copy_ssa_name (def
);
5028 phi
= create_phi_node (new_def
, exit_bb
);
5029 set_vinfo_for_stmt (phi
, new_stmt_vec_info (phi
, loop_vinfo
));
5031 new_phis
.quick_push (phi
);
5034 def
= vect_get_vec_def_for_stmt_copy (dt
, def
);
5035 STMT_VINFO_RELATED_STMT (prev_phi_info
) = phi
;
5038 SET_PHI_ARG_DEF (phi
, single_exit (loop
)->dest_idx
, def
);
5039 prev_phi_info
= vinfo_for_stmt (phi
);
5043 /* The epilogue is created for the outer-loop, i.e., for the loop being
5044 vectorized. Create exit phis for the outer loop. */
5048 exit_bb
= single_exit (loop
)->dest
;
5049 inner_phis
.create (vect_defs
.length ());
5050 FOR_EACH_VEC_ELT (new_phis
, i
, phi
)
5052 tree new_result
= copy_ssa_name (PHI_RESULT (phi
));
5053 gphi
*outer_phi
= create_phi_node (new_result
, exit_bb
);
5054 SET_PHI_ARG_DEF (outer_phi
, single_exit (loop
)->dest_idx
,
5056 set_vinfo_for_stmt (outer_phi
, new_stmt_vec_info (outer_phi
,
5058 inner_phis
.quick_push (phi
);
5059 new_phis
[i
] = outer_phi
;
5060 prev_phi_info
= vinfo_for_stmt (outer_phi
);
5061 while (STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi
)))
5063 phi
= STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi
));
5064 new_result
= copy_ssa_name (PHI_RESULT (phi
));
5065 outer_phi
= create_phi_node (new_result
, exit_bb
);
5066 SET_PHI_ARG_DEF (outer_phi
, single_exit (loop
)->dest_idx
,
5068 set_vinfo_for_stmt (outer_phi
, new_stmt_vec_info (outer_phi
,
5070 STMT_VINFO_RELATED_STMT (prev_phi_info
) = outer_phi
;
5071 prev_phi_info
= vinfo_for_stmt (outer_phi
);
5076 exit_gsi
= gsi_after_labels (exit_bb
);
5078 /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
5079 (i.e. when reduc_fn is not available) and in the final adjustment
5080 code (if needed). Also get the original scalar reduction variable as
5081 defined in the loop. In case STMT is a "pattern-stmt" (i.e. - it
5082 represents a reduction pattern), the tree-code and scalar-def are
5083 taken from the original stmt that the pattern-stmt (STMT) replaces.
5084 Otherwise (it is a regular reduction) - the tree-code and scalar-def
5085 are taken from STMT. */
5087 orig_stmt
= STMT_VINFO_RELATED_STMT (stmt_info
);
5090 /* Regular reduction */
5095 /* Reduction pattern */
5096 stmt_vec_info stmt_vinfo
= vinfo_for_stmt (orig_stmt
);
5097 gcc_assert (STMT_VINFO_IN_PATTERN_P (stmt_vinfo
));
5098 gcc_assert (STMT_VINFO_RELATED_STMT (stmt_vinfo
) == stmt
);
5101 code
= gimple_assign_rhs_code (orig_stmt
);
5102 /* For MINUS_EXPR the initial vector is [init_val,0,...,0], therefore,
5103 partial results are added and not subtracted. */
5104 if (code
== MINUS_EXPR
)
5107 scalar_dest
= gimple_assign_lhs (orig_stmt
);
5108 scalar_type
= TREE_TYPE (scalar_dest
);
5109 scalar_results
.create (group_size
);
5110 new_scalar_dest
= vect_create_destination_var (scalar_dest
, NULL
);
5111 bitsize
= TYPE_SIZE (scalar_type
);
5113 /* In case this is a reduction in an inner-loop while vectorizing an outer
5114 loop - we don't need to extract a single scalar result at the end of the
5115 inner-loop (unless it is double reduction, i.e., the use of reduction is
5116 outside the outer-loop). The final vector of partial results will be used
5117 in the vectorized outer-loop, or reduced to a scalar result at the end of
5119 if (nested_in_vect_loop
&& !double_reduc
)
5120 goto vect_finalize_reduction
;
5122 /* SLP reduction without reduction chain, e.g.,
5126 b2 = operation (b1) */
5127 slp_reduc
= (slp_node
&& !GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt
)));
5129 /* True if we should implement SLP_REDUC using native reduction operations
5130 instead of scalar operations. */
5131 direct_slp_reduc
= (reduc_fn
!= IFN_LAST
5133 && !TYPE_VECTOR_SUBPARTS (vectype
).is_constant ());
5135 /* In case of reduction chain, e.g.,
5138 a3 = operation (a2),
5140 we may end up with more than one vector result. Here we reduce them to
5142 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt
)) || direct_slp_reduc
)
5144 tree first_vect
= PHI_RESULT (new_phis
[0]);
5145 gassign
*new_vec_stmt
= NULL
;
5146 vec_dest
= vect_create_destination_var (scalar_dest
, vectype
);
5147 for (k
= 1; k
< new_phis
.length (); k
++)
5149 gimple
*next_phi
= new_phis
[k
];
5150 tree second_vect
= PHI_RESULT (next_phi
);
5151 tree tem
= make_ssa_name (vec_dest
, new_vec_stmt
);
5152 new_vec_stmt
= gimple_build_assign (tem
, code
,
5153 first_vect
, second_vect
);
5154 gsi_insert_before (&exit_gsi
, new_vec_stmt
, GSI_SAME_STMT
);
5158 new_phi_result
= first_vect
;
5161 new_phis
.truncate (0);
5162 new_phis
.safe_push (new_vec_stmt
);
5165 /* Likewise if we couldn't use a single defuse cycle. */
5166 else if (ncopies
> 1)
5168 gcc_assert (new_phis
.length () == 1);
5169 tree first_vect
= PHI_RESULT (new_phis
[0]);
5170 gassign
*new_vec_stmt
= NULL
;
5171 vec_dest
= vect_create_destination_var (scalar_dest
, vectype
);
5172 gimple
*next_phi
= new_phis
[0];
5173 for (int k
= 1; k
< ncopies
; ++k
)
5175 next_phi
= STMT_VINFO_RELATED_STMT (vinfo_for_stmt (next_phi
));
5176 tree second_vect
= PHI_RESULT (next_phi
);
5177 tree tem
= make_ssa_name (vec_dest
, new_vec_stmt
);
5178 new_vec_stmt
= gimple_build_assign (tem
, code
,
5179 first_vect
, second_vect
);
5180 gsi_insert_before (&exit_gsi
, new_vec_stmt
, GSI_SAME_STMT
);
5183 new_phi_result
= first_vect
;
5184 new_phis
.truncate (0);
5185 new_phis
.safe_push (new_vec_stmt
);
5188 new_phi_result
= PHI_RESULT (new_phis
[0]);
5190 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info
) == COND_REDUCTION
5191 && reduc_fn
!= IFN_LAST
)
5193 /* For condition reductions, we have a vector (NEW_PHI_RESULT) containing
5194 various data values where the condition matched and another vector
5195 (INDUCTION_INDEX) containing all the indexes of those matches. We
5196 need to extract the last matching index (which will be the index with
5197 highest value) and use this to index into the data vector.
5198 For the case where there were no matches, the data vector will contain
5199 all default values and the index vector will be all zeros. */
5201 /* Get various versions of the type of the vector of indexes. */
5202 tree index_vec_type
= TREE_TYPE (induction_index
);
5203 gcc_checking_assert (TYPE_UNSIGNED (index_vec_type
));
5204 tree index_scalar_type
= TREE_TYPE (index_vec_type
);
5205 tree index_vec_cmp_type
= build_same_sized_truth_vector_type
5208 /* Get an unsigned integer version of the type of the data vector. */
5209 int scalar_precision
5210 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type
));
5211 tree scalar_type_unsigned
= make_unsigned_type (scalar_precision
);
5212 tree vectype_unsigned
= build_vector_type
5213 (scalar_type_unsigned
, TYPE_VECTOR_SUBPARTS (vectype
));
5215 /* First we need to create a vector (ZERO_VEC) of zeros and another
5216 vector (MAX_INDEX_VEC) filled with the last matching index, which we
5217 can create using a MAX reduction and then expanding.
5218 In the case where the loop never made any matches, the max index will
5221 /* Vector of {0, 0, 0,...}. */
5222 tree zero_vec
= make_ssa_name (vectype
);
5223 tree zero_vec_rhs
= build_zero_cst (vectype
);
5224 gimple
*zero_vec_stmt
= gimple_build_assign (zero_vec
, zero_vec_rhs
);
5225 gsi_insert_before (&exit_gsi
, zero_vec_stmt
, GSI_SAME_STMT
);
5227 /* Find maximum value from the vector of found indexes. */
5228 tree max_index
= make_ssa_name (index_scalar_type
);
5229 gcall
*max_index_stmt
= gimple_build_call_internal (IFN_REDUC_MAX
,
5230 1, induction_index
);
5231 gimple_call_set_lhs (max_index_stmt
, max_index
);
5232 gsi_insert_before (&exit_gsi
, max_index_stmt
, GSI_SAME_STMT
);
5234 /* Vector of {max_index, max_index, max_index,...}. */
5235 tree max_index_vec
= make_ssa_name (index_vec_type
);
5236 tree max_index_vec_rhs
= build_vector_from_val (index_vec_type
,
5238 gimple
*max_index_vec_stmt
= gimple_build_assign (max_index_vec
,
5240 gsi_insert_before (&exit_gsi
, max_index_vec_stmt
, GSI_SAME_STMT
);
5242 /* Next we compare the new vector (MAX_INDEX_VEC) full of max indexes
5243 with the vector (INDUCTION_INDEX) of found indexes, choosing values
5244 from the data vector (NEW_PHI_RESULT) for matches, 0 (ZERO_VEC)
5245 otherwise. Only one value should match, resulting in a vector
5246 (VEC_COND) with one data value and the rest zeros.
5247 In the case where the loop never made any matches, every index will
5248 match, resulting in a vector with all data values (which will all be
5249 the default value). */
5251 /* Compare the max index vector to the vector of found indexes to find
5252 the position of the max value. */
5253 tree vec_compare
= make_ssa_name (index_vec_cmp_type
);
5254 gimple
*vec_compare_stmt
= gimple_build_assign (vec_compare
, EQ_EXPR
,
5257 gsi_insert_before (&exit_gsi
, vec_compare_stmt
, GSI_SAME_STMT
);
5259 /* Use the compare to choose either values from the data vector or
5261 tree vec_cond
= make_ssa_name (vectype
);
5262 gimple
*vec_cond_stmt
= gimple_build_assign (vec_cond
, VEC_COND_EXPR
,
5263 vec_compare
, new_phi_result
,
5265 gsi_insert_before (&exit_gsi
, vec_cond_stmt
, GSI_SAME_STMT
);
5267 /* Finally we need to extract the data value from the vector (VEC_COND)
5268 into a scalar (MATCHED_DATA_REDUC). Logically we want to do a OR
5269 reduction, but because this doesn't exist, we can use a MAX reduction
5270 instead. The data value might be signed or a float so we need to cast
5272 In the case where the loop never made any matches, the data values are
5273 all identical, and so will reduce down correctly. */
5275 /* Make the matched data values unsigned. */
5276 tree vec_cond_cast
= make_ssa_name (vectype_unsigned
);
5277 tree vec_cond_cast_rhs
= build1 (VIEW_CONVERT_EXPR
, vectype_unsigned
,
5279 gimple
*vec_cond_cast_stmt
= gimple_build_assign (vec_cond_cast
,
5282 gsi_insert_before (&exit_gsi
, vec_cond_cast_stmt
, GSI_SAME_STMT
);
5284 /* Reduce down to a scalar value. */
5285 tree data_reduc
= make_ssa_name (scalar_type_unsigned
);
5286 gcall
*data_reduc_stmt
= gimple_build_call_internal (IFN_REDUC_MAX
,
5288 gimple_call_set_lhs (data_reduc_stmt
, data_reduc
);
5289 gsi_insert_before (&exit_gsi
, data_reduc_stmt
, GSI_SAME_STMT
);
5291 /* Convert the reduced value back to the result type and set as the
5293 gimple_seq stmts
= NULL
;
5294 new_temp
= gimple_build (&stmts
, VIEW_CONVERT_EXPR
, scalar_type
,
5296 gsi_insert_seq_before (&exit_gsi
, stmts
, GSI_SAME_STMT
);
5297 scalar_results
.safe_push (new_temp
);
5299 else if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info
) == COND_REDUCTION
5300 && reduc_fn
== IFN_LAST
)
5302 /* Condition reduction without supported IFN_REDUC_MAX. Generate
5304 idx_val = induction_index[0];
5305 val = data_reduc[0];
5306 for (idx = 0, val = init, i = 0; i < nelts; ++i)
5307 if (induction_index[i] > idx_val)
5308 val = data_reduc[i], idx_val = induction_index[i];
5311 tree data_eltype
= TREE_TYPE (TREE_TYPE (new_phi_result
));
5312 tree idx_eltype
= TREE_TYPE (TREE_TYPE (induction_index
));
5313 unsigned HOST_WIDE_INT el_size
= tree_to_uhwi (TYPE_SIZE (idx_eltype
));
5314 poly_uint64 nunits
= TYPE_VECTOR_SUBPARTS (TREE_TYPE (induction_index
));
5315 /* Enforced by vectorizable_reduction, which ensures we have target
5316 support before allowing a conditional reduction on variable-length
5318 unsigned HOST_WIDE_INT v_size
= el_size
* nunits
.to_constant ();
5319 tree idx_val
= NULL_TREE
, val
= NULL_TREE
;
5320 for (unsigned HOST_WIDE_INT off
= 0; off
< v_size
; off
+= el_size
)
5322 tree old_idx_val
= idx_val
;
5324 idx_val
= make_ssa_name (idx_eltype
);
5325 epilog_stmt
= gimple_build_assign (idx_val
, BIT_FIELD_REF
,
5326 build3 (BIT_FIELD_REF
, idx_eltype
,
5328 bitsize_int (el_size
),
5329 bitsize_int (off
)));
5330 gsi_insert_before (&exit_gsi
, epilog_stmt
, GSI_SAME_STMT
);
5331 val
= make_ssa_name (data_eltype
);
5332 epilog_stmt
= gimple_build_assign (val
, BIT_FIELD_REF
,
5333 build3 (BIT_FIELD_REF
,
5336 bitsize_int (el_size
),
5337 bitsize_int (off
)));
5338 gsi_insert_before (&exit_gsi
, epilog_stmt
, GSI_SAME_STMT
);
5341 tree new_idx_val
= idx_val
;
5343 if (off
!= v_size
- el_size
)
5345 new_idx_val
= make_ssa_name (idx_eltype
);
5346 epilog_stmt
= gimple_build_assign (new_idx_val
,
5349 gsi_insert_before (&exit_gsi
, epilog_stmt
, GSI_SAME_STMT
);
5351 new_val
= make_ssa_name (data_eltype
);
5352 epilog_stmt
= gimple_build_assign (new_val
,
5359 gsi_insert_before (&exit_gsi
, epilog_stmt
, GSI_SAME_STMT
);
5360 idx_val
= new_idx_val
;
5364 /* Convert the reduced value back to the result type and set as the
5366 gimple_seq stmts
= NULL
;
5367 val
= gimple_convert (&stmts
, scalar_type
, val
);
5368 gsi_insert_seq_before (&exit_gsi
, stmts
, GSI_SAME_STMT
);
5369 scalar_results
.safe_push (val
);
5372 /* 2.3 Create the reduction code, using one of the three schemes described
5373 above. In SLP we simply need to extract all the elements from the
5374 vector (without reducing them), so we use scalar shifts. */
5375 else if (reduc_fn
!= IFN_LAST
&& !slp_reduc
)
5381 v_out2 = reduc_expr <v_out1> */
5383 if (dump_enabled_p ())
5384 dump_printf_loc (MSG_NOTE
, vect_location
,
5385 "Reduce using direct vector reduction.\n");
5387 vec_elem_type
= TREE_TYPE (TREE_TYPE (new_phi_result
));
5388 if (!useless_type_conversion_p (scalar_type
, vec_elem_type
))
5391 = vect_create_destination_var (scalar_dest
, vec_elem_type
);
5392 epilog_stmt
= gimple_build_call_internal (reduc_fn
, 1,
5394 gimple_set_lhs (epilog_stmt
, tmp_dest
);
5395 new_temp
= make_ssa_name (tmp_dest
, epilog_stmt
);
5396 gimple_set_lhs (epilog_stmt
, new_temp
);
5397 gsi_insert_before (&exit_gsi
, epilog_stmt
, GSI_SAME_STMT
);
5399 epilog_stmt
= gimple_build_assign (new_scalar_dest
, NOP_EXPR
,
5404 epilog_stmt
= gimple_build_call_internal (reduc_fn
, 1,
5406 gimple_set_lhs (epilog_stmt
, new_scalar_dest
);
5409 new_temp
= make_ssa_name (new_scalar_dest
, epilog_stmt
);
5410 gimple_set_lhs (epilog_stmt
, new_temp
);
5411 gsi_insert_before (&exit_gsi
, epilog_stmt
, GSI_SAME_STMT
);
5413 if ((STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info
)
5414 == INTEGER_INDUC_COND_REDUCTION
)
5415 && !operand_equal_p (initial_def
, induc_val
, 0))
5417 /* Earlier we set the initial value to be a vector if induc_val
5418 values. Check the result and if it is induc_val then replace
5419 with the original initial value, unless induc_val is
5420 the same as initial_def already. */
5421 tree zcompare
= build2 (EQ_EXPR
, boolean_type_node
, new_temp
,
5424 tmp
= make_ssa_name (new_scalar_dest
);
5425 epilog_stmt
= gimple_build_assign (tmp
, COND_EXPR
, zcompare
,
5426 initial_def
, new_temp
);
5427 gsi_insert_before (&exit_gsi
, epilog_stmt
, GSI_SAME_STMT
);
5431 scalar_results
.safe_push (new_temp
);
5433 else if (direct_slp_reduc
)
5435 /* Here we create one vector for each of the GROUP_SIZE results,
5436 with the elements for other SLP statements replaced with the
5437 neutral value. We can then do a normal reduction on each vector. */
5439 /* Enforced by vectorizable_reduction. */
5440 gcc_assert (new_phis
.length () == 1);
5441 gcc_assert (pow2p_hwi (group_size
));
5443 slp_tree orig_phis_slp_node
= slp_node_instance
->reduc_phis
;
5444 vec
<gimple
*> orig_phis
= SLP_TREE_SCALAR_STMTS (orig_phis_slp_node
);
5445 gimple_seq seq
= NULL
;
5447 /* Build a vector {0, 1, 2, ...}, with the same number of elements
5448 and the same element size as VECTYPE. */
5449 tree index
= build_index_vector (vectype
, 0, 1);
5450 tree index_type
= TREE_TYPE (index
);
5451 tree index_elt_type
= TREE_TYPE (index_type
);
5452 tree mask_type
= build_same_sized_truth_vector_type (index_type
);
5454 /* Create a vector that, for each element, identifies which of
5455 the GROUP_SIZE results should use it. */
5456 tree index_mask
= build_int_cst (index_elt_type
, group_size
- 1);
5457 index
= gimple_build (&seq
, BIT_AND_EXPR
, index_type
, index
,
5458 build_vector_from_val (index_type
, index_mask
));
5460 /* Get a neutral vector value. This is simply a splat of the neutral
5461 scalar value if we have one, otherwise the initial scalar value
5462 is itself a neutral value. */
5463 tree vector_identity
= NULL_TREE
;
5465 vector_identity
= gimple_build_vector_from_val (&seq
, vectype
,
5467 for (unsigned int i
= 0; i
< group_size
; ++i
)
5469 /* If there's no univeral neutral value, we can use the
5470 initial scalar value from the original PHI. This is used
5471 for MIN and MAX reduction, for example. */
5475 = PHI_ARG_DEF_FROM_EDGE (orig_phis
[i
],
5476 loop_preheader_edge (loop
));
5477 vector_identity
= gimple_build_vector_from_val (&seq
, vectype
,
5481 /* Calculate the equivalent of:
5483 sel[j] = (index[j] == i);
5485 which selects the elements of NEW_PHI_RESULT that should
5486 be included in the result. */
5487 tree compare_val
= build_int_cst (index_elt_type
, i
);
5488 compare_val
= build_vector_from_val (index_type
, compare_val
);
5489 tree sel
= gimple_build (&seq
, EQ_EXPR
, mask_type
,
5490 index
, compare_val
);
5492 /* Calculate the equivalent of:
5494 vec = seq ? new_phi_result : vector_identity;
5496 VEC is now suitable for a full vector reduction. */
5497 tree vec
= gimple_build (&seq
, VEC_COND_EXPR
, vectype
,
5498 sel
, new_phi_result
, vector_identity
);
5500 /* Do the reduction and convert it to the appropriate type. */
5501 gcall
*call
= gimple_build_call_internal (reduc_fn
, 1, vec
);
5502 tree scalar
= make_ssa_name (TREE_TYPE (vectype
));
5503 gimple_call_set_lhs (call
, scalar
);
5504 gimple_seq_add_stmt (&seq
, call
);
5505 scalar
= gimple_convert (&seq
, scalar_type
, scalar
);
5506 scalar_results
.safe_push (scalar
);
5508 gsi_insert_seq_before (&exit_gsi
, seq
, GSI_SAME_STMT
);
5512 bool reduce_with_shift
;
5515 /* COND reductions all do the final reduction with MAX_EXPR
5517 if (code
== COND_EXPR
)
5519 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info
)
5520 == INTEGER_INDUC_COND_REDUCTION
)
5526 /* See if the target wants to do the final (shift) reduction
5527 in a vector mode of smaller size and first reduce upper/lower
5528 halves against each other. */
5529 enum machine_mode mode1
= mode
;
5530 tree vectype1
= vectype
;
5531 unsigned sz
= tree_to_uhwi (TYPE_SIZE_UNIT (vectype
));
5534 && (mode1
= targetm
.vectorize
.split_reduction (mode
)) != mode
)
5535 sz1
= GET_MODE_SIZE (mode1
).to_constant ();
5537 vectype1
= get_vectype_for_scalar_type_and_size (scalar_type
, sz1
);
5538 reduce_with_shift
= have_whole_vector_shift (mode1
);
5539 if (!VECTOR_MODE_P (mode1
))
5540 reduce_with_shift
= false;
5543 optab optab
= optab_for_tree_code (code
, vectype1
, optab_default
);
5544 if (optab_handler (optab
, mode1
) == CODE_FOR_nothing
)
5545 reduce_with_shift
= false;
5548 /* First reduce the vector to the desired vector size we should
5549 do shift reduction on by combining upper and lower halves. */
5550 new_temp
= new_phi_result
;
5553 gcc_assert (!slp_reduc
);
5555 vectype1
= get_vectype_for_scalar_type_and_size (scalar_type
, sz
);
5557 /* The target has to make sure we support lowpart/highpart
5558 extraction, either via direct vector extract or through
5559 an integer mode punning. */
5561 if (convert_optab_handler (vec_extract_optab
,
5562 TYPE_MODE (TREE_TYPE (new_temp
)),
5563 TYPE_MODE (vectype1
))
5564 != CODE_FOR_nothing
)
5566 /* Extract sub-vectors directly once vec_extract becomes
5567 a conversion optab. */
5568 dst1
= make_ssa_name (vectype1
);
5570 = gimple_build_assign (dst1
, BIT_FIELD_REF
,
5571 build3 (BIT_FIELD_REF
, vectype1
,
5572 new_temp
, TYPE_SIZE (vectype1
),
5574 gsi_insert_before (&exit_gsi
, epilog_stmt
, GSI_SAME_STMT
);
5575 dst2
= make_ssa_name (vectype1
);
5577 = gimple_build_assign (dst2
, BIT_FIELD_REF
,
5578 build3 (BIT_FIELD_REF
, vectype1
,
5579 new_temp
, TYPE_SIZE (vectype1
),
5580 bitsize_int (sz
* BITS_PER_UNIT
)));
5581 gsi_insert_before (&exit_gsi
, epilog_stmt
, GSI_SAME_STMT
);
5585 /* Extract via punning to appropriately sized integer mode
5587 tree eltype
= build_nonstandard_integer_type (sz
* BITS_PER_UNIT
,
5589 tree etype
= build_vector_type (eltype
, 2);
5590 gcc_assert (convert_optab_handler (vec_extract_optab
,
5593 != CODE_FOR_nothing
);
5594 tree tem
= make_ssa_name (etype
);
5595 epilog_stmt
= gimple_build_assign (tem
, VIEW_CONVERT_EXPR
,
5596 build1 (VIEW_CONVERT_EXPR
,
5598 gsi_insert_before (&exit_gsi
, epilog_stmt
, GSI_SAME_STMT
);
5600 tem
= make_ssa_name (eltype
);
5602 = gimple_build_assign (tem
, BIT_FIELD_REF
,
5603 build3 (BIT_FIELD_REF
, eltype
,
5604 new_temp
, TYPE_SIZE (eltype
),
5606 gsi_insert_before (&exit_gsi
, epilog_stmt
, GSI_SAME_STMT
);
5607 dst1
= make_ssa_name (vectype1
);
5608 epilog_stmt
= gimple_build_assign (dst1
, VIEW_CONVERT_EXPR
,
5609 build1 (VIEW_CONVERT_EXPR
,
5611 gsi_insert_before (&exit_gsi
, epilog_stmt
, GSI_SAME_STMT
);
5612 tem
= make_ssa_name (eltype
);
5614 = gimple_build_assign (tem
, BIT_FIELD_REF
,
5615 build3 (BIT_FIELD_REF
, eltype
,
5616 new_temp
, TYPE_SIZE (eltype
),
5617 bitsize_int (sz
* BITS_PER_UNIT
)));
5618 gsi_insert_before (&exit_gsi
, epilog_stmt
, GSI_SAME_STMT
);
5619 dst2
= make_ssa_name (vectype1
);
5620 epilog_stmt
= gimple_build_assign (dst2
, VIEW_CONVERT_EXPR
,
5621 build1 (VIEW_CONVERT_EXPR
,
5623 gsi_insert_before (&exit_gsi
, epilog_stmt
, GSI_SAME_STMT
);
5626 new_temp
= make_ssa_name (vectype1
);
5627 epilog_stmt
= gimple_build_assign (new_temp
, code
, dst1
, dst2
);
5628 gsi_insert_before (&exit_gsi
, epilog_stmt
, GSI_SAME_STMT
);
5631 if (reduce_with_shift
&& !slp_reduc
)
5633 int element_bitsize
= tree_to_uhwi (bitsize
);
5634 /* Enforced by vectorizable_reduction, which disallows SLP reductions
5635 for variable-length vectors and also requires direct target support
5636 for loop reductions. */
5637 int vec_size_in_bits
= tree_to_uhwi (TYPE_SIZE (vectype1
));
5638 int nelements
= vec_size_in_bits
/ element_bitsize
;
5639 vec_perm_builder sel
;
5640 vec_perm_indices indices
;
5644 tree zero_vec
= build_zero_cst (vectype1
);
5646 for (offset = nelements/2; offset >= 1; offset/=2)
5648 Create: va' = vec_shift <va, offset>
5649 Create: va = vop <va, va'>
5654 if (dump_enabled_p ())
5655 dump_printf_loc (MSG_NOTE
, vect_location
,
5656 "Reduce using vector shifts\n");
5658 mode1
= TYPE_MODE (vectype1
);
5659 vec_dest
= vect_create_destination_var (scalar_dest
, vectype1
);
5660 for (elt_offset
= nelements
/ 2;
5664 calc_vec_perm_mask_for_shift (elt_offset
, nelements
, &sel
);
5665 indices
.new_vector (sel
, 2, nelements
);
5666 tree mask
= vect_gen_perm_mask_any (vectype1
, indices
);
5667 epilog_stmt
= gimple_build_assign (vec_dest
, VEC_PERM_EXPR
,
5668 new_temp
, zero_vec
, mask
);
5669 new_name
= make_ssa_name (vec_dest
, epilog_stmt
);
5670 gimple_assign_set_lhs (epilog_stmt
, new_name
);
5671 gsi_insert_before (&exit_gsi
, epilog_stmt
, GSI_SAME_STMT
);
5673 epilog_stmt
= gimple_build_assign (vec_dest
, code
, new_name
,
5675 new_temp
= make_ssa_name (vec_dest
, epilog_stmt
);
5676 gimple_assign_set_lhs (epilog_stmt
, new_temp
);
5677 gsi_insert_before (&exit_gsi
, epilog_stmt
, GSI_SAME_STMT
);
5680 /* 2.4 Extract the final scalar result. Create:
5681 s_out3 = extract_field <v_out2, bitpos> */
5683 if (dump_enabled_p ())
5684 dump_printf_loc (MSG_NOTE
, vect_location
,
5685 "extract scalar result\n");
5687 rhs
= build3 (BIT_FIELD_REF
, scalar_type
, new_temp
,
5688 bitsize
, bitsize_zero_node
);
5689 epilog_stmt
= gimple_build_assign (new_scalar_dest
, rhs
);
5690 new_temp
= make_ssa_name (new_scalar_dest
, epilog_stmt
);
5691 gimple_assign_set_lhs (epilog_stmt
, new_temp
);
5692 gsi_insert_before (&exit_gsi
, epilog_stmt
, GSI_SAME_STMT
);
5693 scalar_results
.safe_push (new_temp
);
5698 s = extract_field <v_out2, 0>
5699 for (offset = element_size;
5700 offset < vector_size;
5701 offset += element_size;)
5703 Create: s' = extract_field <v_out2, offset>
5704 Create: s = op <s, s'> // For non SLP cases
5707 if (dump_enabled_p ())
5708 dump_printf_loc (MSG_NOTE
, vect_location
,
5709 "Reduce using scalar code.\n");
5711 int vec_size_in_bits
= tree_to_uhwi (TYPE_SIZE (vectype1
));
5712 int element_bitsize
= tree_to_uhwi (bitsize
);
5713 FOR_EACH_VEC_ELT (new_phis
, i
, new_phi
)
5716 if (gimple_code (new_phi
) == GIMPLE_PHI
)
5717 vec_temp
= PHI_RESULT (new_phi
);
5719 vec_temp
= gimple_assign_lhs (new_phi
);
5720 tree rhs
= build3 (BIT_FIELD_REF
, scalar_type
, vec_temp
, bitsize
,
5722 epilog_stmt
= gimple_build_assign (new_scalar_dest
, rhs
);
5723 new_temp
= make_ssa_name (new_scalar_dest
, epilog_stmt
);
5724 gimple_assign_set_lhs (epilog_stmt
, new_temp
);
5725 gsi_insert_before (&exit_gsi
, epilog_stmt
, GSI_SAME_STMT
);
5727 /* In SLP we don't need to apply reduction operation, so we just
5728 collect s' values in SCALAR_RESULTS. */
5730 scalar_results
.safe_push (new_temp
);
5732 for (bit_offset
= element_bitsize
;
5733 bit_offset
< vec_size_in_bits
;
5734 bit_offset
+= element_bitsize
)
5736 tree bitpos
= bitsize_int (bit_offset
);
5737 tree rhs
= build3 (BIT_FIELD_REF
, scalar_type
, vec_temp
,
5740 epilog_stmt
= gimple_build_assign (new_scalar_dest
, rhs
);
5741 new_name
= make_ssa_name (new_scalar_dest
, epilog_stmt
);
5742 gimple_assign_set_lhs (epilog_stmt
, new_name
);
5743 gsi_insert_before (&exit_gsi
, epilog_stmt
, GSI_SAME_STMT
);
5747 /* In SLP we don't need to apply reduction operation, so
5748 we just collect s' values in SCALAR_RESULTS. */
5749 new_temp
= new_name
;
5750 scalar_results
.safe_push (new_name
);
5754 epilog_stmt
= gimple_build_assign (new_scalar_dest
, code
,
5755 new_name
, new_temp
);
5756 new_temp
= make_ssa_name (new_scalar_dest
, epilog_stmt
);
5757 gimple_assign_set_lhs (epilog_stmt
, new_temp
);
5758 gsi_insert_before (&exit_gsi
, epilog_stmt
, GSI_SAME_STMT
);
5763 /* The only case where we need to reduce scalar results in SLP, is
5764 unrolling. If the size of SCALAR_RESULTS is greater than
5765 GROUP_SIZE, we reduce them combining elements modulo
5769 tree res
, first_res
, new_res
;
5772 /* Reduce multiple scalar results in case of SLP unrolling. */
5773 for (j
= group_size
; scalar_results
.iterate (j
, &res
);
5776 first_res
= scalar_results
[j
% group_size
];
5777 new_stmt
= gimple_build_assign (new_scalar_dest
, code
,
5779 new_res
= make_ssa_name (new_scalar_dest
, new_stmt
);
5780 gimple_assign_set_lhs (new_stmt
, new_res
);
5781 gsi_insert_before (&exit_gsi
, new_stmt
, GSI_SAME_STMT
);
5782 scalar_results
[j
% group_size
] = new_res
;
5786 /* Not SLP - we have one scalar to keep in SCALAR_RESULTS. */
5787 scalar_results
.safe_push (new_temp
);
5790 if ((STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info
)
5791 == INTEGER_INDUC_COND_REDUCTION
)
5792 && !operand_equal_p (initial_def
, induc_val
, 0))
5794 /* Earlier we set the initial value to be a vector if induc_val
5795 values. Check the result and if it is induc_val then replace
5796 with the original initial value, unless induc_val is
5797 the same as initial_def already. */
5798 tree zcompare
= build2 (EQ_EXPR
, boolean_type_node
, new_temp
,
5801 tree tmp
= make_ssa_name (new_scalar_dest
);
5802 epilog_stmt
= gimple_build_assign (tmp
, COND_EXPR
, zcompare
,
5803 initial_def
, new_temp
);
5804 gsi_insert_before (&exit_gsi
, epilog_stmt
, GSI_SAME_STMT
);
5805 scalar_results
[0] = tmp
;
5809 vect_finalize_reduction
:
5814 /* 2.5 Adjust the final result by the initial value of the reduction
5815 variable. (When such adjustment is not needed, then
5816 'adjustment_def' is zero). For example, if code is PLUS we create:
5817 new_temp = loop_exit_def + adjustment_def */
5821 gcc_assert (!slp_reduc
);
5822 if (nested_in_vect_loop
)
5824 new_phi
= new_phis
[0];
5825 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def
)) == VECTOR_TYPE
);
5826 expr
= build2 (code
, vectype
, PHI_RESULT (new_phi
), adjustment_def
);
5827 new_dest
= vect_create_destination_var (scalar_dest
, vectype
);
5831 new_temp
= scalar_results
[0];
5832 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def
)) != VECTOR_TYPE
);
5833 expr
= build2 (code
, scalar_type
, new_temp
, adjustment_def
);
5834 new_dest
= vect_create_destination_var (scalar_dest
, scalar_type
);
5837 epilog_stmt
= gimple_build_assign (new_dest
, expr
);
5838 new_temp
= make_ssa_name (new_dest
, epilog_stmt
);
5839 gimple_assign_set_lhs (epilog_stmt
, new_temp
);
5840 gsi_insert_before (&exit_gsi
, epilog_stmt
, GSI_SAME_STMT
);
5841 if (nested_in_vect_loop
)
5843 set_vinfo_for_stmt (epilog_stmt
,
5844 new_stmt_vec_info (epilog_stmt
, loop_vinfo
));
5845 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (epilog_stmt
)) =
5846 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (new_phi
));
5849 scalar_results
.quick_push (new_temp
);
5851 scalar_results
[0] = new_temp
;
5854 scalar_results
[0] = new_temp
;
5856 new_phis
[0] = epilog_stmt
;
5859 /* 2.6 Handle the loop-exit phis. Replace the uses of scalar loop-exit
5860 phis with new adjusted scalar results, i.e., replace use <s_out0>
5865 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
5866 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
5867 v_out2 = reduce <v_out1>
5868 s_out3 = extract_field <v_out2, 0>
5869 s_out4 = adjust_result <s_out3>
5876 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
5877 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
5878 v_out2 = reduce <v_out1>
5879 s_out3 = extract_field <v_out2, 0>
5880 s_out4 = adjust_result <s_out3>
5885 /* In SLP reduction chain we reduce vector results into one vector if
5886 necessary, hence we set here GROUP_SIZE to 1. SCALAR_DEST is the LHS of
5887 the last stmt in the reduction chain, since we are looking for the loop
5889 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt
)))
5891 gimple
*dest_stmt
= SLP_TREE_SCALAR_STMTS (slp_node
)[group_size
- 1];
5892 /* Handle reduction patterns. */
5893 if (STMT_VINFO_RELATED_STMT (vinfo_for_stmt (dest_stmt
)))
5894 dest_stmt
= STMT_VINFO_RELATED_STMT (vinfo_for_stmt (dest_stmt
));
5896 scalar_dest
= gimple_assign_lhs (dest_stmt
);
5900 /* In SLP we may have several statements in NEW_PHIS and REDUCTION_PHIS (in
5901 case that GROUP_SIZE is greater than vectorization factor). Therefore, we
5902 need to match SCALAR_RESULTS with corresponding statements. The first
5903 (GROUP_SIZE / number of new vector stmts) scalar results correspond to
5904 the first vector stmt, etc.
5905 (RATIO is equal to (GROUP_SIZE / number of new vector stmts)). */
5906 if (group_size
> new_phis
.length ())
5908 ratio
= group_size
/ new_phis
.length ();
5909 gcc_assert (!(group_size
% new_phis
.length ()));
5914 for (k
= 0; k
< group_size
; k
++)
5918 epilog_stmt
= new_phis
[k
/ ratio
];
5919 reduction_phi
= reduction_phis
[k
/ ratio
];
5921 inner_phi
= inner_phis
[k
/ ratio
];
5926 gimple
*current_stmt
= SLP_TREE_SCALAR_STMTS (slp_node
)[k
];
5928 orig_stmt
= STMT_VINFO_RELATED_STMT (vinfo_for_stmt (current_stmt
));
5929 /* SLP statements can't participate in patterns. */
5930 gcc_assert (!orig_stmt
);
5931 scalar_dest
= gimple_assign_lhs (current_stmt
);
5935 /* Find the loop-closed-use at the loop exit of the original scalar
5936 result. (The reduction result is expected to have two immediate uses -
5937 one at the latch block, and one at the loop exit). */
5938 FOR_EACH_IMM_USE_FAST (use_p
, imm_iter
, scalar_dest
)
5939 if (!flow_bb_inside_loop_p (loop
, gimple_bb (USE_STMT (use_p
)))
5940 && !is_gimple_debug (USE_STMT (use_p
)))
5941 phis
.safe_push (USE_STMT (use_p
));
5943 /* While we expect to have found an exit_phi because of loop-closed-ssa
5944 form we can end up without one if the scalar cycle is dead. */
5946 FOR_EACH_VEC_ELT (phis
, i
, exit_phi
)
5950 stmt_vec_info exit_phi_vinfo
= vinfo_for_stmt (exit_phi
);
5953 /* FORNOW. Currently not supporting the case that an inner-loop
5954 reduction is not used in the outer-loop (but only outside the
5955 outer-loop), unless it is double reduction. */
5956 gcc_assert ((STMT_VINFO_RELEVANT_P (exit_phi_vinfo
)
5957 && !STMT_VINFO_LIVE_P (exit_phi_vinfo
))
5961 STMT_VINFO_VEC_STMT (exit_phi_vinfo
) = inner_phi
;
5963 STMT_VINFO_VEC_STMT (exit_phi_vinfo
) = epilog_stmt
;
5965 || STMT_VINFO_DEF_TYPE (exit_phi_vinfo
)
5966 != vect_double_reduction_def
)
5969 /* Handle double reduction:
5971 stmt1: s1 = phi <s0, s2> - double reduction phi (outer loop)
5972 stmt2: s3 = phi <s1, s4> - (regular) reduc phi (inner loop)
5973 stmt3: s4 = use (s3) - (regular) reduc stmt (inner loop)
5974 stmt4: s2 = phi <s4> - double reduction stmt (outer loop)
5976 At that point the regular reduction (stmt2 and stmt3) is
5977 already vectorized, as well as the exit phi node, stmt4.
5978 Here we vectorize the phi node of double reduction, stmt1, and
5979 update all relevant statements. */
5981 /* Go through all the uses of s2 to find double reduction phi
5982 node, i.e., stmt1 above. */
5983 orig_name
= PHI_RESULT (exit_phi
);
5984 FOR_EACH_IMM_USE_STMT (use_stmt
, imm_iter
, orig_name
)
5986 stmt_vec_info use_stmt_vinfo
;
5987 stmt_vec_info new_phi_vinfo
;
5988 tree vect_phi_init
, preheader_arg
, vect_phi_res
;
5989 basic_block bb
= gimple_bb (use_stmt
);
5992 /* Check that USE_STMT is really double reduction phi
5994 if (gimple_code (use_stmt
) != GIMPLE_PHI
5995 || gimple_phi_num_args (use_stmt
) != 2
5996 || bb
->loop_father
!= outer_loop
)
5998 use_stmt_vinfo
= vinfo_for_stmt (use_stmt
);
6000 || STMT_VINFO_DEF_TYPE (use_stmt_vinfo
)
6001 != vect_double_reduction_def
)
6004 /* Create vector phi node for double reduction:
6005 vs1 = phi <vs0, vs2>
6006 vs1 was created previously in this function by a call to
6007 vect_get_vec_def_for_operand and is stored in
6009 vs2 is defined by INNER_PHI, the vectorized EXIT_PHI;
6010 vs0 is created here. */
6012 /* Create vector phi node. */
6013 vect_phi
= create_phi_node (vec_initial_def
, bb
);
6014 new_phi_vinfo
= new_stmt_vec_info (vect_phi
,
6015 loop_vec_info_for_loop (outer_loop
));
6016 set_vinfo_for_stmt (vect_phi
, new_phi_vinfo
);
6018 /* Create vs0 - initial def of the double reduction phi. */
6019 preheader_arg
= PHI_ARG_DEF_FROM_EDGE (use_stmt
,
6020 loop_preheader_edge (outer_loop
));
6021 vect_phi_init
= get_initial_def_for_reduction
6022 (stmt
, preheader_arg
, NULL
);
6024 /* Update phi node arguments with vs0 and vs2. */
6025 add_phi_arg (vect_phi
, vect_phi_init
,
6026 loop_preheader_edge (outer_loop
),
6028 add_phi_arg (vect_phi
, PHI_RESULT (inner_phi
),
6029 loop_latch_edge (outer_loop
), UNKNOWN_LOCATION
);
6030 if (dump_enabled_p ())
6032 dump_printf_loc (MSG_NOTE
, vect_location
,
6033 "created double reduction phi node: ");
6034 dump_gimple_stmt (MSG_NOTE
, TDF_SLIM
, vect_phi
, 0);
6037 vect_phi_res
= PHI_RESULT (vect_phi
);
6039 /* Replace the use, i.e., set the correct vs1 in the regular
6040 reduction phi node. FORNOW, NCOPIES is always 1, so the
6041 loop is redundant. */
6042 use
= reduction_phi
;
6043 for (j
= 0; j
< ncopies
; j
++)
6045 edge pr_edge
= loop_preheader_edge (loop
);
6046 SET_PHI_ARG_DEF (use
, pr_edge
->dest_idx
, vect_phi_res
);
6047 use
= STMT_VINFO_RELATED_STMT (vinfo_for_stmt (use
));
6054 if (nested_in_vect_loop
)
6063 /* Find the loop-closed-use at the loop exit of the original scalar
6064 result. (The reduction result is expected to have two immediate uses,
6065 one at the latch block, and one at the loop exit). For double
6066 reductions we are looking for exit phis of the outer loop. */
6067 FOR_EACH_IMM_USE_FAST (use_p
, imm_iter
, scalar_dest
)
6069 if (!flow_bb_inside_loop_p (loop
, gimple_bb (USE_STMT (use_p
))))
6071 if (!is_gimple_debug (USE_STMT (use_p
)))
6072 phis
.safe_push (USE_STMT (use_p
));
6076 if (double_reduc
&& gimple_code (USE_STMT (use_p
)) == GIMPLE_PHI
)
6078 tree phi_res
= PHI_RESULT (USE_STMT (use_p
));
6080 FOR_EACH_IMM_USE_FAST (phi_use_p
, phi_imm_iter
, phi_res
)
6082 if (!flow_bb_inside_loop_p (loop
,
6083 gimple_bb (USE_STMT (phi_use_p
)))
6084 && !is_gimple_debug (USE_STMT (phi_use_p
)))
6085 phis
.safe_push (USE_STMT (phi_use_p
));
6091 FOR_EACH_VEC_ELT (phis
, i
, exit_phi
)
6093 /* Replace the uses: */
6094 orig_name
= PHI_RESULT (exit_phi
);
6095 scalar_result
= scalar_results
[k
];
6096 FOR_EACH_IMM_USE_STMT (use_stmt
, imm_iter
, orig_name
)
6097 FOR_EACH_IMM_USE_ON_STMT (use_p
, imm_iter
)
6098 SET_USE (use_p
, scalar_result
);
/* NOTE(review): this region of the file is a corrupted extraction -- each
   original source line is split across several physical lines and some
   original lines are missing entirely (the embedded numbers such as "6110"
   are the original line numbers; e.g. the function's return statement and
   closing brace do not appear below).  Only comments are added here; every
   code token is left exactly as found.  TODO: restore from a pristine copy
   of GCC's tree-vect-loop.c before attempting any code change.  */
6105 /* Return a vector of type VECTYPE that is equal to the vector select
6106 operation "MASK ? VEC : IDENTITY". Insert the select statements
/* Presumably returns COND (built below) -- the return is among the missing
   lines; confirm against the original source.  */
6110 merge_with_identity (gimple_stmt_iterator
*gsi
, tree mask
, tree vectype
,
6111 tree vec
, tree identity
)
/* Create a fresh SSA temporary of the vector type to hold the select
   result.  */
6113 tree cond
= make_temp_ssa_name (vectype
, NULL
, "cond");
/* Build COND = VEC_COND_EXPR <mask, vec, identity> and insert it before
   GSI without advancing the iterator.  */
6114 gimple
*new_stmt
= gimple_build_assign (cond
, VEC_COND_EXPR
,
6115 mask
, vec
, identity
);
6116 gsi_insert_before (gsi
, new_stmt
, GSI_SAME_STMT
);
/* NOTE(review): corrupted extraction -- original lines are split across
   several physical lines and some are missing (gaps in the embedded line
   numbers, e.g. 6141-6142 holding the BIT_FIELD_REF position operands and
   the trailing "return lhs;" are absent).  Only comments are added; the
   code text is byte-identical to what was found.  */
6120 /* Successively apply CODE to each element of VECTOR_RHS, in left-to-right
6121 order, starting with LHS. Insert the extraction statements before GSI and
6122 associate the new scalar SSA names with variable SCALAR_DEST.
6123 Return the SSA name for the result. */
6126 vect_expand_fold_left (gimple_stmt_iterator
*gsi
, tree scalar_dest
,
6127 tree_code code
, tree lhs
, tree vector_rhs
)
/* Derive the scalar element type and its bit size from the vector operand;
   these drive the per-element BIT_FIELD_REF extraction loop below.  */
6129 tree vectype
= TREE_TYPE (vector_rhs
);
6130 tree scalar_type
= TREE_TYPE (vectype
);
6131 tree bitsize
= TYPE_SIZE (scalar_type
);
6132 unsigned HOST_WIDE_INT vec_size_in_bits
= tree_to_uhwi (TYPE_SIZE (vectype
));
6133 unsigned HOST_WIDE_INT element_bitsize
= tree_to_uhwi (bitsize
);
/* Walk the vector one element at a time, in increasing bit offset
   (i.e. left-to-right element order).  */
6135 for (unsigned HOST_WIDE_INT bit_offset
= 0;
6136 bit_offset
< vec_size_in_bits
;
6137 bit_offset
+= element_bitsize
)
6139 tree bitpos
= bitsize_int (bit_offset
);
/* Extract one scalar element: RHS = BIT_FIELD_REF <vector_rhs, ...>.
   The size/position operands were on the missing lines 6141-6142;
   presumably (bitsize, bitpos) -- confirm against the original.  */
6140 tree rhs
= build3 (BIT_FIELD_REF
, scalar_type
, vector_rhs
,
6143 gassign
*stmt
= gimple_build_assign (scalar_dest
, rhs
)
;
/* Give the extraction its own SSA name associated with SCALAR_DEST.  */
6144 rhs
= make_ssa_name (scalar_dest
, stmt
);
6145 gimple_assign_set_lhs (stmt
, rhs
);
6146 gsi_insert_before (gsi
, stmt
, GSI_SAME_STMT
);
/* Fold the element into the running result: new = CODE (lhs, rhs).  */
6148 stmt
= gimple_build_assign (scalar_dest
, code
, lhs
, rhs
);
6149 tree new_name
= make_ssa_name (scalar_dest
, stmt
);
6150 gimple_assign_set_lhs (stmt
, new_name
);
6151 gsi_insert_before (gsi
, stmt
, GSI_SAME_STMT
);
/* NOTE(review): corrupted extraction -- original lines are split and many
   are missing (gaps in embedded line numbers: braces, else-branches and the
   final return are absent).  Only comments are added; code text is left
   byte-identical.  Do not edit logic here without the pristine source.  */
6157 /* Perform an in-order reduction (FOLD_LEFT_REDUCTION). STMT is the
6158 statement that sets the live-out value. REDUC_DEF_STMT is the phi
6159 statement. CODE is the operation performed by STMT and OPS are
6160 its scalar operands. REDUC_INDEX is the index of the operand in
6161 OPS that is set by REDUC_DEF_STMT. REDUC_FN is the function that
6162 implements in-order reduction, or IFN_LAST if we should open-code it.
6163 VECTYPE_IN is the type of the vector input. MASKS specifies the masks
6164 that should be used to control the operation in a fully-masked loop. */
6167 vectorize_fold_left_reduction (gimple
*stmt
, gimple_stmt_iterator
*gsi
,
6168 gimple
**vec_stmt
, slp_tree slp_node
,
6169 gimple
*reduc_def_stmt
,
6170 tree_code code
, internal_fn reduc_fn
,
6171 tree ops
[3], tree vectype_in
,
6172 int reduc_index
, vec_loop_masks
*masks
)
/* Look up the loop and vector-type context for STMT.  */
6174 stmt_vec_info stmt_info
= vinfo_for_stmt (stmt
);
6175 loop_vec_info loop_vinfo
= STMT_VINFO_LOOP_VINFO (stmt_info
);
6176 struct loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
6177 tree vectype_out
= STMT_VINFO_VECTYPE (stmt_info
);
6178 gimple
*new_stmt
= NULL
;
6184 ncopies
= vect_get_num_copies (loop_vinfo
, vectype_in
);
/* Preconditions established by analysis: not a nested loop, a single copy,
   a binary operation, the reduction operand in the expected position, and
   an in-order (fold-left) reduction.  */
6186 gcc_assert (!nested_in_vect_loop_p (loop
, stmt
));
6187 gcc_assert (ncopies
== 1);
6188 gcc_assert (TREE_CODE_LENGTH (code
) == binary_op
);
6189 gcc_assert (reduc_index
== (code
== MINUS_EXPR
? 0 : 1));
6190 gcc_assert (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info
)
6191 == FOLD_LEFT_REDUCTION
);
6194 gcc_assert (known_eq (TYPE_VECTOR_SUBPARTS (vectype_out
),
6195 TYPE_VECTOR_SUBPARTS (vectype_in
)));
/* The non-reduction operand of the statement.  */
6197 tree op0
= ops
[1 - reduc_index
];
6200 gimple
*scalar_dest_def
;
6201 auto_vec
<tree
> vec_oprnds0
;
/* SLP path (presumably guarded by a missing "if (slp_node)" -- the branch
   structure lines are absent): gather the vector defs for the whole group
   and take the last scalar statement as the destination definition.  */
6204 vect_get_vec_defs (op0
, NULL_TREE
, stmt
, &vec_oprnds0
, NULL
, slp_node
);
6205 group_size
= SLP_TREE_SCALAR_STMTS (slp_node
).length ();
6206 scalar_dest_def
= SLP_TREE_SCALAR_STMTS (slp_node
)[group_size
- 1];
/* Non-SLP path: a single vector def for OP0.  */
6210 tree loop_vec_def0
= vect_get_vec_def_for_operand (op0
, stmt
);
6211 vec_oprnds0
.create (1);
6212 vec_oprnds0
.quick_push (loop_vec_def0
);
6213 scalar_dest_def
= stmt
;
6216 tree scalar_dest
= gimple_assign_lhs (scalar_dest_def
);
6217 tree scalar_type
= TREE_TYPE (scalar_dest
);
/* The running reduction value starts as the scalar PHI result.  */
6218 tree reduc_var
= gimple_phi_result (reduc_def_stmt
);
6220 int vec_num
= vec_oprnds0
.length ();
6221 gcc_assert (vec_num
== 1 || slp_node
);
6222 tree vec_elem_type
= TREE_TYPE (vectype_out
);
6223 gcc_checking_assert (useless_type_conversion_p (scalar_type
, vec_elem_type
));
/* In a fully-masked loop, inactive lanes are replaced by zero (the
   identity used by merge_with_identity below).  */
6225 tree vector_identity
= NULL_TREE
;
6226 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo
))
6227 vector_identity
= build_zero_cst (vectype_out
);
6229 tree scalar_dest_var
= vect_create_destination_var (scalar_dest
, NULL
);
/* Chain one in-order reduction per vector operand; each iteration feeds
   its result (REDUC_VAR) into the next.  */
6232 FOR_EACH_VEC_ELT (vec_oprnds0
, i
, def0
)
6234 tree mask
= NULL_TREE
;
6235 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo
))
6236 mask
= vect_get_loop_mask (gsi
, masks
, vec_num
, vectype_in
, i
);
6238 /* Handle MINUS by adding the negative. */
6239 if (reduc_fn
!= IFN_LAST
&& code
== MINUS_EXPR
)
6241 tree negated
= make_ssa_name (vectype_out
);
6242 new_stmt
= gimple_build_assign (negated
, NEGATE_EXPR
, def0
);
6243 gsi_insert_before (gsi
, new_stmt
, GSI_SAME_STMT
);
/* Mask off inactive lanes (trailing identity argument was on a missing
   line; presumably VECTOR_IDENTITY).  */
6248 def0
= merge_with_identity (gsi
, mask
, vectype_out
, def0
,
6251 /* On the first iteration the input is simply the scalar phi
6252 result, and for subsequent iterations it is the output of
6253 the preceding operation. */
6254 if (reduc_fn
!= IFN_LAST
)
/* Direct target support: emit an internal-fn call such as
   IFN_FOLD_LEFT_PLUS (reduc_var, def0).  */
6256 new_stmt
= gimple_build_call_internal (reduc_fn
, 2, reduc_var
, def0
);
6257 /* For chained SLP reductions the output of the previous reduction
6258 operation serves as the input of the next. For the final statement
6259 the output cannot be a temporary - we reuse the original
6260 scalar destination of the last statement. */
6261 if (i
!= vec_num
- 1)
6263 gimple_set_lhs (new_stmt
, scalar_dest_var
);
6264 reduc_var
= make_ssa_name (scalar_dest_var
, new_stmt
);
6265 gimple_set_lhs (new_stmt
, reduc_var
);
/* Open-coded fallback (presumably the "else" of the reduc_fn test --
   the branch lines are missing): expand the fold-left element by
   element.  */
6270 reduc_var
= vect_expand_fold_left (gsi
, scalar_dest_var
, code
,
6272 new_stmt
= SSA_NAME_DEF_STMT (reduc_var
);
6273 /* Remove the statement, so that we can use the same code paths
6274 as for statements that we've just created. */
6275 gimple_stmt_iterator tmp_gsi
= gsi_for_stmt (new_stmt
);
6276 gsi_remove (&tmp_gsi
, false);
/* The last statement in the chain takes over the original scalar
   destination and replaces the scalar statement in place.  */
6279 if (i
== vec_num
- 1)
6281 gimple_set_lhs (new_stmt
, scalar_dest
);
6282 vect_finish_replace_stmt (scalar_dest_def
, new_stmt
);
6285 vect_finish_stmt_generation (scalar_dest_def
, new_stmt
, gsi
);
6288 SLP_TREE_VEC_STMTS (slp_node
).quick_push (new_stmt
);
/* Record the vectorized statement for the caller.  */
6292 STMT_VINFO_VEC_STMT (stmt_info
) = *vec_stmt
= new_stmt
;
/* NOTE(review): corrupted extraction -- original lines are split and the
   early "return false;" statements plus the overflow checks after each
   wide-int operation are among the missing lines.  Only comments added;
   code text left byte-identical.  */
6297 /* Function is_nonwrapping_integer_induction.
6299 Check if STMT (which is part of loop LOOP) both increments and
6300 does not cause overflow. */
6303 is_nonwrapping_integer_induction (gimple
*stmt
, struct loop
*loop
)
/* Fetch the induction's base and step from the PHI's recorded scalar
   evolution.  */
6305 stmt_vec_info stmt_vinfo
= vinfo_for_stmt (stmt
);
6306 tree base
= STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo
);
6307 tree step
= STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo
);
6308 tree lhs_type
= TREE_TYPE (gimple_phi_result (stmt
));
6309 widest_int ni
, max_loop_value
, lhs_max
;
6310 bool overflow
= false;
6312 /* Make sure the loop is integer based. */
6313 if (TREE_CODE (base
) != INTEGER_CST
6314 || TREE_CODE (step
) != INTEGER_CST
)
6317 /* Check that the max size of the loop will not wrap. */
/* If signed overflow is undefined for the PHI type, the induction cannot
   legitimately wrap (result of this test was on a missing line;
   presumably "return true").  */
6319 if (TYPE_OVERFLOW_UNDEFINED (lhs_type
))
/* Need a bound on the number of latch executions to reason about the
   maximum value reached.  */
6322 if (! max_stmt_executions (loop
, &ni
))
/* max value ~= base + step * niters, computed in widest_int with overflow
   tracking (the &overflow argument of the multiply was on a missing
   line).  */
6325 max_loop_value
= wi::mul (wi::to_widest (step
), ni
, TYPE_SIGN (lhs_type
),
6330 max_loop_value
= wi::add (wi::to_widest (base
), max_loop_value
,
6331 TYPE_SIGN (lhs_type
), &overflow
);
/* Non-wrapping iff the maximum value still fits in the PHI type's
   precision.  */
6335 return (wi::min_precision (max_loop_value
, TYPE_SIGN (lhs_type
))
6336 <= TYPE_PRECISION (lhs_type
));
6339 /* Function vectorizable_reduction.
6341 Check if STMT performs a reduction operation that can be vectorized.
6342 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
6343 stmt to replace it, put it in VEC_STMT, and insert it at GSI.
6344 Return FALSE if not a vectorizable STMT, TRUE otherwise.
6346 This function also handles reduction idioms (patterns) that have been
6347 recognized in advance during vect_pattern_recog. In this case, STMT may be
6349 X = pattern_expr (arg0, arg1, ..., X)
6350 and it's STMT_VINFO_RELATED_STMT points to the last stmt in the original
6351 sequence that had been detected and replaced by the pattern-stmt (STMT).
6353 This function also handles reduction of condition expressions, for example:
6354 for (int i = 0; i < N; i++)
6357 This is handled by vectorising the loop and creating an additional vector
6358 containing the loop indexes for which "a[i] < value" was true. In the
6359 function epilogue this is reduced to a single max value and then used to
6360 index into the vector of results.
6362 In some cases of reduction patterns, the type of the reduction variable X is
6363 different than the type of the other arguments of STMT.
6364 In such cases, the vectype that is used when transforming STMT into a vector
6365 stmt is different than the vectype that is used to determine the
6366 vectorization factor, because it consists of a different number of elements
6367 than the actual number of elements that are being operated upon in parallel.
6369 For example, consider an accumulation of shorts into an int accumulator.
6370 On some targets it's possible to vectorize this pattern operating on 8
6371 shorts at a time (hence, the vectype for purposes of determining the
6372 vectorization factor should be V8HI); on the other hand, the vectype that
6373 is used to create the vector form is actually V4SI (the type of the result).
6375 Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
6376 indicates what is the actual level of parallelism (V8HI in the example), so
6377 that the right vectorization factor would be derived. This vectype
6378 corresponds to the type of arguments to the reduction stmt, and should *NOT*
6379 be used to create the vectorized stmt. The right vectype for the vectorized
6380 stmt is obtained from the type of the result X:
6381 get_vectype_for_scalar_type (TREE_TYPE (X))
6383 This means that, contrary to "regular" reductions (or "regular" stmts in
6384 general), the following equation:
6385 STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (TREE_TYPE (X))
6386 does *NOT* necessarily hold for reduction patterns. */
6389 vectorizable_reduction (gimple
*stmt
, gimple_stmt_iterator
*gsi
,
6390 gimple
**vec_stmt
, slp_tree slp_node
,
6391 slp_instance slp_node_instance
)
6395 stmt_vec_info stmt_info
= vinfo_for_stmt (stmt
);
6396 tree vectype_out
= STMT_VINFO_VECTYPE (stmt_info
);
6397 tree vectype_in
= NULL_TREE
;
6398 loop_vec_info loop_vinfo
= STMT_VINFO_LOOP_VINFO (stmt_info
);
6399 struct loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
6400 enum tree_code code
, orig_code
;
6401 internal_fn reduc_fn
;
6402 machine_mode vec_mode
;
6405 tree new_temp
= NULL_TREE
;
6407 enum vect_def_type dt
, cond_reduc_dt
= vect_unknown_def_type
;
6408 gimple
*cond_reduc_def_stmt
= NULL
;
6409 enum tree_code cond_reduc_op_code
= ERROR_MARK
;
6413 stmt_vec_info orig_stmt_info
= NULL
;
6417 stmt_vec_info prev_stmt_info
, prev_phi_info
;
6418 bool single_defuse_cycle
= false;
6419 gimple
*new_stmt
= NULL
;
6422 enum vect_def_type dts
[3];
6423 bool nested_cycle
= false, found_nested_cycle_def
= false;
6424 bool double_reduc
= false;
6426 struct loop
* def_stmt_loop
, *outer_loop
= NULL
;
6428 gimple
*def_arg_stmt
;
6429 auto_vec
<tree
> vec_oprnds0
;
6430 auto_vec
<tree
> vec_oprnds1
;
6431 auto_vec
<tree
> vec_oprnds2
;
6432 auto_vec
<tree
> vect_defs
;
6433 auto_vec
<gimple
*> phis
;
6436 bool first_p
= true;
6437 tree cr_index_scalar_type
= NULL_TREE
, cr_index_vector_type
= NULL_TREE
;
6438 tree cond_reduc_val
= NULL_TREE
;
6440 /* Make sure it was already recognized as a reduction computation. */
6441 if (STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmt
)) != vect_reduction_def
6442 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmt
)) != vect_nested_cycle
)
6445 if (nested_in_vect_loop_p (loop
, stmt
))
6449 nested_cycle
= true;
6452 /* In case of reduction chain we switch to the first stmt in the chain, but
6453 we don't update STMT_INFO, since only the last stmt is marked as reduction
6454 and has reduction properties. */
6455 if (GROUP_FIRST_ELEMENT (stmt_info
)
6456 && GROUP_FIRST_ELEMENT (stmt_info
) != stmt
)
6458 stmt
= GROUP_FIRST_ELEMENT (stmt_info
);
6462 if (gimple_code (stmt
) == GIMPLE_PHI
)
6464 /* Analysis is fully done on the reduction stmt invocation. */
6468 slp_node_instance
->reduc_phis
= slp_node
;
6470 STMT_VINFO_TYPE (stmt_info
) = reduc_vec_info_type
;
6474 if (STMT_VINFO_REDUC_TYPE (stmt_info
) == FOLD_LEFT_REDUCTION
)
6475 /* Leave the scalar phi in place. Note that checking
6476 STMT_VINFO_VEC_REDUCTION_TYPE (as below) only works
6477 for reductions involving a single statement. */
6480 gimple
*reduc_stmt
= STMT_VINFO_REDUC_DEF (stmt_info
);
6481 if (STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (reduc_stmt
)))
6482 reduc_stmt
= STMT_VINFO_RELATED_STMT (vinfo_for_stmt (reduc_stmt
));
6484 if (STMT_VINFO_VEC_REDUCTION_TYPE (vinfo_for_stmt (reduc_stmt
))
6485 == EXTRACT_LAST_REDUCTION
)
6486 /* Leave the scalar phi in place. */
6489 gcc_assert (is_gimple_assign (reduc_stmt
));
6490 for (unsigned k
= 1; k
< gimple_num_ops (reduc_stmt
); ++k
)
6492 tree op
= gimple_op (reduc_stmt
, k
);
6493 if (op
== gimple_phi_result (stmt
))
6496 && gimple_assign_rhs_code (reduc_stmt
) == COND_EXPR
)
6499 || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in
)))
6500 < GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (op
)))))
6501 vectype_in
= get_vectype_for_scalar_type (TREE_TYPE (op
));
6504 gcc_assert (vectype_in
);
6509 ncopies
= vect_get_num_copies (loop_vinfo
, vectype_in
);
6511 use_operand_p use_p
;
6514 && (STMT_VINFO_RELEVANT (vinfo_for_stmt (reduc_stmt
))
6515 <= vect_used_only_live
)
6516 && single_imm_use (gimple_phi_result (stmt
), &use_p
, &use_stmt
)
6517 && (use_stmt
== reduc_stmt
6518 || (STMT_VINFO_RELATED_STMT (vinfo_for_stmt (use_stmt
))
6520 single_defuse_cycle
= true;
6522 /* Create the destination vector */
6523 scalar_dest
= gimple_assign_lhs (reduc_stmt
);
6524 vec_dest
= vect_create_destination_var (scalar_dest
, vectype_out
);
6527 /* The size vect_schedule_slp_instance computes is off for us. */
6528 vec_num
= vect_get_num_vectors
6529 (LOOP_VINFO_VECT_FACTOR (loop_vinfo
)
6530 * SLP_TREE_SCALAR_STMTS (slp_node
).length (),
6535 /* Generate the reduction PHIs upfront. */
6536 prev_phi_info
= NULL
;
6537 for (j
= 0; j
< ncopies
; j
++)
6539 if (j
== 0 || !single_defuse_cycle
)
6541 for (i
= 0; i
< vec_num
; i
++)
6543 /* Create the reduction-phi that defines the reduction
6545 gimple
*new_phi
= create_phi_node (vec_dest
, loop
->header
);
6546 set_vinfo_for_stmt (new_phi
,
6547 new_stmt_vec_info (new_phi
, loop_vinfo
));
6550 SLP_TREE_VEC_STMTS (slp_node
).quick_push (new_phi
);
6554 STMT_VINFO_VEC_STMT (stmt_info
) = *vec_stmt
= new_phi
;
6556 STMT_VINFO_RELATED_STMT (prev_phi_info
) = new_phi
;
6557 prev_phi_info
= vinfo_for_stmt (new_phi
);
6566 /* 1. Is vectorizable reduction? */
6567 /* Not supportable if the reduction variable is used in the loop, unless
6568 it's a reduction chain. */
6569 if (STMT_VINFO_RELEVANT (stmt_info
) > vect_used_in_outer
6570 && !GROUP_FIRST_ELEMENT (stmt_info
))
6573 /* Reductions that are not used even in an enclosing outer-loop,
6574 are expected to be "live" (used out of the loop). */
6575 if (STMT_VINFO_RELEVANT (stmt_info
) == vect_unused_in_scope
6576 && !STMT_VINFO_LIVE_P (stmt_info
))
6579 /* 2. Has this been recognized as a reduction pattern?
6581 Check if STMT represents a pattern that has been recognized
6582 in earlier analysis stages. For stmts that represent a pattern,
6583 the STMT_VINFO_RELATED_STMT field records the last stmt in
6584 the original sequence that constitutes the pattern. */
6586 orig_stmt
= STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt
));
6589 orig_stmt_info
= vinfo_for_stmt (orig_stmt
);
6590 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info
));
6591 gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info
));
6594 /* 3. Check the operands of the operation. The first operands are defined
6595 inside the loop body. The last operand is the reduction variable,
6596 which is defined by the loop-header-phi. */
6598 gcc_assert (is_gimple_assign (stmt
));
6601 switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt
)))
6603 case GIMPLE_BINARY_RHS
:
6604 code
= gimple_assign_rhs_code (stmt
);
6605 op_type
= TREE_CODE_LENGTH (code
);
6606 gcc_assert (op_type
== binary_op
);
6607 ops
[0] = gimple_assign_rhs1 (stmt
);
6608 ops
[1] = gimple_assign_rhs2 (stmt
);
6611 case GIMPLE_TERNARY_RHS
:
6612 code
= gimple_assign_rhs_code (stmt
);
6613 op_type
= TREE_CODE_LENGTH (code
);
6614 gcc_assert (op_type
== ternary_op
);
6615 ops
[0] = gimple_assign_rhs1 (stmt
);
6616 ops
[1] = gimple_assign_rhs2 (stmt
);
6617 ops
[2] = gimple_assign_rhs3 (stmt
);
6620 case GIMPLE_UNARY_RHS
:
6627 if (code
== COND_EXPR
&& slp_node
)
6630 scalar_dest
= gimple_assign_lhs (stmt
);
6631 scalar_type
= TREE_TYPE (scalar_dest
);
6632 if (!POINTER_TYPE_P (scalar_type
) && !INTEGRAL_TYPE_P (scalar_type
)
6633 && !SCALAR_FLOAT_TYPE_P (scalar_type
))
6636 /* Do not try to vectorize bit-precision reductions. */
6637 if (!type_has_mode_precision_p (scalar_type
))
6640 /* All uses but the last are expected to be defined in the loop.
6641 The last use is the reduction variable. In case of nested cycle this
6642 assumption is not true: we use reduc_index to record the index of the
6643 reduction variable. */
6644 gimple
*reduc_def_stmt
= NULL
;
6645 int reduc_index
= -1;
6646 for (i
= 0; i
< op_type
; i
++)
6648 /* The condition of COND_EXPR is checked in vectorizable_condition(). */
6649 if (i
== 0 && code
== COND_EXPR
)
6652 is_simple_use
= vect_is_simple_use (ops
[i
], loop_vinfo
,
6653 &def_stmt
, &dts
[i
], &tem
);
6655 gcc_assert (is_simple_use
);
6656 if (dt
== vect_reduction_def
)
6658 reduc_def_stmt
= def_stmt
;
6664 /* To properly compute ncopies we are interested in the widest
6665 input type in case we're looking at a widening accumulation. */
6667 || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in
)))
6668 < GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (tem
)))))
6672 if (dt
!= vect_internal_def
6673 && dt
!= vect_external_def
6674 && dt
!= vect_constant_def
6675 && dt
!= vect_induction_def
6676 && !(dt
== vect_nested_cycle
&& nested_cycle
))
6679 if (dt
== vect_nested_cycle
)
6681 found_nested_cycle_def
= true;
6682 reduc_def_stmt
= def_stmt
;
6686 if (i
== 1 && code
== COND_EXPR
)
6688 /* Record how value of COND_EXPR is defined. */
6689 if (dt
== vect_constant_def
)
6692 cond_reduc_val
= ops
[i
];
6694 if (dt
== vect_induction_def
6696 && is_nonwrapping_integer_induction (def_stmt
, loop
))
6699 cond_reduc_def_stmt
= def_stmt
;
6705 vectype_in
= vectype_out
;
6707 /* When vectorizing a reduction chain w/o SLP the reduction PHI is not
6708 directly used in stmt. */
6709 if (reduc_index
== -1)
6711 if (STMT_VINFO_REDUC_TYPE (stmt_info
) == FOLD_LEFT_REDUCTION
)
6713 if (dump_enabled_p ())
6714 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
6715 "in-order reduction chain without SLP.\n");
6720 reduc_def_stmt
= STMT_VINFO_REDUC_DEF (orig_stmt_info
);
6722 reduc_def_stmt
= STMT_VINFO_REDUC_DEF (stmt_info
);
6725 if (! reduc_def_stmt
|| gimple_code (reduc_def_stmt
) != GIMPLE_PHI
)
6728 if (!(reduc_index
== -1
6729 || dts
[reduc_index
] == vect_reduction_def
6730 || dts
[reduc_index
] == vect_nested_cycle
6731 || ((dts
[reduc_index
] == vect_internal_def
6732 || dts
[reduc_index
] == vect_external_def
6733 || dts
[reduc_index
] == vect_constant_def
6734 || dts
[reduc_index
] == vect_induction_def
)
6735 && nested_cycle
&& found_nested_cycle_def
)))
6737 /* For pattern recognized stmts, orig_stmt might be a reduction,
6738 but some helper statements for the pattern might not, or
6739 might be COND_EXPRs with reduction uses in the condition. */
6740 gcc_assert (orig_stmt
);
6744 stmt_vec_info reduc_def_info
= vinfo_for_stmt (reduc_def_stmt
);
6745 enum vect_reduction_type v_reduc_type
6746 = STMT_VINFO_REDUC_TYPE (reduc_def_info
);
6747 gimple
*tmp
= STMT_VINFO_REDUC_DEF (reduc_def_info
);
6749 STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info
) = v_reduc_type
;
6750 /* If we have a condition reduction, see if we can simplify it further. */
6751 if (v_reduc_type
== COND_REDUCTION
)
6753 /* Loop peeling modifies initial value of reduction PHI, which
6754 makes the reduction stmt to be transformed different to the
6755 original stmt analyzed. We need to record reduction code for
6756 CONST_COND_REDUCTION type reduction at analyzing stage, thus
6757 it can be used directly at transform stage. */
6758 if (STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info
) == MAX_EXPR
6759 || STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info
) == MIN_EXPR
)
6761 /* Also set the reduction type to CONST_COND_REDUCTION. */
6762 gcc_assert (cond_reduc_dt
== vect_constant_def
);
6763 STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info
) = CONST_COND_REDUCTION
;
6765 else if (direct_internal_fn_supported_p (IFN_FOLD_EXTRACT_LAST
,
6766 vectype_in
, OPTIMIZE_FOR_SPEED
))
6768 if (dump_enabled_p ())
6769 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
6770 "optimizing condition reduction with"
6771 " FOLD_EXTRACT_LAST.\n");
6772 STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info
) = EXTRACT_LAST_REDUCTION
;
6774 else if (cond_reduc_dt
== vect_induction_def
)
6776 stmt_vec_info cond_stmt_vinfo
= vinfo_for_stmt (cond_reduc_def_stmt
);
6778 = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (cond_stmt_vinfo
);
6779 tree step
= STMT_VINFO_LOOP_PHI_EVOLUTION_PART (cond_stmt_vinfo
);
6781 gcc_assert (TREE_CODE (base
) == INTEGER_CST
6782 && TREE_CODE (step
) == INTEGER_CST
);
6783 cond_reduc_val
= NULL_TREE
;
6784 /* Find a suitable value, for MAX_EXPR below base, for MIN_EXPR
6785 above base; punt if base is the minimum value of the type for
6786 MAX_EXPR or maximum value of the type for MIN_EXPR for now. */
6787 if (tree_int_cst_sgn (step
) == -1)
6789 cond_reduc_op_code
= MIN_EXPR
;
6790 if (tree_int_cst_sgn (base
) == -1)
6791 cond_reduc_val
= build_int_cst (TREE_TYPE (base
), 0);
6792 else if (tree_int_cst_lt (base
,
6793 TYPE_MAX_VALUE (TREE_TYPE (base
))))
6795 = int_const_binop (PLUS_EXPR
, base
, integer_one_node
);
6799 cond_reduc_op_code
= MAX_EXPR
;
6800 if (tree_int_cst_sgn (base
) == 1)
6801 cond_reduc_val
= build_int_cst (TREE_TYPE (base
), 0);
6802 else if (tree_int_cst_lt (TYPE_MIN_VALUE (TREE_TYPE (base
)),
6805 = int_const_binop (MINUS_EXPR
, base
, integer_one_node
);
6809 if (dump_enabled_p ())
6810 dump_printf_loc (MSG_NOTE
, vect_location
,
6811 "condition expression based on "
6812 "integer induction.\n");
6813 STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info
)
6814 = INTEGER_INDUC_COND_REDUCTION
;
6817 else if (cond_reduc_dt
== vect_constant_def
)
6819 enum vect_def_type cond_initial_dt
;
6820 gimple
*def_stmt
= SSA_NAME_DEF_STMT (ops
[reduc_index
]);
6821 tree cond_initial_val
6822 = PHI_ARG_DEF_FROM_EDGE (def_stmt
, loop_preheader_edge (loop
));
6824 gcc_assert (cond_reduc_val
!= NULL_TREE
);
6825 vect_is_simple_use (cond_initial_val
, loop_vinfo
,
6826 &def_stmt
, &cond_initial_dt
);
6827 if (cond_initial_dt
== vect_constant_def
6828 && types_compatible_p (TREE_TYPE (cond_initial_val
),
6829 TREE_TYPE (cond_reduc_val
)))
6831 tree e
= fold_binary (LE_EXPR
, boolean_type_node
,
6832 cond_initial_val
, cond_reduc_val
);
6833 if (e
&& (integer_onep (e
) || integer_zerop (e
)))
6835 if (dump_enabled_p ())
6836 dump_printf_loc (MSG_NOTE
, vect_location
,
6837 "condition expression based on "
6838 "compile time constant.\n");
6839 /* Record reduction code at analysis stage. */
6840 STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info
)
6841 = integer_onep (e
) ? MAX_EXPR
: MIN_EXPR
;
6842 STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info
)
6843 = CONST_COND_REDUCTION
;
6850 gcc_assert (tmp
== orig_stmt
6851 || GROUP_FIRST_ELEMENT (vinfo_for_stmt (tmp
)) == orig_stmt
);
6853 /* We changed STMT to be the first stmt in reduction chain, hence we
6854 check that in this case the first element in the chain is STMT. */
6855 gcc_assert (stmt
== tmp
6856 || GROUP_FIRST_ELEMENT (vinfo_for_stmt (tmp
)) == stmt
);
6858 if (STMT_VINFO_LIVE_P (vinfo_for_stmt (reduc_def_stmt
)))
6864 ncopies
= vect_get_num_copies (loop_vinfo
, vectype_in
);
6866 gcc_assert (ncopies
>= 1);
6868 vec_mode
= TYPE_MODE (vectype_in
);
6869 poly_uint64 nunits_out
= TYPE_VECTOR_SUBPARTS (vectype_out
);
6871 if (code
== COND_EXPR
)
6873 /* Only call during the analysis stage, otherwise we'll lose
6875 if (!vec_stmt
&& !vectorizable_condition (stmt
, gsi
, NULL
,
6876 ops
[reduc_index
], 0, NULL
))
6878 if (dump_enabled_p ())
6879 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
6880 "unsupported condition in reduction\n");
6886 /* 4. Supportable by target? */
6888 if (code
== LSHIFT_EXPR
|| code
== RSHIFT_EXPR
6889 || code
== LROTATE_EXPR
|| code
== RROTATE_EXPR
)
6891 /* Shifts and rotates are only supported by vectorizable_shifts,
6892 not vectorizable_reduction. */
6893 if (dump_enabled_p ())
6894 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
6895 "unsupported shift or rotation.\n");
6899 /* 4.1. check support for the operation in the loop */
6900 optab
= optab_for_tree_code (code
, vectype_in
, optab_default
);
6903 if (dump_enabled_p ())
6904 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
6910 if (optab_handler (optab
, vec_mode
) == CODE_FOR_nothing
)
6912 if (dump_enabled_p ())
6913 dump_printf (MSG_NOTE
, "op not supported by target.\n");
6915 if (maybe_ne (GET_MODE_SIZE (vec_mode
), UNITS_PER_WORD
)
6916 || !vect_worthwhile_without_simd_p (loop_vinfo
, code
))
6919 if (dump_enabled_p ())
6920 dump_printf (MSG_NOTE
, "proceeding using word mode.\n");
6923 /* Worthwhile without SIMD support? */
6924 if (!VECTOR_MODE_P (TYPE_MODE (vectype_in
))
6925 && !vect_worthwhile_without_simd_p (loop_vinfo
, code
))
6927 if (dump_enabled_p ())
6928 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
6929 "not worthwhile without SIMD support.\n");
6935 /* 4.2. Check support for the epilog operation.
6937 If STMT represents a reduction pattern, then the type of the
6938 reduction variable may be different than the type of the rest
6939 of the arguments. For example, consider the case of accumulation
6940 of shorts into an int accumulator; The original code:
6941 S1: int_a = (int) short_a;
6942 orig_stmt-> S2: int_acc = plus <int_a ,int_acc>;
6945 STMT: int_acc = widen_sum <short_a, int_acc>
6948 1. The tree-code that is used to create the vector operation in the
6949 epilog code (that reduces the partial results) is not the
6950 tree-code of STMT, but is rather the tree-code of the original
6951 stmt from the pattern that STMT is replacing. I.e, in the example
6952 above we want to use 'widen_sum' in the loop, but 'plus' in the
6954 2. The type (mode) we use to check available target support
6955 for the vector operation to be created in the *epilog*, is
6956 determined by the type of the reduction variable (in the example
6957 above we'd check this: optab_handler (plus_optab, vect_int_mode])).
6958 However the type (mode) we use to check available target support
6959 for the vector operation to be created *inside the loop*, is
6960 determined by the type of the other arguments to STMT (in the
6961 example we'd check this: optab_handler (widen_sum_optab,
6964 This is contrary to "regular" reductions, in which the types of all
6965 the arguments are the same as the type of the reduction variable.
6966 For "regular" reductions we can therefore use the same vector type
6967 (and also the same tree-code) when generating the epilog code and
6968 when generating the code inside the loop. */
6970 vect_reduction_type reduction_type
6971 = STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info
);
6973 && (reduction_type
== TREE_CODE_REDUCTION
6974 || reduction_type
== FOLD_LEFT_REDUCTION
))
6976 /* This is a reduction pattern: get the vectype from the type of the
6977 reduction variable, and get the tree-code from orig_stmt. */
6978 orig_code
= gimple_assign_rhs_code (orig_stmt
);
6979 gcc_assert (vectype_out
);
6980 vec_mode
= TYPE_MODE (vectype_out
);
6984 /* Regular reduction: use the same vectype and tree-code as used for
6985 the vector code inside the loop can be used for the epilog code. */
6988 if (code
== MINUS_EXPR
)
6989 orig_code
= PLUS_EXPR
;
6991 /* For simple condition reductions, replace with the actual expression
6992 we want to base our reduction around. */
6993 if (reduction_type
== CONST_COND_REDUCTION
)
6995 orig_code
= STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info
);
6996 gcc_assert (orig_code
== MAX_EXPR
|| orig_code
== MIN_EXPR
);
6998 else if (reduction_type
== INTEGER_INDUC_COND_REDUCTION
)
6999 orig_code
= cond_reduc_op_code
;
7004 def_bb
= gimple_bb (reduc_def_stmt
);
7005 def_stmt_loop
= def_bb
->loop_father
;
7006 def_arg
= PHI_ARG_DEF_FROM_EDGE (reduc_def_stmt
,
7007 loop_preheader_edge (def_stmt_loop
));
7008 if (TREE_CODE (def_arg
) == SSA_NAME
7009 && (def_arg_stmt
= SSA_NAME_DEF_STMT (def_arg
))
7010 && gimple_code (def_arg_stmt
) == GIMPLE_PHI
7011 && flow_bb_inside_loop_p (outer_loop
, gimple_bb (def_arg_stmt
))
7012 && vinfo_for_stmt (def_arg_stmt
)
7013 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_arg_stmt
))
7014 == vect_double_reduction_def
)
7015 double_reduc
= true;
7018 reduc_fn
= IFN_LAST
;
7020 if (reduction_type
== TREE_CODE_REDUCTION
7021 || reduction_type
== FOLD_LEFT_REDUCTION
7022 || reduction_type
== INTEGER_INDUC_COND_REDUCTION
7023 || reduction_type
== CONST_COND_REDUCTION
)
7025 if (reduction_type
== FOLD_LEFT_REDUCTION
7026 ? fold_left_reduction_fn (orig_code
, &reduc_fn
)
7027 : reduction_fn_for_scalar_code (orig_code
, &reduc_fn
))
7029 if (reduc_fn
!= IFN_LAST
7030 && !direct_internal_fn_supported_p (reduc_fn
, vectype_out
,
7031 OPTIMIZE_FOR_SPEED
))
7033 if (dump_enabled_p ())
7034 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
7035 "reduc op not supported by target.\n");
7037 reduc_fn
= IFN_LAST
;
7042 if (!nested_cycle
|| double_reduc
)
7044 if (dump_enabled_p ())
7045 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
7046 "no reduc code for scalar code.\n");
7052 else if (reduction_type
== COND_REDUCTION
)
7054 int scalar_precision
7055 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type
));
7056 cr_index_scalar_type
= make_unsigned_type (scalar_precision
);
7057 cr_index_vector_type
= build_vector_type (cr_index_scalar_type
,
7060 if (direct_internal_fn_supported_p (IFN_REDUC_MAX
, cr_index_vector_type
,
7061 OPTIMIZE_FOR_SPEED
))
7062 reduc_fn
= IFN_REDUC_MAX
;
7065 if (reduction_type
!= EXTRACT_LAST_REDUCTION
7066 && reduc_fn
== IFN_LAST
7067 && !nunits_out
.is_constant ())
7069 if (dump_enabled_p ())
7070 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
7071 "missing target support for reduction on"
7072 " variable-length vectors.\n");
7076 if ((double_reduc
|| reduction_type
!= TREE_CODE_REDUCTION
)
7079 if (dump_enabled_p ())
7080 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
7081 "multiple types in double reduction or condition "
7086 /* For SLP reductions, see if there is a neutral value we can use. */
7087 tree neutral_op
= NULL_TREE
;
7090 = neutral_op_for_slp_reduction (slp_node_instance
->reduc_phis
, code
,
7091 GROUP_FIRST_ELEMENT (stmt_info
) != NULL
);
7093 if (double_reduc
&& reduction_type
== FOLD_LEFT_REDUCTION
)
7095 /* We can't support in-order reductions of code such as this:
7097 for (int i = 0; i < n1; ++i)
7098 for (int j = 0; j < n2; ++j)
7101 since GCC effectively transforms the loop when vectorizing:
7103 for (int i = 0; i < n1 / VF; ++i)
7104 for (int j = 0; j < n2; ++j)
7105 for (int k = 0; k < VF; ++k)
7108 which is a reassociation of the original operation. */
7109 if (dump_enabled_p ())
7110 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
7111 "in-order double reduction not supported.\n");
7116 if (reduction_type
== FOLD_LEFT_REDUCTION
7118 && !GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt
)))
7120 /* We cannot use in-order reductions in this case because there is
7121 an implicit reassociation of the operations involved. */
7122 if (dump_enabled_p ())
7123 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
7124 "in-order unchained SLP reductions not supported.\n");
7128 /* For double reductions, and for SLP reductions with a neutral value,
7129 we construct a variable-length initial vector by loading a vector
7130 full of the neutral value and then shift-and-inserting the start
7131 values into the low-numbered elements. */
7132 if ((double_reduc
|| neutral_op
)
7133 && !nunits_out
.is_constant ()
7134 && !direct_internal_fn_supported_p (IFN_VEC_SHL_INSERT
,
7135 vectype_out
, OPTIMIZE_FOR_SPEED
))
7137 if (dump_enabled_p ())
7138 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
7139 "reduction on variable-length vectors requires"
7140 " target support for a vector-shift-and-insert"
7145 /* Check extra constraints for variable-length unchained SLP reductions. */
7146 if (STMT_SLP_TYPE (stmt_info
)
7147 && !GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt
))
7148 && !nunits_out
.is_constant ())
7150 /* We checked above that we could build the initial vector when
7151 there's a neutral element value. Check here for the case in
7152 which each SLP statement has its own initial value and in which
7153 that value needs to be repeated for every instance of the
7154 statement within the initial vector. */
7155 unsigned int group_size
= SLP_TREE_SCALAR_STMTS (slp_node
).length ();
7156 scalar_mode elt_mode
= SCALAR_TYPE_MODE (TREE_TYPE (vectype_out
));
7158 && !can_duplicate_and_interleave_p (group_size
, elt_mode
))
7160 if (dump_enabled_p ())
7161 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
7162 "unsupported form of SLP reduction for"
7163 " variable-length vectors: cannot build"
7164 " initial vector.\n");
7167 /* The epilogue code relies on the number of elements being a multiple
7168 of the group size. The duplicate-and-interleave approach to setting
7169 up the initial vector does too. */
7170 if (!multiple_p (nunits_out
, group_size
))
7172 if (dump_enabled_p ())
7173 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
7174 "unsupported form of SLP reduction for"
7175 " variable-length vectors: the vector size"
7176 " is not a multiple of the number of results.\n");
7181 /* In case of widening multiplication by a constant, we update the type
7182 of the constant to be the type of the other operand. We check that the
7183 constant fits the type in the pattern recognition pass. */
7184 if (code
== DOT_PROD_EXPR
7185 && !types_compatible_p (TREE_TYPE (ops
[0]), TREE_TYPE (ops
[1])))
7187 if (TREE_CODE (ops
[0]) == INTEGER_CST
)
7188 ops
[0] = fold_convert (TREE_TYPE (ops
[1]), ops
[0]);
7189 else if (TREE_CODE (ops
[1]) == INTEGER_CST
)
7190 ops
[1] = fold_convert (TREE_TYPE (ops
[0]), ops
[1]);
7193 if (dump_enabled_p ())
7194 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
7195 "invalid types in dot-prod\n");
7201 if (reduction_type
== COND_REDUCTION
)
7205 if (! max_loop_iterations (loop
, &ni
))
7207 if (dump_enabled_p ())
7208 dump_printf_loc (MSG_NOTE
, vect_location
,
7209 "loop count not known, cannot create cond "
7213 /* Convert backedges to iterations. */
7216 /* The additional index will be the same type as the condition. Check
7217 that the loop can fit into this less one (because we'll use up the
7218 zero slot for when there are no matches). */
7219 tree max_index
= TYPE_MAX_VALUE (cr_index_scalar_type
);
7220 if (wi::geu_p (ni
, wi::to_widest (max_index
)))
7222 if (dump_enabled_p ())
7223 dump_printf_loc (MSG_NOTE
, vect_location
,
7224 "loop size is greater than data size.\n");
7229 /* In case the vectorization factor (VF) is bigger than the number
7230 of elements that we can fit in a vectype (nunits), we have to generate
7231 more than one vector stmt - i.e - we need to "unroll" the
7232 vector stmt by a factor VF/nunits. For more details see documentation
7233 in vectorizable_operation. */
7235 /* If the reduction is used in an outer loop we need to generate
7236 VF intermediate results, like so (e.g. for ncopies=2):
7241 (i.e. we generate VF results in 2 registers).
7242 In this case we have a separate def-use cycle for each copy, and therefore
7243 for each copy we get the vector def for the reduction variable from the
7244 respective phi node created for this copy.
7246 Otherwise (the reduction is unused in the loop nest), we can combine
7247 together intermediate results, like so (e.g. for ncopies=2):
7251 (i.e. we generate VF/2 results in a single register).
7252 In this case for each copy we get the vector def for the reduction variable
7253 from the vectorized reduction operation generated in the previous iteration.
7255 This only works when we see both the reduction PHI and its only consumer
7256 in vectorizable_reduction and there are no intermediate stmts
7258 use_operand_p use_p
;
7261 && (STMT_VINFO_RELEVANT (stmt_info
) <= vect_used_only_live
)
7262 && single_imm_use (gimple_phi_result (reduc_def_stmt
), &use_p
, &use_stmt
)
7263 && (use_stmt
== stmt
7264 || STMT_VINFO_RELATED_STMT (vinfo_for_stmt (use_stmt
)) == stmt
))
7266 single_defuse_cycle
= true;
7270 epilog_copies
= ncopies
;
7272 /* If the reduction stmt is one of the patterns that have lane
7273 reduction embedded we cannot handle the case of ! single_defuse_cycle. */
7275 && ! single_defuse_cycle
)
7276 && (code
== DOT_PROD_EXPR
7277 || code
== WIDEN_SUM_EXPR
7278 || code
== SAD_EXPR
))
7280 if (dump_enabled_p ())
7281 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
7282 "multi def-use cycle not possible for lane-reducing "
7283 "reduction operation\n");
7288 vec_num
= SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node
);
7292 internal_fn cond_fn
= get_conditional_internal_fn (code
);
7293 vec_loop_masks
*masks
= &LOOP_VINFO_MASKS (loop_vinfo
);
7295 if (!vec_stmt
) /* transformation not required. */
7298 vect_model_reduction_cost (stmt_info
, reduc_fn
, ncopies
);
7299 if (loop_vinfo
&& LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo
))
7301 if (reduction_type
!= FOLD_LEFT_REDUCTION
7302 && (cond_fn
== IFN_LAST
7303 || !direct_internal_fn_supported_p (cond_fn
, vectype_in
,
7304 OPTIMIZE_FOR_SPEED
)))
7306 if (dump_enabled_p ())
7307 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
7308 "can't use a fully-masked loop because no"
7309 " conditional operation is available.\n");
7310 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo
) = false;
7312 else if (reduc_index
== -1)
7314 if (dump_enabled_p ())
7315 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
7316 "can't use a fully-masked loop for chained"
7318 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo
) = false;
7321 vect_record_loop_mask (loop_vinfo
, masks
, ncopies
* vec_num
,
7324 if (dump_enabled_p ()
7325 && reduction_type
== FOLD_LEFT_REDUCTION
)
7326 dump_printf_loc (MSG_NOTE
, vect_location
,
7327 "using an in-order (fold-left) reduction.\n");
7328 STMT_VINFO_TYPE (stmt_info
) = reduc_vec_info_type
;
7334 if (dump_enabled_p ())
7335 dump_printf_loc (MSG_NOTE
, vect_location
, "transform reduction.\n");
7337 /* FORNOW: Multiple types are not supported for condition. */
7338 if (code
== COND_EXPR
)
7339 gcc_assert (ncopies
== 1);
7341 bool masked_loop_p
= LOOP_VINFO_FULLY_MASKED_P (loop_vinfo
);
7343 if (reduction_type
== FOLD_LEFT_REDUCTION
)
7344 return vectorize_fold_left_reduction
7345 (stmt
, gsi
, vec_stmt
, slp_node
, reduc_def_stmt
, code
,
7346 reduc_fn
, ops
, vectype_in
, reduc_index
, masks
);
7348 if (reduction_type
== EXTRACT_LAST_REDUCTION
)
7350 gcc_assert (!slp_node
);
7351 return vectorizable_condition (stmt
, gsi
, vec_stmt
,
7352 NULL
, reduc_index
, NULL
);
7355 /* Create the destination vector */
7356 vec_dest
= vect_create_destination_var (scalar_dest
, vectype_out
);
7358 prev_stmt_info
= NULL
;
7359 prev_phi_info
= NULL
;
7362 vec_oprnds0
.create (1);
7363 vec_oprnds1
.create (1);
7364 if (op_type
== ternary_op
)
7365 vec_oprnds2
.create (1);
7368 phis
.create (vec_num
);
7369 vect_defs
.create (vec_num
);
7371 vect_defs
.quick_push (NULL_TREE
);
7374 phis
.splice (SLP_TREE_VEC_STMTS (slp_node_instance
->reduc_phis
));
7376 phis
.quick_push (STMT_VINFO_VEC_STMT (vinfo_for_stmt (reduc_def_stmt
)));
7378 for (j
= 0; j
< ncopies
; j
++)
7380 if (code
== COND_EXPR
)
7382 gcc_assert (!slp_node
);
7383 vectorizable_condition (stmt
, gsi
, vec_stmt
,
7384 PHI_RESULT (phis
[0]),
7386 /* Multiple types are not supported for condition. */
7395 /* Get vec defs for all the operands except the reduction index,
7396 ensuring the ordering of the ops in the vector is kept. */
7397 auto_vec
<tree
, 3> slp_ops
;
7398 auto_vec
<vec
<tree
>, 3> vec_defs
;
7400 slp_ops
.quick_push (ops
[0]);
7401 slp_ops
.quick_push (ops
[1]);
7402 if (op_type
== ternary_op
)
7403 slp_ops
.quick_push (ops
[2]);
7405 vect_get_slp_defs (slp_ops
, slp_node
, &vec_defs
);
7407 vec_oprnds0
.safe_splice (vec_defs
[0]);
7408 vec_defs
[0].release ();
7409 vec_oprnds1
.safe_splice (vec_defs
[1]);
7410 vec_defs
[1].release ();
7411 if (op_type
== ternary_op
)
7413 vec_oprnds2
.safe_splice (vec_defs
[2]);
7414 vec_defs
[2].release ();
7419 vec_oprnds0
.quick_push
7420 (vect_get_vec_def_for_operand (ops
[0], stmt
));
7421 vec_oprnds1
.quick_push
7422 (vect_get_vec_def_for_operand (ops
[1], stmt
));
7423 if (op_type
== ternary_op
)
7424 vec_oprnds2
.quick_push
7425 (vect_get_vec_def_for_operand (ops
[2], stmt
));
7432 gcc_assert (reduc_index
!= -1 || ! single_defuse_cycle
);
7434 if (single_defuse_cycle
&& reduc_index
== 0)
7435 vec_oprnds0
[0] = gimple_get_lhs (new_stmt
);
7438 = vect_get_vec_def_for_stmt_copy (dts
[0], vec_oprnds0
[0]);
7439 if (single_defuse_cycle
&& reduc_index
== 1)
7440 vec_oprnds1
[0] = gimple_get_lhs (new_stmt
);
7443 = vect_get_vec_def_for_stmt_copy (dts
[1], vec_oprnds1
[0]);
7444 if (op_type
== ternary_op
)
7446 if (single_defuse_cycle
&& reduc_index
== 2)
7447 vec_oprnds2
[0] = gimple_get_lhs (new_stmt
);
7450 = vect_get_vec_def_for_stmt_copy (dts
[2], vec_oprnds2
[0]);
7455 FOR_EACH_VEC_ELT (vec_oprnds0
, i
, def0
)
7457 tree vop
[3] = { def0
, vec_oprnds1
[i
], NULL_TREE
};
7460 /* Make sure that the reduction accumulator is vop[0]. */
7461 if (reduc_index
== 1)
7463 gcc_assert (commutative_tree_code (code
));
7464 std::swap (vop
[0], vop
[1]);
7466 tree mask
= vect_get_loop_mask (gsi
, masks
, vec_num
* ncopies
,
7467 vectype_in
, i
* ncopies
+ j
);
7468 gcall
*call
= gimple_build_call_internal (cond_fn
, 3, mask
,
7470 new_temp
= make_ssa_name (vec_dest
, call
);
7471 gimple_call_set_lhs (call
, new_temp
);
7472 gimple_call_set_nothrow (call
, true);
7477 if (op_type
== ternary_op
)
7478 vop
[2] = vec_oprnds2
[i
];
7480 new_temp
= make_ssa_name (vec_dest
, new_stmt
);
7481 new_stmt
= gimple_build_assign (new_temp
, code
,
7482 vop
[0], vop
[1], vop
[2]);
7484 vect_finish_stmt_generation (stmt
, new_stmt
, gsi
);
7488 SLP_TREE_VEC_STMTS (slp_node
).quick_push (new_stmt
);
7489 vect_defs
.quick_push (new_temp
);
7492 vect_defs
[0] = new_temp
;
7499 STMT_VINFO_VEC_STMT (stmt_info
) = *vec_stmt
= new_stmt
;
7501 STMT_VINFO_RELATED_STMT (prev_stmt_info
) = new_stmt
;
7503 prev_stmt_info
= vinfo_for_stmt (new_stmt
);
7506 /* Finalize the reduction-phi (set its arguments) and create the
7507 epilog reduction code. */
7508 if ((!single_defuse_cycle
|| code
== COND_EXPR
) && !slp_node
)
7509 vect_defs
[0] = gimple_get_lhs (*vec_stmt
);
7511 vect_create_epilog_for_reduction (vect_defs
, stmt
, reduc_def_stmt
,
7512 epilog_copies
, reduc_fn
, phis
,
7513 double_reduc
, slp_node
, slp_node_instance
,
7514 cond_reduc_val
, cond_reduc_op_code
,
7520 /* Function vect_min_worthwhile_factor.
7522 For a loop where we could vectorize the operation indicated by CODE,
7523 return the minimum vectorization factor that makes it worthwhile
7524 to use generic vectors. */
7526 vect_min_worthwhile_factor (enum tree_code code
)
7546 /* Return true if VINFO indicates we are doing loop vectorization and if
7547 it is worth decomposing CODE operations into scalar operations for
7548 that loop's vectorization factor. */
7551 vect_worthwhile_without_simd_p (vec_info
*vinfo
, tree_code code
)
7553 loop_vec_info loop_vinfo
= dyn_cast
<loop_vec_info
> (vinfo
);
7554 unsigned HOST_WIDE_INT value
;
7556 && LOOP_VINFO_VECT_FACTOR (loop_vinfo
).is_constant (&value
)
7557 && value
>= vect_min_worthwhile_factor (code
));
7560 /* Function vectorizable_induction
7562 Check if PHI performs an induction computation that can be vectorized.
7563 If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
7564 phi to replace it, put it in VEC_STMT, and add it to the same basic block.
7565 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
7568 vectorizable_induction (gimple
*phi
,
7569 gimple_stmt_iterator
*gsi ATTRIBUTE_UNUSED
,
7570 gimple
**vec_stmt
, slp_tree slp_node
)
7572 stmt_vec_info stmt_info
= vinfo_for_stmt (phi
);
7573 loop_vec_info loop_vinfo
= STMT_VINFO_LOOP_VINFO (stmt_info
);
7574 struct loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
7576 bool nested_in_vect_loop
= false;
7577 struct loop
*iv_loop
;
7579 edge pe
= loop_preheader_edge (loop
);
7581 tree new_vec
, vec_init
, vec_step
, t
;
7584 gphi
*induction_phi
;
7585 tree induc_def
, vec_dest
;
7586 tree init_expr
, step_expr
;
7587 poly_uint64 vf
= LOOP_VINFO_VECT_FACTOR (loop_vinfo
);
7591 imm_use_iterator imm_iter
;
7592 use_operand_p use_p
;
7596 gimple_stmt_iterator si
;
7597 basic_block bb
= gimple_bb (phi
);
7599 if (gimple_code (phi
) != GIMPLE_PHI
)
7602 if (!STMT_VINFO_RELEVANT_P (stmt_info
))
7605 /* Make sure it was recognized as induction computation. */
7606 if (STMT_VINFO_DEF_TYPE (stmt_info
) != vect_induction_def
)
7609 tree vectype
= STMT_VINFO_VECTYPE (stmt_info
);
7610 poly_uint64 nunits
= TYPE_VECTOR_SUBPARTS (vectype
);
7615 ncopies
= vect_get_num_copies (loop_vinfo
, vectype
);
7616 gcc_assert (ncopies
>= 1);
7618 /* FORNOW. These restrictions should be relaxed. */
7619 if (nested_in_vect_loop_p (loop
, phi
))
7621 imm_use_iterator imm_iter
;
7622 use_operand_p use_p
;
7629 if (dump_enabled_p ())
7630 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
7631 "multiple types in nested loop.\n");
7635 /* FORNOW: outer loop induction with SLP not supported. */
7636 if (STMT_SLP_TYPE (stmt_info
))
7640 latch_e
= loop_latch_edge (loop
->inner
);
7641 loop_arg
= PHI_ARG_DEF_FROM_EDGE (phi
, latch_e
);
7642 FOR_EACH_IMM_USE_FAST (use_p
, imm_iter
, loop_arg
)
7644 gimple
*use_stmt
= USE_STMT (use_p
);
7645 if (is_gimple_debug (use_stmt
))
7648 if (!flow_bb_inside_loop_p (loop
->inner
, gimple_bb (use_stmt
)))
7650 exit_phi
= use_stmt
;
7656 stmt_vec_info exit_phi_vinfo
= vinfo_for_stmt (exit_phi
);
7657 if (!(STMT_VINFO_RELEVANT_P (exit_phi_vinfo
)
7658 && !STMT_VINFO_LIVE_P (exit_phi_vinfo
)))
7660 if (dump_enabled_p ())
7661 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
7662 "inner-loop induction only used outside "
7663 "of the outer vectorized loop.\n");
7668 nested_in_vect_loop
= true;
7669 iv_loop
= loop
->inner
;
7673 gcc_assert (iv_loop
== (gimple_bb (phi
))->loop_father
);
7675 if (slp_node
&& !nunits
.is_constant ())
7677 /* The current SLP code creates the initial value element-by-element. */
7678 if (dump_enabled_p ())
7679 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
7680 "SLP induction not supported for variable-length"
7685 if (!vec_stmt
) /* transformation not required. */
7687 STMT_VINFO_TYPE (stmt_info
) = induc_vec_info_type
;
7688 if (dump_enabled_p ())
7689 dump_printf_loc (MSG_NOTE
, vect_location
,
7690 "=== vectorizable_induction ===\n");
7691 vect_model_induction_cost (stmt_info
, ncopies
);
7697 /* Compute a vector variable, initialized with the first VF values of
7698 the induction variable. E.g., for an iv with IV_PHI='X' and
7699 evolution S, for a vector of 4 units, we want to compute:
7700 [X, X + S, X + 2*S, X + 3*S]. */
7702 if (dump_enabled_p ())
7703 dump_printf_loc (MSG_NOTE
, vect_location
, "transform induction phi.\n");
7705 latch_e
= loop_latch_edge (iv_loop
);
7706 loop_arg
= PHI_ARG_DEF_FROM_EDGE (phi
, latch_e
);
7708 step_expr
= STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info
);
7709 gcc_assert (step_expr
!= NULL_TREE
);
7711 pe
= loop_preheader_edge (iv_loop
);
7712 init_expr
= PHI_ARG_DEF_FROM_EDGE (phi
,
7713 loop_preheader_edge (iv_loop
));
7716 if (!nested_in_vect_loop
)
7718 /* Convert the initial value to the desired type. */
7719 tree new_type
= TREE_TYPE (vectype
);
7720 init_expr
= gimple_convert (&stmts
, new_type
, init_expr
);
7722 /* If we are using the loop mask to "peel" for alignment then we need
7723 to adjust the start value here. */
7724 tree skip_niters
= LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo
);
7725 if (skip_niters
!= NULL_TREE
)
7727 if (FLOAT_TYPE_P (vectype
))
7728 skip_niters
= gimple_build (&stmts
, FLOAT_EXPR
, new_type
,
7731 skip_niters
= gimple_convert (&stmts
, new_type
, skip_niters
);
7732 tree skip_step
= gimple_build (&stmts
, MULT_EXPR
, new_type
,
7733 skip_niters
, step_expr
);
7734 init_expr
= gimple_build (&stmts
, MINUS_EXPR
, new_type
,
7735 init_expr
, skip_step
);
7739 /* Convert the step to the desired type. */
7740 step_expr
= gimple_convert (&stmts
, TREE_TYPE (vectype
), step_expr
);
7744 new_bb
= gsi_insert_seq_on_edge_immediate (pe
, stmts
);
7745 gcc_assert (!new_bb
);
7748 /* Find the first insertion point in the BB. */
7749 si
= gsi_after_labels (bb
);
7751 /* For SLP induction we have to generate several IVs as for example
7752 with group size 3 we need [i, i, i, i + S] [i + S, i + S, i + 2*S, i + 2*S]
7753 [i + 2*S, i + 3*S, i + 3*S, i + 3*S]. The step is the same uniform
7754 [VF*S, VF*S, VF*S, VF*S] for all. */
7757 /* Enforced above. */
7758 unsigned int const_nunits
= nunits
.to_constant ();
7760 /* Generate [VF*S, VF*S, ... ]. */
7761 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr
)))
7763 expr
= build_int_cst (integer_type_node
, vf
);
7764 expr
= fold_convert (TREE_TYPE (step_expr
), expr
);
7767 expr
= build_int_cst (TREE_TYPE (step_expr
), vf
);
7768 new_name
= fold_build2 (MULT_EXPR
, TREE_TYPE (step_expr
),
7770 if (! CONSTANT_CLASS_P (new_name
))
7771 new_name
= vect_init_vector (phi
, new_name
,
7772 TREE_TYPE (step_expr
), NULL
);
7773 new_vec
= build_vector_from_val (vectype
, new_name
);
7774 vec_step
= vect_init_vector (phi
, new_vec
, vectype
, NULL
);
7776 /* Now generate the IVs. */
7777 unsigned group_size
= SLP_TREE_SCALAR_STMTS (slp_node
).length ();
7778 unsigned nvects
= SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node
);
7779 unsigned elts
= const_nunits
* nvects
;
7780 unsigned nivs
= least_common_multiple (group_size
,
7781 const_nunits
) / const_nunits
;
7782 gcc_assert (elts
% group_size
== 0);
7783 tree elt
= init_expr
;
7785 for (ivn
= 0; ivn
< nivs
; ++ivn
)
7787 tree_vector_builder
elts (vectype
, const_nunits
, 1);
7789 for (unsigned eltn
= 0; eltn
< const_nunits
; ++eltn
)
7791 if (ivn
*const_nunits
+ eltn
>= group_size
7792 && (ivn
* const_nunits
+ eltn
) % group_size
== 0)
7793 elt
= gimple_build (&stmts
, PLUS_EXPR
, TREE_TYPE (elt
),
7795 elts
.quick_push (elt
);
7797 vec_init
= gimple_build_vector (&stmts
, &elts
);
7800 new_bb
= gsi_insert_seq_on_edge_immediate (pe
, stmts
);
7801 gcc_assert (!new_bb
);
7804 /* Create the induction-phi that defines the induction-operand. */
7805 vec_dest
= vect_get_new_vect_var (vectype
, vect_simple_var
, "vec_iv_");
7806 induction_phi
= create_phi_node (vec_dest
, iv_loop
->header
);
7807 set_vinfo_for_stmt (induction_phi
,
7808 new_stmt_vec_info (induction_phi
, loop_vinfo
));
7809 induc_def
= PHI_RESULT (induction_phi
);
7811 /* Create the iv update inside the loop */
7812 vec_def
= make_ssa_name (vec_dest
);
7813 new_stmt
= gimple_build_assign (vec_def
, PLUS_EXPR
, induc_def
, vec_step
);
7814 gsi_insert_before (&si
, new_stmt
, GSI_SAME_STMT
);
7815 set_vinfo_for_stmt (new_stmt
, new_stmt_vec_info (new_stmt
, loop_vinfo
));
7817 /* Set the arguments of the phi node: */
7818 add_phi_arg (induction_phi
, vec_init
, pe
, UNKNOWN_LOCATION
);
7819 add_phi_arg (induction_phi
, vec_def
, loop_latch_edge (iv_loop
),
7822 SLP_TREE_VEC_STMTS (slp_node
).quick_push (induction_phi
);
7825 /* Re-use IVs when we can. */
7829 = least_common_multiple (group_size
, const_nunits
) / group_size
;
7830 /* Generate [VF'*S, VF'*S, ... ]. */
7831 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr
)))
7833 expr
= build_int_cst (integer_type_node
, vfp
);
7834 expr
= fold_convert (TREE_TYPE (step_expr
), expr
);
7837 expr
= build_int_cst (TREE_TYPE (step_expr
), vfp
);
7838 new_name
= fold_build2 (MULT_EXPR
, TREE_TYPE (step_expr
),
7840 if (! CONSTANT_CLASS_P (new_name
))
7841 new_name
= vect_init_vector (phi
, new_name
,
7842 TREE_TYPE (step_expr
), NULL
);
7843 new_vec
= build_vector_from_val (vectype
, new_name
);
7844 vec_step
= vect_init_vector (phi
, new_vec
, vectype
, NULL
);
7845 for (; ivn
< nvects
; ++ivn
)
7847 gimple
*iv
= SLP_TREE_VEC_STMTS (slp_node
)[ivn
- nivs
];
7849 if (gimple_code (iv
) == GIMPLE_PHI
)
7850 def
= gimple_phi_result (iv
);
7852 def
= gimple_assign_lhs (iv
);
7853 new_stmt
= gimple_build_assign (make_ssa_name (vectype
),
7856 if (gimple_code (iv
) == GIMPLE_PHI
)
7857 gsi_insert_before (&si
, new_stmt
, GSI_SAME_STMT
);
7860 gimple_stmt_iterator tgsi
= gsi_for_stmt (iv
);
7861 gsi_insert_after (&tgsi
, new_stmt
, GSI_CONTINUE_LINKING
);
7863 set_vinfo_for_stmt (new_stmt
,
7864 new_stmt_vec_info (new_stmt
, loop_vinfo
));
7865 SLP_TREE_VEC_STMTS (slp_node
).quick_push (new_stmt
);
7872 /* Create the vector that holds the initial_value of the induction. */
7873 if (nested_in_vect_loop
)
7875 /* iv_loop is nested in the loop to be vectorized. init_expr had already
7876 been created during vectorization of previous stmts. We obtain it
7877 from the STMT_VINFO_VEC_STMT of the defining stmt. */
7878 vec_init
= vect_get_vec_def_for_operand (init_expr
, phi
);
7879 /* If the initial value is not of proper type, convert it. */
7880 if (!useless_type_conversion_p (vectype
, TREE_TYPE (vec_init
)))
7883 = gimple_build_assign (vect_get_new_ssa_name (vectype
,
7887 build1 (VIEW_CONVERT_EXPR
, vectype
,
7889 vec_init
= gimple_assign_lhs (new_stmt
);
7890 new_bb
= gsi_insert_on_edge_immediate (loop_preheader_edge (iv_loop
),
7892 gcc_assert (!new_bb
);
7893 set_vinfo_for_stmt (new_stmt
,
7894 new_stmt_vec_info (new_stmt
, loop_vinfo
));
7899 /* iv_loop is the loop to be vectorized. Create:
7900 vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr) */
7902 new_name
= gimple_convert (&stmts
, TREE_TYPE (vectype
), init_expr
);
7904 unsigned HOST_WIDE_INT const_nunits
;
7905 if (nunits
.is_constant (&const_nunits
))
7907 tree_vector_builder
elts (vectype
, const_nunits
, 1);
7908 elts
.quick_push (new_name
);
7909 for (i
= 1; i
< const_nunits
; i
++)
7911 /* Create: new_name_i = new_name + step_expr */
7912 new_name
= gimple_build (&stmts
, PLUS_EXPR
, TREE_TYPE (new_name
),
7913 new_name
, step_expr
);
7914 elts
.quick_push (new_name
);
7916 /* Create a vector from [new_name_0, new_name_1, ...,
7917 new_name_nunits-1] */
7918 vec_init
= gimple_build_vector (&stmts
, &elts
);
7920 else if (INTEGRAL_TYPE_P (TREE_TYPE (step_expr
)))
7921 /* Build the initial value directly from a VEC_SERIES_EXPR. */
7922 vec_init
= gimple_build (&stmts
, VEC_SERIES_EXPR
, vectype
,
7923 new_name
, step_expr
);
7927 [base, base, base, ...]
7928 + (vectype) [0, 1, 2, ...] * [step, step, step, ...]. */
7929 gcc_assert (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr
)));
7930 gcc_assert (flag_associative_math
);
7931 tree index
= build_index_vector (vectype
, 0, 1);
7932 tree base_vec
= gimple_build_vector_from_val (&stmts
, vectype
,
7934 tree step_vec
= gimple_build_vector_from_val (&stmts
, vectype
,
7936 vec_init
= gimple_build (&stmts
, FLOAT_EXPR
, vectype
, index
);
7937 vec_init
= gimple_build (&stmts
, MULT_EXPR
, vectype
,
7938 vec_init
, step_vec
);
7939 vec_init
= gimple_build (&stmts
, PLUS_EXPR
, vectype
,
7940 vec_init
, base_vec
);
7945 new_bb
= gsi_insert_seq_on_edge_immediate (pe
, stmts
);
7946 gcc_assert (!new_bb
);
7951 /* Create the vector that holds the step of the induction. */
7952 if (nested_in_vect_loop
)
7953 /* iv_loop is nested in the loop to be vectorized. Generate:
7954 vec_step = [S, S, S, S] */
7955 new_name
= step_expr
;
7958 /* iv_loop is the loop to be vectorized. Generate:
7959 vec_step = [VF*S, VF*S, VF*S, VF*S] */
7960 gimple_seq seq
= NULL
;
7961 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr
)))
7963 expr
= build_int_cst (integer_type_node
, vf
);
7964 expr
= gimple_build (&seq
, FLOAT_EXPR
, TREE_TYPE (step_expr
), expr
);
7967 expr
= build_int_cst (TREE_TYPE (step_expr
), vf
);
7968 new_name
= gimple_build (&seq
, MULT_EXPR
, TREE_TYPE (step_expr
),
7972 new_bb
= gsi_insert_seq_on_edge_immediate (pe
, seq
);
7973 gcc_assert (!new_bb
);
7977 t
= unshare_expr (new_name
);
7978 gcc_assert (CONSTANT_CLASS_P (new_name
)
7979 || TREE_CODE (new_name
) == SSA_NAME
);
7980 new_vec
= build_vector_from_val (vectype
, t
);
7981 vec_step
= vect_init_vector (phi
, new_vec
, vectype
, NULL
);
7984 /* Create the following def-use cycle:
7989 vec_iv = PHI <vec_init, vec_loop>
7993 vec_loop = vec_iv + vec_step; */
7995 /* Create the induction-phi that defines the induction-operand. */
7996 vec_dest
= vect_get_new_vect_var (vectype
, vect_simple_var
, "vec_iv_");
7997 induction_phi
= create_phi_node (vec_dest
, iv_loop
->header
);
7998 set_vinfo_for_stmt (induction_phi
,
7999 new_stmt_vec_info (induction_phi
, loop_vinfo
));
8000 induc_def
= PHI_RESULT (induction_phi
);
8002 /* Create the iv update inside the loop */
8003 vec_def
= make_ssa_name (vec_dest
);
8004 new_stmt
= gimple_build_assign (vec_def
, PLUS_EXPR
, induc_def
, vec_step
);
8005 gsi_insert_before (&si
, new_stmt
, GSI_SAME_STMT
);
8006 set_vinfo_for_stmt (new_stmt
, new_stmt_vec_info (new_stmt
, loop_vinfo
));
8008 /* Set the arguments of the phi node: */
8009 add_phi_arg (induction_phi
, vec_init
, pe
, UNKNOWN_LOCATION
);
8010 add_phi_arg (induction_phi
, vec_def
, loop_latch_edge (iv_loop
),
8013 STMT_VINFO_VEC_STMT (stmt_info
) = *vec_stmt
= induction_phi
;
8015 /* In case that vectorization factor (VF) is bigger than the number
8016 of elements that we can fit in a vectype (nunits), we have to generate
8017 more than one vector stmt - i.e - we need to "unroll" the
8018 vector stmt by a factor VF/nunits. For more details see documentation
8019 in vectorizable_operation. */
8023 gimple_seq seq
= NULL
;
8024 stmt_vec_info prev_stmt_vinfo
;
8025 /* FORNOW. This restriction should be relaxed. */
8026 gcc_assert (!nested_in_vect_loop
);
8028 /* Create the vector that holds the step of the induction. */
8029 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr
)))
8031 expr
= build_int_cst (integer_type_node
, nunits
);
8032 expr
= gimple_build (&seq
, FLOAT_EXPR
, TREE_TYPE (step_expr
), expr
);
8035 expr
= build_int_cst (TREE_TYPE (step_expr
), nunits
);
8036 new_name
= gimple_build (&seq
, MULT_EXPR
, TREE_TYPE (step_expr
),
8040 new_bb
= gsi_insert_seq_on_edge_immediate (pe
, seq
);
8041 gcc_assert (!new_bb
);
8044 t
= unshare_expr (new_name
);
8045 gcc_assert (CONSTANT_CLASS_P (new_name
)
8046 || TREE_CODE (new_name
) == SSA_NAME
);
8047 new_vec
= build_vector_from_val (vectype
, t
);
8048 vec_step
= vect_init_vector (phi
, new_vec
, vectype
, NULL
);
8050 vec_def
= induc_def
;
8051 prev_stmt_vinfo
= vinfo_for_stmt (induction_phi
);
8052 for (i
= 1; i
< ncopies
; i
++)
8054 /* vec_i = vec_prev + vec_step */
8055 new_stmt
= gimple_build_assign (vec_dest
, PLUS_EXPR
,
8057 vec_def
= make_ssa_name (vec_dest
, new_stmt
);
8058 gimple_assign_set_lhs (new_stmt
, vec_def
);
8060 gsi_insert_before (&si
, new_stmt
, GSI_SAME_STMT
);
8061 set_vinfo_for_stmt (new_stmt
,
8062 new_stmt_vec_info (new_stmt
, loop_vinfo
));
8063 STMT_VINFO_RELATED_STMT (prev_stmt_vinfo
) = new_stmt
;
8064 prev_stmt_vinfo
= vinfo_for_stmt (new_stmt
);
8068 if (nested_in_vect_loop
)
8070 /* Find the loop-closed exit-phi of the induction, and record
8071 the final vector of induction results: */
8073 FOR_EACH_IMM_USE_FAST (use_p
, imm_iter
, loop_arg
)
8075 gimple
*use_stmt
= USE_STMT (use_p
);
8076 if (is_gimple_debug (use_stmt
))
8079 if (!flow_bb_inside_loop_p (iv_loop
, gimple_bb (use_stmt
)))
8081 exit_phi
= use_stmt
;
8087 stmt_vec_info stmt_vinfo
= vinfo_for_stmt (exit_phi
);
8088 /* FORNOW. Currently not supporting the case that an inner-loop induction
8089 is not used in the outer-loop (i.e. only outside the outer-loop). */
8090 gcc_assert (STMT_VINFO_RELEVANT_P (stmt_vinfo
)
8091 && !STMT_VINFO_LIVE_P (stmt_vinfo
));
8093 STMT_VINFO_VEC_STMT (stmt_vinfo
) = new_stmt
;
8094 if (dump_enabled_p ())
8096 dump_printf_loc (MSG_NOTE
, vect_location
,
8097 "vector of inductions after inner-loop:");
8098 dump_gimple_stmt (MSG_NOTE
, TDF_SLIM
, new_stmt
, 0);
8104 if (dump_enabled_p ())
8106 dump_printf_loc (MSG_NOTE
, vect_location
,
8107 "transform induction: created def-use cycle: ");
8108 dump_gimple_stmt (MSG_NOTE
, TDF_SLIM
, induction_phi
, 0);
8109 dump_gimple_stmt (MSG_NOTE
, TDF_SLIM
,
8110 SSA_NAME_DEF_STMT (vec_def
), 0);
8116 /* Function vectorizable_live_operation.
8118 STMT computes a value that is used outside the loop. Check if
8119 it can be supported. */
8122 vectorizable_live_operation (gimple
*stmt
,
8123 gimple_stmt_iterator
*gsi ATTRIBUTE_UNUSED
,
8124 slp_tree slp_node
, int slp_index
,
8127 stmt_vec_info stmt_info
= vinfo_for_stmt (stmt
);
8128 loop_vec_info loop_vinfo
= STMT_VINFO_LOOP_VINFO (stmt_info
);
8129 struct loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
8130 imm_use_iterator imm_iter
;
8131 tree lhs
, lhs_type
, bitsize
, vec_bitsize
;
8132 tree vectype
= STMT_VINFO_VECTYPE (stmt_info
);
8133 poly_uint64 nunits
= TYPE_VECTOR_SUBPARTS (vectype
);
8136 auto_vec
<tree
> vec_oprnds
;
8138 poly_uint64 vec_index
= 0;
8140 gcc_assert (STMT_VINFO_LIVE_P (stmt_info
));
8142 if (STMT_VINFO_DEF_TYPE (stmt_info
) == vect_reduction_def
)
8145 /* FORNOW. CHECKME. */
8146 if (nested_in_vect_loop_p (loop
, stmt
))
8149 /* If STMT is not relevant and it is a simple assignment and its inputs are
8150 invariant then it can remain in place, unvectorized. The original last
8151 scalar value that it computes will be used. */
8152 if (!STMT_VINFO_RELEVANT_P (stmt_info
))
8154 gcc_assert (is_simple_and_all_uses_invariant (stmt
, loop_vinfo
));
8155 if (dump_enabled_p ())
8156 dump_printf_loc (MSG_NOTE
, vect_location
,
8157 "statement is simple and uses invariant. Leaving in "
8165 ncopies
= vect_get_num_copies (loop_vinfo
, vectype
);
8169 gcc_assert (slp_index
>= 0);
8171 int num_scalar
= SLP_TREE_SCALAR_STMTS (slp_node
).length ();
8172 int num_vec
= SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node
);
8174 /* Get the last occurrence of the scalar index from the concatenation of
8175 all the slp vectors. Calculate which slp vector it is and the index
8177 poly_uint64 pos
= (num_vec
* nunits
) - num_scalar
+ slp_index
;
8179 /* Calculate which vector contains the result, and which lane of
8180 that vector we need. */
8181 if (!can_div_trunc_p (pos
, nunits
, &vec_entry
, &vec_index
))
8183 if (dump_enabled_p ())
8184 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
8185 "Cannot determine which vector holds the"
8186 " final result.\n");
8193 /* No transformation required. */
8194 if (LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo
))
8196 if (!direct_internal_fn_supported_p (IFN_EXTRACT_LAST
, vectype
,
8197 OPTIMIZE_FOR_SPEED
))
8199 if (dump_enabled_p ())
8200 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
8201 "can't use a fully-masked loop because "
8202 "the target doesn't support extract last "
8204 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo
) = false;
8208 if (dump_enabled_p ())
8209 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
8210 "can't use a fully-masked loop because an "
8211 "SLP statement is live after the loop.\n");
8212 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo
) = false;
8214 else if (ncopies
> 1)
8216 if (dump_enabled_p ())
8217 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
8218 "can't use a fully-masked loop because"
8219 " ncopies is greater than 1.\n");
8220 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo
) = false;
8224 gcc_assert (ncopies
== 1 && !slp_node
);
8225 vect_record_loop_mask (loop_vinfo
,
8226 &LOOP_VINFO_MASKS (loop_vinfo
),
8233 /* If stmt has a related stmt, then use that for getting the lhs. */
8234 if (is_pattern_stmt_p (stmt_info
))
8235 stmt
= STMT_VINFO_RELATED_STMT (stmt_info
);
8237 lhs
= (is_a
<gphi
*> (stmt
)) ? gimple_phi_result (stmt
)
8238 : gimple_get_lhs (stmt
);
8239 lhs_type
= TREE_TYPE (lhs
);
8241 bitsize
= (VECTOR_BOOLEAN_TYPE_P (vectype
)
8242 ? bitsize_int (TYPE_PRECISION (TREE_TYPE (vectype
)))
8243 : TYPE_SIZE (TREE_TYPE (vectype
)));
8244 vec_bitsize
= TYPE_SIZE (vectype
);
8246 /* Get the vectorized lhs of STMT and the lane to use (counted in bits). */
8247 tree vec_lhs
, bitstart
;
8250 gcc_assert (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo
));
8252 /* Get the correct slp vectorized stmt. */
8253 gimple
*vec_stmt
= SLP_TREE_VEC_STMTS (slp_node
)[vec_entry
];
8254 if (gphi
*phi
= dyn_cast
<gphi
*> (vec_stmt
))
8255 vec_lhs
= gimple_phi_result (phi
);
8257 vec_lhs
= gimple_get_lhs (vec_stmt
);
8259 /* Get entry to use. */
8260 bitstart
= bitsize_int (vec_index
);
8261 bitstart
= int_const_binop (MULT_EXPR
, bitsize
, bitstart
);
8265 enum vect_def_type dt
= STMT_VINFO_DEF_TYPE (stmt_info
);
8266 vec_lhs
= vect_get_vec_def_for_operand_1 (stmt
, dt
);
8267 gcc_checking_assert (ncopies
== 1
8268 || !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo
));
8270 /* For multiple copies, get the last copy. */
8271 for (int i
= 1; i
< ncopies
; ++i
)
8272 vec_lhs
= vect_get_vec_def_for_stmt_copy (vect_unknown_def_type
,
8275 /* Get the last lane in the vector. */
8276 bitstart
= int_const_binop (MINUS_EXPR
, vec_bitsize
, bitsize
);
8279 gimple_seq stmts
= NULL
;
8281 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo
))
8285 SCALAR_RES = EXTRACT_LAST <VEC_LHS, MASK>
8287 where VEC_LHS is the vectorized live-out result and MASK is
8288 the loop mask for the final iteration. */
8289 gcc_assert (ncopies
== 1 && !slp_node
);
8290 tree scalar_type
= TREE_TYPE (STMT_VINFO_VECTYPE (stmt_info
));
8291 tree scalar_res
= make_ssa_name (scalar_type
);
8292 tree mask
= vect_get_loop_mask (gsi
, &LOOP_VINFO_MASKS (loop_vinfo
),
8294 gcall
*new_stmt
= gimple_build_call_internal (IFN_EXTRACT_LAST
,
8296 gimple_call_set_lhs (new_stmt
, scalar_res
);
8297 gimple_seq_add_stmt (&stmts
, new_stmt
);
8299 /* Convert the extracted vector element to the required scalar type. */
8300 new_tree
= gimple_convert (&stmts
, lhs_type
, scalar_res
);
8304 tree bftype
= TREE_TYPE (vectype
);
8305 if (VECTOR_BOOLEAN_TYPE_P (vectype
))
8306 bftype
= build_nonstandard_integer_type (tree_to_uhwi (bitsize
), 1);
8307 new_tree
= build3 (BIT_FIELD_REF
, bftype
, vec_lhs
, bitsize
, bitstart
);
8308 new_tree
= force_gimple_operand (fold_convert (lhs_type
, new_tree
),
8309 &stmts
, true, NULL_TREE
);
8313 gsi_insert_seq_on_edge_immediate (single_exit (loop
), stmts
);
8315 /* Replace use of lhs with newly computed result. If the use stmt is a
8316 single arg PHI, just replace all uses of PHI result. It's necessary
8317 because lcssa PHI defining lhs may be before newly inserted stmt. */
8318 use_operand_p use_p
;
8319 FOR_EACH_IMM_USE_STMT (use_stmt
, imm_iter
, lhs
)
8320 if (!flow_bb_inside_loop_p (loop
, gimple_bb (use_stmt
))
8321 && !is_gimple_debug (use_stmt
))
8323 if (gimple_code (use_stmt
) == GIMPLE_PHI
8324 && gimple_phi_num_args (use_stmt
) == 1)
8326 replace_uses_by (gimple_phi_result (use_stmt
), new_tree
);
8330 FOR_EACH_IMM_USE_ON_STMT (use_p
, imm_iter
)
8331 SET_USE (use_p
, new_tree
);
8333 update_stmt (use_stmt
);
8339 /* Kill any debug uses outside LOOP of SSA names defined in STMT. */
8342 vect_loop_kill_debug_uses (struct loop
*loop
, gimple
*stmt
)
8344 ssa_op_iter op_iter
;
8345 imm_use_iterator imm_iter
;
8346 def_operand_p def_p
;
8349 FOR_EACH_PHI_OR_STMT_DEF (def_p
, stmt
, op_iter
, SSA_OP_DEF
)
8351 FOR_EACH_IMM_USE_STMT (ustmt
, imm_iter
, DEF_FROM_PTR (def_p
))
8355 if (!is_gimple_debug (ustmt
))
8358 bb
= gimple_bb (ustmt
);
8360 if (!flow_bb_inside_loop_p (loop
, bb
))
8362 if (gimple_debug_bind_p (ustmt
))
8364 if (dump_enabled_p ())
8365 dump_printf_loc (MSG_NOTE
, vect_location
,
8366 "killing debug use\n");
8368 gimple_debug_bind_reset_value (ustmt
);
8369 update_stmt (ustmt
);
8378 /* Given loop represented by LOOP_VINFO, return true if computation of
8379 LOOP_VINFO_NITERS (= LOOP_VINFO_NITERSM1 + 1) doesn't overflow, false
8383 loop_niters_no_overflow (loop_vec_info loop_vinfo
)
8385 /* Constant case. */
8386 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo
))
8388 tree cst_niters
= LOOP_VINFO_NITERS (loop_vinfo
);
8389 tree cst_nitersm1
= LOOP_VINFO_NITERSM1 (loop_vinfo
);
8391 gcc_assert (TREE_CODE (cst_niters
) == INTEGER_CST
);
8392 gcc_assert (TREE_CODE (cst_nitersm1
) == INTEGER_CST
);
8393 if (wi::to_widest (cst_nitersm1
) < wi::to_widest (cst_niters
))
8398 struct loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
8399 /* Check the upper bound of loop niters. */
8400 if (get_max_loop_iterations (loop
, &max
))
8402 tree type
= TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo
));
8403 signop sgn
= TYPE_SIGN (type
);
8404 widest_int type_max
= widest_int::from (wi::max_value (type
), sgn
);
8411 /* Return a mask type with half the number of elements as TYPE. */
8414 vect_halve_mask_nunits (tree type
)
8416 poly_uint64 nunits
= exact_div (TYPE_VECTOR_SUBPARTS (type
), 2);
8417 return build_truth_vector_type (nunits
, current_vector_size
);
8420 /* Return a mask type with twice as many elements as TYPE. */
8423 vect_double_mask_nunits (tree type
)
8425 poly_uint64 nunits
= TYPE_VECTOR_SUBPARTS (type
) * 2;
8426 return build_truth_vector_type (nunits
, current_vector_size
);
8429 /* Record that a fully-masked version of LOOP_VINFO would need MASKS to
8430 contain a sequence of NVECTORS masks that each control a vector of type
8434 vect_record_loop_mask (loop_vec_info loop_vinfo
, vec_loop_masks
*masks
,
8435 unsigned int nvectors
, tree vectype
)
8437 gcc_assert (nvectors
!= 0);
8438 if (masks
->length () < nvectors
)
8439 masks
->safe_grow_cleared (nvectors
);
8440 rgroup_masks
*rgm
= &(*masks
)[nvectors
- 1];
8441 /* The number of scalars per iteration and the number of vectors are
8442 both compile-time constants. */
8443 unsigned int nscalars_per_iter
8444 = exact_div (nvectors
* TYPE_VECTOR_SUBPARTS (vectype
),
8445 LOOP_VINFO_VECT_FACTOR (loop_vinfo
)).to_constant ();
8446 if (rgm
->max_nscalars_per_iter
< nscalars_per_iter
)
8448 rgm
->max_nscalars_per_iter
= nscalars_per_iter
;
8449 rgm
->mask_type
= build_same_sized_truth_vector_type (vectype
);
8453 /* Given a complete set of masks MASKS, extract mask number INDEX
8454 for an rgroup that operates on NVECTORS vectors of type VECTYPE,
8455 where 0 <= INDEX < NVECTORS. Insert any set-up statements before GSI.
8457 See the comment above vec_loop_masks for more details about the mask
8461 vect_get_loop_mask (gimple_stmt_iterator
*gsi
, vec_loop_masks
*masks
,
8462 unsigned int nvectors
, tree vectype
, unsigned int index
)
8464 rgroup_masks
*rgm
= &(*masks
)[nvectors
- 1];
8465 tree mask_type
= rgm
->mask_type
;
8467 /* Populate the rgroup's mask array, if this is the first time we've
8469 if (rgm
->masks
.is_empty ())
8471 rgm
->masks
.safe_grow_cleared (nvectors
);
8472 for (unsigned int i
= 0; i
< nvectors
; ++i
)
8474 tree mask
= make_temp_ssa_name (mask_type
, NULL
, "loop_mask");
8475 /* Provide a dummy definition until the real one is available. */
8476 SSA_NAME_DEF_STMT (mask
) = gimple_build_nop ();
8477 rgm
->masks
[i
] = mask
;
8481 tree mask
= rgm
->masks
[index
];
8482 if (maybe_ne (TYPE_VECTOR_SUBPARTS (mask_type
),
8483 TYPE_VECTOR_SUBPARTS (vectype
)))
8485 /* A loop mask for data type X can be reused for data type Y
8486 if X has N times more elements than Y and if Y's elements
8487 are N times bigger than X's. In this case each sequence
8488 of N elements in the loop mask will be all-zero or all-one.
8489 We can then view-convert the mask so that each sequence of
8490 N elements is replaced by a single element. */
8491 gcc_assert (multiple_p (TYPE_VECTOR_SUBPARTS (mask_type
),
8492 TYPE_VECTOR_SUBPARTS (vectype
)));
8493 gimple_seq seq
= NULL
;
8494 mask_type
= build_same_sized_truth_vector_type (vectype
);
8495 mask
= gimple_build (&seq
, VIEW_CONVERT_EXPR
, mask_type
, mask
);
8497 gsi_insert_seq_before (gsi
, seq
, GSI_SAME_STMT
);
8502 /* Scale profiling counters by estimation for LOOP which is vectorized
8506 scale_profile_for_vect_loop (struct loop
*loop
, unsigned vf
)
8508 edge preheader
= loop_preheader_edge (loop
);
8509 /* Reduce loop iterations by the vectorization factor. */
8510 gcov_type new_est_niter
= niter_for_unrolled_loop (loop
, vf
);
8511 profile_count freq_h
= loop
->header
->count
, freq_e
= preheader
->count ();
8513 if (freq_h
.nonzero_p ())
8515 profile_probability p
;
8517 /* Avoid dropping loop body profile counter to 0 because of zero count
8518 in loop's preheader. */
8519 if (!(freq_e
== profile_count::zero ()))
8520 freq_e
= freq_e
.force_nonzero ();
8521 p
= freq_e
.apply_scale (new_est_niter
+ 1, 1).probability_in (freq_h
);
8522 scale_loop_frequencies (loop
, p
);
8525 edge exit_e
= single_exit (loop
);
8526 exit_e
->probability
= profile_probability::always ()
8527 .apply_scale (1, new_est_niter
+ 1);
8529 edge exit_l
= single_pred_edge (loop
->latch
);
8530 profile_probability prob
= exit_l
->probability
;
8531 exit_l
->probability
= exit_e
->probability
.invert ();
8532 if (prob
.initialized_p () && exit_l
->probability
.initialized_p ())
8533 scale_bbs_frequencies (&loop
->latch
, 1, exit_l
->probability
/ prob
);
8536 /* Function vect_transform_loop.
8538 The analysis phase has determined that the loop is vectorizable.
8539 Vectorize the loop - created vectorized stmts to replace the scalar
8540 stmts in the loop, and update the loop exit condition.
8541 Returns scalar epilogue loop if any. */
8544 vect_transform_loop (loop_vec_info loop_vinfo
)
8546 struct loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
8547 struct loop
*epilogue
= NULL
;
8548 basic_block
*bbs
= LOOP_VINFO_BBS (loop_vinfo
);
8549 int nbbs
= loop
->num_nodes
;
8551 tree niters_vector
= NULL_TREE
;
8552 tree step_vector
= NULL_TREE
;
8553 tree niters_vector_mult_vf
= NULL_TREE
;
8554 poly_uint64 vf
= LOOP_VINFO_VECT_FACTOR (loop_vinfo
);
8555 unsigned int lowest_vf
= constant_lower_bound (vf
);
8557 bool slp_scheduled
= false;
8558 gimple
*stmt
, *pattern_stmt
;
8559 gimple_seq pattern_def_seq
= NULL
;
8560 gimple_stmt_iterator pattern_def_si
= gsi_none ();
8561 bool transform_pattern_stmt
= false;
8562 bool check_profitability
= false;
/* NOTE(review): several declarations referenced below ('i', 'th',
   'grouped_store', 'is_store', ...) appear to be elided from this
   excerpt -- confirm against the full source file.  */
8565 if (dump_enabled_p ())
8566 dump_printf_loc (MSG_NOTE
, vect_location
, "=== vec_transform_loop ===\n");
8568 /* Use the more conservative vectorization threshold. If the number
8569 of iterations is constant assume the cost check has been performed
8570 by our caller. If the threshold makes all loops profitable that
8571 run at least the (estimated) vectorization factor number of times
8572 checking is pointless, too. */
8573 th
= LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo
);
8574 if (th
>= vect_vf_for_cost (loop_vinfo
)
8575 && !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo
))
8577 if (dump_enabled_p ())
8578 dump_printf_loc (MSG_NOTE
, vect_location
,
8579 "Profitability threshold is %d loop iterations.\n",
8581 check_profitability
= true;
8584 /* Make sure there exists a single-predecessor exit bb. Do this before
8586 edge e
= single_exit (loop
);
8587 if (! single_pred_p (e
->dest
))
8589 split_loop_exit_edge (e
);
8590 if (dump_enabled_p ())
8591 dump_printf (MSG_NOTE
, "split exit edge\n");
8594 /* Version the loop first, if required, so the profitability check
8597 if (LOOP_REQUIRES_VERSIONING (loop_vinfo
))
8599 poly_uint64 versioning_threshold
8600 = LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo
);
8601 if (check_profitability
8602 && ordered_p (poly_uint64 (th
), versioning_threshold
))
/* Fold the cost-model threshold into the versioning threshold so a
   single runtime guard covers both checks.  */
8604 versioning_threshold
= ordered_max (poly_uint64 (th
),
8605 versioning_threshold
);
8606 check_profitability
= false;
8608 vect_loop_versioning (loop_vinfo
, th
, check_profitability
,
8609 versioning_threshold
);
8610 check_profitability
= false;
8613 /* Make sure there exists a single-predecessor exit bb also on the
8614 scalar loop copy. Do this after versioning but before peeling
8615 so CFG structure is fine for both scalar and if-converted loop
8616 to make slpeel_duplicate_current_defs_from_edges face matched
8617 loop closed PHI nodes on the exit. */
8618 if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo
))
8620 e
= single_exit (LOOP_VINFO_SCALAR_LOOP (loop_vinfo
));
8621 if (! single_pred_p (e
->dest
))
8623 split_loop_exit_edge (e
);
8624 if (dump_enabled_p ())
8625 dump_printf (MSG_NOTE
, "split exit edge of scalar loop\n");
/* Compute the scalar iteration counts, then peel prologue/epilogue
   iterations as decided during analysis.  vect_do_peeling may create a
   scalar epilogue loop, which becomes this function's return value.  */
8629 tree niters
= vect_build_loop_niters (loop_vinfo
);
8630 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo
) = niters
;
8631 tree nitersm1
= unshare_expr (LOOP_VINFO_NITERSM1 (loop_vinfo
));
8632 bool niters_no_overflow
= loop_niters_no_overflow (loop_vinfo
);
8633 epilogue
= vect_do_peeling (loop_vinfo
, niters
, nitersm1
, &niters_vector
,
8634 &step_vector
, &niters_vector_mult_vf
, th
,
8635 check_profitability
, niters_no_overflow
);
/* If peeling did not already produce the vector iteration count,
   compute it here: as a constant when the count is known, otherwise by
   emitting code for it.  */
8637 if (niters_vector
== NULL_TREE
)
8639 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo
)
8640 && !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo
)
8641 && known_eq (lowest_vf
, vf
))
8644 = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo
)),
8645 LOOP_VINFO_INT_NITERS (loop_vinfo
) / lowest_vf
);
8646 step_vector
= build_one_cst (TREE_TYPE (niters
));
8649 vect_gen_vector_loop_niters (loop_vinfo
, niters
, &niters_vector
,
8650 &step_vector
, niters_no_overflow
);
8653 /* 1) Make sure the loop header has exactly two entries
8654 2) Make sure we have a preheader basic block. */
8656 gcc_assert (EDGE_COUNT (loop
->header
->preds
) == 2);
8658 split_edge (loop_preheader_edge (loop
));
8660 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo
)
8661 && vect_use_loop_mask_for_alignment_p (loop_vinfo
))
8662 /* This will deal with any possible peeling. */
8663 vect_prepare_for_masked_peels (loop_vinfo
);
8665 /* FORNOW: the vectorizer supports only loops which body consist
8666 of one basic block (header + empty latch). When the vectorizer will
8667 support more involved loop forms, the order by which the BBs are
8668 traversed need to be reconsidered. */
8670 for (i
= 0; i
< nbbs
; i
++)
8672 basic_block bb
= bbs
[i
];
8673 stmt_vec_info stmt_info
;
/* First transform the relevant PHIs: inductions, reductions and
   nested cycles that are not handled purely by SLP.  */
8675 for (gphi_iterator si
= gsi_start_phis (bb
); !gsi_end_p (si
);
8678 gphi
*phi
= si
.phi ();
8679 if (dump_enabled_p ())
8681 dump_printf_loc (MSG_NOTE
, vect_location
,
8682 "------>vectorizing phi: ");
8683 dump_gimple_stmt (MSG_NOTE
, TDF_SLIM
, phi
, 0);
8685 stmt_info
= vinfo_for_stmt (phi
);
8689 if (MAY_HAVE_DEBUG_BIND_STMTS
&& !STMT_VINFO_LIVE_P (stmt_info
))
8690 vect_loop_kill_debug_uses (loop
, phi
);
8692 if (!STMT_VINFO_RELEVANT_P (stmt_info
)
8693 && !STMT_VINFO_LIVE_P (stmt_info
))
8696 if (STMT_VINFO_VECTYPE (stmt_info
)
8698 (TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info
)), vf
))
8699 && dump_enabled_p ())
8700 dump_printf_loc (MSG_NOTE
, vect_location
, "multiple-types.\n");
8702 if ((STMT_VINFO_DEF_TYPE (stmt_info
) == vect_induction_def
8703 || STMT_VINFO_DEF_TYPE (stmt_info
) == vect_reduction_def
8704 || STMT_VINFO_DEF_TYPE (stmt_info
) == vect_nested_cycle
)
8705 && ! PURE_SLP_STMT (stmt_info
))
8707 if (dump_enabled_p ())
8708 dump_printf_loc (MSG_NOTE
, vect_location
, "transform phi.\n");
8709 vect_transform_stmt (phi
, NULL
, NULL
, NULL
, NULL
);
/* Now walk the statements of the block.  The iteration also steps
   through pattern statements and their def sequences, hence the extra
   'transform_pattern_stmt' state carried across iterations.  */
8713 pattern_stmt
= NULL
;
8714 for (gimple_stmt_iterator si
= gsi_start_bb (bb
);
8715 !gsi_end_p (si
) || transform_pattern_stmt
;)
8719 if (transform_pattern_stmt
)
8720 stmt
= pattern_stmt
;
8723 stmt
= gsi_stmt (si
);
8724 /* During vectorization remove existing clobber stmts. */
8725 if (gimple_clobber_p (stmt
))
8727 unlink_stmt_vdef (stmt
);
8728 gsi_remove (&si
, true);
8729 release_defs (stmt
);
8734 if (dump_enabled_p ())
8736 dump_printf_loc (MSG_NOTE
, vect_location
,
8737 "------>vectorizing statement: ");
8738 dump_gimple_stmt (MSG_NOTE
, TDF_SLIM
, stmt
, 0);
8741 stmt_info
= vinfo_for_stmt (stmt
);
8743 /* vector stmts created in the outer-loop during vectorization of
8744 stmts in an inner-loop may not have a stmt_info, and do not
8745 need to be vectorized. */
8752 if (MAY_HAVE_DEBUG_BIND_STMTS
&& !STMT_VINFO_LIVE_P (stmt_info
))
8753 vect_loop_kill_debug_uses (loop
, stmt
);
8755 if (!STMT_VINFO_RELEVANT_P (stmt_info
)
8756 && !STMT_VINFO_LIVE_P (stmt_info
))
8758 if (STMT_VINFO_IN_PATTERN_P (stmt_info
)
8759 && (pattern_stmt
= STMT_VINFO_RELATED_STMT (stmt_info
))
8760 && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt
))
8761 || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt
))))
/* The scalar stmt itself is irrelevant, but its pattern stmt is:
   switch over to transforming the pattern stmt instead.  */
8763 stmt
= pattern_stmt
;
8764 stmt_info
= vinfo_for_stmt (stmt
);
8772 else if (STMT_VINFO_IN_PATTERN_P (stmt_info
)
8773 && (pattern_stmt
= STMT_VINFO_RELATED_STMT (stmt_info
))
8774 && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt
))
8775 || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt
))))
8776 transform_pattern_stmt
= true;
8778 /* If pattern statement has def stmts, vectorize them too. */
8779 if (is_pattern_stmt_p (stmt_info
))
8781 if (pattern_def_seq
== NULL
)
8783 pattern_def_seq
= STMT_VINFO_PATTERN_DEF_SEQ (stmt_info
);
8784 pattern_def_si
= gsi_start (pattern_def_seq
);
8786 else if (!gsi_end_p (pattern_def_si
))
8787 gsi_next (&pattern_def_si
);
8788 if (pattern_def_seq
!= NULL
)
8790 gimple
*pattern_def_stmt
= NULL
;
8791 stmt_vec_info pattern_def_stmt_info
= NULL
;
/* Skip def-sequence statements that are neither relevant nor
   live; stop at the first one that must be vectorized.  */
8793 while (!gsi_end_p (pattern_def_si
))
8795 pattern_def_stmt
= gsi_stmt (pattern_def_si
);
8796 pattern_def_stmt_info
8797 = vinfo_for_stmt (pattern_def_stmt
);
8798 if (STMT_VINFO_RELEVANT_P (pattern_def_stmt_info
)
8799 || STMT_VINFO_LIVE_P (pattern_def_stmt_info
))
8801 gsi_next (&pattern_def_si
);
8804 if (!gsi_end_p (pattern_def_si
))
8806 if (dump_enabled_p ())
8808 dump_printf_loc (MSG_NOTE
, vect_location
,
8809 "==> vectorizing pattern def "
8811 dump_gimple_stmt (MSG_NOTE
, TDF_SLIM
,
8812 pattern_def_stmt
, 0);
8815 stmt
= pattern_def_stmt
;
8816 stmt_info
= pattern_def_stmt_info
;
8820 pattern_def_si
= gsi_none ();
8821 transform_pattern_stmt
= false;
8825 transform_pattern_stmt
= false;
8828 if (STMT_VINFO_VECTYPE (stmt_info
))
8831 = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info
));
8832 if (!STMT_SLP_TYPE (stmt_info
)
8833 && maybe_ne (nunits
, vf
)
8834 && dump_enabled_p ())
8835 /* For SLP VF is set according to unrolling factor, and not
8836 to vector size, hence for SLP this print is not valid. */
8837 dump_printf_loc (MSG_NOTE
, vect_location
, "multiple-types.\n");
8840 /* SLP. Schedule all the SLP instances when the first SLP stmt is
8842 if (STMT_SLP_TYPE (stmt_info
))
8846 slp_scheduled
= true;
8848 if (dump_enabled_p ())
8849 dump_printf_loc (MSG_NOTE
, vect_location
,
8850 "=== scheduling SLP instances ===\n");
8852 vect_schedule_slp (loop_vinfo
);
8855 /* Hybrid SLP stmts must be vectorized in addition to SLP. */
8856 if (!vinfo_for_stmt (stmt
) || PURE_SLP_STMT (stmt_info
))
8858 if (!transform_pattern_stmt
&& gsi_end_p (pattern_def_si
))
8860 pattern_def_seq
= NULL
;
8867 /* -------- vectorize statement ------------ */
8868 if (dump_enabled_p ())
8869 dump_printf_loc (MSG_NOTE
, vect_location
, "transform statement.\n");
8871 grouped_store
= false;
8872 is_store
= vect_transform_stmt (stmt
, &si
, &grouped_store
, NULL
, NULL
);
8875 if (STMT_VINFO_GROUPED_ACCESS (stmt_info
))
8877 /* Interleaving. If IS_STORE is TRUE, the vectorization of the
8878 interleaving chain was completed - free all the stores in
8881 vect_remove_stores (GROUP_FIRST_ELEMENT (stmt_info
));
8885 /* Free the attached stmt_vec_info and remove the stmt. */
8886 gimple
*store
= gsi_stmt (si
);
8887 free_stmt_vec_info (store
);
8888 unlink_stmt_vdef (store
);
8889 gsi_remove (&si
, true);
8890 release_defs (store
);
8893 /* Stores can only appear at the end of pattern statements. */
8894 gcc_assert (!transform_pattern_stmt
);
8895 pattern_def_seq
= NULL
;
8897 else if (!transform_pattern_stmt
&& gsi_end_p (pattern_def_si
))
8899 pattern_def_seq
= NULL
;
8904 /* Stub out scalar statements that must not survive vectorization.
8905 Doing this here helps with grouped statements, or statements that
8906 are involved in patterns. */
8907 for (gimple_stmt_iterator gsi
= gsi_start_bb (bb
);
8908 !gsi_end_p (gsi
); gsi_next (&gsi
))
8910 gcall
*call
= dyn_cast
<gcall
*> (gsi_stmt (gsi
));
8911 if (call
&& gimple_call_internal_p (call
, IFN_MASK_LOAD
))
8913 tree lhs
= gimple_get_lhs (call
);
8914 if (!VECTOR_TYPE_P (TREE_TYPE (lhs
)))
/* A MASK_LOAD internal call with a non-vector lhs is replaced by a
   plain zero assignment so the remaining IL stays valid.  */
8916 tree zero
= build_zero_cst (TREE_TYPE (lhs
));
8917 gimple
*new_stmt
= gimple_build_assign (lhs
, zero
);
8918 gsi_replace (&gsi
, new_stmt
, true);
8924 /* The vectorization factor is always > 1, so if we use an IV increment of 1,
8925 a zero NITERS becomes a nonzero NITERS_VECTOR. */
8926 if (integer_onep (step_vector
))
8927 niters_no_overflow
= true;
/* Install the new loop exit condition driven by the vector iteration
   count computed above.  */
8928 vect_set_loop_condition (loop
, loop_vinfo
, niters_vector
, step_vector
,
8929 niters_vector_mult_vf
, !niters_no_overflow
);
8931 unsigned int assumed_vf
= vect_vf_for_cost (loop_vinfo
);
8932 scale_profile_for_vect_loop (loop
, assumed_vf
);
8934 /* True if the final iteration might not handle a full vector's
8935 worth of scalar iterations. */
8936 bool final_iter_may_be_partial
= LOOP_VINFO_FULLY_MASKED_P (loop_vinfo
);
8937 /* The minimum number of iterations performed by the epilogue. This
8938 is 1 when peeling for gaps because we always need a final scalar
8940 int min_epilogue_iters
= LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo
) ? 1 : 0;
8941 /* +1 to convert latch counts to loop iteration counts,
8942 -min_epilogue_iters to remove iterations that cannot be performed
8943 by the vector code. */
8944 int bias_for_lowest
= 1 - min_epilogue_iters
;
8945 int bias_for_assumed
= bias_for_lowest
;
8946 int alignment_npeels
= LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo
);
8947 if (alignment_npeels
&& LOOP_VINFO_FULLY_MASKED_P (loop_vinfo
))
8949 /* When the amount of peeling is known at compile time, the first
8950 iteration will have exactly alignment_npeels active elements.
8951 In the worst case it will have at least one. */
8952 int min_first_active
= (alignment_npeels
> 0 ? alignment_npeels
: 1);
8953 bias_for_lowest
+= lowest_vf
- min_first_active
;
8954 bias_for_assumed
+= assumed_vf
- min_first_active
;
8956 /* In these calculations the "- 1" converts loop iteration counts
8957 back to latch counts. */
8958 if (loop
->any_upper_bound
)
8959 loop
->nb_iterations_upper_bound
8960 = (final_iter_may_be_partial
8961 ? wi::udiv_ceil (loop
->nb_iterations_upper_bound
+ bias_for_lowest
,
8963 : wi::udiv_floor (loop
->nb_iterations_upper_bound
+ bias_for_lowest
,
8965 if (loop
->any_likely_upper_bound
)
8966 loop
->nb_iterations_likely_upper_bound
8967 = (final_iter_may_be_partial
8968 ? wi::udiv_ceil (loop
->nb_iterations_likely_upper_bound
8969 + bias_for_lowest
, lowest_vf
) - 1
8970 : wi::udiv_floor (loop
->nb_iterations_likely_upper_bound
8971 + bias_for_lowest
, lowest_vf
) - 1);
8972 if (loop
->any_estimate
)
8973 loop
->nb_iterations_estimate
8974 = (final_iter_may_be_partial
8975 ? wi::udiv_ceil (loop
->nb_iterations_estimate
+ bias_for_assumed
,
8977 : wi::udiv_floor (loop
->nb_iterations_estimate
+ bias_for_assumed
,
8980 if (dump_enabled_p ())
8982 if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo
))
8984 dump_printf_loc (MSG_NOTE
, vect_location
,
8985 "LOOP VECTORIZED\n");
8987 dump_printf_loc (MSG_NOTE
, vect_location
,
8988 "OUTER LOOP VECTORIZED\n");
8989 dump_printf (MSG_NOTE
, "\n");
8993 dump_printf_loc (MSG_NOTE
, vect_location
,
8994 "LOOP EPILOGUE VECTORIZED (VS=");
8995 dump_dec (MSG_NOTE
, current_vector_size
);
8996 dump_printf (MSG_NOTE
, ")\n");
9000 /* Free SLP instances here because otherwise stmt reference counting
9002 slp_instance instance
;
9003 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo
), i
, instance
)
9004 vect_free_slp_instance (instance
);
9005 LOOP_VINFO_SLP_INSTANCES (loop_vinfo
).release ();
9006 /* Clear-up safelen field since its value is invalid after vectorization
9007 since vectorized loop can have loop-carried dependencies. */
9010 /* Don't vectorize epilogue for epilogue. */
9011 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo
))
9014 if (!PARAM_VALUE (PARAM_VECT_EPILOGUES_NOMASK
))
/* Decide whether the scalar epilogue should itself be vectorized with
   a smaller vector size: walk the target's supported sizes and pick
   the next one that fits the remaining iteration count.  */
9019 auto_vector_sizes vector_sizes
;
9020 targetm
.vectorize
.autovectorize_vector_sizes (&vector_sizes
);
9021 unsigned int next_size
= 0;
9023 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo
)
9024 && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo
) >= 0
9025 && known_eq (vf
, lowest_vf
))
9028 = (LOOP_VINFO_INT_NITERS (loop_vinfo
)
9029 - LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo
));
9030 eiters
= eiters
% lowest_vf
;
9031 epilogue
->nb_iterations_upper_bound
= eiters
- 1;
9034 while (next_size
< vector_sizes
.length ()
9035 && !(constant_multiple_p (current_vector_size
,
9036 vector_sizes
[next_size
], &ratio
)
9037 && eiters
>= lowest_vf
/ ratio
))
9041 while (next_size
< vector_sizes
.length ()
9042 && maybe_lt (current_vector_size
, vector_sizes
[next_size
]))
9045 if (next_size
== vector_sizes
.length ())
9051 epilogue
->force_vectorize
= loop
->force_vectorize
;
9052 epilogue
->safelen
= loop
->safelen
;
9053 epilogue
->dont_vectorize
= false;
9055 /* We may need to if-convert epilogue to vectorize it. */
9056 if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo
))
9057 tree_if_conversion (epilogue
);
9063 /* The code below performs a simple optimization - it reverts
9064 if-conversion for masked stores: if the mask of a store is zero,
9065 the store is not performed and, if possible, neither are the stored-value producers.
9073 this transformation will produce the following semi-hammock:
9075 if (!mask__ifc__42.18_165 == { 0, 0, 0, 0, 0, 0, 0, 0 })
9077 vect__11.19_170 = MASK_LOAD (vectp_p1.20_168, 0B, mask__ifc__42.18_165);
9078 vect__12.22_172 = vect__11.19_170 + vect_cst__171;
9079 MASK_STORE (vectp_p1.23_175, 0B, mask__ifc__42.18_165, vect__12.22_172);
9080 vect__18.25_182 = MASK_LOAD (vectp_p3.26_180, 0B, mask__ifc__42.18_165);
9081 vect__19.28_184 = vect__18.25_182 + vect_cst__183;
9082 MASK_STORE (vectp_p2.29_187, 0B, mask__ifc__42.18_165, vect__19.28_184);
9087 optimize_mask_stores (struct loop
*loop
)
9089 basic_block
*bbs
= get_loop_body (loop
);
9090 unsigned nbbs
= loop
->num_nodes
;
9093 struct loop
*bb_loop
;
9094 gimple_stmt_iterator gsi
;
9096 auto_vec
<gimple
*> worklist
;
9098 vect_location
= find_loop_location (loop
);
9099 /* Pick up all masked stores in loop if any. */
9100 for (i
= 0; i
< nbbs
; i
++)
9103 for (gsi
= gsi_start_bb (bb
); !gsi_end_p (gsi
);
9106 stmt
= gsi_stmt (gsi
);
9107 if (gimple_call_internal_p (stmt
, IFN_MASK_STORE
))
9108 worklist
.safe_push (stmt
);
9113 if (worklist
.is_empty ())
9116 /* Loop has masked stores. */
9117 while (!worklist
.is_empty ())
9119 gimple
*last
, *last_store
;
9122 basic_block store_bb
, join_bb
;
9123 gimple_stmt_iterator gsi_to
;
9124 tree vdef
, new_vdef
;
9129 last
= worklist
.pop ();
9130 mask
= gimple_call_arg (last
, 2);
9131 bb
= gimple_bb (last
);
9132 /* Create then_bb and if-then structure in CFG, then_bb belongs to
9133 the same loop as if_bb. It could be different to LOOP when two
9134 level loop-nest is vectorized and mask_store belongs to the inner
9136 e
= split_block (bb
, last
);
9137 bb_loop
= bb
->loop_father
;
9138 gcc_assert (loop
== bb_loop
|| flow_loop_nested_p (loop
, bb_loop
));
9140 store_bb
= create_empty_bb (bb
);
9141 add_bb_to_loop (store_bb
, bb_loop
);
9142 e
->flags
= EDGE_TRUE_VALUE
;
9143 efalse
= make_edge (bb
, store_bb
, EDGE_FALSE_VALUE
);
9144 /* Put STORE_BB to likely part. */
9145 efalse
->probability
= profile_probability::unlikely ();
9146 store_bb
->count
= efalse
->count ();
9147 make_single_succ_edge (store_bb
, join_bb
, EDGE_FALLTHRU
);
9148 if (dom_info_available_p (CDI_DOMINATORS
))
9149 set_immediate_dominator (CDI_DOMINATORS
, store_bb
, bb
);
9150 if (dump_enabled_p ())
9151 dump_printf_loc (MSG_NOTE
, vect_location
,
9152 "Create new block %d to sink mask stores.",
9154 /* Create vector comparison with boolean result. */
9155 vectype
= TREE_TYPE (mask
);
9156 zero
= build_zero_cst (vectype
);
9157 stmt
= gimple_build_cond (EQ_EXPR
, mask
, zero
, NULL_TREE
, NULL_TREE
);
9158 gsi
= gsi_last_bb (bb
);
9159 gsi_insert_after (&gsi
, stmt
, GSI_SAME_STMT
);
9160 /* Create new PHI node for vdef of the last masked store:
9161 .MEM_2 = VDEF <.MEM_1>
9162 will be converted to
9163 .MEM.3 = VDEF <.MEM_1>
9164 and new PHI node will be created in join bb
9165 .MEM_2 = PHI <.MEM_1, .MEM_3>
9167 vdef
= gimple_vdef (last
);
9168 new_vdef
= make_ssa_name (gimple_vop (cfun
), last
);
9169 gimple_set_vdef (last
, new_vdef
);
9170 phi
= create_phi_node (vdef
, join_bb
);
9171 add_phi_arg (phi
, new_vdef
, EDGE_SUCC (store_bb
, 0), UNKNOWN_LOCATION
);
9173 /* Put all masked stores with the same mask to STORE_BB if possible. */
9176 gimple_stmt_iterator gsi_from
;
9177 gimple
*stmt1
= NULL
;
9179 /* Move masked store to STORE_BB. */
9181 gsi
= gsi_for_stmt (last
);
9183 /* Shift GSI to the previous stmt for further traversal. */
9185 gsi_to
= gsi_start_bb (store_bb
);
9186 gsi_move_before (&gsi_from
, &gsi_to
);
9187 /* Setup GSI_TO to the non-empty block start. */
9188 gsi_to
= gsi_start_bb (store_bb
);
9189 if (dump_enabled_p ())
9191 dump_printf_loc (MSG_NOTE
, vect_location
,
9192 "Move stmt to created bb\n");
9193 dump_gimple_stmt (MSG_NOTE
, TDF_SLIM
, last
, 0);
9195 /* Move all stored value producers if possible. */
9196 while (!gsi_end_p (gsi
))
9199 imm_use_iterator imm_iter
;
9200 use_operand_p use_p
;
9203 /* Skip debug statements. */
9204 if (is_gimple_debug (gsi_stmt (gsi
)))
9209 stmt1
= gsi_stmt (gsi
);
9210 /* Do not consider statements writing to memory or having
9211 volatile operand. */
9212 if (gimple_vdef (stmt1
)
9213 || gimple_has_volatile_ops (stmt1
))
9217 lhs
= gimple_get_lhs (stmt1
);
9221 /* LHS of vectorized stmt must be SSA_NAME. */
9222 if (TREE_CODE (lhs
) != SSA_NAME
)
9225 if (!VECTOR_TYPE_P (TREE_TYPE (lhs
)))
9227 /* Remove dead scalar statement. */
9228 if (has_zero_uses (lhs
))
9230 gsi_remove (&gsi_from
, true);
9235 /* Check that LHS does not have uses outside of STORE_BB. */
9237 FOR_EACH_IMM_USE_FAST (use_p
, imm_iter
, lhs
)
9240 use_stmt
= USE_STMT (use_p
);
9241 if (is_gimple_debug (use_stmt
))
9243 if (gimple_bb (use_stmt
) != store_bb
)
9252 if (gimple_vuse (stmt1
)
9253 && gimple_vuse (stmt1
) != gimple_vuse (last_store
))
9256 /* Can move STMT1 to STORE_BB. */
9257 if (dump_enabled_p ())
9259 dump_printf_loc (MSG_NOTE
, vect_location
,
9260 "Move stmt to created bb\n");
9261 dump_gimple_stmt (MSG_NOTE
, TDF_SLIM
, stmt1
, 0);
9263 gsi_move_before (&gsi_from
, &gsi_to
);
9264 /* Shift GSI_TO for further insertion. */
9267 /* Put other masked stores with the same mask to STORE_BB. */
9268 if (worklist
.is_empty ()
9269 || gimple_call_arg (worklist
.last (), 2) != mask
9270 || worklist
.last () != stmt1
)
9272 last
= worklist
.pop ();
9274 add_phi_arg (phi
, gimple_vuse (last_store
), e
, UNKNOWN_LOCATION
);