Copyright (C) 2003-2013 Free Software Foundation, Inc.
   Contributed by Dorit Naishlos <dorit@il.ibm.com> and
   Ira Rosen <irar@il.ibm.com>

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free
Software Foundation; either version 3, or (at your option) any later
version.

GCC is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received a copy of the GNU General Public License
along with GCC; see the file COPYING3.  If not see
<http://www.gnu.org/licenses/>.  */
24 #include "coretypes.h"
29 #include "basic-block.h"
30 #include "gimple-pretty-print.h"
31 #include "tree-flow.h"
32 #include "tree-pass.h"
38 #include "diagnostic-core.h"
39 #include "tree-chrec.h"
40 #include "tree-scalar-evolution.h"
41 #include "tree-vectorizer.h"
44 /* Loop Vectorization Pass.
46 This pass tries to vectorize loops.
48 For example, the vectorizer transforms the following simple loop:
50 short a[N]; short b[N]; short c[N]; int i;
56 as if it was manually vectorized by rewriting the source code into:
58 typedef int __attribute__((mode(V8HI))) v8hi;
59 short a[N]; short b[N]; short c[N]; int i;
60 v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
63 for (i=0; i<N/8; i++){
70 The main entry to this pass is vectorize_loops(), in which
71 the vectorizer applies a set of analyses on a given set of loops,
72 followed by the actual vectorization transformation for the loops that
73 had successfully passed the analysis phase.
74 Throughout this pass we make a distinction between two types of
75 data: scalars (which are represented by SSA_NAMES), and memory references
76 ("data-refs"). These two types of data require different handling both
77 during analysis and transformation. The types of data-refs that the
78 vectorizer currently supports are ARRAY_REFS which base is an array DECL
79 (not a pointer), and INDIRECT_REFS through pointers; both array and pointer
80 accesses are required to have a simple (consecutive) access pattern.
84 The driver for the analysis phase is vect_analyze_loop().
85 It applies a set of analyses, some of which rely on the scalar evolution
86 analyzer (scev) developed by Sebastian Pop.
88 During the analysis phase the vectorizer records some information
89 per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
90 loop, as well as general information about the loop as a whole, which is
91 recorded in a "loop_vec_info" struct attached to each loop.
95 The loop transformation phase scans all the stmts in the loop, and
96 creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
97 the loop that needs to be vectorized. It inserts the vector code sequence
98 just before the scalar stmt S, and records a pointer to the vector code
99 in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
100 attached to S). This pointer will be used for the vectorization of following
101 stmts which use the def of stmt S. Stmt S is removed if it writes to memory;
102 otherwise, we rely on dead code elimination for removing it.
104 For example, say stmt S1 was vectorized into stmt VS1:
107 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
110 To vectorize stmt S2, the vectorizer first finds the stmt that defines
111 the operand 'b' (S1), and gets the relevant vector def 'vb' from the
112 vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)). The
113 resulting sequence would be:
116 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
118 S2: a = b; STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2
120 Operands that are not SSA_NAMEs, are data-refs that appear in
121 load/store operations (like 'x[i]' in S1), and are handled differently.
125 Currently the only target specific information that is used is the
126 size of the vector (in bytes) - "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".
127 Targets that can support different sizes of vectors, for now will need
128 to specify one value for "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD". More
129 flexibility will be added in the future.
131 Since we only vectorize operations which vector form can be
132 expressed using existing tree codes, to verify that an operation is
133 supported, the vectorizer checks the relevant optab at the relevant
134 machine_mode (e.g, optab_handler (add_optab, V8HImode)). If
135 the value found is CODE_FOR_nothing, then there's no target support, and
136 we can't vectorize the stmt.
138 For additional information on this project see:
139 http://gcc.gnu.org/projects/tree-ssa/vectorization.html
142 static void vect_estimate_min_profitable_iters (loop_vec_info
, int *, int *);
144 /* Function vect_determine_vectorization_factor
146 Determine the vectorization factor (VF). VF is the number of data elements
147 that are operated upon in parallel in a single iteration of the vectorized
148 loop. For example, when vectorizing a loop that operates on 4byte elements,
149 on a target with vector size (VS) 16byte, the VF is set to 4, since 4
150 elements can fit in a single vector register.
152 We currently support vectorization of loops in which all types operated upon
153 are of the same size. Therefore this function currently sets VF according to
154 the size of the types operated upon, and fails if there are multiple sizes
157 VF is also the factor by which the loop iterations are strip-mined, e.g.:
164 for (i=0; i<N; i+=VF){
165 a[i:VF] = b[i:VF] + c[i:VF];
170 vect_determine_vectorization_factor (loop_vec_info loop_vinfo
)
172 struct loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
173 basic_block
*bbs
= LOOP_VINFO_BBS (loop_vinfo
);
174 int nbbs
= loop
->num_nodes
;
175 gimple_stmt_iterator si
;
176 unsigned int vectorization_factor
= 0;
181 stmt_vec_info stmt_info
;
184 gimple stmt
, pattern_stmt
= NULL
;
185 gimple_seq pattern_def_seq
= NULL
;
186 gimple_stmt_iterator pattern_def_si
= gsi_none ();
187 bool analyze_pattern_stmt
= false;
189 if (dump_enabled_p ())
190 dump_printf_loc (MSG_NOTE
, vect_location
,
191 "=== vect_determine_vectorization_factor ===");
193 for (i
= 0; i
< nbbs
; i
++)
195 basic_block bb
= bbs
[i
];
197 for (si
= gsi_start_phis (bb
); !gsi_end_p (si
); gsi_next (&si
))
200 stmt_info
= vinfo_for_stmt (phi
);
201 if (dump_enabled_p ())
203 dump_printf_loc (MSG_NOTE
, vect_location
, "==> examining phi: ");
204 dump_gimple_stmt (MSG_NOTE
, TDF_SLIM
, phi
, 0);
207 gcc_assert (stmt_info
);
209 if (STMT_VINFO_RELEVANT_P (stmt_info
))
211 gcc_assert (!STMT_VINFO_VECTYPE (stmt_info
));
212 scalar_type
= TREE_TYPE (PHI_RESULT (phi
));
214 if (dump_enabled_p ())
216 dump_printf_loc (MSG_NOTE
, vect_location
,
217 "get vectype for scalar type: ");
218 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, scalar_type
);
221 vectype
= get_vectype_for_scalar_type (scalar_type
);
224 if (dump_enabled_p ())
226 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
227 "not vectorized: unsupported "
229 dump_generic_expr (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
,
234 STMT_VINFO_VECTYPE (stmt_info
) = vectype
;
236 if (dump_enabled_p ())
238 dump_printf_loc (MSG_NOTE
, vect_location
, "vectype: ");
239 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, vectype
);
242 nunits
= TYPE_VECTOR_SUBPARTS (vectype
);
243 if (dump_enabled_p ())
244 dump_printf_loc (MSG_NOTE
, vect_location
, "nunits = %d", nunits
);
246 if (!vectorization_factor
247 || (nunits
> vectorization_factor
))
248 vectorization_factor
= nunits
;
252 for (si
= gsi_start_bb (bb
); !gsi_end_p (si
) || analyze_pattern_stmt
;)
256 if (analyze_pattern_stmt
)
259 stmt
= gsi_stmt (si
);
261 stmt_info
= vinfo_for_stmt (stmt
);
263 if (dump_enabled_p ())
265 dump_printf_loc (MSG_NOTE
, vect_location
,
266 "==> examining statement: ");
267 dump_gimple_stmt (MSG_NOTE
, TDF_SLIM
, stmt
, 0);
270 gcc_assert (stmt_info
);
272 /* Skip stmts which do not need to be vectorized. */
273 if (!STMT_VINFO_RELEVANT_P (stmt_info
)
274 && !STMT_VINFO_LIVE_P (stmt_info
))
276 if (STMT_VINFO_IN_PATTERN_P (stmt_info
)
277 && (pattern_stmt
= STMT_VINFO_RELATED_STMT (stmt_info
))
278 && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt
))
279 || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt
))))
282 stmt_info
= vinfo_for_stmt (pattern_stmt
);
283 if (dump_enabled_p ())
285 dump_printf_loc (MSG_NOTE
, vect_location
,
286 "==> examining pattern statement: ");
287 dump_gimple_stmt (MSG_NOTE
, TDF_SLIM
, stmt
, 0);
292 if (dump_enabled_p ())
293 dump_printf_loc (MSG_NOTE
, vect_location
, "skip.");
298 else if (STMT_VINFO_IN_PATTERN_P (stmt_info
)
299 && (pattern_stmt
= STMT_VINFO_RELATED_STMT (stmt_info
))
300 && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt
))
301 || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt
))))
302 analyze_pattern_stmt
= true;
304 /* If a pattern statement has def stmts, analyze them too. */
305 if (is_pattern_stmt_p (stmt_info
))
307 if (pattern_def_seq
== NULL
)
309 pattern_def_seq
= STMT_VINFO_PATTERN_DEF_SEQ (stmt_info
);
310 pattern_def_si
= gsi_start (pattern_def_seq
);
312 else if (!gsi_end_p (pattern_def_si
))
313 gsi_next (&pattern_def_si
);
314 if (pattern_def_seq
!= NULL
)
316 gimple pattern_def_stmt
= NULL
;
317 stmt_vec_info pattern_def_stmt_info
= NULL
;
319 while (!gsi_end_p (pattern_def_si
))
321 pattern_def_stmt
= gsi_stmt (pattern_def_si
);
322 pattern_def_stmt_info
323 = vinfo_for_stmt (pattern_def_stmt
);
324 if (STMT_VINFO_RELEVANT_P (pattern_def_stmt_info
)
325 || STMT_VINFO_LIVE_P (pattern_def_stmt_info
))
327 gsi_next (&pattern_def_si
);
330 if (!gsi_end_p (pattern_def_si
))
332 if (dump_enabled_p ())
334 dump_printf_loc (MSG_NOTE
, vect_location
,
335 "==> examining pattern def stmt: ");
336 dump_gimple_stmt (MSG_NOTE
, TDF_SLIM
,
337 pattern_def_stmt
, 0);
340 stmt
= pattern_def_stmt
;
341 stmt_info
= pattern_def_stmt_info
;
345 pattern_def_si
= gsi_none ();
346 analyze_pattern_stmt
= false;
350 analyze_pattern_stmt
= false;
353 if (gimple_get_lhs (stmt
) == NULL_TREE
)
355 if (dump_enabled_p ())
357 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
358 "not vectorized: irregular stmt.");
359 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, stmt
,
365 if (VECTOR_MODE_P (TYPE_MODE (gimple_expr_type (stmt
))))
367 if (dump_enabled_p ())
369 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
370 "not vectorized: vector stmt in loop:");
371 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, stmt
, 0);
376 if (STMT_VINFO_VECTYPE (stmt_info
))
378 /* The only case when a vectype had been already set is for stmts
379 that contain a dataref, or for "pattern-stmts" (stmts
380 generated by the vectorizer to represent/replace a certain
382 gcc_assert (STMT_VINFO_DATA_REF (stmt_info
)
383 || is_pattern_stmt_p (stmt_info
)
384 || !gsi_end_p (pattern_def_si
));
385 vectype
= STMT_VINFO_VECTYPE (stmt_info
);
389 gcc_assert (!STMT_VINFO_DATA_REF (stmt_info
));
390 scalar_type
= TREE_TYPE (gimple_get_lhs (stmt
));
391 if (dump_enabled_p ())
393 dump_printf_loc (MSG_NOTE
, vect_location
,
394 "get vectype for scalar type: ");
395 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, scalar_type
);
397 vectype
= get_vectype_for_scalar_type (scalar_type
);
400 if (dump_enabled_p ())
402 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
403 "not vectorized: unsupported "
405 dump_generic_expr (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
,
411 STMT_VINFO_VECTYPE (stmt_info
) = vectype
;
413 if (dump_enabled_p ())
415 dump_printf_loc (MSG_NOTE
, vect_location
, "vectype: ");
416 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, vectype
);
420 /* The vectorization factor is according to the smallest
421 scalar type (or the largest vector size, but we only
422 support one vector size per loop). */
423 scalar_type
= vect_get_smallest_scalar_type (stmt
, &dummy
,
425 if (dump_enabled_p ())
427 dump_printf_loc (MSG_NOTE
, vect_location
,
428 "get vectype for scalar type: ");
429 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, scalar_type
);
431 vf_vectype
= get_vectype_for_scalar_type (scalar_type
);
434 if (dump_enabled_p ())
436 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
437 "not vectorized: unsupported data-type ");
438 dump_generic_expr (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
,
444 if ((GET_MODE_SIZE (TYPE_MODE (vectype
))
445 != GET_MODE_SIZE (TYPE_MODE (vf_vectype
))))
447 if (dump_enabled_p ())
449 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
450 "not vectorized: different sized vector "
451 "types in statement, ");
452 dump_generic_expr (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
,
454 dump_printf (MSG_MISSED_OPTIMIZATION
, " and ");
455 dump_generic_expr (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
,
461 if (dump_enabled_p ())
463 dump_printf_loc (MSG_NOTE
, vect_location
, "vectype: ");
464 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, vf_vectype
);
467 nunits
= TYPE_VECTOR_SUBPARTS (vf_vectype
);
468 if (dump_enabled_p ())
469 dump_printf_loc (MSG_NOTE
, vect_location
, "nunits = %d", nunits
);
470 if (!vectorization_factor
471 || (nunits
> vectorization_factor
))
472 vectorization_factor
= nunits
;
474 if (!analyze_pattern_stmt
&& gsi_end_p (pattern_def_si
))
476 pattern_def_seq
= NULL
;
482 /* TODO: Analyze cost. Decide if worth while to vectorize. */
483 if (dump_enabled_p ())
484 dump_printf_loc (MSG_NOTE
, vect_location
, "vectorization factor = %d",
485 vectorization_factor
);
486 if (vectorization_factor
<= 1)
488 if (dump_enabled_p ())
489 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
490 "not vectorized: unsupported data-type");
493 LOOP_VINFO_VECT_FACTOR (loop_vinfo
) = vectorization_factor
;
499 /* Function vect_is_simple_iv_evolution.
501 FORNOW: A simple evolution of an induction variables in the loop is
502 considered a polynomial evolution with constant step. */
505 vect_is_simple_iv_evolution (unsigned loop_nb
, tree access_fn
, tree
* init
,
510 tree evolution_part
= evolution_part_in_loop_num (access_fn
, loop_nb
);
512 /* When there is no evolution in this loop, the evolution function
514 if (evolution_part
== NULL_TREE
)
517 /* When the evolution is a polynomial of degree >= 2
518 the evolution function is not "simple". */
519 if (tree_is_chrec (evolution_part
))
522 step_expr
= evolution_part
;
523 init_expr
= unshare_expr (initial_condition_in_loop_num (access_fn
, loop_nb
));
525 if (dump_enabled_p ())
527 dump_printf_loc (MSG_NOTE
, vect_location
, "step: ");
528 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, step_expr
);
529 dump_printf (MSG_NOTE
, ", init: ");
530 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, init_expr
);
536 if (TREE_CODE (step_expr
) != INTEGER_CST
)
538 if (dump_enabled_p ())
539 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
547 /* Function vect_analyze_scalar_cycles_1.
549 Examine the cross iteration def-use cycles of scalar variables
550 in LOOP. LOOP_VINFO represents the loop that is now being
551 considered for vectorization (can be LOOP, or an outer-loop
555 vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo
, struct loop
*loop
)
557 basic_block bb
= loop
->header
;
559 vec
<gimple
> worklist
;
560 worklist
.create (64);
561 gimple_stmt_iterator gsi
;
564 if (dump_enabled_p ())
565 dump_printf_loc (MSG_NOTE
, vect_location
,
566 "=== vect_analyze_scalar_cycles ===");
568 /* First - identify all inductions. Reduction detection assumes that all the
569 inductions have been identified, therefore, this order must not be
571 for (gsi
= gsi_start_phis (bb
); !gsi_end_p (gsi
); gsi_next (&gsi
))
573 gimple phi
= gsi_stmt (gsi
);
574 tree access_fn
= NULL
;
575 tree def
= PHI_RESULT (phi
);
576 stmt_vec_info stmt_vinfo
= vinfo_for_stmt (phi
);
578 if (dump_enabled_p ())
580 dump_printf_loc (MSG_NOTE
, vect_location
, "Analyze phi: ");
581 dump_gimple_stmt (MSG_NOTE
, TDF_SLIM
, phi
, 0);
584 /* Skip virtual phi's. The data dependences that are associated with
585 virtual defs/uses (i.e., memory accesses) are analyzed elsewhere. */
586 if (virtual_operand_p (def
))
589 STMT_VINFO_DEF_TYPE (stmt_vinfo
) = vect_unknown_def_type
;
591 /* Analyze the evolution function. */
592 access_fn
= analyze_scalar_evolution (loop
, def
);
595 STRIP_NOPS (access_fn
);
596 if (dump_enabled_p ())
598 dump_printf_loc (MSG_NOTE
, vect_location
,
599 "Access function of PHI: ");
600 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, access_fn
);
602 STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo
)
603 = evolution_part_in_loop_num (access_fn
, loop
->num
);
607 || !vect_is_simple_iv_evolution (loop
->num
, access_fn
, &dumy
, &dumy
))
609 worklist
.safe_push (phi
);
613 gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo
) != NULL_TREE
);
615 if (dump_enabled_p ())
616 dump_printf_loc (MSG_NOTE
, vect_location
, "Detected induction.");
617 STMT_VINFO_DEF_TYPE (stmt_vinfo
) = vect_induction_def
;
621 /* Second - identify all reductions and nested cycles. */
622 while (worklist
.length () > 0)
624 gimple phi
= worklist
.pop ();
625 tree def
= PHI_RESULT (phi
);
626 stmt_vec_info stmt_vinfo
= vinfo_for_stmt (phi
);
630 if (dump_enabled_p ())
632 dump_printf_loc (MSG_NOTE
, vect_location
, "Analyze phi: ");
633 dump_gimple_stmt (MSG_NOTE
, TDF_SLIM
, phi
, 0);
636 gcc_assert (!virtual_operand_p (def
)
637 && STMT_VINFO_DEF_TYPE (stmt_vinfo
) == vect_unknown_def_type
);
639 nested_cycle
= (loop
!= LOOP_VINFO_LOOP (loop_vinfo
));
640 reduc_stmt
= vect_force_simple_reduction (loop_vinfo
, phi
, !nested_cycle
,
646 if (dump_enabled_p ())
647 dump_printf_loc (MSG_NOTE
, vect_location
,
648 "Detected double reduction.");
650 STMT_VINFO_DEF_TYPE (stmt_vinfo
) = vect_double_reduction_def
;
651 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt
)) =
652 vect_double_reduction_def
;
658 if (dump_enabled_p ())
659 dump_printf_loc (MSG_NOTE
, vect_location
,
660 "Detected vectorizable nested cycle.");
662 STMT_VINFO_DEF_TYPE (stmt_vinfo
) = vect_nested_cycle
;
663 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt
)) =
668 if (dump_enabled_p ())
669 dump_printf_loc (MSG_NOTE
, vect_location
,
670 "Detected reduction.");
672 STMT_VINFO_DEF_TYPE (stmt_vinfo
) = vect_reduction_def
;
673 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt
)) =
675 /* Store the reduction cycles for possible vectorization in
677 LOOP_VINFO_REDUCTIONS (loop_vinfo
).safe_push (reduc_stmt
);
682 if (dump_enabled_p ())
683 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
684 "Unknown def-use cycle pattern.");
691 /* Function vect_analyze_scalar_cycles.
693 Examine the cross iteration def-use cycles of scalar variables, by
694 analyzing the loop-header PHIs of scalar variables. Classify each
695 cycle as one of the following: invariant, induction, reduction, unknown.
696 We do that for the loop represented by LOOP_VINFO, and also to its
697 inner-loop, if exists.
698 Examples for scalar cycles:
713 vect_analyze_scalar_cycles (loop_vec_info loop_vinfo
)
715 struct loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
717 vect_analyze_scalar_cycles_1 (loop_vinfo
, loop
);
719 /* When vectorizing an outer-loop, the inner-loop is executed sequentially.
720 Reductions in such inner-loop therefore have different properties than
721 the reductions in the nest that gets vectorized:
722 1. When vectorized, they are executed in the same order as in the original
723 scalar loop, so we can't change the order of computation when
725 2. FIXME: Inner-loop reductions can be used in the inner-loop, so the
726 current checks are too strict. */
729 vect_analyze_scalar_cycles_1 (loop_vinfo
, loop
->inner
);
732 /* Function vect_get_loop_niters.
734 Determine how many iterations the loop is executed.
735 If an expression that represents the number of iterations
736 can be constructed, place it in NUMBER_OF_ITERATIONS.
737 Return the loop exit condition. */
740 vect_get_loop_niters (struct loop
*loop
, tree
*number_of_iterations
)
744 if (dump_enabled_p ())
745 dump_printf_loc (MSG_NOTE
, vect_location
,
746 "=== get_loop_niters ===");
747 niters
= number_of_exit_cond_executions (loop
);
749 if (niters
!= NULL_TREE
750 && niters
!= chrec_dont_know
)
752 *number_of_iterations
= niters
;
754 if (dump_enabled_p ())
756 dump_printf_loc (MSG_NOTE
, vect_location
, "==> get_loop_niters:");
757 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, *number_of_iterations
);
761 return get_loop_exit_condition (loop
);
765 /* Function bb_in_loop_p
767 Used as predicate for dfs order traversal of the loop bbs. */
770 bb_in_loop_p (const_basic_block bb
, const void *data
)
772 const struct loop
*const loop
= (const struct loop
*)data
;
773 if (flow_bb_inside_loop_p (loop
, bb
))
779 /* Function new_loop_vec_info.
781 Create and initialize a new loop_vec_info struct for LOOP, as well as
782 stmt_vec_info structs for all the stmts in LOOP. */
785 new_loop_vec_info (struct loop
*loop
)
789 gimple_stmt_iterator si
;
790 unsigned int i
, nbbs
;
792 res
= (loop_vec_info
) xcalloc (1, sizeof (struct _loop_vec_info
));
793 LOOP_VINFO_LOOP (res
) = loop
;
795 bbs
= get_loop_body (loop
);
797 /* Create/Update stmt_info for all stmts in the loop. */
798 for (i
= 0; i
< loop
->num_nodes
; i
++)
800 basic_block bb
= bbs
[i
];
802 /* BBs in a nested inner-loop will have been already processed (because
803 we will have called vect_analyze_loop_form for any nested inner-loop).
804 Therefore, for stmts in an inner-loop we just want to update the
805 STMT_VINFO_LOOP_VINFO field of their stmt_info to point to the new
806 loop_info of the outer-loop we are currently considering to vectorize
807 (instead of the loop_info of the inner-loop).
808 For stmts in other BBs we need to create a stmt_info from scratch. */
809 if (bb
->loop_father
!= loop
)
812 gcc_assert (loop
->inner
&& bb
->loop_father
== loop
->inner
);
813 for (si
= gsi_start_phis (bb
); !gsi_end_p (si
); gsi_next (&si
))
815 gimple phi
= gsi_stmt (si
);
816 stmt_vec_info stmt_info
= vinfo_for_stmt (phi
);
817 loop_vec_info inner_loop_vinfo
=
818 STMT_VINFO_LOOP_VINFO (stmt_info
);
819 gcc_assert (loop
->inner
== LOOP_VINFO_LOOP (inner_loop_vinfo
));
820 STMT_VINFO_LOOP_VINFO (stmt_info
) = res
;
822 for (si
= gsi_start_bb (bb
); !gsi_end_p (si
); gsi_next (&si
))
824 gimple stmt
= gsi_stmt (si
);
825 stmt_vec_info stmt_info
= vinfo_for_stmt (stmt
);
826 loop_vec_info inner_loop_vinfo
=
827 STMT_VINFO_LOOP_VINFO (stmt_info
);
828 gcc_assert (loop
->inner
== LOOP_VINFO_LOOP (inner_loop_vinfo
));
829 STMT_VINFO_LOOP_VINFO (stmt_info
) = res
;
834 /* bb in current nest. */
835 for (si
= gsi_start_phis (bb
); !gsi_end_p (si
); gsi_next (&si
))
837 gimple phi
= gsi_stmt (si
);
838 gimple_set_uid (phi
, 0);
839 set_vinfo_for_stmt (phi
, new_stmt_vec_info (phi
, res
, NULL
));
842 for (si
= gsi_start_bb (bb
); !gsi_end_p (si
); gsi_next (&si
))
844 gimple stmt
= gsi_stmt (si
);
845 gimple_set_uid (stmt
, 0);
846 set_vinfo_for_stmt (stmt
, new_stmt_vec_info (stmt
, res
, NULL
));
851 /* CHECKME: We want to visit all BBs before their successors (except for
852 latch blocks, for which this assertion wouldn't hold). In the simple
853 case of the loop forms we allow, a dfs order of the BBs would the same
854 as reversed postorder traversal, so we are safe. */
857 bbs
= XCNEWVEC (basic_block
, loop
->num_nodes
);
858 nbbs
= dfs_enumerate_from (loop
->header
, 0, bb_in_loop_p
,
859 bbs
, loop
->num_nodes
, loop
);
860 gcc_assert (nbbs
== loop
->num_nodes
);
862 LOOP_VINFO_BBS (res
) = bbs
;
863 LOOP_VINFO_NITERS (res
) = NULL
;
864 LOOP_VINFO_NITERS_UNCHANGED (res
) = NULL
;
865 LOOP_VINFO_COST_MODEL_MIN_ITERS (res
) = 0;
866 LOOP_VINFO_VECTORIZABLE_P (res
) = 0;
867 LOOP_PEELING_FOR_ALIGNMENT (res
) = 0;
868 LOOP_VINFO_VECT_FACTOR (res
) = 0;
869 LOOP_VINFO_LOOP_NEST (res
).create (3);
870 LOOP_VINFO_DATAREFS (res
).create (10);
871 LOOP_VINFO_DDRS (res
).create (10 * 10);
872 LOOP_VINFO_UNALIGNED_DR (res
) = NULL
;
873 LOOP_VINFO_MAY_MISALIGN_STMTS (res
).create (
874 PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIGNMENT_CHECKS
));
875 LOOP_VINFO_MAY_ALIAS_DDRS (res
).create (
876 PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS
));
877 LOOP_VINFO_GROUPED_STORES (res
).create (10);
878 LOOP_VINFO_REDUCTIONS (res
).create (10);
879 LOOP_VINFO_REDUCTION_CHAINS (res
).create (10);
880 LOOP_VINFO_SLP_INSTANCES (res
).create (10);
881 LOOP_VINFO_SLP_UNROLLING_FACTOR (res
) = 1;
882 LOOP_VINFO_PEELING_HTAB (res
) = NULL
;
883 LOOP_VINFO_TARGET_COST_DATA (res
) = init_cost (loop
);
884 LOOP_VINFO_PEELING_FOR_GAPS (res
) = false;
885 LOOP_VINFO_OPERANDS_SWAPPED (res
) = false;
891 /* Function destroy_loop_vec_info.
893 Free LOOP_VINFO struct, as well as all the stmt_vec_info structs of all the
894 stmts in the loop. */
897 destroy_loop_vec_info (loop_vec_info loop_vinfo
, bool clean_stmts
)
902 gimple_stmt_iterator si
;
904 vec
<slp_instance
> slp_instances
;
905 slp_instance instance
;
911 loop
= LOOP_VINFO_LOOP (loop_vinfo
);
913 bbs
= LOOP_VINFO_BBS (loop_vinfo
);
914 nbbs
= clean_stmts
? loop
->num_nodes
: 0;
915 swapped
= LOOP_VINFO_OPERANDS_SWAPPED (loop_vinfo
);
917 for (j
= 0; j
< nbbs
; j
++)
919 basic_block bb
= bbs
[j
];
920 for (si
= gsi_start_phis (bb
); !gsi_end_p (si
); gsi_next (&si
))
921 free_stmt_vec_info (gsi_stmt (si
));
923 for (si
= gsi_start_bb (bb
); !gsi_end_p (si
); )
925 gimple stmt
= gsi_stmt (si
);
927 /* We may have broken canonical form by moving a constant
928 into RHS1 of a commutative op. Fix such occurrences. */
929 if (swapped
&& is_gimple_assign (stmt
))
931 enum tree_code code
= gimple_assign_rhs_code (stmt
);
933 if ((code
== PLUS_EXPR
934 || code
== POINTER_PLUS_EXPR
935 || code
== MULT_EXPR
)
936 && CONSTANT_CLASS_P (gimple_assign_rhs1 (stmt
)))
937 swap_tree_operands (stmt
,
938 gimple_assign_rhs1_ptr (stmt
),
939 gimple_assign_rhs2_ptr (stmt
));
942 /* Free stmt_vec_info. */
943 free_stmt_vec_info (stmt
);
948 free (LOOP_VINFO_BBS (loop_vinfo
));
949 free_data_refs (LOOP_VINFO_DATAREFS (loop_vinfo
));
950 free_dependence_relations (LOOP_VINFO_DDRS (loop_vinfo
));
951 LOOP_VINFO_LOOP_NEST (loop_vinfo
).release ();
952 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo
).release ();
953 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo
).release ();
954 slp_instances
= LOOP_VINFO_SLP_INSTANCES (loop_vinfo
);
955 FOR_EACH_VEC_ELT (slp_instances
, j
, instance
)
956 vect_free_slp_instance (instance
);
958 LOOP_VINFO_SLP_INSTANCES (loop_vinfo
).release ();
959 LOOP_VINFO_GROUPED_STORES (loop_vinfo
).release ();
960 LOOP_VINFO_REDUCTIONS (loop_vinfo
).release ();
961 LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo
).release ();
963 if (LOOP_VINFO_PEELING_HTAB (loop_vinfo
))
964 htab_delete (LOOP_VINFO_PEELING_HTAB (loop_vinfo
));
966 destroy_cost_data (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo
));
973 /* Function vect_analyze_loop_1.
975 Apply a set of analyses on LOOP, and create a loop_vec_info struct
976 for it. The different analyses will record information in the
977 loop_vec_info struct. This is a subset of the analyses applied in
978 vect_analyze_loop, to be applied on an inner-loop nested in the loop
979 that is now considered for (outer-loop) vectorization. */
982 vect_analyze_loop_1 (struct loop
*loop
)
984 loop_vec_info loop_vinfo
;
986 if (dump_enabled_p ())
987 dump_printf_loc (MSG_NOTE
, vect_location
,
988 "===== analyze_loop_nest_1 =====");
990 /* Check the CFG characteristics of the loop (nesting, entry/exit, etc. */
992 loop_vinfo
= vect_analyze_loop_form (loop
);
995 if (dump_enabled_p ())
996 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
997 "bad inner-loop form.");
1005 /* Function vect_analyze_loop_form.
1007 Verify that certain CFG restrictions hold, including:
1008 - the loop has a pre-header
1009 - the loop has a single entry and exit
1010 - the loop exit condition is simple enough, and the number of iterations
1011 can be analyzed (a countable loop). */
1014 vect_analyze_loop_form (struct loop
*loop
)
1016 loop_vec_info loop_vinfo
;
1018 tree number_of_iterations
= NULL
;
1019 loop_vec_info inner_loop_vinfo
= NULL
;
1021 if (dump_enabled_p ())
1022 dump_printf_loc (MSG_NOTE
, vect_location
,
1023 "=== vect_analyze_loop_form ===");
1025 /* Different restrictions apply when we are considering an inner-most loop,
1026 vs. an outer (nested) loop.
1027 (FORNOW. May want to relax some of these restrictions in the future). */
1031 /* Inner-most loop. We currently require that the number of BBs is
1032 exactly 2 (the header and latch). Vectorizable inner-most loops
1043 if (loop
->num_nodes
!= 2)
1045 if (dump_enabled_p ())
1046 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1047 "not vectorized: control flow in loop.");
1051 if (empty_block_p (loop
->header
))
1053 if (dump_enabled_p ())
1054 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1055 "not vectorized: empty loop.");
1061 struct loop
*innerloop
= loop
->inner
;
1064 /* Nested loop. We currently require that the loop is doubly-nested,
1065 contains a single inner loop, and the number of BBs is exactly 5.
1066 Vectorizable outer-loops look like this:
1078 The inner-loop has the properties expected of inner-most loops
1079 as described above. */
1081 if ((loop
->inner
)->inner
|| (loop
->inner
)->next
)
1083 if (dump_enabled_p ())
1084 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1085 "not vectorized: multiple nested loops.");
1089 /* Analyze the inner-loop. */
1090 inner_loop_vinfo
= vect_analyze_loop_1 (loop
->inner
);
1091 if (!inner_loop_vinfo
)
1093 if (dump_enabled_p ())
1094 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1095 "not vectorized: Bad inner loop.");
1099 if (!expr_invariant_in_loop_p (loop
,
1100 LOOP_VINFO_NITERS (inner_loop_vinfo
)))
1102 if (dump_enabled_p ())
1103 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1104 "not vectorized: inner-loop count not invariant.");
1105 destroy_loop_vec_info (inner_loop_vinfo
, true);
1109 if (loop
->num_nodes
!= 5)
1111 if (dump_enabled_p ())
1112 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1113 "not vectorized: control flow in loop.");
1114 destroy_loop_vec_info (inner_loop_vinfo
, true);
1118 gcc_assert (EDGE_COUNT (innerloop
->header
->preds
) == 2);
1119 entryedge
= EDGE_PRED (innerloop
->header
, 0);
1120 if (EDGE_PRED (innerloop
->header
, 0)->src
== innerloop
->latch
)
1121 entryedge
= EDGE_PRED (innerloop
->header
, 1);
1123 if (entryedge
->src
!= loop
->header
1124 || !single_exit (innerloop
)
1125 || single_exit (innerloop
)->dest
!= EDGE_PRED (loop
->latch
, 0)->src
)
1127 if (dump_enabled_p ())
1128 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1129 "not vectorized: unsupported outerloop form.");
1130 destroy_loop_vec_info (inner_loop_vinfo
, true);
1134 if (dump_enabled_p ())
1135 dump_printf_loc (MSG_NOTE
, vect_location
,
1136 "Considering outer-loop vectorization.");
1139 if (!single_exit (loop
)
1140 || EDGE_COUNT (loop
->header
->preds
) != 2)
1142 if (dump_enabled_p ())
1144 if (!single_exit (loop
))
1145 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1146 "not vectorized: multiple exits.");
1147 else if (EDGE_COUNT (loop
->header
->preds
) != 2)
1148 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1149 "not vectorized: too many incoming edges.");
1151 if (inner_loop_vinfo
)
1152 destroy_loop_vec_info (inner_loop_vinfo
, true);
1156 /* We assume that the loop exit condition is at the end of the loop. i.e,
1157 that the loop is represented as a do-while (with a proper if-guard
1158 before the loop if needed), where the loop header contains all the
1159 executable statements, and the latch is empty. */
1160 if (!empty_block_p (loop
->latch
)
1161 || !gimple_seq_empty_p (phi_nodes (loop
->latch
)))
1163 if (dump_enabled_p ())
1164 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1165 "not vectorized: latch block not empty.");
1166 if (inner_loop_vinfo
)
1167 destroy_loop_vec_info (inner_loop_vinfo
, true);
1171 /* Make sure there exists a single-predecessor exit bb: */
1172 if (!single_pred_p (single_exit (loop
)->dest
))
1174 edge e
= single_exit (loop
);
1175 if (!(e
->flags
& EDGE_ABNORMAL
))
1177 split_loop_exit_edge (e
);
1178 if (dump_enabled_p ())
1179 dump_printf (MSG_NOTE
, "split exit edge.");
1183 if (dump_enabled_p ())
1184 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1185 "not vectorized: abnormal loop exit edge.");
1186 if (inner_loop_vinfo
)
1187 destroy_loop_vec_info (inner_loop_vinfo
, true);
1192 loop_cond
= vect_get_loop_niters (loop
, &number_of_iterations
);
1195 if (dump_enabled_p ())
1196 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1197 "not vectorized: complicated exit condition.");
1198 if (inner_loop_vinfo
)
1199 destroy_loop_vec_info (inner_loop_vinfo
, true);
1203 if (!number_of_iterations
)
1205 if (dump_enabled_p ())
1206 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1207 "not vectorized: number of iterations cannot be "
1209 if (inner_loop_vinfo
)
1210 destroy_loop_vec_info (inner_loop_vinfo
, true);
1214 if (chrec_contains_undetermined (number_of_iterations
))
1216 if (dump_enabled_p ())
1217 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1218 "Infinite number of iterations.");
1219 if (inner_loop_vinfo
)
1220 destroy_loop_vec_info (inner_loop_vinfo
, true);
1224 if (!NITERS_KNOWN_P (number_of_iterations
))
1226 if (dump_enabled_p ())
1228 dump_printf_loc (MSG_NOTE
, vect_location
,
1229 "Symbolic number of iterations is ");
1230 dump_generic_expr (MSG_NOTE
, TDF_DETAILS
, number_of_iterations
);
1233 else if (TREE_INT_CST_LOW (number_of_iterations
) == 0)
1235 if (dump_enabled_p ())
1236 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1237 "not vectorized: number of iterations = 0.");
1238 if (inner_loop_vinfo
)
1239 destroy_loop_vec_info (inner_loop_vinfo
, true);
1243 loop_vinfo
= new_loop_vec_info (loop
);
1244 LOOP_VINFO_NITERS (loop_vinfo
) = number_of_iterations
;
1245 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo
) = number_of_iterations
;
1247 STMT_VINFO_TYPE (vinfo_for_stmt (loop_cond
)) = loop_exit_ctrl_vec_info_type
;
1249 /* CHECKME: May want to keep it around it in the future. */
1250 if (inner_loop_vinfo
)
1251 destroy_loop_vec_info (inner_loop_vinfo
, false);
1253 gcc_assert (!loop
->aux
);
1254 loop
->aux
= loop_vinfo
;
1259 /* Function vect_analyze_loop_operations.
1261 Scan the loop stmts and make sure they are all vectorizable. */
/* Scan every stmt and phi of the loop described by LOOP_VINFO and verify
   each one is vectorizable; merge the SLP unrolling factor into the
   vectorization factor; then run the cost model to decide whether
   vectorization is worthwhile.  SLP says whether SLP instances were found.
   NOTE(review): this listing has lost lines (braces, returns, some
   declarations) to extraction damage — comments describe only what the
   visible code shows.  */
1264 vect_analyze_loop_operations (loop_vec_info loop_vinfo
, bool slp
)
1266 struct loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
1267 basic_block
*bbs
= LOOP_VINFO_BBS (loop_vinfo
);
1268 int nbbs
= loop
->num_nodes
;
1269 gimple_stmt_iterator si
;
1270 unsigned int vectorization_factor
= 0;
1273 stmt_vec_info stmt_info
;
1274 bool need_to_vectorize
= false;
1275 int min_profitable_iters
;
1276 int min_scalar_loop_bound
;
1278 bool only_slp_in_loop
= true, ok
;
1279 HOST_WIDE_INT max_niter
;
1280 HOST_WIDE_INT estimated_niter
;
1281 int min_profitable_estimate
;
1283 if (dump_enabled_p ())
1284 dump_printf_loc (MSG_NOTE
, vect_location
,
1285 "=== vect_analyze_loop_operations ===");
/* The vectorization factor must already have been computed by
   vect_determine_vectorization_factor.  */
1287 gcc_assert (LOOP_VINFO_VECT_FACTOR (loop_vinfo
));
1288 vectorization_factor
= LOOP_VINFO_VECT_FACTOR (loop_vinfo
);
1291 /* If all the stmts in the loop can be SLPed, we perform only SLP, and
1292 vectorization factor of the loop is the unrolling factor required by
1293 the SLP instances. If that unrolling factor is 1, we say, that we
1294 perform pure SLP on loop - cross iteration parallelism is not
/* Pass 1 over all basic blocks: detect whether any relevant stmt needs
   loop-based (non-pure-SLP) vectorization.  */
1296 for (i
= 0; i
< nbbs
; i
++)
1298 basic_block bb
= bbs
[i
];
1299 for (si
= gsi_start_bb (bb
); !gsi_end_p (si
); gsi_next (&si
))
1301 gimple stmt
= gsi_stmt (si
);
1302 stmt_vec_info stmt_info
= vinfo_for_stmt (stmt
);
1303 gcc_assert (stmt_info
);
1304 if ((STMT_VINFO_RELEVANT_P (stmt_info
)
1305 || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info
)))
1306 && !PURE_SLP_STMT (stmt_info
))
1307 /* STMT needs both SLP and loop-based vectorization. */
1308 only_slp_in_loop
= false;
/* Pure SLP: the VF is the SLP unrolling factor; hybrid SLP: the VF is
   the least common multiple of the scalar VF and the SLP unrolling
   factor.  */
1312 if (only_slp_in_loop
)
1313 vectorization_factor
= LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo
);
1315 vectorization_factor
= least_common_multiple (vectorization_factor
,
1316 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo
));
1318 LOOP_VINFO_VECT_FACTOR (loop_vinfo
) = vectorization_factor
;
1319 if (dump_enabled_p ())
1320 dump_printf_loc (MSG_NOTE
, vect_location
,
1321 "Updating vectorization factor to %d ",
1322 vectorization_factor
);
/* Pass 2: examine each phi and each stmt for vectorizability.  */
1325 for (i
= 0; i
< nbbs
; i
++)
1327 basic_block bb
= bbs
[i
];
1329 for (si
= gsi_start_phis (bb
); !gsi_end_p (si
); gsi_next (&si
))
1331 phi
= gsi_stmt (si
);
1334 stmt_info
= vinfo_for_stmt (phi
);
1335 if (dump_enabled_p ())
1337 dump_printf_loc (MSG_NOTE
, vect_location
, "examining phi: ");
1338 dump_gimple_stmt (MSG_NOTE
, TDF_SLIM
, phi
, 0);
1341 /* Inner-loop loop-closed exit phi in outer-loop vectorization
1342 (i.e., a phi in the tail of the outer-loop). */
1343 if (! is_loop_header_bb_p (bb
))
1345 /* FORNOW: we currently don't support the case that these phis
1346 are not used in the outerloop (unless it is double reduction,
1347 i.e., this phi is vect_reduction_def), cause this case
1348 requires to actually do something here. */
1349 if ((!STMT_VINFO_RELEVANT_P (stmt_info
)
1350 || STMT_VINFO_LIVE_P (stmt_info
))
1351 && STMT_VINFO_DEF_TYPE (stmt_info
)
1352 != vect_double_reduction_def
)
1354 if (dump_enabled_p ())
1355 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1356 "Unsupported loop-closed phi in "
1361 /* If PHI is used in the outer loop, we check that its operand
1362 is defined in the inner loop. */
1363 if (STMT_VINFO_RELEVANT_P (stmt_info
))
/* A loop-closed phi must be single-argument, its argument an
   SSA name defined by a vectorizable stmt inside the loop.  */
1368 if (gimple_phi_num_args (phi
) != 1)
1371 phi_op
= PHI_ARG_DEF (phi
, 0);
1372 if (TREE_CODE (phi_op
) != SSA_NAME
)
1375 op_def_stmt
= SSA_NAME_DEF_STMT (phi_op
);
1377 || !flow_bb_inside_loop_p (loop
, gimple_bb (op_def_stmt
))
1378 || !vinfo_for_stmt (op_def_stmt
))
/* The defining stmt must be marked as used in the outer loop,
   either directly or through a reduction.  */
1381 if (STMT_VINFO_RELEVANT (vinfo_for_stmt (op_def_stmt
))
1382 != vect_used_in_outer
1383 && STMT_VINFO_RELEVANT (vinfo_for_stmt (op_def_stmt
))
1384 != vect_used_in_outer_by_reduction
)
1391 gcc_assert (stmt_info
);
1393 if (STMT_VINFO_LIVE_P (stmt_info
))
1395 /* FORNOW: not yet supported. */
1396 if (dump_enabled_p ())
1397 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1398 "not vectorized: value used after loop.");
1402 if (STMT_VINFO_RELEVANT (stmt_info
) == vect_used_in_scope
1403 && STMT_VINFO_DEF_TYPE (stmt_info
) != vect_induction_def
)
1405 /* A scalar-dependence cycle that we don't support. */
1406 if (dump_enabled_p ())
1407 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1408 "not vectorized: scalar dependence cycle.");
/* Relevant header phis must be vectorizable inductions.  */
1412 if (STMT_VINFO_RELEVANT_P (stmt_info
))
1414 need_to_vectorize
= true;
1415 if (STMT_VINFO_DEF_TYPE (stmt_info
) == vect_induction_def
)
1416 ok
= vectorizable_induction (phi
, NULL
, NULL
);
1421 if (dump_enabled_p ())
1423 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1424 "not vectorized: relevant phi not "
1426 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, phi
, 0);
/* Analyze every non-phi stmt in the block.  */
1432 for (si
= gsi_start_bb (bb
); !gsi_end_p (si
); gsi_next (&si
))
1434 gimple stmt
= gsi_stmt (si
);
1435 if (!vect_analyze_stmt (stmt
, &need_to_vectorize
, NULL
))
1440 /* All operations in the loop are either irrelevant (deal with loop
1441 control, or dead), or only used outside the loop and can be moved
1442 out of the loop (e.g. invariants, inductions). The loop can be
1443 optimized away by scalar optimizations. We're better off not
1444 touching this loop. */
1445 if (!need_to_vectorize
)
1447 if (dump_enabled_p ())
1448 dump_printf_loc (MSG_NOTE
, vect_location
,
1449 "All the computation can be taken out of the loop.");
1450 if (dump_enabled_p ())
1451 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1452 "not vectorized: redundant loop. no profit to "
1457 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo
) && dump_enabled_p ())
1458 dump_printf_loc (MSG_NOTE
, vect_location
,
1459 "vectorization_factor = %d, niters = "
1460 HOST_WIDE_INT_PRINT_DEC
, vectorization_factor
,
1461 LOOP_VINFO_INT_NITERS (loop_vinfo
));
/* Reject loops whose known (or statically bounded) iteration count is
   below the vectorization factor.  */
1463 if ((LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo
)
1464 && (LOOP_VINFO_INT_NITERS (loop_vinfo
) < vectorization_factor
))
1465 || ((max_niter
= max_stmt_executions_int (loop
)) != -1
1466 && (unsigned HOST_WIDE_INT
) max_niter
< vectorization_factor
))
1468 if (dump_enabled_p ())
1469 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1470 "not vectorized: iteration count too small.");
1471 if (dump_enabled_p ())
1472 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1473 "not vectorized: iteration count smaller than "
1474 "vectorization factor.");
1478 /* Analyze cost. Decide if worth while to vectorize. */
1480 /* Once VF is set, SLP costs should be updated since the number of created
1481 vector stmts depends on VF. */
1482 vect_update_slp_costs_according_to_vf (loop_vinfo
);
1484 vect_estimate_min_profitable_iters (loop_vinfo
, &min_profitable_iters
,
1485 &min_profitable_estimate
);
1486 LOOP_VINFO_COST_MODEL_MIN_ITERS (loop_vinfo
) = min_profitable_iters
;
/* A negative answer from the cost model means the vector version can
   never be profitable.  */
1488 if (min_profitable_iters
< 0)
1490 if (dump_enabled_p ())
1491 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1492 "not vectorized: vectorization not profitable.");
1493 if (dump_enabled_p ())
1494 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1495 "not vectorized: vector version will never be "
/* User-specified profitability bound, scaled by the VF
   (--param min-vect-loop-bound).  */
1500 min_scalar_loop_bound
= ((PARAM_VALUE (PARAM_MIN_VECT_LOOP_BOUND
)
1501 * vectorization_factor
) - 1);
1504 /* Use the cost model only if it is more conservative than user specified
/* TH is the more conservative of the cost-model answer and the user
   bound; loops with known niters <= TH are rejected.  */
1507 th
= (unsigned) min_scalar_loop_bound
;
1508 if (min_profitable_iters
1509 && (!min_scalar_loop_bound
1510 || min_profitable_iters
> min_scalar_loop_bound
))
1511 th
= (unsigned) min_profitable_iters
;
1513 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo
)
1514 && LOOP_VINFO_INT_NITERS (loop_vinfo
) <= th
)
1516 if (dump_enabled_p ())
1517 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1518 "not vectorized: vectorization not profitable.");
1519 if (dump_enabled_p ())
1520 dump_printf_loc (MSG_NOTE
, vect_location
,
1521 "not vectorized: iteration count smaller than user "
1522 "specified loop bound parameter or minimum profitable "
1523 "iterations (whichever is more conservative).");
/* Likewise for the profile-estimated iteration count versus the
   estimated-profitability threshold.  */
1527 if ((estimated_niter
= estimated_stmt_executions_int (loop
)) != -1
1528 && ((unsigned HOST_WIDE_INT
) estimated_niter
1529 <= MAX (th
, (unsigned)min_profitable_estimate
)))
1531 if (dump_enabled_p ())
1532 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1533 "not vectorized: estimated iteration count too "
1535 if (dump_enabled_p ())
1536 dump_printf_loc (MSG_NOTE
, vect_location
,
1537 "not vectorized: estimated iteration count smaller "
1538 "than specified loop bound parameter or minimum "
1539 "profitable iterations (whichever is more "
/* An epilogue loop is needed when niters is unknown, not a multiple of
   the VF, or when peeling for alignment was requested; make sure we can
   actually generate it.  */
1544 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo
)
1545 || LOOP_VINFO_INT_NITERS (loop_vinfo
) % vectorization_factor
!= 0
1546 || LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo
))
1548 if (dump_enabled_p ())
1549 dump_printf_loc (MSG_NOTE
, vect_location
, "epilog loop required.");
1550 if (!vect_can_advance_ivs_p (loop_vinfo
))
1552 if (dump_enabled_p ())
1553 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1554 "not vectorized: can't create epilog loop 1.");
1557 if (!slpeel_can_duplicate_loop_p (loop
, single_exit (loop
)))
1559 if (dump_enabled_p ())
1560 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1561 "not vectorized: can't create epilog loop 2.");
1570 /* Function vect_analyze_loop_2.
1572 Apply a set of analyses on LOOP, and create a loop_vec_info struct
1573 for it. The different analyses will record information in the
1574 loop_vec_info struct. */
/* Run the full analysis pipeline on LOOP_VINFO: data references, access
   patterns, scalar cycles, pattern recognition, relevance marking, data
   dependences, vectorization factor, alignment, run-time alias pruning,
   alignment enhancement, SLP detection/decision, and finally operation
   analysis.  Each visible failure path emits a MSG_MISSED_OPTIMIZATION
   dump.  NOTE(review): extraction damage has dropped the early-return
   lines between the dump calls in this listing.  */
1576 vect_analyze_loop_2 (loop_vec_info loop_vinfo
)
1578 bool ok
, slp
= false;
1579 int max_vf
= MAX_VECTORIZATION_FACTOR
;
1582 /* Find all data references in the loop (which correspond to vdefs/vuses)
1583 and analyze their evolution in the loop. Also adjust the minimal
1584 vectorization factor according to the loads and stores.
1586 FORNOW: Handle only simple, array references, which
1587 alignment can be forced, and aligned pointer-references. */
1589 ok
= vect_analyze_data_refs (loop_vinfo
, NULL
, &min_vf
);
1592 if (dump_enabled_p ())
1593 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1594 "bad data references.");
1598 /* Analyze the access patterns of the data-refs in the loop (consecutive,
1599 complex, etc.). FORNOW: Only handle consecutive access pattern. */
1601 ok
= vect_analyze_data_ref_accesses (loop_vinfo
, NULL
);
1604 if (dump_enabled_p ())
1605 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1606 "bad data access.");
1610 /* Classify all cross-iteration scalar data-flow cycles.
1611 Cross-iteration cycles caused by virtual phis are analyzed separately. */
1613 vect_analyze_scalar_cycles (loop_vinfo
);
/* Recognize idioms (e.g. widening ops) before relevance marking.  */
1615 vect_pattern_recog (loop_vinfo
, NULL
);
1617 /* Data-flow analysis to detect stmts that do not need to be vectorized. */
1619 ok
= vect_mark_stmts_to_be_vectorized (loop_vinfo
);
1622 if (dump_enabled_p ())
1623 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1624 "unexpected pattern.");
1628 /* Analyze data dependences between the data-refs in the loop
1629 and adjust the maximum vectorization factor according to
1631 FORNOW: fail at the first data dependence that we encounter. */
1633 ok
= vect_analyze_data_ref_dependences (loop_vinfo
, &max_vf
);
1637 if (dump_enabled_p ())
1638 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1639 "bad data dependence.");
1643 ok
= vect_determine_vectorization_factor (loop_vinfo
);
1646 if (dump_enabled_p ())
1647 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1648 "can't determine vectorization factor.");
/* The dependence-imposed maximum VF must accommodate the chosen VF.  */
1651 if (max_vf
< LOOP_VINFO_VECT_FACTOR (loop_vinfo
))
1653 if (dump_enabled_p ())
1654 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1655 "bad data dependence.");
1659 /* Analyze the alignment of the data-refs in the loop.
1660 Fail if a data reference is found that cannot be vectorized. */
1662 ok
= vect_analyze_data_refs_alignment (loop_vinfo
, NULL
);
1665 if (dump_enabled_p ())
1666 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1667 "bad data alignment.");
1671 /* Prune the list of ddrs to be tested at run-time by versioning for alias.
1672 It is important to call pruning after vect_analyze_data_ref_accesses,
1673 since we use grouping information gathered by interleaving analysis. */
1674 ok
= vect_prune_runtime_alias_test_list (loop_vinfo
);
1677 if (dump_enabled_p ())
1678 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1679 "too long list of versioning for alias "
1684 /* This pass will decide on using loop versioning and/or loop peeling in
1685 order to enhance the alignment of data references in the loop. */
1687 ok
= vect_enhance_data_refs_alignment (loop_vinfo
);
1690 if (dump_enabled_p ())
1691 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1692 "bad data alignment.");
1696 /* Check the SLP opportunities in the loop, analyze and build SLP trees. */
1697 ok
= vect_analyze_slp (loop_vinfo
, NULL
);
1700 /* Decide which possible SLP instances to SLP. */
1701 slp
= vect_make_slp_decision (loop_vinfo
);
1703 /* Find stmts that need to be both vectorized and SLPed. */
1704 vect_detect_hybrid_slp (loop_vinfo
);
1709 /* Scan all the operations in the loop and make sure they are
1712 ok
= vect_analyze_loop_operations (loop_vinfo
, slp
);
1715 if (dump_enabled_p ())
1716 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1717 "bad operation or unsupported loop bound.");
1724 /* Function vect_analyze_loop.
1726 Apply a set of analyses on LOOP, and create a loop_vec_info struct
1727 for it. The different analyses will record information in the
1728 loop_vec_info struct. */
/* Entry point of loop analysis: create and populate a loop_vec_info for
   LOOP.  Starts with an autodetected vector size (current_vector_size = 0)
   and, on analysis failure, retries the whole analysis with the next
   smaller size from targetm.vectorize.autovectorize_vector_sizes ().
   Loops nested inside an already-vectorizable outer loop are skipped.  */
1730 vect_analyze_loop (struct loop
*loop
)
1732 loop_vec_info loop_vinfo
;
1733 unsigned int vector_sizes
;
1735 /* Autodetect first vector size we try. */
1736 current_vector_size
= 0;
/* Bitmask of vector sizes the target supports for autovectorization.  */
1737 vector_sizes
= targetm
.vectorize
.autovectorize_vector_sizes ();
1739 if (dump_enabled_p ())
1740 dump_printf_loc (MSG_NOTE
, vect_location
,
1741 "===== analyze_loop_nest =====");
/* Don't re-analyze an inner loop whose outer loop was already found
   vectorizable — outer-loop vectorization subsumes it.  */
1743 if (loop_outer (loop
)
1744 && loop_vec_info_for_loop (loop_outer (loop
))
1745 && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop
))))
1747 if (dump_enabled_p ())
1748 dump_printf_loc (MSG_NOTE
, vect_location
,
1749 "outer-loop already vectorized.");
1755 /* Check the CFG characteristics of the loop (nesting, entry/exit). */
1756 loop_vinfo
= vect_analyze_loop_form (loop
)
wait
1789 /* Function reduction_code_for_scalar_code
1792 CODE - tree_code of a reduction operations.
1795 REDUC_CODE - the corresponding tree-code to be used to reduce the
1796 vector of partial results into a single scalar result (which
1797 will also reside in a vector) or ERROR_MARK if the operation is
1798 a supported reduction operation, but does not have such tree-code.
1800 Return FALSE if CODE currently cannot be vectorized as reduction. */
/* Given a scalar reduction operation CODE, store in *REDUC_CODE the tree
   code used to reduce a vector of partial results into a single scalar
   (REDUC_MAX_EXPR / REDUC_MIN_EXPR / REDUC_PLUS_EXPR), or ERROR_MARK
   when the operation is reducible but has no such tree code.
   NOTE(review): the switch/case framework selecting between these
   assignments is missing from this listing — confirm against the
   original source.  */
1803 reduction_code_for_scalar_code (enum tree_code code
,
1804 enum tree_code
*reduc_code
)
/* Presumably the MAX_EXPR case — TODO confirm (case labels dropped).  */
1809 *reduc_code
= REDUC_MAX_EXPR
;
/* Presumably the MIN_EXPR case.  */
1813 *reduc_code
= REDUC_MIN_EXPR
;
/* Presumably PLUS_EXPR (and similar) cases.  */
1817 *reduc_code
= REDUC_PLUS_EXPR
;
/* Fallback: reduction supported but no dedicated reduction tree code.  */
1825 *reduc_code
= ERROR_MARK
;
1834 /* Error reporting helper for vect_is_simple_reduction below. GIMPLE statement
1835 STMT is printed with a message MSG. */
/* Dump helper for the reduction-detection routines below: print MSG
   (tagged MSG_TYPE, e.g. MSG_NOTE or MSG_MISSED_OPTIMIZATION) at the
   current vectorization location, followed by a SLIM dump of STMT.  */
1838 report_vect_op (int msg_type
, gimple stmt
, const char *msg
)
/* Message text first ...  */
1840 dump_printf_loc (msg_type
, vect_location
, "%s", msg
);
/* ... then the statement it refers to.  */
1841 dump_gimple_stmt (msg_type
, TDF_SLIM
, stmt
, 0);
1845 /* Detect SLP reduction of the form:
1855 PHI is the reduction phi node (#a1 = phi <a5, a0> above)
1856 FIRST_STMT is the first reduction stmt in the chain
1857 (a2 = operation (a1)).
1859 Return TRUE if a reduction chain was detected. */
/* Detect an SLP reduction chain starting at FIRST_STMT, whose reduction
   phi is PHI, in the loop described by LOOP_INFO.  Follows single in-loop
   uses applying the same rhs code until the chain closes back on PHI,
   linking the stmts via GROUP_FIRST_ELEMENT/GROUP_NEXT_ELEMENT, then
   normalizes each stmt so the reduction operand is the second rhs
   (swapping operands where legal) and records the chain in
   LOOP_VINFO_REDUCTION_CHAINS.  NOTE(review): several control-flow lines
   (braces, returns) are missing from this listing.  */
1862 vect_is_slp_reduction (loop_vec_info loop_info
, gimple phi
, gimple first_stmt
)
1864 struct loop
*loop
= (gimple_bb (phi
))->loop_father
;
1865 struct loop
*vect_loop
= LOOP_VINFO_LOOP (loop_info
);
1866 enum tree_code code
;
1867 gimple current_stmt
= NULL
, loop_use_stmt
= NULL
, first
, next_stmt
;
1868 stmt_vec_info use_stmt_info
, current_stmt_info
;
1870 imm_use_iterator imm_iter
;
1871 use_operand_p use_p
;
1872 int nloop_uses
, size
= 0, n_out_of_loop_uses
;
/* Only handle the innermost-loop (non-outer-loop) vectorization case.  */
1875 if (loop
!= vect_loop
)
1878 lhs
= PHI_RESULT (phi
);
/* All stmts in the chain must use this same operation code.  */
1879 code
= gimple_assign_rhs_code (first_stmt
);
/* Walk forward: at each step count the uses of LHS and remember the
   single in-loop non-pattern use (or the closing use by PHI).  */
1883 n_out_of_loop_uses
= 0;
1884 FOR_EACH_IMM_USE_FAST (use_p
, imm_iter
, lhs
)
1886 gimple use_stmt
= USE_STMT (use_p
);
1887 if (is_gimple_debug (use_stmt
))
1890 use_stmt
= USE_STMT (use_p
);
1892 /* Check if we got back to the reduction phi. */
1893 if (use_stmt
== phi
)
1895 loop_use_stmt
= use_stmt
;
1900 if (flow_bb_inside_loop_p (loop
, gimple_bb (use_stmt
)))
1902 if (vinfo_for_stmt (use_stmt
)
1903 && !STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (use_stmt
)))
1905 loop_use_stmt
= use_stmt
;
1910 n_out_of_loop_uses
++;
1912 /* There can be either a single use in the loop or two uses in
1914 if (nloop_uses
> 1 || (n_out_of_loop_uses
&& nloop_uses
))
1921 /* We reached a statement with no loop uses. */
1922 if (nloop_uses
== 0)
1925 /* This is a loop exit phi, and we haven't reached the reduction phi. */
1926 if (gimple_code (loop_use_stmt
) == GIMPLE_PHI
)
/* Chain stmts must be assignments with the same code, inside the loop.  */
1929 if (!is_gimple_assign (loop_use_stmt
)
1930 || code
!= gimple_assign_rhs_code (loop_use_stmt
)
1931 || !flow_bb_inside_loop_p (loop
, gimple_bb (loop_use_stmt
)))
1934 /* Insert USE_STMT into reduction chain. */
1935 use_stmt_info
= vinfo_for_stmt (loop_use_stmt
);
1938 current_stmt_info
= vinfo_for_stmt (current_stmt
);
1939 GROUP_NEXT_ELEMENT (current_stmt_info
) = loop_use_stmt
;
1940 GROUP_FIRST_ELEMENT (use_stmt_info
)
1941 = GROUP_FIRST_ELEMENT (current_stmt_info
);
/* First element of the chain starts its own group.  */
1944 GROUP_FIRST_ELEMENT (use_stmt_info
) = loop_use_stmt
;
1946 lhs
= gimple_assign_lhs (loop_use_stmt
);
1947 current_stmt
= loop_use_stmt
;
/* A valid chain must close back on PHI and have at least two stmts.  */
1951 if (!found
|| loop_use_stmt
!= phi
|| size
< 2)
1954 /* Swap the operands, if needed, to make the reduction operand be the second
1956 lhs
= PHI_RESULT (phi
);
1957 next_stmt
= GROUP_FIRST_ELEMENT (vinfo_for_stmt (current_stmt
));
/* Reduction operand already on rhs2: just validate the other (rhs1)
   operand's definition and advance along the chain.  */
1960 if (gimple_assign_rhs2 (next_stmt
) == lhs
)
1962 tree op
= gimple_assign_rhs1 (next_stmt
);
1963 gimple def_stmt
= NULL
;
1965 if (TREE_CODE (op
) == SSA_NAME
)
1966 def_stmt
= SSA_NAME_DEF_STMT (op
);
1968 /* Check that the other def is either defined in the loop
1969 ("vect_internal_def"), or it's an induction (defined by a
1970 loop-header phi-node). */
1972 && gimple_bb (def_stmt
)
1973 && flow_bb_inside_loop_p (loop
, gimple_bb (def_stmt
))
1974 && (is_gimple_assign (def_stmt
)
1975 || is_gimple_call (def_stmt
)
1976 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt
))
1977 == vect_induction_def
1978 || (gimple_code (def_stmt
) == GIMPLE_PHI
1979 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt
))
1980 == vect_internal_def
1981 && !is_loop_header_bb_p (gimple_bb (def_stmt
)))))
1983 lhs
= gimple_assign_lhs (next_stmt
);
1984 next_stmt
= GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt
));
/* Reduction operand is on rhs1: validate the rhs2 operand's definition,
   then swap rhs1/rhs2 so the reduction def ends up second.  */
1992 tree op
= gimple_assign_rhs2 (next_stmt
);
1993 gimple def_stmt
= NULL
;
1995 if (TREE_CODE (op
) == SSA_NAME
)
1996 def_stmt
= SSA_NAME_DEF_STMT (op
);
1998 /* Check that the other def is either defined in the loop
1999 ("vect_internal_def"), or it's an induction (defined by a
2000 loop-header phi-node). */
2002 && gimple_bb (def_stmt
)
2003 && flow_bb_inside_loop_p (loop
, gimple_bb (def_stmt
))
2004 && (is_gimple_assign (def_stmt
)
2005 || is_gimple_call (def_stmt
)
2006 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt
))
2007 == vect_induction_def
2008 || (gimple_code (def_stmt
) == GIMPLE_PHI
2009 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt
))
2010 == vect_internal_def
2011 && !is_loop_header_bb_p (gimple_bb (def_stmt
)))))
2013 if (dump_enabled_p ())
2015 dump_printf_loc (MSG_NOTE
, vect_location
, "swapping oprnds: ");
2016 dump_gimple_stmt (MSG_NOTE
, TDF_SLIM
, next_stmt
, 0);
2019 swap_tree_operands (next_stmt
,
2020 gimple_assign_rhs1_ptr (next_stmt
),
2021 gimple_assign_rhs2_ptr (next_stmt
));
2022 update_stmt (next_stmt
);
/* Remember that operands were swapped so this can be undone if
   vectorization ultimately fails.  */
2024 if (CONSTANT_CLASS_P (gimple_assign_rhs1 (next_stmt
)))
2025 LOOP_VINFO_OPERANDS_SWAPPED (loop_info
) = true;
2031 lhs
= gimple_assign_lhs (next_stmt
);
2032 next_stmt
= GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt
));
2035 /* Save the chain for further analysis in SLP detection. */
2036 first
= GROUP_FIRST_ELEMENT (vinfo_for_stmt (current_stmt
));
2037 LOOP_VINFO_REDUCTION_CHAINS (loop_info
).safe_push (first
);
2038 GROUP_SIZE (vinfo_for_stmt (first
)) = size
;
2044 /* Function vect_is_simple_reduction_1
2046 (1) Detect a cross-iteration def-use cycle that represents a simple
2047 reduction computation. We look for the following pattern:
2052 a2 = operation (a3, a1)
2055 1. operation is commutative and associative and it is safe to
2056 change the order of the computation (if CHECK_REDUCTION is true)
2057 2. no uses for a2 in the loop (a2 is used out of the loop)
2058 3. no uses of a1 in the loop besides the reduction operation
2059 4. no uses of a1 outside the loop.
2061 Conditions 1,4 are tested here.
2062 Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.
2064 (2) Detect a cross-iteration def-use cycle in nested loops, i.e.,
2065 nested cycles, if CHECK_REDUCTION is false.
2067 (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double
2071 inner loop (def of a3)
2074 If MODIFY is true it tries also to rework the code in-place to enable
2075 detection of more reduction patterns. For the time being we rewrite
2076 "res -= RHS" into "rhs += -RHS" when it seems worthwhile.
2080 vect_is_simple_reduction_1 (loop_vec_info loop_info
, gimple phi
,
2081 bool check_reduction
, bool *double_reduc
,
2084 struct loop
*loop
= (gimple_bb (phi
))->loop_father
;
2085 struct loop
*vect_loop
= LOOP_VINFO_LOOP (loop_info
);
2086 edge latch_e
= loop_latch_edge (loop
);
2087 tree loop_arg
= PHI_ARG_DEF_FROM_EDGE (phi
, latch_e
);
2088 gimple def_stmt
, def1
= NULL
, def2
= NULL
;
2089 enum tree_code orig_code
, code
;
2090 tree op1
, op2
, op3
= NULL_TREE
, op4
= NULL_TREE
;
2094 imm_use_iterator imm_iter
;
2095 use_operand_p use_p
;
2098 *double_reduc
= false;
2100 /* If CHECK_REDUCTION is true, we assume inner-most loop vectorization,
2101 otherwise, we assume outer loop vectorization. */
2102 gcc_assert ((check_reduction
&& loop
== vect_loop
)
2103 || (!check_reduction
&& flow_loop_nested_p (vect_loop
, loop
)));
2105 name
= PHI_RESULT (phi
);
2107 FOR_EACH_IMM_USE_FAST (use_p
, imm_iter
, name
)
2109 gimple use_stmt
= USE_STMT (use_p
);
2110 if (is_gimple_debug (use_stmt
))
2113 if (!flow_bb_inside_loop_p (loop
, gimple_bb (use_stmt
)))
2115 if (dump_enabled_p ())
2116 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2117 "intermediate value used outside loop.");
2122 if (vinfo_for_stmt (use_stmt
)
2123 && !is_pattern_stmt_p (vinfo_for_stmt (use_stmt
)))
2127 if (dump_enabled_p ())
2128 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2129 "reduction used in loop.");
2134 if (TREE_CODE (loop_arg
) != SSA_NAME
)
2136 if (dump_enabled_p ())
2138 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2139 "reduction: not ssa_name: ");
2140 dump_generic_expr (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, loop_arg
);
2145 def_stmt
= SSA_NAME_DEF_STMT (loop_arg
);
2148 if (dump_enabled_p ())
2149 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2150 "reduction: no def_stmt.");
2154 if (!is_gimple_assign (def_stmt
) && gimple_code (def_stmt
) != GIMPLE_PHI
)
2156 if (dump_enabled_p ())
2157 dump_gimple_stmt (MSG_NOTE
, TDF_SLIM
, def_stmt
, 0);
2161 if (is_gimple_assign (def_stmt
))
2163 name
= gimple_assign_lhs (def_stmt
);
2168 name
= PHI_RESULT (def_stmt
);
2173 FOR_EACH_IMM_USE_FAST (use_p
, imm_iter
, name
)
2175 gimple use_stmt
= USE_STMT (use_p
);
2176 if (is_gimple_debug (use_stmt
))
2178 if (flow_bb_inside_loop_p (loop
, gimple_bb (use_stmt
))
2179 && vinfo_for_stmt (use_stmt
)
2180 && !is_pattern_stmt_p (vinfo_for_stmt (use_stmt
)))
2184 if (dump_enabled_p ())
2185 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2186 "reduction used in loop.");
2191 /* If DEF_STMT is a phi node itself, we expect it to have a single argument
2192 defined in the inner loop. */
2195 op1
= PHI_ARG_DEF (def_stmt
, 0);
2197 if (gimple_phi_num_args (def_stmt
) != 1
2198 || TREE_CODE (op1
) != SSA_NAME
)
2200 if (dump_enabled_p ())
2201 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2202 "unsupported phi node definition.");
2207 def1
= SSA_NAME_DEF_STMT (op1
);
2208 if (flow_bb_inside_loop_p (loop
, gimple_bb (def_stmt
))
2210 && flow_bb_inside_loop_p (loop
->inner
, gimple_bb (def1
))
2211 && is_gimple_assign (def1
))
2213 if (dump_enabled_p ())
2214 report_vect_op (MSG_NOTE
, def_stmt
,
2215 "detected double reduction: ");
2217 *double_reduc
= true;
2224 code
= orig_code
= gimple_assign_rhs_code (def_stmt
);
2226 /* We can handle "res -= x[i]", which is non-associative by
2227 simply rewriting this into "res += -x[i]". Avoid changing
2228 gimple instruction for the first simple tests and only do this
2229 if we're allowed to change code at all. */
2230 if (code
== MINUS_EXPR
2232 && (op1
= gimple_assign_rhs1 (def_stmt
))
2233 && TREE_CODE (op1
) == SSA_NAME
2234 && SSA_NAME_DEF_STMT (op1
) == phi
)
2238 && (!commutative_tree_code (code
) || !associative_tree_code (code
)))
2240 if (dump_enabled_p ())
2241 report_vect_op (MSG_MISSED_OPTIMIZATION
, def_stmt
,
2242 "reduction: not commutative/associative: ");
2246 if (get_gimple_rhs_class (code
) != GIMPLE_BINARY_RHS
)
2248 if (code
!= COND_EXPR
)
2250 if (dump_enabled_p ())
2251 report_vect_op (MSG_MISSED_OPTIMIZATION
, def_stmt
,
2252 "reduction: not binary operation: ");
2257 op3
= gimple_assign_rhs1 (def_stmt
);
2258 if (COMPARISON_CLASS_P (op3
))
2260 op4
= TREE_OPERAND (op3
, 1);
2261 op3
= TREE_OPERAND (op3
, 0);
2264 op1
= gimple_assign_rhs2 (def_stmt
);
2265 op2
= gimple_assign_rhs3 (def_stmt
);
2267 if (TREE_CODE (op1
) != SSA_NAME
&& TREE_CODE (op2
) != SSA_NAME
)
2269 if (dump_enabled_p ())
2270 report_vect_op (MSG_MISSED_OPTIMIZATION
, def_stmt
,
2271 "reduction: uses not ssa_names: ");
2278 op1
= gimple_assign_rhs1 (def_stmt
);
2279 op2
= gimple_assign_rhs2 (def_stmt
);
2281 if (TREE_CODE (op1
) != SSA_NAME
&& TREE_CODE (op2
) != SSA_NAME
)
2283 if (dump_enabled_p ())
2284 report_vect_op (MSG_MISSED_OPTIMIZATION
, def_stmt
,
2285 "reduction: uses not ssa_names: ");
2291 type
= TREE_TYPE (gimple_assign_lhs (def_stmt
));
2292 if ((TREE_CODE (op1
) == SSA_NAME
2293 && !types_compatible_p (type
,TREE_TYPE (op1
)))
2294 || (TREE_CODE (op2
) == SSA_NAME
2295 && !types_compatible_p (type
, TREE_TYPE (op2
)))
2296 || (op3
&& TREE_CODE (op3
) == SSA_NAME
2297 && !types_compatible_p (type
, TREE_TYPE (op3
)))
2298 || (op4
&& TREE_CODE (op4
) == SSA_NAME
2299 && !types_compatible_p (type
, TREE_TYPE (op4
))))
2301 if (dump_enabled_p ())
2303 dump_printf_loc (MSG_NOTE
, vect_location
,
2304 "reduction: multiple types: operation type: ");
2305 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, type
);
2306 dump_printf (MSG_NOTE
, ", operands types: ");
2307 dump_generic_expr (MSG_NOTE
, TDF_SLIM
,
2309 dump_printf (MSG_NOTE
, ",");
2310 dump_generic_expr (MSG_NOTE
, TDF_SLIM
,
2314 dump_printf (MSG_NOTE
, ",");
2315 dump_generic_expr (MSG_NOTE
, TDF_SLIM
,
2321 dump_printf (MSG_NOTE
, ",");
2322 dump_generic_expr (MSG_NOTE
, TDF_SLIM
,
2330 /* Check that it's ok to change the order of the computation.
2331 Generally, when vectorizing a reduction we change the order of the
2332 computation. This may change the behavior of the program in some
2333 cases, so we need to check that this is ok. One exception is when
2334 vectorizing an outer-loop: the inner-loop is executed sequentially,
2335 and therefore vectorizing reductions in the inner-loop during
2336 outer-loop vectorization is safe. */
2338 /* CHECKME: check for !flag_finite_math_only too? */
2339 if (SCALAR_FLOAT_TYPE_P (type
) && !flag_associative_math
2342 /* Changing the order of operations changes the semantics. */
2343 if (dump_enabled_p ())
2344 report_vect_op (MSG_MISSED_OPTIMIZATION
, def_stmt
,
2345 "reduction: unsafe fp math optimization: ");
2348 else if (INTEGRAL_TYPE_P (type
) && TYPE_OVERFLOW_TRAPS (type
)
2351 /* Changing the order of operations changes the semantics. */
2352 if (dump_enabled_p ())
2353 report_vect_op (MSG_MISSED_OPTIMIZATION
, def_stmt
,
2354 "reduction: unsafe int math optimization: ");
2357 else if (SAT_FIXED_POINT_TYPE_P (type
) && check_reduction
)
2359 /* Changing the order of operations changes the semantics. */
2360 if (dump_enabled_p ())
2361 report_vect_op (MSG_MISSED_OPTIMIZATION
, def_stmt
,
2362 "reduction: unsafe fixed-point math optimization: ");
2366 /* If we detected "res -= x[i]" earlier, rewrite it into
2367 "res += -x[i]" now. If this turns out to be useless reassoc
2368 will clean it up again. */
2369 if (orig_code
== MINUS_EXPR
)
2371 tree rhs
= gimple_assign_rhs2 (def_stmt
);
2372 tree negrhs
= make_ssa_name (TREE_TYPE (rhs
), NULL
);
2373 gimple negate_stmt
= gimple_build_assign_with_ops (NEGATE_EXPR
, negrhs
,
2375 gimple_stmt_iterator gsi
= gsi_for_stmt (def_stmt
);
2376 set_vinfo_for_stmt (negate_stmt
, new_stmt_vec_info (negate_stmt
,
2378 gsi_insert_before (&gsi
, negate_stmt
, GSI_NEW_STMT
);
2379 gimple_assign_set_rhs2 (def_stmt
, negrhs
);
2380 gimple_assign_set_rhs_code (def_stmt
, PLUS_EXPR
);
2381 update_stmt (def_stmt
);
2384 /* Reduction is safe. We're dealing with one of the following:
2385 1) integer arithmetic and no trapv
2386 2) floating point arithmetic, and special flags permit this optimization
2387 3) nested cycle (i.e., outer loop vectorization). */
2388 if (TREE_CODE (op1
) == SSA_NAME
)
2389 def1
= SSA_NAME_DEF_STMT (op1
);
2391 if (TREE_CODE (op2
) == SSA_NAME
)
2392 def2
= SSA_NAME_DEF_STMT (op2
);
2394 if (code
!= COND_EXPR
2395 && ((!def1
|| gimple_nop_p (def1
)) && (!def2
|| gimple_nop_p (def2
))))
2397 if (dump_enabled_p ())
2398 report_vect_op (MSG_NOTE
, def_stmt
, "reduction: no defs for operands: ");
2402 /* Check that one def is the reduction def, defined by PHI,
2403 the other def is either defined in the loop ("vect_internal_def"),
2404 or it's an induction (defined by a loop-header phi-node). */
2406 if (def2
&& def2
== phi
2407 && (code
== COND_EXPR
2408 || !def1
|| gimple_nop_p (def1
)
2409 || (def1
&& flow_bb_inside_loop_p (loop
, gimple_bb (def1
))
2410 && (is_gimple_assign (def1
)
2411 || is_gimple_call (def1
)
2412 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1
))
2413 == vect_induction_def
2414 || (gimple_code (def1
) == GIMPLE_PHI
2415 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1
))
2416 == vect_internal_def
2417 && !is_loop_header_bb_p (gimple_bb (def1
)))))))
2419 if (dump_enabled_p ())
2420 report_vect_op (MSG_NOTE
, def_stmt
, "detected reduction: ");
2424 if (def1
&& def1
== phi
2425 && (code
== COND_EXPR
2426 || !def2
|| gimple_nop_p (def2
)
2427 || (def2
&& flow_bb_inside_loop_p (loop
, gimple_bb (def2
))
2428 && (is_gimple_assign (def2
)
2429 || is_gimple_call (def2
)
2430 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2
))
2431 == vect_induction_def
2432 || (gimple_code (def2
) == GIMPLE_PHI
2433 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2
))
2434 == vect_internal_def
2435 && !is_loop_header_bb_p (gimple_bb (def2
)))))))
2437 if (check_reduction
)
2439 /* Swap operands (just for simplicity - so that the rest of the code
2440 can assume that the reduction variable is always the last (second)
2442 if (dump_enabled_p ())
2443 report_vect_op (MSG_NOTE
, def_stmt
,
2444 "detected reduction: need to swap operands: ");
2446 swap_tree_operands (def_stmt
, gimple_assign_rhs1_ptr (def_stmt
),
2447 gimple_assign_rhs2_ptr (def_stmt
));
2449 if (CONSTANT_CLASS_P (gimple_assign_rhs1 (def_stmt
)))
2450 LOOP_VINFO_OPERANDS_SWAPPED (loop_info
) = true;
2454 if (dump_enabled_p ())
2455 report_vect_op (MSG_NOTE
, def_stmt
, "detected reduction: ");
2461 /* Try to find SLP reduction chain. */
2462 if (check_reduction
&& vect_is_slp_reduction (loop_info
, phi
, def_stmt
))
2464 if (dump_enabled_p ())
2465 report_vect_op (MSG_NOTE
, def_stmt
,
2466 "reduction: detected reduction chain: ");
2471 if (dump_enabled_p ())
2472 report_vect_op (MSG_MISSED_OPTIMIZATION
, def_stmt
,
2473 "reduction: unknown pattern: ");
2478 /* Wrapper around vect_is_simple_reduction_1, that won't modify code
2479 in-place. Arguments as there. */
2482 vect_is_simple_reduction (loop_vec_info loop_info
, gimple phi
,
2483 bool check_reduction
, bool *double_reduc
)
2485 return vect_is_simple_reduction_1 (loop_info
, phi
, check_reduction
,
2486 double_reduc
, false);
2489 /* Wrapper around vect_is_simple_reduction_1, which will modify code
2490 in-place if it enables detection of more reductions. Arguments
2494 vect_force_simple_reduction (loop_vec_info loop_info
, gimple phi
,
2495 bool check_reduction
, bool *double_reduc
)
2497 return vect_is_simple_reduction_1 (loop_info
, phi
, check_reduction
,
2498 double_reduc
, true);
2501 /* Calculate the cost of one scalar iteration of the loop. */
2503 vect_get_single_scalar_iteration_cost (loop_vec_info loop_vinfo
)
2505 struct loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
2506 basic_block
*bbs
= LOOP_VINFO_BBS (loop_vinfo
);
2507 int nbbs
= loop
->num_nodes
, factor
, scalar_single_iter_cost
= 0;
2508 int innerloop_iters
, i
, stmt_cost
;
2510 /* Count statements in scalar loop. Using this as scalar cost for a single
2513 TODO: Add outer loop support.
2515 TODO: Consider assigning different costs to different scalar
2519 innerloop_iters
= 1;
2521 innerloop_iters
= 50; /* FIXME */
2523 for (i
= 0; i
< nbbs
; i
++)
2525 gimple_stmt_iterator si
;
2526 basic_block bb
= bbs
[i
];
2528 if (bb
->loop_father
== loop
->inner
)
2529 factor
= innerloop_iters
;
2533 for (si
= gsi_start_bb (bb
); !gsi_end_p (si
); gsi_next (&si
))
2535 gimple stmt
= gsi_stmt (si
);
2536 stmt_vec_info stmt_info
= vinfo_for_stmt (stmt
);
2538 if (!is_gimple_assign (stmt
) && !is_gimple_call (stmt
))
2541 /* Skip stmts that are not vectorized inside the loop. */
2543 && !STMT_VINFO_RELEVANT_P (stmt_info
)
2544 && (!STMT_VINFO_LIVE_P (stmt_info
)
2545 || !VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info
)))
2546 && !STMT_VINFO_IN_PATTERN_P (stmt_info
))
2549 if (STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt
)))
2551 if (DR_IS_READ (STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt
))))
2552 stmt_cost
= vect_get_stmt_cost (scalar_load
);
2554 stmt_cost
= vect_get_stmt_cost (scalar_store
);
2557 stmt_cost
= vect_get_stmt_cost (scalar_stmt
);
2559 scalar_single_iter_cost
+= stmt_cost
* factor
;
2562 return scalar_single_iter_cost
;
2565 /* Calculate cost of peeling the loop PEEL_ITERS_PROLOGUE times. */
2567 vect_get_known_peeling_cost (loop_vec_info loop_vinfo
, int peel_iters_prologue
,
2568 int *peel_iters_epilogue
,
2569 int scalar_single_iter_cost
,
2570 stmt_vector_for_cost
*prologue_cost_vec
,
2571 stmt_vector_for_cost
*epilogue_cost_vec
)
2574 int vf
= LOOP_VINFO_VECT_FACTOR (loop_vinfo
);
2576 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo
))
2578 *peel_iters_epilogue
= vf
/2;
2579 if (dump_enabled_p ())
2580 dump_printf_loc (MSG_NOTE
, vect_location
,
2581 "cost model: epilogue peel iters set to vf/2 "
2582 "because loop iterations are unknown .");
2584 /* If peeled iterations are known but number of scalar loop
2585 iterations are unknown, count a taken branch per peeled loop. */
2586 retval
= record_stmt_cost (prologue_cost_vec
, 2, cond_branch_taken
,
2587 NULL
, 0, vect_prologue
);
2591 int niters
= LOOP_VINFO_INT_NITERS (loop_vinfo
);
2592 peel_iters_prologue
= niters
< peel_iters_prologue
?
2593 niters
: peel_iters_prologue
;
2594 *peel_iters_epilogue
= (niters
- peel_iters_prologue
) % vf
;
2595 /* If we need to peel for gaps, but no peeling is required, we have to
2596 peel VF iterations. */
2597 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo
) && !*peel_iters_epilogue
)
2598 *peel_iters_epilogue
= vf
;
2601 if (peel_iters_prologue
)
2602 retval
+= record_stmt_cost (prologue_cost_vec
,
2603 peel_iters_prologue
* scalar_single_iter_cost
,
2604 scalar_stmt
, NULL
, 0, vect_prologue
);
2605 if (*peel_iters_epilogue
)
2606 retval
+= record_stmt_cost (epilogue_cost_vec
,
2607 *peel_iters_epilogue
* scalar_single_iter_cost
,
2608 scalar_stmt
, NULL
, 0, vect_epilogue
);
2612 /* Function vect_estimate_min_profitable_iters
2614 Return the number of iterations required for the vector version of the
2615 loop to be profitable relative to the cost of the scalar version of the
2619 vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo
,
2620 int *ret_min_profitable_niters
,
2621 int *ret_min_profitable_estimate
)
2623 int min_profitable_iters
;
2624 int min_profitable_estimate
;
2625 int peel_iters_prologue
;
2626 int peel_iters_epilogue
;
2627 unsigned vec_inside_cost
= 0;
2628 int vec_outside_cost
= 0;
2629 unsigned vec_prologue_cost
= 0;
2630 unsigned vec_epilogue_cost
= 0;
2631 int scalar_single_iter_cost
= 0;
2632 int scalar_outside_cost
= 0;
2633 int vf
= LOOP_VINFO_VECT_FACTOR (loop_vinfo
);
2634 int npeel
= LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo
);
2635 void *target_cost_data
= LOOP_VINFO_TARGET_COST_DATA (loop_vinfo
);
2637 /* Cost model disabled. */
2638 if (!flag_vect_cost_model
)
2640 dump_printf_loc (MSG_NOTE
, vect_location
, "cost model disabled.");
2641 *ret_min_profitable_niters
= 0;
2642 *ret_min_profitable_estimate
= 0;
2646 /* Requires loop versioning tests to handle misalignment. */
2647 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo
))
2649 /* FIXME: Make cost depend on complexity of individual check. */
2650 unsigned len
= LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo
).length ();
2651 (void) add_stmt_cost (target_cost_data
, len
, vector_stmt
, NULL
, 0,
2653 dump_printf (MSG_NOTE
,
2654 "cost model: Adding cost of checks for loop "
2655 "versioning to treat misalignment.\n");
2658 /* Requires loop versioning with alias checks. */
2659 if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo
))
2661 /* FIXME: Make cost depend on complexity of individual check. */
2662 unsigned len
= LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo
).length ();
2663 (void) add_stmt_cost (target_cost_data
, len
, vector_stmt
, NULL
, 0,
2665 dump_printf (MSG_NOTE
,
2666 "cost model: Adding cost of checks for loop "
2667 "versioning aliasing.\n");
2670 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo
)
2671 || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo
))
2672 (void) add_stmt_cost (target_cost_data
, 1, cond_branch_taken
, NULL
, 0,
2675 /* Count statements in scalar loop. Using this as scalar cost for a single
2678 TODO: Add outer loop support.
2680 TODO: Consider assigning different costs to different scalar
2683 scalar_single_iter_cost
= vect_get_single_scalar_iteration_cost (loop_vinfo
);
2685 /* Add additional cost for the peeled instructions in prologue and epilogue
2688 FORNOW: If we don't know the value of peel_iters for prologue or epilogue
2689 at compile-time - we assume it's vf/2 (the worst would be vf-1).
2691 TODO: Build an expression that represents peel_iters for prologue and
2692 epilogue to be used in a run-time test. */
2696 peel_iters_prologue
= vf
/2;
2697 dump_printf (MSG_NOTE
, "cost model: "
2698 "prologue peel iters set to vf/2.");
2700 /* If peeling for alignment is unknown, loop bound of main loop becomes
2702 peel_iters_epilogue
= vf
/2;
2703 dump_printf (MSG_NOTE
, "cost model: "
2704 "epilogue peel iters set to vf/2 because "
2705 "peeling for alignment is unknown.");
2707 /* If peeled iterations are unknown, count a taken branch and a not taken
2708 branch per peeled loop. Even if scalar loop iterations are known,
2709 vector iterations are not known since peeled prologue iterations are
2710 not known. Hence guards remain the same. */
2711 (void) add_stmt_cost (target_cost_data
, 2, cond_branch_taken
,
2712 NULL
, 0, vect_prologue
);
2713 (void) add_stmt_cost (target_cost_data
, 2, cond_branch_not_taken
,
2714 NULL
, 0, vect_prologue
);
2715 /* FORNOW: Don't attempt to pass individual scalar instructions to
2716 the model; just assume linear cost for scalar iterations. */
2717 (void) add_stmt_cost (target_cost_data
,
2718 peel_iters_prologue
* scalar_single_iter_cost
,
2719 scalar_stmt
, NULL
, 0, vect_prologue
);
2720 (void) add_stmt_cost (target_cost_data
,
2721 peel_iters_epilogue
* scalar_single_iter_cost
,
2722 scalar_stmt
, NULL
, 0, vect_epilogue
);
2726 stmt_vector_for_cost prologue_cost_vec
, epilogue_cost_vec
;
2727 stmt_info_for_cost
*si
;
2729 void *data
= LOOP_VINFO_TARGET_COST_DATA (loop_vinfo
);
2731 prologue_cost_vec
.create (2);
2732 epilogue_cost_vec
.create (2);
2733 peel_iters_prologue
= npeel
;
2735 (void) vect_get_known_peeling_cost (loop_vinfo
, peel_iters_prologue
,
2736 &peel_iters_epilogue
,
2737 scalar_single_iter_cost
,
2739 &epilogue_cost_vec
);
2741 FOR_EACH_VEC_ELT (prologue_cost_vec
, j
, si
)
2743 struct _stmt_vec_info
*stmt_info
2744 = si
->stmt
? vinfo_for_stmt (si
->stmt
) : NULL
;
2745 (void) add_stmt_cost (data
, si
->count
, si
->kind
, stmt_info
,
2746 si
->misalign
, vect_prologue
);
2749 FOR_EACH_VEC_ELT (epilogue_cost_vec
, j
, si
)
2751 struct _stmt_vec_info
*stmt_info
2752 = si
->stmt
? vinfo_for_stmt (si
->stmt
) : NULL
;
2753 (void) add_stmt_cost (data
, si
->count
, si
->kind
, stmt_info
,
2754 si
->misalign
, vect_epilogue
);
2757 prologue_cost_vec
.release ();
2758 epilogue_cost_vec
.release ();
2761 /* FORNOW: The scalar outside cost is incremented in one of the
2764 1. The vectorizer checks for alignment and aliasing and generates
2765 a condition that allows dynamic vectorization. A cost model
2766 check is ANDED with the versioning condition. Hence scalar code
2767 path now has the added cost of the versioning check.
2769 if (cost > th & versioning_check)
2772 Hence run-time scalar is incremented by not-taken branch cost.
2774 2. The vectorizer then checks if a prologue is required. If the
2775 cost model check was not done before during versioning, it has to
2776 be done before the prologue check.
2779 prologue = scalar_iters
2784 if (prologue == num_iters)
2787 Hence the run-time scalar cost is incremented by a taken branch,
2788 plus a not-taken branch, plus a taken branch cost.
2790 3. The vectorizer then checks if an epilogue is required. If the
2791 cost model check was not done before during prologue check, it
2792 has to be done with the epilogue check.
2798 if (prologue == num_iters)
2801 if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
2804 Hence the run-time scalar cost should be incremented by 2 taken
2807 TODO: The back end may reorder the BBS's differently and reverse
2808 conditions/branch directions. Change the estimates below to
2809 something more reasonable. */
2811 /* If the number of iterations is known and we do not do versioning, we can
2812 decide whether to vectorize at compile time. Hence the scalar version
2813 do not carry cost model guard costs. */
2814 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo
)
2815 || LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo
)
2816 || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo
))
2818 /* Cost model check occurs at versioning. */
2819 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo
)
2820 || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo
))
2821 scalar_outside_cost
+= vect_get_stmt_cost (cond_branch_not_taken
);
2824 /* Cost model check occurs at prologue generation. */
2825 if (LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo
) < 0)
2826 scalar_outside_cost
+= 2 * vect_get_stmt_cost (cond_branch_taken
)
2827 + vect_get_stmt_cost (cond_branch_not_taken
);
2828 /* Cost model check occurs at epilogue generation. */
2830 scalar_outside_cost
+= 2 * vect_get_stmt_cost (cond_branch_taken
);
2834 /* Complete the target-specific cost calculations. */
2835 finish_cost (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo
), &vec_prologue_cost
,
2836 &vec_inside_cost
, &vec_epilogue_cost
);
2838 vec_outside_cost
= (int)(vec_prologue_cost
+ vec_epilogue_cost
);
2840 /* Calculate number of iterations required to make the vector version
2841 profitable, relative to the loop bodies only. The following condition
2843 SIC * niters + SOC > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC
2845 SIC = scalar iteration cost, VIC = vector iteration cost,
2846 VOC = vector outside cost, VF = vectorization factor,
2847 PL_ITERS = prologue iterations, EP_ITERS= epilogue iterations
2848 SOC = scalar outside cost for run time cost model check. */
2850 if ((scalar_single_iter_cost
* vf
) > (int) vec_inside_cost
)
2852 if (vec_outside_cost
<= 0)
2853 min_profitable_iters
= 1;
2856 min_profitable_iters
= ((vec_outside_cost
- scalar_outside_cost
) * vf
2857 - vec_inside_cost
* peel_iters_prologue
2858 - vec_inside_cost
* peel_iters_epilogue
)
2859 / ((scalar_single_iter_cost
* vf
)
2862 if ((scalar_single_iter_cost
* vf
* min_profitable_iters
)
2863 <= (((int) vec_inside_cost
* min_profitable_iters
)
2864 + (((int) vec_outside_cost
- scalar_outside_cost
) * vf
)))
2865 min_profitable_iters
++;
2868 /* vector version will never be profitable. */
2871 if (dump_enabled_p ())
2872 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2873 "cost model: the vector iteration cost = %d "
2874 "divided by the scalar iteration cost = %d "
2875 "is greater or equal to the vectorization factor = %d.",
2876 vec_inside_cost
, scalar_single_iter_cost
, vf
);
2877 *ret_min_profitable_niters
= -1;
2878 *ret_min_profitable_estimate
= -1;
2882 if (dump_enabled_p ())
2884 dump_printf_loc (MSG_NOTE
, vect_location
, "Cost model analysis: \n");
2885 dump_printf (MSG_NOTE
, " Vector inside of loop cost: %d\n",
2887 dump_printf (MSG_NOTE
, " Vector prologue cost: %d\n",
2889 dump_printf (MSG_NOTE
, " Vector epilogue cost: %d\n",
2891 dump_printf (MSG_NOTE
, " Scalar iteration cost: %d\n",
2892 scalar_single_iter_cost
);
2893 dump_printf (MSG_NOTE
, " Scalar outside cost: %d\n",
2894 scalar_outside_cost
);
2895 dump_printf (MSG_NOTE
, " Vector outside cost: %d\n",
2897 dump_printf (MSG_NOTE
, " prologue iterations: %d\n",
2898 peel_iters_prologue
);
2899 dump_printf (MSG_NOTE
, " epilogue iterations: %d\n",
2900 peel_iters_epilogue
);
2901 dump_printf (MSG_NOTE
,
2902 " Calculated minimum iters for profitability: %d\n",
2903 min_profitable_iters
);
2906 min_profitable_iters
=
2907 min_profitable_iters
< vf
? vf
: min_profitable_iters
;
2909 /* Because the condition we create is:
2910 if (niters <= min_profitable_iters)
2911 then skip the vectorized loop. */
2912 min_profitable_iters
--;
2914 if (dump_enabled_p ())
2915 dump_printf_loc (MSG_NOTE
, vect_location
,
2916 " Runtime profitability threshold = %d\n", min_profitable_iters
);
2918 *ret_min_profitable_niters
= min_profitable_iters
;
2920 /* Calculate number of iterations required to make the vector version
2921 profitable, relative to the loop bodies only.
2923 Non-vectorized variant is SIC * niters and it must win over vector
2924 variant on the expected loop trip count. The following condition must hold true:
2925 SIC * niters > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC + SOC */
2927 if (vec_outside_cost
<= 0)
2928 min_profitable_estimate
= 1;
2931 min_profitable_estimate
= ((vec_outside_cost
+ scalar_outside_cost
) * vf
2932 - vec_inside_cost
* peel_iters_prologue
2933 - vec_inside_cost
* peel_iters_epilogue
)
2934 / ((scalar_single_iter_cost
* vf
)
2937 min_profitable_estimate
--;
2938 min_profitable_estimate
= MAX (min_profitable_estimate
, min_profitable_iters
);
2939 if (dump_enabled_p ())
2940 dump_printf_loc (MSG_NOTE
, vect_location
,
2941 " Static estimate profitability threshold = %d\n",
2942 min_profitable_iters
);
2944 *ret_min_profitable_estimate
= min_profitable_estimate
;
2948 /* TODO: Close dependency between vect_model_*_cost and vectorizable_*
2949 functions. Design better to avoid maintenance issues. */
2951 /* Function vect_model_reduction_cost.
2953 Models cost for a reduction operation, including the vector ops
2954 generated within the strip-mine loop, the initial definition before
2955 the loop, and the epilogue code that must be generated. */
2958 vect_model_reduction_cost (stmt_vec_info stmt_info
, enum tree_code reduc_code
,
2961 int prologue_cost
= 0, epilogue_cost
= 0;
2962 enum tree_code code
;
2965 gimple stmt
, orig_stmt
;
2967 enum machine_mode mode
;
2968 loop_vec_info loop_vinfo
= STMT_VINFO_LOOP_VINFO (stmt_info
);
2969 struct loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
2970 void *target_cost_data
= LOOP_VINFO_TARGET_COST_DATA (loop_vinfo
);
2972 /* Cost of reduction op inside loop. */
2973 unsigned inside_cost
= add_stmt_cost (target_cost_data
, ncopies
, vector_stmt
,
2974 stmt_info
, 0, vect_body
);
2975 stmt
= STMT_VINFO_STMT (stmt_info
);
2977 switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt
)))
2979 case GIMPLE_SINGLE_RHS
:
2980 gcc_assert (TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt
)) == ternary_op
);
2981 reduction_op
= TREE_OPERAND (gimple_assign_rhs1 (stmt
), 2);
2983 case GIMPLE_UNARY_RHS
:
2984 reduction_op
= gimple_assign_rhs1 (stmt
);
2986 case GIMPLE_BINARY_RHS
:
2987 reduction_op
= gimple_assign_rhs2 (stmt
);
2989 case GIMPLE_TERNARY_RHS
:
2990 reduction_op
= gimple_assign_rhs3 (stmt
);
2996 vectype
= get_vectype_for_scalar_type (TREE_TYPE (reduction_op
));
2999 if (dump_enabled_p ())
3001 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3002 "unsupported data-type ");
3003 dump_generic_expr (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
,
3004 TREE_TYPE (reduction_op
));
3009 mode
= TYPE_MODE (vectype
);
3010 orig_stmt
= STMT_VINFO_RELATED_STMT (stmt_info
);
3013 orig_stmt
= STMT_VINFO_STMT (stmt_info
);
3015 code
= gimple_assign_rhs_code (orig_stmt
);
3017 /* Add in cost for initial definition. */
3018 prologue_cost
+= add_stmt_cost (target_cost_data
, 1, scalar_to_vec
,
3019 stmt_info
, 0, vect_prologue
);
3021 /* Determine cost of epilogue code.
3023 We have a reduction operator that will reduce the vector in one statement.
3024 Also requires scalar extract. */
3026 if (!nested_in_vect_loop_p (loop
, orig_stmt
))
3028 if (reduc_code
!= ERROR_MARK
)
3030 epilogue_cost
+= add_stmt_cost (target_cost_data
, 1, vector_stmt
,
3031 stmt_info
, 0, vect_epilogue
);
3032 epilogue_cost
+= add_stmt_cost (target_cost_data
, 1, vec_to_scalar
,
3033 stmt_info
, 0, vect_epilogue
);
3037 int vec_size_in_bits
= tree_low_cst (TYPE_SIZE (vectype
), 1);
3039 TYPE_SIZE (TREE_TYPE (gimple_assign_lhs (orig_stmt
)));
3040 int element_bitsize
= tree_low_cst (bitsize
, 1);
3041 int nelements
= vec_size_in_bits
/ element_bitsize
;
3043 optab
= optab_for_tree_code (code
, vectype
, optab_default
);
3045 /* We have a whole vector shift available. */
3046 if (VECTOR_MODE_P (mode
)
3047 && optab_handler (optab
, mode
) != CODE_FOR_nothing
3048 && optab_handler (vec_shr_optab
, mode
) != CODE_FOR_nothing
)
3050 /* Final reduction via vector shifts and the reduction operator.
3051 Also requires scalar extract. */
3052 epilogue_cost
+= add_stmt_cost (target_cost_data
,
3053 exact_log2 (nelements
) * 2,
3054 vector_stmt
, stmt_info
, 0,
3056 epilogue_cost
+= add_stmt_cost (target_cost_data
, 1,
3057 vec_to_scalar
, stmt_info
, 0,
3061 /* Use extracts and reduction op for final reduction. For N
3062 elements, we have N extracts and N-1 reduction ops. */
3063 epilogue_cost
+= add_stmt_cost (target_cost_data
,
3064 nelements
+ nelements
- 1,
3065 vector_stmt
, stmt_info
, 0,
3070 if (dump_enabled_p ())
3071 dump_printf (MSG_NOTE
,
3072 "vect_model_reduction_cost: inside_cost = %d, "
3073 "prologue_cost = %d, epilogue_cost = %d .", inside_cost
,
3074 prologue_cost
, epilogue_cost
);
3080 /* Function vect_model_induction_cost.
3082 Models cost for induction operations. */
3085 vect_model_induction_cost (stmt_vec_info stmt_info
, int ncopies
)
3087 loop_vec_info loop_vinfo
= STMT_VINFO_LOOP_VINFO (stmt_info
);
3088 void *target_cost_data
= LOOP_VINFO_TARGET_COST_DATA (loop_vinfo
);
3089 unsigned inside_cost
, prologue_cost
;
3091 /* loop cost for vec_loop. */
3092 inside_cost
= add_stmt_cost (target_cost_data
, ncopies
, vector_stmt
,
3093 stmt_info
, 0, vect_body
);
3095 /* prologue cost for vec_init and vec_step. */
3096 prologue_cost
= add_stmt_cost (target_cost_data
, 2, scalar_to_vec
,
3097 stmt_info
, 0, vect_prologue
);
3099 if (dump_enabled_p ())
3100 dump_printf_loc (MSG_NOTE
, vect_location
,
3101 "vect_model_induction_cost: inside_cost = %d, "
3102 "prologue_cost = %d .", inside_cost
, prologue_cost
);
3106 /* Function get_initial_def_for_induction
3109 STMT - a stmt that performs an induction operation in the loop.
3110 IV_PHI - the initial value of the induction variable
3113 Return a vector variable, initialized with the first VF values of
3114 the induction variable. E.g., for an iv with IV_PHI='X' and
3115 evolution S, for a vector of 4 units, we want to return:
3116 [X, X + S, X + 2*S, X + 3*S]. */
3119 get_initial_def_for_induction (gimple iv_phi
)
3121 stmt_vec_info stmt_vinfo
= vinfo_for_stmt (iv_phi
);
3122 loop_vec_info loop_vinfo
= STMT_VINFO_LOOP_VINFO (stmt_vinfo
);
3123 struct loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
3127 edge pe
= loop_preheader_edge (loop
);
3128 struct loop
*iv_loop
;
3130 tree new_vec
, vec_init
, vec_step
, t
;
3134 gimple init_stmt
, induction_phi
, new_stmt
;
3135 tree induc_def
, vec_def
, vec_dest
;
3136 tree init_expr
, step_expr
;
3137 int vf
= LOOP_VINFO_VECT_FACTOR (loop_vinfo
);
3142 stmt_vec_info phi_info
= vinfo_for_stmt (iv_phi
);
3143 bool nested_in_vect_loop
= false;
3144 gimple_seq stmts
= NULL
;
3145 imm_use_iterator imm_iter
;
3146 use_operand_p use_p
;
3150 gimple_stmt_iterator si
;
3151 basic_block bb
= gimple_bb (iv_phi
);
3155 /* Is phi in an inner-loop, while vectorizing an enclosing outer-loop? */
3156 if (nested_in_vect_loop_p (loop
, iv_phi
))
3158 nested_in_vect_loop
= true;
3159 iv_loop
= loop
->inner
;
3163 gcc_assert (iv_loop
== (gimple_bb (iv_phi
))->loop_father
);
3165 latch_e
= loop_latch_edge (iv_loop
);
3166 loop_arg
= PHI_ARG_DEF_FROM_EDGE (iv_phi
, latch_e
);
3168 access_fn
= analyze_scalar_evolution (iv_loop
, PHI_RESULT (iv_phi
));
3169 gcc_assert (access_fn
);
3170 STRIP_NOPS (access_fn
);
3171 ok
= vect_is_simple_iv_evolution (iv_loop
->num
, access_fn
,
3172 &init_expr
, &step_expr
);
3174 pe
= loop_preheader_edge (iv_loop
);
3176 scalar_type
= TREE_TYPE (init_expr
);
3177 vectype
= get_vectype_for_scalar_type (scalar_type
);
3178 resvectype
= get_vectype_for_scalar_type (TREE_TYPE (PHI_RESULT (iv_phi
)));
3179 gcc_assert (vectype
);
3180 nunits
= TYPE_VECTOR_SUBPARTS (vectype
);
3181 ncopies
= vf
/ nunits
;
3183 gcc_assert (phi_info
);
3184 gcc_assert (ncopies
>= 1);
3186 /* Find the first insertion point in the BB. */
3187 si
= gsi_after_labels (bb
);
3189 /* Create the vector that holds the initial_value of the induction. */
3190 if (nested_in_vect_loop
)
3192 /* iv_loop is nested in the loop to be vectorized. init_expr had already
3193 been created during vectorization of previous stmts. We obtain it
3194 from the STMT_VINFO_VEC_STMT of the defining stmt. */
3195 tree iv_def
= PHI_ARG_DEF_FROM_EDGE (iv_phi
,
3196 loop_preheader_edge (iv_loop
));
3197 vec_init
= vect_get_vec_def_for_operand (iv_def
, iv_phi
, NULL
);
3198 /* If the initial value is not of proper type, convert it. */
3199 if (!useless_type_conversion_p (vectype
, TREE_TYPE (vec_init
)))
3201 new_stmt
= gimple_build_assign_with_ops
3203 vect_get_new_vect_var (vectype
, vect_simple_var
, "vec_iv_"),
3204 build1 (VIEW_CONVERT_EXPR
, vectype
, vec_init
), NULL_TREE
);
3205 vec_init
= make_ssa_name (gimple_assign_lhs (new_stmt
), new_stmt
);
3206 gimple_assign_set_lhs (new_stmt
, vec_init
);
3207 new_bb
= gsi_insert_on_edge_immediate (loop_preheader_edge (iv_loop
),
3209 gcc_assert (!new_bb
);
3210 set_vinfo_for_stmt (new_stmt
,
3211 new_stmt_vec_info (new_stmt
, loop_vinfo
, NULL
));
3216 vec
<constructor_elt
, va_gc
> *v
;
3218 /* iv_loop is the loop to be vectorized. Create:
3219 vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr) */
3220 new_var
= vect_get_new_vect_var (scalar_type
, vect_scalar_var
, "var_");
3221 new_name
= force_gimple_operand (init_expr
, &stmts
, false, new_var
);
3224 new_bb
= gsi_insert_seq_on_edge_immediate (pe
, stmts
);
3225 gcc_assert (!new_bb
);
3228 vec_alloc (v
, nunits
);
3229 CONSTRUCTOR_APPEND_ELT (v
, NULL_TREE
, new_name
);
3230 for (i
= 1; i
< nunits
; i
++)
3232 /* Create: new_name_i = new_name + step_expr */
3233 enum tree_code code
= POINTER_TYPE_P (scalar_type
)
3234 ? POINTER_PLUS_EXPR
: PLUS_EXPR
;
3235 init_stmt
= gimple_build_assign_with_ops (code
, new_var
,
3236 new_name
, step_expr
);
3237 new_name
= make_ssa_name (new_var
, init_stmt
);
3238 gimple_assign_set_lhs (init_stmt
, new_name
);
3240 new_bb
= gsi_insert_on_edge_immediate (pe
, init_stmt
);
3241 gcc_assert (!new_bb
);
3243 if (dump_enabled_p ())
3245 dump_printf_loc (MSG_NOTE
, vect_location
,
3246 "created new init_stmt: ");
3247 dump_gimple_stmt (MSG_NOTE
, TDF_SLIM
, init_stmt
, 0);
3249 CONSTRUCTOR_APPEND_ELT (v
, NULL_TREE
, new_name
);
3251 /* Create a vector from [new_name_0, new_name_1, ..., new_name_nunits-1] */
3252 new_vec
= build_constructor (vectype
, v
);
3253 vec_init
= vect_init_vector (iv_phi
, new_vec
, vectype
, NULL
);
3257 /* Create the vector that holds the step of the induction. */
3258 if (nested_in_vect_loop
)
3259 /* iv_loop is nested in the loop to be vectorized. Generate:
3260 vec_step = [S, S, S, S] */
3261 new_name
= step_expr
;
3264 /* iv_loop is the loop to be vectorized. Generate:
3265 vec_step = [VF*S, VF*S, VF*S, VF*S] */
3266 expr
= build_int_cst (TREE_TYPE (step_expr
), vf
);
3267 new_name
= fold_build2 (MULT_EXPR
, TREE_TYPE (step_expr
),
3271 t
= unshare_expr (new_name
);
3272 gcc_assert (CONSTANT_CLASS_P (new_name
));
3273 stepvectype
= get_vectype_for_scalar_type (TREE_TYPE (new_name
));
3274 gcc_assert (stepvectype
);
3275 new_vec
= build_vector_from_val (stepvectype
, t
);
3276 vec_step
= vect_init_vector (iv_phi
, new_vec
, stepvectype
, NULL
);
3279 /* Create the following def-use cycle:
3284 vec_iv = PHI <vec_init, vec_loop>
3288 vec_loop = vec_iv + vec_step; */
3290 /* Create the induction-phi that defines the induction-operand. */
3291 vec_dest
= vect_get_new_vect_var (vectype
, vect_simple_var
, "vec_iv_");
3292 induction_phi
= create_phi_node (vec_dest
, iv_loop
->header
);
3293 set_vinfo_for_stmt (induction_phi
,
3294 new_stmt_vec_info (induction_phi
, loop_vinfo
, NULL
));
3295 induc_def
= PHI_RESULT (induction_phi
);
3297 /* Create the iv update inside the loop */
3298 new_stmt
= gimple_build_assign_with_ops (PLUS_EXPR
, vec_dest
,
3299 induc_def
, vec_step
);
3300 vec_def
= make_ssa_name (vec_dest
, new_stmt
);
3301 gimple_assign_set_lhs (new_stmt
, vec_def
);
3302 gsi_insert_before (&si
, new_stmt
, GSI_SAME_STMT
);
3303 set_vinfo_for_stmt (new_stmt
, new_stmt_vec_info (new_stmt
, loop_vinfo
,
3306 /* Set the arguments of the phi node: */
3307 add_phi_arg (induction_phi
, vec_init
, pe
, UNKNOWN_LOCATION
);
3308 add_phi_arg (induction_phi
, vec_def
, loop_latch_edge (iv_loop
),
3312 /* In case that vectorization factor (VF) is bigger than the number
3313 of elements that we can fit in a vectype (nunits), we have to generate
3314 more than one vector stmt - i.e - we need to "unroll" the
3315 vector stmt by a factor VF/nunits. For more details see documentation
3316 in vectorizable_operation. */
3320 stmt_vec_info prev_stmt_vinfo
;
3321 /* FORNOW. This restriction should be relaxed. */
3322 gcc_assert (!nested_in_vect_loop
);
3324 /* Create the vector that holds the step of the induction. */
3325 expr
= build_int_cst (TREE_TYPE (step_expr
), nunits
);
3326 new_name
= fold_build2 (MULT_EXPR
, TREE_TYPE (step_expr
),
3328 t
= unshare_expr (new_name
);
3329 gcc_assert (CONSTANT_CLASS_P (new_name
));
3330 new_vec
= build_vector_from_val (stepvectype
, t
);
3331 vec_step
= vect_init_vector (iv_phi
, new_vec
, stepvectype
, NULL
);
3333 vec_def
= induc_def
;
3334 prev_stmt_vinfo
= vinfo_for_stmt (induction_phi
);
3335 for (i
= 1; i
< ncopies
; i
++)
3337 /* vec_i = vec_prev + vec_step */
3338 new_stmt
= gimple_build_assign_with_ops (PLUS_EXPR
, vec_dest
,
3340 vec_def
= make_ssa_name (vec_dest
, new_stmt
);
3341 gimple_assign_set_lhs (new_stmt
, vec_def
);
3343 gsi_insert_before (&si
, new_stmt
, GSI_SAME_STMT
);
3344 if (!useless_type_conversion_p (resvectype
, vectype
))
3346 new_stmt
= gimple_build_assign_with_ops
3348 vect_get_new_vect_var (resvectype
, vect_simple_var
,
3350 build1 (VIEW_CONVERT_EXPR
, resvectype
,
3351 gimple_assign_lhs (new_stmt
)), NULL_TREE
);
3352 gimple_assign_set_lhs (new_stmt
,
3354 (gimple_assign_lhs (new_stmt
), new_stmt
));
3355 gsi_insert_before (&si
, new_stmt
, GSI_SAME_STMT
);
3357 set_vinfo_for_stmt (new_stmt
,
3358 new_stmt_vec_info (new_stmt
, loop_vinfo
, NULL
));
3359 STMT_VINFO_RELATED_STMT (prev_stmt_vinfo
) = new_stmt
;
3360 prev_stmt_vinfo
= vinfo_for_stmt (new_stmt
);
3364 if (nested_in_vect_loop
)
3366 /* Find the loop-closed exit-phi of the induction, and record
3367 the final vector of induction results: */
3369 FOR_EACH_IMM_USE_FAST (use_p
, imm_iter
, loop_arg
)
3371 if (!flow_bb_inside_loop_p (iv_loop
, gimple_bb (USE_STMT (use_p
))))
3373 exit_phi
= USE_STMT (use_p
);
3379 stmt_vec_info stmt_vinfo
= vinfo_for_stmt (exit_phi
);
3380 /* FORNOW. Currently not supporting the case that an inner-loop induction
3381 is not used in the outer-loop (i.e. only outside the outer-loop). */
3382 gcc_assert (STMT_VINFO_RELEVANT_P (stmt_vinfo
)
3383 && !STMT_VINFO_LIVE_P (stmt_vinfo
));
3385 STMT_VINFO_VEC_STMT (stmt_vinfo
) = new_stmt
;
3386 if (dump_enabled_p ())
3388 dump_printf_loc (MSG_NOTE
, vect_location
,
3389 "vector of inductions after inner-loop:");
3390 dump_gimple_stmt (MSG_NOTE
, TDF_SLIM
, new_stmt
, 0);
3396 if (dump_enabled_p ())
3398 dump_printf_loc (MSG_NOTE
, vect_location
,
3399 "transform induction: created def-use cycle: ");
3400 dump_gimple_stmt (MSG_NOTE
, TDF_SLIM
, induction_phi
, 0);
3401 dump_printf (MSG_NOTE
, "\n");
3402 dump_gimple_stmt (MSG_NOTE
, TDF_SLIM
,
3403 SSA_NAME_DEF_STMT (vec_def
), 0);
3406 STMT_VINFO_VEC_STMT (phi_info
) = induction_phi
;
3407 if (!useless_type_conversion_p (resvectype
, vectype
))
3409 new_stmt
= gimple_build_assign_with_ops
3411 vect_get_new_vect_var (resvectype
, vect_simple_var
, "vec_iv_"),
3412 build1 (VIEW_CONVERT_EXPR
, resvectype
, induc_def
), NULL_TREE
);
3413 induc_def
= make_ssa_name (gimple_assign_lhs (new_stmt
), new_stmt
);
3414 gimple_assign_set_lhs (new_stmt
, induc_def
);
3415 si
= gsi_after_labels (bb
);
3416 gsi_insert_before (&si
, new_stmt
, GSI_SAME_STMT
);
3417 set_vinfo_for_stmt (new_stmt
,
3418 new_stmt_vec_info (new_stmt
, loop_vinfo
, NULL
));
3419 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (new_stmt
))
3420 = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (induction_phi
));
3427 /* Function get_initial_def_for_reduction
3430 STMT - a stmt that performs a reduction operation in the loop.
3431 INIT_VAL - the initial value of the reduction variable
3434 ADJUSTMENT_DEF - a tree that holds a value to be added to the final result
3435 of the reduction (used for adjusting the epilog - see below).
3436 Return a vector variable, initialized according to the operation that STMT
3437 performs. This vector will be used as the initial value of the
3438 vector of partial results.
3440 Option1 (adjust in epilog): Initialize the vector as follows:
3441 add/bit or/xor: [0,0,...,0,0]
3442 mult/bit and: [1,1,...,1,1]
3443 min/max/cond_expr: [init_val,init_val,..,init_val,init_val]
3444 and when necessary (e.g. add/mult case) let the caller know
3445 that it needs to adjust the result by init_val.
3447 Option2: Initialize the vector as follows:
3448 add/bit or/xor: [init_val,0,0,...,0]
3449 mult/bit and: [init_val,1,1,...,1]
3450 min/max/cond_expr: [init_val,init_val,...,init_val]
3451 and no adjustments are needed.
3453 For example, for the following code:
3459 STMT is 's = s + a[i]', and the reduction variable is 's'.
3460 For a vector of 4 units, we want to return either [0,0,0,init_val],
3461 or [0,0,0,0] and let the caller know that it needs to adjust
3462 the result at the end by 'init_val'.
3464 FORNOW, we are using the 'adjust in epilog' scheme, because this way the
3465 initialization vector is simpler (same element in all entries), if
3466 ADJUSTMENT_DEF is not NULL, and Option2 otherwise.
3468 A cost model should help decide between these two schemes. */
3471 get_initial_def_for_reduction (gimple stmt
, tree init_val
,
3472 tree
*adjustment_def
)
3474 stmt_vec_info stmt_vinfo
= vinfo_for_stmt (stmt
);
3475 loop_vec_info loop_vinfo
= STMT_VINFO_LOOP_VINFO (stmt_vinfo
);
3476 struct loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
3477 tree scalar_type
= TREE_TYPE (init_val
);
3478 tree vectype
= get_vectype_for_scalar_type (scalar_type
);
3480 enum tree_code code
= gimple_assign_rhs_code (stmt
);
3485 bool nested_in_vect_loop
= false;
3487 REAL_VALUE_TYPE real_init_val
= dconst0
;
3488 int int_init_val
= 0;
3489 gimple def_stmt
= NULL
;
3491 gcc_assert (vectype
);
3492 nunits
= TYPE_VECTOR_SUBPARTS (vectype
);
3494 gcc_assert (POINTER_TYPE_P (scalar_type
) || INTEGRAL_TYPE_P (scalar_type
)
3495 || SCALAR_FLOAT_TYPE_P (scalar_type
));
3497 if (nested_in_vect_loop_p (loop
, stmt
))
3498 nested_in_vect_loop
= true;
3500 gcc_assert (loop
== (gimple_bb (stmt
))->loop_father
);
3502 /* In case of double reduction we only create a vector variable to be put
3503 in the reduction phi node. The actual statement creation is done in
3504 vect_create_epilog_for_reduction. */
3505 if (adjustment_def
&& nested_in_vect_loop
3506 && TREE_CODE (init_val
) == SSA_NAME
3507 && (def_stmt
= SSA_NAME_DEF_STMT (init_val
))
3508 && gimple_code (def_stmt
) == GIMPLE_PHI
3509 && flow_bb_inside_loop_p (loop
, gimple_bb (def_stmt
))
3510 && vinfo_for_stmt (def_stmt
)
3511 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt
))
3512 == vect_double_reduction_def
)
3514 *adjustment_def
= NULL
;
3515 return vect_create_destination_var (init_val
, vectype
);
3518 if (TREE_CONSTANT (init_val
))
3520 if (SCALAR_FLOAT_TYPE_P (scalar_type
))
3521 init_value
= build_real (scalar_type
, TREE_REAL_CST (init_val
));
3523 init_value
= build_int_cst (scalar_type
, TREE_INT_CST_LOW (init_val
));
3526 init_value
= init_val
;
3530 case WIDEN_SUM_EXPR
:
3538 /* ADJUSMENT_DEF is NULL when called from
3539 vect_create_epilog_for_reduction to vectorize double reduction. */
3542 if (nested_in_vect_loop
)
3543 *adjustment_def
= vect_get_vec_def_for_operand (init_val
, stmt
,
3546 *adjustment_def
= init_val
;
3549 if (code
== MULT_EXPR
)
3551 real_init_val
= dconst1
;
3555 if (code
== BIT_AND_EXPR
)
3558 if (SCALAR_FLOAT_TYPE_P (scalar_type
))
3559 def_for_init
= build_real (scalar_type
, real_init_val
);
3561 def_for_init
= build_int_cst (scalar_type
, int_init_val
);
3563 /* Create a vector of '0' or '1' except the first element. */
3564 elts
= XALLOCAVEC (tree
, nunits
);
3565 for (i
= nunits
- 2; i
>= 0; --i
)
3566 elts
[i
+ 1] = def_for_init
;
3568 /* Option1: the first element is '0' or '1' as well. */
3571 elts
[0] = def_for_init
;
3572 init_def
= build_vector (vectype
, elts
);
3576 /* Option2: the first element is INIT_VAL. */
3578 if (TREE_CONSTANT (init_val
))
3579 init_def
= build_vector (vectype
, elts
);
3582 vec
<constructor_elt
, va_gc
> *v
;
3583 vec_alloc (v
, nunits
);
3584 CONSTRUCTOR_APPEND_ELT (v
, NULL_TREE
, init_val
);
3585 for (i
= 1; i
< nunits
; ++i
)
3586 CONSTRUCTOR_APPEND_ELT (v
, NULL_TREE
, elts
[i
]);
3587 init_def
= build_constructor (vectype
, v
);
3597 *adjustment_def
= NULL_TREE
;
3598 init_def
= vect_get_vec_def_for_operand (init_val
, stmt
, NULL
);
3602 init_def
= build_vector_from_val (vectype
, init_value
);
3613 /* Function vect_create_epilog_for_reduction
3615 Create code at the loop-epilog to finalize the result of a reduction
3618 VECT_DEFS is list of vector of partial results, i.e., the lhs's of vector
3619 reduction statements.
3620 STMT is the scalar reduction stmt that is being vectorized.
3621 NCOPIES is > 1 in case the vectorization factor (VF) is bigger than the
3622 number of elements that we can fit in a vectype (nunits). In this case
3623 we have to generate more than one vector stmt - i.e - we need to "unroll"
3624 the vector stmt by a factor VF/nunits. For more details see documentation
3625 in vectorizable_operation.
3626 REDUC_CODE is the tree-code for the epilog reduction.
3627 REDUCTION_PHIS is a list of the phi-nodes that carry the reduction
3629 REDUC_INDEX is the index of the operand in the right hand side of the
3630 statement that is defined by REDUCTION_PHI.
3631 DOUBLE_REDUC is TRUE if double reduction phi nodes should be handled.
3632 SLP_NODE is an SLP node containing a group of reduction statements. The
3633 first one in this group is STMT.
3636 1. Creates the reduction def-use cycles: sets the arguments for
3638 The loop-entry argument is the vectorized initial-value of the reduction.
3639 The loop-latch argument is taken from VECT_DEFS - the vector of partial
3641 2. "Reduces" each vector of partial results VECT_DEFS into a single result,
3642 by applying the operation specified by REDUC_CODE if available, or by
3643 other means (whole-vector shifts or a scalar loop).
3644 The function also creates a new phi node at the loop exit to preserve
3645 loop-closed form, as illustrated below.
3647 The flow at the entry to this function:
3650 vec_def = phi <null, null> # REDUCTION_PHI
3651 VECT_DEF = vector_stmt # vectorized form of STMT
3652 s_loop = scalar_stmt # (scalar) STMT
3654 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
3658 The above is transformed by this function into:
3661 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
3662 VECT_DEF = vector_stmt # vectorized form of STMT
3663 s_loop = scalar_stmt # (scalar) STMT
3665 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
3666 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
3667 v_out2 = reduce <v_out1>
3668 s_out3 = extract_field <v_out2, 0>
3669 s_out4 = adjust_result <s_out3>
3675 vect_create_epilog_for_reduction (vec
<tree
> vect_defs
, gimple stmt
,
3676 int ncopies
, enum tree_code reduc_code
,
3677 vec
<gimple
> reduction_phis
,
3678 int reduc_index
, bool double_reduc
,
3681 stmt_vec_info stmt_info
= vinfo_for_stmt (stmt
);
3682 stmt_vec_info prev_phi_info
;
3684 enum machine_mode mode
;
3685 loop_vec_info loop_vinfo
= STMT_VINFO_LOOP_VINFO (stmt_info
);
3686 struct loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
), *outer_loop
= NULL
;
3687 basic_block exit_bb
;
3690 gimple new_phi
= NULL
, phi
;
3691 gimple_stmt_iterator exit_gsi
;
3693 tree new_temp
= NULL_TREE
, new_dest
, new_name
, new_scalar_dest
;
3694 gimple epilog_stmt
= NULL
;
3695 enum tree_code code
= gimple_assign_rhs_code (stmt
);
3697 tree bitsize
, bitpos
;
3698 tree adjustment_def
= NULL
;
3699 tree vec_initial_def
= NULL
;
3700 tree reduction_op
, expr
, def
;
3701 tree orig_name
, scalar_result
;
3702 imm_use_iterator imm_iter
, phi_imm_iter
;
3703 use_operand_p use_p
, phi_use_p
;
3704 bool extract_scalar_result
= false;
3705 gimple use_stmt
, orig_stmt
, reduction_phi
= NULL
;
3706 bool nested_in_vect_loop
= false;
3707 vec
<gimple
> new_phis
= vNULL
;
3708 vec
<gimple
> inner_phis
= vNULL
;
3709 enum vect_def_type dt
= vect_unknown_def_type
;
3711 vec
<tree
> scalar_results
= vNULL
;
3712 unsigned int group_size
= 1, k
, ratio
;
3713 vec
<tree
> vec_initial_defs
= vNULL
;
3715 bool slp_reduc
= false;
3716 tree new_phi_result
;
3717 gimple inner_phi
= NULL
;
3720 group_size
= SLP_TREE_SCALAR_STMTS (slp_node
).length ();
3722 if (nested_in_vect_loop_p (loop
, stmt
))
3726 nested_in_vect_loop
= true;
3727 gcc_assert (!slp_node
);
3730 switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt
)))
3732 case GIMPLE_SINGLE_RHS
:
3733 gcc_assert (TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt
))
3735 reduction_op
= TREE_OPERAND (gimple_assign_rhs1 (stmt
), reduc_index
);
3737 case GIMPLE_UNARY_RHS
:
3738 reduction_op
= gimple_assign_rhs1 (stmt
);
3740 case GIMPLE_BINARY_RHS
:
3741 reduction_op
= reduc_index
?
3742 gimple_assign_rhs2 (stmt
) : gimple_assign_rhs1 (stmt
);
3744 case GIMPLE_TERNARY_RHS
:
3745 reduction_op
= gimple_op (stmt
, reduc_index
+ 1);
3751 vectype
= get_vectype_for_scalar_type (TREE_TYPE (reduction_op
));
3752 gcc_assert (vectype
);
3753 mode
= TYPE_MODE (vectype
);
3755 /* 1. Create the reduction def-use cycle:
3756 Set the arguments of REDUCTION_PHIS, i.e., transform
3759 vec_def = phi <null, null> # REDUCTION_PHI
3760 VECT_DEF = vector_stmt # vectorized form of STMT
3766 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
3767 VECT_DEF = vector_stmt # vectorized form of STMT
3770 (in case of SLP, do it for all the phis). */
3772 /* Get the loop-entry arguments. */
3774 vect_get_vec_defs (reduction_op
, NULL_TREE
, stmt
, &vec_initial_defs
,
3775 NULL
, slp_node
, reduc_index
);
3778 vec_initial_defs
.create (1);
3779 /* For the case of reduction, vect_get_vec_def_for_operand returns
3780 the scalar def before the loop, that defines the initial value
3781 of the reduction variable. */
3782 vec_initial_def
= vect_get_vec_def_for_operand (reduction_op
, stmt
,
3784 vec_initial_defs
.quick_push (vec_initial_def
);
3787 /* Set phi nodes arguments. */
3788 FOR_EACH_VEC_ELT (reduction_phis
, i
, phi
)
3790 tree vec_init_def
= vec_initial_defs
[i
];
3791 tree def
= vect_defs
[i
];
3792 for (j
= 0; j
< ncopies
; j
++)
3794 /* Set the loop-entry arg of the reduction-phi. */
3795 add_phi_arg (phi
, vec_init_def
, loop_preheader_edge (loop
),
3798 /* Set the loop-latch arg for the reduction-phi. */
3800 def
= vect_get_vec_def_for_stmt_copy (vect_unknown_def_type
, def
);
3802 add_phi_arg (phi
, def
, loop_latch_edge (loop
), UNKNOWN_LOCATION
);
3804 if (dump_enabled_p ())
3806 dump_printf_loc (MSG_NOTE
, vect_location
,
3807 "transform reduction: created def-use cycle: ");
3808 dump_gimple_stmt (MSG_NOTE
, TDF_SLIM
, phi
, 0);
3809 dump_printf (MSG_NOTE
, "\n");
3810 dump_gimple_stmt (MSG_NOTE
, TDF_SLIM
, SSA_NAME_DEF_STMT (def
), 0);
3813 phi
= STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi
));
3817 vec_initial_defs
.release ();
3819 /* 2. Create epilog code.
3820 The reduction epilog code operates across the elements of the vector
3821 of partial results computed by the vectorized loop.
3822 The reduction epilog code consists of:
3824 step 1: compute the scalar result in a vector (v_out2)
3825 step 2: extract the scalar result (s_out3) from the vector (v_out2)
3826 step 3: adjust the scalar result (s_out3) if needed.
3828 Step 1 can be accomplished using one the following three schemes:
3829 (scheme 1) using reduc_code, if available.
3830 (scheme 2) using whole-vector shifts, if available.
3831 (scheme 3) using a scalar loop. In this case steps 1+2 above are
3834 The overall epilog code looks like this:
3836 s_out0 = phi <s_loop> # original EXIT_PHI
3837 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
3838 v_out2 = reduce <v_out1> # step 1
3839 s_out3 = extract_field <v_out2, 0> # step 2
3840 s_out4 = adjust_result <s_out3> # step 3
3842 (step 3 is optional, and steps 1 and 2 may be combined).
3843 Lastly, the uses of s_out0 are replaced by s_out4. */
3846 /* 2.1 Create new loop-exit-phis to preserve loop-closed form:
3847 v_out1 = phi <VECT_DEF>
3848 Store them in NEW_PHIS. */
3850 exit_bb
= single_exit (loop
)->dest
;
3851 prev_phi_info
= NULL
;
3852 new_phis
.create (vect_defs
.length ());
3853 FOR_EACH_VEC_ELT (vect_defs
, i
, def
)
3855 for (j
= 0; j
< ncopies
; j
++)
3857 tree new_def
= copy_ssa_name (def
, NULL
);
3858 phi
= create_phi_node (new_def
, exit_bb
);
3859 set_vinfo_for_stmt (phi
, new_stmt_vec_info (phi
, loop_vinfo
, NULL
));
3861 new_phis
.quick_push (phi
);
3864 def
= vect_get_vec_def_for_stmt_copy (dt
, def
);
3865 STMT_VINFO_RELATED_STMT (prev_phi_info
) = phi
;
3868 SET_PHI_ARG_DEF (phi
, single_exit (loop
)->dest_idx
, def
);
3869 prev_phi_info
= vinfo_for_stmt (phi
);
3873 /* The epilogue is created for the outer-loop, i.e., for the loop being
3874 vectorized. Create exit phis for the outer loop. */
3878 exit_bb
= single_exit (loop
)->dest
;
3879 inner_phis
.create (vect_defs
.length ());
3880 FOR_EACH_VEC_ELT (new_phis
, i
, phi
)
3882 tree new_result
= copy_ssa_name (PHI_RESULT (phi
), NULL
);
3883 gimple outer_phi
= create_phi_node (new_result
, exit_bb
);
3884 SET_PHI_ARG_DEF (outer_phi
, single_exit (loop
)->dest_idx
,
3886 set_vinfo_for_stmt (outer_phi
, new_stmt_vec_info (outer_phi
,
3888 inner_phis
.quick_push (phi
);
3889 new_phis
[i
] = outer_phi
;
3890 prev_phi_info
= vinfo_for_stmt (outer_phi
);
3891 while (STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi
)))
3893 phi
= STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi
));
3894 new_result
= copy_ssa_name (PHI_RESULT (phi
), NULL
);
3895 outer_phi
= create_phi_node (new_result
, exit_bb
);
3896 SET_PHI_ARG_DEF (outer_phi
, single_exit (loop
)->dest_idx
,
3898 set_vinfo_for_stmt (outer_phi
, new_stmt_vec_info (outer_phi
,
3900 STMT_VINFO_RELATED_STMT (prev_phi_info
) = outer_phi
;
3901 prev_phi_info
= vinfo_for_stmt (outer_phi
);
3906 exit_gsi
= gsi_after_labels (exit_bb
);
3908 /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
3909 (i.e. when reduc_code is not available) and in the final adjustment
3910 code (if needed). Also get the original scalar reduction variable as
3911 defined in the loop. In case STMT is a "pattern-stmt" (i.e. - it
3912 represents a reduction pattern), the tree-code and scalar-def are
3913 taken from the original stmt that the pattern-stmt (STMT) replaces.
3914 Otherwise (it is a regular reduction) - the tree-code and scalar-def
3915 are taken from STMT. */
3917 orig_stmt
= STMT_VINFO_RELATED_STMT (stmt_info
);
3920 /* Regular reduction */
3925 /* Reduction pattern */
3926 stmt_vec_info stmt_vinfo
= vinfo_for_stmt (orig_stmt
);
3927 gcc_assert (STMT_VINFO_IN_PATTERN_P (stmt_vinfo
));
3928 gcc_assert (STMT_VINFO_RELATED_STMT (stmt_vinfo
) == stmt
);
3931 code
= gimple_assign_rhs_code (orig_stmt
);
3932 /* For MINUS_EXPR the initial vector is [init_val,0,...,0], therefore,
3933 partial results are added and not subtracted. */
3934 if (code
== MINUS_EXPR
)
3937 scalar_dest
= gimple_assign_lhs (orig_stmt
);
3938 scalar_type
= TREE_TYPE (scalar_dest
);
3939 scalar_results
.create (group_size
);
3940 new_scalar_dest
= vect_create_destination_var (scalar_dest
, NULL
);
3941 bitsize
= TYPE_SIZE (scalar_type
);
3943 /* In case this is a reduction in an inner-loop while vectorizing an outer
3944 loop - we don't need to extract a single scalar result at the end of the
3945 inner-loop (unless it is double reduction, i.e., the use of reduction is
3946 outside the outer-loop). The final vector of partial results will be used
3947 in the vectorized outer-loop, or reduced to a scalar result at the end of
3949 if (nested_in_vect_loop
&& !double_reduc
)
3950 goto vect_finalize_reduction
;
3952 /* SLP reduction without reduction chain, e.g.,
3956 b2 = operation (b1) */
3957 slp_reduc
= (slp_node
&& !GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt
)));
3959 /* In case of reduction chain, e.g.,
3962 a3 = operation (a2),
3964 we may end up with more than one vector result. Here we reduce them to
3966 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt
)))
3968 tree first_vect
= PHI_RESULT (new_phis
[0]);
3970 gimple new_vec_stmt
= NULL
;
3972 vec_dest
= vect_create_destination_var (scalar_dest
, vectype
);
3973 for (k
= 1; k
< new_phis
.length (); k
++)
3975 gimple next_phi
= new_phis
[k
];
3976 tree second_vect
= PHI_RESULT (next_phi
);
3978 tmp
= build2 (code
, vectype
, first_vect
, second_vect
);
3979 new_vec_stmt
= gimple_build_assign (vec_dest
, tmp
);
3980 first_vect
= make_ssa_name (vec_dest
, new_vec_stmt
);
3981 gimple_assign_set_lhs (new_vec_stmt
, first_vect
);
3982 gsi_insert_before (&exit_gsi
, new_vec_stmt
, GSI_SAME_STMT
);
3985 new_phi_result
= first_vect
;
3988 new_phis
.truncate (0);
3989 new_phis
.safe_push (new_vec_stmt
);
3993 new_phi_result
= PHI_RESULT (new_phis
[0]);
3995 /* 2.3 Create the reduction code, using one of the three schemes described
3996 above. In SLP we simply need to extract all the elements from the
3997 vector (without reducing them), so we use scalar shifts. */
3998 if (reduc_code
!= ERROR_MARK
&& !slp_reduc
)
4002 /*** Case 1: Create:
4003 v_out2 = reduc_expr <v_out1> */
4005 if (dump_enabled_p ())
4006 dump_printf_loc (MSG_NOTE
, vect_location
,
4007 "Reduce using direct vector reduction.");
4009 vec_dest
= vect_create_destination_var (scalar_dest
, vectype
);
4010 tmp
= build1 (reduc_code
, vectype
, new_phi_result
);
4011 epilog_stmt
= gimple_build_assign (vec_dest
, tmp
);
4012 new_temp
= make_ssa_name (vec_dest
, epilog_stmt
);
4013 gimple_assign_set_lhs (epilog_stmt
, new_temp
);
4014 gsi_insert_before (&exit_gsi
, epilog_stmt
, GSI_SAME_STMT
);
4016 extract_scalar_result
= true;
4020 enum tree_code shift_code
= ERROR_MARK
;
4021 bool have_whole_vector_shift
= true;
4023 int element_bitsize
= tree_low_cst (bitsize
, 1);
4024 int vec_size_in_bits
= tree_low_cst (TYPE_SIZE (vectype
), 1);
4027 if (optab_handler (vec_shr_optab
, mode
) != CODE_FOR_nothing
)
4028 shift_code
= VEC_RSHIFT_EXPR
;
4030 have_whole_vector_shift
= false;
4032 /* Regardless of whether we have a whole vector shift, if we're
4033 emulating the operation via tree-vect-generic, we don't want
4034 to use it. Only the first round of the reduction is likely
4035 to still be profitable via emulation. */
4036 /* ??? It might be better to emit a reduction tree code here, so that
4037 tree-vect-generic can expand the first round via bit tricks. */
4038 if (!VECTOR_MODE_P (mode
))
4039 have_whole_vector_shift
= false;
4042 optab optab
= optab_for_tree_code (code
, vectype
, optab_default
);
4043 if (optab_handler (optab
, mode
) == CODE_FOR_nothing
)
4044 have_whole_vector_shift
= false;
4047 if (have_whole_vector_shift
&& !slp_reduc
)
4049 /*** Case 2: Create:
4050 for (offset = VS/2; offset >= element_size; offset/=2)
4052 Create: va' = vec_shift <va, offset>
4053 Create: va = vop <va, va'>
4056 if (dump_enabled_p ())
4057 dump_printf_loc (MSG_NOTE
, vect_location
,
4058 "Reduce using vector shifts");
4060 vec_dest
= vect_create_destination_var (scalar_dest
, vectype
);
4061 new_temp
= new_phi_result
;
4062 for (bit_offset
= vec_size_in_bits
/2;
4063 bit_offset
>= element_bitsize
;
4066 tree bitpos
= size_int (bit_offset
);
4068 epilog_stmt
= gimple_build_assign_with_ops (shift_code
,
4069 vec_dest
, new_temp
, bitpos
);
4070 new_name
= make_ssa_name (vec_dest
, epilog_stmt
);
4071 gimple_assign_set_lhs (epilog_stmt
, new_name
);
4072 gsi_insert_before (&exit_gsi
, epilog_stmt
, GSI_SAME_STMT
);
4074 epilog_stmt
= gimple_build_assign_with_ops (code
, vec_dest
,
4075 new_name
, new_temp
);
4076 new_temp
= make_ssa_name (vec_dest
, epilog_stmt
);
4077 gimple_assign_set_lhs (epilog_stmt
, new_temp
);
4078 gsi_insert_before (&exit_gsi
, epilog_stmt
, GSI_SAME_STMT
);
4081 extract_scalar_result
= true;
4087 /*** Case 3: Create:
4088 s = extract_field <v_out2, 0>
4089 for (offset = element_size;
4090 offset < vector_size;
4091 offset += element_size;)
4093 Create: s' = extract_field <v_out2, offset>
4094 Create: s = op <s, s'> // For non SLP cases
4097 if (dump_enabled_p ())
4098 dump_printf_loc (MSG_NOTE
, vect_location
,
4099 "Reduce using scalar code. ");
4101 vec_size_in_bits
= tree_low_cst (TYPE_SIZE (vectype
), 1);
4102 FOR_EACH_VEC_ELT (new_phis
, i
, new_phi
)
4104 if (gimple_code (new_phi
) == GIMPLE_PHI
)
4105 vec_temp
= PHI_RESULT (new_phi
);
4107 vec_temp
= gimple_assign_lhs (new_phi
);
4108 rhs
= build3 (BIT_FIELD_REF
, scalar_type
, vec_temp
, bitsize
,
4110 epilog_stmt
= gimple_build_assign (new_scalar_dest
, rhs
);
4111 new_temp
= make_ssa_name (new_scalar_dest
, epilog_stmt
);
4112 gimple_assign_set_lhs (epilog_stmt
, new_temp
);
4113 gsi_insert_before (&exit_gsi
, epilog_stmt
, GSI_SAME_STMT
);
4115 /* In SLP we don't need to apply reduction operation, so we just
4116 collect s' values in SCALAR_RESULTS. */
4118 scalar_results
.safe_push (new_temp
);
4120 for (bit_offset
= element_bitsize
;
4121 bit_offset
< vec_size_in_bits
;
4122 bit_offset
+= element_bitsize
)
4124 tree bitpos
= bitsize_int (bit_offset
);
4125 tree rhs
= build3 (BIT_FIELD_REF
, scalar_type
, vec_temp
,
4128 epilog_stmt
= gimple_build_assign (new_scalar_dest
, rhs
);
4129 new_name
= make_ssa_name (new_scalar_dest
, epilog_stmt
);
4130 gimple_assign_set_lhs (epilog_stmt
, new_name
);
4131 gsi_insert_before (&exit_gsi
, epilog_stmt
, GSI_SAME_STMT
);
4135 /* In SLP we don't need to apply reduction operation, so
4136 we just collect s' values in SCALAR_RESULTS. */
4137 new_temp
= new_name
;
4138 scalar_results
.safe_push (new_name
);
4142 epilog_stmt
= gimple_build_assign_with_ops (code
,
4143 new_scalar_dest
, new_name
, new_temp
);
4144 new_temp
= make_ssa_name (new_scalar_dest
, epilog_stmt
);
4145 gimple_assign_set_lhs (epilog_stmt
, new_temp
);
4146 gsi_insert_before (&exit_gsi
, epilog_stmt
, GSI_SAME_STMT
);
4151 /* The only case where we need to reduce scalar results in SLP, is
4152 unrolling. If the size of SCALAR_RESULTS is greater than
4153 GROUP_SIZE, we reduce them combining elements modulo
4157 tree res
, first_res
, new_res
;
4160 /* Reduce multiple scalar results in case of SLP unrolling. */
4161 for (j
= group_size
; scalar_results
.iterate (j
, &res
);
4164 first_res
= scalar_results
[j
% group_size
];
4165 new_stmt
= gimple_build_assign_with_ops (code
,
4166 new_scalar_dest
, first_res
, res
);
4167 new_res
= make_ssa_name (new_scalar_dest
, new_stmt
);
4168 gimple_assign_set_lhs (new_stmt
, new_res
);
4169 gsi_insert_before (&exit_gsi
, new_stmt
, GSI_SAME_STMT
);
4170 scalar_results
[j
% group_size
] = new_res
;
4174 /* Not SLP - we have one scalar to keep in SCALAR_RESULTS. */
4175 scalar_results
.safe_push (new_temp
);
4177 extract_scalar_result
= false;
4181 /* 2.4 Extract the final scalar result. Create:
4182 s_out3 = extract_field <v_out2, bitpos> */
4184 if (extract_scalar_result
)
4188 if (dump_enabled_p ())
4189 dump_printf_loc (MSG_NOTE
, vect_location
,
4190 "extract scalar result");
4192 if (BYTES_BIG_ENDIAN
)
4193 bitpos
= size_binop (MULT_EXPR
,
4194 bitsize_int (TYPE_VECTOR_SUBPARTS (vectype
) - 1),
4195 TYPE_SIZE (scalar_type
));
4197 bitpos
= bitsize_zero_node
;
4199 rhs
= build3 (BIT_FIELD_REF
, scalar_type
, new_temp
, bitsize
, bitpos
);
4200 epilog_stmt
= gimple_build_assign (new_scalar_dest
, rhs
);
4201 new_temp
= make_ssa_name (new_scalar_dest
, epilog_stmt
);
4202 gimple_assign_set_lhs (epilog_stmt
, new_temp
);
4203 gsi_insert_before (&exit_gsi
, epilog_stmt
, GSI_SAME_STMT
);
4204 scalar_results
.safe_push (new_temp
);
4207 vect_finalize_reduction
:
4212 /* 2.5 Adjust the final result by the initial value of the reduction
4213 variable. (When such adjustment is not needed, then
4214 'adjustment_def' is zero). For example, if code is PLUS we create:
4215 new_temp = loop_exit_def + adjustment_def */
4219 gcc_assert (!slp_reduc
);
4220 if (nested_in_vect_loop
)
4222 new_phi
= new_phis
[0];
4223 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def
)) == VECTOR_TYPE
);
4224 expr
= build2 (code
, vectype
, PHI_RESULT (new_phi
), adjustment_def
);
4225 new_dest
= vect_create_destination_var (scalar_dest
, vectype
);
4229 new_temp
= scalar_results
[0];
4230 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def
)) != VECTOR_TYPE
);
4231 expr
= build2 (code
, scalar_type
, new_temp
, adjustment_def
);
4232 new_dest
= vect_create_destination_var (scalar_dest
, scalar_type
);
4235 epilog_stmt
= gimple_build_assign (new_dest
, expr
);
4236 new_temp
= make_ssa_name (new_dest
, epilog_stmt
);
4237 gimple_assign_set_lhs (epilog_stmt
, new_temp
);
4238 SSA_NAME_DEF_STMT (new_temp
) = epilog_stmt
;
4239 gsi_insert_before (&exit_gsi
, epilog_stmt
, GSI_SAME_STMT
);
4240 if (nested_in_vect_loop
)
4242 set_vinfo_for_stmt (epilog_stmt
,
4243 new_stmt_vec_info (epilog_stmt
, loop_vinfo
,
4245 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (epilog_stmt
)) =
4246 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (new_phi
));
4249 scalar_results
.quick_push (new_temp
);
4251 scalar_results
[0] = new_temp
;
4254 scalar_results
[0] = new_temp
;
4256 new_phis
[0] = epilog_stmt
;
4259 /* 2.6 Handle the loop-exit phis. Replace the uses of scalar loop-exit
4260 phis with new adjusted scalar results, i.e., replace use <s_out0>
4265 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
4266 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
4267 v_out2 = reduce <v_out1>
4268 s_out3 = extract_field <v_out2, 0>
4269 s_out4 = adjust_result <s_out3>
4276 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
4277 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
4278 v_out2 = reduce <v_out1>
4279 s_out3 = extract_field <v_out2, 0>
4280 s_out4 = adjust_result <s_out3>
4285 /* In SLP reduction chain we reduce vector results into one vector if
4286 necessary, hence we set here GROUP_SIZE to 1. SCALAR_DEST is the LHS of
4287 the last stmt in the reduction chain, since we are looking for the loop
4289 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt
)))
4291 scalar_dest
= gimple_assign_lhs (
4292 SLP_TREE_SCALAR_STMTS (slp_node
)[group_size
- 1]);
4296 /* In SLP we may have several statements in NEW_PHIS and REDUCTION_PHIS (in
4297 case that GROUP_SIZE is greater than vectorization factor). Therefore, we
4298 need to match SCALAR_RESULTS with corresponding statements. The first
4299 (GROUP_SIZE / number of new vector stmts) scalar results correspond to
4300 the first vector stmt, etc.
4301 (RATIO is equal to (GROUP_SIZE / number of new vector stmts)). */
4302 if (group_size
> new_phis
.length ())
4304 ratio
= group_size
/ new_phis
.length ();
4305 gcc_assert (!(group_size
% new_phis
.length ()));
4310 for (k
= 0; k
< group_size
; k
++)
4314 epilog_stmt
= new_phis
[k
/ ratio
];
4315 reduction_phi
= reduction_phis
[k
/ ratio
];
4317 inner_phi
= inner_phis
[k
/ ratio
];
4322 gimple current_stmt
= SLP_TREE_SCALAR_STMTS (slp_node
)[k
];
4324 orig_stmt
= STMT_VINFO_RELATED_STMT (vinfo_for_stmt (current_stmt
));
4325 /* SLP statements can't participate in patterns. */
4326 gcc_assert (!orig_stmt
);
4327 scalar_dest
= gimple_assign_lhs (current_stmt
);
4331 /* Find the loop-closed-use at the loop exit of the original scalar
4332 result. (The reduction result is expected to have two immediate uses -
4333 one at the latch block, and one at the loop exit). */
4334 FOR_EACH_IMM_USE_FAST (use_p
, imm_iter
, scalar_dest
)
4335 if (!flow_bb_inside_loop_p (loop
, gimple_bb (USE_STMT (use_p
))))
4336 phis
.safe_push (USE_STMT (use_p
));
4338 /* We expect to have found an exit_phi because of loop-closed-ssa
4340 gcc_assert (!phis
.is_empty ());
4342 FOR_EACH_VEC_ELT (phis
, i
, exit_phi
)
4346 stmt_vec_info exit_phi_vinfo
= vinfo_for_stmt (exit_phi
);
4349 /* FORNOW. Currently not supporting the case that an inner-loop
4350 reduction is not used in the outer-loop (but only outside the
4351 outer-loop), unless it is double reduction. */
4352 gcc_assert ((STMT_VINFO_RELEVANT_P (exit_phi_vinfo
)
4353 && !STMT_VINFO_LIVE_P (exit_phi_vinfo
))
4356 STMT_VINFO_VEC_STMT (exit_phi_vinfo
) = epilog_stmt
;
4358 || STMT_VINFO_DEF_TYPE (exit_phi_vinfo
)
4359 != vect_double_reduction_def
)
4362 /* Handle double reduction:
4364 stmt1: s1 = phi <s0, s2> - double reduction phi (outer loop)
4365 stmt2: s3 = phi <s1, s4> - (regular) reduc phi (inner loop)
4366 stmt3: s4 = use (s3) - (regular) reduc stmt (inner loop)
4367 stmt4: s2 = phi <s4> - double reduction stmt (outer loop)
4369 At that point the regular reduction (stmt2 and stmt3) is
4370 already vectorized, as well as the exit phi node, stmt4.
4371 Here we vectorize the phi node of double reduction, stmt1, and
4372 update all relevant statements. */
4374 /* Go through all the uses of s2 to find double reduction phi
4375 node, i.e., stmt1 above. */
4376 orig_name
= PHI_RESULT (exit_phi
);
4377 FOR_EACH_IMM_USE_STMT (use_stmt
, imm_iter
, orig_name
)
4379 stmt_vec_info use_stmt_vinfo
;
4380 stmt_vec_info new_phi_vinfo
;
4381 tree vect_phi_init
, preheader_arg
, vect_phi_res
, init_def
;
4382 basic_block bb
= gimple_bb (use_stmt
);
4385 /* Check that USE_STMT is really double reduction phi
4387 if (gimple_code (use_stmt
) != GIMPLE_PHI
4388 || gimple_phi_num_args (use_stmt
) != 2
4389 || bb
->loop_father
!= outer_loop
)
4391 use_stmt_vinfo
= vinfo_for_stmt (use_stmt
);
4393 || STMT_VINFO_DEF_TYPE (use_stmt_vinfo
)
4394 != vect_double_reduction_def
)
4397 /* Create vector phi node for double reduction:
4398 vs1 = phi <vs0, vs2>
4399 vs1 was created previously in this function by a call to
4400 vect_get_vec_def_for_operand and is stored in
4402 vs2 is defined by INNER_PHI, the vectorized EXIT_PHI;
4403 vs0 is created here. */
4405 /* Create vector phi node. */
4406 vect_phi
= create_phi_node (vec_initial_def
, bb
);
4407 new_phi_vinfo
= new_stmt_vec_info (vect_phi
,
4408 loop_vec_info_for_loop (outer_loop
), NULL
);
4409 set_vinfo_for_stmt (vect_phi
, new_phi_vinfo
);
4411 /* Create vs0 - initial def of the double reduction phi. */
4412 preheader_arg
= PHI_ARG_DEF_FROM_EDGE (use_stmt
,
4413 loop_preheader_edge (outer_loop
));
4414 init_def
= get_initial_def_for_reduction (stmt
,
4415 preheader_arg
, NULL
);
4416 vect_phi_init
= vect_init_vector (use_stmt
, init_def
,
4419 /* Update phi node arguments with vs0 and vs2. */
4420 add_phi_arg (vect_phi
, vect_phi_init
,
4421 loop_preheader_edge (outer_loop
),
4423 add_phi_arg (vect_phi
, PHI_RESULT (inner_phi
),
4424 loop_latch_edge (outer_loop
), UNKNOWN_LOCATION
);
4425 if (dump_enabled_p ())
4427 dump_printf_loc (MSG_NOTE
, vect_location
,
4428 "created double reduction phi node: ");
4429 dump_gimple_stmt (MSG_NOTE
, TDF_SLIM
, vect_phi
, 0);
4432 vect_phi_res
= PHI_RESULT (vect_phi
);
4434 /* Replace the use, i.e., set the correct vs1 in the regular
4435 reduction phi node. FORNOW, NCOPIES is always 1, so the
4436 loop is redundant. */
4437 use
= reduction_phi
;
4438 for (j
= 0; j
< ncopies
; j
++)
4440 edge pr_edge
= loop_preheader_edge (loop
);
4441 SET_PHI_ARG_DEF (use
, pr_edge
->dest_idx
, vect_phi_res
);
4442 use
= STMT_VINFO_RELATED_STMT (vinfo_for_stmt (use
));
4449 if (nested_in_vect_loop
)
4458 /* Find the loop-closed-use at the loop exit of the original scalar
4459 result. (The reduction result is expected to have two immediate uses,
4460 one at the latch block, and one at the loop exit). For double
4461 reductions we are looking for exit phis of the outer loop. */
4462 FOR_EACH_IMM_USE_FAST (use_p
, imm_iter
, scalar_dest
)
4464 if (!flow_bb_inside_loop_p (loop
, gimple_bb (USE_STMT (use_p
))))
4465 phis
.safe_push (USE_STMT (use_p
));
4468 if (double_reduc
&& gimple_code (USE_STMT (use_p
)) == GIMPLE_PHI
)
4470 tree phi_res
= PHI_RESULT (USE_STMT (use_p
));
4472 FOR_EACH_IMM_USE_FAST (phi_use_p
, phi_imm_iter
, phi_res
)
4474 if (!flow_bb_inside_loop_p (loop
,
4475 gimple_bb (USE_STMT (phi_use_p
))))
4476 phis
.safe_push (USE_STMT (phi_use_p
));
4482 FOR_EACH_VEC_ELT (phis
, i
, exit_phi
)
4484 /* Replace the uses: */
4485 orig_name
= PHI_RESULT (exit_phi
);
4486 scalar_result
= scalar_results
[k
];
4487 FOR_EACH_IMM_USE_STMT (use_stmt
, imm_iter
, orig_name
)
4488 FOR_EACH_IMM_USE_ON_STMT (use_p
, imm_iter
)
4489 SET_USE (use_p
, scalar_result
);
4495 scalar_results
.release ();
4496 inner_phis
.release ();
4497 new_phis
.release ();
/* NOTE(review): this region is a lossy extraction of GCC's loop-vectorizer
   source (tree-vect-loop.c).  Statements are split across multiple physical
   lines, the leading numbers are the ORIGINAL file's line numbers (with
   gaps), and the original brace/return/else lines appear to be missing.
   Do not compile as-is; restore the statement layout from the original
   file first.  The comments added below annotate only what is visible.  */
4501 /* Function vectorizable_reduction.
4503 Check if STMT performs a reduction operation that can be vectorized.
4504 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
4505 stmt to replace it, put it in VEC_STMT, and insert it at GSI.
4506 Return FALSE if not a vectorizable STMT, TRUE otherwise.
4508 This function also handles reduction idioms (patterns) that have been
4509 recognized in advance during vect_pattern_recog. In this case, STMT may be
4511 X = pattern_expr (arg0, arg1, ..., X)
4512 and it's STMT_VINFO_RELATED_STMT points to the last stmt in the original
4513 sequence that had been detected and replaced by the pattern-stmt (STMT).
4515 In some cases of reduction patterns, the type of the reduction variable X is
4516 different than the type of the other arguments of STMT.
4517 In such cases, the vectype that is used when transforming STMT into a vector
4518 stmt is different than the vectype that is used to determine the
4519 vectorization factor, because it consists of a different number of elements
4520 than the actual number of elements that are being operated upon in parallel.
4522 For example, consider an accumulation of shorts into an int accumulator.
4523 On some targets it's possible to vectorize this pattern operating on 8
4524 shorts at a time (hence, the vectype for purposes of determining the
4525 vectorization factor should be V8HI); on the other hand, the vectype that
4526 is used to create the vector form is actually V4SI (the type of the result).
4528 Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
4529 indicates what is the actual level of parallelism (V8HI in the example), so
4530 that the right vectorization factor would be derived. This vectype
4531 corresponds to the type of arguments to the reduction stmt, and should *NOT*
4532 be used to create the vectorized stmt. The right vectype for the vectorized
4533 stmt is obtained from the type of the result X:
4534 get_vectype_for_scalar_type (TREE_TYPE (X))
4536 This means that, contrary to "regular" reductions (or "regular" stmts in
4537 general), the following equation:
4538 STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (TREE_TYPE (X))
4539 does *NOT* necessarily hold for reduction patterns. */
/* NOTE(review): per the header comment this function returns TRUE/FALSE,
   so the (elided) return type is presumably bool -- TODO confirm against
   the original source.  */
4542 vectorizable_reduction (gimple stmt
, gimple_stmt_iterator
*gsi
,
4543 gimple
*vec_stmt
, slp_tree slp_node
)
4547 tree loop_vec_def0
= NULL_TREE
, loop_vec_def1
= NULL_TREE
;
4548 stmt_vec_info stmt_info
= vinfo_for_stmt (stmt
);
4549 tree vectype_out
= STMT_VINFO_VECTYPE (stmt_info
);
4550 tree vectype_in
= NULL_TREE
;
4551 loop_vec_info loop_vinfo
= STMT_VINFO_LOOP_VINFO (stmt_info
);
4552 struct loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
4553 enum tree_code code
, orig_code
, epilog_reduc_code
;
4554 enum machine_mode vec_mode
;
4556 optab optab
, reduc_optab
;
4557 tree new_temp
= NULL_TREE
;
4560 enum vect_def_type dt
;
4561 gimple new_phi
= NULL
;
4565 stmt_vec_info orig_stmt_info
;
4566 tree expr
= NULL_TREE
;
4570 stmt_vec_info prev_stmt_info
, prev_phi_info
;
4571 bool single_defuse_cycle
= false;
4572 tree reduc_def
= NULL_TREE
;
4573 gimple new_stmt
= NULL
;
4576 bool nested_cycle
= false, found_nested_cycle_def
= false;
4577 gimple reduc_def_stmt
= NULL
;
4578 /* The default is that the reduction variable is the last in statement. */
4579 int reduc_index
= 2;
4580 bool double_reduc
= false, dummy
;
4582 struct loop
* def_stmt_loop
, *outer_loop
= NULL
;
4584 gimple def_arg_stmt
;
4585 vec
<tree
> vec_oprnds0
= vNULL
;
4586 vec
<tree
> vec_oprnds1
= vNULL
;
4587 vec
<tree
> vect_defs
= vNULL
;
4588 vec
<gimple
> phis
= vNULL
;
4590 tree def0
, def1
, tem
, op0
, op1
= NULL_TREE
;
4592 /* In case of reduction chain we switch to the first stmt in the chain, but
4593 we don't update STMT_INFO, since only the last stmt is marked as reduction
4594 and has reduction properties. */
4595 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt
)))
4596 stmt
= GROUP_FIRST_ELEMENT (stmt_info
);
4598 if (nested_in_vect_loop_p (loop
, stmt
))
4602 nested_cycle
= true;
4605 /* 1. Is vectorizable reduction? */
4606 /* Not supportable if the reduction variable is used in the loop, unless
4607 it's a reduction chain. */
4608 if (STMT_VINFO_RELEVANT (stmt_info
) > vect_used_in_outer
4609 && !GROUP_FIRST_ELEMENT (stmt_info
))
4612 /* Reductions that are not used even in an enclosing outer-loop,
4613 are expected to be "live" (used out of the loop). */
4614 if (STMT_VINFO_RELEVANT (stmt_info
) == vect_unused_in_scope
4615 && !STMT_VINFO_LIVE_P (stmt_info
))
4618 /* Make sure it was already recognized as a reduction computation. */
4619 if (STMT_VINFO_DEF_TYPE (stmt_info
) != vect_reduction_def
4620 && STMT_VINFO_DEF_TYPE (stmt_info
) != vect_nested_cycle
)
4623 /* 2. Has this been recognized as a reduction pattern?
4625 Check if STMT represents a pattern that has been recognized
4626 in earlier analysis stages. For stmts that represent a pattern,
4627 the STMT_VINFO_RELATED_STMT field records the last stmt in
4628 the original sequence that constitutes the pattern. */
4630 orig_stmt
= STMT_VINFO_RELATED_STMT (stmt_info
);
4633 orig_stmt_info
= vinfo_for_stmt (orig_stmt
);
4634 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info
));
4635 gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info
));
4638 /* 3. Check the operands of the operation. The first operands are defined
4639 inside the loop body. The last operand is the reduction variable,
4640 which is defined by the loop-header-phi. */
4642 gcc_assert (is_gimple_assign (stmt
));
/* NOTE(review): the switch below extracts the operands (ops[]) and the
   tree code from the gimple assignment, one case per RHS class.  The
   fall-through/break lines between cases are among the elided lines.  */
4645 switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt
)))
4647 case GIMPLE_SINGLE_RHS
:
4648 op_type
= TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt
));
4649 if (op_type
== ternary_op
)
4651 tree rhs
= gimple_assign_rhs1 (stmt
);
4652 ops
[0] = TREE_OPERAND (rhs
, 0);
4653 ops
[1] = TREE_OPERAND (rhs
, 1);
4654 ops
[2] = TREE_OPERAND (rhs
, 2);
4655 code
= TREE_CODE (rhs
);
4661 case GIMPLE_BINARY_RHS
:
4662 code
= gimple_assign_rhs_code (stmt
);
4663 op_type
= TREE_CODE_LENGTH (code
);
4664 gcc_assert (op_type
== binary_op
);
4665 ops
[0] = gimple_assign_rhs1 (stmt
);
4666 ops
[1] = gimple_assign_rhs2 (stmt
);
4669 case GIMPLE_TERNARY_RHS
:
4670 code
= gimple_assign_rhs_code (stmt
);
4671 op_type
= TREE_CODE_LENGTH (code
);
4672 gcc_assert (op_type
== ternary_op
);
4673 ops
[0] = gimple_assign_rhs1 (stmt
);
4674 ops
[1] = gimple_assign_rhs2 (stmt
);
4675 ops
[2] = gimple_assign_rhs3 (stmt
);
4678 case GIMPLE_UNARY_RHS
:
4685 if (code
== COND_EXPR
&& slp_node
)
4688 scalar_dest
= gimple_assign_lhs (stmt
);
4689 scalar_type
= TREE_TYPE (scalar_dest
);
4690 if (!POINTER_TYPE_P (scalar_type
) && !INTEGRAL_TYPE_P (scalar_type
)
4691 && !SCALAR_FLOAT_TYPE_P (scalar_type
))
4694 /* Do not try to vectorize bit-precision reductions. */
4695 if ((TYPE_PRECISION (scalar_type
)
4696 != GET_MODE_PRECISION (TYPE_MODE (scalar_type
))))
4699 /* All uses but the last are expected to be defined in the loop.
4700 The last use is the reduction variable. In case of nested cycle this
4701 assumption is not true: we use reduc_index to record the index of the
4702 reduction variable. */
4703 for (i
= 0; i
< op_type
- 1; i
++)
4705 /* The condition of COND_EXPR is checked in vectorizable_condition(). */
4706 if (i
== 0 && code
== COND_EXPR
)
4709 is_simple_use
= vect_is_simple_use_1 (ops
[i
], stmt
, loop_vinfo
, NULL
,
4710 &def_stmt
, &def
, &dt
, &tem
);
4713 gcc_assert (is_simple_use
);
4715 if (dt
!= vect_internal_def
4716 && dt
!= vect_external_def
4717 && dt
!= vect_constant_def
4718 && dt
!= vect_induction_def
4719 && !(dt
== vect_nested_cycle
&& nested_cycle
))
4722 if (dt
== vect_nested_cycle
)
4724 found_nested_cycle_def
= true;
4725 reduc_def_stmt
= def_stmt
;
/* NOTE(review): this second vect_is_simple_use_1 call classifies the
   remaining (reduction-variable) operand; the loop above handled the
   non-reduction operands.  */
4730 is_simple_use
= vect_is_simple_use_1 (ops
[i
], stmt
, loop_vinfo
, NULL
,
4731 &def_stmt
, &def
, &dt
, &tem
);
4734 gcc_assert (is_simple_use
);
4735 if (!(dt
== vect_reduction_def
4736 || dt
== vect_nested_cycle
4737 || ((dt
== vect_internal_def
|| dt
== vect_external_def
4738 || dt
== vect_constant_def
|| dt
== vect_induction_def
)
4739 && nested_cycle
&& found_nested_cycle_def
)))
4741 /* For pattern recognized stmts, orig_stmt might be a reduction,
4742 but some helper statements for the pattern might not, or
4743 might be COND_EXPRs with reduction uses in the condition. */
4744 gcc_assert (orig_stmt
);
4747 if (!found_nested_cycle_def
)
4748 reduc_def_stmt
= def_stmt
;
4750 gcc_assert (gimple_code (reduc_def_stmt
) == GIMPLE_PHI
);
4752 gcc_assert (orig_stmt
== vect_is_simple_reduction (loop_vinfo
,
4758 gimple tmp
= vect_is_simple_reduction (loop_vinfo
, reduc_def_stmt
,
4759 !nested_cycle
, &dummy
);
4760 /* We changed STMT to be the first stmt in reduction chain, hence we
4761 check that in this case the first element in the chain is STMT. */
4762 gcc_assert (stmt
== tmp
4763 || GROUP_FIRST_ELEMENT (vinfo_for_stmt (tmp
)) == stmt
);
4766 if (STMT_VINFO_LIVE_P (vinfo_for_stmt (reduc_def_stmt
)))
4769 if (slp_node
|| PURE_SLP_STMT (stmt_info
))
4772 ncopies
= (LOOP_VINFO_VECT_FACTOR (loop_vinfo
)
4773 / TYPE_VECTOR_SUBPARTS (vectype_in
));
4775 gcc_assert (ncopies
>= 1);
4777 vec_mode
= TYPE_MODE (vectype_in
);
4779 if (code
== COND_EXPR
)
4781 if (!vectorizable_condition (stmt
, gsi
, NULL
, ops
[reduc_index
], 0, NULL
))
4783 if (dump_enabled_p ())
4784 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
4785 "unsupported condition in reduction");
4792 /* 4. Supportable by target? */
4794 if (code
== LSHIFT_EXPR
|| code
== RSHIFT_EXPR
4795 || code
== LROTATE_EXPR
|| code
== RROTATE_EXPR
)
4797 /* Shifts and rotates are only supported by vectorizable_shifts,
4798 not vectorizable_reduction. */
4799 if (dump_enabled_p ())
4800 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
4801 "unsupported shift or rotation.");
4805 /* 4.1. check support for the operation in the loop */
4806 optab
= optab_for_tree_code (code
, vectype_in
, optab_default
);
4809 if (dump_enabled_p ())
4810 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
4816 if (optab_handler (optab
, vec_mode
) == CODE_FOR_nothing
)
4818 if (dump_enabled_p ())
4819 dump_printf (MSG_NOTE
, "op not supported by target.");
4821 if (GET_MODE_SIZE (vec_mode
) != UNITS_PER_WORD
4822 || LOOP_VINFO_VECT_FACTOR (loop_vinfo
)
4823 < vect_min_worthwhile_factor (code
))
4826 if (dump_enabled_p ())
4827 dump_printf (MSG_NOTE
, "proceeding using word mode.");
4830 /* Worthwhile without SIMD support? */
4831 if (!VECTOR_MODE_P (TYPE_MODE (vectype_in
))
4832 && LOOP_VINFO_VECT_FACTOR (loop_vinfo
)
4833 < vect_min_worthwhile_factor (code
))
4835 if (dump_enabled_p ())
4836 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
4837 "not worthwhile without SIMD support.");
4843 /* 4.2. Check support for the epilog operation.
4845 If STMT represents a reduction pattern, then the type of the
4846 reduction variable may be different than the type of the rest
4847 of the arguments. For example, consider the case of accumulation
4848 of shorts into an int accumulator; The original code:
4849 S1: int_a = (int) short_a;
4850 orig_stmt-> S2: int_acc = plus <int_a ,int_acc>;
4853 STMT: int_acc = widen_sum <short_a, int_acc>
4856 1. The tree-code that is used to create the vector operation in the
4857 epilog code (that reduces the partial results) is not the
4858 tree-code of STMT, but is rather the tree-code of the original
4859 stmt from the pattern that STMT is replacing. I.e, in the example
4860 above we want to use 'widen_sum' in the loop, but 'plus' in the
4862 2. The type (mode) we use to check available target support
4863 for the vector operation to be created in the *epilog*, is
4864 determined by the type of the reduction variable (in the example
4865 above we'd check this: optab_handler (plus_optab, vect_int_mode])).
4866 However the type (mode) we use to check available target support
4867 for the vector operation to be created *inside the loop*, is
4868 determined by the type of the other arguments to STMT (in the
4869 example we'd check this: optab_handler (widen_sum_optab,
4872 This is contrary to "regular" reductions, in which the types of all
4873 the arguments are the same as the type of the reduction variable.
4874 For "regular" reductions we can therefore use the same vector type
4875 (and also the same tree-code) when generating the epilog code and
4876 when generating the code inside the loop. */
4880 /* This is a reduction pattern: get the vectype from the type of the
4881 reduction variable, and get the tree-code from orig_stmt. */
4882 orig_code
= gimple_assign_rhs_code (orig_stmt
);
4883 gcc_assert (vectype_out
);
4884 vec_mode
= TYPE_MODE (vectype_out
);
4888 /* Regular reduction: use the same vectype and tree-code as used for
4889 the vector code inside the loop can be used for the epilog code. */
/* NOTE(review): detect a double reduction -- the reduction PHI's
   preheader argument is itself defined by a vect_double_reduction_def
   PHI in the enclosing outer loop.  */
4895 def_bb
= gimple_bb (reduc_def_stmt
);
4896 def_stmt_loop
= def_bb
->loop_father
;
4897 def_arg
= PHI_ARG_DEF_FROM_EDGE (reduc_def_stmt
,
4898 loop_preheader_edge (def_stmt_loop
));
4899 if (TREE_CODE (def_arg
) == SSA_NAME
4900 && (def_arg_stmt
= SSA_NAME_DEF_STMT (def_arg
))
4901 && gimple_code (def_arg_stmt
) == GIMPLE_PHI
4902 && flow_bb_inside_loop_p (outer_loop
, gimple_bb (def_arg_stmt
))
4903 && vinfo_for_stmt (def_arg_stmt
)
4904 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_arg_stmt
))
4905 == vect_double_reduction_def
)
4906 double_reduc
= true;
4909 epilog_reduc_code
= ERROR_MARK
;
4910 if (reduction_code_for_scalar_code (orig_code
, &epilog_reduc_code
))
4912 reduc_optab
= optab_for_tree_code (epilog_reduc_code
, vectype_out
,
4916 if (dump_enabled_p ())
4917 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
4918 "no optab for reduction.");
4920 epilog_reduc_code
= ERROR_MARK
;
4924 && optab_handler (reduc_optab
, vec_mode
) == CODE_FOR_nothing
)
4926 if (dump_enabled_p ())
4927 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
4928 "reduc op not supported by target.");
4930 epilog_reduc_code
= ERROR_MARK
;
4935 if (!nested_cycle
|| double_reduc
)
4937 if (dump_enabled_p ())
4938 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
4939 "no reduc code for scalar code.");
4945 if (double_reduc
&& ncopies
> 1)
4947 if (dump_enabled_p ())
4948 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
4949 "multiple types in double reduction");
4954 /* In case of widenning multiplication by a constant, we update the type
4955 of the constant to be the type of the other operand. We check that the
4956 constant fits the type in the pattern recognition pass. */
4957 if (code
== DOT_PROD_EXPR
4958 && !types_compatible_p (TREE_TYPE (ops
[0]), TREE_TYPE (ops
[1])))
4960 if (TREE_CODE (ops
[0]) == INTEGER_CST
)
4961 ops
[0] = fold_convert (TREE_TYPE (ops
[1]), ops
[0]);
4962 else if (TREE_CODE (ops
[1]) == INTEGER_CST
)
4963 ops
[1] = fold_convert (TREE_TYPE (ops
[0]), ops
[1]);
4966 if (dump_enabled_p ())
4967 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
4968 "invalid types in dot-prod");
/* NOTE(review): analysis-only invocation (vec_stmt == NULL): cost the
   reduction, record the decision in STMT_VINFO_TYPE, and skip the
   transformation below.  */
4974 if (!vec_stmt
) /* transformation not required. */
4976 if (!vect_model_reduction_cost (stmt_info
, epilog_reduc_code
, ncopies
))
4978 STMT_VINFO_TYPE (stmt_info
) = reduc_vec_info_type
;
4984 if (dump_enabled_p ())
4985 dump_printf_loc (MSG_NOTE
, vect_location
, "transform reduction.");
4987 /* FORNOW: Multiple types are not supported for condition. */
4988 if (code
== COND_EXPR
)
4989 gcc_assert (ncopies
== 1);
4991 /* Create the destination vector */
4992 vec_dest
= vect_create_destination_var (scalar_dest
, vectype_out
);
4994 /* In case the vectorization factor (VF) is bigger than the number
4995 of elements that we can fit in a vectype (nunits), we have to generate
4996 more than one vector stmt - i.e - we need to "unroll" the
4997 vector stmt by a factor VF/nunits. For more details see documentation
4998 in vectorizable_operation. */
5000 /* If the reduction is used in an outer loop we need to generate
5001 VF intermediate results, like so (e.g. for ncopies=2):
5006 (i.e. we generate VF results in 2 registers).
5007 In this case we have a separate def-use cycle for each copy, and therefore
5008 for each copy we get the vector def for the reduction variable from the
5009 respective phi node created for this copy.
5011 Otherwise (the reduction is unused in the loop nest), we can combine
5012 together intermediate results, like so (e.g. for ncopies=2):
5016 (i.e. we generate VF/2 results in a single register).
5017 In this case for each copy we get the vector def for the reduction variable
5018 from the vectorized reduction operation generated in the previous iteration.
5021 if (STMT_VINFO_RELEVANT (stmt_info
) == vect_unused_in_scope
)
5023 single_defuse_cycle
= true;
5027 epilog_copies
= ncopies
;
5029 prev_stmt_info
= NULL
;
5030 prev_phi_info
= NULL
;
5033 vec_num
= SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node
);
5034 gcc_assert (TYPE_VECTOR_SUBPARTS (vectype_out
)
5035 == TYPE_VECTOR_SUBPARTS (vectype_in
));
5040 vec_oprnds0
.create (1);
5041 if (op_type
== ternary_op
)
5042 vec_oprnds1
.create (1);
5045 phis
.create (vec_num
);
5046 vect_defs
.create (vec_num
);
5048 vect_defs
.quick_push (NULL_TREE
);
/* NOTE(review): main transformation loop -- one iteration per vector
   copy (ncopies); creates the reduction PHIs, fetches/updates the vector
   operands, and emits the vectorized reduction statement(s).  */
5050 for (j
= 0; j
< ncopies
; j
++)
5052 if (j
== 0 || !single_defuse_cycle
)
5054 for (i
= 0; i
< vec_num
; i
++)
5056 /* Create the reduction-phi that defines the reduction
5058 new_phi
= create_phi_node (vec_dest
, loop
->header
);
5059 set_vinfo_for_stmt (new_phi
,
5060 new_stmt_vec_info (new_phi
, loop_vinfo
,
5062 if (j
== 0 || slp_node
)
5063 phis
.quick_push (new_phi
);
5067 if (code
== COND_EXPR
)
5069 gcc_assert (!slp_node
);
5070 vectorizable_condition (stmt
, gsi
, vec_stmt
,
5071 PHI_RESULT (phis
[0]),
5073 /* Multiple types are not supported for condition. */
5080 op0
= ops
[!reduc_index
];
5081 if (op_type
== ternary_op
)
5083 if (reduc_index
== 0)
5090 vect_get_vec_defs (op0
, op1
, stmt
, &vec_oprnds0
, &vec_oprnds1
,
5094 loop_vec_def0
= vect_get_vec_def_for_operand (ops
[!reduc_index
],
5096 vec_oprnds0
.quick_push (loop_vec_def0
);
5097 if (op_type
== ternary_op
)
5099 loop_vec_def1
= vect_get_vec_def_for_operand (op1
, stmt
,
5101 vec_oprnds1
.quick_push (loop_vec_def1
);
5109 enum vect_def_type dt
;
5113 vect_is_simple_use (ops
[!reduc_index
], stmt
, loop_vinfo
, NULL
,
5114 &dummy_stmt
, &dummy
, &dt
);
5115 loop_vec_def0
= vect_get_vec_def_for_stmt_copy (dt
,
5117 vec_oprnds0
[0] = loop_vec_def0
;
5118 if (op_type
== ternary_op
)
5120 vect_is_simple_use (op1
, stmt
, loop_vinfo
, NULL
, &dummy_stmt
,
5122 loop_vec_def1
= vect_get_vec_def_for_stmt_copy (dt
,
5124 vec_oprnds1
[0] = loop_vec_def1
;
5128 if (single_defuse_cycle
)
5129 reduc_def
= gimple_assign_lhs (new_stmt
);
5131 STMT_VINFO_RELATED_STMT (prev_phi_info
) = new_phi
;
5134 FOR_EACH_VEC_ELT (vec_oprnds0
, i
, def0
)
5137 reduc_def
= PHI_RESULT (phis
[i
]);
5140 if (!single_defuse_cycle
|| j
== 0)
5141 reduc_def
= PHI_RESULT (new_phi
);
5144 def1
= ((op_type
== ternary_op
)
5145 ? vec_oprnds1
[i
] : NULL
);
/* NOTE(review): build the vectorized expression with the reduction
   operand placed at reduc_index (0, 1, or last).  */
5146 if (op_type
== binary_op
)
5148 if (reduc_index
== 0)
5149 expr
= build2 (code
, vectype_out
, reduc_def
, def0
);
5151 expr
= build2 (code
, vectype_out
, def0
, reduc_def
);
5155 if (reduc_index
== 0)
5156 expr
= build3 (code
, vectype_out
, reduc_def
, def0
, def1
);
5159 if (reduc_index
== 1)
5160 expr
= build3 (code
, vectype_out
, def0
, reduc_def
, def1
);
5162 expr
= build3 (code
, vectype_out
, def0
, def1
, reduc_def
);
5166 new_stmt
= gimple_build_assign (vec_dest
, expr
);
5167 new_temp
= make_ssa_name (vec_dest
, new_stmt
);
5168 gimple_assign_set_lhs (new_stmt
, new_temp
);
5169 vect_finish_stmt_generation (stmt
, new_stmt
, gsi
);
5173 SLP_TREE_VEC_STMTS (slp_node
).quick_push (new_stmt
);
5174 vect_defs
.quick_push (new_temp
);
5177 vect_defs
[0] = new_temp
;
5184 STMT_VINFO_VEC_STMT (stmt_info
) = *vec_stmt
= new_stmt
;
5186 STMT_VINFO_RELATED_STMT (prev_stmt_info
) = new_stmt
;
5188 prev_stmt_info
= vinfo_for_stmt (new_stmt
);
5189 prev_phi_info
= vinfo_for_stmt (new_phi
);
5192 /* Finalize the reduction-phi (set its arguments) and create the
5193 epilog reduction code. */
5194 if ((!single_defuse_cycle
|| code
== COND_EXPR
) && !slp_node
)
5196 new_temp
= gimple_assign_lhs (*vec_stmt
);
5197 vect_defs
[0] = new_temp
;
/* NOTE(review): emit the epilogue that reduces the partial vector
   results and wires up the reduction PHIs created above.  */
5200 vect_create_epilog_for_reduction (vect_defs
, stmt
, epilog_copies
,
5201 epilog_reduc_code
, phis
, reduc_index
,
5202 double_reduc
, slp_node
);
5205 vect_defs
.release ();
5206 vec_oprnds0
.release ();
5207 vec_oprnds1
.release ();
5212 /* Function vect_min_worthwhile_factor.
5214 For a loop where we could vectorize the operation indicated by CODE,
5215 return the minimum vectorization factor that makes it worthwhile
5216 to use generic vectors. */
/* NOTE(review): only the header comment and signature of this function
   survive in the extraction; the body (original lines ~5219-5237) is
   absent and must be restored from the original file.  Callers above
   compare its result against LOOP_VINFO_VECT_FACTOR.  */
5218 vect_min_worthwhile_factor (enum tree_code code
)
5239 /* Function vectorizable_induction
5241 Check if PHI performs an induction computation that can be vectorized.
5242 If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
5243 phi to replace it, put it in VEC_STMT, and add it to the same basic block.
5244 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
/* NOTE(review): lossy extraction -- statements split across lines and
   brace/return lines elided; restore layout from the original file
   before compiling.  Per the header comment the (elided) return type is
   presumably bool -- TODO confirm.  */
5247 vectorizable_induction (gimple phi
, gimple_stmt_iterator
*gsi ATTRIBUTE_UNUSED
,
5250 stmt_vec_info stmt_info
= vinfo_for_stmt (phi
);
5251 tree vectype
= STMT_VINFO_VECTYPE (stmt_info
);
5252 loop_vec_info loop_vinfo
= STMT_VINFO_LOOP_VINFO (stmt_info
);
5253 struct loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
5254 int nunits
= TYPE_VECTOR_SUBPARTS (vectype
);
5255 int ncopies
= LOOP_VINFO_VECT_FACTOR (loop_vinfo
) / nunits
;
5258 gcc_assert (ncopies
>= 1);
5259 /* FORNOW. These restrictions should be relaxed. */
/* NOTE(review): nested-loop case -- reject multiple types, then find
   the exit PHI of the inner loop that uses this induction's latch value
   and check how it is used relative to the outer vectorized loop.  */
5260 if (nested_in_vect_loop_p (loop
, phi
))
5262 imm_use_iterator imm_iter
;
5263 use_operand_p use_p
;
5270 if (dump_enabled_p ())
5271 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
5272 "multiple types in nested loop.");
5277 latch_e
= loop_latch_edge (loop
->inner
);
5278 loop_arg
= PHI_ARG_DEF_FROM_EDGE (phi
, latch_e
);
5279 FOR_EACH_IMM_USE_FAST (use_p
, imm_iter
, loop_arg
)
5281 if (!flow_bb_inside_loop_p (loop
->inner
,
5282 gimple_bb (USE_STMT (use_p
))))
5284 exit_phi
= USE_STMT (use_p
);
5290 stmt_vec_info exit_phi_vinfo
= vinfo_for_stmt (exit_phi
);
5291 if (!(STMT_VINFO_RELEVANT_P (exit_phi_vinfo
)
5292 && !STMT_VINFO_LIVE_P (exit_phi_vinfo
)))
5294 if (dump_enabled_p ())
5295 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
5296 "inner-loop induction only used outside "
5297 "of the outer vectorized loop.");
5303 if (!STMT_VINFO_RELEVANT_P (stmt_info
))
5306 /* FORNOW: SLP not supported. */
5307 if (STMT_SLP_TYPE (stmt_info
))
5310 gcc_assert (STMT_VINFO_DEF_TYPE (stmt_info
) == vect_induction_def
);
5312 if (gimple_code (phi
) != GIMPLE_PHI
)
/* NOTE(review): analysis-only invocation -- record the decision and
   model the cost; transformation happens only when vec_stmt != NULL.  */
5315 if (!vec_stmt
) /* transformation not required. */
5317 STMT_VINFO_TYPE (stmt_info
) = induc_vec_info_type
;
5318 if (dump_enabled_p ())
5319 dump_printf_loc (MSG_NOTE
, vect_location
,
5320 "=== vectorizable_induction ===");
5321 vect_model_induction_cost (stmt_info
, ncopies
);
5327 if (dump_enabled_p ())
5328 dump_printf_loc (MSG_NOTE
, vect_location
, "transform induction phi.");
5330 vec_def
= get_initial_def_for_induction (phi
);
5331 *vec_stmt
= SSA_NAME_DEF_STMT (vec_def
);
5335 /* Function vectorizable_live_operation.
5337 STMT computes a value that is used outside the loop. Check if
5338 it can be supported. */
/* NOTE(review): lossy extraction -- brace/return/else lines elided;
   restore layout from the original file before compiling.  This routine
   only *checks* supportability: per the trailing comment, no code is
   generated for the cases currently accepted.  */
5341 vectorizable_live_operation (gimple stmt
,
5342 gimple_stmt_iterator
*gsi ATTRIBUTE_UNUSED
,
5343 gimple
*vec_stmt ATTRIBUTE_UNUSED
)
5345 stmt_vec_info stmt_info
= vinfo_for_stmt (stmt
);
5346 loop_vec_info loop_vinfo
= STMT_VINFO_LOOP_VINFO (stmt_info
);
5347 struct loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
5353 enum vect_def_type dt
;
5354 enum tree_code code
;
5355 enum gimple_rhs_class rhs_class
;
5357 gcc_assert (STMT_VINFO_LIVE_P (stmt_info
));
5359 if (STMT_VINFO_DEF_TYPE (stmt_info
) == vect_reduction_def
)
5362 if (!is_gimple_assign (stmt
))
5365 if (TREE_CODE (gimple_assign_lhs (stmt
)) != SSA_NAME
)
5368 /* FORNOW. CHECKME. */
5369 if (nested_in_vect_loop_p (loop
, stmt
))
5372 code
= gimple_assign_rhs_code (stmt
);
5373 op_type
= TREE_CODE_LENGTH (code
);
5374 rhs_class
= get_gimple_rhs_class (code
);
5375 gcc_assert (rhs_class
!= GIMPLE_UNARY_RHS
|| op_type
== unary_op
);
5376 gcc_assert (rhs_class
!= GIMPLE_BINARY_RHS
|| op_type
== binary_op
);
5378 /* FORNOW: support only if all uses are invariant. This means
5379 that the scalar operations can remain in place, unvectorized.
5380 The original last scalar value that they compute will be used. */
5382 for (i
= 0; i
< op_type
; i
++)
5384 if (rhs_class
== GIMPLE_SINGLE_RHS
)
5385 op
= TREE_OPERAND (gimple_op (stmt
, 1), i
);
5387 op
= gimple_op (stmt
, i
+ 1);
5389 && !vect_is_simple_use (op
, stmt
, loop_vinfo
, NULL
, &def_stmt
, &def
,
5392 if (dump_enabled_p ())
5393 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
/* NOTE(review): each operand must be external or constant (loop
   invariant) for the stmt to be left unvectorized in place.  */
5398 if (dt
!= vect_external_def
&& dt
!= vect_constant_def
)
5402 /* No transformation is required for the cases we currently support. */
5406 /* Kill any debug uses outside LOOP of SSA names defined in STMT. */
/* NOTE(review): lossy extraction -- brace lines elided; restore layout
   from the original file before compiling.  Visible logic: for every
   SSA def of STMT, walk its immediate uses; a debug-bind use located
   outside LOOP has its bound value reset and the stmt updated, so stale
   debug info does not survive vectorization.  */
5409 vect_loop_kill_debug_uses (struct loop
*loop
, gimple stmt
)
5411 ssa_op_iter op_iter
;
5412 imm_use_iterator imm_iter
;
5413 def_operand_p def_p
;
5416 FOR_EACH_PHI_OR_STMT_DEF (def_p
, stmt
, op_iter
, SSA_OP_DEF
)
5418 FOR_EACH_IMM_USE_STMT (ustmt
, imm_iter
, DEF_FROM_PTR (def_p
))
5422 if (!is_gimple_debug (ustmt
))
5425 bb
= gimple_bb (ustmt
);
5427 if (!flow_bb_inside_loop_p (loop
, bb
))
5429 if (gimple_debug_bind_p (ustmt
))
5431 if (dump_enabled_p ())
5432 dump_printf_loc (MSG_NOTE
, vect_location
,
5433 "killing debug use");
5435 gimple_debug_bind_reset_value (ustmt
);
5436 update_stmt (ustmt
);
5445 /* Function vect_transform_loop.
5447 The analysis phase has determined that the loop is vectorizable.
5448 Vectorize the loop - created vectorized stmts to replace the scalar
5449 stmts in the loop, and update the loop exit condition. */
5452 vect_transform_loop (loop_vec_info loop_vinfo
)
5454 struct loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
5455 basic_block
*bbs
= LOOP_VINFO_BBS (loop_vinfo
);
5456 int nbbs
= loop
->num_nodes
;
5457 gimple_stmt_iterator si
;
5460 int vectorization_factor
= LOOP_VINFO_VECT_FACTOR (loop_vinfo
);
5462 bool slp_scheduled
= false;
5463 unsigned int nunits
;
5464 gimple stmt
, pattern_stmt
;
5465 gimple_seq pattern_def_seq
= NULL
;
5466 gimple_stmt_iterator pattern_def_si
= gsi_none ();
5467 bool transform_pattern_stmt
= false;
5468 bool check_profitability
= false;
5470 /* Record number of iterations before we started tampering with the profile. */
5471 gcov_type expected_iterations
= expected_loop_iterations_unbounded (loop
);
5473 if (dump_enabled_p ())
5474 dump_printf_loc (MSG_NOTE
, vect_location
, "=== vec_transform_loop ===");
5476 /* If profile is inprecise, we have chance to fix it up. */
5477 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo
))
5478 expected_iterations
= LOOP_VINFO_INT_NITERS (loop_vinfo
);
5480 /* Use the more conservative vectorization threshold. If the number
5481 of iterations is constant assume the cost check has been performed
5482 by our caller. If the threshold makes all loops profitable that
5483 run at least the vectorization factor number of times checking
5484 is pointless, too. */
5485 th
= ((PARAM_VALUE (PARAM_MIN_VECT_LOOP_BOUND
)
5486 * LOOP_VINFO_VECT_FACTOR (loop_vinfo
)) - 1);
5487 th
= MAX (th
, LOOP_VINFO_COST_MODEL_MIN_ITERS (loop_vinfo
));
5488 if (th
>= LOOP_VINFO_VECT_FACTOR (loop_vinfo
) - 1
5489 && !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo
))
5491 if (dump_enabled_p ())
5492 dump_printf_loc (MSG_NOTE
, vect_location
,
5493 "Profitability threshold is %d loop iterations.", th
);
5494 check_profitability
= true;
5497 /* Peel the loop if there are data refs with unknown alignment.
5498 Only one data ref with unknown store is allowed. */
5500 if (LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo
))
5502 vect_do_peeling_for_alignment (loop_vinfo
, th
, check_profitability
);
5503 check_profitability
= false;
5506 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo
)
5507 || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo
))
5509 vect_loop_versioning (loop_vinfo
, th
, check_profitability
);
5510 check_profitability
= false;
5513 /* If the loop has a symbolic number of iterations 'n' (i.e. it's not a
5514 compile time constant), or it is a constant that doesn't divide by the
5515 vectorization factor, then an epilog loop needs to be created.
5516 We therefore duplicate the loop: the original loop will be vectorized,
5517 and will compute the first (n/VF) iterations. The second copy of the loop
5518 will remain scalar and will compute the remaining (n%VF) iterations.
5519 (VF is the vectorization factor). */
5521 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo
)
5522 || (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo
)
5523 && LOOP_VINFO_INT_NITERS (loop_vinfo
) % vectorization_factor
!= 0)
5524 || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo
))
5525 vect_do_peeling_for_loop_bound (loop_vinfo
, &ratio
,
5526 th
, check_profitability
);
5528 ratio
= build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo
)),
5529 LOOP_VINFO_INT_NITERS (loop_vinfo
) / vectorization_factor
);
5531 /* 1) Make sure the loop header has exactly two entries
5532 2) Make sure we have a preheader basic block. */
5534 gcc_assert (EDGE_COUNT (loop
->header
->preds
) == 2);
5536 split_edge (loop_preheader_edge (loop
));
5538 /* FORNOW: the vectorizer supports only loops which body consist
5539 of one basic block (header + empty latch). When the vectorizer will
5540 support more involved loop forms, the order by which the BBs are
5541 traversed need to be reconsidered. */
5543 for (i
= 0; i
< nbbs
; i
++)
5545 basic_block bb
= bbs
[i
];
5546 stmt_vec_info stmt_info
;
5549 for (si
= gsi_start_phis (bb
); !gsi_end_p (si
); gsi_next (&si
))
5551 phi
= gsi_stmt (si
);
5552 if (dump_enabled_p ())
5554 dump_printf_loc (MSG_NOTE
, vect_location
,
5555 "------>vectorizing phi: ");
5556 dump_gimple_stmt (MSG_NOTE
, TDF_SLIM
, phi
, 0);
5558 stmt_info
= vinfo_for_stmt (phi
);
5562 if (MAY_HAVE_DEBUG_STMTS
&& !STMT_VINFO_LIVE_P (stmt_info
))
5563 vect_loop_kill_debug_uses (loop
, phi
);
5565 if (!STMT_VINFO_RELEVANT_P (stmt_info
)
5566 && !STMT_VINFO_LIVE_P (stmt_info
))
5569 if ((TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info
))
5570 != (unsigned HOST_WIDE_INT
) vectorization_factor
)
5571 && dump_enabled_p ())
5572 dump_printf_loc (MSG_NOTE
, vect_location
, "multiple-types.");
5574 if (STMT_VINFO_DEF_TYPE (stmt_info
) == vect_induction_def
)
5576 if (dump_enabled_p ())
5577 dump_printf_loc (MSG_NOTE
, vect_location
, "transform phi.");
5578 vect_transform_stmt (phi
, NULL
, NULL
, NULL
, NULL
);
5582 pattern_stmt
= NULL
;
5583 for (si
= gsi_start_bb (bb
); !gsi_end_p (si
) || transform_pattern_stmt
;)
5587 if (transform_pattern_stmt
)
5588 stmt
= pattern_stmt
;
5590 stmt
= gsi_stmt (si
);
5592 if (dump_enabled_p ())
5594 dump_printf_loc (MSG_NOTE
, vect_location
,
5595 "------>vectorizing statement: ");
5596 dump_gimple_stmt (MSG_NOTE
, TDF_SLIM
, stmt
, 0);
5599 stmt_info
= vinfo_for_stmt (stmt
);
5601 /* vector stmts created in the outer-loop during vectorization of
5602 stmts in an inner-loop may not have a stmt_info, and do not
5603 need to be vectorized. */
5610 if (MAY_HAVE_DEBUG_STMTS
&& !STMT_VINFO_LIVE_P (stmt_info
))
5611 vect_loop_kill_debug_uses (loop
, stmt
);
5613 if (!STMT_VINFO_RELEVANT_P (stmt_info
)
5614 && !STMT_VINFO_LIVE_P (stmt_info
))
5616 if (STMT_VINFO_IN_PATTERN_P (stmt_info
)
5617 && (pattern_stmt
= STMT_VINFO_RELATED_STMT (stmt_info
))
5618 && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt
))
5619 || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt
))))
5621 stmt
= pattern_stmt
;
5622 stmt_info
= vinfo_for_stmt (stmt
);
5630 else if (STMT_VINFO_IN_PATTERN_P (stmt_info
)
5631 && (pattern_stmt
= STMT_VINFO_RELATED_STMT (stmt_info
))
5632 && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt
))
5633 || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt
))))
5634 transform_pattern_stmt
= true;
5636 /* If pattern statement has def stmts, vectorize them too. */
5637 if (is_pattern_stmt_p (stmt_info
))
5639 if (pattern_def_seq
== NULL
)
5641 pattern_def_seq
= STMT_VINFO_PATTERN_DEF_SEQ (stmt_info
);
5642 pattern_def_si
= gsi_start (pattern_def_seq
);
5644 else if (!gsi_end_p (pattern_def_si
))
5645 gsi_next (&pattern_def_si
);
5646 if (pattern_def_seq
!= NULL
)
5648 gimple pattern_def_stmt
= NULL
;
5649 stmt_vec_info pattern_def_stmt_info
= NULL
;
5651 while (!gsi_end_p (pattern_def_si
))
5653 pattern_def_stmt
= gsi_stmt (pattern_def_si
);
5654 pattern_def_stmt_info
5655 = vinfo_for_stmt (pattern_def_stmt
);
5656 if (STMT_VINFO_RELEVANT_P (pattern_def_stmt_info
)
5657 || STMT_VINFO_LIVE_P (pattern_def_stmt_info
))
5659 gsi_next (&pattern_def_si
);
5662 if (!gsi_end_p (pattern_def_si
))
5664 if (dump_enabled_p ())
5666 dump_printf_loc (MSG_NOTE
, vect_location
,
5667 "==> vectorizing pattern def "
5669 dump_gimple_stmt (MSG_NOTE
, TDF_SLIM
,
5670 pattern_def_stmt
, 0);
5673 stmt
= pattern_def_stmt
;
5674 stmt_info
= pattern_def_stmt_info
;
5678 pattern_def_si
= gsi_none ();
5679 transform_pattern_stmt
= false;
5683 transform_pattern_stmt
= false;
5686 gcc_assert (STMT_VINFO_VECTYPE (stmt_info
));
5687 nunits
= (unsigned int) TYPE_VECTOR_SUBPARTS (
5688 STMT_VINFO_VECTYPE (stmt_info
));
5689 if (!STMT_SLP_TYPE (stmt_info
)
5690 && nunits
!= (unsigned int) vectorization_factor
5691 && dump_enabled_p ())
5692 /* For SLP VF is set according to unrolling factor, and not to
5693 vector size, hence for SLP this print is not valid. */
5694 dump_printf_loc (MSG_NOTE
, vect_location
,
5697 /* SLP. Schedule all the SLP instances when the first SLP stmt is
5699 if (STMT_SLP_TYPE (stmt_info
))
5703 slp_scheduled
= true;
5705 if (dump_enabled_p ())
5706 dump_printf_loc (MSG_NOTE
, vect_location
,
5707 "=== scheduling SLP instances ===");
5709 vect_schedule_slp (loop_vinfo
, NULL
);
5712 /* Hybrid SLP stmts must be vectorized in addition to SLP. */
5713 if (!vinfo_for_stmt (stmt
) || PURE_SLP_STMT (stmt_info
))
5715 if (!transform_pattern_stmt
&& gsi_end_p (pattern_def_si
))
5717 pattern_def_seq
= NULL
;
5724 /* -------- vectorize statement ------------ */
5725 if (dump_enabled_p ())
5726 dump_printf_loc (MSG_NOTE
, vect_location
, "transform statement.");
5728 grouped_store
= false;
5729 is_store
= vect_transform_stmt (stmt
, &si
, &grouped_store
, NULL
, NULL
);
5732 if (STMT_VINFO_GROUPED_ACCESS (stmt_info
))
5734 /* Interleaving. If IS_STORE is TRUE, the vectorization of the
5735 interleaving chain was completed - free all the stores in
5738 vect_remove_stores (GROUP_FIRST_ELEMENT (stmt_info
));
5743 /* Free the attached stmt_vec_info and remove the stmt. */
5744 gimple store
= gsi_stmt (si
);
5745 free_stmt_vec_info (store
);
5746 unlink_stmt_vdef (store
);
5747 gsi_remove (&si
, true);
5748 release_defs (store
);
5753 if (!transform_pattern_stmt
&& gsi_end_p (pattern_def_si
))
5755 pattern_def_seq
= NULL
;
5761 slpeel_make_loop_iterate_ntimes (loop
, ratio
);
5763 /* Reduce loop iterations by the vectorization factor. */
5764 scale_loop_profile (loop
, GCOV_COMPUTE_SCALE (1, vectorization_factor
),
5765 expected_iterations
/ vectorization_factor
);
5766 loop
->nb_iterations_upper_bound
5767 = loop
->nb_iterations_upper_bound
.udiv (double_int::from_uhwi (vectorization_factor
),
5769 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo
)
5770 && loop
->nb_iterations_upper_bound
!= double_int_zero
)
5771 loop
->nb_iterations_upper_bound
= loop
->nb_iterations_upper_bound
- double_int_one
;
5772 if (loop
->any_estimate
)
5774 loop
->nb_iterations_estimate
5775 = loop
->nb_iterations_estimate
.udiv (double_int::from_uhwi (vectorization_factor
),
5777 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo
)
5778 && loop
->nb_iterations_estimate
!= double_int_zero
)
5779 loop
->nb_iterations_estimate
= loop
->nb_iterations_estimate
- double_int_one
;
5782 if (dump_enabled_p ())
5783 dump_printf_loc (MSG_OPTIMIZED_LOCATIONS
, vect_location
, "LOOP VECTORIZED.");
5784 if (loop
->inner
&& dump_enabled_p ())
5785 dump_printf_loc (MSG_OPTIMIZED_LOCATIONS
, vect_location
,
5786 "OUTER LOOP VECTORIZED.");