/* Loop Vectorization
   Copyright (C) 2003-2013 Free Software Foundation, Inc.
   Contributed by Dorit Naishlos <dorit@il.ibm.com> and
   Ira Rosen <irar@il.ibm.com>

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free
Software Foundation; either version 3, or (at your option) any later
version.

GCC is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received a copy of the GNU General Public License
along with GCC; see the file COPYING3.  If not see
<http://www.gnu.org/licenses/>.  */
#include "coretypes.h"
#include "basic-block.h"
#include "gimple-pretty-print.h"
#include "tree-flow.h"
#include "tree-pass.h"
#include "diagnostic-core.h"
#include "tree-chrec.h"
#include "tree-scalar-evolution.h"
#include "tree-vectorizer.h"
/* Loop Vectorization Pass.

   This pass tries to vectorize loops.

   For example, the vectorizer transforms the following simple loop:

        short a[N]; short b[N]; short c[N]; int i;

        for (i=0; i<N; i++){
          a[i] = b[i] + c[i];
        }

   as if it had been manually vectorized by rewriting the source code into:

        typedef int __attribute__((mode(V8HI))) v8hi;
        short a[N]; short b[N]; short c[N]; int i;
        v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
        v8hi va, vb, vc;

        for (i=0; i<N/8; i++){
          vb = pb[i];
          vc = pc[i];
          va = vb + vc;
          pa[i] = va;
        }
   The main entry to this pass is vectorize_loops(), in which
   the vectorizer applies a set of analyses on a given set of loops,
   followed by the actual vectorization transformation for the loops that
   had successfully passed the analysis phase.

   Throughout this pass we make a distinction between two types of
   data: scalars (which are represented by SSA_NAMEs), and memory references
   ("data-refs").  These two types of data require different handling both
   during analysis and transformation.  The types of data-refs that the
   vectorizer currently supports are ARRAY_REFs whose base is an array DECL
   (not a pointer), and INDIRECT_REFs through pointers; both array and pointer
   accesses are required to have a simple (consecutive) access pattern.
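   For instance, an access like 'a[i]', or '*p' where 'p' advances by the
   element size every iteration, fits this model, whereas a strided access
   such as 'a[2*i]' or an indirect access such as 'a[b[i]]' does not.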
   Analysis phase:
   ===============
   The driver for the analysis phase is vect_analyze_loop().
   It applies a set of analyses, some of which rely on the scalar evolution
   analyzer (scev) developed by Sebastian Pop.

   During the analysis phase the vectorizer records some information
   per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
   loop, as well as general information about the loop as a whole, which is
   recorded in a "loop_vec_info" struct attached to each loop.
   Transformation phase:
   =====================
   The loop transformation phase scans all the stmts in the loop, and
   creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
   the loop that needs to be vectorized.  It inserts the vector code sequence
   just before the scalar stmt S, and records a pointer to the vector code
   in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
   attached to S).  This pointer will be used for the vectorization of following
   stmts which use the def of stmt S.  Stmt S is removed if it writes to memory;
   otherwise, we rely on dead code elimination for removing it.
   For example, say stmt S1 was vectorized into stmt VS1:

   VS1: vb = px[i];
   S1:  b = x[i];       STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
   S2:  a = b;

   To vectorize stmt S2, the vectorizer first finds the stmt that defines
   the operand 'b' (S1), and gets the relevant vector def 'vb' from the
   vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)).  The
   resulting sequence would be:

   VS1: vb = px[i];
   S1:  b = x[i];       STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
   VS2: va = vb;
   S2:  a = b;          STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2

   Operands that are not SSA_NAMEs are data-refs that appear in
   load/store operations (like 'x[i]' in S1), and are handled differently.
   Target modeling:
   =================
   Currently the only target specific information that is used is the
   size of the vector (in bytes) - "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".
   Targets that can support different sizes of vectors will, for now, need
   to specify one value for "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".  More
   flexibility will be added in the future.

   Since we only vectorize operations whose vector form can be
   expressed using existing tree codes, to verify that an operation is
   supported, the vectorizer checks the relevant optab at the relevant
   machine_mode (e.g., optab_handler (add_optab, V8HImode)).  If
   the value found is CODE_FOR_nothing, then there's no target support, and
   we can't vectorize the stmt.
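   As a rough illustration only (the real checks live in the various
   vectorizable_* routines and go through optab_for_tree_code), the test
   amounts to something like:

       optab op = optab_for_tree_code (PLUS_EXPR, vectype, optab_default);
       if (op == unknown_optab
           || optab_handler (op, TYPE_MODE (vectype)) == CODE_FOR_nothing)
         return false;    <-- no target support for this operation/mode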
   For additional information on this project see:
   http://gcc.gnu.org/projects/tree-ssa/vectorization.html
*/

static void vect_estimate_min_profitable_iters (loop_vec_info, int *, int *);
/* Function vect_determine_vectorization_factor

   Determine the vectorization factor (VF).  VF is the number of data elements
   that are operated upon in parallel in a single iteration of the vectorized
   loop.  For example, when vectorizing a loop that operates on 4-byte elements,
   on a target with vector size (VS) of 16 bytes, the VF is set to 4, since 4
   elements can fit in a single vector register.

   We currently support vectorization of loops in which all types operated upon
   are of the same size.  Therefore this function currently sets VF according to
   the size of the types operated upon, and fails if there are multiple sizes
   in the loop.

   VF is also the factor by which the loop iterations are strip-mined, e.g.:

   original loop:
        for (i=0; i<N; i++){
          a[i] = b[i] + c[i];
        }

   vectorized loop:
        for (i=0; i<N; i+=VF){
          a[i:VF] = b[i:VF] + c[i:VF];
        }
*/
static bool
vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
{
  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
  int nbbs = loop->num_nodes;
  gimple_stmt_iterator si;
  unsigned int vectorization_factor = 0;
  tree scalar_type;
  gimple phi;
  tree vectype;
  unsigned int nunits;
  stmt_vec_info stmt_info;
  int i;
  HOST_WIDE_INT dummy;
  gimple stmt, pattern_stmt = NULL;
  gimple_seq pattern_def_seq = NULL;
  gimple_stmt_iterator pattern_def_si = gsi_none ();
  bool analyze_pattern_stmt = false;
  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
                     "=== vect_determine_vectorization_factor ===");
  for (i = 0; i < nbbs; i++)
    {
      basic_block bb = bbs[i];

      for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
        {
          phi = gsi_stmt (si);
          stmt_info = vinfo_for_stmt (phi);

          if (dump_enabled_p ())
            {
              dump_printf_loc (MSG_NOTE, vect_location, "==> examining phi: ");
              dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
            }

          gcc_assert (stmt_info);
          if (STMT_VINFO_RELEVANT_P (stmt_info))
            {
              gcc_assert (!STMT_VINFO_VECTYPE (stmt_info));
              scalar_type = TREE_TYPE (PHI_RESULT (phi));
              if (dump_enabled_p ())
                {
                  dump_printf_loc (MSG_NOTE, vect_location,
                                   "get vectype for scalar type: ");
                  dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
                }

              vectype = get_vectype_for_scalar_type (scalar_type);
224 if (dump_enabled_p ())
226 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
227 "not vectorized: unsupported "
229 dump_generic_expr (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
,
234 STMT_VINFO_VECTYPE (stmt_info
) = vectype
;
236 if (dump_enabled_p ())
238 dump_printf_loc (MSG_NOTE
, vect_location
, "vectype: ");
239 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, vectype
);
242 nunits
= TYPE_VECTOR_SUBPARTS (vectype
);
243 if (dump_enabled_p ())
244 dump_printf_loc (MSG_NOTE
, vect_location
, "nunits = %d", nunits
);
246 if (!vectorization_factor
247 || (nunits
> vectorization_factor
))
248 vectorization_factor
= nunits
;
252 for (si
= gsi_start_bb (bb
); !gsi_end_p (si
) || analyze_pattern_stmt
;)
256 if (analyze_pattern_stmt
)
259 stmt
= gsi_stmt (si
);
261 stmt_info
= vinfo_for_stmt (stmt
);
263 if (dump_enabled_p ())
265 dump_printf_loc (MSG_NOTE
, vect_location
,
266 "==> examining statement: ");
267 dump_gimple_stmt (MSG_NOTE
, TDF_SLIM
, stmt
, 0);
270 gcc_assert (stmt_info
);
272 /* Skip stmts which do not need to be vectorized. */
273 if (!STMT_VINFO_RELEVANT_P (stmt_info
)
274 && !STMT_VINFO_LIVE_P (stmt_info
))
276 if (STMT_VINFO_IN_PATTERN_P (stmt_info
)
277 && (pattern_stmt
= STMT_VINFO_RELATED_STMT (stmt_info
))
278 && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt
))
279 || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt
))))
282 stmt_info
= vinfo_for_stmt (pattern_stmt
);
283 if (dump_enabled_p ())
285 dump_printf_loc (MSG_NOTE
, vect_location
,
286 "==> examining pattern statement: ");
287 dump_gimple_stmt (MSG_NOTE
, TDF_SLIM
, stmt
, 0);
292 if (dump_enabled_p ())
293 dump_printf_loc (MSG_NOTE
, vect_location
, "skip.");
298 else if (STMT_VINFO_IN_PATTERN_P (stmt_info
)
299 && (pattern_stmt
= STMT_VINFO_RELATED_STMT (stmt_info
))
300 && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt
))
301 || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt
))))
302 analyze_pattern_stmt
= true;
304 /* If a pattern statement has def stmts, analyze them too. */
305 if (is_pattern_stmt_p (stmt_info
))
307 if (pattern_def_seq
== NULL
)
309 pattern_def_seq
= STMT_VINFO_PATTERN_DEF_SEQ (stmt_info
);
310 pattern_def_si
= gsi_start (pattern_def_seq
);
312 else if (!gsi_end_p (pattern_def_si
))
313 gsi_next (&pattern_def_si
);
314 if (pattern_def_seq
!= NULL
)
316 gimple pattern_def_stmt
= NULL
;
317 stmt_vec_info pattern_def_stmt_info
= NULL
;
319 while (!gsi_end_p (pattern_def_si
))
321 pattern_def_stmt
= gsi_stmt (pattern_def_si
);
322 pattern_def_stmt_info
323 = vinfo_for_stmt (pattern_def_stmt
);
324 if (STMT_VINFO_RELEVANT_P (pattern_def_stmt_info
)
325 || STMT_VINFO_LIVE_P (pattern_def_stmt_info
))
327 gsi_next (&pattern_def_si
);
330 if (!gsi_end_p (pattern_def_si
))
332 if (dump_enabled_p ())
334 dump_printf_loc (MSG_NOTE
, vect_location
,
335 "==> examining pattern def stmt: ");
336 dump_gimple_stmt (MSG_NOTE
, TDF_SLIM
,
337 pattern_def_stmt
, 0);
340 stmt
= pattern_def_stmt
;
341 stmt_info
= pattern_def_stmt_info
;
345 pattern_def_si
= gsi_none ();
346 analyze_pattern_stmt
= false;
350 analyze_pattern_stmt
= false;
353 if (gimple_get_lhs (stmt
) == NULL_TREE
)
355 if (dump_enabled_p ())
357 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
358 "not vectorized: irregular stmt.");
359 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, stmt
,
365 if (VECTOR_MODE_P (TYPE_MODE (gimple_expr_type (stmt
))))
367 if (dump_enabled_p ())
369 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
370 "not vectorized: vector stmt in loop:");
371 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, stmt
, 0);
376 if (STMT_VINFO_VECTYPE (stmt_info
))
              /* The only case in which a vectype has already been set is for
                 stmts that contain a dataref, or for "pattern-stmts" (stmts
                 generated by the vectorizer to represent/replace a certain
                 computation).  */
382 gcc_assert (STMT_VINFO_DATA_REF (stmt_info
)
383 || is_pattern_stmt_p (stmt_info
)
384 || !gsi_end_p (pattern_def_si
));
385 vectype
= STMT_VINFO_VECTYPE (stmt_info
);
389 gcc_assert (!STMT_VINFO_DATA_REF (stmt_info
));
390 scalar_type
= TREE_TYPE (gimple_get_lhs (stmt
));
391 if (dump_enabled_p ())
393 dump_printf_loc (MSG_NOTE
, vect_location
,
394 "get vectype for scalar type: ");
395 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, scalar_type
);
397 vectype
= get_vectype_for_scalar_type (scalar_type
);
400 if (dump_enabled_p ())
402 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
403 "not vectorized: unsupported "
405 dump_generic_expr (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
,
411 STMT_VINFO_VECTYPE (stmt_info
) = vectype
;
413 if (dump_enabled_p ())
415 dump_printf_loc (MSG_NOTE
, vect_location
, "vectype: ");
416 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, vectype
);
          /* The vectorization factor is determined by the smallest
             scalar type (or the largest vector size, but we only
             support one vector size per loop).  */
423 scalar_type
= vect_get_smallest_scalar_type (stmt
, &dummy
,
425 if (dump_enabled_p ())
427 dump_printf_loc (MSG_NOTE
, vect_location
,
428 "get vectype for scalar type: ");
429 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, scalar_type
);
431 vf_vectype
= get_vectype_for_scalar_type (scalar_type
);
434 if (dump_enabled_p ())
436 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
437 "not vectorized: unsupported data-type ");
438 dump_generic_expr (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
,
444 if ((GET_MODE_SIZE (TYPE_MODE (vectype
))
445 != GET_MODE_SIZE (TYPE_MODE (vf_vectype
))))
447 if (dump_enabled_p ())
449 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
450 "not vectorized: different sized vector "
451 "types in statement, ");
452 dump_generic_expr (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
,
454 dump_printf (MSG_MISSED_OPTIMIZATION
, " and ");
455 dump_generic_expr (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
,
461 if (dump_enabled_p ())
463 dump_printf_loc (MSG_NOTE
, vect_location
, "vectype: ");
464 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, vf_vectype
);
467 nunits
= TYPE_VECTOR_SUBPARTS (vf_vectype
);
468 if (dump_enabled_p ())
469 dump_printf_loc (MSG_NOTE
, vect_location
, "nunits = %d", nunits
);
470 if (!vectorization_factor
471 || (nunits
> vectorization_factor
))
472 vectorization_factor
= nunits
;
474 if (!analyze_pattern_stmt
&& gsi_end_p (pattern_def_si
))
476 pattern_def_seq
= NULL
;
  /* TODO: Analyze cost.  Decide if it is worthwhile to vectorize.  */
483 if (dump_enabled_p ())
484 dump_printf_loc (MSG_NOTE
, vect_location
, "vectorization factor = %d",
485 vectorization_factor
);
486 if (vectorization_factor
<= 1)
488 if (dump_enabled_p ())
489 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
490 "not vectorized: unsupported data-type");
493 LOOP_VINFO_VECT_FACTOR (loop_vinfo
) = vectorization_factor
;
/* Function vect_is_simple_iv_evolution.

   FORNOW: A simple evolution of an induction variable in the loop is
   considered a polynomial evolution with constant step.  */
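/* For example, SCEV represents such an evolution as the chrec
   {base, +, step}_loopnum (e.g. {0, +, 1}_1 for a counter incremented by
   one each iteration); a step that is not an INTEGER_CST is rejected
   below.  */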
505 vect_is_simple_iv_evolution (unsigned loop_nb
, tree access_fn
, tree
* init
,
510 tree evolution_part
= evolution_part_in_loop_num (access_fn
, loop_nb
);
  /* When there is no evolution in this loop, the evolution function
     is constant.  */
514 if (evolution_part
== NULL_TREE
)
  /* When the evolution is a polynomial of degree >= 2
     the evolution function is not "simple".  */
519 if (tree_is_chrec (evolution_part
))
522 step_expr
= evolution_part
;
523 init_expr
= unshare_expr (initial_condition_in_loop_num (access_fn
, loop_nb
));
525 if (dump_enabled_p ())
527 dump_printf_loc (MSG_NOTE
, vect_location
, "step: ");
528 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, step_expr
);
529 dump_printf (MSG_NOTE
, ", init: ");
530 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, init_expr
);
536 if (TREE_CODE (step_expr
) != INTEGER_CST
)
538 if (dump_enabled_p ())
539 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
/* Function vect_analyze_scalar_cycles_1.

   Examine the cross iteration def-use cycles of scalar variables
   in LOOP.  LOOP_VINFO represents the loop that is now being
   considered for vectorization (can be LOOP, or an outer-loop
   enclosing LOOP).  */
555 vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo
, struct loop
*loop
)
557 basic_block bb
= loop
->header
;
559 vec
<gimple
> worklist
;
560 worklist
.create (64);
561 gimple_stmt_iterator gsi
;
564 if (dump_enabled_p ())
565 dump_printf_loc (MSG_NOTE
, vect_location
,
566 "=== vect_analyze_scalar_cycles ===");
  /* First - identify all inductions.  Reduction detection assumes that all the
     inductions have been identified, therefore, this order must not be
     changed.  */
571 for (gsi
= gsi_start_phis (bb
); !gsi_end_p (gsi
); gsi_next (&gsi
))
573 gimple phi
= gsi_stmt (gsi
);
574 tree access_fn
= NULL
;
575 tree def
= PHI_RESULT (phi
);
576 stmt_vec_info stmt_vinfo
= vinfo_for_stmt (phi
);
578 if (dump_enabled_p ())
580 dump_printf_loc (MSG_NOTE
, vect_location
, "Analyze phi: ");
581 dump_gimple_stmt (MSG_NOTE
, TDF_SLIM
, phi
, 0);
      /* Skip virtual phis.  The data dependences that are associated with
         virtual defs/uses (i.e., memory accesses) are analyzed elsewhere.  */
586 if (virtual_operand_p (def
))
589 STMT_VINFO_DEF_TYPE (stmt_vinfo
) = vect_unknown_def_type
;
591 /* Analyze the evolution function. */
592 access_fn
= analyze_scalar_evolution (loop
, def
);
595 STRIP_NOPS (access_fn
);
596 if (dump_enabled_p ())
598 dump_printf_loc (MSG_NOTE
, vect_location
,
599 "Access function of PHI: ");
600 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, access_fn
);
602 STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo
)
603 = evolution_part_in_loop_num (access_fn
, loop
->num
);
607 || !vect_is_simple_iv_evolution (loop
->num
, access_fn
, &dumy
, &dumy
))
609 worklist
.safe_push (phi
);
613 gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo
) != NULL_TREE
);
615 if (dump_enabled_p ())
616 dump_printf_loc (MSG_NOTE
, vect_location
, "Detected induction.");
617 STMT_VINFO_DEF_TYPE (stmt_vinfo
) = vect_induction_def
;
621 /* Second - identify all reductions and nested cycles. */
622 while (worklist
.length () > 0)
624 gimple phi
= worklist
.pop ();
625 tree def
= PHI_RESULT (phi
);
626 stmt_vec_info stmt_vinfo
= vinfo_for_stmt (phi
);
630 if (dump_enabled_p ())
632 dump_printf_loc (MSG_NOTE
, vect_location
, "Analyze phi: ");
633 dump_gimple_stmt (MSG_NOTE
, TDF_SLIM
, phi
, 0);
636 gcc_assert (!virtual_operand_p (def
)
637 && STMT_VINFO_DEF_TYPE (stmt_vinfo
) == vect_unknown_def_type
);
639 nested_cycle
= (loop
!= LOOP_VINFO_LOOP (loop_vinfo
));
640 reduc_stmt
= vect_force_simple_reduction (loop_vinfo
, phi
, !nested_cycle
,
646 if (dump_enabled_p ())
647 dump_printf_loc (MSG_NOTE
, vect_location
,
648 "Detected double reduction.");
650 STMT_VINFO_DEF_TYPE (stmt_vinfo
) = vect_double_reduction_def
;
651 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt
)) =
652 vect_double_reduction_def
;
658 if (dump_enabled_p ())
659 dump_printf_loc (MSG_NOTE
, vect_location
,
660 "Detected vectorizable nested cycle.");
662 STMT_VINFO_DEF_TYPE (stmt_vinfo
) = vect_nested_cycle
;
663 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt
)) =
668 if (dump_enabled_p ())
669 dump_printf_loc (MSG_NOTE
, vect_location
,
670 "Detected reduction.");
672 STMT_VINFO_DEF_TYPE (stmt_vinfo
) = vect_reduction_def
;
673 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt
)) =
          /* Store the reduction cycles for possible vectorization in
             loop-aware SLP.  */
677 LOOP_VINFO_REDUCTIONS (loop_vinfo
).safe_push (reduc_stmt
);
682 if (dump_enabled_p ())
683 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
684 "Unknown def-use cycle pattern.");
/* Function vect_analyze_scalar_cycles.

   Examine the cross iteration def-use cycles of scalar variables, by
   analyzing the loop-header PHIs of scalar variables.  Classify each
   cycle as one of the following: invariant, induction, reduction, unknown.
   We do that for the loop represented by LOOP_VINFO, and also for its
   inner-loop, if it exists.

   Examples for scalar cycles:
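   Example1: reduction:

              loop1:
              for (i=0; i<N; i++)
                 sum += a[i];

   Example2: induction:

              loop2:
              for (i=0; i<N; i++)
                 a[i] = i;  */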
static void
vect_analyze_scalar_cycles (loop_vec_info loop_vinfo)
{
  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);

  vect_analyze_scalar_cycles_1 (loop_vinfo, loop);
  /* When vectorizing an outer-loop, the inner-loop is executed sequentially.
     Reductions in such an inner-loop therefore have different properties than
     the reductions in the nest that gets vectorized:
     1. When vectorized, they are executed in the same order as in the original
        scalar loop, so we can't change the order of computation when
        vectorizing them.
     2. FIXME: Inner-loop reductions can be used in the inner-loop, so the
        current checks are too strict.  */
  if (loop->inner)
    vect_analyze_scalar_cycles_1 (loop_vinfo, loop->inner);
}
/* Function vect_get_loop_niters.

   Determine how many iterations the loop is executed.
   If an expression that represents the number of iterations
   can be constructed, place it in NUMBER_OF_ITERATIONS.
   Return the loop exit condition.  */
740 vect_get_loop_niters (struct loop
*loop
, tree
*number_of_iterations
)
744 if (dump_enabled_p ())
745 dump_printf_loc (MSG_NOTE
, vect_location
,
746 "=== get_loop_niters ===");
747 niters
= number_of_exit_cond_executions (loop
);
749 if (niters
!= NULL_TREE
750 && niters
!= chrec_dont_know
)
752 *number_of_iterations
= niters
;
754 if (dump_enabled_p ())
756 dump_printf_loc (MSG_NOTE
, vect_location
, "==> get_loop_niters:");
757 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, *number_of_iterations
);
761 return get_loop_exit_condition (loop
);
/* Function bb_in_loop_p

   Used as predicate for dfs order traversal of the loop bbs.  */
770 bb_in_loop_p (const_basic_block bb
, const void *data
)
772 const struct loop
*const loop
= (const struct loop
*)data
;
773 if (flow_bb_inside_loop_p (loop
, bb
))
/* Function new_loop_vec_info.

   Create and initialize a new loop_vec_info struct for LOOP, as well as
   stmt_vec_info structs for all the stmts in LOOP.  */
785 new_loop_vec_info (struct loop
*loop
)
789 gimple_stmt_iterator si
;
790 unsigned int i
, nbbs
;
792 res
= (loop_vec_info
) xcalloc (1, sizeof (struct _loop_vec_info
));
793 LOOP_VINFO_LOOP (res
) = loop
;
795 bbs
= get_loop_body (loop
);
797 /* Create/Update stmt_info for all stmts in the loop. */
798 for (i
= 0; i
< loop
->num_nodes
; i
++)
800 basic_block bb
= bbs
[i
];
      /* BBs in a nested inner-loop will have been already processed (because
         we will have called vect_analyze_loop_form for any nested inner-loop).
         Therefore, for stmts in an inner-loop we just want to update the
         STMT_VINFO_LOOP_VINFO field of their stmt_info to point to the new
         loop_info of the outer-loop we are currently considering to vectorize
         (instead of the loop_info of the inner-loop).
         For stmts in other BBs we need to create a stmt_info from scratch.  */
809 if (bb
->loop_father
!= loop
)
812 gcc_assert (loop
->inner
&& bb
->loop_father
== loop
->inner
);
813 for (si
= gsi_start_phis (bb
); !gsi_end_p (si
); gsi_next (&si
))
815 gimple phi
= gsi_stmt (si
);
816 stmt_vec_info stmt_info
= vinfo_for_stmt (phi
);
817 loop_vec_info inner_loop_vinfo
=
818 STMT_VINFO_LOOP_VINFO (stmt_info
);
819 gcc_assert (loop
->inner
== LOOP_VINFO_LOOP (inner_loop_vinfo
));
820 STMT_VINFO_LOOP_VINFO (stmt_info
) = res
;
822 for (si
= gsi_start_bb (bb
); !gsi_end_p (si
); gsi_next (&si
))
824 gimple stmt
= gsi_stmt (si
);
825 stmt_vec_info stmt_info
= vinfo_for_stmt (stmt
);
826 loop_vec_info inner_loop_vinfo
=
827 STMT_VINFO_LOOP_VINFO (stmt_info
);
828 gcc_assert (loop
->inner
== LOOP_VINFO_LOOP (inner_loop_vinfo
));
829 STMT_VINFO_LOOP_VINFO (stmt_info
) = res
;
834 /* bb in current nest. */
835 for (si
= gsi_start_phis (bb
); !gsi_end_p (si
); gsi_next (&si
))
837 gimple phi
= gsi_stmt (si
);
838 gimple_set_uid (phi
, 0);
839 set_vinfo_for_stmt (phi
, new_stmt_vec_info (phi
, res
, NULL
));
842 for (si
= gsi_start_bb (bb
); !gsi_end_p (si
); gsi_next (&si
))
844 gimple stmt
= gsi_stmt (si
);
845 gimple_set_uid (stmt
, 0);
846 set_vinfo_for_stmt (stmt
, new_stmt_vec_info (stmt
, res
, NULL
));
  /* CHECKME: We want to visit all BBs before their successors (except for
     latch blocks, for which this assertion wouldn't hold).  In the simple
     case of the loop forms we allow, a dfs order of the BBs would be the same
     as reversed postorder traversal, so we are safe.  */
857 bbs
= XCNEWVEC (basic_block
, loop
->num_nodes
);
858 nbbs
= dfs_enumerate_from (loop
->header
, 0, bb_in_loop_p
,
859 bbs
, loop
->num_nodes
, loop
);
860 gcc_assert (nbbs
== loop
->num_nodes
);
862 LOOP_VINFO_BBS (res
) = bbs
;
863 LOOP_VINFO_NITERS (res
) = NULL
;
864 LOOP_VINFO_NITERS_UNCHANGED (res
) = NULL
;
865 LOOP_VINFO_COST_MODEL_MIN_ITERS (res
) = 0;
866 LOOP_VINFO_VECTORIZABLE_P (res
) = 0;
867 LOOP_PEELING_FOR_ALIGNMENT (res
) = 0;
868 LOOP_VINFO_VECT_FACTOR (res
) = 0;
869 LOOP_VINFO_LOOP_NEST (res
).create (3);
870 LOOP_VINFO_DATAREFS (res
).create (10);
871 LOOP_VINFO_DDRS (res
).create (10 * 10);
872 LOOP_VINFO_UNALIGNED_DR (res
) = NULL
;
873 LOOP_VINFO_MAY_MISALIGN_STMTS (res
).create (
874 PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIGNMENT_CHECKS
));
875 LOOP_VINFO_MAY_ALIAS_DDRS (res
).create (
876 PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS
));
877 LOOP_VINFO_GROUPED_STORES (res
).create (10);
878 LOOP_VINFO_REDUCTIONS (res
).create (10);
879 LOOP_VINFO_REDUCTION_CHAINS (res
).create (10);
880 LOOP_VINFO_SLP_INSTANCES (res
).create (10);
881 LOOP_VINFO_SLP_UNROLLING_FACTOR (res
) = 1;
882 LOOP_VINFO_TARGET_COST_DATA (res
) = init_cost (loop
);
883 LOOP_VINFO_PEELING_FOR_GAPS (res
) = false;
884 LOOP_VINFO_OPERANDS_SWAPPED (res
) = false;
/* Function destroy_loop_vec_info.

   Free LOOP_VINFO struct, as well as all the stmt_vec_info structs of all the
   stmts in the loop.  */
896 destroy_loop_vec_info (loop_vec_info loop_vinfo
, bool clean_stmts
)
901 gimple_stmt_iterator si
;
903 vec
<slp_instance
> slp_instances
;
904 slp_instance instance
;
910 loop
= LOOP_VINFO_LOOP (loop_vinfo
);
912 bbs
= LOOP_VINFO_BBS (loop_vinfo
);
913 nbbs
= clean_stmts
? loop
->num_nodes
: 0;
914 swapped
= LOOP_VINFO_OPERANDS_SWAPPED (loop_vinfo
);
916 for (j
= 0; j
< nbbs
; j
++)
918 basic_block bb
= bbs
[j
];
919 for (si
= gsi_start_phis (bb
); !gsi_end_p (si
); gsi_next (&si
))
920 free_stmt_vec_info (gsi_stmt (si
));
922 for (si
= gsi_start_bb (bb
); !gsi_end_p (si
); )
924 gimple stmt
= gsi_stmt (si
);
          /* We may have broken canonical form by moving a constant
             into RHS1 of a commutative op.  Fix such occurrences.  */
928 if (swapped
&& is_gimple_assign (stmt
))
930 enum tree_code code
= gimple_assign_rhs_code (stmt
);
932 if ((code
== PLUS_EXPR
933 || code
== POINTER_PLUS_EXPR
934 || code
== MULT_EXPR
)
935 && CONSTANT_CLASS_P (gimple_assign_rhs1 (stmt
)))
936 swap_tree_operands (stmt
,
937 gimple_assign_rhs1_ptr (stmt
),
938 gimple_assign_rhs2_ptr (stmt
));
941 /* Free stmt_vec_info. */
942 free_stmt_vec_info (stmt
);
947 free (LOOP_VINFO_BBS (loop_vinfo
));
948 free_data_refs (LOOP_VINFO_DATAREFS (loop_vinfo
));
949 free_dependence_relations (LOOP_VINFO_DDRS (loop_vinfo
));
950 LOOP_VINFO_LOOP_NEST (loop_vinfo
).release ();
951 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo
).release ();
952 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo
).release ();
953 slp_instances
= LOOP_VINFO_SLP_INSTANCES (loop_vinfo
);
954 FOR_EACH_VEC_ELT (slp_instances
, j
, instance
)
955 vect_free_slp_instance (instance
);
957 LOOP_VINFO_SLP_INSTANCES (loop_vinfo
).release ();
958 LOOP_VINFO_GROUPED_STORES (loop_vinfo
).release ();
959 LOOP_VINFO_REDUCTIONS (loop_vinfo
).release ();
960 LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo
).release ();
962 if (LOOP_VINFO_PEELING_HTAB (loop_vinfo
).is_created ())
963 LOOP_VINFO_PEELING_HTAB (loop_vinfo
).dispose ();
965 destroy_cost_data (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo
));
/* Function vect_analyze_loop_1.

   Apply a set of analyses on LOOP, and create a loop_vec_info struct
   for it.  The different analyses will record information in the
   loop_vec_info struct.  This is a subset of the analyses applied in
   vect_analyze_loop, to be applied on an inner-loop nested in the loop
   that is now considered for (outer-loop) vectorization.  */
981 vect_analyze_loop_1 (struct loop
*loop
)
983 loop_vec_info loop_vinfo
;
985 if (dump_enabled_p ())
986 dump_printf_loc (MSG_NOTE
, vect_location
,
987 "===== analyze_loop_nest_1 =====");
  /* Check the CFG characteristics of the loop (nesting, entry/exit, etc.).  */
991 loop_vinfo
= vect_analyze_loop_form (loop
);
994 if (dump_enabled_p ())
995 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
996 "bad inner-loop form.");
/* Function vect_analyze_loop_form.

   Verify that certain CFG restrictions hold, including:
   - the loop has a pre-header
   - the loop has a single entry and exit
   - the loop exit condition is simple enough, and the number of iterations
     can be analyzed (a countable loop).  */
1013 vect_analyze_loop_form (struct loop
*loop
)
1015 loop_vec_info loop_vinfo
;
1017 tree number_of_iterations
= NULL
;
1018 loop_vec_info inner_loop_vinfo
= NULL
;
1020 if (dump_enabled_p ())
1021 dump_printf_loc (MSG_NOTE
, vect_location
,
1022 "=== vect_analyze_loop_form ===");
  /* Different restrictions apply when we are considering an inner-most loop,
     vs. an outer (nested) loop.
     (FORNOW. May want to relax some of these restrictions in the future).  */
      /* Inner-most loop.  We currently require that the number of BBs is
         exactly 2 (the header and the latch); vectorizable inner-most loops
         therefore consist of a single-block body plus an empty latch.  */
1042 if (loop
->num_nodes
!= 2)
1044 if (dump_enabled_p ())
1045 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1046 "not vectorized: control flow in loop.");
1050 if (empty_block_p (loop
->header
))
1052 if (dump_enabled_p ())
1053 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1054 "not vectorized: empty loop.");
1060 struct loop
*innerloop
= loop
->inner
;
      /* Nested loop.  We currently require that the loop is doubly-nested,
         contains a single inner loop, and that the number of BBs is exactly 5.
         The inner-loop has the properties expected of inner-most loops
         as described above.  */
1080 if ((loop
->inner
)->inner
|| (loop
->inner
)->next
)
1082 if (dump_enabled_p ())
1083 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1084 "not vectorized: multiple nested loops.");
1088 /* Analyze the inner-loop. */
1089 inner_loop_vinfo
= vect_analyze_loop_1 (loop
->inner
);
1090 if (!inner_loop_vinfo
)
1092 if (dump_enabled_p ())
1093 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1094 "not vectorized: Bad inner loop.");
1098 if (!expr_invariant_in_loop_p (loop
,
1099 LOOP_VINFO_NITERS (inner_loop_vinfo
)))
1101 if (dump_enabled_p ())
1102 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1103 "not vectorized: inner-loop count not invariant.");
1104 destroy_loop_vec_info (inner_loop_vinfo
, true);
1108 if (loop
->num_nodes
!= 5)
1110 if (dump_enabled_p ())
1111 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1112 "not vectorized: control flow in loop.");
1113 destroy_loop_vec_info (inner_loop_vinfo
, true);
1117 gcc_assert (EDGE_COUNT (innerloop
->header
->preds
) == 2);
1118 entryedge
= EDGE_PRED (innerloop
->header
, 0);
1119 if (EDGE_PRED (innerloop
->header
, 0)->src
== innerloop
->latch
)
1120 entryedge
= EDGE_PRED (innerloop
->header
, 1);
1122 if (entryedge
->src
!= loop
->header
1123 || !single_exit (innerloop
)
1124 || single_exit (innerloop
)->dest
!= EDGE_PRED (loop
->latch
, 0)->src
)
1126 if (dump_enabled_p ())
1127 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1128 "not vectorized: unsupported outerloop form.");
1129 destroy_loop_vec_info (inner_loop_vinfo
, true);
1133 if (dump_enabled_p ())
1134 dump_printf_loc (MSG_NOTE
, vect_location
,
1135 "Considering outer-loop vectorization.");
1138 if (!single_exit (loop
)
1139 || EDGE_COUNT (loop
->header
->preds
) != 2)
1141 if (dump_enabled_p ())
1143 if (!single_exit (loop
))
1144 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1145 "not vectorized: multiple exits.");
1146 else if (EDGE_COUNT (loop
->header
->preds
) != 2)
1147 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1148 "not vectorized: too many incoming edges.");
1150 if (inner_loop_vinfo
)
1151 destroy_loop_vec_info (inner_loop_vinfo
, true);
  /* We assume that the loop exit condition is at the end of the loop, i.e.,
     that the loop is represented as a do-while (with a proper if-guard
     before the loop if needed), where the loop header contains all the
     executable statements, and the latch is empty.  */
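  /* Schematically, the accepted shape is:

         header:  ...all loop statements...
                  if (exit_cond) goto exit_bb; else goto latch;
         latch:   goto header;      <-- no statements, no PHIs

     which is what the check below enforces.  */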
1159 if (!empty_block_p (loop
->latch
)
1160 || !gimple_seq_empty_p (phi_nodes (loop
->latch
)))
1162 if (dump_enabled_p ())
1163 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1164 "not vectorized: latch block not empty.");
1165 if (inner_loop_vinfo
)
1166 destroy_loop_vec_info (inner_loop_vinfo
, true);
1170 /* Make sure there exists a single-predecessor exit bb: */
1171 if (!single_pred_p (single_exit (loop
)->dest
))
1173 edge e
= single_exit (loop
);
1174 if (!(e
->flags
& EDGE_ABNORMAL
))
1176 split_loop_exit_edge (e
);
1177 if (dump_enabled_p ())
1178 dump_printf (MSG_NOTE
, "split exit edge.");
1182 if (dump_enabled_p ())
1183 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1184 "not vectorized: abnormal loop exit edge.");
1185 if (inner_loop_vinfo
)
1186 destroy_loop_vec_info (inner_loop_vinfo
, true);
1191 loop_cond
= vect_get_loop_niters (loop
, &number_of_iterations
);
1194 if (dump_enabled_p ())
1195 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1196 "not vectorized: complicated exit condition.");
1197 if (inner_loop_vinfo
)
1198 destroy_loop_vec_info (inner_loop_vinfo
, true);
1202 if (!number_of_iterations
)
1204 if (dump_enabled_p ())
1205 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1206 "not vectorized: number of iterations cannot be "
1208 if (inner_loop_vinfo
)
1209 destroy_loop_vec_info (inner_loop_vinfo
, true);
1213 if (chrec_contains_undetermined (number_of_iterations
))
1215 if (dump_enabled_p ())
1216 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1217 "Infinite number of iterations.");
1218 if (inner_loop_vinfo
)
1219 destroy_loop_vec_info (inner_loop_vinfo
, true);
1223 if (!NITERS_KNOWN_P (number_of_iterations
))
1225 if (dump_enabled_p ())
1227 dump_printf_loc (MSG_NOTE
, vect_location
,
1228 "Symbolic number of iterations is ");
1229 dump_generic_expr (MSG_NOTE
, TDF_DETAILS
, number_of_iterations
);
1232 else if (TREE_INT_CST_LOW (number_of_iterations
) == 0)
1234 if (dump_enabled_p ())
1235 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1236 "not vectorized: number of iterations = 0.");
1237 if (inner_loop_vinfo
)
1238 destroy_loop_vec_info (inner_loop_vinfo
, true);
1242 loop_vinfo
= new_loop_vec_info (loop
);
1243 LOOP_VINFO_NITERS (loop_vinfo
) = number_of_iterations
;
1244 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo
) = number_of_iterations
;
1246 STMT_VINFO_TYPE (vinfo_for_stmt (loop_cond
)) = loop_exit_ctrl_vec_info_type
;
  /* CHECKME: May want to keep it around in the future.  */
1249 if (inner_loop_vinfo
)
1250 destroy_loop_vec_info (inner_loop_vinfo
, false);
1252 gcc_assert (!loop
->aux
);
1253 loop
->aux
= loop_vinfo
;
/* Function vect_analyze_loop_operations.

   Scan the loop stmts and make sure they are all vectorizable.  */
1263 vect_analyze_loop_operations (loop_vec_info loop_vinfo
, bool slp
)
1265 struct loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
1266 basic_block
*bbs
= LOOP_VINFO_BBS (loop_vinfo
);
1267 int nbbs
= loop
->num_nodes
;
1268 gimple_stmt_iterator si
;
1269 unsigned int vectorization_factor
= 0;
1272 stmt_vec_info stmt_info
;
1273 bool need_to_vectorize
= false;
1274 int min_profitable_iters
;
1275 int min_scalar_loop_bound
;
1277 bool only_slp_in_loop
= true, ok
;
1278 HOST_WIDE_INT max_niter
;
1279 HOST_WIDE_INT estimated_niter
;
1280 int min_profitable_estimate
;
1282 if (dump_enabled_p ())
1283 dump_printf_loc (MSG_NOTE
, vect_location
,
1284 "=== vect_analyze_loop_operations ===");
1286 gcc_assert (LOOP_VINFO_VECT_FACTOR (loop_vinfo
));
1287 vectorization_factor
= LOOP_VINFO_VECT_FACTOR (loop_vinfo
);
  /* If all the stmts in the loop can be SLPed, we perform only SLP, and the
     vectorization factor of the loop is the unrolling factor required by
     the SLP instances.  If that unrolling factor is 1, we say that we
     perform pure SLP on the loop - cross iteration parallelism is not
     exploited.  */
1295 for (i
= 0; i
< nbbs
; i
++)
1297 basic_block bb
= bbs
[i
];
1298 for (si
= gsi_start_bb (bb
); !gsi_end_p (si
); gsi_next (&si
))
1300 gimple stmt
= gsi_stmt (si
);
1301 stmt_vec_info stmt_info
= vinfo_for_stmt (stmt
);
1302 gcc_assert (stmt_info
);
1303 if ((STMT_VINFO_RELEVANT_P (stmt_info
)
1304 || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info
)))
1305 && !PURE_SLP_STMT (stmt_info
))
1306 /* STMT needs both SLP and loop-based vectorization. */
1307 only_slp_in_loop
= false;
1311 if (only_slp_in_loop
)
1312 vectorization_factor
= LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo
);
1314 vectorization_factor
= least_common_multiple (vectorization_factor
,
1315 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo
));
1317 LOOP_VINFO_VECT_FACTOR (loop_vinfo
) = vectorization_factor
;
1318 if (dump_enabled_p ())
1319 dump_printf_loc (MSG_NOTE
, vect_location
,
1320 "Updating vectorization factor to %d ",
1321 vectorization_factor
);
1324 for (i
= 0; i
< nbbs
; i
++)
1326 basic_block bb
= bbs
[i
];
1328 for (si
= gsi_start_phis (bb
); !gsi_end_p (si
); gsi_next (&si
))
1330 phi
= gsi_stmt (si
);
1333 stmt_info
= vinfo_for_stmt (phi
);
1334 if (dump_enabled_p ())
1336 dump_printf_loc (MSG_NOTE
, vect_location
, "examining phi: ");
1337 dump_gimple_stmt (MSG_NOTE
, TDF_SLIM
, phi
, 0);
1340 /* Inner-loop loop-closed exit phi in outer-loop vectorization
1341 (i.e., a phi in the tail of the outer-loop). */
1342 if (! is_loop_header_bb_p (bb
))
              /* FORNOW: we currently don't support the case that these phis
                 are not used in the outerloop (unless it is double reduction,
                 i.e., this phi is vect_reduction_def), because this case
                 requires us to actually do something here.  */
1348 if ((!STMT_VINFO_RELEVANT_P (stmt_info
)
1349 || STMT_VINFO_LIVE_P (stmt_info
))
1350 && STMT_VINFO_DEF_TYPE (stmt_info
)
1351 != vect_double_reduction_def
)
1353 if (dump_enabled_p ())
1354 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1355 "Unsupported loop-closed phi in "
1360 /* If PHI is used in the outer loop, we check that its operand
1361 is defined in the inner loop. */
1362 if (STMT_VINFO_RELEVANT_P (stmt_info
))
1367 if (gimple_phi_num_args (phi
) != 1)
1370 phi_op
= PHI_ARG_DEF (phi
, 0);
1371 if (TREE_CODE (phi_op
) != SSA_NAME
)
1374 op_def_stmt
= SSA_NAME_DEF_STMT (phi_op
);
1376 || !flow_bb_inside_loop_p (loop
, gimple_bb (op_def_stmt
))
1377 || !vinfo_for_stmt (op_def_stmt
))
1380 if (STMT_VINFO_RELEVANT (vinfo_for_stmt (op_def_stmt
))
1381 != vect_used_in_outer
1382 && STMT_VINFO_RELEVANT (vinfo_for_stmt (op_def_stmt
))
1383 != vect_used_in_outer_by_reduction
)
1390 gcc_assert (stmt_info
);
1392 if (STMT_VINFO_LIVE_P (stmt_info
))
1394 /* FORNOW: not yet supported. */
1395 if (dump_enabled_p ())
1396 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1397 "not vectorized: value used after loop.");
1401 if (STMT_VINFO_RELEVANT (stmt_info
) == vect_used_in_scope
1402 && STMT_VINFO_DEF_TYPE (stmt_info
) != vect_induction_def
)
1404 /* A scalar-dependence cycle that we don't support. */
1405 if (dump_enabled_p ())
1406 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1407 "not vectorized: scalar dependence cycle.");
1411 if (STMT_VINFO_RELEVANT_P (stmt_info
))
1413 need_to_vectorize
= true;
1414 if (STMT_VINFO_DEF_TYPE (stmt_info
) == vect_induction_def
)
1415 ok
= vectorizable_induction (phi
, NULL
, NULL
);
1420 if (dump_enabled_p ())
1422 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1423 "not vectorized: relevant phi not "
1425 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, phi
, 0);
1431 for (si
= gsi_start_bb (bb
); !gsi_end_p (si
); gsi_next (&si
))
1433 gimple stmt
= gsi_stmt (si
);
1434 if (!vect_analyze_stmt (stmt
, &need_to_vectorize
, NULL
))
1439 /* All operations in the loop are either irrelevant (deal with loop
1440 control, or dead), or only used outside the loop and can be moved
1441 out of the loop (e.g. invariants, inductions). The loop can be
1442 optimized away by scalar optimizations. We're better off not
1443 touching this loop. */
1444 if (!need_to_vectorize
)
1446 if (dump_enabled_p ())
1447 dump_printf_loc (MSG_NOTE
, vect_location
,
1448 "All the computation can be taken out of the loop.");
1449 if (dump_enabled_p ())
1450 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1451 "not vectorized: redundant loop. no profit to "
1456 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo
) && dump_enabled_p ())
1457 dump_printf_loc (MSG_NOTE
, vect_location
,
1458 "vectorization_factor = %d, niters = "
1459 HOST_WIDE_INT_PRINT_DEC
, vectorization_factor
,
1460 LOOP_VINFO_INT_NITERS (loop_vinfo
));
1462 if ((LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo
)
1463 && (LOOP_VINFO_INT_NITERS (loop_vinfo
) < vectorization_factor
))
1464 || ((max_niter
= max_stmt_executions_int (loop
)) != -1
1465 && (unsigned HOST_WIDE_INT
) max_niter
< vectorization_factor
))
1467 if (dump_enabled_p ())
1468 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1469 "not vectorized: iteration count too small.");
1470 if (dump_enabled_p ())
1471 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1472 "not vectorized: iteration count smaller than "
1473 "vectorization factor.");
  /* Analyze cost.  Decide if it is worthwhile to vectorize.  */

  /* Once VF is set, SLP costs should be updated since the number of created
     vector stmts depends on VF.  */
1481 vect_update_slp_costs_according_to_vf (loop_vinfo
);
1483 vect_estimate_min_profitable_iters (loop_vinfo
, &min_profitable_iters
,
1484 &min_profitable_estimate
);
1485 LOOP_VINFO_COST_MODEL_MIN_ITERS (loop_vinfo
) = min_profitable_iters
;
1487 if (min_profitable_iters
< 0)
1489 if (dump_enabled_p ())
1490 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1491 "not vectorized: vectorization not profitable.");
1492 if (dump_enabled_p ())
1493 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1494 "not vectorized: vector version will never be "
1499 min_scalar_loop_bound
= ((PARAM_VALUE (PARAM_MIN_VECT_LOOP_BOUND
)
1500 * vectorization_factor
) - 1);
  /* Use the cost model only if it is more conservative than the user-specified
     threshold.  */
1506 th
= (unsigned) min_scalar_loop_bound
;
1507 if (min_profitable_iters
1508 && (!min_scalar_loop_bound
1509 || min_profitable_iters
> min_scalar_loop_bound
))
1510 th
= (unsigned) min_profitable_iters
;
1512 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo
)
1513 && LOOP_VINFO_INT_NITERS (loop_vinfo
) <= th
)
1515 if (dump_enabled_p ())
1516 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1517 "not vectorized: vectorization not profitable.");
1518 if (dump_enabled_p ())
1519 dump_printf_loc (MSG_NOTE
, vect_location
,
1520 "not vectorized: iteration count smaller than user "
1521 "specified loop bound parameter or minimum profitable "
1522 "iterations (whichever is more conservative).");
1526 if ((estimated_niter
= estimated_stmt_executions_int (loop
)) != -1
1527 && ((unsigned HOST_WIDE_INT
) estimated_niter
1528 <= MAX (th
, (unsigned)min_profitable_estimate
)))
1530 if (dump_enabled_p ())
1531 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1532 "not vectorized: estimated iteration count too "
1534 if (dump_enabled_p ())
1535 dump_printf_loc (MSG_NOTE
, vect_location
,
1536 "not vectorized: estimated iteration count smaller "
1537 "than specified loop bound parameter or minimum "
1538 "profitable iterations (whichever is more "
1543 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo
)
1544 || LOOP_VINFO_INT_NITERS (loop_vinfo
) % vectorization_factor
!= 0
1545 || LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo
))
1547 if (dump_enabled_p ())
1548 dump_printf_loc (MSG_NOTE
, vect_location
, "epilog loop required.");
1549 if (!vect_can_advance_ivs_p (loop_vinfo
))
1551 if (dump_enabled_p ())
1552 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1553 "not vectorized: can't create epilog loop 1.");
1556 if (!slpeel_can_duplicate_loop_p (loop
, single_exit (loop
)))
1558 if (dump_enabled_p ())
1559 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1560 "not vectorized: can't create epilog loop 2.");
/* Function vect_analyze_loop_2.

   Apply a set of analyses on LOOP, and create a loop_vec_info struct
   for it.  The different analyses will record information in the
   loop_vec_info struct.  */
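/* The individual analyses below run in a fixed order: data references and
   their access patterns first, then scalar cycles and pattern recognition,
   then data dependences and the vectorization factor, followed by alignment
   analysis, runtime-alias pruning, SLP detection, and finally the
   per-statement operation checks in vect_analyze_loop_operations.  */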
1575 vect_analyze_loop_2 (loop_vec_info loop_vinfo
)
1577 bool ok
, slp
= false;
1578 int max_vf
= MAX_VECTORIZATION_FACTOR
;
  /* Find all data references in the loop (which correspond to vdefs/vuses)
     and analyze their evolution in the loop.  Also adjust the minimal
     vectorization factor according to the loads and stores.

     FORNOW: Handle only simple, array references, whose alignment
     can be forced, and aligned pointer-references.  */
1588 ok
= vect_analyze_data_refs (loop_vinfo
, NULL
, &min_vf
);
1591 if (dump_enabled_p ())
1592 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1593 "bad data references.");
1597 /* Analyze the access patterns of the data-refs in the loop (consecutive,
1598 complex, etc.). FORNOW: Only handle consecutive access pattern. */
1600 ok
= vect_analyze_data_ref_accesses (loop_vinfo
, NULL
);
1603 if (dump_enabled_p ())
1604 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1605 "bad data access.");
1609 /* Classify all cross-iteration scalar data-flow cycles.
1610 Cross-iteration cycles caused by virtual phis are analyzed separately. */
1612 vect_analyze_scalar_cycles (loop_vinfo
);
1614 vect_pattern_recog (loop_vinfo
, NULL
);
1616 /* Data-flow analysis to detect stmts that do not need to be vectorized. */
1618 ok
= vect_mark_stmts_to_be_vectorized (loop_vinfo
);
1621 if (dump_enabled_p ())
1622 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1623 "unexpected pattern.");
  /* Analyze data dependences between the data-refs in the loop
     and adjust the maximum vectorization factor according to
     the dependences.
     FORNOW: fail at the first data dependence that we encounter.  */
1632 ok
= vect_analyze_data_ref_dependences (loop_vinfo
, &max_vf
);
1636 if (dump_enabled_p ())
1637 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1638 "bad data dependence.");
1642 ok
= vect_determine_vectorization_factor (loop_vinfo
);
1645 if (dump_enabled_p ())
1646 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1647 "can't determine vectorization factor.");
1650 if (max_vf
< LOOP_VINFO_VECT_FACTOR (loop_vinfo
))
1652 if (dump_enabled_p ())
1653 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1654 "bad data dependence.");
1658 /* Analyze the alignment of the data-refs in the loop.
1659 Fail if a data reference is found that cannot be vectorized. */
1661 ok
= vect_analyze_data_refs_alignment (loop_vinfo
, NULL
);
1664 if (dump_enabled_p ())
1665 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1666 "bad data alignment.");
1670 /* Prune the list of ddrs to be tested at run-time by versioning for alias.
1671 It is important to call pruning after vect_analyze_data_ref_accesses,
1672 since we use grouping information gathered by interleaving analysis. */
1673 ok
= vect_prune_runtime_alias_test_list (loop_vinfo
);
1676 if (dump_enabled_p ())
1677 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1678 "too long list of versioning for alias "
1683 /* This pass will decide on using loop versioning and/or loop peeling in
1684 order to enhance the alignment of data references in the loop. */
1686 ok
= vect_enhance_data_refs_alignment (loop_vinfo
);
1689 if (dump_enabled_p ())
1690 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1691 "bad data alignment.");
1695 /* Check the SLP opportunities in the loop, analyze and build SLP trees. */
1696 ok
= vect_analyze_slp (loop_vinfo
, NULL
);
1699 /* Decide which possible SLP instances to SLP. */
1700 slp
= vect_make_slp_decision (loop_vinfo
);
1702 /* Find stmts that need to be both vectorized and SLPed. */
1703 vect_detect_hybrid_slp (loop_vinfo
);
  /* Scan all the operations in the loop and make sure they are
     vectorizable.  */
1711 ok
= vect_analyze_loop_operations (loop_vinfo
, slp
);
1714 if (dump_enabled_p ())
1715 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1716 "bad operation or unsupported loop bound.");
/* Function vect_analyze_loop.

   Apply a set of analyses on LOOP, and create a loop_vec_info struct
   for it.  The different analyses will record information in the
   loop_vec_info struct.  */
1729 vect_analyze_loop (struct loop
*loop
)
1731 loop_vec_info loop_vinfo
;
1732 unsigned int vector_sizes
;
1734 /* Autodetect first vector size we try. */
1735 current_vector_size
= 0;
1736 vector_sizes
= targetm
.vectorize
.autovectorize_vector_sizes ();
1738 if (dump_enabled_p ())
1739 dump_printf_loc (MSG_NOTE
, vect_location
,
1740 "===== analyze_loop_nest =====");
1742 if (loop_outer (loop
)
1743 && loop_vec_info_for_loop (loop_outer (loop
))
1744 && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop
))))
1746 if (dump_enabled_p ())
1747 dump_printf_loc (MSG_NOTE
, vect_location
,
1748 "outer-loop already vectorized.");
1754 /* Check the CFG characteristics of the loop (nesting, entry/exit). */
1755 loop_vinfo
= vect_analyze_loop_form (loop
);
1758 if (dump_enabled_p ())
1759 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1764 if (vect_analyze_loop_2 (loop_vinfo
))
1766 LOOP_VINFO_VECTORIZABLE_P (loop_vinfo
) = 1;
1771 destroy_loop_vec_info (loop_vinfo
, true);
1773 vector_sizes
&= ~current_vector_size
;
1774 if (vector_sizes
== 0
1775 || current_vector_size
== 0)
1778 /* Try the next biggest vector size. */
1779 current_vector_size
= 1 << floor_log2 (vector_sizes
);
1780 if (dump_enabled_p ())
1781 dump_printf_loc (MSG_NOTE
, vect_location
,
1782 "***** Re-trying analysis with "
1783 "vector size %d\n", current_vector_size
);
/* Function reduction_code_for_scalar_code

   Input:
   CODE - tree_code of a reduction operation.

   Output:
   REDUC_CODE - the corresponding tree-code to be used to reduce the
      vector of partial results into a single scalar result (which
      will also reside in a vector), or ERROR_MARK if the operation is
      a supported reduction operation but does not have such a tree-code.

   Return FALSE if CODE currently cannot be vectorized as a reduction.  */
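/* For example, for CODE == PLUS_EXPR this yields REDUC_PLUS_EXPR, which sums
   the elements of the vector of partial results into a single scalar;
   MIN_EXPR and MAX_EXPR map to REDUC_MIN_EXPR and REDUC_MAX_EXPR in the same
   way.  */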
1802 reduction_code_for_scalar_code (enum tree_code code
,
1803 enum tree_code
*reduc_code
)
1808 *reduc_code
= REDUC_MAX_EXPR
;
1812 *reduc_code
= REDUC_MIN_EXPR
;
1816 *reduc_code
= REDUC_PLUS_EXPR
;
1824 *reduc_code
= ERROR_MARK
;
/* Error reporting helper for vect_is_simple_reduction below.  GIMPLE statement
   STMT is printed with a message MSG.  */
1837 report_vect_op (int msg_type
, gimple stmt
, const char *msg
)
1839 dump_printf_loc (msg_type
, vect_location
, "%s", msg
);
1840 dump_gimple_stmt (msg_type
, TDF_SLIM
, stmt
, 0);
/* Detect SLP reduction of the form:
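   #a1 = phi <a5, a0>
   a2 = operation (a1)
   a3 = operation (a2)
   a4 = operation (a3)
   a5 = operation (a4)

   #a = phi <a5>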
   PHI is the reduction phi node (#a1 = phi <a5, a0> above)
   FIRST_STMT is the first reduction stmt in the chain
   (a2 = operation (a1)).

   Return TRUE if a reduction chain was detected.  */
1861 vect_is_slp_reduction (loop_vec_info loop_info
, gimple phi
, gimple first_stmt
)
1863 struct loop
*loop
= (gimple_bb (phi
))->loop_father
;
1864 struct loop
*vect_loop
= LOOP_VINFO_LOOP (loop_info
);
1865 enum tree_code code
;
1866 gimple current_stmt
= NULL
, loop_use_stmt
= NULL
, first
, next_stmt
;
1867 stmt_vec_info use_stmt_info
, current_stmt_info
;
1869 imm_use_iterator imm_iter
;
1870 use_operand_p use_p
;
1871 int nloop_uses
, size
= 0, n_out_of_loop_uses
;
1874 if (loop
!= vect_loop
)
1877 lhs
= PHI_RESULT (phi
);
1878 code
= gimple_assign_rhs_code (first_stmt
);
1882 n_out_of_loop_uses
= 0;
1883 FOR_EACH_IMM_USE_FAST (use_p
, imm_iter
, lhs
)
1885 gimple use_stmt
= USE_STMT (use_p
);
1886 if (is_gimple_debug (use_stmt
))
1889 use_stmt
= USE_STMT (use_p
);
1891 /* Check if we got back to the reduction phi. */
1892 if (use_stmt
== phi
)
1894 loop_use_stmt
= use_stmt
;
1899 if (flow_bb_inside_loop_p (loop
, gimple_bb (use_stmt
)))
1901 if (vinfo_for_stmt (use_stmt
)
1902 && !STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (use_stmt
)))
1904 loop_use_stmt
= use_stmt
;
1909 n_out_of_loop_uses
++;
      /* There can be either a single use in the loop or two uses in
         phi nodes.  */
1913 if (nloop_uses
> 1 || (n_out_of_loop_uses
&& nloop_uses
))
1920 /* We reached a statement with no loop uses. */
1921 if (nloop_uses
== 0)
1924 /* This is a loop exit phi, and we haven't reached the reduction phi. */
1925 if (gimple_code (loop_use_stmt
) == GIMPLE_PHI
)
1928 if (!is_gimple_assign (loop_use_stmt
)
1929 || code
!= gimple_assign_rhs_code (loop_use_stmt
)
1930 || !flow_bb_inside_loop_p (loop
, gimple_bb (loop_use_stmt
)))
1933 /* Insert USE_STMT into reduction chain. */
1934 use_stmt_info
= vinfo_for_stmt (loop_use_stmt
);
1937 current_stmt_info
= vinfo_for_stmt (current_stmt
);
1938 GROUP_NEXT_ELEMENT (current_stmt_info
) = loop_use_stmt
;
1939 GROUP_FIRST_ELEMENT (use_stmt_info
)
1940 = GROUP_FIRST_ELEMENT (current_stmt_info
);
1943 GROUP_FIRST_ELEMENT (use_stmt_info
) = loop_use_stmt
;
1945 lhs
= gimple_assign_lhs (loop_use_stmt
);
1946 current_stmt
= loop_use_stmt
;
1950 if (!found
|| loop_use_stmt
!= phi
|| size
< 2)
  /* Swap the operands, if needed, to make the reduction operand the second
     operand.  */
1955 lhs
= PHI_RESULT (phi
);
1956 next_stmt
= GROUP_FIRST_ELEMENT (vinfo_for_stmt (current_stmt
));
1959 if (gimple_assign_rhs2 (next_stmt
) == lhs
)
1961 tree op
= gimple_assign_rhs1 (next_stmt
);
1962 gimple def_stmt
= NULL
;
1964 if (TREE_CODE (op
) == SSA_NAME
)
1965 def_stmt
= SSA_NAME_DEF_STMT (op
);
1967 /* Check that the other def is either defined in the loop
1968 ("vect_internal_def"), or it's an induction (defined by a
1969 loop-header phi-node). */
1971 && gimple_bb (def_stmt
)
1972 && flow_bb_inside_loop_p (loop
, gimple_bb (def_stmt
))
1973 && (is_gimple_assign (def_stmt
)
1974 || is_gimple_call (def_stmt
)
1975 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt
))
1976 == vect_induction_def
1977 || (gimple_code (def_stmt
) == GIMPLE_PHI
1978 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt
))
1979 == vect_internal_def
1980 && !is_loop_header_bb_p (gimple_bb (def_stmt
)))))
1982 lhs
= gimple_assign_lhs (next_stmt
);
1983 next_stmt
= GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt
));
1991 tree op
= gimple_assign_rhs2 (next_stmt
);
1992 gimple def_stmt
= NULL
;
1994 if (TREE_CODE (op
) == SSA_NAME
)
1995 def_stmt
= SSA_NAME_DEF_STMT (op
);
1997 /* Check that the other def is either defined in the loop
1998 ("vect_internal_def"), or it's an induction (defined by a
1999 loop-header phi-node). */
2001 && gimple_bb (def_stmt
)
2002 && flow_bb_inside_loop_p (loop
, gimple_bb (def_stmt
))
2003 && (is_gimple_assign (def_stmt
)
2004 || is_gimple_call (def_stmt
)
2005 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt
))
2006 == vect_induction_def
2007 || (gimple_code (def_stmt
) == GIMPLE_PHI
2008 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt
))
2009 == vect_internal_def
2010 && !is_loop_header_bb_p (gimple_bb (def_stmt
)))))
2012 if (dump_enabled_p ())
2014 dump_printf_loc (MSG_NOTE
, vect_location
, "swapping oprnds: ");
2015 dump_gimple_stmt (MSG_NOTE
, TDF_SLIM
, next_stmt
, 0);
2018 swap_tree_operands (next_stmt
,
2019 gimple_assign_rhs1_ptr (next_stmt
),
2020 gimple_assign_rhs2_ptr (next_stmt
));
2021 update_stmt (next_stmt
);
2023 if (CONSTANT_CLASS_P (gimple_assign_rhs1 (next_stmt
)))
2024 LOOP_VINFO_OPERANDS_SWAPPED (loop_info
) = true;
2030 lhs
= gimple_assign_lhs (next_stmt
);
2031 next_stmt
= GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt
));
2034 /* Save the chain for further analysis in SLP detection. */
2035 first
= GROUP_FIRST_ELEMENT (vinfo_for_stmt (current_stmt
));
2036 LOOP_VINFO_REDUCTION_CHAINS (loop_info
).safe_push (first
);
2037 GROUP_SIZE (vinfo_for_stmt (first
)) = size
;
/* Function vect_is_simple_reduction_1

   (1) Detect a cross-iteration def-use cycle that represents a simple
   reduction computation.  We look for the following pattern:

   loop_header:
     a1 = phi < a0, a2 >
     a3 = ...
     a2 = operation (a3, a1)

   such that:
   1. operation is commutative and associative and it is safe to
      change the order of the computation (if CHECK_REDUCTION is true)
   2. no uses for a2 in the loop (a2 is used out of the loop)
   3. no uses of a1 in the loop besides the reduction operation
   4. no uses of a1 outside the loop.

   Conditions 1,4 are tested here.
   Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.

   (2) Detect a cross-iteration def-use cycle in nested loops, i.e.,
   nested cycles, if CHECK_REDUCTION is false.

   (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double
   reductions:

     a1 = phi < a0, a2 >
     inner loop (def of a3)
     a2 = phi < a3 >

   If MODIFY is true it tries also to rework the code in-place to enable
   detection of more reduction patterns.  For the time being we rewrite
   "res -= RHS" into "res += -RHS" when it seems worthwhile.  */
static gimple
vect_is_simple_reduction_1 (loop_vec_info loop_info, gimple phi,
			    bool check_reduction, bool *double_reduc,
			    bool modify)
{
  struct loop *loop = (gimple_bb (phi))->loop_father;
  struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
  edge latch_e = loop_latch_edge (loop);
  tree loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
  gimple def_stmt, def1 = NULL, def2 = NULL;
  enum tree_code orig_code, code;
  tree op1, op2, op3 = NULL_TREE, op4 = NULL_TREE;
  tree type;
  tree name;
  int nloop_uses;
  imm_use_iterator imm_iter;
  use_operand_p use_p;
  bool phi_def;

  *double_reduc = false;

  /* If CHECK_REDUCTION is true, we assume inner-most loop vectorization,
     otherwise, we assume outer loop vectorization.  */
  gcc_assert ((check_reduction && loop == vect_loop)
	      || (!check_reduction && flow_loop_nested_p (vect_loop, loop)));

  name = PHI_RESULT (phi);
  nloop_uses = 0;
  FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name)
    {
      gimple use_stmt = USE_STMT (use_p);
      if (is_gimple_debug (use_stmt))
	continue;

      if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			     "intermediate value used outside loop.");
	  return NULL;
	}

      if (vinfo_for_stmt (use_stmt)
	  && !is_pattern_stmt_p (vinfo_for_stmt (use_stmt)))
	nloop_uses++;
      if (nloop_uses > 1)
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			     "reduction used in loop.");
	  return NULL;
	}
    }

  if (TREE_CODE (loop_arg) != SSA_NAME)
    {
      if (dump_enabled_p ())
	{
	  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			   "reduction: not ssa_name: ");
	  dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, loop_arg);
	}
      return NULL;
    }

  def_stmt = SSA_NAME_DEF_STMT (loop_arg);
  if (!def_stmt)
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			 "reduction: no def_stmt.");
      return NULL;
    }

  if (!is_gimple_assign (def_stmt) && gimple_code (def_stmt) != GIMPLE_PHI)
    {
      if (dump_enabled_p ())
	dump_gimple_stmt (MSG_NOTE, TDF_SLIM, def_stmt, 0);
      return NULL;
    }

  if (is_gimple_assign (def_stmt))
    {
      name = gimple_assign_lhs (def_stmt);
      phi_def = false;
    }
  else
    {
      name = PHI_RESULT (def_stmt);
      phi_def = true;
    }

  nloop_uses = 0;
  FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name)
    {
      gimple use_stmt = USE_STMT (use_p);
      if (is_gimple_debug (use_stmt))
	continue;
      if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt))
	  && vinfo_for_stmt (use_stmt)
	  && !is_pattern_stmt_p (vinfo_for_stmt (use_stmt)))
	nloop_uses++;
      if (nloop_uses > 1)
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			     "reduction used in loop.");
	  return NULL;
	}
    }

  /* If DEF_STMT is a phi node itself, we expect it to have a single argument
     defined in the inner loop.  */
  if (phi_def)
    {
      op1 = PHI_ARG_DEF (def_stmt, 0);

      if (gimple_phi_num_args (def_stmt) != 1
	  || TREE_CODE (op1) != SSA_NAME)
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			     "unsupported phi node definition.");
	  return NULL;
	}

      def1 = SSA_NAME_DEF_STMT (op1);
      if (flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
	  && loop->inner
	  && flow_bb_inside_loop_p (loop->inner, gimple_bb (def1))
	  && is_gimple_assign (def1))
	{
	  if (dump_enabled_p ())
	    report_vect_op (MSG_NOTE, def_stmt,
			    "detected double reduction: ");

	  *double_reduc = true;
	  return def_stmt;
	}

      return NULL;
    }

  code = orig_code = gimple_assign_rhs_code (def_stmt);

  /* We can handle "res -= x[i]", which is non-associative by
     simply rewriting this into "res += -x[i]".  Avoid changing
     gimple instruction for the first simple tests and only do this
     if we're allowed to change code at all.  */
  if (code == MINUS_EXPR
      && modify
      && (op1 = gimple_assign_rhs1 (def_stmt))
      && TREE_CODE (op1) == SSA_NAME
      && SSA_NAME_DEF_STMT (op1) == phi)
    code = PLUS_EXPR;

  if (check_reduction
      && (!commutative_tree_code (code) || !associative_tree_code (code)))
    {
      if (dump_enabled_p ())
	report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
			"reduction: not commutative/associative: ");
      return NULL;
    }

  if (get_gimple_rhs_class (code) != GIMPLE_BINARY_RHS)
    {
      if (code != COND_EXPR)
	{
	  if (dump_enabled_p ())
	    report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
			    "reduction: not binary operation: ");
	  return NULL;
	}

      op3 = gimple_assign_rhs1 (def_stmt);
      if (COMPARISON_CLASS_P (op3))
	{
	  op4 = TREE_OPERAND (op3, 1);
	  op3 = TREE_OPERAND (op3, 0);
	}

      op1 = gimple_assign_rhs2 (def_stmt);
      op2 = gimple_assign_rhs3 (def_stmt);

      if (TREE_CODE (op1) != SSA_NAME && TREE_CODE (op2) != SSA_NAME)
	{
	  if (dump_enabled_p ())
	    report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
			    "reduction: uses not ssa_names: ");
	  return NULL;
	}
    }
  else
    {
      op1 = gimple_assign_rhs1 (def_stmt);
      op2 = gimple_assign_rhs2 (def_stmt);

      if (TREE_CODE (op1) != SSA_NAME && TREE_CODE (op2) != SSA_NAME)
	{
	  if (dump_enabled_p ())
	    report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
			    "reduction: uses not ssa_names: ");
	  return NULL;
	}
    }

  type = TREE_TYPE (gimple_assign_lhs (def_stmt));
  if ((TREE_CODE (op1) == SSA_NAME
       && !types_compatible_p (type, TREE_TYPE (op1)))
      || (TREE_CODE (op2) == SSA_NAME
	  && !types_compatible_p (type, TREE_TYPE (op2)))
      || (op3 && TREE_CODE (op3) == SSA_NAME
	  && !types_compatible_p (type, TREE_TYPE (op3)))
      || (op4 && TREE_CODE (op4) == SSA_NAME
	  && !types_compatible_p (type, TREE_TYPE (op4))))
    {
      if (dump_enabled_p ())
	{
	  dump_printf_loc (MSG_NOTE, vect_location,
			   "reduction: multiple types: operation type: ");
	  dump_generic_expr (MSG_NOTE, TDF_SLIM, type);
	  dump_printf (MSG_NOTE, ", operands types: ");
	  dump_generic_expr (MSG_NOTE, TDF_SLIM, TREE_TYPE (op1));
	  dump_printf (MSG_NOTE, ",");
	  dump_generic_expr (MSG_NOTE, TDF_SLIM, TREE_TYPE (op2));
	  if (op3)
	    {
	      dump_printf (MSG_NOTE, ",");
	      dump_generic_expr (MSG_NOTE, TDF_SLIM, TREE_TYPE (op3));
	    }
	  if (op4)
	    {
	      dump_printf (MSG_NOTE, ",");
	      dump_generic_expr (MSG_NOTE, TDF_SLIM, TREE_TYPE (op4));
	    }
	}

      return NULL;
    }

  /* Check that it's ok to change the order of the computation.
     Generally, when vectorizing a reduction we change the order of the
     computation.  This may change the behavior of the program in some
     cases, so we need to check that this is ok.  One exception is when
     vectorizing an outer-loop: the inner-loop is executed sequentially,
     and therefore vectorizing reductions in the inner-loop during
     outer-loop vectorization is safe.  */

  /* CHECKME: check for !flag_finite_math_only too?  */
  if (SCALAR_FLOAT_TYPE_P (type) && !flag_associative_math
      && check_reduction)
    {
      /* Changing the order of operations changes the semantics.  */
      if (dump_enabled_p ())
	report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
			"reduction: unsafe fp math optimization: ");
      return NULL;
    }
  else if (INTEGRAL_TYPE_P (type) && TYPE_OVERFLOW_TRAPS (type)
	   && check_reduction)
    {
      /* Changing the order of operations changes the semantics.  */
      if (dump_enabled_p ())
	report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
			"reduction: unsafe int math optimization: ");
      return NULL;
    }
  else if (SAT_FIXED_POINT_TYPE_P (type) && check_reduction)
    {
      /* Changing the order of operations changes the semantics.  */
      if (dump_enabled_p ())
	report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
			"reduction: unsafe fixed-point math optimization: ");
      return NULL;
    }

  /* If we detected "res -= x[i]" earlier, rewrite it into
     "res += -x[i]" now.  If this turns out to be useless reassoc
     will clean it up again.  */
  if (orig_code == MINUS_EXPR)
    {
      tree rhs = gimple_assign_rhs2 (def_stmt);
      tree negrhs = make_ssa_name (TREE_TYPE (rhs), NULL);
      gimple negate_stmt = gimple_build_assign_with_ops (NEGATE_EXPR, negrhs,
							 rhs, NULL);
      gimple_stmt_iterator gsi = gsi_for_stmt (def_stmt);
      set_vinfo_for_stmt (negate_stmt, new_stmt_vec_info (negate_stmt,
							  loop_info, NULL));
      gsi_insert_before (&gsi, negate_stmt, GSI_NEW_STMT);
      gimple_assign_set_rhs2 (def_stmt, negrhs);
      gimple_assign_set_rhs_code (def_stmt, PLUS_EXPR);
      update_stmt (def_stmt);
    }

  /* Reduction is safe.  We're dealing with one of the following:
     1) integer arithmetic and no trapv
     2) floating point arithmetic, and special flags permit this optimization
     3) nested cycle (i.e., outer loop vectorization).  */
  if (TREE_CODE (op1) == SSA_NAME)
    def1 = SSA_NAME_DEF_STMT (op1);

  if (TREE_CODE (op2) == SSA_NAME)
    def2 = SSA_NAME_DEF_STMT (op2);

  if (code != COND_EXPR
      && ((!def1 || gimple_nop_p (def1)) && (!def2 || gimple_nop_p (def2))))
    {
      if (dump_enabled_p ())
	report_vect_op (MSG_NOTE, def_stmt, "reduction: no defs for operands: ");
      return NULL;
    }

  /* Check that one def is the reduction def, defined by PHI,
     the other def is either defined in the loop ("vect_internal_def"),
     or it's an induction (defined by a loop-header phi-node).  */

  if (def2 && def2 == phi
      && (code == COND_EXPR
	  || !def1 || gimple_nop_p (def1)
	  || (def1 && flow_bb_inside_loop_p (loop, gimple_bb (def1))
	      && (is_gimple_assign (def1)
		  || is_gimple_call (def1)
		  || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1))
		       == vect_induction_def
		  || (gimple_code (def1) == GIMPLE_PHI
		      && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1))
			   == vect_internal_def
		      && !is_loop_header_bb_p (gimple_bb (def1)))))))
    {
      if (dump_enabled_p ())
	report_vect_op (MSG_NOTE, def_stmt, "detected reduction: ");
      return def_stmt;
    }

  if (def1 && def1 == phi
      && (code == COND_EXPR
	  || !def2 || gimple_nop_p (def2)
	  || (def2 && flow_bb_inside_loop_p (loop, gimple_bb (def2))
	      && (is_gimple_assign (def2)
		  || is_gimple_call (def2)
		  || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2))
		       == vect_induction_def
		  || (gimple_code (def2) == GIMPLE_PHI
		      && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2))
			   == vect_internal_def
		      && !is_loop_header_bb_p (gimple_bb (def2)))))))
    {
      if (check_reduction)
	{
	  /* Swap operands (just for simplicity - so that the rest of the code
	     can assume that the reduction variable is always the last (second)
	     operand).  */
	  if (dump_enabled_p ())
	    report_vect_op (MSG_NOTE, def_stmt,
			    "detected reduction: need to swap operands: ");

	  swap_tree_operands (def_stmt, gimple_assign_rhs1_ptr (def_stmt),
			      gimple_assign_rhs2_ptr (def_stmt));

	  if (CONSTANT_CLASS_P (gimple_assign_rhs1 (def_stmt)))
	    LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true;
	}
      else
	{
	  if (dump_enabled_p ())
	    report_vect_op (MSG_NOTE, def_stmt, "detected reduction: ");
	}

      return def_stmt;
    }

  /* Try to find SLP reduction chain.  */
  if (check_reduction && vect_is_slp_reduction (loop_info, phi, def_stmt))
    {
      if (dump_enabled_p ())
	report_vect_op (MSG_NOTE, def_stmt,
			"reduction: detected reduction chain: ");

      return def_stmt;
    }

  if (dump_enabled_p ())
    report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
		    "reduction: unknown pattern: ");

  return NULL;
}
/* Wrapper around vect_is_simple_reduction_1, that won't modify code
   in-place.  Arguments as there.  */

static gimple
vect_is_simple_reduction (loop_vec_info loop_info, gimple phi,
			  bool check_reduction, bool *double_reduc)
{
  return vect_is_simple_reduction_1 (loop_info, phi, check_reduction,
				     double_reduc, false);
}

/* Wrapper around vect_is_simple_reduction_1, which will modify code
   in-place if it enables detection of more reductions.  Arguments
   as there.  */

gimple
vect_force_simple_reduction (loop_vec_info loop_info, gimple phi,
			     bool check_reduction, bool *double_reduc)
{
  return vect_is_simple_reduction_1 (loop_info, phi, check_reduction,
				     double_reduc, true);
}
/* Calculate the cost of one scalar iteration of the loop.  */
static int
vect_get_single_scalar_iteration_cost (loop_vec_info loop_vinfo)
{
  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
  int nbbs = loop->num_nodes, factor, scalar_single_iter_cost = 0;
  int innerloop_iters, i, stmt_cost;

  /* Count statements in scalar loop.  Using this as scalar cost for a single
     iteration for now.

     TODO: Add outer loop support.

     TODO: Consider assigning different costs to different scalar
     statements.  */

  /* FORNOW.  */
  innerloop_iters = 1;
  if (loop->inner)
    innerloop_iters = 50; /* FIXME */

  for (i = 0; i < nbbs; i++)
    {
      gimple_stmt_iterator si;
      basic_block bb = bbs[i];

      if (bb->loop_father == loop->inner)
	factor = innerloop_iters;
      else
	factor = 1;

      for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
	{
	  gimple stmt = gsi_stmt (si);
	  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);

	  if (!is_gimple_assign (stmt) && !is_gimple_call (stmt))
	    continue;

	  /* Skip stmts that are not vectorized inside the loop.  */
	  if (stmt_info
	      && !STMT_VINFO_RELEVANT_P (stmt_info)
	      && (!STMT_VINFO_LIVE_P (stmt_info)
		  || !VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
	      && !STMT_VINFO_IN_PATTERN_P (stmt_info))
	    continue;

	  if (STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt)))
	    {
	      if (DR_IS_READ (STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt))))
		stmt_cost = vect_get_stmt_cost (scalar_load);
	      else
		stmt_cost = vect_get_stmt_cost (scalar_store);
	    }
	  else
	    stmt_cost = vect_get_stmt_cost (scalar_stmt);

	  scalar_single_iter_cost += stmt_cost * factor;
	}
    }
  return scalar_single_iter_cost;
}
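
/* Worked example (illustrative only, not part of the original sources): with
   the cost constants left symbolic, a scalar loop body containing one load,
   one store and two arithmetic statements would be accounted as

     scalar_single_iter_cost = vect_get_stmt_cost (scalar_load)
			       + vect_get_stmt_cost (scalar_store)
			       + 2 * vect_get_stmt_cost (scalar_stmt),

   each term additionally multiplied by FACTOR, i.e. by 1, or by the assumed
   50 inner-loop iterations when the block belongs to an inner loop.  */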
/* Calculate cost of peeling the loop PEEL_ITERS_PROLOGUE times.  */
int
vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue,
			     int *peel_iters_epilogue,
			     int scalar_single_iter_cost,
			     stmt_vector_for_cost *prologue_cost_vec,
			     stmt_vector_for_cost *epilogue_cost_vec)
{
  int retval = 0;
  int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);

  if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
    {
      *peel_iters_epilogue = vf/2;
      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location,
			 "cost model: epilogue peel iters set to vf/2 "
			 "because loop iterations are unknown .");

      /* If peeled iterations are known but number of scalar loop
	 iterations are unknown, count a taken branch per peeled loop.  */
      retval = record_stmt_cost (prologue_cost_vec, 2, cond_branch_taken,
				 NULL, 0, vect_prologue);
    }
  else
    {
      int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
      peel_iters_prologue = niters < peel_iters_prologue ?
			    niters : peel_iters_prologue;
      *peel_iters_epilogue = (niters - peel_iters_prologue) % vf;
      /* If we need to peel for gaps, but no peeling is required, we have to
	 peel VF iterations.  */
      if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) && !*peel_iters_epilogue)
	*peel_iters_epilogue = vf;
    }

  if (peel_iters_prologue)
    retval += record_stmt_cost (prologue_cost_vec,
				peel_iters_prologue * scalar_single_iter_cost,
				scalar_stmt, NULL, 0, vect_prologue);
  if (*peel_iters_epilogue)
    retval += record_stmt_cost (epilogue_cost_vec,
				*peel_iters_epilogue * scalar_single_iter_cost,
				scalar_stmt, NULL, 0, vect_epilogue);

  return retval;
}
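
/* Worked example (illustrative only, not part of the original sources): when
   the trip count is known the peel counts above are plain arithmetic.  With
   niters = 100, vf = 8 and peel_iters_prologue = 3,

     *peel_iters_epilogue = (100 - 3) % 8 = 1,

   so 3 prologue and 1 epilogue scalar iterations are recorded (or vf
   epilogue iterations instead, if peeling for gaps is required and the
   remainder would otherwise be zero).  */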
/* Function vect_estimate_min_profitable_iters

   Return the number of iterations required for the vector version of the
   loop to be profitable relative to the cost of the scalar version of the
   loop.  */

static void
vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
				    int *ret_min_profitable_niters,
				    int *ret_min_profitable_estimate)
{
  int min_profitable_iters;
  int min_profitable_estimate;
  int peel_iters_prologue;
  int peel_iters_epilogue;
  unsigned vec_inside_cost = 0;
  int vec_outside_cost = 0;
  unsigned vec_prologue_cost = 0;
  unsigned vec_epilogue_cost = 0;
  int scalar_single_iter_cost = 0;
  int scalar_outside_cost = 0;
  int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
  int npeel = LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo);
  void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);

  /* Cost model disabled.  */
  if (!flag_vect_cost_model)
    {
      dump_printf_loc (MSG_NOTE, vect_location, "cost model disabled.");
      *ret_min_profitable_niters = 0;
      *ret_min_profitable_estimate = 0;
      return;
    }

  /* Requires loop versioning tests to handle misalignment.  */
  if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
    {
      /* FIXME: Make cost depend on complexity of individual check.  */
      unsigned len = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ();
      (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
			    vect_prologue);
      dump_printf (MSG_NOTE,
		   "cost model: Adding cost of checks for loop "
		   "versioning to treat misalignment.\n");
    }

  /* Requires loop versioning with alias checks.  */
  if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
    {
      /* FIXME: Make cost depend on complexity of individual check.  */
      unsigned len = LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).length ();
      (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
			    vect_prologue);
      dump_printf (MSG_NOTE,
		   "cost model: Adding cost of checks for loop "
		   "versioning aliasing.\n");
    }

  if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
      || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
    (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken, NULL, 0,
			  vect_prologue);

  /* Count statements in scalar loop.  Using this as scalar cost for a single
     iteration for now.

     TODO: Add outer loop support.

     TODO: Consider assigning different costs to different scalar
     statements.  */

  scalar_single_iter_cost = vect_get_single_scalar_iteration_cost (loop_vinfo);

  /* Add additional cost for the peeled instructions in prologue and epilogue
     loop.

     FORNOW: If we don't know the value of peel_iters for prologue or epilogue
     at compile-time - we assume it's vf/2 (the worst would be vf-1).

     TODO: Build an expression that represents peel_iters for prologue and
     epilogue to be used in a run-time test.  */

  if (npeel < 0)
    {
      peel_iters_prologue = vf/2;
      dump_printf (MSG_NOTE, "cost model: "
		   "prologue peel iters set to vf/2.");

      /* If peeling for alignment is unknown, loop bound of main loop becomes
	 unknown.  */
      peel_iters_epilogue = vf/2;
      dump_printf (MSG_NOTE, "cost model: "
		   "epilogue peel iters set to vf/2 because "
		   "peeling for alignment is unknown.");

      /* If peeled iterations are unknown, count a taken branch and a not taken
	 branch per peeled loop.  Even if scalar loop iterations are known,
	 vector iterations are not known since peeled prologue iterations are
	 not known.  Hence guards remain the same.  */
      (void) add_stmt_cost (target_cost_data, 2, cond_branch_taken,
			    NULL, 0, vect_prologue);
      (void) add_stmt_cost (target_cost_data, 2, cond_branch_not_taken,
			    NULL, 0, vect_prologue);
      /* FORNOW: Don't attempt to pass individual scalar instructions to
	 the model; just assume linear cost for scalar iterations.  */
      (void) add_stmt_cost (target_cost_data,
			    peel_iters_prologue * scalar_single_iter_cost,
			    scalar_stmt, NULL, 0, vect_prologue);
      (void) add_stmt_cost (target_cost_data,
			    peel_iters_epilogue * scalar_single_iter_cost,
			    scalar_stmt, NULL, 0, vect_epilogue);
    }
  else
    {
      stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
      stmt_info_for_cost *si;
      int j;
      void *data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);

      prologue_cost_vec.create (2);
      epilogue_cost_vec.create (2);
      peel_iters_prologue = npeel;

      (void) vect_get_known_peeling_cost (loop_vinfo, peel_iters_prologue,
					  &peel_iters_epilogue,
					  scalar_single_iter_cost,
					  &prologue_cost_vec,
					  &epilogue_cost_vec);

      FOR_EACH_VEC_ELT (prologue_cost_vec, j, si)
	{
	  struct _stmt_vec_info *stmt_info
	    = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
	  (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
				si->misalign, vect_prologue);
	}

      FOR_EACH_VEC_ELT (epilogue_cost_vec, j, si)
	{
	  struct _stmt_vec_info *stmt_info
	    = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
	  (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
				si->misalign, vect_epilogue);
	}

      prologue_cost_vec.release ();
      epilogue_cost_vec.release ();
    }

  /* FORNOW: The scalar outside cost is incremented in one of the
     following ways:

     1. The vectorizer checks for alignment and aliasing and generates
     a condition that allows dynamic vectorization.  A cost model
     check is ANDED with the versioning condition.  Hence scalar code
     path now has the added cost of the versioning check.

       if (cost > th & versioning_check)
	 jmp to vector code

     Hence run-time scalar is incremented by not-taken branch cost.

     2. The vectorizer then checks if a prologue is required.  If the
     cost model check was not done before during versioning, it has to
     be done before the prologue check.

       if (cost <= th)
	 prologue = scalar_iters
       if (prologue == 0)
	 jmp to vector code
       else
	 execute prologue
       if (prologue == num_iters)
	 go to exit

     Hence the run-time scalar cost is incremented by a taken branch,
     plus a not-taken branch, plus a taken branch cost.

     3. The vectorizer then checks if an epilogue is required.  If the
     cost model check was not done before during prologue check, it
     has to be done with the epilogue check.

       if (prologue == 0)
	 jmp to vector code
       else
	 execute prologue
       if (prologue == num_iters)
	 go to exit
       vector code:
	 if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
	   jmp to epilogue

     Hence the run-time scalar cost should be incremented by 2 taken
     branches.

     TODO: The back end may reorder the BBS's differently and reverse
     conditions/branch directions.  Change the estimates below to
     something more reasonable.  */

  /* If the number of iterations is known and we do not do versioning, we can
     decide whether to vectorize at compile time.  Hence the scalar version
     do not carry cost model guard costs.  */
  if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
      || LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
      || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
    {
      /* Cost model check occurs at versioning.  */
      if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
	  || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
	scalar_outside_cost += vect_get_stmt_cost (cond_branch_not_taken);
      else
	{
	  /* Cost model check occurs at prologue generation.  */
	  if (LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
	    scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken)
				   + vect_get_stmt_cost (cond_branch_not_taken);
	  /* Cost model check occurs at epilogue generation.  */
	  else
	    scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken);
	}
    }

  /* Complete the target-specific cost calculations.  */
  finish_cost (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo), &vec_prologue_cost,
	       &vec_inside_cost, &vec_epilogue_cost);

  vec_outside_cost = (int)(vec_prologue_cost + vec_epilogue_cost);

  /* Calculate number of iterations required to make the vector version
     profitable, relative to the loop bodies only.  The following condition
     must hold true:
     SIC * niters + SOC > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC
     where
     SIC = scalar iteration cost, VIC = vector iteration cost,
     VOC = vector outside cost, VF = vectorization factor,
     PL_ITERS = prologue iterations, EP_ITERS= epilogue iterations
     SOC = scalar outside cost for run time cost model check.  */
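
  /* Worked example (illustrative only, not part of the original sources):
     solving the condition above for niters gives the expression computed
     below,

       niters > ((VOC - SOC) * VF - VIC * (PL_ITERS + EP_ITERS))
		/ (SIC * VF - VIC).

     E.g. with SIC = 4, VIC = 8, VF = 4, VOC = 20, SOC = 0 and no peeling,
     the break-even point is 80 / 8 = 10 iterations, which the code below
     then rounds up and clamps to at least VF.  */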
  if ((scalar_single_iter_cost * vf) > (int) vec_inside_cost)
    {
      if (vec_outside_cost <= 0)
	min_profitable_iters = 1;
      else
	{
	  min_profitable_iters = ((vec_outside_cost - scalar_outside_cost) * vf
				  - vec_inside_cost * peel_iters_prologue
				  - vec_inside_cost * peel_iters_epilogue)
				 / ((scalar_single_iter_cost * vf)
				    - vec_inside_cost);

	  if ((scalar_single_iter_cost * vf * min_profitable_iters)
	      <= (((int) vec_inside_cost * min_profitable_iters)
		  + (((int) vec_outside_cost - scalar_outside_cost) * vf)))
	    min_profitable_iters++;
	}
    }
  /* vector version will never be profitable.  */
  else
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			 "cost model: the vector iteration cost = %d "
			 "divided by the scalar iteration cost = %d "
			 "is greater or equal to the vectorization factor = %d.",
			 vec_inside_cost, scalar_single_iter_cost, vf);
      *ret_min_profitable_niters = -1;
      *ret_min_profitable_estimate = -1;
      return;
    }

  if (dump_enabled_p ())
    {
      dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
      dump_printf (MSG_NOTE, "  Vector inside of loop cost: %d\n",
		   vec_inside_cost);
      dump_printf (MSG_NOTE, "  Vector prologue cost: %d\n",
		   vec_prologue_cost);
      dump_printf (MSG_NOTE, "  Vector epilogue cost: %d\n",
		   vec_epilogue_cost);
      dump_printf (MSG_NOTE, "  Scalar iteration cost: %d\n",
		   scalar_single_iter_cost);
      dump_printf (MSG_NOTE, "  Scalar outside cost: %d\n",
		   scalar_outside_cost);
      dump_printf (MSG_NOTE, "  Vector outside cost: %d\n",
		   vec_outside_cost);
      dump_printf (MSG_NOTE, "  prologue iterations: %d\n",
		   peel_iters_prologue);
      dump_printf (MSG_NOTE, "  epilogue iterations: %d\n",
		   peel_iters_epilogue);
      dump_printf (MSG_NOTE,
		   "  Calculated minimum iters for profitability: %d\n",
		   min_profitable_iters);
    }

  min_profitable_iters =
	min_profitable_iters < vf ? vf : min_profitable_iters;

  /* Because the condition we create is:
     if (niters <= min_profitable_iters)
       then skip the vectorized loop.  */
  min_profitable_iters--;

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
		     "  Runtime profitability threshold = %d\n",
		     min_profitable_iters);

  *ret_min_profitable_niters = min_profitable_iters;

  /* Calculate number of iterations required to make the vector version
     profitable, relative to the loop bodies only.

     Non-vectorized variant is SIC * niters and it must win over vector
     variant on the expected loop trip count.  The following condition must hold true:
     SIC * niters > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC + SOC  */

  if (vec_outside_cost <= 0)
    min_profitable_estimate = 1;
  else
    {
      min_profitable_estimate = ((vec_outside_cost + scalar_outside_cost) * vf
				 - vec_inside_cost * peel_iters_prologue
				 - vec_inside_cost * peel_iters_epilogue)
				/ ((scalar_single_iter_cost * vf)
				   - vec_inside_cost);
    }
  min_profitable_estimate --;
  min_profitable_estimate = MAX (min_profitable_estimate, min_profitable_iters);
  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
		     "  Static estimate profitability threshold = %d\n",
		     min_profitable_estimate);

  *ret_min_profitable_estimate = min_profitable_estimate;
}
/* TODO: Close dependency between vect_model_*_cost and vectorizable_*
   functions.  Design better to avoid maintenance issues.  */

/* Function vect_model_reduction_cost.

   Models cost for a reduction operation, including the vector ops
   generated within the strip-mine loop, the initial definition before
   the loop, and the epilogue code that must be generated.  */

static bool
vect_model_reduction_cost (stmt_vec_info stmt_info, enum tree_code reduc_code,
			   int ncopies)
{
  int prologue_cost = 0, epilogue_cost = 0;
  enum tree_code code;
  optab optab;
  tree vectype;
  gimple stmt, orig_stmt;
  tree reduction_op;
  enum machine_mode mode;
  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);

  /* Cost of reduction op inside loop.  */
  unsigned inside_cost = add_stmt_cost (target_cost_data, ncopies, vector_stmt,
					stmt_info, 0, vect_body);
  stmt = STMT_VINFO_STMT (stmt_info);

  switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
    {
    case GIMPLE_SINGLE_RHS:
      gcc_assert (TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt)) == ternary_op);
      reduction_op = TREE_OPERAND (gimple_assign_rhs1 (stmt), 2);
      break;
    case GIMPLE_UNARY_RHS:
      reduction_op = gimple_assign_rhs1 (stmt);
      break;
    case GIMPLE_BINARY_RHS:
      reduction_op = gimple_assign_rhs2 (stmt);
      break;
    case GIMPLE_TERNARY_RHS:
      reduction_op = gimple_assign_rhs3 (stmt);
      break;
    default:
      gcc_unreachable ();
    }

  vectype = get_vectype_for_scalar_type (TREE_TYPE (reduction_op));
  if (!vectype)
    {
      if (dump_enabled_p ())
	{
	  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			   "unsupported data-type ");
	  dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
			     TREE_TYPE (reduction_op));
	}
      return false;
    }

  mode = TYPE_MODE (vectype);
  orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);

  if (!orig_stmt)
    orig_stmt = STMT_VINFO_STMT (stmt_info);

  code = gimple_assign_rhs_code (orig_stmt);

  /* Add in cost for initial definition.  */
  prologue_cost += add_stmt_cost (target_cost_data, 1, scalar_to_vec,
				  stmt_info, 0, vect_prologue);

  /* Determine cost of epilogue code.

     We have a reduction operator that will reduce the vector in one statement.
     Also requires scalar extract.  */

  if (!nested_in_vect_loop_p (loop, orig_stmt))
    {
      if (reduc_code != ERROR_MARK)
	{
	  epilogue_cost += add_stmt_cost (target_cost_data, 1, vector_stmt,
					  stmt_info, 0, vect_epilogue);
	  epilogue_cost += add_stmt_cost (target_cost_data, 1, vec_to_scalar,
					  stmt_info, 0, vect_epilogue);
	}
      else
	{
	  int vec_size_in_bits = tree_low_cst (TYPE_SIZE (vectype), 1);
	  tree bitsize =
	    TYPE_SIZE (TREE_TYPE (gimple_assign_lhs (orig_stmt)));
	  int element_bitsize = tree_low_cst (bitsize, 1);
	  int nelements = vec_size_in_bits / element_bitsize;

	  optab = optab_for_tree_code (code, vectype, optab_default);

	  /* We have a whole vector shift available.  */
	  if (VECTOR_MODE_P (mode)
	      && optab_handler (optab, mode) != CODE_FOR_nothing
	      && optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
	    {
	      /* Final reduction via vector shifts and the reduction operator.
		 Also requires scalar extract.  */
	      epilogue_cost += add_stmt_cost (target_cost_data,
					      exact_log2 (nelements) * 2,
					      vector_stmt, stmt_info, 0,
					      vect_epilogue);
	      epilogue_cost += add_stmt_cost (target_cost_data, 1,
					      vec_to_scalar, stmt_info, 0,
					      vect_epilogue);
	    }
	  else
	    /* Use extracts and reduction op for final reduction.  For N
	       elements, we have N extracts and N-1 reduction ops.  */
	    epilogue_cost += add_stmt_cost (target_cost_data,
					    nelements + nelements - 1,
					    vector_stmt, stmt_info, 0,
					    vect_epilogue);
	}
    }

  if (dump_enabled_p ())
    dump_printf (MSG_NOTE,
		 "vect_model_reduction_cost: inside_cost = %d, "
		 "prologue_cost = %d, epilogue_cost = %d .", inside_cost,
		 prologue_cost, epilogue_cost);

  return true;
}
/* Function vect_model_induction_cost.

   Models cost for induction operations.  */

static void
vect_model_induction_cost (stmt_vec_info stmt_info, int ncopies)
{
  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
  void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
  unsigned inside_cost, prologue_cost;

  /* loop cost for vec_loop.  */
  inside_cost = add_stmt_cost (target_cost_data, ncopies, vector_stmt,
			       stmt_info, 0, vect_body);

  /* prologue cost for vec_init and vec_step.  */
  prologue_cost = add_stmt_cost (target_cost_data, 2, scalar_to_vec,
				 stmt_info, 0, vect_prologue);

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
		     "vect_model_induction_cost: inside_cost = %d, "
		     "prologue_cost = %d .", inside_cost, prologue_cost);
}
/* Function get_initial_def_for_induction

   Input:
   STMT - a stmt that performs an induction operation in the loop.
   IV_PHI - the initial value of the induction variable

   Output:
   Return a vector variable, initialized with the first VF values of
   the induction variable.  E.g., for an iv with IV_PHI='X' and
   evolution S, for a vector of 4 units, we want to return:
   [X, X + S, X + 2*S, X + 3*S].  */
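
/* Worked example (illustrative only, not part of the original sources): for
   an induction with initial value X = 3 and step S = 2, vectorized with
   4-element vectors, the code below builds

     vec_init = [3, 5, 7, 9]	(X, X+S, X+2*S, X+3*S)
     vec_step = [8, 8, 8, 8]	(VF*S in every element)

   so that adding vec_step to the previous vector IV value yields the next
   four values of the induction.  */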
static tree
get_initial_def_for_induction (gimple iv_phi)
{
  stmt_vec_info stmt_vinfo = vinfo_for_stmt (iv_phi);
  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  tree scalar_type;
  tree vectype;
  int nunits;
  edge pe = loop_preheader_edge (loop);
  struct loop *iv_loop;
  basic_block new_bb;
  tree new_vec, vec_init, vec_step, t;
  tree access_fn;
  tree new_var;
  tree new_name;
  gimple init_stmt, induction_phi, new_stmt;
  tree induc_def, vec_def, vec_dest;
  tree init_expr, step_expr;
  int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
  int i;
  bool ok;
  int ncopies;
  tree expr;
  stmt_vec_info phi_info = vinfo_for_stmt (iv_phi);
  bool nested_in_vect_loop = false;
  gimple_seq stmts = NULL;
  imm_use_iterator imm_iter;
  use_operand_p use_p;
  gimple exit_phi;
  edge latch_e;
  tree loop_arg;
  gimple_stmt_iterator si;
  basic_block bb = gimple_bb (iv_phi);
  tree stepvectype;
  tree resvectype;

  /* Is phi in an inner-loop, while vectorizing an enclosing outer-loop?  */
  if (nested_in_vect_loop_p (loop, iv_phi))
    {
      nested_in_vect_loop = true;
      iv_loop = loop->inner;
    }
  else
    iv_loop = loop;
  gcc_assert (iv_loop == (gimple_bb (iv_phi))->loop_father);

  latch_e = loop_latch_edge (iv_loop);
  loop_arg = PHI_ARG_DEF_FROM_EDGE (iv_phi, latch_e);

  access_fn = analyze_scalar_evolution (iv_loop, PHI_RESULT (iv_phi));
  gcc_assert (access_fn);
  STRIP_NOPS (access_fn);
  ok = vect_is_simple_iv_evolution (iv_loop->num, access_fn,
				    &init_expr, &step_expr);
  gcc_assert (ok);
  pe = loop_preheader_edge (iv_loop);

  scalar_type = TREE_TYPE (init_expr);
  vectype = get_vectype_for_scalar_type (scalar_type);
  resvectype = get_vectype_for_scalar_type (TREE_TYPE (PHI_RESULT (iv_phi)));
  gcc_assert (vectype);
  nunits = TYPE_VECTOR_SUBPARTS (vectype);
  ncopies = vf / nunits;

  gcc_assert (phi_info);
  gcc_assert (ncopies >= 1);

  /* Find the first insertion point in the BB.  */
  si = gsi_after_labels (bb);

  /* Create the vector that holds the initial_value of the induction.  */
  if (nested_in_vect_loop)
    {
      /* iv_loop is nested in the loop to be vectorized.  init_expr had already
	 been created during vectorization of previous stmts.  We obtain it
	 from the STMT_VINFO_VEC_STMT of the defining stmt.  */
      tree iv_def = PHI_ARG_DEF_FROM_EDGE (iv_phi,
					   loop_preheader_edge (iv_loop));
      vec_init = vect_get_vec_def_for_operand (iv_def, iv_phi, NULL);
      /* If the initial value is not of proper type, convert it.  */
      if (!useless_type_conversion_p (vectype, TREE_TYPE (vec_init)))
	{
	  new_stmt = gimple_build_assign_with_ops
	      (VIEW_CONVERT_EXPR,
	       vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_"),
	       build1 (VIEW_CONVERT_EXPR, vectype, vec_init), NULL_TREE);
	  vec_init = make_ssa_name (gimple_assign_lhs (new_stmt), new_stmt);
	  gimple_assign_set_lhs (new_stmt, vec_init);
	  new_bb = gsi_insert_on_edge_immediate (loop_preheader_edge (iv_loop),
						 new_stmt);
	  gcc_assert (!new_bb);
	  set_vinfo_for_stmt (new_stmt,
			      new_stmt_vec_info (new_stmt, loop_vinfo, NULL));
	}
    }
  else
    {
      vec<constructor_elt, va_gc> *v;

      /* iv_loop is the loop to be vectorized.  Create:
	 vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr)  */
      new_var = vect_get_new_vect_var (scalar_type, vect_scalar_var, "var_");
      new_name = force_gimple_operand (init_expr, &stmts, false, new_var);
      if (stmts)
	{
	  new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
	  gcc_assert (!new_bb);
	}

      vec_alloc (v, nunits);
      bool constant_p = is_gimple_min_invariant (new_name);
      CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, new_name);
      for (i = 1; i < nunits; i++)
	{
	  /* Create: new_name_i = new_name + step_expr  */
	  enum tree_code code = POINTER_TYPE_P (scalar_type)
				? POINTER_PLUS_EXPR : PLUS_EXPR;
	  new_name = fold_build2 (code, scalar_type, new_name, step_expr);
	  if (!is_gimple_min_invariant (new_name))
	    {
	      init_stmt = gimple_build_assign (new_var, new_name);
	      new_name = make_ssa_name (new_var, init_stmt);
	      gimple_assign_set_lhs (init_stmt, new_name);
	      new_bb = gsi_insert_on_edge_immediate (pe, init_stmt);
	      gcc_assert (!new_bb);
	      if (dump_enabled_p ())
		{
		  dump_printf_loc (MSG_NOTE, vect_location,
				   "created new init_stmt: ");
		  dump_gimple_stmt (MSG_NOTE, TDF_SLIM, init_stmt, 0);
		}
	      constant_p = false;
	    }
	  CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, new_name);
	}
      /* Create a vector from [new_name_0, new_name_1, ..., new_name_nunits-1]  */
      if (constant_p)
	new_vec = build_vector_from_ctor (vectype, v);
      else
	new_vec = build_constructor (vectype, v);
      vec_init = vect_init_vector (iv_phi, new_vec, vectype, NULL);
    }

  /* Create the vector that holds the step of the induction.  */
  if (nested_in_vect_loop)
    /* iv_loop is nested in the loop to be vectorized.  Generate:
       vec_step = [S, S, S, S]  */
    new_name = step_expr;
  else
    {
      /* iv_loop is the loop to be vectorized.  Generate:
	 vec_step = [VF*S, VF*S, VF*S, VF*S]  */
      expr = build_int_cst (TREE_TYPE (step_expr), vf);
      new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
			      expr, step_expr);
    }

  t = unshare_expr (new_name);
  gcc_assert (CONSTANT_CLASS_P (new_name));
  stepvectype = get_vectype_for_scalar_type (TREE_TYPE (new_name));
  gcc_assert (stepvectype);
  new_vec = build_vector_from_val (stepvectype, t);
  vec_step = vect_init_vector (iv_phi, new_vec, stepvectype, NULL);

  /* Create the following def-use cycle:
     loop prolog:
	 vec_init = ...
	 vec_step = ...
     loop:
	 vec_iv = PHI <vec_init, vec_loop>
	 ...
	 STMT
	 ...
	 vec_loop = vec_iv + vec_step;  */

  /* Create the induction-phi that defines the induction-operand.  */
  vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
  induction_phi = create_phi_node (vec_dest, iv_loop->header);
  set_vinfo_for_stmt (induction_phi,
		      new_stmt_vec_info (induction_phi, loop_vinfo, NULL));
  induc_def = PHI_RESULT (induction_phi);

  /* Create the iv update inside the loop  */
  new_stmt = gimple_build_assign_with_ops (PLUS_EXPR, vec_dest,
					   induc_def, vec_step);
  vec_def = make_ssa_name (vec_dest, new_stmt);
  gimple_assign_set_lhs (new_stmt, vec_def);
  gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
  set_vinfo_for_stmt (new_stmt, new_stmt_vec_info (new_stmt, loop_vinfo,
						   NULL));

  /* Set the arguments of the phi node:  */
  add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
  add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
	       UNKNOWN_LOCATION);

  /* In case that vectorization factor (VF) is bigger than the number
     of elements that we can fit in a vectype (nunits), we have to generate
     more than one vector stmt - i.e - we need to "unroll" the
     vector stmt by a factor VF/nunits.  For more details see documentation
     in vectorizable_operation.  */

  if (ncopies > 1)
    {
      stmt_vec_info prev_stmt_vinfo;
      /* FORNOW.  This restriction should be relaxed.  */
      gcc_assert (!nested_in_vect_loop);

      /* Create the vector that holds the step of the induction.  */
      expr = build_int_cst (TREE_TYPE (step_expr), nunits);
      new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
			      expr, step_expr);
      t = unshare_expr (new_name);
      gcc_assert (CONSTANT_CLASS_P (new_name));
      new_vec = build_vector_from_val (stepvectype, t);
      vec_step = vect_init_vector (iv_phi, new_vec, stepvectype, NULL);

      vec_def = induc_def;
      prev_stmt_vinfo = vinfo_for_stmt (induction_phi);
      for (i = 1; i < ncopies; i++)
	{
	  /* vec_i = vec_prev + vec_step  */
	  new_stmt = gimple_build_assign_with_ops (PLUS_EXPR, vec_dest,
						   vec_def, vec_step);
	  vec_def = make_ssa_name (vec_dest, new_stmt);
	  gimple_assign_set_lhs (new_stmt, vec_def);

	  gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
	  if (!useless_type_conversion_p (resvectype, vectype))
	    {
	      new_stmt = gimple_build_assign_with_ops
		  (VIEW_CONVERT_EXPR,
		   vect_get_new_vect_var (resvectype, vect_simple_var,
					  "vec_iv_"),
		   build1 (VIEW_CONVERT_EXPR, resvectype,
			   gimple_assign_lhs (new_stmt)), NULL_TREE);
	      gimple_assign_set_lhs (new_stmt,
				     make_ssa_name
				       (gimple_assign_lhs (new_stmt), new_stmt));
	      gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
	    }
	  set_vinfo_for_stmt (new_stmt,
			      new_stmt_vec_info (new_stmt, loop_vinfo, NULL));
	  STMT_VINFO_RELATED_STMT (prev_stmt_vinfo) = new_stmt;
	  prev_stmt_vinfo = vinfo_for_stmt (new_stmt);
	}
    }

  if (nested_in_vect_loop)
    {
      /* Find the loop-closed exit-phi of the induction, and record
	 the final vector of induction results:  */
      exit_phi = NULL;
      FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
	{
	  if (!flow_bb_inside_loop_p (iv_loop, gimple_bb (USE_STMT (use_p))))
	    {
	      exit_phi = USE_STMT (use_p);
	      break;
	    }
	}
      if (exit_phi)
	{
	  stmt_vec_info stmt_vinfo = vinfo_for_stmt (exit_phi);
	  /* FORNOW. Currently not supporting the case that an inner-loop induction
	     is not used in the outer-loop (i.e. only outside the outer-loop).  */
	  gcc_assert (STMT_VINFO_RELEVANT_P (stmt_vinfo)
		      && !STMT_VINFO_LIVE_P (stmt_vinfo));

	  STMT_VINFO_VEC_STMT (stmt_vinfo) = new_stmt;
	  if (dump_enabled_p ())
	    {
	      dump_printf_loc (MSG_NOTE, vect_location,
			       "vector of inductions after inner-loop:");
	      dump_gimple_stmt (MSG_NOTE, TDF_SLIM, new_stmt, 0);
	    }
	}
    }

  if (dump_enabled_p ())
    {
      dump_printf_loc (MSG_NOTE, vect_location,
		       "transform induction: created def-use cycle: ");
      dump_gimple_stmt (MSG_NOTE, TDF_SLIM, induction_phi, 0);
      dump_printf (MSG_NOTE, "\n");
      dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
			SSA_NAME_DEF_STMT (vec_def), 0);
    }

  STMT_VINFO_VEC_STMT (phi_info) = induction_phi;
  if (!useless_type_conversion_p (resvectype, vectype))
    {
      new_stmt = gimple_build_assign_with_ops
	  (VIEW_CONVERT_EXPR,
	   vect_get_new_vect_var (resvectype, vect_simple_var, "vec_iv_"),
	   build1 (VIEW_CONVERT_EXPR, resvectype, induc_def), NULL_TREE);
      induc_def = make_ssa_name (gimple_assign_lhs (new_stmt), new_stmt);
      gimple_assign_set_lhs (new_stmt, induc_def);
      si = gsi_after_labels (bb);
      gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
      set_vinfo_for_stmt (new_stmt,
			  new_stmt_vec_info (new_stmt, loop_vinfo, NULL));
      STMT_VINFO_RELATED_STMT (vinfo_for_stmt (new_stmt))
	= STMT_VINFO_RELATED_STMT (vinfo_for_stmt (induction_phi));
    }

  return induc_def;
}
/* Function get_initial_def_for_reduction

   Input:
   STMT - a stmt that performs a reduction operation in the loop.
   INIT_VAL - the initial value of the reduction variable

   Output:
   ADJUSTMENT_DEF - a tree that holds a value to be added to the final result
	of the reduction (used for adjusting the epilog - see below).
   Return a vector variable, initialized according to the operation that STMT
	performs.  This vector will be used as the initial value of the
	vector of partial results.

   Option1 (adjust in epilog): Initialize the vector as follows:
     add/bit or/xor:    [0,0,...,0,0]
     mult/bit and:      [1,1,...,1,1]
     min/max/cond_expr: [init_val,init_val,..,init_val,init_val]
   and when necessary (e.g. add/mult case) let the caller know
   that it needs to adjust the result by init_val.

   Option2: Initialize the vector as follows:
     add/bit or/xor:    [init_val,0,0,...,0]
     mult/bit and:      [init_val,1,1,...,1]
     min/max/cond_expr: [init_val,init_val,...,init_val]
   and no adjustments are needed.

   For example, for the following code:

   s = init_val;
   for (i=0;i<n;i++)
     s = s + a[i];

   STMT is 's = s + a[i]', and the reduction variable is 's'.
   For a vector of 4 units, we want to return either [0,0,0,init_val],
   or [0,0,0,0] and let the caller know that it needs to adjust
   the result at the end by 'init_val'.

   FORNOW, we are using the 'adjust in epilog' scheme, because this way the
   initialization vector is simpler (same element in all entries), if
   ADJUSTMENT_DEF is not NULL, and Option2 otherwise.

   A cost model should help decide between these two schemes.  */

tree
get_initial_def_for_reduction (gimple stmt, tree init_val,
			       tree *adjustment_def)
{
  stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  tree scalar_type = TREE_TYPE (init_val);
  tree vectype = get_vectype_for_scalar_type (scalar_type);
  int nunits;
  enum tree_code code = gimple_assign_rhs_code (stmt);
  tree def_for_init;
  tree init_def;
  tree *elts;
  int i;
  bool nested_in_vect_loop = false;
  tree init_value;
  REAL_VALUE_TYPE real_init_val = dconst0;
  int int_init_val = 0;
  gimple def_stmt = NULL;

  gcc_assert (vectype);
  nunits = TYPE_VECTOR_SUBPARTS (vectype);

  gcc_assert (POINTER_TYPE_P (scalar_type) || INTEGRAL_TYPE_P (scalar_type)
	      || SCALAR_FLOAT_TYPE_P (scalar_type));

  if (nested_in_vect_loop_p (loop, stmt))
    nested_in_vect_loop = true;
  else
    gcc_assert (loop == (gimple_bb (stmt))->loop_father);

  /* In case of double reduction we only create a vector variable to be put
     in the reduction phi node.  The actual statement creation is done in
     vect_create_epilog_for_reduction.  */
  if (adjustment_def && nested_in_vect_loop
      && TREE_CODE (init_val) == SSA_NAME
      && (def_stmt = SSA_NAME_DEF_STMT (init_val))
      && gimple_code (def_stmt) == GIMPLE_PHI
      && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
      && vinfo_for_stmt (def_stmt)
      && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
	   == vect_double_reduction_def)
    {
      *adjustment_def = NULL;
      return vect_create_destination_var (init_val, vectype);
    }

  if (TREE_CONSTANT (init_val))
    {
      if (SCALAR_FLOAT_TYPE_P (scalar_type))
	init_value = build_real (scalar_type, TREE_REAL_CST (init_val));
      else
	init_value = build_int_cst (scalar_type, TREE_INT_CST_LOW (init_val));
    }
  else
    init_value = init_val;

  switch (code)
    {
      case WIDEN_SUM_EXPR:
      case DOT_PROD_EXPR:
      case PLUS_EXPR:
      case MINUS_EXPR:
      case BIT_IOR_EXPR:
      case BIT_XOR_EXPR:
      case MULT_EXPR:
      case BIT_AND_EXPR:
	/* ADJUSTMENT_DEF is NULL when called from
	   vect_create_epilog_for_reduction to vectorize double reduction.  */
	if (adjustment_def)
	  {
	    if (nested_in_vect_loop)
	      *adjustment_def = vect_get_vec_def_for_operand (init_val, stmt,
							      NULL);
	    else
	      *adjustment_def = init_val;
	  }

	if (code == MULT_EXPR)
	  {
	    real_init_val = dconst1;
	    int_init_val = 1;
	  }

	if (code == BIT_AND_EXPR)
	  int_init_val = -1;

	if (SCALAR_FLOAT_TYPE_P (scalar_type))
	  def_for_init = build_real (scalar_type, real_init_val);
	else
	  def_for_init = build_int_cst (scalar_type, int_init_val);

	/* Create a vector of '0' or '1' except the first element.  */
	elts = XALLOCAVEC (tree, nunits);
	for (i = nunits - 2; i >= 0; --i)
	  elts[i + 1] = def_for_init;

	/* Option1: the first element is '0' or '1' as well.  */
	if (adjustment_def)
	  {
	    elts[0] = def_for_init;
	    init_def = build_vector (vectype, elts);
	    break;
	  }

	/* Option2: the first element is INIT_VAL.  */
	elts[0] = init_val;
	if (TREE_CONSTANT (init_val))
	  init_def = build_vector (vectype, elts);
	else
	  {
	    vec<constructor_elt, va_gc> *v;
	    vec_alloc (v, nunits);
	    CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, init_val);
	    for (i = 1; i < nunits; ++i)
	      CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, elts[i]);
	    init_def = build_constructor (vectype, v);
	  }
	break;

      case MIN_EXPR:
      case MAX_EXPR:
      case COND_EXPR:
	if (adjustment_def)
	  {
	    *adjustment_def = NULL_TREE;
	    init_def = vect_get_vec_def_for_operand (init_val, stmt, NULL);
	    break;
	  }

	init_def = build_vector_from_val (vectype, init_value);
	break;

      default:
	gcc_unreachable ();
    }

  return init_def;
}
/* Function vect_create_epilog_for_reduction

   Create code at the loop-epilog to finalize the result of a reduction
   computation.

   VECT_DEFS is list of vector of partial results, i.e., the lhs's of vector
     reduction statements.
   STMT is the scalar reduction stmt that is being vectorized.
   NCOPIES is > 1 in case the vectorization factor (VF) is bigger than the
     number of elements that we can fit in a vectype (nunits).  In this case
     we have to generate more than one vector stmt - i.e - we need to "unroll"
     the vector stmt by a factor VF/nunits.  For more details see documentation
     in vectorizable_operation.
   REDUC_CODE is the tree-code for the epilog reduction.
   REDUCTION_PHIS is a list of the phi-nodes that carry the reduction
     computation.
   REDUC_INDEX is the index of the operand in the right hand side of the
     statement that is defined by REDUCTION_PHI.
   DOUBLE_REDUC is TRUE if double reduction phi nodes should be handled.
   SLP_NODE is an SLP node containing a group of reduction statements.  The
     first one in this group is STMT.

   This function:
   1. Creates the reduction def-use cycles: sets the arguments for
      REDUCTION_PHIS:
      The loop-entry argument is the vectorized initial-value of the reduction.
      The loop-latch argument is taken from VECT_DEFS - the vector of partial
      sums.
   2. "Reduces" each vector of partial results VECT_DEFS into a single result,
      by applying the operation specified by REDUC_CODE if available, or by
      other means (whole-vector shifts or a scalar loop).
      The function also creates a new phi node at the loop exit to preserve
      loop-closed form, as illustrated below.

     The flow at the entry to this function:

	loop:
	  vec_def = phi <null, null>            # REDUCTION_PHI
	  VECT_DEF = vector_stmt                # vectorized form of STMT
	  s_loop = scalar_stmt                  # (scalar) STMT
	loop_exit:
	  s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
	  use <s_out0>
	  use <s_out0>

     The above is transformed by this function into:

	loop:
	  vec_def = phi <vec_init, VECT_DEF>    # REDUCTION_PHI
	  VECT_DEF = vector_stmt                # vectorized form of STMT
	  s_loop = scalar_stmt                  # (scalar) STMT
	loop_exit:
	  s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
	  v_out1 = phi <VECT_DEF>               # NEW_EXIT_PHI
	  v_out2 = reduce <v_out1>
	  s_out3 = extract_field <v_out2, 0>
	  s_out4 = adjust_result <s_out3>
	  use <s_out4>
	  use <s_out4>
*/
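
/* Worked example (illustrative only, not part of the original sources): for
   a 4-element vector of partial sums v = [a0, a1, a2, a3], the "reduce"
   step can be done with whole-vector shifts when no direct reduction
   operation is available:

     v' = vec_shift <v, 2 elements>;  v = v + v'
	  -> two elements now hold a0+a2 and a1+a3
     v' = vec_shift <v, 1 element>;   v = v + v'
	  -> one element now holds a0+a1+a2+a3

   and the scalar result is then extracted from that element.  This is
   "Case 2" in the function below; "Case 1" uses the target's direct
   reduction operation and "Case 3" a scalar loop over the elements.  */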
3680 vect_create_epilog_for_reduction (vec
<tree
> vect_defs
, gimple stmt
,
3681 int ncopies
, enum tree_code reduc_code
,
3682 vec
<gimple
> reduction_phis
,
3683 int reduc_index
, bool double_reduc
,
3686 stmt_vec_info stmt_info
= vinfo_for_stmt (stmt
);
3687 stmt_vec_info prev_phi_info
;
3689 enum machine_mode mode
;
3690 loop_vec_info loop_vinfo
= STMT_VINFO_LOOP_VINFO (stmt_info
);
3691 struct loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
), *outer_loop
= NULL
;
3692 basic_block exit_bb
;
3695 gimple new_phi
= NULL
, phi
;
3696 gimple_stmt_iterator exit_gsi
;
3698 tree new_temp
= NULL_TREE
, new_dest
, new_name
, new_scalar_dest
;
3699 gimple epilog_stmt
= NULL
;
3700 enum tree_code code
= gimple_assign_rhs_code (stmt
);
3702 tree bitsize
, bitpos
;
3703 tree adjustment_def
= NULL
;
3704 tree vec_initial_def
= NULL
;
3705 tree reduction_op
, expr
, def
;
3706 tree orig_name
, scalar_result
;
3707 imm_use_iterator imm_iter
, phi_imm_iter
;
3708 use_operand_p use_p
, phi_use_p
;
3709 bool extract_scalar_result
= false;
3710 gimple use_stmt
, orig_stmt
, reduction_phi
= NULL
;
3711 bool nested_in_vect_loop
= false;
3712 vec
<gimple
> new_phis
= vNULL
;
3713 vec
<gimple
> inner_phis
= vNULL
;
3714 enum vect_def_type dt
= vect_unknown_def_type
;
3716 vec
<tree
> scalar_results
= vNULL
;
3717 unsigned int group_size
= 1, k
, ratio
;
3718 vec
<tree
> vec_initial_defs
= vNULL
;
3720 bool slp_reduc
= false;
3721 tree new_phi_result
;
3722 gimple inner_phi
= NULL
;
3725 group_size
= SLP_TREE_SCALAR_STMTS (slp_node
).length ();
3727 if (nested_in_vect_loop_p (loop
, stmt
))
3731 nested_in_vect_loop
= true;
3732 gcc_assert (!slp_node
);
3735 switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt
)))
3737 case GIMPLE_SINGLE_RHS
:
3738 gcc_assert (TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt
))
3740 reduction_op
= TREE_OPERAND (gimple_assign_rhs1 (stmt
), reduc_index
);
3742 case GIMPLE_UNARY_RHS
:
3743 reduction_op
= gimple_assign_rhs1 (stmt
);
3745 case GIMPLE_BINARY_RHS
:
3746 reduction_op
= reduc_index
?
3747 gimple_assign_rhs2 (stmt
) : gimple_assign_rhs1 (stmt
);
3749 case GIMPLE_TERNARY_RHS
:
3750 reduction_op
= gimple_op (stmt
, reduc_index
+ 1);
3756 vectype
= get_vectype_for_scalar_type (TREE_TYPE (reduction_op
));
3757 gcc_assert (vectype
);
3758 mode
= TYPE_MODE (vectype
);
3760 /* 1. Create the reduction def-use cycle:
3761 Set the arguments of REDUCTION_PHIS, i.e., transform
3764 vec_def = phi <null, null> # REDUCTION_PHI
3765 VECT_DEF = vector_stmt # vectorized form of STMT
3771 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
3772 VECT_DEF = vector_stmt # vectorized form of STMT
3775 (in case of SLP, do it for all the phis). */
3777 /* Get the loop-entry arguments. */
3779 vect_get_vec_defs (reduction_op
, NULL_TREE
, stmt
, &vec_initial_defs
,
3780 NULL
, slp_node
, reduc_index
);
3783 vec_initial_defs
.create (1);
3784 /* For the case of reduction, vect_get_vec_def_for_operand returns
3785 the scalar def before the loop, that defines the initial value
3786 of the reduction variable. */
3787 vec_initial_def
= vect_get_vec_def_for_operand (reduction_op
, stmt
,
3789 vec_initial_defs
.quick_push (vec_initial_def
);
3792 /* Set phi nodes arguments. */
3793 FOR_EACH_VEC_ELT (reduction_phis
, i
, phi
)
3795 tree vec_init_def
= vec_initial_defs
[i
];
3796 tree def
= vect_defs
[i
];
3797 for (j
= 0; j
< ncopies
; j
++)
3799 /* Set the loop-entry arg of the reduction-phi. */
3800 add_phi_arg (phi
, vec_init_def
, loop_preheader_edge (loop
),
3803 /* Set the loop-latch arg for the reduction-phi. */
3805 def
= vect_get_vec_def_for_stmt_copy (vect_unknown_def_type
, def
);
3807 add_phi_arg (phi
, def
, loop_latch_edge (loop
), UNKNOWN_LOCATION
);
3809 if (dump_enabled_p ())
3811 dump_printf_loc (MSG_NOTE
, vect_location
,
3812 "transform reduction: created def-use cycle: ");
3813 dump_gimple_stmt (MSG_NOTE
, TDF_SLIM
, phi
, 0);
3814 dump_printf (MSG_NOTE
, "\n");
3815 dump_gimple_stmt (MSG_NOTE
, TDF_SLIM
, SSA_NAME_DEF_STMT (def
), 0);
3818 phi
= STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi
));
3822 vec_initial_defs
.release ();
3824 /* 2. Create epilog code.
3825 The reduction epilog code operates across the elements of the vector
3826 of partial results computed by the vectorized loop.
3827 The reduction epilog code consists of:
3829 step 1: compute the scalar result in a vector (v_out2)
3830 step 2: extract the scalar result (s_out3) from the vector (v_out2)
3831 step 3: adjust the scalar result (s_out3) if needed.
3833 Step 1 can be accomplished using one the following three schemes:
3834 (scheme 1) using reduc_code, if available.
3835 (scheme 2) using whole-vector shifts, if available.
3836 (scheme 3) using a scalar loop. In this case steps 1+2 above are
3839 The overall epilog code looks like this:
3841 s_out0 = phi <s_loop> # original EXIT_PHI
3842 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
3843 v_out2 = reduce <v_out1> # step 1
3844 s_out3 = extract_field <v_out2, 0> # step 2
3845 s_out4 = adjust_result <s_out3> # step 3
3847 (step 3 is optional, and steps 1 and 2 may be combined).
3848 Lastly, the uses of s_out0 are replaced by s_out4. */
3851 /* 2.1 Create new loop-exit-phis to preserve loop-closed form:
3852 v_out1 = phi <VECT_DEF>
3853 Store them in NEW_PHIS. */
3855 exit_bb
= single_exit (loop
)->dest
;
3856 prev_phi_info
= NULL
;
3857 new_phis
.create (vect_defs
.length ());
3858 FOR_EACH_VEC_ELT (vect_defs
, i
, def
)
3860 for (j
= 0; j
< ncopies
; j
++)
3862 tree new_def
= copy_ssa_name (def
, NULL
);
3863 phi
= create_phi_node (new_def
, exit_bb
);
3864 set_vinfo_for_stmt (phi
, new_stmt_vec_info (phi
, loop_vinfo
, NULL
));
3866 new_phis
.quick_push (phi
);
3869 def
= vect_get_vec_def_for_stmt_copy (dt
, def
);
3870 STMT_VINFO_RELATED_STMT (prev_phi_info
) = phi
;
3873 SET_PHI_ARG_DEF (phi
, single_exit (loop
)->dest_idx
, def
);
3874 prev_phi_info
= vinfo_for_stmt (phi
);
3878 /* The epilogue is created for the outer-loop, i.e., for the loop being
3879 vectorized. Create exit phis for the outer loop. */
3883 exit_bb
= single_exit (loop
)->dest
;
3884 inner_phis
.create (vect_defs
.length ());
3885 FOR_EACH_VEC_ELT (new_phis
, i
, phi
)
3887 tree new_result
= copy_ssa_name (PHI_RESULT (phi
), NULL
);
3888 gimple outer_phi
= create_phi_node (new_result
, exit_bb
);
3889 SET_PHI_ARG_DEF (outer_phi
, single_exit (loop
)->dest_idx
,
3891 set_vinfo_for_stmt (outer_phi
, new_stmt_vec_info (outer_phi
,
3893 inner_phis
.quick_push (phi
);
3894 new_phis
[i
] = outer_phi
;
3895 prev_phi_info
= vinfo_for_stmt (outer_phi
);
3896 while (STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi
)))
3898 phi
= STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi
));
3899 new_result
= copy_ssa_name (PHI_RESULT (phi
), NULL
);
3900 outer_phi
= create_phi_node (new_result
, exit_bb
);
3901 SET_PHI_ARG_DEF (outer_phi
, single_exit (loop
)->dest_idx
,
3903 set_vinfo_for_stmt (outer_phi
, new_stmt_vec_info (outer_phi
,
3905 STMT_VINFO_RELATED_STMT (prev_phi_info
) = outer_phi
;
3906 prev_phi_info
= vinfo_for_stmt (outer_phi
);
3911 exit_gsi
= gsi_after_labels (exit_bb
);
3913 /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
3914 (i.e. when reduc_code is not available) and in the final adjustment
3915 code (if needed). Also get the original scalar reduction variable as
3916 defined in the loop. In case STMT is a "pattern-stmt" (i.e. - it
3917 represents a reduction pattern), the tree-code and scalar-def are
3918 taken from the original stmt that the pattern-stmt (STMT) replaces.
3919 Otherwise (it is a regular reduction) - the tree-code and scalar-def
3920 are taken from STMT. */
3922 orig_stmt
= STMT_VINFO_RELATED_STMT (stmt_info
);
3925 /* Regular reduction */
3930 /* Reduction pattern */
3931 stmt_vec_info stmt_vinfo
= vinfo_for_stmt (orig_stmt
);
3932 gcc_assert (STMT_VINFO_IN_PATTERN_P (stmt_vinfo
));
3933 gcc_assert (STMT_VINFO_RELATED_STMT (stmt_vinfo
) == stmt
);
3936 code
= gimple_assign_rhs_code (orig_stmt
);
3937 /* For MINUS_EXPR the initial vector is [init_val,0,...,0], therefore,
3938 partial results are added and not subtracted. */
3939 if (code
== MINUS_EXPR
)
3942 scalar_dest
= gimple_assign_lhs (orig_stmt
);
3943 scalar_type
= TREE_TYPE (scalar_dest
);
3944 scalar_results
.create (group_size
);
3945 new_scalar_dest
= vect_create_destination_var (scalar_dest
, NULL
);
3946 bitsize
= TYPE_SIZE (scalar_type
);
3948 /* In case this is a reduction in an inner-loop while vectorizing an outer
3949 loop - we don't need to extract a single scalar result at the end of the
3950 inner-loop (unless it is double reduction, i.e., the use of reduction is
3951 outside the outer-loop). The final vector of partial results will be used
3952 in the vectorized outer-loop, or reduced to a scalar result at the end of
3954   if (nested_in_vect_loop && !double_reduc)
3955     goto vect_finalize_reduction;
3957 /* SLP reduction without reduction chain, e.g.,
3961 b2 = operation (b1) */
3962   slp_reduc = (slp_node && !GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)));
3964 /* In case of reduction chain, e.g.,
3967 a3 = operation (a2),
3969      we may end up with more than one vector result.  Here we reduce them to one vector.  */
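/* For illustration (an assumed example, not taken from the surrounding code):
   a reduction chain such as

     s = 0;
     for (i = 0; i < N; i++)
       {
         s += a[2*i];        <-- a2 = operation (a1)
         s += a[2*i + 1];    <-- a3 = operation (a2)
       }

   may give each statement in the chain its own vector accumulator, so the
   loop can produce several vector partial results.  The code below folds
   them pairwise with the reduction operation (e.g. PLUS_EXPR) into a single
   vector before the final scalar reduction is emitted.  */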
3971   if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
3973       tree first_vect = PHI_RESULT (new_phis[0]);
3975       gimple new_vec_stmt = NULL;
3977       vec_dest = vect_create_destination_var (scalar_dest, vectype);
3978       for (k = 1; k < new_phis.length (); k++)
3980           gimple next_phi = new_phis[k];
3981           tree second_vect = PHI_RESULT (next_phi);
3983           tmp = build2 (code, vectype, first_vect, second_vect);
3984           new_vec_stmt = gimple_build_assign (vec_dest, tmp);
3985           first_vect = make_ssa_name (vec_dest, new_vec_stmt);
3986           gimple_assign_set_lhs (new_vec_stmt, first_vect);
3987           gsi_insert_before (&exit_gsi, new_vec_stmt, GSI_SAME_STMT);
3990       new_phi_result = first_vect;
3993       new_phis.truncate (0);
3994       new_phis.safe_push (new_vec_stmt);
3998     new_phi_result = PHI_RESULT (new_phis[0]);
4000 /* 2.3 Create the reduction code, using one of the three schemes described
4001 above. In SLP we simply need to extract all the elements from the
4002 vector (without reducing them), so we use scalar shifts. */
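/* As an illustrative sketch (assuming a 4-element integer vector and a sum
   reduction), the three schemes reduce v_out1 = {v0,v1,v2,v3} roughly as:

     1. direct reduction:    s = reduc_plus <v_out1>;
     2. whole-vector shifts: t = v_out1 >> (2 elements);  v = v_out1 + t;
                             t = v      >> (1 element);   v = v + t;
                             s = v[0];
     3. scalar extraction:   s = v0 + v1 + v2 + v3;

   For SLP no cross-element reduction is performed; the elements are only
   extracted.  */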
4003   if (reduc_code != ERROR_MARK && !slp_reduc)
4007       /*** Case 1:  Create:
4008            v_out2 = reduc_expr <v_out1>  */
4010       if (dump_enabled_p ())
4011         dump_printf_loc (MSG_NOTE, vect_location,
4012                          "Reduce using direct vector reduction.");
4014       vec_dest = vect_create_destination_var (scalar_dest, vectype);
4015       tmp = build1 (reduc_code, vectype, new_phi_result);
4016       epilog_stmt = gimple_build_assign (vec_dest, tmp);
4017       new_temp = make_ssa_name (vec_dest, epilog_stmt);
4018       gimple_assign_set_lhs (epilog_stmt, new_temp);
4019       gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4021       extract_scalar_result = true;
4025       enum tree_code shift_code = ERROR_MARK;
4026       bool have_whole_vector_shift = true;
4028       int element_bitsize = tree_low_cst (bitsize, 1);
4029       int vec_size_in_bits = tree_low_cst (TYPE_SIZE (vectype), 1);
4032       if (optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
4033         shift_code = VEC_RSHIFT_EXPR;
4035         have_whole_vector_shift = false;
4037       /* Regardless of whether we have a whole vector shift, if we're
4038          emulating the operation via tree-vect-generic, we don't want
4039          to use it.  Only the first round of the reduction is likely
4040          to still be profitable via emulation.  */
4041       /* ??? It might be better to emit a reduction tree code here, so that
4042          tree-vect-generic can expand the first round via bit tricks.  */
4043       if (!VECTOR_MODE_P (mode))
4044         have_whole_vector_shift = false;
4047           optab optab = optab_for_tree_code (code, vectype, optab_default);
4048           if (optab_handler (optab, mode) == CODE_FOR_nothing)
4049             have_whole_vector_shift = false;
4052       if (have_whole_vector_shift && !slp_reduc)
4054 /*** Case 2: Create:
4055 for (offset = VS/2; offset >= element_size; offset/=2)
4057 Create: va' = vec_shift <va, offset>
4058                  Create:  va = vop <va, va'>  }  */
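/* A hedged example of this shift-based scheme for a V8HI sum (assumed
   element count of 8, 16-bit elements): the loop below runs log2(8) = 3
   times, shifting by 4, 2 and then 1 element, after which the full sum
   sits in element 0 of the vector.  */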
4061           if (dump_enabled_p ())
4062             dump_printf_loc (MSG_NOTE, vect_location,
4063                              "Reduce using vector shifts");
4065           vec_dest = vect_create_destination_var (scalar_dest, vectype);
4066           new_temp = new_phi_result;
4067           for (bit_offset = vec_size_in_bits/2;
4068                bit_offset >= element_bitsize; bit_offset /= 2)
4071               tree bitpos = size_int (bit_offset);
4073               epilog_stmt = gimple_build_assign_with_ops (shift_code,
4074                                                           vec_dest, new_temp, bitpos);
4075               new_name = make_ssa_name (vec_dest, epilog_stmt);
4076               gimple_assign_set_lhs (epilog_stmt, new_name);
4077               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4079               epilog_stmt = gimple_build_assign_with_ops (code, vec_dest,
4080                                                           new_name, new_temp);
4081               new_temp = make_ssa_name (vec_dest, epilog_stmt);
4082               gimple_assign_set_lhs (epilog_stmt, new_temp);
4083               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4086           extract_scalar_result = true;
4092 /*** Case 3: Create:
4093 s = extract_field <v_out2, 0>
4094 for (offset = element_size;
4095 offset < vector_size;
4096 offset += element_size;)
4098 Create: s' = extract_field <v_out2, offset>
4099               Create:  s = op <s, s'>  // For non SLP cases  }  */
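/* Illustrative sketch (assumed types, not part of the original code): for a
   V4SI vector v_out2 this emits

     s  = BIT_FIELD_REF <v_out2, 32, 0>;
     s' = BIT_FIELD_REF <v_out2, 32, 32>;   s = s + s';
     s' = BIT_FIELD_REF <v_out2, 32, 64>;   s = s + s';
     s' = BIT_FIELD_REF <v_out2, 32, 96>;   s = s + s';

   whereas for SLP only the extractions are emitted and the s' values are
   collected in SCALAR_RESULTS.  */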
4102       if (dump_enabled_p ())
4103         dump_printf_loc (MSG_NOTE, vect_location,
4104                          "Reduce using scalar code. ");
4106       vec_size_in_bits = tree_low_cst (TYPE_SIZE (vectype), 1);
4107       FOR_EACH_VEC_ELT (new_phis, i, new_phi)
4109           if (gimple_code (new_phi) == GIMPLE_PHI)
4110             vec_temp = PHI_RESULT (new_phi);
4112             vec_temp = gimple_assign_lhs (new_phi);
4113           rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, bitsize, bitsize_zero_node);
4115           epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
4116           new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
4117           gimple_assign_set_lhs (epilog_stmt, new_temp);
4118           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4120           /* In SLP we don't need to apply reduction operation, so we just
4121              collect s' values in SCALAR_RESULTS.  */
4123             scalar_results.safe_push (new_temp);
4125           for (bit_offset = element_bitsize;
4126                bit_offset < vec_size_in_bits;
4127                bit_offset += element_bitsize)
4129               tree bitpos = bitsize_int (bit_offset);
4130               tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, bitsize, bitpos);
4133               epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
4134               new_name = make_ssa_name (new_scalar_dest, epilog_stmt);
4135               gimple_assign_set_lhs (epilog_stmt, new_name);
4136               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4140               /* In SLP we don't need to apply reduction operation, so
4141                  we just collect s' values in SCALAR_RESULTS.  */
4142                 new_temp = new_name;
4143                 scalar_results.safe_push (new_name);
4147                 epilog_stmt = gimple_build_assign_with_ops (code,
4148                                                             new_scalar_dest, new_name, new_temp);
4149                 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
4150                 gimple_assign_set_lhs (epilog_stmt, new_temp);
4151                 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4156       /* The only case where we need to reduce scalar results in SLP, is
4157          unrolling.  If the size of SCALAR_RESULTS is greater than
4158          GROUP_SIZE, we reduce them combining elements modulo GROUP_SIZE.  */
4162           tree res, first_res, new_res;
4165           /* Reduce multiple scalar results in case of SLP unrolling.  */
4166           for (j = group_size; scalar_results.iterate (j, &res); j++)
4169               first_res = scalar_results[j % group_size];
4170               new_stmt = gimple_build_assign_with_ops (code,
4171                                                        new_scalar_dest, first_res, res);
4172               new_res = make_ssa_name (new_scalar_dest, new_stmt);
4173               gimple_assign_set_lhs (new_stmt, new_res);
4174               gsi_insert_before (&exit_gsi, new_stmt, GSI_SAME_STMT);
4175               scalar_results[j % group_size] = new_res;
4179         /* Not SLP - we have one scalar to keep in SCALAR_RESULTS.  */
4180         scalar_results.safe_push (new_temp);
4182       extract_scalar_result = false;
4186   /* 2.4  Extract the final scalar result.  Create:
4187           s_out3 = extract_field <v_out2, bitpos>  */
4189   if (extract_scalar_result)
4193       if (dump_enabled_p ())
4194         dump_printf_loc (MSG_NOTE, vect_location,
4195                          "extract scalar result");
4197       if (BYTES_BIG_ENDIAN)
4198         bitpos = size_binop (MULT_EXPR,
4199                              bitsize_int (TYPE_VECTOR_SUBPARTS (vectype) - 1),
4200                              TYPE_SIZE (scalar_type));
4202         bitpos = bitsize_zero_node;
4204       rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp, bitsize, bitpos);
4205       epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
4206       new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
4207       gimple_assign_set_lhs (epilog_stmt, new_temp);
4208       gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4209       scalar_results.safe_push (new_temp);
4212 vect_finalize_reduction:
4217 /* 2.5 Adjust the final result by the initial value of the reduction
4218 variable. (When such adjustment is not needed, then
4219 'adjustment_def' is zero). For example, if code is PLUS we create:
4220 new_temp = loop_exit_def + adjustment_def */
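/* For example (an assumed case): for the scalar loop

     s = 5;
     for (i = 0; i < N; i++)
       s += a[i];

   the vector accumulator may be started at {0,0,...,0}, in which case
   ADJUSTMENT_DEF holds the original initial value and the statement created
   here computes  new_temp = loop_exit_def + 5.  */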
4224       gcc_assert (!slp_reduc);
4225       if (nested_in_vect_loop)
4227           new_phi = new_phis[0];
4228           gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) == VECTOR_TYPE);
4229           expr = build2 (code, vectype, PHI_RESULT (new_phi), adjustment_def);
4230           new_dest = vect_create_destination_var (scalar_dest, vectype);
4234           new_temp = scalar_results[0];
4235           gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
4236           expr = build2 (code, scalar_type, new_temp, adjustment_def);
4237           new_dest = vect_create_destination_var (scalar_dest, scalar_type);
4240       epilog_stmt = gimple_build_assign (new_dest, expr);
4241       new_temp = make_ssa_name (new_dest, epilog_stmt);
4242       gimple_assign_set_lhs (epilog_stmt, new_temp);
4243       SSA_NAME_DEF_STMT (new_temp) = epilog_stmt;
4244       gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4245       if (nested_in_vect_loop)
4247           set_vinfo_for_stmt (epilog_stmt,
4248                               new_stmt_vec_info (epilog_stmt, loop_vinfo, NULL));
4250           STMT_VINFO_RELATED_STMT (vinfo_for_stmt (epilog_stmt)) =
4251                 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (new_phi));
4254             scalar_results.quick_push (new_temp);
4256             scalar_results[0] = new_temp;
4259         scalar_results[0] = new_temp;
4261   new_phis[0] = epilog_stmt;
4264 /* 2.6 Handle the loop-exit phis. Replace the uses of scalar loop-exit
4265 phis with new adjusted scalar results, i.e., replace use <s_out0>
4270 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
4271 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
4272 v_out2 = reduce <v_out1>
4273 s_out3 = extract_field <v_out2, 0>
4274 s_out4 = adjust_result <s_out3>
4281 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
4282 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
4283 v_out2 = reduce <v_out1>
4284 s_out3 = extract_field <v_out2, 0>
4285 s_out4 = adjust_result <s_out3>
4290   /* In SLP reduction chain we reduce vector results into one vector if
4291      necessary, hence we set here GROUP_SIZE to 1.  SCALAR_DEST is the LHS of
4292      the last stmt in the reduction chain, since we are looking for the loop exit phi node.  */
4294   if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
4296       scalar_dest = gimple_assign_lhs (
4297                       SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1]);
4301 /* In SLP we may have several statements in NEW_PHIS and REDUCTION_PHIS (in
4302 case that GROUP_SIZE is greater than vectorization factor). Therefore, we
4303 need to match SCALAR_RESULTS with corresponding statements. The first
4304 (GROUP_SIZE / number of new vector stmts) scalar results correspond to
4305 the first vector stmt, etc.
4306 (RATIO is equal to (GROUP_SIZE / number of new vector stmts)). */
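/* E.g. (assumed numbers): with GROUP_SIZE == 8 and 2 new vector statements,
   RATIO == 4, so scalar results 0-3 belong to the first vector statement and
   results 4-7 to the second.  */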
4307   if (group_size > new_phis.length ())
4309       ratio = group_size / new_phis.length ();
4310       gcc_assert (!(group_size % new_phis.length ()));
4315   for (k = 0; k < group_size; k++)
4319           epilog_stmt = new_phis[k / ratio];
4320           reduction_phi = reduction_phis[k / ratio];
4322             inner_phi = inner_phis[k / ratio];
4327           gimple current_stmt = SLP_TREE_SCALAR_STMTS (slp_node)[k];
4329           orig_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (current_stmt));
4330           /* SLP statements can't participate in patterns.  */
4331           gcc_assert (!orig_stmt);
4332           scalar_dest = gimple_assign_lhs (current_stmt);
4336       /* Find the loop-closed-use at the loop exit of the original scalar
4337          result.  (The reduction result is expected to have two immediate uses -
4338          one at the latch block, and one at the loop exit).  */
4339       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
4340         if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
4341           phis.safe_push (USE_STMT (use_p));
4343       /* We expect to have found an exit_phi because of loop-closed-ssa form.  */
4345       gcc_assert (!phis.is_empty ());
4347       FOR_EACH_VEC_ELT (phis, i, exit_phi)
4351 stmt_vec_info exit_phi_vinfo
= vinfo_for_stmt (exit_phi
);
4354 /* FORNOW. Currently not supporting the case that an inner-loop
4355 reduction is not used in the outer-loop (but only outside the
4356 outer-loop), unless it is double reduction. */
4357 gcc_assert ((STMT_VINFO_RELEVANT_P (exit_phi_vinfo
)
4358 && !STMT_VINFO_LIVE_P (exit_phi_vinfo
))
4361 STMT_VINFO_VEC_STMT (exit_phi_vinfo
) = epilog_stmt
;
4363 || STMT_VINFO_DEF_TYPE (exit_phi_vinfo
)
4364 != vect_double_reduction_def
)
4367 /* Handle double reduction:
4369 stmt1: s1 = phi <s0, s2> - double reduction phi (outer loop)
4370 stmt2: s3 = phi <s1, s4> - (regular) reduc phi (inner loop)
4371 stmt3: s4 = use (s3) - (regular) reduc stmt (inner loop)
4372 stmt4: s2 = phi <s4> - double reduction stmt (outer loop)
4374 At that point the regular reduction (stmt2 and stmt3) is
4375 already vectorized, as well as the exit phi node, stmt4.
4376 Here we vectorize the phi node of double reduction, stmt1, and
4377 update all relevant statements. */
4379 /* Go through all the uses of s2 to find double reduction phi
4380 node, i.e., stmt1 above. */
4381 orig_name
= PHI_RESULT (exit_phi
);
4382 FOR_EACH_IMM_USE_STMT (use_stmt
, imm_iter
, orig_name
)
4384 stmt_vec_info use_stmt_vinfo
;
4385 stmt_vec_info new_phi_vinfo
;
4386 tree vect_phi_init
, preheader_arg
, vect_phi_res
, init_def
;
4387 basic_block bb
= gimple_bb (use_stmt
);
4390 /* Check that USE_STMT is really double reduction phi
4392 if (gimple_code (use_stmt
) != GIMPLE_PHI
4393 || gimple_phi_num_args (use_stmt
) != 2
4394 || bb
->loop_father
!= outer_loop
)
4396 use_stmt_vinfo
= vinfo_for_stmt (use_stmt
);
4398 || STMT_VINFO_DEF_TYPE (use_stmt_vinfo
)
4399 != vect_double_reduction_def
)
4402 /* Create vector phi node for double reduction:
4403 vs1 = phi <vs0, vs2>
4404 vs1 was created previously in this function by a call to
4405 vect_get_vec_def_for_operand and is stored in
4407 vs2 is defined by INNER_PHI, the vectorized EXIT_PHI;
4408 vs0 is created here. */
4410 /* Create vector phi node. */
4411 vect_phi
= create_phi_node (vec_initial_def
, bb
);
4412 new_phi_vinfo
= new_stmt_vec_info (vect_phi
,
4413 loop_vec_info_for_loop (outer_loop
), NULL
);
4414 set_vinfo_for_stmt (vect_phi
, new_phi_vinfo
);
4416 /* Create vs0 - initial def of the double reduction phi. */
4417 preheader_arg
= PHI_ARG_DEF_FROM_EDGE (use_stmt
,
4418 loop_preheader_edge (outer_loop
));
4419 init_def
= get_initial_def_for_reduction (stmt
,
4420 preheader_arg
, NULL
);
4421 vect_phi_init
= vect_init_vector (use_stmt
, init_def
,
4424 /* Update phi node arguments with vs0 and vs2. */
4425 add_phi_arg (vect_phi
, vect_phi_init
,
4426 loop_preheader_edge (outer_loop
),
4428 add_phi_arg (vect_phi
, PHI_RESULT (inner_phi
),
4429 loop_latch_edge (outer_loop
), UNKNOWN_LOCATION
);
4430 if (dump_enabled_p ())
4432 dump_printf_loc (MSG_NOTE
, vect_location
,
4433 "created double reduction phi node: ");
4434 dump_gimple_stmt (MSG_NOTE
, TDF_SLIM
, vect_phi
, 0);
4437 vect_phi_res
= PHI_RESULT (vect_phi
);
4439 /* Replace the use, i.e., set the correct vs1 in the regular
4440 reduction phi node. FORNOW, NCOPIES is always 1, so the
4441 loop is redundant. */
4442 use
= reduction_phi
;
4443 for (j
= 0; j
< ncopies
; j
++)
4445 edge pr_edge
= loop_preheader_edge (loop
);
4446 SET_PHI_ARG_DEF (use
, pr_edge
->dest_idx
, vect_phi_res
);
4447 use
= STMT_VINFO_RELATED_STMT (vinfo_for_stmt (use
));
4454 if (nested_in_vect_loop
)
4463 /* Find the loop-closed-use at the loop exit of the original scalar
4464 result. (The reduction result is expected to have two immediate uses,
4465 one at the latch block, and one at the loop exit). For double
4466 reductions we are looking for exit phis of the outer loop. */
4467 FOR_EACH_IMM_USE_FAST (use_p
, imm_iter
, scalar_dest
)
4469 if (!flow_bb_inside_loop_p (loop
, gimple_bb (USE_STMT (use_p
))))
4470 phis
.safe_push (USE_STMT (use_p
));
4473 if (double_reduc
&& gimple_code (USE_STMT (use_p
)) == GIMPLE_PHI
)
4475 tree phi_res
= PHI_RESULT (USE_STMT (use_p
));
4477 FOR_EACH_IMM_USE_FAST (phi_use_p
, phi_imm_iter
, phi_res
)
4479 if (!flow_bb_inside_loop_p (loop
,
4480 gimple_bb (USE_STMT (phi_use_p
))))
4481 phis
.safe_push (USE_STMT (phi_use_p
));
4487 FOR_EACH_VEC_ELT (phis
, i
, exit_phi
)
4489 /* Replace the uses: */
4490 orig_name
= PHI_RESULT (exit_phi
);
4491 scalar_result
= scalar_results
[k
];
4492 FOR_EACH_IMM_USE_STMT (use_stmt
, imm_iter
, orig_name
)
4493 FOR_EACH_IMM_USE_ON_STMT (use_p
, imm_iter
)
4494 SET_USE (use_p
, scalar_result
);
4500 scalar_results
.release ();
4501 inner_phis
.release ();
4502 new_phis
.release ();
4506 /* Function vectorizable_reduction.
4508 Check if STMT performs a reduction operation that can be vectorized.
4509 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
4510 stmt to replace it, put it in VEC_STMT, and insert it at GSI.
4511 Return FALSE if not a vectorizable STMT, TRUE otherwise.
4513 This function also handles reduction idioms (patterns) that have been
4514 recognized in advance during vect_pattern_recog. In this case, STMT may be
4516 X = pattern_expr (arg0, arg1, ..., X)
4517    and its STMT_VINFO_RELATED_STMT points to the last stmt in the original
4518 sequence that had been detected and replaced by the pattern-stmt (STMT).
4520 In some cases of reduction patterns, the type of the reduction variable X is
4521 different than the type of the other arguments of STMT.
4522 In such cases, the vectype that is used when transforming STMT into a vector
4523 stmt is different than the vectype that is used to determine the
4524 vectorization factor, because it consists of a different number of elements
4525 than the actual number of elements that are being operated upon in parallel.
4527 For example, consider an accumulation of shorts into an int accumulator.
4528 On some targets it's possible to vectorize this pattern operating on 8
4529 shorts at a time (hence, the vectype for purposes of determining the
4530 vectorization factor should be V8HI); on the other hand, the vectype that
4531 is used to create the vector form is actually V4SI (the type of the result).
4533 Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
4534 indicates what is the actual level of parallelism (V8HI in the example), so
4535 that the right vectorization factor would be derived. This vectype
4536 corresponds to the type of arguments to the reduction stmt, and should *NOT*
4537 be used to create the vectorized stmt. The right vectype for the vectorized
4538 stmt is obtained from the type of the result X:
4539 get_vectype_for_scalar_type (TREE_TYPE (X))
4541 This means that, contrary to "regular" reductions (or "regular" stmts in
4542 general), the following equation:
4543 STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (TREE_TYPE (X))
4544 does *NOT* necessarily hold for reduction patterns. */
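/* A concrete (illustrative) source example of such a reduction pattern:

     short a[N];
     int sum = 0;
     for (i = 0; i < N; i++)
       sum += a[i];            <-- recognized as  sum = widen_sum <a[i], sum>

   Here the vectype used to compute the vectorization factor is V8HI (from
   the short arguments), while the vectorized statement itself produces a
   V4SI result.  */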
4547 vectorizable_reduction (gimple stmt, gimple_stmt_iterator *gsi,
4548                         gimple *vec_stmt, slp_tree slp_node)
4552 tree loop_vec_def0
= NULL_TREE
, loop_vec_def1
= NULL_TREE
;
4553 stmt_vec_info stmt_info
= vinfo_for_stmt (stmt
);
4554 tree vectype_out
= STMT_VINFO_VECTYPE (stmt_info
);
4555 tree vectype_in
= NULL_TREE
;
4556 loop_vec_info loop_vinfo
= STMT_VINFO_LOOP_VINFO (stmt_info
);
4557 struct loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
4558 enum tree_code code
, orig_code
, epilog_reduc_code
;
4559 enum machine_mode vec_mode
;
4561 optab optab
, reduc_optab
;
4562 tree new_temp
= NULL_TREE
;
4565 enum vect_def_type dt
;
4566 gimple new_phi
= NULL
;
4570 stmt_vec_info orig_stmt_info
;
4571 tree expr
= NULL_TREE
;
4575 stmt_vec_info prev_stmt_info
, prev_phi_info
;
4576 bool single_defuse_cycle
= false;
4577 tree reduc_def
= NULL_TREE
;
4578 gimple new_stmt
= NULL
;
4581 bool nested_cycle
= false, found_nested_cycle_def
= false;
4582 gimple reduc_def_stmt
= NULL
;
4583 /* The default is that the reduction variable is the last in statement. */
4584 int reduc_index
= 2;
4585 bool double_reduc
= false, dummy
;
4587 struct loop
* def_stmt_loop
, *outer_loop
= NULL
;
4589 gimple def_arg_stmt
;
4590 vec
<tree
> vec_oprnds0
= vNULL
;
4591 vec
<tree
> vec_oprnds1
= vNULL
;
4592 vec
<tree
> vect_defs
= vNULL
;
4593 vec
<gimple
> phis
= vNULL
;
4595 tree def0
, def1
, tem
, op0
, op1
= NULL_TREE
;
4597 /* In case of reduction chain we switch to the first stmt in the chain, but
4598 we don't update STMT_INFO, since only the last stmt is marked as reduction
4599 and has reduction properties. */
4600 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt
)))
4601 stmt
= GROUP_FIRST_ELEMENT (stmt_info
);
4603 if (nested_in_vect_loop_p (loop
, stmt
))
4607 nested_cycle
= true;
4610 /* 1. Is vectorizable reduction? */
4611 /* Not supportable if the reduction variable is used in the loop, unless
4612 it's a reduction chain. */
4613 if (STMT_VINFO_RELEVANT (stmt_info
) > vect_used_in_outer
4614 && !GROUP_FIRST_ELEMENT (stmt_info
))
4617 /* Reductions that are not used even in an enclosing outer-loop,
4618 are expected to be "live" (used out of the loop). */
4619 if (STMT_VINFO_RELEVANT (stmt_info
) == vect_unused_in_scope
4620 && !STMT_VINFO_LIVE_P (stmt_info
))
4623 /* Make sure it was already recognized as a reduction computation. */
4624 if (STMT_VINFO_DEF_TYPE (stmt_info
) != vect_reduction_def
4625 && STMT_VINFO_DEF_TYPE (stmt_info
) != vect_nested_cycle
)
4628 /* 2. Has this been recognized as a reduction pattern?
4630 Check if STMT represents a pattern that has been recognized
4631 in earlier analysis stages. For stmts that represent a pattern,
4632 the STMT_VINFO_RELATED_STMT field records the last stmt in
4633 the original sequence that constitutes the pattern. */
4635 orig_stmt
= STMT_VINFO_RELATED_STMT (stmt_info
);
4638 orig_stmt_info
= vinfo_for_stmt (orig_stmt
);
4639 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info
));
4640 gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info
));
4643 /* 3. Check the operands of the operation. The first operands are defined
4644 inside the loop body. The last operand is the reduction variable,
4645 which is defined by the loop-header-phi. */
4647 gcc_assert (is_gimple_assign (stmt
));
4650 switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt
)))
4652 case GIMPLE_SINGLE_RHS
:
4653 op_type
= TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt
));
4654 if (op_type
== ternary_op
)
4656 tree rhs
= gimple_assign_rhs1 (stmt
);
4657 ops
[0] = TREE_OPERAND (rhs
, 0);
4658 ops
[1] = TREE_OPERAND (rhs
, 1);
4659 ops
[2] = TREE_OPERAND (rhs
, 2);
4660 code
= TREE_CODE (rhs
);
4666 case GIMPLE_BINARY_RHS
:
4667 code
= gimple_assign_rhs_code (stmt
);
4668 op_type
= TREE_CODE_LENGTH (code
);
4669 gcc_assert (op_type
== binary_op
);
4670 ops
[0] = gimple_assign_rhs1 (stmt
);
4671 ops
[1] = gimple_assign_rhs2 (stmt
);
4674 case GIMPLE_TERNARY_RHS
:
4675 code
= gimple_assign_rhs_code (stmt
);
4676 op_type
= TREE_CODE_LENGTH (code
);
4677 gcc_assert (op_type
== ternary_op
);
4678 ops
[0] = gimple_assign_rhs1 (stmt
);
4679 ops
[1] = gimple_assign_rhs2 (stmt
);
4680 ops
[2] = gimple_assign_rhs3 (stmt
);
4683 case GIMPLE_UNARY_RHS
:
4690 if (code
== COND_EXPR
&& slp_node
)
4693 scalar_dest
= gimple_assign_lhs (stmt
);
4694 scalar_type
= TREE_TYPE (scalar_dest
);
4695 if (!POINTER_TYPE_P (scalar_type
) && !INTEGRAL_TYPE_P (scalar_type
)
4696 && !SCALAR_FLOAT_TYPE_P (scalar_type
))
4699 /* Do not try to vectorize bit-precision reductions. */
4700 if ((TYPE_PRECISION (scalar_type
)
4701 != GET_MODE_PRECISION (TYPE_MODE (scalar_type
))))
4704 /* All uses but the last are expected to be defined in the loop.
4705 The last use is the reduction variable. In case of nested cycle this
4706 assumption is not true: we use reduc_index to record the index of the
4707 reduction variable. */
4708 for (i
= 0; i
< op_type
- 1; i
++)
4710 /* The condition of COND_EXPR is checked in vectorizable_condition(). */
4711 if (i
== 0 && code
== COND_EXPR
)
4714 is_simple_use
= vect_is_simple_use_1 (ops
[i
], stmt
, loop_vinfo
, NULL
,
4715 &def_stmt
, &def
, &dt
, &tem
);
4718 gcc_assert (is_simple_use
);
4720 if (dt
!= vect_internal_def
4721 && dt
!= vect_external_def
4722 && dt
!= vect_constant_def
4723 && dt
!= vect_induction_def
4724 && !(dt
== vect_nested_cycle
&& nested_cycle
))
4727 if (dt
== vect_nested_cycle
)
4729 found_nested_cycle_def
= true;
4730 reduc_def_stmt
= def_stmt
;
4735 is_simple_use
= vect_is_simple_use_1 (ops
[i
], stmt
, loop_vinfo
, NULL
,
4736 &def_stmt
, &def
, &dt
, &tem
);
4739 gcc_assert (is_simple_use
);
4740 if (!(dt
== vect_reduction_def
4741 || dt
== vect_nested_cycle
4742 || ((dt
== vect_internal_def
|| dt
== vect_external_def
4743 || dt
== vect_constant_def
|| dt
== vect_induction_def
)
4744 && nested_cycle
&& found_nested_cycle_def
)))
4746 /* For pattern recognized stmts, orig_stmt might be a reduction,
4747 but some helper statements for the pattern might not, or
4748 might be COND_EXPRs with reduction uses in the condition. */
4749 gcc_assert (orig_stmt
);
4752 if (!found_nested_cycle_def
)
4753 reduc_def_stmt
= def_stmt
;
4755 gcc_assert (gimple_code (reduc_def_stmt
) == GIMPLE_PHI
);
4757 gcc_assert (orig_stmt
== vect_is_simple_reduction (loop_vinfo
,
4763 gimple tmp
= vect_is_simple_reduction (loop_vinfo
, reduc_def_stmt
,
4764 !nested_cycle
, &dummy
);
4765 /* We changed STMT to be the first stmt in reduction chain, hence we
4766 check that in this case the first element in the chain is STMT. */
4767 gcc_assert (stmt
== tmp
4768 || GROUP_FIRST_ELEMENT (vinfo_for_stmt (tmp
)) == stmt
);
4771 if (STMT_VINFO_LIVE_P (vinfo_for_stmt (reduc_def_stmt
)))
4774 if (slp_node
|| PURE_SLP_STMT (stmt_info
))
4777 ncopies
= (LOOP_VINFO_VECT_FACTOR (loop_vinfo
)
4778 / TYPE_VECTOR_SUBPARTS (vectype_in
));
4780 gcc_assert (ncopies
>= 1);
4782 vec_mode
= TYPE_MODE (vectype_in
);
4784 if (code
== COND_EXPR
)
4786 if (!vectorizable_condition (stmt
, gsi
, NULL
, ops
[reduc_index
], 0, NULL
))
4788 if (dump_enabled_p ())
4789 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
4790 "unsupported condition in reduction");
4797 /* 4. Supportable by target? */
4799 if (code
== LSHIFT_EXPR
|| code
== RSHIFT_EXPR
4800 || code
== LROTATE_EXPR
|| code
== RROTATE_EXPR
)
4802 /* Shifts and rotates are only supported by vectorizable_shifts,
4803 not vectorizable_reduction. */
4804 if (dump_enabled_p ())
4805 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
4806 "unsupported shift or rotation.");
4810 /* 4.1. check support for the operation in the loop */
4811 optab
= optab_for_tree_code (code
, vectype_in
, optab_default
);
4814 if (dump_enabled_p ())
4815 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
4821 if (optab_handler (optab
, vec_mode
) == CODE_FOR_nothing
)
4823 if (dump_enabled_p ())
4824 dump_printf (MSG_NOTE
, "op not supported by target.");
4826 if (GET_MODE_SIZE (vec_mode
) != UNITS_PER_WORD
4827 || LOOP_VINFO_VECT_FACTOR (loop_vinfo
)
4828 < vect_min_worthwhile_factor (code
))
4831 if (dump_enabled_p ())
4832 dump_printf (MSG_NOTE
, "proceeding using word mode.");
4835 /* Worthwhile without SIMD support? */
4836 if (!VECTOR_MODE_P (TYPE_MODE (vectype_in
))
4837 && LOOP_VINFO_VECT_FACTOR (loop_vinfo
)
4838 < vect_min_worthwhile_factor (code
))
4840 if (dump_enabled_p ())
4841 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
4842 "not worthwhile without SIMD support.");
4848 /* 4.2. Check support for the epilog operation.
4850 If STMT represents a reduction pattern, then the type of the
4851 reduction variable may be different than the type of the rest
4852 of the arguments. For example, consider the case of accumulation
4853 of shorts into an int accumulator; The original code:
4854 S1: int_a = (int) short_a;
4855 orig_stmt-> S2: int_acc = plus <int_a ,int_acc>;
4858 STMT: int_acc = widen_sum <short_a, int_acc>
4861 1. The tree-code that is used to create the vector operation in the
4862 epilog code (that reduces the partial results) is not the
4863 tree-code of STMT, but is rather the tree-code of the original
4864 stmt from the pattern that STMT is replacing. I.e, in the example
4865 above we want to use 'widen_sum' in the loop, but 'plus' in the
4867 2. The type (mode) we use to check available target support
4868 for the vector operation to be created in the *epilog*, is
4869 determined by the type of the reduction variable (in the example
4870 above we'd check this: optab_handler (plus_optab, vect_int_mode])).
4871 However the type (mode) we use to check available target support
4872 for the vector operation to be created *inside the loop*, is
4873 determined by the type of the other arguments to STMT (in the
4874 example we'd check this: optab_handler (widen_sum_optab,
4877 This is contrary to "regular" reductions, in which the types of all
4878 the arguments are the same as the type of the reduction variable.
4879 For "regular" reductions we can therefore use the same vector type
4880 (and also the same tree-code) when generating the epilog code and
4881 when generating the code inside the loop. */
4885 /* This is a reduction pattern: get the vectype from the type of the
4886 reduction variable, and get the tree-code from orig_stmt. */
4887 orig_code
= gimple_assign_rhs_code (orig_stmt
);
4888 gcc_assert (vectype_out
);
4889 vec_mode
= TYPE_MODE (vectype_out
);
4893 /* Regular reduction: use the same vectype and tree-code as used for
4894 the vector code inside the loop can be used for the epilog code. */
4900 def_bb
= gimple_bb (reduc_def_stmt
);
4901 def_stmt_loop
= def_bb
->loop_father
;
4902 def_arg
= PHI_ARG_DEF_FROM_EDGE (reduc_def_stmt
,
4903 loop_preheader_edge (def_stmt_loop
));
4904 if (TREE_CODE (def_arg
) == SSA_NAME
4905 && (def_arg_stmt
= SSA_NAME_DEF_STMT (def_arg
))
4906 && gimple_code (def_arg_stmt
) == GIMPLE_PHI
4907 && flow_bb_inside_loop_p (outer_loop
, gimple_bb (def_arg_stmt
))
4908 && vinfo_for_stmt (def_arg_stmt
)
4909 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_arg_stmt
))
4910 == vect_double_reduction_def
)
4911 double_reduc
= true;
4914 epilog_reduc_code
= ERROR_MARK
;
4915 if (reduction_code_for_scalar_code (orig_code
, &epilog_reduc_code
))
4917 reduc_optab
= optab_for_tree_code (epilog_reduc_code
, vectype_out
,
4921 if (dump_enabled_p ())
4922 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
4923 "no optab for reduction.");
4925 epilog_reduc_code
= ERROR_MARK
;
4929 && optab_handler (reduc_optab
, vec_mode
) == CODE_FOR_nothing
)
4931 if (dump_enabled_p ())
4932 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
4933 "reduc op not supported by target.");
4935 epilog_reduc_code
= ERROR_MARK
;
4940 if (!nested_cycle
|| double_reduc
)
4942 if (dump_enabled_p ())
4943 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
4944 "no reduc code for scalar code.");
4950 if (double_reduc
&& ncopies
> 1)
4952 if (dump_enabled_p ())
4953 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
4954 "multiple types in double reduction");
4959   /* In case of widening multiplication by a constant, we update the type
4960 of the constant to be the type of the other operand. We check that the
4961 constant fits the type in the pattern recognition pass. */
4962 if (code
== DOT_PROD_EXPR
4963 && !types_compatible_p (TREE_TYPE (ops
[0]), TREE_TYPE (ops
[1])))
4965 if (TREE_CODE (ops
[0]) == INTEGER_CST
)
4966 ops
[0] = fold_convert (TREE_TYPE (ops
[1]), ops
[0]);
4967 else if (TREE_CODE (ops
[1]) == INTEGER_CST
)
4968 ops
[1] = fold_convert (TREE_TYPE (ops
[0]), ops
[1]);
4971 if (dump_enabled_p ())
4972 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
4973 "invalid types in dot-prod");
4979 if (!vec_stmt
) /* transformation not required. */
4981 if (!vect_model_reduction_cost (stmt_info
, epilog_reduc_code
, ncopies
))
4983 STMT_VINFO_TYPE (stmt_info
) = reduc_vec_info_type
;
4989 if (dump_enabled_p ())
4990 dump_printf_loc (MSG_NOTE
, vect_location
, "transform reduction.");
4992 /* FORNOW: Multiple types are not supported for condition. */
4993 if (code
== COND_EXPR
)
4994 gcc_assert (ncopies
== 1);
4996 /* Create the destination vector */
4997 vec_dest
= vect_create_destination_var (scalar_dest
, vectype_out
);
4999 /* In case the vectorization factor (VF) is bigger than the number
5000 of elements that we can fit in a vectype (nunits), we have to generate
5001 more than one vector stmt - i.e - we need to "unroll" the
5002 vector stmt by a factor VF/nunits. For more details see documentation
5003 in vectorizable_operation. */
5005 /* If the reduction is used in an outer loop we need to generate
5006 VF intermediate results, like so (e.g. for ncopies=2):
5011 (i.e. we generate VF results in 2 registers).
5012 In this case we have a separate def-use cycle for each copy, and therefore
5013 for each copy we get the vector def for the reduction variable from the
5014 respective phi node created for this copy.
5016 Otherwise (the reduction is unused in the loop nest), we can combine
5017 together intermediate results, like so (e.g. for ncopies=2):
5021 (i.e. we generate VF/2 results in a single register).
5022 In this case for each copy we get the vector def for the reduction variable
5023    from the vectorized reduction operation generated in the previous iteration.  */
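/* Illustrative sketch for ncopies == 2 (assumed names, not the generated
   GIMPLE verbatim):

     used in an outer loop (separate def-use cycle per copy):
       vphi_1 = phi <vinit_1, vsum_1>      vsum_1 = vphi_1 + vx_1
       vphi_2 = phi <vinit_2, vsum_2>      vsum_2 = vphi_2 + vx_2

     unused in the loop nest (one combined cycle):
       vphi   = phi <vinit, vsum_2>        vsum_1 = vphi   + vx_1
                                           vsum_2 = vsum_1 + vx_2   */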
5026 if (STMT_VINFO_RELEVANT (stmt_info
) == vect_unused_in_scope
)
5028 single_defuse_cycle
= true;
5032 epilog_copies
= ncopies
;
5034 prev_stmt_info
= NULL
;
5035 prev_phi_info
= NULL
;
5038 vec_num
= SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node
);
5039 gcc_assert (TYPE_VECTOR_SUBPARTS (vectype_out
)
5040 == TYPE_VECTOR_SUBPARTS (vectype_in
));
5045 vec_oprnds0
.create (1);
5046 if (op_type
== ternary_op
)
5047 vec_oprnds1
.create (1);
5050 phis
.create (vec_num
);
5051 vect_defs
.create (vec_num
);
5053 vect_defs
.quick_push (NULL_TREE
);
5055 for (j
= 0; j
< ncopies
; j
++)
5057 if (j
== 0 || !single_defuse_cycle
)
5059 for (i
= 0; i
< vec_num
; i
++)
5061 /* Create the reduction-phi that defines the reduction
5063 new_phi
= create_phi_node (vec_dest
, loop
->header
);
5064 set_vinfo_for_stmt (new_phi
,
5065 new_stmt_vec_info (new_phi
, loop_vinfo
,
5067 if (j
== 0 || slp_node
)
5068 phis
.quick_push (new_phi
);
5072 if (code
== COND_EXPR
)
5074 gcc_assert (!slp_node
);
5075 vectorizable_condition (stmt
, gsi
, vec_stmt
,
5076 PHI_RESULT (phis
[0]),
5078 /* Multiple types are not supported for condition. */
5085 op0
= ops
[!reduc_index
];
5086 if (op_type
== ternary_op
)
5088 if (reduc_index
== 0)
5095 vect_get_vec_defs (op0
, op1
, stmt
, &vec_oprnds0
, &vec_oprnds1
,
5099 loop_vec_def0
= vect_get_vec_def_for_operand (ops
[!reduc_index
],
5101 vec_oprnds0
.quick_push (loop_vec_def0
);
5102 if (op_type
== ternary_op
)
5104 loop_vec_def1
= vect_get_vec_def_for_operand (op1
, stmt
,
5106 vec_oprnds1
.quick_push (loop_vec_def1
);
5114 enum vect_def_type dt
;
5118 vect_is_simple_use (ops
[!reduc_index
], stmt
, loop_vinfo
, NULL
,
5119 &dummy_stmt
, &dummy
, &dt
);
5120 loop_vec_def0
= vect_get_vec_def_for_stmt_copy (dt
,
5122 vec_oprnds0
[0] = loop_vec_def0
;
5123 if (op_type
== ternary_op
)
5125 vect_is_simple_use (op1
, stmt
, loop_vinfo
, NULL
, &dummy_stmt
,
5127 loop_vec_def1
= vect_get_vec_def_for_stmt_copy (dt
,
5129 vec_oprnds1
[0] = loop_vec_def1
;
5133 if (single_defuse_cycle
)
5134 reduc_def
= gimple_assign_lhs (new_stmt
);
5136 STMT_VINFO_RELATED_STMT (prev_phi_info
) = new_phi
;
5139 FOR_EACH_VEC_ELT (vec_oprnds0
, i
, def0
)
5142 reduc_def
= PHI_RESULT (phis
[i
]);
5145 if (!single_defuse_cycle
|| j
== 0)
5146 reduc_def
= PHI_RESULT (new_phi
);
5149 def1
= ((op_type
== ternary_op
)
5150 ? vec_oprnds1
[i
] : NULL
);
5151 if (op_type
== binary_op
)
5153 if (reduc_index
== 0)
5154 expr
= build2 (code
, vectype_out
, reduc_def
, def0
);
5156 expr
= build2 (code
, vectype_out
, def0
, reduc_def
);
5160 if (reduc_index
== 0)
5161 expr
= build3 (code
, vectype_out
, reduc_def
, def0
, def1
);
5164 if (reduc_index
== 1)
5165 expr
= build3 (code
, vectype_out
, def0
, reduc_def
, def1
);
5167 expr
= build3 (code
, vectype_out
, def0
, def1
, reduc_def
);
5171 new_stmt
= gimple_build_assign (vec_dest
, expr
);
5172 new_temp
= make_ssa_name (vec_dest
, new_stmt
);
5173 gimple_assign_set_lhs (new_stmt
, new_temp
);
5174 vect_finish_stmt_generation (stmt
, new_stmt
, gsi
);
5178 SLP_TREE_VEC_STMTS (slp_node
).quick_push (new_stmt
);
5179 vect_defs
.quick_push (new_temp
);
5182 vect_defs
[0] = new_temp
;
5189 STMT_VINFO_VEC_STMT (stmt_info
) = *vec_stmt
= new_stmt
;
5191 STMT_VINFO_RELATED_STMT (prev_stmt_info
) = new_stmt
;
5193 prev_stmt_info
= vinfo_for_stmt (new_stmt
);
5194 prev_phi_info
= vinfo_for_stmt (new_phi
);
5197 /* Finalize the reduction-phi (set its arguments) and create the
5198 epilog reduction code. */
5199 if ((!single_defuse_cycle
|| code
== COND_EXPR
) && !slp_node
)
5201 new_temp
= gimple_assign_lhs (*vec_stmt
);
5202 vect_defs
[0] = new_temp
;
5205 vect_create_epilog_for_reduction (vect_defs
, stmt
, epilog_copies
,
5206 epilog_reduc_code
, phis
, reduc_index
,
5207 double_reduc
, slp_node
);
5210 vect_defs
.release ();
5211 vec_oprnds0
.release ();
5212 vec_oprnds1
.release ();
5217 /* Function vect_min_worthwhile_factor.
5219 For a loop where we could vectorize the operation indicated by CODE,
5220 return the minimum vectorization factor that makes it worthwhile
5221 to use generic vectors. */
5223 vect_min_worthwhile_factor (enum tree_code code)
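/* A rough sketch of the kind of mapping this function provides (illustrative
   only, not necessarily the exact table used):

     switch (code)
       {
       case PLUS_EXPR: case MINUS_EXPR: case NEGATE_EXPR:
         return 4;
       case ABS_EXPR: case MAX_EXPR: case MIN_EXPR:
         return 3;
       default:
         return INT_MAX;   // never worthwhile without real vector support
       }  */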
5244 /* Function vectorizable_induction
5246 Check if PHI performs an induction computation that can be vectorized.
5247 If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
5248 phi to replace it, put it in VEC_STMT, and add it to the same basic block.
5249 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
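/* Example of an induction this handles (illustrative):

     j = j0;
     for (i = 0; i < N; i++)
       {
         a[i] = j;
         j += 4;
       }

   The phi for 'j' is replaced by a vector phi whose initial value is
   {j0, j0+4, j0+8, j0+12} and whose per-iteration step is {16,16,16,16}
   (assuming a 4-element vector).  */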
5252 vectorizable_induction (gimple phi, gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED, gimple *vec_stmt)
= vinfo_for_stmt (phi
);
5256 tree vectype
= STMT_VINFO_VECTYPE (stmt_info
);
5257 loop_vec_info loop_vinfo
= STMT_VINFO_LOOP_VINFO (stmt_info
);
5258 struct loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
5259 int nunits
= TYPE_VECTOR_SUBPARTS (vectype
);
5260 int ncopies
= LOOP_VINFO_VECT_FACTOR (loop_vinfo
) / nunits
;
5263 gcc_assert (ncopies
>= 1);
5264 /* FORNOW. These restrictions should be relaxed. */
5265 if (nested_in_vect_loop_p (loop
, phi
))
5267 imm_use_iterator imm_iter
;
5268 use_operand_p use_p
;
5275 if (dump_enabled_p ())
5276 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
5277 "multiple types in nested loop.");
5282 latch_e
= loop_latch_edge (loop
->inner
);
5283 loop_arg
= PHI_ARG_DEF_FROM_EDGE (phi
, latch_e
);
5284 FOR_EACH_IMM_USE_FAST (use_p
, imm_iter
, loop_arg
)
5286 if (!flow_bb_inside_loop_p (loop
->inner
,
5287 gimple_bb (USE_STMT (use_p
))))
5289 exit_phi
= USE_STMT (use_p
);
5295 stmt_vec_info exit_phi_vinfo
= vinfo_for_stmt (exit_phi
);
5296 if (!(STMT_VINFO_RELEVANT_P (exit_phi_vinfo
)
5297 && !STMT_VINFO_LIVE_P (exit_phi_vinfo
)))
5299 if (dump_enabled_p ())
5300 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
5301 "inner-loop induction only used outside "
5302 "of the outer vectorized loop.");
5308 if (!STMT_VINFO_RELEVANT_P (stmt_info
))
5311 /* FORNOW: SLP not supported. */
5312 if (STMT_SLP_TYPE (stmt_info
))
5315 gcc_assert (STMT_VINFO_DEF_TYPE (stmt_info
) == vect_induction_def
);
5317 if (gimple_code (phi
) != GIMPLE_PHI
)
5320 if (!vec_stmt
) /* transformation not required. */
5322 STMT_VINFO_TYPE (stmt_info
) = induc_vec_info_type
;
5323 if (dump_enabled_p ())
5324 dump_printf_loc (MSG_NOTE
, vect_location
,
5325 "=== vectorizable_induction ===");
5326 vect_model_induction_cost (stmt_info
, ncopies
);
5332 if (dump_enabled_p ())
5333 dump_printf_loc (MSG_NOTE
, vect_location
, "transform induction phi.");
5335 vec_def
= get_initial_def_for_induction (phi
);
5336 *vec_stmt
= SSA_NAME_DEF_STMT (vec_def
);
5340 /* Function vectorizable_live_operation.
5342 STMT computes a value that is used outside the loop. Check if
5343 it can be supported. */
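/* Example (illustrative) of such a "live" statement, where x and y are loop
   invariant:

     for (i = 0; i < N; i++)
       {
         a[i] = b[i] + x;
         last = x + y;        <-- also used after the loop
       }
     use (last);

   FORNOW only the case where all operands of the live statement are loop
   invariant is accepted, so the scalar statement can simply stay in place.  */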
5346 vectorizable_live_operation (gimple stmt,
5347                              gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
5348                              gimple *vec_stmt ATTRIBUTE_UNUSED)
5350 stmt_vec_info stmt_info
= vinfo_for_stmt (stmt
);
5351 loop_vec_info loop_vinfo
= STMT_VINFO_LOOP_VINFO (stmt_info
);
5352 struct loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
5358 enum vect_def_type dt
;
5359 enum tree_code code
;
5360 enum gimple_rhs_class rhs_class
;
5362 gcc_assert (STMT_VINFO_LIVE_P (stmt_info
));
5364 if (STMT_VINFO_DEF_TYPE (stmt_info
) == vect_reduction_def
)
5367 if (!is_gimple_assign (stmt
))
5370 if (TREE_CODE (gimple_assign_lhs (stmt
)) != SSA_NAME
)
5373 /* FORNOW. CHECKME. */
5374 if (nested_in_vect_loop_p (loop
, stmt
))
5377 code
= gimple_assign_rhs_code (stmt
);
5378 op_type
= TREE_CODE_LENGTH (code
);
5379 rhs_class
= get_gimple_rhs_class (code
);
5380 gcc_assert (rhs_class
!= GIMPLE_UNARY_RHS
|| op_type
== unary_op
);
5381 gcc_assert (rhs_class
!= GIMPLE_BINARY_RHS
|| op_type
== binary_op
);
5383 /* FORNOW: support only if all uses are invariant. This means
5384 that the scalar operations can remain in place, unvectorized.
5385 The original last scalar value that they compute will be used. */
5387 for (i
= 0; i
< op_type
; i
++)
5389 if (rhs_class
== GIMPLE_SINGLE_RHS
)
5390 op
= TREE_OPERAND (gimple_op (stmt
, 1), i
);
5392 op
= gimple_op (stmt
, i
+ 1);
5394 && !vect_is_simple_use (op
, stmt
, loop_vinfo
, NULL
, &def_stmt
, &def
,
5397 if (dump_enabled_p ())
5398 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
5403 if (dt
!= vect_external_def
&& dt
!= vect_constant_def
)
5407 /* No transformation is required for the cases we currently support. */
5411 /* Kill any debug uses outside LOOP of SSA names defined in STMT. */
5414 vect_loop_kill_debug_uses (struct loop *loop, gimple stmt)
5416 ssa_op_iter op_iter
;
5417 imm_use_iterator imm_iter
;
5418 def_operand_p def_p
;
5421 FOR_EACH_PHI_OR_STMT_DEF (def_p
, stmt
, op_iter
, SSA_OP_DEF
)
5423 FOR_EACH_IMM_USE_STMT (ustmt
, imm_iter
, DEF_FROM_PTR (def_p
))
5427 if (!is_gimple_debug (ustmt
))
5430 bb
= gimple_bb (ustmt
);
5432 if (!flow_bb_inside_loop_p (loop
, bb
))
5434 if (gimple_debug_bind_p (ustmt
))
5436 if (dump_enabled_p ())
5437 dump_printf_loc (MSG_NOTE
, vect_location
,
5438 "killing debug use");
5440 gimple_debug_bind_reset_value (ustmt
);
5441 update_stmt (ustmt
);
5450 /* Function vect_transform_loop.
5452 The analysis phase has determined that the loop is vectorizable.
5453    Vectorize the loop - create vectorized stmts to replace the scalar
5454    stmts in the loop, and update the loop exit condition.  */
5457 vect_transform_loop (loop_vec_info loop_vinfo)
5459 struct loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
5460 basic_block
*bbs
= LOOP_VINFO_BBS (loop_vinfo
);
5461 int nbbs
= loop
->num_nodes
;
5462 gimple_stmt_iterator si
;
5465 int vectorization_factor
= LOOP_VINFO_VECT_FACTOR (loop_vinfo
);
5467 bool slp_scheduled
= false;
5468 unsigned int nunits
;
5469 gimple stmt
, pattern_stmt
;
5470 gimple_seq pattern_def_seq
= NULL
;
5471 gimple_stmt_iterator pattern_def_si
= gsi_none ();
5472 bool transform_pattern_stmt
= false;
5473 bool check_profitability
= false;
5475 /* Record number of iterations before we started tampering with the profile. */
5476   gcov_type expected_iterations = expected_loop_iterations_unbounded (loop);
5478 if (dump_enabled_p ())
5479 dump_printf_loc (MSG_NOTE
, vect_location
, "=== vec_transform_loop ===");
5481   /* If the profile is imprecise, we have a chance to fix it up.  */
5482 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo
))
5483 expected_iterations
= LOOP_VINFO_INT_NITERS (loop_vinfo
);
5485 /* Use the more conservative vectorization threshold. If the number
5486 of iterations is constant assume the cost check has been performed
5487 by our caller. If the threshold makes all loops profitable that
5488 run at least the vectorization factor number of times checking
5489 is pointless, too. */
5490 th
= ((PARAM_VALUE (PARAM_MIN_VECT_LOOP_BOUND
)
5491 * LOOP_VINFO_VECT_FACTOR (loop_vinfo
)) - 1);
5492 th
= MAX (th
, LOOP_VINFO_COST_MODEL_MIN_ITERS (loop_vinfo
));
5493 if (th
>= LOOP_VINFO_VECT_FACTOR (loop_vinfo
) - 1
5494 && !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo
))
5496 if (dump_enabled_p ())
5497 dump_printf_loc (MSG_NOTE
, vect_location
,
5498 "Profitability threshold is %d loop iterations.", th
);
5499 check_profitability
= true;
5502 /* Version the loop first, if required, so the profitability check
5505 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo
)
5506 || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo
))
5508 vect_loop_versioning (loop_vinfo
, th
, check_profitability
);
5509 check_profitability
= false;
5512 /* Peel the loop if there are data refs with unknown alignment.
5513 Only one data ref with unknown store is allowed. */
5515 if (LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo
))
5517 vect_do_peeling_for_alignment (loop_vinfo
, th
, check_profitability
);
5518 check_profitability
= false;
5521 /* If the loop has a symbolic number of iterations 'n' (i.e. it's not a
5522 compile time constant), or it is a constant that doesn't divide by the
5523 vectorization factor, then an epilog loop needs to be created.
5524 We therefore duplicate the loop: the original loop will be vectorized,
5525 and will compute the first (n/VF) iterations. The second copy of the loop
5526 will remain scalar and will compute the remaining (n%VF) iterations.
5527 (VF is the vectorization factor). */
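/* Schematically (assuming VF == 8 and a simple counted loop):

     for (i = 0; i < n - n % 8; i += 8)
       vectorized body;                    <-- this loop is vectorized
     for ( ; i < n; i++)
       scalar body;                        <-- epilog loop for the n % 8 rest  */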
5529 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo
)
5530 || (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo
)
5531 && LOOP_VINFO_INT_NITERS (loop_vinfo
) % vectorization_factor
!= 0)
5532 || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo
))
5533 vect_do_peeling_for_loop_bound (loop_vinfo
, &ratio
,
5534 th
, check_profitability
);
5536 ratio
= build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo
)),
5537 LOOP_VINFO_INT_NITERS (loop_vinfo
) / vectorization_factor
);
5539 /* 1) Make sure the loop header has exactly two entries
5540 2) Make sure we have a preheader basic block. */
5542 gcc_assert (EDGE_COUNT (loop
->header
->preds
) == 2);
5544 split_edge (loop_preheader_edge (loop
));
5546   /* FORNOW: the vectorizer supports only loops whose body consists
5547      of one basic block (header + empty latch).  When the vectorizer
5548      supports more involved loop forms, the order in which the BBs are
5549      traversed will need to be reconsidered.  */
5551 for (i
= 0; i
< nbbs
; i
++)
5553 basic_block bb
= bbs
[i
];
5554 stmt_vec_info stmt_info
;
5557 for (si
= gsi_start_phis (bb
); !gsi_end_p (si
); gsi_next (&si
))
5559 phi
= gsi_stmt (si
);
5560 if (dump_enabled_p ())
5562 dump_printf_loc (MSG_NOTE
, vect_location
,
5563 "------>vectorizing phi: ");
5564 dump_gimple_stmt (MSG_NOTE
, TDF_SLIM
, phi
, 0);
5566 stmt_info
= vinfo_for_stmt (phi
);
5570 if (MAY_HAVE_DEBUG_STMTS
&& !STMT_VINFO_LIVE_P (stmt_info
))
5571 vect_loop_kill_debug_uses (loop
, phi
);
5573 if (!STMT_VINFO_RELEVANT_P (stmt_info
)
5574 && !STMT_VINFO_LIVE_P (stmt_info
))
5577 if ((TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info
))
5578 != (unsigned HOST_WIDE_INT
) vectorization_factor
)
5579 && dump_enabled_p ())
5580 dump_printf_loc (MSG_NOTE
, vect_location
, "multiple-types.");
5582 if (STMT_VINFO_DEF_TYPE (stmt_info
) == vect_induction_def
)
5584 if (dump_enabled_p ())
5585 dump_printf_loc (MSG_NOTE
, vect_location
, "transform phi.");
5586 vect_transform_stmt (phi
, NULL
, NULL
, NULL
, NULL
);
5590 pattern_stmt
= NULL
;
5591 for (si
= gsi_start_bb (bb
); !gsi_end_p (si
) || transform_pattern_stmt
;)
5595 if (transform_pattern_stmt
)
5596 stmt
= pattern_stmt
;
5598 stmt
= gsi_stmt (si
);
5600 if (dump_enabled_p ())
5602 dump_printf_loc (MSG_NOTE
, vect_location
,
5603 "------>vectorizing statement: ");
5604 dump_gimple_stmt (MSG_NOTE
, TDF_SLIM
, stmt
, 0);
5607 stmt_info
= vinfo_for_stmt (stmt
);
5609 /* vector stmts created in the outer-loop during vectorization of
5610 stmts in an inner-loop may not have a stmt_info, and do not
5611 need to be vectorized. */
5618 if (MAY_HAVE_DEBUG_STMTS
&& !STMT_VINFO_LIVE_P (stmt_info
))
5619 vect_loop_kill_debug_uses (loop
, stmt
);
5621 if (!STMT_VINFO_RELEVANT_P (stmt_info
)
5622 && !STMT_VINFO_LIVE_P (stmt_info
))
5624 if (STMT_VINFO_IN_PATTERN_P (stmt_info
)
5625 && (pattern_stmt
= STMT_VINFO_RELATED_STMT (stmt_info
))
5626 && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt
))
5627 || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt
))))
5629 stmt
= pattern_stmt
;
5630 stmt_info
= vinfo_for_stmt (stmt
);
5638 else if (STMT_VINFO_IN_PATTERN_P (stmt_info
)
5639 && (pattern_stmt
= STMT_VINFO_RELATED_STMT (stmt_info
))
5640 && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt
))
5641 || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt
))))
5642 transform_pattern_stmt
= true;
5644 /* If pattern statement has def stmts, vectorize them too. */
5645 if (is_pattern_stmt_p (stmt_info
))
5647 if (pattern_def_seq
== NULL
)
5649 pattern_def_seq
= STMT_VINFO_PATTERN_DEF_SEQ (stmt_info
);
5650 pattern_def_si
= gsi_start (pattern_def_seq
);
5652 else if (!gsi_end_p (pattern_def_si
))
5653 gsi_next (&pattern_def_si
);
5654 if (pattern_def_seq
!= NULL
)
5656 gimple pattern_def_stmt
= NULL
;
5657 stmt_vec_info pattern_def_stmt_info
= NULL
;
5659 while (!gsi_end_p (pattern_def_si
))
5661 pattern_def_stmt
= gsi_stmt (pattern_def_si
);
5662 pattern_def_stmt_info
5663 = vinfo_for_stmt (pattern_def_stmt
);
5664 if (STMT_VINFO_RELEVANT_P (pattern_def_stmt_info
)
5665 || STMT_VINFO_LIVE_P (pattern_def_stmt_info
))
5667 gsi_next (&pattern_def_si
);
5670 if (!gsi_end_p (pattern_def_si
))
5672 if (dump_enabled_p ())
5674 dump_printf_loc (MSG_NOTE
, vect_location
,
5675 "==> vectorizing pattern def "
5677 dump_gimple_stmt (MSG_NOTE
, TDF_SLIM
,
5678 pattern_def_stmt
, 0);
5681 stmt
= pattern_def_stmt
;
5682 stmt_info
= pattern_def_stmt_info
;
5686 pattern_def_si
= gsi_none ();
5687 transform_pattern_stmt
= false;
5691 transform_pattern_stmt
= false;
5694 gcc_assert (STMT_VINFO_VECTYPE (stmt_info
));
5695 nunits
= (unsigned int) TYPE_VECTOR_SUBPARTS (
5696 STMT_VINFO_VECTYPE (stmt_info
));
5697 if (!STMT_SLP_TYPE (stmt_info
)
5698 && nunits
!= (unsigned int) vectorization_factor
5699 && dump_enabled_p ())
5700 /* For SLP VF is set according to unrolling factor, and not to
5701 vector size, hence for SLP this print is not valid. */
5702 dump_printf_loc (MSG_NOTE
, vect_location
,
5705 /* SLP. Schedule all the SLP instances when the first SLP stmt is
5707 if (STMT_SLP_TYPE (stmt_info
))
5711 slp_scheduled
= true;
5713 if (dump_enabled_p ())
5714 dump_printf_loc (MSG_NOTE
, vect_location
,
5715 "=== scheduling SLP instances ===");
5717 vect_schedule_slp (loop_vinfo
, NULL
);
5720 /* Hybrid SLP stmts must be vectorized in addition to SLP. */
5721 if (!vinfo_for_stmt (stmt
) || PURE_SLP_STMT (stmt_info
))
5723 if (!transform_pattern_stmt
&& gsi_end_p (pattern_def_si
))
5725 pattern_def_seq
= NULL
;
5732 /* -------- vectorize statement ------------ */
5733 if (dump_enabled_p ())
5734 dump_printf_loc (MSG_NOTE
, vect_location
, "transform statement.");
5736 grouped_store
= false;
5737 is_store
= vect_transform_stmt (stmt
, &si
, &grouped_store
, NULL
, NULL
);
5740 if (STMT_VINFO_GROUPED_ACCESS (stmt_info
))
5742 /* Interleaving. If IS_STORE is TRUE, the vectorization of the
5743 interleaving chain was completed - free all the stores in
5746 vect_remove_stores (GROUP_FIRST_ELEMENT (stmt_info
));
5751 /* Free the attached stmt_vec_info and remove the stmt. */
5752 gimple store
= gsi_stmt (si
);
5753 free_stmt_vec_info (store
);
5754 unlink_stmt_vdef (store
);
5755 gsi_remove (&si
, true);
5756 release_defs (store
);
5761 if (!transform_pattern_stmt
&& gsi_end_p (pattern_def_si
))
5763 pattern_def_seq
= NULL
;
5769 slpeel_make_loop_iterate_ntimes (loop
, ratio
);
5771 /* Reduce loop iterations by the vectorization factor. */
5772 scale_loop_profile (loop
, GCOV_COMPUTE_SCALE (1, vectorization_factor
),
5773 expected_iterations
/ vectorization_factor
);
5774 loop
->nb_iterations_upper_bound
5775 = loop
->nb_iterations_upper_bound
.udiv (double_int::from_uhwi (vectorization_factor
),
5777 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo
)
5778 && loop
->nb_iterations_upper_bound
!= double_int_zero
)
5779 loop
->nb_iterations_upper_bound
= loop
->nb_iterations_upper_bound
- double_int_one
;
5780 if (loop
->any_estimate
)
5782 loop
->nb_iterations_estimate
5783 = loop
->nb_iterations_estimate
.udiv (double_int::from_uhwi (vectorization_factor
),
5785 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo
)
5786 && loop
->nb_iterations_estimate
!= double_int_zero
)
5787 loop
->nb_iterations_estimate
= loop
->nb_iterations_estimate
- double_int_one
;
5790 if (dump_enabled_p ())
5792 dump_printf_loc (MSG_OPTIMIZED_LOCATIONS
, vect_location
,
5793 "LOOP VECTORIZED\n");
5795 dump_printf_loc (MSG_NOTE
, vect_location
,
5796 "OUTER LOOP VECTORIZED\n");