2 Copyright (C) 2003-2016 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com> and
4 Ira Rosen <irar@il.ibm.com>
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
24 #include "coretypes.h"
31 #include "tree-pass.h"
33 #include "optabs-tree.h"
34 #include "diagnostic-core.h"
35 #include "fold-const.h"
36 #include "stor-layout.h"
39 #include "gimple-iterator.h"
40 #include "gimplify-me.h"
41 #include "tree-ssa-loop-ivopts.h"
42 #include "tree-ssa-loop-manip.h"
43 #include "tree-ssa-loop-niter.h"
46 #include "tree-scalar-evolution.h"
47 #include "tree-vectorizer.h"
48 #include "gimple-fold.h"
51 /* Loop Vectorization Pass.
53 This pass tries to vectorize loops.
55 For example, the vectorizer transforms the following simple loop:
57 short a[N]; short b[N]; short c[N]; int i;
63 as if it was manually vectorized by rewriting the source code into:
65 typedef int __attribute__((mode(V8HI))) v8hi;
66 short a[N]; short b[N]; short c[N]; int i;
67 v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
70 for (i=0; i<N/8; i++){
77 The main entry to this pass is vectorize_loops(), in which
78 the vectorizer applies a set of analyses on a given set of loops,
79 followed by the actual vectorization transformation for the loops that
80 had successfully passed the analysis phase.
81 Throughout this pass we make a distinction between two types of
82 data: scalars (which are represented by SSA_NAMES), and memory references
83 ("data-refs"). These two types of data require different handling both
84 during analysis and transformation. The types of data-refs that the
85 vectorizer currently supports are ARRAY_REFS which base is an array DECL
86 (not a pointer), and INDIRECT_REFS through pointers; both array and pointer
87 accesses are required to have a simple (consecutive) access pattern.
91 The driver for the analysis phase is vect_analyze_loop().
92 It applies a set of analyses, some of which rely on the scalar evolution
93 analyzer (scev) developed by Sebastian Pop.
95 During the analysis phase the vectorizer records some information
96 per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
97 loop, as well as general information about the loop as a whole, which is
98 recorded in a "loop_vec_info" struct attached to each loop.
100 Transformation phase:
101 =====================
102 The loop transformation phase scans all the stmts in the loop, and
103 creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
104 the loop that needs to be vectorized. It inserts the vector code sequence
105 just before the scalar stmt S, and records a pointer to the vector code
106 in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
107 attached to S). This pointer will be used for the vectorization of following
108 stmts which use the def of stmt S. Stmt S is removed if it writes to memory;
109 otherwise, we rely on dead code elimination for removing it.
111 For example, say stmt S1 was vectorized into stmt VS1:
114 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
117 To vectorize stmt S2, the vectorizer first finds the stmt that defines
118 the operand 'b' (S1), and gets the relevant vector def 'vb' from the
119 vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)). The
120 resulting sequence would be:
123 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
125 S2: a = b; STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2
127 Operands that are not SSA_NAMEs, are data-refs that appear in
128 load/store operations (like 'x[i]' in S1), and are handled differently.
132 Currently the only target specific information that is used is the
133 size of the vector (in bytes) - "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".
134 Targets that can support different sizes of vectors, for now will need
135 to specify one value for "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD". More
136 flexibility will be added in the future.
138 Since we only vectorize operations which vector form can be
139 expressed using existing tree codes, to verify that an operation is
140 supported, the vectorizer checks the relevant optab at the relevant
141 machine_mode (e.g, optab_handler (add_optab, V8HImode)). If
142 the value found is CODE_FOR_nothing, then there's no target support, and
143 we can't vectorize the stmt.
145 For additional information on this project see:
146 http://gcc.gnu.org/projects/tree-ssa/vectorization.html
149 static void vect_estimate_min_profitable_iters (loop_vec_info
, int *, int *);
151 /* Function vect_determine_vectorization_factor
153 Determine the vectorization factor (VF). VF is the number of data elements
154 that are operated upon in parallel in a single iteration of the vectorized
155 loop. For example, when vectorizing a loop that operates on 4byte elements,
156 on a target with vector size (VS) 16byte, the VF is set to 4, since 4
157 elements can fit in a single vector register.
159 We currently support vectorization of loops in which all types operated upon
160 are of the same size. Therefore this function currently sets VF according to
161 the size of the types operated upon, and fails if there are multiple sizes
164 VF is also the factor by which the loop iterations are strip-mined, e.g.:
171 for (i=0; i<N; i+=VF){
172 a[i:VF] = b[i:VF] + c[i:VF];
177 vect_determine_vectorization_factor (loop_vec_info loop_vinfo
)
179 struct loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
180 basic_block
*bbs
= LOOP_VINFO_BBS (loop_vinfo
);
181 unsigned nbbs
= loop
->num_nodes
;
182 unsigned int vectorization_factor
= 0;
187 stmt_vec_info stmt_info
;
190 gimple
*stmt
, *pattern_stmt
= NULL
;
191 gimple_seq pattern_def_seq
= NULL
;
192 gimple_stmt_iterator pattern_def_si
= gsi_none ();
193 bool analyze_pattern_stmt
= false;
195 auto_vec
<stmt_vec_info
> mask_producers
;
197 if (dump_enabled_p ())
198 dump_printf_loc (MSG_NOTE
, vect_location
,
199 "=== vect_determine_vectorization_factor ===\n");
201 for (i
= 0; i
< nbbs
; i
++)
203 basic_block bb
= bbs
[i
];
205 for (gphi_iterator si
= gsi_start_phis (bb
); !gsi_end_p (si
);
209 stmt_info
= vinfo_for_stmt (phi
);
210 if (dump_enabled_p ())
212 dump_printf_loc (MSG_NOTE
, vect_location
, "==> examining phi: ");
213 dump_gimple_stmt (MSG_NOTE
, TDF_SLIM
, phi
, 0);
214 dump_printf (MSG_NOTE
, "\n");
217 gcc_assert (stmt_info
);
219 if (STMT_VINFO_RELEVANT_P (stmt_info
)
220 || STMT_VINFO_LIVE_P (stmt_info
))
222 gcc_assert (!STMT_VINFO_VECTYPE (stmt_info
));
223 scalar_type
= TREE_TYPE (PHI_RESULT (phi
));
225 if (dump_enabled_p ())
227 dump_printf_loc (MSG_NOTE
, vect_location
,
228 "get vectype for scalar type: ");
229 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, scalar_type
);
230 dump_printf (MSG_NOTE
, "\n");
233 vectype
= get_vectype_for_scalar_type (scalar_type
);
236 if (dump_enabled_p ())
238 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
239 "not vectorized: unsupported "
241 dump_generic_expr (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
,
243 dump_printf (MSG_MISSED_OPTIMIZATION
, "\n");
247 STMT_VINFO_VECTYPE (stmt_info
) = vectype
;
249 if (dump_enabled_p ())
251 dump_printf_loc (MSG_NOTE
, vect_location
, "vectype: ");
252 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, vectype
);
253 dump_printf (MSG_NOTE
, "\n");
256 nunits
= TYPE_VECTOR_SUBPARTS (vectype
);
257 if (dump_enabled_p ())
258 dump_printf_loc (MSG_NOTE
, vect_location
, "nunits = %d\n",
261 if (!vectorization_factor
262 || (nunits
> vectorization_factor
))
263 vectorization_factor
= nunits
;
267 for (gimple_stmt_iterator si
= gsi_start_bb (bb
);
268 !gsi_end_p (si
) || analyze_pattern_stmt
;)
272 if (analyze_pattern_stmt
)
275 stmt
= gsi_stmt (si
);
277 stmt_info
= vinfo_for_stmt (stmt
);
279 if (dump_enabled_p ())
281 dump_printf_loc (MSG_NOTE
, vect_location
,
282 "==> examining statement: ");
283 dump_gimple_stmt (MSG_NOTE
, TDF_SLIM
, stmt
, 0);
284 dump_printf (MSG_NOTE
, "\n");
287 gcc_assert (stmt_info
);
289 /* Skip stmts which do not need to be vectorized. */
290 if ((!STMT_VINFO_RELEVANT_P (stmt_info
)
291 && !STMT_VINFO_LIVE_P (stmt_info
))
292 || gimple_clobber_p (stmt
))
294 if (STMT_VINFO_IN_PATTERN_P (stmt_info
)
295 && (pattern_stmt
= STMT_VINFO_RELATED_STMT (stmt_info
))
296 && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt
))
297 || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt
))))
300 stmt_info
= vinfo_for_stmt (pattern_stmt
);
301 if (dump_enabled_p ())
303 dump_printf_loc (MSG_NOTE
, vect_location
,
304 "==> examining pattern statement: ");
305 dump_gimple_stmt (MSG_NOTE
, TDF_SLIM
, stmt
, 0);
306 dump_printf (MSG_NOTE
, "\n");
311 if (dump_enabled_p ())
312 dump_printf_loc (MSG_NOTE
, vect_location
, "skip.\n");
317 else if (STMT_VINFO_IN_PATTERN_P (stmt_info
)
318 && (pattern_stmt
= STMT_VINFO_RELATED_STMT (stmt_info
))
319 && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt
))
320 || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt
))))
321 analyze_pattern_stmt
= true;
323 /* If a pattern statement has def stmts, analyze them too. */
324 if (is_pattern_stmt_p (stmt_info
))
326 if (pattern_def_seq
== NULL
)
328 pattern_def_seq
= STMT_VINFO_PATTERN_DEF_SEQ (stmt_info
);
329 pattern_def_si
= gsi_start (pattern_def_seq
);
331 else if (!gsi_end_p (pattern_def_si
))
332 gsi_next (&pattern_def_si
);
333 if (pattern_def_seq
!= NULL
)
335 gimple
*pattern_def_stmt
= NULL
;
336 stmt_vec_info pattern_def_stmt_info
= NULL
;
338 while (!gsi_end_p (pattern_def_si
))
340 pattern_def_stmt
= gsi_stmt (pattern_def_si
);
341 pattern_def_stmt_info
342 = vinfo_for_stmt (pattern_def_stmt
);
343 if (STMT_VINFO_RELEVANT_P (pattern_def_stmt_info
)
344 || STMT_VINFO_LIVE_P (pattern_def_stmt_info
))
346 gsi_next (&pattern_def_si
);
349 if (!gsi_end_p (pattern_def_si
))
351 if (dump_enabled_p ())
353 dump_printf_loc (MSG_NOTE
, vect_location
,
354 "==> examining pattern def stmt: ");
355 dump_gimple_stmt (MSG_NOTE
, TDF_SLIM
,
356 pattern_def_stmt
, 0);
357 dump_printf (MSG_NOTE
, "\n");
360 stmt
= pattern_def_stmt
;
361 stmt_info
= pattern_def_stmt_info
;
365 pattern_def_si
= gsi_none ();
366 analyze_pattern_stmt
= false;
370 analyze_pattern_stmt
= false;
373 if (gimple_get_lhs (stmt
) == NULL_TREE
374 /* MASK_STORE has no lhs, but is ok. */
375 && (!is_gimple_call (stmt
)
376 || !gimple_call_internal_p (stmt
)
377 || gimple_call_internal_fn (stmt
) != IFN_MASK_STORE
))
379 if (is_gimple_call (stmt
))
381 /* Ignore calls with no lhs. These must be calls to
382 #pragma omp simd functions, and what vectorization factor
383 it really needs can't be determined until
384 vectorizable_simd_clone_call. */
385 if (!analyze_pattern_stmt
&& gsi_end_p (pattern_def_si
))
387 pattern_def_seq
= NULL
;
392 if (dump_enabled_p ())
394 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
395 "not vectorized: irregular stmt.");
396 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, stmt
,
398 dump_printf (MSG_MISSED_OPTIMIZATION
, "\n");
403 if (VECTOR_MODE_P (TYPE_MODE (gimple_expr_type (stmt
))))
405 if (dump_enabled_p ())
407 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
408 "not vectorized: vector stmt in loop:");
409 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, stmt
, 0);
410 dump_printf (MSG_MISSED_OPTIMIZATION
, "\n");
417 if (STMT_VINFO_VECTYPE (stmt_info
))
419 /* The only case when a vectype had been already set is for stmts
420 that contain a dataref, or for "pattern-stmts" (stmts
421 generated by the vectorizer to represent/replace a certain
423 gcc_assert (STMT_VINFO_DATA_REF (stmt_info
)
424 || is_pattern_stmt_p (stmt_info
)
425 || !gsi_end_p (pattern_def_si
));
426 vectype
= STMT_VINFO_VECTYPE (stmt_info
);
430 gcc_assert (!STMT_VINFO_DATA_REF (stmt_info
));
431 if (is_gimple_call (stmt
)
432 && gimple_call_internal_p (stmt
)
433 && gimple_call_internal_fn (stmt
) == IFN_MASK_STORE
)
434 scalar_type
= TREE_TYPE (gimple_call_arg (stmt
, 3));
436 scalar_type
= TREE_TYPE (gimple_get_lhs (stmt
));
438 /* Bool ops don't participate in vectorization factor
439 computation. For comparison use compared types to
441 if (TREE_CODE (scalar_type
) == BOOLEAN_TYPE
442 && is_gimple_assign (stmt
)
443 && gimple_assign_rhs_code (stmt
) != COND_EXPR
)
445 if (STMT_VINFO_RELEVANT_P (stmt_info
)
446 || STMT_VINFO_LIVE_P (stmt_info
))
447 mask_producers
.safe_push (stmt_info
);
450 if (gimple_code (stmt
) == GIMPLE_ASSIGN
451 && TREE_CODE_CLASS (gimple_assign_rhs_code (stmt
))
453 && TREE_CODE (TREE_TYPE (gimple_assign_rhs1 (stmt
)))
455 scalar_type
= TREE_TYPE (gimple_assign_rhs1 (stmt
));
458 if (!analyze_pattern_stmt
&& gsi_end_p (pattern_def_si
))
460 pattern_def_seq
= NULL
;
467 if (dump_enabled_p ())
469 dump_printf_loc (MSG_NOTE
, vect_location
,
470 "get vectype for scalar type: ");
471 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, scalar_type
);
472 dump_printf (MSG_NOTE
, "\n");
474 vectype
= get_vectype_for_scalar_type (scalar_type
);
477 if (dump_enabled_p ())
479 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
480 "not vectorized: unsupported "
482 dump_generic_expr (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
,
484 dump_printf (MSG_MISSED_OPTIMIZATION
, "\n");
490 STMT_VINFO_VECTYPE (stmt_info
) = vectype
;
492 if (dump_enabled_p ())
494 dump_printf_loc (MSG_NOTE
, vect_location
, "vectype: ");
495 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, vectype
);
496 dump_printf (MSG_NOTE
, "\n");
500 /* Don't try to compute VF out scalar types if we stmt
501 produces boolean vector. Use result vectype instead. */
502 if (VECTOR_BOOLEAN_TYPE_P (vectype
))
503 vf_vectype
= vectype
;
506 /* The vectorization factor is according to the smallest
507 scalar type (or the largest vector size, but we only
508 support one vector size per loop). */
510 scalar_type
= vect_get_smallest_scalar_type (stmt
, &dummy
,
512 if (dump_enabled_p ())
514 dump_printf_loc (MSG_NOTE
, vect_location
,
515 "get vectype for scalar type: ");
516 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, scalar_type
);
517 dump_printf (MSG_NOTE
, "\n");
519 vf_vectype
= get_vectype_for_scalar_type (scalar_type
);
523 if (dump_enabled_p ())
525 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
526 "not vectorized: unsupported data-type ");
527 dump_generic_expr (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
,
529 dump_printf (MSG_MISSED_OPTIMIZATION
, "\n");
534 if ((GET_MODE_SIZE (TYPE_MODE (vectype
))
535 != GET_MODE_SIZE (TYPE_MODE (vf_vectype
))))
537 if (dump_enabled_p ())
539 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
540 "not vectorized: different sized vector "
541 "types in statement, ");
542 dump_generic_expr (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
,
544 dump_printf (MSG_MISSED_OPTIMIZATION
, " and ");
545 dump_generic_expr (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
,
547 dump_printf (MSG_MISSED_OPTIMIZATION
, "\n");
552 if (dump_enabled_p ())
554 dump_printf_loc (MSG_NOTE
, vect_location
, "vectype: ");
555 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, vf_vectype
);
556 dump_printf (MSG_NOTE
, "\n");
559 nunits
= TYPE_VECTOR_SUBPARTS (vf_vectype
);
560 if (dump_enabled_p ())
561 dump_printf_loc (MSG_NOTE
, vect_location
, "nunits = %d\n", nunits
);
562 if (!vectorization_factor
563 || (nunits
> vectorization_factor
))
564 vectorization_factor
= nunits
;
566 if (!analyze_pattern_stmt
&& gsi_end_p (pattern_def_si
))
568 pattern_def_seq
= NULL
;
574 /* TODO: Analyze cost. Decide if worth while to vectorize. */
575 if (dump_enabled_p ())
576 dump_printf_loc (MSG_NOTE
, vect_location
, "vectorization factor = %d\n",
577 vectorization_factor
);
578 if (vectorization_factor
<= 1)
580 if (dump_enabled_p ())
581 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
582 "not vectorized: unsupported data-type\n");
585 LOOP_VINFO_VECT_FACTOR (loop_vinfo
) = vectorization_factor
;
587 for (i
= 0; i
< mask_producers
.length (); i
++)
589 tree mask_type
= NULL
;
591 stmt
= STMT_VINFO_STMT (mask_producers
[i
]);
593 if (gimple_code (stmt
) == GIMPLE_ASSIGN
594 && TREE_CODE_CLASS (gimple_assign_rhs_code (stmt
)) == tcc_comparison
595 && TREE_CODE (TREE_TYPE (gimple_assign_rhs1 (stmt
))) != BOOLEAN_TYPE
)
597 scalar_type
= TREE_TYPE (gimple_assign_rhs1 (stmt
));
598 mask_type
= get_mask_type_for_scalar_type (scalar_type
);
602 if (dump_enabled_p ())
603 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
604 "not vectorized: unsupported mask\n");
613 enum vect_def_type dt
;
615 FOR_EACH_SSA_TREE_OPERAND (rhs
, stmt
, iter
, SSA_OP_USE
)
617 if (!vect_is_simple_use (rhs
, mask_producers
[i
]->vinfo
,
618 &def_stmt
, &dt
, &vectype
))
620 if (dump_enabled_p ())
622 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
623 "not vectorized: can't compute mask type "
625 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, stmt
,
627 dump_printf (MSG_MISSED_OPTIMIZATION
, "\n");
632 /* No vectype probably means external definition.
633 Allow it in case there is another operand which
634 allows to determine mask type. */
640 else if (TYPE_VECTOR_SUBPARTS (mask_type
)
641 != TYPE_VECTOR_SUBPARTS (vectype
))
643 if (dump_enabled_p ())
645 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
646 "not vectorized: different sized masks "
647 "types in statement, ");
648 dump_generic_expr (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
,
650 dump_printf (MSG_MISSED_OPTIMIZATION
, " and ");
651 dump_generic_expr (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
,
653 dump_printf (MSG_MISSED_OPTIMIZATION
, "\n");
657 else if (VECTOR_BOOLEAN_TYPE_P (mask_type
)
658 != VECTOR_BOOLEAN_TYPE_P (vectype
))
660 if (dump_enabled_p ())
662 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
663 "not vectorized: mixed mask and "
664 "nonmask vector types in statement, ");
665 dump_generic_expr (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
,
667 dump_printf (MSG_MISSED_OPTIMIZATION
, " and ");
668 dump_generic_expr (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
,
670 dump_printf (MSG_MISSED_OPTIMIZATION
, "\n");
676 /* We may compare boolean value loaded as vector of integers.
677 Fix mask_type in such case. */
679 && !VECTOR_BOOLEAN_TYPE_P (mask_type
)
680 && gimple_code (stmt
) == GIMPLE_ASSIGN
681 && TREE_CODE_CLASS (gimple_assign_rhs_code (stmt
)) == tcc_comparison
)
682 mask_type
= build_same_sized_truth_vector_type (mask_type
);
685 /* No mask_type should mean loop invariant predicate.
686 This is probably a subject for optimization in
690 if (dump_enabled_p ())
692 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
693 "not vectorized: can't compute mask type "
695 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, stmt
,
697 dump_printf (MSG_MISSED_OPTIMIZATION
, "\n");
702 STMT_VINFO_VECTYPE (mask_producers
[i
]) = mask_type
;
709 /* Function vect_is_simple_iv_evolution.
711 FORNOW: A simple evolution of an induction variables in the loop is
712 considered a polynomial evolution. */
715 vect_is_simple_iv_evolution (unsigned loop_nb
, tree access_fn
, tree
* init
,
720 tree evolution_part
= evolution_part_in_loop_num (access_fn
, loop_nb
);
723 /* When there is no evolution in this loop, the evolution function
725 if (evolution_part
== NULL_TREE
)
728 /* When the evolution is a polynomial of degree >= 2
729 the evolution function is not "simple". */
730 if (tree_is_chrec (evolution_part
))
733 step_expr
= evolution_part
;
734 init_expr
= unshare_expr (initial_condition_in_loop_num (access_fn
, loop_nb
));
736 if (dump_enabled_p ())
738 dump_printf_loc (MSG_NOTE
, vect_location
, "step: ");
739 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, step_expr
);
740 dump_printf (MSG_NOTE
, ", init: ");
741 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, init_expr
);
742 dump_printf (MSG_NOTE
, "\n");
748 if (TREE_CODE (step_expr
) != INTEGER_CST
749 && (TREE_CODE (step_expr
) != SSA_NAME
750 || ((bb
= gimple_bb (SSA_NAME_DEF_STMT (step_expr
)))
751 && flow_bb_inside_loop_p (get_loop (cfun
, loop_nb
), bb
))
752 || (!INTEGRAL_TYPE_P (TREE_TYPE (step_expr
))
753 && (!SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr
))
754 || !flag_associative_math
)))
755 && (TREE_CODE (step_expr
) != REAL_CST
756 || !flag_associative_math
))
758 if (dump_enabled_p ())
759 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
767 /* Function vect_analyze_scalar_cycles_1.
769 Examine the cross iteration def-use cycles of scalar variables
770 in LOOP. LOOP_VINFO represents the loop that is now being
771 considered for vectorization (can be LOOP, or an outer-loop
775 vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo
, struct loop
*loop
)
777 basic_block bb
= loop
->header
;
779 auto_vec
<gimple
*, 64> worklist
;
783 if (dump_enabled_p ())
784 dump_printf_loc (MSG_NOTE
, vect_location
,
785 "=== vect_analyze_scalar_cycles ===\n");
787 /* First - identify all inductions. Reduction detection assumes that all the
788 inductions have been identified, therefore, this order must not be
790 for (gsi
= gsi_start_phis (bb
); !gsi_end_p (gsi
); gsi_next (&gsi
))
792 gphi
*phi
= gsi
.phi ();
793 tree access_fn
= NULL
;
794 tree def
= PHI_RESULT (phi
);
795 stmt_vec_info stmt_vinfo
= vinfo_for_stmt (phi
);
797 if (dump_enabled_p ())
799 dump_printf_loc (MSG_NOTE
, vect_location
, "Analyze phi: ");
800 dump_gimple_stmt (MSG_NOTE
, TDF_SLIM
, phi
, 0);
801 dump_printf (MSG_NOTE
, "\n");
804 /* Skip virtual phi's. The data dependences that are associated with
805 virtual defs/uses (i.e., memory accesses) are analyzed elsewhere. */
806 if (virtual_operand_p (def
))
809 STMT_VINFO_DEF_TYPE (stmt_vinfo
) = vect_unknown_def_type
;
811 /* Analyze the evolution function. */
812 access_fn
= analyze_scalar_evolution (loop
, def
);
815 STRIP_NOPS (access_fn
);
816 if (dump_enabled_p ())
818 dump_printf_loc (MSG_NOTE
, vect_location
,
819 "Access function of PHI: ");
820 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, access_fn
);
821 dump_printf (MSG_NOTE
, "\n");
823 STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo
)
824 = initial_condition_in_loop_num (access_fn
, loop
->num
);
825 STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo
)
826 = evolution_part_in_loop_num (access_fn
, loop
->num
);
830 || !vect_is_simple_iv_evolution (loop
->num
, access_fn
, &init
, &step
)
831 || (LOOP_VINFO_LOOP (loop_vinfo
) != loop
832 && TREE_CODE (step
) != INTEGER_CST
))
834 worklist
.safe_push (phi
);
838 gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo
)
840 gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo
) != NULL_TREE
);
842 if (dump_enabled_p ())
843 dump_printf_loc (MSG_NOTE
, vect_location
, "Detected induction.\n");
844 STMT_VINFO_DEF_TYPE (stmt_vinfo
) = vect_induction_def
;
848 /* Second - identify all reductions and nested cycles. */
849 while (worklist
.length () > 0)
851 gimple
*phi
= worklist
.pop ();
852 tree def
= PHI_RESULT (phi
);
853 stmt_vec_info stmt_vinfo
= vinfo_for_stmt (phi
);
857 if (dump_enabled_p ())
859 dump_printf_loc (MSG_NOTE
, vect_location
, "Analyze phi: ");
860 dump_gimple_stmt (MSG_NOTE
, TDF_SLIM
, phi
, 0);
861 dump_printf (MSG_NOTE
, "\n");
864 gcc_assert (!virtual_operand_p (def
)
865 && STMT_VINFO_DEF_TYPE (stmt_vinfo
) == vect_unknown_def_type
);
867 nested_cycle
= (loop
!= LOOP_VINFO_LOOP (loop_vinfo
));
868 reduc_stmt
= vect_force_simple_reduction (loop_vinfo
, phi
, !nested_cycle
,
869 &double_reduc
, false);
874 if (dump_enabled_p ())
875 dump_printf_loc (MSG_NOTE
, vect_location
,
876 "Detected double reduction.\n");
878 STMT_VINFO_DEF_TYPE (stmt_vinfo
) = vect_double_reduction_def
;
879 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt
)) =
880 vect_double_reduction_def
;
886 if (dump_enabled_p ())
887 dump_printf_loc (MSG_NOTE
, vect_location
,
888 "Detected vectorizable nested cycle.\n");
890 STMT_VINFO_DEF_TYPE (stmt_vinfo
) = vect_nested_cycle
;
891 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt
)) =
896 if (dump_enabled_p ())
897 dump_printf_loc (MSG_NOTE
, vect_location
,
898 "Detected reduction.\n");
900 STMT_VINFO_DEF_TYPE (stmt_vinfo
) = vect_reduction_def
;
901 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt
)) =
903 /* Store the reduction cycles for possible vectorization in
905 LOOP_VINFO_REDUCTIONS (loop_vinfo
).safe_push (reduc_stmt
);
910 if (dump_enabled_p ())
911 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
912 "Unknown def-use cycle pattern.\n");
917 /* Function vect_analyze_scalar_cycles.
919 Examine the cross iteration def-use cycles of scalar variables, by
920 analyzing the loop-header PHIs of scalar variables. Classify each
921 cycle as one of the following: invariant, induction, reduction, unknown.
922 We do that for the loop represented by LOOP_VINFO, and also to its
923 inner-loop, if exists.
924 Examples for scalar cycles:
939 vect_analyze_scalar_cycles (loop_vec_info loop_vinfo
)
941 struct loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
943 vect_analyze_scalar_cycles_1 (loop_vinfo
, loop
);
945 /* When vectorizing an outer-loop, the inner-loop is executed sequentially.
946 Reductions in such inner-loop therefore have different properties than
947 the reductions in the nest that gets vectorized:
948 1. When vectorized, they are executed in the same order as in the original
949 scalar loop, so we can't change the order of computation when
951 2. FIXME: Inner-loop reductions can be used in the inner-loop, so the
952 current checks are too strict. */
955 vect_analyze_scalar_cycles_1 (loop_vinfo
, loop
->inner
);
958 /* Transfer group and reduction information from STMT to its pattern stmt. */
961 vect_fixup_reduc_chain (gimple
*stmt
)
963 gimple
*firstp
= STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt
));
965 gcc_assert (!GROUP_FIRST_ELEMENT (vinfo_for_stmt (firstp
))
966 && GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt
)));
967 GROUP_SIZE (vinfo_for_stmt (firstp
)) = GROUP_SIZE (vinfo_for_stmt (stmt
));
970 stmtp
= STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt
));
971 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmtp
)) = firstp
;
972 stmt
= GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmt
));
974 GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmtp
))
975 = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt
));
978 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmtp
)) = vect_reduction_def
;
981 /* Fixup scalar cycles that now have their stmts detected as patterns. */
984 vect_fixup_scalar_cycles_with_patterns (loop_vec_info loop_vinfo
)
989 FOR_EACH_VEC_ELT (LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo
), i
, first
)
990 if (STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (first
)))
992 gimple
*next
= GROUP_NEXT_ELEMENT (vinfo_for_stmt (first
));
995 if (! STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (next
)))
997 next
= GROUP_NEXT_ELEMENT (vinfo_for_stmt (next
));
999 /* If not all stmt in the chain are patterns try to handle
1000 the chain without patterns. */
1003 vect_fixup_reduc_chain (first
);
1004 LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo
)[i
]
1005 = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (first
));
1010 /* Function vect_get_loop_niters.
1012 Determine how many iterations the loop is executed and place it
1013 in NUMBER_OF_ITERATIONS. Place the number of latch iterations
1014 in NUMBER_OF_ITERATIONSM1.
1016 Return the loop exit condition. */
1020 vect_get_loop_niters (struct loop
*loop
, tree
*number_of_iterations
,
1021 tree
*number_of_iterationsm1
)
1025 if (dump_enabled_p ())
1026 dump_printf_loc (MSG_NOTE
, vect_location
,
1027 "=== get_loop_niters ===\n");
1029 niters
= number_of_latch_executions (loop
);
1030 *number_of_iterationsm1
= niters
;
1032 /* We want the number of loop header executions which is the number
1033 of latch executions plus one.
1034 ??? For UINT_MAX latch executions this number overflows to zero
1035 for loops like do { n++; } while (n != 0); */
1036 if (niters
&& !chrec_contains_undetermined (niters
))
1037 niters
= fold_build2 (PLUS_EXPR
, TREE_TYPE (niters
), unshare_expr (niters
),
1038 build_int_cst (TREE_TYPE (niters
), 1));
1039 *number_of_iterations
= niters
;
1041 return get_loop_exit_condition (loop
);
1045 /* Function bb_in_loop_p
1047 Used as predicate for dfs order traversal of the loop bbs. */
1050 bb_in_loop_p (const_basic_block bb
, const void *data
)
1052 const struct loop
*const loop
= (const struct loop
*)data
;
1053 if (flow_bb_inside_loop_p (loop
, bb
))
1059 /* Function new_loop_vec_info.
1061 Create and initialize a new loop_vec_info struct for LOOP, as well as
1062 stmt_vec_info structs for all the stmts in LOOP. */
1064 static loop_vec_info
1065 new_loop_vec_info (struct loop
*loop
)
1069 gimple_stmt_iterator si
;
1070 unsigned int i
, nbbs
;
1072 res
= (loop_vec_info
) xcalloc (1, sizeof (struct _loop_vec_info
));
1073 res
->kind
= vec_info::loop
;
1074 LOOP_VINFO_LOOP (res
) = loop
;
1076 bbs
= get_loop_body (loop
);
1078 /* Create/Update stmt_info for all stmts in the loop. */
1079 for (i
= 0; i
< loop
->num_nodes
; i
++)
1081 basic_block bb
= bbs
[i
];
1083 for (si
= gsi_start_phis (bb
); !gsi_end_p (si
); gsi_next (&si
))
1085 gimple
*phi
= gsi_stmt (si
);
1086 gimple_set_uid (phi
, 0);
1087 set_vinfo_for_stmt (phi
, new_stmt_vec_info (phi
, res
));
1090 for (si
= gsi_start_bb (bb
); !gsi_end_p (si
); gsi_next (&si
))
1092 gimple
*stmt
= gsi_stmt (si
);
1093 gimple_set_uid (stmt
, 0);
1094 set_vinfo_for_stmt (stmt
, new_stmt_vec_info (stmt
, res
));
1098 /* CHECKME: We want to visit all BBs before their successors (except for
1099 latch blocks, for which this assertion wouldn't hold). In the simple
1100 case of the loop forms we allow, a dfs order of the BBs would the same
1101 as reversed postorder traversal, so we are safe. */
1104 bbs
= XCNEWVEC (basic_block
, loop
->num_nodes
);
1105 nbbs
= dfs_enumerate_from (loop
->header
, 0, bb_in_loop_p
,
1106 bbs
, loop
->num_nodes
, loop
);
1107 gcc_assert (nbbs
== loop
->num_nodes
);
1109 LOOP_VINFO_BBS (res
) = bbs
;
1110 LOOP_VINFO_NITERSM1 (res
) = NULL
;
1111 LOOP_VINFO_NITERS (res
) = NULL
;
1112 LOOP_VINFO_NITERS_UNCHANGED (res
) = NULL
;
1113 LOOP_VINFO_COST_MODEL_THRESHOLD (res
) = 0;
1114 LOOP_VINFO_VECTORIZABLE_P (res
) = 0;
1115 LOOP_VINFO_PEELING_FOR_ALIGNMENT (res
) = 0;
1116 LOOP_VINFO_VECT_FACTOR (res
) = 0;
1117 LOOP_VINFO_LOOP_NEST (res
) = vNULL
;
1118 LOOP_VINFO_DATAREFS (res
) = vNULL
;
1119 LOOP_VINFO_DDRS (res
) = vNULL
;
1120 LOOP_VINFO_UNALIGNED_DR (res
) = NULL
;
1121 LOOP_VINFO_MAY_MISALIGN_STMTS (res
) = vNULL
;
1122 LOOP_VINFO_MAY_ALIAS_DDRS (res
) = vNULL
;
1123 LOOP_VINFO_GROUPED_STORES (res
) = vNULL
;
1124 LOOP_VINFO_REDUCTIONS (res
) = vNULL
;
1125 LOOP_VINFO_REDUCTION_CHAINS (res
) = vNULL
;
1126 LOOP_VINFO_SLP_INSTANCES (res
) = vNULL
;
1127 LOOP_VINFO_SLP_UNROLLING_FACTOR (res
) = 1;
1128 LOOP_VINFO_TARGET_COST_DATA (res
) = init_cost (loop
);
1129 LOOP_VINFO_PEELING_FOR_GAPS (res
) = false;
1130 LOOP_VINFO_PEELING_FOR_NITER (res
) = false;
1131 LOOP_VINFO_OPERANDS_SWAPPED (res
) = false;
1137 /* Function destroy_loop_vec_info.
1139 Free LOOP_VINFO struct, as well as all the stmt_vec_info structs of all the
1140 stmts in the loop. */
1143 destroy_loop_vec_info (loop_vec_info loop_vinfo
, bool clean_stmts
)
1148 gimple_stmt_iterator si
;
1150 vec
<slp_instance
> slp_instances
;
1151 slp_instance instance
;
1157 loop
= LOOP_VINFO_LOOP (loop_vinfo
);
1159 bbs
= LOOP_VINFO_BBS (loop_vinfo
);
1160 nbbs
= clean_stmts
? loop
->num_nodes
: 0;
1161 swapped
= LOOP_VINFO_OPERANDS_SWAPPED (loop_vinfo
);
1163 for (j
= 0; j
< nbbs
; j
++)
1165 basic_block bb
= bbs
[j
];
1166 for (si
= gsi_start_phis (bb
); !gsi_end_p (si
); gsi_next (&si
))
1167 free_stmt_vec_info (gsi_stmt (si
));
1169 for (si
= gsi_start_bb (bb
); !gsi_end_p (si
); )
1171 gimple
*stmt
= gsi_stmt (si
);
1173 /* We may have broken canonical form by moving a constant
1174 into RHS1 of a commutative op. Fix such occurrences. */
1175 if (swapped
&& is_gimple_assign (stmt
))
1177 enum tree_code code
= gimple_assign_rhs_code (stmt
);
1179 if ((code
== PLUS_EXPR
1180 || code
== POINTER_PLUS_EXPR
1181 || code
== MULT_EXPR
)
1182 && CONSTANT_CLASS_P (gimple_assign_rhs1 (stmt
)))
1183 swap_ssa_operands (stmt
,
1184 gimple_assign_rhs1_ptr (stmt
),
1185 gimple_assign_rhs2_ptr (stmt
));
1188 /* Free stmt_vec_info. */
1189 free_stmt_vec_info (stmt
);
1194 free (LOOP_VINFO_BBS (loop_vinfo
));
1195 vect_destroy_datarefs (loop_vinfo
);
1196 free_dependence_relations (LOOP_VINFO_DDRS (loop_vinfo
));
1197 LOOP_VINFO_LOOP_NEST (loop_vinfo
).release ();
1198 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo
).release ();
1199 LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo
).release ();
1200 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo
).release ();
1201 slp_instances
= LOOP_VINFO_SLP_INSTANCES (loop_vinfo
);
1202 FOR_EACH_VEC_ELT (slp_instances
, j
, instance
)
1203 vect_free_slp_instance (instance
);
1205 LOOP_VINFO_SLP_INSTANCES (loop_vinfo
).release ();
1206 LOOP_VINFO_GROUPED_STORES (loop_vinfo
).release ();
1207 LOOP_VINFO_REDUCTIONS (loop_vinfo
).release ();
1208 LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo
).release ();
1210 destroy_cost_data (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo
));
1211 loop_vinfo
->scalar_cost_vec
.release ();
/* NOTE(review): lossily-extracted GCC source -- original line numbers
   are embedded and show dropped lines; tokens kept byte-identical.
   Restore from upstream tree-vect-loop.c rather than editing here.  */
/* vect_compute_single_scalar_iteration_cost: estimate the cost of one
   scalar iteration of LOOP_VINFO's loop by walking every statement in
   its basic blocks and accumulating record_stmt_cost results.
   Statements in the inner loop (bb->loop_father == loop->inner) are
   weighted by innerloop_iters = 50 (a hard-coded FIXME estimate).
   Statements that are neither relevant nor live (and not part of a
   pattern) are skipped.  The total is stored in
   LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST.  */
1218 /* Calculate the cost of one scalar iteration of the loop. */
1220 vect_compute_single_scalar_iteration_cost (loop_vec_info loop_vinfo
)
1222 struct loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
1223 basic_block
*bbs
= LOOP_VINFO_BBS (loop_vinfo
);
1224 int nbbs
= loop
->num_nodes
, factor
, scalar_single_iter_cost
= 0;
1225 int innerloop_iters
, i
;
1227 /* Count statements in scalar loop. Using this as scalar cost for a single
1230 TODO: Add outer loop support.
1232 TODO: Consider assigning different costs to different scalar
1236 innerloop_iters
= 1;
1238 innerloop_iters
= 50; /* FIXME */
1240 for (i
= 0; i
< nbbs
; i
++)
1242 gimple_stmt_iterator si
;
1243 basic_block bb
= bbs
[i
];
1245 if (bb
->loop_father
== loop
->inner
)
1246 factor
= innerloop_iters
;
1250 for (si
= gsi_start_bb (bb
); !gsi_end_p (si
); gsi_next (&si
))
1252 gimple
*stmt
= gsi_stmt (si
);
1253 stmt_vec_info stmt_info
= vinfo_for_stmt (stmt
);
1255 if (!is_gimple_assign (stmt
) && !is_gimple_call (stmt
))
1258 /* Skip stmts that are not vectorized inside the loop. */
1260 && !STMT_VINFO_RELEVANT_P (stmt_info
)
1261 && (!STMT_VINFO_LIVE_P (stmt_info
)
1262 || !VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info
)))
1263 && !STMT_VINFO_IN_PATTERN_P (stmt_info
))
/* Classify the statement as load/store/other for the cost model
   (only the scalar_store branch is visible here; the load and
   default branches were lost in extraction).  */
1266 vect_cost_for_stmt kind
;
1267 if (STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt
)))
1269 if (DR_IS_READ (STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt
))))
1272 kind
= scalar_store
;
1277 scalar_single_iter_cost
1278 += record_stmt_cost (&LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo
),
1279 factor
, kind
, NULL
, 0, vect_prologue
);
1282 LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo
)
1283 = scalar_single_iter_cost
;
/* NOTE(review): lossily-extracted GCC source -- the original line
   numbers show that the `return false;` / brace lines after each
   error dump were dropped by the extraction.  Tokens kept
   byte-identical; restore from upstream tree-vect-loop.c.  */
/* vect_analyze_loop_form_1: verify CFG restrictions on LOOP and
   extract its exit condition and iteration counts.
   - Inner-most loops must have exactly 2 BBs (header + latch) and a
     non-empty header.
   - Outer loops must be doubly nested (single inner loop, 5 BBs),
     with the inner loop's preheader fed from the outer header and the
     inner exit feeding the outer latch; the inner loop is analyzed
     recursively and its niter must be invariant in the outer loop.
   - The loop must have a single exit, two header preds, an empty
     latch; a single-predecessor exit BB is created by splitting the
     exit edge when needed (abnormal exit edges are rejected).
   Outputs: *LOOP_COND (the exit gcond, via vect_get_loop_niters),
   *NUMBER_OF_ITERATIONS, *NUMBER_OF_ITERATIONSM1, and optionally
   *INNER_LOOP_COND for nested loops.  Fails when the iteration count
   is undetermined or provably zero.  */
1287 /* Function vect_analyze_loop_form_1.
1289 Verify that certain CFG restrictions hold, including:
1290 - the loop has a pre-header
1291 - the loop has a single entry and exit
1292 - the loop exit condition is simple enough, and the number of iterations
1293 can be analyzed (a countable loop). */
1296 vect_analyze_loop_form_1 (struct loop
*loop
, gcond
**loop_cond
,
1297 tree
*number_of_iterationsm1
,
1298 tree
*number_of_iterations
, gcond
**inner_loop_cond
)
1300 if (dump_enabled_p ())
1301 dump_printf_loc (MSG_NOTE
, vect_location
,
1302 "=== vect_analyze_loop_form ===\n");
1304 /* Different restrictions apply when we are considering an inner-most loop,
1305 vs. an outer (nested) loop.
1306 (FORNOW. May want to relax some of these restrictions in the future). */
1310 /* Inner-most loop. We currently require that the number of BBs is
1311 exactly 2 (the header and latch). Vectorizable inner-most loops
1322 if (loop
->num_nodes
!= 2)
1324 if (dump_enabled_p ())
1325 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1326 "not vectorized: control flow in loop.\n");
1330 if (empty_block_p (loop
->header
))
1332 if (dump_enabled_p ())
1333 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1334 "not vectorized: empty loop.\n");
1340 struct loop
*innerloop
= loop
->inner
;
1343 /* Nested loop. We currently require that the loop is doubly-nested,
1344 contains a single inner loop, and the number of BBs is exactly 5.
1345 Vectorizable outer-loops look like this:
1357 The inner-loop has the properties expected of inner-most loops
1358 as described above. */
1360 if ((loop
->inner
)->inner
|| (loop
->inner
)->next
)
1362 if (dump_enabled_p ())
1363 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1364 "not vectorized: multiple nested loops.\n");
1368 if (loop
->num_nodes
!= 5)
1370 if (dump_enabled_p ())
1371 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1372 "not vectorized: control flow in loop.\n");
1376 entryedge
= loop_preheader_edge (innerloop
);
1377 if (entryedge
->src
!= loop
->header
1378 || !single_exit (innerloop
)
1379 || single_exit (innerloop
)->dest
!= EDGE_PRED (loop
->latch
, 0)->src
)
1381 if (dump_enabled_p ())
1382 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1383 "not vectorized: unsupported outerloop form.\n");
1387 /* Analyze the inner-loop. */
1388 tree inner_niterm1
, inner_niter
;
1389 if (! vect_analyze_loop_form_1 (loop
->inner
, inner_loop_cond
,
1390 &inner_niterm1
, &inner_niter
, NULL
))
1392 if (dump_enabled_p ())
1393 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1394 "not vectorized: Bad inner loop.\n");
1398 if (!expr_invariant_in_loop_p (loop
, inner_niter
))
1400 if (dump_enabled_p ())
1401 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1402 "not vectorized: inner-loop count not"
1407 if (dump_enabled_p ())
1408 dump_printf_loc (MSG_NOTE
, vect_location
,
1409 "Considering outer-loop vectorization.\n");
1412 if (!single_exit (loop
)
1413 || EDGE_COUNT (loop
->header
->preds
) != 2)
1415 if (dump_enabled_p ())
1417 if (!single_exit (loop
))
1418 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1419 "not vectorized: multiple exits.\n");
1420 else if (EDGE_COUNT (loop
->header
->preds
) != 2)
1421 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1422 "not vectorized: too many incoming edges.\n");
1427 /* We assume that the loop exit condition is at the end of the loop. i.e,
1428 that the loop is represented as a do-while (with a proper if-guard
1429 before the loop if needed), where the loop header contains all the
1430 executable statements, and the latch is empty. */
1431 if (!empty_block_p (loop
->latch
)
1432 || !gimple_seq_empty_p (phi_nodes (loop
->latch
)))
1434 if (dump_enabled_p ())
1435 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1436 "not vectorized: latch block not empty.\n");
1440 /* Make sure there exists a single-predecessor exit bb: */
1441 if (!single_pred_p (single_exit (loop
)->dest
))
1443 edge e
= single_exit (loop
);
1444 if (!(e
->flags
& EDGE_ABNORMAL
))
1446 split_loop_exit_edge (e
);
1447 if (dump_enabled_p ())
1448 dump_printf (MSG_NOTE
, "split exit edge.\n");
1452 if (dump_enabled_p ())
1453 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1454 "not vectorized: abnormal loop exit edge.\n");
1459 *loop_cond
= vect_get_loop_niters (loop
, number_of_iterations
,
1460 number_of_iterationsm1
);
1463 if (dump_enabled_p ())
1464 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1465 "not vectorized: complicated exit condition.\n");
1469 if (!*number_of_iterations
1470 || chrec_contains_undetermined (*number_of_iterations
))
1472 if (dump_enabled_p ())
1473 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1474 "not vectorized: number of iterations cannot be "
1479 if (integer_zerop (*number_of_iterations
))
1481 if (dump_enabled_p ())
1482 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1483 "not vectorized: number of iterations = 0.\n");
/* NOTE(review): lossily-extracted GCC source; tokens kept
   byte-identical.  Restore from upstream tree-vect-loop.c.  */
/* vect_analyze_loop_form: wrapper around vect_analyze_loop_form_1.
   On success it allocates a new loop_vec_info for LOOP, records the
   iteration counts (NITERS, NITERSM1, NITERS_UNCHANGED), marks the
   exit condition(s) as loop_exit_ctrl_vec_info_type so they are not
   vectorized, and stashes the loop_vec_info in loop->aux (asserted
   to be previously empty).  Returns the loop_vec_info (failure path
   returning NULL was lost in extraction -- presumably; confirm
   against upstream).  */
1490 /* Analyze LOOP form and return a loop_vec_info if it is of suitable form. */
1493 vect_analyze_loop_form (struct loop
*loop
)
1495 tree number_of_iterations
, number_of_iterationsm1
;
1496 gcond
*loop_cond
, *inner_loop_cond
= NULL
;
1498 if (! vect_analyze_loop_form_1 (loop
, &loop_cond
, &number_of_iterationsm1
,
1499 &number_of_iterations
, &inner_loop_cond
))
1502 loop_vec_info loop_vinfo
= new_loop_vec_info (loop
);
1503 LOOP_VINFO_NITERSM1 (loop_vinfo
) = number_of_iterationsm1
;
1504 LOOP_VINFO_NITERS (loop_vinfo
) = number_of_iterations
;
1505 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo
) = number_of_iterations
;
1507 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo
))
1509 if (dump_enabled_p ())
1511 dump_printf_loc (MSG_NOTE
, vect_location
,
1512 "Symbolic number of iterations is ");
1513 dump_generic_expr (MSG_NOTE
, TDF_DETAILS
, number_of_iterations
);
1514 dump_printf (MSG_NOTE
, "\n");
1518 STMT_VINFO_TYPE (vinfo_for_stmt (loop_cond
)) = loop_exit_ctrl_vec_info_type
;
1519 if (inner_loop_cond
)
1520 STMT_VINFO_TYPE (vinfo_for_stmt (inner_loop_cond
))
1521 = loop_exit_ctrl_vec_info_type
;
1523 gcc_assert (!loop
->aux
);
1524 loop
->aux
= loop_vinfo
;
/* NOTE(review): lossily-extracted GCC source; tokens kept
   byte-identical.  Restore from upstream tree-vect-loop.c.  */
/* vect_update_vf_for_slp: after the SLP decision, recompute the loop's
   vectorization factor.  If every relevant statement (following
   pattern stmts to their related stmt) is a pure-SLP stmt, the VF
   becomes the SLP unrolling factor; otherwise (hybrid SLP) the VF is
   the least common multiple of the current VF and the SLP unrolling
   factor.  The result is stored back in LOOP_VINFO_VECT_FACTOR.  */
1530 /* Scan the loop stmts and dependent on whether there are any (non-)SLP
1531 statements update the vectorization factor. */
1534 vect_update_vf_for_slp (loop_vec_info loop_vinfo
)
1536 struct loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
1537 basic_block
*bbs
= LOOP_VINFO_BBS (loop_vinfo
);
1538 int nbbs
= loop
->num_nodes
;
1539 unsigned int vectorization_factor
;
1542 if (dump_enabled_p ())
1543 dump_printf_loc (MSG_NOTE
, vect_location
,
1544 "=== vect_update_vf_for_slp ===\n");
1546 vectorization_factor
= LOOP_VINFO_VECT_FACTOR (loop_vinfo
);
1547 gcc_assert (vectorization_factor
!= 0);
1549 /* If all the stmts in the loop can be SLPed, we perform only SLP, and
1550 vectorization factor of the loop is the unrolling factor required by
1551 the SLP instances. If that unrolling factor is 1, we say, that we
1552 perform pure SLP on loop - cross iteration parallelism is not
1554 bool only_slp_in_loop
= true;
1555 for (i
= 0; i
< nbbs
; i
++)
1557 basic_block bb
= bbs
[i
];
1558 for (gimple_stmt_iterator si
= gsi_start_bb (bb
); !gsi_end_p (si
);
1561 gimple
*stmt
= gsi_stmt (si
);
1562 stmt_vec_info stmt_info
= vinfo_for_stmt (stmt
);
1563 if (STMT_VINFO_IN_PATTERN_P (stmt_info
)
1564 && STMT_VINFO_RELATED_STMT (stmt_info
))
1566 stmt
= STMT_VINFO_RELATED_STMT (stmt_info
);
1567 stmt_info
= vinfo_for_stmt (stmt
);
1569 if ((STMT_VINFO_RELEVANT_P (stmt_info
)
1570 || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info
)))
1571 && !PURE_SLP_STMT (stmt_info
))
1572 /* STMT needs both SLP and loop-based vectorization. */
1573 only_slp_in_loop
= false;
1577 if (only_slp_in_loop
)
1578 vectorization_factor
= LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo
);
1580 vectorization_factor
1581 = least_common_multiple (vectorization_factor
,
1582 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo
));
1584 LOOP_VINFO_VECT_FACTOR (loop_vinfo
) = vectorization_factor
;
1585 if (dump_enabled_p ())
1586 dump_printf_loc (MSG_NOTE
, vect_location
,
1587 "Updating vectorization factor to %d\n",
1588 vectorization_factor
);
/* NOTE(review): lossily-extracted GCC source; the `return false;`
   lines after each error dump were dropped by the extraction.  Tokens
   kept byte-identical; restore from upstream tree-vect-loop.c.  */
/* vect_analyze_loop_operations: walk every phi and statement of the
   loop and check vectorizability.
   Phis: virtual phis are skipped; inner-loop loop-closed exit phis
   (outer-loop vectorization) are checked for supported use; live
   phis, non-induction scalar cycles, and relevant phis that fail
   vectorizable_induction reject the loop.  Other statements go
   through vect_analyze_stmt (clobbers skipped).  If nothing needed
   vectorizing at all, the loop is rejected as redundant -- scalar
   optimizations can remove it.  */
1591 /* Function vect_analyze_loop_operations.
1593 Scan the loop stmts and make sure they are all vectorizable. */
1596 vect_analyze_loop_operations (loop_vec_info loop_vinfo
)
1598 struct loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
1599 basic_block
*bbs
= LOOP_VINFO_BBS (loop_vinfo
);
1600 int nbbs
= loop
->num_nodes
;
1602 stmt_vec_info stmt_info
;
1603 bool need_to_vectorize
= false;
1606 if (dump_enabled_p ())
1607 dump_printf_loc (MSG_NOTE
, vect_location
,
1608 "=== vect_analyze_loop_operations ===\n");
1610 for (i
= 0; i
< nbbs
; i
++)
1612 basic_block bb
= bbs
[i
];
1614 for (gphi_iterator si
= gsi_start_phis (bb
); !gsi_end_p (si
);
1617 gphi
*phi
= si
.phi ();
1620 stmt_info
= vinfo_for_stmt (phi
);
1621 if (dump_enabled_p ())
1623 dump_printf_loc (MSG_NOTE
, vect_location
, "examining phi: ");
1624 dump_gimple_stmt (MSG_NOTE
, TDF_SLIM
, phi
, 0);
1625 dump_printf (MSG_NOTE
, "\n");
1627 if (virtual_operand_p (gimple_phi_result (phi
)))
1630 /* Inner-loop loop-closed exit phi in outer-loop vectorization
1631 (i.e., a phi in the tail of the outer-loop). */
1632 if (! is_loop_header_bb_p (bb
))
1634 /* FORNOW: we currently don't support the case that these phis
1635 are not used in the outerloop (unless it is double reduction,
1636 i.e., this phi is vect_reduction_def), cause this case
1637 requires to actually do something here. */
1638 if ((!STMT_VINFO_RELEVANT_P (stmt_info
)
1639 || STMT_VINFO_LIVE_P (stmt_info
))
1640 && STMT_VINFO_DEF_TYPE (stmt_info
)
1641 != vect_double_reduction_def
)
1643 if (dump_enabled_p ())
1644 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1645 "Unsupported loop-closed phi in "
1650 /* If PHI is used in the outer loop, we check that its operand
1651 is defined in the inner loop. */
1652 if (STMT_VINFO_RELEVANT_P (stmt_info
))
1655 gimple
*op_def_stmt
;
1657 if (gimple_phi_num_args (phi
) != 1)
1660 phi_op
= PHI_ARG_DEF (phi
, 0);
1661 if (TREE_CODE (phi_op
) != SSA_NAME
)
1664 op_def_stmt
= SSA_NAME_DEF_STMT (phi_op
);
1665 if (gimple_nop_p (op_def_stmt
)
1666 || !flow_bb_inside_loop_p (loop
, gimple_bb (op_def_stmt
))
1667 || !vinfo_for_stmt (op_def_stmt
))
1670 if (STMT_VINFO_RELEVANT (vinfo_for_stmt (op_def_stmt
))
1671 != vect_used_in_outer
1672 && STMT_VINFO_RELEVANT (vinfo_for_stmt (op_def_stmt
))
1673 != vect_used_in_outer_by_reduction
)
1680 gcc_assert (stmt_info
);
1682 if (STMT_VINFO_LIVE_P (stmt_info
))
1684 /* FORNOW: not yet supported. */
1685 if (dump_enabled_p ())
1686 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1687 "not vectorized: value used after loop.\n");
1691 if (STMT_VINFO_RELEVANT (stmt_info
) == vect_used_in_scope
1692 && STMT_VINFO_DEF_TYPE (stmt_info
) != vect_induction_def
)
1694 /* A scalar-dependence cycle that we don't support. */
1695 if (dump_enabled_p ())
1696 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1697 "not vectorized: scalar dependence cycle.\n");
1701 if (STMT_VINFO_RELEVANT_P (stmt_info
))
1703 need_to_vectorize
= true;
1704 if (STMT_VINFO_DEF_TYPE (stmt_info
) == vect_induction_def
)
1705 ok
= vectorizable_induction (phi
, NULL
, NULL
);
1710 if (dump_enabled_p ())
1712 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1713 "not vectorized: relevant phi not "
1715 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, phi
, 0);
1716 dump_printf (MSG_MISSED_OPTIMIZATION
, "\n");
1722 for (gimple_stmt_iterator si
= gsi_start_bb (bb
); !gsi_end_p (si
);
1725 gimple
*stmt
= gsi_stmt (si
);
1726 if (!gimple_clobber_p (stmt
)
1727 && !vect_analyze_stmt (stmt
, &need_to_vectorize
, NULL
))
1732 /* All operations in the loop are either irrelevant (deal with loop
1733 control, or dead), or only used outside the loop and can be moved
1734 out of the loop (e.g. invariants, inductions). The loop can be
1735 optimized away by scalar optimizations. We're better off not
1736 touching this loop. */
1737 if (!need_to_vectorize
)
1739 if (dump_enabled_p ())
1740 dump_printf_loc (MSG_NOTE
, vect_location
,
1741 "All the computation can be taken out of the loop.\n");
1742 if (dump_enabled_p ())
1743 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1744 "not vectorized: redundant loop. no profit to "
/* NOTE(review): lossily-extracted GCC source -- many interior lines
   (braces, `return false;`, `if (!ok)` guards, labels such as the
   `again:` retry target) were dropped by the extraction, as the gaps
   in the embedded original line numbers show.  Tokens are kept
   byte-identical; restore this function from upstream
   tree-vect-loop.c before making any behavioral change.  */
/* vect_analyze_loop_2: the main per-size analysis driver.  Runs, in
   order: data-ref discovery (rejecting loops with calls/refs it
   cannot analyze, with a safelen/simd-clone exception path), data-ref
   analysis, scalar-cycle classification, pattern recognition, access
   analysis, relevance marking, dependence analysis (bounding max_vf),
   vectorization-factor determination, scalar iteration cost, SLP
   analysis and VF update, alignment analysis/enhancement, alias-test
   pruning, SLP-operation analysis, remaining-operation analysis, the
   cost model (min profitable iterations vs. PARAM_MIN_VECT_LOOP_BOUND,
   recorded as the cost-model threshold), epilogue-peeling decisions,
   and a check that an epilogue can actually be created.  If SLP
   analysis invalidated the first attempt, state is rolled back (VF
   restored, SLP instances freed, all stmts reset to loop_vect, cost
   data re-initialized, flags cleared) and the analysis is retried
   with SLP off -- unless reduction chains or unsupported grouped
   accesses make a retry pointless.  FATAL is set by callers to decide
   whether retrying with another vector size is worthwhile.  */
1753 /* Function vect_analyze_loop_2.
1755 Apply a set of analyses on LOOP, and create a loop_vec_info struct
1756 for it. The different analyses will record information in the
1757 loop_vec_info struct. */
1759 vect_analyze_loop_2 (loop_vec_info loop_vinfo
, bool &fatal
)
1762 int max_vf
= MAX_VECTORIZATION_FACTOR
;
1764 unsigned int n_stmts
= 0;
1766 /* The first group of checks is independent of the vector size. */
1769 /* Find all data references in the loop (which correspond to vdefs/vuses)
1770 and analyze their evolution in the loop. */
1772 basic_block
*bbs
= LOOP_VINFO_BBS (loop_vinfo
);
1774 loop_p loop
= LOOP_VINFO_LOOP (loop_vinfo
);
1775 if (!find_loop_nest (loop
, &LOOP_VINFO_LOOP_NEST (loop_vinfo
)))
1777 if (dump_enabled_p ())
1778 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1779 "not vectorized: loop nest containing two "
1780 "or more consecutive inner loops cannot be "
1785 for (unsigned i
= 0; i
< loop
->num_nodes
; i
++)
1786 for (gimple_stmt_iterator gsi
= gsi_start_bb (bbs
[i
]);
1787 !gsi_end_p (gsi
); gsi_next (&gsi
))
1789 gimple
*stmt
= gsi_stmt (gsi
);
1790 if (is_gimple_debug (stmt
))
1793 if (!find_data_references_in_stmt (loop
, stmt
,
1794 &LOOP_VINFO_DATAREFS (loop_vinfo
)))
1796 if (is_gimple_call (stmt
) && loop
->safelen
)
1798 tree fndecl
= gimple_call_fndecl (stmt
), op
;
1799 if (fndecl
!= NULL_TREE
)
1801 cgraph_node
*node
= cgraph_node::get (fndecl
);
1802 if (node
!= NULL
&& node
->simd_clones
!= NULL
)
1804 unsigned int j
, n
= gimple_call_num_args (stmt
);
1805 for (j
= 0; j
< n
; j
++)
1807 op
= gimple_call_arg (stmt
, j
);
1809 || (REFERENCE_CLASS_P (op
)
1810 && get_base_address (op
)))
1813 op
= gimple_call_lhs (stmt
);
1814 /* Ignore #pragma omp declare simd functions
1815 if they don't have data references in the
1816 call stmt itself. */
1820 || (REFERENCE_CLASS_P (op
)
1821 && get_base_address (op
)))))
1826 if (dump_enabled_p ())
1827 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1828 "not vectorized: loop contains function "
1829 "calls or data references that cannot "
1835 /* Analyze the data references and also adjust the minimal
1836 vectorization factor according to the loads and stores. */
1838 ok
= vect_analyze_data_refs (loop_vinfo
, &min_vf
);
1841 if (dump_enabled_p ())
1842 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1843 "bad data references.\n");
1847 /* Classify all cross-iteration scalar data-flow cycles.
1848 Cross-iteration cycles caused by virtual phis are analyzed separately. */
1849 vect_analyze_scalar_cycles (loop_vinfo
);
1851 vect_pattern_recog (loop_vinfo
);
1853 vect_fixup_scalar_cycles_with_patterns (loop_vinfo
);
1855 /* Analyze the access patterns of the data-refs in the loop (consecutive,
1856 complex, etc.). FORNOW: Only handle consecutive access pattern. */
1858 ok
= vect_analyze_data_ref_accesses (loop_vinfo
);
1861 if (dump_enabled_p ())
1862 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1863 "bad data access.\n");
1867 /* Data-flow analysis to detect stmts that do not need to be vectorized. */
1869 ok
= vect_mark_stmts_to_be_vectorized (loop_vinfo
);
1872 if (dump_enabled_p ())
1873 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1874 "unexpected pattern.\n");
1878 /* While the rest of the analysis below depends on it in some way. */
1881 /* Analyze data dependences between the data-refs in the loop
1882 and adjust the maximum vectorization factor according to
1884 FORNOW: fail at the first data dependence that we encounter. */
1886 ok
= vect_analyze_data_ref_dependences (loop_vinfo
, &max_vf
);
1890 if (dump_enabled_p ())
1891 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1892 "bad data dependence.\n");
1896 ok
= vect_determine_vectorization_factor (loop_vinfo
);
1899 if (dump_enabled_p ())
1900 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1901 "can't determine vectorization factor.\n");
1904 if (max_vf
< LOOP_VINFO_VECT_FACTOR (loop_vinfo
))
1906 if (dump_enabled_p ())
1907 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1908 "bad data dependence.\n");
1912 /* Compute the scalar iteration cost. */
1913 vect_compute_single_scalar_iteration_cost (loop_vinfo
);
1915 int saved_vectorization_factor
= LOOP_VINFO_VECT_FACTOR (loop_vinfo
);
1916 HOST_WIDE_INT estimated_niter
;
1918 int min_scalar_loop_bound
;
1920 /* Check the SLP opportunities in the loop, analyze and build SLP trees. */
1921 ok
= vect_analyze_slp (loop_vinfo
, n_stmts
);
1925 /* If there are any SLP instances mark them as pure_slp. */
1926 bool slp
= vect_make_slp_decision (loop_vinfo
);
1929 /* Find stmts that need to be both vectorized and SLPed. */
1930 vect_detect_hybrid_slp (loop_vinfo
);
1932 /* Update the vectorization factor based on the SLP decision. */
1933 vect_update_vf_for_slp (loop_vinfo
);
1936 /* This is the point where we can re-start analysis with SLP forced off. */
1939 /* Now the vectorization factor is final. */
1940 unsigned vectorization_factor
= LOOP_VINFO_VECT_FACTOR (loop_vinfo
);
1941 gcc_assert (vectorization_factor
!= 0);
1943 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo
) && dump_enabled_p ())
1944 dump_printf_loc (MSG_NOTE
, vect_location
,
1945 "vectorization_factor = %d, niters = "
1946 HOST_WIDE_INT_PRINT_DEC
"\n", vectorization_factor
,
1947 LOOP_VINFO_INT_NITERS (loop_vinfo
));
1949 HOST_WIDE_INT max_niter
1950 = max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo
));
1951 if ((LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo
)
1952 && (LOOP_VINFO_INT_NITERS (loop_vinfo
) < vectorization_factor
))
1954 && (unsigned HOST_WIDE_INT
) max_niter
< vectorization_factor
))
1956 if (dump_enabled_p ())
1957 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1958 "not vectorized: iteration count smaller than "
1959 "vectorization factor.\n");
1963 /* Analyze the alignment of the data-refs in the loop.
1964 Fail if a data reference is found that cannot be vectorized. */
1966 ok
= vect_analyze_data_refs_alignment (loop_vinfo
);
1969 if (dump_enabled_p ())
1970 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1971 "bad data alignment.\n");
1975 /* Prune the list of ddrs to be tested at run-time by versioning for alias.
1976 It is important to call pruning after vect_analyze_data_ref_accesses,
1977 since we use grouping information gathered by interleaving analysis. */
1978 ok
= vect_prune_runtime_alias_test_list (loop_vinfo
);
1981 if (dump_enabled_p ())
1982 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1983 "number of versioning for alias "
1984 "run-time tests exceeds %d "
1985 "(--param vect-max-version-for-alias-checks)\n",
1986 PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS
));
1990 /* This pass will decide on using loop versioning and/or loop peeling in
1991 order to enhance the alignment of data references in the loop. */
1992 ok
= vect_enhance_data_refs_alignment (loop_vinfo
);
1995 if (dump_enabled_p ())
1996 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1997 "bad data alignment.\n");
2003 /* Analyze operations in the SLP instances. Note this may
2004 remove unsupported SLP instances which makes the above
2005 SLP kind detection invalid. */
2006 unsigned old_size
= LOOP_VINFO_SLP_INSTANCES (loop_vinfo
).length ();
2007 vect_slp_analyze_operations (LOOP_VINFO_SLP_INSTANCES (loop_vinfo
),
2008 LOOP_VINFO_TARGET_COST_DATA (loop_vinfo
));
2009 if (LOOP_VINFO_SLP_INSTANCES (loop_vinfo
).length () != old_size
)
2013 /* Scan all the remaining operations in the loop that are not subject
2014 to SLP and make sure they are vectorizable. */
2015 ok
= vect_analyze_loop_operations (loop_vinfo
);
2018 if (dump_enabled_p ())
2019 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2020 "bad operation or unsupported loop bound.\n");
2024 /* Analyze cost. Decide if worth while to vectorize. */
2025 int min_profitable_estimate
, min_profitable_iters
;
2026 vect_estimate_min_profitable_iters (loop_vinfo
, &min_profitable_iters
,
2027 &min_profitable_estimate
);
2029 if (min_profitable_iters
< 0)
2031 if (dump_enabled_p ())
2032 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2033 "not vectorized: vectorization not profitable.\n");
2034 if (dump_enabled_p ())
2035 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2036 "not vectorized: vector version will never be "
2041 min_scalar_loop_bound
= ((PARAM_VALUE (PARAM_MIN_VECT_LOOP_BOUND
)
2042 * vectorization_factor
) - 1);
2044 /* Use the cost model only if it is more conservative than user specified
2046 th
= (unsigned) min_scalar_loop_bound
;
2047 if (min_profitable_iters
2048 && (!min_scalar_loop_bound
2049 || min_profitable_iters
> min_scalar_loop_bound
))
2050 th
= (unsigned) min_profitable_iters
;
2052 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo
) = th
;
2054 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo
)
2055 && LOOP_VINFO_INT_NITERS (loop_vinfo
) <= th
)
2057 if (dump_enabled_p ())
2058 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2059 "not vectorized: vectorization not profitable.\n");
2060 if (dump_enabled_p ())
2061 dump_printf_loc (MSG_NOTE
, vect_location
,
2062 "not vectorized: iteration count smaller than user "
2063 "specified loop bound parameter or minimum profitable "
2064 "iterations (whichever is more conservative).\n");
2069 = estimated_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo
));
2070 if (estimated_niter
!= -1
2071 && ((unsigned HOST_WIDE_INT
) estimated_niter
2072 <= MAX (th
, (unsigned)min_profitable_estimate
)))
2074 if (dump_enabled_p ())
2075 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2076 "not vectorized: estimated iteration count too "
2078 if (dump_enabled_p ())
2079 dump_printf_loc (MSG_NOTE
, vect_location
,
2080 "not vectorized: estimated iteration count smaller "
2081 "than specified loop bound parameter or minimum "
2082 "profitable iterations (whichever is more "
2083 "conservative).\n");
2087 /* Decide whether we need to create an epilogue loop to handle
2088 remaining scalar iterations. */
2089 th
= ((LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo
) + 1)
2090 / LOOP_VINFO_VECT_FACTOR (loop_vinfo
))
2091 * LOOP_VINFO_VECT_FACTOR (loop_vinfo
);
2093 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo
)
2094 && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo
) > 0)
2096 if (ctz_hwi (LOOP_VINFO_INT_NITERS (loop_vinfo
)
2097 - LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo
))
2098 < exact_log2 (LOOP_VINFO_VECT_FACTOR (loop_vinfo
)))
2099 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo
) = true;
2101 else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo
)
2102 || (tree_ctz (LOOP_VINFO_NITERS (loop_vinfo
))
2103 < (unsigned)exact_log2 (LOOP_VINFO_VECT_FACTOR (loop_vinfo
))
2104 /* In case of versioning, check if the maximum number of
2105 iterations is greater than th. If they are identical,
2106 the epilogue is unnecessary. */
2107 && ((!LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo
)
2108 && !LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo
))
2109 || (unsigned HOST_WIDE_INT
) max_niter
> th
)))
2110 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo
) = true;
2112 /* If an epilogue loop is required make sure we can create one. */
2113 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo
)
2114 || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo
))
2116 if (dump_enabled_p ())
2117 dump_printf_loc (MSG_NOTE
, vect_location
, "epilog loop required\n");
2118 if (!vect_can_advance_ivs_p (loop_vinfo
)
2119 || !slpeel_can_duplicate_loop_p (LOOP_VINFO_LOOP (loop_vinfo
),
2120 single_exit (LOOP_VINFO_LOOP
2123 if (dump_enabled_p ())
2124 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2125 "not vectorized: can't create required "
2131 gcc_assert (vectorization_factor
2132 == (unsigned)LOOP_VINFO_VECT_FACTOR (loop_vinfo
));
2134 /* Ok to vectorize! */
2138 /* Try again with SLP forced off but if we didn't do any SLP there is
2139 no point in re-trying. */
2143 /* If there are reduction chains re-trying will fail anyway. */
2144 if (! LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo
).is_empty ())
2147 /* Likewise if the grouped loads or stores in the SLP cannot be handled
2148 via interleaving or lane instructions. */
2149 slp_instance instance
;
2152 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo
), i
, instance
)
2154 stmt_vec_info vinfo
;
2155 vinfo
= vinfo_for_stmt
2156 (SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (instance
))[0]);
2157 if (! STMT_VINFO_GROUPED_ACCESS (vinfo
))
2159 vinfo
= vinfo_for_stmt (STMT_VINFO_GROUP_FIRST_ELEMENT (vinfo
));
2160 unsigned int size
= STMT_VINFO_GROUP_SIZE (vinfo
);
2161 tree vectype
= STMT_VINFO_VECTYPE (vinfo
);
2162 if (! vect_store_lanes_supported (vectype
, size
)
2163 && ! vect_grouped_store_supported (vectype
, size
))
2165 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance
), j
, node
)
2167 vinfo
= vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (node
)[0]);
2168 vinfo
= vinfo_for_stmt (STMT_VINFO_GROUP_FIRST_ELEMENT (vinfo
));
2169 size
= STMT_VINFO_GROUP_SIZE (vinfo
);
2170 vectype
= STMT_VINFO_VECTYPE (vinfo
);
2171 if (! vect_load_lanes_supported (vectype
, size
)
2172 && ! vect_grouped_load_supported (vectype
, size
))
2177 if (dump_enabled_p ())
2178 dump_printf_loc (MSG_NOTE
, vect_location
,
2179 "re-trying with SLP disabled\n");
2181 /* Roll back state appropriately. No SLP this time. */
2183 /* Restore vectorization factor as it were without SLP. */
2184 LOOP_VINFO_VECT_FACTOR (loop_vinfo
) = saved_vectorization_factor
;
2185 /* Free the SLP instances. */
2186 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo
), j
, instance
)
2187 vect_free_slp_instance (instance
);
2188 LOOP_VINFO_SLP_INSTANCES (loop_vinfo
).release ();
2189 /* Reset SLP type to loop_vect on all stmts. */
2190 for (i
= 0; i
< LOOP_VINFO_LOOP (loop_vinfo
)->num_nodes
; ++i
)
2192 basic_block bb
= LOOP_VINFO_BBS (loop_vinfo
)[i
];
2193 for (gimple_stmt_iterator si
= gsi_start_bb (bb
);
2194 !gsi_end_p (si
); gsi_next (&si
))
2196 stmt_vec_info stmt_info
= vinfo_for_stmt (gsi_stmt (si
));
2197 STMT_SLP_TYPE (stmt_info
) = loop_vect
;
2198 if (STMT_VINFO_IN_PATTERN_P (stmt_info
))
2200 stmt_info
= vinfo_for_stmt (STMT_VINFO_RELATED_STMT (stmt_info
));
2201 STMT_SLP_TYPE (stmt_info
) = loop_vect
;
2202 for (gimple_stmt_iterator pi
2203 = gsi_start (STMT_VINFO_PATTERN_DEF_SEQ (stmt_info
));
2204 !gsi_end_p (pi
); gsi_next (&pi
))
2206 gimple
*pstmt
= gsi_stmt (pi
);
2207 STMT_SLP_TYPE (vinfo_for_stmt (pstmt
)) = loop_vect
;
2212 /* Free optimized alias test DDRS. */
2213 LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo
).release ();
2214 /* Reset target cost data. */
2215 destroy_cost_data (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo
));
2216 LOOP_VINFO_TARGET_COST_DATA (loop_vinfo
)
2217 = init_cost (LOOP_VINFO_LOOP (loop_vinfo
));
2218 /* Reset assorted flags. */
2219 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo
) = false;
2220 LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo
) = false;
2221 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo
) = 0;
/* NOTE(review): lossily-extracted GCC source; tokens kept
   byte-identical.  Restore from upstream tree-vect-loop.c.  */
/* vect_analyze_loop: top-level analysis entry for one loop.  Skips
   loops whose outer loop was already vectorized.  Otherwise it
   iterates over the target's supported vector sizes (autodetected
   first: current_vector_size = 0, then successively smaller sizes
   from targetm.vectorize.autovectorize_vector_sizes): each attempt
   checks loop form (vect_analyze_loop_form), runs vect_analyze_loop_2,
   and on success marks the loop_vec_info vectorizable; on failure it
   destroys the loop_vec_info, removes the tried size from the mask,
   and retries with the next biggest remaining size until none are
   left (the give-up `return NULL;` path was lost in extraction).  */
2226 /* Function vect_analyze_loop.
2228 Apply a set of analyses on LOOP, and create a loop_vec_info struct
2229 for it. The different analyses will record information in the
2230 loop_vec_info struct. */
2232 vect_analyze_loop (struct loop
*loop
)
2234 loop_vec_info loop_vinfo
;
2235 unsigned int vector_sizes
;
2237 /* Autodetect first vector size we try. */
2238 current_vector_size
= 0;
2239 vector_sizes
= targetm
.vectorize
.autovectorize_vector_sizes ();
2241 if (dump_enabled_p ())
2242 dump_printf_loc (MSG_NOTE
, vect_location
,
2243 "===== analyze_loop_nest =====\n");
2245 if (loop_outer (loop
)
2246 && loop_vec_info_for_loop (loop_outer (loop
))
2247 && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop
))))
2249 if (dump_enabled_p ())
2250 dump_printf_loc (MSG_NOTE
, vect_location
,
2251 "outer-loop already vectorized.\n");
2257 /* Check the CFG characteristics of the loop (nesting, entry/exit). */
2258 loop_vinfo
= vect_analyze_loop_form (loop
);
2261 if (dump_enabled_p ())
2262 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2263 "bad loop form.\n");
2268 if (vect_analyze_loop_2 (loop_vinfo
, fatal
))
2270 LOOP_VINFO_VECTORIZABLE_P (loop_vinfo
) = 1;
2275 destroy_loop_vec_info (loop_vinfo
, true);
2277 vector_sizes
&= ~current_vector_size
;
2279 || vector_sizes
== 0
2280 || current_vector_size
== 0)
2283 /* Try the next biggest vector size. */
2284 current_vector_size
= 1 << floor_log2 (vector_sizes
);
2285 if (dump_enabled_p ())
2286 dump_printf_loc (MSG_NOTE
, vect_location
,
2287 "***** Re-trying analysis with "
2288 "vector size %d\n", current_vector_size
);
2293 /* Function reduction_code_for_scalar_code
2296 CODE - tree_code of a reduction operations.
2299 REDUC_CODE - the corresponding tree-code to be used to reduce the
2300 vector of partial results into a single scalar result, or ERROR_MARK
2301 if the operation is a supported reduction operation, but does not have
2304 Return FALSE if CODE currently cannot be vectorized as reduction. */
2307 reduction_code_for_scalar_code (enum tree_code code
,
2308 enum tree_code
*reduc_code
)
2313 *reduc_code
= REDUC_MAX_EXPR
;
2317 *reduc_code
= REDUC_MIN_EXPR
;
2321 *reduc_code
= REDUC_PLUS_EXPR
;
2329 *reduc_code
= ERROR_MARK
;
2338 /* Error reporting helper for vect_is_simple_reduction below. GIMPLE statement
2339 STMT is printed with a message MSG. */
2342 report_vect_op (int msg_type
, gimple
*stmt
, const char *msg
)
2344 dump_printf_loc (msg_type
, vect_location
, "%s", msg
);
2345 dump_gimple_stmt (msg_type
, TDF_SLIM
, stmt
, 0);
2346 dump_printf (msg_type
, "\n");
2350 /* Detect SLP reduction of the form:
2360 PHI is the reduction phi node (#a1 = phi <a5, a0> above)
2361 FIRST_STMT is the first reduction stmt in the chain
2362 (a2 = operation (a1)).
2364 Return TRUE if a reduction chain was detected. */
2367 vect_is_slp_reduction (loop_vec_info loop_info
, gimple
*phi
,
2370 struct loop
*loop
= (gimple_bb (phi
))->loop_father
;
2371 struct loop
*vect_loop
= LOOP_VINFO_LOOP (loop_info
);
2372 enum tree_code code
;
2373 gimple
*current_stmt
= NULL
, *loop_use_stmt
= NULL
, *first
, *next_stmt
;
2374 stmt_vec_info use_stmt_info
, current_stmt_info
;
2376 imm_use_iterator imm_iter
;
2377 use_operand_p use_p
;
2378 int nloop_uses
, size
= 0, n_out_of_loop_uses
;
2381 if (loop
!= vect_loop
)
2384 lhs
= PHI_RESULT (phi
);
2385 code
= gimple_assign_rhs_code (first_stmt
);
2389 n_out_of_loop_uses
= 0;
2390 FOR_EACH_IMM_USE_FAST (use_p
, imm_iter
, lhs
)
2392 gimple
*use_stmt
= USE_STMT (use_p
);
2393 if (is_gimple_debug (use_stmt
))
2396 /* Check if we got back to the reduction phi. */
2397 if (use_stmt
== phi
)
2399 loop_use_stmt
= use_stmt
;
2404 if (flow_bb_inside_loop_p (loop
, gimple_bb (use_stmt
)))
2406 loop_use_stmt
= use_stmt
;
2410 n_out_of_loop_uses
++;
2412 /* There are can be either a single use in the loop or two uses in
2414 if (nloop_uses
> 1 || (n_out_of_loop_uses
&& nloop_uses
))
2421 /* We reached a statement with no loop uses. */
2422 if (nloop_uses
== 0)
2425 /* This is a loop exit phi, and we haven't reached the reduction phi. */
2426 if (gimple_code (loop_use_stmt
) == GIMPLE_PHI
)
2429 if (!is_gimple_assign (loop_use_stmt
)
2430 || code
!= gimple_assign_rhs_code (loop_use_stmt
)
2431 || !flow_bb_inside_loop_p (loop
, gimple_bb (loop_use_stmt
)))
2434 /* Insert USE_STMT into reduction chain. */
2435 use_stmt_info
= vinfo_for_stmt (loop_use_stmt
);
2438 current_stmt_info
= vinfo_for_stmt (current_stmt
);
2439 GROUP_NEXT_ELEMENT (current_stmt_info
) = loop_use_stmt
;
2440 GROUP_FIRST_ELEMENT (use_stmt_info
)
2441 = GROUP_FIRST_ELEMENT (current_stmt_info
);
2444 GROUP_FIRST_ELEMENT (use_stmt_info
) = loop_use_stmt
;
2446 lhs
= gimple_assign_lhs (loop_use_stmt
);
2447 current_stmt
= loop_use_stmt
;
2451 if (!found
|| loop_use_stmt
!= phi
|| size
< 2)
2454 /* Swap the operands, if needed, to make the reduction operand be the second
2456 lhs
= PHI_RESULT (phi
);
2457 next_stmt
= GROUP_FIRST_ELEMENT (vinfo_for_stmt (current_stmt
));
2460 if (gimple_assign_rhs2 (next_stmt
) == lhs
)
2462 tree op
= gimple_assign_rhs1 (next_stmt
);
2463 gimple
*def_stmt
= NULL
;
2465 if (TREE_CODE (op
) == SSA_NAME
)
2466 def_stmt
= SSA_NAME_DEF_STMT (op
);
2468 /* Check that the other def is either defined in the loop
2469 ("vect_internal_def"), or it's an induction (defined by a
2470 loop-header phi-node). */
2472 && gimple_bb (def_stmt
)
2473 && flow_bb_inside_loop_p (loop
, gimple_bb (def_stmt
))
2474 && (is_gimple_assign (def_stmt
)
2475 || is_gimple_call (def_stmt
)
2476 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt
))
2477 == vect_induction_def
2478 || (gimple_code (def_stmt
) == GIMPLE_PHI
2479 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt
))
2480 == vect_internal_def
2481 && !is_loop_header_bb_p (gimple_bb (def_stmt
)))))
2483 lhs
= gimple_assign_lhs (next_stmt
);
2484 next_stmt
= GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt
));
2492 tree op
= gimple_assign_rhs2 (next_stmt
);
2493 gimple
*def_stmt
= NULL
;
2495 if (TREE_CODE (op
) == SSA_NAME
)
2496 def_stmt
= SSA_NAME_DEF_STMT (op
);
2498 /* Check that the other def is either defined in the loop
2499 ("vect_internal_def"), or it's an induction (defined by a
2500 loop-header phi-node). */
2502 && gimple_bb (def_stmt
)
2503 && flow_bb_inside_loop_p (loop
, gimple_bb (def_stmt
))
2504 && (is_gimple_assign (def_stmt
)
2505 || is_gimple_call (def_stmt
)
2506 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt
))
2507 == vect_induction_def
2508 || (gimple_code (def_stmt
) == GIMPLE_PHI
2509 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt
))
2510 == vect_internal_def
2511 && !is_loop_header_bb_p (gimple_bb (def_stmt
)))))
2513 if (dump_enabled_p ())
2515 dump_printf_loc (MSG_NOTE
, vect_location
, "swapping oprnds: ");
2516 dump_gimple_stmt (MSG_NOTE
, TDF_SLIM
, next_stmt
, 0);
2517 dump_printf (MSG_NOTE
, "\n");
2520 swap_ssa_operands (next_stmt
,
2521 gimple_assign_rhs1_ptr (next_stmt
),
2522 gimple_assign_rhs2_ptr (next_stmt
));
2523 update_stmt (next_stmt
);
2525 if (CONSTANT_CLASS_P (gimple_assign_rhs1 (next_stmt
)))
2526 LOOP_VINFO_OPERANDS_SWAPPED (loop_info
) = true;
2532 lhs
= gimple_assign_lhs (next_stmt
);
2533 next_stmt
= GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt
));
2536 /* Save the chain for further analysis in SLP detection. */
2537 first
= GROUP_FIRST_ELEMENT (vinfo_for_stmt (current_stmt
));
2538 LOOP_VINFO_REDUCTION_CHAINS (loop_info
).safe_push (first
);
2539 GROUP_SIZE (vinfo_for_stmt (first
)) = size
;
2545 /* Function vect_is_simple_reduction_1
2547 (1) Detect a cross-iteration def-use cycle that represents a simple
2548 reduction computation. We look for the following pattern:
2553 a2 = operation (a3, a1)
2560 a2 = operation (a3, a1)
2563 1. operation is commutative and associative and it is safe to
2564 change the order of the computation (if CHECK_REDUCTION is true)
2565 2. no uses for a2 in the loop (a2 is used out of the loop)
2566 3. no uses of a1 in the loop besides the reduction operation
2567 4. no uses of a1 outside the loop.
2569 Conditions 1,4 are tested here.
2570 Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.
2572 (2) Detect a cross-iteration def-use cycle in nested loops, i.e.,
2573 nested cycles, if CHECK_REDUCTION is false.
2575 (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double
2579 inner loop (def of a3)
2582 (4) Detect condition expressions, ie:
2583 for (int i = 0; i < N; i++)
2590 vect_is_simple_reduction (loop_vec_info loop_info
, gimple
*phi
,
2591 bool check_reduction
, bool *double_reduc
,
2592 bool need_wrapping_integral_overflow
,
2593 enum vect_reduction_type
*v_reduc_type
)
2595 struct loop
*loop
= (gimple_bb (phi
))->loop_father
;
2596 struct loop
*vect_loop
= LOOP_VINFO_LOOP (loop_info
);
2597 edge latch_e
= loop_latch_edge (loop
);
2598 tree loop_arg
= PHI_ARG_DEF_FROM_EDGE (phi
, latch_e
);
2599 gimple
*def_stmt
, *def1
= NULL
, *def2
= NULL
, *phi_use_stmt
= NULL
;
2600 enum tree_code orig_code
, code
;
2601 tree op1
, op2
, op3
= NULL_TREE
, op4
= NULL_TREE
;
2605 imm_use_iterator imm_iter
;
2606 use_operand_p use_p
;
2609 *double_reduc
= false;
2610 *v_reduc_type
= TREE_CODE_REDUCTION
;
2612 /* If CHECK_REDUCTION is true, we assume inner-most loop vectorization,
2613 otherwise, we assume outer loop vectorization. */
2614 gcc_assert ((check_reduction
&& loop
== vect_loop
)
2615 || (!check_reduction
&& flow_loop_nested_p (vect_loop
, loop
)));
2617 name
= PHI_RESULT (phi
);
2618 /* ??? If there are no uses of the PHI result the inner loop reduction
2619 won't be detected as possibly double-reduction by vectorizable_reduction
2620 because that tries to walk the PHI arg from the preheader edge which
2621 can be constant. See PR60382. */
2622 if (has_zero_uses (name
))
2625 FOR_EACH_IMM_USE_FAST (use_p
, imm_iter
, name
)
2627 gimple
*use_stmt
= USE_STMT (use_p
);
2628 if (is_gimple_debug (use_stmt
))
2631 if (!flow_bb_inside_loop_p (loop
, gimple_bb (use_stmt
)))
2633 if (dump_enabled_p ())
2634 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2635 "intermediate value used outside loop.\n");
2643 if (dump_enabled_p ())
2644 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2645 "reduction used in loop.\n");
2649 phi_use_stmt
= use_stmt
;
2652 if (TREE_CODE (loop_arg
) != SSA_NAME
)
2654 if (dump_enabled_p ())
2656 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2657 "reduction: not ssa_name: ");
2658 dump_generic_expr (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, loop_arg
);
2659 dump_printf (MSG_MISSED_OPTIMIZATION
, "\n");
2664 def_stmt
= SSA_NAME_DEF_STMT (loop_arg
);
2667 if (dump_enabled_p ())
2668 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2669 "reduction: no def_stmt.\n");
2673 if (!is_gimple_assign (def_stmt
) && gimple_code (def_stmt
) != GIMPLE_PHI
)
2675 if (dump_enabled_p ())
2677 dump_gimple_stmt (MSG_NOTE
, TDF_SLIM
, def_stmt
, 0);
2678 dump_printf (MSG_NOTE
, "\n");
2683 if (is_gimple_assign (def_stmt
))
2685 name
= gimple_assign_lhs (def_stmt
);
2690 name
= PHI_RESULT (def_stmt
);
2695 FOR_EACH_IMM_USE_FAST (use_p
, imm_iter
, name
)
2697 gimple
*use_stmt
= USE_STMT (use_p
);
2698 if (is_gimple_debug (use_stmt
))
2700 if (flow_bb_inside_loop_p (loop
, gimple_bb (use_stmt
)))
2704 if (dump_enabled_p ())
2705 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2706 "reduction used in loop.\n");
2711 /* If DEF_STMT is a phi node itself, we expect it to have a single argument
2712 defined in the inner loop. */
2715 op1
= PHI_ARG_DEF (def_stmt
, 0);
2717 if (gimple_phi_num_args (def_stmt
) != 1
2718 || TREE_CODE (op1
) != SSA_NAME
)
2720 if (dump_enabled_p ())
2721 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2722 "unsupported phi node definition.\n");
2727 def1
= SSA_NAME_DEF_STMT (op1
);
2728 if (gimple_bb (def1
)
2729 && flow_bb_inside_loop_p (loop
, gimple_bb (def_stmt
))
2731 && flow_bb_inside_loop_p (loop
->inner
, gimple_bb (def1
))
2732 && is_gimple_assign (def1
)
2733 && flow_bb_inside_loop_p (loop
->inner
, gimple_bb (phi_use_stmt
)))
2735 if (dump_enabled_p ())
2736 report_vect_op (MSG_NOTE
, def_stmt
,
2737 "detected double reduction: ");
2739 *double_reduc
= true;
2746 code
= orig_code
= gimple_assign_rhs_code (def_stmt
);
2748 /* We can handle "res -= x[i]", which is non-associative by
2749 simply rewriting this into "res += -x[i]". Avoid changing
2750 gimple instruction for the first simple tests and only do this
2751 if we're allowed to change code at all. */
2752 if (code
== MINUS_EXPR
2753 && (op1
= gimple_assign_rhs1 (def_stmt
))
2754 && TREE_CODE (op1
) == SSA_NAME
2755 && SSA_NAME_DEF_STMT (op1
) == phi
)
2758 if (code
== COND_EXPR
)
2760 if (check_reduction
)
2761 *v_reduc_type
= COND_REDUCTION
;
2763 else if (!commutative_tree_code (code
) || !associative_tree_code (code
))
2765 if (dump_enabled_p ())
2766 report_vect_op (MSG_MISSED_OPTIMIZATION
, def_stmt
,
2767 "reduction: not commutative/associative: ");
2771 if (get_gimple_rhs_class (code
) != GIMPLE_BINARY_RHS
)
2773 if (code
!= COND_EXPR
)
2775 if (dump_enabled_p ())
2776 report_vect_op (MSG_MISSED_OPTIMIZATION
, def_stmt
,
2777 "reduction: not binary operation: ");
2782 op3
= gimple_assign_rhs1 (def_stmt
);
2783 if (COMPARISON_CLASS_P (op3
))
2785 op4
= TREE_OPERAND (op3
, 1);
2786 op3
= TREE_OPERAND (op3
, 0);
2789 op1
= gimple_assign_rhs2 (def_stmt
);
2790 op2
= gimple_assign_rhs3 (def_stmt
);
2792 if (TREE_CODE (op1
) != SSA_NAME
&& TREE_CODE (op2
) != SSA_NAME
)
2794 if (dump_enabled_p ())
2795 report_vect_op (MSG_MISSED_OPTIMIZATION
, def_stmt
,
2796 "reduction: uses not ssa_names: ");
2803 op1
= gimple_assign_rhs1 (def_stmt
);
2804 op2
= gimple_assign_rhs2 (def_stmt
);
2806 if (TREE_CODE (op1
) != SSA_NAME
&& TREE_CODE (op2
) != SSA_NAME
)
2808 if (dump_enabled_p ())
2809 report_vect_op (MSG_MISSED_OPTIMIZATION
, def_stmt
,
2810 "reduction: uses not ssa_names: ");
2816 type
= TREE_TYPE (gimple_assign_lhs (def_stmt
));
2817 if ((TREE_CODE (op1
) == SSA_NAME
2818 && !types_compatible_p (type
,TREE_TYPE (op1
)))
2819 || (TREE_CODE (op2
) == SSA_NAME
2820 && !types_compatible_p (type
, TREE_TYPE (op2
)))
2821 || (op3
&& TREE_CODE (op3
) == SSA_NAME
2822 && !types_compatible_p (type
, TREE_TYPE (op3
)))
2823 || (op4
&& TREE_CODE (op4
) == SSA_NAME
2824 && !types_compatible_p (type
, TREE_TYPE (op4
))))
2826 if (dump_enabled_p ())
2828 dump_printf_loc (MSG_NOTE
, vect_location
,
2829 "reduction: multiple types: operation type: ");
2830 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, type
);
2831 dump_printf (MSG_NOTE
, ", operands types: ");
2832 dump_generic_expr (MSG_NOTE
, TDF_SLIM
,
2834 dump_printf (MSG_NOTE
, ",");
2835 dump_generic_expr (MSG_NOTE
, TDF_SLIM
,
2839 dump_printf (MSG_NOTE
, ",");
2840 dump_generic_expr (MSG_NOTE
, TDF_SLIM
,
2846 dump_printf (MSG_NOTE
, ",");
2847 dump_generic_expr (MSG_NOTE
, TDF_SLIM
,
2850 dump_printf (MSG_NOTE
, "\n");
2856 /* Check that it's ok to change the order of the computation.
2857 Generally, when vectorizing a reduction we change the order of the
2858 computation. This may change the behavior of the program in some
2859 cases, so we need to check that this is ok. One exception is when
2860 vectorizing an outer-loop: the inner-loop is executed sequentially,
2861 and therefore vectorizing reductions in the inner-loop during
2862 outer-loop vectorization is safe. */
2864 if (*v_reduc_type
!= COND_REDUCTION
2867 /* CHECKME: check for !flag_finite_math_only too? */
2868 if (SCALAR_FLOAT_TYPE_P (type
) && !flag_associative_math
)
2870 /* Changing the order of operations changes the semantics. */
2871 if (dump_enabled_p ())
2872 report_vect_op (MSG_MISSED_OPTIMIZATION
, def_stmt
,
2873 "reduction: unsafe fp math optimization: ");
2876 else if (INTEGRAL_TYPE_P (type
))
2878 if (!operation_no_trapping_overflow (type
, code
))
2880 /* Changing the order of operations changes the semantics. */
2881 if (dump_enabled_p ())
2882 report_vect_op (MSG_MISSED_OPTIMIZATION
, def_stmt
,
2883 "reduction: unsafe int math optimization"
2884 " (overflow traps): ");
2887 if (need_wrapping_integral_overflow
2888 && !TYPE_OVERFLOW_WRAPS (type
)
2889 && operation_can_overflow (code
))
2891 /* Changing the order of operations changes the semantics. */
2892 if (dump_enabled_p ())
2893 report_vect_op (MSG_MISSED_OPTIMIZATION
, def_stmt
,
2894 "reduction: unsafe int math optimization"
2895 " (overflow doesn't wrap): ");
2899 else if (SAT_FIXED_POINT_TYPE_P (type
))
2901 /* Changing the order of operations changes the semantics. */
2902 if (dump_enabled_p ())
2903 report_vect_op (MSG_MISSED_OPTIMIZATION
, def_stmt
,
2904 "reduction: unsafe fixed-point math optimization: ");
2909 /* Reduction is safe. We're dealing with one of the following:
2910 1) integer arithmetic and no trapv
2911 2) floating point arithmetic, and special flags permit this optimization
2912 3) nested cycle (i.e., outer loop vectorization). */
2913 if (TREE_CODE (op1
) == SSA_NAME
)
2914 def1
= SSA_NAME_DEF_STMT (op1
);
2916 if (TREE_CODE (op2
) == SSA_NAME
)
2917 def2
= SSA_NAME_DEF_STMT (op2
);
2919 if (code
!= COND_EXPR
2920 && ((!def1
|| gimple_nop_p (def1
)) && (!def2
|| gimple_nop_p (def2
))))
2922 if (dump_enabled_p ())
2923 report_vect_op (MSG_NOTE
, def_stmt
, "reduction: no defs for operands: ");
2927 /* Check that one def is the reduction def, defined by PHI,
2928 the other def is either defined in the loop ("vect_internal_def"),
2929 or it's an induction (defined by a loop-header phi-node). */
2931 if (def2
&& def2
== phi
2932 && (code
== COND_EXPR
2933 || !def1
|| gimple_nop_p (def1
)
2934 || !flow_bb_inside_loop_p (loop
, gimple_bb (def1
))
2935 || (def1
&& flow_bb_inside_loop_p (loop
, gimple_bb (def1
))
2936 && (is_gimple_assign (def1
)
2937 || is_gimple_call (def1
)
2938 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1
))
2939 == vect_induction_def
2940 || (gimple_code (def1
) == GIMPLE_PHI
2941 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1
))
2942 == vect_internal_def
2943 && !is_loop_header_bb_p (gimple_bb (def1
)))))))
2945 if (dump_enabled_p ())
2946 report_vect_op (MSG_NOTE
, def_stmt
, "detected reduction: ");
2950 if (def1
&& def1
== phi
2951 && (code
== COND_EXPR
2952 || !def2
|| gimple_nop_p (def2
)
2953 || !flow_bb_inside_loop_p (loop
, gimple_bb (def2
))
2954 || (def2
&& flow_bb_inside_loop_p (loop
, gimple_bb (def2
))
2955 && (is_gimple_assign (def2
)
2956 || is_gimple_call (def2
)
2957 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2
))
2958 == vect_induction_def
2959 || (gimple_code (def2
) == GIMPLE_PHI
2960 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2
))
2961 == vect_internal_def
2962 && !is_loop_header_bb_p (gimple_bb (def2
)))))))
2965 && orig_code
!= MINUS_EXPR
)
2967 if (code
== COND_EXPR
)
2969 /* No current known use where this case would be useful. */
2970 if (dump_enabled_p ())
2971 report_vect_op (MSG_NOTE
, def_stmt
,
2972 "detected reduction: cannot currently swap "
2973 "operands for cond_expr");
2977 /* Swap operands (just for simplicity - so that the rest of the code
2978 can assume that the reduction variable is always the last (second)
2980 if (dump_enabled_p ())
2981 report_vect_op (MSG_NOTE
, def_stmt
,
2982 "detected reduction: need to swap operands: ");
2984 swap_ssa_operands (def_stmt
, gimple_assign_rhs1_ptr (def_stmt
),
2985 gimple_assign_rhs2_ptr (def_stmt
));
2987 if (CONSTANT_CLASS_P (gimple_assign_rhs1 (def_stmt
)))
2988 LOOP_VINFO_OPERANDS_SWAPPED (loop_info
) = true;
2992 if (dump_enabled_p ())
2993 report_vect_op (MSG_NOTE
, def_stmt
, "detected reduction: ");
2999 /* Try to find SLP reduction chain. */
3000 if (check_reduction
&& code
!= COND_EXPR
3001 && vect_is_slp_reduction (loop_info
, phi
, def_stmt
))
3003 if (dump_enabled_p ())
3004 report_vect_op (MSG_NOTE
, def_stmt
,
3005 "reduction: detected reduction chain: ");
3010 if (dump_enabled_p ())
3011 report_vect_op (MSG_MISSED_OPTIMIZATION
, def_stmt
,
3012 "reduction: unknown pattern: ");
3017 /* Wrapper around vect_is_simple_reduction_1, which will modify code
3018 in-place if it enables detection of more reductions. Arguments
3022 vect_force_simple_reduction (loop_vec_info loop_info
, gimple
*phi
,
3023 bool check_reduction
, bool *double_reduc
,
3024 bool need_wrapping_integral_overflow
)
3026 enum vect_reduction_type v_reduc_type
;
3027 return vect_is_simple_reduction (loop_info
, phi
, check_reduction
,
3029 need_wrapping_integral_overflow
,
3033 /* Calculate cost of peeling the loop PEEL_ITERS_PROLOGUE times. */
3035 vect_get_known_peeling_cost (loop_vec_info loop_vinfo
, int peel_iters_prologue
,
3036 int *peel_iters_epilogue
,
3037 stmt_vector_for_cost
*scalar_cost_vec
,
3038 stmt_vector_for_cost
*prologue_cost_vec
,
3039 stmt_vector_for_cost
*epilogue_cost_vec
)
3042 int vf
= LOOP_VINFO_VECT_FACTOR (loop_vinfo
);
3044 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo
))
3046 *peel_iters_epilogue
= vf
/2;
3047 if (dump_enabled_p ())
3048 dump_printf_loc (MSG_NOTE
, vect_location
,
3049 "cost model: epilogue peel iters set to vf/2 "
3050 "because loop iterations are unknown .\n");
3052 /* If peeled iterations are known but number of scalar loop
3053 iterations are unknown, count a taken branch per peeled loop. */
3054 retval
= record_stmt_cost (prologue_cost_vec
, 1, cond_branch_taken
,
3055 NULL
, 0, vect_prologue
);
3056 retval
= record_stmt_cost (prologue_cost_vec
, 1, cond_branch_taken
,
3057 NULL
, 0, vect_epilogue
);
3061 int niters
= LOOP_VINFO_INT_NITERS (loop_vinfo
);
3062 peel_iters_prologue
= niters
< peel_iters_prologue
?
3063 niters
: peel_iters_prologue
;
3064 *peel_iters_epilogue
= (niters
- peel_iters_prologue
) % vf
;
3065 /* If we need to peel for gaps, but no peeling is required, we have to
3066 peel VF iterations. */
3067 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo
) && !*peel_iters_epilogue
)
3068 *peel_iters_epilogue
= vf
;
3071 stmt_info_for_cost
*si
;
3073 if (peel_iters_prologue
)
3074 FOR_EACH_VEC_ELT (*scalar_cost_vec
, j
, si
)
3075 retval
+= record_stmt_cost (prologue_cost_vec
,
3076 si
->count
* peel_iters_prologue
,
3077 si
->kind
, NULL
, si
->misalign
,
3079 if (*peel_iters_epilogue
)
3080 FOR_EACH_VEC_ELT (*scalar_cost_vec
, j
, si
)
3081 retval
+= record_stmt_cost (epilogue_cost_vec
,
3082 si
->count
* *peel_iters_epilogue
,
3083 si
->kind
, NULL
, si
->misalign
,
3089 /* Function vect_estimate_min_profitable_iters
3091 Return the number of iterations required for the vector version of the
3092 loop to be profitable relative to the cost of the scalar version of the
3096 vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo
,
3097 int *ret_min_profitable_niters
,
3098 int *ret_min_profitable_estimate
)
3100 int min_profitable_iters
;
3101 int min_profitable_estimate
;
3102 int peel_iters_prologue
;
3103 int peel_iters_epilogue
;
3104 unsigned vec_inside_cost
= 0;
3105 int vec_outside_cost
= 0;
3106 unsigned vec_prologue_cost
= 0;
3107 unsigned vec_epilogue_cost
= 0;
3108 int scalar_single_iter_cost
= 0;
3109 int scalar_outside_cost
= 0;
3110 int vf
= LOOP_VINFO_VECT_FACTOR (loop_vinfo
);
3111 int npeel
= LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo
);
3112 void *target_cost_data
= LOOP_VINFO_TARGET_COST_DATA (loop_vinfo
);
3114 /* Cost model disabled. */
3115 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo
)))
3117 dump_printf_loc (MSG_NOTE
, vect_location
, "cost model disabled.\n");
3118 *ret_min_profitable_niters
= 0;
3119 *ret_min_profitable_estimate
= 0;
3123 /* Requires loop versioning tests to handle misalignment. */
3124 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo
))
3126 /* FIXME: Make cost depend on complexity of individual check. */
3127 unsigned len
= LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo
).length ();
3128 (void) add_stmt_cost (target_cost_data
, len
, vector_stmt
, NULL
, 0,
3130 dump_printf (MSG_NOTE
,
3131 "cost model: Adding cost of checks for loop "
3132 "versioning to treat misalignment.\n");
3135 /* Requires loop versioning with alias checks. */
3136 if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo
))
3138 /* FIXME: Make cost depend on complexity of individual check. */
3139 unsigned len
= LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo
).length ();
3140 (void) add_stmt_cost (target_cost_data
, len
, vector_stmt
, NULL
, 0,
3142 dump_printf (MSG_NOTE
,
3143 "cost model: Adding cost of checks for loop "
3144 "versioning aliasing.\n");
3147 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo
)
3148 || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo
))
3149 (void) add_stmt_cost (target_cost_data
, 1, cond_branch_taken
, NULL
, 0,
3152 /* Count statements in scalar loop. Using this as scalar cost for a single
3155 TODO: Add outer loop support.
3157 TODO: Consider assigning different costs to different scalar
3160 scalar_single_iter_cost
3161 = LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo
);
3163 /* Add additional cost for the peeled instructions in prologue and epilogue
3166 FORNOW: If we don't know the value of peel_iters for prologue or epilogue
3167 at compile-time - we assume it's vf/2 (the worst would be vf-1).
3169 TODO: Build an expression that represents peel_iters for prologue and
3170 epilogue to be used in a run-time test. */
3174 peel_iters_prologue
= vf
/2;
3175 dump_printf (MSG_NOTE
, "cost model: "
3176 "prologue peel iters set to vf/2.\n");
3178 /* If peeling for alignment is unknown, loop bound of main loop becomes
3180 peel_iters_epilogue
= vf
/2;
3181 dump_printf (MSG_NOTE
, "cost model: "
3182 "epilogue peel iters set to vf/2 because "
3183 "peeling for alignment is unknown.\n");
3185 /* If peeled iterations are unknown, count a taken branch and a not taken
3186 branch per peeled loop. Even if scalar loop iterations are known,
3187 vector iterations are not known since peeled prologue iterations are
3188 not known. Hence guards remain the same. */
3189 (void) add_stmt_cost (target_cost_data
, 1, cond_branch_taken
,
3190 NULL
, 0, vect_prologue
);
3191 (void) add_stmt_cost (target_cost_data
, 1, cond_branch_not_taken
,
3192 NULL
, 0, vect_prologue
);
3193 (void) add_stmt_cost (target_cost_data
, 1, cond_branch_taken
,
3194 NULL
, 0, vect_epilogue
);
3195 (void) add_stmt_cost (target_cost_data
, 1, cond_branch_not_taken
,
3196 NULL
, 0, vect_epilogue
);
3197 stmt_info_for_cost
*si
;
3199 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo
), j
, si
)
3201 struct _stmt_vec_info
*stmt_info
3202 = si
->stmt
? vinfo_for_stmt (si
->stmt
) : NULL
;
3203 (void) add_stmt_cost (target_cost_data
,
3204 si
->count
* peel_iters_prologue
,
3205 si
->kind
, stmt_info
, si
->misalign
,
3207 (void) add_stmt_cost (target_cost_data
,
3208 si
->count
* peel_iters_epilogue
,
3209 si
->kind
, stmt_info
, si
->misalign
,
3215 stmt_vector_for_cost prologue_cost_vec
, epilogue_cost_vec
;
3216 stmt_info_for_cost
*si
;
3218 void *data
= LOOP_VINFO_TARGET_COST_DATA (loop_vinfo
);
3220 prologue_cost_vec
.create (2);
3221 epilogue_cost_vec
.create (2);
3222 peel_iters_prologue
= npeel
;
3224 (void) vect_get_known_peeling_cost (loop_vinfo
, peel_iters_prologue
,
3225 &peel_iters_epilogue
,
3226 &LOOP_VINFO_SCALAR_ITERATION_COST
3229 &epilogue_cost_vec
);
3231 FOR_EACH_VEC_ELT (prologue_cost_vec
, j
, si
)
3233 struct _stmt_vec_info
*stmt_info
3234 = si
->stmt
? vinfo_for_stmt (si
->stmt
) : NULL
;
3235 (void) add_stmt_cost (data
, si
->count
, si
->kind
, stmt_info
,
3236 si
->misalign
, vect_prologue
);
3239 FOR_EACH_VEC_ELT (epilogue_cost_vec
, j
, si
)
3241 struct _stmt_vec_info
*stmt_info
3242 = si
->stmt
? vinfo_for_stmt (si
->stmt
) : NULL
;
3243 (void) add_stmt_cost (data
, si
->count
, si
->kind
, stmt_info
,
3244 si
->misalign
, vect_epilogue
);
3247 prologue_cost_vec
.release ();
3248 epilogue_cost_vec
.release ();
3251 /* FORNOW: The scalar outside cost is incremented in one of the
3254 1. The vectorizer checks for alignment and aliasing and generates
3255 a condition that allows dynamic vectorization. A cost model
3256 check is ANDED with the versioning condition. Hence scalar code
3257 path now has the added cost of the versioning check.
3259 if (cost > th & versioning_check)
3262 Hence run-time scalar is incremented by not-taken branch cost.
3264 2. The vectorizer then checks if a prologue is required. If the
3265 cost model check was not done before during versioning, it has to
3266 be done before the prologue check.
3269 prologue = scalar_iters
3274 if (prologue == num_iters)
3277 Hence the run-time scalar cost is incremented by a taken branch,
3278 plus a not-taken branch, plus a taken branch cost.
3280 3. The vectorizer then checks if an epilogue is required. If the
3281 cost model check was not done before during prologue check, it
3282 has to be done with the epilogue check.
3288 if (prologue == num_iters)
3291 if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
3294 Hence the run-time scalar cost should be incremented by 2 taken
3297 TODO: The back end may reorder the BBS's differently and reverse
3298 conditions/branch directions. Change the estimates below to
3299 something more reasonable. */
3301 /* If the number of iterations is known and we do not do versioning, we can
3302 decide whether to vectorize at compile time. Hence the scalar version
3303 do not carry cost model guard costs. */
3304 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo
)
3305 || LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo
)
3306 || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo
))
3308 /* Cost model check occurs at versioning. */
3309 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo
)
3310 || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo
))
3311 scalar_outside_cost
+= vect_get_stmt_cost (cond_branch_not_taken
);
3314 /* Cost model check occurs at prologue generation. */
3315 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo
) < 0)
3316 scalar_outside_cost
+= 2 * vect_get_stmt_cost (cond_branch_taken
)
3317 + vect_get_stmt_cost (cond_branch_not_taken
);
3318 /* Cost model check occurs at epilogue generation. */
3320 scalar_outside_cost
+= 2 * vect_get_stmt_cost (cond_branch_taken
);
3324 /* Complete the target-specific cost calculations. */
3325 finish_cost (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo
), &vec_prologue_cost
,
3326 &vec_inside_cost
, &vec_epilogue_cost
);
3328 vec_outside_cost
= (int)(vec_prologue_cost
+ vec_epilogue_cost
);
3330 if (dump_enabled_p ())
3332 dump_printf_loc (MSG_NOTE
, vect_location
, "Cost model analysis: \n");
3333 dump_printf (MSG_NOTE
, " Vector inside of loop cost: %d\n",
3335 dump_printf (MSG_NOTE
, " Vector prologue cost: %d\n",
3337 dump_printf (MSG_NOTE
, " Vector epilogue cost: %d\n",
3339 dump_printf (MSG_NOTE
, " Scalar iteration cost: %d\n",
3340 scalar_single_iter_cost
);
3341 dump_printf (MSG_NOTE
, " Scalar outside cost: %d\n",
3342 scalar_outside_cost
);
3343 dump_printf (MSG_NOTE
, " Vector outside cost: %d\n",
3345 dump_printf (MSG_NOTE
, " prologue iterations: %d\n",
3346 peel_iters_prologue
);
3347 dump_printf (MSG_NOTE
, " epilogue iterations: %d\n",
3348 peel_iters_epilogue
);
3351 /* Calculate number of iterations required to make the vector version
3352 profitable, relative to the loop bodies only. The following condition
3354 SIC * niters + SOC > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC
3356 SIC = scalar iteration cost, VIC = vector iteration cost,
3357 VOC = vector outside cost, VF = vectorization factor,
3358 PL_ITERS = prologue iterations, EP_ITERS= epilogue iterations
3359 SOC = scalar outside cost for run time cost model check. */
3361 if ((scalar_single_iter_cost
* vf
) > (int) vec_inside_cost
)
3363 if (vec_outside_cost
<= 0)
3364 min_profitable_iters
= 1;
3367 min_profitable_iters
= ((vec_outside_cost
- scalar_outside_cost
) * vf
3368 - vec_inside_cost
* peel_iters_prologue
3369 - vec_inside_cost
* peel_iters_epilogue
)
3370 / ((scalar_single_iter_cost
* vf
)
3373 if ((scalar_single_iter_cost
* vf
* min_profitable_iters
)
3374 <= (((int) vec_inside_cost
* min_profitable_iters
)
3375 + (((int) vec_outside_cost
- scalar_outside_cost
) * vf
)))
3376 min_profitable_iters
++;
3379 /* vector version will never be profitable. */
3382 if (LOOP_VINFO_LOOP (loop_vinfo
)->force_vectorize
)
3383 warning_at (vect_location
, OPT_Wopenmp_simd
, "vectorization "
3384 "did not happen for a simd loop");
3386 if (dump_enabled_p ())
3387 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3388 "cost model: the vector iteration cost = %d "
3389 "divided by the scalar iteration cost = %d "
3390 "is greater or equal to the vectorization factor = %d"
3392 vec_inside_cost
, scalar_single_iter_cost
, vf
);
3393 *ret_min_profitable_niters
= -1;
3394 *ret_min_profitable_estimate
= -1;
3398 dump_printf (MSG_NOTE
,
3399 " Calculated minimum iters for profitability: %d\n",
3400 min_profitable_iters
);
3402 min_profitable_iters
=
3403 min_profitable_iters
< vf
? vf
: min_profitable_iters
;
3405 /* Because the condition we create is:
3406 if (niters <= min_profitable_iters)
3407 then skip the vectorized loop. */
3408 min_profitable_iters
--;
3410 if (dump_enabled_p ())
3411 dump_printf_loc (MSG_NOTE
, vect_location
,
3412 " Runtime profitability threshold = %d\n",
3413 min_profitable_iters
);
3415 *ret_min_profitable_niters
= min_profitable_iters
;
3417 /* Calculate number of iterations required to make the vector version
3418 profitable, relative to the loop bodies only.
3420 Non-vectorized variant is SIC * niters and it must win over vector
3421 variant on the expected loop trip count. The following condition must hold true:
3422 SIC * niters > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC + SOC */
3424 if (vec_outside_cost
<= 0)
3425 min_profitable_estimate
= 1;
3428 min_profitable_estimate
= ((vec_outside_cost
+ scalar_outside_cost
) * vf
3429 - vec_inside_cost
* peel_iters_prologue
3430 - vec_inside_cost
* peel_iters_epilogue
)
3431 / ((scalar_single_iter_cost
* vf
)
3434 min_profitable_estimate
--;
3435 min_profitable_estimate
= MAX (min_profitable_estimate
, min_profitable_iters
);
3436 if (dump_enabled_p ())
3437 dump_printf_loc (MSG_NOTE
, vect_location
,
3438 " Static estimate profitability threshold = %d\n",
3439 min_profitable_estimate
);
3441 *ret_min_profitable_estimate
= min_profitable_estimate
;
3444 /* Writes into SEL a mask for a vec_perm, equivalent to a vec_shr by OFFSET
3445 vector elements (not bits) for a vector of mode MODE. */
3447 calc_vec_perm_mask_for_shift (enum machine_mode mode
, unsigned int offset
,
3450 unsigned int i
, nelt
= GET_MODE_NUNITS (mode
);
3452 for (i
= 0; i
< nelt
; i
++)
3453 sel
[i
] = (i
+ offset
) & (2*nelt
- 1);
3456 /* Checks whether the target supports whole-vector shifts for vectors of mode
3457 MODE. This is the case if _either_ the platform handles vec_shr_optab, _or_
3458 it supports vec_perm_const with masks for all necessary shift amounts. */
3460 have_whole_vector_shift (enum machine_mode mode
)
3462 if (optab_handler (vec_shr_optab
, mode
) != CODE_FOR_nothing
)
3465 if (direct_optab_handler (vec_perm_const_optab
, mode
) == CODE_FOR_nothing
)
3468 unsigned int i
, nelt
= GET_MODE_NUNITS (mode
);
3469 unsigned char *sel
= XALLOCAVEC (unsigned char, nelt
);
3471 for (i
= nelt
/2; i
>= 1; i
/=2)
3473 calc_vec_perm_mask_for_shift (mode
, i
, sel
);
3474 if (!can_vec_perm_p (mode
, false, sel
))
3480 /* Return the reduction operand (with index REDUC_INDEX) of STMT. */
3483 get_reduction_op (gimple
*stmt
, int reduc_index
)
3485 switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt
)))
3487 case GIMPLE_SINGLE_RHS
:
3488 gcc_assert (TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt
))
3490 return TREE_OPERAND (gimple_assign_rhs1 (stmt
), reduc_index
);
3491 case GIMPLE_UNARY_RHS
:
3492 return gimple_assign_rhs1 (stmt
);
3493 case GIMPLE_BINARY_RHS
:
3495 ? gimple_assign_rhs2 (stmt
) : gimple_assign_rhs1 (stmt
));
3496 case GIMPLE_TERNARY_RHS
:
3497 return gimple_op (stmt
, reduc_index
+ 1);
3503 /* TODO: Close dependency between vect_model_*_cost and vectorizable_*
3504 functions. Design better to avoid maintenance issues. */
3506 /* Function vect_model_reduction_cost.
3508 Models cost for a reduction operation, including the vector ops
3509 generated within the strip-mine loop, the initial definition before
3510 the loop, and the epilogue code that must be generated. */
3513 vect_model_reduction_cost (stmt_vec_info stmt_info
, enum tree_code reduc_code
,
3514 int ncopies
, int reduc_index
)
3516 int prologue_cost
= 0, epilogue_cost
= 0;
3517 enum tree_code code
;
3520 gimple
*stmt
, *orig_stmt
;
3523 loop_vec_info loop_vinfo
= STMT_VINFO_LOOP_VINFO (stmt_info
);
3524 struct loop
*loop
= NULL
;
3525 void *target_cost_data
;
3529 loop
= LOOP_VINFO_LOOP (loop_vinfo
);
3530 target_cost_data
= LOOP_VINFO_TARGET_COST_DATA (loop_vinfo
);
3533 target_cost_data
= BB_VINFO_TARGET_COST_DATA (STMT_VINFO_BB_VINFO (stmt_info
));
3535 /* Condition reductions generate two reductions in the loop. */
3536 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info
) == COND_REDUCTION
)
3539 /* Cost of reduction op inside loop. */
3540 unsigned inside_cost
= add_stmt_cost (target_cost_data
, ncopies
, vector_stmt
,
3541 stmt_info
, 0, vect_body
);
3542 stmt
= STMT_VINFO_STMT (stmt_info
);
3544 reduction_op
= get_reduction_op (stmt
, reduc_index
);
3546 vectype
= get_vectype_for_scalar_type (TREE_TYPE (reduction_op
));
3549 if (dump_enabled_p ())
3551 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3552 "unsupported data-type ");
3553 dump_generic_expr (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
,
3554 TREE_TYPE (reduction_op
));
3555 dump_printf (MSG_MISSED_OPTIMIZATION
, "\n");
3560 mode
= TYPE_MODE (vectype
);
3561 orig_stmt
= STMT_VINFO_RELATED_STMT (stmt_info
);
3564 orig_stmt
= STMT_VINFO_STMT (stmt_info
);
3566 code
= gimple_assign_rhs_code (orig_stmt
);
3568 /* Add in cost for initial definition.
3569 For cond reduction we have four vectors: initial index, step, initial
3570 result of the data reduction, initial value of the index reduction. */
3571 int prologue_stmts
= STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info
)
3572 == COND_REDUCTION
? 4 : 1;
3573 prologue_cost
+= add_stmt_cost (target_cost_data
, prologue_stmts
,
3574 scalar_to_vec
, stmt_info
, 0,
3577 /* Determine cost of epilogue code.
3579 We have a reduction operator that will reduce the vector in one statement.
3580 Also requires scalar extract. */
3582 if (!loop
|| !nested_in_vect_loop_p (loop
, orig_stmt
))
3584 if (reduc_code
!= ERROR_MARK
)
3586 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info
) == COND_REDUCTION
)
3588 /* An EQ stmt and an COND_EXPR stmt. */
3589 epilogue_cost
+= add_stmt_cost (target_cost_data
, 2,
3590 vector_stmt
, stmt_info
, 0,
3592 /* Reduction of the max index and a reduction of the found
3594 epilogue_cost
+= add_stmt_cost (target_cost_data
, 2,
3595 vec_to_scalar
, stmt_info
, 0,
3597 /* A broadcast of the max value. */
3598 epilogue_cost
+= add_stmt_cost (target_cost_data
, 1,
3599 scalar_to_vec
, stmt_info
, 0,
3604 epilogue_cost
+= add_stmt_cost (target_cost_data
, 1, vector_stmt
,
3605 stmt_info
, 0, vect_epilogue
);
3606 epilogue_cost
+= add_stmt_cost (target_cost_data
, 1,
3607 vec_to_scalar
, stmt_info
, 0,
3613 int vec_size_in_bits
= tree_to_uhwi (TYPE_SIZE (vectype
));
3615 TYPE_SIZE (TREE_TYPE (gimple_assign_lhs (orig_stmt
)));
3616 int element_bitsize
= tree_to_uhwi (bitsize
);
3617 int nelements
= vec_size_in_bits
/ element_bitsize
;
3619 optab
= optab_for_tree_code (code
, vectype
, optab_default
);
3621 /* We have a whole vector shift available. */
3622 if (VECTOR_MODE_P (mode
)
3623 && optab_handler (optab
, mode
) != CODE_FOR_nothing
3624 && have_whole_vector_shift (mode
))
3626 /* Final reduction via vector shifts and the reduction operator.
3627 Also requires scalar extract. */
3628 epilogue_cost
+= add_stmt_cost (target_cost_data
,
3629 exact_log2 (nelements
) * 2,
3630 vector_stmt
, stmt_info
, 0,
3632 epilogue_cost
+= add_stmt_cost (target_cost_data
, 1,
3633 vec_to_scalar
, stmt_info
, 0,
3637 /* Use extracts and reduction op for final reduction. For N
3638 elements, we have N extracts and N-1 reduction ops. */
3639 epilogue_cost
+= add_stmt_cost (target_cost_data
,
3640 nelements
+ nelements
- 1,
3641 vector_stmt
, stmt_info
, 0,
3646 if (dump_enabled_p ())
3647 dump_printf (MSG_NOTE
,
3648 "vect_model_reduction_cost: inside_cost = %d, "
3649 "prologue_cost = %d, epilogue_cost = %d .\n", inside_cost
,
3650 prologue_cost
, epilogue_cost
);
3656 /* Function vect_model_induction_cost.
3658 Models cost for induction operations. */
3661 vect_model_induction_cost (stmt_vec_info stmt_info
, int ncopies
)
3663 loop_vec_info loop_vinfo
= STMT_VINFO_LOOP_VINFO (stmt_info
);
3664 void *target_cost_data
= LOOP_VINFO_TARGET_COST_DATA (loop_vinfo
);
3665 unsigned inside_cost
, prologue_cost
;
3667 /* loop cost for vec_loop. */
3668 inside_cost
= add_stmt_cost (target_cost_data
, ncopies
, vector_stmt
,
3669 stmt_info
, 0, vect_body
);
3671 /* prologue cost for vec_init and vec_step. */
3672 prologue_cost
= add_stmt_cost (target_cost_data
, 2, scalar_to_vec
,
3673 stmt_info
, 0, vect_prologue
);
3675 if (dump_enabled_p ())
3676 dump_printf_loc (MSG_NOTE
, vect_location
,
3677 "vect_model_induction_cost: inside_cost = %d, "
3678 "prologue_cost = %d .\n", inside_cost
, prologue_cost
);
3682 /* Function get_initial_def_for_induction
3685 STMT - a stmt that performs an induction operation in the loop.
3686 IV_PHI - the initial value of the induction variable
3689 Return a vector variable, initialized with the first VF values of
3690 the induction variable. E.g., for an iv with IV_PHI='X' and
3691 evolution S, for a vector of 4 units, we want to return:
3692 [X, X + S, X + 2*S, X + 3*S]. */
3695 get_initial_def_for_induction (gimple
*iv_phi
)
3697 stmt_vec_info stmt_vinfo
= vinfo_for_stmt (iv_phi
);
3698 loop_vec_info loop_vinfo
= STMT_VINFO_LOOP_VINFO (stmt_vinfo
);
3699 struct loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
3702 edge pe
= loop_preheader_edge (loop
);
3703 struct loop
*iv_loop
;
3705 tree new_vec
, vec_init
, vec_step
, t
;
3708 gphi
*induction_phi
;
3709 tree induc_def
, vec_def
, vec_dest
;
3710 tree init_expr
, step_expr
;
3711 int vf
= LOOP_VINFO_VECT_FACTOR (loop_vinfo
);
3715 stmt_vec_info phi_info
= vinfo_for_stmt (iv_phi
);
3716 bool nested_in_vect_loop
= false;
3718 imm_use_iterator imm_iter
;
3719 use_operand_p use_p
;
3723 gimple_stmt_iterator si
;
3724 basic_block bb
= gimple_bb (iv_phi
);
3728 /* Is phi in an inner-loop, while vectorizing an enclosing outer-loop? */
3729 if (nested_in_vect_loop_p (loop
, iv_phi
))
3731 nested_in_vect_loop
= true;
3732 iv_loop
= loop
->inner
;
3736 gcc_assert (iv_loop
== (gimple_bb (iv_phi
))->loop_father
);
3738 latch_e
= loop_latch_edge (iv_loop
);
3739 loop_arg
= PHI_ARG_DEF_FROM_EDGE (iv_phi
, latch_e
);
3741 step_expr
= STMT_VINFO_LOOP_PHI_EVOLUTION_PART (phi_info
);
3742 gcc_assert (step_expr
!= NULL_TREE
);
3744 pe
= loop_preheader_edge (iv_loop
);
3745 init_expr
= PHI_ARG_DEF_FROM_EDGE (iv_phi
,
3746 loop_preheader_edge (iv_loop
));
3748 vectype
= get_vectype_for_scalar_type (TREE_TYPE (init_expr
));
3749 resvectype
= get_vectype_for_scalar_type (TREE_TYPE (PHI_RESULT (iv_phi
)));
3750 gcc_assert (vectype
);
3751 nunits
= TYPE_VECTOR_SUBPARTS (vectype
);
3752 ncopies
= vf
/ nunits
;
3754 gcc_assert (phi_info
);
3755 gcc_assert (ncopies
>= 1);
3757 /* Convert the step to the desired type. */
3759 step_expr
= gimple_convert (&stmts
, TREE_TYPE (vectype
), step_expr
);
3762 new_bb
= gsi_insert_seq_on_edge_immediate (pe
, stmts
);
3763 gcc_assert (!new_bb
);
3766 /* Find the first insertion point in the BB. */
3767 si
= gsi_after_labels (bb
);
3769 /* Create the vector that holds the initial_value of the induction. */
3770 if (nested_in_vect_loop
)
3772 /* iv_loop is nested in the loop to be vectorized. init_expr had already
3773 been created during vectorization of previous stmts. We obtain it
3774 from the STMT_VINFO_VEC_STMT of the defining stmt. */
3775 vec_init
= vect_get_vec_def_for_operand (init_expr
, iv_phi
);
3776 /* If the initial value is not of proper type, convert it. */
3777 if (!useless_type_conversion_p (vectype
, TREE_TYPE (vec_init
)))
3780 = gimple_build_assign (vect_get_new_ssa_name (vectype
,
3784 build1 (VIEW_CONVERT_EXPR
, vectype
,
3786 vec_init
= gimple_assign_lhs (new_stmt
);
3787 new_bb
= gsi_insert_on_edge_immediate (loop_preheader_edge (iv_loop
),
3789 gcc_assert (!new_bb
);
3790 set_vinfo_for_stmt (new_stmt
,
3791 new_stmt_vec_info (new_stmt
, loop_vinfo
));
3796 vec
<constructor_elt
, va_gc
> *v
;
3798 /* iv_loop is the loop to be vectorized. Create:
3799 vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr) */
3801 new_name
= gimple_convert (&stmts
, TREE_TYPE (vectype
), init_expr
);
3803 vec_alloc (v
, nunits
);
3804 bool constant_p
= is_gimple_min_invariant (new_name
);
3805 CONSTRUCTOR_APPEND_ELT (v
, NULL_TREE
, new_name
);
3806 for (i
= 1; i
< nunits
; i
++)
3808 /* Create: new_name_i = new_name + step_expr */
3809 new_name
= gimple_build (&stmts
, PLUS_EXPR
, TREE_TYPE (new_name
),
3810 new_name
, step_expr
);
3811 if (!is_gimple_min_invariant (new_name
))
3813 CONSTRUCTOR_APPEND_ELT (v
, NULL_TREE
, new_name
);
3817 new_bb
= gsi_insert_seq_on_edge_immediate (pe
, stmts
);
3818 gcc_assert (!new_bb
);
3821 /* Create a vector from [new_name_0, new_name_1, ..., new_name_nunits-1] */
3823 new_vec
= build_vector_from_ctor (vectype
, v
);
3825 new_vec
= build_constructor (vectype
, v
);
3826 vec_init
= vect_init_vector (iv_phi
, new_vec
, vectype
, NULL
);
3830 /* Create the vector that holds the step of the induction. */
3831 if (nested_in_vect_loop
)
3832 /* iv_loop is nested in the loop to be vectorized. Generate:
3833 vec_step = [S, S, S, S] */
3834 new_name
= step_expr
;
3837 /* iv_loop is the loop to be vectorized. Generate:
3838 vec_step = [VF*S, VF*S, VF*S, VF*S] */
3839 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr
)))
3841 expr
= build_int_cst (integer_type_node
, vf
);
3842 expr
= fold_convert (TREE_TYPE (step_expr
), expr
);
3845 expr
= build_int_cst (TREE_TYPE (step_expr
), vf
);
3846 new_name
= fold_build2 (MULT_EXPR
, TREE_TYPE (step_expr
),
3848 if (TREE_CODE (step_expr
) == SSA_NAME
)
3849 new_name
= vect_init_vector (iv_phi
, new_name
,
3850 TREE_TYPE (step_expr
), NULL
);
3853 t
= unshare_expr (new_name
);
3854 gcc_assert (CONSTANT_CLASS_P (new_name
)
3855 || TREE_CODE (new_name
) == SSA_NAME
);
3856 stepvectype
= get_vectype_for_scalar_type (TREE_TYPE (new_name
));
3857 gcc_assert (stepvectype
);
3858 new_vec
= build_vector_from_val (stepvectype
, t
);
3859 vec_step
= vect_init_vector (iv_phi
, new_vec
, stepvectype
, NULL
);
3862 /* Create the following def-use cycle:
3867 vec_iv = PHI <vec_init, vec_loop>
3871 vec_loop = vec_iv + vec_step; */
3873 /* Create the induction-phi that defines the induction-operand. */
3874 vec_dest
= vect_get_new_vect_var (vectype
, vect_simple_var
, "vec_iv_");
3875 induction_phi
= create_phi_node (vec_dest
, iv_loop
->header
);
3876 set_vinfo_for_stmt (induction_phi
,
3877 new_stmt_vec_info (induction_phi
, loop_vinfo
));
3878 induc_def
= PHI_RESULT (induction_phi
);
3880 /* Create the iv update inside the loop */
3881 new_stmt
= gimple_build_assign (vec_dest
, PLUS_EXPR
, induc_def
, vec_step
);
3882 vec_def
= make_ssa_name (vec_dest
, new_stmt
);
3883 gimple_assign_set_lhs (new_stmt
, vec_def
);
3884 gsi_insert_before (&si
, new_stmt
, GSI_SAME_STMT
);
3885 set_vinfo_for_stmt (new_stmt
, new_stmt_vec_info (new_stmt
, loop_vinfo
));
3887 /* Set the arguments of the phi node: */
3888 add_phi_arg (induction_phi
, vec_init
, pe
, UNKNOWN_LOCATION
);
3889 add_phi_arg (induction_phi
, vec_def
, loop_latch_edge (iv_loop
),
3893 /* In case that vectorization factor (VF) is bigger than the number
3894 of elements that we can fit in a vectype (nunits), we have to generate
3895 more than one vector stmt - i.e - we need to "unroll" the
3896 vector stmt by a factor VF/nunits. For more details see documentation
3897 in vectorizable_operation. */
3901 stmt_vec_info prev_stmt_vinfo
;
3902 /* FORNOW. This restriction should be relaxed. */
3903 gcc_assert (!nested_in_vect_loop
);
3905 /* Create the vector that holds the step of the induction. */
3906 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr
)))
3908 expr
= build_int_cst (integer_type_node
, nunits
);
3909 expr
= fold_convert (TREE_TYPE (step_expr
), expr
);
3912 expr
= build_int_cst (TREE_TYPE (step_expr
), nunits
);
3913 new_name
= fold_build2 (MULT_EXPR
, TREE_TYPE (step_expr
),
3915 if (TREE_CODE (step_expr
) == SSA_NAME
)
3916 new_name
= vect_init_vector (iv_phi
, new_name
,
3917 TREE_TYPE (step_expr
), NULL
);
3918 t
= unshare_expr (new_name
);
3919 gcc_assert (CONSTANT_CLASS_P (new_name
)
3920 || TREE_CODE (new_name
) == SSA_NAME
);
3921 new_vec
= build_vector_from_val (stepvectype
, t
);
3922 vec_step
= vect_init_vector (iv_phi
, new_vec
, stepvectype
, NULL
);
3924 vec_def
= induc_def
;
3925 prev_stmt_vinfo
= vinfo_for_stmt (induction_phi
);
3926 for (i
= 1; i
< ncopies
; i
++)
3928 /* vec_i = vec_prev + vec_step */
3929 new_stmt
= gimple_build_assign (vec_dest
, PLUS_EXPR
,
3931 vec_def
= make_ssa_name (vec_dest
, new_stmt
);
3932 gimple_assign_set_lhs (new_stmt
, vec_def
);
3934 gsi_insert_before (&si
, new_stmt
, GSI_SAME_STMT
);
3935 if (!useless_type_conversion_p (resvectype
, vectype
))
3938 = gimple_build_assign
3939 (vect_get_new_vect_var (resvectype
, vect_simple_var
,
3942 build1 (VIEW_CONVERT_EXPR
, resvectype
,
3943 gimple_assign_lhs (new_stmt
)));
3944 gimple_assign_set_lhs (new_stmt
,
3946 (gimple_assign_lhs (new_stmt
), new_stmt
));
3947 gsi_insert_before (&si
, new_stmt
, GSI_SAME_STMT
);
3949 set_vinfo_for_stmt (new_stmt
,
3950 new_stmt_vec_info (new_stmt
, loop_vinfo
));
3951 STMT_VINFO_RELATED_STMT (prev_stmt_vinfo
) = new_stmt
;
3952 prev_stmt_vinfo
= vinfo_for_stmt (new_stmt
);
3956 if (nested_in_vect_loop
)
3958 /* Find the loop-closed exit-phi of the induction, and record
3959 the final vector of induction results: */
3961 FOR_EACH_IMM_USE_FAST (use_p
, imm_iter
, loop_arg
)
3963 gimple
*use_stmt
= USE_STMT (use_p
);
3964 if (is_gimple_debug (use_stmt
))
3967 if (!flow_bb_inside_loop_p (iv_loop
, gimple_bb (use_stmt
)))
3969 exit_phi
= use_stmt
;
3975 stmt_vec_info stmt_vinfo
= vinfo_for_stmt (exit_phi
);
3976 /* FORNOW. Currently not supporting the case that an inner-loop induction
3977 is not used in the outer-loop (i.e. only outside the outer-loop). */
3978 gcc_assert (STMT_VINFO_RELEVANT_P (stmt_vinfo
)
3979 && !STMT_VINFO_LIVE_P (stmt_vinfo
));
3981 STMT_VINFO_VEC_STMT (stmt_vinfo
) = new_stmt
;
3982 if (dump_enabled_p ())
3984 dump_printf_loc (MSG_NOTE
, vect_location
,
3985 "vector of inductions after inner-loop:");
3986 dump_gimple_stmt (MSG_NOTE
, TDF_SLIM
, new_stmt
, 0);
3987 dump_printf (MSG_NOTE
, "\n");
3993 if (dump_enabled_p ())
3995 dump_printf_loc (MSG_NOTE
, vect_location
,
3996 "transform induction: created def-use cycle: ");
3997 dump_gimple_stmt (MSG_NOTE
, TDF_SLIM
, induction_phi
, 0);
3998 dump_printf (MSG_NOTE
, "\n");
3999 dump_gimple_stmt (MSG_NOTE
, TDF_SLIM
,
4000 SSA_NAME_DEF_STMT (vec_def
), 0);
4001 dump_printf (MSG_NOTE
, "\n");
4004 STMT_VINFO_VEC_STMT (phi_info
) = induction_phi
;
4005 if (!useless_type_conversion_p (resvectype
, vectype
))
4007 new_stmt
= gimple_build_assign (vect_get_new_vect_var (resvectype
,
4011 build1 (VIEW_CONVERT_EXPR
, resvectype
,
4013 induc_def
= make_ssa_name (gimple_assign_lhs (new_stmt
), new_stmt
);
4014 gimple_assign_set_lhs (new_stmt
, induc_def
);
4015 si
= gsi_after_labels (bb
);
4016 gsi_insert_before (&si
, new_stmt
, GSI_SAME_STMT
);
4017 set_vinfo_for_stmt (new_stmt
,
4018 new_stmt_vec_info (new_stmt
, loop_vinfo
));
4019 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (new_stmt
))
4020 = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (induction_phi
));
4027 /* Function get_initial_def_for_reduction
4030 STMT - a stmt that performs a reduction operation in the loop.
4031 INIT_VAL - the initial value of the reduction variable
4034 ADJUSTMENT_DEF - a tree that holds a value to be added to the final result
4035 of the reduction (used for adjusting the epilog - see below).
4036 Return a vector variable, initialized according to the operation that STMT
4037 performs. This vector will be used as the initial value of the
4038 vector of partial results.
4040 Option1 (adjust in epilog): Initialize the vector as follows:
4041 add/bit or/xor: [0,0,...,0,0]
4042 mult/bit and: [1,1,...,1,1]
4043 min/max/cond_expr: [init_val,init_val,..,init_val,init_val]
4044 and when necessary (e.g. add/mult case) let the caller know
4045 that it needs to adjust the result by init_val.
4047 Option2: Initialize the vector as follows:
4048 add/bit or/xor: [init_val,0,0,...,0]
4049 mult/bit and: [init_val,1,1,...,1]
4050 min/max/cond_expr: [init_val,init_val,...,init_val]
4051 and no adjustments are needed.
4053 For example, for the following code:
4059 STMT is 's = s + a[i]', and the reduction variable is 's'.
4060 For a vector of 4 units, we want to return either [0,0,0,init_val],
4061 or [0,0,0,0] and let the caller know that it needs to adjust
4062 the result at the end by 'init_val'.
4064 FORNOW, we are using the 'adjust in epilog' scheme, because this way the
4065 initialization vector is simpler (same element in all entries), if
4066 ADJUSTMENT_DEF is not NULL, and Option2 otherwise.
4068 A cost model should help decide between these two schemes. */
4071 get_initial_def_for_reduction (gimple
*stmt
, tree init_val
,
4072 tree
*adjustment_def
)
4074 stmt_vec_info stmt_vinfo
= vinfo_for_stmt (stmt
);
4075 loop_vec_info loop_vinfo
= STMT_VINFO_LOOP_VINFO (stmt_vinfo
);
4076 struct loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
4077 tree scalar_type
= TREE_TYPE (init_val
);
4078 tree vectype
= get_vectype_for_scalar_type (scalar_type
);
4080 enum tree_code code
= gimple_assign_rhs_code (stmt
);
4085 bool nested_in_vect_loop
= false;
4086 REAL_VALUE_TYPE real_init_val
= dconst0
;
4087 int int_init_val
= 0;
4088 gimple
*def_stmt
= NULL
;
4089 gimple_seq stmts
= NULL
;
4091 gcc_assert (vectype
);
4092 nunits
= TYPE_VECTOR_SUBPARTS (vectype
);
4094 gcc_assert (POINTER_TYPE_P (scalar_type
) || INTEGRAL_TYPE_P (scalar_type
)
4095 || SCALAR_FLOAT_TYPE_P (scalar_type
));
4097 if (nested_in_vect_loop_p (loop
, stmt
))
4098 nested_in_vect_loop
= true;
4100 gcc_assert (loop
== (gimple_bb (stmt
))->loop_father
);
4102 /* In case of double reduction we only create a vector variable to be put
4103 in the reduction phi node. The actual statement creation is done in
4104 vect_create_epilog_for_reduction. */
4105 if (adjustment_def
&& nested_in_vect_loop
4106 && TREE_CODE (init_val
) == SSA_NAME
4107 && (def_stmt
= SSA_NAME_DEF_STMT (init_val
))
4108 && gimple_code (def_stmt
) == GIMPLE_PHI
4109 && flow_bb_inside_loop_p (loop
, gimple_bb (def_stmt
))
4110 && vinfo_for_stmt (def_stmt
)
4111 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt
))
4112 == vect_double_reduction_def
)
4114 *adjustment_def
= NULL
;
4115 return vect_create_destination_var (init_val
, vectype
);
4118 /* In case of a nested reduction do not use an adjustment def as
4119 that case is not supported by the epilogue generation correctly
4120 if ncopies is not one. */
4121 if (adjustment_def
&& nested_in_vect_loop
)
4123 *adjustment_def
= NULL
;
4124 return vect_get_vec_def_for_operand (init_val
, stmt
);
4129 case WIDEN_SUM_EXPR
:
4138 /* ADJUSMENT_DEF is NULL when called from
4139 vect_create_epilog_for_reduction to vectorize double reduction. */
4141 *adjustment_def
= init_val
;
4143 if (code
== MULT_EXPR
)
4145 real_init_val
= dconst1
;
4149 if (code
== BIT_AND_EXPR
)
4152 if (SCALAR_FLOAT_TYPE_P (scalar_type
))
4153 def_for_init
= build_real (scalar_type
, real_init_val
);
4155 def_for_init
= build_int_cst (scalar_type
, int_init_val
);
4157 /* Create a vector of '0' or '1' except the first element. */
4158 elts
= XALLOCAVEC (tree
, nunits
);
4159 for (i
= nunits
- 2; i
>= 0; --i
)
4160 elts
[i
+ 1] = def_for_init
;
4162 /* Option1: the first element is '0' or '1' as well. */
4165 elts
[0] = def_for_init
;
4166 init_def
= build_vector (vectype
, elts
);
4170 /* Option2: the first element is INIT_VAL. */
4172 if (TREE_CONSTANT (init_val
))
4173 init_def
= build_vector (vectype
, elts
);
4176 vec
<constructor_elt
, va_gc
> *v
;
4177 vec_alloc (v
, nunits
);
4178 CONSTRUCTOR_APPEND_ELT (v
, NULL_TREE
, init_val
);
4179 for (i
= 1; i
< nunits
; ++i
)
4180 CONSTRUCTOR_APPEND_ELT (v
, NULL_TREE
, elts
[i
]);
4181 init_def
= build_constructor (vectype
, v
);
4191 *adjustment_def
= NULL_TREE
;
4192 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_vinfo
) != COND_REDUCTION
)
4194 init_def
= vect_get_vec_def_for_operand (init_val
, stmt
);
4198 init_val
= gimple_convert (&stmts
, TREE_TYPE (vectype
), init_val
);
4199 if (! gimple_seq_empty_p (stmts
))
4200 gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop
), stmts
);
4201 init_def
= build_vector_from_val (vectype
, init_val
);
4211 /* Function vect_create_epilog_for_reduction
4213 Create code at the loop-epilog to finalize the result of a reduction
4216 VECT_DEFS is list of vector of partial results, i.e., the lhs's of vector
4217 reduction statements.
4218 STMT is the scalar reduction stmt that is being vectorized.
4219 NCOPIES is > 1 in case the vectorization factor (VF) is bigger than the
4220 number of elements that we can fit in a vectype (nunits). In this case
4221 we have to generate more than one vector stmt - i.e - we need to "unroll"
4222 the vector stmt by a factor VF/nunits. For more details see documentation
4223 in vectorizable_operation.
4224 REDUC_CODE is the tree-code for the epilog reduction.
4225 REDUCTION_PHIS is a list of the phi-nodes that carry the reduction
4227 REDUC_INDEX is the index of the operand in the right hand side of the
4228 statement that is defined by REDUCTION_PHI.
4229 DOUBLE_REDUC is TRUE if double reduction phi nodes should be handled.
4230 SLP_NODE is an SLP node containing a group of reduction statements. The
4231 first one in this group is STMT.
4232 INDUCTION_INDEX is the index of the loop for condition reductions.
4233 Otherwise it is undefined.
4236 1. Creates the reduction def-use cycles: sets the arguments for
4238 The loop-entry argument is the vectorized initial-value of the reduction.
4239 The loop-latch argument is taken from VECT_DEFS - the vector of partial
4241 2. "Reduces" each vector of partial results VECT_DEFS into a single result,
4242 by applying the operation specified by REDUC_CODE if available, or by
4243 other means (whole-vector shifts or a scalar loop).
4244 The function also creates a new phi node at the loop exit to preserve
4245 loop-closed form, as illustrated below.
4247 The flow at the entry to this function:
4250 vec_def = phi <null, null> # REDUCTION_PHI
4251 VECT_DEF = vector_stmt # vectorized form of STMT
4252 s_loop = scalar_stmt # (scalar) STMT
4254 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
4258 The above is transformed by this function into:
4261 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
4262 VECT_DEF = vector_stmt # vectorized form of STMT
4263 s_loop = scalar_stmt # (scalar) STMT
4265 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
4266 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
4267 v_out2 = reduce <v_out1>
4268 s_out3 = extract_field <v_out2, 0>
4269 s_out4 = adjust_result <s_out3>
4275 vect_create_epilog_for_reduction (vec
<tree
> vect_defs
, gimple
*stmt
,
4276 int ncopies
, enum tree_code reduc_code
,
4277 vec
<gimple
*> reduction_phis
,
4278 int reduc_index
, bool double_reduc
,
4279 slp_tree slp_node
, tree induction_index
)
4281 stmt_vec_info stmt_info
= vinfo_for_stmt (stmt
);
4282 stmt_vec_info prev_phi_info
;
4285 loop_vec_info loop_vinfo
= STMT_VINFO_LOOP_VINFO (stmt_info
);
4286 struct loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
), *outer_loop
= NULL
;
4287 basic_block exit_bb
;
4290 gimple
*new_phi
= NULL
, *phi
;
4291 gimple_stmt_iterator exit_gsi
;
4293 tree new_temp
= NULL_TREE
, new_dest
, new_name
, new_scalar_dest
;
4294 gimple
*epilog_stmt
= NULL
;
4295 enum tree_code code
= gimple_assign_rhs_code (stmt
);
4298 tree adjustment_def
= NULL
;
4299 tree vec_initial_def
= NULL
;
4300 tree reduction_op
, expr
, def
, initial_def
= NULL
;
4301 tree orig_name
, scalar_result
;
4302 imm_use_iterator imm_iter
, phi_imm_iter
;
4303 use_operand_p use_p
, phi_use_p
;
4304 gimple
*use_stmt
, *orig_stmt
, *reduction_phi
= NULL
;
4305 bool nested_in_vect_loop
= false;
4306 auto_vec
<gimple
*> new_phis
;
4307 auto_vec
<gimple
*> inner_phis
;
4308 enum vect_def_type dt
= vect_unknown_def_type
;
4310 auto_vec
<tree
> scalar_results
;
4311 unsigned int group_size
= 1, k
, ratio
;
4312 auto_vec
<tree
> vec_initial_defs
;
4313 auto_vec
<gimple
*> phis
;
4314 bool slp_reduc
= false;
4315 tree new_phi_result
;
4316 gimple
*inner_phi
= NULL
;
4319 group_size
= SLP_TREE_SCALAR_STMTS (slp_node
).length ();
4321 if (nested_in_vect_loop_p (loop
, stmt
))
4325 nested_in_vect_loop
= true;
4326 gcc_assert (!slp_node
);
4329 reduction_op
= get_reduction_op (stmt
, reduc_index
);
4331 vectype
= get_vectype_for_scalar_type (TREE_TYPE (reduction_op
));
4332 gcc_assert (vectype
);
4333 mode
= TYPE_MODE (vectype
);
4335 /* 1. Create the reduction def-use cycle:
4336 Set the arguments of REDUCTION_PHIS, i.e., transform
4339 vec_def = phi <null, null> # REDUCTION_PHI
4340 VECT_DEF = vector_stmt # vectorized form of STMT
4346 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
4347 VECT_DEF = vector_stmt # vectorized form of STMT
4350 (in case of SLP, do it for all the phis). */
4352 /* Get the loop-entry arguments. */
4353 enum vect_def_type initial_def_dt
= vect_unknown_def_type
;
4355 vect_get_vec_defs (reduction_op
, NULL_TREE
, stmt
, &vec_initial_defs
,
4356 NULL
, slp_node
, reduc_index
);
4359 /* Get at the scalar def before the loop, that defines the initial value
4360 of the reduction variable. */
4361 gimple
*def_stmt
= SSA_NAME_DEF_STMT (reduction_op
);
4362 initial_def
= PHI_ARG_DEF_FROM_EDGE (def_stmt
,
4363 loop_preheader_edge (loop
));
4364 vect_is_simple_use (initial_def
, loop_vinfo
, &def_stmt
, &initial_def_dt
);
4365 vec_initial_def
= get_initial_def_for_reduction (stmt
, initial_def
,
4367 vec_initial_defs
.create (1);
4368 vec_initial_defs
.quick_push (vec_initial_def
);
4371 /* Set phi nodes arguments. */
4372 FOR_EACH_VEC_ELT (reduction_phis
, i
, phi
)
4374 tree vec_init_def
, def
;
4376 vec_init_def
= force_gimple_operand (vec_initial_defs
[i
], &stmts
,
4378 gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop
), stmts
);
4380 for (j
= 0; j
< ncopies
; j
++)
4384 phi
= STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi
));
4385 if (nested_in_vect_loop
)
4387 = vect_get_vec_def_for_stmt_copy (initial_def_dt
,
4391 /* Set the loop-entry arg of the reduction-phi. */
4393 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info
)
4394 == INTEGER_INDUC_COND_REDUCTION
)
4396 /* Initialise the reduction phi to zero. This prevents initial
4397 values of non-zero interferring with the reduction op. */
4398 gcc_assert (ncopies
== 1);
4399 gcc_assert (i
== 0);
4401 tree vec_init_def_type
= TREE_TYPE (vec_init_def
);
4402 tree zero_vec
= build_zero_cst (vec_init_def_type
);
4404 add_phi_arg (as_a
<gphi
*> (phi
), zero_vec
,
4405 loop_preheader_edge (loop
), UNKNOWN_LOCATION
);
4408 add_phi_arg (as_a
<gphi
*> (phi
), vec_init_def
,
4409 loop_preheader_edge (loop
), UNKNOWN_LOCATION
);
4411 /* Set the loop-latch arg for the reduction-phi. */
4413 def
= vect_get_vec_def_for_stmt_copy (vect_unknown_def_type
, def
);
4415 add_phi_arg (as_a
<gphi
*> (phi
), def
, loop_latch_edge (loop
),
4418 if (dump_enabled_p ())
4420 dump_printf_loc (MSG_NOTE
, vect_location
,
4421 "transform reduction: created def-use cycle: ");
4422 dump_gimple_stmt (MSG_NOTE
, TDF_SLIM
, phi
, 0);
4423 dump_printf (MSG_NOTE
, "\n");
4424 dump_gimple_stmt (MSG_NOTE
, TDF_SLIM
, SSA_NAME_DEF_STMT (def
), 0);
4425 dump_printf (MSG_NOTE
, "\n");
4430 /* 2. Create epilog code.
4431 The reduction epilog code operates across the elements of the vector
4432 of partial results computed by the vectorized loop.
4433 The reduction epilog code consists of:
4435 step 1: compute the scalar result in a vector (v_out2)
4436 step 2: extract the scalar result (s_out3) from the vector (v_out2)
4437 step 3: adjust the scalar result (s_out3) if needed.
4439 Step 1 can be accomplished using one the following three schemes:
4440 (scheme 1) using reduc_code, if available.
4441 (scheme 2) using whole-vector shifts, if available.
4442 (scheme 3) using a scalar loop. In this case steps 1+2 above are
4445 The overall epilog code looks like this:
4447 s_out0 = phi <s_loop> # original EXIT_PHI
4448 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
4449 v_out2 = reduce <v_out1> # step 1
4450 s_out3 = extract_field <v_out2, 0> # step 2
4451 s_out4 = adjust_result <s_out3> # step 3
4453 (step 3 is optional, and steps 1 and 2 may be combined).
4454 Lastly, the uses of s_out0 are replaced by s_out4. */
4457 /* 2.1 Create new loop-exit-phis to preserve loop-closed form:
4458 v_out1 = phi <VECT_DEF>
4459 Store them in NEW_PHIS. */
4461 exit_bb
= single_exit (loop
)->dest
;
4462 prev_phi_info
= NULL
;
4463 new_phis
.create (vect_defs
.length ());
4464 FOR_EACH_VEC_ELT (vect_defs
, i
, def
)
4466 for (j
= 0; j
< ncopies
; j
++)
4468 tree new_def
= copy_ssa_name (def
);
4469 phi
= create_phi_node (new_def
, exit_bb
);
4470 set_vinfo_for_stmt (phi
, new_stmt_vec_info (phi
, loop_vinfo
));
4472 new_phis
.quick_push (phi
);
4475 def
= vect_get_vec_def_for_stmt_copy (dt
, def
);
4476 STMT_VINFO_RELATED_STMT (prev_phi_info
) = phi
;
4479 SET_PHI_ARG_DEF (phi
, single_exit (loop
)->dest_idx
, def
);
4480 prev_phi_info
= vinfo_for_stmt (phi
);
4484 /* The epilogue is created for the outer-loop, i.e., for the loop being
4485 vectorized. Create exit phis for the outer loop. */
4489 exit_bb
= single_exit (loop
)->dest
;
4490 inner_phis
.create (vect_defs
.length ());
4491 FOR_EACH_VEC_ELT (new_phis
, i
, phi
)
4493 tree new_result
= copy_ssa_name (PHI_RESULT (phi
));
4494 gphi
*outer_phi
= create_phi_node (new_result
, exit_bb
);
4495 SET_PHI_ARG_DEF (outer_phi
, single_exit (loop
)->dest_idx
,
4497 set_vinfo_for_stmt (outer_phi
, new_stmt_vec_info (outer_phi
,
4499 inner_phis
.quick_push (phi
);
4500 new_phis
[i
] = outer_phi
;
4501 prev_phi_info
= vinfo_for_stmt (outer_phi
);
4502 while (STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi
)))
4504 phi
= STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi
));
4505 new_result
= copy_ssa_name (PHI_RESULT (phi
));
4506 outer_phi
= create_phi_node (new_result
, exit_bb
);
4507 SET_PHI_ARG_DEF (outer_phi
, single_exit (loop
)->dest_idx
,
4509 set_vinfo_for_stmt (outer_phi
, new_stmt_vec_info (outer_phi
,
4511 STMT_VINFO_RELATED_STMT (prev_phi_info
) = outer_phi
;
4512 prev_phi_info
= vinfo_for_stmt (outer_phi
);
4517 exit_gsi
= gsi_after_labels (exit_bb
);
4519 /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
4520 (i.e. when reduc_code is not available) and in the final adjustment
4521 code (if needed). Also get the original scalar reduction variable as
4522 defined in the loop. In case STMT is a "pattern-stmt" (i.e. - it
4523 represents a reduction pattern), the tree-code and scalar-def are
4524 taken from the original stmt that the pattern-stmt (STMT) replaces.
4525 Otherwise (it is a regular reduction) - the tree-code and scalar-def
4526 are taken from STMT. */
4528 orig_stmt
= STMT_VINFO_RELATED_STMT (stmt_info
);
4531 /* Regular reduction */
4536 /* Reduction pattern */
4537 stmt_vec_info stmt_vinfo
= vinfo_for_stmt (orig_stmt
);
4538 gcc_assert (STMT_VINFO_IN_PATTERN_P (stmt_vinfo
));
4539 gcc_assert (STMT_VINFO_RELATED_STMT (stmt_vinfo
) == stmt
);
4542 code
= gimple_assign_rhs_code (orig_stmt
);
4543 /* For MINUS_EXPR the initial vector is [init_val,0,...,0], therefore,
4544 partial results are added and not subtracted. */
4545 if (code
== MINUS_EXPR
)
4548 scalar_dest
= gimple_assign_lhs (orig_stmt
);
4549 scalar_type
= TREE_TYPE (scalar_dest
);
4550 scalar_results
.create (group_size
);
4551 new_scalar_dest
= vect_create_destination_var (scalar_dest
, NULL
);
4552 bitsize
= TYPE_SIZE (scalar_type
);
4554 /* In case this is a reduction in an inner-loop while vectorizing an outer
4555 loop - we don't need to extract a single scalar result at the end of the
4556 inner-loop (unless it is double reduction, i.e., the use of reduction is
4557 outside the outer-loop). The final vector of partial results will be used
4558 in the vectorized outer-loop, or reduced to a scalar result at the end of
4560 if (nested_in_vect_loop
&& !double_reduc
)
4561 goto vect_finalize_reduction
;
4563 /* SLP reduction without reduction chain, e.g.,
4567 b2 = operation (b1) */
4568 slp_reduc
= (slp_node
&& !GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt
)));
4570 /* In case of reduction chain, e.g.,
4573 a3 = operation (a2),
4575 we may end up with more than one vector result. Here we reduce them to
4577 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt
)))
4579 tree first_vect
= PHI_RESULT (new_phis
[0]);
4581 gassign
*new_vec_stmt
= NULL
;
4583 vec_dest
= vect_create_destination_var (scalar_dest
, vectype
);
4584 for (k
= 1; k
< new_phis
.length (); k
++)
4586 gimple
*next_phi
= new_phis
[k
];
4587 tree second_vect
= PHI_RESULT (next_phi
);
4589 tmp
= build2 (code
, vectype
, first_vect
, second_vect
);
4590 new_vec_stmt
= gimple_build_assign (vec_dest
, tmp
);
4591 first_vect
= make_ssa_name (vec_dest
, new_vec_stmt
);
4592 gimple_assign_set_lhs (new_vec_stmt
, first_vect
);
4593 gsi_insert_before (&exit_gsi
, new_vec_stmt
, GSI_SAME_STMT
);
4596 new_phi_result
= first_vect
;
4599 new_phis
.truncate (0);
4600 new_phis
.safe_push (new_vec_stmt
);
4604 new_phi_result
= PHI_RESULT (new_phis
[0]);
4606 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info
) == COND_REDUCTION
)
4608 /* For condition reductions, we have a vector (NEW_PHI_RESULT) containing
4609 various data values where the condition matched and another vector
4610 (INDUCTION_INDEX) containing all the indexes of those matches. We
4611 need to extract the last matching index (which will be the index with
4612 highest value) and use this to index into the data vector.
4613 For the case where there were no matches, the data vector will contain
4614 all default values and the index vector will be all zeros. */
4616 /* Get various versions of the type of the vector of indexes. */
4617 tree index_vec_type
= TREE_TYPE (induction_index
);
4618 gcc_checking_assert (TYPE_UNSIGNED (index_vec_type
));
4619 tree index_scalar_type
= TREE_TYPE (index_vec_type
);
4620 tree index_vec_cmp_type
= build_same_sized_truth_vector_type
4623 /* Get an unsigned integer version of the type of the data vector. */
4624 int scalar_precision
= GET_MODE_PRECISION (TYPE_MODE (scalar_type
));
4625 tree scalar_type_unsigned
= make_unsigned_type (scalar_precision
);
4626 tree vectype_unsigned
= build_vector_type
4627 (scalar_type_unsigned
, TYPE_VECTOR_SUBPARTS (vectype
));
4629 /* First we need to create a vector (ZERO_VEC) of zeros and another
4630 vector (MAX_INDEX_VEC) filled with the last matching index, which we
4631 can create using a MAX reduction and then expanding.
4632 In the case where the loop never made any matches, the max index will
4635 /* Vector of {0, 0, 0,...}. */
4636 tree zero_vec
= make_ssa_name (vectype
);
4637 tree zero_vec_rhs
= build_zero_cst (vectype
);
4638 gimple
*zero_vec_stmt
= gimple_build_assign (zero_vec
, zero_vec_rhs
);
4639 gsi_insert_before (&exit_gsi
, zero_vec_stmt
, GSI_SAME_STMT
);
4641 /* Find maximum value from the vector of found indexes. */
4642 tree max_index
= make_ssa_name (index_scalar_type
);
4643 gimple
*max_index_stmt
= gimple_build_assign (max_index
, REDUC_MAX_EXPR
,
4645 gsi_insert_before (&exit_gsi
, max_index_stmt
, GSI_SAME_STMT
);
4647 /* Vector of {max_index, max_index, max_index,...}. */
4648 tree max_index_vec
= make_ssa_name (index_vec_type
);
4649 tree max_index_vec_rhs
= build_vector_from_val (index_vec_type
,
4651 gimple
*max_index_vec_stmt
= gimple_build_assign (max_index_vec
,
4653 gsi_insert_before (&exit_gsi
, max_index_vec_stmt
, GSI_SAME_STMT
);
4655 /* Next we compare the new vector (MAX_INDEX_VEC) full of max indexes
4656 with the vector (INDUCTION_INDEX) of found indexes, choosing values
4657 from the data vector (NEW_PHI_RESULT) for matches, 0 (ZERO_VEC)
4658 otherwise. Only one value should match, resulting in a vector
4659 (VEC_COND) with one data value and the rest zeros.
4660 In the case where the loop never made any matches, every index will
4661 match, resulting in a vector with all data values (which will all be
4662 the default value). */
4664 /* Compare the max index vector to the vector of found indexes to find
4665 the position of the max value. */
4666 tree vec_compare
= make_ssa_name (index_vec_cmp_type
);
4667 gimple
*vec_compare_stmt
= gimple_build_assign (vec_compare
, EQ_EXPR
,
4670 gsi_insert_before (&exit_gsi
, vec_compare_stmt
, GSI_SAME_STMT
);
4672 /* Use the compare to choose either values from the data vector or
4674 tree vec_cond
= make_ssa_name (vectype
);
4675 gimple
*vec_cond_stmt
= gimple_build_assign (vec_cond
, VEC_COND_EXPR
,
4676 vec_compare
, new_phi_result
,
4678 gsi_insert_before (&exit_gsi
, vec_cond_stmt
, GSI_SAME_STMT
);
4680 /* Finally we need to extract the data value from the vector (VEC_COND)
4681 into a scalar (MATCHED_DATA_REDUC). Logically we want to do a OR
4682 reduction, but because this doesn't exist, we can use a MAX reduction
4683 instead. The data value might be signed or a float so we need to cast
4685 In the case where the loop never made any matches, the data values are
4686 all identical, and so will reduce down correctly. */
4688 /* Make the matched data values unsigned. */
4689 tree vec_cond_cast
= make_ssa_name (vectype_unsigned
);
4690 tree vec_cond_cast_rhs
= build1 (VIEW_CONVERT_EXPR
, vectype_unsigned
,
4692 gimple
*vec_cond_cast_stmt
= gimple_build_assign (vec_cond_cast
,
4695 gsi_insert_before (&exit_gsi
, vec_cond_cast_stmt
, GSI_SAME_STMT
);
4697 /* Reduce down to a scalar value. */
4698 tree data_reduc
= make_ssa_name (scalar_type_unsigned
);
4699 optab ot
= optab_for_tree_code (REDUC_MAX_EXPR
, vectype_unsigned
,
4701 gcc_assert (optab_handler (ot
, TYPE_MODE (vectype_unsigned
))
4702 != CODE_FOR_nothing
);
4703 gimple
*data_reduc_stmt
= gimple_build_assign (data_reduc
,
4706 gsi_insert_before (&exit_gsi
, data_reduc_stmt
, GSI_SAME_STMT
);
4708 /* Convert the reduced value back to the result type and set as the
4710 tree data_reduc_cast
= build1 (VIEW_CONVERT_EXPR
, scalar_type
,
4712 epilog_stmt
= gimple_build_assign (new_scalar_dest
, data_reduc_cast
);
4713 new_temp
= make_ssa_name (new_scalar_dest
, epilog_stmt
);
4714 gimple_assign_set_lhs (epilog_stmt
, new_temp
);
4715 gsi_insert_before (&exit_gsi
, epilog_stmt
, GSI_SAME_STMT
);
4716 scalar_results
.safe_push (new_temp
);
4719 /* 2.3 Create the reduction code, using one of the three schemes described
4720 above. In SLP we simply need to extract all the elements from the
4721 vector (without reducing them), so we use scalar shifts. */
4722 else if (reduc_code
!= ERROR_MARK
&& !slp_reduc
)
4727 /*** Case 1: Create:
4728 v_out2 = reduc_expr <v_out1> */
4730 if (dump_enabled_p ())
4731 dump_printf_loc (MSG_NOTE
, vect_location
,
4732 "Reduce using direct vector reduction.\n");
4734 vec_elem_type
= TREE_TYPE (TREE_TYPE (new_phi_result
));
4735 if (!useless_type_conversion_p (scalar_type
, vec_elem_type
))
4738 vect_create_destination_var (scalar_dest
, vec_elem_type
);
4739 tmp
= build1 (reduc_code
, vec_elem_type
, new_phi_result
);
4740 epilog_stmt
= gimple_build_assign (tmp_dest
, tmp
);
4741 new_temp
= make_ssa_name (tmp_dest
, epilog_stmt
);
4742 gimple_assign_set_lhs (epilog_stmt
, new_temp
);
4743 gsi_insert_before (&exit_gsi
, epilog_stmt
, GSI_SAME_STMT
);
4745 tmp
= build1 (NOP_EXPR
, scalar_type
, new_temp
);
4748 tmp
= build1 (reduc_code
, scalar_type
, new_phi_result
);
4750 epilog_stmt
= gimple_build_assign (new_scalar_dest
, tmp
);
4751 new_temp
= make_ssa_name (new_scalar_dest
, epilog_stmt
);
4752 gimple_assign_set_lhs (epilog_stmt
, new_temp
);
4753 gsi_insert_before (&exit_gsi
, epilog_stmt
, GSI_SAME_STMT
);
4755 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info
)
4756 == INTEGER_INDUC_COND_REDUCTION
)
4758 /* Earlier we set the initial value to be zero. Check the result
4759 and if it is zero then replace with the original initial
4761 tree zero
= build_zero_cst (scalar_type
);
4762 tree zcompare
= build2 (EQ_EXPR
, boolean_type_node
, new_temp
, zero
);
4764 tmp
= make_ssa_name (new_scalar_dest
);
4765 epilog_stmt
= gimple_build_assign (tmp
, COND_EXPR
, zcompare
,
4766 initial_def
, new_temp
);
4767 gsi_insert_before (&exit_gsi
, epilog_stmt
, GSI_SAME_STMT
);
4771 scalar_results
.safe_push (new_temp
);
4775 bool reduce_with_shift
= have_whole_vector_shift (mode
);
4776 int element_bitsize
= tree_to_uhwi (bitsize
);
4777 int vec_size_in_bits
= tree_to_uhwi (TYPE_SIZE (vectype
));
4780 /* Regardless of whether we have a whole vector shift, if we're
4781 emulating the operation via tree-vect-generic, we don't want
4782 to use it. Only the first round of the reduction is likely
4783 to still be profitable via emulation. */
4784 /* ??? It might be better to emit a reduction tree code here, so that
4785 tree-vect-generic can expand the first round via bit tricks. */
4786 if (!VECTOR_MODE_P (mode
))
4787 reduce_with_shift
= false;
4790 optab optab
= optab_for_tree_code (code
, vectype
, optab_default
);
4791 if (optab_handler (optab
, mode
) == CODE_FOR_nothing
)
4792 reduce_with_shift
= false;
4795 if (reduce_with_shift
&& !slp_reduc
)
4797 int nelements
= vec_size_in_bits
/ element_bitsize
;
4798 unsigned char *sel
= XALLOCAVEC (unsigned char, nelements
);
4802 tree zero_vec
= build_zero_cst (vectype
);
4803 /*** Case 2: Create:
4804 for (offset = nelements/2; offset >= 1; offset/=2)
4806 Create: va' = vec_shift <va, offset>
4807 Create: va = vop <va, va'>
4812 if (dump_enabled_p ())
4813 dump_printf_loc (MSG_NOTE
, vect_location
,
4814 "Reduce using vector shifts\n");
4816 vec_dest
= vect_create_destination_var (scalar_dest
, vectype
);
4817 new_temp
= new_phi_result
;
4818 for (elt_offset
= nelements
/ 2;
4822 calc_vec_perm_mask_for_shift (mode
, elt_offset
, sel
);
4823 tree mask
= vect_gen_perm_mask_any (vectype
, sel
);
4824 epilog_stmt
= gimple_build_assign (vec_dest
, VEC_PERM_EXPR
,
4825 new_temp
, zero_vec
, mask
);
4826 new_name
= make_ssa_name (vec_dest
, epilog_stmt
);
4827 gimple_assign_set_lhs (epilog_stmt
, new_name
);
4828 gsi_insert_before (&exit_gsi
, epilog_stmt
, GSI_SAME_STMT
);
4830 epilog_stmt
= gimple_build_assign (vec_dest
, code
, new_name
,
4832 new_temp
= make_ssa_name (vec_dest
, epilog_stmt
);
4833 gimple_assign_set_lhs (epilog_stmt
, new_temp
);
4834 gsi_insert_before (&exit_gsi
, epilog_stmt
, GSI_SAME_STMT
);
4837 /* 2.4 Extract the final scalar result. Create:
4838 s_out3 = extract_field <v_out2, bitpos> */
4840 if (dump_enabled_p ())
4841 dump_printf_loc (MSG_NOTE
, vect_location
,
4842 "extract scalar result\n");
4844 rhs
= build3 (BIT_FIELD_REF
, scalar_type
, new_temp
,
4845 bitsize
, bitsize_zero_node
);
4846 epilog_stmt
= gimple_build_assign (new_scalar_dest
, rhs
);
4847 new_temp
= make_ssa_name (new_scalar_dest
, epilog_stmt
);
4848 gimple_assign_set_lhs (epilog_stmt
, new_temp
);
4849 gsi_insert_before (&exit_gsi
, epilog_stmt
, GSI_SAME_STMT
);
4850 scalar_results
.safe_push (new_temp
);
4854 /*** Case 3: Create:
4855 s = extract_field <v_out2, 0>
4856 for (offset = element_size;
4857 offset < vector_size;
4858 offset += element_size;)
4860 Create: s' = extract_field <v_out2, offset>
4861 Create: s = op <s, s'> // For non SLP cases
4864 if (dump_enabled_p ())
4865 dump_printf_loc (MSG_NOTE
, vect_location
,
4866 "Reduce using scalar code.\n");
4868 vec_size_in_bits
= tree_to_uhwi (TYPE_SIZE (vectype
));
4869 FOR_EACH_VEC_ELT (new_phis
, i
, new_phi
)
4872 if (gimple_code (new_phi
) == GIMPLE_PHI
)
4873 vec_temp
= PHI_RESULT (new_phi
);
4875 vec_temp
= gimple_assign_lhs (new_phi
);
4876 tree rhs
= build3 (BIT_FIELD_REF
, scalar_type
, vec_temp
, bitsize
,
4878 epilog_stmt
= gimple_build_assign (new_scalar_dest
, rhs
);
4879 new_temp
= make_ssa_name (new_scalar_dest
, epilog_stmt
);
4880 gimple_assign_set_lhs (epilog_stmt
, new_temp
);
4881 gsi_insert_before (&exit_gsi
, epilog_stmt
, GSI_SAME_STMT
);
4883 /* In SLP we don't need to apply reduction operation, so we just
4884 collect s' values in SCALAR_RESULTS. */
4886 scalar_results
.safe_push (new_temp
);
4888 for (bit_offset
= element_bitsize
;
4889 bit_offset
< vec_size_in_bits
;
4890 bit_offset
+= element_bitsize
)
4892 tree bitpos
= bitsize_int (bit_offset
);
4893 tree rhs
= build3 (BIT_FIELD_REF
, scalar_type
, vec_temp
,
4896 epilog_stmt
= gimple_build_assign (new_scalar_dest
, rhs
);
4897 new_name
= make_ssa_name (new_scalar_dest
, epilog_stmt
);
4898 gimple_assign_set_lhs (epilog_stmt
, new_name
);
4899 gsi_insert_before (&exit_gsi
, epilog_stmt
, GSI_SAME_STMT
);
4903 /* In SLP we don't need to apply reduction operation, so
4904 we just collect s' values in SCALAR_RESULTS. */
4905 new_temp
= new_name
;
4906 scalar_results
.safe_push (new_name
);
4910 epilog_stmt
= gimple_build_assign (new_scalar_dest
, code
,
4911 new_name
, new_temp
);
4912 new_temp
= make_ssa_name (new_scalar_dest
, epilog_stmt
);
4913 gimple_assign_set_lhs (epilog_stmt
, new_temp
);
4914 gsi_insert_before (&exit_gsi
, epilog_stmt
, GSI_SAME_STMT
);
4919 /* The only case where we need to reduce scalar results in SLP, is
4920 unrolling. If the size of SCALAR_RESULTS is greater than
4921 GROUP_SIZE, we reduce them combining elements modulo
4925 tree res
, first_res
, new_res
;
4928 /* Reduce multiple scalar results in case of SLP unrolling. */
4929 for (j
= group_size
; scalar_results
.iterate (j
, &res
);
4932 first_res
= scalar_results
[j
% group_size
];
4933 new_stmt
= gimple_build_assign (new_scalar_dest
, code
,
4935 new_res
= make_ssa_name (new_scalar_dest
, new_stmt
);
4936 gimple_assign_set_lhs (new_stmt
, new_res
);
4937 gsi_insert_before (&exit_gsi
, new_stmt
, GSI_SAME_STMT
);
4938 scalar_results
[j
% group_size
] = new_res
;
4942 /* Not SLP - we have one scalar to keep in SCALAR_RESULTS. */
4943 scalar_results
.safe_push (new_temp
);
4947 vect_finalize_reduction
:
4952 /* 2.5 Adjust the final result by the initial value of the reduction
4953 variable. (When such adjustment is not needed, then
4954 'adjustment_def' is zero). For example, if code is PLUS we create:
4955 new_temp = loop_exit_def + adjustment_def */
4959 gcc_assert (!slp_reduc
);
4960 if (nested_in_vect_loop
)
4962 new_phi
= new_phis
[0];
4963 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def
)) == VECTOR_TYPE
);
4964 expr
= build2 (code
, vectype
, PHI_RESULT (new_phi
), adjustment_def
);
4965 new_dest
= vect_create_destination_var (scalar_dest
, vectype
);
4969 new_temp
= scalar_results
[0];
4970 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def
)) != VECTOR_TYPE
);
4971 expr
= build2 (code
, scalar_type
, new_temp
, adjustment_def
);
4972 new_dest
= vect_create_destination_var (scalar_dest
, scalar_type
);
4975 epilog_stmt
= gimple_build_assign (new_dest
, expr
);
4976 new_temp
= make_ssa_name (new_dest
, epilog_stmt
);
4977 gimple_assign_set_lhs (epilog_stmt
, new_temp
);
4978 gsi_insert_before (&exit_gsi
, epilog_stmt
, GSI_SAME_STMT
);
4979 if (nested_in_vect_loop
)
4981 set_vinfo_for_stmt (epilog_stmt
,
4982 new_stmt_vec_info (epilog_stmt
, loop_vinfo
));
4983 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (epilog_stmt
)) =
4984 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (new_phi
));
4987 scalar_results
.quick_push (new_temp
);
4989 scalar_results
[0] = new_temp
;
4992 scalar_results
[0] = new_temp
;
4994 new_phis
[0] = epilog_stmt
;
4997 /* 2.6 Handle the loop-exit phis. Replace the uses of scalar loop-exit
4998 phis with new adjusted scalar results, i.e., replace use <s_out0>
5003 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
5004 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
5005 v_out2 = reduce <v_out1>
5006 s_out3 = extract_field <v_out2, 0>
5007 s_out4 = adjust_result <s_out3>
5014 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
5015 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
5016 v_out2 = reduce <v_out1>
5017 s_out3 = extract_field <v_out2, 0>
5018 s_out4 = adjust_result <s_out3>
5023 /* In SLP reduction chain we reduce vector results into one vector if
5024 necessary, hence we set here GROUP_SIZE to 1. SCALAR_DEST is the LHS of
5025 the last stmt in the reduction chain, since we are looking for the loop
5027 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt
)))
5029 gimple
*dest_stmt
= SLP_TREE_SCALAR_STMTS (slp_node
)[group_size
- 1];
5030 /* Handle reduction patterns. */
5031 if (STMT_VINFO_RELATED_STMT (vinfo_for_stmt (dest_stmt
)))
5032 dest_stmt
= STMT_VINFO_RELATED_STMT (vinfo_for_stmt (dest_stmt
));
5034 scalar_dest
= gimple_assign_lhs (dest_stmt
);
5038 /* In SLP we may have several statements in NEW_PHIS and REDUCTION_PHIS (in
5039 case that GROUP_SIZE is greater than vectorization factor). Therefore, we
5040 need to match SCALAR_RESULTS with corresponding statements. The first
5041 (GROUP_SIZE / number of new vector stmts) scalar results correspond to
5042 the first vector stmt, etc.
5043 (RATIO is equal to (GROUP_SIZE / number of new vector stmts)). */
5044 if (group_size
> new_phis
.length ())
5046 ratio
= group_size
/ new_phis
.length ();
5047 gcc_assert (!(group_size
% new_phis
.length ()));
5052 for (k
= 0; k
< group_size
; k
++)
5056 epilog_stmt
= new_phis
[k
/ ratio
];
5057 reduction_phi
= reduction_phis
[k
/ ratio
];
5059 inner_phi
= inner_phis
[k
/ ratio
];
5064 gimple
*current_stmt
= SLP_TREE_SCALAR_STMTS (slp_node
)[k
];
5066 orig_stmt
= STMT_VINFO_RELATED_STMT (vinfo_for_stmt (current_stmt
));
5067 /* SLP statements can't participate in patterns. */
5068 gcc_assert (!orig_stmt
);
5069 scalar_dest
= gimple_assign_lhs (current_stmt
);
5073 /* Find the loop-closed-use at the loop exit of the original scalar
5074 result. (The reduction result is expected to have two immediate uses -
5075 one at the latch block, and one at the loop exit). */
5076 FOR_EACH_IMM_USE_FAST (use_p
, imm_iter
, scalar_dest
)
5077 if (!flow_bb_inside_loop_p (loop
, gimple_bb (USE_STMT (use_p
)))
5078 && !is_gimple_debug (USE_STMT (use_p
)))
5079 phis
.safe_push (USE_STMT (use_p
));
5081 /* While we expect to have found an exit_phi because of loop-closed-ssa
5082 form we can end up without one if the scalar cycle is dead. */
5084 FOR_EACH_VEC_ELT (phis
, i
, exit_phi
)
5088 stmt_vec_info exit_phi_vinfo
= vinfo_for_stmt (exit_phi
);
5091 /* FORNOW. Currently not supporting the case that an inner-loop
5092 reduction is not used in the outer-loop (but only outside the
5093 outer-loop), unless it is double reduction. */
5094 gcc_assert ((STMT_VINFO_RELEVANT_P (exit_phi_vinfo
)
5095 && !STMT_VINFO_LIVE_P (exit_phi_vinfo
))
5099 STMT_VINFO_VEC_STMT (exit_phi_vinfo
) = inner_phi
;
5101 STMT_VINFO_VEC_STMT (exit_phi_vinfo
) = epilog_stmt
;
5103 || STMT_VINFO_DEF_TYPE (exit_phi_vinfo
)
5104 != vect_double_reduction_def
)
5107 /* Handle double reduction:
5109 stmt1: s1 = phi <s0, s2> - double reduction phi (outer loop)
5110 stmt2: s3 = phi <s1, s4> - (regular) reduc phi (inner loop)
5111 stmt3: s4 = use (s3) - (regular) reduc stmt (inner loop)
5112 stmt4: s2 = phi <s4> - double reduction stmt (outer loop)
5114 At that point the regular reduction (stmt2 and stmt3) is
5115 already vectorized, as well as the exit phi node, stmt4.
5116 Here we vectorize the phi node of double reduction, stmt1, and
5117 update all relevant statements. */
5119 /* Go through all the uses of s2 to find double reduction phi
5120 node, i.e., stmt1 above. */
5121 orig_name
= PHI_RESULT (exit_phi
);
5122 FOR_EACH_IMM_USE_STMT (use_stmt
, imm_iter
, orig_name
)
5124 stmt_vec_info use_stmt_vinfo
;
5125 stmt_vec_info new_phi_vinfo
;
5126 tree vect_phi_init
, preheader_arg
, vect_phi_res
, init_def
;
5127 basic_block bb
= gimple_bb (use_stmt
);
5130 /* Check that USE_STMT is really double reduction phi
5132 if (gimple_code (use_stmt
) != GIMPLE_PHI
5133 || gimple_phi_num_args (use_stmt
) != 2
5134 || bb
->loop_father
!= outer_loop
)
5136 use_stmt_vinfo
= vinfo_for_stmt (use_stmt
);
5138 || STMT_VINFO_DEF_TYPE (use_stmt_vinfo
)
5139 != vect_double_reduction_def
)
5142 /* Create vector phi node for double reduction:
5143 vs1 = phi <vs0, vs2>
5144 vs1 was created previously in this function by a call to
5145 vect_get_vec_def_for_operand and is stored in
5147 vs2 is defined by INNER_PHI, the vectorized EXIT_PHI;
5148 vs0 is created here. */
5150 /* Create vector phi node. */
5151 vect_phi
= create_phi_node (vec_initial_def
, bb
);
5152 new_phi_vinfo
= new_stmt_vec_info (vect_phi
,
5153 loop_vec_info_for_loop (outer_loop
));
5154 set_vinfo_for_stmt (vect_phi
, new_phi_vinfo
);
5156 /* Create vs0 - initial def of the double reduction phi. */
5157 preheader_arg
= PHI_ARG_DEF_FROM_EDGE (use_stmt
,
5158 loop_preheader_edge (outer_loop
));
5159 init_def
= get_initial_def_for_reduction (stmt
,
5160 preheader_arg
, NULL
);
5161 vect_phi_init
= vect_init_vector (use_stmt
, init_def
,
5164 /* Update phi node arguments with vs0 and vs2. */
5165 add_phi_arg (vect_phi
, vect_phi_init
,
5166 loop_preheader_edge (outer_loop
),
5168 add_phi_arg (vect_phi
, PHI_RESULT (inner_phi
),
5169 loop_latch_edge (outer_loop
), UNKNOWN_LOCATION
);
5170 if (dump_enabled_p ())
5172 dump_printf_loc (MSG_NOTE
, vect_location
,
5173 "created double reduction phi node: ");
5174 dump_gimple_stmt (MSG_NOTE
, TDF_SLIM
, vect_phi
, 0);
5175 dump_printf (MSG_NOTE
, "\n");
5178 vect_phi_res
= PHI_RESULT (vect_phi
);
5180 /* Replace the use, i.e., set the correct vs1 in the regular
5181 reduction phi node. FORNOW, NCOPIES is always 1, so the
5182 loop is redundant. */
5183 use
= reduction_phi
;
5184 for (j
= 0; j
< ncopies
; j
++)
5186 edge pr_edge
= loop_preheader_edge (loop
);
5187 SET_PHI_ARG_DEF (use
, pr_edge
->dest_idx
, vect_phi_res
);
5188 use
= STMT_VINFO_RELATED_STMT (vinfo_for_stmt (use
));
5195 if (nested_in_vect_loop
)
5204 /* Find the loop-closed-use at the loop exit of the original scalar
5205 result. (The reduction result is expected to have two immediate uses,
5206 one at the latch block, and one at the loop exit). For double
5207 reductions we are looking for exit phis of the outer loop. */
5208 FOR_EACH_IMM_USE_FAST (use_p
, imm_iter
, scalar_dest
)
5210 if (!flow_bb_inside_loop_p (loop
, gimple_bb (USE_STMT (use_p
))))
5212 if (!is_gimple_debug (USE_STMT (use_p
)))
5213 phis
.safe_push (USE_STMT (use_p
));
5217 if (double_reduc
&& gimple_code (USE_STMT (use_p
)) == GIMPLE_PHI
)
5219 tree phi_res
= PHI_RESULT (USE_STMT (use_p
));
5221 FOR_EACH_IMM_USE_FAST (phi_use_p
, phi_imm_iter
, phi_res
)
5223 if (!flow_bb_inside_loop_p (loop
,
5224 gimple_bb (USE_STMT (phi_use_p
)))
5225 && !is_gimple_debug (USE_STMT (phi_use_p
)))
5226 phis
.safe_push (USE_STMT (phi_use_p
));
5232 FOR_EACH_VEC_ELT (phis
, i
, exit_phi
)
5234 /* Replace the uses: */
5235 orig_name
= PHI_RESULT (exit_phi
);
5236 scalar_result
= scalar_results
[k
];
5237 FOR_EACH_IMM_USE_STMT (use_stmt
, imm_iter
, orig_name
)
5238 FOR_EACH_IMM_USE_ON_STMT (use_p
, imm_iter
)
5239 SET_USE (use_p
, scalar_result
);
5247 /* Function is_nonwrapping_integer_induction.
5249 Check if the induction described by STMT (which is part of loop LOOP)
5250 both increments and does not cause overflow. */
5253 is_nonwrapping_integer_induction (gimple
*stmt
, struct loop
*loop
)
5255 stmt_vec_info stmt_vinfo
= vinfo_for_stmt (stmt
);
5256 tree base
= STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo
);
5257 tree step
= STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo
);
5258 tree lhs_type
= TREE_TYPE (gimple_phi_result (stmt
));
5259 widest_int ni
, max_loop_value
, lhs_max
;
5260 bool overflow
= false;
5262 /* Make sure the loop is integer based. */
5263 if (TREE_CODE (base
) != INTEGER_CST
5264 || TREE_CODE (step
) != INTEGER_CST
)
5267 /* Check that the induction increments. */
5268 if (tree_int_cst_sgn (step
) == -1)
5271 /* Check that the max size of the loop will not wrap. */
5273 if (TYPE_OVERFLOW_UNDEFINED (lhs_type
))
5276 if (! max_stmt_executions (loop
, &ni
))
5279 max_loop_value
= wi::mul (wi::to_widest (step
), ni
, TYPE_SIGN (lhs_type
),
5284 max_loop_value
= wi::add (wi::to_widest (base
), max_loop_value
,
5285 TYPE_SIGN (lhs_type
), &overflow
);
5289 return (wi::min_precision (max_loop_value
, TYPE_SIGN (lhs_type
))
5290 <= TYPE_PRECISION (lhs_type
));
5293 /* Function vectorizable_reduction.
5295 Check if STMT performs a reduction operation that can be vectorized.
5296 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
5297 stmt to replace it, put it in VEC_STMT, and insert it at GSI.
5298 Return FALSE if not a vectorizable STMT, TRUE otherwise.
5300 This function also handles reduction idioms (patterns) that have been
5301 recognized in advance during vect_pattern_recog. In this case, STMT may be
5303 X = pattern_expr (arg0, arg1, ..., X)
5304 and its STMT_VINFO_RELATED_STMT points to the last stmt in the original
5305 sequence that had been detected and replaced by the pattern-stmt (STMT).
5307 This function also handles reduction of condition expressions, for example:
5308 for (int i = 0; i < N; i++)
5311 This is handled by vectorising the loop and creating an additional vector
5312 containing the loop indexes for which "a[i] < value" was true. In the
5313 function epilogue this is reduced to a single max value and then used to
5314 index into the vector of results.
5316 In some cases of reduction patterns, the type of the reduction variable X is
5317 different than the type of the other arguments of STMT.
5318 In such cases, the vectype that is used when transforming STMT into a vector
5319 stmt is different than the vectype that is used to determine the
5320 vectorization factor, because it consists of a different number of elements
5321 than the actual number of elements that are being operated upon in parallel.
5323 For example, consider an accumulation of shorts into an int accumulator.
5324 On some targets it's possible to vectorize this pattern operating on 8
5325 shorts at a time (hence, the vectype for purposes of determining the
5326 vectorization factor should be V8HI); on the other hand, the vectype that
5327 is used to create the vector form is actually V4SI (the type of the result).
5329 Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
5330 indicates what is the actual level of parallelism (V8HI in the example), so
5331 that the right vectorization factor would be derived. This vectype
5332 corresponds to the type of arguments to the reduction stmt, and should *NOT*
5333 be used to create the vectorized stmt. The right vectype for the vectorized
5334 stmt is obtained from the type of the result X:
5335 get_vectype_for_scalar_type (TREE_TYPE (X))
5337 This means that, contrary to "regular" reductions (or "regular" stmts in
5338 general), the following equation:
5339 STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (TREE_TYPE (X))
5340 does *NOT* necessarily hold for reduction patterns. */
5343 vectorizable_reduction (gimple
*stmt
, gimple_stmt_iterator
*gsi
,
5344 gimple
**vec_stmt
, slp_tree slp_node
)
5348 tree loop_vec_def0
= NULL_TREE
, loop_vec_def1
= NULL_TREE
;
5349 stmt_vec_info stmt_info
= vinfo_for_stmt (stmt
);
5350 tree vectype_out
= STMT_VINFO_VECTYPE (stmt_info
);
5351 tree vectype_in
= NULL_TREE
;
5352 loop_vec_info loop_vinfo
= STMT_VINFO_LOOP_VINFO (stmt_info
);
5353 struct loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
5354 enum tree_code code
, orig_code
, epilog_reduc_code
;
5355 machine_mode vec_mode
;
5357 optab optab
, reduc_optab
;
5358 tree new_temp
= NULL_TREE
;
5360 enum vect_def_type dt
;
5361 gphi
*new_phi
= NULL
;
5365 stmt_vec_info orig_stmt_info
;
5366 tree expr
= NULL_TREE
;
5370 stmt_vec_info prev_stmt_info
, prev_phi_info
;
5371 bool single_defuse_cycle
= false;
5372 tree reduc_def
= NULL_TREE
;
5373 gimple
*new_stmt
= NULL
;
5376 bool nested_cycle
= false, found_nested_cycle_def
= false;
5377 gimple
*reduc_def_stmt
= NULL
;
5378 bool double_reduc
= false, dummy
;
5380 struct loop
* def_stmt_loop
, *outer_loop
= NULL
;
5382 gimple
*def_arg_stmt
;
5383 auto_vec
<tree
> vec_oprnds0
;
5384 auto_vec
<tree
> vec_oprnds1
;
5385 auto_vec
<tree
> vect_defs
;
5386 auto_vec
<gimple
*> phis
;
5388 tree def0
, def1
, tem
, op0
, op1
= NULL_TREE
;
5389 bool first_p
= true;
5390 tree cr_index_scalar_type
= NULL_TREE
, cr_index_vector_type
= NULL_TREE
;
5391 gimple
*cond_expr_induction_def_stmt
= NULL
;
5393 /* In case of reduction chain we switch to the first stmt in the chain, but
5394 we don't update STMT_INFO, since only the last stmt is marked as reduction
5395 and has reduction properties. */
5396 if (GROUP_FIRST_ELEMENT (stmt_info
)
5397 && GROUP_FIRST_ELEMENT (stmt_info
) != stmt
)
5399 stmt
= GROUP_FIRST_ELEMENT (stmt_info
);
5403 if (nested_in_vect_loop_p (loop
, stmt
))
5407 nested_cycle
= true;
5410 /* 1. Is vectorizable reduction? */
5411 /* Not supportable if the reduction variable is used in the loop, unless
5412 it's a reduction chain. */
5413 if (STMT_VINFO_RELEVANT (stmt_info
) > vect_used_in_outer
5414 && !GROUP_FIRST_ELEMENT (stmt_info
))
5417 /* Reductions that are not used even in an enclosing outer-loop,
5418 are expected to be "live" (used out of the loop). */
5419 if (STMT_VINFO_RELEVANT (stmt_info
) == vect_unused_in_scope
5420 && !STMT_VINFO_LIVE_P (stmt_info
))
5423 /* Make sure it was already recognized as a reduction computation. */
5424 if (STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmt
)) != vect_reduction_def
5425 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmt
)) != vect_nested_cycle
)
5428 /* 2. Has this been recognized as a reduction pattern?
5430 Check if STMT represents a pattern that has been recognized
5431 in earlier analysis stages. For stmts that represent a pattern,
5432 the STMT_VINFO_RELATED_STMT field records the last stmt in
5433 the original sequence that constitutes the pattern. */
5435 orig_stmt
= STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt
));
5438 orig_stmt_info
= vinfo_for_stmt (orig_stmt
);
5439 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info
));
5440 gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info
));
5443 /* 3. Check the operands of the operation. The first operands are defined
5444 inside the loop body. The last operand is the reduction variable,
5445 which is defined by the loop-header-phi. */
5447 gcc_assert (is_gimple_assign (stmt
));
5450 switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt
)))
5452 case GIMPLE_SINGLE_RHS
:
5453 op_type
= TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt
));
5454 if (op_type
== ternary_op
)
5456 tree rhs
= gimple_assign_rhs1 (stmt
);
5457 ops
[0] = TREE_OPERAND (rhs
, 0);
5458 ops
[1] = TREE_OPERAND (rhs
, 1);
5459 ops
[2] = TREE_OPERAND (rhs
, 2);
5460 code
= TREE_CODE (rhs
);
5466 case GIMPLE_BINARY_RHS
:
5467 code
= gimple_assign_rhs_code (stmt
);
5468 op_type
= TREE_CODE_LENGTH (code
);
5469 gcc_assert (op_type
== binary_op
);
5470 ops
[0] = gimple_assign_rhs1 (stmt
);
5471 ops
[1] = gimple_assign_rhs2 (stmt
);
5474 case GIMPLE_TERNARY_RHS
:
5475 code
= gimple_assign_rhs_code (stmt
);
5476 op_type
= TREE_CODE_LENGTH (code
);
5477 gcc_assert (op_type
== ternary_op
);
5478 ops
[0] = gimple_assign_rhs1 (stmt
);
5479 ops
[1] = gimple_assign_rhs2 (stmt
);
5480 ops
[2] = gimple_assign_rhs3 (stmt
);
5483 case GIMPLE_UNARY_RHS
:
5489 /* The default is that the reduction variable is the last in statement. */
5490 int reduc_index
= op_type
- 1;
5491 if (code
== MINUS_EXPR
)
5494 if (code
== COND_EXPR
&& slp_node
)
5497 scalar_dest
= gimple_assign_lhs (stmt
);
5498 scalar_type
= TREE_TYPE (scalar_dest
);
5499 if (!POINTER_TYPE_P (scalar_type
) && !INTEGRAL_TYPE_P (scalar_type
)
5500 && !SCALAR_FLOAT_TYPE_P (scalar_type
))
5503 /* Do not try to vectorize bit-precision reductions. */
5504 if ((TYPE_PRECISION (scalar_type
)
5505 != GET_MODE_PRECISION (TYPE_MODE (scalar_type
))))
5508 /* All uses but the last are expected to be defined in the loop.
5509 The last use is the reduction variable. In case of nested cycle this
5510 assumption is not true: we use reduc_index to record the index of the
5511 reduction variable. */
5512 for (i
= 0; i
< op_type
; i
++)
5514 if (i
== reduc_index
)
5517 /* The condition of COND_EXPR is checked in vectorizable_condition(). */
5518 if (i
== 0 && code
== COND_EXPR
)
5521 is_simple_use
= vect_is_simple_use (ops
[i
], loop_vinfo
,
5522 &def_stmt
, &dt
, &tem
);
5525 gcc_assert (is_simple_use
);
5527 if (dt
!= vect_internal_def
5528 && dt
!= vect_external_def
5529 && dt
!= vect_constant_def
5530 && dt
!= vect_induction_def
5531 && !(dt
== vect_nested_cycle
&& nested_cycle
))
5534 if (dt
== vect_nested_cycle
)
5536 found_nested_cycle_def
= true;
5537 reduc_def_stmt
= def_stmt
;
5541 if (i
== 1 && code
== COND_EXPR
&& dt
== vect_induction_def
)
5542 cond_expr_induction_def_stmt
= def_stmt
;
5545 is_simple_use
= vect_is_simple_use (ops
[reduc_index
], loop_vinfo
,
5546 &def_stmt
, &dt
, &tem
);
5549 gcc_assert (is_simple_use
);
5550 if (!found_nested_cycle_def
)
5551 reduc_def_stmt
= def_stmt
;
5553 if (reduc_def_stmt
&& gimple_code (reduc_def_stmt
) != GIMPLE_PHI
)
5556 if (!(dt
== vect_reduction_def
5557 || dt
== vect_nested_cycle
5558 || ((dt
== vect_internal_def
|| dt
== vect_external_def
5559 || dt
== vect_constant_def
|| dt
== vect_induction_def
)
5560 && nested_cycle
&& found_nested_cycle_def
)))
5562 /* For pattern recognized stmts, orig_stmt might be a reduction,
5563 but some helper statements for the pattern might not, or
5564 might be COND_EXPRs with reduction uses in the condition. */
5565 gcc_assert (orig_stmt
);
5569 enum vect_reduction_type v_reduc_type
;
5570 gimple
*tmp
= vect_is_simple_reduction (loop_vinfo
, reduc_def_stmt
,
5571 !nested_cycle
, &dummy
, false,
5574 /* If we have a condition reduction, see if we can simplify it further. */
5575 if (v_reduc_type
== COND_REDUCTION
5576 && cond_expr_induction_def_stmt
!= NULL
5577 && is_nonwrapping_integer_induction (cond_expr_induction_def_stmt
, loop
))
5579 if (dump_enabled_p ())
5580 dump_printf_loc (MSG_NOTE
, vect_location
,
5581 "condition expression based on integer induction.\n");
5582 STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info
) = INTEGER_INDUC_COND_REDUCTION
;
5585 STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info
) = v_reduc_type
;
5588 gcc_assert (tmp
== orig_stmt
5589 || GROUP_FIRST_ELEMENT (vinfo_for_stmt (tmp
)) == orig_stmt
);
5591 /* We changed STMT to be the first stmt in reduction chain, hence we
5592 check that in this case the first element in the chain is STMT. */
5593 gcc_assert (stmt
== tmp
5594 || GROUP_FIRST_ELEMENT (vinfo_for_stmt (tmp
)) == stmt
);
5596 if (STMT_VINFO_LIVE_P (vinfo_for_stmt (reduc_def_stmt
)))
5599 if (slp_node
|| PURE_SLP_STMT (stmt_info
))
5602 ncopies
= (LOOP_VINFO_VECT_FACTOR (loop_vinfo
)
5603 / TYPE_VECTOR_SUBPARTS (vectype_in
));
5605 gcc_assert (ncopies
>= 1);
5607 vec_mode
= TYPE_MODE (vectype_in
);
5609 if (code
== COND_EXPR
)
5611 /* Only call during the analysis stage, otherwise we'll lose
5613 if (!vec_stmt
&& !vectorizable_condition (stmt
, gsi
, NULL
,
5614 ops
[reduc_index
], 0, NULL
))
5616 if (dump_enabled_p ())
5617 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
5618 "unsupported condition in reduction\n");
5624 /* 4. Supportable by target? */
5626 if (code
== LSHIFT_EXPR
|| code
== RSHIFT_EXPR
5627 || code
== LROTATE_EXPR
|| code
== RROTATE_EXPR
)
5629 /* Shifts and rotates are only supported by vectorizable_shifts,
5630 not vectorizable_reduction. */
5631 if (dump_enabled_p ())
5632 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
5633 "unsupported shift or rotation.\n");
5637 /* 4.1. check support for the operation in the loop */
5638 optab
= optab_for_tree_code (code
, vectype_in
, optab_default
);
5641 if (dump_enabled_p ())
5642 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
5648 if (optab_handler (optab
, vec_mode
) == CODE_FOR_nothing
)
5650 if (dump_enabled_p ())
5651 dump_printf (MSG_NOTE
, "op not supported by target.\n");
5653 if (GET_MODE_SIZE (vec_mode
) != UNITS_PER_WORD
5654 || LOOP_VINFO_VECT_FACTOR (loop_vinfo
)
5655 < vect_min_worthwhile_factor (code
))
5658 if (dump_enabled_p ())
5659 dump_printf (MSG_NOTE
, "proceeding using word mode.\n");
5662 /* Worthwhile without SIMD support? */
5663 if (!VECTOR_MODE_P (TYPE_MODE (vectype_in
))
5664 && LOOP_VINFO_VECT_FACTOR (loop_vinfo
)
5665 < vect_min_worthwhile_factor (code
))
5667 if (dump_enabled_p ())
5668 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
5669 "not worthwhile without SIMD support.\n");
5675 /* 4.2. Check support for the epilog operation.
5677 If STMT represents a reduction pattern, then the type of the
5678 reduction variable may be different than the type of the rest
5679 of the arguments. For example, consider the case of accumulation
5680 of shorts into an int accumulator; The original code:
5681 S1: int_a = (int) short_a;
5682 orig_stmt-> S2: int_acc = plus <int_a ,int_acc>;
5685 STMT: int_acc = widen_sum <short_a, int_acc>
5688 1. The tree-code that is used to create the vector operation in the
5689 epilog code (that reduces the partial results) is not the
5690 tree-code of STMT, but is rather the tree-code of the original
5691 stmt from the pattern that STMT is replacing. I.e, in the example
5692 above we want to use 'widen_sum' in the loop, but 'plus' in the
5694 2. The type (mode) we use to check available target support
5695 for the vector operation to be created in the *epilog*, is
5696 determined by the type of the reduction variable (in the example
5697 above we'd check this: optab_handler (plus_optab, vect_int_mode])).
5698 However the type (mode) we use to check available target support
5699 for the vector operation to be created *inside the loop*, is
5700 determined by the type of the other arguments to STMT (in the
5701 example we'd check this: optab_handler (widen_sum_optab,
5704 This is contrary to "regular" reductions, in which the types of all
5705 the arguments are the same as the type of the reduction variable.
5706 For "regular" reductions we can therefore use the same vector type
5707 (and also the same tree-code) when generating the epilog code and
5708 when generating the code inside the loop. */
5712 /* This is a reduction pattern: get the vectype from the type of the
5713 reduction variable, and get the tree-code from orig_stmt. */
5714 gcc_assert (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info
)
5715 == TREE_CODE_REDUCTION
);
5716 orig_code
= gimple_assign_rhs_code (orig_stmt
);
5717 gcc_assert (vectype_out
);
5718 vec_mode
= TYPE_MODE (vectype_out
);
5722 /* Regular reduction: use the same vectype and tree-code as used for
5723 the vector code inside the loop can be used for the epilog code. */
5726 if (code
== MINUS_EXPR
)
5727 orig_code
= PLUS_EXPR
;
5729 /* For simple condition reductions, replace with the actual expression
5730 we want to base our reduction around. */
5731 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info
)
5732 == INTEGER_INDUC_COND_REDUCTION
)
5733 orig_code
= MAX_EXPR
;
5738 def_bb
= gimple_bb (reduc_def_stmt
);
5739 def_stmt_loop
= def_bb
->loop_father
;
5740 def_arg
= PHI_ARG_DEF_FROM_EDGE (reduc_def_stmt
,
5741 loop_preheader_edge (def_stmt_loop
));
5742 if (TREE_CODE (def_arg
) == SSA_NAME
5743 && (def_arg_stmt
= SSA_NAME_DEF_STMT (def_arg
))
5744 && gimple_code (def_arg_stmt
) == GIMPLE_PHI
5745 && flow_bb_inside_loop_p (outer_loop
, gimple_bb (def_arg_stmt
))
5746 && vinfo_for_stmt (def_arg_stmt
)
5747 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_arg_stmt
))
5748 == vect_double_reduction_def
)
5749 double_reduc
= true;
5752 epilog_reduc_code
= ERROR_MARK
;
5754 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info
) == TREE_CODE_REDUCTION
5755 || STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info
)
5756 == INTEGER_INDUC_COND_REDUCTION
)
5758 if (reduction_code_for_scalar_code (orig_code
, &epilog_reduc_code
))
5760 reduc_optab
= optab_for_tree_code (epilog_reduc_code
, vectype_out
,
5764 if (dump_enabled_p ())
5765 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
5766 "no optab for reduction.\n");
5768 epilog_reduc_code
= ERROR_MARK
;
5770 else if (optab_handler (reduc_optab
, vec_mode
) == CODE_FOR_nothing
)
5772 if (dump_enabled_p ())
5773 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
5774 "reduc op not supported by target.\n");
5776 epilog_reduc_code
= ERROR_MARK
;
5779 /* When epilog_reduc_code is ERROR_MARK then a reduction will be
5780 generated in the epilog using multiple expressions. This does not
5781 work for condition reductions. */
5782 if (epilog_reduc_code
== ERROR_MARK
5783 && STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info
)
5784 == INTEGER_INDUC_COND_REDUCTION
)
5786 if (dump_enabled_p ())
5787 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
5788 "no reduc code for scalar code.\n");
5794 if (!nested_cycle
|| double_reduc
)
5796 if (dump_enabled_p ())
5797 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
5798 "no reduc code for scalar code.\n");
5806 int scalar_precision
= GET_MODE_PRECISION (TYPE_MODE (scalar_type
));
5807 cr_index_scalar_type
= make_unsigned_type (scalar_precision
);
5808 cr_index_vector_type
= build_vector_type
5809 (cr_index_scalar_type
, TYPE_VECTOR_SUBPARTS (vectype_out
));
5811 epilog_reduc_code
= REDUC_MAX_EXPR
;
5812 optab
= optab_for_tree_code (REDUC_MAX_EXPR
, cr_index_vector_type
,
5814 if (optab_handler (optab
, TYPE_MODE (cr_index_vector_type
))
5815 == CODE_FOR_nothing
)
5817 if (dump_enabled_p ())
5818 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
5819 "reduc max op not supported by target.\n");
5825 || STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info
) == COND_REDUCTION
5826 || STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info
)
5827 == INTEGER_INDUC_COND_REDUCTION
)
5830 if (dump_enabled_p ())
5831 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
5832 "multiple types in double reduction or condition "
5837 /* In case of widenning multiplication by a constant, we update the type
5838 of the constant to be the type of the other operand. We check that the
5839 constant fits the type in the pattern recognition pass. */
5840 if (code
== DOT_PROD_EXPR
5841 && !types_compatible_p (TREE_TYPE (ops
[0]), TREE_TYPE (ops
[1])))
5843 if (TREE_CODE (ops
[0]) == INTEGER_CST
)
5844 ops
[0] = fold_convert (TREE_TYPE (ops
[1]), ops
[0]);
5845 else if (TREE_CODE (ops
[1]) == INTEGER_CST
)
5846 ops
[1] = fold_convert (TREE_TYPE (ops
[0]), ops
[1]);
5849 if (dump_enabled_p ())
5850 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
5851 "invalid types in dot-prod\n");
5857 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info
) == COND_REDUCTION
)
5861 if (! max_loop_iterations (loop
, &ni
))
5863 if (dump_enabled_p ())
5864 dump_printf_loc (MSG_NOTE
, vect_location
,
5865 "loop count not known, cannot create cond "
5869 /* Convert backedges to iterations. */
5872 /* The additional index will be the same type as the condition. Check
5873 that the loop can fit into this less one (because we'll use up the
5874 zero slot for when there are no matches). */
5875 tree max_index
= TYPE_MAX_VALUE (cr_index_scalar_type
);
5876 if (wi::geu_p (ni
, wi::to_widest (max_index
)))
5878 if (dump_enabled_p ())
5879 dump_printf_loc (MSG_NOTE
, vect_location
,
5880 "loop size is greater than data size.\n");
5885 if (!vec_stmt
) /* transformation not required. */
5888 && !vect_model_reduction_cost (stmt_info
, epilog_reduc_code
, ncopies
,
5891 STMT_VINFO_TYPE (stmt_info
) = reduc_vec_info_type
;
5897 if (dump_enabled_p ())
5898 dump_printf_loc (MSG_NOTE
, vect_location
, "transform reduction.\n");
5900 /* FORNOW: Multiple types are not supported for condition. */
5901 if (code
== COND_EXPR
)
5902 gcc_assert (ncopies
== 1);
5904 /* Create the destination vector */
5905 vec_dest
= vect_create_destination_var (scalar_dest
, vectype_out
);
5907 /* In case the vectorization factor (VF) is bigger than the number
5908 of elements that we can fit in a vectype (nunits), we have to generate
5909 more than one vector stmt - i.e - we need to "unroll" the
5910 vector stmt by a factor VF/nunits. For more details see documentation
5911 in vectorizable_operation. */
5913 /* If the reduction is used in an outer loop we need to generate
5914 VF intermediate results, like so (e.g. for ncopies=2):
5919 (i.e. we generate VF results in 2 registers).
5920 In this case we have a separate def-use cycle for each copy, and therefore
5921 for each copy we get the vector def for the reduction variable from the
5922 respective phi node created for this copy.
5924 Otherwise (the reduction is unused in the loop nest), we can combine
5925 together intermediate results, like so (e.g. for ncopies=2):
5929 (i.e. we generate VF/2 results in a single register).
5930 In this case for each copy we get the vector def for the reduction variable
5931 from the vectorized reduction operation generated in the previous iteration.
5934 if (STMT_VINFO_RELEVANT (stmt_info
) == vect_unused_in_scope
)
5936 single_defuse_cycle
= true;
5940 epilog_copies
= ncopies
;
5942 prev_stmt_info
= NULL
;
5943 prev_phi_info
= NULL
;
5945 vec_num
= SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node
);
5949 vec_oprnds0
.create (1);
5950 if (op_type
== ternary_op
)
5951 vec_oprnds1
.create (1);
5954 phis
.create (vec_num
);
5955 vect_defs
.create (vec_num
);
5957 vect_defs
.quick_push (NULL_TREE
);
5959 for (j
= 0; j
< ncopies
; j
++)
5961 if (j
== 0 || !single_defuse_cycle
)
5963 for (i
= 0; i
< vec_num
; i
++)
5965 /* Create the reduction-phi that defines the reduction
5967 new_phi
= create_phi_node (vec_dest
, loop
->header
);
5968 set_vinfo_for_stmt (new_phi
,
5969 new_stmt_vec_info (new_phi
, loop_vinfo
));
5970 if (j
== 0 || slp_node
)
5971 phis
.quick_push (new_phi
);
5975 if (code
== COND_EXPR
)
5977 gcc_assert (!slp_node
);
5978 vectorizable_condition (stmt
, gsi
, vec_stmt
,
5979 PHI_RESULT (phis
[0]),
5981 /* Multiple types are not supported for condition. */
5988 op0
= ops
[!reduc_index
];
5989 if (op_type
== ternary_op
)
5991 if (reduc_index
== 0)
5998 vect_get_vec_defs (op0
, op1
, stmt
, &vec_oprnds0
, &vec_oprnds1
,
6002 loop_vec_def0
= vect_get_vec_def_for_operand (ops
[!reduc_index
],
6004 vec_oprnds0
.quick_push (loop_vec_def0
);
6005 if (op_type
== ternary_op
)
6007 loop_vec_def1
= vect_get_vec_def_for_operand (op1
, stmt
);
6008 vec_oprnds1
.quick_push (loop_vec_def1
);
6016 enum vect_def_type dt
;
6019 vect_is_simple_use (ops
[!reduc_index
], loop_vinfo
,
6021 loop_vec_def0
= vect_get_vec_def_for_stmt_copy (dt
,
6023 vec_oprnds0
[0] = loop_vec_def0
;
6024 if (op_type
== ternary_op
)
6026 vect_is_simple_use (op1
, loop_vinfo
, &dummy_stmt
, &dt
);
6027 loop_vec_def1
= vect_get_vec_def_for_stmt_copy (dt
,
6029 vec_oprnds1
[0] = loop_vec_def1
;
6033 if (single_defuse_cycle
)
6034 reduc_def
= gimple_assign_lhs (new_stmt
);
6036 STMT_VINFO_RELATED_STMT (prev_phi_info
) = new_phi
;
6039 FOR_EACH_VEC_ELT (vec_oprnds0
, i
, def0
)
6042 reduc_def
= PHI_RESULT (phis
[i
]);
6045 if (!single_defuse_cycle
|| j
== 0)
6046 reduc_def
= PHI_RESULT (new_phi
);
6049 def1
= ((op_type
== ternary_op
)
6050 ? vec_oprnds1
[i
] : NULL
);
6051 if (op_type
== binary_op
)
6053 if (reduc_index
== 0)
6054 expr
= build2 (code
, vectype_out
, reduc_def
, def0
);
6056 expr
= build2 (code
, vectype_out
, def0
, reduc_def
);
6060 if (reduc_index
== 0)
6061 expr
= build3 (code
, vectype_out
, reduc_def
, def0
, def1
);
6064 if (reduc_index
== 1)
6065 expr
= build3 (code
, vectype_out
, def0
, reduc_def
, def1
);
6067 expr
= build3 (code
, vectype_out
, def0
, def1
, reduc_def
);
6071 new_stmt
= gimple_build_assign (vec_dest
, expr
);
6072 new_temp
= make_ssa_name (vec_dest
, new_stmt
);
6073 gimple_assign_set_lhs (new_stmt
, new_temp
);
6074 vect_finish_stmt_generation (stmt
, new_stmt
, gsi
);
6078 SLP_TREE_VEC_STMTS (slp_node
).quick_push (new_stmt
);
6079 vect_defs
.quick_push (new_temp
);
6082 vect_defs
[0] = new_temp
;
6089 STMT_VINFO_VEC_STMT (stmt_info
) = *vec_stmt
= new_stmt
;
6091 STMT_VINFO_RELATED_STMT (prev_stmt_info
) = new_stmt
;
6093 prev_stmt_info
= vinfo_for_stmt (new_stmt
);
6094 prev_phi_info
= vinfo_for_stmt (new_phi
);
6097 tree indx_before_incr
, indx_after_incr
, cond_name
= NULL
;
6099 /* Finalize the reduction-phi (set its arguments) and create the
6100 epilog reduction code. */
6101 if ((!single_defuse_cycle
|| code
== COND_EXPR
) && !slp_node
)
6103 new_temp
= gimple_assign_lhs (*vec_stmt
);
6104 vect_defs
[0] = new_temp
;
6106 /* For cond reductions we want to create a new vector (INDEX_COND_EXPR)
6107 which is updated with the current index of the loop for every match of
6108 the original loop's cond_expr (VEC_STMT). This results in a vector
6109 containing the last time the condition passed for that vector lane.
6110 The first match will be a 1 to allow 0 to be used for non-matching
6111 indexes. If there are no matches at all then the vector will be all
6113 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info
) == COND_REDUCTION
)
6115 int nunits_out
= TYPE_VECTOR_SUBPARTS (vectype_out
);
6118 gcc_assert (gimple_assign_rhs_code (*vec_stmt
) == VEC_COND_EXPR
);
6120 /* First we create a simple vector induction variable which starts
6121 with the values {1,2,3,...} (SERIES_VECT) and increments by the
6122 vector size (STEP). */
6124 /* Create a {1,2,3,...} vector. */
6125 tree
*vtemp
= XALLOCAVEC (tree
, nunits_out
);
6126 for (k
= 0; k
< nunits_out
; ++k
)
6127 vtemp
[k
] = build_int_cst (cr_index_scalar_type
, k
+ 1);
6128 tree series_vect
= build_vector (cr_index_vector_type
, vtemp
);
6130 /* Create a vector of the step value. */
6131 tree step
= build_int_cst (cr_index_scalar_type
, nunits_out
);
6132 tree vec_step
= build_vector_from_val (cr_index_vector_type
, step
);
6134 /* Create an induction variable. */
6135 gimple_stmt_iterator incr_gsi
;
6137 standard_iv_increment_position (loop
, &incr_gsi
, &insert_after
);
6138 create_iv (series_vect
, vec_step
, NULL_TREE
, loop
, &incr_gsi
,
6139 insert_after
, &indx_before_incr
, &indx_after_incr
);
6141 /* Next create a new phi node vector (NEW_PHI_TREE) which starts
6142 filled with zeros (VEC_ZERO). */
6144 /* Create a vector of 0s. */
6145 tree zero
= build_zero_cst (cr_index_scalar_type
);
6146 tree vec_zero
= build_vector_from_val (cr_index_vector_type
, zero
);
6148 /* Create a vector phi node. */
6149 tree new_phi_tree
= make_ssa_name (cr_index_vector_type
);
6150 new_phi
= create_phi_node (new_phi_tree
, loop
->header
);
6151 set_vinfo_for_stmt (new_phi
,
6152 new_stmt_vec_info (new_phi
, loop_vinfo
));
6153 add_phi_arg (new_phi
, vec_zero
, loop_preheader_edge (loop
),
6156 /* Now take the condition from the loops original cond_expr
6157 (VEC_STMT) and produce a new cond_expr (INDEX_COND_EXPR) which for
6158 every match uses values from the induction variable
6159 (INDEX_BEFORE_INCR) otherwise uses values from the phi node
6161 Finally, we update the phi (NEW_PHI_TREE) to take the value of
6162 the new cond_expr (INDEX_COND_EXPR). */
6164 /* Turn the condition from vec_stmt into an ssa name. */
6165 gimple_stmt_iterator vec_stmt_gsi
= gsi_for_stmt (*vec_stmt
);
6166 tree ccompare
= gimple_assign_rhs1 (*vec_stmt
);
6167 tree ccompare_name
= make_ssa_name (TREE_TYPE (ccompare
));
6168 gimple
*ccompare_stmt
= gimple_build_assign (ccompare_name
,
6170 gsi_insert_before (&vec_stmt_gsi
, ccompare_stmt
, GSI_SAME_STMT
);
6171 gimple_assign_set_rhs1 (*vec_stmt
, ccompare_name
);
6172 update_stmt (*vec_stmt
);
6174 /* Create a conditional, where the condition is taken from vec_stmt
6175 (CCOMPARE_NAME), then is the induction index (INDEX_BEFORE_INCR)
6176 and else is the phi (NEW_PHI_TREE). */
6177 tree index_cond_expr
= build3 (VEC_COND_EXPR
, cr_index_vector_type
,
6178 ccompare_name
, indx_before_incr
,
6180 cond_name
= make_ssa_name (cr_index_vector_type
);
6181 gimple
*index_condition
= gimple_build_assign (cond_name
,
6183 gsi_insert_before (&incr_gsi
, index_condition
, GSI_SAME_STMT
);
6184 stmt_vec_info index_vec_info
= new_stmt_vec_info (index_condition
,
6186 STMT_VINFO_VECTYPE (index_vec_info
) = cr_index_vector_type
;
6187 set_vinfo_for_stmt (index_condition
, index_vec_info
);
6189 /* Update the phi with the vec cond. */
6190 add_phi_arg (new_phi
, cond_name
, loop_latch_edge (loop
),
6195 vect_create_epilog_for_reduction (vect_defs
, stmt
, epilog_copies
,
6196 epilog_reduc_code
, phis
, reduc_index
,
6197 double_reduc
, slp_node
, cond_name
);
6202 /* Function vect_min_worthwhile_factor.
6204 For a loop where we could vectorize the operation indicated by CODE,
6205 return the minimum vectorization factor that makes it worthwhile
6206 to use generic vectors. */
6208 vect_min_worthwhile_factor (enum tree_code code
)
6229 /* Function vectorizable_induction
6231 Check if PHI performs an induction computation that can be vectorized.
6232 If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
6233 phi to replace it, put it in VEC_STMT, and add it to the same basic block.
6234 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
6237 vectorizable_induction (gimple
*phi
,
6238 gimple_stmt_iterator
*gsi ATTRIBUTE_UNUSED
,
6241 stmt_vec_info stmt_info
= vinfo_for_stmt (phi
);
6242 tree vectype
= STMT_VINFO_VECTYPE (stmt_info
);
6243 loop_vec_info loop_vinfo
= STMT_VINFO_LOOP_VINFO (stmt_info
);
6244 struct loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
6245 int nunits
= TYPE_VECTOR_SUBPARTS (vectype
);
6246 int ncopies
= LOOP_VINFO_VECT_FACTOR (loop_vinfo
) / nunits
;
6249 gcc_assert (ncopies
>= 1);
6250 /* FORNOW. These restrictions should be relaxed. */
6251 if (nested_in_vect_loop_p (loop
, phi
))
6253 imm_use_iterator imm_iter
;
6254 use_operand_p use_p
;
6261 if (dump_enabled_p ())
6262 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
6263 "multiple types in nested loop.\n");
6268 latch_e
= loop_latch_edge (loop
->inner
);
6269 loop_arg
= PHI_ARG_DEF_FROM_EDGE (phi
, latch_e
);
6270 FOR_EACH_IMM_USE_FAST (use_p
, imm_iter
, loop_arg
)
6272 gimple
*use_stmt
= USE_STMT (use_p
);
6273 if (is_gimple_debug (use_stmt
))
6276 if (!flow_bb_inside_loop_p (loop
->inner
, gimple_bb (use_stmt
)))
6278 exit_phi
= use_stmt
;
6284 stmt_vec_info exit_phi_vinfo
= vinfo_for_stmt (exit_phi
);
6285 if (!(STMT_VINFO_RELEVANT_P (exit_phi_vinfo
)
6286 && !STMT_VINFO_LIVE_P (exit_phi_vinfo
)))
6288 if (dump_enabled_p ())
6289 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
6290 "inner-loop induction only used outside "
6291 "of the outer vectorized loop.\n");
6297 if (!STMT_VINFO_RELEVANT_P (stmt_info
))
6300 /* FORNOW: SLP not supported. */
6301 if (STMT_SLP_TYPE (stmt_info
))
6304 gcc_assert (STMT_VINFO_DEF_TYPE (stmt_info
) == vect_induction_def
);
6306 if (gimple_code (phi
) != GIMPLE_PHI
)
6309 if (!vec_stmt
) /* transformation not required. */
6311 STMT_VINFO_TYPE (stmt_info
) = induc_vec_info_type
;
6312 if (dump_enabled_p ())
6313 dump_printf_loc (MSG_NOTE
, vect_location
,
6314 "=== vectorizable_induction ===\n");
6315 vect_model_induction_cost (stmt_info
, ncopies
);
6321 if (dump_enabled_p ())
6322 dump_printf_loc (MSG_NOTE
, vect_location
, "transform induction phi.\n");
6324 vec_def
= get_initial_def_for_induction (phi
);
6325 *vec_stmt
= SSA_NAME_DEF_STMT (vec_def
);
6329 /* Function vectorizable_live_operation.
6331 STMT computes a value that is used outside the loop. Check if
6332 it can be supported. */
6335 vectorizable_live_operation (gimple
*stmt
,
6336 gimple_stmt_iterator
*gsi ATTRIBUTE_UNUSED
,
6339 stmt_vec_info stmt_info
= vinfo_for_stmt (stmt
);
6340 loop_vec_info loop_vinfo
= STMT_VINFO_LOOP_VINFO (stmt_info
);
6341 struct loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
6346 gcc_assert (STMT_VINFO_LIVE_P (stmt_info
));
6348 if (STMT_VINFO_DEF_TYPE (stmt_info
) == vect_reduction_def
)
6351 if (!is_gimple_assign (stmt
))
6353 if (gimple_call_internal_p (stmt
)
6354 && gimple_call_internal_fn (stmt
) == IFN_GOMP_SIMD_LANE
6355 && gimple_call_lhs (stmt
)
6357 && TREE_CODE (gimple_call_arg (stmt
, 0)) == SSA_NAME
6359 == SSA_NAME_VAR (gimple_call_arg (stmt
, 0)))
6361 edge e
= single_exit (loop
);
6362 basic_block merge_bb
= e
->dest
;
6363 imm_use_iterator imm_iter
;
6364 use_operand_p use_p
;
6365 tree lhs
= gimple_call_lhs (stmt
);
6367 FOR_EACH_IMM_USE_FAST (use_p
, imm_iter
, lhs
)
6369 gimple
*use_stmt
= USE_STMT (use_p
);
6370 if (gimple_code (use_stmt
) == GIMPLE_PHI
6371 && gimple_bb (use_stmt
) == merge_bb
)
6376 = build_int_cst (unsigned_type_node
,
6377 loop_vinfo
->vectorization_factor
- 1);
6378 SET_PHI_ARG_DEF (use_stmt
, e
->dest_idx
, vfm1
);
6388 if (TREE_CODE (gimple_assign_lhs (stmt
)) != SSA_NAME
)
6391 /* FORNOW. CHECKME. */
6392 if (nested_in_vect_loop_p (loop
, stmt
))
6395 /* FORNOW: support only if all uses are invariant. This means
6396 that the scalar operations can remain in place, unvectorized.
6397 The original last scalar value that they compute will be used. */
6398 FOR_EACH_SSA_TREE_OPERAND (op
, stmt
, iter
, SSA_OP_USE
)
6400 enum vect_def_type dt
= vect_uninitialized_def
;
6402 if (!vect_is_simple_use (op
, loop_vinfo
, &def_stmt
, &dt
))
6404 if (dump_enabled_p ())
6405 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
6406 "use not simple.\n");
6410 if (dt
!= vect_external_def
&& dt
!= vect_constant_def
)
6414 /* No transformation is required for the cases we currently support. */
6418 /* Kill any debug uses outside LOOP of SSA names defined in STMT. */
6421 vect_loop_kill_debug_uses (struct loop
*loop
, gimple
*stmt
)
6423 ssa_op_iter op_iter
;
6424 imm_use_iterator imm_iter
;
6425 def_operand_p def_p
;
6428 FOR_EACH_PHI_OR_STMT_DEF (def_p
, stmt
, op_iter
, SSA_OP_DEF
)
6430 FOR_EACH_IMM_USE_STMT (ustmt
, imm_iter
, DEF_FROM_PTR (def_p
))
6434 if (!is_gimple_debug (ustmt
))
6437 bb
= gimple_bb (ustmt
);
6439 if (!flow_bb_inside_loop_p (loop
, bb
))
6441 if (gimple_debug_bind_p (ustmt
))
6443 if (dump_enabled_p ())
6444 dump_printf_loc (MSG_NOTE
, vect_location
,
6445 "killing debug use\n");
6447 gimple_debug_bind_reset_value (ustmt
);
6448 update_stmt (ustmt
);
6458 /* This function builds ni_name = number of iterations. Statements
6459 are emitted on the loop preheader edge. */
6462 vect_build_loop_niters (loop_vec_info loop_vinfo
)
6464 tree ni
= unshare_expr (LOOP_VINFO_NITERS (loop_vinfo
));
6465 if (TREE_CODE (ni
) == INTEGER_CST
)
6470 gimple_seq stmts
= NULL
;
6471 edge pe
= loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo
));
6473 var
= create_tmp_var (TREE_TYPE (ni
), "niters");
6474 ni_name
= force_gimple_operand (ni
, &stmts
, false, var
);
6476 gsi_insert_seq_on_edge_immediate (pe
, stmts
);
6483 /* This function generates the following statements:
6485 ni_name = number of iterations loop executes
6486 ratio = ni_name / vf
6487 ratio_mult_vf_name = ratio * vf
6489 and places them on the loop preheader edge. */
6492 vect_generate_tmps_on_preheader (loop_vec_info loop_vinfo
,
6494 tree
*ratio_mult_vf_name_ptr
,
6495 tree
*ratio_name_ptr
)
6497 tree ni_minus_gap_name
;
6500 tree ratio_mult_vf_name
;
6501 int vf
= LOOP_VINFO_VECT_FACTOR (loop_vinfo
);
6502 edge pe
= loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo
));
6505 log_vf
= build_int_cst (TREE_TYPE (ni_name
), exact_log2 (vf
));
6507 /* If epilogue loop is required because of data accesses with gaps, we
6508 subtract one iteration from the total number of iterations here for
6509 correct calculation of RATIO. */
6510 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo
))
6512 ni_minus_gap_name
= fold_build2 (MINUS_EXPR
, TREE_TYPE (ni_name
),
6514 build_one_cst (TREE_TYPE (ni_name
)));
6515 if (!is_gimple_val (ni_minus_gap_name
))
6517 var
= create_tmp_var (TREE_TYPE (ni_name
), "ni_gap");
6518 gimple
*stmts
= NULL
;
6519 ni_minus_gap_name
= force_gimple_operand (ni_minus_gap_name
, &stmts
,
6521 gsi_insert_seq_on_edge_immediate (pe
, stmts
);
6525 ni_minus_gap_name
= ni_name
;
6527 /* Create: ratio = ni >> log2(vf) */
6528 /* ??? As we have ni == number of latch executions + 1, ni could
6529 have overflown to zero. So avoid computing ratio based on ni
6530 but compute it using the fact that we know ratio will be at least
6531 one, thus via (ni - vf) >> log2(vf) + 1. */
6533 = fold_build2 (PLUS_EXPR
, TREE_TYPE (ni_name
),
6534 fold_build2 (RSHIFT_EXPR
, TREE_TYPE (ni_name
),
6535 fold_build2 (MINUS_EXPR
, TREE_TYPE (ni_name
),
6538 (TREE_TYPE (ni_name
), vf
)),
6540 build_int_cst (TREE_TYPE (ni_name
), 1));
6541 if (!is_gimple_val (ratio_name
))
6543 var
= create_tmp_var (TREE_TYPE (ni_name
), "bnd");
6544 gimple
*stmts
= NULL
;
6545 ratio_name
= force_gimple_operand (ratio_name
, &stmts
, true, var
);
6546 gsi_insert_seq_on_edge_immediate (pe
, stmts
);
6548 *ratio_name_ptr
= ratio_name
;
6550 /* Create: ratio_mult_vf = ratio << log2 (vf). */
6552 if (ratio_mult_vf_name_ptr
)
6554 ratio_mult_vf_name
= fold_build2 (LSHIFT_EXPR
, TREE_TYPE (ratio_name
),
6555 ratio_name
, log_vf
);
6556 if (!is_gimple_val (ratio_mult_vf_name
))
6558 var
= create_tmp_var (TREE_TYPE (ni_name
), "ratio_mult_vf");
6559 gimple
*stmts
= NULL
;
6560 ratio_mult_vf_name
= force_gimple_operand (ratio_mult_vf_name
, &stmts
,
6562 gsi_insert_seq_on_edge_immediate (pe
, stmts
);
6564 *ratio_mult_vf_name_ptr
= ratio_mult_vf_name
;
6571 /* Function vect_transform_loop.
6573 The analysis phase has determined that the loop is vectorizable.
6574 Vectorize the loop - created vectorized stmts to replace the scalar
6575 stmts in the loop, and update the loop exit condition. */
6578 vect_transform_loop (loop_vec_info loop_vinfo
)
6580 struct loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
6581 basic_block
*bbs
= LOOP_VINFO_BBS (loop_vinfo
);
6582 int nbbs
= loop
->num_nodes
;
6585 int vectorization_factor
= LOOP_VINFO_VECT_FACTOR (loop_vinfo
);
6587 bool slp_scheduled
= false;
6588 gimple
*stmt
, *pattern_stmt
;
6589 gimple_seq pattern_def_seq
= NULL
;
6590 gimple_stmt_iterator pattern_def_si
= gsi_none ();
6591 bool transform_pattern_stmt
= false;
6592 bool check_profitability
= false;
6594 /* Record number of iterations before we started tampering with the profile. */
6595 gcov_type expected_iterations
= expected_loop_iterations_unbounded (loop
);
6597 if (dump_enabled_p ())
6598 dump_printf_loc (MSG_NOTE
, vect_location
, "=== vec_transform_loop ===\n");
6600 /* If profile is inprecise, we have chance to fix it up. */
6601 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo
))
6602 expected_iterations
= LOOP_VINFO_INT_NITERS (loop_vinfo
);
6604 /* Use the more conservative vectorization threshold. If the number
6605 of iterations is constant assume the cost check has been performed
6606 by our caller. If the threshold makes all loops profitable that
6607 run at least the vectorization factor number of times checking
6608 is pointless, too. */
6609 th
= LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo
);
6610 if (th
>= LOOP_VINFO_VECT_FACTOR (loop_vinfo
) - 1
6611 && !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo
))
6613 if (dump_enabled_p ())
6614 dump_printf_loc (MSG_NOTE
, vect_location
,
6615 "Profitability threshold is %d loop iterations.\n",
6617 check_profitability
= true;
6620 /* Version the loop first, if required, so the profitability check
6623 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo
)
6624 || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo
))
6626 vect_loop_versioning (loop_vinfo
, th
, check_profitability
);
6627 check_profitability
= false;
6630 tree ni_name
= vect_build_loop_niters (loop_vinfo
);
6631 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo
) = ni_name
;
6633 /* Peel the loop if there are data refs with unknown alignment.
6634 Only one data ref with unknown store is allowed. */
6636 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo
))
6638 vect_do_peeling_for_alignment (loop_vinfo
, ni_name
,
6639 th
, check_profitability
);
6640 check_profitability
= false;
6641 /* The above adjusts LOOP_VINFO_NITERS, so cause ni_name to
6643 ni_name
= NULL_TREE
;
6646 /* If the loop has a symbolic number of iterations 'n' (i.e. it's not a
6647 compile time constant), or it is a constant that doesn't divide by the
6648 vectorization factor, then an epilog loop needs to be created.
6649 We therefore duplicate the loop: the original loop will be vectorized,
6650 and will compute the first (n/VF) iterations. The second copy of the loop
6651 will remain scalar and will compute the remaining (n%VF) iterations.
6652 (VF is the vectorization factor). */
6654 if (LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo
)
6655 || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo
))
6659 ni_name
= vect_build_loop_niters (loop_vinfo
);
6660 vect_generate_tmps_on_preheader (loop_vinfo
, ni_name
, &ratio_mult_vf
,
6662 vect_do_peeling_for_loop_bound (loop_vinfo
, ni_name
, ratio_mult_vf
,
6663 th
, check_profitability
);
6665 else if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo
))
6666 ratio
= build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo
)),
6667 LOOP_VINFO_INT_NITERS (loop_vinfo
) / vectorization_factor
);
6671 ni_name
= vect_build_loop_niters (loop_vinfo
);
6672 vect_generate_tmps_on_preheader (loop_vinfo
, ni_name
, NULL
, &ratio
);
6675 /* 1) Make sure the loop header has exactly two entries
6676 2) Make sure we have a preheader basic block. */
6678 gcc_assert (EDGE_COUNT (loop
->header
->preds
) == 2);
6680 split_edge (loop_preheader_edge (loop
));
6682 /* FORNOW: the vectorizer supports only loops which body consist
6683 of one basic block (header + empty latch). When the vectorizer will
6684 support more involved loop forms, the order by which the BBs are
6685 traversed need to be reconsidered. */
6687 for (i
= 0; i
< nbbs
; i
++)
6689 basic_block bb
= bbs
[i
];
6690 stmt_vec_info stmt_info
;
6692 for (gphi_iterator si
= gsi_start_phis (bb
); !gsi_end_p (si
);
6695 gphi
*phi
= si
.phi ();
6696 if (dump_enabled_p ())
6698 dump_printf_loc (MSG_NOTE
, vect_location
,
6699 "------>vectorizing phi: ");
6700 dump_gimple_stmt (MSG_NOTE
, TDF_SLIM
, phi
, 0);
6701 dump_printf (MSG_NOTE
, "\n");
6703 stmt_info
= vinfo_for_stmt (phi
);
6707 if (MAY_HAVE_DEBUG_STMTS
&& !STMT_VINFO_LIVE_P (stmt_info
))
6708 vect_loop_kill_debug_uses (loop
, phi
);
6710 if (!STMT_VINFO_RELEVANT_P (stmt_info
)
6711 && !STMT_VINFO_LIVE_P (stmt_info
))
6714 if (STMT_VINFO_VECTYPE (stmt_info
)
6715 && (TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info
))
6716 != (unsigned HOST_WIDE_INT
) vectorization_factor
)
6717 && dump_enabled_p ())
6718 dump_printf_loc (MSG_NOTE
, vect_location
, "multiple-types.\n");
6720 if (STMT_VINFO_DEF_TYPE (stmt_info
) == vect_induction_def
)
6722 if (dump_enabled_p ())
6723 dump_printf_loc (MSG_NOTE
, vect_location
, "transform phi.\n");
6724 vect_transform_stmt (phi
, NULL
, NULL
, NULL
, NULL
);
6728 pattern_stmt
= NULL
;
6729 for (gimple_stmt_iterator si
= gsi_start_bb (bb
);
6730 !gsi_end_p (si
) || transform_pattern_stmt
;)
6734 if (transform_pattern_stmt
)
6735 stmt
= pattern_stmt
;
6738 stmt
= gsi_stmt (si
);
6739 /* During vectorization remove existing clobber stmts. */
6740 if (gimple_clobber_p (stmt
))
6742 unlink_stmt_vdef (stmt
);
6743 gsi_remove (&si
, true);
6744 release_defs (stmt
);
6749 if (dump_enabled_p ())
6751 dump_printf_loc (MSG_NOTE
, vect_location
,
6752 "------>vectorizing statement: ");
6753 dump_gimple_stmt (MSG_NOTE
, TDF_SLIM
, stmt
, 0);
6754 dump_printf (MSG_NOTE
, "\n");
6757 stmt_info
= vinfo_for_stmt (stmt
);
6759 /* vector stmts created in the outer-loop during vectorization of
6760 stmts in an inner-loop may not have a stmt_info, and do not
6761 need to be vectorized. */
6768 if (MAY_HAVE_DEBUG_STMTS
&& !STMT_VINFO_LIVE_P (stmt_info
))
6769 vect_loop_kill_debug_uses (loop
, stmt
);
6771 if (!STMT_VINFO_RELEVANT_P (stmt_info
)
6772 && !STMT_VINFO_LIVE_P (stmt_info
))
6774 if (STMT_VINFO_IN_PATTERN_P (stmt_info
)
6775 && (pattern_stmt
= STMT_VINFO_RELATED_STMT (stmt_info
))
6776 && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt
))
6777 || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt
))))
6779 stmt
= pattern_stmt
;
6780 stmt_info
= vinfo_for_stmt (stmt
);
6788 else if (STMT_VINFO_IN_PATTERN_P (stmt_info
)
6789 && (pattern_stmt
= STMT_VINFO_RELATED_STMT (stmt_info
))
6790 && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt
))
6791 || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt
))))
6792 transform_pattern_stmt
= true;
6794 /* If pattern statement has def stmts, vectorize them too. */
6795 if (is_pattern_stmt_p (stmt_info
))
6797 if (pattern_def_seq
== NULL
)
6799 pattern_def_seq
= STMT_VINFO_PATTERN_DEF_SEQ (stmt_info
);
6800 pattern_def_si
= gsi_start (pattern_def_seq
);
6802 else if (!gsi_end_p (pattern_def_si
))
6803 gsi_next (&pattern_def_si
);
6804 if (pattern_def_seq
!= NULL
)
6806 gimple
*pattern_def_stmt
= NULL
;
6807 stmt_vec_info pattern_def_stmt_info
= NULL
;
6809 while (!gsi_end_p (pattern_def_si
))
6811 pattern_def_stmt
= gsi_stmt (pattern_def_si
);
6812 pattern_def_stmt_info
6813 = vinfo_for_stmt (pattern_def_stmt
);
6814 if (STMT_VINFO_RELEVANT_P (pattern_def_stmt_info
)
6815 || STMT_VINFO_LIVE_P (pattern_def_stmt_info
))
6817 gsi_next (&pattern_def_si
);
6820 if (!gsi_end_p (pattern_def_si
))
6822 if (dump_enabled_p ())
6824 dump_printf_loc (MSG_NOTE
, vect_location
,
6825 "==> vectorizing pattern def "
6827 dump_gimple_stmt (MSG_NOTE
, TDF_SLIM
,
6828 pattern_def_stmt
, 0);
6829 dump_printf (MSG_NOTE
, "\n");
6832 stmt
= pattern_def_stmt
;
6833 stmt_info
= pattern_def_stmt_info
;
6837 pattern_def_si
= gsi_none ();
6838 transform_pattern_stmt
= false;
6842 transform_pattern_stmt
= false;
6845 if (STMT_VINFO_VECTYPE (stmt_info
))
6849 TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info
));
6850 if (!STMT_SLP_TYPE (stmt_info
)
6851 && nunits
!= (unsigned int) vectorization_factor
6852 && dump_enabled_p ())
6853 /* For SLP VF is set according to unrolling factor, and not
6854 to vector size, hence for SLP this print is not valid. */
6855 dump_printf_loc (MSG_NOTE
, vect_location
, "multiple-types.\n");
6858 /* SLP. Schedule all the SLP instances when the first SLP stmt is
6860 if (STMT_SLP_TYPE (stmt_info
))
6864 slp_scheduled
= true;
6866 if (dump_enabled_p ())
6867 dump_printf_loc (MSG_NOTE
, vect_location
,
6868 "=== scheduling SLP instances ===\n");
6870 vect_schedule_slp (loop_vinfo
);
6873 /* Hybrid SLP stmts must be vectorized in addition to SLP. */
6874 if (!vinfo_for_stmt (stmt
) || PURE_SLP_STMT (stmt_info
))
6876 if (!transform_pattern_stmt
&& gsi_end_p (pattern_def_si
))
6878 pattern_def_seq
= NULL
;
6885 /* -------- vectorize statement ------------ */
6886 if (dump_enabled_p ())
6887 dump_printf_loc (MSG_NOTE
, vect_location
, "transform statement.\n");
6889 grouped_store
= false;
6890 is_store
= vect_transform_stmt (stmt
, &si
, &grouped_store
, NULL
, NULL
);
6893 if (STMT_VINFO_GROUPED_ACCESS (stmt_info
))
6895 /* Interleaving. If IS_STORE is TRUE, the vectorization of the
6896 interleaving chain was completed - free all the stores in
6899 vect_remove_stores (GROUP_FIRST_ELEMENT (stmt_info
));
6903 /* Free the attached stmt_vec_info and remove the stmt. */
6904 gimple
*store
= gsi_stmt (si
);
6905 free_stmt_vec_info (store
);
6906 unlink_stmt_vdef (store
);
6907 gsi_remove (&si
, true);
6908 release_defs (store
);
6911 /* Stores can only appear at the end of pattern statements. */
6912 gcc_assert (!transform_pattern_stmt
);
6913 pattern_def_seq
= NULL
;
6915 else if (!transform_pattern_stmt
&& gsi_end_p (pattern_def_si
))
6917 pattern_def_seq
= NULL
;
6923 slpeel_make_loop_iterate_ntimes (loop
, ratio
);
6925 /* Reduce loop iterations by the vectorization factor. */
6926 scale_loop_profile (loop
, GCOV_COMPUTE_SCALE (1, vectorization_factor
),
6927 expected_iterations
/ vectorization_factor
);
6928 loop
->nb_iterations_upper_bound
6929 = wi::udiv_floor (loop
->nb_iterations_upper_bound
, vectorization_factor
);
6930 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo
)
6931 && loop
->nb_iterations_upper_bound
!= 0)
6932 loop
->nb_iterations_upper_bound
= loop
->nb_iterations_upper_bound
- 1;
6933 if (loop
->any_estimate
)
6935 loop
->nb_iterations_estimate
6936 = wi::udiv_floor (loop
->nb_iterations_estimate
, vectorization_factor
);
6937 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo
)
6938 && loop
->nb_iterations_estimate
!= 0)
6939 loop
->nb_iterations_estimate
= loop
->nb_iterations_estimate
- 1;
6942 if (dump_enabled_p ())
6944 dump_printf_loc (MSG_NOTE
, vect_location
,
6945 "LOOP VECTORIZED\n");
6947 dump_printf_loc (MSG_NOTE
, vect_location
,
6948 "OUTER LOOP VECTORIZED\n");
6949 dump_printf (MSG_NOTE
, "\n");
6952 /* Free SLP instances here because otherwise stmt reference counting
6954 slp_instance instance
;
6955 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo
), i
, instance
)
6956 vect_free_slp_instance (instance
);
6957 LOOP_VINFO_SLP_INSTANCES (loop_vinfo
).release ();
6960 /* The code below is trying to perform simple optimization - revert
6961 if-conversion for masked stores, i.e. if the mask of a store is zero
6962 do not perform it and all stored value producers also if possible.
6970 this transformation will produce the following semi-hammock:
6972 if (!mask__ifc__42.18_165 == { 0, 0, 0, 0, 0, 0, 0, 0 })
6974 vect__11.19_170 = MASK_LOAD (vectp_p1.20_168, 0B, mask__ifc__42.18_165);
6975 vect__12.22_172 = vect__11.19_170 + vect_cst__171;
6976 MASK_STORE (vectp_p1.23_175, 0B, mask__ifc__42.18_165, vect__12.22_172);
6977 vect__18.25_182 = MASK_LOAD (vectp_p3.26_180, 0B, mask__ifc__42.18_165);
6978 vect__19.28_184 = vect__18.25_182 + vect_cst__183;
6979 MASK_STORE (vectp_p2.29_187, 0B, mask__ifc__42.18_165, vect__19.28_184);
6984 optimize_mask_stores (struct loop
*loop
)
6986 basic_block
*bbs
= get_loop_body (loop
);
6987 unsigned nbbs
= loop
->num_nodes
;
6990 gimple_stmt_iterator gsi
;
6992 auto_vec
<gimple
*> worklist
;
6994 vect_location
= find_loop_location (loop
);
6995 /* Pick up all masked stores in loop if any. */
6996 for (i
= 0; i
< nbbs
; i
++)
6999 for (gsi
= gsi_start_bb (bb
); !gsi_end_p (gsi
);
7002 stmt
= gsi_stmt (gsi
);
7003 if (is_gimple_call (stmt
)
7004 && gimple_call_internal_p (stmt
)
7005 && gimple_call_internal_fn (stmt
) == IFN_MASK_STORE
)
7006 worklist
.safe_push (stmt
);
7011 if (worklist
.is_empty ())
7014 /* Loop has masked stores. */
7015 while (!worklist
.is_empty ())
7017 gimple
*last
, *last_store
;
7020 basic_block store_bb
, join_bb
;
7021 gimple_stmt_iterator gsi_to
;
7022 tree vdef
, new_vdef
;
7027 last
= worklist
.pop ();
7028 mask
= gimple_call_arg (last
, 2);
7029 bb
= gimple_bb (last
);
7030 /* Create new bb. */
7031 e
= split_block (bb
, last
);
7033 store_bb
= create_empty_bb (bb
);
7034 add_bb_to_loop (store_bb
, loop
);
7035 e
->flags
= EDGE_TRUE_VALUE
;
7036 efalse
= make_edge (bb
, store_bb
, EDGE_FALSE_VALUE
);
7037 /* Put STORE_BB to likely part. */
7038 efalse
->probability
= PROB_UNLIKELY
;
7039 store_bb
->frequency
= PROB_ALWAYS
- EDGE_FREQUENCY (efalse
);
7040 make_edge (store_bb
, join_bb
, EDGE_FALLTHRU
);
7041 if (dom_info_available_p (CDI_DOMINATORS
))
7042 set_immediate_dominator (CDI_DOMINATORS
, store_bb
, bb
);
7043 if (dump_enabled_p ())
7044 dump_printf_loc (MSG_NOTE
, vect_location
,
7045 "Create new block %d to sink mask stores.",
7047 /* Create vector comparison with boolean result. */
7048 vectype
= TREE_TYPE (mask
);
7049 zero
= build_zero_cst (vectype
);
7050 stmt
= gimple_build_cond (EQ_EXPR
, mask
, zero
, NULL_TREE
, NULL_TREE
);
7051 gsi
= gsi_last_bb (bb
);
7052 gsi_insert_after (&gsi
, stmt
, GSI_SAME_STMT
);
7053 /* Create new PHI node for vdef of the last masked store:
7054 .MEM_2 = VDEF <.MEM_1>
7055 will be converted to
7056 .MEM.3 = VDEF <.MEM_1>
7057 and new PHI node will be created in join bb
7058 .MEM_2 = PHI <.MEM_1, .MEM_3>
7060 vdef
= gimple_vdef (last
);
7061 new_vdef
= make_ssa_name (gimple_vop (cfun
), last
);
7062 gimple_set_vdef (last
, new_vdef
);
7063 phi
= create_phi_node (vdef
, join_bb
);
7064 add_phi_arg (phi
, new_vdef
, EDGE_SUCC (store_bb
, 0), UNKNOWN_LOCATION
);
7066 /* Put all masked stores with the same mask to STORE_BB if possible. */
7069 gimple_stmt_iterator gsi_from
;
7070 gimple
*stmt1
= NULL
;
7072 /* Move masked store to STORE_BB. */
7074 gsi
= gsi_for_stmt (last
);
7076 /* Shift GSI to the previous stmt for further traversal. */
7078 gsi_to
= gsi_start_bb (store_bb
);
7079 gsi_move_before (&gsi_from
, &gsi_to
);
7080 /* Setup GSI_TO to the non-empty block start. */
7081 gsi_to
= gsi_start_bb (store_bb
);
7082 if (dump_enabled_p ())
7084 dump_printf_loc (MSG_NOTE
, vect_location
,
7085 "Move stmt to created bb\n");
7086 dump_gimple_stmt (MSG_NOTE
, TDF_SLIM
, last
, 0);
7088 /* Move all stored value producers if possible. */
7089 while (!gsi_end_p (gsi
))
7092 imm_use_iterator imm_iter
;
7093 use_operand_p use_p
;
7096 /* Skip debug statements. */
7097 if (is_gimple_debug (gsi_stmt (gsi
)))
7102 stmt1
= gsi_stmt (gsi
);
7103 /* Do not consider statements writing to memory or having
7104 volatile operand. */
7105 if (gimple_vdef (stmt1
)
7106 || gimple_has_volatile_ops (stmt1
))
7110 lhs
= gimple_get_lhs (stmt1
);
7114 /* LHS of vectorized stmt must be SSA_NAME. */
7115 if (TREE_CODE (lhs
) != SSA_NAME
)
7118 if (!VECTOR_TYPE_P (TREE_TYPE (lhs
)))
7120 /* Remove dead scalar statement. */
7121 if (has_zero_uses (lhs
))
7123 gsi_remove (&gsi_from
, true);
7128 /* Check that LHS does not have uses outside of STORE_BB. */
7130 FOR_EACH_IMM_USE_FAST (use_p
, imm_iter
, lhs
)
7133 use_stmt
= USE_STMT (use_p
);
7134 if (is_gimple_debug (use_stmt
))
7136 if (gimple_bb (use_stmt
) != store_bb
)
7145 if (gimple_vuse (stmt1
)
7146 && gimple_vuse (stmt1
) != gimple_vuse (last_store
))
7149 /* Can move STMT1 to STORE_BB. */
7150 if (dump_enabled_p ())
7152 dump_printf_loc (MSG_NOTE
, vect_location
,
7153 "Move stmt to created bb\n");
7154 dump_gimple_stmt (MSG_NOTE
, TDF_SLIM
, stmt1
, 0);
7156 gsi_move_before (&gsi_from
, &gsi_to
);
7157 /* Shift GSI_TO for further insertion. */
7160 /* Put other masked stores with the same mask to STORE_BB. */
7161 if (worklist
.is_empty ()
7162 || gimple_call_arg (worklist
.last (), 2) != mask
7163 || worklist
.last () != stmt1
)
7165 last
= worklist
.pop ();
7167 add_phi_arg (phi
, gimple_vuse (last_store
), e
, UNKNOWN_LOCATION
);