2 Copyright (C) 2003-2015 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com> and
4 Ira Rosen <irar@il.ibm.com>
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
24 #include "coretypes.h"
30 #include "double-int.h"
37 #include "fold-const.h"
38 #include "stor-layout.h"
40 #include "hard-reg-set.h"
42 #include "dominance.h"
45 #include "basic-block.h"
46 #include "gimple-pretty-print.h"
47 #include "tree-ssa-alias.h"
48 #include "internal-fn.h"
49 #include "gimple-expr.h"
53 #include "gimple-iterator.h"
54 #include "gimplify-me.h"
55 #include "gimple-ssa.h"
56 #include "tree-phinodes.h"
57 #include "ssa-iterators.h"
58 #include "stringpool.h"
59 #include "tree-ssanames.h"
60 #include "tree-ssa-loop-ivopts.h"
61 #include "tree-ssa-loop-manip.h"
62 #include "tree-ssa-loop-niter.h"
63 #include "tree-pass.h"
68 #include "statistics.h"
70 #include "fixed-value.h"
71 #include "insn-config.h"
81 #include "insn-codes.h"
84 #include "diagnostic-core.h"
85 #include "tree-chrec.h"
86 #include "tree-scalar-evolution.h"
87 #include "tree-vectorizer.h"
90 /* Loop Vectorization Pass.
92 This pass tries to vectorize loops.
94 For example, the vectorizer transforms the following simple loop:
96 short a[N]; short b[N]; short c[N]; int i;
102 as if it was manually vectorized by rewriting the source code into:
104 typedef int __attribute__((mode(V8HI))) v8hi;
105 short a[N]; short b[N]; short c[N]; int i;
106 v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
109 for (i=0; i<N/8; i++){
116 The main entry to this pass is vectorize_loops(), in which
117 the vectorizer applies a set of analyses on a given set of loops,
118 followed by the actual vectorization transformation for the loops that
119 had successfully passed the analysis phase.
120 Throughout this pass we make a distinction between two types of
121 data: scalars (which are represented by SSA_NAMES), and memory references
122 ("data-refs"). These two types of data require different handling both
123 during analysis and transformation. The types of data-refs that the
124 vectorizer currently supports are ARRAY_REFS which base is an array DECL
125 (not a pointer), and INDIRECT_REFS through pointers; both array and pointer
126 accesses are required to have a simple (consecutive) access pattern.
130 The driver for the analysis phase is vect_analyze_loop().
131 It applies a set of analyses, some of which rely on the scalar evolution
132 analyzer (scev) developed by Sebastian Pop.
134 During the analysis phase the vectorizer records some information
135 per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
136 loop, as well as general information about the loop as a whole, which is
137 recorded in a "loop_vec_info" struct attached to each loop.
139 Transformation phase:
140 =====================
141 The loop transformation phase scans all the stmts in the loop, and
142 creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
143 the loop that needs to be vectorized. It inserts the vector code sequence
144 just before the scalar stmt S, and records a pointer to the vector code
145 in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
146 attached to S). This pointer will be used for the vectorization of following
147 stmts which use the def of stmt S. Stmt S is removed if it writes to memory;
148 otherwise, we rely on dead code elimination for removing it.
150 For example, say stmt S1 was vectorized into stmt VS1:
153 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
156 To vectorize stmt S2, the vectorizer first finds the stmt that defines
157 the operand 'b' (S1), and gets the relevant vector def 'vb' from the
158 vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)). The
159 resulting sequence would be:
162 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
164 S2: a = b; STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2
166 Operands that are not SSA_NAMEs, are data-refs that appear in
167 load/store operations (like 'x[i]' in S1), and are handled differently.
171 Currently the only target specific information that is used is the
172 size of the vector (in bytes) - "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".
173 Targets that can support different sizes of vectors, for now will need
174 to specify one value for "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD". More
175 flexibility will be added in the future.
177 Since we only vectorize operations which vector form can be
178 expressed using existing tree codes, to verify that an operation is
179 supported, the vectorizer checks the relevant optab at the relevant
180 machine_mode (e.g, optab_handler (add_optab, V8HImode)). If
181 the value found is CODE_FOR_nothing, then there's no target support, and
182 we can't vectorize the stmt.
184 For additional information on this project see:
185 http://gcc.gnu.org/projects/tree-ssa/vectorization.html
188 static void vect_estimate_min_profitable_iters (loop_vec_info
, int *, int *);
190 /* Function vect_determine_vectorization_factor
192 Determine the vectorization factor (VF). VF is the number of data elements
193 that are operated upon in parallel in a single iteration of the vectorized
194 loop. For example, when vectorizing a loop that operates on 4byte elements,
195 on a target with vector size (VS) 16byte, the VF is set to 4, since 4
196 elements can fit in a single vector register.
198 We currently support vectorization of loops in which all types operated upon
199 are of the same size. Therefore this function currently sets VF according to
200 the size of the types operated upon, and fails if there are multiple sizes
203 VF is also the factor by which the loop iterations are strip-mined, e.g.:
210 for (i=0; i<N; i+=VF){
211 a[i:VF] = b[i:VF] + c[i:VF];
/* Compute the loop's vectorization factor (VF): walk every basic block of
   the loop, examine each relevant phi and statement, derive a vector type
   for it, and keep the maximum TYPE_VECTOR_SUBPARTS seen; the result is
   stored in LOOP_VINFO_VECT_FACTOR at the end.
   NOTE(review): this chunk is a corrupted extraction -- the original file's
   line numbers are fused into the code text and many lines (local variable
   declarations such as `i'/`nunits'/`vectype'/`scalar_type', braces, and
   the return statements) are missing.  Do not compile as-is; restore from
   the upstream tree-vect-loop.c before changing any logic.  */
216 vect_determine_vectorization_factor (loop_vec_info loop_vinfo
)
218 struct loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
219 basic_block
*bbs
= LOOP_VINFO_BBS (loop_vinfo
);
220 int nbbs
= loop
->num_nodes
;
221 unsigned int vectorization_factor
= 0;
226 stmt_vec_info stmt_info
;
229 gimple stmt
, pattern_stmt
= NULL
;
230 gimple_seq pattern_def_seq
= NULL
;
231 gimple_stmt_iterator pattern_def_si
= gsi_none ();
232 bool analyze_pattern_stmt
= false;
234 if (dump_enabled_p ())
235 dump_printf_loc (MSG_NOTE
, vect_location
,
236 "=== vect_determine_vectorization_factor ===\n");
/* First pass over each bb: derive vectypes for the relevant phis.  */
238 for (i
= 0; i
< nbbs
; i
++)
240 basic_block bb
= bbs
[i
];
242 for (gphi_iterator si
= gsi_start_phis (bb
); !gsi_end_p (si
);
246 stmt_info
= vinfo_for_stmt (phi
);
247 if (dump_enabled_p ())
249 dump_printf_loc (MSG_NOTE
, vect_location
, "==> examining phi: ");
250 dump_gimple_stmt (MSG_NOTE
, TDF_SLIM
, phi
, 0);
251 dump_printf (MSG_NOTE
, "\n");
254 gcc_assert (stmt_info
);
256 if (STMT_VINFO_RELEVANT_P (stmt_info
))
258 gcc_assert (!STMT_VINFO_VECTYPE (stmt_info
));
259 scalar_type
= TREE_TYPE (PHI_RESULT (phi
));
261 if (dump_enabled_p ())
263 dump_printf_loc (MSG_NOTE
, vect_location
,
264 "get vectype for scalar type: ");
265 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, scalar_type
);
266 dump_printf (MSG_NOTE
, "\n");
269 vectype
= get_vectype_for_scalar_type (scalar_type
);
272 if (dump_enabled_p ())
274 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
275 "not vectorized: unsupported "
277 dump_generic_expr (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
,
279 dump_printf (MSG_MISSED_OPTIMIZATION
, "\n");
283 STMT_VINFO_VECTYPE (stmt_info
) = vectype
;
285 if (dump_enabled_p ())
287 dump_printf_loc (MSG_NOTE
, vect_location
, "vectype: ");
288 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, vectype
);
289 dump_printf (MSG_NOTE
, "\n");
292 nunits
= TYPE_VECTOR_SUBPARTS (vectype
);
293 if (dump_enabled_p ())
294 dump_printf_loc (MSG_NOTE
, vect_location
, "nunits = %d\n",
297 if (!vectorization_factor
298 || (nunits
> vectorization_factor
))
299 vectorization_factor
= nunits
;
/* Second pass: walk the bb's statements, including pattern statements
   and their def sequences (driven by analyze_pattern_stmt /
   pattern_def_si).  */
303 for (gimple_stmt_iterator si
= gsi_start_bb (bb
);
304 !gsi_end_p (si
) || analyze_pattern_stmt
;)
308 if (analyze_pattern_stmt
)
311 stmt
= gsi_stmt (si
);
313 stmt_info
= vinfo_for_stmt (stmt
);
315 if (dump_enabled_p ())
317 dump_printf_loc (MSG_NOTE
, vect_location
,
318 "==> examining statement: ");
319 dump_gimple_stmt (MSG_NOTE
, TDF_SLIM
, stmt
, 0);
320 dump_printf (MSG_NOTE
, "\n");
323 gcc_assert (stmt_info
);
325 /* Skip stmts which do not need to be vectorized. */
326 if ((!STMT_VINFO_RELEVANT_P (stmt_info
)
327 && !STMT_VINFO_LIVE_P (stmt_info
))
328 || gimple_clobber_p (stmt
))
330 if (STMT_VINFO_IN_PATTERN_P (stmt_info
)
331 && (pattern_stmt
= STMT_VINFO_RELATED_STMT (stmt_info
))
332 && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt
))
333 || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt
))))
336 stmt_info
= vinfo_for_stmt (pattern_stmt
);
337 if (dump_enabled_p ())
339 dump_printf_loc (MSG_NOTE
, vect_location
,
340 "==> examining pattern statement: ");
341 dump_gimple_stmt (MSG_NOTE
, TDF_SLIM
, stmt
, 0);
342 dump_printf (MSG_NOTE
, "\n");
347 if (dump_enabled_p ())
348 dump_printf_loc (MSG_NOTE
, vect_location
, "skip.\n");
353 else if (STMT_VINFO_IN_PATTERN_P (stmt_info
)
354 && (pattern_stmt
= STMT_VINFO_RELATED_STMT (stmt_info
))
355 && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt
))
356 || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt
))))
357 analyze_pattern_stmt
= true;
359 /* If a pattern statement has def stmts, analyze them too. */
360 if (is_pattern_stmt_p (stmt_info
))
362 if (pattern_def_seq
== NULL
)
364 pattern_def_seq
= STMT_VINFO_PATTERN_DEF_SEQ (stmt_info
);
365 pattern_def_si
= gsi_start (pattern_def_seq
);
367 else if (!gsi_end_p (pattern_def_si
))
368 gsi_next (&pattern_def_si
);
369 if (pattern_def_seq
!= NULL
)
371 gimple pattern_def_stmt
= NULL
;
372 stmt_vec_info pattern_def_stmt_info
= NULL
;
374 while (!gsi_end_p (pattern_def_si
))
376 pattern_def_stmt
= gsi_stmt (pattern_def_si
);
377 pattern_def_stmt_info
378 = vinfo_for_stmt (pattern_def_stmt
);
379 if (STMT_VINFO_RELEVANT_P (pattern_def_stmt_info
)
380 || STMT_VINFO_LIVE_P (pattern_def_stmt_info
))
382 gsi_next (&pattern_def_si
);
385 if (!gsi_end_p (pattern_def_si
))
387 if (dump_enabled_p ())
389 dump_printf_loc (MSG_NOTE
, vect_location
,
390 "==> examining pattern def stmt: ");
391 dump_gimple_stmt (MSG_NOTE
, TDF_SLIM
,
392 pattern_def_stmt
, 0);
393 dump_printf (MSG_NOTE
, "\n");
396 stmt
= pattern_def_stmt
;
397 stmt_info
= pattern_def_stmt_info
;
401 pattern_def_si
= gsi_none ();
402 analyze_pattern_stmt
= false;
406 analyze_pattern_stmt
= false;
/* Statements with no lhs cannot be vectorized, except internal
   IFN_MASK_STORE calls (which legitimately have no lhs).  */
409 if (gimple_get_lhs (stmt
) == NULL_TREE
410 /* MASK_STORE has no lhs, but is ok. */
411 && (!is_gimple_call (stmt
)
412 || !gimple_call_internal_p (stmt
)
413 || gimple_call_internal_fn (stmt
) != IFN_MASK_STORE
))
415 if (is_gimple_call (stmt
))
417 /* Ignore calls with no lhs. These must be calls to
418 #pragma omp simd functions, and what vectorization factor
419 it really needs can't be determined until
420 vectorizable_simd_clone_call. */
421 if (!analyze_pattern_stmt
&& gsi_end_p (pattern_def_si
))
423 pattern_def_seq
= NULL
;
428 if (dump_enabled_p ())
430 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
431 "not vectorized: irregular stmt.");
432 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, stmt
,
434 dump_printf (MSG_MISSED_OPTIMIZATION
, "\n");
439 if (VECTOR_MODE_P (TYPE_MODE (gimple_expr_type (stmt
))))
441 if (dump_enabled_p ())
443 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
444 "not vectorized: vector stmt in loop:");
445 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, stmt
, 0);
446 dump_printf (MSG_MISSED_OPTIMIZATION
, "\n");
/* Reuse an already-set vectype (dataref or pattern stmts); otherwise
   derive it from the stmt's lhs type, or from the stored value (arg 3)
   for IFN_MASK_STORE calls.  */
451 if (STMT_VINFO_VECTYPE (stmt_info
))
453 /* The only case when a vectype had been already set is for stmts
454 that contain a dataref, or for "pattern-stmts" (stmts
455 generated by the vectorizer to represent/replace a certain
457 gcc_assert (STMT_VINFO_DATA_REF (stmt_info
)
458 || is_pattern_stmt_p (stmt_info
)
459 || !gsi_end_p (pattern_def_si
));
460 vectype
= STMT_VINFO_VECTYPE (stmt_info
);
464 gcc_assert (!STMT_VINFO_DATA_REF (stmt_info
));
465 if (is_gimple_call (stmt
)
466 && gimple_call_internal_p (stmt
)
467 && gimple_call_internal_fn (stmt
) == IFN_MASK_STORE
)
468 scalar_type
= TREE_TYPE (gimple_call_arg (stmt
, 3));
470 scalar_type
= TREE_TYPE (gimple_get_lhs (stmt
));
471 if (dump_enabled_p ())
473 dump_printf_loc (MSG_NOTE
, vect_location
,
474 "get vectype for scalar type: ");
475 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, scalar_type
);
476 dump_printf (MSG_NOTE
, "\n");
478 vectype
= get_vectype_for_scalar_type (scalar_type
);
481 if (dump_enabled_p ())
483 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
484 "not vectorized: unsupported "
486 dump_generic_expr (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
,
488 dump_printf (MSG_MISSED_OPTIMIZATION
, "\n");
493 STMT_VINFO_VECTYPE (stmt_info
) = vectype
;
495 if (dump_enabled_p ())
497 dump_printf_loc (MSG_NOTE
, vect_location
, "vectype: ");
498 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, vectype
);
499 dump_printf (MSG_NOTE
, "\n");
503 /* The vectorization factor is according to the smallest
504 scalar type (or the largest vector size, but we only
505 support one vector size per loop). */
506 scalar_type
= vect_get_smallest_scalar_type (stmt
, &dummy
,
508 if (dump_enabled_p ())
510 dump_printf_loc (MSG_NOTE
, vect_location
,
511 "get vectype for scalar type: ");
512 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, scalar_type
);
513 dump_printf (MSG_NOTE
, "\n");
515 vf_vectype
= get_vectype_for_scalar_type (scalar_type
);
518 if (dump_enabled_p ())
520 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
521 "not vectorized: unsupported data-type ");
522 dump_generic_expr (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
,
524 dump_printf (MSG_MISSED_OPTIMIZATION
, "\n");
529 if ((GET_MODE_SIZE (TYPE_MODE (vectype
))
530 != GET_MODE_SIZE (TYPE_MODE (vf_vectype
))))
532 if (dump_enabled_p ())
534 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
535 "not vectorized: different sized vector "
536 "types in statement, ");
537 dump_generic_expr (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
,
539 dump_printf (MSG_MISSED_OPTIMIZATION
, " and ");
540 dump_generic_expr (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
,
542 dump_printf (MSG_MISSED_OPTIMIZATION
, "\n");
547 if (dump_enabled_p ())
549 dump_printf_loc (MSG_NOTE
, vect_location
, "vectype: ");
550 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, vf_vectype
);
551 dump_printf (MSG_NOTE
, "\n");
554 nunits
= TYPE_VECTOR_SUBPARTS (vf_vectype
);
555 if (dump_enabled_p ())
556 dump_printf_loc (MSG_NOTE
, vect_location
, "nunits = %d\n", nunits
);
557 if (!vectorization_factor
558 || (nunits
> vectorization_factor
))
559 vectorization_factor
= nunits
;
561 if (!analyze_pattern_stmt
&& gsi_end_p (pattern_def_si
))
563 pattern_def_seq
= NULL
;
569 /* TODO: Analyze cost. Decide if worth while to vectorize. */
570 if (dump_enabled_p ())
571 dump_printf_loc (MSG_NOTE
, vect_location
, "vectorization factor = %d\n",
572 vectorization_factor
);
/* A VF that stayed at 0 or 1 means no usable vector type was found.  */
573 if (vectorization_factor
<= 1)
575 if (dump_enabled_p ())
576 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
577 "not vectorized: unsupported data-type\n");
580 LOOP_VINFO_VECT_FACTOR (loop_vinfo
) = vectorization_factor
;
586 /* Function vect_is_simple_iv_evolution.
588 FORNOW: A simple evolution of an induction variables in the loop is
589 considered a polynomial evolution. */
/* Check whether ACCESS_FN describes a simple induction in loop LOOP_NB:
   a non-chrec step whose initial condition and step are extracted and
   validated (integer step, or loop-invariant SSA name, or -- under
   -fassociative-math -- a float step).
   NOTE(review): corrupted extraction -- original line numbers are fused
   into the text, and the return type, several declarations, braces and
   return statements are missing; restore from upstream before editing.  */
592 vect_is_simple_iv_evolution (unsigned loop_nb
, tree access_fn
, tree
* init
,
597 tree evolution_part
= evolution_part_in_loop_num (access_fn
, loop_nb
);
600 /* When there is no evolution in this loop, the evolution function
602 if (evolution_part
== NULL_TREE
)
605 /* When the evolution is a polynomial of degree >= 2
606 the evolution function is not "simple". */
607 if (tree_is_chrec (evolution_part
))
610 step_expr
= evolution_part
;
611 init_expr
= unshare_expr (initial_condition_in_loop_num (access_fn
, loop_nb
));
613 if (dump_enabled_p ())
615 dump_printf_loc (MSG_NOTE
, vect_location
, "step: ");
616 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, step_expr
);
617 dump_printf (MSG_NOTE
, ", init: ");
618 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, init_expr
);
619 dump_printf (MSG_NOTE
, "\n");
/* Reject steps that are neither constant, nor an SSA name defined
   outside the loop, nor a float allowed by -fassociative-math.  */
625 if (TREE_CODE (step_expr
) != INTEGER_CST
626 && (TREE_CODE (step_expr
) != SSA_NAME
627 || ((bb
= gimple_bb (SSA_NAME_DEF_STMT (step_expr
)))
628 && flow_bb_inside_loop_p (get_loop (cfun
, loop_nb
), bb
))
629 || (!INTEGRAL_TYPE_P (TREE_TYPE (step_expr
))
630 && (!SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr
))
631 || !flag_associative_math
)))
632 && (TREE_CODE (step_expr
) != REAL_CST
633 || !flag_associative_math
))
635 if (dump_enabled_p ())
636 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
644 /* Function vect_analyze_scalar_cycles_1.
646 Examine the cross iteration def-use cycles of scalar variables
647 in LOOP. LOOP_VINFO represents the loop that is now being
648 considered for vectorization (can be LOOP, or an outer-loop
/* Classify the cross-iteration scalar cycles of LOOP's header phis.
   Pass 1 marks simple inductions; phis that fail the induction test are
   pushed on a worklist.  Pass 2 classifies each worklist phi via
   vect_force_simple_reduction as a double reduction, a nested cycle, or a
   reduction; anything else is reported as an unknown def-use cycle.
   NOTE(review): corrupted extraction -- original line numbers are fused
   into the text and many lines (declarations of `gsi'/`init'/`step'/
   `nested_cycle'/`reduc_stmt', braces, conditions) are missing; restore
   from upstream before editing logic.  */
652 vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo
, struct loop
*loop
)
654 basic_block bb
= loop
->header
;
656 auto_vec
<gimple
, 64> worklist
;
660 if (dump_enabled_p ())
661 dump_printf_loc (MSG_NOTE
, vect_location
,
662 "=== vect_analyze_scalar_cycles ===\n");
664 /* First - identify all inductions. Reduction detection assumes that all the
665 inductions have been identified, therefore, this order must not be
667 for (gsi
= gsi_start_phis (bb
); !gsi_end_p (gsi
); gsi_next (&gsi
))
669 gphi
*phi
= gsi
.phi ();
670 tree access_fn
= NULL
;
671 tree def
= PHI_RESULT (phi
);
672 stmt_vec_info stmt_vinfo
= vinfo_for_stmt (phi
);
674 if (dump_enabled_p ())
676 dump_printf_loc (MSG_NOTE
, vect_location
, "Analyze phi: ");
677 dump_gimple_stmt (MSG_NOTE
, TDF_SLIM
, phi
, 0);
678 dump_printf (MSG_NOTE
, "\n");
681 /* Skip virtual phi's. The data dependences that are associated with
682 virtual defs/uses (i.e., memory accesses) are analyzed elsewhere. */
683 if (virtual_operand_p (def
))
686 STMT_VINFO_DEF_TYPE (stmt_vinfo
) = vect_unknown_def_type
;
688 /* Analyze the evolution function. */
689 access_fn
= analyze_scalar_evolution (loop
, def
);
692 STRIP_NOPS (access_fn
);
693 if (dump_enabled_p ())
695 dump_printf_loc (MSG_NOTE
, vect_location
,
696 "Access function of PHI: ");
697 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, access_fn
);
698 dump_printf (MSG_NOTE
, "\n");
700 STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo
)
701 = evolution_part_in_loop_num (access_fn
, loop
->num
);
705 || !vect_is_simple_iv_evolution (loop
->num
, access_fn
, &init
, &step
)
706 || (LOOP_VINFO_LOOP (loop_vinfo
) != loop
707 && TREE_CODE (step
) != INTEGER_CST
))
709 worklist
.safe_push (phi
);
713 gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo
) != NULL_TREE
);
715 if (dump_enabled_p ())
716 dump_printf_loc (MSG_NOTE
, vect_location
, "Detected induction.\n");
717 STMT_VINFO_DEF_TYPE (stmt_vinfo
) = vect_induction_def
;
721 /* Second - identify all reductions and nested cycles. */
722 while (worklist
.length () > 0)
724 gimple phi
= worklist
.pop ();
725 tree def
= PHI_RESULT (phi
);
726 stmt_vec_info stmt_vinfo
= vinfo_for_stmt (phi
);
730 if (dump_enabled_p ())
732 dump_printf_loc (MSG_NOTE
, vect_location
, "Analyze phi: ");
733 dump_gimple_stmt (MSG_NOTE
, TDF_SLIM
, phi
, 0);
734 dump_printf (MSG_NOTE
, "\n");
737 gcc_assert (!virtual_operand_p (def
)
738 && STMT_VINFO_DEF_TYPE (stmt_vinfo
) == vect_unknown_def_type
);
/* A cycle found in an inner loop of the loop being vectorized is a
   "nested cycle" rather than a plain reduction.  */
740 nested_cycle
= (loop
!= LOOP_VINFO_LOOP (loop_vinfo
));
741 reduc_stmt
= vect_force_simple_reduction (loop_vinfo
, phi
, !nested_cycle
,
747 if (dump_enabled_p ())
748 dump_printf_loc (MSG_NOTE
, vect_location
,
749 "Detected double reduction.\n");
751 STMT_VINFO_DEF_TYPE (stmt_vinfo
) = vect_double_reduction_def
;
752 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt
)) =
753 vect_double_reduction_def
;
759 if (dump_enabled_p ())
760 dump_printf_loc (MSG_NOTE
, vect_location
,
761 "Detected vectorizable nested cycle.\n");
763 STMT_VINFO_DEF_TYPE (stmt_vinfo
) = vect_nested_cycle
;
764 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt
)) =
769 if (dump_enabled_p ())
770 dump_printf_loc (MSG_NOTE
, vect_location
,
771 "Detected reduction.\n");
773 STMT_VINFO_DEF_TYPE (stmt_vinfo
) = vect_reduction_def
;
774 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt
)) =
776 /* Store the reduction cycles for possible vectorization in
778 LOOP_VINFO_REDUCTIONS (loop_vinfo
).safe_push (reduc_stmt
);
783 if (dump_enabled_p ())
784 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
785 "Unknown def-use cycle pattern.\n");
790 /* Function vect_analyze_scalar_cycles.
792 Examine the cross iteration def-use cycles of scalar variables, by
793 analyzing the loop-header PHIs of scalar variables. Classify each
794 cycle as one of the following: invariant, induction, reduction, unknown.
795 We do that for the loop represented by LOOP_VINFO, and also to its
796 inner-loop, if exists.
797 Examples for scalar cycles:
/* Driver: classify the scalar cycles of LOOP_VINFO's loop, then of its
   inner loop (the guard condition for the inner-loop call -- presumably
   `if (loop->inner)' -- is missing from this corrupted extraction, as are
   braces and the original line breaks; restore from upstream).  */
812 vect_analyze_scalar_cycles (loop_vec_info loop_vinfo
)
814 struct loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
816 vect_analyze_scalar_cycles_1 (loop_vinfo
, loop
);
818 /* When vectorizing an outer-loop, the inner-loop is executed sequentially.
819 Reductions in such inner-loop therefore have different properties than
820 the reductions in the nest that gets vectorized:
821 1. When vectorized, they are executed in the same order as in the original
822 scalar loop, so we can't change the order of computation when
824 2. FIXME: Inner-loop reductions can be used in the inner-loop, so the
825 current checks are too strict. */
828 vect_analyze_scalar_cycles_1 (loop_vinfo
, loop
->inner
);
832 /* Function vect_get_loop_niters.
834 Determine how many iterations the loop is executed and place it
835 in NUMBER_OF_ITERATIONS. Place the number of latch iterations
836 in NUMBER_OF_ITERATIONSM1.
838 Return the loop exit condition. */
/* Return the loop exit condition; store the header execution count
   (latch count + 1, when computable) in *NUMBER_OF_ITERATIONS and the raw
   latch count in *NUMBER_OF_ITERATIONSM1.
   NOTE(review): the return-type line and the declaration of `niters' are
   missing from this corrupted extraction.  */
842 vect_get_loop_niters (struct loop
*loop
, tree
*number_of_iterations
,
843 tree
*number_of_iterationsm1
)
847 if (dump_enabled_p ())
848 dump_printf_loc (MSG_NOTE
, vect_location
,
849 "=== get_loop_niters ===\n");
851 niters
= number_of_latch_executions (loop
);
852 *number_of_iterationsm1
= niters
;
854 /* We want the number of loop header executions which is the number
855 of latch executions plus one.
856 ??? For UINT_MAX latch executions this number overflows to zero
857 for loops like do { n++; } while (n != 0); */
858 if (niters
&& !chrec_contains_undetermined (niters
))
859 niters
= fold_build2 (PLUS_EXPR
, TREE_TYPE (niters
), unshare_expr (niters
),
860 build_int_cst (TREE_TYPE (niters
), 1));
861 *number_of_iterations
= niters
;
863 return get_loop_exit_condition (loop
);
867 /* Function bb_in_loop_p
869 Used as predicate for dfs order traversal of the loop bbs. */
/* DFS-order predicate: nonzero iff BB belongs to the loop passed via DATA.
   NOTE(review): the return-type line and the return statements are
   missing from this corrupted extraction.  */
872 bb_in_loop_p (const_basic_block bb
, const void *data
)
874 const struct loop
*const loop
= (const struct loop
*)data
;
875 if (flow_bb_inside_loop_p (loop
, bb
))
881 /* Function new_loop_vec_info.
883 Create and initialize a new loop_vec_info struct for LOOP, as well as
884 stmt_vec_info structs for all the stmts in LOOP. */
/* Allocate and zero a loop_vec_info for LOOP; create stmt_vec_infos for
   every phi and stmt (or, for bbs of an already-analyzed inner loop,
   re-point the existing ones at the new outer loop_vec_info); collect the
   loop's bbs in dfs order via bb_in_loop_p; initialize all LOOP_VINFO_*
   fields to their defaults.
   NOTE(review): corrupted extraction -- the return-type line, declarations
   of `res'/`bbs', braces and the final `return res;' are missing; original
   line numbers are fused into the text.  Restore from upstream.  */
887 new_loop_vec_info (struct loop
*loop
)
891 gimple_stmt_iterator si
;
892 unsigned int i
, nbbs
;
894 res
= (loop_vec_info
) xcalloc (1, sizeof (struct _loop_vec_info
));
895 LOOP_VINFO_LOOP (res
) = loop
;
897 bbs
= get_loop_body (loop
);
899 /* Create/Update stmt_info for all stmts in the loop. */
900 for (i
= 0; i
< loop
->num_nodes
; i
++)
902 basic_block bb
= bbs
[i
];
904 /* BBs in a nested inner-loop will have been already processed (because
905 we will have called vect_analyze_loop_form for any nested inner-loop).
906 Therefore, for stmts in an inner-loop we just want to update the
907 STMT_VINFO_LOOP_VINFO field of their stmt_info to point to the new
908 loop_info of the outer-loop we are currently considering to vectorize
909 (instead of the loop_info of the inner-loop).
910 For stmts in other BBs we need to create a stmt_info from scratch. */
911 if (bb
->loop_father
!= loop
)
914 gcc_assert (loop
->inner
&& bb
->loop_father
== loop
->inner
);
915 for (si
= gsi_start_phis (bb
); !gsi_end_p (si
); gsi_next (&si
))
917 gimple phi
= gsi_stmt (si
);
918 stmt_vec_info stmt_info
= vinfo_for_stmt (phi
);
919 loop_vec_info inner_loop_vinfo
=
920 STMT_VINFO_LOOP_VINFO (stmt_info
);
921 gcc_assert (loop
->inner
== LOOP_VINFO_LOOP (inner_loop_vinfo
));
922 STMT_VINFO_LOOP_VINFO (stmt_info
) = res
;
924 for (si
= gsi_start_bb (bb
); !gsi_end_p (si
); gsi_next (&si
))
926 gimple stmt
= gsi_stmt (si
);
927 stmt_vec_info stmt_info
= vinfo_for_stmt (stmt
);
928 loop_vec_info inner_loop_vinfo
=
929 STMT_VINFO_LOOP_VINFO (stmt_info
);
930 gcc_assert (loop
->inner
== LOOP_VINFO_LOOP (inner_loop_vinfo
));
931 STMT_VINFO_LOOP_VINFO (stmt_info
) = res
;
936 /* bb in current nest. */
937 for (si
= gsi_start_phis (bb
); !gsi_end_p (si
); gsi_next (&si
))
939 gimple phi
= gsi_stmt (si
);
940 gimple_set_uid (phi
, 0);
941 set_vinfo_for_stmt (phi
, new_stmt_vec_info (phi
, res
, NULL
));
944 for (si
= gsi_start_bb (bb
); !gsi_end_p (si
); gsi_next (&si
))
946 gimple stmt
= gsi_stmt (si
);
947 gimple_set_uid (stmt
, 0);
948 set_vinfo_for_stmt (stmt
, new_stmt_vec_info (stmt
, res
, NULL
));
953 /* CHECKME: We want to visit all BBs before their successors (except for
954 latch blocks, for which this assertion wouldn't hold). In the simple
955 case of the loop forms we allow, a dfs order of the BBs would the same
956 as reversed postorder traversal, so we are safe. */
959 bbs
= XCNEWVEC (basic_block
, loop
->num_nodes
);
960 nbbs
= dfs_enumerate_from (loop
->header
, 0, bb_in_loop_p
,
961 bbs
, loop
->num_nodes
, loop
);
962 gcc_assert (nbbs
== loop
->num_nodes
);
/* Default-initialize every field of the new loop_vec_info.  */
964 LOOP_VINFO_BBS (res
) = bbs
;
965 LOOP_VINFO_NITERSM1 (res
) = NULL
;
966 LOOP_VINFO_NITERS (res
) = NULL
;
967 LOOP_VINFO_NITERS_UNCHANGED (res
) = NULL
;
968 LOOP_VINFO_COST_MODEL_MIN_ITERS (res
) = 0;
969 LOOP_VINFO_COST_MODEL_THRESHOLD (res
) = 0;
970 LOOP_VINFO_VECTORIZABLE_P (res
) = 0;
971 LOOP_VINFO_PEELING_FOR_ALIGNMENT (res
) = 0;
972 LOOP_VINFO_VECT_FACTOR (res
) = 0;
973 LOOP_VINFO_LOOP_NEST (res
).create (3);
974 LOOP_VINFO_DATAREFS (res
).create (10);
975 LOOP_VINFO_DDRS (res
).create (10 * 10);
976 LOOP_VINFO_UNALIGNED_DR (res
) = NULL
;
977 LOOP_VINFO_MAY_MISALIGN_STMTS (res
).create (
978 PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIGNMENT_CHECKS
));
979 LOOP_VINFO_MAY_ALIAS_DDRS (res
).create (
980 PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS
));
981 LOOP_VINFO_GROUPED_STORES (res
).create (10);
982 LOOP_VINFO_REDUCTIONS (res
).create (10);
983 LOOP_VINFO_REDUCTION_CHAINS (res
).create (10);
984 LOOP_VINFO_SLP_INSTANCES (res
).create (10);
985 LOOP_VINFO_SLP_UNROLLING_FACTOR (res
) = 1;
986 LOOP_VINFO_TARGET_COST_DATA (res
) = init_cost (loop
);
987 LOOP_VINFO_PEELING_FOR_GAPS (res
) = false;
988 LOOP_VINFO_PEELING_FOR_NITER (res
) = false;
989 LOOP_VINFO_OPERANDS_SWAPPED (res
) = false;
995 /* Function destroy_loop_vec_info.
997 Free LOOP_VINFO struct, as well as all the stmt_vec_info structs of all the
998 stmts in the loop. */
/* Tear down LOOP_VINFO: when CLEAN_STMTS, free every per-stmt
   stmt_vec_info (first undoing any commutative-operand swaps recorded in
   LOOP_VINFO_OPERANDS_SWAPPED, to restore canonical gimple form), then
   release every vector, hash table and cost-model data the struct owns.
   NOTE(review): corrupted extraction -- declarations of `loop'/`bbs'/
   `nbbs'/`j'/`swapped', braces, the gsi_next advancing the inner loop, and
   the final free of LOOP_VINFO itself are missing; restore from
   upstream.  */
1001 destroy_loop_vec_info (loop_vec_info loop_vinfo
, bool clean_stmts
)
1006 gimple_stmt_iterator si
;
1008 vec
<slp_instance
> slp_instances
;
1009 slp_instance instance
;
1015 loop
= LOOP_VINFO_LOOP (loop_vinfo
);
1017 bbs
= LOOP_VINFO_BBS (loop_vinfo
);
1018 nbbs
= clean_stmts
? loop
->num_nodes
: 0;
1019 swapped
= LOOP_VINFO_OPERANDS_SWAPPED (loop_vinfo
);
1021 for (j
= 0; j
< nbbs
; j
++)
1023 basic_block bb
= bbs
[j
];
1024 for (si
= gsi_start_phis (bb
); !gsi_end_p (si
); gsi_next (&si
))
1025 free_stmt_vec_info (gsi_stmt (si
));
1027 for (si
= gsi_start_bb (bb
); !gsi_end_p (si
); )
1029 gimple stmt
= gsi_stmt (si
);
1031 /* We may have broken canonical form by moving a constant
1032 into RHS1 of a commutative op. Fix such occurrences. */
1033 if (swapped
&& is_gimple_assign (stmt
))
1035 enum tree_code code
= gimple_assign_rhs_code (stmt
);
1037 if ((code
== PLUS_EXPR
1038 || code
== POINTER_PLUS_EXPR
1039 || code
== MULT_EXPR
)
1040 && CONSTANT_CLASS_P (gimple_assign_rhs1 (stmt
)))
1041 swap_ssa_operands (stmt
,
1042 gimple_assign_rhs1_ptr (stmt
),
1043 gimple_assign_rhs2_ptr (stmt
));
1046 /* Free stmt_vec_info. */
1047 free_stmt_vec_info (stmt
);
/* Release everything the loop_vec_info owns.  */
1052 free (LOOP_VINFO_BBS (loop_vinfo
));
1053 vect_destroy_datarefs (loop_vinfo
, NULL
);
1054 free_dependence_relations (LOOP_VINFO_DDRS (loop_vinfo
));
1055 LOOP_VINFO_LOOP_NEST (loop_vinfo
).release ();
1056 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo
).release ();
1057 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo
).release ();
1058 slp_instances
= LOOP_VINFO_SLP_INSTANCES (loop_vinfo
);
1059 FOR_EACH_VEC_ELT (slp_instances
, j
, instance
)
1060 vect_free_slp_instance (instance
);
1062 LOOP_VINFO_SLP_INSTANCES (loop_vinfo
).release ();
1063 LOOP_VINFO_GROUPED_STORES (loop_vinfo
).release ();
1064 LOOP_VINFO_REDUCTIONS (loop_vinfo
).release ();
1065 LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo
).release ();
1067 delete LOOP_VINFO_PEELING_HTAB (loop_vinfo
);
1068 LOOP_VINFO_PEELING_HTAB (loop_vinfo
) = NULL
;
1070 destroy_cost_data (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo
));
1077 /* Function vect_analyze_loop_1.
1079 Apply a set of analyses on LOOP, and create a loop_vec_info struct
1080 for it. The different analyses will record information in the
1081 loop_vec_info struct. This is a subset of the analyses applied in
1082 vect_analyze_loop, to be applied on an inner-loop nested in the loop
1083 that is now considered for (outer-loop) vectorization. */
/* Run the form analysis (vect_analyze_loop_form) on an inner LOOP nested
   in a loop being considered for outer-loop vectorization, returning its
   loop_vec_info.  NOTE(review): corrupted extraction -- the failure check,
   braces and return statements after the dump call are missing; restore
   from upstream.  */
1085 static loop_vec_info
1086 vect_analyze_loop_1 (struct loop
*loop
)
1088 loop_vec_info loop_vinfo
;
1090 if (dump_enabled_p ())
1091 dump_printf_loc (MSG_NOTE
, vect_location
,
1092 "===== analyze_loop_nest_1 =====\n");
1094 /* Check the CFG characteristics of the loop (nesting, entry/exit, etc. */
1096 loop_vinfo
= vect_analyze_loop_form (loop
);
1099 if (dump_enabled_p ())
1100 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1101 "bad inner-loop form.\n");
1109 /* Function vect_analyze_loop_form.
1111 Verify that certain CFG restrictions hold, including:
1112 - the loop has a pre-header
1113 - the loop has a single entry and exit
1114 - the loop exit condition is simple enough, and the number of iterations
1115 can be analyzed (a countable loop). */
/* Returns a freshly created loop_vec_info on success; on each failure
   path below a dump message is emitted and (when analyzing an outer
   loop) the already-built inner_loop_vinfo is destroyed.
   NOTE(review): braces, `return NULL` statements, the inner/outer-loop
   if/else split, and some condition lines are missing from this
   extraction — the comments below annotate only what is visible.  */
1118 vect_analyze_loop_form (struct loop
*loop
)
1120 loop_vec_info loop_vinfo
;
/* NITERS (latch executions + 1) and NITERSM1 (latch executions) as
   trees; NULL until computed by vect_get_loop_niters below.  */
1122 tree number_of_iterations
= NULL
, number_of_iterationsm1
= NULL
;
1123 loop_vec_info inner_loop_vinfo
= NULL
;
1125 if (dump_enabled_p ())
1126 dump_printf_loc (MSG_NOTE
, vect_location
,
1127 "=== vect_analyze_loop_form ===\n");
1129 /* Different restrictions apply when we are considering an inner-most loop,
1130 vs. an outer (nested) loop.
1131 (FORNOW. May want to relax some of these restrictions in the future). */
1135 /* Inner-most loop. We currently require that the number of BBs is
1136 exactly 2 (the header and latch). Vectorizable inner-most loops
/* Inner-most case: reject any loop with control flow (more than
   header + latch) or an empty header.  */
1147 if (loop
->num_nodes
!= 2)
1149 if (dump_enabled_p ())
1150 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1151 "not vectorized: control flow in loop.\n");
1155 if (empty_block_p (loop
->header
))
1157 if (dump_enabled_p ())
1158 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1159 "not vectorized: empty loop.\n");
/* Outer-loop (nested) case starts here.  */
1165 struct loop
*innerloop
= loop
->inner
;
1168 /* Nested loop. We currently require that the loop is doubly-nested,
1169 contains a single inner loop, and the number of BBs is exactly 5.
1170 Vectorizable outer-loops look like this:
1182 The inner-loop has the properties expected of inner-most loops
1183 as described above. */
1185 if ((loop
->inner
)->inner
|| (loop
->inner
)->next
)
1187 if (dump_enabled_p ())
1188 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1189 "not vectorized: multiple nested loops.\n");
1193 /* Analyze the inner-loop. */
1194 inner_loop_vinfo
= vect_analyze_loop_1 (loop
->inner
);
1195 if (!inner_loop_vinfo
)
1197 if (dump_enabled_p ())
1198 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1199 "not vectorized: Bad inner loop.\n");
/* The inner-loop iteration count must not vary across outer-loop
   iterations, otherwise the nest cannot be outer-vectorized.  */
1203 if (!expr_invariant_in_loop_p (loop
,
1204 LOOP_VINFO_NITERS (inner_loop_vinfo
)))
1206 if (dump_enabled_p ())
1207 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1208 "not vectorized: inner-loop count not"
1210 destroy_loop_vec_info (inner_loop_vinfo
, true);
1214 if (loop
->num_nodes
!= 5)
1216 if (dump_enabled_p ())
1217 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1218 "not vectorized: control flow in loop.\n");
1219 destroy_loop_vec_info (inner_loop_vinfo
, true);
/* Locate the edge through which the outer loop enters the inner loop
   header (the predecessor that is not the inner latch).  */
1223 gcc_assert (EDGE_COUNT (innerloop
->header
->preds
) == 2);
1224 entryedge
= EDGE_PRED (innerloop
->header
, 0);
1225 if (EDGE_PRED (innerloop
->header
, 0)->src
== innerloop
->latch
)
1226 entryedge
= EDGE_PRED (innerloop
->header
, 1);
1228 if (entryedge
->src
!= loop
->header
1229 || !single_exit (innerloop
)
1230 || single_exit (innerloop
)->dest
!= EDGE_PRED (loop
->latch
, 0)->src
)
1232 if (dump_enabled_p ())
1233 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1234 "not vectorized: unsupported outerloop form.\n");
1235 destroy_loop_vec_info (inner_loop_vinfo
, true);
1239 if (dump_enabled_p ())
1240 dump_printf_loc (MSG_NOTE
, vect_location
,
1241 "Considering outer-loop vectorization.\n");
/* Checks common to both the inner-most and the outer-loop case.  */
1244 if (!single_exit (loop
)
1245 || EDGE_COUNT (loop
->header
->preds
) != 2)
1247 if (dump_enabled_p ())
1249 if (!single_exit (loop
))
1250 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1251 "not vectorized: multiple exits.\n");
1252 else if (EDGE_COUNT (loop
->header
->preds
) != 2)
1253 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1254 "not vectorized: too many incoming edges.\n");
1256 if (inner_loop_vinfo
)
1257 destroy_loop_vec_info (inner_loop_vinfo
, true);
1261 /* We assume that the loop exit condition is at the end of the loop. i.e,
1262 that the loop is represented as a do-while (with a proper if-guard
1263 before the loop if needed), where the loop header contains all the
1264 executable statements, and the latch is empty. */
1265 if (!empty_block_p (loop
->latch
)
1266 || !gimple_seq_empty_p (phi_nodes (loop
->latch
)))
1268 if (dump_enabled_p ())
1269 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1270 "not vectorized: latch block not empty.\n");
1271 if (inner_loop_vinfo
)
1272 destroy_loop_vec_info (inner_loop_vinfo
, true);
1276 /* Make sure there exists a single-predecessor exit bb: */
1277 if (!single_pred_p (single_exit (loop
)->dest
))
1279 edge e
= single_exit (loop
);
1280 if (!(e
->flags
& EDGE_ABNORMAL
))
/* A normal exit edge can simply be split to create the required
   single-predecessor exit block.  */
1282 split_loop_exit_edge (e
);
1283 if (dump_enabled_p ())
1284 dump_printf (MSG_NOTE
, "split exit edge.\n");
/* Abnormal exit edge: cannot be split, give up.  */
1288 if (dump_enabled_p ())
1289 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1290 "not vectorized: abnormal loop exit edge.\n");
1291 if (inner_loop_vinfo
)
1292 destroy_loop_vec_info (inner_loop_vinfo
, true);
/* Compute NITERS/NITERSM1 and find the controlling exit condition
   statement (loop_cond).  */
1297 loop_cond
= vect_get_loop_niters (loop
, &number_of_iterations
,
1298 &number_of_iterationsm1
);
1301 if (dump_enabled_p ())
1302 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1303 "not vectorized: complicated exit condition.\n");
1304 if (inner_loop_vinfo
)
1305 destroy_loop_vec_info (inner_loop_vinfo
, true);
1309 if (!number_of_iterations
1310 || chrec_contains_undetermined (number_of_iterations
))
1312 if (dump_enabled_p ())
1313 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1314 "not vectorized: number of iterations cannot be "
1316 if (inner_loop_vinfo
)
1317 destroy_loop_vec_info (inner_loop_vinfo
, true);
1321 if (integer_zerop (number_of_iterations
))
1323 if (dump_enabled_p ())
1324 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1325 "not vectorized: number of iterations = 0.\n");
1326 if (inner_loop_vinfo
)
1327 destroy_loop_vec_info (inner_loop_vinfo
, true);
/* All checks passed: build the loop_vec_info and record the
   iteration counts in it.  */
1331 loop_vinfo
= new_loop_vec_info (loop
);
1332 LOOP_VINFO_NITERSM1 (loop_vinfo
) = number_of_iterationsm1
;
1333 LOOP_VINFO_NITERS (loop_vinfo
) = number_of_iterations
;
1334 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo
) = number_of_iterations
;
1336 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo
))
1338 if (dump_enabled_p ())
1340 dump_printf_loc (MSG_NOTE
, vect_location
,
1341 "Symbolic number of iterations is ");
1342 dump_generic_expr (MSG_NOTE
, TDF_DETAILS
, number_of_iterations
);
1343 dump_printf (MSG_NOTE
, "\n");
/* Mark the exit condition so later stages treat it as loop control
   rather than as a computation to vectorize.  */
1347 STMT_VINFO_TYPE (vinfo_for_stmt (loop_cond
)) = loop_exit_ctrl_vec_info_type
;
1349 /* CHECKME: May want to keep it around it in the future. */
1350 if (inner_loop_vinfo
)
1351 destroy_loop_vec_info (inner_loop_vinfo
, false);
1353 gcc_assert (!loop
->aux
);
/* Publish the new loop_vec_info on the loop itself.  */
1354 loop
->aux
= loop_vinfo
;
1359 /* Function vect_analyze_loop_operations.
1361 Scan the loop stmts and make sure they are all vectorizable. */
/* Takes LOOP_VINFO and SLP (whether SLP instances were selected);
   also finalizes the vectorization factor and runs the cost model.
   NOTE(review): braces, `return false`/`return true` statements and
   several guard conditions are missing from this extraction; the
   annotations below cover only the visible statements.  */
1364 vect_analyze_loop_operations (loop_vec_info loop_vinfo
, bool slp
)
1366 struct loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
1367 basic_block
*bbs
= LOOP_VINFO_BBS (loop_vinfo
);
1368 int nbbs
= loop
->num_nodes
;
1369 unsigned int vectorization_factor
= 0;
1371 stmt_vec_info stmt_info
;
1372 bool need_to_vectorize
= false;
1373 int min_profitable_iters
;
1374 int min_scalar_loop_bound
;
1376 bool only_slp_in_loop
= true, ok
;
1377 HOST_WIDE_INT max_niter
;
1378 HOST_WIDE_INT estimated_niter
;
1379 int min_profitable_estimate
;
1381 if (dump_enabled_p ())
1382 dump_printf_loc (MSG_NOTE
, vect_location
,
1383 "=== vect_analyze_loop_operations ===\n");
1385 gcc_assert (LOOP_VINFO_VECT_FACTOR (loop_vinfo
));
1386 vectorization_factor
= LOOP_VINFO_VECT_FACTOR (loop_vinfo
);
1389 /* If all the stmts in the loop can be SLPed, we perform only SLP, and
1390 vectorization factor of the loop is the unrolling factor required by
1391 the SLP instances. If that unrolling factor is 1, we say, that we
1392 perform pure SLP on loop - cross iteration parallelism is not
/* First pass over all BBs: decide whether every relevant stmt is a
   pure-SLP stmt (only_slp_in_loop).  */
1394 for (i
= 0; i
< nbbs
; i
++)
1396 basic_block bb
= bbs
[i
];
1397 for (gimple_stmt_iterator si
= gsi_start_bb (bb
); !gsi_end_p (si
);
1400 gimple stmt
= gsi_stmt (si
);
1401 stmt_vec_info stmt_info
= vinfo_for_stmt (stmt
);
/* For pattern stmts, analyze the pattern's main stmt instead.  */
1402 if (STMT_VINFO_IN_PATTERN_P (stmt_info
)
1403 && STMT_VINFO_RELATED_STMT (stmt_info
))
1405 stmt
= STMT_VINFO_RELATED_STMT (stmt_info
);
1406 stmt_info
= vinfo_for_stmt (stmt
);
1408 if ((STMT_VINFO_RELEVANT_P (stmt_info
)
1409 || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info
)))
1410 && !PURE_SLP_STMT (stmt_info
))
1411 /* STMT needs both SLP and loop-based vectorization. */
1412 only_slp_in_loop
= false;
/* Pick the final VF: pure SLP uses the SLP unrolling factor alone,
   hybrid SLP uses the LCM of both factors.  */
1416 if (only_slp_in_loop
)
1417 vectorization_factor
= LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo
);
1419 vectorization_factor
= least_common_multiple (vectorization_factor
,
1420 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo
));
1422 LOOP_VINFO_VECT_FACTOR (loop_vinfo
) = vectorization_factor
;
1423 if (dump_enabled_p ())
1424 dump_printf_loc (MSG_NOTE
, vect_location
,
1425 "Updating vectorization factor to %d\n",
1426 vectorization_factor
);
/* Second pass: check every phi and every stmt for vectorizability.  */
1429 for (i
= 0; i
< nbbs
; i
++)
1431 basic_block bb
= bbs
[i
];
1433 for (gphi_iterator si
= gsi_start_phis (bb
); !gsi_end_p (si
);
1436 gphi
*phi
= si
.phi ();
1439 stmt_info
= vinfo_for_stmt (phi
);
1440 if (dump_enabled_p ())
1442 dump_printf_loc (MSG_NOTE
, vect_location
, "examining phi: ");
1443 dump_gimple_stmt (MSG_NOTE
, TDF_SLIM
, phi
, 0);
1444 dump_printf (MSG_NOTE
, "\n");
1447 /* Inner-loop loop-closed exit phi in outer-loop vectorization
1448 (i.e., a phi in the tail of the outer-loop). */
1449 if (! is_loop_header_bb_p (bb
))
1451 /* FORNOW: we currently don't support the case that these phis
1452 are not used in the outerloop (unless it is double reduction,
1453 i.e., this phi is vect_reduction_def), cause this case
1454 requires to actually do something here. */
1455 if ((!STMT_VINFO_RELEVANT_P (stmt_info
)
1456 || STMT_VINFO_LIVE_P (stmt_info
))
1457 && STMT_VINFO_DEF_TYPE (stmt_info
)
1458 != vect_double_reduction_def
)
1460 if (dump_enabled_p ())
1461 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1462 "Unsupported loop-closed phi in "
1467 /* If PHI is used in the outer loop, we check that its operand
1468 is defined in the inner loop. */
1469 if (STMT_VINFO_RELEVANT_P (stmt_info
))
1474 if (gimple_phi_num_args (phi
) != 1)
1477 phi_op
= PHI_ARG_DEF (phi
, 0);
1478 if (TREE_CODE (phi_op
) != SSA_NAME
)
1481 op_def_stmt
= SSA_NAME_DEF_STMT (phi_op
);
1482 if (gimple_nop_p (op_def_stmt
)
1483 || !flow_bb_inside_loop_p (loop
, gimple_bb (op_def_stmt
))
1484 || !vinfo_for_stmt (op_def_stmt
))
/* The defining stmt must be marked as used in the outer loop
   (directly or via a reduction).  */
1487 if (STMT_VINFO_RELEVANT (vinfo_for_stmt (op_def_stmt
))
1488 != vect_used_in_outer
1489 && STMT_VINFO_RELEVANT (vinfo_for_stmt (op_def_stmt
))
1490 != vect_used_in_outer_by_reduction
)
1497 gcc_assert (stmt_info
);
1499 if (STMT_VINFO_LIVE_P (stmt_info
))
1501 /* FORNOW: not yet supported. */
1502 if (dump_enabled_p ())
1503 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1504 "not vectorized: value used after loop.\n");
1508 if (STMT_VINFO_RELEVANT (stmt_info
) == vect_used_in_scope
1509 && STMT_VINFO_DEF_TYPE (stmt_info
) != vect_induction_def
)
1511 /* A scalar-dependence cycle that we don't support. */
1512 if (dump_enabled_p ())
1513 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1514 "not vectorized: scalar dependence cycle.\n");
1518 if (STMT_VINFO_RELEVANT_P (stmt_info
))
1520 need_to_vectorize
= true;
1521 if (STMT_VINFO_DEF_TYPE (stmt_info
) == vect_induction_def
)
1522 ok
= vectorizable_induction (phi
, NULL
, NULL
);
/* Failure path for a relevant phi that could not be handled
   (guard condition lost in extraction — presumably `!ok`).  */
1527 if (dump_enabled_p ())
1529 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1530 "not vectorized: relevant phi not "
1532 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, phi
, 0);
1533 dump_printf (MSG_MISSED_OPTIMIZATION
, "\n");
/* Analyze every non-clobber stmt in the block.  */
1539 for (gimple_stmt_iterator si
= gsi_start_bb (bb
); !gsi_end_p (si
);
1542 gimple stmt
= gsi_stmt (si
);
1543 if (!gimple_clobber_p (stmt
)
1544 && !vect_analyze_stmt (stmt
, &need_to_vectorize
, NULL
))
1549 /* All operations in the loop are either irrelevant (deal with loop
1550 control, or dead), or only used outside the loop and can be moved
1551 out of the loop (e.g. invariants, inductions). The loop can be
1552 optimized away by scalar optimizations. We're better off not
1553 touching this loop. */
1554 if (!need_to_vectorize
)
1556 if (dump_enabled_p ())
1557 dump_printf_loc (MSG_NOTE
, vect_location
,
1558 "All the computation can be taken out of the loop.\n");
1559 if (dump_enabled_p ())
1560 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1561 "not vectorized: redundant loop. no profit to "
1566 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo
) && dump_enabled_p ())
1567 dump_printf_loc (MSG_NOTE
, vect_location
,
1568 "vectorization_factor = %d, niters = "
1569 HOST_WIDE_INT_PRINT_DEC
"\n", vectorization_factor
,
1570 LOOP_VINFO_INT_NITERS (loop_vinfo
));
/* Reject loops whose (known or maximum) iteration count is below
   the vectorization factor.  */
1572 if ((LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo
)
1573 && (LOOP_VINFO_INT_NITERS (loop_vinfo
) < vectorization_factor
))
1574 || ((max_niter
= max_stmt_executions_int (loop
)) != -1
1575 && (unsigned HOST_WIDE_INT
) max_niter
< vectorization_factor
))
1577 if (dump_enabled_p ())
1578 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1579 "not vectorized: iteration count too small.\n");
1580 if (dump_enabled_p ())
1581 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1582 "not vectorized: iteration count smaller than "
1583 "vectorization factor.\n");
1587 /* Analyze cost. Decide if worth while to vectorize. */
1589 /* Once VF is set, SLP costs should be updated since the number of created
1590 vector stmts depends on VF. */
1591 vect_update_slp_costs_according_to_vf (loop_vinfo
);
1593 vect_estimate_min_profitable_iters (loop_vinfo
, &min_profitable_iters
,
1594 &min_profitable_estimate
);
1595 LOOP_VINFO_COST_MODEL_MIN_ITERS (loop_vinfo
) = min_profitable_iters
;
/* Negative means the vector version is never profitable.  */
1597 if (min_profitable_iters
< 0)
1599 if (dump_enabled_p ())
1600 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1601 "not vectorized: vectorization not profitable.\n");
1602 if (dump_enabled_p ())
1603 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1604 "not vectorized: vector version will never be "
/* User-specified lower bound (in scalar iterations) derived from
   --param min-vect-loop-bound.  */
1609 min_scalar_loop_bound
= ((PARAM_VALUE (PARAM_MIN_VECT_LOOP_BOUND
)
1610 * vectorization_factor
) - 1);
1613 /* Use the cost model only if it is more conservative than user specified
/* TH becomes the runtime profitability threshold: the larger of the
   user bound and the cost model's estimate.  */
1616 th
= (unsigned) min_scalar_loop_bound
;
1617 if (min_profitable_iters
1618 && (!min_scalar_loop_bound
1619 || min_profitable_iters
> min_scalar_loop_bound
))
1620 th
= (unsigned) min_profitable_iters
;
1622 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo
) = th
;
1624 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo
)
1625 && LOOP_VINFO_INT_NITERS (loop_vinfo
) <= th
)
1627 if (dump_enabled_p ())
1628 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1629 "not vectorized: vectorization not profitable.\n");
1630 if (dump_enabled_p ())
1631 dump_printf_loc (MSG_NOTE
, vect_location
,
1632 "not vectorized: iteration count smaller than user "
1633 "specified loop bound parameter or minimum profitable "
1634 "iterations (whichever is more conservative).\n");
/* Also reject when the *estimated* iteration count falls below the
   threshold / profitability estimate.  */
1638 if ((estimated_niter
= estimated_stmt_executions_int (loop
)) != -1
1639 && ((unsigned HOST_WIDE_INT
) estimated_niter
1640 <= MAX (th
, (unsigned)min_profitable_estimate
)))
1642 if (dump_enabled_p ())
1643 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1644 "not vectorized: estimated iteration count too "
1646 if (dump_enabled_p ())
1647 dump_printf_loc (MSG_NOTE
, vect_location
,
1648 "not vectorized: estimated iteration count smaller "
1649 "than specified loop bound parameter or minimum "
1650 "profitable iterations (whichever is more "
1651 "conservative).\n");
1659 /* Function vect_analyze_loop_2.
1661 Apply a set of analyses on LOOP, and create a loop_vec_info struct
1662 for it. The different analyses will record information in the
1663 loop_vec_info struct. */
/* Runs the full analysis pipeline on an already-created LOOP_VINFO:
   data refs -> scalar cycles -> pattern recog -> accesses -> relevance
   marking -> dependences -> VF -> alignment -> alias pruning ->
   alignment enhancement -> SLP -> operation analysis -> epilogue
   decision.  Each `ok = ...` call is followed by a failure dump whose
   `if (!ok)` guard, braces and `return false` were lost in this
   extraction (NOTE(review): confirm against upstream).  */
1665 vect_analyze_loop_2 (loop_vec_info loop_vinfo
)
1667 bool ok
, slp
= false;
1668 int max_vf
= MAX_VECTORIZATION_FACTOR
;
1671 unsigned int n_stmts
= 0;
1673 /* Find all data references in the loop (which correspond to vdefs/vuses)
1674 and analyze their evolution in the loop. Also adjust the minimal
1675 vectorization factor according to the loads and stores.
1677 FORNOW: Handle only simple, array references, which
1678 alignment can be forced, and aligned pointer-references. */
1680 ok
= vect_analyze_data_refs (loop_vinfo
, NULL
, &min_vf
, &n_stmts
);
1683 if (dump_enabled_p ())
1684 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1685 "bad data references.\n");
1689 /* Classify all cross-iteration scalar data-flow cycles.
1690 Cross-iteration cycles caused by virtual phis are analyzed separately. */
1692 vect_analyze_scalar_cycles (loop_vinfo
);
1694 vect_pattern_recog (loop_vinfo
, NULL
);
1696 /* Analyze the access patterns of the data-refs in the loop (consecutive,
1697 complex, etc.). FORNOW: Only handle consecutive access pattern. */
1699 ok
= vect_analyze_data_ref_accesses (loop_vinfo
, NULL
);
1702 if (dump_enabled_p ())
1703 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1704 "bad data access.\n");
1708 /* Data-flow analysis to detect stmts that do not need to be vectorized. */
1710 ok
= vect_mark_stmts_to_be_vectorized (loop_vinfo
);
1713 if (dump_enabled_p ())
1714 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1715 "unexpected pattern.\n");
1719 /* Analyze data dependences between the data-refs in the loop
1720 and adjust the maximum vectorization factor according to
1722 FORNOW: fail at the first data dependence that we encounter. */
1724 ok
= vect_analyze_data_ref_dependences (loop_vinfo
, &max_vf
);
1728 if (dump_enabled_p ())
1729 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1730 "bad data dependence.\n");
1734 ok
= vect_determine_vectorization_factor (loop_vinfo
);
1737 if (dump_enabled_p ())
1738 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1739 "can't determine vectorization factor.\n");
/* Dependences may cap the VF below the one just chosen.  */
1742 if (max_vf
< LOOP_VINFO_VECT_FACTOR (loop_vinfo
))
1744 if (dump_enabled_p ())
1745 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1746 "bad data dependence.\n");
1750 /* Analyze the alignment of the data-refs in the loop.
1751 Fail if a data reference is found that cannot be vectorized. */
1753 ok
= vect_analyze_data_refs_alignment (loop_vinfo
, NULL
);
1756 if (dump_enabled_p ())
1757 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1758 "bad data alignment.\n");
1762 /* Prune the list of ddrs to be tested at run-time by versioning for alias.
1763 It is important to call pruning after vect_analyze_data_ref_accesses,
1764 since we use grouping information gathered by interleaving analysis. */
1765 ok
= vect_prune_runtime_alias_test_list (loop_vinfo
);
1768 if (dump_enabled_p ())
1769 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1770 "number of versioning for alias "
1771 "run-time tests exceeds %d "
1772 "(--param vect-max-version-for-alias-checks)\n",
1773 PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS
));
1777 /* This pass will decide on using loop versioning and/or loop peeling in
1778 order to enhance the alignment of data references in the loop. */
1780 ok
= vect_enhance_data_refs_alignment (loop_vinfo
);
1783 if (dump_enabled_p ())
1784 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1785 "bad data alignment.\n");
1789 /* Check the SLP opportunities in the loop, analyze and build SLP trees. */
1790 ok
= vect_analyze_slp (loop_vinfo
, NULL
, n_stmts
);
1793 /* Decide which possible SLP instances to SLP. */
1794 slp
= vect_make_slp_decision (loop_vinfo
);
1796 /* Find stmts that need to be both vectorized and SLPed. */
1797 vect_detect_hybrid_slp (loop_vinfo
);
1802 /* Scan all the operations in the loop and make sure they are
1805 ok
= vect_analyze_loop_operations (loop_vinfo
, slp
);
1808 if (dump_enabled_p ())
1809 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1810 "bad operation or unsupported loop bound.\n");
1814 /* Decide whether we need to create an epilogue loop to handle
1815 remaining scalar iterations. */
/* Round the profitability threshold up to a multiple of the VF.  */
1816 th
= ((LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo
) + 1)
1817 / LOOP_VINFO_VECT_FACTOR (loop_vinfo
))
1818 * LOOP_VINFO_VECT_FACTOR (loop_vinfo
);
1820 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo
)
1821 && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo
) > 0)
/* Known NITERS with prologue peeling: an epilogue is needed unless
   the remaining count is a multiple of the VF.  */
1823 if (ctz_hwi (LOOP_VINFO_INT_NITERS (loop_vinfo
)
1824 - LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo
))
1825 < exact_log2 (LOOP_VINFO_VECT_FACTOR (loop_vinfo
)))
1826 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo
) = true;
1828 else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo
)
1829 || (tree_ctz (LOOP_VINFO_NITERS (loop_vinfo
))
1830 < (unsigned)exact_log2 (LOOP_VINFO_VECT_FACTOR (loop_vinfo
))
1831 /* In case of versioning, check if the maximum number of
1832 iterations is greater than th. If they are identical,
1833 the epilogue is unnecessary. */
1834 && ((!LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo
)
1835 && !LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo
))
1836 || (unsigned HOST_WIDE_INT
)max_stmt_executions_int
1837 (LOOP_VINFO_LOOP (loop_vinfo
)) > th
)))
1838 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo
) = true;
1840 /* If an epilogue loop is required make sure we can create one. */
1841 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo
)
1842 || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo
))
1844 if (dump_enabled_p ())
1845 dump_printf_loc (MSG_NOTE
, vect_location
, "epilog loop required\n");
1846 if (!vect_can_advance_ivs_p (loop_vinfo
)
1847 || !slpeel_can_duplicate_loop_p (LOOP_VINFO_LOOP (loop_vinfo
),
1848 single_exit (LOOP_VINFO_LOOP
1851 if (dump_enabled_p ())
1852 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1853 "not vectorized: can't create required "
1862 /* Function vect_analyze_loop.
1864 Apply a set of analyses on LOOP, and create a loop_vec_info struct
1865 for it. The different analyses will record information in the
1866 loop_vec_info struct. */
/* Top-level per-loop driver: tries vect_analyze_loop_form +
   vect_analyze_loop_2, retrying with successively smaller target
   vector sizes on failure.  NOTE(review): the enclosing retry loop,
   braces and return statements were lost in this extraction.  */
1868 vect_analyze_loop (struct loop
*loop
)
1870 loop_vec_info loop_vinfo
;
/* Bitmask of vector sizes the target supports; bits are cleared as
   each size is tried.  */
1871 unsigned int vector_sizes
;
1873 /* Autodetect first vector size we try. */
1874 current_vector_size
= 0;
1875 vector_sizes
= targetm
.vectorize
.autovectorize_vector_sizes ();
1877 if (dump_enabled_p ())
1878 dump_printf_loc (MSG_NOTE
, vect_location
,
1879 "===== analyze_loop_nest =====\n");
/* Skip inner loops whose enclosing loop was already chosen for
   outer-loop vectorization.  */
1881 if (loop_outer (loop
)
1882 && loop_vec_info_for_loop (loop_outer (loop
))
1883 && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop
))))
1885 if (dump_enabled_p ())
1886 dump_printf_loc (MSG_NOTE
, vect_location
,
1887 "outer-loop already vectorized.\n");
1893 /* Check the CFG characteristics of the loop (nesting, entry/exit). */
1894 loop_vinfo
= vect_analyze_loop_form (loop
);
1897 if (dump_enabled_p ())
1898 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1899 "bad loop form.\n");
/* Success path: mark the loop_vinfo as vectorizable.  */
1903 if (vect_analyze_loop_2 (loop_vinfo
))
1905 LOOP_VINFO_VECTORIZABLE_P (loop_vinfo
) = 1;
/* Failure path: discard this attempt and try another vector size.  */
1910 destroy_loop_vec_info (loop_vinfo
, true);
1912 vector_sizes
&= ~current_vector_size
;
1913 if (vector_sizes
== 0
1914 || current_vector_size
== 0)
1917 /* Try the next biggest vector size. */
1918 current_vector_size
= 1 << floor_log2 (vector_sizes
);
1919 if (dump_enabled_p ())
1920 dump_printf_loc (MSG_NOTE
, vect_location
,
1921 "***** Re-trying analysis with "
1922 "vector size %d\n", current_vector_size
);
1927 /* Function reduction_code_for_scalar_code
1930 CODE - tree_code of a reduction operations.
1933 REDUC_CODE - the corresponding tree-code to be used to reduce the
1934 vector of partial results into a single scalar result, or ERROR_MARK
1935 if the operation is a supported reduction operation, but does not have
1938 Return FALSE if CODE currently cannot be vectorized as reduction. */
/* NOTE(review): the `switch (code)` skeleton — case labels
   (presumably MAX_EXPR, MIN_EXPR, PLUS_EXPR, etc.), the `return true`
   statements and the default `return false` — is missing from this
   extraction; only the *reduc_code assignments survive.  Confirm
   against upstream before relying on the mapping.  */
1941 reduction_code_for_scalar_code (enum tree_code code
,
1942 enum tree_code
*reduc_code
)
1947 *reduc_code
= REDUC_MAX_EXPR
;
1951 *reduc_code
= REDUC_MIN_EXPR
;
1955 *reduc_code
= REDUC_PLUS_EXPR
;
/* Supported reduction with no single-instruction epilogue code.  */
1963 *reduc_code
= ERROR_MARK
;
1972 /* Error reporting helper for vect_is_simple_reduction below. GIMPLE statement
1973 STMT is printed with a message MSG. */
1976 report_vect_op (int msg_type
, gimple stmt
, const char *msg
)
1978 dump_printf_loc (msg_type
, vect_location
, "%s", msg
);
1979 dump_gimple_stmt (msg_type
, TDF_SLIM
, stmt
, 0);
1980 dump_printf (msg_type
, "\n");
1984 /* Detect SLP reduction of the form:
1994 PHI is the reduction phi node (#a1 = phi <a5, a0> above)
1995 FIRST_STMT is the first reduction stmt in the chain
1996 (a2 = operation (a1)).
1998 Return TRUE if a reduction chain was detected. */
/* Walks the single-use chain starting at PHI's result, linking each
   stmt into a GROUP_* reduction chain, then canonicalizes operand
   order so the reduction operand is rhs2, and records the chain in
   LOOP_VINFO_REDUCTION_CHAINS.  NOTE(review): braces, the enclosing
   `while` loops, `return false`/`return true` statements, and several
   increments (e.g. of nloop_uses/size) are missing from this
   extraction; annotations below cover only the visible statements.  */
2001 vect_is_slp_reduction (loop_vec_info loop_info
, gimple phi
, gimple first_stmt
)
2003 struct loop
*loop
= (gimple_bb (phi
))->loop_father
;
2004 struct loop
*vect_loop
= LOOP_VINFO_LOOP (loop_info
);
2005 enum tree_code code
;
2006 gimple current_stmt
= NULL
, loop_use_stmt
= NULL
, first
, next_stmt
;
2007 stmt_vec_info use_stmt_info
, current_stmt_info
;
2009 imm_use_iterator imm_iter
;
2010 use_operand_p use_p
;
2011 int nloop_uses
, size
= 0, n_out_of_loop_uses
;
/* Reduction chains are only detected in the loop being vectorized.  */
2014 if (loop
!= vect_loop
)
2017 lhs
= PHI_RESULT (phi
);
/* Every stmt in the chain must use this same operation code.  */
2018 code
= gimple_assign_rhs_code (first_stmt
);
2022 n_out_of_loop_uses
= 0;
/* Examine all uses of the current chain value LHS.  */
2023 FOR_EACH_IMM_USE_FAST (use_p
, imm_iter
, lhs
)
2025 gimple use_stmt
= USE_STMT (use_p
);
2026 if (is_gimple_debug (use_stmt
))
2029 /* Check if we got back to the reduction phi. */
2030 if (use_stmt
== phi
)
2032 loop_use_stmt
= use_stmt
;
2037 if (flow_bb_inside_loop_p (loop
, gimple_bb (use_stmt
)))
2039 loop_use_stmt
= use_stmt
;
2043 n_out_of_loop_uses
++;
2045 /* There are can be either a single use in the loop or two uses in
2047 if (nloop_uses
> 1 || (n_out_of_loop_uses
&& nloop_uses
))
2054 /* We reached a statement with no loop uses. */
2055 if (nloop_uses
== 0)
2058 /* This is a loop exit phi, and we haven't reached the reduction phi. */
2059 if (gimple_code (loop_use_stmt
) == GIMPLE_PHI
)
/* The next chain element must be an assignment with the same code,
   inside the loop.  */
2062 if (!is_gimple_assign (loop_use_stmt
)
2063 || code
!= gimple_assign_rhs_code (loop_use_stmt
)
2064 || !flow_bb_inside_loop_p (loop
, gimple_bb (loop_use_stmt
)))
2067 /* Insert USE_STMT into reduction chain. */
2068 use_stmt_info
= vinfo_for_stmt (loop_use_stmt
);
2071 current_stmt_info
= vinfo_for_stmt (current_stmt
);
2072 GROUP_NEXT_ELEMENT (current_stmt_info
) = loop_use_stmt
;
2073 GROUP_FIRST_ELEMENT (use_stmt_info
)
2074 = GROUP_FIRST_ELEMENT (current_stmt_info
);
/* First element of the chain: it is its own group head.  */
2077 GROUP_FIRST_ELEMENT (use_stmt_info
) = loop_use_stmt
;
/* Advance along the chain.  */
2079 lhs
= gimple_assign_lhs (loop_use_stmt
);
2080 current_stmt
= loop_use_stmt
;
/* A valid chain must close back at PHI and contain >= 2 stmts.  */
2084 if (!found
|| loop_use_stmt
!= phi
|| size
< 2)
2087 /* Swap the operands, if needed, to make the reduction operand be the second
2089 lhs
= PHI_RESULT (phi
);
2090 next_stmt
= GROUP_FIRST_ELEMENT (vinfo_for_stmt (current_stmt
));
/* Case 1: reduction operand already in rhs2 — only validate rhs1.  */
2093 if (gimple_assign_rhs2 (next_stmt
) == lhs
)
2095 tree op
= gimple_assign_rhs1 (next_stmt
);
2096 gimple def_stmt
= NULL
;
2098 if (TREE_CODE (op
) == SSA_NAME
)
2099 def_stmt
= SSA_NAME_DEF_STMT (op
);
2101 /* Check that the other def is either defined in the loop
2102 ("vect_internal_def"), or it's an induction (defined by a
2103 loop-header phi-node). */
2105 && gimple_bb (def_stmt
)
2106 && flow_bb_inside_loop_p (loop
, gimple_bb (def_stmt
))
2107 && (is_gimple_assign (def_stmt
)
2108 || is_gimple_call (def_stmt
)
2109 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt
))
2110 == vect_induction_def
2111 || (gimple_code (def_stmt
) == GIMPLE_PHI
2112 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt
))
2113 == vect_internal_def
2114 && !is_loop_header_bb_p (gimple_bb (def_stmt
)))))
2116 lhs
= gimple_assign_lhs (next_stmt
);
2117 next_stmt
= GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt
));
/* Case 2: reduction operand in rhs1 — validate rhs2, then swap
   the operands so the reduction value ends up in rhs2.  */
2125 tree op
= gimple_assign_rhs2 (next_stmt
);
2126 gimple def_stmt
= NULL
;
2128 if (TREE_CODE (op
) == SSA_NAME
)
2129 def_stmt
= SSA_NAME_DEF_STMT (op
);
2131 /* Check that the other def is either defined in the loop
2132 ("vect_internal_def"), or it's an induction (defined by a
2133 loop-header phi-node). */
2135 && gimple_bb (def_stmt
)
2136 && flow_bb_inside_loop_p (loop
, gimple_bb (def_stmt
))
2137 && (is_gimple_assign (def_stmt
)
2138 || is_gimple_call (def_stmt
)
2139 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt
))
2140 == vect_induction_def
2141 || (gimple_code (def_stmt
) == GIMPLE_PHI
2142 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt
))
2143 == vect_internal_def
2144 && !is_loop_header_bb_p (gimple_bb (def_stmt
)))))
2146 if (dump_enabled_p ())
2148 dump_printf_loc (MSG_NOTE
, vect_location
, "swapping oprnds: ");
2149 dump_gimple_stmt (MSG_NOTE
, TDF_SLIM
, next_stmt
, 0);
2150 dump_printf (MSG_NOTE
, "\n");
2153 swap_ssa_operands (next_stmt
,
2154 gimple_assign_rhs1_ptr (next_stmt
),
2155 gimple_assign_rhs2_ptr (next_stmt
));
2156 update_stmt (next_stmt
);
/* Record that operands were swapped so the transform phase can
   undo/account for it if vectorization fails.  */
2158 if (CONSTANT_CLASS_P (gimple_assign_rhs1 (next_stmt
)))
2159 LOOP_VINFO_OPERANDS_SWAPPED (loop_info
) = true;
2165 lhs
= gimple_assign_lhs (next_stmt
);
2166 next_stmt
= GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt
));
2169 /* Save the chain for further analysis in SLP detection. */
2170 first
= GROUP_FIRST_ELEMENT (vinfo_for_stmt (current_stmt
));
2171 LOOP_VINFO_REDUCTION_CHAINS (loop_info
).safe_push (first
);
2172 GROUP_SIZE (vinfo_for_stmt (first
)) = size
;
2178 /* Function vect_is_simple_reduction_1
2180 (1) Detect a cross-iteration def-use cycle that represents a simple
2181 reduction computation. We look for the following pattern:
2186 a2 = operation (a3, a1)
2193 a2 = operation (a3, a1)
2196 1. operation is commutative and associative and it is safe to
2197 change the order of the computation (if CHECK_REDUCTION is true)
2198 2. no uses for a2 in the loop (a2 is used out of the loop)
2199 3. no uses of a1 in the loop besides the reduction operation
2200 4. no uses of a1 outside the loop.
2202 Conditions 1,4 are tested here.
2203 Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.
2205 (2) Detect a cross-iteration def-use cycle in nested loops, i.e.,
2206 nested cycles, if CHECK_REDUCTION is false.
2208 (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double
2212 inner loop (def of a3)
2215 If MODIFY is true it tries also to rework the code in-place to enable
2216 detection of more reduction patterns. For the time being we rewrite
2217 "res -= RHS" into "rhs += -RHS" when it seems worthwhile.
2221 vect_is_simple_reduction_1 (loop_vec_info loop_info
, gimple phi
,
2222 bool check_reduction
, bool *double_reduc
,
2225 struct loop
*loop
= (gimple_bb (phi
))->loop_father
;
2226 struct loop
*vect_loop
= LOOP_VINFO_LOOP (loop_info
);
2227 edge latch_e
= loop_latch_edge (loop
);
2228 tree loop_arg
= PHI_ARG_DEF_FROM_EDGE (phi
, latch_e
);
2229 gimple def_stmt
, def1
= NULL
, def2
= NULL
;
2230 enum tree_code orig_code
, code
;
2231 tree op1
, op2
, op3
= NULL_TREE
, op4
= NULL_TREE
;
2235 imm_use_iterator imm_iter
;
2236 use_operand_p use_p
;
2239 *double_reduc
= false;
2241 /* If CHECK_REDUCTION is true, we assume inner-most loop vectorization,
2242 otherwise, we assume outer loop vectorization. */
2243 gcc_assert ((check_reduction
&& loop
== vect_loop
)
2244 || (!check_reduction
&& flow_loop_nested_p (vect_loop
, loop
)));
2246 name
= PHI_RESULT (phi
);
2247 /* ??? If there are no uses of the PHI result the inner loop reduction
2248 won't be detected as possibly double-reduction by vectorizable_reduction
2249 because that tries to walk the PHI arg from the preheader edge which
2250 can be constant. See PR60382. */
2251 if (has_zero_uses (name
))
2254 FOR_EACH_IMM_USE_FAST (use_p
, imm_iter
, name
)
2256 gimple use_stmt
= USE_STMT (use_p
);
2257 if (is_gimple_debug (use_stmt
))
2260 if (!flow_bb_inside_loop_p (loop
, gimple_bb (use_stmt
)))
2262 if (dump_enabled_p ())
2263 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2264 "intermediate value used outside loop.\n");
2272 if (dump_enabled_p ())
2273 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2274 "reduction used in loop.\n");
2279 if (TREE_CODE (loop_arg
) != SSA_NAME
)
2281 if (dump_enabled_p ())
2283 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2284 "reduction: not ssa_name: ");
2285 dump_generic_expr (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, loop_arg
);
2286 dump_printf (MSG_MISSED_OPTIMIZATION
, "\n");
2291 def_stmt
= SSA_NAME_DEF_STMT (loop_arg
);
2294 if (dump_enabled_p ())
2295 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2296 "reduction: no def_stmt.\n");
2300 if (!is_gimple_assign (def_stmt
) && gimple_code (def_stmt
) != GIMPLE_PHI
)
2302 if (dump_enabled_p ())
2304 dump_gimple_stmt (MSG_NOTE
, TDF_SLIM
, def_stmt
, 0);
2305 dump_printf (MSG_NOTE
, "\n");
2310 if (is_gimple_assign (def_stmt
))
2312 name
= gimple_assign_lhs (def_stmt
);
2317 name
= PHI_RESULT (def_stmt
);
2322 FOR_EACH_IMM_USE_FAST (use_p
, imm_iter
, name
)
2324 gimple use_stmt
= USE_STMT (use_p
);
2325 if (is_gimple_debug (use_stmt
))
2327 if (flow_bb_inside_loop_p (loop
, gimple_bb (use_stmt
)))
2331 if (dump_enabled_p ())
2332 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2333 "reduction used in loop.\n");
2338 /* If DEF_STMT is a phi node itself, we expect it to have a single argument
2339 defined in the inner loop. */
2342 op1
= PHI_ARG_DEF (def_stmt
, 0);
2344 if (gimple_phi_num_args (def_stmt
) != 1
2345 || TREE_CODE (op1
) != SSA_NAME
)
2347 if (dump_enabled_p ())
2348 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2349 "unsupported phi node definition.\n");
2354 def1
= SSA_NAME_DEF_STMT (op1
);
2355 if (gimple_bb (def1
)
2356 && flow_bb_inside_loop_p (loop
, gimple_bb (def_stmt
))
2358 && flow_bb_inside_loop_p (loop
->inner
, gimple_bb (def1
))
2359 && is_gimple_assign (def1
))
2361 if (dump_enabled_p ())
2362 report_vect_op (MSG_NOTE
, def_stmt
,
2363 "detected double reduction: ");
2365 *double_reduc
= true;
2372 code
= orig_code
= gimple_assign_rhs_code (def_stmt
);
2374 /* We can handle "res -= x[i]", which is non-associative by
2375 simply rewriting this into "res += -x[i]". Avoid changing
2376 gimple instruction for the first simple tests and only do this
2377 if we're allowed to change code at all. */
2378 if (code
== MINUS_EXPR
2380 && (op1
= gimple_assign_rhs1 (def_stmt
))
2381 && TREE_CODE (op1
) == SSA_NAME
2382 && SSA_NAME_DEF_STMT (op1
) == phi
)
2386 && (!commutative_tree_code (code
) || !associative_tree_code (code
)))
2388 if (dump_enabled_p ())
2389 report_vect_op (MSG_MISSED_OPTIMIZATION
, def_stmt
,
2390 "reduction: not commutative/associative: ");
2394 if (get_gimple_rhs_class (code
) != GIMPLE_BINARY_RHS
)
2396 if (code
!= COND_EXPR
)
2398 if (dump_enabled_p ())
2399 report_vect_op (MSG_MISSED_OPTIMIZATION
, def_stmt
,
2400 "reduction: not binary operation: ");
2405 op3
= gimple_assign_rhs1 (def_stmt
);
2406 if (COMPARISON_CLASS_P (op3
))
2408 op4
= TREE_OPERAND (op3
, 1);
2409 op3
= TREE_OPERAND (op3
, 0);
2412 op1
= gimple_assign_rhs2 (def_stmt
);
2413 op2
= gimple_assign_rhs3 (def_stmt
);
2415 if (TREE_CODE (op1
) != SSA_NAME
&& TREE_CODE (op2
) != SSA_NAME
)
2417 if (dump_enabled_p ())
2418 report_vect_op (MSG_MISSED_OPTIMIZATION
, def_stmt
,
2419 "reduction: uses not ssa_names: ");
2426 op1
= gimple_assign_rhs1 (def_stmt
);
2427 op2
= gimple_assign_rhs2 (def_stmt
);
2429 if (TREE_CODE (op1
) != SSA_NAME
&& TREE_CODE (op2
) != SSA_NAME
)
2431 if (dump_enabled_p ())
2432 report_vect_op (MSG_MISSED_OPTIMIZATION
, def_stmt
,
2433 "reduction: uses not ssa_names: ");
2439 type
= TREE_TYPE (gimple_assign_lhs (def_stmt
));
2440 if ((TREE_CODE (op1
) == SSA_NAME
2441 && !types_compatible_p (type
,TREE_TYPE (op1
)))
2442 || (TREE_CODE (op2
) == SSA_NAME
2443 && !types_compatible_p (type
, TREE_TYPE (op2
)))
2444 || (op3
&& TREE_CODE (op3
) == SSA_NAME
2445 && !types_compatible_p (type
, TREE_TYPE (op3
)))
2446 || (op4
&& TREE_CODE (op4
) == SSA_NAME
2447 && !types_compatible_p (type
, TREE_TYPE (op4
))))
2449 if (dump_enabled_p ())
2451 dump_printf_loc (MSG_NOTE
, vect_location
,
2452 "reduction: multiple types: operation type: ");
2453 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, type
);
2454 dump_printf (MSG_NOTE
, ", operands types: ");
2455 dump_generic_expr (MSG_NOTE
, TDF_SLIM
,
2457 dump_printf (MSG_NOTE
, ",");
2458 dump_generic_expr (MSG_NOTE
, TDF_SLIM
,
2462 dump_printf (MSG_NOTE
, ",");
2463 dump_generic_expr (MSG_NOTE
, TDF_SLIM
,
2469 dump_printf (MSG_NOTE
, ",");
2470 dump_generic_expr (MSG_NOTE
, TDF_SLIM
,
2473 dump_printf (MSG_NOTE
, "\n");
2479 /* Check that it's ok to change the order of the computation.
2480 Generally, when vectorizing a reduction we change the order of the
2481 computation. This may change the behavior of the program in some
2482 cases, so we need to check that this is ok. One exception is when
2483 vectorizing an outer-loop: the inner-loop is executed sequentially,
2484 and therefore vectorizing reductions in the inner-loop during
2485 outer-loop vectorization is safe. */
2487 /* CHECKME: check for !flag_finite_math_only too? */
2488 if (SCALAR_FLOAT_TYPE_P (type
) && !flag_associative_math
2491 /* Changing the order of operations changes the semantics. */
2492 if (dump_enabled_p ())
2493 report_vect_op (MSG_MISSED_OPTIMIZATION
, def_stmt
,
2494 "reduction: unsafe fp math optimization: ");
2497 else if (INTEGRAL_TYPE_P (type
) && TYPE_OVERFLOW_TRAPS (type
)
2500 /* Changing the order of operations changes the semantics. */
2501 if (dump_enabled_p ())
2502 report_vect_op (MSG_MISSED_OPTIMIZATION
, def_stmt
,
2503 "reduction: unsafe int math optimization: ");
2506 else if (SAT_FIXED_POINT_TYPE_P (type
) && check_reduction
)
2508 /* Changing the order of operations changes the semantics. */
2509 if (dump_enabled_p ())
2510 report_vect_op (MSG_MISSED_OPTIMIZATION
, def_stmt
,
2511 "reduction: unsafe fixed-point math optimization: ");
2515 /* If we detected "res -= x[i]" earlier, rewrite it into
2516 "res += -x[i]" now. If this turns out to be useless reassoc
2517 will clean it up again. */
2518 if (orig_code
== MINUS_EXPR
)
2520 tree rhs
= gimple_assign_rhs2 (def_stmt
);
2521 tree negrhs
= make_ssa_name (TREE_TYPE (rhs
));
2522 gimple negate_stmt
= gimple_build_assign (negrhs
, NEGATE_EXPR
, rhs
);
2523 gimple_stmt_iterator gsi
= gsi_for_stmt (def_stmt
);
2524 set_vinfo_for_stmt (negate_stmt
, new_stmt_vec_info (negate_stmt
,
2526 gsi_insert_before (&gsi
, negate_stmt
, GSI_NEW_STMT
);
2527 gimple_assign_set_rhs2 (def_stmt
, negrhs
);
2528 gimple_assign_set_rhs_code (def_stmt
, PLUS_EXPR
);
2529 update_stmt (def_stmt
);
2532 /* Reduction is safe. We're dealing with one of the following:
2533 1) integer arithmetic and no trapv
2534 2) floating point arithmetic, and special flags permit this optimization
2535 3) nested cycle (i.e., outer loop vectorization). */
2536 if (TREE_CODE (op1
) == SSA_NAME
)
2537 def1
= SSA_NAME_DEF_STMT (op1
);
2539 if (TREE_CODE (op2
) == SSA_NAME
)
2540 def2
= SSA_NAME_DEF_STMT (op2
);
2542 if (code
!= COND_EXPR
2543 && ((!def1
|| gimple_nop_p (def1
)) && (!def2
|| gimple_nop_p (def2
))))
2545 if (dump_enabled_p ())
2546 report_vect_op (MSG_NOTE
, def_stmt
, "reduction: no defs for operands: ");
2550 /* Check that one def is the reduction def, defined by PHI,
2551 the other def is either defined in the loop ("vect_internal_def"),
2552 or it's an induction (defined by a loop-header phi-node). */
2554 if (def2
&& def2
== phi
2555 && (code
== COND_EXPR
2556 || !def1
|| gimple_nop_p (def1
)
2557 || !flow_bb_inside_loop_p (loop
, gimple_bb (def1
))
2558 || (def1
&& flow_bb_inside_loop_p (loop
, gimple_bb (def1
))
2559 && (is_gimple_assign (def1
)
2560 || is_gimple_call (def1
)
2561 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1
))
2562 == vect_induction_def
2563 || (gimple_code (def1
) == GIMPLE_PHI
2564 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1
))
2565 == vect_internal_def
2566 && !is_loop_header_bb_p (gimple_bb (def1
)))))))
2568 if (dump_enabled_p ())
2569 report_vect_op (MSG_NOTE
, def_stmt
, "detected reduction: ");
2573 if (def1
&& def1
== phi
2574 && (code
== COND_EXPR
2575 || !def2
|| gimple_nop_p (def2
)
2576 || !flow_bb_inside_loop_p (loop
, gimple_bb (def2
))
2577 || (def2
&& flow_bb_inside_loop_p (loop
, gimple_bb (def2
))
2578 && (is_gimple_assign (def2
)
2579 || is_gimple_call (def2
)
2580 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2
))
2581 == vect_induction_def
2582 || (gimple_code (def2
) == GIMPLE_PHI
2583 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2
))
2584 == vect_internal_def
2585 && !is_loop_header_bb_p (gimple_bb (def2
)))))))
2587 if (check_reduction
)
2589 /* Swap operands (just for simplicity - so that the rest of the code
2590 can assume that the reduction variable is always the last (second)
2592 if (dump_enabled_p ())
2593 report_vect_op (MSG_NOTE
, def_stmt
,
2594 "detected reduction: need to swap operands: ");
2596 swap_ssa_operands (def_stmt
, gimple_assign_rhs1_ptr (def_stmt
),
2597 gimple_assign_rhs2_ptr (def_stmt
));
2599 if (CONSTANT_CLASS_P (gimple_assign_rhs1 (def_stmt
)))
2600 LOOP_VINFO_OPERANDS_SWAPPED (loop_info
) = true;
2604 if (dump_enabled_p ())
2605 report_vect_op (MSG_NOTE
, def_stmt
, "detected reduction: ");
2611 /* Try to find SLP reduction chain. */
2612 if (check_reduction
&& vect_is_slp_reduction (loop_info
, phi
, def_stmt
))
2614 if (dump_enabled_p ())
2615 report_vect_op (MSG_NOTE
, def_stmt
,
2616 "reduction: detected reduction chain: ");
2621 if (dump_enabled_p ())
2622 report_vect_op (MSG_MISSED_OPTIMIZATION
, def_stmt
,
2623 "reduction: unknown pattern: ");
/* Wrapper around vect_is_simple_reduction_1, that won't modify code
   in-place.  Arguments as there.  */

gimple
vect_is_simple_reduction (loop_vec_info loop_info, gimple phi,
                          bool check_reduction, bool *double_reduc)
{
  /* Last argument (MODIFY) is false: analysis only, the IL is left
     untouched even when a rewrite would expose more reductions.  */
  return vect_is_simple_reduction_1 (loop_info, phi, check_reduction,
                                     double_reduc, false);
}
/* Wrapper around vect_is_simple_reduction_1, which will modify code
   in-place if it enables detection of more reductions.  Arguments
   as there.  */

gimple
vect_force_simple_reduction (loop_vec_info loop_info, gimple phi,
                             bool check_reduction, bool *double_reduc)
{
  /* Last argument (MODIFY) is true: the helper is allowed to rewrite
     the IL (e.g. turn "res -= x[i]" into "res += -x[i]") when that
     lets it recognize a reduction.  */
  return vect_is_simple_reduction_1 (loop_info, phi, check_reduction,
                                     double_reduc, true);
}
/* Calculate the cost of one scalar iteration of the loop described by
   LOOP_VINFO, recording the per-statement costs into SCALAR_COST_VEC.
   Returns the accumulated cost.  */

static int
vect_get_single_scalar_iteration_cost (loop_vec_info loop_vinfo,
				       stmt_vector_for_cost *scalar_cost_vec)
{
  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
  int nbbs = loop->num_nodes, factor, scalar_single_iter_cost = 0;
  int innerloop_iters, i;

  /* Count statements in scalar loop.  Using this as scalar cost for a single
     iteration for now.

     TODO: Add outer loop support.

     TODO: Consider assigning different costs to different scalar
     statements.  */

  /* FORNOW.  */
  innerloop_iters = 1;
  if (loop->inner)
    innerloop_iters = 50; /* FIXME */

  for (i = 0; i < nbbs; i++)
    {
      gimple_stmt_iterator si;
      basic_block bb = bbs[i];

      /* Statements in an inner loop execute innerloop_iters times per
	 outer iteration; weight them accordingly.  */
      if (bb->loop_father == loop->inner)
        factor = innerloop_iters;
      else
        factor = 1;

      for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
        {
          gimple stmt = gsi_stmt (si);
          stmt_vec_info stmt_info = vinfo_for_stmt (stmt);

	  /* Only assignments and calls contribute to the scalar cost.  */
          if (!is_gimple_assign (stmt) && !is_gimple_call (stmt))
            continue;

          /* Skip stmts that are not vectorized inside the loop.  */
          if (stmt_info
              && !STMT_VINFO_RELEVANT_P (stmt_info)
              && (!STMT_VINFO_LIVE_P (stmt_info)
                  || !VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
              && !STMT_VINFO_IN_PATTERN_P (stmt_info))
            continue;

	  /* Classify the statement as a load, store or generic scalar
	     statement for the target cost hooks.  */
          vect_cost_for_stmt kind;
          if (STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt)))
            {
              if (DR_IS_READ (STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt))))
               kind = scalar_load;
             else
               kind = scalar_store;
            }
          else
            kind = scalar_stmt;

          scalar_single_iter_cost
	    += record_stmt_cost (scalar_cost_vec, factor, kind,
				 NULL, 0, vect_prologue);
        }
    }
  return scalar_single_iter_cost;
}
2719 /* Calculate cost of peeling the loop PEEL_ITERS_PROLOGUE times. */
2721 vect_get_known_peeling_cost (loop_vec_info loop_vinfo
, int peel_iters_prologue
,
2722 int *peel_iters_epilogue
,
2723 stmt_vector_for_cost
*scalar_cost_vec
,
2724 stmt_vector_for_cost
*prologue_cost_vec
,
2725 stmt_vector_for_cost
*epilogue_cost_vec
)
2728 int vf
= LOOP_VINFO_VECT_FACTOR (loop_vinfo
);
2730 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo
))
2732 *peel_iters_epilogue
= vf
/2;
2733 if (dump_enabled_p ())
2734 dump_printf_loc (MSG_NOTE
, vect_location
,
2735 "cost model: epilogue peel iters set to vf/2 "
2736 "because loop iterations are unknown .\n");
2738 /* If peeled iterations are known but number of scalar loop
2739 iterations are unknown, count a taken branch per peeled loop. */
2740 retval
= record_stmt_cost (prologue_cost_vec
, 1, cond_branch_taken
,
2741 NULL
, 0, vect_prologue
);
2742 retval
= record_stmt_cost (prologue_cost_vec
, 1, cond_branch_taken
,
2743 NULL
, 0, vect_epilogue
);
2747 int niters
= LOOP_VINFO_INT_NITERS (loop_vinfo
);
2748 peel_iters_prologue
= niters
< peel_iters_prologue
?
2749 niters
: peel_iters_prologue
;
2750 *peel_iters_epilogue
= (niters
- peel_iters_prologue
) % vf
;
2751 /* If we need to peel for gaps, but no peeling is required, we have to
2752 peel VF iterations. */
2753 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo
) && !*peel_iters_epilogue
)
2754 *peel_iters_epilogue
= vf
;
2757 stmt_info_for_cost
*si
;
2759 if (peel_iters_prologue
)
2760 FOR_EACH_VEC_ELT (*scalar_cost_vec
, j
, si
)
2761 retval
+= record_stmt_cost (prologue_cost_vec
,
2762 si
->count
* peel_iters_prologue
,
2763 si
->kind
, NULL
, si
->misalign
,
2765 if (*peel_iters_epilogue
)
2766 FOR_EACH_VEC_ELT (*scalar_cost_vec
, j
, si
)
2767 retval
+= record_stmt_cost (epilogue_cost_vec
,
2768 si
->count
* *peel_iters_epilogue
,
2769 si
->kind
, NULL
, si
->misalign
,
2775 /* Function vect_estimate_min_profitable_iters
2777 Return the number of iterations required for the vector version of the
2778 loop to be profitable relative to the cost of the scalar version of the
2782 vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo
,
2783 int *ret_min_profitable_niters
,
2784 int *ret_min_profitable_estimate
)
2786 int min_profitable_iters
;
2787 int min_profitable_estimate
;
2788 int peel_iters_prologue
;
2789 int peel_iters_epilogue
;
2790 unsigned vec_inside_cost
= 0;
2791 int vec_outside_cost
= 0;
2792 unsigned vec_prologue_cost
= 0;
2793 unsigned vec_epilogue_cost
= 0;
2794 int scalar_single_iter_cost
= 0;
2795 int scalar_outside_cost
= 0;
2796 int vf
= LOOP_VINFO_VECT_FACTOR (loop_vinfo
);
2797 int npeel
= LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo
);
2798 void *target_cost_data
= LOOP_VINFO_TARGET_COST_DATA (loop_vinfo
);
2800 /* Cost model disabled. */
2801 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo
)))
2803 dump_printf_loc (MSG_NOTE
, vect_location
, "cost model disabled.\n");
2804 *ret_min_profitable_niters
= 0;
2805 *ret_min_profitable_estimate
= 0;
2809 /* Requires loop versioning tests to handle misalignment. */
2810 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo
))
2812 /* FIXME: Make cost depend on complexity of individual check. */
2813 unsigned len
= LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo
).length ();
2814 (void) add_stmt_cost (target_cost_data
, len
, vector_stmt
, NULL
, 0,
2816 dump_printf (MSG_NOTE
,
2817 "cost model: Adding cost of checks for loop "
2818 "versioning to treat misalignment.\n");
2821 /* Requires loop versioning with alias checks. */
2822 if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo
))
2824 /* FIXME: Make cost depend on complexity of individual check. */
2825 unsigned len
= LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo
).length ();
2826 (void) add_stmt_cost (target_cost_data
, len
, vector_stmt
, NULL
, 0,
2828 dump_printf (MSG_NOTE
,
2829 "cost model: Adding cost of checks for loop "
2830 "versioning aliasing.\n");
2833 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo
)
2834 || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo
))
2835 (void) add_stmt_cost (target_cost_data
, 1, cond_branch_taken
, NULL
, 0,
2838 /* Count statements in scalar loop. Using this as scalar cost for a single
2841 TODO: Add outer loop support.
2843 TODO: Consider assigning different costs to different scalar
2846 auto_vec
<stmt_info_for_cost
> scalar_cost_vec
;
2847 scalar_single_iter_cost
2848 = vect_get_single_scalar_iteration_cost (loop_vinfo
, &scalar_cost_vec
);
2850 /* Add additional cost for the peeled instructions in prologue and epilogue
2853 FORNOW: If we don't know the value of peel_iters for prologue or epilogue
2854 at compile-time - we assume it's vf/2 (the worst would be vf-1).
2856 TODO: Build an expression that represents peel_iters for prologue and
2857 epilogue to be used in a run-time test. */
2861 peel_iters_prologue
= vf
/2;
2862 dump_printf (MSG_NOTE
, "cost model: "
2863 "prologue peel iters set to vf/2.\n");
2865 /* If peeling for alignment is unknown, loop bound of main loop becomes
2867 peel_iters_epilogue
= vf
/2;
2868 dump_printf (MSG_NOTE
, "cost model: "
2869 "epilogue peel iters set to vf/2 because "
2870 "peeling for alignment is unknown.\n");
2872 /* If peeled iterations are unknown, count a taken branch and a not taken
2873 branch per peeled loop. Even if scalar loop iterations are known,
2874 vector iterations are not known since peeled prologue iterations are
2875 not known. Hence guards remain the same. */
2876 (void) add_stmt_cost (target_cost_data
, 1, cond_branch_taken
,
2877 NULL
, 0, vect_prologue
);
2878 (void) add_stmt_cost (target_cost_data
, 1, cond_branch_not_taken
,
2879 NULL
, 0, vect_prologue
);
2880 (void) add_stmt_cost (target_cost_data
, 1, cond_branch_taken
,
2881 NULL
, 0, vect_epilogue
);
2882 (void) add_stmt_cost (target_cost_data
, 1, cond_branch_not_taken
,
2883 NULL
, 0, vect_epilogue
);
2884 stmt_info_for_cost
*si
;
2886 FOR_EACH_VEC_ELT (scalar_cost_vec
, j
, si
)
2888 struct _stmt_vec_info
*stmt_info
2889 = si
->stmt
? vinfo_for_stmt (si
->stmt
) : NULL
;
2890 (void) add_stmt_cost (target_cost_data
,
2891 si
->count
* peel_iters_prologue
,
2892 si
->kind
, stmt_info
, si
->misalign
,
2894 (void) add_stmt_cost (target_cost_data
,
2895 si
->count
* peel_iters_epilogue
,
2896 si
->kind
, stmt_info
, si
->misalign
,
2902 stmt_vector_for_cost prologue_cost_vec
, epilogue_cost_vec
;
2903 stmt_info_for_cost
*si
;
2905 void *data
= LOOP_VINFO_TARGET_COST_DATA (loop_vinfo
);
2907 prologue_cost_vec
.create (2);
2908 epilogue_cost_vec
.create (2);
2909 peel_iters_prologue
= npeel
;
2911 (void) vect_get_known_peeling_cost (loop_vinfo
, peel_iters_prologue
,
2912 &peel_iters_epilogue
,
2915 &epilogue_cost_vec
);
2917 FOR_EACH_VEC_ELT (prologue_cost_vec
, j
, si
)
2919 struct _stmt_vec_info
*stmt_info
2920 = si
->stmt
? vinfo_for_stmt (si
->stmt
) : NULL
;
2921 (void) add_stmt_cost (data
, si
->count
, si
->kind
, stmt_info
,
2922 si
->misalign
, vect_prologue
);
2925 FOR_EACH_VEC_ELT (epilogue_cost_vec
, j
, si
)
2927 struct _stmt_vec_info
*stmt_info
2928 = si
->stmt
? vinfo_for_stmt (si
->stmt
) : NULL
;
2929 (void) add_stmt_cost (data
, si
->count
, si
->kind
, stmt_info
,
2930 si
->misalign
, vect_epilogue
);
2933 prologue_cost_vec
.release ();
2934 epilogue_cost_vec
.release ();
2937 /* FORNOW: The scalar outside cost is incremented in one of the
2940 1. The vectorizer checks for alignment and aliasing and generates
2941 a condition that allows dynamic vectorization. A cost model
2942 check is ANDED with the versioning condition. Hence scalar code
2943 path now has the added cost of the versioning check.
2945 if (cost > th & versioning_check)
2948 Hence run-time scalar is incremented by not-taken branch cost.
2950 2. The vectorizer then checks if a prologue is required. If the
2951 cost model check was not done before during versioning, it has to
2952 be done before the prologue check.
2955 prologue = scalar_iters
2960 if (prologue == num_iters)
2963 Hence the run-time scalar cost is incremented by a taken branch,
2964 plus a not-taken branch, plus a taken branch cost.
2966 3. The vectorizer then checks if an epilogue is required. If the
2967 cost model check was not done before during prologue check, it
2968 has to be done with the epilogue check.
2974 if (prologue == num_iters)
2977 if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
2980 Hence the run-time scalar cost should be incremented by 2 taken
2983 TODO: The back end may reorder the BBS's differently and reverse
2984 conditions/branch directions. Change the estimates below to
2985 something more reasonable. */
2987 /* If the number of iterations is known and we do not do versioning, we can
2988 decide whether to vectorize at compile time. Hence the scalar version
2989 do not carry cost model guard costs. */
2990 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo
)
2991 || LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo
)
2992 || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo
))
2994 /* Cost model check occurs at versioning. */
2995 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo
)
2996 || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo
))
2997 scalar_outside_cost
+= vect_get_stmt_cost (cond_branch_not_taken
);
3000 /* Cost model check occurs at prologue generation. */
3001 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo
) < 0)
3002 scalar_outside_cost
+= 2 * vect_get_stmt_cost (cond_branch_taken
)
3003 + vect_get_stmt_cost (cond_branch_not_taken
);
3004 /* Cost model check occurs at epilogue generation. */
3006 scalar_outside_cost
+= 2 * vect_get_stmt_cost (cond_branch_taken
);
3010 /* Complete the target-specific cost calculations. */
3011 finish_cost (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo
), &vec_prologue_cost
,
3012 &vec_inside_cost
, &vec_epilogue_cost
);
3014 vec_outside_cost
= (int)(vec_prologue_cost
+ vec_epilogue_cost
);
3016 if (dump_enabled_p ())
3018 dump_printf_loc (MSG_NOTE
, vect_location
, "Cost model analysis: \n");
3019 dump_printf (MSG_NOTE
, " Vector inside of loop cost: %d\n",
3021 dump_printf (MSG_NOTE
, " Vector prologue cost: %d\n",
3023 dump_printf (MSG_NOTE
, " Vector epilogue cost: %d\n",
3025 dump_printf (MSG_NOTE
, " Scalar iteration cost: %d\n",
3026 scalar_single_iter_cost
);
3027 dump_printf (MSG_NOTE
, " Scalar outside cost: %d\n",
3028 scalar_outside_cost
);
3029 dump_printf (MSG_NOTE
, " Vector outside cost: %d\n",
3031 dump_printf (MSG_NOTE
, " prologue iterations: %d\n",
3032 peel_iters_prologue
);
3033 dump_printf (MSG_NOTE
, " epilogue iterations: %d\n",
3034 peel_iters_epilogue
);
3037 /* Calculate number of iterations required to make the vector version
3038 profitable, relative to the loop bodies only. The following condition
3040 SIC * niters + SOC > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC
3042 SIC = scalar iteration cost, VIC = vector iteration cost,
3043 VOC = vector outside cost, VF = vectorization factor,
3044 PL_ITERS = prologue iterations, EP_ITERS= epilogue iterations
3045 SOC = scalar outside cost for run time cost model check. */
3047 if ((scalar_single_iter_cost
* vf
) > (int) vec_inside_cost
)
3049 if (vec_outside_cost
<= 0)
3050 min_profitable_iters
= 1;
3053 min_profitable_iters
= ((vec_outside_cost
- scalar_outside_cost
) * vf
3054 - vec_inside_cost
* peel_iters_prologue
3055 - vec_inside_cost
* peel_iters_epilogue
)
3056 / ((scalar_single_iter_cost
* vf
)
3059 if ((scalar_single_iter_cost
* vf
* min_profitable_iters
)
3060 <= (((int) vec_inside_cost
* min_profitable_iters
)
3061 + (((int) vec_outside_cost
- scalar_outside_cost
) * vf
)))
3062 min_profitable_iters
++;
3065 /* vector version will never be profitable. */
3068 if (LOOP_VINFO_LOOP (loop_vinfo
)->force_vectorize
)
3069 warning_at (vect_location
, OPT_Wopenmp_simd
, "vectorization "
3070 "did not happen for a simd loop");
3072 if (dump_enabled_p ())
3073 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3074 "cost model: the vector iteration cost = %d "
3075 "divided by the scalar iteration cost = %d "
3076 "is greater or equal to the vectorization factor = %d"
3078 vec_inside_cost
, scalar_single_iter_cost
, vf
);
3079 *ret_min_profitable_niters
= -1;
3080 *ret_min_profitable_estimate
= -1;
3084 dump_printf (MSG_NOTE
,
3085 " Calculated minimum iters for profitability: %d\n",
3086 min_profitable_iters
);
3088 min_profitable_iters
=
3089 min_profitable_iters
< vf
? vf
: min_profitable_iters
;
3091 /* Because the condition we create is:
3092 if (niters <= min_profitable_iters)
3093 then skip the vectorized loop. */
3094 min_profitable_iters
--;
3096 if (dump_enabled_p ())
3097 dump_printf_loc (MSG_NOTE
, vect_location
,
3098 " Runtime profitability threshold = %d\n",
3099 min_profitable_iters
);
3101 *ret_min_profitable_niters
= min_profitable_iters
;
3103 /* Calculate number of iterations required to make the vector version
3104 profitable, relative to the loop bodies only.
3106 Non-vectorized variant is SIC * niters and it must win over vector
3107 variant on the expected loop trip count. The following condition must hold true:
3108 SIC * niters > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC + SOC */
3110 if (vec_outside_cost
<= 0)
3111 min_profitable_estimate
= 1;
3114 min_profitable_estimate
= ((vec_outside_cost
+ scalar_outside_cost
) * vf
3115 - vec_inside_cost
* peel_iters_prologue
3116 - vec_inside_cost
* peel_iters_epilogue
)
3117 / ((scalar_single_iter_cost
* vf
)
3120 min_profitable_estimate
--;
3121 min_profitable_estimate
= MAX (min_profitable_estimate
, min_profitable_iters
);
3122 if (dump_enabled_p ())
3123 dump_printf_loc (MSG_NOTE
, vect_location
,
3124 " Static estimate profitability threshold = %d\n",
3125 min_profitable_iters
);
3127 *ret_min_profitable_estimate
= min_profitable_estimate
;
3130 /* Writes into SEL a mask for a vec_perm, equivalent to a vec_shr by OFFSET
3131 vector elements (not bits) for a vector of mode MODE. */
3133 calc_vec_perm_mask_for_shift (enum machine_mode mode
, unsigned int offset
,
3136 unsigned int i
, nelt
= GET_MODE_NUNITS (mode
);
3138 for (i
= 0; i
< nelt
; i
++)
3139 sel
[i
] = (i
+ offset
) & (2*nelt
- 1);
/* Checks whether the target supports whole-vector shifts for vectors of mode
   MODE.  This is the case if _either_ the platform handles vec_shr_optab, _or_
   it supports vec_perm_const with masks for all necessary shift amounts.  */

static bool
have_whole_vector_shift (enum machine_mode mode)
{
  /* A native whole-vector shift is the easy case.  */
  if (optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
    return true;

  /* Otherwise we need constant-mask permutes at all.  */
  if (direct_optab_handler (vec_perm_const_optab, mode) == CODE_FOR_nothing)
    return false;

  unsigned int i, nelt = GET_MODE_NUNITS (mode);
  unsigned char *sel = XALLOCAVEC (unsigned char, nelt);

  /* Reduction epilogues shift by every power of two down to one
     element; every such mask must be supported.  */
  for (i = nelt/2; i >= 1; i/=2)
    {
      calc_vec_perm_mask_for_shift (mode, i, sel);
      if (!can_vec_perm_p (mode, false, sel))
	return false;
    }
  return true;
}
/* Return the reduction operand (with index REDUC_INDEX) of STMT.
   STMT is assumed to be a GIMPLE assignment; the operand is fetched
   according to the RHS class of its code.  */

static tree
get_reduction_op (gimple stmt, int reduc_index)
{
  switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
    {
    case GIMPLE_SINGLE_RHS:
      /* A single-RHS reduction is expected to be a ternary tree
	 expression; index directly into its operands.  */
      gcc_assert (TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt))
		  == ternary_op);
      return TREE_OPERAND (gimple_assign_rhs1 (stmt), reduc_index);
    case GIMPLE_UNARY_RHS:
      return gimple_assign_rhs1 (stmt);
    case GIMPLE_BINARY_RHS:
      return (reduc_index
	      ? gimple_assign_rhs2 (stmt) : gimple_assign_rhs1 (stmt));
    case GIMPLE_TERNARY_RHS:
      /* gimple_op operand 0 is the LHS, hence the +1.  */
      return gimple_op (stmt, reduc_index + 1);
    default:
      gcc_unreachable ();
    }
}
/* TODO: Close dependency between vect_model_*_cost and vectorizable_*
   functions.  Design better to avoid maintenance issues.  */

/* Function vect_model_reduction_cost.

   Models cost for a reduction operation, including the vector ops
   generated within the strip-mine loop, the initial definition before
   the loop, and the epilogue code that must be generated.

   STMT_INFO describes the reduction statement, REDUC_CODE is the
   epilogue reduction code (ERROR_MARK when none is available),
   NCOPIES the number of vector copies, REDUC_INDEX the operand index
   of the reduction operand.  Returns false when the data type is not
   vectorizable.  */

static bool
vect_model_reduction_cost (stmt_vec_info stmt_info, enum tree_code reduc_code,
			   int ncopies, int reduc_index)
{
  int prologue_cost = 0, epilogue_cost = 0;
  enum tree_code code;
  optab optab;
  tree vectype;
  gimple stmt, orig_stmt;
  tree reduction_op;
  machine_mode mode;
  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
  struct loop *loop = NULL;
  void *target_cost_data;

  /* Loop vs. basic-block vectorization keep their cost data in
     different places.  */
  if (loop_vinfo)
    {
      loop = LOOP_VINFO_LOOP (loop_vinfo);
      target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
    }
  else
    target_cost_data = BB_VINFO_TARGET_COST_DATA (STMT_VINFO_BB_VINFO (stmt_info));

  /* Cost of reduction op inside loop.  */
  unsigned inside_cost = add_stmt_cost (target_cost_data, ncopies, vector_stmt,
					stmt_info, 0, vect_body);
  stmt = STMT_VINFO_STMT (stmt_info);

  reduction_op = get_reduction_op (stmt, reduc_index);

  vectype = get_vectype_for_scalar_type (TREE_TYPE (reduction_op));
  if (!vectype)
    {
      if (dump_enabled_p ())
        {
	  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			   "unsupported data-type ");
          dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
			     TREE_TYPE (reduction_op));
          dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
        }
      return false;
   }

  mode = TYPE_MODE (vectype);
  /* For pattern statements the original statement carries the
     reduction code.  */
  orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);

  if (!orig_stmt)
    orig_stmt = STMT_VINFO_STMT (stmt_info);

  code = gimple_assign_rhs_code (orig_stmt);

  /* Add in cost for initial definition.  */
  prologue_cost += add_stmt_cost (target_cost_data, 1, scalar_to_vec,
				  stmt_info, 0, vect_prologue);

  /* Determine cost of epilogue code.

     We have a reduction operator that will reduce the vector in one statement.
     Also requires scalar extract.  */

  if (!loop || !nested_in_vect_loop_p (loop, orig_stmt))
    {
      if (reduc_code != ERROR_MARK)
	{
	  epilogue_cost += add_stmt_cost (target_cost_data, 1, vector_stmt,
					  stmt_info, 0, vect_epilogue);
	  epilogue_cost += add_stmt_cost (target_cost_data, 1, vec_to_scalar,
					  stmt_info, 0, vect_epilogue);
	}
      else
	{
	  int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
	  tree bitsize =
	    TYPE_SIZE (TREE_TYPE (gimple_assign_lhs (orig_stmt)));
	  int element_bitsize = tree_to_uhwi (bitsize);
	  int nelements = vec_size_in_bits / element_bitsize;

	  optab = optab_for_tree_code (code, vectype, optab_default);

	  /* We have a whole vector shift available.  */
	  if (VECTOR_MODE_P (mode)
	      && optab_handler (optab, mode) != CODE_FOR_nothing
	      && have_whole_vector_shift (mode))
	    {
	      /* Final reduction via vector shifts and the reduction operator.
		 Also requires scalar extract.  */
	      epilogue_cost += add_stmt_cost (target_cost_data,
					      exact_log2 (nelements) * 2,
					      vector_stmt, stmt_info, 0,
					      vect_epilogue);
	      epilogue_cost += add_stmt_cost (target_cost_data, 1,
					      vec_to_scalar, stmt_info, 0,
					      vect_epilogue);
	    }
	  else
	    /* Use extracts and reduction op for final reduction.  For N
	       elements, we have N extracts and N-1 reduction ops.  */
	    epilogue_cost += add_stmt_cost (target_cost_data,
					    nelements + nelements - 1,
					    vector_stmt, stmt_info, 0,
					    vect_epilogue);
	}
    }

  if (dump_enabled_p ())
    dump_printf (MSG_NOTE,
		 "vect_model_reduction_cost: inside_cost = %d, "
		 "prologue_cost = %d, epilogue_cost = %d .\n", inside_cost,
		 prologue_cost, epilogue_cost);

  return true;
}
/* Function vect_model_induction_cost.

   Models cost for induction operations: NCOPIES vector statements in
   the loop body plus the prologue definitions of the vector initial
   value and step.  STMT_INFO describes the induction phi.  */

static void
vect_model_induction_cost (stmt_vec_info stmt_info, int ncopies)
{
  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
  void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
  unsigned inside_cost, prologue_cost;

  /* loop cost for vec_loop.  */
  inside_cost = add_stmt_cost (target_cost_data, ncopies, vector_stmt,
			       stmt_info, 0, vect_body);

  /* prologue cost for vec_init and vec_step.  */
  prologue_cost = add_stmt_cost (target_cost_data, 2, scalar_to_vec,
				 stmt_info, 0, vect_prologue);

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
		     "vect_model_induction_cost: inside_cost = %d, "
		     "prologue_cost = %d .\n", inside_cost, prologue_cost);
}
3339 /* Function get_initial_def_for_induction
3342 STMT - a stmt that performs an induction operation in the loop.
3343 IV_PHI - the initial value of the induction variable
3346 Return a vector variable, initialized with the first VF values of
3347 the induction variable. E.g., for an iv with IV_PHI='X' and
3348 evolution S, for a vector of 4 units, we want to return:
3349 [X, X + S, X + 2*S, X + 3*S]. */
3352 get_initial_def_for_induction (gimple iv_phi
)
3354 stmt_vec_info stmt_vinfo
= vinfo_for_stmt (iv_phi
);
3355 loop_vec_info loop_vinfo
= STMT_VINFO_LOOP_VINFO (stmt_vinfo
);
3356 struct loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
3359 edge pe
= loop_preheader_edge (loop
);
3360 struct loop
*iv_loop
;
3362 tree new_vec
, vec_init
, vec_step
, t
;
3365 gimple init_stmt
, new_stmt
;
3366 gphi
*induction_phi
;
3367 tree induc_def
, vec_def
, vec_dest
;
3368 tree init_expr
, step_expr
;
3369 int vf
= LOOP_VINFO_VECT_FACTOR (loop_vinfo
);
3373 stmt_vec_info phi_info
= vinfo_for_stmt (iv_phi
);
3374 bool nested_in_vect_loop
= false;
3375 gimple_seq stmts
= NULL
;
3376 imm_use_iterator imm_iter
;
3377 use_operand_p use_p
;
3381 gimple_stmt_iterator si
;
3382 basic_block bb
= gimple_bb (iv_phi
);
3386 /* Is phi in an inner-loop, while vectorizing an enclosing outer-loop? */
3387 if (nested_in_vect_loop_p (loop
, iv_phi
))
3389 nested_in_vect_loop
= true;
3390 iv_loop
= loop
->inner
;
3394 gcc_assert (iv_loop
== (gimple_bb (iv_phi
))->loop_father
);
3396 latch_e
= loop_latch_edge (iv_loop
);
3397 loop_arg
= PHI_ARG_DEF_FROM_EDGE (iv_phi
, latch_e
);
3399 step_expr
= STMT_VINFO_LOOP_PHI_EVOLUTION_PART (phi_info
);
3400 gcc_assert (step_expr
!= NULL_TREE
);
3402 pe
= loop_preheader_edge (iv_loop
);
3403 init_expr
= PHI_ARG_DEF_FROM_EDGE (iv_phi
,
3404 loop_preheader_edge (iv_loop
));
3406 vectype
= get_vectype_for_scalar_type (TREE_TYPE (init_expr
));
3407 resvectype
= get_vectype_for_scalar_type (TREE_TYPE (PHI_RESULT (iv_phi
)));
3408 gcc_assert (vectype
);
3409 nunits
= TYPE_VECTOR_SUBPARTS (vectype
);
3410 ncopies
= vf
/ nunits
;
3412 gcc_assert (phi_info
);
3413 gcc_assert (ncopies
>= 1);
3415 /* Convert the step to the desired type. */
3416 step_expr
= force_gimple_operand (fold_convert (TREE_TYPE (vectype
),
3418 &stmts
, true, NULL_TREE
);
3421 new_bb
= gsi_insert_seq_on_edge_immediate (pe
, stmts
);
3422 gcc_assert (!new_bb
);
3425 /* Find the first insertion point in the BB. */
3426 si
= gsi_after_labels (bb
);
3428 /* Create the vector that holds the initial_value of the induction. */
3429 if (nested_in_vect_loop
)
3431 /* iv_loop is nested in the loop to be vectorized. init_expr had already
3432 been created during vectorization of previous stmts. We obtain it
3433 from the STMT_VINFO_VEC_STMT of the defining stmt. */
3434 vec_init
= vect_get_vec_def_for_operand (init_expr
, iv_phi
, NULL
);
3435 /* If the initial value is not of proper type, convert it. */
3436 if (!useless_type_conversion_p (vectype
, TREE_TYPE (vec_init
)))
3439 = gimple_build_assign (vect_get_new_vect_var (vectype
,
3443 build1 (VIEW_CONVERT_EXPR
, vectype
,
3445 vec_init
= make_ssa_name (gimple_assign_lhs (new_stmt
), new_stmt
);
3446 gimple_assign_set_lhs (new_stmt
, vec_init
);
3447 new_bb
= gsi_insert_on_edge_immediate (loop_preheader_edge (iv_loop
),
3449 gcc_assert (!new_bb
);
3450 set_vinfo_for_stmt (new_stmt
,
3451 new_stmt_vec_info (new_stmt
, loop_vinfo
, NULL
));
3456 vec
<constructor_elt
, va_gc
> *v
;
3458 /* iv_loop is the loop to be vectorized. Create:
3459 vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr) */
3460 new_var
= vect_get_new_vect_var (TREE_TYPE (vectype
),
3461 vect_scalar_var
, "var_");
3462 new_name
= force_gimple_operand (fold_convert (TREE_TYPE (vectype
),
3464 &stmts
, false, new_var
);
3467 new_bb
= gsi_insert_seq_on_edge_immediate (pe
, stmts
);
3468 gcc_assert (!new_bb
);
3471 vec_alloc (v
, nunits
);
3472 bool constant_p
= is_gimple_min_invariant (new_name
);
3473 CONSTRUCTOR_APPEND_ELT (v
, NULL_TREE
, new_name
);
3474 for (i
= 1; i
< nunits
; i
++)
3476 /* Create: new_name_i = new_name + step_expr */
3477 new_name
= fold_build2 (PLUS_EXPR
, TREE_TYPE (new_name
),
3478 new_name
, step_expr
);
3479 if (!is_gimple_min_invariant (new_name
))
3481 init_stmt
= gimple_build_assign (new_var
, new_name
);
3482 new_name
= make_ssa_name (new_var
, init_stmt
);
3483 gimple_assign_set_lhs (init_stmt
, new_name
);
3484 new_bb
= gsi_insert_on_edge_immediate (pe
, init_stmt
);
3485 gcc_assert (!new_bb
);
3486 if (dump_enabled_p ())
3488 dump_printf_loc (MSG_NOTE
, vect_location
,
3489 "created new init_stmt: ");
3490 dump_gimple_stmt (MSG_NOTE
, TDF_SLIM
, init_stmt
, 0);
3491 dump_printf (MSG_NOTE
, "\n");
3495 CONSTRUCTOR_APPEND_ELT (v
, NULL_TREE
, new_name
);
3497 /* Create a vector from [new_name_0, new_name_1, ..., new_name_nunits-1] */
3499 new_vec
= build_vector_from_ctor (vectype
, v
);
3501 new_vec
= build_constructor (vectype
, v
);
3502 vec_init
= vect_init_vector (iv_phi
, new_vec
, vectype
, NULL
);
3506 /* Create the vector that holds the step of the induction. */
3507 if (nested_in_vect_loop
)
3508 /* iv_loop is nested in the loop to be vectorized. Generate:
3509 vec_step = [S, S, S, S] */
3510 new_name
= step_expr
;
3513 /* iv_loop is the loop to be vectorized. Generate:
3514 vec_step = [VF*S, VF*S, VF*S, VF*S] */
3515 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr
)))
3517 expr
= build_int_cst (integer_type_node
, vf
);
3518 expr
= fold_convert (TREE_TYPE (step_expr
), expr
);
3521 expr
= build_int_cst (TREE_TYPE (step_expr
), vf
);
3522 new_name
= fold_build2 (MULT_EXPR
, TREE_TYPE (step_expr
),
3524 if (TREE_CODE (step_expr
) == SSA_NAME
)
3525 new_name
= vect_init_vector (iv_phi
, new_name
,
3526 TREE_TYPE (step_expr
), NULL
);
3529 t
= unshare_expr (new_name
);
3530 gcc_assert (CONSTANT_CLASS_P (new_name
)
3531 || TREE_CODE (new_name
) == SSA_NAME
);
3532 stepvectype
= get_vectype_for_scalar_type (TREE_TYPE (new_name
));
3533 gcc_assert (stepvectype
);
3534 new_vec
= build_vector_from_val (stepvectype
, t
);
3535 vec_step
= vect_init_vector (iv_phi
, new_vec
, stepvectype
, NULL
);
3538 /* Create the following def-use cycle:
3543 vec_iv = PHI <vec_init, vec_loop>
3547 vec_loop = vec_iv + vec_step; */
3549 /* Create the induction-phi that defines the induction-operand. */
3550 vec_dest
= vect_get_new_vect_var (vectype
, vect_simple_var
, "vec_iv_");
3551 induction_phi
= create_phi_node (vec_dest
, iv_loop
->header
);
3552 set_vinfo_for_stmt (induction_phi
,
3553 new_stmt_vec_info (induction_phi
, loop_vinfo
, NULL
));
3554 induc_def
= PHI_RESULT (induction_phi
);
3556 /* Create the iv update inside the loop */
3557 new_stmt
= gimple_build_assign (vec_dest
, PLUS_EXPR
, induc_def
, vec_step
);
3558 vec_def
= make_ssa_name (vec_dest
, new_stmt
);
3559 gimple_assign_set_lhs (new_stmt
, vec_def
);
3560 gsi_insert_before (&si
, new_stmt
, GSI_SAME_STMT
);
3561 set_vinfo_for_stmt (new_stmt
, new_stmt_vec_info (new_stmt
, loop_vinfo
,
3564 /* Set the arguments of the phi node: */
3565 add_phi_arg (induction_phi
, vec_init
, pe
, UNKNOWN_LOCATION
);
3566 add_phi_arg (induction_phi
, vec_def
, loop_latch_edge (iv_loop
),
3570 /* In case that vectorization factor (VF) is bigger than the number
3571 of elements that we can fit in a vectype (nunits), we have to generate
3572 more than one vector stmt - i.e - we need to "unroll" the
3573 vector stmt by a factor VF/nunits. For more details see documentation
3574 in vectorizable_operation. */
3578 stmt_vec_info prev_stmt_vinfo
;
3579 /* FORNOW. This restriction should be relaxed. */
3580 gcc_assert (!nested_in_vect_loop
);
3582 /* Create the vector that holds the step of the induction. */
3583 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr
)))
3585 expr
= build_int_cst (integer_type_node
, nunits
);
3586 expr
= fold_convert (TREE_TYPE (step_expr
), expr
);
3589 expr
= build_int_cst (TREE_TYPE (step_expr
), nunits
);
3590 new_name
= fold_build2 (MULT_EXPR
, TREE_TYPE (step_expr
),
3592 if (TREE_CODE (step_expr
) == SSA_NAME
)
3593 new_name
= vect_init_vector (iv_phi
, new_name
,
3594 TREE_TYPE (step_expr
), NULL
);
3595 t
= unshare_expr (new_name
);
3596 gcc_assert (CONSTANT_CLASS_P (new_name
)
3597 || TREE_CODE (new_name
) == SSA_NAME
);
3598 new_vec
= build_vector_from_val (stepvectype
, t
);
3599 vec_step
= vect_init_vector (iv_phi
, new_vec
, stepvectype
, NULL
);
3601 vec_def
= induc_def
;
3602 prev_stmt_vinfo
= vinfo_for_stmt (induction_phi
);
3603 for (i
= 1; i
< ncopies
; i
++)
3605 /* vec_i = vec_prev + vec_step */
3606 new_stmt
= gimple_build_assign (vec_dest
, PLUS_EXPR
,
3608 vec_def
= make_ssa_name (vec_dest
, new_stmt
);
3609 gimple_assign_set_lhs (new_stmt
, vec_def
);
3611 gsi_insert_before (&si
, new_stmt
, GSI_SAME_STMT
);
3612 if (!useless_type_conversion_p (resvectype
, vectype
))
3615 = gimple_build_assign
3616 (vect_get_new_vect_var (resvectype
, vect_simple_var
,
3619 build1 (VIEW_CONVERT_EXPR
, resvectype
,
3620 gimple_assign_lhs (new_stmt
)));
3621 gimple_assign_set_lhs (new_stmt
,
3623 (gimple_assign_lhs (new_stmt
), new_stmt
));
3624 gsi_insert_before (&si
, new_stmt
, GSI_SAME_STMT
);
3626 set_vinfo_for_stmt (new_stmt
,
3627 new_stmt_vec_info (new_stmt
, loop_vinfo
, NULL
));
3628 STMT_VINFO_RELATED_STMT (prev_stmt_vinfo
) = new_stmt
;
3629 prev_stmt_vinfo
= vinfo_for_stmt (new_stmt
);
3633 if (nested_in_vect_loop
)
3635 /* Find the loop-closed exit-phi of the induction, and record
3636 the final vector of induction results: */
3638 FOR_EACH_IMM_USE_FAST (use_p
, imm_iter
, loop_arg
)
3640 gimple use_stmt
= USE_STMT (use_p
);
3641 if (is_gimple_debug (use_stmt
))
3644 if (!flow_bb_inside_loop_p (iv_loop
, gimple_bb (use_stmt
)))
3646 exit_phi
= use_stmt
;
3652 stmt_vec_info stmt_vinfo
= vinfo_for_stmt (exit_phi
);
3653 /* FORNOW. Currently not supporting the case that an inner-loop induction
3654 is not used in the outer-loop (i.e. only outside the outer-loop). */
3655 gcc_assert (STMT_VINFO_RELEVANT_P (stmt_vinfo
)
3656 && !STMT_VINFO_LIVE_P (stmt_vinfo
));
3658 STMT_VINFO_VEC_STMT (stmt_vinfo
) = new_stmt
;
3659 if (dump_enabled_p ())
3661 dump_printf_loc (MSG_NOTE
, vect_location
,
3662 "vector of inductions after inner-loop:");
3663 dump_gimple_stmt (MSG_NOTE
, TDF_SLIM
, new_stmt
, 0);
3664 dump_printf (MSG_NOTE
, "\n");
3670 if (dump_enabled_p ())
3672 dump_printf_loc (MSG_NOTE
, vect_location
,
3673 "transform induction: created def-use cycle: ");
3674 dump_gimple_stmt (MSG_NOTE
, TDF_SLIM
, induction_phi
, 0);
3675 dump_printf (MSG_NOTE
, "\n");
3676 dump_gimple_stmt (MSG_NOTE
, TDF_SLIM
,
3677 SSA_NAME_DEF_STMT (vec_def
), 0);
3678 dump_printf (MSG_NOTE
, "\n");
3681 STMT_VINFO_VEC_STMT (phi_info
) = induction_phi
;
3682 if (!useless_type_conversion_p (resvectype
, vectype
))
3684 new_stmt
= gimple_build_assign (vect_get_new_vect_var (resvectype
,
3688 build1 (VIEW_CONVERT_EXPR
, resvectype
,
3690 induc_def
= make_ssa_name (gimple_assign_lhs (new_stmt
), new_stmt
);
3691 gimple_assign_set_lhs (new_stmt
, induc_def
);
3692 si
= gsi_after_labels (bb
);
3693 gsi_insert_before (&si
, new_stmt
, GSI_SAME_STMT
);
3694 set_vinfo_for_stmt (new_stmt
,
3695 new_stmt_vec_info (new_stmt
, loop_vinfo
, NULL
));
3696 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (new_stmt
))
3697 = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (induction_phi
));
/* NOTE(review): same extraction garbling as the function above —
   statements split across lines, upstream line numbers fused into the
   text, continuation lines and braces dropped (see the jumps in the
   embedded numbering, e.g. 3792 -> 3795, 3803 -> 3807).  Code kept
   byte-identical; do not attempt functional edits without the upstream
   tree-vect-loop.c.  */
3704 /* Function get_initial_def_for_reduction
3707 STMT - a stmt that performs a reduction operation in the loop.
3708 INIT_VAL - the initial value of the reduction variable
3711 ADJUSTMENT_DEF - a tree that holds a value to be added to the final result
3712 of the reduction (used for adjusting the epilog - see below).
3713 Return a vector variable, initialized according to the operation that STMT
3714 performs. This vector will be used as the initial value of the
3715 vector of partial results.
3717 Option1 (adjust in epilog): Initialize the vector as follows:
3718 add/bit or/xor: [0,0,...,0,0]
3719 mult/bit and: [1,1,...,1,1]
3720 min/max/cond_expr: [init_val,init_val,..,init_val,init_val]
3721 and when necessary (e.g. add/mult case) let the caller know
3722 that it needs to adjust the result by init_val.
3724 Option2: Initialize the vector as follows:
3725 add/bit or/xor: [init_val,0,0,...,0]
3726 mult/bit and: [init_val,1,1,...,1]
3727 min/max/cond_expr: [init_val,init_val,...,init_val]
3728 and no adjustments are needed.
3730 For example, for the following code:
3736 STMT is 's = s + a[i]', and the reduction variable is 's'.
3737 For a vector of 4 units, we want to return either [0,0,0,init_val],
3738 or [0,0,0,0] and let the caller know that it needs to adjust
3739 the result at the end by 'init_val'.
3741 FORNOW, we are using the 'adjust in epilog' scheme, because this way the
3742 initialization vector is simpler (same element in all entries), if
3743 ADJUSTMENT_DEF is not NULL, and Option2 otherwise.
3745 A cost model should help decide between these two schemes. */
3748 get_initial_def_for_reduction (gimple stmt
, tree init_val
,
3749 tree
*adjustment_def
)
3751 stmt_vec_info stmt_vinfo
= vinfo_for_stmt (stmt
);
3752 loop_vec_info loop_vinfo
= STMT_VINFO_LOOP_VINFO (stmt_vinfo
);
3753 struct loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
3754 tree scalar_type
= TREE_TYPE (init_val
);
3755 tree vectype
= get_vectype_for_scalar_type (scalar_type
);
3757 enum tree_code code
= gimple_assign_rhs_code (stmt
);
3762 bool nested_in_vect_loop
= false;
3764 REAL_VALUE_TYPE real_init_val
= dconst0
;
3765 int int_init_val
= 0;
3766 gimple def_stmt
= NULL
;
3768 gcc_assert (vectype
);
3769 nunits
= TYPE_VECTOR_SUBPARTS (vectype
);
3771 gcc_assert (POINTER_TYPE_P (scalar_type
) || INTEGRAL_TYPE_P (scalar_type
)
3772 || SCALAR_FLOAT_TYPE_P (scalar_type
));
3774 if (nested_in_vect_loop_p (loop
, stmt
))
3775 nested_in_vect_loop
= true;
3777 gcc_assert (loop
== (gimple_bb (stmt
))->loop_father
);
3779 /* In case of double reduction we only create a vector variable to be put
3780 in the reduction phi node. The actual statement creation is done in
3781 vect_create_epilog_for_reduction. */
3782 if (adjustment_def
&& nested_in_vect_loop
3783 && TREE_CODE (init_val
) == SSA_NAME
3784 && (def_stmt
= SSA_NAME_DEF_STMT (init_val
))
3785 && gimple_code (def_stmt
) == GIMPLE_PHI
3786 && flow_bb_inside_loop_p (loop
, gimple_bb (def_stmt
))
3787 && vinfo_for_stmt (def_stmt
)
3788 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt
))
3789 == vect_double_reduction_def
)
3791 *adjustment_def
= NULL
;
3792 return vect_create_destination_var (init_val
, vectype
);
3795 if (TREE_CONSTANT (init_val
))
3797 if (SCALAR_FLOAT_TYPE_P (scalar_type
))
3798 init_value
= build_real (scalar_type
, TREE_REAL_CST (init_val
));
3800 init_value
= build_int_cst (scalar_type
, TREE_INT_CST_LOW (init_val
));
3803 init_value
= init_val
;
/* NOTE(review): the switch head for this case label (numbering jumps
   3803 -> 3807) and most of its sibling case labels are missing from
   this extraction.  */
3807 case WIDEN_SUM_EXPR
:
3816 /* ADJUSMENT_DEF is NULL when called from
3817 vect_create_epilog_for_reduction to vectorize double reduction. */
3820 if (nested_in_vect_loop
)
3821 *adjustment_def
= vect_get_vec_def_for_operand (init_val
, stmt
,
3824 *adjustment_def
= init_val
;
3827 if (code
== MULT_EXPR
)
3829 real_init_val
= dconst1
;
3833 if (code
== BIT_AND_EXPR
)
3836 if (SCALAR_FLOAT_TYPE_P (scalar_type
))
3837 def_for_init
= build_real (scalar_type
, real_init_val
);
3839 def_for_init
= build_int_cst (scalar_type
, int_init_val
);
3841 /* Create a vector of '0' or '1' except the first element. */
3842 elts
= XALLOCAVEC (tree
, nunits
);
3843 for (i
= nunits
- 2; i
>= 0; --i
)
3844 elts
[i
+ 1] = def_for_init
;
3846 /* Option1: the first element is '0' or '1' as well. */
3849 elts
[0] = def_for_init
;
3850 init_def
= build_vector (vectype
, elts
);
3854 /* Option2: the first element is INIT_VAL. */
3856 if (TREE_CONSTANT (init_val
))
3857 init_def
= build_vector (vectype
, elts
);
3860 vec
<constructor_elt
, va_gc
> *v
;
3861 vec_alloc (v
, nunits
);
3862 CONSTRUCTOR_APPEND_ELT (v
, NULL_TREE
, init_val
);
3863 for (i
= 1; i
< nunits
; ++i
)
3864 CONSTRUCTOR_APPEND_ELT (v
, NULL_TREE
, elts
[i
]);
3865 init_def
= build_constructor (vectype
, v
);
3875 *adjustment_def
= NULL_TREE
;
3876 init_def
= vect_get_vec_def_for_operand (init_val
, stmt
, NULL
);
3880 init_def
= build_vector_from_val (vectype
, init_value
);
3890 /* Function vect_create_epilog_for_reduction
3892 Create code at the loop-epilog to finalize the result of a reduction
3895 VECT_DEFS is list of vector of partial results, i.e., the lhs's of vector
3896 reduction statements.
3897 STMT is the scalar reduction stmt that is being vectorized.
3898 NCOPIES is > 1 in case the vectorization factor (VF) is bigger than the
3899 number of elements that we can fit in a vectype (nunits). In this case
3900 we have to generate more than one vector stmt - i.e - we need to "unroll"
3901 the vector stmt by a factor VF/nunits. For more details see documentation
3902 in vectorizable_operation.
3903 REDUC_CODE is the tree-code for the epilog reduction.
3904 REDUCTION_PHIS is a list of the phi-nodes that carry the reduction
3906 REDUC_INDEX is the index of the operand in the right hand side of the
3907 statement that is defined by REDUCTION_PHI.
3908 DOUBLE_REDUC is TRUE if double reduction phi nodes should be handled.
3909 SLP_NODE is an SLP node containing a group of reduction statements. The
3910 first one in this group is STMT.
3913 1. Creates the reduction def-use cycles: sets the arguments for
3915 The loop-entry argument is the vectorized initial-value of the reduction.
3916 The loop-latch argument is taken from VECT_DEFS - the vector of partial
3918 2. "Reduces" each vector of partial results VECT_DEFS into a single result,
3919 by applying the operation specified by REDUC_CODE if available, or by
3920 other means (whole-vector shifts or a scalar loop).
3921 The function also creates a new phi node at the loop exit to preserve
3922 loop-closed form, as illustrated below.
3924 The flow at the entry to this function:
3927 vec_def = phi <null, null> # REDUCTION_PHI
3928 VECT_DEF = vector_stmt # vectorized form of STMT
3929 s_loop = scalar_stmt # (scalar) STMT
3931 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
3935 The above is transformed by this function into:
3938 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
3939 VECT_DEF = vector_stmt # vectorized form of STMT
3940 s_loop = scalar_stmt # (scalar) STMT
3942 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
3943 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
3944 v_out2 = reduce <v_out1>
3945 s_out3 = extract_field <v_out2, 0>
3946 s_out4 = adjust_result <s_out3>
3952 vect_create_epilog_for_reduction (vec
<tree
> vect_defs
, gimple stmt
,
3953 int ncopies
, enum tree_code reduc_code
,
3954 vec
<gimple
> reduction_phis
,
3955 int reduc_index
, bool double_reduc
,
3958 stmt_vec_info stmt_info
= vinfo_for_stmt (stmt
);
3959 stmt_vec_info prev_phi_info
;
3962 loop_vec_info loop_vinfo
= STMT_VINFO_LOOP_VINFO (stmt_info
);
3963 struct loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
), *outer_loop
= NULL
;
3964 basic_block exit_bb
;
3967 gimple new_phi
= NULL
, phi
;
3968 gimple_stmt_iterator exit_gsi
;
3970 tree new_temp
= NULL_TREE
, new_dest
, new_name
, new_scalar_dest
;
3971 gimple epilog_stmt
= NULL
;
3972 enum tree_code code
= gimple_assign_rhs_code (stmt
);
3975 tree adjustment_def
= NULL
;
3976 tree vec_initial_def
= NULL
;
3977 tree reduction_op
, expr
, def
;
3978 tree orig_name
, scalar_result
;
3979 imm_use_iterator imm_iter
, phi_imm_iter
;
3980 use_operand_p use_p
, phi_use_p
;
3981 gimple use_stmt
, orig_stmt
, reduction_phi
= NULL
;
3982 bool nested_in_vect_loop
= false;
3983 auto_vec
<gimple
> new_phis
;
3984 auto_vec
<gimple
> inner_phis
;
3985 enum vect_def_type dt
= vect_unknown_def_type
;
3987 auto_vec
<tree
> scalar_results
;
3988 unsigned int group_size
= 1, k
, ratio
;
3989 auto_vec
<tree
> vec_initial_defs
;
3990 auto_vec
<gimple
> phis
;
3991 bool slp_reduc
= false;
3992 tree new_phi_result
;
3993 gimple inner_phi
= NULL
;
3996 group_size
= SLP_TREE_SCALAR_STMTS (slp_node
).length ();
3998 if (nested_in_vect_loop_p (loop
, stmt
))
4002 nested_in_vect_loop
= true;
4003 gcc_assert (!slp_node
);
4006 reduction_op
= get_reduction_op (stmt
, reduc_index
);
4008 vectype
= get_vectype_for_scalar_type (TREE_TYPE (reduction_op
));
4009 gcc_assert (vectype
);
4010 mode
= TYPE_MODE (vectype
);
4012 /* 1. Create the reduction def-use cycle:
4013 Set the arguments of REDUCTION_PHIS, i.e., transform
4016 vec_def = phi <null, null> # REDUCTION_PHI
4017 VECT_DEF = vector_stmt # vectorized form of STMT
4023 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
4024 VECT_DEF = vector_stmt # vectorized form of STMT
4027 (in case of SLP, do it for all the phis). */
4029 /* Get the loop-entry arguments. */
4031 vect_get_vec_defs (reduction_op
, NULL_TREE
, stmt
, &vec_initial_defs
,
4032 NULL
, slp_node
, reduc_index
);
4035 vec_initial_defs
.create (1);
4036 /* For the case of reduction, vect_get_vec_def_for_operand returns
4037 the scalar def before the loop, that defines the initial value
4038 of the reduction variable. */
4039 vec_initial_def
= vect_get_vec_def_for_operand (reduction_op
, stmt
,
4041 vec_initial_defs
.quick_push (vec_initial_def
);
4044 /* Set phi nodes arguments. */
4045 FOR_EACH_VEC_ELT (reduction_phis
, i
, phi
)
4047 tree vec_init_def
, def
;
4049 vec_init_def
= force_gimple_operand (vec_initial_defs
[i
], &stmts
,
4051 gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop
), stmts
);
4053 for (j
= 0; j
< ncopies
; j
++)
4055 /* Set the loop-entry arg of the reduction-phi. */
4056 add_phi_arg (as_a
<gphi
*> (phi
), vec_init_def
,
4057 loop_preheader_edge (loop
), UNKNOWN_LOCATION
);
4059 /* Set the loop-latch arg for the reduction-phi. */
4061 def
= vect_get_vec_def_for_stmt_copy (vect_unknown_def_type
, def
);
4063 add_phi_arg (as_a
<gphi
*> (phi
), def
, loop_latch_edge (loop
),
4066 if (dump_enabled_p ())
4068 dump_printf_loc (MSG_NOTE
, vect_location
,
4069 "transform reduction: created def-use cycle: ");
4070 dump_gimple_stmt (MSG_NOTE
, TDF_SLIM
, phi
, 0);
4071 dump_printf (MSG_NOTE
, "\n");
4072 dump_gimple_stmt (MSG_NOTE
, TDF_SLIM
, SSA_NAME_DEF_STMT (def
), 0);
4073 dump_printf (MSG_NOTE
, "\n");
4076 phi
= STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi
));
4080 /* 2. Create epilog code.
4081 The reduction epilog code operates across the elements of the vector
4082 of partial results computed by the vectorized loop.
4083 The reduction epilog code consists of:
4085 step 1: compute the scalar result in a vector (v_out2)
4086 step 2: extract the scalar result (s_out3) from the vector (v_out2)
4087 step 3: adjust the scalar result (s_out3) if needed.
4089 Step 1 can be accomplished using one the following three schemes:
4090 (scheme 1) using reduc_code, if available.
4091 (scheme 2) using whole-vector shifts, if available.
4092 (scheme 3) using a scalar loop. In this case steps 1+2 above are
4095 The overall epilog code looks like this:
4097 s_out0 = phi <s_loop> # original EXIT_PHI
4098 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
4099 v_out2 = reduce <v_out1> # step 1
4100 s_out3 = extract_field <v_out2, 0> # step 2
4101 s_out4 = adjust_result <s_out3> # step 3
4103 (step 3 is optional, and steps 1 and 2 may be combined).
4104 Lastly, the uses of s_out0 are replaced by s_out4. */
4107 /* 2.1 Create new loop-exit-phis to preserve loop-closed form:
4108 v_out1 = phi <VECT_DEF>
4109 Store them in NEW_PHIS. */
4111 exit_bb
= single_exit (loop
)->dest
;
4112 prev_phi_info
= NULL
;
4113 new_phis
.create (vect_defs
.length ());
4114 FOR_EACH_VEC_ELT (vect_defs
, i
, def
)
4116 for (j
= 0; j
< ncopies
; j
++)
4118 tree new_def
= copy_ssa_name (def
);
4119 phi
= create_phi_node (new_def
, exit_bb
);
4120 set_vinfo_for_stmt (phi
, new_stmt_vec_info (phi
, loop_vinfo
, NULL
));
4122 new_phis
.quick_push (phi
);
4125 def
= vect_get_vec_def_for_stmt_copy (dt
, def
);
4126 STMT_VINFO_RELATED_STMT (prev_phi_info
) = phi
;
4129 SET_PHI_ARG_DEF (phi
, single_exit (loop
)->dest_idx
, def
);
4130 prev_phi_info
= vinfo_for_stmt (phi
);
4134 /* The epilogue is created for the outer-loop, i.e., for the loop being
4135 vectorized. Create exit phis for the outer loop. */
4139 exit_bb
= single_exit (loop
)->dest
;
4140 inner_phis
.create (vect_defs
.length ());
4141 FOR_EACH_VEC_ELT (new_phis
, i
, phi
)
4143 tree new_result
= copy_ssa_name (PHI_RESULT (phi
));
4144 gphi
*outer_phi
= create_phi_node (new_result
, exit_bb
);
4145 SET_PHI_ARG_DEF (outer_phi
, single_exit (loop
)->dest_idx
,
4147 set_vinfo_for_stmt (outer_phi
, new_stmt_vec_info (outer_phi
,
4149 inner_phis
.quick_push (phi
);
4150 new_phis
[i
] = outer_phi
;
4151 prev_phi_info
= vinfo_for_stmt (outer_phi
);
4152 while (STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi
)))
4154 phi
= STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi
));
4155 new_result
= copy_ssa_name (PHI_RESULT (phi
));
4156 outer_phi
= create_phi_node (new_result
, exit_bb
);
4157 SET_PHI_ARG_DEF (outer_phi
, single_exit (loop
)->dest_idx
,
4159 set_vinfo_for_stmt (outer_phi
, new_stmt_vec_info (outer_phi
,
4161 STMT_VINFO_RELATED_STMT (prev_phi_info
) = outer_phi
;
4162 prev_phi_info
= vinfo_for_stmt (outer_phi
);
4167 exit_gsi
= gsi_after_labels (exit_bb
);
4169 /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
4170 (i.e. when reduc_code is not available) and in the final adjustment
4171 code (if needed). Also get the original scalar reduction variable as
4172 defined in the loop. In case STMT is a "pattern-stmt" (i.e. - it
4173 represents a reduction pattern), the tree-code and scalar-def are
4174 taken from the original stmt that the pattern-stmt (STMT) replaces.
4175 Otherwise (it is a regular reduction) - the tree-code and scalar-def
4176 are taken from STMT. */
4178 orig_stmt
= STMT_VINFO_RELATED_STMT (stmt_info
);
4181 /* Regular reduction */
4186 /* Reduction pattern */
4187 stmt_vec_info stmt_vinfo
= vinfo_for_stmt (orig_stmt
);
4188 gcc_assert (STMT_VINFO_IN_PATTERN_P (stmt_vinfo
));
4189 gcc_assert (STMT_VINFO_RELATED_STMT (stmt_vinfo
) == stmt
);
4192 code
= gimple_assign_rhs_code (orig_stmt
);
4193 /* For MINUS_EXPR the initial vector is [init_val,0,...,0], therefore,
4194 partial results are added and not subtracted. */
4195 if (code
== MINUS_EXPR
)
4198 scalar_dest
= gimple_assign_lhs (orig_stmt
);
4199 scalar_type
= TREE_TYPE (scalar_dest
);
4200 scalar_results
.create (group_size
);
4201 new_scalar_dest
= vect_create_destination_var (scalar_dest
, NULL
);
4202 bitsize
= TYPE_SIZE (scalar_type
);
4204 /* In case this is a reduction in an inner-loop while vectorizing an outer
4205 loop - we don't need to extract a single scalar result at the end of the
4206 inner-loop (unless it is double reduction, i.e., the use of reduction is
4207 outside the outer-loop). The final vector of partial results will be used
4208 in the vectorized outer-loop, or reduced to a scalar result at the end of
4210 if (nested_in_vect_loop
&& !double_reduc
)
4211 goto vect_finalize_reduction
;
4213 /* SLP reduction without reduction chain, e.g.,
4217 b2 = operation (b1) */
4218 slp_reduc
= (slp_node
&& !GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt
)));
4220 /* In case of reduction chain, e.g.,
4223 a3 = operation (a2),
4225 we may end up with more than one vector result. Here we reduce them to
4227 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt
)))
4229 tree first_vect
= PHI_RESULT (new_phis
[0]);
4231 gassign
*new_vec_stmt
= NULL
;
4233 vec_dest
= vect_create_destination_var (scalar_dest
, vectype
);
4234 for (k
= 1; k
< new_phis
.length (); k
++)
4236 gimple next_phi
= new_phis
[k
];
4237 tree second_vect
= PHI_RESULT (next_phi
);
4239 tmp
= build2 (code
, vectype
, first_vect
, second_vect
);
4240 new_vec_stmt
= gimple_build_assign (vec_dest
, tmp
);
4241 first_vect
= make_ssa_name (vec_dest
, new_vec_stmt
);
4242 gimple_assign_set_lhs (new_vec_stmt
, first_vect
);
4243 gsi_insert_before (&exit_gsi
, new_vec_stmt
, GSI_SAME_STMT
);
4246 new_phi_result
= first_vect
;
4249 new_phis
.truncate (0);
4250 new_phis
.safe_push (new_vec_stmt
);
4254 new_phi_result
= PHI_RESULT (new_phis
[0]);
4256 /* 2.3 Create the reduction code, using one of the three schemes described
4257 above. In SLP we simply need to extract all the elements from the
4258 vector (without reducing them), so we use scalar shifts. */
4259 if (reduc_code
!= ERROR_MARK
&& !slp_reduc
)
4264 /*** Case 1: Create:
4265 v_out2 = reduc_expr <v_out1> */
4267 if (dump_enabled_p ())
4268 dump_printf_loc (MSG_NOTE
, vect_location
,
4269 "Reduce using direct vector reduction.\n");
4271 vec_elem_type
= TREE_TYPE (TREE_TYPE (new_phi_result
));
4272 if (!useless_type_conversion_p (scalar_type
, vec_elem_type
))
4275 vect_create_destination_var (scalar_dest
, vec_elem_type
);
4276 tmp
= build1 (reduc_code
, vec_elem_type
, new_phi_result
);
4277 epilog_stmt
= gimple_build_assign (tmp_dest
, tmp
);
4278 new_temp
= make_ssa_name (tmp_dest
, epilog_stmt
);
4279 gimple_assign_set_lhs (epilog_stmt
, new_temp
);
4280 gsi_insert_before (&exit_gsi
, epilog_stmt
, GSI_SAME_STMT
);
4282 tmp
= build1 (NOP_EXPR
, scalar_type
, new_temp
);
4285 tmp
= build1 (reduc_code
, scalar_type
, new_phi_result
);
4286 epilog_stmt
= gimple_build_assign (new_scalar_dest
, tmp
);
4287 new_temp
= make_ssa_name (new_scalar_dest
, epilog_stmt
);
4288 gimple_assign_set_lhs (epilog_stmt
, new_temp
);
4289 gsi_insert_before (&exit_gsi
, epilog_stmt
, GSI_SAME_STMT
);
4290 scalar_results
.safe_push (new_temp
);
4294 bool reduce_with_shift
= have_whole_vector_shift (mode
);
4295 int element_bitsize
= tree_to_uhwi (bitsize
);
4296 int vec_size_in_bits
= tree_to_uhwi (TYPE_SIZE (vectype
));
4299 /* Regardless of whether we have a whole vector shift, if we're
4300 emulating the operation via tree-vect-generic, we don't want
4301 to use it. Only the first round of the reduction is likely
4302 to still be profitable via emulation. */
4303 /* ??? It might be better to emit a reduction tree code here, so that
4304 tree-vect-generic can expand the first round via bit tricks. */
4305 if (!VECTOR_MODE_P (mode
))
4306 reduce_with_shift
= false;
4309 optab optab
= optab_for_tree_code (code
, vectype
, optab_default
);
4310 if (optab_handler (optab
, mode
) == CODE_FOR_nothing
)
4311 reduce_with_shift
= false;
4314 if (reduce_with_shift
&& !slp_reduc
)
4316 int nelements
= vec_size_in_bits
/ element_bitsize
;
4317 unsigned char *sel
= XALLOCAVEC (unsigned char, nelements
);
4321 tree zero_vec
= build_zero_cst (vectype
);
4322 /*** Case 2: Create:
4323 for (offset = nelements/2; offset >= 1; offset/=2)
4325 Create: va' = vec_shift <va, offset>
4326 Create: va = vop <va, va'>
4331 if (dump_enabled_p ())
4332 dump_printf_loc (MSG_NOTE
, vect_location
,
4333 "Reduce using vector shifts\n");
4335 vec_dest
= vect_create_destination_var (scalar_dest
, vectype
);
4336 new_temp
= new_phi_result
;
4337 for (elt_offset
= nelements
/ 2;
4341 calc_vec_perm_mask_for_shift (mode
, elt_offset
, sel
);
4342 tree mask
= vect_gen_perm_mask_any (vectype
, sel
);
4343 epilog_stmt
= gimple_build_assign (vec_dest
, VEC_PERM_EXPR
,
4344 new_temp
, zero_vec
, mask
);
4345 new_name
= make_ssa_name (vec_dest
, epilog_stmt
);
4346 gimple_assign_set_lhs (epilog_stmt
, new_name
);
4347 gsi_insert_before (&exit_gsi
, epilog_stmt
, GSI_SAME_STMT
);
4349 epilog_stmt
= gimple_build_assign (vec_dest
, code
, new_name
,
4351 new_temp
= make_ssa_name (vec_dest
, epilog_stmt
);
4352 gimple_assign_set_lhs (epilog_stmt
, new_temp
);
4353 gsi_insert_before (&exit_gsi
, epilog_stmt
, GSI_SAME_STMT
);
4356 /* 2.4 Extract the final scalar result. Create:
4357 s_out3 = extract_field <v_out2, bitpos> */
4359 if (dump_enabled_p ())
4360 dump_printf_loc (MSG_NOTE
, vect_location
,
4361 "extract scalar result\n");
4363 rhs
= build3 (BIT_FIELD_REF
, scalar_type
, new_temp
,
4364 bitsize
, bitsize_zero_node
);
4365 epilog_stmt
= gimple_build_assign (new_scalar_dest
, rhs
);
4366 new_temp
= make_ssa_name (new_scalar_dest
, epilog_stmt
);
4367 gimple_assign_set_lhs (epilog_stmt
, new_temp
);
4368 gsi_insert_before (&exit_gsi
, epilog_stmt
, GSI_SAME_STMT
);
4369 scalar_results
.safe_push (new_temp
);
4373 /*** Case 3: Create:
4374 s = extract_field <v_out2, 0>
4375 for (offset = element_size;
4376 offset < vector_size;
4377 offset += element_size;)
4379 Create: s' = extract_field <v_out2, offset>
4380 Create: s = op <s, s'> // For non SLP cases
4383 if (dump_enabled_p ())
4384 dump_printf_loc (MSG_NOTE
, vect_location
,
4385 "Reduce using scalar code.\n");
4387 vec_size_in_bits
= tree_to_uhwi (TYPE_SIZE (vectype
));
4388 FOR_EACH_VEC_ELT (new_phis
, i
, new_phi
)
4391 if (gimple_code (new_phi
) == GIMPLE_PHI
)
4392 vec_temp
= PHI_RESULT (new_phi
);
4394 vec_temp
= gimple_assign_lhs (new_phi
);
4395 tree rhs
= build3 (BIT_FIELD_REF
, scalar_type
, vec_temp
, bitsize
,
4397 epilog_stmt
= gimple_build_assign (new_scalar_dest
, rhs
);
4398 new_temp
= make_ssa_name (new_scalar_dest
, epilog_stmt
);
4399 gimple_assign_set_lhs (epilog_stmt
, new_temp
);
4400 gsi_insert_before (&exit_gsi
, epilog_stmt
, GSI_SAME_STMT
);
4402 /* In SLP we don't need to apply reduction operation, so we just
4403 collect s' values in SCALAR_RESULTS. */
4405 scalar_results
.safe_push (new_temp
);
4407 for (bit_offset
= element_bitsize
;
4408 bit_offset
< vec_size_in_bits
;
4409 bit_offset
+= element_bitsize
)
4411 tree bitpos
= bitsize_int (bit_offset
);
4412 tree rhs
= build3 (BIT_FIELD_REF
, scalar_type
, vec_temp
,
4415 epilog_stmt
= gimple_build_assign (new_scalar_dest
, rhs
);
4416 new_name
= make_ssa_name (new_scalar_dest
, epilog_stmt
);
4417 gimple_assign_set_lhs (epilog_stmt
, new_name
);
4418 gsi_insert_before (&exit_gsi
, epilog_stmt
, GSI_SAME_STMT
);
4422 /* In SLP we don't need to apply reduction operation, so
4423 we just collect s' values in SCALAR_RESULTS. */
4424 new_temp
= new_name
;
4425 scalar_results
.safe_push (new_name
);
4429 epilog_stmt
= gimple_build_assign (new_scalar_dest
, code
,
4430 new_name
, new_temp
);
4431 new_temp
= make_ssa_name (new_scalar_dest
, epilog_stmt
);
4432 gimple_assign_set_lhs (epilog_stmt
, new_temp
);
4433 gsi_insert_before (&exit_gsi
, epilog_stmt
, GSI_SAME_STMT
);
4438 /* The only case where we need to reduce scalar results in SLP, is
4439 unrolling. If the size of SCALAR_RESULTS is greater than
4440 GROUP_SIZE, we reduce them combining elements modulo
4444 tree res
, first_res
, new_res
;
4447 /* Reduce multiple scalar results in case of SLP unrolling. */
4448 for (j
= group_size
; scalar_results
.iterate (j
, &res
);
4451 first_res
= scalar_results
[j
% group_size
];
4452 new_stmt
= gimple_build_assign (new_scalar_dest
, code
,
4454 new_res
= make_ssa_name (new_scalar_dest
, new_stmt
);
4455 gimple_assign_set_lhs (new_stmt
, new_res
);
4456 gsi_insert_before (&exit_gsi
, new_stmt
, GSI_SAME_STMT
);
4457 scalar_results
[j
% group_size
] = new_res
;
4461 /* Not SLP - we have one scalar to keep in SCALAR_RESULTS. */
4462 scalar_results
.safe_push (new_temp
);
4466 vect_finalize_reduction
:
4471 /* 2.5 Adjust the final result by the initial value of the reduction
4472 variable. (When such adjustment is not needed, then
4473 'adjustment_def' is zero). For example, if code is PLUS we create:
4474 new_temp = loop_exit_def + adjustment_def */
4478 gcc_assert (!slp_reduc
);
4479 if (nested_in_vect_loop
)
4481 new_phi
= new_phis
[0];
4482 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def
)) == VECTOR_TYPE
);
4483 expr
= build2 (code
, vectype
, PHI_RESULT (new_phi
), adjustment_def
);
4484 new_dest
= vect_create_destination_var (scalar_dest
, vectype
);
4488 new_temp
= scalar_results
[0];
4489 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def
)) != VECTOR_TYPE
);
4490 expr
= build2 (code
, scalar_type
, new_temp
, adjustment_def
);
4491 new_dest
= vect_create_destination_var (scalar_dest
, scalar_type
);
4494 epilog_stmt
= gimple_build_assign (new_dest
, expr
);
4495 new_temp
= make_ssa_name (new_dest
, epilog_stmt
);
4496 gimple_assign_set_lhs (epilog_stmt
, new_temp
);
4497 gsi_insert_before (&exit_gsi
, epilog_stmt
, GSI_SAME_STMT
);
4498 if (nested_in_vect_loop
)
4500 set_vinfo_for_stmt (epilog_stmt
,
4501 new_stmt_vec_info (epilog_stmt
, loop_vinfo
,
4503 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (epilog_stmt
)) =
4504 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (new_phi
));
4507 scalar_results
.quick_push (new_temp
);
4509 scalar_results
[0] = new_temp
;
4512 scalar_results
[0] = new_temp
;
4514 new_phis
[0] = epilog_stmt
;
4517 /* 2.6 Handle the loop-exit phis. Replace the uses of scalar loop-exit
4518 phis with new adjusted scalar results, i.e., replace use <s_out0>
4523 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
4524 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
4525 v_out2 = reduce <v_out1>
4526 s_out3 = extract_field <v_out2, 0>
4527 s_out4 = adjust_result <s_out3>
4534 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
4535 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
4536 v_out2 = reduce <v_out1>
4537 s_out3 = extract_field <v_out2, 0>
4538 s_out4 = adjust_result <s_out3>
4543 /* In SLP reduction chain we reduce vector results into one vector if
4544 necessary, hence we set here GROUP_SIZE to 1. SCALAR_DEST is the LHS of
4545 the last stmt in the reduction chain, since we are looking for the loop
4547 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt
)))
4549 scalar_dest
= gimple_assign_lhs (
4550 SLP_TREE_SCALAR_STMTS (slp_node
)[group_size
- 1]);
4554 /* In SLP we may have several statements in NEW_PHIS and REDUCTION_PHIS (in
4555 case that GROUP_SIZE is greater than vectorization factor). Therefore, we
4556 need to match SCALAR_RESULTS with corresponding statements. The first
4557 (GROUP_SIZE / number of new vector stmts) scalar results correspond to
4558 the first vector stmt, etc.
4559 (RATIO is equal to (GROUP_SIZE / number of new vector stmts)). */
4560 if (group_size
> new_phis
.length ())
4562 ratio
= group_size
/ new_phis
.length ();
4563 gcc_assert (!(group_size
% new_phis
.length ()));
4568 for (k
= 0; k
< group_size
; k
++)
4572 epilog_stmt
= new_phis
[k
/ ratio
];
4573 reduction_phi
= reduction_phis
[k
/ ratio
];
4575 inner_phi
= inner_phis
[k
/ ratio
];
4580 gimple current_stmt
= SLP_TREE_SCALAR_STMTS (slp_node
)[k
];
4582 orig_stmt
= STMT_VINFO_RELATED_STMT (vinfo_for_stmt (current_stmt
));
4583 /* SLP statements can't participate in patterns. */
4584 gcc_assert (!orig_stmt
);
4585 scalar_dest
= gimple_assign_lhs (current_stmt
);
4589 /* Find the loop-closed-use at the loop exit of the original scalar
4590 result. (The reduction result is expected to have two immediate uses -
4591 one at the latch block, and one at the loop exit). */
4592 FOR_EACH_IMM_USE_FAST (use_p
, imm_iter
, scalar_dest
)
4593 if (!flow_bb_inside_loop_p (loop
, gimple_bb (USE_STMT (use_p
)))
4594 && !is_gimple_debug (USE_STMT (use_p
)))
4595 phis
.safe_push (USE_STMT (use_p
));
4597 /* While we expect to have found an exit_phi because of loop-closed-ssa
4598 form we can end up without one if the scalar cycle is dead. */
4600 FOR_EACH_VEC_ELT (phis
, i
, exit_phi
)
4604 stmt_vec_info exit_phi_vinfo
= vinfo_for_stmt (exit_phi
);
4607 /* FORNOW. Currently not supporting the case that an inner-loop
4608 reduction is not used in the outer-loop (but only outside the
4609 outer-loop), unless it is double reduction. */
4610 gcc_assert ((STMT_VINFO_RELEVANT_P (exit_phi_vinfo
)
4611 && !STMT_VINFO_LIVE_P (exit_phi_vinfo
))
4615 STMT_VINFO_VEC_STMT (exit_phi_vinfo
) = inner_phi
;
4617 STMT_VINFO_VEC_STMT (exit_phi_vinfo
) = epilog_stmt
;
4619 || STMT_VINFO_DEF_TYPE (exit_phi_vinfo
)
4620 != vect_double_reduction_def
)
4623 /* Handle double reduction:
4625 stmt1: s1 = phi <s0, s2> - double reduction phi (outer loop)
4626 stmt2: s3 = phi <s1, s4> - (regular) reduc phi (inner loop)
4627 stmt3: s4 = use (s3) - (regular) reduc stmt (inner loop)
4628 stmt4: s2 = phi <s4> - double reduction stmt (outer loop)
4630 At that point the regular reduction (stmt2 and stmt3) is
4631 already vectorized, as well as the exit phi node, stmt4.
4632 Here we vectorize the phi node of double reduction, stmt1, and
4633 update all relevant statements. */
4635 /* Go through all the uses of s2 to find double reduction phi
4636 node, i.e., stmt1 above. */
4637 orig_name
= PHI_RESULT (exit_phi
);
4638 FOR_EACH_IMM_USE_STMT (use_stmt
, imm_iter
, orig_name
)
4640 stmt_vec_info use_stmt_vinfo
;
4641 stmt_vec_info new_phi_vinfo
;
4642 tree vect_phi_init
, preheader_arg
, vect_phi_res
, init_def
;
4643 basic_block bb
= gimple_bb (use_stmt
);
4646 /* Check that USE_STMT is really double reduction phi
4648 if (gimple_code (use_stmt
) != GIMPLE_PHI
4649 || gimple_phi_num_args (use_stmt
) != 2
4650 || bb
->loop_father
!= outer_loop
)
4652 use_stmt_vinfo
= vinfo_for_stmt (use_stmt
);
4654 || STMT_VINFO_DEF_TYPE (use_stmt_vinfo
)
4655 != vect_double_reduction_def
)
4658 /* Create vector phi node for double reduction:
4659 vs1 = phi <vs0, vs2>
4660 vs1 was created previously in this function by a call to
4661 vect_get_vec_def_for_operand and is stored in
4663 vs2 is defined by INNER_PHI, the vectorized EXIT_PHI;
4664 vs0 is created here. */
4666 /* Create vector phi node. */
4667 vect_phi
= create_phi_node (vec_initial_def
, bb
);
4668 new_phi_vinfo
= new_stmt_vec_info (vect_phi
,
4669 loop_vec_info_for_loop (outer_loop
), NULL
);
4670 set_vinfo_for_stmt (vect_phi
, new_phi_vinfo
);
4672 /* Create vs0 - initial def of the double reduction phi. */
4673 preheader_arg
= PHI_ARG_DEF_FROM_EDGE (use_stmt
,
4674 loop_preheader_edge (outer_loop
));
4675 init_def
= get_initial_def_for_reduction (stmt
,
4676 preheader_arg
, NULL
);
4677 vect_phi_init
= vect_init_vector (use_stmt
, init_def
,
4680 /* Update phi node arguments with vs0 and vs2. */
4681 add_phi_arg (vect_phi
, vect_phi_init
,
4682 loop_preheader_edge (outer_loop
),
4684 add_phi_arg (vect_phi
, PHI_RESULT (inner_phi
),
4685 loop_latch_edge (outer_loop
), UNKNOWN_LOCATION
);
4686 if (dump_enabled_p ())
4688 dump_printf_loc (MSG_NOTE
, vect_location
,
4689 "created double reduction phi node: ");
4690 dump_gimple_stmt (MSG_NOTE
, TDF_SLIM
, vect_phi
, 0);
4691 dump_printf (MSG_NOTE
, "\n");
4694 vect_phi_res
= PHI_RESULT (vect_phi
);
4696 /* Replace the use, i.e., set the correct vs1 in the regular
4697 reduction phi node. FORNOW, NCOPIES is always 1, so the
4698 loop is redundant. */
4699 use
= reduction_phi
;
4700 for (j
= 0; j
< ncopies
; j
++)
4702 edge pr_edge
= loop_preheader_edge (loop
);
4703 SET_PHI_ARG_DEF (use
, pr_edge
->dest_idx
, vect_phi_res
);
4704 use
= STMT_VINFO_RELATED_STMT (vinfo_for_stmt (use
));
4711 if (nested_in_vect_loop
)
4720 /* Find the loop-closed-use at the loop exit of the original scalar
4721 result. (The reduction result is expected to have two immediate uses,
4722 one at the latch block, and one at the loop exit). For double
4723 reductions we are looking for exit phis of the outer loop. */
4724 FOR_EACH_IMM_USE_FAST (use_p
, imm_iter
, scalar_dest
)
4726 if (!flow_bb_inside_loop_p (loop
, gimple_bb (USE_STMT (use_p
))))
4728 if (!is_gimple_debug (USE_STMT (use_p
)))
4729 phis
.safe_push (USE_STMT (use_p
));
4733 if (double_reduc
&& gimple_code (USE_STMT (use_p
)) == GIMPLE_PHI
)
4735 tree phi_res
= PHI_RESULT (USE_STMT (use_p
));
4737 FOR_EACH_IMM_USE_FAST (phi_use_p
, phi_imm_iter
, phi_res
)
4739 if (!flow_bb_inside_loop_p (loop
,
4740 gimple_bb (USE_STMT (phi_use_p
)))
4741 && !is_gimple_debug (USE_STMT (phi_use_p
)))
4742 phis
.safe_push (USE_STMT (phi_use_p
));
4748 FOR_EACH_VEC_ELT (phis
, i
, exit_phi
)
4750 /* Replace the uses: */
4751 orig_name
= PHI_RESULT (exit_phi
);
4752 scalar_result
= scalar_results
[k
];
4753 FOR_EACH_IMM_USE_STMT (use_stmt
, imm_iter
, orig_name
)
4754 FOR_EACH_IMM_USE_ON_STMT (use_p
, imm_iter
)
4755 SET_USE (use_p
, scalar_result
);
4763 /* Function vectorizable_reduction.
4765 Check if STMT performs a reduction operation that can be vectorized.
4766 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
4767 stmt to replace it, put it in VEC_STMT, and insert it at GSI.
4768 Return FALSE if not a vectorizable STMT, TRUE otherwise.
4770 This function also handles reduction idioms (patterns) that have been
4771 recognized in advance during vect_pattern_recog. In this case, STMT may be
4773 X = pattern_expr (arg0, arg1, ..., X)
4774 and its STMT_VINFO_RELATED_STMT points to the last stmt in the original
4775 sequence that had been detected and replaced by the pattern-stmt (STMT).
4777 In some cases of reduction patterns, the type of the reduction variable X is
4778 different than the type of the other arguments of STMT.
4779 In such cases, the vectype that is used when transforming STMT into a vector
4780 stmt is different than the vectype that is used to determine the
4781 vectorization factor, because it consists of a different number of elements
4782 than the actual number of elements that are being operated upon in parallel.
4784 For example, consider an accumulation of shorts into an int accumulator.
4785 On some targets it's possible to vectorize this pattern operating on 8
4786 shorts at a time (hence, the vectype for purposes of determining the
4787 vectorization factor should be V8HI); on the other hand, the vectype that
4788 is used to create the vector form is actually V4SI (the type of the result).
4790 Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
4791 indicates what is the actual level of parallelism (V8HI in the example), so
4792 that the right vectorization factor would be derived. This vectype
4793 corresponds to the type of arguments to the reduction stmt, and should *NOT*
4794 be used to create the vectorized stmt. The right vectype for the vectorized
4795 stmt is obtained from the type of the result X:
4796 get_vectype_for_scalar_type (TREE_TYPE (X))
4798 This means that, contrary to "regular" reductions (or "regular" stmts in
4799 general), the following equation:
4800 STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (TREE_TYPE (X))
4801 does *NOT* necessarily hold for reduction patterns. */
4804 vectorizable_reduction (gimple stmt
, gimple_stmt_iterator
*gsi
,
4805 gimple
*vec_stmt
, slp_tree slp_node
)
4809 tree loop_vec_def0
= NULL_TREE
, loop_vec_def1
= NULL_TREE
;
4810 stmt_vec_info stmt_info
= vinfo_for_stmt (stmt
);
4811 tree vectype_out
= STMT_VINFO_VECTYPE (stmt_info
);
4812 tree vectype_in
= NULL_TREE
;
4813 loop_vec_info loop_vinfo
= STMT_VINFO_LOOP_VINFO (stmt_info
);
4814 struct loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
4815 enum tree_code code
, orig_code
, epilog_reduc_code
;
4816 machine_mode vec_mode
;
4818 optab optab
, reduc_optab
;
4819 tree new_temp
= NULL_TREE
;
4822 enum vect_def_type dt
;
4823 gphi
*new_phi
= NULL
;
4827 stmt_vec_info orig_stmt_info
;
4828 tree expr
= NULL_TREE
;
4832 stmt_vec_info prev_stmt_info
, prev_phi_info
;
4833 bool single_defuse_cycle
= false;
4834 tree reduc_def
= NULL_TREE
;
4835 gimple new_stmt
= NULL
;
4838 bool nested_cycle
= false, found_nested_cycle_def
= false;
4839 gimple reduc_def_stmt
= NULL
;
4840 bool double_reduc
= false, dummy
;
4842 struct loop
* def_stmt_loop
, *outer_loop
= NULL
;
4844 gimple def_arg_stmt
;
4845 auto_vec
<tree
> vec_oprnds0
;
4846 auto_vec
<tree
> vec_oprnds1
;
4847 auto_vec
<tree
> vect_defs
;
4848 auto_vec
<gimple
> phis
;
4850 tree def0
, def1
, tem
, op0
, op1
= NULL_TREE
;
4852 /* In case of reduction chain we switch to the first stmt in the chain, but
4853 we don't update STMT_INFO, since only the last stmt is marked as reduction
4854 and has reduction properties. */
4855 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt
)))
4856 stmt
= GROUP_FIRST_ELEMENT (stmt_info
);
4858 if (nested_in_vect_loop_p (loop
, stmt
))
4862 nested_cycle
= true;
4865 /* 1. Is vectorizable reduction? */
4866 /* Not supportable if the reduction variable is used in the loop, unless
4867 it's a reduction chain. */
4868 if (STMT_VINFO_RELEVANT (stmt_info
) > vect_used_in_outer
4869 && !GROUP_FIRST_ELEMENT (stmt_info
))
4872 /* Reductions that are not used even in an enclosing outer-loop,
4873 are expected to be "live" (used out of the loop). */
4874 if (STMT_VINFO_RELEVANT (stmt_info
) == vect_unused_in_scope
4875 && !STMT_VINFO_LIVE_P (stmt_info
))
4878 /* Make sure it was already recognized as a reduction computation. */
4879 if (STMT_VINFO_DEF_TYPE (stmt_info
) != vect_reduction_def
4880 && STMT_VINFO_DEF_TYPE (stmt_info
) != vect_nested_cycle
)
4883 /* 2. Has this been recognized as a reduction pattern?
4885 Check if STMT represents a pattern that has been recognized
4886 in earlier analysis stages. For stmts that represent a pattern,
4887 the STMT_VINFO_RELATED_STMT field records the last stmt in
4888 the original sequence that constitutes the pattern. */
4890 orig_stmt
= STMT_VINFO_RELATED_STMT (stmt_info
);
4893 orig_stmt_info
= vinfo_for_stmt (orig_stmt
);
4894 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info
));
4895 gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info
));
4898 /* 3. Check the operands of the operation. The first operands are defined
4899 inside the loop body. The last operand is the reduction variable,
4900 which is defined by the loop-header-phi. */
4902 gcc_assert (is_gimple_assign (stmt
));
/* Dispatch on the shape of the RHS to extract the operands and the
   reduction tree-code uniformly into OPS[]/CODE/OP_TYPE.  */
4905 switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt
)))
4907 case GIMPLE_SINGLE_RHS
:
4908 op_type
= TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt
));
4909 if (op_type
== ternary_op
)
4911 tree rhs
= gimple_assign_rhs1 (stmt
);
4912 ops
[0] = TREE_OPERAND (rhs
, 0);
4913 ops
[1] = TREE_OPERAND (rhs
, 1);
4914 ops
[2] = TREE_OPERAND (rhs
, 2);
4915 code
= TREE_CODE (rhs
);
4921 case GIMPLE_BINARY_RHS
:
4922 code
= gimple_assign_rhs_code (stmt
);
4923 op_type
= TREE_CODE_LENGTH (code
);
4924 gcc_assert (op_type
== binary_op
);
4925 ops
[0] = gimple_assign_rhs1 (stmt
);
4926 ops
[1] = gimple_assign_rhs2 (stmt
);
4929 case GIMPLE_TERNARY_RHS
:
4930 code
= gimple_assign_rhs_code (stmt
);
4931 op_type
= TREE_CODE_LENGTH (code
);
4932 gcc_assert (op_type
== ternary_op
);
4933 ops
[0] = gimple_assign_rhs1 (stmt
);
4934 ops
[1] = gimple_assign_rhs2 (stmt
);
4935 ops
[2] = gimple_assign_rhs3 (stmt
);
4938 case GIMPLE_UNARY_RHS
:
4944 /* The default is that the reduction variable is the last in statement. */
4945 int reduc_index
= op_type
- 1;
4947 if (code
== COND_EXPR
&& slp_node
)
4950 scalar_dest
= gimple_assign_lhs (stmt
);
4951 scalar_type
= TREE_TYPE (scalar_dest
);
4952 if (!POINTER_TYPE_P (scalar_type
) && !INTEGRAL_TYPE_P (scalar_type
)
4953 && !SCALAR_FLOAT_TYPE_P (scalar_type
))
4956 /* Do not try to vectorize bit-precision reductions. */
4957 if ((TYPE_PRECISION (scalar_type
)
4958 != GET_MODE_PRECISION (TYPE_MODE (scalar_type
))))
4961 /* All uses but the last are expected to be defined in the loop.
4962 The last use is the reduction variable. In case of nested cycle this
4963 assumption is not true: we use reduc_index to record the index of the
4964 reduction variable. */
4965 for (i
= 0; i
< op_type
- 1; i
++)
4967 /* The condition of COND_EXPR is checked in vectorizable_condition(). */
4968 if (i
== 0 && code
== COND_EXPR
)
4971 is_simple_use
= vect_is_simple_use_1 (ops
[i
], stmt
, loop_vinfo
, NULL
,
4972 &def_stmt
, &def
, &dt
, &tem
);
4975 gcc_assert (is_simple_use
);
4977 if (dt
!= vect_internal_def
4978 && dt
!= vect_external_def
4979 && dt
!= vect_constant_def
4980 && dt
!= vect_induction_def
4981 && !(dt
== vect_nested_cycle
&& nested_cycle
))
4984 if (dt
== vect_nested_cycle
)
4986 found_nested_cycle_def
= true;
4987 reduc_def_stmt
= def_stmt
;
4992 is_simple_use
= vect_is_simple_use_1 (ops
[i
], stmt
, loop_vinfo
, NULL
,
4993 &def_stmt
, &def
, &dt
, &tem
);
4996 gcc_assert (is_simple_use
);
4997 if (!found_nested_cycle_def
)
4998 reduc_def_stmt
= def_stmt
;
/* The reduction variable is expected to be defined by a PHI node
   (the loop-header phi, see the comment for step 3 above).  */
5000 if (reduc_def_stmt
&& gimple_code (reduc_def_stmt
) != GIMPLE_PHI
)
5003 if (!(dt
== vect_reduction_def
5004 || dt
== vect_nested_cycle
5005 || ((dt
== vect_internal_def
|| dt
== vect_external_def
5006 || dt
== vect_constant_def
|| dt
== vect_induction_def
)
5007 && nested_cycle
&& found_nested_cycle_def
)))
5009 /* For pattern recognized stmts, orig_stmt might be a reduction,
5010 but some helper statements for the pattern might not, or
5011 might be COND_EXPRs with reduction uses in the condition. */
5012 gcc_assert (orig_stmt
);
5017 gcc_assert (orig_stmt
== vect_is_simple_reduction (loop_vinfo
,
5023 gimple tmp
= vect_is_simple_reduction (loop_vinfo
, reduc_def_stmt
,
5024 !nested_cycle
, &dummy
);
5025 /* We changed STMT to be the first stmt in reduction chain, hence we
5026 check that in this case the first element in the chain is STMT. */
5027 gcc_assert (stmt
== tmp
5028 || GROUP_FIRST_ELEMENT (vinfo_for_stmt (tmp
)) == stmt
);
5031 if (STMT_VINFO_LIVE_P (vinfo_for_stmt (reduc_def_stmt
)))
5034 if (slp_node
|| PURE_SLP_STMT (stmt_info
))
5037 ncopies
= (LOOP_VINFO_VECT_FACTOR (loop_vinfo
)
5038 / TYPE_VECTOR_SUBPARTS (vectype_in
))
5040 gcc_assert (ncopies
>= 1);
5042 vec_mode
= TYPE_MODE (vectype_in
);
5044 if (code
== COND_EXPR
)
5046 if (!vectorizable_condition (stmt
, gsi
, NULL
, ops
[reduc_index
], 0, NULL
))
5048 if (dump_enabled_p ())
5049 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
5050 "unsupported condition in reduction\n");
5057 /* 4. Supportable by target? */
5059 if (code
== LSHIFT_EXPR
|| code
== RSHIFT_EXPR
5060 || code
== LROTATE_EXPR
|| code
== RROTATE_EXPR
)
5062 /* Shifts and rotates are only supported by vectorizable_shifts,
5063 not vectorizable_reduction. */
5064 if (dump_enabled_p ())
5065 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
5066 "unsupported shift or rotation.\n");
5070 /* 4.1. check support for the operation in the loop */
5071 optab
= optab_for_tree_code (code
, vectype_in
, optab_default
);
5074 if (dump_enabled_p ())
5075 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
5081 if (optab_handler (optab
, vec_mode
) == CODE_FOR_nothing
)
5083 if (dump_enabled_p ())
5084 dump_printf (MSG_NOTE
, "op not supported by target.\n");
5086 if (GET_MODE_SIZE (vec_mode
) != UNITS_PER_WORD
5087 || LOOP_VINFO_VECT_FACTOR (loop_vinfo
)
5088 < vect_min_worthwhile_factor (code
))
5091 if (dump_enabled_p ())
5092 dump_printf (MSG_NOTE
, "proceeding using word mode.\n");
5095 /* Worthwhile without SIMD support? */
5096 if (!VECTOR_MODE_P (TYPE_MODE (vectype_in
))
5097 && LOOP_VINFO_VECT_FACTOR (loop_vinfo
)
5098 < vect_min_worthwhile_factor (code
))
5100 if (dump_enabled_p ())
5101 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
5102 "not worthwhile without SIMD support.\n");
5108 /* 4.2. Check support for the epilog operation.
5110 If STMT represents a reduction pattern, then the type of the
5111 reduction variable may be different than the type of the rest
5112 of the arguments. For example, consider the case of accumulation
5113 of shorts into an int accumulator; The original code:
5114 S1: int_a = (int) short_a;
5115 orig_stmt-> S2: int_acc = plus <int_a ,int_acc>;
5118 STMT: int_acc = widen_sum <short_a, int_acc>
5121 1. The tree-code that is used to create the vector operation in the
5122 epilog code (that reduces the partial results) is not the
5123 tree-code of STMT, but is rather the tree-code of the original
5124 stmt from the pattern that STMT is replacing. I.e, in the example
5125 above we want to use 'widen_sum' in the loop, but 'plus' in the
5127 2. The type (mode) we use to check available target support
5128 for the vector operation to be created in the *epilog*, is
5129 determined by the type of the reduction variable (in the example
5130 above we'd check this: optab_handler (plus_optab, vect_int_mode])).
5131 However the type (mode) we use to check available target support
5132 for the vector operation to be created *inside the loop*, is
5133 determined by the type of the other arguments to STMT (in the
5134 example we'd check this: optab_handler (widen_sum_optab,
5137 This is contrary to "regular" reductions, in which the types of all
5138 the arguments are the same as the type of the reduction variable.
5139 For "regular" reductions we can therefore use the same vector type
5140 (and also the same tree-code) when generating the epilog code and
5141 when generating the code inside the loop. */
5145 /* This is a reduction pattern: get the vectype from the type of the
5146 reduction variable, and get the tree-code from orig_stmt. */
5147 orig_code
= gimple_assign_rhs_code (orig_stmt
);
5148 gcc_assert (vectype_out
);
5149 vec_mode
= TYPE_MODE (vectype_out
);
5153 /* Regular reduction: use the same vectype and tree-code as used for
5154 the vector code inside the loop can be used for the epilog code. */
5160 def_bb
= gimple_bb (reduc_def_stmt
);
5161 def_stmt_loop
= def_bb
->loop_father
;
5162 def_arg
= PHI_ARG_DEF_FROM_EDGE (reduc_def_stmt
,
5163 loop_preheader_edge (def_stmt_loop
));
5164 if (TREE_CODE (def_arg
) == SSA_NAME
5165 && (def_arg_stmt
= SSA_NAME_DEF_STMT (def_arg
))
5166 && gimple_code (def_arg_stmt
) == GIMPLE_PHI
5167 && flow_bb_inside_loop_p (outer_loop
, gimple_bb (def_arg_stmt
))
5168 && vinfo_for_stmt (def_arg_stmt
)
5169 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_arg_stmt
))
5170 == vect_double_reduction_def
)
5171 double_reduc
= true;
/* ERROR_MARK is used as a sentinel meaning "no target support for the
   epilog reduction operation"; the failure paths below reset it to
   ERROR_MARK again after emitting a diagnostic.  */
5174 epilog_reduc_code
= ERROR_MARK
;
5175 if (reduction_code_for_scalar_code (orig_code
, &epilog_reduc_code
))
5177 reduc_optab
= optab_for_tree_code (epilog_reduc_code
, vectype_out
,
5181 if (dump_enabled_p ())
5182 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
5183 "no optab for reduction.\n");
5185 epilog_reduc_code
= ERROR_MARK
;
5187 else if (optab_handler (reduc_optab
, vec_mode
) == CODE_FOR_nothing
)
5189 optab
= scalar_reduc_to_vector (reduc_optab
, vectype_out
);
5190 if (optab_handler (optab
, vec_mode
) == CODE_FOR_nothing
)
5192 if (dump_enabled_p ())
5193 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
5194 "reduc op not supported by target.\n");
5196 epilog_reduc_code
= ERROR_MARK
;
5202 if (!nested_cycle
|| double_reduc
)
5204 if (dump_enabled_p ())
5205 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
5206 "no reduc code for scalar code.\n");
5212 if (double_reduc
&& ncopies
> 1)
5214 if (dump_enabled_p ())
5215 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
5216 "multiple types in double reduction\n");
5221 /* In case of widening multiplication by a constant, we update the type
5222 of the constant to be the type of the other operand. We check that the
5223 constant fits the type in the pattern recognition pass. */
5224 if (code
== DOT_PROD_EXPR
5225 && !types_compatible_p (TREE_TYPE (ops
[0]), TREE_TYPE (ops
[1])))
5227 if (TREE_CODE (ops
[0]) == INTEGER_CST
)
5228 ops
[0] = fold_convert (TREE_TYPE (ops
[1]), ops
[0]);
5229 else if (TREE_CODE (ops
[1]) == INTEGER_CST
)
5230 ops
[1] = fold_convert (TREE_TYPE (ops
[0]), ops
[1]);
5233 if (dump_enabled_p ())
5234 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
5235 "invalid types in dot-prod\n");
5241 if (!vec_stmt
) /* transformation not required. */
5243 if (!vect_model_reduction_cost (stmt_info
, epilog_reduc_code
, ncopies
,
5246 STMT_VINFO_TYPE (stmt_info
) = reduc_vec_info_type
;
5252 if (dump_enabled_p ())
5253 dump_printf_loc (MSG_NOTE
, vect_location
, "transform reduction.\n");
5255 /* FORNOW: Multiple types are not supported for condition. */
5256 if (code
== COND_EXPR
)
5257 gcc_assert (ncopies
== 1);
5259 /* Create the destination vector */
5260 vec_dest
= vect_create_destination_var (scalar_dest
, vectype_out
);
5262 /* In case the vectorization factor (VF) is bigger than the number
5263 of elements that we can fit in a vectype (nunits), we have to generate
5264 more than one vector stmt - i.e - we need to "unroll" the
5265 vector stmt by a factor VF/nunits. For more details see documentation
5266 in vectorizable_operation. */
5268 /* If the reduction is used in an outer loop we need to generate
5269 VF intermediate results, like so (e.g. for ncopies=2):
5274 (i.e. we generate VF results in 2 registers).
5275 In this case we have a separate def-use cycle for each copy, and therefore
5276 for each copy we get the vector def for the reduction variable from the
5277 respective phi node created for this copy.
5279 Otherwise (the reduction is unused in the loop nest), we can combine
5280 together intermediate results, like so (e.g. for ncopies=2):
5284 (i.e. we generate VF/2 results in a single register).
5285 In this case for each copy we get the vector def for the reduction variable
5286 from the vectorized reduction operation generated in the previous iteration.
5289 if (STMT_VINFO_RELEVANT (stmt_info
) == vect_unused_in_scope
)
5291 single_defuse_cycle
= true;
5295 epilog_copies
= ncopies
;
5297 prev_stmt_info
= NULL
;
5298 prev_phi_info
= NULL
;
5301 vec_num
= SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node
);
5302 gcc_assert (TYPE_VECTOR_SUBPARTS (vectype_out
)
5303 == TYPE_VECTOR_SUBPARTS (vectype_in
));
5308 vec_oprnds0
.create (1);
5309 if (op_type
== ternary_op
)
5310 vec_oprnds1
.create (1);
5313 phis
.create (vec_num
);
5314 vect_defs
.create (vec_num
);
5316 vect_defs
.quick_push (NULL_TREE
);
/* Generate NCOPIES copies of the vectorized reduction statement; the
   reduction phi is (re)created per copy only when not in a single
   def-use cycle (see the "j == 0 || !single_defuse_cycle" guard).  */
5318 for (j
= 0; j
< ncopies
; j
++)
5320 if (j
== 0 || !single_defuse_cycle
)
5322 for (i
= 0; i
< vec_num
; i
++)
5324 /* Create the reduction-phi that defines the reduction
5326 new_phi
= create_phi_node (vec_dest
, loop
->header
);
5327 set_vinfo_for_stmt (new_phi
,
5328 new_stmt_vec_info (new_phi
, loop_vinfo
,
5330 if (j
== 0 || slp_node
)
5331 phis
.quick_push (new_phi
);
5335 if (code
== COND_EXPR
)
5337 gcc_assert (!slp_node
);
5338 vectorizable_condition (stmt
, gsi
, vec_stmt
,
5339 PHI_RESULT (phis
[0]),
5341 /* Multiple types are not supported for condition. */
5348 op0
= ops
[!reduc_index
];
5349 if (op_type
== ternary_op
)
5351 if (reduc_index
== 0)
5358 vect_get_vec_defs (op0
, op1
, stmt
, &vec_oprnds0
, &vec_oprnds1
,
5362 loop_vec_def0
= vect_get_vec_def_for_operand (ops
[!reduc_index
],
5364 vec_oprnds0
.quick_push (loop_vec_def0
);
5365 if (op_type
== ternary_op
)
5367 loop_vec_def1
= vect_get_vec_def_for_operand (op1
, stmt
,
5369 vec_oprnds1
.quick_push (loop_vec_def1
);
5377 enum vect_def_type dt
;
5381 vect_is_simple_use (ops
[!reduc_index
], stmt
, loop_vinfo
, NULL
,
5382 &dummy_stmt
, &dummy
, &dt
);
5383 loop_vec_def0
= vect_get_vec_def_for_stmt_copy (dt
,
5385 vec_oprnds0
[0] = loop_vec_def0
;
5386 if (op_type
== ternary_op
)
5388 vect_is_simple_use (op1
, stmt
, loop_vinfo
, NULL
, &dummy_stmt
,
5390 loop_vec_def1
= vect_get_vec_def_for_stmt_copy (dt
,
5392 vec_oprnds1
[0] = loop_vec_def1
;
5396 if (single_defuse_cycle
)
5397 reduc_def
= gimple_assign_lhs (new_stmt
);
5399 STMT_VINFO_RELATED_STMT (prev_phi_info
) = new_phi
;
5402 FOR_EACH_VEC_ELT (vec_oprnds0
, i
, def0
)
5405 reduc_def
= PHI_RESULT (phis
[i
]);
5408 if (!single_defuse_cycle
|| j
== 0)
5409 reduc_def
= PHI_RESULT (new_phi
);
5412 def1
= ((op_type
== ternary_op
)
5413 ? vec_oprnds1
[i
] : NULL
);
/* Build the vector reduction expression, placing REDUC_DEF at the
   operand position recorded in REDUC_INDEX.  */
5414 if (op_type
== binary_op
)
5416 if (reduc_index
== 0)
5417 expr
= build2 (code
, vectype_out
, reduc_def
, def0
);
5419 expr
= build2 (code
, vectype_out
, def0
, reduc_def
);
5423 if (reduc_index
== 0)
5424 expr
= build3 (code
, vectype_out
, reduc_def
, def0
, def1
);
5427 if (reduc_index
== 1)
5428 expr
= build3 (code
, vectype_out
, def0
, reduc_def
, def1
);
5430 expr
= build3 (code
, vectype_out
, def0
, def1
, reduc_def
);
5434 new_stmt
= gimple_build_assign (vec_dest
, expr
);
5435 new_temp
= make_ssa_name (vec_dest
, new_stmt
);
5436 gimple_assign_set_lhs (new_stmt
, new_temp
);
5437 vect_finish_stmt_generation (stmt
, new_stmt
, gsi
);
5441 SLP_TREE_VEC_STMTS (slp_node
).quick_push (new_stmt
);
5442 vect_defs
.quick_push (new_temp
);
5445 vect_defs
[0] = new_temp
;
5452 STMT_VINFO_VEC_STMT (stmt_info
) = *vec_stmt
= new_stmt
;
5454 STMT_VINFO_RELATED_STMT (prev_stmt_info
) = new_stmt
;
5456 prev_stmt_info
= vinfo_for_stmt (new_stmt
);
5457 prev_phi_info
= vinfo_for_stmt (new_phi
);
5460 /* Finalize the reduction-phi (set its arguments) and create the
5461 epilog reduction code. */
5462 if ((!single_defuse_cycle
|| code
== COND_EXPR
) && !slp_node
)
5464 new_temp
= gimple_assign_lhs (*vec_stmt
);
5465 vect_defs
[0] = new_temp
;
5468 vect_create_epilog_for_reduction (vect_defs
, stmt
, epilog_copies
,
5469 epilog_reduc_code
, phis
, reduc_index
,
5470 double_reduc
, slp_node
);
5475 /* Function vect_min_worthwhile_factor.
5477 For a loop where we could vectorize the operation indicated by CODE,
5478 return the minimum vectorization factor that makes it worthwhile
5479 to use generic vectors. */
5481 vect_min_worthwhile_factor (enum tree_code code
)
5502 /* Function vectorizable_induction
5504 Check if PHI performs an induction computation that can be vectorized.
5505 If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
5506 phi to replace it, put it in VEC_STMT, and add it to the same basic block.
5507 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
5510 vectorizable_induction (gimple phi
, gimple_stmt_iterator
*gsi ATTRIBUTE_UNUSED
,
5513 stmt_vec_info stmt_info
= vinfo_for_stmt (phi
);
5514 tree vectype
= STMT_VINFO_VECTYPE (stmt_info
);
5515 loop_vec_info loop_vinfo
= STMT_VINFO_LOOP_VINFO (stmt_info
);
5516 struct loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
5517 int nunits
= TYPE_VECTOR_SUBPARTS (vectype
);
/* NCOPIES = VF / nunits: the number of vector stmts needed to cover one
   vectorization factor's worth of scalar iterations.  */
5518 int ncopies
= LOOP_VINFO_VECT_FACTOR (loop_vinfo
) / nunits
;
5521 gcc_assert (ncopies
>= 1);
5522 /* FORNOW. These restrictions should be relaxed. */
5523 if (nested_in_vect_loop_p (loop
, phi
))
5525 imm_use_iterator imm_iter
;
5526 use_operand_p use_p
;
5533 if (dump_enabled_p ())
5534 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
5535 "multiple types in nested loop.\n");
5540 latch_e
= loop_latch_edge (loop
->inner
);
5541 loop_arg
= PHI_ARG_DEF_FROM_EDGE (phi
, latch_e
);
/* Scan the uses of the latch-edge value: a non-debug use whose bb is
   outside the inner loop is taken as the inner-loop exit phi.  */
5542 FOR_EACH_IMM_USE_FAST (use_p
, imm_iter
, loop_arg
)
5544 gimple use_stmt
= USE_STMT (use_p
);
5545 if (is_gimple_debug (use_stmt
))
5548 if (!flow_bb_inside_loop_p (loop
->inner
, gimple_bb (use_stmt
)))
5550 exit_phi
= use_stmt
;
5556 stmt_vec_info exit_phi_vinfo
= vinfo_for_stmt (exit_phi
);
5557 if (!(STMT_VINFO_RELEVANT_P (exit_phi_vinfo
)
5558 && !STMT_VINFO_LIVE_P (exit_phi_vinfo
)))
5560 if (dump_enabled_p ())
5561 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
5562 "inner-loop induction only used outside "
5563 "of the outer vectorized loop.\n");
5569 if (!STMT_VINFO_RELEVANT_P (stmt_info
))
5572 /* FORNOW: SLP not supported. */
5573 if (STMT_SLP_TYPE (stmt_info
))
5576 gcc_assert (STMT_VINFO_DEF_TYPE (stmt_info
) == vect_induction_def
);
5578 if (gimple_code (phi
) != GIMPLE_PHI
)
5581 if (!vec_stmt
) /* transformation not required. */
5583 STMT_VINFO_TYPE (stmt_info
) = induc_vec_info_type
;
5584 if (dump_enabled_p ())
5585 dump_printf_loc (MSG_NOTE
, vect_location
,
5586 "=== vectorizable_induction ===\n");
5587 vect_model_induction_cost (stmt_info
, ncopies
);
5593 if (dump_enabled_p ())
5594 dump_printf_loc (MSG_NOTE
, vect_location
, "transform induction phi.\n");
5596 vec_def
= get_initial_def_for_induction (phi
);
5597 *vec_stmt
= SSA_NAME_DEF_STMT (vec_def
);
5601 /* Function vectorizable_live_operation.
5603 STMT computes a value that is used outside the loop. Check if
5604 it can be supported. */
/* NOTE(review): extraction damage -- many original lines are missing
   (5605-5606, 5609-5610, 5614-5618, 5622, 5624, 5626-5627, 5629, 5633,
   5635, 5637, 5643, 5645, 5649-5652, 5656-5664, 5666-5667, 5670-5671,
   5677, 5681, 5683, 5686, 5688, 5690-5691, 5695-5697, 5699-5701,
   5703-5704), including braces, returns, and several declarations
   (e.g. VFM1, OP, OP_TYPE, DEF_STMT).  Restore from the original file
   before relying on this text.  */
5607 vectorizable_live_operation (gimple stmt
,
5608 gimple_stmt_iterator
*gsi ATTRIBUTE_UNUSED
,
5611 stmt_vec_info stmt_info
= vinfo_for_stmt (stmt
);
5612 loop_vec_info loop_vinfo
= STMT_VINFO_LOOP_VINFO (stmt_info
);
5613 struct loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
5619 enum vect_def_type dt
;
5620 enum tree_code code
;
5621 enum gimple_rhs_class rhs_class
;
5623 gcc_assert (STMT_VINFO_LIVE_P (stmt_info
));
/* Reductions are handled elsewhere (epilogue generation).  */
5625 if (STMT_VINFO_DEF_TYPE (stmt_info
) == vect_reduction_def
)
5628 if (!is_gimple_assign (stmt
))
/* Special case: a live IFN_GOMP_SIMD_LANE call whose lane result is
   used by a phi in the loop's merge block -- rewrite that phi argument
   to the constant last lane (VF - 1).  */
5630 if (gimple_call_internal_p (stmt
)
5631 && gimple_call_internal_fn (stmt
) == IFN_GOMP_SIMD_LANE
5632 && gimple_call_lhs (stmt
)
5634 && TREE_CODE (gimple_call_arg (stmt
, 0)) == SSA_NAME
5636 == SSA_NAME_VAR (gimple_call_arg (stmt
, 0)))
5638 edge e
= single_exit (loop
);
5639 basic_block merge_bb
= e
->dest
;
5640 imm_use_iterator imm_iter
;
5641 use_operand_p use_p
;
5642 tree lhs
= gimple_call_lhs (stmt
);
5644 FOR_EACH_IMM_USE_FAST (use_p
, imm_iter
, lhs
)
5646 gimple use_stmt
= USE_STMT (use_p
);
5647 if (gimple_code (use_stmt
) == GIMPLE_PHI
5648 && gimple_bb (use_stmt
) == merge_bb
)
/* The assignment target (presumably a tree named VFM1 -- its
   declaration line is missing from this chunk) gets VF - 1.  */
5653 = build_int_cst (unsigned_type_node
,
5654 loop_vinfo
->vectorization_factor
- 1);
5655 SET_PHI_ARG_DEF (use_stmt
, e
->dest_idx
, vfm1
);
5665 if (TREE_CODE (gimple_assign_lhs (stmt
)) != SSA_NAME
)
5668 /* FORNOW. CHECKME. */
5669 if (nested_in_vect_loop_p (loop
, stmt
))
/* Classify the RHS and sanity-check the operand count against the
   rhs class.  */
5672 code
= gimple_assign_rhs_code (stmt
);
5673 op_type
= TREE_CODE_LENGTH (code
);
5674 rhs_class
= get_gimple_rhs_class (code
);
5675 gcc_assert (rhs_class
!= GIMPLE_UNARY_RHS
|| op_type
== unary_op
);
5676 gcc_assert (rhs_class
!= GIMPLE_BINARY_RHS
|| op_type
== binary_op
);
5678 /* FORNOW: support only if all uses are invariant. This means
5679 that the scalar operations can remain in place, unvectorized.
5680 The original last scalar value that they compute will be used. */
5682 for (i
= 0; i
< op_type
; i
++)
5684 if (rhs_class
== GIMPLE_SINGLE_RHS
)
5685 op
= TREE_OPERAND (gimple_op (stmt
, 1), i
);
5687 op
= gimple_op (stmt
, i
+ 1);
/* Each operand must be a "simple use"; bail out otherwise.  */
5689 && !vect_is_simple_use (op
, stmt
, loop_vinfo
, NULL
, &def_stmt
, &def
,
5692 if (dump_enabled_p ())
5693 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
5694 "use not simple.\n");
/* Only loop-invariant operands (external or constant defs) are
   supported.  */
5698 if (dt
!= vect_external_def
&& dt
!= vect_constant_def
)
5702 /* No transformation is required for the cases we currently support. */
5706 /* Kill any debug uses outside LOOP of SSA names defined in STMT. */
/* NOTE(review): extraction damage -- original lines 5707-5708, 5710,
   5714-5715, 5717, 5719-5721, 5723-5724, 5726, 5728, 5730, 5734,
   5737-5744 are missing (return type, braces, and the USTMT/BB
   declarations among them).  Restore from the original file.  */
5709 vect_loop_kill_debug_uses (struct loop
*loop
, gimple stmt
)
5711 ssa_op_iter op_iter
;
5712 imm_use_iterator imm_iter
;
5713 def_operand_p def_p
;
/* For every SSA name STMT (or phi) defines, visit each immediate use;
   only debug statements are of interest here.  */
5716 FOR_EACH_PHI_OR_STMT_DEF (def_p
, stmt
, op_iter
, SSA_OP_DEF
)
5718 FOR_EACH_IMM_USE_STMT (ustmt
, imm_iter
, DEF_FROM_PTR (def_p
))
5722 if (!is_gimple_debug (ustmt
))
5725 bb
= gimple_bb (ustmt
);
/* Uses inside LOOP stay valid after vectorization; only uses outside
   the loop must have their bound value reset.  */
5727 if (!flow_bb_inside_loop_p (loop
, bb
))
5729 if (gimple_debug_bind_p (ustmt
))
5731 if (dump_enabled_p ())
5732 dump_printf_loc (MSG_NOTE
, vect_location
,
5733 "killing debug use\n");
5735 gimple_debug_bind_reset_value (ustmt
);
5736 update_stmt (ustmt
);
5746 /* This function builds ni_name = number of iterations. Statements
5747 are emitted on the loop preheader edge. */
/* NOTE(review): extraction damage -- original lines 5748-5749, 5751,
   5754-5757, 5760, 5763, 5765-5768 are missing (return type, braces,
   the NI_NAME/VAR declarations, the constant-NITERS early return, and
   the final return).  Restore from the original file.  */
5750 vect_build_loop_niters (loop_vec_info loop_vinfo
)
5752 tree ni
= unshare_expr (LOOP_VINFO_NITERS (loop_vinfo
));
/* A compile-time-constant iteration count needs no statements; the
   (missing) branch body presumably returns NI directly -- confirm
   against the original file.  */
5753 if (TREE_CODE (ni
) == INTEGER_CST
)
5758 gimple_seq stmts
= NULL
;
5759 edge pe
= loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo
));
/* Gimplify NI into a temporary and emit any needed statements on the
   preheader edge.  */
5761 var
= create_tmp_var (TREE_TYPE (ni
), "niters");
5762 ni_name
= force_gimple_operand (ni
, &stmts
, false, var
);
5764 gsi_insert_seq_on_edge_immediate (pe
, stmts
);
5771 /* This function generates the following statements:
5773 ni_name = number of iterations loop executes
5774 ratio = ni_name / vf
5775 ratio_mult_vf_name = ratio * vf
5777 and places them on the loop preheader edge. */
/* NOTE(review): extraction damage -- original lines 5772, 5776,
   5778-5779, 5781, 5784, 5786-5787, 5791-5792, 5794, 5799, 5801, 5804,
   5808, 5810-5812, 5814, 5820, 5824-5825, 5827, 5830, 5835, 5837, 5839,
   5841, 5845, 5849, 5851, 5853-5857 are missing (the NI_NAME parameter,
   braces, the RATIO_NAME/LOG_VF/VAR declarations, parts of the ratio
   expression, and trailing force_gimple_operand arguments).  Restore
   from the original file.  */
5780 vect_generate_tmps_on_preheader (loop_vec_info loop_vinfo
,
5782 tree
*ratio_mult_vf_name_ptr
,
5783 tree
*ratio_name_ptr
)
5785 tree ni_minus_gap_name
;
5788 tree ratio_mult_vf_name
;
5789 int vf
= LOOP_VINFO_VECT_FACTOR (loop_vinfo
);
5790 edge pe
= loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo
));
/* LOG_VF = log2 (VF); used to compute the ratio via shifts.  */
5793 log_vf
= build_int_cst (TREE_TYPE (ni_name
), exact_log2 (vf
));
5795 /* If epilogue loop is required because of data accesses with gaps, we
5796 subtract one iteration from the total number of iterations here for
5797 correct calculation of RATIO. */
5798 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo
))
5800 ni_minus_gap_name
= fold_build2 (MINUS_EXPR
, TREE_TYPE (ni_name
),
5802 build_one_cst (TREE_TYPE (ni_name
)));
/* Gimplify NI - 1 if it is not already a valid GIMPLE value.  */
5803 if (!is_gimple_val (ni_minus_gap_name
))
5805 var
= create_tmp_var (TREE_TYPE (ni_name
), "ni_gap");
5806 gimple stmts
= NULL
;
5807 ni_minus_gap_name
= force_gimple_operand (ni_minus_gap_name
, &stmts
,
5809 gsi_insert_seq_on_edge_immediate (pe
, stmts
);
5813 ni_minus_gap_name
= ni_name
;
5815 /* Create: ratio = ni >> log2(vf) */
5816 /* ??? As we have ni == number of latch executions + 1, ni could
5817 have overflown to zero. So avoid computing ratio based on ni
5818 but compute it using the fact that we know ratio will be at least
5819 one, thus via (ni - vf) >> log2(vf) + 1. */
/* The assignment target here (presumably RATIO_NAME) lost its
   declaration line in extraction -- confirm against the original.  */
5821 = fold_build2 (PLUS_EXPR
, TREE_TYPE (ni_name
),
5822 fold_build2 (RSHIFT_EXPR
, TREE_TYPE (ni_name
),
5823 fold_build2 (MINUS_EXPR
, TREE_TYPE (ni_name
),
5826 (TREE_TYPE (ni_name
), vf
)),
5828 build_int_cst (TREE_TYPE (ni_name
), 1));
5829 if (!is_gimple_val (ratio_name
))
5831 var
= create_tmp_var (TREE_TYPE (ni_name
), "bnd");
5832 gimple stmts
= NULL
;
5833 ratio_name
= force_gimple_operand (ratio_name
, &stmts
, true, var
);
5834 gsi_insert_seq_on_edge_immediate (pe
, stmts
);
5836 *ratio_name_ptr
= ratio_name
;
5838 /* Create: ratio_mult_vf = ratio << log2 (vf). */
/* Only computed when the caller asked for it (non-null out pointer).  */
5840 if (ratio_mult_vf_name_ptr
)
5842 ratio_mult_vf_name
= fold_build2 (LSHIFT_EXPR
, TREE_TYPE (ratio_name
),
5843 ratio_name
, log_vf
);
5844 if (!is_gimple_val (ratio_mult_vf_name
))
5846 var
= create_tmp_var (TREE_TYPE (ni_name
), "ratio_mult_vf");
5847 gimple stmts
= NULL
;
5848 ratio_mult_vf_name
= force_gimple_operand (ratio_mult_vf_name
, &stmts
,
5850 gsi_insert_seq_on_edge_immediate (pe
, stmts
);
5852 *ratio_mult_vf_name_ptr
= ratio_mult_vf_name
;
5859 /* Function vect_transform_loop.
5861 The analysis phase has determined that the loop is vectorizable.
5862 Vectorize the loop - created vectorized stmts to replace the scalar
5863 stmts in the loop, and update the loop exit condition. */
5866 vect_transform_loop (loop_vec_info loop_vinfo
)
5868 struct loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
5869 basic_block
*bbs
= LOOP_VINFO_BBS (loop_vinfo
);
5870 int nbbs
= loop
->num_nodes
;
5873 int vectorization_factor
= LOOP_VINFO_VECT_FACTOR (loop_vinfo
);
5875 bool slp_scheduled
= false;
5876 gimple stmt
, pattern_stmt
;
5877 gimple_seq pattern_def_seq
= NULL
;
5878 gimple_stmt_iterator pattern_def_si
= gsi_none ();
5879 bool transform_pattern_stmt
= false;
5880 bool check_profitability
= false;
5882 /* Record number of iterations before we started tampering with the profile. */
5883 gcov_type expected_iterations
= expected_loop_iterations_unbounded (loop
);
5885 if (dump_enabled_p ())
5886 dump_printf_loc (MSG_NOTE
, vect_location
, "=== vec_transform_loop ===\n");
5888 /* If profile is inprecise, we have chance to fix it up. */
5889 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo
))
5890 expected_iterations
= LOOP_VINFO_INT_NITERS (loop_vinfo
);
5892 /* Use the more conservative vectorization threshold. If the number
5893 of iterations is constant assume the cost check has been performed
5894 by our caller. If the threshold makes all loops profitable that
5895 run at least the vectorization factor number of times checking
5896 is pointless, too. */
5897 th
= LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo
);
5898 if (th
>= LOOP_VINFO_VECT_FACTOR (loop_vinfo
) - 1
5899 && !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo
))
5901 if (dump_enabled_p ())
5902 dump_printf_loc (MSG_NOTE
, vect_location
,
5903 "Profitability threshold is %d loop iterations.\n",
5905 check_profitability
= true;
5908 /* Version the loop first, if required, so the profitability check
5911 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo
)
5912 || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo
))
5914 vect_loop_versioning (loop_vinfo
, th
, check_profitability
);
5915 check_profitability
= false;
5918 tree ni_name
= vect_build_loop_niters (loop_vinfo
);
5919 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo
) = ni_name
;
5921 /* Peel the loop if there are data refs with unknown alignment.
5922 Only one data ref with unknown store is allowed. */
5924 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo
))
5926 vect_do_peeling_for_alignment (loop_vinfo
, ni_name
,
5927 th
, check_profitability
);
5928 check_profitability
= false;
5929 /* The above adjusts LOOP_VINFO_NITERS, so cause ni_name to
5931 ni_name
= NULL_TREE
;
5934 /* If the loop has a symbolic number of iterations 'n' (i.e. it's not a
5935 compile time constant), or it is a constant that doesn't divide by the
5936 vectorization factor, then an epilog loop needs to be created.
5937 We therefore duplicate the loop: the original loop will be vectorized,
5938 and will compute the first (n/VF) iterations. The second copy of the loop
5939 will remain scalar and will compute the remaining (n%VF) iterations.
5940 (VF is the vectorization factor). */
5942 if (LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo
)
5943 || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo
))
5947 ni_name
= vect_build_loop_niters (loop_vinfo
);
5948 vect_generate_tmps_on_preheader (loop_vinfo
, ni_name
, &ratio_mult_vf
,
5950 vect_do_peeling_for_loop_bound (loop_vinfo
, ni_name
, ratio_mult_vf
,
5951 th
, check_profitability
);
5953 else if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo
))
5954 ratio
= build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo
)),
5955 LOOP_VINFO_INT_NITERS (loop_vinfo
) / vectorization_factor
);
5959 ni_name
= vect_build_loop_niters (loop_vinfo
);
5960 vect_generate_tmps_on_preheader (loop_vinfo
, ni_name
, NULL
, &ratio
);
5963 /* 1) Make sure the loop header has exactly two entries
5964 2) Make sure we have a preheader basic block. */
5966 gcc_assert (EDGE_COUNT (loop
->header
->preds
) == 2);
5968 split_edge (loop_preheader_edge (loop
));
5970 /* FORNOW: the vectorizer supports only loops which body consist
5971 of one basic block (header + empty latch). When the vectorizer will
5972 support more involved loop forms, the order by which the BBs are
5973 traversed need to be reconsidered. */
5975 for (i
= 0; i
< nbbs
; i
++)
5977 basic_block bb
= bbs
[i
];
5978 stmt_vec_info stmt_info
;
5980 for (gphi_iterator si
= gsi_start_phis (bb
); !gsi_end_p (si
);
5983 gphi
*phi
= si
.phi ();
5984 if (dump_enabled_p ())
5986 dump_printf_loc (MSG_NOTE
, vect_location
,
5987 "------>vectorizing phi: ");
5988 dump_gimple_stmt (MSG_NOTE
, TDF_SLIM
, phi
, 0);
5989 dump_printf (MSG_NOTE
, "\n");
5991 stmt_info
= vinfo_for_stmt (phi
);
5995 if (MAY_HAVE_DEBUG_STMTS
&& !STMT_VINFO_LIVE_P (stmt_info
))
5996 vect_loop_kill_debug_uses (loop
, phi
);
5998 if (!STMT_VINFO_RELEVANT_P (stmt_info
)
5999 && !STMT_VINFO_LIVE_P (stmt_info
))
6002 if (STMT_VINFO_VECTYPE (stmt_info
)
6003 && (TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info
))
6004 != (unsigned HOST_WIDE_INT
) vectorization_factor
)
6005 && dump_enabled_p ())
6006 dump_printf_loc (MSG_NOTE
, vect_location
, "multiple-types.\n");
6008 if (STMT_VINFO_DEF_TYPE (stmt_info
) == vect_induction_def
)
6010 if (dump_enabled_p ())
6011 dump_printf_loc (MSG_NOTE
, vect_location
, "transform phi.\n");
6012 vect_transform_stmt (phi
, NULL
, NULL
, NULL
, NULL
);
6016 pattern_stmt
= NULL
;
6017 for (gimple_stmt_iterator si
= gsi_start_bb (bb
);
6018 !gsi_end_p (si
) || transform_pattern_stmt
;)
6022 if (transform_pattern_stmt
)
6023 stmt
= pattern_stmt
;
6026 stmt
= gsi_stmt (si
);
6027 /* During vectorization remove existing clobber stmts. */
6028 if (gimple_clobber_p (stmt
))
6030 unlink_stmt_vdef (stmt
);
6031 gsi_remove (&si
, true);
6032 release_defs (stmt
);
6037 if (dump_enabled_p ())
6039 dump_printf_loc (MSG_NOTE
, vect_location
,
6040 "------>vectorizing statement: ");
6041 dump_gimple_stmt (MSG_NOTE
, TDF_SLIM
, stmt
, 0);
6042 dump_printf (MSG_NOTE
, "\n");
6045 stmt_info
= vinfo_for_stmt (stmt
);
6047 /* vector stmts created in the outer-loop during vectorization of
6048 stmts in an inner-loop may not have a stmt_info, and do not
6049 need to be vectorized. */
6056 if (MAY_HAVE_DEBUG_STMTS
&& !STMT_VINFO_LIVE_P (stmt_info
))
6057 vect_loop_kill_debug_uses (loop
, stmt
);
6059 if (!STMT_VINFO_RELEVANT_P (stmt_info
)
6060 && !STMT_VINFO_LIVE_P (stmt_info
))
6062 if (STMT_VINFO_IN_PATTERN_P (stmt_info
)
6063 && (pattern_stmt
= STMT_VINFO_RELATED_STMT (stmt_info
))
6064 && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt
))
6065 || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt
))))
6067 stmt
= pattern_stmt
;
6068 stmt_info
= vinfo_for_stmt (stmt
);
6076 else if (STMT_VINFO_IN_PATTERN_P (stmt_info
)
6077 && (pattern_stmt
= STMT_VINFO_RELATED_STMT (stmt_info
))
6078 && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt
))
6079 || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt
))))
6080 transform_pattern_stmt
= true;
6082 /* If pattern statement has def stmts, vectorize them too. */
6083 if (is_pattern_stmt_p (stmt_info
))
6085 if (pattern_def_seq
== NULL
)
6087 pattern_def_seq
= STMT_VINFO_PATTERN_DEF_SEQ (stmt_info
);
6088 pattern_def_si
= gsi_start (pattern_def_seq
);
6090 else if (!gsi_end_p (pattern_def_si
))
6091 gsi_next (&pattern_def_si
);
6092 if (pattern_def_seq
!= NULL
)
6094 gimple pattern_def_stmt
= NULL
;
6095 stmt_vec_info pattern_def_stmt_info
= NULL
;
6097 while (!gsi_end_p (pattern_def_si
))
6099 pattern_def_stmt
= gsi_stmt (pattern_def_si
);
6100 pattern_def_stmt_info
6101 = vinfo_for_stmt (pattern_def_stmt
);
6102 if (STMT_VINFO_RELEVANT_P (pattern_def_stmt_info
)
6103 || STMT_VINFO_LIVE_P (pattern_def_stmt_info
))
6105 gsi_next (&pattern_def_si
);
6108 if (!gsi_end_p (pattern_def_si
))
6110 if (dump_enabled_p ())
6112 dump_printf_loc (MSG_NOTE
, vect_location
,
6113 "==> vectorizing pattern def "
6115 dump_gimple_stmt (MSG_NOTE
, TDF_SLIM
,
6116 pattern_def_stmt
, 0);
6117 dump_printf (MSG_NOTE
, "\n");
6120 stmt
= pattern_def_stmt
;
6121 stmt_info
= pattern_def_stmt_info
;
6125 pattern_def_si
= gsi_none ();
6126 transform_pattern_stmt
= false;
6130 transform_pattern_stmt
= false;
6133 if (STMT_VINFO_VECTYPE (stmt_info
))
6137 TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info
));
6138 if (!STMT_SLP_TYPE (stmt_info
)
6139 && nunits
!= (unsigned int) vectorization_factor
6140 && dump_enabled_p ())
6141 /* For SLP VF is set according to unrolling factor, and not
6142 to vector size, hence for SLP this print is not valid. */
6143 dump_printf_loc (MSG_NOTE
, vect_location
, "multiple-types.\n");
6146 /* SLP. Schedule all the SLP instances when the first SLP stmt is
6148 if (STMT_SLP_TYPE (stmt_info
))
6152 slp_scheduled
= true;
6154 if (dump_enabled_p ())
6155 dump_printf_loc (MSG_NOTE
, vect_location
,
6156 "=== scheduling SLP instances ===\n");
6158 vect_schedule_slp (loop_vinfo
, NULL
);
6161 /* Hybrid SLP stmts must be vectorized in addition to SLP. */
6162 if (!vinfo_for_stmt (stmt
) || PURE_SLP_STMT (stmt_info
))
6164 if (!transform_pattern_stmt
&& gsi_end_p (pattern_def_si
))
6166 pattern_def_seq
= NULL
;
6173 /* -------- vectorize statement ------------ */
6174 if (dump_enabled_p ())
6175 dump_printf_loc (MSG_NOTE
, vect_location
, "transform statement.\n");
6177 grouped_store
= false;
6178 is_store
= vect_transform_stmt (stmt
, &si
, &grouped_store
, NULL
, NULL
);
6181 if (STMT_VINFO_GROUPED_ACCESS (stmt_info
))
6183 /* Interleaving. If IS_STORE is TRUE, the vectorization of the
6184 interleaving chain was completed - free all the stores in
6187 vect_remove_stores (GROUP_FIRST_ELEMENT (stmt_info
));
6191 /* Free the attached stmt_vec_info and remove the stmt. */
6192 gimple store
= gsi_stmt (si
);
6193 free_stmt_vec_info (store
);
6194 unlink_stmt_vdef (store
);
6195 gsi_remove (&si
, true);
6196 release_defs (store
);
6199 /* Stores can only appear at the end of pattern statements. */
6200 gcc_assert (!transform_pattern_stmt
);
6201 pattern_def_seq
= NULL
;
6203 else if (!transform_pattern_stmt
&& gsi_end_p (pattern_def_si
))
6205 pattern_def_seq
= NULL
;
6211 slpeel_make_loop_iterate_ntimes (loop
, ratio
);
6213 /* Reduce loop iterations by the vectorization factor. */
6214 scale_loop_profile (loop
, GCOV_COMPUTE_SCALE (1, vectorization_factor
),
6215 expected_iterations
/ vectorization_factor
);
6216 loop
->nb_iterations_upper_bound
6217 = wi::udiv_floor (loop
->nb_iterations_upper_bound
, vectorization_factor
);
6218 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo
)
6219 && loop
->nb_iterations_upper_bound
!= 0)
6220 loop
->nb_iterations_upper_bound
= loop
->nb_iterations_upper_bound
- 1;
6221 if (loop
->any_estimate
)
6223 loop
->nb_iterations_estimate
6224 = wi::udiv_floor (loop
->nb_iterations_estimate
, vectorization_factor
);
6225 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo
)
6226 && loop
->nb_iterations_estimate
!= 0)
6227 loop
->nb_iterations_estimate
= loop
->nb_iterations_estimate
- 1;
6230 if (dump_enabled_p ())
6232 dump_printf_loc (MSG_NOTE
, vect_location
,
6233 "LOOP VECTORIZED\n");
6235 dump_printf_loc (MSG_NOTE
, vect_location
,
6236 "OUTER LOOP VECTORIZED\n");
6237 dump_printf (MSG_NOTE
, "\n");