2 Copyright (C) 2003-2014 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com> and
4 Ira Rosen <irar@il.ibm.com>
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
24 #include "coretypes.h"
28 #include "stor-layout.h"
34 #include "hard-reg-set.h"
37 #include "dominance.h"
40 #include "basic-block.h"
41 #include "gimple-pretty-print.h"
42 #include "tree-ssa-alias.h"
43 #include "internal-fn.h"
44 #include "gimple-expr.h"
48 #include "gimple-iterator.h"
49 #include "gimplify-me.h"
50 #include "gimple-ssa.h"
51 #include "tree-phinodes.h"
52 #include "ssa-iterators.h"
53 #include "stringpool.h"
54 #include "tree-ssanames.h"
55 #include "tree-ssa-loop-ivopts.h"
56 #include "tree-ssa-loop-manip.h"
57 #include "tree-ssa-loop-niter.h"
58 #include "tree-pass.h"
62 #include "insn-codes.h"
65 #include "diagnostic-core.h"
66 #include "tree-chrec.h"
67 #include "tree-scalar-evolution.h"
68 #include "tree-vectorizer.h"
71 /* Loop Vectorization Pass.
73 This pass tries to vectorize loops.
75 For example, the vectorizer transforms the following simple loop:
77 short a[N]; short b[N]; short c[N]; int i;
83 as if it was manually vectorized by rewriting the source code into:
85 typedef int __attribute__((mode(V8HI))) v8hi;
86 short a[N]; short b[N]; short c[N]; int i;
87 v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
90 for (i=0; i<N/8; i++){
97 The main entry to this pass is vectorize_loops(), in which
98 the vectorizer applies a set of analyses on a given set of loops,
99 followed by the actual vectorization transformation for the loops that
100 had successfully passed the analysis phase.
101 Throughout this pass we make a distinction between two types of
102 data: scalars (which are represented by SSA_NAMES), and memory references
103 ("data-refs"). These two types of data require different handling both
104 during analysis and transformation. The types of data-refs that the
105 vectorizer currently supports are ARRAY_REFS which base is an array DECL
106 (not a pointer), and INDIRECT_REFS through pointers; both array and pointer
107 accesses are required to have a simple (consecutive) access pattern.
111 The driver for the analysis phase is vect_analyze_loop().
112 It applies a set of analyses, some of which rely on the scalar evolution
113 analyzer (scev) developed by Sebastian Pop.
115 During the analysis phase the vectorizer records some information
116 per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
117 loop, as well as general information about the loop as a whole, which is
118 recorded in a "loop_vec_info" struct attached to each loop.
120 Transformation phase:
121 =====================
122 The loop transformation phase scans all the stmts in the loop, and
123 creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
124 the loop that needs to be vectorized. It inserts the vector code sequence
125 just before the scalar stmt S, and records a pointer to the vector code
126 in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
127 attached to S). This pointer will be used for the vectorization of following
128 stmts which use the def of stmt S. Stmt S is removed if it writes to memory;
129 otherwise, we rely on dead code elimination for removing it.
131 For example, say stmt S1 was vectorized into stmt VS1:
134 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
137 To vectorize stmt S2, the vectorizer first finds the stmt that defines
138 the operand 'b' (S1), and gets the relevant vector def 'vb' from the
139 vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)). The
140 resulting sequence would be:
143 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
145 S2: a = b; STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2
147 Operands that are not SSA_NAMEs, are data-refs that appear in
148 load/store operations (like 'x[i]' in S1), and are handled differently.
152 Currently the only target specific information that is used is the
153 size of the vector (in bytes) - "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".
154 Targets that can support different sizes of vectors, for now will need
155 to specify one value for "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD". More
156 flexibility will be added in the future.
158 Since we only vectorize operations which vector form can be
159 expressed using existing tree codes, to verify that an operation is
160 supported, the vectorizer checks the relevant optab at the relevant
161 machine_mode (e.g, optab_handler (add_optab, V8HImode)). If
162 the value found is CODE_FOR_nothing, then there's no target support, and
163 we can't vectorize the stmt.
165 For additional information on this project see:
166 http://gcc.gnu.org/projects/tree-ssa/vectorization.html
169 static void vect_estimate_min_profitable_iters (loop_vec_info
, int *, int *);
171 /* Function vect_determine_vectorization_factor
173 Determine the vectorization factor (VF). VF is the number of data elements
174 that are operated upon in parallel in a single iteration of the vectorized
175 loop. For example, when vectorizing a loop that operates on 4byte elements,
176 on a target with vector size (VS) 16byte, the VF is set to 4, since 4
177 elements can fit in a single vector register.
179 We currently support vectorization of loops in which all types operated upon
180 are of the same size. Therefore this function currently sets VF according to
181 the size of the types operated upon, and fails if there are multiple sizes
184 VF is also the factor by which the loop iterations are strip-mined, e.g.:
191 for (i=0; i<N; i+=VF){
192 a[i:VF] = b[i:VF] + c[i:VF];
197 vect_determine_vectorization_factor (loop_vec_info loop_vinfo
)
199 struct loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
200 basic_block
*bbs
= LOOP_VINFO_BBS (loop_vinfo
);
201 int nbbs
= loop
->num_nodes
;
202 unsigned int vectorization_factor
= 0;
207 stmt_vec_info stmt_info
;
210 gimple stmt
, pattern_stmt
= NULL
;
211 gimple_seq pattern_def_seq
= NULL
;
212 gimple_stmt_iterator pattern_def_si
= gsi_none ();
213 bool analyze_pattern_stmt
= false;
215 if (dump_enabled_p ())
216 dump_printf_loc (MSG_NOTE
, vect_location
,
217 "=== vect_determine_vectorization_factor ===\n");
219 for (i
= 0; i
< nbbs
; i
++)
221 basic_block bb
= bbs
[i
];
223 for (gphi_iterator si
= gsi_start_phis (bb
); !gsi_end_p (si
);
227 stmt_info
= vinfo_for_stmt (phi
);
228 if (dump_enabled_p ())
230 dump_printf_loc (MSG_NOTE
, vect_location
, "==> examining phi: ");
231 dump_gimple_stmt (MSG_NOTE
, TDF_SLIM
, phi
, 0);
232 dump_printf (MSG_NOTE
, "\n");
235 gcc_assert (stmt_info
);
237 if (STMT_VINFO_RELEVANT_P (stmt_info
))
239 gcc_assert (!STMT_VINFO_VECTYPE (stmt_info
));
240 scalar_type
= TREE_TYPE (PHI_RESULT (phi
));
242 if (dump_enabled_p ())
244 dump_printf_loc (MSG_NOTE
, vect_location
,
245 "get vectype for scalar type: ");
246 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, scalar_type
);
247 dump_printf (MSG_NOTE
, "\n");
250 vectype
= get_vectype_for_scalar_type (scalar_type
);
253 if (dump_enabled_p ())
255 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
256 "not vectorized: unsupported "
258 dump_generic_expr (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
,
260 dump_printf (MSG_MISSED_OPTIMIZATION
, "\n");
264 STMT_VINFO_VECTYPE (stmt_info
) = vectype
;
266 if (dump_enabled_p ())
268 dump_printf_loc (MSG_NOTE
, vect_location
, "vectype: ");
269 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, vectype
);
270 dump_printf (MSG_NOTE
, "\n");
273 nunits
= TYPE_VECTOR_SUBPARTS (vectype
);
274 if (dump_enabled_p ())
275 dump_printf_loc (MSG_NOTE
, vect_location
, "nunits = %d\n",
278 if (!vectorization_factor
279 || (nunits
> vectorization_factor
))
280 vectorization_factor
= nunits
;
284 for (gimple_stmt_iterator si
= gsi_start_bb (bb
);
285 !gsi_end_p (si
) || analyze_pattern_stmt
;)
289 if (analyze_pattern_stmt
)
292 stmt
= gsi_stmt (si
);
294 stmt_info
= vinfo_for_stmt (stmt
);
296 if (dump_enabled_p ())
298 dump_printf_loc (MSG_NOTE
, vect_location
,
299 "==> examining statement: ");
300 dump_gimple_stmt (MSG_NOTE
, TDF_SLIM
, stmt
, 0);
301 dump_printf (MSG_NOTE
, "\n");
304 gcc_assert (stmt_info
);
306 /* Skip stmts which do not need to be vectorized. */
307 if ((!STMT_VINFO_RELEVANT_P (stmt_info
)
308 && !STMT_VINFO_LIVE_P (stmt_info
))
309 || gimple_clobber_p (stmt
))
311 if (STMT_VINFO_IN_PATTERN_P (stmt_info
)
312 && (pattern_stmt
= STMT_VINFO_RELATED_STMT (stmt_info
))
313 && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt
))
314 || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt
))))
317 stmt_info
= vinfo_for_stmt (pattern_stmt
);
318 if (dump_enabled_p ())
320 dump_printf_loc (MSG_NOTE
, vect_location
,
321 "==> examining pattern statement: ");
322 dump_gimple_stmt (MSG_NOTE
, TDF_SLIM
, stmt
, 0);
323 dump_printf (MSG_NOTE
, "\n");
328 if (dump_enabled_p ())
329 dump_printf_loc (MSG_NOTE
, vect_location
, "skip.\n");
334 else if (STMT_VINFO_IN_PATTERN_P (stmt_info
)
335 && (pattern_stmt
= STMT_VINFO_RELATED_STMT (stmt_info
))
336 && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt
))
337 || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt
))))
338 analyze_pattern_stmt
= true;
340 /* If a pattern statement has def stmts, analyze them too. */
341 if (is_pattern_stmt_p (stmt_info
))
343 if (pattern_def_seq
== NULL
)
345 pattern_def_seq
= STMT_VINFO_PATTERN_DEF_SEQ (stmt_info
);
346 pattern_def_si
= gsi_start (pattern_def_seq
);
348 else if (!gsi_end_p (pattern_def_si
))
349 gsi_next (&pattern_def_si
);
350 if (pattern_def_seq
!= NULL
)
352 gimple pattern_def_stmt
= NULL
;
353 stmt_vec_info pattern_def_stmt_info
= NULL
;
355 while (!gsi_end_p (pattern_def_si
))
357 pattern_def_stmt
= gsi_stmt (pattern_def_si
);
358 pattern_def_stmt_info
359 = vinfo_for_stmt (pattern_def_stmt
);
360 if (STMT_VINFO_RELEVANT_P (pattern_def_stmt_info
)
361 || STMT_VINFO_LIVE_P (pattern_def_stmt_info
))
363 gsi_next (&pattern_def_si
);
366 if (!gsi_end_p (pattern_def_si
))
368 if (dump_enabled_p ())
370 dump_printf_loc (MSG_NOTE
, vect_location
,
371 "==> examining pattern def stmt: ");
372 dump_gimple_stmt (MSG_NOTE
, TDF_SLIM
,
373 pattern_def_stmt
, 0);
374 dump_printf (MSG_NOTE
, "\n");
377 stmt
= pattern_def_stmt
;
378 stmt_info
= pattern_def_stmt_info
;
382 pattern_def_si
= gsi_none ();
383 analyze_pattern_stmt
= false;
387 analyze_pattern_stmt
= false;
390 if (gimple_get_lhs (stmt
) == NULL_TREE
391 /* MASK_STORE has no lhs, but is ok. */
392 && (!is_gimple_call (stmt
)
393 || !gimple_call_internal_p (stmt
)
394 || gimple_call_internal_fn (stmt
) != IFN_MASK_STORE
))
396 if (is_gimple_call (stmt
))
398 /* Ignore calls with no lhs. These must be calls to
399 #pragma omp simd functions, and what vectorization factor
400 it really needs can't be determined until
401 vectorizable_simd_clone_call. */
402 if (!analyze_pattern_stmt
&& gsi_end_p (pattern_def_si
))
404 pattern_def_seq
= NULL
;
409 if (dump_enabled_p ())
411 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
412 "not vectorized: irregular stmt.");
413 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, stmt
,
415 dump_printf (MSG_MISSED_OPTIMIZATION
, "\n");
420 if (VECTOR_MODE_P (TYPE_MODE (gimple_expr_type (stmt
))))
422 if (dump_enabled_p ())
424 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
425 "not vectorized: vector stmt in loop:");
426 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, stmt
, 0);
427 dump_printf (MSG_MISSED_OPTIMIZATION
, "\n");
432 if (STMT_VINFO_VECTYPE (stmt_info
))
434 /* The only case when a vectype had been already set is for stmts
435 that contain a dataref, or for "pattern-stmts" (stmts
436 generated by the vectorizer to represent/replace a certain
438 gcc_assert (STMT_VINFO_DATA_REF (stmt_info
)
439 || is_pattern_stmt_p (stmt_info
)
440 || !gsi_end_p (pattern_def_si
));
441 vectype
= STMT_VINFO_VECTYPE (stmt_info
);
445 gcc_assert (!STMT_VINFO_DATA_REF (stmt_info
));
446 if (is_gimple_call (stmt
)
447 && gimple_call_internal_p (stmt
)
448 && gimple_call_internal_fn (stmt
) == IFN_MASK_STORE
)
449 scalar_type
= TREE_TYPE (gimple_call_arg (stmt
, 3));
451 scalar_type
= TREE_TYPE (gimple_get_lhs (stmt
));
452 if (dump_enabled_p ())
454 dump_printf_loc (MSG_NOTE
, vect_location
,
455 "get vectype for scalar type: ");
456 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, scalar_type
);
457 dump_printf (MSG_NOTE
, "\n");
459 vectype
= get_vectype_for_scalar_type (scalar_type
);
462 if (dump_enabled_p ())
464 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
465 "not vectorized: unsupported "
467 dump_generic_expr (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
,
469 dump_printf (MSG_MISSED_OPTIMIZATION
, "\n");
474 STMT_VINFO_VECTYPE (stmt_info
) = vectype
;
476 if (dump_enabled_p ())
478 dump_printf_loc (MSG_NOTE
, vect_location
, "vectype: ");
479 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, vectype
);
480 dump_printf (MSG_NOTE
, "\n");
484 /* The vectorization factor is according to the smallest
485 scalar type (or the largest vector size, but we only
486 support one vector size per loop). */
487 scalar_type
= vect_get_smallest_scalar_type (stmt
, &dummy
,
489 if (dump_enabled_p ())
491 dump_printf_loc (MSG_NOTE
, vect_location
,
492 "get vectype for scalar type: ");
493 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, scalar_type
);
494 dump_printf (MSG_NOTE
, "\n");
496 vf_vectype
= get_vectype_for_scalar_type (scalar_type
);
499 if (dump_enabled_p ())
501 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
502 "not vectorized: unsupported data-type ");
503 dump_generic_expr (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
,
505 dump_printf (MSG_MISSED_OPTIMIZATION
, "\n");
510 if ((GET_MODE_SIZE (TYPE_MODE (vectype
))
511 != GET_MODE_SIZE (TYPE_MODE (vf_vectype
))))
513 if (dump_enabled_p ())
515 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
516 "not vectorized: different sized vector "
517 "types in statement, ");
518 dump_generic_expr (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
,
520 dump_printf (MSG_MISSED_OPTIMIZATION
, " and ");
521 dump_generic_expr (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
,
523 dump_printf (MSG_MISSED_OPTIMIZATION
, "\n");
528 if (dump_enabled_p ())
530 dump_printf_loc (MSG_NOTE
, vect_location
, "vectype: ");
531 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, vf_vectype
);
532 dump_printf (MSG_NOTE
, "\n");
535 nunits
= TYPE_VECTOR_SUBPARTS (vf_vectype
);
536 if (dump_enabled_p ())
537 dump_printf_loc (MSG_NOTE
, vect_location
, "nunits = %d\n", nunits
);
538 if (!vectorization_factor
539 || (nunits
> vectorization_factor
))
540 vectorization_factor
= nunits
;
542 if (!analyze_pattern_stmt
&& gsi_end_p (pattern_def_si
))
544 pattern_def_seq
= NULL
;
550 /* TODO: Analyze cost. Decide if worth while to vectorize. */
551 if (dump_enabled_p ())
552 dump_printf_loc (MSG_NOTE
, vect_location
, "vectorization factor = %d\n",
553 vectorization_factor
);
554 if (vectorization_factor
<= 1)
556 if (dump_enabled_p ())
557 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
558 "not vectorized: unsupported data-type\n");
561 LOOP_VINFO_VECT_FACTOR (loop_vinfo
) = vectorization_factor
;
567 /* Function vect_is_simple_iv_evolution.
569 FORNOW: A simple evolution of an induction variables in the loop is
570 considered a polynomial evolution. */
573 vect_is_simple_iv_evolution (unsigned loop_nb
, tree access_fn
, tree
* init
,
578 tree evolution_part
= evolution_part_in_loop_num (access_fn
, loop_nb
);
581 /* When there is no evolution in this loop, the evolution function
583 if (evolution_part
== NULL_TREE
)
586 /* When the evolution is a polynomial of degree >= 2
587 the evolution function is not "simple". */
588 if (tree_is_chrec (evolution_part
))
591 step_expr
= evolution_part
;
592 init_expr
= unshare_expr (initial_condition_in_loop_num (access_fn
, loop_nb
));
594 if (dump_enabled_p ())
596 dump_printf_loc (MSG_NOTE
, vect_location
, "step: ");
597 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, step_expr
);
598 dump_printf (MSG_NOTE
, ", init: ");
599 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, init_expr
);
600 dump_printf (MSG_NOTE
, "\n");
606 if (TREE_CODE (step_expr
) != INTEGER_CST
607 && (TREE_CODE (step_expr
) != SSA_NAME
608 || ((bb
= gimple_bb (SSA_NAME_DEF_STMT (step_expr
)))
609 && flow_bb_inside_loop_p (get_loop (cfun
, loop_nb
), bb
))
610 || (!INTEGRAL_TYPE_P (TREE_TYPE (step_expr
))
611 && (!SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr
))
612 || !flag_associative_math
)))
613 && (TREE_CODE (step_expr
) != REAL_CST
614 || !flag_associative_math
))
616 if (dump_enabled_p ())
617 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
625 /* Function vect_analyze_scalar_cycles_1.
627 Examine the cross iteration def-use cycles of scalar variables
628 in LOOP. LOOP_VINFO represents the loop that is now being
629 considered for vectorization (can be LOOP, or an outer-loop
633 vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo
, struct loop
*loop
)
635 basic_block bb
= loop
->header
;
637 auto_vec
<gimple
, 64> worklist
;
641 if (dump_enabled_p ())
642 dump_printf_loc (MSG_NOTE
, vect_location
,
643 "=== vect_analyze_scalar_cycles ===\n");
645 /* First - identify all inductions. Reduction detection assumes that all the
646 inductions have been identified, therefore, this order must not be
648 for (gsi
= gsi_start_phis (bb
); !gsi_end_p (gsi
); gsi_next (&gsi
))
650 gphi
*phi
= gsi
.phi ();
651 tree access_fn
= NULL
;
652 tree def
= PHI_RESULT (phi
);
653 stmt_vec_info stmt_vinfo
= vinfo_for_stmt (phi
);
655 if (dump_enabled_p ())
657 dump_printf_loc (MSG_NOTE
, vect_location
, "Analyze phi: ");
658 dump_gimple_stmt (MSG_NOTE
, TDF_SLIM
, phi
, 0);
659 dump_printf (MSG_NOTE
, "\n");
662 /* Skip virtual phi's. The data dependences that are associated with
663 virtual defs/uses (i.e., memory accesses) are analyzed elsewhere. */
664 if (virtual_operand_p (def
))
667 STMT_VINFO_DEF_TYPE (stmt_vinfo
) = vect_unknown_def_type
;
669 /* Analyze the evolution function. */
670 access_fn
= analyze_scalar_evolution (loop
, def
);
673 STRIP_NOPS (access_fn
);
674 if (dump_enabled_p ())
676 dump_printf_loc (MSG_NOTE
, vect_location
,
677 "Access function of PHI: ");
678 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, access_fn
);
679 dump_printf (MSG_NOTE
, "\n");
681 STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo
)
682 = evolution_part_in_loop_num (access_fn
, loop
->num
);
686 || !vect_is_simple_iv_evolution (loop
->num
, access_fn
, &init
, &step
)
687 || (LOOP_VINFO_LOOP (loop_vinfo
) != loop
688 && TREE_CODE (step
) != INTEGER_CST
))
690 worklist
.safe_push (phi
);
694 gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo
) != NULL_TREE
);
696 if (dump_enabled_p ())
697 dump_printf_loc (MSG_NOTE
, vect_location
, "Detected induction.\n");
698 STMT_VINFO_DEF_TYPE (stmt_vinfo
) = vect_induction_def
;
702 /* Second - identify all reductions and nested cycles. */
703 while (worklist
.length () > 0)
705 gimple phi
= worklist
.pop ();
706 tree def
= PHI_RESULT (phi
);
707 stmt_vec_info stmt_vinfo
= vinfo_for_stmt (phi
);
711 if (dump_enabled_p ())
713 dump_printf_loc (MSG_NOTE
, vect_location
, "Analyze phi: ");
714 dump_gimple_stmt (MSG_NOTE
, TDF_SLIM
, phi
, 0);
715 dump_printf (MSG_NOTE
, "\n");
718 gcc_assert (!virtual_operand_p (def
)
719 && STMT_VINFO_DEF_TYPE (stmt_vinfo
) == vect_unknown_def_type
);
721 nested_cycle
= (loop
!= LOOP_VINFO_LOOP (loop_vinfo
));
722 reduc_stmt
= vect_force_simple_reduction (loop_vinfo
, phi
, !nested_cycle
,
728 if (dump_enabled_p ())
729 dump_printf_loc (MSG_NOTE
, vect_location
,
730 "Detected double reduction.\n");
732 STMT_VINFO_DEF_TYPE (stmt_vinfo
) = vect_double_reduction_def
;
733 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt
)) =
734 vect_double_reduction_def
;
740 if (dump_enabled_p ())
741 dump_printf_loc (MSG_NOTE
, vect_location
,
742 "Detected vectorizable nested cycle.\n");
744 STMT_VINFO_DEF_TYPE (stmt_vinfo
) = vect_nested_cycle
;
745 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt
)) =
750 if (dump_enabled_p ())
751 dump_printf_loc (MSG_NOTE
, vect_location
,
752 "Detected reduction.\n");
754 STMT_VINFO_DEF_TYPE (stmt_vinfo
) = vect_reduction_def
;
755 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt
)) =
757 /* Store the reduction cycles for possible vectorization in
759 LOOP_VINFO_REDUCTIONS (loop_vinfo
).safe_push (reduc_stmt
);
764 if (dump_enabled_p ())
765 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
766 "Unknown def-use cycle pattern.\n");
771 /* Function vect_analyze_scalar_cycles.
773 Examine the cross iteration def-use cycles of scalar variables, by
774 analyzing the loop-header PHIs of scalar variables. Classify each
775 cycle as one of the following: invariant, induction, reduction, unknown.
776 We do that for the loop represented by LOOP_VINFO, and also to its
777 inner-loop, if exists.
778 Examples for scalar cycles:
793 vect_analyze_scalar_cycles (loop_vec_info loop_vinfo
)
795 struct loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
797 vect_analyze_scalar_cycles_1 (loop_vinfo
, loop
);
799 /* When vectorizing an outer-loop, the inner-loop is executed sequentially.
800 Reductions in such inner-loop therefore have different properties than
801 the reductions in the nest that gets vectorized:
802 1. When vectorized, they are executed in the same order as in the original
803 scalar loop, so we can't change the order of computation when
805 2. FIXME: Inner-loop reductions can be used in the inner-loop, so the
806 current checks are too strict. */
809 vect_analyze_scalar_cycles_1 (loop_vinfo
, loop
->inner
);
813 /* Function vect_get_loop_niters.
815 Determine how many iterations the loop is executed and place it
816 in NUMBER_OF_ITERATIONS. Place the number of latch iterations
817 in NUMBER_OF_ITERATIONSM1.
819 Return the loop exit condition. */
823 vect_get_loop_niters (struct loop
*loop
, tree
*number_of_iterations
,
824 tree
*number_of_iterationsm1
)
828 if (dump_enabled_p ())
829 dump_printf_loc (MSG_NOTE
, vect_location
,
830 "=== get_loop_niters ===\n");
832 niters
= number_of_latch_executions (loop
);
833 *number_of_iterationsm1
= niters
;
835 /* We want the number of loop header executions which is the number
836 of latch executions plus one.
837 ??? For UINT_MAX latch executions this number overflows to zero
838 for loops like do { n++; } while (n != 0); */
839 if (niters
&& !chrec_contains_undetermined (niters
))
840 niters
= fold_build2 (PLUS_EXPR
, TREE_TYPE (niters
), unshare_expr (niters
),
841 build_int_cst (TREE_TYPE (niters
), 1));
842 *number_of_iterations
= niters
;
844 return get_loop_exit_condition (loop
);
848 /* Function bb_in_loop_p
850 Used as predicate for dfs order traversal of the loop bbs. */
853 bb_in_loop_p (const_basic_block bb
, const void *data
)
855 const struct loop
*const loop
= (const struct loop
*)data
;
856 if (flow_bb_inside_loop_p (loop
, bb
))
862 /* Function new_loop_vec_info.
864 Create and initialize a new loop_vec_info struct for LOOP, as well as
865 stmt_vec_info structs for all the stmts in LOOP. */
868 new_loop_vec_info (struct loop
*loop
)
872 gimple_stmt_iterator si
;
873 unsigned int i
, nbbs
;
875 res
= (loop_vec_info
) xcalloc (1, sizeof (struct _loop_vec_info
));
876 LOOP_VINFO_LOOP (res
) = loop
;
878 bbs
= get_loop_body (loop
);
880 /* Create/Update stmt_info for all stmts in the loop. */
881 for (i
= 0; i
< loop
->num_nodes
; i
++)
883 basic_block bb
= bbs
[i
];
885 /* BBs in a nested inner-loop will have been already processed (because
886 we will have called vect_analyze_loop_form for any nested inner-loop).
887 Therefore, for stmts in an inner-loop we just want to update the
888 STMT_VINFO_LOOP_VINFO field of their stmt_info to point to the new
889 loop_info of the outer-loop we are currently considering to vectorize
890 (instead of the loop_info of the inner-loop).
891 For stmts in other BBs we need to create a stmt_info from scratch. */
892 if (bb
->loop_father
!= loop
)
895 gcc_assert (loop
->inner
&& bb
->loop_father
== loop
->inner
);
896 for (si
= gsi_start_phis (bb
); !gsi_end_p (si
); gsi_next (&si
))
898 gimple phi
= gsi_stmt (si
);
899 stmt_vec_info stmt_info
= vinfo_for_stmt (phi
);
900 loop_vec_info inner_loop_vinfo
=
901 STMT_VINFO_LOOP_VINFO (stmt_info
);
902 gcc_assert (loop
->inner
== LOOP_VINFO_LOOP (inner_loop_vinfo
));
903 STMT_VINFO_LOOP_VINFO (stmt_info
) = res
;
905 for (si
= gsi_start_bb (bb
); !gsi_end_p (si
); gsi_next (&si
))
907 gimple stmt
= gsi_stmt (si
);
908 stmt_vec_info stmt_info
= vinfo_for_stmt (stmt
);
909 loop_vec_info inner_loop_vinfo
=
910 STMT_VINFO_LOOP_VINFO (stmt_info
);
911 gcc_assert (loop
->inner
== LOOP_VINFO_LOOP (inner_loop_vinfo
));
912 STMT_VINFO_LOOP_VINFO (stmt_info
) = res
;
917 /* bb in current nest. */
918 for (si
= gsi_start_phis (bb
); !gsi_end_p (si
); gsi_next (&si
))
920 gimple phi
= gsi_stmt (si
);
921 gimple_set_uid (phi
, 0);
922 set_vinfo_for_stmt (phi
, new_stmt_vec_info (phi
, res
, NULL
));
925 for (si
= gsi_start_bb (bb
); !gsi_end_p (si
); gsi_next (&si
))
927 gimple stmt
= gsi_stmt (si
);
928 gimple_set_uid (stmt
, 0);
929 set_vinfo_for_stmt (stmt
, new_stmt_vec_info (stmt
, res
, NULL
));
934 /* CHECKME: We want to visit all BBs before their successors (except for
935 latch blocks, for which this assertion wouldn't hold). In the simple
936 case of the loop forms we allow, a dfs order of the BBs would the same
937 as reversed postorder traversal, so we are safe. */
940 bbs
= XCNEWVEC (basic_block
, loop
->num_nodes
);
941 nbbs
= dfs_enumerate_from (loop
->header
, 0, bb_in_loop_p
,
942 bbs
, loop
->num_nodes
, loop
);
943 gcc_assert (nbbs
== loop
->num_nodes
);
945 LOOP_VINFO_BBS (res
) = bbs
;
946 LOOP_VINFO_NITERSM1 (res
) = NULL
;
947 LOOP_VINFO_NITERS (res
) = NULL
;
948 LOOP_VINFO_NITERS_UNCHANGED (res
) = NULL
;
949 LOOP_VINFO_COST_MODEL_MIN_ITERS (res
) = 0;
950 LOOP_VINFO_COST_MODEL_THRESHOLD (res
) = 0;
951 LOOP_VINFO_VECTORIZABLE_P (res
) = 0;
952 LOOP_VINFO_PEELING_FOR_ALIGNMENT (res
) = 0;
953 LOOP_VINFO_VECT_FACTOR (res
) = 0;
954 LOOP_VINFO_LOOP_NEST (res
).create (3);
955 LOOP_VINFO_DATAREFS (res
).create (10);
956 LOOP_VINFO_DDRS (res
).create (10 * 10);
957 LOOP_VINFO_UNALIGNED_DR (res
) = NULL
;
958 LOOP_VINFO_MAY_MISALIGN_STMTS (res
).create (
959 PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIGNMENT_CHECKS
));
960 LOOP_VINFO_MAY_ALIAS_DDRS (res
).create (
961 PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS
));
962 LOOP_VINFO_GROUPED_STORES (res
).create (10);
963 LOOP_VINFO_REDUCTIONS (res
).create (10);
964 LOOP_VINFO_REDUCTION_CHAINS (res
).create (10);
965 LOOP_VINFO_SLP_INSTANCES (res
).create (10);
966 LOOP_VINFO_SLP_UNROLLING_FACTOR (res
) = 1;
967 LOOP_VINFO_TARGET_COST_DATA (res
) = init_cost (loop
);
968 LOOP_VINFO_PEELING_FOR_GAPS (res
) = false;
969 LOOP_VINFO_PEELING_FOR_NITER (res
) = false;
970 LOOP_VINFO_OPERANDS_SWAPPED (res
) = false;
976 /* Function destroy_loop_vec_info.
978 Free LOOP_VINFO struct, as well as all the stmt_vec_info structs of all the
979 stmts in the loop. */
982 destroy_loop_vec_info (loop_vec_info loop_vinfo
, bool clean_stmts
)
987 gimple_stmt_iterator si
;
989 vec
<slp_instance
> slp_instances
;
990 slp_instance instance
;
996 loop
= LOOP_VINFO_LOOP (loop_vinfo
);
998 bbs
= LOOP_VINFO_BBS (loop_vinfo
);
999 nbbs
= clean_stmts
? loop
->num_nodes
: 0;
1000 swapped
= LOOP_VINFO_OPERANDS_SWAPPED (loop_vinfo
);
1002 for (j
= 0; j
< nbbs
; j
++)
1004 basic_block bb
= bbs
[j
];
1005 for (si
= gsi_start_phis (bb
); !gsi_end_p (si
); gsi_next (&si
))
1006 free_stmt_vec_info (gsi_stmt (si
));
1008 for (si
= gsi_start_bb (bb
); !gsi_end_p (si
); )
1010 gimple stmt
= gsi_stmt (si
);
1012 /* We may have broken canonical form by moving a constant
1013 into RHS1 of a commutative op. Fix such occurrences. */
1014 if (swapped
&& is_gimple_assign (stmt
))
1016 enum tree_code code
= gimple_assign_rhs_code (stmt
);
1018 if ((code
== PLUS_EXPR
1019 || code
== POINTER_PLUS_EXPR
1020 || code
== MULT_EXPR
)
1021 && CONSTANT_CLASS_P (gimple_assign_rhs1 (stmt
)))
1022 swap_ssa_operands (stmt
,
1023 gimple_assign_rhs1_ptr (stmt
),
1024 gimple_assign_rhs2_ptr (stmt
));
1027 /* Free stmt_vec_info. */
1028 free_stmt_vec_info (stmt
);
1033 free (LOOP_VINFO_BBS (loop_vinfo
));
1034 vect_destroy_datarefs (loop_vinfo
, NULL
);
1035 free_dependence_relations (LOOP_VINFO_DDRS (loop_vinfo
));
1036 LOOP_VINFO_LOOP_NEST (loop_vinfo
).release ();
1037 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo
).release ();
1038 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo
).release ();
1039 slp_instances
= LOOP_VINFO_SLP_INSTANCES (loop_vinfo
);
1040 FOR_EACH_VEC_ELT (slp_instances
, j
, instance
)
1041 vect_free_slp_instance (instance
);
1043 LOOP_VINFO_SLP_INSTANCES (loop_vinfo
).release ();
1044 LOOP_VINFO_GROUPED_STORES (loop_vinfo
).release ();
1045 LOOP_VINFO_REDUCTIONS (loop_vinfo
).release ();
1046 LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo
).release ();
1048 delete LOOP_VINFO_PEELING_HTAB (loop_vinfo
);
1049 LOOP_VINFO_PEELING_HTAB (loop_vinfo
) = NULL
;
1051 destroy_cost_data (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo
));
1058 /* Function vect_analyze_loop_1.
1060 Apply a set of analyses on LOOP, and create a loop_vec_info struct
1061 for it. The different analyses will record information in the
1062 loop_vec_info struct. This is a subset of the analyses applied in
1063 vect_analyze_loop, to be applied on an inner-loop nested in the loop
1064 that is now considered for (outer-loop) vectorization. */
1066 static loop_vec_info
1067 vect_analyze_loop_1 (struct loop
*loop
)
1069 loop_vec_info loop_vinfo
;
1071 if (dump_enabled_p ())
1072 dump_printf_loc (MSG_NOTE
, vect_location
,
1073 "===== analyze_loop_nest_1 =====\n");
1075 /* Check the CFG characteristics of the loop (nesting, entry/exit, etc. */
1077 loop_vinfo
= vect_analyze_loop_form (loop
);
1080 if (dump_enabled_p ())
1081 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1082 "bad inner-loop form.\n");
1090 /* Function vect_analyze_loop_form.
1092 Verify that certain CFG restrictions hold, including:
1093 - the loop has a pre-header
1094 - the loop has a single entry and exit
1095 - the loop exit condition is simple enough, and the number of iterations
1096 can be analyzed (a countable loop). */
1099 vect_analyze_loop_form (struct loop
*loop
)
1101 loop_vec_info loop_vinfo
;
1103 tree number_of_iterations
= NULL
, number_of_iterationsm1
= NULL
;
1104 loop_vec_info inner_loop_vinfo
= NULL
;
1106 if (dump_enabled_p ())
1107 dump_printf_loc (MSG_NOTE
, vect_location
,
1108 "=== vect_analyze_loop_form ===\n");
1110 /* Different restrictions apply when we are considering an inner-most loop,
1111 vs. an outer (nested) loop.
1112 (FORNOW. May want to relax some of these restrictions in the future). */
1116 /* Inner-most loop. We currently require that the number of BBs is
1117 exactly 2 (the header and latch). Vectorizable inner-most loops
1128 if (loop
->num_nodes
!= 2)
1130 if (dump_enabled_p ())
1131 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1132 "not vectorized: control flow in loop.\n");
1136 if (empty_block_p (loop
->header
))
1138 if (dump_enabled_p ())
1139 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1140 "not vectorized: empty loop.\n");
1146 struct loop
*innerloop
= loop
->inner
;
1149 /* Nested loop. We currently require that the loop is doubly-nested,
1150 contains a single inner loop, and the number of BBs is exactly 5.
1151 Vectorizable outer-loops look like this:
1163 The inner-loop has the properties expected of inner-most loops
1164 as described above. */
1166 if ((loop
->inner
)->inner
|| (loop
->inner
)->next
)
1168 if (dump_enabled_p ())
1169 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1170 "not vectorized: multiple nested loops.\n");
1174 /* Analyze the inner-loop. */
1175 inner_loop_vinfo
= vect_analyze_loop_1 (loop
->inner
);
1176 if (!inner_loop_vinfo
)
1178 if (dump_enabled_p ())
1179 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1180 "not vectorized: Bad inner loop.\n");
1184 if (!expr_invariant_in_loop_p (loop
,
1185 LOOP_VINFO_NITERS (inner_loop_vinfo
)))
1187 if (dump_enabled_p ())
1188 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1189 "not vectorized: inner-loop count not"
1191 destroy_loop_vec_info (inner_loop_vinfo
, true);
1195 if (loop
->num_nodes
!= 5)
1197 if (dump_enabled_p ())
1198 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1199 "not vectorized: control flow in loop.\n");
1200 destroy_loop_vec_info (inner_loop_vinfo
, true);
1204 gcc_assert (EDGE_COUNT (innerloop
->header
->preds
) == 2);
1205 entryedge
= EDGE_PRED (innerloop
->header
, 0);
1206 if (EDGE_PRED (innerloop
->header
, 0)->src
== innerloop
->latch
)
1207 entryedge
= EDGE_PRED (innerloop
->header
, 1);
1209 if (entryedge
->src
!= loop
->header
1210 || !single_exit (innerloop
)
1211 || single_exit (innerloop
)->dest
!= EDGE_PRED (loop
->latch
, 0)->src
)
1213 if (dump_enabled_p ())
1214 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1215 "not vectorized: unsupported outerloop form.\n");
1216 destroy_loop_vec_info (inner_loop_vinfo
, true);
1220 if (dump_enabled_p ())
1221 dump_printf_loc (MSG_NOTE
, vect_location
,
1222 "Considering outer-loop vectorization.\n");
1225 if (!single_exit (loop
)
1226 || EDGE_COUNT (loop
->header
->preds
) != 2)
1228 if (dump_enabled_p ())
1230 if (!single_exit (loop
))
1231 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1232 "not vectorized: multiple exits.\n");
1233 else if (EDGE_COUNT (loop
->header
->preds
) != 2)
1234 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1235 "not vectorized: too many incoming edges.\n");
1237 if (inner_loop_vinfo
)
1238 destroy_loop_vec_info (inner_loop_vinfo
, true);
1242 /* We assume that the loop exit condition is at the end of the loop. i.e,
1243 that the loop is represented as a do-while (with a proper if-guard
1244 before the loop if needed), where the loop header contains all the
1245 executable statements, and the latch is empty. */
1246 if (!empty_block_p (loop
->latch
)
1247 || !gimple_seq_empty_p (phi_nodes (loop
->latch
)))
1249 if (dump_enabled_p ())
1250 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1251 "not vectorized: latch block not empty.\n");
1252 if (inner_loop_vinfo
)
1253 destroy_loop_vec_info (inner_loop_vinfo
, true);
1257 /* Make sure there exists a single-predecessor exit bb: */
1258 if (!single_pred_p (single_exit (loop
)->dest
))
1260 edge e
= single_exit (loop
);
1261 if (!(e
->flags
& EDGE_ABNORMAL
))
1263 split_loop_exit_edge (e
);
1264 if (dump_enabled_p ())
1265 dump_printf (MSG_NOTE
, "split exit edge.\n");
1269 if (dump_enabled_p ())
1270 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1271 "not vectorized: abnormal loop exit edge.\n");
1272 if (inner_loop_vinfo
)
1273 destroy_loop_vec_info (inner_loop_vinfo
, true);
1278 loop_cond
= vect_get_loop_niters (loop
, &number_of_iterations
,
1279 &number_of_iterationsm1
);
1282 if (dump_enabled_p ())
1283 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1284 "not vectorized: complicated exit condition.\n");
1285 if (inner_loop_vinfo
)
1286 destroy_loop_vec_info (inner_loop_vinfo
, true);
1290 if (!number_of_iterations
1291 || chrec_contains_undetermined (number_of_iterations
))
1293 if (dump_enabled_p ())
1294 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1295 "not vectorized: number of iterations cannot be "
1297 if (inner_loop_vinfo
)
1298 destroy_loop_vec_info (inner_loop_vinfo
, true);
1302 if (integer_zerop (number_of_iterations
))
1304 if (dump_enabled_p ())
1305 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1306 "not vectorized: number of iterations = 0.\n");
1307 if (inner_loop_vinfo
)
1308 destroy_loop_vec_info (inner_loop_vinfo
, true);
1312 loop_vinfo
= new_loop_vec_info (loop
);
1313 LOOP_VINFO_NITERSM1 (loop_vinfo
) = number_of_iterationsm1
;
1314 LOOP_VINFO_NITERS (loop_vinfo
) = number_of_iterations
;
1315 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo
) = number_of_iterations
;
1317 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo
))
1319 if (dump_enabled_p ())
1321 dump_printf_loc (MSG_NOTE
, vect_location
,
1322 "Symbolic number of iterations is ");
1323 dump_generic_expr (MSG_NOTE
, TDF_DETAILS
, number_of_iterations
);
1324 dump_printf (MSG_NOTE
, "\n");
1328 STMT_VINFO_TYPE (vinfo_for_stmt (loop_cond
)) = loop_exit_ctrl_vec_info_type
;
1330 /* CHECKME: May want to keep it around it in the future. */
1331 if (inner_loop_vinfo
)
1332 destroy_loop_vec_info (inner_loop_vinfo
, false);
1334 gcc_assert (!loop
->aux
);
1335 loop
->aux
= loop_vinfo
;
1340 /* Function vect_analyze_loop_operations.
1342 Scan the loop stmts and make sure they are all vectorizable. */
1345 vect_analyze_loop_operations (loop_vec_info loop_vinfo
, bool slp
)
1347 struct loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
1348 basic_block
*bbs
= LOOP_VINFO_BBS (loop_vinfo
);
1349 int nbbs
= loop
->num_nodes
;
1350 unsigned int vectorization_factor
= 0;
1352 stmt_vec_info stmt_info
;
1353 bool need_to_vectorize
= false;
1354 int min_profitable_iters
;
1355 int min_scalar_loop_bound
;
1357 bool only_slp_in_loop
= true, ok
;
1358 HOST_WIDE_INT max_niter
;
1359 HOST_WIDE_INT estimated_niter
;
1360 int min_profitable_estimate
;
1362 if (dump_enabled_p ())
1363 dump_printf_loc (MSG_NOTE
, vect_location
,
1364 "=== vect_analyze_loop_operations ===\n");
1366 gcc_assert (LOOP_VINFO_VECT_FACTOR (loop_vinfo
));
1367 vectorization_factor
= LOOP_VINFO_VECT_FACTOR (loop_vinfo
);
1370 /* If all the stmts in the loop can be SLPed, we perform only SLP, and
1371 vectorization factor of the loop is the unrolling factor required by
1372 the SLP instances. If that unrolling factor is 1, we say, that we
1373 perform pure SLP on loop - cross iteration parallelism is not
1375 for (i
= 0; i
< nbbs
; i
++)
1377 basic_block bb
= bbs
[i
];
1378 for (gimple_stmt_iterator si
= gsi_start_bb (bb
); !gsi_end_p (si
);
1381 gimple stmt
= gsi_stmt (si
);
1382 stmt_vec_info stmt_info
= vinfo_for_stmt (stmt
);
1383 gcc_assert (stmt_info
);
1384 if ((STMT_VINFO_RELEVANT_P (stmt_info
)
1385 || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info
)))
1386 && !PURE_SLP_STMT (stmt_info
))
1387 /* STMT needs both SLP and loop-based vectorization. */
1388 only_slp_in_loop
= false;
1392 if (only_slp_in_loop
)
1393 vectorization_factor
= LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo
);
1395 vectorization_factor
= least_common_multiple (vectorization_factor
,
1396 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo
));
1398 LOOP_VINFO_VECT_FACTOR (loop_vinfo
) = vectorization_factor
;
1399 if (dump_enabled_p ())
1400 dump_printf_loc (MSG_NOTE
, vect_location
,
1401 "Updating vectorization factor to %d\n",
1402 vectorization_factor
);
1405 for (i
= 0; i
< nbbs
; i
++)
1407 basic_block bb
= bbs
[i
];
1409 for (gphi_iterator si
= gsi_start_phis (bb
); !gsi_end_p (si
);
1412 gphi
*phi
= si
.phi ();
1415 stmt_info
= vinfo_for_stmt (phi
);
1416 if (dump_enabled_p ())
1418 dump_printf_loc (MSG_NOTE
, vect_location
, "examining phi: ");
1419 dump_gimple_stmt (MSG_NOTE
, TDF_SLIM
, phi
, 0);
1420 dump_printf (MSG_NOTE
, "\n");
1423 /* Inner-loop loop-closed exit phi in outer-loop vectorization
1424 (i.e., a phi in the tail of the outer-loop). */
1425 if (! is_loop_header_bb_p (bb
))
1427 /* FORNOW: we currently don't support the case that these phis
1428 are not used in the outerloop (unless it is double reduction,
1429 i.e., this phi is vect_reduction_def), cause this case
1430 requires to actually do something here. */
1431 if ((!STMT_VINFO_RELEVANT_P (stmt_info
)
1432 || STMT_VINFO_LIVE_P (stmt_info
))
1433 && STMT_VINFO_DEF_TYPE (stmt_info
)
1434 != vect_double_reduction_def
)
1436 if (dump_enabled_p ())
1437 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1438 "Unsupported loop-closed phi in "
1443 /* If PHI is used in the outer loop, we check that its operand
1444 is defined in the inner loop. */
1445 if (STMT_VINFO_RELEVANT_P (stmt_info
))
1450 if (gimple_phi_num_args (phi
) != 1)
1453 phi_op
= PHI_ARG_DEF (phi
, 0);
1454 if (TREE_CODE (phi_op
) != SSA_NAME
)
1457 op_def_stmt
= SSA_NAME_DEF_STMT (phi_op
);
1458 if (gimple_nop_p (op_def_stmt
)
1459 || !flow_bb_inside_loop_p (loop
, gimple_bb (op_def_stmt
))
1460 || !vinfo_for_stmt (op_def_stmt
))
1463 if (STMT_VINFO_RELEVANT (vinfo_for_stmt (op_def_stmt
))
1464 != vect_used_in_outer
1465 && STMT_VINFO_RELEVANT (vinfo_for_stmt (op_def_stmt
))
1466 != vect_used_in_outer_by_reduction
)
1473 gcc_assert (stmt_info
);
1475 if (STMT_VINFO_LIVE_P (stmt_info
))
1477 /* FORNOW: not yet supported. */
1478 if (dump_enabled_p ())
1479 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1480 "not vectorized: value used after loop.\n");
1484 if (STMT_VINFO_RELEVANT (stmt_info
) == vect_used_in_scope
1485 && STMT_VINFO_DEF_TYPE (stmt_info
) != vect_induction_def
)
1487 /* A scalar-dependence cycle that we don't support. */
1488 if (dump_enabled_p ())
1489 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1490 "not vectorized: scalar dependence cycle.\n");
1494 if (STMT_VINFO_RELEVANT_P (stmt_info
))
1496 need_to_vectorize
= true;
1497 if (STMT_VINFO_DEF_TYPE (stmt_info
) == vect_induction_def
)
1498 ok
= vectorizable_induction (phi
, NULL
, NULL
);
1503 if (dump_enabled_p ())
1505 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1506 "not vectorized: relevant phi not "
1508 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, phi
, 0);
1509 dump_printf (MSG_MISSED_OPTIMIZATION
, "\n");
1515 for (gimple_stmt_iterator si
= gsi_start_bb (bb
); !gsi_end_p (si
);
1518 gimple stmt
= gsi_stmt (si
);
1519 if (!gimple_clobber_p (stmt
)
1520 && !vect_analyze_stmt (stmt
, &need_to_vectorize
, NULL
))
1525 /* All operations in the loop are either irrelevant (deal with loop
1526 control, or dead), or only used outside the loop and can be moved
1527 out of the loop (e.g. invariants, inductions). The loop can be
1528 optimized away by scalar optimizations. We're better off not
1529 touching this loop. */
1530 if (!need_to_vectorize
)
1532 if (dump_enabled_p ())
1533 dump_printf_loc (MSG_NOTE
, vect_location
,
1534 "All the computation can be taken out of the loop.\n");
1535 if (dump_enabled_p ())
1536 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1537 "not vectorized: redundant loop. no profit to "
1542 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo
) && dump_enabled_p ())
1543 dump_printf_loc (MSG_NOTE
, vect_location
,
1544 "vectorization_factor = %d, niters = "
1545 HOST_WIDE_INT_PRINT_DEC
"\n", vectorization_factor
,
1546 LOOP_VINFO_INT_NITERS (loop_vinfo
));
1548 if ((LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo
)
1549 && (LOOP_VINFO_INT_NITERS (loop_vinfo
) < vectorization_factor
))
1550 || ((max_niter
= max_stmt_executions_int (loop
)) != -1
1551 && (unsigned HOST_WIDE_INT
) max_niter
< vectorization_factor
))
1553 if (dump_enabled_p ())
1554 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1555 "not vectorized: iteration count too small.\n");
1556 if (dump_enabled_p ())
1557 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1558 "not vectorized: iteration count smaller than "
1559 "vectorization factor.\n");
1563 /* Analyze cost. Decide if worth while to vectorize. */
1565 /* Once VF is set, SLP costs should be updated since the number of created
1566 vector stmts depends on VF. */
1567 vect_update_slp_costs_according_to_vf (loop_vinfo
);
1569 vect_estimate_min_profitable_iters (loop_vinfo
, &min_profitable_iters
,
1570 &min_profitable_estimate
);
1571 LOOP_VINFO_COST_MODEL_MIN_ITERS (loop_vinfo
) = min_profitable_iters
;
1573 if (min_profitable_iters
< 0)
1575 if (dump_enabled_p ())
1576 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1577 "not vectorized: vectorization not profitable.\n");
1578 if (dump_enabled_p ())
1579 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1580 "not vectorized: vector version will never be "
1585 min_scalar_loop_bound
= ((PARAM_VALUE (PARAM_MIN_VECT_LOOP_BOUND
)
1586 * vectorization_factor
) - 1);
1589 /* Use the cost model only if it is more conservative than user specified
1592 th
= (unsigned) min_scalar_loop_bound
;
1593 if (min_profitable_iters
1594 && (!min_scalar_loop_bound
1595 || min_profitable_iters
> min_scalar_loop_bound
))
1596 th
= (unsigned) min_profitable_iters
;
1598 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo
) = th
;
1600 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo
)
1601 && LOOP_VINFO_INT_NITERS (loop_vinfo
) <= th
)
1603 if (dump_enabled_p ())
1604 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1605 "not vectorized: vectorization not profitable.\n");
1606 if (dump_enabled_p ())
1607 dump_printf_loc (MSG_NOTE
, vect_location
,
1608 "not vectorized: iteration count smaller than user "
1609 "specified loop bound parameter or minimum profitable "
1610 "iterations (whichever is more conservative).\n");
1614 if ((estimated_niter
= estimated_stmt_executions_int (loop
)) != -1
1615 && ((unsigned HOST_WIDE_INT
) estimated_niter
1616 <= MAX (th
, (unsigned)min_profitable_estimate
)))
1618 if (dump_enabled_p ())
1619 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1620 "not vectorized: estimated iteration count too "
1622 if (dump_enabled_p ())
1623 dump_printf_loc (MSG_NOTE
, vect_location
,
1624 "not vectorized: estimated iteration count smaller "
1625 "than specified loop bound parameter or minimum "
1626 "profitable iterations (whichever is more "
1627 "conservative).\n");
1635 /* Function vect_analyze_loop_2.
1637 Apply a set of analyses on LOOP, and create a loop_vec_info struct
1638 for it. The different analyses will record information in the
1639 loop_vec_info struct. */
1641 vect_analyze_loop_2 (loop_vec_info loop_vinfo
)
1643 bool ok
, slp
= false;
1644 int max_vf
= MAX_VECTORIZATION_FACTOR
;
1647 unsigned int n_stmts
= 0;
1649 /* Find all data references in the loop (which correspond to vdefs/vuses)
1650 and analyze their evolution in the loop. Also adjust the minimal
1651 vectorization factor according to the loads and stores.
1653 FORNOW: Handle only simple, array references, which
1654 alignment can be forced, and aligned pointer-references. */
1656 ok
= vect_analyze_data_refs (loop_vinfo
, NULL
, &min_vf
, &n_stmts
);
1659 if (dump_enabled_p ())
1660 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1661 "bad data references.\n");
1665 /* Classify all cross-iteration scalar data-flow cycles.
1666 Cross-iteration cycles caused by virtual phis are analyzed separately. */
1668 vect_analyze_scalar_cycles (loop_vinfo
);
1670 vect_pattern_recog (loop_vinfo
, NULL
);
1672 /* Analyze the access patterns of the data-refs in the loop (consecutive,
1673 complex, etc.). FORNOW: Only handle consecutive access pattern. */
1675 ok
= vect_analyze_data_ref_accesses (loop_vinfo
, NULL
);
1678 if (dump_enabled_p ())
1679 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1680 "bad data access.\n");
1684 /* Data-flow analysis to detect stmts that do not need to be vectorized. */
1686 ok
= vect_mark_stmts_to_be_vectorized (loop_vinfo
);
1689 if (dump_enabled_p ())
1690 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1691 "unexpected pattern.\n");
1695 /* Analyze data dependences between the data-refs in the loop
1696 and adjust the maximum vectorization factor according to
1698 FORNOW: fail at the first data dependence that we encounter. */
1700 ok
= vect_analyze_data_ref_dependences (loop_vinfo
, &max_vf
);
1704 if (dump_enabled_p ())
1705 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1706 "bad data dependence.\n");
1710 ok
= vect_determine_vectorization_factor (loop_vinfo
);
1713 if (dump_enabled_p ())
1714 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1715 "can't determine vectorization factor.\n");
1718 if (max_vf
< LOOP_VINFO_VECT_FACTOR (loop_vinfo
))
1720 if (dump_enabled_p ())
1721 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1722 "bad data dependence.\n");
1726 /* Analyze the alignment of the data-refs in the loop.
1727 Fail if a data reference is found that cannot be vectorized. */
1729 ok
= vect_analyze_data_refs_alignment (loop_vinfo
, NULL
);
1732 if (dump_enabled_p ())
1733 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1734 "bad data alignment.\n");
1738 /* Prune the list of ddrs to be tested at run-time by versioning for alias.
1739 It is important to call pruning after vect_analyze_data_ref_accesses,
1740 since we use grouping information gathered by interleaving analysis. */
1741 ok
= vect_prune_runtime_alias_test_list (loop_vinfo
);
1744 if (dump_enabled_p ())
1745 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1746 "number of versioning for alias "
1747 "run-time tests exceeds %d "
1748 "(--param vect-max-version-for-alias-checks)\n",
1749 PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS
));
1753 /* This pass will decide on using loop versioning and/or loop peeling in
1754 order to enhance the alignment of data references in the loop. */
1756 ok
= vect_enhance_data_refs_alignment (loop_vinfo
);
1759 if (dump_enabled_p ())
1760 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1761 "bad data alignment.\n");
1765 /* Check the SLP opportunities in the loop, analyze and build SLP trees. */
1766 ok
= vect_analyze_slp (loop_vinfo
, NULL
, n_stmts
);
1769 /* Decide which possible SLP instances to SLP. */
1770 slp
= vect_make_slp_decision (loop_vinfo
);
1772 /* Find stmts that need to be both vectorized and SLPed. */
1773 vect_detect_hybrid_slp (loop_vinfo
);
1778 /* Scan all the operations in the loop and make sure they are
1781 ok
= vect_analyze_loop_operations (loop_vinfo
, slp
);
1784 if (dump_enabled_p ())
1785 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1786 "bad operation or unsupported loop bound.\n");
1790 /* Decide whether we need to create an epilogue loop to handle
1791 remaining scalar iterations. */
1792 th
= ((LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo
) + 1)
1793 / LOOP_VINFO_VECT_FACTOR (loop_vinfo
))
1794 * LOOP_VINFO_VECT_FACTOR (loop_vinfo
);
1796 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo
)
1797 && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo
) > 0)
1799 if (ctz_hwi (LOOP_VINFO_INT_NITERS (loop_vinfo
)
1800 - LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo
))
1801 < exact_log2 (LOOP_VINFO_VECT_FACTOR (loop_vinfo
)))
1802 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo
) = true;
1804 else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo
)
1805 || (tree_ctz (LOOP_VINFO_NITERS (loop_vinfo
))
1806 < (unsigned)exact_log2 (LOOP_VINFO_VECT_FACTOR (loop_vinfo
))
1807 /* In case of versioning, check if the maximum number of
1808 iterations is greater than th. If they are identical,
1809 the epilogue is unnecessary. */
1810 && ((!LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo
)
1811 && !LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo
))
1812 || (unsigned HOST_WIDE_INT
)max_stmt_executions_int
1813 (LOOP_VINFO_LOOP (loop_vinfo
)) > th
)))
1814 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo
) = true;
1816 /* If an epilogue loop is required make sure we can create one. */
1817 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo
)
1818 || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo
))
1820 if (dump_enabled_p ())
1821 dump_printf_loc (MSG_NOTE
, vect_location
, "epilog loop required\n");
1822 if (!vect_can_advance_ivs_p (loop_vinfo
)
1823 || !slpeel_can_duplicate_loop_p (LOOP_VINFO_LOOP (loop_vinfo
),
1824 single_exit (LOOP_VINFO_LOOP
1827 if (dump_enabled_p ())
1828 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1829 "not vectorized: can't create required "
1838 /* Function vect_analyze_loop.
1840 Apply a set of analyses on LOOP, and create a loop_vec_info struct
1841 for it. The different analyses will record information in the
1842 loop_vec_info struct. */
1844 vect_analyze_loop (struct loop
*loop
)
1846 loop_vec_info loop_vinfo
;
1847 unsigned int vector_sizes
;
1849 /* Autodetect first vector size we try. */
1850 current_vector_size
= 0;
1851 vector_sizes
= targetm
.vectorize
.autovectorize_vector_sizes ();
1853 if (dump_enabled_p ())
1854 dump_printf_loc (MSG_NOTE
, vect_location
,
1855 "===== analyze_loop_nest =====\n");
1857 if (loop_outer (loop
)
1858 && loop_vec_info_for_loop (loop_outer (loop
))
1859 && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop
))))
1861 if (dump_enabled_p ())
1862 dump_printf_loc (MSG_NOTE
, vect_location
,
1863 "outer-loop already vectorized.\n");
1869 /* Check the CFG characteristics of the loop (nesting, entry/exit). */
1870 loop_vinfo
= vect_analyze_loop_form (loop
);
1873 if (dump_enabled_p ())
1874 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1875 "bad loop form.\n");
1879 if (vect_analyze_loop_2 (loop_vinfo
))
1881 LOOP_VINFO_VECTORIZABLE_P (loop_vinfo
) = 1;
1886 destroy_loop_vec_info (loop_vinfo
, true);
1888 vector_sizes
&= ~current_vector_size
;
1889 if (vector_sizes
== 0
1890 || current_vector_size
== 0)
1893 /* Try the next biggest vector size. */
1894 current_vector_size
= 1 << floor_log2 (vector_sizes
);
1895 if (dump_enabled_p ())
1896 dump_printf_loc (MSG_NOTE
, vect_location
,
1897 "***** Re-trying analysis with "
1898 "vector size %d\n", current_vector_size
);
1903 /* Function reduction_code_for_scalar_code
1906 CODE - tree_code of a reduction operations.
1909 REDUC_CODE - the corresponding tree-code to be used to reduce the
1910 vector of partial results into a single scalar result, or ERROR_MARK
1911 if the operation is a supported reduction operation, but does not have
1914 Return FALSE if CODE currently cannot be vectorized as reduction. */
1917 reduction_code_for_scalar_code (enum tree_code code
,
1918 enum tree_code
*reduc_code
)
1923 *reduc_code
= REDUC_MAX_EXPR
;
1927 *reduc_code
= REDUC_MIN_EXPR
;
1931 *reduc_code
= REDUC_PLUS_EXPR
;
1939 *reduc_code
= ERROR_MARK
;
1948 /* Error reporting helper for vect_is_simple_reduction below. GIMPLE statement
1949 STMT is printed with a message MSG. */
1952 report_vect_op (int msg_type
, gimple stmt
, const char *msg
)
1954 dump_printf_loc (msg_type
, vect_location
, "%s", msg
);
1955 dump_gimple_stmt (msg_type
, TDF_SLIM
, stmt
, 0);
1956 dump_printf (msg_type
, "\n");
1960 /* Detect SLP reduction of the form:
1970 PHI is the reduction phi node (#a1 = phi <a5, a0> above)
1971 FIRST_STMT is the first reduction stmt in the chain
1972 (a2 = operation (a1)).
1974 Return TRUE if a reduction chain was detected. */
1977 vect_is_slp_reduction (loop_vec_info loop_info
, gimple phi
, gimple first_stmt
)
1979 struct loop
*loop
= (gimple_bb (phi
))->loop_father
;
1980 struct loop
*vect_loop
= LOOP_VINFO_LOOP (loop_info
);
1981 enum tree_code code
;
1982 gimple current_stmt
= NULL
, loop_use_stmt
= NULL
, first
, next_stmt
;
1983 stmt_vec_info use_stmt_info
, current_stmt_info
;
1985 imm_use_iterator imm_iter
;
1986 use_operand_p use_p
;
1987 int nloop_uses
, size
= 0, n_out_of_loop_uses
;
1990 if (loop
!= vect_loop
)
1993 lhs
= PHI_RESULT (phi
);
1994 code
= gimple_assign_rhs_code (first_stmt
);
1998 n_out_of_loop_uses
= 0;
1999 FOR_EACH_IMM_USE_FAST (use_p
, imm_iter
, lhs
)
2001 gimple use_stmt
= USE_STMT (use_p
);
2002 if (is_gimple_debug (use_stmt
))
2005 /* Check if we got back to the reduction phi. */
2006 if (use_stmt
== phi
)
2008 loop_use_stmt
= use_stmt
;
2013 if (flow_bb_inside_loop_p (loop
, gimple_bb (use_stmt
)))
2015 if (vinfo_for_stmt (use_stmt
)
2016 && !STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (use_stmt
)))
2018 loop_use_stmt
= use_stmt
;
2023 n_out_of_loop_uses
++;
2025 /* There are can be either a single use in the loop or two uses in
2027 if (nloop_uses
> 1 || (n_out_of_loop_uses
&& nloop_uses
))
2034 /* We reached a statement with no loop uses. */
2035 if (nloop_uses
== 0)
2038 /* This is a loop exit phi, and we haven't reached the reduction phi. */
2039 if (gimple_code (loop_use_stmt
) == GIMPLE_PHI
)
2042 if (!is_gimple_assign (loop_use_stmt
)
2043 || code
!= gimple_assign_rhs_code (loop_use_stmt
)
2044 || !flow_bb_inside_loop_p (loop
, gimple_bb (loop_use_stmt
)))
2047 /* Insert USE_STMT into reduction chain. */
2048 use_stmt_info
= vinfo_for_stmt (loop_use_stmt
);
2051 current_stmt_info
= vinfo_for_stmt (current_stmt
);
2052 GROUP_NEXT_ELEMENT (current_stmt_info
) = loop_use_stmt
;
2053 GROUP_FIRST_ELEMENT (use_stmt_info
)
2054 = GROUP_FIRST_ELEMENT (current_stmt_info
);
2057 GROUP_FIRST_ELEMENT (use_stmt_info
) = loop_use_stmt
;
2059 lhs
= gimple_assign_lhs (loop_use_stmt
);
2060 current_stmt
= loop_use_stmt
;
2064 if (!found
|| loop_use_stmt
!= phi
|| size
< 2)
2067 /* Swap the operands, if needed, to make the reduction operand be the second
2069 lhs
= PHI_RESULT (phi
);
2070 next_stmt
= GROUP_FIRST_ELEMENT (vinfo_for_stmt (current_stmt
));
2073 if (gimple_assign_rhs2 (next_stmt
) == lhs
)
2075 tree op
= gimple_assign_rhs1 (next_stmt
);
2076 gimple def_stmt
= NULL
;
2078 if (TREE_CODE (op
) == SSA_NAME
)
2079 def_stmt
= SSA_NAME_DEF_STMT (op
);
2081 /* Check that the other def is either defined in the loop
2082 ("vect_internal_def"), or it's an induction (defined by a
2083 loop-header phi-node). */
2085 && gimple_bb (def_stmt
)
2086 && flow_bb_inside_loop_p (loop
, gimple_bb (def_stmt
))
2087 && (is_gimple_assign (def_stmt
)
2088 || is_gimple_call (def_stmt
)
2089 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt
))
2090 == vect_induction_def
2091 || (gimple_code (def_stmt
) == GIMPLE_PHI
2092 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt
))
2093 == vect_internal_def
2094 && !is_loop_header_bb_p (gimple_bb (def_stmt
)))))
2096 lhs
= gimple_assign_lhs (next_stmt
);
2097 next_stmt
= GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt
));
2105 tree op
= gimple_assign_rhs2 (next_stmt
);
2106 gimple def_stmt
= NULL
;
2108 if (TREE_CODE (op
) == SSA_NAME
)
2109 def_stmt
= SSA_NAME_DEF_STMT (op
);
2111 /* Check that the other def is either defined in the loop
2112 ("vect_internal_def"), or it's an induction (defined by a
2113 loop-header phi-node). */
2115 && gimple_bb (def_stmt
)
2116 && flow_bb_inside_loop_p (loop
, gimple_bb (def_stmt
))
2117 && (is_gimple_assign (def_stmt
)
2118 || is_gimple_call (def_stmt
)
2119 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt
))
2120 == vect_induction_def
2121 || (gimple_code (def_stmt
) == GIMPLE_PHI
2122 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt
))
2123 == vect_internal_def
2124 && !is_loop_header_bb_p (gimple_bb (def_stmt
)))))
2126 if (dump_enabled_p ())
2128 dump_printf_loc (MSG_NOTE
, vect_location
, "swapping oprnds: ");
2129 dump_gimple_stmt (MSG_NOTE
, TDF_SLIM
, next_stmt
, 0);
2130 dump_printf (MSG_NOTE
, "\n");
2133 swap_ssa_operands (next_stmt
,
2134 gimple_assign_rhs1_ptr (next_stmt
),
2135 gimple_assign_rhs2_ptr (next_stmt
));
2136 update_stmt (next_stmt
);
2138 if (CONSTANT_CLASS_P (gimple_assign_rhs1 (next_stmt
)))
2139 LOOP_VINFO_OPERANDS_SWAPPED (loop_info
) = true;
2145 lhs
= gimple_assign_lhs (next_stmt
);
2146 next_stmt
= GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt
));
2149 /* Save the chain for further analysis in SLP detection. */
2150 first
= GROUP_FIRST_ELEMENT (vinfo_for_stmt (current_stmt
));
2151 LOOP_VINFO_REDUCTION_CHAINS (loop_info
).safe_push (first
);
2152 GROUP_SIZE (vinfo_for_stmt (first
)) = size
;
2158 /* Function vect_is_simple_reduction_1
2160 (1) Detect a cross-iteration def-use cycle that represents a simple
2161 reduction computation. We look for the following pattern:
2166 a2 = operation (a3, a1)
2173 a2 = operation (a3, a1)
2176 1. operation is commutative and associative and it is safe to
2177 change the order of the computation (if CHECK_REDUCTION is true)
2178 2. no uses for a2 in the loop (a2 is used out of the loop)
2179 3. no uses of a1 in the loop besides the reduction operation
2180 4. no uses of a1 outside the loop.
2182 Conditions 1,4 are tested here.
2183 Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.
2185 (2) Detect a cross-iteration def-use cycle in nested loops, i.e.,
2186 nested cycles, if CHECK_REDUCTION is false.
2188 (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double
2192 inner loop (def of a3)
2195 If MODIFY is true it tries also to rework the code in-place to enable
2196 detection of more reduction patterns. For the time being we rewrite
2197 "res -= RHS" into "rhs += -RHS" when it seems worthwhile.
2201 vect_is_simple_reduction_1 (loop_vec_info loop_info
, gimple phi
,
2202 bool check_reduction
, bool *double_reduc
,
2205 struct loop
*loop
= (gimple_bb (phi
))->loop_father
;
2206 struct loop
*vect_loop
= LOOP_VINFO_LOOP (loop_info
);
2207 edge latch_e
= loop_latch_edge (loop
);
2208 tree loop_arg
= PHI_ARG_DEF_FROM_EDGE (phi
, latch_e
);
2209 gimple def_stmt
, def1
= NULL
, def2
= NULL
;
2210 enum tree_code orig_code
, code
;
2211 tree op1
, op2
, op3
= NULL_TREE
, op4
= NULL_TREE
;
2215 imm_use_iterator imm_iter
;
2216 use_operand_p use_p
;
2219 *double_reduc
= false;
2221 /* If CHECK_REDUCTION is true, we assume inner-most loop vectorization,
2222 otherwise, we assume outer loop vectorization. */
2223 gcc_assert ((check_reduction
&& loop
== vect_loop
)
2224 || (!check_reduction
&& flow_loop_nested_p (vect_loop
, loop
)));
2226 name
= PHI_RESULT (phi
);
2227 /* ??? If there are no uses of the PHI result the inner loop reduction
2228 won't be detected as possibly double-reduction by vectorizable_reduction
2229 because that tries to walk the PHI arg from the preheader edge which
2230 can be constant. See PR60382. */
2231 if (has_zero_uses (name
))
2234 FOR_EACH_IMM_USE_FAST (use_p
, imm_iter
, name
)
2236 gimple use_stmt
= USE_STMT (use_p
);
2237 if (is_gimple_debug (use_stmt
))
2240 if (!flow_bb_inside_loop_p (loop
, gimple_bb (use_stmt
)))
2242 if (dump_enabled_p ())
2243 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2244 "intermediate value used outside loop.\n");
2249 if (vinfo_for_stmt (use_stmt
)
2250 && !is_pattern_stmt_p (vinfo_for_stmt (use_stmt
)))
2254 if (dump_enabled_p ())
2255 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2256 "reduction used in loop.\n");
2261 if (TREE_CODE (loop_arg
) != SSA_NAME
)
2263 if (dump_enabled_p ())
2265 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2266 "reduction: not ssa_name: ");
2267 dump_generic_expr (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, loop_arg
);
2268 dump_printf (MSG_MISSED_OPTIMIZATION
, "\n");
2273 def_stmt
= SSA_NAME_DEF_STMT (loop_arg
);
2276 if (dump_enabled_p ())
2277 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2278 "reduction: no def_stmt.\n");
2282 if (!is_gimple_assign (def_stmt
) && gimple_code (def_stmt
) != GIMPLE_PHI
)
2284 if (dump_enabled_p ())
2286 dump_gimple_stmt (MSG_NOTE
, TDF_SLIM
, def_stmt
, 0);
2287 dump_printf (MSG_NOTE
, "\n");
2292 if (is_gimple_assign (def_stmt
))
2294 name
= gimple_assign_lhs (def_stmt
);
2299 name
= PHI_RESULT (def_stmt
);
2304 FOR_EACH_IMM_USE_FAST (use_p
, imm_iter
, name
)
2306 gimple use_stmt
= USE_STMT (use_p
);
2307 if (is_gimple_debug (use_stmt
))
2309 if (flow_bb_inside_loop_p (loop
, gimple_bb (use_stmt
))
2310 && vinfo_for_stmt (use_stmt
)
2311 && !is_pattern_stmt_p (vinfo_for_stmt (use_stmt
)))
2315 if (dump_enabled_p ())
2316 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2317 "reduction used in loop.\n");
2322 /* If DEF_STMT is a phi node itself, we expect it to have a single argument
2323 defined in the inner loop. */
2326 op1
= PHI_ARG_DEF (def_stmt
, 0);
2328 if (gimple_phi_num_args (def_stmt
) != 1
2329 || TREE_CODE (op1
) != SSA_NAME
)
2331 if (dump_enabled_p ())
2332 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2333 "unsupported phi node definition.\n");
2338 def1
= SSA_NAME_DEF_STMT (op1
);
2339 if (gimple_bb (def1
)
2340 && flow_bb_inside_loop_p (loop
, gimple_bb (def_stmt
))
2342 && flow_bb_inside_loop_p (loop
->inner
, gimple_bb (def1
))
2343 && is_gimple_assign (def1
))
2345 if (dump_enabled_p ())
2346 report_vect_op (MSG_NOTE
, def_stmt
,
2347 "detected double reduction: ");
2349 *double_reduc
= true;
2356 code
= orig_code
= gimple_assign_rhs_code (def_stmt
);
2358 /* We can handle "res -= x[i]", which is non-associative by
2359 simply rewriting this into "res += -x[i]". Avoid changing
2360 gimple instruction for the first simple tests and only do this
2361 if we're allowed to change code at all. */
2362 if (code
== MINUS_EXPR
2364 && (op1
= gimple_assign_rhs1 (def_stmt
))
2365 && TREE_CODE (op1
) == SSA_NAME
2366 && SSA_NAME_DEF_STMT (op1
) == phi
)
2370 && (!commutative_tree_code (code
) || !associative_tree_code (code
)))
2372 if (dump_enabled_p ())
2373 report_vect_op (MSG_MISSED_OPTIMIZATION
, def_stmt
,
2374 "reduction: not commutative/associative: ");
2378 if (get_gimple_rhs_class (code
) != GIMPLE_BINARY_RHS
)
2380 if (code
!= COND_EXPR
)
2382 if (dump_enabled_p ())
2383 report_vect_op (MSG_MISSED_OPTIMIZATION
, def_stmt
,
2384 "reduction: not binary operation: ");
2389 op3
= gimple_assign_rhs1 (def_stmt
);
2390 if (COMPARISON_CLASS_P (op3
))
2392 op4
= TREE_OPERAND (op3
, 1);
2393 op3
= TREE_OPERAND (op3
, 0);
2396 op1
= gimple_assign_rhs2 (def_stmt
);
2397 op2
= gimple_assign_rhs3 (def_stmt
);
2399 if (TREE_CODE (op1
) != SSA_NAME
&& TREE_CODE (op2
) != SSA_NAME
)
2401 if (dump_enabled_p ())
2402 report_vect_op (MSG_MISSED_OPTIMIZATION
, def_stmt
,
2403 "reduction: uses not ssa_names: ");
2410 op1
= gimple_assign_rhs1 (def_stmt
);
2411 op2
= gimple_assign_rhs2 (def_stmt
);
2413 if (TREE_CODE (op1
) != SSA_NAME
&& TREE_CODE (op2
) != SSA_NAME
)
2415 if (dump_enabled_p ())
2416 report_vect_op (MSG_MISSED_OPTIMIZATION
, def_stmt
,
2417 "reduction: uses not ssa_names: ");
2423 type
= TREE_TYPE (gimple_assign_lhs (def_stmt
));
2424 if ((TREE_CODE (op1
) == SSA_NAME
2425 && !types_compatible_p (type
,TREE_TYPE (op1
)))
2426 || (TREE_CODE (op2
) == SSA_NAME
2427 && !types_compatible_p (type
, TREE_TYPE (op2
)))
2428 || (op3
&& TREE_CODE (op3
) == SSA_NAME
2429 && !types_compatible_p (type
, TREE_TYPE (op3
)))
2430 || (op4
&& TREE_CODE (op4
) == SSA_NAME
2431 && !types_compatible_p (type
, TREE_TYPE (op4
))))
2433 if (dump_enabled_p ())
2435 dump_printf_loc (MSG_NOTE
, vect_location
,
2436 "reduction: multiple types: operation type: ");
2437 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, type
);
2438 dump_printf (MSG_NOTE
, ", operands types: ");
2439 dump_generic_expr (MSG_NOTE
, TDF_SLIM
,
2441 dump_printf (MSG_NOTE
, ",");
2442 dump_generic_expr (MSG_NOTE
, TDF_SLIM
,
2446 dump_printf (MSG_NOTE
, ",");
2447 dump_generic_expr (MSG_NOTE
, TDF_SLIM
,
2453 dump_printf (MSG_NOTE
, ",");
2454 dump_generic_expr (MSG_NOTE
, TDF_SLIM
,
2457 dump_printf (MSG_NOTE
, "\n");
2463 /* Check that it's ok to change the order of the computation.
2464 Generally, when vectorizing a reduction we change the order of the
2465 computation. This may change the behavior of the program in some
2466 cases, so we need to check that this is ok. One exception is when
2467 vectorizing an outer-loop: the inner-loop is executed sequentially,
2468 and therefore vectorizing reductions in the inner-loop during
2469 outer-loop vectorization is safe. */
2471 /* CHECKME: check for !flag_finite_math_only too? */
2472 if (SCALAR_FLOAT_TYPE_P (type
) && !flag_associative_math
2475 /* Changing the order of operations changes the semantics. */
2476 if (dump_enabled_p ())
2477 report_vect_op (MSG_MISSED_OPTIMIZATION
, def_stmt
,
2478 "reduction: unsafe fp math optimization: ");
2481 else if (INTEGRAL_TYPE_P (type
) && TYPE_OVERFLOW_TRAPS (type
)
2484 /* Changing the order of operations changes the semantics. */
2485 if (dump_enabled_p ())
2486 report_vect_op (MSG_MISSED_OPTIMIZATION
, def_stmt
,
2487 "reduction: unsafe int math optimization: ");
2490 else if (SAT_FIXED_POINT_TYPE_P (type
) && check_reduction
)
2492 /* Changing the order of operations changes the semantics. */
2493 if (dump_enabled_p ())
2494 report_vect_op (MSG_MISSED_OPTIMIZATION
, def_stmt
,
2495 "reduction: unsafe fixed-point math optimization: ");
2499 /* If we detected "res -= x[i]" earlier, rewrite it into
2500 "res += -x[i]" now. If this turns out to be useless reassoc
2501 will clean it up again. */
2502 if (orig_code
== MINUS_EXPR
)
2504 tree rhs
= gimple_assign_rhs2 (def_stmt
);
2505 tree negrhs
= make_ssa_name (TREE_TYPE (rhs
));
2506 gimple negate_stmt
= gimple_build_assign (negrhs
, NEGATE_EXPR
, rhs
);
2507 gimple_stmt_iterator gsi
= gsi_for_stmt (def_stmt
);
2508 set_vinfo_for_stmt (negate_stmt
, new_stmt_vec_info (negate_stmt
,
2510 gsi_insert_before (&gsi
, negate_stmt
, GSI_NEW_STMT
);
2511 gimple_assign_set_rhs2 (def_stmt
, negrhs
);
2512 gimple_assign_set_rhs_code (def_stmt
, PLUS_EXPR
);
2513 update_stmt (def_stmt
);
2516 /* Reduction is safe. We're dealing with one of the following:
2517 1) integer arithmetic and no trapv
2518 2) floating point arithmetic, and special flags permit this optimization
2519 3) nested cycle (i.e., outer loop vectorization). */
2520 if (TREE_CODE (op1
) == SSA_NAME
)
2521 def1
= SSA_NAME_DEF_STMT (op1
);
2523 if (TREE_CODE (op2
) == SSA_NAME
)
2524 def2
= SSA_NAME_DEF_STMT (op2
);
2526 if (code
!= COND_EXPR
2527 && ((!def1
|| gimple_nop_p (def1
)) && (!def2
|| gimple_nop_p (def2
))))
2529 if (dump_enabled_p ())
2530 report_vect_op (MSG_NOTE
, def_stmt
, "reduction: no defs for operands: ");
2534 /* Check that one def is the reduction def, defined by PHI,
2535 the other def is either defined in the loop ("vect_internal_def"),
2536 or it's an induction (defined by a loop-header phi-node). */
2538 if (def2
&& def2
== phi
2539 && (code
== COND_EXPR
2540 || !def1
|| gimple_nop_p (def1
)
2541 || !flow_bb_inside_loop_p (loop
, gimple_bb (def1
))
2542 || (def1
&& flow_bb_inside_loop_p (loop
, gimple_bb (def1
))
2543 && (is_gimple_assign (def1
)
2544 || is_gimple_call (def1
)
2545 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1
))
2546 == vect_induction_def
2547 || (gimple_code (def1
) == GIMPLE_PHI
2548 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1
))
2549 == vect_internal_def
2550 && !is_loop_header_bb_p (gimple_bb (def1
)))))))
2552 if (dump_enabled_p ())
2553 report_vect_op (MSG_NOTE
, def_stmt
, "detected reduction: ");
2557 if (def1
&& def1
== phi
2558 && (code
== COND_EXPR
2559 || !def2
|| gimple_nop_p (def2
)
2560 || !flow_bb_inside_loop_p (loop
, gimple_bb (def2
))
2561 || (def2
&& flow_bb_inside_loop_p (loop
, gimple_bb (def2
))
2562 && (is_gimple_assign (def2
)
2563 || is_gimple_call (def2
)
2564 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2
))
2565 == vect_induction_def
2566 || (gimple_code (def2
) == GIMPLE_PHI
2567 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2
))
2568 == vect_internal_def
2569 && !is_loop_header_bb_p (gimple_bb (def2
)))))))
2571 if (check_reduction
)
2573 /* Swap operands (just for simplicity - so that the rest of the code
2574 can assume that the reduction variable is always the last (second)
2576 if (dump_enabled_p ())
2577 report_vect_op (MSG_NOTE
, def_stmt
,
2578 "detected reduction: need to swap operands: ");
2580 swap_ssa_operands (def_stmt
, gimple_assign_rhs1_ptr (def_stmt
),
2581 gimple_assign_rhs2_ptr (def_stmt
));
2583 if (CONSTANT_CLASS_P (gimple_assign_rhs1 (def_stmt
)))
2584 LOOP_VINFO_OPERANDS_SWAPPED (loop_info
) = true;
2588 if (dump_enabled_p ())
2589 report_vect_op (MSG_NOTE
, def_stmt
, "detected reduction: ");
2595 /* Try to find SLP reduction chain. */
2596 if (check_reduction
&& vect_is_slp_reduction (loop_info
, phi
, def_stmt
))
2598 if (dump_enabled_p ())
2599 report_vect_op (MSG_NOTE
, def_stmt
,
2600 "reduction: detected reduction chain: ");
2605 if (dump_enabled_p ())
2606 report_vect_op (MSG_MISSED_OPTIMIZATION
, def_stmt
,
2607 "reduction: unknown pattern: ");
2612 /* Wrapper around vect_is_simple_reduction_1, that won't modify code
2613 in-place. Arguments as there. */
2616 vect_is_simple_reduction (loop_vec_info loop_info
, gimple phi
,
2617 bool check_reduction
, bool *double_reduc
)
2619 return vect_is_simple_reduction_1 (loop_info
, phi
, check_reduction
,
2620 double_reduc
, false);
2623 /* Wrapper around vect_is_simple_reduction_1, which will modify code
2624 in-place if it enables detection of more reductions. Arguments
2628 vect_force_simple_reduction (loop_vec_info loop_info
, gimple phi
,
2629 bool check_reduction
, bool *double_reduc
)
2631 return vect_is_simple_reduction_1 (loop_info
, phi
, check_reduction
,
2632 double_reduc
, true);
2635 /* Calculate the cost of one scalar iteration of the loop. */
2637 vect_get_single_scalar_iteration_cost (loop_vec_info loop_vinfo
)
2639 struct loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
2640 basic_block
*bbs
= LOOP_VINFO_BBS (loop_vinfo
);
2641 int nbbs
= loop
->num_nodes
, factor
, scalar_single_iter_cost
= 0;
2642 int innerloop_iters
, i
, stmt_cost
;
2644 /* Count statements in scalar loop. Using this as scalar cost for a single
2647 TODO: Add outer loop support.
2649 TODO: Consider assigning different costs to different scalar
2653 innerloop_iters
= 1;
2655 innerloop_iters
= 50; /* FIXME */
2657 for (i
= 0; i
< nbbs
; i
++)
2659 gimple_stmt_iterator si
;
2660 basic_block bb
= bbs
[i
];
2662 if (bb
->loop_father
== loop
->inner
)
2663 factor
= innerloop_iters
;
2667 for (si
= gsi_start_bb (bb
); !gsi_end_p (si
); gsi_next (&si
))
2669 gimple stmt
= gsi_stmt (si
);
2670 stmt_vec_info stmt_info
= vinfo_for_stmt (stmt
);
2672 if (!is_gimple_assign (stmt
) && !is_gimple_call (stmt
))
2675 /* Skip stmts that are not vectorized inside the loop. */
2677 && !STMT_VINFO_RELEVANT_P (stmt_info
)
2678 && (!STMT_VINFO_LIVE_P (stmt_info
)
2679 || !VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info
)))
2680 && !STMT_VINFO_IN_PATTERN_P (stmt_info
))
2683 if (STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt
)))
2685 if (DR_IS_READ (STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt
))))
2686 stmt_cost
= vect_get_stmt_cost (scalar_load
);
2688 stmt_cost
= vect_get_stmt_cost (scalar_store
);
2691 stmt_cost
= vect_get_stmt_cost (scalar_stmt
);
2693 scalar_single_iter_cost
+= stmt_cost
* factor
;
2696 return scalar_single_iter_cost
;
2699 /* Calculate cost of peeling the loop PEEL_ITERS_PROLOGUE times. */
2701 vect_get_known_peeling_cost (loop_vec_info loop_vinfo
, int peel_iters_prologue
,
2702 int *peel_iters_epilogue
,
2703 int scalar_single_iter_cost
,
2704 stmt_vector_for_cost
*prologue_cost_vec
,
2705 stmt_vector_for_cost
*epilogue_cost_vec
)
2708 int vf
= LOOP_VINFO_VECT_FACTOR (loop_vinfo
);
2710 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo
))
2712 *peel_iters_epilogue
= vf
/2;
2713 if (dump_enabled_p ())
2714 dump_printf_loc (MSG_NOTE
, vect_location
,
2715 "cost model: epilogue peel iters set to vf/2 "
2716 "because loop iterations are unknown .\n");
2718 /* If peeled iterations are known but number of scalar loop
2719 iterations are unknown, count a taken branch per peeled loop. */
2720 retval
= record_stmt_cost (prologue_cost_vec
, 2, cond_branch_taken
,
2721 NULL
, 0, vect_prologue
);
2725 int niters
= LOOP_VINFO_INT_NITERS (loop_vinfo
);
2726 peel_iters_prologue
= niters
< peel_iters_prologue
?
2727 niters
: peel_iters_prologue
;
2728 *peel_iters_epilogue
= (niters
- peel_iters_prologue
) % vf
;
2729 /* If we need to peel for gaps, but no peeling is required, we have to
2730 peel VF iterations. */
2731 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo
) && !*peel_iters_epilogue
)
2732 *peel_iters_epilogue
= vf
;
2735 if (peel_iters_prologue
)
2736 retval
+= record_stmt_cost (prologue_cost_vec
,
2737 peel_iters_prologue
* scalar_single_iter_cost
,
2738 scalar_stmt
, NULL
, 0, vect_prologue
);
2739 if (*peel_iters_epilogue
)
2740 retval
+= record_stmt_cost (epilogue_cost_vec
,
2741 *peel_iters_epilogue
* scalar_single_iter_cost
,
2742 scalar_stmt
, NULL
, 0, vect_epilogue
);
2746 /* Function vect_estimate_min_profitable_iters
2748 Return the number of iterations required for the vector version of the
2749 loop to be profitable relative to the cost of the scalar version of the
2753 vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo
,
2754 int *ret_min_profitable_niters
,
2755 int *ret_min_profitable_estimate
)
2757 int min_profitable_iters
;
2758 int min_profitable_estimate
;
2759 int peel_iters_prologue
;
2760 int peel_iters_epilogue
;
2761 unsigned vec_inside_cost
= 0;
2762 int vec_outside_cost
= 0;
2763 unsigned vec_prologue_cost
= 0;
2764 unsigned vec_epilogue_cost
= 0;
2765 int scalar_single_iter_cost
= 0;
2766 int scalar_outside_cost
= 0;
2767 int vf
= LOOP_VINFO_VECT_FACTOR (loop_vinfo
);
2768 int npeel
= LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo
);
2769 void *target_cost_data
= LOOP_VINFO_TARGET_COST_DATA (loop_vinfo
);
2771 /* Cost model disabled. */
2772 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo
)))
2774 dump_printf_loc (MSG_NOTE
, vect_location
, "cost model disabled.\n");
2775 *ret_min_profitable_niters
= 0;
2776 *ret_min_profitable_estimate
= 0;
2780 /* Requires loop versioning tests to handle misalignment. */
2781 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo
))
2783 /* FIXME: Make cost depend on complexity of individual check. */
2784 unsigned len
= LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo
).length ();
2785 (void) add_stmt_cost (target_cost_data
, len
, vector_stmt
, NULL
, 0,
2787 dump_printf (MSG_NOTE
,
2788 "cost model: Adding cost of checks for loop "
2789 "versioning to treat misalignment.\n");
2792 /* Requires loop versioning with alias checks. */
2793 if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo
))
2795 /* FIXME: Make cost depend on complexity of individual check. */
2796 unsigned len
= LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo
).length ();
2797 (void) add_stmt_cost (target_cost_data
, len
, vector_stmt
, NULL
, 0,
2799 dump_printf (MSG_NOTE
,
2800 "cost model: Adding cost of checks for loop "
2801 "versioning aliasing.\n");
2804 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo
)
2805 || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo
))
2806 (void) add_stmt_cost (target_cost_data
, 1, cond_branch_taken
, NULL
, 0,
2809 /* Count statements in scalar loop. Using this as scalar cost for a single
2812 TODO: Add outer loop support.
2814 TODO: Consider assigning different costs to different scalar
2817 scalar_single_iter_cost
= vect_get_single_scalar_iteration_cost (loop_vinfo
);
2819 /* Add additional cost for the peeled instructions in prologue and epilogue
2822 FORNOW: If we don't know the value of peel_iters for prologue or epilogue
2823 at compile-time - we assume it's vf/2 (the worst would be vf-1).
2825 TODO: Build an expression that represents peel_iters for prologue and
2826 epilogue to be used in a run-time test. */
2830 peel_iters_prologue
= vf
/2;
2831 dump_printf (MSG_NOTE
, "cost model: "
2832 "prologue peel iters set to vf/2.\n");
2834 /* If peeling for alignment is unknown, loop bound of main loop becomes
2836 peel_iters_epilogue
= vf
/2;
2837 dump_printf (MSG_NOTE
, "cost model: "
2838 "epilogue peel iters set to vf/2 because "
2839 "peeling for alignment is unknown.\n");
2841 /* If peeled iterations are unknown, count a taken branch and a not taken
2842 branch per peeled loop. Even if scalar loop iterations are known,
2843 vector iterations are not known since peeled prologue iterations are
2844 not known. Hence guards remain the same. */
2845 (void) add_stmt_cost (target_cost_data
, 2, cond_branch_taken
,
2846 NULL
, 0, vect_prologue
);
2847 (void) add_stmt_cost (target_cost_data
, 2, cond_branch_not_taken
,
2848 NULL
, 0, vect_prologue
);
2849 /* FORNOW: Don't attempt to pass individual scalar instructions to
2850 the model; just assume linear cost for scalar iterations. */
2851 (void) add_stmt_cost (target_cost_data
,
2852 peel_iters_prologue
* scalar_single_iter_cost
,
2853 scalar_stmt
, NULL
, 0, vect_prologue
);
2854 (void) add_stmt_cost (target_cost_data
,
2855 peel_iters_epilogue
* scalar_single_iter_cost
,
2856 scalar_stmt
, NULL
, 0, vect_epilogue
);
2860 stmt_vector_for_cost prologue_cost_vec
, epilogue_cost_vec
;
2861 stmt_info_for_cost
*si
;
2863 void *data
= LOOP_VINFO_TARGET_COST_DATA (loop_vinfo
);
2865 prologue_cost_vec
.create (2);
2866 epilogue_cost_vec
.create (2);
2867 peel_iters_prologue
= npeel
;
2869 (void) vect_get_known_peeling_cost (loop_vinfo
, peel_iters_prologue
,
2870 &peel_iters_epilogue
,
2871 scalar_single_iter_cost
,
2873 &epilogue_cost_vec
);
2875 FOR_EACH_VEC_ELT (prologue_cost_vec
, j
, si
)
2877 struct _stmt_vec_info
*stmt_info
2878 = si
->stmt
? vinfo_for_stmt (si
->stmt
) : NULL
;
2879 (void) add_stmt_cost (data
, si
->count
, si
->kind
, stmt_info
,
2880 si
->misalign
, vect_prologue
);
2883 FOR_EACH_VEC_ELT (epilogue_cost_vec
, j
, si
)
2885 struct _stmt_vec_info
*stmt_info
2886 = si
->stmt
? vinfo_for_stmt (si
->stmt
) : NULL
;
2887 (void) add_stmt_cost (data
, si
->count
, si
->kind
, stmt_info
,
2888 si
->misalign
, vect_epilogue
);
2891 prologue_cost_vec
.release ();
2892 epilogue_cost_vec
.release ();
2895 /* FORNOW: The scalar outside cost is incremented in one of the
2898 1. The vectorizer checks for alignment and aliasing and generates
2899 a condition that allows dynamic vectorization. A cost model
2900 check is ANDED with the versioning condition. Hence scalar code
2901 path now has the added cost of the versioning check.
2903 if (cost > th & versioning_check)
2906 Hence run-time scalar is incremented by not-taken branch cost.
2908 2. The vectorizer then checks if a prologue is required. If the
2909 cost model check was not done before during versioning, it has to
2910 be done before the prologue check.
2913 prologue = scalar_iters
2918 if (prologue == num_iters)
2921 Hence the run-time scalar cost is incremented by a taken branch,
2922 plus a not-taken branch, plus a taken branch cost.
2924 3. The vectorizer then checks if an epilogue is required. If the
2925 cost model check was not done before during prologue check, it
2926 has to be done with the epilogue check.
2932 if (prologue == num_iters)
2935 if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
2938 Hence the run-time scalar cost should be incremented by 2 taken
2941 TODO: The back end may reorder the BBS's differently and reverse
2942 conditions/branch directions. Change the estimates below to
2943 something more reasonable. */
2945 /* If the number of iterations is known and we do not do versioning, we can
2946 decide whether to vectorize at compile time. Hence the scalar version
2947 do not carry cost model guard costs. */
2948 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo
)
2949 || LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo
)
2950 || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo
))
2952 /* Cost model check occurs at versioning. */
2953 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo
)
2954 || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo
))
2955 scalar_outside_cost
+= vect_get_stmt_cost (cond_branch_not_taken
);
2958 /* Cost model check occurs at prologue generation. */
2959 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo
) < 0)
2960 scalar_outside_cost
+= 2 * vect_get_stmt_cost (cond_branch_taken
)
2961 + vect_get_stmt_cost (cond_branch_not_taken
);
2962 /* Cost model check occurs at epilogue generation. */
2964 scalar_outside_cost
+= 2 * vect_get_stmt_cost (cond_branch_taken
);
2968 /* Complete the target-specific cost calculations. */
2969 finish_cost (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo
), &vec_prologue_cost
,
2970 &vec_inside_cost
, &vec_epilogue_cost
);
2972 vec_outside_cost
= (int)(vec_prologue_cost
+ vec_epilogue_cost
);
2974 /* Calculate number of iterations required to make the vector version
2975 profitable, relative to the loop bodies only. The following condition
2977 SIC * niters + SOC > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC
2979 SIC = scalar iteration cost, VIC = vector iteration cost,
2980 VOC = vector outside cost, VF = vectorization factor,
2981 PL_ITERS = prologue iterations, EP_ITERS= epilogue iterations
2982 SOC = scalar outside cost for run time cost model check. */
2984 if ((scalar_single_iter_cost
* vf
) > (int) vec_inside_cost
)
2986 if (vec_outside_cost
<= 0)
2987 min_profitable_iters
= 1;
2990 min_profitable_iters
= ((vec_outside_cost
- scalar_outside_cost
) * vf
2991 - vec_inside_cost
* peel_iters_prologue
2992 - vec_inside_cost
* peel_iters_epilogue
)
2993 / ((scalar_single_iter_cost
* vf
)
2996 if ((scalar_single_iter_cost
* vf
* min_profitable_iters
)
2997 <= (((int) vec_inside_cost
* min_profitable_iters
)
2998 + (((int) vec_outside_cost
- scalar_outside_cost
) * vf
)))
2999 min_profitable_iters
++;
3002 /* vector version will never be profitable. */
3005 if (LOOP_VINFO_LOOP (loop_vinfo
)->force_vectorize
)
3006 warning_at (vect_location
, OPT_Wopenmp_simd
, "vectorization "
3007 "did not happen for a simd loop");
3009 if (dump_enabled_p ())
3010 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3011 "cost model: the vector iteration cost = %d "
3012 "divided by the scalar iteration cost = %d "
3013 "is greater or equal to the vectorization factor = %d"
3015 vec_inside_cost
, scalar_single_iter_cost
, vf
);
3016 *ret_min_profitable_niters
= -1;
3017 *ret_min_profitable_estimate
= -1;
3021 if (dump_enabled_p ())
3023 dump_printf_loc (MSG_NOTE
, vect_location
, "Cost model analysis: \n");
3024 dump_printf (MSG_NOTE
, " Vector inside of loop cost: %d\n",
3026 dump_printf (MSG_NOTE
, " Vector prologue cost: %d\n",
3028 dump_printf (MSG_NOTE
, " Vector epilogue cost: %d\n",
3030 dump_printf (MSG_NOTE
, " Scalar iteration cost: %d\n",
3031 scalar_single_iter_cost
);
3032 dump_printf (MSG_NOTE
, " Scalar outside cost: %d\n",
3033 scalar_outside_cost
);
3034 dump_printf (MSG_NOTE
, " Vector outside cost: %d\n",
3036 dump_printf (MSG_NOTE
, " prologue iterations: %d\n",
3037 peel_iters_prologue
);
3038 dump_printf (MSG_NOTE
, " epilogue iterations: %d\n",
3039 peel_iters_epilogue
);
3040 dump_printf (MSG_NOTE
,
3041 " Calculated minimum iters for profitability: %d\n",
3042 min_profitable_iters
);
3043 dump_printf (MSG_NOTE
, "\n");
3046 min_profitable_iters
=
3047 min_profitable_iters
< vf
? vf
: min_profitable_iters
;
3049 /* Because the condition we create is:
3050 if (niters <= min_profitable_iters)
3051 then skip the vectorized loop. */
3052 min_profitable_iters
--;
3054 if (dump_enabled_p ())
3055 dump_printf_loc (MSG_NOTE
, vect_location
,
3056 " Runtime profitability threshold = %d\n",
3057 min_profitable_iters
);
3059 *ret_min_profitable_niters
= min_profitable_iters
;
3061 /* Calculate number of iterations required to make the vector version
3062 profitable, relative to the loop bodies only.
3064 Non-vectorized variant is SIC * niters and it must win over vector
3065 variant on the expected loop trip count. The following condition must hold true:
3066 SIC * niters > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC + SOC */
3068 if (vec_outside_cost
<= 0)
3069 min_profitable_estimate
= 1;
3072 min_profitable_estimate
= ((vec_outside_cost
+ scalar_outside_cost
) * vf
3073 - vec_inside_cost
* peel_iters_prologue
3074 - vec_inside_cost
* peel_iters_epilogue
)
3075 / ((scalar_single_iter_cost
* vf
)
3078 min_profitable_estimate
--;
3079 min_profitable_estimate
= MAX (min_profitable_estimate
, min_profitable_iters
);
3080 if (dump_enabled_p ())
3081 dump_printf_loc (MSG_NOTE
, vect_location
,
3082 " Static estimate profitability threshold = %d\n",
3083 min_profitable_iters
);
3085 *ret_min_profitable_estimate
= min_profitable_estimate
;
3088 /* Writes into SEL a mask for a vec_perm, equivalent to a vec_shr by OFFSET
3089 vector elements (not bits) for a vector of mode MODE. */
3091 calc_vec_perm_mask_for_shift (enum machine_mode mode
, unsigned int offset
,
3094 unsigned int i
, nelt
= GET_MODE_NUNITS (mode
);
3096 for (i
= 0; i
< nelt
; i
++)
3097 sel
[i
] = (i
+ offset
) & (2*nelt
- 1);
3100 /* Checks whether the target supports whole-vector shifts for vectors of mode
3101 MODE. This is the case if _either_ the platform handles vec_shr_optab, _or_
3102 it supports vec_perm_const with masks for all necessary shift amounts. */
3104 have_whole_vector_shift (enum machine_mode mode
)
3106 if (optab_handler (vec_shr_optab
, mode
) != CODE_FOR_nothing
)
3109 if (direct_optab_handler (vec_perm_const_optab
, mode
) == CODE_FOR_nothing
)
3112 unsigned int i
, nelt
= GET_MODE_NUNITS (mode
);
3113 unsigned char *sel
= XALLOCAVEC (unsigned char, nelt
);
3115 for (i
= nelt
/2; i
>= 1; i
/=2)
3117 calc_vec_perm_mask_for_shift (mode
, i
, sel
);
3118 if (!can_vec_perm_p (mode
, false, sel
))
3124 /* TODO: Close dependency between vect_model_*_cost and vectorizable_*
3125 functions. Design better to avoid maintenance issues. */
3127 /* Function vect_model_reduction_cost.
3129 Models cost for a reduction operation, including the vector ops
3130 generated within the strip-mine loop, the initial definition before
3131 the loop, and the epilogue code that must be generated. */
3134 vect_model_reduction_cost (stmt_vec_info stmt_info
, enum tree_code reduc_code
,
3137 int prologue_cost
= 0, epilogue_cost
= 0;
3138 enum tree_code code
;
3141 gimple stmt
, orig_stmt
;
3144 loop_vec_info loop_vinfo
= STMT_VINFO_LOOP_VINFO (stmt_info
);
3145 struct loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
3146 void *target_cost_data
= LOOP_VINFO_TARGET_COST_DATA (loop_vinfo
);
3148 /* Cost of reduction op inside loop. */
3149 unsigned inside_cost
= add_stmt_cost (target_cost_data
, ncopies
, vector_stmt
,
3150 stmt_info
, 0, vect_body
);
3151 stmt
= STMT_VINFO_STMT (stmt_info
);
3153 switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt
)))
3155 case GIMPLE_SINGLE_RHS
:
3156 gcc_assert (TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt
)) == ternary_op
);
3157 reduction_op
= TREE_OPERAND (gimple_assign_rhs1 (stmt
), 2);
3159 case GIMPLE_UNARY_RHS
:
3160 reduction_op
= gimple_assign_rhs1 (stmt
);
3162 case GIMPLE_BINARY_RHS
:
3163 reduction_op
= gimple_assign_rhs2 (stmt
);
3165 case GIMPLE_TERNARY_RHS
:
3166 reduction_op
= gimple_assign_rhs3 (stmt
);
3172 vectype
= get_vectype_for_scalar_type (TREE_TYPE (reduction_op
));
3175 if (dump_enabled_p ())
3177 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3178 "unsupported data-type ");
3179 dump_generic_expr (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
,
3180 TREE_TYPE (reduction_op
));
3181 dump_printf (MSG_MISSED_OPTIMIZATION
, "\n");
3186 mode
= TYPE_MODE (vectype
);
3187 orig_stmt
= STMT_VINFO_RELATED_STMT (stmt_info
);
3190 orig_stmt
= STMT_VINFO_STMT (stmt_info
);
3192 code
= gimple_assign_rhs_code (orig_stmt
);
3194 /* Add in cost for initial definition. */
3195 prologue_cost
+= add_stmt_cost (target_cost_data
, 1, scalar_to_vec
,
3196 stmt_info
, 0, vect_prologue
);
3198 /* Determine cost of epilogue code.
3200 We have a reduction operator that will reduce the vector in one statement.
3201 Also requires scalar extract. */
3203 if (!nested_in_vect_loop_p (loop
, orig_stmt
))
3205 if (reduc_code
!= ERROR_MARK
)
3207 epilogue_cost
+= add_stmt_cost (target_cost_data
, 1, vector_stmt
,
3208 stmt_info
, 0, vect_epilogue
);
3209 epilogue_cost
+= add_stmt_cost (target_cost_data
, 1, vec_to_scalar
,
3210 stmt_info
, 0, vect_epilogue
);
3214 int vec_size_in_bits
= tree_to_uhwi (TYPE_SIZE (vectype
));
3216 TYPE_SIZE (TREE_TYPE (gimple_assign_lhs (orig_stmt
)));
3217 int element_bitsize
= tree_to_uhwi (bitsize
);
3218 int nelements
= vec_size_in_bits
/ element_bitsize
;
3220 optab
= optab_for_tree_code (code
, vectype
, optab_default
);
3222 /* We have a whole vector shift available. */
3223 if (VECTOR_MODE_P (mode
)
3224 && optab_handler (optab
, mode
) != CODE_FOR_nothing
3225 && have_whole_vector_shift (mode
))
3227 /* Final reduction via vector shifts and the reduction operator.
3228 Also requires scalar extract. */
3229 epilogue_cost
+= add_stmt_cost (target_cost_data
,
3230 exact_log2 (nelements
) * 2,
3231 vector_stmt
, stmt_info
, 0,
3233 epilogue_cost
+= add_stmt_cost (target_cost_data
, 1,
3234 vec_to_scalar
, stmt_info
, 0,
3238 /* Use extracts and reduction op for final reduction. For N
3239 elements, we have N extracts and N-1 reduction ops. */
3240 epilogue_cost
+= add_stmt_cost (target_cost_data
,
3241 nelements
+ nelements
- 1,
3242 vector_stmt
, stmt_info
, 0,
3247 if (dump_enabled_p ())
3248 dump_printf (MSG_NOTE
,
3249 "vect_model_reduction_cost: inside_cost = %d, "
3250 "prologue_cost = %d, epilogue_cost = %d .\n", inside_cost
,
3251 prologue_cost
, epilogue_cost
);
3257 /* Function vect_model_induction_cost.
3259 Models cost for induction operations. */
3262 vect_model_induction_cost (stmt_vec_info stmt_info
, int ncopies
)
3264 loop_vec_info loop_vinfo
= STMT_VINFO_LOOP_VINFO (stmt_info
);
3265 void *target_cost_data
= LOOP_VINFO_TARGET_COST_DATA (loop_vinfo
);
3266 unsigned inside_cost
, prologue_cost
;
3268 /* loop cost for vec_loop. */
3269 inside_cost
= add_stmt_cost (target_cost_data
, ncopies
, vector_stmt
,
3270 stmt_info
, 0, vect_body
);
3272 /* prologue cost for vec_init and vec_step. */
3273 prologue_cost
= add_stmt_cost (target_cost_data
, 2, scalar_to_vec
,
3274 stmt_info
, 0, vect_prologue
);
3276 if (dump_enabled_p ())
3277 dump_printf_loc (MSG_NOTE
, vect_location
,
3278 "vect_model_induction_cost: inside_cost = %d, "
3279 "prologue_cost = %d .\n", inside_cost
, prologue_cost
);
3283 /* Function get_initial_def_for_induction
3286 STMT - a stmt that performs an induction operation in the loop.
3287 IV_PHI - the initial value of the induction variable
3290 Return a vector variable, initialized with the first VF values of
3291 the induction variable. E.g., for an iv with IV_PHI='X' and
3292 evolution S, for a vector of 4 units, we want to return:
3293 [X, X + S, X + 2*S, X + 3*S]. */
3296 get_initial_def_for_induction (gimple iv_phi
)
3298 stmt_vec_info stmt_vinfo
= vinfo_for_stmt (iv_phi
);
3299 loop_vec_info loop_vinfo
= STMT_VINFO_LOOP_VINFO (stmt_vinfo
);
3300 struct loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
3303 edge pe
= loop_preheader_edge (loop
);
3304 struct loop
*iv_loop
;
3306 tree new_vec
, vec_init
, vec_step
, t
;
3309 gimple init_stmt
, new_stmt
;
3310 gphi
*induction_phi
;
3311 tree induc_def
, vec_def
, vec_dest
;
3312 tree init_expr
, step_expr
;
3313 int vf
= LOOP_VINFO_VECT_FACTOR (loop_vinfo
);
3317 stmt_vec_info phi_info
= vinfo_for_stmt (iv_phi
);
3318 bool nested_in_vect_loop
= false;
3319 gimple_seq stmts
= NULL
;
3320 imm_use_iterator imm_iter
;
3321 use_operand_p use_p
;
3325 gimple_stmt_iterator si
;
3326 basic_block bb
= gimple_bb (iv_phi
);
3330 /* Is phi in an inner-loop, while vectorizing an enclosing outer-loop? */
3331 if (nested_in_vect_loop_p (loop
, iv_phi
))
3333 nested_in_vect_loop
= true;
3334 iv_loop
= loop
->inner
;
3338 gcc_assert (iv_loop
== (gimple_bb (iv_phi
))->loop_father
);
3340 latch_e
= loop_latch_edge (iv_loop
);
3341 loop_arg
= PHI_ARG_DEF_FROM_EDGE (iv_phi
, latch_e
);
3343 step_expr
= STMT_VINFO_LOOP_PHI_EVOLUTION_PART (phi_info
);
3344 gcc_assert (step_expr
!= NULL_TREE
);
3346 pe
= loop_preheader_edge (iv_loop
);
3347 init_expr
= PHI_ARG_DEF_FROM_EDGE (iv_phi
,
3348 loop_preheader_edge (iv_loop
));
3350 vectype
= get_vectype_for_scalar_type (TREE_TYPE (init_expr
));
3351 resvectype
= get_vectype_for_scalar_type (TREE_TYPE (PHI_RESULT (iv_phi
)));
3352 gcc_assert (vectype
);
3353 nunits
= TYPE_VECTOR_SUBPARTS (vectype
);
3354 ncopies
= vf
/ nunits
;
3356 gcc_assert (phi_info
);
3357 gcc_assert (ncopies
>= 1);
3359 /* Convert the step to the desired type. */
3360 step_expr
= force_gimple_operand (fold_convert (TREE_TYPE (vectype
),
3362 &stmts
, true, NULL_TREE
);
3365 new_bb
= gsi_insert_seq_on_edge_immediate (pe
, stmts
);
3366 gcc_assert (!new_bb
);
3369 /* Find the first insertion point in the BB. */
3370 si
= gsi_after_labels (bb
);
3372 /* Create the vector that holds the initial_value of the induction. */
3373 if (nested_in_vect_loop
)
3375 /* iv_loop is nested in the loop to be vectorized. init_expr had already
3376 been created during vectorization of previous stmts. We obtain it
3377 from the STMT_VINFO_VEC_STMT of the defining stmt. */
3378 vec_init
= vect_get_vec_def_for_operand (init_expr
, iv_phi
, NULL
);
3379 /* If the initial value is not of proper type, convert it. */
3380 if (!useless_type_conversion_p (vectype
, TREE_TYPE (vec_init
)))
3383 = gimple_build_assign (vect_get_new_vect_var (vectype
,
3387 build1 (VIEW_CONVERT_EXPR
, vectype
,
3389 vec_init
= make_ssa_name (gimple_assign_lhs (new_stmt
), new_stmt
);
3390 gimple_assign_set_lhs (new_stmt
, vec_init
);
3391 new_bb
= gsi_insert_on_edge_immediate (loop_preheader_edge (iv_loop
),
3393 gcc_assert (!new_bb
);
3394 set_vinfo_for_stmt (new_stmt
,
3395 new_stmt_vec_info (new_stmt
, loop_vinfo
, NULL
));
3400 vec
<constructor_elt
, va_gc
> *v
;
3402 /* iv_loop is the loop to be vectorized. Create:
3403 vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr) */
3404 new_var
= vect_get_new_vect_var (TREE_TYPE (vectype
),
3405 vect_scalar_var
, "var_");
3406 new_name
= force_gimple_operand (fold_convert (TREE_TYPE (vectype
),
3408 &stmts
, false, new_var
);
3411 new_bb
= gsi_insert_seq_on_edge_immediate (pe
, stmts
);
3412 gcc_assert (!new_bb
);
3415 vec_alloc (v
, nunits
);
3416 bool constant_p
= is_gimple_min_invariant (new_name
);
3417 CONSTRUCTOR_APPEND_ELT (v
, NULL_TREE
, new_name
);
3418 for (i
= 1; i
< nunits
; i
++)
3420 /* Create: new_name_i = new_name + step_expr */
3421 new_name
= fold_build2 (PLUS_EXPR
, TREE_TYPE (new_name
),
3422 new_name
, step_expr
);
3423 if (!is_gimple_min_invariant (new_name
))
3425 init_stmt
= gimple_build_assign (new_var
, new_name
);
3426 new_name
= make_ssa_name (new_var
, init_stmt
);
3427 gimple_assign_set_lhs (init_stmt
, new_name
);
3428 new_bb
= gsi_insert_on_edge_immediate (pe
, init_stmt
);
3429 gcc_assert (!new_bb
);
3430 if (dump_enabled_p ())
3432 dump_printf_loc (MSG_NOTE
, vect_location
,
3433 "created new init_stmt: ");
3434 dump_gimple_stmt (MSG_NOTE
, TDF_SLIM
, init_stmt
, 0);
3435 dump_printf (MSG_NOTE
, "\n");
3439 CONSTRUCTOR_APPEND_ELT (v
, NULL_TREE
, new_name
);
3441 /* Create a vector from [new_name_0, new_name_1, ..., new_name_nunits-1] */
3443 new_vec
= build_vector_from_ctor (vectype
, v
);
3445 new_vec
= build_constructor (vectype
, v
);
3446 vec_init
= vect_init_vector (iv_phi
, new_vec
, vectype
, NULL
);
3450 /* Create the vector that holds the step of the induction. */
3451 if (nested_in_vect_loop
)
3452 /* iv_loop is nested in the loop to be vectorized. Generate:
3453 vec_step = [S, S, S, S] */
3454 new_name
= step_expr
;
3457 /* iv_loop is the loop to be vectorized. Generate:
3458 vec_step = [VF*S, VF*S, VF*S, VF*S] */
3459 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr
)))
3461 expr
= build_int_cst (integer_type_node
, vf
);
3462 expr
= fold_convert (TREE_TYPE (step_expr
), expr
);
3465 expr
= build_int_cst (TREE_TYPE (step_expr
), vf
);
3466 new_name
= fold_build2 (MULT_EXPR
, TREE_TYPE (step_expr
),
3468 if (TREE_CODE (step_expr
) == SSA_NAME
)
3469 new_name
= vect_init_vector (iv_phi
, new_name
,
3470 TREE_TYPE (step_expr
), NULL
);
3473 t
= unshare_expr (new_name
);
3474 gcc_assert (CONSTANT_CLASS_P (new_name
)
3475 || TREE_CODE (new_name
) == SSA_NAME
);
3476 stepvectype
= get_vectype_for_scalar_type (TREE_TYPE (new_name
));
3477 gcc_assert (stepvectype
);
3478 new_vec
= build_vector_from_val (stepvectype
, t
);
3479 vec_step
= vect_init_vector (iv_phi
, new_vec
, stepvectype
, NULL
);
3482 /* Create the following def-use cycle:
3487 vec_iv = PHI <vec_init, vec_loop>
3491 vec_loop = vec_iv + vec_step; */
3493 /* Create the induction-phi that defines the induction-operand. */
3494 vec_dest
= vect_get_new_vect_var (vectype
, vect_simple_var
, "vec_iv_");
3495 induction_phi
= create_phi_node (vec_dest
, iv_loop
->header
);
3496 set_vinfo_for_stmt (induction_phi
,
3497 new_stmt_vec_info (induction_phi
, loop_vinfo
, NULL
));
3498 induc_def
= PHI_RESULT (induction_phi
);
3500 /* Create the iv update inside the loop */
3501 new_stmt
= gimple_build_assign (vec_dest
, PLUS_EXPR
, induc_def
, vec_step
);
3502 vec_def
= make_ssa_name (vec_dest
, new_stmt
);
3503 gimple_assign_set_lhs (new_stmt
, vec_def
);
3504 gsi_insert_before (&si
, new_stmt
, GSI_SAME_STMT
);
3505 set_vinfo_for_stmt (new_stmt
, new_stmt_vec_info (new_stmt
, loop_vinfo
,
3508 /* Set the arguments of the phi node: */
3509 add_phi_arg (induction_phi
, vec_init
, pe
, UNKNOWN_LOCATION
);
3510 add_phi_arg (induction_phi
, vec_def
, loop_latch_edge (iv_loop
),
3514 /* In case that vectorization factor (VF) is bigger than the number
3515 of elements that we can fit in a vectype (nunits), we have to generate
3516 more than one vector stmt - i.e - we need to "unroll" the
3517 vector stmt by a factor VF/nunits. For more details see documentation
3518 in vectorizable_operation. */
3522 stmt_vec_info prev_stmt_vinfo
;
3523 /* FORNOW. This restriction should be relaxed. */
3524 gcc_assert (!nested_in_vect_loop
);
3526 /* Create the vector that holds the step of the induction. */
3527 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr
)))
3529 expr
= build_int_cst (integer_type_node
, nunits
);
3530 expr
= fold_convert (TREE_TYPE (step_expr
), expr
);
3533 expr
= build_int_cst (TREE_TYPE (step_expr
), nunits
);
3534 new_name
= fold_build2 (MULT_EXPR
, TREE_TYPE (step_expr
),
3536 if (TREE_CODE (step_expr
) == SSA_NAME
)
3537 new_name
= vect_init_vector (iv_phi
, new_name
,
3538 TREE_TYPE (step_expr
), NULL
);
3539 t
= unshare_expr (new_name
);
3540 gcc_assert (CONSTANT_CLASS_P (new_name
)
3541 || TREE_CODE (new_name
) == SSA_NAME
);
3542 new_vec
= build_vector_from_val (stepvectype
, t
);
3543 vec_step
= vect_init_vector (iv_phi
, new_vec
, stepvectype
, NULL
);
3545 vec_def
= induc_def
;
3546 prev_stmt_vinfo
= vinfo_for_stmt (induction_phi
);
3547 for (i
= 1; i
< ncopies
; i
++)
3549 /* vec_i = vec_prev + vec_step */
3550 new_stmt
= gimple_build_assign (vec_dest
, PLUS_EXPR
,
3552 vec_def
= make_ssa_name (vec_dest
, new_stmt
);
3553 gimple_assign_set_lhs (new_stmt
, vec_def
);
3555 gsi_insert_before (&si
, new_stmt
, GSI_SAME_STMT
);
3556 if (!useless_type_conversion_p (resvectype
, vectype
))
3559 = gimple_build_assign
3560 (vect_get_new_vect_var (resvectype
, vect_simple_var
,
3563 build1 (VIEW_CONVERT_EXPR
, resvectype
,
3564 gimple_assign_lhs (new_stmt
)));
3565 gimple_assign_set_lhs (new_stmt
,
3567 (gimple_assign_lhs (new_stmt
), new_stmt
));
3568 gsi_insert_before (&si
, new_stmt
, GSI_SAME_STMT
);
3570 set_vinfo_for_stmt (new_stmt
,
3571 new_stmt_vec_info (new_stmt
, loop_vinfo
, NULL
));
3572 STMT_VINFO_RELATED_STMT (prev_stmt_vinfo
) = new_stmt
;
3573 prev_stmt_vinfo
= vinfo_for_stmt (new_stmt
);
3577 if (nested_in_vect_loop
)
3579 /* Find the loop-closed exit-phi of the induction, and record
3580 the final vector of induction results: */
3582 FOR_EACH_IMM_USE_FAST (use_p
, imm_iter
, loop_arg
)
3584 gimple use_stmt
= USE_STMT (use_p
);
3585 if (is_gimple_debug (use_stmt
))
3588 if (!flow_bb_inside_loop_p (iv_loop
, gimple_bb (use_stmt
)))
3590 exit_phi
= use_stmt
;
3596 stmt_vec_info stmt_vinfo
= vinfo_for_stmt (exit_phi
);
3597 /* FORNOW. Currently not supporting the case that an inner-loop induction
3598 is not used in the outer-loop (i.e. only outside the outer-loop). */
3599 gcc_assert (STMT_VINFO_RELEVANT_P (stmt_vinfo
)
3600 && !STMT_VINFO_LIVE_P (stmt_vinfo
));
3602 STMT_VINFO_VEC_STMT (stmt_vinfo
) = new_stmt
;
3603 if (dump_enabled_p ())
3605 dump_printf_loc (MSG_NOTE
, vect_location
,
3606 "vector of inductions after inner-loop:");
3607 dump_gimple_stmt (MSG_NOTE
, TDF_SLIM
, new_stmt
, 0);
3608 dump_printf (MSG_NOTE
, "\n");
3614 if (dump_enabled_p ())
3616 dump_printf_loc (MSG_NOTE
, vect_location
,
3617 "transform induction: created def-use cycle: ");
3618 dump_gimple_stmt (MSG_NOTE
, TDF_SLIM
, induction_phi
, 0);
3619 dump_printf (MSG_NOTE
, "\n");
3620 dump_gimple_stmt (MSG_NOTE
, TDF_SLIM
,
3621 SSA_NAME_DEF_STMT (vec_def
), 0);
3622 dump_printf (MSG_NOTE
, "\n");
3625 STMT_VINFO_VEC_STMT (phi_info
) = induction_phi
;
3626 if (!useless_type_conversion_p (resvectype
, vectype
))
3628 new_stmt
= gimple_build_assign (vect_get_new_vect_var (resvectype
,
3632 build1 (VIEW_CONVERT_EXPR
, resvectype
,
3634 induc_def
= make_ssa_name (gimple_assign_lhs (new_stmt
), new_stmt
);
3635 gimple_assign_set_lhs (new_stmt
, induc_def
);
3636 si
= gsi_after_labels (bb
);
3637 gsi_insert_before (&si
, new_stmt
, GSI_SAME_STMT
);
3638 set_vinfo_for_stmt (new_stmt
,
3639 new_stmt_vec_info (new_stmt
, loop_vinfo
, NULL
));
3640 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (new_stmt
))
3641 = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (induction_phi
));
3648 /* Function get_initial_def_for_reduction
3651 STMT - a stmt that performs a reduction operation in the loop.
3652 INIT_VAL - the initial value of the reduction variable
3655 ADJUSTMENT_DEF - a tree that holds a value to be added to the final result
3656 of the reduction (used for adjusting the epilog - see below).
3657 Return a vector variable, initialized according to the operation that STMT
3658 performs. This vector will be used as the initial value of the
3659 vector of partial results.
3661 Option1 (adjust in epilog): Initialize the vector as follows:
3662 add/bit or/xor: [0,0,...,0,0]
3663 mult/bit and: [1,1,...,1,1]
3664 min/max/cond_expr: [init_val,init_val,..,init_val,init_val]
3665 and when necessary (e.g. add/mult case) let the caller know
3666 that it needs to adjust the result by init_val.
3668 Option2: Initialize the vector as follows:
3669 add/bit or/xor: [init_val,0,0,...,0]
3670 mult/bit and: [init_val,1,1,...,1]
3671 min/max/cond_expr: [init_val,init_val,...,init_val]
3672 and no adjustments are needed.
3674 For example, for the following code:
3680 STMT is 's = s + a[i]', and the reduction variable is 's'.
3681 For a vector of 4 units, we want to return either [0,0,0,init_val],
3682 or [0,0,0,0] and let the caller know that it needs to adjust
3683 the result at the end by 'init_val'.
3685 FORNOW, we are using the 'adjust in epilog' scheme, because this way the
3686 initialization vector is simpler (same element in all entries), if
3687 ADJUSTMENT_DEF is not NULL, and Option2 otherwise.
3689 A cost model should help decide between these two schemes. */
3692 get_initial_def_for_reduction (gimple stmt
, tree init_val
,
3693 tree
*adjustment_def
)
3695 stmt_vec_info stmt_vinfo
= vinfo_for_stmt (stmt
);
3696 loop_vec_info loop_vinfo
= STMT_VINFO_LOOP_VINFO (stmt_vinfo
);
3697 struct loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
3698 tree scalar_type
= TREE_TYPE (init_val
);
3699 tree vectype
= get_vectype_for_scalar_type (scalar_type
);
3701 enum tree_code code
= gimple_assign_rhs_code (stmt
);
3706 bool nested_in_vect_loop
= false;
3708 REAL_VALUE_TYPE real_init_val
= dconst0
;
3709 int int_init_val
= 0;
3710 gimple def_stmt
= NULL
;
3712 gcc_assert (vectype
);
3713 nunits
= TYPE_VECTOR_SUBPARTS (vectype
);
3715 gcc_assert (POINTER_TYPE_P (scalar_type
) || INTEGRAL_TYPE_P (scalar_type
)
3716 || SCALAR_FLOAT_TYPE_P (scalar_type
));
3718 if (nested_in_vect_loop_p (loop
, stmt
))
3719 nested_in_vect_loop
= true;
3721 gcc_assert (loop
== (gimple_bb (stmt
))->loop_father
);
3723 /* In case of double reduction we only create a vector variable to be put
3724 in the reduction phi node. The actual statement creation is done in
3725 vect_create_epilog_for_reduction. */
3726 if (adjustment_def
&& nested_in_vect_loop
3727 && TREE_CODE (init_val
) == SSA_NAME
3728 && (def_stmt
= SSA_NAME_DEF_STMT (init_val
))
3729 && gimple_code (def_stmt
) == GIMPLE_PHI
3730 && flow_bb_inside_loop_p (loop
, gimple_bb (def_stmt
))
3731 && vinfo_for_stmt (def_stmt
)
3732 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt
))
3733 == vect_double_reduction_def
)
3735 *adjustment_def
= NULL
;
3736 return vect_create_destination_var (init_val
, vectype
);
3739 if (TREE_CONSTANT (init_val
))
3741 if (SCALAR_FLOAT_TYPE_P (scalar_type
))
3742 init_value
= build_real (scalar_type
, TREE_REAL_CST (init_val
));
3744 init_value
= build_int_cst (scalar_type
, TREE_INT_CST_LOW (init_val
));
3747 init_value
= init_val
;
3751 case WIDEN_SUM_EXPR
:
3760 /* ADJUSMENT_DEF is NULL when called from
3761 vect_create_epilog_for_reduction to vectorize double reduction. */
3764 if (nested_in_vect_loop
)
3765 *adjustment_def
= vect_get_vec_def_for_operand (init_val
, stmt
,
3768 *adjustment_def
= init_val
;
3771 if (code
== MULT_EXPR
)
3773 real_init_val
= dconst1
;
3777 if (code
== BIT_AND_EXPR
)
3780 if (SCALAR_FLOAT_TYPE_P (scalar_type
))
3781 def_for_init
= build_real (scalar_type
, real_init_val
);
3783 def_for_init
= build_int_cst (scalar_type
, int_init_val
);
3785 /* Create a vector of '0' or '1' except the first element. */
3786 elts
= XALLOCAVEC (tree
, nunits
);
3787 for (i
= nunits
- 2; i
>= 0; --i
)
3788 elts
[i
+ 1] = def_for_init
;
3790 /* Option1: the first element is '0' or '1' as well. */
3793 elts
[0] = def_for_init
;
3794 init_def
= build_vector (vectype
, elts
);
3798 /* Option2: the first element is INIT_VAL. */
3800 if (TREE_CONSTANT (init_val
))
3801 init_def
= build_vector (vectype
, elts
);
3804 vec
<constructor_elt
, va_gc
> *v
;
3805 vec_alloc (v
, nunits
);
3806 CONSTRUCTOR_APPEND_ELT (v
, NULL_TREE
, init_val
);
3807 for (i
= 1; i
< nunits
; ++i
)
3808 CONSTRUCTOR_APPEND_ELT (v
, NULL_TREE
, elts
[i
]);
3809 init_def
= build_constructor (vectype
, v
);
3819 *adjustment_def
= NULL_TREE
;
3820 init_def
= vect_get_vec_def_for_operand (init_val
, stmt
, NULL
);
3824 init_def
= build_vector_from_val (vectype
, init_value
);
3834 /* Function vect_create_epilog_for_reduction
3836 Create code at the loop-epilog to finalize the result of a reduction
3839 VECT_DEFS is list of vector of partial results, i.e., the lhs's of vector
3840 reduction statements.
3841 STMT is the scalar reduction stmt that is being vectorized.
3842 NCOPIES is > 1 in case the vectorization factor (VF) is bigger than the
3843 number of elements that we can fit in a vectype (nunits). In this case
3844 we have to generate more than one vector stmt - i.e - we need to "unroll"
3845 the vector stmt by a factor VF/nunits. For more details see documentation
3846 in vectorizable_operation.
3847 REDUC_CODE is the tree-code for the epilog reduction.
3848 REDUCTION_PHIS is a list of the phi-nodes that carry the reduction
3850 REDUC_INDEX is the index of the operand in the right hand side of the
3851 statement that is defined by REDUCTION_PHI.
3852 DOUBLE_REDUC is TRUE if double reduction phi nodes should be handled.
3853 SLP_NODE is an SLP node containing a group of reduction statements. The
3854 first one in this group is STMT.
3857 1. Creates the reduction def-use cycles: sets the arguments for
3859 The loop-entry argument is the vectorized initial-value of the reduction.
3860 The loop-latch argument is taken from VECT_DEFS - the vector of partial
3862 2. "Reduces" each vector of partial results VECT_DEFS into a single result,
3863 by applying the operation specified by REDUC_CODE if available, or by
3864 other means (whole-vector shifts or a scalar loop).
3865 The function also creates a new phi node at the loop exit to preserve
3866 loop-closed form, as illustrated below.
3868 The flow at the entry to this function:
3871 vec_def = phi <null, null> # REDUCTION_PHI
3872 VECT_DEF = vector_stmt # vectorized form of STMT
3873 s_loop = scalar_stmt # (scalar) STMT
3875 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
3879 The above is transformed by this function into:
3882 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
3883 VECT_DEF = vector_stmt # vectorized form of STMT
3884 s_loop = scalar_stmt # (scalar) STMT
3886 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
3887 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
3888 v_out2 = reduce <v_out1>
3889 s_out3 = extract_field <v_out2, 0>
3890 s_out4 = adjust_result <s_out3>
3896 vect_create_epilog_for_reduction (vec
<tree
> vect_defs
, gimple stmt
,
3897 int ncopies
, enum tree_code reduc_code
,
3898 vec
<gimple
> reduction_phis
,
3899 int reduc_index
, bool double_reduc
,
3902 stmt_vec_info stmt_info
= vinfo_for_stmt (stmt
);
3903 stmt_vec_info prev_phi_info
;
3906 loop_vec_info loop_vinfo
= STMT_VINFO_LOOP_VINFO (stmt_info
);
3907 struct loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
), *outer_loop
= NULL
;
3908 basic_block exit_bb
;
3911 gimple new_phi
= NULL
, phi
;
3912 gimple_stmt_iterator exit_gsi
;
3914 tree new_temp
= NULL_TREE
, new_dest
, new_name
, new_scalar_dest
;
3915 gimple epilog_stmt
= NULL
;
3916 enum tree_code code
= gimple_assign_rhs_code (stmt
);
3919 tree adjustment_def
= NULL
;
3920 tree vec_initial_def
= NULL
;
3921 tree reduction_op
, expr
, def
;
3922 tree orig_name
, scalar_result
;
3923 imm_use_iterator imm_iter
, phi_imm_iter
;
3924 use_operand_p use_p
, phi_use_p
;
3925 gimple use_stmt
, orig_stmt
, reduction_phi
= NULL
;
3926 bool nested_in_vect_loop
= false;
3927 auto_vec
<gimple
> new_phis
;
3928 auto_vec
<gimple
> inner_phis
;
3929 enum vect_def_type dt
= vect_unknown_def_type
;
3931 auto_vec
<tree
> scalar_results
;
3932 unsigned int group_size
= 1, k
, ratio
;
3933 auto_vec
<tree
> vec_initial_defs
;
3934 auto_vec
<gimple
> phis
;
3935 bool slp_reduc
= false;
3936 tree new_phi_result
;
3937 gimple inner_phi
= NULL
;
3940 group_size
= SLP_TREE_SCALAR_STMTS (slp_node
).length ();
3942 if (nested_in_vect_loop_p (loop
, stmt
))
3946 nested_in_vect_loop
= true;
3947 gcc_assert (!slp_node
);
3950 switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt
)))
3952 case GIMPLE_SINGLE_RHS
:
3953 gcc_assert (TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt
))
3955 reduction_op
= TREE_OPERAND (gimple_assign_rhs1 (stmt
), reduc_index
);
3957 case GIMPLE_UNARY_RHS
:
3958 reduction_op
= gimple_assign_rhs1 (stmt
);
3960 case GIMPLE_BINARY_RHS
:
3961 reduction_op
= reduc_index
?
3962 gimple_assign_rhs2 (stmt
) : gimple_assign_rhs1 (stmt
);
3964 case GIMPLE_TERNARY_RHS
:
3965 reduction_op
= gimple_op (stmt
, reduc_index
+ 1);
3971 vectype
= get_vectype_for_scalar_type (TREE_TYPE (reduction_op
));
3972 gcc_assert (vectype
);
3973 mode
= TYPE_MODE (vectype
);
3975 /* 1. Create the reduction def-use cycle:
3976 Set the arguments of REDUCTION_PHIS, i.e., transform
3979 vec_def = phi <null, null> # REDUCTION_PHI
3980 VECT_DEF = vector_stmt # vectorized form of STMT
3986 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
3987 VECT_DEF = vector_stmt # vectorized form of STMT
3990 (in case of SLP, do it for all the phis). */
3992 /* Get the loop-entry arguments. */
3994 vect_get_vec_defs (reduction_op
, NULL_TREE
, stmt
, &vec_initial_defs
,
3995 NULL
, slp_node
, reduc_index
);
3998 vec_initial_defs
.create (1);
3999 /* For the case of reduction, vect_get_vec_def_for_operand returns
4000 the scalar def before the loop, that defines the initial value
4001 of the reduction variable. */
4002 vec_initial_def
= vect_get_vec_def_for_operand (reduction_op
, stmt
,
4004 vec_initial_defs
.quick_push (vec_initial_def
);
4007 /* Set phi nodes arguments. */
4008 FOR_EACH_VEC_ELT (reduction_phis
, i
, phi
)
4010 tree vec_init_def
, def
;
4012 vec_init_def
= force_gimple_operand (vec_initial_defs
[i
], &stmts
,
4014 gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop
), stmts
);
4016 for (j
= 0; j
< ncopies
; j
++)
4018 /* Set the loop-entry arg of the reduction-phi. */
4019 add_phi_arg (as_a
<gphi
*> (phi
), vec_init_def
,
4020 loop_preheader_edge (loop
), UNKNOWN_LOCATION
);
4022 /* Set the loop-latch arg for the reduction-phi. */
4024 def
= vect_get_vec_def_for_stmt_copy (vect_unknown_def_type
, def
);
4026 add_phi_arg (as_a
<gphi
*> (phi
), def
, loop_latch_edge (loop
),
4029 if (dump_enabled_p ())
4031 dump_printf_loc (MSG_NOTE
, vect_location
,
4032 "transform reduction: created def-use cycle: ");
4033 dump_gimple_stmt (MSG_NOTE
, TDF_SLIM
, phi
, 0);
4034 dump_printf (MSG_NOTE
, "\n");
4035 dump_gimple_stmt (MSG_NOTE
, TDF_SLIM
, SSA_NAME_DEF_STMT (def
), 0);
4036 dump_printf (MSG_NOTE
, "\n");
4039 phi
= STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi
));
4043 /* 2. Create epilog code.
4044 The reduction epilog code operates across the elements of the vector
4045 of partial results computed by the vectorized loop.
4046 The reduction epilog code consists of:
4048 step 1: compute the scalar result in a vector (v_out2)
4049 step 2: extract the scalar result (s_out3) from the vector (v_out2)
4050 step 3: adjust the scalar result (s_out3) if needed.
4052 Step 1 can be accomplished using one the following three schemes:
4053 (scheme 1) using reduc_code, if available.
4054 (scheme 2) using whole-vector shifts, if available.
4055 (scheme 3) using a scalar loop. In this case steps 1+2 above are
4058 The overall epilog code looks like this:
4060 s_out0 = phi <s_loop> # original EXIT_PHI
4061 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
4062 v_out2 = reduce <v_out1> # step 1
4063 s_out3 = extract_field <v_out2, 0> # step 2
4064 s_out4 = adjust_result <s_out3> # step 3
4066 (step 3 is optional, and steps 1 and 2 may be combined).
4067 Lastly, the uses of s_out0 are replaced by s_out4. */
4070 /* 2.1 Create new loop-exit-phis to preserve loop-closed form:
4071 v_out1 = phi <VECT_DEF>
4072 Store them in NEW_PHIS. */
4074 exit_bb
= single_exit (loop
)->dest
;
4075 prev_phi_info
= NULL
;
4076 new_phis
.create (vect_defs
.length ());
4077 FOR_EACH_VEC_ELT (vect_defs
, i
, def
)
4079 for (j
= 0; j
< ncopies
; j
++)
4081 tree new_def
= copy_ssa_name (def
);
4082 phi
= create_phi_node (new_def
, exit_bb
);
4083 set_vinfo_for_stmt (phi
, new_stmt_vec_info (phi
, loop_vinfo
, NULL
));
4085 new_phis
.quick_push (phi
);
4088 def
= vect_get_vec_def_for_stmt_copy (dt
, def
);
4089 STMT_VINFO_RELATED_STMT (prev_phi_info
) = phi
;
4092 SET_PHI_ARG_DEF (phi
, single_exit (loop
)->dest_idx
, def
);
4093 prev_phi_info
= vinfo_for_stmt (phi
);
4097 /* The epilogue is created for the outer-loop, i.e., for the loop being
4098 vectorized. Create exit phis for the outer loop. */
4102 exit_bb
= single_exit (loop
)->dest
;
4103 inner_phis
.create (vect_defs
.length ());
4104 FOR_EACH_VEC_ELT (new_phis
, i
, phi
)
4106 tree new_result
= copy_ssa_name (PHI_RESULT (phi
));
4107 gphi
*outer_phi
= create_phi_node (new_result
, exit_bb
);
4108 SET_PHI_ARG_DEF (outer_phi
, single_exit (loop
)->dest_idx
,
4110 set_vinfo_for_stmt (outer_phi
, new_stmt_vec_info (outer_phi
,
4112 inner_phis
.quick_push (phi
);
4113 new_phis
[i
] = outer_phi
;
4114 prev_phi_info
= vinfo_for_stmt (outer_phi
);
4115 while (STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi
)))
4117 phi
= STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi
));
4118 new_result
= copy_ssa_name (PHI_RESULT (phi
));
4119 outer_phi
= create_phi_node (new_result
, exit_bb
);
4120 SET_PHI_ARG_DEF (outer_phi
, single_exit (loop
)->dest_idx
,
4122 set_vinfo_for_stmt (outer_phi
, new_stmt_vec_info (outer_phi
,
4124 STMT_VINFO_RELATED_STMT (prev_phi_info
) = outer_phi
;
4125 prev_phi_info
= vinfo_for_stmt (outer_phi
);
4130 exit_gsi
= gsi_after_labels (exit_bb
);
4132 /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
4133 (i.e. when reduc_code is not available) and in the final adjustment
4134 code (if needed). Also get the original scalar reduction variable as
4135 defined in the loop. In case STMT is a "pattern-stmt" (i.e. - it
4136 represents a reduction pattern), the tree-code and scalar-def are
4137 taken from the original stmt that the pattern-stmt (STMT) replaces.
4138 Otherwise (it is a regular reduction) - the tree-code and scalar-def
4139 are taken from STMT. */
4141 orig_stmt
= STMT_VINFO_RELATED_STMT (stmt_info
);
4144 /* Regular reduction */
4149 /* Reduction pattern */
4150 stmt_vec_info stmt_vinfo
= vinfo_for_stmt (orig_stmt
);
4151 gcc_assert (STMT_VINFO_IN_PATTERN_P (stmt_vinfo
));
4152 gcc_assert (STMT_VINFO_RELATED_STMT (stmt_vinfo
) == stmt
);
4155 code
= gimple_assign_rhs_code (orig_stmt
);
4156 /* For MINUS_EXPR the initial vector is [init_val,0,...,0], therefore,
4157 partial results are added and not subtracted. */
4158 if (code
== MINUS_EXPR
)
4161 scalar_dest
= gimple_assign_lhs (orig_stmt
);
4162 scalar_type
= TREE_TYPE (scalar_dest
);
4163 scalar_results
.create (group_size
);
4164 new_scalar_dest
= vect_create_destination_var (scalar_dest
, NULL
);
4165 bitsize
= TYPE_SIZE (scalar_type
);
4167 /* In case this is a reduction in an inner-loop while vectorizing an outer
4168 loop - we don't need to extract a single scalar result at the end of the
4169 inner-loop (unless it is double reduction, i.e., the use of reduction is
4170 outside the outer-loop). The final vector of partial results will be used
4171 in the vectorized outer-loop, or reduced to a scalar result at the end of
4173 if (nested_in_vect_loop
&& !double_reduc
)
4174 goto vect_finalize_reduction
;
4176 /* SLP reduction without reduction chain, e.g.,
4180 b2 = operation (b1) */
4181 slp_reduc
= (slp_node
&& !GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt
)));
4183 /* In case of reduction chain, e.g.,
4186 a3 = operation (a2),
4188 we may end up with more than one vector result. Here we reduce them to
4190 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt
)))
4192 tree first_vect
= PHI_RESULT (new_phis
[0]);
4194 gassign
*new_vec_stmt
= NULL
;
4196 vec_dest
= vect_create_destination_var (scalar_dest
, vectype
);
4197 for (k
= 1; k
< new_phis
.length (); k
++)
4199 gimple next_phi
= new_phis
[k
];
4200 tree second_vect
= PHI_RESULT (next_phi
);
4202 tmp
= build2 (code
, vectype
, first_vect
, second_vect
);
4203 new_vec_stmt
= gimple_build_assign (vec_dest
, tmp
);
4204 first_vect
= make_ssa_name (vec_dest
, new_vec_stmt
);
4205 gimple_assign_set_lhs (new_vec_stmt
, first_vect
);
4206 gsi_insert_before (&exit_gsi
, new_vec_stmt
, GSI_SAME_STMT
);
4209 new_phi_result
= first_vect
;
4212 new_phis
.truncate (0);
4213 new_phis
.safe_push (new_vec_stmt
);
4217 new_phi_result
= PHI_RESULT (new_phis
[0]);
4219 /* 2.3 Create the reduction code, using one of the three schemes described
4220 above. In SLP we simply need to extract all the elements from the
4221 vector (without reducing them), so we use scalar shifts. */
4222 if (reduc_code
!= ERROR_MARK
&& !slp_reduc
)
4227 /*** Case 1: Create:
4228 v_out2 = reduc_expr <v_out1> */
4230 if (dump_enabled_p ())
4231 dump_printf_loc (MSG_NOTE
, vect_location
,
4232 "Reduce using direct vector reduction.\n");
4234 vec_elem_type
= TREE_TYPE (TREE_TYPE (new_phi_result
));
4235 if (!useless_type_conversion_p (scalar_type
, vec_elem_type
))
4238 vect_create_destination_var (scalar_dest
, vec_elem_type
);
4239 tmp
= build1 (reduc_code
, vec_elem_type
, new_phi_result
);
4240 epilog_stmt
= gimple_build_assign (tmp_dest
, tmp
);
4241 new_temp
= make_ssa_name (tmp_dest
, epilog_stmt
);
4242 gimple_assign_set_lhs (epilog_stmt
, new_temp
);
4243 gsi_insert_before (&exit_gsi
, epilog_stmt
, GSI_SAME_STMT
);
4245 tmp
= build1 (NOP_EXPR
, scalar_type
, new_temp
);
4248 tmp
= build1 (reduc_code
, scalar_type
, new_phi_result
);
4249 epilog_stmt
= gimple_build_assign (new_scalar_dest
, tmp
);
4250 new_temp
= make_ssa_name (new_scalar_dest
, epilog_stmt
);
4251 gimple_assign_set_lhs (epilog_stmt
, new_temp
);
4252 gsi_insert_before (&exit_gsi
, epilog_stmt
, GSI_SAME_STMT
);
4253 scalar_results
.safe_push (new_temp
);
4257 bool reduce_with_shift
= have_whole_vector_shift (mode
);
4258 int element_bitsize
= tree_to_uhwi (bitsize
);
4259 int vec_size_in_bits
= tree_to_uhwi (TYPE_SIZE (vectype
));
4262 /* Regardless of whether we have a whole vector shift, if we're
4263 emulating the operation via tree-vect-generic, we don't want
4264 to use it. Only the first round of the reduction is likely
4265 to still be profitable via emulation. */
4266 /* ??? It might be better to emit a reduction tree code here, so that
4267 tree-vect-generic can expand the first round via bit tricks. */
4268 if (!VECTOR_MODE_P (mode
))
4269 reduce_with_shift
= false;
4272 optab optab
= optab_for_tree_code (code
, vectype
, optab_default
);
4273 if (optab_handler (optab
, mode
) == CODE_FOR_nothing
)
4274 reduce_with_shift
= false;
4277 if (reduce_with_shift
&& !slp_reduc
)
4279 int nelements
= vec_size_in_bits
/ element_bitsize
;
4280 unsigned char *sel
= XALLOCAVEC (unsigned char, nelements
);
4284 tree zero_vec
= build_zero_cst (vectype
);
4285 /*** Case 2: Create:
4286 for (offset = nelements/2; offset >= 1; offset/=2)
4288 Create: va' = vec_shift <va, offset>
4289 Create: va = vop <va, va'>
4294 if (dump_enabled_p ())
4295 dump_printf_loc (MSG_NOTE
, vect_location
,
4296 "Reduce using vector shifts\n");
4298 vec_dest
= vect_create_destination_var (scalar_dest
, vectype
);
4299 new_temp
= new_phi_result
;
4300 for (elt_offset
= nelements
/ 2;
4304 calc_vec_perm_mask_for_shift (mode
, elt_offset
, sel
);
4305 tree mask
= vect_gen_perm_mask_any (vectype
, sel
);
4306 epilog_stmt
= gimple_build_assign (vec_dest
, VEC_PERM_EXPR
,
4307 new_temp
, zero_vec
, mask
);
4308 new_name
= make_ssa_name (vec_dest
, epilog_stmt
);
4309 gimple_assign_set_lhs (epilog_stmt
, new_name
);
4310 gsi_insert_before (&exit_gsi
, epilog_stmt
, GSI_SAME_STMT
);
4312 epilog_stmt
= gimple_build_assign (vec_dest
, code
, new_name
,
4314 new_temp
= make_ssa_name (vec_dest
, epilog_stmt
);
4315 gimple_assign_set_lhs (epilog_stmt
, new_temp
);
4316 gsi_insert_before (&exit_gsi
, epilog_stmt
, GSI_SAME_STMT
);
4319 /* 2.4 Extract the final scalar result. Create:
4320 s_out3 = extract_field <v_out2, bitpos> */
4322 if (dump_enabled_p ())
4323 dump_printf_loc (MSG_NOTE
, vect_location
,
4324 "extract scalar result\n");
4326 rhs
= build3 (BIT_FIELD_REF
, scalar_type
, new_temp
,
4327 bitsize
, bitsize_zero_node
);
4328 epilog_stmt
= gimple_build_assign (new_scalar_dest
, rhs
);
4329 new_temp
= make_ssa_name (new_scalar_dest
, epilog_stmt
);
4330 gimple_assign_set_lhs (epilog_stmt
, new_temp
);
4331 gsi_insert_before (&exit_gsi
, epilog_stmt
, GSI_SAME_STMT
);
4332 scalar_results
.safe_push (new_temp
);
4336 /*** Case 3: Create:
4337 s = extract_field <v_out2, 0>
4338 for (offset = element_size;
4339 offset < vector_size;
4340 offset += element_size;)
4342 Create: s' = extract_field <v_out2, offset>
4343 Create: s = op <s, s'> // For non SLP cases
4346 if (dump_enabled_p ())
4347 dump_printf_loc (MSG_NOTE
, vect_location
,
4348 "Reduce using scalar code.\n");
4350 vec_size_in_bits
= tree_to_uhwi (TYPE_SIZE (vectype
));
4351 FOR_EACH_VEC_ELT (new_phis
, i
, new_phi
)
4354 if (gimple_code (new_phi
) == GIMPLE_PHI
)
4355 vec_temp
= PHI_RESULT (new_phi
);
4357 vec_temp
= gimple_assign_lhs (new_phi
);
4358 tree rhs
= build3 (BIT_FIELD_REF
, scalar_type
, vec_temp
, bitsize
,
4360 epilog_stmt
= gimple_build_assign (new_scalar_dest
, rhs
);
4361 new_temp
= make_ssa_name (new_scalar_dest
, epilog_stmt
);
4362 gimple_assign_set_lhs (epilog_stmt
, new_temp
);
4363 gsi_insert_before (&exit_gsi
, epilog_stmt
, GSI_SAME_STMT
);
4365 /* In SLP we don't need to apply reduction operation, so we just
4366 collect s' values in SCALAR_RESULTS. */
4368 scalar_results
.safe_push (new_temp
);
4370 for (bit_offset
= element_bitsize
;
4371 bit_offset
< vec_size_in_bits
;
4372 bit_offset
+= element_bitsize
)
4374 tree bitpos
= bitsize_int (bit_offset
);
4375 tree rhs
= build3 (BIT_FIELD_REF
, scalar_type
, vec_temp
,
4378 epilog_stmt
= gimple_build_assign (new_scalar_dest
, rhs
);
4379 new_name
= make_ssa_name (new_scalar_dest
, epilog_stmt
);
4380 gimple_assign_set_lhs (epilog_stmt
, new_name
);
4381 gsi_insert_before (&exit_gsi
, epilog_stmt
, GSI_SAME_STMT
);
4385 /* In SLP we don't need to apply reduction operation, so
4386 we just collect s' values in SCALAR_RESULTS. */
4387 new_temp
= new_name
;
4388 scalar_results
.safe_push (new_name
);
4392 epilog_stmt
= gimple_build_assign (new_scalar_dest
, code
,
4393 new_name
, new_temp
);
4394 new_temp
= make_ssa_name (new_scalar_dest
, epilog_stmt
);
4395 gimple_assign_set_lhs (epilog_stmt
, new_temp
);
4396 gsi_insert_before (&exit_gsi
, epilog_stmt
, GSI_SAME_STMT
);
4401 /* The only case where we need to reduce scalar results in SLP, is
4402 unrolling. If the size of SCALAR_RESULTS is greater than
4403 GROUP_SIZE, we reduce them combining elements modulo
4407 tree res
, first_res
, new_res
;
4410 /* Reduce multiple scalar results in case of SLP unrolling. */
4411 for (j
= group_size
; scalar_results
.iterate (j
, &res
);
4414 first_res
= scalar_results
[j
% group_size
];
4415 new_stmt
= gimple_build_assign (new_scalar_dest
, code
,
4417 new_res
= make_ssa_name (new_scalar_dest
, new_stmt
);
4418 gimple_assign_set_lhs (new_stmt
, new_res
);
4419 gsi_insert_before (&exit_gsi
, new_stmt
, GSI_SAME_STMT
);
4420 scalar_results
[j
% group_size
] = new_res
;
4424 /* Not SLP - we have one scalar to keep in SCALAR_RESULTS. */
4425 scalar_results
.safe_push (new_temp
);
4429 vect_finalize_reduction
:
4434 /* 2.5 Adjust the final result by the initial value of the reduction
4435 variable. (When such adjustment is not needed, then
4436 'adjustment_def' is zero). For example, if code is PLUS we create:
4437 new_temp = loop_exit_def + adjustment_def */
4441 gcc_assert (!slp_reduc
);
4442 if (nested_in_vect_loop
)
4444 new_phi
= new_phis
[0];
4445 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def
)) == VECTOR_TYPE
);
4446 expr
= build2 (code
, vectype
, PHI_RESULT (new_phi
), adjustment_def
);
4447 new_dest
= vect_create_destination_var (scalar_dest
, vectype
);
4451 new_temp
= scalar_results
[0];
4452 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def
)) != VECTOR_TYPE
);
4453 expr
= build2 (code
, scalar_type
, new_temp
, adjustment_def
);
4454 new_dest
= vect_create_destination_var (scalar_dest
, scalar_type
);
4457 epilog_stmt
= gimple_build_assign (new_dest
, expr
);
4458 new_temp
= make_ssa_name (new_dest
, epilog_stmt
);
4459 gimple_assign_set_lhs (epilog_stmt
, new_temp
);
4460 gsi_insert_before (&exit_gsi
, epilog_stmt
, GSI_SAME_STMT
);
4461 if (nested_in_vect_loop
)
4463 set_vinfo_for_stmt (epilog_stmt
,
4464 new_stmt_vec_info (epilog_stmt
, loop_vinfo
,
4466 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (epilog_stmt
)) =
4467 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (new_phi
));
4470 scalar_results
.quick_push (new_temp
);
4472 scalar_results
[0] = new_temp
;
4475 scalar_results
[0] = new_temp
;
4477 new_phis
[0] = epilog_stmt
;
4480 /* 2.6 Handle the loop-exit phis. Replace the uses of scalar loop-exit
4481 phis with new adjusted scalar results, i.e., replace use <s_out0>
4486 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
4487 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
4488 v_out2 = reduce <v_out1>
4489 s_out3 = extract_field <v_out2, 0>
4490 s_out4 = adjust_result <s_out3>
4497 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
4498 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
4499 v_out2 = reduce <v_out1>
4500 s_out3 = extract_field <v_out2, 0>
4501 s_out4 = adjust_result <s_out3>
4506 /* In SLP reduction chain we reduce vector results into one vector if
4507 necessary, hence we set here GROUP_SIZE to 1. SCALAR_DEST is the LHS of
4508 the last stmt in the reduction chain, since we are looking for the loop
4510 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt
)))
4512 scalar_dest
= gimple_assign_lhs (
4513 SLP_TREE_SCALAR_STMTS (slp_node
)[group_size
- 1]);
4517 /* In SLP we may have several statements in NEW_PHIS and REDUCTION_PHIS (in
4518 case that GROUP_SIZE is greater than vectorization factor). Therefore, we
4519 need to match SCALAR_RESULTS with corresponding statements. The first
4520 (GROUP_SIZE / number of new vector stmts) scalar results correspond to
4521 the first vector stmt, etc.
4522 (RATIO is equal to (GROUP_SIZE / number of new vector stmts)). */
4523 if (group_size
> new_phis
.length ())
4525 ratio
= group_size
/ new_phis
.length ();
4526 gcc_assert (!(group_size
% new_phis
.length ()));
4531 for (k
= 0; k
< group_size
; k
++)
4535 epilog_stmt
= new_phis
[k
/ ratio
];
4536 reduction_phi
= reduction_phis
[k
/ ratio
];
4538 inner_phi
= inner_phis
[k
/ ratio
];
4543 gimple current_stmt
= SLP_TREE_SCALAR_STMTS (slp_node
)[k
];
4545 orig_stmt
= STMT_VINFO_RELATED_STMT (vinfo_for_stmt (current_stmt
));
4546 /* SLP statements can't participate in patterns. */
4547 gcc_assert (!orig_stmt
);
4548 scalar_dest
= gimple_assign_lhs (current_stmt
);
4552 /* Find the loop-closed-use at the loop exit of the original scalar
4553 result. (The reduction result is expected to have two immediate uses -
4554 one at the latch block, and one at the loop exit). */
4555 FOR_EACH_IMM_USE_FAST (use_p
, imm_iter
, scalar_dest
)
4556 if (!flow_bb_inside_loop_p (loop
, gimple_bb (USE_STMT (use_p
)))
4557 && !is_gimple_debug (USE_STMT (use_p
)))
4558 phis
.safe_push (USE_STMT (use_p
));
4560 /* While we expect to have found an exit_phi because of loop-closed-ssa
4561 form we can end up without one if the scalar cycle is dead. */
4563 FOR_EACH_VEC_ELT (phis
, i
, exit_phi
)
4567 stmt_vec_info exit_phi_vinfo
= vinfo_for_stmt (exit_phi
);
4570 /* FORNOW. Currently not supporting the case that an inner-loop
4571 reduction is not used in the outer-loop (but only outside the
4572 outer-loop), unless it is double reduction. */
4573 gcc_assert ((STMT_VINFO_RELEVANT_P (exit_phi_vinfo
)
4574 && !STMT_VINFO_LIVE_P (exit_phi_vinfo
))
4577 STMT_VINFO_VEC_STMT (exit_phi_vinfo
) = epilog_stmt
;
4579 || STMT_VINFO_DEF_TYPE (exit_phi_vinfo
)
4580 != vect_double_reduction_def
)
4583 /* Handle double reduction:
4585 stmt1: s1 = phi <s0, s2> - double reduction phi (outer loop)
4586 stmt2: s3 = phi <s1, s4> - (regular) reduc phi (inner loop)
4587 stmt3: s4 = use (s3) - (regular) reduc stmt (inner loop)
4588 stmt4: s2 = phi <s4> - double reduction stmt (outer loop)
4590 At that point the regular reduction (stmt2 and stmt3) is
4591 already vectorized, as well as the exit phi node, stmt4.
4592 Here we vectorize the phi node of double reduction, stmt1, and
4593 update all relevant statements. */
4595 /* Go through all the uses of s2 to find double reduction phi
4596 node, i.e., stmt1 above. */
4597 orig_name
= PHI_RESULT (exit_phi
);
4598 FOR_EACH_IMM_USE_STMT (use_stmt
, imm_iter
, orig_name
)
4600 stmt_vec_info use_stmt_vinfo
;
4601 stmt_vec_info new_phi_vinfo
;
4602 tree vect_phi_init
, preheader_arg
, vect_phi_res
, init_def
;
4603 basic_block bb
= gimple_bb (use_stmt
);
4606 /* Check that USE_STMT is really double reduction phi
4608 if (gimple_code (use_stmt
) != GIMPLE_PHI
4609 || gimple_phi_num_args (use_stmt
) != 2
4610 || bb
->loop_father
!= outer_loop
)
4612 use_stmt_vinfo
= vinfo_for_stmt (use_stmt
);
4614 || STMT_VINFO_DEF_TYPE (use_stmt_vinfo
)
4615 != vect_double_reduction_def
)
4618 /* Create vector phi node for double reduction:
4619 vs1 = phi <vs0, vs2>
4620 vs1 was created previously in this function by a call to
4621 vect_get_vec_def_for_operand and is stored in
4623 vs2 is defined by INNER_PHI, the vectorized EXIT_PHI;
4624 vs0 is created here. */
4626 /* Create vector phi node. */
4627 vect_phi
= create_phi_node (vec_initial_def
, bb
);
4628 new_phi_vinfo
= new_stmt_vec_info (vect_phi
,
4629 loop_vec_info_for_loop (outer_loop
), NULL
);
4630 set_vinfo_for_stmt (vect_phi
, new_phi_vinfo
);
4632 /* Create vs0 - initial def of the double reduction phi. */
4633 preheader_arg
= PHI_ARG_DEF_FROM_EDGE (use_stmt
,
4634 loop_preheader_edge (outer_loop
));
4635 init_def
= get_initial_def_for_reduction (stmt
,
4636 preheader_arg
, NULL
);
4637 vect_phi_init
= vect_init_vector (use_stmt
, init_def
,
4640 /* Update phi node arguments with vs0 and vs2. */
4641 add_phi_arg (vect_phi
, vect_phi_init
,
4642 loop_preheader_edge (outer_loop
),
4644 add_phi_arg (vect_phi
, PHI_RESULT (inner_phi
),
4645 loop_latch_edge (outer_loop
), UNKNOWN_LOCATION
);
4646 if (dump_enabled_p ())
4648 dump_printf_loc (MSG_NOTE
, vect_location
,
4649 "created double reduction phi node: ");
4650 dump_gimple_stmt (MSG_NOTE
, TDF_SLIM
, vect_phi
, 0);
4651 dump_printf (MSG_NOTE
, "\n");
4654 vect_phi_res
= PHI_RESULT (vect_phi
);
4656 /* Replace the use, i.e., set the correct vs1 in the regular
4657 reduction phi node. FORNOW, NCOPIES is always 1, so the
4658 loop is redundant. */
4659 use
= reduction_phi
;
4660 for (j
= 0; j
< ncopies
; j
++)
4662 edge pr_edge
= loop_preheader_edge (loop
);
4663 SET_PHI_ARG_DEF (use
, pr_edge
->dest_idx
, vect_phi_res
);
4664 use
= STMT_VINFO_RELATED_STMT (vinfo_for_stmt (use
));
4671 if (nested_in_vect_loop
)
4680 /* Find the loop-closed-use at the loop exit of the original scalar
4681 result. (The reduction result is expected to have two immediate uses,
4682 one at the latch block, and one at the loop exit). For double
4683 reductions we are looking for exit phis of the outer loop. */
4684 FOR_EACH_IMM_USE_FAST (use_p
, imm_iter
, scalar_dest
)
4686 if (!flow_bb_inside_loop_p (loop
, gimple_bb (USE_STMT (use_p
))))
4688 if (!is_gimple_debug (USE_STMT (use_p
)))
4689 phis
.safe_push (USE_STMT (use_p
));
4693 if (double_reduc
&& gimple_code (USE_STMT (use_p
)) == GIMPLE_PHI
)
4695 tree phi_res
= PHI_RESULT (USE_STMT (use_p
));
4697 FOR_EACH_IMM_USE_FAST (phi_use_p
, phi_imm_iter
, phi_res
)
4699 if (!flow_bb_inside_loop_p (loop
,
4700 gimple_bb (USE_STMT (phi_use_p
)))
4701 && !is_gimple_debug (USE_STMT (phi_use_p
)))
4702 phis
.safe_push (USE_STMT (phi_use_p
));
4708 FOR_EACH_VEC_ELT (phis
, i
, exit_phi
)
4710 /* Replace the uses: */
4711 orig_name
= PHI_RESULT (exit_phi
);
4712 scalar_result
= scalar_results
[k
];
4713 FOR_EACH_IMM_USE_STMT (use_stmt
, imm_iter
, orig_name
)
4714 FOR_EACH_IMM_USE_ON_STMT (use_p
, imm_iter
)
4715 SET_USE (use_p
, scalar_result
);
4723 /* Function vectorizable_reduction.
4725 Check if STMT performs a reduction operation that can be vectorized.
4726 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
4727 stmt to replace it, put it in VEC_STMT, and insert it at GSI.
4728 Return FALSE if not a vectorizable STMT, TRUE otherwise.
4730 This function also handles reduction idioms (patterns) that have been
4731 recognized in advance during vect_pattern_recog. In this case, STMT may be
4733 X = pattern_expr (arg0, arg1, ..., X)
4734 and it's STMT_VINFO_RELATED_STMT points to the last stmt in the original
4735 sequence that had been detected and replaced by the pattern-stmt (STMT).
4737 In some cases of reduction patterns, the type of the reduction variable X is
4738 different than the type of the other arguments of STMT.
4739 In such cases, the vectype that is used when transforming STMT into a vector
4740 stmt is different than the vectype that is used to determine the
4741 vectorization factor, because it consists of a different number of elements
4742 than the actual number of elements that are being operated upon in parallel.
4744 For example, consider an accumulation of shorts into an int accumulator.
4745 On some targets it's possible to vectorize this pattern operating on 8
4746 shorts at a time (hence, the vectype for purposes of determining the
4747 vectorization factor should be V8HI); on the other hand, the vectype that
4748 is used to create the vector form is actually V4SI (the type of the result).
4750 Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
4751 indicates what is the actual level of parallelism (V8HI in the example), so
4752 that the right vectorization factor would be derived. This vectype
4753 corresponds to the type of arguments to the reduction stmt, and should *NOT*
4754 be used to create the vectorized stmt. The right vectype for the vectorized
4755 stmt is obtained from the type of the result X:
4756 get_vectype_for_scalar_type (TREE_TYPE (X))
4758 This means that, contrary to "regular" reductions (or "regular" stmts in
4759 general), the following equation:
4760 STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (TREE_TYPE (X))
4761 does *NOT* necessarily hold for reduction patterns. */
4764 vectorizable_reduction (gimple stmt
, gimple_stmt_iterator
*gsi
,
4765 gimple
*vec_stmt
, slp_tree slp_node
)
4769 tree loop_vec_def0
= NULL_TREE
, loop_vec_def1
= NULL_TREE
;
4770 stmt_vec_info stmt_info
= vinfo_for_stmt (stmt
);
4771 tree vectype_out
= STMT_VINFO_VECTYPE (stmt_info
);
4772 tree vectype_in
= NULL_TREE
;
4773 loop_vec_info loop_vinfo
= STMT_VINFO_LOOP_VINFO (stmt_info
);
4774 struct loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
4775 enum tree_code code
, orig_code
, epilog_reduc_code
;
4776 machine_mode vec_mode
;
4778 optab optab
, reduc_optab
;
4779 tree new_temp
= NULL_TREE
;
4782 enum vect_def_type dt
;
4783 gphi
*new_phi
= NULL
;
4787 stmt_vec_info orig_stmt_info
;
4788 tree expr
= NULL_TREE
;
4792 stmt_vec_info prev_stmt_info
, prev_phi_info
;
4793 bool single_defuse_cycle
= false;
4794 tree reduc_def
= NULL_TREE
;
4795 gimple new_stmt
= NULL
;
4798 bool nested_cycle
= false, found_nested_cycle_def
= false;
4799 gimple reduc_def_stmt
= NULL
;
4800 /* The default is that the reduction variable is the last in statement. */
4801 int reduc_index
= 2;
4802 bool double_reduc
= false, dummy
;
4804 struct loop
* def_stmt_loop
, *outer_loop
= NULL
;
4806 gimple def_arg_stmt
;
4807 auto_vec
<tree
> vec_oprnds0
;
4808 auto_vec
<tree
> vec_oprnds1
;
4809 auto_vec
<tree
> vect_defs
;
4810 auto_vec
<gimple
> phis
;
4812 tree def0
, def1
, tem
, op0
, op1
= NULL_TREE
;
4814 /* In case of reduction chain we switch to the first stmt in the chain, but
4815 we don't update STMT_INFO, since only the last stmt is marked as reduction
4816 and has reduction properties. */
4817 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt
)))
4818 stmt
= GROUP_FIRST_ELEMENT (stmt_info
);
4820 if (nested_in_vect_loop_p (loop
, stmt
))
4824 nested_cycle
= true;
4827 /* 1. Is vectorizable reduction? */
4828 /* Not supportable if the reduction variable is used in the loop, unless
4829 it's a reduction chain. */
4830 if (STMT_VINFO_RELEVANT (stmt_info
) > vect_used_in_outer
4831 && !GROUP_FIRST_ELEMENT (stmt_info
))
4834 /* Reductions that are not used even in an enclosing outer-loop,
4835 are expected to be "live" (used out of the loop). */
4836 if (STMT_VINFO_RELEVANT (stmt_info
) == vect_unused_in_scope
4837 && !STMT_VINFO_LIVE_P (stmt_info
))
4840 /* Make sure it was already recognized as a reduction computation. */
4841 if (STMT_VINFO_DEF_TYPE (stmt_info
) != vect_reduction_def
4842 && STMT_VINFO_DEF_TYPE (stmt_info
) != vect_nested_cycle
)
4845 /* 2. Has this been recognized as a reduction pattern?
4847 Check if STMT represents a pattern that has been recognized
4848 in earlier analysis stages. For stmts that represent a pattern,
4849 the STMT_VINFO_RELATED_STMT field records the last stmt in
4850 the original sequence that constitutes the pattern. */
4852 orig_stmt
= STMT_VINFO_RELATED_STMT (stmt_info
);
4855 orig_stmt_info
= vinfo_for_stmt (orig_stmt
);
4856 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info
));
4857 gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info
));
4860 /* 3. Check the operands of the operation. The first operands are defined
4861 inside the loop body. The last operand is the reduction variable,
4862 which is defined by the loop-header-phi. */
4864 gcc_assert (is_gimple_assign (stmt
));
4867 switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt
)))
4869 case GIMPLE_SINGLE_RHS
:
4870 op_type
= TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt
));
4871 if (op_type
== ternary_op
)
4873 tree rhs
= gimple_assign_rhs1 (stmt
);
4874 ops
[0] = TREE_OPERAND (rhs
, 0);
4875 ops
[1] = TREE_OPERAND (rhs
, 1);
4876 ops
[2] = TREE_OPERAND (rhs
, 2);
4877 code
= TREE_CODE (rhs
);
4883 case GIMPLE_BINARY_RHS
:
4884 code
= gimple_assign_rhs_code (stmt
);
4885 op_type
= TREE_CODE_LENGTH (code
);
4886 gcc_assert (op_type
== binary_op
);
4887 ops
[0] = gimple_assign_rhs1 (stmt
);
4888 ops
[1] = gimple_assign_rhs2 (stmt
);
4891 case GIMPLE_TERNARY_RHS
:
4892 code
= gimple_assign_rhs_code (stmt
);
4893 op_type
= TREE_CODE_LENGTH (code
);
4894 gcc_assert (op_type
== ternary_op
);
4895 ops
[0] = gimple_assign_rhs1 (stmt
);
4896 ops
[1] = gimple_assign_rhs2 (stmt
);
4897 ops
[2] = gimple_assign_rhs3 (stmt
);
4900 case GIMPLE_UNARY_RHS
:
4907 if (code
== COND_EXPR
&& slp_node
)
4910 scalar_dest
= gimple_assign_lhs (stmt
);
4911 scalar_type
= TREE_TYPE (scalar_dest
);
4912 if (!POINTER_TYPE_P (scalar_type
) && !INTEGRAL_TYPE_P (scalar_type
)
4913 && !SCALAR_FLOAT_TYPE_P (scalar_type
))
4916 /* Do not try to vectorize bit-precision reductions. */
4917 if ((TYPE_PRECISION (scalar_type
)
4918 != GET_MODE_PRECISION (TYPE_MODE (scalar_type
))))
4921 /* All uses but the last are expected to be defined in the loop.
4922 The last use is the reduction variable. In case of nested cycle this
4923 assumption is not true: we use reduc_index to record the index of the
4924 reduction variable. */
4925 for (i
= 0; i
< op_type
- 1; i
++)
4927 /* The condition of COND_EXPR is checked in vectorizable_condition(). */
4928 if (i
== 0 && code
== COND_EXPR
)
4931 is_simple_use
= vect_is_simple_use_1 (ops
[i
], stmt
, loop_vinfo
, NULL
,
4932 &def_stmt
, &def
, &dt
, &tem
);
4935 gcc_assert (is_simple_use
);
4937 if (dt
!= vect_internal_def
4938 && dt
!= vect_external_def
4939 && dt
!= vect_constant_def
4940 && dt
!= vect_induction_def
4941 && !(dt
== vect_nested_cycle
&& nested_cycle
))
4944 if (dt
== vect_nested_cycle
)
4946 found_nested_cycle_def
= true;
4947 reduc_def_stmt
= def_stmt
;
4952 is_simple_use
= vect_is_simple_use_1 (ops
[i
], stmt
, loop_vinfo
, NULL
,
4953 &def_stmt
, &def
, &dt
, &tem
);
4956 gcc_assert (is_simple_use
);
4957 if (!(dt
== vect_reduction_def
4958 || dt
== vect_nested_cycle
4959 || ((dt
== vect_internal_def
|| dt
== vect_external_def
4960 || dt
== vect_constant_def
|| dt
== vect_induction_def
)
4961 && nested_cycle
&& found_nested_cycle_def
)))
4963 /* For pattern recognized stmts, orig_stmt might be a reduction,
4964 but some helper statements for the pattern might not, or
4965 might be COND_EXPRs with reduction uses in the condition. */
4966 gcc_assert (orig_stmt
);
4969 if (!found_nested_cycle_def
)
4970 reduc_def_stmt
= def_stmt
;
4972 gcc_assert (gimple_code (reduc_def_stmt
) == GIMPLE_PHI
);
4974 gcc_assert (orig_stmt
== vect_is_simple_reduction (loop_vinfo
,
4980 gimple tmp
= vect_is_simple_reduction (loop_vinfo
, reduc_def_stmt
,
4981 !nested_cycle
, &dummy
);
4982 /* We changed STMT to be the first stmt in reduction chain, hence we
4983 check that in this case the first element in the chain is STMT. */
4984 gcc_assert (stmt
== tmp
4985 || GROUP_FIRST_ELEMENT (vinfo_for_stmt (tmp
)) == stmt
);
4988 if (STMT_VINFO_LIVE_P (vinfo_for_stmt (reduc_def_stmt
)))
4991 if (slp_node
|| PURE_SLP_STMT (stmt_info
))
4994 ncopies
= (LOOP_VINFO_VECT_FACTOR (loop_vinfo
)
4995 / TYPE_VECTOR_SUBPARTS (vectype_in
));
4997 gcc_assert (ncopies
>= 1);
4999 vec_mode
= TYPE_MODE (vectype_in
);
5001 if (code
== COND_EXPR
)
5003 if (!vectorizable_condition (stmt
, gsi
, NULL
, ops
[reduc_index
], 0, NULL
))
5005 if (dump_enabled_p ())
5006 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
5007 "unsupported condition in reduction\n");
5014 /* 4. Supportable by target? */
5016 if (code
== LSHIFT_EXPR
|| code
== RSHIFT_EXPR
5017 || code
== LROTATE_EXPR
|| code
== RROTATE_EXPR
)
5019 /* Shifts and rotates are only supported by vectorizable_shifts,
5020 not vectorizable_reduction. */
5021 if (dump_enabled_p ())
5022 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
5023 "unsupported shift or rotation.\n");
5027 /* 4.1. check support for the operation in the loop */
5028 optab
= optab_for_tree_code (code
, vectype_in
, optab_default
);
5031 if (dump_enabled_p ())
5032 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
5038 if (optab_handler (optab
, vec_mode
) == CODE_FOR_nothing
)
5040 if (dump_enabled_p ())
5041 dump_printf (MSG_NOTE
, "op not supported by target.\n");
5043 if (GET_MODE_SIZE (vec_mode
) != UNITS_PER_WORD
5044 || LOOP_VINFO_VECT_FACTOR (loop_vinfo
)
5045 < vect_min_worthwhile_factor (code
))
5048 if (dump_enabled_p ())
5049 dump_printf (MSG_NOTE
, "proceeding using word mode.\n");
5052 /* Worthwhile without SIMD support? */
5053 if (!VECTOR_MODE_P (TYPE_MODE (vectype_in
))
5054 && LOOP_VINFO_VECT_FACTOR (loop_vinfo
)
5055 < vect_min_worthwhile_factor (code
))
5057 if (dump_enabled_p ())
5058 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
5059 "not worthwhile without SIMD support.\n");
5065 /* 4.2. Check support for the epilog operation.
5067 If STMT represents a reduction pattern, then the type of the
5068 reduction variable may be different than the type of the rest
5069 of the arguments. For example, consider the case of accumulation
5070 of shorts into an int accumulator; The original code:
5071 S1: int_a = (int) short_a;
5072 orig_stmt-> S2: int_acc = plus <int_a ,int_acc>;
5075 STMT: int_acc = widen_sum <short_a, int_acc>
5078 1. The tree-code that is used to create the vector operation in the
5079 epilog code (that reduces the partial results) is not the
5080 tree-code of STMT, but is rather the tree-code of the original
5081 stmt from the pattern that STMT is replacing. I.e, in the example
5082 above we want to use 'widen_sum' in the loop, but 'plus' in the
5084 2. The type (mode) we use to check available target support
5085 for the vector operation to be created in the *epilog*, is
5086 determined by the type of the reduction variable (in the example
5087 above we'd check this: optab_handler (plus_optab, vect_int_mode])).
5088 However the type (mode) we use to check available target support
5089 for the vector operation to be created *inside the loop*, is
5090 determined by the type of the other arguments to STMT (in the
5091 example we'd check this: optab_handler (widen_sum_optab,
5094 This is contrary to "regular" reductions, in which the types of all
5095 the arguments are the same as the type of the reduction variable.
5096 For "regular" reductions we can therefore use the same vector type
5097 (and also the same tree-code) when generating the epilog code and
5098 when generating the code inside the loop. */
5102 /* This is a reduction pattern: get the vectype from the type of the
5103 reduction variable, and get the tree-code from orig_stmt. */
5104 orig_code
= gimple_assign_rhs_code (orig_stmt
);
5105 gcc_assert (vectype_out
);
5106 vec_mode
= TYPE_MODE (vectype_out
);
5110 /* Regular reduction: use the same vectype and tree-code as used for
5111 the vector code inside the loop can be used for the epilog code. */
5117 def_bb
= gimple_bb (reduc_def_stmt
);
5118 def_stmt_loop
= def_bb
->loop_father
;
5119 def_arg
= PHI_ARG_DEF_FROM_EDGE (reduc_def_stmt
,
5120 loop_preheader_edge (def_stmt_loop
));
5121 if (TREE_CODE (def_arg
) == SSA_NAME
5122 && (def_arg_stmt
= SSA_NAME_DEF_STMT (def_arg
))
5123 && gimple_code (def_arg_stmt
) == GIMPLE_PHI
5124 && flow_bb_inside_loop_p (outer_loop
, gimple_bb (def_arg_stmt
))
5125 && vinfo_for_stmt (def_arg_stmt
)
5126 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_arg_stmt
))
5127 == vect_double_reduction_def
)
5128 double_reduc
= true;
5131 epilog_reduc_code
= ERROR_MARK
;
5132 if (reduction_code_for_scalar_code (orig_code
, &epilog_reduc_code
))
5134 reduc_optab
= optab_for_tree_code (epilog_reduc_code
, vectype_out
,
5138 if (dump_enabled_p ())
5139 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
5140 "no optab for reduction.\n");
5142 epilog_reduc_code
= ERROR_MARK
;
5144 else if (optab_handler (reduc_optab
, vec_mode
) == CODE_FOR_nothing
)
5146 optab
= scalar_reduc_to_vector (reduc_optab
, vectype_out
);
5147 if (optab_handler (optab
, vec_mode
) == CODE_FOR_nothing
)
5149 if (dump_enabled_p ())
5150 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
5151 "reduc op not supported by target.\n");
5153 epilog_reduc_code
= ERROR_MARK
;
5159 if (!nested_cycle
|| double_reduc
)
5161 if (dump_enabled_p ())
5162 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
5163 "no reduc code for scalar code.\n");
5169 if (double_reduc
&& ncopies
> 1)
5171 if (dump_enabled_p ())
5172 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
5173 "multiple types in double reduction\n");
5178 /* In case of widenning multiplication by a constant, we update the type
5179 of the constant to be the type of the other operand. We check that the
5180 constant fits the type in the pattern recognition pass. */
5181 if (code
== DOT_PROD_EXPR
5182 && !types_compatible_p (TREE_TYPE (ops
[0]), TREE_TYPE (ops
[1])))
5184 if (TREE_CODE (ops
[0]) == INTEGER_CST
)
5185 ops
[0] = fold_convert (TREE_TYPE (ops
[1]), ops
[0]);
5186 else if (TREE_CODE (ops
[1]) == INTEGER_CST
)
5187 ops
[1] = fold_convert (TREE_TYPE (ops
[0]), ops
[1]);
5190 if (dump_enabled_p ())
5191 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
5192 "invalid types in dot-prod\n");
5198 if (!vec_stmt
) /* transformation not required. */
5200 if (!vect_model_reduction_cost (stmt_info
, epilog_reduc_code
, ncopies
))
5202 STMT_VINFO_TYPE (stmt_info
) = reduc_vec_info_type
;
5208 if (dump_enabled_p ())
5209 dump_printf_loc (MSG_NOTE
, vect_location
, "transform reduction.\n");
5211 /* FORNOW: Multiple types are not supported for condition. */
5212 if (code
== COND_EXPR
)
5213 gcc_assert (ncopies
== 1);
5215 /* Create the destination vector */
5216 vec_dest
= vect_create_destination_var (scalar_dest
, vectype_out
);
5218 /* In case the vectorization factor (VF) is bigger than the number
5219 of elements that we can fit in a vectype (nunits), we have to generate
5220 more than one vector stmt - i.e - we need to "unroll" the
5221 vector stmt by a factor VF/nunits. For more details see documentation
5222 in vectorizable_operation. */
5224 /* If the reduction is used in an outer loop we need to generate
5225 VF intermediate results, like so (e.g. for ncopies=2):
5230 (i.e. we generate VF results in 2 registers).
5231 In this case we have a separate def-use cycle for each copy, and therefore
5232 for each copy we get the vector def for the reduction variable from the
5233 respective phi node created for this copy.
5235 Otherwise (the reduction is unused in the loop nest), we can combine
5236 together intermediate results, like so (e.g. for ncopies=2):
5240 (i.e. we generate VF/2 results in a single register).
5241 In this case for each copy we get the vector def for the reduction variable
5242 from the vectorized reduction operation generated in the previous iteration.
5245 if (STMT_VINFO_RELEVANT (stmt_info
) == vect_unused_in_scope
)
5247 single_defuse_cycle
= true;
5251 epilog_copies
= ncopies
;
5253 prev_stmt_info
= NULL
;
5254 prev_phi_info
= NULL
;
5257 vec_num
= SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node
);
5258 gcc_assert (TYPE_VECTOR_SUBPARTS (vectype_out
)
5259 == TYPE_VECTOR_SUBPARTS (vectype_in
));
5264 vec_oprnds0
.create (1);
5265 if (op_type
== ternary_op
)
5266 vec_oprnds1
.create (1);
5269 phis
.create (vec_num
);
5270 vect_defs
.create (vec_num
);
5272 vect_defs
.quick_push (NULL_TREE
);
5274 for (j
= 0; j
< ncopies
; j
++)
5276 if (j
== 0 || !single_defuse_cycle
)
5278 for (i
= 0; i
< vec_num
; i
++)
5280 /* Create the reduction-phi that defines the reduction
5282 new_phi
= create_phi_node (vec_dest
, loop
->header
);
5283 set_vinfo_for_stmt (new_phi
,
5284 new_stmt_vec_info (new_phi
, loop_vinfo
,
5286 if (j
== 0 || slp_node
)
5287 phis
.quick_push (new_phi
);
5291 if (code
== COND_EXPR
)
5293 gcc_assert (!slp_node
);
5294 vectorizable_condition (stmt
, gsi
, vec_stmt
,
5295 PHI_RESULT (phis
[0]),
5297 /* Multiple types are not supported for condition. */
5304 op0
= ops
[!reduc_index
];
5305 if (op_type
== ternary_op
)
5307 if (reduc_index
== 0)
5314 vect_get_vec_defs (op0
, op1
, stmt
, &vec_oprnds0
, &vec_oprnds1
,
5318 loop_vec_def0
= vect_get_vec_def_for_operand (ops
[!reduc_index
],
5320 vec_oprnds0
.quick_push (loop_vec_def0
);
5321 if (op_type
== ternary_op
)
5323 loop_vec_def1
= vect_get_vec_def_for_operand (op1
, stmt
,
5325 vec_oprnds1
.quick_push (loop_vec_def1
);
5333 enum vect_def_type dt
;
5337 vect_is_simple_use (ops
[!reduc_index
], stmt
, loop_vinfo
, NULL
,
5338 &dummy_stmt
, &dummy
, &dt
);
5339 loop_vec_def0
= vect_get_vec_def_for_stmt_copy (dt
,
5341 vec_oprnds0
[0] = loop_vec_def0
;
5342 if (op_type
== ternary_op
)
5344 vect_is_simple_use (op1
, stmt
, loop_vinfo
, NULL
, &dummy_stmt
,
5346 loop_vec_def1
= vect_get_vec_def_for_stmt_copy (dt
,
5348 vec_oprnds1
[0] = loop_vec_def1
;
5352 if (single_defuse_cycle
)
5353 reduc_def
= gimple_assign_lhs (new_stmt
);
5355 STMT_VINFO_RELATED_STMT (prev_phi_info
) = new_phi
;
5358 FOR_EACH_VEC_ELT (vec_oprnds0
, i
, def0
)
5361 reduc_def
= PHI_RESULT (phis
[i
]);
5364 if (!single_defuse_cycle
|| j
== 0)
5365 reduc_def
= PHI_RESULT (new_phi
);
5368 def1
= ((op_type
== ternary_op
)
5369 ? vec_oprnds1
[i
] : NULL
);
5370 if (op_type
== binary_op
)
5372 if (reduc_index
== 0)
5373 expr
= build2 (code
, vectype_out
, reduc_def
, def0
);
5375 expr
= build2 (code
, vectype_out
, def0
, reduc_def
);
5379 if (reduc_index
== 0)
5380 expr
= build3 (code
, vectype_out
, reduc_def
, def0
, def1
);
5383 if (reduc_index
== 1)
5384 expr
= build3 (code
, vectype_out
, def0
, reduc_def
, def1
);
5386 expr
= build3 (code
, vectype_out
, def0
, def1
, reduc_def
);
5390 new_stmt
= gimple_build_assign (vec_dest
, expr
);
5391 new_temp
= make_ssa_name (vec_dest
, new_stmt
);
5392 gimple_assign_set_lhs (new_stmt
, new_temp
);
5393 vect_finish_stmt_generation (stmt
, new_stmt
, gsi
);
5397 SLP_TREE_VEC_STMTS (slp_node
).quick_push (new_stmt
);
5398 vect_defs
.quick_push (new_temp
);
5401 vect_defs
[0] = new_temp
;
5408 STMT_VINFO_VEC_STMT (stmt_info
) = *vec_stmt
= new_stmt
;
5410 STMT_VINFO_RELATED_STMT (prev_stmt_info
) = new_stmt
;
5412 prev_stmt_info
= vinfo_for_stmt (new_stmt
);
5413 prev_phi_info
= vinfo_for_stmt (new_phi
);
5416 /* Finalize the reduction-phi (set its arguments) and create the
5417 epilog reduction code. */
5418 if ((!single_defuse_cycle
|| code
== COND_EXPR
) && !slp_node
)
5420 new_temp
= gimple_assign_lhs (*vec_stmt
);
5421 vect_defs
[0] = new_temp
;
5424 vect_create_epilog_for_reduction (vect_defs
, stmt
, epilog_copies
,
5425 epilog_reduc_code
, phis
, reduc_index
,
5426 double_reduc
, slp_node
);
5431 /* Function vect_min_worthwhile_factor.
5433 For a loop where we could vectorize the operation indicated by CODE,
5434 return the minimum vectorization factor that makes it worthwhile
5435 to use generic vectors. */
5437 vect_min_worthwhile_factor (enum tree_code code
)
5458 /* Function vectorizable_induction
5460 Check if PHI performs an induction computation that can be vectorized.
5461 If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
5462 phi to replace it, put it in VEC_STMT, and add it to the same basic block.
5463 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
5466 vectorizable_induction (gimple phi
, gimple_stmt_iterator
*gsi ATTRIBUTE_UNUSED
,
5469 stmt_vec_info stmt_info
= vinfo_for_stmt (phi
);
5470 tree vectype
= STMT_VINFO_VECTYPE (stmt_info
);
5471 loop_vec_info loop_vinfo
= STMT_VINFO_LOOP_VINFO (stmt_info
);
5472 struct loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
5473 int nunits
= TYPE_VECTOR_SUBPARTS (vectype
);
5474 int ncopies
= LOOP_VINFO_VECT_FACTOR (loop_vinfo
) / nunits
;
5477 gcc_assert (ncopies
>= 1);
5478 /* FORNOW. These restrictions should be relaxed. */
5479 if (nested_in_vect_loop_p (loop
, phi
))
5481 imm_use_iterator imm_iter
;
5482 use_operand_p use_p
;
5489 if (dump_enabled_p ())
5490 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
5491 "multiple types in nested loop.\n");
5496 latch_e
= loop_latch_edge (loop
->inner
);
5497 loop_arg
= PHI_ARG_DEF_FROM_EDGE (phi
, latch_e
);
5498 FOR_EACH_IMM_USE_FAST (use_p
, imm_iter
, loop_arg
)
5500 gimple use_stmt
= USE_STMT (use_p
);
5501 if (is_gimple_debug (use_stmt
))
5504 if (!flow_bb_inside_loop_p (loop
->inner
, gimple_bb (use_stmt
)))
5506 exit_phi
= use_stmt
;
5512 stmt_vec_info exit_phi_vinfo
= vinfo_for_stmt (exit_phi
);
5513 if (!(STMT_VINFO_RELEVANT_P (exit_phi_vinfo
)
5514 && !STMT_VINFO_LIVE_P (exit_phi_vinfo
)))
5516 if (dump_enabled_p ())
5517 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
5518 "inner-loop induction only used outside "
5519 "of the outer vectorized loop.\n");
5525 if (!STMT_VINFO_RELEVANT_P (stmt_info
))
5528 /* FORNOW: SLP not supported. */
5529 if (STMT_SLP_TYPE (stmt_info
))
5532 gcc_assert (STMT_VINFO_DEF_TYPE (stmt_info
) == vect_induction_def
);
5534 if (gimple_code (phi
) != GIMPLE_PHI
)
5537 if (!vec_stmt
) /* transformation not required. */
5539 STMT_VINFO_TYPE (stmt_info
) = induc_vec_info_type
;
5540 if (dump_enabled_p ())
5541 dump_printf_loc (MSG_NOTE
, vect_location
,
5542 "=== vectorizable_induction ===\n");
5543 vect_model_induction_cost (stmt_info
, ncopies
);
5549 if (dump_enabled_p ())
5550 dump_printf_loc (MSG_NOTE
, vect_location
, "transform induction phi.\n");
5552 vec_def
= get_initial_def_for_induction (phi
);
5553 *vec_stmt
= SSA_NAME_DEF_STMT (vec_def
);
5557 /* Function vectorizable_live_operation.
5559 STMT computes a value that is used outside the loop. Check if
5560 it can be supported. */
5563 vectorizable_live_operation (gimple stmt
,
5564 gimple_stmt_iterator
*gsi ATTRIBUTE_UNUSED
,
5567 stmt_vec_info stmt_info
= vinfo_for_stmt (stmt
);
5568 loop_vec_info loop_vinfo
= STMT_VINFO_LOOP_VINFO (stmt_info
);
5569 struct loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
5575 enum vect_def_type dt
;
5576 enum tree_code code
;
5577 enum gimple_rhs_class rhs_class
;
5579 gcc_assert (STMT_VINFO_LIVE_P (stmt_info
));
5581 if (STMT_VINFO_DEF_TYPE (stmt_info
) == vect_reduction_def
)
5584 if (!is_gimple_assign (stmt
))
5586 if (gimple_call_internal_p (stmt
)
5587 && gimple_call_internal_fn (stmt
) == IFN_GOMP_SIMD_LANE
5588 && gimple_call_lhs (stmt
)
5590 && TREE_CODE (gimple_call_arg (stmt
, 0)) == SSA_NAME
5592 == SSA_NAME_VAR (gimple_call_arg (stmt
, 0)))
5594 edge e
= single_exit (loop
);
5595 basic_block merge_bb
= e
->dest
;
5596 imm_use_iterator imm_iter
;
5597 use_operand_p use_p
;
5598 tree lhs
= gimple_call_lhs (stmt
);
5600 FOR_EACH_IMM_USE_FAST (use_p
, imm_iter
, lhs
)
5602 gimple use_stmt
= USE_STMT (use_p
);
5603 if (gimple_code (use_stmt
) == GIMPLE_PHI
5604 && gimple_bb (use_stmt
) == merge_bb
)
5609 = build_int_cst (unsigned_type_node
,
5610 loop_vinfo
->vectorization_factor
- 1);
5611 SET_PHI_ARG_DEF (use_stmt
, e
->dest_idx
, vfm1
);
5621 if (TREE_CODE (gimple_assign_lhs (stmt
)) != SSA_NAME
)
5624 /* FORNOW. CHECKME. */
5625 if (nested_in_vect_loop_p (loop
, stmt
))
5628 code
= gimple_assign_rhs_code (stmt
);
5629 op_type
= TREE_CODE_LENGTH (code
);
5630 rhs_class
= get_gimple_rhs_class (code
);
5631 gcc_assert (rhs_class
!= GIMPLE_UNARY_RHS
|| op_type
== unary_op
);
5632 gcc_assert (rhs_class
!= GIMPLE_BINARY_RHS
|| op_type
== binary_op
);
5634 /* FORNOW: support only if all uses are invariant. This means
5635 that the scalar operations can remain in place, unvectorized.
5636 The original last scalar value that they compute will be used. */
5638 for (i
= 0; i
< op_type
; i
++)
5640 if (rhs_class
== GIMPLE_SINGLE_RHS
)
5641 op
= TREE_OPERAND (gimple_op (stmt
, 1), i
);
5643 op
= gimple_op (stmt
, i
+ 1);
5645 && !vect_is_simple_use (op
, stmt
, loop_vinfo
, NULL
, &def_stmt
, &def
,
5648 if (dump_enabled_p ())
5649 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
5650 "use not simple.\n");
5654 if (dt
!= vect_external_def
&& dt
!= vect_constant_def
)
5658 /* No transformation is required for the cases we currently support. */
5662 /* Kill any debug uses outside LOOP of SSA names defined in STMT. */
5665 vect_loop_kill_debug_uses (struct loop
*loop
, gimple stmt
)
5667 ssa_op_iter op_iter
;
5668 imm_use_iterator imm_iter
;
5669 def_operand_p def_p
;
5672 FOR_EACH_PHI_OR_STMT_DEF (def_p
, stmt
, op_iter
, SSA_OP_DEF
)
5674 FOR_EACH_IMM_USE_STMT (ustmt
, imm_iter
, DEF_FROM_PTR (def_p
))
5678 if (!is_gimple_debug (ustmt
))
5681 bb
= gimple_bb (ustmt
);
5683 if (!flow_bb_inside_loop_p (loop
, bb
))
5685 if (gimple_debug_bind_p (ustmt
))
5687 if (dump_enabled_p ())
5688 dump_printf_loc (MSG_NOTE
, vect_location
,
5689 "killing debug use\n");
5691 gimple_debug_bind_reset_value (ustmt
);
5692 update_stmt (ustmt
);
5702 /* This function builds ni_name = number of iterations. Statements
5703 are emitted on the loop preheader edge. */
5706 vect_build_loop_niters (loop_vec_info loop_vinfo
)
5708 tree ni
= unshare_expr (LOOP_VINFO_NITERS (loop_vinfo
));
5709 if (TREE_CODE (ni
) == INTEGER_CST
)
5714 gimple_seq stmts
= NULL
;
5715 edge pe
= loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo
));
5717 var
= create_tmp_var (TREE_TYPE (ni
), "niters");
5718 ni_name
= force_gimple_operand (ni
, &stmts
, false, var
);
5720 gsi_insert_seq_on_edge_immediate (pe
, stmts
);
5727 /* This function generates the following statements:
5729 ni_name = number of iterations loop executes
5730 ratio = ni_name / vf
5731 ratio_mult_vf_name = ratio * vf
5733 and places them on the loop preheader edge. */
5736 vect_generate_tmps_on_preheader (loop_vec_info loop_vinfo
,
5738 tree
*ratio_mult_vf_name_ptr
,
5739 tree
*ratio_name_ptr
)
5741 tree ni_minus_gap_name
;
5744 tree ratio_mult_vf_name
;
5745 int vf
= LOOP_VINFO_VECT_FACTOR (loop_vinfo
);
5746 edge pe
= loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo
));
5749 log_vf
= build_int_cst (TREE_TYPE (ni_name
), exact_log2 (vf
));
5751 /* If epilogue loop is required because of data accesses with gaps, we
5752 subtract one iteration from the total number of iterations here for
5753 correct calculation of RATIO. */
5754 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo
))
5756 ni_minus_gap_name
= fold_build2 (MINUS_EXPR
, TREE_TYPE (ni_name
),
5758 build_one_cst (TREE_TYPE (ni_name
)));
5759 if (!is_gimple_val (ni_minus_gap_name
))
5761 var
= create_tmp_var (TREE_TYPE (ni_name
), "ni_gap");
5762 gimple stmts
= NULL
;
5763 ni_minus_gap_name
= force_gimple_operand (ni_minus_gap_name
, &stmts
,
5765 gsi_insert_seq_on_edge_immediate (pe
, stmts
);
5769 ni_minus_gap_name
= ni_name
;
5771 /* Create: ratio = ni >> log2(vf) */
5772 /* ??? As we have ni == number of latch executions + 1, ni could
5773 have overflown to zero. So avoid computing ratio based on ni
5774 but compute it using the fact that we know ratio will be at least
5775 one, thus via (ni - vf) >> log2(vf) + 1. */
5777 = fold_build2 (PLUS_EXPR
, TREE_TYPE (ni_name
),
5778 fold_build2 (RSHIFT_EXPR
, TREE_TYPE (ni_name
),
5779 fold_build2 (MINUS_EXPR
, TREE_TYPE (ni_name
),
5782 (TREE_TYPE (ni_name
), vf
)),
5784 build_int_cst (TREE_TYPE (ni_name
), 1));
5785 if (!is_gimple_val (ratio_name
))
5787 var
= create_tmp_var (TREE_TYPE (ni_name
), "bnd");
5788 gimple stmts
= NULL
;
5789 ratio_name
= force_gimple_operand (ratio_name
, &stmts
, true, var
);
5790 gsi_insert_seq_on_edge_immediate (pe
, stmts
);
5792 *ratio_name_ptr
= ratio_name
;
5794 /* Create: ratio_mult_vf = ratio << log2 (vf). */
5796 if (ratio_mult_vf_name_ptr
)
5798 ratio_mult_vf_name
= fold_build2 (LSHIFT_EXPR
, TREE_TYPE (ratio_name
),
5799 ratio_name
, log_vf
);
5800 if (!is_gimple_val (ratio_mult_vf_name
))
5802 var
= create_tmp_var (TREE_TYPE (ni_name
), "ratio_mult_vf");
5803 gimple stmts
= NULL
;
5804 ratio_mult_vf_name
= force_gimple_operand (ratio_mult_vf_name
, &stmts
,
5806 gsi_insert_seq_on_edge_immediate (pe
, stmts
);
5808 *ratio_mult_vf_name_ptr
= ratio_mult_vf_name
;
5815 /* Function vect_transform_loop.
5817 The analysis phase has determined that the loop is vectorizable.
5818 Vectorize the loop - created vectorized stmts to replace the scalar
5819 stmts in the loop, and update the loop exit condition. */
5822 vect_transform_loop (loop_vec_info loop_vinfo
)
5824 struct loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
5825 basic_block
*bbs
= LOOP_VINFO_BBS (loop_vinfo
);
5826 int nbbs
= loop
->num_nodes
;
5829 int vectorization_factor
= LOOP_VINFO_VECT_FACTOR (loop_vinfo
);
5831 bool slp_scheduled
= false;
5832 gimple stmt
, pattern_stmt
;
5833 gimple_seq pattern_def_seq
= NULL
;
5834 gimple_stmt_iterator pattern_def_si
= gsi_none ();
5835 bool transform_pattern_stmt
= false;
5836 bool check_profitability
= false;
5838 /* Record number of iterations before we started tampering with the profile. */
5839 gcov_type expected_iterations
= expected_loop_iterations_unbounded (loop
);
5841 if (dump_enabled_p ())
5842 dump_printf_loc (MSG_NOTE
, vect_location
, "=== vec_transform_loop ===\n");
5844 /* If profile is inprecise, we have chance to fix it up. */
5845 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo
))
5846 expected_iterations
= LOOP_VINFO_INT_NITERS (loop_vinfo
);
5848 /* Use the more conservative vectorization threshold. If the number
5849 of iterations is constant assume the cost check has been performed
5850 by our caller. If the threshold makes all loops profitable that
5851 run at least the vectorization factor number of times checking
5852 is pointless, too. */
5853 th
= LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo
);
5854 if (th
>= LOOP_VINFO_VECT_FACTOR (loop_vinfo
) - 1
5855 && !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo
))
5857 if (dump_enabled_p ())
5858 dump_printf_loc (MSG_NOTE
, vect_location
,
5859 "Profitability threshold is %d loop iterations.\n",
5861 check_profitability
= true;
5864 /* Version the loop first, if required, so the profitability check
5867 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo
)
5868 || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo
))
5870 vect_loop_versioning (loop_vinfo
, th
, check_profitability
);
5871 check_profitability
= false;
5874 tree ni_name
= vect_build_loop_niters (loop_vinfo
);
5875 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo
) = ni_name
;
5877 /* Peel the loop if there are data refs with unknown alignment.
5878 Only one data ref with unknown store is allowed. */
5880 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo
))
5882 vect_do_peeling_for_alignment (loop_vinfo
, ni_name
,
5883 th
, check_profitability
);
5884 check_profitability
= false;
5885 /* The above adjusts LOOP_VINFO_NITERS, so cause ni_name to
5887 ni_name
= NULL_TREE
;
5890 /* If the loop has a symbolic number of iterations 'n' (i.e. it's not a
5891 compile time constant), or it is a constant that doesn't divide by the
5892 vectorization factor, then an epilog loop needs to be created.
5893 We therefore duplicate the loop: the original loop will be vectorized,
5894 and will compute the first (n/VF) iterations. The second copy of the loop
5895 will remain scalar and will compute the remaining (n%VF) iterations.
5896 (VF is the vectorization factor). */
5898 if (LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo
)
5899 || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo
))
5903 ni_name
= vect_build_loop_niters (loop_vinfo
);
5904 vect_generate_tmps_on_preheader (loop_vinfo
, ni_name
, &ratio_mult_vf
,
5906 vect_do_peeling_for_loop_bound (loop_vinfo
, ni_name
, ratio_mult_vf
,
5907 th
, check_profitability
);
5909 else if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo
))
5910 ratio
= build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo
)),
5911 LOOP_VINFO_INT_NITERS (loop_vinfo
) / vectorization_factor
);
5915 ni_name
= vect_build_loop_niters (loop_vinfo
);
5916 vect_generate_tmps_on_preheader (loop_vinfo
, ni_name
, NULL
, &ratio
);
5919 /* 1) Make sure the loop header has exactly two entries
5920 2) Make sure we have a preheader basic block. */
5922 gcc_assert (EDGE_COUNT (loop
->header
->preds
) == 2);
5924 split_edge (loop_preheader_edge (loop
));
5926 /* FORNOW: the vectorizer supports only loops which body consist
5927 of one basic block (header + empty latch). When the vectorizer will
5928 support more involved loop forms, the order by which the BBs are
5929 traversed need to be reconsidered. */
5931 for (i
= 0; i
< nbbs
; i
++)
5933 basic_block bb
= bbs
[i
];
5934 stmt_vec_info stmt_info
;
5936 for (gphi_iterator si
= gsi_start_phis (bb
); !gsi_end_p (si
);
5939 gphi
*phi
= si
.phi ();
5940 if (dump_enabled_p ())
5942 dump_printf_loc (MSG_NOTE
, vect_location
,
5943 "------>vectorizing phi: ");
5944 dump_gimple_stmt (MSG_NOTE
, TDF_SLIM
, phi
, 0);
5945 dump_printf (MSG_NOTE
, "\n");
5947 stmt_info
= vinfo_for_stmt (phi
);
5951 if (MAY_HAVE_DEBUG_STMTS
&& !STMT_VINFO_LIVE_P (stmt_info
))
5952 vect_loop_kill_debug_uses (loop
, phi
);
5954 if (!STMT_VINFO_RELEVANT_P (stmt_info
)
5955 && !STMT_VINFO_LIVE_P (stmt_info
))
5958 if (STMT_VINFO_VECTYPE (stmt_info
)
5959 && (TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info
))
5960 != (unsigned HOST_WIDE_INT
) vectorization_factor
)
5961 && dump_enabled_p ())
5962 dump_printf_loc (MSG_NOTE
, vect_location
, "multiple-types.\n");
5964 if (STMT_VINFO_DEF_TYPE (stmt_info
) == vect_induction_def
)
5966 if (dump_enabled_p ())
5967 dump_printf_loc (MSG_NOTE
, vect_location
, "transform phi.\n");
5968 vect_transform_stmt (phi
, NULL
, NULL
, NULL
, NULL
);
5972 pattern_stmt
= NULL
;
5973 for (gimple_stmt_iterator si
= gsi_start_bb (bb
);
5974 !gsi_end_p (si
) || transform_pattern_stmt
;)
5978 if (transform_pattern_stmt
)
5979 stmt
= pattern_stmt
;
5982 stmt
= gsi_stmt (si
);
5983 /* During vectorization remove existing clobber stmts. */
5984 if (gimple_clobber_p (stmt
))
5986 unlink_stmt_vdef (stmt
);
5987 gsi_remove (&si
, true);
5988 release_defs (stmt
);
5993 if (dump_enabled_p ())
5995 dump_printf_loc (MSG_NOTE
, vect_location
,
5996 "------>vectorizing statement: ");
5997 dump_gimple_stmt (MSG_NOTE
, TDF_SLIM
, stmt
, 0);
5998 dump_printf (MSG_NOTE
, "\n");
6001 stmt_info
= vinfo_for_stmt (stmt
);
6003 /* vector stmts created in the outer-loop during vectorization of
6004 stmts in an inner-loop may not have a stmt_info, and do not
6005 need to be vectorized. */
6012 if (MAY_HAVE_DEBUG_STMTS
&& !STMT_VINFO_LIVE_P (stmt_info
))
6013 vect_loop_kill_debug_uses (loop
, stmt
);
6015 if (!STMT_VINFO_RELEVANT_P (stmt_info
)
6016 && !STMT_VINFO_LIVE_P (stmt_info
))
6018 if (STMT_VINFO_IN_PATTERN_P (stmt_info
)
6019 && (pattern_stmt
= STMT_VINFO_RELATED_STMT (stmt_info
))
6020 && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt
))
6021 || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt
))))
6023 stmt
= pattern_stmt
;
6024 stmt_info
= vinfo_for_stmt (stmt
);
6032 else if (STMT_VINFO_IN_PATTERN_P (stmt_info
)
6033 && (pattern_stmt
= STMT_VINFO_RELATED_STMT (stmt_info
))
6034 && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt
))
6035 || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt
))))
6036 transform_pattern_stmt
= true;
6038 /* If pattern statement has def stmts, vectorize them too. */
6039 if (is_pattern_stmt_p (stmt_info
))
6041 if (pattern_def_seq
== NULL
)
6043 pattern_def_seq
= STMT_VINFO_PATTERN_DEF_SEQ (stmt_info
);
6044 pattern_def_si
= gsi_start (pattern_def_seq
);
6046 else if (!gsi_end_p (pattern_def_si
))
6047 gsi_next (&pattern_def_si
);
6048 if (pattern_def_seq
!= NULL
)
6050 gimple pattern_def_stmt
= NULL
;
6051 stmt_vec_info pattern_def_stmt_info
= NULL
;
6053 while (!gsi_end_p (pattern_def_si
))
6055 pattern_def_stmt
= gsi_stmt (pattern_def_si
);
6056 pattern_def_stmt_info
6057 = vinfo_for_stmt (pattern_def_stmt
);
6058 if (STMT_VINFO_RELEVANT_P (pattern_def_stmt_info
)
6059 || STMT_VINFO_LIVE_P (pattern_def_stmt_info
))
6061 gsi_next (&pattern_def_si
);
6064 if (!gsi_end_p (pattern_def_si
))
6066 if (dump_enabled_p ())
6068 dump_printf_loc (MSG_NOTE
, vect_location
,
6069 "==> vectorizing pattern def "
6071 dump_gimple_stmt (MSG_NOTE
, TDF_SLIM
,
6072 pattern_def_stmt
, 0);
6073 dump_printf (MSG_NOTE
, "\n");
6076 stmt
= pattern_def_stmt
;
6077 stmt_info
= pattern_def_stmt_info
;
6081 pattern_def_si
= gsi_none ();
6082 transform_pattern_stmt
= false;
6086 transform_pattern_stmt
= false;
6089 if (STMT_VINFO_VECTYPE (stmt_info
))
6093 TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info
));
6094 if (!STMT_SLP_TYPE (stmt_info
)
6095 && nunits
!= (unsigned int) vectorization_factor
6096 && dump_enabled_p ())
6097 /* For SLP VF is set according to unrolling factor, and not
6098 to vector size, hence for SLP this print is not valid. */
6099 dump_printf_loc (MSG_NOTE
, vect_location
, "multiple-types.\n");
6102 /* SLP. Schedule all the SLP instances when the first SLP stmt is
6104 if (STMT_SLP_TYPE (stmt_info
))
6108 slp_scheduled
= true;
6110 if (dump_enabled_p ())
6111 dump_printf_loc (MSG_NOTE
, vect_location
,
6112 "=== scheduling SLP instances ===\n");
6114 vect_schedule_slp (loop_vinfo
, NULL
);
6117 /* Hybrid SLP stmts must be vectorized in addition to SLP. */
6118 if (!vinfo_for_stmt (stmt
) || PURE_SLP_STMT (stmt_info
))
6120 if (!transform_pattern_stmt
&& gsi_end_p (pattern_def_si
))
6122 pattern_def_seq
= NULL
;
6129 /* -------- vectorize statement ------------ */
6130 if (dump_enabled_p ())
6131 dump_printf_loc (MSG_NOTE
, vect_location
, "transform statement.\n");
6133 grouped_store
= false;
6134 is_store
= vect_transform_stmt (stmt
, &si
, &grouped_store
, NULL
, NULL
);
6137 if (STMT_VINFO_GROUPED_ACCESS (stmt_info
))
6139 /* Interleaving. If IS_STORE is TRUE, the vectorization of the
6140 interleaving chain was completed - free all the stores in
6143 vect_remove_stores (GROUP_FIRST_ELEMENT (stmt_info
));
6147 /* Free the attached stmt_vec_info and remove the stmt. */
6148 gimple store
= gsi_stmt (si
);
6149 free_stmt_vec_info (store
);
6150 unlink_stmt_vdef (store
);
6151 gsi_remove (&si
, true);
6152 release_defs (store
);
6155 /* Stores can only appear at the end of pattern statements. */
6156 gcc_assert (!transform_pattern_stmt
);
6157 pattern_def_seq
= NULL
;
6159 else if (!transform_pattern_stmt
&& gsi_end_p (pattern_def_si
))
6161 pattern_def_seq
= NULL
;
6167 slpeel_make_loop_iterate_ntimes (loop
, ratio
);
6169 /* Reduce loop iterations by the vectorization factor. */
6170 scale_loop_profile (loop
, GCOV_COMPUTE_SCALE (1, vectorization_factor
),
6171 expected_iterations
/ vectorization_factor
);
6172 loop
->nb_iterations_upper_bound
6173 = wi::udiv_floor (loop
->nb_iterations_upper_bound
, vectorization_factor
);
6174 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo
)
6175 && loop
->nb_iterations_upper_bound
!= 0)
6176 loop
->nb_iterations_upper_bound
= loop
->nb_iterations_upper_bound
- 1;
6177 if (loop
->any_estimate
)
6179 loop
->nb_iterations_estimate
6180 = wi::udiv_floor (loop
->nb_iterations_estimate
, vectorization_factor
);
6181 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo
)
6182 && loop
->nb_iterations_estimate
!= 0)
6183 loop
->nb_iterations_estimate
= loop
->nb_iterations_estimate
- 1;
6186 if (dump_enabled_p ())
6188 dump_printf_loc (MSG_NOTE
, vect_location
,
6189 "LOOP VECTORIZED\n");
6191 dump_printf_loc (MSG_NOTE
, vect_location
,
6192 "OUTER LOOP VECTORIZED\n");
6193 dump_printf (MSG_NOTE
, "\n");