2 Copyright (C) 2003-2015 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com> and
4 Ira Rosen <irar@il.ibm.com>
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
24 #include "coretypes.h"
33 #include "fold-const.h"
34 #include "stor-layout.h"
36 #include "gimple-pretty-print.h"
37 #include "internal-fn.h"
39 #include "gimple-iterator.h"
40 #include "gimplify-me.h"
41 #include "tree-ssa-loop-ivopts.h"
42 #include "tree-ssa-loop-manip.h"
43 #include "tree-ssa-loop-niter.h"
44 #include "tree-pass.h"
47 #include "insn-config.h"
57 #include "insn-codes.h"
60 #include "diagnostic-core.h"
61 #include "tree-chrec.h"
62 #include "tree-scalar-evolution.h"
63 #include "tree-vectorizer.h"
66 /* Loop Vectorization Pass.
68 This pass tries to vectorize loops.
70 For example, the vectorizer transforms the following simple loop:
72 short a[N]; short b[N]; short c[N]; int i;
78 as if it was manually vectorized by rewriting the source code into:
80 typedef int __attribute__((mode(V8HI))) v8hi;
81 short a[N]; short b[N]; short c[N]; int i;
82 v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
85 for (i=0; i<N/8; i++){
92 The main entry to this pass is vectorize_loops(), in which
93 the vectorizer applies a set of analyses on a given set of loops,
94 followed by the actual vectorization transformation for the loops that
95 had successfully passed the analysis phase.
96 Throughout this pass we make a distinction between two types of
97 data: scalars (which are represented by SSA_NAMES), and memory references
98 ("data-refs"). These two types of data require different handling both
99 during analysis and transformation. The types of data-refs that the
100 vectorizer currently supports are ARRAY_REFS whose base is an array DECL
101 (not a pointer), and INDIRECT_REFS through pointers; both array and pointer
102 accesses are required to have a simple (consecutive) access pattern.
106 The driver for the analysis phase is vect_analyze_loop().
107 It applies a set of analyses, some of which rely on the scalar evolution
108 analyzer (scev) developed by Sebastian Pop.
110 During the analysis phase the vectorizer records some information
111 per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
112 loop, as well as general information about the loop as a whole, which is
113 recorded in a "loop_vec_info" struct attached to each loop.
115 Transformation phase:
116 =====================
117 The loop transformation phase scans all the stmts in the loop, and
118 creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
119 the loop that needs to be vectorized. It inserts the vector code sequence
120 just before the scalar stmt S, and records a pointer to the vector code
121 in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
122 attached to S). This pointer will be used for the vectorization of following
123 stmts which use the def of stmt S. Stmt S is removed if it writes to memory;
124 otherwise, we rely on dead code elimination for removing it.
126 For example, say stmt S1 was vectorized into stmt VS1:
129 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
132 To vectorize stmt S2, the vectorizer first finds the stmt that defines
133 the operand 'b' (S1), and gets the relevant vector def 'vb' from the
134 vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)). The
135 resulting sequence would be:
138 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
140 S2: a = b; STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2
142 Operands that are not SSA_NAMEs, are data-refs that appear in
143 load/store operations (like 'x[i]' in S1), and are handled differently.
147 Currently the only target specific information that is used is the
148 size of the vector (in bytes) - "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".
149 Targets that can support different sizes of vectors, for now will need
150 to specify one value for "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD". More
151 flexibility will be added in the future.
153 Since we only vectorize operations whose vector form can be
154 expressed using existing tree codes, to verify that an operation is
155 supported, the vectorizer checks the relevant optab at the relevant
156 machine_mode (e.g, optab_handler (add_optab, V8HImode)). If
157 the value found is CODE_FOR_nothing, then there's no target support, and
158 we can't vectorize the stmt.
160 For additional information on this project see:
161 http://gcc.gnu.org/projects/tree-ssa/vectorization.html
164 static void vect_estimate_min_profitable_iters (loop_vec_info
, int *, int *);
166 /* Function vect_determine_vectorization_factor
168 Determine the vectorization factor (VF). VF is the number of data elements
169 that are operated upon in parallel in a single iteration of the vectorized
170 loop. For example, when vectorizing a loop that operates on 4byte elements,
171 on a target with vector size (VS) 16byte, the VF is set to 4, since 4
172 elements can fit in a single vector register.
174 We currently support vectorization of loops in which all types operated upon
175 are of the same size. Therefore this function currently sets VF according to
176 the size of the types operated upon, and fails if there are multiple sizes
179 VF is also the factor by which the loop iterations are strip-mined, e.g.:
186 for (i=0; i<N; i+=VF){
187 a[i:VF] = b[i:VF] + c[i:VF];
192 vect_determine_vectorization_factor (loop_vec_info loop_vinfo
)
194 struct loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
195 basic_block
*bbs
= LOOP_VINFO_BBS (loop_vinfo
);
196 int nbbs
= loop
->num_nodes
;
197 unsigned int vectorization_factor
= 0;
202 stmt_vec_info stmt_info
;
205 gimple stmt
, pattern_stmt
= NULL
;
206 gimple_seq pattern_def_seq
= NULL
;
207 gimple_stmt_iterator pattern_def_si
= gsi_none ();
208 bool analyze_pattern_stmt
= false;
210 if (dump_enabled_p ())
211 dump_printf_loc (MSG_NOTE
, vect_location
,
212 "=== vect_determine_vectorization_factor ===\n");
214 for (i
= 0; i
< nbbs
; i
++)
216 basic_block bb
= bbs
[i
];
218 for (gphi_iterator si
= gsi_start_phis (bb
); !gsi_end_p (si
);
222 stmt_info
= vinfo_for_stmt (phi
);
223 if (dump_enabled_p ())
225 dump_printf_loc (MSG_NOTE
, vect_location
, "==> examining phi: ");
226 dump_gimple_stmt (MSG_NOTE
, TDF_SLIM
, phi
, 0);
227 dump_printf (MSG_NOTE
, "\n");
230 gcc_assert (stmt_info
);
232 if (STMT_VINFO_RELEVANT_P (stmt_info
))
234 gcc_assert (!STMT_VINFO_VECTYPE (stmt_info
));
235 scalar_type
= TREE_TYPE (PHI_RESULT (phi
));
237 if (dump_enabled_p ())
239 dump_printf_loc (MSG_NOTE
, vect_location
,
240 "get vectype for scalar type: ");
241 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, scalar_type
);
242 dump_printf (MSG_NOTE
, "\n");
245 vectype
= get_vectype_for_scalar_type (scalar_type
);
248 if (dump_enabled_p ())
250 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
251 "not vectorized: unsupported "
253 dump_generic_expr (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
,
255 dump_printf (MSG_MISSED_OPTIMIZATION
, "\n");
259 STMT_VINFO_VECTYPE (stmt_info
) = vectype
;
261 if (dump_enabled_p ())
263 dump_printf_loc (MSG_NOTE
, vect_location
, "vectype: ");
264 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, vectype
);
265 dump_printf (MSG_NOTE
, "\n");
268 nunits
= TYPE_VECTOR_SUBPARTS (vectype
);
269 if (dump_enabled_p ())
270 dump_printf_loc (MSG_NOTE
, vect_location
, "nunits = %d\n",
273 if (!vectorization_factor
274 || (nunits
> vectorization_factor
))
275 vectorization_factor
= nunits
;
279 for (gimple_stmt_iterator si
= gsi_start_bb (bb
);
280 !gsi_end_p (si
) || analyze_pattern_stmt
;)
284 if (analyze_pattern_stmt
)
287 stmt
= gsi_stmt (si
);
289 stmt_info
= vinfo_for_stmt (stmt
);
291 if (dump_enabled_p ())
293 dump_printf_loc (MSG_NOTE
, vect_location
,
294 "==> examining statement: ");
295 dump_gimple_stmt (MSG_NOTE
, TDF_SLIM
, stmt
, 0);
296 dump_printf (MSG_NOTE
, "\n");
299 gcc_assert (stmt_info
);
301 /* Skip stmts which do not need to be vectorized. */
302 if ((!STMT_VINFO_RELEVANT_P (stmt_info
)
303 && !STMT_VINFO_LIVE_P (stmt_info
))
304 || gimple_clobber_p (stmt
))
306 if (STMT_VINFO_IN_PATTERN_P (stmt_info
)
307 && (pattern_stmt
= STMT_VINFO_RELATED_STMT (stmt_info
))
308 && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt
))
309 || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt
))))
312 stmt_info
= vinfo_for_stmt (pattern_stmt
);
313 if (dump_enabled_p ())
315 dump_printf_loc (MSG_NOTE
, vect_location
,
316 "==> examining pattern statement: ");
317 dump_gimple_stmt (MSG_NOTE
, TDF_SLIM
, stmt
, 0);
318 dump_printf (MSG_NOTE
, "\n");
323 if (dump_enabled_p ())
324 dump_printf_loc (MSG_NOTE
, vect_location
, "skip.\n");
329 else if (STMT_VINFO_IN_PATTERN_P (stmt_info
)
330 && (pattern_stmt
= STMT_VINFO_RELATED_STMT (stmt_info
))
331 && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt
))
332 || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt
))))
333 analyze_pattern_stmt
= true;
335 /* If a pattern statement has def stmts, analyze them too. */
336 if (is_pattern_stmt_p (stmt_info
))
338 if (pattern_def_seq
== NULL
)
340 pattern_def_seq
= STMT_VINFO_PATTERN_DEF_SEQ (stmt_info
);
341 pattern_def_si
= gsi_start (pattern_def_seq
);
343 else if (!gsi_end_p (pattern_def_si
))
344 gsi_next (&pattern_def_si
);
345 if (pattern_def_seq
!= NULL
)
347 gimple pattern_def_stmt
= NULL
;
348 stmt_vec_info pattern_def_stmt_info
= NULL
;
350 while (!gsi_end_p (pattern_def_si
))
352 pattern_def_stmt
= gsi_stmt (pattern_def_si
);
353 pattern_def_stmt_info
354 = vinfo_for_stmt (pattern_def_stmt
);
355 if (STMT_VINFO_RELEVANT_P (pattern_def_stmt_info
)
356 || STMT_VINFO_LIVE_P (pattern_def_stmt_info
))
358 gsi_next (&pattern_def_si
);
361 if (!gsi_end_p (pattern_def_si
))
363 if (dump_enabled_p ())
365 dump_printf_loc (MSG_NOTE
, vect_location
,
366 "==> examining pattern def stmt: ");
367 dump_gimple_stmt (MSG_NOTE
, TDF_SLIM
,
368 pattern_def_stmt
, 0);
369 dump_printf (MSG_NOTE
, "\n");
372 stmt
= pattern_def_stmt
;
373 stmt_info
= pattern_def_stmt_info
;
377 pattern_def_si
= gsi_none ();
378 analyze_pattern_stmt
= false;
382 analyze_pattern_stmt
= false;
385 if (gimple_get_lhs (stmt
) == NULL_TREE
386 /* MASK_STORE has no lhs, but is ok. */
387 && (!is_gimple_call (stmt
)
388 || !gimple_call_internal_p (stmt
)
389 || gimple_call_internal_fn (stmt
) != IFN_MASK_STORE
))
391 if (is_gimple_call (stmt
))
393 /* Ignore calls with no lhs. These must be calls to
394 #pragma omp simd functions, and what vectorization factor
395 it really needs can't be determined until
396 vectorizable_simd_clone_call. */
397 if (!analyze_pattern_stmt
&& gsi_end_p (pattern_def_si
))
399 pattern_def_seq
= NULL
;
404 if (dump_enabled_p ())
406 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
407 "not vectorized: irregular stmt.");
408 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, stmt
,
410 dump_printf (MSG_MISSED_OPTIMIZATION
, "\n");
415 if (VECTOR_MODE_P (TYPE_MODE (gimple_expr_type (stmt
))))
417 if (dump_enabled_p ())
419 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
420 "not vectorized: vector stmt in loop:");
421 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, stmt
, 0);
422 dump_printf (MSG_MISSED_OPTIMIZATION
, "\n");
427 if (STMT_VINFO_VECTYPE (stmt_info
))
429 /* The only case when a vectype had been already set is for stmts
430 that contain a dataref, or for "pattern-stmts" (stmts
431 generated by the vectorizer to represent/replace a certain
433 gcc_assert (STMT_VINFO_DATA_REF (stmt_info
)
434 || is_pattern_stmt_p (stmt_info
)
435 || !gsi_end_p (pattern_def_si
));
436 vectype
= STMT_VINFO_VECTYPE (stmt_info
);
440 gcc_assert (!STMT_VINFO_DATA_REF (stmt_info
));
441 if (is_gimple_call (stmt
)
442 && gimple_call_internal_p (stmt
)
443 && gimple_call_internal_fn (stmt
) == IFN_MASK_STORE
)
444 scalar_type
= TREE_TYPE (gimple_call_arg (stmt
, 3));
446 scalar_type
= TREE_TYPE (gimple_get_lhs (stmt
));
447 if (dump_enabled_p ())
449 dump_printf_loc (MSG_NOTE
, vect_location
,
450 "get vectype for scalar type: ");
451 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, scalar_type
);
452 dump_printf (MSG_NOTE
, "\n");
454 vectype
= get_vectype_for_scalar_type (scalar_type
);
457 if (dump_enabled_p ())
459 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
460 "not vectorized: unsupported "
462 dump_generic_expr (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
,
464 dump_printf (MSG_MISSED_OPTIMIZATION
, "\n");
469 STMT_VINFO_VECTYPE (stmt_info
) = vectype
;
471 if (dump_enabled_p ())
473 dump_printf_loc (MSG_NOTE
, vect_location
, "vectype: ");
474 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, vectype
);
475 dump_printf (MSG_NOTE
, "\n");
479 /* The vectorization factor is according to the smallest
480 scalar type (or the largest vector size, but we only
481 support one vector size per loop). */
482 scalar_type
= vect_get_smallest_scalar_type (stmt
, &dummy
,
484 if (dump_enabled_p ())
486 dump_printf_loc (MSG_NOTE
, vect_location
,
487 "get vectype for scalar type: ");
488 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, scalar_type
);
489 dump_printf (MSG_NOTE
, "\n");
491 vf_vectype
= get_vectype_for_scalar_type (scalar_type
);
494 if (dump_enabled_p ())
496 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
497 "not vectorized: unsupported data-type ");
498 dump_generic_expr (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
,
500 dump_printf (MSG_MISSED_OPTIMIZATION
, "\n");
505 if ((GET_MODE_SIZE (TYPE_MODE (vectype
))
506 != GET_MODE_SIZE (TYPE_MODE (vf_vectype
))))
508 if (dump_enabled_p ())
510 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
511 "not vectorized: different sized vector "
512 "types in statement, ");
513 dump_generic_expr (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
,
515 dump_printf (MSG_MISSED_OPTIMIZATION
, " and ");
516 dump_generic_expr (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
,
518 dump_printf (MSG_MISSED_OPTIMIZATION
, "\n");
523 if (dump_enabled_p ())
525 dump_printf_loc (MSG_NOTE
, vect_location
, "vectype: ");
526 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, vf_vectype
);
527 dump_printf (MSG_NOTE
, "\n");
530 nunits
= TYPE_VECTOR_SUBPARTS (vf_vectype
);
531 if (dump_enabled_p ())
532 dump_printf_loc (MSG_NOTE
, vect_location
, "nunits = %d\n", nunits
);
533 if (!vectorization_factor
534 || (nunits
> vectorization_factor
))
535 vectorization_factor
= nunits
;
537 if (!analyze_pattern_stmt
&& gsi_end_p (pattern_def_si
))
539 pattern_def_seq
= NULL
;
545 /* TODO: Analyze cost. Decide if worth while to vectorize. */
546 if (dump_enabled_p ())
547 dump_printf_loc (MSG_NOTE
, vect_location
, "vectorization factor = %d\n",
548 vectorization_factor
);
549 if (vectorization_factor
<= 1)
551 if (dump_enabled_p ())
552 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
553 "not vectorized: unsupported data-type\n");
556 LOOP_VINFO_VECT_FACTOR (loop_vinfo
) = vectorization_factor
;
562 /* Function vect_is_simple_iv_evolution.
564 FORNOW: A simple evolution of an induction variable in the loop is
565 considered a polynomial evolution. */
568 vect_is_simple_iv_evolution (unsigned loop_nb
, tree access_fn
, tree
* init
,
573 tree evolution_part
= evolution_part_in_loop_num (access_fn
, loop_nb
);
576 /* When there is no evolution in this loop, the evolution function
578 if (evolution_part
== NULL_TREE
)
581 /* When the evolution is a polynomial of degree >= 2
582 the evolution function is not "simple". */
583 if (tree_is_chrec (evolution_part
))
586 step_expr
= evolution_part
;
587 init_expr
= unshare_expr (initial_condition_in_loop_num (access_fn
, loop_nb
));
589 if (dump_enabled_p ())
591 dump_printf_loc (MSG_NOTE
, vect_location
, "step: ");
592 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, step_expr
);
593 dump_printf (MSG_NOTE
, ", init: ");
594 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, init_expr
);
595 dump_printf (MSG_NOTE
, "\n");
601 if (TREE_CODE (step_expr
) != INTEGER_CST
602 && (TREE_CODE (step_expr
) != SSA_NAME
603 || ((bb
= gimple_bb (SSA_NAME_DEF_STMT (step_expr
)))
604 && flow_bb_inside_loop_p (get_loop (cfun
, loop_nb
), bb
))
605 || (!INTEGRAL_TYPE_P (TREE_TYPE (step_expr
))
606 && (!SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr
))
607 || !flag_associative_math
)))
608 && (TREE_CODE (step_expr
) != REAL_CST
609 || !flag_associative_math
))
611 if (dump_enabled_p ())
612 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
620 /* Function vect_analyze_scalar_cycles_1.
622 Examine the cross iteration def-use cycles of scalar variables
623 in LOOP. LOOP_VINFO represents the loop that is now being
624 considered for vectorization (can be LOOP, or an outer-loop
628 vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo
, struct loop
*loop
)
630 basic_block bb
= loop
->header
;
632 auto_vec
<gimple
, 64> worklist
;
636 if (dump_enabled_p ())
637 dump_printf_loc (MSG_NOTE
, vect_location
,
638 "=== vect_analyze_scalar_cycles ===\n");
640 /* First - identify all inductions. Reduction detection assumes that all the
641 inductions have been identified, therefore, this order must not be
643 for (gsi
= gsi_start_phis (bb
); !gsi_end_p (gsi
); gsi_next (&gsi
))
645 gphi
*phi
= gsi
.phi ();
646 tree access_fn
= NULL
;
647 tree def
= PHI_RESULT (phi
);
648 stmt_vec_info stmt_vinfo
= vinfo_for_stmt (phi
);
650 if (dump_enabled_p ())
652 dump_printf_loc (MSG_NOTE
, vect_location
, "Analyze phi: ");
653 dump_gimple_stmt (MSG_NOTE
, TDF_SLIM
, phi
, 0);
654 dump_printf (MSG_NOTE
, "\n");
657 /* Skip virtual phi's. The data dependences that are associated with
658 virtual defs/uses (i.e., memory accesses) are analyzed elsewhere. */
659 if (virtual_operand_p (def
))
662 STMT_VINFO_DEF_TYPE (stmt_vinfo
) = vect_unknown_def_type
;
664 /* Analyze the evolution function. */
665 access_fn
= analyze_scalar_evolution (loop
, def
);
668 STRIP_NOPS (access_fn
);
669 if (dump_enabled_p ())
671 dump_printf_loc (MSG_NOTE
, vect_location
,
672 "Access function of PHI: ");
673 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, access_fn
);
674 dump_printf (MSG_NOTE
, "\n");
676 STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo
)
677 = evolution_part_in_loop_num (access_fn
, loop
->num
);
681 || !vect_is_simple_iv_evolution (loop
->num
, access_fn
, &init
, &step
)
682 || (LOOP_VINFO_LOOP (loop_vinfo
) != loop
683 && TREE_CODE (step
) != INTEGER_CST
))
685 worklist
.safe_push (phi
);
689 gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo
) != NULL_TREE
);
691 if (dump_enabled_p ())
692 dump_printf_loc (MSG_NOTE
, vect_location
, "Detected induction.\n");
693 STMT_VINFO_DEF_TYPE (stmt_vinfo
) = vect_induction_def
;
697 /* Second - identify all reductions and nested cycles. */
698 while (worklist
.length () > 0)
700 gimple phi
= worklist
.pop ();
701 tree def
= PHI_RESULT (phi
);
702 stmt_vec_info stmt_vinfo
= vinfo_for_stmt (phi
);
706 if (dump_enabled_p ())
708 dump_printf_loc (MSG_NOTE
, vect_location
, "Analyze phi: ");
709 dump_gimple_stmt (MSG_NOTE
, TDF_SLIM
, phi
, 0);
710 dump_printf (MSG_NOTE
, "\n");
713 gcc_assert (!virtual_operand_p (def
)
714 && STMT_VINFO_DEF_TYPE (stmt_vinfo
) == vect_unknown_def_type
);
716 nested_cycle
= (loop
!= LOOP_VINFO_LOOP (loop_vinfo
));
717 reduc_stmt
= vect_force_simple_reduction (loop_vinfo
, phi
, !nested_cycle
,
718 &double_reduc
, false);
723 if (dump_enabled_p ())
724 dump_printf_loc (MSG_NOTE
, vect_location
,
725 "Detected double reduction.\n");
727 STMT_VINFO_DEF_TYPE (stmt_vinfo
) = vect_double_reduction_def
;
728 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt
)) =
729 vect_double_reduction_def
;
735 if (dump_enabled_p ())
736 dump_printf_loc (MSG_NOTE
, vect_location
,
737 "Detected vectorizable nested cycle.\n");
739 STMT_VINFO_DEF_TYPE (stmt_vinfo
) = vect_nested_cycle
;
740 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt
)) =
745 if (dump_enabled_p ())
746 dump_printf_loc (MSG_NOTE
, vect_location
,
747 "Detected reduction.\n");
749 STMT_VINFO_DEF_TYPE (stmt_vinfo
) = vect_reduction_def
;
750 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt
)) =
752 /* Store the reduction cycles for possible vectorization in
754 LOOP_VINFO_REDUCTIONS (loop_vinfo
).safe_push (reduc_stmt
);
759 if (dump_enabled_p ())
760 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
761 "Unknown def-use cycle pattern.\n");
766 /* Function vect_analyze_scalar_cycles.
768 Examine the cross iteration def-use cycles of scalar variables, by
769 analyzing the loop-header PHIs of scalar variables. Classify each
770 cycle as one of the following: invariant, induction, reduction, unknown.
771 We do that for the loop represented by LOOP_VINFO, and also to its
772 inner-loop, if exists.
773 Examples for scalar cycles:
788 vect_analyze_scalar_cycles (loop_vec_info loop_vinfo
)
790 struct loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
792 vect_analyze_scalar_cycles_1 (loop_vinfo
, loop
);
794 /* When vectorizing an outer-loop, the inner-loop is executed sequentially.
795 Reductions in such inner-loop therefore have different properties than
796 the reductions in the nest that gets vectorized:
797 1. When vectorized, they are executed in the same order as in the original
798 scalar loop, so we can't change the order of computation when
800 2. FIXME: Inner-loop reductions can be used in the inner-loop, so the
801 current checks are too strict. */
804 vect_analyze_scalar_cycles_1 (loop_vinfo
, loop
->inner
);
807 /* Transfer group and reduction information from STMT to its pattern stmt. */
810 vect_fixup_reduc_chain (gimple stmt
)
812 gimple firstp
= STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt
));
814 gcc_assert (!GROUP_FIRST_ELEMENT (vinfo_for_stmt (firstp
))
815 && GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt
)));
816 GROUP_SIZE (vinfo_for_stmt (firstp
)) = GROUP_SIZE (vinfo_for_stmt (stmt
));
819 stmtp
= STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt
));
820 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmtp
)) = firstp
;
821 stmt
= GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmt
));
823 GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmtp
))
824 = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt
));
827 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmtp
)) = vect_reduction_def
;
830 /* Fixup scalar cycles that now have their stmts detected as patterns. */
833 vect_fixup_scalar_cycles_with_patterns (loop_vec_info loop_vinfo
)
838 FOR_EACH_VEC_ELT (LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo
), i
, first
)
839 if (STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (first
)))
841 vect_fixup_reduc_chain (first
);
842 LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo
)[i
]
843 = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (first
));
847 /* Function vect_get_loop_niters.
849 Determine how many iterations the loop is executed and place it
850 in NUMBER_OF_ITERATIONS. Place the number of latch iterations
851 in NUMBER_OF_ITERATIONSM1.
853 Return the loop exit condition. */
857 vect_get_loop_niters (struct loop
*loop
, tree
*number_of_iterations
,
858 tree
*number_of_iterationsm1
)
862 if (dump_enabled_p ())
863 dump_printf_loc (MSG_NOTE
, vect_location
,
864 "=== get_loop_niters ===\n");
866 niters
= number_of_latch_executions (loop
);
867 *number_of_iterationsm1
= niters
;
869 /* We want the number of loop header executions which is the number
870 of latch executions plus one.
871 ??? For UINT_MAX latch executions this number overflows to zero
872 for loops like do { n++; } while (n != 0); */
873 if (niters
&& !chrec_contains_undetermined (niters
))
874 niters
= fold_build2 (PLUS_EXPR
, TREE_TYPE (niters
), unshare_expr (niters
),
875 build_int_cst (TREE_TYPE (niters
), 1));
876 *number_of_iterations
= niters
;
878 return get_loop_exit_condition (loop
);
882 /* Function bb_in_loop_p
884 Used as predicate for dfs order traversal of the loop bbs. */
887 bb_in_loop_p (const_basic_block bb
, const void *data
)
889 const struct loop
*const loop
= (const struct loop
*)data
;
890 if (flow_bb_inside_loop_p (loop
, bb
))
896 /* Function new_loop_vec_info.
898 Create and initialize a new loop_vec_info struct for LOOP, as well as
899 stmt_vec_info structs for all the stmts in LOOP. */
902 new_loop_vec_info (struct loop
*loop
)
906 gimple_stmt_iterator si
;
907 unsigned int i
, nbbs
;
909 res
= (loop_vec_info
) xcalloc (1, sizeof (struct _loop_vec_info
));
910 LOOP_VINFO_LOOP (res
) = loop
;
912 bbs
= get_loop_body (loop
);
914 /* Create/Update stmt_info for all stmts in the loop. */
915 for (i
= 0; i
< loop
->num_nodes
; i
++)
917 basic_block bb
= bbs
[i
];
919 /* BBs in a nested inner-loop will have been already processed (because
920 we will have called vect_analyze_loop_form for any nested inner-loop).
921 Therefore, for stmts in an inner-loop we just want to update the
922 STMT_VINFO_LOOP_VINFO field of their stmt_info to point to the new
923 loop_info of the outer-loop we are currently considering to vectorize
924 (instead of the loop_info of the inner-loop).
925 For stmts in other BBs we need to create a stmt_info from scratch. */
926 if (bb
->loop_father
!= loop
)
929 gcc_assert (loop
->inner
&& bb
->loop_father
== loop
->inner
);
930 for (si
= gsi_start_phis (bb
); !gsi_end_p (si
); gsi_next (&si
))
932 gimple phi
= gsi_stmt (si
);
933 stmt_vec_info stmt_info
= vinfo_for_stmt (phi
);
934 loop_vec_info inner_loop_vinfo
=
935 STMT_VINFO_LOOP_VINFO (stmt_info
);
936 gcc_assert (loop
->inner
== LOOP_VINFO_LOOP (inner_loop_vinfo
));
937 STMT_VINFO_LOOP_VINFO (stmt_info
) = res
;
939 for (si
= gsi_start_bb (bb
); !gsi_end_p (si
); gsi_next (&si
))
941 gimple stmt
= gsi_stmt (si
);
942 stmt_vec_info stmt_info
= vinfo_for_stmt (stmt
);
943 loop_vec_info inner_loop_vinfo
=
944 STMT_VINFO_LOOP_VINFO (stmt_info
);
945 gcc_assert (loop
->inner
== LOOP_VINFO_LOOP (inner_loop_vinfo
));
946 STMT_VINFO_LOOP_VINFO (stmt_info
) = res
;
951 /* bb in current nest. */
952 for (si
= gsi_start_phis (bb
); !gsi_end_p (si
); gsi_next (&si
))
954 gimple phi
= gsi_stmt (si
);
955 gimple_set_uid (phi
, 0);
956 set_vinfo_for_stmt (phi
, new_stmt_vec_info (phi
, res
, NULL
));
959 for (si
= gsi_start_bb (bb
); !gsi_end_p (si
); gsi_next (&si
))
961 gimple stmt
= gsi_stmt (si
);
962 gimple_set_uid (stmt
, 0);
963 set_vinfo_for_stmt (stmt
, new_stmt_vec_info (stmt
, res
, NULL
));
968 /* CHECKME: We want to visit all BBs before their successors (except for
969 latch blocks, for which this assertion wouldn't hold). In the simple
970 case of the loop forms we allow, a dfs order of the BBs would the same
971 as reversed postorder traversal, so we are safe. */
974 bbs
= XCNEWVEC (basic_block
, loop
->num_nodes
);
975 nbbs
= dfs_enumerate_from (loop
->header
, 0, bb_in_loop_p
,
976 bbs
, loop
->num_nodes
, loop
);
977 gcc_assert (nbbs
== loop
->num_nodes
);
979 LOOP_VINFO_BBS (res
) = bbs
;
980 LOOP_VINFO_NITERSM1 (res
) = NULL
;
981 LOOP_VINFO_NITERS (res
) = NULL
;
982 LOOP_VINFO_NITERS_UNCHANGED (res
) = NULL
;
983 LOOP_VINFO_COST_MODEL_MIN_ITERS (res
) = 0;
984 LOOP_VINFO_COST_MODEL_THRESHOLD (res
) = 0;
985 LOOP_VINFO_VECTORIZABLE_P (res
) = 0;
986 LOOP_VINFO_PEELING_FOR_ALIGNMENT (res
) = 0;
987 LOOP_VINFO_VECT_FACTOR (res
) = 0;
988 LOOP_VINFO_LOOP_NEST (res
).create (3);
989 LOOP_VINFO_DATAREFS (res
).create (10);
990 LOOP_VINFO_DDRS (res
).create (10 * 10);
991 LOOP_VINFO_UNALIGNED_DR (res
) = NULL
;
992 LOOP_VINFO_MAY_MISALIGN_STMTS (res
).create (
993 PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIGNMENT_CHECKS
));
994 LOOP_VINFO_MAY_ALIAS_DDRS (res
).create (
995 PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS
));
996 LOOP_VINFO_GROUPED_STORES (res
).create (10);
997 LOOP_VINFO_REDUCTIONS (res
).create (10);
998 LOOP_VINFO_REDUCTION_CHAINS (res
).create (10);
999 LOOP_VINFO_SLP_INSTANCES (res
).create (10);
1000 LOOP_VINFO_SLP_UNROLLING_FACTOR (res
) = 1;
1001 LOOP_VINFO_TARGET_COST_DATA (res
) = init_cost (loop
);
1002 LOOP_VINFO_PEELING_FOR_GAPS (res
) = false;
1003 LOOP_VINFO_PEELING_FOR_NITER (res
) = false;
1004 LOOP_VINFO_OPERANDS_SWAPPED (res
) = false;
1010 /* Function destroy_loop_vec_info.
1012 Free LOOP_VINFO struct, as well as all the stmt_vec_info structs of all the
1013 stmts in the loop. */
1016 destroy_loop_vec_info (loop_vec_info loop_vinfo
, bool clean_stmts
)
1021 gimple_stmt_iterator si
;
1023 vec
<slp_instance
> slp_instances
;
1024 slp_instance instance
;
1030 loop
= LOOP_VINFO_LOOP (loop_vinfo
);
1032 bbs
= LOOP_VINFO_BBS (loop_vinfo
);
1033 nbbs
= clean_stmts
? loop
->num_nodes
: 0;
1034 swapped
= LOOP_VINFO_OPERANDS_SWAPPED (loop_vinfo
);
1036 for (j
= 0; j
< nbbs
; j
++)
1038 basic_block bb
= bbs
[j
];
1039 for (si
= gsi_start_phis (bb
); !gsi_end_p (si
); gsi_next (&si
))
1040 free_stmt_vec_info (gsi_stmt (si
));
1042 for (si
= gsi_start_bb (bb
); !gsi_end_p (si
); )
1044 gimple stmt
= gsi_stmt (si
);
1046 /* We may have broken canonical form by moving a constant
1047 into RHS1 of a commutative op. Fix such occurrences. */
1048 if (swapped
&& is_gimple_assign (stmt
))
1050 enum tree_code code
= gimple_assign_rhs_code (stmt
);
1052 if ((code
== PLUS_EXPR
1053 || code
== POINTER_PLUS_EXPR
1054 || code
== MULT_EXPR
)
1055 && CONSTANT_CLASS_P (gimple_assign_rhs1 (stmt
)))
1056 swap_ssa_operands (stmt
,
1057 gimple_assign_rhs1_ptr (stmt
),
1058 gimple_assign_rhs2_ptr (stmt
));
1061 /* Free stmt_vec_info. */
1062 free_stmt_vec_info (stmt
);
1067 free (LOOP_VINFO_BBS (loop_vinfo
));
1068 vect_destroy_datarefs (loop_vinfo
, NULL
);
1069 free_dependence_relations (LOOP_VINFO_DDRS (loop_vinfo
));
1070 LOOP_VINFO_LOOP_NEST (loop_vinfo
).release ();
1071 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo
).release ();
1072 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo
).release ();
1073 slp_instances
= LOOP_VINFO_SLP_INSTANCES (loop_vinfo
);
1074 FOR_EACH_VEC_ELT (slp_instances
, j
, instance
)
1075 vect_free_slp_instance (instance
);
1077 LOOP_VINFO_SLP_INSTANCES (loop_vinfo
).release ();
1078 LOOP_VINFO_GROUPED_STORES (loop_vinfo
).release ();
1079 LOOP_VINFO_REDUCTIONS (loop_vinfo
).release ();
1080 LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo
).release ();
1082 delete LOOP_VINFO_PEELING_HTAB (loop_vinfo
);
1083 LOOP_VINFO_PEELING_HTAB (loop_vinfo
) = NULL
;
1085 destroy_cost_data (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo
));
1086 loop_vinfo
->scalar_cost_vec
.release ();
/* Compute the cost of one scalar iteration of the loop described by
   LOOP_VINFO: walk every stmt of every loop basic block, skip stmts
   that will not be vectorized, and accumulate record_stmt_cost
   contributions into LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST.  Stmts
   inside the inner loop are weighted by INNERLOOP_ITERS (hard-wired
   to 50 -- see the FIXME below).
   NOTE(review): this chunk looks like a lossy extraction -- statements
   are split across lines and several original lines (braces, returns,
   guard conditions) are missing; only comments are added here.  Restore
   the full text from the upstream file before compiling.  */
1093 /* Calculate the cost of one scalar iteration of the loop. */
1095 vect_get_single_scalar_iteration_cost (loop_vec_info loop_vinfo
)
1097 struct loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
1098 basic_block
*bbs
= LOOP_VINFO_BBS (loop_vinfo
);
1099 int nbbs
= loop
->num_nodes
, factor
, scalar_single_iter_cost
= 0;
1100 int innerloop_iters
, i
;
1102 /* Count statements in scalar loop. Using this as scalar cost for a single
1105 TODO: Add outer loop support.
1107 TODO: Consider assigning different costs to different scalar
1111 innerloop_iters
= 1;
1113 innerloop_iters
= 50; /* FIXME */
1115 for (i
= 0; i
< nbbs
; i
++)
1117 gimple_stmt_iterator si
;
1118 basic_block bb
= bbs
[i
];
/* Inner-loop blocks get the heavier INNERLOOP_ITERS weighting.  */
1120 if (bb
->loop_father
== loop
->inner
)
1121 factor
= innerloop_iters
;
1125 for (si
= gsi_start_bb (bb
); !gsi_end_p (si
); gsi_next (&si
))
1127 gimple stmt
= gsi_stmt (si
);
1128 stmt_vec_info stmt_info
= vinfo_for_stmt (stmt
);
1130 if (!is_gimple_assign (stmt
) && !is_gimple_call (stmt
))
1133 /* Skip stmts that are not vectorized inside the loop. */
1135 && !STMT_VINFO_RELEVANT_P (stmt_info
)
1136 && (!STMT_VINFO_LIVE_P (stmt_info
)
1137 || !VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info
)))
1138 && !STMT_VINFO_IN_PATTERN_P (stmt_info
))
/* Classify the stmt kind for costing; data-ref stmts are loads or
   stores, distinguished by DR_IS_READ.  NOTE(review): the branch
   assigning the load kind appears to be among the dropped lines.  */
1141 vect_cost_for_stmt kind
;
1142 if (STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt
)))
1144 if (DR_IS_READ (STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt
))))
1147 kind
= scalar_store
;
1152 scalar_single_iter_cost
1153 += record_stmt_cost (&LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo
),
1154 factor
, kind
, NULL
, 0, vect_prologue
);
/* Publish the accumulated per-iteration scalar cost.  */
1157 LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo
)
1158 = scalar_single_iter_cost
;
/* Wrapper that runs the form analysis (vect_analyze_loop_form) on an
   inner LOOP nested inside a loop considered for outer-loop
   vectorization, returning its loop_vec_info or reporting failure.
   NOTE(review): lossy extraction -- the return statements and part of
   the failure path are missing from this chunk; comments only added.  */
1162 /* Function vect_analyze_loop_1.
1164 Apply a set of analyses on LOOP, and create a loop_vec_info struct
1165 for it. The different analyses will record information in the
1166 loop_vec_info struct. This is a subset of the analyses applied in
1167 vect_analyze_loop, to be applied on an inner-loop nested in the loop
1168 that is now considered for (outer-loop) vectorization. */
1170 static loop_vec_info
1171 vect_analyze_loop_1 (struct loop
*loop
)
1173 loop_vec_info loop_vinfo
;
1175 if (dump_enabled_p ())
1176 dump_printf_loc (MSG_NOTE
, vect_location
,
1177 "===== analyze_loop_nest_1 =====\n");
1179 /* Check the CFG characteristics of the loop (nesting, entry/exit, etc. */
1181 loop_vinfo
= vect_analyze_loop_form (loop
);
/* NOTE(review): the check of LOOP_VINFO for NULL presumably guards
   this dump -- the `if' line is among the dropped lines; confirm
   against the upstream file.  */
1184 if (dump_enabled_p ())
1185 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1186 "bad inner-loop form.\n");
/* Verify the CFG restrictions under which a loop is vectorizable
   (2-block inner-most loop or 5-block doubly-nested outer loop, single
   exit, empty latch, analyzable iteration count), create and return a
   new loop_vec_info recording the niter analysis, and mark the loop
   exit condition stmt as loop_exit_ctrl_vec_info_type.  On any failed
   check the inner-loop vinfo (if any) is destroyed.
   NOTE(review): lossy extraction -- braces, `return NULL' statements
   and a few guard conditions are missing throughout; comments only
   added here.  */
1194 /* Function vect_analyze_loop_form.
1196 Verify that certain CFG restrictions hold, including:
1197 - the loop has a pre-header
1198 - the loop has a single entry and exit
1199 - the loop exit condition is simple enough, and the number of iterations
1200 can be analyzed (a countable loop). */
1203 vect_analyze_loop_form (struct loop
*loop
)
1205 loop_vec_info loop_vinfo
;
1207 tree number_of_iterations
= NULL
, number_of_iterationsm1
= NULL
;
1208 loop_vec_info inner_loop_vinfo
= NULL
;
1210 if (dump_enabled_p ())
1211 dump_printf_loc (MSG_NOTE
, vect_location
,
1212 "=== vect_analyze_loop_form ===\n");
1214 /* Different restrictions apply when we are considering an inner-most loop,
1215 vs. an outer (nested) loop.
1216 (FORNOW. May want to relax some of these restrictions in the future). */
1220 /* Inner-most loop. We currently require that the number of BBs is
1221 exactly 2 (the header and latch). Vectorizable inner-most loops
1232 if (loop
->num_nodes
!= 2)
1234 if (dump_enabled_p ())
1235 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1236 "not vectorized: control flow in loop.\n");
1240 if (empty_block_p (loop
->header
))
1242 if (dump_enabled_p ())
1243 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1244 "not vectorized: empty loop.\n");
/* Outer (nested) loop case.  */
1250 struct loop
*innerloop
= loop
->inner
;
1253 /* Nested loop. We currently require that the loop is doubly-nested,
1254 contains a single inner loop, and the number of BBs is exactly 5.
1255 Vectorizable outer-loops look like this:
1267 The inner-loop has the properties expected of inner-most loops
1268 as described above. */
1270 if ((loop
->inner
)->inner
|| (loop
->inner
)->next
)
1272 if (dump_enabled_p ())
1273 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1274 "not vectorized: multiple nested loops.\n");
1278 /* Analyze the inner-loop. */
1279 inner_loop_vinfo
= vect_analyze_loop_1 (loop
->inner
);
1280 if (!inner_loop_vinfo
)
1282 if (dump_enabled_p ())
1283 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1284 "not vectorized: Bad inner loop.\n");
/* The inner loop's iteration count must be invariant in the outer
   loop for outer-loop vectorization.  */
1288 if (!expr_invariant_in_loop_p (loop
,
1289 LOOP_VINFO_NITERS (inner_loop_vinfo
)))
1291 if (dump_enabled_p ())
1292 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1293 "not vectorized: inner-loop count not"
1295 destroy_loop_vec_info (inner_loop_vinfo
, true);
1299 if (loop
->num_nodes
!= 5)
1301 if (dump_enabled_p ())
1302 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1303 "not vectorized: control flow in loop.\n");
1304 destroy_loop_vec_info (inner_loop_vinfo
, true);
/* Identify the inner loop's entry edge (the predecessor of its header
   that is not the latch) and verify the expected outer-loop shape.  */
1308 gcc_assert (EDGE_COUNT (innerloop
->header
->preds
) == 2);
1309 entryedge
= EDGE_PRED (innerloop
->header
, 0);
1310 if (EDGE_PRED (innerloop
->header
, 0)->src
== innerloop
->latch
)
1311 entryedge
= EDGE_PRED (innerloop
->header
, 1);
1313 if (entryedge
->src
!= loop
->header
1314 || !single_exit (innerloop
)
1315 || single_exit (innerloop
)->dest
!= EDGE_PRED (loop
->latch
, 0)->src
)
1317 if (dump_enabled_p ())
1318 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1319 "not vectorized: unsupported outerloop form.\n");
1320 destroy_loop_vec_info (inner_loop_vinfo
, true);
1324 if (dump_enabled_p ())
1325 dump_printf_loc (MSG_NOTE
, vect_location
,
1326 "Considering outer-loop vectorization.\n");
/* Common checks for both the inner-most and the outer-loop case.  */
1329 if (!single_exit (loop
)
1330 || EDGE_COUNT (loop
->header
->preds
) != 2)
1332 if (dump_enabled_p ())
1334 if (!single_exit (loop
))
1335 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1336 "not vectorized: multiple exits.\n");
1337 else if (EDGE_COUNT (loop
->header
->preds
) != 2)
1338 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1339 "not vectorized: too many incoming edges.\n");
1341 if (inner_loop_vinfo
)
1342 destroy_loop_vec_info (inner_loop_vinfo
, true);
1346 /* We assume that the loop exit condition is at the end of the loop. i.e,
1347 that the loop is represented as a do-while (with a proper if-guard
1348 before the loop if needed), where the loop header contains all the
1349 executable statements, and the latch is empty. */
1350 if (!empty_block_p (loop
->latch
)
1351 || !gimple_seq_empty_p (phi_nodes (loop
->latch
)))
1353 if (dump_enabled_p ())
1354 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1355 "not vectorized: latch block not empty.\n");
1356 if (inner_loop_vinfo
)
1357 destroy_loop_vec_info (inner_loop_vinfo
, true);
1361 /* Make sure there exists a single-predecessor exit bb: */
1362 if (!single_pred_p (single_exit (loop
)->dest
))
1364 edge e
= single_exit (loop
);
1365 if (!(e
->flags
& EDGE_ABNORMAL
))
1367 split_loop_exit_edge (e
);
1368 if (dump_enabled_p ())
1369 dump_printf (MSG_NOTE
, "split exit edge.\n");
/* An abnormal exit edge cannot be split -- give up.  */
1373 if (dump_enabled_p ())
1374 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1375 "not vectorized: abnormal loop exit edge.\n");
1376 if (inner_loop_vinfo
)
1377 destroy_loop_vec_info (inner_loop_vinfo
, true);
/* Analyze the exit condition and iteration count.  */
1382 loop_cond
= vect_get_loop_niters (loop
, &number_of_iterations
,
1383 &number_of_iterationsm1
);
1386 if (dump_enabled_p ())
1387 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1388 "not vectorized: complicated exit condition.\n");
1389 if (inner_loop_vinfo
)
1390 destroy_loop_vec_info (inner_loop_vinfo
, true);
1394 if (!number_of_iterations
1395 || chrec_contains_undetermined (number_of_iterations
))
1397 if (dump_enabled_p ())
1398 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1399 "not vectorized: number of iterations cannot be "
1401 if (inner_loop_vinfo
)
1402 destroy_loop_vec_info (inner_loop_vinfo
, true);
1406 if (integer_zerop (number_of_iterations
))
1408 if (dump_enabled_p ())
1409 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1410 "not vectorized: number of iterations = 0.\n");
1411 if (inner_loop_vinfo
)
1412 destroy_loop_vec_info (inner_loop_vinfo
, true);
/* All checks passed: create the loop_vec_info and record the niter
   analysis results.  */
1416 loop_vinfo
= new_loop_vec_info (loop
);
1417 LOOP_VINFO_NITERSM1 (loop_vinfo
) = number_of_iterationsm1
;
1418 LOOP_VINFO_NITERS (loop_vinfo
) = number_of_iterations
;
1419 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo
) = number_of_iterations
;
1421 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo
))
1423 if (dump_enabled_p ())
1425 dump_printf_loc (MSG_NOTE
, vect_location
,
1426 "Symbolic number of iterations is ");
1427 dump_generic_expr (MSG_NOTE
, TDF_DETAILS
, number_of_iterations
);
1428 dump_printf (MSG_NOTE
, "\n");
1432 STMT_VINFO_TYPE (vinfo_for_stmt (loop_cond
)) = loop_exit_ctrl_vec_info_type
;
1434 /* CHECKME: May want to keep it around it in the future. */
1435 if (inner_loop_vinfo
)
1436 destroy_loop_vec_info (inner_loop_vinfo
, false);
1438 gcc_assert (!loop
->aux
);
1439 loop
->aux
= loop_vinfo
;
/* After the SLP decision, recompute LOOP_VINFO_VECT_FACTOR: if every
   relevant stmt in the loop is pure-SLP, the factor becomes the SLP
   unrolling factor; otherwise it becomes the least common multiple of
   the current factor and the SLP unrolling factor (hybrid SLP).
   NOTE(review): lossy extraction -- braces and a few lines (including
   the gsi_next advance of the inner iterator) are missing; comments
   only added here.  */
1443 /* Scan the loop stmts and dependent on whether there are any (non-)SLP
1444 statements update the vectorization factor. */
1447 vect_update_vf_for_slp (loop_vec_info loop_vinfo
)
1449 struct loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
1450 basic_block
*bbs
= LOOP_VINFO_BBS (loop_vinfo
);
1451 int nbbs
= loop
->num_nodes
;
1452 unsigned int vectorization_factor
;
1455 if (dump_enabled_p ())
1456 dump_printf_loc (MSG_NOTE
, vect_location
,
1457 "=== vect_update_vf_for_slp ===\n");
1459 vectorization_factor
= LOOP_VINFO_VECT_FACTOR (loop_vinfo
);
1460 gcc_assert (vectorization_factor
!= 0);
1462 /* If all the stmts in the loop can be SLPed, we perform only SLP, and
1463 vectorization factor of the loop is the unrolling factor required by
1464 the SLP instances. If that unrolling factor is 1, we say, that we
1465 perform pure SLP on loop - cross iteration parallelism is not
1467 bool only_slp_in_loop
= true;
1468 for (i
= 0; i
< nbbs
; i
++)
1470 basic_block bb
= bbs
[i
];
1471 for (gimple_stmt_iterator si
= gsi_start_bb (bb
); !gsi_end_p (si
);
1474 gimple stmt
= gsi_stmt (si
);
1475 stmt_vec_info stmt_info
= vinfo_for_stmt (stmt
);
/* For pattern stmts, inspect the related (pattern) stmt instead.  */
1476 if (STMT_VINFO_IN_PATTERN_P (stmt_info
)
1477 && STMT_VINFO_RELATED_STMT (stmt_info
))
1479 stmt
= STMT_VINFO_RELATED_STMT (stmt_info
);
1480 stmt_info
= vinfo_for_stmt (stmt
);
1482 if ((STMT_VINFO_RELEVANT_P (stmt_info
)
1483 || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info
)))
1484 && !PURE_SLP_STMT (stmt_info
))
1485 /* STMT needs both SLP and loop-based vectorization. */
1486 only_slp_in_loop
= false;
1490 if (only_slp_in_loop
)
1491 vectorization_factor
= LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo
);
1493 vectorization_factor
1494 = least_common_multiple (vectorization_factor
,
1495 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo
));
1497 LOOP_VINFO_VECT_FACTOR (loop_vinfo
) = vectorization_factor
;
1498 if (dump_enabled_p ())
1499 dump_printf_loc (MSG_NOTE
, vect_location
,
1500 "Updating vectorization factor to %d\n",
1501 vectorization_factor
);
/* Verify that every relevant stmt and phi in the loop is vectorizable
   (via vectorizable_induction / vect_analyze_stmt), then run the cost
   model: compute the minimum profitable iteration counts, derive the
   profitability threshold TH (stored in
   LOOP_VINFO_COST_MODEL_THRESHOLD), and reject loops whose known or
   estimated iteration count is below the vectorization factor or the
   threshold.
   NOTE(review): lossy extraction -- braces, `return false/true'
   statements and several guard lines are missing throughout; comments
   only added here.  */
1504 /* Function vect_analyze_loop_operations.
1506 Scan the loop stmts and make sure they are all vectorizable. */
1509 vect_analyze_loop_operations (loop_vec_info loop_vinfo
)
1511 struct loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
1512 basic_block
*bbs
= LOOP_VINFO_BBS (loop_vinfo
);
1513 int nbbs
= loop
->num_nodes
;
1514 unsigned int vectorization_factor
;
1516 stmt_vec_info stmt_info
;
1517 bool need_to_vectorize
= false;
1518 int min_profitable_iters
;
1519 int min_scalar_loop_bound
;
1522 HOST_WIDE_INT max_niter
;
1523 HOST_WIDE_INT estimated_niter
;
1524 int min_profitable_estimate
;
1526 if (dump_enabled_p ())
1527 dump_printf_loc (MSG_NOTE
, vect_location
,
1528 "=== vect_analyze_loop_operations ===\n");
/* Pass 1: check every phi in every loop basic block.  */
1530 for (i
= 0; i
< nbbs
; i
++)
1532 basic_block bb
= bbs
[i
];
1534 for (gphi_iterator si
= gsi_start_phis (bb
); !gsi_end_p (si
);
1537 gphi
*phi
= si
.phi ();
1540 stmt_info
= vinfo_for_stmt (phi
);
1541 if (dump_enabled_p ())
1543 dump_printf_loc (MSG_NOTE
, vect_location
, "examining phi: ");
1544 dump_gimple_stmt (MSG_NOTE
, TDF_SLIM
, phi
, 0);
1545 dump_printf (MSG_NOTE
, "\n");
1548 /* Inner-loop loop-closed exit phi in outer-loop vectorization
1549 (i.e., a phi in the tail of the outer-loop). */
1550 if (! is_loop_header_bb_p (bb
))
1552 /* FORNOW: we currently don't support the case that these phis
1553 are not used in the outerloop (unless it is double reduction,
1554 i.e., this phi is vect_reduction_def), cause this case
1555 requires to actually do something here. */
1556 if ((!STMT_VINFO_RELEVANT_P (stmt_info
)
1557 || STMT_VINFO_LIVE_P (stmt_info
))
1558 && STMT_VINFO_DEF_TYPE (stmt_info
)
1559 != vect_double_reduction_def
)
1561 if (dump_enabled_p ())
1562 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1563 "Unsupported loop-closed phi in "
1568 /* If PHI is used in the outer loop, we check that its operand
1569 is defined in the inner loop. */
1570 if (STMT_VINFO_RELEVANT_P (stmt_info
))
1575 if (gimple_phi_num_args (phi
) != 1)
1578 phi_op
= PHI_ARG_DEF (phi
, 0);
1579 if (TREE_CODE (phi_op
) != SSA_NAME
)
1582 op_def_stmt
= SSA_NAME_DEF_STMT (phi_op
);
1583 if (gimple_nop_p (op_def_stmt
)
1584 || !flow_bb_inside_loop_p (loop
, gimple_bb (op_def_stmt
))
1585 || !vinfo_for_stmt (op_def_stmt
))
1588 if (STMT_VINFO_RELEVANT (vinfo_for_stmt (op_def_stmt
))
1589 != vect_used_in_outer
1590 && STMT_VINFO_RELEVANT (vinfo_for_stmt (op_def_stmt
))
1591 != vect_used_in_outer_by_reduction
)
1598 gcc_assert (stmt_info
);
1600 if (STMT_VINFO_LIVE_P (stmt_info
))
1602 /* FORNOW: not yet supported. */
1603 if (dump_enabled_p ())
1604 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1605 "not vectorized: value used after loop.\n");
1609 if (STMT_VINFO_RELEVANT (stmt_info
) == vect_used_in_scope
1610 && STMT_VINFO_DEF_TYPE (stmt_info
) != vect_induction_def
)
1612 /* A scalar-dependence cycle that we don't support. */
1613 if (dump_enabled_p ())
1614 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1615 "not vectorized: scalar dependence cycle.\n");
1619 if (STMT_VINFO_RELEVANT_P (stmt_info
))
1621 need_to_vectorize
= true;
1622 if (STMT_VINFO_DEF_TYPE (stmt_info
) == vect_induction_def
)
1623 ok
= vectorizable_induction (phi
, NULL
, NULL
);
1628 if (dump_enabled_p ())
1630 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1631 "not vectorized: relevant phi not "
1633 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, phi
, 0);
1634 dump_printf (MSG_MISSED_OPTIMIZATION
, "\n");
/* Pass 2: check every non-phi stmt via vect_analyze_stmt.  */
1640 for (gimple_stmt_iterator si
= gsi_start_bb (bb
); !gsi_end_p (si
);
1643 gimple stmt
= gsi_stmt (si
);
1644 if (!gimple_clobber_p (stmt
)
1645 && !vect_analyze_stmt (stmt
, &need_to_vectorize
, NULL
))
1650 /* All operations in the loop are either irrelevant (deal with loop
1651 control, or dead), or only used outside the loop and can be moved
1652 out of the loop (e.g. invariants, inductions). The loop can be
1653 optimized away by scalar optimizations. We're better off not
1654 touching this loop. */
1655 if (!need_to_vectorize
)
1657 if (dump_enabled_p ())
1658 dump_printf_loc (MSG_NOTE
, vect_location
,
1659 "All the computation can be taken out of the loop.\n");
1660 if (dump_enabled_p ())
1661 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1662 "not vectorized: redundant loop. no profit to "
1667 vectorization_factor
= LOOP_VINFO_VECT_FACTOR (loop_vinfo
);
1668 gcc_assert (vectorization_factor
!= 0);
1670 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo
) && dump_enabled_p ())
1671 dump_printf_loc (MSG_NOTE
, vect_location
,
1672 "vectorization_factor = %d, niters = "
1673 HOST_WIDE_INT_PRINT_DEC
"\n", vectorization_factor
,
1674 LOOP_VINFO_INT_NITERS (loop_vinfo
));
/* Reject loops that cannot execute even one vector iteration.  */
1676 if ((LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo
)
1677 && (LOOP_VINFO_INT_NITERS (loop_vinfo
) < vectorization_factor
))
1678 || ((max_niter
= max_stmt_executions_int (loop
)) != -1
1679 && (unsigned HOST_WIDE_INT
) max_niter
< vectorization_factor
))
1681 if (dump_enabled_p ())
1682 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1683 "not vectorized: iteration count too small.\n");
1684 if (dump_enabled_p ())
1685 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1686 "not vectorized: iteration count smaller than "
1687 "vectorization factor.\n");
1691 /* Analyze cost. Decide if worth while to vectorize. */
1693 vect_estimate_min_profitable_iters (loop_vinfo
, &min_profitable_iters
,
1694 &min_profitable_estimate
);
1695 LOOP_VINFO_COST_MODEL_MIN_ITERS (loop_vinfo
) = min_profitable_iters
;
1697 if (min_profitable_iters
< 0)
1699 if (dump_enabled_p ())
1700 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1701 "not vectorized: vectorization not profitable.\n");
1702 if (dump_enabled_p ())
1703 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1704 "not vectorized: vector version will never be "
/* The user-specified lower bound, scaled by the vectorization
   factor (see --param min-vect-loop-bound).  */
1709 min_scalar_loop_bound
= ((PARAM_VALUE (PARAM_MIN_VECT_LOOP_BOUND
)
1710 * vectorization_factor
) - 1);
1713 /* Use the cost model only if it is more conservative than user specified
1716 th
= (unsigned) min_scalar_loop_bound
;
1717 if (min_profitable_iters
1718 && (!min_scalar_loop_bound
1719 || min_profitable_iters
> min_scalar_loop_bound
))
1720 th
= (unsigned) min_profitable_iters
;
1722 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo
) = th
;
1724 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo
)
1725 && LOOP_VINFO_INT_NITERS (loop_vinfo
) <= th
)
1727 if (dump_enabled_p ())
1728 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1729 "not vectorized: vectorization not profitable.\n");
1730 if (dump_enabled_p ())
1731 dump_printf_loc (MSG_NOTE
, vect_location
,
1732 "not vectorized: iteration count smaller than user "
1733 "specified loop bound parameter or minimum profitable "
1734 "iterations (whichever is more conservative).\n");
1738 if ((estimated_niter
= estimated_stmt_executions_int (loop
)) != -1
1739 && ((unsigned HOST_WIDE_INT
) estimated_niter
1740 <= MAX (th
, (unsigned)min_profitable_estimate
)))
1742 if (dump_enabled_p ())
1743 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1744 "not vectorized: estimated iteration count too "
1746 if (dump_enabled_p ())
1747 dump_printf_loc (MSG_NOTE
, vect_location
,
1748 "not vectorized: estimated iteration count smaller "
1749 "than specified loop bound parameter or minimum "
1750 "profitable iterations (whichever is more "
1751 "conservative).\n");
/* Main driver of the per-loop analysis pipeline: data references ->
   scalar cycles -> pattern recognition -> access patterns -> stmt
   relevance -> dependences -> vectorization factor -> SLP -> alignment
   -> alias-versioning pruning -> scalar cost -> alignment enhancement
   -> SLP/loop operation analysis -> epilogue (peeling-for-niter)
   decision.  Each stage can fail and bail out with a dump message.
   NOTE(review): lossy extraction -- `if (!ok)' guards, braces and
   `return false/true' statements are missing between most stages;
   comments only added here.  */
1759 /* Function vect_analyze_loop_2.
1761 Apply a set of analyses on LOOP, and create a loop_vec_info struct
1762 for it. The different analyses will record information in the
1763 loop_vec_info struct. */
1765 vect_analyze_loop_2 (loop_vec_info loop_vinfo
)
1768 int max_vf
= MAX_VECTORIZATION_FACTOR
;
1771 unsigned int n_stmts
= 0;
1773 /* Find all data references in the loop (which correspond to vdefs/vuses)
1774 and analyze their evolution in the loop. Also adjust the minimal
1775 vectorization factor according to the loads and stores.
1777 FORNOW: Handle only simple, array references, which
1778 alignment can be forced, and aligned pointer-references. */
1780 ok
= vect_analyze_data_refs (loop_vinfo
, NULL
, &min_vf
, &n_stmts
);
1783 if (dump_enabled_p ())
1784 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1785 "bad data references.\n");
1789 /* Classify all cross-iteration scalar data-flow cycles.
1790 Cross-iteration cycles caused by virtual phis are analyzed separately. */
1792 vect_analyze_scalar_cycles (loop_vinfo
);
1794 vect_pattern_recog (loop_vinfo
, NULL
);
1796 vect_fixup_scalar_cycles_with_patterns (loop_vinfo
);
1798 /* Analyze the access patterns of the data-refs in the loop (consecutive,
1799 complex, etc.). FORNOW: Only handle consecutive access pattern. */
1801 ok
= vect_analyze_data_ref_accesses (loop_vinfo
, NULL
);
1804 if (dump_enabled_p ())
1805 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1806 "bad data access.\n");
1810 /* Data-flow analysis to detect stmts that do not need to be vectorized. */
1812 ok
= vect_mark_stmts_to_be_vectorized (loop_vinfo
);
1815 if (dump_enabled_p ())
1816 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1817 "unexpected pattern.\n");
1821 /* Analyze data dependences between the data-refs in the loop
1822 and adjust the maximum vectorization factor according to
1824 FORNOW: fail at the first data dependence that we encounter. */
1826 ok
= vect_analyze_data_ref_dependences (loop_vinfo
, &max_vf
);
1830 if (dump_enabled_p ())
1831 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1832 "bad data dependence.\n");
1836 ok
= vect_determine_vectorization_factor (loop_vinfo
);
1839 if (dump_enabled_p ())
1840 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1841 "can't determine vectorization factor.\n");
/* The dependence-derived max VF must admit the chosen factor.  */
1844 if (max_vf
< LOOP_VINFO_VECT_FACTOR (loop_vinfo
))
1846 if (dump_enabled_p ())
1847 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1848 "bad data dependence.\n");
1852 /* Check the SLP opportunities in the loop, analyze and build SLP trees. */
1853 ok
= vect_analyze_slp (loop_vinfo
, NULL
, n_stmts
);
1857 /* If there are any SLP instances mark them as pure_slp. */
1858 bool slp
= vect_make_slp_decision (loop_vinfo
);
1861 /* Find stmts that need to be both vectorized and SLPed. */
1862 vect_detect_hybrid_slp (loop_vinfo
);
1864 /* Update the vectorization factor based on the SLP decision. */
1865 vect_update_vf_for_slp (loop_vinfo
);
1868 /* Analyze the alignment of the data-refs in the loop.
1869 Fail if a data reference is found that cannot be vectorized. */
1871 ok
= vect_analyze_data_refs_alignment (loop_vinfo
, NULL
);
1874 if (dump_enabled_p ())
1875 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1876 "bad data alignment.\n");
1880 /* Prune the list of ddrs to be tested at run-time by versioning for alias.
1881 It is important to call pruning after vect_analyze_data_ref_accesses,
1882 since we use grouping information gathered by interleaving analysis. */
1883 ok
= vect_prune_runtime_alias_test_list (loop_vinfo
);
1886 if (dump_enabled_p ())
1887 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1888 "number of versioning for alias "
1889 "run-time tests exceeds %d "
1890 "(--param vect-max-version-for-alias-checks)\n",
1891 PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS
));
1895 /* Compute the scalar iteration cost. */
1896 vect_get_single_scalar_iteration_cost (loop_vinfo
);
1898 /* This pass will decide on using loop versioning and/or loop peeling in
1899 order to enhance the alignment of data references in the loop. */
1901 ok
= vect_enhance_data_refs_alignment (loop_vinfo
);
1904 if (dump_enabled_p ())
1905 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1906 "bad data alignment.\n");
1912 /* Analyze operations in the SLP instances. Note this may
1913 remove unsupported SLP instances which makes the above
1914 SLP kind detection invalid. */
1915 unsigned old_size
= LOOP_VINFO_SLP_INSTANCES (loop_vinfo
).length ();
1916 vect_slp_analyze_operations (LOOP_VINFO_SLP_INSTANCES (loop_vinfo
),
1917 LOOP_VINFO_TARGET_COST_DATA (loop_vinfo
));
1918 if (LOOP_VINFO_SLP_INSTANCES (loop_vinfo
).length () != old_size
)
1922 /* Scan all the remaining operations in the loop that are not subject
1923 to SLP and make sure they are vectorizable. */
1924 ok
= vect_analyze_loop_operations (loop_vinfo
);
1927 if (dump_enabled_p ())
1928 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1929 "bad operation or unsupported loop bound.\n");
1933 /* Decide whether we need to create an epilogue loop to handle
1934 remaining scalar iterations. */
1935 th
= ((LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo
) + 1)
1936 / LOOP_VINFO_VECT_FACTOR (loop_vinfo
))
1937 * LOOP_VINFO_VECT_FACTOR (loop_vinfo
);
1939 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo
)
1940 && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo
) > 0)
1942 if (ctz_hwi (LOOP_VINFO_INT_NITERS (loop_vinfo
)
1943 - LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo
))
1944 < exact_log2 (LOOP_VINFO_VECT_FACTOR (loop_vinfo
)))
1945 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo
) = true;
1947 else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo
)
1948 || (tree_ctz (LOOP_VINFO_NITERS (loop_vinfo
))
1949 < (unsigned)exact_log2 (LOOP_VINFO_VECT_FACTOR (loop_vinfo
))
1950 /* In case of versioning, check if the maximum number of
1951 iterations is greater than th. If they are identical,
1952 the epilogue is unnecessary. */
1953 && ((!LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo
)
1954 && !LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo
))
1955 || (unsigned HOST_WIDE_INT
)max_stmt_executions_int
1956 (LOOP_VINFO_LOOP (loop_vinfo
)) > th
)))
1957 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo
) = true;
1959 /* If an epilogue loop is required make sure we can create one. */
1960 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo
)
1961 || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo
))
1963 if (dump_enabled_p ())
1964 dump_printf_loc (MSG_NOTE
, vect_location
, "epilog loop required\n");
1965 if (!vect_can_advance_ivs_p (loop_vinfo
)
1966 || !slpeel_can_duplicate_loop_p (LOOP_VINFO_LOOP (loop_vinfo
),
1967 single_exit (LOOP_VINFO_LOOP
1970 if (dump_enabled_p ())
1971 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1972 "not vectorized: can't create required "
/* Top-level analysis entry point for one loop: skip loops whose outer
   loop was already vectorized, then repeatedly try
   vect_analyze_loop_form + vect_analyze_loop_2, retrying with the next
   smaller target-supported vector size (current_vector_size) after
   each failure until the sizes are exhausted.  On success marks the
   loop_vec_info vectorizable and returns it.
   NOTE(review): lossy extraction -- the surrounding `while (1)' retry
   loop, braces and return statements are missing; comments only added
   here.  */
1981 /* Function vect_analyze_loop.
1983 Apply a set of analyses on LOOP, and create a loop_vec_info struct
1984 for it. The different analyses will record information in the
1985 loop_vec_info struct. */
1987 vect_analyze_loop (struct loop
*loop
)
1989 loop_vec_info loop_vinfo
;
1990 unsigned int vector_sizes
;
1992 /* Autodetect first vector size we try. */
1993 current_vector_size
= 0;
1994 vector_sizes
= targetm
.vectorize
.autovectorize_vector_sizes ();
1996 if (dump_enabled_p ())
1997 dump_printf_loc (MSG_NOTE
, vect_location
,
1998 "===== analyze_loop_nest =====\n");
2000 if (loop_outer (loop
)
2001 && loop_vec_info_for_loop (loop_outer (loop
))
2002 && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop
))))
2004 if (dump_enabled_p ())
2005 dump_printf_loc (MSG_NOTE
, vect_location
,
2006 "outer-loop already vectorized.\n");
2012 /* Check the CFG characteristics of the loop (nesting, entry/exit). */
2013 loop_vinfo
= vect_analyze_loop_form (loop
);
2016 if (dump_enabled_p ())
2017 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2018 "bad loop form.\n");
2022 if (vect_analyze_loop_2 (loop_vinfo
))
2024 LOOP_VINFO_VECTORIZABLE_P (loop_vinfo
) = 1;
/* Analysis failed for this vector size: drop the vinfo and try the
   next smaller size, if any remain.  */
2029 destroy_loop_vec_info (loop_vinfo
, true);
2031 vector_sizes
&= ~current_vector_size
;
2032 if (vector_sizes
== 0
2033 || current_vector_size
== 0)
2036 /* Try the next biggest vector size. */
2037 current_vector_size
= 1 << floor_log2 (vector_sizes
);
2038 if (dump_enabled_p ())
2039 dump_printf_loc (MSG_NOTE
, vect_location
,
2040 "***** Re-trying analysis with "
2041 "vector size %d\n", current_vector_size
);
/* Map a scalar reduction tree code to the vector reduction code used
   to fold a vector of partial results into one scalar (REDUC_MAX_EXPR,
   REDUC_MIN_EXPR, REDUC_PLUS_EXPR, or ERROR_MARK when the operation is
   supported but has no such folding code).
   NOTE(review): lossy extraction -- the `switch (code)' skeleton, its
   `case' labels and the `return' statements are missing; comments only
   added here.  */
2046 /* Function reduction_code_for_scalar_code
2049 CODE - tree_code of a reduction operations.
2052 REDUC_CODE - the corresponding tree-code to be used to reduce the
2053 vector of partial results into a single scalar result, or ERROR_MARK
2054 if the operation is a supported reduction operation, but does not have
2057 Return FALSE if CODE currently cannot be vectorized as reduction. */
2060 reduction_code_for_scalar_code (enum tree_code code
,
2061 enum tree_code
*reduc_code
)
2066 *reduc_code
= REDUC_MAX_EXPR
;
2070 *reduc_code
= REDUC_MIN_EXPR
;
2074 *reduc_code
= REDUC_PLUS_EXPR
;
2082 *reduc_code
= ERROR_MARK
;
/* Dump helper: print MSG at MSG_TYPE severity followed by the GIMPLE
   statement STMT (slim form) and a newline.  */
2091 /* Error reporting helper for vect_is_simple_reduction below. GIMPLE statement
2092 STMT is printed with a message MSG. */
2095 report_vect_op (int msg_type
, gimple stmt
, const char *msg
)
2097 dump_printf_loc (msg_type
, vect_location
, "%s", msg
);
2098 dump_gimple_stmt (msg_type
, TDF_SLIM
, stmt
, 0);
2099 dump_printf (msg_type
, "\n");
/* Detect a reduction chain usable for SLP: starting from the reduction
   PHI and FIRST_STMT, follow the single in-loop use of each lhs,
   requiring every link to be an assignment with the same rhs code,
   until the walk returns to PHI.  Links are recorded through the
   GROUP_FIRST_ELEMENT / GROUP_NEXT_ELEMENT fields; along the way
   operands are swapped (swap_ssa_operands) where needed so the
   reduction operand is the second rhs operand, setting
   LOOP_VINFO_OPERANDS_SWAPPED when a constant ends up first.  The
   chain is finally pushed onto LOOP_VINFO_REDUCTION_CHAINS with its
   GROUP_SIZE.
   NOTE(review): lossy extraction -- the enclosing `while' loops,
   braces, `return false/true' statements and a few guard lines are
   missing; comments only added here.  */
2103 /* Detect SLP reduction of the form:
2113 PHI is the reduction phi node (#a1 = phi <a5, a0> above)
2114 FIRST_STMT is the first reduction stmt in the chain
2115 (a2 = operation (a1)).
2117 Return TRUE if a reduction chain was detected. */
2120 vect_is_slp_reduction (loop_vec_info loop_info
, gimple phi
, gimple first_stmt
)
2122 struct loop
*loop
= (gimple_bb (phi
))->loop_father
;
2123 struct loop
*vect_loop
= LOOP_VINFO_LOOP (loop_info
);
2124 enum tree_code code
;
2125 gimple current_stmt
= NULL
, loop_use_stmt
= NULL
, first
, next_stmt
;
2126 stmt_vec_info use_stmt_info
, current_stmt_info
;
2128 imm_use_iterator imm_iter
;
2129 use_operand_p use_p
;
2130 int nloop_uses
, size
= 0, n_out_of_loop_uses
;
/* Reduction chains are only handled in the loop being vectorized.  */
2133 if (loop
!= vect_loop
)
2136 lhs
= PHI_RESULT (phi
);
2137 code
= gimple_assign_rhs_code (first_stmt
);
/* Walk the uses of LHS, classifying them as in-loop or out-of-loop.  */
2141 n_out_of_loop_uses
= 0;
2142 FOR_EACH_IMM_USE_FAST (use_p
, imm_iter
, lhs
)
2144 gimple use_stmt
= USE_STMT (use_p
);
2145 if (is_gimple_debug (use_stmt
))
2148 /* Check if we got back to the reduction phi. */
2149 if (use_stmt
== phi
)
2151 loop_use_stmt
= use_stmt
;
2156 if (flow_bb_inside_loop_p (loop
, gimple_bb (use_stmt
)))
2158 loop_use_stmt
= use_stmt
;
2162 n_out_of_loop_uses
++;
2164 /* There are can be either a single use in the loop or two uses in
2166 if (nloop_uses
> 1 || (n_out_of_loop_uses
&& nloop_uses
))
2173 /* We reached a statement with no loop uses. */
2174 if (nloop_uses
== 0)
2177 /* This is a loop exit phi, and we haven't reached the reduction phi. */
2178 if (gimple_code (loop_use_stmt
) == GIMPLE_PHI
)
2181 if (!is_gimple_assign (loop_use_stmt
)
2182 || code
!= gimple_assign_rhs_code (loop_use_stmt
)
2183 || !flow_bb_inside_loop_p (loop
, gimple_bb (loop_use_stmt
)))
2186 /* Insert USE_STMT into reduction chain. */
2187 use_stmt_info
= vinfo_for_stmt (loop_use_stmt
);
2190 current_stmt_info
= vinfo_for_stmt (current_stmt
);
2191 GROUP_NEXT_ELEMENT (current_stmt_info
) = loop_use_stmt
;
2192 GROUP_FIRST_ELEMENT (use_stmt_info
)
2193 = GROUP_FIRST_ELEMENT (current_stmt_info
);
2196 GROUP_FIRST_ELEMENT (use_stmt_info
) = loop_use_stmt
;
2198 lhs
= gimple_assign_lhs (loop_use_stmt
);
2199 current_stmt
= loop_use_stmt
;
/* A valid chain must close back at PHI and have at least 2 links.  */
2203 if (!found
|| loop_use_stmt
!= phi
|| size
< 2)
2206 /* Swap the operands, if needed, to make the reduction operand be the second
2208 lhs
= PHI_RESULT (phi
);
2209 next_stmt
= GROUP_FIRST_ELEMENT (vinfo_for_stmt (current_stmt
));
2212 if (gimple_assign_rhs2 (next_stmt
) == lhs
)
2214 tree op
= gimple_assign_rhs1 (next_stmt
);
2215 gimple def_stmt
= NULL
;
2217 if (TREE_CODE (op
) == SSA_NAME
)
2218 def_stmt
= SSA_NAME_DEF_STMT (op
);
2220 /* Check that the other def is either defined in the loop
2221 ("vect_internal_def"), or it's an induction (defined by a
2222 loop-header phi-node). */
2224 && gimple_bb (def_stmt
)
2225 && flow_bb_inside_loop_p (loop
, gimple_bb (def_stmt
))
2226 && (is_gimple_assign (def_stmt
)
2227 || is_gimple_call (def_stmt
)
2228 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt
))
2229 == vect_induction_def
2230 || (gimple_code (def_stmt
) == GIMPLE_PHI
2231 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt
))
2232 == vect_internal_def
2233 && !is_loop_header_bb_p (gimple_bb (def_stmt
)))))
2235 lhs
= gimple_assign_lhs (next_stmt
);
2236 next_stmt
= GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt
));
/* Mirror case: the reduction operand is the first rhs operand, so
   the operands need swapping.  */
2244 tree op
= gimple_assign_rhs2 (next_stmt
);
2245 gimple def_stmt
= NULL
;
2247 if (TREE_CODE (op
) == SSA_NAME
)
2248 def_stmt
= SSA_NAME_DEF_STMT (op
);
2250 /* Check that the other def is either defined in the loop
2251 ("vect_internal_def"), or it's an induction (defined by a
2252 loop-header phi-node). */
2254 && gimple_bb (def_stmt
)
2255 && flow_bb_inside_loop_p (loop
, gimple_bb (def_stmt
))
2256 && (is_gimple_assign (def_stmt
)
2257 || is_gimple_call (def_stmt
)
2258 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt
))
2259 == vect_induction_def
2260 || (gimple_code (def_stmt
) == GIMPLE_PHI
2261 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt
))
2262 == vect_internal_def
2263 && !is_loop_header_bb_p (gimple_bb (def_stmt
)))))
2265 if (dump_enabled_p ())
2267 dump_printf_loc (MSG_NOTE
, vect_location
, "swapping oprnds: ");
2268 dump_gimple_stmt (MSG_NOTE
, TDF_SLIM
, next_stmt
, 0);
2269 dump_printf (MSG_NOTE
, "\n");
2272 swap_ssa_operands (next_stmt
,
2273 gimple_assign_rhs1_ptr (next_stmt
),
2274 gimple_assign_rhs2_ptr (next_stmt
));
2275 update_stmt (next_stmt
);
2277 if (CONSTANT_CLASS_P (gimple_assign_rhs1 (next_stmt
)))
2278 LOOP_VINFO_OPERANDS_SWAPPED (loop_info
) = true;
2284 lhs
= gimple_assign_lhs (next_stmt
);
2285 next_stmt
= GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt
));
2288 /* Save the chain for further analysis in SLP detection. */
2289 first
= GROUP_FIRST_ELEMENT (vinfo_for_stmt (current_stmt
));
2290 LOOP_VINFO_REDUCTION_CHAINS (loop_info
).safe_push (first
);
2291 GROUP_SIZE (vinfo_for_stmt (first
)) = size
;
/* NOTE(review): this chunk is a line-shredded extraction of GCC's
   tree-vect-loop.c.  The leading numbers (e.g. "2297") are the original
   file's line numbers; gaps in that numbering mean original lines
   (braces, returns, condition heads) were dropped by the extraction.
   Code below is kept byte-identical; only review comments were added.
   Restore from the upstream file before attempting to compile.  */
2297 /* Function vect_is_simple_reduction_1
2299 (1) Detect a cross-iteration def-use cycle that represents a simple
2300 reduction computation. We look for the following pattern:
2305 a2 = operation (a3, a1)
2312 a2 = operation (a3, a1)
2315 1. operation is commutative and associative and it is safe to
2316 change the order of the computation (if CHECK_REDUCTION is true)
2317 2. no uses for a2 in the loop (a2 is used out of the loop)
2318 3. no uses of a1 in the loop besides the reduction operation
2319 4. no uses of a1 outside the loop.
2321 Conditions 1,4 are tested here.
2322 Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.
2324 (2) Detect a cross-iteration def-use cycle in nested loops, i.e.,
2325 nested cycles, if CHECK_REDUCTION is false.
2327 (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double
2331 inner loop (def of a3)
2334 If MODIFY is true it tries also to rework the code in-place to enable
2335 detection of more reduction patterns. For the time being we rewrite
2336 "res -= RHS" into "rhs += -RHS" when it seems worthwhile.
2340 vect_is_simple_reduction_1 (loop_vec_info loop_info
, gimple phi
,
2341 bool check_reduction
, bool *double_reduc
,
2342 bool modify
, bool need_wrapping_integral_overflow
)
/* Locals: LOOP is the loop containing PHI; VECT_LOOP is the loop being
   vectorized; LOOP_ARG is the PHI argument on the latch edge, i.e. the
   value that closes the candidate reduction cycle.  */
2344 struct loop
*loop
= (gimple_bb (phi
))->loop_father
;
2345 struct loop
*vect_loop
= LOOP_VINFO_LOOP (loop_info
);
2346 edge latch_e
= loop_latch_edge (loop
);
2347 tree loop_arg
= PHI_ARG_DEF_FROM_EDGE (phi
, latch_e
);
2348 gimple def_stmt
, def1
= NULL
, def2
= NULL
;
2349 enum tree_code orig_code
, code
;
2350 tree op1
, op2
, op3
= NULL_TREE
, op4
= NULL_TREE
;
2354 imm_use_iterator imm_iter
;
2355 use_operand_p use_p
;
2358 *double_reduc
= false;
2360 /* If CHECK_REDUCTION is true, we assume inner-most loop vectorization,
2361 otherwise, we assume outer loop vectorization. */
2362 gcc_assert ((check_reduction
&& loop
== vect_loop
)
2363 || (!check_reduction
&& flow_loop_nested_p (vect_loop
, loop
)));
2365 name
= PHI_RESULT (phi
);
2366 /* ??? If there are no uses of the PHI result the inner loop reduction
2367 won't be detected as possibly double-reduction by vectorizable_reduction
2368 because that tries to walk the PHI arg from the preheader edge which
2369 can be constant. See PR60382. */
2370 if (has_zero_uses (name
))
/* Walk all non-debug uses of the PHI result; uses outside LOOP or
   excess uses inside it disqualify the candidate (dumps below).  */
2373 FOR_EACH_IMM_USE_FAST (use_p
, imm_iter
, name
)
2375 gimple use_stmt
= USE_STMT (use_p
);
2376 if (is_gimple_debug (use_stmt
))
2379 if (!flow_bb_inside_loop_p (loop
, gimple_bb (use_stmt
)))
2381 if (dump_enabled_p ())
2382 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2383 "intermediate value used outside loop.\n");
2391 if (dump_enabled_p ())
2392 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2393 "reduction used in loop.\n");
/* The latch-edge value must be an SSA name defined by an assignment
   or a PHI inside the loop.  */
2398 if (TREE_CODE (loop_arg
) != SSA_NAME
)
2400 if (dump_enabled_p ())
2402 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2403 "reduction: not ssa_name: ");
2404 dump_generic_expr (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, loop_arg
);
2405 dump_printf (MSG_MISSED_OPTIMIZATION
, "\n");
2410 def_stmt
= SSA_NAME_DEF_STMT (loop_arg
);
2413 if (dump_enabled_p ())
2414 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2415 "reduction: no def_stmt.\n");
2419 if (!is_gimple_assign (def_stmt
) && gimple_code (def_stmt
) != GIMPLE_PHI
)
2421 if (dump_enabled_p ())
2423 dump_gimple_stmt (MSG_NOTE
, TDF_SLIM
, def_stmt
, 0);
2424 dump_printf (MSG_NOTE
, "\n");
2429 if (is_gimple_assign (def_stmt
))
2431 name
= gimple_assign_lhs (def_stmt
);
2436 name
= PHI_RESULT (def_stmt
);
/* Count in-loop uses of DEF_STMT's result; a simple reduction value
   must be used only by the cycle itself inside the loop.  */
2441 FOR_EACH_IMM_USE_FAST (use_p
, imm_iter
, name
)
2443 gimple use_stmt
= USE_STMT (use_p
);
2444 if (is_gimple_debug (use_stmt
))
2446 if (flow_bb_inside_loop_p (loop
, gimple_bb (use_stmt
)))
2450 if (dump_enabled_p ())
2451 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2452 "reduction used in loop.\n");
2457 /* If DEF_STMT is a phi node itself, we expect it to have a single argument
2458 defined in the inner loop. */
2461 op1
= PHI_ARG_DEF (def_stmt
, 0);
2463 if (gimple_phi_num_args (def_stmt
) != 1
2464 || TREE_CODE (op1
) != SSA_NAME
)
2466 if (dump_enabled_p ())
2467 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2468 "unsupported phi node definition.\n");
2473 def1
= SSA_NAME_DEF_STMT (op1
);
2474 if (gimple_bb (def1
)
2475 && flow_bb_inside_loop_p (loop
, gimple_bb (def_stmt
))
2477 && flow_bb_inside_loop_p (loop
->inner
, gimple_bb (def1
))
2478 && is_gimple_assign (def1
))
2480 if (dump_enabled_p ())
2481 report_vect_op (MSG_NOTE
, def_stmt
,
2482 "detected double reduction: ");
2484 *double_reduc
= true;
/* NOTE(review): from here DEF_STMT is presumably the gimple-assign
   case -- the PHI case above appears to return; confirm against the
   full upstream source (extraction dropped the intervening lines).  */
2491 code
= orig_code
= gimple_assign_rhs_code (def_stmt
);
2493 /* We can handle "res -= x[i]", which is non-associative by
2494 simply rewriting this into "res += -x[i]". Avoid changing
2495 gimple instruction for the first simple tests and only do this
2496 if we're allowed to change code at all. */
2497 if (code
== MINUS_EXPR
2499 && (op1
= gimple_assign_rhs1 (def_stmt
))
2500 && TREE_CODE (op1
) == SSA_NAME
2501 && SSA_NAME_DEF_STMT (op1
) == phi
)
2505 && (!commutative_tree_code (code
) || !associative_tree_code (code
)))
2507 if (dump_enabled_p ())
2508 report_vect_op (MSG_MISSED_OPTIMIZATION
, def_stmt
,
2509 "reduction: not commutative/associative: ");
2513 if (get_gimple_rhs_class (code
) != GIMPLE_BINARY_RHS
)
2515 if (code
!= COND_EXPR
)
2517 if (dump_enabled_p ())
2518 report_vect_op (MSG_MISSED_OPTIMIZATION
, def_stmt
,
2519 "reduction: not binary operation: ");
/* COND_EXPR case: rhs1 is the condition (op3/op4 its operands when it
   is a comparison); op1/op2 are the selected values.  */
2524 op3
= gimple_assign_rhs1 (def_stmt
);
2525 if (COMPARISON_CLASS_P (op3
))
2527 op4
= TREE_OPERAND (op3
, 1);
2528 op3
= TREE_OPERAND (op3
, 0);
2531 op1
= gimple_assign_rhs2 (def_stmt
);
2532 op2
= gimple_assign_rhs3 (def_stmt
);
2534 if (TREE_CODE (op1
) != SSA_NAME
&& TREE_CODE (op2
) != SSA_NAME
)
2536 if (dump_enabled_p ())
2537 report_vect_op (MSG_MISSED_OPTIMIZATION
, def_stmt
,
2538 "reduction: uses not ssa_names: ");
/* Plain binary case: operands are rhs1/rhs2.  */
2545 op1
= gimple_assign_rhs1 (def_stmt
);
2546 op2
= gimple_assign_rhs2 (def_stmt
);
2548 if (TREE_CODE (op1
) != SSA_NAME
&& TREE_CODE (op2
) != SSA_NAME
)
2550 if (dump_enabled_p ())
2551 report_vect_op (MSG_MISSED_OPTIMIZATION
, def_stmt
,
2552 "reduction: uses not ssa_names: ");
/* All SSA operands must share the operation's type, otherwise dump
   the mismatching types and reject.  */
2558 type
= TREE_TYPE (gimple_assign_lhs (def_stmt
));
2559 if ((TREE_CODE (op1
) == SSA_NAME
2560 && !types_compatible_p (type
,TREE_TYPE (op1
)))
2561 || (TREE_CODE (op2
) == SSA_NAME
2562 && !types_compatible_p (type
, TREE_TYPE (op2
)))
2563 || (op3
&& TREE_CODE (op3
) == SSA_NAME
2564 && !types_compatible_p (type
, TREE_TYPE (op3
)))
2565 || (op4
&& TREE_CODE (op4
) == SSA_NAME
2566 && !types_compatible_p (type
, TREE_TYPE (op4
))))
2568 if (dump_enabled_p ())
2570 dump_printf_loc (MSG_NOTE
, vect_location
,
2571 "reduction: multiple types: operation type: ");
2572 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, type
);
2573 dump_printf (MSG_NOTE
, ", operands types: ");
2574 dump_generic_expr (MSG_NOTE
, TDF_SLIM
,
2576 dump_printf (MSG_NOTE
, ",");
2577 dump_generic_expr (MSG_NOTE
, TDF_SLIM
,
2581 dump_printf (MSG_NOTE
, ",");
2582 dump_generic_expr (MSG_NOTE
, TDF_SLIM
,
2588 dump_printf (MSG_NOTE
, ",");
2589 dump_generic_expr (MSG_NOTE
, TDF_SLIM
,
2592 dump_printf (MSG_NOTE
, "\n");
2598 /* Check that it's ok to change the order of the computation.
2599 Generally, when vectorizing a reduction we change the order of the
2600 computation. This may change the behavior of the program in some
2601 cases, so we need to check that this is ok. One exception is when
2602 vectorizing an outer-loop: the inner-loop is executed sequentially,
2603 and therefore vectorizing reductions in the inner-loop during
2604 outer-loop vectorization is safe. */
2606 /* CHECKME: check for !flag_finite_math_only too? */
2607 if (SCALAR_FLOAT_TYPE_P (type
) && !flag_associative_math
2610 /* Changing the order of operations changes the semantics. */
2611 if (dump_enabled_p ())
2612 report_vect_op (MSG_MISSED_OPTIMIZATION
, def_stmt
,
2613 "reduction: unsafe fp math optimization: ");
2616 else if (INTEGRAL_TYPE_P (type
) && check_reduction
)
2618 if (TYPE_OVERFLOW_TRAPS (type
))
2620 /* Changing the order of operations changes the semantics. */
2621 if (dump_enabled_p ())
2622 report_vect_op (MSG_MISSED_OPTIMIZATION
, def_stmt
,
2623 "reduction: unsafe int math optimization"
2624 " (overflow traps): ");
2627 if (need_wrapping_integral_overflow
&& !TYPE_OVERFLOW_WRAPS (type
))
2629 /* Changing the order of operations changes the semantics. */
2630 if (dump_enabled_p ())
2631 report_vect_op (MSG_MISSED_OPTIMIZATION
, def_stmt
,
2632 "reduction: unsafe int math optimization"
2633 " (overflow doesn't wrap): ");
2637 else if (SAT_FIXED_POINT_TYPE_P (type
) && check_reduction
)
2639 /* Changing the order of operations changes the semantics. */
2640 if (dump_enabled_p ())
2641 report_vect_op (MSG_MISSED_OPTIMIZATION
, def_stmt
,
2642 "reduction: unsafe fixed-point math optimization: ");
2646 /* If we detected "res -= x[i]" earlier, rewrite it into
2647 "res += -x[i]" now. If this turns out to be useless reassoc
2648 will clean it up again. */
2649 if (orig_code
== MINUS_EXPR
)
2651 tree rhs
= gimple_assign_rhs2 (def_stmt
);
2652 tree negrhs
= make_ssa_name (TREE_TYPE (rhs
));
2653 gimple negate_stmt
= gimple_build_assign (negrhs
, NEGATE_EXPR
, rhs
);
2654 gimple_stmt_iterator gsi
= gsi_for_stmt (def_stmt
);
2655 set_vinfo_for_stmt (negate_stmt
, new_stmt_vec_info (negate_stmt
,
2657 gsi_insert_before (&gsi
, negate_stmt
, GSI_NEW_STMT
);
2658 gimple_assign_set_rhs2 (def_stmt
, negrhs
);
2659 gimple_assign_set_rhs_code (def_stmt
, PLUS_EXPR
);
2660 update_stmt (def_stmt
);
2663 /* Reduction is safe. We're dealing with one of the following:
2664 1) integer arithmetic and no trapv
2665 2) floating point arithmetic, and special flags permit this optimization
2666 3) nested cycle (i.e., outer loop vectorization). */
2667 if (TREE_CODE (op1
) == SSA_NAME
)
2668 def1
= SSA_NAME_DEF_STMT (op1
);
2670 if (TREE_CODE (op2
) == SSA_NAME
)
2671 def2
= SSA_NAME_DEF_STMT (op2
);
2673 if (code
!= COND_EXPR
2674 && ((!def1
|| gimple_nop_p (def1
)) && (!def2
|| gimple_nop_p (def2
))))
2676 if (dump_enabled_p ())
2677 report_vect_op (MSG_NOTE
, def_stmt
, "reduction: no defs for operands: ");
2681 /* Check that one def is the reduction def, defined by PHI,
2682 the other def is either defined in the loop ("vect_internal_def"),
2683 or it's an induction (defined by a loop-header phi-node). */
2685 if (def2
&& def2
== phi
2686 && (code
== COND_EXPR
2687 || !def1
|| gimple_nop_p (def1
)
2688 || !flow_bb_inside_loop_p (loop
, gimple_bb (def1
))
2689 || (def1
&& flow_bb_inside_loop_p (loop
, gimple_bb (def1
))
2690 && (is_gimple_assign (def1
)
2691 || is_gimple_call (def1
)
2692 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1
))
2693 == vect_induction_def
2694 || (gimple_code (def1
) == GIMPLE_PHI
2695 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1
))
2696 == vect_internal_def
2697 && !is_loop_header_bb_p (gimple_bb (def1
)))))))
2699 if (dump_enabled_p ())
2700 report_vect_op (MSG_NOTE
, def_stmt
, "detected reduction: ");
/* Mirror case: PHI feeds the first operand; if so the operands are
   swapped below so that the reduction variable is always second.  */
2704 if (def1
&& def1
== phi
2705 && (code
== COND_EXPR
2706 || !def2
|| gimple_nop_p (def2
)
2707 || !flow_bb_inside_loop_p (loop
, gimple_bb (def2
))
2708 || (def2
&& flow_bb_inside_loop_p (loop
, gimple_bb (def2
))
2709 && (is_gimple_assign (def2
)
2710 || is_gimple_call (def2
)
2711 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2
))
2712 == vect_induction_def
2713 || (gimple_code (def2
) == GIMPLE_PHI
2714 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2
))
2715 == vect_internal_def
2716 && !is_loop_header_bb_p (gimple_bb (def2
)))))))
2718 if (check_reduction
)
2720 /* Swap operands (just for simplicity - so that the rest of the code
2721 can assume that the reduction variable is always the last (second)
2723 if (dump_enabled_p ())
2724 report_vect_op (MSG_NOTE
, def_stmt
,
2725 "detected reduction: need to swap operands: ");
2727 swap_ssa_operands (def_stmt
, gimple_assign_rhs1_ptr (def_stmt
),
2728 gimple_assign_rhs2_ptr (def_stmt
));
2730 if (CONSTANT_CLASS_P (gimple_assign_rhs1 (def_stmt
)))
2731 LOOP_VINFO_OPERANDS_SWAPPED (loop_info
) = true;
2735 if (dump_enabled_p ())
2736 report_vect_op (MSG_NOTE
, def_stmt
, "detected reduction: ");
2742 /* Try to find SLP reduction chain. */
2743 if (check_reduction
&& vect_is_slp_reduction (loop_info
, phi
, def_stmt
))
2745 if (dump_enabled_p ())
2746 report_vect_op (MSG_NOTE
, def_stmt
,
2747 "reduction: detected reduction chain: ");
2752 if (dump_enabled_p ())
2753 report_vect_op (MSG_MISSED_OPTIMIZATION
, def_stmt
,
2754 "reduction: unknown pattern: ");
/* NOTE(review): shredded extraction -- leading numbers are original
   file line numbers; code bytes unchanged, comments only.  */
2759 /* Wrapper around vect_is_simple_reduction_1, that won't modify code
2760 in-place. Arguments as there. */
2763 vect_is_simple_reduction (loop_vec_info loop_info
, gimple phi
,
2764 bool check_reduction
, bool *double_reduc
,
2765 bool need_wrapping_integral_overflow
)
/* Delegates with MODIFY == false (the literal "false" argument below),
   i.e. detection only, no in-place rewriting.  */
2767 return vect_is_simple_reduction_1 (loop_info
, phi
, check_reduction
,
2768 double_reduc
, false,
2769 need_wrapping_integral_overflow
);
/* NOTE(review): shredded extraction -- leading numbers are original
   file line numbers; code bytes unchanged, comments only.  */
2772 /* Wrapper around vect_is_simple_reduction_1, which will modify code
2773 in-place if it enables detection of more reductions. Arguments
2777 vect_force_simple_reduction (loop_vec_info loop_info
, gimple phi
,
2778 bool check_reduction
, bool *double_reduc
,
2779 bool need_wrapping_integral_overflow
)
/* NOTE(review): original line 2782 (presumably "double_reduc, true," --
   the MODIFY == true argument that distinguishes this wrapper from
   vect_is_simple_reduction above) appears to have been dropped by the
   extraction; restore it from the upstream source.  */
2781 return vect_is_simple_reduction_1 (loop_info
, phi
, check_reduction
,
2783 need_wrapping_integral_overflow
);
/* NOTE(review): shredded extraction -- leading numbers are original
   file line numbers and gaps mark dropped lines; code bytes unchanged,
   comments only.  Returns/records peeling costs for prologue/epilogue
   into the given cost vectors; computes *PEEL_ITERS_EPILOGUE.  */
2786 /* Calculate cost of peeling the loop PEEL_ITERS_PROLOGUE times. */
2788 vect_get_known_peeling_cost (loop_vec_info loop_vinfo
, int peel_iters_prologue
,
2789 int *peel_iters_epilogue
,
2790 stmt_vector_for_cost
*scalar_cost_vec
,
2791 stmt_vector_for_cost
*prologue_cost_vec
,
2792 stmt_vector_for_cost
*epilogue_cost_vec
)
2795 int vf
= LOOP_VINFO_VECT_FACTOR (loop_vinfo
);
/* Unknown trip count: assume vf/2 epilogue iterations and charge a
   taken branch for each peeled loop.  */
2797 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo
))
2799 *peel_iters_epilogue
= vf
/2;
2800 if (dump_enabled_p ())
2801 dump_printf_loc (MSG_NOTE
, vect_location
,
2802 "cost model: epilogue peel iters set to vf/2 "
2803 "because loop iterations are unknown .\n");
2805 /* If peeled iterations are known but number of scalar loop
2806 iterations are unknown, count a taken branch per peeled loop. */
2807 retval
= record_stmt_cost (prologue_cost_vec
, 1, cond_branch_taken
,
2808 NULL
, 0, vect_prologue
);
2809 retval
= record_stmt_cost (prologue_cost_vec
, 1, cond_branch_taken
,
2810 NULL
, 0, vect_epilogue
);
/* Known trip count: clamp the prologue and derive the epilogue from
   the remainder modulo VF.  */
2814 int niters
= LOOP_VINFO_INT_NITERS (loop_vinfo
);
2815 peel_iters_prologue
= niters
< peel_iters_prologue
?
2816 niters
: peel_iters_prologue
;
2817 *peel_iters_epilogue
= (niters
- peel_iters_prologue
) % vf
;
2818 /* If we need to peel for gaps, but no peeling is required, we have to
2819 peel VF iterations. */
2820 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo
) && !*peel_iters_epilogue
)
2821 *peel_iters_epilogue
= vf
;
2824 stmt_info_for_cost
*si
;
/* Scale each scalar statement's cost by the peel counts and record it
   against the prologue/epilogue cost vectors.  */
2826 if (peel_iters_prologue
)
2827 FOR_EACH_VEC_ELT (*scalar_cost_vec
, j
, si
)
2828 retval
+= record_stmt_cost (prologue_cost_vec
,
2829 si
->count
* peel_iters_prologue
,
2830 si
->kind
, NULL
, si
->misalign
,
2832 if (*peel_iters_epilogue
)
2833 FOR_EACH_VEC_ELT (*scalar_cost_vec
, j
, si
)
2834 retval
+= record_stmt_cost (epilogue_cost_vec
,
2835 si
->count
* *peel_iters_epilogue
,
2836 si
->kind
, NULL
, si
->misalign
,
/* NOTE(review): shredded extraction -- leading numbers are original
   file line numbers and gaps mark dropped lines; code bytes unchanged,
   comments only.  Computes two thresholds: *RET_MIN_PROFITABLE_NITERS
   (runtime guard) and *RET_MIN_PROFITABLE_ESTIMATE (static estimate),
   or -1/-1 when vectorization can never be profitable.  */
2842 /* Function vect_estimate_min_profitable_iters
2844 Return the number of iterations required for the vector version of the
2845 loop to be profitable relative to the cost of the scalar version of the
2849 vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo
,
2850 int *ret_min_profitable_niters
,
2851 int *ret_min_profitable_estimate
)
2853 int min_profitable_iters
;
2854 int min_profitable_estimate
;
2855 int peel_iters_prologue
;
2856 int peel_iters_epilogue
;
2857 unsigned vec_inside_cost
= 0;
2858 int vec_outside_cost
= 0;
2859 unsigned vec_prologue_cost
= 0;
2860 unsigned vec_epilogue_cost
= 0;
2861 int scalar_single_iter_cost
= 0;
2862 int scalar_outside_cost
= 0;
2863 int vf
= LOOP_VINFO_VECT_FACTOR (loop_vinfo
);
2864 int npeel
= LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo
);
2865 void *target_cost_data
= LOOP_VINFO_TARGET_COST_DATA (loop_vinfo
);
2867 /* Cost model disabled. */
2868 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo
)))
2870 dump_printf_loc (MSG_NOTE
, vect_location
, "cost model disabled.\n");
2871 *ret_min_profitable_niters
= 0;
2872 *ret_min_profitable_estimate
= 0;
2876 /* Requires loop versioning tests to handle misalignment. */
2877 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo
))
2879 /* FIXME: Make cost depend on complexity of individual check. */
2880 unsigned len
= LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo
).length ();
2881 (void) add_stmt_cost (target_cost_data
, len
, vector_stmt
, NULL
, 0,
2883 dump_printf (MSG_NOTE
,
2884 "cost model: Adding cost of checks for loop "
2885 "versioning to treat misalignment.\n");
2888 /* Requires loop versioning with alias checks. */
2889 if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo
))
2891 /* FIXME: Make cost depend on complexity of individual check. */
2892 unsigned len
= LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo
).length ();
2893 (void) add_stmt_cost (target_cost_data
, len
, vector_stmt
, NULL
, 0,
2895 dump_printf (MSG_NOTE
,
2896 "cost model: Adding cost of checks for loop "
2897 "versioning aliasing.\n");
/* Either form of versioning also costs one taken branch.  */
2900 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo
)
2901 || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo
))
2902 (void) add_stmt_cost (target_cost_data
, 1, cond_branch_taken
, NULL
, 0,
2905 /* Count statements in scalar loop. Using this as scalar cost for a single
2908 TODO: Add outer loop support.
2910 TODO: Consider assigning different costs to different scalar
2913 scalar_single_iter_cost
2914 = LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo
);
2916 /* Add additional cost for the peeled instructions in prologue and epilogue
2919 FORNOW: If we don't know the value of peel_iters for prologue or epilogue
2920 at compile-time - we assume it's vf/2 (the worst would be vf-1).
2922 TODO: Build an expression that represents peel_iters for prologue and
2923 epilogue to be used in a run-time test. */
2927 peel_iters_prologue
= vf
/2;
2928 dump_printf (MSG_NOTE
, "cost model: "
2929 "prologue peel iters set to vf/2.\n");
2931 /* If peeling for alignment is unknown, loop bound of main loop becomes
2933 peel_iters_epilogue
= vf
/2;
2934 dump_printf (MSG_NOTE
, "cost model: "
2935 "epilogue peel iters set to vf/2 because "
2936 "peeling for alignment is unknown.\n");
2938 /* If peeled iterations are unknown, count a taken branch and a not taken
2939 branch per peeled loop. Even if scalar loop iterations are known,
2940 vector iterations are not known since peeled prologue iterations are
2941 not known. Hence guards remain the same. */
2942 (void) add_stmt_cost (target_cost_data
, 1, cond_branch_taken
,
2943 NULL
, 0, vect_prologue
);
2944 (void) add_stmt_cost (target_cost_data
, 1, cond_branch_not_taken
,
2945 NULL
, 0, vect_prologue
);
2946 (void) add_stmt_cost (target_cost_data
, 1, cond_branch_taken
,
2947 NULL
, 0, vect_epilogue
);
2948 (void) add_stmt_cost (target_cost_data
, 1, cond_branch_not_taken
,
2949 NULL
, 0, vect_epilogue
);
2950 stmt_info_for_cost
*si
;
/* Charge every scalar statement once per peeled prologue iteration
   and once per peeled epilogue iteration.  */
2952 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo
), j
, si
)
2954 struct _stmt_vec_info
*stmt_info
2955 = si
->stmt
? vinfo_for_stmt (si
->stmt
) : NULL
;
2956 (void) add_stmt_cost (target_cost_data
,
2957 si
->count
* peel_iters_prologue
,
2958 si
->kind
, stmt_info
, si
->misalign
,
2960 (void) add_stmt_cost (target_cost_data
,
2961 si
->count
* peel_iters_epilogue
,
2962 si
->kind
, stmt_info
, si
->misalign
,
/* Known peeling amount (NPEEL): delegate the exact computation to
   vect_get_known_peeling_cost and replay the recorded costs.  */
2968 stmt_vector_for_cost prologue_cost_vec
, epilogue_cost_vec
;
2969 stmt_info_for_cost
*si
;
2971 void *data
= LOOP_VINFO_TARGET_COST_DATA (loop_vinfo
);
2973 prologue_cost_vec
.create (2);
2974 epilogue_cost_vec
.create (2);
2975 peel_iters_prologue
= npeel
;
2977 (void) vect_get_known_peeling_cost (loop_vinfo
, peel_iters_prologue
,
2978 &peel_iters_epilogue
,
2979 &LOOP_VINFO_SCALAR_ITERATION_COST
2982 &epilogue_cost_vec
);
2984 FOR_EACH_VEC_ELT (prologue_cost_vec
, j
, si
)
2986 struct _stmt_vec_info
*stmt_info
2987 = si
->stmt
? vinfo_for_stmt (si
->stmt
) : NULL
;
2988 (void) add_stmt_cost (data
, si
->count
, si
->kind
, stmt_info
,
2989 si
->misalign
, vect_prologue
);
2992 FOR_EACH_VEC_ELT (epilogue_cost_vec
, j
, si
)
2994 struct _stmt_vec_info
*stmt_info
2995 = si
->stmt
? vinfo_for_stmt (si
->stmt
) : NULL
;
2996 (void) add_stmt_cost (data
, si
->count
, si
->kind
, stmt_info
,
2997 si
->misalign
, vect_epilogue
);
3000 prologue_cost_vec
.release ();
3001 epilogue_cost_vec
.release ();
3004 /* FORNOW: The scalar outside cost is incremented in one of the
3007 1. The vectorizer checks for alignment and aliasing and generates
3008 a condition that allows dynamic vectorization. A cost model
3009 check is ANDED with the versioning condition. Hence scalar code
3010 path now has the added cost of the versioning check.
3012 if (cost > th & versioning_check)
3015 Hence run-time scalar is incremented by not-taken branch cost.
3017 2. The vectorizer then checks if a prologue is required. If the
3018 cost model check was not done before during versioning, it has to
3019 be done before the prologue check.
3022 prologue = scalar_iters
3027 if (prologue == num_iters)
3030 Hence the run-time scalar cost is incremented by a taken branch,
3031 plus a not-taken branch, plus a taken branch cost.
3033 3. The vectorizer then checks if an epilogue is required. If the
3034 cost model check was not done before during prologue check, it
3035 has to be done with the epilogue check.
3041 if (prologue == num_iters)
3044 if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
3047 Hence the run-time scalar cost should be incremented by 2 taken
3050 TODO: The back end may reorder the BBS's differently and reverse
3051 conditions/branch directions. Change the estimates below to
3052 something more reasonable. */
3054 /* If the number of iterations is known and we do not do versioning, we can
3055 decide whether to vectorize at compile time. Hence the scalar version
3056 do not carry cost model guard costs. */
3057 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo
)
3058 || LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo
)
3059 || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo
))
3061 /* Cost model check occurs at versioning. */
3062 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo
)
3063 || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo
))
3064 scalar_outside_cost
+= vect_get_stmt_cost (cond_branch_not_taken
);
3067 /* Cost model check occurs at prologue generation. */
3068 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo
) < 0)
3069 scalar_outside_cost
+= 2 * vect_get_stmt_cost (cond_branch_taken
)
3070 + vect_get_stmt_cost (cond_branch_not_taken
);
3071 /* Cost model check occurs at epilogue generation. */
3073 scalar_outside_cost
+= 2 * vect_get_stmt_cost (cond_branch_taken
);
3077 /* Complete the target-specific cost calculations. */
3078 finish_cost (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo
), &vec_prologue_cost
,
3079 &vec_inside_cost
, &vec_epilogue_cost
);
3081 vec_outside_cost
= (int)(vec_prologue_cost
+ vec_epilogue_cost
);
3083 if (dump_enabled_p ())
3085 dump_printf_loc (MSG_NOTE
, vect_location
, "Cost model analysis: \n");
3086 dump_printf (MSG_NOTE
, " Vector inside of loop cost: %d\n",
3088 dump_printf (MSG_NOTE
, " Vector prologue cost: %d\n",
3090 dump_printf (MSG_NOTE
, " Vector epilogue cost: %d\n",
3092 dump_printf (MSG_NOTE
, " Scalar iteration cost: %d\n",
3093 scalar_single_iter_cost
);
3094 dump_printf (MSG_NOTE
, " Scalar outside cost: %d\n",
3095 scalar_outside_cost
);
3096 dump_printf (MSG_NOTE
, " Vector outside cost: %d\n",
3098 dump_printf (MSG_NOTE
, " prologue iterations: %d\n",
3099 peel_iters_prologue
);
3100 dump_printf (MSG_NOTE
, " epilogue iterations: %d\n",
3101 peel_iters_epilogue
);
3104 /* Calculate number of iterations required to make the vector version
3105 profitable, relative to the loop bodies only. The following condition
3107 SIC * niters + SOC > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC
3109 SIC = scalar iteration cost, VIC = vector iteration cost,
3110 VOC = vector outside cost, VF = vectorization factor,
3111 PL_ITERS = prologue iterations, EP_ITERS= epilogue iterations
3112 SOC = scalar outside cost for run time cost model check. */
3114 if ((scalar_single_iter_cost
* vf
) > (int) vec_inside_cost
)
3116 if (vec_outside_cost
<= 0)
3117 min_profitable_iters
= 1;
3120 min_profitable_iters
= ((vec_outside_cost
- scalar_outside_cost
) * vf
3121 - vec_inside_cost
* peel_iters_prologue
3122 - vec_inside_cost
* peel_iters_epilogue
)
3123 / ((scalar_single_iter_cost
* vf
)
/* Integer division truncates; bump the threshold when the break-even
   point is not yet reached at the computed value.  */
3126 if ((scalar_single_iter_cost
* vf
* min_profitable_iters
)
3127 <= (((int) vec_inside_cost
* min_profitable_iters
)
3128 + (((int) vec_outside_cost
- scalar_outside_cost
) * vf
)))
3129 min_profitable_iters
++;
3132 /* vector version will never be profitable. */
3135 if (LOOP_VINFO_LOOP (loop_vinfo
)->force_vectorize
)
3136 warning_at (vect_location
, OPT_Wopenmp_simd
, "vectorization "
3137 "did not happen for a simd loop");
3139 if (dump_enabled_p ())
3140 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3141 "cost model: the vector iteration cost = %d "
3142 "divided by the scalar iteration cost = %d "
3143 "is greater or equal to the vectorization factor = %d"
3145 vec_inside_cost
, scalar_single_iter_cost
, vf
);
3146 *ret_min_profitable_niters
= -1;
3147 *ret_min_profitable_estimate
= -1;
3151 dump_printf (MSG_NOTE
,
3152 " Calculated minimum iters for profitability: %d\n",
3153 min_profitable_iters
);
3155 min_profitable_iters
=
3156 min_profitable_iters
< vf
? vf
: min_profitable_iters
;
3158 /* Because the condition we create is:
3159 if (niters <= min_profitable_iters)
3160 then skip the vectorized loop. */
3161 min_profitable_iters
--;
3163 if (dump_enabled_p ())
3164 dump_printf_loc (MSG_NOTE
, vect_location
,
3165 " Runtime profitability threshold = %d\n",
3166 min_profitable_iters
);
3168 *ret_min_profitable_niters
= min_profitable_iters
;
3170 /* Calculate number of iterations required to make the vector version
3171 profitable, relative to the loop bodies only.
3173 Non-vectorized variant is SIC * niters and it must win over vector
3174 variant on the expected loop trip count. The following condition must hold true:
3175 SIC * niters > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC + SOC */
3177 if (vec_outside_cost
<= 0)
3178 min_profitable_estimate
= 1;
3181 min_profitable_estimate
= ((vec_outside_cost
+ scalar_outside_cost
) * vf
3182 - vec_inside_cost
* peel_iters_prologue
3183 - vec_inside_cost
* peel_iters_epilogue
)
3184 / ((scalar_single_iter_cost
* vf
)
3187 min_profitable_estimate
--;
3188 min_profitable_estimate
= MAX (min_profitable_estimate
, min_profitable_iters
);
3189 if (dump_enabled_p ())
3190 dump_printf_loc (MSG_NOTE
, vect_location
,
3191 " Static estimate profitability threshold = %d\n",
3192 min_profitable_iters
);
3194 *ret_min_profitable_estimate
= min_profitable_estimate
;
/* NOTE(review): shredded extraction -- leading numbers are original
   file line numbers; code bytes unchanged, comments only.  */
3197 /* Writes into SEL a mask for a vec_perm, equivalent to a vec_shr by OFFSET
3198 vector elements (not bits) for a vector of mode MODE. */
3200 calc_vec_perm_mask_for_shift (enum machine_mode mode
, unsigned int offset
,
/* Each lane I selects element (I + OFFSET); the & (2*nelt - 1) keeps
   indices in the two-vector permute range.  */
3203 unsigned int i
, nelt
= GET_MODE_NUNITS (mode
);
3205 for (i
= 0; i
< nelt
; i
++)
3206 sel
[i
] = (i
+ offset
) & (2*nelt
- 1);
/* NOTE(review): shredded extraction -- leading numbers are original
   file line numbers; code bytes unchanged, comments only.  */
3209 /* Checks whether the target supports whole-vector shifts for vectors of mode
3210 MODE. This is the case if _either_ the platform handles vec_shr_optab, _or_
3211 it supports vec_perm_const with masks for all necessary shift amounts. */
3213 have_whole_vector_shift (enum machine_mode mode
)
/* Fast path: a direct whole-vector shift instruction exists.  */
3215 if (optab_handler (vec_shr_optab
, mode
) != CODE_FOR_nothing
)
3218 if (direct_optab_handler (vec_perm_const_optab
, mode
) == CODE_FOR_nothing
)
3221 unsigned int i
, nelt
= GET_MODE_NUNITS (mode
);
3222 unsigned char *sel
= XALLOCAVEC (unsigned char, nelt
);
/* Probe every power-of-two shift amount nelt/2, nelt/4, ... 1 that the
   log-time reduction epilogue needs.  */
3224 for (i
= nelt
/2; i
>= 1; i
/=2)
3226 calc_vec_perm_mask_for_shift (mode
, i
, sel
);
3227 if (!can_vec_perm_p (mode
, false, sel
))
/* NOTE(review): shredded extraction -- leading numbers are original
   file line numbers; code bytes unchanged, comments only.  */
3233 /* Return the reduction operand (with index REDUC_INDEX) of STMT. */
3236 get_reduction_op (gimple stmt
, int reduc_index
)
/* Dispatch on the rhs class so the operand is fetched with the
   accessor appropriate to the statement shape.  */
3238 switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt
)))
3240 case GIMPLE_SINGLE_RHS
:
3241 gcc_assert (TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt
))
3243 return TREE_OPERAND (gimple_assign_rhs1 (stmt
), reduc_index
);
3244 case GIMPLE_UNARY_RHS
:
3245 return gimple_assign_rhs1 (stmt
);
3246 case GIMPLE_BINARY_RHS
:
3248 ? gimple_assign_rhs2 (stmt
) : gimple_assign_rhs1 (stmt
));
3249 case GIMPLE_TERNARY_RHS
:
3250 return gimple_op (stmt
, reduc_index
+ 1);
/* NOTE(review): shredded extraction -- leading numbers are original
   file line numbers and gaps mark dropped lines; code bytes unchanged,
   comments only.  */
3256 /* TODO: Close dependency between vect_model_*_cost and vectorizable_*
3257 functions. Design better to avoid maintenance issues. */
3259 /* Function vect_model_reduction_cost.
3261 Models cost for a reduction operation, including the vector ops
3262 generated within the strip-mine loop, the initial definition before
3263 the loop, and the epilogue code that must be generated. */
3266 vect_model_reduction_cost (stmt_vec_info stmt_info
, enum tree_code reduc_code
,
3267 int ncopies
, int reduc_index
)
3269 int prologue_cost
= 0, epilogue_cost
= 0;
3270 enum tree_code code
;
3273 gimple stmt
, orig_stmt
;
3276 loop_vec_info loop_vinfo
= STMT_VINFO_LOOP_VINFO (stmt_info
);
3277 struct loop
*loop
= NULL
;
3278 void *target_cost_data
;
/* Pick the cost-accounting context: loop vinfo when vectorizing a
   loop, otherwise the basic-block (SLP) vinfo.  */
3282 loop
= LOOP_VINFO_LOOP (loop_vinfo
);
3283 target_cost_data
= LOOP_VINFO_TARGET_COST_DATA (loop_vinfo
);
3286 target_cost_data
= BB_VINFO_TARGET_COST_DATA (STMT_VINFO_BB_VINFO (stmt_info
));
3288 /* Cost of reduction op inside loop. */
3289 unsigned inside_cost
= add_stmt_cost (target_cost_data
, ncopies
, vector_stmt
,
3290 stmt_info
, 0, vect_body
);
3291 stmt
= STMT_VINFO_STMT (stmt_info
);
3293 reduction_op
= get_reduction_op (stmt
, reduc_index
);
3295 vectype
= get_vectype_for_scalar_type (TREE_TYPE (reduction_op
));
3298 if (dump_enabled_p ())
3300 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3301 "unsupported data-type ");
3302 dump_generic_expr (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
,
3303 TREE_TYPE (reduction_op
));
3304 dump_printf (MSG_MISSED_OPTIMIZATION
, "\n");
3309 mode
= TYPE_MODE (vectype
);
3310 orig_stmt
= STMT_VINFO_RELATED_STMT (stmt_info
);
3313 orig_stmt
= STMT_VINFO_STMT (stmt_info
);
3315 code
= gimple_assign_rhs_code (orig_stmt
);
3317 /* Add in cost for initial definition. */
3318 prologue_cost
+= add_stmt_cost (target_cost_data
, 1, scalar_to_vec
,
3319 stmt_info
, 0, vect_prologue
);
3321 /* Determine cost of epilogue code.
3323 We have a reduction operator that will reduce the vector in one statement.
3324 Also requires scalar extract. */
3326 if (!loop
|| !nested_in_vect_loop_p (loop
, orig_stmt
))
3328 if (reduc_code
!= ERROR_MARK
)
3330 epilogue_cost
+= add_stmt_cost (target_cost_data
, 1, vector_stmt
,
3331 stmt_info
, 0, vect_epilogue
);
3332 epilogue_cost
+= add_stmt_cost (target_cost_data
, 1, vec_to_scalar
,
3333 stmt_info
, 0, vect_epilogue
);
/* No single-statement reduction op: cost a shift-based or
   extract-based epilogue over the vector's NELEMENTS lanes.  */
3337 int vec_size_in_bits
= tree_to_uhwi (TYPE_SIZE (vectype
));
3339 TYPE_SIZE (TREE_TYPE (gimple_assign_lhs (orig_stmt
)));
3340 int element_bitsize
= tree_to_uhwi (bitsize
);
3341 int nelements
= vec_size_in_bits
/ element_bitsize
;
3343 optab
= optab_for_tree_code (code
, vectype
, optab_default
);
3345 /* We have a whole vector shift available. */
3346 if (VECTOR_MODE_P (mode
)
3347 && optab_handler (optab
, mode
) != CODE_FOR_nothing
3348 && have_whole_vector_shift (mode
))
3350 /* Final reduction via vector shifts and the reduction operator.
3351 Also requires scalar extract. */
3352 epilogue_cost
+= add_stmt_cost (target_cost_data
,
3353 exact_log2 (nelements
) * 2,
3354 vector_stmt
, stmt_info
, 0,
3356 epilogue_cost
+= add_stmt_cost (target_cost_data
, 1,
3357 vec_to_scalar
, stmt_info
, 0,
3361 /* Use extracts and reduction op for final reduction. For N
3362 elements, we have N extracts and N-1 reduction ops. */
3363 epilogue_cost
+= add_stmt_cost (target_cost_data
,
3364 nelements
+ nelements
- 1,
3365 vector_stmt
, stmt_info
, 0,
3370 if (dump_enabled_p ())
3371 dump_printf (MSG_NOTE
,
3372 "vect_model_reduction_cost: inside_cost = %d, "
3373 "prologue_cost = %d, epilogue_cost = %d .\n", inside_cost
,
3374 prologue_cost
, epilogue_cost
);
3380 /* Function vect_model_induction_cost.
3382 Models cost for induction operations. */
3385 vect_model_induction_cost (stmt_vec_info stmt_info
, int ncopies
)
3387 loop_vec_info loop_vinfo
= STMT_VINFO_LOOP_VINFO (stmt_info
);
3388 void *target_cost_data
= LOOP_VINFO_TARGET_COST_DATA (loop_vinfo
);
3389 unsigned inside_cost
, prologue_cost
;
3391 /* loop cost for vec_loop. */
3392 inside_cost
= add_stmt_cost (target_cost_data
, ncopies
, vector_stmt
,
3393 stmt_info
, 0, vect_body
);
3395 /* prologue cost for vec_init and vec_step. */
3396 prologue_cost
= add_stmt_cost (target_cost_data
, 2, scalar_to_vec
,
3397 stmt_info
, 0, vect_prologue
);
3399 if (dump_enabled_p ())
3400 dump_printf_loc (MSG_NOTE
, vect_location
,
3401 "vect_model_induction_cost: inside_cost = %d, "
3402 "prologue_cost = %d .\n", inside_cost
, prologue_cost
);
3406 /* Function get_initial_def_for_induction
3409 STMT - a stmt that performs an induction operation in the loop.
3410 IV_PHI - the initial value of the induction variable
3413 Return a vector variable, initialized with the first VF values of
3414 the induction variable. E.g., for an iv with IV_PHI='X' and
3415 evolution S, for a vector of 4 units, we want to return:
3416 [X, X + S, X + 2*S, X + 3*S]. */
3419 get_initial_def_for_induction (gimple iv_phi
)
3421 stmt_vec_info stmt_vinfo
= vinfo_for_stmt (iv_phi
);
3422 loop_vec_info loop_vinfo
= STMT_VINFO_LOOP_VINFO (stmt_vinfo
);
3423 struct loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
3426 edge pe
= loop_preheader_edge (loop
);
3427 struct loop
*iv_loop
;
3429 tree new_vec
, vec_init
, vec_step
, t
;
3432 gimple init_stmt
, new_stmt
;
3433 gphi
*induction_phi
;
3434 tree induc_def
, vec_def
, vec_dest
;
3435 tree init_expr
, step_expr
;
3436 int vf
= LOOP_VINFO_VECT_FACTOR (loop_vinfo
);
3440 stmt_vec_info phi_info
= vinfo_for_stmt (iv_phi
);
3441 bool nested_in_vect_loop
= false;
3442 gimple_seq stmts
= NULL
;
3443 imm_use_iterator imm_iter
;
3444 use_operand_p use_p
;
3448 gimple_stmt_iterator si
;
3449 basic_block bb
= gimple_bb (iv_phi
);
3453 /* Is phi in an inner-loop, while vectorizing an enclosing outer-loop? */
3454 if (nested_in_vect_loop_p (loop
, iv_phi
))
3456 nested_in_vect_loop
= true;
3457 iv_loop
= loop
->inner
;
3461 gcc_assert (iv_loop
== (gimple_bb (iv_phi
))->loop_father
);
3463 latch_e
= loop_latch_edge (iv_loop
);
3464 loop_arg
= PHI_ARG_DEF_FROM_EDGE (iv_phi
, latch_e
);
3466 step_expr
= STMT_VINFO_LOOP_PHI_EVOLUTION_PART (phi_info
);
3467 gcc_assert (step_expr
!= NULL_TREE
);
3469 pe
= loop_preheader_edge (iv_loop
);
3470 init_expr
= PHI_ARG_DEF_FROM_EDGE (iv_phi
,
3471 loop_preheader_edge (iv_loop
));
3473 vectype
= get_vectype_for_scalar_type (TREE_TYPE (init_expr
));
3474 resvectype
= get_vectype_for_scalar_type (TREE_TYPE (PHI_RESULT (iv_phi
)));
3475 gcc_assert (vectype
);
3476 nunits
= TYPE_VECTOR_SUBPARTS (vectype
);
3477 ncopies
= vf
/ nunits
;
3479 gcc_assert (phi_info
);
3480 gcc_assert (ncopies
>= 1);
3482 /* Convert the step to the desired type. */
3483 step_expr
= force_gimple_operand (fold_convert (TREE_TYPE (vectype
),
3485 &stmts
, true, NULL_TREE
);
3488 new_bb
= gsi_insert_seq_on_edge_immediate (pe
, stmts
);
3489 gcc_assert (!new_bb
);
3492 /* Find the first insertion point in the BB. */
3493 si
= gsi_after_labels (bb
);
3495 /* Create the vector that holds the initial_value of the induction. */
3496 if (nested_in_vect_loop
)
3498 /* iv_loop is nested in the loop to be vectorized. init_expr had already
3499 been created during vectorization of previous stmts. We obtain it
3500 from the STMT_VINFO_VEC_STMT of the defining stmt. */
3501 vec_init
= vect_get_vec_def_for_operand (init_expr
, iv_phi
, NULL
);
3502 /* If the initial value is not of proper type, convert it. */
3503 if (!useless_type_conversion_p (vectype
, TREE_TYPE (vec_init
)))
3506 = gimple_build_assign (vect_get_new_vect_var (vectype
,
3510 build1 (VIEW_CONVERT_EXPR
, vectype
,
3512 vec_init
= make_ssa_name (gimple_assign_lhs (new_stmt
), new_stmt
);
3513 gimple_assign_set_lhs (new_stmt
, vec_init
);
3514 new_bb
= gsi_insert_on_edge_immediate (loop_preheader_edge (iv_loop
),
3516 gcc_assert (!new_bb
);
3517 set_vinfo_for_stmt (new_stmt
,
3518 new_stmt_vec_info (new_stmt
, loop_vinfo
, NULL
));
3523 vec
<constructor_elt
, va_gc
> *v
;
3525 /* iv_loop is the loop to be vectorized. Create:
3526 vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr) */
3527 new_var
= vect_get_new_vect_var (TREE_TYPE (vectype
),
3528 vect_scalar_var
, "var_");
3529 new_name
= force_gimple_operand (fold_convert (TREE_TYPE (vectype
),
3531 &stmts
, false, new_var
);
3534 new_bb
= gsi_insert_seq_on_edge_immediate (pe
, stmts
);
3535 gcc_assert (!new_bb
);
3538 vec_alloc (v
, nunits
);
3539 bool constant_p
= is_gimple_min_invariant (new_name
);
3540 CONSTRUCTOR_APPEND_ELT (v
, NULL_TREE
, new_name
);
3541 for (i
= 1; i
< nunits
; i
++)
3543 /* Create: new_name_i = new_name + step_expr */
3544 new_name
= fold_build2 (PLUS_EXPR
, TREE_TYPE (new_name
),
3545 new_name
, step_expr
);
3546 if (!is_gimple_min_invariant (new_name
))
3548 init_stmt
= gimple_build_assign (new_var
, new_name
);
3549 new_name
= make_ssa_name (new_var
, init_stmt
);
3550 gimple_assign_set_lhs (init_stmt
, new_name
);
3551 new_bb
= gsi_insert_on_edge_immediate (pe
, init_stmt
);
3552 gcc_assert (!new_bb
);
3553 if (dump_enabled_p ())
3555 dump_printf_loc (MSG_NOTE
, vect_location
,
3556 "created new init_stmt: ");
3557 dump_gimple_stmt (MSG_NOTE
, TDF_SLIM
, init_stmt
, 0);
3558 dump_printf (MSG_NOTE
, "\n");
3562 CONSTRUCTOR_APPEND_ELT (v
, NULL_TREE
, new_name
);
3564 /* Create a vector from [new_name_0, new_name_1, ..., new_name_nunits-1] */
3566 new_vec
= build_vector_from_ctor (vectype
, v
);
3568 new_vec
= build_constructor (vectype
, v
);
3569 vec_init
= vect_init_vector (iv_phi
, new_vec
, vectype
, NULL
);
3573 /* Create the vector that holds the step of the induction. */
3574 if (nested_in_vect_loop
)
3575 /* iv_loop is nested in the loop to be vectorized. Generate:
3576 vec_step = [S, S, S, S] */
3577 new_name
= step_expr
;
3580 /* iv_loop is the loop to be vectorized. Generate:
3581 vec_step = [VF*S, VF*S, VF*S, VF*S] */
3582 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr
)))
3584 expr
= build_int_cst (integer_type_node
, vf
);
3585 expr
= fold_convert (TREE_TYPE (step_expr
), expr
);
3588 expr
= build_int_cst (TREE_TYPE (step_expr
), vf
);
3589 new_name
= fold_build2 (MULT_EXPR
, TREE_TYPE (step_expr
),
3591 if (TREE_CODE (step_expr
) == SSA_NAME
)
3592 new_name
= vect_init_vector (iv_phi
, new_name
,
3593 TREE_TYPE (step_expr
), NULL
);
3596 t
= unshare_expr (new_name
);
3597 gcc_assert (CONSTANT_CLASS_P (new_name
)
3598 || TREE_CODE (new_name
) == SSA_NAME
);
3599 stepvectype
= get_vectype_for_scalar_type (TREE_TYPE (new_name
));
3600 gcc_assert (stepvectype
);
3601 new_vec
= build_vector_from_val (stepvectype
, t
);
3602 vec_step
= vect_init_vector (iv_phi
, new_vec
, stepvectype
, NULL
);
3605 /* Create the following def-use cycle:
3610 vec_iv = PHI <vec_init, vec_loop>
3614 vec_loop = vec_iv + vec_step; */
3616 /* Create the induction-phi that defines the induction-operand. */
3617 vec_dest
= vect_get_new_vect_var (vectype
, vect_simple_var
, "vec_iv_");
3618 induction_phi
= create_phi_node (vec_dest
, iv_loop
->header
);
3619 set_vinfo_for_stmt (induction_phi
,
3620 new_stmt_vec_info (induction_phi
, loop_vinfo
, NULL
));
3621 induc_def
= PHI_RESULT (induction_phi
);
3623 /* Create the iv update inside the loop */
3624 new_stmt
= gimple_build_assign (vec_dest
, PLUS_EXPR
, induc_def
, vec_step
);
3625 vec_def
= make_ssa_name (vec_dest
, new_stmt
);
3626 gimple_assign_set_lhs (new_stmt
, vec_def
);
3627 gsi_insert_before (&si
, new_stmt
, GSI_SAME_STMT
);
3628 set_vinfo_for_stmt (new_stmt
, new_stmt_vec_info (new_stmt
, loop_vinfo
,
3631 /* Set the arguments of the phi node: */
3632 add_phi_arg (induction_phi
, vec_init
, pe
, UNKNOWN_LOCATION
);
3633 add_phi_arg (induction_phi
, vec_def
, loop_latch_edge (iv_loop
),
3637 /* In case that vectorization factor (VF) is bigger than the number
3638 of elements that we can fit in a vectype (nunits), we have to generate
3639 more than one vector stmt - i.e - we need to "unroll" the
3640 vector stmt by a factor VF/nunits. For more details see documentation
3641 in vectorizable_operation. */
3645 stmt_vec_info prev_stmt_vinfo
;
3646 /* FORNOW. This restriction should be relaxed. */
3647 gcc_assert (!nested_in_vect_loop
);
3649 /* Create the vector that holds the step of the induction. */
3650 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr
)))
3652 expr
= build_int_cst (integer_type_node
, nunits
);
3653 expr
= fold_convert (TREE_TYPE (step_expr
), expr
);
3656 expr
= build_int_cst (TREE_TYPE (step_expr
), nunits
);
3657 new_name
= fold_build2 (MULT_EXPR
, TREE_TYPE (step_expr
),
3659 if (TREE_CODE (step_expr
) == SSA_NAME
)
3660 new_name
= vect_init_vector (iv_phi
, new_name
,
3661 TREE_TYPE (step_expr
), NULL
);
3662 t
= unshare_expr (new_name
);
3663 gcc_assert (CONSTANT_CLASS_P (new_name
)
3664 || TREE_CODE (new_name
) == SSA_NAME
);
3665 new_vec
= build_vector_from_val (stepvectype
, t
);
3666 vec_step
= vect_init_vector (iv_phi
, new_vec
, stepvectype
, NULL
);
3668 vec_def
= induc_def
;
3669 prev_stmt_vinfo
= vinfo_for_stmt (induction_phi
);
3670 for (i
= 1; i
< ncopies
; i
++)
3672 /* vec_i = vec_prev + vec_step */
3673 new_stmt
= gimple_build_assign (vec_dest
, PLUS_EXPR
,
3675 vec_def
= make_ssa_name (vec_dest
, new_stmt
);
3676 gimple_assign_set_lhs (new_stmt
, vec_def
);
3678 gsi_insert_before (&si
, new_stmt
, GSI_SAME_STMT
);
3679 if (!useless_type_conversion_p (resvectype
, vectype
))
3682 = gimple_build_assign
3683 (vect_get_new_vect_var (resvectype
, vect_simple_var
,
3686 build1 (VIEW_CONVERT_EXPR
, resvectype
,
3687 gimple_assign_lhs (new_stmt
)));
3688 gimple_assign_set_lhs (new_stmt
,
3690 (gimple_assign_lhs (new_stmt
), new_stmt
));
3691 gsi_insert_before (&si
, new_stmt
, GSI_SAME_STMT
);
3693 set_vinfo_for_stmt (new_stmt
,
3694 new_stmt_vec_info (new_stmt
, loop_vinfo
, NULL
));
3695 STMT_VINFO_RELATED_STMT (prev_stmt_vinfo
) = new_stmt
;
3696 prev_stmt_vinfo
= vinfo_for_stmt (new_stmt
);
3700 if (nested_in_vect_loop
)
3702 /* Find the loop-closed exit-phi of the induction, and record
3703 the final vector of induction results: */
3705 FOR_EACH_IMM_USE_FAST (use_p
, imm_iter
, loop_arg
)
3707 gimple use_stmt
= USE_STMT (use_p
);
3708 if (is_gimple_debug (use_stmt
))
3711 if (!flow_bb_inside_loop_p (iv_loop
, gimple_bb (use_stmt
)))
3713 exit_phi
= use_stmt
;
3719 stmt_vec_info stmt_vinfo
= vinfo_for_stmt (exit_phi
);
3720 /* FORNOW. Currently not supporting the case that an inner-loop induction
3721 is not used in the outer-loop (i.e. only outside the outer-loop). */
3722 gcc_assert (STMT_VINFO_RELEVANT_P (stmt_vinfo
)
3723 && !STMT_VINFO_LIVE_P (stmt_vinfo
));
3725 STMT_VINFO_VEC_STMT (stmt_vinfo
) = new_stmt
;
3726 if (dump_enabled_p ())
3728 dump_printf_loc (MSG_NOTE
, vect_location
,
3729 "vector of inductions after inner-loop:");
3730 dump_gimple_stmt (MSG_NOTE
, TDF_SLIM
, new_stmt
, 0);
3731 dump_printf (MSG_NOTE
, "\n");
3737 if (dump_enabled_p ())
3739 dump_printf_loc (MSG_NOTE
, vect_location
,
3740 "transform induction: created def-use cycle: ");
3741 dump_gimple_stmt (MSG_NOTE
, TDF_SLIM
, induction_phi
, 0);
3742 dump_printf (MSG_NOTE
, "\n");
3743 dump_gimple_stmt (MSG_NOTE
, TDF_SLIM
,
3744 SSA_NAME_DEF_STMT (vec_def
), 0);
3745 dump_printf (MSG_NOTE
, "\n");
3748 STMT_VINFO_VEC_STMT (phi_info
) = induction_phi
;
3749 if (!useless_type_conversion_p (resvectype
, vectype
))
3751 new_stmt
= gimple_build_assign (vect_get_new_vect_var (resvectype
,
3755 build1 (VIEW_CONVERT_EXPR
, resvectype
,
3757 induc_def
= make_ssa_name (gimple_assign_lhs (new_stmt
), new_stmt
);
3758 gimple_assign_set_lhs (new_stmt
, induc_def
);
3759 si
= gsi_after_labels (bb
);
3760 gsi_insert_before (&si
, new_stmt
, GSI_SAME_STMT
);
3761 set_vinfo_for_stmt (new_stmt
,
3762 new_stmt_vec_info (new_stmt
, loop_vinfo
, NULL
));
3763 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (new_stmt
))
3764 = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (induction_phi
));
3771 /* Function get_initial_def_for_reduction
3774 STMT - a stmt that performs a reduction operation in the loop.
3775 INIT_VAL - the initial value of the reduction variable
3778 ADJUSTMENT_DEF - a tree that holds a value to be added to the final result
3779 of the reduction (used for adjusting the epilog - see below).
3780 Return a vector variable, initialized according to the operation that STMT
3781 performs. This vector will be used as the initial value of the
3782 vector of partial results.
3784 Option1 (adjust in epilog): Initialize the vector as follows:
3785 add/bit or/xor: [0,0,...,0,0]
3786 mult/bit and: [1,1,...,1,1]
3787 min/max/cond_expr: [init_val,init_val,..,init_val,init_val]
3788 and when necessary (e.g. add/mult case) let the caller know
3789 that it needs to adjust the result by init_val.
3791 Option2: Initialize the vector as follows:
3792 add/bit or/xor: [init_val,0,0,...,0]
3793 mult/bit and: [init_val,1,1,...,1]
3794 min/max/cond_expr: [init_val,init_val,...,init_val]
3795 and no adjustments are needed.
3797 For example, for the following code:
3803 STMT is 's = s + a[i]', and the reduction variable is 's'.
3804 For a vector of 4 units, we want to return either [0,0,0,init_val],
3805 or [0,0,0,0] and let the caller know that it needs to adjust
3806 the result at the end by 'init_val'.
3808 FORNOW, we are using the 'adjust in epilog' scheme, because this way the
3809 initialization vector is simpler (same element in all entries), if
3810 ADJUSTMENT_DEF is not NULL, and Option2 otherwise.
3812 A cost model should help decide between these two schemes. */
3815 get_initial_def_for_reduction (gimple stmt
, tree init_val
,
3816 tree
*adjustment_def
)
3818 stmt_vec_info stmt_vinfo
= vinfo_for_stmt (stmt
);
3819 loop_vec_info loop_vinfo
= STMT_VINFO_LOOP_VINFO (stmt_vinfo
);
3820 struct loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
3821 tree scalar_type
= TREE_TYPE (init_val
);
3822 tree vectype
= get_vectype_for_scalar_type (scalar_type
);
3824 enum tree_code code
= gimple_assign_rhs_code (stmt
);
3829 bool nested_in_vect_loop
= false;
3831 REAL_VALUE_TYPE real_init_val
= dconst0
;
3832 int int_init_val
= 0;
3833 gimple def_stmt
= NULL
;
3835 gcc_assert (vectype
);
3836 nunits
= TYPE_VECTOR_SUBPARTS (vectype
);
3838 gcc_assert (POINTER_TYPE_P (scalar_type
) || INTEGRAL_TYPE_P (scalar_type
)
3839 || SCALAR_FLOAT_TYPE_P (scalar_type
));
3841 if (nested_in_vect_loop_p (loop
, stmt
))
3842 nested_in_vect_loop
= true;
3844 gcc_assert (loop
== (gimple_bb (stmt
))->loop_father
);
3846 /* In case of double reduction we only create a vector variable to be put
3847 in the reduction phi node. The actual statement creation is done in
3848 vect_create_epilog_for_reduction. */
3849 if (adjustment_def
&& nested_in_vect_loop
3850 && TREE_CODE (init_val
) == SSA_NAME
3851 && (def_stmt
= SSA_NAME_DEF_STMT (init_val
))
3852 && gimple_code (def_stmt
) == GIMPLE_PHI
3853 && flow_bb_inside_loop_p (loop
, gimple_bb (def_stmt
))
3854 && vinfo_for_stmt (def_stmt
)
3855 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt
))
3856 == vect_double_reduction_def
)
3858 *adjustment_def
= NULL
;
3859 return vect_create_destination_var (init_val
, vectype
);
3862 if (TREE_CONSTANT (init_val
))
3864 if (SCALAR_FLOAT_TYPE_P (scalar_type
))
3865 init_value
= build_real (scalar_type
, TREE_REAL_CST (init_val
));
3867 init_value
= build_int_cst (scalar_type
, TREE_INT_CST_LOW (init_val
));
3870 init_value
= init_val
;
3874 case WIDEN_SUM_EXPR
:
3883 /* ADJUSMENT_DEF is NULL when called from
3884 vect_create_epilog_for_reduction to vectorize double reduction. */
3887 if (nested_in_vect_loop
)
3888 *adjustment_def
= vect_get_vec_def_for_operand (init_val
, stmt
,
3891 *adjustment_def
= init_val
;
3894 if (code
== MULT_EXPR
)
3896 real_init_val
= dconst1
;
3900 if (code
== BIT_AND_EXPR
)
3903 if (SCALAR_FLOAT_TYPE_P (scalar_type
))
3904 def_for_init
= build_real (scalar_type
, real_init_val
);
3906 def_for_init
= build_int_cst (scalar_type
, int_init_val
);
3908 /* Create a vector of '0' or '1' except the first element. */
3909 elts
= XALLOCAVEC (tree
, nunits
);
3910 for (i
= nunits
- 2; i
>= 0; --i
)
3911 elts
[i
+ 1] = def_for_init
;
3913 /* Option1: the first element is '0' or '1' as well. */
3916 elts
[0] = def_for_init
;
3917 init_def
= build_vector (vectype
, elts
);
3921 /* Option2: the first element is INIT_VAL. */
3923 if (TREE_CONSTANT (init_val
))
3924 init_def
= build_vector (vectype
, elts
);
3927 vec
<constructor_elt
, va_gc
> *v
;
3928 vec_alloc (v
, nunits
);
3929 CONSTRUCTOR_APPEND_ELT (v
, NULL_TREE
, init_val
);
3930 for (i
= 1; i
< nunits
; ++i
)
3931 CONSTRUCTOR_APPEND_ELT (v
, NULL_TREE
, elts
[i
]);
3932 init_def
= build_constructor (vectype
, v
);
3942 *adjustment_def
= NULL_TREE
;
3943 init_def
= vect_get_vec_def_for_operand (init_val
, stmt
, NULL
);
3947 init_def
= build_vector_from_val (vectype
, init_value
);
3957 /* Function vect_create_epilog_for_reduction
3959 Create code at the loop-epilog to finalize the result of a reduction
3962 VECT_DEFS is list of vector of partial results, i.e., the lhs's of vector
3963 reduction statements.
3964 STMT is the scalar reduction stmt that is being vectorized.
3965 NCOPIES is > 1 in case the vectorization factor (VF) is bigger than the
3966 number of elements that we can fit in a vectype (nunits). In this case
3967 we have to generate more than one vector stmt - i.e - we need to "unroll"
3968 the vector stmt by a factor VF/nunits. For more details see documentation
3969 in vectorizable_operation.
3970 REDUC_CODE is the tree-code for the epilog reduction.
3971 REDUCTION_PHIS is a list of the phi-nodes that carry the reduction
3973 REDUC_INDEX is the index of the operand in the right hand side of the
3974 statement that is defined by REDUCTION_PHI.
3975 DOUBLE_REDUC is TRUE if double reduction phi nodes should be handled.
3976 SLP_NODE is an SLP node containing a group of reduction statements. The
3977 first one in this group is STMT.
3980 1. Creates the reduction def-use cycles: sets the arguments for
3982 The loop-entry argument is the vectorized initial-value of the reduction.
3983 The loop-latch argument is taken from VECT_DEFS - the vector of partial
3985 2. "Reduces" each vector of partial results VECT_DEFS into a single result,
3986 by applying the operation specified by REDUC_CODE if available, or by
3987 other means (whole-vector shifts or a scalar loop).
3988 The function also creates a new phi node at the loop exit to preserve
3989 loop-closed form, as illustrated below.
3991 The flow at the entry to this function:
3994 vec_def = phi <null, null> # REDUCTION_PHI
3995 VECT_DEF = vector_stmt # vectorized form of STMT
3996 s_loop = scalar_stmt # (scalar) STMT
3998 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
4002 The above is transformed by this function into:
4005 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
4006 VECT_DEF = vector_stmt # vectorized form of STMT
4007 s_loop = scalar_stmt # (scalar) STMT
4009 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
4010 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
4011 v_out2 = reduce <v_out1>
4012 s_out3 = extract_field <v_out2, 0>
4013 s_out4 = adjust_result <s_out3>
4019 vect_create_epilog_for_reduction (vec
<tree
> vect_defs
, gimple stmt
,
4020 int ncopies
, enum tree_code reduc_code
,
4021 vec
<gimple
> reduction_phis
,
4022 int reduc_index
, bool double_reduc
,
4025 stmt_vec_info stmt_info
= vinfo_for_stmt (stmt
);
4026 stmt_vec_info prev_phi_info
;
4029 loop_vec_info loop_vinfo
= STMT_VINFO_LOOP_VINFO (stmt_info
);
4030 struct loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
), *outer_loop
= NULL
;
4031 basic_block exit_bb
;
4034 gimple new_phi
= NULL
, phi
;
4035 gimple_stmt_iterator exit_gsi
;
4037 tree new_temp
= NULL_TREE
, new_dest
, new_name
, new_scalar_dest
;
4038 gimple epilog_stmt
= NULL
;
4039 enum tree_code code
= gimple_assign_rhs_code (stmt
);
4042 tree adjustment_def
= NULL
;
4043 tree vec_initial_def
= NULL
;
4044 tree reduction_op
, expr
, def
;
4045 tree orig_name
, scalar_result
;
4046 imm_use_iterator imm_iter
, phi_imm_iter
;
4047 use_operand_p use_p
, phi_use_p
;
4048 gimple use_stmt
, orig_stmt
, reduction_phi
= NULL
;
4049 bool nested_in_vect_loop
= false;
4050 auto_vec
<gimple
> new_phis
;
4051 auto_vec
<gimple
> inner_phis
;
4052 enum vect_def_type dt
= vect_unknown_def_type
;
4054 auto_vec
<tree
> scalar_results
;
4055 unsigned int group_size
= 1, k
, ratio
;
4056 auto_vec
<tree
> vec_initial_defs
;
4057 auto_vec
<gimple
> phis
;
4058 bool slp_reduc
= false;
4059 tree new_phi_result
;
4060 gimple inner_phi
= NULL
;
4063 group_size
= SLP_TREE_SCALAR_STMTS (slp_node
).length ();
4065 if (nested_in_vect_loop_p (loop
, stmt
))
4069 nested_in_vect_loop
= true;
4070 gcc_assert (!slp_node
);
4073 reduction_op
= get_reduction_op (stmt
, reduc_index
);
4075 vectype
= get_vectype_for_scalar_type (TREE_TYPE (reduction_op
));
4076 gcc_assert (vectype
);
4077 mode
= TYPE_MODE (vectype
);
4079 /* 1. Create the reduction def-use cycle:
4080 Set the arguments of REDUCTION_PHIS, i.e., transform
4083 vec_def = phi <null, null> # REDUCTION_PHI
4084 VECT_DEF = vector_stmt # vectorized form of STMT
4090 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
4091 VECT_DEF = vector_stmt # vectorized form of STMT
4094 (in case of SLP, do it for all the phis). */
4096 /* Get the loop-entry arguments. */
4098 vect_get_vec_defs (reduction_op
, NULL_TREE
, stmt
, &vec_initial_defs
,
4099 NULL
, slp_node
, reduc_index
);
4102 vec_initial_defs
.create (1);
4103 /* For the case of reduction, vect_get_vec_def_for_operand returns
4104 the scalar def before the loop, that defines the initial value
4105 of the reduction variable. */
4106 vec_initial_def
= vect_get_vec_def_for_operand (reduction_op
, stmt
,
4108 vec_initial_defs
.quick_push (vec_initial_def
);
4111 /* Set phi nodes arguments. */
4112 FOR_EACH_VEC_ELT (reduction_phis
, i
, phi
)
4114 tree vec_init_def
, def
;
4116 vec_init_def
= force_gimple_operand (vec_initial_defs
[i
], &stmts
,
4118 gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop
), stmts
);
4120 for (j
= 0; j
< ncopies
; j
++)
4122 /* Set the loop-entry arg of the reduction-phi. */
4123 add_phi_arg (as_a
<gphi
*> (phi
), vec_init_def
,
4124 loop_preheader_edge (loop
), UNKNOWN_LOCATION
);
4126 /* Set the loop-latch arg for the reduction-phi. */
4128 def
= vect_get_vec_def_for_stmt_copy (vect_unknown_def_type
, def
);
4130 add_phi_arg (as_a
<gphi
*> (phi
), def
, loop_latch_edge (loop
),
4133 if (dump_enabled_p ())
4135 dump_printf_loc (MSG_NOTE
, vect_location
,
4136 "transform reduction: created def-use cycle: ");
4137 dump_gimple_stmt (MSG_NOTE
, TDF_SLIM
, phi
, 0);
4138 dump_printf (MSG_NOTE
, "\n");
4139 dump_gimple_stmt (MSG_NOTE
, TDF_SLIM
, SSA_NAME_DEF_STMT (def
), 0);
4140 dump_printf (MSG_NOTE
, "\n");
4143 phi
= STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi
));
4147 /* 2. Create epilog code.
4148 The reduction epilog code operates across the elements of the vector
4149 of partial results computed by the vectorized loop.
4150 The reduction epilog code consists of:
4152 step 1: compute the scalar result in a vector (v_out2)
4153 step 2: extract the scalar result (s_out3) from the vector (v_out2)
4154 step 3: adjust the scalar result (s_out3) if needed.
4156 Step 1 can be accomplished using one the following three schemes:
4157 (scheme 1) using reduc_code, if available.
4158 (scheme 2) using whole-vector shifts, if available.
4159 (scheme 3) using a scalar loop. In this case steps 1+2 above are
4162 The overall epilog code looks like this:
4164 s_out0 = phi <s_loop> # original EXIT_PHI
4165 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
4166 v_out2 = reduce <v_out1> # step 1
4167 s_out3 = extract_field <v_out2, 0> # step 2
4168 s_out4 = adjust_result <s_out3> # step 3
4170 (step 3 is optional, and steps 1 and 2 may be combined).
4171 Lastly, the uses of s_out0 are replaced by s_out4. */
4174 /* 2.1 Create new loop-exit-phis to preserve loop-closed form:
4175 v_out1 = phi <VECT_DEF>
4176 Store them in NEW_PHIS. */
4178 exit_bb
= single_exit (loop
)->dest
;
4179 prev_phi_info
= NULL
;
4180 new_phis
.create (vect_defs
.length ());
4181 FOR_EACH_VEC_ELT (vect_defs
, i
, def
)
4183 for (j
= 0; j
< ncopies
; j
++)
4185 tree new_def
= copy_ssa_name (def
);
4186 phi
= create_phi_node (new_def
, exit_bb
);
4187 set_vinfo_for_stmt (phi
, new_stmt_vec_info (phi
, loop_vinfo
, NULL
));
4189 new_phis
.quick_push (phi
);
4192 def
= vect_get_vec_def_for_stmt_copy (dt
, def
);
4193 STMT_VINFO_RELATED_STMT (prev_phi_info
) = phi
;
4196 SET_PHI_ARG_DEF (phi
, single_exit (loop
)->dest_idx
, def
);
4197 prev_phi_info
= vinfo_for_stmt (phi
);
4201 /* The epilogue is created for the outer-loop, i.e., for the loop being
4202 vectorized. Create exit phis for the outer loop. */
4206 exit_bb
= single_exit (loop
)->dest
;
4207 inner_phis
.create (vect_defs
.length ());
4208 FOR_EACH_VEC_ELT (new_phis
, i
, phi
)
4210 tree new_result
= copy_ssa_name (PHI_RESULT (phi
));
4211 gphi
*outer_phi
= create_phi_node (new_result
, exit_bb
);
4212 SET_PHI_ARG_DEF (outer_phi
, single_exit (loop
)->dest_idx
,
4214 set_vinfo_for_stmt (outer_phi
, new_stmt_vec_info (outer_phi
,
4216 inner_phis
.quick_push (phi
);
4217 new_phis
[i
] = outer_phi
;
4218 prev_phi_info
= vinfo_for_stmt (outer_phi
);
4219 while (STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi
)))
4221 phi
= STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi
));
4222 new_result
= copy_ssa_name (PHI_RESULT (phi
));
4223 outer_phi
= create_phi_node (new_result
, exit_bb
);
4224 SET_PHI_ARG_DEF (outer_phi
, single_exit (loop
)->dest_idx
,
4226 set_vinfo_for_stmt (outer_phi
, new_stmt_vec_info (outer_phi
,
4228 STMT_VINFO_RELATED_STMT (prev_phi_info
) = outer_phi
;
4229 prev_phi_info
= vinfo_for_stmt (outer_phi
);
4234 exit_gsi
= gsi_after_labels (exit_bb
);
4236 /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
4237 (i.e. when reduc_code is not available) and in the final adjustment
4238 code (if needed). Also get the original scalar reduction variable as
4239 defined in the loop. In case STMT is a "pattern-stmt" (i.e. - it
4240 represents a reduction pattern), the tree-code and scalar-def are
4241 taken from the original stmt that the pattern-stmt (STMT) replaces.
4242 Otherwise (it is a regular reduction) - the tree-code and scalar-def
4243 are taken from STMT. */
4245 orig_stmt
= STMT_VINFO_RELATED_STMT (stmt_info
);
4248 /* Regular reduction */
4253 /* Reduction pattern */
4254 stmt_vec_info stmt_vinfo
= vinfo_for_stmt (orig_stmt
);
4255 gcc_assert (STMT_VINFO_IN_PATTERN_P (stmt_vinfo
));
4256 gcc_assert (STMT_VINFO_RELATED_STMT (stmt_vinfo
) == stmt
);
4259 code
= gimple_assign_rhs_code (orig_stmt
);
4260 /* For MINUS_EXPR the initial vector is [init_val,0,...,0], therefore,
4261 partial results are added and not subtracted. */
4262 if (code
== MINUS_EXPR
)
4265 scalar_dest
= gimple_assign_lhs (orig_stmt
);
4266 scalar_type
= TREE_TYPE (scalar_dest
);
4267 scalar_results
.create (group_size
);
4268 new_scalar_dest
= vect_create_destination_var (scalar_dest
, NULL
);
4269 bitsize
= TYPE_SIZE (scalar_type
);
4271 /* In case this is a reduction in an inner-loop while vectorizing an outer
4272 loop - we don't need to extract a single scalar result at the end of the
4273 inner-loop (unless it is double reduction, i.e., the use of reduction is
4274 outside the outer-loop). The final vector of partial results will be used
4275 in the vectorized outer-loop, or reduced to a scalar result at the end of
4277 if (nested_in_vect_loop
&& !double_reduc
)
4278 goto vect_finalize_reduction
;
4280 /* SLP reduction without reduction chain, e.g.,
4284 b2 = operation (b1) */
4285 slp_reduc
= (slp_node
&& !GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt
)));
4287 /* In case of reduction chain, e.g.,
4290 a3 = operation (a2),
4292 we may end up with more than one vector result. Here we reduce them to
4294 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt
)))
4296 tree first_vect
= PHI_RESULT (new_phis
[0]);
4298 gassign
*new_vec_stmt
= NULL
;
4300 vec_dest
= vect_create_destination_var (scalar_dest
, vectype
);
4301 for (k
= 1; k
< new_phis
.length (); k
++)
4303 gimple next_phi
= new_phis
[k
];
4304 tree second_vect
= PHI_RESULT (next_phi
);
4306 tmp
= build2 (code
, vectype
, first_vect
, second_vect
);
4307 new_vec_stmt
= gimple_build_assign (vec_dest
, tmp
);
4308 first_vect
= make_ssa_name (vec_dest
, new_vec_stmt
);
4309 gimple_assign_set_lhs (new_vec_stmt
, first_vect
);
4310 gsi_insert_before (&exit_gsi
, new_vec_stmt
, GSI_SAME_STMT
);
4313 new_phi_result
= first_vect
;
4316 new_phis
.truncate (0);
4317 new_phis
.safe_push (new_vec_stmt
);
4321 new_phi_result
= PHI_RESULT (new_phis
[0]);
4323 /* 2.3 Create the reduction code, using one of the three schemes described
4324 above. In SLP we simply need to extract all the elements from the
4325 vector (without reducing them), so we use scalar shifts. */
4326 if (reduc_code
!= ERROR_MARK
&& !slp_reduc
)
4331 /*** Case 1: Create:
4332 v_out2 = reduc_expr <v_out1> */
4334 if (dump_enabled_p ())
4335 dump_printf_loc (MSG_NOTE
, vect_location
,
4336 "Reduce using direct vector reduction.\n");
4338 vec_elem_type
= TREE_TYPE (TREE_TYPE (new_phi_result
));
4339 if (!useless_type_conversion_p (scalar_type
, vec_elem_type
))
4342 vect_create_destination_var (scalar_dest
, vec_elem_type
);
4343 tmp
= build1 (reduc_code
, vec_elem_type
, new_phi_result
);
4344 epilog_stmt
= gimple_build_assign (tmp_dest
, tmp
);
4345 new_temp
= make_ssa_name (tmp_dest
, epilog_stmt
);
4346 gimple_assign_set_lhs (epilog_stmt
, new_temp
);
4347 gsi_insert_before (&exit_gsi
, epilog_stmt
, GSI_SAME_STMT
);
4349 tmp
= build1 (NOP_EXPR
, scalar_type
, new_temp
);
4352 tmp
= build1 (reduc_code
, scalar_type
, new_phi_result
);
4353 epilog_stmt
= gimple_build_assign (new_scalar_dest
, tmp
);
4354 new_temp
= make_ssa_name (new_scalar_dest
, epilog_stmt
);
4355 gimple_assign_set_lhs (epilog_stmt
, new_temp
);
4356 gsi_insert_before (&exit_gsi
, epilog_stmt
, GSI_SAME_STMT
);
4357 scalar_results
.safe_push (new_temp
);
4361 bool reduce_with_shift
= have_whole_vector_shift (mode
);
4362 int element_bitsize
= tree_to_uhwi (bitsize
);
4363 int vec_size_in_bits
= tree_to_uhwi (TYPE_SIZE (vectype
));
4366 /* Regardless of whether we have a whole vector shift, if we're
4367 emulating the operation via tree-vect-generic, we don't want
4368 to use it. Only the first round of the reduction is likely
4369 to still be profitable via emulation. */
4370 /* ??? It might be better to emit a reduction tree code here, so that
4371 tree-vect-generic can expand the first round via bit tricks. */
4372 if (!VECTOR_MODE_P (mode
))
4373 reduce_with_shift
= false;
4376 optab optab
= optab_for_tree_code (code
, vectype
, optab_default
);
4377 if (optab_handler (optab
, mode
) == CODE_FOR_nothing
)
4378 reduce_with_shift
= false;
4381 if (reduce_with_shift
&& !slp_reduc
)
4383 int nelements
= vec_size_in_bits
/ element_bitsize
;
4384 unsigned char *sel
= XALLOCAVEC (unsigned char, nelements
);
4388 tree zero_vec
= build_zero_cst (vectype
);
4389 /*** Case 2: Create:
4390 for (offset = nelements/2; offset >= 1; offset/=2)
4392 Create: va' = vec_shift <va, offset>
4393 Create: va = vop <va, va'>
4398 if (dump_enabled_p ())
4399 dump_printf_loc (MSG_NOTE
, vect_location
,
4400 "Reduce using vector shifts\n");
4402 vec_dest
= vect_create_destination_var (scalar_dest
, vectype
);
4403 new_temp
= new_phi_result
;
4404 for (elt_offset
= nelements
/ 2;
4408 calc_vec_perm_mask_for_shift (mode
, elt_offset
, sel
);
4409 tree mask
= vect_gen_perm_mask_any (vectype
, sel
);
4410 epilog_stmt
= gimple_build_assign (vec_dest
, VEC_PERM_EXPR
,
4411 new_temp
, zero_vec
, mask
);
4412 new_name
= make_ssa_name (vec_dest
, epilog_stmt
);
4413 gimple_assign_set_lhs (epilog_stmt
, new_name
);
4414 gsi_insert_before (&exit_gsi
, epilog_stmt
, GSI_SAME_STMT
);
4416 epilog_stmt
= gimple_build_assign (vec_dest
, code
, new_name
,
4418 new_temp
= make_ssa_name (vec_dest
, epilog_stmt
);
4419 gimple_assign_set_lhs (epilog_stmt
, new_temp
);
4420 gsi_insert_before (&exit_gsi
, epilog_stmt
, GSI_SAME_STMT
);
4423 /* 2.4 Extract the final scalar result. Create:
4424 s_out3 = extract_field <v_out2, bitpos> */
4426 if (dump_enabled_p ())
4427 dump_printf_loc (MSG_NOTE
, vect_location
,
4428 "extract scalar result\n");
4430 rhs
= build3 (BIT_FIELD_REF
, scalar_type
, new_temp
,
4431 bitsize
, bitsize_zero_node
);
4432 epilog_stmt
= gimple_build_assign (new_scalar_dest
, rhs
);
4433 new_temp
= make_ssa_name (new_scalar_dest
, epilog_stmt
);
4434 gimple_assign_set_lhs (epilog_stmt
, new_temp
);
4435 gsi_insert_before (&exit_gsi
, epilog_stmt
, GSI_SAME_STMT
);
4436 scalar_results
.safe_push (new_temp
);
4440 /*** Case 3: Create:
4441 s = extract_field <v_out2, 0>
4442 for (offset = element_size;
4443 offset < vector_size;
4444 offset += element_size;)
4446 Create: s' = extract_field <v_out2, offset>
4447 Create: s = op <s, s'> // For non SLP cases
4450 if (dump_enabled_p ())
4451 dump_printf_loc (MSG_NOTE
, vect_location
,
4452 "Reduce using scalar code.\n");
4454 vec_size_in_bits
= tree_to_uhwi (TYPE_SIZE (vectype
));
4455 FOR_EACH_VEC_ELT (new_phis
, i
, new_phi
)
4458 if (gimple_code (new_phi
) == GIMPLE_PHI
)
4459 vec_temp
= PHI_RESULT (new_phi
);
4461 vec_temp
= gimple_assign_lhs (new_phi
);
4462 tree rhs
= build3 (BIT_FIELD_REF
, scalar_type
, vec_temp
, bitsize
,
4464 epilog_stmt
= gimple_build_assign (new_scalar_dest
, rhs
);
4465 new_temp
= make_ssa_name (new_scalar_dest
, epilog_stmt
);
4466 gimple_assign_set_lhs (epilog_stmt
, new_temp
);
4467 gsi_insert_before (&exit_gsi
, epilog_stmt
, GSI_SAME_STMT
);
4469 /* In SLP we don't need to apply reduction operation, so we just
4470 collect s' values in SCALAR_RESULTS. */
4472 scalar_results
.safe_push (new_temp
);
4474 for (bit_offset
= element_bitsize
;
4475 bit_offset
< vec_size_in_bits
;
4476 bit_offset
+= element_bitsize
)
4478 tree bitpos
= bitsize_int (bit_offset
);
4479 tree rhs
= build3 (BIT_FIELD_REF
, scalar_type
, vec_temp
,
4482 epilog_stmt
= gimple_build_assign (new_scalar_dest
, rhs
);
4483 new_name
= make_ssa_name (new_scalar_dest
, epilog_stmt
);
4484 gimple_assign_set_lhs (epilog_stmt
, new_name
);
4485 gsi_insert_before (&exit_gsi
, epilog_stmt
, GSI_SAME_STMT
);
4489 /* In SLP we don't need to apply reduction operation, so
4490 we just collect s' values in SCALAR_RESULTS. */
4491 new_temp
= new_name
;
4492 scalar_results
.safe_push (new_name
);
4496 epilog_stmt
= gimple_build_assign (new_scalar_dest
, code
,
4497 new_name
, new_temp
);
4498 new_temp
= make_ssa_name (new_scalar_dest
, epilog_stmt
);
4499 gimple_assign_set_lhs (epilog_stmt
, new_temp
);
4500 gsi_insert_before (&exit_gsi
, epilog_stmt
, GSI_SAME_STMT
);
4505 /* The only case where we need to reduce scalar results in SLP, is
4506 unrolling. If the size of SCALAR_RESULTS is greater than
4507 GROUP_SIZE, we reduce them combining elements modulo
4511 tree res
, first_res
, new_res
;
4514 /* Reduce multiple scalar results in case of SLP unrolling. */
4515 for (j
= group_size
; scalar_results
.iterate (j
, &res
);
4518 first_res
= scalar_results
[j
% group_size
];
4519 new_stmt
= gimple_build_assign (new_scalar_dest
, code
,
4521 new_res
= make_ssa_name (new_scalar_dest
, new_stmt
);
4522 gimple_assign_set_lhs (new_stmt
, new_res
);
4523 gsi_insert_before (&exit_gsi
, new_stmt
, GSI_SAME_STMT
);
4524 scalar_results
[j
% group_size
] = new_res
;
4528 /* Not SLP - we have one scalar to keep in SCALAR_RESULTS. */
4529 scalar_results
.safe_push (new_temp
);
4533 vect_finalize_reduction
:
4538 /* 2.5 Adjust the final result by the initial value of the reduction
4539 variable. (When such adjustment is not needed, then
4540 'adjustment_def' is zero). For example, if code is PLUS we create:
4541 new_temp = loop_exit_def + adjustment_def */
4545 gcc_assert (!slp_reduc
);
4546 if (nested_in_vect_loop
)
4548 new_phi
= new_phis
[0];
4549 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def
)) == VECTOR_TYPE
);
4550 expr
= build2 (code
, vectype
, PHI_RESULT (new_phi
), adjustment_def
);
4551 new_dest
= vect_create_destination_var (scalar_dest
, vectype
);
4555 new_temp
= scalar_results
[0];
4556 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def
)) != VECTOR_TYPE
);
4557 expr
= build2 (code
, scalar_type
, new_temp
, adjustment_def
);
4558 new_dest
= vect_create_destination_var (scalar_dest
, scalar_type
);
4561 epilog_stmt
= gimple_build_assign (new_dest
, expr
);
4562 new_temp
= make_ssa_name (new_dest
, epilog_stmt
);
4563 gimple_assign_set_lhs (epilog_stmt
, new_temp
);
4564 gsi_insert_before (&exit_gsi
, epilog_stmt
, GSI_SAME_STMT
);
4565 if (nested_in_vect_loop
)
4567 set_vinfo_for_stmt (epilog_stmt
,
4568 new_stmt_vec_info (epilog_stmt
, loop_vinfo
,
4570 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (epilog_stmt
)) =
4571 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (new_phi
));
4574 scalar_results
.quick_push (new_temp
);
4576 scalar_results
[0] = new_temp
;
4579 scalar_results
[0] = new_temp
;
4581 new_phis
[0] = epilog_stmt
;
4584 /* 2.6 Handle the loop-exit phis. Replace the uses of scalar loop-exit
4585 phis with new adjusted scalar results, i.e., replace use <s_out0>
4590 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
4591 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
4592 v_out2 = reduce <v_out1>
4593 s_out3 = extract_field <v_out2, 0>
4594 s_out4 = adjust_result <s_out3>
4601 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
4602 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
4603 v_out2 = reduce <v_out1>
4604 s_out3 = extract_field <v_out2, 0>
4605 s_out4 = adjust_result <s_out3>
4610 /* In SLP reduction chain we reduce vector results into one vector if
4611 necessary, hence we set here GROUP_SIZE to 1. SCALAR_DEST is the LHS of
4612 the last stmt in the reduction chain, since we are looking for the loop
4614 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt
)))
4616 gimple dest_stmt
= SLP_TREE_SCALAR_STMTS (slp_node
)[group_size
- 1];
4617 /* Handle reduction patterns. */
4618 if (STMT_VINFO_RELATED_STMT (vinfo_for_stmt (dest_stmt
)))
4619 dest_stmt
= STMT_VINFO_RELATED_STMT (vinfo_for_stmt (dest_stmt
));
4621 scalar_dest
= gimple_assign_lhs (dest_stmt
);
4625 /* In SLP we may have several statements in NEW_PHIS and REDUCTION_PHIS (in
4626 case that GROUP_SIZE is greater than vectorization factor). Therefore, we
4627 need to match SCALAR_RESULTS with corresponding statements. The first
4628 (GROUP_SIZE / number of new vector stmts) scalar results correspond to
4629 the first vector stmt, etc.
4630 (RATIO is equal to (GROUP_SIZE / number of new vector stmts)). */
4631 if (group_size
> new_phis
.length ())
4633 ratio
= group_size
/ new_phis
.length ();
4634 gcc_assert (!(group_size
% new_phis
.length ()));
4639 for (k
= 0; k
< group_size
; k
++)
4643 epilog_stmt
= new_phis
[k
/ ratio
];
4644 reduction_phi
= reduction_phis
[k
/ ratio
];
4646 inner_phi
= inner_phis
[k
/ ratio
];
4651 gimple current_stmt
= SLP_TREE_SCALAR_STMTS (slp_node
)[k
];
4653 orig_stmt
= STMT_VINFO_RELATED_STMT (vinfo_for_stmt (current_stmt
));
4654 /* SLP statements can't participate in patterns. */
4655 gcc_assert (!orig_stmt
);
4656 scalar_dest
= gimple_assign_lhs (current_stmt
);
4660 /* Find the loop-closed-use at the loop exit of the original scalar
4661 result. (The reduction result is expected to have two immediate uses -
4662 one at the latch block, and one at the loop exit). */
4663 FOR_EACH_IMM_USE_FAST (use_p
, imm_iter
, scalar_dest
)
4664 if (!flow_bb_inside_loop_p (loop
, gimple_bb (USE_STMT (use_p
)))
4665 && !is_gimple_debug (USE_STMT (use_p
)))
4666 phis
.safe_push (USE_STMT (use_p
));
4668 /* While we expect to have found an exit_phi because of loop-closed-ssa
4669 form we can end up without one if the scalar cycle is dead. */
4671 FOR_EACH_VEC_ELT (phis
, i
, exit_phi
)
4675 stmt_vec_info exit_phi_vinfo
= vinfo_for_stmt (exit_phi
);
4678 /* FORNOW. Currently not supporting the case that an inner-loop
4679 reduction is not used in the outer-loop (but only outside the
4680 outer-loop), unless it is double reduction. */
4681 gcc_assert ((STMT_VINFO_RELEVANT_P (exit_phi_vinfo
)
4682 && !STMT_VINFO_LIVE_P (exit_phi_vinfo
))
4686 STMT_VINFO_VEC_STMT (exit_phi_vinfo
) = inner_phi
;
4688 STMT_VINFO_VEC_STMT (exit_phi_vinfo
) = epilog_stmt
;
4690 || STMT_VINFO_DEF_TYPE (exit_phi_vinfo
)
4691 != vect_double_reduction_def
)
4694 /* Handle double reduction:
4696 stmt1: s1 = phi <s0, s2> - double reduction phi (outer loop)
4697 stmt2: s3 = phi <s1, s4> - (regular) reduc phi (inner loop)
4698 stmt3: s4 = use (s3) - (regular) reduc stmt (inner loop)
4699 stmt4: s2 = phi <s4> - double reduction stmt (outer loop)
4701 At that point the regular reduction (stmt2 and stmt3) is
4702 already vectorized, as well as the exit phi node, stmt4.
4703 Here we vectorize the phi node of double reduction, stmt1, and
4704 update all relevant statements. */
4706 /* Go through all the uses of s2 to find double reduction phi
4707 node, i.e., stmt1 above. */
4708 orig_name
= PHI_RESULT (exit_phi
);
4709 FOR_EACH_IMM_USE_STMT (use_stmt
, imm_iter
, orig_name
)
4711 stmt_vec_info use_stmt_vinfo
;
4712 stmt_vec_info new_phi_vinfo
;
4713 tree vect_phi_init
, preheader_arg
, vect_phi_res
, init_def
;
4714 basic_block bb
= gimple_bb (use_stmt
);
4717 /* Check that USE_STMT is really double reduction phi
4719 if (gimple_code (use_stmt
) != GIMPLE_PHI
4720 || gimple_phi_num_args (use_stmt
) != 2
4721 || bb
->loop_father
!= outer_loop
)
4723 use_stmt_vinfo
= vinfo_for_stmt (use_stmt
);
4725 || STMT_VINFO_DEF_TYPE (use_stmt_vinfo
)
4726 != vect_double_reduction_def
)
4729 /* Create vector phi node for double reduction:
4730 vs1 = phi <vs0, vs2>
4731 vs1 was created previously in this function by a call to
4732 vect_get_vec_def_for_operand and is stored in
4734 vs2 is defined by INNER_PHI, the vectorized EXIT_PHI;
4735 vs0 is created here. */
4737 /* Create vector phi node. */
4738 vect_phi
= create_phi_node (vec_initial_def
, bb
);
4739 new_phi_vinfo
= new_stmt_vec_info (vect_phi
,
4740 loop_vec_info_for_loop (outer_loop
), NULL
);
4741 set_vinfo_for_stmt (vect_phi
, new_phi_vinfo
);
4743 /* Create vs0 - initial def of the double reduction phi. */
4744 preheader_arg
= PHI_ARG_DEF_FROM_EDGE (use_stmt
,
4745 loop_preheader_edge (outer_loop
));
4746 init_def
= get_initial_def_for_reduction (stmt
,
4747 preheader_arg
, NULL
);
4748 vect_phi_init
= vect_init_vector (use_stmt
, init_def
,
4751 /* Update phi node arguments with vs0 and vs2. */
4752 add_phi_arg (vect_phi
, vect_phi_init
,
4753 loop_preheader_edge (outer_loop
),
4755 add_phi_arg (vect_phi
, PHI_RESULT (inner_phi
),
4756 loop_latch_edge (outer_loop
), UNKNOWN_LOCATION
);
4757 if (dump_enabled_p ())
4759 dump_printf_loc (MSG_NOTE
, vect_location
,
4760 "created double reduction phi node: ");
4761 dump_gimple_stmt (MSG_NOTE
, TDF_SLIM
, vect_phi
, 0);
4762 dump_printf (MSG_NOTE
, "\n");
4765 vect_phi_res
= PHI_RESULT (vect_phi
);
4767 /* Replace the use, i.e., set the correct vs1 in the regular
4768 reduction phi node. FORNOW, NCOPIES is always 1, so the
4769 loop is redundant. */
4770 use
= reduction_phi
;
4771 for (j
= 0; j
< ncopies
; j
++)
4773 edge pr_edge
= loop_preheader_edge (loop
);
4774 SET_PHI_ARG_DEF (use
, pr_edge
->dest_idx
, vect_phi_res
);
4775 use
= STMT_VINFO_RELATED_STMT (vinfo_for_stmt (use
));
4782 if (nested_in_vect_loop
)
4791 /* Find the loop-closed-use at the loop exit of the original scalar
4792 result. (The reduction result is expected to have two immediate uses,
4793 one at the latch block, and one at the loop exit). For double
4794 reductions we are looking for exit phis of the outer loop. */
4795 FOR_EACH_IMM_USE_FAST (use_p
, imm_iter
, scalar_dest
)
4797 if (!flow_bb_inside_loop_p (loop
, gimple_bb (USE_STMT (use_p
))))
4799 if (!is_gimple_debug (USE_STMT (use_p
)))
4800 phis
.safe_push (USE_STMT (use_p
));
4804 if (double_reduc
&& gimple_code (USE_STMT (use_p
)) == GIMPLE_PHI
)
4806 tree phi_res
= PHI_RESULT (USE_STMT (use_p
));
4808 FOR_EACH_IMM_USE_FAST (phi_use_p
, phi_imm_iter
, phi_res
)
4810 if (!flow_bb_inside_loop_p (loop
,
4811 gimple_bb (USE_STMT (phi_use_p
)))
4812 && !is_gimple_debug (USE_STMT (phi_use_p
)))
4813 phis
.safe_push (USE_STMT (phi_use_p
));
4819 FOR_EACH_VEC_ELT (phis
, i
, exit_phi
)
4821 /* Replace the uses: */
4822 orig_name
= PHI_RESULT (exit_phi
);
4823 scalar_result
= scalar_results
[k
];
4824 FOR_EACH_IMM_USE_STMT (use_stmt
, imm_iter
, orig_name
)
4825 FOR_EACH_IMM_USE_ON_STMT (use_p
, imm_iter
)
4826 SET_USE (use_p
, scalar_result
);
4834 /* Function vectorizable_reduction.
4836 Check if STMT performs a reduction operation that can be vectorized.
4837 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
4838 stmt to replace it, put it in VEC_STMT, and insert it at GSI.
4839 Return FALSE if not a vectorizable STMT, TRUE otherwise.
4841 This function also handles reduction idioms (patterns) that have been
4842 recognized in advance during vect_pattern_recog. In this case, STMT may be
4844 X = pattern_expr (arg0, arg1, ..., X)
4845 and its STMT_VINFO_RELATED_STMT points to the last stmt in the original
4846 sequence that had been detected and replaced by the pattern-stmt (STMT).
4848 In some cases of reduction patterns, the type of the reduction variable X is
4849 different than the type of the other arguments of STMT.
4850 In such cases, the vectype that is used when transforming STMT into a vector
4851 stmt is different than the vectype that is used to determine the
4852 vectorization factor, because it consists of a different number of elements
4853 than the actual number of elements that are being operated upon in parallel.
4855 For example, consider an accumulation of shorts into an int accumulator.
4856 On some targets it's possible to vectorize this pattern operating on 8
4857 shorts at a time (hence, the vectype for purposes of determining the
4858 vectorization factor should be V8HI); on the other hand, the vectype that
4859 is used to create the vector form is actually V4SI (the type of the result).
4861 Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
4862 indicates what is the actual level of parallelism (V8HI in the example), so
4863 that the right vectorization factor would be derived. This vectype
4864 corresponds to the type of arguments to the reduction stmt, and should *NOT*
4865 be used to create the vectorized stmt. The right vectype for the vectorized
4866 stmt is obtained from the type of the result X:
4867 get_vectype_for_scalar_type (TREE_TYPE (X))
4869 This means that, contrary to "regular" reductions (or "regular" stmts in
4870 general), the following equation:
4871 STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (TREE_TYPE (X))
4872 does *NOT* necessarily hold for reduction patterns. */
4875 vectorizable_reduction (gimple stmt
, gimple_stmt_iterator
*gsi
,
4876 gimple
*vec_stmt
, slp_tree slp_node
)
4880 tree loop_vec_def0
= NULL_TREE
, loop_vec_def1
= NULL_TREE
;
4881 stmt_vec_info stmt_info
= vinfo_for_stmt (stmt
);
4882 tree vectype_out
= STMT_VINFO_VECTYPE (stmt_info
);
4883 tree vectype_in
= NULL_TREE
;
4884 loop_vec_info loop_vinfo
= STMT_VINFO_LOOP_VINFO (stmt_info
);
4885 struct loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
4886 enum tree_code code
, orig_code
, epilog_reduc_code
;
4887 machine_mode vec_mode
;
4889 optab optab
, reduc_optab
;
4890 tree new_temp
= NULL_TREE
;
4893 enum vect_def_type dt
;
4894 gphi
*new_phi
= NULL
;
4898 stmt_vec_info orig_stmt_info
;
4899 tree expr
= NULL_TREE
;
4903 stmt_vec_info prev_stmt_info
, prev_phi_info
;
4904 bool single_defuse_cycle
= false;
4905 tree reduc_def
= NULL_TREE
;
4906 gimple new_stmt
= NULL
;
4909 bool nested_cycle
= false, found_nested_cycle_def
= false;
4910 gimple reduc_def_stmt
= NULL
;
4911 bool double_reduc
= false, dummy
;
4913 struct loop
* def_stmt_loop
, *outer_loop
= NULL
;
4915 gimple def_arg_stmt
;
4916 auto_vec
<tree
> vec_oprnds0
;
4917 auto_vec
<tree
> vec_oprnds1
;
4918 auto_vec
<tree
> vect_defs
;
4919 auto_vec
<gimple
> phis
;
4921 tree def0
, def1
, tem
, op0
, op1
= NULL_TREE
;
4922 bool first_p
= true;
4924 /* In case of reduction chain we switch to the first stmt in the chain, but
4925 we don't update STMT_INFO, since only the last stmt is marked as reduction
4926 and has reduction properties. */
4927 if (GROUP_FIRST_ELEMENT (stmt_info
)
4928 && GROUP_FIRST_ELEMENT (stmt_info
) != stmt
)
4930 stmt
= GROUP_FIRST_ELEMENT (stmt_info
);
4934 if (nested_in_vect_loop_p (loop
, stmt
))
4938 nested_cycle
= true;
4941 /* 1. Is vectorizable reduction? */
4942 /* Not supportable if the reduction variable is used in the loop, unless
4943 it's a reduction chain. */
4944 if (STMT_VINFO_RELEVANT (stmt_info
) > vect_used_in_outer
4945 && !GROUP_FIRST_ELEMENT (stmt_info
))
4948 /* Reductions that are not used even in an enclosing outer-loop,
4949 are expected to be "live" (used out of the loop). */
4950 if (STMT_VINFO_RELEVANT (stmt_info
) == vect_unused_in_scope
4951 && !STMT_VINFO_LIVE_P (stmt_info
))
4954 /* Make sure it was already recognized as a reduction computation. */
4955 if (STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmt
)) != vect_reduction_def
4956 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmt
)) != vect_nested_cycle
)
4959 /* 2. Has this been recognized as a reduction pattern?
4961 Check if STMT represents a pattern that has been recognized
4962 in earlier analysis stages. For stmts that represent a pattern,
4963 the STMT_VINFO_RELATED_STMT field records the last stmt in
4964 the original sequence that constitutes the pattern. */
4966 orig_stmt
= STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt
));
4969 orig_stmt_info
= vinfo_for_stmt (orig_stmt
);
4970 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info
));
4971 gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info
));
4974 /* 3. Check the operands of the operation. The first operands are defined
4975 inside the loop body. The last operand is the reduction variable,
4976 which is defined by the loop-header-phi. */
4978 gcc_assert (is_gimple_assign (stmt
));
4981 switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt
)))
4983 case GIMPLE_SINGLE_RHS
:
4984 op_type
= TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt
));
4985 if (op_type
== ternary_op
)
4987 tree rhs
= gimple_assign_rhs1 (stmt
);
4988 ops
[0] = TREE_OPERAND (rhs
, 0);
4989 ops
[1] = TREE_OPERAND (rhs
, 1);
4990 ops
[2] = TREE_OPERAND (rhs
, 2);
4991 code
= TREE_CODE (rhs
);
4997 case GIMPLE_BINARY_RHS
:
4998 code
= gimple_assign_rhs_code (stmt
);
4999 op_type
= TREE_CODE_LENGTH (code
);
5000 gcc_assert (op_type
== binary_op
);
5001 ops
[0] = gimple_assign_rhs1 (stmt
);
5002 ops
[1] = gimple_assign_rhs2 (stmt
);
5005 case GIMPLE_TERNARY_RHS
:
5006 code
= gimple_assign_rhs_code (stmt
);
5007 op_type
= TREE_CODE_LENGTH (code
);
5008 gcc_assert (op_type
== ternary_op
);
5009 ops
[0] = gimple_assign_rhs1 (stmt
);
5010 ops
[1] = gimple_assign_rhs2 (stmt
);
5011 ops
[2] = gimple_assign_rhs3 (stmt
);
5014 case GIMPLE_UNARY_RHS
:
5020 /* The default is that the reduction variable is the last in statement. */
5021 int reduc_index
= op_type
- 1;
5023 if (code
== COND_EXPR
&& slp_node
)
5026 scalar_dest
= gimple_assign_lhs (stmt
);
5027 scalar_type
= TREE_TYPE (scalar_dest
);
5028 if (!POINTER_TYPE_P (scalar_type
) && !INTEGRAL_TYPE_P (scalar_type
)
5029 && !SCALAR_FLOAT_TYPE_P (scalar_type
))
5032 /* Do not try to vectorize bit-precision reductions. */
5033 if ((TYPE_PRECISION (scalar_type
)
5034 != GET_MODE_PRECISION (TYPE_MODE (scalar_type
))))
5037 /* All uses but the last are expected to be defined in the loop.
5038 The last use is the reduction variable. In case of nested cycle this
5039 assumption is not true: we use reduc_index to record the index of the
5040 reduction variable. */
5041 for (i
= 0; i
< op_type
- 1; i
++)
5043 /* The condition of COND_EXPR is checked in vectorizable_condition(). */
5044 if (i
== 0 && code
== COND_EXPR
)
5047 is_simple_use
= vect_is_simple_use_1 (ops
[i
], stmt
, loop_vinfo
, NULL
,
5048 &def_stmt
, &def
, &dt
, &tem
);
5051 gcc_assert (is_simple_use
);
5053 if (dt
!= vect_internal_def
5054 && dt
!= vect_external_def
5055 && dt
!= vect_constant_def
5056 && dt
!= vect_induction_def
5057 && !(dt
== vect_nested_cycle
&& nested_cycle
))
5060 if (dt
== vect_nested_cycle
)
5062 found_nested_cycle_def
= true;
5063 reduc_def_stmt
= def_stmt
;
5068 is_simple_use
= vect_is_simple_use_1 (ops
[i
], stmt
, loop_vinfo
, NULL
,
5069 &def_stmt
, &def
, &dt
, &tem
);
5072 gcc_assert (is_simple_use
);
5073 if (!found_nested_cycle_def
)
5074 reduc_def_stmt
= def_stmt
;
5076 if (reduc_def_stmt
&& gimple_code (reduc_def_stmt
) != GIMPLE_PHI
)
5079 if (!(dt
== vect_reduction_def
5080 || dt
== vect_nested_cycle
5081 || ((dt
== vect_internal_def
|| dt
== vect_external_def
5082 || dt
== vect_constant_def
|| dt
== vect_induction_def
)
5083 && nested_cycle
&& found_nested_cycle_def
)))
5085 /* For pattern recognized stmts, orig_stmt might be a reduction,
5086 but some helper statements for the pattern might not, or
5087 might be COND_EXPRs with reduction uses in the condition. */
5088 gcc_assert (orig_stmt
);
5092 gimple tmp
= vect_is_simple_reduction (loop_vinfo
, reduc_def_stmt
,
5093 !nested_cycle
, &dummy
, false);
5095 gcc_assert (tmp
== orig_stmt
5096 || GROUP_FIRST_ELEMENT (vinfo_for_stmt (tmp
)) == orig_stmt
);
5098 /* We changed STMT to be the first stmt in reduction chain, hence we
5099 check that in this case the first element in the chain is STMT. */
5100 gcc_assert (stmt
== tmp
5101 || GROUP_FIRST_ELEMENT (vinfo_for_stmt (tmp
)) == stmt
);
5103 if (STMT_VINFO_LIVE_P (vinfo_for_stmt (reduc_def_stmt
)))
5106 if (slp_node
|| PURE_SLP_STMT (stmt_info
))
5109 ncopies
= (LOOP_VINFO_VECT_FACTOR (loop_vinfo
)
5110 / TYPE_VECTOR_SUBPARTS (vectype_in
));
5112 gcc_assert (ncopies
>= 1);
5114 vec_mode
= TYPE_MODE (vectype_in
);
5116 if (code
== COND_EXPR
)
5118 if (!vectorizable_condition (stmt
, gsi
, NULL
, ops
[reduc_index
], 0, NULL
))
5120 if (dump_enabled_p ())
5121 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
5122 "unsupported condition in reduction\n");
5129 /* 4. Supportable by target? */
5131 if (code
== LSHIFT_EXPR
|| code
== RSHIFT_EXPR
5132 || code
== LROTATE_EXPR
|| code
== RROTATE_EXPR
)
5134 /* Shifts and rotates are only supported by vectorizable_shifts,
5135 not vectorizable_reduction. */
5136 if (dump_enabled_p ())
5137 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
5138 "unsupported shift or rotation.\n");
5142 /* 4.1. check support for the operation in the loop */
5143 optab
= optab_for_tree_code (code
, vectype_in
, optab_default
);
5146 if (dump_enabled_p ())
5147 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
5153 if (optab_handler (optab
, vec_mode
) == CODE_FOR_nothing
)
5155 if (dump_enabled_p ())
5156 dump_printf (MSG_NOTE
, "op not supported by target.\n");
5158 if (GET_MODE_SIZE (vec_mode
) != UNITS_PER_WORD
5159 || LOOP_VINFO_VECT_FACTOR (loop_vinfo
)
5160 < vect_min_worthwhile_factor (code
))
5163 if (dump_enabled_p ())
5164 dump_printf (MSG_NOTE
, "proceeding using word mode.\n");
5167 /* Worthwhile without SIMD support? */
5168 if (!VECTOR_MODE_P (TYPE_MODE (vectype_in
))
5169 && LOOP_VINFO_VECT_FACTOR (loop_vinfo
)
5170 < vect_min_worthwhile_factor (code
))
5172 if (dump_enabled_p ())
5173 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
5174 "not worthwhile without SIMD support.\n");
5180 /* 4.2. Check support for the epilog operation.
5182 If STMT represents a reduction pattern, then the type of the
5183 reduction variable may be different than the type of the rest
5184 of the arguments. For example, consider the case of accumulation
5185 of shorts into an int accumulator; The original code:
5186 S1: int_a = (int) short_a;
5187 orig_stmt-> S2: int_acc = plus <int_a ,int_acc>;
5190 STMT: int_acc = widen_sum <short_a, int_acc>
5193 1. The tree-code that is used to create the vector operation in the
5194 epilog code (that reduces the partial results) is not the
5195 tree-code of STMT, but is rather the tree-code of the original
5196 stmt from the pattern that STMT is replacing. I.e, in the example
5197 above we want to use 'widen_sum' in the loop, but 'plus' in the
5199 2. The type (mode) we use to check available target support
5200 for the vector operation to be created in the *epilog*, is
5201 determined by the type of the reduction variable (in the example
5202 above we'd check this: optab_handler (plus_optab, vect_int_mode])).
5203 However the type (mode) we use to check available target support
5204 for the vector operation to be created *inside the loop*, is
5205 determined by the type of the other arguments to STMT (in the
5206 example we'd check this: optab_handler (widen_sum_optab,
5209 This is contrary to "regular" reductions, in which the types of all
5210 the arguments are the same as the type of the reduction variable.
5211 For "regular" reductions we can therefore use the same vector type
5212 (and also the same tree-code) when generating the epilog code and
5213 when generating the code inside the loop. */
5217 /* This is a reduction pattern: get the vectype from the type of the
5218 reduction variable, and get the tree-code from orig_stmt. */
5219 orig_code
= gimple_assign_rhs_code (orig_stmt
);
5220 gcc_assert (vectype_out
);
5221 vec_mode
= TYPE_MODE (vectype_out
);
5225 /* Regular reduction: use the same vectype and tree-code as used for
5226 the vector code inside the loop can be used for the epilog code. */
5232 def_bb
= gimple_bb (reduc_def_stmt
);
5233 def_stmt_loop
= def_bb
->loop_father
;
5234 def_arg
= PHI_ARG_DEF_FROM_EDGE (reduc_def_stmt
,
5235 loop_preheader_edge (def_stmt_loop
));
5236 if (TREE_CODE (def_arg
) == SSA_NAME
5237 && (def_arg_stmt
= SSA_NAME_DEF_STMT (def_arg
))
5238 && gimple_code (def_arg_stmt
) == GIMPLE_PHI
5239 && flow_bb_inside_loop_p (outer_loop
, gimple_bb (def_arg_stmt
))
5240 && vinfo_for_stmt (def_arg_stmt
)
5241 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_arg_stmt
))
5242 == vect_double_reduction_def
)
5243 double_reduc
= true;
5246 epilog_reduc_code
= ERROR_MARK
;
5247 if (reduction_code_for_scalar_code (orig_code
, &epilog_reduc_code
))
5249 reduc_optab
= optab_for_tree_code (epilog_reduc_code
, vectype_out
,
5253 if (dump_enabled_p ())
5254 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
5255 "no optab for reduction.\n");
5257 epilog_reduc_code
= ERROR_MARK
;
5259 else if (optab_handler (reduc_optab
, vec_mode
) == CODE_FOR_nothing
)
5261 optab
= scalar_reduc_to_vector (reduc_optab
, vectype_out
);
5262 if (optab_handler (optab
, vec_mode
) == CODE_FOR_nothing
)
5264 if (dump_enabled_p ())
5265 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
5266 "reduc op not supported by target.\n");
5268 epilog_reduc_code
= ERROR_MARK
;
5274 if (!nested_cycle
|| double_reduc
)
5276 if (dump_enabled_p ())
5277 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
5278 "no reduc code for scalar code.\n");
5284 if (double_reduc
&& ncopies
> 1)
5286 if (dump_enabled_p ())
5287 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
5288 "multiple types in double reduction\n");
5293 /* In case of widening multiplication by a constant, we update the type
5294 of the constant to be the type of the other operand. We check that the
5295 constant fits the type in the pattern recognition pass. */
5296 if (code
== DOT_PROD_EXPR
5297 && !types_compatible_p (TREE_TYPE (ops
[0]), TREE_TYPE (ops
[1])))
5299 if (TREE_CODE (ops
[0]) == INTEGER_CST
)
5300 ops
[0] = fold_convert (TREE_TYPE (ops
[1]), ops
[0]);
5301 else if (TREE_CODE (ops
[1]) == INTEGER_CST
)
5302 ops
[1] = fold_convert (TREE_TYPE (ops
[0]), ops
[1]);
5305 if (dump_enabled_p ())
5306 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
5307 "invalid types in dot-prod\n");
5313 if (!vec_stmt
) /* transformation not required. */
5316 && !vect_model_reduction_cost (stmt_info
, epilog_reduc_code
, ncopies
,
5319 STMT_VINFO_TYPE (stmt_info
) = reduc_vec_info_type
;
5325 if (dump_enabled_p ())
5326 dump_printf_loc (MSG_NOTE
, vect_location
, "transform reduction.\n");
5328 /* FORNOW: Multiple types are not supported for condition. */
5329 if (code
== COND_EXPR
)
5330 gcc_assert (ncopies
== 1);
5332 /* Create the destination vector */
5333 vec_dest
= vect_create_destination_var (scalar_dest
, vectype_out
);
5335 /* In case the vectorization factor (VF) is bigger than the number
5336 of elements that we can fit in a vectype (nunits), we have to generate
5337 more than one vector stmt - i.e - we need to "unroll" the
5338 vector stmt by a factor VF/nunits. For more details see documentation
5339 in vectorizable_operation. */
5341 /* If the reduction is used in an outer loop we need to generate
5342 VF intermediate results, like so (e.g. for ncopies=2):
5347 (i.e. we generate VF results in 2 registers).
5348 In this case we have a separate def-use cycle for each copy, and therefore
5349 for each copy we get the vector def for the reduction variable from the
5350 respective phi node created for this copy.
5352 Otherwise (the reduction is unused in the loop nest), we can combine
5353 together intermediate results, like so (e.g. for ncopies=2):
5357 (i.e. we generate VF/2 results in a single register).
5358 In this case for each copy we get the vector def for the reduction variable
5359 from the vectorized reduction operation generated in the previous iteration.
5362 if (STMT_VINFO_RELEVANT (stmt_info
) == vect_unused_in_scope
)
5364 single_defuse_cycle
= true;
5368 epilog_copies
= ncopies
;
5370 prev_stmt_info
= NULL
;
5371 prev_phi_info
= NULL
;
5373 vec_num
= SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node
);
5377 vec_oprnds0
.create (1);
5378 if (op_type
== ternary_op
)
5379 vec_oprnds1
.create (1);
5382 phis
.create (vec_num
);
5383 vect_defs
.create (vec_num
);
5385 vect_defs
.quick_push (NULL_TREE
);
5387 for (j
= 0; j
< ncopies
; j
++)
5389 if (j
== 0 || !single_defuse_cycle
)
5391 for (i
= 0; i
< vec_num
; i
++)
5393 /* Create the reduction-phi that defines the reduction
5395 new_phi
= create_phi_node (vec_dest
, loop
->header
);
5396 set_vinfo_for_stmt (new_phi
,
5397 new_stmt_vec_info (new_phi
, loop_vinfo
,
5399 if (j
== 0 || slp_node
)
5400 phis
.quick_push (new_phi
);
5404 if (code
== COND_EXPR
)
5406 gcc_assert (!slp_node
);
5407 vectorizable_condition (stmt
, gsi
, vec_stmt
,
5408 PHI_RESULT (phis
[0]),
5410 /* Multiple types are not supported for condition. */
5417 op0
= ops
[!reduc_index
];
5418 if (op_type
== ternary_op
)
5420 if (reduc_index
== 0)
5427 vect_get_vec_defs (op0
, op1
, stmt
, &vec_oprnds0
, &vec_oprnds1
,
5431 loop_vec_def0
= vect_get_vec_def_for_operand (ops
[!reduc_index
],
5433 vec_oprnds0
.quick_push (loop_vec_def0
);
5434 if (op_type
== ternary_op
)
5436 loop_vec_def1
= vect_get_vec_def_for_operand (op1
, stmt
,
5438 vec_oprnds1
.quick_push (loop_vec_def1
);
5446 enum vect_def_type dt
;
5450 vect_is_simple_use (ops
[!reduc_index
], stmt
, loop_vinfo
, NULL
,
5451 &dummy_stmt
, &dummy
, &dt
);
5452 loop_vec_def0
= vect_get_vec_def_for_stmt_copy (dt
,
5454 vec_oprnds0
[0] = loop_vec_def0
;
5455 if (op_type
== ternary_op
)
5457 vect_is_simple_use (op1
, stmt
, loop_vinfo
, NULL
, &dummy_stmt
,
5459 loop_vec_def1
= vect_get_vec_def_for_stmt_copy (dt
,
5461 vec_oprnds1
[0] = loop_vec_def1
;
5465 if (single_defuse_cycle
)
5466 reduc_def
= gimple_assign_lhs (new_stmt
);
5468 STMT_VINFO_RELATED_STMT (prev_phi_info
) = new_phi
;
5471 FOR_EACH_VEC_ELT (vec_oprnds0
, i
, def0
)
5474 reduc_def
= PHI_RESULT (phis
[i
]);
5477 if (!single_defuse_cycle
|| j
== 0)
5478 reduc_def
= PHI_RESULT (new_phi
);
5481 def1
= ((op_type
== ternary_op
)
5482 ? vec_oprnds1
[i
] : NULL
);
5483 if (op_type
== binary_op
)
5485 if (reduc_index
== 0)
5486 expr
= build2 (code
, vectype_out
, reduc_def
, def0
);
5488 expr
= build2 (code
, vectype_out
, def0
, reduc_def
);
5492 if (reduc_index
== 0)
5493 expr
= build3 (code
, vectype_out
, reduc_def
, def0
, def1
);
5496 if (reduc_index
== 1)
5497 expr
= build3 (code
, vectype_out
, def0
, reduc_def
, def1
);
5499 expr
= build3 (code
, vectype_out
, def0
, def1
, reduc_def
);
5503 new_stmt
= gimple_build_assign (vec_dest
, expr
);
5504 new_temp
= make_ssa_name (vec_dest
, new_stmt
);
5505 gimple_assign_set_lhs (new_stmt
, new_temp
);
5506 vect_finish_stmt_generation (stmt
, new_stmt
, gsi
);
5510 SLP_TREE_VEC_STMTS (slp_node
).quick_push (new_stmt
);
5511 vect_defs
.quick_push (new_temp
);
5514 vect_defs
[0] = new_temp
;
5521 STMT_VINFO_VEC_STMT (stmt_info
) = *vec_stmt
= new_stmt
;
5523 STMT_VINFO_RELATED_STMT (prev_stmt_info
) = new_stmt
;
5525 prev_stmt_info
= vinfo_for_stmt (new_stmt
);
5526 prev_phi_info
= vinfo_for_stmt (new_phi
);
5529 /* Finalize the reduction-phi (set its arguments) and create the
5530 epilog reduction code. */
5531 if ((!single_defuse_cycle
|| code
== COND_EXPR
) && !slp_node
)
5533 new_temp
= gimple_assign_lhs (*vec_stmt
);
5534 vect_defs
[0] = new_temp
;
5537 vect_create_epilog_for_reduction (vect_defs
, stmt
, epilog_copies
,
5538 epilog_reduc_code
, phis
, reduc_index
,
5539 double_reduc
, slp_node
);
5544 /* Function vect_min_worthwhile_factor.
5546 For a loop where we could vectorize the operation indicated by CODE,
5547 return the minimum vectorization factor that makes it worthwhile
5548 to use generic vectors. */
/* NOTE(review): only the signature survives in this extraction; the
   function body (original lines ~5551-5570) is missing.  Presumably a
   switch over CODE returning a small integer threshold -- confirm
   against the complete tree-vect-loop.c.  */
5550 vect_min_worthwhile_factor (enum tree_code code
)
5571 /* Function vectorizable_induction
5573 Check if PHI performs an induction computation that can be vectorized.
5574 If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
5575 phi to replace it, put it in VEC_STMT, and add it to the same basic block.
5576 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
/* NOTE(review): this chunk is a garbled extraction -- each statement is
   split across lines, the original file's line numbers are fused into
   the text, and many interior lines are missing.  The comments below
   annotate only what the visible fragments establish.  */
5579 vectorizable_induction (gimple phi
, gimple_stmt_iterator
*gsi ATTRIBUTE_UNUSED
,
5582 stmt_vec_info stmt_info
= vinfo_for_stmt (phi
);
5583 tree vectype
= STMT_VINFO_VECTYPE (stmt_info
);
5584 loop_vec_info loop_vinfo
= STMT_VINFO_LOOP_VINFO (stmt_info
);
5585 struct loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
5586 int nunits
= TYPE_VECTOR_SUBPARTS (vectype
);
/* ncopies = vectorization factor / units per vector; must be >= 1.  */
5587 int ncopies
= LOOP_VINFO_VECT_FACTOR (loop_vinfo
) / nunits
;
5590 gcc_assert (ncopies
>= 1);
5591 /* FORNOW. These restrictions should be relaxed. */
/* Nested-loop case: for an induction PHI inside an inner loop of an
   outer vectorized loop, scan the immediate uses of the latch-edge
   definition and reject when the value is only used outside the inner
   loop (see the exit_phi check and the dump message below).  */
5592 if (nested_in_vect_loop_p (loop
, phi
))
5594 imm_use_iterator imm_iter
;
5595 use_operand_p use_p
;
5602 if (dump_enabled_p ())
5603 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
5604 "multiple types in nested loop.\n");
5609 latch_e
= loop_latch_edge (loop
->inner
);
5610 loop_arg
= PHI_ARG_DEF_FROM_EDGE (phi
, latch_e
);
5611 FOR_EACH_IMM_USE_FAST (use_p
, imm_iter
, loop_arg
)
5613 gimple use_stmt
= USE_STMT (use_p
);
/* Debug uses do not count as real uses of the latch value.  */
5614 if (is_gimple_debug (use_stmt
))
5617 if (!flow_bb_inside_loop_p (loop
->inner
, gimple_bb (use_stmt
)))
5619 exit_phi
= use_stmt
;
5625 stmt_vec_info exit_phi_vinfo
= vinfo_for_stmt (exit_phi
);
5626 if (!(STMT_VINFO_RELEVANT_P (exit_phi_vinfo
)
5627 && !STMT_VINFO_LIVE_P (exit_phi_vinfo
)))
5629 if (dump_enabled_p ())
5630 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
5631 "inner-loop induction only used outside "
5632 "of the outer vectorized loop.\n");
5638 if (!STMT_VINFO_RELEVANT_P (stmt_info
))
5641 /* FORNOW: SLP not supported. */
5642 if (STMT_SLP_TYPE (stmt_info
))
5645 gcc_assert (STMT_VINFO_DEF_TYPE (stmt_info
) == vect_induction_def
)
5647 if (gimple_code (phi
) != GIMPLE_PHI
)
/* Analysis phase (no VEC_STMT requested): record the stmt type and
   model the induction cost; no code is generated.  */
5650 if (!vec_stmt
) /* transformation not required. */
5652 STMT_VINFO_TYPE (stmt_info
) = induc_vec_info_type
;
5653 if (dump_enabled_p ())
5654 dump_printf_loc (MSG_NOTE
, vect_location
,
5655 "=== vectorizable_induction ===\n");
5656 vect_model_induction_cost (stmt_info
, ncopies
);
/* Transformation phase: build the vectorized induction definition and
   hand back its defining statement through *VEC_STMT.  */
5662 if (dump_enabled_p ())
5663 dump_printf_loc (MSG_NOTE
, vect_location
, "transform induction phi.\n");
5665 vec_def
= get_initial_def_for_induction (phi
);
5666 *vec_stmt
= SSA_NAME_DEF_STMT (vec_def
);
5670 /* Function vectorizable_live_operation.
5672 STMT computes a value that is used outside the loop. Check if
5673 it can be supported. */
/* NOTE(review): garbled extraction -- statements split across lines,
   original line numbers fused into the text, several interior lines
   missing (e.g. the bodies of early-return branches).  Annotations
   below cover only the visible fragments.  */
5676 vectorizable_live_operation (gimple stmt
,
5677 gimple_stmt_iterator
*gsi ATTRIBUTE_UNUSED
,
5680 stmt_vec_info stmt_info
= vinfo_for_stmt (stmt
);
5681 loop_vec_info loop_vinfo
= STMT_VINFO_LOOP_VINFO (stmt_info
);
5682 struct loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
5688 enum vect_def_type dt
;
5689 enum tree_code code
;
5690 enum gimple_rhs_class rhs_class
;
5692 gcc_assert (STMT_VINFO_LIVE_P (stmt_info
));
/* Reduction defs are handled by the reduction machinery, not here;
   presumably this branch returns early -- body not visible.  */
5694 if (STMT_VINFO_DEF_TYPE (stmt_info
) == vect_reduction_def
)
5697 if (!is_gimple_assign (stmt
))
/* Special case: a live IFN_GOMP_SIMD_LANE call.  After vectorization
   the value used past the loop exit is the last lane, i.e. VF - 1;
   the code below rewrites the matching PHI argument in the exit
   (merge) block to that constant.  */
5699 if (gimple_call_internal_p (stmt
)
5700 && gimple_call_internal_fn (stmt
) == IFN_GOMP_SIMD_LANE
5701 && gimple_call_lhs (stmt
)
5703 && TREE_CODE (gimple_call_arg (stmt
, 0)) == SSA_NAME
5705 == SSA_NAME_VAR (gimple_call_arg (stmt
, 0)))
5707 edge e
= single_exit (loop
);
5708 basic_block merge_bb
= e
->dest
;
5709 imm_use_iterator imm_iter
;
5710 use_operand_p use_p
;
5711 tree lhs
= gimple_call_lhs (stmt
);
5713 FOR_EACH_IMM_USE_FAST (use_p
, imm_iter
, lhs
)
5715 gimple use_stmt
= USE_STMT (use_p
);
5716 if (gimple_code (use_stmt
) == GIMPLE_PHI
5717 && gimple_bb (use_stmt
) == merge_bb
)
5722 = build_int_cst (unsigned_type_node
,
5723 loop_vinfo
->vectorization_factor
- 1);
5724 SET_PHI_ARG_DEF (use_stmt
, e
->dest_idx
, vfm1
);
5734 if (TREE_CODE (gimple_assign_lhs (stmt
)) != SSA_NAME
)
5737 /* FORNOW. CHECKME. */
5738 if (nested_in_vect_loop_p (loop
, stmt
))
/* Sanity-check that the RHS class agrees with the operand count
   derived from the tree code.  */
5741 code
= gimple_assign_rhs_code (stmt
);
5742 op_type
= TREE_CODE_LENGTH (code
);
5743 rhs_class
= get_gimple_rhs_class (code
);
5744 gcc_assert (rhs_class
!= GIMPLE_UNARY_RHS
|| op_type
== unary_op
);
5745 gcc_assert (rhs_class
!= GIMPLE_BINARY_RHS
|| op_type
== binary_op
);
5747 /* FORNOW: support only if all uses are invariant. This means
5748 that the scalar operations can remain in place, unvectorized.
5749 The original last scalar value that they compute will be used. */
5751 for (i
= 0; i
< op_type
; i
++)
5753 if (rhs_class
== GIMPLE_SINGLE_RHS
)
5754 op
= TREE_OPERAND (gimple_op (stmt
, 1), i
);
5756 op
= gimple_op (stmt
, i
+ 1);
5758 && !vect_is_simple_use (op
, stmt
, loop_vinfo
, NULL
, &def_stmt
, &def
,
5761 if (dump_enabled_p ())
5762 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
5763 "use not simple.\n");
/* Only external or constant (loop-invariant) operand defs are
   supported; anything else presumably fails -- body not visible.  */
5767 if (dt
!= vect_external_def
&& dt
!= vect_constant_def
)
5771 /* No transformation is required for the cases we currently support. */
5775 /* Kill any debug uses outside LOOP of SSA names defined in STMT. */
/* NOTE(review): garbled extraction; fragments only.  Visible logic:
   for every SSA def of STMT (PHI or ordinary stmt), walk its immediate
   uses; for each debug-bind use whose basic block lies outside LOOP,
   reset the bind value and update the statement so the debug info does
   not reference a name the vectorizer is about to invalidate.  */
5778 vect_loop_kill_debug_uses (struct loop
*loop
, gimple stmt
)
5780 ssa_op_iter op_iter
;
5781 imm_use_iterator imm_iter
;
5782 def_operand_p def_p
;
5785 FOR_EACH_PHI_OR_STMT_DEF (def_p
, stmt
, op_iter
, SSA_OP_DEF
)
5787 FOR_EACH_IMM_USE_STMT (ustmt
, imm_iter
, DEF_FROM_PTR (def_p
))
/* Non-debug uses are left untouched.  */
5791 if (!is_gimple_debug (ustmt
))
5794 bb
= gimple_bb (ustmt
);
/* Only uses OUTSIDE the loop are killed.  */
5796 if (!flow_bb_inside_loop_p (loop
, bb
))
5798 if (gimple_debug_bind_p (ustmt
))
5800 if (dump_enabled_p ())
5801 dump_printf_loc (MSG_NOTE
, vect_location
,
5802 "killing debug use\n");
5804 gimple_debug_bind_reset_value (ustmt
);
5805 update_stmt (ustmt
);
5815 /* This function builds ni_name = number of iterations. Statements
5816 are emitted on the loop preheader edge. */
/* NOTE(review): garbled extraction; fragments only.  */
5819 vect_build_loop_niters (loop_vec_info loop_vinfo
)
/* Unshare the niters expression so the gimplification below does not
   modify the loop_vinfo copy.  */
5821 tree ni
= unshare_expr (LOOP_VINFO_NITERS (loop_vinfo
))
/* Constant iteration count: no statements needed -- presumably the
   constant is returned directly (branch body not visible).  */
5822 if (TREE_CODE (ni
) == INTEGER_CST
)
5827 gimple_seq stmts
= NULL
;
5828 edge pe
= loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo
));
/* Otherwise gimplify NI into a fresh "niters" SSA name and insert the
   computation on the preheader edge.  */
5830 var
= create_tmp_var (TREE_TYPE (ni
), "niters");
5831 ni_name
= force_gimple_operand (ni
, &stmts
, false, var
);
5833 gsi_insert_seq_on_edge_immediate (pe
, stmts
);
5840 /* This function generates the following statements:
5842 ni_name = number of iterations loop executes
5843 ratio = ni_name / vf
5844 ratio_mult_vf_name = ratio * vf
5846 and places them on the loop preheader edge. */
/* NOTE(review): garbled extraction -- statements split across lines and
   interior lines missing (e.g. the ratio_name declaration and parts of
   the fold_build2 operand lists).  Annotations cover visible fragments
   only.  */
5849 vect_generate_tmps_on_preheader (loop_vec_info loop_vinfo
,
5851 tree
*ratio_mult_vf_name_ptr
,
5852 tree
*ratio_name_ptr
)
5854 tree ni_minus_gap_name
;
5857 tree ratio_mult_vf_name
;
5858 int vf
= LOOP_VINFO_VECT_FACTOR (loop_vinfo
);
5859 edge pe
= loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo
));
/* log2(VF) as a constant of niters' type; VF is a power of two here
   (exact_log2), presumably guaranteed by the vectorizer -- confirm.  */
5862 log_vf
= build_int_cst (TREE_TYPE (ni_name
), exact_log2 (vf
));
5864 /* If epilogue loop is required because of data accesses with gaps, we
5865 subtract one iteration from the total number of iterations here for
5866 correct calculation of RATIO. */
5867 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo
))
5869 ni_minus_gap_name
= fold_build2 (MINUS_EXPR
, TREE_TYPE (ni_name
),
5871 build_one_cst (TREE_TYPE (ni_name
)));
/* If the folded expression is not already a gimple value, gimplify it
   into an "ni_gap" temporary on the preheader edge.  */
5872 if (!is_gimple_val (ni_minus_gap_name
))
5874 var
= create_tmp_var (TREE_TYPE (ni_name
), "ni_gap");
5875 gimple stmts
= NULL
;
5876 ni_minus_gap_name
= force_gimple_operand (ni_minus_gap_name
, &stmts
,
5878 gsi_insert_seq_on_edge_immediate (pe
, stmts
);
5882 ni_minus_gap_name
= ni_name
;
5884 /* Create: ratio = ni >> log2(vf) */
5885 /* ??? As we have ni == number of latch executions + 1, ni could
5886 have overflown to zero. So avoid computing ratio based on ni
5887 but compute it using the fact that we know ratio will be at least
5888 one, thus via (ni - vf) >> log2(vf) + 1. */
5890 = fold_build2 (PLUS_EXPR
, TREE_TYPE (ni_name
),
5891 fold_build2 (RSHIFT_EXPR
, TREE_TYPE (ni_name
),
5892 fold_build2 (MINUS_EXPR
, TREE_TYPE (ni_name
),
5895 (TREE_TYPE (ni_name
), vf
)),
5897 build_int_cst (TREE_TYPE (ni_name
), 1));
/* Gimplify the ratio into a "bnd" temporary if needed, then report it
   through *RATIO_NAME_PTR.  */
5898 if (!is_gimple_val (ratio_name
))
5900 var
= create_tmp_var (TREE_TYPE (ni_name
), "bnd");
5901 gimple stmts
= NULL
;
5902 ratio_name
= force_gimple_operand (ratio_name
, &stmts
, true, var
);
5903 gsi_insert_seq_on_edge_immediate (pe
, stmts
);
5905 *ratio_name_ptr
= ratio_name
;
5907 /* Create: ratio_mult_vf = ratio << log2 (vf). */
/* Only computed when the caller asked for it (pointer non-null).  */
5909 if (ratio_mult_vf_name_ptr
)
5911 ratio_mult_vf_name
= fold_build2 (LSHIFT_EXPR
, TREE_TYPE (ratio_name
),
5912 ratio_name
, log_vf
);
5913 if (!is_gimple_val (ratio_mult_vf_name
))
5915 var
= create_tmp_var (TREE_TYPE (ni_name
), "ratio_mult_vf");
5916 gimple stmts
= NULL
;
5917 ratio_mult_vf_name
= force_gimple_operand (ratio_mult_vf_name
, &stmts
,
5919 gsi_insert_seq_on_edge_immediate (pe
, stmts
);
5921 *ratio_mult_vf_name_ptr
= ratio_mult_vf_name
;
5928 /* Function vect_transform_loop.
5930 The analysis phase has determined that the loop is vectorizable.
5931 Vectorize the loop - created vectorized stmts to replace the scalar
5932 stmts in the loop, and update the loop exit condition. */
5935 vect_transform_loop (loop_vec_info loop_vinfo
)
5937 struct loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
5938 basic_block
*bbs
= LOOP_VINFO_BBS (loop_vinfo
);
5939 int nbbs
= loop
->num_nodes
;
5942 int vectorization_factor
= LOOP_VINFO_VECT_FACTOR (loop_vinfo
);
5944 bool slp_scheduled
= false;
5945 gimple stmt
, pattern_stmt
;
5946 gimple_seq pattern_def_seq
= NULL
;
5947 gimple_stmt_iterator pattern_def_si
= gsi_none ();
5948 bool transform_pattern_stmt
= false;
5949 bool check_profitability
= false;
5951 /* Record number of iterations before we started tampering with the profile. */
5952 gcov_type expected_iterations
= expected_loop_iterations_unbounded (loop
);
5954 if (dump_enabled_p ())
5955 dump_printf_loc (MSG_NOTE
, vect_location
, "=== vec_transform_loop ===\n");
5957 /* If profile is inprecise, we have chance to fix it up. */
5958 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo
))
5959 expected_iterations
= LOOP_VINFO_INT_NITERS (loop_vinfo
);
5961 /* Use the more conservative vectorization threshold. If the number
5962 of iterations is constant assume the cost check has been performed
5963 by our caller. If the threshold makes all loops profitable that
5964 run at least the vectorization factor number of times checking
5965 is pointless, too. */
5966 th
= LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo
);
5967 if (th
>= LOOP_VINFO_VECT_FACTOR (loop_vinfo
) - 1
5968 && !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo
))
5970 if (dump_enabled_p ())
5971 dump_printf_loc (MSG_NOTE
, vect_location
,
5972 "Profitability threshold is %d loop iterations.\n",
5974 check_profitability
= true;
5977 /* Version the loop first, if required, so the profitability check
5980 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo
)
5981 || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo
))
5983 vect_loop_versioning (loop_vinfo
, th
, check_profitability
);
5984 check_profitability
= false;
5987 tree ni_name
= vect_build_loop_niters (loop_vinfo
);
5988 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo
) = ni_name
;
5990 /* Peel the loop if there are data refs with unknown alignment.
5991 Only one data ref with unknown store is allowed. */
5993 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo
))
5995 vect_do_peeling_for_alignment (loop_vinfo
, ni_name
,
5996 th
, check_profitability
);
5997 check_profitability
= false;
5998 /* The above adjusts LOOP_VINFO_NITERS, so cause ni_name to
6000 ni_name
= NULL_TREE
;
6003 /* If the loop has a symbolic number of iterations 'n' (i.e. it's not a
6004 compile time constant), or it is a constant that doesn't divide by the
6005 vectorization factor, then an epilog loop needs to be created.
6006 We therefore duplicate the loop: the original loop will be vectorized,
6007 and will compute the first (n/VF) iterations. The second copy of the loop
6008 will remain scalar and will compute the remaining (n%VF) iterations.
6009 (VF is the vectorization factor). */
6011 if (LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo
)
6012 || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo
))
6016 ni_name
= vect_build_loop_niters (loop_vinfo
);
6017 vect_generate_tmps_on_preheader (loop_vinfo
, ni_name
, &ratio_mult_vf
,
6019 vect_do_peeling_for_loop_bound (loop_vinfo
, ni_name
, ratio_mult_vf
,
6020 th
, check_profitability
);
6022 else if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo
))
6023 ratio
= build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo
)),
6024 LOOP_VINFO_INT_NITERS (loop_vinfo
) / vectorization_factor
);
6028 ni_name
= vect_build_loop_niters (loop_vinfo
);
6029 vect_generate_tmps_on_preheader (loop_vinfo
, ni_name
, NULL
, &ratio
);
6032 /* 1) Make sure the loop header has exactly two entries
6033 2) Make sure we have a preheader basic block. */
6035 gcc_assert (EDGE_COUNT (loop
->header
->preds
) == 2);
6037 split_edge (loop_preheader_edge (loop
));
6039 /* FORNOW: the vectorizer supports only loops which body consist
6040 of one basic block (header + empty latch). When the vectorizer will
6041 support more involved loop forms, the order by which the BBs are
6042 traversed need to be reconsidered. */
6044 for (i
= 0; i
< nbbs
; i
++)
6046 basic_block bb
= bbs
[i
];
6047 stmt_vec_info stmt_info
;
6049 for (gphi_iterator si
= gsi_start_phis (bb
); !gsi_end_p (si
);
6052 gphi
*phi
= si
.phi ();
6053 if (dump_enabled_p ())
6055 dump_printf_loc (MSG_NOTE
, vect_location
,
6056 "------>vectorizing phi: ");
6057 dump_gimple_stmt (MSG_NOTE
, TDF_SLIM
, phi
, 0);
6058 dump_printf (MSG_NOTE
, "\n");
6060 stmt_info
= vinfo_for_stmt (phi
);
6064 if (MAY_HAVE_DEBUG_STMTS
&& !STMT_VINFO_LIVE_P (stmt_info
))
6065 vect_loop_kill_debug_uses (loop
, phi
);
6067 if (!STMT_VINFO_RELEVANT_P (stmt_info
)
6068 && !STMT_VINFO_LIVE_P (stmt_info
))
6071 if (STMT_VINFO_VECTYPE (stmt_info
)
6072 && (TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info
))
6073 != (unsigned HOST_WIDE_INT
) vectorization_factor
)
6074 && dump_enabled_p ())
6075 dump_printf_loc (MSG_NOTE
, vect_location
, "multiple-types.\n");
6077 if (STMT_VINFO_DEF_TYPE (stmt_info
) == vect_induction_def
)
6079 if (dump_enabled_p ())
6080 dump_printf_loc (MSG_NOTE
, vect_location
, "transform phi.\n");
6081 vect_transform_stmt (phi
, NULL
, NULL
, NULL
, NULL
);
6085 pattern_stmt
= NULL
;
6086 for (gimple_stmt_iterator si
= gsi_start_bb (bb
);
6087 !gsi_end_p (si
) || transform_pattern_stmt
;)
6091 if (transform_pattern_stmt
)
6092 stmt
= pattern_stmt
;
6095 stmt
= gsi_stmt (si
);
6096 /* During vectorization remove existing clobber stmts. */
6097 if (gimple_clobber_p (stmt
))
6099 unlink_stmt_vdef (stmt
);
6100 gsi_remove (&si
, true);
6101 release_defs (stmt
);
6106 if (dump_enabled_p ())
6108 dump_printf_loc (MSG_NOTE
, vect_location
,
6109 "------>vectorizing statement: ");
6110 dump_gimple_stmt (MSG_NOTE
, TDF_SLIM
, stmt
, 0);
6111 dump_printf (MSG_NOTE
, "\n");
6114 stmt_info
= vinfo_for_stmt (stmt
);
6116 /* vector stmts created in the outer-loop during vectorization of
6117 stmts in an inner-loop may not have a stmt_info, and do not
6118 need to be vectorized. */
6125 if (MAY_HAVE_DEBUG_STMTS
&& !STMT_VINFO_LIVE_P (stmt_info
))
6126 vect_loop_kill_debug_uses (loop
, stmt
);
6128 if (!STMT_VINFO_RELEVANT_P (stmt_info
)
6129 && !STMT_VINFO_LIVE_P (stmt_info
))
6131 if (STMT_VINFO_IN_PATTERN_P (stmt_info
)
6132 && (pattern_stmt
= STMT_VINFO_RELATED_STMT (stmt_info
))
6133 && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt
))
6134 || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt
))))
6136 stmt
= pattern_stmt
;
6137 stmt_info
= vinfo_for_stmt (stmt
);
6145 else if (STMT_VINFO_IN_PATTERN_P (stmt_info
)
6146 && (pattern_stmt
= STMT_VINFO_RELATED_STMT (stmt_info
))
6147 && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt
))
6148 || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt
))))
6149 transform_pattern_stmt
= true;
6151 /* If pattern statement has def stmts, vectorize them too. */
6152 if (is_pattern_stmt_p (stmt_info
))
6154 if (pattern_def_seq
== NULL
)
6156 pattern_def_seq
= STMT_VINFO_PATTERN_DEF_SEQ (stmt_info
);
6157 pattern_def_si
= gsi_start (pattern_def_seq
);
6159 else if (!gsi_end_p (pattern_def_si
))
6160 gsi_next (&pattern_def_si
);
6161 if (pattern_def_seq
!= NULL
)
6163 gimple pattern_def_stmt
= NULL
;
6164 stmt_vec_info pattern_def_stmt_info
= NULL
;
6166 while (!gsi_end_p (pattern_def_si
))
6168 pattern_def_stmt
= gsi_stmt (pattern_def_si
);
6169 pattern_def_stmt_info
6170 = vinfo_for_stmt (pattern_def_stmt
);
6171 if (STMT_VINFO_RELEVANT_P (pattern_def_stmt_info
)
6172 || STMT_VINFO_LIVE_P (pattern_def_stmt_info
))
6174 gsi_next (&pattern_def_si
);
6177 if (!gsi_end_p (pattern_def_si
))
6179 if (dump_enabled_p ())
6181 dump_printf_loc (MSG_NOTE
, vect_location
,
6182 "==> vectorizing pattern def "
6184 dump_gimple_stmt (MSG_NOTE
, TDF_SLIM
,
6185 pattern_def_stmt
, 0);
6186 dump_printf (MSG_NOTE
, "\n");
6189 stmt
= pattern_def_stmt
;
6190 stmt_info
= pattern_def_stmt_info
;
6194 pattern_def_si
= gsi_none ();
6195 transform_pattern_stmt
= false;
6199 transform_pattern_stmt
= false;
6202 if (STMT_VINFO_VECTYPE (stmt_info
))
6206 TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info
));
6207 if (!STMT_SLP_TYPE (stmt_info
)
6208 && nunits
!= (unsigned int) vectorization_factor
6209 && dump_enabled_p ())
6210 /* For SLP VF is set according to unrolling factor, and not
6211 to vector size, hence for SLP this print is not valid. */
6212 dump_printf_loc (MSG_NOTE
, vect_location
, "multiple-types.\n");
6215 /* SLP. Schedule all the SLP instances when the first SLP stmt is
6217 if (STMT_SLP_TYPE (stmt_info
))
6221 slp_scheduled
= true;
6223 if (dump_enabled_p ())
6224 dump_printf_loc (MSG_NOTE
, vect_location
,
6225 "=== scheduling SLP instances ===\n");
6227 vect_schedule_slp (loop_vinfo
, NULL
);
6230 /* Hybrid SLP stmts must be vectorized in addition to SLP. */
6231 if (!vinfo_for_stmt (stmt
) || PURE_SLP_STMT (stmt_info
))
6233 if (!transform_pattern_stmt
&& gsi_end_p (pattern_def_si
))
6235 pattern_def_seq
= NULL
;
6242 /* -------- vectorize statement ------------ */
6243 if (dump_enabled_p ())
6244 dump_printf_loc (MSG_NOTE
, vect_location
, "transform statement.\n");
6246 grouped_store
= false;
6247 is_store
= vect_transform_stmt (stmt
, &si
, &grouped_store
, NULL
, NULL
);
6250 if (STMT_VINFO_GROUPED_ACCESS (stmt_info
))
6252 /* Interleaving. If IS_STORE is TRUE, the vectorization of the
6253 interleaving chain was completed - free all the stores in
6256 vect_remove_stores (GROUP_FIRST_ELEMENT (stmt_info
));
6260 /* Free the attached stmt_vec_info and remove the stmt. */
6261 gimple store
= gsi_stmt (si
);
6262 free_stmt_vec_info (store
);
6263 unlink_stmt_vdef (store
);
6264 gsi_remove (&si
, true);
6265 release_defs (store
);
6268 /* Stores can only appear at the end of pattern statements. */
6269 gcc_assert (!transform_pattern_stmt
);
6270 pattern_def_seq
= NULL
;
6272 else if (!transform_pattern_stmt
&& gsi_end_p (pattern_def_si
))
6274 pattern_def_seq
= NULL
;
6280 slpeel_make_loop_iterate_ntimes (loop
, ratio
);
6282 /* Reduce loop iterations by the vectorization factor. */
6283 scale_loop_profile (loop
, GCOV_COMPUTE_SCALE (1, vectorization_factor
),
6284 expected_iterations
/ vectorization_factor
);
6285 loop
->nb_iterations_upper_bound
6286 = wi::udiv_floor (loop
->nb_iterations_upper_bound
, vectorization_factor
);
6287 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo
)
6288 && loop
->nb_iterations_upper_bound
!= 0)
6289 loop
->nb_iterations_upper_bound
= loop
->nb_iterations_upper_bound
- 1;
6290 if (loop
->any_estimate
)
6292 loop
->nb_iterations_estimate
6293 = wi::udiv_floor (loop
->nb_iterations_estimate
, vectorization_factor
);
6294 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo
)
6295 && loop
->nb_iterations_estimate
!= 0)
6296 loop
->nb_iterations_estimate
= loop
->nb_iterations_estimate
- 1;
6299 if (dump_enabled_p ())
6301 dump_printf_loc (MSG_NOTE
, vect_location
,
6302 "LOOP VECTORIZED\n");
6304 dump_printf_loc (MSG_NOTE
, vect_location
,
6305 "OUTER LOOP VECTORIZED\n");
6306 dump_printf (MSG_NOTE
, "\n");