gcc/tree-vect-loop.c
1 /* Loop Vectorization
2 Copyright (C) 2003-2013 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com> and
4 Ira Rosen <irar@il.ibm.com>
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "dumpfile.h"
26 #include "tm.h"
27 #include "ggc.h"
28 #include "tree.h"
29 #include "basic-block.h"
30 #include "gimple-pretty-print.h"
31 #include "tree-flow.h"
32 #include "tree-pass.h"
33 #include "cfgloop.h"
34 #include "expr.h"
35 #include "recog.h"
36 #include "optabs.h"
37 #include "params.h"
38 #include "diagnostic-core.h"
39 #include "tree-chrec.h"
40 #include "tree-scalar-evolution.h"
41 #include "tree-vectorizer.h"
42 #include "target.h"
44 /* Loop Vectorization Pass.
46 This pass tries to vectorize loops.
48 For example, the vectorizer transforms the following simple loop:
50 short a[N]; short b[N]; short c[N]; int i;
52 for (i=0; i<N; i++){
53 a[i] = b[i] + c[i];
 56 as if it were manually vectorized by rewriting the source code into:
58 typedef int __attribute__((mode(V8HI))) v8hi;
59 short a[N]; short b[N]; short c[N]; int i;
60 v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
61 v8hi va, vb, vc;
63 for (i=0; i<N/8; i++){
64 vb = pb[i];
65 vc = pc[i];
66 va = vb + vc;
67 pa[i] = va;
70 The main entry to this pass is vectorize_loops(), in which
71 the vectorizer applies a set of analyses on a given set of loops,
72 followed by the actual vectorization transformation for the loops that
73 had successfully passed the analysis phase.
74 Throughout this pass we make a distinction between two types of
75 data: scalars (which are represented by SSA_NAMES), and memory references
76 ("data-refs"). These two types of data require different handling both
77 during analysis and transformation. The types of data-refs that the
 78 vectorizer currently supports are ARRAY_REFS whose base is an array DECL
79 (not a pointer), and INDIRECT_REFS through pointers; both array and pointer
80 accesses are required to have a simple (consecutive) access pattern.
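   As an aside (an illustrative sketch, not an exhaustive classification),
   the following loads differ in whether they form the simple consecutive
   pattern required here:

     a[i]       consecutive across iterations - the supported form
     a[2*i]     strided - not a simple consecutive pattern
     a[b[i]]    indirect (gather-like) - likewise not handled here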
82 Analysis phase:
83 ===============
84 The driver for the analysis phase is vect_analyze_loop().
85 It applies a set of analyses, some of which rely on the scalar evolution
86 analyzer (scev) developed by Sebastian Pop.
88 During the analysis phase the vectorizer records some information
89 per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
90 loop, as well as general information about the loop as a whole, which is
91 recorded in a "loop_vec_info" struct attached to each loop.
93 Transformation phase:
94 =====================
95 The loop transformation phase scans all the stmts in the loop, and
96 creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
97 the loop that needs to be vectorized. It inserts the vector code sequence
98 just before the scalar stmt S, and records a pointer to the vector code
99 in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
100 attached to S). This pointer will be used for the vectorization of following
101 stmts which use the def of stmt S. Stmt S is removed if it writes to memory;
102 otherwise, we rely on dead code elimination for removing it.
104 For example, say stmt S1 was vectorized into stmt VS1:
106 VS1: vb = px[i];
107 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
108 S2: a = b;
110 To vectorize stmt S2, the vectorizer first finds the stmt that defines
111 the operand 'b' (S1), and gets the relevant vector def 'vb' from the
112 vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)). The
113 resulting sequence would be:
115 VS1: vb = px[i];
116 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
117 VS2: va = vb;
118 S2: a = b; STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2
 120 Operands that are not SSA_NAMEs are data-refs that appear in
121 load/store operations (like 'x[i]' in S1), and are handled differently.
123 Target modeling:
124 =================
125 Currently the only target specific information that is used is the
126 size of the vector (in bytes) - "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".
127 Targets that can support different sizes of vectors, for now will need
128 to specify one value for "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD". More
129 flexibility will be added in the future.
 131 Since we only vectorize operations whose vector form can be
132 expressed using existing tree codes, to verify that an operation is
133 supported, the vectorizer checks the relevant optab at the relevant
 134 machine_mode (e.g., optab_handler (add_optab, V8HImode)). If
135 the value found is CODE_FOR_nothing, then there's no target support, and
136 we can't vectorize the stmt.
138 For additional information on this project see:
139 http://gcc.gnu.org/projects/tree-ssa/vectorization.html
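/* As a concrete illustration of the optab check described above (a sketch
   using the internal names already mentioned; not code taken from this
   pass): to ask whether the target can add two V8HI vectors one checks

     enum insn_code icode = optab_handler (add_optab, V8HImode);
     if (icode == CODE_FOR_nothing)
       give_up ();    hypothetical reaction: no target support

   and only proceeds with vectorizing the stmt when a real insn code is
   returned.  */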
142 static void vect_estimate_min_profitable_iters (loop_vec_info, int *, int *);
144 /* Function vect_determine_vectorization_factor
146 Determine the vectorization factor (VF). VF is the number of data elements
147 that are operated upon in parallel in a single iteration of the vectorized
 148 loop. For example, when vectorizing a loop that operates on 4-byte elements,
 149 on a target with a vector size (VS) of 16 bytes, the VF is set to 4, since 4
150 elements can fit in a single vector register.
152 We currently support vectorization of loops in which all types operated upon
153 are of the same size. Therefore this function currently sets VF according to
154 the size of the types operated upon, and fails if there are multiple sizes
155 in the loop.
157 VF is also the factor by which the loop iterations are strip-mined, e.g.:
158 original loop:
159 for (i=0; i<N; i++){
160 a[i] = b[i] + c[i];
163 vectorized loop:
164 for (i=0; i<N; i+=VF){
165 a[i:VF] = b[i:VF] + c[i:VF];
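/* A worked instance of the above (illustrative only, assuming a 16-byte
   vector size): with 2-byte 'short' elements VF = 16 / 2 = 8, so the
   strip-mined loop advances by 8 elements per iteration:

     for (i=0; i<N; i+=8)
       a[i:8] = b[i:8] + c[i:8];

   A loop that mixes, say, 'short' and 'int' operations has two element
   sizes and is rejected by this function, as noted above.  */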
169 static bool
170 vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
172 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
173 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
174 int nbbs = loop->num_nodes;
175 gimple_stmt_iterator si;
176 unsigned int vectorization_factor = 0;
177 tree scalar_type;
178 gimple phi;
179 tree vectype;
180 unsigned int nunits;
181 stmt_vec_info stmt_info;
182 int i;
183 HOST_WIDE_INT dummy;
184 gimple stmt, pattern_stmt = NULL;
185 gimple_seq pattern_def_seq = NULL;
186 gimple_stmt_iterator pattern_def_si = gsi_none ();
187 bool analyze_pattern_stmt = false;
189 if (dump_enabled_p ())
190 dump_printf_loc (MSG_NOTE, vect_location,
191 "=== vect_determine_vectorization_factor ===");
193 for (i = 0; i < nbbs; i++)
195 basic_block bb = bbs[i];
197 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
199 phi = gsi_stmt (si);
200 stmt_info = vinfo_for_stmt (phi);
201 if (dump_enabled_p ())
203 dump_printf_loc (MSG_NOTE, vect_location, "==> examining phi: ");
204 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
207 gcc_assert (stmt_info);
209 if (STMT_VINFO_RELEVANT_P (stmt_info))
211 gcc_assert (!STMT_VINFO_VECTYPE (stmt_info));
212 scalar_type = TREE_TYPE (PHI_RESULT (phi));
214 if (dump_enabled_p ())
216 dump_printf_loc (MSG_NOTE, vect_location,
217 "get vectype for scalar type: ");
218 dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
221 vectype = get_vectype_for_scalar_type (scalar_type);
222 if (!vectype)
224 if (dump_enabled_p ())
226 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
227 "not vectorized: unsupported "
228 "data-type ");
229 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
230 scalar_type);
232 return false;
234 STMT_VINFO_VECTYPE (stmt_info) = vectype;
236 if (dump_enabled_p ())
238 dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
239 dump_generic_expr (MSG_NOTE, TDF_SLIM, vectype);
242 nunits = TYPE_VECTOR_SUBPARTS (vectype);
243 if (dump_enabled_p ())
244 dump_printf_loc (MSG_NOTE, vect_location, "nunits = %d", nunits);
246 if (!vectorization_factor
247 || (nunits > vectorization_factor))
248 vectorization_factor = nunits;
252 for (si = gsi_start_bb (bb); !gsi_end_p (si) || analyze_pattern_stmt;)
254 tree vf_vectype;
256 if (analyze_pattern_stmt)
257 stmt = pattern_stmt;
258 else
259 stmt = gsi_stmt (si);
261 stmt_info = vinfo_for_stmt (stmt);
263 if (dump_enabled_p ())
265 dump_printf_loc (MSG_NOTE, vect_location,
266 "==> examining statement: ");
267 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
270 gcc_assert (stmt_info);
272 /* Skip stmts which do not need to be vectorized. */
273 if ((!STMT_VINFO_RELEVANT_P (stmt_info)
274 && !STMT_VINFO_LIVE_P (stmt_info))
275 || gimple_clobber_p (stmt))
277 if (STMT_VINFO_IN_PATTERN_P (stmt_info)
278 && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
279 && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
280 || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
282 stmt = pattern_stmt;
283 stmt_info = vinfo_for_stmt (pattern_stmt);
284 if (dump_enabled_p ())
286 dump_printf_loc (MSG_NOTE, vect_location,
287 "==> examining pattern statement: ");
288 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
291 else
293 if (dump_enabled_p ())
294 dump_printf_loc (MSG_NOTE, vect_location, "skip.");
295 gsi_next (&si);
296 continue;
299 else if (STMT_VINFO_IN_PATTERN_P (stmt_info)
300 && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
301 && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
302 || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
303 analyze_pattern_stmt = true;
305 /* If a pattern statement has def stmts, analyze them too. */
306 if (is_pattern_stmt_p (stmt_info))
308 if (pattern_def_seq == NULL)
310 pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
311 pattern_def_si = gsi_start (pattern_def_seq);
313 else if (!gsi_end_p (pattern_def_si))
314 gsi_next (&pattern_def_si);
315 if (pattern_def_seq != NULL)
317 gimple pattern_def_stmt = NULL;
318 stmt_vec_info pattern_def_stmt_info = NULL;
320 while (!gsi_end_p (pattern_def_si))
322 pattern_def_stmt = gsi_stmt (pattern_def_si);
323 pattern_def_stmt_info
324 = vinfo_for_stmt (pattern_def_stmt);
325 if (STMT_VINFO_RELEVANT_P (pattern_def_stmt_info)
326 || STMT_VINFO_LIVE_P (pattern_def_stmt_info))
327 break;
328 gsi_next (&pattern_def_si);
331 if (!gsi_end_p (pattern_def_si))
333 if (dump_enabled_p ())
335 dump_printf_loc (MSG_NOTE, vect_location,
336 "==> examining pattern def stmt: ");
337 dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
338 pattern_def_stmt, 0);
341 stmt = pattern_def_stmt;
342 stmt_info = pattern_def_stmt_info;
344 else
346 pattern_def_si = gsi_none ();
347 analyze_pattern_stmt = false;
350 else
351 analyze_pattern_stmt = false;
354 if (gimple_get_lhs (stmt) == NULL_TREE)
356 if (dump_enabled_p ())
358 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
359 "not vectorized: irregular stmt.");
360 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt,
363 return false;
366 if (VECTOR_MODE_P (TYPE_MODE (gimple_expr_type (stmt))))
368 if (dump_enabled_p ())
370 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
371 "not vectorized: vector stmt in loop:");
372 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
374 return false;
377 if (STMT_VINFO_VECTYPE (stmt_info))
 379 /* The only case in which a vectype has already been set is for stmts
380 that contain a dataref, or for "pattern-stmts" (stmts
381 generated by the vectorizer to represent/replace a certain
382 idiom). */
383 gcc_assert (STMT_VINFO_DATA_REF (stmt_info)
384 || is_pattern_stmt_p (stmt_info)
385 || !gsi_end_p (pattern_def_si));
386 vectype = STMT_VINFO_VECTYPE (stmt_info);
388 else
390 gcc_assert (!STMT_VINFO_DATA_REF (stmt_info));
391 scalar_type = TREE_TYPE (gimple_get_lhs (stmt));
392 if (dump_enabled_p ())
394 dump_printf_loc (MSG_NOTE, vect_location,
395 "get vectype for scalar type: ");
396 dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
398 vectype = get_vectype_for_scalar_type (scalar_type);
399 if (!vectype)
401 if (dump_enabled_p ())
403 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
404 "not vectorized: unsupported "
405 "data-type ");
406 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
407 scalar_type);
409 return false;
412 STMT_VINFO_VECTYPE (stmt_info) = vectype;
414 if (dump_enabled_p ())
416 dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
417 dump_generic_expr (MSG_NOTE, TDF_SLIM, vectype);
 421 /* The vectorization factor is determined according to the smallest
422 scalar type (or the largest vector size, but we only
423 support one vector size per loop). */
424 scalar_type = vect_get_smallest_scalar_type (stmt, &dummy,
425 &dummy);
426 if (dump_enabled_p ())
428 dump_printf_loc (MSG_NOTE, vect_location,
429 "get vectype for scalar type: ");
430 dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
432 vf_vectype = get_vectype_for_scalar_type (scalar_type);
433 if (!vf_vectype)
435 if (dump_enabled_p ())
437 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
438 "not vectorized: unsupported data-type ");
439 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
440 scalar_type);
442 return false;
445 if ((GET_MODE_SIZE (TYPE_MODE (vectype))
446 != GET_MODE_SIZE (TYPE_MODE (vf_vectype))))
448 if (dump_enabled_p ())
450 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
451 "not vectorized: different sized vector "
452 "types in statement, ");
453 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
454 vectype);
455 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
456 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
457 vf_vectype);
459 return false;
462 if (dump_enabled_p ())
464 dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
465 dump_generic_expr (MSG_NOTE, TDF_SLIM, vf_vectype);
468 nunits = TYPE_VECTOR_SUBPARTS (vf_vectype);
469 if (dump_enabled_p ())
470 dump_printf_loc (MSG_NOTE, vect_location, "nunits = %d", nunits);
471 if (!vectorization_factor
472 || (nunits > vectorization_factor))
473 vectorization_factor = nunits;
475 if (!analyze_pattern_stmt && gsi_end_p (pattern_def_si))
477 pattern_def_seq = NULL;
478 gsi_next (&si);
 483 /* TODO: Analyze cost. Decide if worthwhile to vectorize. */
484 if (dump_enabled_p ())
485 dump_printf_loc (MSG_NOTE, vect_location, "vectorization factor = %d",
486 vectorization_factor);
487 if (vectorization_factor <= 1)
489 if (dump_enabled_p ())
490 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
491 "not vectorized: unsupported data-type");
492 return false;
494 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
496 return true;
500 /* Function vect_is_simple_iv_evolution.
 502 FORNOW: A simple evolution of an induction variable in the loop is
503 considered a polynomial evolution. */
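/* For illustration (a sketch in the chrec notation used by the scalar
   evolution analyzer; the loop below is hypothetical): in

     for (i=0; i<n; i++)
       p = p + 4;

   the access function of 'p' is the chrec {p_0, +, 4}_L (L being the loop
   number), whose evolution part in the loop is the invariant step 4 - a
   "simple" evolution in the sense above.  If the step were itself a chrec
   (varying inside the loop), the evolution would not be considered
   simple.  */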
505 static bool
506 vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init,
507 tree * step)
509 tree init_expr;
510 tree step_expr;
511 tree evolution_part = evolution_part_in_loop_num (access_fn, loop_nb);
512 basic_block bb;
514 /* When there is no evolution in this loop, the evolution function
515 is not "simple". */
516 if (evolution_part == NULL_TREE)
517 return false;
519 /* When the evolution is a polynomial of degree >= 2
520 the evolution function is not "simple". */
521 if (tree_is_chrec (evolution_part))
522 return false;
524 step_expr = evolution_part;
525 init_expr = unshare_expr (initial_condition_in_loop_num (access_fn, loop_nb));
527 if (dump_enabled_p ())
529 dump_printf_loc (MSG_NOTE, vect_location, "step: ");
530 dump_generic_expr (MSG_NOTE, TDF_SLIM, step_expr);
531 dump_printf (MSG_NOTE, ", init: ");
532 dump_generic_expr (MSG_NOTE, TDF_SLIM, init_expr);
535 *init = init_expr;
536 *step = step_expr;
538 if (TREE_CODE (step_expr) != INTEGER_CST
539 && (TREE_CODE (step_expr) != SSA_NAME
540 || ((bb = gimple_bb (SSA_NAME_DEF_STMT (step_expr)))
541 && flow_bb_inside_loop_p (get_loop (cfun, loop_nb), bb))
542 || (!INTEGRAL_TYPE_P (TREE_TYPE (step_expr))
543 && (!SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr))
544 || !flag_associative_math)))
545 && (TREE_CODE (step_expr) != REAL_CST
546 || !flag_associative_math))
548 if (dump_enabled_p ())
549 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
550 "step unknown.");
551 return false;
554 return true;
557 /* Function vect_analyze_scalar_cycles_1.
559 Examine the cross iteration def-use cycles of scalar variables
560 in LOOP. LOOP_VINFO represents the loop that is now being
561 considered for vectorization (can be LOOP, or an outer-loop
562 enclosing LOOP). */
564 static void
565 vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, struct loop *loop)
567 basic_block bb = loop->header;
568 tree init, step;
569 vec<gimple> worklist;
570 worklist.create (64);
571 gimple_stmt_iterator gsi;
572 bool double_reduc;
574 if (dump_enabled_p ())
575 dump_printf_loc (MSG_NOTE, vect_location,
576 "=== vect_analyze_scalar_cycles ===");
578 /* First - identify all inductions. Reduction detection assumes that all the
 579 inductions have been identified; therefore, this order must not be
580 changed. */
581 for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
583 gimple phi = gsi_stmt (gsi);
584 tree access_fn = NULL;
585 tree def = PHI_RESULT (phi);
586 stmt_vec_info stmt_vinfo = vinfo_for_stmt (phi);
588 if (dump_enabled_p ())
590 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: ");
591 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
594 /* Skip virtual phi's. The data dependences that are associated with
595 virtual defs/uses (i.e., memory accesses) are analyzed elsewhere. */
596 if (virtual_operand_p (def))
597 continue;
599 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_unknown_def_type;
601 /* Analyze the evolution function. */
602 access_fn = analyze_scalar_evolution (loop, def);
603 if (access_fn)
605 STRIP_NOPS (access_fn);
606 if (dump_enabled_p ())
608 dump_printf_loc (MSG_NOTE, vect_location,
609 "Access function of PHI: ");
610 dump_generic_expr (MSG_NOTE, TDF_SLIM, access_fn);
612 STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo)
613 = evolution_part_in_loop_num (access_fn, loop->num);
616 if (!access_fn
617 || !vect_is_simple_iv_evolution (loop->num, access_fn, &init, &step)
618 || (LOOP_VINFO_LOOP (loop_vinfo) != loop
619 && TREE_CODE (step) != INTEGER_CST))
621 worklist.safe_push (phi);
622 continue;
625 gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo) != NULL_TREE);
627 if (dump_enabled_p ())
628 dump_printf_loc (MSG_NOTE, vect_location, "Detected induction.");
629 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_induction_def;
633 /* Second - identify all reductions and nested cycles. */
634 while (worklist.length () > 0)
636 gimple phi = worklist.pop ();
637 tree def = PHI_RESULT (phi);
638 stmt_vec_info stmt_vinfo = vinfo_for_stmt (phi);
639 gimple reduc_stmt;
640 bool nested_cycle;
642 if (dump_enabled_p ())
644 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: ");
645 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
648 gcc_assert (!virtual_operand_p (def)
649 && STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_unknown_def_type);
651 nested_cycle = (loop != LOOP_VINFO_LOOP (loop_vinfo));
652 reduc_stmt = vect_force_simple_reduction (loop_vinfo, phi, !nested_cycle,
653 &double_reduc);
654 if (reduc_stmt)
656 if (double_reduc)
658 if (dump_enabled_p ())
659 dump_printf_loc (MSG_NOTE, vect_location,
660 "Detected double reduction.");
662 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_double_reduction_def;
663 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
664 vect_double_reduction_def;
666 else
668 if (nested_cycle)
670 if (dump_enabled_p ())
671 dump_printf_loc (MSG_NOTE, vect_location,
672 "Detected vectorizable nested cycle.");
674 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_nested_cycle;
675 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
676 vect_nested_cycle;
678 else
680 if (dump_enabled_p ())
681 dump_printf_loc (MSG_NOTE, vect_location,
682 "Detected reduction.");
684 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def;
685 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
686 vect_reduction_def;
687 /* Store the reduction cycles for possible vectorization in
688 loop-aware SLP. */
689 LOOP_VINFO_REDUCTIONS (loop_vinfo).safe_push (reduc_stmt);
693 else
694 if (dump_enabled_p ())
695 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
696 "Unknown def-use cycle pattern.");
699 worklist.release ();
703 /* Function vect_analyze_scalar_cycles.
705 Examine the cross iteration def-use cycles of scalar variables, by
706 analyzing the loop-header PHIs of scalar variables. Classify each
707 cycle as one of the following: invariant, induction, reduction, unknown.
 708 We do that for the loop represented by LOOP_VINFO, and also for its
 709 inner-loop, if it exists.
710 Examples for scalar cycles:
712 Example1: reduction:
714 loop1:
715 for (i=0; i<N; i++)
716 sum += a[i];
718 Example2: induction:
720 loop2:
721 for (i=0; i<N; i++)
722 a[i] = i; */
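/* Example3 (illustrative, in the style of the examples above): an
   unsupported cycle, e.g. a second-order recurrence:

   loop3:
   prev = 0; cur = 1;
   for (i=0; i<N; i++)
     { next = prev + cur;  prev = cur;  cur = next; }

   The loop-header phis for 'prev' and 'cur' carry a cross-iteration cycle
   that is neither an induction nor a reduction handled here, so it is
   classified as an unknown def-use cycle.  */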
724 static void
725 vect_analyze_scalar_cycles (loop_vec_info loop_vinfo)
727 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
729 vect_analyze_scalar_cycles_1 (loop_vinfo, loop);
731 /* When vectorizing an outer-loop, the inner-loop is executed sequentially.
 732 Reductions in such an inner-loop therefore have different properties than
733 the reductions in the nest that gets vectorized:
734 1. When vectorized, they are executed in the same order as in the original
735 scalar loop, so we can't change the order of computation when
736 vectorizing them.
737 2. FIXME: Inner-loop reductions can be used in the inner-loop, so the
738 current checks are too strict. */
740 if (loop->inner)
741 vect_analyze_scalar_cycles_1 (loop_vinfo, loop->inner);
744 /* Function vect_get_loop_niters.
 746 Determine how many iterations the loop executes.
747 If an expression that represents the number of iterations
748 can be constructed, place it in NUMBER_OF_ITERATIONS.
749 Return the loop exit condition. */
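/* Illustration (hypothetical source loop): for

     for (i=0; i<n; i++)
       a[i] = 0;

   and assuming the loop body is entered, the exit condition 'i < n' is
   executed n times, so NUMBER_OF_ITERATIONS is set to an expression
   equivalent to 'n' and the returned stmt is the GIMPLE_COND testing
   'i < n'.  */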
751 static gimple
752 vect_get_loop_niters (struct loop *loop, tree *number_of_iterations)
754 tree niters;
756 if (dump_enabled_p ())
757 dump_printf_loc (MSG_NOTE, vect_location,
758 "=== get_loop_niters ===");
759 niters = number_of_exit_cond_executions (loop);
761 if (niters != NULL_TREE
762 && niters != chrec_dont_know)
764 *number_of_iterations = niters;
766 if (dump_enabled_p ())
768 dump_printf_loc (MSG_NOTE, vect_location, "==> get_loop_niters:");
769 dump_generic_expr (MSG_NOTE, TDF_SLIM, *number_of_iterations);
773 return get_loop_exit_condition (loop);
777 /* Function bb_in_loop_p
779 Used as predicate for dfs order traversal of the loop bbs. */
781 static bool
782 bb_in_loop_p (const_basic_block bb, const void *data)
784 const struct loop *const loop = (const struct loop *)data;
785 if (flow_bb_inside_loop_p (loop, bb))
786 return true;
787 return false;
791 /* Function new_loop_vec_info.
793 Create and initialize a new loop_vec_info struct for LOOP, as well as
794 stmt_vec_info structs for all the stmts in LOOP. */
796 static loop_vec_info
797 new_loop_vec_info (struct loop *loop)
799 loop_vec_info res;
800 basic_block *bbs;
801 gimple_stmt_iterator si;
802 unsigned int i, nbbs;
804 res = (loop_vec_info) xcalloc (1, sizeof (struct _loop_vec_info));
805 LOOP_VINFO_LOOP (res) = loop;
807 bbs = get_loop_body (loop);
809 /* Create/Update stmt_info for all stmts in the loop. */
810 for (i = 0; i < loop->num_nodes; i++)
812 basic_block bb = bbs[i];
814 /* BBs in a nested inner-loop will have been already processed (because
815 we will have called vect_analyze_loop_form for any nested inner-loop).
816 Therefore, for stmts in an inner-loop we just want to update the
817 STMT_VINFO_LOOP_VINFO field of their stmt_info to point to the new
818 loop_info of the outer-loop we are currently considering to vectorize
819 (instead of the loop_info of the inner-loop).
820 For stmts in other BBs we need to create a stmt_info from scratch. */
821 if (bb->loop_father != loop)
823 /* Inner-loop bb. */
824 gcc_assert (loop->inner && bb->loop_father == loop->inner);
825 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
827 gimple phi = gsi_stmt (si);
828 stmt_vec_info stmt_info = vinfo_for_stmt (phi);
829 loop_vec_info inner_loop_vinfo =
830 STMT_VINFO_LOOP_VINFO (stmt_info);
831 gcc_assert (loop->inner == LOOP_VINFO_LOOP (inner_loop_vinfo));
832 STMT_VINFO_LOOP_VINFO (stmt_info) = res;
834 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
836 gimple stmt = gsi_stmt (si);
837 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
838 loop_vec_info inner_loop_vinfo =
839 STMT_VINFO_LOOP_VINFO (stmt_info);
840 gcc_assert (loop->inner == LOOP_VINFO_LOOP (inner_loop_vinfo));
841 STMT_VINFO_LOOP_VINFO (stmt_info) = res;
844 else
846 /* bb in current nest. */
847 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
849 gimple phi = gsi_stmt (si);
850 gimple_set_uid (phi, 0);
851 set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, res, NULL));
854 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
856 gimple stmt = gsi_stmt (si);
857 gimple_set_uid (stmt, 0);
858 set_vinfo_for_stmt (stmt, new_stmt_vec_info (stmt, res, NULL));
863 /* CHECKME: We want to visit all BBs before their successors (except for
864 latch blocks, for which this assertion wouldn't hold). In the simple
 865 case of the loop forms we allow, a dfs order of the BBs would be the same
866 as reversed postorder traversal, so we are safe. */
868 free (bbs);
869 bbs = XCNEWVEC (basic_block, loop->num_nodes);
870 nbbs = dfs_enumerate_from (loop->header, 0, bb_in_loop_p,
871 bbs, loop->num_nodes, loop);
872 gcc_assert (nbbs == loop->num_nodes);
874 LOOP_VINFO_BBS (res) = bbs;
875 LOOP_VINFO_NITERS (res) = NULL;
876 LOOP_VINFO_NITERS_UNCHANGED (res) = NULL;
877 LOOP_VINFO_COST_MODEL_MIN_ITERS (res) = 0;
878 LOOP_VINFO_VECTORIZABLE_P (res) = 0;
879 LOOP_PEELING_FOR_ALIGNMENT (res) = 0;
880 LOOP_VINFO_VECT_FACTOR (res) = 0;
881 LOOP_VINFO_LOOP_NEST (res).create (3);
882 LOOP_VINFO_DATAREFS (res).create (10);
883 LOOP_VINFO_DDRS (res).create (10 * 10);
884 LOOP_VINFO_UNALIGNED_DR (res) = NULL;
885 LOOP_VINFO_MAY_MISALIGN_STMTS (res).create (
886 PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIGNMENT_CHECKS));
887 LOOP_VINFO_MAY_ALIAS_DDRS (res).create (
888 PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS));
889 LOOP_VINFO_GROUPED_STORES (res).create (10);
890 LOOP_VINFO_REDUCTIONS (res).create (10);
891 LOOP_VINFO_REDUCTION_CHAINS (res).create (10);
892 LOOP_VINFO_SLP_INSTANCES (res).create (10);
893 LOOP_VINFO_SLP_UNROLLING_FACTOR (res) = 1;
894 LOOP_VINFO_TARGET_COST_DATA (res) = init_cost (loop);
895 LOOP_VINFO_PEELING_FOR_GAPS (res) = false;
896 LOOP_VINFO_OPERANDS_SWAPPED (res) = false;
898 return res;
902 /* Function destroy_loop_vec_info.
904 Free LOOP_VINFO struct, as well as all the stmt_vec_info structs of all the
905 stmts in the loop. */
907 void
908 destroy_loop_vec_info (loop_vec_info loop_vinfo, bool clean_stmts)
910 struct loop *loop;
911 basic_block *bbs;
912 int nbbs;
913 gimple_stmt_iterator si;
914 int j;
915 vec<slp_instance> slp_instances;
916 slp_instance instance;
917 bool swapped;
919 if (!loop_vinfo)
920 return;
922 loop = LOOP_VINFO_LOOP (loop_vinfo);
924 bbs = LOOP_VINFO_BBS (loop_vinfo);
925 nbbs = clean_stmts ? loop->num_nodes : 0;
926 swapped = LOOP_VINFO_OPERANDS_SWAPPED (loop_vinfo);
928 for (j = 0; j < nbbs; j++)
930 basic_block bb = bbs[j];
931 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
932 free_stmt_vec_info (gsi_stmt (si));
934 for (si = gsi_start_bb (bb); !gsi_end_p (si); )
936 gimple stmt = gsi_stmt (si);
938 /* We may have broken canonical form by moving a constant
939 into RHS1 of a commutative op. Fix such occurrences. */
940 if (swapped && is_gimple_assign (stmt))
942 enum tree_code code = gimple_assign_rhs_code (stmt);
944 if ((code == PLUS_EXPR
945 || code == POINTER_PLUS_EXPR
946 || code == MULT_EXPR)
947 && CONSTANT_CLASS_P (gimple_assign_rhs1 (stmt)))
948 swap_tree_operands (stmt,
949 gimple_assign_rhs1_ptr (stmt),
950 gimple_assign_rhs2_ptr (stmt));
953 /* Free stmt_vec_info. */
954 free_stmt_vec_info (stmt);
955 gsi_next (&si);
959 free (LOOP_VINFO_BBS (loop_vinfo));
960 free_data_refs (LOOP_VINFO_DATAREFS (loop_vinfo));
961 free_dependence_relations (LOOP_VINFO_DDRS (loop_vinfo));
962 LOOP_VINFO_LOOP_NEST (loop_vinfo).release ();
963 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).release ();
964 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).release ();
965 slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
966 FOR_EACH_VEC_ELT (slp_instances, j, instance)
967 vect_free_slp_instance (instance);
969 LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
970 LOOP_VINFO_GROUPED_STORES (loop_vinfo).release ();
971 LOOP_VINFO_REDUCTIONS (loop_vinfo).release ();
972 LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).release ();
974 if (LOOP_VINFO_PEELING_HTAB (loop_vinfo).is_created ())
975 LOOP_VINFO_PEELING_HTAB (loop_vinfo).dispose ();
977 destroy_cost_data (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo));
979 free (loop_vinfo);
980 loop->aux = NULL;
984 /* Function vect_analyze_loop_1.
986 Apply a set of analyses on LOOP, and create a loop_vec_info struct
987 for it. The different analyses will record information in the
988 loop_vec_info struct. This is a subset of the analyses applied in
989 vect_analyze_loop, to be applied on an inner-loop nested in the loop
990 that is now considered for (outer-loop) vectorization. */
992 static loop_vec_info
993 vect_analyze_loop_1 (struct loop *loop)
995 loop_vec_info loop_vinfo;
997 if (dump_enabled_p ())
998 dump_printf_loc (MSG_NOTE, vect_location,
999 "===== analyze_loop_nest_1 =====");
 1001 /* Check the CFG characteristics of the loop (nesting, entry/exit, etc.). */
1003 loop_vinfo = vect_analyze_loop_form (loop);
1004 if (!loop_vinfo)
1006 if (dump_enabled_p ())
1007 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1008 "bad inner-loop form.");
1009 return NULL;
1012 return loop_vinfo;
1016 /* Function vect_analyze_loop_form.
1018 Verify that certain CFG restrictions hold, including:
1019 - the loop has a pre-header
1020 - the loop has a single entry and exit
1021 - the loop exit condition is simple enough, and the number of iterations
1022 can be analyzed (a countable loop). */
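/* Source-level sketches of these restrictions (hypothetical loops):

     for (i=0; i<n; i++)          single entry/exit, countable -
       a[i] = b[i];               an acceptable loop form

     for (i=0; i<n; i++)
       {
         if (a[i] == key)         extra exit and control flow in the
           break;                 loop body - rejected by the checks
         b[i] = a[i];             below
       }
*/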
1024 loop_vec_info
1025 vect_analyze_loop_form (struct loop *loop)
1027 loop_vec_info loop_vinfo;
1028 gimple loop_cond;
1029 tree number_of_iterations = NULL;
1030 loop_vec_info inner_loop_vinfo = NULL;
1032 if (dump_enabled_p ())
1033 dump_printf_loc (MSG_NOTE, vect_location,
1034 "=== vect_analyze_loop_form ===");
1036 /* Different restrictions apply when we are considering an inner-most loop,
1037 vs. an outer (nested) loop.
1038 (FORNOW. May want to relax some of these restrictions in the future). */
1040 if (!loop->inner)
1042 /* Inner-most loop. We currently require that the number of BBs is
1043 exactly 2 (the header and latch). Vectorizable inner-most loops
1044 look like this:
1046 (pre-header)
1048 header <--------+
1049 | | |
1050 | +--> latch --+
1052 (exit-bb) */
1054 if (loop->num_nodes != 2)
1056 if (dump_enabled_p ())
1057 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1058 "not vectorized: control flow in loop.");
1059 return NULL;
1062 if (empty_block_p (loop->header))
1064 if (dump_enabled_p ())
1065 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1066 "not vectorized: empty loop.");
1067 return NULL;
1070 else
1072 struct loop *innerloop = loop->inner;
1073 edge entryedge;
1075 /* Nested loop. We currently require that the loop is doubly-nested,
1076 contains a single inner loop, and the number of BBs is exactly 5.
1077 Vectorizable outer-loops look like this:
1079 (pre-header)
1081 header <---+
1083 inner-loop |
1085 tail ------+
1087 (exit-bb)
1089 The inner-loop has the properties expected of inner-most loops
1090 as described above. */
1092 if ((loop->inner)->inner || (loop->inner)->next)
1094 if (dump_enabled_p ())
1095 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1096 "not vectorized: multiple nested loops.");
1097 return NULL;
1100 /* Analyze the inner-loop. */
1101 inner_loop_vinfo = vect_analyze_loop_1 (loop->inner);
1102 if (!inner_loop_vinfo)
1104 if (dump_enabled_p ())
1105 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1106 "not vectorized: Bad inner loop.");
1107 return NULL;
1110 if (!expr_invariant_in_loop_p (loop,
1111 LOOP_VINFO_NITERS (inner_loop_vinfo)))
1113 if (dump_enabled_p ())
1114 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1115 "not vectorized: inner-loop count not invariant.");
1116 destroy_loop_vec_info (inner_loop_vinfo, true);
1117 return NULL;
1120 if (loop->num_nodes != 5)
1122 if (dump_enabled_p ())
1123 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1124 "not vectorized: control flow in loop.");
1125 destroy_loop_vec_info (inner_loop_vinfo, true);
1126 return NULL;
1129 gcc_assert (EDGE_COUNT (innerloop->header->preds) == 2);
1130 entryedge = EDGE_PRED (innerloop->header, 0);
1131 if (EDGE_PRED (innerloop->header, 0)->src == innerloop->latch)
1132 entryedge = EDGE_PRED (innerloop->header, 1);
1134 if (entryedge->src != loop->header
1135 || !single_exit (innerloop)
1136 || single_exit (innerloop)->dest != EDGE_PRED (loop->latch, 0)->src)
1138 if (dump_enabled_p ())
1139 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1140 "not vectorized: unsupported outerloop form.");
1141 destroy_loop_vec_info (inner_loop_vinfo, true);
1142 return NULL;
1145 if (dump_enabled_p ())
1146 dump_printf_loc (MSG_NOTE, vect_location,
1147 "Considering outer-loop vectorization.");
1150 if (!single_exit (loop)
1151 || EDGE_COUNT (loop->header->preds) != 2)
1153 if (dump_enabled_p ())
1155 if (!single_exit (loop))
1156 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1157 "not vectorized: multiple exits.");
1158 else if (EDGE_COUNT (loop->header->preds) != 2)
1159 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1160 "not vectorized: too many incoming edges.");
1162 if (inner_loop_vinfo)
1163 destroy_loop_vec_info (inner_loop_vinfo, true);
1164 return NULL;
 1167 /* We assume that the loop exit condition is at the end of the loop, i.e.,
1168 that the loop is represented as a do-while (with a proper if-guard
1169 before the loop if needed), where the loop header contains all the
1170 executable statements, and the latch is empty. */
1171 if (!empty_block_p (loop->latch)
1172 || !gimple_seq_empty_p (phi_nodes (loop->latch)))
1174 if (dump_enabled_p ())
1175 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1176 "not vectorized: latch block not empty.");
1177 if (inner_loop_vinfo)
1178 destroy_loop_vec_info (inner_loop_vinfo, true);
1179 return NULL;
1182 /* Make sure there exists a single-predecessor exit bb: */
1183 if (!single_pred_p (single_exit (loop)->dest))
1185 edge e = single_exit (loop);
1186 if (!(e->flags & EDGE_ABNORMAL))
1188 split_loop_exit_edge (e);
1189 if (dump_enabled_p ())
1190 dump_printf (MSG_NOTE, "split exit edge.");
1192 else
1194 if (dump_enabled_p ())
1195 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1196 "not vectorized: abnormal loop exit edge.");
1197 if (inner_loop_vinfo)
1198 destroy_loop_vec_info (inner_loop_vinfo, true);
1199 return NULL;
1203 loop_cond = vect_get_loop_niters (loop, &number_of_iterations);
1204 if (!loop_cond)
1206 if (dump_enabled_p ())
1207 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1208 "not vectorized: complicated exit condition.");
1209 if (inner_loop_vinfo)
1210 destroy_loop_vec_info (inner_loop_vinfo, true);
1211 return NULL;
1214 if (!number_of_iterations)
1216 if (dump_enabled_p ())
1217 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1218 "not vectorized: number of iterations cannot be "
1219 "computed.");
1220 if (inner_loop_vinfo)
1221 destroy_loop_vec_info (inner_loop_vinfo, true);
1222 return NULL;
1225 if (chrec_contains_undetermined (number_of_iterations))
1227 if (dump_enabled_p ())
1228 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1229 "Infinite number of iterations.");
1230 if (inner_loop_vinfo)
1231 destroy_loop_vec_info (inner_loop_vinfo, true);
1232 return NULL;
1235 if (!NITERS_KNOWN_P (number_of_iterations))
1237 if (dump_enabled_p ())
1239 dump_printf_loc (MSG_NOTE, vect_location,
1240 "Symbolic number of iterations is ");
1241 dump_generic_expr (MSG_NOTE, TDF_DETAILS, number_of_iterations);
1244 else if (TREE_INT_CST_LOW (number_of_iterations) == 0)
1246 if (dump_enabled_p ())
1247 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1248 "not vectorized: number of iterations = 0.");
1249 if (inner_loop_vinfo)
1250 destroy_loop_vec_info (inner_loop_vinfo, true);
1251 return NULL;
1254 loop_vinfo = new_loop_vec_info (loop);
1255 LOOP_VINFO_NITERS (loop_vinfo) = number_of_iterations;
1256 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = number_of_iterations;
1258 STMT_VINFO_TYPE (vinfo_for_stmt (loop_cond)) = loop_exit_ctrl_vec_info_type;
 1260 /* CHECKME: May want to keep it around in the future. */
1261 if (inner_loop_vinfo)
1262 destroy_loop_vec_info (inner_loop_vinfo, false);
1264 gcc_assert (!loop->aux);
1265 loop->aux = loop_vinfo;
1266 return loop_vinfo;
1270 /* Function vect_analyze_loop_operations.
1272 Scan the loop stmts and make sure they are all vectorizable. */
1274 static bool
1275 vect_analyze_loop_operations (loop_vec_info loop_vinfo, bool slp)
1277 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1278 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1279 int nbbs = loop->num_nodes;
1280 gimple_stmt_iterator si;
1281 unsigned int vectorization_factor = 0;
1282 int i;
1283 gimple phi;
1284 stmt_vec_info stmt_info;
1285 bool need_to_vectorize = false;
1286 int min_profitable_iters;
1287 int min_scalar_loop_bound;
1288 unsigned int th;
1289 bool only_slp_in_loop = true, ok;
1290 HOST_WIDE_INT max_niter;
1291 HOST_WIDE_INT estimated_niter;
1292 int min_profitable_estimate;
1294 if (dump_enabled_p ())
1295 dump_printf_loc (MSG_NOTE, vect_location,
1296 "=== vect_analyze_loop_operations ===");
1298 gcc_assert (LOOP_VINFO_VECT_FACTOR (loop_vinfo));
1299 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1300 if (slp)
 1302 /* If all the stmts in the loop can be SLPed, we perform only SLP, and the
 1303 vectorization factor of the loop is the unrolling factor required by
 1304 the SLP instances. If that unrolling factor is 1, we say that we
 1305 perform pure SLP on the loop - cross-iteration parallelism is not
 1306 exploited. */
1307 for (i = 0; i < nbbs; i++)
1309 basic_block bb = bbs[i];
1310 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1312 gimple stmt = gsi_stmt (si);
1313 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1314 gcc_assert (stmt_info);
1315 if ((STMT_VINFO_RELEVANT_P (stmt_info)
1316 || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1317 && !PURE_SLP_STMT (stmt_info))
1318 /* STMT needs both SLP and loop-based vectorization. */
1319 only_slp_in_loop = false;
1323 if (only_slp_in_loop)
1324 vectorization_factor = LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo);
1325 else
1326 vectorization_factor = least_common_multiple (vectorization_factor,
1327 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo));
1329 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
1330 if (dump_enabled_p ())
1331 dump_printf_loc (MSG_NOTE, vect_location,
1332 "Updating vectorization factor to %d ",
1333 vectorization_factor);
1336 for (i = 0; i < nbbs; i++)
1338 basic_block bb = bbs[i];
1340 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
1342 phi = gsi_stmt (si);
1343 ok = true;
1345 stmt_info = vinfo_for_stmt (phi);
1346 if (dump_enabled_p ())
1348 dump_printf_loc (MSG_NOTE, vect_location, "examining phi: ");
1349 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
1352 /* Inner-loop loop-closed exit phi in outer-loop vectorization
1353 (i.e., a phi in the tail of the outer-loop). */
1354 if (! is_loop_header_bb_p (bb))
 1356 /* FORNOW: we currently don't support the case that these phis
 1357 are not used in the outer-loop (unless it is a double reduction,
 1358 i.e., this phi is vect_reduction_def), because this case
 1359 requires us to actually do something here.
1360 if ((!STMT_VINFO_RELEVANT_P (stmt_info)
1361 || STMT_VINFO_LIVE_P (stmt_info))
1362 && STMT_VINFO_DEF_TYPE (stmt_info)
1363 != vect_double_reduction_def)
1365 if (dump_enabled_p ())
1366 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1367 "Unsupported loop-closed phi in "
1368 "outer-loop.");
1369 return false;
1372 /* If PHI is used in the outer loop, we check that its operand
1373 is defined in the inner loop. */
1374 if (STMT_VINFO_RELEVANT_P (stmt_info))
1376 tree phi_op;
1377 gimple op_def_stmt;
1379 if (gimple_phi_num_args (phi) != 1)
1380 return false;
1382 phi_op = PHI_ARG_DEF (phi, 0);
1383 if (TREE_CODE (phi_op) != SSA_NAME)
1384 return false;
1386 op_def_stmt = SSA_NAME_DEF_STMT (phi_op);
1387 if (!op_def_stmt
1388 || !flow_bb_inside_loop_p (loop, gimple_bb (op_def_stmt))
1389 || !vinfo_for_stmt (op_def_stmt))
1390 return false;
1392 if (STMT_VINFO_RELEVANT (vinfo_for_stmt (op_def_stmt))
1393 != vect_used_in_outer
1394 && STMT_VINFO_RELEVANT (vinfo_for_stmt (op_def_stmt))
1395 != vect_used_in_outer_by_reduction)
1396 return false;
1399 continue;
1402 gcc_assert (stmt_info);
1404 if (STMT_VINFO_LIVE_P (stmt_info))
1406 /* FORNOW: not yet supported. */
1407 if (dump_enabled_p ())
1408 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1409 "not vectorized: value used after loop.");
1410 return false;
1413 if (STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope
1414 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
1416 /* A scalar-dependence cycle that we don't support. */
1417 if (dump_enabled_p ())
1418 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1419 "not vectorized: scalar dependence cycle.");
1420 return false;
1423 if (STMT_VINFO_RELEVANT_P (stmt_info))
1425 need_to_vectorize = true;
1426 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def)
1427 ok = vectorizable_induction (phi, NULL, NULL);
1430 if (!ok)
1432 if (dump_enabled_p ())
1434 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1435 "not vectorized: relevant phi not "
1436 "supported: ");
1437 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, phi, 0);
1439 return false;
1443 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1445 gimple stmt = gsi_stmt (si);
1446 if (!gimple_clobber_p (stmt)
1447 && !vect_analyze_stmt (stmt, &need_to_vectorize, NULL))
1448 return false;
1450 } /* bbs */
 1452 /* All operations in the loop are either irrelevant (they deal with loop
 1453 control, or are dead), or only used outside the loop and can be moved
1454 out of the loop (e.g. invariants, inductions). The loop can be
1455 optimized away by scalar optimizations. We're better off not
1456 touching this loop. */
1457 if (!need_to_vectorize)
1459 if (dump_enabled_p ())
1460 dump_printf_loc (MSG_NOTE, vect_location,
1461 "All the computation can be taken out of the loop.");
1462 if (dump_enabled_p ())
1463 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1464 "not vectorized: redundant loop. no profit to "
1465 "vectorize.");
1466 return false;
1469 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && dump_enabled_p ())
1470 dump_printf_loc (MSG_NOTE, vect_location,
1471 "vectorization_factor = %d, niters = "
1472 HOST_WIDE_INT_PRINT_DEC, vectorization_factor,
1473 LOOP_VINFO_INT_NITERS (loop_vinfo));
1475 if ((LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1476 && (LOOP_VINFO_INT_NITERS (loop_vinfo) < vectorization_factor))
1477 || ((max_niter = max_stmt_executions_int (loop)) != -1
1478 && (unsigned HOST_WIDE_INT) max_niter < vectorization_factor))
1480 if (dump_enabled_p ())
1481 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1482 "not vectorized: iteration count too small.");
1483 if (dump_enabled_p ())
1484 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1485 "not vectorized: iteration count smaller than "
1486 "vectorization factor.");
1487 return false;
 1490 /* Analyze cost. Decide if worthwhile to vectorize. */
1492 /* Once VF is set, SLP costs should be updated since the number of created
1493 vector stmts depends on VF. */
1494 vect_update_slp_costs_according_to_vf (loop_vinfo);
1496 vect_estimate_min_profitable_iters (loop_vinfo, &min_profitable_iters,
1497 &min_profitable_estimate);
1498 LOOP_VINFO_COST_MODEL_MIN_ITERS (loop_vinfo) = min_profitable_iters;
1500 if (min_profitable_iters < 0)
1502 if (dump_enabled_p ())
1503 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1504 "not vectorized: vectorization not profitable.");
1505 if (dump_enabled_p ())
1506 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1507 "not vectorized: vector version will never be "
1508 "profitable.");
1509 return false;
1512 min_scalar_loop_bound = ((PARAM_VALUE (PARAM_MIN_VECT_LOOP_BOUND)
1513 * vectorization_factor) - 1);
 1516 /* Use the cost model only if it is more conservative than the user-specified
 1517 threshold. */
1519 th = (unsigned) min_scalar_loop_bound;
1520 if (min_profitable_iters
1521 && (!min_scalar_loop_bound
1522 || min_profitable_iters > min_scalar_loop_bound))
1523 th = (unsigned) min_profitable_iters;
1525 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1526 && LOOP_VINFO_INT_NITERS (loop_vinfo) <= th)
1528 if (dump_enabled_p ())
1529 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1530 "not vectorized: vectorization not profitable.");
1531 if (dump_enabled_p ())
1532 dump_printf_loc (MSG_NOTE, vect_location,
1533 "not vectorized: iteration count smaller than user "
1534 "specified loop bound parameter or minimum profitable "
1535 "iterations (whichever is more conservative).");
1536 return false;
1539 if ((estimated_niter = estimated_stmt_executions_int (loop)) != -1
1540 && ((unsigned HOST_WIDE_INT) estimated_niter
1541 <= MAX (th, (unsigned)min_profitable_estimate)))
1543 if (dump_enabled_p ())
1544 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1545 "not vectorized: estimated iteration count too "
1546 "small.");
1547 if (dump_enabled_p ())
1548 dump_printf_loc (MSG_NOTE, vect_location,
1549 "not vectorized: estimated iteration count smaller "
1550 "than specified loop bound parameter or minimum "
1551 "profitable iterations (whichever is more "
1552 "conservative).");
1553 return false;
1556 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1557 || LOOP_VINFO_INT_NITERS (loop_vinfo) % vectorization_factor != 0
1558 || LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo))
1560 if (dump_enabled_p ())
1561 dump_printf_loc (MSG_NOTE, vect_location, "epilog loop required.");
1562 if (!vect_can_advance_ivs_p (loop_vinfo))
1564 if (dump_enabled_p ())
1565 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1566 "not vectorized: can't create epilog loop 1.");
1567 return false;
1569 if (!slpeel_can_duplicate_loop_p (loop, single_exit (loop)))
1571 if (dump_enabled_p ())
1572 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1573 "not vectorized: can't create epilog loop 2.");
1574 return false;
1578 return true;
1582 /* Function vect_analyze_loop_2.
1584 Apply a set of analyses on LOOP, and create a loop_vec_info struct
1585 for it. The different analyses will record information in the
1586 loop_vec_info struct. */
1587 static bool
1588 vect_analyze_loop_2 (loop_vec_info loop_vinfo)
1590 bool ok, slp = false;
1591 int max_vf = MAX_VECTORIZATION_FACTOR;
1592 int min_vf = 2;
1594 /* Find all data references in the loop (which correspond to vdefs/vuses)
1595 and analyze their evolution in the loop. Also adjust the minimal
1596 vectorization factor according to the loads and stores.
 1598 FORNOW: Handle only simple array references whose
 1599 alignment can be forced, and aligned pointer-references. */
1601 ok = vect_analyze_data_refs (loop_vinfo, NULL, &min_vf);
1602 if (!ok)
1604 if (dump_enabled_p ())
1605 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1606 "bad data references.");
1607 return false;
1610 /* Analyze the access patterns of the data-refs in the loop (consecutive,
1611 complex, etc.). FORNOW: Only handle consecutive access pattern. */
1613 ok = vect_analyze_data_ref_accesses (loop_vinfo, NULL);
1614 if (!ok)
1616 if (dump_enabled_p ())
1617 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1618 "bad data access.");
1619 return false;
1622 /* Classify all cross-iteration scalar data-flow cycles.
1623 Cross-iteration cycles caused by virtual phis are analyzed separately. */
1625 vect_analyze_scalar_cycles (loop_vinfo);
1627 vect_pattern_recog (loop_vinfo, NULL);
1629 /* Data-flow analysis to detect stmts that do not need to be vectorized. */
1631 ok = vect_mark_stmts_to_be_vectorized (loop_vinfo);
1632 if (!ok)
1634 if (dump_enabled_p ())
1635 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1636 "unexpected pattern.");
1637 return false;
1640 /* Analyze data dependences between the data-refs in the loop
1641 and adjust the maximum vectorization factor according to
1642 the dependences.
1643 FORNOW: fail at the first data dependence that we encounter. */
1645 ok = vect_analyze_data_ref_dependences (loop_vinfo, &max_vf);
1646 if (!ok
1647 || max_vf < min_vf)
1649 if (dump_enabled_p ())
1650 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1651 "bad data dependence.");
1652 return false;
1655 ok = vect_determine_vectorization_factor (loop_vinfo);
1656 if (!ok)
1658 if (dump_enabled_p ())
1659 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1660 "can't determine vectorization factor.");
1661 return false;
1663 if (max_vf < LOOP_VINFO_VECT_FACTOR (loop_vinfo))
1665 if (dump_enabled_p ())
1666 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1667 "bad data dependence.");
1668 return false;
1671 /* Analyze the alignment of the data-refs in the loop.
1672 Fail if a data reference is found that cannot be vectorized. */
1674 ok = vect_analyze_data_refs_alignment (loop_vinfo, NULL);
1675 if (!ok)
1677 if (dump_enabled_p ())
1678 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1679 "bad data alignment.");
1680 return false;
1683 /* Prune the list of ddrs to be tested at run-time by versioning for alias.
1684 It is important to call pruning after vect_analyze_data_ref_accesses,
1685 since we use grouping information gathered by interleaving analysis. */
1686 ok = vect_prune_runtime_alias_test_list (loop_vinfo);
1687 if (!ok)
1689 if (dump_enabled_p ())
1690 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1691 "too long list of versioning for alias "
1692 "run-time tests.");
1693 return false;
1696 /* This pass will decide on using loop versioning and/or loop peeling in
1697 order to enhance the alignment of data references in the loop. */
1699 ok = vect_enhance_data_refs_alignment (loop_vinfo);
1700 if (!ok)
1702 if (dump_enabled_p ())
1703 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1704 "bad data alignment.");
1705 return false;
1708 /* Check the SLP opportunities in the loop, analyze and build SLP trees. */
1709 ok = vect_analyze_slp (loop_vinfo, NULL);
1710 if (ok)
1712 /* Decide which possible SLP instances to SLP. */
1713 slp = vect_make_slp_decision (loop_vinfo);
1715 /* Find stmts that need to be both vectorized and SLPed. */
1716 vect_detect_hybrid_slp (loop_vinfo);
1718 else
1719 return false;
1721 /* Scan all the operations in the loop and make sure they are
1722 vectorizable. */
1724 ok = vect_analyze_loop_operations (loop_vinfo, slp);
1725 if (!ok)
1727 if (dump_enabled_p ())
1728 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1729 "bad operation or unsupported loop bound.");
1730 return false;
1733 return true;
1736 /* Function vect_analyze_loop.
1738 Apply a set of analyses on LOOP, and create a loop_vec_info struct
1739 for it. The different analyses will record information in the
1740 loop_vec_info struct. */
1741 loop_vec_info
1742 vect_analyze_loop (struct loop *loop)
1744 loop_vec_info loop_vinfo;
1745 unsigned int vector_sizes;
1747 /* Autodetect first vector size we try. */
1748 current_vector_size = 0;
1749 vector_sizes = targetm.vectorize.autovectorize_vector_sizes ();
1751 if (dump_enabled_p ())
1752 dump_printf_loc (MSG_NOTE, vect_location,
1753 "===== analyze_loop_nest =====");
1755 if (loop_outer (loop)
1756 && loop_vec_info_for_loop (loop_outer (loop))
1757 && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop))))
1759 if (dump_enabled_p ())
1760 dump_printf_loc (MSG_NOTE, vect_location,
1761 "outer-loop already vectorized.");
1762 return NULL;
1765 while (1)
1767 /* Check the CFG characteristics of the loop (nesting, entry/exit). */
1768 loop_vinfo = vect_analyze_loop_form (loop);
1769 if (!loop_vinfo)
1771 if (dump_enabled_p ())
1772 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1773 "bad loop form.");
1774 return NULL;
1777 if (vect_analyze_loop_2 (loop_vinfo))
1779 LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1;
1781 return loop_vinfo;
1784 destroy_loop_vec_info (loop_vinfo, true);
1786 vector_sizes &= ~current_vector_size;
1787 if (vector_sizes == 0
1788 || current_vector_size == 0)
1789 return NULL;
1791 /* Try the next biggest vector size. */
1792 current_vector_size = 1 << floor_log2 (vector_sizes);
1793 if (dump_enabled_p ())
1794 dump_printf_loc (MSG_NOTE, vect_location,
1795 "***** Re-trying analysis with "
1796 "vector size %d\n", current_vector_size);
1801 /* Function reduction_code_for_scalar_code
1803 Input:
 1804 CODE - tree_code of a reduction operation.
1806 Output:
1807 REDUC_CODE - the corresponding tree-code to be used to reduce the
1808 vector of partial results into a single scalar result (which
1809 will also reside in a vector) or ERROR_MARK if the operation is
 1810 a supported reduction operation, but does not have such a tree-code.
 1812 Return FALSE if CODE currently cannot be vectorized as a reduction. */
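/* Hypothetical usage sketch (not code from this file):

     enum tree_code reduc_code;
     if (reduction_code_for_scalar_code (PLUS_EXPR, &reduc_code))
       gcc_assert (reduc_code == REDUC_PLUS_EXPR);

   For MINUS_EXPR the function also returns true but sets REDUC_CODE to
   ERROR_MARK, meaning no single reduction tree-code is available and the
   final reduction must be generated differently.  */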
1814 static bool
1815 reduction_code_for_scalar_code (enum tree_code code,
1816 enum tree_code *reduc_code)
1818 switch (code)
1820 case MAX_EXPR:
1821 *reduc_code = REDUC_MAX_EXPR;
1822 return true;
1824 case MIN_EXPR:
1825 *reduc_code = REDUC_MIN_EXPR;
1826 return true;
1828 case PLUS_EXPR:
1829 *reduc_code = REDUC_PLUS_EXPR;
1830 return true;
1832 case MULT_EXPR:
1833 case MINUS_EXPR:
1834 case BIT_IOR_EXPR:
1835 case BIT_XOR_EXPR:
1836 case BIT_AND_EXPR:
1837 *reduc_code = ERROR_MARK;
1838 return true;
1840 default:
1841 return false;
1846 /* Error reporting helper for vect_is_simple_reduction below. GIMPLE statement
1847 STMT is printed with a message MSG. */
1849 static void
1850 report_vect_op (int msg_type, gimple stmt, const char *msg)
1852 dump_printf_loc (msg_type, vect_location, "%s", msg);
1853 dump_gimple_stmt (msg_type, TDF_SLIM, stmt, 0);
1857 /* Detect SLP reduction of the form:
1859 #a1 = phi <a5, a0>
1860 a2 = operation (a1)
1861 a3 = operation (a2)
1862 a4 = operation (a3)
1863 a5 = operation (a4)
1865 #a = phi <a5>
1867 PHI is the reduction phi node (#a1 = phi <a5, a0> above)
1868 FIRST_STMT is the first reduction stmt in the chain
1869 (a2 = operation (a1)).
1871 Return TRUE if a reduction chain was detected. */
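/* A source-level loop that typically gives rise to such a chain
   (illustrative only):

     for (i=0; i<n; i++)
       sum = sum + a[4*i] + a[4*i+1] + a[4*i+2] + a[4*i+3];

   Each '+' becomes one statement of the chain a2..a5 above, all using the
   same operation code and ending back at the reduction phi for 'sum'.  */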
1873 static bool
1874 vect_is_slp_reduction (loop_vec_info loop_info, gimple phi, gimple first_stmt)
1876 struct loop *loop = (gimple_bb (phi))->loop_father;
1877 struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
1878 enum tree_code code;
1879 gimple current_stmt = NULL, loop_use_stmt = NULL, first, next_stmt;
1880 stmt_vec_info use_stmt_info, current_stmt_info;
1881 tree lhs;
1882 imm_use_iterator imm_iter;
1883 use_operand_p use_p;
1884 int nloop_uses, size = 0, n_out_of_loop_uses;
1885 bool found = false;
1887 if (loop != vect_loop)
1888 return false;
1890 lhs = PHI_RESULT (phi);
1891 code = gimple_assign_rhs_code (first_stmt);
1892 while (1)
1894 nloop_uses = 0;
1895 n_out_of_loop_uses = 0;
1896 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
1898 gimple use_stmt = USE_STMT (use_p);
1899 if (is_gimple_debug (use_stmt))
1900 continue;
1902 use_stmt = USE_STMT (use_p);
1904 /* Check if we got back to the reduction phi. */
1905 if (use_stmt == phi)
1907 loop_use_stmt = use_stmt;
1908 found = true;
1909 break;
1912 if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
1914 if (vinfo_for_stmt (use_stmt)
1915 && !STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (use_stmt)))
1917 loop_use_stmt = use_stmt;
1918 nloop_uses++;
1921 else
1922 n_out_of_loop_uses++;
1924 /* There can be either a single use in the loop or two uses in
1925 phi nodes. */
1926 if (nloop_uses > 1 || (n_out_of_loop_uses && nloop_uses))
1927 return false;
1930 if (found)
1931 break;
1933 /* We reached a statement with no loop uses. */
1934 if (nloop_uses == 0)
1935 return false;
1937 /* This is a loop exit phi, and we haven't reached the reduction phi. */
1938 if (gimple_code (loop_use_stmt) == GIMPLE_PHI)
1939 return false;
1941 if (!is_gimple_assign (loop_use_stmt)
1942 || code != gimple_assign_rhs_code (loop_use_stmt)
1943 || !flow_bb_inside_loop_p (loop, gimple_bb (loop_use_stmt)))
1944 return false;
1946 /* Insert USE_STMT into reduction chain. */
1947 use_stmt_info = vinfo_for_stmt (loop_use_stmt);
1948 if (current_stmt)
1950 current_stmt_info = vinfo_for_stmt (current_stmt);
1951 GROUP_NEXT_ELEMENT (current_stmt_info) = loop_use_stmt;
1952 GROUP_FIRST_ELEMENT (use_stmt_info)
1953 = GROUP_FIRST_ELEMENT (current_stmt_info);
1955 else
1956 GROUP_FIRST_ELEMENT (use_stmt_info) = loop_use_stmt;
1958 lhs = gimple_assign_lhs (loop_use_stmt);
1959 current_stmt = loop_use_stmt;
1960 size++;
1963 if (!found || loop_use_stmt != phi || size < 2)
1964 return false;
1966 /* Swap the operands, if needed, to make the reduction operand be the second
1967 operand. */
1968 lhs = PHI_RESULT (phi);
1969 next_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (current_stmt));
1970 while (next_stmt)
1972 if (gimple_assign_rhs2 (next_stmt) == lhs)
1974 tree op = gimple_assign_rhs1 (next_stmt);
1975 gimple def_stmt = NULL;
1977 if (TREE_CODE (op) == SSA_NAME)
1978 def_stmt = SSA_NAME_DEF_STMT (op);
1980 /* Check that the other def is either defined in the loop
1981 ("vect_internal_def"), or it's an induction (defined by a
1982 loop-header phi-node). */
1983 if (def_stmt
1984 && gimple_bb (def_stmt)
1985 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
1986 && (is_gimple_assign (def_stmt)
1987 || is_gimple_call (def_stmt)
1988 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
1989 == vect_induction_def
1990 || (gimple_code (def_stmt) == GIMPLE_PHI
1991 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
1992 == vect_internal_def
1993 && !is_loop_header_bb_p (gimple_bb (def_stmt)))))
1995 lhs = gimple_assign_lhs (next_stmt);
1996 next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
1997 continue;
2000 return false;
2002 else
2004 tree op = gimple_assign_rhs2 (next_stmt);
2005 gimple def_stmt = NULL;
2007 if (TREE_CODE (op) == SSA_NAME)
2008 def_stmt = SSA_NAME_DEF_STMT (op);
2010 /* Check that the other def is either defined in the loop
2011 ("vect_internal_def"), or it's an induction (defined by a
2012 loop-header phi-node). */
2013 if (def_stmt
2014 && gimple_bb (def_stmt)
2015 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2016 && (is_gimple_assign (def_stmt)
2017 || is_gimple_call (def_stmt)
2018 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2019 == vect_induction_def
2020 || (gimple_code (def_stmt) == GIMPLE_PHI
2021 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2022 == vect_internal_def
2023 && !is_loop_header_bb_p (gimple_bb (def_stmt)))))
2025 if (dump_enabled_p ())
2027 dump_printf_loc (MSG_NOTE, vect_location, "swapping oprnds: ");
2028 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, next_stmt, 0);
2031 swap_tree_operands (next_stmt,
2032 gimple_assign_rhs1_ptr (next_stmt),
2033 gimple_assign_rhs2_ptr (next_stmt));
2034 update_stmt (next_stmt);
2036 if (CONSTANT_CLASS_P (gimple_assign_rhs1 (next_stmt)))
2037 LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true;
2039 else
2040 return false;
2043 lhs = gimple_assign_lhs (next_stmt);
2044 next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
2047 /* Save the chain for further analysis in SLP detection. */
2048 first = GROUP_FIRST_ELEMENT (vinfo_for_stmt (current_stmt));
2049 LOOP_VINFO_REDUCTION_CHAINS (loop_info).safe_push (first);
2050 GROUP_SIZE (vinfo_for_stmt (first)) = size;
2052 return true;
2056 /* Function vect_is_simple_reduction_1
2058 (1) Detect a cross-iteration def-use cycle that represents a simple
2059 reduction computation. We look for the following pattern:
2061 loop_header:
2062 a1 = phi < a0, a2 >
2063 a3 = ...
2064 a2 = operation (a3, a1)
2066 such that:
2067 1. operation is commutative and associative and it is safe to
2068 change the order of the computation (if CHECK_REDUCTION is true)
2069 2. no uses for a2 in the loop (a2 is used out of the loop)
2070 3. no uses of a1 in the loop besides the reduction operation
2071 4. no uses of a1 outside the loop.
2073 Conditions 1,4 are tested here.
2074 Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.
2076 (2) Detect a cross-iteration def-use cycle in nested loops, i.e.,
2077 nested cycles, if CHECK_REDUCTION is false.
2079 (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double
2080 reductions:
2082 a1 = phi < a0, a2 >
2083 inner loop (def of a3)
2084 a2 = phi < a3 >
2086 If MODIFY is true it tries also to rework the code in-place to enable
2087 detection of more reduction patterns. For the time being we rewrite
2088 "res -= RHS" into "res += -RHS" when it seems worthwhile.
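/* Illustrative example (hypothetical): the canonical source loop matching
   pattern (1) is

     s = init;
     for (i = 0; i < n; i++)
       s = s + a[i];      a1 = phi <a0, a2>;  a3 = a[i];  a2 = a1 + a3

   and, when MODIFY is true, "s -= a[i]" is rewritten as "s += -a[i]" so
   that it can be handled by the same machinery.  */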
2091 static gimple
2092 vect_is_simple_reduction_1 (loop_vec_info loop_info, gimple phi,
2093 bool check_reduction, bool *double_reduc,
2094 bool modify)
2096 struct loop *loop = (gimple_bb (phi))->loop_father;
2097 struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
2098 edge latch_e = loop_latch_edge (loop);
2099 tree loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
2100 gimple def_stmt, def1 = NULL, def2 = NULL;
2101 enum tree_code orig_code, code;
2102 tree op1, op2, op3 = NULL_TREE, op4 = NULL_TREE;
2103 tree type;
2104 int nloop_uses;
2105 tree name;
2106 imm_use_iterator imm_iter;
2107 use_operand_p use_p;
2108 bool phi_def;
2110 *double_reduc = false;
2112 /* If CHECK_REDUCTION is true, we assume inner-most loop vectorization,
2113 otherwise, we assume outer loop vectorization. */
2114 gcc_assert ((check_reduction && loop == vect_loop)
2115 || (!check_reduction && flow_loop_nested_p (vect_loop, loop)));
2117 name = PHI_RESULT (phi);
2118 nloop_uses = 0;
2119 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name)
2121 gimple use_stmt = USE_STMT (use_p);
2122 if (is_gimple_debug (use_stmt))
2123 continue;
2125 if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2127 if (dump_enabled_p ())
2128 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2129 "intermediate value used outside loop.");
2131 return NULL;
2134 if (vinfo_for_stmt (use_stmt)
2135 && !is_pattern_stmt_p (vinfo_for_stmt (use_stmt)))
2136 nloop_uses++;
2137 if (nloop_uses > 1)
2139 if (dump_enabled_p ())
2140 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2141 "reduction used in loop.");
2142 return NULL;
2146 if (TREE_CODE (loop_arg) != SSA_NAME)
2148 if (dump_enabled_p ())
2150 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2151 "reduction: not ssa_name: ");
2152 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, loop_arg);
2154 return NULL;
2157 def_stmt = SSA_NAME_DEF_STMT (loop_arg);
2158 if (!def_stmt)
2160 if (dump_enabled_p ())
2161 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2162 "reduction: no def_stmt.");
2163 return NULL;
2166 if (!is_gimple_assign (def_stmt) && gimple_code (def_stmt) != GIMPLE_PHI)
2168 if (dump_enabled_p ())
2169 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, def_stmt, 0);
2170 return NULL;
2173 if (is_gimple_assign (def_stmt))
2175 name = gimple_assign_lhs (def_stmt);
2176 phi_def = false;
2178 else
2180 name = PHI_RESULT (def_stmt);
2181 phi_def = true;
2184 nloop_uses = 0;
2185 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name)
2187 gimple use_stmt = USE_STMT (use_p);
2188 if (is_gimple_debug (use_stmt))
2189 continue;
2190 if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt))
2191 && vinfo_for_stmt (use_stmt)
2192 && !is_pattern_stmt_p (vinfo_for_stmt (use_stmt)))
2193 nloop_uses++;
2194 if (nloop_uses > 1)
2196 if (dump_enabled_p ())
2197 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2198 "reduction used in loop.");
2199 return NULL;
2203 /* If DEF_STMT is a phi node itself, we expect it to have a single argument
2204 defined in the inner loop. */
2205 if (phi_def)
2207 op1 = PHI_ARG_DEF (def_stmt, 0);
2209 if (gimple_phi_num_args (def_stmt) != 1
2210 || TREE_CODE (op1) != SSA_NAME)
2212 if (dump_enabled_p ())
2213 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2214 "unsupported phi node definition.");
2216 return NULL;
2219 def1 = SSA_NAME_DEF_STMT (op1);
2220 if (flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2221 && loop->inner
2222 && flow_bb_inside_loop_p (loop->inner, gimple_bb (def1))
2223 && is_gimple_assign (def1))
2225 if (dump_enabled_p ())
2226 report_vect_op (MSG_NOTE, def_stmt,
2227 "detected double reduction: ");
2229 *double_reduc = true;
2230 return def_stmt;
2233 return NULL;
2236 code = orig_code = gimple_assign_rhs_code (def_stmt);
2238 /* We can handle "res -= x[i]", which is not associative, by
2239 simply rewriting it into "res += -x[i]". Avoid changing the
2240 gimple statement during the first simple checks and only do this
2241 if we're allowed to change the code at all. */
2242 if (code == MINUS_EXPR
2243 && modify
2244 && (op1 = gimple_assign_rhs1 (def_stmt))
2245 && TREE_CODE (op1) == SSA_NAME
2246 && SSA_NAME_DEF_STMT (op1) == phi)
2247 code = PLUS_EXPR;
2249 if (check_reduction
2250 && (!commutative_tree_code (code) || !associative_tree_code (code)))
2252 if (dump_enabled_p ())
2253 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2254 "reduction: not commutative/associative: ");
2255 return NULL;
2258 if (get_gimple_rhs_class (code) != GIMPLE_BINARY_RHS)
2260 if (code != COND_EXPR)
2262 if (dump_enabled_p ())
2263 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2264 "reduction: not binary operation: ");
2266 return NULL;
2269 op3 = gimple_assign_rhs1 (def_stmt);
2270 if (COMPARISON_CLASS_P (op3))
2272 op4 = TREE_OPERAND (op3, 1);
2273 op3 = TREE_OPERAND (op3, 0);
2276 op1 = gimple_assign_rhs2 (def_stmt);
2277 op2 = gimple_assign_rhs3 (def_stmt);
2279 if (TREE_CODE (op1) != SSA_NAME && TREE_CODE (op2) != SSA_NAME)
2281 if (dump_enabled_p ())
2282 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2283 "reduction: uses not ssa_names: ");
2285 return NULL;
2288 else
2290 op1 = gimple_assign_rhs1 (def_stmt);
2291 op2 = gimple_assign_rhs2 (def_stmt);
2293 if (TREE_CODE (op1) != SSA_NAME && TREE_CODE (op2) != SSA_NAME)
2295 if (dump_enabled_p ())
2296 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2297 "reduction: uses not ssa_names: ");
2299 return NULL;
2303 type = TREE_TYPE (gimple_assign_lhs (def_stmt));
2304 if ((TREE_CODE (op1) == SSA_NAME
2305 && !types_compatible_p (type,TREE_TYPE (op1)))
2306 || (TREE_CODE (op2) == SSA_NAME
2307 && !types_compatible_p (type, TREE_TYPE (op2)))
2308 || (op3 && TREE_CODE (op3) == SSA_NAME
2309 && !types_compatible_p (type, TREE_TYPE (op3)))
2310 || (op4 && TREE_CODE (op4) == SSA_NAME
2311 && !types_compatible_p (type, TREE_TYPE (op4))))
2313 if (dump_enabled_p ())
2315 dump_printf_loc (MSG_NOTE, vect_location,
2316 "reduction: multiple types: operation type: ");
2317 dump_generic_expr (MSG_NOTE, TDF_SLIM, type);
2318 dump_printf (MSG_NOTE, ", operands types: ");
2319 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2320 TREE_TYPE (op1));
2321 dump_printf (MSG_NOTE, ",");
2322 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2323 TREE_TYPE (op2));
2324 if (op3)
2326 dump_printf (MSG_NOTE, ",");
2327 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2328 TREE_TYPE (op3));
2331 if (op4)
2333 dump_printf (MSG_NOTE, ",");
2334 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2335 TREE_TYPE (op4));
2339 return NULL;
2342 /* Check that it's ok to change the order of the computation.
2343 Generally, when vectorizing a reduction we change the order of the
2344 computation. This may change the behavior of the program in some
2345 cases, so we need to check that this is ok. One exception is when
2346 vectorizing an outer-loop: the inner-loop is executed sequentially,
2347 and therefore vectorizing reductions in the inner-loop during
2348 outer-loop vectorization is safe. */
2350 /* CHECKME: check for !flag_finite_math_only too? */
2351 if (SCALAR_FLOAT_TYPE_P (type) && !flag_associative_math
2352 && check_reduction)
2354 /* Changing the order of operations changes the semantics. */
2355 if (dump_enabled_p ())
2356 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2357 "reduction: unsafe fp math optimization: ");
2358 return NULL;
2360 else if (INTEGRAL_TYPE_P (type) && TYPE_OVERFLOW_TRAPS (type)
2361 && check_reduction)
2363 /* Changing the order of operations changes the semantics. */
2364 if (dump_enabled_p ())
2365 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2366 "reduction: unsafe int math optimization: ");
2367 return NULL;
2369 else if (SAT_FIXED_POINT_TYPE_P (type) && check_reduction)
2371 /* Changing the order of operations changes the semantics. */
2372 if (dump_enabled_p ())
2373 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2374 "reduction: unsafe fixed-point math optimization: ");
2375 return NULL;
2378 /* If we detected "res -= x[i]" earlier, rewrite it into
2379 "res += -x[i]" now. If this turns out to be useless, reassoc
2380 will clean it up again. */
2381 if (orig_code == MINUS_EXPR)
2383 tree rhs = gimple_assign_rhs2 (def_stmt);
2384 tree negrhs = make_ssa_name (TREE_TYPE (rhs), NULL);
2385 gimple negate_stmt = gimple_build_assign_with_ops (NEGATE_EXPR, negrhs,
2386 rhs, NULL);
2387 gimple_stmt_iterator gsi = gsi_for_stmt (def_stmt);
2388 set_vinfo_for_stmt (negate_stmt, new_stmt_vec_info (negate_stmt,
2389 loop_info, NULL));
2390 gsi_insert_before (&gsi, negate_stmt, GSI_NEW_STMT);
2391 gimple_assign_set_rhs2 (def_stmt, negrhs);
2392 gimple_assign_set_rhs_code (def_stmt, PLUS_EXPR);
2393 update_stmt (def_stmt);
2396 /* Reduction is safe. We're dealing with one of the following:
2397 1) integer arithmetic and no trapv
2398 2) floating point arithmetic, and special flags permit this optimization
2399 3) nested cycle (i.e., outer loop vectorization). */
2400 if (TREE_CODE (op1) == SSA_NAME)
2401 def1 = SSA_NAME_DEF_STMT (op1);
2403 if (TREE_CODE (op2) == SSA_NAME)
2404 def2 = SSA_NAME_DEF_STMT (op2);
2406 if (code != COND_EXPR
2407 && ((!def1 || gimple_nop_p (def1)) && (!def2 || gimple_nop_p (def2))))
2409 if (dump_enabled_p ())
2410 report_vect_op (MSG_NOTE, def_stmt, "reduction: no defs for operands: ");
2411 return NULL;
2414 /* Check that one def is the reduction def, defined by PHI,
2415 the other def is either defined in the loop ("vect_internal_def"),
2416 or it's an induction (defined by a loop-header phi-node). */
2418 if (def2 && def2 == phi
2419 && (code == COND_EXPR
2420 || !def1 || gimple_nop_p (def1)
2421 || (def1 && flow_bb_inside_loop_p (loop, gimple_bb (def1))
2422 && (is_gimple_assign (def1)
2423 || is_gimple_call (def1)
2424 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1))
2425 == vect_induction_def
2426 || (gimple_code (def1) == GIMPLE_PHI
2427 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1))
2428 == vect_internal_def
2429 && !is_loop_header_bb_p (gimple_bb (def1)))))))
2431 if (dump_enabled_p ())
2432 report_vect_op (MSG_NOTE, def_stmt, "detected reduction: ");
2433 return def_stmt;
2436 if (def1 && def1 == phi
2437 && (code == COND_EXPR
2438 || !def2 || gimple_nop_p (def2)
2439 || (def2 && flow_bb_inside_loop_p (loop, gimple_bb (def2))
2440 && (is_gimple_assign (def2)
2441 || is_gimple_call (def2)
2442 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2))
2443 == vect_induction_def
2444 || (gimple_code (def2) == GIMPLE_PHI
2445 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2))
2446 == vect_internal_def
2447 && !is_loop_header_bb_p (gimple_bb (def2)))))))
2449 if (check_reduction)
2451 /* Swap operands (just for simplicity - so that the rest of the code
2452 can assume that the reduction variable is always the last (second)
2453 argument). */
2454 if (dump_enabled_p ())
2455 report_vect_op (MSG_NOTE, def_stmt,
2456 "detected reduction: need to swap operands: ");
2458 swap_tree_operands (def_stmt, gimple_assign_rhs1_ptr (def_stmt),
2459 gimple_assign_rhs2_ptr (def_stmt));
2461 if (CONSTANT_CLASS_P (gimple_assign_rhs1 (def_stmt)))
2462 LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true;
2464 else
2466 if (dump_enabled_p ())
2467 report_vect_op (MSG_NOTE, def_stmt, "detected reduction: ");
2470 return def_stmt;
2473 /* Try to find SLP reduction chain. */
2474 if (check_reduction && vect_is_slp_reduction (loop_info, phi, def_stmt))
2476 if (dump_enabled_p ())
2477 report_vect_op (MSG_NOTE, def_stmt,
2478 "reduction: detected reduction chain: ");
2480 return def_stmt;
2483 if (dump_enabled_p ())
2484 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2485 "reduction: unknown pattern: ");
2487 return NULL;
2490 /* Wrapper around vect_is_simple_reduction_1, that won't modify code
2491 in-place. Arguments as there. */
2493 static gimple
2494 vect_is_simple_reduction (loop_vec_info loop_info, gimple phi,
2495 bool check_reduction, bool *double_reduc)
2497 return vect_is_simple_reduction_1 (loop_info, phi, check_reduction,
2498 double_reduc, false);
2501 /* Wrapper around vect_is_simple_reduction_1, which will modify code
2502 in-place if it enables detection of more reductions. Arguments
2503 as there. */
2505 gimple
2506 vect_force_simple_reduction (loop_vec_info loop_info, gimple phi,
2507 bool check_reduction, bool *double_reduc)
2509 return vect_is_simple_reduction_1 (loop_info, phi, check_reduction,
2510 double_reduc, true);
2513 /* Calculate the cost of one scalar iteration of the loop. */
2514 static int
2515 vect_get_single_scalar_iteration_cost (loop_vec_info loop_vinfo)
2517 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2518 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
2519 int nbbs = loop->num_nodes, factor, scalar_single_iter_cost = 0;
2520 int innerloop_iters, i, stmt_cost;
2522 /* Count statements in scalar loop. Using this as scalar cost for a single
2523 iteration for now.
2525 TODO: Add outer loop support.
2527 TODO: Consider assigning different costs to different scalar
2528 statements. */
2530 /* FORNOW. */
2531 innerloop_iters = 1;
2532 if (loop->inner)
2533 innerloop_iters = 50; /* FIXME */
2535 for (i = 0; i < nbbs; i++)
2537 gimple_stmt_iterator si;
2538 basic_block bb = bbs[i];
2540 if (bb->loop_father == loop->inner)
2541 factor = innerloop_iters;
2542 else
2543 factor = 1;
2545 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
2547 gimple stmt = gsi_stmt (si);
2548 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2550 if (!is_gimple_assign (stmt) && !is_gimple_call (stmt))
2551 continue;
2553 /* Skip stmts that are not vectorized inside the loop. */
2554 if (stmt_info
2555 && !STMT_VINFO_RELEVANT_P (stmt_info)
2556 && (!STMT_VINFO_LIVE_P (stmt_info)
2557 || !VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
2558 && !STMT_VINFO_IN_PATTERN_P (stmt_info))
2559 continue;
2561 if (STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt)))
2563 if (DR_IS_READ (STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt))))
2564 stmt_cost = vect_get_stmt_cost (scalar_load);
2565 else
2566 stmt_cost = vect_get_stmt_cost (scalar_store);
2568 else
2569 stmt_cost = vect_get_stmt_cost (scalar_stmt);
2571 scalar_single_iter_cost += stmt_cost * factor;
2574 return scalar_single_iter_cost;
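/* Worked example (hypothetical body): a loop whose body is one load, one
   add and one store is estimated at
     vect_get_stmt_cost (scalar_load) + vect_get_stmt_cost (scalar_stmt)
     + vect_get_stmt_cost (scalar_store)
   per iteration; statements sitting in an inner loop are additionally
   multiplied by the FIXME factor of 50 used above.  */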
2577 /* Calculate cost of peeling the loop PEEL_ITERS_PROLOGUE times. */
2578 int
2579 vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue,
2580 int *peel_iters_epilogue,
2581 int scalar_single_iter_cost,
2582 stmt_vector_for_cost *prologue_cost_vec,
2583 stmt_vector_for_cost *epilogue_cost_vec)
2585 int retval = 0;
2586 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2588 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
2590 *peel_iters_epilogue = vf/2;
2591 if (dump_enabled_p ())
2592 dump_printf_loc (MSG_NOTE, vect_location,
2593 "cost model: epilogue peel iters set to vf/2 "
2594 "because loop iterations are unknown.");
2596 /* If peeled iterations are known but the number of scalar loop
2597 iterations is unknown, count a taken branch per peeled loop. */
2598 retval = record_stmt_cost (prologue_cost_vec, 2, cond_branch_taken,
2599 NULL, 0, vect_prologue);
2601 else
2603 int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
2604 peel_iters_prologue = niters < peel_iters_prologue ?
2605 niters : peel_iters_prologue;
2606 *peel_iters_epilogue = (niters - peel_iters_prologue) % vf;
2607 /* If we need to peel for gaps, but no epilogue peeling is otherwise
2608 required, we have to peel VF iterations. */
2609 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) && !*peel_iters_epilogue)
2610 *peel_iters_epilogue = vf;
2613 if (peel_iters_prologue)
2614 retval += record_stmt_cost (prologue_cost_vec,
2615 peel_iters_prologue * scalar_single_iter_cost,
2616 scalar_stmt, NULL, 0, vect_prologue);
2617 if (*peel_iters_epilogue)
2618 retval += record_stmt_cost (epilogue_cost_vec,
2619 *peel_iters_epilogue * scalar_single_iter_cost,
2620 scalar_stmt, NULL, 0, vect_epilogue);
2621 return retval;
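/* Worked example (hypothetical numbers): with VF = 4, a known trip count of
   100 and peel_iters_prologue = 3, the epilogue gets (100 - 3) % 4 = 1
   iteration, and each peel is charged peel_iters * scalar_single_iter_cost
   as scalar_stmt cost.  With an unknown trip count the epilogue is assumed
   to be VF/2 = 2 iterations and two taken branches are added to the
   prologue cost.  */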
2624 /* Function vect_estimate_min_profitable_iters
2626 Return the number of iterations required for the vector version of the
2627 loop to be profitable relative to the cost of the scalar version of the
2628 loop. */
2630 static void
2631 vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
2632 int *ret_min_profitable_niters,
2633 int *ret_min_profitable_estimate)
2635 int min_profitable_iters;
2636 int min_profitable_estimate;
2637 int peel_iters_prologue;
2638 int peel_iters_epilogue;
2639 unsigned vec_inside_cost = 0;
2640 int vec_outside_cost = 0;
2641 unsigned vec_prologue_cost = 0;
2642 unsigned vec_epilogue_cost = 0;
2643 int scalar_single_iter_cost = 0;
2644 int scalar_outside_cost = 0;
2645 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2646 int npeel = LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo);
2647 void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
2649 /* Cost model disabled. */
2650 if (!flag_vect_cost_model)
2652 dump_printf_loc (MSG_NOTE, vect_location, "cost model disabled.");
2653 *ret_min_profitable_niters = 0;
2654 *ret_min_profitable_estimate = 0;
2655 return;
2658 /* Requires loop versioning tests to handle misalignment. */
2659 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
2661 /* FIXME: Make cost depend on complexity of individual check. */
2662 unsigned len = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ();
2663 (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
2664 vect_prologue);
2665 dump_printf (MSG_NOTE,
2666 "cost model: Adding cost of checks for loop "
2667 "versioning to treat misalignment.\n");
2670 /* Requires loop versioning with alias checks. */
2671 if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
2673 /* FIXME: Make cost depend on complexity of individual check. */
2674 unsigned len = LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).length ();
2675 (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
2676 vect_prologue);
2677 dump_printf (MSG_NOTE,
2678 "cost model: Adding cost of checks for loop "
2679 "versioning aliasing.\n");
2682 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
2683 || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
2684 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken, NULL, 0,
2685 vect_prologue);
2687 /* Count statements in scalar loop. Using this as scalar cost for a single
2688 iteration for now.
2690 TODO: Add outer loop support.
2692 TODO: Consider assigning different costs to different scalar
2693 statements. */
2695 scalar_single_iter_cost = vect_get_single_scalar_iteration_cost (loop_vinfo);
2697 /* Add additional cost for the peeled instructions in prologue and epilogue
2698 loop.
2700 FORNOW: If we don't know the value of peel_iters for prologue or epilogue
2701 at compile-time - we assume it's vf/2 (the worst would be vf-1).
2703 TODO: Build an expression that represents peel_iters for prologue and
2704 epilogue to be used in a run-time test. */
2706 if (npeel < 0)
2708 peel_iters_prologue = vf/2;
2709 dump_printf (MSG_NOTE, "cost model: "
2710 "prologue peel iters set to vf/2.");
2712 /* If peeling for alignment is unknown, the loop bound of the main
2713 loop becomes unknown. */
2714 peel_iters_epilogue = vf/2;
2715 dump_printf (MSG_NOTE, "cost model: "
2716 "epilogue peel iters set to vf/2 because "
2717 "peeling for alignment is unknown.");
2719 /* If peeled iterations are unknown, count a taken branch and a not taken
2720 branch per peeled loop. Even if scalar loop iterations are known,
2721 vector iterations are not known since peeled prologue iterations are
2722 not known. Hence guards remain the same. */
2723 (void) add_stmt_cost (target_cost_data, 2, cond_branch_taken,
2724 NULL, 0, vect_prologue);
2725 (void) add_stmt_cost (target_cost_data, 2, cond_branch_not_taken,
2726 NULL, 0, vect_prologue);
2727 /* FORNOW: Don't attempt to pass individual scalar instructions to
2728 the model; just assume linear cost for scalar iterations. */
2729 (void) add_stmt_cost (target_cost_data,
2730 peel_iters_prologue * scalar_single_iter_cost,
2731 scalar_stmt, NULL, 0, vect_prologue);
2732 (void) add_stmt_cost (target_cost_data,
2733 peel_iters_epilogue * scalar_single_iter_cost,
2734 scalar_stmt, NULL, 0, vect_epilogue);
2736 else
2738 stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
2739 stmt_info_for_cost *si;
2740 int j;
2741 void *data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
2743 prologue_cost_vec.create (2);
2744 epilogue_cost_vec.create (2);
2745 peel_iters_prologue = npeel;
2747 (void) vect_get_known_peeling_cost (loop_vinfo, peel_iters_prologue,
2748 &peel_iters_epilogue,
2749 scalar_single_iter_cost,
2750 &prologue_cost_vec,
2751 &epilogue_cost_vec);
2753 FOR_EACH_VEC_ELT (prologue_cost_vec, j, si)
2755 struct _stmt_vec_info *stmt_info
2756 = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
2757 (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
2758 si->misalign, vect_prologue);
2761 FOR_EACH_VEC_ELT (epilogue_cost_vec, j, si)
2763 struct _stmt_vec_info *stmt_info
2764 = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
2765 (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
2766 si->misalign, vect_epilogue);
2769 prologue_cost_vec.release ();
2770 epilogue_cost_vec.release ();
2773 /* FORNOW: The scalar outside cost is incremented in one of the
2774 following ways:
2776 1. The vectorizer checks for alignment and aliasing and generates
2777 a condition that allows dynamic vectorization. A cost model
2778 check is ANDED with the versioning condition. Hence scalar code
2779 path now has the added cost of the versioning check.
2781 if (cost > th & versioning_check)
2782 jmp to vector code
2784 Hence run-time scalar is incremented by not-taken branch cost.
2786 2. The vectorizer then checks if a prologue is required. If the
2787 cost model check was not done before during versioning, it has to
2788 be done before the prologue check.
2790 if (cost <= th)
2791 prologue = scalar_iters
2792 if (prologue == 0)
2793 jmp to vector code
2794 else
2795 execute prologue
2796 if (prologue == num_iters)
2797 go to exit
2799 Hence the run-time scalar cost is incremented by a taken branch,
2800 plus a not-taken branch, plus a taken branch cost.
2802 3. The vectorizer then checks if an epilogue is required. If the
2803 cost model check was not done before during prologue check, it
2804 has to be done with the epilogue check.
2806 if (prologue == 0)
2807 jmp to vector code
2808 else
2809 execute prologue
2810 if (prologue == num_iters)
2811 go to exit
2812 vector code:
2813 if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
2814 jmp to epilogue
2816 Hence the run-time scalar cost should be incremented by 2 taken
2817 branches.
2819 TODO: The back end may reorder the BBS's differently and reverse
2820 conditions/branch directions. Change the estimates below to
2821 something more reasonable. */
2823 /* If the number of iterations is known and we do not do versioning, we can
2824 decide whether to vectorize at compile time. Hence the scalar version
2825 does not carry cost model guard costs. */
2826 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2827 || LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
2828 || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
2830 /* Cost model check occurs at versioning. */
2831 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
2832 || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
2833 scalar_outside_cost += vect_get_stmt_cost (cond_branch_not_taken);
2834 else
2836 /* Cost model check occurs at prologue generation. */
2837 if (LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
2838 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken)
2839 + vect_get_stmt_cost (cond_branch_not_taken);
2840 /* Cost model check occurs at epilogue generation. */
2841 else
2842 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken);
2846 /* Complete the target-specific cost calculations. */
2847 finish_cost (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo), &vec_prologue_cost,
2848 &vec_inside_cost, &vec_epilogue_cost);
2850 vec_outside_cost = (int)(vec_prologue_cost + vec_epilogue_cost);
2852 /* Calculate number of iterations required to make the vector version
2853 profitable, relative to the loop bodies only. The following condition
2854 must hold true:
2855 SIC * niters + SOC > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC
2856 where
2857 SIC = scalar iteration cost, VIC = vector iteration cost,
2858 VOC = vector outside cost, VF = vectorization factor,
2859 PL_ITERS = prologue iterations, EP_ITERS= epilogue iterations
2860 SOC = scalar outside cost for run time cost model check. */
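/* Worked example (hypothetical numbers) for the condition above, assuming
   no peeling: with SIC = 4, VIC = 6, VOC = 20, SOC = 0 and VF = 4 the
   computation below gives
     min_profitable_iters = (20 - 0) * 4 / (4 * 4 - 6) = 80 / 10 = 8,
   which the follow-up check bumps to 9 because at exactly 8 iterations the
   scalar and vector costs are equal.  */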
2862 if ((scalar_single_iter_cost * vf) > (int) vec_inside_cost)
2864 if (vec_outside_cost <= 0)
2865 min_profitable_iters = 1;
2866 else
2868 min_profitable_iters = ((vec_outside_cost - scalar_outside_cost) * vf
2869 - vec_inside_cost * peel_iters_prologue
2870 - vec_inside_cost * peel_iters_epilogue)
2871 / ((scalar_single_iter_cost * vf)
2872 - vec_inside_cost);
2874 if ((scalar_single_iter_cost * vf * min_profitable_iters)
2875 <= (((int) vec_inside_cost * min_profitable_iters)
2876 + (((int) vec_outside_cost - scalar_outside_cost) * vf)))
2877 min_profitable_iters++;
2880 /* vector version will never be profitable. */
2881 else
2883 if (dump_enabled_p ())
2884 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2885 "cost model: the vector iteration cost = %d "
2886 "divided by the scalar iteration cost = %d "
2887 "is greater or equal to the vectorization factor = %d.",
2888 vec_inside_cost, scalar_single_iter_cost, vf);
2889 *ret_min_profitable_niters = -1;
2890 *ret_min_profitable_estimate = -1;
2891 return;
2894 if (dump_enabled_p ())
2896 dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
2897 dump_printf (MSG_NOTE, " Vector inside of loop cost: %d\n",
2898 vec_inside_cost);
2899 dump_printf (MSG_NOTE, " Vector prologue cost: %d\n",
2900 vec_prologue_cost);
2901 dump_printf (MSG_NOTE, " Vector epilogue cost: %d\n",
2902 vec_epilogue_cost);
2903 dump_printf (MSG_NOTE, " Scalar iteration cost: %d\n",
2904 scalar_single_iter_cost);
2905 dump_printf (MSG_NOTE, " Scalar outside cost: %d\n",
2906 scalar_outside_cost);
2907 dump_printf (MSG_NOTE, " Vector outside cost: %d\n",
2908 vec_outside_cost);
2909 dump_printf (MSG_NOTE, " prologue iterations: %d\n",
2910 peel_iters_prologue);
2911 dump_printf (MSG_NOTE, " epilogue iterations: %d\n",
2912 peel_iters_epilogue);
2913 dump_printf (MSG_NOTE,
2914 " Calculated minimum iters for profitability: %d\n",
2915 min_profitable_iters);
2918 min_profitable_iters =
2919 min_profitable_iters < vf ? vf : min_profitable_iters;
2921 /* Because the condition we create is:
2922 if (niters <= min_profitable_iters)
2923 then skip the vectorized loop. */
2924 min_profitable_iters--;
2926 if (dump_enabled_p ())
2927 dump_printf_loc (MSG_NOTE, vect_location,
2928 " Runtime profitability threshold = %d\n", min_profitable_iters);
2930 *ret_min_profitable_niters = min_profitable_iters;
2932 /* Calculate number of iterations required to make the vector version
2933 profitable, relative to the loop bodies only.
2935 Non-vectorized variant is SIC * niters and it must win over vector
2936 variant on the expected loop trip count. The following condition must hold true:
2937 SIC * niters > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC + SOC */
2939 if (vec_outside_cost <= 0)
2940 min_profitable_estimate = 1;
2941 else
2943 min_profitable_estimate = ((vec_outside_cost + scalar_outside_cost) * vf
2944 - vec_inside_cost * peel_iters_prologue
2945 - vec_inside_cost * peel_iters_epilogue)
2946 / ((scalar_single_iter_cost * vf)
2947 - vec_inside_cost);
2949 min_profitable_estimate --;
2950 min_profitable_estimate = MAX (min_profitable_estimate, min_profitable_iters);
2951 if (dump_enabled_p ())
2952 dump_printf_loc (MSG_NOTE, vect_location,
2953 " Static estimate profitability threshold = %d\n",
2954 min_profitable_estimate);
2956 *ret_min_profitable_estimate = min_profitable_estimate;
2960 /* TODO: Close dependency between vect_model_*_cost and vectorizable_*
2961 functions. Design better to avoid maintenance issues. */
2963 /* Function vect_model_reduction_cost.
2965 Models cost for a reduction operation, including the vector ops
2966 generated within the strip-mine loop, the initial definition before
2967 the loop, and the epilogue code that must be generated. */
2969 static bool
2970 vect_model_reduction_cost (stmt_vec_info stmt_info, enum tree_code reduc_code,
2971 int ncopies)
2973 int prologue_cost = 0, epilogue_cost = 0;
2974 enum tree_code code;
2975 optab optab;
2976 tree vectype;
2977 gimple stmt, orig_stmt;
2978 tree reduction_op;
2979 enum machine_mode mode;
2980 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2981 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2982 void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
2984 /* Cost of reduction op inside loop. */
2985 unsigned inside_cost = add_stmt_cost (target_cost_data, ncopies, vector_stmt,
2986 stmt_info, 0, vect_body);
2987 stmt = STMT_VINFO_STMT (stmt_info);
2989 switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
2991 case GIMPLE_SINGLE_RHS:
2992 gcc_assert (TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt)) == ternary_op);
2993 reduction_op = TREE_OPERAND (gimple_assign_rhs1 (stmt), 2);
2994 break;
2995 case GIMPLE_UNARY_RHS:
2996 reduction_op = gimple_assign_rhs1 (stmt);
2997 break;
2998 case GIMPLE_BINARY_RHS:
2999 reduction_op = gimple_assign_rhs2 (stmt);
3000 break;
3001 case GIMPLE_TERNARY_RHS:
3002 reduction_op = gimple_assign_rhs3 (stmt);
3003 break;
3004 default:
3005 gcc_unreachable ();
3008 vectype = get_vectype_for_scalar_type (TREE_TYPE (reduction_op));
3009 if (!vectype)
3011 if (dump_enabled_p ())
3013 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3014 "unsupported data-type ");
3015 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
3016 TREE_TYPE (reduction_op));
3018 return false;
3021 mode = TYPE_MODE (vectype);
3022 orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
3024 if (!orig_stmt)
3025 orig_stmt = STMT_VINFO_STMT (stmt_info);
3027 code = gimple_assign_rhs_code (orig_stmt);
3029 /* Add in cost for initial definition. */
3030 prologue_cost += add_stmt_cost (target_cost_data, 1, scalar_to_vec,
3031 stmt_info, 0, vect_prologue);
3033 /* Determine cost of epilogue code.
3035 We have a reduction operator that will reduce the vector in one statement.
3036 Also requires scalar extract. */
3038 if (!nested_in_vect_loop_p (loop, orig_stmt))
3040 if (reduc_code != ERROR_MARK)
3042 epilogue_cost += add_stmt_cost (target_cost_data, 1, vector_stmt,
3043 stmt_info, 0, vect_epilogue);
3044 epilogue_cost += add_stmt_cost (target_cost_data, 1, vec_to_scalar,
3045 stmt_info, 0, vect_epilogue);
3047 else
3049 int vec_size_in_bits = tree_low_cst (TYPE_SIZE (vectype), 1);
3050 tree bitsize =
3051 TYPE_SIZE (TREE_TYPE (gimple_assign_lhs (orig_stmt)));
3052 int element_bitsize = tree_low_cst (bitsize, 1);
3053 int nelements = vec_size_in_bits / element_bitsize;
3055 optab = optab_for_tree_code (code, vectype, optab_default);
3057 /* We have a whole vector shift available. */
3058 if (VECTOR_MODE_P (mode)
3059 && optab_handler (optab, mode) != CODE_FOR_nothing
3060 && optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
3062 /* Final reduction via vector shifts and the reduction operator.
3063 Also requires scalar extract. */
3064 epilogue_cost += add_stmt_cost (target_cost_data,
3065 exact_log2 (nelements) * 2,
3066 vector_stmt, stmt_info, 0,
3067 vect_epilogue);
3068 epilogue_cost += add_stmt_cost (target_cost_data, 1,
3069 vec_to_scalar, stmt_info, 0,
3070 vect_epilogue);
3072 else
3073 /* Use extracts and reduction op for final reduction. For N
3074 elements, we have N extracts and N-1 reduction ops. */
3075 epilogue_cost += add_stmt_cost (target_cost_data,
3076 nelements + nelements - 1,
3077 vector_stmt, stmt_info, 0,
3078 vect_epilogue);
3082 if (dump_enabled_p ())
3083 dump_printf (MSG_NOTE,
3084 "vect_model_reduction_cost: inside_cost = %d, "
3085 "prologue_cost = %d, epilogue_cost = %d.", inside_cost,
3086 prologue_cost, epilogue_cost);
3088 return true;
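/* Illustrative cost breakdown (hypothetical): for a 128-bit vector of four
   32-bit elements with no REDUC_* epilogue operation but with whole-vector
   shifts available, the epilogue is charged exact_log2 (4) * 2 = 4
   vector_stmt operations plus one vec_to_scalar extract; without the
   shift, it is charged 4 + 4 - 1 = 7 vector_stmt operations (N extracts
   and N-1 reduction ops).  */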
3092 /* Function vect_model_induction_cost.
3094 Models cost for induction operations. */
3096 static void
3097 vect_model_induction_cost (stmt_vec_info stmt_info, int ncopies)
3099 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3100 void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3101 unsigned inside_cost, prologue_cost;
3103 /* loop cost for vec_loop. */
3104 inside_cost = add_stmt_cost (target_cost_data, ncopies, vector_stmt,
3105 stmt_info, 0, vect_body);
3107 /* prologue cost for vec_init and vec_step. */
3108 prologue_cost = add_stmt_cost (target_cost_data, 2, scalar_to_vec,
3109 stmt_info, 0, vect_prologue);
3111 if (dump_enabled_p ())
3112 dump_printf_loc (MSG_NOTE, vect_location,
3113 "vect_model_induction_cost: inside_cost = %d, "
3114 "prologue_cost = %d.", inside_cost, prologue_cost);
3118 /* Function get_initial_def_for_induction
3120 Input:
3121 STMT - a stmt that performs an induction operation in the loop.
3122 IV_PHI - the initial value of the induction variable
3124 Output:
3125 Return a vector variable, initialized with the first VF values of
3126 the induction variable. E.g., for an iv with IV_PHI='X' and
3127 evolution S, for a vector of 4 units, we want to return:
3128 [X, X + S, X + 2*S, X + 3*S]. */
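/* Concrete example (hypothetical values): for an iv with initial value
   X = 5, step S = 3 and a 4-element vector, the returned vector is
   [5, 8, 11, 14]; the step vector built further below for the non-nested
   case is [VF*S, ...] = [12, 12, 12, 12].  */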
3130 static tree
3131 get_initial_def_for_induction (gimple iv_phi)
3133 stmt_vec_info stmt_vinfo = vinfo_for_stmt (iv_phi);
3134 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
3135 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3136 tree scalar_type;
3137 tree vectype;
3138 int nunits;
3139 edge pe = loop_preheader_edge (loop);
3140 struct loop *iv_loop;
3141 basic_block new_bb;
3142 tree new_vec, vec_init, vec_step, t;
3143 tree access_fn;
3144 tree new_var;
3145 tree new_name;
3146 gimple init_stmt, induction_phi, new_stmt;
3147 tree induc_def, vec_def, vec_dest;
3148 tree init_expr, step_expr;
3149 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3150 int i;
3151 bool ok;
3152 int ncopies;
3153 tree expr;
3154 stmt_vec_info phi_info = vinfo_for_stmt (iv_phi);
3155 bool nested_in_vect_loop = false;
3156 gimple_seq stmts = NULL;
3157 imm_use_iterator imm_iter;
3158 use_operand_p use_p;
3159 gimple exit_phi;
3160 edge latch_e;
3161 tree loop_arg;
3162 gimple_stmt_iterator si;
3163 basic_block bb = gimple_bb (iv_phi);
3164 tree stepvectype;
3165 tree resvectype;
3167 /* Is phi in an inner-loop, while vectorizing an enclosing outer-loop? */
3168 if (nested_in_vect_loop_p (loop, iv_phi))
3170 nested_in_vect_loop = true;
3171 iv_loop = loop->inner;
3173 else
3174 iv_loop = loop;
3175 gcc_assert (iv_loop == (gimple_bb (iv_phi))->loop_father);
3177 latch_e = loop_latch_edge (iv_loop);
3178 loop_arg = PHI_ARG_DEF_FROM_EDGE (iv_phi, latch_e);
3180 access_fn = analyze_scalar_evolution (iv_loop, PHI_RESULT (iv_phi));
3181 gcc_assert (access_fn);
3182 STRIP_NOPS (access_fn);
3183 ok = vect_is_simple_iv_evolution (iv_loop->num, access_fn,
3184 &init_expr, &step_expr);
3185 gcc_assert (ok);
3186 pe = loop_preheader_edge (iv_loop);
3188 scalar_type = TREE_TYPE (init_expr);
3189 vectype = get_vectype_for_scalar_type (scalar_type);
3190 resvectype = get_vectype_for_scalar_type (TREE_TYPE (PHI_RESULT (iv_phi)));
3191 gcc_assert (vectype);
3192 nunits = TYPE_VECTOR_SUBPARTS (vectype);
3193 ncopies = vf / nunits;
3195 gcc_assert (phi_info);
3196 gcc_assert (ncopies >= 1);
3198 /* Find the first insertion point in the BB. */
3199 si = gsi_after_labels (bb);
3201 /* Create the vector that holds the initial_value of the induction. */
3202 if (nested_in_vect_loop)
3204 /* iv_loop is nested in the loop to be vectorized. init_expr had already
3205 been created during vectorization of previous stmts. We obtain it
3206 from the STMT_VINFO_VEC_STMT of the defining stmt. */
3207 tree iv_def = PHI_ARG_DEF_FROM_EDGE (iv_phi,
3208 loop_preheader_edge (iv_loop));
3209 vec_init = vect_get_vec_def_for_operand (iv_def, iv_phi, NULL);
3210 /* If the initial value is not of proper type, convert it. */
3211 if (!useless_type_conversion_p (vectype, TREE_TYPE (vec_init)))
3213 new_stmt = gimple_build_assign_with_ops
3214 (VIEW_CONVERT_EXPR,
3215 vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_"),
3216 build1 (VIEW_CONVERT_EXPR, vectype, vec_init), NULL_TREE);
3217 vec_init = make_ssa_name (gimple_assign_lhs (new_stmt), new_stmt);
3218 gimple_assign_set_lhs (new_stmt, vec_init);
3219 new_bb = gsi_insert_on_edge_immediate (loop_preheader_edge (iv_loop),
3220 new_stmt);
3221 gcc_assert (!new_bb);
3222 set_vinfo_for_stmt (new_stmt,
3223 new_stmt_vec_info (new_stmt, loop_vinfo, NULL));
3226 else
3228 vec<constructor_elt, va_gc> *v;
3230 /* iv_loop is the loop to be vectorized. Create:
3231 vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr) */
3232 new_var = vect_get_new_vect_var (scalar_type, vect_scalar_var, "var_");
3233 new_name = force_gimple_operand (init_expr, &stmts, false, new_var);
3234 if (stmts)
3236 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
3237 gcc_assert (!new_bb);
3240 vec_alloc (v, nunits);
3241 bool constant_p = is_gimple_min_invariant (new_name);
3242 CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, new_name);
3243 for (i = 1; i < nunits; i++)
3245 /* Create: new_name_i = new_name + step_expr */
3246 enum tree_code code = POINTER_TYPE_P (scalar_type)
3247 ? POINTER_PLUS_EXPR : PLUS_EXPR;
3248 new_name = fold_build2 (code, scalar_type, new_name, step_expr);
3249 if (!is_gimple_min_invariant (new_name))
3251 init_stmt = gimple_build_assign (new_var, new_name);
3252 new_name = make_ssa_name (new_var, init_stmt);
3253 gimple_assign_set_lhs (init_stmt, new_name);
3254 new_bb = gsi_insert_on_edge_immediate (pe, init_stmt);
3255 gcc_assert (!new_bb);
3256 if (dump_enabled_p ())
3258 dump_printf_loc (MSG_NOTE, vect_location,
3259 "created new init_stmt: ");
3260 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, init_stmt, 0);
3262 constant_p = false;
3264 CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, new_name);
3266 /* Create a vector from [new_name_0, new_name_1, ..., new_name_nunits-1] */
3267 if (constant_p)
3268 new_vec = build_vector_from_ctor (vectype, v);
3269 else
3270 new_vec = build_constructor (vectype, v);
3271 vec_init = vect_init_vector (iv_phi, new_vec, vectype, NULL);
3275 /* Create the vector that holds the step of the induction. */
3276 if (nested_in_vect_loop)
3277 /* iv_loop is nested in the loop to be vectorized. Generate:
3278 vec_step = [S, S, S, S] */
3279 new_name = step_expr;
3280 else
3282 /* iv_loop is the loop to be vectorized. Generate:
3283 vec_step = [VF*S, VF*S, VF*S, VF*S] */
3284 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
3286 expr = build_int_cst (integer_type_node, vf);
3287 expr = fold_convert (TREE_TYPE (step_expr), expr);
3289 else
3290 expr = build_int_cst (TREE_TYPE (step_expr), vf);
3291 new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
3292 expr, step_expr);
3293 if (TREE_CODE (step_expr) == SSA_NAME)
3294 new_name = vect_init_vector (iv_phi, new_name,
3295 TREE_TYPE (step_expr), NULL);
3298 t = unshare_expr (new_name);
3299 gcc_assert (CONSTANT_CLASS_P (new_name)
3300 || TREE_CODE (new_name) == SSA_NAME);
3301 stepvectype = get_vectype_for_scalar_type (TREE_TYPE (new_name));
3302 gcc_assert (stepvectype);
3303 new_vec = build_vector_from_val (stepvectype, t);
3304 vec_step = vect_init_vector (iv_phi, new_vec, stepvectype, NULL);
3307 /* Create the following def-use cycle:
3308 loop prolog:
3309 vec_init = ...
3310 vec_step = ...
3311 loop:
3312 vec_iv = PHI <vec_init, vec_loop>
3314 STMT
3316 vec_loop = vec_iv + vec_step; */
3318 /* Create the induction-phi that defines the induction-operand. */
3319 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
3320 induction_phi = create_phi_node (vec_dest, iv_loop->header);
3321 set_vinfo_for_stmt (induction_phi,
3322 new_stmt_vec_info (induction_phi, loop_vinfo, NULL));
3323 induc_def = PHI_RESULT (induction_phi);
3325 /* Create the iv update inside the loop */
3326 new_stmt = gimple_build_assign_with_ops (PLUS_EXPR, vec_dest,
3327 induc_def, vec_step);
3328 vec_def = make_ssa_name (vec_dest, new_stmt);
3329 gimple_assign_set_lhs (new_stmt, vec_def);
3330 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
3331 set_vinfo_for_stmt (new_stmt, new_stmt_vec_info (new_stmt, loop_vinfo,
3332 NULL));
3334 /* Set the arguments of the phi node: */
3335 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
3336 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
3337 UNKNOWN_LOCATION);
3340 /* In case that vectorization factor (VF) is bigger than the number
3341 of elements that we can fit in a vectype (nunits), we have to generate
3342 more than one vector stmt - i.e - we need to "unroll" the
3343 vector stmt by a factor VF/nunits. For more details see documentation
3344 in vectorizable_operation. */
3346 if (ncopies > 1)
3348 stmt_vec_info prev_stmt_vinfo;
3349 /* FORNOW. This restriction should be relaxed. */
3350 gcc_assert (!nested_in_vect_loop);
3352 /* Create the vector that holds the step of the induction. */
3353 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
3355 expr = build_int_cst (integer_type_node, nunits);
3356 expr = fold_convert (TREE_TYPE (step_expr), expr);
3358 else
3359 expr = build_int_cst (TREE_TYPE (step_expr), nunits);
3360 new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
3361 expr, step_expr);
3362 if (TREE_CODE (step_expr) == SSA_NAME)
3363 new_name = vect_init_vector (iv_phi, new_name,
3364 TREE_TYPE (step_expr), NULL);
3365 t = unshare_expr (new_name);
3366 gcc_assert (CONSTANT_CLASS_P (new_name)
3367 || TREE_CODE (new_name) == SSA_NAME);
3368 new_vec = build_vector_from_val (stepvectype, t);
3369 vec_step = vect_init_vector (iv_phi, new_vec, stepvectype, NULL);
3371 vec_def = induc_def;
3372 prev_stmt_vinfo = vinfo_for_stmt (induction_phi);
3373 for (i = 1; i < ncopies; i++)
3375 /* vec_i = vec_prev + vec_step */
3376 new_stmt = gimple_build_assign_with_ops (PLUS_EXPR, vec_dest,
3377 vec_def, vec_step);
3378 vec_def = make_ssa_name (vec_dest, new_stmt);
3379 gimple_assign_set_lhs (new_stmt, vec_def);
3381 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
3382 if (!useless_type_conversion_p (resvectype, vectype))
3384 new_stmt = gimple_build_assign_with_ops
3385 (VIEW_CONVERT_EXPR,
3386 vect_get_new_vect_var (resvectype, vect_simple_var,
3387 "vec_iv_"),
3388 build1 (VIEW_CONVERT_EXPR, resvectype,
3389 gimple_assign_lhs (new_stmt)), NULL_TREE);
3390 gimple_assign_set_lhs (new_stmt,
3391 make_ssa_name
3392 (gimple_assign_lhs (new_stmt), new_stmt));
3393 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
3395 set_vinfo_for_stmt (new_stmt,
3396 new_stmt_vec_info (new_stmt, loop_vinfo, NULL));
3397 STMT_VINFO_RELATED_STMT (prev_stmt_vinfo) = new_stmt;
3398 prev_stmt_vinfo = vinfo_for_stmt (new_stmt);
3402 if (nested_in_vect_loop)
3404 /* Find the loop-closed exit-phi of the induction, and record
3405 the final vector of induction results: */
3406 exit_phi = NULL;
3407 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
3409 if (!flow_bb_inside_loop_p (iv_loop, gimple_bb (USE_STMT (use_p))))
3411 exit_phi = USE_STMT (use_p);
3412 break;
3415 if (exit_phi)
3417 stmt_vec_info stmt_vinfo = vinfo_for_stmt (exit_phi);
3418 /* FORNOW. Currently not supporting the case that an inner-loop induction
3419 is not used in the outer-loop (i.e. only outside the outer-loop). */
3420 gcc_assert (STMT_VINFO_RELEVANT_P (stmt_vinfo)
3421 && !STMT_VINFO_LIVE_P (stmt_vinfo));
3423 STMT_VINFO_VEC_STMT (stmt_vinfo) = new_stmt;
3424 if (dump_enabled_p ())
3426 dump_printf_loc (MSG_NOTE, vect_location,
3427 "vector of inductions after inner-loop:");
3428 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, new_stmt, 0);
3434 if (dump_enabled_p ())
3436 dump_printf_loc (MSG_NOTE, vect_location,
3437 "transform induction: created def-use cycle: ");
3438 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, induction_phi, 0);
3439 dump_printf (MSG_NOTE, "\n");
3440 dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
3441 SSA_NAME_DEF_STMT (vec_def), 0);
3444 STMT_VINFO_VEC_STMT (phi_info) = induction_phi;
3445 if (!useless_type_conversion_p (resvectype, vectype))
3447 new_stmt = gimple_build_assign_with_ops
3448 (VIEW_CONVERT_EXPR,
3449 vect_get_new_vect_var (resvectype, vect_simple_var, "vec_iv_"),
3450 build1 (VIEW_CONVERT_EXPR, resvectype, induc_def), NULL_TREE);
3451 induc_def = make_ssa_name (gimple_assign_lhs (new_stmt), new_stmt);
3452 gimple_assign_set_lhs (new_stmt, induc_def);
3453 si = gsi_after_labels (bb);
3454 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
3455 set_vinfo_for_stmt (new_stmt,
3456 new_stmt_vec_info (new_stmt, loop_vinfo, NULL));
3457 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (new_stmt))
3458 = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (induction_phi));
3461 return induc_def;
3465 /* Function get_initial_def_for_reduction
3467 Input:
3468 STMT - a stmt that performs a reduction operation in the loop.
3469 INIT_VAL - the initial value of the reduction variable
3471 Output:
3472 ADJUSTMENT_DEF - a tree that holds a value to be added to the final result
3473 of the reduction (used for adjusting the epilog - see below).
3474 Return a vector variable, initialized according to the operation that STMT
3475 performs. This vector will be used as the initial value of the
3476 vector of partial results.
3478 Option1 (adjust in epilog): Initialize the vector as follows:
3479 add/bit or/xor: [0,0,...,0,0]
3480 mult/bit and: [1,1,...,1,1]
3481 min/max/cond_expr: [init_val,init_val,..,init_val,init_val]
3482 and when necessary (e.g. add/mult case) let the caller know
3483 that it needs to adjust the result by init_val.
3485 Option2: Initialize the vector as follows:
3486 add/bit or/xor: [init_val,0,0,...,0]
3487 mult/bit and: [init_val,1,1,...,1]
3488 min/max/cond_expr: [init_val,init_val,...,init_val]
3489 and no adjustments are needed.
3491 For example, for the following code:
3493 s = init_val;
3494 for (i=0;i<n;i++)
3495 s = s + a[i];
3497 STMT is 's = s + a[i]', and the reduction variable is 's'.
3498 For a vector of 4 units, we want to return either [0,0,0,init_val],
3499 or [0,0,0,0] and let the caller know that it needs to adjust
3500 the result at the end by 'init_val'.
3502 FORNOW, we use the 'adjust in epilog' scheme (Option1) when ADJUSTMENT_DEF
3503 is not NULL, because this way the initialization vector is simpler (same
3504 element in all entries), and Option2 otherwise.
3506 A cost model should help decide between these two schemes. */
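/* Concrete example (hypothetical values): for the sum above with
   init_val = 10 and a 4-element vector, Option1 returns [0,0,0,0] and sets
   ADJUSTMENT_DEF to 10 (added back when the epilogue is generated), while
   Option2 would return [0,0,0,10] with no adjustment.  For a MULT_EXPR
   reduction the neutral element is 1, so Option1 would use [1,1,1,1].  */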
3508 tree
3509 get_initial_def_for_reduction (gimple stmt, tree init_val,
3510 tree *adjustment_def)
3512 stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
3513 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
3514 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3515 tree scalar_type = TREE_TYPE (init_val);
3516 tree vectype = get_vectype_for_scalar_type (scalar_type);
3517 int nunits;
3518 enum tree_code code = gimple_assign_rhs_code (stmt);
3519 tree def_for_init;
3520 tree init_def;
3521 tree *elts;
3522 int i;
3523 bool nested_in_vect_loop = false;
3524 tree init_value;
3525 REAL_VALUE_TYPE real_init_val = dconst0;
3526 int int_init_val = 0;
3527 gimple def_stmt = NULL;
3529 gcc_assert (vectype);
3530 nunits = TYPE_VECTOR_SUBPARTS (vectype);
3532 gcc_assert (POINTER_TYPE_P (scalar_type) || INTEGRAL_TYPE_P (scalar_type)
3533 || SCALAR_FLOAT_TYPE_P (scalar_type));
3535 if (nested_in_vect_loop_p (loop, stmt))
3536 nested_in_vect_loop = true;
3537 else
3538 gcc_assert (loop == (gimple_bb (stmt))->loop_father);
3540 /* In case of double reduction we only create a vector variable to be put
3541 in the reduction phi node. The actual statement creation is done in
3542 vect_create_epilog_for_reduction. */
3543 if (adjustment_def && nested_in_vect_loop
3544 && TREE_CODE (init_val) == SSA_NAME
3545 && (def_stmt = SSA_NAME_DEF_STMT (init_val))
3546 && gimple_code (def_stmt) == GIMPLE_PHI
3547 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
3548 && vinfo_for_stmt (def_stmt)
3549 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
3550 == vect_double_reduction_def)
3552 *adjustment_def = NULL;
3553 return vect_create_destination_var (init_val, vectype);
3556 if (TREE_CONSTANT (init_val))
3558 if (SCALAR_FLOAT_TYPE_P (scalar_type))
3559 init_value = build_real (scalar_type, TREE_REAL_CST (init_val));
3560 else
3561 init_value = build_int_cst (scalar_type, TREE_INT_CST_LOW (init_val));
3563 else
3564 init_value = init_val;
3566 switch (code)
3568 case WIDEN_SUM_EXPR:
3569 case DOT_PROD_EXPR:
3570 case PLUS_EXPR:
3571 case MINUS_EXPR:
3572 case BIT_IOR_EXPR:
3573 case BIT_XOR_EXPR:
3574 case MULT_EXPR:
3575 case BIT_AND_EXPR:
3576 /* ADJUSTMENT_DEF is NULL when called from
3577 vect_create_epilog_for_reduction to vectorize double reduction. */
3578 if (adjustment_def)
3580 if (nested_in_vect_loop)
3581 *adjustment_def = vect_get_vec_def_for_operand (init_val, stmt,
3582 NULL);
3583 else
3584 *adjustment_def = init_val;
3587 if (code == MULT_EXPR)
3589 real_init_val = dconst1;
3590 int_init_val = 1;
3593 if (code == BIT_AND_EXPR)
3594 int_init_val = -1;
3596 if (SCALAR_FLOAT_TYPE_P (scalar_type))
3597 def_for_init = build_real (scalar_type, real_init_val);
3598 else
3599 def_for_init = build_int_cst (scalar_type, int_init_val);
3601 /* Create a vector of '0' or '1' except the first element. */
3602 elts = XALLOCAVEC (tree, nunits);
3603 for (i = nunits - 2; i >= 0; --i)
3604 elts[i + 1] = def_for_init;
3606 /* Option1: the first element is '0' or '1' as well. */
3607 if (adjustment_def)
3609 elts[0] = def_for_init;
3610 init_def = build_vector (vectype, elts);
3611 break;
3614 /* Option2: the first element is INIT_VAL. */
3615 elts[0] = init_val;
3616 if (TREE_CONSTANT (init_val))
3617 init_def = build_vector (vectype, elts);
3618 else
3620 vec<constructor_elt, va_gc> *v;
3621 vec_alloc (v, nunits);
3622 CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, init_val);
3623 for (i = 1; i < nunits; ++i)
3624 CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, elts[i]);
3625 init_def = build_constructor (vectype, v);
3628 break;
3630 case MIN_EXPR:
3631 case MAX_EXPR:
3632 case COND_EXPR:
3633 if (adjustment_def)
3635 *adjustment_def = NULL_TREE;
3636 init_def = vect_get_vec_def_for_operand (init_val, stmt, NULL);
3637 break;
3640 init_def = build_vector_from_val (vectype, init_value);
3641 break;
3643 default:
3644 gcc_unreachable ();
3647 return init_def;
3651 /* Function vect_create_epilog_for_reduction
3653 Create code at the loop-epilog to finalize the result of a reduction
3654 computation.
3656 VECT_DEFS is list of vector of partial results, i.e., the lhs's of vector
3657 reduction statements.
3658 STMT is the scalar reduction stmt that is being vectorized.
3659 NCOPIES is > 1 in case the vectorization factor (VF) is bigger than the
3660 number of elements that we can fit in a vectype (nunits). In this case
3661 we have to generate more than one vector stmt - i.e - we need to "unroll"
3662 the vector stmt by a factor VF/nunits. For more details see documentation
3663 in vectorizable_operation.
3664 REDUC_CODE is the tree-code for the epilog reduction.
3665 REDUCTION_PHIS is a list of the phi-nodes that carry the reduction
3666 computation.
3667 REDUC_INDEX is the index of the operand in the right hand side of the
3668 statement that is defined by REDUCTION_PHI.
3669 DOUBLE_REDUC is TRUE if double reduction phi nodes should be handled.
3670 SLP_NODE is an SLP node containing a group of reduction statements. The
3671 first one in this group is STMT.
3673 This function:
3674 1. Creates the reduction def-use cycles: sets the arguments for
3675 REDUCTION_PHIS:
3676 The loop-entry argument is the vectorized initial-value of the reduction.
3677 The loop-latch argument is taken from VECT_DEFS - the vector of partial
3678 sums.
3679 2. "Reduces" each vector of partial results VECT_DEFS into a single result,
3680 by applying the operation specified by REDUC_CODE if available, or by
3681 other means (whole-vector shifts or a scalar loop).
3682 The function also creates a new phi node at the loop exit to preserve
3683 loop-closed form, as illustrated below.
3685 The flow at the entry to this function:
3687 loop:
3688 vec_def = phi <null, null> # REDUCTION_PHI
3689 VECT_DEF = vector_stmt # vectorized form of STMT
3690 s_loop = scalar_stmt # (scalar) STMT
3691 loop_exit:
3692 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
3693 use <s_out0>
3694 use <s_out0>
3696 The above is transformed by this function into:
3698 loop:
3699 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
3700 VECT_DEF = vector_stmt # vectorized form of STMT
3701 s_loop = scalar_stmt # (scalar) STMT
3702 loop_exit:
3703 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
3704 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
3705 v_out2 = reduce <v_out1>
3706 s_out3 = extract_field <v_out2, 0>
3707 s_out4 = adjust_result <s_out3>
3708 use <s_out4>
3709 use <s_out4>
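/* As a concrete illustration (names and the 4-lane width are assumptions,
   not taken from the sources): for

     int sum = init;
     for (i = 0; i < n; i++)
       sum += a[i];

   STMT is "sum_1 = sum_0 + a[i]", REDUCTION_PHI is the phi of sum in the
   loop header, each VECT_DEF holds four partial sums, and the epilog code
   created below folds those lanes into the single scalar that replaces the
   uses of sum after the loop.  */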
3712 static void
3713 vect_create_epilog_for_reduction (vec<tree> vect_defs, gimple stmt,
3714 int ncopies, enum tree_code reduc_code,
3715 vec<gimple> reduction_phis,
3716 int reduc_index, bool double_reduc,
3717 slp_tree slp_node)
3719 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3720 stmt_vec_info prev_phi_info;
3721 tree vectype;
3722 enum machine_mode mode;
3723 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3724 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *outer_loop = NULL;
3725 basic_block exit_bb;
3726 tree scalar_dest;
3727 tree scalar_type;
3728 gimple new_phi = NULL, phi;
3729 gimple_stmt_iterator exit_gsi;
3730 tree vec_dest;
3731 tree new_temp = NULL_TREE, new_dest, new_name, new_scalar_dest;
3732 gimple epilog_stmt = NULL;
3733 enum tree_code code = gimple_assign_rhs_code (stmt);
3734 gimple exit_phi;
3735 tree bitsize, bitpos;
3736 tree adjustment_def = NULL;
3737 tree vec_initial_def = NULL;
3738 tree reduction_op, expr, def;
3739 tree orig_name, scalar_result;
3740 imm_use_iterator imm_iter, phi_imm_iter;
3741 use_operand_p use_p, phi_use_p;
3742 bool extract_scalar_result = false;
3743 gimple use_stmt, orig_stmt, reduction_phi = NULL;
3744 bool nested_in_vect_loop = false;
3745 vec<gimple> new_phis = vNULL;
3746 vec<gimple> inner_phis = vNULL;
3747 enum vect_def_type dt = vect_unknown_def_type;
3748 int j, i;
3749 vec<tree> scalar_results = vNULL;
3750 unsigned int group_size = 1, k, ratio;
3751 vec<tree> vec_initial_defs = vNULL;
3752 vec<gimple> phis;
3753 bool slp_reduc = false;
3754 tree new_phi_result;
3755 gimple inner_phi = NULL;
3757 if (slp_node)
3758 group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
3760 if (nested_in_vect_loop_p (loop, stmt))
3762 outer_loop = loop;
3763 loop = loop->inner;
3764 nested_in_vect_loop = true;
3765 gcc_assert (!slp_node);
3768 switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
3770 case GIMPLE_SINGLE_RHS:
3771 gcc_assert (TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt))
3772 == ternary_op);
3773 reduction_op = TREE_OPERAND (gimple_assign_rhs1 (stmt), reduc_index);
3774 break;
3775 case GIMPLE_UNARY_RHS:
3776 reduction_op = gimple_assign_rhs1 (stmt);
3777 break;
3778 case GIMPLE_BINARY_RHS:
3779 reduction_op = reduc_index ?
3780 gimple_assign_rhs2 (stmt) : gimple_assign_rhs1 (stmt);
3781 break;
3782 case GIMPLE_TERNARY_RHS:
3783 reduction_op = gimple_op (stmt, reduc_index + 1);
3784 break;
3785 default:
3786 gcc_unreachable ();
3789 vectype = get_vectype_for_scalar_type (TREE_TYPE (reduction_op));
3790 gcc_assert (vectype);
3791 mode = TYPE_MODE (vectype);
3793 /* 1. Create the reduction def-use cycle:
3794 Set the arguments of REDUCTION_PHIS, i.e., transform
3796 loop:
3797 vec_def = phi <null, null> # REDUCTION_PHI
3798 VECT_DEF = vector_stmt # vectorized form of STMT
3801 into:
3803 loop:
3804 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
3805 VECT_DEF = vector_stmt # vectorized form of STMT
3808 (in case of SLP, do it for all the phis). */
3810 /* Get the loop-entry arguments. */
3811 if (slp_node)
3812 vect_get_vec_defs (reduction_op, NULL_TREE, stmt, &vec_initial_defs,
3813 NULL, slp_node, reduc_index);
3814 else
3816 vec_initial_defs.create (1);
3817 /* For the case of reduction, vect_get_vec_def_for_operand returns
3818 the scalar def before the loop, that defines the initial value
3819 of the reduction variable. */
3820 vec_initial_def = vect_get_vec_def_for_operand (reduction_op, stmt,
3821 &adjustment_def);
3822 vec_initial_defs.quick_push (vec_initial_def);
3825 /* Set phi nodes arguments. */
3826 FOR_EACH_VEC_ELT (reduction_phis, i, phi)
3828 tree vec_init_def = vec_initial_defs[i];
3829 tree def = vect_defs[i];
3830 for (j = 0; j < ncopies; j++)
3832 /* Set the loop-entry arg of the reduction-phi. */
3833 add_phi_arg (phi, vec_init_def, loop_preheader_edge (loop),
3834 UNKNOWN_LOCATION);
3836 /* Set the loop-latch arg for the reduction-phi. */
3837 if (j > 0)
3838 def = vect_get_vec_def_for_stmt_copy (vect_unknown_def_type, def);
3840 add_phi_arg (phi, def, loop_latch_edge (loop), UNKNOWN_LOCATION);
3842 if (dump_enabled_p ())
3844 dump_printf_loc (MSG_NOTE, vect_location,
3845 "transform reduction: created def-use cycle: ");
3846 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
3847 dump_printf (MSG_NOTE, "\n");
3848 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, SSA_NAME_DEF_STMT (def), 0);
3851 phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi));
3855 vec_initial_defs.release ();
3857 /* 2. Create epilog code.
3858 The reduction epilog code operates across the elements of the vector
3859 of partial results computed by the vectorized loop.
3860 The reduction epilog code consists of:
3862 step 1: compute the scalar result in a vector (v_out2)
3863 step 2: extract the scalar result (s_out3) from the vector (v_out2)
3864 step 3: adjust the scalar result (s_out3) if needed.
3866 Step 1 can be accomplished using one of the following three schemes:
3867 (scheme 1) using reduc_code, if available.
3868 (scheme 2) using whole-vector shifts, if available.
3869 (scheme 3) using a scalar loop. In this case steps 1+2 above are
3870 combined.
3872 The overall epilog code looks like this:
3874 s_out0 = phi <s_loop> # original EXIT_PHI
3875 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
3876 v_out2 = reduce <v_out1> # step 1
3877 s_out3 = extract_field <v_out2, 0> # step 2
3878 s_out4 = adjust_result <s_out3> # step 3
3880 (step 3 is optional, and steps 1 and 2 may be combined).
3881 Lastly, the uses of s_out0 are replaced by s_out4. */
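/* Rough sketch of scheme 2 (illustrative only; lane placement assumes a
   little-endian target and a 4-lane plus reduction {a,b,c,d}):

     t1 = vec_shift <{a,b,c,d}, VS/2>     --> { c, d, _, _ }
     t2 = {a,b,c,d} + t1                  --> { a+c, b+d, _, _ }
     t3 = vec_shift <t2, VS/4>            --> { b+d, _, _, _ }
     t4 = t2 + t3                         --> { a+b+c+d, _, _, _ }

   i.e. log2(nunits) shift/op rounds; '_' lanes are ignored.  Scheme 1 does
   the same with a single target reduction operation, and scheme 3 extracts
   every lane and combines them with scalar ops.  */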
3884 /* 2.1 Create new loop-exit-phis to preserve loop-closed form:
3885 v_out1 = phi <VECT_DEF>
3886 Store them in NEW_PHIS. */
3888 exit_bb = single_exit (loop)->dest;
3889 prev_phi_info = NULL;
3890 new_phis.create (vect_defs.length ());
3891 FOR_EACH_VEC_ELT (vect_defs, i, def)
3893 for (j = 0; j < ncopies; j++)
3895 tree new_def = copy_ssa_name (def, NULL);
3896 phi = create_phi_node (new_def, exit_bb);
3897 set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, loop_vinfo, NULL));
3898 if (j == 0)
3899 new_phis.quick_push (phi);
3900 else
3902 def = vect_get_vec_def_for_stmt_copy (dt, def);
3903 STMT_VINFO_RELATED_STMT (prev_phi_info) = phi;
3906 SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, def);
3907 prev_phi_info = vinfo_for_stmt (phi);
3911 /* The epilogue is created for the outer-loop, i.e., for the loop being
3912 vectorized. Create exit phis for the outer loop. */
3913 if (double_reduc)
3915 loop = outer_loop;
3916 exit_bb = single_exit (loop)->dest;
3917 inner_phis.create (vect_defs.length ());
3918 FOR_EACH_VEC_ELT (new_phis, i, phi)
3920 tree new_result = copy_ssa_name (PHI_RESULT (phi), NULL);
3921 gimple outer_phi = create_phi_node (new_result, exit_bb);
3922 SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx,
3923 PHI_RESULT (phi));
3924 set_vinfo_for_stmt (outer_phi, new_stmt_vec_info (outer_phi,
3925 loop_vinfo, NULL));
3926 inner_phis.quick_push (phi);
3927 new_phis[i] = outer_phi;
3928 prev_phi_info = vinfo_for_stmt (outer_phi);
3929 while (STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi)))
3931 phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi));
3932 new_result = copy_ssa_name (PHI_RESULT (phi), NULL);
3933 outer_phi = create_phi_node (new_result, exit_bb);
3934 SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx,
3935 PHI_RESULT (phi));
3936 set_vinfo_for_stmt (outer_phi, new_stmt_vec_info (outer_phi,
3937 loop_vinfo, NULL));
3938 STMT_VINFO_RELATED_STMT (prev_phi_info) = outer_phi;
3939 prev_phi_info = vinfo_for_stmt (outer_phi);
3944 exit_gsi = gsi_after_labels (exit_bb);
3946 /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
3947 (i.e. when reduc_code is not available) and in the final adjustment
3948 code (if needed). Also get the original scalar reduction variable as
3949 defined in the loop. In case STMT is a "pattern-stmt" (i.e. - it
3950 represents a reduction pattern), the tree-code and scalar-def are
3951 taken from the original stmt that the pattern-stmt (STMT) replaces.
3952 Otherwise (it is a regular reduction) - the tree-code and scalar-def
3953 are taken from STMT. */
3955 orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
3956 if (!orig_stmt)
3958 /* Regular reduction */
3959 orig_stmt = stmt;
3961 else
3963 /* Reduction pattern */
3964 stmt_vec_info stmt_vinfo = vinfo_for_stmt (orig_stmt);
3965 gcc_assert (STMT_VINFO_IN_PATTERN_P (stmt_vinfo));
3966 gcc_assert (STMT_VINFO_RELATED_STMT (stmt_vinfo) == stmt);
3969 code = gimple_assign_rhs_code (orig_stmt);
3970 /* For MINUS_EXPR the initial vector is [init_val,0,...,0], therefore,
3971 partial results are added and not subtracted. */
3972 if (code == MINUS_EXPR)
3973 code = PLUS_EXPR;
3975 scalar_dest = gimple_assign_lhs (orig_stmt);
3976 scalar_type = TREE_TYPE (scalar_dest);
3977 scalar_results.create (group_size);
3978 new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
3979 bitsize = TYPE_SIZE (scalar_type);
3981 /* In case this is a reduction in an inner-loop while vectorizing an outer
3982 loop - we don't need to extract a single scalar result at the end of the
3983 inner-loop (unless it is a double reduction, i.e., the use of the reduction is
3984 outside the outer-loop). The final vector of partial results will be used
3985 in the vectorized outer-loop, or reduced to a scalar result at the end of
3986 the outer-loop. */
3987 if (nested_in_vect_loop && !double_reduc)
3988 goto vect_finalize_reduction;
3990 /* SLP reduction without reduction chain, e.g.,
3991 # a1 = phi <a2, a0>
3992 # b1 = phi <b2, b0>
3993 a2 = operation (a1)
3994 b2 = operation (b1) */
3995 slp_reduc = (slp_node && !GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)));
3997 /* In case of reduction chain, e.g.,
3998 # a1 = phi <a3, a0>
3999 a2 = operation (a1)
4000 a3 = operation (a2),
4002 we may end up with more than one vector result. Here we reduce them to
4003 one vector. */
4004 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
4006 tree first_vect = PHI_RESULT (new_phis[0]);
4007 tree tmp;
4008 gimple new_vec_stmt = NULL;
4010 vec_dest = vect_create_destination_var (scalar_dest, vectype);
4011 for (k = 1; k < new_phis.length (); k++)
4013 gimple next_phi = new_phis[k];
4014 tree second_vect = PHI_RESULT (next_phi);
4016 tmp = build2 (code, vectype, first_vect, second_vect);
4017 new_vec_stmt = gimple_build_assign (vec_dest, tmp);
4018 first_vect = make_ssa_name (vec_dest, new_vec_stmt);
4019 gimple_assign_set_lhs (new_vec_stmt, first_vect);
4020 gsi_insert_before (&exit_gsi, new_vec_stmt, GSI_SAME_STMT);
4023 new_phi_result = first_vect;
4024 if (new_vec_stmt)
4026 new_phis.truncate (0);
4027 new_phis.safe_push (new_vec_stmt);
4030 else
4031 new_phi_result = PHI_RESULT (new_phis[0]);
4033 /* 2.3 Create the reduction code, using one of the three schemes described
4034 above. In SLP we simply need to extract all the elements from the
4035 vector (without reducing them), so we use scalar shifts. */
4036 if (reduc_code != ERROR_MARK && !slp_reduc)
4038 tree tmp;
4040 /*** Case 1: Create:
4041 v_out2 = reduc_expr <v_out1> */
4043 if (dump_enabled_p ())
4044 dump_printf_loc (MSG_NOTE, vect_location,
4045 "Reduce using direct vector reduction.");
4047 vec_dest = vect_create_destination_var (scalar_dest, vectype);
4048 tmp = build1 (reduc_code, vectype, new_phi_result);
4049 epilog_stmt = gimple_build_assign (vec_dest, tmp);
4050 new_temp = make_ssa_name (vec_dest, epilog_stmt);
4051 gimple_assign_set_lhs (epilog_stmt, new_temp);
4052 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4054 extract_scalar_result = true;
4056 else
4058 enum tree_code shift_code = ERROR_MARK;
4059 bool have_whole_vector_shift = true;
4060 int bit_offset;
4061 int element_bitsize = tree_low_cst (bitsize, 1);
4062 int vec_size_in_bits = tree_low_cst (TYPE_SIZE (vectype), 1);
4063 tree vec_temp;
4065 if (optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
4066 shift_code = VEC_RSHIFT_EXPR;
4067 else
4068 have_whole_vector_shift = false;
4070 /* Regardless of whether we have a whole vector shift, if we're
4071 emulating the operation via tree-vect-generic, we don't want
4072 to use it. Only the first round of the reduction is likely
4073 to still be profitable via emulation. */
4074 /* ??? It might be better to emit a reduction tree code here, so that
4075 tree-vect-generic can expand the first round via bit tricks. */
4076 if (!VECTOR_MODE_P (mode))
4077 have_whole_vector_shift = false;
4078 else
4080 optab optab = optab_for_tree_code (code, vectype, optab_default);
4081 if (optab_handler (optab, mode) == CODE_FOR_nothing)
4082 have_whole_vector_shift = false;
4085 if (have_whole_vector_shift && !slp_reduc)
4087 /*** Case 2: Create:
4088 for (offset = VS/2; offset >= element_size; offset/=2)
4090 Create: va' = vec_shift <va, offset>
4091 Create: va = vop <va, va'>
4092 } */
4094 if (dump_enabled_p ())
4095 dump_printf_loc (MSG_NOTE, vect_location,
4096 "Reduce using vector shifts");
4098 vec_dest = vect_create_destination_var (scalar_dest, vectype);
4099 new_temp = new_phi_result;
4100 for (bit_offset = vec_size_in_bits/2;
4101 bit_offset >= element_bitsize;
4102 bit_offset /= 2)
4104 tree bitpos = size_int (bit_offset);
4106 epilog_stmt = gimple_build_assign_with_ops (shift_code,
4107 vec_dest, new_temp, bitpos);
4108 new_name = make_ssa_name (vec_dest, epilog_stmt);
4109 gimple_assign_set_lhs (epilog_stmt, new_name);
4110 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4112 epilog_stmt = gimple_build_assign_with_ops (code, vec_dest,
4113 new_name, new_temp);
4114 new_temp = make_ssa_name (vec_dest, epilog_stmt);
4115 gimple_assign_set_lhs (epilog_stmt, new_temp);
4116 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4119 extract_scalar_result = true;
4121 else
4123 tree rhs;
4125 /*** Case 3: Create:
4126 s = extract_field <v_out2, 0>
4127 for (offset = element_size;
4128 offset < vector_size;
4129 offset += element_size;)
4131 Create: s' = extract_field <v_out2, offset>
4132 Create: s = op <s, s'> // For non SLP cases
4133 } */
4135 if (dump_enabled_p ())
4136 dump_printf_loc (MSG_NOTE, vect_location,
4137 "Reduce using scalar code. ");
4139 vec_size_in_bits = tree_low_cst (TYPE_SIZE (vectype), 1);
4140 FOR_EACH_VEC_ELT (new_phis, i, new_phi)
4142 if (gimple_code (new_phi) == GIMPLE_PHI)
4143 vec_temp = PHI_RESULT (new_phi);
4144 else
4145 vec_temp = gimple_assign_lhs (new_phi);
4146 rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, bitsize,
4147 bitsize_zero_node);
4148 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
4149 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
4150 gimple_assign_set_lhs (epilog_stmt, new_temp);
4151 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4153 /* In SLP we don't need to apply reduction operation, so we just
4154 collect s' values in SCALAR_RESULTS. */
4155 if (slp_reduc)
4156 scalar_results.safe_push (new_temp);
4158 for (bit_offset = element_bitsize;
4159 bit_offset < vec_size_in_bits;
4160 bit_offset += element_bitsize)
4162 tree bitpos = bitsize_int (bit_offset);
4163 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp,
4164 bitsize, bitpos);
4166 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
4167 new_name = make_ssa_name (new_scalar_dest, epilog_stmt);
4168 gimple_assign_set_lhs (epilog_stmt, new_name);
4169 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4171 if (slp_reduc)
4173 /* In SLP we don't need to apply reduction operation, so
4174 we just collect s' values in SCALAR_RESULTS. */
4175 new_temp = new_name;
4176 scalar_results.safe_push (new_name);
4178 else
4180 epilog_stmt = gimple_build_assign_with_ops (code,
4181 new_scalar_dest, new_name, new_temp);
4182 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
4183 gimple_assign_set_lhs (epilog_stmt, new_temp);
4184 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4189 /* The only case where we need to reduce scalar results in SLP is
4190 unrolling. If the size of SCALAR_RESULTS is greater than
4191 GROUP_SIZE, we reduce them by combining elements modulo
4192 GROUP_SIZE. */
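/* Illustrative example (made-up values): with GROUP_SIZE == 2 and an
   unrolled SLP instance that produced four scalar results r0..r3, the
   loop below combines them modulo GROUP_SIZE:

     scalar_results[0] = r0 op r2;
     scalar_results[1] = r1 op r3;

   leaving exactly GROUP_SIZE final scalars.  */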
4193 if (slp_reduc)
4195 tree res, first_res, new_res;
4196 gimple new_stmt;
4198 /* Reduce multiple scalar results in case of SLP unrolling. */
4199 for (j = group_size; scalar_results.iterate (j, &res);
4200 j++)
4202 first_res = scalar_results[j % group_size];
4203 new_stmt = gimple_build_assign_with_ops (code,
4204 new_scalar_dest, first_res, res);
4205 new_res = make_ssa_name (new_scalar_dest, new_stmt);
4206 gimple_assign_set_lhs (new_stmt, new_res);
4207 gsi_insert_before (&exit_gsi, new_stmt, GSI_SAME_STMT);
4208 scalar_results[j % group_size] = new_res;
4211 else
4212 /* Not SLP - we have one scalar to keep in SCALAR_RESULTS. */
4213 scalar_results.safe_push (new_temp);
4215 extract_scalar_result = false;
4219 /* 2.4 Extract the final scalar result. Create:
4220 s_out3 = extract_field <v_out2, bitpos> */
4222 if (extract_scalar_result)
4224 tree rhs;
4226 if (dump_enabled_p ())
4227 dump_printf_loc (MSG_NOTE, vect_location,
4228 "extract scalar result");
4230 if (BYTES_BIG_ENDIAN)
4231 bitpos = size_binop (MULT_EXPR,
4232 bitsize_int (TYPE_VECTOR_SUBPARTS (vectype) - 1),
4233 TYPE_SIZE (scalar_type));
4234 else
4235 bitpos = bitsize_zero_node;
4237 rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp, bitsize, bitpos);
4238 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
4239 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
4240 gimple_assign_set_lhs (epilog_stmt, new_temp);
4241 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4242 scalar_results.safe_push (new_temp);
4245 vect_finalize_reduction:
4247 if (double_reduc)
4248 loop = loop->inner;
4250 /* 2.5 Adjust the final result by the initial value of the reduction
4251 variable. (When such adjustment is not needed, then
4252 'adjustment_def' is zero). For example, if code is PLUS we create:
4253 new_temp = loop_exit_def + adjustment_def */
4255 if (adjustment_def)
4257 gcc_assert (!slp_reduc);
4258 if (nested_in_vect_loop)
4260 new_phi = new_phis[0];
4261 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) == VECTOR_TYPE);
4262 expr = build2 (code, vectype, PHI_RESULT (new_phi), adjustment_def);
4263 new_dest = vect_create_destination_var (scalar_dest, vectype);
4265 else
4267 new_temp = scalar_results[0];
4268 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
4269 expr = build2 (code, scalar_type, new_temp, adjustment_def);
4270 new_dest = vect_create_destination_var (scalar_dest, scalar_type);
4273 epilog_stmt = gimple_build_assign (new_dest, expr);
4274 new_temp = make_ssa_name (new_dest, epilog_stmt);
4275 gimple_assign_set_lhs (epilog_stmt, new_temp);
4276 SSA_NAME_DEF_STMT (new_temp) = epilog_stmt;
4277 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4278 if (nested_in_vect_loop)
4280 set_vinfo_for_stmt (epilog_stmt,
4281 new_stmt_vec_info (epilog_stmt, loop_vinfo,
4282 NULL));
4283 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (epilog_stmt)) =
4284 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (new_phi));
4286 if (!double_reduc)
4287 scalar_results.quick_push (new_temp);
4288 else
4289 scalar_results[0] = new_temp;
4291 else
4292 scalar_results[0] = new_temp;
4294 new_phis[0] = epilog_stmt;
4297 /* 2.6 Handle the loop-exit phis. Replace the uses of scalar loop-exit
4298 phis with new adjusted scalar results, i.e., replace use <s_out0>
4299 with use <s_out4>.
4301 Transform:
4302 loop_exit:
4303 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
4304 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
4305 v_out2 = reduce <v_out1>
4306 s_out3 = extract_field <v_out2, 0>
4307 s_out4 = adjust_result <s_out3>
4308 use <s_out0>
4309 use <s_out0>
4311 into:
4313 loop_exit:
4314 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
4315 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
4316 v_out2 = reduce <v_out1>
4317 s_out3 = extract_field <v_out2, 0>
4318 s_out4 = adjust_result <s_out3>
4319 use <s_out4>
4320 use <s_out4> */
4323 /* In an SLP reduction chain we reduce the vector results into one vector if
4324 necessary; hence we set GROUP_SIZE to 1 here. SCALAR_DEST is the LHS of
4325 the last stmt in the reduction chain, since we are looking for the loop
4326 exit phi node. */
4327 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
4329 scalar_dest = gimple_assign_lhs (
4330 SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1]);
4331 group_size = 1;
4334 /* In SLP we may have several statements in NEW_PHIS and REDUCTION_PHIS (in
4335 case GROUP_SIZE is greater than the vectorization factor). Therefore, we
4336 need to match SCALAR_RESULTS with corresponding statements. The first
4337 (GROUP_SIZE / number of new vector stmts) scalar results correspond to
4338 the first vector stmt, etc.
4339 (RATIO is equal to (GROUP_SIZE / number of new vector stmts)). */
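/* E.g. (hypothetical numbers): GROUP_SIZE == 4 with two entries in
   NEW_PHIS gives RATIO == 2, so scalar results 0-1 are matched with the
   first vector stmt/phi and scalar results 2-3 with the second.  */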
4340 if (group_size > new_phis.length ())
4342 ratio = group_size / new_phis.length ();
4343 gcc_assert (!(group_size % new_phis.length ()));
4345 else
4346 ratio = 1;
4348 for (k = 0; k < group_size; k++)
4350 if (k % ratio == 0)
4352 epilog_stmt = new_phis[k / ratio];
4353 reduction_phi = reduction_phis[k / ratio];
4354 if (double_reduc)
4355 inner_phi = inner_phis[k / ratio];
4358 if (slp_reduc)
4360 gimple current_stmt = SLP_TREE_SCALAR_STMTS (slp_node)[k];
4362 orig_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (current_stmt));
4363 /* SLP statements can't participate in patterns. */
4364 gcc_assert (!orig_stmt);
4365 scalar_dest = gimple_assign_lhs (current_stmt);
4368 phis.create (3);
4369 /* Find the loop-closed-use at the loop exit of the original scalar
4370 result. (The reduction result is expected to have two immediate uses -
4371 one at the latch block, and one at the loop exit). */
4372 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
4373 if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
4374 phis.safe_push (USE_STMT (use_p));
4376 /* We expect to have found an exit_phi because of loop-closed-ssa
4377 form. */
4378 gcc_assert (!phis.is_empty ());
4380 FOR_EACH_VEC_ELT (phis, i, exit_phi)
4382 if (outer_loop)
4384 stmt_vec_info exit_phi_vinfo = vinfo_for_stmt (exit_phi);
4385 gimple vect_phi;
4387 /* FORNOW. Currently not supporting the case that an inner-loop
4388 reduction is not used in the outer-loop (but only outside the
4389 outer-loop), unless it is a double reduction. */
4390 gcc_assert ((STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
4391 && !STMT_VINFO_LIVE_P (exit_phi_vinfo))
4392 || double_reduc);
4394 STMT_VINFO_VEC_STMT (exit_phi_vinfo) = epilog_stmt;
4395 if (!double_reduc
4396 || STMT_VINFO_DEF_TYPE (exit_phi_vinfo)
4397 != vect_double_reduction_def)
4398 continue;
4400 /* Handle double reduction:
4402 stmt1: s1 = phi <s0, s2> - double reduction phi (outer loop)
4403 stmt2: s3 = phi <s1, s4> - (regular) reduc phi (inner loop)
4404 stmt3: s4 = use (s3) - (regular) reduc stmt (inner loop)
4405 stmt4: s2 = phi <s4> - double reduction stmt (outer loop)
4407 At that point the regular reduction (stmt2 and stmt3) is
4408 already vectorized, as well as the exit phi node, stmt4.
4409 Here we vectorize the phi node of double reduction, stmt1, and
4410 update all relevant statements. */
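/* In source terms this corresponds, roughly, to a nested accumulation
   such as (illustrative only):

     s = s0;
     for (i = 0; i < n; i++)        <-- outer loop
       for (j = 0; j < m; j++)      <-- inner loop
         s = s + a[i][j];

   where stmt1 is the phi of s in the outer-loop header and stmt2/stmt3
   form the ordinary inner-loop reduction.  */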
4412 /* Go through all the uses of s2 to find double reduction phi
4413 node, i.e., stmt1 above. */
4414 orig_name = PHI_RESULT (exit_phi);
4415 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
4417 stmt_vec_info use_stmt_vinfo;
4418 stmt_vec_info new_phi_vinfo;
4419 tree vect_phi_init, preheader_arg, vect_phi_res, init_def;
4420 basic_block bb = gimple_bb (use_stmt);
4421 gimple use;
4423 /* Check that USE_STMT is really double reduction phi
4424 node. */
4425 if (gimple_code (use_stmt) != GIMPLE_PHI
4426 || gimple_phi_num_args (use_stmt) != 2
4427 || bb->loop_father != outer_loop)
4428 continue;
4429 use_stmt_vinfo = vinfo_for_stmt (use_stmt);
4430 if (!use_stmt_vinfo
4431 || STMT_VINFO_DEF_TYPE (use_stmt_vinfo)
4432 != vect_double_reduction_def)
4433 continue;
4435 /* Create vector phi node for double reduction:
4436 vs1 = phi <vs0, vs2>
4437 vs1 was created previously in this function by a call to
4438 vect_get_vec_def_for_operand and is stored in
4439 vec_initial_def;
4440 vs2 is defined by INNER_PHI, the vectorized EXIT_PHI;
4441 vs0 is created here. */
4443 /* Create vector phi node. */
4444 vect_phi = create_phi_node (vec_initial_def, bb);
4445 new_phi_vinfo = new_stmt_vec_info (vect_phi,
4446 loop_vec_info_for_loop (outer_loop), NULL);
4447 set_vinfo_for_stmt (vect_phi, new_phi_vinfo);
4449 /* Create vs0 - initial def of the double reduction phi. */
4450 preheader_arg = PHI_ARG_DEF_FROM_EDGE (use_stmt,
4451 loop_preheader_edge (outer_loop));
4452 init_def = get_initial_def_for_reduction (stmt,
4453 preheader_arg, NULL);
4454 vect_phi_init = vect_init_vector (use_stmt, init_def,
4455 vectype, NULL);
4457 /* Update phi node arguments with vs0 and vs2. */
4458 add_phi_arg (vect_phi, vect_phi_init,
4459 loop_preheader_edge (outer_loop),
4460 UNKNOWN_LOCATION);
4461 add_phi_arg (vect_phi, PHI_RESULT (inner_phi),
4462 loop_latch_edge (outer_loop), UNKNOWN_LOCATION);
4463 if (dump_enabled_p ())
4465 dump_printf_loc (MSG_NOTE, vect_location,
4466 "created double reduction phi node: ");
4467 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, vect_phi, 0);
4470 vect_phi_res = PHI_RESULT (vect_phi);
4472 /* Replace the use, i.e., set the correct vs1 in the regular
4473 reduction phi node. FORNOW, NCOPIES is always 1, so the
4474 loop is redundant. */
4475 use = reduction_phi;
4476 for (j = 0; j < ncopies; j++)
4478 edge pr_edge = loop_preheader_edge (loop);
4479 SET_PHI_ARG_DEF (use, pr_edge->dest_idx, vect_phi_res);
4480 use = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (use));
4486 phis.release ();
4487 if (nested_in_vect_loop)
4489 if (double_reduc)
4490 loop = outer_loop;
4491 else
4492 continue;
4495 phis.create (3);
4496 /* Find the loop-closed-use at the loop exit of the original scalar
4497 result. (The reduction result is expected to have two immediate uses,
4498 one at the latch block, and one at the loop exit). For double
4499 reductions we are looking for exit phis of the outer loop. */
4500 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
4502 if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
4503 phis.safe_push (USE_STMT (use_p));
4504 else
4506 if (double_reduc && gimple_code (USE_STMT (use_p)) == GIMPLE_PHI)
4508 tree phi_res = PHI_RESULT (USE_STMT (use_p));
4510 FOR_EACH_IMM_USE_FAST (phi_use_p, phi_imm_iter, phi_res)
4512 if (!flow_bb_inside_loop_p (loop,
4513 gimple_bb (USE_STMT (phi_use_p))))
4514 phis.safe_push (USE_STMT (phi_use_p));
4520 FOR_EACH_VEC_ELT (phis, i, exit_phi)
4522 /* Replace the uses: */
4523 orig_name = PHI_RESULT (exit_phi);
4524 scalar_result = scalar_results[k];
4525 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
4526 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
4527 SET_USE (use_p, scalar_result);
4530 phis.release ();
4533 scalar_results.release ();
4534 inner_phis.release ();
4535 new_phis.release ();
4539 /* Function vectorizable_reduction.
4541 Check if STMT performs a reduction operation that can be vectorized.
4542 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
4543 stmt to replace it, put it in VEC_STMT, and insert it at GSI.
4544 Return FALSE if not a vectorizable STMT, TRUE otherwise.
4546 This function also handles reduction idioms (patterns) that have been
4547 recognized in advance during vect_pattern_recog. In this case, STMT may be
4548 of this form:
4549 X = pattern_expr (arg0, arg1, ..., X)
4550 and its STMT_VINFO_RELATED_STMT points to the last stmt in the original
4551 sequence that had been detected and replaced by the pattern-stmt (STMT).
4553 In some cases of reduction patterns, the type of the reduction variable X is
4554 different than the type of the other arguments of STMT.
4555 In such cases, the vectype that is used when transforming STMT into a vector
4556 stmt is different than the vectype that is used to determine the
4557 vectorization factor, because it consists of a different number of elements
4558 than the actual number of elements that are being operated upon in parallel.
4560 For example, consider an accumulation of shorts into an int accumulator.
4561 On some targets it's possible to vectorize this pattern operating on 8
4562 shorts at a time (hence, the vectype for purposes of determining the
4563 vectorization factor should be V8HI); on the other hand, the vectype that
4564 is used to create the vector form is actually V4SI (the type of the result).
4566 Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
4567 indicates the actual level of parallelism (V8HI in the example), so
4568 that the right vectorization factor can be derived. This vectype
4569 corresponds to the type of arguments to the reduction stmt, and should *NOT*
4570 be used to create the vectorized stmt. The right vectype for the vectorized
4571 stmt is obtained from the type of the result X:
4572 get_vectype_for_scalar_type (TREE_TYPE (X))
4574 This means that, contrary to "regular" reductions (or "regular" stmts in
4575 general), the following equation:
4576 STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (TREE_TYPE (X))
4577 does *NOT* necessarily hold for reduction patterns. */
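/* A concrete instance of the above (vector modes are target-dependent
   assumptions):

     short b[N]; int sum = 0;
     for (i = 0; i < N; i++)
       sum += b[i];                   <-- recognized as a widen-sum pattern

   STMT_VINFO_VECTYPE would be V8HI (eight shorts processed per vector
   iteration, fixing the vectorization factor at 8), while the vectorized
   stmt produces the int accumulator and therefore uses
   get_vectype_for_scalar_type (int), e.g. V4SI.  */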
4579 bool
4580 vectorizable_reduction (gimple stmt, gimple_stmt_iterator *gsi,
4581 gimple *vec_stmt, slp_tree slp_node)
4583 tree vec_dest;
4584 tree scalar_dest;
4585 tree loop_vec_def0 = NULL_TREE, loop_vec_def1 = NULL_TREE;
4586 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4587 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
4588 tree vectype_in = NULL_TREE;
4589 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4590 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4591 enum tree_code code, orig_code, epilog_reduc_code;
4592 enum machine_mode vec_mode;
4593 int op_type;
4594 optab optab, reduc_optab;
4595 tree new_temp = NULL_TREE;
4596 tree def;
4597 gimple def_stmt;
4598 enum vect_def_type dt;
4599 gimple new_phi = NULL;
4600 tree scalar_type;
4601 bool is_simple_use;
4602 gimple orig_stmt;
4603 stmt_vec_info orig_stmt_info;
4604 tree expr = NULL_TREE;
4605 int i;
4606 int ncopies;
4607 int epilog_copies;
4608 stmt_vec_info prev_stmt_info, prev_phi_info;
4609 bool single_defuse_cycle = false;
4610 tree reduc_def = NULL_TREE;
4611 gimple new_stmt = NULL;
4612 int j;
4613 tree ops[3];
4614 bool nested_cycle = false, found_nested_cycle_def = false;
4615 gimple reduc_def_stmt = NULL;
4616 /* The default is that the reduction variable is the last operand in the statement. */
4617 int reduc_index = 2;
4618 bool double_reduc = false, dummy;
4619 basic_block def_bb;
4620 struct loop * def_stmt_loop, *outer_loop = NULL;
4621 tree def_arg;
4622 gimple def_arg_stmt;
4623 vec<tree> vec_oprnds0 = vNULL;
4624 vec<tree> vec_oprnds1 = vNULL;
4625 vec<tree> vect_defs = vNULL;
4626 vec<gimple> phis = vNULL;
4627 int vec_num;
4628 tree def0, def1, tem, op0, op1 = NULL_TREE;
4630 /* In case of reduction chain we switch to the first stmt in the chain, but
4631 we don't update STMT_INFO, since only the last stmt is marked as reduction
4632 and has reduction properties. */
4633 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
4634 stmt = GROUP_FIRST_ELEMENT (stmt_info);
4636 if (nested_in_vect_loop_p (loop, stmt))
4638 outer_loop = loop;
4639 loop = loop->inner;
4640 nested_cycle = true;
4643 /* 1. Is vectorizable reduction? */
4644 /* Not supportable if the reduction variable is used in the loop, unless
4645 it's a reduction chain. */
4646 if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer
4647 && !GROUP_FIRST_ELEMENT (stmt_info))
4648 return false;
4650 /* Reductions that are not used even in an enclosing outer-loop
4651 are expected to be "live" (used out of the loop). */
4652 if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope
4653 && !STMT_VINFO_LIVE_P (stmt_info))
4654 return false;
4656 /* Make sure it was already recognized as a reduction computation. */
4657 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def
4658 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_nested_cycle)
4659 return false;
4661 /* 2. Has this been recognized as a reduction pattern?
4663 Check if STMT represents a pattern that has been recognized
4664 in earlier analysis stages. For stmts that represent a pattern,
4665 the STMT_VINFO_RELATED_STMT field records the last stmt in
4666 the original sequence that constitutes the pattern. */
4668 orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
4669 if (orig_stmt)
4671 orig_stmt_info = vinfo_for_stmt (orig_stmt);
4672 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
4673 gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
4676 /* 3. Check the operands of the operation. The first operands are defined
4677 inside the loop body. The last operand is the reduction variable,
4678 which is defined by the loop-header-phi. */
4680 gcc_assert (is_gimple_assign (stmt));
4682 /* Flatten RHS. */
4683 switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
4685 case GIMPLE_SINGLE_RHS:
4686 op_type = TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt));
4687 if (op_type == ternary_op)
4689 tree rhs = gimple_assign_rhs1 (stmt);
4690 ops[0] = TREE_OPERAND (rhs, 0);
4691 ops[1] = TREE_OPERAND (rhs, 1);
4692 ops[2] = TREE_OPERAND (rhs, 2);
4693 code = TREE_CODE (rhs);
4695 else
4696 return false;
4697 break;
4699 case GIMPLE_BINARY_RHS:
4700 code = gimple_assign_rhs_code (stmt);
4701 op_type = TREE_CODE_LENGTH (code);
4702 gcc_assert (op_type == binary_op);
4703 ops[0] = gimple_assign_rhs1 (stmt);
4704 ops[1] = gimple_assign_rhs2 (stmt);
4705 break;
4707 case GIMPLE_TERNARY_RHS:
4708 code = gimple_assign_rhs_code (stmt);
4709 op_type = TREE_CODE_LENGTH (code);
4710 gcc_assert (op_type == ternary_op);
4711 ops[0] = gimple_assign_rhs1 (stmt);
4712 ops[1] = gimple_assign_rhs2 (stmt);
4713 ops[2] = gimple_assign_rhs3 (stmt);
4714 break;
4716 case GIMPLE_UNARY_RHS:
4717 return false;
4719 default:
4720 gcc_unreachable ();
4723 if (code == COND_EXPR && slp_node)
4724 return false;
4726 scalar_dest = gimple_assign_lhs (stmt);
4727 scalar_type = TREE_TYPE (scalar_dest);
4728 if (!POINTER_TYPE_P (scalar_type) && !INTEGRAL_TYPE_P (scalar_type)
4729 && !SCALAR_FLOAT_TYPE_P (scalar_type))
4730 return false;
4732 /* Do not try to vectorize bit-precision reductions. */
4733 if ((TYPE_PRECISION (scalar_type)
4734 != GET_MODE_PRECISION (TYPE_MODE (scalar_type))))
4735 return false;
4737 /* All uses but the last are expected to be defined in the loop.
4738 The last use is the reduction variable. In case of nested cycle this
4739 assumption is not true: we use reduc_index to record the index of the
4740 reduction variable. */
4741 for (i = 0; i < op_type - 1; i++)
4743 /* The condition of COND_EXPR is checked in vectorizable_condition(). */
4744 if (i == 0 && code == COND_EXPR)
4745 continue;
4747 is_simple_use = vect_is_simple_use_1 (ops[i], stmt, loop_vinfo, NULL,
4748 &def_stmt, &def, &dt, &tem);
4749 if (!vectype_in)
4750 vectype_in = tem;
4751 gcc_assert (is_simple_use);
4753 if (dt != vect_internal_def
4754 && dt != vect_external_def
4755 && dt != vect_constant_def
4756 && dt != vect_induction_def
4757 && !(dt == vect_nested_cycle && nested_cycle))
4758 return false;
4760 if (dt == vect_nested_cycle)
4762 found_nested_cycle_def = true;
4763 reduc_def_stmt = def_stmt;
4764 reduc_index = i;
4768 is_simple_use = vect_is_simple_use_1 (ops[i], stmt, loop_vinfo, NULL,
4769 &def_stmt, &def, &dt, &tem);
4770 if (!vectype_in)
4771 vectype_in = tem;
4772 gcc_assert (is_simple_use);
4773 if (!(dt == vect_reduction_def
4774 || dt == vect_nested_cycle
4775 || ((dt == vect_internal_def || dt == vect_external_def
4776 || dt == vect_constant_def || dt == vect_induction_def)
4777 && nested_cycle && found_nested_cycle_def)))
4779 /* For pattern recognized stmts, orig_stmt might be a reduction,
4780 but some helper statements for the pattern might not, or
4781 might be COND_EXPRs with reduction uses in the condition. */
4782 gcc_assert (orig_stmt);
4783 return false;
4785 if (!found_nested_cycle_def)
4786 reduc_def_stmt = def_stmt;
4788 gcc_assert (gimple_code (reduc_def_stmt) == GIMPLE_PHI);
4789 if (orig_stmt)
4790 gcc_assert (orig_stmt == vect_is_simple_reduction (loop_vinfo,
4791 reduc_def_stmt,
4792 !nested_cycle,
4793 &dummy));
4794 else
4796 gimple tmp = vect_is_simple_reduction (loop_vinfo, reduc_def_stmt,
4797 !nested_cycle, &dummy);
4798 /* We changed STMT to be the first stmt in reduction chain, hence we
4799 check that in this case the first element in the chain is STMT. */
4800 gcc_assert (stmt == tmp
4801 || GROUP_FIRST_ELEMENT (vinfo_for_stmt (tmp)) == stmt);
4804 if (STMT_VINFO_LIVE_P (vinfo_for_stmt (reduc_def_stmt)))
4805 return false;
4807 if (slp_node || PURE_SLP_STMT (stmt_info))
4808 ncopies = 1;
4809 else
4810 ncopies = (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
4811 / TYPE_VECTOR_SUBPARTS (vectype_in));
4813 gcc_assert (ncopies >= 1);
4815 vec_mode = TYPE_MODE (vectype_in);
4817 if (code == COND_EXPR)
4819 if (!vectorizable_condition (stmt, gsi, NULL, ops[reduc_index], 0, NULL))
4821 if (dump_enabled_p ())
4822 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4823 "unsupported condition in reduction");
4825 return false;
4828 else
4830 /* 4. Supportable by target? */
4832 if (code == LSHIFT_EXPR || code == RSHIFT_EXPR
4833 || code == LROTATE_EXPR || code == RROTATE_EXPR)
4835 /* Shifts and rotates are only supported by vectorizable_shifts,
4836 not vectorizable_reduction. */
4837 if (dump_enabled_p ())
4838 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4839 "unsupported shift or rotation.");
4840 return false;
4843 /* 4.1. check support for the operation in the loop */
4844 optab = optab_for_tree_code (code, vectype_in, optab_default);
4845 if (!optab)
4847 if (dump_enabled_p ())
4848 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4849 "no optab.");
4851 return false;
4854 if (optab_handler (optab, vec_mode) == CODE_FOR_nothing)
4856 if (dump_enabled_p ())
4857 dump_printf (MSG_NOTE, "op not supported by target.");
4859 if (GET_MODE_SIZE (vec_mode) != UNITS_PER_WORD
4860 || LOOP_VINFO_VECT_FACTOR (loop_vinfo)
4861 < vect_min_worthwhile_factor (code))
4862 return false;
4864 if (dump_enabled_p ())
4865 dump_printf (MSG_NOTE, "proceeding using word mode.");
4868 /* Worthwhile without SIMD support? */
4869 if (!VECTOR_MODE_P (TYPE_MODE (vectype_in))
4870 && LOOP_VINFO_VECT_FACTOR (loop_vinfo)
4871 < vect_min_worthwhile_factor (code))
4873 if (dump_enabled_p ())
4874 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4875 "not worthwhile without SIMD support.");
4877 return false;
4881 /* 4.2. Check support for the epilog operation.
4883 If STMT represents a reduction pattern, then the type of the
4884 reduction variable may be different than the type of the rest
4885 of the arguments. For example, consider the case of accumulation
4886 of shorts into an int accumulator; the original code:
4887 S1: int_a = (int) short_a;
4888 orig_stmt-> S2: int_acc = plus <int_a ,int_acc>;
4890 was replaced with:
4891 STMT: int_acc = widen_sum <short_a, int_acc>
4893 This means that:
4894 1. The tree-code that is used to create the vector operation in the
4895 epilog code (that reduces the partial results) is not the
4896 tree-code of STMT, but is rather the tree-code of the original
4897 stmt from the pattern that STMT is replacing. I.e, in the example
4898 above we want to use 'widen_sum' in the loop, but 'plus' in the
4899 epilog.
4900 2. The type (mode) we use to check available target support
4901 for the vector operation to be created in the *epilog*, is
4902 determined by the type of the reduction variable (in the example
4903 above we'd check this: optab_handler (plus_optab, vect_int_mode])).
4904 However the type (mode) we use to check available target support
4905 for the vector operation to be created *inside the loop*, is
4906 determined by the type of the other arguments to STMT (in the
4907 example we'd check this: optab_handler (widen_sum_optab,
4908 vect_short_mode)).
4910 This is contrary to "regular" reductions, in which the types of all
4911 the arguments are the same as the type of the reduction variable.
4912 For "regular" reductions we can therefore use the same vector type
4913 (and also the same tree-code) when generating the epilog code and
4914 when generating the code inside the loop. */
4916 if (orig_stmt)
4918 /* This is a reduction pattern: get the vectype from the type of the
4919 reduction variable, and get the tree-code from orig_stmt. */
4920 orig_code = gimple_assign_rhs_code (orig_stmt);
4921 gcc_assert (vectype_out);
4922 vec_mode = TYPE_MODE (vectype_out);
4924 else
4926 /* Regular reduction: the same vectype and tree-code that are used for
4927 the vector code inside the loop can also be used for the epilog code. */
4928 orig_code = code;
4931 if (nested_cycle)
4933 def_bb = gimple_bb (reduc_def_stmt);
4934 def_stmt_loop = def_bb->loop_father;
4935 def_arg = PHI_ARG_DEF_FROM_EDGE (reduc_def_stmt,
4936 loop_preheader_edge (def_stmt_loop));
4937 if (TREE_CODE (def_arg) == SSA_NAME
4938 && (def_arg_stmt = SSA_NAME_DEF_STMT (def_arg))
4939 && gimple_code (def_arg_stmt) == GIMPLE_PHI
4940 && flow_bb_inside_loop_p (outer_loop, gimple_bb (def_arg_stmt))
4941 && vinfo_for_stmt (def_arg_stmt)
4942 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_arg_stmt))
4943 == vect_double_reduction_def)
4944 double_reduc = true;
4947 epilog_reduc_code = ERROR_MARK;
4948 if (reduction_code_for_scalar_code (orig_code, &epilog_reduc_code))
4950 reduc_optab = optab_for_tree_code (epilog_reduc_code, vectype_out,
4951 optab_default);
4952 if (!reduc_optab)
4954 if (dump_enabled_p ())
4955 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4956 "no optab for reduction.");
4958 epilog_reduc_code = ERROR_MARK;
4961 if (reduc_optab
4962 && optab_handler (reduc_optab, vec_mode) == CODE_FOR_nothing)
4964 if (dump_enabled_p ())
4965 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4966 "reduc op not supported by target.");
4968 epilog_reduc_code = ERROR_MARK;
4971 else
4973 if (!nested_cycle || double_reduc)
4975 if (dump_enabled_p ())
4976 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4977 "no reduc code for scalar code.");
4979 return false;
4983 if (double_reduc && ncopies > 1)
4985 if (dump_enabled_p ())
4986 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4987 "multiple types in double reduction");
4989 return false;
4992 /* In case of widening multiplication by a constant, we update the type
4993 of the constant to be the type of the other operand. We check that the
4994 constant fits the type in the pattern recognition pass. */
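/* E.g. (illustrative; the pattern itself is formed in vect_pattern_recog):
   for

     short a[N]; int sum = 0;
     for (i = 0; i < N; i++)
       sum += a[i] * 3;

   the DOT_PROD_EXPR operands are a[i] (short) and the constant 3; the
   constant is converted here to the type of a[i] so that both
   multiplication operands share a type.  */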
4995 if (code == DOT_PROD_EXPR
4996 && !types_compatible_p (TREE_TYPE (ops[0]), TREE_TYPE (ops[1])))
4998 if (TREE_CODE (ops[0]) == INTEGER_CST)
4999 ops[0] = fold_convert (TREE_TYPE (ops[1]), ops[0]);
5000 else if (TREE_CODE (ops[1]) == INTEGER_CST)
5001 ops[1] = fold_convert (TREE_TYPE (ops[0]), ops[1]);
5002 else
5004 if (dump_enabled_p ())
5005 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5006 "invalid types in dot-prod");
5008 return false;
5012 if (!vec_stmt) /* transformation not required. */
5014 if (!vect_model_reduction_cost (stmt_info, epilog_reduc_code, ncopies))
5015 return false;
5016 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
5017 return true;
5020 /** Transform. **/
5022 if (dump_enabled_p ())
5023 dump_printf_loc (MSG_NOTE, vect_location, "transform reduction.");
5025 /* FORNOW: Multiple types are not supported for condition. */
5026 if (code == COND_EXPR)
5027 gcc_assert (ncopies == 1);
5029 /* Create the destination vector */
5030 vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
5032 /* In case the vectorization factor (VF) is bigger than the number
5033 of elements that we can fit in a vectype (nunits), we have to generate
5034 more than one vector stmt - i.e - we need to "unroll" the
5035 vector stmt by a factor VF/nunits. For more details see documentation
5036 in vectorizable_operation. */
5038 /* If the reduction is used in an outer loop we need to generate
5039 VF intermediate results, like so (e.g. for ncopies=2):
5040 r0 = phi (init, r0)
5041 r1 = phi (init, r1)
5042 r0 = x0 + r0;
5043 r1 = x1 + r1;
5044 (i.e. we generate VF results in 2 registers).
5045 In this case we have a separate def-use cycle for each copy, and therefore
5046 for each copy we get the vector def for the reduction variable from the
5047 respective phi node created for this copy.
5049 Otherwise (the reduction is unused in the loop nest), we can combine
5050 together intermediate results, like so (e.g. for ncopies=2):
5051 r = phi (init, r)
5052 r = x0 + r;
5053 r = x1 + r;
5054 (i.e. we generate VF/2 results in a single register).
5055 In this case for each copy we get the vector def for the reduction variable
5056 from the vectorized reduction operation generated in the previous iteration.
5059 if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope)
5061 single_defuse_cycle = true;
5062 epilog_copies = 1;
5064 else
5065 epilog_copies = ncopies;
5067 prev_stmt_info = NULL;
5068 prev_phi_info = NULL;
5069 if (slp_node)
5071 vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
5072 gcc_assert (TYPE_VECTOR_SUBPARTS (vectype_out)
5073 == TYPE_VECTOR_SUBPARTS (vectype_in));
5075 else
5077 vec_num = 1;
5078 vec_oprnds0.create (1);
5079 if (op_type == ternary_op)
5080 vec_oprnds1.create (1);
5083 phis.create (vec_num);
5084 vect_defs.create (vec_num);
5085 if (!slp_node)
5086 vect_defs.quick_push (NULL_TREE);
5088 for (j = 0; j < ncopies; j++)
5090 if (j == 0 || !single_defuse_cycle)
5092 for (i = 0; i < vec_num; i++)
5094 /* Create the reduction-phi that defines the reduction
5095 operand. */
5096 new_phi = create_phi_node (vec_dest, loop->header);
5097 set_vinfo_for_stmt (new_phi,
5098 new_stmt_vec_info (new_phi, loop_vinfo,
5099 NULL));
5100 if (j == 0 || slp_node)
5101 phis.quick_push (new_phi);
5105 if (code == COND_EXPR)
5107 gcc_assert (!slp_node);
5108 vectorizable_condition (stmt, gsi, vec_stmt,
5109 PHI_RESULT (phis[0]),
5110 reduc_index, NULL);
5111 /* Multiple types are not supported for condition. */
5112 break;
5115 /* Handle uses. */
5116 if (j == 0)
5118 op0 = ops[!reduc_index];
5119 if (op_type == ternary_op)
5121 if (reduc_index == 0)
5122 op1 = ops[2];
5123 else
5124 op1 = ops[1];
5127 if (slp_node)
5128 vect_get_vec_defs (op0, op1, stmt, &vec_oprnds0, &vec_oprnds1,
5129 slp_node, -1);
5130 else
5132 loop_vec_def0 = vect_get_vec_def_for_operand (ops[!reduc_index],
5133 stmt, NULL);
5134 vec_oprnds0.quick_push (loop_vec_def0);
5135 if (op_type == ternary_op)
5137 loop_vec_def1 = vect_get_vec_def_for_operand (op1, stmt,
5138 NULL);
5139 vec_oprnds1.quick_push (loop_vec_def1);
5143 else
5145 if (!slp_node)
5147 enum vect_def_type dt;
5148 gimple dummy_stmt;
5149 tree dummy;
5151 vect_is_simple_use (ops[!reduc_index], stmt, loop_vinfo, NULL,
5152 &dummy_stmt, &dummy, &dt);
5153 loop_vec_def0 = vect_get_vec_def_for_stmt_copy (dt,
5154 loop_vec_def0);
5155 vec_oprnds0[0] = loop_vec_def0;
5156 if (op_type == ternary_op)
5158 vect_is_simple_use (op1, stmt, loop_vinfo, NULL, &dummy_stmt,
5159 &dummy, &dt);
5160 loop_vec_def1 = vect_get_vec_def_for_stmt_copy (dt,
5161 loop_vec_def1);
5162 vec_oprnds1[0] = loop_vec_def1;
5166 if (single_defuse_cycle)
5167 reduc_def = gimple_assign_lhs (new_stmt);
5169 STMT_VINFO_RELATED_STMT (prev_phi_info) = new_phi;
5172 FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
5174 if (slp_node)
5175 reduc_def = PHI_RESULT (phis[i]);
5176 else
5178 if (!single_defuse_cycle || j == 0)
5179 reduc_def = PHI_RESULT (new_phi);
5182 def1 = ((op_type == ternary_op)
5183 ? vec_oprnds1[i] : NULL);
5184 if (op_type == binary_op)
5186 if (reduc_index == 0)
5187 expr = build2 (code, vectype_out, reduc_def, def0);
5188 else
5189 expr = build2 (code, vectype_out, def0, reduc_def);
5191 else
5193 if (reduc_index == 0)
5194 expr = build3 (code, vectype_out, reduc_def, def0, def1);
5195 else
5197 if (reduc_index == 1)
5198 expr = build3 (code, vectype_out, def0, reduc_def, def1);
5199 else
5200 expr = build3 (code, vectype_out, def0, def1, reduc_def);
5204 new_stmt = gimple_build_assign (vec_dest, expr);
5205 new_temp = make_ssa_name (vec_dest, new_stmt);
5206 gimple_assign_set_lhs (new_stmt, new_temp);
5207 vect_finish_stmt_generation (stmt, new_stmt, gsi);
5209 if (slp_node)
5211 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
5212 vect_defs.quick_push (new_temp);
5214 else
5215 vect_defs[0] = new_temp;
5218 if (slp_node)
5219 continue;
5221 if (j == 0)
5222 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
5223 else
5224 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
5226 prev_stmt_info = vinfo_for_stmt (new_stmt);
5227 prev_phi_info = vinfo_for_stmt (new_phi);
5230 /* Finalize the reduction-phi (set its arguments) and create the
5231 epilog reduction code. */
5232 if ((!single_defuse_cycle || code == COND_EXPR) && !slp_node)
5234 new_temp = gimple_assign_lhs (*vec_stmt);
5235 vect_defs[0] = new_temp;
5238 vect_create_epilog_for_reduction (vect_defs, stmt, epilog_copies,
5239 epilog_reduc_code, phis, reduc_index,
5240 double_reduc, slp_node);
5242 phis.release ();
5243 vect_defs.release ();
5244 vec_oprnds0.release ();
5245 vec_oprnds1.release ();
5247 return true;
5250 /* Function vect_min_worthwhile_factor.
5252 For a loop where we could vectorize the operation indicated by CODE,
5253 return the minimum vectorization factor that makes it worthwhile
5254 to use generic vectors. */
5256 vect_min_worthwhile_factor (enum tree_code code)
5258 switch (code)
5260 case PLUS_EXPR:
5261 case MINUS_EXPR:
5262 case NEGATE_EXPR:
5263 return 4;
5265 case BIT_AND_EXPR:
5266 case BIT_IOR_EXPR:
5267 case BIT_XOR_EXPR:
5268 case BIT_NOT_EXPR:
5269 return 2;
5271 default:
5272 return INT_MAX;
5277 /* Function vectorizable_induction
5279 Check if PHI performs an induction computation that can be vectorized.
5280 If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
5281 phi to replace it, put it in VEC_STMT, and add it to the same basic block.
5282 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
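/* For illustration (loop and 4-lane width are assumptions): a simple
   induction such as

     for (i = 0; i < n; i++)
       { a[i] = j; j += 2; }

   has j as the induction phi; its vectorized form is a vector phi whose
   initial value is { j0, j0+2, j0+4, j0+6 } and whose per-iteration step
   adds { 8, 8, 8, 8 } (scalar step times nunits).  */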
5284 bool
5285 vectorizable_induction (gimple phi, gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
5286 gimple *vec_stmt)
5288 stmt_vec_info stmt_info = vinfo_for_stmt (phi);
5289 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5290 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5291 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5292 int nunits = TYPE_VECTOR_SUBPARTS (vectype);
5293 int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
5294 tree vec_def;
5296 gcc_assert (ncopies >= 1);
5297 /* FORNOW. These restrictions should be relaxed. */
5298 if (nested_in_vect_loop_p (loop, phi))
5300 imm_use_iterator imm_iter;
5301 use_operand_p use_p;
5302 gimple exit_phi;
5303 edge latch_e;
5304 tree loop_arg;
5306 if (ncopies > 1)
5308 if (dump_enabled_p ())
5309 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5310 "multiple types in nested loop.");
5311 return false;
5314 exit_phi = NULL;
5315 latch_e = loop_latch_edge (loop->inner);
5316 loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
5317 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
5319 if (!flow_bb_inside_loop_p (loop->inner,
5320 gimple_bb (USE_STMT (use_p))))
5322 exit_phi = USE_STMT (use_p);
5323 break;
5326 if (exit_phi)
5328 stmt_vec_info exit_phi_vinfo = vinfo_for_stmt (exit_phi);
5329 if (!(STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
5330 && !STMT_VINFO_LIVE_P (exit_phi_vinfo)))
5332 if (dump_enabled_p ())
5333 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5334 "inner-loop induction only used outside "
5335 "of the outer vectorized loop.");
5336 return false;
5341 if (!STMT_VINFO_RELEVANT_P (stmt_info))
5342 return false;
5344 /* FORNOW: SLP not supported. */
5345 if (STMT_SLP_TYPE (stmt_info))
5346 return false;
5348 gcc_assert (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def);
5350 if (gimple_code (phi) != GIMPLE_PHI)
5351 return false;
5353 if (!vec_stmt) /* transformation not required. */
5355 STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
5356 if (dump_enabled_p ())
5357 dump_printf_loc (MSG_NOTE, vect_location,
5358 "=== vectorizable_induction ===");
5359 vect_model_induction_cost (stmt_info, ncopies);
5360 return true;
5363 /** Transform. **/
5365 if (dump_enabled_p ())
5366 dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.");
5368 vec_def = get_initial_def_for_induction (phi);
5369 *vec_stmt = SSA_NAME_DEF_STMT (vec_def);
5370 return true;
5373 /* Function vectorizable_live_operation.
5375 STMT computes a value that is used outside the loop. Check if
5376 it can be supported. */
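/* For example (illustrative), in

     for (i = 0; i < n; i++)
       { x = c + 1; a[i] = x; }
     use (x);

   x is "live": its last value is needed after the loop.  Per the FORNOW
   note below, this is currently supported only when all operands of the
   defining statement (here the invariant c) are loop-invariant or
   constant, so the scalar statement can simply remain in place.  */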
5378 bool
5379 vectorizable_live_operation (gimple stmt,
5380 gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
5381 gimple *vec_stmt ATTRIBUTE_UNUSED)
5383 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5384 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5385 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5386 int i;
5387 int op_type;
5388 tree op;
5389 tree def;
5390 gimple def_stmt;
5391 enum vect_def_type dt;
5392 enum tree_code code;
5393 enum gimple_rhs_class rhs_class;
5395 gcc_assert (STMT_VINFO_LIVE_P (stmt_info));
5397 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def)
5398 return false;
5400 if (!is_gimple_assign (stmt))
5401 return false;
5403 if (TREE_CODE (gimple_assign_lhs (stmt)) != SSA_NAME)
5404 return false;
5406 /* FORNOW. CHECKME. */
5407 if (nested_in_vect_loop_p (loop, stmt))
5408 return false;
5410 code = gimple_assign_rhs_code (stmt);
5411 op_type = TREE_CODE_LENGTH (code);
5412 rhs_class = get_gimple_rhs_class (code);
5413 gcc_assert (rhs_class != GIMPLE_UNARY_RHS || op_type == unary_op);
5414 gcc_assert (rhs_class != GIMPLE_BINARY_RHS || op_type == binary_op);
5416 /* FORNOW: support only if all uses are invariant. This means
5417 that the scalar operations can remain in place, unvectorized.
5418 The original last scalar value that they compute will be used. */
5420 for (i = 0; i < op_type; i++)
5422 if (rhs_class == GIMPLE_SINGLE_RHS)
5423 op = TREE_OPERAND (gimple_op (stmt, 1), i);
5424 else
5425 op = gimple_op (stmt, i + 1);
5426 if (op
5427 && !vect_is_simple_use (op, stmt, loop_vinfo, NULL, &def_stmt, &def,
5428 &dt))
5430 if (dump_enabled_p ())
5431 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5432 "use not simple.");
5433 return false;
5436 if (dt != vect_external_def && dt != vect_constant_def)
5437 return false;
5440 /* No transformation is required for the cases we currently support. */
5441 return true;
5444 /* Kill any debug uses outside LOOP of SSA names defined in STMT. */
5446 static void
5447 vect_loop_kill_debug_uses (struct loop *loop, gimple stmt)
5449 ssa_op_iter op_iter;
5450 imm_use_iterator imm_iter;
5451 def_operand_p def_p;
5452 gimple ustmt;
5454 FOR_EACH_PHI_OR_STMT_DEF (def_p, stmt, op_iter, SSA_OP_DEF)
5456 FOR_EACH_IMM_USE_STMT (ustmt, imm_iter, DEF_FROM_PTR (def_p))
5458 basic_block bb;
5460 if (!is_gimple_debug (ustmt))
5461 continue;
5463 bb = gimple_bb (ustmt);
5465 if (!flow_bb_inside_loop_p (loop, bb))
5467 if (gimple_debug_bind_p (ustmt))
5469 if (dump_enabled_p ())
5470 dump_printf_loc (MSG_NOTE, vect_location,
5471 "killing debug use");
5473 gimple_debug_bind_reset_value (ustmt);
5474 update_stmt (ustmt);
5476 else
5477 gcc_unreachable ();
5483 /* Function vect_transform_loop.
5485 The analysis phase has determined that the loop is vectorizable.
5486 Vectorize the loop - create vectorized stmts to replace the scalar
5487 stmts in the loop, and update the loop exit condition. */
5489 void
5490 vect_transform_loop (loop_vec_info loop_vinfo)
5492 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5493 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
5494 int nbbs = loop->num_nodes;
5495 gimple_stmt_iterator si;
5496 int i;
5497 tree ratio = NULL;
5498 int vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
5499 bool grouped_store;
5500 bool slp_scheduled = false;
5501 unsigned int nunits;
5502 gimple stmt, pattern_stmt;
5503 gimple_seq pattern_def_seq = NULL;
5504 gimple_stmt_iterator pattern_def_si = gsi_none ();
5505 bool transform_pattern_stmt = false;
5506 bool check_profitability = false;
5507 int th;
5508 /* Record the number of iterations before we start tampering with the profile. */
5509 gcov_type expected_iterations = expected_loop_iterations_unbounded (loop);
5511 if (dump_enabled_p ())
5512 dump_printf_loc (MSG_NOTE, vect_location, "=== vect_transform_loop ===");
5514 /* If the profile is imprecise, we have a chance to fix it up. */
5515 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
5516 expected_iterations = LOOP_VINFO_INT_NITERS (loop_vinfo);
5518 /* Use the more conservative vectorization threshold. If the number
5519 of iterations is constant, assume the cost check has been performed
5520 by our caller. If the threshold makes all loops that run at least
5521 the vectorization factor number of times profitable, checking
5522 is pointless, too. */
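/* Worked example with illustrative numbers only: with
   PARAM_MIN_VECT_LOOP_BOUND == 2 and a vectorization factor of 4 the
   threshold below is 2 * 4 - 1 = 7 (possibly raised further by the
   cost model minimum); since 7 >= 4 - 1, the runtime profitability
   check is requested here only when the iteration count is not a
   compile-time constant.  */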
5523 th = ((PARAM_VALUE (PARAM_MIN_VECT_LOOP_BOUND)
5524 * LOOP_VINFO_VECT_FACTOR (loop_vinfo)) - 1);
5525 th = MAX (th, LOOP_VINFO_COST_MODEL_MIN_ITERS (loop_vinfo));
5526 if (th >= LOOP_VINFO_VECT_FACTOR (loop_vinfo) - 1
5527 && !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
5529 if (dump_enabled_p ())
5530 dump_printf_loc (MSG_NOTE, vect_location,
5531 "Profitability threshold is %d loop iterations.", th);
5532 check_profitability = true;
5535 /* Version the loop first, if required, so the profitability check
5536 comes first. */
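/* Illustrative shape of the result (sketch only): versioning guards
   the loop to be vectorized with the runtime checks, roughly

     if (alias/alignment checks hold && niters >= th)
       ... loop that will be vectorized ...
     else
       ... unchanged scalar copy of the loop ...

   so a failed check falls back to the scalar version.  */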
5538 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
5539 || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
5541 vect_loop_versioning (loop_vinfo, th, check_profitability);
5542 check_profitability = false;
5545 /* Peel the loop if there are data refs with unknown alignment.
5546 Only one store data ref with unknown alignment is allowed. */
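/* Worked example with illustrative numbers only: for 4-byte elements,
   a 16-byte vector and an access known to start 8 bytes past an
   aligned boundary, the prologue peels (16 - 8) / 4 = 2 scalar
   iterations so that the remaining accesses start aligned.  */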
5548 if (LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo))
5550 vect_do_peeling_for_alignment (loop_vinfo, th, check_profitability);
5551 check_profitability = false;
5554 /* If the loop has a symbolic number of iterations 'n' (i.e. it's not a
5555 compile-time constant), or it is a constant that is not divisible by
5556 the vectorization factor, then an epilog loop needs to be created.
5557 We therefore duplicate the loop: the original loop will be vectorized,
5558 and will compute the first (n/VF) iterations. The second copy of the loop
5559 will remain scalar and will compute the remaining (n%VF) iterations.
5560 (VF is the vectorization factor). */
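/* Worked example with illustrative numbers only: for n == 103 and
   VF == 8, the vectorized loop executes 103 / 8 = 12 iterations
   (96 elements) and the scalar epilog loop the remaining
   103 % 8 = 7 iterations.  */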
5562 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
5563 || (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
5564 && LOOP_VINFO_INT_NITERS (loop_vinfo) % vectorization_factor != 0)
5565 || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
5566 vect_do_peeling_for_loop_bound (loop_vinfo, &ratio,
5567 th, check_profitability);
5568 else
5569 ratio = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
5570 LOOP_VINFO_INT_NITERS (loop_vinfo) / vectorization_factor);
5572 /* 1) Make sure the loop header has exactly two entries
5573 2) Make sure we have a preheader basic block. */
5575 gcc_assert (EDGE_COUNT (loop->header->preds) == 2);
5577 split_edge (loop_preheader_edge (loop));
5579 /* FORNOW: the vectorizer supports only loops whose body consists
5580 of one basic block (header + empty latch). When the vectorizer
5581 supports more involved loop forms, the order in which the BBs are
5582 traversed will need to be reconsidered. */
5584 for (i = 0; i < nbbs; i++)
5586 basic_block bb = bbs[i];
5587 stmt_vec_info stmt_info;
5588 gimple phi;
5590 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
5592 phi = gsi_stmt (si);
5593 if (dump_enabled_p ())
5595 dump_printf_loc (MSG_NOTE, vect_location,
5596 "------>vectorizing phi: ");
5597 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
5599 stmt_info = vinfo_for_stmt (phi);
5600 if (!stmt_info)
5601 continue;
5603 if (MAY_HAVE_DEBUG_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
5604 vect_loop_kill_debug_uses (loop, phi);
5606 if (!STMT_VINFO_RELEVANT_P (stmt_info)
5607 && !STMT_VINFO_LIVE_P (stmt_info))
5608 continue;
5610 if ((TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info))
5611 != (unsigned HOST_WIDE_INT) vectorization_factor)
5612 && dump_enabled_p ())
5613 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.");
5615 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def)
5617 if (dump_enabled_p ())
5618 dump_printf_loc (MSG_NOTE, vect_location, "transform phi.");
5619 vect_transform_stmt (phi, NULL, NULL, NULL, NULL);
5623 pattern_stmt = NULL;
5624 for (si = gsi_start_bb (bb); !gsi_end_p (si) || transform_pattern_stmt;)
5626 bool is_store;
5628 if (transform_pattern_stmt)
5629 stmt = pattern_stmt;
5630 else
5632 stmt = gsi_stmt (si);
5633 /* During vectorization remove existing clobber stmts. */
5634 if (gimple_clobber_p (stmt))
5636 unlink_stmt_vdef (stmt);
5637 gsi_remove (&si, true);
5638 release_defs (stmt);
5639 continue;
5643 if (dump_enabled_p ())
5645 dump_printf_loc (MSG_NOTE, vect_location,
5646 "------>vectorizing statement: ");
5647 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
5650 stmt_info = vinfo_for_stmt (stmt);
5652 /* vector stmts created in the outer-loop during vectorization of
5653 stmts in an inner-loop may not have a stmt_info, and do not
5654 need to be vectorized. */
5655 if (!stmt_info)
5657 gsi_next (&si);
5658 continue;
5661 if (MAY_HAVE_DEBUG_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
5662 vect_loop_kill_debug_uses (loop, stmt);
5664 if (!STMT_VINFO_RELEVANT_P (stmt_info)
5665 && !STMT_VINFO_LIVE_P (stmt_info))
5667 if (STMT_VINFO_IN_PATTERN_P (stmt_info)
5668 && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
5669 && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
5670 || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
5672 stmt = pattern_stmt;
5673 stmt_info = vinfo_for_stmt (stmt);
5675 else
5677 gsi_next (&si);
5678 continue;
5681 else if (STMT_VINFO_IN_PATTERN_P (stmt_info)
5682 && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
5683 && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
5684 || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
5685 transform_pattern_stmt = true;
5687 /* If pattern statement has def stmts, vectorize them too. */
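/* Illustrative sketch, not from the original sources: a recognized
   pattern such as a widening multiplication may carry auxiliary defs
   in STMT_VINFO_PATTERN_DEF_SEQ, e.g.

     a_T = (TYPE) a_t;          <-- pattern def stmt
     prod_T = a_T w* b_T;       <-- main pattern stmt

   those defs are walked and vectorized below before the pattern stmt
   itself.  */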
5688 if (is_pattern_stmt_p (stmt_info))
5690 if (pattern_def_seq == NULL)
5692 pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
5693 pattern_def_si = gsi_start (pattern_def_seq);
5695 else if (!gsi_end_p (pattern_def_si))
5696 gsi_next (&pattern_def_si);
5697 if (pattern_def_seq != NULL)
5699 gimple pattern_def_stmt = NULL;
5700 stmt_vec_info pattern_def_stmt_info = NULL;
5702 while (!gsi_end_p (pattern_def_si))
5704 pattern_def_stmt = gsi_stmt (pattern_def_si);
5705 pattern_def_stmt_info
5706 = vinfo_for_stmt (pattern_def_stmt);
5707 if (STMT_VINFO_RELEVANT_P (pattern_def_stmt_info)
5708 || STMT_VINFO_LIVE_P (pattern_def_stmt_info))
5709 break;
5710 gsi_next (&pattern_def_si);
5713 if (!gsi_end_p (pattern_def_si))
5715 if (dump_enabled_p ())
5717 dump_printf_loc (MSG_NOTE, vect_location,
5718 "==> vectorizing pattern def "
5719 "stmt: ");
5720 dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
5721 pattern_def_stmt, 0);
5724 stmt = pattern_def_stmt;
5725 stmt_info = pattern_def_stmt_info;
5727 else
5729 pattern_def_si = gsi_none ();
5730 transform_pattern_stmt = false;
5733 else
5734 transform_pattern_stmt = false;
5737 gcc_assert (STMT_VINFO_VECTYPE (stmt_info));
5738 nunits = (unsigned int) TYPE_VECTOR_SUBPARTS (
5739 STMT_VINFO_VECTYPE (stmt_info));
5740 if (!STMT_SLP_TYPE (stmt_info)
5741 && nunits != (unsigned int) vectorization_factor
5742 && dump_enabled_p ())
5743 /* For SLP, VF is set according to the unrolling factor, not to the
5744 vector size, hence this diagnostic is not valid for SLP. */
5745 dump_printf_loc (MSG_NOTE, vect_location,
5746 "multiple-types.");
5748 /* SLP. Schedule all the SLP instances when the first SLP stmt is
5749 reached. */
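/* Illustrative note (sketch only): all SLP instances are scheduled in
   one step the first time an SLP stmt is seen, e.g. a grouped store

     a[4*i+0] = ...; a[4*i+1] = ...; a[4*i+2] = ...; a[4*i+3] = ...;

   is emitted as a whole; hybrid stmts, which also feed non-SLP uses,
   are additionally vectorized by the loop-based code below.  */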
5750 if (STMT_SLP_TYPE (stmt_info))
5752 if (!slp_scheduled)
5754 slp_scheduled = true;
5756 if (dump_enabled_p ())
5757 dump_printf_loc (MSG_NOTE, vect_location,
5758 "=== scheduling SLP instances ===");
5760 vect_schedule_slp (loop_vinfo, NULL);
5763 /* Hybrid SLP stmts must be vectorized in addition to SLP. */
5764 if (!vinfo_for_stmt (stmt) || PURE_SLP_STMT (stmt_info))
5766 if (!transform_pattern_stmt && gsi_end_p (pattern_def_si))
5768 pattern_def_seq = NULL;
5769 gsi_next (&si);
5771 continue;
5775 /* -------- vectorize statement ------------ */
5776 if (dump_enabled_p ())
5777 dump_printf_loc (MSG_NOTE, vect_location, "transform statement.");
5779 grouped_store = false;
5780 is_store = vect_transform_stmt (stmt, &si, &grouped_store, NULL, NULL);
5781 if (is_store)
5783 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
5785 /* Interleaving. If IS_STORE is TRUE, the vectorization of the
5786 interleaving chain was completed - free all the stores in
5787 the chain. */
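/* Illustrative sketch, not from the original sources: for a group of
   interleaved stores such as

     a[2*i] = x;        <-- GROUP_FIRST_ELEMENT
     a[2*i+1] = y;

   the whole chain is emitted as vector code in one go, so once
   IS_STORE is true every scalar store of the group can be removed.  */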
5788 gsi_next (&si);
5789 vect_remove_stores (GROUP_FIRST_ELEMENT (stmt_info));
5790 continue;
5792 else
5794 /* Free the attached stmt_vec_info and remove the stmt. */
5795 gimple store = gsi_stmt (si);
5796 free_stmt_vec_info (store);
5797 unlink_stmt_vdef (store);
5798 gsi_remove (&si, true);
5799 release_defs (store);
5800 continue;
5804 if (!transform_pattern_stmt && gsi_end_p (pattern_def_si))
5806 pattern_def_seq = NULL;
5807 gsi_next (&si);
5809 } /* stmts in BB */
5810 } /* BBs in loop */
5812 slpeel_make_loop_iterate_ntimes (loop, ratio);
5814 /* Reduce loop iterations by the vectorization factor. */
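/* Worked example with illustrative numbers only: if the scalar loop
   was expected to run 1000 iterations and VF == 4, the vector loop's
   profile and upper bound are scaled to roughly 1000 / 4 = 250; with
   PEELING_FOR_GAPS one more iteration is left to the scalar epilog,
   so the bound drops to 249.  */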
5815 scale_loop_profile (loop, GCOV_COMPUTE_SCALE (1, vectorization_factor),
5816 expected_iterations / vectorization_factor);
5817 loop->nb_iterations_upper_bound
5818 = loop->nb_iterations_upper_bound.udiv (double_int::from_uhwi (vectorization_factor),
5819 FLOOR_DIV_EXPR);
5820 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
5821 && loop->nb_iterations_upper_bound != double_int_zero)
5822 loop->nb_iterations_upper_bound = loop->nb_iterations_upper_bound - double_int_one;
5823 if (loop->any_estimate)
5825 loop->nb_iterations_estimate
5826 = loop->nb_iterations_estimate.udiv (double_int::from_uhwi (vectorization_factor),
5827 FLOOR_DIV_EXPR);
5828 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
5829 && loop->nb_iterations_estimate != double_int_zero)
5830 loop->nb_iterations_estimate = loop->nb_iterations_estimate - double_int_one;
5833 if (dump_enabled_p ())
5835 dump_printf_loc (MSG_NOTE, vect_location,
5836 "LOOP VECTORIZED\n");
5837 if (loop->inner)
5838 dump_printf_loc (MSG_NOTE, vect_location,
5839 "OUTER LOOP VECTORIZED\n");