PR tree-ssa/57385
[official-gcc.git] / gcc / tree-vect-loop.c
blob058e4a4ec547e8b3b1ff2504add1f7bfef51c397
1 /* Loop Vectorization
2 Copyright (C) 2003-2013 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com> and
4 Ira Rosen <irar@il.ibm.com>
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "dumpfile.h"
26 #include "tm.h"
27 #include "ggc.h"
28 #include "tree.h"
29 #include "basic-block.h"
30 #include "gimple-pretty-print.h"
31 #include "tree-flow.h"
32 #include "tree-pass.h"
33 #include "cfgloop.h"
34 #include "expr.h"
35 #include "recog.h"
36 #include "optabs.h"
37 #include "params.h"
38 #include "diagnostic-core.h"
39 #include "tree-chrec.h"
40 #include "tree-scalar-evolution.h"
41 #include "tree-vectorizer.h"
42 #include "target.h"
44 /* Loop Vectorization Pass.
46 This pass tries to vectorize loops.
48 For example, the vectorizer transforms the following simple loop:
50 short a[N]; short b[N]; short c[N]; int i;
52 for (i=0; i<N; i++){
53 a[i] = b[i] + c[i];
56 as if it was manually vectorized by rewriting the source code into:
58 typedef int __attribute__((mode(V8HI))) v8hi;
59 short a[N]; short b[N]; short c[N]; int i;
60 v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
61 v8hi va, vb, vc;
63 for (i=0; i<N/8; i++){
64 vb = pb[i];
65 vc = pc[i];
66 va = vb + vc;
67 pa[i] = va;
70 The main entry to this pass is vectorize_loops(), in which
71 the vectorizer applies a set of analyses on a given set of loops,
72 followed by the actual vectorization transformation for the loops that
73 had successfully passed the analysis phase.
74 Throughout this pass we make a distinction between two types of
75 data: scalars (which are represented by SSA_NAMES), and memory references
76 ("data-refs"). These two types of data require different handling both
77 during analysis and transformation. The types of data-refs that the
78 vectorizer currently supports are ARRAY_REFS which base is an array DECL
79 (not a pointer), and INDIRECT_REFS through pointers; both array and pointer
80 accesses are required to have a simple (consecutive) access pattern.
82 Analysis phase:
83 ===============
84 The driver for the analysis phase is vect_analyze_loop().
85 It applies a set of analyses, some of which rely on the scalar evolution
86 analyzer (scev) developed by Sebastian Pop.
88 During the analysis phase the vectorizer records some information
89 per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
90 loop, as well as general information about the loop as a whole, which is
91 recorded in a "loop_vec_info" struct attached to each loop.
93 Transformation phase:
94 =====================
95 The loop transformation phase scans all the stmts in the loop, and
96 creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
97 the loop that needs to be vectorized. It inserts the vector code sequence
98 just before the scalar stmt S, and records a pointer to the vector code
99 in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
100 attached to S). This pointer will be used for the vectorization of following
101 stmts which use the def of stmt S. Stmt S is removed if it writes to memory;
102 otherwise, we rely on dead code elimination for removing it.
104 For example, say stmt S1 was vectorized into stmt VS1:
106 VS1: vb = px[i];
107 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
108 S2: a = b;
110 To vectorize stmt S2, the vectorizer first finds the stmt that defines
111 the operand 'b' (S1), and gets the relevant vector def 'vb' from the
112 vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)). The
113 resulting sequence would be:
115 VS1: vb = px[i];
116 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
117 VS2: va = vb;
118 S2: a = b; STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2
120 Operands that are not SSA_NAMEs, are data-refs that appear in
121 load/store operations (like 'x[i]' in S1), and are handled differently.
123 Target modeling:
124 =================
125 Currently the only target specific information that is used is the
126 size of the vector (in bytes) - "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".
127 Targets that can support different sizes of vectors, for now will need
128 to specify one value for "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD". More
129 flexibility will be added in the future.
131 Since we only vectorize operations which vector form can be
132 expressed using existing tree codes, to verify that an operation is
133 supported, the vectorizer checks the relevant optab at the relevant
134 machine_mode (e.g, optab_handler (add_optab, V8HImode)). If
135 the value found is CODE_FOR_nothing, then there's no target support, and
136 we can't vectorize the stmt.
138 For additional information on this project see:
139 http://gcc.gnu.org/projects/tree-ssa/vectorization.html
142 static void vect_estimate_min_profitable_iters (loop_vec_info, int *, int *);
144 /* Function vect_determine_vectorization_factor
146 Determine the vectorization factor (VF). VF is the number of data elements
147 that are operated upon in parallel in a single iteration of the vectorized
148 loop. For example, when vectorizing a loop that operates on 4byte elements,
149 on a target with vector size (VS) 16byte, the VF is set to 4, since 4
150 elements can fit in a single vector register.
152 We currently support vectorization of loops in which all types operated upon
153 are of the same size. Therefore this function currently sets VF according to
154 the size of the types operated upon, and fails if there are multiple sizes
155 in the loop.
157 VF is also the factor by which the loop iterations are strip-mined, e.g.:
158 original loop:
159 for (i=0; i<N; i++){
160 a[i] = b[i] + c[i];
163 vectorized loop:
164 for (i=0; i<N; i+=VF){
165 a[i:VF] = b[i:VF] + c[i:VF];
169 static bool
170 vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
172 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
173 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
174 int nbbs = loop->num_nodes;
175 gimple_stmt_iterator si;
176 unsigned int vectorization_factor = 0;
177 tree scalar_type;
178 gimple phi;
179 tree vectype;
180 unsigned int nunits;
181 stmt_vec_info stmt_info;
182 int i;
183 HOST_WIDE_INT dummy;
184 gimple stmt, pattern_stmt = NULL;
185 gimple_seq pattern_def_seq = NULL;
186 gimple_stmt_iterator pattern_def_si = gsi_none ();
187 bool analyze_pattern_stmt = false;
189 if (dump_enabled_p ())
190 dump_printf_loc (MSG_NOTE, vect_location,
191 "=== vect_determine_vectorization_factor ===");
193 for (i = 0; i < nbbs; i++)
195 basic_block bb = bbs[i];
197 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
199 phi = gsi_stmt (si);
200 stmt_info = vinfo_for_stmt (phi);
201 if (dump_enabled_p ())
203 dump_printf_loc (MSG_NOTE, vect_location, "==> examining phi: ");
204 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
207 gcc_assert (stmt_info);
209 if (STMT_VINFO_RELEVANT_P (stmt_info))
211 gcc_assert (!STMT_VINFO_VECTYPE (stmt_info));
212 scalar_type = TREE_TYPE (PHI_RESULT (phi));
214 if (dump_enabled_p ())
216 dump_printf_loc (MSG_NOTE, vect_location,
217 "get vectype for scalar type: ");
218 dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
221 vectype = get_vectype_for_scalar_type (scalar_type);
222 if (!vectype)
224 if (dump_enabled_p ())
226 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
227 "not vectorized: unsupported "
228 "data-type ");
229 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
230 scalar_type);
232 return false;
234 STMT_VINFO_VECTYPE (stmt_info) = vectype;
236 if (dump_enabled_p ())
238 dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
239 dump_generic_expr (MSG_NOTE, TDF_SLIM, vectype);
242 nunits = TYPE_VECTOR_SUBPARTS (vectype);
243 if (dump_enabled_p ())
244 dump_printf_loc (MSG_NOTE, vect_location, "nunits = %d", nunits);
246 if (!vectorization_factor
247 || (nunits > vectorization_factor))
248 vectorization_factor = nunits;
252 for (si = gsi_start_bb (bb); !gsi_end_p (si) || analyze_pattern_stmt;)
254 tree vf_vectype;
256 if (analyze_pattern_stmt)
257 stmt = pattern_stmt;
258 else
259 stmt = gsi_stmt (si);
261 stmt_info = vinfo_for_stmt (stmt);
263 if (dump_enabled_p ())
265 dump_printf_loc (MSG_NOTE, vect_location,
266 "==> examining statement: ");
267 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
270 gcc_assert (stmt_info);
272 /* Skip stmts which do not need to be vectorized. */
273 if (!STMT_VINFO_RELEVANT_P (stmt_info)
274 && !STMT_VINFO_LIVE_P (stmt_info))
276 if (STMT_VINFO_IN_PATTERN_P (stmt_info)
277 && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
278 && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
279 || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
281 stmt = pattern_stmt;
282 stmt_info = vinfo_for_stmt (pattern_stmt);
283 if (dump_enabled_p ())
285 dump_printf_loc (MSG_NOTE, vect_location,
286 "==> examining pattern statement: ");
287 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
290 else
292 if (dump_enabled_p ())
293 dump_printf_loc (MSG_NOTE, vect_location, "skip.");
294 gsi_next (&si);
295 continue;
298 else if (STMT_VINFO_IN_PATTERN_P (stmt_info)
299 && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
300 && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
301 || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
302 analyze_pattern_stmt = true;
304 /* If a pattern statement has def stmts, analyze them too. */
305 if (is_pattern_stmt_p (stmt_info))
307 if (pattern_def_seq == NULL)
309 pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
310 pattern_def_si = gsi_start (pattern_def_seq);
312 else if (!gsi_end_p (pattern_def_si))
313 gsi_next (&pattern_def_si);
314 if (pattern_def_seq != NULL)
316 gimple pattern_def_stmt = NULL;
317 stmt_vec_info pattern_def_stmt_info = NULL;
319 while (!gsi_end_p (pattern_def_si))
321 pattern_def_stmt = gsi_stmt (pattern_def_si);
322 pattern_def_stmt_info
323 = vinfo_for_stmt (pattern_def_stmt);
324 if (STMT_VINFO_RELEVANT_P (pattern_def_stmt_info)
325 || STMT_VINFO_LIVE_P (pattern_def_stmt_info))
326 break;
327 gsi_next (&pattern_def_si);
330 if (!gsi_end_p (pattern_def_si))
332 if (dump_enabled_p ())
334 dump_printf_loc (MSG_NOTE, vect_location,
335 "==> examining pattern def stmt: ");
336 dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
337 pattern_def_stmt, 0);
340 stmt = pattern_def_stmt;
341 stmt_info = pattern_def_stmt_info;
343 else
345 pattern_def_si = gsi_none ();
346 analyze_pattern_stmt = false;
349 else
350 analyze_pattern_stmt = false;
353 if (gimple_get_lhs (stmt) == NULL_TREE)
355 if (dump_enabled_p ())
357 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
358 "not vectorized: irregular stmt.");
359 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt,
362 return false;
365 if (VECTOR_MODE_P (TYPE_MODE (gimple_expr_type (stmt))))
367 if (dump_enabled_p ())
369 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
370 "not vectorized: vector stmt in loop:");
371 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
373 return false;
376 if (STMT_VINFO_VECTYPE (stmt_info))
378 /* The only case when a vectype had been already set is for stmts
379 that contain a dataref, or for "pattern-stmts" (stmts
380 generated by the vectorizer to represent/replace a certain
381 idiom). */
382 gcc_assert (STMT_VINFO_DATA_REF (stmt_info)
383 || is_pattern_stmt_p (stmt_info)
384 || !gsi_end_p (pattern_def_si));
385 vectype = STMT_VINFO_VECTYPE (stmt_info);
387 else
389 gcc_assert (!STMT_VINFO_DATA_REF (stmt_info));
390 scalar_type = TREE_TYPE (gimple_get_lhs (stmt));
391 if (dump_enabled_p ())
393 dump_printf_loc (MSG_NOTE, vect_location,
394 "get vectype for scalar type: ");
395 dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
397 vectype = get_vectype_for_scalar_type (scalar_type);
398 if (!vectype)
400 if (dump_enabled_p ())
402 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
403 "not vectorized: unsupported "
404 "data-type ");
405 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
406 scalar_type);
408 return false;
411 STMT_VINFO_VECTYPE (stmt_info) = vectype;
413 if (dump_enabled_p ())
415 dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
416 dump_generic_expr (MSG_NOTE, TDF_SLIM, vectype);
420 /* The vectorization factor is according to the smallest
421 scalar type (or the largest vector size, but we only
422 support one vector size per loop). */
423 scalar_type = vect_get_smallest_scalar_type (stmt, &dummy,
424 &dummy);
425 if (dump_enabled_p ())
427 dump_printf_loc (MSG_NOTE, vect_location,
428 "get vectype for scalar type: ");
429 dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
431 vf_vectype = get_vectype_for_scalar_type (scalar_type);
432 if (!vf_vectype)
434 if (dump_enabled_p ())
436 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
437 "not vectorized: unsupported data-type ");
438 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
439 scalar_type);
441 return false;
444 if ((GET_MODE_SIZE (TYPE_MODE (vectype))
445 != GET_MODE_SIZE (TYPE_MODE (vf_vectype))))
447 if (dump_enabled_p ())
449 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
450 "not vectorized: different sized vector "
451 "types in statement, ");
452 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
453 vectype);
454 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
455 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
456 vf_vectype);
458 return false;
461 if (dump_enabled_p ())
463 dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
464 dump_generic_expr (MSG_NOTE, TDF_SLIM, vf_vectype);
467 nunits = TYPE_VECTOR_SUBPARTS (vf_vectype);
468 if (dump_enabled_p ())
469 dump_printf_loc (MSG_NOTE, vect_location, "nunits = %d", nunits);
470 if (!vectorization_factor
471 || (nunits > vectorization_factor))
472 vectorization_factor = nunits;
474 if (!analyze_pattern_stmt && gsi_end_p (pattern_def_si))
476 pattern_def_seq = NULL;
477 gsi_next (&si);
482 /* TODO: Analyze cost. Decide if worth while to vectorize. */
483 if (dump_enabled_p ())
484 dump_printf_loc (MSG_NOTE, vect_location, "vectorization factor = %d",
485 vectorization_factor);
486 if (vectorization_factor <= 1)
488 if (dump_enabled_p ())
489 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
490 "not vectorized: unsupported data-type");
491 return false;
493 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
495 return true;
499 /* Function vect_is_simple_iv_evolution.
501 FORNOW: A simple evolution of an induction variables in the loop is
502 considered a polynomial evolution with constant step. */
504 static bool
505 vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init,
506 tree * step)
508 tree init_expr;
509 tree step_expr;
510 tree evolution_part = evolution_part_in_loop_num (access_fn, loop_nb);
512 /* When there is no evolution in this loop, the evolution function
513 is not "simple". */
514 if (evolution_part == NULL_TREE)
515 return false;
517 /* When the evolution is a polynomial of degree >= 2
518 the evolution function is not "simple". */
519 if (tree_is_chrec (evolution_part))
520 return false;
522 step_expr = evolution_part;
523 init_expr = unshare_expr (initial_condition_in_loop_num (access_fn, loop_nb));
525 if (dump_enabled_p ())
527 dump_printf_loc (MSG_NOTE, vect_location, "step: ");
528 dump_generic_expr (MSG_NOTE, TDF_SLIM, step_expr);
529 dump_printf (MSG_NOTE, ", init: ");
530 dump_generic_expr (MSG_NOTE, TDF_SLIM, init_expr);
533 *init = init_expr;
534 *step = step_expr;
536 if (TREE_CODE (step_expr) != INTEGER_CST)
538 if (dump_enabled_p ())
539 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
540 "step unknown.");
541 return false;
544 return true;
547 /* Function vect_analyze_scalar_cycles_1.
549 Examine the cross iteration def-use cycles of scalar variables
550 in LOOP. LOOP_VINFO represents the loop that is now being
551 considered for vectorization (can be LOOP, or an outer-loop
552 enclosing LOOP). */
554 static void
555 vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, struct loop *loop)
557 basic_block bb = loop->header;
558 tree dumy;
559 vec<gimple> worklist;
560 worklist.create (64);
561 gimple_stmt_iterator gsi;
562 bool double_reduc;
564 if (dump_enabled_p ())
565 dump_printf_loc (MSG_NOTE, vect_location,
566 "=== vect_analyze_scalar_cycles ===");
568 /* First - identify all inductions. Reduction detection assumes that all the
569 inductions have been identified, therefore, this order must not be
570 changed. */
571 for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
573 gimple phi = gsi_stmt (gsi);
574 tree access_fn = NULL;
575 tree def = PHI_RESULT (phi);
576 stmt_vec_info stmt_vinfo = vinfo_for_stmt (phi);
578 if (dump_enabled_p ())
580 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: ");
581 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
584 /* Skip virtual phi's. The data dependences that are associated with
585 virtual defs/uses (i.e., memory accesses) are analyzed elsewhere. */
586 if (virtual_operand_p (def))
587 continue;
589 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_unknown_def_type;
591 /* Analyze the evolution function. */
592 access_fn = analyze_scalar_evolution (loop, def);
593 if (access_fn)
595 STRIP_NOPS (access_fn);
596 if (dump_enabled_p ())
598 dump_printf_loc (MSG_NOTE, vect_location,
599 "Access function of PHI: ");
600 dump_generic_expr (MSG_NOTE, TDF_SLIM, access_fn);
602 STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo)
603 = evolution_part_in_loop_num (access_fn, loop->num);
606 if (!access_fn
607 || !vect_is_simple_iv_evolution (loop->num, access_fn, &dumy, &dumy))
609 worklist.safe_push (phi);
610 continue;
613 gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo) != NULL_TREE);
615 if (dump_enabled_p ())
616 dump_printf_loc (MSG_NOTE, vect_location, "Detected induction.");
617 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_induction_def;
621 /* Second - identify all reductions and nested cycles. */
622 while (worklist.length () > 0)
624 gimple phi = worklist.pop ();
625 tree def = PHI_RESULT (phi);
626 stmt_vec_info stmt_vinfo = vinfo_for_stmt (phi);
627 gimple reduc_stmt;
628 bool nested_cycle;
630 if (dump_enabled_p ())
632 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: ");
633 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
636 gcc_assert (!virtual_operand_p (def)
637 && STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_unknown_def_type);
639 nested_cycle = (loop != LOOP_VINFO_LOOP (loop_vinfo));
640 reduc_stmt = vect_force_simple_reduction (loop_vinfo, phi, !nested_cycle,
641 &double_reduc);
642 if (reduc_stmt)
644 if (double_reduc)
646 if (dump_enabled_p ())
647 dump_printf_loc (MSG_NOTE, vect_location,
648 "Detected double reduction.");
650 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_double_reduction_def;
651 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
652 vect_double_reduction_def;
654 else
656 if (nested_cycle)
658 if (dump_enabled_p ())
659 dump_printf_loc (MSG_NOTE, vect_location,
660 "Detected vectorizable nested cycle.");
662 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_nested_cycle;
663 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
664 vect_nested_cycle;
666 else
668 if (dump_enabled_p ())
669 dump_printf_loc (MSG_NOTE, vect_location,
670 "Detected reduction.");
672 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def;
673 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
674 vect_reduction_def;
675 /* Store the reduction cycles for possible vectorization in
676 loop-aware SLP. */
677 LOOP_VINFO_REDUCTIONS (loop_vinfo).safe_push (reduc_stmt);
681 else
682 if (dump_enabled_p ())
683 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
684 "Unknown def-use cycle pattern.");
687 worklist.release ();
691 /* Function vect_analyze_scalar_cycles.
693 Examine the cross iteration def-use cycles of scalar variables, by
694 analyzing the loop-header PHIs of scalar variables. Classify each
695 cycle as one of the following: invariant, induction, reduction, unknown.
696 We do that for the loop represented by LOOP_VINFO, and also to its
697 inner-loop, if exists.
698 Examples for scalar cycles:
700 Example1: reduction:
702 loop1:
703 for (i=0; i<N; i++)
704 sum += a[i];
706 Example2: induction:
708 loop2:
709 for (i=0; i<N; i++)
710 a[i] = i; */
712 static void
713 vect_analyze_scalar_cycles (loop_vec_info loop_vinfo)
715 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
717 vect_analyze_scalar_cycles_1 (loop_vinfo, loop);
719 /* When vectorizing an outer-loop, the inner-loop is executed sequentially.
720 Reductions in such inner-loop therefore have different properties than
721 the reductions in the nest that gets vectorized:
722 1. When vectorized, they are executed in the same order as in the original
723 scalar loop, so we can't change the order of computation when
724 vectorizing them.
725 2. FIXME: Inner-loop reductions can be used in the inner-loop, so the
726 current checks are too strict. */
728 if (loop->inner)
729 vect_analyze_scalar_cycles_1 (loop_vinfo, loop->inner);
732 /* Function vect_get_loop_niters.
734 Determine how many iterations the loop is executed.
735 If an expression that represents the number of iterations
736 can be constructed, place it in NUMBER_OF_ITERATIONS.
737 Return the loop exit condition. */
739 static gimple
740 vect_get_loop_niters (struct loop *loop, tree *number_of_iterations)
742 tree niters;
744 if (dump_enabled_p ())
745 dump_printf_loc (MSG_NOTE, vect_location,
746 "=== get_loop_niters ===");
747 niters = number_of_exit_cond_executions (loop);
749 if (niters != NULL_TREE
750 && niters != chrec_dont_know)
752 *number_of_iterations = niters;
754 if (dump_enabled_p ())
756 dump_printf_loc (MSG_NOTE, vect_location, "==> get_loop_niters:");
757 dump_generic_expr (MSG_NOTE, TDF_SLIM, *number_of_iterations);
761 return get_loop_exit_condition (loop);
765 /* Function bb_in_loop_p
767 Used as predicate for dfs order traversal of the loop bbs. */
769 static bool
770 bb_in_loop_p (const_basic_block bb, const void *data)
772 const struct loop *const loop = (const struct loop *)data;
773 if (flow_bb_inside_loop_p (loop, bb))
774 return true;
775 return false;
779 /* Function new_loop_vec_info.
781 Create and initialize a new loop_vec_info struct for LOOP, as well as
782 stmt_vec_info structs for all the stmts in LOOP. */
784 static loop_vec_info
785 new_loop_vec_info (struct loop *loop)
787 loop_vec_info res;
788 basic_block *bbs;
789 gimple_stmt_iterator si;
790 unsigned int i, nbbs;
792 res = (loop_vec_info) xcalloc (1, sizeof (struct _loop_vec_info));
793 LOOP_VINFO_LOOP (res) = loop;
795 bbs = get_loop_body (loop);
797 /* Create/Update stmt_info for all stmts in the loop. */
798 for (i = 0; i < loop->num_nodes; i++)
800 basic_block bb = bbs[i];
802 /* BBs in a nested inner-loop will have been already processed (because
803 we will have called vect_analyze_loop_form for any nested inner-loop).
804 Therefore, for stmts in an inner-loop we just want to update the
805 STMT_VINFO_LOOP_VINFO field of their stmt_info to point to the new
806 loop_info of the outer-loop we are currently considering to vectorize
807 (instead of the loop_info of the inner-loop).
808 For stmts in other BBs we need to create a stmt_info from scratch. */
809 if (bb->loop_father != loop)
811 /* Inner-loop bb. */
812 gcc_assert (loop->inner && bb->loop_father == loop->inner);
813 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
815 gimple phi = gsi_stmt (si);
816 stmt_vec_info stmt_info = vinfo_for_stmt (phi);
817 loop_vec_info inner_loop_vinfo =
818 STMT_VINFO_LOOP_VINFO (stmt_info);
819 gcc_assert (loop->inner == LOOP_VINFO_LOOP (inner_loop_vinfo));
820 STMT_VINFO_LOOP_VINFO (stmt_info) = res;
822 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
824 gimple stmt = gsi_stmt (si);
825 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
826 loop_vec_info inner_loop_vinfo =
827 STMT_VINFO_LOOP_VINFO (stmt_info);
828 gcc_assert (loop->inner == LOOP_VINFO_LOOP (inner_loop_vinfo));
829 STMT_VINFO_LOOP_VINFO (stmt_info) = res;
832 else
834 /* bb in current nest. */
835 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
837 gimple phi = gsi_stmt (si);
838 gimple_set_uid (phi, 0);
839 set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, res, NULL));
842 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
844 gimple stmt = gsi_stmt (si);
845 gimple_set_uid (stmt, 0);
846 set_vinfo_for_stmt (stmt, new_stmt_vec_info (stmt, res, NULL));
851 /* CHECKME: We want to visit all BBs before their successors (except for
852 latch blocks, for which this assertion wouldn't hold). In the simple
853 case of the loop forms we allow, a dfs order of the BBs would the same
854 as reversed postorder traversal, so we are safe. */
856 free (bbs);
857 bbs = XCNEWVEC (basic_block, loop->num_nodes);
858 nbbs = dfs_enumerate_from (loop->header, 0, bb_in_loop_p,
859 bbs, loop->num_nodes, loop);
860 gcc_assert (nbbs == loop->num_nodes);
862 LOOP_VINFO_BBS (res) = bbs;
863 LOOP_VINFO_NITERS (res) = NULL;
864 LOOP_VINFO_NITERS_UNCHANGED (res) = NULL;
865 LOOP_VINFO_COST_MODEL_MIN_ITERS (res) = 0;
866 LOOP_VINFO_VECTORIZABLE_P (res) = 0;
867 LOOP_PEELING_FOR_ALIGNMENT (res) = 0;
868 LOOP_VINFO_VECT_FACTOR (res) = 0;
869 LOOP_VINFO_LOOP_NEST (res).create (3);
870 LOOP_VINFO_DATAREFS (res).create (10);
871 LOOP_VINFO_DDRS (res).create (10 * 10);
872 LOOP_VINFO_UNALIGNED_DR (res) = NULL;
873 LOOP_VINFO_MAY_MISALIGN_STMTS (res).create (
874 PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIGNMENT_CHECKS));
875 LOOP_VINFO_MAY_ALIAS_DDRS (res).create (
876 PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS));
877 LOOP_VINFO_GROUPED_STORES (res).create (10);
878 LOOP_VINFO_REDUCTIONS (res).create (10);
879 LOOP_VINFO_REDUCTION_CHAINS (res).create (10);
880 LOOP_VINFO_SLP_INSTANCES (res).create (10);
881 LOOP_VINFO_SLP_UNROLLING_FACTOR (res) = 1;
882 LOOP_VINFO_TARGET_COST_DATA (res) = init_cost (loop);
883 LOOP_VINFO_PEELING_FOR_GAPS (res) = false;
884 LOOP_VINFO_OPERANDS_SWAPPED (res) = false;
886 return res;
890 /* Function destroy_loop_vec_info.
892 Free LOOP_VINFO struct, as well as all the stmt_vec_info structs of all the
893 stmts in the loop. */
895 void
896 destroy_loop_vec_info (loop_vec_info loop_vinfo, bool clean_stmts)
898 struct loop *loop;
899 basic_block *bbs;
900 int nbbs;
901 gimple_stmt_iterator si;
902 int j;
903 vec<slp_instance> slp_instances;
904 slp_instance instance;
905 bool swapped;
907 if (!loop_vinfo)
908 return;
910 loop = LOOP_VINFO_LOOP (loop_vinfo);
912 bbs = LOOP_VINFO_BBS (loop_vinfo);
913 nbbs = clean_stmts ? loop->num_nodes : 0;
914 swapped = LOOP_VINFO_OPERANDS_SWAPPED (loop_vinfo);
916 for (j = 0; j < nbbs; j++)
918 basic_block bb = bbs[j];
919 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
920 free_stmt_vec_info (gsi_stmt (si));
922 for (si = gsi_start_bb (bb); !gsi_end_p (si); )
924 gimple stmt = gsi_stmt (si);
926 /* We may have broken canonical form by moving a constant
927 into RHS1 of a commutative op. Fix such occurrences. */
928 if (swapped && is_gimple_assign (stmt))
930 enum tree_code code = gimple_assign_rhs_code (stmt);
932 if ((code == PLUS_EXPR
933 || code == POINTER_PLUS_EXPR
934 || code == MULT_EXPR)
935 && CONSTANT_CLASS_P (gimple_assign_rhs1 (stmt)))
936 swap_tree_operands (stmt,
937 gimple_assign_rhs1_ptr (stmt),
938 gimple_assign_rhs2_ptr (stmt));
941 /* Free stmt_vec_info. */
942 free_stmt_vec_info (stmt);
943 gsi_next (&si);
947 free (LOOP_VINFO_BBS (loop_vinfo));
948 free_data_refs (LOOP_VINFO_DATAREFS (loop_vinfo));
949 free_dependence_relations (LOOP_VINFO_DDRS (loop_vinfo));
950 LOOP_VINFO_LOOP_NEST (loop_vinfo).release ();
951 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).release ();
952 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).release ();
953 slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
954 FOR_EACH_VEC_ELT (slp_instances, j, instance)
955 vect_free_slp_instance (instance);
957 LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
958 LOOP_VINFO_GROUPED_STORES (loop_vinfo).release ();
959 LOOP_VINFO_REDUCTIONS (loop_vinfo).release ();
960 LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).release ();
962 if (LOOP_VINFO_PEELING_HTAB (loop_vinfo).is_created ())
963 LOOP_VINFO_PEELING_HTAB (loop_vinfo).dispose ();
965 destroy_cost_data (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo));
967 free (loop_vinfo);
968 loop->aux = NULL;
972 /* Function vect_analyze_loop_1.
974 Apply a set of analyses on LOOP, and create a loop_vec_info struct
975 for it. The different analyses will record information in the
976 loop_vec_info struct. This is a subset of the analyses applied in
977 vect_analyze_loop, to be applied on an inner-loop nested in the loop
978 that is now considered for (outer-loop) vectorization. */
980 static loop_vec_info
981 vect_analyze_loop_1 (struct loop *loop)
983 loop_vec_info loop_vinfo;
985 if (dump_enabled_p ())
986 dump_printf_loc (MSG_NOTE, vect_location,
987 "===== analyze_loop_nest_1 =====");
989 /* Check the CFG characteristics of the loop (nesting, entry/exit, etc. */
991 loop_vinfo = vect_analyze_loop_form (loop);
992 if (!loop_vinfo)
994 if (dump_enabled_p ())
995 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
996 "bad inner-loop form.");
997 return NULL;
1000 return loop_vinfo;
1004 /* Function vect_analyze_loop_form.
1006 Verify that certain CFG restrictions hold, including:
1007 - the loop has a pre-header
1008 - the loop has a single entry and exit
1009 - the loop exit condition is simple enough, and the number of iterations
1010 can be analyzed (a countable loop). */
1012 loop_vec_info
1013 vect_analyze_loop_form (struct loop *loop)
1015 loop_vec_info loop_vinfo;
1016 gimple loop_cond;
1017 tree number_of_iterations = NULL;
1018 loop_vec_info inner_loop_vinfo = NULL;
1020 if (dump_enabled_p ())
1021 dump_printf_loc (MSG_NOTE, vect_location,
1022 "=== vect_analyze_loop_form ===");
1024 /* Different restrictions apply when we are considering an inner-most loop,
1025 vs. an outer (nested) loop.
1026 (FORNOW. May want to relax some of these restrictions in the future). */
1028 if (!loop->inner)
1030 /* Inner-most loop. We currently require that the number of BBs is
1031 exactly 2 (the header and latch). Vectorizable inner-most loops
1032 look like this:
1034 (pre-header)
1036 header <--------+
1037 | | |
1038 | +--> latch --+
1040 (exit-bb) */
1042 if (loop->num_nodes != 2)
1044 if (dump_enabled_p ())
1045 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1046 "not vectorized: control flow in loop.");
1047 return NULL;
1050 if (empty_block_p (loop->header))
1052 if (dump_enabled_p ())
1053 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1054 "not vectorized: empty loop.");
1055 return NULL;
1058 else
1060 struct loop *innerloop = loop->inner;
1061 edge entryedge;
1063 /* Nested loop. We currently require that the loop is doubly-nested,
1064 contains a single inner loop, and the number of BBs is exactly 5.
1065 Vectorizable outer-loops look like this:
1067 (pre-header)
1069 header <---+
1071 inner-loop |
1073 tail ------+
1075 (exit-bb)
1077 The inner-loop has the properties expected of inner-most loops
1078 as described above. */
1080 if ((loop->inner)->inner || (loop->inner)->next)
1082 if (dump_enabled_p ())
1083 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1084 "not vectorized: multiple nested loops.");
1085 return NULL;
1088 /* Analyze the inner-loop. */
1089 inner_loop_vinfo = vect_analyze_loop_1 (loop->inner);
1090 if (!inner_loop_vinfo)
1092 if (dump_enabled_p ())
1093 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1094 "not vectorized: Bad inner loop.");
1095 return NULL;
1098 if (!expr_invariant_in_loop_p (loop,
1099 LOOP_VINFO_NITERS (inner_loop_vinfo)))
1101 if (dump_enabled_p ())
1102 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1103 "not vectorized: inner-loop count not invariant.");
1104 destroy_loop_vec_info (inner_loop_vinfo, true);
1105 return NULL;
1108 if (loop->num_nodes != 5)
1110 if (dump_enabled_p ())
1111 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1112 "not vectorized: control flow in loop.");
1113 destroy_loop_vec_info (inner_loop_vinfo, true);
1114 return NULL;
1117 gcc_assert (EDGE_COUNT (innerloop->header->preds) == 2);
1118 entryedge = EDGE_PRED (innerloop->header, 0);
1119 if (EDGE_PRED (innerloop->header, 0)->src == innerloop->latch)
1120 entryedge = EDGE_PRED (innerloop->header, 1);
1122 if (entryedge->src != loop->header
1123 || !single_exit (innerloop)
1124 || single_exit (innerloop)->dest != EDGE_PRED (loop->latch, 0)->src)
1126 if (dump_enabled_p ())
1127 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1128 "not vectorized: unsupported outerloop form.");
1129 destroy_loop_vec_info (inner_loop_vinfo, true);
1130 return NULL;
1133 if (dump_enabled_p ())
1134 dump_printf_loc (MSG_NOTE, vect_location,
1135 "Considering outer-loop vectorization.");
1138 if (!single_exit (loop)
1139 || EDGE_COUNT (loop->header->preds) != 2)
1141 if (dump_enabled_p ())
1143 if (!single_exit (loop))
1144 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1145 "not vectorized: multiple exits.");
1146 else if (EDGE_COUNT (loop->header->preds) != 2)
1147 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1148 "not vectorized: too many incoming edges.");
1150 if (inner_loop_vinfo)
1151 destroy_loop_vec_info (inner_loop_vinfo, true);
1152 return NULL;
1155 /* We assume that the loop exit condition is at the end of the loop. i.e,
1156 that the loop is represented as a do-while (with a proper if-guard
1157 before the loop if needed), where the loop header contains all the
1158 executable statements, and the latch is empty. */
1159 if (!empty_block_p (loop->latch)
1160 || !gimple_seq_empty_p (phi_nodes (loop->latch)))
1162 if (dump_enabled_p ())
1163 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1164 "not vectorized: latch block not empty.");
1165 if (inner_loop_vinfo)
1166 destroy_loop_vec_info (inner_loop_vinfo, true);
1167 return NULL;
1170 /* Make sure there exists a single-predecessor exit bb: */
1171 if (!single_pred_p (single_exit (loop)->dest))
1173 edge e = single_exit (loop);
1174 if (!(e->flags & EDGE_ABNORMAL))
1176 split_loop_exit_edge (e);
1177 if (dump_enabled_p ())
1178 dump_printf (MSG_NOTE, "split exit edge.");
1180 else
1182 if (dump_enabled_p ())
1183 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1184 "not vectorized: abnormal loop exit edge.");
1185 if (inner_loop_vinfo)
1186 destroy_loop_vec_info (inner_loop_vinfo, true);
1187 return NULL;
1191 loop_cond = vect_get_loop_niters (loop, &number_of_iterations);
1192 if (!loop_cond)
1194 if (dump_enabled_p ())
1195 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1196 "not vectorized: complicated exit condition.");
1197 if (inner_loop_vinfo)
1198 destroy_loop_vec_info (inner_loop_vinfo, true);
1199 return NULL;
1202 if (!number_of_iterations)
1204 if (dump_enabled_p ())
1205 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1206 "not vectorized: number of iterations cannot be "
1207 "computed.");
1208 if (inner_loop_vinfo)
1209 destroy_loop_vec_info (inner_loop_vinfo, true);
1210 return NULL;
1213 if (chrec_contains_undetermined (number_of_iterations))
1215 if (dump_enabled_p ())
1216 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1217 "Infinite number of iterations.");
1218 if (inner_loop_vinfo)
1219 destroy_loop_vec_info (inner_loop_vinfo, true);
1220 return NULL;
1223 if (!NITERS_KNOWN_P (number_of_iterations))
1225 if (dump_enabled_p ())
1227 dump_printf_loc (MSG_NOTE, vect_location,
1228 "Symbolic number of iterations is ");
1229 dump_generic_expr (MSG_NOTE, TDF_DETAILS, number_of_iterations);
1232 else if (TREE_INT_CST_LOW (number_of_iterations) == 0)
1234 if (dump_enabled_p ())
1235 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1236 "not vectorized: number of iterations = 0.");
1237 if (inner_loop_vinfo)
1238 destroy_loop_vec_info (inner_loop_vinfo, true);
1239 return NULL;
1242 loop_vinfo = new_loop_vec_info (loop);
1243 LOOP_VINFO_NITERS (loop_vinfo) = number_of_iterations;
1244 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = number_of_iterations;
1246 STMT_VINFO_TYPE (vinfo_for_stmt (loop_cond)) = loop_exit_ctrl_vec_info_type;
1248 /* CHECKME: May want to keep it around it in the future. */
1249 if (inner_loop_vinfo)
1250 destroy_loop_vec_info (inner_loop_vinfo, false);
1252 gcc_assert (!loop->aux);
1253 loop->aux = loop_vinfo;
1254 return loop_vinfo;
1258 /* Function vect_analyze_loop_operations.
1260 Scan the loop stmts and make sure they are all vectorizable. */
1262 static bool
1263 vect_analyze_loop_operations (loop_vec_info loop_vinfo, bool slp)
1265 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1266 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1267 int nbbs = loop->num_nodes;
1268 gimple_stmt_iterator si;
1269 unsigned int vectorization_factor = 0;
1270 int i;
1271 gimple phi;
1272 stmt_vec_info stmt_info;
1273 bool need_to_vectorize = false;
1274 int min_profitable_iters;
1275 int min_scalar_loop_bound;
1276 unsigned int th;
1277 bool only_slp_in_loop = true, ok;
1278 HOST_WIDE_INT max_niter;
1279 HOST_WIDE_INT estimated_niter;
1280 int min_profitable_estimate;
1282 if (dump_enabled_p ())
1283 dump_printf_loc (MSG_NOTE, vect_location,
1284 "=== vect_analyze_loop_operations ===");
1286 gcc_assert (LOOP_VINFO_VECT_FACTOR (loop_vinfo));
1287 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1288 if (slp)
1290 /* If all the stmts in the loop can be SLPed, we perform only SLP, and
1291 vectorization factor of the loop is the unrolling factor required by
1292 the SLP instances. If that unrolling factor is 1, we say, that we
1293 perform pure SLP on loop - cross iteration parallelism is not
1294 exploited. */
1295 for (i = 0; i < nbbs; i++)
1297 basic_block bb = bbs[i];
1298 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1300 gimple stmt = gsi_stmt (si);
1301 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1302 gcc_assert (stmt_info);
1303 if ((STMT_VINFO_RELEVANT_P (stmt_info)
1304 || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1305 && !PURE_SLP_STMT (stmt_info))
1306 /* STMT needs both SLP and loop-based vectorization. */
1307 only_slp_in_loop = false;
1311 if (only_slp_in_loop)
1312 vectorization_factor = LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo);
1313 else
1314 vectorization_factor = least_common_multiple (vectorization_factor,
1315 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo));
1317 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
1318 if (dump_enabled_p ())
1319 dump_printf_loc (MSG_NOTE, vect_location,
1320 "Updating vectorization factor to %d ",
1321 vectorization_factor);
1324 for (i = 0; i < nbbs; i++)
1326 basic_block bb = bbs[i];
1328 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
1330 phi = gsi_stmt (si);
1331 ok = true;
1333 stmt_info = vinfo_for_stmt (phi);
1334 if (dump_enabled_p ())
1336 dump_printf_loc (MSG_NOTE, vect_location, "examining phi: ");
1337 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
1340 /* Inner-loop loop-closed exit phi in outer-loop vectorization
1341 (i.e., a phi in the tail of the outer-loop). */
1342 if (! is_loop_header_bb_p (bb))
1344 /* FORNOW: we currently don't support the case that these phis
1345 are not used in the outerloop (unless it is double reduction,
1346 i.e., this phi is vect_reduction_def), cause this case
1347 requires to actually do something here. */
1348 if ((!STMT_VINFO_RELEVANT_P (stmt_info)
1349 || STMT_VINFO_LIVE_P (stmt_info))
1350 && STMT_VINFO_DEF_TYPE (stmt_info)
1351 != vect_double_reduction_def)
1353 if (dump_enabled_p ())
1354 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1355 "Unsupported loop-closed phi in "
1356 "outer-loop.");
1357 return false;
1360 /* If PHI is used in the outer loop, we check that its operand
1361 is defined in the inner loop. */
1362 if (STMT_VINFO_RELEVANT_P (stmt_info))
1364 tree phi_op;
1365 gimple op_def_stmt;
1367 if (gimple_phi_num_args (phi) != 1)
1368 return false;
1370 phi_op = PHI_ARG_DEF (phi, 0);
1371 if (TREE_CODE (phi_op) != SSA_NAME)
1372 return false;
1374 op_def_stmt = SSA_NAME_DEF_STMT (phi_op);
1375 if (!op_def_stmt
1376 || !flow_bb_inside_loop_p (loop, gimple_bb (op_def_stmt))
1377 || !vinfo_for_stmt (op_def_stmt))
1378 return false;
1380 if (STMT_VINFO_RELEVANT (vinfo_for_stmt (op_def_stmt))
1381 != vect_used_in_outer
1382 && STMT_VINFO_RELEVANT (vinfo_for_stmt (op_def_stmt))
1383 != vect_used_in_outer_by_reduction)
1384 return false;
1387 continue;
1390 gcc_assert (stmt_info);
1392 if (STMT_VINFO_LIVE_P (stmt_info))
1394 /* FORNOW: not yet supported. */
1395 if (dump_enabled_p ())
1396 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1397 "not vectorized: value used after loop.");
1398 return false;
1401 if (STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope
1402 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
1404 /* A scalar-dependence cycle that we don't support. */
1405 if (dump_enabled_p ())
1406 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1407 "not vectorized: scalar dependence cycle.");
1408 return false;
1411 if (STMT_VINFO_RELEVANT_P (stmt_info))
1413 need_to_vectorize = true;
1414 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def)
1415 ok = vectorizable_induction (phi, NULL, NULL);
1418 if (!ok)
1420 if (dump_enabled_p ())
1422 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1423 "not vectorized: relevant phi not "
1424 "supported: ");
1425 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, phi, 0);
1427 return false;
1431 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1433 gimple stmt = gsi_stmt (si);
1434 if (!vect_analyze_stmt (stmt, &need_to_vectorize, NULL))
1435 return false;
1437 } /* bbs */
1439 /* All operations in the loop are either irrelevant (deal with loop
1440 control, or dead), or only used outside the loop and can be moved
1441 out of the loop (e.g. invariants, inductions). The loop can be
1442 optimized away by scalar optimizations. We're better off not
1443 touching this loop. */
1444 if (!need_to_vectorize)
1446 if (dump_enabled_p ())
1447 dump_printf_loc (MSG_NOTE, vect_location,
1448 "All the computation can be taken out of the loop.");
1449 if (dump_enabled_p ())
1450 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1451 "not vectorized: redundant loop. no profit to "
1452 "vectorize.");
1453 return false;
1456 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && dump_enabled_p ())
1457 dump_printf_loc (MSG_NOTE, vect_location,
1458 "vectorization_factor = %d, niters = "
1459 HOST_WIDE_INT_PRINT_DEC, vectorization_factor,
1460 LOOP_VINFO_INT_NITERS (loop_vinfo));
1462 if ((LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1463 && (LOOP_VINFO_INT_NITERS (loop_vinfo) < vectorization_factor))
1464 || ((max_niter = max_stmt_executions_int (loop)) != -1
1465 && (unsigned HOST_WIDE_INT) max_niter < vectorization_factor))
1467 if (dump_enabled_p ())
1468 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1469 "not vectorized: iteration count too small.");
1470 if (dump_enabled_p ())
1471 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1472 "not vectorized: iteration count smaller than "
1473 "vectorization factor.");
1474 return false;
1477 /* Analyze cost. Decide if worth while to vectorize. */
1479 /* Once VF is set, SLP costs should be updated since the number of created
1480 vector stmts depends on VF. */
1481 vect_update_slp_costs_according_to_vf (loop_vinfo);
1483 vect_estimate_min_profitable_iters (loop_vinfo, &min_profitable_iters,
1484 &min_profitable_estimate);
1485 LOOP_VINFO_COST_MODEL_MIN_ITERS (loop_vinfo) = min_profitable_iters;
1487 if (min_profitable_iters < 0)
1489 if (dump_enabled_p ())
1490 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1491 "not vectorized: vectorization not profitable.");
1492 if (dump_enabled_p ())
1493 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1494 "not vectorized: vector version will never be "
1495 "profitable.");
1496 return false;
1499 min_scalar_loop_bound = ((PARAM_VALUE (PARAM_MIN_VECT_LOOP_BOUND)
1500 * vectorization_factor) - 1);
1503 /* Use the cost model only if it is more conservative than user specified
1504 threshold. */
1506 th = (unsigned) min_scalar_loop_bound;
1507 if (min_profitable_iters
1508 && (!min_scalar_loop_bound
1509 || min_profitable_iters > min_scalar_loop_bound))
1510 th = (unsigned) min_profitable_iters;
1512 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1513 && LOOP_VINFO_INT_NITERS (loop_vinfo) <= th)
1515 if (dump_enabled_p ())
1516 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1517 "not vectorized: vectorization not profitable.");
1518 if (dump_enabled_p ())
1519 dump_printf_loc (MSG_NOTE, vect_location,
1520 "not vectorized: iteration count smaller than user "
1521 "specified loop bound parameter or minimum profitable "
1522 "iterations (whichever is more conservative).");
1523 return false;
1526 if ((estimated_niter = estimated_stmt_executions_int (loop)) != -1
1527 && ((unsigned HOST_WIDE_INT) estimated_niter
1528 <= MAX (th, (unsigned)min_profitable_estimate)))
1530 if (dump_enabled_p ())
1531 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1532 "not vectorized: estimated iteration count too "
1533 "small.");
1534 if (dump_enabled_p ())
1535 dump_printf_loc (MSG_NOTE, vect_location,
1536 "not vectorized: estimated iteration count smaller "
1537 "than specified loop bound parameter or minimum "
1538 "profitable iterations (whichever is more "
1539 "conservative).");
1540 return false;
1543 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1544 || LOOP_VINFO_INT_NITERS (loop_vinfo) % vectorization_factor != 0
1545 || LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo))
1547 if (dump_enabled_p ())
1548 dump_printf_loc (MSG_NOTE, vect_location, "epilog loop required.");
1549 if (!vect_can_advance_ivs_p (loop_vinfo))
1551 if (dump_enabled_p ())
1552 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1553 "not vectorized: can't create epilog loop 1.");
1554 return false;
1556 if (!slpeel_can_duplicate_loop_p (loop, single_exit (loop)))
1558 if (dump_enabled_p ())
1559 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1560 "not vectorized: can't create epilog loop 2.");
1561 return false;
1565 return true;
1569 /* Function vect_analyze_loop_2.
1571 Apply a set of analyses on LOOP, and create a loop_vec_info struct
1572 for it. The different analyses will record information in the
1573 loop_vec_info struct. */
1574 static bool
1575 vect_analyze_loop_2 (loop_vec_info loop_vinfo)
1577 bool ok, slp = false;
1578 int max_vf = MAX_VECTORIZATION_FACTOR;
1579 int min_vf = 2;
1581 /* Find all data references in the loop (which correspond to vdefs/vuses)
1582 and analyze their evolution in the loop. Also adjust the minimal
1583 vectorization factor according to the loads and stores.
1585 FORNOW: Handle only simple, array references, which
1586 alignment can be forced, and aligned pointer-references. */
1588 ok = vect_analyze_data_refs (loop_vinfo, NULL, &min_vf);
1589 if (!ok)
1591 if (dump_enabled_p ())
1592 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1593 "bad data references.");
1594 return false;
1597 /* Analyze the access patterns of the data-refs in the loop (consecutive,
1598 complex, etc.). FORNOW: Only handle consecutive access pattern. */
1600 ok = vect_analyze_data_ref_accesses (loop_vinfo, NULL);
1601 if (!ok)
1603 if (dump_enabled_p ())
1604 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1605 "bad data access.");
1606 return false;
1609 /* Classify all cross-iteration scalar data-flow cycles.
1610 Cross-iteration cycles caused by virtual phis are analyzed separately. */
1612 vect_analyze_scalar_cycles (loop_vinfo);
1614 vect_pattern_recog (loop_vinfo, NULL);
1616 /* Data-flow analysis to detect stmts that do not need to be vectorized. */
1618 ok = vect_mark_stmts_to_be_vectorized (loop_vinfo);
1619 if (!ok)
1621 if (dump_enabled_p ())
1622 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1623 "unexpected pattern.");
1624 return false;
1627 /* Analyze data dependences between the data-refs in the loop
1628 and adjust the maximum vectorization factor according to
1629 the dependences.
1630 FORNOW: fail at the first data dependence that we encounter. */
1632 ok = vect_analyze_data_ref_dependences (loop_vinfo, &max_vf);
1633 if (!ok
1634 || max_vf < min_vf)
1636 if (dump_enabled_p ())
1637 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1638 "bad data dependence.");
1639 return false;
1642 ok = vect_determine_vectorization_factor (loop_vinfo);
1643 if (!ok)
1645 if (dump_enabled_p ())
1646 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1647 "can't determine vectorization factor.");
1648 return false;
1650 if (max_vf < LOOP_VINFO_VECT_FACTOR (loop_vinfo))
1652 if (dump_enabled_p ())
1653 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1654 "bad data dependence.");
1655 return false;
1658 /* Analyze the alignment of the data-refs in the loop.
1659 Fail if a data reference is found that cannot be vectorized. */
1661 ok = vect_analyze_data_refs_alignment (loop_vinfo, NULL);
1662 if (!ok)
1664 if (dump_enabled_p ())
1665 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1666 "bad data alignment.");
1667 return false;
1670 /* Prune the list of ddrs to be tested at run-time by versioning for alias.
1671 It is important to call pruning after vect_analyze_data_ref_accesses,
1672 since we use grouping information gathered by interleaving analysis. */
1673 ok = vect_prune_runtime_alias_test_list (loop_vinfo);
1674 if (!ok)
1676 if (dump_enabled_p ())
1677 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1678 "too long list of versioning for alias "
1679 "run-time tests.");
1680 return false;
1683 /* This pass will decide on using loop versioning and/or loop peeling in
1684 order to enhance the alignment of data references in the loop. */
1686 ok = vect_enhance_data_refs_alignment (loop_vinfo);
1687 if (!ok)
1689 if (dump_enabled_p ())
1690 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1691 "bad data alignment.");
1692 return false;
1695 /* Check the SLP opportunities in the loop, analyze and build SLP trees. */
1696 ok = vect_analyze_slp (loop_vinfo, NULL);
1697 if (ok)
1699 /* Decide which possible SLP instances to SLP. */
1700 slp = vect_make_slp_decision (loop_vinfo);
1702 /* Find stmts that need to be both vectorized and SLPed. */
1703 vect_detect_hybrid_slp (loop_vinfo);
1705 else
1706 return false;
1708 /* Scan all the operations in the loop and make sure they are
1709 vectorizable. */
1711 ok = vect_analyze_loop_operations (loop_vinfo, slp);
1712 if (!ok)
1714 if (dump_enabled_p ())
1715 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1716 "bad operation or unsupported loop bound.");
1717 return false;
1720 return true;
1723 /* Function vect_analyze_loop.
1725 Apply a set of analyses on LOOP, and create a loop_vec_info struct
1726 for it. The different analyses will record information in the
1727 loop_vec_info struct. */
1728 loop_vec_info
1729 vect_analyze_loop (struct loop *loop)
1731 loop_vec_info loop_vinfo;
1732 unsigned int vector_sizes;
1734 /* Autodetect first vector size we try. */
1735 current_vector_size = 0;
1736 vector_sizes = targetm.vectorize.autovectorize_vector_sizes ();
1738 if (dump_enabled_p ())
1739 dump_printf_loc (MSG_NOTE, vect_location,
1740 "===== analyze_loop_nest =====");
1742 if (loop_outer (loop)
1743 && loop_vec_info_for_loop (loop_outer (loop))
1744 && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop))))
1746 if (dump_enabled_p ())
1747 dump_printf_loc (MSG_NOTE, vect_location,
1748 "outer-loop already vectorized.");
1749 return NULL;
1752 while (1)
1754 /* Check the CFG characteristics of the loop (nesting, entry/exit). */
1755 loop_vinfo = vect_analyze_loop_form (loop);
1756 if (!loop_vinfo)
1758 if (dump_enabled_p ())
1759 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1760 "bad loop form.");
1761 return NULL;
1764 if (vect_analyze_loop_2 (loop_vinfo))
1766 LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1;
1768 return loop_vinfo;
1771 destroy_loop_vec_info (loop_vinfo, true);
1773 vector_sizes &= ~current_vector_size;
1774 if (vector_sizes == 0
1775 || current_vector_size == 0)
1776 return NULL;
1778 /* Try the next biggest vector size. */
1779 current_vector_size = 1 << floor_log2 (vector_sizes);
1780 if (dump_enabled_p ())
1781 dump_printf_loc (MSG_NOTE, vect_location,
1782 "***** Re-trying analysis with "
1783 "vector size %d\n", current_vector_size);
1788 /* Function reduction_code_for_scalar_code
1790 Input:
1791 CODE - tree_code of a reduction operations.
1793 Output:
1794 REDUC_CODE - the corresponding tree-code to be used to reduce the
1795 vector of partial results into a single scalar result (which
1796 will also reside in a vector) or ERROR_MARK if the operation is
1797 a supported reduction operation, but does not have such tree-code.
1799 Return FALSE if CODE currently cannot be vectorized as reduction. */
1801 static bool
1802 reduction_code_for_scalar_code (enum tree_code code,
1803 enum tree_code *reduc_code)
1805 switch (code)
1807 case MAX_EXPR:
1808 *reduc_code = REDUC_MAX_EXPR;
1809 return true;
1811 case MIN_EXPR:
1812 *reduc_code = REDUC_MIN_EXPR;
1813 return true;
1815 case PLUS_EXPR:
1816 *reduc_code = REDUC_PLUS_EXPR;
1817 return true;
1819 case MULT_EXPR:
1820 case MINUS_EXPR:
1821 case BIT_IOR_EXPR:
1822 case BIT_XOR_EXPR:
1823 case BIT_AND_EXPR:
1824 *reduc_code = ERROR_MARK;
1825 return true;
1827 default:
1828 return false;
1833 /* Error reporting helper for vect_is_simple_reduction below. GIMPLE statement
1834 STMT is printed with a message MSG. */
1836 static void
1837 report_vect_op (int msg_type, gimple stmt, const char *msg)
1839 dump_printf_loc (msg_type, vect_location, "%s", msg);
1840 dump_gimple_stmt (msg_type, TDF_SLIM, stmt, 0);
1844 /* Detect SLP reduction of the form:
1846 #a1 = phi <a5, a0>
1847 a2 = operation (a1)
1848 a3 = operation (a2)
1849 a4 = operation (a3)
1850 a5 = operation (a4)
1852 #a = phi <a5>
1854 PHI is the reduction phi node (#a1 = phi <a5, a0> above)
1855 FIRST_STMT is the first reduction stmt in the chain
1856 (a2 = operation (a1)).
1858 Return TRUE if a reduction chain was detected. */
1860 static bool
1861 vect_is_slp_reduction (loop_vec_info loop_info, gimple phi, gimple first_stmt)
1863 struct loop *loop = (gimple_bb (phi))->loop_father;
1864 struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
1865 enum tree_code code;
1866 gimple current_stmt = NULL, loop_use_stmt = NULL, first, next_stmt;
1867 stmt_vec_info use_stmt_info, current_stmt_info;
1868 tree lhs;
1869 imm_use_iterator imm_iter;
1870 use_operand_p use_p;
1871 int nloop_uses, size = 0, n_out_of_loop_uses;
1872 bool found = false;
1874 if (loop != vect_loop)
1875 return false;
1877 lhs = PHI_RESULT (phi);
1878 code = gimple_assign_rhs_code (first_stmt);
1879 while (1)
1881 nloop_uses = 0;
1882 n_out_of_loop_uses = 0;
1883 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
1885 gimple use_stmt = USE_STMT (use_p);
1886 if (is_gimple_debug (use_stmt))
1887 continue;
1889 use_stmt = USE_STMT (use_p);
1891 /* Check if we got back to the reduction phi. */
1892 if (use_stmt == phi)
1894 loop_use_stmt = use_stmt;
1895 found = true;
1896 break;
1899 if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
1901 if (vinfo_for_stmt (use_stmt)
1902 && !STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (use_stmt)))
1904 loop_use_stmt = use_stmt;
1905 nloop_uses++;
1908 else
1909 n_out_of_loop_uses++;
1911 /* There are can be either a single use in the loop or two uses in
1912 phi nodes. */
1913 if (nloop_uses > 1 || (n_out_of_loop_uses && nloop_uses))
1914 return false;
1917 if (found)
1918 break;
1920 /* We reached a statement with no loop uses. */
1921 if (nloop_uses == 0)
1922 return false;
1924 /* This is a loop exit phi, and we haven't reached the reduction phi. */
1925 if (gimple_code (loop_use_stmt) == GIMPLE_PHI)
1926 return false;
1928 if (!is_gimple_assign (loop_use_stmt)
1929 || code != gimple_assign_rhs_code (loop_use_stmt)
1930 || !flow_bb_inside_loop_p (loop, gimple_bb (loop_use_stmt)))
1931 return false;
1933 /* Insert USE_STMT into reduction chain. */
1934 use_stmt_info = vinfo_for_stmt (loop_use_stmt);
1935 if (current_stmt)
1937 current_stmt_info = vinfo_for_stmt (current_stmt);
1938 GROUP_NEXT_ELEMENT (current_stmt_info) = loop_use_stmt;
1939 GROUP_FIRST_ELEMENT (use_stmt_info)
1940 = GROUP_FIRST_ELEMENT (current_stmt_info);
1942 else
1943 GROUP_FIRST_ELEMENT (use_stmt_info) = loop_use_stmt;
1945 lhs = gimple_assign_lhs (loop_use_stmt);
1946 current_stmt = loop_use_stmt;
1947 size++;
1950 if (!found || loop_use_stmt != phi || size < 2)
1951 return false;
1953 /* Swap the operands, if needed, to make the reduction operand be the second
1954 operand. */
1955 lhs = PHI_RESULT (phi);
1956 next_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (current_stmt));
1957 while (next_stmt)
1959 if (gimple_assign_rhs2 (next_stmt) == lhs)
1961 tree op = gimple_assign_rhs1 (next_stmt);
1962 gimple def_stmt = NULL;
1964 if (TREE_CODE (op) == SSA_NAME)
1965 def_stmt = SSA_NAME_DEF_STMT (op);
1967 /* Check that the other def is either defined in the loop
1968 ("vect_internal_def"), or it's an induction (defined by a
1969 loop-header phi-node). */
1970 if (def_stmt
1971 && gimple_bb (def_stmt)
1972 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
1973 && (is_gimple_assign (def_stmt)
1974 || is_gimple_call (def_stmt)
1975 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
1976 == vect_induction_def
1977 || (gimple_code (def_stmt) == GIMPLE_PHI
1978 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
1979 == vect_internal_def
1980 && !is_loop_header_bb_p (gimple_bb (def_stmt)))))
1982 lhs = gimple_assign_lhs (next_stmt);
1983 next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
1984 continue;
1987 return false;
1989 else
1991 tree op = gimple_assign_rhs2 (next_stmt);
1992 gimple def_stmt = NULL;
1994 if (TREE_CODE (op) == SSA_NAME)
1995 def_stmt = SSA_NAME_DEF_STMT (op);
1997 /* Check that the other def is either defined in the loop
1998 ("vect_internal_def"), or it's an induction (defined by a
1999 loop-header phi-node). */
2000 if (def_stmt
2001 && gimple_bb (def_stmt)
2002 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2003 && (is_gimple_assign (def_stmt)
2004 || is_gimple_call (def_stmt)
2005 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2006 == vect_induction_def
2007 || (gimple_code (def_stmt) == GIMPLE_PHI
2008 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2009 == vect_internal_def
2010 && !is_loop_header_bb_p (gimple_bb (def_stmt)))))
2012 if (dump_enabled_p ())
2014 dump_printf_loc (MSG_NOTE, vect_location, "swapping oprnds: ");
2015 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, next_stmt, 0);
2018 swap_tree_operands (next_stmt,
2019 gimple_assign_rhs1_ptr (next_stmt),
2020 gimple_assign_rhs2_ptr (next_stmt));
2021 update_stmt (next_stmt);
2023 if (CONSTANT_CLASS_P (gimple_assign_rhs1 (next_stmt)))
2024 LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true;
2026 else
2027 return false;
2030 lhs = gimple_assign_lhs (next_stmt);
2031 next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
2034 /* Save the chain for further analysis in SLP detection. */
2035 first = GROUP_FIRST_ELEMENT (vinfo_for_stmt (current_stmt));
2036 LOOP_VINFO_REDUCTION_CHAINS (loop_info).safe_push (first);
2037 GROUP_SIZE (vinfo_for_stmt (first)) = size;
2039 return true;
2043 /* Function vect_is_simple_reduction_1
2045 (1) Detect a cross-iteration def-use cycle that represents a simple
2046 reduction computation. We look for the following pattern:
2048 loop_header:
2049 a1 = phi < a0, a2 >
2050 a3 = ...
2051 a2 = operation (a3, a1)
2053 such that:
2054 1. operation is commutative and associative and it is safe to
2055 change the order of the computation (if CHECK_REDUCTION is true)
2056 2. no uses for a2 in the loop (a2 is used out of the loop)
2057 3. no uses of a1 in the loop besides the reduction operation
2058 4. no uses of a1 outside the loop.
2060 Conditions 1,4 are tested here.
2061 Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.
2063 (2) Detect a cross-iteration def-use cycle in nested loops, i.e.,
2064 nested cycles, if CHECK_REDUCTION is false.
2066 (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double
2067 reductions:
2069 a1 = phi < a0, a2 >
2070 inner loop (def of a3)
2071 a2 = phi < a3 >
2073 If MODIFY is true it tries also to rework the code in-place to enable
2074 detection of more reduction patterns. For the time being we rewrite
2075 "res -= RHS" into "rhs += -RHS" when it seems worthwhile.
2078 static gimple
2079 vect_is_simple_reduction_1 (loop_vec_info loop_info, gimple phi,
2080 bool check_reduction, bool *double_reduc,
2081 bool modify)
2083 struct loop *loop = (gimple_bb (phi))->loop_father;
2084 struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
2085 edge latch_e = loop_latch_edge (loop);
2086 tree loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
2087 gimple def_stmt, def1 = NULL, def2 = NULL;
2088 enum tree_code orig_code, code;
2089 tree op1, op2, op3 = NULL_TREE, op4 = NULL_TREE;
2090 tree type;
2091 int nloop_uses;
2092 tree name;
2093 imm_use_iterator imm_iter;
2094 use_operand_p use_p;
2095 bool phi_def;
2097 *double_reduc = false;
2099 /* If CHECK_REDUCTION is true, we assume inner-most loop vectorization,
2100 otherwise, we assume outer loop vectorization. */
2101 gcc_assert ((check_reduction && loop == vect_loop)
2102 || (!check_reduction && flow_loop_nested_p (vect_loop, loop)));
2104 name = PHI_RESULT (phi);
2105 nloop_uses = 0;
2106 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name)
2108 gimple use_stmt = USE_STMT (use_p);
2109 if (is_gimple_debug (use_stmt))
2110 continue;
2112 if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2114 if (dump_enabled_p ())
2115 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2116 "intermediate value used outside loop.");
2118 return NULL;
2121 if (vinfo_for_stmt (use_stmt)
2122 && !is_pattern_stmt_p (vinfo_for_stmt (use_stmt)))
2123 nloop_uses++;
2124 if (nloop_uses > 1)
2126 if (dump_enabled_p ())
2127 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2128 "reduction used in loop.");
2129 return NULL;
2133 if (TREE_CODE (loop_arg) != SSA_NAME)
2135 if (dump_enabled_p ())
2137 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2138 "reduction: not ssa_name: ");
2139 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, loop_arg);
2141 return NULL;
2144 def_stmt = SSA_NAME_DEF_STMT (loop_arg);
2145 if (!def_stmt)
2147 if (dump_enabled_p ())
2148 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2149 "reduction: no def_stmt.");
2150 return NULL;
2153 if (!is_gimple_assign (def_stmt) && gimple_code (def_stmt) != GIMPLE_PHI)
2155 if (dump_enabled_p ())
2156 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, def_stmt, 0);
2157 return NULL;
2160 if (is_gimple_assign (def_stmt))
2162 name = gimple_assign_lhs (def_stmt);
2163 phi_def = false;
2165 else
2167 name = PHI_RESULT (def_stmt);
2168 phi_def = true;
2171 nloop_uses = 0;
2172 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name)
2174 gimple use_stmt = USE_STMT (use_p);
2175 if (is_gimple_debug (use_stmt))
2176 continue;
2177 if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt))
2178 && vinfo_for_stmt (use_stmt)
2179 && !is_pattern_stmt_p (vinfo_for_stmt (use_stmt)))
2180 nloop_uses++;
2181 if (nloop_uses > 1)
2183 if (dump_enabled_p ())
2184 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2185 "reduction used in loop.");
2186 return NULL;
2190 /* If DEF_STMT is a phi node itself, we expect it to have a single argument
2191 defined in the inner loop. */
2192 if (phi_def)
2194 op1 = PHI_ARG_DEF (def_stmt, 0);
2196 if (gimple_phi_num_args (def_stmt) != 1
2197 || TREE_CODE (op1) != SSA_NAME)
2199 if (dump_enabled_p ())
2200 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2201 "unsupported phi node definition.");
2203 return NULL;
2206 def1 = SSA_NAME_DEF_STMT (op1);
2207 if (flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2208 && loop->inner
2209 && flow_bb_inside_loop_p (loop->inner, gimple_bb (def1))
2210 && is_gimple_assign (def1))
2212 if (dump_enabled_p ())
2213 report_vect_op (MSG_NOTE, def_stmt,
2214 "detected double reduction: ");
2216 *double_reduc = true;
2217 return def_stmt;
2220 return NULL;
2223 code = orig_code = gimple_assign_rhs_code (def_stmt);
2225 /* We can handle "res -= x[i]", which is non-associative by
2226 simply rewriting this into "res += -x[i]". Avoid changing
2227 gimple instruction for the first simple tests and only do this
2228 if we're allowed to change code at all. */
2229 if (code == MINUS_EXPR
2230 && modify
2231 && (op1 = gimple_assign_rhs1 (def_stmt))
2232 && TREE_CODE (op1) == SSA_NAME
2233 && SSA_NAME_DEF_STMT (op1) == phi)
2234 code = PLUS_EXPR;
2236 if (check_reduction
2237 && (!commutative_tree_code (code) || !associative_tree_code (code)))
2239 if (dump_enabled_p ())
2240 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2241 "reduction: not commutative/associative: ");
2242 return NULL;
2245 if (get_gimple_rhs_class (code) != GIMPLE_BINARY_RHS)
2247 if (code != COND_EXPR)
2249 if (dump_enabled_p ())
2250 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2251 "reduction: not binary operation: ");
2253 return NULL;
2256 op3 = gimple_assign_rhs1 (def_stmt);
2257 if (COMPARISON_CLASS_P (op3))
2259 op4 = TREE_OPERAND (op3, 1);
2260 op3 = TREE_OPERAND (op3, 0);
2263 op1 = gimple_assign_rhs2 (def_stmt);
2264 op2 = gimple_assign_rhs3 (def_stmt);
2266 if (TREE_CODE (op1) != SSA_NAME && TREE_CODE (op2) != SSA_NAME)
2268 if (dump_enabled_p ())
2269 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2270 "reduction: uses not ssa_names: ");
2272 return NULL;
2275 else
2277 op1 = gimple_assign_rhs1 (def_stmt);
2278 op2 = gimple_assign_rhs2 (def_stmt);
2280 if (TREE_CODE (op1) != SSA_NAME && TREE_CODE (op2) != SSA_NAME)
2282 if (dump_enabled_p ())
2283 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2284 "reduction: uses not ssa_names: ");
2286 return NULL;
2290 type = TREE_TYPE (gimple_assign_lhs (def_stmt));
2291 if ((TREE_CODE (op1) == SSA_NAME
2292 && !types_compatible_p (type,TREE_TYPE (op1)))
2293 || (TREE_CODE (op2) == SSA_NAME
2294 && !types_compatible_p (type, TREE_TYPE (op2)))
2295 || (op3 && TREE_CODE (op3) == SSA_NAME
2296 && !types_compatible_p (type, TREE_TYPE (op3)))
2297 || (op4 && TREE_CODE (op4) == SSA_NAME
2298 && !types_compatible_p (type, TREE_TYPE (op4))))
2300 if (dump_enabled_p ())
2302 dump_printf_loc (MSG_NOTE, vect_location,
2303 "reduction: multiple types: operation type: ");
2304 dump_generic_expr (MSG_NOTE, TDF_SLIM, type);
2305 dump_printf (MSG_NOTE, ", operands types: ");
2306 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2307 TREE_TYPE (op1));
2308 dump_printf (MSG_NOTE, ",");
2309 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2310 TREE_TYPE (op2));
2311 if (op3)
2313 dump_printf (MSG_NOTE, ",");
2314 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2315 TREE_TYPE (op3));
2318 if (op4)
2320 dump_printf (MSG_NOTE, ",");
2321 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2322 TREE_TYPE (op4));
2326 return NULL;
2329 /* Check that it's ok to change the order of the computation.
2330 Generally, when vectorizing a reduction we change the order of the
2331 computation. This may change the behavior of the program in some
2332 cases, so we need to check that this is ok. One exception is when
2333 vectorizing an outer-loop: the inner-loop is executed sequentially,
2334 and therefore vectorizing reductions in the inner-loop during
2335 outer-loop vectorization is safe. */
2337 /* CHECKME: check for !flag_finite_math_only too? */
2338 if (SCALAR_FLOAT_TYPE_P (type) && !flag_associative_math
2339 && check_reduction)
2341 /* Changing the order of operations changes the semantics. */
2342 if (dump_enabled_p ())
2343 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2344 "reduction: unsafe fp math optimization: ");
2345 return NULL;
2347 else if (INTEGRAL_TYPE_P (type) && TYPE_OVERFLOW_TRAPS (type)
2348 && check_reduction)
2350 /* Changing the order of operations changes the semantics. */
2351 if (dump_enabled_p ())
2352 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2353 "reduction: unsafe int math optimization: ");
2354 return NULL;
2356 else if (SAT_FIXED_POINT_TYPE_P (type) && check_reduction)
2358 /* Changing the order of operations changes the semantics. */
2359 if (dump_enabled_p ())
2360 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2361 "reduction: unsafe fixed-point math optimization: ");
2362 return NULL;
2365 /* If we detected "res -= x[i]" earlier, rewrite it into
2366 "res += -x[i]" now. If this turns out to be useless reassoc
2367 will clean it up again. */
2368 if (orig_code == MINUS_EXPR)
2370 tree rhs = gimple_assign_rhs2 (def_stmt);
2371 tree negrhs = make_ssa_name (TREE_TYPE (rhs), NULL);
2372 gimple negate_stmt = gimple_build_assign_with_ops (NEGATE_EXPR, negrhs,
2373 rhs, NULL);
2374 gimple_stmt_iterator gsi = gsi_for_stmt (def_stmt);
2375 set_vinfo_for_stmt (negate_stmt, new_stmt_vec_info (negate_stmt,
2376 loop_info, NULL));
2377 gsi_insert_before (&gsi, negate_stmt, GSI_NEW_STMT);
2378 gimple_assign_set_rhs2 (def_stmt, negrhs);
2379 gimple_assign_set_rhs_code (def_stmt, PLUS_EXPR);
2380 update_stmt (def_stmt);
2383 /* Reduction is safe. We're dealing with one of the following:
2384 1) integer arithmetic and no trapv
2385 2) floating point arithmetic, and special flags permit this optimization
2386 3) nested cycle (i.e., outer loop vectorization). */
2387 if (TREE_CODE (op1) == SSA_NAME)
2388 def1 = SSA_NAME_DEF_STMT (op1);
2390 if (TREE_CODE (op2) == SSA_NAME)
2391 def2 = SSA_NAME_DEF_STMT (op2);
2393 if (code != COND_EXPR
2394 && ((!def1 || gimple_nop_p (def1)) && (!def2 || gimple_nop_p (def2))))
2396 if (dump_enabled_p ())
2397 report_vect_op (MSG_NOTE, def_stmt, "reduction: no defs for operands: ");
2398 return NULL;
2401 /* Check that one def is the reduction def, defined by PHI,
2402 the other def is either defined in the loop ("vect_internal_def"),
2403 or it's an induction (defined by a loop-header phi-node). */
2405 if (def2 && def2 == phi
2406 && (code == COND_EXPR
2407 || !def1 || gimple_nop_p (def1)
2408 || (def1 && flow_bb_inside_loop_p (loop, gimple_bb (def1))
2409 && (is_gimple_assign (def1)
2410 || is_gimple_call (def1)
2411 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1))
2412 == vect_induction_def
2413 || (gimple_code (def1) == GIMPLE_PHI
2414 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1))
2415 == vect_internal_def
2416 && !is_loop_header_bb_p (gimple_bb (def1)))))))
2418 if (dump_enabled_p ())
2419 report_vect_op (MSG_NOTE, def_stmt, "detected reduction: ");
2420 return def_stmt;
2423 if (def1 && def1 == phi
2424 && (code == COND_EXPR
2425 || !def2 || gimple_nop_p (def2)
2426 || (def2 && flow_bb_inside_loop_p (loop, gimple_bb (def2))
2427 && (is_gimple_assign (def2)
2428 || is_gimple_call (def2)
2429 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2))
2430 == vect_induction_def
2431 || (gimple_code (def2) == GIMPLE_PHI
2432 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2))
2433 == vect_internal_def
2434 && !is_loop_header_bb_p (gimple_bb (def2)))))))
2436 if (check_reduction)
2438 /* Swap operands (just for simplicity - so that the rest of the code
2439 can assume that the reduction variable is always the last (second)
2440 argument). */
2441 if (dump_enabled_p ())
2442 report_vect_op (MSG_NOTE, def_stmt,
2443 "detected reduction: need to swap operands: ");
2445 swap_tree_operands (def_stmt, gimple_assign_rhs1_ptr (def_stmt),
2446 gimple_assign_rhs2_ptr (def_stmt));
2448 if (CONSTANT_CLASS_P (gimple_assign_rhs1 (def_stmt)))
2449 LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true;
2451 else
2453 if (dump_enabled_p ())
2454 report_vect_op (MSG_NOTE, def_stmt, "detected reduction: ");
2457 return def_stmt;
2460 /* Try to find SLP reduction chain. */
2461 if (check_reduction && vect_is_slp_reduction (loop_info, phi, def_stmt))
2463 if (dump_enabled_p ())
2464 report_vect_op (MSG_NOTE, def_stmt,
2465 "reduction: detected reduction chain: ");
2467 return def_stmt;
2470 if (dump_enabled_p ())
2471 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2472 "reduction: unknown pattern: ");
2474 return NULL;
2477 /* Wrapper around vect_is_simple_reduction_1, that won't modify code
2478 in-place. Arguments as there. */
2480 static gimple
2481 vect_is_simple_reduction (loop_vec_info loop_info, gimple phi,
2482 bool check_reduction, bool *double_reduc)
2484 return vect_is_simple_reduction_1 (loop_info, phi, check_reduction,
2485 double_reduc, false);
2488 /* Wrapper around vect_is_simple_reduction_1, which will modify code
2489 in-place if it enables detection of more reductions. Arguments
2490 as there. */
2492 gimple
2493 vect_force_simple_reduction (loop_vec_info loop_info, gimple phi,
2494 bool check_reduction, bool *double_reduc)
2496 return vect_is_simple_reduction_1 (loop_info, phi, check_reduction,
2497 double_reduc, true);
2500 /* Calculate the cost of one scalar iteration of the loop. */
2502 vect_get_single_scalar_iteration_cost (loop_vec_info loop_vinfo)
2504 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2505 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
2506 int nbbs = loop->num_nodes, factor, scalar_single_iter_cost = 0;
2507 int innerloop_iters, i, stmt_cost;
2509 /* Count statements in scalar loop. Using this as scalar cost for a single
2510 iteration for now.
2512 TODO: Add outer loop support.
2514 TODO: Consider assigning different costs to different scalar
2515 statements. */
2517 /* FORNOW. */
2518 innerloop_iters = 1;
2519 if (loop->inner)
2520 innerloop_iters = 50; /* FIXME */
2522 for (i = 0; i < nbbs; i++)
2524 gimple_stmt_iterator si;
2525 basic_block bb = bbs[i];
2527 if (bb->loop_father == loop->inner)
2528 factor = innerloop_iters;
2529 else
2530 factor = 1;
2532 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
2534 gimple stmt = gsi_stmt (si);
2535 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2537 if (!is_gimple_assign (stmt) && !is_gimple_call (stmt))
2538 continue;
2540 /* Skip stmts that are not vectorized inside the loop. */
2541 if (stmt_info
2542 && !STMT_VINFO_RELEVANT_P (stmt_info)
2543 && (!STMT_VINFO_LIVE_P (stmt_info)
2544 || !VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
2545 && !STMT_VINFO_IN_PATTERN_P (stmt_info))
2546 continue;
2548 if (STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt)))
2550 if (DR_IS_READ (STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt))))
2551 stmt_cost = vect_get_stmt_cost (scalar_load);
2552 else
2553 stmt_cost = vect_get_stmt_cost (scalar_store);
2555 else
2556 stmt_cost = vect_get_stmt_cost (scalar_stmt);
2558 scalar_single_iter_cost += stmt_cost * factor;
2561 return scalar_single_iter_cost;
2564 /* Calculate cost of peeling the loop PEEL_ITERS_PROLOGUE times. */
2566 vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue,
2567 int *peel_iters_epilogue,
2568 int scalar_single_iter_cost,
2569 stmt_vector_for_cost *prologue_cost_vec,
2570 stmt_vector_for_cost *epilogue_cost_vec)
2572 int retval = 0;
2573 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2575 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
2577 *peel_iters_epilogue = vf/2;
2578 if (dump_enabled_p ())
2579 dump_printf_loc (MSG_NOTE, vect_location,
2580 "cost model: epilogue peel iters set to vf/2 "
2581 "because loop iterations are unknown .");
2583 /* If peeled iterations are known but number of scalar loop
2584 iterations are unknown, count a taken branch per peeled loop. */
2585 retval = record_stmt_cost (prologue_cost_vec, 2, cond_branch_taken,
2586 NULL, 0, vect_prologue);
2588 else
2590 int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
2591 peel_iters_prologue = niters < peel_iters_prologue ?
2592 niters : peel_iters_prologue;
2593 *peel_iters_epilogue = (niters - peel_iters_prologue) % vf;
2594 /* If we need to peel for gaps, but no peeling is required, we have to
2595 peel VF iterations. */
2596 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) && !*peel_iters_epilogue)
2597 *peel_iters_epilogue = vf;
2600 if (peel_iters_prologue)
2601 retval += record_stmt_cost (prologue_cost_vec,
2602 peel_iters_prologue * scalar_single_iter_cost,
2603 scalar_stmt, NULL, 0, vect_prologue);
2604 if (*peel_iters_epilogue)
2605 retval += record_stmt_cost (epilogue_cost_vec,
2606 *peel_iters_epilogue * scalar_single_iter_cost,
2607 scalar_stmt, NULL, 0, vect_epilogue);
2608 return retval;
2611 /* Function vect_estimate_min_profitable_iters
2613 Return the number of iterations required for the vector version of the
2614 loop to be profitable relative to the cost of the scalar version of the
2615 loop. */
2617 static void
2618 vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
2619 int *ret_min_profitable_niters,
2620 int *ret_min_profitable_estimate)
2622 int min_profitable_iters;
2623 int min_profitable_estimate;
2624 int peel_iters_prologue;
2625 int peel_iters_epilogue;
2626 unsigned vec_inside_cost = 0;
2627 int vec_outside_cost = 0;
2628 unsigned vec_prologue_cost = 0;
2629 unsigned vec_epilogue_cost = 0;
2630 int scalar_single_iter_cost = 0;
2631 int scalar_outside_cost = 0;
2632 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2633 int npeel = LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo);
2634 void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
2636 /* Cost model disabled. */
2637 if (!flag_vect_cost_model)
2639 dump_printf_loc (MSG_NOTE, vect_location, "cost model disabled.");
2640 *ret_min_profitable_niters = 0;
2641 *ret_min_profitable_estimate = 0;
2642 return;
2645 /* Requires loop versioning tests to handle misalignment. */
2646 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
2648 /* FIXME: Make cost depend on complexity of individual check. */
2649 unsigned len = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ();
2650 (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
2651 vect_prologue);
2652 dump_printf (MSG_NOTE,
2653 "cost model: Adding cost of checks for loop "
2654 "versioning to treat misalignment.\n");
2657 /* Requires loop versioning with alias checks. */
2658 if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
2660 /* FIXME: Make cost depend on complexity of individual check. */
2661 unsigned len = LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).length ();
2662 (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
2663 vect_prologue);
2664 dump_printf (MSG_NOTE,
2665 "cost model: Adding cost of checks for loop "
2666 "versioning aliasing.\n");
2669 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
2670 || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
2671 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken, NULL, 0,
2672 vect_prologue);
2674 /* Count statements in scalar loop. Using this as scalar cost for a single
2675 iteration for now.
2677 TODO: Add outer loop support.
2679 TODO: Consider assigning different costs to different scalar
2680 statements. */
2682 scalar_single_iter_cost = vect_get_single_scalar_iteration_cost (loop_vinfo);
2684 /* Add additional cost for the peeled instructions in prologue and epilogue
2685 loop.
2687 FORNOW: If we don't know the value of peel_iters for prologue or epilogue
2688 at compile-time - we assume it's vf/2 (the worst would be vf-1).
2690 TODO: Build an expression that represents peel_iters for prologue and
2691 epilogue to be used in a run-time test. */
2693 if (npeel < 0)
2695 peel_iters_prologue = vf/2;
2696 dump_printf (MSG_NOTE, "cost model: "
2697 "prologue peel iters set to vf/2.");
2699 /* If peeling for alignment is unknown, loop bound of main loop becomes
2700 unknown. */
2701 peel_iters_epilogue = vf/2;
2702 dump_printf (MSG_NOTE, "cost model: "
2703 "epilogue peel iters set to vf/2 because "
2704 "peeling for alignment is unknown.");
2706 /* If peeled iterations are unknown, count a taken branch and a not taken
2707 branch per peeled loop. Even if scalar loop iterations are known,
2708 vector iterations are not known since peeled prologue iterations are
2709 not known. Hence guards remain the same. */
2710 (void) add_stmt_cost (target_cost_data, 2, cond_branch_taken,
2711 NULL, 0, vect_prologue);
2712 (void) add_stmt_cost (target_cost_data, 2, cond_branch_not_taken,
2713 NULL, 0, vect_prologue);
2714 /* FORNOW: Don't attempt to pass individual scalar instructions to
2715 the model; just assume linear cost for scalar iterations. */
2716 (void) add_stmt_cost (target_cost_data,
2717 peel_iters_prologue * scalar_single_iter_cost,
2718 scalar_stmt, NULL, 0, vect_prologue);
2719 (void) add_stmt_cost (target_cost_data,
2720 peel_iters_epilogue * scalar_single_iter_cost,
2721 scalar_stmt, NULL, 0, vect_epilogue);
2723 else
2725 stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
2726 stmt_info_for_cost *si;
2727 int j;
2728 void *data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
2730 prologue_cost_vec.create (2);
2731 epilogue_cost_vec.create (2);
2732 peel_iters_prologue = npeel;
2734 (void) vect_get_known_peeling_cost (loop_vinfo, peel_iters_prologue,
2735 &peel_iters_epilogue,
2736 scalar_single_iter_cost,
2737 &prologue_cost_vec,
2738 &epilogue_cost_vec);
2740 FOR_EACH_VEC_ELT (prologue_cost_vec, j, si)
2742 struct _stmt_vec_info *stmt_info
2743 = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
2744 (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
2745 si->misalign, vect_prologue);
2748 FOR_EACH_VEC_ELT (epilogue_cost_vec, j, si)
2750 struct _stmt_vec_info *stmt_info
2751 = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
2752 (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
2753 si->misalign, vect_epilogue);
2756 prologue_cost_vec.release ();
2757 epilogue_cost_vec.release ();
2760 /* FORNOW: The scalar outside cost is incremented in one of the
2761 following ways:
2763 1. The vectorizer checks for alignment and aliasing and generates
2764 a condition that allows dynamic vectorization. A cost model
2765 check is ANDED with the versioning condition. Hence scalar code
2766 path now has the added cost of the versioning check.
2768 if (cost > th & versioning_check)
2769 jmp to vector code
2771 Hence run-time scalar is incremented by not-taken branch cost.
2773 2. The vectorizer then checks if a prologue is required. If the
2774 cost model check was not done before during versioning, it has to
2775 be done before the prologue check.
2777 if (cost <= th)
2778 prologue = scalar_iters
2779 if (prologue == 0)
2780 jmp to vector code
2781 else
2782 execute prologue
2783 if (prologue == num_iters)
2784 go to exit
2786 Hence the run-time scalar cost is incremented by a taken branch,
2787 plus a not-taken branch, plus a taken branch cost.
2789 3. The vectorizer then checks if an epilogue is required. If the
2790 cost model check was not done before during prologue check, it
2791 has to be done with the epilogue check.
2793 if (prologue == 0)
2794 jmp to vector code
2795 else
2796 execute prologue
2797 if (prologue == num_iters)
2798 go to exit
2799 vector code:
2800 if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
2801 jmp to epilogue
2803 Hence the run-time scalar cost should be incremented by 2 taken
2804 branches.
2806 TODO: The back end may reorder the BBS's differently and reverse
2807 conditions/branch directions. Change the estimates below to
2808 something more reasonable. */
2810 /* If the number of iterations is known and we do not do versioning, we can
2811 decide whether to vectorize at compile time. Hence the scalar version
2812 do not carry cost model guard costs. */
2813 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2814 || LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
2815 || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
2817 /* Cost model check occurs at versioning. */
2818 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
2819 || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
2820 scalar_outside_cost += vect_get_stmt_cost (cond_branch_not_taken);
2821 else
2823 /* Cost model check occurs at prologue generation. */
2824 if (LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
2825 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken)
2826 + vect_get_stmt_cost (cond_branch_not_taken);
2827 /* Cost model check occurs at epilogue generation. */
2828 else
2829 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken);
2833 /* Complete the target-specific cost calculations. */
2834 finish_cost (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo), &vec_prologue_cost,
2835 &vec_inside_cost, &vec_epilogue_cost);
2837 vec_outside_cost = (int)(vec_prologue_cost + vec_epilogue_cost);
2839 /* Calculate number of iterations required to make the vector version
2840 profitable, relative to the loop bodies only. The following condition
2841 must hold true:
2842 SIC * niters + SOC > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC
2843 where
2844 SIC = scalar iteration cost, VIC = vector iteration cost,
2845 VOC = vector outside cost, VF = vectorization factor,
2846 PL_ITERS = prologue iterations, EP_ITERS= epilogue iterations
2847 SOC = scalar outside cost for run time cost model check. */
2849 if ((scalar_single_iter_cost * vf) > (int) vec_inside_cost)
2851 if (vec_outside_cost <= 0)
2852 min_profitable_iters = 1;
2853 else
2855 min_profitable_iters = ((vec_outside_cost - scalar_outside_cost) * vf
2856 - vec_inside_cost * peel_iters_prologue
2857 - vec_inside_cost * peel_iters_epilogue)
2858 / ((scalar_single_iter_cost * vf)
2859 - vec_inside_cost);
2861 if ((scalar_single_iter_cost * vf * min_profitable_iters)
2862 <= (((int) vec_inside_cost * min_profitable_iters)
2863 + (((int) vec_outside_cost - scalar_outside_cost) * vf)))
2864 min_profitable_iters++;
2867 /* vector version will never be profitable. */
2868 else
2870 if (dump_enabled_p ())
2871 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2872 "cost model: the vector iteration cost = %d "
2873 "divided by the scalar iteration cost = %d "
2874 "is greater or equal to the vectorization factor = %d.",
2875 vec_inside_cost, scalar_single_iter_cost, vf);
2876 *ret_min_profitable_niters = -1;
2877 *ret_min_profitable_estimate = -1;
2878 return;
2881 if (dump_enabled_p ())
2883 dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
2884 dump_printf (MSG_NOTE, " Vector inside of loop cost: %d\n",
2885 vec_inside_cost);
2886 dump_printf (MSG_NOTE, " Vector prologue cost: %d\n",
2887 vec_prologue_cost);
2888 dump_printf (MSG_NOTE, " Vector epilogue cost: %d\n",
2889 vec_epilogue_cost);
2890 dump_printf (MSG_NOTE, " Scalar iteration cost: %d\n",
2891 scalar_single_iter_cost);
2892 dump_printf (MSG_NOTE, " Scalar outside cost: %d\n",
2893 scalar_outside_cost);
2894 dump_printf (MSG_NOTE, " Vector outside cost: %d\n",
2895 vec_outside_cost);
2896 dump_printf (MSG_NOTE, " prologue iterations: %d\n",
2897 peel_iters_prologue);
2898 dump_printf (MSG_NOTE, " epilogue iterations: %d\n",
2899 peel_iters_epilogue);
2900 dump_printf (MSG_NOTE,
2901 " Calculated minimum iters for profitability: %d\n",
2902 min_profitable_iters);
2905 min_profitable_iters =
2906 min_profitable_iters < vf ? vf : min_profitable_iters;
2908 /* Because the condition we create is:
2909 if (niters <= min_profitable_iters)
2910 then skip the vectorized loop. */
2911 min_profitable_iters--;
2913 if (dump_enabled_p ())
2914 dump_printf_loc (MSG_NOTE, vect_location,
2915 " Runtime profitability threshold = %d\n", min_profitable_iters);
2917 *ret_min_profitable_niters = min_profitable_iters;
2919 /* Calculate number of iterations required to make the vector version
2920 profitable, relative to the loop bodies only.
2922 Non-vectorized variant is SIC * niters and it must win over vector
2923 variant on the expected loop trip count. The following condition must hold true:
2924 SIC * niters > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC + SOC */
2926 if (vec_outside_cost <= 0)
2927 min_profitable_estimate = 1;
2928 else
2930 min_profitable_estimate = ((vec_outside_cost + scalar_outside_cost) * vf
2931 - vec_inside_cost * peel_iters_prologue
2932 - vec_inside_cost * peel_iters_epilogue)
2933 / ((scalar_single_iter_cost * vf)
2934 - vec_inside_cost);
2936 min_profitable_estimate --;
2937 min_profitable_estimate = MAX (min_profitable_estimate, min_profitable_iters);
2938 if (dump_enabled_p ())
2939 dump_printf_loc (MSG_NOTE, vect_location,
2940 " Static estimate profitability threshold = %d\n",
2941 min_profitable_iters);
2943 *ret_min_profitable_estimate = min_profitable_estimate;
2947 /* TODO: Close dependency between vect_model_*_cost and vectorizable_*
2948 functions. Design better to avoid maintenance issues. */
2950 /* Function vect_model_reduction_cost.
2952 Models cost for a reduction operation, including the vector ops
2953 generated within the strip-mine loop, the initial definition before
2954 the loop, and the epilogue code that must be generated. */
2956 static bool
2957 vect_model_reduction_cost (stmt_vec_info stmt_info, enum tree_code reduc_code,
2958 int ncopies)
2960 int prologue_cost = 0, epilogue_cost = 0;
2961 enum tree_code code;
2962 optab optab;
2963 tree vectype;
2964 gimple stmt, orig_stmt;
2965 tree reduction_op;
2966 enum machine_mode mode;
2967 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2968 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2969 void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
2971 /* Cost of reduction op inside loop. */
2972 unsigned inside_cost = add_stmt_cost (target_cost_data, ncopies, vector_stmt,
2973 stmt_info, 0, vect_body);
2974 stmt = STMT_VINFO_STMT (stmt_info);
2976 switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
2978 case GIMPLE_SINGLE_RHS:
2979 gcc_assert (TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt)) == ternary_op);
2980 reduction_op = TREE_OPERAND (gimple_assign_rhs1 (stmt), 2);
2981 break;
2982 case GIMPLE_UNARY_RHS:
2983 reduction_op = gimple_assign_rhs1 (stmt);
2984 break;
2985 case GIMPLE_BINARY_RHS:
2986 reduction_op = gimple_assign_rhs2 (stmt);
2987 break;
2988 case GIMPLE_TERNARY_RHS:
2989 reduction_op = gimple_assign_rhs3 (stmt);
2990 break;
2991 default:
2992 gcc_unreachable ();
2995 vectype = get_vectype_for_scalar_type (TREE_TYPE (reduction_op));
2996 if (!vectype)
2998 if (dump_enabled_p ())
3000 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3001 "unsupported data-type ");
3002 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
3003 TREE_TYPE (reduction_op));
3005 return false;
3008 mode = TYPE_MODE (vectype);
3009 orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
3011 if (!orig_stmt)
3012 orig_stmt = STMT_VINFO_STMT (stmt_info);
3014 code = gimple_assign_rhs_code (orig_stmt);
3016 /* Add in cost for initial definition. */
3017 prologue_cost += add_stmt_cost (target_cost_data, 1, scalar_to_vec,
3018 stmt_info, 0, vect_prologue);
3020 /* Determine cost of epilogue code.
3022 We have a reduction operator that will reduce the vector in one statement.
3023 Also requires scalar extract. */
3025 if (!nested_in_vect_loop_p (loop, orig_stmt))
3027 if (reduc_code != ERROR_MARK)
3029 epilogue_cost += add_stmt_cost (target_cost_data, 1, vector_stmt,
3030 stmt_info, 0, vect_epilogue);
3031 epilogue_cost += add_stmt_cost (target_cost_data, 1, vec_to_scalar,
3032 stmt_info, 0, vect_epilogue);
3034 else
3036 int vec_size_in_bits = tree_low_cst (TYPE_SIZE (vectype), 1);
3037 tree bitsize =
3038 TYPE_SIZE (TREE_TYPE (gimple_assign_lhs (orig_stmt)));
3039 int element_bitsize = tree_low_cst (bitsize, 1);
3040 int nelements = vec_size_in_bits / element_bitsize;
3042 optab = optab_for_tree_code (code, vectype, optab_default);
3044 /* We have a whole vector shift available. */
3045 if (VECTOR_MODE_P (mode)
3046 && optab_handler (optab, mode) != CODE_FOR_nothing
3047 && optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
3049 /* Final reduction via vector shifts and the reduction operator.
3050 Also requires scalar extract. */
3051 epilogue_cost += add_stmt_cost (target_cost_data,
3052 exact_log2 (nelements) * 2,
3053 vector_stmt, stmt_info, 0,
3054 vect_epilogue);
3055 epilogue_cost += add_stmt_cost (target_cost_data, 1,
3056 vec_to_scalar, stmt_info, 0,
3057 vect_epilogue);
3059 else
3060 /* Use extracts and reduction op for final reduction. For N
3061 elements, we have N extracts and N-1 reduction ops. */
3062 epilogue_cost += add_stmt_cost (target_cost_data,
3063 nelements + nelements - 1,
3064 vector_stmt, stmt_info, 0,
3065 vect_epilogue);
3069 if (dump_enabled_p ())
3070 dump_printf (MSG_NOTE,
3071 "vect_model_reduction_cost: inside_cost = %d, "
3072 "prologue_cost = %d, epilogue_cost = %d .", inside_cost,
3073 prologue_cost, epilogue_cost);
3075 return true;
3079 /* Function vect_model_induction_cost.
3081 Models cost for induction operations. */
3083 static void
3084 vect_model_induction_cost (stmt_vec_info stmt_info, int ncopies)
3086 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3087 void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3088 unsigned inside_cost, prologue_cost;
3090 /* loop cost for vec_loop. */
3091 inside_cost = add_stmt_cost (target_cost_data, ncopies, vector_stmt,
3092 stmt_info, 0, vect_body);
3094 /* prologue cost for vec_init and vec_step. */
3095 prologue_cost = add_stmt_cost (target_cost_data, 2, scalar_to_vec,
3096 stmt_info, 0, vect_prologue);
3098 if (dump_enabled_p ())
3099 dump_printf_loc (MSG_NOTE, vect_location,
3100 "vect_model_induction_cost: inside_cost = %d, "
3101 "prologue_cost = %d .", inside_cost, prologue_cost);
3105 /* Function get_initial_def_for_induction
3107 Input:
3108 STMT - a stmt that performs an induction operation in the loop.
3109 IV_PHI - the initial value of the induction variable
3111 Output:
3112 Return a vector variable, initialized with the first VF values of
3113 the induction variable. E.g., for an iv with IV_PHI='X' and
3114 evolution S, for a vector of 4 units, we want to return:
3115 [X, X + S, X + 2*S, X + 3*S]. */
3117 static tree
3118 get_initial_def_for_induction (gimple iv_phi)
3120 stmt_vec_info stmt_vinfo = vinfo_for_stmt (iv_phi);
3121 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
3122 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3123 tree scalar_type;
3124 tree vectype;
3125 int nunits;
3126 edge pe = loop_preheader_edge (loop);
3127 struct loop *iv_loop;
3128 basic_block new_bb;
3129 tree new_vec, vec_init, vec_step, t;
3130 tree access_fn;
3131 tree new_var;
3132 tree new_name;
3133 gimple init_stmt, induction_phi, new_stmt;
3134 tree induc_def, vec_def, vec_dest;
3135 tree init_expr, step_expr;
3136 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3137 int i;
3138 bool ok;
3139 int ncopies;
3140 tree expr;
3141 stmt_vec_info phi_info = vinfo_for_stmt (iv_phi);
3142 bool nested_in_vect_loop = false;
3143 gimple_seq stmts = NULL;
3144 imm_use_iterator imm_iter;
3145 use_operand_p use_p;
3146 gimple exit_phi;
3147 edge latch_e;
3148 tree loop_arg;
3149 gimple_stmt_iterator si;
3150 basic_block bb = gimple_bb (iv_phi);
3151 tree stepvectype;
3152 tree resvectype;
3154 /* Is phi in an inner-loop, while vectorizing an enclosing outer-loop? */
3155 if (nested_in_vect_loop_p (loop, iv_phi))
3157 nested_in_vect_loop = true;
3158 iv_loop = loop->inner;
3160 else
3161 iv_loop = loop;
3162 gcc_assert (iv_loop == (gimple_bb (iv_phi))->loop_father);
3164 latch_e = loop_latch_edge (iv_loop);
3165 loop_arg = PHI_ARG_DEF_FROM_EDGE (iv_phi, latch_e);
3167 access_fn = analyze_scalar_evolution (iv_loop, PHI_RESULT (iv_phi));
3168 gcc_assert (access_fn);
3169 STRIP_NOPS (access_fn);
3170 ok = vect_is_simple_iv_evolution (iv_loop->num, access_fn,
3171 &init_expr, &step_expr);
3172 gcc_assert (ok);
3173 pe = loop_preheader_edge (iv_loop);
3175 scalar_type = TREE_TYPE (init_expr);
3176 vectype = get_vectype_for_scalar_type (scalar_type);
3177 resvectype = get_vectype_for_scalar_type (TREE_TYPE (PHI_RESULT (iv_phi)));
3178 gcc_assert (vectype);
3179 nunits = TYPE_VECTOR_SUBPARTS (vectype);
3180 ncopies = vf / nunits;
3182 gcc_assert (phi_info);
3183 gcc_assert (ncopies >= 1);
3185 /* Find the first insertion point in the BB. */
3186 si = gsi_after_labels (bb);
3188 /* Create the vector that holds the initial_value of the induction. */
3189 if (nested_in_vect_loop)
3191 /* iv_loop is nested in the loop to be vectorized. init_expr had already
3192 been created during vectorization of previous stmts. We obtain it
3193 from the STMT_VINFO_VEC_STMT of the defining stmt. */
3194 tree iv_def = PHI_ARG_DEF_FROM_EDGE (iv_phi,
3195 loop_preheader_edge (iv_loop));
3196 vec_init = vect_get_vec_def_for_operand (iv_def, iv_phi, NULL);
3197 /* If the initial value is not of proper type, convert it. */
3198 if (!useless_type_conversion_p (vectype, TREE_TYPE (vec_init)))
3200 new_stmt = gimple_build_assign_with_ops
3201 (VIEW_CONVERT_EXPR,
3202 vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_"),
3203 build1 (VIEW_CONVERT_EXPR, vectype, vec_init), NULL_TREE);
3204 vec_init = make_ssa_name (gimple_assign_lhs (new_stmt), new_stmt);
3205 gimple_assign_set_lhs (new_stmt, vec_init);
3206 new_bb = gsi_insert_on_edge_immediate (loop_preheader_edge (iv_loop),
3207 new_stmt);
3208 gcc_assert (!new_bb);
3209 set_vinfo_for_stmt (new_stmt,
3210 new_stmt_vec_info (new_stmt, loop_vinfo, NULL));
3213 else
3215 vec<constructor_elt, va_gc> *v;
3217 /* iv_loop is the loop to be vectorized. Create:
3218 vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr) */
3219 new_var = vect_get_new_vect_var (scalar_type, vect_scalar_var, "var_");
3220 new_name = force_gimple_operand (init_expr, &stmts, false, new_var);
3221 if (stmts)
3223 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
3224 gcc_assert (!new_bb);
3227 vec_alloc (v, nunits);
3228 bool constant_p = is_gimple_min_invariant (new_name);
3229 CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, new_name);
3230 for (i = 1; i < nunits; i++)
3232 /* Create: new_name_i = new_name + step_expr */
3233 enum tree_code code = POINTER_TYPE_P (scalar_type)
3234 ? POINTER_PLUS_EXPR : PLUS_EXPR;
3235 new_name = fold_build2 (code, scalar_type, new_name, step_expr);
3236 if (!is_gimple_min_invariant (new_name))
3238 init_stmt = gimple_build_assign (new_var, new_name);
3239 new_name = make_ssa_name (new_var, init_stmt);
3240 gimple_assign_set_lhs (init_stmt, new_name);
3241 new_bb = gsi_insert_on_edge_immediate (pe, init_stmt);
3242 gcc_assert (!new_bb);
3243 if (dump_enabled_p ())
3245 dump_printf_loc (MSG_NOTE, vect_location,
3246 "created new init_stmt: ");
3247 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, init_stmt, 0);
3249 constant_p = false;
3251 CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, new_name);
3253 /* Create a vector from [new_name_0, new_name_1, ..., new_name_nunits-1] */
3254 if (constant_p)
3255 new_vec = build_vector_from_ctor (vectype, v);
3256 else
3257 new_vec = build_constructor (vectype, v);
3258 vec_init = vect_init_vector (iv_phi, new_vec, vectype, NULL);
3262 /* Create the vector that holds the step of the induction. */
3263 if (nested_in_vect_loop)
3264 /* iv_loop is nested in the loop to be vectorized. Generate:
3265 vec_step = [S, S, S, S] */
3266 new_name = step_expr;
3267 else
3269 /* iv_loop is the loop to be vectorized. Generate:
3270 vec_step = [VF*S, VF*S, VF*S, VF*S] */
3271 expr = build_int_cst (TREE_TYPE (step_expr), vf);
3272 new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
3273 expr, step_expr);
3276 t = unshare_expr (new_name);
3277 gcc_assert (CONSTANT_CLASS_P (new_name));
3278 stepvectype = get_vectype_for_scalar_type (TREE_TYPE (new_name));
3279 gcc_assert (stepvectype);
3280 new_vec = build_vector_from_val (stepvectype, t);
3281 vec_step = vect_init_vector (iv_phi, new_vec, stepvectype, NULL);
3284 /* Create the following def-use cycle:
3285 loop prolog:
3286 vec_init = ...
3287 vec_step = ...
3288 loop:
3289 vec_iv = PHI <vec_init, vec_loop>
3291 STMT
3293 vec_loop = vec_iv + vec_step; */
3295 /* Create the induction-phi that defines the induction-operand. */
3296 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
3297 induction_phi = create_phi_node (vec_dest, iv_loop->header);
3298 set_vinfo_for_stmt (induction_phi,
3299 new_stmt_vec_info (induction_phi, loop_vinfo, NULL));
3300 induc_def = PHI_RESULT (induction_phi);
3302 /* Create the iv update inside the loop */
3303 new_stmt = gimple_build_assign_with_ops (PLUS_EXPR, vec_dest,
3304 induc_def, vec_step);
3305 vec_def = make_ssa_name (vec_dest, new_stmt);
3306 gimple_assign_set_lhs (new_stmt, vec_def);
3307 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
3308 set_vinfo_for_stmt (new_stmt, new_stmt_vec_info (new_stmt, loop_vinfo,
3309 NULL));
3311 /* Set the arguments of the phi node: */
3312 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
3313 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
3314 UNKNOWN_LOCATION);
3317 /* In case that vectorization factor (VF) is bigger than the number
3318 of elements that we can fit in a vectype (nunits), we have to generate
3319 more than one vector stmt - i.e - we need to "unroll" the
3320 vector stmt by a factor VF/nunits. For more details see documentation
3321 in vectorizable_operation. */
3323 if (ncopies > 1)
3325 stmt_vec_info prev_stmt_vinfo;
3326 /* FORNOW. This restriction should be relaxed. */
3327 gcc_assert (!nested_in_vect_loop);
3329 /* Create the vector that holds the step of the induction. */
3330 expr = build_int_cst (TREE_TYPE (step_expr), nunits);
3331 new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
3332 expr, step_expr);
3333 t = unshare_expr (new_name);
3334 gcc_assert (CONSTANT_CLASS_P (new_name));
3335 new_vec = build_vector_from_val (stepvectype, t);
3336 vec_step = vect_init_vector (iv_phi, new_vec, stepvectype, NULL);
3338 vec_def = induc_def;
3339 prev_stmt_vinfo = vinfo_for_stmt (induction_phi);
3340 for (i = 1; i < ncopies; i++)
3342 /* vec_i = vec_prev + vec_step */
3343 new_stmt = gimple_build_assign_with_ops (PLUS_EXPR, vec_dest,
3344 vec_def, vec_step);
3345 vec_def = make_ssa_name (vec_dest, new_stmt);
3346 gimple_assign_set_lhs (new_stmt, vec_def);
3348 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
3349 if (!useless_type_conversion_p (resvectype, vectype))
3351 new_stmt = gimple_build_assign_with_ops
3352 (VIEW_CONVERT_EXPR,
3353 vect_get_new_vect_var (resvectype, vect_simple_var,
3354 "vec_iv_"),
3355 build1 (VIEW_CONVERT_EXPR, resvectype,
3356 gimple_assign_lhs (new_stmt)), NULL_TREE);
3357 gimple_assign_set_lhs (new_stmt,
3358 make_ssa_name
3359 (gimple_assign_lhs (new_stmt), new_stmt));
3360 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
3362 set_vinfo_for_stmt (new_stmt,
3363 new_stmt_vec_info (new_stmt, loop_vinfo, NULL));
3364 STMT_VINFO_RELATED_STMT (prev_stmt_vinfo) = new_stmt;
3365 prev_stmt_vinfo = vinfo_for_stmt (new_stmt);
3369 if (nested_in_vect_loop)
3371 /* Find the loop-closed exit-phi of the induction, and record
3372 the final vector of induction results: */
3373 exit_phi = NULL;
3374 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
3376 if (!flow_bb_inside_loop_p (iv_loop, gimple_bb (USE_STMT (use_p))))
3378 exit_phi = USE_STMT (use_p);
3379 break;
3382 if (exit_phi)
3384 stmt_vec_info stmt_vinfo = vinfo_for_stmt (exit_phi);
3385 /* FORNOW. Currently not supporting the case that an inner-loop induction
3386 is not used in the outer-loop (i.e. only outside the outer-loop). */
3387 gcc_assert (STMT_VINFO_RELEVANT_P (stmt_vinfo)
3388 && !STMT_VINFO_LIVE_P (stmt_vinfo));
3390 STMT_VINFO_VEC_STMT (stmt_vinfo) = new_stmt;
3391 if (dump_enabled_p ())
3393 dump_printf_loc (MSG_NOTE, vect_location,
3394 "vector of inductions after inner-loop:");
3395 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, new_stmt, 0);
3401 if (dump_enabled_p ())
3403 dump_printf_loc (MSG_NOTE, vect_location,
3404 "transform induction: created def-use cycle: ");
3405 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, induction_phi, 0);
3406 dump_printf (MSG_NOTE, "\n");
3407 dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
3408 SSA_NAME_DEF_STMT (vec_def), 0);
3411 STMT_VINFO_VEC_STMT (phi_info) = induction_phi;
3412 if (!useless_type_conversion_p (resvectype, vectype))
3414 new_stmt = gimple_build_assign_with_ops
3415 (VIEW_CONVERT_EXPR,
3416 vect_get_new_vect_var (resvectype, vect_simple_var, "vec_iv_"),
3417 build1 (VIEW_CONVERT_EXPR, resvectype, induc_def), NULL_TREE);
3418 induc_def = make_ssa_name (gimple_assign_lhs (new_stmt), new_stmt);
3419 gimple_assign_set_lhs (new_stmt, induc_def);
3420 si = gsi_after_labels (bb);
3421 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
3422 set_vinfo_for_stmt (new_stmt,
3423 new_stmt_vec_info (new_stmt, loop_vinfo, NULL));
3424 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (new_stmt))
3425 = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (induction_phi));
3428 return induc_def;
3432 /* Function get_initial_def_for_reduction
3434 Input:
3435 STMT - a stmt that performs a reduction operation in the loop.
3436 INIT_VAL - the initial value of the reduction variable
3438 Output:
3439 ADJUSTMENT_DEF - a tree that holds a value to be added to the final result
3440 of the reduction (used for adjusting the epilog - see below).
3441 Return a vector variable, initialized according to the operation that STMT
3442 performs. This vector will be used as the initial value of the
3443 vector of partial results.
3445 Option1 (adjust in epilog): Initialize the vector as follows:
3446 add/bit or/xor: [0,0,...,0,0]
3447 mult/bit and: [1,1,...,1,1]
3448 min/max/cond_expr: [init_val,init_val,..,init_val,init_val]
3449 and when necessary (e.g. add/mult case) let the caller know
3450 that it needs to adjust the result by init_val.
3452 Option2: Initialize the vector as follows:
3453 add/bit or/xor: [init_val,0,0,...,0]
3454 mult/bit and: [init_val,1,1,...,1]
3455 min/max/cond_expr: [init_val,init_val,...,init_val]
3456 and no adjustments are needed.
3458 For example, for the following code:
3460 s = init_val;
3461 for (i=0;i<n;i++)
3462 s = s + a[i];
3464 STMT is 's = s + a[i]', and the reduction variable is 's'.
3465 For a vector of 4 units, we want to return either [0,0,0,init_val],
3466 or [0,0,0,0] and let the caller know that it needs to adjust
3467 the result at the end by 'init_val'.
3469 FORNOW, we are using the 'adjust in epilog' scheme, because this way the
3470 initialization vector is simpler (same element in all entries), if
3471 ADJUSTMENT_DEF is not NULL, and Option2 otherwise.
3473 A cost model should help decide between these two schemes. */
3475 tree
3476 get_initial_def_for_reduction (gimple stmt, tree init_val,
3477 tree *adjustment_def)
3479 stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
3480 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
3481 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3482 tree scalar_type = TREE_TYPE (init_val);
3483 tree vectype = get_vectype_for_scalar_type (scalar_type);
3484 int nunits;
3485 enum tree_code code = gimple_assign_rhs_code (stmt);
3486 tree def_for_init;
3487 tree init_def;
3488 tree *elts;
3489 int i;
3490 bool nested_in_vect_loop = false;
3491 tree init_value;
3492 REAL_VALUE_TYPE real_init_val = dconst0;
3493 int int_init_val = 0;
3494 gimple def_stmt = NULL;
3496 gcc_assert (vectype);
3497 nunits = TYPE_VECTOR_SUBPARTS (vectype);
3499 gcc_assert (POINTER_TYPE_P (scalar_type) || INTEGRAL_TYPE_P (scalar_type)
3500 || SCALAR_FLOAT_TYPE_P (scalar_type));
3502 if (nested_in_vect_loop_p (loop, stmt))
3503 nested_in_vect_loop = true;
3504 else
3505 gcc_assert (loop == (gimple_bb (stmt))->loop_father);
3507 /* In case of double reduction we only create a vector variable to be put
3508 in the reduction phi node. The actual statement creation is done in
3509 vect_create_epilog_for_reduction. */
3510 if (adjustment_def && nested_in_vect_loop
3511 && TREE_CODE (init_val) == SSA_NAME
3512 && (def_stmt = SSA_NAME_DEF_STMT (init_val))
3513 && gimple_code (def_stmt) == GIMPLE_PHI
3514 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
3515 && vinfo_for_stmt (def_stmt)
3516 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
3517 == vect_double_reduction_def)
3519 *adjustment_def = NULL;
3520 return vect_create_destination_var (init_val, vectype);
3523 if (TREE_CONSTANT (init_val))
3525 if (SCALAR_FLOAT_TYPE_P (scalar_type))
3526 init_value = build_real (scalar_type, TREE_REAL_CST (init_val));
3527 else
3528 init_value = build_int_cst (scalar_type, TREE_INT_CST_LOW (init_val));
3530 else
3531 init_value = init_val;
3533 switch (code)
3535 case WIDEN_SUM_EXPR:
3536 case DOT_PROD_EXPR:
3537 case PLUS_EXPR:
3538 case MINUS_EXPR:
3539 case BIT_IOR_EXPR:
3540 case BIT_XOR_EXPR:
3541 case MULT_EXPR:
3542 case BIT_AND_EXPR:
3543 /* ADJUSMENT_DEF is NULL when called from
3544 vect_create_epilog_for_reduction to vectorize double reduction. */
3545 if (adjustment_def)
3547 if (nested_in_vect_loop)
3548 *adjustment_def = vect_get_vec_def_for_operand (init_val, stmt,
3549 NULL);
3550 else
3551 *adjustment_def = init_val;
3554 if (code == MULT_EXPR)
3556 real_init_val = dconst1;
3557 int_init_val = 1;
3560 if (code == BIT_AND_EXPR)
3561 int_init_val = -1;
3563 if (SCALAR_FLOAT_TYPE_P (scalar_type))
3564 def_for_init = build_real (scalar_type, real_init_val);
3565 else
3566 def_for_init = build_int_cst (scalar_type, int_init_val);
3568 /* Create a vector of '0' or '1' except the first element. */
3569 elts = XALLOCAVEC (tree, nunits);
3570 for (i = nunits - 2; i >= 0; --i)
3571 elts[i + 1] = def_for_init;
3573 /* Option1: the first element is '0' or '1' as well. */
3574 if (adjustment_def)
3576 elts[0] = def_for_init;
3577 init_def = build_vector (vectype, elts);
3578 break;
3581 /* Option2: the first element is INIT_VAL. */
3582 elts[0] = init_val;
3583 if (TREE_CONSTANT (init_val))
3584 init_def = build_vector (vectype, elts);
3585 else
3587 vec<constructor_elt, va_gc> *v;
3588 vec_alloc (v, nunits);
3589 CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, init_val);
3590 for (i = 1; i < nunits; ++i)
3591 CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, elts[i]);
3592 init_def = build_constructor (vectype, v);
3595 break;
3597 case MIN_EXPR:
3598 case MAX_EXPR:
3599 case COND_EXPR:
3600 if (adjustment_def)
3602 *adjustment_def = NULL_TREE;
3603 init_def = vect_get_vec_def_for_operand (init_val, stmt, NULL);
3604 break;
3607 init_def = build_vector_from_val (vectype, init_value);
3608 break;
3610 default:
3611 gcc_unreachable ();
3614 return init_def;
3618 /* Function vect_create_epilog_for_reduction
3620 Create code at the loop-epilog to finalize the result of a reduction
3621 computation.
3623 VECT_DEFS is list of vector of partial results, i.e., the lhs's of vector
3624 reduction statements.
3625 STMT is the scalar reduction stmt that is being vectorized.
3626 NCOPIES is > 1 in case the vectorization factor (VF) is bigger than the
3627 number of elements that we can fit in a vectype (nunits). In this case
3628 we have to generate more than one vector stmt - i.e - we need to "unroll"
3629 the vector stmt by a factor VF/nunits. For more details see documentation
3630 in vectorizable_operation.
3631 REDUC_CODE is the tree-code for the epilog reduction.
3632 REDUCTION_PHIS is a list of the phi-nodes that carry the reduction
3633 computation.
3634 REDUC_INDEX is the index of the operand in the right hand side of the
3635 statement that is defined by REDUCTION_PHI.
3636 DOUBLE_REDUC is TRUE if double reduction phi nodes should be handled.
3637 SLP_NODE is an SLP node containing a group of reduction statements. The
3638 first one in this group is STMT.
3640 This function:
3641 1. Creates the reduction def-use cycles: sets the arguments for
3642 REDUCTION_PHIS:
3643 The loop-entry argument is the vectorized initial-value of the reduction.
3644 The loop-latch argument is taken from VECT_DEFS - the vector of partial
3645 sums.
3646 2. "Reduces" each vector of partial results VECT_DEFS into a single result,
3647 by applying the operation specified by REDUC_CODE if available, or by
3648 other means (whole-vector shifts or a scalar loop).
3649 The function also creates a new phi node at the loop exit to preserve
3650 loop-closed form, as illustrated below.
3652 The flow at the entry to this function:
3654 loop:
3655 vec_def = phi <null, null> # REDUCTION_PHI
3656 VECT_DEF = vector_stmt # vectorized form of STMT
3657 s_loop = scalar_stmt # (scalar) STMT
3658 loop_exit:
3659 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
3660 use <s_out0>
3661 use <s_out0>
3663 The above is transformed by this function into:
3665 loop:
3666 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
3667 VECT_DEF = vector_stmt # vectorized form of STMT
3668 s_loop = scalar_stmt # (scalar) STMT
3669 loop_exit:
3670 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
3671 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
3672 v_out2 = reduce <v_out1>
3673 s_out3 = extract_field <v_out2, 0>
3674 s_out4 = adjust_result <s_out3>
3675 use <s_out4>
3676 use <s_out4>
3679 static void
3680 vect_create_epilog_for_reduction (vec<tree> vect_defs, gimple stmt,
3681 int ncopies, enum tree_code reduc_code,
3682 vec<gimple> reduction_phis,
3683 int reduc_index, bool double_reduc,
3684 slp_tree slp_node)
3686 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3687 stmt_vec_info prev_phi_info;
3688 tree vectype;
3689 enum machine_mode mode;
3690 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3691 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *outer_loop = NULL;
3692 basic_block exit_bb;
3693 tree scalar_dest;
3694 tree scalar_type;
3695 gimple new_phi = NULL, phi;
3696 gimple_stmt_iterator exit_gsi;
3697 tree vec_dest;
3698 tree new_temp = NULL_TREE, new_dest, new_name, new_scalar_dest;
3699 gimple epilog_stmt = NULL;
3700 enum tree_code code = gimple_assign_rhs_code (stmt);
3701 gimple exit_phi;
3702 tree bitsize, bitpos;
3703 tree adjustment_def = NULL;
3704 tree vec_initial_def = NULL;
3705 tree reduction_op, expr, def;
3706 tree orig_name, scalar_result;
3707 imm_use_iterator imm_iter, phi_imm_iter;
3708 use_operand_p use_p, phi_use_p;
3709 bool extract_scalar_result = false;
3710 gimple use_stmt, orig_stmt, reduction_phi = NULL;
3711 bool nested_in_vect_loop = false;
3712 vec<gimple> new_phis = vNULL;
3713 vec<gimple> inner_phis = vNULL;
3714 enum vect_def_type dt = vect_unknown_def_type;
3715 int j, i;
3716 vec<tree> scalar_results = vNULL;
3717 unsigned int group_size = 1, k, ratio;
3718 vec<tree> vec_initial_defs = vNULL;
3719 vec<gimple> phis;
3720 bool slp_reduc = false;
3721 tree new_phi_result;
3722 gimple inner_phi = NULL;
3724 if (slp_node)
3725 group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
3727 if (nested_in_vect_loop_p (loop, stmt))
3729 outer_loop = loop;
3730 loop = loop->inner;
3731 nested_in_vect_loop = true;
3732 gcc_assert (!slp_node);
3735 switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
3737 case GIMPLE_SINGLE_RHS:
3738 gcc_assert (TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt))
3739 == ternary_op);
3740 reduction_op = TREE_OPERAND (gimple_assign_rhs1 (stmt), reduc_index);
3741 break;
3742 case GIMPLE_UNARY_RHS:
3743 reduction_op = gimple_assign_rhs1 (stmt);
3744 break;
3745 case GIMPLE_BINARY_RHS:
3746 reduction_op = reduc_index ?
3747 gimple_assign_rhs2 (stmt) : gimple_assign_rhs1 (stmt);
3748 break;
3749 case GIMPLE_TERNARY_RHS:
3750 reduction_op = gimple_op (stmt, reduc_index + 1);
3751 break;
3752 default:
3753 gcc_unreachable ();
3756 vectype = get_vectype_for_scalar_type (TREE_TYPE (reduction_op));
3757 gcc_assert (vectype);
3758 mode = TYPE_MODE (vectype);
3760 /* 1. Create the reduction def-use cycle:
3761 Set the arguments of REDUCTION_PHIS, i.e., transform
3763 loop:
3764 vec_def = phi <null, null> # REDUCTION_PHI
3765 VECT_DEF = vector_stmt # vectorized form of STMT
3768 into:
3770 loop:
3771 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
3772 VECT_DEF = vector_stmt # vectorized form of STMT
3775 (in case of SLP, do it for all the phis). */
3777 /* Get the loop-entry arguments. */
3778 if (slp_node)
3779 vect_get_vec_defs (reduction_op, NULL_TREE, stmt, &vec_initial_defs,
3780 NULL, slp_node, reduc_index);
3781 else
3783 vec_initial_defs.create (1);
3784 /* For the case of reduction, vect_get_vec_def_for_operand returns
3785 the scalar def before the loop, that defines the initial value
3786 of the reduction variable. */
3787 vec_initial_def = vect_get_vec_def_for_operand (reduction_op, stmt,
3788 &adjustment_def);
3789 vec_initial_defs.quick_push (vec_initial_def);
3792 /* Set phi nodes arguments. */
3793 FOR_EACH_VEC_ELT (reduction_phis, i, phi)
3795 tree vec_init_def = vec_initial_defs[i];
3796 tree def = vect_defs[i];
3797 for (j = 0; j < ncopies; j++)
3799 /* Set the loop-entry arg of the reduction-phi. */
3800 add_phi_arg (phi, vec_init_def, loop_preheader_edge (loop),
3801 UNKNOWN_LOCATION);
3803 /* Set the loop-latch arg for the reduction-phi. */
3804 if (j > 0)
3805 def = vect_get_vec_def_for_stmt_copy (vect_unknown_def_type, def);
3807 add_phi_arg (phi, def, loop_latch_edge (loop), UNKNOWN_LOCATION);
3809 if (dump_enabled_p ())
3811 dump_printf_loc (MSG_NOTE, vect_location,
3812 "transform reduction: created def-use cycle: ");
3813 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
3814 dump_printf (MSG_NOTE, "\n");
3815 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, SSA_NAME_DEF_STMT (def), 0);
3818 phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi));
3822 vec_initial_defs.release ();
3824 /* 2. Create epilog code.
3825 The reduction epilog code operates across the elements of the vector
3826 of partial results computed by the vectorized loop.
3827 The reduction epilog code consists of:
3829 step 1: compute the scalar result in a vector (v_out2)
3830 step 2: extract the scalar result (s_out3) from the vector (v_out2)
3831 step 3: adjust the scalar result (s_out3) if needed.
3833 Step 1 can be accomplished using one the following three schemes:
3834 (scheme 1) using reduc_code, if available.
3835 (scheme 2) using whole-vector shifts, if available.
3836 (scheme 3) using a scalar loop. In this case steps 1+2 above are
3837 combined.
3839 The overall epilog code looks like this:
3841 s_out0 = phi <s_loop> # original EXIT_PHI
3842 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
3843 v_out2 = reduce <v_out1> # step 1
3844 s_out3 = extract_field <v_out2, 0> # step 2
3845 s_out4 = adjust_result <s_out3> # step 3
3847 (step 3 is optional, and steps 1 and 2 may be combined).
3848 Lastly, the uses of s_out0 are replaced by s_out4. */
3851 /* 2.1 Create new loop-exit-phis to preserve loop-closed form:
3852 v_out1 = phi <VECT_DEF>
3853 Store them in NEW_PHIS. */
3855 exit_bb = single_exit (loop)->dest;
3856 prev_phi_info = NULL;
3857 new_phis.create (vect_defs.length ());
3858 FOR_EACH_VEC_ELT (vect_defs, i, def)
3860 for (j = 0; j < ncopies; j++)
3862 tree new_def = copy_ssa_name (def, NULL);
3863 phi = create_phi_node (new_def, exit_bb);
3864 set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, loop_vinfo, NULL));
3865 if (j == 0)
3866 new_phis.quick_push (phi);
3867 else
3869 def = vect_get_vec_def_for_stmt_copy (dt, def);
3870 STMT_VINFO_RELATED_STMT (prev_phi_info) = phi;
3873 SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, def);
3874 prev_phi_info = vinfo_for_stmt (phi);
3878 /* The epilogue is created for the outer-loop, i.e., for the loop being
3879 vectorized. Create exit phis for the outer loop. */
3880 if (double_reduc)
3882 loop = outer_loop;
3883 exit_bb = single_exit (loop)->dest;
3884 inner_phis.create (vect_defs.length ());
3885 FOR_EACH_VEC_ELT (new_phis, i, phi)
3887 tree new_result = copy_ssa_name (PHI_RESULT (phi), NULL);
3888 gimple outer_phi = create_phi_node (new_result, exit_bb);
3889 SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx,
3890 PHI_RESULT (phi));
3891 set_vinfo_for_stmt (outer_phi, new_stmt_vec_info (outer_phi,
3892 loop_vinfo, NULL));
3893 inner_phis.quick_push (phi);
3894 new_phis[i] = outer_phi;
3895 prev_phi_info = vinfo_for_stmt (outer_phi);
3896 while (STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi)))
3898 phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi));
3899 new_result = copy_ssa_name (PHI_RESULT (phi), NULL);
3900 outer_phi = create_phi_node (new_result, exit_bb);
3901 SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx,
3902 PHI_RESULT (phi));
3903 set_vinfo_for_stmt (outer_phi, new_stmt_vec_info (outer_phi,
3904 loop_vinfo, NULL));
3905 STMT_VINFO_RELATED_STMT (prev_phi_info) = outer_phi;
3906 prev_phi_info = vinfo_for_stmt (outer_phi);
3911 exit_gsi = gsi_after_labels (exit_bb);
3913 /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
3914 (i.e. when reduc_code is not available) and in the final adjustment
3915 code (if needed). Also get the original scalar reduction variable as
3916 defined in the loop. In case STMT is a "pattern-stmt" (i.e. - it
3917 represents a reduction pattern), the tree-code and scalar-def are
3918 taken from the original stmt that the pattern-stmt (STMT) replaces.
3919 Otherwise (it is a regular reduction) - the tree-code and scalar-def
3920 are taken from STMT. */
3922 orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
3923 if (!orig_stmt)
3925 /* Regular reduction */
3926 orig_stmt = stmt;
3928 else
3930 /* Reduction pattern */
3931 stmt_vec_info stmt_vinfo = vinfo_for_stmt (orig_stmt);
3932 gcc_assert (STMT_VINFO_IN_PATTERN_P (stmt_vinfo));
3933 gcc_assert (STMT_VINFO_RELATED_STMT (stmt_vinfo) == stmt);
3936 code = gimple_assign_rhs_code (orig_stmt);
3937 /* For MINUS_EXPR the initial vector is [init_val,0,...,0], therefore,
3938 partial results are added and not subtracted. */
3939 if (code == MINUS_EXPR)
3940 code = PLUS_EXPR;
3942 scalar_dest = gimple_assign_lhs (orig_stmt);
3943 scalar_type = TREE_TYPE (scalar_dest);
3944 scalar_results.create (group_size);
3945 new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
3946 bitsize = TYPE_SIZE (scalar_type);
3948 /* In case this is a reduction in an inner-loop while vectorizing an outer
3949 loop - we don't need to extract a single scalar result at the end of the
3950 inner-loop (unless it is double reduction, i.e., the use of reduction is
3951 outside the outer-loop). The final vector of partial results will be used
3952 in the vectorized outer-loop, or reduced to a scalar result at the end of
3953 the outer-loop. */
3954 if (nested_in_vect_loop && !double_reduc)
3955 goto vect_finalize_reduction;
3957 /* SLP reduction without reduction chain, e.g.,
3958 # a1 = phi <a2, a0>
3959 # b1 = phi <b2, b0>
3960 a2 = operation (a1)
3961 b2 = operation (b1) */
3962 slp_reduc = (slp_node && !GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)));
3964 /* In case of reduction chain, e.g.,
3965 # a1 = phi <a3, a0>
3966 a2 = operation (a1)
3967 a3 = operation (a2),
3969 we may end up with more than one vector result. Here we reduce them to
3970 one vector. */
3971 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
3973 tree first_vect = PHI_RESULT (new_phis[0]);
3974 tree tmp;
3975 gimple new_vec_stmt = NULL;
3977 vec_dest = vect_create_destination_var (scalar_dest, vectype);
3978 for (k = 1; k < new_phis.length (); k++)
3980 gimple next_phi = new_phis[k];
3981 tree second_vect = PHI_RESULT (next_phi);
3983 tmp = build2 (code, vectype, first_vect, second_vect);
3984 new_vec_stmt = gimple_build_assign (vec_dest, tmp);
3985 first_vect = make_ssa_name (vec_dest, new_vec_stmt);
3986 gimple_assign_set_lhs (new_vec_stmt, first_vect);
3987 gsi_insert_before (&exit_gsi, new_vec_stmt, GSI_SAME_STMT);
3990 new_phi_result = first_vect;
3991 if (new_vec_stmt)
3993 new_phis.truncate (0);
3994 new_phis.safe_push (new_vec_stmt);
3997 else
3998 new_phi_result = PHI_RESULT (new_phis[0]);
4000 /* 2.3 Create the reduction code, using one of the three schemes described
4001 above. In SLP we simply need to extract all the elements from the
4002 vector (without reducing them), so we use scalar shifts. */
4003 if (reduc_code != ERROR_MARK && !slp_reduc)
4005 tree tmp;
4007 /*** Case 1: Create:
4008 v_out2 = reduc_expr <v_out1> */
4010 if (dump_enabled_p ())
4011 dump_printf_loc (MSG_NOTE, vect_location,
4012 "Reduce using direct vector reduction.");
4014 vec_dest = vect_create_destination_var (scalar_dest, vectype);
4015 tmp = build1 (reduc_code, vectype, new_phi_result);
4016 epilog_stmt = gimple_build_assign (vec_dest, tmp);
4017 new_temp = make_ssa_name (vec_dest, epilog_stmt);
4018 gimple_assign_set_lhs (epilog_stmt, new_temp);
4019 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4021 extract_scalar_result = true;
4023 else
4025 enum tree_code shift_code = ERROR_MARK;
4026 bool have_whole_vector_shift = true;
4027 int bit_offset;
4028 int element_bitsize = tree_low_cst (bitsize, 1);
4029 int vec_size_in_bits = tree_low_cst (TYPE_SIZE (vectype), 1);
4030 tree vec_temp;
4032 if (optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
4033 shift_code = VEC_RSHIFT_EXPR;
4034 else
4035 have_whole_vector_shift = false;
4037 /* Regardless of whether we have a whole vector shift, if we're
4038 emulating the operation via tree-vect-generic, we don't want
4039 to use it. Only the first round of the reduction is likely
4040 to still be profitable via emulation. */
4041 /* ??? It might be better to emit a reduction tree code here, so that
4042 tree-vect-generic can expand the first round via bit tricks. */
4043 if (!VECTOR_MODE_P (mode))
4044 have_whole_vector_shift = false;
4045 else
4047 optab optab = optab_for_tree_code (code, vectype, optab_default);
4048 if (optab_handler (optab, mode) == CODE_FOR_nothing)
4049 have_whole_vector_shift = false;
4052 if (have_whole_vector_shift && !slp_reduc)
4054 /*** Case 2: Create:
4055 for (offset = VS/2; offset >= element_size; offset/=2)
4057 Create: va' = vec_shift <va, offset>
4058 Create: va = vop <va, va'>
4059 } */
4061 if (dump_enabled_p ())
4062 dump_printf_loc (MSG_NOTE, vect_location,
4063 "Reduce using vector shifts");
4065 vec_dest = vect_create_destination_var (scalar_dest, vectype);
4066 new_temp = new_phi_result;
4067 for (bit_offset = vec_size_in_bits/2;
4068 bit_offset >= element_bitsize;
4069 bit_offset /= 2)
4071 tree bitpos = size_int (bit_offset);
4073 epilog_stmt = gimple_build_assign_with_ops (shift_code,
4074 vec_dest, new_temp, bitpos);
4075 new_name = make_ssa_name (vec_dest, epilog_stmt);
4076 gimple_assign_set_lhs (epilog_stmt, new_name);
4077 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4079 epilog_stmt = gimple_build_assign_with_ops (code, vec_dest,
4080 new_name, new_temp);
4081 new_temp = make_ssa_name (vec_dest, epilog_stmt);
4082 gimple_assign_set_lhs (epilog_stmt, new_temp);
4083 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4086 extract_scalar_result = true;
4088 else
4090 tree rhs;
4092 /*** Case 3: Create:
4093 s = extract_field <v_out2, 0>
4094 for (offset = element_size;
4095 offset < vector_size;
4096 offset += element_size;)
4098 Create: s' = extract_field <v_out2, offset>
4099 Create: s = op <s, s'> // For non SLP cases
4100 } */
4102 if (dump_enabled_p ())
4103 dump_printf_loc (MSG_NOTE, vect_location,
4104 "Reduce using scalar code. ");
4106 vec_size_in_bits = tree_low_cst (TYPE_SIZE (vectype), 1);
4107 FOR_EACH_VEC_ELT (new_phis, i, new_phi)
4109 if (gimple_code (new_phi) == GIMPLE_PHI)
4110 vec_temp = PHI_RESULT (new_phi);
4111 else
4112 vec_temp = gimple_assign_lhs (new_phi);
4113 rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, bitsize,
4114 bitsize_zero_node);
4115 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
4116 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
4117 gimple_assign_set_lhs (epilog_stmt, new_temp);
4118 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4120 /* In SLP we don't need to apply reduction operation, so we just
4121 collect s' values in SCALAR_RESULTS. */
4122 if (slp_reduc)
4123 scalar_results.safe_push (new_temp);
4125 for (bit_offset = element_bitsize;
4126 bit_offset < vec_size_in_bits;
4127 bit_offset += element_bitsize)
4129 tree bitpos = bitsize_int (bit_offset);
4130 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp,
4131 bitsize, bitpos);
4133 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
4134 new_name = make_ssa_name (new_scalar_dest, epilog_stmt);
4135 gimple_assign_set_lhs (epilog_stmt, new_name);
4136 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4138 if (slp_reduc)
4140 /* In SLP we don't need to apply reduction operation, so
4141 we just collect s' values in SCALAR_RESULTS. */
4142 new_temp = new_name;
4143 scalar_results.safe_push (new_name);
4145 else
4147 epilog_stmt = gimple_build_assign_with_ops (code,
4148 new_scalar_dest, new_name, new_temp);
4149 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
4150 gimple_assign_set_lhs (epilog_stmt, new_temp);
4151 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4156 /* The only case where we need to reduce scalar results in SLP, is
4157 unrolling. If the size of SCALAR_RESULTS is greater than
4158 GROUP_SIZE, we reduce them combining elements modulo
4159 GROUP_SIZE. */
4160 if (slp_reduc)
4162 tree res, first_res, new_res;
4163 gimple new_stmt;
4165 /* Reduce multiple scalar results in case of SLP unrolling. */
4166 for (j = group_size; scalar_results.iterate (j, &res);
4167 j++)
4169 first_res = scalar_results[j % group_size];
4170 new_stmt = gimple_build_assign_with_ops (code,
4171 new_scalar_dest, first_res, res);
4172 new_res = make_ssa_name (new_scalar_dest, new_stmt);
4173 gimple_assign_set_lhs (new_stmt, new_res);
4174 gsi_insert_before (&exit_gsi, new_stmt, GSI_SAME_STMT);
4175 scalar_results[j % group_size] = new_res;
4178 else
4179 /* Not SLP - we have one scalar to keep in SCALAR_RESULTS. */
4180 scalar_results.safe_push (new_temp);
4182 extract_scalar_result = false;
4186 /* 2.4 Extract the final scalar result. Create:
4187 s_out3 = extract_field <v_out2, bitpos> */
4189 if (extract_scalar_result)
4191 tree rhs;
4193 if (dump_enabled_p ())
4194 dump_printf_loc (MSG_NOTE, vect_location,
4195 "extract scalar result");
4197 if (BYTES_BIG_ENDIAN)
4198 bitpos = size_binop (MULT_EXPR,
4199 bitsize_int (TYPE_VECTOR_SUBPARTS (vectype) - 1),
4200 TYPE_SIZE (scalar_type));
4201 else
4202 bitpos = bitsize_zero_node;
4204 rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp, bitsize, bitpos);
4205 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
4206 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
4207 gimple_assign_set_lhs (epilog_stmt, new_temp);
4208 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4209 scalar_results.safe_push (new_temp);
4212 vect_finalize_reduction:
4214 if (double_reduc)
4215 loop = loop->inner;
4217 /* 2.5 Adjust the final result by the initial value of the reduction
4218 variable. (When such adjustment is not needed, then
4219 'adjustment_def' is zero). For example, if code is PLUS we create:
4220 new_temp = loop_exit_def + adjustment_def */
4222 if (adjustment_def)
4224 gcc_assert (!slp_reduc);
4225 if (nested_in_vect_loop)
4227 new_phi = new_phis[0];
4228 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) == VECTOR_TYPE);
4229 expr = build2 (code, vectype, PHI_RESULT (new_phi), adjustment_def);
4230 new_dest = vect_create_destination_var (scalar_dest, vectype);
4232 else
4234 new_temp = scalar_results[0];
4235 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
4236 expr = build2 (code, scalar_type, new_temp, adjustment_def);
4237 new_dest = vect_create_destination_var (scalar_dest, scalar_type);
4240 epilog_stmt = gimple_build_assign (new_dest, expr);
4241 new_temp = make_ssa_name (new_dest, epilog_stmt);
4242 gimple_assign_set_lhs (epilog_stmt, new_temp);
4243 SSA_NAME_DEF_STMT (new_temp) = epilog_stmt;
4244 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4245 if (nested_in_vect_loop)
4247 set_vinfo_for_stmt (epilog_stmt,
4248 new_stmt_vec_info (epilog_stmt, loop_vinfo,
4249 NULL));
4250 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (epilog_stmt)) =
4251 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (new_phi));
4253 if (!double_reduc)
4254 scalar_results.quick_push (new_temp);
4255 else
4256 scalar_results[0] = new_temp;
4258 else
4259 scalar_results[0] = new_temp;
4261 new_phis[0] = epilog_stmt;
4264 /* 2.6 Handle the loop-exit phis. Replace the uses of scalar loop-exit
4265 phis with new adjusted scalar results, i.e., replace use <s_out0>
4266 with use <s_out4>.
4268 Transform:
4269 loop_exit:
4270 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
4271 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
4272 v_out2 = reduce <v_out1>
4273 s_out3 = extract_field <v_out2, 0>
4274 s_out4 = adjust_result <s_out3>
4275 use <s_out0>
4276 use <s_out0>
4278 into:
4280 loop_exit:
4281 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
4282 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
4283 v_out2 = reduce <v_out1>
4284 s_out3 = extract_field <v_out2, 0>
4285 s_out4 = adjust_result <s_out3>
4286 use <s_out4>
4287 use <s_out4> */
4290 /* In SLP reduction chain we reduce vector results into one vector if
4291 necessary, hence we set here GROUP_SIZE to 1. SCALAR_DEST is the LHS of
4292 the last stmt in the reduction chain, since we are looking for the loop
4293 exit phi node. */
4294 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
4296 scalar_dest = gimple_assign_lhs (
4297 SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1]);
4298 group_size = 1;
4301 /* In SLP we may have several statements in NEW_PHIS and REDUCTION_PHIS (in
4302 case that GROUP_SIZE is greater than vectorization factor). Therefore, we
4303 need to match SCALAR_RESULTS with corresponding statements. The first
4304 (GROUP_SIZE / number of new vector stmts) scalar results correspond to
4305 the first vector stmt, etc.
4306 (RATIO is equal to (GROUP_SIZE / number of new vector stmts)). */
4307 if (group_size > new_phis.length ())
4309 ratio = group_size / new_phis.length ();
4310 gcc_assert (!(group_size % new_phis.length ()));
4312 else
4313 ratio = 1;
4315 for (k = 0; k < group_size; k++)
4317 if (k % ratio == 0)
4319 epilog_stmt = new_phis[k / ratio];
4320 reduction_phi = reduction_phis[k / ratio];
4321 if (double_reduc)
4322 inner_phi = inner_phis[k / ratio];
4325 if (slp_reduc)
4327 gimple current_stmt = SLP_TREE_SCALAR_STMTS (slp_node)[k];
4329 orig_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (current_stmt));
4330 /* SLP statements can't participate in patterns. */
4331 gcc_assert (!orig_stmt);
4332 scalar_dest = gimple_assign_lhs (current_stmt);
4335 phis.create (3);
4336 /* Find the loop-closed-use at the loop exit of the original scalar
4337 result. (The reduction result is expected to have two immediate uses -
4338 one at the latch block, and one at the loop exit). */
4339 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
4340 if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
4341 phis.safe_push (USE_STMT (use_p));
4343 /* We expect to have found an exit_phi because of loop-closed-ssa
4344 form. */
4345 gcc_assert (!phis.is_empty ());
4347 FOR_EACH_VEC_ELT (phis, i, exit_phi)
4349 if (outer_loop)
4351 stmt_vec_info exit_phi_vinfo = vinfo_for_stmt (exit_phi);
4352 gimple vect_phi;
4354 /* FORNOW. Currently not supporting the case that an inner-loop
4355 reduction is not used in the outer-loop (but only outside the
4356 outer-loop), unless it is double reduction. */
4357 gcc_assert ((STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
4358 && !STMT_VINFO_LIVE_P (exit_phi_vinfo))
4359 || double_reduc);
4361 STMT_VINFO_VEC_STMT (exit_phi_vinfo) = epilog_stmt;
4362 if (!double_reduc
4363 || STMT_VINFO_DEF_TYPE (exit_phi_vinfo)
4364 != vect_double_reduction_def)
4365 continue;
4367 /* Handle double reduction:
4369 stmt1: s1 = phi <s0, s2> - double reduction phi (outer loop)
4370 stmt2: s3 = phi <s1, s4> - (regular) reduc phi (inner loop)
4371 stmt3: s4 = use (s3) - (regular) reduc stmt (inner loop)
4372 stmt4: s2 = phi <s4> - double reduction stmt (outer loop)
4374 At that point the regular reduction (stmt2 and stmt3) is
4375 already vectorized, as well as the exit phi node, stmt4.
4376 Here we vectorize the phi node of double reduction, stmt1, and
4377 update all relevant statements. */
4379 /* Go through all the uses of s2 to find double reduction phi
4380 node, i.e., stmt1 above. */
4381 orig_name = PHI_RESULT (exit_phi);
4382 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
4384 stmt_vec_info use_stmt_vinfo;
4385 stmt_vec_info new_phi_vinfo;
4386 tree vect_phi_init, preheader_arg, vect_phi_res, init_def;
4387 basic_block bb = gimple_bb (use_stmt);
4388 gimple use;
4390 /* Check that USE_STMT is really double reduction phi
4391 node. */
4392 if (gimple_code (use_stmt) != GIMPLE_PHI
4393 || gimple_phi_num_args (use_stmt) != 2
4394 || bb->loop_father != outer_loop)
4395 continue;
4396 use_stmt_vinfo = vinfo_for_stmt (use_stmt);
4397 if (!use_stmt_vinfo
4398 || STMT_VINFO_DEF_TYPE (use_stmt_vinfo)
4399 != vect_double_reduction_def)
4400 continue;
4402 /* Create vector phi node for double reduction:
4403 vs1 = phi <vs0, vs2>
4404 vs1 was created previously in this function by a call to
4405 vect_get_vec_def_for_operand and is stored in
4406 vec_initial_def;
4407 vs2 is defined by INNER_PHI, the vectorized EXIT_PHI;
4408 vs0 is created here. */
4410 /* Create vector phi node. */
4411 vect_phi = create_phi_node (vec_initial_def, bb);
4412 new_phi_vinfo = new_stmt_vec_info (vect_phi,
4413 loop_vec_info_for_loop (outer_loop), NULL);
4414 set_vinfo_for_stmt (vect_phi, new_phi_vinfo);
4416 /* Create vs0 - initial def of the double reduction phi. */
4417 preheader_arg = PHI_ARG_DEF_FROM_EDGE (use_stmt,
4418 loop_preheader_edge (outer_loop));
4419 init_def = get_initial_def_for_reduction (stmt,
4420 preheader_arg, NULL);
4421 vect_phi_init = vect_init_vector (use_stmt, init_def,
4422 vectype, NULL);
4424 /* Update phi node arguments with vs0 and vs2. */
4425 add_phi_arg (vect_phi, vect_phi_init,
4426 loop_preheader_edge (outer_loop),
4427 UNKNOWN_LOCATION);
4428 add_phi_arg (vect_phi, PHI_RESULT (inner_phi),
4429 loop_latch_edge (outer_loop), UNKNOWN_LOCATION);
4430 if (dump_enabled_p ())
4432 dump_printf_loc (MSG_NOTE, vect_location,
4433 "created double reduction phi node: ");
4434 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, vect_phi, 0);
4437 vect_phi_res = PHI_RESULT (vect_phi);
4439 /* Replace the use, i.e., set the correct vs1 in the regular
4440 reduction phi node. FORNOW, NCOPIES is always 1, so the
4441 loop is redundant. */
4442 use = reduction_phi;
4443 for (j = 0; j < ncopies; j++)
4445 edge pr_edge = loop_preheader_edge (loop);
4446 SET_PHI_ARG_DEF (use, pr_edge->dest_idx, vect_phi_res);
4447 use = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (use));
4453 phis.release ();
4454 if (nested_in_vect_loop)
4456 if (double_reduc)
4457 loop = outer_loop;
4458 else
4459 continue;
4462 phis.create (3);
4463 /* Find the loop-closed-use at the loop exit of the original scalar
4464 result. (The reduction result is expected to have two immediate uses,
4465 one at the latch block, and one at the loop exit). For double
4466 reductions we are looking for exit phis of the outer loop. */
4467 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
4469 if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
4470 phis.safe_push (USE_STMT (use_p));
4471 else
4473 if (double_reduc && gimple_code (USE_STMT (use_p)) == GIMPLE_PHI)
4475 tree phi_res = PHI_RESULT (USE_STMT (use_p));
4477 FOR_EACH_IMM_USE_FAST (phi_use_p, phi_imm_iter, phi_res)
4479 if (!flow_bb_inside_loop_p (loop,
4480 gimple_bb (USE_STMT (phi_use_p))))
4481 phis.safe_push (USE_STMT (phi_use_p));
4487 FOR_EACH_VEC_ELT (phis, i, exit_phi)
4489 /* Replace the uses: */
4490 orig_name = PHI_RESULT (exit_phi);
4491 scalar_result = scalar_results[k];
4492 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
4493 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
4494 SET_USE (use_p, scalar_result);
4497 phis.release ();
4500 scalar_results.release ();
4501 inner_phis.release ();
4502 new_phis.release ();
4506 /* Function vectorizable_reduction.
4508 Check if STMT performs a reduction operation that can be vectorized.
4509 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
4510 stmt to replace it, put it in VEC_STMT, and insert it at GSI.
4511 Return FALSE if not a vectorizable STMT, TRUE otherwise.
4513 This function also handles reduction idioms (patterns) that have been
4514 recognized in advance during vect_pattern_recog. In this case, STMT may be
4515 of this form:
4516 X = pattern_expr (arg0, arg1, ..., X)
4517 and it's STMT_VINFO_RELATED_STMT points to the last stmt in the original
4518 sequence that had been detected and replaced by the pattern-stmt (STMT).
4520 In some cases of reduction patterns, the type of the reduction variable X is
4521 different than the type of the other arguments of STMT.
4522 In such cases, the vectype that is used when transforming STMT into a vector
4523 stmt is different than the vectype that is used to determine the
4524 vectorization factor, because it consists of a different number of elements
4525 than the actual number of elements that are being operated upon in parallel.
4527 For example, consider an accumulation of shorts into an int accumulator.
4528 On some targets it's possible to vectorize this pattern operating on 8
4529 shorts at a time (hence, the vectype for purposes of determining the
4530 vectorization factor should be V8HI); on the other hand, the vectype that
4531 is used to create the vector form is actually V4SI (the type of the result).
4533 Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
4534 indicates what is the actual level of parallelism (V8HI in the example), so
4535 that the right vectorization factor would be derived. This vectype
4536 corresponds to the type of arguments to the reduction stmt, and should *NOT*
4537 be used to create the vectorized stmt. The right vectype for the vectorized
4538 stmt is obtained from the type of the result X:
4539 get_vectype_for_scalar_type (TREE_TYPE (X))
4541 This means that, contrary to "regular" reductions (or "regular" stmts in
4542 general), the following equation:
4543 STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (TREE_TYPE (X))
4544 does *NOT* necessarily hold for reduction patterns. */
4546 bool
4547 vectorizable_reduction (gimple stmt, gimple_stmt_iterator *gsi,
4548 gimple *vec_stmt, slp_tree slp_node)
4550 tree vec_dest;
4551 tree scalar_dest;
4552 tree loop_vec_def0 = NULL_TREE, loop_vec_def1 = NULL_TREE;
4553 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4554 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
4555 tree vectype_in = NULL_TREE;
4556 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4557 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4558 enum tree_code code, orig_code, epilog_reduc_code;
4559 enum machine_mode vec_mode;
4560 int op_type;
4561 optab optab, reduc_optab;
4562 tree new_temp = NULL_TREE;
4563 tree def;
4564 gimple def_stmt;
4565 enum vect_def_type dt;
4566 gimple new_phi = NULL;
4567 tree scalar_type;
4568 bool is_simple_use;
4569 gimple orig_stmt;
4570 stmt_vec_info orig_stmt_info;
4571 tree expr = NULL_TREE;
4572 int i;
4573 int ncopies;
4574 int epilog_copies;
4575 stmt_vec_info prev_stmt_info, prev_phi_info;
4576 bool single_defuse_cycle = false;
4577 tree reduc_def = NULL_TREE;
4578 gimple new_stmt = NULL;
4579 int j;
4580 tree ops[3];
4581 bool nested_cycle = false, found_nested_cycle_def = false;
4582 gimple reduc_def_stmt = NULL;
4583 /* The default is that the reduction variable is the last in statement. */
4584 int reduc_index = 2;
4585 bool double_reduc = false, dummy;
4586 basic_block def_bb;
4587 struct loop * def_stmt_loop, *outer_loop = NULL;
4588 tree def_arg;
4589 gimple def_arg_stmt;
4590 vec<tree> vec_oprnds0 = vNULL;
4591 vec<tree> vec_oprnds1 = vNULL;
4592 vec<tree> vect_defs = vNULL;
4593 vec<gimple> phis = vNULL;
4594 int vec_num;
4595 tree def0, def1, tem, op0, op1 = NULL_TREE;
4597 /* In case of reduction chain we switch to the first stmt in the chain, but
4598 we don't update STMT_INFO, since only the last stmt is marked as reduction
4599 and has reduction properties. */
4600 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
4601 stmt = GROUP_FIRST_ELEMENT (stmt_info);
4603 if (nested_in_vect_loop_p (loop, stmt))
4605 outer_loop = loop;
4606 loop = loop->inner;
4607 nested_cycle = true;
4610 /* 1. Is vectorizable reduction? */
4611 /* Not supportable if the reduction variable is used in the loop, unless
4612 it's a reduction chain. */
4613 if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer
4614 && !GROUP_FIRST_ELEMENT (stmt_info))
4615 return false;
4617 /* Reductions that are not used even in an enclosing outer-loop,
4618 are expected to be "live" (used out of the loop). */
4619 if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope
4620 && !STMT_VINFO_LIVE_P (stmt_info))
4621 return false;
4623 /* Make sure it was already recognized as a reduction computation. */
4624 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def
4625 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_nested_cycle)
4626 return false;
4628 /* 2. Has this been recognized as a reduction pattern?
4630 Check if STMT represents a pattern that has been recognized
4631 in earlier analysis stages. For stmts that represent a pattern,
4632 the STMT_VINFO_RELATED_STMT field records the last stmt in
4633 the original sequence that constitutes the pattern. */
4635 orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
4636 if (orig_stmt)
4638 orig_stmt_info = vinfo_for_stmt (orig_stmt);
4639 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
4640 gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
4643 /* 3. Check the operands of the operation. The first operands are defined
4644 inside the loop body. The last operand is the reduction variable,
4645 which is defined by the loop-header-phi. */
4647 gcc_assert (is_gimple_assign (stmt));
4649 /* Flatten RHS. */
4650 switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
4652 case GIMPLE_SINGLE_RHS:
4653 op_type = TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt));
4654 if (op_type == ternary_op)
4656 tree rhs = gimple_assign_rhs1 (stmt);
4657 ops[0] = TREE_OPERAND (rhs, 0);
4658 ops[1] = TREE_OPERAND (rhs, 1);
4659 ops[2] = TREE_OPERAND (rhs, 2);
4660 code = TREE_CODE (rhs);
4662 else
4663 return false;
4664 break;
4666 case GIMPLE_BINARY_RHS:
4667 code = gimple_assign_rhs_code (stmt);
4668 op_type = TREE_CODE_LENGTH (code);
4669 gcc_assert (op_type == binary_op);
4670 ops[0] = gimple_assign_rhs1 (stmt);
4671 ops[1] = gimple_assign_rhs2 (stmt);
4672 break;
4674 case GIMPLE_TERNARY_RHS:
4675 code = gimple_assign_rhs_code (stmt);
4676 op_type = TREE_CODE_LENGTH (code);
4677 gcc_assert (op_type == ternary_op);
4678 ops[0] = gimple_assign_rhs1 (stmt);
4679 ops[1] = gimple_assign_rhs2 (stmt);
4680 ops[2] = gimple_assign_rhs3 (stmt);
4681 break;
4683 case GIMPLE_UNARY_RHS:
4684 return false;
4686 default:
4687 gcc_unreachable ();
4690 if (code == COND_EXPR && slp_node)
4691 return false;
4693 scalar_dest = gimple_assign_lhs (stmt);
4694 scalar_type = TREE_TYPE (scalar_dest);
4695 if (!POINTER_TYPE_P (scalar_type) && !INTEGRAL_TYPE_P (scalar_type)
4696 && !SCALAR_FLOAT_TYPE_P (scalar_type))
4697 return false;
4699 /* Do not try to vectorize bit-precision reductions. */
4700 if ((TYPE_PRECISION (scalar_type)
4701 != GET_MODE_PRECISION (TYPE_MODE (scalar_type))))
4702 return false;
4704 /* All uses but the last are expected to be defined in the loop.
4705 The last use is the reduction variable. In case of nested cycle this
4706 assumption is not true: we use reduc_index to record the index of the
4707 reduction variable. */
4708 for (i = 0; i < op_type - 1; i++)
4710 /* The condition of COND_EXPR is checked in vectorizable_condition(). */
4711 if (i == 0 && code == COND_EXPR)
4712 continue;
4714 is_simple_use = vect_is_simple_use_1 (ops[i], stmt, loop_vinfo, NULL,
4715 &def_stmt, &def, &dt, &tem);
4716 if (!vectype_in)
4717 vectype_in = tem;
4718 gcc_assert (is_simple_use);
4720 if (dt != vect_internal_def
4721 && dt != vect_external_def
4722 && dt != vect_constant_def
4723 && dt != vect_induction_def
4724 && !(dt == vect_nested_cycle && nested_cycle))
4725 return false;
4727 if (dt == vect_nested_cycle)
4729 found_nested_cycle_def = true;
4730 reduc_def_stmt = def_stmt;
4731 reduc_index = i;
4735 is_simple_use = vect_is_simple_use_1 (ops[i], stmt, loop_vinfo, NULL,
4736 &def_stmt, &def, &dt, &tem);
4737 if (!vectype_in)
4738 vectype_in = tem;
4739 gcc_assert (is_simple_use);
4740 if (!(dt == vect_reduction_def
4741 || dt == vect_nested_cycle
4742 || ((dt == vect_internal_def || dt == vect_external_def
4743 || dt == vect_constant_def || dt == vect_induction_def)
4744 && nested_cycle && found_nested_cycle_def)))
4746 /* For pattern recognized stmts, orig_stmt might be a reduction,
4747 but some helper statements for the pattern might not, or
4748 might be COND_EXPRs with reduction uses in the condition. */
4749 gcc_assert (orig_stmt);
4750 return false;
4752 if (!found_nested_cycle_def)
4753 reduc_def_stmt = def_stmt;
4755 gcc_assert (gimple_code (reduc_def_stmt) == GIMPLE_PHI);
4756 if (orig_stmt)
4757 gcc_assert (orig_stmt == vect_is_simple_reduction (loop_vinfo,
4758 reduc_def_stmt,
4759 !nested_cycle,
4760 &dummy));
4761 else
4763 gimple tmp = vect_is_simple_reduction (loop_vinfo, reduc_def_stmt,
4764 !nested_cycle, &dummy);
4765 /* We changed STMT to be the first stmt in reduction chain, hence we
4766 check that in this case the first element in the chain is STMT. */
4767 gcc_assert (stmt == tmp
4768 || GROUP_FIRST_ELEMENT (vinfo_for_stmt (tmp)) == stmt);
4771 if (STMT_VINFO_LIVE_P (vinfo_for_stmt (reduc_def_stmt)))
4772 return false;
4774 if (slp_node || PURE_SLP_STMT (stmt_info))
4775 ncopies = 1;
4776 else
4777 ncopies = (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
4778 / TYPE_VECTOR_SUBPARTS (vectype_in));
4780 gcc_assert (ncopies >= 1);
4782 vec_mode = TYPE_MODE (vectype_in);
4784 if (code == COND_EXPR)
4786 if (!vectorizable_condition (stmt, gsi, NULL, ops[reduc_index], 0, NULL))
4788 if (dump_enabled_p ())
4789 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4790 "unsupported condition in reduction");
4792 return false;
4795 else
4797 /* 4. Supportable by target? */
4799 if (code == LSHIFT_EXPR || code == RSHIFT_EXPR
4800 || code == LROTATE_EXPR || code == RROTATE_EXPR)
4802 /* Shifts and rotates are only supported by vectorizable_shifts,
4803 not vectorizable_reduction. */
4804 if (dump_enabled_p ())
4805 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4806 "unsupported shift or rotation.");
4807 return false;
4810 /* 4.1. check support for the operation in the loop */
4811 optab = optab_for_tree_code (code, vectype_in, optab_default);
4812 if (!optab)
4814 if (dump_enabled_p ())
4815 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4816 "no optab.");
4818 return false;
4821 if (optab_handler (optab, vec_mode) == CODE_FOR_nothing)
4823 if (dump_enabled_p ())
4824 dump_printf (MSG_NOTE, "op not supported by target.");
4826 if (GET_MODE_SIZE (vec_mode) != UNITS_PER_WORD
4827 || LOOP_VINFO_VECT_FACTOR (loop_vinfo)
4828 < vect_min_worthwhile_factor (code))
4829 return false;
4831 if (dump_enabled_p ())
4832 dump_printf (MSG_NOTE, "proceeding using word mode.");
4835 /* Worthwhile without SIMD support? */
4836 if (!VECTOR_MODE_P (TYPE_MODE (vectype_in))
4837 && LOOP_VINFO_VECT_FACTOR (loop_vinfo)
4838 < vect_min_worthwhile_factor (code))
4840 if (dump_enabled_p ())
4841 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4842 "not worthwhile without SIMD support.");
4844 return false;
4848 /* 4.2. Check support for the epilog operation.
4850 If STMT represents a reduction pattern, then the type of the
4851 reduction variable may be different than the type of the rest
4852 of the arguments. For example, consider the case of accumulation
4853 of shorts into an int accumulator; The original code:
4854 S1: int_a = (int) short_a;
4855 orig_stmt-> S2: int_acc = plus <int_a ,int_acc>;
4857 was replaced with:
4858 STMT: int_acc = widen_sum <short_a, int_acc>
4860 This means that:
4861 1. The tree-code that is used to create the vector operation in the
4862 epilog code (that reduces the partial results) is not the
4863 tree-code of STMT, but is rather the tree-code of the original
4864 stmt from the pattern that STMT is replacing. I.e, in the example
4865 above we want to use 'widen_sum' in the loop, but 'plus' in the
4866 epilog.
4867 2. The type (mode) we use to check available target support
4868 for the vector operation to be created in the *epilog*, is
4869 determined by the type of the reduction variable (in the example
4870 above we'd check this: optab_handler (plus_optab, vect_int_mode])).
4871 However the type (mode) we use to check available target support
4872 for the vector operation to be created *inside the loop*, is
4873 determined by the type of the other arguments to STMT (in the
4874 example we'd check this: optab_handler (widen_sum_optab,
4875 vect_short_mode)).
4877 This is contrary to "regular" reductions, in which the types of all
4878 the arguments are the same as the type of the reduction variable.
4879 For "regular" reductions we can therefore use the same vector type
4880 (and also the same tree-code) when generating the epilog code and
4881 when generating the code inside the loop. */
4883 if (orig_stmt)
4885 /* This is a reduction pattern: get the vectype from the type of the
4886 reduction variable, and get the tree-code from orig_stmt. */
4887 orig_code = gimple_assign_rhs_code (orig_stmt);
4888 gcc_assert (vectype_out);
4889 vec_mode = TYPE_MODE (vectype_out);
4891 else
4893 /* Regular reduction: use the same vectype and tree-code as used for
4894 the vector code inside the loop can be used for the epilog code. */
4895 orig_code = code;
4898 if (nested_cycle)
4900 def_bb = gimple_bb (reduc_def_stmt);
4901 def_stmt_loop = def_bb->loop_father;
4902 def_arg = PHI_ARG_DEF_FROM_EDGE (reduc_def_stmt,
4903 loop_preheader_edge (def_stmt_loop));
4904 if (TREE_CODE (def_arg) == SSA_NAME
4905 && (def_arg_stmt = SSA_NAME_DEF_STMT (def_arg))
4906 && gimple_code (def_arg_stmt) == GIMPLE_PHI
4907 && flow_bb_inside_loop_p (outer_loop, gimple_bb (def_arg_stmt))
4908 && vinfo_for_stmt (def_arg_stmt)
4909 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_arg_stmt))
4910 == vect_double_reduction_def)
4911 double_reduc = true;
4914 epilog_reduc_code = ERROR_MARK;
4915 if (reduction_code_for_scalar_code (orig_code, &epilog_reduc_code))
4917 reduc_optab = optab_for_tree_code (epilog_reduc_code, vectype_out,
4918 optab_default);
4919 if (!reduc_optab)
4921 if (dump_enabled_p ())
4922 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4923 "no optab for reduction.");
4925 epilog_reduc_code = ERROR_MARK;
4928 if (reduc_optab
4929 && optab_handler (reduc_optab, vec_mode) == CODE_FOR_nothing)
4931 if (dump_enabled_p ())
4932 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4933 "reduc op not supported by target.");
4935 epilog_reduc_code = ERROR_MARK;
4938 else
4940 if (!nested_cycle || double_reduc)
4942 if (dump_enabled_p ())
4943 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4944 "no reduc code for scalar code.");
4946 return false;
4950 if (double_reduc && ncopies > 1)
4952 if (dump_enabled_p ())
4953 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4954 "multiple types in double reduction");
4956 return false;
4959 /* In case of widenning multiplication by a constant, we update the type
4960 of the constant to be the type of the other operand. We check that the
4961 constant fits the type in the pattern recognition pass. */
4962 if (code == DOT_PROD_EXPR
4963 && !types_compatible_p (TREE_TYPE (ops[0]), TREE_TYPE (ops[1])))
4965 if (TREE_CODE (ops[0]) == INTEGER_CST)
4966 ops[0] = fold_convert (TREE_TYPE (ops[1]), ops[0]);
4967 else if (TREE_CODE (ops[1]) == INTEGER_CST)
4968 ops[1] = fold_convert (TREE_TYPE (ops[0]), ops[1]);
4969 else
4971 if (dump_enabled_p ())
4972 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4973 "invalid types in dot-prod");
4975 return false;
4979 if (!vec_stmt) /* transformation not required. */
4981 if (!vect_model_reduction_cost (stmt_info, epilog_reduc_code, ncopies))
4982 return false;
4983 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
4984 return true;
4987 /** Transform. **/
4989 if (dump_enabled_p ())
4990 dump_printf_loc (MSG_NOTE, vect_location, "transform reduction.");
4992 /* FORNOW: Multiple types are not supported for condition. */
4993 if (code == COND_EXPR)
4994 gcc_assert (ncopies == 1);
4996 /* Create the destination vector */
4997 vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
4999 /* In case the vectorization factor (VF) is bigger than the number
5000 of elements that we can fit in a vectype (nunits), we have to generate
5001 more than one vector stmt - i.e - we need to "unroll" the
5002 vector stmt by a factor VF/nunits. For more details see documentation
5003 in vectorizable_operation. */
5005 /* If the reduction is used in an outer loop we need to generate
5006 VF intermediate results, like so (e.g. for ncopies=2):
5007 r0 = phi (init, r0)
5008 r1 = phi (init, r1)
5009 r0 = x0 + r0;
5010 r1 = x1 + r1;
5011 (i.e. we generate VF results in 2 registers).
5012 In this case we have a separate def-use cycle for each copy, and therefore
5013 for each copy we get the vector def for the reduction variable from the
5014 respective phi node created for this copy.
5016 Otherwise (the reduction is unused in the loop nest), we can combine
5017 together intermediate results, like so (e.g. for ncopies=2):
5018 r = phi (init, r)
5019 r = x0 + r;
5020 r = x1 + r;
5021 (i.e. we generate VF/2 results in a single register).
5022 In this case for each copy we get the vector def for the reduction variable
5023 from the vectorized reduction operation generated in the previous iteration.
5026 if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope)
5028 single_defuse_cycle = true;
5029 epilog_copies = 1;
5031 else
5032 epilog_copies = ncopies;
5034 prev_stmt_info = NULL;
5035 prev_phi_info = NULL;
5036 if (slp_node)
5038 vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
5039 gcc_assert (TYPE_VECTOR_SUBPARTS (vectype_out)
5040 == TYPE_VECTOR_SUBPARTS (vectype_in));
5042 else
5044 vec_num = 1;
5045 vec_oprnds0.create (1);
5046 if (op_type == ternary_op)
5047 vec_oprnds1.create (1);
5050 phis.create (vec_num);
5051 vect_defs.create (vec_num);
5052 if (!slp_node)
5053 vect_defs.quick_push (NULL_TREE);
5055 for (j = 0; j < ncopies; j++)
5057 if (j == 0 || !single_defuse_cycle)
5059 for (i = 0; i < vec_num; i++)
5061 /* Create the reduction-phi that defines the reduction
5062 operand. */
5063 new_phi = create_phi_node (vec_dest, loop->header);
5064 set_vinfo_for_stmt (new_phi,
5065 new_stmt_vec_info (new_phi, loop_vinfo,
5066 NULL));
5067 if (j == 0 || slp_node)
5068 phis.quick_push (new_phi);
5072 if (code == COND_EXPR)
5074 gcc_assert (!slp_node);
5075 vectorizable_condition (stmt, gsi, vec_stmt,
5076 PHI_RESULT (phis[0]),
5077 reduc_index, NULL);
5078 /* Multiple types are not supported for condition. */
5079 break;
5082 /* Handle uses. */
5083 if (j == 0)
5085 op0 = ops[!reduc_index];
5086 if (op_type == ternary_op)
5088 if (reduc_index == 0)
5089 op1 = ops[2];
5090 else
5091 op1 = ops[1];
5094 if (slp_node)
5095 vect_get_vec_defs (op0, op1, stmt, &vec_oprnds0, &vec_oprnds1,
5096 slp_node, -1);
5097 else
5099 loop_vec_def0 = vect_get_vec_def_for_operand (ops[!reduc_index],
5100 stmt, NULL);
5101 vec_oprnds0.quick_push (loop_vec_def0);
5102 if (op_type == ternary_op)
5104 loop_vec_def1 = vect_get_vec_def_for_operand (op1, stmt,
5105 NULL);
5106 vec_oprnds1.quick_push (loop_vec_def1);
5110 else
5112 if (!slp_node)
5114 enum vect_def_type dt;
5115 gimple dummy_stmt;
5116 tree dummy;
5118 vect_is_simple_use (ops[!reduc_index], stmt, loop_vinfo, NULL,
5119 &dummy_stmt, &dummy, &dt);
5120 loop_vec_def0 = vect_get_vec_def_for_stmt_copy (dt,
5121 loop_vec_def0);
5122 vec_oprnds0[0] = loop_vec_def0;
5123 if (op_type == ternary_op)
5125 vect_is_simple_use (op1, stmt, loop_vinfo, NULL, &dummy_stmt,
5126 &dummy, &dt);
5127 loop_vec_def1 = vect_get_vec_def_for_stmt_copy (dt,
5128 loop_vec_def1);
5129 vec_oprnds1[0] = loop_vec_def1;
5133 if (single_defuse_cycle)
5134 reduc_def = gimple_assign_lhs (new_stmt);
5136 STMT_VINFO_RELATED_STMT (prev_phi_info) = new_phi;
5139 FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
5141 if (slp_node)
5142 reduc_def = PHI_RESULT (phis[i]);
5143 else
5145 if (!single_defuse_cycle || j == 0)
5146 reduc_def = PHI_RESULT (new_phi);
5149 def1 = ((op_type == ternary_op)
5150 ? vec_oprnds1[i] : NULL);
5151 if (op_type == binary_op)
5153 if (reduc_index == 0)
5154 expr = build2 (code, vectype_out, reduc_def, def0);
5155 else
5156 expr = build2 (code, vectype_out, def0, reduc_def);
5158 else
5160 if (reduc_index == 0)
5161 expr = build3 (code, vectype_out, reduc_def, def0, def1);
5162 else
5164 if (reduc_index == 1)
5165 expr = build3 (code, vectype_out, def0, reduc_def, def1);
5166 else
5167 expr = build3 (code, vectype_out, def0, def1, reduc_def);
5171 new_stmt = gimple_build_assign (vec_dest, expr);
5172 new_temp = make_ssa_name (vec_dest, new_stmt);
5173 gimple_assign_set_lhs (new_stmt, new_temp);
5174 vect_finish_stmt_generation (stmt, new_stmt, gsi);
5176 if (slp_node)
5178 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
5179 vect_defs.quick_push (new_temp);
5181 else
5182 vect_defs[0] = new_temp;
5185 if (slp_node)
5186 continue;
5188 if (j == 0)
5189 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
5190 else
5191 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
5193 prev_stmt_info = vinfo_for_stmt (new_stmt);
5194 prev_phi_info = vinfo_for_stmt (new_phi);
5197 /* Finalize the reduction-phi (set its arguments) and create the
5198 epilog reduction code. */
5199 if ((!single_defuse_cycle || code == COND_EXPR) && !slp_node)
5201 new_temp = gimple_assign_lhs (*vec_stmt);
5202 vect_defs[0] = new_temp;
5205 vect_create_epilog_for_reduction (vect_defs, stmt, epilog_copies,
5206 epilog_reduc_code, phis, reduc_index,
5207 double_reduc, slp_node);
5209 phis.release ();
5210 vect_defs.release ();
5211 vec_oprnds0.release ();
5212 vec_oprnds1.release ();
5214 return true;
5217 /* Function vect_min_worthwhile_factor.
5219 For a loop where we could vectorize the operation indicated by CODE,
5220 return the minimum vectorization factor that makes it worthwhile
5221 to use generic vectors. */
5223 vect_min_worthwhile_factor (enum tree_code code)
5225 switch (code)
5227 case PLUS_EXPR:
5228 case MINUS_EXPR:
5229 case NEGATE_EXPR:
5230 return 4;
5232 case BIT_AND_EXPR:
5233 case BIT_IOR_EXPR:
5234 case BIT_XOR_EXPR:
5235 case BIT_NOT_EXPR:
5236 return 2;
5238 default:
5239 return INT_MAX;
5244 /* Function vectorizable_induction
5246 Check if PHI performs an induction computation that can be vectorized.
5247 If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
5248 phi to replace it, put it in VEC_STMT, and add it to the same basic block.
5249 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
5251 bool
5252 vectorizable_induction (gimple phi, gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
5253 gimple *vec_stmt)
5255 stmt_vec_info stmt_info = vinfo_for_stmt (phi);
5256 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5257 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5258 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5259 int nunits = TYPE_VECTOR_SUBPARTS (vectype);
5260 int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
5261 tree vec_def;
5263 gcc_assert (ncopies >= 1);
5264 /* FORNOW. These restrictions should be relaxed. */
5265 if (nested_in_vect_loop_p (loop, phi))
5267 imm_use_iterator imm_iter;
5268 use_operand_p use_p;
5269 gimple exit_phi;
5270 edge latch_e;
5271 tree loop_arg;
5273 if (ncopies > 1)
5275 if (dump_enabled_p ())
5276 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5277 "multiple types in nested loop.");
5278 return false;
5281 exit_phi = NULL;
5282 latch_e = loop_latch_edge (loop->inner);
5283 loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
5284 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
5286 if (!flow_bb_inside_loop_p (loop->inner,
5287 gimple_bb (USE_STMT (use_p))))
5289 exit_phi = USE_STMT (use_p);
5290 break;
5293 if (exit_phi)
5295 stmt_vec_info exit_phi_vinfo = vinfo_for_stmt (exit_phi);
5296 if (!(STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
5297 && !STMT_VINFO_LIVE_P (exit_phi_vinfo)))
5299 if (dump_enabled_p ())
5300 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5301 "inner-loop induction only used outside "
5302 "of the outer vectorized loop.");
5303 return false;
5308 if (!STMT_VINFO_RELEVANT_P (stmt_info))
5309 return false;
5311 /* FORNOW: SLP not supported. */
5312 if (STMT_SLP_TYPE (stmt_info))
5313 return false;
5315 gcc_assert (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def);
5317 if (gimple_code (phi) != GIMPLE_PHI)
5318 return false;
5320 if (!vec_stmt) /* transformation not required. */
5322 STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
5323 if (dump_enabled_p ())
5324 dump_printf_loc (MSG_NOTE, vect_location,
5325 "=== vectorizable_induction ===");
5326 vect_model_induction_cost (stmt_info, ncopies);
5327 return true;
5330 /** Transform. **/
5332 if (dump_enabled_p ())
5333 dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.");
5335 vec_def = get_initial_def_for_induction (phi);
5336 *vec_stmt = SSA_NAME_DEF_STMT (vec_def);
5337 return true;
5340 /* Function vectorizable_live_operation.
5342 STMT computes a value that is used outside the loop. Check if
5343 it can be supported. */
5345 bool
5346 vectorizable_live_operation (gimple stmt,
5347 gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
5348 gimple *vec_stmt ATTRIBUTE_UNUSED)
5350 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5351 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5352 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5353 int i;
5354 int op_type;
5355 tree op;
5356 tree def;
5357 gimple def_stmt;
5358 enum vect_def_type dt;
5359 enum tree_code code;
5360 enum gimple_rhs_class rhs_class;
5362 gcc_assert (STMT_VINFO_LIVE_P (stmt_info));
5364 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def)
5365 return false;
5367 if (!is_gimple_assign (stmt))
5368 return false;
5370 if (TREE_CODE (gimple_assign_lhs (stmt)) != SSA_NAME)
5371 return false;
5373 /* FORNOW. CHECKME. */
5374 if (nested_in_vect_loop_p (loop, stmt))
5375 return false;
5377 code = gimple_assign_rhs_code (stmt);
5378 op_type = TREE_CODE_LENGTH (code);
5379 rhs_class = get_gimple_rhs_class (code);
5380 gcc_assert (rhs_class != GIMPLE_UNARY_RHS || op_type == unary_op);
5381 gcc_assert (rhs_class != GIMPLE_BINARY_RHS || op_type == binary_op);
5383 /* FORNOW: support only if all uses are invariant. This means
5384 that the scalar operations can remain in place, unvectorized.
5385 The original last scalar value that they compute will be used. */
5387 for (i = 0; i < op_type; i++)
5389 if (rhs_class == GIMPLE_SINGLE_RHS)
5390 op = TREE_OPERAND (gimple_op (stmt, 1), i);
5391 else
5392 op = gimple_op (stmt, i + 1);
5393 if (op
5394 && !vect_is_simple_use (op, stmt, loop_vinfo, NULL, &def_stmt, &def,
5395 &dt))
5397 if (dump_enabled_p ())
5398 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5399 "use not simple.");
5400 return false;
5403 if (dt != vect_external_def && dt != vect_constant_def)
5404 return false;
5407 /* No transformation is required for the cases we currently support. */
5408 return true;
5411 /* Kill any debug uses outside LOOP of SSA names defined in STMT. */
5413 static void
5414 vect_loop_kill_debug_uses (struct loop *loop, gimple stmt)
5416 ssa_op_iter op_iter;
5417 imm_use_iterator imm_iter;
5418 def_operand_p def_p;
5419 gimple ustmt;
5421 FOR_EACH_PHI_OR_STMT_DEF (def_p, stmt, op_iter, SSA_OP_DEF)
5423 FOR_EACH_IMM_USE_STMT (ustmt, imm_iter, DEF_FROM_PTR (def_p))
5425 basic_block bb;
5427 if (!is_gimple_debug (ustmt))
5428 continue;
5430 bb = gimple_bb (ustmt);
5432 if (!flow_bb_inside_loop_p (loop, bb))
5434 if (gimple_debug_bind_p (ustmt))
5436 if (dump_enabled_p ())
5437 dump_printf_loc (MSG_NOTE, vect_location,
5438 "killing debug use");
5440 gimple_debug_bind_reset_value (ustmt);
5441 update_stmt (ustmt);
5443 else
5444 gcc_unreachable ();
5450 /* Function vect_transform_loop.
5452 The analysis phase has determined that the loop is vectorizable.
5453 Vectorize the loop - created vectorized stmts to replace the scalar
5454 stmts in the loop, and update the loop exit condition. */
5456 void
5457 vect_transform_loop (loop_vec_info loop_vinfo)
5459 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5460 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
5461 int nbbs = loop->num_nodes;
5462 gimple_stmt_iterator si;
5463 int i;
5464 tree ratio = NULL;
5465 int vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
5466 bool grouped_store;
5467 bool slp_scheduled = false;
5468 unsigned int nunits;
5469 gimple stmt, pattern_stmt;
5470 gimple_seq pattern_def_seq = NULL;
5471 gimple_stmt_iterator pattern_def_si = gsi_none ();
5472 bool transform_pattern_stmt = false;
5473 bool check_profitability = false;
5474 int th;
5475 /* Record number of iterations before we started tampering with the profile. */
5476 gcov_type expected_iterations = expected_loop_iterations_unbounded (loop);
5478 if (dump_enabled_p ())
5479 dump_printf_loc (MSG_NOTE, vect_location, "=== vec_transform_loop ===");
5481 /* If profile is inprecise, we have chance to fix it up. */
5482 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
5483 expected_iterations = LOOP_VINFO_INT_NITERS (loop_vinfo);
5485 /* Use the more conservative vectorization threshold. If the number
5486 of iterations is constant assume the cost check has been performed
5487 by our caller. If the threshold makes all loops profitable that
5488 run at least the vectorization factor number of times checking
5489 is pointless, too. */
5490 th = ((PARAM_VALUE (PARAM_MIN_VECT_LOOP_BOUND)
5491 * LOOP_VINFO_VECT_FACTOR (loop_vinfo)) - 1);
5492 th = MAX (th, LOOP_VINFO_COST_MODEL_MIN_ITERS (loop_vinfo));
5493 if (th >= LOOP_VINFO_VECT_FACTOR (loop_vinfo) - 1
5494 && !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
5496 if (dump_enabled_p ())
5497 dump_printf_loc (MSG_NOTE, vect_location,
5498 "Profitability threshold is %d loop iterations.", th);
5499 check_profitability = true;
5502 /* Version the loop first, if required, so the profitability check
5503 comes first. */
5505 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
5506 || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
5508 vect_loop_versioning (loop_vinfo, th, check_profitability);
5509 check_profitability = false;
5512 /* Peel the loop if there are data refs with unknown alignment.
5513 Only one data ref with unknown store is allowed. */
5515 if (LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo))
5517 vect_do_peeling_for_alignment (loop_vinfo, th, check_profitability);
5518 check_profitability = false;
5521 /* If the loop has a symbolic number of iterations 'n' (i.e. it's not a
5522 compile time constant), or it is a constant that doesn't divide by the
5523 vectorization factor, then an epilog loop needs to be created.
5524 We therefore duplicate the loop: the original loop will be vectorized,
5525 and will compute the first (n/VF) iterations. The second copy of the loop
5526 will remain scalar and will compute the remaining (n%VF) iterations.
5527 (VF is the vectorization factor). */
5529 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
5530 || (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
5531 && LOOP_VINFO_INT_NITERS (loop_vinfo) % vectorization_factor != 0)
5532 || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
5533 vect_do_peeling_for_loop_bound (loop_vinfo, &ratio,
5534 th, check_profitability);
5535 else
5536 ratio = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
5537 LOOP_VINFO_INT_NITERS (loop_vinfo) / vectorization_factor);
5539 /* 1) Make sure the loop header has exactly two entries
5540 2) Make sure we have a preheader basic block. */
5542 gcc_assert (EDGE_COUNT (loop->header->preds) == 2);
5544 split_edge (loop_preheader_edge (loop));
5546 /* FORNOW: the vectorizer supports only loops which body consist
5547 of one basic block (header + empty latch). When the vectorizer will
5548 support more involved loop forms, the order by which the BBs are
5549 traversed need to be reconsidered. */
5551 for (i = 0; i < nbbs; i++)
5553 basic_block bb = bbs[i];
5554 stmt_vec_info stmt_info;
5555 gimple phi;
5557 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
5559 phi = gsi_stmt (si);
5560 if (dump_enabled_p ())
5562 dump_printf_loc (MSG_NOTE, vect_location,
5563 "------>vectorizing phi: ");
5564 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
5566 stmt_info = vinfo_for_stmt (phi);
5567 if (!stmt_info)
5568 continue;
5570 if (MAY_HAVE_DEBUG_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
5571 vect_loop_kill_debug_uses (loop, phi);
5573 if (!STMT_VINFO_RELEVANT_P (stmt_info)
5574 && !STMT_VINFO_LIVE_P (stmt_info))
5575 continue;
5577 if ((TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info))
5578 != (unsigned HOST_WIDE_INT) vectorization_factor)
5579 && dump_enabled_p ())
5580 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.");
5582 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def)
5584 if (dump_enabled_p ())
5585 dump_printf_loc (MSG_NOTE, vect_location, "transform phi.");
5586 vect_transform_stmt (phi, NULL, NULL, NULL, NULL);
5590 pattern_stmt = NULL;
5591 for (si = gsi_start_bb (bb); !gsi_end_p (si) || transform_pattern_stmt;)
5593 bool is_store;
5595 if (transform_pattern_stmt)
5596 stmt = pattern_stmt;
5597 else
5598 stmt = gsi_stmt (si);
5600 if (dump_enabled_p ())
5602 dump_printf_loc (MSG_NOTE, vect_location,
5603 "------>vectorizing statement: ");
5604 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
5607 stmt_info = vinfo_for_stmt (stmt);
5609 /* vector stmts created in the outer-loop during vectorization of
5610 stmts in an inner-loop may not have a stmt_info, and do not
5611 need to be vectorized. */
5612 if (!stmt_info)
5614 gsi_next (&si);
5615 continue;
5618 if (MAY_HAVE_DEBUG_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
5619 vect_loop_kill_debug_uses (loop, stmt);
5621 if (!STMT_VINFO_RELEVANT_P (stmt_info)
5622 && !STMT_VINFO_LIVE_P (stmt_info))
5624 if (STMT_VINFO_IN_PATTERN_P (stmt_info)
5625 && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
5626 && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
5627 || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
5629 stmt = pattern_stmt;
5630 stmt_info = vinfo_for_stmt (stmt);
5632 else
5634 gsi_next (&si);
5635 continue;
5638 else if (STMT_VINFO_IN_PATTERN_P (stmt_info)
5639 && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
5640 && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
5641 || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
5642 transform_pattern_stmt = true;
5644 /* If pattern statement has def stmts, vectorize them too. */
5645 if (is_pattern_stmt_p (stmt_info))
5647 if (pattern_def_seq == NULL)
5649 pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
5650 pattern_def_si = gsi_start (pattern_def_seq);
5652 else if (!gsi_end_p (pattern_def_si))
5653 gsi_next (&pattern_def_si);
5654 if (pattern_def_seq != NULL)
5656 gimple pattern_def_stmt = NULL;
5657 stmt_vec_info pattern_def_stmt_info = NULL;
5659 while (!gsi_end_p (pattern_def_si))
5661 pattern_def_stmt = gsi_stmt (pattern_def_si);
5662 pattern_def_stmt_info
5663 = vinfo_for_stmt (pattern_def_stmt);
5664 if (STMT_VINFO_RELEVANT_P (pattern_def_stmt_info)
5665 || STMT_VINFO_LIVE_P (pattern_def_stmt_info))
5666 break;
5667 gsi_next (&pattern_def_si);
5670 if (!gsi_end_p (pattern_def_si))
5672 if (dump_enabled_p ())
5674 dump_printf_loc (MSG_NOTE, vect_location,
5675 "==> vectorizing pattern def "
5676 "stmt: ");
5677 dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
5678 pattern_def_stmt, 0);
5681 stmt = pattern_def_stmt;
5682 stmt_info = pattern_def_stmt_info;
5684 else
5686 pattern_def_si = gsi_none ();
5687 transform_pattern_stmt = false;
5690 else
5691 transform_pattern_stmt = false;
5694 gcc_assert (STMT_VINFO_VECTYPE (stmt_info));
5695 nunits = (unsigned int) TYPE_VECTOR_SUBPARTS (
5696 STMT_VINFO_VECTYPE (stmt_info));
5697 if (!STMT_SLP_TYPE (stmt_info)
5698 && nunits != (unsigned int) vectorization_factor
5699 && dump_enabled_p ())
5700 /* For SLP VF is set according to unrolling factor, and not to
5701 vector size, hence for SLP this print is not valid. */
5702 dump_printf_loc (MSG_NOTE, vect_location,
5703 "multiple-types.");
5705 /* SLP. Schedule all the SLP instances when the first SLP stmt is
5706 reached. */
5707 if (STMT_SLP_TYPE (stmt_info))
5709 if (!slp_scheduled)
5711 slp_scheduled = true;
5713 if (dump_enabled_p ())
5714 dump_printf_loc (MSG_NOTE, vect_location,
5715 "=== scheduling SLP instances ===");
5717 vect_schedule_slp (loop_vinfo, NULL);
5720 /* Hybrid SLP stmts must be vectorized in addition to SLP. */
5721 if (!vinfo_for_stmt (stmt) || PURE_SLP_STMT (stmt_info))
5723 if (!transform_pattern_stmt && gsi_end_p (pattern_def_si))
5725 pattern_def_seq = NULL;
5726 gsi_next (&si);
5728 continue;
5732 /* -------- vectorize statement ------------ */
5733 if (dump_enabled_p ())
5734 dump_printf_loc (MSG_NOTE, vect_location, "transform statement.");
5736 grouped_store = false;
5737 is_store = vect_transform_stmt (stmt, &si, &grouped_store, NULL, NULL);
5738 if (is_store)
5740 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
5742 /* Interleaving. If IS_STORE is TRUE, the vectorization of the
5743 interleaving chain was completed - free all the stores in
5744 the chain. */
5745 gsi_next (&si);
5746 vect_remove_stores (GROUP_FIRST_ELEMENT (stmt_info));
5747 continue;
5749 else
5751 /* Free the attached stmt_vec_info and remove the stmt. */
5752 gimple store = gsi_stmt (si);
5753 free_stmt_vec_info (store);
5754 unlink_stmt_vdef (store);
5755 gsi_remove (&si, true);
5756 release_defs (store);
5757 continue;
5761 if (!transform_pattern_stmt && gsi_end_p (pattern_def_si))
5763 pattern_def_seq = NULL;
5764 gsi_next (&si);
5766 } /* stmts in BB */
5767 } /* BBs in loop */
5769 slpeel_make_loop_iterate_ntimes (loop, ratio);
5771 /* Reduce loop iterations by the vectorization factor. */
5772 scale_loop_profile (loop, GCOV_COMPUTE_SCALE (1, vectorization_factor),
5773 expected_iterations / vectorization_factor);
5774 loop->nb_iterations_upper_bound
5775 = loop->nb_iterations_upper_bound.udiv (double_int::from_uhwi (vectorization_factor),
5776 FLOOR_DIV_EXPR);
5777 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
5778 && loop->nb_iterations_upper_bound != double_int_zero)
5779 loop->nb_iterations_upper_bound = loop->nb_iterations_upper_bound - double_int_one;
5780 if (loop->any_estimate)
5782 loop->nb_iterations_estimate
5783 = loop->nb_iterations_estimate.udiv (double_int::from_uhwi (vectorization_factor),
5784 FLOOR_DIV_EXPR);
5785 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
5786 && loop->nb_iterations_estimate != double_int_zero)
5787 loop->nb_iterations_estimate = loop->nb_iterations_estimate - double_int_one;
5790 if (dump_enabled_p ())
5792 dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, vect_location,
5793 "LOOP VECTORIZED\n");
5794 if (loop->inner)
5795 dump_printf_loc (MSG_NOTE, vect_location,
5796 "OUTER LOOP VECTORIZED\n");