gcc/tree-vect-loop.c
1 /* Loop Vectorization
2 Copyright (C) 2003-2013 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com> and
4 Ira Rosen <irar@il.ibm.com>
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "dumpfile.h"
26 #include "tm.h"
27 #include "ggc.h"
28 #include "tree.h"
29 #include "basic-block.h"
30 #include "gimple-pretty-print.h"
31 #include "tree-flow.h"
32 #include "tree-pass.h"
33 #include "cfgloop.h"
34 #include "expr.h"
35 #include "recog.h"
36 #include "optabs.h"
37 #include "params.h"
38 #include "diagnostic-core.h"
39 #include "tree-chrec.h"
40 #include "tree-scalar-evolution.h"
41 #include "tree-vectorizer.h"
42 #include "target.h"
44 /* Loop Vectorization Pass.
46 This pass tries to vectorize loops.
48 For example, the vectorizer transforms the following simple loop:
50 short a[N]; short b[N]; short c[N]; int i;
52 for (i=0; i<N; i++){
53 a[i] = b[i] + c[i];
56 as if it was manually vectorized by rewriting the source code into:
58 typedef int __attribute__((mode(V8HI))) v8hi;
59 short a[N]; short b[N]; short c[N]; int i;
60 v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
61 v8hi va, vb, vc;
63 for (i=0; i<N/8; i++){
64 vb = pb[i];
65 vc = pc[i];
66 va = vb + vc;
67 pa[i] = va;
70 The main entry to this pass is vectorize_loops(), in which
71 the vectorizer applies a set of analyses on a given set of loops,
72 followed by the actual vectorization transformation for the loops that
73 had successfully passed the analysis phase.
74 Throughout this pass we make a distinction between two types of
75 data: scalars (which are represented by SSA_NAMES), and memory references
76 ("data-refs"). These two types of data require different handling both
77 during analysis and transformation. The types of data-refs that the
78 vectorizer currently supports are ARRAY_REFS whose base is an array DECL
79 (not a pointer), and INDIRECT_REFS through pointers; both array and pointer
80 accesses are required to have a simple (consecutive) access pattern.
82 Analysis phase:
83 ===============
84 The driver for the analysis phase is vect_analyze_loop().
85 It applies a set of analyses, some of which rely on the scalar evolution
86 analyzer (scev) developed by Sebastian Pop.
88 During the analysis phase the vectorizer records some information
89 per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
90 loop, as well as general information about the loop as a whole, which is
91 recorded in a "loop_vec_info" struct attached to each loop.
93 Transformation phase:
94 =====================
95 The loop transformation phase scans all the stmts in the loop, and
96 creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
97 the loop that needs to be vectorized. It inserts the vector code sequence
98 just before the scalar stmt S, and records a pointer to the vector code
99 in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
100 attached to S). This pointer will be used for the vectorization of following
101 stmts which use the def of stmt S. Stmt S is removed if it writes to memory;
102 otherwise, we rely on dead code elimination for removing it.
104 For example, say stmt S1 was vectorized into stmt VS1:
106 VS1: vb = px[i];
107 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
108 S2: a = b;
110 To vectorize stmt S2, the vectorizer first finds the stmt that defines
111 the operand 'b' (S1), and gets the relevant vector def 'vb' from the
112 vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)). The
113 resulting sequence would be:
115 VS1: vb = px[i];
116 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
117 VS2: va = vb;
118 S2: a = b; STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2
120 Operands that are not SSA_NAMEs, are data-refs that appear in
121 load/store operations (like 'x[i]' in S1), and are handled differently.
123 Target modeling:
124 =================
125 Currently the only target specific information that is used is the
126 size of the vector (in bytes) - "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".
127 Targets that can support different sizes of vectors will, for now, need
128 to specify one value for "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD". More
129 flexibility will be added in the future.
131 Since we only vectorize operations whose vector form can be
132 expressed using existing tree codes, to verify that an operation is
133 supported, the vectorizer checks the relevant optab at the relevant
134 machine_mode (e.g., optab_handler (add_optab, V8HImode)). If
135 the value found is CODE_FOR_nothing, then there's no target support, and
136 we can't vectorize the stmt.
138 For additional information on this project see:
139 http://gcc.gnu.org/projects/tree-ssa/vectorization.html
142 static void vect_estimate_min_profitable_iters (loop_vec_info, int *, int *);
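
/* A minimal illustrative sketch of the target-support check described in
   the comment above: an operation is vectorizable only if the relevant
   optab has an instruction for the vector mode.  optab_handler, add_optab
   and CODE_FOR_nothing are the interfaces named above; the helper itself
   is hypothetical.  */

static inline bool
example_target_supports_vector_plus (enum machine_mode vector_mode)
{
  /* CODE_FOR_nothing means the target provides no instruction for the
     addition optab in VECTOR_MODE, so such stmts cannot be vectorized.  */
  return optab_handler (add_optab, vector_mode) != CODE_FOR_nothing;
}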
144 /* Function vect_determine_vectorization_factor
146 Determine the vectorization factor (VF). VF is the number of data elements
147 that are operated upon in parallel in a single iteration of the vectorized
148 loop. For example, when vectorizing a loop that operates on 4-byte elements,
149 on a target with a vector size (VS) of 16 bytes, the VF is set to 4, since 4
150 elements can fit in a single vector register.
152 We currently support vectorization of loops in which all types operated upon
153 are of the same size. Therefore this function currently sets VF according to
154 the size of the types operated upon, and fails if there are multiple sizes
155 in the loop.
157 VF is also the factor by which the loop iterations are strip-mined, e.g.:
158 original loop:
159 for (i=0; i<N; i++){
160 a[i] = b[i] + c[i];
163 vectorized loop:
164 for (i=0; i<N; i+=VF){
165 a[i:VF] = b[i:VF] + c[i:VF];
169 static bool
170 vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
172 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
173 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
174 int nbbs = loop->num_nodes;
175 gimple_stmt_iterator si;
176 unsigned int vectorization_factor = 0;
177 tree scalar_type;
178 gimple phi;
179 tree vectype;
180 unsigned int nunits;
181 stmt_vec_info stmt_info;
182 int i;
183 HOST_WIDE_INT dummy;
184 gimple stmt, pattern_stmt = NULL;
185 gimple_seq pattern_def_seq = NULL;
186 gimple_stmt_iterator pattern_def_si = gsi_none ();
187 bool analyze_pattern_stmt = false;
189 if (dump_enabled_p ())
190 dump_printf_loc (MSG_NOTE, vect_location,
191 "=== vect_determine_vectorization_factor ===");
193 for (i = 0; i < nbbs; i++)
195 basic_block bb = bbs[i];
197 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
199 phi = gsi_stmt (si);
200 stmt_info = vinfo_for_stmt (phi);
201 if (dump_enabled_p ())
203 dump_printf_loc (MSG_NOTE, vect_location, "==> examining phi: ");
204 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
207 gcc_assert (stmt_info);
209 if (STMT_VINFO_RELEVANT_P (stmt_info))
211 gcc_assert (!STMT_VINFO_VECTYPE (stmt_info));
212 scalar_type = TREE_TYPE (PHI_RESULT (phi));
214 if (dump_enabled_p ())
216 dump_printf_loc (MSG_NOTE, vect_location,
217 "get vectype for scalar type: ");
218 dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
221 vectype = get_vectype_for_scalar_type (scalar_type);
222 if (!vectype)
224 if (dump_enabled_p ())
226 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
227 "not vectorized: unsupported "
228 "data-type ");
229 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
230 scalar_type);
232 return false;
234 STMT_VINFO_VECTYPE (stmt_info) = vectype;
236 if (dump_enabled_p ())
238 dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
239 dump_generic_expr (MSG_NOTE, TDF_SLIM, vectype);
242 nunits = TYPE_VECTOR_SUBPARTS (vectype);
243 if (dump_enabled_p ())
244 dump_printf_loc (MSG_NOTE, vect_location, "nunits = %d", nunits);
246 if (!vectorization_factor
247 || (nunits > vectorization_factor))
248 vectorization_factor = nunits;
252 for (si = gsi_start_bb (bb); !gsi_end_p (si) || analyze_pattern_stmt;)
254 tree vf_vectype;
256 if (analyze_pattern_stmt)
257 stmt = pattern_stmt;
258 else
259 stmt = gsi_stmt (si);
261 stmt_info = vinfo_for_stmt (stmt);
263 if (dump_enabled_p ())
265 dump_printf_loc (MSG_NOTE, vect_location,
266 "==> examining statement: ");
267 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
270 gcc_assert (stmt_info);
272 /* Skip stmts which do not need to be vectorized. */
273 if (!STMT_VINFO_RELEVANT_P (stmt_info)
274 && !STMT_VINFO_LIVE_P (stmt_info))
276 if (STMT_VINFO_IN_PATTERN_P (stmt_info)
277 && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
278 && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
279 || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
281 stmt = pattern_stmt;
282 stmt_info = vinfo_for_stmt (pattern_stmt);
283 if (dump_enabled_p ())
285 dump_printf_loc (MSG_NOTE, vect_location,
286 "==> examining pattern statement: ");
287 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
290 else
292 if (dump_enabled_p ())
293 dump_printf_loc (MSG_NOTE, vect_location, "skip.");
294 gsi_next (&si);
295 continue;
298 else if (STMT_VINFO_IN_PATTERN_P (stmt_info)
299 && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
300 && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
301 || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
302 analyze_pattern_stmt = true;
304 /* If a pattern statement has def stmts, analyze them too. */
305 if (is_pattern_stmt_p (stmt_info))
307 if (pattern_def_seq == NULL)
309 pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
310 pattern_def_si = gsi_start (pattern_def_seq);
312 else if (!gsi_end_p (pattern_def_si))
313 gsi_next (&pattern_def_si);
314 if (pattern_def_seq != NULL)
316 gimple pattern_def_stmt = NULL;
317 stmt_vec_info pattern_def_stmt_info = NULL;
319 while (!gsi_end_p (pattern_def_si))
321 pattern_def_stmt = gsi_stmt (pattern_def_si);
322 pattern_def_stmt_info
323 = vinfo_for_stmt (pattern_def_stmt);
324 if (STMT_VINFO_RELEVANT_P (pattern_def_stmt_info)
325 || STMT_VINFO_LIVE_P (pattern_def_stmt_info))
326 break;
327 gsi_next (&pattern_def_si);
330 if (!gsi_end_p (pattern_def_si))
332 if (dump_enabled_p ())
334 dump_printf_loc (MSG_NOTE, vect_location,
335 "==> examining pattern def stmt: ");
336 dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
337 pattern_def_stmt, 0);
340 stmt = pattern_def_stmt;
341 stmt_info = pattern_def_stmt_info;
343 else
345 pattern_def_si = gsi_none ();
346 analyze_pattern_stmt = false;
349 else
350 analyze_pattern_stmt = false;
353 if (gimple_get_lhs (stmt) == NULL_TREE)
355 if (dump_enabled_p ())
357 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
358 "not vectorized: irregular stmt.");
359 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt,
362 return false;
365 if (VECTOR_MODE_P (TYPE_MODE (gimple_expr_type (stmt))))
367 if (dump_enabled_p ())
369 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
370 "not vectorized: vector stmt in loop:");
371 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
373 return false;
376 if (STMT_VINFO_VECTYPE (stmt_info))
378 /* The only case in which a vectype has already been set is for stmts
379 that contain a dataref, or for "pattern-stmts" (stmts
380 generated by the vectorizer to represent/replace a certain
381 idiom). */
382 gcc_assert (STMT_VINFO_DATA_REF (stmt_info)
383 || is_pattern_stmt_p (stmt_info)
384 || !gsi_end_p (pattern_def_si));
385 vectype = STMT_VINFO_VECTYPE (stmt_info);
387 else
389 gcc_assert (!STMT_VINFO_DATA_REF (stmt_info));
390 scalar_type = TREE_TYPE (gimple_get_lhs (stmt));
391 if (dump_enabled_p ())
393 dump_printf_loc (MSG_NOTE, vect_location,
394 "get vectype for scalar type: ");
395 dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
397 vectype = get_vectype_for_scalar_type (scalar_type);
398 if (!vectype)
400 if (dump_enabled_p ())
402 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
403 "not vectorized: unsupported "
404 "data-type ");
405 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
406 scalar_type);
408 return false;
411 STMT_VINFO_VECTYPE (stmt_info) = vectype;
413 if (dump_enabled_p ())
415 dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
416 dump_generic_expr (MSG_NOTE, TDF_SLIM, vectype);
420 /* The vectorization factor is determined according to the smallest
421 scalar type (or the largest vector size, but we only
422 support one vector size per loop). */
423 scalar_type = vect_get_smallest_scalar_type (stmt, &dummy,
424 &dummy);
425 if (dump_enabled_p ())
427 dump_printf_loc (MSG_NOTE, vect_location,
428 "get vectype for scalar type: ");
429 dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
431 vf_vectype = get_vectype_for_scalar_type (scalar_type);
432 if (!vf_vectype)
434 if (dump_enabled_p ())
436 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
437 "not vectorized: unsupported data-type ");
438 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
439 scalar_type);
441 return false;
444 if ((GET_MODE_SIZE (TYPE_MODE (vectype))
445 != GET_MODE_SIZE (TYPE_MODE (vf_vectype))))
447 if (dump_enabled_p ())
449 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
450 "not vectorized: different sized vector "
451 "types in statement, ");
452 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
453 vectype);
454 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
455 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
456 vf_vectype);
458 return false;
461 if (dump_enabled_p ())
463 dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
464 dump_generic_expr (MSG_NOTE, TDF_SLIM, vf_vectype);
467 nunits = TYPE_VECTOR_SUBPARTS (vf_vectype);
468 if (dump_enabled_p ())
469 dump_printf_loc (MSG_NOTE, vect_location, "nunits = %d", nunits);
470 if (!vectorization_factor
471 || (nunits > vectorization_factor))
472 vectorization_factor = nunits;
474 if (!analyze_pattern_stmt && gsi_end_p (pattern_def_si))
476 pattern_def_seq = NULL;
477 gsi_next (&si);
482 /* TODO: Analyze cost. Decide if worth while to vectorize. */
483 if (dump_enabled_p ())
484 dump_printf_loc (MSG_NOTE, vect_location, "vectorization factor = %d",
485 vectorization_factor);
486 if (vectorization_factor <= 1)
488 if (dump_enabled_p ())
489 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
490 "not vectorized: unsupported data-type");
491 return false;
493 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
495 return true;
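
/* A minimal illustrative sketch of the vectorization-factor choice made in
   vect_determine_vectorization_factor above: VF is simply the largest
   number of vector subparts (TYPE_VECTOR_SUBPARTS) seen over all relevant
   stmts; a result of 0 or 1 means the loop cannot be vectorized.  The
   helper and its array argument are hypothetical.  */

static unsigned int
example_choose_vectorization_factor (const unsigned int *nunits, unsigned int n)
{
  unsigned int vf = 0;
  unsigned int i;

  for (i = 0; i < n; i++)
    if (nunits[i] > vf)
      vf = nunits[i];

  return vf;
}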
499 /* Function vect_is_simple_iv_evolution.
501 FORNOW: A simple evolution of an induction variable in the loop is
502 considered a polynomial evolution with constant step. */
504 static bool
505 vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init,
506 tree * step)
508 tree init_expr;
509 tree step_expr;
510 tree evolution_part = evolution_part_in_loop_num (access_fn, loop_nb);
512 /* When there is no evolution in this loop, the evolution function
513 is not "simple". */
514 if (evolution_part == NULL_TREE)
515 return false;
517 /* When the evolution is a polynomial of degree >= 2
518 the evolution function is not "simple". */
519 if (tree_is_chrec (evolution_part))
520 return false;
522 step_expr = evolution_part;
523 init_expr = unshare_expr (initial_condition_in_loop_num (access_fn, loop_nb));
525 if (dump_enabled_p ())
527 dump_printf_loc (MSG_NOTE, vect_location, "step: ");
528 dump_generic_expr (MSG_NOTE, TDF_SLIM, step_expr);
529 dump_printf (MSG_NOTE, ", init: ");
530 dump_generic_expr (MSG_NOTE, TDF_SLIM, init_expr);
533 *init = init_expr;
534 *step = step_expr;
536 if (TREE_CODE (step_expr) != INTEGER_CST)
538 if (dump_enabled_p ())
539 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
540 "step unknown.");
541 return false;
544 return true;
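
/* An illustrative example of a "simple" evolution as accepted above:
   for an induction such as

       for (i = 7; i < n; i += 3)  ...

   the access function of 'i' is the chrec {7, +, 3}_loop, so init_expr
   is 7 and step_expr is 3 (an INTEGER_CST).  A step that is not an
   INTEGER_CST, or an evolution that is itself a chrec (degree >= 2),
   is rejected.  */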
547 /* Function vect_analyze_scalar_cycles_1.
549 Examine the cross iteration def-use cycles of scalar variables
550 in LOOP. LOOP_VINFO represents the loop that is now being
551 considered for vectorization (can be LOOP, or an outer-loop
552 enclosing LOOP). */
554 static void
555 vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, struct loop *loop)
557 basic_block bb = loop->header;
558 tree dumy;
559 vec<gimple> worklist;
560 worklist.create (64);
561 gimple_stmt_iterator gsi;
562 bool double_reduc;
564 if (dump_enabled_p ())
565 dump_printf_loc (MSG_NOTE, vect_location,
566 "=== vect_analyze_scalar_cycles ===");
568 /* First - identify all inductions. Reduction detection assumes that all the
569 inductions have been identified; therefore, this order must not be
570 changed. */
571 for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
573 gimple phi = gsi_stmt (gsi);
574 tree access_fn = NULL;
575 tree def = PHI_RESULT (phi);
576 stmt_vec_info stmt_vinfo = vinfo_for_stmt (phi);
578 if (dump_enabled_p ())
580 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: ");
581 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
584 /* Skip virtual phi's. The data dependences that are associated with
585 virtual defs/uses (i.e., memory accesses) are analyzed elsewhere. */
586 if (virtual_operand_p (def))
587 continue;
589 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_unknown_def_type;
591 /* Analyze the evolution function. */
592 access_fn = analyze_scalar_evolution (loop, def);
593 if (access_fn)
595 STRIP_NOPS (access_fn);
596 if (dump_enabled_p ())
598 dump_printf_loc (MSG_NOTE, vect_location,
599 "Access function of PHI: ");
600 dump_generic_expr (MSG_NOTE, TDF_SLIM, access_fn);
602 STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo)
603 = evolution_part_in_loop_num (access_fn, loop->num);
606 if (!access_fn
607 || !vect_is_simple_iv_evolution (loop->num, access_fn, &dumy, &dumy))
609 worklist.safe_push (phi);
610 continue;
613 gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo) != NULL_TREE);
615 if (dump_enabled_p ())
616 dump_printf_loc (MSG_NOTE, vect_location, "Detected induction.");
617 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_induction_def;
621 /* Second - identify all reductions and nested cycles. */
622 while (worklist.length () > 0)
624 gimple phi = worklist.pop ();
625 tree def = PHI_RESULT (phi);
626 stmt_vec_info stmt_vinfo = vinfo_for_stmt (phi);
627 gimple reduc_stmt;
628 bool nested_cycle;
630 if (dump_enabled_p ())
632 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: ");
633 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
636 gcc_assert (!virtual_operand_p (def)
637 && STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_unknown_def_type);
639 nested_cycle = (loop != LOOP_VINFO_LOOP (loop_vinfo));
640 reduc_stmt = vect_force_simple_reduction (loop_vinfo, phi, !nested_cycle,
641 &double_reduc);
642 if (reduc_stmt)
644 if (double_reduc)
646 if (dump_enabled_p ())
647 dump_printf_loc (MSG_NOTE, vect_location,
648 "Detected double reduction.");
650 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_double_reduction_def;
651 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
652 vect_double_reduction_def;
654 else
656 if (nested_cycle)
658 if (dump_enabled_p ())
659 dump_printf_loc (MSG_NOTE, vect_location,
660 "Detected vectorizable nested cycle.");
662 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_nested_cycle;
663 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
664 vect_nested_cycle;
666 else
668 if (dump_enabled_p ())
669 dump_printf_loc (MSG_NOTE, vect_location,
670 "Detected reduction.");
672 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def;
673 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
674 vect_reduction_def;
675 /* Store the reduction cycles for possible vectorization in
676 loop-aware SLP. */
677 LOOP_VINFO_REDUCTIONS (loop_vinfo).safe_push (reduc_stmt);
681 else
682 if (dump_enabled_p ())
683 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
684 "Unknown def-use cycle pattern.");
687 worklist.release ();
691 /* Function vect_analyze_scalar_cycles.
693 Examine the cross iteration def-use cycles of scalar variables, by
694 analyzing the loop-header PHIs of scalar variables. Classify each
695 cycle as one of the following: invariant, induction, reduction, unknown.
696 We do that for the loop represented by LOOP_VINFO, and also for its
697 inner-loop, if it exists.
698 Examples for scalar cycles:
700 Example1: reduction:
702 loop1:
703 for (i=0; i<N; i++)
704 sum += a[i];
706 Example2: induction:
708 loop2:
709 for (i=0; i<N; i++)
710 a[i] = i; */
712 static void
713 vect_analyze_scalar_cycles (loop_vec_info loop_vinfo)
715 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
717 vect_analyze_scalar_cycles_1 (loop_vinfo, loop);
719 /* When vectorizing an outer-loop, the inner-loop is executed sequentially.
720 Reductions in such an inner-loop therefore have different properties than
721 the reductions in the nest that gets vectorized:
722 1. When vectorized, they are executed in the same order as in the original
723 scalar loop, so we can't change the order of computation when
724 vectorizing them.
725 2. FIXME: Inner-loop reductions can be used in the inner-loop, so the
726 current checks are too strict. */
728 if (loop->inner)
729 vect_analyze_scalar_cycles_1 (loop_vinfo, loop->inner);
732 /* Function vect_get_loop_niters.
734 Determine how many iterations the loop is executed.
735 If an expression that represents the number of iterations
736 can be constructed, place it in NUMBER_OF_ITERATIONS.
737 Return the loop exit condition. */
739 static gimple
740 vect_get_loop_niters (struct loop *loop, tree *number_of_iterations)
742 tree niters;
744 if (dump_enabled_p ())
745 dump_printf_loc (MSG_NOTE, vect_location,
746 "=== get_loop_niters ===");
747 niters = number_of_exit_cond_executions (loop);
749 if (niters != NULL_TREE
750 && niters != chrec_dont_know)
752 *number_of_iterations = niters;
754 if (dump_enabled_p ())
756 dump_printf_loc (MSG_NOTE, vect_location, "==> get_loop_niters:");
757 dump_generic_expr (MSG_NOTE, TDF_SLIM, *number_of_iterations);
761 return get_loop_exit_condition (loop);
765 /* Function bb_in_loop_p
767 Used as predicate for dfs order traversal of the loop bbs. */
769 static bool
770 bb_in_loop_p (const_basic_block bb, const void *data)
772 const struct loop *const loop = (const struct loop *)data;
773 if (flow_bb_inside_loop_p (loop, bb))
774 return true;
775 return false;
779 /* Function new_loop_vec_info.
781 Create and initialize a new loop_vec_info struct for LOOP, as well as
782 stmt_vec_info structs for all the stmts in LOOP. */
784 static loop_vec_info
785 new_loop_vec_info (struct loop *loop)
787 loop_vec_info res;
788 basic_block *bbs;
789 gimple_stmt_iterator si;
790 unsigned int i, nbbs;
792 res = (loop_vec_info) xcalloc (1, sizeof (struct _loop_vec_info));
793 LOOP_VINFO_LOOP (res) = loop;
795 bbs = get_loop_body (loop);
797 /* Create/Update stmt_info for all stmts in the loop. */
798 for (i = 0; i < loop->num_nodes; i++)
800 basic_block bb = bbs[i];
802 /* BBs in a nested inner-loop will already have been processed (because
803 we will have called vect_analyze_loop_form for any nested inner-loop).
804 Therefore, for stmts in an inner-loop we just want to update the
805 STMT_VINFO_LOOP_VINFO field of their stmt_info to point to the new
806 loop_info of the outer-loop we are currently considering for vectorization
807 (instead of the loop_info of the inner-loop).
808 For stmts in other BBs we need to create a stmt_info from scratch. */
809 if (bb->loop_father != loop)
811 /* Inner-loop bb. */
812 gcc_assert (loop->inner && bb->loop_father == loop->inner);
813 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
815 gimple phi = gsi_stmt (si);
816 stmt_vec_info stmt_info = vinfo_for_stmt (phi);
817 loop_vec_info inner_loop_vinfo =
818 STMT_VINFO_LOOP_VINFO (stmt_info);
819 gcc_assert (loop->inner == LOOP_VINFO_LOOP (inner_loop_vinfo));
820 STMT_VINFO_LOOP_VINFO (stmt_info) = res;
822 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
824 gimple stmt = gsi_stmt (si);
825 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
826 loop_vec_info inner_loop_vinfo =
827 STMT_VINFO_LOOP_VINFO (stmt_info);
828 gcc_assert (loop->inner == LOOP_VINFO_LOOP (inner_loop_vinfo));
829 STMT_VINFO_LOOP_VINFO (stmt_info) = res;
832 else
834 /* bb in current nest. */
835 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
837 gimple phi = gsi_stmt (si);
838 gimple_set_uid (phi, 0);
839 set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, res, NULL));
842 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
844 gimple stmt = gsi_stmt (si);
845 gimple_set_uid (stmt, 0);
846 set_vinfo_for_stmt (stmt, new_stmt_vec_info (stmt, res, NULL));
851 /* CHECKME: We want to visit all BBs before their successors (except for
852 latch blocks, for which this assertion wouldn't hold). In the simple
853 case of the loop forms we allow, a dfs order of the BBs would be the same
854 as reversed postorder traversal, so we are safe. */
856 free (bbs);
857 bbs = XCNEWVEC (basic_block, loop->num_nodes);
858 nbbs = dfs_enumerate_from (loop->header, 0, bb_in_loop_p,
859 bbs, loop->num_nodes, loop);
860 gcc_assert (nbbs == loop->num_nodes);
862 LOOP_VINFO_BBS (res) = bbs;
863 LOOP_VINFO_NITERS (res) = NULL;
864 LOOP_VINFO_NITERS_UNCHANGED (res) = NULL;
865 LOOP_VINFO_COST_MODEL_MIN_ITERS (res) = 0;
866 LOOP_VINFO_VECTORIZABLE_P (res) = 0;
867 LOOP_PEELING_FOR_ALIGNMENT (res) = 0;
868 LOOP_VINFO_VECT_FACTOR (res) = 0;
869 LOOP_VINFO_LOOP_NEST (res).create (3);
870 LOOP_VINFO_DATAREFS (res).create (10);
871 LOOP_VINFO_DDRS (res).create (10 * 10);
872 LOOP_VINFO_UNALIGNED_DR (res) = NULL;
873 LOOP_VINFO_MAY_MISALIGN_STMTS (res).create (
874 PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIGNMENT_CHECKS));
875 LOOP_VINFO_MAY_ALIAS_DDRS (res).create (
876 PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS));
877 LOOP_VINFO_GROUPED_STORES (res).create (10);
878 LOOP_VINFO_REDUCTIONS (res).create (10);
879 LOOP_VINFO_REDUCTION_CHAINS (res).create (10);
880 LOOP_VINFO_SLP_INSTANCES (res).create (10);
881 LOOP_VINFO_SLP_UNROLLING_FACTOR (res) = 1;
882 LOOP_VINFO_PEELING_HTAB (res) = NULL;
883 LOOP_VINFO_TARGET_COST_DATA (res) = init_cost (loop);
884 LOOP_VINFO_PEELING_FOR_GAPS (res) = false;
885 LOOP_VINFO_OPERANDS_SWAPPED (res) = false;
887 return res;
891 /* Function destroy_loop_vec_info.
893 Free LOOP_VINFO struct, as well as all the stmt_vec_info structs of all the
894 stmts in the loop. */
896 void
897 destroy_loop_vec_info (loop_vec_info loop_vinfo, bool clean_stmts)
899 struct loop *loop;
900 basic_block *bbs;
901 int nbbs;
902 gimple_stmt_iterator si;
903 int j;
904 vec<slp_instance> slp_instances;
905 slp_instance instance;
906 bool swapped;
908 if (!loop_vinfo)
909 return;
911 loop = LOOP_VINFO_LOOP (loop_vinfo);
913 bbs = LOOP_VINFO_BBS (loop_vinfo);
914 nbbs = clean_stmts ? loop->num_nodes : 0;
915 swapped = LOOP_VINFO_OPERANDS_SWAPPED (loop_vinfo);
917 for (j = 0; j < nbbs; j++)
919 basic_block bb = bbs[j];
920 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
921 free_stmt_vec_info (gsi_stmt (si));
923 for (si = gsi_start_bb (bb); !gsi_end_p (si); )
925 gimple stmt = gsi_stmt (si);
927 /* We may have broken canonical form by moving a constant
928 into RHS1 of a commutative op. Fix such occurrences. */
929 if (swapped && is_gimple_assign (stmt))
931 enum tree_code code = gimple_assign_rhs_code (stmt);
933 if ((code == PLUS_EXPR
934 || code == POINTER_PLUS_EXPR
935 || code == MULT_EXPR)
936 && CONSTANT_CLASS_P (gimple_assign_rhs1 (stmt)))
937 swap_tree_operands (stmt,
938 gimple_assign_rhs1_ptr (stmt),
939 gimple_assign_rhs2_ptr (stmt));
942 /* Free stmt_vec_info. */
943 free_stmt_vec_info (stmt);
944 gsi_next (&si);
948 free (LOOP_VINFO_BBS (loop_vinfo));
949 free_data_refs (LOOP_VINFO_DATAREFS (loop_vinfo));
950 free_dependence_relations (LOOP_VINFO_DDRS (loop_vinfo));
951 LOOP_VINFO_LOOP_NEST (loop_vinfo).release ();
952 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).release ();
953 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).release ();
954 slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
955 FOR_EACH_VEC_ELT (slp_instances, j, instance)
956 vect_free_slp_instance (instance);
958 LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
959 LOOP_VINFO_GROUPED_STORES (loop_vinfo).release ();
960 LOOP_VINFO_REDUCTIONS (loop_vinfo).release ();
961 LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).release ();
963 if (LOOP_VINFO_PEELING_HTAB (loop_vinfo))
964 htab_delete (LOOP_VINFO_PEELING_HTAB (loop_vinfo));
966 destroy_cost_data (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo));
968 free (loop_vinfo);
969 loop->aux = NULL;
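
/* An illustrative example of the canonical-form fix-up performed above:
   if reduction detection swapped the operands of

       _1 = x_2 + 5;          <-- canonical form, constant in rhs2

   so that the constant ended up in rhs1 (_1 = 5 + x_2), the loop above
   swaps the operands back before freeing the stmt_vec_infos, so later
   passes see canonical GIMPLE again.  */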
973 /* Function vect_analyze_loop_1.
975 Apply a set of analyses on LOOP, and create a loop_vec_info struct
976 for it. The different analyses will record information in the
977 loop_vec_info struct. This is a subset of the analyses applied in
978 vect_analyze_loop, to be applied on an inner-loop nested in the loop
979 that is now considered for (outer-loop) vectorization. */
981 static loop_vec_info
982 vect_analyze_loop_1 (struct loop *loop)
984 loop_vec_info loop_vinfo;
986 if (dump_enabled_p ())
987 dump_printf_loc (MSG_NOTE, vect_location,
988 "===== analyze_loop_nest_1 =====");
990 /* Check the CFG characteristics of the loop (nesting, entry/exit, etc.). */
992 loop_vinfo = vect_analyze_loop_form (loop);
993 if (!loop_vinfo)
995 if (dump_enabled_p ())
996 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
997 "bad inner-loop form.");
998 return NULL;
1001 return loop_vinfo;
1005 /* Function vect_analyze_loop_form.
1007 Verify that certain CFG restrictions hold, including:
1008 - the loop has a pre-header
1009 - the loop has a single entry and exit
1010 - the loop exit condition is simple enough, and the number of iterations
1011 can be analyzed (a countable loop). */
1013 loop_vec_info
1014 vect_analyze_loop_form (struct loop *loop)
1016 loop_vec_info loop_vinfo;
1017 gimple loop_cond;
1018 tree number_of_iterations = NULL;
1019 loop_vec_info inner_loop_vinfo = NULL;
1021 if (dump_enabled_p ())
1022 dump_printf_loc (MSG_NOTE, vect_location,
1023 "=== vect_analyze_loop_form ===");
1025 /* Different restrictions apply when we are considering an inner-most loop,
1026 vs. an outer (nested) loop.
1027 (FORNOW. May want to relax some of these restrictions in the future). */
1029 if (!loop->inner)
1031 /* Inner-most loop. We currently require that the number of BBs is
1032 exactly 2 (the header and latch). Vectorizable inner-most loops
1033 look like this:
1035 (pre-header)
1037 header <--------+
1038 | | |
1039 | +--> latch --+
1041 (exit-bb) */
1043 if (loop->num_nodes != 2)
1045 if (dump_enabled_p ())
1046 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1047 "not vectorized: control flow in loop.");
1048 return NULL;
1051 if (empty_block_p (loop->header))
1053 if (dump_enabled_p ())
1054 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1055 "not vectorized: empty loop.");
1056 return NULL;
1059 else
1061 struct loop *innerloop = loop->inner;
1062 edge entryedge;
1064 /* Nested loop. We currently require that the loop is doubly-nested,
1065 contains a single inner loop, and the number of BBs is exactly 5.
1066 Vectorizable outer-loops look like this:
1068 (pre-header)
1070 header <---+
1072 inner-loop |
1074 tail ------+
1076 (exit-bb)
1078 The inner-loop has the properties expected of inner-most loops
1079 as described above. */
1081 if ((loop->inner)->inner || (loop->inner)->next)
1083 if (dump_enabled_p ())
1084 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1085 "not vectorized: multiple nested loops.");
1086 return NULL;
1089 /* Analyze the inner-loop. */
1090 inner_loop_vinfo = vect_analyze_loop_1 (loop->inner);
1091 if (!inner_loop_vinfo)
1093 if (dump_enabled_p ())
1094 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1095 "not vectorized: Bad inner loop.");
1096 return NULL;
1099 if (!expr_invariant_in_loop_p (loop,
1100 LOOP_VINFO_NITERS (inner_loop_vinfo)))
1102 if (dump_enabled_p ())
1103 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1104 "not vectorized: inner-loop count not invariant.");
1105 destroy_loop_vec_info (inner_loop_vinfo, true);
1106 return NULL;
1109 if (loop->num_nodes != 5)
1111 if (dump_enabled_p ())
1112 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1113 "not vectorized: control flow in loop.");
1114 destroy_loop_vec_info (inner_loop_vinfo, true);
1115 return NULL;
1118 gcc_assert (EDGE_COUNT (innerloop->header->preds) == 2);
1119 entryedge = EDGE_PRED (innerloop->header, 0);
1120 if (EDGE_PRED (innerloop->header, 0)->src == innerloop->latch)
1121 entryedge = EDGE_PRED (innerloop->header, 1);
1123 if (entryedge->src != loop->header
1124 || !single_exit (innerloop)
1125 || single_exit (innerloop)->dest != EDGE_PRED (loop->latch, 0)->src)
1127 if (dump_enabled_p ())
1128 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1129 "not vectorized: unsupported outerloop form.");
1130 destroy_loop_vec_info (inner_loop_vinfo, true);
1131 return NULL;
1134 if (dump_enabled_p ())
1135 dump_printf_loc (MSG_NOTE, vect_location,
1136 "Considering outer-loop vectorization.");
1139 if (!single_exit (loop)
1140 || EDGE_COUNT (loop->header->preds) != 2)
1142 if (dump_enabled_p ())
1144 if (!single_exit (loop))
1145 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1146 "not vectorized: multiple exits.");
1147 else if (EDGE_COUNT (loop->header->preds) != 2)
1148 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1149 "not vectorized: too many incoming edges.");
1151 if (inner_loop_vinfo)
1152 destroy_loop_vec_info (inner_loop_vinfo, true);
1153 return NULL;
1156 /* We assume that the loop exit condition is at the end of the loop, i.e.,
1157 that the loop is represented as a do-while (with a proper if-guard
1158 before the loop if needed), where the loop header contains all the
1159 executable statements, and the latch is empty. */
1160 if (!empty_block_p (loop->latch)
1161 || !gimple_seq_empty_p (phi_nodes (loop->latch)))
1163 if (dump_enabled_p ())
1164 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1165 "not vectorized: latch block not empty.");
1166 if (inner_loop_vinfo)
1167 destroy_loop_vec_info (inner_loop_vinfo, true);
1168 return NULL;
1171 /* Make sure there exists a single-predecessor exit bb: */
1172 if (!single_pred_p (single_exit (loop)->dest))
1174 edge e = single_exit (loop);
1175 if (!(e->flags & EDGE_ABNORMAL))
1177 split_loop_exit_edge (e);
1178 if (dump_enabled_p ())
1179 dump_printf (MSG_NOTE, "split exit edge.");
1181 else
1183 if (dump_enabled_p ())
1184 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1185 "not vectorized: abnormal loop exit edge.");
1186 if (inner_loop_vinfo)
1187 destroy_loop_vec_info (inner_loop_vinfo, true);
1188 return NULL;
1192 loop_cond = vect_get_loop_niters (loop, &number_of_iterations);
1193 if (!loop_cond)
1195 if (dump_enabled_p ())
1196 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1197 "not vectorized: complicated exit condition.");
1198 if (inner_loop_vinfo)
1199 destroy_loop_vec_info (inner_loop_vinfo, true);
1200 return NULL;
1203 if (!number_of_iterations)
1205 if (dump_enabled_p ())
1206 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1207 "not vectorized: number of iterations cannot be "
1208 "computed.");
1209 if (inner_loop_vinfo)
1210 destroy_loop_vec_info (inner_loop_vinfo, true);
1211 return NULL;
1214 if (chrec_contains_undetermined (number_of_iterations))
1216 if (dump_enabled_p ())
1217 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1218 "Infinite number of iterations.");
1219 if (inner_loop_vinfo)
1220 destroy_loop_vec_info (inner_loop_vinfo, true);
1221 return NULL;
1224 if (!NITERS_KNOWN_P (number_of_iterations))
1226 if (dump_enabled_p ())
1228 dump_printf_loc (MSG_NOTE, vect_location,
1229 "Symbolic number of iterations is ");
1230 dump_generic_expr (MSG_NOTE, TDF_DETAILS, number_of_iterations);
1233 else if (TREE_INT_CST_LOW (number_of_iterations) == 0)
1235 if (dump_enabled_p ())
1236 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1237 "not vectorized: number of iterations = 0.");
1238 if (inner_loop_vinfo)
1239 destroy_loop_vec_info (inner_loop_vinfo, true);
1240 return NULL;
1243 loop_vinfo = new_loop_vec_info (loop);
1244 LOOP_VINFO_NITERS (loop_vinfo) = number_of_iterations;
1245 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = number_of_iterations;
1247 STMT_VINFO_TYPE (vinfo_for_stmt (loop_cond)) = loop_exit_ctrl_vec_info_type;
1249 /* CHECKME: May want to keep it around in the future. */
1250 if (inner_loop_vinfo)
1251 destroy_loop_vec_info (inner_loop_vinfo, false);
1253 gcc_assert (!loop->aux);
1254 loop->aux = loop_vinfo;
1255 return loop_vinfo;
1259 /* Function vect_analyze_loop_operations.
1261 Scan the loop stmts and make sure they are all vectorizable. */
1263 static bool
1264 vect_analyze_loop_operations (loop_vec_info loop_vinfo, bool slp)
1266 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1267 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1268 int nbbs = loop->num_nodes;
1269 gimple_stmt_iterator si;
1270 unsigned int vectorization_factor = 0;
1271 int i;
1272 gimple phi;
1273 stmt_vec_info stmt_info;
1274 bool need_to_vectorize = false;
1275 int min_profitable_iters;
1276 int min_scalar_loop_bound;
1277 unsigned int th;
1278 bool only_slp_in_loop = true, ok;
1279 HOST_WIDE_INT max_niter;
1280 HOST_WIDE_INT estimated_niter;
1281 int min_profitable_estimate;
1283 if (dump_enabled_p ())
1284 dump_printf_loc (MSG_NOTE, vect_location,
1285 "=== vect_analyze_loop_operations ===");
1287 gcc_assert (LOOP_VINFO_VECT_FACTOR (loop_vinfo));
1288 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1289 if (slp)
1291 /* If all the stmts in the loop can be SLPed, we perform only SLP, and the
1292 vectorization factor of the loop is the unrolling factor required by
1293 the SLP instances. If that unrolling factor is 1, we say that we
1294 perform pure SLP on the loop - cross-iteration parallelism is not
1295 exploited. */
1296 for (i = 0; i < nbbs; i++)
1298 basic_block bb = bbs[i];
1299 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1301 gimple stmt = gsi_stmt (si);
1302 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1303 gcc_assert (stmt_info);
1304 if ((STMT_VINFO_RELEVANT_P (stmt_info)
1305 || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1306 && !PURE_SLP_STMT (stmt_info))
1307 /* STMT needs both SLP and loop-based vectorization. */
1308 only_slp_in_loop = false;
1312 if (only_slp_in_loop)
1313 vectorization_factor = LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo);
1314 else
1315 vectorization_factor = least_common_multiple (vectorization_factor,
1316 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo));
1318 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
1319 if (dump_enabled_p ())
1320 dump_printf_loc (MSG_NOTE, vect_location,
1321 "Updating vectorization factor to %d ",
1322 vectorization_factor);
1325 for (i = 0; i < nbbs; i++)
1327 basic_block bb = bbs[i];
1329 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
1331 phi = gsi_stmt (si);
1332 ok = true;
1334 stmt_info = vinfo_for_stmt (phi);
1335 if (dump_enabled_p ())
1337 dump_printf_loc (MSG_NOTE, vect_location, "examining phi: ");
1338 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
1341 /* Inner-loop loop-closed exit phi in outer-loop vectorization
1342 (i.e., a phi in the tail of the outer-loop). */
1343 if (! is_loop_header_bb_p (bb))
1345 /* FORNOW: we currently don't support the case that these phis
1346 are not used in the outer-loop (unless it is a double reduction,
1347 i.e., this phi is vect_reduction_def), because this case
1348 requires us to actually do something here. */
1349 if ((!STMT_VINFO_RELEVANT_P (stmt_info)
1350 || STMT_VINFO_LIVE_P (stmt_info))
1351 && STMT_VINFO_DEF_TYPE (stmt_info)
1352 != vect_double_reduction_def)
1354 if (dump_enabled_p ())
1355 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1356 "Unsupported loop-closed phi in "
1357 "outer-loop.");
1358 return false;
1361 /* If PHI is used in the outer loop, we check that its operand
1362 is defined in the inner loop. */
1363 if (STMT_VINFO_RELEVANT_P (stmt_info))
1365 tree phi_op;
1366 gimple op_def_stmt;
1368 if (gimple_phi_num_args (phi) != 1)
1369 return false;
1371 phi_op = PHI_ARG_DEF (phi, 0);
1372 if (TREE_CODE (phi_op) != SSA_NAME)
1373 return false;
1375 op_def_stmt = SSA_NAME_DEF_STMT (phi_op);
1376 if (!op_def_stmt
1377 || !flow_bb_inside_loop_p (loop, gimple_bb (op_def_stmt))
1378 || !vinfo_for_stmt (op_def_stmt))
1379 return false;
1381 if (STMT_VINFO_RELEVANT (vinfo_for_stmt (op_def_stmt))
1382 != vect_used_in_outer
1383 && STMT_VINFO_RELEVANT (vinfo_for_stmt (op_def_stmt))
1384 != vect_used_in_outer_by_reduction)
1385 return false;
1388 continue;
1391 gcc_assert (stmt_info);
1393 if (STMT_VINFO_LIVE_P (stmt_info))
1395 /* FORNOW: not yet supported. */
1396 if (dump_enabled_p ())
1397 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1398 "not vectorized: value used after loop.");
1399 return false;
1402 if (STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope
1403 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
1405 /* A scalar-dependence cycle that we don't support. */
1406 if (dump_enabled_p ())
1407 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1408 "not vectorized: scalar dependence cycle.");
1409 return false;
1412 if (STMT_VINFO_RELEVANT_P (stmt_info))
1414 need_to_vectorize = true;
1415 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def)
1416 ok = vectorizable_induction (phi, NULL, NULL);
1419 if (!ok)
1421 if (dump_enabled_p ())
1423 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1424 "not vectorized: relevant phi not "
1425 "supported: ");
1426 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, phi, 0);
1428 return false;
1432 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1434 gimple stmt = gsi_stmt (si);
1435 if (!vect_analyze_stmt (stmt, &need_to_vectorize, NULL))
1436 return false;
1438 } /* bbs */
1440 /* All operations in the loop are either irrelevant (they deal with loop
1441 control, or are dead), or only used outside the loop and can be moved
1442 out of the loop (e.g. invariants, inductions). The loop can be
1443 optimized away by scalar optimizations. We're better off not
1444 touching this loop. */
1445 if (!need_to_vectorize)
1447 if (dump_enabled_p ())
1448 dump_printf_loc (MSG_NOTE, vect_location,
1449 "All the computation can be taken out of the loop.");
1450 if (dump_enabled_p ())
1451 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1452 "not vectorized: redundant loop. no profit to "
1453 "vectorize.");
1454 return false;
1457 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && dump_enabled_p ())
1458 dump_printf_loc (MSG_NOTE, vect_location,
1459 "vectorization_factor = %d, niters = "
1460 HOST_WIDE_INT_PRINT_DEC, vectorization_factor,
1461 LOOP_VINFO_INT_NITERS (loop_vinfo));
1463 if ((LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1464 && (LOOP_VINFO_INT_NITERS (loop_vinfo) < vectorization_factor))
1465 || ((max_niter = max_stmt_executions_int (loop)) != -1
1466 && (unsigned HOST_WIDE_INT) max_niter < vectorization_factor))
1468 if (dump_enabled_p ())
1469 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1470 "not vectorized: iteration count too small.");
1471 if (dump_enabled_p ())
1472 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1473 "not vectorized: iteration count smaller than "
1474 "vectorization factor.");
1475 return false;
1478 /* Analyze cost. Decide if worth while to vectorize. */
1480 /* Once VF is set, SLP costs should be updated since the number of created
1481 vector stmts depends on VF. */
1482 vect_update_slp_costs_according_to_vf (loop_vinfo);
1484 vect_estimate_min_profitable_iters (loop_vinfo, &min_profitable_iters,
1485 &min_profitable_estimate);
1486 LOOP_VINFO_COST_MODEL_MIN_ITERS (loop_vinfo) = min_profitable_iters;
1488 if (min_profitable_iters < 0)
1490 if (dump_enabled_p ())
1491 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1492 "not vectorized: vectorization not profitable.");
1493 if (dump_enabled_p ())
1494 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1495 "not vectorized: vector version will never be "
1496 "profitable.");
1497 return false;
1500 min_scalar_loop_bound = ((PARAM_VALUE (PARAM_MIN_VECT_LOOP_BOUND)
1501 * vectorization_factor) - 1);
1504 /* Use the cost model only if it is more conservative than the user-specified
1505 threshold. */
1507 th = (unsigned) min_scalar_loop_bound;
1508 if (min_profitable_iters
1509 && (!min_scalar_loop_bound
1510 || min_profitable_iters > min_scalar_loop_bound))
1511 th = (unsigned) min_profitable_iters;
1513 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1514 && LOOP_VINFO_INT_NITERS (loop_vinfo) <= th)
1516 if (dump_enabled_p ())
1517 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1518 "not vectorized: vectorization not profitable.");
1519 if (dump_enabled_p ())
1520 dump_printf_loc (MSG_NOTE, vect_location,
1521 "not vectorized: iteration count smaller than user "
1522 "specified loop bound parameter or minimum profitable "
1523 "iterations (whichever is more conservative).");
1524 return false;
1527 if ((estimated_niter = estimated_stmt_executions_int (loop)) != -1
1528 && ((unsigned HOST_WIDE_INT) estimated_niter
1529 <= MAX (th, (unsigned)min_profitable_estimate)))
1531 if (dump_enabled_p ())
1532 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1533 "not vectorized: estimated iteration count too "
1534 "small.");
1535 if (dump_enabled_p ())
1536 dump_printf_loc (MSG_NOTE, vect_location,
1537 "not vectorized: estimated iteration count smaller "
1538 "than specified loop bound parameter or minimum "
1539 "profitable iterations (whichever is more "
1540 "conservative).");
1541 return false;
1544 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1545 || LOOP_VINFO_INT_NITERS (loop_vinfo) % vectorization_factor != 0
1546 || LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo))
1548 if (dump_enabled_p ())
1549 dump_printf_loc (MSG_NOTE, vect_location, "epilog loop required.");
1550 if (!vect_can_advance_ivs_p (loop_vinfo))
1552 if (dump_enabled_p ())
1553 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1554 "not vectorized: can't create epilog loop 1.");
1555 return false;
1557 if (!slpeel_can_duplicate_loop_p (loop, single_exit (loop)))
1559 if (dump_enabled_p ())
1560 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1561 "not vectorized: can't create epilog loop 2.");
1562 return false;
1566 return true;
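
/* A minimal illustrative sketch of the profitability threshold used above:
   the loop is rejected when its (known or estimated) iteration count does
   not exceed TH, where TH is the more conservative of the cost model's
   estimate and the user bound --param min-vect-loop-bound scaled by the
   vectorization factor.  The helper and its parameters are hypothetical.  */

static unsigned int
example_niters_threshold (int min_profitable_iters,
                          int min_vect_loop_bound_param,
                          int vectorization_factor)
{
  int min_scalar_loop_bound
    = (min_vect_loop_bound_param * vectorization_factor) - 1;
  unsigned int th = (unsigned int) min_scalar_loop_bound;

  if (min_profitable_iters
      && (!min_scalar_loop_bound
          || min_profitable_iters > min_scalar_loop_bound))
    th = (unsigned int) min_profitable_iters;

  return th;
}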
1570 /* Function vect_analyze_loop_2.
1572 Apply a set of analyses on LOOP, and create a loop_vec_info struct
1573 for it. The different analyses will record information in the
1574 loop_vec_info struct. */
1575 static bool
1576 vect_analyze_loop_2 (loop_vec_info loop_vinfo)
1578 bool ok, slp = false;
1579 int max_vf = MAX_VECTORIZATION_FACTOR;
1580 int min_vf = 2;
1582 /* Find all data references in the loop (which correspond to vdefs/vuses)
1583 and analyze their evolution in the loop. Also adjust the minimal
1584 vectorization factor according to the loads and stores.
1586 FORNOW: Handle only simple array references whose
1587 alignment can be forced, and aligned pointer-references. */
1589 ok = vect_analyze_data_refs (loop_vinfo, NULL, &min_vf);
1590 if (!ok)
1592 if (dump_enabled_p ())
1593 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1594 "bad data references.");
1595 return false;
1598 /* Analyze the access patterns of the data-refs in the loop (consecutive,
1599 complex, etc.). FORNOW: Only handle consecutive access pattern. */
1601 ok = vect_analyze_data_ref_accesses (loop_vinfo, NULL);
1602 if (!ok)
1604 if (dump_enabled_p ())
1605 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1606 "bad data access.");
1607 return false;
1610 /* Classify all cross-iteration scalar data-flow cycles.
1611 Cross-iteration cycles caused by virtual phis are analyzed separately. */
1613 vect_analyze_scalar_cycles (loop_vinfo);
1615 vect_pattern_recog (loop_vinfo, NULL);
1617 /* Data-flow analysis to detect stmts that do not need to be vectorized. */
1619 ok = vect_mark_stmts_to_be_vectorized (loop_vinfo);
1620 if (!ok)
1622 if (dump_enabled_p ())
1623 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1624 "unexpected pattern.");
1625 return false;
1628 /* Analyze data dependences between the data-refs in the loop
1629 and adjust the maximum vectorization factor according to
1630 the dependences.
1631 FORNOW: fail at the first data dependence that we encounter. */
1633 ok = vect_analyze_data_ref_dependences (loop_vinfo, &max_vf);
1634 if (!ok
1635 || max_vf < min_vf)
1637 if (dump_enabled_p ())
1638 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1639 "bad data dependence.");
1640 return false;
1643 ok = vect_determine_vectorization_factor (loop_vinfo);
1644 if (!ok)
1646 if (dump_enabled_p ())
1647 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1648 "can't determine vectorization factor.");
1649 return false;
1651 if (max_vf < LOOP_VINFO_VECT_FACTOR (loop_vinfo))
1653 if (dump_enabled_p ())
1654 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1655 "bad data dependence.");
1656 return false;
1659 /* Analyze the alignment of the data-refs in the loop.
1660 Fail if a data reference is found that cannot be vectorized. */
1662 ok = vect_analyze_data_refs_alignment (loop_vinfo, NULL);
1663 if (!ok)
1665 if (dump_enabled_p ())
1666 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1667 "bad data alignment.");
1668 return false;
1671 /* Prune the list of ddrs to be tested at run-time by versioning for alias.
1672 It is important to call pruning after vect_analyze_data_ref_accesses,
1673 since we use grouping information gathered by interleaving analysis. */
1674 ok = vect_prune_runtime_alias_test_list (loop_vinfo);
1675 if (!ok)
1677 if (dump_enabled_p ())
1678 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1679 "too long list of versioning for alias "
1680 "run-time tests.");
1681 return false;
1684 /* This pass will decide on using loop versioning and/or loop peeling in
1685 order to enhance the alignment of data references in the loop. */
1687 ok = vect_enhance_data_refs_alignment (loop_vinfo);
1688 if (!ok)
1690 if (dump_enabled_p ())
1691 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1692 "bad data alignment.");
1693 return false;
1696 /* Check the SLP opportunities in the loop, analyze and build SLP trees. */
1697 ok = vect_analyze_slp (loop_vinfo, NULL);
1698 if (ok)
1700 /* Decide which possible SLP instances to SLP. */
1701 slp = vect_make_slp_decision (loop_vinfo);
1703 /* Find stmts that need to be both vectorized and SLPed. */
1704 vect_detect_hybrid_slp (loop_vinfo);
1706 else
1707 return false;
1709 /* Scan all the operations in the loop and make sure they are
1710 vectorizable. */
1712 ok = vect_analyze_loop_operations (loop_vinfo, slp);
1713 if (!ok)
1715 if (dump_enabled_p ())
1716 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1717 "bad operation or unsupported loop bound.");
1718 return false;
1721 return true;
1724 /* Function vect_analyze_loop.
1726 Apply a set of analyses on LOOP, and create a loop_vec_info struct
1727 for it. The different analyses will record information in the
1728 loop_vec_info struct. */
1729 loop_vec_info
1730 vect_analyze_loop (struct loop *loop)
1732 loop_vec_info loop_vinfo;
1733 unsigned int vector_sizes;
1735 /* Autodetect first vector size we try. */
1736 current_vector_size = 0;
1737 vector_sizes = targetm.vectorize.autovectorize_vector_sizes ();
1739 if (dump_enabled_p ())
1740 dump_printf_loc (MSG_NOTE, vect_location,
1741 "===== analyze_loop_nest =====");
1743 if (loop_outer (loop)
1744 && loop_vec_info_for_loop (loop_outer (loop))
1745 && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop))))
1747 if (dump_enabled_p ())
1748 dump_printf_loc (MSG_NOTE, vect_location,
1749 "outer-loop already vectorized.");
1750 return NULL;
1753 while (1)
1755 /* Check the CFG characteristics of the loop (nesting, entry/exit). */
1756 loop_vinfo = vect_analyze_loop_form (loop);
1757 if (!loop_vinfo)
1759 if (dump_enabled_p ())
1760 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1761 "bad loop form.");
1762 return NULL;
1765 if (vect_analyze_loop_2 (loop_vinfo))
1767 LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1;
1769 return loop_vinfo;
1772 destroy_loop_vec_info (loop_vinfo, true);
1774 vector_sizes &= ~current_vector_size;
1775 if (vector_sizes == 0
1776 || current_vector_size == 0)
1777 return NULL;
1779 /* Try the next biggest vector size. */
1780 current_vector_size = 1 << floor_log2 (vector_sizes);
1781 if (dump_enabled_p ())
1782 dump_printf_loc (MSG_NOTE, vect_location,
1783 "***** Re-trying analysis with "
1784 "vector size %d\n", current_vector_size);
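
/* An illustrative example of how the retry loop above walks the supported
   vector sizes: targetm.vectorize.autovectorize_vector_sizes returns a
   bitmask of sizes in bytes.  With a hypothetical mask of 0x30 (32- and
   16-byte vectors), a failed attempt with 32-byte vectors clears that bit,
   leaving 0x10, and 1 << floor_log2 (0x10) selects the next biggest size,
   16 bytes, for the following attempt.  */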
1789 /* Function reduction_code_for_scalar_code
1791 Input:
1792 CODE - tree_code of a reduction operation.
1794 Output:
1795 REDUC_CODE - the corresponding tree-code to be used to reduce the
1796 vector of partial results into a single scalar result (which
1797 will also reside in a vector) or ERROR_MARK if the operation is
1798 a supported reduction operation, but does not have such tree-code.
1800 Return FALSE if CODE currently cannot be vectorized as a reduction. */
1802 static bool
1803 reduction_code_for_scalar_code (enum tree_code code,
1804 enum tree_code *reduc_code)
1806 switch (code)
1808 case MAX_EXPR:
1809 *reduc_code = REDUC_MAX_EXPR;
1810 return true;
1812 case MIN_EXPR:
1813 *reduc_code = REDUC_MIN_EXPR;
1814 return true;
1816 case PLUS_EXPR:
1817 *reduc_code = REDUC_PLUS_EXPR;
1818 return true;
1820 case MULT_EXPR:
1821 case MINUS_EXPR:
1822 case BIT_IOR_EXPR:
1823 case BIT_XOR_EXPR:
1824 case BIT_AND_EXPR:
1825 *reduc_code = ERROR_MARK;
1826 return true;
1828 default:
1829 return false;
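
/* An illustrative usage sketch for reduction_code_for_scalar_code above:

       enum tree_code reduc_code;
       if (reduction_code_for_scalar_code (PLUS_EXPR, &reduc_code))
         ... reduc_code is REDUC_PLUS_EXPR and can be used to fold the
             vector of partial sums into a single scalar result ...

   For MULT_EXPR, MINUS_EXPR and the bitwise ops the function returns true
   but sets REDUC_CODE to ERROR_MARK: the reduction is vectorizable, yet
   there is no single tree code that performs the final reduction.  */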
1834 /* Error reporting helper for vect_is_simple_reduction below. GIMPLE statement
1835 STMT is printed with a message MSG. */
1837 static void
1838 report_vect_op (int msg_type, gimple stmt, const char *msg)
1840 dump_printf_loc (msg_type, vect_location, "%s", msg);
1841 dump_gimple_stmt (msg_type, TDF_SLIM, stmt, 0);
1845 /* Detect SLP reduction of the form:
1847 #a1 = phi <a5, a0>
1848 a2 = operation (a1)
1849 a3 = operation (a2)
1850 a4 = operation (a3)
1851 a5 = operation (a4)
1853 #a = phi <a5>
1855 PHI is the reduction phi node (#a1 = phi <a5, a0> above)
1856 FIRST_STMT is the first reduction stmt in the chain
1857 (a2 = operation (a1)).
1859 Return TRUE if a reduction chain was detected. */
1861 static bool
1862 vect_is_slp_reduction (loop_vec_info loop_info, gimple phi, gimple first_stmt)
1864 struct loop *loop = (gimple_bb (phi))->loop_father;
1865 struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
1866 enum tree_code code;
1867 gimple current_stmt = NULL, loop_use_stmt = NULL, first, next_stmt;
1868 stmt_vec_info use_stmt_info, current_stmt_info;
1869 tree lhs;
1870 imm_use_iterator imm_iter;
1871 use_operand_p use_p;
1872 int nloop_uses, size = 0, n_out_of_loop_uses;
1873 bool found = false;
1875 if (loop != vect_loop)
1876 return false;
1878 lhs = PHI_RESULT (phi);
1879 code = gimple_assign_rhs_code (first_stmt);
1880 while (1)
1882 nloop_uses = 0;
1883 n_out_of_loop_uses = 0;
1884 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
1886 gimple use_stmt = USE_STMT (use_p);
1887 if (is_gimple_debug (use_stmt))
1888 continue;
1890 use_stmt = USE_STMT (use_p);
1892 /* Check if we got back to the reduction phi. */
1893 if (use_stmt == phi)
1895 loop_use_stmt = use_stmt;
1896 found = true;
1897 break;
1900 if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
1902 if (vinfo_for_stmt (use_stmt)
1903 && !STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (use_stmt)))
1905 loop_use_stmt = use_stmt;
1906 nloop_uses++;
1909 else
1910 n_out_of_loop_uses++;
1912 /* There can be either a single use in the loop or two uses in
1913 phi nodes. */
1914 if (nloop_uses > 1 || (n_out_of_loop_uses && nloop_uses))
1915 return false;
1918 if (found)
1919 break;
1921 /* We reached a statement with no loop uses. */
1922 if (nloop_uses == 0)
1923 return false;
1925 /* This is a loop exit phi, and we haven't reached the reduction phi. */
1926 if (gimple_code (loop_use_stmt) == GIMPLE_PHI)
1927 return false;
1929 if (!is_gimple_assign (loop_use_stmt)
1930 || code != gimple_assign_rhs_code (loop_use_stmt)
1931 || !flow_bb_inside_loop_p (loop, gimple_bb (loop_use_stmt)))
1932 return false;
1934 /* Insert USE_STMT into reduction chain. */
1935 use_stmt_info = vinfo_for_stmt (loop_use_stmt);
1936 if (current_stmt)
1938 current_stmt_info = vinfo_for_stmt (current_stmt);
1939 GROUP_NEXT_ELEMENT (current_stmt_info) = loop_use_stmt;
1940 GROUP_FIRST_ELEMENT (use_stmt_info)
1941 = GROUP_FIRST_ELEMENT (current_stmt_info);
1943 else
1944 GROUP_FIRST_ELEMENT (use_stmt_info) = loop_use_stmt;
1946 lhs = gimple_assign_lhs (loop_use_stmt);
1947 current_stmt = loop_use_stmt;
1948 size++;
1951 if (!found || loop_use_stmt != phi || size < 2)
1952 return false;
1954 /* Swap the operands, if needed, to make the reduction operand be the second
1955 operand. */
1956 lhs = PHI_RESULT (phi);
1957 next_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (current_stmt));
1958 while (next_stmt)
1960 if (gimple_assign_rhs2 (next_stmt) == lhs)
1962 tree op = gimple_assign_rhs1 (next_stmt);
1963 gimple def_stmt = NULL;
1965 if (TREE_CODE (op) == SSA_NAME)
1966 def_stmt = SSA_NAME_DEF_STMT (op);
1968 /* Check that the other def is either defined in the loop
1969 ("vect_internal_def"), or it's an induction (defined by a
1970 loop-header phi-node). */
1971 if (def_stmt
1972 && gimple_bb (def_stmt)
1973 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
1974 && (is_gimple_assign (def_stmt)
1975 || is_gimple_call (def_stmt)
1976 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
1977 == vect_induction_def
1978 || (gimple_code (def_stmt) == GIMPLE_PHI
1979 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
1980 == vect_internal_def
1981 && !is_loop_header_bb_p (gimple_bb (def_stmt)))))
1983 lhs = gimple_assign_lhs (next_stmt);
1984 next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
1985 continue;
1988 return false;
1990 else
1992 tree op = gimple_assign_rhs2 (next_stmt);
1993 gimple def_stmt = NULL;
1995 if (TREE_CODE (op) == SSA_NAME)
1996 def_stmt = SSA_NAME_DEF_STMT (op);
1998 /* Check that the other def is either defined in the loop
1999 ("vect_internal_def"), or it's an induction (defined by a
2000 loop-header phi-node). */
2001 if (def_stmt
2002 && gimple_bb (def_stmt)
2003 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2004 && (is_gimple_assign (def_stmt)
2005 || is_gimple_call (def_stmt)
2006 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2007 == vect_induction_def
2008 || (gimple_code (def_stmt) == GIMPLE_PHI
2009 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2010 == vect_internal_def
2011 && !is_loop_header_bb_p (gimple_bb (def_stmt)))))
2013 if (dump_enabled_p ())
2015 dump_printf_loc (MSG_NOTE, vect_location, "swapping oprnds: ");
2016 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, next_stmt, 0);
2019 swap_tree_operands (next_stmt,
2020 gimple_assign_rhs1_ptr (next_stmt),
2021 gimple_assign_rhs2_ptr (next_stmt));
2022 update_stmt (next_stmt);
2024 if (CONSTANT_CLASS_P (gimple_assign_rhs1 (next_stmt)))
2025 LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true;
2027 else
2028 return false;
2031 lhs = gimple_assign_lhs (next_stmt);
2032 next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
2035 /* Save the chain for further analysis in SLP detection. */
2036 first = GROUP_FIRST_ELEMENT (vinfo_for_stmt (current_stmt));
2037 LOOP_VINFO_REDUCTION_CHAINS (loop_info).safe_push (first);
2038 GROUP_SIZE (vinfo_for_stmt (first)) = size;
2040 return true;
2044 /* Function vect_is_simple_reduction_1
2046 (1) Detect a cross-iteration def-use cycle that represents a simple
2047 reduction computation. We look for the following pattern:
2049 loop_header:
2050 a1 = phi < a0, a2 >
2051 a3 = ...
2052 a2 = operation (a3, a1)
2054 such that:
2055 1. operation is commutative and associative and it is safe to
2056 change the order of the computation (if CHECK_REDUCTION is true)
2057 2. no uses for a2 in the loop (a2 is used out of the loop)
2058 3. no uses of a1 in the loop besides the reduction operation
2059 4. no uses of a1 outside the loop.
2061 Conditions 1,4 are tested here.
2062 Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.
2064 (2) Detect a cross-iteration def-use cycle in nested loops, i.e.,
2065 nested cycles, if CHECK_REDUCTION is false.
2067 (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double
2068 reductions:
2070 a1 = phi < a0, a2 >
2071 inner loop (def of a3)
2072 a2 = phi < a3 >
2074 If MODIFY is true it tries also to rework the code in-place to enable
2075 detection of more reduction patterns. For the time being we rewrite
2076 "res -= RHS" into "rhs += -RHS" when it seems worthwhile.
2079 static gimple
2080 vect_is_simple_reduction_1 (loop_vec_info loop_info, gimple phi,
2081 bool check_reduction, bool *double_reduc,
2082 bool modify)
2084 struct loop *loop = (gimple_bb (phi))->loop_father;
2085 struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
2086 edge latch_e = loop_latch_edge (loop);
2087 tree loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
2088 gimple def_stmt, def1 = NULL, def2 = NULL;
2089 enum tree_code orig_code, code;
2090 tree op1, op2, op3 = NULL_TREE, op4 = NULL_TREE;
2091 tree type;
2092 int nloop_uses;
2093 tree name;
2094 imm_use_iterator imm_iter;
2095 use_operand_p use_p;
2096 bool phi_def;
2098 *double_reduc = false;
2100 /* If CHECK_REDUCTION is true, we assume inner-most loop vectorization,
2101 otherwise, we assume outer loop vectorization. */
2102 gcc_assert ((check_reduction && loop == vect_loop)
2103 || (!check_reduction && flow_loop_nested_p (vect_loop, loop)));
2105 name = PHI_RESULT (phi);
2106 nloop_uses = 0;
2107 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name)
2109 gimple use_stmt = USE_STMT (use_p);
2110 if (is_gimple_debug (use_stmt))
2111 continue;
2113 if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2115 if (dump_enabled_p ())
2116 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2117 "intermediate value used outside loop.");
2119 return NULL;
2122 if (vinfo_for_stmt (use_stmt)
2123 && !is_pattern_stmt_p (vinfo_for_stmt (use_stmt)))
2124 nloop_uses++;
2125 if (nloop_uses > 1)
2127 if (dump_enabled_p ())
2128 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2129 "reduction used in loop.");
2130 return NULL;
2134 if (TREE_CODE (loop_arg) != SSA_NAME)
2136 if (dump_enabled_p ())
2138 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2139 "reduction: not ssa_name: ");
2140 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, loop_arg);
2142 return NULL;
2145 def_stmt = SSA_NAME_DEF_STMT (loop_arg);
2146 if (!def_stmt)
2148 if (dump_enabled_p ())
2149 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2150 "reduction: no def_stmt.");
2151 return NULL;
2154 if (!is_gimple_assign (def_stmt) && gimple_code (def_stmt) != GIMPLE_PHI)
2156 if (dump_enabled_p ())
2157 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, def_stmt, 0);
2158 return NULL;
2161 if (is_gimple_assign (def_stmt))
2163 name = gimple_assign_lhs (def_stmt);
2164 phi_def = false;
2166 else
2168 name = PHI_RESULT (def_stmt);
2169 phi_def = true;
2172 nloop_uses = 0;
2173 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name)
2175 gimple use_stmt = USE_STMT (use_p);
2176 if (is_gimple_debug (use_stmt))
2177 continue;
2178 if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt))
2179 && vinfo_for_stmt (use_stmt)
2180 && !is_pattern_stmt_p (vinfo_for_stmt (use_stmt)))
2181 nloop_uses++;
2182 if (nloop_uses > 1)
2184 if (dump_enabled_p ())
2185 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2186 "reduction used in loop.");
2187 return NULL;
2191 /* If DEF_STMT is a phi node itself, we expect it to have a single argument
2192 defined in the inner loop. */
2193 if (phi_def)
2195 op1 = PHI_ARG_DEF (def_stmt, 0);
2197 if (gimple_phi_num_args (def_stmt) != 1
2198 || TREE_CODE (op1) != SSA_NAME)
2200 if (dump_enabled_p ())
2201 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2202 "unsupported phi node definition.");
2204 return NULL;
2207 def1 = SSA_NAME_DEF_STMT (op1);
2208 if (flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2209 && loop->inner
2210 && flow_bb_inside_loop_p (loop->inner, gimple_bb (def1))
2211 && is_gimple_assign (def1))
2213 if (dump_enabled_p ())
2214 report_vect_op (MSG_NOTE, def_stmt,
2215 "detected double reduction: ");
2217 *double_reduc = true;
2218 return def_stmt;
2221 return NULL;
2224 code = orig_code = gimple_assign_rhs_code (def_stmt);
2226 /* We can handle "res -= x[i]", which is non-associative, by
2227 simply rewriting this into "res += -x[i]". Avoid changing
2228 gimple instruction for the first simple tests and only do this
2229 if we're allowed to change code at all. */
2230 if (code == MINUS_EXPR
2231 && modify
2232 && (op1 = gimple_assign_rhs1 (def_stmt))
2233 && TREE_CODE (op1) == SSA_NAME
2234 && SSA_NAME_DEF_STMT (op1) == phi)
2235 code = PLUS_EXPR;
2237 if (check_reduction
2238 && (!commutative_tree_code (code) || !associative_tree_code (code)))
2240 if (dump_enabled_p ())
2241 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2242 "reduction: not commutative/associative: ");
2243 return NULL;
2246 if (get_gimple_rhs_class (code) != GIMPLE_BINARY_RHS)
2248 if (code != COND_EXPR)
2250 if (dump_enabled_p ())
2251 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2252 "reduction: not binary operation: ");
2254 return NULL;
2257 op3 = gimple_assign_rhs1 (def_stmt);
2258 if (COMPARISON_CLASS_P (op3))
2260 op4 = TREE_OPERAND (op3, 1);
2261 op3 = TREE_OPERAND (op3, 0);
2264 op1 = gimple_assign_rhs2 (def_stmt);
2265 op2 = gimple_assign_rhs3 (def_stmt);
2267 if (TREE_CODE (op1) != SSA_NAME && TREE_CODE (op2) != SSA_NAME)
2269 if (dump_enabled_p ())
2270 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2271 "reduction: uses not ssa_names: ");
2273 return NULL;
2276 else
2278 op1 = gimple_assign_rhs1 (def_stmt);
2279 op2 = gimple_assign_rhs2 (def_stmt);
2281 if (TREE_CODE (op1) != SSA_NAME && TREE_CODE (op2) != SSA_NAME)
2283 if (dump_enabled_p ())
2284 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2285 "reduction: uses not ssa_names: ");
2287 return NULL;
2291 type = TREE_TYPE (gimple_assign_lhs (def_stmt));
2292 if ((TREE_CODE (op1) == SSA_NAME
2293 && !types_compatible_p (type,TREE_TYPE (op1)))
2294 || (TREE_CODE (op2) == SSA_NAME
2295 && !types_compatible_p (type, TREE_TYPE (op2)))
2296 || (op3 && TREE_CODE (op3) == SSA_NAME
2297 && !types_compatible_p (type, TREE_TYPE (op3)))
2298 || (op4 && TREE_CODE (op4) == SSA_NAME
2299 && !types_compatible_p (type, TREE_TYPE (op4))))
2301 if (dump_enabled_p ())
2303 dump_printf_loc (MSG_NOTE, vect_location,
2304 "reduction: multiple types: operation type: ");
2305 dump_generic_expr (MSG_NOTE, TDF_SLIM, type);
2306 dump_printf (MSG_NOTE, ", operands types: ");
2307 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2308 TREE_TYPE (op1));
2309 dump_printf (MSG_NOTE, ",");
2310 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2311 TREE_TYPE (op2));
2312 if (op3)
2314 dump_printf (MSG_NOTE, ",");
2315 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2316 TREE_TYPE (op3));
2319 if (op4)
2321 dump_printf (MSG_NOTE, ",");
2322 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2323 TREE_TYPE (op4));
2327 return NULL;
2330 /* Check that it's ok to change the order of the computation.
2331 Generally, when vectorizing a reduction we change the order of the
2332 computation. This may change the behavior of the program in some
2333 cases, so we need to check that this is ok. One exception is when
2334 vectorizing an outer-loop: the inner-loop is executed sequentially,
2335 and therefore vectorizing reductions in the inner-loop during
2336 outer-loop vectorization is safe. */
2338 /* CHECKME: check for !flag_finite_math_only too? */
2339 if (SCALAR_FLOAT_TYPE_P (type) && !flag_associative_math
2340 && check_reduction)
2342 /* Changing the order of operations changes the semantics. */
2343 if (dump_enabled_p ())
2344 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2345 "reduction: unsafe fp math optimization: ");
2346 return NULL;
2348 else if (INTEGRAL_TYPE_P (type) && TYPE_OVERFLOW_TRAPS (type)
2349 && check_reduction)
2351 /* Changing the order of operations changes the semantics. */
2352 if (dump_enabled_p ())
2353 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2354 "reduction: unsafe int math optimization: ");
2355 return NULL;
2357 else if (SAT_FIXED_POINT_TYPE_P (type) && check_reduction)
2359 /* Changing the order of operations changes the semantics. */
2360 if (dump_enabled_p ())
2361 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2362 "reduction: unsafe fixed-point math optimization: ");
2363 return NULL;
2366 /* If we detected "res -= x[i]" earlier, rewrite it into
2367 "res += -x[i]" now. If this turns out to be useless reassoc
2368 will clean it up again. */
2369 if (orig_code == MINUS_EXPR)
2371 tree rhs = gimple_assign_rhs2 (def_stmt);
2372 tree negrhs = make_ssa_name (TREE_TYPE (rhs), NULL);
2373 gimple negate_stmt = gimple_build_assign_with_ops (NEGATE_EXPR, negrhs,
2374 rhs, NULL);
2375 gimple_stmt_iterator gsi = gsi_for_stmt (def_stmt);
2376 set_vinfo_for_stmt (negate_stmt, new_stmt_vec_info (negate_stmt,
2377 loop_info, NULL));
2378 gsi_insert_before (&gsi, negate_stmt, GSI_NEW_STMT);
2379 gimple_assign_set_rhs2 (def_stmt, negrhs);
2380 gimple_assign_set_rhs_code (def_stmt, PLUS_EXPR);
2381 update_stmt (def_stmt);
2384 /* Reduction is safe. We're dealing with one of the following:
2385 1) integer arithmetic and no trapv
2386 2) floating point arithmetic, and special flags permit this optimization
2387 3) nested cycle (i.e., outer loop vectorization). */
2388 if (TREE_CODE (op1) == SSA_NAME)
2389 def1 = SSA_NAME_DEF_STMT (op1);
2391 if (TREE_CODE (op2) == SSA_NAME)
2392 def2 = SSA_NAME_DEF_STMT (op2);
2394 if (code != COND_EXPR
2395 && ((!def1 || gimple_nop_p (def1)) && (!def2 || gimple_nop_p (def2))))
2397 if (dump_enabled_p ())
2398 report_vect_op (MSG_NOTE, def_stmt, "reduction: no defs for operands: ");
2399 return NULL;
2402 /* Check that one def is the reduction def, defined by PHI,
2403 the other def is either defined in the loop ("vect_internal_def"),
2404 or it's an induction (defined by a loop-header phi-node). */
2406 if (def2 && def2 == phi
2407 && (code == COND_EXPR
2408 || !def1 || gimple_nop_p (def1)
2409 || (def1 && flow_bb_inside_loop_p (loop, gimple_bb (def1))
2410 && (is_gimple_assign (def1)
2411 || is_gimple_call (def1)
2412 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1))
2413 == vect_induction_def
2414 || (gimple_code (def1) == GIMPLE_PHI
2415 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1))
2416 == vect_internal_def
2417 && !is_loop_header_bb_p (gimple_bb (def1)))))))
2419 if (dump_enabled_p ())
2420 report_vect_op (MSG_NOTE, def_stmt, "detected reduction: ");
2421 return def_stmt;
2424 if (def1 && def1 == phi
2425 && (code == COND_EXPR
2426 || !def2 || gimple_nop_p (def2)
2427 || (def2 && flow_bb_inside_loop_p (loop, gimple_bb (def2))
2428 && (is_gimple_assign (def2)
2429 || is_gimple_call (def2)
2430 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2))
2431 == vect_induction_def
2432 || (gimple_code (def2) == GIMPLE_PHI
2433 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2))
2434 == vect_internal_def
2435 && !is_loop_header_bb_p (gimple_bb (def2)))))))
2437 if (check_reduction)
2439 /* Swap operands (just for simplicity - so that the rest of the code
2440 can assume that the reduction variable is always the last (second)
2441 argument). */
2442 if (dump_enabled_p ())
2443 report_vect_op (MSG_NOTE, def_stmt,
2444 "detected reduction: need to swap operands: ");
2446 swap_tree_operands (def_stmt, gimple_assign_rhs1_ptr (def_stmt),
2447 gimple_assign_rhs2_ptr (def_stmt));
2449 if (CONSTANT_CLASS_P (gimple_assign_rhs1 (def_stmt)))
2450 LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true;
2452 else
2454 if (dump_enabled_p ())
2455 report_vect_op (MSG_NOTE, def_stmt, "detected reduction: ");
2458 return def_stmt;
2461 /* Try to find SLP reduction chain. */
2462 if (check_reduction && vect_is_slp_reduction (loop_info, phi, def_stmt))
2464 if (dump_enabled_p ())
2465 report_vect_op (MSG_NOTE, def_stmt,
2466 "reduction: detected reduction chain: ");
2468 return def_stmt;
2471 if (dump_enabled_p ())
2472 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2473 "reduction: unknown pattern: ");
2475 return NULL;
2478 /* Wrapper around vect_is_simple_reduction_1, that won't modify code
2479 in-place. Arguments as there. */
2481 static gimple
2482 vect_is_simple_reduction (loop_vec_info loop_info, gimple phi,
2483 bool check_reduction, bool *double_reduc)
2485 return vect_is_simple_reduction_1 (loop_info, phi, check_reduction,
2486 double_reduc, false);
2489 /* Wrapper around vect_is_simple_reduction_1, which will modify code
2490 in-place if it enables detection of more reductions. Arguments
2491 as there. */
2493 gimple
2494 vect_force_simple_reduction (loop_vec_info loop_info, gimple phi,
2495 bool check_reduction, bool *double_reduc)
2497 return vect_is_simple_reduction_1 (loop_info, phi, check_reduction,
2498 double_reduc, true);
2501 /* Calculate the cost of one scalar iteration of the loop. */
2502 static int
2503 vect_get_single_scalar_iteration_cost (loop_vec_info loop_vinfo)
2505 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2506 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
2507 int nbbs = loop->num_nodes, factor, scalar_single_iter_cost = 0;
2508 int innerloop_iters, i, stmt_cost;
2510 /* Count statements in scalar loop. Using this as scalar cost for a single
2511 iteration for now.
2513 TODO: Add outer loop support.
2515 TODO: Consider assigning different costs to different scalar
2516 statements. */
2518 /* FORNOW. */
2519 innerloop_iters = 1;
2520 if (loop->inner)
2521 innerloop_iters = 50; /* FIXME */
2523 for (i = 0; i < nbbs; i++)
2525 gimple_stmt_iterator si;
2526 basic_block bb = bbs[i];
2528 if (bb->loop_father == loop->inner)
2529 factor = innerloop_iters;
2530 else
2531 factor = 1;
2533 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
2535 gimple stmt = gsi_stmt (si);
2536 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2538 if (!is_gimple_assign (stmt) && !is_gimple_call (stmt))
2539 continue;
2541 /* Skip stmts that are not vectorized inside the loop. */
2542 if (stmt_info
2543 && !STMT_VINFO_RELEVANT_P (stmt_info)
2544 && (!STMT_VINFO_LIVE_P (stmt_info)
2545 || !VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
2546 && !STMT_VINFO_IN_PATTERN_P (stmt_info))
2547 continue;
2549 if (STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt)))
2551 if (DR_IS_READ (STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt))))
2552 stmt_cost = vect_get_stmt_cost (scalar_load);
2553 else
2554 stmt_cost = vect_get_stmt_cost (scalar_store);
2556 else
2557 stmt_cost = vect_get_stmt_cost (scalar_stmt);
2559 scalar_single_iter_cost += stmt_cost * factor;
2562 return scalar_single_iter_cost;
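/* E.g., for a hypothetical body "a[i] = b[i] + c[i]" each iteration
   contributes two scalar_load costs, one scalar_stmt cost for the add
   and one scalar_store cost, all multiplied by 50 when the stmt sits
   in a nested inner loop.  */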
2565 /* Calculate cost of peeling the loop PEEL_ITERS_PROLOGUE times. */
2566 int
2567 vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue,
2568 int *peel_iters_epilogue,
2569 int scalar_single_iter_cost,
2570 stmt_vector_for_cost *prologue_cost_vec,
2571 stmt_vector_for_cost *epilogue_cost_vec)
2573 int retval = 0;
2574 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2576 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
2578 *peel_iters_epilogue = vf/2;
2579 if (dump_enabled_p ())
2580 dump_printf_loc (MSG_NOTE, vect_location,
2581 "cost model: epilogue peel iters set to vf/2 "
2582 "because loop iterations are unknown .");
2584 /* If peeled iterations are known but number of scalar loop
2585 iterations are unknown, count a taken branch per peeled loop. */
2586 retval = record_stmt_cost (prologue_cost_vec, 2, cond_branch_taken,
2587 NULL, 0, vect_prologue);
2589 else
2591 int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
2592 peel_iters_prologue = niters < peel_iters_prologue ?
2593 niters : peel_iters_prologue;
2594 *peel_iters_epilogue = (niters - peel_iters_prologue) % vf;
2595 /* If we need to peel for gaps, but no peeling is required, we have to
2596 peel VF iterations. */
2597 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) && !*peel_iters_epilogue)
2598 *peel_iters_epilogue = vf;
2601 if (peel_iters_prologue)
2602 retval += record_stmt_cost (prologue_cost_vec,
2603 peel_iters_prologue * scalar_single_iter_cost,
2604 scalar_stmt, NULL, 0, vect_prologue);
2605 if (*peel_iters_epilogue)
2606 retval += record_stmt_cost (epilogue_cost_vec,
2607 *peel_iters_epilogue * scalar_single_iter_cost,
2608 scalar_stmt, NULL, 0, vect_epilogue);
2609 return retval;
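/* Worked example with hypothetical numbers: niters = 21, vf = 4 and
   peel_iters_prologue = 3 give *peel_iters_epilogue = (21 - 3) % 4 = 2;
   the prologue vector is then charged 3 * scalar_single_iter_cost
   scalar stmts and the epilogue vector 2 * scalar_single_iter_cost.  */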
2612 /* Function vect_estimate_min_profitable_iters
2614 Return the number of iterations required for the vector version of the
2615 loop to be profitable relative to the cost of the scalar version of the
2616 loop. */
2618 static void
2619 vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
2620 int *ret_min_profitable_niters,
2621 int *ret_min_profitable_estimate)
2623 int min_profitable_iters;
2624 int min_profitable_estimate;
2625 int peel_iters_prologue;
2626 int peel_iters_epilogue;
2627 unsigned vec_inside_cost = 0;
2628 int vec_outside_cost = 0;
2629 unsigned vec_prologue_cost = 0;
2630 unsigned vec_epilogue_cost = 0;
2631 int scalar_single_iter_cost = 0;
2632 int scalar_outside_cost = 0;
2633 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2634 int npeel = LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo);
2635 void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
2637 /* Cost model disabled. */
2638 if (!flag_vect_cost_model)
2640 dump_printf_loc (MSG_NOTE, vect_location, "cost model disabled.");
2641 *ret_min_profitable_niters = 0;
2642 *ret_min_profitable_estimate = 0;
2643 return;
2646 /* Requires loop versioning tests to handle misalignment. */
2647 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
2649 /* FIXME: Make cost depend on complexity of individual check. */
2650 unsigned len = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ();
2651 (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
2652 vect_prologue);
2653 dump_printf (MSG_NOTE,
2654 "cost model: Adding cost of checks for loop "
2655 "versioning to treat misalignment.\n");
2658 /* Requires loop versioning with alias checks. */
2659 if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
2661 /* FIXME: Make cost depend on complexity of individual check. */
2662 unsigned len = LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).length ();
2663 (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
2664 vect_prologue);
2665 dump_printf (MSG_NOTE,
2666 "cost model: Adding cost of checks for loop "
2667 "versioning aliasing.\n");
2670 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
2671 || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
2672 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken, NULL, 0,
2673 vect_prologue);
2675 /* Count statements in scalar loop. Using this as scalar cost for a single
2676 iteration for now.
2678 TODO: Add outer loop support.
2680 TODO: Consider assigning different costs to different scalar
2681 statements. */
2683 scalar_single_iter_cost = vect_get_single_scalar_iteration_cost (loop_vinfo);
2685 /* Add additional cost for the peeled instructions in prologue and epilogue
2686 loop.
2688 FORNOW: If we don't know the value of peel_iters for prologue or epilogue
2689 at compile-time - we assume it's vf/2 (the worst would be vf-1).
2691 TODO: Build an expression that represents peel_iters for prologue and
2692 epilogue to be used in a run-time test. */
2694 if (npeel < 0)
2696 peel_iters_prologue = vf/2;
2697 dump_printf (MSG_NOTE, "cost model: "
2698 "prologue peel iters set to vf/2.");
2700 /* If peeling for alignment is unknown, loop bound of main loop becomes
2701 unknown. */
2702 peel_iters_epilogue = vf/2;
2703 dump_printf (MSG_NOTE, "cost model: "
2704 "epilogue peel iters set to vf/2 because "
2705 "peeling for alignment is unknown.");
2707 /* If peeled iterations are unknown, count a taken branch and a not taken
2708 branch per peeled loop. Even if scalar loop iterations are known,
2709 vector iterations are not known since peeled prologue iterations are
2710 not known. Hence guards remain the same. */
2711 (void) add_stmt_cost (target_cost_data, 2, cond_branch_taken,
2712 NULL, 0, vect_prologue);
2713 (void) add_stmt_cost (target_cost_data, 2, cond_branch_not_taken,
2714 NULL, 0, vect_prologue);
2715 /* FORNOW: Don't attempt to pass individual scalar instructions to
2716 the model; just assume linear cost for scalar iterations. */
2717 (void) add_stmt_cost (target_cost_data,
2718 peel_iters_prologue * scalar_single_iter_cost,
2719 scalar_stmt, NULL, 0, vect_prologue);
2720 (void) add_stmt_cost (target_cost_data,
2721 peel_iters_epilogue * scalar_single_iter_cost,
2722 scalar_stmt, NULL, 0, vect_epilogue);
2724 else
2726 stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
2727 stmt_info_for_cost *si;
2728 int j;
2729 void *data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
2731 prologue_cost_vec.create (2);
2732 epilogue_cost_vec.create (2);
2733 peel_iters_prologue = npeel;
2735 (void) vect_get_known_peeling_cost (loop_vinfo, peel_iters_prologue,
2736 &peel_iters_epilogue,
2737 scalar_single_iter_cost,
2738 &prologue_cost_vec,
2739 &epilogue_cost_vec);
2741 FOR_EACH_VEC_ELT (prologue_cost_vec, j, si)
2743 struct _stmt_vec_info *stmt_info
2744 = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
2745 (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
2746 si->misalign, vect_prologue);
2749 FOR_EACH_VEC_ELT (epilogue_cost_vec, j, si)
2751 struct _stmt_vec_info *stmt_info
2752 = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
2753 (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
2754 si->misalign, vect_epilogue);
2757 prologue_cost_vec.release ();
2758 epilogue_cost_vec.release ();
2761 /* FORNOW: The scalar outside cost is incremented in one of the
2762 following ways:
2764 1. The vectorizer checks for alignment and aliasing and generates
2765 a condition that allows dynamic vectorization. A cost model
2766 check is ANDED with the versioning condition. Hence scalar code
2767 path now has the added cost of the versioning check.
2769 if (cost > th & versioning_check)
2770 jmp to vector code
2772 Hence the run-time scalar cost is incremented by the not-taken branch cost.
2774 2. The vectorizer then checks if a prologue is required. If the
2775 cost model check was not done before during versioning, it has to
2776 be done before the prologue check.
2778 if (cost <= th)
2779 prologue = scalar_iters
2780 if (prologue == 0)
2781 jmp to vector code
2782 else
2783 execute prologue
2784 if (prologue == num_iters)
2785 go to exit
2787 Hence the run-time scalar cost is incremented by a taken branch,
2788 plus a not-taken branch, plus a taken branch cost.
2790 3. The vectorizer then checks if an epilogue is required. If the
2791 cost model check was not done before during prologue check, it
2792 has to be done with the epilogue check.
2794 if (prologue == 0)
2795 jmp to vector code
2796 else
2797 execute prologue
2798 if (prologue == num_iters)
2799 go to exit
2800 vector code:
2801 if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
2802 jmp to epilogue
2804 Hence the run-time scalar cost should be incremented by 2 taken
2805 branches.
2807 TODO: The back end may reorder the BBS's differently and reverse
2808 conditions/branch directions. Change the estimates below to
2809 something more reasonable. */
2811 /* If the number of iterations is known and we do not do versioning, we can
2812 decide whether to vectorize at compile time. Hence the scalar version
2813 does not carry cost model guard costs. */
2814 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2815 || LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
2816 || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
2818 /* Cost model check occurs at versioning. */
2819 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
2820 || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
2821 scalar_outside_cost += vect_get_stmt_cost (cond_branch_not_taken);
2822 else
2824 /* Cost model check occurs at prologue generation. */
2825 if (LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
2826 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken)
2827 + vect_get_stmt_cost (cond_branch_not_taken);
2828 /* Cost model check occurs at epilogue generation. */
2829 else
2830 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken);
2834 /* Complete the target-specific cost calculations. */
2835 finish_cost (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo), &vec_prologue_cost,
2836 &vec_inside_cost, &vec_epilogue_cost);
2838 vec_outside_cost = (int)(vec_prologue_cost + vec_epilogue_cost);
2840 /* Calculate number of iterations required to make the vector version
2841 profitable, relative to the loop bodies only. The following condition
2842 must hold true:
2843 SIC * niters + SOC > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC
2844 where
2845 SIC = scalar iteration cost, VIC = vector iteration cost,
2846 VOC = vector outside cost, VF = vectorization factor,
2847 PL_ITERS = prologue iterations, EP_ITERS= epilogue iterations
2848 SOC = scalar outside cost for run time cost model check. */
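/* Worked example with hypothetical costs SIC = 4, VIC = 6, VF = 4,
   VOC = 16, SOC = 4, PL_ITERS = 0, EP_ITERS = 2: the division below
   gives ((16 - 4) * 4 - 6 * 0 - 6 * 2) / (4 * 4 - 6) = 36 / 10 = 3,
   and the rounding check that follows bumps it to 4, i.e. at least 4
   scalar iterations are needed before the vector loop wins.  */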
2850 if ((scalar_single_iter_cost * vf) > (int) vec_inside_cost)
2852 if (vec_outside_cost <= 0)
2853 min_profitable_iters = 1;
2854 else
2856 min_profitable_iters = ((vec_outside_cost - scalar_outside_cost) * vf
2857 - vec_inside_cost * peel_iters_prologue
2858 - vec_inside_cost * peel_iters_epilogue)
2859 / ((scalar_single_iter_cost * vf)
2860 - vec_inside_cost);
2862 if ((scalar_single_iter_cost * vf * min_profitable_iters)
2863 <= (((int) vec_inside_cost * min_profitable_iters)
2864 + (((int) vec_outside_cost - scalar_outside_cost) * vf)))
2865 min_profitable_iters++;
2868 /* vector version will never be profitable. */
2869 else
2871 if (dump_enabled_p ())
2872 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2873 "cost model: the vector iteration cost = %d "
2874 "divided by the scalar iteration cost = %d "
2875 "is greater or equal to the vectorization factor = %d.",
2876 vec_inside_cost, scalar_single_iter_cost, vf);
2877 *ret_min_profitable_niters = -1;
2878 *ret_min_profitable_estimate = -1;
2879 return;
2882 if (dump_enabled_p ())
2884 dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
2885 dump_printf (MSG_NOTE, " Vector inside of loop cost: %d\n",
2886 vec_inside_cost);
2887 dump_printf (MSG_NOTE, " Vector prologue cost: %d\n",
2888 vec_prologue_cost);
2889 dump_printf (MSG_NOTE, " Vector epilogue cost: %d\n",
2890 vec_epilogue_cost);
2891 dump_printf (MSG_NOTE, " Scalar iteration cost: %d\n",
2892 scalar_single_iter_cost);
2893 dump_printf (MSG_NOTE, " Scalar outside cost: %d\n",
2894 scalar_outside_cost);
2895 dump_printf (MSG_NOTE, " Vector outside cost: %d\n",
2896 vec_outside_cost);
2897 dump_printf (MSG_NOTE, " prologue iterations: %d\n",
2898 peel_iters_prologue);
2899 dump_printf (MSG_NOTE, " epilogue iterations: %d\n",
2900 peel_iters_epilogue);
2901 dump_printf (MSG_NOTE,
2902 " Calculated minimum iters for profitability: %d\n",
2903 min_profitable_iters);
2906 min_profitable_iters =
2907 min_profitable_iters < vf ? vf : min_profitable_iters;
2909 /* Because the condition we create is:
2910 if (niters <= min_profitable_iters)
2911 then skip the vectorized loop. */
2912 min_profitable_iters--;
2914 if (dump_enabled_p ())
2915 dump_printf_loc (MSG_NOTE, vect_location,
2916 " Runtime profitability threshold = %d\n", min_profitable_iters);
2918 *ret_min_profitable_niters = min_profitable_iters;
2920 /* Calculate number of iterations required to make the vector version
2921 profitable, relative to the loop bodies only.
2923 Non-vectorized variant is SIC * niters and it must win over vector
2924 variant on the expected loop trip count. The following condition must hold true:
2925 SIC * niters > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC + SOC */
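/* Continuing the hypothetical numbers used above: SOC is now added to
   the vector side, giving ((16 + 4) * 4 - 6 * 2) / (4 * 4 - 6)
   = 68 / 10 = 6, decremented to 5 and then clamped against the runtime
   threshold computed earlier.  */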
2927 if (vec_outside_cost <= 0)
2928 min_profitable_estimate = 1;
2929 else
2931 min_profitable_estimate = ((vec_outside_cost + scalar_outside_cost) * vf
2932 - vec_inside_cost * peel_iters_prologue
2933 - vec_inside_cost * peel_iters_epilogue)
2934 / ((scalar_single_iter_cost * vf)
2935 - vec_inside_cost);
2937 min_profitable_estimate --;
2938 min_profitable_estimate = MAX (min_profitable_estimate, min_profitable_iters);
2939 if (dump_enabled_p ())
2940 dump_printf_loc (MSG_NOTE, vect_location,
2941 " Static estimate profitability threshold = %d\n",
2942 min_profitable_estimate);
2944 *ret_min_profitable_estimate = min_profitable_estimate;
2948 /* TODO: Close dependency between vect_model_*_cost and vectorizable_*
2949 functions. Design better to avoid maintenance issues. */
2951 /* Function vect_model_reduction_cost.
2953 Models cost for a reduction operation, including the vector ops
2954 generated within the strip-mine loop, the initial definition before
2955 the loop, and the epilogue code that must be generated. */
2957 static bool
2958 vect_model_reduction_cost (stmt_vec_info stmt_info, enum tree_code reduc_code,
2959 int ncopies)
2961 int prologue_cost = 0, epilogue_cost = 0;
2962 enum tree_code code;
2963 optab optab;
2964 tree vectype;
2965 gimple stmt, orig_stmt;
2966 tree reduction_op;
2967 enum machine_mode mode;
2968 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2969 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2970 void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
2972 /* Cost of reduction op inside loop. */
2973 unsigned inside_cost = add_stmt_cost (target_cost_data, ncopies, vector_stmt,
2974 stmt_info, 0, vect_body);
2975 stmt = STMT_VINFO_STMT (stmt_info);
2977 switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
2979 case GIMPLE_SINGLE_RHS:
2980 gcc_assert (TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt)) == ternary_op);
2981 reduction_op = TREE_OPERAND (gimple_assign_rhs1 (stmt), 2);
2982 break;
2983 case GIMPLE_UNARY_RHS:
2984 reduction_op = gimple_assign_rhs1 (stmt);
2985 break;
2986 case GIMPLE_BINARY_RHS:
2987 reduction_op = gimple_assign_rhs2 (stmt);
2988 break;
2989 case GIMPLE_TERNARY_RHS:
2990 reduction_op = gimple_assign_rhs3 (stmt);
2991 break;
2992 default:
2993 gcc_unreachable ();
2996 vectype = get_vectype_for_scalar_type (TREE_TYPE (reduction_op));
2997 if (!vectype)
2999 if (dump_enabled_p ())
3001 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3002 "unsupported data-type ");
3003 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
3004 TREE_TYPE (reduction_op));
3006 return false;
3009 mode = TYPE_MODE (vectype);
3010 orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
3012 if (!orig_stmt)
3013 orig_stmt = STMT_VINFO_STMT (stmt_info);
3015 code = gimple_assign_rhs_code (orig_stmt);
3017 /* Add in cost for initial definition. */
3018 prologue_cost += add_stmt_cost (target_cost_data, 1, scalar_to_vec,
3019 stmt_info, 0, vect_prologue);
3021 /* Determine cost of epilogue code.
3023 We have a reduction operator that will reduce the vector in one statement.
3024 Also requires scalar extract. */
3026 if (!nested_in_vect_loop_p (loop, orig_stmt))
3028 if (reduc_code != ERROR_MARK)
3030 epilogue_cost += add_stmt_cost (target_cost_data, 1, vector_stmt,
3031 stmt_info, 0, vect_epilogue);
3032 epilogue_cost += add_stmt_cost (target_cost_data, 1, vec_to_scalar,
3033 stmt_info, 0, vect_epilogue);
3035 else
3037 int vec_size_in_bits = tree_low_cst (TYPE_SIZE (vectype), 1);
3038 tree bitsize =
3039 TYPE_SIZE (TREE_TYPE (gimple_assign_lhs (orig_stmt)));
3040 int element_bitsize = tree_low_cst (bitsize, 1);
3041 int nelements = vec_size_in_bits / element_bitsize;
3043 optab = optab_for_tree_code (code, vectype, optab_default);
3045 /* We have a whole vector shift available. */
3046 if (VECTOR_MODE_P (mode)
3047 && optab_handler (optab, mode) != CODE_FOR_nothing
3048 && optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
3050 /* Final reduction via vector shifts and the reduction operator.
3051 Also requires scalar extract. */
3052 epilogue_cost += add_stmt_cost (target_cost_data,
3053 exact_log2 (nelements) * 2,
3054 vector_stmt, stmt_info, 0,
3055 vect_epilogue);
3056 epilogue_cost += add_stmt_cost (target_cost_data, 1,
3057 vec_to_scalar, stmt_info, 0,
3058 vect_epilogue);
3060 else
3061 /* Use extracts and reduction op for final reduction. For N
3062 elements, we have N extracts and N-1 reduction ops. */
3063 epilogue_cost += add_stmt_cost (target_cost_data,
3064 nelements + nelements - 1,
3065 vector_stmt, stmt_info, 0,
3066 vect_epilogue);
3070 if (dump_enabled_p ())
3071 dump_printf (MSG_NOTE,
3072 "vect_model_reduction_cost: inside_cost = %d, "
3073 "prologue_cost = %d, epilogue_cost = %d .", inside_cost,
3074 prologue_cost, epilogue_cost);
3076 return true;
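/* Illustration with a hypothetical 4-element vector (nelements = 4):
   with a REDUC_* tree code the epilogue costs one vector stmt plus one
   vec_to_scalar; with only whole-vector shifts it costs
   exact_log2 (4) * 2 = 4 vector stmts plus one vec_to_scalar; otherwise
   it falls back to 4 + 3 = 7 extract-and-reduce vector stmts.  */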
3080 /* Function vect_model_induction_cost.
3082 Models cost for induction operations. */
3084 static void
3085 vect_model_induction_cost (stmt_vec_info stmt_info, int ncopies)
3087 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3088 void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3089 unsigned inside_cost, prologue_cost;
3091 /* loop cost for vec_loop. */
3092 inside_cost = add_stmt_cost (target_cost_data, ncopies, vector_stmt,
3093 stmt_info, 0, vect_body);
3095 /* prologue cost for vec_init and vec_step. */
3096 prologue_cost = add_stmt_cost (target_cost_data, 2, scalar_to_vec,
3097 stmt_info, 0, vect_prologue);
3099 if (dump_enabled_p ())
3100 dump_printf_loc (MSG_NOTE, vect_location,
3101 "vect_model_induction_cost: inside_cost = %d, "
3102 "prologue_cost = %d .", inside_cost, prologue_cost);
3106 /* Function get_initial_def_for_induction
3108 Input:
3109 STMT - a stmt that performs an induction operation in the loop.
3110 IV_PHI - the initial value of the induction variable
3112 Output:
3113 Return a vector variable, initialized with the first VF values of
3114 the induction variable. E.g., for an iv with IV_PHI='X' and
3115 evolution S, for a vector of 4 units, we want to return:
3116 [X, X + S, X + 2*S, X + 3*S]. */
3118 static tree
3119 get_initial_def_for_induction (gimple iv_phi)
3121 stmt_vec_info stmt_vinfo = vinfo_for_stmt (iv_phi);
3122 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
3123 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3124 tree scalar_type;
3125 tree vectype;
3126 int nunits;
3127 edge pe = loop_preheader_edge (loop);
3128 struct loop *iv_loop;
3129 basic_block new_bb;
3130 tree new_vec, vec_init, vec_step, t;
3131 tree access_fn;
3132 tree new_var;
3133 tree new_name;
3134 gimple init_stmt, induction_phi, new_stmt;
3135 tree induc_def, vec_def, vec_dest;
3136 tree init_expr, step_expr;
3137 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3138 int i;
3139 bool ok;
3140 int ncopies;
3141 tree expr;
3142 stmt_vec_info phi_info = vinfo_for_stmt (iv_phi);
3143 bool nested_in_vect_loop = false;
3144 gimple_seq stmts = NULL;
3145 imm_use_iterator imm_iter;
3146 use_operand_p use_p;
3147 gimple exit_phi;
3148 edge latch_e;
3149 tree loop_arg;
3150 gimple_stmt_iterator si;
3151 basic_block bb = gimple_bb (iv_phi);
3152 tree stepvectype;
3153 tree resvectype;
3155 /* Is phi in an inner-loop, while vectorizing an enclosing outer-loop? */
3156 if (nested_in_vect_loop_p (loop, iv_phi))
3158 nested_in_vect_loop = true;
3159 iv_loop = loop->inner;
3161 else
3162 iv_loop = loop;
3163 gcc_assert (iv_loop == (gimple_bb (iv_phi))->loop_father);
3165 latch_e = loop_latch_edge (iv_loop);
3166 loop_arg = PHI_ARG_DEF_FROM_EDGE (iv_phi, latch_e);
3168 access_fn = analyze_scalar_evolution (iv_loop, PHI_RESULT (iv_phi));
3169 gcc_assert (access_fn);
3170 STRIP_NOPS (access_fn);
3171 ok = vect_is_simple_iv_evolution (iv_loop->num, access_fn,
3172 &init_expr, &step_expr);
3173 gcc_assert (ok);
3174 pe = loop_preheader_edge (iv_loop);
3176 scalar_type = TREE_TYPE (init_expr);
3177 vectype = get_vectype_for_scalar_type (scalar_type);
3178 resvectype = get_vectype_for_scalar_type (TREE_TYPE (PHI_RESULT (iv_phi)));
3179 gcc_assert (vectype);
3180 nunits = TYPE_VECTOR_SUBPARTS (vectype);
3181 ncopies = vf / nunits;
3183 gcc_assert (phi_info);
3184 gcc_assert (ncopies >= 1);
3186 /* Find the first insertion point in the BB. */
3187 si = gsi_after_labels (bb);
3189 /* Create the vector that holds the initial_value of the induction. */
3190 if (nested_in_vect_loop)
3192 /* iv_loop is nested in the loop to be vectorized. init_expr had already
3193 been created during vectorization of previous stmts. We obtain it
3194 from the STMT_VINFO_VEC_STMT of the defining stmt. */
3195 tree iv_def = PHI_ARG_DEF_FROM_EDGE (iv_phi,
3196 loop_preheader_edge (iv_loop));
3197 vec_init = vect_get_vec_def_for_operand (iv_def, iv_phi, NULL);
3198 /* If the initial value is not of proper type, convert it. */
3199 if (!useless_type_conversion_p (vectype, TREE_TYPE (vec_init)))
3201 new_stmt = gimple_build_assign_with_ops
3202 (VIEW_CONVERT_EXPR,
3203 vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_"),
3204 build1 (VIEW_CONVERT_EXPR, vectype, vec_init), NULL_TREE);
3205 vec_init = make_ssa_name (gimple_assign_lhs (new_stmt), new_stmt);
3206 gimple_assign_set_lhs (new_stmt, vec_init);
3207 new_bb = gsi_insert_on_edge_immediate (loop_preheader_edge (iv_loop),
3208 new_stmt);
3209 gcc_assert (!new_bb);
3210 set_vinfo_for_stmt (new_stmt,
3211 new_stmt_vec_info (new_stmt, loop_vinfo, NULL));
3214 else
3216 vec<constructor_elt, va_gc> *v;
3218 /* iv_loop is the loop to be vectorized. Create:
3219 vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr) */
3220 new_var = vect_get_new_vect_var (scalar_type, vect_scalar_var, "var_");
3221 new_name = force_gimple_operand (init_expr, &stmts, false, new_var);
3222 if (stmts)
3224 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
3225 gcc_assert (!new_bb);
3228 vec_alloc (v, nunits);
3229 CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, new_name);
3230 for (i = 1; i < nunits; i++)
3232 /* Create: new_name_i = new_name + step_expr */
3233 enum tree_code code = POINTER_TYPE_P (scalar_type)
3234 ? POINTER_PLUS_EXPR : PLUS_EXPR;
3235 init_stmt = gimple_build_assign_with_ops (code, new_var,
3236 new_name, step_expr);
3237 new_name = make_ssa_name (new_var, init_stmt);
3238 gimple_assign_set_lhs (init_stmt, new_name);
3240 new_bb = gsi_insert_on_edge_immediate (pe, init_stmt);
3241 gcc_assert (!new_bb);
3243 if (dump_enabled_p ())
3245 dump_printf_loc (MSG_NOTE, vect_location,
3246 "created new init_stmt: ");
3247 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, init_stmt, 0);
3249 CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, new_name);
3251 /* Create a vector from [new_name_0, new_name_1, ..., new_name_nunits-1] */
3252 new_vec = build_constructor (vectype, v);
3253 vec_init = vect_init_vector (iv_phi, new_vec, vectype, NULL);
3257 /* Create the vector that holds the step of the induction. */
3258 if (nested_in_vect_loop)
3259 /* iv_loop is nested in the loop to be vectorized. Generate:
3260 vec_step = [S, S, S, S] */
3261 new_name = step_expr;
3262 else
3264 /* iv_loop is the loop to be vectorized. Generate:
3265 vec_step = [VF*S, VF*S, VF*S, VF*S] */
3266 expr = build_int_cst (TREE_TYPE (step_expr), vf);
3267 new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
3268 expr, step_expr);
3271 t = unshare_expr (new_name);
3272 gcc_assert (CONSTANT_CLASS_P (new_name));
3273 stepvectype = get_vectype_for_scalar_type (TREE_TYPE (new_name));
3274 gcc_assert (stepvectype);
3275 new_vec = build_vector_from_val (stepvectype, t);
3276 vec_step = vect_init_vector (iv_phi, new_vec, stepvectype, NULL);
3279 /* Create the following def-use cycle:
3280 loop prolog:
3281 vec_init = ...
3282 vec_step = ...
3283 loop:
3284 vec_iv = PHI <vec_init, vec_loop>
3286 STMT
3288 vec_loop = vec_iv + vec_step; */
3290 /* Create the induction-phi that defines the induction-operand. */
3291 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
3292 induction_phi = create_phi_node (vec_dest, iv_loop->header);
3293 set_vinfo_for_stmt (induction_phi,
3294 new_stmt_vec_info (induction_phi, loop_vinfo, NULL));
3295 induc_def = PHI_RESULT (induction_phi);
3297 /* Create the iv update inside the loop */
3298 new_stmt = gimple_build_assign_with_ops (PLUS_EXPR, vec_dest,
3299 induc_def, vec_step);
3300 vec_def = make_ssa_name (vec_dest, new_stmt);
3301 gimple_assign_set_lhs (new_stmt, vec_def);
3302 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
3303 set_vinfo_for_stmt (new_stmt, new_stmt_vec_info (new_stmt, loop_vinfo,
3304 NULL));
3306 /* Set the arguments of the phi node: */
3307 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
3308 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
3309 UNKNOWN_LOCATION);
3312 /* In case that vectorization factor (VF) is bigger than the number
3313 of elements that we can fit in a vectype (nunits), we have to generate
3314 more than one vector stmt - i.e. - we need to "unroll" the
3315 vector stmt by a factor VF/nunits. For more details see documentation
3316 in vectorizable_operation. */
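/* E.g., assuming VF = 8, nunits = 4 (so ncopies = 2) and step S: the
   loop-carried update above advances the phi by [8*S, 8*S, 8*S, 8*S],
   while the loop below creates a second copy as induc_def +
   [4*S, 4*S, 4*S, 4*S], so the two vectors together cover the iv
   values X .. X+3*S and X+4*S .. X+7*S of one vectorized iteration.  */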
3318 if (ncopies > 1)
3320 stmt_vec_info prev_stmt_vinfo;
3321 /* FORNOW. This restriction should be relaxed. */
3322 gcc_assert (!nested_in_vect_loop);
3324 /* Create the vector that holds the step of the induction. */
3325 expr = build_int_cst (TREE_TYPE (step_expr), nunits);
3326 new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
3327 expr, step_expr);
3328 t = unshare_expr (new_name);
3329 gcc_assert (CONSTANT_CLASS_P (new_name));
3330 new_vec = build_vector_from_val (stepvectype, t);
3331 vec_step = vect_init_vector (iv_phi, new_vec, stepvectype, NULL);
3333 vec_def = induc_def;
3334 prev_stmt_vinfo = vinfo_for_stmt (induction_phi);
3335 for (i = 1; i < ncopies; i++)
3337 /* vec_i = vec_prev + vec_step */
3338 new_stmt = gimple_build_assign_with_ops (PLUS_EXPR, vec_dest,
3339 vec_def, vec_step);
3340 vec_def = make_ssa_name (vec_dest, new_stmt);
3341 gimple_assign_set_lhs (new_stmt, vec_def);
3343 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
3344 if (!useless_type_conversion_p (resvectype, vectype))
3346 new_stmt = gimple_build_assign_with_ops
3347 (VIEW_CONVERT_EXPR,
3348 vect_get_new_vect_var (resvectype, vect_simple_var,
3349 "vec_iv_"),
3350 build1 (VIEW_CONVERT_EXPR, resvectype,
3351 gimple_assign_lhs (new_stmt)), NULL_TREE);
3352 gimple_assign_set_lhs (new_stmt,
3353 make_ssa_name
3354 (gimple_assign_lhs (new_stmt), new_stmt));
3355 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
3357 set_vinfo_for_stmt (new_stmt,
3358 new_stmt_vec_info (new_stmt, loop_vinfo, NULL));
3359 STMT_VINFO_RELATED_STMT (prev_stmt_vinfo) = new_stmt;
3360 prev_stmt_vinfo = vinfo_for_stmt (new_stmt);
3364 if (nested_in_vect_loop)
3366 /* Find the loop-closed exit-phi of the induction, and record
3367 the final vector of induction results: */
3368 exit_phi = NULL;
3369 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
3371 if (!flow_bb_inside_loop_p (iv_loop, gimple_bb (USE_STMT (use_p))))
3373 exit_phi = USE_STMT (use_p);
3374 break;
3377 if (exit_phi)
3379 stmt_vec_info stmt_vinfo = vinfo_for_stmt (exit_phi);
3380 /* FORNOW. Currently not supporting the case that an inner-loop induction
3381 is not used in the outer-loop (i.e. only outside the outer-loop). */
3382 gcc_assert (STMT_VINFO_RELEVANT_P (stmt_vinfo)
3383 && !STMT_VINFO_LIVE_P (stmt_vinfo));
3385 STMT_VINFO_VEC_STMT (stmt_vinfo) = new_stmt;
3386 if (dump_enabled_p ())
3388 dump_printf_loc (MSG_NOTE, vect_location,
3389 "vector of inductions after inner-loop:");
3390 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, new_stmt, 0);
3396 if (dump_enabled_p ())
3398 dump_printf_loc (MSG_NOTE, vect_location,
3399 "transform induction: created def-use cycle: ");
3400 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, induction_phi, 0);
3401 dump_printf (MSG_NOTE, "\n");
3402 dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
3403 SSA_NAME_DEF_STMT (vec_def), 0);
3406 STMT_VINFO_VEC_STMT (phi_info) = induction_phi;
3407 if (!useless_type_conversion_p (resvectype, vectype))
3409 new_stmt = gimple_build_assign_with_ops
3410 (VIEW_CONVERT_EXPR,
3411 vect_get_new_vect_var (resvectype, vect_simple_var, "vec_iv_"),
3412 build1 (VIEW_CONVERT_EXPR, resvectype, induc_def), NULL_TREE);
3413 induc_def = make_ssa_name (gimple_assign_lhs (new_stmt), new_stmt);
3414 gimple_assign_set_lhs (new_stmt, induc_def);
3415 si = gsi_after_labels (bb);
3416 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
3417 set_vinfo_for_stmt (new_stmt,
3418 new_stmt_vec_info (new_stmt, loop_vinfo, NULL));
3419 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (new_stmt))
3420 = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (induction_phi));
3423 return induc_def;
3427 /* Function get_initial_def_for_reduction
3429 Input:
3430 STMT - a stmt that performs a reduction operation in the loop.
3431 INIT_VAL - the initial value of the reduction variable
3433 Output:
3434 ADJUSTMENT_DEF - a tree that holds a value to be added to the final result
3435 of the reduction (used for adjusting the epilog - see below).
3436 Return a vector variable, initialized according to the operation that STMT
3437 performs. This vector will be used as the initial value of the
3438 vector of partial results.
3440 Option1 (adjust in epilog): Initialize the vector as follows:
3441 add/bit or/xor: [0,0,...,0,0]
3442 mult/bit and: [1,1,...,1,1]
3443 min/max/cond_expr: [init_val,init_val,..,init_val,init_val]
3444 and when necessary (e.g. add/mult case) let the caller know
3445 that it needs to adjust the result by init_val.
3447 Option2: Initialize the vector as follows:
3448 add/bit or/xor: [init_val,0,0,...,0]
3449 mult/bit and: [init_val,1,1,...,1]
3450 min/max/cond_expr: [init_val,init_val,...,init_val]
3451 and no adjustments are needed.
3453 For example, for the following code:
3455 s = init_val;
3456 for (i=0;i<n;i++)
3457 s = s + a[i];
3459 STMT is 's = s + a[i]', and the reduction variable is 's'.
3460 For a vector of 4 units, we want to return either [0,0,0,init_val],
3461 or [0,0,0,0] and let the caller know that it needs to adjust
3462 the result at the end by 'init_val'.
3464 FORNOW, we use Option1 (the 'adjust in epilog' scheme) when ADJUSTMENT_DEF
3465 is not NULL, because this way the initialization vector is simpler (same
3466 element in all entries), and Option2 otherwise.
3468 A cost model should help decide between these two schemes. */
3470 tree
3471 get_initial_def_for_reduction (gimple stmt, tree init_val,
3472 tree *adjustment_def)
3474 stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
3475 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
3476 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3477 tree scalar_type = TREE_TYPE (init_val);
3478 tree vectype = get_vectype_for_scalar_type (scalar_type);
3479 int nunits;
3480 enum tree_code code = gimple_assign_rhs_code (stmt);
3481 tree def_for_init;
3482 tree init_def;
3483 tree *elts;
3484 int i;
3485 bool nested_in_vect_loop = false;
3486 tree init_value;
3487 REAL_VALUE_TYPE real_init_val = dconst0;
3488 int int_init_val = 0;
3489 gimple def_stmt = NULL;
3491 gcc_assert (vectype);
3492 nunits = TYPE_VECTOR_SUBPARTS (vectype);
3494 gcc_assert (POINTER_TYPE_P (scalar_type) || INTEGRAL_TYPE_P (scalar_type)
3495 || SCALAR_FLOAT_TYPE_P (scalar_type));
3497 if (nested_in_vect_loop_p (loop, stmt))
3498 nested_in_vect_loop = true;
3499 else
3500 gcc_assert (loop == (gimple_bb (stmt))->loop_father);
3502 /* In case of double reduction we only create a vector variable to be put
3503 in the reduction phi node. The actual statement creation is done in
3504 vect_create_epilog_for_reduction. */
3505 if (adjustment_def && nested_in_vect_loop
3506 && TREE_CODE (init_val) == SSA_NAME
3507 && (def_stmt = SSA_NAME_DEF_STMT (init_val))
3508 && gimple_code (def_stmt) == GIMPLE_PHI
3509 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
3510 && vinfo_for_stmt (def_stmt)
3511 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
3512 == vect_double_reduction_def)
3514 *adjustment_def = NULL;
3515 return vect_create_destination_var (init_val, vectype);
3518 if (TREE_CONSTANT (init_val))
3520 if (SCALAR_FLOAT_TYPE_P (scalar_type))
3521 init_value = build_real (scalar_type, TREE_REAL_CST (init_val));
3522 else
3523 init_value = build_int_cst (scalar_type, TREE_INT_CST_LOW (init_val));
3525 else
3526 init_value = init_val;
3528 switch (code)
3530 case WIDEN_SUM_EXPR:
3531 case DOT_PROD_EXPR:
3532 case PLUS_EXPR:
3533 case MINUS_EXPR:
3534 case BIT_IOR_EXPR:
3535 case BIT_XOR_EXPR:
3536 case MULT_EXPR:
3537 case BIT_AND_EXPR:
3538 /* ADJUSTMENT_DEF is NULL when called from
3539 vect_create_epilog_for_reduction to vectorize double reduction. */
3540 if (adjustment_def)
3542 if (nested_in_vect_loop)
3543 *adjustment_def = vect_get_vec_def_for_operand (init_val, stmt,
3544 NULL);
3545 else
3546 *adjustment_def = init_val;
3549 if (code == MULT_EXPR)
3551 real_init_val = dconst1;
3552 int_init_val = 1;
3555 if (code == BIT_AND_EXPR)
3556 int_init_val = -1;
3558 if (SCALAR_FLOAT_TYPE_P (scalar_type))
3559 def_for_init = build_real (scalar_type, real_init_val);
3560 else
3561 def_for_init = build_int_cst (scalar_type, int_init_val);
3563 /* Create a vector of '0' or '1' except the first element. */
3564 elts = XALLOCAVEC (tree, nunits);
3565 for (i = nunits - 2; i >= 0; --i)
3566 elts[i + 1] = def_for_init;
3568 /* Option1: the first element is '0' or '1' as well. */
3569 if (adjustment_def)
3571 elts[0] = def_for_init;
3572 init_def = build_vector (vectype, elts);
3573 break;
3576 /* Option2: the first element is INIT_VAL. */
3577 elts[0] = init_val;
3578 if (TREE_CONSTANT (init_val))
3579 init_def = build_vector (vectype, elts);
3580 else
3582 vec<constructor_elt, va_gc> *v;
3583 vec_alloc (v, nunits);
3584 CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, init_val);
3585 for (i = 1; i < nunits; ++i)
3586 CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, elts[i]);
3587 init_def = build_constructor (vectype, v);
3590 break;
3592 case MIN_EXPR:
3593 case MAX_EXPR:
3594 case COND_EXPR:
3595 if (adjustment_def)
3597 *adjustment_def = NULL_TREE;
3598 init_def = vect_get_vec_def_for_operand (init_val, stmt, NULL);
3599 break;
3602 init_def = build_vector_from_val (vectype, init_value);
3603 break;
3605 default:
3606 gcc_unreachable ();
3609 return init_def;
3613 /* Function vect_create_epilog_for_reduction
3615 Create code at the loop-epilog to finalize the result of a reduction
3616 computation.
3618 VECT_DEFS is a list of vectors of partial results, i.e., the lhs's of vector
3619 reduction statements.
3620 STMT is the scalar reduction stmt that is being vectorized.
3621 NCOPIES is > 1 in case the vectorization factor (VF) is bigger than the
3622 number of elements that we can fit in a vectype (nunits). In this case
3623 we have to generate more than one vector stmt - i.e. - we need to "unroll"
3624 the vector stmt by a factor VF/nunits. For more details see documentation
3625 in vectorizable_operation.
3626 REDUC_CODE is the tree-code for the epilog reduction.
3627 REDUCTION_PHIS is a list of the phi-nodes that carry the reduction
3628 computation.
3629 REDUC_INDEX is the index of the operand in the right hand side of the
3630 statement that is defined by REDUCTION_PHI.
3631 DOUBLE_REDUC is TRUE if double reduction phi nodes should be handled.
3632 SLP_NODE is an SLP node containing a group of reduction statements. The
3633 first one in this group is STMT.
3635 This function:
3636 1. Creates the reduction def-use cycles: sets the arguments for
3637 REDUCTION_PHIS:
3638 The loop-entry argument is the vectorized initial-value of the reduction.
3639 The loop-latch argument is taken from VECT_DEFS - the vector of partial
3640 sums.
3641 2. "Reduces" each vector of partial results VECT_DEFS into a single result,
3642 by applying the operation specified by REDUC_CODE if available, or by
3643 other means (whole-vector shifts or a scalar loop).
3644 The function also creates a new phi node at the loop exit to preserve
3645 loop-closed form, as illustrated below.
3647 The flow at the entry to this function:
3649 loop:
3650 vec_def = phi <null, null> # REDUCTION_PHI
3651 VECT_DEF = vector_stmt # vectorized form of STMT
3652 s_loop = scalar_stmt # (scalar) STMT
3653 loop_exit:
3654 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
3655 use <s_out0>
3656 use <s_out0>
3658 The above is transformed by this function into:
3660 loop:
3661 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
3662 VECT_DEF = vector_stmt # vectorized form of STMT
3663 s_loop = scalar_stmt # (scalar) STMT
3664 loop_exit:
3665 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
3666 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
3667 v_out2 = reduce <v_out1>
3668 s_out3 = extract_field <v_out2, 0>
3669 s_out4 = adjust_result <s_out3>
3670 use <s_out4>
3671 use <s_out4>
3674 static void
3675 vect_create_epilog_for_reduction (vec<tree> vect_defs, gimple stmt,
3676 int ncopies, enum tree_code reduc_code,
3677 vec<gimple> reduction_phis,
3678 int reduc_index, bool double_reduc,
3679 slp_tree slp_node)
3681 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3682 stmt_vec_info prev_phi_info;
3683 tree vectype;
3684 enum machine_mode mode;
3685 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3686 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *outer_loop = NULL;
3687 basic_block exit_bb;
3688 tree scalar_dest;
3689 tree scalar_type;
3690 gimple new_phi = NULL, phi;
3691 gimple_stmt_iterator exit_gsi;
3692 tree vec_dest;
3693 tree new_temp = NULL_TREE, new_dest, new_name, new_scalar_dest;
3694 gimple epilog_stmt = NULL;
3695 enum tree_code code = gimple_assign_rhs_code (stmt);
3696 gimple exit_phi;
3697 tree bitsize, bitpos;
3698 tree adjustment_def = NULL;
3699 tree vec_initial_def = NULL;
3700 tree reduction_op, expr, def;
3701 tree orig_name, scalar_result;
3702 imm_use_iterator imm_iter, phi_imm_iter;
3703 use_operand_p use_p, phi_use_p;
3704 bool extract_scalar_result = false;
3705 gimple use_stmt, orig_stmt, reduction_phi = NULL;
3706 bool nested_in_vect_loop = false;
3707 vec<gimple> new_phis = vNULL;
3708 vec<gimple> inner_phis = vNULL;
3709 enum vect_def_type dt = vect_unknown_def_type;
3710 int j, i;
3711 vec<tree> scalar_results = vNULL;
3712 unsigned int group_size = 1, k, ratio;
3713 vec<tree> vec_initial_defs = vNULL;
3714 vec<gimple> phis;
3715 bool slp_reduc = false;
3716 tree new_phi_result;
3717 gimple inner_phi = NULL;
3719 if (slp_node)
3720 group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
3722 if (nested_in_vect_loop_p (loop, stmt))
3724 outer_loop = loop;
3725 loop = loop->inner;
3726 nested_in_vect_loop = true;
3727 gcc_assert (!slp_node);
3730 switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
3732 case GIMPLE_SINGLE_RHS:
3733 gcc_assert (TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt))
3734 == ternary_op);
3735 reduction_op = TREE_OPERAND (gimple_assign_rhs1 (stmt), reduc_index);
3736 break;
3737 case GIMPLE_UNARY_RHS:
3738 reduction_op = gimple_assign_rhs1 (stmt);
3739 break;
3740 case GIMPLE_BINARY_RHS:
3741 reduction_op = reduc_index ?
3742 gimple_assign_rhs2 (stmt) : gimple_assign_rhs1 (stmt);
3743 break;
3744 case GIMPLE_TERNARY_RHS:
3745 reduction_op = gimple_op (stmt, reduc_index + 1);
3746 break;
3747 default:
3748 gcc_unreachable ();
3751 vectype = get_vectype_for_scalar_type (TREE_TYPE (reduction_op));
3752 gcc_assert (vectype);
3753 mode = TYPE_MODE (vectype);
3755 /* 1. Create the reduction def-use cycle:
3756 Set the arguments of REDUCTION_PHIS, i.e., transform
3758 loop:
3759 vec_def = phi <null, null> # REDUCTION_PHI
3760 VECT_DEF = vector_stmt # vectorized form of STMT
3763 into:
3765 loop:
3766 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
3767 VECT_DEF = vector_stmt # vectorized form of STMT
3770 (in case of SLP, do it for all the phis). */
3772 /* Get the loop-entry arguments. */
3773 if (slp_node)
3774 vect_get_vec_defs (reduction_op, NULL_TREE, stmt, &vec_initial_defs,
3775 NULL, slp_node, reduc_index);
3776 else
3778 vec_initial_defs.create (1);
3779 /* For the case of reduction, vect_get_vec_def_for_operand returns
3780 the scalar def before the loop, that defines the initial value
3781 of the reduction variable. */
3782 vec_initial_def = vect_get_vec_def_for_operand (reduction_op, stmt,
3783 &adjustment_def);
3784 vec_initial_defs.quick_push (vec_initial_def);
3787 /* Set phi nodes arguments. */
3788 FOR_EACH_VEC_ELT (reduction_phis, i, phi)
3790 tree vec_init_def = vec_initial_defs[i];
3791 tree def = vect_defs[i];
3792 for (j = 0; j < ncopies; j++)
3794 /* Set the loop-entry arg of the reduction-phi. */
3795 add_phi_arg (phi, vec_init_def, loop_preheader_edge (loop),
3796 UNKNOWN_LOCATION);
3798 /* Set the loop-latch arg for the reduction-phi. */
3799 if (j > 0)
3800 def = vect_get_vec_def_for_stmt_copy (vect_unknown_def_type, def);
3802 add_phi_arg (phi, def, loop_latch_edge (loop), UNKNOWN_LOCATION);
3804 if (dump_enabled_p ())
3806 dump_printf_loc (MSG_NOTE, vect_location,
3807 "transform reduction: created def-use cycle: ");
3808 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
3809 dump_printf (MSG_NOTE, "\n");
3810 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, SSA_NAME_DEF_STMT (def), 0);
3813 phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi));
3817 vec_initial_defs.release ();
3819 /* 2. Create epilog code.
3820 The reduction epilog code operates across the elements of the vector
3821 of partial results computed by the vectorized loop.
3822 The reduction epilog code consists of:
3824 step 1: compute the scalar result in a vector (v_out2)
3825 step 2: extract the scalar result (s_out3) from the vector (v_out2)
3826 step 3: adjust the scalar result (s_out3) if needed.
3828 Step 1 can be accomplished using one of the following three schemes:
3829 (scheme 1) using reduc_code, if available.
3830 (scheme 2) using whole-vector shifts, if available.
3831 (scheme 3) using a scalar loop. In this case steps 1+2 above are
3832 combined.
3834 The overall epilog code looks like this:
3836 s_out0 = phi <s_loop> # original EXIT_PHI
3837 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
3838 v_out2 = reduce <v_out1> # step 1
3839 s_out3 = extract_field <v_out2, 0> # step 2
3840 s_out4 = adjust_result <s_out3> # step 3
3842 (step 3 is optional, and steps 1 and 2 may be combined).
3843 Lastly, the uses of s_out0 are replaced by s_out4. */
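/* As a rough guide to the dispatch below (a sketch, not an exhaustive
   rule): scheme 1 is used when REDUC_CODE is supported by the target and
   this is not an SLP reduction; scheme 2 when a whole-vector shift
   (vec_shr_optab) is available, again only outside of SLP; otherwise
   scheme 3, which for SLP extracts every element without combining
   them.  */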
3846 /* 2.1 Create new loop-exit-phis to preserve loop-closed form:
3847 v_out1 = phi <VECT_DEF>
3848 Store them in NEW_PHIS. */
3850 exit_bb = single_exit (loop)->dest;
3851 prev_phi_info = NULL;
3852 new_phis.create (vect_defs.length ());
3853 FOR_EACH_VEC_ELT (vect_defs, i, def)
3855 for (j = 0; j < ncopies; j++)
3857 tree new_def = copy_ssa_name (def, NULL);
3858 phi = create_phi_node (new_def, exit_bb);
3859 set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, loop_vinfo, NULL));
3860 if (j == 0)
3861 new_phis.quick_push (phi);
3862 else
3864 def = vect_get_vec_def_for_stmt_copy (dt, def);
3865 STMT_VINFO_RELATED_STMT (prev_phi_info) = phi;
3868 SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, def);
3869 prev_phi_info = vinfo_for_stmt (phi);
3873 /* The epilogue is created for the outer-loop, i.e., for the loop being
3874 vectorized. Create exit phis for the outer loop. */
3875 if (double_reduc)
3877 loop = outer_loop;
3878 exit_bb = single_exit (loop)->dest;
3879 inner_phis.create (vect_defs.length ());
3880 FOR_EACH_VEC_ELT (new_phis, i, phi)
3882 tree new_result = copy_ssa_name (PHI_RESULT (phi), NULL);
3883 gimple outer_phi = create_phi_node (new_result, exit_bb);
3884 SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx,
3885 PHI_RESULT (phi));
3886 set_vinfo_for_stmt (outer_phi, new_stmt_vec_info (outer_phi,
3887 loop_vinfo, NULL));
3888 inner_phis.quick_push (phi);
3889 new_phis[i] = outer_phi;
3890 prev_phi_info = vinfo_for_stmt (outer_phi);
3891 while (STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi)))
3893 phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi));
3894 new_result = copy_ssa_name (PHI_RESULT (phi), NULL);
3895 outer_phi = create_phi_node (new_result, exit_bb);
3896 SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx,
3897 PHI_RESULT (phi));
3898 set_vinfo_for_stmt (outer_phi, new_stmt_vec_info (outer_phi,
3899 loop_vinfo, NULL));
3900 STMT_VINFO_RELATED_STMT (prev_phi_info) = outer_phi;
3901 prev_phi_info = vinfo_for_stmt (outer_phi);
3906 exit_gsi = gsi_after_labels (exit_bb);
3908 /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
3909 (i.e. when reduc_code is not available) and in the final adjustment
3910 code (if needed). Also get the original scalar reduction variable as
3911 defined in the loop. In case STMT is a "pattern-stmt" (i.e. - it
3912 represents a reduction pattern), the tree-code and scalar-def are
3913 taken from the original stmt that the pattern-stmt (STMT) replaces.
3914 Otherwise (it is a regular reduction) - the tree-code and scalar-def
3915 are taken from STMT. */
3917 orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
3918 if (!orig_stmt)
3920 /* Regular reduction */
3921 orig_stmt = stmt;
3923 else
3925 /* Reduction pattern */
3926 stmt_vec_info stmt_vinfo = vinfo_for_stmt (orig_stmt);
3927 gcc_assert (STMT_VINFO_IN_PATTERN_P (stmt_vinfo));
3928 gcc_assert (STMT_VINFO_RELATED_STMT (stmt_vinfo) == stmt);
3931 code = gimple_assign_rhs_code (orig_stmt);
3932 /* For MINUS_EXPR the initial vector is [init_val,0,...,0], therefore,
3933 partial results are added and not subtracted. */
3934 if (code == MINUS_EXPR)
3935 code = PLUS_EXPR;
3937 scalar_dest = gimple_assign_lhs (orig_stmt);
3938 scalar_type = TREE_TYPE (scalar_dest);
3939 scalar_results.create (group_size);
3940 new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
3941 bitsize = TYPE_SIZE (scalar_type);
3943 /* In case this is a reduction in an inner-loop while vectorizing an outer
3944 loop - we don't need to extract a single scalar result at the end of the
3945 inner-loop (unless it is a double reduction, i.e., the reduction is used
3946 outside the outer-loop). The final vector of partial results will be used
3947 in the vectorized outer-loop, or reduced to a scalar result at the end of
3948 the outer-loop. */
3949 if (nested_in_vect_loop && !double_reduc)
3950 goto vect_finalize_reduction;
3952 /* SLP reduction without reduction chain, e.g.,
3953 # a1 = phi <a2, a0>
3954 # b1 = phi <b2, b0>
3955 a2 = operation (a1)
3956 b2 = operation (b1) */
3957 slp_reduc = (slp_node && !GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)));
3959 /* In case of reduction chain, e.g.,
3960 # a1 = phi <a3, a0>
3961 a2 = operation (a1)
3962 a3 = operation (a2),
3964 we may end up with more than one vector result. Here we reduce them to
3965 one vector. */
3966 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
3968 tree first_vect = PHI_RESULT (new_phis[0]);
3969 tree tmp;
3970 gimple new_vec_stmt = NULL;
3972 vec_dest = vect_create_destination_var (scalar_dest, vectype);
3973 for (k = 1; k < new_phis.length (); k++)
3975 gimple next_phi = new_phis[k];
3976 tree second_vect = PHI_RESULT (next_phi);
3978 tmp = build2 (code, vectype, first_vect, second_vect);
3979 new_vec_stmt = gimple_build_assign (vec_dest, tmp);
3980 first_vect = make_ssa_name (vec_dest, new_vec_stmt);
3981 gimple_assign_set_lhs (new_vec_stmt, first_vect);
3982 gsi_insert_before (&exit_gsi, new_vec_stmt, GSI_SAME_STMT);
3985 new_phi_result = first_vect;
3986 if (new_vec_stmt)
3988 new_phis.truncate (0);
3989 new_phis.safe_push (new_vec_stmt);
3992 else
3993 new_phi_result = PHI_RESULT (new_phis[0]);
3995 /* 2.3 Create the reduction code, using one of the three schemes described
3996 above. In SLP we simply need to extract all the elements from the
3997 vector (without reducing them), so we use the scalar extraction scheme (case 3).
3998 if (reduc_code != ERROR_MARK && !slp_reduc)
4000 tree tmp;
4002 /*** Case 1: Create:
4003 v_out2 = reduc_expr <v_out1> */
4005 if (dump_enabled_p ())
4006 dump_printf_loc (MSG_NOTE, vect_location,
4007 "Reduce using direct vector reduction.");
4009 vec_dest = vect_create_destination_var (scalar_dest, vectype);
4010 tmp = build1 (reduc_code, vectype, new_phi_result);
4011 epilog_stmt = gimple_build_assign (vec_dest, tmp);
4012 new_temp = make_ssa_name (vec_dest, epilog_stmt);
4013 gimple_assign_set_lhs (epilog_stmt, new_temp);
4014 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4016 extract_scalar_result = true;
4018 else
4020 enum tree_code shift_code = ERROR_MARK;
4021 bool have_whole_vector_shift = true;
4022 int bit_offset;
4023 int element_bitsize = tree_low_cst (bitsize, 1);
4024 int vec_size_in_bits = tree_low_cst (TYPE_SIZE (vectype), 1);
4025 tree vec_temp;
4027 if (optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
4028 shift_code = VEC_RSHIFT_EXPR;
4029 else
4030 have_whole_vector_shift = false;
4032 /* Regardless of whether we have a whole vector shift, if we're
4033 emulating the operation via tree-vect-generic, we don't want
4034 to use it. Only the first round of the reduction is likely
4035 to still be profitable via emulation. */
4036 /* ??? It might be better to emit a reduction tree code here, so that
4037 tree-vect-generic can expand the first round via bit tricks. */
4038 if (!VECTOR_MODE_P (mode))
4039 have_whole_vector_shift = false;
4040 else
4042 optab optab = optab_for_tree_code (code, vectype, optab_default);
4043 if (optab_handler (optab, mode) == CODE_FOR_nothing)
4044 have_whole_vector_shift = false;
4047 if (have_whole_vector_shift && !slp_reduc)
4049 /*** Case 2: Create:
4050 for (offset = VS/2; offset >= element_size; offset/=2)
4052 Create: va' = vec_shift <va, offset>
4053 Create: va = vop <va, va'>
4054 } */
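/* Illustrative trace (element names made up, endianness and the exact
   shift direction ignored) for a V4SI add reduction with
   va = {a0, a1, a2, a3}:
     offset 64: va' = {a2, a3, _, _};      va = {a0+a2, a1+a3, _, _}
     offset 32: va' = {a1+a3, _, _, _};    va = {a0+a1+a2+a3, _, _, _}
   so the full sum ends up in the element extracted afterwards.  */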
4056 if (dump_enabled_p ())
4057 dump_printf_loc (MSG_NOTE, vect_location,
4058 "Reduce using vector shifts");
4060 vec_dest = vect_create_destination_var (scalar_dest, vectype);
4061 new_temp = new_phi_result;
4062 for (bit_offset = vec_size_in_bits/2;
4063 bit_offset >= element_bitsize;
4064 bit_offset /= 2)
4066 tree bitpos = size_int (bit_offset);
4068 epilog_stmt = gimple_build_assign_with_ops (shift_code,
4069 vec_dest, new_temp, bitpos);
4070 new_name = make_ssa_name (vec_dest, epilog_stmt);
4071 gimple_assign_set_lhs (epilog_stmt, new_name);
4072 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4074 epilog_stmt = gimple_build_assign_with_ops (code, vec_dest,
4075 new_name, new_temp);
4076 new_temp = make_ssa_name (vec_dest, epilog_stmt);
4077 gimple_assign_set_lhs (epilog_stmt, new_temp);
4078 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4081 extract_scalar_result = true;
4083 else
4085 tree rhs;
4087 /*** Case 3: Create:
4088 s = extract_field <v_out2, 0>
4089 for (offset = element_size;
4090 offset < vector_size;
4091 offset += element_size;)
4093 Create: s' = extract_field <v_out2, offset>
4094 Create: s = op <s, s'> // For non SLP cases
4095 } */
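/* Illustrative trace (made-up names) for a V4SI add reduction with
   v_out2 = {a0, a1, a2, a3}:
     s = a0;  s = s + a1;  s = s + a2;  s = s + a3;
   In the SLP case no combining is done; a0..a3 are simply pushed into
   SCALAR_RESULTS one by one.  */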
4097 if (dump_enabled_p ())
4098 dump_printf_loc (MSG_NOTE, vect_location,
4099 "Reduce using scalar code. ");
4101 vec_size_in_bits = tree_low_cst (TYPE_SIZE (vectype), 1);
4102 FOR_EACH_VEC_ELT (new_phis, i, new_phi)
4104 if (gimple_code (new_phi) == GIMPLE_PHI)
4105 vec_temp = PHI_RESULT (new_phi);
4106 else
4107 vec_temp = gimple_assign_lhs (new_phi);
4108 rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, bitsize,
4109 bitsize_zero_node);
4110 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
4111 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
4112 gimple_assign_set_lhs (epilog_stmt, new_temp);
4113 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4115 /* In SLP we don't need to apply the reduction operation, so we just
4116 collect s' values in SCALAR_RESULTS. */
4117 if (slp_reduc)
4118 scalar_results.safe_push (new_temp);
4120 for (bit_offset = element_bitsize;
4121 bit_offset < vec_size_in_bits;
4122 bit_offset += element_bitsize)
4124 tree bitpos = bitsize_int (bit_offset);
4125 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp,
4126 bitsize, bitpos);
4128 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
4129 new_name = make_ssa_name (new_scalar_dest, epilog_stmt);
4130 gimple_assign_set_lhs (epilog_stmt, new_name);
4131 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4133 if (slp_reduc)
4135 /* In SLP we don't need to apply the reduction operation, so
4136 we just collect s' values in SCALAR_RESULTS. */
4137 new_temp = new_name;
4138 scalar_results.safe_push (new_name);
4140 else
4142 epilog_stmt = gimple_build_assign_with_ops (code,
4143 new_scalar_dest, new_name, new_temp);
4144 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
4145 gimple_assign_set_lhs (epilog_stmt, new_temp);
4146 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4151 /* The only case where we need to reduce scalar results in SLP is
4152 unrolling. If the size of SCALAR_RESULTS is greater than
4153 GROUP_SIZE, we reduce them combining elements modulo
4154 GROUP_SIZE. */
4155 if (slp_reduc)
4157 tree res, first_res, new_res;
4158 gimple new_stmt;
4160 /* Reduce multiple scalar results in case of SLP unrolling. */
4161 for (j = group_size; scalar_results.iterate (j, &res);
4162 j++)
4164 first_res = scalar_results[j % group_size];
4165 new_stmt = gimple_build_assign_with_ops (code,
4166 new_scalar_dest, first_res, res);
4167 new_res = make_ssa_name (new_scalar_dest, new_stmt);
4168 gimple_assign_set_lhs (new_stmt, new_res);
4169 gsi_insert_before (&exit_gsi, new_stmt, GSI_SAME_STMT);
4170 scalar_results[j % group_size] = new_res;
4173 else
4174 /* Not SLP - we have one scalar to keep in SCALAR_RESULTS. */
4175 scalar_results.safe_push (new_temp);
4177 extract_scalar_result = false;
4181 /* 2.4 Extract the final scalar result. Create:
4182 s_out3 = extract_field <v_out2, bitpos> */
4184 if (extract_scalar_result)
4186 tree rhs;
4188 if (dump_enabled_p ())
4189 dump_printf_loc (MSG_NOTE, vect_location,
4190 "extract scalar result");
4192 if (BYTES_BIG_ENDIAN)
4193 bitpos = size_binop (MULT_EXPR,
4194 bitsize_int (TYPE_VECTOR_SUBPARTS (vectype) - 1),
4195 TYPE_SIZE (scalar_type));
4196 else
4197 bitpos = bitsize_zero_node;
4199 rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp, bitsize, bitpos);
4200 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
4201 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
4202 gimple_assign_set_lhs (epilog_stmt, new_temp);
4203 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4204 scalar_results.safe_push (new_temp);
4207 vect_finalize_reduction:
4209 if (double_reduc)
4210 loop = loop->inner;
4212 /* 2.5 Adjust the final result by the initial value of the reduction
4213 variable. (When such adjustment is not needed, then
4214 'adjustment_def' is zero). For example, if code is PLUS we create:
4215 new_temp = loop_exit_def + adjustment_def */
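/* E.g. (a sketch of the common non-nested PLUS case): for
     s = INIT; for (...) s += a[i];
   the reduction phi was seeded with {0,...,0} and ADJUSTMENT_DEF is
   INIT, so INIT is added back to the reduced scalar here.  */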
4217 if (adjustment_def)
4219 gcc_assert (!slp_reduc);
4220 if (nested_in_vect_loop)
4222 new_phi = new_phis[0];
4223 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) == VECTOR_TYPE);
4224 expr = build2 (code, vectype, PHI_RESULT (new_phi), adjustment_def);
4225 new_dest = vect_create_destination_var (scalar_dest, vectype);
4227 else
4229 new_temp = scalar_results[0];
4230 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
4231 expr = build2 (code, scalar_type, new_temp, adjustment_def);
4232 new_dest = vect_create_destination_var (scalar_dest, scalar_type);
4235 epilog_stmt = gimple_build_assign (new_dest, expr);
4236 new_temp = make_ssa_name (new_dest, epilog_stmt);
4237 gimple_assign_set_lhs (epilog_stmt, new_temp);
4238 SSA_NAME_DEF_STMT (new_temp) = epilog_stmt;
4239 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4240 if (nested_in_vect_loop)
4242 set_vinfo_for_stmt (epilog_stmt,
4243 new_stmt_vec_info (epilog_stmt, loop_vinfo,
4244 NULL));
4245 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (epilog_stmt)) =
4246 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (new_phi));
4248 if (!double_reduc)
4249 scalar_results.quick_push (new_temp);
4250 else
4251 scalar_results[0] = new_temp;
4253 else
4254 scalar_results[0] = new_temp;
4256 new_phis[0] = epilog_stmt;
4259 /* 2.6 Handle the loop-exit phis. Replace the uses of scalar loop-exit
4260 phis with new adjusted scalar results, i.e., replace use <s_out0>
4261 with use <s_out4>.
4263 Transform:
4264 loop_exit:
4265 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
4266 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
4267 v_out2 = reduce <v_out1>
4268 s_out3 = extract_field <v_out2, 0>
4269 s_out4 = adjust_result <s_out3>
4270 use <s_out0>
4271 use <s_out0>
4273 into:
4275 loop_exit:
4276 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
4277 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
4278 v_out2 = reduce <v_out1>
4279 s_out3 = extract_field <v_out2, 0>
4280 s_out4 = adjust_result <s_out3>
4281 use <s_out4>
4282 use <s_out4> */
4285 /* In an SLP reduction chain we reduce vector results into one vector if
4286 necessary, hence we set GROUP_SIZE to 1 here. SCALAR_DEST is the LHS of
4287 the last stmt in the reduction chain, since we are looking for the loop
4288 exit phi node. */
4289 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
4291 scalar_dest = gimple_assign_lhs (
4292 SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1]);
4293 group_size = 1;
4296 /* In SLP we may have several statements in NEW_PHIS and REDUCTION_PHIS (in
4297 case that GROUP_SIZE is greater than the vectorization factor). Therefore, we
4298 need to match SCALAR_RESULTS with corresponding statements. The first
4299 (GROUP_SIZE / number of new vector stmts) scalar results correspond to
4300 the first vector stmt, etc.
4301 (RATIO is equal to (GROUP_SIZE / number of new vector stmts)). */
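/* E.g. (illustrative numbers): GROUP_SIZE == 4 with two new vector stmts
   gives RATIO == 2, so scalar_results[0..1] pair with new_phis[0] and
   scalar_results[2..3] with new_phis[1].  */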
4302 if (group_size > new_phis.length ())
4304 ratio = group_size / new_phis.length ();
4305 gcc_assert (!(group_size % new_phis.length ()));
4307 else
4308 ratio = 1;
4310 for (k = 0; k < group_size; k++)
4312 if (k % ratio == 0)
4314 epilog_stmt = new_phis[k / ratio];
4315 reduction_phi = reduction_phis[k / ratio];
4316 if (double_reduc)
4317 inner_phi = inner_phis[k / ratio];
4320 if (slp_reduc)
4322 gimple current_stmt = SLP_TREE_SCALAR_STMTS (slp_node)[k];
4324 orig_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (current_stmt));
4325 /* SLP statements can't participate in patterns. */
4326 gcc_assert (!orig_stmt);
4327 scalar_dest = gimple_assign_lhs (current_stmt);
4330 phis.create (3);
4331 /* Find the loop-closed-use at the loop exit of the original scalar
4332 result. (The reduction result is expected to have two immediate uses -
4333 one at the latch block, and one at the loop exit). */
4334 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
4335 if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
4336 phis.safe_push (USE_STMT (use_p));
4338 /* We expect to have found an exit_phi because of loop-closed-ssa
4339 form. */
4340 gcc_assert (!phis.is_empty ());
4342 FOR_EACH_VEC_ELT (phis, i, exit_phi)
4344 if (outer_loop)
4346 stmt_vec_info exit_phi_vinfo = vinfo_for_stmt (exit_phi);
4347 gimple vect_phi;
4349 /* FORNOW. Currently not supporting the case that an inner-loop
4350 reduction is not used in the outer-loop (but only outside the
4351 outer-loop), unless it is a double reduction. */
4352 gcc_assert ((STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
4353 && !STMT_VINFO_LIVE_P (exit_phi_vinfo))
4354 || double_reduc);
4356 STMT_VINFO_VEC_STMT (exit_phi_vinfo) = epilog_stmt;
4357 if (!double_reduc
4358 || STMT_VINFO_DEF_TYPE (exit_phi_vinfo)
4359 != vect_double_reduction_def)
4360 continue;
4362 /* Handle double reduction:
4364 stmt1: s1 = phi <s0, s2> - double reduction phi (outer loop)
4365 stmt2: s3 = phi <s1, s4> - (regular) reduc phi (inner loop)
4366 stmt3: s4 = use (s3) - (regular) reduc stmt (inner loop)
4367 stmt4: s2 = phi <s4> - double reduction stmt (outer loop)
4369 At that point the regular reduction (stmt2 and stmt3) is
4370 already vectorized, as well as the exit phi node, stmt4.
4371 Here we vectorize the phi node of double reduction, stmt1, and
4372 update all relevant statements. */
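/* At the source level a double reduction looks roughly like
   (illustrative only):

     s = INIT;
     for (i = 0; i < N; i++)       <-- outer loop, being vectorized
       for (j = 0; j < M; j++)     <-- inner loop
         s += a[i][j];

   stmt1 is the phi of s in the outer-loop header; stmt2/stmt3 form the
   inner-loop reduction that has already been vectorized.  */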
4374 /* Go through all the uses of s2 to find double reduction phi
4375 node, i.e., stmt1 above. */
4376 orig_name = PHI_RESULT (exit_phi);
4377 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
4379 stmt_vec_info use_stmt_vinfo;
4380 stmt_vec_info new_phi_vinfo;
4381 tree vect_phi_init, preheader_arg, vect_phi_res, init_def;
4382 basic_block bb = gimple_bb (use_stmt);
4383 gimple use;
4385 /* Check that USE_STMT is really double reduction phi
4386 node. */
4387 if (gimple_code (use_stmt) != GIMPLE_PHI
4388 || gimple_phi_num_args (use_stmt) != 2
4389 || bb->loop_father != outer_loop)
4390 continue;
4391 use_stmt_vinfo = vinfo_for_stmt (use_stmt);
4392 if (!use_stmt_vinfo
4393 || STMT_VINFO_DEF_TYPE (use_stmt_vinfo)
4394 != vect_double_reduction_def)
4395 continue;
4397 /* Create vector phi node for double reduction:
4398 vs1 = phi <vs0, vs2>
4399 vs1 was created previously in this function by a call to
4400 vect_get_vec_def_for_operand and is stored in
4401 vec_initial_def;
4402 vs2 is defined by INNER_PHI, the vectorized EXIT_PHI;
4403 vs0 is created here. */
4405 /* Create vector phi node. */
4406 vect_phi = create_phi_node (vec_initial_def, bb);
4407 new_phi_vinfo = new_stmt_vec_info (vect_phi,
4408 loop_vec_info_for_loop (outer_loop), NULL);
4409 set_vinfo_for_stmt (vect_phi, new_phi_vinfo);
4411 /* Create vs0 - initial def of the double reduction phi. */
4412 preheader_arg = PHI_ARG_DEF_FROM_EDGE (use_stmt,
4413 loop_preheader_edge (outer_loop));
4414 init_def = get_initial_def_for_reduction (stmt,
4415 preheader_arg, NULL);
4416 vect_phi_init = vect_init_vector (use_stmt, init_def,
4417 vectype, NULL);
4419 /* Update phi node arguments with vs0 and vs2. */
4420 add_phi_arg (vect_phi, vect_phi_init,
4421 loop_preheader_edge (outer_loop),
4422 UNKNOWN_LOCATION);
4423 add_phi_arg (vect_phi, PHI_RESULT (inner_phi),
4424 loop_latch_edge (outer_loop), UNKNOWN_LOCATION);
4425 if (dump_enabled_p ())
4427 dump_printf_loc (MSG_NOTE, vect_location,
4428 "created double reduction phi node: ");
4429 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, vect_phi, 0);
4432 vect_phi_res = PHI_RESULT (vect_phi);
4434 /* Replace the use, i.e., set the correct vs1 in the regular
4435 reduction phi node. FORNOW, NCOPIES is always 1, so the
4436 loop is redundant. */
4437 use = reduction_phi;
4438 for (j = 0; j < ncopies; j++)
4440 edge pr_edge = loop_preheader_edge (loop);
4441 SET_PHI_ARG_DEF (use, pr_edge->dest_idx, vect_phi_res);
4442 use = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (use));
4448 phis.release ();
4449 if (nested_in_vect_loop)
4451 if (double_reduc)
4452 loop = outer_loop;
4453 else
4454 continue;
4457 phis.create (3);
4458 /* Find the loop-closed-use at the loop exit of the original scalar
4459 result. (The reduction result is expected to have two immediate uses,
4460 one at the latch block, and one at the loop exit). For double
4461 reductions we are looking for exit phis of the outer loop. */
4462 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
4464 if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
4465 phis.safe_push (USE_STMT (use_p));
4466 else
4468 if (double_reduc && gimple_code (USE_STMT (use_p)) == GIMPLE_PHI)
4470 tree phi_res = PHI_RESULT (USE_STMT (use_p));
4472 FOR_EACH_IMM_USE_FAST (phi_use_p, phi_imm_iter, phi_res)
4474 if (!flow_bb_inside_loop_p (loop,
4475 gimple_bb (USE_STMT (phi_use_p))))
4476 phis.safe_push (USE_STMT (phi_use_p));
4482 FOR_EACH_VEC_ELT (phis, i, exit_phi)
4484 /* Replace the uses: */
4485 orig_name = PHI_RESULT (exit_phi);
4486 scalar_result = scalar_results[k];
4487 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
4488 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
4489 SET_USE (use_p, scalar_result);
4492 phis.release ();
4495 scalar_results.release ();
4496 inner_phis.release ();
4497 new_phis.release ();
4501 /* Function vectorizable_reduction.
4503 Check if STMT performs a reduction operation that can be vectorized.
4504 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
4505 stmt to replace it, put it in VEC_STMT, and insert it at GSI.
4506 Return FALSE if not a vectorizable STMT, TRUE otherwise.
4508 This function also handles reduction idioms (patterns) that have been
4509 recognized in advance during vect_pattern_recog. In this case, STMT may be
4510 of this form:
4511 X = pattern_expr (arg0, arg1, ..., X)
4512 and its STMT_VINFO_RELATED_STMT points to the last stmt in the original
4513 sequence that had been detected and replaced by the pattern-stmt (STMT).
4515 In some cases of reduction patterns, the type of the reduction variable X is
4516 different than the type of the other arguments of STMT.
4517 In such cases, the vectype that is used when transforming STMT into a vector
4518 stmt is different than the vectype that is used to determine the
4519 vectorization factor, because it consists of a different number of elements
4520 than the actual number of elements that are being operated upon in parallel.
4522 For example, consider an accumulation of shorts into an int accumulator.
4523 On some targets it's possible to vectorize this pattern operating on 8
4524 shorts at a time (hence, the vectype for purposes of determining the
4525 vectorization factor should be V8HI); on the other hand, the vectype that
4526 is used to create the vector form is actually V4SI (the type of the result).
4528 Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
4529 indicates what is the actual level of parallelism (V8HI in the example), so
4530 that the right vectorization factor would be derived. This vectype
4531 corresponds to the type of arguments to the reduction stmt, and should *NOT*
4532 be used to create the vectorized stmt. The right vectype for the vectorized
4533 stmt is obtained from the type of the result X:
4534 get_vectype_for_scalar_type (TREE_TYPE (X))
4536 This means that, contrary to "regular" reductions (or "regular" stmts in
4537 general), the following equation:
4538 STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (TREE_TYPE (X))
4539 does *NOT* necessarily hold for reduction patterns. */
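/* An illustrative source-level instance of such a pattern (assuming the
   target provides a widening-sum operation):

     short b[N];
     int sum = 0;
     for (i = 0; i < N; i++)
       sum += b[i];              <-- recognized as a widen-sum pattern

   The vectype used to compute the vectorization factor is V8HI (the type
   of b[i]), while the vectorized stmt itself produces V4SI values (the
   type of sum).  */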
4541 bool
4542 vectorizable_reduction (gimple stmt, gimple_stmt_iterator *gsi,
4543 gimple *vec_stmt, slp_tree slp_node)
4545 tree vec_dest;
4546 tree scalar_dest;
4547 tree loop_vec_def0 = NULL_TREE, loop_vec_def1 = NULL_TREE;
4548 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4549 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
4550 tree vectype_in = NULL_TREE;
4551 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4552 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4553 enum tree_code code, orig_code, epilog_reduc_code;
4554 enum machine_mode vec_mode;
4555 int op_type;
4556 optab optab, reduc_optab;
4557 tree new_temp = NULL_TREE;
4558 tree def;
4559 gimple def_stmt;
4560 enum vect_def_type dt;
4561 gimple new_phi = NULL;
4562 tree scalar_type;
4563 bool is_simple_use;
4564 gimple orig_stmt;
4565 stmt_vec_info orig_stmt_info;
4566 tree expr = NULL_TREE;
4567 int i;
4568 int ncopies;
4569 int epilog_copies;
4570 stmt_vec_info prev_stmt_info, prev_phi_info;
4571 bool single_defuse_cycle = false;
4572 tree reduc_def = NULL_TREE;
4573 gimple new_stmt = NULL;
4574 int j;
4575 tree ops[3];
4576 bool nested_cycle = false, found_nested_cycle_def = false;
4577 gimple reduc_def_stmt = NULL;
4578 /* The default is that the reduction variable is the last in statement. */
4579 int reduc_index = 2;
4580 bool double_reduc = false, dummy;
4581 basic_block def_bb;
4582 struct loop * def_stmt_loop, *outer_loop = NULL;
4583 tree def_arg;
4584 gimple def_arg_stmt;
4585 vec<tree> vec_oprnds0 = vNULL;
4586 vec<tree> vec_oprnds1 = vNULL;
4587 vec<tree> vect_defs = vNULL;
4588 vec<gimple> phis = vNULL;
4589 int vec_num;
4590 tree def0, def1, tem, op0, op1 = NULL_TREE;
4592 /* In case of reduction chain we switch to the first stmt in the chain, but
4593 we don't update STMT_INFO, since only the last stmt is marked as reduction
4594 and has reduction properties. */
4595 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
4596 stmt = GROUP_FIRST_ELEMENT (stmt_info);
4598 if (nested_in_vect_loop_p (loop, stmt))
4600 outer_loop = loop;
4601 loop = loop->inner;
4602 nested_cycle = true;
4605 /* 1. Is vectorizable reduction? */
4606 /* Not supportable if the reduction variable is used in the loop, unless
4607 it's a reduction chain. */
4608 if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer
4609 && !GROUP_FIRST_ELEMENT (stmt_info))
4610 return false;
4612 /* Reductions that are not used even in an enclosing outer-loop,
4613 are expected to be "live" (used out of the loop). */
4614 if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope
4615 && !STMT_VINFO_LIVE_P (stmt_info))
4616 return false;
4618 /* Make sure it was already recognized as a reduction computation. */
4619 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def
4620 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_nested_cycle)
4621 return false;
4623 /* 2. Has this been recognized as a reduction pattern?
4625 Check if STMT represents a pattern that has been recognized
4626 in earlier analysis stages. For stmts that represent a pattern,
4627 the STMT_VINFO_RELATED_STMT field records the last stmt in
4628 the original sequence that constitutes the pattern. */
4630 orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
4631 if (orig_stmt)
4633 orig_stmt_info = vinfo_for_stmt (orig_stmt);
4634 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
4635 gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
4638 /* 3. Check the operands of the operation. The first operands are defined
4639 inside the loop body. The last operand is the reduction variable,
4640 which is defined by the loop-header-phi. */
4642 gcc_assert (is_gimple_assign (stmt));
4644 /* Flatten RHS. */
4645 switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
4647 case GIMPLE_SINGLE_RHS:
4648 op_type = TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt));
4649 if (op_type == ternary_op)
4651 tree rhs = gimple_assign_rhs1 (stmt);
4652 ops[0] = TREE_OPERAND (rhs, 0);
4653 ops[1] = TREE_OPERAND (rhs, 1);
4654 ops[2] = TREE_OPERAND (rhs, 2);
4655 code = TREE_CODE (rhs);
4657 else
4658 return false;
4659 break;
4661 case GIMPLE_BINARY_RHS:
4662 code = gimple_assign_rhs_code (stmt);
4663 op_type = TREE_CODE_LENGTH (code);
4664 gcc_assert (op_type == binary_op);
4665 ops[0] = gimple_assign_rhs1 (stmt);
4666 ops[1] = gimple_assign_rhs2 (stmt);
4667 break;
4669 case GIMPLE_TERNARY_RHS:
4670 code = gimple_assign_rhs_code (stmt);
4671 op_type = TREE_CODE_LENGTH (code);
4672 gcc_assert (op_type == ternary_op);
4673 ops[0] = gimple_assign_rhs1 (stmt);
4674 ops[1] = gimple_assign_rhs2 (stmt);
4675 ops[2] = gimple_assign_rhs3 (stmt);
4676 break;
4678 case GIMPLE_UNARY_RHS:
4679 return false;
4681 default:
4682 gcc_unreachable ();
4685 if (code == COND_EXPR && slp_node)
4686 return false;
4688 scalar_dest = gimple_assign_lhs (stmt);
4689 scalar_type = TREE_TYPE (scalar_dest);
4690 if (!POINTER_TYPE_P (scalar_type) && !INTEGRAL_TYPE_P (scalar_type)
4691 && !SCALAR_FLOAT_TYPE_P (scalar_type))
4692 return false;
4694 /* Do not try to vectorize bit-precision reductions. */
4695 if ((TYPE_PRECISION (scalar_type)
4696 != GET_MODE_PRECISION (TYPE_MODE (scalar_type))))
4697 return false;
4699 /* All uses but the last are expected to be defined in the loop.
4700 The last use is the reduction variable. In case of nested cycle this
4701 assumption is not true: we use reduc_index to record the index of the
4702 reduction variable. */
4703 for (i = 0; i < op_type - 1; i++)
4705 /* The condition of COND_EXPR is checked in vectorizable_condition(). */
4706 if (i == 0 && code == COND_EXPR)
4707 continue;
4709 is_simple_use = vect_is_simple_use_1 (ops[i], stmt, loop_vinfo, NULL,
4710 &def_stmt, &def, &dt, &tem);
4711 if (!vectype_in)
4712 vectype_in = tem;
4713 gcc_assert (is_simple_use);
4715 if (dt != vect_internal_def
4716 && dt != vect_external_def
4717 && dt != vect_constant_def
4718 && dt != vect_induction_def
4719 && !(dt == vect_nested_cycle && nested_cycle))
4720 return false;
4722 if (dt == vect_nested_cycle)
4724 found_nested_cycle_def = true;
4725 reduc_def_stmt = def_stmt;
4726 reduc_index = i;
4730 is_simple_use = vect_is_simple_use_1 (ops[i], stmt, loop_vinfo, NULL,
4731 &def_stmt, &def, &dt, &tem);
4732 if (!vectype_in)
4733 vectype_in = tem;
4734 gcc_assert (is_simple_use);
4735 if (!(dt == vect_reduction_def
4736 || dt == vect_nested_cycle
4737 || ((dt == vect_internal_def || dt == vect_external_def
4738 || dt == vect_constant_def || dt == vect_induction_def)
4739 && nested_cycle && found_nested_cycle_def)))
4741 /* For pattern recognized stmts, orig_stmt might be a reduction,
4742 but some helper statements for the pattern might not, or
4743 might be COND_EXPRs with reduction uses in the condition. */
4744 gcc_assert (orig_stmt);
4745 return false;
4747 if (!found_nested_cycle_def)
4748 reduc_def_stmt = def_stmt;
4750 gcc_assert (gimple_code (reduc_def_stmt) == GIMPLE_PHI);
4751 if (orig_stmt)
4752 gcc_assert (orig_stmt == vect_is_simple_reduction (loop_vinfo,
4753 reduc_def_stmt,
4754 !nested_cycle,
4755 &dummy));
4756 else
4758 gimple tmp = vect_is_simple_reduction (loop_vinfo, reduc_def_stmt,
4759 !nested_cycle, &dummy);
4760 /* We changed STMT to be the first stmt in reduction chain, hence we
4761 check that in this case the first element in the chain is STMT. */
4762 gcc_assert (stmt == tmp
4763 || GROUP_FIRST_ELEMENT (vinfo_for_stmt (tmp)) == stmt);
4766 if (STMT_VINFO_LIVE_P (vinfo_for_stmt (reduc_def_stmt)))
4767 return false;
4769 if (slp_node || PURE_SLP_STMT (stmt_info))
4770 ncopies = 1;
4771 else
4772 ncopies = (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
4773 / TYPE_VECTOR_SUBPARTS (vectype_in));
4775 gcc_assert (ncopies >= 1);
4777 vec_mode = TYPE_MODE (vectype_in);
4779 if (code == COND_EXPR)
4781 if (!vectorizable_condition (stmt, gsi, NULL, ops[reduc_index], 0, NULL))
4783 if (dump_enabled_p ())
4784 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4785 "unsupported condition in reduction");
4787 return false;
4790 else
4792 /* 4. Supportable by target? */
4794 if (code == LSHIFT_EXPR || code == RSHIFT_EXPR
4795 || code == LROTATE_EXPR || code == RROTATE_EXPR)
4797 /* Shifts and rotates are only supported by vectorizable_shifts,
4798 not vectorizable_reduction. */
4799 if (dump_enabled_p ())
4800 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4801 "unsupported shift or rotation.");
4802 return false;
4805 /* 4.1. check support for the operation in the loop */
4806 optab = optab_for_tree_code (code, vectype_in, optab_default);
4807 if (!optab)
4809 if (dump_enabled_p ())
4810 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4811 "no optab.");
4813 return false;
4816 if (optab_handler (optab, vec_mode) == CODE_FOR_nothing)
4818 if (dump_enabled_p ())
4819 dump_printf (MSG_NOTE, "op not supported by target.");
4821 if (GET_MODE_SIZE (vec_mode) != UNITS_PER_WORD
4822 || LOOP_VINFO_VECT_FACTOR (loop_vinfo)
4823 < vect_min_worthwhile_factor (code))
4824 return false;
4826 if (dump_enabled_p ())
4827 dump_printf (MSG_NOTE, "proceeding using word mode.");
4830 /* Worthwhile without SIMD support? */
4831 if (!VECTOR_MODE_P (TYPE_MODE (vectype_in))
4832 && LOOP_VINFO_VECT_FACTOR (loop_vinfo)
4833 < vect_min_worthwhile_factor (code))
4835 if (dump_enabled_p ())
4836 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4837 "not worthwhile without SIMD support.");
4839 return false;
4843 /* 4.2. Check support for the epilog operation.
4845 If STMT represents a reduction pattern, then the type of the
4846 reduction variable may be different than the type of the rest
4847 of the arguments. For example, consider the case of accumulation
4848 of shorts into an int accumulator; The original code:
4849 S1: int_a = (int) short_a;
4850 orig_stmt-> S2: int_acc = plus <int_a ,int_acc>;
4852 was replaced with:
4853 STMT: int_acc = widen_sum <short_a, int_acc>
4855 This means that:
4856 1. The tree-code that is used to create the vector operation in the
4857 epilog code (that reduces the partial results) is not the
4858 tree-code of STMT, but is rather the tree-code of the original
4859 stmt from the pattern that STMT is replacing. I.e, in the example
4860 above we want to use 'widen_sum' in the loop, but 'plus' in the
4861 epilog.
4862 2. The type (mode) we use to check available target support
4863 for the vector operation to be created in the *epilog*, is
4864 determined by the type of the reduction variable (in the example
4865 above we'd check this: optab_handler (plus_optab, vect_int_mode])).
4866 However the type (mode) we use to check available target support
4867 for the vector operation to be created *inside the loop*, is
4868 determined by the type of the other arguments to STMT (in the
4869 example we'd check this: optab_handler (widen_sum_optab,
4870 vect_short_mode)).
4872 This is contrary to "regular" reductions, in which the types of all
4873 the arguments are the same as the type of the reduction variable.
4874 For "regular" reductions we can therefore use the same vector type
4875 (and also the same tree-code) when generating the epilog code and
4876 when generating the code inside the loop. */
4878 if (orig_stmt)
4880 /* This is a reduction pattern: get the vectype from the type of the
4881 reduction variable, and get the tree-code from orig_stmt. */
4882 orig_code = gimple_assign_rhs_code (orig_stmt);
4883 gcc_assert (vectype_out);
4884 vec_mode = TYPE_MODE (vectype_out);
4886 else
4888 /* Regular reduction: the same vectype and tree-code as used for
4889 the vector code inside the loop can be used for the epilog code. */
4890 orig_code = code;
4893 if (nested_cycle)
4895 def_bb = gimple_bb (reduc_def_stmt);
4896 def_stmt_loop = def_bb->loop_father;
4897 def_arg = PHI_ARG_DEF_FROM_EDGE (reduc_def_stmt,
4898 loop_preheader_edge (def_stmt_loop));
4899 if (TREE_CODE (def_arg) == SSA_NAME
4900 && (def_arg_stmt = SSA_NAME_DEF_STMT (def_arg))
4901 && gimple_code (def_arg_stmt) == GIMPLE_PHI
4902 && flow_bb_inside_loop_p (outer_loop, gimple_bb (def_arg_stmt))
4903 && vinfo_for_stmt (def_arg_stmt)
4904 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_arg_stmt))
4905 == vect_double_reduction_def)
4906 double_reduc = true;
4909 epilog_reduc_code = ERROR_MARK;
4910 if (reduction_code_for_scalar_code (orig_code, &epilog_reduc_code))
4912 reduc_optab = optab_for_tree_code (epilog_reduc_code, vectype_out,
4913 optab_default);
4914 if (!reduc_optab)
4916 if (dump_enabled_p ())
4917 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4918 "no optab for reduction.");
4920 epilog_reduc_code = ERROR_MARK;
4923 if (reduc_optab
4924 && optab_handler (reduc_optab, vec_mode) == CODE_FOR_nothing)
4926 if (dump_enabled_p ())
4927 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4928 "reduc op not supported by target.");
4930 epilog_reduc_code = ERROR_MARK;
4933 else
4935 if (!nested_cycle || double_reduc)
4937 if (dump_enabled_p ())
4938 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4939 "no reduc code for scalar code.");
4941 return false;
4945 if (double_reduc && ncopies > 1)
4947 if (dump_enabled_p ())
4948 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4949 "multiple types in double reduction");
4951 return false;
4954 /* In case of widening multiplication by a constant, we update the type
4955 of the constant to be the type of the other operand. We check that the
4956 constant fits the type in the pattern recognition pass. */
4957 if (code == DOT_PROD_EXPR
4958 && !types_compatible_p (TREE_TYPE (ops[0]), TREE_TYPE (ops[1])))
4960 if (TREE_CODE (ops[0]) == INTEGER_CST)
4961 ops[0] = fold_convert (TREE_TYPE (ops[1]), ops[0]);
4962 else if (TREE_CODE (ops[1]) == INTEGER_CST)
4963 ops[1] = fold_convert (TREE_TYPE (ops[0]), ops[1]);
4964 else
4966 if (dump_enabled_p ())
4967 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4968 "invalid types in dot-prod");
4970 return false;
4974 if (!vec_stmt) /* transformation not required. */
4976 if (!vect_model_reduction_cost (stmt_info, epilog_reduc_code, ncopies))
4977 return false;
4978 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
4979 return true;
4982 /** Transform. **/
4984 if (dump_enabled_p ())
4985 dump_printf_loc (MSG_NOTE, vect_location, "transform reduction.");
4987 /* FORNOW: Multiple types are not supported for condition. */
4988 if (code == COND_EXPR)
4989 gcc_assert (ncopies == 1);
4991 /* Create the destination vector */
4992 vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
4994 /* In case the vectorization factor (VF) is bigger than the number
4995 of elements that we can fit in a vectype (nunits), we have to generate
4996 more than one vector stmt - i.e - we need to "unroll" the
4997 vector stmt by a factor VF/nunits. For more details see documentation
4998 in vectorizable_operation. */
5000 /* If the reduction is used in an outer loop we need to generate
5001 VF intermediate results, like so (e.g. for ncopies=2):
5002 r0 = phi (init, r0)
5003 r1 = phi (init, r1)
5004 r0 = x0 + r0;
5005 r1 = x1 + r1;
5006 (i.e. we generate VF results in 2 registers).
5007 In this case we have a separate def-use cycle for each copy, and therefore
5008 for each copy we get the vector def for the reduction variable from the
5009 respective phi node created for this copy.
5011 Otherwise (the reduction is unused in the loop nest), we can combine
5012 together intermediate results, like so (e.g. for ncopies=2):
5013 r = phi (init, r)
5014 r = x0 + r;
5015 r = x1 + r;
5016 (i.e. we generate VF/2 results in a single register).
5017 In this case for each copy we get the vector def for the reduction variable
5018 from the vectorized reduction operation generated in the previous iteration.
5021 if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope)
5023 single_defuse_cycle = true;
5024 epilog_copies = 1;
5026 else
5027 epilog_copies = ncopies;
5029 prev_stmt_info = NULL;
5030 prev_phi_info = NULL;
5031 if (slp_node)
5033 vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
5034 gcc_assert (TYPE_VECTOR_SUBPARTS (vectype_out)
5035 == TYPE_VECTOR_SUBPARTS (vectype_in));
5037 else
5039 vec_num = 1;
5040 vec_oprnds0.create (1);
5041 if (op_type == ternary_op)
5042 vec_oprnds1.create (1);
5045 phis.create (vec_num);
5046 vect_defs.create (vec_num);
5047 if (!slp_node)
5048 vect_defs.quick_push (NULL_TREE);
5050 for (j = 0; j < ncopies; j++)
5052 if (j == 0 || !single_defuse_cycle)
5054 for (i = 0; i < vec_num; i++)
5056 /* Create the reduction-phi that defines the reduction
5057 operand. */
5058 new_phi = create_phi_node (vec_dest, loop->header);
5059 set_vinfo_for_stmt (new_phi,
5060 new_stmt_vec_info (new_phi, loop_vinfo,
5061 NULL));
5062 if (j == 0 || slp_node)
5063 phis.quick_push (new_phi);
5067 if (code == COND_EXPR)
5069 gcc_assert (!slp_node);
5070 vectorizable_condition (stmt, gsi, vec_stmt,
5071 PHI_RESULT (phis[0]),
5072 reduc_index, NULL);
5073 /* Multiple types are not supported for condition. */
5074 break;
5077 /* Handle uses. */
5078 if (j == 0)
5080 op0 = ops[!reduc_index];
5081 if (op_type == ternary_op)
5083 if (reduc_index == 0)
5084 op1 = ops[2];
5085 else
5086 op1 = ops[1];
5089 if (slp_node)
5090 vect_get_vec_defs (op0, op1, stmt, &vec_oprnds0, &vec_oprnds1,
5091 slp_node, -1);
5092 else
5094 loop_vec_def0 = vect_get_vec_def_for_operand (ops[!reduc_index],
5095 stmt, NULL);
5096 vec_oprnds0.quick_push (loop_vec_def0);
5097 if (op_type == ternary_op)
5099 loop_vec_def1 = vect_get_vec_def_for_operand (op1, stmt,
5100 NULL);
5101 vec_oprnds1.quick_push (loop_vec_def1);
5105 else
5107 if (!slp_node)
5109 enum vect_def_type dt;
5110 gimple dummy_stmt;
5111 tree dummy;
5113 vect_is_simple_use (ops[!reduc_index], stmt, loop_vinfo, NULL,
5114 &dummy_stmt, &dummy, &dt);
5115 loop_vec_def0 = vect_get_vec_def_for_stmt_copy (dt,
5116 loop_vec_def0);
5117 vec_oprnds0[0] = loop_vec_def0;
5118 if (op_type == ternary_op)
5120 vect_is_simple_use (op1, stmt, loop_vinfo, NULL, &dummy_stmt,
5121 &dummy, &dt);
5122 loop_vec_def1 = vect_get_vec_def_for_stmt_copy (dt,
5123 loop_vec_def1);
5124 vec_oprnds1[0] = loop_vec_def1;
5128 if (single_defuse_cycle)
5129 reduc_def = gimple_assign_lhs (new_stmt);
5131 STMT_VINFO_RELATED_STMT (prev_phi_info) = new_phi;
5134 FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
5136 if (slp_node)
5137 reduc_def = PHI_RESULT (phis[i]);
5138 else
5140 if (!single_defuse_cycle || j == 0)
5141 reduc_def = PHI_RESULT (new_phi);
5144 def1 = ((op_type == ternary_op)
5145 ? vec_oprnds1[i] : NULL);
5146 if (op_type == binary_op)
5148 if (reduc_index == 0)
5149 expr = build2 (code, vectype_out, reduc_def, def0);
5150 else
5151 expr = build2 (code, vectype_out, def0, reduc_def);
5153 else
5155 if (reduc_index == 0)
5156 expr = build3 (code, vectype_out, reduc_def, def0, def1);
5157 else
5159 if (reduc_index == 1)
5160 expr = build3 (code, vectype_out, def0, reduc_def, def1);
5161 else
5162 expr = build3 (code, vectype_out, def0, def1, reduc_def);
5166 new_stmt = gimple_build_assign (vec_dest, expr);
5167 new_temp = make_ssa_name (vec_dest, new_stmt);
5168 gimple_assign_set_lhs (new_stmt, new_temp);
5169 vect_finish_stmt_generation (stmt, new_stmt, gsi);
5171 if (slp_node)
5173 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
5174 vect_defs.quick_push (new_temp);
5176 else
5177 vect_defs[0] = new_temp;
5180 if (slp_node)
5181 continue;
5183 if (j == 0)
5184 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
5185 else
5186 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
5188 prev_stmt_info = vinfo_for_stmt (new_stmt);
5189 prev_phi_info = vinfo_for_stmt (new_phi);
5192 /* Finalize the reduction-phi (set its arguments) and create the
5193 epilog reduction code. */
5194 if ((!single_defuse_cycle || code == COND_EXPR) && !slp_node)
5196 new_temp = gimple_assign_lhs (*vec_stmt);
5197 vect_defs[0] = new_temp;
5200 vect_create_epilog_for_reduction (vect_defs, stmt, epilog_copies,
5201 epilog_reduc_code, phis, reduc_index,
5202 double_reduc, slp_node);
5204 phis.release ();
5205 vect_defs.release ();
5206 vec_oprnds0.release ();
5207 vec_oprnds1.release ();
5209 return true;
5212 /* Function vect_min_worthwhile_factor.
5214 For a loop where we could vectorize the operation indicated by CODE,
5215 return the minimum vectorization factor that makes it worthwhile
5216 to use generic vectors. */
5218 vect_min_worthwhile_factor (enum tree_code code)
5220 switch (code)
5222 case PLUS_EXPR:
5223 case MINUS_EXPR:
5224 case NEGATE_EXPR:
5225 return 4;
5227 case BIT_AND_EXPR:
5228 case BIT_IOR_EXPR:
5229 case BIT_XOR_EXPR:
5230 case BIT_NOT_EXPR:
5231 return 2;
5233 default:
5234 return INT_MAX;
5239 /* Function vectorizable_induction
5241 Check if PHI performs an induction computation that can be vectorized.
5242 If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
5243 phi to replace it, put it in VEC_STMT, and add it to the same basic block.
5244 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
5246 bool
5247 vectorizable_induction (gimple phi, gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
5248 gimple *vec_stmt)
5250 stmt_vec_info stmt_info = vinfo_for_stmt (phi);
5251 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5252 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5253 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5254 int nunits = TYPE_VECTOR_SUBPARTS (vectype);
5255 int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
5256 tree vec_def;
5258 gcc_assert (ncopies >= 1);
5259 /* FORNOW. These restrictions should be relaxed. */
5260 if (nested_in_vect_loop_p (loop, phi))
5262 imm_use_iterator imm_iter;
5263 use_operand_p use_p;
5264 gimple exit_phi;
5265 edge latch_e;
5266 tree loop_arg;
5268 if (ncopies > 1)
5270 if (dump_enabled_p ())
5271 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5272 "multiple types in nested loop.");
5273 return false;
5276 exit_phi = NULL;
5277 latch_e = loop_latch_edge (loop->inner);
5278 loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
5279 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
5281 if (!flow_bb_inside_loop_p (loop->inner,
5282 gimple_bb (USE_STMT (use_p))))
5284 exit_phi = USE_STMT (use_p);
5285 break;
5288 if (exit_phi)
5290 stmt_vec_info exit_phi_vinfo = vinfo_for_stmt (exit_phi);
5291 if (!(STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
5292 && !STMT_VINFO_LIVE_P (exit_phi_vinfo)))
5294 if (dump_enabled_p ())
5295 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5296 "inner-loop induction only used outside "
5297 "of the outer vectorized loop.");
5298 return false;
5303 if (!STMT_VINFO_RELEVANT_P (stmt_info))
5304 return false;
5306 /* FORNOW: SLP not supported. */
5307 if (STMT_SLP_TYPE (stmt_info))
5308 return false;
5310 gcc_assert (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def);
5312 if (gimple_code (phi) != GIMPLE_PHI)
5313 return false;
5315 if (!vec_stmt) /* transformation not required. */
5317 STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
5318 if (dump_enabled_p ())
5319 dump_printf_loc (MSG_NOTE, vect_location,
5320 "=== vectorizable_induction ===");
5321 vect_model_induction_cost (stmt_info, ncopies);
5322 return true;
5325 /** Transform. **/
5327 if (dump_enabled_p ())
5328 dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.");
5330 vec_def = get_initial_def_for_induction (phi);
5331 *vec_stmt = SSA_NAME_DEF_STMT (vec_def);
5332 return true;
5335 /* Function vectorizable_live_operation.
5337 STMT computes a value that is used outside the loop. Check if
5338 it can be supported. */
5340 bool
5341 vectorizable_live_operation (gimple stmt,
5342 gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
5343 gimple *vec_stmt ATTRIBUTE_UNUSED)
5345 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5346 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5347 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5348 int i;
5349 int op_type;
5350 tree op;
5351 tree def;
5352 gimple def_stmt;
5353 enum vect_def_type dt;
5354 enum tree_code code;
5355 enum gimple_rhs_class rhs_class;
5357 gcc_assert (STMT_VINFO_LIVE_P (stmt_info));
5359 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def)
5360 return false;
5362 if (!is_gimple_assign (stmt))
5363 return false;
5365 if (TREE_CODE (gimple_assign_lhs (stmt)) != SSA_NAME)
5366 return false;
5368 /* FORNOW. CHECKME. */
5369 if (nested_in_vect_loop_p (loop, stmt))
5370 return false;
5372 code = gimple_assign_rhs_code (stmt);
5373 op_type = TREE_CODE_LENGTH (code);
5374 rhs_class = get_gimple_rhs_class (code);
5375 gcc_assert (rhs_class != GIMPLE_UNARY_RHS || op_type == unary_op);
5376 gcc_assert (rhs_class != GIMPLE_BINARY_RHS || op_type == binary_op);
5378 /* FORNOW: support only if all uses are invariant. This means
5379 that the scalar operations can remain in place, unvectorized.
5380 The original last scalar value that they compute will be used. */
5382 for (i = 0; i < op_type; i++)
5384 if (rhs_class == GIMPLE_SINGLE_RHS)
5385 op = TREE_OPERAND (gimple_op (stmt, 1), i);
5386 else
5387 op = gimple_op (stmt, i + 1);
5388 if (op
5389 && !vect_is_simple_use (op, stmt, loop_vinfo, NULL, &def_stmt, &def,
5390 &dt))
5392 if (dump_enabled_p ())
5393 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5394 "use not simple.");
5395 return false;
5398 if (dt != vect_external_def && dt != vect_constant_def)
5399 return false;
5402 /* No transformation is required for the cases we currently support. */
5403 return true;
5406 /* Kill any debug uses outside LOOP of SSA names defined in STMT. */
5408 static void
5409 vect_loop_kill_debug_uses (struct loop *loop, gimple stmt)
5411 ssa_op_iter op_iter;
5412 imm_use_iterator imm_iter;
5413 def_operand_p def_p;
5414 gimple ustmt;
5416 FOR_EACH_PHI_OR_STMT_DEF (def_p, stmt, op_iter, SSA_OP_DEF)
5418 FOR_EACH_IMM_USE_STMT (ustmt, imm_iter, DEF_FROM_PTR (def_p))
5420 basic_block bb;
5422 if (!is_gimple_debug (ustmt))
5423 continue;
5425 bb = gimple_bb (ustmt);
5427 if (!flow_bb_inside_loop_p (loop, bb))
5429 if (gimple_debug_bind_p (ustmt))
5431 if (dump_enabled_p ())
5432 dump_printf_loc (MSG_NOTE, vect_location,
5433 "killing debug use");
5435 gimple_debug_bind_reset_value (ustmt);
5436 update_stmt (ustmt);
5438 else
5439 gcc_unreachable ();
5445 /* Function vect_transform_loop.
5447 The analysis phase has determined that the loop is vectorizable.
5448 Vectorize the loop - create vectorized stmts to replace the scalar
5449 stmts in the loop, and update the loop exit condition. */
5451 void
5452 vect_transform_loop (loop_vec_info loop_vinfo)
5454 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5455 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
5456 int nbbs = loop->num_nodes;
5457 gimple_stmt_iterator si;
5458 int i;
5459 tree ratio = NULL;
5460 int vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
5461 bool grouped_store;
5462 bool slp_scheduled = false;
5463 unsigned int nunits;
5464 gimple stmt, pattern_stmt;
5465 gimple_seq pattern_def_seq = NULL;
5466 gimple_stmt_iterator pattern_def_si = gsi_none ();
5467 bool transform_pattern_stmt = false;
5468 bool check_profitability = false;
5469 int th;
5470 /* Record number of iterations before we started tampering with the profile. */
5471 gcov_type expected_iterations = expected_loop_iterations_unbounded (loop);
5473 if (dump_enabled_p ())
5474 dump_printf_loc (MSG_NOTE, vect_location, "=== vec_transform_loop ===");
5476 /* If the profile is imprecise, we have a chance to fix it up. */
5477 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
5478 expected_iterations = LOOP_VINFO_INT_NITERS (loop_vinfo);
5480 /* Use the more conservative vectorization threshold. If the number
5481 of iterations is constant assume the cost check has been performed
5482 by our caller. If the threshold makes all loops profitable that
5483 run at least the vectorization factor number of times, checking
5484 is pointless, too. */
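/* Worked example (made-up numbers): with PARAM_MIN_VECT_LOOP_BOUND == 1,
   a vectorization factor of 4 and a cost-model minimum of 7 iterations,
   TH = MAX (1 * 4 - 1, 7) = 7; since 7 >= 4 - 1 and the iteration count
   is not known at compile time, a runtime profitability check against 7
   iterations is emitted.  */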
5485 th = ((PARAM_VALUE (PARAM_MIN_VECT_LOOP_BOUND)
5486 * LOOP_VINFO_VECT_FACTOR (loop_vinfo)) - 1);
5487 th = MAX (th, LOOP_VINFO_COST_MODEL_MIN_ITERS (loop_vinfo));
5488 if (th >= LOOP_VINFO_VECT_FACTOR (loop_vinfo) - 1
5489 && !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
5491 if (dump_enabled_p ())
5492 dump_printf_loc (MSG_NOTE, vect_location,
5493 "Profitability threshold is %d loop iterations.", th);
5494 check_profitability = true;
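/* Worked example with hypothetical parameter values: with
   PARAM_MIN_VECT_LOOP_BOUND == 0 and a vectorization factor of 4 the first
   formula gives th = 0 * 4 - 1 = -1; if the cost model's minimum iteration
   count is 10, th becomes 10.  Since 10 >= 4 - 1 and the iteration count
   is not known at compile time, a runtime profitability check guarded by
   this threshold is emitted.  */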
5497 /* Peel the loop if there are data refs with unknown alignment.
5498 Only one store data ref with unknown alignment is allowed. */
5500 if (LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo))
5502 vect_do_peeling_for_alignment (loop_vinfo, th, check_profitability);
5503 check_profitability = false;
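/* Illustrative sketch (assuming a 16-byte vector size and a hypothetical
   array A with statically unknown alignment): vect_do_peeling_for_alignment
   adds a scalar prologue loop that runs for the first few iterations until
   the peeled data reference becomes 16-byte aligned, and only then does the
   vector loop start.  If a runtime profitability check is needed it is
   folded into that prologue, which is why CHECK_PROFITABILITY is cleared
   here.  */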
5506 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
5507 || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
5509 vect_loop_versioning (loop_vinfo, th, check_profitability);
5510 check_profitability = false;
5513 /* If the loop has a symbolic number of iterations 'n' (i.e. it's not a
5514 compile time constant), or it is a constant that is not a multiple of the
5515 vectorization factor, then an epilog loop needs to be created.
5516 We therefore duplicate the loop: the original loop will be vectorized,
5517 and will compute the first (n/VF) iterations. The second copy of the loop
5518 will remain scalar and will compute the remaining (n%VF) iterations.
5519 (VF is the vectorization factor). */
5521 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
5522 || (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
5523 && LOOP_VINFO_INT_NITERS (loop_vinfo) % vectorization_factor != 0)
5524 || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
5525 vect_do_peeling_for_loop_bound (loop_vinfo, &ratio,
5526 th, check_profitability);
5527 else
5528 ratio = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
5529 LOOP_VINFO_INT_NITERS (loop_vinfo) / vectorization_factor);
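/* Illustrative sketch (assuming VF == 4; hypothetical source): for

       for (i = 0; i < n; i++)
         a[i] = b[i] + c[i];

   with n either unknown at compile time or not a multiple of 4, the
   transformation conceptually produces

       for (i = 0; i + 4 <= n; i += 4)
         va = vb + vc;               vectorized body, 4 elements per iteration
       for (; i < n; i++)
         a[i] = b[i] + c[i];         scalar epilog, the remaining n % 4 iterations

   RATIO corresponds to n / 4, either as the constant built above or as
   computed by vect_do_peeling_for_loop_bound.  */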
5531 /* 1) Make sure the loop header has exactly two entries
5532 2) Make sure we have a preheader basic block. */
5534 gcc_assert (EDGE_COUNT (loop->header->preds) == 2);
5536 split_edge (loop_preheader_edge (loop));
5538 /* FORNOW: the vectorizer supports only loops whose body consists
5539 of one basic block (header + empty latch). When the vectorizer
5540 supports more involved loop forms, the order in which the BBs are
5541 traversed needs to be reconsidered. */
5543 for (i = 0; i < nbbs; i++)
5545 basic_block bb = bbs[i];
5546 stmt_vec_info stmt_info;
5547 gimple phi;
5549 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
5551 phi = gsi_stmt (si);
5552 if (dump_enabled_p ())
5554 dump_printf_loc (MSG_NOTE, vect_location,
5555 "------>vectorizing phi: ");
5556 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
5558 stmt_info = vinfo_for_stmt (phi);
5559 if (!stmt_info)
5560 continue;
5562 if (MAY_HAVE_DEBUG_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
5563 vect_loop_kill_debug_uses (loop, phi);
5565 if (!STMT_VINFO_RELEVANT_P (stmt_info)
5566 && !STMT_VINFO_LIVE_P (stmt_info))
5567 continue;
5569 if ((TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info))
5570 != (unsigned HOST_WIDE_INT) vectorization_factor)
5571 && dump_enabled_p ())
5572 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.");
5574 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def)
5576 if (dump_enabled_p ())
5577 dump_printf_loc (MSG_NOTE, vect_location, "transform phi.");
5578 vect_transform_stmt (phi, NULL, NULL, NULL, NULL);
5582 pattern_stmt = NULL;
5583 for (si = gsi_start_bb (bb); !gsi_end_p (si) || transform_pattern_stmt;)
5585 bool is_store;
5587 if (transform_pattern_stmt)
5588 stmt = pattern_stmt;
5589 else
5590 stmt = gsi_stmt (si);
5592 if (dump_enabled_p ())
5594 dump_printf_loc (MSG_NOTE, vect_location,
5595 "------>vectorizing statement: ");
5596 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
5599 stmt_info = vinfo_for_stmt (stmt);
5601 /* vector stmts created in the outer-loop during vectorization of
5602 stmts in an inner-loop may not have a stmt_info, and do not
5603 need to be vectorized. */
5604 if (!stmt_info)
5606 gsi_next (&si);
5607 continue;
5610 if (MAY_HAVE_DEBUG_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
5611 vect_loop_kill_debug_uses (loop, stmt);
5613 if (!STMT_VINFO_RELEVANT_P (stmt_info)
5614 && !STMT_VINFO_LIVE_P (stmt_info))
5616 if (STMT_VINFO_IN_PATTERN_P (stmt_info)
5617 && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
5618 && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
5619 || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
5621 stmt = pattern_stmt;
5622 stmt_info = vinfo_for_stmt (stmt);
5624 else
5626 gsi_next (&si);
5627 continue;
5630 else if (STMT_VINFO_IN_PATTERN_P (stmt_info)
5631 && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
5632 && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
5633 || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
5634 transform_pattern_stmt = true;
5636 /* If the pattern statement has def stmts, vectorize them too. */
5637 if (is_pattern_stmt_p (stmt_info))
5639 if (pattern_def_seq == NULL)
5641 pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
5642 pattern_def_si = gsi_start (pattern_def_seq);
5644 else if (!gsi_end_p (pattern_def_si))
5645 gsi_next (&pattern_def_si);
5646 if (pattern_def_seq != NULL)
5648 gimple pattern_def_stmt = NULL;
5649 stmt_vec_info pattern_def_stmt_info = NULL;
5651 while (!gsi_end_p (pattern_def_si))
5653 pattern_def_stmt = gsi_stmt (pattern_def_si);
5654 pattern_def_stmt_info
5655 = vinfo_for_stmt (pattern_def_stmt);
5656 if (STMT_VINFO_RELEVANT_P (pattern_def_stmt_info)
5657 || STMT_VINFO_LIVE_P (pattern_def_stmt_info))
5658 break;
5659 gsi_next (&pattern_def_si);
5662 if (!gsi_end_p (pattern_def_si))
5664 if (dump_enabled_p ())
5666 dump_printf_loc (MSG_NOTE, vect_location,
5667 "==> vectorizing pattern def "
5668 "stmt: ");
5669 dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
5670 pattern_def_stmt, 0);
5673 stmt = pattern_def_stmt;
5674 stmt_info = pattern_def_stmt_info;
5676 else
5678 pattern_def_si = gsi_none ();
5679 transform_pattern_stmt = false;
5682 else
5683 transform_pattern_stmt = false;
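/* Illustrative example (hypothetical source; the helper statements that are
   generated depend on the recognized pattern): for

       short *a;  int *b;
       b[i] = a[i] * 7;

   pattern recognition may replace the scalar multiply with a widening
   multiply pattern statement, and any auxiliary statements that pattern
   needs (a type conversion of an operand, for instance) are recorded in
   STMT_VINFO_PATTERN_DEF_SEQ.  The walk above visits that sequence and
   vectorizes each relevant helper statement before the main pattern
   statement itself is transformed.  */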
5686 gcc_assert (STMT_VINFO_VECTYPE (stmt_info));
5687 nunits = (unsigned int) TYPE_VECTOR_SUBPARTS (
5688 STMT_VINFO_VECTYPE (stmt_info));
5689 if (!STMT_SLP_TYPE (stmt_info)
5690 && nunits != (unsigned int) vectorization_factor
5691 && dump_enabled_p ())
5692 /* For SLP, VF is set according to the unrolling factor, and not to the
5693 vector size, hence for SLP this dump message is not valid. */
5694 dump_printf_loc (MSG_NOTE, vect_location,
5695 "multiple-types.");
5697 /* SLP. Schedule all the SLP instances when the first SLP stmt is
5698 reached. */
5699 if (STMT_SLP_TYPE (stmt_info))
5701 if (!slp_scheduled)
5703 slp_scheduled = true;
5705 if (dump_enabled_p ())
5706 dump_printf_loc (MSG_NOTE, vect_location,
5707 "=== scheduling SLP instances ===");
5709 vect_schedule_slp (loop_vinfo, NULL);
5712 /* Hybrid SLP stmts must be vectorized in addition to SLP. */
5713 if (!vinfo_for_stmt (stmt) || PURE_SLP_STMT (stmt_info))
5715 if (!transform_pattern_stmt && gsi_end_p (pattern_def_si))
5717 pattern_def_seq = NULL;
5718 gsi_next (&si);
5720 continue;
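/* Illustrative SLP example (hypothetical source, group size 4): the four
   isomorphic scalar statements

       a[4*i]   = b[4*i]   + 1;
       a[4*i+1] = b[4*i+1] + 2;
       a[4*i+2] = b[4*i+2] + 3;
       a[4*i+3] = b[4*i+3] + 4;

   form one SLP instance; vect_schedule_slp replaces the whole group with
   single vector operations when the first statement of any SLP instance is
   reached, and hybrid SLP statements are additionally vectorized by the
   loop-based code below.  */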
5724 /* -------- vectorize statement ------------ */
5725 if (dump_enabled_p ())
5726 dump_printf_loc (MSG_NOTE, vect_location, "transform statement.");
5728 grouped_store = false;
5729 is_store = vect_transform_stmt (stmt, &si, &grouped_store, NULL, NULL);
5730 if (is_store)
5732 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
5734 /* Interleaving. If IS_STORE is TRUE, the vectorization of the
5735 interleaving chain has been completed - free all the stores in
5736 the chain. */
5737 gsi_next (&si);
5738 vect_remove_stores (GROUP_FIRST_ELEMENT (stmt_info));
5739 continue;
5741 else
5743 /* Free the attached stmt_vec_info and remove the stmt. */
5744 gimple store = gsi_stmt (si);
5745 free_stmt_vec_info (store);
5746 unlink_stmt_vdef (store);
5747 gsi_remove (&si, true);
5748 release_defs (store);
5749 continue;
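/* Illustrative interleaving example (hypothetical source, group of 2):

       a[2*i]   = x[i];
       a[2*i+1] = y[i];

   The two scalar stores form one interleaving chain.  When the last store
   of the chain is vectorized, the whole group is emitted as interleaved
   vector stores (using permutes), so vect_remove_stores above can drop
   every scalar store of the chain at once.  */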
5753 if (!transform_pattern_stmt && gsi_end_p (pattern_def_si))
5755 pattern_def_seq = NULL;
5756 gsi_next (&si);
5758 } /* stmts in BB */
5759 } /* BBs in loop */
5761 slpeel_make_loop_iterate_ntimes (loop, ratio);
5763 /* Reduce loop iterations by the vectorization factor. */
5764 scale_loop_profile (loop, GCOV_COMPUTE_SCALE (1, vectorization_factor),
5765 expected_iterations / vectorization_factor);
5766 loop->nb_iterations_upper_bound
5767 = loop->nb_iterations_upper_bound.udiv (double_int::from_uhwi (vectorization_factor),
5768 FLOOR_DIV_EXPR);
5769 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
5770 && loop->nb_iterations_upper_bound != double_int_zero)
5771 loop->nb_iterations_upper_bound = loop->nb_iterations_upper_bound - double_int_one;
5772 if (loop->any_estimate)
5774 loop->nb_iterations_estimate
5775 = loop->nb_iterations_estimate.udiv (double_int::from_uhwi (vectorization_factor),
5776 FLOOR_DIV_EXPR);
5777 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
5778 && loop->nb_iterations_estimate != double_int_zero)
5779 loop->nb_iterations_estimate = loop->nb_iterations_estimate - double_int_one;
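/* Worked example with hypothetical numbers (assuming VF == 4 and an
   expected 100 iterations before the transformation): the loop body's
   profile is scaled by 1/4 and its expected iteration count becomes 25;
   the recorded upper bound and estimate are likewise divided by 4,
   rounding down, and when PEELING_FOR_GAPS guarantees that the final
   iteration is handled by the scalar epilog they are decremented by one
   more.  */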
5782 if (dump_enabled_p ())
5783 dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, vect_location, "LOOP VECTORIZED.");
5784 if (loop->inner && dump_enabled_p ())
5785 dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, vect_location,
5786 "OUTER LOOP VECTORIZED.");