/* Loop Vectorization
   Copyright (C) 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010
   Free Software Foundation, Inc.
   Contributed by Dorit Naishlos <dorit@il.ibm.com> and
   Ira Rosen <irar@il.ibm.com>

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free
Software Foundation; either version 3, or (at your option) any later
version.

GCC is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received a copy of the GNU General Public License
along with GCC; see the file COPYING3.  If not see
<http://www.gnu.org/licenses/>.  */

#include "config.h"
#include "system.h"
#include "coretypes.h"
#include "tm.h"
#include "ggc.h"
#include "tree.h"
#include "basic-block.h"
#include "tree-pretty-print.h"
#include "gimple-pretty-print.h"
#include "tree-flow.h"
#include "tree-dump.h"
#include "cfgloop.h"
#include "cfglayout.h"
#include "expr.h"
#include "recog.h"
#include "optabs.h"
#include "params.h"
#include "diagnostic-core.h"
#include "tree-chrec.h"
#include "tree-scalar-evolution.h"
#include "tree-vectorizer.h"
#include "target.h"

/* Loop Vectorization Pass.

   This pass tries to vectorize loops.

   For example, the vectorizer transforms the following simple loop:

        short a[N]; short b[N]; short c[N]; int i;

        for (i=0; i<N; i++){
          a[i] = b[i] + c[i];
        }

   as if it had been manually vectorized by rewriting the source code into:

        typedef int __attribute__((mode(V8HI))) v8hi;
        short a[N]; short b[N]; short c[N]; int i;
        v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
        v8hi va, vb, vc;

        for (i=0; i<N/8; i++){
          vb = pb[i];
          vc = pc[i];
          va = vb + vc;
          pa[i] = va;
        }

   The main entry to this pass is vectorize_loops(), in which
   the vectorizer applies a set of analyses on a given set of loops,
   followed by the actual vectorization transformation for the loops that
   had successfully passed the analysis phase.

   Throughout this pass we make a distinction between two types of
   data: scalars (which are represented by SSA_NAMEs), and memory references
   ("data-refs").  These two types of data require different handling both
   during analysis and transformation.  The types of data-refs that the
   vectorizer currently supports are ARRAY_REFs whose base is an array DECL
   (not a pointer), and INDIRECT_REFs through pointers; both array and pointer
   accesses are required to have a simple (consecutive) access pattern.

   Analysis phase:
   ===============
   The driver for the analysis phase is vect_analyze_loop().
   It applies a set of analyses, some of which rely on the scalar evolution
   analyzer (scev) developed by Sebastian Pop.

   During the analysis phase the vectorizer records some information
   per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
   loop, as well as general information about the loop as a whole, which is
   recorded in a "loop_vec_info" struct attached to each loop.

   Transformation phase:
   =====================
   The loop transformation phase scans all the stmts in the loop, and
   creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
   the loop that needs to be vectorized.  It inserts the vector code sequence
   just before the scalar stmt S, and records a pointer to the vector code
   in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
   attached to S).  This pointer will be used for the vectorization of
   following stmts which use the def of stmt S.  Stmt S is removed if it
   writes to memory; otherwise, we rely on dead code elimination for
   removing it.

   For example, say stmt S1 was vectorized into stmt VS1:

   VS1: vb = px[i];
   S1:  b = x[i];    STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
   S2:  a = b;

   To vectorize stmt S2, the vectorizer first finds the stmt that defines
   the operand 'b' (S1), and gets the relevant vector def 'vb' from the
   vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)).  The
   resulting sequence would be:

   VS1: vb = px[i];
   S1:  b = x[i];    STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
   VS2: va = vb;
   S2:  a = b;       STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2

   Operands that are not SSA_NAMEs are data-refs that appear in
   load/store operations (like 'x[i]' in S1), and are handled differently.

   Target modeling:
   =================
   Currently the only target specific information that is used is the
   size of the vector (in bytes) - "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".
   Targets that can support different sizes of vectors, for now will need
   to specify one value for "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".  More
   flexibility will be added in the future.

   Since we only vectorize operations whose vector form can be
   expressed using existing tree codes, to verify that an operation is
   supported, the vectorizer checks the relevant optab at the relevant
   machine_mode (e.g., optab_handler (add_optab, V8HImode)).  If
   the value found is CODE_FOR_nothing, then there's no target support, and
   we can't vectorize the stmt.

   For additional information on this project see:
   http://gcc.gnu.org/projects/tree-ssa/vectorization.html
*/
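
/* A minimal sketch of the optab query described above; the wrapper function
   is illustrative only, while optab_handler and CODE_FOR_nothing are the
   real interfaces named in the comment:

     static bool
     example_v8hi_add_supported_p (void)
     {
       return optab_handler (add_optab, V8HImode) != CODE_FOR_nothing;
     }

   A target whose add pattern covers V8HImode makes this return true;
   otherwise a stmt needing a V8HI addition cannot be vectorized.  */
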
/* Function vect_determine_vectorization_factor

   Determine the vectorization factor (VF).  VF is the number of data elements
   that are operated upon in parallel in a single iteration of the vectorized
   loop.  For example, when vectorizing a loop that operates on 4byte elements,
   on a target with vector size (VS) 16byte, the VF is set to 4, since 4
   elements can fit in a single vector register.

   We currently support vectorization of loops in which all types operated upon
   are of the same size.  Therefore this function currently sets VF according to
   the size of the types operated upon, and fails if there are multiple sizes
   in the loop.

   VF is also the factor by which the loop iterations are strip-mined, e.g.:
   original loop:
        for (i=0; i<N; i++){
          a[i] = b[i] + c[i];
        }

   vectorized loop:
        for (i=0; i<N; i+=VF){
          a[i:VF] = b[i:VF] + c[i:VF];
        }
*/

static bool
vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
{
  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
  int nbbs = loop->num_nodes;
  gimple_stmt_iterator si;
  unsigned int vectorization_factor = 0;
  tree scalar_type;
  gimple phi;
  tree vectype;
  unsigned int nunits;
  stmt_vec_info stmt_info;
  int i;
  HOST_WIDE_INT dummy;

  if (vect_print_dump_info (REPORT_DETAILS))
    fprintf (vect_dump, "=== vect_determine_vectorization_factor ===");

  for (i = 0; i < nbbs; i++)
    {
      basic_block bb = bbs[i];

      for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
        {
          phi = gsi_stmt (si);
          stmt_info = vinfo_for_stmt (phi);
          if (vect_print_dump_info (REPORT_DETAILS))
            {
              fprintf (vect_dump, "==> examining phi: ");
              print_gimple_stmt (vect_dump, phi, 0, TDF_SLIM);
            }

          gcc_assert (stmt_info);

          if (STMT_VINFO_RELEVANT_P (stmt_info))
            {
              gcc_assert (!STMT_VINFO_VECTYPE (stmt_info));
              scalar_type = TREE_TYPE (PHI_RESULT (phi));

              if (vect_print_dump_info (REPORT_DETAILS))
                {
                  fprintf (vect_dump, "get vectype for scalar type:  ");
                  print_generic_expr (vect_dump, scalar_type, TDF_SLIM);
                }

              vectype = get_vectype_for_scalar_type (scalar_type);
              if (!vectype)
                {
                  if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
                    {
                      fprintf (vect_dump,
                               "not vectorized: unsupported data-type ");
                      print_generic_expr (vect_dump, scalar_type, TDF_SLIM);
                    }
                  return false;
                }
              STMT_VINFO_VECTYPE (stmt_info) = vectype;

              if (vect_print_dump_info (REPORT_DETAILS))
                {
                  fprintf (vect_dump, "vectype: ");
                  print_generic_expr (vect_dump, vectype, TDF_SLIM);
                }

              nunits = TYPE_VECTOR_SUBPARTS (vectype);
              if (vect_print_dump_info (REPORT_DETAILS))
                fprintf (vect_dump, "nunits = %d", nunits);

              if (!vectorization_factor
                  || (nunits > vectorization_factor))
                vectorization_factor = nunits;
            }
        }

      for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
        {
          tree vf_vectype;
          gimple stmt = gsi_stmt (si);
          stmt_info = vinfo_for_stmt (stmt);

          if (vect_print_dump_info (REPORT_DETAILS))
            {
              fprintf (vect_dump, "==> examining statement: ");
              print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
            }

          gcc_assert (stmt_info);

          /* Skip stmts which do not need to be vectorized.  */
          if (!STMT_VINFO_RELEVANT_P (stmt_info)
              && !STMT_VINFO_LIVE_P (stmt_info))
            {
              if (vect_print_dump_info (REPORT_DETAILS))
                fprintf (vect_dump, "skip.");
              continue;
            }

          if (gimple_get_lhs (stmt) == NULL_TREE)
            {
              if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
                {
                  fprintf (vect_dump, "not vectorized: irregular stmt.");
                  print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
                }
              return false;
            }

          if (VECTOR_MODE_P (TYPE_MODE (gimple_expr_type (stmt))))
            {
              if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
                {
                  fprintf (vect_dump, "not vectorized: vector stmt in loop:");
                  print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
                }
              return false;
            }

          if (STMT_VINFO_VECTYPE (stmt_info))
            {
              /* The only case when a vectype had been already set is for stmts
                 that contain a dataref, or for "pattern-stmts" (stmts generated
                 by the vectorizer to represent/replace a certain idiom).  */
              gcc_assert (STMT_VINFO_DATA_REF (stmt_info)
                          || is_pattern_stmt_p (stmt_info));
              vectype = STMT_VINFO_VECTYPE (stmt_info);
            }
          else
            {
              gcc_assert (!STMT_VINFO_DATA_REF (stmt_info)
                          && !is_pattern_stmt_p (stmt_info));

              scalar_type = TREE_TYPE (gimple_get_lhs (stmt));
              if (vect_print_dump_info (REPORT_DETAILS))
                {
                  fprintf (vect_dump, "get vectype for scalar type:  ");
                  print_generic_expr (vect_dump, scalar_type, TDF_SLIM);
                }
              vectype = get_vectype_for_scalar_type (scalar_type);
              if (!vectype)
                {
                  if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
                    {
                      fprintf (vect_dump,
                               "not vectorized: unsupported data-type ");
                      print_generic_expr (vect_dump, scalar_type, TDF_SLIM);
                    }
                  return false;
                }

              STMT_VINFO_VECTYPE (stmt_info) = vectype;
            }

          /* The vectorization factor is according to the smallest
             scalar type (or the largest vector size, but we only
             support one vector size per loop).  */
          scalar_type = vect_get_smallest_scalar_type (stmt, &dummy,
                                                       &dummy);
          if (vect_print_dump_info (REPORT_DETAILS))
            {
              fprintf (vect_dump, "get vectype for scalar type:  ");
              print_generic_expr (vect_dump, scalar_type, TDF_SLIM);
            }
          vf_vectype = get_vectype_for_scalar_type (scalar_type);
          if (!vf_vectype)
            {
              if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
                {
                  fprintf (vect_dump,
                           "not vectorized: unsupported data-type ");
                  print_generic_expr (vect_dump, scalar_type, TDF_SLIM);
                }
              return false;
            }

          if ((GET_MODE_SIZE (TYPE_MODE (vectype))
               != GET_MODE_SIZE (TYPE_MODE (vf_vectype))))
            {
              if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
                {
                  fprintf (vect_dump,
                           "not vectorized: different sized vector "
                           "types in statement, ");
                  print_generic_expr (vect_dump, vectype, TDF_SLIM);
                  fprintf (vect_dump, " and ");
                  print_generic_expr (vect_dump, vf_vectype, TDF_SLIM);
                }
              return false;
            }

          if (vect_print_dump_info (REPORT_DETAILS))
            {
              fprintf (vect_dump, "vectype: ");
              print_generic_expr (vect_dump, vf_vectype, TDF_SLIM);
            }

          nunits = TYPE_VECTOR_SUBPARTS (vf_vectype);
          if (vect_print_dump_info (REPORT_DETAILS))
            fprintf (vect_dump, "nunits = %d", nunits);

          if (!vectorization_factor
              || (nunits > vectorization_factor))
            vectorization_factor = nunits;
        }
    }

  /* TODO: Analyze cost.  Decide if worth while to vectorize.  */
  if (vect_print_dump_info (REPORT_DETAILS))
    fprintf (vect_dump, "vectorization factor = %d", vectorization_factor);
  if (vectorization_factor <= 1)
    {
      if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
        fprintf (vect_dump, "not vectorized: unsupported data-type");
      return false;
    }
  LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;

  return true;
}
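
/* Worked example for the factor computed above (illustrative only): with a
   16-byte vector size and a loop whose narrowest operated-upon type is
   "short" (2 bytes), TYPE_VECTOR_SUBPARTS of the chosen V8HI vectype is 8,
   so VF = 8 and the loop is strip-mined by 8:

     short a[N], b[N], c[N];
     for (i = 0; i < N; i += 8)
       a[i:8] = b[i:8] + c[i:8];

   If another stmt in the same loop operated on "int", its vectype would be
   V4SI; both vectypes have the same 16-byte mode size, so the size check
   above accepts them, and VF stays at the larger subparts count, 8.  */
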
/* Function vect_is_simple_iv_evolution.

   FORNOW: A simple evolution of an induction variable in the loop is
   considered a polynomial evolution with constant step.  */

static bool
vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init,
                             tree * step)
{
  tree init_expr;
  tree step_expr;
  tree evolution_part = evolution_part_in_loop_num (access_fn, loop_nb);

  /* When there is no evolution in this loop, the evolution function
     is not "simple".  */
  if (evolution_part == NULL_TREE)
    return false;

  /* When the evolution is a polynomial of degree >= 2
     the evolution function is not "simple".  */
  if (tree_is_chrec (evolution_part))
    return false;

  step_expr = evolution_part;
  init_expr = unshare_expr (initial_condition_in_loop_num (access_fn, loop_nb));

  if (vect_print_dump_info (REPORT_DETAILS))
    {
      fprintf (vect_dump, "step: ");
      print_generic_expr (vect_dump, step_expr, TDF_SLIM);
      fprintf (vect_dump, ",  init: ");
      print_generic_expr (vect_dump, init_expr, TDF_SLIM);
    }

  *init = init_expr;
  *step = step_expr;

  if (TREE_CODE (step_expr) != INTEGER_CST)
    {
      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "step unknown.");
      return false;
    }

  return true;
}
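
/* Example of what the test above accepts (a sketch in scev's chrec
   notation, not code from this file): for

     for (i = 0; i < n; i++)
       p += 4;

   the evolution of p is the polynomial chrec {p_0, +, 4}_1: initial value
   p_0, constant step 4.  Here evolution_part is the INTEGER_CST 4 and the
   initial condition is p_0, so the evolution is "simple".  A variable whose
   step itself evolves in the loop (a chrec of degree >= 2, e.g.
   {p_0, +, {4, +, 1}_1}_1) is rejected by the tree_is_chrec check.  */
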
/* Function vect_analyze_scalar_cycles_1.

   Examine the cross iteration def-use cycles of scalar variables
   in LOOP.  LOOP_VINFO represents the loop that is now being
   considered for vectorization (can be LOOP, or an outer-loop
   enclosing LOOP).  */

static void
vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, struct loop *loop)
{
  basic_block bb = loop->header;
  tree dumy;
  VEC(gimple,heap) *worklist = VEC_alloc (gimple, heap, 64);
  gimple_stmt_iterator gsi;
  bool double_reduc;

  if (vect_print_dump_info (REPORT_DETAILS))
    fprintf (vect_dump, "=== vect_analyze_scalar_cycles ===");

  /* First - identify all inductions.  Reduction detection assumes that all the
     inductions have been identified, therefore, this order must not be
     changed.  */
  for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
    {
      gimple phi = gsi_stmt (gsi);
      tree access_fn = NULL;
      tree def = PHI_RESULT (phi);
      stmt_vec_info stmt_vinfo = vinfo_for_stmt (phi);

      if (vect_print_dump_info (REPORT_DETAILS))
        {
          fprintf (vect_dump, "Analyze phi: ");
          print_gimple_stmt (vect_dump, phi, 0, TDF_SLIM);
        }

      /* Skip virtual phi's.  The data dependences that are associated with
         virtual defs/uses (i.e., memory accesses) are analyzed elsewhere.  */
      if (!is_gimple_reg (SSA_NAME_VAR (def)))
        continue;

      STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_unknown_def_type;

      /* Analyze the evolution function.  */
      access_fn = analyze_scalar_evolution (loop, def);
      if (access_fn)
        STRIP_NOPS (access_fn);
      if (access_fn && vect_print_dump_info (REPORT_DETAILS))
        {
          fprintf (vect_dump, "Access function of PHI: ");
          print_generic_expr (vect_dump, access_fn, TDF_SLIM);
        }

      if (!access_fn
          || !vect_is_simple_iv_evolution (loop->num, access_fn, &dumy, &dumy))
        {
          VEC_safe_push (gimple, heap, worklist, phi);
          continue;
        }

      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "Detected induction.");
      STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_induction_def;
    }


  /* Second - identify all reductions and nested cycles.  */
  while (VEC_length (gimple, worklist) > 0)
    {
      gimple phi = VEC_pop (gimple, worklist);
      tree def = PHI_RESULT (phi);
      stmt_vec_info stmt_vinfo = vinfo_for_stmt (phi);
      gimple reduc_stmt;
      bool nested_cycle;

      if (vect_print_dump_info (REPORT_DETAILS))
        {
          fprintf (vect_dump, "Analyze phi: ");
          print_gimple_stmt (vect_dump, phi, 0, TDF_SLIM);
        }

      gcc_assert (is_gimple_reg (SSA_NAME_VAR (def)));
      gcc_assert (STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_unknown_def_type);

      nested_cycle = (loop != LOOP_VINFO_LOOP (loop_vinfo));
      reduc_stmt = vect_force_simple_reduction (loop_vinfo, phi, !nested_cycle,
                                                &double_reduc);
      if (reduc_stmt)
        {
          if (double_reduc)
            {
              if (vect_print_dump_info (REPORT_DETAILS))
                fprintf (vect_dump, "Detected double reduction.");

              STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_double_reduction_def;
              STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
                                                    vect_double_reduction_def;
            }
          else
            {
              if (nested_cycle)
                {
                  if (vect_print_dump_info (REPORT_DETAILS))
                    fprintf (vect_dump, "Detected vectorizable nested cycle.");

                  STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_nested_cycle;
                  STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
                                                             vect_nested_cycle;
                }
              else
                {
                  if (vect_print_dump_info (REPORT_DETAILS))
                    fprintf (vect_dump, "Detected reduction.");

                  STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def;
                  STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
                                                           vect_reduction_def;
                  /* Store the reduction cycles for possible vectorization in
                     loop-aware SLP.  */
                  VEC_safe_push (gimple, heap,
                                 LOOP_VINFO_REDUCTIONS (loop_vinfo),
                                 reduc_stmt);
                }
            }
        }
      else
        if (vect_print_dump_info (REPORT_DETAILS))
          fprintf (vect_dump, "Unknown def-use cycle pattern.");
    }

  VEC_free (gimple, heap, worklist);
}
/* Function vect_analyze_scalar_cycles.

   Examine the cross iteration def-use cycles of scalar variables, by
   analyzing the loop-header PHIs of scalar variables.  Classify each
   cycle as one of the following: invariant, induction, reduction, unknown.
   We do that for the loop represented by LOOP_VINFO, and also for its
   inner-loop, if it exists.

   Examples for scalar cycles:

   Example1: reduction:

              loop1:
              for (i=0; i<N; i++)
                 sum += a[i];

   Example2: induction:

              loop2:
              for (i=0; i<N; i++)
                 a[i] = i;  */

static void
vect_analyze_scalar_cycles (loop_vec_info loop_vinfo)
{
  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);

  vect_analyze_scalar_cycles_1 (loop_vinfo, loop);

  /* When vectorizing an outer-loop, the inner-loop is executed sequentially.
     Reductions in such inner-loop therefore have different properties than
     the reductions in the nest that gets vectorized:
     1. When vectorized, they are executed in the same order as in the original
        scalar loop, so we can't change the order of computation when
        vectorizing them.
     2. FIXME: Inner-loop reductions can be used in the inner-loop, so the
        current checks are too strict.  */

  if (loop->inner)
    vect_analyze_scalar_cycles_1 (loop_vinfo, loop->inner);
}
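
/* Illustration of the classification above (a sketch, not code from this
   file): in the nest

     for (i = 0; i < N; i++)        <- loop being vectorized
       {
         s = 0;
         for (j = 0; j < M; j++)    <- inner loop, executed sequentially
           s += a[i][j];
         b[i] = s;
       }

   the header phi of s in the inner loop is classified as a nested cycle
   when the outer loop is the one represented by LOOP_VINFO: per point 1
   above, the inner-loop accumulation must keep its original evaluation
   order, so it is not treated as a reorderable reduction.  */
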
/* Function vect_get_loop_niters.

   Determine how many iterations the loop is executed.
   If an expression that represents the number of iterations
   can be constructed, place it in NUMBER_OF_ITERATIONS.
   Return the loop exit condition.  */

static gimple
vect_get_loop_niters (struct loop *loop, tree *number_of_iterations)
{
  tree niters;

  if (vect_print_dump_info (REPORT_DETAILS))
    fprintf (vect_dump, "=== get_loop_niters ===");

  niters = number_of_exit_cond_executions (loop);

  if (niters != NULL_TREE
      && niters != chrec_dont_know)
    {
      *number_of_iterations = niters;

      if (vect_print_dump_info (REPORT_DETAILS))
        {
          fprintf (vect_dump, "==> get_loop_niters:" );
          print_generic_expr (vect_dump, *number_of_iterations, TDF_SLIM);
        }
    }

  return get_loop_exit_condition (loop);
}
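
/* For instance (illustrative): for a counted loop

     for (i = 0; i < n; i++)
       a[i] = 0;

   scev can construct a symbolic trip-count expression such as "n", and that
   expression is what ends up in NUMBER_OF_ITERATIONS.  For a loop whose
   trip count scev cannot analyze, number_of_exit_cond_executions returns
   chrec_dont_know and NUMBER_OF_ITERATIONS is left untouched.  */
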
/* Function bb_in_loop_p

   Used as predicate for dfs order traversal of the loop bbs.  */

static bool
bb_in_loop_p (const_basic_block bb, const void *data)
{
  const struct loop *const loop = (const struct loop *)data;
  if (flow_bb_inside_loop_p (loop, bb))
    return true;
  return false;
}
/* Function new_loop_vec_info.

   Create and initialize a new loop_vec_info struct for LOOP, as well as
   stmt_vec_info structs for all the stmts in LOOP.  */

static loop_vec_info
new_loop_vec_info (struct loop *loop)
{
  loop_vec_info res;
  basic_block *bbs;
  gimple_stmt_iterator si;
  unsigned int i, nbbs;

  res = (loop_vec_info) xcalloc (1, sizeof (struct _loop_vec_info));
  LOOP_VINFO_LOOP (res) = loop;

  bbs = get_loop_body (loop);

  /* Create/Update stmt_info for all stmts in the loop.  */
  for (i = 0; i < loop->num_nodes; i++)
    {
      basic_block bb = bbs[i];

      /* BBs in a nested inner-loop will have been already processed (because
         we will have called vect_analyze_loop_form for any nested inner-loop).
         Therefore, for stmts in an inner-loop we just want to update the
         STMT_VINFO_LOOP_VINFO field of their stmt_info to point to the new
         loop_info of the outer-loop we are currently considering to vectorize
         (instead of the loop_info of the inner-loop).
         For stmts in other BBs we need to create a stmt_info from scratch.  */
      if (bb->loop_father != loop)
        {
          /* Inner-loop bb.  */
          gcc_assert (loop->inner && bb->loop_father == loop->inner);
          for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
            {
              gimple phi = gsi_stmt (si);
              stmt_vec_info stmt_info = vinfo_for_stmt (phi);
              loop_vec_info inner_loop_vinfo =
                                             STMT_VINFO_LOOP_VINFO (stmt_info);
              gcc_assert (loop->inner == LOOP_VINFO_LOOP (inner_loop_vinfo));
              STMT_VINFO_LOOP_VINFO (stmt_info) = res;
            }
          for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
            {
              gimple stmt = gsi_stmt (si);
              stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
              loop_vec_info inner_loop_vinfo =
                                             STMT_VINFO_LOOP_VINFO (stmt_info);
              gcc_assert (loop->inner == LOOP_VINFO_LOOP (inner_loop_vinfo));
              STMT_VINFO_LOOP_VINFO (stmt_info) = res;
            }
        }
      else
        {
          /* bb in current nest.  */
          for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
            {
              gimple phi = gsi_stmt (si);
              gimple_set_uid (phi, 0);
              set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, res, NULL));
            }

          for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
            {
              gimple stmt = gsi_stmt (si);
              gimple_set_uid (stmt, 0);
              set_vinfo_for_stmt (stmt, new_stmt_vec_info (stmt, res, NULL));
            }
        }
    }

  /* CHECKME: We want to visit all BBs before their successors (except for
     latch blocks, for which this assertion wouldn't hold).  In the simple
     case of the loop forms we allow, a dfs order of the BBs would be the same
     as reversed postorder traversal, so we are safe.  */

  free (bbs);
  bbs = XCNEWVEC (basic_block, loop->num_nodes);
  nbbs = dfs_enumerate_from (loop->header, 0, bb_in_loop_p,
                             bbs, loop->num_nodes, loop);
  gcc_assert (nbbs == loop->num_nodes);

  LOOP_VINFO_BBS (res) = bbs;
  LOOP_VINFO_NITERS (res) = NULL;
  LOOP_VINFO_NITERS_UNCHANGED (res) = NULL;
  LOOP_VINFO_COST_MODEL_MIN_ITERS (res) = 0;
  LOOP_VINFO_VECTORIZABLE_P (res) = 0;
  LOOP_PEELING_FOR_ALIGNMENT (res) = 0;
  LOOP_VINFO_VECT_FACTOR (res) = 0;
  LOOP_VINFO_LOOP_NEST (res) = VEC_alloc (loop_p, heap, 3);
  LOOP_VINFO_DATAREFS (res) = VEC_alloc (data_reference_p, heap, 10);
  LOOP_VINFO_DDRS (res) = VEC_alloc (ddr_p, heap, 10 * 10);
  LOOP_VINFO_UNALIGNED_DR (res) = NULL;
  LOOP_VINFO_MAY_MISALIGN_STMTS (res) =
    VEC_alloc (gimple, heap,
               PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIGNMENT_CHECKS));
  LOOP_VINFO_MAY_ALIAS_DDRS (res) =
    VEC_alloc (ddr_p, heap,
               PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS));
  LOOP_VINFO_STRIDED_STORES (res) = VEC_alloc (gimple, heap, 10);
  LOOP_VINFO_REDUCTIONS (res) = VEC_alloc (gimple, heap, 10);
  LOOP_VINFO_REDUCTION_CHAINS (res) = VEC_alloc (gimple, heap, 10);
  LOOP_VINFO_SLP_INSTANCES (res) = VEC_alloc (slp_instance, heap, 10);
  LOOP_VINFO_SLP_UNROLLING_FACTOR (res) = 1;
  LOOP_VINFO_PEELING_HTAB (res) = NULL;
  LOOP_VINFO_PEELING_FOR_GAPS (res) = false;

  return res;
}
/* Function destroy_loop_vec_info.

   Free LOOP_VINFO struct, as well as all the stmt_vec_info structs of all the
   stmts in the loop.  */

void
destroy_loop_vec_info (loop_vec_info loop_vinfo, bool clean_stmts)
{
  struct loop *loop;
  basic_block *bbs;
  int nbbs;
  gimple_stmt_iterator si;
  int j;
  VEC (slp_instance, heap) *slp_instances;
  slp_instance instance;

  if (!loop_vinfo)
    return;

  loop = LOOP_VINFO_LOOP (loop_vinfo);

  bbs = LOOP_VINFO_BBS (loop_vinfo);
  nbbs = loop->num_nodes;

  if (!clean_stmts)
    {
      free (LOOP_VINFO_BBS (loop_vinfo));
      free_data_refs (LOOP_VINFO_DATAREFS (loop_vinfo));
      free_dependence_relations (LOOP_VINFO_DDRS (loop_vinfo));
      VEC_free (loop_p, heap, LOOP_VINFO_LOOP_NEST (loop_vinfo));
      VEC_free (gimple, heap, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo));
      VEC_free (ddr_p, heap, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo));

      free (loop_vinfo);
      loop->aux = NULL;
      return;
    }

  for (j = 0; j < nbbs; j++)
    {
      basic_block bb = bbs[j];
      for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
        free_stmt_vec_info (gsi_stmt (si));

      for (si = gsi_start_bb (bb); !gsi_end_p (si); )
        {
          gimple stmt = gsi_stmt (si);
          stmt_vec_info stmt_info = vinfo_for_stmt (stmt);

          if (stmt_info)
            {
              /* Check if this is a "pattern stmt" (introduced by the
                 vectorizer during the pattern recognition pass).  */
              bool remove_stmt_p = false;
              gimple orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
              if (orig_stmt)
                {
                  stmt_vec_info orig_stmt_info = vinfo_for_stmt (orig_stmt);
                  if (orig_stmt_info
                      && STMT_VINFO_IN_PATTERN_P (orig_stmt_info))
                    remove_stmt_p = true;
                }

              /* Free stmt_vec_info.  */
              free_stmt_vec_info (stmt);

              /* Remove dead "pattern stmts".  */
              if (remove_stmt_p)
                gsi_remove (&si, true);
            }
          gsi_next (&si);
        }
    }

  free (LOOP_VINFO_BBS (loop_vinfo));
  free_data_refs (LOOP_VINFO_DATAREFS (loop_vinfo));
  free_dependence_relations (LOOP_VINFO_DDRS (loop_vinfo));
  VEC_free (loop_p, heap, LOOP_VINFO_LOOP_NEST (loop_vinfo));
  VEC_free (gimple, heap, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo));
  VEC_free (ddr_p, heap, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo));
  slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
  FOR_EACH_VEC_ELT (slp_instance, slp_instances, j, instance)
    vect_free_slp_instance (instance);

  VEC_free (slp_instance, heap, LOOP_VINFO_SLP_INSTANCES (loop_vinfo));
  VEC_free (gimple, heap, LOOP_VINFO_STRIDED_STORES (loop_vinfo));
  VEC_free (gimple, heap, LOOP_VINFO_REDUCTIONS (loop_vinfo));
  VEC_free (gimple, heap, LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo));

  if (LOOP_VINFO_PEELING_HTAB (loop_vinfo))
    htab_delete (LOOP_VINFO_PEELING_HTAB (loop_vinfo));

  free (loop_vinfo);
  loop->aux = NULL;
}
/* Function vect_analyze_loop_1.

   Apply a set of analyses on LOOP, and create a loop_vec_info struct
   for it.  The different analyses will record information in the
   loop_vec_info struct.  This is a subset of the analyses applied in
   vect_analyze_loop, to be applied on an inner-loop nested in the loop
   that is now considered for (outer-loop) vectorization.  */

static loop_vec_info
vect_analyze_loop_1 (struct loop *loop)
{
  loop_vec_info loop_vinfo;

  if (vect_print_dump_info (REPORT_DETAILS))
    fprintf (vect_dump, "===== analyze_loop_nest_1 =====");

  /* Check the CFG characteristics of the loop (nesting, entry/exit, etc.).  */

  loop_vinfo = vect_analyze_loop_form (loop);
  if (!loop_vinfo)
    {
      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "bad inner-loop form.");
      return NULL;
    }

  return loop_vinfo;
}
/* Function vect_analyze_loop_form.

   Verify that certain CFG restrictions hold, including:
   - the loop has a pre-header
   - the loop has a single entry and exit
   - the loop exit condition is simple enough, and the number of iterations
     can be analyzed (a countable loop).  */

loop_vec_info
vect_analyze_loop_form (struct loop *loop)
{
  loop_vec_info loop_vinfo;
  gimple loop_cond;
  tree number_of_iterations = NULL;
  loop_vec_info inner_loop_vinfo = NULL;

  if (vect_print_dump_info (REPORT_DETAILS))
    fprintf (vect_dump, "=== vect_analyze_loop_form ===");

  /* Different restrictions apply when we are considering an inner-most loop,
     vs. an outer (nested) loop.
     (FORNOW. May want to relax some of these restrictions in the future).  */

  if (!loop->inner)
    {
      /* Inner-most loop.  We currently require that the number of BBs is
         exactly 2 (the header and latch).  Vectorizable inner-most loops
         look like this:

                        (pre-header)
                           |
                          header <--------+
                           | |            |
                           | +--> latch --+
                           |
                        (exit-bb)  */

      if (loop->num_nodes != 2)
        {
          if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
            fprintf (vect_dump, "not vectorized: control flow in loop.");
          return NULL;
        }

      if (empty_block_p (loop->header))
        {
          if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
            fprintf (vect_dump, "not vectorized: empty loop.");
          return NULL;
        }
    }
  else
    {
      struct loop *innerloop = loop->inner;
      edge entryedge;

      /* Nested loop.  We currently require that the loop is doubly-nested,
         contains a single inner loop, and the number of BBs is exactly 5.
         Vectorizable outer-loops look like this:

                        (pre-header)
                           |
                          header <---+
                           |         |
                          inner-loop |
                           |         |
                          tail ------+
                           |
                        (exit-bb)

         The inner-loop has the properties expected of inner-most loops
         as described above.  */

      if ((loop->inner)->inner || (loop->inner)->next)
        {
          if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
            fprintf (vect_dump, "not vectorized: multiple nested loops.");
          return NULL;
        }

      /* Analyze the inner-loop.  */
      inner_loop_vinfo = vect_analyze_loop_1 (loop->inner);
      if (!inner_loop_vinfo)
        {
          if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
            fprintf (vect_dump, "not vectorized: Bad inner loop.");
          return NULL;
        }

      if (!expr_invariant_in_loop_p (loop,
                                     LOOP_VINFO_NITERS (inner_loop_vinfo)))
        {
          if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
            fprintf (vect_dump,
                     "not vectorized: inner-loop count not invariant.");
          destroy_loop_vec_info (inner_loop_vinfo, true);
          return NULL;
        }

      if (loop->num_nodes != 5)
        {
          if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
            fprintf (vect_dump, "not vectorized: control flow in loop.");
          destroy_loop_vec_info (inner_loop_vinfo, true);
          return NULL;
        }

      gcc_assert (EDGE_COUNT (innerloop->header->preds) == 2);
      entryedge = EDGE_PRED (innerloop->header, 0);
      if (EDGE_PRED (innerloop->header, 0)->src == innerloop->latch)
        entryedge = EDGE_PRED (innerloop->header, 1);

      if (entryedge->src != loop->header
          || !single_exit (innerloop)
          || single_exit (innerloop)->dest != EDGE_PRED (loop->latch, 0)->src)
        {
          if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
            fprintf (vect_dump, "not vectorized: unsupported outerloop form.");
          destroy_loop_vec_info (inner_loop_vinfo, true);
          return NULL;
        }

      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "Considering outer-loop vectorization.");
    }

  if (!single_exit (loop)
      || EDGE_COUNT (loop->header->preds) != 2)
    {
      if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
        {
          if (!single_exit (loop))
            fprintf (vect_dump, "not vectorized: multiple exits.");
          else if (EDGE_COUNT (loop->header->preds) != 2)
            fprintf (vect_dump, "not vectorized: too many incoming edges.");
        }
      if (inner_loop_vinfo)
        destroy_loop_vec_info (inner_loop_vinfo, true);
      return NULL;
    }

  /* We assume that the loop exit condition is at the end of the loop, i.e.,
     that the loop is represented as a do-while (with a proper if-guard
     before the loop if needed), where the loop header contains all the
     executable statements, and the latch is empty.  */
  if (!empty_block_p (loop->latch)
      || !gimple_seq_empty_p (phi_nodes (loop->latch)))
    {
      if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
        fprintf (vect_dump, "not vectorized: unexpected loop form.");
      if (inner_loop_vinfo)
        destroy_loop_vec_info (inner_loop_vinfo, true);
      return NULL;
    }

  /* Make sure there exists a single-predecessor exit bb:  */
  if (!single_pred_p (single_exit (loop)->dest))
    {
      edge e = single_exit (loop);
      if (!(e->flags & EDGE_ABNORMAL))
        {
          split_loop_exit_edge (e);
          if (vect_print_dump_info (REPORT_DETAILS))
            fprintf (vect_dump, "split exit edge.");
        }
      else
        {
          if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
            fprintf (vect_dump, "not vectorized: abnormal loop exit edge.");
          if (inner_loop_vinfo)
            destroy_loop_vec_info (inner_loop_vinfo, true);
          return NULL;
        }
    }

  loop_cond = vect_get_loop_niters (loop, &number_of_iterations);
  if (!loop_cond)
    {
      if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
        fprintf (vect_dump, "not vectorized: complicated exit condition.");
      if (inner_loop_vinfo)
        destroy_loop_vec_info (inner_loop_vinfo, true);
      return NULL;
    }

  if (!number_of_iterations)
    {
      if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
        fprintf (vect_dump,
                 "not vectorized: number of iterations cannot be computed.");
      if (inner_loop_vinfo)
        destroy_loop_vec_info (inner_loop_vinfo, true);
      return NULL;
    }

  if (chrec_contains_undetermined (number_of_iterations))
    {
      if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
        fprintf (vect_dump, "Infinite number of iterations.");
      if (inner_loop_vinfo)
        destroy_loop_vec_info (inner_loop_vinfo, true);
      return NULL;
    }

  if (!NITERS_KNOWN_P (number_of_iterations))
    {
      if (vect_print_dump_info (REPORT_DETAILS))
        {
          fprintf (vect_dump, "Symbolic number of iterations is ");
          print_generic_expr (vect_dump, number_of_iterations, TDF_DETAILS);
        }
    }
  else if (TREE_INT_CST_LOW (number_of_iterations) == 0)
    {
      if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
        fprintf (vect_dump, "not vectorized: number of iterations = 0.");
      if (inner_loop_vinfo)
        destroy_loop_vec_info (inner_loop_vinfo, false);
      return NULL;
    }

  loop_vinfo = new_loop_vec_info (loop);
  LOOP_VINFO_NITERS (loop_vinfo) = number_of_iterations;
  LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = number_of_iterations;

  STMT_VINFO_TYPE (vinfo_for_stmt (loop_cond)) = loop_exit_ctrl_vec_info_type;

  /* CHECKME: May want to keep it around in the future.  */
  if (inner_loop_vinfo)
    destroy_loop_vec_info (inner_loop_vinfo, false);

  gcc_assert (!loop->aux);
  loop->aux = loop_vinfo;
  return loop_vinfo;
}
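
/* Shape accepted by the checks above (illustrative): after gimplification
   the inner-most loop is expected in do-while form, e.g.

     i = 0;
     do
       {
         a[i] = b[i] + c[i];   <- header holds all executable stmts
         i = i + 1;
       }
     while (i < n);            <- exit test at the end, empty latch

   A loop with an extra conditional in the body (more BBs than the header
   and latch) fails the num_nodes == 2 test with "control flow in loop".  */
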
/* Get cost by calling cost target builtin.  */

static inline int
vect_get_cost (enum vect_cost_for_stmt type_of_cost)
{
  tree dummy_type = NULL;
  int dummy = 0;

  return targetm.vectorize.builtin_vectorization_cost (type_of_cost,
                                                       dummy_type, dummy);
}
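
/* Typical use (illustrative): vect_get_cost (scalar_stmt) or
   vect_get_cost (vector_stmt) asks the target's cost hook for the relative
   cost of one statement of that kind; the cost model below compares sums of
   such per-stmt costs to decide profitability.  The dummy type and
   misalignment arguments are unused for these simple cost kinds.  */
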
/* Function vect_analyze_loop_operations.

   Scan the loop stmts and make sure they are all vectorizable.  */

static bool
vect_analyze_loop_operations (loop_vec_info loop_vinfo, bool slp)
{
  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
  int nbbs = loop->num_nodes;
  gimple_stmt_iterator si;
  unsigned int vectorization_factor = 0;
  int i;
  gimple phi;
  stmt_vec_info stmt_info;
  bool need_to_vectorize = false;
  int min_profitable_iters;
  int min_scalar_loop_bound;
  unsigned int th;
  bool only_slp_in_loop = true, ok;

  if (vect_print_dump_info (REPORT_DETAILS))
    fprintf (vect_dump, "=== vect_analyze_loop_operations ===");

  gcc_assert (LOOP_VINFO_VECT_FACTOR (loop_vinfo));
  vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
  if (slp)
    {
      /* If all the stmts in the loop can be SLPed, we perform only SLP, and
         the vectorization factor of the loop is the unrolling factor required
         by the SLP instances.  If that unrolling factor is 1, we say that we
         perform pure SLP on the loop - cross iteration parallelism is not
         exploited.  */
      for (i = 0; i < nbbs; i++)
        {
          basic_block bb = bbs[i];
          for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
            {
              gimple stmt = gsi_stmt (si);
              stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
              gcc_assert (stmt_info);
              if ((STMT_VINFO_RELEVANT_P (stmt_info)
                   || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
                  && !PURE_SLP_STMT (stmt_info))
                /* STMT needs both SLP and loop-based vectorization.  */
                only_slp_in_loop = false;
            }
        }

      if (only_slp_in_loop)
        vectorization_factor = LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo);
      else
        vectorization_factor = least_common_multiple (vectorization_factor,
                                LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo));

      LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "Updating vectorization factor to %d ",
                 vectorization_factor);
    }

  for (i = 0; i < nbbs; i++)
    {
      basic_block bb = bbs[i];

      for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
        {
          phi = gsi_stmt (si);
          ok = true;

          stmt_info = vinfo_for_stmt (phi);
          if (vect_print_dump_info (REPORT_DETAILS))
            {
              fprintf (vect_dump, "examining phi: ");
              print_gimple_stmt (vect_dump, phi, 0, TDF_SLIM);
            }

          /* Inner-loop loop-closed exit phi in outer-loop vectorization
             (i.e., a phi in the tail of the outer-loop).  */
          if (! is_loop_header_bb_p (bb))
            {
              /* FORNOW: we currently don't support the case that these phis
                 are not used in the outerloop (unless it is double reduction,
                 i.e., this phi is vect_reduction_def), because this case
                 requires us to actually do something here.  */
              if ((!STMT_VINFO_RELEVANT_P (stmt_info)
                   || STMT_VINFO_LIVE_P (stmt_info))
                  && STMT_VINFO_DEF_TYPE (stmt_info)
                     != vect_double_reduction_def)
                {
                  if (vect_print_dump_info (REPORT_DETAILS))
                    fprintf (vect_dump,
                             "Unsupported loop-closed phi in outer-loop.");
                  return false;
                }

              /* If PHI is used in the outer loop, we check that its operand
                 is defined in the inner loop.  */
              if (STMT_VINFO_RELEVANT_P (stmt_info))
                {
                  tree phi_op;
                  gimple op_def_stmt;

                  if (gimple_phi_num_args (phi) != 1)
                    return false;

                  phi_op = PHI_ARG_DEF (phi, 0);
                  if (TREE_CODE (phi_op) != SSA_NAME)
                    return false;

                  op_def_stmt = SSA_NAME_DEF_STMT (phi_op);
                  if (!op_def_stmt || !vinfo_for_stmt (op_def_stmt))
                    return false;

                  if (STMT_VINFO_RELEVANT (vinfo_for_stmt (op_def_stmt))
                        != vect_used_in_outer
                      && STMT_VINFO_RELEVANT (vinfo_for_stmt (op_def_stmt))
                           != vect_used_in_outer_by_reduction)
                    return false;
                }

              continue;
            }

          gcc_assert (stmt_info);

          if (STMT_VINFO_LIVE_P (stmt_info))
            {
              /* FORNOW: not yet supported.  */
              if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
                fprintf (vect_dump, "not vectorized: value used after loop.");
              return false;
            }

          if (STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope
              && STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
            {
              /* A scalar-dependence cycle that we don't support.  */
              if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
                fprintf (vect_dump, "not vectorized: scalar dependence cycle.");
              return false;
            }

          if (STMT_VINFO_RELEVANT_P (stmt_info))
            {
              need_to_vectorize = true;
              if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def)
                ok = vectorizable_induction (phi, NULL, NULL);
            }

          if (!ok)
            {
              if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
                {
                  fprintf (vect_dump,
                           "not vectorized: relevant phi not supported: ");
                  print_gimple_stmt (vect_dump, phi, 0, TDF_SLIM);
                }
              return false;
            }
        }

      for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
        {
          gimple stmt = gsi_stmt (si);
          if (!vect_analyze_stmt (stmt, &need_to_vectorize, NULL))
            return false;
        }
    } /* bbs */

  /* All operations in the loop are either irrelevant (deal with loop
     control, or dead), or only used outside the loop and can be moved
     out of the loop (e.g. invariants, inductions).  The loop can be
     optimized away by scalar optimizations.  We're better off not
     touching this loop.  */
  if (!need_to_vectorize)
    {
      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump,
                 "All the computation can be taken out of the loop.");
      if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
        fprintf (vect_dump,
                 "not vectorized: redundant loop. no profit to vectorize.");
      return false;
    }

  if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
      && vect_print_dump_info (REPORT_DETAILS))
    fprintf (vect_dump,
             "vectorization_factor = %d, niters = " HOST_WIDE_INT_PRINT_DEC,
             vectorization_factor, LOOP_VINFO_INT_NITERS (loop_vinfo));

  if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
      && (LOOP_VINFO_INT_NITERS (loop_vinfo) < vectorization_factor))
    {
      if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
        fprintf (vect_dump, "not vectorized: iteration count too small.");
      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "not vectorized: iteration count smaller than "
                 "vectorization factor.");
      return false;
    }

  /* Analyze cost.  Decide if worth while to vectorize.  */

  /* Once VF is set, SLP costs should be updated since the number of created
     vector stmts depends on VF.  */
  vect_update_slp_costs_according_to_vf (loop_vinfo);

  min_profitable_iters = vect_estimate_min_profitable_iters (loop_vinfo);
  LOOP_VINFO_COST_MODEL_MIN_ITERS (loop_vinfo) = min_profitable_iters;

  if (min_profitable_iters < 0)
    {
      if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
        fprintf (vect_dump, "not vectorized: vectorization not profitable.");
      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "not vectorized: vector version will never be "
                 "profitable.");
      return false;
    }

  min_scalar_loop_bound = ((PARAM_VALUE (PARAM_MIN_VECT_LOOP_BOUND)
                            * vectorization_factor) - 1);

  /* Use the cost model only if it is more conservative than the user
     specified threshold.  */

  th = (unsigned) min_scalar_loop_bound;
  if (min_profitable_iters
      && (!min_scalar_loop_bound
          || min_profitable_iters > min_scalar_loop_bound))
    th = (unsigned) min_profitable_iters;

  if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
      && LOOP_VINFO_INT_NITERS (loop_vinfo) <= th)
    {
      if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
        fprintf (vect_dump, "not vectorized: vectorization not "
                 "profitable.");
      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "not vectorized: iteration count smaller than "
                 "user specified loop bound parameter or minimum "
                 "profitable iterations (whichever is more conservative).");
      return false;
    }

  if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
      || LOOP_VINFO_INT_NITERS (loop_vinfo) % vectorization_factor != 0
      || LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo))
    {
      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "epilog loop required.");
      if (!vect_can_advance_ivs_p (loop_vinfo))
        {
          if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
            fprintf (vect_dump,
                     "not vectorized: can't create epilog loop 1.");
          return false;
        }
      if (!slpeel_can_duplicate_loop_p (loop, single_exit (loop)))
        {
          if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
            fprintf (vect_dump,
                     "not vectorized: can't create epilog loop 2.");
          return false;
        }
    }

  return true;
}
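
/* Worked example of the threshold logic above (numbers illustrative): with
   PARAM_MIN_VECT_LOOP_BOUND set to 0, min_scalar_loop_bound is -1, so the
   cost model wins and th is taken from it; if
   vect_estimate_min_profitable_iters returned 10 and the loop is known to
   run 8 iterations, 8 <= 10 and the loop is rejected as not profitable.
   With a known trip count of 100 and VF = 4, 100 % 4 == 0, so if no peeling
   for alignment was requested, no epilog loop is required.  */
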
/* Function vect_analyze_loop_2.

   Apply a set of analyses on LOOP, and create a loop_vec_info struct
   for it.  The different analyses will record information in the
   loop_vec_info struct.  */
static bool
vect_analyze_loop_2 (loop_vec_info loop_vinfo)
{
  bool ok, dummy, slp = false;
  int max_vf = MAX_VECTORIZATION_FACTOR;
  int min_vf = 2;

  /* Find all data references in the loop (which correspond to vdefs/vuses)
     and analyze their evolution in the loop.  Also adjust the minimal
     vectorization factor according to the loads and stores.

     FORNOW: Handle only simple array references whose alignment can be
     forced, and aligned pointer-references.  */

  ok = vect_analyze_data_refs (loop_vinfo, NULL, &min_vf);
  if (!ok)
    {
      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "bad data references.");
      return false;
    }

  /* Classify all cross-iteration scalar data-flow cycles.
     Cross-iteration cycles caused by virtual phis are analyzed separately.  */

  vect_analyze_scalar_cycles (loop_vinfo);

  vect_pattern_recog (loop_vinfo);

  /* Data-flow analysis to detect stmts that do not need to be vectorized.  */

  ok = vect_mark_stmts_to_be_vectorized (loop_vinfo);
  if (!ok)
    {
      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "unexpected pattern.");
      return false;
    }

  /* Analyze data dependences between the data-refs in the loop
     and adjust the maximum vectorization factor according to
     the dependences.
     FORNOW: fail at the first data dependence that we encounter.  */

  ok = vect_analyze_data_ref_dependences (loop_vinfo, NULL, &max_vf, &dummy);
  if (!ok
      || max_vf < min_vf)
    {
      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "bad data dependence.");
      return false;
    }

  ok = vect_determine_vectorization_factor (loop_vinfo);
  if (!ok)
    {
      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "can't determine vectorization factor.");
      return false;
    }
  if (max_vf < LOOP_VINFO_VECT_FACTOR (loop_vinfo))
    {
      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "bad data dependence.");
      return false;
    }

  /* Analyze the alignment of the data-refs in the loop.
     Fail if a data reference is found that cannot be vectorized.  */

  ok = vect_analyze_data_refs_alignment (loop_vinfo, NULL);
  if (!ok)
    {
      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "bad data alignment.");
      return false;
    }

  /* Analyze the access patterns of the data-refs in the loop (consecutive,
     complex, etc.).  FORNOW: Only handle consecutive access pattern.  */

  ok = vect_analyze_data_ref_accesses (loop_vinfo, NULL);
  if (!ok)
    {
      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "bad data access.");
      return false;
    }

  /* Prune the list of ddrs to be tested at run-time by versioning for alias.
     It is important to call pruning after vect_analyze_data_ref_accesses,
     since we use grouping information gathered by interleaving analysis.  */
  ok = vect_prune_runtime_alias_test_list (loop_vinfo);
  if (!ok)
    {
      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "too long list of versioning for alias "
                            "run-time tests.");
      return false;
    }

  /* This pass will decide on using loop versioning and/or loop peeling in
     order to enhance the alignment of data references in the loop.  */

  ok = vect_enhance_data_refs_alignment (loop_vinfo);
  if (!ok)
    {
      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "bad data alignment.");
      return false;
    }

  /* Check the SLP opportunities in the loop, analyze and build SLP trees.  */
  ok = vect_analyze_slp (loop_vinfo, NULL);
  if (ok)
    {
      /* Decide which possible SLP instances to SLP.  */
      slp = vect_make_slp_decision (loop_vinfo);

      /* Find stmts that need to be both vectorized and SLPed.  */
      vect_detect_hybrid_slp (loop_vinfo);
    }
  else
    return false;

  /* Scan all the operations in the loop and make sure they are
     vectorizable.  */

  ok = vect_analyze_loop_operations (loop_vinfo, slp);
  if (!ok)
    {
      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "bad operation or unsupported loop bound.");
      return false;
    }

  return true;
}
/* Function vect_analyze_loop.

   Apply a set of analyses on LOOP, and create a loop_vec_info struct
   for it.  The different analyses will record information in the
   loop_vec_info struct.  */
loop_vec_info
vect_analyze_loop (struct loop *loop)
{
  loop_vec_info loop_vinfo;
  unsigned int vector_sizes;

  /* Autodetect first vector size we try.  */
  current_vector_size = 0;
  vector_sizes = targetm.vectorize.autovectorize_vector_sizes ();

  if (vect_print_dump_info (REPORT_DETAILS))
    fprintf (vect_dump, "===== analyze_loop_nest =====");

  if (loop_outer (loop)
      && loop_vec_info_for_loop (loop_outer (loop))
      && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop))))
    {
      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "outer-loop already vectorized.");
      return NULL;
    }

  while (1)
    {
      /* Check the CFG characteristics of the loop (nesting, entry/exit).  */
      loop_vinfo = vect_analyze_loop_form (loop);
      if (!loop_vinfo)
        {
          if (vect_print_dump_info (REPORT_DETAILS))
            fprintf (vect_dump, "bad loop form.");
          return NULL;
        }

      if (vect_analyze_loop_2 (loop_vinfo))
        {
          LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1;

          return loop_vinfo;
        }

      destroy_loop_vec_info (loop_vinfo, true);

      vector_sizes &= ~current_vector_size;
      if (vector_sizes == 0
          || current_vector_size == 0)
        return NULL;

      /* Try the next biggest vector size.  */
      current_vector_size = 1 << floor_log2 (vector_sizes);
      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "***** Re-trying analysis with "
                 "vector size %d\n", current_vector_size);
    }
}
/* Function reduction_code_for_scalar_code

   Input:
   CODE - tree_code of a reduction operation.

   Output:
   REDUC_CODE - the corresponding tree-code to be used to reduce the
      vector of partial results into a single scalar result (which
      will also reside in a vector), or ERROR_MARK if the operation is
      a supported reduction operation, but does not have such a tree-code.

   Return FALSE if CODE currently cannot be vectorized as reduction.  */

static bool
reduction_code_for_scalar_code (enum tree_code code,
                                enum tree_code *reduc_code)
{
  switch (code)
    {
      case MAX_EXPR:
        *reduc_code = REDUC_MAX_EXPR;
        return true;

      case MIN_EXPR:
        *reduc_code = REDUC_MIN_EXPR;
        return true;

      case PLUS_EXPR:
        *reduc_code = REDUC_PLUS_EXPR;
        return true;

      case MULT_EXPR:
      case MINUS_EXPR:
      case BIT_IOR_EXPR:
      case BIT_XOR_EXPR:
      case BIT_AND_EXPR:
        *reduc_code = ERROR_MARK;
        return true;

      default:
        return false;
    }
}
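
/* For instance (illustrative): a sum reduction vectorized with V4SI partial
   sums {s0, s1, s2, s3} uses REDUC_PLUS_EXPR in the loop epilogue to fold
   the vector into the single scalar s0+s1+s2+s3.  For MULT_EXPR and the
   bitwise codes, ERROR_MARK tells the caller that no single reduction
   tree-code exists and the partial results must be combined by other means
   (e.g. a sequence of element extractions and scalar operations).  */
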
/* Error reporting helper for vect_is_simple_reduction below.  GIMPLE statement
   STMT is printed with a message MSG.  */

static void
report_vect_op (gimple stmt, const char *msg)
{
  fprintf (vect_dump, "%s", msg);
  print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
}
/* Detect SLP reduction of the form:

   #a1 = phi <a5, a0>
   a2 = operation (a1)
   a3 = operation (a2)
   a4 = operation (a3)
   a5 = operation (a4)

   #a = phi <a5>

   PHI is the reduction phi node (#a1 = phi <a5, a0> above)
   FIRST_STMT is the first reduction stmt in the chain
   (a2 = operation (a1)).

   Return TRUE if a reduction chain was detected.  */

static bool
vect_is_slp_reduction (loop_vec_info loop_info, gimple phi, gimple first_stmt)
{
  struct loop *loop = (gimple_bb (phi))->loop_father;
  struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
  enum tree_code code;
  gimple current_stmt = NULL, use_stmt = NULL, first;
  stmt_vec_info use_stmt_info, current_stmt_info;
  tree lhs;
  imm_use_iterator imm_iter;
  use_operand_p use_p;
  int nloop_uses, size = 0, nuses;
  bool found = false;

  if (loop != vect_loop)
    return false;

  lhs = PHI_RESULT (phi);
  code = gimple_assign_rhs_code (first_stmt);
  while (1)
    {
      nloop_uses = 0;
      nuses = 0;
      FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
        {
          use_stmt = USE_STMT (use_p);
          nuses++;
          if (is_gimple_debug (use_stmt))
            continue;

          /* Check if we got back to the reduction phi.  */
          if (gimple_code (use_stmt) == GIMPLE_PHI
              && use_stmt == phi)
            {
              found = true;
              break;
            }

          if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt))
              && vinfo_for_stmt (use_stmt)
              && !is_pattern_stmt_p (vinfo_for_stmt (use_stmt))
              && use_stmt != first_stmt)
            nloop_uses++;

          if (nloop_uses > 1)
            return false;
        }

      /* We reached a statement with no uses.  */
      if (nuses == 0)
        return false;

      if (found)
        break;

      /* This is a loop exit phi, and we haven't reached the reduction phi.  */
      if (gimple_code (use_stmt) == GIMPLE_PHI)
        return false;

      if (!is_gimple_assign (use_stmt)
          || code != gimple_assign_rhs_code (use_stmt)
          || !flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
        return false;

      /* Insert USE_STMT into reduction chain.  */
      use_stmt_info = vinfo_for_stmt (use_stmt);
      if (current_stmt)
        {
          current_stmt_info = vinfo_for_stmt (current_stmt);
          GROUP_NEXT_ELEMENT (current_stmt_info) = use_stmt;
          GROUP_FIRST_ELEMENT (use_stmt_info)
            = GROUP_FIRST_ELEMENT (current_stmt_info);
        }
      else
        GROUP_FIRST_ELEMENT (use_stmt_info) = use_stmt;

      lhs = gimple_assign_lhs (use_stmt);
      current_stmt = use_stmt;
      size++;
    }

  if (!found || use_stmt != phi || size < 2)
    return false;

  /* Save the chain for further analysis in SLP detection.  */
  first = GROUP_FIRST_ELEMENT (vinfo_for_stmt (current_stmt));
  VEC_safe_push (gimple, heap, LOOP_VINFO_REDUCTION_CHAINS (loop_info), first);
  GROUP_SIZE (vinfo_for_stmt (first)) = size;

  /* Swap the operands, if needed, to make the reduction operand be the second
     operand.  */
  lhs = PHI_RESULT (phi);
  current_stmt = first;
  while (current_stmt)
    {
      if (get_gimple_rhs_class (code) == GIMPLE_BINARY_RHS
          && gimple_assign_rhs2 (current_stmt) != lhs)
        {
          if (vect_print_dump_info (REPORT_DETAILS))
            {
              fprintf (vect_dump, "swapping oprnds: ");
              print_gimple_stmt (vect_dump, current_stmt, 0, TDF_SLIM);
            }

          swap_tree_operands (current_stmt,
                              gimple_assign_rhs1_ptr (current_stmt),
                              gimple_assign_rhs2_ptr (current_stmt));
          mark_symbols_for_renaming (current_stmt);
        }

      lhs = gimple_assign_lhs (current_stmt);
      current_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (current_stmt));
    }

  return true;
}
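
/* Source-level shape that produces such a chain (illustrative): a manually
   unrolled accumulation like

     for (i = 0; i < N; i += 4)
       {
         sum = sum + a[i];
         sum = sum + a[i + 1];
         sum = sum + a[i + 2];
         sum = sum + a[i + 3];
       }

   gives four PLUS_EXPR stmts linked through sum, each with a single in-loop
   use, ending back at the reduction phi; the chain is recorded in
   LOOP_VINFO_REDUCTION_CHAINS with GROUP_SIZE 4 for loop-aware SLP.  */
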
/* Function vect_is_simple_reduction_1

   (1) Detect a cross-iteration def-use cycle that represents a simple
   reduction computation.  We look for the following pattern:

   loop_header:
     a1 = phi < a0, a2 >
     a3 = ...
     a2 = operation (a3, a1)

   such that:
   1. operation is commutative and associative and it is safe to
      change the order of the computation (if CHECK_REDUCTION is true)
   2. no uses for a2 in the loop (a2 is used out of the loop)
   3. no uses of a1 in the loop besides the reduction operation
   4. no uses of a1 outside the loop.

   Conditions 1,4 are tested here.
   Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.

   (2) Detect a cross-iteration def-use cycle in nested loops, i.e.,
   nested cycles, if CHECK_REDUCTION is false.

   (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double
   reductions:

     a1 = phi < a0, a2 >
     inner loop (def of a3)
     a2 = phi < a3 >

   If MODIFY is true it tries also to rework the code in-place to enable
   detection of more reduction patterns.  For the time being we rewrite
   "res -= RHS" into "res += -RHS" when it seems worthwhile.  */

static gimple
vect_is_simple_reduction_1 (loop_vec_info loop_info, gimple phi,
                            bool check_reduction, bool *double_reduc,
                            bool modify)
{
  struct loop *loop = (gimple_bb (phi))->loop_father;
  struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
  edge latch_e = loop_latch_edge (loop);
  tree loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
  gimple def_stmt, def1 = NULL, def2 = NULL;
  enum tree_code orig_code, code;
  tree op1, op2, op3 = NULL_TREE, op4 = NULL_TREE;
  tree type;
  int nloop_uses;
  tree name;
  imm_use_iterator imm_iter;
  use_operand_p use_p;
  bool phi_def;

  *double_reduc = false;

  /* If CHECK_REDUCTION is true, we assume inner-most loop vectorization,
     otherwise, we assume outer loop vectorization.  */
  gcc_assert ((check_reduction && loop == vect_loop)
              || (!check_reduction && flow_loop_nested_p (vect_loop, loop)));

  name = PHI_RESULT (phi);
  nloop_uses = 0;
  FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name)
    {
      gimple use_stmt = USE_STMT (use_p);
      if (is_gimple_debug (use_stmt))
        continue;

      if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
        {
          if (vect_print_dump_info (REPORT_DETAILS))
            fprintf (vect_dump, "intermediate value used outside loop.");

          return NULL;
        }

      if (vinfo_for_stmt (use_stmt)
          && !is_pattern_stmt_p (vinfo_for_stmt (use_stmt)))
        nloop_uses++;
      if (nloop_uses > 1)
        {
          if (vect_print_dump_info (REPORT_DETAILS))
            fprintf (vect_dump, "reduction used in loop.");
          return NULL;
        }
    }

  if (TREE_CODE (loop_arg) != SSA_NAME)
    {
      if (vect_print_dump_info (REPORT_DETAILS))
        {
          fprintf (vect_dump, "reduction: not ssa_name: ");
          print_generic_expr (vect_dump, loop_arg, TDF_SLIM);
        }
      return NULL;
    }

  def_stmt = SSA_NAME_DEF_STMT (loop_arg);
  if (!def_stmt)
    {
      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "reduction: no def_stmt.");
      return NULL;
    }

  if (!is_gimple_assign (def_stmt) && gimple_code (def_stmt) != GIMPLE_PHI)
    {
      if (vect_print_dump_info (REPORT_DETAILS))
        print_gimple_stmt (vect_dump, def_stmt, 0, TDF_SLIM);
      return NULL;
    }

  if (is_gimple_assign (def_stmt))
    {
      name = gimple_assign_lhs (def_stmt);
      phi_def = false;
    }
  else
    {
      name = PHI_RESULT (def_stmt);
      phi_def = true;
    }

  nloop_uses = 0;
  FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name)
    {
      gimple use_stmt = USE_STMT (use_p);
      if (is_gimple_debug (use_stmt))
        continue;
      if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt))
          && vinfo_for_stmt (use_stmt)
          && !is_pattern_stmt_p (vinfo_for_stmt (use_stmt)))
        nloop_uses++;
      if (nloop_uses > 1)
        {
          if (vect_print_dump_info (REPORT_DETAILS))
            fprintf (vect_dump, "reduction used in loop.");
          return NULL;
        }
    }

  /* If DEF_STMT is a phi node itself, we expect it to have a single argument
     defined in the inner loop.  */
  if (phi_def)
    {
      op1 = PHI_ARG_DEF (def_stmt, 0);

      if (gimple_phi_num_args (def_stmt) != 1
          || TREE_CODE (op1) != SSA_NAME)
        {
          if (vect_print_dump_info (REPORT_DETAILS))
            fprintf (vect_dump, "unsupported phi node definition.");

          return NULL;
        }

      def1 = SSA_NAME_DEF_STMT (op1);
      if (flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
          && loop->inner
          && flow_bb_inside_loop_p (loop->inner, gimple_bb (def1))
          && is_gimple_assign (def1))
        {
          if (vect_print_dump_info (REPORT_DETAILS))
            report_vect_op (def_stmt, "detected double reduction: ");

          *double_reduc = true;
          return def_stmt;
        }

      return NULL;
    }

  code = orig_code = gimple_assign_rhs_code (def_stmt);

  /* We can handle "res -= x[i]", which is non-associative, by
     simply rewriting this into "res += -x[i]".  Avoid changing the
     gimple instruction for the first simple tests and only do this
     if we're allowed to change code at all.  */
  if (code == MINUS_EXPR
      && modify
      && (op1 = gimple_assign_rhs1 (def_stmt))
      && TREE_CODE (op1) == SSA_NAME
      && SSA_NAME_DEF_STMT (op1) == phi)
    code = PLUS_EXPR;

  if (check_reduction
      && (!commutative_tree_code (code) || !associative_tree_code (code)))
    {
      if (vect_print_dump_info (REPORT_DETAILS))
        report_vect_op (def_stmt, "reduction: not commutative/associative: ");
      return NULL;
    }

  if (get_gimple_rhs_class (code) != GIMPLE_BINARY_RHS)
    {
      if (code != COND_EXPR)
        {
          if (vect_print_dump_info (REPORT_DETAILS))
            report_vect_op (def_stmt, "reduction: not binary operation: ");
2016 return NULL;
2019 op3 = TREE_OPERAND (gimple_assign_rhs1 (def_stmt), 0);
2020 if (COMPARISON_CLASS_P (op3))
2022 op4 = TREE_OPERAND (op3, 1);
2023 op3 = TREE_OPERAND (op3, 0);
2026 op1 = TREE_OPERAND (gimple_assign_rhs1 (def_stmt), 1);
2027 op2 = TREE_OPERAND (gimple_assign_rhs1 (def_stmt), 2);
2029 if (TREE_CODE (op1) != SSA_NAME && TREE_CODE (op2) != SSA_NAME)
2031 if (vect_print_dump_info (REPORT_DETAILS))
2032 report_vect_op (def_stmt, "reduction: uses not ssa_names: ");
2034 return NULL;
2037 else
2039 op1 = gimple_assign_rhs1 (def_stmt);
2040 op2 = gimple_assign_rhs2 (def_stmt);
2042 if (TREE_CODE (op1) != SSA_NAME || TREE_CODE (op2) != SSA_NAME)
2044 if (vect_print_dump_info (REPORT_DETAILS))
2045 report_vect_op (def_stmt, "reduction: uses not ssa_names: ");
2047 return NULL;
2051 type = TREE_TYPE (gimple_assign_lhs (def_stmt));
2052 if ((TREE_CODE (op1) == SSA_NAME
2053 && !types_compatible_p (type, TREE_TYPE (op1)))
2054 || (TREE_CODE (op2) == SSA_NAME
2055 && !types_compatible_p (type, TREE_TYPE (op2)))
2056 || (op3 && TREE_CODE (op3) == SSA_NAME
2057 && !types_compatible_p (type, TREE_TYPE (op3)))
2058 || (op4 && TREE_CODE (op4) == SSA_NAME
2059 && !types_compatible_p (type, TREE_TYPE (op4))))
2061 if (vect_print_dump_info (REPORT_DETAILS))
2063 fprintf (vect_dump, "reduction: multiple types: operation type: ");
2064 print_generic_expr (vect_dump, type, TDF_SLIM);
2065 fprintf (vect_dump, ", operands types: ");
2066 print_generic_expr (vect_dump, TREE_TYPE (op1), TDF_SLIM);
2067 fprintf (vect_dump, ",");
2068 print_generic_expr (vect_dump, TREE_TYPE (op2), TDF_SLIM);
2069 if (op3)
2071 fprintf (vect_dump, ",");
2072 print_generic_expr (vect_dump, TREE_TYPE (op3), TDF_SLIM);
2075 if (op4)
2077 fprintf (vect_dump, ",");
2078 print_generic_expr (vect_dump, TREE_TYPE (op4), TDF_SLIM);
2082 return NULL;
2085 /* Check that it's ok to change the order of the computation.
2086 Generally, when vectorizing a reduction we change the order of the
2087 computation. This may change the behavior of the program in some
2088 cases, so we need to check that this is ok. One exception is when
2089 vectorizing an outer-loop: the inner-loop is executed sequentially,
2090 and therefore vectorizing reductions in the inner-loop during
2091 outer-loop vectorization is safe. */
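
/* For example (an illustrative sketch), with IEEE doubles:

     (0x1p53 + 1.0) - 0x1p53 == 0.0
     0x1p53 + (1.0 - 0x1p53) == 1.0

   so reassociating a floating-point reduction can change its result,
   which is why it is allowed only under -fassociative-math (e.g. as
   part of -ffast-math).  */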
2093 /* CHECKME: check for !flag_finite_math_only too? */
2094 if (SCALAR_FLOAT_TYPE_P (type) && !flag_associative_math
2095 && check_reduction)
2097 /* Changing the order of operations changes the semantics. */
2098 if (vect_print_dump_info (REPORT_DETAILS))
2099 report_vect_op (def_stmt, "reduction: unsafe fp math optimization: ");
2100 return NULL;
2102 else if (INTEGRAL_TYPE_P (type) && TYPE_OVERFLOW_TRAPS (type)
2103 && check_reduction)
2105 /* Changing the order of operations changes the semantics. */
2106 if (vect_print_dump_info (REPORT_DETAILS))
2107 report_vect_op (def_stmt, "reduction: unsafe int math optimization: ");
2108 return NULL;
2110 else if (SAT_FIXED_POINT_TYPE_P (type) && check_reduction)
2112 /* Changing the order of operations changes the semantics. */
2113 if (vect_print_dump_info (REPORT_DETAILS))
2114 report_vect_op (def_stmt,
2115 "reduction: unsafe fixed-point math optimization: ");
2116 return NULL;
2119 /* If we detected "res -= x[i]" earlier, rewrite it into
2120 "res += -x[i]" now. If this turns out to be useless reassoc
2121 will clean it up again. */
2122 if (orig_code == MINUS_EXPR)
2124 tree rhs = gimple_assign_rhs2 (def_stmt);
2125 tree negrhs = make_ssa_name (SSA_NAME_VAR (rhs), NULL);
2126 gimple negate_stmt = gimple_build_assign_with_ops (NEGATE_EXPR, negrhs,
2127 rhs, NULL);
2128 gimple_stmt_iterator gsi = gsi_for_stmt (def_stmt);
2129 set_vinfo_for_stmt (negate_stmt, new_stmt_vec_info (negate_stmt,
2130 loop_info, NULL));
2131 gsi_insert_before (&gsi, negate_stmt, GSI_NEW_STMT);
2132 gimple_assign_set_rhs2 (def_stmt, negrhs);
2133 gimple_assign_set_rhs_code (def_stmt, PLUS_EXPR);
2134 update_stmt (def_stmt);
2137 /* Reduction is safe. We're dealing with one of the following:
2138 1) integer arithmetic and no trapv
2139 2) floating point arithmetic, and special flags permit this optimization
2140 3) nested cycle (i.e., outer loop vectorization). */
2141 if (TREE_CODE (op1) == SSA_NAME)
2142 def1 = SSA_NAME_DEF_STMT (op1);
2144 if (TREE_CODE (op2) == SSA_NAME)
2145 def2 = SSA_NAME_DEF_STMT (op2);
2147 if (code != COND_EXPR
2148 && (!def1 || !def2 || gimple_nop_p (def1) || gimple_nop_p (def2)))
2150 if (vect_print_dump_info (REPORT_DETAILS))
2151 report_vect_op (def_stmt, "reduction: no defs for operands: ");
2152 return NULL;
2155 /* Check that one def is the reduction def, defined by PHI,
2156 and that the other def is either defined in the loop ("vect_internal_def"),
2157 or is an induction (defined by a loop-header phi-node). */
2159 if (def2 && def2 == phi
2160 && (code == COND_EXPR
2161 || (def1 && flow_bb_inside_loop_p (loop, gimple_bb (def1))
2162 && (is_gimple_assign (def1)
2163 || is_gimple_call (def1)
2164 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1))
2165 == vect_induction_def
2166 || (gimple_code (def1) == GIMPLE_PHI
2167 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1))
2168 == vect_internal_def
2169 && !is_loop_header_bb_p (gimple_bb (def1)))))))
2171 if (vect_print_dump_info (REPORT_DETAILS))
2172 report_vect_op (def_stmt, "detected reduction: ");
2173 return def_stmt;
2176 if (def1 && def1 == phi
2177 && (code == COND_EXPR
2178 || (def2 && flow_bb_inside_loop_p (loop, gimple_bb (def2))
2179 && (is_gimple_assign (def2)
2180 || is_gimple_call (def2)
2181 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2))
2182 == vect_induction_def
2183 || (gimple_code (def2) == GIMPLE_PHI
2184 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2))
2185 == vect_internal_def
2186 && !is_loop_header_bb_p (gimple_bb (def2)))))))
2188 if (check_reduction)
2190 /* Swap operands (just for simplicity - so that the rest of the code
2191 can assume that the reduction variable is always the last (second)
2192 argument). */
2193 if (vect_print_dump_info (REPORT_DETAILS))
2194 report_vect_op (def_stmt,
2195 "detected reduction: need to swap operands: ");
2197 swap_tree_operands (def_stmt, gimple_assign_rhs1_ptr (def_stmt),
2198 gimple_assign_rhs2_ptr (def_stmt));
2200 else
2202 if (vect_print_dump_info (REPORT_DETAILS))
2203 report_vect_op (def_stmt, "detected reduction: ");
2206 return def_stmt;
2209 /* Try to find an SLP reduction chain. */
2210 if (vect_is_slp_reduction (loop_info, phi, def_stmt))
2212 if (vect_print_dump_info (REPORT_DETAILS))
2213 report_vect_op (def_stmt, "reduction: detected reduction chain: ");
2215 return def_stmt;
2218 if (vect_print_dump_info (REPORT_DETAILS))
2219 report_vect_op (def_stmt, "reduction: unknown pattern: ");
2221 return NULL;
2224 /* Wrapper around vect_is_simple_reduction_1, that won't modify code
2225 in-place. Arguments as there. */
2227 static gimple
2228 vect_is_simple_reduction (loop_vec_info loop_info, gimple phi,
2229 bool check_reduction, bool *double_reduc)
2231 return vect_is_simple_reduction_1 (loop_info, phi, check_reduction,
2232 double_reduc, false);
2235 /* Wrapper around vect_is_simple_reduction_1, which will modify code
2236 in-place if it enables detection of more reductions. Arguments
2237 as there. */
2239 gimple
2240 vect_force_simple_reduction (loop_vec_info loop_info, gimple phi,
2241 bool check_reduction, bool *double_reduc)
2243 return vect_is_simple_reduction_1 (loop_info, phi, check_reduction,
2244 double_reduc, true);
2247 /* Calculate the cost of one scalar iteration of the loop. */
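
/* For instance (an illustrative sketch, with made-up unit costs), for

     for (i = 0; i < n; i++)
       a[i] = b[i] + 1;

   one iteration counts one scalar_load (for b[i]), one scalar_stmt
   (the add) and one scalar_store (for a[i]), so with all three costs
   equal to 1 the function returns 3.  */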
2248 int
2249 vect_get_single_scalar_iteraion_cost (loop_vec_info loop_vinfo)
2251 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2252 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
2253 int nbbs = loop->num_nodes, factor, scalar_single_iter_cost = 0;
2254 int innerloop_iters, i, stmt_cost;
2256 /* Count statements in scalar loop. Using this as scalar cost for a single
2257 iteration for now.
2259 TODO: Add outer loop support.
2261 TODO: Consider assigning different costs to different scalar
2262 statements. */
2264 /* FORNOW. */
2265 innerloop_iters = 1;
2266 if (loop->inner)
2267 innerloop_iters = 50; /* FIXME */
2269 for (i = 0; i < nbbs; i++)
2271 gimple_stmt_iterator si;
2272 basic_block bb = bbs[i];
2274 if (bb->loop_father == loop->inner)
2275 factor = innerloop_iters;
2276 else
2277 factor = 1;
2279 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
2281 gimple stmt = gsi_stmt (si);
2282 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2284 if (!is_gimple_assign (stmt) && !is_gimple_call (stmt))
2285 continue;
2287 /* Skip stmts that are not vectorized inside the loop. */
2288 if (stmt_info
2289 && !STMT_VINFO_RELEVANT_P (stmt_info)
2290 && (!STMT_VINFO_LIVE_P (stmt_info)
2291 || STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def))
2292 continue;
2294 if (STMT_VINFO_DATA_REF (stmt_info))
2296 if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
2297 stmt_cost = vect_get_cost (scalar_load);
2298 else
2299 stmt_cost = vect_get_cost (scalar_store);
2301 else
2302 stmt_cost = vect_get_cost (scalar_stmt);
2304 scalar_single_iter_cost += stmt_cost * factor;
2307 return scalar_single_iter_cost;
2310 /* Calculate cost of peeling the loop PEEL_ITERS_PROLOGUE times. */
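
/* A worked example (illustrative numbers): with vf = 4, a known
   iteration count of 10 and peel_iters_prologue = 2, the epilogue gets
   (10 - 2) % 4 = 0 iterations, or vf = 4 of them if peeling for gaps
   is required; the returned cost is then (prologue + epilogue) times
   SCALAR_SINGLE_ITER_COST, guard branch costs being added only when
   the iteration count is unknown.  */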
2311 int
2312 vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue,
2313 int *peel_iters_epilogue,
2314 int scalar_single_iter_cost)
2316 int peel_guard_costs = 0;
2317 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2319 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
2321 *peel_iters_epilogue = vf/2;
2322 if (vect_print_dump_info (REPORT_COST))
2323 fprintf (vect_dump, "cost model: "
2324 "epilogue peel iters set to vf/2 because "
2325 "loop iterations are unknown .");
2327 /* If peeled iterations are known but the number of scalar loop
2328 iterations is unknown, count a taken branch per peeled loop. */
2329 peel_guard_costs = 2 * vect_get_cost (cond_branch_taken);
2331 else
2333 int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
2334 peel_iters_prologue = niters < peel_iters_prologue ?
2335 niters : peel_iters_prologue;
2336 *peel_iters_epilogue = (niters - peel_iters_prologue) % vf;
2337 /* If we need to peel for gaps, but no peeling is required, we have to
2338 peel VF iterations. */
2339 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) && !*peel_iters_epilogue)
2340 *peel_iters_epilogue = vf;
2343 return (peel_iters_prologue * scalar_single_iter_cost)
2344 + (*peel_iters_epilogue * scalar_single_iter_cost)
2345 + peel_guard_costs;
2348 /* Function vect_estimate_min_profitable_iters
2350 Return the number of iterations required for the vector version of the
2351 loop to be profitable relative to the cost of the scalar version of the
2352 loop.
2354 TODO: Take profile info into account before making vectorization
2355 decisions, if available. */
2357 int
2358 vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo)
2360 int i;
2361 int min_profitable_iters;
2362 int peel_iters_prologue;
2363 int peel_iters_epilogue;
2364 int vec_inside_cost = 0;
2365 int vec_outside_cost = 0;
2366 int scalar_single_iter_cost = 0;
2367 int scalar_outside_cost = 0;
2368 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2369 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2370 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
2371 int nbbs = loop->num_nodes;
2372 int npeel = LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo);
2373 int peel_guard_costs = 0;
2374 int innerloop_iters = 0, factor;
2375 VEC (slp_instance, heap) *slp_instances;
2376 slp_instance instance;
2378 /* Cost model disabled. */
2379 if (!flag_vect_cost_model)
2381 if (vect_print_dump_info (REPORT_COST))
2382 fprintf (vect_dump, "cost model disabled.");
2383 return 0;
2386 /* Requires loop versioning tests to handle misalignment. */
2387 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
2389 /* FIXME: Make cost depend on complexity of individual check. */
2390 vec_outside_cost +=
2391 VEC_length (gimple, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo));
2392 if (vect_print_dump_info (REPORT_COST))
2393 fprintf (vect_dump, "cost model: Adding cost of checks for loop "
2394 "versioning to treat misalignment.\n");
2397 /* Requires loop versioning with alias checks. */
2398 if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
2400 /* FIXME: Make cost depend on complexity of individual check. */
2401 vec_outside_cost +=
2402 VEC_length (ddr_p, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo));
2403 if (vect_print_dump_info (REPORT_COST))
2404 fprintf (vect_dump, "cost model: Adding cost of checks for loop "
2405 "versioning aliasing.\n");
2408 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
2409 || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
2410 vec_outside_cost += vect_get_cost (cond_branch_taken);
2412 /* Sum the inside-of-loop costs of the stmts that will be vectorized,
2413 as recorded in STMT_VINFO_INSIDE_OF_LOOP_COST by the analysis phase.
2415 The scalar single-iteration cost is computed separately below via
2417 vect_get_single_scalar_iteraion_cost.
2418 TODO: Add outer loop support. */
2420 /* FORNOW. */
2421 if (loop->inner)
2422 innerloop_iters = 50; /* FIXME */
2424 for (i = 0; i < nbbs; i++)
2426 gimple_stmt_iterator si;
2427 basic_block bb = bbs[i];
2429 if (bb->loop_father == loop->inner)
2430 factor = innerloop_iters;
2431 else
2432 factor = 1;
2434 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
2436 gimple stmt = gsi_stmt (si);
2437 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2438 /* Skip stmts that are not vectorized inside the loop. */
2439 if (!STMT_VINFO_RELEVANT_P (stmt_info)
2440 && (!STMT_VINFO_LIVE_P (stmt_info)
2441 || STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def))
2442 continue;
2443 vec_inside_cost += STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info) * factor;
2444 /* FIXME: for stmts in the inner-loop in outer-loop vectorization,
2445 some of the "outside" costs are generated inside the outer-loop. */
2446 vec_outside_cost += STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info);
2450 scalar_single_iter_cost = vect_get_single_scalar_iteraion_cost (loop_vinfo);
2452 /* Add additional cost for the peeled instructions in prologue and epilogue
2453 loop.
2455 FORNOW: If we don't know the value of peel_iters for prologue or epilogue
2456 at compile-time - we assume it's vf/2 (the worst would be vf-1).
2458 TODO: Build an expression that represents peel_iters for prologue and
2459 epilogue to be used in a run-time test. */
2461 if (npeel < 0)
2463 peel_iters_prologue = vf/2;
2464 if (vect_print_dump_info (REPORT_COST))
2465 fprintf (vect_dump, "cost model: "
2466 "prologue peel iters set to vf/2.");
2468 /* If peeling for alignment is unknown, the loop bound of the main loop
2469 becomes unknown. */
2470 peel_iters_epilogue = vf/2;
2471 if (vect_print_dump_info (REPORT_COST))
2472 fprintf (vect_dump, "cost model: "
2473 "epilogue peel iters set to vf/2 because "
2474 "peeling for alignment is unknown .");
2476 /* If peeled iterations are unknown, count a taken branch and a not taken
2477 branch per peeled loop. Even if scalar loop iterations are known,
2478 vector iterations are not known since peeled prologue iterations are
2479 not known. Hence guards remain the same. */
2480 peel_guard_costs += 2 * (vect_get_cost (cond_branch_taken)
2481 + vect_get_cost (cond_branch_not_taken));
2482 vec_outside_cost += (peel_iters_prologue * scalar_single_iter_cost)
2483 + (peel_iters_epilogue * scalar_single_iter_cost)
2484 + peel_guard_costs;
2486 else
2488 peel_iters_prologue = npeel;
2489 vec_outside_cost += vect_get_known_peeling_cost (loop_vinfo,
2490 peel_iters_prologue, &peel_iters_epilogue,
2491 scalar_single_iter_cost);
2494 /* FORNOW: The scalar outside cost is incremented in one of the
2495 following ways:
2497 1. The vectorizer checks for alignment and aliasing and generates
2498 a condition that allows dynamic vectorization. A cost model
2499 check is ANDED with the versioning condition. Hence scalar code
2500 path now has the added cost of the versioning check.
2502 if (cost > th & versioning_check)
2503 jmp to vector code
2505 Hence the run-time scalar cost is incremented by a not-taken branch cost.
2507 2. The vectorizer then checks if a prologue is required. If the
2508 cost model check was not done before during versioning, it has to
2509 be done before the prologue check.
2511 if (cost <= th)
2512 prologue = scalar_iters
2513 if (prologue == 0)
2514 jmp to vector code
2515 else
2516 execute prologue
2517 if (prologue == num_iters)
2518 go to exit
2520 Hence the run-time scalar cost is incremented by a taken branch,
2521 plus a not-taken branch, plus a taken branch cost.
2523 3. The vectorizer then checks if an epilogue is required. If the
2524 cost model check was not done before during prologue check, it
2525 has to be done with the epilogue check.
2527 if (prologue == 0)
2528 jmp to vector code
2529 else
2530 execute prologue
2531 if (prologue == num_iters)
2532 go to exit
2533 vector code:
2534 if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
2535 jmp to epilogue
2537 Hence the run-time scalar cost should be incremented by 2 taken
2538 branches.
2540 TODO: The back end may reorder the BBs differently and reverse
2541 conditions/branch directions. Change the estimates below to
2542 something more reasonable. */
2544 /* If the number of iterations is known and we do not do versioning, we can
2545 decide whether to vectorize at compile time. Hence the scalar version
2546 does not carry cost model guard costs. */
2547 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2548 || LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
2549 || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
2551 /* Cost model check occurs at versioning. */
2552 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
2553 || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
2554 scalar_outside_cost += vect_get_cost (cond_branch_not_taken);
2555 else
2557 /* Cost model check occurs at prologue generation. */
2558 if (LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
2559 scalar_outside_cost += 2 * vect_get_cost (cond_branch_taken)
2560 + vect_get_cost (cond_branch_not_taken);
2561 /* Cost model check occurs at epilogue generation. */
2562 else
2563 scalar_outside_cost += 2 * vect_get_cost (cond_branch_taken);
2567 /* Add SLP costs. */
2568 slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
2569 FOR_EACH_VEC_ELT (slp_instance, slp_instances, i, instance)
2571 vec_outside_cost += SLP_INSTANCE_OUTSIDE_OF_LOOP_COST (instance);
2572 vec_inside_cost += SLP_INSTANCE_INSIDE_OF_LOOP_COST (instance);
2575 /* Calculate number of iterations required to make the vector version
2576 profitable, relative to the loop bodies only. The following condition
2577 must hold true:
2578 SIC * niters + SOC > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC
2579 where
2580 SIC = scalar iteration cost, VIC = vector iteration cost,
2581 VOC = vector outside cost, VF = vectorization factor,
2582 PL_ITERS = prologue iterations, EP_ITERS = epilogue iterations,
2583 SOC = scalar outside cost for run time cost model check. */
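
/* A worked example with made-up costs: for SIC = 4, VIC = 6, VF = 4,
   VOC = 20, SOC = 0 and PL_ITERS = EP_ITERS = 0, the division below
   gives

     min_profitable_iters = (20 * 4) / (4 * 4 - 6) = 8,

   and since 4 * 4 * 8 <= 6 * 8 + 20 * 4 holds, the result is bumped
   to 9 by the correction that follows the division.  */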
2585 if ((scalar_single_iter_cost * vf) > vec_inside_cost)
2587 if (vec_outside_cost <= 0)
2588 min_profitable_iters = 1;
2589 else
2591 min_profitable_iters = ((vec_outside_cost - scalar_outside_cost) * vf
2592 - vec_inside_cost * peel_iters_prologue
2593 - vec_inside_cost * peel_iters_epilogue)
2594 / ((scalar_single_iter_cost * vf)
2595 - vec_inside_cost);
2597 if ((scalar_single_iter_cost * vf * min_profitable_iters)
2598 <= ((vec_inside_cost * min_profitable_iters)
2599 + ((vec_outside_cost - scalar_outside_cost) * vf)))
2600 min_profitable_iters++;
2603 /* vector version will never be profitable. */
2604 else
2606 if (vect_print_dump_info (REPORT_COST))
2607 fprintf (vect_dump, "cost model: the vector iteration cost = %d "
2608 "divided by the scalar iteration cost = %d "
2609 "is greater or equal to the vectorization factor = %d.",
2610 vec_inside_cost, scalar_single_iter_cost, vf);
2611 return -1;
2614 if (vect_print_dump_info (REPORT_COST))
2616 fprintf (vect_dump, "Cost model analysis: \n");
2617 fprintf (vect_dump, " Vector inside of loop cost: %d\n",
2618 vec_inside_cost);
2619 fprintf (vect_dump, " Vector outside of loop cost: %d\n",
2620 vec_outside_cost);
2621 fprintf (vect_dump, " Scalar iteration cost: %d\n",
2622 scalar_single_iter_cost);
2623 fprintf (vect_dump, " Scalar outside cost: %d\n", scalar_outside_cost);
2624 fprintf (vect_dump, " prologue iterations: %d\n",
2625 peel_iters_prologue);
2626 fprintf (vect_dump, " epilogue iterations: %d\n",
2627 peel_iters_epilogue);
2628 fprintf (vect_dump, " Calculated minimum iters for profitability: %d\n",
2629 min_profitable_iters);
2632 min_profitable_iters =
2633 min_profitable_iters < vf ? vf : min_profitable_iters;
2635 /* Because the condition we create is:
2636 if (niters <= min_profitable_iters)
2637 then skip the vectorized loop. */
2638 min_profitable_iters--;
2640 if (vect_print_dump_info (REPORT_COST))
2641 fprintf (vect_dump, " Profitability threshold = %d\n",
2642 min_profitable_iters);
2644 return min_profitable_iters;
2648 /* TODO: There is a close dependency between the vect_model_*_cost and
2649 vectorizable_* functions. Design this better to avoid maintenance issues. */
2651 /* Function vect_model_reduction_cost.
2653 Models cost for a reduction operation, including the vector ops
2654 generated within the strip-mine loop, the initial definition before
2655 the loop, and the epilogue code that must be generated. */
2657 static bool
2658 vect_model_reduction_cost (stmt_vec_info stmt_info, enum tree_code reduc_code,
2659 int ncopies)
2661 int outer_cost = 0;
2662 enum tree_code code;
2663 optab optab;
2664 tree vectype;
2665 gimple stmt, orig_stmt;
2666 tree reduction_op;
2667 enum machine_mode mode;
2668 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2669 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2672 /* Cost of reduction op inside loop. */
2673 STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info)
2674 += ncopies * vect_get_cost (vector_stmt);
2676 stmt = STMT_VINFO_STMT (stmt_info);
2678 switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
2680 case GIMPLE_SINGLE_RHS:
2681 gcc_assert (TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt)) == ternary_op);
2682 reduction_op = TREE_OPERAND (gimple_assign_rhs1 (stmt), 2);
2683 break;
2684 case GIMPLE_UNARY_RHS:
2685 reduction_op = gimple_assign_rhs1 (stmt);
2686 break;
2687 case GIMPLE_BINARY_RHS:
2688 reduction_op = gimple_assign_rhs2 (stmt);
2689 break;
2690 case GIMPLE_TERNARY_RHS:
2691 reduction_op = gimple_assign_rhs3 (stmt);
2692 break;
2693 default:
2694 gcc_unreachable ();
2697 vectype = get_vectype_for_scalar_type (TREE_TYPE (reduction_op));
2698 if (!vectype)
2700 if (vect_print_dump_info (REPORT_COST))
2702 fprintf (vect_dump, "unsupported data-type ");
2703 print_generic_expr (vect_dump, TREE_TYPE (reduction_op), TDF_SLIM);
2705 return false;
2708 mode = TYPE_MODE (vectype);
2709 orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
2711 if (!orig_stmt)
2712 orig_stmt = STMT_VINFO_STMT (stmt_info);
2714 code = gimple_assign_rhs_code (orig_stmt);
2716 /* Add in cost for initial definition. */
2717 outer_cost += vect_get_cost (scalar_to_vec);
2719 /* Determine cost of epilogue code.
2721 We have a reduction operator that will reduce the vector in one statement.
2722 Also requires scalar extract. */
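
/* E.g. (an illustrative sketch), for a 4-element vector without a
   direct reduc_code: the whole-vector-shift scheme below costs
   exact_log2 (4) * 2 = 4 vector stmts plus one vec_to_scalar, whereas
   the extract-based fallback costs 4 + 3 = 7 vector-stmt units
   (4 extracts plus 3 scalar reduction ops).  */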
2724 if (!nested_in_vect_loop_p (loop, orig_stmt))
2726 if (reduc_code != ERROR_MARK)
2727 outer_cost += vect_get_cost (vector_stmt)
2728 + vect_get_cost (vec_to_scalar);
2729 else
2731 int vec_size_in_bits = tree_low_cst (TYPE_SIZE (vectype), 1);
2732 tree bitsize =
2733 TYPE_SIZE (TREE_TYPE (gimple_assign_lhs (orig_stmt)));
2734 int element_bitsize = tree_low_cst (bitsize, 1);
2735 int nelements = vec_size_in_bits / element_bitsize;
2737 optab = optab_for_tree_code (code, vectype, optab_default);
2739 /* We have a whole vector shift available. */
2740 if (VECTOR_MODE_P (mode)
2741 && optab_handler (optab, mode) != CODE_FOR_nothing
2742 && optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
2743 /* Final reduction via vector shifts and the reduction operator. Also
2744 requires scalar extract. */
2745 outer_cost += ((exact_log2 (nelements) * 2)
2746 * vect_get_cost (vector_stmt)
2747 + vect_get_cost (vec_to_scalar));
2748 else
2749 /* Use extracts and reduction op for final reduction. For N elements,
2750 we have N extracts and N-1 reduction ops. */
2751 outer_cost += ((nelements + nelements - 1)
2752 * vect_get_cost (vector_stmt));
2756 STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info) = outer_cost;
2758 if (vect_print_dump_info (REPORT_COST))
2759 fprintf (vect_dump, "vect_model_reduction_cost: inside_cost = %d, "
2760 "outside_cost = %d .", STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info),
2761 STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info));
2763 return true;
2767 /* Function vect_model_induction_cost.
2769 Models cost for induction operations. */
2771 static void
2772 vect_model_induction_cost (stmt_vec_info stmt_info, int ncopies)
2774 /* loop cost for vec_loop. */
2775 STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info)
2776 = ncopies * vect_get_cost (vector_stmt);
2777 /* prologue cost for vec_init and vec_step. */
2778 STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info)
2779 = 2 * vect_get_cost (scalar_to_vec);
2781 if (vect_print_dump_info (REPORT_COST))
2782 fprintf (vect_dump, "vect_model_induction_cost: inside_cost = %d, "
2783 "outside_cost = %d .", STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info),
2784 STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info));
2788 /* Function get_initial_def_for_induction
2790 Input:
2791 IV_PHI - the loop-header phi stmt that defines the induction variable
2792 of the loop.
2794 Output:
2795 Return a vector variable, initialized with the first VF values of
2796 the induction variable. E.g., for an iv with initial value X and
2797 evolution S, for a vector of 4 units, we want to return:
2798 [X, X + S, X + 2*S, X + 3*S]. */
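
/* E.g. (an illustrative sketch): for an iv with X = 0 and S = 1 (such
   as the i in "for (i = 0; i < n; i++) a[i] = i;") and a 4-unit
   vector, the returned vec_init is [0, 1, 2, 3] and the step vector
   created below is [VF * 1, ...] = [4, 4, 4, 4] when VF = 4.  */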
2800 static tree
2801 get_initial_def_for_induction (gimple iv_phi)
2803 stmt_vec_info stmt_vinfo = vinfo_for_stmt (iv_phi);
2804 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
2805 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2806 tree scalar_type;
2807 tree vectype;
2808 int nunits;
2809 edge pe = loop_preheader_edge (loop);
2810 struct loop *iv_loop;
2811 basic_block new_bb;
2812 tree vec, vec_init, vec_step, t;
2813 tree access_fn;
2814 tree new_var;
2815 tree new_name;
2816 gimple init_stmt, induction_phi, new_stmt;
2817 tree induc_def, vec_def, vec_dest;
2818 tree init_expr, step_expr;
2819 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2820 int i;
2821 bool ok;
2822 int ncopies;
2823 tree expr;
2824 stmt_vec_info phi_info = vinfo_for_stmt (iv_phi);
2825 bool nested_in_vect_loop = false;
2826 gimple_seq stmts = NULL;
2827 imm_use_iterator imm_iter;
2828 use_operand_p use_p;
2829 gimple exit_phi;
2830 edge latch_e;
2831 tree loop_arg;
2832 gimple_stmt_iterator si;
2833 basic_block bb = gimple_bb (iv_phi);
2834 tree stepvectype;
2835 tree resvectype;
2837 /* Is phi in an inner-loop, while vectorizing an enclosing outer-loop? */
2838 if (nested_in_vect_loop_p (loop, iv_phi))
2840 nested_in_vect_loop = true;
2841 iv_loop = loop->inner;
2843 else
2844 iv_loop = loop;
2845 gcc_assert (iv_loop == (gimple_bb (iv_phi))->loop_father);
2847 latch_e = loop_latch_edge (iv_loop);
2848 loop_arg = PHI_ARG_DEF_FROM_EDGE (iv_phi, latch_e);
2850 access_fn = analyze_scalar_evolution (iv_loop, PHI_RESULT (iv_phi));
2851 gcc_assert (access_fn);
2852 STRIP_NOPS (access_fn);
2853 ok = vect_is_simple_iv_evolution (iv_loop->num, access_fn,
2854 &init_expr, &step_expr);
2855 gcc_assert (ok);
2856 pe = loop_preheader_edge (iv_loop);
2858 scalar_type = TREE_TYPE (init_expr);
2859 vectype = get_vectype_for_scalar_type (scalar_type);
2860 resvectype = get_vectype_for_scalar_type (TREE_TYPE (PHI_RESULT (iv_phi)));
2861 gcc_assert (vectype);
2862 nunits = TYPE_VECTOR_SUBPARTS (vectype);
2863 ncopies = vf / nunits;
2865 gcc_assert (phi_info);
2866 gcc_assert (ncopies >= 1);
2868 /* Find the first insertion point in the BB. */
2869 si = gsi_after_labels (bb);
2871 /* Create the vector that holds the initial_value of the induction. */
2872 if (nested_in_vect_loop)
2874 /* iv_loop is nested in the loop to be vectorized. init_expr has already
2875 been created during vectorization of previous stmts. We obtain it
2876 from the STMT_VINFO_VEC_STMT of the defining stmt. */
2877 tree iv_def = PHI_ARG_DEF_FROM_EDGE (iv_phi,
2878 loop_preheader_edge (iv_loop));
2879 vec_init = vect_get_vec_def_for_operand (iv_def, iv_phi, NULL);
2881 else
2883 /* iv_loop is the loop to be vectorized. Create:
2884 vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr) */
2885 new_var = vect_get_new_vect_var (scalar_type, vect_scalar_var, "var_");
2886 add_referenced_var (new_var);
2888 new_name = force_gimple_operand (init_expr, &stmts, false, new_var);
2889 if (stmts)
2891 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
2892 gcc_assert (!new_bb);
2895 t = NULL_TREE;
2896 t = tree_cons (NULL_TREE, new_name, t);
2897 for (i = 1; i < nunits; i++)
2899 /* Create: new_name_i = new_name + step_expr */
2900 enum tree_code code = POINTER_TYPE_P (scalar_type)
2901 ? POINTER_PLUS_EXPR : PLUS_EXPR;
2902 init_stmt = gimple_build_assign_with_ops (code, new_var,
2903 new_name, step_expr);
2904 new_name = make_ssa_name (new_var, init_stmt);
2905 gimple_assign_set_lhs (init_stmt, new_name);
2907 new_bb = gsi_insert_on_edge_immediate (pe, init_stmt);
2908 gcc_assert (!new_bb);
2910 if (vect_print_dump_info (REPORT_DETAILS))
2912 fprintf (vect_dump, "created new init_stmt: ");
2913 print_gimple_stmt (vect_dump, init_stmt, 0, TDF_SLIM);
2915 t = tree_cons (NULL_TREE, new_name, t);
2917 /* Create a vector from [new_name_0, new_name_1, ..., new_name_nunits-1] */
2918 vec = build_constructor_from_list (vectype, nreverse (t));
2919 vec_init = vect_init_vector (iv_phi, vec, vectype, NULL);
2923 /* Create the vector that holds the step of the induction. */
2924 if (nested_in_vect_loop)
2925 /* iv_loop is nested in the loop to be vectorized. Generate:
2926 vec_step = [S, S, S, S] */
2927 new_name = step_expr;
2928 else
2930 /* iv_loop is the loop to be vectorized. Generate:
2931 vec_step = [VF*S, VF*S, VF*S, VF*S] */
2932 expr = build_int_cst (TREE_TYPE (step_expr), vf);
2933 new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
2934 expr, step_expr);
2937 t = unshare_expr (new_name);
2938 gcc_assert (CONSTANT_CLASS_P (new_name));
2939 stepvectype = get_vectype_for_scalar_type (TREE_TYPE (new_name));
2940 gcc_assert (stepvectype);
2941 vec = build_vector_from_val (stepvectype, t);
2942 vec_step = vect_init_vector (iv_phi, vec, stepvectype, NULL);
2945 /* Create the following def-use cycle:
2946 loop prolog:
2947 vec_init = ...
2948 vec_step = ...
2949 loop:
2950 vec_iv = PHI <vec_init, vec_loop>
2952 STMT
2954 vec_loop = vec_iv + vec_step; */
2956 /* Create the induction-phi that defines the induction-operand. */
2957 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
2958 add_referenced_var (vec_dest);
2959 induction_phi = create_phi_node (vec_dest, iv_loop->header);
2960 set_vinfo_for_stmt (induction_phi,
2961 new_stmt_vec_info (induction_phi, loop_vinfo, NULL));
2962 induc_def = PHI_RESULT (induction_phi);
2964 /* Create the iv update inside the loop */
2965 new_stmt = gimple_build_assign_with_ops (PLUS_EXPR, vec_dest,
2966 induc_def, vec_step);
2967 vec_def = make_ssa_name (vec_dest, new_stmt);
2968 gimple_assign_set_lhs (new_stmt, vec_def);
2969 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
2970 set_vinfo_for_stmt (new_stmt, new_stmt_vec_info (new_stmt, loop_vinfo,
2971 NULL));
2973 /* Set the arguments of the phi node: */
2974 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
2975 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
2976 UNKNOWN_LOCATION);
2979 /* In case the vectorization factor (VF) is bigger than the number
2980 of elements that we can fit in a vectype (nunits), we have to generate
2981 more than one vector stmt, i.e., we need to "unroll" the
2982 vector stmt by a factor VF/nunits. For more details see documentation
2983 in vectorizable_operation. */
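
/* E.g. (an illustrative sketch), VF = 8 with 4-unit vectors gives
   ncopies = 2: the first copy is [X, X+S, X+2*S, X+3*S] and each
   further copy is obtained by adding [nunits * S, ...] = [4*S, 4*S,
   4*S, 4*S], yielding [X+4*S, ..., X+7*S].  */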
2985 if (ncopies > 1)
2987 stmt_vec_info prev_stmt_vinfo;
2988 /* FORNOW. This restriction should be relaxed. */
2989 gcc_assert (!nested_in_vect_loop);
2991 /* Create the vector that holds the step of the induction. */
2992 expr = build_int_cst (TREE_TYPE (step_expr), nunits);
2993 new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
2994 expr, step_expr);
2995 t = unshare_expr (new_name);
2996 gcc_assert (CONSTANT_CLASS_P (new_name));
2997 vec = build_vector_from_val (stepvectype, t);
2998 vec_step = vect_init_vector (iv_phi, vec, stepvectype, NULL);
3000 vec_def = induc_def;
3001 prev_stmt_vinfo = vinfo_for_stmt (induction_phi);
3002 for (i = 1; i < ncopies; i++)
3004 /* vec_i = vec_prev + vec_step */
3005 new_stmt = gimple_build_assign_with_ops (PLUS_EXPR, vec_dest,
3006 vec_def, vec_step);
3007 vec_def = make_ssa_name (vec_dest, new_stmt);
3008 gimple_assign_set_lhs (new_stmt, vec_def);
3010 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
3011 if (!useless_type_conversion_p (resvectype, vectype))
3013 new_stmt = gimple_build_assign_with_ops
3014 (VIEW_CONVERT_EXPR,
3015 vect_get_new_vect_var (resvectype, vect_simple_var,
3016 "vec_iv_"),
3017 build1 (VIEW_CONVERT_EXPR, resvectype,
3018 gimple_assign_lhs (new_stmt)), NULL_TREE);
3019 gimple_assign_set_lhs (new_stmt,
3020 make_ssa_name
3021 (gimple_assign_lhs (new_stmt), new_stmt));
3022 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
3024 set_vinfo_for_stmt (new_stmt,
3025 new_stmt_vec_info (new_stmt, loop_vinfo, NULL));
3026 STMT_VINFO_RELATED_STMT (prev_stmt_vinfo) = new_stmt;
3027 prev_stmt_vinfo = vinfo_for_stmt (new_stmt);
3031 if (nested_in_vect_loop)
3033 /* Find the loop-closed exit-phi of the induction, and record
3034 the final vector of induction results: */
3035 exit_phi = NULL;
3036 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
3038 if (!flow_bb_inside_loop_p (iv_loop, gimple_bb (USE_STMT (use_p))))
3040 exit_phi = USE_STMT (use_p);
3041 break;
3044 if (exit_phi)
3046 stmt_vec_info stmt_vinfo = vinfo_for_stmt (exit_phi);
3047 /* FORNOW. We do not yet support the case where an inner-loop induction
3048 is used only outside the outer-loop (i.e., not in the outer-loop itself). */
3049 gcc_assert (STMT_VINFO_RELEVANT_P (stmt_vinfo)
3050 && !STMT_VINFO_LIVE_P (stmt_vinfo));
3052 STMT_VINFO_VEC_STMT (stmt_vinfo) = new_stmt;
3053 if (vect_print_dump_info (REPORT_DETAILS))
3055 fprintf (vect_dump, "vector of inductions after inner-loop:");
3056 print_gimple_stmt (vect_dump, new_stmt, 0, TDF_SLIM);
3062 if (vect_print_dump_info (REPORT_DETAILS))
3064 fprintf (vect_dump, "transform induction: created def-use cycle: ");
3065 print_gimple_stmt (vect_dump, induction_phi, 0, TDF_SLIM);
3066 fprintf (vect_dump, "\n");
3067 print_gimple_stmt (vect_dump, SSA_NAME_DEF_STMT (vec_def), 0, TDF_SLIM);
3070 STMT_VINFO_VEC_STMT (phi_info) = induction_phi;
3071 if (!useless_type_conversion_p (resvectype, vectype))
3073 new_stmt = gimple_build_assign_with_ops
3074 (VIEW_CONVERT_EXPR,
3075 vect_get_new_vect_var (resvectype, vect_simple_var, "vec_iv_"),
3076 build1 (VIEW_CONVERT_EXPR, resvectype, induc_def), NULL_TREE);
3077 induc_def = make_ssa_name (gimple_assign_lhs (new_stmt), new_stmt);
3078 gimple_assign_set_lhs (new_stmt, induc_def);
3079 si = gsi_start_bb (bb);
3080 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
3081 set_vinfo_for_stmt (new_stmt,
3082 new_stmt_vec_info (new_stmt, loop_vinfo, NULL));
3083 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (new_stmt))
3084 = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (induction_phi));
3087 return induc_def;
3091 /* Function get_initial_def_for_reduction
3093 Input:
3094 STMT - a stmt that performs a reduction operation in the loop.
3095 INIT_VAL - the initial value of the reduction variable
3097 Output:
3098 ADJUSTMENT_DEF - a tree that holds a value to be added to the final result
3099 of the reduction (used for adjusting the epilog - see below).
3100 Return a vector variable, initialized according to the operation that STMT
3101 performs. This vector will be used as the initial value of the
3102 vector of partial results.
3104 Option1 (adjust in epilog): Initialize the vector as follows:
3105 add/bit or/xor: [0,0,...,0,0]
3106 mult/bit and: [1,1,...,1,1]
3107 min/max/cond_expr: [init_val,init_val,..,init_val,init_val]
3108 and when necessary (e.g. add/mult case) let the caller know
3109 that it needs to adjust the result by init_val.
3111 Option2: Initialize the vector as follows:
3112 add/bit or/xor: [init_val,0,0,...,0]
3113 mult/bit and: [init_val,1,1,...,1]
3114 min/max/cond_expr: [init_val,init_val,...,init_val]
3115 and no adjustments are needed.
3117 For example, for the following code:
3119 s = init_val;
3120 for (i=0;i<n;i++)
3121 s = s + a[i];
3123 STMT is 's = s + a[i]', and the reduction variable is 's'.
3124 For a vector of 4 units, we want to return either [0,0,0,init_val],
3125 or [0,0,0,0] and let the caller know that it needs to adjust
3126 the result at the end by 'init_val'.
3128 FORNOW, we use the 'adjust in epilog' scheme (Option1) when
3129 ADJUSTMENT_DEF is not NULL, because its initialization vector is
3130 simpler (the same element in all entries); otherwise we use Option2.
3132 A cost model should help decide between these two schemes. */
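
/* E.g. (an illustrative sketch), for a product reduction

     p = 5;
     for (i = 0; i < n; i++)
       p *= a[i];

   with 4-unit vectors, Option1 returns [1,1,1,1] and sets
   ADJUSTMENT_DEF so that the caller multiplies the final result by 5,
   whereas Option2 returns [5,1,1,1] and needs no adjustment.  */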
3134 tree
3135 get_initial_def_for_reduction (gimple stmt, tree init_val,
3136 tree *adjustment_def)
3138 stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
3139 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
3140 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3141 tree scalar_type = TREE_TYPE (init_val);
3142 tree vectype = get_vectype_for_scalar_type (scalar_type);
3143 int nunits;
3144 enum tree_code code = gimple_assign_rhs_code (stmt);
3145 tree def_for_init;
3146 tree init_def;
3147 tree t = NULL_TREE;
3148 int i;
3149 bool nested_in_vect_loop = false;
3150 tree init_value;
3151 REAL_VALUE_TYPE real_init_val = dconst0;
3152 int int_init_val = 0;
3153 gimple def_stmt = NULL;
3155 gcc_assert (vectype);
3156 nunits = TYPE_VECTOR_SUBPARTS (vectype);
3158 gcc_assert (POINTER_TYPE_P (scalar_type) || INTEGRAL_TYPE_P (scalar_type)
3159 || SCALAR_FLOAT_TYPE_P (scalar_type));
3161 if (nested_in_vect_loop_p (loop, stmt))
3162 nested_in_vect_loop = true;
3163 else
3164 gcc_assert (loop == (gimple_bb (stmt))->loop_father);
3166 /* In case of double reduction we only create a vector variable to be put
3167 in the reduction phi node. The actual statement creation is done in
3168 vect_create_epilog_for_reduction. */
3169 if (adjustment_def && nested_in_vect_loop
3170 && TREE_CODE (init_val) == SSA_NAME
3171 && (def_stmt = SSA_NAME_DEF_STMT (init_val))
3172 && gimple_code (def_stmt) == GIMPLE_PHI
3173 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
3174 && vinfo_for_stmt (def_stmt)
3175 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
3176 == vect_double_reduction_def)
3178 *adjustment_def = NULL;
3179 return vect_create_destination_var (init_val, vectype);
3182 if (TREE_CONSTANT (init_val))
3184 if (SCALAR_FLOAT_TYPE_P (scalar_type))
3185 init_value = build_real (scalar_type, TREE_REAL_CST (init_val));
3186 else
3187 init_value = build_int_cst (scalar_type, TREE_INT_CST_LOW (init_val));
3189 else
3190 init_value = init_val;
3192 switch (code)
3194 case WIDEN_SUM_EXPR:
3195 case DOT_PROD_EXPR:
3196 case PLUS_EXPR:
3197 case MINUS_EXPR:
3198 case BIT_IOR_EXPR:
3199 case BIT_XOR_EXPR:
3200 case MULT_EXPR:
3201 case BIT_AND_EXPR:
3202 /* ADJUSTMENT_DEF is NULL when called from
3203 vect_create_epilog_for_reduction to vectorize double reduction. */
3204 if (adjustment_def)
3206 if (nested_in_vect_loop)
3207 *adjustment_def = vect_get_vec_def_for_operand (init_val, stmt,
3208 NULL);
3209 else
3210 *adjustment_def = init_val;
3213 if (code == MULT_EXPR)
3215 real_init_val = dconst1;
3216 int_init_val = 1;
3219 if (code == BIT_AND_EXPR)
3220 int_init_val = -1;
3222 if (SCALAR_FLOAT_TYPE_P (scalar_type))
3223 def_for_init = build_real (scalar_type, real_init_val);
3224 else
3225 def_for_init = build_int_cst (scalar_type, int_init_val);
3227 /* Create a vector of '0' or '1' for all elements except the first. */
3228 for (i = nunits - 2; i >= 0; --i)
3229 t = tree_cons (NULL_TREE, def_for_init, t);
3231 /* Option1: the first element is '0' or '1' as well. */
3232 if (adjustment_def)
3234 t = tree_cons (NULL_TREE, def_for_init, t);
3235 init_def = build_vector (vectype, t);
3236 break;
3239 /* Option2: the first element is INIT_VAL. */
3240 t = tree_cons (NULL_TREE, init_value, t);
3241 if (TREE_CONSTANT (init_val))
3242 init_def = build_vector (vectype, t);
3243 else
3244 init_def = build_constructor_from_list (vectype, t);
3246 break;
3248 case MIN_EXPR:
3249 case MAX_EXPR:
3250 case COND_EXPR:
3251 if (adjustment_def)
3253 *adjustment_def = NULL_TREE;
3254 init_def = vect_get_vec_def_for_operand (init_val, stmt, NULL);
3255 break;
3258 init_def = build_vector_from_val (vectype, init_value);
3259 break;
3261 default:
3262 gcc_unreachable ();
3265 return init_def;
3269 /* Function vect_create_epilog_for_reduction
3271 Create code at the loop-epilog to finalize the result of a reduction
3272 computation.
3274 VECT_DEFS is the list of vectors of partial results, i.e., the lhs's of
3275 the vector reduction statements.
3276 STMT is the scalar reduction stmt that is being vectorized.
3277 NCOPIES is > 1 in case the vectorization factor (VF) is bigger than the
3278 number of elements that we can fit in a vectype (nunits). In this case
3279 we have to generate more than one vector stmt - i.e - we need to "unroll"
3280 the vector stmt by a factor VF/nunits. For more details see documentation
3281 in vectorizable_operation.
3282 REDUC_CODE is the tree-code for the epilog reduction.
3283 REDUCTION_PHIS is a list of the phi-nodes that carry the reduction
3284 computation.
3285 REDUC_INDEX is the index of the operand in the right hand side of the
3286 statement that is defined by REDUCTION_PHI.
3287 DOUBLE_REDUC is TRUE if double reduction phi nodes should be handled.
3288 SLP_NODE is an SLP node containing a group of reduction statements. The
3289 first one in this group is STMT.
3291 This function:
3292 1. Creates the reduction def-use cycles: sets the arguments for
3293 REDUCTION_PHIS:
3294 The loop-entry argument is the vectorized initial-value of the reduction.
3295 The loop-latch argument is taken from VECT_DEFS - the vector of partial
3296 sums.
3297 2. "Reduces" each vector of partial results VECT_DEFS into a single result,
3298 by applying the operation specified by REDUC_CODE if available, or by
3299 other means (whole-vector shifts or a scalar loop).
3300 The function also creates a new phi node at the loop exit to preserve
3301 loop-closed form, as illustrated below.
3303 The flow at the entry to this function:
3305 loop:
3306 vec_def = phi <null, null> # REDUCTION_PHI
3307 VECT_DEF = vector_stmt # vectorized form of STMT
3308 s_loop = scalar_stmt # (scalar) STMT
3309 loop_exit:
3310 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
3311 use <s_out0>
3312 use <s_out0>
3314 The above is transformed by this function into:
3316 loop:
3317 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
3318 VECT_DEF = vector_stmt # vectorized form of STMT
3319 s_loop = scalar_stmt # (scalar) STMT
3320 loop_exit:
3321 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
3322 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
3323 v_out2 = reduce <v_out1>
3324 s_out3 = extract_field <v_out2, 0>
3325 s_out4 = adjust_result <s_out3>
3326 use <s_out4>
3327 use <s_out4>
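
/* E.g. (an illustrative sketch), for a sum reduction whose 4-unit
   vector of partial sums is v_out1 = [p0, p1, p2, p3]: step 1 computes
   p0 + p1 + p2 + p3 into one lane of v_out2, step 2 extracts that lane
   into s_out3, and step 3 adds back the initial value that Option1 of
   get_initial_def_for_reduction left out of the partial sums.  */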
3330 static void
3331 vect_create_epilog_for_reduction (VEC (tree, heap) *vect_defs, gimple stmt,
3332 int ncopies, enum tree_code reduc_code,
3333 VEC (gimple, heap) *reduction_phis,
3334 int reduc_index, bool double_reduc,
3335 slp_tree slp_node)
3337 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3338 stmt_vec_info prev_phi_info;
3339 tree vectype;
3340 enum machine_mode mode;
3341 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3342 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *outer_loop = NULL;
3343 basic_block exit_bb;
3344 tree scalar_dest;
3345 tree scalar_type;
3346 gimple new_phi = NULL, phi;
3347 gimple_stmt_iterator exit_gsi;
3348 tree vec_dest;
3349 tree new_temp = NULL_TREE, new_dest, new_name, new_scalar_dest;
3350 gimple epilog_stmt = NULL;
3351 enum tree_code code = gimple_assign_rhs_code (stmt);
3352 gimple exit_phi;
3353 tree bitsize, bitpos;
3354 tree adjustment_def = NULL;
3355 tree vec_initial_def = NULL;
3356 tree reduction_op, expr, def;
3357 tree orig_name, scalar_result;
3358 imm_use_iterator imm_iter, phi_imm_iter;
3359 use_operand_p use_p, phi_use_p;
3360 bool extract_scalar_result = false;
3361 gimple use_stmt, orig_stmt, reduction_phi = NULL;
3362 bool nested_in_vect_loop = false;
3363 VEC (gimple, heap) *new_phis = NULL;
3364 enum vect_def_type dt = vect_unknown_def_type;
3365 int j, i;
3366 VEC (tree, heap) *scalar_results = NULL;
3367 unsigned int group_size = 1, k, ratio;
3368 VEC (tree, heap) *vec_initial_defs = NULL;
3369 VEC (gimple, heap) *phis;
3370 bool slp_reduc = false;
3371 tree new_phi_result;
3373 if (slp_node)
3374 group_size = VEC_length (gimple, SLP_TREE_SCALAR_STMTS (slp_node));
3376 if (nested_in_vect_loop_p (loop, stmt))
3378 outer_loop = loop;
3379 loop = loop->inner;
3380 nested_in_vect_loop = true;
3381 gcc_assert (!slp_node);
3384 switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
3386 case GIMPLE_SINGLE_RHS:
3387 gcc_assert (TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt))
3388 == ternary_op);
3389 reduction_op = TREE_OPERAND (gimple_assign_rhs1 (stmt), reduc_index);
3390 break;
3391 case GIMPLE_UNARY_RHS:
3392 reduction_op = gimple_assign_rhs1 (stmt);
3393 break;
3394 case GIMPLE_BINARY_RHS:
3395 reduction_op = reduc_index ?
3396 gimple_assign_rhs2 (stmt) : gimple_assign_rhs1 (stmt);
3397 break;
3398 case GIMPLE_TERNARY_RHS:
3399 reduction_op = gimple_op (stmt, reduc_index + 1);
3400 break;
3401 default:
3402 gcc_unreachable ();
3405 vectype = get_vectype_for_scalar_type (TREE_TYPE (reduction_op));
3406 gcc_assert (vectype);
3407 mode = TYPE_MODE (vectype);
3409 /* 1. Create the reduction def-use cycle:
3410 Set the arguments of REDUCTION_PHIS, i.e., transform
3412 loop:
3413 vec_def = phi <null, null> # REDUCTION_PHI
3414 VECT_DEF = vector_stmt # vectorized form of STMT
3417 into:
3419 loop:
3420 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
3421 VECT_DEF = vector_stmt # vectorized form of STMT
3424 (in case of SLP, do it for all the phis). */
3426 /* Get the loop-entry arguments. */
3427 if (slp_node)
3428 vect_get_slp_defs (reduction_op, NULL_TREE, slp_node, &vec_initial_defs,
3429 NULL, reduc_index);
3430 else
3432 vec_initial_defs = VEC_alloc (tree, heap, 1);
3433 /* For the case of reduction, vect_get_vec_def_for_operand returns
3434 the scalar def before the loop, that defines the initial value
3435 of the reduction variable. */
3436 vec_initial_def = vect_get_vec_def_for_operand (reduction_op, stmt,
3437 &adjustment_def);
3438 VEC_quick_push (tree, vec_initial_defs, vec_initial_def);
3441 /* Set phi nodes arguments. */
3442 FOR_EACH_VEC_ELT (gimple, reduction_phis, i, phi)
3444 tree vec_init_def = VEC_index (tree, vec_initial_defs, i);
3445 tree def = VEC_index (tree, vect_defs, i);
3446 for (j = 0; j < ncopies; j++)
3448 /* Set the loop-entry arg of the reduction-phi. */
3449 add_phi_arg (phi, vec_init_def, loop_preheader_edge (loop),
3450 UNKNOWN_LOCATION);
3452 /* Set the loop-latch arg for the reduction-phi. */
3453 if (j > 0)
3454 def = vect_get_vec_def_for_stmt_copy (vect_unknown_def_type, def);
3456 add_phi_arg (phi, def, loop_latch_edge (loop), UNKNOWN_LOCATION);
3458 if (vect_print_dump_info (REPORT_DETAILS))
3460 fprintf (vect_dump, "transform reduction: created def-use"
3461 " cycle: ");
3462 print_gimple_stmt (vect_dump, phi, 0, TDF_SLIM);
3463 fprintf (vect_dump, "\n");
3464 print_gimple_stmt (vect_dump, SSA_NAME_DEF_STMT (def), 0,
3465 TDF_SLIM);
3468 phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi));
3472 VEC_free (tree, heap, vec_initial_defs);
3474 /* 2. Create epilog code.
3475 The reduction epilog code operates across the elements of the vector
3476 of partial results computed by the vectorized loop.
3477 The reduction epilog code consists of:
3479 step 1: compute the scalar result in a vector (v_out2)
3480 step 2: extract the scalar result (s_out3) from the vector (v_out2)
3481 step 3: adjust the scalar result (s_out3) if needed.
3483 Step 1 can be accomplished using one of the following three schemes:
3484 (scheme 1) using reduc_code, if available.
3485 (scheme 2) using whole-vector shifts, if available.
3486 (scheme 3) using a scalar loop. In this case steps 1+2 above are
3487 combined.
3489 The overall epilog code looks like this:
3491 s_out0 = phi <s_loop> # original EXIT_PHI
3492 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
3493 v_out2 = reduce <v_out1> # step 1
3494 s_out3 = extract_field <v_out2, 0> # step 2
3495 s_out4 = adjust_result <s_out3> # step 3
3497 (step 3 is optional, and steps 1 and 2 may be combined).
3498 Lastly, the uses of s_out0 are replaced by s_out4. */
3501 /* 2.1 Create new loop-exit-phis to preserve loop-closed form:
3502 v_out1 = phi <VECT_DEF>
3503 Store them in NEW_PHIS. */
3505 exit_bb = single_exit (loop)->dest;
3506 prev_phi_info = NULL;
3507 new_phis = VEC_alloc (gimple, heap, VEC_length (tree, vect_defs));
3508 FOR_EACH_VEC_ELT (tree, vect_defs, i, def)
3510 for (j = 0; j < ncopies; j++)
3512 phi = create_phi_node (SSA_NAME_VAR (def), exit_bb);
3513 set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, loop_vinfo, NULL));
3514 if (j == 0)
3515 VEC_quick_push (gimple, new_phis, phi);
3516 else
3518 def = vect_get_vec_def_for_stmt_copy (dt, def);
3519 STMT_VINFO_RELATED_STMT (prev_phi_info) = phi;
3522 SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, def);
3523 prev_phi_info = vinfo_for_stmt (phi);
3527 /* The epilogue is created for the outer-loop, i.e., for the loop being
3528 vectorized. */
3529 if (double_reduc)
3531 loop = outer_loop;
3532 exit_bb = single_exit (loop)->dest;
3535 exit_gsi = gsi_after_labels (exit_bb);
3537 /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
3538 (i.e. when reduc_code is not available) and in the final adjustment
3539 code (if needed). Also get the original scalar reduction variable as
3540 defined in the loop. In case STMT is a "pattern-stmt" (i.e., it
3541 represents a reduction pattern), the tree-code and scalar-def are
3542 taken from the original stmt that the pattern-stmt (STMT) replaces.
3543 Otherwise (it is a regular reduction) - the tree-code and scalar-def
3544 are taken from STMT. */
3546 orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
3547 if (!orig_stmt)
3549 /* Regular reduction */
3550 orig_stmt = stmt;
3552 else
3554 /* Reduction pattern */
3555 stmt_vec_info stmt_vinfo = vinfo_for_stmt (orig_stmt);
3556 gcc_assert (STMT_VINFO_IN_PATTERN_P (stmt_vinfo));
3557 gcc_assert (STMT_VINFO_RELATED_STMT (stmt_vinfo) == stmt);
3560 code = gimple_assign_rhs_code (orig_stmt);
3561 /* For MINUS_EXPR the initial vector is [init_val,0,...,0], therefore,
3562 partial results are added and not subtracted. */
3563 if (code == MINUS_EXPR)
3564 code = PLUS_EXPR;
3566 scalar_dest = gimple_assign_lhs (orig_stmt);
3567 scalar_type = TREE_TYPE (scalar_dest);
3568 scalar_results = VEC_alloc (tree, heap, group_size);
3569 new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
3570 bitsize = TYPE_SIZE (scalar_type);
3572 /* In case this is a reduction in an inner-loop while vectorizing an outer
3573 loop - we don't need to extract a single scalar result at the end of the
3574 inner-loop (unless it is double reduction, i.e., the use of reduction is
3575 outside the outer-loop). The final vector of partial results will be used
3576 in the vectorized outer-loop, or reduced to a scalar result at the end of
3577 the outer-loop. */
3578 if (nested_in_vect_loop && !double_reduc)
3579 goto vect_finalize_reduction;
3581 /* SLP reduction without reduction chain, e.g.,
3582 # a1 = phi <a2, a0>
3583 # b1 = phi <b2, b0>
3584 a2 = operation (a1)
3585 b2 = operation (b1) */
3586 slp_reduc = (slp_node && !GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)));
3588 /* In case of reduction chain, e.g.,
3589 # a1 = phi <a3, a0>
3590 a2 = operation (a1)
3591 a3 = operation (a2),
3593 we may end up with more than one vector result. Here we reduce them to
3594 one vector. */
3595 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
3597 tree first_vect = PHI_RESULT (VEC_index (gimple, new_phis, 0));
3598 tree tmp;
3600 vec_dest = vect_create_destination_var (scalar_dest, vectype);
3601 for (k = 1; k < VEC_length (gimple, new_phis); k++)
3603 gimple next_phi = VEC_index (gimple, new_phis, k);
3604 tree second_vect = PHI_RESULT (next_phi);
3605 gimple new_vec_stmt;
3607 tmp = build2 (code, vectype, first_vect, second_vect);
3608 new_vec_stmt = gimple_build_assign (vec_dest, tmp);
3609 first_vect = make_ssa_name (vec_dest, new_vec_stmt);
3610 gimple_assign_set_lhs (new_vec_stmt, first_vect);
3611 gsi_insert_before (&exit_gsi, new_vec_stmt, GSI_SAME_STMT);
3614 new_phi_result = first_vect;
3616 else
3617 new_phi_result = PHI_RESULT (VEC_index (gimple, new_phis, 0));
3619 /* 2.3 Create the reduction code, using one of the three schemes described
3620 above. In SLP we simply need to extract all the elements from the
3621 vector (without reducing them), so we use scalar shifts. */
3622 if (reduc_code != ERROR_MARK && !slp_reduc)
3624 tree tmp;
3626 /*** Case 1: Create:
3627 v_out2 = reduc_expr <v_out1> */
3629 if (vect_print_dump_info (REPORT_DETAILS))
3630 fprintf (vect_dump, "Reduce using direct vector reduction.");
3632 vec_dest = vect_create_destination_var (scalar_dest, vectype);
3633 tmp = build1 (reduc_code, vectype, new_phi_result);
3634 epilog_stmt = gimple_build_assign (vec_dest, tmp);
3635 new_temp = make_ssa_name (vec_dest, epilog_stmt);
3636 gimple_assign_set_lhs (epilog_stmt, new_temp);
3637 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
3639 extract_scalar_result = true;
3641 else
3643 enum tree_code shift_code = ERROR_MARK;
3644 bool have_whole_vector_shift = true;
3645 int bit_offset;
3646 int element_bitsize = tree_low_cst (bitsize, 1);
3647 int vec_size_in_bits = tree_low_cst (TYPE_SIZE (vectype), 1);
3648 tree vec_temp;
3650 if (optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
3651 shift_code = VEC_RSHIFT_EXPR;
3652 else
3653 have_whole_vector_shift = false;
3655 /* Regardless of whether we have a whole vector shift, if we're
3656 emulating the operation via tree-vect-generic, we don't want
3657 to use it. Only the first round of the reduction is likely
3658 to still be profitable via emulation. */
3659 /* ??? It might be better to emit a reduction tree code here, so that
3660 tree-vect-generic can expand the first round via bit tricks. */
3661 if (!VECTOR_MODE_P (mode))
3662 have_whole_vector_shift = false;
3663 else
3665 optab optab = optab_for_tree_code (code, vectype, optab_default);
3666 if (optab_handler (optab, mode) == CODE_FOR_nothing)
3667 have_whole_vector_shift = false;
3670 if (have_whole_vector_shift && !slp_reduc)
3672 /*** Case 2: Create:
3673 for (offset = VS/2; offset >= element_size; offset/=2)
3675 Create: va' = vec_shift <va, offset>
3676 Create: va = vop <va, va'>
3677 } */
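/* Illustrative trace (assuming a 128-bit V4SI accumulator, i.e.
   vec_size_in_bits == 128 and element_bitsize == 32, and a whole-vector
   shift that moves high elements toward element 0):

     va' = vec_shift <va, 64>        [a2, a3, _, _ ]
     va = va + va'                   [a0+a2, a1+a3, _, _]
     va' = vec_shift <va, 32>        [a1+a3, _, _, _]
     va = va + va'                   [a0+a1+a2+a3, _, _, _]

   after which element 0 holds the complete reduction value. */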
3679 if (vect_print_dump_info (REPORT_DETAILS))
3680 fprintf (vect_dump, "Reduce using vector shifts");
3682 vec_dest = vect_create_destination_var (scalar_dest, vectype);
3683 new_temp = new_phi_result;
3684 for (bit_offset = vec_size_in_bits/2;
3685 bit_offset >= element_bitsize;
3686 bit_offset /= 2)
3688 tree bitpos = size_int (bit_offset);
3690 epilog_stmt = gimple_build_assign_with_ops (shift_code,
3691 vec_dest, new_temp, bitpos);
3692 new_name = make_ssa_name (vec_dest, epilog_stmt);
3693 gimple_assign_set_lhs (epilog_stmt, new_name);
3694 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
3696 epilog_stmt = gimple_build_assign_with_ops (code, vec_dest,
3697 new_name, new_temp);
3698 new_temp = make_ssa_name (vec_dest, epilog_stmt);
3699 gimple_assign_set_lhs (epilog_stmt, new_temp);
3700 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
3703 extract_scalar_result = true;
3705 else
3707 tree rhs;
3709 /*** Case 3: Create:
3710 s = extract_field <v_out2, 0>
3711 for (offset = element_size;
3712 offset < vector_size;
3713 offset += element_size;)
3715 Create: s' = extract_field <v_out2, offset>
3716 Create: s = op <s, s'> // For non SLP cases
3717 } */
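/* Illustrative trace (again assuming V4SI, element size 32 bits), for
   a single vector of partial results v_out2 = [a0, a1, a2, a3]:

     s = extract_field <v_out2, 0>      a0
     s' = extract_field <v_out2, 32>    a1
     s = s + s'
     s' = extract_field <v_out2, 64>    a2
     s = s + s'
     s' = extract_field <v_out2, 96>    a3
     s = s + s'

   For SLP the intermediate s' values are collected instead of summed. */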
3719 if (vect_print_dump_info (REPORT_DETAILS))
3720 fprintf (vect_dump, "Reduce using scalar code. ");
3722 vec_size_in_bits = tree_low_cst (TYPE_SIZE (vectype), 1);
3723 FOR_EACH_VEC_ELT (gimple, new_phis, i, new_phi)
3725 vec_temp = PHI_RESULT (new_phi);
3726 rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, bitsize,
3727 bitsize_zero_node);
3728 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
3729 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
3730 gimple_assign_set_lhs (epilog_stmt, new_temp);
3731 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
3733 /* In SLP we don't need to apply the reduction operation, so we just
3734 collect s' values in SCALAR_RESULTS. */
3735 if (slp_reduc)
3736 VEC_safe_push (tree, heap, scalar_results, new_temp);
3738 for (bit_offset = element_bitsize;
3739 bit_offset < vec_size_in_bits;
3740 bit_offset += element_bitsize)
3742 tree bitpos = bitsize_int (bit_offset);
3743 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp,
3744 bitsize, bitpos);
3746 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
3747 new_name = make_ssa_name (new_scalar_dest, epilog_stmt);
3748 gimple_assign_set_lhs (epilog_stmt, new_name);
3749 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
3751 if (slp_reduc)
3753 /* In SLP we don't need to apply the reduction operation, so
3754 we just collect s' values in SCALAR_RESULTS. */
3755 new_temp = new_name;
3756 VEC_safe_push (tree, heap, scalar_results, new_name);
3758 else
3760 epilog_stmt = gimple_build_assign_with_ops (code,
3761 new_scalar_dest, new_name, new_temp);
3762 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
3763 gimple_assign_set_lhs (epilog_stmt, new_temp);
3764 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
3769 /* The only case where we need to reduce scalar results in SLP, is
3770 unrolling. If the size of SCALAR_RESULTS is greater than
3771 GROUP_SIZE, we reduce them combining elements modulo
3772 GROUP_SIZE. */
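/* Sketch of the modulo scheme (hypothetical sizes): with GROUP_SIZE == 2
   and four extracted scalars s0..s3 (the group unrolled twice), the loop
   below computes

     s0 = s0 + s2
     s1 = s1 + s3

   leaving exactly one scalar result per group element. */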
3773 if (slp_reduc)
3775 tree res, first_res, new_res;
3776 gimple new_stmt;
3778 /* Reduce multiple scalar results in case of SLP unrolling. */
3779 for (j = group_size; VEC_iterate (tree, scalar_results, j, res);
3780 j++)
3782 first_res = VEC_index (tree, scalar_results, j % group_size);
3783 new_stmt = gimple_build_assign_with_ops (code,
3784 new_scalar_dest, first_res, res);
3785 new_res = make_ssa_name (new_scalar_dest, new_stmt);
3786 gimple_assign_set_lhs (new_stmt, new_res);
3787 gsi_insert_before (&exit_gsi, new_stmt, GSI_SAME_STMT);
3788 VEC_replace (tree, scalar_results, j % group_size, new_res);
3791 else
3792 /* Not SLP - we have one scalar to keep in SCALAR_RESULTS. */
3793 VEC_safe_push (tree, heap, scalar_results, new_temp);
3795 extract_scalar_result = false;
3799 /* 2.4 Extract the final scalar result. Create:
3800 s_out3 = extract_field <v_out2, bitpos> */
3802 if (extract_scalar_result)
3804 tree rhs;
3806 if (vect_print_dump_info (REPORT_DETAILS))
3807 fprintf (vect_dump, "extract scalar result");
3809 if (BYTES_BIG_ENDIAN)
3810 bitpos = size_binop (MULT_EXPR,
3811 bitsize_int (TYPE_VECTOR_SUBPARTS (vectype) - 1),
3812 TYPE_SIZE (scalar_type));
3813 else
3814 bitpos = bitsize_zero_node;
3816 rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp, bitsize, bitpos);
3817 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
3818 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
3819 gimple_assign_set_lhs (epilog_stmt, new_temp);
3820 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
3821 VEC_safe_push (tree, heap, scalar_results, new_temp);
3824 vect_finalize_reduction:
3826 if (double_reduc)
3827 loop = loop->inner;
3829 /* 2.5 Adjust the final result by the initial value of the reduction
3830 variable. (When such adjustment is not needed, then
3831 'adjustment_def' is zero). For example, if code is PLUS we create:
3832 new_temp = loop_exit_def + adjustment_def */
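/* Illustrative case (assuming the vectorized loop accumulated into a
   zero-initialized vector and ADJUSTMENT_DEF holds the scalar initial
   value): for 's = 10; for (...) s += a[i];' we emit here

     new_temp = loop_exit_def + 10

   so that the initial value is applied exactly once. */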
3834 if (adjustment_def)
3836 gcc_assert (!slp_reduc);
3837 if (nested_in_vect_loop)
3839 new_phi = VEC_index (gimple, new_phis, 0);
3840 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) == VECTOR_TYPE);
3841 expr = build2 (code, vectype, PHI_RESULT (new_phi), adjustment_def);
3842 new_dest = vect_create_destination_var (scalar_dest, vectype);
3844 else
3846 new_temp = VEC_index (tree, scalar_results, 0);
3847 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
3848 expr = build2 (code, scalar_type, new_temp, adjustment_def);
3849 new_dest = vect_create_destination_var (scalar_dest, scalar_type);
3852 epilog_stmt = gimple_build_assign (new_dest, expr);
3853 new_temp = make_ssa_name (new_dest, epilog_stmt);
3854 gimple_assign_set_lhs (epilog_stmt, new_temp);
3855 SSA_NAME_DEF_STMT (new_temp) = epilog_stmt;
3856 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
3857 if (nested_in_vect_loop)
3859 set_vinfo_for_stmt (epilog_stmt,
3860 new_stmt_vec_info (epilog_stmt, loop_vinfo,
3861 NULL));
3862 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (epilog_stmt)) =
3863 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (new_phi));
3865 if (!double_reduc)
3866 VEC_quick_push (tree, scalar_results, new_temp);
3867 else
3868 VEC_replace (tree, scalar_results, 0, new_temp);
3870 else
3871 VEC_replace (tree, scalar_results, 0, new_temp);
3873 VEC_replace (gimple, new_phis, 0, epilog_stmt);
3876 /* 2.6 Handle the loop-exit phis. Replace the uses of scalar loop-exit
3877 phis with new adjusted scalar results, i.e., replace use <s_out0>
3878 with use <s_out4>.
3880 Transform:
3881 loop_exit:
3882 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
3883 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
3884 v_out2 = reduce <v_out1>
3885 s_out3 = extract_field <v_out2, 0>
3886 s_out4 = adjust_result <s_out3>
3887 use <s_out0>
3888 use <s_out0>
3890 into:
3892 loop_exit:
3893 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
3894 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
3895 v_out2 = reduce <v_out1>
3896 s_out3 = extract_field <v_out2, 0>
3897 s_out4 = adjust_result <s_out3>
3898 use <s_out4>
3899 use <s_out4> */
3902 /* In an SLP reduction chain we reduce the vector results into one vector
3903 if necessary, hence we set GROUP_SIZE to 1 here. SCALAR_DEST is the LHS
3904 of the last stmt in the reduction chain, since we are looking for the
3905 loop exit phi node. */
3906 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
3908 scalar_dest = gimple_assign_lhs (VEC_index (gimple,
3909 SLP_TREE_SCALAR_STMTS (slp_node),
3910 group_size - 1));
3911 group_size = 1;
3914 /* In SLP we may have several statements in NEW_PHIS and REDUCTION_PHIS (in
3915 case GROUP_SIZE is greater than the vectorization factor). Therefore, we
3916 need to match SCALAR_RESULTS with corresponding statements. The first
3917 (GROUP_SIZE / number of new vector stmts) scalar results correspond to
3918 the first vector stmt, etc.
3919 (RATIO is equal to (GROUP_SIZE / number of new vector stmts)). */
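/* Concrete example (hypothetical sizes): with GROUP_SIZE == 4 and two
   stmts in NEW_PHIS, RATIO is 2: scalar results 0 and 1 are matched
   with the first vector stmt, results 2 and 3 with the second. */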
3920 if (group_size > VEC_length (gimple, new_phis))
3922 ratio = group_size / VEC_length (gimple, new_phis);
3923 gcc_assert (!(group_size % VEC_length (gimple, new_phis)));
3925 else
3926 ratio = 1;
3928 for (k = 0; k < group_size; k++)
3930 if (k % ratio == 0)
3932 epilog_stmt = VEC_index (gimple, new_phis, k / ratio);
3933 reduction_phi = VEC_index (gimple, reduction_phis, k / ratio);
3936 if (slp_reduc)
3938 gimple current_stmt = VEC_index (gimple,
3939 SLP_TREE_SCALAR_STMTS (slp_node), k);
3941 orig_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (current_stmt));
3942 /* SLP statements can't participate in patterns. */
3943 gcc_assert (!orig_stmt);
3944 scalar_dest = gimple_assign_lhs (current_stmt);
3947 phis = VEC_alloc (gimple, heap, 3);
3948 /* Find the loop-closed-use at the loop exit of the original scalar
3949 result. (The reduction result is expected to have two immediate uses -
3950 one at the latch block, and one at the loop exit). */
3951 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
3952 if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
3953 VEC_safe_push (gimple, heap, phis, USE_STMT (use_p));
3955 /* We expect to have found an exit_phi because of loop-closed-ssa
3956 form. */
3957 gcc_assert (!VEC_empty (gimple, phis));
3959 FOR_EACH_VEC_ELT (gimple, phis, i, exit_phi)
3961 if (outer_loop)
3963 stmt_vec_info exit_phi_vinfo = vinfo_for_stmt (exit_phi);
3964 gimple vect_phi;
3966 /* FORNOW. Currently not supporting the case that an inner-loop
3967 reduction is not used in the outer-loop (but only outside the
3968 outer-loop), unless it is a double reduction. */
3969 gcc_assert ((STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
3970 && !STMT_VINFO_LIVE_P (exit_phi_vinfo))
3971 || double_reduc);
3973 STMT_VINFO_VEC_STMT (exit_phi_vinfo) = epilog_stmt;
3974 if (!double_reduc
3975 || STMT_VINFO_DEF_TYPE (exit_phi_vinfo)
3976 != vect_double_reduction_def)
3977 continue;
3979 /* Handle double reduction:
3981 stmt1: s1 = phi <s0, s2> - double reduction phi (outer loop)
3982 stmt2: s3 = phi <s1, s4> - (regular) reduc phi (inner loop)
3983 stmt3: s4 = use (s3) - (regular) reduc stmt (inner loop)
3984 stmt4: s2 = phi <s4> - double reduction stmt (outer loop)
3986 At that point the regular reduction (stmt2 and stmt3) is
3987 already vectorized, as well as the exit phi node, stmt4.
3988 Here we vectorize the phi node of double reduction, stmt1, and
3989 update all relevant statements. */
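/* In source form such a double reduction typically comes from a nested
   accumulation (an illustrative sketch):

     int s = 0;
     for (i = 0; i < N; i++)
       for (j = 0; j < M; j++)
         s += a[i][j];

   where s1/s2 are the outer-loop phis and s3/s4 the inner-loop ones. */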
3991 /* Go through all the uses of s2 to find double reduction phi
3992 node, i.e., stmt1 above. */
3993 orig_name = PHI_RESULT (exit_phi);
3994 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
3996 stmt_vec_info use_stmt_vinfo = vinfo_for_stmt (use_stmt);
3997 stmt_vec_info new_phi_vinfo;
3998 tree vect_phi_init, preheader_arg, vect_phi_res, init_def;
3999 basic_block bb = gimple_bb (use_stmt);
4000 gimple use;
4002 /* Check that USE_STMT is really a double reduction phi
4003 node. */
4004 if (gimple_code (use_stmt) != GIMPLE_PHI
4005 || gimple_phi_num_args (use_stmt) != 2
4006 || !use_stmt_vinfo
4007 || STMT_VINFO_DEF_TYPE (use_stmt_vinfo)
4008 != vect_double_reduction_def
4009 || bb->loop_father != outer_loop)
4010 continue;
4012 /* Create vector phi node for double reduction:
4013 vs1 = phi <vs0, vs2>
4014 vs1 was created previously in this function by a call to
4015 vect_get_vec_def_for_operand and is stored in
4016 vec_initial_def;
4017 vs2 is defined by EPILOG_STMT, the vectorized EXIT_PHI;
4018 vs0 is created here. */
4020 /* Create vector phi node. */
4021 vect_phi = create_phi_node (vec_initial_def, bb);
4022 new_phi_vinfo = new_stmt_vec_info (vect_phi,
4023 loop_vec_info_for_loop (outer_loop), NULL);
4024 set_vinfo_for_stmt (vect_phi, new_phi_vinfo);
4026 /* Create vs0 - initial def of the double reduction phi. */
4027 preheader_arg = PHI_ARG_DEF_FROM_EDGE (use_stmt,
4028 loop_preheader_edge (outer_loop));
4029 init_def = get_initial_def_for_reduction (stmt,
4030 preheader_arg, NULL);
4031 vect_phi_init = vect_init_vector (use_stmt, init_def,
4032 vectype, NULL);
4034 /* Update phi node arguments with vs0 and vs2. */
4035 add_phi_arg (vect_phi, vect_phi_init,
4036 loop_preheader_edge (outer_loop),
4037 UNKNOWN_LOCATION);
4038 add_phi_arg (vect_phi, PHI_RESULT (epilog_stmt),
4039 loop_latch_edge (outer_loop), UNKNOWN_LOCATION);
4040 if (vect_print_dump_info (REPORT_DETAILS))
4042 fprintf (vect_dump, "created double reduction phi "
4043 "node: ");
4044 print_gimple_stmt (vect_dump, vect_phi, 0, TDF_SLIM);
4047 vect_phi_res = PHI_RESULT (vect_phi);
4049 /* Replace the use, i.e., set the correct vs1 in the regular
4050 reduction phi node. FORNOW, NCOPIES is always 1, so the
4051 loop is redundant. */
4052 use = reduction_phi;
4053 for (j = 0; j < ncopies; j++)
4055 edge pr_edge = loop_preheader_edge (loop);
4056 SET_PHI_ARG_DEF (use, pr_edge->dest_idx, vect_phi_res);
4057 use = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (use));
4063 VEC_free (gimple, heap, phis);
4064 if (nested_in_vect_loop)
4066 if (double_reduc)
4067 loop = outer_loop;
4068 else
4069 continue;
4072 phis = VEC_alloc (gimple, heap, 3);
4073 /* Find the loop-closed-use at the loop exit of the original scalar
4074 result. (The reduction result is expected to have two immediate uses,
4075 one at the latch block, and one at the loop exit). For double
4076 reductions we are looking for exit phis of the outer loop. */
4077 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
4079 if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
4080 VEC_safe_push (gimple, heap, phis, USE_STMT (use_p));
4081 else
4083 if (double_reduc && gimple_code (USE_STMT (use_p)) == GIMPLE_PHI)
4085 tree phi_res = PHI_RESULT (USE_STMT (use_p));
4087 FOR_EACH_IMM_USE_FAST (phi_use_p, phi_imm_iter, phi_res)
4089 if (!flow_bb_inside_loop_p (loop,
4090 gimple_bb (USE_STMT (phi_use_p))))
4091 VEC_safe_push (gimple, heap, phis,
4092 USE_STMT (phi_use_p));
4098 FOR_EACH_VEC_ELT (gimple, phis, i, exit_phi)
4100 /* Replace the uses: */
4101 orig_name = PHI_RESULT (exit_phi);
4102 scalar_result = VEC_index (tree, scalar_results, k);
4103 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
4104 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
4105 SET_USE (use_p, scalar_result);
4108 VEC_free (gimple, heap, phis);
4111 VEC_free (tree, heap, scalar_results);
4112 VEC_free (gimple, heap, new_phis);
4116 /* Function vectorizable_reduction.
4118 Check if STMT performs a reduction operation that can be vectorized.
4119 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
4120 stmt to replace it, put it in VEC_STMT, and insert it at GSI.
4121 Return FALSE if not a vectorizable STMT, TRUE otherwise.
4123 This function also handles reduction idioms (patterns) that have been
4124 recognized in advance during vect_pattern_recog. In this case, STMT may be
4125 of this form:
4126 X = pattern_expr (arg0, arg1, ..., X)
4127 and its STMT_VINFO_RELATED_STMT points to the last stmt in the original
4128 sequence that had been detected and replaced by the pattern-stmt (STMT).
4130 In some cases of reduction patterns, the type of the reduction variable X is
4131 different than the type of the other arguments of STMT.
4132 In such cases, the vectype that is used when transforming STMT into a vector
4133 stmt is different than the vectype that is used to determine the
4134 vectorization factor, because it consists of a different number of elements
4135 than the actual number of elements that are being operated upon in parallel.
4137 For example, consider an accumulation of shorts into an int accumulator.
4138 On some targets it's possible to vectorize this pattern operating on 8
4139 shorts at a time (hence, the vectype for purposes of determining the
4140 vectorization factor should be V8HI); on the other hand, the vectype that
4141 is used to create the vector form is actually V4SI (the type of the result).
4143 Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
4144 indicates the actual level of parallelism (V8HI in the example), so
4145 that the right vectorization factor is derived. This vectype
4146 corresponds to the type of arguments to the reduction stmt, and should *NOT*
4147 be used to create the vectorized stmt. The right vectype for the vectorized
4148 stmt is obtained from the type of the result X:
4149 get_vectype_for_scalar_type (TREE_TYPE (X))
4151 This means that, contrary to "regular" reductions (or "regular" stmts in
4152 general), the following equation:
4153 STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (TREE_TYPE (X))
4154 does *NOT* necessarily hold for reduction patterns. */
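/* A source-level sketch of such a pattern (hypothetical example):

     short a[N];
     int acc = 0;
     for (i = 0; i < N; i++)
       acc += a[i];    // recognized as widen_sum <a[i], acc>

   Here the vectype recorded for the VF is V8HI (eight shorts per copy),
   while the vectorized stmt itself produces V4SI values. */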
4156 bool
4157 vectorizable_reduction (gimple stmt, gimple_stmt_iterator *gsi,
4158 gimple *vec_stmt, slp_tree slp_node)
4160 tree vec_dest;
4161 tree scalar_dest;
4162 tree loop_vec_def0 = NULL_TREE, loop_vec_def1 = NULL_TREE;
4163 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4164 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
4165 tree vectype_in = NULL_TREE;
4166 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4167 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4168 enum tree_code code, orig_code, epilog_reduc_code;
4169 enum machine_mode vec_mode;
4170 int op_type;
4171 optab optab, reduc_optab;
4172 tree new_temp = NULL_TREE;
4173 tree def;
4174 gimple def_stmt;
4175 enum vect_def_type dt;
4176 gimple new_phi = NULL;
4177 tree scalar_type;
4178 bool is_simple_use;
4179 gimple orig_stmt;
4180 stmt_vec_info orig_stmt_info;
4181 tree expr = NULL_TREE;
4182 int i;
4183 int ncopies;
4184 int epilog_copies;
4185 stmt_vec_info prev_stmt_info, prev_phi_info;
4186 bool single_defuse_cycle = false;
4187 tree reduc_def = NULL_TREE;
4188 gimple new_stmt = NULL;
4189 int j;
4190 tree ops[3];
4191 bool nested_cycle = false, found_nested_cycle_def = false;
4192 gimple reduc_def_stmt = NULL;
4193 /* The default is that the reduction variable is the last operand in the statement. */
4194 int reduc_index = 2;
4195 bool double_reduc = false, dummy;
4196 basic_block def_bb;
4197 struct loop * def_stmt_loop, *outer_loop = NULL;
4198 tree def_arg;
4199 gimple def_arg_stmt;
4200 VEC (tree, heap) *vec_oprnds0 = NULL, *vec_oprnds1 = NULL, *vect_defs = NULL;
4201 VEC (gimple, heap) *phis = NULL;
4202 int vec_num;
4203 tree def0, def1, tem;
4205 /* In case of reduction chain we switch to the first stmt in the chain, but
4206 we don't update STMT_INFO, since only the last stmt is marked as reduction
4207 and has reduction properties. */
4208 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
4209 stmt = GROUP_FIRST_ELEMENT (stmt_info);
4211 if (nested_in_vect_loop_p (loop, stmt))
4213 outer_loop = loop;
4214 loop = loop->inner;
4215 nested_cycle = true;
4218 /* 1. Is vectorizable reduction? */
4219 /* Not supportable if the reduction variable is used in the loop, unless
4220 it's a reduction chain. */
4221 if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer
4222 && !GROUP_FIRST_ELEMENT (stmt_info))
4223 return false;
4225 /* Reductions that are not used even in an enclosing outer-loop
4226 are expected to be "live" (used out of the loop). */
4227 if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope
4228 && !STMT_VINFO_LIVE_P (stmt_info))
4229 return false;
4231 /* Make sure it was already recognized as a reduction computation. */
4232 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def
4233 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_nested_cycle)
4234 return false;
4236 /* 2. Has this been recognized as a reduction pattern?
4238 Check if STMT represents a pattern that has been recognized
4239 in earlier analysis stages. For stmts that represent a pattern,
4240 the STMT_VINFO_RELATED_STMT field records the last stmt in
4241 the original sequence that constitutes the pattern. */
4243 orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
4244 if (orig_stmt)
4246 orig_stmt_info = vinfo_for_stmt (orig_stmt);
4247 gcc_assert (STMT_VINFO_RELATED_STMT (orig_stmt_info) == stmt);
4248 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
4249 gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
4252 /* 3. Check the operands of the operation. The first operands are defined
4253 inside the loop body. The last operand is the reduction variable,
4254 which is defined by the loop-header-phi. */
4256 gcc_assert (is_gimple_assign (stmt));
4258 /* Flatten RHS. */
4259 switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
4261 case GIMPLE_SINGLE_RHS:
4262 op_type = TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt));
4263 if (op_type == ternary_op)
4265 tree rhs = gimple_assign_rhs1 (stmt);
4266 ops[0] = TREE_OPERAND (rhs, 0);
4267 ops[1] = TREE_OPERAND (rhs, 1);
4268 ops[2] = TREE_OPERAND (rhs, 2);
4269 code = TREE_CODE (rhs);
4271 else
4272 return false;
4273 break;
4275 case GIMPLE_BINARY_RHS:
4276 code = gimple_assign_rhs_code (stmt);
4277 op_type = TREE_CODE_LENGTH (code);
4278 gcc_assert (op_type == binary_op);
4279 ops[0] = gimple_assign_rhs1 (stmt);
4280 ops[1] = gimple_assign_rhs2 (stmt);
4281 break;
4283 case GIMPLE_TERNARY_RHS:
4284 code = gimple_assign_rhs_code (stmt);
4285 op_type = TREE_CODE_LENGTH (code);
4286 gcc_assert (op_type == ternary_op);
4287 ops[0] = gimple_assign_rhs1 (stmt);
4288 ops[1] = gimple_assign_rhs2 (stmt);
4289 ops[2] = gimple_assign_rhs3 (stmt);
4290 break;
4292 case GIMPLE_UNARY_RHS:
4293 return false;
4295 default:
4296 gcc_unreachable ();
4299 scalar_dest = gimple_assign_lhs (stmt);
4300 scalar_type = TREE_TYPE (scalar_dest);
4301 if (!POINTER_TYPE_P (scalar_type) && !INTEGRAL_TYPE_P (scalar_type)
4302 && !SCALAR_FLOAT_TYPE_P (scalar_type))
4303 return false;
4305 /* All uses but the last are expected to be defined in the loop.
4306 The last use is the reduction variable. In case of nested cycle this
4307 assumption is not true: we use reduc_index to record the index of the
4308 reduction variable. */
4309 for (i = 0; i < op_type-1; i++)
4311 /* The condition of COND_EXPR is checked in vectorizable_condition(). */
4312 if (i == 0 && code == COND_EXPR)
4313 continue;
4315 is_simple_use = vect_is_simple_use_1 (ops[i], loop_vinfo, NULL,
4316 &def_stmt, &def, &dt, &tem);
4317 if (!vectype_in)
4318 vectype_in = tem;
4319 gcc_assert (is_simple_use);
4321 if (dt != vect_internal_def
4322 && dt != vect_external_def
4323 && dt != vect_constant_def
4324 && dt != vect_induction_def
4325 && !(dt == vect_nested_cycle && nested_cycle))
4326 return false;
4328 if (dt == vect_nested_cycle)
4330 found_nested_cycle_def = true;
4331 reduc_def_stmt = def_stmt;
4332 reduc_index = i;
4336 is_simple_use = vect_is_simple_use_1 (ops[i], loop_vinfo, NULL, &def_stmt,
4337 &def, &dt, &tem);
4338 if (!vectype_in)
4339 vectype_in = tem;
4340 gcc_assert (is_simple_use);
4341 gcc_assert (dt == vect_reduction_def
4342 || dt == vect_nested_cycle
4343 || ((dt == vect_internal_def || dt == vect_external_def
4344 || dt == vect_constant_def || dt == vect_induction_def)
4345 && nested_cycle && found_nested_cycle_def));
4346 if (!found_nested_cycle_def)
4347 reduc_def_stmt = def_stmt;
4349 gcc_assert (gimple_code (reduc_def_stmt) == GIMPLE_PHI);
4350 if (orig_stmt)
4351 gcc_assert (orig_stmt == vect_is_simple_reduction (loop_vinfo,
4352 reduc_def_stmt,
4353 !nested_cycle,
4354 &dummy));
4355 else
4357 gimple tmp = vect_is_simple_reduction (loop_vinfo, reduc_def_stmt,
4358 !nested_cycle, &dummy);
4359 /* We changed STMT to be the first stmt in reduction chain, hence we
4360 check that in this case the first element in the chain is STMT. */
4361 gcc_assert (stmt == tmp
4362 || GROUP_FIRST_ELEMENT (vinfo_for_stmt (tmp)) == stmt);
4365 if (STMT_VINFO_LIVE_P (vinfo_for_stmt (reduc_def_stmt)))
4366 return false;
4368 if (slp_node || PURE_SLP_STMT (stmt_info))
4369 ncopies = 1;
4370 else
4371 ncopies = (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
4372 / TYPE_VECTOR_SUBPARTS (vectype_in));
4374 gcc_assert (ncopies >= 1);
4376 vec_mode = TYPE_MODE (vectype_in);
4378 if (code == COND_EXPR)
4380 if (!vectorizable_condition (stmt, gsi, NULL, ops[reduc_index], 0))
4382 if (vect_print_dump_info (REPORT_DETAILS))
4383 fprintf (vect_dump, "unsupported condition in reduction");
4385 return false;
4388 else
4390 /* 4. Supportable by target? */
4392 /* 4.1. check support for the operation in the loop */
4393 optab = optab_for_tree_code (code, vectype_in, optab_default);
4394 if (!optab)
4396 if (vect_print_dump_info (REPORT_DETAILS))
4397 fprintf (vect_dump, "no optab.");
4399 return false;
4402 if (optab_handler (optab, vec_mode) == CODE_FOR_nothing)
4404 if (vect_print_dump_info (REPORT_DETAILS))
4405 fprintf (vect_dump, "op not supported by target.");
4407 if (GET_MODE_SIZE (vec_mode) != UNITS_PER_WORD
4408 || LOOP_VINFO_VECT_FACTOR (loop_vinfo)
4409 < vect_min_worthwhile_factor (code))
4410 return false;
4412 if (vect_print_dump_info (REPORT_DETAILS))
4413 fprintf (vect_dump, "proceeding using word mode.");
4416 /* Worthwhile without SIMD support? */
4417 if (!VECTOR_MODE_P (TYPE_MODE (vectype_in))
4418 && LOOP_VINFO_VECT_FACTOR (loop_vinfo)
4419 < vect_min_worthwhile_factor (code))
4421 if (vect_print_dump_info (REPORT_DETAILS))
4422 fprintf (vect_dump, "not worthwhile without SIMD support.");
4424 return false;
4428 /* 4.2. Check support for the epilog operation.
4430 If STMT represents a reduction pattern, then the type of the
4431 reduction variable may be different than the type of the rest
4432 of the arguments. For example, consider the case of accumulation
4433 of shorts into an int accumulator. The original code:
4434 S1: int_a = (int) short_a;
4435 orig_stmt-> S2: int_acc = plus <int_a ,int_acc>;
4437 was replaced with:
4438 STMT: int_acc = widen_sum <short_a, int_acc>
4440 This means that:
4441 1. The tree-code that is used to create the vector operation in the
4442 epilog code (that reduces the partial results) is not the
4443 tree-code of STMT, but is rather the tree-code of the original
4444 stmt from the pattern that STMT is replacing. I.e, in the example
4445 above we want to use 'widen_sum' in the loop, but 'plus' in the
4446 epilog.
4447 2. The type (mode) we use to check available target support
4448 for the vector operation to be created in the *epilog*, is
4449 determined by the type of the reduction variable (in the example
4450 above we'd check this: optab_handler (plus_optab, vect_int_mode)).
4451 However the type (mode) we use to check available target support
4452 for the vector operation to be created *inside the loop*, is
4453 determined by the type of the other arguments to STMT (in the
4454 example we'd check this: optab_handler (widen_sum_optab,
4455 vect_short_mode)).
4457 This is contrary to "regular" reductions, in which the types of all
4458 the arguments are the same as the type of the reduction variable.
4459 For "regular" reductions we can therefore use the same vector type
4460 (and also the same tree-code) when generating the epilog code and
4461 when generating the code inside the loop. */
4463 if (orig_stmt)
4465 /* This is a reduction pattern: get the vectype from the type of the
4466 reduction variable, and get the tree-code from orig_stmt. */
4467 orig_code = gimple_assign_rhs_code (orig_stmt);
4468 gcc_assert (vectype_out);
4469 vec_mode = TYPE_MODE (vectype_out);
4471 else
4473 /* Regular reduction: the same vectype and tree-code as used for
4474 the vector code inside the loop can be used for the epilog code. */
4475 orig_code = code;
4478 if (nested_cycle)
4480 def_bb = gimple_bb (reduc_def_stmt);
4481 def_stmt_loop = def_bb->loop_father;
4482 def_arg = PHI_ARG_DEF_FROM_EDGE (reduc_def_stmt,
4483 loop_preheader_edge (def_stmt_loop));
4484 if (TREE_CODE (def_arg) == SSA_NAME
4485 && (def_arg_stmt = SSA_NAME_DEF_STMT (def_arg))
4486 && gimple_code (def_arg_stmt) == GIMPLE_PHI
4487 && flow_bb_inside_loop_p (outer_loop, gimple_bb (def_arg_stmt))
4488 && vinfo_for_stmt (def_arg_stmt)
4489 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_arg_stmt))
4490 == vect_double_reduction_def)
4491 double_reduc = true;
4494 epilog_reduc_code = ERROR_MARK;
4495 if (reduction_code_for_scalar_code (orig_code, &epilog_reduc_code))
4497 reduc_optab = optab_for_tree_code (epilog_reduc_code, vectype_out,
4498 optab_default);
4499 if (!reduc_optab)
4501 if (vect_print_dump_info (REPORT_DETAILS))
4502 fprintf (vect_dump, "no optab for reduction.");
4504 epilog_reduc_code = ERROR_MARK;
4507 if (reduc_optab
4508 && optab_handler (reduc_optab, vec_mode) == CODE_FOR_nothing)
4510 if (vect_print_dump_info (REPORT_DETAILS))
4511 fprintf (vect_dump, "reduc op not supported by target.");
4513 epilog_reduc_code = ERROR_MARK;
4516 else
4518 if (!nested_cycle || double_reduc)
4520 if (vect_print_dump_info (REPORT_DETAILS))
4521 fprintf (vect_dump, "no reduc code for scalar code.");
4523 return false;
4527 if (double_reduc && ncopies > 1)
4529 if (vect_print_dump_info (REPORT_DETAILS))
4530 fprintf (vect_dump, "multiple types in double reduction");
4532 return false;
4535 if (!vec_stmt) /* transformation not required. */
4537 if (!vect_model_reduction_cost (stmt_info, epilog_reduc_code, ncopies))
4538 return false;
4539 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
4540 return true;
4543 /** Transform. **/
4545 if (vect_print_dump_info (REPORT_DETAILS))
4546 fprintf (vect_dump, "transform reduction.");
4548 /* FORNOW: Multiple types are not supported for condition. */
4549 if (code == COND_EXPR)
4550 gcc_assert (ncopies == 1);
4552 /* Create the destination vector */
4553 vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
4555 /* In case the vectorization factor (VF) is bigger than the number
4556 of elements that we can fit in a vectype (nunits), we have to generate
4557 more than one vector stmt, i.e., we need to "unroll" the
4558 vector stmt by a factor VF/nunits. For more details see documentation
4559 in vectorizable_operation. */
4561 /* If the reduction is used in an outer loop we need to generate
4562 VF intermediate results, like so (e.g. for ncopies=2):
4563 r0 = phi (init, r0)
4564 r1 = phi (init, r1)
4565 r0 = x0 + r0;
4566 r1 = x1 + r1;
4567 (i.e. we generate VF results in 2 registers).
4568 In this case we have a separate def-use cycle for each copy, and therefore
4569 for each copy we get the vector def for the reduction variable from the
4570 respective phi node created for this copy.
4572 Otherwise (the reduction is unused in the loop nest), we can combine
4573 together intermediate results, like so (e.g. for ncopies=2):
4574 r = phi (init, r)
4575 r = x0 + r;
4576 r = x1 + r;
4577 (i.e. we generate VF/2 results in a single register).
4578 In this case for each copy we get the vector def for the reduction variable
4579 from the vectorized reduction operation generated in the previous iteration. */
4582 if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope)
4584 single_defuse_cycle = true;
4585 epilog_copies = 1;
4587 else
4588 epilog_copies = ncopies;
4590 prev_stmt_info = NULL;
4591 prev_phi_info = NULL;
4592 if (slp_node)
4594 vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
4595 gcc_assert (TYPE_VECTOR_SUBPARTS (vectype_out)
4596 == TYPE_VECTOR_SUBPARTS (vectype_in));
4598 else
4600 vec_num = 1;
4601 vec_oprnds0 = VEC_alloc (tree, heap, 1);
4602 if (op_type == ternary_op)
4603 vec_oprnds1 = VEC_alloc (tree, heap, 1);
4606 phis = VEC_alloc (gimple, heap, vec_num);
4607 vect_defs = VEC_alloc (tree, heap, vec_num);
4608 if (!slp_node)
4609 VEC_quick_push (tree, vect_defs, NULL_TREE);
4611 for (j = 0; j < ncopies; j++)
4613 if (j == 0 || !single_defuse_cycle)
4615 for (i = 0; i < vec_num; i++)
4617 /* Create the reduction-phi that defines the reduction
4618 operand. */
4619 new_phi = create_phi_node (vec_dest, loop->header);
4620 set_vinfo_for_stmt (new_phi,
4621 new_stmt_vec_info (new_phi, loop_vinfo,
4622 NULL));
4623 if (j == 0 || slp_node)
4624 VEC_quick_push (gimple, phis, new_phi);
4628 if (code == COND_EXPR)
4630 gcc_assert (!slp_node);
4631 vectorizable_condition (stmt, gsi, vec_stmt,
4632 PHI_RESULT (VEC_index (gimple, phis, 0)),
4633 reduc_index);
4634 /* Multiple types are not supported for condition. */
4635 break;
4638 /* Handle uses. */
4639 if (j == 0)
4641 tree op0, op1 = NULL_TREE;
4643 op0 = ops[!reduc_index];
4644 if (op_type == ternary_op)
4646 if (reduc_index == 0)
4647 op1 = ops[2];
4648 else
4649 op1 = ops[1];
4652 if (slp_node)
4653 vect_get_slp_defs (op0, op1, slp_node, &vec_oprnds0, &vec_oprnds1,
4654 -1);
4655 else
4657 loop_vec_def0 = vect_get_vec_def_for_operand (ops[!reduc_index],
4658 stmt, NULL);
4659 VEC_quick_push (tree, vec_oprnds0, loop_vec_def0);
4660 if (op_type == ternary_op)
4662 loop_vec_def1 = vect_get_vec_def_for_operand (op1, stmt,
4663 NULL);
4664 VEC_quick_push (tree, vec_oprnds1, loop_vec_def1);
4668 else
4670 if (!slp_node)
4672 enum vect_def_type dt = vect_unknown_def_type; /* Dummy */
4673 loop_vec_def0 = vect_get_vec_def_for_stmt_copy (dt, loop_vec_def0);
4674 VEC_replace (tree, vec_oprnds0, 0, loop_vec_def0);
4675 if (op_type == ternary_op)
4677 loop_vec_def1 = vect_get_vec_def_for_stmt_copy (dt,
4678 loop_vec_def1);
4679 VEC_replace (tree, vec_oprnds1, 0, loop_vec_def1);
4683 if (single_defuse_cycle)
4684 reduc_def = gimple_assign_lhs (new_stmt);
4686 STMT_VINFO_RELATED_STMT (prev_phi_info) = new_phi;
4689 FOR_EACH_VEC_ELT (tree, vec_oprnds0, i, def0)
4691 if (slp_node)
4692 reduc_def = PHI_RESULT (VEC_index (gimple, phis, i));
4693 else
4695 if (!single_defuse_cycle || j == 0)
4696 reduc_def = PHI_RESULT (new_phi);
4699 def1 = ((op_type == ternary_op)
4700 ? VEC_index (tree, vec_oprnds1, i) : NULL);
4701 if (op_type == binary_op)
4703 if (reduc_index == 0)
4704 expr = build2 (code, vectype_out, reduc_def, def0);
4705 else
4706 expr = build2 (code, vectype_out, def0, reduc_def);
4708 else
4710 if (reduc_index == 0)
4711 expr = build3 (code, vectype_out, reduc_def, def0, def1);
4712 else
4714 if (reduc_index == 1)
4715 expr = build3 (code, vectype_out, def0, reduc_def, def1);
4716 else
4717 expr = build3 (code, vectype_out, def0, def1, reduc_def);
4721 new_stmt = gimple_build_assign (vec_dest, expr);
4722 new_temp = make_ssa_name (vec_dest, new_stmt);
4723 gimple_assign_set_lhs (new_stmt, new_temp);
4724 vect_finish_stmt_generation (stmt, new_stmt, gsi);
4726 if (slp_node)
4728 VEC_quick_push (gimple, SLP_TREE_VEC_STMTS (slp_node), new_stmt);
4729 VEC_quick_push (tree, vect_defs, new_temp);
4731 else
4732 VEC_replace (tree, vect_defs, 0, new_temp);
4735 if (slp_node)
4736 continue;
4738 if (j == 0)
4739 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
4740 else
4741 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
4743 prev_stmt_info = vinfo_for_stmt (new_stmt);
4744 prev_phi_info = vinfo_for_stmt (new_phi);
4747 /* Finalize the reduction-phi (set its arguments) and create the
4748 epilog reduction code. */
4749 if ((!single_defuse_cycle || code == COND_EXPR) && !slp_node)
4751 new_temp = gimple_assign_lhs (*vec_stmt);
4752 VEC_replace (tree, vect_defs, 0, new_temp);
4755 vect_create_epilog_for_reduction (vect_defs, stmt, epilog_copies,
4756 epilog_reduc_code, phis, reduc_index,
4757 double_reduc, slp_node);
4759 VEC_free (gimple, heap, phis);
4760 VEC_free (tree, heap, vec_oprnds0);
4761 if (vec_oprnds1)
4762 VEC_free (tree, heap, vec_oprnds1);
4764 return true;
4767 /* Function vect_min_worthwhile_factor.
4769 For a loop where we could vectorize the operation indicated by CODE,
4770 return the minimum vectorization factor that makes it worthwhile
4771 to use generic vectors. */
4772 int
4773 vect_min_worthwhile_factor (enum tree_code code)
4775 switch (code)
4777 case PLUS_EXPR:
4778 case MINUS_EXPR:
4779 case NEGATE_EXPR:
4780 return 4;
4782 case BIT_AND_EXPR:
4783 case BIT_IOR_EXPR:
4784 case BIT_XOR_EXPR:
4785 case BIT_NOT_EXPR:
4786 return 2;
4788 default:
4789 return INT_MAX;
4794 /* Function vectorizable_induction
4796 Check if PHI performs an induction computation that can be vectorized.
4797 If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
4798 phi to replace it, put it in VEC_STMT, and add it to the same basic block.
4799 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
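/* For example (illustrative, VF == 4): the scalar induction i = 0, 1,
   2, ... becomes a vector phi whose initial value is [0,1,2,3] and
   whose latch definition adds the step vector [4,4,4,4] on every
   vector iteration. */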
4801 bool
4802 vectorizable_induction (gimple phi, gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
4803 gimple *vec_stmt)
4805 stmt_vec_info stmt_info = vinfo_for_stmt (phi);
4806 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4807 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4808 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4809 int nunits = TYPE_VECTOR_SUBPARTS (vectype);
4810 int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
4811 tree vec_def;
4813 gcc_assert (ncopies >= 1);
4814 /* FORNOW. This restriction should be relaxed. */
4815 if (nested_in_vect_loop_p (loop, phi) && ncopies > 1)
4817 if (vect_print_dump_info (REPORT_DETAILS))
4818 fprintf (vect_dump, "multiple types in nested loop.");
4819 return false;
4822 if (!STMT_VINFO_RELEVANT_P (stmt_info))
4823 return false;
4825 /* FORNOW: SLP not supported. */
4826 if (STMT_SLP_TYPE (stmt_info))
4827 return false;
4829 gcc_assert (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def);
4831 if (gimple_code (phi) != GIMPLE_PHI)
4832 return false;
4834 if (!vec_stmt) /* transformation not required. */
4836 STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
4837 if (vect_print_dump_info (REPORT_DETAILS))
4838 fprintf (vect_dump, "=== vectorizable_induction ===");
4839 vect_model_induction_cost (stmt_info, ncopies);
4840 return true;
4843 /** Transform. **/
4845 if (vect_print_dump_info (REPORT_DETAILS))
4846 fprintf (vect_dump, "transform induction phi.");
4848 vec_def = get_initial_def_for_induction (phi);
4849 *vec_stmt = SSA_NAME_DEF_STMT (vec_def);
4850 return true;
4853 /* Function vectorizable_live_operation.
4855 STMT computes a value that is used outside the loop. Check if
4856 it can be supported. */
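/* Illustrative example of a supported case (b and c loop-invariant):

     for (i = 0; i < N; i++)
       { t = b + c; a[i] = t; }
     use (t);

   The scalar stmt 't = b + c' can stay in place unvectorized, and the
   last value it computes is used after the loop. */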
4858 bool
4859 vectorizable_live_operation (gimple stmt,
4860 gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
4861 gimple *vec_stmt ATTRIBUTE_UNUSED)
4863 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4864 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4865 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4866 int i;
4867 int op_type;
4868 tree op;
4869 tree def;
4870 gimple def_stmt;
4871 enum vect_def_type dt;
4872 enum tree_code code;
4873 enum gimple_rhs_class rhs_class;
4875 gcc_assert (STMT_VINFO_LIVE_P (stmt_info));
4877 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def)
4878 return false;
4880 if (!is_gimple_assign (stmt))
4881 return false;
4883 if (TREE_CODE (gimple_assign_lhs (stmt)) != SSA_NAME)
4884 return false;
4886 /* FORNOW. CHECKME. */
4887 if (nested_in_vect_loop_p (loop, stmt))
4888 return false;
4890 code = gimple_assign_rhs_code (stmt);
4891 op_type = TREE_CODE_LENGTH (code);
4892 rhs_class = get_gimple_rhs_class (code);
4893 gcc_assert (rhs_class != GIMPLE_UNARY_RHS || op_type == unary_op);
4894 gcc_assert (rhs_class != GIMPLE_BINARY_RHS || op_type == binary_op);
4896 /* FORNOW: support only if all uses are invariant. This means
4897 that the scalar operations can remain in place, unvectorized.
4898 The original last scalar value that they compute will be used. */
4900 for (i = 0; i < op_type; i++)
4902 if (rhs_class == GIMPLE_SINGLE_RHS)
4903 op = TREE_OPERAND (gimple_op (stmt, 1), i);
4904 else
4905 op = gimple_op (stmt, i + 1);
4906 if (op
4907 && !vect_is_simple_use (op, loop_vinfo, NULL, &def_stmt, &def, &dt))
4909 if (vect_print_dump_info (REPORT_DETAILS))
4910 fprintf (vect_dump, "use not simple.");
4911 return false;
4914 if (dt != vect_external_def && dt != vect_constant_def)
4915 return false;
4918 /* No transformation is required for the cases we currently support. */
4919 return true;
4922 /* Kill any debug uses outside LOOP of SSA names defined in STMT. */
4924 static void
4925 vect_loop_kill_debug_uses (struct loop *loop, gimple stmt)
4927 ssa_op_iter op_iter;
4928 imm_use_iterator imm_iter;
4929 def_operand_p def_p;
4930 gimple ustmt;
4932 FOR_EACH_PHI_OR_STMT_DEF (def_p, stmt, op_iter, SSA_OP_DEF)
4934 FOR_EACH_IMM_USE_STMT (ustmt, imm_iter, DEF_FROM_PTR (def_p))
4936 basic_block bb;
4938 if (!is_gimple_debug (ustmt))
4939 continue;
4941 bb = gimple_bb (ustmt);
4943 if (!flow_bb_inside_loop_p (loop, bb))
4945 if (gimple_debug_bind_p (ustmt))
4947 if (vect_print_dump_info (REPORT_DETAILS))
4948 fprintf (vect_dump, "killing debug use");
4950 gimple_debug_bind_reset_value (ustmt);
4951 update_stmt (ustmt);
4953 else
4954 gcc_unreachable ();
4960 /* Function vect_transform_loop.
4962 The analysis phase has determined that the loop is vectorizable.
4963 Vectorize the loop - create vectorized stmts to replace the scalar
4964 stmts in the loop, and update the loop exit condition. */
4966 void
4967 vect_transform_loop (loop_vec_info loop_vinfo)
4969 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4970 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
4971 int nbbs = loop->num_nodes;
4972 gimple_stmt_iterator si;
4973 int i;
4974 tree ratio = NULL;
4975 int vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
4976 bool strided_store;
4977 bool slp_scheduled = false;
4978 unsigned int nunits;
4979 tree cond_expr = NULL_TREE;
4980 gimple_seq cond_expr_stmt_list = NULL;
4981 bool do_peeling_for_loop_bound;
4983 if (vect_print_dump_info (REPORT_DETAILS))
4984 fprintf (vect_dump, "=== vec_transform_loop ===");
4986 /* Peel the loop if there are data refs with unknown alignment.
4987 Only one data ref with unknown alignment is allowed. */
4989 if (LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo))
4990 vect_do_peeling_for_alignment (loop_vinfo);
4992 do_peeling_for_loop_bound
4993 = (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
4994 || (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
4995 && LOOP_VINFO_INT_NITERS (loop_vinfo) % vectorization_factor != 0)
4996 || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo));
4998 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
4999 || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
5000 vect_loop_versioning (loop_vinfo,
5001 !do_peeling_for_loop_bound,
5002 &cond_expr, &cond_expr_stmt_list);
5004 /* If the loop has a symbolic number of iterations 'n' (i.e. it's not a
5005 compile time constant), or it is a constant that is not divisible by the
5006 vectorization factor, then an epilog loop needs to be created.
5007 We therefore duplicate the loop: the original loop will be vectorized,
5008 and will compute the first (n/VF) iterations. The second copy of the loop
5009 will remain scalar and will compute the remaining (n%VF) iterations.
5010 (VF is the vectorization factor). */
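/* For instance (illustrative numbers): with n == 103 and VF == 4, the
   vectorized loop executes 25 iterations covering 100 elements, and the
   scalar epilog loop handles the remaining 3. */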
5012 if (do_peeling_for_loop_bound)
5013 vect_do_peeling_for_loop_bound (loop_vinfo, &ratio,
5014 cond_expr, cond_expr_stmt_list);
5015 else
5016 ratio = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
5017 LOOP_VINFO_INT_NITERS (loop_vinfo) / vectorization_factor);
5019 /* 1) Make sure the loop header has exactly two entries
5020 2) Make sure we have a preheader basic block. */
5022 gcc_assert (EDGE_COUNT (loop->header->preds) == 2);
5024 split_edge (loop_preheader_edge (loop));
5026 /* FORNOW: the vectorizer supports only loops whose body consists
5027 of one basic block (header + empty latch). When the vectorizer
5028 supports more involved loop forms, the order in which the BBs are
5029 traversed will need to be reconsidered. */
5031 for (i = 0; i < nbbs; i++)
5033 basic_block bb = bbs[i];
5034 stmt_vec_info stmt_info;
5035 gimple phi;
5037 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
5039 phi = gsi_stmt (si);
5040 if (vect_print_dump_info (REPORT_DETAILS))
5042 fprintf (vect_dump, "------>vectorizing phi: ");
5043 print_gimple_stmt (vect_dump, phi, 0, TDF_SLIM);
5045 stmt_info = vinfo_for_stmt (phi);
5046 if (!stmt_info)
5047 continue;
5049 if (MAY_HAVE_DEBUG_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
5050 vect_loop_kill_debug_uses (loop, phi);
5052 if (!STMT_VINFO_RELEVANT_P (stmt_info)
5053 && !STMT_VINFO_LIVE_P (stmt_info))
5054 continue;
5056 if ((TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info))
5057 != (unsigned HOST_WIDE_INT) vectorization_factor)
5058 && vect_print_dump_info (REPORT_DETAILS))
5059 fprintf (vect_dump, "multiple-types.");
5061 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def)
5063 if (vect_print_dump_info (REPORT_DETAILS))
5064 fprintf (vect_dump, "transform phi.");
5065 vect_transform_stmt (phi, NULL, NULL, NULL, NULL);
5069 for (si = gsi_start_bb (bb); !gsi_end_p (si);)
5071 gimple stmt = gsi_stmt (si);
5072 bool is_store;
5074 if (vect_print_dump_info (REPORT_DETAILS))
5076 fprintf (vect_dump, "------>vectorizing statement: ");
5077 print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
5080 stmt_info = vinfo_for_stmt (stmt);
5082 /* vector stmts created in the outer-loop during vectorization of
5083 stmts in an inner-loop may not have a stmt_info, and do not
5084 need to be vectorized. */
5085 if (!stmt_info)
5087 gsi_next (&si);
5088 continue;
5091 if (MAY_HAVE_DEBUG_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
5092 vect_loop_kill_debug_uses (loop, stmt);
5094 if (!STMT_VINFO_RELEVANT_P (stmt_info)
5095 && !STMT_VINFO_LIVE_P (stmt_info))
5097 gsi_next (&si);
5098 continue;
5101 gcc_assert (STMT_VINFO_VECTYPE (stmt_info));
5102 nunits =
5103 (unsigned int) TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
5104 if (!STMT_SLP_TYPE (stmt_info)
5105 && nunits != (unsigned int) vectorization_factor
5106 && vect_print_dump_info (REPORT_DETAILS))
5107 /* For SLP, VF is set according to the unrolling factor, and not to
5108 the vector size; hence for SLP this print is not valid. */
5109 fprintf (vect_dump, "multiple-types.");
5111 /* SLP. Schedule all the SLP instances when the first SLP stmt is
5112 reached. */
5113 if (STMT_SLP_TYPE (stmt_info))
5115 if (!slp_scheduled)
5117 slp_scheduled = true;
5119 if (vect_print_dump_info (REPORT_DETAILS))
5120 fprintf (vect_dump, "=== scheduling SLP instances ===");
5122 vect_schedule_slp (loop_vinfo, NULL);
5125 /* Hybrid SLP stmts must be vectorized in addition to SLP. */
5126 if (!vinfo_for_stmt (stmt) || PURE_SLP_STMT (stmt_info))
5128 gsi_next (&si);
5129 continue;
5133 /* -------- vectorize statement ------------ */
5134 if (vect_print_dump_info (REPORT_DETAILS))
5135 fprintf (vect_dump, "transform statement.");
5137 strided_store = false;
5138 is_store = vect_transform_stmt (stmt, &si, &strided_store, NULL, NULL);
5139 if (is_store)
5141 if (STMT_VINFO_STRIDED_ACCESS (stmt_info))
5143 /* Interleaving. If IS_STORE is TRUE, the vectorization of the
5144 interleaving chain was completed - free all the stores in
5145 the chain. */
5146 vect_remove_stores (GROUP_FIRST_ELEMENT (stmt_info));
5147 gsi_remove (&si, true);
5148 continue;
5150 else
5152 /* Free the attached stmt_vec_info and remove the stmt. */
5153 free_stmt_vec_info (stmt);
5154 gsi_remove (&si, true);
5155 continue;
5158 gsi_next (&si);
5159 } /* stmts in BB */
5160 } /* BBs in loop */
5162 slpeel_make_loop_iterate_ntimes (loop, ratio);
5164 /* The memory tags and pointers in vectorized statements need to
5165 have their SSA forms updated. FIXME, why can't this be delayed
5166 until all the loops have been transformed? */
5167 update_ssa (TODO_update_ssa);
5169 if (vect_print_dump_info (REPORT_VECTORIZED_LOCATIONS))
5170 fprintf (vect_dump, "LOOP VECTORIZED.");
5171 if (loop->inner && vect_print_dump_info (REPORT_VECTORIZED_LOCATIONS))
5172 fprintf (vect_dump, "OUTER LOOP VECTORIZED.");