* c-decl.c (finish_decl): Don't call get_pending_sizes.
[official-gcc.git] / gcc / tree-vect-loop.c
blob8f3b8e61c642c5142f2baef4cf2d33f235660b8a
1 /* Loop Vectorization
2 Copyright (C) 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010
3 Free Software Foundation, Inc.
4 Contributed by Dorit Naishlos <dorit@il.ibm.com> and
5 Ira Rosen <irar@il.ibm.com>
7 This file is part of GCC.
9 GCC is free software; you can redistribute it and/or modify it under
10 the terms of the GNU General Public License as published by the Free
11 Software Foundation; either version 3, or (at your option) any later
12 version.
14 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
15 WARRANTY; without even the implied warranty of MERCHANTABILITY or
16 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
17 for more details.
19 You should have received a copy of the GNU General Public License
20 along with GCC; see the file COPYING3. If not see
21 <http://www.gnu.org/licenses/>. */
23 #include "config.h"
24 #include "system.h"
25 #include "coretypes.h"
26 #include "tm.h"
27 #include "ggc.h"
28 #include "tree.h"
29 #include "basic-block.h"
30 #include "tree-pretty-print.h"
31 #include "gimple-pretty-print.h"
32 #include "tree-flow.h"
33 #include "tree-dump.h"
34 #include "cfgloop.h"
35 #include "cfglayout.h"
36 #include "expr.h"
37 #include "recog.h"
38 #include "optabs.h"
39 #include "params.h"
40 #include "diagnostic-core.h"
41 #include "tree-chrec.h"
42 #include "tree-scalar-evolution.h"
43 #include "tree-vectorizer.h"
44 #include "target.h"
46 /* Loop Vectorization Pass.
48 This pass tries to vectorize loops.
50 For example, the vectorizer transforms the following simple loop:
52 short a[N]; short b[N]; short c[N]; int i;
54 for (i=0; i<N; i++){
55 a[i] = b[i] + c[i];
58 as if it was manually vectorized by rewriting the source code into:
60 typedef int __attribute__((mode(V8HI))) v8hi;
61 short a[N]; short b[N]; short c[N]; int i;
62 v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
63 v8hi va, vb, vc;
65 for (i=0; i<N/8; i++){
66 vb = pb[i];
67 vc = pc[i];
68 va = vb + vc;
69 pa[i] = va;
72 The main entry to this pass is vectorize_loops(), in which
73 the vectorizer applies a set of analyses on a given set of loops,
74 followed by the actual vectorization transformation for the loops that
75 had successfully passed the analysis phase.
76 Throughout this pass we make a distinction between two types of
77 data: scalars (which are represented by SSA_NAMES), and memory references
78 ("data-refs"). These two types of data require different handling both
79 during analysis and transformation. The types of data-refs that the
80 vectorizer currently supports are ARRAY_REFS which base is an array DECL
81 (not a pointer), and INDIRECT_REFS through pointers; both array and pointer
82 accesses are required to have a simple (consecutive) access pattern.
84 Analysis phase:
85 ===============
86 The driver for the analysis phase is vect_analyze_loop().
87 It applies a set of analyses, some of which rely on the scalar evolution
88 analyzer (scev) developed by Sebastian Pop.
90 During the analysis phase the vectorizer records some information
91 per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
92 loop, as well as general information about the loop as a whole, which is
93 recorded in a "loop_vec_info" struct attached to each loop.
95 Transformation phase:
96 =====================
97 The loop transformation phase scans all the stmts in the loop, and
98 creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
99 the loop that needs to be vectorized. It inserts the vector code sequence
100 just before the scalar stmt S, and records a pointer to the vector code
101 in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
102 attached to S). This pointer will be used for the vectorization of following
103 stmts which use the def of stmt S. Stmt S is removed if it writes to memory;
104 otherwise, we rely on dead code elimination for removing it.
106 For example, say stmt S1 was vectorized into stmt VS1:
108 VS1: vb = px[i];
109 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
110 S2: a = b;
112 To vectorize stmt S2, the vectorizer first finds the stmt that defines
113 the operand 'b' (S1), and gets the relevant vector def 'vb' from the
114 vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)). The
115 resulting sequence would be:
117 VS1: vb = px[i];
118 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
119 VS2: va = vb;
120 S2: a = b; STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2
122 Operands that are not SSA_NAMEs, are data-refs that appear in
123 load/store operations (like 'x[i]' in S1), and are handled differently.
125 Target modeling:
126 =================
127 Currently the only target specific information that is used is the
128 size of the vector (in bytes) - "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".
129 Targets that can support different sizes of vectors, for now will need
130 to specify one value for "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD". More
131 flexibility will be added in the future.
133 Since we only vectorize operations which vector form can be
134 expressed using existing tree codes, to verify that an operation is
135 supported, the vectorizer checks the relevant optab at the relevant
136 machine_mode (e.g, optab_handler (add_optab, V8HImode)). If
137 the value found is CODE_FOR_nothing, then there's no target support, and
138 we can't vectorize the stmt.
140 For additional information on this project see:
141 http://gcc.gnu.org/projects/tree-ssa/vectorization.html
144 /* Function vect_determine_vectorization_factor
146 Determine the vectorization factor (VF). VF is the number of data elements
147 that are operated upon in parallel in a single iteration of the vectorized
148 loop. For example, when vectorizing a loop that operates on 4byte elements,
149 on a target with vector size (VS) 16byte, the VF is set to 4, since 4
150 elements can fit in a single vector register.
152 We currently support vectorization of loops in which all types operated upon
153 are of the same size. Therefore this function currently sets VF according to
154 the size of the types operated upon, and fails if there are multiple sizes
155 in the loop.
157 VF is also the factor by which the loop iterations are strip-mined, e.g.:
158 original loop:
159 for (i=0; i<N; i++){
160 a[i] = b[i] + c[i];
163 vectorized loop:
164 for (i=0; i<N; i+=VF){
165 a[i:VF] = b[i:VF] + c[i:VF];
169 static bool
170 vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
172 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
173 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
174 int nbbs = loop->num_nodes;
175 gimple_stmt_iterator si;
176 unsigned int vectorization_factor = 0;
177 tree scalar_type;
178 gimple phi;
179 tree vectype;
180 unsigned int nunits;
181 stmt_vec_info stmt_info;
182 int i;
183 HOST_WIDE_INT dummy;
185 if (vect_print_dump_info (REPORT_DETAILS))
186 fprintf (vect_dump, "=== vect_determine_vectorization_factor ===");
188 for (i = 0; i < nbbs; i++)
190 basic_block bb = bbs[i];
192 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
194 phi = gsi_stmt (si);
195 stmt_info = vinfo_for_stmt (phi);
196 if (vect_print_dump_info (REPORT_DETAILS))
198 fprintf (vect_dump, "==> examining phi: ");
199 print_gimple_stmt (vect_dump, phi, 0, TDF_SLIM);
202 gcc_assert (stmt_info);
204 if (STMT_VINFO_RELEVANT_P (stmt_info))
206 gcc_assert (!STMT_VINFO_VECTYPE (stmt_info));
207 scalar_type = TREE_TYPE (PHI_RESULT (phi));
209 if (vect_print_dump_info (REPORT_DETAILS))
211 fprintf (vect_dump, "get vectype for scalar type: ");
212 print_generic_expr (vect_dump, scalar_type, TDF_SLIM);
215 vectype = get_vectype_for_scalar_type (scalar_type);
216 if (!vectype)
218 if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
220 fprintf (vect_dump,
221 "not vectorized: unsupported data-type ");
222 print_generic_expr (vect_dump, scalar_type, TDF_SLIM);
224 return false;
226 STMT_VINFO_VECTYPE (stmt_info) = vectype;
228 if (vect_print_dump_info (REPORT_DETAILS))
230 fprintf (vect_dump, "vectype: ");
231 print_generic_expr (vect_dump, vectype, TDF_SLIM);
234 nunits = TYPE_VECTOR_SUBPARTS (vectype);
235 if (vect_print_dump_info (REPORT_DETAILS))
236 fprintf (vect_dump, "nunits = %d", nunits);
238 if (!vectorization_factor
239 || (nunits > vectorization_factor))
240 vectorization_factor = nunits;
244 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
246 tree vf_vectype;
247 gimple stmt = gsi_stmt (si);
248 stmt_info = vinfo_for_stmt (stmt);
250 if (vect_print_dump_info (REPORT_DETAILS))
252 fprintf (vect_dump, "==> examining statement: ");
253 print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
256 gcc_assert (stmt_info);
258 /* skip stmts which do not need to be vectorized. */
259 if (!STMT_VINFO_RELEVANT_P (stmt_info)
260 && !STMT_VINFO_LIVE_P (stmt_info))
262 if (vect_print_dump_info (REPORT_DETAILS))
263 fprintf (vect_dump, "skip.");
264 continue;
267 if (gimple_get_lhs (stmt) == NULL_TREE)
269 if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
271 fprintf (vect_dump, "not vectorized: irregular stmt.");
272 print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
274 return false;
277 if (VECTOR_MODE_P (TYPE_MODE (gimple_expr_type (stmt))))
279 if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
281 fprintf (vect_dump, "not vectorized: vector stmt in loop:");
282 print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
284 return false;
287 if (STMT_VINFO_VECTYPE (stmt_info))
289 /* The only case when a vectype had been already set is for stmts
290 that contain a dataref, or for "pattern-stmts" (stmts generated
291 by the vectorizer to represent/replace a certain idiom). */
292 gcc_assert (STMT_VINFO_DATA_REF (stmt_info)
293 || is_pattern_stmt_p (stmt_info));
294 vectype = STMT_VINFO_VECTYPE (stmt_info);
296 else
298 gcc_assert (!STMT_VINFO_DATA_REF (stmt_info)
299 && !is_pattern_stmt_p (stmt_info));
301 scalar_type = TREE_TYPE (gimple_get_lhs (stmt));
302 if (vect_print_dump_info (REPORT_DETAILS))
304 fprintf (vect_dump, "get vectype for scalar type: ");
305 print_generic_expr (vect_dump, scalar_type, TDF_SLIM);
307 vectype = get_vectype_for_scalar_type (scalar_type);
308 if (!vectype)
310 if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
312 fprintf (vect_dump,
313 "not vectorized: unsupported data-type ");
314 print_generic_expr (vect_dump, scalar_type, TDF_SLIM);
316 return false;
319 STMT_VINFO_VECTYPE (stmt_info) = vectype;
322 /* The vectorization factor is according to the smallest
323 scalar type (or the largest vector size, but we only
324 support one vector size per loop). */
325 scalar_type = vect_get_smallest_scalar_type (stmt, &dummy,
326 &dummy);
327 if (vect_print_dump_info (REPORT_DETAILS))
329 fprintf (vect_dump, "get vectype for scalar type: ");
330 print_generic_expr (vect_dump, scalar_type, TDF_SLIM);
332 vf_vectype = get_vectype_for_scalar_type (scalar_type);
333 if (!vf_vectype)
335 if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
337 fprintf (vect_dump,
338 "not vectorized: unsupported data-type ");
339 print_generic_expr (vect_dump, scalar_type, TDF_SLIM);
341 return false;
344 if ((GET_MODE_SIZE (TYPE_MODE (vectype))
345 != GET_MODE_SIZE (TYPE_MODE (vf_vectype))))
347 if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
349 fprintf (vect_dump,
350 "not vectorized: different sized vector "
351 "types in statement, ");
352 print_generic_expr (vect_dump, vectype, TDF_SLIM);
353 fprintf (vect_dump, " and ");
354 print_generic_expr (vect_dump, vf_vectype, TDF_SLIM);
356 return false;
359 if (vect_print_dump_info (REPORT_DETAILS))
361 fprintf (vect_dump, "vectype: ");
362 print_generic_expr (vect_dump, vf_vectype, TDF_SLIM);
365 nunits = TYPE_VECTOR_SUBPARTS (vf_vectype);
366 if (vect_print_dump_info (REPORT_DETAILS))
367 fprintf (vect_dump, "nunits = %d", nunits);
369 if (!vectorization_factor
370 || (nunits > vectorization_factor))
371 vectorization_factor = nunits;
375 /* TODO: Analyze cost. Decide if worth while to vectorize. */
376 if (vect_print_dump_info (REPORT_DETAILS))
377 fprintf (vect_dump, "vectorization factor = %d", vectorization_factor);
378 if (vectorization_factor <= 1)
380 if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
381 fprintf (vect_dump, "not vectorized: unsupported data-type");
382 return false;
384 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
386 return true;
390 /* Function vect_is_simple_iv_evolution.
392 FORNOW: A simple evolution of an induction variables in the loop is
393 considered a polynomial evolution with constant step. */
395 static bool
396 vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init,
397 tree * step)
399 tree init_expr;
400 tree step_expr;
401 tree evolution_part = evolution_part_in_loop_num (access_fn, loop_nb);
403 /* When there is no evolution in this loop, the evolution function
404 is not "simple". */
405 if (evolution_part == NULL_TREE)
406 return false;
408 /* When the evolution is a polynomial of degree >= 2
409 the evolution function is not "simple". */
410 if (tree_is_chrec (evolution_part))
411 return false;
413 step_expr = evolution_part;
414 init_expr = unshare_expr (initial_condition_in_loop_num (access_fn, loop_nb));
416 if (vect_print_dump_info (REPORT_DETAILS))
418 fprintf (vect_dump, "step: ");
419 print_generic_expr (vect_dump, step_expr, TDF_SLIM);
420 fprintf (vect_dump, ", init: ");
421 print_generic_expr (vect_dump, init_expr, TDF_SLIM);
424 *init = init_expr;
425 *step = step_expr;
427 if (TREE_CODE (step_expr) != INTEGER_CST)
429 if (vect_print_dump_info (REPORT_DETAILS))
430 fprintf (vect_dump, "step unknown.");
431 return false;
434 return true;
437 /* Function vect_analyze_scalar_cycles_1.
439 Examine the cross iteration def-use cycles of scalar variables
440 in LOOP. LOOP_VINFO represents the loop that is now being
441 considered for vectorization (can be LOOP, or an outer-loop
442 enclosing LOOP). */
444 static void
445 vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, struct loop *loop)
447 basic_block bb = loop->header;
448 tree dumy;
449 VEC(gimple,heap) *worklist = VEC_alloc (gimple, heap, 64);
450 gimple_stmt_iterator gsi;
451 bool double_reduc;
453 if (vect_print_dump_info (REPORT_DETAILS))
454 fprintf (vect_dump, "=== vect_analyze_scalar_cycles ===");
456 /* First - identify all inductions. Reduction detection assumes that all the
457 inductions have been identified, therefore, this order must not be
458 changed. */
459 for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
461 gimple phi = gsi_stmt (gsi);
462 tree access_fn = NULL;
463 tree def = PHI_RESULT (phi);
464 stmt_vec_info stmt_vinfo = vinfo_for_stmt (phi);
466 if (vect_print_dump_info (REPORT_DETAILS))
468 fprintf (vect_dump, "Analyze phi: ");
469 print_gimple_stmt (vect_dump, phi, 0, TDF_SLIM);
472 /* Skip virtual phi's. The data dependences that are associated with
473 virtual defs/uses (i.e., memory accesses) are analyzed elsewhere. */
474 if (!is_gimple_reg (SSA_NAME_VAR (def)))
475 continue;
477 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_unknown_def_type;
479 /* Analyze the evolution function. */
480 access_fn = analyze_scalar_evolution (loop, def);
481 if (access_fn)
482 STRIP_NOPS (access_fn);
483 if (access_fn && vect_print_dump_info (REPORT_DETAILS))
485 fprintf (vect_dump, "Access function of PHI: ");
486 print_generic_expr (vect_dump, access_fn, TDF_SLIM);
489 if (!access_fn
490 || !vect_is_simple_iv_evolution (loop->num, access_fn, &dumy, &dumy))
492 VEC_safe_push (gimple, heap, worklist, phi);
493 continue;
496 if (vect_print_dump_info (REPORT_DETAILS))
497 fprintf (vect_dump, "Detected induction.");
498 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_induction_def;
502 /* Second - identify all reductions and nested cycles. */
503 while (VEC_length (gimple, worklist) > 0)
505 gimple phi = VEC_pop (gimple, worklist);
506 tree def = PHI_RESULT (phi);
507 stmt_vec_info stmt_vinfo = vinfo_for_stmt (phi);
508 gimple reduc_stmt;
509 bool nested_cycle;
511 if (vect_print_dump_info (REPORT_DETAILS))
513 fprintf (vect_dump, "Analyze phi: ");
514 print_gimple_stmt (vect_dump, phi, 0, TDF_SLIM);
517 gcc_assert (is_gimple_reg (SSA_NAME_VAR (def)));
518 gcc_assert (STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_unknown_def_type);
520 nested_cycle = (loop != LOOP_VINFO_LOOP (loop_vinfo));
521 reduc_stmt = vect_force_simple_reduction (loop_vinfo, phi, !nested_cycle,
522 &double_reduc);
523 if (reduc_stmt)
525 if (double_reduc)
527 if (vect_print_dump_info (REPORT_DETAILS))
528 fprintf (vect_dump, "Detected double reduction.");
530 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_double_reduction_def;
531 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
532 vect_double_reduction_def;
534 else
536 if (nested_cycle)
538 if (vect_print_dump_info (REPORT_DETAILS))
539 fprintf (vect_dump, "Detected vectorizable nested cycle.");
541 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_nested_cycle;
542 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
543 vect_nested_cycle;
545 else
547 if (vect_print_dump_info (REPORT_DETAILS))
548 fprintf (vect_dump, "Detected reduction.");
550 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def;
551 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
552 vect_reduction_def;
553 /* Store the reduction cycles for possible vectorization in
554 loop-aware SLP. */
555 VEC_safe_push (gimple, heap,
556 LOOP_VINFO_REDUCTIONS (loop_vinfo),
557 reduc_stmt);
561 else
562 if (vect_print_dump_info (REPORT_DETAILS))
563 fprintf (vect_dump, "Unknown def-use cycle pattern.");
566 VEC_free (gimple, heap, worklist);
570 /* Function vect_analyze_scalar_cycles.
572 Examine the cross iteration def-use cycles of scalar variables, by
573 analyzing the loop-header PHIs of scalar variables. Classify each
574 cycle as one of the following: invariant, induction, reduction, unknown.
575 We do that for the loop represented by LOOP_VINFO, and also to its
576 inner-loop, if exists.
577 Examples for scalar cycles:
579 Example1: reduction:
581 loop1:
582 for (i=0; i<N; i++)
583 sum += a[i];
585 Example2: induction:
587 loop2:
588 for (i=0; i<N; i++)
589 a[i] = i; */
591 static void
592 vect_analyze_scalar_cycles (loop_vec_info loop_vinfo)
594 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
596 vect_analyze_scalar_cycles_1 (loop_vinfo, loop);
598 /* When vectorizing an outer-loop, the inner-loop is executed sequentially.
599 Reductions in such inner-loop therefore have different properties than
600 the reductions in the nest that gets vectorized:
601 1. When vectorized, they are executed in the same order as in the original
602 scalar loop, so we can't change the order of computation when
603 vectorizing them.
604 2. FIXME: Inner-loop reductions can be used in the inner-loop, so the
605 current checks are too strict. */
607 if (loop->inner)
608 vect_analyze_scalar_cycles_1 (loop_vinfo, loop->inner);
611 /* Function vect_get_loop_niters.
613 Determine how many iterations the loop is executed.
614 If an expression that represents the number of iterations
615 can be constructed, place it in NUMBER_OF_ITERATIONS.
616 Return the loop exit condition. */
618 static gimple
619 vect_get_loop_niters (struct loop *loop, tree *number_of_iterations)
621 tree niters;
623 if (vect_print_dump_info (REPORT_DETAILS))
624 fprintf (vect_dump, "=== get_loop_niters ===");
626 niters = number_of_exit_cond_executions (loop);
628 if (niters != NULL_TREE
629 && niters != chrec_dont_know)
631 *number_of_iterations = niters;
633 if (vect_print_dump_info (REPORT_DETAILS))
635 fprintf (vect_dump, "==> get_loop_niters:" );
636 print_generic_expr (vect_dump, *number_of_iterations, TDF_SLIM);
640 return get_loop_exit_condition (loop);
644 /* Function bb_in_loop_p
646 Used as predicate for dfs order traversal of the loop bbs. */
648 static bool
649 bb_in_loop_p (const_basic_block bb, const void *data)
651 const struct loop *const loop = (const struct loop *)data;
652 if (flow_bb_inside_loop_p (loop, bb))
653 return true;
654 return false;
658 /* Function new_loop_vec_info.
660 Create and initialize a new loop_vec_info struct for LOOP, as well as
661 stmt_vec_info structs for all the stmts in LOOP. */
663 static loop_vec_info
664 new_loop_vec_info (struct loop *loop)
666 loop_vec_info res;
667 basic_block *bbs;
668 gimple_stmt_iterator si;
669 unsigned int i, nbbs;
671 res = (loop_vec_info) xcalloc (1, sizeof (struct _loop_vec_info));
672 LOOP_VINFO_LOOP (res) = loop;
674 bbs = get_loop_body (loop);
676 /* Create/Update stmt_info for all stmts in the loop. */
677 for (i = 0; i < loop->num_nodes; i++)
679 basic_block bb = bbs[i];
681 /* BBs in a nested inner-loop will have been already processed (because
682 we will have called vect_analyze_loop_form for any nested inner-loop).
683 Therefore, for stmts in an inner-loop we just want to update the
684 STMT_VINFO_LOOP_VINFO field of their stmt_info to point to the new
685 loop_info of the outer-loop we are currently considering to vectorize
686 (instead of the loop_info of the inner-loop).
687 For stmts in other BBs we need to create a stmt_info from scratch. */
688 if (bb->loop_father != loop)
690 /* Inner-loop bb. */
691 gcc_assert (loop->inner && bb->loop_father == loop->inner);
692 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
694 gimple phi = gsi_stmt (si);
695 stmt_vec_info stmt_info = vinfo_for_stmt (phi);
696 loop_vec_info inner_loop_vinfo =
697 STMT_VINFO_LOOP_VINFO (stmt_info);
698 gcc_assert (loop->inner == LOOP_VINFO_LOOP (inner_loop_vinfo));
699 STMT_VINFO_LOOP_VINFO (stmt_info) = res;
701 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
703 gimple stmt = gsi_stmt (si);
704 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
705 loop_vec_info inner_loop_vinfo =
706 STMT_VINFO_LOOP_VINFO (stmt_info);
707 gcc_assert (loop->inner == LOOP_VINFO_LOOP (inner_loop_vinfo));
708 STMT_VINFO_LOOP_VINFO (stmt_info) = res;
711 else
713 /* bb in current nest. */
714 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
716 gimple phi = gsi_stmt (si);
717 gimple_set_uid (phi, 0);
718 set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, res, NULL));
721 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
723 gimple stmt = gsi_stmt (si);
724 gimple_set_uid (stmt, 0);
725 set_vinfo_for_stmt (stmt, new_stmt_vec_info (stmt, res, NULL));
730 /* CHECKME: We want to visit all BBs before their successors (except for
731 latch blocks, for which this assertion wouldn't hold). In the simple
732 case of the loop forms we allow, a dfs order of the BBs would the same
733 as reversed postorder traversal, so we are safe. */
735 free (bbs);
736 bbs = XCNEWVEC (basic_block, loop->num_nodes);
737 nbbs = dfs_enumerate_from (loop->header, 0, bb_in_loop_p,
738 bbs, loop->num_nodes, loop);
739 gcc_assert (nbbs == loop->num_nodes);
741 LOOP_VINFO_BBS (res) = bbs;
742 LOOP_VINFO_NITERS (res) = NULL;
743 LOOP_VINFO_NITERS_UNCHANGED (res) = NULL;
744 LOOP_VINFO_COST_MODEL_MIN_ITERS (res) = 0;
745 LOOP_VINFO_VECTORIZABLE_P (res) = 0;
746 LOOP_PEELING_FOR_ALIGNMENT (res) = 0;
747 LOOP_VINFO_VECT_FACTOR (res) = 0;
748 LOOP_VINFO_LOOP_NEST (res) = VEC_alloc (loop_p, heap, 3);
749 LOOP_VINFO_DATAREFS (res) = VEC_alloc (data_reference_p, heap, 10);
750 LOOP_VINFO_DDRS (res) = VEC_alloc (ddr_p, heap, 10 * 10);
751 LOOP_VINFO_UNALIGNED_DR (res) = NULL;
752 LOOP_VINFO_MAY_MISALIGN_STMTS (res) =
753 VEC_alloc (gimple, heap,
754 PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIGNMENT_CHECKS));
755 LOOP_VINFO_MAY_ALIAS_DDRS (res) =
756 VEC_alloc (ddr_p, heap,
757 PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS));
758 LOOP_VINFO_STRIDED_STORES (res) = VEC_alloc (gimple, heap, 10);
759 LOOP_VINFO_REDUCTIONS (res) = VEC_alloc (gimple, heap, 10);
760 LOOP_VINFO_SLP_INSTANCES (res) = VEC_alloc (slp_instance, heap, 10);
761 LOOP_VINFO_SLP_UNROLLING_FACTOR (res) = 1;
762 LOOP_VINFO_PEELING_HTAB (res) = NULL;
764 return res;
768 /* Function destroy_loop_vec_info.
770 Free LOOP_VINFO struct, as well as all the stmt_vec_info structs of all the
771 stmts in the loop. */
773 void
774 destroy_loop_vec_info (loop_vec_info loop_vinfo, bool clean_stmts)
776 struct loop *loop;
777 basic_block *bbs;
778 int nbbs;
779 gimple_stmt_iterator si;
780 int j;
781 VEC (slp_instance, heap) *slp_instances;
782 slp_instance instance;
784 if (!loop_vinfo)
785 return;
787 loop = LOOP_VINFO_LOOP (loop_vinfo);
789 bbs = LOOP_VINFO_BBS (loop_vinfo);
790 nbbs = loop->num_nodes;
792 if (!clean_stmts)
794 free (LOOP_VINFO_BBS (loop_vinfo));
795 free_data_refs (LOOP_VINFO_DATAREFS (loop_vinfo));
796 free_dependence_relations (LOOP_VINFO_DDRS (loop_vinfo));
797 VEC_free (loop_p, heap, LOOP_VINFO_LOOP_NEST (loop_vinfo));
798 VEC_free (gimple, heap, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo));
799 VEC_free (ddr_p, heap, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo));
801 free (loop_vinfo);
802 loop->aux = NULL;
803 return;
806 for (j = 0; j < nbbs; j++)
808 basic_block bb = bbs[j];
809 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
810 free_stmt_vec_info (gsi_stmt (si));
812 for (si = gsi_start_bb (bb); !gsi_end_p (si); )
814 gimple stmt = gsi_stmt (si);
815 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
817 if (stmt_info)
819 /* Check if this is a "pattern stmt" (introduced by the
820 vectorizer during the pattern recognition pass). */
821 bool remove_stmt_p = false;
822 gimple orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
823 if (orig_stmt)
825 stmt_vec_info orig_stmt_info = vinfo_for_stmt (orig_stmt);
826 if (orig_stmt_info
827 && STMT_VINFO_IN_PATTERN_P (orig_stmt_info))
828 remove_stmt_p = true;
831 /* Free stmt_vec_info. */
832 free_stmt_vec_info (stmt);
834 /* Remove dead "pattern stmts". */
835 if (remove_stmt_p)
836 gsi_remove (&si, true);
838 gsi_next (&si);
842 free (LOOP_VINFO_BBS (loop_vinfo));
843 free_data_refs (LOOP_VINFO_DATAREFS (loop_vinfo));
844 free_dependence_relations (LOOP_VINFO_DDRS (loop_vinfo));
845 VEC_free (loop_p, heap, LOOP_VINFO_LOOP_NEST (loop_vinfo));
846 VEC_free (gimple, heap, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo));
847 VEC_free (ddr_p, heap, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo));
848 slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
849 FOR_EACH_VEC_ELT (slp_instance, slp_instances, j, instance)
850 vect_free_slp_instance (instance);
852 VEC_free (slp_instance, heap, LOOP_VINFO_SLP_INSTANCES (loop_vinfo));
853 VEC_free (gimple, heap, LOOP_VINFO_STRIDED_STORES (loop_vinfo));
854 VEC_free (gimple, heap, LOOP_VINFO_REDUCTIONS (loop_vinfo));
856 if (LOOP_VINFO_PEELING_HTAB (loop_vinfo))
857 htab_delete (LOOP_VINFO_PEELING_HTAB (loop_vinfo));
859 free (loop_vinfo);
860 loop->aux = NULL;
864 /* Function vect_analyze_loop_1.
866 Apply a set of analyses on LOOP, and create a loop_vec_info struct
867 for it. The different analyses will record information in the
868 loop_vec_info struct. This is a subset of the analyses applied in
869 vect_analyze_loop, to be applied on an inner-loop nested in the loop
870 that is now considered for (outer-loop) vectorization. */
872 static loop_vec_info
873 vect_analyze_loop_1 (struct loop *loop)
875 loop_vec_info loop_vinfo;
877 if (vect_print_dump_info (REPORT_DETAILS))
878 fprintf (vect_dump, "===== analyze_loop_nest_1 =====");
880 /* Check the CFG characteristics of the loop (nesting, entry/exit, etc. */
882 loop_vinfo = vect_analyze_loop_form (loop);
883 if (!loop_vinfo)
885 if (vect_print_dump_info (REPORT_DETAILS))
886 fprintf (vect_dump, "bad inner-loop form.");
887 return NULL;
890 return loop_vinfo;
894 /* Function vect_analyze_loop_form.
896 Verify that certain CFG restrictions hold, including:
897 - the loop has a pre-header
898 - the loop has a single entry and exit
899 - the loop exit condition is simple enough, and the number of iterations
900 can be analyzed (a countable loop). */
902 loop_vec_info
903 vect_analyze_loop_form (struct loop *loop)
905 loop_vec_info loop_vinfo;
906 gimple loop_cond;
907 tree number_of_iterations = NULL;
908 loop_vec_info inner_loop_vinfo = NULL;
910 if (vect_print_dump_info (REPORT_DETAILS))
911 fprintf (vect_dump, "=== vect_analyze_loop_form ===");
913 /* Different restrictions apply when we are considering an inner-most loop,
914 vs. an outer (nested) loop.
915 (FORNOW. May want to relax some of these restrictions in the future). */
917 if (!loop->inner)
919 /* Inner-most loop. We currently require that the number of BBs is
920 exactly 2 (the header and latch). Vectorizable inner-most loops
921 look like this:
923 (pre-header)
925 header <--------+
926 | | |
927 | +--> latch --+
929 (exit-bb) */
931 if (loop->num_nodes != 2)
933 if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
934 fprintf (vect_dump, "not vectorized: control flow in loop.");
935 return NULL;
938 if (empty_block_p (loop->header))
940 if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
941 fprintf (vect_dump, "not vectorized: empty loop.");
942 return NULL;
945 else
947 struct loop *innerloop = loop->inner;
948 edge entryedge;
950 /* Nested loop. We currently require that the loop is doubly-nested,
951 contains a single inner loop, and the number of BBs is exactly 5.
952 Vectorizable outer-loops look like this:
954 (pre-header)
956 header <---+
958 inner-loop |
960 tail ------+
962 (exit-bb)
964 The inner-loop has the properties expected of inner-most loops
965 as described above. */
967 if ((loop->inner)->inner || (loop->inner)->next)
969 if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
970 fprintf (vect_dump, "not vectorized: multiple nested loops.");
971 return NULL;
974 /* Analyze the inner-loop. */
975 inner_loop_vinfo = vect_analyze_loop_1 (loop->inner);
976 if (!inner_loop_vinfo)
978 if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
979 fprintf (vect_dump, "not vectorized: Bad inner loop.");
980 return NULL;
983 if (!expr_invariant_in_loop_p (loop,
984 LOOP_VINFO_NITERS (inner_loop_vinfo)))
986 if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
987 fprintf (vect_dump,
988 "not vectorized: inner-loop count not invariant.");
989 destroy_loop_vec_info (inner_loop_vinfo, true);
990 return NULL;
993 if (loop->num_nodes != 5)
995 if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
996 fprintf (vect_dump, "not vectorized: control flow in loop.");
997 destroy_loop_vec_info (inner_loop_vinfo, true);
998 return NULL;
1001 gcc_assert (EDGE_COUNT (innerloop->header->preds) == 2);
1002 entryedge = EDGE_PRED (innerloop->header, 0);
1003 if (EDGE_PRED (innerloop->header, 0)->src == innerloop->latch)
1004 entryedge = EDGE_PRED (innerloop->header, 1);
1006 if (entryedge->src != loop->header
1007 || !single_exit (innerloop)
1008 || single_exit (innerloop)->dest != EDGE_PRED (loop->latch, 0)->src)
1010 if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
1011 fprintf (vect_dump, "not vectorized: unsupported outerloop form.");
1012 destroy_loop_vec_info (inner_loop_vinfo, true);
1013 return NULL;
1016 if (vect_print_dump_info (REPORT_DETAILS))
1017 fprintf (vect_dump, "Considering outer-loop vectorization.");
1020 if (!single_exit (loop)
1021 || EDGE_COUNT (loop->header->preds) != 2)
1023 if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
1025 if (!single_exit (loop))
1026 fprintf (vect_dump, "not vectorized: multiple exits.");
1027 else if (EDGE_COUNT (loop->header->preds) != 2)
1028 fprintf (vect_dump, "not vectorized: too many incoming edges.");
1030 if (inner_loop_vinfo)
1031 destroy_loop_vec_info (inner_loop_vinfo, true);
1032 return NULL;
1035 /* We assume that the loop exit condition is at the end of the loop. i.e,
1036 that the loop is represented as a do-while (with a proper if-guard
1037 before the loop if needed), where the loop header contains all the
1038 executable statements, and the latch is empty. */
1039 if (!empty_block_p (loop->latch)
1040 || !gimple_seq_empty_p (phi_nodes (loop->latch)))
1042 if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
1043 fprintf (vect_dump, "not vectorized: unexpected loop form.");
1044 if (inner_loop_vinfo)
1045 destroy_loop_vec_info (inner_loop_vinfo, true);
1046 return NULL;
1049 /* Make sure there exists a single-predecessor exit bb: */
1050 if (!single_pred_p (single_exit (loop)->dest))
1052 edge e = single_exit (loop);
1053 if (!(e->flags & EDGE_ABNORMAL))
1055 split_loop_exit_edge (e);
1056 if (vect_print_dump_info (REPORT_DETAILS))
1057 fprintf (vect_dump, "split exit edge.");
1059 else
1061 if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
1062 fprintf (vect_dump, "not vectorized: abnormal loop exit edge.");
1063 if (inner_loop_vinfo)
1064 destroy_loop_vec_info (inner_loop_vinfo, true);
1065 return NULL;
1069 loop_cond = vect_get_loop_niters (loop, &number_of_iterations);
1070 if (!loop_cond)
1072 if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
1073 fprintf (vect_dump, "not vectorized: complicated exit condition.");
1074 if (inner_loop_vinfo)
1075 destroy_loop_vec_info (inner_loop_vinfo, true);
1076 return NULL;
1079 if (!number_of_iterations)
1081 if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
1082 fprintf (vect_dump,
1083 "not vectorized: number of iterations cannot be computed.");
1084 if (inner_loop_vinfo)
1085 destroy_loop_vec_info (inner_loop_vinfo, true);
1086 return NULL;
1089 if (chrec_contains_undetermined (number_of_iterations))
1091 if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
1092 fprintf (vect_dump, "Infinite number of iterations.");
1093 if (inner_loop_vinfo)
1094 destroy_loop_vec_info (inner_loop_vinfo, true);
1095 return NULL;
1098 if (!NITERS_KNOWN_P (number_of_iterations))
1100 if (vect_print_dump_info (REPORT_DETAILS))
1102 fprintf (vect_dump, "Symbolic number of iterations is ");
1103 print_generic_expr (vect_dump, number_of_iterations, TDF_DETAILS);
1106 else if (TREE_INT_CST_LOW (number_of_iterations) == 0)
1108 if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
1109 fprintf (vect_dump, "not vectorized: number of iterations = 0.");
1110 if (inner_loop_vinfo)
1111 destroy_loop_vec_info (inner_loop_vinfo, false);
1112 return NULL;
1115 loop_vinfo = new_loop_vec_info (loop);
1116 LOOP_VINFO_NITERS (loop_vinfo) = number_of_iterations;
1117 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = number_of_iterations;
1119 STMT_VINFO_TYPE (vinfo_for_stmt (loop_cond)) = loop_exit_ctrl_vec_info_type;
1121 /* CHECKME: May want to keep it around it in the future. */
1122 if (inner_loop_vinfo)
1123 destroy_loop_vec_info (inner_loop_vinfo, false);
1125 gcc_assert (!loop->aux);
1126 loop->aux = loop_vinfo;
1127 return loop_vinfo;
1131 /* Get cost by calling cost target builtin. */
1133 static inline int
1134 vect_get_cost (enum vect_cost_for_stmt type_of_cost)
1136 tree dummy_type = NULL;
1137 int dummy = 0;
1139 return targetm.vectorize.builtin_vectorization_cost (type_of_cost,
1140 dummy_type, dummy);
1144 /* Function vect_analyze_loop_operations.
1146 Scan the loop stmts and make sure they are all vectorizable. */
1148 static bool
1149 vect_analyze_loop_operations (loop_vec_info loop_vinfo, bool slp)
1151 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1152 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1153 int nbbs = loop->num_nodes;
1154 gimple_stmt_iterator si;
1155 unsigned int vectorization_factor = 0;
1156 int i;
1157 gimple phi;
1158 stmt_vec_info stmt_info;
1159 bool need_to_vectorize = false;
1160 int min_profitable_iters;
1161 int min_scalar_loop_bound;
1162 unsigned int th;
1163 bool only_slp_in_loop = true, ok;
1165 if (vect_print_dump_info (REPORT_DETAILS))
1166 fprintf (vect_dump, "=== vect_analyze_loop_operations ===");
1168 gcc_assert (LOOP_VINFO_VECT_FACTOR (loop_vinfo));
1169 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1170 if (slp)
1172 /* If all the stmts in the loop can be SLPed, we perform only SLP, and
1173 vectorization factor of the loop is the unrolling factor required by
1174 the SLP instances. If that unrolling factor is 1, we say, that we
1175 perform pure SLP on loop - cross iteration parallelism is not
1176 exploited. */
1177 for (i = 0; i < nbbs; i++)
1179 basic_block bb = bbs[i];
1180 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1182 gimple stmt = gsi_stmt (si);
1183 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1184 gcc_assert (stmt_info);
1185 if ((STMT_VINFO_RELEVANT_P (stmt_info)
1186 || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1187 && !PURE_SLP_STMT (stmt_info))
1188 /* STMT needs both SLP and loop-based vectorization. */
1189 only_slp_in_loop = false;
1193 if (only_slp_in_loop)
1194 vectorization_factor = LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo);
1195 else
1196 vectorization_factor = least_common_multiple (vectorization_factor,
1197 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo));
1199 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
1200 if (vect_print_dump_info (REPORT_DETAILS))
1201 fprintf (vect_dump, "Updating vectorization factor to %d ",
1202 vectorization_factor);
1205 for (i = 0; i < nbbs; i++)
1207 basic_block bb = bbs[i];
1209 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
1211 phi = gsi_stmt (si);
1212 ok = true;
1214 stmt_info = vinfo_for_stmt (phi);
1215 if (vect_print_dump_info (REPORT_DETAILS))
1217 fprintf (vect_dump, "examining phi: ");
1218 print_gimple_stmt (vect_dump, phi, 0, TDF_SLIM);
1221 /* Inner-loop loop-closed exit phi in outer-loop vectorization
1222 (i.e., a phi in the tail of the outer-loop). */
1223 if (! is_loop_header_bb_p (bb))
1225 /* FORNOW: we currently don't support the case that these phis
1226 are not used in the outerloop (unless it is double reduction,
1227 i.e., this phi is vect_reduction_def), cause this case
1228 requires to actually do something here. */
1229 if ((!STMT_VINFO_RELEVANT_P (stmt_info)
1230 || STMT_VINFO_LIVE_P (stmt_info))
1231 && STMT_VINFO_DEF_TYPE (stmt_info)
1232 != vect_double_reduction_def)
1234 if (vect_print_dump_info (REPORT_DETAILS))
1235 fprintf (vect_dump,
1236 "Unsupported loop-closed phi in outer-loop.");
1237 return false;
1240 /* If PHI is used in the outer loop, we check that its operand
1241 is defined in the inner loop. */
1242 if (STMT_VINFO_RELEVANT_P (stmt_info))
1244 tree phi_op;
1245 gimple op_def_stmt;
1247 if (gimple_phi_num_args (phi) != 1)
1248 return false;
1250 phi_op = PHI_ARG_DEF (phi, 0);
1251 if (TREE_CODE (phi_op) != SSA_NAME)
1252 return false;
1254 op_def_stmt = SSA_NAME_DEF_STMT (phi_op);
1255 if (!op_def_stmt || !vinfo_for_stmt (op_def_stmt))
1256 return false;
1258 if (STMT_VINFO_RELEVANT (vinfo_for_stmt (op_def_stmt))
1259 != vect_used_in_outer
1260 && STMT_VINFO_RELEVANT (vinfo_for_stmt (op_def_stmt))
1261 != vect_used_in_outer_by_reduction)
1262 return false;
1265 continue;
1268 gcc_assert (stmt_info);
1270 if (STMT_VINFO_LIVE_P (stmt_info))
1272 /* FORNOW: not yet supported. */
1273 if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
1274 fprintf (vect_dump, "not vectorized: value used after loop.");
1275 return false;
1278 if (STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope
1279 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
1281 /* A scalar-dependence cycle that we don't support. */
1282 if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
1283 fprintf (vect_dump, "not vectorized: scalar dependence cycle.");
1284 return false;
1287 if (STMT_VINFO_RELEVANT_P (stmt_info))
1289 need_to_vectorize = true;
1290 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def)
1291 ok = vectorizable_induction (phi, NULL, NULL);
1294 if (!ok)
1296 if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
1298 fprintf (vect_dump,
1299 "not vectorized: relevant phi not supported: ");
1300 print_gimple_stmt (vect_dump, phi, 0, TDF_SLIM);
1302 return false;
1306 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1308 gimple stmt = gsi_stmt (si);
1309 if (!vect_analyze_stmt (stmt, &need_to_vectorize, NULL))
1310 return false;
1312 } /* bbs */
1314 /* All operations in the loop are either irrelevant (deal with loop
1315 control, or dead), or only used outside the loop and can be moved
1316 out of the loop (e.g. invariants, inductions). The loop can be
1317 optimized away by scalar optimizations. We're better off not
1318 touching this loop. */
1319 if (!need_to_vectorize)
1321 if (vect_print_dump_info (REPORT_DETAILS))
1322 fprintf (vect_dump,
1323 "All the computation can be taken out of the loop.");
1324 if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
1325 fprintf (vect_dump,
1326 "not vectorized: redundant loop. no profit to vectorize.");
1327 return false;
1330 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1331 && vect_print_dump_info (REPORT_DETAILS))
1332 fprintf (vect_dump,
1333 "vectorization_factor = %d, niters = " HOST_WIDE_INT_PRINT_DEC,
1334 vectorization_factor, LOOP_VINFO_INT_NITERS (loop_vinfo));
1336 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1337 && (LOOP_VINFO_INT_NITERS (loop_vinfo) < vectorization_factor))
1339 if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
1340 fprintf (vect_dump, "not vectorized: iteration count too small.");
1341 if (vect_print_dump_info (REPORT_DETAILS))
1342 fprintf (vect_dump,"not vectorized: iteration count smaller than "
1343 "vectorization factor.");
1344 return false;
1347 /* Analyze cost. Decide if worth while to vectorize. */
1349 /* Once VF is set, SLP costs should be updated since the number of created
1350 vector stmts depends on VF. */
1351 vect_update_slp_costs_according_to_vf (loop_vinfo);
1353 min_profitable_iters = vect_estimate_min_profitable_iters (loop_vinfo);
1354 LOOP_VINFO_COST_MODEL_MIN_ITERS (loop_vinfo) = min_profitable_iters;
1356 if (min_profitable_iters < 0)
1358 if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
1359 fprintf (vect_dump, "not vectorized: vectorization not profitable.");
1360 if (vect_print_dump_info (REPORT_DETAILS))
1361 fprintf (vect_dump, "not vectorized: vector version will never be "
1362 "profitable.");
1363 return false;
1366 min_scalar_loop_bound = ((PARAM_VALUE (PARAM_MIN_VECT_LOOP_BOUND)
1367 * vectorization_factor) - 1);
1369 /* Use the cost model only if it is more conservative than user specified
1370 threshold. */
1372 th = (unsigned) min_scalar_loop_bound;
1373 if (min_profitable_iters
1374 && (!min_scalar_loop_bound
1375 || min_profitable_iters > min_scalar_loop_bound))
1376 th = (unsigned) min_profitable_iters;
1378 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1379 && LOOP_VINFO_INT_NITERS (loop_vinfo) <= th)
1381 if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
1382 fprintf (vect_dump, "not vectorized: vectorization not "
1383 "profitable.");
1384 if (vect_print_dump_info (REPORT_DETAILS))
1385 fprintf (vect_dump, "not vectorized: iteration count smaller than "
1386 "user specified loop bound parameter or minimum "
1387 "profitable iterations (whichever is more conservative).");
1388 return false;
1391 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1392 || LOOP_VINFO_INT_NITERS (loop_vinfo) % vectorization_factor != 0
1393 || LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo))
1395 if (vect_print_dump_info (REPORT_DETAILS))
1396 fprintf (vect_dump, "epilog loop required.");
1397 if (!vect_can_advance_ivs_p (loop_vinfo))
1399 if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
1400 fprintf (vect_dump,
1401 "not vectorized: can't create epilog loop 1.");
1402 return false;
1404 if (!slpeel_can_duplicate_loop_p (loop, single_exit (loop)))
1406 if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
1407 fprintf (vect_dump,
1408 "not vectorized: can't create epilog loop 2.");
1409 return false;
1413 return true;
1417 /* Function vect_analyze_loop_2.
1419 Apply a set of analyses on LOOP, and create a loop_vec_info struct
1420 for it. The different analyses will record information in the
1421 loop_vec_info struct. */
1422 static bool
1423 vect_analyze_loop_2 (loop_vec_info loop_vinfo)
1425 bool ok, dummy, slp = false;
1426 int max_vf = MAX_VECTORIZATION_FACTOR;
1427 int min_vf = 2;
1429 /* Find all data references in the loop (which correspond to vdefs/vuses)
1430 and analyze their evolution in the loop. Also adjust the minimal
1431 vectorization factor according to the loads and stores.
1433 FORNOW: Handle only simple, array references, which
1434 alignment can be forced, and aligned pointer-references. */
1436 ok = vect_analyze_data_refs (loop_vinfo, NULL, &min_vf);
1437 if (!ok)
1439 if (vect_print_dump_info (REPORT_DETAILS))
1440 fprintf (vect_dump, "bad data references.");
1441 return false;
1444 /* Classify all cross-iteration scalar data-flow cycles.
1445 Cross-iteration cycles caused by virtual phis are analyzed separately. */
1447 vect_analyze_scalar_cycles (loop_vinfo);
1449 vect_pattern_recog (loop_vinfo);
1451 /* Data-flow analysis to detect stmts that do not need to be vectorized. */
1453 ok = vect_mark_stmts_to_be_vectorized (loop_vinfo);
1454 if (!ok)
1456 if (vect_print_dump_info (REPORT_DETAILS))
1457 fprintf (vect_dump, "unexpected pattern.");
1458 return false;
1461 /* Analyze data dependences between the data-refs in the loop
1462 and adjust the maximum vectorization factor according to
1463 the dependences.
1464 FORNOW: fail at the first data dependence that we encounter. */
1466 ok = vect_analyze_data_ref_dependences (loop_vinfo, NULL, &max_vf, &dummy);
1467 if (!ok
1468 || max_vf < min_vf)
1470 if (vect_print_dump_info (REPORT_DETAILS))
1471 fprintf (vect_dump, "bad data dependence.");
1472 return false;
1475 ok = vect_determine_vectorization_factor (loop_vinfo);
1476 if (!ok)
1478 if (vect_print_dump_info (REPORT_DETAILS))
1479 fprintf (vect_dump, "can't determine vectorization factor.");
1480 return false;
1482 if (max_vf < LOOP_VINFO_VECT_FACTOR (loop_vinfo))
1484 if (vect_print_dump_info (REPORT_DETAILS))
1485 fprintf (vect_dump, "bad data dependence.");
1486 return false;
1489 /* Analyze the alignment of the data-refs in the loop.
1490 Fail if a data reference is found that cannot be vectorized. */
1492 ok = vect_analyze_data_refs_alignment (loop_vinfo, NULL);
1493 if (!ok)
1495 if (vect_print_dump_info (REPORT_DETAILS))
1496 fprintf (vect_dump, "bad data alignment.");
1497 return false;
1500 /* Analyze the access patterns of the data-refs in the loop (consecutive,
1501 complex, etc.). FORNOW: Only handle consecutive access pattern. */
1503 ok = vect_analyze_data_ref_accesses (loop_vinfo, NULL);
1504 if (!ok)
1506 if (vect_print_dump_info (REPORT_DETAILS))
1507 fprintf (vect_dump, "bad data access.");
1508 return false;
1511 /* Prune the list of ddrs to be tested at run-time by versioning for alias.
1512 It is important to call pruning after vect_analyze_data_ref_accesses,
1513 since we use grouping information gathered by interleaving analysis. */
1514 ok = vect_prune_runtime_alias_test_list (loop_vinfo);
1515 if (!ok)
1517 if (vect_print_dump_info (REPORT_DETAILS))
1518 fprintf (vect_dump, "too long list of versioning for alias "
1519 "run-time tests.");
1520 return false;
1523 /* This pass will decide on using loop versioning and/or loop peeling in
1524 order to enhance the alignment of data references in the loop. */
1526 ok = vect_enhance_data_refs_alignment (loop_vinfo);
1527 if (!ok)
1529 if (vect_print_dump_info (REPORT_DETAILS))
1530 fprintf (vect_dump, "bad data alignment.");
1531 return false;
1534 /* Check the SLP opportunities in the loop, analyze and build SLP trees. */
1535 ok = vect_analyze_slp (loop_vinfo, NULL);
1536 if (ok)
1538 /* Decide which possible SLP instances to SLP. */
1539 slp = vect_make_slp_decision (loop_vinfo);
1541 /* Find stmts that need to be both vectorized and SLPed. */
1542 vect_detect_hybrid_slp (loop_vinfo);
1545 /* Scan all the operations in the loop and make sure they are
1546 vectorizable. */
1548 ok = vect_analyze_loop_operations (loop_vinfo, slp);
1549 if (!ok)
1551 if (vect_print_dump_info (REPORT_DETAILS))
1552 fprintf (vect_dump, "bad operation or unsupported loop bound.");
1553 return false;
1556 return true;
1559 /* Function vect_analyze_loop.
1561 Apply a set of analyses on LOOP, and create a loop_vec_info struct
1562 for it. The different analyses will record information in the
1563 loop_vec_info struct. */
1564 loop_vec_info
1565 vect_analyze_loop (struct loop *loop)
1567 loop_vec_info loop_vinfo;
1568 unsigned int vector_sizes;
1570 /* Autodetect first vector size we try. */
1571 current_vector_size = 0;
1572 vector_sizes = targetm.vectorize.autovectorize_vector_sizes ();
1574 if (vect_print_dump_info (REPORT_DETAILS))
1575 fprintf (vect_dump, "===== analyze_loop_nest =====");
1577 if (loop_outer (loop)
1578 && loop_vec_info_for_loop (loop_outer (loop))
1579 && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop))))
1581 if (vect_print_dump_info (REPORT_DETAILS))
1582 fprintf (vect_dump, "outer-loop already vectorized.");
1583 return NULL;
1586 while (1)
1588 /* Check the CFG characteristics of the loop (nesting, entry/exit). */
1589 loop_vinfo = vect_analyze_loop_form (loop);
1590 if (!loop_vinfo)
1592 if (vect_print_dump_info (REPORT_DETAILS))
1593 fprintf (vect_dump, "bad loop form.");
1594 return NULL;
1597 if (vect_analyze_loop_2 (loop_vinfo))
1599 LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1;
1601 return loop_vinfo;
1604 destroy_loop_vec_info (loop_vinfo, true);
1606 vector_sizes &= ~current_vector_size;
1607 if (vector_sizes == 0
1608 || current_vector_size == 0)
1609 return NULL;
1611 /* Try the next biggest vector size. */
1612 current_vector_size = 1 << floor_log2 (vector_sizes);
1613 if (vect_print_dump_info (REPORT_DETAILS))
1614 fprintf (vect_dump, "***** Re-trying analysis with "
1615 "vector size %d\n", current_vector_size);
1620 /* Function reduction_code_for_scalar_code
1622 Input:
1623 CODE - tree_code of a reduction operations.
1625 Output:
1626 REDUC_CODE - the corresponding tree-code to be used to reduce the
1627 vector of partial results into a single scalar result (which
1628 will also reside in a vector) or ERROR_MARK if the operation is
1629 a supported reduction operation, but does not have such tree-code.
1631 Return FALSE if CODE currently cannot be vectorized as reduction. */
1633 static bool
1634 reduction_code_for_scalar_code (enum tree_code code,
1635 enum tree_code *reduc_code)
1637 switch (code)
1639 case MAX_EXPR:
1640 *reduc_code = REDUC_MAX_EXPR;
1641 return true;
1643 case MIN_EXPR:
1644 *reduc_code = REDUC_MIN_EXPR;
1645 return true;
1647 case PLUS_EXPR:
1648 *reduc_code = REDUC_PLUS_EXPR;
1649 return true;
1651 case MULT_EXPR:
1652 case MINUS_EXPR:
1653 case BIT_IOR_EXPR:
1654 case BIT_XOR_EXPR:
1655 case BIT_AND_EXPR:
1656 *reduc_code = ERROR_MARK;
1657 return true;
1659 default:
1660 return false;
1665 /* Error reporting helper for vect_is_simple_reduction below. GIMPLE statement
1666 STMT is printed with a message MSG. */
1668 static void
1669 report_vect_op (gimple stmt, const char *msg)
1671 fprintf (vect_dump, "%s", msg);
1672 print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
1676 /* Function vect_is_simple_reduction_1
1678 (1) Detect a cross-iteration def-use cycle that represents a simple
1679 reduction computation. We look for the following pattern:
1681 loop_header:
1682 a1 = phi < a0, a2 >
1683 a3 = ...
1684 a2 = operation (a3, a1)
1686 such that:
1687 1. operation is commutative and associative and it is safe to
1688 change the order of the computation (if CHECK_REDUCTION is true)
1689 2. no uses for a2 in the loop (a2 is used out of the loop)
1690 3. no uses of a1 in the loop besides the reduction operation
1691 4. no uses of a1 outside the loop.
1693 Conditions 1,4 are tested here.
1694 Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.
1696 (2) Detect a cross-iteration def-use cycle in nested loops, i.e.,
1697 nested cycles, if CHECK_REDUCTION is false.
1699 (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double
1700 reductions:
1702 a1 = phi < a0, a2 >
1703 inner loop (def of a3)
1704 a2 = phi < a3 >
1706 If MODIFY is true it tries also to rework the code in-place to enable
1707 detection of more reduction patterns. For the time being we rewrite
1708 "res -= RHS" into "rhs += -RHS" when it seems worthwhile.
1711 static gimple
1712 vect_is_simple_reduction_1 (loop_vec_info loop_info, gimple phi,
1713 bool check_reduction, bool *double_reduc,
1714 bool modify)
1716 struct loop *loop = (gimple_bb (phi))->loop_father;
1717 struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
1718 edge latch_e = loop_latch_edge (loop);
1719 tree loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
1720 gimple def_stmt, def1 = NULL, def2 = NULL;
1721 enum tree_code orig_code, code;
1722 tree op1, op2, op3 = NULL_TREE, op4 = NULL_TREE;
1723 tree type;
1724 int nloop_uses;
1725 tree name;
1726 imm_use_iterator imm_iter;
1727 use_operand_p use_p;
1728 bool phi_def;
1730 *double_reduc = false;
1732 /* If CHECK_REDUCTION is true, we assume inner-most loop vectorization,
1733 otherwise, we assume outer loop vectorization. */
1734 gcc_assert ((check_reduction && loop == vect_loop)
1735 || (!check_reduction && flow_loop_nested_p (vect_loop, loop)));
1737 name = PHI_RESULT (phi);
1738 nloop_uses = 0;
1739 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name)
1741 gimple use_stmt = USE_STMT (use_p);
1742 if (is_gimple_debug (use_stmt))
1743 continue;
1745 if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
1747 if (vect_print_dump_info (REPORT_DETAILS))
1748 fprintf (vect_dump, "intermediate value used outside loop.");
1750 return NULL;
1753 if (vinfo_for_stmt (use_stmt)
1754 && !is_pattern_stmt_p (vinfo_for_stmt (use_stmt)))
1755 nloop_uses++;
1756 if (nloop_uses > 1)
1758 if (vect_print_dump_info (REPORT_DETAILS))
1759 fprintf (vect_dump, "reduction used in loop.");
1760 return NULL;
1764 if (TREE_CODE (loop_arg) != SSA_NAME)
1766 if (vect_print_dump_info (REPORT_DETAILS))
1768 fprintf (vect_dump, "reduction: not ssa_name: ");
1769 print_generic_expr (vect_dump, loop_arg, TDF_SLIM);
1771 return NULL;
1774 def_stmt = SSA_NAME_DEF_STMT (loop_arg);
1775 if (!def_stmt)
1777 if (vect_print_dump_info (REPORT_DETAILS))
1778 fprintf (vect_dump, "reduction: no def_stmt.");
1779 return NULL;
1782 if (!is_gimple_assign (def_stmt) && gimple_code (def_stmt) != GIMPLE_PHI)
1784 if (vect_print_dump_info (REPORT_DETAILS))
1785 print_gimple_stmt (vect_dump, def_stmt, 0, TDF_SLIM);
1786 return NULL;
1789 if (is_gimple_assign (def_stmt))
1791 name = gimple_assign_lhs (def_stmt);
1792 phi_def = false;
1794 else
1796 name = PHI_RESULT (def_stmt);
1797 phi_def = true;
1800 nloop_uses = 0;
1801 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name)
1803 gimple use_stmt = USE_STMT (use_p);
1804 if (is_gimple_debug (use_stmt))
1805 continue;
1806 if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt))
1807 && vinfo_for_stmt (use_stmt)
1808 && !is_pattern_stmt_p (vinfo_for_stmt (use_stmt)))
1809 nloop_uses++;
1810 if (nloop_uses > 1)
1812 if (vect_print_dump_info (REPORT_DETAILS))
1813 fprintf (vect_dump, "reduction used in loop.");
1814 return NULL;
1818 /* If DEF_STMT is a phi node itself, we expect it to have a single argument
1819 defined in the inner loop. */
1820 if (phi_def)
1822 op1 = PHI_ARG_DEF (def_stmt, 0);
1824 if (gimple_phi_num_args (def_stmt) != 1
1825 || TREE_CODE (op1) != SSA_NAME)
1827 if (vect_print_dump_info (REPORT_DETAILS))
1828 fprintf (vect_dump, "unsupported phi node definition.");
1830 return NULL;
1833 def1 = SSA_NAME_DEF_STMT (op1);
1834 if (flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
1835 && loop->inner
1836 && flow_bb_inside_loop_p (loop->inner, gimple_bb (def1))
1837 && is_gimple_assign (def1))
1839 if (vect_print_dump_info (REPORT_DETAILS))
1840 report_vect_op (def_stmt, "detected double reduction: ");
1842 *double_reduc = true;
1843 return def_stmt;
1846 return NULL;
1849 code = orig_code = gimple_assign_rhs_code (def_stmt);
1851 /* We can handle "res -= x[i]", which is non-associative by
1852 simply rewriting this into "res += -x[i]". Avoid changing
1853 gimple instruction for the first simple tests and only do this
1854 if we're allowed to change code at all. */
1855 if (code == MINUS_EXPR
1856 && modify
1857 && (op1 = gimple_assign_rhs1 (def_stmt))
1858 && TREE_CODE (op1) == SSA_NAME
1859 && SSA_NAME_DEF_STMT (op1) == phi)
1860 code = PLUS_EXPR;
1862 if (check_reduction
1863 && (!commutative_tree_code (code) || !associative_tree_code (code)))
1865 if (vect_print_dump_info (REPORT_DETAILS))
1866 report_vect_op (def_stmt, "reduction: not commutative/associative: ");
1867 return NULL;
1870 if (get_gimple_rhs_class (code) != GIMPLE_BINARY_RHS)
1872 if (code != COND_EXPR)
1874 if (vect_print_dump_info (REPORT_DETAILS))
1875 report_vect_op (def_stmt, "reduction: not binary operation: ");
1877 return NULL;
1880 op3 = TREE_OPERAND (gimple_assign_rhs1 (def_stmt), 0);
1881 if (COMPARISON_CLASS_P (op3))
1883 op4 = TREE_OPERAND (op3, 1);
1884 op3 = TREE_OPERAND (op3, 0);
1887 op1 = TREE_OPERAND (gimple_assign_rhs1 (def_stmt), 1);
1888 op2 = TREE_OPERAND (gimple_assign_rhs1 (def_stmt), 2);
1890 if (TREE_CODE (op1) != SSA_NAME && TREE_CODE (op2) != SSA_NAME)
1892 if (vect_print_dump_info (REPORT_DETAILS))
1893 report_vect_op (def_stmt, "reduction: uses not ssa_names: ");
1895 return NULL;
1898 else
1900 op1 = gimple_assign_rhs1 (def_stmt);
1901 op2 = gimple_assign_rhs2 (def_stmt);
1903 if (TREE_CODE (op1) != SSA_NAME || TREE_CODE (op2) != SSA_NAME)
1905 if (vect_print_dump_info (REPORT_DETAILS))
1906 report_vect_op (def_stmt, "reduction: uses not ssa_names: ");
1908 return NULL;
1912 type = TREE_TYPE (gimple_assign_lhs (def_stmt));
1913 if ((TREE_CODE (op1) == SSA_NAME
1914 && !types_compatible_p (type,TREE_TYPE (op1)))
1915 || (TREE_CODE (op2) == SSA_NAME
1916 && !types_compatible_p (type, TREE_TYPE (op2)))
1917 || (op3 && TREE_CODE (op3) == SSA_NAME
1918 && !types_compatible_p (type, TREE_TYPE (op3)))
1919 || (op4 && TREE_CODE (op4) == SSA_NAME
1920 && !types_compatible_p (type, TREE_TYPE (op4))))
1922 if (vect_print_dump_info (REPORT_DETAILS))
1924 fprintf (vect_dump, "reduction: multiple types: operation type: ");
1925 print_generic_expr (vect_dump, type, TDF_SLIM);
1926 fprintf (vect_dump, ", operands types: ");
1927 print_generic_expr (vect_dump, TREE_TYPE (op1), TDF_SLIM);
1928 fprintf (vect_dump, ",");
1929 print_generic_expr (vect_dump, TREE_TYPE (op2), TDF_SLIM);
1930 if (op3)
1932 fprintf (vect_dump, ",");
1933 print_generic_expr (vect_dump, TREE_TYPE (op3), TDF_SLIM);
1936 if (op4)
1938 fprintf (vect_dump, ",");
1939 print_generic_expr (vect_dump, TREE_TYPE (op4), TDF_SLIM);
1943 return NULL;
1946 /* Check that it's ok to change the order of the computation.
1947 Generally, when vectorizing a reduction we change the order of the
1948 computation. This may change the behavior of the program in some
1949 cases, so we need to check that this is ok. One exception is when
1950 vectorizing an outer-loop: the inner-loop is executed sequentially,
1951 and therefore vectorizing reductions in the inner-loop during
1952 outer-loop vectorization is safe. */
1954 /* CHECKME: check for !flag_finite_math_only too? */
1955 if (SCALAR_FLOAT_TYPE_P (type) && !flag_associative_math
1956 && check_reduction)
1958 /* Changing the order of operations changes the semantics. */
1959 if (vect_print_dump_info (REPORT_DETAILS))
1960 report_vect_op (def_stmt, "reduction: unsafe fp math optimization: ");
1961 return NULL;
1963 else if (INTEGRAL_TYPE_P (type) && TYPE_OVERFLOW_TRAPS (type)
1964 && check_reduction)
1966 /* Changing the order of operations changes the semantics. */
1967 if (vect_print_dump_info (REPORT_DETAILS))
1968 report_vect_op (def_stmt, "reduction: unsafe int math optimization: ");
1969 return NULL;
1971 else if (SAT_FIXED_POINT_TYPE_P (type) && check_reduction)
1973 /* Changing the order of operations changes the semantics. */
1974 if (vect_print_dump_info (REPORT_DETAILS))
1975 report_vect_op (def_stmt,
1976 "reduction: unsafe fixed-point math optimization: ");
1977 return NULL;
1980 /* If we detected "res -= x[i]" earlier, rewrite it into
1981 "res += -x[i]" now. If this turns out to be useless reassoc
1982 will clean it up again. */
1983 if (orig_code == MINUS_EXPR)
1985 tree rhs = gimple_assign_rhs2 (def_stmt);
1986 tree negrhs = make_ssa_name (SSA_NAME_VAR (rhs), NULL);
1987 gimple negate_stmt = gimple_build_assign_with_ops (NEGATE_EXPR, negrhs,
1988 rhs, NULL);
1989 gimple_stmt_iterator gsi = gsi_for_stmt (def_stmt);
1990 set_vinfo_for_stmt (negate_stmt, new_stmt_vec_info (negate_stmt,
1991 loop_info, NULL));
1992 gsi_insert_before (&gsi, negate_stmt, GSI_NEW_STMT);
1993 gimple_assign_set_rhs2 (def_stmt, negrhs);
1994 gimple_assign_set_rhs_code (def_stmt, PLUS_EXPR);
1995 update_stmt (def_stmt);
1998 /* Reduction is safe. We're dealing with one of the following:
1999 1) integer arithmetic and no trapv
2000 2) floating point arithmetic, and special flags permit this optimization
2001 3) nested cycle (i.e., outer loop vectorization). */
2002 if (TREE_CODE (op1) == SSA_NAME)
2003 def1 = SSA_NAME_DEF_STMT (op1);
2005 if (TREE_CODE (op2) == SSA_NAME)
2006 def2 = SSA_NAME_DEF_STMT (op2);
2008 if (code != COND_EXPR
2009 && (!def1 || !def2 || gimple_nop_p (def1) || gimple_nop_p (def2)))
2011 if (vect_print_dump_info (REPORT_DETAILS))
2012 report_vect_op (def_stmt, "reduction: no defs for operands: ");
2013 return NULL;
2016 /* Check that one def is the reduction def, defined by PHI,
2017 the other def is either defined in the loop ("vect_internal_def"),
2018 or it's an induction (defined by a loop-header phi-node). */
2020 if (def2 && def2 == phi
2021 && (code == COND_EXPR
2022 || (def1 && flow_bb_inside_loop_p (loop, gimple_bb (def1))
2023 && (is_gimple_assign (def1)
2024 || is_gimple_call (def1)
2025 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1))
2026 == vect_induction_def
2027 || (gimple_code (def1) == GIMPLE_PHI
2028 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1))
2029 == vect_internal_def
2030 && !is_loop_header_bb_p (gimple_bb (def1)))))))
2032 if (vect_print_dump_info (REPORT_DETAILS))
2033 report_vect_op (def_stmt, "detected reduction: ");
2034 return def_stmt;
2036 else if (def1 && def1 == phi
2037 && (code == COND_EXPR
2038 || (def2 && flow_bb_inside_loop_p (loop, gimple_bb (def2))
2039 && (is_gimple_assign (def2)
2040 || is_gimple_call (def2)
2041 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2))
2042 == vect_induction_def
2043 || (gimple_code (def2) == GIMPLE_PHI
2044 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2))
2045 == vect_internal_def
2046 && !is_loop_header_bb_p (gimple_bb (def2)))))))
2048 if (check_reduction)
2050 /* Swap operands (just for simplicity - so that the rest of the code
2051 can assume that the reduction variable is always the last (second)
2052 argument). */
2053 if (vect_print_dump_info (REPORT_DETAILS))
2054 report_vect_op (def_stmt,
2055 "detected reduction: need to swap operands: ");
2057 swap_tree_operands (def_stmt, gimple_assign_rhs1_ptr (def_stmt),
2058 gimple_assign_rhs2_ptr (def_stmt));
2060 else
2062 if (vect_print_dump_info (REPORT_DETAILS))
2063 report_vect_op (def_stmt, "detected reduction: ");
2066 return def_stmt;
2068 else
2070 if (vect_print_dump_info (REPORT_DETAILS))
2071 report_vect_op (def_stmt, "reduction: unknown pattern: ");
2073 return NULL;
2077 /* Wrapper around vect_is_simple_reduction_1, that won't modify code
2078 in-place. Arguments as there. */
2080 static gimple
2081 vect_is_simple_reduction (loop_vec_info loop_info, gimple phi,
2082 bool check_reduction, bool *double_reduc)
2084 return vect_is_simple_reduction_1 (loop_info, phi, check_reduction,
2085 double_reduc, false);
2088 /* Wrapper around vect_is_simple_reduction_1, which will modify code
2089 in-place if it enables detection of more reductions. Arguments
2090 as there. */
2092 gimple
2093 vect_force_simple_reduction (loop_vec_info loop_info, gimple phi,
2094 bool check_reduction, bool *double_reduc)
2096 return vect_is_simple_reduction_1 (loop_info, phi, check_reduction,
2097 double_reduc, true);
2100 /* Calculate the cost of one scalar iteration of the loop. */
2102 vect_get_single_scalar_iteraion_cost (loop_vec_info loop_vinfo)
2104 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2105 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
2106 int nbbs = loop->num_nodes, factor, scalar_single_iter_cost = 0;
2107 int innerloop_iters, i, stmt_cost;
2109 /* Count statements in scalar loop. Using this as scalar cost for a single
2110 iteration for now.
2112 TODO: Add outer loop support.
2114 TODO: Consider assigning different costs to different scalar
2115 statements. */
2117 /* FORNOW. */
2118 innerloop_iters = 1;
2119 if (loop->inner)
2120 innerloop_iters = 50; /* FIXME */
2122 for (i = 0; i < nbbs; i++)
2124 gimple_stmt_iterator si;
2125 basic_block bb = bbs[i];
2127 if (bb->loop_father == loop->inner)
2128 factor = innerloop_iters;
2129 else
2130 factor = 1;
2132 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
2134 gimple stmt = gsi_stmt (si);
2135 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2137 if (!is_gimple_assign (stmt) && !is_gimple_call (stmt))
2138 continue;
2140 /* Skip stmts that are not vectorized inside the loop. */
2141 if (stmt_info
2142 && !STMT_VINFO_RELEVANT_P (stmt_info)
2143 && (!STMT_VINFO_LIVE_P (stmt_info)
2144 || STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def))
2145 continue;
2147 if (STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt)))
2149 if (DR_IS_READ (STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt))))
2150 stmt_cost = vect_get_cost (scalar_load);
2151 else
2152 stmt_cost = vect_get_cost (scalar_store);
2154 else
2155 stmt_cost = vect_get_cost (scalar_stmt);
2157 scalar_single_iter_cost += stmt_cost * factor;
2160 return scalar_single_iter_cost;
2163 /* Calculate cost of peeling the loop PEEL_ITERS_PROLOGUE times. */
2165 vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue,
2166 int *peel_iters_epilogue,
2167 int scalar_single_iter_cost)
2169 int peel_guard_costs = 0;
2170 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2172 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
2174 *peel_iters_epilogue = vf/2;
2175 if (vect_print_dump_info (REPORT_COST))
2176 fprintf (vect_dump, "cost model: "
2177 "epilogue peel iters set to vf/2 because "
2178 "loop iterations are unknown .");
2180 /* If peeled iterations are known but number of scalar loop
2181 iterations are unknown, count a taken branch per peeled loop. */
2182 peel_guard_costs = 2 * vect_get_cost (cond_branch_taken);
2184 else
2186 int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
2187 peel_iters_prologue = niters < peel_iters_prologue ?
2188 niters : peel_iters_prologue;
2189 *peel_iters_epilogue = (niters - peel_iters_prologue) % vf;
2192 return (peel_iters_prologue * scalar_single_iter_cost)
2193 + (*peel_iters_epilogue * scalar_single_iter_cost)
2194 + peel_guard_costs;
2197 /* Function vect_estimate_min_profitable_iters
2199 Return the number of iterations required for the vector version of the
2200 loop to be profitable relative to the cost of the scalar version of the
2201 loop.
2203 TODO: Take profile info into account before making vectorization
2204 decisions, if available. */
2207 vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo)
2209 int i;
2210 int min_profitable_iters;
2211 int peel_iters_prologue;
2212 int peel_iters_epilogue;
2213 int vec_inside_cost = 0;
2214 int vec_outside_cost = 0;
2215 int scalar_single_iter_cost = 0;
2216 int scalar_outside_cost = 0;
2217 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2218 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2219 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
2220 int nbbs = loop->num_nodes;
2221 int npeel = LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo);
2222 int peel_guard_costs = 0;
2223 int innerloop_iters = 0, factor;
2224 VEC (slp_instance, heap) *slp_instances;
2225 slp_instance instance;
2227 /* Cost model disabled. */
2228 if (!flag_vect_cost_model)
2230 if (vect_print_dump_info (REPORT_COST))
2231 fprintf (vect_dump, "cost model disabled.");
2232 return 0;
2235 /* Requires loop versioning tests to handle misalignment. */
2236 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
2238 /* FIXME: Make cost depend on complexity of individual check. */
2239 vec_outside_cost +=
2240 VEC_length (gimple, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo));
2241 if (vect_print_dump_info (REPORT_COST))
2242 fprintf (vect_dump, "cost model: Adding cost of checks for loop "
2243 "versioning to treat misalignment.\n");
2246 /* Requires loop versioning with alias checks. */
2247 if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
2249 /* FIXME: Make cost depend on complexity of individual check. */
2250 vec_outside_cost +=
2251 VEC_length (ddr_p, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo));
2252 if (vect_print_dump_info (REPORT_COST))
2253 fprintf (vect_dump, "cost model: Adding cost of checks for loop "
2254 "versioning aliasing.\n");
2257 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
2258 || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
2259 vec_outside_cost += vect_get_cost (cond_branch_taken);
2261 /* Count statements in scalar loop. Using this as scalar cost for a single
2262 iteration for now.
2264 TODO: Add outer loop support.
2266 TODO: Consider assigning different costs to different scalar
2267 statements. */
2269 /* FORNOW. */
2270 if (loop->inner)
2271 innerloop_iters = 50; /* FIXME */
2273 for (i = 0; i < nbbs; i++)
2275 gimple_stmt_iterator si;
2276 basic_block bb = bbs[i];
2278 if (bb->loop_father == loop->inner)
2279 factor = innerloop_iters;
2280 else
2281 factor = 1;
2283 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
2285 gimple stmt = gsi_stmt (si);
2286 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2287 /* Skip stmts that are not vectorized inside the loop. */
2288 if (!STMT_VINFO_RELEVANT_P (stmt_info)
2289 && (!STMT_VINFO_LIVE_P (stmt_info)
2290 || STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def))
2291 continue;
2292 vec_inside_cost += STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info) * factor;
2293 /* FIXME: for stmts in the inner-loop in outer-loop vectorization,
2294 some of the "outside" costs are generated inside the outer-loop. */
2295 vec_outside_cost += STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info);
2299 scalar_single_iter_cost = vect_get_single_scalar_iteraion_cost (loop_vinfo);
2301 /* Add additional cost for the peeled instructions in prologue and epilogue
2302 loop.
2304 FORNOW: If we don't know the value of peel_iters for prologue or epilogue
2305 at compile-time - we assume it's vf/2 (the worst would be vf-1).
2307 TODO: Build an expression that represents peel_iters for prologue and
2308 epilogue to be used in a run-time test. */
2310 if (npeel < 0)
2312 peel_iters_prologue = vf/2;
2313 if (vect_print_dump_info (REPORT_COST))
2314 fprintf (vect_dump, "cost model: "
2315 "prologue peel iters set to vf/2.");
2317 /* If peeling for alignment is unknown, loop bound of main loop becomes
2318 unknown. */
2319 peel_iters_epilogue = vf/2;
2320 if (vect_print_dump_info (REPORT_COST))
2321 fprintf (vect_dump, "cost model: "
2322 "epilogue peel iters set to vf/2 because "
2323 "peeling for alignment is unknown .");
2325 /* If peeled iterations are unknown, count a taken branch and a not taken
2326 branch per peeled loop. Even if scalar loop iterations are known,
2327 vector iterations are not known since peeled prologue iterations are
2328 not known. Hence guards remain the same. */
2329 peel_guard_costs += 2 * (vect_get_cost (cond_branch_taken)
2330 + vect_get_cost (cond_branch_not_taken));
2331 vec_outside_cost += (peel_iters_prologue * scalar_single_iter_cost)
2332 + (peel_iters_epilogue * scalar_single_iter_cost)
2333 + peel_guard_costs;
2335 else
2337 peel_iters_prologue = npeel;
2338 vec_outside_cost += vect_get_known_peeling_cost (loop_vinfo,
2339 peel_iters_prologue, &peel_iters_epilogue,
2340 scalar_single_iter_cost);
2343 /* FORNOW: The scalar outside cost is incremented in one of the
2344 following ways:
2346 1. The vectorizer checks for alignment and aliasing and generates
2347 a condition that allows dynamic vectorization. A cost model
2348 check is ANDED with the versioning condition. Hence scalar code
2349 path now has the added cost of the versioning check.
2351 if (cost > th & versioning_check)
2352 jmp to vector code
2354 Hence run-time scalar is incremented by not-taken branch cost.
2356 2. The vectorizer then checks if a prologue is required. If the
2357 cost model check was not done before during versioning, it has to
2358 be done before the prologue check.
2360 if (cost <= th)
2361 prologue = scalar_iters
2362 if (prologue == 0)
2363 jmp to vector code
2364 else
2365 execute prologue
2366 if (prologue == num_iters)
2367 go to exit
2369 Hence the run-time scalar cost is incremented by a taken branch,
2370 plus a not-taken branch, plus a taken branch cost.
2372 3. The vectorizer then checks if an epilogue is required. If the
2373 cost model check was not done before during prologue check, it
2374 has to be done with the epilogue check.
2376 if (prologue == 0)
2377 jmp to vector code
2378 else
2379 execute prologue
2380 if (prologue == num_iters)
2381 go to exit
2382 vector code:
2383 if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
2384 jmp to epilogue
2386 Hence the run-time scalar cost should be incremented by 2 taken
2387 branches.
2389 TODO: The back end may reorder the BBS's differently and reverse
2390 conditions/branch directions. Change the estimates below to
2391 something more reasonable. */
2393 /* If the number of iterations is known and we do not do versioning, we can
2394 decide whether to vectorize at compile time. Hence the scalar version
2395 do not carry cost model guard costs. */
2396 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2397 || LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
2398 || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
2400 /* Cost model check occurs at versioning. */
2401 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
2402 || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
2403 scalar_outside_cost += vect_get_cost (cond_branch_not_taken);
2404 else
2406 /* Cost model check occurs at prologue generation. */
2407 if (LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
2408 scalar_outside_cost += 2 * vect_get_cost (cond_branch_taken)
2409 + vect_get_cost (cond_branch_not_taken);
2410 /* Cost model check occurs at epilogue generation. */
2411 else
2412 scalar_outside_cost += 2 * vect_get_cost (cond_branch_taken);
2416 /* Add SLP costs. */
2417 slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
2418 FOR_EACH_VEC_ELT (slp_instance, slp_instances, i, instance)
2420 vec_outside_cost += SLP_INSTANCE_OUTSIDE_OF_LOOP_COST (instance);
2421 vec_inside_cost += SLP_INSTANCE_INSIDE_OF_LOOP_COST (instance);
2424 /* Calculate number of iterations required to make the vector version
2425 profitable, relative to the loop bodies only. The following condition
2426 must hold true:
2427 SIC * niters + SOC > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC
2428 where
2429 SIC = scalar iteration cost, VIC = vector iteration cost,
2430 VOC = vector outside cost, VF = vectorization factor,
2431 PL_ITERS = prologue iterations, EP_ITERS= epilogue iterations
2432 SOC = scalar outside cost for run time cost model check. */
2434 if ((scalar_single_iter_cost * vf) > vec_inside_cost)
2436 if (vec_outside_cost <= 0)
2437 min_profitable_iters = 1;
2438 else
2440 min_profitable_iters = ((vec_outside_cost - scalar_outside_cost) * vf
2441 - vec_inside_cost * peel_iters_prologue
2442 - vec_inside_cost * peel_iters_epilogue)
2443 / ((scalar_single_iter_cost * vf)
2444 - vec_inside_cost);
2446 if ((scalar_single_iter_cost * vf * min_profitable_iters)
2447 <= ((vec_inside_cost * min_profitable_iters)
2448 + ((vec_outside_cost - scalar_outside_cost) * vf)))
2449 min_profitable_iters++;
2452 /* vector version will never be profitable. */
2453 else
2455 if (vect_print_dump_info (REPORT_COST))
2456 fprintf (vect_dump, "cost model: the vector iteration cost = %d "
2457 "divided by the scalar iteration cost = %d "
2458 "is greater or equal to the vectorization factor = %d.",
2459 vec_inside_cost, scalar_single_iter_cost, vf);
2460 return -1;
2463 if (vect_print_dump_info (REPORT_COST))
2465 fprintf (vect_dump, "Cost model analysis: \n");
2466 fprintf (vect_dump, " Vector inside of loop cost: %d\n",
2467 vec_inside_cost);
2468 fprintf (vect_dump, " Vector outside of loop cost: %d\n",
2469 vec_outside_cost);
2470 fprintf (vect_dump, " Scalar iteration cost: %d\n",
2471 scalar_single_iter_cost);
2472 fprintf (vect_dump, " Scalar outside cost: %d\n", scalar_outside_cost);
2473 fprintf (vect_dump, " prologue iterations: %d\n",
2474 peel_iters_prologue);
2475 fprintf (vect_dump, " epilogue iterations: %d\n",
2476 peel_iters_epilogue);
2477 fprintf (vect_dump, " Calculated minimum iters for profitability: %d\n",
2478 min_profitable_iters);
2481 min_profitable_iters =
2482 min_profitable_iters < vf ? vf : min_profitable_iters;
2484 /* Because the condition we create is:
2485 if (niters <= min_profitable_iters)
2486 then skip the vectorized loop. */
2487 min_profitable_iters--;
2489 if (vect_print_dump_info (REPORT_COST))
2490 fprintf (vect_dump, " Profitability threshold = %d\n",
2491 min_profitable_iters);
2493 return min_profitable_iters;
2497 /* TODO: Close dependency between vect_model_*_cost and vectorizable_*
2498 functions. Design better to avoid maintenance issues. */
2500 /* Function vect_model_reduction_cost.
2502 Models cost for a reduction operation, including the vector ops
2503 generated within the strip-mine loop, the initial definition before
2504 the loop, and the epilogue code that must be generated. */
2506 static bool
2507 vect_model_reduction_cost (stmt_vec_info stmt_info, enum tree_code reduc_code,
2508 int ncopies)
2510 int outer_cost = 0;
2511 enum tree_code code;
2512 optab optab;
2513 tree vectype;
2514 gimple stmt, orig_stmt;
2515 tree reduction_op;
2516 enum machine_mode mode;
2517 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2518 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2521 /* Cost of reduction op inside loop. */
2522 STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info)
2523 += ncopies * vect_get_cost (vector_stmt);
2525 stmt = STMT_VINFO_STMT (stmt_info);
2527 switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
2529 case GIMPLE_SINGLE_RHS:
2530 gcc_assert (TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt)) == ternary_op);
2531 reduction_op = TREE_OPERAND (gimple_assign_rhs1 (stmt), 2);
2532 break;
2533 case GIMPLE_UNARY_RHS:
2534 reduction_op = gimple_assign_rhs1 (stmt);
2535 break;
2536 case GIMPLE_BINARY_RHS:
2537 reduction_op = gimple_assign_rhs2 (stmt);
2538 break;
2539 case GIMPLE_TERNARY_RHS:
2540 reduction_op = gimple_assign_rhs3 (stmt);
2541 break;
2542 default:
2543 gcc_unreachable ();
2546 vectype = get_vectype_for_scalar_type (TREE_TYPE (reduction_op));
2547 if (!vectype)
2549 if (vect_print_dump_info (REPORT_COST))
2551 fprintf (vect_dump, "unsupported data-type ");
2552 print_generic_expr (vect_dump, TREE_TYPE (reduction_op), TDF_SLIM);
2554 return false;
2557 mode = TYPE_MODE (vectype);
2558 orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
2560 if (!orig_stmt)
2561 orig_stmt = STMT_VINFO_STMT (stmt_info);
2563 code = gimple_assign_rhs_code (orig_stmt);
2565 /* Add in cost for initial definition. */
2566 outer_cost += vect_get_cost (scalar_to_vec);
2568 /* Determine cost of epilogue code.
2570 We have a reduction operator that will reduce the vector in one statement.
2571 Also requires scalar extract. */
2573 if (!nested_in_vect_loop_p (loop, orig_stmt))
2575 if (reduc_code != ERROR_MARK)
2576 outer_cost += vect_get_cost (vector_stmt)
2577 + vect_get_cost (vec_to_scalar);
2578 else
2580 int vec_size_in_bits = tree_low_cst (TYPE_SIZE (vectype), 1);
2581 tree bitsize =
2582 TYPE_SIZE (TREE_TYPE (gimple_assign_lhs (orig_stmt)));
2583 int element_bitsize = tree_low_cst (bitsize, 1);
2584 int nelements = vec_size_in_bits / element_bitsize;
2586 optab = optab_for_tree_code (code, vectype, optab_default);
2588 /* We have a whole vector shift available. */
2589 if (VECTOR_MODE_P (mode)
2590 && optab_handler (optab, mode) != CODE_FOR_nothing
2591 && optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
2592 /* Final reduction via vector shifts and the reduction operator. Also
2593 requires scalar extract. */
2594 outer_cost += ((exact_log2(nelements) * 2)
2595 * vect_get_cost (vector_stmt)
2596 + vect_get_cost (vec_to_scalar));
2597 else
2598 /* Use extracts and reduction op for final reduction. For N elements,
2599 we have N extracts and N-1 reduction ops. */
2600 outer_cost += ((nelements + nelements - 1)
2601 * vect_get_cost (vector_stmt));
2605 STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info) = outer_cost;
2607 if (vect_print_dump_info (REPORT_COST))
2608 fprintf (vect_dump, "vect_model_reduction_cost: inside_cost = %d, "
2609 "outside_cost = %d .", STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info),
2610 STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info));
2612 return true;
2616 /* Function vect_model_induction_cost.
2618 Models cost for induction operations. */
2620 static void
2621 vect_model_induction_cost (stmt_vec_info stmt_info, int ncopies)
2623 /* loop cost for vec_loop. */
2624 STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info)
2625 = ncopies * vect_get_cost (vector_stmt);
2626 /* prologue cost for vec_init and vec_step. */
2627 STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info)
2628 = 2 * vect_get_cost (scalar_to_vec);
2630 if (vect_print_dump_info (REPORT_COST))
2631 fprintf (vect_dump, "vect_model_induction_cost: inside_cost = %d, "
2632 "outside_cost = %d .", STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info),
2633 STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info));
2637 /* Function get_initial_def_for_induction
2639 Input:
2640 STMT - a stmt that performs an induction operation in the loop.
2641 IV_PHI - the initial value of the induction variable
2643 Output:
2644 Return a vector variable, initialized with the first VF values of
2645 the induction variable. E.g., for an iv with IV_PHI='X' and
2646 evolution S, for a vector of 4 units, we want to return:
2647 [X, X + S, X + 2*S, X + 3*S]. */
2649 static tree
2650 get_initial_def_for_induction (gimple iv_phi)
2652 stmt_vec_info stmt_vinfo = vinfo_for_stmt (iv_phi);
2653 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
2654 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2655 tree scalar_type;
2656 tree vectype;
2657 int nunits;
2658 edge pe = loop_preheader_edge (loop);
2659 struct loop *iv_loop;
2660 basic_block new_bb;
2661 tree vec, vec_init, vec_step, t;
2662 tree access_fn;
2663 tree new_var;
2664 tree new_name;
2665 gimple init_stmt, induction_phi, new_stmt;
2666 tree induc_def, vec_def, vec_dest;
2667 tree init_expr, step_expr;
2668 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2669 int i;
2670 bool ok;
2671 int ncopies;
2672 tree expr;
2673 stmt_vec_info phi_info = vinfo_for_stmt (iv_phi);
2674 bool nested_in_vect_loop = false;
2675 gimple_seq stmts = NULL;
2676 imm_use_iterator imm_iter;
2677 use_operand_p use_p;
2678 gimple exit_phi;
2679 edge latch_e;
2680 tree loop_arg;
2681 gimple_stmt_iterator si;
2682 basic_block bb = gimple_bb (iv_phi);
2683 tree stepvectype;
2684 tree resvectype;
2686 /* Is phi in an inner-loop, while vectorizing an enclosing outer-loop? */
2687 if (nested_in_vect_loop_p (loop, iv_phi))
2689 nested_in_vect_loop = true;
2690 iv_loop = loop->inner;
2692 else
2693 iv_loop = loop;
2694 gcc_assert (iv_loop == (gimple_bb (iv_phi))->loop_father);
2696 latch_e = loop_latch_edge (iv_loop);
2697 loop_arg = PHI_ARG_DEF_FROM_EDGE (iv_phi, latch_e);
2699 access_fn = analyze_scalar_evolution (iv_loop, PHI_RESULT (iv_phi));
2700 gcc_assert (access_fn);
2701 STRIP_NOPS (access_fn);
2702 ok = vect_is_simple_iv_evolution (iv_loop->num, access_fn,
2703 &init_expr, &step_expr);
2704 gcc_assert (ok);
2705 pe = loop_preheader_edge (iv_loop);
2707 scalar_type = TREE_TYPE (init_expr);
2708 vectype = get_vectype_for_scalar_type (scalar_type);
2709 resvectype = get_vectype_for_scalar_type (TREE_TYPE (PHI_RESULT (iv_phi)));
2710 gcc_assert (vectype);
2711 nunits = TYPE_VECTOR_SUBPARTS (vectype);
2712 ncopies = vf / nunits;
2714 gcc_assert (phi_info);
2715 gcc_assert (ncopies >= 1);
2717 /* Find the first insertion point in the BB. */
2718 si = gsi_after_labels (bb);
2720 /* Create the vector that holds the initial_value of the induction. */
2721 if (nested_in_vect_loop)
2723 /* iv_loop is nested in the loop to be vectorized. init_expr had already
2724 been created during vectorization of previous stmts. We obtain it
2725 from the STMT_VINFO_VEC_STMT of the defining stmt. */
2726 tree iv_def = PHI_ARG_DEF_FROM_EDGE (iv_phi,
2727 loop_preheader_edge (iv_loop));
2728 vec_init = vect_get_vec_def_for_operand (iv_def, iv_phi, NULL);
2730 else
2732 /* iv_loop is the loop to be vectorized. Create:
2733 vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr) */
2734 new_var = vect_get_new_vect_var (scalar_type, vect_scalar_var, "var_");
2735 add_referenced_var (new_var);
2737 new_name = force_gimple_operand (init_expr, &stmts, false, new_var);
2738 if (stmts)
2740 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
2741 gcc_assert (!new_bb);
2744 t = NULL_TREE;
2745 t = tree_cons (NULL_TREE, new_name, t);
2746 for (i = 1; i < nunits; i++)
2748 /* Create: new_name_i = new_name + step_expr */
2749 enum tree_code code = POINTER_TYPE_P (scalar_type)
2750 ? POINTER_PLUS_EXPR : PLUS_EXPR;
2751 init_stmt = gimple_build_assign_with_ops (code, new_var,
2752 new_name, step_expr);
2753 new_name = make_ssa_name (new_var, init_stmt);
2754 gimple_assign_set_lhs (init_stmt, new_name);
2756 new_bb = gsi_insert_on_edge_immediate (pe, init_stmt);
2757 gcc_assert (!new_bb);
2759 if (vect_print_dump_info (REPORT_DETAILS))
2761 fprintf (vect_dump, "created new init_stmt: ");
2762 print_gimple_stmt (vect_dump, init_stmt, 0, TDF_SLIM);
2764 t = tree_cons (NULL_TREE, new_name, t);
2766 /* Create a vector from [new_name_0, new_name_1, ..., new_name_nunits-1] */
2767 vec = build_constructor_from_list (vectype, nreverse (t));
2768 vec_init = vect_init_vector (iv_phi, vec, vectype, NULL);
2772 /* Create the vector that holds the step of the induction. */
2773 if (nested_in_vect_loop)
2774 /* iv_loop is nested in the loop to be vectorized. Generate:
2775 vec_step = [S, S, S, S] */
2776 new_name = step_expr;
2777 else
2779 /* iv_loop is the loop to be vectorized. Generate:
2780 vec_step = [VF*S, VF*S, VF*S, VF*S] */
2781 expr = build_int_cst (TREE_TYPE (step_expr), vf);
2782 new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
2783 expr, step_expr);
2786 t = unshare_expr (new_name);
2787 gcc_assert (CONSTANT_CLASS_P (new_name));
2788 stepvectype = get_vectype_for_scalar_type (TREE_TYPE (new_name));
2789 gcc_assert (stepvectype);
2790 vec = build_vector_from_val (stepvectype, t);
2791 vec_step = vect_init_vector (iv_phi, vec, stepvectype, NULL);
2794 /* Create the following def-use cycle:
2795 loop prolog:
2796 vec_init = ...
2797 vec_step = ...
2798 loop:
2799 vec_iv = PHI <vec_init, vec_loop>
2801 STMT
2803 vec_loop = vec_iv + vec_step; */
2805 /* Create the induction-phi that defines the induction-operand. */
2806 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
2807 add_referenced_var (vec_dest);
2808 induction_phi = create_phi_node (vec_dest, iv_loop->header);
2809 set_vinfo_for_stmt (induction_phi,
2810 new_stmt_vec_info (induction_phi, loop_vinfo, NULL));
2811 induc_def = PHI_RESULT (induction_phi);
2813 /* Create the iv update inside the loop */
2814 new_stmt = gimple_build_assign_with_ops (PLUS_EXPR, vec_dest,
2815 induc_def, vec_step);
2816 vec_def = make_ssa_name (vec_dest, new_stmt);
2817 gimple_assign_set_lhs (new_stmt, vec_def);
2818 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
2819 set_vinfo_for_stmt (new_stmt, new_stmt_vec_info (new_stmt, loop_vinfo,
2820 NULL));
2822 /* Set the arguments of the phi node: */
2823 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
2824 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
2825 UNKNOWN_LOCATION);
2828 /* In case that vectorization factor (VF) is bigger than the number
2829 of elements that we can fit in a vectype (nunits), we have to generate
2830 more than one vector stmt - i.e - we need to "unroll" the
2831 vector stmt by a factor VF/nunits. For more details see documentation
2832 in vectorizable_operation. */
2834 if (ncopies > 1)
2836 stmt_vec_info prev_stmt_vinfo;
2837 /* FORNOW. This restriction should be relaxed. */
2838 gcc_assert (!nested_in_vect_loop);
2840 /* Create the vector that holds the step of the induction. */
2841 expr = build_int_cst (TREE_TYPE (step_expr), nunits);
2842 new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
2843 expr, step_expr);
2844 t = unshare_expr (new_name);
2845 gcc_assert (CONSTANT_CLASS_P (new_name));
2846 vec = build_vector_from_val (stepvectype, t);
2847 vec_step = vect_init_vector (iv_phi, vec, stepvectype, NULL);
2849 vec_def = induc_def;
2850 prev_stmt_vinfo = vinfo_for_stmt (induction_phi);
2851 for (i = 1; i < ncopies; i++)
2853 /* vec_i = vec_prev + vec_step */
2854 new_stmt = gimple_build_assign_with_ops (PLUS_EXPR, vec_dest,
2855 vec_def, vec_step);
2856 vec_def = make_ssa_name (vec_dest, new_stmt);
2857 gimple_assign_set_lhs (new_stmt, vec_def);
2859 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
2860 if (!useless_type_conversion_p (resvectype, vectype))
2862 new_stmt = gimple_build_assign_with_ops
2863 (VIEW_CONVERT_EXPR,
2864 vect_get_new_vect_var (resvectype, vect_simple_var,
2865 "vec_iv_"),
2866 build1 (VIEW_CONVERT_EXPR, resvectype,
2867 gimple_assign_lhs (new_stmt)), NULL_TREE);
2868 gimple_assign_set_lhs (new_stmt,
2869 make_ssa_name
2870 (gimple_assign_lhs (new_stmt), new_stmt));
2871 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
2873 set_vinfo_for_stmt (new_stmt,
2874 new_stmt_vec_info (new_stmt, loop_vinfo, NULL));
2875 STMT_VINFO_RELATED_STMT (prev_stmt_vinfo) = new_stmt;
2876 prev_stmt_vinfo = vinfo_for_stmt (new_stmt);
2880 if (nested_in_vect_loop)
2882 /* Find the loop-closed exit-phi of the induction, and record
2883 the final vector of induction results: */
2884 exit_phi = NULL;
2885 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
2887 if (!flow_bb_inside_loop_p (iv_loop, gimple_bb (USE_STMT (use_p))))
2889 exit_phi = USE_STMT (use_p);
2890 break;
2893 if (exit_phi)
2895 stmt_vec_info stmt_vinfo = vinfo_for_stmt (exit_phi);
2896 /* FORNOW. Currently not supporting the case that an inner-loop induction
2897 is not used in the outer-loop (i.e. only outside the outer-loop). */
2898 gcc_assert (STMT_VINFO_RELEVANT_P (stmt_vinfo)
2899 && !STMT_VINFO_LIVE_P (stmt_vinfo));
2901 STMT_VINFO_VEC_STMT (stmt_vinfo) = new_stmt;
2902 if (vect_print_dump_info (REPORT_DETAILS))
2904 fprintf (vect_dump, "vector of inductions after inner-loop:");
2905 print_gimple_stmt (vect_dump, new_stmt, 0, TDF_SLIM);
2911 if (vect_print_dump_info (REPORT_DETAILS))
2913 fprintf (vect_dump, "transform induction: created def-use cycle: ");
2914 print_gimple_stmt (vect_dump, induction_phi, 0, TDF_SLIM);
2915 fprintf (vect_dump, "\n");
2916 print_gimple_stmt (vect_dump, SSA_NAME_DEF_STMT (vec_def), 0, TDF_SLIM);
2919 STMT_VINFO_VEC_STMT (phi_info) = induction_phi;
2920 if (!useless_type_conversion_p (resvectype, vectype))
2922 new_stmt = gimple_build_assign_with_ops
2923 (VIEW_CONVERT_EXPR,
2924 vect_get_new_vect_var (resvectype, vect_simple_var, "vec_iv_"),
2925 build1 (VIEW_CONVERT_EXPR, resvectype, induc_def), NULL_TREE);
2926 induc_def = make_ssa_name (gimple_assign_lhs (new_stmt), new_stmt);
2927 gimple_assign_set_lhs (new_stmt, induc_def);
2928 si = gsi_start_bb (bb);
2929 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
2930 set_vinfo_for_stmt (new_stmt,
2931 new_stmt_vec_info (new_stmt, loop_vinfo, NULL));
2932 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (new_stmt))
2933 = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (induction_phi));
2936 return induc_def;
2940 /* Function get_initial_def_for_reduction
2942 Input:
2943 STMT - a stmt that performs a reduction operation in the loop.
2944 INIT_VAL - the initial value of the reduction variable
2946 Output:
2947 ADJUSTMENT_DEF - a tree that holds a value to be added to the final result
2948 of the reduction (used for adjusting the epilog - see below).
2949 Return a vector variable, initialized according to the operation that STMT
2950 performs. This vector will be used as the initial value of the
2951 vector of partial results.
2953 Option1 (adjust in epilog): Initialize the vector as follows:
2954 add/bit or/xor: [0,0,...,0,0]
2955 mult/bit and: [1,1,...,1,1]
2956 min/max/cond_expr: [init_val,init_val,..,init_val,init_val]
2957 and when necessary (e.g. add/mult case) let the caller know
2958 that it needs to adjust the result by init_val.
2960 Option2: Initialize the vector as follows:
2961 add/bit or/xor: [init_val,0,0,...,0]
2962 mult/bit and: [init_val,1,1,...,1]
2963 min/max/cond_expr: [init_val,init_val,...,init_val]
2964 and no adjustments are needed.
2966 For example, for the following code:
2968 s = init_val;
2969 for (i=0;i<n;i++)
2970 s = s + a[i];
2972 STMT is 's = s + a[i]', and the reduction variable is 's'.
2973 For a vector of 4 units, we want to return either [0,0,0,init_val],
2974 or [0,0,0,0] and let the caller know that it needs to adjust
2975 the result at the end by 'init_val'.
2977 FORNOW, we are using the 'adjust in epilog' scheme, because this way the
2978 initialization vector is simpler (same element in all entries), if
2979 ADJUSTMENT_DEF is not NULL, and Option2 otherwise.
2981 A cost model should help decide between these two schemes. */
2983 tree
2984 get_initial_def_for_reduction (gimple stmt, tree init_val,
2985 tree *adjustment_def)
2987 stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
2988 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
2989 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2990 tree scalar_type = TREE_TYPE (init_val);
2991 tree vectype = get_vectype_for_scalar_type (scalar_type);
2992 int nunits;
2993 enum tree_code code = gimple_assign_rhs_code (stmt);
2994 tree def_for_init;
2995 tree init_def;
2996 tree t = NULL_TREE;
2997 int i;
2998 bool nested_in_vect_loop = false;
2999 tree init_value;
3000 REAL_VALUE_TYPE real_init_val = dconst0;
3001 int int_init_val = 0;
3002 gimple def_stmt = NULL;
3004 gcc_assert (vectype);
3005 nunits = TYPE_VECTOR_SUBPARTS (vectype);
3007 gcc_assert (POINTER_TYPE_P (scalar_type) || INTEGRAL_TYPE_P (scalar_type)
3008 || SCALAR_FLOAT_TYPE_P (scalar_type));
3010 if (nested_in_vect_loop_p (loop, stmt))
3011 nested_in_vect_loop = true;
3012 else
3013 gcc_assert (loop == (gimple_bb (stmt))->loop_father);
3015 /* In case of double reduction we only create a vector variable to be put
3016 in the reduction phi node. The actual statement creation is done in
3017 vect_create_epilog_for_reduction. */
3018 if (adjustment_def && nested_in_vect_loop
3019 && TREE_CODE (init_val) == SSA_NAME
3020 && (def_stmt = SSA_NAME_DEF_STMT (init_val))
3021 && gimple_code (def_stmt) == GIMPLE_PHI
3022 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
3023 && vinfo_for_stmt (def_stmt)
3024 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
3025 == vect_double_reduction_def)
3027 *adjustment_def = NULL;
3028 return vect_create_destination_var (init_val, vectype);
3031 if (TREE_CONSTANT (init_val))
3033 if (SCALAR_FLOAT_TYPE_P (scalar_type))
3034 init_value = build_real (scalar_type, TREE_REAL_CST (init_val));
3035 else
3036 init_value = build_int_cst (scalar_type, TREE_INT_CST_LOW (init_val));
3038 else
3039 init_value = init_val;
3041 switch (code)
3043 case WIDEN_SUM_EXPR:
3044 case DOT_PROD_EXPR:
3045 case PLUS_EXPR:
3046 case MINUS_EXPR:
3047 case BIT_IOR_EXPR:
3048 case BIT_XOR_EXPR:
3049 case MULT_EXPR:
3050 case BIT_AND_EXPR:
3051 /* ADJUSMENT_DEF is NULL when called from
3052 vect_create_epilog_for_reduction to vectorize double reduction. */
3053 if (adjustment_def)
3055 if (nested_in_vect_loop)
3056 *adjustment_def = vect_get_vec_def_for_operand (init_val, stmt,
3057 NULL);
3058 else
3059 *adjustment_def = init_val;
3062 if (code == MULT_EXPR)
3064 real_init_val = dconst1;
3065 int_init_val = 1;
3068 if (code == BIT_AND_EXPR)
3069 int_init_val = -1;
3071 if (SCALAR_FLOAT_TYPE_P (scalar_type))
3072 def_for_init = build_real (scalar_type, real_init_val);
3073 else
3074 def_for_init = build_int_cst (scalar_type, int_init_val);
3076 /* Create a vector of '0' or '1' except the first element. */
3077 for (i = nunits - 2; i >= 0; --i)
3078 t = tree_cons (NULL_TREE, def_for_init, t);
3080 /* Option1: the first element is '0' or '1' as well. */
3081 if (adjustment_def)
3083 t = tree_cons (NULL_TREE, def_for_init, t);
3084 init_def = build_vector (vectype, t);
3085 break;
3088 /* Option2: the first element is INIT_VAL. */
3089 t = tree_cons (NULL_TREE, init_value, t);
3090 if (TREE_CONSTANT (init_val))
3091 init_def = build_vector (vectype, t);
3092 else
3093 init_def = build_constructor_from_list (vectype, t);
3095 break;
3097 case MIN_EXPR:
3098 case MAX_EXPR:
3099 case COND_EXPR:
3100 if (adjustment_def)
3102 *adjustment_def = NULL_TREE;
3103 init_def = vect_get_vec_def_for_operand (init_val, stmt, NULL);
3104 break;
3107 init_def = build_vector_from_val (vectype, init_value);
3108 break;
3110 default:
3111 gcc_unreachable ();
3114 return init_def;
3118 /* Function vect_create_epilog_for_reduction
3120 Create code at the loop-epilog to finalize the result of a reduction
3121 computation.
3123 VECT_DEFS is list of vector of partial results, i.e., the lhs's of vector
3124 reduction statements.
3125 STMT is the scalar reduction stmt that is being vectorized.
3126 NCOPIES is > 1 in case the vectorization factor (VF) is bigger than the
3127 number of elements that we can fit in a vectype (nunits). In this case
3128 we have to generate more than one vector stmt - i.e - we need to "unroll"
3129 the vector stmt by a factor VF/nunits. For more details see documentation
3130 in vectorizable_operation.
3131 REDUC_CODE is the tree-code for the epilog reduction.
3132 REDUCTION_PHIS is a list of the phi-nodes that carry the reduction
3133 computation.
3134 REDUC_INDEX is the index of the operand in the right hand side of the
3135 statement that is defined by REDUCTION_PHI.
3136 DOUBLE_REDUC is TRUE if double reduction phi nodes should be handled.
3137 SLP_NODE is an SLP node containing a group of reduction statements. The
3138 first one in this group is STMT.
3140 This function:
3141 1. Creates the reduction def-use cycles: sets the arguments for
3142 REDUCTION_PHIS:
3143 The loop-entry argument is the vectorized initial-value of the reduction.
3144 The loop-latch argument is taken from VECT_DEFS - the vector of partial
3145 sums.
3146 2. "Reduces" each vector of partial results VECT_DEFS into a single result,
3147 by applying the operation specified by REDUC_CODE if available, or by
3148 other means (whole-vector shifts or a scalar loop).
3149 The function also creates a new phi node at the loop exit to preserve
3150 loop-closed form, as illustrated below.
3152 The flow at the entry to this function:
3154 loop:
3155 vec_def = phi <null, null> # REDUCTION_PHI
3156 VECT_DEF = vector_stmt # vectorized form of STMT
3157 s_loop = scalar_stmt # (scalar) STMT
3158 loop_exit:
3159 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
3160 use <s_out0>
3161 use <s_out0>
3163 The above is transformed by this function into:
3165 loop:
3166 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
3167 VECT_DEF = vector_stmt # vectorized form of STMT
3168 s_loop = scalar_stmt # (scalar) STMT
3169 loop_exit:
3170 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
3171 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
3172 v_out2 = reduce <v_out1>
3173 s_out3 = extract_field <v_out2, 0>
3174 s_out4 = adjust_result <s_out3>
3175 use <s_out4>
3176 use <s_out4>
3179 static void
3180 vect_create_epilog_for_reduction (VEC (tree, heap) *vect_defs, gimple stmt,
3181 int ncopies, enum tree_code reduc_code,
3182 VEC (gimple, heap) *reduction_phis,
3183 int reduc_index, bool double_reduc,
3184 slp_tree slp_node)
3186 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3187 stmt_vec_info prev_phi_info;
3188 tree vectype;
3189 enum machine_mode mode;
3190 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3191 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *outer_loop = NULL;
3192 basic_block exit_bb;
3193 tree scalar_dest;
3194 tree scalar_type;
3195 gimple new_phi = NULL, phi;
3196 gimple_stmt_iterator exit_gsi;
3197 tree vec_dest;
3198 tree new_temp = NULL_TREE, new_dest, new_name, new_scalar_dest;
3199 gimple epilog_stmt = NULL;
3200 enum tree_code code = gimple_assign_rhs_code (stmt);
3201 gimple exit_phi;
3202 tree bitsize, bitpos;
3203 tree adjustment_def = NULL;
3204 tree vec_initial_def = NULL;
3205 tree reduction_op, expr, def;
3206 tree orig_name, scalar_result;
3207 imm_use_iterator imm_iter, phi_imm_iter;
3208 use_operand_p use_p, phi_use_p;
3209 bool extract_scalar_result = false;
3210 gimple use_stmt, orig_stmt, reduction_phi = NULL;
3211 bool nested_in_vect_loop = false;
3212 VEC (gimple, heap) *new_phis = NULL;
3213 enum vect_def_type dt = vect_unknown_def_type;
3214 int j, i;
3215 VEC (tree, heap) *scalar_results = NULL;
3216 unsigned int group_size = 1, k, ratio;
3217 VEC (tree, heap) *vec_initial_defs = NULL;
3218 VEC (gimple, heap) *phis;
3220 if (slp_node)
3221 group_size = VEC_length (gimple, SLP_TREE_SCALAR_STMTS (slp_node));
3223 if (nested_in_vect_loop_p (loop, stmt))
3225 outer_loop = loop;
3226 loop = loop->inner;
3227 nested_in_vect_loop = true;
3228 gcc_assert (!slp_node);
3231 switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
3233 case GIMPLE_SINGLE_RHS:
3234 gcc_assert (TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt))
3235 == ternary_op);
3236 reduction_op = TREE_OPERAND (gimple_assign_rhs1 (stmt), reduc_index);
3237 break;
3238 case GIMPLE_UNARY_RHS:
3239 reduction_op = gimple_assign_rhs1 (stmt);
3240 break;
3241 case GIMPLE_BINARY_RHS:
3242 reduction_op = reduc_index ?
3243 gimple_assign_rhs2 (stmt) : gimple_assign_rhs1 (stmt);
3244 break;
3245 case GIMPLE_TERNARY_RHS:
3246 reduction_op = gimple_op (stmt, reduc_index + 1);
3247 break;
3248 default:
3249 gcc_unreachable ();
3252 vectype = get_vectype_for_scalar_type (TREE_TYPE (reduction_op));
3253 gcc_assert (vectype);
3254 mode = TYPE_MODE (vectype);
3256 /* 1. Create the reduction def-use cycle:
3257 Set the arguments of REDUCTION_PHIS, i.e., transform
3259 loop:
3260 vec_def = phi <null, null> # REDUCTION_PHI
3261 VECT_DEF = vector_stmt # vectorized form of STMT
3264 into:
3266 loop:
3267 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
3268 VECT_DEF = vector_stmt # vectorized form of STMT
3271 (in case of SLP, do it for all the phis). */
3273 /* Get the loop-entry arguments. */
3274 if (slp_node)
3275 vect_get_slp_defs (reduction_op, NULL_TREE, slp_node, &vec_initial_defs,
3276 NULL, reduc_index);
3277 else
3279 vec_initial_defs = VEC_alloc (tree, heap, 1);
3280 /* For the case of reduction, vect_get_vec_def_for_operand returns
3281 the scalar def before the loop, that defines the initial value
3282 of the reduction variable. */
3283 vec_initial_def = vect_get_vec_def_for_operand (reduction_op, stmt,
3284 &adjustment_def);
3285 VEC_quick_push (tree, vec_initial_defs, vec_initial_def);
3288 /* Set phi nodes arguments. */
3289 FOR_EACH_VEC_ELT (gimple, reduction_phis, i, phi)
3291 tree vec_init_def = VEC_index (tree, vec_initial_defs, i);
3292 tree def = VEC_index (tree, vect_defs, i);
3293 for (j = 0; j < ncopies; j++)
3295 /* Set the loop-entry arg of the reduction-phi. */
3296 add_phi_arg (phi, vec_init_def, loop_preheader_edge (loop),
3297 UNKNOWN_LOCATION);
3299 /* Set the loop-latch arg for the reduction-phi. */
3300 if (j > 0)
3301 def = vect_get_vec_def_for_stmt_copy (vect_unknown_def_type, def);
3303 add_phi_arg (phi, def, loop_latch_edge (loop), UNKNOWN_LOCATION);
3305 if (vect_print_dump_info (REPORT_DETAILS))
3307 fprintf (vect_dump, "transform reduction: created def-use"
3308 " cycle: ");
3309 print_gimple_stmt (vect_dump, phi, 0, TDF_SLIM);
3310 fprintf (vect_dump, "\n");
3311 print_gimple_stmt (vect_dump, SSA_NAME_DEF_STMT (def), 0,
3312 TDF_SLIM);
3315 phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi));
3319 VEC_free (tree, heap, vec_initial_defs);
3321 /* 2. Create epilog code.
3322 The reduction epilog code operates across the elements of the vector
3323 of partial results computed by the vectorized loop.
3324 The reduction epilog code consists of:
3326 step 1: compute the scalar result in a vector (v_out2)
3327 step 2: extract the scalar result (s_out3) from the vector (v_out2)
3328 step 3: adjust the scalar result (s_out3) if needed.
3330 Step 1 can be accomplished using one the following three schemes:
3331 (scheme 1) using reduc_code, if available.
3332 (scheme 2) using whole-vector shifts, if available.
3333 (scheme 3) using a scalar loop. In this case steps 1+2 above are
3334 combined.
3336 The overall epilog code looks like this:
3338 s_out0 = phi <s_loop> # original EXIT_PHI
3339 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
3340 v_out2 = reduce <v_out1> # step 1
3341 s_out3 = extract_field <v_out2, 0> # step 2
3342 s_out4 = adjust_result <s_out3> # step 3
3344 (step 3 is optional, and steps 1 and 2 may be combined).
3345 Lastly, the uses of s_out0 are replaced by s_out4. */
3348 /* 2.1 Create new loop-exit-phis to preserve loop-closed form:
3349 v_out1 = phi <VECT_DEF>
3350 Store them in NEW_PHIS. */
3352 exit_bb = single_exit (loop)->dest;
3353 prev_phi_info = NULL;
3354 new_phis = VEC_alloc (gimple, heap, VEC_length (tree, vect_defs));
3355 FOR_EACH_VEC_ELT (tree, vect_defs, i, def)
3357 for (j = 0; j < ncopies; j++)
3359 phi = create_phi_node (SSA_NAME_VAR (def), exit_bb);
3360 set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, loop_vinfo, NULL));
3361 if (j == 0)
3362 VEC_quick_push (gimple, new_phis, phi);
3363 else
3365 def = vect_get_vec_def_for_stmt_copy (dt, def);
3366 STMT_VINFO_RELATED_STMT (prev_phi_info) = phi;
3369 SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, def);
3370 prev_phi_info = vinfo_for_stmt (phi);
3374 /* The epilogue is created for the outer-loop, i.e., for the loop being
3375 vectorized. */
3376 if (double_reduc)
3378 loop = outer_loop;
3379 exit_bb = single_exit (loop)->dest;
3382 exit_gsi = gsi_after_labels (exit_bb);
3384 /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
3385 (i.e. when reduc_code is not available) and in the final adjustment
3386 code (if needed). Also get the original scalar reduction variable as
3387 defined in the loop. In case STMT is a "pattern-stmt" (i.e. - it
3388 represents a reduction pattern), the tree-code and scalar-def are
3389 taken from the original stmt that the pattern-stmt (STMT) replaces.
3390 Otherwise (it is a regular reduction) - the tree-code and scalar-def
3391 are taken from STMT. */
3393 orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
3394 if (!orig_stmt)
3396 /* Regular reduction */
3397 orig_stmt = stmt;
3399 else
3401 /* Reduction pattern */
3402 stmt_vec_info stmt_vinfo = vinfo_for_stmt (orig_stmt);
3403 gcc_assert (STMT_VINFO_IN_PATTERN_P (stmt_vinfo));
3404 gcc_assert (STMT_VINFO_RELATED_STMT (stmt_vinfo) == stmt);
3407 code = gimple_assign_rhs_code (orig_stmt);
3408 /* For MINUS_EXPR the initial vector is [init_val,0,...,0], therefore,
3409 partial results are added and not subtracted. */
3410 if (code == MINUS_EXPR)
3411 code = PLUS_EXPR;
3413 scalar_dest = gimple_assign_lhs (orig_stmt);
3414 scalar_type = TREE_TYPE (scalar_dest);
3415 scalar_results = VEC_alloc (tree, heap, group_size);
3416 new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
3417 bitsize = TYPE_SIZE (scalar_type);
3419 /* In case this is a reduction in an inner-loop while vectorizing an outer
3420 loop - we don't need to extract a single scalar result at the end of the
3421 inner-loop (unless it is double reduction, i.e., the use of reduction is
3422 outside the outer-loop). The final vector of partial results will be used
3423 in the vectorized outer-loop, or reduced to a scalar result at the end of
3424 the outer-loop. */
3425 if (nested_in_vect_loop && !double_reduc)
3426 goto vect_finalize_reduction;
3428 /* 2.3 Create the reduction code, using one of the three schemes described
3429 above. In SLP we simply need to extract all the elements from the
3430 vector (without reducing them), so we use scalar shifts. */
3431 if (reduc_code != ERROR_MARK && !slp_node)
3433 tree tmp;
3435 /*** Case 1: Create:
3436 v_out2 = reduc_expr <v_out1> */
3438 if (vect_print_dump_info (REPORT_DETAILS))
3439 fprintf (vect_dump, "Reduce using direct vector reduction.");
3441 vec_dest = vect_create_destination_var (scalar_dest, vectype);
3442 new_phi = VEC_index (gimple, new_phis, 0);
3443 tmp = build1 (reduc_code, vectype, PHI_RESULT (new_phi));
3444 epilog_stmt = gimple_build_assign (vec_dest, tmp);
3445 new_temp = make_ssa_name (vec_dest, epilog_stmt);
3446 gimple_assign_set_lhs (epilog_stmt, new_temp);
3447 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
3449 extract_scalar_result = true;
3451 else
3453 enum tree_code shift_code = ERROR_MARK;
3454 bool have_whole_vector_shift = true;
3455 int bit_offset;
3456 int element_bitsize = tree_low_cst (bitsize, 1);
3457 int vec_size_in_bits = tree_low_cst (TYPE_SIZE (vectype), 1);
3458 tree vec_temp;
3460 if (optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
3461 shift_code = VEC_RSHIFT_EXPR;
3462 else
3463 have_whole_vector_shift = false;
3465 /* Regardless of whether we have a whole vector shift, if we're
3466 emulating the operation via tree-vect-generic, we don't want
3467 to use it. Only the first round of the reduction is likely
3468 to still be profitable via emulation. */
3469 /* ??? It might be better to emit a reduction tree code here, so that
3470 tree-vect-generic can expand the first round via bit tricks. */
3471 if (!VECTOR_MODE_P (mode))
3472 have_whole_vector_shift = false;
3473 else
3475 optab optab = optab_for_tree_code (code, vectype, optab_default);
3476 if (optab_handler (optab, mode) == CODE_FOR_nothing)
3477 have_whole_vector_shift = false;
3480 if (have_whole_vector_shift && !slp_node)
3482 /*** Case 2: Create:
3483 for (offset = VS/2; offset >= element_size; offset/=2)
3485 Create: va' = vec_shift <va, offset>
3486 Create: va = vop <va, va'>
3487 } */
3489 if (vect_print_dump_info (REPORT_DETAILS))
3490 fprintf (vect_dump, "Reduce using vector shifts");
3492 vec_dest = vect_create_destination_var (scalar_dest, vectype);
3493 new_phi = VEC_index (gimple, new_phis, 0);
3494 new_temp = PHI_RESULT (new_phi);
3495 for (bit_offset = vec_size_in_bits/2;
3496 bit_offset >= element_bitsize;
3497 bit_offset /= 2)
3499 tree bitpos = size_int (bit_offset);
3501 epilog_stmt = gimple_build_assign_with_ops (shift_code,
3502 vec_dest, new_temp, bitpos);
3503 new_name = make_ssa_name (vec_dest, epilog_stmt);
3504 gimple_assign_set_lhs (epilog_stmt, new_name);
3505 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
3507 epilog_stmt = gimple_build_assign_with_ops (code, vec_dest,
3508 new_name, new_temp);
3509 new_temp = make_ssa_name (vec_dest, epilog_stmt);
3510 gimple_assign_set_lhs (epilog_stmt, new_temp);
3511 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
3514 extract_scalar_result = true;
3516 else
3518 tree rhs;
3520 /*** Case 3: Create:
3521 s = extract_field <v_out2, 0>
3522 for (offset = element_size;
3523 offset < vector_size;
3524 offset += element_size;)
3526 Create: s' = extract_field <v_out2, offset>
3527 Create: s = op <s, s'> // For non SLP cases
3528 } */
3530 if (vect_print_dump_info (REPORT_DETAILS))
3531 fprintf (vect_dump, "Reduce using scalar code. ");
3533 vec_size_in_bits = tree_low_cst (TYPE_SIZE (vectype), 1);
3534 FOR_EACH_VEC_ELT (gimple, new_phis, i, new_phi)
3536 vec_temp = PHI_RESULT (new_phi);
3537 rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, bitsize,
3538 bitsize_zero_node);
3539 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
3540 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
3541 gimple_assign_set_lhs (epilog_stmt, new_temp);
3542 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
3544 /* In SLP we don't need to apply reduction operation, so we just
3545 collect s' values in SCALAR_RESULTS. */
3546 if (slp_node)
3547 VEC_safe_push (tree, heap, scalar_results, new_temp);
3549 for (bit_offset = element_bitsize;
3550 bit_offset < vec_size_in_bits;
3551 bit_offset += element_bitsize)
3553 tree bitpos = bitsize_int (bit_offset);
3554 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp,
3555 bitsize, bitpos);
3557 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
3558 new_name = make_ssa_name (new_scalar_dest, epilog_stmt);
3559 gimple_assign_set_lhs (epilog_stmt, new_name);
3560 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
3562 if (slp_node)
3564 /* In SLP we don't need to apply reduction operation, so
3565 we just collect s' values in SCALAR_RESULTS. */
3566 new_temp = new_name;
3567 VEC_safe_push (tree, heap, scalar_results, new_name);
3569 else
3571 epilog_stmt = gimple_build_assign_with_ops (code,
3572 new_scalar_dest, new_name, new_temp);
3573 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
3574 gimple_assign_set_lhs (epilog_stmt, new_temp);
3575 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
3580 /* The only case where we need to reduce scalar results in SLP, is
3581 unrolling. If the size of SCALAR_RESULTS is greater than
3582 GROUP_SIZE, we reduce them combining elements modulo
3583 GROUP_SIZE. */
3584 if (slp_node)
3586 tree res, first_res, new_res;
3587 gimple new_stmt;
3589 /* Reduce multiple scalar results in case of SLP unrolling. */
3590 for (j = group_size; VEC_iterate (tree, scalar_results, j, res);
3591 j++)
3593 first_res = VEC_index (tree, scalar_results, j % group_size);
3594 new_stmt = gimple_build_assign_with_ops (code,
3595 new_scalar_dest, first_res, res);
3596 new_res = make_ssa_name (new_scalar_dest, new_stmt);
3597 gimple_assign_set_lhs (new_stmt, new_res);
3598 gsi_insert_before (&exit_gsi, new_stmt, GSI_SAME_STMT);
3599 VEC_replace (tree, scalar_results, j % group_size, new_res);
3602 else
3603 /* Not SLP - we have one scalar to keep in SCALAR_RESULTS. */
3604 VEC_safe_push (tree, heap, scalar_results, new_temp);
3606 extract_scalar_result = false;
3610 /* 2.4 Extract the final scalar result. Create:
3611 s_out3 = extract_field <v_out2, bitpos> */
3613 if (extract_scalar_result)
3615 tree rhs;
3617 if (vect_print_dump_info (REPORT_DETAILS))
3618 fprintf (vect_dump, "extract scalar result");
3620 if (BYTES_BIG_ENDIAN)
3621 bitpos = size_binop (MULT_EXPR,
3622 bitsize_int (TYPE_VECTOR_SUBPARTS (vectype) - 1),
3623 TYPE_SIZE (scalar_type));
3624 else
3625 bitpos = bitsize_zero_node;
3627 rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp, bitsize, bitpos);
3628 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
3629 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
3630 gimple_assign_set_lhs (epilog_stmt, new_temp);
3631 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
3632 VEC_safe_push (tree, heap, scalar_results, new_temp);
3635 vect_finalize_reduction:
3637 if (double_reduc)
3638 loop = loop->inner;
3640 /* 2.5 Adjust the final result by the initial value of the reduction
3641 variable. (When such adjustment is not needed, then
3642 'adjustment_def' is zero). For example, if code is PLUS we create:
3643 new_temp = loop_exit_def + adjustment_def */
3645 if (adjustment_def)
3647 gcc_assert (!slp_node);
3648 if (nested_in_vect_loop)
3650 new_phi = VEC_index (gimple, new_phis, 0);
3651 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) == VECTOR_TYPE);
3652 expr = build2 (code, vectype, PHI_RESULT (new_phi), adjustment_def);
3653 new_dest = vect_create_destination_var (scalar_dest, vectype);
3655 else
3657 new_temp = VEC_index (tree, scalar_results, 0);
3658 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
3659 expr = build2 (code, scalar_type, new_temp, adjustment_def);
3660 new_dest = vect_create_destination_var (scalar_dest, scalar_type);
3663 epilog_stmt = gimple_build_assign (new_dest, expr);
3664 new_temp = make_ssa_name (new_dest, epilog_stmt);
3665 gimple_assign_set_lhs (epilog_stmt, new_temp);
3666 SSA_NAME_DEF_STMT (new_temp) = epilog_stmt;
3667 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
3668 if (nested_in_vect_loop)
3670 set_vinfo_for_stmt (epilog_stmt,
3671 new_stmt_vec_info (epilog_stmt, loop_vinfo,
3672 NULL));
3673 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (epilog_stmt)) =
3674 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (new_phi));
3676 if (!double_reduc)
3677 VEC_quick_push (tree, scalar_results, new_temp);
3678 else
3679 VEC_replace (tree, scalar_results, 0, new_temp);
3681 else
3682 VEC_replace (tree, scalar_results, 0, new_temp);
3684 VEC_replace (gimple, new_phis, 0, epilog_stmt);
3687 /* 2.6 Handle the loop-exit phis. Replace the uses of scalar loop-exit
3688 phis with new adjusted scalar results, i.e., replace use <s_out0>
3689 with use <s_out4>.
3691 Transform:
3692 loop_exit:
3693 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
3694 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
3695 v_out2 = reduce <v_out1>
3696 s_out3 = extract_field <v_out2, 0>
3697 s_out4 = adjust_result <s_out3>
3698 use <s_out0>
3699 use <s_out0>
3701 into:
3703 loop_exit:
3704 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
3705 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
3706 v_out2 = reduce <v_out1>
3707 s_out3 = extract_field <v_out2, 0>
3708 s_out4 = adjust_result <s_out3>
3709 use <s_out4>
3710 use <s_out4> */
3712 /* In SLP we may have several statements in NEW_PHIS and REDUCTION_PHIS (in
3713 case that GROUP_SIZE is greater than vectorization factor). Therefore, we
3714 need to match SCALAR_RESULTS with corresponding statements. The first
3715 (GROUP_SIZE / number of new vector stmts) scalar results correspond to
3716 the first vector stmt, etc.
3717 (RATIO is equal to (GROUP_SIZE / number of new vector stmts)). */
3718 if (group_size > VEC_length (gimple, new_phis))
3720 ratio = group_size / VEC_length (gimple, new_phis);
3721 gcc_assert (!(group_size % VEC_length (gimple, new_phis)));
3723 else
3724 ratio = 1;
3726 for (k = 0; k < group_size; k++)
3728 if (k % ratio == 0)
3730 epilog_stmt = VEC_index (gimple, new_phis, k / ratio);
3731 reduction_phi = VEC_index (gimple, reduction_phis, k / ratio);
3734 if (slp_node)
3736 gimple current_stmt = VEC_index (gimple,
3737 SLP_TREE_SCALAR_STMTS (slp_node), k);
3739 orig_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (current_stmt));
3740 /* SLP statements can't participate in patterns. */
3741 gcc_assert (!orig_stmt);
3742 scalar_dest = gimple_assign_lhs (current_stmt);
3745 phis = VEC_alloc (gimple, heap, 3);
3746 /* Find the loop-closed-use at the loop exit of the original scalar
3747 result. (The reduction result is expected to have two immediate uses -
3748 one at the latch block, and one at the loop exit). */
3749 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
3750 if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
3751 VEC_safe_push (gimple, heap, phis, USE_STMT (use_p));
3753 /* We expect to have found an exit_phi because of loop-closed-ssa
3754 form. */
3755 gcc_assert (!VEC_empty (gimple, phis));
3757 FOR_EACH_VEC_ELT (gimple, phis, i, exit_phi)
3759 if (outer_loop)
3761 stmt_vec_info exit_phi_vinfo = vinfo_for_stmt (exit_phi);
3762 gimple vect_phi;
3764 /* FORNOW. Currently not supporting the case that an inner-loop
3765 reduction is not used in the outer-loop (but only outside the
3766 outer-loop), unless it is double reduction. */
3767 gcc_assert ((STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
3768 && !STMT_VINFO_LIVE_P (exit_phi_vinfo))
3769 || double_reduc);
3771 STMT_VINFO_VEC_STMT (exit_phi_vinfo) = epilog_stmt;
3772 if (!double_reduc
3773 || STMT_VINFO_DEF_TYPE (exit_phi_vinfo)
3774 != vect_double_reduction_def)
3775 continue;
3777 /* Handle double reduction:
3779 stmt1: s1 = phi <s0, s2> - double reduction phi (outer loop)
3780 stmt2: s3 = phi <s1, s4> - (regular) reduc phi (inner loop)
3781 stmt3: s4 = use (s3) - (regular) reduc stmt (inner loop)
3782 stmt4: s2 = phi <s4> - double reduction stmt (outer loop)
3784 At that point the regular reduction (stmt2 and stmt3) is
3785 already vectorized, as well as the exit phi node, stmt4.
3786 Here we vectorize the phi node of double reduction, stmt1, and
3787 update all relevant statements. */
3789 /* Go through all the uses of s2 to find double reduction phi
3790 node, i.e., stmt1 above. */
3791 orig_name = PHI_RESULT (exit_phi);
3792 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
3794 stmt_vec_info use_stmt_vinfo = vinfo_for_stmt (use_stmt);
3795 stmt_vec_info new_phi_vinfo;
3796 tree vect_phi_init, preheader_arg, vect_phi_res, init_def;
3797 basic_block bb = gimple_bb (use_stmt);
3798 gimple use;
3800 /* Check that USE_STMT is really double reduction phi
3801 node. */
3802 if (gimple_code (use_stmt) != GIMPLE_PHI
3803 || gimple_phi_num_args (use_stmt) != 2
3804 || !use_stmt_vinfo
3805 || STMT_VINFO_DEF_TYPE (use_stmt_vinfo)
3806 != vect_double_reduction_def
3807 || bb->loop_father != outer_loop)
3808 continue;
3810 /* Create vector phi node for double reduction:
3811 vs1 = phi <vs0, vs2>
3812 vs1 was created previously in this function by a call to
3813 vect_get_vec_def_for_operand and is stored in
3814 vec_initial_def;
3815 vs2 is defined by EPILOG_STMT, the vectorized EXIT_PHI;
3816 vs0 is created here. */
3818 /* Create vector phi node. */
3819 vect_phi = create_phi_node (vec_initial_def, bb);
3820 new_phi_vinfo = new_stmt_vec_info (vect_phi,
3821 loop_vec_info_for_loop (outer_loop), NULL);
3822 set_vinfo_for_stmt (vect_phi, new_phi_vinfo);
3824 /* Create vs0 - initial def of the double reduction phi. */
3825 preheader_arg = PHI_ARG_DEF_FROM_EDGE (use_stmt,
3826 loop_preheader_edge (outer_loop));
3827 init_def = get_initial_def_for_reduction (stmt,
3828 preheader_arg, NULL);
3829 vect_phi_init = vect_init_vector (use_stmt, init_def,
3830 vectype, NULL);
3832 /* Update phi node arguments with vs0 and vs2. */
3833 add_phi_arg (vect_phi, vect_phi_init,
3834 loop_preheader_edge (outer_loop),
3835 UNKNOWN_LOCATION);
3836 add_phi_arg (vect_phi, PHI_RESULT (epilog_stmt),
3837 loop_latch_edge (outer_loop), UNKNOWN_LOCATION);
3838 if (vect_print_dump_info (REPORT_DETAILS))
3840 fprintf (vect_dump, "created double reduction phi "
3841 "node: ");
3842 print_gimple_stmt (vect_dump, vect_phi, 0, TDF_SLIM);
3845 vect_phi_res = PHI_RESULT (vect_phi);
3847 /* Replace the use, i.e., set the correct vs1 in the regular
3848 reduction phi node. FORNOW, NCOPIES is always 1, so the
3849 loop is redundant. */
3850 use = reduction_phi;
3851 for (j = 0; j < ncopies; j++)
3853 edge pr_edge = loop_preheader_edge (loop);
3854 SET_PHI_ARG_DEF (use, pr_edge->dest_idx, vect_phi_res);
3855 use = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (use));
3861 VEC_free (gimple, heap, phis);
3862 if (nested_in_vect_loop)
3864 if (double_reduc)
3865 loop = outer_loop;
3866 else
3867 continue;
3870 phis = VEC_alloc (gimple, heap, 3);
3871 /* Find the loop-closed-use at the loop exit of the original scalar
3872 result. (The reduction result is expected to have two immediate uses,
3873 one at the latch block, and one at the loop exit). For double
3874 reductions we are looking for exit phis of the outer loop. */
3875 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
3877 if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
3878 VEC_safe_push (gimple, heap, phis, USE_STMT (use_p));
3879 else
3881 if (double_reduc && gimple_code (USE_STMT (use_p)) == GIMPLE_PHI)
3883 tree phi_res = PHI_RESULT (USE_STMT (use_p));
3885 FOR_EACH_IMM_USE_FAST (phi_use_p, phi_imm_iter, phi_res)
3887 if (!flow_bb_inside_loop_p (loop,
3888 gimple_bb (USE_STMT (phi_use_p))))
3889 VEC_safe_push (gimple, heap, phis,
3890 USE_STMT (phi_use_p));
3896 FOR_EACH_VEC_ELT (gimple, phis, i, exit_phi)
3898 /* Replace the uses: */
3899 orig_name = PHI_RESULT (exit_phi);
3900 scalar_result = VEC_index (tree, scalar_results, k);
3901 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
3902 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
3903 SET_USE (use_p, scalar_result);
3906 VEC_free (gimple, heap, phis);
3909 VEC_free (tree, heap, scalar_results);
3910 VEC_free (gimple, heap, new_phis);
3914 /* Function vectorizable_reduction.
3916 Check if STMT performs a reduction operation that can be vectorized.
3917 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
3918 stmt to replace it, put it in VEC_STMT, and insert it at GSI.
3919 Return FALSE if not a vectorizable STMT, TRUE otherwise.
3921 This function also handles reduction idioms (patterns) that have been
3922 recognized in advance during vect_pattern_recog. In this case, STMT may be
3923 of this form:
3924 X = pattern_expr (arg0, arg1, ..., X)
3925 and it's STMT_VINFO_RELATED_STMT points to the last stmt in the original
3926 sequence that had been detected and replaced by the pattern-stmt (STMT).
3928 In some cases of reduction patterns, the type of the reduction variable X is
3929 different than the type of the other arguments of STMT.
3930 In such cases, the vectype that is used when transforming STMT into a vector
3931 stmt is different than the vectype that is used to determine the
3932 vectorization factor, because it consists of a different number of elements
3933 than the actual number of elements that are being operated upon in parallel.
3935 For example, consider an accumulation of shorts into an int accumulator.
3936 On some targets it's possible to vectorize this pattern operating on 8
3937 shorts at a time (hence, the vectype for purposes of determining the
3938 vectorization factor should be V8HI); on the other hand, the vectype that
3939 is used to create the vector form is actually V4SI (the type of the result).
3941 Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
3942 indicates what is the actual level of parallelism (V8HI in the example), so
3943 that the right vectorization factor would be derived. This vectype
3944 corresponds to the type of arguments to the reduction stmt, and should *NOT*
3945 be used to create the vectorized stmt. The right vectype for the vectorized
3946 stmt is obtained from the type of the result X:
3947 get_vectype_for_scalar_type (TREE_TYPE (X))
3949 This means that, contrary to "regular" reductions (or "regular" stmts in
3950 general), the following equation:
3951 STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (TREE_TYPE (X))
3952 does *NOT* necessarily hold for reduction patterns. */
3954 bool
3955 vectorizable_reduction (gimple stmt, gimple_stmt_iterator *gsi,
3956 gimple *vec_stmt, slp_tree slp_node)
3958 tree vec_dest;
3959 tree scalar_dest;
3960 tree loop_vec_def0 = NULL_TREE, loop_vec_def1 = NULL_TREE;
3961 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3962 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
3963 tree vectype_in = NULL_TREE;
3964 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3965 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3966 enum tree_code code, orig_code, epilog_reduc_code;
3967 enum machine_mode vec_mode;
3968 int op_type;
3969 optab optab, reduc_optab;
3970 tree new_temp = NULL_TREE;
3971 tree def;
3972 gimple def_stmt;
3973 enum vect_def_type dt;
3974 gimple new_phi = NULL;
3975 tree scalar_type;
3976 bool is_simple_use;
3977 gimple orig_stmt;
3978 stmt_vec_info orig_stmt_info;
3979 tree expr = NULL_TREE;
3980 int i;
3981 int ncopies;
3982 int epilog_copies;
3983 stmt_vec_info prev_stmt_info, prev_phi_info;
3984 bool single_defuse_cycle = false;
3985 tree reduc_def = NULL_TREE;
3986 gimple new_stmt = NULL;
3987 int j;
3988 tree ops[3];
3989 bool nested_cycle = false, found_nested_cycle_def = false;
3990 gimple reduc_def_stmt = NULL;
3991 /* The default is that the reduction variable is the last in statement. */
3992 int reduc_index = 2;
3993 bool double_reduc = false, dummy;
3994 basic_block def_bb;
3995 struct loop * def_stmt_loop, *outer_loop = NULL;
3996 tree def_arg;
3997 gimple def_arg_stmt;
3998 VEC (tree, heap) *vec_oprnds0 = NULL, *vec_oprnds1 = NULL, *vect_defs = NULL;
3999 VEC (gimple, heap) *phis = NULL;
4000 int vec_num;
4001 tree def0, def1, tem;
4003 if (nested_in_vect_loop_p (loop, stmt))
4005 outer_loop = loop;
4006 loop = loop->inner;
4007 nested_cycle = true;
4010 /* 1. Is vectorizable reduction? */
4011 /* Not supportable if the reduction variable is used in the loop. */
4012 if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer)
4013 return false;
4015 /* Reductions that are not used even in an enclosing outer-loop,
4016 are expected to be "live" (used out of the loop). */
4017 if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope
4018 && !STMT_VINFO_LIVE_P (stmt_info))
4019 return false;
4021 /* Make sure it was already recognized as a reduction computation. */
4022 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def
4023 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_nested_cycle)
4024 return false;
4026 /* 2. Has this been recognized as a reduction pattern?
4028 Check if STMT represents a pattern that has been recognized
4029 in earlier analysis stages. For stmts that represent a pattern,
4030 the STMT_VINFO_RELATED_STMT field records the last stmt in
4031 the original sequence that constitutes the pattern. */
4033 orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
4034 if (orig_stmt)
4036 orig_stmt_info = vinfo_for_stmt (orig_stmt);
4037 gcc_assert (STMT_VINFO_RELATED_STMT (orig_stmt_info) == stmt);
4038 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
4039 gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
4042 /* 3. Check the operands of the operation. The first operands are defined
4043 inside the loop body. The last operand is the reduction variable,
4044 which is defined by the loop-header-phi. */
4046 gcc_assert (is_gimple_assign (stmt));
4048 /* Flatten RHS. */
4049 switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
4051 case GIMPLE_SINGLE_RHS:
4052 op_type = TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt));
4053 if (op_type == ternary_op)
4055 tree rhs = gimple_assign_rhs1 (stmt);
4056 ops[0] = TREE_OPERAND (rhs, 0);
4057 ops[1] = TREE_OPERAND (rhs, 1);
4058 ops[2] = TREE_OPERAND (rhs, 2);
4059 code = TREE_CODE (rhs);
4061 else
4062 return false;
4063 break;
4065 case GIMPLE_BINARY_RHS:
4066 code = gimple_assign_rhs_code (stmt);
4067 op_type = TREE_CODE_LENGTH (code);
4068 gcc_assert (op_type == binary_op);
4069 ops[0] = gimple_assign_rhs1 (stmt);
4070 ops[1] = gimple_assign_rhs2 (stmt);
4071 break;
4073 case GIMPLE_TERNARY_RHS:
4074 code = gimple_assign_rhs_code (stmt);
4075 op_type = TREE_CODE_LENGTH (code);
4076 gcc_assert (op_type == ternary_op);
4077 ops[0] = gimple_assign_rhs1 (stmt);
4078 ops[1] = gimple_assign_rhs2 (stmt);
4079 ops[2] = gimple_assign_rhs3 (stmt);
4080 break;
4082 case GIMPLE_UNARY_RHS:
4083 return false;
4085 default:
4086 gcc_unreachable ();
4089 scalar_dest = gimple_assign_lhs (stmt);
4090 scalar_type = TREE_TYPE (scalar_dest);
4091 if (!POINTER_TYPE_P (scalar_type) && !INTEGRAL_TYPE_P (scalar_type)
4092 && !SCALAR_FLOAT_TYPE_P (scalar_type))
4093 return false;
4095 /* All uses but the last are expected to be defined in the loop.
4096 The last use is the reduction variable. In case of nested cycle this
4097 assumption is not true: we use reduc_index to record the index of the
4098 reduction variable. */
4099 for (i = 0; i < op_type-1; i++)
4101 /* The condition of COND_EXPR is checked in vectorizable_condition(). */
4102 if (i == 0 && code == COND_EXPR)
4103 continue;
4105 is_simple_use = vect_is_simple_use_1 (ops[i], loop_vinfo, NULL,
4106 &def_stmt, &def, &dt, &tem);
4107 if (!vectype_in)
4108 vectype_in = tem;
4109 gcc_assert (is_simple_use);
4110 if (dt != vect_internal_def
4111 && dt != vect_external_def
4112 && dt != vect_constant_def
4113 && dt != vect_induction_def
4114 && !(dt == vect_nested_cycle && nested_cycle))
4115 return false;
4117 if (dt == vect_nested_cycle)
4119 found_nested_cycle_def = true;
4120 reduc_def_stmt = def_stmt;
4121 reduc_index = i;
4125 is_simple_use = vect_is_simple_use_1 (ops[i], loop_vinfo, NULL, &def_stmt,
4126 &def, &dt, &tem);
4127 if (!vectype_in)
4128 vectype_in = tem;
4129 gcc_assert (is_simple_use);
4130 gcc_assert (dt == vect_reduction_def
4131 || dt == vect_nested_cycle
4132 || ((dt == vect_internal_def || dt == vect_external_def
4133 || dt == vect_constant_def || dt == vect_induction_def)
4134 && nested_cycle && found_nested_cycle_def));
4135 if (!found_nested_cycle_def)
4136 reduc_def_stmt = def_stmt;
4138 gcc_assert (gimple_code (reduc_def_stmt) == GIMPLE_PHI);
4139 if (orig_stmt)
4140 gcc_assert (orig_stmt == vect_is_simple_reduction (loop_vinfo,
4141 reduc_def_stmt,
4142 !nested_cycle,
4143 &dummy));
4144 else
4145 gcc_assert (stmt == vect_is_simple_reduction (loop_vinfo, reduc_def_stmt,
4146 !nested_cycle, &dummy));
4148 if (STMT_VINFO_LIVE_P (vinfo_for_stmt (reduc_def_stmt)))
4149 return false;
4151 if (slp_node || PURE_SLP_STMT (stmt_info))
4152 ncopies = 1;
4153 else
4154 ncopies = (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
4155 / TYPE_VECTOR_SUBPARTS (vectype_in));
4157 gcc_assert (ncopies >= 1);
4159 vec_mode = TYPE_MODE (vectype_in);
4161 if (code == COND_EXPR)
4163 if (!vectorizable_condition (stmt, gsi, NULL, ops[reduc_index], 0))
4165 if (vect_print_dump_info (REPORT_DETAILS))
4166 fprintf (vect_dump, "unsupported condition in reduction");
4168 return false;
4171 else
4173 /* 4. Supportable by target? */
4175 /* 4.1. check support for the operation in the loop */
4176 optab = optab_for_tree_code (code, vectype_in, optab_default);
4177 if (!optab)
4179 if (vect_print_dump_info (REPORT_DETAILS))
4180 fprintf (vect_dump, "no optab.");
4182 return false;
4185 if (optab_handler (optab, vec_mode) == CODE_FOR_nothing)
4187 if (vect_print_dump_info (REPORT_DETAILS))
4188 fprintf (vect_dump, "op not supported by target.");
4190 if (GET_MODE_SIZE (vec_mode) != UNITS_PER_WORD
4191 || LOOP_VINFO_VECT_FACTOR (loop_vinfo)
4192 < vect_min_worthwhile_factor (code))
4193 return false;
4195 if (vect_print_dump_info (REPORT_DETAILS))
4196 fprintf (vect_dump, "proceeding using word mode.");
4199 /* Worthwhile without SIMD support? */
4200 if (!VECTOR_MODE_P (TYPE_MODE (vectype_in))
4201 && LOOP_VINFO_VECT_FACTOR (loop_vinfo)
4202 < vect_min_worthwhile_factor (code))
4204 if (vect_print_dump_info (REPORT_DETAILS))
4205 fprintf (vect_dump, "not worthwhile without SIMD support.");
4207 return false;
4211 /* 4.2. Check support for the epilog operation.
4213 If STMT represents a reduction pattern, then the type of the
4214 reduction variable may be different than the type of the rest
4215 of the arguments. For example, consider the case of accumulation
4216 of shorts into an int accumulator; The original code:
4217 S1: int_a = (int) short_a;
4218 orig_stmt-> S2: int_acc = plus <int_a ,int_acc>;
4220 was replaced with:
4221 STMT: int_acc = widen_sum <short_a, int_acc>
4223 This means that:
4224 1. The tree-code that is used to create the vector operation in the
4225 epilog code (that reduces the partial results) is not the
4226 tree-code of STMT, but is rather the tree-code of the original
4227 stmt from the pattern that STMT is replacing. I.e, in the example
4228 above we want to use 'widen_sum' in the loop, but 'plus' in the
4229 epilog.
4230 2. The type (mode) we use to check available target support
4231 for the vector operation to be created in the *epilog*, is
4232 determined by the type of the reduction variable (in the example
4233 above we'd check this: optab_handler (plus_optab, vect_int_mode])).
4234 However the type (mode) we use to check available target support
4235 for the vector operation to be created *inside the loop*, is
4236 determined by the type of the other arguments to STMT (in the
4237 example we'd check this: optab_handler (widen_sum_optab,
4238 vect_short_mode)).
4240 This is contrary to "regular" reductions, in which the types of all
4241 the arguments are the same as the type of the reduction variable.
4242 For "regular" reductions we can therefore use the same vector type
4243 (and also the same tree-code) when generating the epilog code and
4244 when generating the code inside the loop. */
4246 if (orig_stmt)
4248 /* This is a reduction pattern: get the vectype from the type of the
4249 reduction variable, and get the tree-code from orig_stmt. */
4250 orig_code = gimple_assign_rhs_code (orig_stmt);
4251 gcc_assert (vectype_out);
4252 vec_mode = TYPE_MODE (vectype_out);
4254 else
4256 /* Regular reduction: use the same vectype and tree-code as used for
4257 the vector code inside the loop can be used for the epilog code. */
4258 orig_code = code;
4261 if (nested_cycle)
4263 def_bb = gimple_bb (reduc_def_stmt);
4264 def_stmt_loop = def_bb->loop_father;
4265 def_arg = PHI_ARG_DEF_FROM_EDGE (reduc_def_stmt,
4266 loop_preheader_edge (def_stmt_loop));
4267 if (TREE_CODE (def_arg) == SSA_NAME
4268 && (def_arg_stmt = SSA_NAME_DEF_STMT (def_arg))
4269 && gimple_code (def_arg_stmt) == GIMPLE_PHI
4270 && flow_bb_inside_loop_p (outer_loop, gimple_bb (def_arg_stmt))
4271 && vinfo_for_stmt (def_arg_stmt)
4272 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_arg_stmt))
4273 == vect_double_reduction_def)
4274 double_reduc = true;
4277 epilog_reduc_code = ERROR_MARK;
4278 if (reduction_code_for_scalar_code (orig_code, &epilog_reduc_code))
4280 reduc_optab = optab_for_tree_code (epilog_reduc_code, vectype_out,
4281 optab_default);
4282 if (!reduc_optab)
4284 if (vect_print_dump_info (REPORT_DETAILS))
4285 fprintf (vect_dump, "no optab for reduction.");
4287 epilog_reduc_code = ERROR_MARK;
4290 if (reduc_optab
4291 && optab_handler (reduc_optab, vec_mode) == CODE_FOR_nothing)
4293 if (vect_print_dump_info (REPORT_DETAILS))
4294 fprintf (vect_dump, "reduc op not supported by target.");
4296 epilog_reduc_code = ERROR_MARK;
4299 else
4301 if (!nested_cycle || double_reduc)
4303 if (vect_print_dump_info (REPORT_DETAILS))
4304 fprintf (vect_dump, "no reduc code for scalar code.");
4306 return false;
4310 if (double_reduc && ncopies > 1)
4312 if (vect_print_dump_info (REPORT_DETAILS))
4313 fprintf (vect_dump, "multiple types in double reduction");
4315 return false;
4318 if (!vec_stmt) /* transformation not required. */
4320 if (!vect_model_reduction_cost (stmt_info, epilog_reduc_code, ncopies))
4321 return false;
4322 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
4323 return true;
4326 /** Transform. **/
4328 if (vect_print_dump_info (REPORT_DETAILS))
4329 fprintf (vect_dump, "transform reduction.");
4331 /* FORNOW: Multiple types are not supported for condition. */
4332 if (code == COND_EXPR)
4333 gcc_assert (ncopies == 1);
4335 /* Create the destination vector */
4336 vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
4338 /* In case the vectorization factor (VF) is bigger than the number
4339 of elements that we can fit in a vectype (nunits), we have to generate
4340 more than one vector stmt - i.e - we need to "unroll" the
4341 vector stmt by a factor VF/nunits. For more details see documentation
4342 in vectorizable_operation. */
4344 /* If the reduction is used in an outer loop we need to generate
4345 VF intermediate results, like so (e.g. for ncopies=2):
4346 r0 = phi (init, r0)
4347 r1 = phi (init, r1)
4348 r0 = x0 + r0;
4349 r1 = x1 + r1;
4350 (i.e. we generate VF results in 2 registers).
4351 In this case we have a separate def-use cycle for each copy, and therefore
4352 for each copy we get the vector def for the reduction variable from the
4353 respective phi node created for this copy.
4355 Otherwise (the reduction is unused in the loop nest), we can combine
4356 together intermediate results, like so (e.g. for ncopies=2):
4357 r = phi (init, r)
4358 r = x0 + r;
4359 r = x1 + r;
4360 (i.e. we generate VF/2 results in a single register).
4361 In this case for each copy we get the vector def for the reduction variable
4362 from the vectorized reduction operation generated in the previous iteration.
4365 if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope)
4367 single_defuse_cycle = true;
4368 epilog_copies = 1;
4370 else
4371 epilog_copies = ncopies;
4373 prev_stmt_info = NULL;
4374 prev_phi_info = NULL;
4375 if (slp_node)
4377 vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
4378 gcc_assert (TYPE_VECTOR_SUBPARTS (vectype_out)
4379 == TYPE_VECTOR_SUBPARTS (vectype_in));
4381 else
4383 vec_num = 1;
4384 vec_oprnds0 = VEC_alloc (tree, heap, 1);
4385 if (op_type == ternary_op)
4386 vec_oprnds1 = VEC_alloc (tree, heap, 1);
4389 phis = VEC_alloc (gimple, heap, vec_num);
4390 vect_defs = VEC_alloc (tree, heap, vec_num);
4391 if (!slp_node)
4392 VEC_quick_push (tree, vect_defs, NULL_TREE);
4394 for (j = 0; j < ncopies; j++)
4396 if (j == 0 || !single_defuse_cycle)
4398 for (i = 0; i < vec_num; i++)
4400 /* Create the reduction-phi that defines the reduction
4401 operand. */
4402 new_phi = create_phi_node (vec_dest, loop->header);
4403 set_vinfo_for_stmt (new_phi,
4404 new_stmt_vec_info (new_phi, loop_vinfo,
4405 NULL));
4406 if (j == 0 || slp_node)
4407 VEC_quick_push (gimple, phis, new_phi);
4411 if (code == COND_EXPR)
4413 gcc_assert (!slp_node);
4414 vectorizable_condition (stmt, gsi, vec_stmt,
4415 PHI_RESULT (VEC_index (gimple, phis, 0)),
4416 reduc_index);
4417 /* Multiple types are not supported for condition. */
4418 break;
4421 /* Handle uses. */
4422 if (j == 0)
4424 tree op0, op1 = NULL_TREE;
4426 op0 = ops[!reduc_index];
4427 if (op_type == ternary_op)
4429 if (reduc_index == 0)
4430 op1 = ops[2];
4431 else
4432 op1 = ops[1];
4435 if (slp_node)
4436 vect_get_slp_defs (op0, op1, slp_node, &vec_oprnds0, &vec_oprnds1,
4437 -1);
4438 else
4440 loop_vec_def0 = vect_get_vec_def_for_operand (ops[!reduc_index],
4441 stmt, NULL);
4442 VEC_quick_push (tree, vec_oprnds0, loop_vec_def0);
4443 if (op_type == ternary_op)
4445 loop_vec_def1 = vect_get_vec_def_for_operand (op1, stmt,
4446 NULL);
4447 VEC_quick_push (tree, vec_oprnds1, loop_vec_def1);
4451 else
4453 if (!slp_node)
4455 enum vect_def_type dt = vect_unknown_def_type; /* Dummy */
4456 loop_vec_def0 = vect_get_vec_def_for_stmt_copy (dt, loop_vec_def0);
4457 VEC_replace (tree, vec_oprnds0, 0, loop_vec_def0);
4458 if (op_type == ternary_op)
4460 loop_vec_def1 = vect_get_vec_def_for_stmt_copy (dt,
4461 loop_vec_def1);
4462 VEC_replace (tree, vec_oprnds1, 0, loop_vec_def1);
4466 if (single_defuse_cycle)
4467 reduc_def = gimple_assign_lhs (new_stmt);
4469 STMT_VINFO_RELATED_STMT (prev_phi_info) = new_phi;
4472 FOR_EACH_VEC_ELT (tree, vec_oprnds0, i, def0)
4474 if (slp_node)
4475 reduc_def = PHI_RESULT (VEC_index (gimple, phis, i));
4476 else
4478 if (!single_defuse_cycle || j == 0)
4479 reduc_def = PHI_RESULT (new_phi);
4482 def1 = ((op_type == ternary_op)
4483 ? VEC_index (tree, vec_oprnds1, i) : NULL);
4484 if (op_type == binary_op)
4486 if (reduc_index == 0)
4487 expr = build2 (code, vectype_out, reduc_def, def0);
4488 else
4489 expr = build2 (code, vectype_out, def0, reduc_def);
4491 else
4493 if (reduc_index == 0)
4494 expr = build3 (code, vectype_out, reduc_def, def0, def1);
4495 else
4497 if (reduc_index == 1)
4498 expr = build3 (code, vectype_out, def0, reduc_def, def1);
4499 else
4500 expr = build3 (code, vectype_out, def0, def1, reduc_def);
4504 new_stmt = gimple_build_assign (vec_dest, expr);
4505 new_temp = make_ssa_name (vec_dest, new_stmt);
4506 gimple_assign_set_lhs (new_stmt, new_temp);
4507 vect_finish_stmt_generation (stmt, new_stmt, gsi);
4508 if (slp_node)
4510 VEC_quick_push (gimple, SLP_TREE_VEC_STMTS (slp_node), new_stmt);
4511 VEC_quick_push (tree, vect_defs, new_temp);
4513 else
4514 VEC_replace (tree, vect_defs, 0, new_temp);
4517 if (slp_node)
4518 continue;
4520 if (j == 0)
4521 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
4522 else
4523 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
4525 prev_stmt_info = vinfo_for_stmt (new_stmt);
4526 prev_phi_info = vinfo_for_stmt (new_phi);
4529 /* Finalize the reduction-phi (set its arguments) and create the
4530 epilog reduction code. */
4531 if ((!single_defuse_cycle || code == COND_EXPR) && !slp_node)
4533 new_temp = gimple_assign_lhs (*vec_stmt);
4534 VEC_replace (tree, vect_defs, 0, new_temp);
4537 vect_create_epilog_for_reduction (vect_defs, stmt, epilog_copies,
4538 epilog_reduc_code, phis, reduc_index,
4539 double_reduc, slp_node);
4541 VEC_free (gimple, heap, phis);
4542 VEC_free (tree, heap, vec_oprnds0);
4543 if (vec_oprnds1)
4544 VEC_free (tree, heap, vec_oprnds1);
4546 return true;
4549 /* Function vect_min_worthwhile_factor.
4551 For a loop where we could vectorize the operation indicated by CODE,
4552 return the minimum vectorization factor that makes it worthwhile
4553 to use generic vectors. */
4555 vect_min_worthwhile_factor (enum tree_code code)
4557 switch (code)
4559 case PLUS_EXPR:
4560 case MINUS_EXPR:
4561 case NEGATE_EXPR:
4562 return 4;
4564 case BIT_AND_EXPR:
4565 case BIT_IOR_EXPR:
4566 case BIT_XOR_EXPR:
4567 case BIT_NOT_EXPR:
4568 return 2;
4570 default:
4571 return INT_MAX;
4576 /* Function vectorizable_induction
4578 Check if PHI performs an induction computation that can be vectorized.
4579 If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
4580 phi to replace it, put it in VEC_STMT, and add it to the same basic block.
4581 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
4583 bool
4584 vectorizable_induction (gimple phi, gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
4585 gimple *vec_stmt)
4587 stmt_vec_info stmt_info = vinfo_for_stmt (phi);
4588 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4589 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4590 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4591 int nunits = TYPE_VECTOR_SUBPARTS (vectype);
4592 int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
4593 tree vec_def;
4595 gcc_assert (ncopies >= 1);
4596 /* FORNOW. This restriction should be relaxed. */
4597 if (nested_in_vect_loop_p (loop, phi) && ncopies > 1)
4599 if (vect_print_dump_info (REPORT_DETAILS))
4600 fprintf (vect_dump, "multiple types in nested loop.");
4601 return false;
4604 if (!STMT_VINFO_RELEVANT_P (stmt_info))
4605 return false;
4607 /* FORNOW: SLP not supported. */
4608 if (STMT_SLP_TYPE (stmt_info))
4609 return false;
4611 gcc_assert (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def);
4613 if (gimple_code (phi) != GIMPLE_PHI)
4614 return false;
4616 if (!vec_stmt) /* transformation not required. */
4618 STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
4619 if (vect_print_dump_info (REPORT_DETAILS))
4620 fprintf (vect_dump, "=== vectorizable_induction ===");
4621 vect_model_induction_cost (stmt_info, ncopies);
4622 return true;
4625 /** Transform. **/
4627 if (vect_print_dump_info (REPORT_DETAILS))
4628 fprintf (vect_dump, "transform induction phi.");
4630 vec_def = get_initial_def_for_induction (phi);
4631 *vec_stmt = SSA_NAME_DEF_STMT (vec_def);
4632 return true;
4635 /* Function vectorizable_live_operation.
4637 STMT computes a value that is used outside the loop. Check if
4638 it can be supported. */
4640 bool
4641 vectorizable_live_operation (gimple stmt,
4642 gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
4643 gimple *vec_stmt ATTRIBUTE_UNUSED)
4645 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4646 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4647 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4648 int i;
4649 int op_type;
4650 tree op;
4651 tree def;
4652 gimple def_stmt;
4653 enum vect_def_type dt;
4654 enum tree_code code;
4655 enum gimple_rhs_class rhs_class;
4657 gcc_assert (STMT_VINFO_LIVE_P (stmt_info));
4659 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def)
4660 return false;
4662 if (!is_gimple_assign (stmt))
4663 return false;
4665 if (TREE_CODE (gimple_assign_lhs (stmt)) != SSA_NAME)
4666 return false;
4668 /* FORNOW. CHECKME. */
4669 if (nested_in_vect_loop_p (loop, stmt))
4670 return false;
4672 code = gimple_assign_rhs_code (stmt);
4673 op_type = TREE_CODE_LENGTH (code);
4674 rhs_class = get_gimple_rhs_class (code);
4675 gcc_assert (rhs_class != GIMPLE_UNARY_RHS || op_type == unary_op);
4676 gcc_assert (rhs_class != GIMPLE_BINARY_RHS || op_type == binary_op);
4678 /* FORNOW: support only if all uses are invariant. This means
4679 that the scalar operations can remain in place, unvectorized.
4680 The original last scalar value that they compute will be used. */
4682 for (i = 0; i < op_type; i++)
4684 if (rhs_class == GIMPLE_SINGLE_RHS)
4685 op = TREE_OPERAND (gimple_op (stmt, 1), i);
4686 else
4687 op = gimple_op (stmt, i + 1);
4688 if (op
4689 && !vect_is_simple_use (op, loop_vinfo, NULL, &def_stmt, &def, &dt))
4691 if (vect_print_dump_info (REPORT_DETAILS))
4692 fprintf (vect_dump, "use not simple.");
4693 return false;
4696 if (dt != vect_external_def && dt != vect_constant_def)
4697 return false;
4700 /* No transformation is required for the cases we currently support. */
4701 return true;
4704 /* Kill any debug uses outside LOOP of SSA names defined in STMT. */
4706 static void
4707 vect_loop_kill_debug_uses (struct loop *loop, gimple stmt)
4709 ssa_op_iter op_iter;
4710 imm_use_iterator imm_iter;
4711 def_operand_p def_p;
4712 gimple ustmt;
4714 FOR_EACH_PHI_OR_STMT_DEF (def_p, stmt, op_iter, SSA_OP_DEF)
4716 FOR_EACH_IMM_USE_STMT (ustmt, imm_iter, DEF_FROM_PTR (def_p))
4718 basic_block bb;
4720 if (!is_gimple_debug (ustmt))
4721 continue;
4723 bb = gimple_bb (ustmt);
4725 if (!flow_bb_inside_loop_p (loop, bb))
4727 if (gimple_debug_bind_p (ustmt))
4729 if (vect_print_dump_info (REPORT_DETAILS))
4730 fprintf (vect_dump, "killing debug use");
4732 gimple_debug_bind_reset_value (ustmt);
4733 update_stmt (ustmt);
4735 else
4736 gcc_unreachable ();
4742 /* Function vect_transform_loop.
4744 The analysis phase has determined that the loop is vectorizable.
4745 Vectorize the loop - created vectorized stmts to replace the scalar
4746 stmts in the loop, and update the loop exit condition. */
4748 void
4749 vect_transform_loop (loop_vec_info loop_vinfo)
4751 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4752 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
4753 int nbbs = loop->num_nodes;
4754 gimple_stmt_iterator si;
4755 int i;
4756 tree ratio = NULL;
4757 int vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
4758 bool strided_store;
4759 bool slp_scheduled = false;
4760 unsigned int nunits;
4761 tree cond_expr = NULL_TREE;
4762 gimple_seq cond_expr_stmt_list = NULL;
4763 bool do_peeling_for_loop_bound;
4765 if (vect_print_dump_info (REPORT_DETAILS))
4766 fprintf (vect_dump, "=== vec_transform_loop ===");
4768 /* Peel the loop if there are data refs with unknown alignment.
4769 Only one data ref with unknown store is allowed. */
4771 if (LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo))
4772 vect_do_peeling_for_alignment (loop_vinfo);
4774 do_peeling_for_loop_bound
4775 = (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
4776 || (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
4777 && LOOP_VINFO_INT_NITERS (loop_vinfo) % vectorization_factor != 0));
4779 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
4780 || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
4781 vect_loop_versioning (loop_vinfo,
4782 !do_peeling_for_loop_bound,
4783 &cond_expr, &cond_expr_stmt_list);
4785 /* If the loop has a symbolic number of iterations 'n' (i.e. it's not a
4786 compile time constant), or it is a constant that doesn't divide by the
4787 vectorization factor, then an epilog loop needs to be created.
4788 We therefore duplicate the loop: the original loop will be vectorized,
4789 and will compute the first (n/VF) iterations. The second copy of the loop
4790 will remain scalar and will compute the remaining (n%VF) iterations.
4791 (VF is the vectorization factor). */
4793 if (do_peeling_for_loop_bound)
4794 vect_do_peeling_for_loop_bound (loop_vinfo, &ratio,
4795 cond_expr, cond_expr_stmt_list);
4796 else
4797 ratio = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
4798 LOOP_VINFO_INT_NITERS (loop_vinfo) / vectorization_factor);
4800 /* 1) Make sure the loop header has exactly two entries
4801 2) Make sure we have a preheader basic block. */
4803 gcc_assert (EDGE_COUNT (loop->header->preds) == 2);
4805 split_edge (loop_preheader_edge (loop));
4807 /* FORNOW: the vectorizer supports only loops which body consist
4808 of one basic block (header + empty latch). When the vectorizer will
4809 support more involved loop forms, the order by which the BBs are
4810 traversed need to be reconsidered. */
4812 for (i = 0; i < nbbs; i++)
4814 basic_block bb = bbs[i];
4815 stmt_vec_info stmt_info;
4816 gimple phi;
4818 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
4820 phi = gsi_stmt (si);
4821 if (vect_print_dump_info (REPORT_DETAILS))
4823 fprintf (vect_dump, "------>vectorizing phi: ");
4824 print_gimple_stmt (vect_dump, phi, 0, TDF_SLIM);
4826 stmt_info = vinfo_for_stmt (phi);
4827 if (!stmt_info)
4828 continue;
4830 if (MAY_HAVE_DEBUG_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
4831 vect_loop_kill_debug_uses (loop, phi);
4833 if (!STMT_VINFO_RELEVANT_P (stmt_info)
4834 && !STMT_VINFO_LIVE_P (stmt_info))
4835 continue;
4837 if ((TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info))
4838 != (unsigned HOST_WIDE_INT) vectorization_factor)
4839 && vect_print_dump_info (REPORT_DETAILS))
4840 fprintf (vect_dump, "multiple-types.");
4842 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def)
4844 if (vect_print_dump_info (REPORT_DETAILS))
4845 fprintf (vect_dump, "transform phi.");
4846 vect_transform_stmt (phi, NULL, NULL, NULL, NULL);
4850 for (si = gsi_start_bb (bb); !gsi_end_p (si);)
4852 gimple stmt = gsi_stmt (si);
4853 bool is_store;
4855 if (vect_print_dump_info (REPORT_DETAILS))
4857 fprintf (vect_dump, "------>vectorizing statement: ");
4858 print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
4861 stmt_info = vinfo_for_stmt (stmt);
4863 /* vector stmts created in the outer-loop during vectorization of
4864 stmts in an inner-loop may not have a stmt_info, and do not
4865 need to be vectorized. */
4866 if (!stmt_info)
4868 gsi_next (&si);
4869 continue;
4872 if (MAY_HAVE_DEBUG_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
4873 vect_loop_kill_debug_uses (loop, stmt);
4875 if (!STMT_VINFO_RELEVANT_P (stmt_info)
4876 && !STMT_VINFO_LIVE_P (stmt_info))
4878 gsi_next (&si);
4879 continue;
4882 gcc_assert (STMT_VINFO_VECTYPE (stmt_info));
4883 nunits =
4884 (unsigned int) TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
4885 if (!STMT_SLP_TYPE (stmt_info)
4886 && nunits != (unsigned int) vectorization_factor
4887 && vect_print_dump_info (REPORT_DETAILS))
4888 /* For SLP VF is set according to unrolling factor, and not to
4889 vector size, hence for SLP this print is not valid. */
4890 fprintf (vect_dump, "multiple-types.");
4892 /* SLP. Schedule all the SLP instances when the first SLP stmt is
4893 reached. */
4894 if (STMT_SLP_TYPE (stmt_info))
4896 if (!slp_scheduled)
4898 slp_scheduled = true;
4900 if (vect_print_dump_info (REPORT_DETAILS))
4901 fprintf (vect_dump, "=== scheduling SLP instances ===");
4903 vect_schedule_slp (loop_vinfo, NULL);
4906 /* Hybrid SLP stmts must be vectorized in addition to SLP. */
4907 if (!vinfo_for_stmt (stmt) || PURE_SLP_STMT (stmt_info))
4909 gsi_next (&si);
4910 continue;
4914 /* -------- vectorize statement ------------ */
4915 if (vect_print_dump_info (REPORT_DETAILS))
4916 fprintf (vect_dump, "transform statement.");
4918 strided_store = false;
4919 is_store = vect_transform_stmt (stmt, &si, &strided_store, NULL, NULL);
4920 if (is_store)
4922 if (STMT_VINFO_STRIDED_ACCESS (stmt_info))
4924 /* Interleaving. If IS_STORE is TRUE, the vectorization of the
4925 interleaving chain was completed - free all the stores in
4926 the chain. */
4927 vect_remove_stores (DR_GROUP_FIRST_DR (stmt_info));
4928 gsi_remove (&si, true);
4929 continue;
4931 else
4933 /* Free the attached stmt_vec_info and remove the stmt. */
4934 free_stmt_vec_info (stmt);
4935 gsi_remove (&si, true);
4936 continue;
4939 gsi_next (&si);
4940 } /* stmts in BB */
4941 } /* BBs in loop */
4943 slpeel_make_loop_iterate_ntimes (loop, ratio);
4945 /* The memory tags and pointers in vectorized statements need to
4946 have their SSA forms updated. FIXME, why can't this be delayed
4947 until all the loops have been transformed? */
4948 update_ssa (TODO_update_ssa);
4950 if (vect_print_dump_info (REPORT_VECTORIZED_LOCATIONS))
4951 fprintf (vect_dump, "LOOP VECTORIZED.");
4952 if (loop->inner && vect_print_dump_info (REPORT_VECTORIZED_LOCATIONS))
4953 fprintf (vect_dump, "OUTER LOOP VECTORIZED.");