2011-04-21 Tobias Burnus <burnus@net-b.de>
[official-gcc.git] / gcc / tree-vect-loop.c
blob5fecf2a052497a17121eb719cd81979e26632847
1 /* Loop Vectorization
2 Copyright (C) 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010
3 Free Software Foundation, Inc.
4 Contributed by Dorit Naishlos <dorit@il.ibm.com> and
5 Ira Rosen <irar@il.ibm.com>
7 This file is part of GCC.
9 GCC is free software; you can redistribute it and/or modify it under
10 the terms of the GNU General Public License as published by the Free
11 Software Foundation; either version 3, or (at your option) any later
12 version.
14 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
15 WARRANTY; without even the implied warranty of MERCHANTABILITY or
16 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
17 for more details.
19 You should have received a copy of the GNU General Public License
20 along with GCC; see the file COPYING3. If not see
21 <http://www.gnu.org/licenses/>. */
23 #include "config.h"
24 #include "system.h"
25 #include "coretypes.h"
26 #include "tm.h"
27 #include "ggc.h"
28 #include "tree.h"
29 #include "basic-block.h"
30 #include "tree-pretty-print.h"
31 #include "gimple-pretty-print.h"
32 #include "tree-flow.h"
33 #include "tree-dump.h"
34 #include "cfgloop.h"
35 #include "cfglayout.h"
36 #include "expr.h"
37 #include "recog.h"
38 #include "optabs.h"
39 #include "params.h"
40 #include "diagnostic-core.h"
41 #include "tree-chrec.h"
42 #include "tree-scalar-evolution.h"
43 #include "tree-vectorizer.h"
44 #include "target.h"
46 /* Loop Vectorization Pass.
48 This pass tries to vectorize loops.
50 For example, the vectorizer transforms the following simple loop:
52 short a[N]; short b[N]; short c[N]; int i;
54 for (i=0; i<N; i++){
55 a[i] = b[i] + c[i];
58 as if it was manually vectorized by rewriting the source code into:
60 typedef int __attribute__((mode(V8HI))) v8hi;
61 short a[N]; short b[N]; short c[N]; int i;
62 v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
63 v8hi va, vb, vc;
65 for (i=0; i<N/8; i++){
66 vb = pb[i];
67 vc = pc[i];
68 va = vb + vc;
69 pa[i] = va;
72 The main entry to this pass is vectorize_loops(), in which
73 the vectorizer applies a set of analyses on a given set of loops,
74 followed by the actual vectorization transformation for the loops that
75 had successfully passed the analysis phase.
76 Throughout this pass we make a distinction between two types of
77 data: scalars (which are represented by SSA_NAMES), and memory references
78 ("data-refs"). These two types of data require different handling both
79 during analysis and transformation. The types of data-refs that the
80 vectorizer currently supports are ARRAY_REFS which base is an array DECL
81 (not a pointer), and INDIRECT_REFS through pointers; both array and pointer
82 accesses are required to have a simple (consecutive) access pattern.
84 Analysis phase:
85 ===============
86 The driver for the analysis phase is vect_analyze_loop().
87 It applies a set of analyses, some of which rely on the scalar evolution
88 analyzer (scev) developed by Sebastian Pop.
90 During the analysis phase the vectorizer records some information
91 per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
92 loop, as well as general information about the loop as a whole, which is
93 recorded in a "loop_vec_info" struct attached to each loop.
95 Transformation phase:
96 =====================
97 The loop transformation phase scans all the stmts in the loop, and
98 creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
99 the loop that needs to be vectorized. It inserts the vector code sequence
100 just before the scalar stmt S, and records a pointer to the vector code
101 in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
102 attached to S). This pointer will be used for the vectorization of following
103 stmts which use the def of stmt S. Stmt S is removed if it writes to memory;
104 otherwise, we rely on dead code elimination for removing it.
106 For example, say stmt S1 was vectorized into stmt VS1:
108 VS1: vb = px[i];
109 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
110 S2: a = b;
112 To vectorize stmt S2, the vectorizer first finds the stmt that defines
113 the operand 'b' (S1), and gets the relevant vector def 'vb' from the
114 vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)). The
115 resulting sequence would be:
117 VS1: vb = px[i];
118 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
119 VS2: va = vb;
120 S2: a = b; STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2
122 Operands that are not SSA_NAMEs, are data-refs that appear in
123 load/store operations (like 'x[i]' in S1), and are handled differently.
125 Target modeling:
126 =================
127 Currently the only target specific information that is used is the
128 size of the vector (in bytes) - "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".
129 Targets that can support different sizes of vectors, for now will need
130 to specify one value for "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD". More
131 flexibility will be added in the future.
133 Since we only vectorize operations which vector form can be
134 expressed using existing tree codes, to verify that an operation is
135 supported, the vectorizer checks the relevant optab at the relevant
136 machine_mode (e.g, optab_handler (add_optab, V8HImode)). If
137 the value found is CODE_FOR_nothing, then there's no target support, and
138 we can't vectorize the stmt.
140 For additional information on this project see:
141 http://gcc.gnu.org/projects/tree-ssa/vectorization.html
144 /* Function vect_determine_vectorization_factor
146 Determine the vectorization factor (VF). VF is the number of data elements
147 that are operated upon in parallel in a single iteration of the vectorized
148 loop. For example, when vectorizing a loop that operates on 4byte elements,
149 on a target with vector size (VS) 16byte, the VF is set to 4, since 4
150 elements can fit in a single vector register.
152 We currently support vectorization of loops in which all types operated upon
153 are of the same size. Therefore this function currently sets VF according to
154 the size of the types operated upon, and fails if there are multiple sizes
155 in the loop.
157 VF is also the factor by which the loop iterations are strip-mined, e.g.:
158 original loop:
159 for (i=0; i<N; i++){
160 a[i] = b[i] + c[i];
163 vectorized loop:
164 for (i=0; i<N; i+=VF){
165 a[i:VF] = b[i:VF] + c[i:VF];
169 static bool
170 vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
172 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
173 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
174 int nbbs = loop->num_nodes;
175 gimple_stmt_iterator si;
176 unsigned int vectorization_factor = 0;
177 tree scalar_type;
178 gimple phi;
179 tree vectype;
180 unsigned int nunits;
181 stmt_vec_info stmt_info;
182 int i;
183 HOST_WIDE_INT dummy;
185 if (vect_print_dump_info (REPORT_DETAILS))
186 fprintf (vect_dump, "=== vect_determine_vectorization_factor ===");
188 for (i = 0; i < nbbs; i++)
190 basic_block bb = bbs[i];
192 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
194 phi = gsi_stmt (si);
195 stmt_info = vinfo_for_stmt (phi);
196 if (vect_print_dump_info (REPORT_DETAILS))
198 fprintf (vect_dump, "==> examining phi: ");
199 print_gimple_stmt (vect_dump, phi, 0, TDF_SLIM);
202 gcc_assert (stmt_info);
204 if (STMT_VINFO_RELEVANT_P (stmt_info))
206 gcc_assert (!STMT_VINFO_VECTYPE (stmt_info));
207 scalar_type = TREE_TYPE (PHI_RESULT (phi));
209 if (vect_print_dump_info (REPORT_DETAILS))
211 fprintf (vect_dump, "get vectype for scalar type: ");
212 print_generic_expr (vect_dump, scalar_type, TDF_SLIM);
215 vectype = get_vectype_for_scalar_type (scalar_type);
216 if (!vectype)
218 if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
220 fprintf (vect_dump,
221 "not vectorized: unsupported data-type ");
222 print_generic_expr (vect_dump, scalar_type, TDF_SLIM);
224 return false;
226 STMT_VINFO_VECTYPE (stmt_info) = vectype;
228 if (vect_print_dump_info (REPORT_DETAILS))
230 fprintf (vect_dump, "vectype: ");
231 print_generic_expr (vect_dump, vectype, TDF_SLIM);
234 nunits = TYPE_VECTOR_SUBPARTS (vectype);
235 if (vect_print_dump_info (REPORT_DETAILS))
236 fprintf (vect_dump, "nunits = %d", nunits);
238 if (!vectorization_factor
239 || (nunits > vectorization_factor))
240 vectorization_factor = nunits;
244 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
246 tree vf_vectype;
247 gimple stmt = gsi_stmt (si);
248 stmt_info = vinfo_for_stmt (stmt);
250 if (vect_print_dump_info (REPORT_DETAILS))
252 fprintf (vect_dump, "==> examining statement: ");
253 print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
256 gcc_assert (stmt_info);
258 /* skip stmts which do not need to be vectorized. */
259 if (!STMT_VINFO_RELEVANT_P (stmt_info)
260 && !STMT_VINFO_LIVE_P (stmt_info))
262 if (vect_print_dump_info (REPORT_DETAILS))
263 fprintf (vect_dump, "skip.");
264 continue;
267 if (gimple_get_lhs (stmt) == NULL_TREE)
269 if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
271 fprintf (vect_dump, "not vectorized: irregular stmt.");
272 print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
274 return false;
277 if (VECTOR_MODE_P (TYPE_MODE (gimple_expr_type (stmt))))
279 if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
281 fprintf (vect_dump, "not vectorized: vector stmt in loop:");
282 print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
284 return false;
287 if (STMT_VINFO_VECTYPE (stmt_info))
289 /* The only case when a vectype had been already set is for stmts
290 that contain a dataref, or for "pattern-stmts" (stmts generated
291 by the vectorizer to represent/replace a certain idiom). */
292 gcc_assert (STMT_VINFO_DATA_REF (stmt_info)
293 || is_pattern_stmt_p (stmt_info));
294 vectype = STMT_VINFO_VECTYPE (stmt_info);
296 else
298 gcc_assert (!STMT_VINFO_DATA_REF (stmt_info)
299 && !is_pattern_stmt_p (stmt_info));
301 scalar_type = TREE_TYPE (gimple_get_lhs (stmt));
302 if (vect_print_dump_info (REPORT_DETAILS))
304 fprintf (vect_dump, "get vectype for scalar type: ");
305 print_generic_expr (vect_dump, scalar_type, TDF_SLIM);
307 vectype = get_vectype_for_scalar_type (scalar_type);
308 if (!vectype)
310 if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
312 fprintf (vect_dump,
313 "not vectorized: unsupported data-type ");
314 print_generic_expr (vect_dump, scalar_type, TDF_SLIM);
316 return false;
319 STMT_VINFO_VECTYPE (stmt_info) = vectype;
322 /* The vectorization factor is according to the smallest
323 scalar type (or the largest vector size, but we only
324 support one vector size per loop). */
325 scalar_type = vect_get_smallest_scalar_type (stmt, &dummy,
326 &dummy);
327 if (vect_print_dump_info (REPORT_DETAILS))
329 fprintf (vect_dump, "get vectype for scalar type: ");
330 print_generic_expr (vect_dump, scalar_type, TDF_SLIM);
332 vf_vectype = get_vectype_for_scalar_type (scalar_type);
333 if (!vf_vectype)
335 if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
337 fprintf (vect_dump,
338 "not vectorized: unsupported data-type ");
339 print_generic_expr (vect_dump, scalar_type, TDF_SLIM);
341 return false;
344 if ((GET_MODE_SIZE (TYPE_MODE (vectype))
345 != GET_MODE_SIZE (TYPE_MODE (vf_vectype))))
347 if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
349 fprintf (vect_dump,
350 "not vectorized: different sized vector "
351 "types in statement, ");
352 print_generic_expr (vect_dump, vectype, TDF_SLIM);
353 fprintf (vect_dump, " and ");
354 print_generic_expr (vect_dump, vf_vectype, TDF_SLIM);
356 return false;
359 if (vect_print_dump_info (REPORT_DETAILS))
361 fprintf (vect_dump, "vectype: ");
362 print_generic_expr (vect_dump, vf_vectype, TDF_SLIM);
365 nunits = TYPE_VECTOR_SUBPARTS (vf_vectype);
366 if (vect_print_dump_info (REPORT_DETAILS))
367 fprintf (vect_dump, "nunits = %d", nunits);
369 if (!vectorization_factor
370 || (nunits > vectorization_factor))
371 vectorization_factor = nunits;
375 /* TODO: Analyze cost. Decide if worth while to vectorize. */
376 if (vect_print_dump_info (REPORT_DETAILS))
377 fprintf (vect_dump, "vectorization factor = %d", vectorization_factor);
378 if (vectorization_factor <= 1)
380 if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
381 fprintf (vect_dump, "not vectorized: unsupported data-type");
382 return false;
384 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
386 return true;
390 /* Function vect_is_simple_iv_evolution.
392 FORNOW: A simple evolution of an induction variables in the loop is
393 considered a polynomial evolution with constant step. */
395 static bool
396 vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init,
397 tree * step)
399 tree init_expr;
400 tree step_expr;
401 tree evolution_part = evolution_part_in_loop_num (access_fn, loop_nb);
403 /* When there is no evolution in this loop, the evolution function
404 is not "simple". */
405 if (evolution_part == NULL_TREE)
406 return false;
408 /* When the evolution is a polynomial of degree >= 2
409 the evolution function is not "simple". */
410 if (tree_is_chrec (evolution_part))
411 return false;
413 step_expr = evolution_part;
414 init_expr = unshare_expr (initial_condition_in_loop_num (access_fn, loop_nb));
416 if (vect_print_dump_info (REPORT_DETAILS))
418 fprintf (vect_dump, "step: ");
419 print_generic_expr (vect_dump, step_expr, TDF_SLIM);
420 fprintf (vect_dump, ", init: ");
421 print_generic_expr (vect_dump, init_expr, TDF_SLIM);
424 *init = init_expr;
425 *step = step_expr;
427 if (TREE_CODE (step_expr) != INTEGER_CST)
429 if (vect_print_dump_info (REPORT_DETAILS))
430 fprintf (vect_dump, "step unknown.");
431 return false;
434 return true;
437 /* Function vect_analyze_scalar_cycles_1.
439 Examine the cross iteration def-use cycles of scalar variables
440 in LOOP. LOOP_VINFO represents the loop that is now being
441 considered for vectorization (can be LOOP, or an outer-loop
442 enclosing LOOP). */
444 static void
445 vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, struct loop *loop)
447 basic_block bb = loop->header;
448 tree dumy;
449 VEC(gimple,heap) *worklist = VEC_alloc (gimple, heap, 64);
450 gimple_stmt_iterator gsi;
451 bool double_reduc;
453 if (vect_print_dump_info (REPORT_DETAILS))
454 fprintf (vect_dump, "=== vect_analyze_scalar_cycles ===");
456 /* First - identify all inductions. Reduction detection assumes that all the
457 inductions have been identified, therefore, this order must not be
458 changed. */
459 for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
461 gimple phi = gsi_stmt (gsi);
462 tree access_fn = NULL;
463 tree def = PHI_RESULT (phi);
464 stmt_vec_info stmt_vinfo = vinfo_for_stmt (phi);
466 if (vect_print_dump_info (REPORT_DETAILS))
468 fprintf (vect_dump, "Analyze phi: ");
469 print_gimple_stmt (vect_dump, phi, 0, TDF_SLIM);
472 /* Skip virtual phi's. The data dependences that are associated with
473 virtual defs/uses (i.e., memory accesses) are analyzed elsewhere. */
474 if (!is_gimple_reg (SSA_NAME_VAR (def)))
475 continue;
477 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_unknown_def_type;
479 /* Analyze the evolution function. */
480 access_fn = analyze_scalar_evolution (loop, def);
481 if (access_fn)
482 STRIP_NOPS (access_fn);
483 if (access_fn && vect_print_dump_info (REPORT_DETAILS))
485 fprintf (vect_dump, "Access function of PHI: ");
486 print_generic_expr (vect_dump, access_fn, TDF_SLIM);
489 if (!access_fn
490 || !vect_is_simple_iv_evolution (loop->num, access_fn, &dumy, &dumy))
492 VEC_safe_push (gimple, heap, worklist, phi);
493 continue;
496 if (vect_print_dump_info (REPORT_DETAILS))
497 fprintf (vect_dump, "Detected induction.");
498 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_induction_def;
502 /* Second - identify all reductions and nested cycles. */
503 while (VEC_length (gimple, worklist) > 0)
505 gimple phi = VEC_pop (gimple, worklist);
506 tree def = PHI_RESULT (phi);
507 stmt_vec_info stmt_vinfo = vinfo_for_stmt (phi);
508 gimple reduc_stmt;
509 bool nested_cycle;
511 if (vect_print_dump_info (REPORT_DETAILS))
513 fprintf (vect_dump, "Analyze phi: ");
514 print_gimple_stmt (vect_dump, phi, 0, TDF_SLIM);
517 gcc_assert (is_gimple_reg (SSA_NAME_VAR (def)));
518 gcc_assert (STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_unknown_def_type);
520 nested_cycle = (loop != LOOP_VINFO_LOOP (loop_vinfo));
521 reduc_stmt = vect_force_simple_reduction (loop_vinfo, phi, !nested_cycle,
522 &double_reduc);
523 if (reduc_stmt)
525 if (double_reduc)
527 if (vect_print_dump_info (REPORT_DETAILS))
528 fprintf (vect_dump, "Detected double reduction.");
530 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_double_reduction_def;
531 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
532 vect_double_reduction_def;
534 else
536 if (nested_cycle)
538 if (vect_print_dump_info (REPORT_DETAILS))
539 fprintf (vect_dump, "Detected vectorizable nested cycle.");
541 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_nested_cycle;
542 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
543 vect_nested_cycle;
545 else
547 if (vect_print_dump_info (REPORT_DETAILS))
548 fprintf (vect_dump, "Detected reduction.");
550 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def;
551 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
552 vect_reduction_def;
553 /* Store the reduction cycles for possible vectorization in
554 loop-aware SLP. */
555 VEC_safe_push (gimple, heap,
556 LOOP_VINFO_REDUCTIONS (loop_vinfo),
557 reduc_stmt);
561 else
562 if (vect_print_dump_info (REPORT_DETAILS))
563 fprintf (vect_dump, "Unknown def-use cycle pattern.");
566 VEC_free (gimple, heap, worklist);
570 /* Function vect_analyze_scalar_cycles.
572 Examine the cross iteration def-use cycles of scalar variables, by
573 analyzing the loop-header PHIs of scalar variables. Classify each
574 cycle as one of the following: invariant, induction, reduction, unknown.
575 We do that for the loop represented by LOOP_VINFO, and also to its
576 inner-loop, if exists.
577 Examples for scalar cycles:
579 Example1: reduction:
581 loop1:
582 for (i=0; i<N; i++)
583 sum += a[i];
585 Example2: induction:
587 loop2:
588 for (i=0; i<N; i++)
589 a[i] = i; */
591 static void
592 vect_analyze_scalar_cycles (loop_vec_info loop_vinfo)
594 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
596 vect_analyze_scalar_cycles_1 (loop_vinfo, loop);
598 /* When vectorizing an outer-loop, the inner-loop is executed sequentially.
599 Reductions in such inner-loop therefore have different properties than
600 the reductions in the nest that gets vectorized:
601 1. When vectorized, they are executed in the same order as in the original
602 scalar loop, so we can't change the order of computation when
603 vectorizing them.
604 2. FIXME: Inner-loop reductions can be used in the inner-loop, so the
605 current checks are too strict. */
607 if (loop->inner)
608 vect_analyze_scalar_cycles_1 (loop_vinfo, loop->inner);
611 /* Function vect_get_loop_niters.
613 Determine how many iterations the loop is executed.
614 If an expression that represents the number of iterations
615 can be constructed, place it in NUMBER_OF_ITERATIONS.
616 Return the loop exit condition. */
618 static gimple
619 vect_get_loop_niters (struct loop *loop, tree *number_of_iterations)
621 tree niters;
623 if (vect_print_dump_info (REPORT_DETAILS))
624 fprintf (vect_dump, "=== get_loop_niters ===");
626 niters = number_of_exit_cond_executions (loop);
628 if (niters != NULL_TREE
629 && niters != chrec_dont_know)
631 *number_of_iterations = niters;
633 if (vect_print_dump_info (REPORT_DETAILS))
635 fprintf (vect_dump, "==> get_loop_niters:" );
636 print_generic_expr (vect_dump, *number_of_iterations, TDF_SLIM);
640 return get_loop_exit_condition (loop);
644 /* Function bb_in_loop_p
646 Used as predicate for dfs order traversal of the loop bbs. */
648 static bool
649 bb_in_loop_p (const_basic_block bb, const void *data)
651 const struct loop *const loop = (const struct loop *)data;
652 if (flow_bb_inside_loop_p (loop, bb))
653 return true;
654 return false;
658 /* Function new_loop_vec_info.
660 Create and initialize a new loop_vec_info struct for LOOP, as well as
661 stmt_vec_info structs for all the stmts in LOOP. */
663 static loop_vec_info
664 new_loop_vec_info (struct loop *loop)
666 loop_vec_info res;
667 basic_block *bbs;
668 gimple_stmt_iterator si;
669 unsigned int i, nbbs;
671 res = (loop_vec_info) xcalloc (1, sizeof (struct _loop_vec_info));
672 LOOP_VINFO_LOOP (res) = loop;
674 bbs = get_loop_body (loop);
676 /* Create/Update stmt_info for all stmts in the loop. */
677 for (i = 0; i < loop->num_nodes; i++)
679 basic_block bb = bbs[i];
681 /* BBs in a nested inner-loop will have been already processed (because
682 we will have called vect_analyze_loop_form for any nested inner-loop).
683 Therefore, for stmts in an inner-loop we just want to update the
684 STMT_VINFO_LOOP_VINFO field of their stmt_info to point to the new
685 loop_info of the outer-loop we are currently considering to vectorize
686 (instead of the loop_info of the inner-loop).
687 For stmts in other BBs we need to create a stmt_info from scratch. */
688 if (bb->loop_father != loop)
690 /* Inner-loop bb. */
691 gcc_assert (loop->inner && bb->loop_father == loop->inner);
692 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
694 gimple phi = gsi_stmt (si);
695 stmt_vec_info stmt_info = vinfo_for_stmt (phi);
696 loop_vec_info inner_loop_vinfo =
697 STMT_VINFO_LOOP_VINFO (stmt_info);
698 gcc_assert (loop->inner == LOOP_VINFO_LOOP (inner_loop_vinfo));
699 STMT_VINFO_LOOP_VINFO (stmt_info) = res;
701 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
703 gimple stmt = gsi_stmt (si);
704 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
705 loop_vec_info inner_loop_vinfo =
706 STMT_VINFO_LOOP_VINFO (stmt_info);
707 gcc_assert (loop->inner == LOOP_VINFO_LOOP (inner_loop_vinfo));
708 STMT_VINFO_LOOP_VINFO (stmt_info) = res;
711 else
713 /* bb in current nest. */
714 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
716 gimple phi = gsi_stmt (si);
717 gimple_set_uid (phi, 0);
718 set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, res, NULL));
721 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
723 gimple stmt = gsi_stmt (si);
724 gimple_set_uid (stmt, 0);
725 set_vinfo_for_stmt (stmt, new_stmt_vec_info (stmt, res, NULL));
730 /* CHECKME: We want to visit all BBs before their successors (except for
731 latch blocks, for which this assertion wouldn't hold). In the simple
732 case of the loop forms we allow, a dfs order of the BBs would the same
733 as reversed postorder traversal, so we are safe. */
735 free (bbs);
736 bbs = XCNEWVEC (basic_block, loop->num_nodes);
737 nbbs = dfs_enumerate_from (loop->header, 0, bb_in_loop_p,
738 bbs, loop->num_nodes, loop);
739 gcc_assert (nbbs == loop->num_nodes);
741 LOOP_VINFO_BBS (res) = bbs;
742 LOOP_VINFO_NITERS (res) = NULL;
743 LOOP_VINFO_NITERS_UNCHANGED (res) = NULL;
744 LOOP_VINFO_COST_MODEL_MIN_ITERS (res) = 0;
745 LOOP_VINFO_VECTORIZABLE_P (res) = 0;
746 LOOP_PEELING_FOR_ALIGNMENT (res) = 0;
747 LOOP_VINFO_VECT_FACTOR (res) = 0;
748 LOOP_VINFO_LOOP_NEST (res) = VEC_alloc (loop_p, heap, 3);
749 LOOP_VINFO_DATAREFS (res) = VEC_alloc (data_reference_p, heap, 10);
750 LOOP_VINFO_DDRS (res) = VEC_alloc (ddr_p, heap, 10 * 10);
751 LOOP_VINFO_UNALIGNED_DR (res) = NULL;
752 LOOP_VINFO_MAY_MISALIGN_STMTS (res) =
753 VEC_alloc (gimple, heap,
754 PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIGNMENT_CHECKS));
755 LOOP_VINFO_MAY_ALIAS_DDRS (res) =
756 VEC_alloc (ddr_p, heap,
757 PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS));
758 LOOP_VINFO_STRIDED_STORES (res) = VEC_alloc (gimple, heap, 10);
759 LOOP_VINFO_REDUCTIONS (res) = VEC_alloc (gimple, heap, 10);
760 LOOP_VINFO_SLP_INSTANCES (res) = VEC_alloc (slp_instance, heap, 10);
761 LOOP_VINFO_SLP_UNROLLING_FACTOR (res) = 1;
762 LOOP_VINFO_PEELING_HTAB (res) = NULL;
764 return res;
768 /* Function destroy_loop_vec_info.
770 Free LOOP_VINFO struct, as well as all the stmt_vec_info structs of all the
771 stmts in the loop. */
773 void
774 destroy_loop_vec_info (loop_vec_info loop_vinfo, bool clean_stmts)
776 struct loop *loop;
777 basic_block *bbs;
778 int nbbs;
779 gimple_stmt_iterator si;
780 int j;
781 VEC (slp_instance, heap) *slp_instances;
782 slp_instance instance;
784 if (!loop_vinfo)
785 return;
787 loop = LOOP_VINFO_LOOP (loop_vinfo);
789 bbs = LOOP_VINFO_BBS (loop_vinfo);
790 nbbs = loop->num_nodes;
792 if (!clean_stmts)
794 free (LOOP_VINFO_BBS (loop_vinfo));
795 free_data_refs (LOOP_VINFO_DATAREFS (loop_vinfo));
796 free_dependence_relations (LOOP_VINFO_DDRS (loop_vinfo));
797 VEC_free (loop_p, heap, LOOP_VINFO_LOOP_NEST (loop_vinfo));
798 VEC_free (gimple, heap, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo));
799 VEC_free (ddr_p, heap, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo));
801 free (loop_vinfo);
802 loop->aux = NULL;
803 return;
806 for (j = 0; j < nbbs; j++)
808 basic_block bb = bbs[j];
809 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
810 free_stmt_vec_info (gsi_stmt (si));
812 for (si = gsi_start_bb (bb); !gsi_end_p (si); )
814 gimple stmt = gsi_stmt (si);
815 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
817 if (stmt_info)
819 /* Check if this is a "pattern stmt" (introduced by the
820 vectorizer during the pattern recognition pass). */
821 bool remove_stmt_p = false;
822 gimple orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
823 if (orig_stmt)
825 stmt_vec_info orig_stmt_info = vinfo_for_stmt (orig_stmt);
826 if (orig_stmt_info
827 && STMT_VINFO_IN_PATTERN_P (orig_stmt_info))
828 remove_stmt_p = true;
831 /* Free stmt_vec_info. */
832 free_stmt_vec_info (stmt);
834 /* Remove dead "pattern stmts". */
835 if (remove_stmt_p)
836 gsi_remove (&si, true);
838 gsi_next (&si);
842 free (LOOP_VINFO_BBS (loop_vinfo));
843 free_data_refs (LOOP_VINFO_DATAREFS (loop_vinfo));
844 free_dependence_relations (LOOP_VINFO_DDRS (loop_vinfo));
845 VEC_free (loop_p, heap, LOOP_VINFO_LOOP_NEST (loop_vinfo));
846 VEC_free (gimple, heap, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo));
847 VEC_free (ddr_p, heap, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo));
848 slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
849 FOR_EACH_VEC_ELT (slp_instance, slp_instances, j, instance)
850 vect_free_slp_instance (instance);
852 VEC_free (slp_instance, heap, LOOP_VINFO_SLP_INSTANCES (loop_vinfo));
853 VEC_free (gimple, heap, LOOP_VINFO_STRIDED_STORES (loop_vinfo));
854 VEC_free (gimple, heap, LOOP_VINFO_REDUCTIONS (loop_vinfo));
856 if (LOOP_VINFO_PEELING_HTAB (loop_vinfo))
857 htab_delete (LOOP_VINFO_PEELING_HTAB (loop_vinfo));
859 free (loop_vinfo);
860 loop->aux = NULL;
864 /* Function vect_analyze_loop_1.
866 Apply a set of analyses on LOOP, and create a loop_vec_info struct
867 for it. The different analyses will record information in the
868 loop_vec_info struct. This is a subset of the analyses applied in
869 vect_analyze_loop, to be applied on an inner-loop nested in the loop
870 that is now considered for (outer-loop) vectorization. */
872 static loop_vec_info
873 vect_analyze_loop_1 (struct loop *loop)
875 loop_vec_info loop_vinfo;
877 if (vect_print_dump_info (REPORT_DETAILS))
878 fprintf (vect_dump, "===== analyze_loop_nest_1 =====");
880 /* Check the CFG characteristics of the loop (nesting, entry/exit, etc. */
882 loop_vinfo = vect_analyze_loop_form (loop);
883 if (!loop_vinfo)
885 if (vect_print_dump_info (REPORT_DETAILS))
886 fprintf (vect_dump, "bad inner-loop form.");
887 return NULL;
890 return loop_vinfo;
894 /* Function vect_analyze_loop_form.
896 Verify that certain CFG restrictions hold, including:
897 - the loop has a pre-header
898 - the loop has a single entry and exit
899 - the loop exit condition is simple enough, and the number of iterations
900 can be analyzed (a countable loop). */
902 loop_vec_info
903 vect_analyze_loop_form (struct loop *loop)
905 loop_vec_info loop_vinfo;
906 gimple loop_cond;
907 tree number_of_iterations = NULL;
908 loop_vec_info inner_loop_vinfo = NULL;
910 if (vect_print_dump_info (REPORT_DETAILS))
911 fprintf (vect_dump, "=== vect_analyze_loop_form ===");
913 /* Different restrictions apply when we are considering an inner-most loop,
914 vs. an outer (nested) loop.
915 (FORNOW. May want to relax some of these restrictions in the future). */
917 if (!loop->inner)
919 /* Inner-most loop. We currently require that the number of BBs is
920 exactly 2 (the header and latch). Vectorizable inner-most loops
921 look like this:
923 (pre-header)
925 header <--------+
926 | | |
927 | +--> latch --+
929 (exit-bb) */
931 if (loop->num_nodes != 2)
933 if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
934 fprintf (vect_dump, "not vectorized: control flow in loop.");
935 return NULL;
938 if (empty_block_p (loop->header))
940 if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
941 fprintf (vect_dump, "not vectorized: empty loop.");
942 return NULL;
945 else
947 struct loop *innerloop = loop->inner;
948 edge entryedge;
950 /* Nested loop. We currently require that the loop is doubly-nested,
951 contains a single inner loop, and the number of BBs is exactly 5.
952 Vectorizable outer-loops look like this:
954 (pre-header)
956 header <---+
958 inner-loop |
960 tail ------+
962 (exit-bb)
964 The inner-loop has the properties expected of inner-most loops
965 as described above. */
967 if ((loop->inner)->inner || (loop->inner)->next)
969 if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
970 fprintf (vect_dump, "not vectorized: multiple nested loops.");
971 return NULL;
974 /* Analyze the inner-loop. */
975 inner_loop_vinfo = vect_analyze_loop_1 (loop->inner);
976 if (!inner_loop_vinfo)
978 if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
979 fprintf (vect_dump, "not vectorized: Bad inner loop.");
980 return NULL;
983 if (!expr_invariant_in_loop_p (loop,
984 LOOP_VINFO_NITERS (inner_loop_vinfo)))
986 if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
987 fprintf (vect_dump,
988 "not vectorized: inner-loop count not invariant.");
989 destroy_loop_vec_info (inner_loop_vinfo, true);
990 return NULL;
993 if (loop->num_nodes != 5)
995 if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
996 fprintf (vect_dump, "not vectorized: control flow in loop.");
997 destroy_loop_vec_info (inner_loop_vinfo, true);
998 return NULL;
1001 gcc_assert (EDGE_COUNT (innerloop->header->preds) == 2);
1002 entryedge = EDGE_PRED (innerloop->header, 0);
1003 if (EDGE_PRED (innerloop->header, 0)->src == innerloop->latch)
1004 entryedge = EDGE_PRED (innerloop->header, 1);
1006 if (entryedge->src != loop->header
1007 || !single_exit (innerloop)
1008 || single_exit (innerloop)->dest != EDGE_PRED (loop->latch, 0)->src)
1010 if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
1011 fprintf (vect_dump, "not vectorized: unsupported outerloop form.");
1012 destroy_loop_vec_info (inner_loop_vinfo, true);
1013 return NULL;
1016 if (vect_print_dump_info (REPORT_DETAILS))
1017 fprintf (vect_dump, "Considering outer-loop vectorization.");
1020 if (!single_exit (loop)
1021 || EDGE_COUNT (loop->header->preds) != 2)
1023 if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
1025 if (!single_exit (loop))
1026 fprintf (vect_dump, "not vectorized: multiple exits.");
1027 else if (EDGE_COUNT (loop->header->preds) != 2)
1028 fprintf (vect_dump, "not vectorized: too many incoming edges.");
1030 if (inner_loop_vinfo)
1031 destroy_loop_vec_info (inner_loop_vinfo, true);
1032 return NULL;
1035 /* We assume that the loop exit condition is at the end of the loop. i.e,
1036 that the loop is represented as a do-while (with a proper if-guard
1037 before the loop if needed), where the loop header contains all the
1038 executable statements, and the latch is empty. */
1039 if (!empty_block_p (loop->latch)
1040 || !gimple_seq_empty_p (phi_nodes (loop->latch)))
1042 if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
1043 fprintf (vect_dump, "not vectorized: unexpected loop form.");
1044 if (inner_loop_vinfo)
1045 destroy_loop_vec_info (inner_loop_vinfo, true);
1046 return NULL;
1049 /* Make sure there exists a single-predecessor exit bb: */
1050 if (!single_pred_p (single_exit (loop)->dest))
1052 edge e = single_exit (loop);
1053 if (!(e->flags & EDGE_ABNORMAL))
1055 split_loop_exit_edge (e);
1056 if (vect_print_dump_info (REPORT_DETAILS))
1057 fprintf (vect_dump, "split exit edge.");
1059 else
1061 if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
1062 fprintf (vect_dump, "not vectorized: abnormal loop exit edge.");
1063 if (inner_loop_vinfo)
1064 destroy_loop_vec_info (inner_loop_vinfo, true);
1065 return NULL;
1069 loop_cond = vect_get_loop_niters (loop, &number_of_iterations);
1070 if (!loop_cond)
1072 if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
1073 fprintf (vect_dump, "not vectorized: complicated exit condition.");
1074 if (inner_loop_vinfo)
1075 destroy_loop_vec_info (inner_loop_vinfo, true);
1076 return NULL;
1079 if (!number_of_iterations)
1081 if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
1082 fprintf (vect_dump,
1083 "not vectorized: number of iterations cannot be computed.");
1084 if (inner_loop_vinfo)
1085 destroy_loop_vec_info (inner_loop_vinfo, true);
1086 return NULL;
1089 if (chrec_contains_undetermined (number_of_iterations))
1091 if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
1092 fprintf (vect_dump, "Infinite number of iterations.");
1093 if (inner_loop_vinfo)
1094 destroy_loop_vec_info (inner_loop_vinfo, true);
1095 return NULL;
1098 if (!NITERS_KNOWN_P (number_of_iterations))
1100 if (vect_print_dump_info (REPORT_DETAILS))
1102 fprintf (vect_dump, "Symbolic number of iterations is ");
1103 print_generic_expr (vect_dump, number_of_iterations, TDF_DETAILS);
1106 else if (TREE_INT_CST_LOW (number_of_iterations) == 0)
1108 if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
1109 fprintf (vect_dump, "not vectorized: number of iterations = 0.");
1110 if (inner_loop_vinfo)
1111 destroy_loop_vec_info (inner_loop_vinfo, false);
1112 return NULL;
1115 loop_vinfo = new_loop_vec_info (loop);
1116 LOOP_VINFO_NITERS (loop_vinfo) = number_of_iterations;
1117 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = number_of_iterations;
1119 STMT_VINFO_TYPE (vinfo_for_stmt (loop_cond)) = loop_exit_ctrl_vec_info_type;
1121 /* CHECKME: May want to keep it around it in the future. */
1122 if (inner_loop_vinfo)
1123 destroy_loop_vec_info (inner_loop_vinfo, false);
1125 gcc_assert (!loop->aux);
1126 loop->aux = loop_vinfo;
1127 return loop_vinfo;
1131 /* Get cost by calling cost target builtin. */
1133 static inline int
1134 vect_get_cost (enum vect_cost_for_stmt type_of_cost)
1136 tree dummy_type = NULL;
1137 int dummy = 0;
1139 return targetm.vectorize.builtin_vectorization_cost (type_of_cost,
1140 dummy_type, dummy);
1144 /* Function vect_analyze_loop_operations.
1146 Scan the loop stmts and make sure they are all vectorizable. */
1148 static bool
1149 vect_analyze_loop_operations (loop_vec_info loop_vinfo)
1151 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1152 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1153 int nbbs = loop->num_nodes;
1154 gimple_stmt_iterator si;
1155 unsigned int vectorization_factor = 0;
1156 int i;
1157 gimple phi;
1158 stmt_vec_info stmt_info;
1159 bool need_to_vectorize = false;
1160 int min_profitable_iters;
1161 int min_scalar_loop_bound;
1162 unsigned int th;
1163 bool only_slp_in_loop = true, ok;
1165 if (vect_print_dump_info (REPORT_DETAILS))
1166 fprintf (vect_dump, "=== vect_analyze_loop_operations ===");
1168 gcc_assert (LOOP_VINFO_VECT_FACTOR (loop_vinfo));
1169 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1171 for (i = 0; i < nbbs; i++)
1173 basic_block bb = bbs[i];
1175 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
1177 phi = gsi_stmt (si);
1178 ok = true;
1180 stmt_info = vinfo_for_stmt (phi);
1181 if (vect_print_dump_info (REPORT_DETAILS))
1183 fprintf (vect_dump, "examining phi: ");
1184 print_gimple_stmt (vect_dump, phi, 0, TDF_SLIM);
1187 /* Inner-loop loop-closed exit phi in outer-loop vectorization
1188 (i.e., a phi in the tail of the outer-loop). */
1189 if (! is_loop_header_bb_p (bb))
1191 /* FORNOW: we currently don't support the case that these phis
1192 are not used in the outerloop (unless it is double reduction,
1193 i.e., this phi is vect_reduction_def), cause this case
1194 requires to actually do something here. */
1195 if ((!STMT_VINFO_RELEVANT_P (stmt_info)
1196 || STMT_VINFO_LIVE_P (stmt_info))
1197 && STMT_VINFO_DEF_TYPE (stmt_info)
1198 != vect_double_reduction_def)
1200 if (vect_print_dump_info (REPORT_DETAILS))
1201 fprintf (vect_dump,
1202 "Unsupported loop-closed phi in outer-loop.");
1203 return false;
1206 /* If PHI is used in the outer loop, we check that its operand
1207 is defined in the inner loop. */
1208 if (STMT_VINFO_RELEVANT_P (stmt_info))
1210 tree phi_op;
1211 gimple op_def_stmt;
1213 if (gimple_phi_num_args (phi) != 1)
1214 return false;
1216 phi_op = PHI_ARG_DEF (phi, 0);
1217 if (TREE_CODE (phi_op) != SSA_NAME)
1218 return false;
1220 op_def_stmt = SSA_NAME_DEF_STMT (phi_op);
1221 if (!op_def_stmt || !vinfo_for_stmt (op_def_stmt))
1222 return false;
1224 if (STMT_VINFO_RELEVANT (vinfo_for_stmt (op_def_stmt))
1225 != vect_used_in_outer
1226 && STMT_VINFO_RELEVANT (vinfo_for_stmt (op_def_stmt))
1227 != vect_used_in_outer_by_reduction)
1228 return false;
1231 continue;
1234 gcc_assert (stmt_info);
1236 if (STMT_VINFO_LIVE_P (stmt_info))
1238 /* FORNOW: not yet supported. */
1239 if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
1240 fprintf (vect_dump, "not vectorized: value used after loop.");
1241 return false;
1244 if (STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope
1245 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
1247 /* A scalar-dependence cycle that we don't support. */
1248 if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
1249 fprintf (vect_dump, "not vectorized: scalar dependence cycle.");
1250 return false;
1253 if (STMT_VINFO_RELEVANT_P (stmt_info))
1255 need_to_vectorize = true;
1256 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def)
1257 ok = vectorizable_induction (phi, NULL, NULL);
1260 if (!ok)
1262 if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
1264 fprintf (vect_dump,
1265 "not vectorized: relevant phi not supported: ");
1266 print_gimple_stmt (vect_dump, phi, 0, TDF_SLIM);
1268 return false;
1272 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1274 gimple stmt = gsi_stmt (si);
1275 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1277 gcc_assert (stmt_info);
1279 if (!vect_analyze_stmt (stmt, &need_to_vectorize, NULL))
1280 return false;
1282 if ((STMT_VINFO_RELEVANT_P (stmt_info)
1283 || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1284 && !PURE_SLP_STMT (stmt_info))
1285 /* STMT needs both SLP and loop-based vectorization. */
1286 only_slp_in_loop = false;
1288 } /* bbs */
1290 /* All operations in the loop are either irrelevant (deal with loop
1291 control, or dead), or only used outside the loop and can be moved
1292 out of the loop (e.g. invariants, inductions). The loop can be
1293 optimized away by scalar optimizations. We're better off not
1294 touching this loop. */
1295 if (!need_to_vectorize)
1297 if (vect_print_dump_info (REPORT_DETAILS))
1298 fprintf (vect_dump,
1299 "All the computation can be taken out of the loop.");
1300 if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
1301 fprintf (vect_dump,
1302 "not vectorized: redundant loop. no profit to vectorize.");
1303 return false;
1306 /* If all the stmts in the loop can be SLPed, we perform only SLP, and
1307 vectorization factor of the loop is the unrolling factor required by the
1308 SLP instances. If that unrolling factor is 1, we say, that we perform
1309 pure SLP on loop - cross iteration parallelism is not exploited. */
1310 if (only_slp_in_loop)
1311 vectorization_factor = LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo);
1312 else
1313 vectorization_factor = least_common_multiple (vectorization_factor,
1314 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo));
1316 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
1318 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1319 && vect_print_dump_info (REPORT_DETAILS))
1320 fprintf (vect_dump,
1321 "vectorization_factor = %d, niters = " HOST_WIDE_INT_PRINT_DEC,
1322 vectorization_factor, LOOP_VINFO_INT_NITERS (loop_vinfo));
1324 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1325 && (LOOP_VINFO_INT_NITERS (loop_vinfo) < vectorization_factor))
1327 if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
1328 fprintf (vect_dump, "not vectorized: iteration count too small.");
1329 if (vect_print_dump_info (REPORT_DETAILS))
1330 fprintf (vect_dump,"not vectorized: iteration count smaller than "
1331 "vectorization factor.");
1332 return false;
1335 /* Analyze cost. Decide if worth while to vectorize. */
1337 /* Once VF is set, SLP costs should be updated since the number of created
1338 vector stmts depends on VF. */
1339 vect_update_slp_costs_according_to_vf (loop_vinfo);
1341 min_profitable_iters = vect_estimate_min_profitable_iters (loop_vinfo);
1342 LOOP_VINFO_COST_MODEL_MIN_ITERS (loop_vinfo) = min_profitable_iters;
1344 if (min_profitable_iters < 0)
1346 if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
1347 fprintf (vect_dump, "not vectorized: vectorization not profitable.");
1348 if (vect_print_dump_info (REPORT_DETAILS))
1349 fprintf (vect_dump, "not vectorized: vector version will never be "
1350 "profitable.");
1351 return false;
1354 min_scalar_loop_bound = ((PARAM_VALUE (PARAM_MIN_VECT_LOOP_BOUND)
1355 * vectorization_factor) - 1);
1357 /* Use the cost model only if it is more conservative than user specified
1358 threshold. */
1360 th = (unsigned) min_scalar_loop_bound;
1361 if (min_profitable_iters
1362 && (!min_scalar_loop_bound
1363 || min_profitable_iters > min_scalar_loop_bound))
1364 th = (unsigned) min_profitable_iters;
1366 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1367 && LOOP_VINFO_INT_NITERS (loop_vinfo) <= th)
1369 if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
1370 fprintf (vect_dump, "not vectorized: vectorization not "
1371 "profitable.");
1372 if (vect_print_dump_info (REPORT_DETAILS))
1373 fprintf (vect_dump, "not vectorized: iteration count smaller than "
1374 "user specified loop bound parameter or minimum "
1375 "profitable iterations (whichever is more conservative).");
1376 return false;
1379 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1380 || LOOP_VINFO_INT_NITERS (loop_vinfo) % vectorization_factor != 0
1381 || LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo))
1383 if (vect_print_dump_info (REPORT_DETAILS))
1384 fprintf (vect_dump, "epilog loop required.");
1385 if (!vect_can_advance_ivs_p (loop_vinfo))
1387 if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
1388 fprintf (vect_dump,
1389 "not vectorized: can't create epilog loop 1.");
1390 return false;
1392 if (!slpeel_can_duplicate_loop_p (loop, single_exit (loop)))
1394 if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
1395 fprintf (vect_dump,
1396 "not vectorized: can't create epilog loop 2.");
1397 return false;
1401 return true;
1405 /* Function vect_analyze_loop_2.
1407 Apply a set of analyses on LOOP, and create a loop_vec_info struct
1408 for it. The different analyses will record information in the
1409 loop_vec_info struct. */
1410 static bool
1411 vect_analyze_loop_2 (loop_vec_info loop_vinfo)
1413 bool ok, dummy;
1414 int max_vf = MAX_VECTORIZATION_FACTOR;
1415 int min_vf = 2;
1417 /* Find all data references in the loop (which correspond to vdefs/vuses)
1418 and analyze their evolution in the loop. Also adjust the minimal
1419 vectorization factor according to the loads and stores.
1421 FORNOW: Handle only simple, array references, which
1422 alignment can be forced, and aligned pointer-references. */
1424 ok = vect_analyze_data_refs (loop_vinfo, NULL, &min_vf);
1425 if (!ok)
1427 if (vect_print_dump_info (REPORT_DETAILS))
1428 fprintf (vect_dump, "bad data references.");
1429 return false;
1432 /* Classify all cross-iteration scalar data-flow cycles.
1433 Cross-iteration cycles caused by virtual phis are analyzed separately. */
1435 vect_analyze_scalar_cycles (loop_vinfo);
1437 vect_pattern_recog (loop_vinfo);
1439 /* Data-flow analysis to detect stmts that do not need to be vectorized. */
1441 ok = vect_mark_stmts_to_be_vectorized (loop_vinfo);
1442 if (!ok)
1444 if (vect_print_dump_info (REPORT_DETAILS))
1445 fprintf (vect_dump, "unexpected pattern.");
1446 return false;
1449 /* Analyze data dependences between the data-refs in the loop
1450 and adjust the maximum vectorization factor according to
1451 the dependences.
1452 FORNOW: fail at the first data dependence that we encounter. */
1454 ok = vect_analyze_data_ref_dependences (loop_vinfo, NULL, &max_vf, &dummy);
1455 if (!ok
1456 || max_vf < min_vf)
1458 if (vect_print_dump_info (REPORT_DETAILS))
1459 fprintf (vect_dump, "bad data dependence.");
1460 return false;
1463 ok = vect_determine_vectorization_factor (loop_vinfo);
1464 if (!ok)
1466 if (vect_print_dump_info (REPORT_DETAILS))
1467 fprintf (vect_dump, "can't determine vectorization factor.");
1468 return false;
1470 if (max_vf < LOOP_VINFO_VECT_FACTOR (loop_vinfo))
1472 if (vect_print_dump_info (REPORT_DETAILS))
1473 fprintf (vect_dump, "bad data dependence.");
1474 return false;
1477 /* Analyze the alignment of the data-refs in the loop.
1478 Fail if a data reference is found that cannot be vectorized. */
1480 ok = vect_analyze_data_refs_alignment (loop_vinfo, NULL);
1481 if (!ok)
1483 if (vect_print_dump_info (REPORT_DETAILS))
1484 fprintf (vect_dump, "bad data alignment.");
1485 return false;
1488 /* Analyze the access patterns of the data-refs in the loop (consecutive,
1489 complex, etc.). FORNOW: Only handle consecutive access pattern. */
1491 ok = vect_analyze_data_ref_accesses (loop_vinfo, NULL);
1492 if (!ok)
1494 if (vect_print_dump_info (REPORT_DETAILS))
1495 fprintf (vect_dump, "bad data access.");
1496 return false;
1499 /* Prune the list of ddrs to be tested at run-time by versioning for alias.
1500 It is important to call pruning after vect_analyze_data_ref_accesses,
1501 since we use grouping information gathered by interleaving analysis. */
1502 ok = vect_prune_runtime_alias_test_list (loop_vinfo);
1503 if (!ok)
1505 if (vect_print_dump_info (REPORT_DETAILS))
1506 fprintf (vect_dump, "too long list of versioning for alias "
1507 "run-time tests.");
1508 return false;
1511 /* This pass will decide on using loop versioning and/or loop peeling in
1512 order to enhance the alignment of data references in the loop. */
1514 ok = vect_enhance_data_refs_alignment (loop_vinfo);
1515 if (!ok)
1517 if (vect_print_dump_info (REPORT_DETAILS))
1518 fprintf (vect_dump, "bad data alignment.");
1519 return false;
1522 /* Check the SLP opportunities in the loop, analyze and build SLP trees. */
1523 ok = vect_analyze_slp (loop_vinfo, NULL);
1524 if (ok)
1526 /* Decide which possible SLP instances to SLP. */
1527 vect_make_slp_decision (loop_vinfo);
1529 /* Find stmts that need to be both vectorized and SLPed. */
1530 vect_detect_hybrid_slp (loop_vinfo);
1533 /* Scan all the operations in the loop and make sure they are
1534 vectorizable. */
1536 ok = vect_analyze_loop_operations (loop_vinfo);
1537 if (!ok)
1539 if (vect_print_dump_info (REPORT_DETAILS))
1540 fprintf (vect_dump, "bad operation or unsupported loop bound.");
1541 return false;
1544 return true;
1547 /* Function vect_analyze_loop.
1549 Apply a set of analyses on LOOP, and create a loop_vec_info struct
1550 for it. The different analyses will record information in the
1551 loop_vec_info struct. */
1552 loop_vec_info
1553 vect_analyze_loop (struct loop *loop)
1555 loop_vec_info loop_vinfo;
1556 unsigned int vector_sizes;
1558 /* Autodetect first vector size we try. */
1559 current_vector_size = 0;
1560 vector_sizes = targetm.vectorize.autovectorize_vector_sizes ();
1562 if (vect_print_dump_info (REPORT_DETAILS))
1563 fprintf (vect_dump, "===== analyze_loop_nest =====");
1565 if (loop_outer (loop)
1566 && loop_vec_info_for_loop (loop_outer (loop))
1567 && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop))))
1569 if (vect_print_dump_info (REPORT_DETAILS))
1570 fprintf (vect_dump, "outer-loop already vectorized.");
1571 return NULL;
1574 while (1)
1576 /* Check the CFG characteristics of the loop (nesting, entry/exit). */
1577 loop_vinfo = vect_analyze_loop_form (loop);
1578 if (!loop_vinfo)
1580 if (vect_print_dump_info (REPORT_DETAILS))
1581 fprintf (vect_dump, "bad loop form.");
1582 return NULL;
1585 if (vect_analyze_loop_2 (loop_vinfo))
1587 LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1;
1589 return loop_vinfo;
1592 destroy_loop_vec_info (loop_vinfo, true);
1594 vector_sizes &= ~current_vector_size;
1595 if (vector_sizes == 0
1596 || current_vector_size == 0)
1597 return NULL;
1599 /* Try the next biggest vector size. */
1600 current_vector_size = 1 << floor_log2 (vector_sizes);
1601 if (vect_print_dump_info (REPORT_DETAILS))
1602 fprintf (vect_dump, "***** Re-trying analysis with "
1603 "vector size %d\n", current_vector_size);
1608 /* Function reduction_code_for_scalar_code
1610 Input:
1611 CODE - tree_code of a reduction operations.
1613 Output:
1614 REDUC_CODE - the corresponding tree-code to be used to reduce the
1615 vector of partial results into a single scalar result (which
1616 will also reside in a vector) or ERROR_MARK if the operation is
1617 a supported reduction operation, but does not have such tree-code.
1619 Return FALSE if CODE currently cannot be vectorized as reduction. */
1621 static bool
1622 reduction_code_for_scalar_code (enum tree_code code,
1623 enum tree_code *reduc_code)
1625 switch (code)
1627 case MAX_EXPR:
1628 *reduc_code = REDUC_MAX_EXPR;
1629 return true;
1631 case MIN_EXPR:
1632 *reduc_code = REDUC_MIN_EXPR;
1633 return true;
1635 case PLUS_EXPR:
1636 *reduc_code = REDUC_PLUS_EXPR;
1637 return true;
1639 case MULT_EXPR:
1640 case MINUS_EXPR:
1641 case BIT_IOR_EXPR:
1642 case BIT_XOR_EXPR:
1643 case BIT_AND_EXPR:
1644 *reduc_code = ERROR_MARK;
1645 return true;
1647 default:
1648 return false;
1653 /* Error reporting helper for vect_is_simple_reduction below. GIMPLE statement
1654 STMT is printed with a message MSG. */
1656 static void
1657 report_vect_op (gimple stmt, const char *msg)
1659 fprintf (vect_dump, "%s", msg);
1660 print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
1664 /* Function vect_is_simple_reduction_1
1666 (1) Detect a cross-iteration def-use cycle that represents a simple
1667 reduction computation. We look for the following pattern:
1669 loop_header:
1670 a1 = phi < a0, a2 >
1671 a3 = ...
1672 a2 = operation (a3, a1)
1674 such that:
1675 1. operation is commutative and associative and it is safe to
1676 change the order of the computation (if CHECK_REDUCTION is true)
1677 2. no uses for a2 in the loop (a2 is used out of the loop)
1678 3. no uses of a1 in the loop besides the reduction operation
1679 4. no uses of a1 outside the loop.
1681 Conditions 1,4 are tested here.
1682 Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.
1684 (2) Detect a cross-iteration def-use cycle in nested loops, i.e.,
1685 nested cycles, if CHECK_REDUCTION is false.
1687 (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double
1688 reductions:
1690 a1 = phi < a0, a2 >
1691 inner loop (def of a3)
1692 a2 = phi < a3 >
1694 If MODIFY is true it tries also to rework the code in-place to enable
1695 detection of more reduction patterns. For the time being we rewrite
1696 "res -= RHS" into "rhs += -RHS" when it seems worthwhile.
1699 static gimple
1700 vect_is_simple_reduction_1 (loop_vec_info loop_info, gimple phi,
1701 bool check_reduction, bool *double_reduc,
1702 bool modify)
1704 struct loop *loop = (gimple_bb (phi))->loop_father;
1705 struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
1706 edge latch_e = loop_latch_edge (loop);
1707 tree loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
1708 gimple def_stmt, def1 = NULL, def2 = NULL;
1709 enum tree_code orig_code, code;
1710 tree op1, op2, op3 = NULL_TREE, op4 = NULL_TREE;
1711 tree type;
1712 int nloop_uses;
1713 tree name;
1714 imm_use_iterator imm_iter;
1715 use_operand_p use_p;
1716 bool phi_def;
1718 *double_reduc = false;
1720 /* If CHECK_REDUCTION is true, we assume inner-most loop vectorization,
1721 otherwise, we assume outer loop vectorization. */
1722 gcc_assert ((check_reduction && loop == vect_loop)
1723 || (!check_reduction && flow_loop_nested_p (vect_loop, loop)));
1725 name = PHI_RESULT (phi);
1726 nloop_uses = 0;
1727 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name)
1729 gimple use_stmt = USE_STMT (use_p);
1730 if (is_gimple_debug (use_stmt))
1731 continue;
1733 if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
1735 if (vect_print_dump_info (REPORT_DETAILS))
1736 fprintf (vect_dump, "intermediate value used outside loop.");
1738 return NULL;
1741 if (vinfo_for_stmt (use_stmt)
1742 && !is_pattern_stmt_p (vinfo_for_stmt (use_stmt)))
1743 nloop_uses++;
1744 if (nloop_uses > 1)
1746 if (vect_print_dump_info (REPORT_DETAILS))
1747 fprintf (vect_dump, "reduction used in loop.");
1748 return NULL;
1752 if (TREE_CODE (loop_arg) != SSA_NAME)
1754 if (vect_print_dump_info (REPORT_DETAILS))
1756 fprintf (vect_dump, "reduction: not ssa_name: ");
1757 print_generic_expr (vect_dump, loop_arg, TDF_SLIM);
1759 return NULL;
1762 def_stmt = SSA_NAME_DEF_STMT (loop_arg);
1763 if (!def_stmt)
1765 if (vect_print_dump_info (REPORT_DETAILS))
1766 fprintf (vect_dump, "reduction: no def_stmt.");
1767 return NULL;
1770 if (!is_gimple_assign (def_stmt) && gimple_code (def_stmt) != GIMPLE_PHI)
1772 if (vect_print_dump_info (REPORT_DETAILS))
1773 print_gimple_stmt (vect_dump, def_stmt, 0, TDF_SLIM);
1774 return NULL;
1777 if (is_gimple_assign (def_stmt))
1779 name = gimple_assign_lhs (def_stmt);
1780 phi_def = false;
1782 else
1784 name = PHI_RESULT (def_stmt);
1785 phi_def = true;
1788 nloop_uses = 0;
1789 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name)
1791 gimple use_stmt = USE_STMT (use_p);
1792 if (is_gimple_debug (use_stmt))
1793 continue;
1794 if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt))
1795 && vinfo_for_stmt (use_stmt)
1796 && !is_pattern_stmt_p (vinfo_for_stmt (use_stmt)))
1797 nloop_uses++;
1798 if (nloop_uses > 1)
1800 if (vect_print_dump_info (REPORT_DETAILS))
1801 fprintf (vect_dump, "reduction used in loop.");
1802 return NULL;
1806 /* If DEF_STMT is a phi node itself, we expect it to have a single argument
1807 defined in the inner loop. */
1808 if (phi_def)
1810 op1 = PHI_ARG_DEF (def_stmt, 0);
1812 if (gimple_phi_num_args (def_stmt) != 1
1813 || TREE_CODE (op1) != SSA_NAME)
1815 if (vect_print_dump_info (REPORT_DETAILS))
1816 fprintf (vect_dump, "unsupported phi node definition.");
1818 return NULL;
1821 def1 = SSA_NAME_DEF_STMT (op1);
1822 if (flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
1823 && loop->inner
1824 && flow_bb_inside_loop_p (loop->inner, gimple_bb (def1))
1825 && is_gimple_assign (def1))
1827 if (vect_print_dump_info (REPORT_DETAILS))
1828 report_vect_op (def_stmt, "detected double reduction: ");
1830 *double_reduc = true;
1831 return def_stmt;
1834 return NULL;
1837 code = orig_code = gimple_assign_rhs_code (def_stmt);
1839 /* We can handle "res -= x[i]", which is non-associative by
1840 simply rewriting this into "res += -x[i]". Avoid changing
1841 gimple instruction for the first simple tests and only do this
1842 if we're allowed to change code at all. */
1843 if (code == MINUS_EXPR
1844 && modify
1845 && (op1 = gimple_assign_rhs1 (def_stmt))
1846 && TREE_CODE (op1) == SSA_NAME
1847 && SSA_NAME_DEF_STMT (op1) == phi)
1848 code = PLUS_EXPR;
1850 if (check_reduction
1851 && (!commutative_tree_code (code) || !associative_tree_code (code)))
1853 if (vect_print_dump_info (REPORT_DETAILS))
1854 report_vect_op (def_stmt, "reduction: not commutative/associative: ");
1855 return NULL;
1858 if (get_gimple_rhs_class (code) != GIMPLE_BINARY_RHS)
1860 if (code != COND_EXPR)
1862 if (vect_print_dump_info (REPORT_DETAILS))
1863 report_vect_op (def_stmt, "reduction: not binary operation: ");
1865 return NULL;
1868 op3 = TREE_OPERAND (gimple_assign_rhs1 (def_stmt), 0);
1869 if (COMPARISON_CLASS_P (op3))
1871 op4 = TREE_OPERAND (op3, 1);
1872 op3 = TREE_OPERAND (op3, 0);
1875 op1 = TREE_OPERAND (gimple_assign_rhs1 (def_stmt), 1);
1876 op2 = TREE_OPERAND (gimple_assign_rhs1 (def_stmt), 2);
1878 if (TREE_CODE (op1) != SSA_NAME && TREE_CODE (op2) != SSA_NAME)
1880 if (vect_print_dump_info (REPORT_DETAILS))
1881 report_vect_op (def_stmt, "reduction: uses not ssa_names: ");
1883 return NULL;
1886 else
1888 op1 = gimple_assign_rhs1 (def_stmt);
1889 op2 = gimple_assign_rhs2 (def_stmt);
1891 if (TREE_CODE (op1) != SSA_NAME || TREE_CODE (op2) != SSA_NAME)
1893 if (vect_print_dump_info (REPORT_DETAILS))
1894 report_vect_op (def_stmt, "reduction: uses not ssa_names: ");
1896 return NULL;
1900 type = TREE_TYPE (gimple_assign_lhs (def_stmt));
1901 if ((TREE_CODE (op1) == SSA_NAME
1902 && !types_compatible_p (type,TREE_TYPE (op1)))
1903 || (TREE_CODE (op2) == SSA_NAME
1904 && !types_compatible_p (type, TREE_TYPE (op2)))
1905 || (op3 && TREE_CODE (op3) == SSA_NAME
1906 && !types_compatible_p (type, TREE_TYPE (op3)))
1907 || (op4 && TREE_CODE (op4) == SSA_NAME
1908 && !types_compatible_p (type, TREE_TYPE (op4))))
1910 if (vect_print_dump_info (REPORT_DETAILS))
1912 fprintf (vect_dump, "reduction: multiple types: operation type: ");
1913 print_generic_expr (vect_dump, type, TDF_SLIM);
1914 fprintf (vect_dump, ", operands types: ");
1915 print_generic_expr (vect_dump, TREE_TYPE (op1), TDF_SLIM);
1916 fprintf (vect_dump, ",");
1917 print_generic_expr (vect_dump, TREE_TYPE (op2), TDF_SLIM);
1918 if (op3)
1920 fprintf (vect_dump, ",");
1921 print_generic_expr (vect_dump, TREE_TYPE (op3), TDF_SLIM);
1924 if (op4)
1926 fprintf (vect_dump, ",");
1927 print_generic_expr (vect_dump, TREE_TYPE (op4), TDF_SLIM);
1931 return NULL;
1934 /* Check that it's ok to change the order of the computation.
1935 Generally, when vectorizing a reduction we change the order of the
1936 computation. This may change the behavior of the program in some
1937 cases, so we need to check that this is ok. One exception is when
1938 vectorizing an outer-loop: the inner-loop is executed sequentially,
1939 and therefore vectorizing reductions in the inner-loop during
1940 outer-loop vectorization is safe. */
1942 /* CHECKME: check for !flag_finite_math_only too? */
1943 if (SCALAR_FLOAT_TYPE_P (type) && !flag_associative_math
1944 && check_reduction)
1946 /* Changing the order of operations changes the semantics. */
1947 if (vect_print_dump_info (REPORT_DETAILS))
1948 report_vect_op (def_stmt, "reduction: unsafe fp math optimization: ");
1949 return NULL;
1951 else if (INTEGRAL_TYPE_P (type) && TYPE_OVERFLOW_TRAPS (type)
1952 && check_reduction)
1954 /* Changing the order of operations changes the semantics. */
1955 if (vect_print_dump_info (REPORT_DETAILS))
1956 report_vect_op (def_stmt, "reduction: unsafe int math optimization: ");
1957 return NULL;
1959 else if (SAT_FIXED_POINT_TYPE_P (type) && check_reduction)
1961 /* Changing the order of operations changes the semantics. */
1962 if (vect_print_dump_info (REPORT_DETAILS))
1963 report_vect_op (def_stmt,
1964 "reduction: unsafe fixed-point math optimization: ");
1965 return NULL;
1968 /* If we detected "res -= x[i]" earlier, rewrite it into
1969 "res += -x[i]" now. If this turns out to be useless reassoc
1970 will clean it up again. */
1971 if (orig_code == MINUS_EXPR)
1973 tree rhs = gimple_assign_rhs2 (def_stmt);
1974 tree negrhs = make_ssa_name (SSA_NAME_VAR (rhs), NULL);
1975 gimple negate_stmt = gimple_build_assign_with_ops (NEGATE_EXPR, negrhs,
1976 rhs, NULL);
1977 gimple_stmt_iterator gsi = gsi_for_stmt (def_stmt);
1978 set_vinfo_for_stmt (negate_stmt, new_stmt_vec_info (negate_stmt,
1979 loop_info, NULL));
1980 gsi_insert_before (&gsi, negate_stmt, GSI_NEW_STMT);
1981 gimple_assign_set_rhs2 (def_stmt, negrhs);
1982 gimple_assign_set_rhs_code (def_stmt, PLUS_EXPR);
1983 update_stmt (def_stmt);
1986 /* Reduction is safe. We're dealing with one of the following:
1987 1) integer arithmetic and no trapv
1988 2) floating point arithmetic, and special flags permit this optimization
1989 3) nested cycle (i.e., outer loop vectorization). */
1990 if (TREE_CODE (op1) == SSA_NAME)
1991 def1 = SSA_NAME_DEF_STMT (op1);
1993 if (TREE_CODE (op2) == SSA_NAME)
1994 def2 = SSA_NAME_DEF_STMT (op2);
1996 if (code != COND_EXPR
1997 && (!def1 || !def2 || gimple_nop_p (def1) || gimple_nop_p (def2)))
1999 if (vect_print_dump_info (REPORT_DETAILS))
2000 report_vect_op (def_stmt, "reduction: no defs for operands: ");
2001 return NULL;
2004 /* Check that one def is the reduction def, defined by PHI,
2005 the other def is either defined in the loop ("vect_internal_def"),
2006 or it's an induction (defined by a loop-header phi-node). */
2008 if (def2 && def2 == phi
2009 && (code == COND_EXPR
2010 || (def1 && flow_bb_inside_loop_p (loop, gimple_bb (def1))
2011 && (is_gimple_assign (def1)
2012 || is_gimple_call (def1)
2013 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1))
2014 == vect_induction_def
2015 || (gimple_code (def1) == GIMPLE_PHI
2016 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1))
2017 == vect_internal_def
2018 && !is_loop_header_bb_p (gimple_bb (def1)))))))
2020 if (vect_print_dump_info (REPORT_DETAILS))
2021 report_vect_op (def_stmt, "detected reduction: ");
2022 return def_stmt;
2024 else if (def1 && def1 == phi
2025 && (code == COND_EXPR
2026 || (def2 && flow_bb_inside_loop_p (loop, gimple_bb (def2))
2027 && (is_gimple_assign (def2)
2028 || is_gimple_call (def2)
2029 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2))
2030 == vect_induction_def
2031 || (gimple_code (def2) == GIMPLE_PHI
2032 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2))
2033 == vect_internal_def
2034 && !is_loop_header_bb_p (gimple_bb (def2)))))))
2036 if (check_reduction)
2038 /* Swap operands (just for simplicity - so that the rest of the code
2039 can assume that the reduction variable is always the last (second)
2040 argument). */
2041 if (vect_print_dump_info (REPORT_DETAILS))
2042 report_vect_op (def_stmt,
2043 "detected reduction: need to swap operands: ");
2045 swap_tree_operands (def_stmt, gimple_assign_rhs1_ptr (def_stmt),
2046 gimple_assign_rhs2_ptr (def_stmt));
2048 else
2050 if (vect_print_dump_info (REPORT_DETAILS))
2051 report_vect_op (def_stmt, "detected reduction: ");
2054 return def_stmt;
2056 else
2058 if (vect_print_dump_info (REPORT_DETAILS))
2059 report_vect_op (def_stmt, "reduction: unknown pattern: ");
2061 return NULL;
2065 /* Wrapper around vect_is_simple_reduction_1, that won't modify code
2066 in-place. Arguments as there. */
2068 static gimple
2069 vect_is_simple_reduction (loop_vec_info loop_info, gimple phi,
2070 bool check_reduction, bool *double_reduc)
2072 return vect_is_simple_reduction_1 (loop_info, phi, check_reduction,
2073 double_reduc, false);
2076 /* Wrapper around vect_is_simple_reduction_1, which will modify code
2077 in-place if it enables detection of more reductions. Arguments
2078 as there. */
2080 gimple
2081 vect_force_simple_reduction (loop_vec_info loop_info, gimple phi,
2082 bool check_reduction, bool *double_reduc)
2084 return vect_is_simple_reduction_1 (loop_info, phi, check_reduction,
2085 double_reduc, true);
2088 /* Calculate the cost of one scalar iteration of the loop. */
2090 vect_get_single_scalar_iteraion_cost (loop_vec_info loop_vinfo)
2092 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2093 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
2094 int nbbs = loop->num_nodes, factor, scalar_single_iter_cost = 0;
2095 int innerloop_iters, i, stmt_cost;
2097 /* Count statements in scalar loop. Using this as scalar cost for a single
2098 iteration for now.
2100 TODO: Add outer loop support.
2102 TODO: Consider assigning different costs to different scalar
2103 statements. */
2105 /* FORNOW. */
2106 innerloop_iters = 1;
2107 if (loop->inner)
2108 innerloop_iters = 50; /* FIXME */
2110 for (i = 0; i < nbbs; i++)
2112 gimple_stmt_iterator si;
2113 basic_block bb = bbs[i];
2115 if (bb->loop_father == loop->inner)
2116 factor = innerloop_iters;
2117 else
2118 factor = 1;
2120 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
2122 gimple stmt = gsi_stmt (si);
2123 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2125 if (!is_gimple_assign (stmt) && !is_gimple_call (stmt))
2126 continue;
2128 /* Skip stmts that are not vectorized inside the loop. */
2129 if (stmt_info
2130 && !STMT_VINFO_RELEVANT_P (stmt_info)
2131 && (!STMT_VINFO_LIVE_P (stmt_info)
2132 || STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def))
2133 continue;
2135 if (STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt)))
2137 if (DR_IS_READ (STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt))))
2138 stmt_cost = vect_get_cost (scalar_load);
2139 else
2140 stmt_cost = vect_get_cost (scalar_store);
2142 else
2143 stmt_cost = vect_get_cost (scalar_stmt);
2145 scalar_single_iter_cost += stmt_cost * factor;
2148 return scalar_single_iter_cost;
2151 /* Calculate cost of peeling the loop PEEL_ITERS_PROLOGUE times. */
2153 vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue,
2154 int *peel_iters_epilogue,
2155 int scalar_single_iter_cost)
2157 int peel_guard_costs = 0;
2158 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2160 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
2162 *peel_iters_epilogue = vf/2;
2163 if (vect_print_dump_info (REPORT_COST))
2164 fprintf (vect_dump, "cost model: "
2165 "epilogue peel iters set to vf/2 because "
2166 "loop iterations are unknown .");
2168 /* If peeled iterations are known but number of scalar loop
2169 iterations are unknown, count a taken branch per peeled loop. */
2170 peel_guard_costs = 2 * vect_get_cost (cond_branch_taken);
2172 else
2174 int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
2175 peel_iters_prologue = niters < peel_iters_prologue ?
2176 niters : peel_iters_prologue;
2177 *peel_iters_epilogue = (niters - peel_iters_prologue) % vf;
2180 return (peel_iters_prologue * scalar_single_iter_cost)
2181 + (*peel_iters_epilogue * scalar_single_iter_cost)
2182 + peel_guard_costs;
2185 /* Function vect_estimate_min_profitable_iters
2187 Return the number of iterations required for the vector version of the
2188 loop to be profitable relative to the cost of the scalar version of the
2189 loop.
2191 TODO: Take profile info into account before making vectorization
2192 decisions, if available. */
2195 vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo)
2197 int i;
2198 int min_profitable_iters;
2199 int peel_iters_prologue;
2200 int peel_iters_epilogue;
2201 int vec_inside_cost = 0;
2202 int vec_outside_cost = 0;
2203 int scalar_single_iter_cost = 0;
2204 int scalar_outside_cost = 0;
2205 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2206 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2207 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
2208 int nbbs = loop->num_nodes;
2209 int npeel = LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo);
2210 int peel_guard_costs = 0;
2211 int innerloop_iters = 0, factor;
2212 VEC (slp_instance, heap) *slp_instances;
2213 slp_instance instance;
2215 /* Cost model disabled. */
2216 if (!flag_vect_cost_model)
2218 if (vect_print_dump_info (REPORT_COST))
2219 fprintf (vect_dump, "cost model disabled.");
2220 return 0;
2223 /* Requires loop versioning tests to handle misalignment. */
2224 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
2226 /* FIXME: Make cost depend on complexity of individual check. */
2227 vec_outside_cost +=
2228 VEC_length (gimple, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo));
2229 if (vect_print_dump_info (REPORT_COST))
2230 fprintf (vect_dump, "cost model: Adding cost of checks for loop "
2231 "versioning to treat misalignment.\n");
2234 /* Requires loop versioning with alias checks. */
2235 if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
2237 /* FIXME: Make cost depend on complexity of individual check. */
2238 vec_outside_cost +=
2239 VEC_length (ddr_p, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo));
2240 if (vect_print_dump_info (REPORT_COST))
2241 fprintf (vect_dump, "cost model: Adding cost of checks for loop "
2242 "versioning aliasing.\n");
2245 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
2246 || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
2247 vec_outside_cost += vect_get_cost (cond_branch_taken);
2249 /* Count statements in scalar loop. Using this as scalar cost for a single
2250 iteration for now.
2252 TODO: Add outer loop support.
2254 TODO: Consider assigning different costs to different scalar
2255 statements. */
2257 /* FORNOW. */
2258 if (loop->inner)
2259 innerloop_iters = 50; /* FIXME */
2261 for (i = 0; i < nbbs; i++)
2263 gimple_stmt_iterator si;
2264 basic_block bb = bbs[i];
2266 if (bb->loop_father == loop->inner)
2267 factor = innerloop_iters;
2268 else
2269 factor = 1;
2271 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
2273 gimple stmt = gsi_stmt (si);
2274 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2275 /* Skip stmts that are not vectorized inside the loop. */
2276 if (!STMT_VINFO_RELEVANT_P (stmt_info)
2277 && (!STMT_VINFO_LIVE_P (stmt_info)
2278 || STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def))
2279 continue;
2280 vec_inside_cost += STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info) * factor;
2281 /* FIXME: for stmts in the inner-loop in outer-loop vectorization,
2282 some of the "outside" costs are generated inside the outer-loop. */
2283 vec_outside_cost += STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info);
2287 scalar_single_iter_cost = vect_get_single_scalar_iteraion_cost (loop_vinfo);
2289 /* Add additional cost for the peeled instructions in prologue and epilogue
2290 loop.
2292 FORNOW: If we don't know the value of peel_iters for prologue or epilogue
2293 at compile-time - we assume it's vf/2 (the worst would be vf-1).
2295 TODO: Build an expression that represents peel_iters for prologue and
2296 epilogue to be used in a run-time test. */
2298 if (npeel < 0)
2300 peel_iters_prologue = vf/2;
2301 if (vect_print_dump_info (REPORT_COST))
2302 fprintf (vect_dump, "cost model: "
2303 "prologue peel iters set to vf/2.");
2305 /* If peeling for alignment is unknown, loop bound of main loop becomes
2306 unknown. */
2307 peel_iters_epilogue = vf/2;
2308 if (vect_print_dump_info (REPORT_COST))
2309 fprintf (vect_dump, "cost model: "
2310 "epilogue peel iters set to vf/2 because "
2311 "peeling for alignment is unknown .");
2313 /* If peeled iterations are unknown, count a taken branch and a not taken
2314 branch per peeled loop. Even if scalar loop iterations are known,
2315 vector iterations are not known since peeled prologue iterations are
2316 not known. Hence guards remain the same. */
2317 peel_guard_costs += 2 * (vect_get_cost (cond_branch_taken)
2318 + vect_get_cost (cond_branch_not_taken));
2319 vec_outside_cost += (peel_iters_prologue * scalar_single_iter_cost)
2320 + (peel_iters_epilogue * scalar_single_iter_cost)
2321 + peel_guard_costs;
2323 else
2325 peel_iters_prologue = npeel;
2326 vec_outside_cost += vect_get_known_peeling_cost (loop_vinfo,
2327 peel_iters_prologue, &peel_iters_epilogue,
2328 scalar_single_iter_cost);
2331 /* FORNOW: The scalar outside cost is incremented in one of the
2332 following ways:
2334 1. The vectorizer checks for alignment and aliasing and generates
2335 a condition that allows dynamic vectorization. A cost model
2336 check is ANDED with the versioning condition. Hence scalar code
2337 path now has the added cost of the versioning check.
2339 if (cost > th & versioning_check)
2340 jmp to vector code
2342 Hence run-time scalar is incremented by not-taken branch cost.
2344 2. The vectorizer then checks if a prologue is required. If the
2345 cost model check was not done before during versioning, it has to
2346 be done before the prologue check.
2348 if (cost <= th)
2349 prologue = scalar_iters
2350 if (prologue == 0)
2351 jmp to vector code
2352 else
2353 execute prologue
2354 if (prologue == num_iters)
2355 go to exit
2357 Hence the run-time scalar cost is incremented by a taken branch,
2358 plus a not-taken branch, plus a taken branch cost.
2360 3. The vectorizer then checks if an epilogue is required. If the
2361 cost model check was not done before during prologue check, it
2362 has to be done with the epilogue check.
2364 if (prologue == 0)
2365 jmp to vector code
2366 else
2367 execute prologue
2368 if (prologue == num_iters)
2369 go to exit
2370 vector code:
2371 if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
2372 jmp to epilogue
2374 Hence the run-time scalar cost should be incremented by 2 taken
2375 branches.
2377 TODO: The back end may reorder the BBS's differently and reverse
2378 conditions/branch directions. Change the estimates below to
2379 something more reasonable. */
2381 /* If the number of iterations is known and we do not do versioning, we can
2382 decide whether to vectorize at compile time. Hence the scalar version
2383 do not carry cost model guard costs. */
2384 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2385 || LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
2386 || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
2388 /* Cost model check occurs at versioning. */
2389 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
2390 || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
2391 scalar_outside_cost += vect_get_cost (cond_branch_not_taken);
2392 else
2394 /* Cost model check occurs at prologue generation. */
2395 if (LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
2396 scalar_outside_cost += 2 * vect_get_cost (cond_branch_taken)
2397 + vect_get_cost (cond_branch_not_taken);
2398 /* Cost model check occurs at epilogue generation. */
2399 else
2400 scalar_outside_cost += 2 * vect_get_cost (cond_branch_taken);
2404 /* Add SLP costs. */
2405 slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
2406 FOR_EACH_VEC_ELT (slp_instance, slp_instances, i, instance)
2408 vec_outside_cost += SLP_INSTANCE_OUTSIDE_OF_LOOP_COST (instance);
2409 vec_inside_cost += SLP_INSTANCE_INSIDE_OF_LOOP_COST (instance);
2412 /* Calculate number of iterations required to make the vector version
2413 profitable, relative to the loop bodies only. The following condition
2414 must hold true:
2415 SIC * niters + SOC > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC
2416 where
2417 SIC = scalar iteration cost, VIC = vector iteration cost,
2418 VOC = vector outside cost, VF = vectorization factor,
2419 PL_ITERS = prologue iterations, EP_ITERS= epilogue iterations
2420 SOC = scalar outside cost for run time cost model check. */
2422 if ((scalar_single_iter_cost * vf) > vec_inside_cost)
2424 if (vec_outside_cost <= 0)
2425 min_profitable_iters = 1;
2426 else
2428 min_profitable_iters = ((vec_outside_cost - scalar_outside_cost) * vf
2429 - vec_inside_cost * peel_iters_prologue
2430 - vec_inside_cost * peel_iters_epilogue)
2431 / ((scalar_single_iter_cost * vf)
2432 - vec_inside_cost);
2434 if ((scalar_single_iter_cost * vf * min_profitable_iters)
2435 <= ((vec_inside_cost * min_profitable_iters)
2436 + ((vec_outside_cost - scalar_outside_cost) * vf)))
2437 min_profitable_iters++;
2440 /* vector version will never be profitable. */
2441 else
2443 if (vect_print_dump_info (REPORT_COST))
2444 fprintf (vect_dump, "cost model: the vector iteration cost = %d "
2445 "divided by the scalar iteration cost = %d "
2446 "is greater or equal to the vectorization factor = %d.",
2447 vec_inside_cost, scalar_single_iter_cost, vf);
2448 return -1;
2451 if (vect_print_dump_info (REPORT_COST))
2453 fprintf (vect_dump, "Cost model analysis: \n");
2454 fprintf (vect_dump, " Vector inside of loop cost: %d\n",
2455 vec_inside_cost);
2456 fprintf (vect_dump, " Vector outside of loop cost: %d\n",
2457 vec_outside_cost);
2458 fprintf (vect_dump, " Scalar iteration cost: %d\n",
2459 scalar_single_iter_cost);
2460 fprintf (vect_dump, " Scalar outside cost: %d\n", scalar_outside_cost);
2461 fprintf (vect_dump, " prologue iterations: %d\n",
2462 peel_iters_prologue);
2463 fprintf (vect_dump, " epilogue iterations: %d\n",
2464 peel_iters_epilogue);
2465 fprintf (vect_dump, " Calculated minimum iters for profitability: %d\n",
2466 min_profitable_iters);
2469 min_profitable_iters =
2470 min_profitable_iters < vf ? vf : min_profitable_iters;
2472 /* Because the condition we create is:
2473 if (niters <= min_profitable_iters)
2474 then skip the vectorized loop. */
2475 min_profitable_iters--;
2477 if (vect_print_dump_info (REPORT_COST))
2478 fprintf (vect_dump, " Profitability threshold = %d\n",
2479 min_profitable_iters);
2481 return min_profitable_iters;
2485 /* TODO: Close dependency between vect_model_*_cost and vectorizable_*
2486 functions. Design better to avoid maintenance issues. */
2488 /* Function vect_model_reduction_cost.
2490 Models cost for a reduction operation, including the vector ops
2491 generated within the strip-mine loop, the initial definition before
2492 the loop, and the epilogue code that must be generated. */
2494 static bool
2495 vect_model_reduction_cost (stmt_vec_info stmt_info, enum tree_code reduc_code,
2496 int ncopies)
2498 int outer_cost = 0;
2499 enum tree_code code;
2500 optab optab;
2501 tree vectype;
2502 gimple stmt, orig_stmt;
2503 tree reduction_op;
2504 enum machine_mode mode;
2505 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2506 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2509 /* Cost of reduction op inside loop. */
2510 STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info)
2511 += ncopies * vect_get_cost (vector_stmt);
2513 stmt = STMT_VINFO_STMT (stmt_info);
2515 switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
2517 case GIMPLE_SINGLE_RHS:
2518 gcc_assert (TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt)) == ternary_op);
2519 reduction_op = TREE_OPERAND (gimple_assign_rhs1 (stmt), 2);
2520 break;
2521 case GIMPLE_UNARY_RHS:
2522 reduction_op = gimple_assign_rhs1 (stmt);
2523 break;
2524 case GIMPLE_BINARY_RHS:
2525 reduction_op = gimple_assign_rhs2 (stmt);
2526 break;
2527 case GIMPLE_TERNARY_RHS:
2528 reduction_op = gimple_assign_rhs3 (stmt);
2529 break;
2530 default:
2531 gcc_unreachable ();
2534 vectype = get_vectype_for_scalar_type (TREE_TYPE (reduction_op));
2535 if (!vectype)
2537 if (vect_print_dump_info (REPORT_COST))
2539 fprintf (vect_dump, "unsupported data-type ");
2540 print_generic_expr (vect_dump, TREE_TYPE (reduction_op), TDF_SLIM);
2542 return false;
2545 mode = TYPE_MODE (vectype);
2546 orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
2548 if (!orig_stmt)
2549 orig_stmt = STMT_VINFO_STMT (stmt_info);
2551 code = gimple_assign_rhs_code (orig_stmt);
2553 /* Add in cost for initial definition. */
2554 outer_cost += vect_get_cost (scalar_to_vec);
2556 /* Determine cost of epilogue code.
2558 We have a reduction operator that will reduce the vector in one statement.
2559 Also requires scalar extract. */
2561 if (!nested_in_vect_loop_p (loop, orig_stmt))
2563 if (reduc_code != ERROR_MARK)
2564 outer_cost += vect_get_cost (vector_stmt)
2565 + vect_get_cost (vec_to_scalar);
2566 else
2568 int vec_size_in_bits = tree_low_cst (TYPE_SIZE (vectype), 1);
2569 tree bitsize =
2570 TYPE_SIZE (TREE_TYPE (gimple_assign_lhs (orig_stmt)));
2571 int element_bitsize = tree_low_cst (bitsize, 1);
2572 int nelements = vec_size_in_bits / element_bitsize;
2574 optab = optab_for_tree_code (code, vectype, optab_default);
2576 /* We have a whole vector shift available. */
2577 if (VECTOR_MODE_P (mode)
2578 && optab_handler (optab, mode) != CODE_FOR_nothing
2579 && optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
2580 /* Final reduction via vector shifts and the reduction operator. Also
2581 requires scalar extract. */
2582 outer_cost += ((exact_log2(nelements) * 2)
2583 * vect_get_cost (vector_stmt)
2584 + vect_get_cost (vec_to_scalar));
2585 else
2586 /* Use extracts and reduction op for final reduction. For N elements,
2587 we have N extracts and N-1 reduction ops. */
2588 outer_cost += ((nelements + nelements - 1)
2589 * vect_get_cost (vector_stmt));
2593 STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info) = outer_cost;
2595 if (vect_print_dump_info (REPORT_COST))
2596 fprintf (vect_dump, "vect_model_reduction_cost: inside_cost = %d, "
2597 "outside_cost = %d .", STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info),
2598 STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info));
2600 return true;
2604 /* Function vect_model_induction_cost.
2606 Models cost for induction operations. */
2608 static void
2609 vect_model_induction_cost (stmt_vec_info stmt_info, int ncopies)
2611 /* loop cost for vec_loop. */
2612 STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info)
2613 = ncopies * vect_get_cost (vector_stmt);
2614 /* prologue cost for vec_init and vec_step. */
2615 STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info)
2616 = 2 * vect_get_cost (scalar_to_vec);
2618 if (vect_print_dump_info (REPORT_COST))
2619 fprintf (vect_dump, "vect_model_induction_cost: inside_cost = %d, "
2620 "outside_cost = %d .", STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info),
2621 STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info));
2625 /* Function get_initial_def_for_induction
2627 Input:
2628 STMT - a stmt that performs an induction operation in the loop.
2629 IV_PHI - the initial value of the induction variable
2631 Output:
2632 Return a vector variable, initialized with the first VF values of
2633 the induction variable. E.g., for an iv with IV_PHI='X' and
2634 evolution S, for a vector of 4 units, we want to return:
2635 [X, X + S, X + 2*S, X + 3*S]. */
2637 static tree
2638 get_initial_def_for_induction (gimple iv_phi)
2640 stmt_vec_info stmt_vinfo = vinfo_for_stmt (iv_phi);
2641 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
2642 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2643 tree scalar_type;
2644 tree vectype;
2645 int nunits;
2646 edge pe = loop_preheader_edge (loop);
2647 struct loop *iv_loop;
2648 basic_block new_bb;
2649 tree vec, vec_init, vec_step, t;
2650 tree access_fn;
2651 tree new_var;
2652 tree new_name;
2653 gimple init_stmt, induction_phi, new_stmt;
2654 tree induc_def, vec_def, vec_dest;
2655 tree init_expr, step_expr;
2656 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2657 int i;
2658 bool ok;
2659 int ncopies;
2660 tree expr;
2661 stmt_vec_info phi_info = vinfo_for_stmt (iv_phi);
2662 bool nested_in_vect_loop = false;
2663 gimple_seq stmts = NULL;
2664 imm_use_iterator imm_iter;
2665 use_operand_p use_p;
2666 gimple exit_phi;
2667 edge latch_e;
2668 tree loop_arg;
2669 gimple_stmt_iterator si;
2670 basic_block bb = gimple_bb (iv_phi);
2671 tree stepvectype;
2672 tree resvectype;
2674 /* Is phi in an inner-loop, while vectorizing an enclosing outer-loop? */
2675 if (nested_in_vect_loop_p (loop, iv_phi))
2677 nested_in_vect_loop = true;
2678 iv_loop = loop->inner;
2680 else
2681 iv_loop = loop;
2682 gcc_assert (iv_loop == (gimple_bb (iv_phi))->loop_father);
2684 latch_e = loop_latch_edge (iv_loop);
2685 loop_arg = PHI_ARG_DEF_FROM_EDGE (iv_phi, latch_e);
2687 access_fn = analyze_scalar_evolution (iv_loop, PHI_RESULT (iv_phi));
2688 gcc_assert (access_fn);
2689 STRIP_NOPS (access_fn);
2690 ok = vect_is_simple_iv_evolution (iv_loop->num, access_fn,
2691 &init_expr, &step_expr);
2692 gcc_assert (ok);
2693 pe = loop_preheader_edge (iv_loop);
2695 scalar_type = TREE_TYPE (init_expr);
2696 vectype = get_vectype_for_scalar_type (scalar_type);
2697 resvectype = get_vectype_for_scalar_type (TREE_TYPE (PHI_RESULT (iv_phi)));
2698 gcc_assert (vectype);
2699 nunits = TYPE_VECTOR_SUBPARTS (vectype);
2700 ncopies = vf / nunits;
2702 gcc_assert (phi_info);
2703 gcc_assert (ncopies >= 1);
2705 /* Find the first insertion point in the BB. */
2706 si = gsi_after_labels (bb);
2708 /* Create the vector that holds the initial_value of the induction. */
2709 if (nested_in_vect_loop)
2711 /* iv_loop is nested in the loop to be vectorized. init_expr had already
2712 been created during vectorization of previous stmts. We obtain it
2713 from the STMT_VINFO_VEC_STMT of the defining stmt. */
2714 tree iv_def = PHI_ARG_DEF_FROM_EDGE (iv_phi,
2715 loop_preheader_edge (iv_loop));
2716 vec_init = vect_get_vec_def_for_operand (iv_def, iv_phi, NULL);
2718 else
2720 /* iv_loop is the loop to be vectorized. Create:
2721 vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr) */
2722 new_var = vect_get_new_vect_var (scalar_type, vect_scalar_var, "var_");
2723 add_referenced_var (new_var);
2725 new_name = force_gimple_operand (init_expr, &stmts, false, new_var);
2726 if (stmts)
2728 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
2729 gcc_assert (!new_bb);
2732 t = NULL_TREE;
2733 t = tree_cons (NULL_TREE, new_name, t);
2734 for (i = 1; i < nunits; i++)
2736 /* Create: new_name_i = new_name + step_expr */
2737 enum tree_code code = POINTER_TYPE_P (scalar_type)
2738 ? POINTER_PLUS_EXPR : PLUS_EXPR;
2739 init_stmt = gimple_build_assign_with_ops (code, new_var,
2740 new_name, step_expr);
2741 new_name = make_ssa_name (new_var, init_stmt);
2742 gimple_assign_set_lhs (init_stmt, new_name);
2744 new_bb = gsi_insert_on_edge_immediate (pe, init_stmt);
2745 gcc_assert (!new_bb);
2747 if (vect_print_dump_info (REPORT_DETAILS))
2749 fprintf (vect_dump, "created new init_stmt: ");
2750 print_gimple_stmt (vect_dump, init_stmt, 0, TDF_SLIM);
2752 t = tree_cons (NULL_TREE, new_name, t);
2754 /* Create a vector from [new_name_0, new_name_1, ..., new_name_nunits-1] */
2755 vec = build_constructor_from_list (vectype, nreverse (t));
2756 vec_init = vect_init_vector (iv_phi, vec, vectype, NULL);
2760 /* Create the vector that holds the step of the induction. */
2761 if (nested_in_vect_loop)
2762 /* iv_loop is nested in the loop to be vectorized. Generate:
2763 vec_step = [S, S, S, S] */
2764 new_name = step_expr;
2765 else
2767 /* iv_loop is the loop to be vectorized. Generate:
2768 vec_step = [VF*S, VF*S, VF*S, VF*S] */
2769 expr = build_int_cst (TREE_TYPE (step_expr), vf);
2770 new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
2771 expr, step_expr);
2774 t = unshare_expr (new_name);
2775 gcc_assert (CONSTANT_CLASS_P (new_name));
2776 stepvectype = get_vectype_for_scalar_type (TREE_TYPE (new_name));
2777 gcc_assert (stepvectype);
2778 vec = build_vector_from_val (stepvectype, t);
2779 vec_step = vect_init_vector (iv_phi, vec, stepvectype, NULL);
2782 /* Create the following def-use cycle:
2783 loop prolog:
2784 vec_init = ...
2785 vec_step = ...
2786 loop:
2787 vec_iv = PHI <vec_init, vec_loop>
2789 STMT
2791 vec_loop = vec_iv + vec_step; */
2793 /* Create the induction-phi that defines the induction-operand. */
2794 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
2795 add_referenced_var (vec_dest);
2796 induction_phi = create_phi_node (vec_dest, iv_loop->header);
2797 set_vinfo_for_stmt (induction_phi,
2798 new_stmt_vec_info (induction_phi, loop_vinfo, NULL));
2799 induc_def = PHI_RESULT (induction_phi);
2801 /* Create the iv update inside the loop */
2802 new_stmt = gimple_build_assign_with_ops (PLUS_EXPR, vec_dest,
2803 induc_def, vec_step);
2804 vec_def = make_ssa_name (vec_dest, new_stmt);
2805 gimple_assign_set_lhs (new_stmt, vec_def);
2806 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
2807 set_vinfo_for_stmt (new_stmt, new_stmt_vec_info (new_stmt, loop_vinfo,
2808 NULL));
2810 /* Set the arguments of the phi node: */
2811 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
2812 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
2813 UNKNOWN_LOCATION);
2816 /* In case that vectorization factor (VF) is bigger than the number
2817 of elements that we can fit in a vectype (nunits), we have to generate
2818 more than one vector stmt - i.e - we need to "unroll" the
2819 vector stmt by a factor VF/nunits. For more details see documentation
2820 in vectorizable_operation. */
2822 if (ncopies > 1)
2824 stmt_vec_info prev_stmt_vinfo;
2825 /* FORNOW. This restriction should be relaxed. */
2826 gcc_assert (!nested_in_vect_loop);
2828 /* Create the vector that holds the step of the induction. */
2829 expr = build_int_cst (TREE_TYPE (step_expr), nunits);
2830 new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
2831 expr, step_expr);
2832 t = unshare_expr (new_name);
2833 gcc_assert (CONSTANT_CLASS_P (new_name));
2834 vec = build_vector_from_val (stepvectype, t);
2835 vec_step = vect_init_vector (iv_phi, vec, stepvectype, NULL);
2837 vec_def = induc_def;
2838 prev_stmt_vinfo = vinfo_for_stmt (induction_phi);
2839 for (i = 1; i < ncopies; i++)
2841 /* vec_i = vec_prev + vec_step */
2842 new_stmt = gimple_build_assign_with_ops (PLUS_EXPR, vec_dest,
2843 vec_def, vec_step);
2844 vec_def = make_ssa_name (vec_dest, new_stmt);
2845 gimple_assign_set_lhs (new_stmt, vec_def);
2847 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
2848 if (!useless_type_conversion_p (resvectype, vectype))
2850 new_stmt = gimple_build_assign_with_ops
2851 (VIEW_CONVERT_EXPR,
2852 vect_get_new_vect_var (resvectype, vect_simple_var,
2853 "vec_iv_"),
2854 build1 (VIEW_CONVERT_EXPR, resvectype,
2855 gimple_assign_lhs (new_stmt)), NULL_TREE);
2856 gimple_assign_set_lhs (new_stmt,
2857 make_ssa_name
2858 (gimple_assign_lhs (new_stmt), new_stmt));
2859 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
2861 set_vinfo_for_stmt (new_stmt,
2862 new_stmt_vec_info (new_stmt, loop_vinfo, NULL));
2863 STMT_VINFO_RELATED_STMT (prev_stmt_vinfo) = new_stmt;
2864 prev_stmt_vinfo = vinfo_for_stmt (new_stmt);
2868 if (nested_in_vect_loop)
2870 /* Find the loop-closed exit-phi of the induction, and record
2871 the final vector of induction results: */
2872 exit_phi = NULL;
2873 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
2875 if (!flow_bb_inside_loop_p (iv_loop, gimple_bb (USE_STMT (use_p))))
2877 exit_phi = USE_STMT (use_p);
2878 break;
2881 if (exit_phi)
2883 stmt_vec_info stmt_vinfo = vinfo_for_stmt (exit_phi);
2884 /* FORNOW. Currently not supporting the case that an inner-loop induction
2885 is not used in the outer-loop (i.e. only outside the outer-loop). */
2886 gcc_assert (STMT_VINFO_RELEVANT_P (stmt_vinfo)
2887 && !STMT_VINFO_LIVE_P (stmt_vinfo));
2889 STMT_VINFO_VEC_STMT (stmt_vinfo) = new_stmt;
2890 if (vect_print_dump_info (REPORT_DETAILS))
2892 fprintf (vect_dump, "vector of inductions after inner-loop:");
2893 print_gimple_stmt (vect_dump, new_stmt, 0, TDF_SLIM);
2899 if (vect_print_dump_info (REPORT_DETAILS))
2901 fprintf (vect_dump, "transform induction: created def-use cycle: ");
2902 print_gimple_stmt (vect_dump, induction_phi, 0, TDF_SLIM);
2903 fprintf (vect_dump, "\n");
2904 print_gimple_stmt (vect_dump, SSA_NAME_DEF_STMT (vec_def), 0, TDF_SLIM);
2907 STMT_VINFO_VEC_STMT (phi_info) = induction_phi;
2908 if (!useless_type_conversion_p (resvectype, vectype))
2910 new_stmt = gimple_build_assign_with_ops
2911 (VIEW_CONVERT_EXPR,
2912 vect_get_new_vect_var (resvectype, vect_simple_var, "vec_iv_"),
2913 build1 (VIEW_CONVERT_EXPR, resvectype, induc_def), NULL_TREE);
2914 induc_def = make_ssa_name (gimple_assign_lhs (new_stmt), new_stmt);
2915 gimple_assign_set_lhs (new_stmt, induc_def);
2916 si = gsi_start_bb (bb);
2917 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
2918 set_vinfo_for_stmt (new_stmt,
2919 new_stmt_vec_info (new_stmt, loop_vinfo, NULL));
2920 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (new_stmt))
2921 = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (induction_phi));
2924 return induc_def;
2928 /* Function get_initial_def_for_reduction
2930 Input:
2931 STMT - a stmt that performs a reduction operation in the loop.
2932 INIT_VAL - the initial value of the reduction variable
2934 Output:
2935 ADJUSTMENT_DEF - a tree that holds a value to be added to the final result
2936 of the reduction (used for adjusting the epilog - see below).
2937 Return a vector variable, initialized according to the operation that STMT
2938 performs. This vector will be used as the initial value of the
2939 vector of partial results.
2941 Option1 (adjust in epilog): Initialize the vector as follows:
2942 add/bit or/xor: [0,0,...,0,0]
2943 mult/bit and: [1,1,...,1,1]
2944 min/max/cond_expr: [init_val,init_val,..,init_val,init_val]
2945 and when necessary (e.g. add/mult case) let the caller know
2946 that it needs to adjust the result by init_val.
2948 Option2: Initialize the vector as follows:
2949 add/bit or/xor: [init_val,0,0,...,0]
2950 mult/bit and: [init_val,1,1,...,1]
2951 min/max/cond_expr: [init_val,init_val,...,init_val]
2952 and no adjustments are needed.
2954 For example, for the following code:
2956 s = init_val;
2957 for (i=0;i<n;i++)
2958 s = s + a[i];
2960 STMT is 's = s + a[i]', and the reduction variable is 's'.
2961 For a vector of 4 units, we want to return either [0,0,0,init_val],
2962 or [0,0,0,0] and let the caller know that it needs to adjust
2963 the result at the end by 'init_val'.
2965 FORNOW, we are using the 'adjust in epilog' scheme, because this way the
2966 initialization vector is simpler (same element in all entries), if
2967 ADJUSTMENT_DEF is not NULL, and Option2 otherwise.
2969 A cost model should help decide between these two schemes. */
2971 tree
2972 get_initial_def_for_reduction (gimple stmt, tree init_val,
2973 tree *adjustment_def)
2975 stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
2976 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
2977 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2978 tree scalar_type = TREE_TYPE (init_val);
2979 tree vectype = get_vectype_for_scalar_type (scalar_type);
2980 int nunits;
2981 enum tree_code code = gimple_assign_rhs_code (stmt);
2982 tree def_for_init;
2983 tree init_def;
2984 tree t = NULL_TREE;
2985 int i;
2986 bool nested_in_vect_loop = false;
2987 tree init_value;
2988 REAL_VALUE_TYPE real_init_val = dconst0;
2989 int int_init_val = 0;
2990 gimple def_stmt = NULL;
2992 gcc_assert (vectype);
2993 nunits = TYPE_VECTOR_SUBPARTS (vectype);
2995 gcc_assert (POINTER_TYPE_P (scalar_type) || INTEGRAL_TYPE_P (scalar_type)
2996 || SCALAR_FLOAT_TYPE_P (scalar_type));
2998 if (nested_in_vect_loop_p (loop, stmt))
2999 nested_in_vect_loop = true;
3000 else
3001 gcc_assert (loop == (gimple_bb (stmt))->loop_father);
3003 /* In case of double reduction we only create a vector variable to be put
3004 in the reduction phi node. The actual statement creation is done in
3005 vect_create_epilog_for_reduction. */
3006 if (adjustment_def && nested_in_vect_loop
3007 && TREE_CODE (init_val) == SSA_NAME
3008 && (def_stmt = SSA_NAME_DEF_STMT (init_val))
3009 && gimple_code (def_stmt) == GIMPLE_PHI
3010 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
3011 && vinfo_for_stmt (def_stmt)
3012 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
3013 == vect_double_reduction_def)
3015 *adjustment_def = NULL;
3016 return vect_create_destination_var (init_val, vectype);
3019 if (TREE_CONSTANT (init_val))
3021 if (SCALAR_FLOAT_TYPE_P (scalar_type))
3022 init_value = build_real (scalar_type, TREE_REAL_CST (init_val));
3023 else
3024 init_value = build_int_cst (scalar_type, TREE_INT_CST_LOW (init_val));
3026 else
3027 init_value = init_val;
3029 switch (code)
3031 case WIDEN_SUM_EXPR:
3032 case DOT_PROD_EXPR:
3033 case PLUS_EXPR:
3034 case MINUS_EXPR:
3035 case BIT_IOR_EXPR:
3036 case BIT_XOR_EXPR:
3037 case MULT_EXPR:
3038 case BIT_AND_EXPR:
3039 /* ADJUSMENT_DEF is NULL when called from
3040 vect_create_epilog_for_reduction to vectorize double reduction. */
3041 if (adjustment_def)
3043 if (nested_in_vect_loop)
3044 *adjustment_def = vect_get_vec_def_for_operand (init_val, stmt,
3045 NULL);
3046 else
3047 *adjustment_def = init_val;
3050 if (code == MULT_EXPR)
3052 real_init_val = dconst1;
3053 int_init_val = 1;
3056 if (code == BIT_AND_EXPR)
3057 int_init_val = -1;
3059 if (SCALAR_FLOAT_TYPE_P (scalar_type))
3060 def_for_init = build_real (scalar_type, real_init_val);
3061 else
3062 def_for_init = build_int_cst (scalar_type, int_init_val);
3064 /* Create a vector of '0' or '1' except the first element. */
3065 for (i = nunits - 2; i >= 0; --i)
3066 t = tree_cons (NULL_TREE, def_for_init, t);
3068 /* Option1: the first element is '0' or '1' as well. */
3069 if (adjustment_def)
3071 t = tree_cons (NULL_TREE, def_for_init, t);
3072 init_def = build_vector (vectype, t);
3073 break;
3076 /* Option2: the first element is INIT_VAL. */
3077 t = tree_cons (NULL_TREE, init_value, t);
3078 if (TREE_CONSTANT (init_val))
3079 init_def = build_vector (vectype, t);
3080 else
3081 init_def = build_constructor_from_list (vectype, t);
3083 break;
3085 case MIN_EXPR:
3086 case MAX_EXPR:
3087 case COND_EXPR:
3088 if (adjustment_def)
3090 *adjustment_def = NULL_TREE;
3091 init_def = vect_get_vec_def_for_operand (init_val, stmt, NULL);
3092 break;
3095 init_def = build_vector_from_val (vectype, init_value);
3096 break;
3098 default:
3099 gcc_unreachable ();
3102 return init_def;
3106 /* Function vect_create_epilog_for_reduction
3108 Create code at the loop-epilog to finalize the result of a reduction
3109 computation.
3111 VECT_DEFS is list of vector of partial results, i.e., the lhs's of vector
3112 reduction statements.
3113 STMT is the scalar reduction stmt that is being vectorized.
3114 NCOPIES is > 1 in case the vectorization factor (VF) is bigger than the
3115 number of elements that we can fit in a vectype (nunits). In this case
3116 we have to generate more than one vector stmt - i.e - we need to "unroll"
3117 the vector stmt by a factor VF/nunits. For more details see documentation
3118 in vectorizable_operation.
3119 REDUC_CODE is the tree-code for the epilog reduction.
3120 REDUCTION_PHIS is a list of the phi-nodes that carry the reduction
3121 computation.
3122 REDUC_INDEX is the index of the operand in the right hand side of the
3123 statement that is defined by REDUCTION_PHI.
3124 DOUBLE_REDUC is TRUE if double reduction phi nodes should be handled.
3125 SLP_NODE is an SLP node containing a group of reduction statements. The
3126 first one in this group is STMT.
3128 This function:
3129 1. Creates the reduction def-use cycles: sets the arguments for
3130 REDUCTION_PHIS:
3131 The loop-entry argument is the vectorized initial-value of the reduction.
3132 The loop-latch argument is taken from VECT_DEFS - the vector of partial
3133 sums.
3134 2. "Reduces" each vector of partial results VECT_DEFS into a single result,
3135 by applying the operation specified by REDUC_CODE if available, or by
3136 other means (whole-vector shifts or a scalar loop).
3137 The function also creates a new phi node at the loop exit to preserve
3138 loop-closed form, as illustrated below.
3140 The flow at the entry to this function:
3142 loop:
3143 vec_def = phi <null, null> # REDUCTION_PHI
3144 VECT_DEF = vector_stmt # vectorized form of STMT
3145 s_loop = scalar_stmt # (scalar) STMT
3146 loop_exit:
3147 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
3148 use <s_out0>
3149 use <s_out0>
3151 The above is transformed by this function into:
3153 loop:
3154 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
3155 VECT_DEF = vector_stmt # vectorized form of STMT
3156 s_loop = scalar_stmt # (scalar) STMT
3157 loop_exit:
3158 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
3159 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
3160 v_out2 = reduce <v_out1>
3161 s_out3 = extract_field <v_out2, 0>
3162 s_out4 = adjust_result <s_out3>
3163 use <s_out4>
3164 use <s_out4>
3167 static void
3168 vect_create_epilog_for_reduction (VEC (tree, heap) *vect_defs, gimple stmt,
3169 int ncopies, enum tree_code reduc_code,
3170 VEC (gimple, heap) *reduction_phis,
3171 int reduc_index, bool double_reduc,
3172 slp_tree slp_node)
3174 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3175 stmt_vec_info prev_phi_info;
3176 tree vectype;
3177 enum machine_mode mode;
3178 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3179 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *outer_loop = NULL;
3180 basic_block exit_bb;
3181 tree scalar_dest;
3182 tree scalar_type;
3183 gimple new_phi = NULL, phi;
3184 gimple_stmt_iterator exit_gsi;
3185 tree vec_dest;
3186 tree new_temp = NULL_TREE, new_dest, new_name, new_scalar_dest;
3187 gimple epilog_stmt = NULL;
3188 enum tree_code code = gimple_assign_rhs_code (stmt);
3189 gimple exit_phi;
3190 tree bitsize, bitpos;
3191 tree adjustment_def = NULL;
3192 tree vec_initial_def = NULL;
3193 tree reduction_op, expr, def;
3194 tree orig_name, scalar_result;
3195 imm_use_iterator imm_iter, phi_imm_iter;
3196 use_operand_p use_p, phi_use_p;
3197 bool extract_scalar_result = false;
3198 gimple use_stmt, orig_stmt, reduction_phi = NULL;
3199 bool nested_in_vect_loop = false;
3200 VEC (gimple, heap) *new_phis = NULL;
3201 enum vect_def_type dt = vect_unknown_def_type;
3202 int j, i;
3203 VEC (tree, heap) *scalar_results = NULL;
3204 unsigned int group_size = 1, k, ratio;
3205 VEC (tree, heap) *vec_initial_defs = NULL;
3206 VEC (gimple, heap) *phis;
3208 if (slp_node)
3209 group_size = VEC_length (gimple, SLP_TREE_SCALAR_STMTS (slp_node));
3211 if (nested_in_vect_loop_p (loop, stmt))
3213 outer_loop = loop;
3214 loop = loop->inner;
3215 nested_in_vect_loop = true;
3216 gcc_assert (!slp_node);
3219 switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
3221 case GIMPLE_SINGLE_RHS:
3222 gcc_assert (TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt))
3223 == ternary_op);
3224 reduction_op = TREE_OPERAND (gimple_assign_rhs1 (stmt), reduc_index);
3225 break;
3226 case GIMPLE_UNARY_RHS:
3227 reduction_op = gimple_assign_rhs1 (stmt);
3228 break;
3229 case GIMPLE_BINARY_RHS:
3230 reduction_op = reduc_index ?
3231 gimple_assign_rhs2 (stmt) : gimple_assign_rhs1 (stmt);
3232 break;
3233 case GIMPLE_TERNARY_RHS:
3234 reduction_op = gimple_op (stmt, reduc_index + 1);
3235 break;
3236 default:
3237 gcc_unreachable ();
3240 vectype = get_vectype_for_scalar_type (TREE_TYPE (reduction_op));
3241 gcc_assert (vectype);
3242 mode = TYPE_MODE (vectype);
3244 /* 1. Create the reduction def-use cycle:
3245 Set the arguments of REDUCTION_PHIS, i.e., transform
3247 loop:
3248 vec_def = phi <null, null> # REDUCTION_PHI
3249 VECT_DEF = vector_stmt # vectorized form of STMT
3252 into:
3254 loop:
3255 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
3256 VECT_DEF = vector_stmt # vectorized form of STMT
3259 (in case of SLP, do it for all the phis). */
3261 /* Get the loop-entry arguments. */
3262 if (slp_node)
3263 vect_get_slp_defs (reduction_op, NULL_TREE, slp_node, &vec_initial_defs,
3264 NULL, reduc_index);
3265 else
3267 vec_initial_defs = VEC_alloc (tree, heap, 1);
3268 /* For the case of reduction, vect_get_vec_def_for_operand returns
3269 the scalar def before the loop, that defines the initial value
3270 of the reduction variable. */
3271 vec_initial_def = vect_get_vec_def_for_operand (reduction_op, stmt,
3272 &adjustment_def);
3273 VEC_quick_push (tree, vec_initial_defs, vec_initial_def);
3276 /* Set phi nodes arguments. */
3277 FOR_EACH_VEC_ELT (gimple, reduction_phis, i, phi)
3279 tree vec_init_def = VEC_index (tree, vec_initial_defs, i);
3280 tree def = VEC_index (tree, vect_defs, i);
3281 for (j = 0; j < ncopies; j++)
3283 /* Set the loop-entry arg of the reduction-phi. */
3284 add_phi_arg (phi, vec_init_def, loop_preheader_edge (loop),
3285 UNKNOWN_LOCATION);
3287 /* Set the loop-latch arg for the reduction-phi. */
3288 if (j > 0)
3289 def = vect_get_vec_def_for_stmt_copy (vect_unknown_def_type, def);
3291 add_phi_arg (phi, def, loop_latch_edge (loop), UNKNOWN_LOCATION);
3293 if (vect_print_dump_info (REPORT_DETAILS))
3295 fprintf (vect_dump, "transform reduction: created def-use"
3296 " cycle: ");
3297 print_gimple_stmt (vect_dump, phi, 0, TDF_SLIM);
3298 fprintf (vect_dump, "\n");
3299 print_gimple_stmt (vect_dump, SSA_NAME_DEF_STMT (def), 0,
3300 TDF_SLIM);
3303 phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi));
3307 VEC_free (tree, heap, vec_initial_defs);
3309 /* 2. Create epilog code.
3310 The reduction epilog code operates across the elements of the vector
3311 of partial results computed by the vectorized loop.
3312 The reduction epilog code consists of:
3314 step 1: compute the scalar result in a vector (v_out2)
3315 step 2: extract the scalar result (s_out3) from the vector (v_out2)
3316 step 3: adjust the scalar result (s_out3) if needed.
3318 Step 1 can be accomplished using one the following three schemes:
3319 (scheme 1) using reduc_code, if available.
3320 (scheme 2) using whole-vector shifts, if available.
3321 (scheme 3) using a scalar loop. In this case steps 1+2 above are
3322 combined.
3324 The overall epilog code looks like this:
3326 s_out0 = phi <s_loop> # original EXIT_PHI
3327 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
3328 v_out2 = reduce <v_out1> # step 1
3329 s_out3 = extract_field <v_out2, 0> # step 2
3330 s_out4 = adjust_result <s_out3> # step 3
3332 (step 3 is optional, and steps 1 and 2 may be combined).
3333 Lastly, the uses of s_out0 are replaced by s_out4. */
3336 /* 2.1 Create new loop-exit-phis to preserve loop-closed form:
3337 v_out1 = phi <VECT_DEF>
3338 Store them in NEW_PHIS. */
3340 exit_bb = single_exit (loop)->dest;
3341 prev_phi_info = NULL;
3342 new_phis = VEC_alloc (gimple, heap, VEC_length (tree, vect_defs));
3343 FOR_EACH_VEC_ELT (tree, vect_defs, i, def)
3345 for (j = 0; j < ncopies; j++)
3347 phi = create_phi_node (SSA_NAME_VAR (def), exit_bb);
3348 set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, loop_vinfo, NULL));
3349 if (j == 0)
3350 VEC_quick_push (gimple, new_phis, phi);
3351 else
3353 def = vect_get_vec_def_for_stmt_copy (dt, def);
3354 STMT_VINFO_RELATED_STMT (prev_phi_info) = phi;
3357 SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, def);
3358 prev_phi_info = vinfo_for_stmt (phi);
3362 /* The epilogue is created for the outer-loop, i.e., for the loop being
3363 vectorized. */
3364 if (double_reduc)
3366 loop = outer_loop;
3367 exit_bb = single_exit (loop)->dest;
3370 exit_gsi = gsi_after_labels (exit_bb);
3372 /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
3373 (i.e. when reduc_code is not available) and in the final adjustment
3374 code (if needed). Also get the original scalar reduction variable as
3375 defined in the loop. In case STMT is a "pattern-stmt" (i.e. - it
3376 represents a reduction pattern), the tree-code and scalar-def are
3377 taken from the original stmt that the pattern-stmt (STMT) replaces.
3378 Otherwise (it is a regular reduction) - the tree-code and scalar-def
3379 are taken from STMT. */
3381 orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
3382 if (!orig_stmt)
3384 /* Regular reduction */
3385 orig_stmt = stmt;
3387 else
3389 /* Reduction pattern */
3390 stmt_vec_info stmt_vinfo = vinfo_for_stmt (orig_stmt);
3391 gcc_assert (STMT_VINFO_IN_PATTERN_P (stmt_vinfo));
3392 gcc_assert (STMT_VINFO_RELATED_STMT (stmt_vinfo) == stmt);
3395 code = gimple_assign_rhs_code (orig_stmt);
3396 /* For MINUS_EXPR the initial vector is [init_val,0,...,0], therefore,
3397 partial results are added and not subtracted. */
3398 if (code == MINUS_EXPR)
3399 code = PLUS_EXPR;
3401 scalar_dest = gimple_assign_lhs (orig_stmt);
3402 scalar_type = TREE_TYPE (scalar_dest);
3403 scalar_results = VEC_alloc (tree, heap, group_size);
3404 new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
3405 bitsize = TYPE_SIZE (scalar_type);
3407 /* In case this is a reduction in an inner-loop while vectorizing an outer
3408 loop - we don't need to extract a single scalar result at the end of the
3409 inner-loop (unless it is double reduction, i.e., the use of reduction is
3410 outside the outer-loop). The final vector of partial results will be used
3411 in the vectorized outer-loop, or reduced to a scalar result at the end of
3412 the outer-loop. */
3413 if (nested_in_vect_loop && !double_reduc)
3414 goto vect_finalize_reduction;
3416 /* 2.3 Create the reduction code, using one of the three schemes described
3417 above. In SLP we simply need to extract all the elements from the
3418 vector (without reducing them), so we use scalar shifts. */
3419 if (reduc_code != ERROR_MARK && !slp_node)
3421 tree tmp;
3423 /*** Case 1: Create:
3424 v_out2 = reduc_expr <v_out1> */
3426 if (vect_print_dump_info (REPORT_DETAILS))
3427 fprintf (vect_dump, "Reduce using direct vector reduction.");
3429 vec_dest = vect_create_destination_var (scalar_dest, vectype);
3430 new_phi = VEC_index (gimple, new_phis, 0);
3431 tmp = build1 (reduc_code, vectype, PHI_RESULT (new_phi));
3432 epilog_stmt = gimple_build_assign (vec_dest, tmp);
3433 new_temp = make_ssa_name (vec_dest, epilog_stmt);
3434 gimple_assign_set_lhs (epilog_stmt, new_temp);
3435 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
3437 extract_scalar_result = true;
3439 else
3441 enum tree_code shift_code = ERROR_MARK;
3442 bool have_whole_vector_shift = true;
3443 int bit_offset;
3444 int element_bitsize = tree_low_cst (bitsize, 1);
3445 int vec_size_in_bits = tree_low_cst (TYPE_SIZE (vectype), 1);
3446 tree vec_temp;
3448 if (optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
3449 shift_code = VEC_RSHIFT_EXPR;
3450 else
3451 have_whole_vector_shift = false;
3453 /* Regardless of whether we have a whole vector shift, if we're
3454 emulating the operation via tree-vect-generic, we don't want
3455 to use it. Only the first round of the reduction is likely
3456 to still be profitable via emulation. */
3457 /* ??? It might be better to emit a reduction tree code here, so that
3458 tree-vect-generic can expand the first round via bit tricks. */
3459 if (!VECTOR_MODE_P (mode))
3460 have_whole_vector_shift = false;
3461 else
3463 optab optab = optab_for_tree_code (code, vectype, optab_default);
3464 if (optab_handler (optab, mode) == CODE_FOR_nothing)
3465 have_whole_vector_shift = false;
3468 if (have_whole_vector_shift && !slp_node)
3470 /*** Case 2: Create:
3471 for (offset = VS/2; offset >= element_size; offset/=2)
3473 Create: va' = vec_shift <va, offset>
3474 Create: va = vop <va, va'>
3475 } */
3477 if (vect_print_dump_info (REPORT_DETAILS))
3478 fprintf (vect_dump, "Reduce using vector shifts");
3480 vec_dest = vect_create_destination_var (scalar_dest, vectype);
3481 new_phi = VEC_index (gimple, new_phis, 0);
3482 new_temp = PHI_RESULT (new_phi);
3483 for (bit_offset = vec_size_in_bits/2;
3484 bit_offset >= element_bitsize;
3485 bit_offset /= 2)
3487 tree bitpos = size_int (bit_offset);
3489 epilog_stmt = gimple_build_assign_with_ops (shift_code,
3490 vec_dest, new_temp, bitpos);
3491 new_name = make_ssa_name (vec_dest, epilog_stmt);
3492 gimple_assign_set_lhs (epilog_stmt, new_name);
3493 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
3495 epilog_stmt = gimple_build_assign_with_ops (code, vec_dest,
3496 new_name, new_temp);
3497 new_temp = make_ssa_name (vec_dest, epilog_stmt);
3498 gimple_assign_set_lhs (epilog_stmt, new_temp);
3499 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
3502 extract_scalar_result = true;
3504 else
3506 tree rhs;
3508 /*** Case 3: Create:
3509 s = extract_field <v_out2, 0>
3510 for (offset = element_size;
3511 offset < vector_size;
3512 offset += element_size;)
3514 Create: s' = extract_field <v_out2, offset>
3515 Create: s = op <s, s'> // For non SLP cases
3516 } */
3518 if (vect_print_dump_info (REPORT_DETAILS))
3519 fprintf (vect_dump, "Reduce using scalar code. ");
3521 vec_size_in_bits = tree_low_cst (TYPE_SIZE (vectype), 1);
3522 FOR_EACH_VEC_ELT (gimple, new_phis, i, new_phi)
3524 vec_temp = PHI_RESULT (new_phi);
3525 rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, bitsize,
3526 bitsize_zero_node);
3527 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
3528 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
3529 gimple_assign_set_lhs (epilog_stmt, new_temp);
3530 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
3532 /* In SLP we don't need to apply reduction operation, so we just
3533 collect s' values in SCALAR_RESULTS. */
3534 if (slp_node)
3535 VEC_safe_push (tree, heap, scalar_results, new_temp);
3537 for (bit_offset = element_bitsize;
3538 bit_offset < vec_size_in_bits;
3539 bit_offset += element_bitsize)
3541 tree bitpos = bitsize_int (bit_offset);
3542 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp,
3543 bitsize, bitpos);
3545 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
3546 new_name = make_ssa_name (new_scalar_dest, epilog_stmt);
3547 gimple_assign_set_lhs (epilog_stmt, new_name);
3548 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
3550 if (slp_node)
3552 /* In SLP we don't need to apply reduction operation, so
3553 we just collect s' values in SCALAR_RESULTS. */
3554 new_temp = new_name;
3555 VEC_safe_push (tree, heap, scalar_results, new_name);
3557 else
3559 epilog_stmt = gimple_build_assign_with_ops (code,
3560 new_scalar_dest, new_name, new_temp);
3561 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
3562 gimple_assign_set_lhs (epilog_stmt, new_temp);
3563 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
3568 /* The only case where we need to reduce scalar results in SLP, is
3569 unrolling. If the size of SCALAR_RESULTS is greater than
3570 GROUP_SIZE, we reduce them combining elements modulo
3571 GROUP_SIZE. */
3572 if (slp_node)
3574 tree res, first_res, new_res;
3575 gimple new_stmt;
3577 /* Reduce multiple scalar results in case of SLP unrolling. */
3578 for (j = group_size; VEC_iterate (tree, scalar_results, j, res);
3579 j++)
3581 first_res = VEC_index (tree, scalar_results, j % group_size);
3582 new_stmt = gimple_build_assign_with_ops (code,
3583 new_scalar_dest, first_res, res);
3584 new_res = make_ssa_name (new_scalar_dest, new_stmt);
3585 gimple_assign_set_lhs (new_stmt, new_res);
3586 gsi_insert_before (&exit_gsi, new_stmt, GSI_SAME_STMT);
3587 VEC_replace (tree, scalar_results, j % group_size, new_res);
3590 else
3591 /* Not SLP - we have one scalar to keep in SCALAR_RESULTS. */
3592 VEC_safe_push (tree, heap, scalar_results, new_temp);
3594 extract_scalar_result = false;
3598 /* 2.4 Extract the final scalar result. Create:
3599 s_out3 = extract_field <v_out2, bitpos> */
3601 if (extract_scalar_result)
3603 tree rhs;
3605 if (vect_print_dump_info (REPORT_DETAILS))
3606 fprintf (vect_dump, "extract scalar result");
3608 if (BYTES_BIG_ENDIAN)
3609 bitpos = size_binop (MULT_EXPR,
3610 bitsize_int (TYPE_VECTOR_SUBPARTS (vectype) - 1),
3611 TYPE_SIZE (scalar_type));
3612 else
3613 bitpos = bitsize_zero_node;
3615 rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp, bitsize, bitpos);
3616 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
3617 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
3618 gimple_assign_set_lhs (epilog_stmt, new_temp);
3619 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
3620 VEC_safe_push (tree, heap, scalar_results, new_temp);
3623 vect_finalize_reduction:
3625 if (double_reduc)
3626 loop = loop->inner;
3628 /* 2.5 Adjust the final result by the initial value of the reduction
3629 variable. (When such adjustment is not needed, then
3630 'adjustment_def' is zero). For example, if code is PLUS we create:
3631 new_temp = loop_exit_def + adjustment_def */
3633 if (adjustment_def)
3635 gcc_assert (!slp_node);
3636 if (nested_in_vect_loop)
3638 new_phi = VEC_index (gimple, new_phis, 0);
3639 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) == VECTOR_TYPE);
3640 expr = build2 (code, vectype, PHI_RESULT (new_phi), adjustment_def);
3641 new_dest = vect_create_destination_var (scalar_dest, vectype);
3643 else
3645 new_temp = VEC_index (tree, scalar_results, 0);
3646 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
3647 expr = build2 (code, scalar_type, new_temp, adjustment_def);
3648 new_dest = vect_create_destination_var (scalar_dest, scalar_type);
3651 epilog_stmt = gimple_build_assign (new_dest, expr);
3652 new_temp = make_ssa_name (new_dest, epilog_stmt);
3653 gimple_assign_set_lhs (epilog_stmt, new_temp);
3654 SSA_NAME_DEF_STMT (new_temp) = epilog_stmt;
3655 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
3656 if (nested_in_vect_loop)
3658 set_vinfo_for_stmt (epilog_stmt,
3659 new_stmt_vec_info (epilog_stmt, loop_vinfo,
3660 NULL));
3661 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (epilog_stmt)) =
3662 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (new_phi));
3664 if (!double_reduc)
3665 VEC_quick_push (tree, scalar_results, new_temp);
3666 else
3667 VEC_replace (tree, scalar_results, 0, new_temp);
3669 else
3670 VEC_replace (tree, scalar_results, 0, new_temp);
3672 VEC_replace (gimple, new_phis, 0, epilog_stmt);
3675 /* 2.6 Handle the loop-exit phis. Replace the uses of scalar loop-exit
3676 phis with new adjusted scalar results, i.e., replace use <s_out0>
3677 with use <s_out4>.
3679 Transform:
3680 loop_exit:
3681 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
3682 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
3683 v_out2 = reduce <v_out1>
3684 s_out3 = extract_field <v_out2, 0>
3685 s_out4 = adjust_result <s_out3>
3686 use <s_out0>
3687 use <s_out0>
3689 into:
3691 loop_exit:
3692 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
3693 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
3694 v_out2 = reduce <v_out1>
3695 s_out3 = extract_field <v_out2, 0>
3696 s_out4 = adjust_result <s_out3>
3697 use <s_out4>
3698 use <s_out4> */
3700 /* In SLP we may have several statements in NEW_PHIS and REDUCTION_PHIS (in
3701 case that GROUP_SIZE is greater than vectorization factor). Therefore, we
3702 need to match SCALAR_RESULTS with corresponding statements. The first
3703 (GROUP_SIZE / number of new vector stmts) scalar results correspond to
3704 the first vector stmt, etc.
3705 (RATIO is equal to (GROUP_SIZE / number of new vector stmts)). */
3706 if (group_size > VEC_length (gimple, new_phis))
3708 ratio = group_size / VEC_length (gimple, new_phis);
3709 gcc_assert (!(group_size % VEC_length (gimple, new_phis)));
3711 else
3712 ratio = 1;
3714 for (k = 0; k < group_size; k++)
3716 if (k % ratio == 0)
3718 epilog_stmt = VEC_index (gimple, new_phis, k / ratio);
3719 reduction_phi = VEC_index (gimple, reduction_phis, k / ratio);
3722 if (slp_node)
3724 gimple current_stmt = VEC_index (gimple,
3725 SLP_TREE_SCALAR_STMTS (slp_node), k);
3727 orig_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (current_stmt));
3728 /* SLP statements can't participate in patterns. */
3729 gcc_assert (!orig_stmt);
3730 scalar_dest = gimple_assign_lhs (current_stmt);
3733 phis = VEC_alloc (gimple, heap, 3);
3734 /* Find the loop-closed-use at the loop exit of the original scalar
3735 result. (The reduction result is expected to have two immediate uses -
3736 one at the latch block, and one at the loop exit). */
3737 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
3738 if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
3739 VEC_safe_push (gimple, heap, phis, USE_STMT (use_p));
3741 /* We expect to have found an exit_phi because of loop-closed-ssa
3742 form. */
3743 gcc_assert (!VEC_empty (gimple, phis));
3745 FOR_EACH_VEC_ELT (gimple, phis, i, exit_phi)
3747 if (outer_loop)
3749 stmt_vec_info exit_phi_vinfo = vinfo_for_stmt (exit_phi);
3750 gimple vect_phi;
3752 /* FORNOW. Currently not supporting the case that an inner-loop
3753 reduction is not used in the outer-loop (but only outside the
3754 outer-loop), unless it is double reduction. */
3755 gcc_assert ((STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
3756 && !STMT_VINFO_LIVE_P (exit_phi_vinfo))
3757 || double_reduc);
3759 STMT_VINFO_VEC_STMT (exit_phi_vinfo) = epilog_stmt;
3760 if (!double_reduc
3761 || STMT_VINFO_DEF_TYPE (exit_phi_vinfo)
3762 != vect_double_reduction_def)
3763 continue;
3765 /* Handle double reduction:
3767 stmt1: s1 = phi <s0, s2> - double reduction phi (outer loop)
3768 stmt2: s3 = phi <s1, s4> - (regular) reduc phi (inner loop)
3769 stmt3: s4 = use (s3) - (regular) reduc stmt (inner loop)
3770 stmt4: s2 = phi <s4> - double reduction stmt (outer loop)
3772 At that point the regular reduction (stmt2 and stmt3) is
3773 already vectorized, as well as the exit phi node, stmt4.
3774 Here we vectorize the phi node of double reduction, stmt1, and
3775 update all relevant statements. */
3777 /* Go through all the uses of s2 to find double reduction phi
3778 node, i.e., stmt1 above. */
3779 orig_name = PHI_RESULT (exit_phi);
3780 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
3782 stmt_vec_info use_stmt_vinfo = vinfo_for_stmt (use_stmt);
3783 stmt_vec_info new_phi_vinfo;
3784 tree vect_phi_init, preheader_arg, vect_phi_res, init_def;
3785 basic_block bb = gimple_bb (use_stmt);
3786 gimple use;
3788 /* Check that USE_STMT is really double reduction phi
3789 node. */
3790 if (gimple_code (use_stmt) != GIMPLE_PHI
3791 || gimple_phi_num_args (use_stmt) != 2
3792 || !use_stmt_vinfo
3793 || STMT_VINFO_DEF_TYPE (use_stmt_vinfo)
3794 != vect_double_reduction_def
3795 || bb->loop_father != outer_loop)
3796 continue;
3798 /* Create vector phi node for double reduction:
3799 vs1 = phi <vs0, vs2>
3800 vs1 was created previously in this function by a call to
3801 vect_get_vec_def_for_operand and is stored in
3802 vec_initial_def;
3803 vs2 is defined by EPILOG_STMT, the vectorized EXIT_PHI;
3804 vs0 is created here. */
3806 /* Create vector phi node. */
3807 vect_phi = create_phi_node (vec_initial_def, bb);
3808 new_phi_vinfo = new_stmt_vec_info (vect_phi,
3809 loop_vec_info_for_loop (outer_loop), NULL);
3810 set_vinfo_for_stmt (vect_phi, new_phi_vinfo);
3812 /* Create vs0 - initial def of the double reduction phi. */
3813 preheader_arg = PHI_ARG_DEF_FROM_EDGE (use_stmt,
3814 loop_preheader_edge (outer_loop));
3815 init_def = get_initial_def_for_reduction (stmt,
3816 preheader_arg, NULL);
3817 vect_phi_init = vect_init_vector (use_stmt, init_def,
3818 vectype, NULL);
3820 /* Update phi node arguments with vs0 and vs2. */
3821 add_phi_arg (vect_phi, vect_phi_init,
3822 loop_preheader_edge (outer_loop),
3823 UNKNOWN_LOCATION);
3824 add_phi_arg (vect_phi, PHI_RESULT (epilog_stmt),
3825 loop_latch_edge (outer_loop), UNKNOWN_LOCATION);
3826 if (vect_print_dump_info (REPORT_DETAILS))
3828 fprintf (vect_dump, "created double reduction phi "
3829 "node: ");
3830 print_gimple_stmt (vect_dump, vect_phi, 0, TDF_SLIM);
3833 vect_phi_res = PHI_RESULT (vect_phi);
3835 /* Replace the use, i.e., set the correct vs1 in the regular
3836 reduction phi node. FORNOW, NCOPIES is always 1, so the
3837 loop is redundant. */
3838 use = reduction_phi;
3839 for (j = 0; j < ncopies; j++)
3841 edge pr_edge = loop_preheader_edge (loop);
3842 SET_PHI_ARG_DEF (use, pr_edge->dest_idx, vect_phi_res);
3843 use = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (use));
3849 VEC_free (gimple, heap, phis);
3850 if (nested_in_vect_loop)
3852 if (double_reduc)
3853 loop = outer_loop;
3854 else
3855 continue;
3858 phis = VEC_alloc (gimple, heap, 3);
3859 /* Find the loop-closed-use at the loop exit of the original scalar
3860 result. (The reduction result is expected to have two immediate uses,
3861 one at the latch block, and one at the loop exit). For double
3862 reductions we are looking for exit phis of the outer loop. */
3863 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
3865 if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
3866 VEC_safe_push (gimple, heap, phis, USE_STMT (use_p));
3867 else
3869 if (double_reduc && gimple_code (USE_STMT (use_p)) == GIMPLE_PHI)
3871 tree phi_res = PHI_RESULT (USE_STMT (use_p));
3873 FOR_EACH_IMM_USE_FAST (phi_use_p, phi_imm_iter, phi_res)
3875 if (!flow_bb_inside_loop_p (loop,
3876 gimple_bb (USE_STMT (phi_use_p))))
3877 VEC_safe_push (gimple, heap, phis,
3878 USE_STMT (phi_use_p));
3884 FOR_EACH_VEC_ELT (gimple, phis, i, exit_phi)
3886 /* Replace the uses: */
3887 orig_name = PHI_RESULT (exit_phi);
3888 scalar_result = VEC_index (tree, scalar_results, k);
3889 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
3890 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
3891 SET_USE (use_p, scalar_result);
3894 VEC_free (gimple, heap, phis);
3897 VEC_free (tree, heap, scalar_results);
3898 VEC_free (gimple, heap, new_phis);
3902 /* Function vectorizable_reduction.
3904 Check if STMT performs a reduction operation that can be vectorized.
3905 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
3906 stmt to replace it, put it in VEC_STMT, and insert it at GSI.
3907 Return FALSE if not a vectorizable STMT, TRUE otherwise.
3909 This function also handles reduction idioms (patterns) that have been
3910 recognized in advance during vect_pattern_recog. In this case, STMT may be
3911 of this form:
3912 X = pattern_expr (arg0, arg1, ..., X)
3913 and it's STMT_VINFO_RELATED_STMT points to the last stmt in the original
3914 sequence that had been detected and replaced by the pattern-stmt (STMT).
3916 In some cases of reduction patterns, the type of the reduction variable X is
3917 different than the type of the other arguments of STMT.
3918 In such cases, the vectype that is used when transforming STMT into a vector
3919 stmt is different than the vectype that is used to determine the
3920 vectorization factor, because it consists of a different number of elements
3921 than the actual number of elements that are being operated upon in parallel.
3923 For example, consider an accumulation of shorts into an int accumulator.
3924 On some targets it's possible to vectorize this pattern operating on 8
3925 shorts at a time (hence, the vectype for purposes of determining the
3926 vectorization factor should be V8HI); on the other hand, the vectype that
3927 is used to create the vector form is actually V4SI (the type of the result).
3929 Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
3930 indicates what is the actual level of parallelism (V8HI in the example), so
3931 that the right vectorization factor would be derived. This vectype
3932 corresponds to the type of arguments to the reduction stmt, and should *NOT*
3933 be used to create the vectorized stmt. The right vectype for the vectorized
3934 stmt is obtained from the type of the result X:
3935 get_vectype_for_scalar_type (TREE_TYPE (X))
3937 This means that, contrary to "regular" reductions (or "regular" stmts in
3938 general), the following equation:
3939 STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (TREE_TYPE (X))
3940 does *NOT* necessarily hold for reduction patterns. */
3942 bool
3943 vectorizable_reduction (gimple stmt, gimple_stmt_iterator *gsi,
3944 gimple *vec_stmt, slp_tree slp_node)
3946 tree vec_dest;
3947 tree scalar_dest;
3948 tree loop_vec_def0 = NULL_TREE, loop_vec_def1 = NULL_TREE;
3949 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3950 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
3951 tree vectype_in = NULL_TREE;
3952 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3953 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3954 enum tree_code code, orig_code, epilog_reduc_code;
3955 enum machine_mode vec_mode;
3956 int op_type;
3957 optab optab, reduc_optab;
3958 tree new_temp = NULL_TREE;
3959 tree def;
3960 gimple def_stmt;
3961 enum vect_def_type dt;
3962 gimple new_phi = NULL;
3963 tree scalar_type;
3964 bool is_simple_use;
3965 gimple orig_stmt;
3966 stmt_vec_info orig_stmt_info;
3967 tree expr = NULL_TREE;
3968 int i;
3969 int ncopies;
3970 int epilog_copies;
3971 stmt_vec_info prev_stmt_info, prev_phi_info;
3972 bool single_defuse_cycle = false;
3973 tree reduc_def = NULL_TREE;
3974 gimple new_stmt = NULL;
3975 int j;
3976 tree ops[3];
3977 bool nested_cycle = false, found_nested_cycle_def = false;
3978 gimple reduc_def_stmt = NULL;
3979 /* The default is that the reduction variable is the last in statement. */
3980 int reduc_index = 2;
3981 bool double_reduc = false, dummy;
3982 basic_block def_bb;
3983 struct loop * def_stmt_loop, *outer_loop = NULL;
3984 tree def_arg;
3985 gimple def_arg_stmt;
3986 VEC (tree, heap) *vec_oprnds0 = NULL, *vec_oprnds1 = NULL, *vect_defs = NULL;
3987 VEC (gimple, heap) *phis = NULL;
3988 int vec_num;
3989 tree def0, def1, tem;
3991 if (nested_in_vect_loop_p (loop, stmt))
3993 outer_loop = loop;
3994 loop = loop->inner;
3995 nested_cycle = true;
3998 /* 1. Is vectorizable reduction? */
3999 /* Not supportable if the reduction variable is used in the loop. */
4000 if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer)
4001 return false;
4003 /* Reductions that are not used even in an enclosing outer-loop,
4004 are expected to be "live" (used out of the loop). */
4005 if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope
4006 && !STMT_VINFO_LIVE_P (stmt_info))
4007 return false;
4009 /* Make sure it was already recognized as a reduction computation. */
4010 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def
4011 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_nested_cycle)
4012 return false;
4014 /* 2. Has this been recognized as a reduction pattern?
4016 Check if STMT represents a pattern that has been recognized
4017 in earlier analysis stages. For stmts that represent a pattern,
4018 the STMT_VINFO_RELATED_STMT field records the last stmt in
4019 the original sequence that constitutes the pattern. */
4021 orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
4022 if (orig_stmt)
4024 orig_stmt_info = vinfo_for_stmt (orig_stmt);
4025 gcc_assert (STMT_VINFO_RELATED_STMT (orig_stmt_info) == stmt);
4026 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
4027 gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
4030 /* 3. Check the operands of the operation. The first operands are defined
4031 inside the loop body. The last operand is the reduction variable,
4032 which is defined by the loop-header-phi. */
4034 gcc_assert (is_gimple_assign (stmt));
4036 /* Flatten RHS. */
4037 switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
4039 case GIMPLE_SINGLE_RHS:
4040 op_type = TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt));
4041 if (op_type == ternary_op)
4043 tree rhs = gimple_assign_rhs1 (stmt);
4044 ops[0] = TREE_OPERAND (rhs, 0);
4045 ops[1] = TREE_OPERAND (rhs, 1);
4046 ops[2] = TREE_OPERAND (rhs, 2);
4047 code = TREE_CODE (rhs);
4049 else
4050 return false;
4051 break;
4053 case GIMPLE_BINARY_RHS:
4054 code = gimple_assign_rhs_code (stmt);
4055 op_type = TREE_CODE_LENGTH (code);
4056 gcc_assert (op_type == binary_op);
4057 ops[0] = gimple_assign_rhs1 (stmt);
4058 ops[1] = gimple_assign_rhs2 (stmt);
4059 break;
4061 case GIMPLE_TERNARY_RHS:
4062 code = gimple_assign_rhs_code (stmt);
4063 op_type = TREE_CODE_LENGTH (code);
4064 gcc_assert (op_type == ternary_op);
4065 ops[0] = gimple_assign_rhs1 (stmt);
4066 ops[1] = gimple_assign_rhs2 (stmt);
4067 ops[2] = gimple_assign_rhs3 (stmt);
4068 break;
4070 case GIMPLE_UNARY_RHS:
4071 return false;
4073 default:
4074 gcc_unreachable ();
4077 scalar_dest = gimple_assign_lhs (stmt);
4078 scalar_type = TREE_TYPE (scalar_dest);
4079 if (!POINTER_TYPE_P (scalar_type) && !INTEGRAL_TYPE_P (scalar_type)
4080 && !SCALAR_FLOAT_TYPE_P (scalar_type))
4081 return false;
4083 /* All uses but the last are expected to be defined in the loop.
4084 The last use is the reduction variable. In case of nested cycle this
4085 assumption is not true: we use reduc_index to record the index of the
4086 reduction variable. */
4087 for (i = 0; i < op_type-1; i++)
4089 /* The condition of COND_EXPR is checked in vectorizable_condition(). */
4090 if (i == 0 && code == COND_EXPR)
4091 continue;
4093 is_simple_use = vect_is_simple_use_1 (ops[i], loop_vinfo, NULL,
4094 &def_stmt, &def, &dt, &tem);
4095 if (!vectype_in)
4096 vectype_in = tem;
4097 gcc_assert (is_simple_use);
4098 if (dt != vect_internal_def
4099 && dt != vect_external_def
4100 && dt != vect_constant_def
4101 && dt != vect_induction_def
4102 && !(dt == vect_nested_cycle && nested_cycle))
4103 return false;
4105 if (dt == vect_nested_cycle)
4107 found_nested_cycle_def = true;
4108 reduc_def_stmt = def_stmt;
4109 reduc_index = i;
4113 is_simple_use = vect_is_simple_use_1 (ops[i], loop_vinfo, NULL, &def_stmt,
4114 &def, &dt, &tem);
4115 if (!vectype_in)
4116 vectype_in = tem;
4117 gcc_assert (is_simple_use);
4118 gcc_assert (dt == vect_reduction_def
4119 || dt == vect_nested_cycle
4120 || ((dt == vect_internal_def || dt == vect_external_def
4121 || dt == vect_constant_def || dt == vect_induction_def)
4122 && nested_cycle && found_nested_cycle_def));
4123 if (!found_nested_cycle_def)
4124 reduc_def_stmt = def_stmt;
4126 gcc_assert (gimple_code (reduc_def_stmt) == GIMPLE_PHI);
4127 if (orig_stmt)
4128 gcc_assert (orig_stmt == vect_is_simple_reduction (loop_vinfo,
4129 reduc_def_stmt,
4130 !nested_cycle,
4131 &dummy));
4132 else
4133 gcc_assert (stmt == vect_is_simple_reduction (loop_vinfo, reduc_def_stmt,
4134 !nested_cycle, &dummy));
4136 if (STMT_VINFO_LIVE_P (vinfo_for_stmt (reduc_def_stmt)))
4137 return false;
4139 if (slp_node)
4140 ncopies = 1;
4141 else
4142 ncopies = (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
4143 / TYPE_VECTOR_SUBPARTS (vectype_in));
4145 gcc_assert (ncopies >= 1);
4147 vec_mode = TYPE_MODE (vectype_in);
4149 if (code == COND_EXPR)
4151 if (!vectorizable_condition (stmt, gsi, NULL, ops[reduc_index], 0))
4153 if (vect_print_dump_info (REPORT_DETAILS))
4154 fprintf (vect_dump, "unsupported condition in reduction");
4156 return false;
4159 else
4161 /* 4. Supportable by target? */
4163 /* 4.1. check support for the operation in the loop */
4164 optab = optab_for_tree_code (code, vectype_in, optab_default);
4165 if (!optab)
4167 if (vect_print_dump_info (REPORT_DETAILS))
4168 fprintf (vect_dump, "no optab.");
4170 return false;
4173 if (optab_handler (optab, vec_mode) == CODE_FOR_nothing)
4175 if (vect_print_dump_info (REPORT_DETAILS))
4176 fprintf (vect_dump, "op not supported by target.");
4178 if (GET_MODE_SIZE (vec_mode) != UNITS_PER_WORD
4179 || LOOP_VINFO_VECT_FACTOR (loop_vinfo)
4180 < vect_min_worthwhile_factor (code))
4181 return false;
4183 if (vect_print_dump_info (REPORT_DETAILS))
4184 fprintf (vect_dump, "proceeding using word mode.");
4187 /* Worthwhile without SIMD support? */
4188 if (!VECTOR_MODE_P (TYPE_MODE (vectype_in))
4189 && LOOP_VINFO_VECT_FACTOR (loop_vinfo)
4190 < vect_min_worthwhile_factor (code))
4192 if (vect_print_dump_info (REPORT_DETAILS))
4193 fprintf (vect_dump, "not worthwhile without SIMD support.");
4195 return false;
4199 /* 4.2. Check support for the epilog operation.
4201 If STMT represents a reduction pattern, then the type of the
4202 reduction variable may be different than the type of the rest
4203 of the arguments. For example, consider the case of accumulation
4204 of shorts into an int accumulator; The original code:
4205 S1: int_a = (int) short_a;
4206 orig_stmt-> S2: int_acc = plus <int_a ,int_acc>;
4208 was replaced with:
4209 STMT: int_acc = widen_sum <short_a, int_acc>
4211 This means that:
4212 1. The tree-code that is used to create the vector operation in the
4213 epilog code (that reduces the partial results) is not the
4214 tree-code of STMT, but is rather the tree-code of the original
4215 stmt from the pattern that STMT is replacing. I.e, in the example
4216 above we want to use 'widen_sum' in the loop, but 'plus' in the
4217 epilog.
4218 2. The type (mode) we use to check available target support
4219 for the vector operation to be created in the *epilog*, is
4220 determined by the type of the reduction variable (in the example
4221 above we'd check this: optab_handler (plus_optab, vect_int_mode])).
4222 However the type (mode) we use to check available target support
4223 for the vector operation to be created *inside the loop*, is
4224 determined by the type of the other arguments to STMT (in the
4225 example we'd check this: optab_handler (widen_sum_optab,
4226 vect_short_mode)).
4228 This is contrary to "regular" reductions, in which the types of all
4229 the arguments are the same as the type of the reduction variable.
4230 For "regular" reductions we can therefore use the same vector type
4231 (and also the same tree-code) when generating the epilog code and
4232 when generating the code inside the loop. */
4234 if (orig_stmt)
4236 /* This is a reduction pattern: get the vectype from the type of the
4237 reduction variable, and get the tree-code from orig_stmt. */
4238 orig_code = gimple_assign_rhs_code (orig_stmt);
4239 gcc_assert (vectype_out);
4240 vec_mode = TYPE_MODE (vectype_out);
4242 else
4244 /* Regular reduction: use the same vectype and tree-code as used for
4245 the vector code inside the loop can be used for the epilog code. */
4246 orig_code = code;
4249 if (nested_cycle)
4251 def_bb = gimple_bb (reduc_def_stmt);
4252 def_stmt_loop = def_bb->loop_father;
4253 def_arg = PHI_ARG_DEF_FROM_EDGE (reduc_def_stmt,
4254 loop_preheader_edge (def_stmt_loop));
4255 if (TREE_CODE (def_arg) == SSA_NAME
4256 && (def_arg_stmt = SSA_NAME_DEF_STMT (def_arg))
4257 && gimple_code (def_arg_stmt) == GIMPLE_PHI
4258 && flow_bb_inside_loop_p (outer_loop, gimple_bb (def_arg_stmt))
4259 && vinfo_for_stmt (def_arg_stmt)
4260 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_arg_stmt))
4261 == vect_double_reduction_def)
4262 double_reduc = true;
4265 epilog_reduc_code = ERROR_MARK;
4266 if (reduction_code_for_scalar_code (orig_code, &epilog_reduc_code))
4268 reduc_optab = optab_for_tree_code (epilog_reduc_code, vectype_out,
4269 optab_default);
4270 if (!reduc_optab)
4272 if (vect_print_dump_info (REPORT_DETAILS))
4273 fprintf (vect_dump, "no optab for reduction.");
4275 epilog_reduc_code = ERROR_MARK;
4278 if (reduc_optab
4279 && optab_handler (reduc_optab, vec_mode) == CODE_FOR_nothing)
4281 if (vect_print_dump_info (REPORT_DETAILS))
4282 fprintf (vect_dump, "reduc op not supported by target.");
4284 epilog_reduc_code = ERROR_MARK;
4287 else
4289 if (!nested_cycle || double_reduc)
4291 if (vect_print_dump_info (REPORT_DETAILS))
4292 fprintf (vect_dump, "no reduc code for scalar code.");
4294 return false;
4298 if (double_reduc && ncopies > 1)
4300 if (vect_print_dump_info (REPORT_DETAILS))
4301 fprintf (vect_dump, "multiple types in double reduction");
4303 return false;
4306 if (!vec_stmt) /* transformation not required. */
4308 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
4309 if (!vect_model_reduction_cost (stmt_info, epilog_reduc_code, ncopies))
4310 return false;
4311 return true;
4314 /** Transform. **/
4316 if (vect_print_dump_info (REPORT_DETAILS))
4317 fprintf (vect_dump, "transform reduction.");
4319 /* FORNOW: Multiple types are not supported for condition. */
4320 if (code == COND_EXPR)
4321 gcc_assert (ncopies == 1);
4323 /* Create the destination vector */
4324 vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
4326 /* In case the vectorization factor (VF) is bigger than the number
4327 of elements that we can fit in a vectype (nunits), we have to generate
4328 more than one vector stmt - i.e - we need to "unroll" the
4329 vector stmt by a factor VF/nunits. For more details see documentation
4330 in vectorizable_operation. */
4332 /* If the reduction is used in an outer loop we need to generate
4333 VF intermediate results, like so (e.g. for ncopies=2):
4334 r0 = phi (init, r0)
4335 r1 = phi (init, r1)
4336 r0 = x0 + r0;
4337 r1 = x1 + r1;
4338 (i.e. we generate VF results in 2 registers).
4339 In this case we have a separate def-use cycle for each copy, and therefore
4340 for each copy we get the vector def for the reduction variable from the
4341 respective phi node created for this copy.
4343 Otherwise (the reduction is unused in the loop nest), we can combine
4344 together intermediate results, like so (e.g. for ncopies=2):
4345 r = phi (init, r)
4346 r = x0 + r;
4347 r = x1 + r;
4348 (i.e. we generate VF/2 results in a single register).
4349 In this case for each copy we get the vector def for the reduction variable
4350 from the vectorized reduction operation generated in the previous iteration.
4353 if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope)
4355 single_defuse_cycle = true;
4356 epilog_copies = 1;
4358 else
4359 epilog_copies = ncopies;
4361 prev_stmt_info = NULL;
4362 prev_phi_info = NULL;
4363 if (slp_node)
4365 vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
4366 gcc_assert (TYPE_VECTOR_SUBPARTS (vectype_out)
4367 == TYPE_VECTOR_SUBPARTS (vectype_in));
4369 else
4371 vec_num = 1;
4372 vec_oprnds0 = VEC_alloc (tree, heap, 1);
4373 if (op_type == ternary_op)
4374 vec_oprnds1 = VEC_alloc (tree, heap, 1);
4377 phis = VEC_alloc (gimple, heap, vec_num);
4378 vect_defs = VEC_alloc (tree, heap, vec_num);
4379 if (!slp_node)
4380 VEC_quick_push (tree, vect_defs, NULL_TREE);
4382 for (j = 0; j < ncopies; j++)
4384 if (j == 0 || !single_defuse_cycle)
4386 for (i = 0; i < vec_num; i++)
4388 /* Create the reduction-phi that defines the reduction
4389 operand. */
4390 new_phi = create_phi_node (vec_dest, loop->header);
4391 set_vinfo_for_stmt (new_phi,
4392 new_stmt_vec_info (new_phi, loop_vinfo,
4393 NULL));
4394 if (j == 0 || slp_node)
4395 VEC_quick_push (gimple, phis, new_phi);
4399 if (code == COND_EXPR)
4401 gcc_assert (!slp_node);
4402 vectorizable_condition (stmt, gsi, vec_stmt,
4403 PHI_RESULT (VEC_index (gimple, phis, 0)),
4404 reduc_index);
4405 /* Multiple types are not supported for condition. */
4406 break;
4409 /* Handle uses. */
4410 if (j == 0)
4412 tree op0, op1 = NULL_TREE;
4414 op0 = ops[!reduc_index];
4415 if (op_type == ternary_op)
4417 if (reduc_index == 0)
4418 op1 = ops[2];
4419 else
4420 op1 = ops[1];
4423 if (slp_node)
4424 vect_get_slp_defs (op0, op1, slp_node, &vec_oprnds0, &vec_oprnds1,
4425 -1);
4426 else
4428 loop_vec_def0 = vect_get_vec_def_for_operand (ops[!reduc_index],
4429 stmt, NULL);
4430 VEC_quick_push (tree, vec_oprnds0, loop_vec_def0);
4431 if (op_type == ternary_op)
4433 loop_vec_def1 = vect_get_vec_def_for_operand (op1, stmt,
4434 NULL);
4435 VEC_quick_push (tree, vec_oprnds1, loop_vec_def1);
4439 else
4441 if (!slp_node)
4443 enum vect_def_type dt = vect_unknown_def_type; /* Dummy */
4444 loop_vec_def0 = vect_get_vec_def_for_stmt_copy (dt, loop_vec_def0);
4445 VEC_replace (tree, vec_oprnds0, 0, loop_vec_def0);
4446 if (op_type == ternary_op)
4448 loop_vec_def1 = vect_get_vec_def_for_stmt_copy (dt,
4449 loop_vec_def1);
4450 VEC_replace (tree, vec_oprnds1, 0, loop_vec_def1);
4454 if (single_defuse_cycle)
4455 reduc_def = gimple_assign_lhs (new_stmt);
4457 STMT_VINFO_RELATED_STMT (prev_phi_info) = new_phi;
4460 FOR_EACH_VEC_ELT (tree, vec_oprnds0, i, def0)
4462 if (slp_node)
4463 reduc_def = PHI_RESULT (VEC_index (gimple, phis, i));
4464 else
4466 if (!single_defuse_cycle || j == 0)
4467 reduc_def = PHI_RESULT (new_phi);
4470 def1 = ((op_type == ternary_op)
4471 ? VEC_index (tree, vec_oprnds1, i) : NULL);
4472 if (op_type == binary_op)
4474 if (reduc_index == 0)
4475 expr = build2 (code, vectype_out, reduc_def, def0);
4476 else
4477 expr = build2 (code, vectype_out, def0, reduc_def);
4479 else
4481 if (reduc_index == 0)
4482 expr = build3 (code, vectype_out, reduc_def, def0, def1);
4483 else
4485 if (reduc_index == 1)
4486 expr = build3 (code, vectype_out, def0, reduc_def, def1);
4487 else
4488 expr = build3 (code, vectype_out, def0, def1, reduc_def);
4492 new_stmt = gimple_build_assign (vec_dest, expr);
4493 new_temp = make_ssa_name (vec_dest, new_stmt);
4494 gimple_assign_set_lhs (new_stmt, new_temp);
4495 vect_finish_stmt_generation (stmt, new_stmt, gsi);
4496 if (slp_node)
4498 VEC_quick_push (gimple, SLP_TREE_VEC_STMTS (slp_node), new_stmt);
4499 VEC_quick_push (tree, vect_defs, new_temp);
4501 else
4502 VEC_replace (tree, vect_defs, 0, new_temp);
4505 if (slp_node)
4506 continue;
4508 if (j == 0)
4509 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
4510 else
4511 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
4513 prev_stmt_info = vinfo_for_stmt (new_stmt);
4514 prev_phi_info = vinfo_for_stmt (new_phi);
4517 /* Finalize the reduction-phi (set its arguments) and create the
4518 epilog reduction code. */
4519 if ((!single_defuse_cycle || code == COND_EXPR) && !slp_node)
4521 new_temp = gimple_assign_lhs (*vec_stmt);
4522 VEC_replace (tree, vect_defs, 0, new_temp);
4525 vect_create_epilog_for_reduction (vect_defs, stmt, epilog_copies,
4526 epilog_reduc_code, phis, reduc_index,
4527 double_reduc, slp_node);
4529 VEC_free (gimple, heap, phis);
4530 VEC_free (tree, heap, vec_oprnds0);
4531 if (vec_oprnds1)
4532 VEC_free (tree, heap, vec_oprnds1);
4534 return true;
4537 /* Function vect_min_worthwhile_factor.
4539 For a loop where we could vectorize the operation indicated by CODE,
4540 return the minimum vectorization factor that makes it worthwhile
4541 to use generic vectors. */
4543 vect_min_worthwhile_factor (enum tree_code code)
4545 switch (code)
4547 case PLUS_EXPR:
4548 case MINUS_EXPR:
4549 case NEGATE_EXPR:
4550 return 4;
4552 case BIT_AND_EXPR:
4553 case BIT_IOR_EXPR:
4554 case BIT_XOR_EXPR:
4555 case BIT_NOT_EXPR:
4556 return 2;
4558 default:
4559 return INT_MAX;
4564 /* Function vectorizable_induction
4566 Check if PHI performs an induction computation that can be vectorized.
4567 If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
4568 phi to replace it, put it in VEC_STMT, and add it to the same basic block.
4569 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
4571 bool
4572 vectorizable_induction (gimple phi, gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
4573 gimple *vec_stmt)
4575 stmt_vec_info stmt_info = vinfo_for_stmt (phi);
4576 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4577 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4578 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4579 int nunits = TYPE_VECTOR_SUBPARTS (vectype);
4580 int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
4581 tree vec_def;
4583 gcc_assert (ncopies >= 1);
4584 /* FORNOW. This restriction should be relaxed. */
4585 if (nested_in_vect_loop_p (loop, phi) && ncopies > 1)
4587 if (vect_print_dump_info (REPORT_DETAILS))
4588 fprintf (vect_dump, "multiple types in nested loop.");
4589 return false;
4592 if (!STMT_VINFO_RELEVANT_P (stmt_info))
4593 return false;
4595 /* FORNOW: SLP not supported. */
4596 if (STMT_SLP_TYPE (stmt_info))
4597 return false;
4599 gcc_assert (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def);
4601 if (gimple_code (phi) != GIMPLE_PHI)
4602 return false;
4604 if (!vec_stmt) /* transformation not required. */
4606 STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
4607 if (vect_print_dump_info (REPORT_DETAILS))
4608 fprintf (vect_dump, "=== vectorizable_induction ===");
4609 vect_model_induction_cost (stmt_info, ncopies);
4610 return true;
4613 /** Transform. **/
4615 if (vect_print_dump_info (REPORT_DETAILS))
4616 fprintf (vect_dump, "transform induction phi.");
4618 vec_def = get_initial_def_for_induction (phi);
4619 *vec_stmt = SSA_NAME_DEF_STMT (vec_def);
4620 return true;
4623 /* Function vectorizable_live_operation.
4625 STMT computes a value that is used outside the loop. Check if
4626 it can be supported. */
4628 bool
4629 vectorizable_live_operation (gimple stmt,
4630 gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
4631 gimple *vec_stmt ATTRIBUTE_UNUSED)
4633 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4634 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4635 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4636 int i;
4637 int op_type;
4638 tree op;
4639 tree def;
4640 gimple def_stmt;
4641 enum vect_def_type dt;
4642 enum tree_code code;
4643 enum gimple_rhs_class rhs_class;
4645 gcc_assert (STMT_VINFO_LIVE_P (stmt_info));
4647 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def)
4648 return false;
4650 if (!is_gimple_assign (stmt))
4651 return false;
4653 if (TREE_CODE (gimple_assign_lhs (stmt)) != SSA_NAME)
4654 return false;
4656 /* FORNOW. CHECKME. */
4657 if (nested_in_vect_loop_p (loop, stmt))
4658 return false;
4660 code = gimple_assign_rhs_code (stmt);
4661 op_type = TREE_CODE_LENGTH (code);
4662 rhs_class = get_gimple_rhs_class (code);
4663 gcc_assert (rhs_class != GIMPLE_UNARY_RHS || op_type == unary_op);
4664 gcc_assert (rhs_class != GIMPLE_BINARY_RHS || op_type == binary_op);
4666 /* FORNOW: support only if all uses are invariant. This means
4667 that the scalar operations can remain in place, unvectorized.
4668 The original last scalar value that they compute will be used. */
4670 for (i = 0; i < op_type; i++)
4672 if (rhs_class == GIMPLE_SINGLE_RHS)
4673 op = TREE_OPERAND (gimple_op (stmt, 1), i);
4674 else
4675 op = gimple_op (stmt, i + 1);
4676 if (op
4677 && !vect_is_simple_use (op, loop_vinfo, NULL, &def_stmt, &def, &dt))
4679 if (vect_print_dump_info (REPORT_DETAILS))
4680 fprintf (vect_dump, "use not simple.");
4681 return false;
4684 if (dt != vect_external_def && dt != vect_constant_def)
4685 return false;
4688 /* No transformation is required for the cases we currently support. */
4689 return true;
4692 /* Kill any debug uses outside LOOP of SSA names defined in STMT. */
4694 static void
4695 vect_loop_kill_debug_uses (struct loop *loop, gimple stmt)
4697 ssa_op_iter op_iter;
4698 imm_use_iterator imm_iter;
4699 def_operand_p def_p;
4700 gimple ustmt;
4702 FOR_EACH_PHI_OR_STMT_DEF (def_p, stmt, op_iter, SSA_OP_DEF)
4704 FOR_EACH_IMM_USE_STMT (ustmt, imm_iter, DEF_FROM_PTR (def_p))
4706 basic_block bb;
4708 if (!is_gimple_debug (ustmt))
4709 continue;
4711 bb = gimple_bb (ustmt);
4713 if (!flow_bb_inside_loop_p (loop, bb))
4715 if (gimple_debug_bind_p (ustmt))
4717 if (vect_print_dump_info (REPORT_DETAILS))
4718 fprintf (vect_dump, "killing debug use");
4720 gimple_debug_bind_reset_value (ustmt);
4721 update_stmt (ustmt);
4723 else
4724 gcc_unreachable ();
4730 /* Function vect_transform_loop.
4732 The analysis phase has determined that the loop is vectorizable.
4733 Vectorize the loop - created vectorized stmts to replace the scalar
4734 stmts in the loop, and update the loop exit condition. */
4736 void
4737 vect_transform_loop (loop_vec_info loop_vinfo)
4739 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4740 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
4741 int nbbs = loop->num_nodes;
4742 gimple_stmt_iterator si;
4743 int i;
4744 tree ratio = NULL;
4745 int vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
4746 bool strided_store;
4747 bool slp_scheduled = false;
4748 unsigned int nunits;
4749 tree cond_expr = NULL_TREE;
4750 gimple_seq cond_expr_stmt_list = NULL;
4751 bool do_peeling_for_loop_bound;
4753 if (vect_print_dump_info (REPORT_DETAILS))
4754 fprintf (vect_dump, "=== vec_transform_loop ===");
4756 /* Peel the loop if there are data refs with unknown alignment.
4757 Only one data ref with unknown store is allowed. */
4759 if (LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo))
4760 vect_do_peeling_for_alignment (loop_vinfo);
4762 do_peeling_for_loop_bound
4763 = (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
4764 || (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
4765 && LOOP_VINFO_INT_NITERS (loop_vinfo) % vectorization_factor != 0));
4767 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
4768 || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
4769 vect_loop_versioning (loop_vinfo,
4770 !do_peeling_for_loop_bound,
4771 &cond_expr, &cond_expr_stmt_list);
4773 /* If the loop has a symbolic number of iterations 'n' (i.e. it's not a
4774 compile time constant), or it is a constant that doesn't divide by the
4775 vectorization factor, then an epilog loop needs to be created.
4776 We therefore duplicate the loop: the original loop will be vectorized,
4777 and will compute the first (n/VF) iterations. The second copy of the loop
4778 will remain scalar and will compute the remaining (n%VF) iterations.
4779 (VF is the vectorization factor). */
4781 if (do_peeling_for_loop_bound)
4782 vect_do_peeling_for_loop_bound (loop_vinfo, &ratio,
4783 cond_expr, cond_expr_stmt_list);
4784 else
4785 ratio = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
4786 LOOP_VINFO_INT_NITERS (loop_vinfo) / vectorization_factor);
4788 /* 1) Make sure the loop header has exactly two entries
4789 2) Make sure we have a preheader basic block. */
4791 gcc_assert (EDGE_COUNT (loop->header->preds) == 2);
4793 split_edge (loop_preheader_edge (loop));
4795 /* FORNOW: the vectorizer supports only loops which body consist
4796 of one basic block (header + empty latch). When the vectorizer will
4797 support more involved loop forms, the order by which the BBs are
4798 traversed need to be reconsidered. */
4800 for (i = 0; i < nbbs; i++)
4802 basic_block bb = bbs[i];
4803 stmt_vec_info stmt_info;
4804 gimple phi;
4806 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
4808 phi = gsi_stmt (si);
4809 if (vect_print_dump_info (REPORT_DETAILS))
4811 fprintf (vect_dump, "------>vectorizing phi: ");
4812 print_gimple_stmt (vect_dump, phi, 0, TDF_SLIM);
4814 stmt_info = vinfo_for_stmt (phi);
4815 if (!stmt_info)
4816 continue;
4818 if (MAY_HAVE_DEBUG_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
4819 vect_loop_kill_debug_uses (loop, phi);
4821 if (!STMT_VINFO_RELEVANT_P (stmt_info)
4822 && !STMT_VINFO_LIVE_P (stmt_info))
4823 continue;
4825 if ((TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info))
4826 != (unsigned HOST_WIDE_INT) vectorization_factor)
4827 && vect_print_dump_info (REPORT_DETAILS))
4828 fprintf (vect_dump, "multiple-types.");
4830 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def)
4832 if (vect_print_dump_info (REPORT_DETAILS))
4833 fprintf (vect_dump, "transform phi.");
4834 vect_transform_stmt (phi, NULL, NULL, NULL, NULL);
4838 for (si = gsi_start_bb (bb); !gsi_end_p (si);)
4840 gimple stmt = gsi_stmt (si);
4841 bool is_store;
4843 if (vect_print_dump_info (REPORT_DETAILS))
4845 fprintf (vect_dump, "------>vectorizing statement: ");
4846 print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
4849 stmt_info = vinfo_for_stmt (stmt);
4851 /* vector stmts created in the outer-loop during vectorization of
4852 stmts in an inner-loop may not have a stmt_info, and do not
4853 need to be vectorized. */
4854 if (!stmt_info)
4856 gsi_next (&si);
4857 continue;
4860 if (MAY_HAVE_DEBUG_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
4861 vect_loop_kill_debug_uses (loop, stmt);
4863 if (!STMT_VINFO_RELEVANT_P (stmt_info)
4864 && !STMT_VINFO_LIVE_P (stmt_info))
4866 gsi_next (&si);
4867 continue;
4870 gcc_assert (STMT_VINFO_VECTYPE (stmt_info));
4871 nunits =
4872 (unsigned int) TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
4873 if (!STMT_SLP_TYPE (stmt_info)
4874 && nunits != (unsigned int) vectorization_factor
4875 && vect_print_dump_info (REPORT_DETAILS))
4876 /* For SLP VF is set according to unrolling factor, and not to
4877 vector size, hence for SLP this print is not valid. */
4878 fprintf (vect_dump, "multiple-types.");
4880 /* SLP. Schedule all the SLP instances when the first SLP stmt is
4881 reached. */
4882 if (STMT_SLP_TYPE (stmt_info))
4884 if (!slp_scheduled)
4886 slp_scheduled = true;
4888 if (vect_print_dump_info (REPORT_DETAILS))
4889 fprintf (vect_dump, "=== scheduling SLP instances ===");
4891 vect_schedule_slp (loop_vinfo, NULL);
4894 /* Hybrid SLP stmts must be vectorized in addition to SLP. */
4895 if (!vinfo_for_stmt (stmt) || PURE_SLP_STMT (stmt_info))
4897 gsi_next (&si);
4898 continue;
4902 /* -------- vectorize statement ------------ */
4903 if (vect_print_dump_info (REPORT_DETAILS))
4904 fprintf (vect_dump, "transform statement.");
4906 strided_store = false;
4907 is_store = vect_transform_stmt (stmt, &si, &strided_store, NULL, NULL);
4908 if (is_store)
4910 if (STMT_VINFO_STRIDED_ACCESS (stmt_info))
4912 /* Interleaving. If IS_STORE is TRUE, the vectorization of the
4913 interleaving chain was completed - free all the stores in
4914 the chain. */
4915 vect_remove_stores (DR_GROUP_FIRST_DR (stmt_info));
4916 gsi_remove (&si, true);
4917 continue;
4919 else
4921 /* Free the attached stmt_vec_info and remove the stmt. */
4922 free_stmt_vec_info (stmt);
4923 gsi_remove (&si, true);
4924 continue;
4927 gsi_next (&si);
4928 } /* stmts in BB */
4929 } /* BBs in loop */
4931 slpeel_make_loop_iterate_ntimes (loop, ratio);
4933 /* The memory tags and pointers in vectorized statements need to
4934 have their SSA forms updated. FIXME, why can't this be delayed
4935 until all the loops have been transformed? */
4936 update_ssa (TODO_update_ssa);
4938 if (vect_print_dump_info (REPORT_VECTORIZED_LOCATIONS))
4939 fprintf (vect_dump, "LOOP VECTORIZED.");
4940 if (loop->inner && vect_print_dump_info (REPORT_VECTORIZED_LOCATIONS))
4941 fprintf (vect_dump, "OUTER LOOP VECTORIZED.");