/* Loop Vectorization
   Copyright (C) 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010
   Free Software Foundation, Inc.
   Contributed by Dorit Naishlos <dorit@il.ibm.com> and
   Ira Rosen <irar@il.ibm.com>

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free
Software Foundation; either version 3, or (at your option) any later
version.

GCC is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received a copy of the GNU General Public License
along with GCC; see the file COPYING3.  If not see
<http://www.gnu.org/licenses/>.  */
23 #include "config.h"
24 #include "system.h"
25 #include "coretypes.h"
26 #include "tm.h"
27 #include "ggc.h"
28 #include "tree.h"
29 #include "basic-block.h"
30 #include "tree-pretty-print.h"
31 #include "gimple-pretty-print.h"
32 #include "tree-flow.h"
33 #include "tree-dump.h"
34 #include "cfgloop.h"
35 #include "cfglayout.h"
36 #include "expr.h"
37 #include "recog.h"
38 #include "optabs.h"
39 #include "params.h"
40 #include "diagnostic-core.h"
41 #include "tree-chrec.h"
42 #include "tree-scalar-evolution.h"
43 #include "tree-vectorizer.h"
44 #include "target.h"
/* Loop Vectorization Pass.

   This pass tries to vectorize loops.

   For example, the vectorizer transforms the following simple loop:

        short a[N]; short b[N]; short c[N]; int i;

        for (i=0; i<N; i++){
          a[i] = b[i] + c[i];
        }

   as if it were manually vectorized by rewriting the source code into:

        typedef int __attribute__((mode(V8HI))) v8hi;
        short a[N]; short b[N]; short c[N]; int i;
        v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
        v8hi va, vb, vc;

        for (i=0; i<N/8; i++){
          vb = pb[i];
          vc = pc[i];
          va = vb + vc;
          pa[i] = va;
        }

   The main entry to this pass is vectorize_loops(), in which
   the vectorizer applies a set of analyses on a given set of loops,
   followed by the actual vectorization transformation for the loops that
   had successfully passed the analysis phase.
   Throughout this pass we make a distinction between two types of
   data: scalars (which are represented by SSA_NAMES), and memory references
   ("data-refs").  These two types of data require different handling both
   during analysis and transformation.  The types of data-refs that the
   vectorizer currently supports are ARRAY_REFS whose base is an array DECL
   (not a pointer), and INDIRECT_REFS through pointers; both array and pointer
   accesses are required to have a simple (consecutive) access pattern.

   Analysis phase:
   ===============
   The driver for the analysis phase is vect_analyze_loop().
   It applies a set of analyses, some of which rely on the scalar evolution
   analyzer (scev) developed by Sebastian Pop.

   During the analysis phase the vectorizer records some information
   per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
   loop, as well as general information about the loop as a whole, which is
   recorded in a "loop_vec_info" struct attached to each loop.

   Transformation phase:
   =====================
   The loop transformation phase scans all the stmts in the loop, and
   creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
   the loop that needs to be vectorized.  It inserts the vector code sequence
   just before the scalar stmt S, and records a pointer to the vector code
   in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
   attached to S).  This pointer will be used for the vectorization of
   following stmts which use the def of stmt S.  Stmt S is removed if it
   writes to memory; otherwise, we rely on dead code elimination for
   removing it.

   For example, say stmt S1 was vectorized into stmt VS1:

   VS1: vb = px[i];
   S1:  b = x[i];       STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
   S2:  a = b;

   To vectorize stmt S2, the vectorizer first finds the stmt that defines
   the operand 'b' (S1), and gets the relevant vector def 'vb' from the
   vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)).  The
   resulting sequence would be:

   VS1: vb = px[i];
   S1:  b = x[i];       STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
   VS2: va = vb;
   S2:  a = b;          STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2

   Operands that are not SSA_NAMEs are data-refs that appear in
   load/store operations (like 'x[i]' in S1), and are handled differently.

   Target modeling:
   =================
   Currently the only target specific information that is used is the
   size of the vector (in bytes) - "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".
   Targets that can support different sizes of vectors will, for now, need
   to specify one value for "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".  More
   flexibility will be added in the future.

   Since we only vectorize operations whose vector form can be
   expressed using existing tree codes, to verify that an operation is
   supported, the vectorizer checks the relevant optab at the relevant
   machine_mode (e.g, optab_handler (add_optab, V8HImode)).  If
   the value found is CODE_FOR_nothing, then there's no target support, and
   we can't vectorize the stmt.

   For additional information on this project see:
   http://gcc.gnu.org/projects/tree-ssa/vectorization.html
*/
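/* For illustration, the optab check described above takes roughly the
   following shape (a sketch, not code from this file; the tree code,
   vector type and mode are example values):

     enum machine_mode vec_mode = TYPE_MODE (vectype);    e.g. V8HImode
     optab optab = optab_for_tree_code (PLUS_EXPR, vectype, optab_default);

     if (!optab || optab_handler (optab, vec_mode) == CODE_FOR_nothing)
       -> the operation has no target support; the stmt is not vectorized.

   optab_for_tree_code and optab_handler are the interfaces the vectorizer
   uses for such queries; the surrounding control flow is only a sketch.  */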
/* Function vect_determine_vectorization_factor

   Determine the vectorization factor (VF).  VF is the number of data elements
   that are operated upon in parallel in a single iteration of the vectorized
   loop.  For example, when vectorizing a loop that operates on 4-byte
   elements, on a target with vector size (VS) 16 bytes, the VF is set to 4,
   since 4 elements can fit in a single vector register.

   We currently support vectorization of loops in which all types operated
   upon are of the same size.  Therefore this function currently sets VF
   according to the size of the types operated upon, and fails if there are
   multiple sizes in the loop.

   VF is also the factor by which the loop iterations are strip-mined, e.g.:
   original loop:
        for (i=0; i<N; i++){
          a[i] = b[i] + c[i];
        }

   vectorized loop:
        for (i=0; i<N; i+=VF){
          a[i:VF] = b[i:VF] + c[i:VF];
        }
*/
static bool
vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
{
  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
  int nbbs = loop->num_nodes;
  gimple_stmt_iterator si;
  unsigned int vectorization_factor = 0;
  tree scalar_type;
  gimple phi;
  tree vectype;
  unsigned int nunits;
  stmt_vec_info stmt_info;
  int i;
  HOST_WIDE_INT dummy;

  if (vect_print_dump_info (REPORT_DETAILS))
    fprintf (vect_dump, "=== vect_determine_vectorization_factor ===");

  for (i = 0; i < nbbs; i++)
    {
      basic_block bb = bbs[i];

      for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
        {
          phi = gsi_stmt (si);
          stmt_info = vinfo_for_stmt (phi);
          if (vect_print_dump_info (REPORT_DETAILS))
            {
              fprintf (vect_dump, "==> examining phi: ");
              print_gimple_stmt (vect_dump, phi, 0, TDF_SLIM);
            }

          gcc_assert (stmt_info);

          if (STMT_VINFO_RELEVANT_P (stmt_info))
            {
              gcc_assert (!STMT_VINFO_VECTYPE (stmt_info));
              scalar_type = TREE_TYPE (PHI_RESULT (phi));

              if (vect_print_dump_info (REPORT_DETAILS))
                {
                  fprintf (vect_dump, "get vectype for scalar type: ");
                  print_generic_expr (vect_dump, scalar_type, TDF_SLIM);
                }

              vectype = get_vectype_for_scalar_type (scalar_type);
              if (!vectype)
                {
                  if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
                    {
                      fprintf (vect_dump,
                               "not vectorized: unsupported data-type ");
                      print_generic_expr (vect_dump, scalar_type, TDF_SLIM);
                    }
                  return false;
                }
              STMT_VINFO_VECTYPE (stmt_info) = vectype;

              if (vect_print_dump_info (REPORT_DETAILS))
                {
                  fprintf (vect_dump, "vectype: ");
                  print_generic_expr (vect_dump, vectype, TDF_SLIM);
                }

              nunits = TYPE_VECTOR_SUBPARTS (vectype);
              if (vect_print_dump_info (REPORT_DETAILS))
                fprintf (vect_dump, "nunits = %d", nunits);

              if (!vectorization_factor
                  || (nunits > vectorization_factor))
                vectorization_factor = nunits;
            }
        }

      for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
        {
          tree vf_vectype;
          gimple stmt = gsi_stmt (si);
          stmt_info = vinfo_for_stmt (stmt);

          if (vect_print_dump_info (REPORT_DETAILS))
            {
              fprintf (vect_dump, "==> examining statement: ");
              print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
            }

          gcc_assert (stmt_info);

          /* skip stmts which do not need to be vectorized.  */
          if (!STMT_VINFO_RELEVANT_P (stmt_info)
              && !STMT_VINFO_LIVE_P (stmt_info))
            {
              if (vect_print_dump_info (REPORT_DETAILS))
                fprintf (vect_dump, "skip.");
              continue;
            }

          if (gimple_get_lhs (stmt) == NULL_TREE)
            {
              if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
                {
                  fprintf (vect_dump, "not vectorized: irregular stmt.");
                  print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
                }
              return false;
            }

          if (VECTOR_MODE_P (TYPE_MODE (gimple_expr_type (stmt))))
            {
              if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
                {
                  fprintf (vect_dump, "not vectorized: vector stmt in loop:");
                  print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
                }
              return false;
            }

          if (STMT_VINFO_VECTYPE (stmt_info))
            {
              /* The only case when a vectype had been already set is for stmts
                 that contain a dataref, or for "pattern-stmts" (stmts generated
                 by the vectorizer to represent/replace a certain idiom).  */
              gcc_assert (STMT_VINFO_DATA_REF (stmt_info)
                          || is_pattern_stmt_p (stmt_info));
              vectype = STMT_VINFO_VECTYPE (stmt_info);
            }
          else
            {
              gcc_assert (!STMT_VINFO_DATA_REF (stmt_info)
                          && !is_pattern_stmt_p (stmt_info));

              scalar_type = TREE_TYPE (gimple_get_lhs (stmt));
              if (vect_print_dump_info (REPORT_DETAILS))
                {
                  fprintf (vect_dump, "get vectype for scalar type: ");
                  print_generic_expr (vect_dump, scalar_type, TDF_SLIM);
                }
              vectype = get_vectype_for_scalar_type (scalar_type);
              if (!vectype)
                {
                  if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
                    {
                      fprintf (vect_dump,
                               "not vectorized: unsupported data-type ");
                      print_generic_expr (vect_dump, scalar_type, TDF_SLIM);
                    }
                  return false;
                }

              STMT_VINFO_VECTYPE (stmt_info) = vectype;
            }

          /* The vectorization factor is according to the smallest
             scalar type (or the largest vector size, but we only
             support one vector size per loop).  */
          scalar_type = vect_get_smallest_scalar_type (stmt, &dummy,
                                                       &dummy);
          if (vect_print_dump_info (REPORT_DETAILS))
            {
              fprintf (vect_dump, "get vectype for scalar type: ");
              print_generic_expr (vect_dump, scalar_type, TDF_SLIM);
            }
          vf_vectype = get_vectype_for_scalar_type (scalar_type);
          if (!vf_vectype)
            {
              if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
                {
                  fprintf (vect_dump,
                           "not vectorized: unsupported data-type ");
                  print_generic_expr (vect_dump, scalar_type, TDF_SLIM);
                }
              return false;
            }

          if ((GET_MODE_SIZE (TYPE_MODE (vectype))
               != GET_MODE_SIZE (TYPE_MODE (vf_vectype))))
            {
              if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
                {
                  fprintf (vect_dump,
                           "not vectorized: different sized vector "
                           "types in statement, ");
                  print_generic_expr (vect_dump, vectype, TDF_SLIM);
                  fprintf (vect_dump, " and ");
                  print_generic_expr (vect_dump, vf_vectype, TDF_SLIM);
                }
              return false;
            }

          if (vect_print_dump_info (REPORT_DETAILS))
            {
              fprintf (vect_dump, "vectype: ");
              print_generic_expr (vect_dump, vf_vectype, TDF_SLIM);
            }

          nunits = TYPE_VECTOR_SUBPARTS (vf_vectype);
          if (vect_print_dump_info (REPORT_DETAILS))
            fprintf (vect_dump, "nunits = %d", nunits);

          if (!vectorization_factor
              || (nunits > vectorization_factor))
            vectorization_factor = nunits;
        }
    }

  /* TODO: Analyze cost. Decide if worth while to vectorize.  */
  if (vect_print_dump_info (REPORT_DETAILS))
    fprintf (vect_dump, "vectorization factor = %d", vectorization_factor);
  if (vectorization_factor <= 1)
    {
      if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
        fprintf (vect_dump, "not vectorized: unsupported data-type");
      return false;
    }
  LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;

  return true;
}
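/* A worked example of the computation above (illustrative values): on a
   target with 16-byte vectors, a loop mixing 2-byte and 4-byte elements,

     short s[N]; int w[N];
     for (i = 0; i < N; i++)
       w[i] += (int) s[i];

   yields nunits = 8 for the "short" vectype (V8HImode) and nunits = 4 for
   the "int" vectype (V4SImode).  The maximum, 8, becomes the vectorization
   factor, provided both vector types pass the equal-size check above.  */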
/* Function vect_is_simple_iv_evolution.

   FORNOW: A simple evolution of an induction variable in the loop is
   considered a polynomial evolution with constant step.  */

static bool
vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init,
                             tree * step)
{
  tree init_expr;
  tree step_expr;
  tree evolution_part = evolution_part_in_loop_num (access_fn, loop_nb);

  /* When there is no evolution in this loop, the evolution function
     is not "simple".  */
  if (evolution_part == NULL_TREE)
    return false;

  /* When the evolution is a polynomial of degree >= 2
     the evolution function is not "simple".  */
  if (tree_is_chrec (evolution_part))
    return false;

  step_expr = evolution_part;
  init_expr = unshare_expr (initial_condition_in_loop_num (access_fn, loop_nb));

  if (vect_print_dump_info (REPORT_DETAILS))
    {
      fprintf (vect_dump, "step: ");
      print_generic_expr (vect_dump, step_expr, TDF_SLIM);
      fprintf (vect_dump, ",  init: ");
      print_generic_expr (vect_dump, init_expr, TDF_SLIM);
    }

  *init = init_expr;
  *step = step_expr;

  if (TREE_CODE (step_expr) != INTEGER_CST)
    {
      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "step unknown.");
      return false;
    }

  return true;
}
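/* For example, the access function of "i" in "for (i = 0; i < n; i++)"
   is the chrec {0, +, 1}_x (initial value 0, constant step 1 in loop
   number x), which the checks above accept.  A second-degree evolution
   such as {0, +, {1, +, 1}_x}_x is rejected, because its evolution part
   is itself a chrec rather than a constant step.  */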
/* Function vect_analyze_scalar_cycles_1.

   Examine the cross iteration def-use cycles of scalar variables
   in LOOP.  LOOP_VINFO represents the loop that is now being
   considered for vectorization (can be LOOP, or an outer-loop
   enclosing LOOP).  */

static void
vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, struct loop *loop)
{
  basic_block bb = loop->header;
  tree dumy;
  VEC(gimple,heap) *worklist = VEC_alloc (gimple, heap, 64);
  gimple_stmt_iterator gsi;
  bool double_reduc;

  if (vect_print_dump_info (REPORT_DETAILS))
    fprintf (vect_dump, "=== vect_analyze_scalar_cycles ===");

  /* First - identify all inductions.  Reduction detection assumes that all the
     inductions have been identified, therefore, this order must not be
     changed.  */
  for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
    {
      gimple phi = gsi_stmt (gsi);
      tree access_fn = NULL;
      tree def = PHI_RESULT (phi);
      stmt_vec_info stmt_vinfo = vinfo_for_stmt (phi);

      if (vect_print_dump_info (REPORT_DETAILS))
        {
          fprintf (vect_dump, "Analyze phi: ");
          print_gimple_stmt (vect_dump, phi, 0, TDF_SLIM);
        }

      /* Skip virtual phi's.  The data dependences that are associated with
         virtual defs/uses (i.e., memory accesses) are analyzed elsewhere.  */
      if (!is_gimple_reg (SSA_NAME_VAR (def)))
        continue;

      STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_unknown_def_type;

      /* Analyze the evolution function.  */
      access_fn = analyze_scalar_evolution (loop, def);
      if (access_fn)
        STRIP_NOPS (access_fn);
      if (access_fn && vect_print_dump_info (REPORT_DETAILS))
        {
          fprintf (vect_dump, "Access function of PHI: ");
          print_generic_expr (vect_dump, access_fn, TDF_SLIM);
        }

      if (!access_fn
          || !vect_is_simple_iv_evolution (loop->num, access_fn, &dumy, &dumy))
        {
          VEC_safe_push (gimple, heap, worklist, phi);
          continue;
        }

      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "Detected induction.");
      STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_induction_def;
    }


  /* Second - identify all reductions and nested cycles.  */
  while (VEC_length (gimple, worklist) > 0)
    {
      gimple phi = VEC_pop (gimple, worklist);
      tree def = PHI_RESULT (phi);
      stmt_vec_info stmt_vinfo = vinfo_for_stmt (phi);
      gimple reduc_stmt;
      bool nested_cycle;

      if (vect_print_dump_info (REPORT_DETAILS))
        {
          fprintf (vect_dump, "Analyze phi: ");
          print_gimple_stmt (vect_dump, phi, 0, TDF_SLIM);
        }

      gcc_assert (is_gimple_reg (SSA_NAME_VAR (def)));
      gcc_assert (STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_unknown_def_type);

      nested_cycle = (loop != LOOP_VINFO_LOOP (loop_vinfo));
      reduc_stmt = vect_force_simple_reduction (loop_vinfo, phi, !nested_cycle,
                                                &double_reduc);
      if (reduc_stmt)
        {
          if (double_reduc)
            {
              if (vect_print_dump_info (REPORT_DETAILS))
                fprintf (vect_dump, "Detected double reduction.");

              STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_double_reduction_def;
              STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
                                                    vect_double_reduction_def;
            }
          else
            {
              if (nested_cycle)
                {
                  if (vect_print_dump_info (REPORT_DETAILS))
                    fprintf (vect_dump, "Detected vectorizable nested cycle.");

                  STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_nested_cycle;
                  STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
                                                             vect_nested_cycle;
                }
              else
                {
                  if (vect_print_dump_info (REPORT_DETAILS))
                    fprintf (vect_dump, "Detected reduction.");

                  STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def;
                  STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
                                                           vect_reduction_def;
                  /* Store the reduction cycles for possible vectorization in
                     loop-aware SLP.  */
                  VEC_safe_push (gimple, heap,
                                 LOOP_VINFO_REDUCTIONS (loop_vinfo),
                                 reduc_stmt);
                }
            }
        }
      else
        if (vect_print_dump_info (REPORT_DETAILS))
          fprintf (vect_dump, "Unknown def-use cycle pattern.");
    }

  VEC_free (gimple, heap, worklist);
}
/* Function vect_analyze_scalar_cycles.

   Examine the cross iteration def-use cycles of scalar variables, by
   analyzing the loop-header PHIs of scalar variables.  Classify each
   cycle as one of the following: invariant, induction, reduction, unknown.
   We do that for the loop represented by LOOP_VINFO, and also for its
   inner-loop, if it exists.

   Examples for scalar cycles:

   Example1: reduction:

              loop1:
              for (i=0; i<N; i++)
                 sum += a[i];

   Example2: induction:

              loop2:
              for (i=0; i<N; i++)
                 a[i] = i;  */

static void
vect_analyze_scalar_cycles (loop_vec_info loop_vinfo)
{
  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);

  vect_analyze_scalar_cycles_1 (loop_vinfo, loop);

  /* When vectorizing an outer-loop, the inner-loop is executed sequentially.
     Reductions in such inner-loop therefore have different properties than
     the reductions in the nest that gets vectorized:
     1. When vectorized, they are executed in the same order as in the original
        scalar loop, so we can't change the order of computation when
        vectorizing them.
     2. FIXME: Inner-loop reductions can be used in the inner-loop, so the
        current checks are too strict.  */

  if (loop->inner)
    vect_analyze_scalar_cycles_1 (loop_vinfo, loop->inner);
}
/* Function vect_get_loop_niters.

   Determine how many iterations the loop is executed.
   If an expression that represents the number of iterations
   can be constructed, place it in NUMBER_OF_ITERATIONS.
   Return the loop exit condition.  */

static gimple
vect_get_loop_niters (struct loop *loop, tree *number_of_iterations)
{
  tree niters;

  if (vect_print_dump_info (REPORT_DETAILS))
    fprintf (vect_dump, "=== get_loop_niters ===");

  niters = number_of_exit_cond_executions (loop);

  if (niters != NULL_TREE
      && niters != chrec_dont_know)
    {
      *number_of_iterations = niters;

      if (vect_print_dump_info (REPORT_DETAILS))
        {
          fprintf (vect_dump, "==> get_loop_niters:");
          print_generic_expr (vect_dump, *number_of_iterations, TDF_SLIM);
        }
    }

  return get_loop_exit_condition (loop);
}
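/* For instance (illustrative), for a counted loop "for (i = 0; i < n; i++)"
   in the do-while form the vectorizer requires, the exit condition runs
   once per iteration and number_of_exit_cond_executions yields simply "n";
   for uncountable loops it yields chrec_dont_know and *NUMBER_OF_ITERATIONS
   is left unset.  */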
/* Function bb_in_loop_p

   Used as predicate for dfs order traversal of the loop bbs.  */

static bool
bb_in_loop_p (const_basic_block bb, const void *data)
{
  const struct loop *const loop = (const struct loop *)data;
  if (flow_bb_inside_loop_p (loop, bb))
    return true;
  return false;
}
/* Function new_loop_vec_info.

   Create and initialize a new loop_vec_info struct for LOOP, as well as
   stmt_vec_info structs for all the stmts in LOOP.  */

static loop_vec_info
new_loop_vec_info (struct loop *loop)
{
  loop_vec_info res;
  basic_block *bbs;
  gimple_stmt_iterator si;
  unsigned int i, nbbs;

  res = (loop_vec_info) xcalloc (1, sizeof (struct _loop_vec_info));
  LOOP_VINFO_LOOP (res) = loop;

  bbs = get_loop_body (loop);

  /* Create/Update stmt_info for all stmts in the loop.  */
  for (i = 0; i < loop->num_nodes; i++)
    {
      basic_block bb = bbs[i];

      /* BBs in a nested inner-loop will have been already processed (because
         we will have called vect_analyze_loop_form for any nested inner-loop).
         Therefore, for stmts in an inner-loop we just want to update the
         STMT_VINFO_LOOP_VINFO field of their stmt_info to point to the new
         loop_info of the outer-loop we are currently considering to vectorize
         (instead of the loop_info of the inner-loop).
         For stmts in other BBs we need to create a stmt_info from scratch.  */
      if (bb->loop_father != loop)
        {
          /* Inner-loop bb.  */
          gcc_assert (loop->inner && bb->loop_father == loop->inner);
          for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
            {
              gimple phi = gsi_stmt (si);
              stmt_vec_info stmt_info = vinfo_for_stmt (phi);
              loop_vec_info inner_loop_vinfo =
                                        STMT_VINFO_LOOP_VINFO (stmt_info);
              gcc_assert (loop->inner == LOOP_VINFO_LOOP (inner_loop_vinfo));
              STMT_VINFO_LOOP_VINFO (stmt_info) = res;
            }
          for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
            {
              gimple stmt = gsi_stmt (si);
              stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
              loop_vec_info inner_loop_vinfo =
                                        STMT_VINFO_LOOP_VINFO (stmt_info);
              gcc_assert (loop->inner == LOOP_VINFO_LOOP (inner_loop_vinfo));
              STMT_VINFO_LOOP_VINFO (stmt_info) = res;
            }
        }
      else
        {
          /* bb in current nest.  */
          for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
            {
              gimple phi = gsi_stmt (si);
              gimple_set_uid (phi, 0);
              set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, res, NULL));
            }

          for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
            {
              gimple stmt = gsi_stmt (si);
              gimple_set_uid (stmt, 0);
              set_vinfo_for_stmt (stmt, new_stmt_vec_info (stmt, res, NULL));
            }
        }
    }

  /* CHECKME: We want to visit all BBs before their successors (except for
     latch blocks, for which this assertion wouldn't hold).  In the simple
     case of the loop forms we allow, a dfs order of the BBs would be the
     same as reversed postorder traversal, so we are safe.  */

  free (bbs);
  bbs = XCNEWVEC (basic_block, loop->num_nodes);
  nbbs = dfs_enumerate_from (loop->header, 0, bb_in_loop_p,
                             bbs, loop->num_nodes, loop);
  gcc_assert (nbbs == loop->num_nodes);

  LOOP_VINFO_BBS (res) = bbs;
  LOOP_VINFO_NITERS (res) = NULL;
  LOOP_VINFO_NITERS_UNCHANGED (res) = NULL;
  LOOP_VINFO_COST_MODEL_MIN_ITERS (res) = 0;
  LOOP_VINFO_VECTORIZABLE_P (res) = 0;
  LOOP_PEELING_FOR_ALIGNMENT (res) = 0;
  LOOP_VINFO_VECT_FACTOR (res) = 0;
  LOOP_VINFO_LOOP_NEST (res) = VEC_alloc (loop_p, heap, 3);
  LOOP_VINFO_DATAREFS (res) = VEC_alloc (data_reference_p, heap, 10);
  LOOP_VINFO_DDRS (res) = VEC_alloc (ddr_p, heap, 10 * 10);
  LOOP_VINFO_UNALIGNED_DR (res) = NULL;
  LOOP_VINFO_MAY_MISALIGN_STMTS (res) =
    VEC_alloc (gimple, heap,
               PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIGNMENT_CHECKS));
  LOOP_VINFO_MAY_ALIAS_DDRS (res) =
    VEC_alloc (ddr_p, heap,
               PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS));
  LOOP_VINFO_STRIDED_STORES (res) = VEC_alloc (gimple, heap, 10);
  LOOP_VINFO_REDUCTIONS (res) = VEC_alloc (gimple, heap, 10);
  LOOP_VINFO_SLP_INSTANCES (res) = VEC_alloc (slp_instance, heap, 10);
  LOOP_VINFO_SLP_UNROLLING_FACTOR (res) = 1;
  LOOP_VINFO_PEELING_HTAB (res) = NULL;
  LOOP_VINFO_PEELING_FOR_GAPS (res) = false;

  return res;
}
/* Function destroy_loop_vec_info.

   Free LOOP_VINFO struct, as well as all the stmt_vec_info structs of all the
   stmts in the loop.  */

void
destroy_loop_vec_info (loop_vec_info loop_vinfo, bool clean_stmts)
{
  struct loop *loop;
  basic_block *bbs;
  int nbbs;
  gimple_stmt_iterator si;
  int j;
  VEC (slp_instance, heap) *slp_instances;
  slp_instance instance;

  if (!loop_vinfo)
    return;

  loop = LOOP_VINFO_LOOP (loop_vinfo);

  bbs = LOOP_VINFO_BBS (loop_vinfo);
  nbbs = loop->num_nodes;

  if (!clean_stmts)
    {
      free (LOOP_VINFO_BBS (loop_vinfo));
      free_data_refs (LOOP_VINFO_DATAREFS (loop_vinfo));
      free_dependence_relations (LOOP_VINFO_DDRS (loop_vinfo));
      VEC_free (loop_p, heap, LOOP_VINFO_LOOP_NEST (loop_vinfo));
      VEC_free (gimple, heap, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo));
      VEC_free (ddr_p, heap, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo));

      free (loop_vinfo);
      loop->aux = NULL;
      return;
    }

  for (j = 0; j < nbbs; j++)
    {
      basic_block bb = bbs[j];
      for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
        free_stmt_vec_info (gsi_stmt (si));

      for (si = gsi_start_bb (bb); !gsi_end_p (si); )
        {
          gimple stmt = gsi_stmt (si);
          stmt_vec_info stmt_info = vinfo_for_stmt (stmt);

          if (stmt_info)
            {
              /* Check if this is a "pattern stmt" (introduced by the
                 vectorizer during the pattern recognition pass).  */
              bool remove_stmt_p = false;
              gimple orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
              if (orig_stmt)
                {
                  stmt_vec_info orig_stmt_info = vinfo_for_stmt (orig_stmt);
                  if (orig_stmt_info
                      && STMT_VINFO_IN_PATTERN_P (orig_stmt_info))
                    remove_stmt_p = true;
                }

              /* Free stmt_vec_info.  */
              free_stmt_vec_info (stmt);

              /* Remove dead "pattern stmts".  */
              if (remove_stmt_p)
                gsi_remove (&si, true);
            }
          gsi_next (&si);
        }
    }

  free (LOOP_VINFO_BBS (loop_vinfo));
  free_data_refs (LOOP_VINFO_DATAREFS (loop_vinfo));
  free_dependence_relations (LOOP_VINFO_DDRS (loop_vinfo));
  VEC_free (loop_p, heap, LOOP_VINFO_LOOP_NEST (loop_vinfo));
  VEC_free (gimple, heap, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo));
  VEC_free (ddr_p, heap, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo));
  slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
  FOR_EACH_VEC_ELT (slp_instance, slp_instances, j, instance)
    vect_free_slp_instance (instance);

  VEC_free (slp_instance, heap, LOOP_VINFO_SLP_INSTANCES (loop_vinfo));
  VEC_free (gimple, heap, LOOP_VINFO_STRIDED_STORES (loop_vinfo));
  VEC_free (gimple, heap, LOOP_VINFO_REDUCTIONS (loop_vinfo));

  if (LOOP_VINFO_PEELING_HTAB (loop_vinfo))
    htab_delete (LOOP_VINFO_PEELING_HTAB (loop_vinfo));

  free (loop_vinfo);
  loop->aux = NULL;
}
/* Function vect_analyze_loop_1.

   Apply a set of analyses on LOOP, and create a loop_vec_info struct
   for it.  The different analyses will record information in the
   loop_vec_info struct.  This is a subset of the analyses applied in
   vect_analyze_loop, to be applied on an inner-loop nested in the loop
   that is now considered for (outer-loop) vectorization.  */

static loop_vec_info
vect_analyze_loop_1 (struct loop *loop)
{
  loop_vec_info loop_vinfo;

  if (vect_print_dump_info (REPORT_DETAILS))
    fprintf (vect_dump, "===== analyze_loop_nest_1 =====");

  /* Check the CFG characteristics of the loop (nesting, entry/exit, etc.)  */

  loop_vinfo = vect_analyze_loop_form (loop);
  if (!loop_vinfo)
    {
      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "bad inner-loop form.");
      return NULL;
    }

  return loop_vinfo;
}
/* Function vect_analyze_loop_form.

   Verify that certain CFG restrictions hold, including:
   - the loop has a pre-header
   - the loop has a single entry and exit
   - the loop exit condition is simple enough, and the number of iterations
     can be analyzed (a countable loop).  */

loop_vec_info
vect_analyze_loop_form (struct loop *loop)
{
  loop_vec_info loop_vinfo;
  gimple loop_cond;
  tree number_of_iterations = NULL;
  loop_vec_info inner_loop_vinfo = NULL;

  if (vect_print_dump_info (REPORT_DETAILS))
    fprintf (vect_dump, "=== vect_analyze_loop_form ===");

  /* Different restrictions apply when we are considering an inner-most loop,
     vs. an outer (nested) loop.
     (FORNOW. May want to relax some of these restrictions in the future).  */

  if (!loop->inner)
    {
      /* Inner-most loop.  We currently require that the number of BBs is
         exactly 2 (the header and latch).  Vectorizable inner-most loops
         look like this:

                        (pre-header)
                           |
                          header <--------+
                           | |            |
                           | +--> latch --+
                           |
                        (exit-bb)  */

      if (loop->num_nodes != 2)
        {
          if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
            fprintf (vect_dump, "not vectorized: control flow in loop.");
          return NULL;
        }

      if (empty_block_p (loop->header))
        {
          if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
            fprintf (vect_dump, "not vectorized: empty loop.");
          return NULL;
        }
    }
  else
    {
      struct loop *innerloop = loop->inner;
      edge entryedge;

      /* Nested loop.  We currently require that the loop is doubly-nested,
         contains a single inner loop, and the number of BBs is exactly 5.
         Vectorizable outer-loops look like this:

                        (pre-header)
                           |
                          header <---+
                           |         |
                          inner-loop |
                           |         |
                          tail ------+
                           |
                        (exit-bb)

         The inner-loop has the properties expected of inner-most loops
         as described above.  */

      if ((loop->inner)->inner || (loop->inner)->next)
        {
          if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
            fprintf (vect_dump, "not vectorized: multiple nested loops.");
          return NULL;
        }

      /* Analyze the inner-loop.  */
      inner_loop_vinfo = vect_analyze_loop_1 (loop->inner);
      if (!inner_loop_vinfo)
        {
          if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
            fprintf (vect_dump, "not vectorized: Bad inner loop.");
          return NULL;
        }

      if (!expr_invariant_in_loop_p (loop,
                                     LOOP_VINFO_NITERS (inner_loop_vinfo)))
        {
          if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
            fprintf (vect_dump,
                     "not vectorized: inner-loop count not invariant.");
          destroy_loop_vec_info (inner_loop_vinfo, true);
          return NULL;
        }

      if (loop->num_nodes != 5)
        {
          if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
            fprintf (vect_dump, "not vectorized: control flow in loop.");
          destroy_loop_vec_info (inner_loop_vinfo, true);
          return NULL;
        }

      gcc_assert (EDGE_COUNT (innerloop->header->preds) == 2);
      entryedge = EDGE_PRED (innerloop->header, 0);
      if (EDGE_PRED (innerloop->header, 0)->src == innerloop->latch)
        entryedge = EDGE_PRED (innerloop->header, 1);

      if (entryedge->src != loop->header
          || !single_exit (innerloop)
          || single_exit (innerloop)->dest != EDGE_PRED (loop->latch, 0)->src)
        {
          if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
            fprintf (vect_dump, "not vectorized: unsupported outerloop form.");
          destroy_loop_vec_info (inner_loop_vinfo, true);
          return NULL;
        }

      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "Considering outer-loop vectorization.");
    }

  if (!single_exit (loop)
      || EDGE_COUNT (loop->header->preds) != 2)
    {
      if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
        {
          if (!single_exit (loop))
            fprintf (vect_dump, "not vectorized: multiple exits.");
          else if (EDGE_COUNT (loop->header->preds) != 2)
            fprintf (vect_dump, "not vectorized: too many incoming edges.");
        }
      if (inner_loop_vinfo)
        destroy_loop_vec_info (inner_loop_vinfo, true);
      return NULL;
    }

  /* We assume that the loop exit condition is at the end of the loop. i.e,
     that the loop is represented as a do-while (with a proper if-guard
     before the loop if needed), where the loop header contains all the
     executable statements, and the latch is empty.  */
  if (!empty_block_p (loop->latch)
      || !gimple_seq_empty_p (phi_nodes (loop->latch)))
    {
      if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
        fprintf (vect_dump, "not vectorized: unexpected loop form.");
      if (inner_loop_vinfo)
        destroy_loop_vec_info (inner_loop_vinfo, true);
      return NULL;
    }

  /* Make sure there exists a single-predecessor exit bb:  */
  if (!single_pred_p (single_exit (loop)->dest))
    {
      edge e = single_exit (loop);
      if (!(e->flags & EDGE_ABNORMAL))
        {
          split_loop_exit_edge (e);
          if (vect_print_dump_info (REPORT_DETAILS))
            fprintf (vect_dump, "split exit edge.");
        }
      else
        {
          if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
            fprintf (vect_dump, "not vectorized: abnormal loop exit edge.");
          if (inner_loop_vinfo)
            destroy_loop_vec_info (inner_loop_vinfo, true);
          return NULL;
        }
    }

  loop_cond = vect_get_loop_niters (loop, &number_of_iterations);
  if (!loop_cond)
    {
      if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
        fprintf (vect_dump, "not vectorized: complicated exit condition.");
      if (inner_loop_vinfo)
        destroy_loop_vec_info (inner_loop_vinfo, true);
      return NULL;
    }

  if (!number_of_iterations)
    {
      if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
        fprintf (vect_dump,
                 "not vectorized: number of iterations cannot be computed.");
      if (inner_loop_vinfo)
        destroy_loop_vec_info (inner_loop_vinfo, true);
      return NULL;
    }

  if (chrec_contains_undetermined (number_of_iterations))
    {
      if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
        fprintf (vect_dump, "Infinite number of iterations.");
      if (inner_loop_vinfo)
        destroy_loop_vec_info (inner_loop_vinfo, true);
      return NULL;
    }

  if (!NITERS_KNOWN_P (number_of_iterations))
    {
      if (vect_print_dump_info (REPORT_DETAILS))
        {
          fprintf (vect_dump, "Symbolic number of iterations is ");
          print_generic_expr (vect_dump, number_of_iterations, TDF_DETAILS);
        }
    }
  else if (TREE_INT_CST_LOW (number_of_iterations) == 0)
    {
      if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
        fprintf (vect_dump, "not vectorized: number of iterations = 0.");
      if (inner_loop_vinfo)
        destroy_loop_vec_info (inner_loop_vinfo, false);
      return NULL;
    }

  loop_vinfo = new_loop_vec_info (loop);
  LOOP_VINFO_NITERS (loop_vinfo) = number_of_iterations;
  LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = number_of_iterations;

  STMT_VINFO_TYPE (vinfo_for_stmt (loop_cond)) = loop_exit_ctrl_vec_info_type;

  /* CHECKME: May want to keep it around in the future.  */
  if (inner_loop_vinfo)
    destroy_loop_vec_info (inner_loop_vinfo, false);

  gcc_assert (!loop->aux);
  loop->aux = loop_vinfo;
  return loop_vinfo;
}
/* Get cost by calling cost target builtin.  */

static inline int
vect_get_cost (enum vect_cost_for_stmt type_of_cost)
{
  tree dummy_type = NULL;
  int dummy = 0;

  return targetm.vectorize.builtin_vectorization_cost (type_of_cost,
                                                       dummy_type, dummy);
}
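/* E.g., vect_get_cost (scalar_stmt) returns the target's estimate for a
   single scalar statement; with the default hook implementation most
   statement kinds cost 1 (the exact values depend on the target's
   builtin_vectorization_cost hook, so this is only illustrative).  */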
/* Function vect_analyze_loop_operations.

   Scan the loop stmts and make sure they are all vectorizable.  */

static bool
vect_analyze_loop_operations (loop_vec_info loop_vinfo)
{
  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
  int nbbs = loop->num_nodes;
  gimple_stmt_iterator si;
  unsigned int vectorization_factor = 0;
  int i;
  gimple phi;
  stmt_vec_info stmt_info;
  bool need_to_vectorize = false;
  int min_profitable_iters;
  int min_scalar_loop_bound;
  unsigned int th;
  bool only_slp_in_loop = true, ok;

  if (vect_print_dump_info (REPORT_DETAILS))
    fprintf (vect_dump, "=== vect_analyze_loop_operations ===");

  gcc_assert (LOOP_VINFO_VECT_FACTOR (loop_vinfo));
  vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);

  for (i = 0; i < nbbs; i++)
    {
      basic_block bb = bbs[i];

      for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
        {
          phi = gsi_stmt (si);
          ok = true;

          stmt_info = vinfo_for_stmt (phi);
          if (vect_print_dump_info (REPORT_DETAILS))
            {
              fprintf (vect_dump, "examining phi: ");
              print_gimple_stmt (vect_dump, phi, 0, TDF_SLIM);
            }

          if (! is_loop_header_bb_p (bb))
            {
              /* inner-loop loop-closed exit phi in outer-loop vectorization
                 (i.e. a phi in the tail of the outer-loop).
                 FORNOW: we currently don't support the case that these phis
                 are not used in the outerloop (unless it is double reduction,
                 i.e., this phi is vect_reduction_def), cause this case
                 requires to actually do something here.  */
              if ((!STMT_VINFO_RELEVANT_P (stmt_info)
                   || STMT_VINFO_LIVE_P (stmt_info))
                  && STMT_VINFO_DEF_TYPE (stmt_info)
                     != vect_double_reduction_def)
                {
                  if (vect_print_dump_info (REPORT_DETAILS))
                    fprintf (vect_dump,
                             "Unsupported loop-closed phi in outer-loop.");
                  return false;
                }
              continue;
            }

          gcc_assert (stmt_info);

          if (STMT_VINFO_LIVE_P (stmt_info))
            {
              /* FORNOW: not yet supported.  */
              if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
                fprintf (vect_dump, "not vectorized: value used after loop.");
              return false;
            }

          if (STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope
              && STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
            {
              /* A scalar-dependence cycle that we don't support.  */
              if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
                fprintf (vect_dump, "not vectorized: scalar dependence cycle.");
              return false;
            }

          if (STMT_VINFO_RELEVANT_P (stmt_info))
            {
              need_to_vectorize = true;
              if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def)
                ok = vectorizable_induction (phi, NULL, NULL);
            }

          if (!ok)
            {
              if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
                {
                  fprintf (vect_dump,
                           "not vectorized: relevant phi not supported: ");
                  print_gimple_stmt (vect_dump, phi, 0, TDF_SLIM);
                }
              return false;
            }
        }

      for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
        {
          gimple stmt = gsi_stmt (si);
          stmt_vec_info stmt_info = vinfo_for_stmt (stmt);

          gcc_assert (stmt_info);

          if (!vect_analyze_stmt (stmt, &need_to_vectorize, NULL))
            return false;

          if ((STMT_VINFO_RELEVANT_P (stmt_info)
               || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
              && !PURE_SLP_STMT (stmt_info))
            /* STMT needs both SLP and loop-based vectorization.  */
            only_slp_in_loop = false;
        }
    } /* bbs */

  /* All operations in the loop are either irrelevant (deal with loop
     control, or dead), or only used outside the loop and can be moved
     out of the loop (e.g. invariants, inductions).  The loop can be
     optimized away by scalar optimizations.  We're better off not
     touching this loop.  */
  if (!need_to_vectorize)
    {
      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump,
                 "All the computation can be taken out of the loop.");
      if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
        fprintf (vect_dump,
                 "not vectorized: redundant loop. no profit to vectorize.");
      return false;
    }

  /* If all the stmts in the loop can be SLPed, we perform only SLP, and the
     vectorization factor of the loop is the unrolling factor required by the
     SLP instances.  If that unrolling factor is 1, we say, that we perform
     pure SLP on loop - cross iteration parallelism is not exploited.  */
  if (only_slp_in_loop)
    vectorization_factor = LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo);
  else
    vectorization_factor = least_common_multiple (vectorization_factor,
                                LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo));

  LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;

  if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
      && vect_print_dump_info (REPORT_DETAILS))
    fprintf (vect_dump,
             "vectorization_factor = %d, niters = " HOST_WIDE_INT_PRINT_DEC,
             vectorization_factor, LOOP_VINFO_INT_NITERS (loop_vinfo));

  if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
      && (LOOP_VINFO_INT_NITERS (loop_vinfo) < vectorization_factor))
    {
      if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
        fprintf (vect_dump, "not vectorized: iteration count too small.");
      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "not vectorized: iteration count smaller than "
                 "vectorization factor.");
      return false;
    }

  /* Analyze cost.  Decide if worth while to vectorize.  */

  /* Once VF is set, SLP costs should be updated since the number of created
     vector stmts depends on VF.  */
  vect_update_slp_costs_according_to_vf (loop_vinfo);

  min_profitable_iters = vect_estimate_min_profitable_iters (loop_vinfo);
  LOOP_VINFO_COST_MODEL_MIN_ITERS (loop_vinfo) = min_profitable_iters;

  if (min_profitable_iters < 0)
    {
      if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
        fprintf (vect_dump, "not vectorized: vectorization not profitable.");
      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "not vectorized: vector version will never be "
                 "profitable.");
      return false;
    }

  min_scalar_loop_bound = ((PARAM_VALUE (PARAM_MIN_VECT_LOOP_BOUND)
                            * vectorization_factor) - 1);

  /* Use the cost model only if it is more conservative than user specified
     threshold.  */

  th = (unsigned) min_scalar_loop_bound;
  if (min_profitable_iters
      && (!min_scalar_loop_bound
          || min_profitable_iters > min_scalar_loop_bound))
    th = (unsigned) min_profitable_iters;

  if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
      && LOOP_VINFO_INT_NITERS (loop_vinfo) <= th)
    {
      if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
        fprintf (vect_dump, "not vectorized: vectorization not "
                 "profitable.");
      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "not vectorized: iteration count smaller than "
                 "user specified loop bound parameter or minimum "
                 "profitable iterations (whichever is more conservative).");
      return false;
    }

  if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
      || LOOP_VINFO_INT_NITERS (loop_vinfo) % vectorization_factor != 0
      || LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo))
    {
      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "epilog loop required.");
      if (!vect_can_advance_ivs_p (loop_vinfo))
        {
          if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
            fprintf (vect_dump,
                     "not vectorized: can't create epilog loop 1.");
          return false;
        }
      if (!slpeel_can_duplicate_loop_p (loop, single_exit (loop)))
        {
          if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
            fprintf (vect_dump,
                     "not vectorized: can't create epilog loop 2.");
          return false;
        }
    }

  return true;
}
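/* A worked example of the epilog test above (illustrative numbers): with a
   known iteration count of 10 and a vectorization factor of 4, two vector
   iterations cover 8 scalar iterations and 10 % 4 == 2 iterations remain,
   so an epilog loop is required.  It is likewise required when the count
   is unknown at compile time or when peeling for alignment is used.  */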
/* Function vect_analyze_loop_2.

   Apply a set of analyses on LOOP, and create a loop_vec_info struct
   for it.  The different analyses will record information in the
   loop_vec_info struct.  */
static bool
vect_analyze_loop_2 (loop_vec_info loop_vinfo)
{
  bool ok, dummy;
  int max_vf = MAX_VECTORIZATION_FACTOR;
  int min_vf = 2;

  /* Find all data references in the loop (which correspond to vdefs/vuses)
     and analyze their evolution in the loop.  Also adjust the minimal
     vectorization factor according to the loads and stores.

     FORNOW: Handle only simple, array references, whose
     alignment can be forced, and aligned pointer-references.  */

  ok = vect_analyze_data_refs (loop_vinfo, NULL, &min_vf);
  if (!ok)
    {
      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "bad data references.");
      return false;
    }

  /* Classify all cross-iteration scalar data-flow cycles.
     Cross-iteration cycles caused by virtual phis are analyzed separately.  */

  vect_analyze_scalar_cycles (loop_vinfo);

  vect_pattern_recog (loop_vinfo);

  /* Data-flow analysis to detect stmts that do not need to be vectorized.  */

  ok = vect_mark_stmts_to_be_vectorized (loop_vinfo);
  if (!ok)
    {
      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "unexpected pattern.");
      return false;
    }

  /* Analyze data dependences between the data-refs in the loop
     and adjust the maximum vectorization factor according to
     the dependences.
     FORNOW: fail at the first data dependence that we encounter.  */

  ok = vect_analyze_data_ref_dependences (loop_vinfo, NULL, &max_vf, &dummy);
  if (!ok
      || max_vf < min_vf)
    {
      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "bad data dependence.");
      return false;
    }

  ok = vect_determine_vectorization_factor (loop_vinfo);
  if (!ok)
    {
      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "can't determine vectorization factor.");
      return false;
    }
  if (max_vf < LOOP_VINFO_VECT_FACTOR (loop_vinfo))
    {
      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "bad data dependence.");
      return false;
    }

  /* Analyze the alignment of the data-refs in the loop.
     Fail if a data reference is found that cannot be vectorized.  */

  ok = vect_analyze_data_refs_alignment (loop_vinfo, NULL);
  if (!ok)
    {
      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "bad data alignment.");
      return false;
    }

  /* Analyze the access patterns of the data-refs in the loop (consecutive,
     complex, etc.).  FORNOW: Only handle consecutive access pattern.  */

  ok = vect_analyze_data_ref_accesses (loop_vinfo, NULL);
  if (!ok)
    {
      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "bad data access.");
      return false;
    }

  /* Prune the list of ddrs to be tested at run-time by versioning for alias.
     It is important to call pruning after vect_analyze_data_ref_accesses,
     since we use grouping information gathered by interleaving analysis.  */
  ok = vect_prune_runtime_alias_test_list (loop_vinfo);
  if (!ok)
    {
      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "too long list of versioning for alias "
                            "run-time tests.");
      return false;
    }

  /* This pass will decide on using loop versioning and/or loop peeling in
     order to enhance the alignment of data references in the loop.  */

  ok = vect_enhance_data_refs_alignment (loop_vinfo);
  if (!ok)
    {
      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "bad data alignment.");
      return false;
    }

  /* Check the SLP opportunities in the loop, analyze and build SLP trees.  */
  ok = vect_analyze_slp (loop_vinfo, NULL);
  if (ok)
    {
      /* Decide which possible SLP instances to SLP.  */
      vect_make_slp_decision (loop_vinfo);

      /* Find stmts that need to be both vectorized and SLPed.  */
      vect_detect_hybrid_slp (loop_vinfo);
    }

  /* Scan all the operations in the loop and make sure they are
     vectorizable.  */

  ok = vect_analyze_loop_operations (loop_vinfo);
  if (!ok)
    {
      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "bad operation or unsupported loop bound.");
      return false;
    }

  return true;
}
/* Function vect_analyze_loop.

   Apply a set of analyses on LOOP, and create a loop_vec_info struct
   for it.  The different analyses will record information in the
   loop_vec_info struct.  */
loop_vec_info
vect_analyze_loop (struct loop *loop)
{
  loop_vec_info loop_vinfo;
  unsigned int vector_sizes;

  /* Autodetect first vector size we try.  */
  current_vector_size = 0;
  vector_sizes = targetm.vectorize.autovectorize_vector_sizes ();

  if (vect_print_dump_info (REPORT_DETAILS))
    fprintf (vect_dump, "===== analyze_loop_nest =====");

  if (loop_outer (loop)
      && loop_vec_info_for_loop (loop_outer (loop))
      && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop))))
    {
      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "outer-loop already vectorized.");
      return NULL;
    }

  while (1)
    {
      /* Check the CFG characteristics of the loop (nesting, entry/exit).  */
      loop_vinfo = vect_analyze_loop_form (loop);
      if (!loop_vinfo)
        {
          if (vect_print_dump_info (REPORT_DETAILS))
            fprintf (vect_dump, "bad loop form.");
          return NULL;
        }

      if (vect_analyze_loop_2 (loop_vinfo))
        {
          LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1;

          return loop_vinfo;
        }

      destroy_loop_vec_info (loop_vinfo, true);

      vector_sizes &= ~current_vector_size;
      if (vector_sizes == 0
          || current_vector_size == 0)
        return NULL;

      /* Try the next biggest vector size.  */
      current_vector_size = 1 << floor_log2 (vector_sizes);
      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "***** Re-trying analysis with "
                 "vector size %d\n", current_vector_size);
    }
}
/* Function reduction_code_for_scalar_code

   Input:
   CODE - tree_code of a reduction operation.

   Output:
   REDUC_CODE - the corresponding tree-code to be used to reduce the
      vector of partial results into a single scalar result (which
      will also reside in a vector) or ERROR_MARK if the operation is
      a supported reduction operation, but does not have such a tree-code.

   Return FALSE if CODE currently cannot be vectorized as reduction.  */

static bool
reduction_code_for_scalar_code (enum tree_code code,
                                enum tree_code *reduc_code)
{
  switch (code)
    {
      case MAX_EXPR:
        *reduc_code = REDUC_MAX_EXPR;
        return true;

      case MIN_EXPR:
        *reduc_code = REDUC_MIN_EXPR;
        return true;

      case PLUS_EXPR:
        *reduc_code = REDUC_PLUS_EXPR;
        return true;

      case MULT_EXPR:
      case MINUS_EXPR:
      case BIT_IOR_EXPR:
      case BIT_XOR_EXPR:
      case BIT_AND_EXPR:
        *reduc_code = ERROR_MARK;
        return true;

      default:
        return false;
    }
}
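/* For example, a sum reduction "sum += a[i]" maps PLUS_EXPR to
   REDUC_PLUS_EXPR: the vectorized loop accumulates a vector of partial
   sums, and the epilog uses REDUC_PLUS_EXPR to fold it into one scalar.
   For the codes mapped to ERROR_MARK above, the epilog is instead
   open-coded (e.g. with whole-vector shifts or element extraction).  */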
/* Error reporting helper for vect_is_simple_reduction below.  GIMPLE statement
   STMT is printed with a message MSG.  */

static void
report_vect_op (gimple stmt, const char *msg)
{
  fprintf (vect_dump, "%s", msg);
  print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
}
/* Function vect_is_simple_reduction_1

   (1) Detect a cross-iteration def-use cycle that represents a simple
   reduction computation.  We look for the following pattern:

   loop_header:
     a1 = phi < a0, a2 >
     a3 = ...
     a2 = operation (a3, a1)

   such that:
   1. operation is commutative and associative and it is safe to
      change the order of the computation (if CHECK_REDUCTION is true)
   2. no uses for a2 in the loop (a2 is used out of the loop)
   3. no uses of a1 in the loop besides the reduction operation
   4. no uses of a1 outside the loop.

   Conditions 1,4 are tested here.
   Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.

   (2) Detect a cross-iteration def-use cycle in nested loops, i.e.,
   nested cycles, if CHECK_REDUCTION is false.

   (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double
   reductions:

     a1 = phi < a0, a2 >
     inner loop (def of a3)
     a2 = phi < a3 >

   If MODIFY is true it tries also to rework the code in-place to enable
   detection of more reduction patterns.  For the time being we rewrite
   "res -= RHS" into "res += -RHS" when it seems worthwhile.  */

static gimple
vect_is_simple_reduction_1 (loop_vec_info loop_info, gimple phi,
                            bool check_reduction, bool *double_reduc,
                            bool modify)
1679 struct loop *loop = (gimple_bb (phi))->loop_father;
1680 struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
1681 edge latch_e = loop_latch_edge (loop);
1682 tree loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
1683 gimple def_stmt, def1 = NULL, def2 = NULL;
1684 enum tree_code orig_code, code;
1685 tree op1, op2, op3 = NULL_TREE, op4 = NULL_TREE;
1686 tree type;
1687 int nloop_uses;
1688 tree name;
1689 imm_use_iterator imm_iter;
1690 use_operand_p use_p;
1691 bool phi_def;
1693 *double_reduc = false;
1695 /* If CHECK_REDUCTION is true, we assume inner-most loop vectorization,
1696 otherwise, we assume outer loop vectorization. */
1697 gcc_assert ((check_reduction && loop == vect_loop)
1698 || (!check_reduction && flow_loop_nested_p (vect_loop, loop)));
1700 name = PHI_RESULT (phi);
1701 nloop_uses = 0;
1702 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name)
1704 gimple use_stmt = USE_STMT (use_p);
1705 if (is_gimple_debug (use_stmt))
1706 continue;
1708 if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
1710 if (vect_print_dump_info (REPORT_DETAILS))
1711 fprintf (vect_dump, "intermediate value used outside loop.");
1713 return NULL;
1716 if (vinfo_for_stmt (use_stmt)
1717 && !is_pattern_stmt_p (vinfo_for_stmt (use_stmt)))
1718 nloop_uses++;
1719 if (nloop_uses > 1)
1721 if (vect_print_dump_info (REPORT_DETAILS))
1722 fprintf (vect_dump, "reduction used in loop.");
1723 return NULL;
1727 if (TREE_CODE (loop_arg) != SSA_NAME)
1729 if (vect_print_dump_info (REPORT_DETAILS))
1731 fprintf (vect_dump, "reduction: not ssa_name: ");
1732 print_generic_expr (vect_dump, loop_arg, TDF_SLIM);
1734 return NULL;
1737 def_stmt = SSA_NAME_DEF_STMT (loop_arg);
1738 if (!def_stmt)
1740 if (vect_print_dump_info (REPORT_DETAILS))
1741 fprintf (vect_dump, "reduction: no def_stmt.");
1742 return NULL;
1745 if (!is_gimple_assign (def_stmt) && gimple_code (def_stmt) != GIMPLE_PHI)
1747 if (vect_print_dump_info (REPORT_DETAILS))
1748 print_gimple_stmt (vect_dump, def_stmt, 0, TDF_SLIM);
1749 return NULL;
1752 if (is_gimple_assign (def_stmt))
1754 name = gimple_assign_lhs (def_stmt);
1755 phi_def = false;
1757 else
1759 name = PHI_RESULT (def_stmt);
1760 phi_def = true;
1763 nloop_uses = 0;
1764 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name)
1766 gimple use_stmt = USE_STMT (use_p);
1767 if (is_gimple_debug (use_stmt))
1768 continue;
1769 if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt))
1770 && vinfo_for_stmt (use_stmt)
1771 && !is_pattern_stmt_p (vinfo_for_stmt (use_stmt)))
1772 nloop_uses++;
1773 if (nloop_uses > 1)
1775 if (vect_print_dump_info (REPORT_DETAILS))
1776 fprintf (vect_dump, "reduction used in loop.");
1777 return NULL;
1781 /* If DEF_STMT is a phi node itself, we expect it to have a single argument
1782 defined in the inner loop. */
1783 if (phi_def)
1785 op1 = PHI_ARG_DEF (def_stmt, 0);
1787 if (gimple_phi_num_args (def_stmt) != 1
1788 || TREE_CODE (op1) != SSA_NAME)
1790 if (vect_print_dump_info (REPORT_DETAILS))
1791 fprintf (vect_dump, "unsupported phi node definition.");
1793 return NULL;
1796 def1 = SSA_NAME_DEF_STMT (op1);
1797 if (flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
1798 && loop->inner
1799 && flow_bb_inside_loop_p (loop->inner, gimple_bb (def1))
1800 && is_gimple_assign (def1))
1802 if (vect_print_dump_info (REPORT_DETAILS))
1803 report_vect_op (def_stmt, "detected double reduction: ");
1805 *double_reduc = true;
1806 return def_stmt;
1809 return NULL;
1812 code = orig_code = gimple_assign_rhs_code (def_stmt);
1814 /* We can handle "res -= x[i]", which is non-associative by
1815 simply rewriting this into "res += -x[i]". Avoid changing
1816 gimple instruction for the first simple tests and only do this
1817 if we're allowed to change code at all. */
1818 if (code == MINUS_EXPR
1819 && modify
1820 && (op1 = gimple_assign_rhs1 (def_stmt))
1821 && TREE_CODE (op1) == SSA_NAME
1822 && SSA_NAME_DEF_STMT (op1) == phi)
1823 code = PLUS_EXPR;
1825 if (check_reduction
1826 && (!commutative_tree_code (code) || !associative_tree_code (code)))
1828 if (vect_print_dump_info (REPORT_DETAILS))
1829 report_vect_op (def_stmt, "reduction: not commutative/associative: ");
1830 return NULL;
1833 if (get_gimple_rhs_class (code) != GIMPLE_BINARY_RHS)
1835 if (code != COND_EXPR)
1837 if (vect_print_dump_info (REPORT_DETAILS))
1838 report_vect_op (def_stmt, "reduction: not binary operation: ");
1840 return NULL;
1843 op3 = TREE_OPERAND (gimple_assign_rhs1 (def_stmt), 0);
1844 if (COMPARISON_CLASS_P (op3))
1846 op4 = TREE_OPERAND (op3, 1);
1847 op3 = TREE_OPERAND (op3, 0);
1850 op1 = TREE_OPERAND (gimple_assign_rhs1 (def_stmt), 1);
1851 op2 = TREE_OPERAND (gimple_assign_rhs1 (def_stmt), 2);
1853 if (TREE_CODE (op1) != SSA_NAME && TREE_CODE (op2) != SSA_NAME)
1855 if (vect_print_dump_info (REPORT_DETAILS))
1856 report_vect_op (def_stmt, "reduction: uses not ssa_names: ");
1858 return NULL;
1861 else
1863 op1 = gimple_assign_rhs1 (def_stmt);
1864 op2 = gimple_assign_rhs2 (def_stmt);
1866 if (TREE_CODE (op1) != SSA_NAME || TREE_CODE (op2) != SSA_NAME)
1868 if (vect_print_dump_info (REPORT_DETAILS))
1869 report_vect_op (def_stmt, "reduction: uses not ssa_names: ");
1871 return NULL;
1875 type = TREE_TYPE (gimple_assign_lhs (def_stmt));
1876 if ((TREE_CODE (op1) == SSA_NAME
1877 && !types_compatible_p (type,TREE_TYPE (op1)))
1878 || (TREE_CODE (op2) == SSA_NAME
1879 && !types_compatible_p (type, TREE_TYPE (op2)))
1880 || (op3 && TREE_CODE (op3) == SSA_NAME
1881 && !types_compatible_p (type, TREE_TYPE (op3)))
1882 || (op4 && TREE_CODE (op4) == SSA_NAME
1883 && !types_compatible_p (type, TREE_TYPE (op4))))
1885 if (vect_print_dump_info (REPORT_DETAILS))
1887 fprintf (vect_dump, "reduction: multiple types: operation type: ");
1888 print_generic_expr (vect_dump, type, TDF_SLIM);
1890 fprintf (vect_dump, ", operand types: ");
1890 print_generic_expr (vect_dump, TREE_TYPE (op1), TDF_SLIM);
1891 fprintf (vect_dump, ",");
1892 print_generic_expr (vect_dump, TREE_TYPE (op2), TDF_SLIM);
1893 if (op3)
1895 fprintf (vect_dump, ",");
1896 print_generic_expr (vect_dump, TREE_TYPE (op3), TDF_SLIM);
1899 if (op4)
1901 fprintf (vect_dump, ",");
1902 print_generic_expr (vect_dump, TREE_TYPE (op4), TDF_SLIM);
1906 return NULL;
1909 /* Check that it's ok to change the order of the computation.
1910 Generally, when vectorizing a reduction we change the order of the
1911 computation. This may change the behavior of the program in some
1912 cases, so we need to check that this is ok. One exception is when
1913 vectorizing an outer-loop: the inner-loop is executed sequentially,
1914 and therefore vectorizing reductions in the inner-loop during
1915 outer-loop vectorization is safe. */
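/* For example, in IEEE single precision:
     (1e30f + -1e30f) + 1.0f == 1.0f
     1e30f + (-1e30f + 1.0f) == 0.0f
   because 1.0f is absorbed by rounding when added to -1e30f.  This is
   why FP reductions are rejected below unless -fassociative-math is in
   effect (or only an enclosing outer loop is being vectorized).  */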
1917 /* CHECKME: check for !flag_finite_math_only too? */
1918 if (SCALAR_FLOAT_TYPE_P (type) && !flag_associative_math
1919 && check_reduction)
1921 /* Changing the order of operations changes the semantics. */
1922 if (vect_print_dump_info (REPORT_DETAILS))
1923 report_vect_op (def_stmt, "reduction: unsafe fp math optimization: ");
1924 return NULL;
1926 else if (INTEGRAL_TYPE_P (type) && TYPE_OVERFLOW_TRAPS (type)
1927 && check_reduction)
1929 /* Changing the order of operations changes the semantics. */
1930 if (vect_print_dump_info (REPORT_DETAILS))
1931 report_vect_op (def_stmt, "reduction: unsafe int math optimization: ");
1932 return NULL;
1934 else if (SAT_FIXED_POINT_TYPE_P (type) && check_reduction)
1936 /* Changing the order of operations changes the semantics. */
1937 if (vect_print_dump_info (REPORT_DETAILS))
1938 report_vect_op (def_stmt,
1939 "reduction: unsafe fixed-point math optimization: ");
1940 return NULL;
1943 /* If we detected "res -= x[i]" earlier, rewrite it into
1944 "res += -x[i]" now. If this turns out to be useless reassoc
1945 will clean it up again. */
1946 if (orig_code == MINUS_EXPR)
1948 tree rhs = gimple_assign_rhs2 (def_stmt);
1949 tree negrhs = make_ssa_name (SSA_NAME_VAR (rhs), NULL);
1950 gimple negate_stmt = gimple_build_assign_with_ops (NEGATE_EXPR, negrhs,
1951 rhs, NULL);
1952 gimple_stmt_iterator gsi = gsi_for_stmt (def_stmt);
1953 set_vinfo_for_stmt (negate_stmt, new_stmt_vec_info (negate_stmt,
1954 loop_info, NULL));
1955 gsi_insert_before (&gsi, negate_stmt, GSI_NEW_STMT);
1956 gimple_assign_set_rhs2 (def_stmt, negrhs);
1957 gimple_assign_set_rhs_code (def_stmt, PLUS_EXPR);
1958 update_stmt (def_stmt);
1961 /* Reduction is safe. We're dealing with one of the following:
1962 1) integer arithmetic and no trapv
1963 2) floating point arithmetic, and special flags permit this optimization
1964 3) nested cycle (i.e., outer loop vectorization). */
1965 if (TREE_CODE (op1) == SSA_NAME)
1966 def1 = SSA_NAME_DEF_STMT (op1);
1968 if (TREE_CODE (op2) == SSA_NAME)
1969 def2 = SSA_NAME_DEF_STMT (op2);
1971 if (code != COND_EXPR
1972 && (!def1 || !def2 || gimple_nop_p (def1) || gimple_nop_p (def2)))
1974 if (vect_print_dump_info (REPORT_DETAILS))
1975 report_vect_op (def_stmt, "reduction: no defs for operands: ");
1976 return NULL;
1979 /* Check that one def is the reduction def, defined by PHI,
1980 the other def is either defined in the loop ("vect_internal_def"),
1981 or it's an induction (defined by a loop-header phi-node). */
1983 if (def2 && def2 == phi
1984 && (code == COND_EXPR
1985 || (def1 && flow_bb_inside_loop_p (loop, gimple_bb (def1))
1986 && (is_gimple_assign (def1)
1987 || is_gimple_call (def1)
1988 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1))
1989 == vect_induction_def
1990 || (gimple_code (def1) == GIMPLE_PHI
1991 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1))
1992 == vect_internal_def
1993 && !is_loop_header_bb_p (gimple_bb (def1)))))))
1995 if (vect_print_dump_info (REPORT_DETAILS))
1996 report_vect_op (def_stmt, "detected reduction: ");
1997 return def_stmt;
1999 else if (def1 && def1 == phi
2000 && (code == COND_EXPR
2001 || (def2 && flow_bb_inside_loop_p (loop, gimple_bb (def2))
2002 && (is_gimple_assign (def2)
2003 || is_gimple_call (def2)
2004 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2))
2005 == vect_induction_def
2006 || (gimple_code (def2) == GIMPLE_PHI
2007 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2))
2008 == vect_internal_def
2009 && !is_loop_header_bb_p (gimple_bb (def2)))))))
2011 if (check_reduction)
2013 /* Swap operands (just for simplicity - so that the rest of the code
2014 can assume that the reduction variable is always the last (second)
2015 argument). */
2016 if (vect_print_dump_info (REPORT_DETAILS))
2017 report_vect_op (def_stmt,
2018 "detected reduction: need to swap operands: ");
2020 swap_tree_operands (def_stmt, gimple_assign_rhs1_ptr (def_stmt),
2021 gimple_assign_rhs2_ptr (def_stmt));
2023 else
2025 if (vect_print_dump_info (REPORT_DETAILS))
2026 report_vect_op (def_stmt, "detected reduction: ");
2029 return def_stmt;
2031 else
2033 if (vect_print_dump_info (REPORT_DETAILS))
2034 report_vect_op (def_stmt, "reduction: unknown pattern: ");
2036 return NULL;
2040 /* Wrapper around vect_is_simple_reduction_1, which won't modify code
2041 in-place. Arguments are as for that function. */
2043 static gimple
2044 vect_is_simple_reduction (loop_vec_info loop_info, gimple phi,
2045 bool check_reduction, bool *double_reduc)
2047 return vect_is_simple_reduction_1 (loop_info, phi, check_reduction,
2048 double_reduc, false);
2051 /* Wrapper around vect_is_simple_reduction_1, which will modify code
2052 in-place if it enables detection of more reductions. Arguments
2053 are as for that function. */
2055 gimple
2056 vect_force_simple_reduction (loop_vec_info loop_info, gimple phi,
2057 bool check_reduction, bool *double_reduc)
2059 return vect_is_simple_reduction_1 (loop_info, phi, check_reduction,
2060 double_reduc, true);
2063 /* Calculate the cost of one scalar iteration of the loop. */
2064 int
2065 vect_get_single_scalar_iteraion_cost (loop_vec_info loop_vinfo)
2067 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2068 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
2069 int nbbs = loop->num_nodes, factor, scalar_single_iter_cost = 0;
2070 int innerloop_iters, i, stmt_cost;
2072 /* Count statements in the scalar loop; use this count as the scalar
2073 cost of a single iteration for now.
2075 TODO: Add outer loop support.
2077 TODO: Consider assigning different costs to different scalar
2078 statements. */
2080 /* FORNOW. */
2081 innerloop_iters = 1;
2082 if (loop->inner)
2083 innerloop_iters = 50; /* FIXME */
2085 for (i = 0; i < nbbs; i++)
2087 gimple_stmt_iterator si;
2088 basic_block bb = bbs[i];
2090 if (bb->loop_father == loop->inner)
2091 factor = innerloop_iters;
2092 else
2093 factor = 1;
2095 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
2097 gimple stmt = gsi_stmt (si);
2098 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2100 if (!is_gimple_assign (stmt) && !is_gimple_call (stmt))
2101 continue;
2103 /* Skip stmts that are not vectorized inside the loop. */
2104 if (stmt_info
2105 && !STMT_VINFO_RELEVANT_P (stmt_info)
2106 && (!STMT_VINFO_LIVE_P (stmt_info)
2107 || !VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
2108 && !STMT_VINFO_IN_PATTERN_P (stmt_info))
2109 continue;
2111 if (STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt)))
2113 if (DR_IS_READ (STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt))))
2114 stmt_cost = vect_get_cost (scalar_load);
2115 else
2116 stmt_cost = vect_get_cost (scalar_store);
2118 else
2119 stmt_cost = vect_get_cost (scalar_stmt);
2121 scalar_single_iter_cost += stmt_cost * factor;
2124 return scalar_single_iter_cost;
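/* For illustration (assuming the default target cost hooks, which return
   1 for scalar_load, scalar_store and scalar_stmt): a loop body with one
   load, one store and two arithmetic assignments costs 1 + 1 + 2*1 = 4
   per scalar iteration; stmts in an inner loop are additionally weighted
   by the FORNOW factor of 50.  */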
2127 /* Calculate cost of peeling the loop PEEL_ITERS_PROLOGUE times. */
2128 int
2129 vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue,
2130 int *peel_iters_epilogue,
2131 int scalar_single_iter_cost)
2133 int peel_guard_costs = 0;
2134 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2136 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
2138 *peel_iters_epilogue = vf/2;
2139 if (vect_print_dump_info (REPORT_COST))
2140 fprintf (vect_dump, "cost model: "
2141 "epilogue peel iters set to vf/2 because "
2142 "loop iterations are unknown .");
2144 /* If peeled iterations are known but the number of scalar loop
2145 iterations is unknown, count a taken branch per peeled loop. */
2146 peel_guard_costs = 2 * vect_get_cost (cond_branch_taken);
2148 else
2150 int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
2151 peel_iters_prologue = niters < peel_iters_prologue ?
2152 niters : peel_iters_prologue;
2153 *peel_iters_epilogue = (niters - peel_iters_prologue) % vf;
2154 /* If we need to peel for gaps, but no epilogue peeling is otherwise
2155 required, we have to peel VF iterations. */
2156 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) && !*peel_iters_epilogue)
2157 *peel_iters_epilogue = vf;
2160 return (peel_iters_prologue * scalar_single_iter_cost)
2161 + (*peel_iters_epilogue * scalar_single_iter_cost)
2162 + peel_guard_costs;
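/* For illustration, with a compile-time-known trip count: niters = 100,
   vf = 4, peel_iters_prologue = 3 and scalar_single_iter_cost = 4 give
   *peel_iters_epilogue = (100 - 3) % 4 = 1, so the result is
   3*4 + 1*4 = 16, with no branch-guard costs because the iteration
   count is known.  */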
2165 /* Function vect_estimate_min_profitable_iters
2167 Return the number of iterations required for the vector version of the
2168 loop to be profitable relative to the cost of the scalar version of the
2169 loop.
2171 TODO: Take profile info into account before making vectorization
2172 decisions, if available. */
2174 int
2175 vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo)
2177 int i;
2178 int min_profitable_iters;
2179 int peel_iters_prologue;
2180 int peel_iters_epilogue;
2181 int vec_inside_cost = 0;
2182 int vec_outside_cost = 0;
2183 int scalar_single_iter_cost = 0;
2184 int scalar_outside_cost = 0;
2185 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2186 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2187 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
2188 int nbbs = loop->num_nodes;
2189 int npeel = LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo);
2190 int peel_guard_costs = 0;
2191 int innerloop_iters = 0, factor;
2192 VEC (slp_instance, heap) *slp_instances;
2193 slp_instance instance;
2195 /* Cost model disabled. */
2196 if (!flag_vect_cost_model)
2198 if (vect_print_dump_info (REPORT_COST))
2199 fprintf (vect_dump, "cost model disabled.");
2200 return 0;
2203 /* Requires loop versioning tests to handle misalignment. */
2204 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
2206 /* FIXME: Make cost depend on complexity of individual check. */
2207 vec_outside_cost +=
2208 VEC_length (gimple, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo));
2209 if (vect_print_dump_info (REPORT_COST))
2210 fprintf (vect_dump, "cost model: Adding cost of checks for loop "
2211 "versioning to treat misalignment.\n");
2214 /* Requires loop versioning with alias checks. */
2215 if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
2217 /* FIXME: Make cost depend on complexity of individual check. */
2218 vec_outside_cost +=
2219 VEC_length (ddr_p, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo));
2220 if (vect_print_dump_info (REPORT_COST))
2221 fprintf (vect_dump, "cost model: Adding cost of checks for loop "
2222 "versioning aliasing.\n");
2225 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
2226 || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
2227 vec_outside_cost += vect_get_cost (cond_branch_taken);
2229 /* Accumulate the inside-of-loop and outside-of-loop vector costs that
2230 the vect_model_*_cost functions recorded per stmt.
2232 TODO: Add outer loop support.
2234 TODO: Consider assigning different costs to different scalar
2235 statements. */
2237 /* FORNOW. */
2238 if (loop->inner)
2239 innerloop_iters = 50; /* FIXME */
2241 for (i = 0; i < nbbs; i++)
2243 gimple_stmt_iterator si;
2244 basic_block bb = bbs[i];
2246 if (bb->loop_father == loop->inner)
2247 factor = innerloop_iters;
2248 else
2249 factor = 1;
2251 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
2253 gimple stmt = gsi_stmt (si);
2254 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2256 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
2258 stmt = STMT_VINFO_RELATED_STMT (stmt_info);
2259 stmt_info = vinfo_for_stmt (stmt);
2262 /* Skip stmts that are not vectorized inside the loop. */
2263 if (!STMT_VINFO_RELEVANT_P (stmt_info)
2264 && (!STMT_VINFO_LIVE_P (stmt_info)
2265 || !VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info))))
2266 continue;
2268 vec_inside_cost += STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info) * factor;
2269 /* FIXME: for stmts in the inner-loop in outer-loop vectorization,
2270 some of the "outside" costs are generated inside the outer-loop. */
2271 vec_outside_cost += STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info);
2275 scalar_single_iter_cost = vect_get_single_scalar_iteraion_cost (loop_vinfo);
2277 /* Add the cost of the peeled instructions executed in the prologue
2278 and epilogue loops.
2280 FORNOW: If we don't know the value of peel_iters for prologue or epilogue
2281 at compile-time - we assume it's vf/2 (the worst would be vf-1).
2283 TODO: Build an expression that represents peel_iters for prologue and
2284 epilogue to be used in a run-time test. */
2286 if (npeel < 0)
2288 peel_iters_prologue = vf/2;
2289 if (vect_print_dump_info (REPORT_COST))
2290 fprintf (vect_dump, "cost model: "
2291 "prologue peel iters set to vf/2.");
2293 /* If peeling for alignment is unknown, loop bound of main loop becomes
2294 unknown. */
2295 peel_iters_epilogue = vf/2;
2296 if (vect_print_dump_info (REPORT_COST))
2297 fprintf (vect_dump, "cost model: "
2298 "epilogue peel iters set to vf/2 because "
2299 "peeling for alignment is unknown .");
2301 /* If peeled iterations are unknown, count a taken branch and a not taken
2302 branch per peeled loop. Even if scalar loop iterations are known,
2303 vector iterations are not known since peeled prologue iterations are
2304 not known. Hence guards remain the same. */
2305 peel_guard_costs += 2 * (vect_get_cost (cond_branch_taken)
2306 + vect_get_cost (cond_branch_not_taken));
2307 vec_outside_cost += (peel_iters_prologue * scalar_single_iter_cost)
2308 + (peel_iters_epilogue * scalar_single_iter_cost)
2309 + peel_guard_costs;
2311 else
2313 peel_iters_prologue = npeel;
2314 vec_outside_cost += vect_get_known_peeling_cost (loop_vinfo,
2315 peel_iters_prologue, &peel_iters_epilogue,
2316 scalar_single_iter_cost);
2319 /* FORNOW: The scalar outside cost is incremented in one of the
2320 following ways:
2322 1. The vectorizer checks for alignment and aliasing and generates
2323 a condition that allows dynamic vectorization. A cost model
2324 check is ANDed with the versioning condition. Hence scalar code
2325 path now has the added cost of the versioning check.
2327 if (cost > th && versioning_check)
2328 jmp to vector code
2330 Hence run-time scalar is incremented by not-taken branch cost.
2332 2. The vectorizer then checks if a prologue is required. If the
2333 cost model check was not done before during versioning, it has to
2334 be done before the prologue check.
2336 if (cost <= th)
2337 prologue = scalar_iters
2338 if (prologue == 0)
2339 jmp to vector code
2340 else
2341 execute prologue
2342 if (prologue == num_iters)
2343 go to exit
2345 Hence the run-time scalar cost is incremented by a taken branch,
2346 plus a not-taken branch, plus a taken branch cost.
2348 3. The vectorizer then checks if an epilogue is required. If the
2349 cost model check was not done before during prologue check, it
2350 has to be done with the epilogue check.
2352 if (prologue == 0)
2353 jmp to vector code
2354 else
2355 execute prologue
2356 if (prologue == num_iters)
2357 go to exit
2358 vector code:
2359 if ((cost <= th) || (scalar_iters-prologue-epilogue == 0))
2360 jmp to epilogue
2362 Hence the run-time scalar cost should be incremented by 2 taken
2363 branches.
2365 TODO: The back end may reorder the BBs differently and reverse
2366 conditions/branch directions. Change the estimates below to
2367 something more reasonable. */
2369 /* If the number of iterations is known and we do not do versioning, we can
2370 decide whether to vectorize at compile time. Hence the scalar version
2371 does not carry cost model guard costs. */
2372 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2373 || LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
2374 || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
2376 /* Cost model check occurs at versioning. */
2377 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
2378 || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
2379 scalar_outside_cost += vect_get_cost (cond_branch_not_taken);
2380 else
2382 /* Cost model check occurs at prologue generation. */
2383 if (LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
2384 scalar_outside_cost += 2 * vect_get_cost (cond_branch_taken)
2385 + vect_get_cost (cond_branch_not_taken);
2386 /* Cost model check occurs at epilogue generation. */
2387 else
2388 scalar_outside_cost += 2 * vect_get_cost (cond_branch_taken);
2392 /* Add SLP costs. */
2393 slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
2394 FOR_EACH_VEC_ELT (slp_instance, slp_instances, i, instance)
2396 vec_outside_cost += SLP_INSTANCE_OUTSIDE_OF_LOOP_COST (instance);
2397 vec_inside_cost += SLP_INSTANCE_INSIDE_OF_LOOP_COST (instance);
2400 /* Calculate number of iterations required to make the vector version
2401 profitable, relative to the loop bodies only. The following condition
2402 must hold true:
2403 SIC * niters + SOC > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC
2404 where
2405 SIC = scalar iteration cost, VIC = vector iteration cost,
2406 VOC = vector outside cost, VF = vectorization factor,
2407 PL_ITERS = prologue iterations, EP_ITERS= epilogue iterations
2408 SOC = scalar outside cost for run time cost model check. */
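/* Solving the condition above for niters gives

     niters > ((VOC - SOC) * VF - VIC * (PL_ITERS + EP_ITERS))
              / (SIC * VF - VIC)

   which is what the computation below implements; since it uses integer
   division, the result is incremented afterwards if truncation made it
   too small.  */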
2410 if ((scalar_single_iter_cost * vf) > vec_inside_cost)
2412 if (vec_outside_cost <= 0)
2413 min_profitable_iters = 1;
2414 else
2416 min_profitable_iters = ((vec_outside_cost - scalar_outside_cost) * vf
2417 - vec_inside_cost * peel_iters_prologue
2418 - vec_inside_cost * peel_iters_epilogue)
2419 / ((scalar_single_iter_cost * vf)
2420 - vec_inside_cost);
2422 if ((scalar_single_iter_cost * vf * min_profitable_iters)
2423 <= ((vec_inside_cost * min_profitable_iters)
2424 + ((vec_outside_cost - scalar_outside_cost) * vf)))
2425 min_profitable_iters++;
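/* For illustration, with made-up costs SIC = 2, VIC = 4, VF = 4,
   VOC = 8, SOC = 0, PL_ITERS = 0 and EP_ITERS = 2:
   (8*4 - 4*0 - 4*2) / (2*4 - 4) = 24/4 = 6, and the truncation check
   2*4*6 = 48 <= 4*6 + 8*4 = 56 bumps the result to 7, the smallest
   niters for which the vector loop wins.  */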
2428 /* Vector version will never be profitable. */
2429 else
2431 if (vect_print_dump_info (REPORT_COST))
2432 fprintf (vect_dump, "cost model: the vector iteration cost = %d "
2433 "divided by the scalar iteration cost = %d "
2434 "is greater or equal to the vectorization factor = %d.",
2435 vec_inside_cost, scalar_single_iter_cost, vf);
2436 return -1;
2439 if (vect_print_dump_info (REPORT_COST))
2441 fprintf (vect_dump, "Cost model analysis: \n");
2442 fprintf (vect_dump, " Vector inside of loop cost: %d\n",
2443 vec_inside_cost);
2444 fprintf (vect_dump, " Vector outside of loop cost: %d\n",
2445 vec_outside_cost);
2446 fprintf (vect_dump, " Scalar iteration cost: %d\n",
2447 scalar_single_iter_cost);
2448 fprintf (vect_dump, " Scalar outside cost: %d\n", scalar_outside_cost);
2449 fprintf (vect_dump, " prologue iterations: %d\n",
2450 peel_iters_prologue);
2451 fprintf (vect_dump, " epilogue iterations: %d\n",
2452 peel_iters_epilogue);
2453 fprintf (vect_dump, " Calculated minimum iters for profitability: %d\n",
2454 min_profitable_iters);
2457 min_profitable_iters =
2458 min_profitable_iters < vf ? vf : min_profitable_iters;
2460 /* Because the condition we create is:
2461 if (niters <= min_profitable_iters)
2462 then skip the vectorized loop. */
2463 min_profitable_iters--;
2465 if (vect_print_dump_info (REPORT_COST))
2466 fprintf (vect_dump, " Profitability threshold = %d\n",
2467 min_profitable_iters);
2469 return min_profitable_iters;
2473 /* TODO: There is a close dependency between vect_model_*_cost and the
2474 vectorizable_* functions. Improve the design to avoid maintenance issues. */
2476 /* Function vect_model_reduction_cost.
2478 Models cost for a reduction operation, including the vector ops
2479 generated within the strip-mine loop, the initial definition before
2480 the loop, and the epilogue code that must be generated. */
2482 static bool
2483 vect_model_reduction_cost (stmt_vec_info stmt_info, enum tree_code reduc_code,
2484 int ncopies)
2486 int outer_cost = 0;
2487 enum tree_code code;
2488 optab optab;
2489 tree vectype;
2490 gimple stmt, orig_stmt;
2491 tree reduction_op;
2492 enum machine_mode mode;
2493 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2494 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2497 /* Cost of reduction op inside loop. */
2498 STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info)
2499 += ncopies * vect_get_cost (vector_stmt);
2501 stmt = STMT_VINFO_STMT (stmt_info);
2503 switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
2505 case GIMPLE_SINGLE_RHS:
2506 gcc_assert (TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt)) == ternary_op);
2507 reduction_op = TREE_OPERAND (gimple_assign_rhs1 (stmt), 2);
2508 break;
2509 case GIMPLE_UNARY_RHS:
2510 reduction_op = gimple_assign_rhs1 (stmt);
2511 break;
2512 case GIMPLE_BINARY_RHS:
2513 reduction_op = gimple_assign_rhs2 (stmt);
2514 break;
2515 default:
2516 gcc_unreachable ();
2519 vectype = get_vectype_for_scalar_type (TREE_TYPE (reduction_op));
2520 if (!vectype)
2522 if (vect_print_dump_info (REPORT_COST))
2524 fprintf (vect_dump, "unsupported data-type ");
2525 print_generic_expr (vect_dump, TREE_TYPE (reduction_op), TDF_SLIM);
2527 return false;
2530 mode = TYPE_MODE (vectype);
2531 orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
2533 if (!orig_stmt)
2534 orig_stmt = STMT_VINFO_STMT (stmt_info);
2536 code = gimple_assign_rhs_code (orig_stmt);
2538 /* Add in cost for initial definition. */
2539 outer_cost += vect_get_cost (scalar_to_vec);
2541 /* Determine cost of epilogue code.
2543 We have a reduction operator that will reduce the vector in one statement.
2544 Also requires scalar extract. */
2546 if (!nested_in_vect_loop_p (loop, orig_stmt))
2548 if (reduc_code != ERROR_MARK)
2549 outer_cost += vect_get_cost (vector_stmt)
2550 + vect_get_cost (vec_to_scalar);
2551 else
2553 int vec_size_in_bits = tree_low_cst (TYPE_SIZE (vectype), 1);
2554 tree bitsize =
2555 TYPE_SIZE (TREE_TYPE (gimple_assign_lhs (orig_stmt)));
2556 int element_bitsize = tree_low_cst (bitsize, 1);
2557 int nelements = vec_size_in_bits / element_bitsize;
2559 optab = optab_for_tree_code (code, vectype, optab_default);
2561 /* We have a whole vector shift available. */
2562 if (VECTOR_MODE_P (mode)
2563 && optab_handler (optab, mode) != CODE_FOR_nothing
2564 && optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
2565 /* Final reduction via vector shifts and the reduction operator. Also
2566 requires scalar extract. */
2567 outer_cost += ((exact_log2 (nelements) * 2)
2568 * vect_get_cost (vector_stmt)
2569 + vect_get_cost (vec_to_scalar));
2570 else
2571 /* Use extracts and reduction op for final reduction. For N elements,
2572 we have N extracts and N-1 reduction ops. */
2573 outer_cost += ((nelements + nelements - 1)
2574 * vect_get_cost (vector_stmt));
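/* For illustration, for a V8HI reduction (nelements = 8) the shift-based
   scheme above costs 2 * log2 (8) = 6 vector stmts plus one vec_to_scalar,
   whereas the extract-based fallback costs 8 + 7 = 15 vector stmts.  */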
2578 STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info) = outer_cost;
2580 if (vect_print_dump_info (REPORT_COST))
2581 fprintf (vect_dump, "vect_model_reduction_cost: inside_cost = %d, "
2582 "outside_cost = %d .", STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info),
2583 STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info));
2585 return true;
2589 /* Function vect_model_induction_cost.
2591 Models cost for induction operations. */
2593 static void
2594 vect_model_induction_cost (stmt_vec_info stmt_info, int ncopies)
2596 /* Loop cost for vec_loop. */
2597 STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info)
2598 = ncopies * vect_get_cost (vector_stmt);
2599 /* Prologue cost for vec_init and vec_step. */
2600 STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info)
2601 = 2 * vect_get_cost (scalar_to_vec);
2603 if (vect_print_dump_info (REPORT_COST))
2604 fprintf (vect_dump, "vect_model_induction_cost: inside_cost = %d, "
2605 "outside_cost = %d .", STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info),
2606 STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info));
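/* For illustration, with ncopies = 2 and the default cost of 1 for
   vector_stmt and scalar_to_vec, this records inside_cost = 2 and
   outside_cost = 2.  */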
2610 /* Function get_initial_def_for_induction
2612 Input:
2613 STMT - a stmt that performs an induction operation in the loop.
2614 IV_PHI - the initial value of the induction variable
2616 Output:
2617 Return a vector variable, initialized with the first VF values of
2618 the induction variable. E.g., for an iv with IV_PHI='X' and
2619 evolution S, for a vector of 4 units, we want to return:
2620 [X, X + S, X + 2*S, X + 3*S]. */
2622 static tree
2623 get_initial_def_for_induction (gimple iv_phi)
2625 stmt_vec_info stmt_vinfo = vinfo_for_stmt (iv_phi);
2626 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
2627 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2628 tree scalar_type;
2629 tree vectype;
2630 int nunits;
2631 edge pe = loop_preheader_edge (loop);
2632 struct loop *iv_loop;
2633 basic_block new_bb;
2634 tree vec, vec_init, vec_step, t;
2635 tree access_fn;
2636 tree new_var;
2637 tree new_name;
2638 gimple init_stmt, induction_phi, new_stmt;
2639 tree induc_def, vec_def, vec_dest;
2640 tree init_expr, step_expr;
2641 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2642 int i;
2643 bool ok;
2644 int ncopies;
2645 tree expr;
2646 stmt_vec_info phi_info = vinfo_for_stmt (iv_phi);
2647 bool nested_in_vect_loop = false;
2648 gimple_seq stmts = NULL;
2649 imm_use_iterator imm_iter;
2650 use_operand_p use_p;
2651 gimple exit_phi;
2652 edge latch_e;
2653 tree loop_arg;
2654 gimple_stmt_iterator si;
2655 basic_block bb = gimple_bb (iv_phi);
2656 tree stepvectype;
2657 tree resvectype;
2659 /* Is phi in an inner-loop, while vectorizing an enclosing outer-loop? */
2660 if (nested_in_vect_loop_p (loop, iv_phi))
2662 nested_in_vect_loop = true;
2663 iv_loop = loop->inner;
2665 else
2666 iv_loop = loop;
2667 gcc_assert (iv_loop == (gimple_bb (iv_phi))->loop_father);
2669 latch_e = loop_latch_edge (iv_loop);
2670 loop_arg = PHI_ARG_DEF_FROM_EDGE (iv_phi, latch_e);
2672 access_fn = analyze_scalar_evolution (iv_loop, PHI_RESULT (iv_phi));
2673 gcc_assert (access_fn);
2674 STRIP_NOPS (access_fn);
2675 ok = vect_is_simple_iv_evolution (iv_loop->num, access_fn,
2676 &init_expr, &step_expr);
2677 gcc_assert (ok);
2678 pe = loop_preheader_edge (iv_loop);
2680 scalar_type = TREE_TYPE (init_expr);
2681 vectype = get_vectype_for_scalar_type (scalar_type);
2682 resvectype = get_vectype_for_scalar_type (TREE_TYPE (PHI_RESULT (iv_phi)));
2683 gcc_assert (vectype);
2684 nunits = TYPE_VECTOR_SUBPARTS (vectype);
2685 ncopies = vf / nunits;
2687 gcc_assert (phi_info);
2688 gcc_assert (ncopies >= 1);
2690 /* Find the first insertion point in the BB. */
2691 si = gsi_after_labels (bb);
2693 /* Create the vector that holds the initial_value of the induction. */
2694 if (nested_in_vect_loop)
2696 /* iv_loop is nested in the loop to be vectorized. init_expr had already
2697 been created during vectorization of previous stmts. We obtain it
2698 from the STMT_VINFO_VEC_STMT of the defining stmt. */
2699 tree iv_def = PHI_ARG_DEF_FROM_EDGE (iv_phi,
2700 loop_preheader_edge (iv_loop));
2701 vec_init = vect_get_vec_def_for_operand (iv_def, iv_phi, NULL);
2703 else
2705 /* iv_loop is the loop to be vectorized. Create:
2706 vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr) */
2707 new_var = vect_get_new_vect_var (scalar_type, vect_scalar_var, "var_");
2708 add_referenced_var (new_var);
2710 new_name = force_gimple_operand (init_expr, &stmts, false, new_var);
2711 if (stmts)
2713 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
2714 gcc_assert (!new_bb);
2717 t = NULL_TREE;
2718 t = tree_cons (NULL_TREE, new_name, t);
2719 for (i = 1; i < nunits; i++)
2721 /* Create: new_name_i = new_name + step_expr */
2722 enum tree_code code = POINTER_TYPE_P (scalar_type)
2723 ? POINTER_PLUS_EXPR : PLUS_EXPR;
2724 init_stmt = gimple_build_assign_with_ops (code, new_var,
2725 new_name, step_expr);
2726 new_name = make_ssa_name (new_var, init_stmt);
2727 gimple_assign_set_lhs (init_stmt, new_name);
2729 new_bb = gsi_insert_on_edge_immediate (pe, init_stmt);
2730 gcc_assert (!new_bb);
2732 if (vect_print_dump_info (REPORT_DETAILS))
2734 fprintf (vect_dump, "created new init_stmt: ");
2735 print_gimple_stmt (vect_dump, init_stmt, 0, TDF_SLIM);
2737 t = tree_cons (NULL_TREE, new_name, t);
2739 /* Create a vector from [new_name_0, new_name_1, ..., new_name_nunits-1] */
2740 vec = build_constructor_from_list (vectype, nreverse (t));
2741 vec_init = vect_init_vector (iv_phi, vec, vectype, NULL);
2745 /* Create the vector that holds the step of the induction. */
2746 if (nested_in_vect_loop)
2747 /* iv_loop is nested in the loop to be vectorized. Generate:
2748 vec_step = [S, S, S, S] */
2749 new_name = step_expr;
2750 else
2752 /* iv_loop is the loop to be vectorized. Generate:
2753 vec_step = [VF*S, VF*S, VF*S, VF*S] */
2754 expr = build_int_cst (TREE_TYPE (step_expr), vf);
2755 new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
2756 expr, step_expr);
2759 t = unshare_expr (new_name);
2760 gcc_assert (CONSTANT_CLASS_P (new_name));
2761 stepvectype = get_vectype_for_scalar_type (TREE_TYPE (new_name));
2762 gcc_assert (stepvectype);
2763 vec = build_vector_from_val (stepvectype, t);
2764 vec_step = vect_init_vector (iv_phi, vec, stepvectype, NULL);
2767 /* Create the following def-use cycle:
2768 loop prolog:
2769 vec_init = ...
2770 vec_step = ...
2771 loop:
2772 vec_iv = PHI <vec_init, vec_loop>
2774 STMT
2776 vec_loop = vec_iv + vec_step; */
2778 /* Create the induction-phi that defines the induction-operand. */
2779 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
2780 add_referenced_var (vec_dest);
2781 induction_phi = create_phi_node (vec_dest, iv_loop->header);
2782 set_vinfo_for_stmt (induction_phi,
2783 new_stmt_vec_info (induction_phi, loop_vinfo, NULL));
2784 induc_def = PHI_RESULT (induction_phi);
2786 /* Create the iv update inside the loop */
2787 new_stmt = gimple_build_assign_with_ops (PLUS_EXPR, vec_dest,
2788 induc_def, vec_step);
2789 vec_def = make_ssa_name (vec_dest, new_stmt);
2790 gimple_assign_set_lhs (new_stmt, vec_def);
2791 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
2792 set_vinfo_for_stmt (new_stmt, new_stmt_vec_info (new_stmt, loop_vinfo,
2793 NULL));
2795 /* Set the arguments of the phi node: */
2796 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
2797 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
2798 UNKNOWN_LOCATION);
2801 /* In case that vectorization factor (VF) is bigger than the number
2802 of elements that we can fit in a vectype (nunits), we have to generate
2803 more than one vector stmt - i.e - we need to "unroll" the
2804 vector stmt by a factor VF/nunits. For more details see documentation
2805 in vectorizable_operation. */
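/* For illustration, VF = 8 with V4SI (nunits = 4) gives ncopies = 2: the
   first copy starts from [X, X+S, X+2*S, X+3*S], each additional copy
   created below adds [4*S, 4*S, 4*S, 4*S], and the loop-carried update
   above steps by [8*S, 8*S, 8*S, 8*S] per vector iteration.  */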
2807 if (ncopies > 1)
2809 stmt_vec_info prev_stmt_vinfo;
2810 /* FORNOW. This restriction should be relaxed. */
2811 gcc_assert (!nested_in_vect_loop);
2813 /* Create the vector that holds the step of the induction. */
2814 expr = build_int_cst (TREE_TYPE (step_expr), nunits);
2815 new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
2816 expr, step_expr);
2817 t = unshare_expr (new_name);
2818 gcc_assert (CONSTANT_CLASS_P (new_name));
2819 vec = build_vector_from_val (stepvectype, t);
2820 vec_step = vect_init_vector (iv_phi, vec, stepvectype, NULL);
2822 vec_def = induc_def;
2823 prev_stmt_vinfo = vinfo_for_stmt (induction_phi);
2824 for (i = 1; i < ncopies; i++)
2826 /* vec_i = vec_prev + vec_step */
2827 new_stmt = gimple_build_assign_with_ops (PLUS_EXPR, vec_dest,
2828 vec_def, vec_step);
2829 vec_def = make_ssa_name (vec_dest, new_stmt);
2830 gimple_assign_set_lhs (new_stmt, vec_def);
2832 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
2833 if (!useless_type_conversion_p (resvectype, vectype))
2835 new_stmt = gimple_build_assign_with_ops
2836 (VIEW_CONVERT_EXPR,
2837 vect_get_new_vect_var (resvectype, vect_simple_var,
2838 "vec_iv_"),
2839 build1 (VIEW_CONVERT_EXPR, resvectype,
2840 gimple_assign_lhs (new_stmt)), NULL_TREE);
2841 gimple_assign_set_lhs (new_stmt,
2842 make_ssa_name
2843 (gimple_assign_lhs (new_stmt), new_stmt));
2844 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
2846 set_vinfo_for_stmt (new_stmt,
2847 new_stmt_vec_info (new_stmt, loop_vinfo, NULL));
2848 STMT_VINFO_RELATED_STMT (prev_stmt_vinfo) = new_stmt;
2849 prev_stmt_vinfo = vinfo_for_stmt (new_stmt);
2853 if (nested_in_vect_loop)
2855 /* Find the loop-closed exit-phi of the induction, and record
2856 the final vector of induction results: */
2857 exit_phi = NULL;
2858 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
2860 if (!flow_bb_inside_loop_p (iv_loop, gimple_bb (USE_STMT (use_p))))
2862 exit_phi = USE_STMT (use_p);
2863 break;
2866 if (exit_phi)
2868 stmt_vec_info stmt_vinfo = vinfo_for_stmt (exit_phi);
2869 /* FORNOW. Currently not supporting the case that an inner-loop induction
2870 is not used in the outer-loop (i.e. it is used only outside the outer-loop). */
2871 gcc_assert (STMT_VINFO_RELEVANT_P (stmt_vinfo)
2872 && !STMT_VINFO_LIVE_P (stmt_vinfo));
2874 STMT_VINFO_VEC_STMT (stmt_vinfo) = new_stmt;
2875 if (vect_print_dump_info (REPORT_DETAILS))
2877 fprintf (vect_dump, "vector of inductions after inner-loop:");
2878 print_gimple_stmt (vect_dump, new_stmt, 0, TDF_SLIM);
2884 if (vect_print_dump_info (REPORT_DETAILS))
2886 fprintf (vect_dump, "transform induction: created def-use cycle: ");
2887 print_gimple_stmt (vect_dump, induction_phi, 0, TDF_SLIM);
2888 fprintf (vect_dump, "\n");
2889 print_gimple_stmt (vect_dump, SSA_NAME_DEF_STMT (vec_def), 0, TDF_SLIM);
2892 STMT_VINFO_VEC_STMT (phi_info) = induction_phi;
2893 if (!useless_type_conversion_p (resvectype, vectype))
2895 new_stmt = gimple_build_assign_with_ops
2896 (VIEW_CONVERT_EXPR,
2897 vect_get_new_vect_var (resvectype, vect_simple_var, "vec_iv_"),
2898 build1 (VIEW_CONVERT_EXPR, resvectype, induc_def), NULL_TREE);
2899 induc_def = make_ssa_name (gimple_assign_lhs (new_stmt), new_stmt);
2900 gimple_assign_set_lhs (new_stmt, induc_def);
2901 si = gsi_start_bb (bb);
2902 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
2903 set_vinfo_for_stmt (new_stmt,
2904 new_stmt_vec_info (new_stmt, loop_vinfo, NULL));
2905 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (new_stmt))
2906 = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (induction_phi));
2909 return induc_def;
2913 /* Function get_initial_def_for_reduction
2915 Input:
2916 STMT - a stmt that performs a reduction operation in the loop.
2917 INIT_VAL - the initial value of the reduction variable
2919 Output:
2920 ADJUSTMENT_DEF - a tree that holds a value to be added to the final result
2921 of the reduction (used for adjusting the epilog - see below).
2922 Return a vector variable, initialized according to the operation that STMT
2923 performs. This vector will be used as the initial value of the
2924 vector of partial results.
2926 Option1 (adjust in epilog): Initialize the vector as follows:
2927 add/bit or/xor: [0,0,...,0,0]
2928 mult/bit and: [1,1,...,1,1]
2929 min/max/cond_expr: [init_val,init_val,..,init_val,init_val]
2930 and when necessary (e.g. add/mult case) let the caller know
2931 that it needs to adjust the result by init_val.
2933 Option2: Initialize the vector as follows:
2934 add/bit or/xor: [init_val,0,0,...,0]
2935 mult/bit and: [init_val,1,1,...,1]
2936 min/max/cond_expr: [init_val,init_val,...,init_val]
2937 and no adjustments are needed.
2939 For example, for the following code:
2941 s = init_val;
2942 for (i=0;i<n;i++)
2943 s = s + a[i];
2945 STMT is 's = s + a[i]', and the reduction variable is 's'.
2946 For a vector of 4 units, we want to return either [0,0,0,init_val],
2947 or [0,0,0,0] and let the caller know that it needs to adjust
2948 the result at the end by 'init_val'.
2950 FORNOW, we use Option1 (the 'adjust in epilog' scheme) if ADJUSTMENT_DEF
2951 is not NULL, because this way the initialization vector is simpler (the
2952 same element in all entries), and Option2 otherwise.
2954 A cost model should help decide between these two schemes. */
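/* For example, for 's += a[i]' with init_val = 5 and a vector of 4 units,
   Option1 returns [0,0,0,0] and sets ADJUSTMENT_DEF to 5 (added back to
   the final result in the epilog), whereas Option2 returns [5,0,0,0] and
   needs no adjustment.  */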
2956 tree
2957 get_initial_def_for_reduction (gimple stmt, tree init_val,
2958 tree *adjustment_def)
2960 stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
2961 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
2962 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2963 tree scalar_type = TREE_TYPE (init_val);
2964 tree vectype = get_vectype_for_scalar_type (scalar_type);
2965 int nunits;
2966 enum tree_code code = gimple_assign_rhs_code (stmt);
2967 tree def_for_init;
2968 tree init_def;
2969 tree t = NULL_TREE;
2970 int i;
2971 bool nested_in_vect_loop = false;
2972 tree init_value;
2973 REAL_VALUE_TYPE real_init_val = dconst0;
2974 int int_init_val = 0;
2975 gimple def_stmt = NULL;
2977 gcc_assert (vectype);
2978 nunits = TYPE_VECTOR_SUBPARTS (vectype);
2980 gcc_assert (POINTER_TYPE_P (scalar_type) || INTEGRAL_TYPE_P (scalar_type)
2981 || SCALAR_FLOAT_TYPE_P (scalar_type));
2983 if (nested_in_vect_loop_p (loop, stmt))
2984 nested_in_vect_loop = true;
2985 else
2986 gcc_assert (loop == (gimple_bb (stmt))->loop_father);
2988 /* In case of double reduction we only create a vector variable to be put
2989 in the reduction phi node. The actual statement creation is done in
2990 vect_create_epilog_for_reduction. */
2991 if (adjustment_def && nested_in_vect_loop
2992 && TREE_CODE (init_val) == SSA_NAME
2993 && (def_stmt = SSA_NAME_DEF_STMT (init_val))
2994 && gimple_code (def_stmt) == GIMPLE_PHI
2995 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2996 && vinfo_for_stmt (def_stmt)
2997 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2998 == vect_double_reduction_def)
3000 *adjustment_def = NULL;
3001 return vect_create_destination_var (init_val, vectype);
3004 if (TREE_CONSTANT (init_val))
3006 if (SCALAR_FLOAT_TYPE_P (scalar_type))
3007 init_value = build_real (scalar_type, TREE_REAL_CST (init_val));
3008 else
3009 init_value = build_int_cst (scalar_type, TREE_INT_CST_LOW (init_val));
3011 else
3012 init_value = init_val;
3014 switch (code)
3016 case WIDEN_SUM_EXPR:
3017 case DOT_PROD_EXPR:
3018 case PLUS_EXPR:
3019 case MINUS_EXPR:
3020 case BIT_IOR_EXPR:
3021 case BIT_XOR_EXPR:
3022 case MULT_EXPR:
3023 case BIT_AND_EXPR:
3024 /* ADJUSTMENT_DEF is NULL when called from
3025 vect_create_epilog_for_reduction to vectorize double reduction. */
3026 if (adjustment_def)
3028 if (nested_in_vect_loop)
3029 *adjustment_def = vect_get_vec_def_for_operand (init_val, stmt,
3030 NULL);
3031 else
3032 *adjustment_def = init_val;
3035 if (code == MULT_EXPR)
3037 real_init_val = dconst1;
3038 int_init_val = 1;
3041 if (code == BIT_AND_EXPR)
3042 int_init_val = -1;
3044 if (SCALAR_FLOAT_TYPE_P (scalar_type))
3045 def_for_init = build_real (scalar_type, real_init_val);
3046 else
3047 def_for_init = build_int_cst (scalar_type, int_init_val);
3049 /* Create the '0' or '1' entries for all elements except the first. */
3050 for (i = nunits - 2; i >= 0; --i)
3051 t = tree_cons (NULL_TREE, def_for_init, t);
3053 /* Option1: the first element is '0' or '1' as well. */
3054 if (adjustment_def)
3056 t = tree_cons (NULL_TREE, def_for_init, t);
3057 init_def = build_vector (vectype, t);
3058 break;
3061 /* Option2: the first element is INIT_VAL. */
3062 t = tree_cons (NULL_TREE, init_value, t);
3063 if (TREE_CONSTANT (init_val))
3064 init_def = build_vector (vectype, t);
3065 else
3066 init_def = build_constructor_from_list (vectype, t);
3068 break;
3070 case MIN_EXPR:
3071 case MAX_EXPR:
3072 case COND_EXPR:
3073 if (adjustment_def)
3075 *adjustment_def = NULL_TREE;
3076 init_def = vect_get_vec_def_for_operand (init_val, stmt, NULL);
3077 break;
3080 init_def = build_vector_from_val (vectype, init_value);
3081 break;
3083 default:
3084 gcc_unreachable ();
3087 return init_def;
3091 /* Function vect_create_epilog_for_reduction
3093 Create code at the loop-epilog to finalize the result of a reduction
3094 computation.
3096 VECT_DEFS is list of vector of partial results, i.e., the lhs's of vector
3097 reduction statements.
3098 STMT is the scalar reduction stmt that is being vectorized.
3099 NCOPIES is > 1 in case the vectorization factor (VF) is bigger than the
3100 number of elements that we can fit in a vectype (nunits). In this case
3101 we have to generate more than one vector stmt - i.e - we need to "unroll"
3102 the vector stmt by a factor VF/nunits. For more details see documentation
3103 in vectorizable_operation.
3104 REDUC_CODE is the tree-code for the epilog reduction.
3105 REDUCTION_PHIS is a list of the phi-nodes that carry the reduction
3106 computation.
3107 REDUC_INDEX is the index of the operand in the right hand side of the
3108 statement that is defined by REDUCTION_PHI.
3109 DOUBLE_REDUC is TRUE if double reduction phi nodes should be handled.
3110 SLP_NODE is an SLP node containing a group of reduction statements. The
3111 first one in this group is STMT.
3113 This function:
3114 1. Creates the reduction def-use cycles: sets the arguments for
3115 REDUCTION_PHIS:
3116 The loop-entry argument is the vectorized initial-value of the reduction.
3117 The loop-latch argument is taken from VECT_DEFS - the vector of partial
3118 sums.
3119 2. "Reduces" each vector of partial results VECT_DEFS into a single result,
3120 by applying the operation specified by REDUC_CODE if available, or by
3121 other means (whole-vector shifts or a scalar loop).
3122 The function also creates a new phi node at the loop exit to preserve
3123 loop-closed form, as illustrated below.
3125 The flow at the entry to this function:
3127 loop:
3128 vec_def = phi <null, null> # REDUCTION_PHI
3129 VECT_DEF = vector_stmt # vectorized form of STMT
3130 s_loop = scalar_stmt # (scalar) STMT
3131 loop_exit:
3132 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
3133 use <s_out0>
3134 use <s_out0>
3136 The above is transformed by this function into:
3138 loop:
3139 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
3140 VECT_DEF = vector_stmt # vectorized form of STMT
3141 s_loop = scalar_stmt # (scalar) STMT
3142 loop_exit:
3143 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
3144 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
3145 v_out2 = reduce <v_out1>
3146 s_out3 = extract_field <v_out2, 0>
3147 s_out4 = adjust_result <s_out3>
3148 use <s_out4>
3149 use <s_out4>
3152 static void
3153 vect_create_epilog_for_reduction (VEC (tree, heap) *vect_defs, gimple stmt,
3154 int ncopies, enum tree_code reduc_code,
3155 VEC (gimple, heap) *reduction_phis,
3156 int reduc_index, bool double_reduc,
3157 slp_tree slp_node)
3159 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3160 stmt_vec_info prev_phi_info;
3161 tree vectype;
3162 enum machine_mode mode;
3163 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3164 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *outer_loop = NULL;
3165 basic_block exit_bb;
3166 tree scalar_dest;
3167 tree scalar_type;
3168 gimple new_phi = NULL, phi;
3169 gimple_stmt_iterator exit_gsi;
3170 tree vec_dest;
3171 tree new_temp = NULL_TREE, new_dest, new_name, new_scalar_dest;
3172 gimple epilog_stmt = NULL;
3173 enum tree_code code = gimple_assign_rhs_code (stmt);
3174 gimple exit_phi;
3175 tree bitsize, bitpos;
3176 tree adjustment_def = NULL;
3177 tree vec_initial_def = NULL;
3178 tree reduction_op, expr, def;
3179 tree orig_name, scalar_result;
3180 imm_use_iterator imm_iter, phi_imm_iter;
3181 use_operand_p use_p, phi_use_p;
3182 bool extract_scalar_result = false;
3183 gimple use_stmt, orig_stmt, reduction_phi = NULL;
3184 bool nested_in_vect_loop = false;
3185 VEC (gimple, heap) *new_phis = NULL;
3186 enum vect_def_type dt = vect_unknown_def_type;
3187 int j, i;
3188 VEC (tree, heap) *scalar_results = NULL;
3189 unsigned int group_size = 1, k, ratio;
3190 VEC (tree, heap) *vec_initial_defs = NULL;
3191 VEC (gimple, heap) *phis;
3193 if (slp_node)
3194 group_size = VEC_length (gimple, SLP_TREE_SCALAR_STMTS (slp_node));
3196 if (nested_in_vect_loop_p (loop, stmt))
3198 outer_loop = loop;
3199 loop = loop->inner;
3200 nested_in_vect_loop = true;
3201 gcc_assert (!slp_node);
3204 switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
3206 case GIMPLE_SINGLE_RHS:
3207 gcc_assert (TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt))
3208 == ternary_op);
3209 reduction_op = TREE_OPERAND (gimple_assign_rhs1 (stmt), reduc_index);
3210 break;
3211 case GIMPLE_UNARY_RHS:
3212 reduction_op = gimple_assign_rhs1 (stmt);
3213 break;
3214 case GIMPLE_BINARY_RHS:
3215 reduction_op = reduc_index ?
3216 gimple_assign_rhs2 (stmt) : gimple_assign_rhs1 (stmt);
3217 break;
3218 default:
3219 gcc_unreachable ();
3222 vectype = get_vectype_for_scalar_type (TREE_TYPE (reduction_op));
3223 gcc_assert (vectype);
3224 mode = TYPE_MODE (vectype);
3226 /* 1. Create the reduction def-use cycle:
3227 Set the arguments of REDUCTION_PHIS, i.e., transform
3229 loop:
3230 vec_def = phi <null, null> # REDUCTION_PHI
3231 VECT_DEF = vector_stmt # vectorized form of STMT
3234 into:
3236 loop:
3237 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
3238 VECT_DEF = vector_stmt # vectorized form of STMT
3241 (in case of SLP, do it for all the phis). */
3243 /* Get the loop-entry arguments. */
3244 if (slp_node)
3245 vect_get_slp_defs (reduction_op, NULL_TREE, slp_node, &vec_initial_defs,
3246 NULL, reduc_index);
3247 else
3249 vec_initial_defs = VEC_alloc (tree, heap, 1);
3250 /* For the case of reduction, vect_get_vec_def_for_operand returns
3251 the scalar def before the loop, that defines the initial value
3252 of the reduction variable. */
3253 vec_initial_def = vect_get_vec_def_for_operand (reduction_op, stmt,
3254 &adjustment_def);
3255 VEC_quick_push (tree, vec_initial_defs, vec_initial_def);
3258 /* Set phi nodes arguments. */
3259 FOR_EACH_VEC_ELT (gimple, reduction_phis, i, phi)
3261 tree vec_init_def = VEC_index (tree, vec_initial_defs, i);
3262 tree def = VEC_index (tree, vect_defs, i);
3263 for (j = 0; j < ncopies; j++)
3265 /* Set the loop-entry arg of the reduction-phi. */
3266 add_phi_arg (phi, vec_init_def, loop_preheader_edge (loop),
3267 UNKNOWN_LOCATION);
3269 /* Set the loop-latch arg for the reduction-phi. */
3270 if (j > 0)
3271 def = vect_get_vec_def_for_stmt_copy (vect_unknown_def_type, def);
3273 add_phi_arg (phi, def, loop_latch_edge (loop), UNKNOWN_LOCATION);
3275 if (vect_print_dump_info (REPORT_DETAILS))
3277 fprintf (vect_dump, "transform reduction: created def-use"
3278 " cycle: ");
3279 print_gimple_stmt (vect_dump, phi, 0, TDF_SLIM);
3280 fprintf (vect_dump, "\n");
3281 print_gimple_stmt (vect_dump, SSA_NAME_DEF_STMT (def), 0,
3282 TDF_SLIM);
3285 phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi));
3289 VEC_free (tree, heap, vec_initial_defs);
3291 /* 2. Create epilog code.
3292 The reduction epilog code operates across the elements of the vector
3293 of partial results computed by the vectorized loop.
3294 The reduction epilog code consists of:
3296 step 1: compute the scalar result in a vector (v_out2)
3297 step 2: extract the scalar result (s_out3) from the vector (v_out2)
3298 step 3: adjust the scalar result (s_out3) if needed.
3300 Step 1 can be accomplished using one of the following three schemes:
3301 (scheme 1) using reduc_code, if available.
3302 (scheme 2) using whole-vector shifts, if available.
3303 (scheme 3) using a scalar loop. In this case steps 1+2 above are
3304 combined.
3306 The overall epilog code looks like this:
3308 s_out0 = phi <s_loop> # original EXIT_PHI
3309 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
3310 v_out2 = reduce <v_out1> # step 1
3311 s_out3 = extract_field <v_out2, 0> # step 2
3312 s_out4 = adjust_result <s_out3> # step 3
3314 (step 3 is optional, and steps 1 and 2 may be combined).
3315 Lastly, the uses of s_out0 are replaced by s_out4. */
3318 /* 2.1 Create new loop-exit-phis to preserve loop-closed form:
3319 v_out1 = phi <VECT_DEF>
3320 Store them in NEW_PHIS. */
3322 exit_bb = single_exit (loop)->dest;
3323 prev_phi_info = NULL;
3324 new_phis = VEC_alloc (gimple, heap, VEC_length (tree, vect_defs));
3325 FOR_EACH_VEC_ELT (tree, vect_defs, i, def)
3327 for (j = 0; j < ncopies; j++)
3329 phi = create_phi_node (SSA_NAME_VAR (def), exit_bb);
3330 set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, loop_vinfo, NULL));
3331 if (j == 0)
3332 VEC_quick_push (gimple, new_phis, phi);
3333 else
3335 def = vect_get_vec_def_for_stmt_copy (dt, def);
3336 STMT_VINFO_RELATED_STMT (prev_phi_info) = phi;
3339 SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, def);
3340 prev_phi_info = vinfo_for_stmt (phi);
3344 /* The epilogue is created for the outer-loop, i.e., for the loop being
3345 vectorized. */
3346 if (double_reduc)
3348 loop = outer_loop;
3349 exit_bb = single_exit (loop)->dest;
3352 exit_gsi = gsi_after_labels (exit_bb);
3354 /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
3355 (i.e. when reduc_code is not available) and in the final adjustment
3356 code (if needed). Also get the original scalar reduction variable as
3357 defined in the loop. In case STMT is a "pattern-stmt" (i.e. - it
3358 represents a reduction pattern), the tree-code and scalar-def are
3359 taken from the original stmt that the pattern-stmt (STMT) replaces.
3360 Otherwise (it is a regular reduction) - the tree-code and scalar-def
3361 are taken from STMT. */
3363 orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
3364 if (!orig_stmt)
3366 /* Regular reduction */
3367 orig_stmt = stmt;
3369 else
3371 /* Reduction pattern */
3372 stmt_vec_info stmt_vinfo = vinfo_for_stmt (orig_stmt);
3373 gcc_assert (STMT_VINFO_IN_PATTERN_P (stmt_vinfo));
3374 gcc_assert (STMT_VINFO_RELATED_STMT (stmt_vinfo) == stmt);
3377 code = gimple_assign_rhs_code (orig_stmt);
3378 /* For MINUS_EXPR the initial vector is [init_val,0,...,0], therefore,
3379 partial results are added and not subtracted. */
3380 if (code == MINUS_EXPR)
3381 code = PLUS_EXPR;
3383 scalar_dest = gimple_assign_lhs (orig_stmt);
3384 scalar_type = TREE_TYPE (scalar_dest);
3385 scalar_results = VEC_alloc (tree, heap, group_size);
3386 new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
3387 bitsize = TYPE_SIZE (scalar_type);
3389 /* In case this is a reduction in an inner-loop while vectorizing an outer
3390 loop - we don't need to extract a single scalar result at the end of the
3391 inner-loop (unless it is double reduction, i.e., the use of reduction is
3392 outside the outer-loop). The final vector of partial results will be used
3393 in the vectorized outer-loop, or reduced to a scalar result at the end of
3394 the outer-loop. */
3395 if (nested_in_vect_loop && !double_reduc)
3396 goto vect_finalize_reduction;
3398 /* 2.3 Create the reduction code, using one of the three schemes described
3399 above. In SLP we simply need to extract all the elements from the
3400 vector (without reducing them), so we extract the elements one by one
3400 (scheme 3). */
3401 if (reduc_code != ERROR_MARK && !slp_node)
3403 tree tmp;
3405 /*** Case 1: Create:
3406 v_out2 = reduc_expr <v_out1> */
3408 if (vect_print_dump_info (REPORT_DETAILS))
3409 fprintf (vect_dump, "Reduce using direct vector reduction.");
3411 vec_dest = vect_create_destination_var (scalar_dest, vectype);
3412 new_phi = VEC_index (gimple, new_phis, 0);
3413 tmp = build1 (reduc_code, vectype, PHI_RESULT (new_phi));
3414 epilog_stmt = gimple_build_assign (vec_dest, tmp);
3415 new_temp = make_ssa_name (vec_dest, epilog_stmt);
3416 gimple_assign_set_lhs (epilog_stmt, new_temp);
3417 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
3419 extract_scalar_result = true;
3421 else
3423 enum tree_code shift_code = ERROR_MARK;
3424 bool have_whole_vector_shift = true;
3425 int bit_offset;
3426 int element_bitsize = tree_low_cst (bitsize, 1);
3427 int vec_size_in_bits = tree_low_cst (TYPE_SIZE (vectype), 1);
3428 tree vec_temp;
3430 if (optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
3431 shift_code = VEC_RSHIFT_EXPR;
3432 else
3433 have_whole_vector_shift = false;
3435 /* Regardless of whether we have a whole vector shift, if we're
3436 emulating the operation via tree-vect-generic, we don't want
3437 to use it. Only the first round of the reduction is likely
3438 to still be profitable via emulation. */
3439 /* ??? It might be better to emit a reduction tree code here, so that
3440 tree-vect-generic can expand the first round via bit tricks. */
3441 if (!VECTOR_MODE_P (mode))
3442 have_whole_vector_shift = false;
3443 else
3445 optab optab = optab_for_tree_code (code, vectype, optab_default);
3446 if (optab_handler (optab, mode) == CODE_FOR_nothing)
3447 have_whole_vector_shift = false;
3450 if (have_whole_vector_shift && !slp_node)
3452 /*** Case 2: Create:
3453 for (offset = VS/2; offset >= element_size; offset/=2)
3455 Create: va' = vec_shift <va, offset>
3456 Create: va = vop <va, va'>
3457 } */
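/* For illustration, for a 4-element vector [a,b,c,d] and a PLUS
   reduction: the first iteration shifts by half the vector size and
   adds, leaving partial sums a+c and b+d in two lanes; the second
   shifts by one element and adds, leaving a+b+c+d in the lane that
   step 2.4 below extracts.  */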
3459 if (vect_print_dump_info (REPORT_DETAILS))
3460 fprintf (vect_dump, "Reduce using vector shifts");
3462 vec_dest = vect_create_destination_var (scalar_dest, vectype);
3463 new_phi = VEC_index (gimple, new_phis, 0);
3464 new_temp = PHI_RESULT (new_phi);
3465 for (bit_offset = vec_size_in_bits/2;
3466 bit_offset >= element_bitsize;
3467 bit_offset /= 2)
3469 tree bitpos = size_int (bit_offset);
3471 epilog_stmt = gimple_build_assign_with_ops (shift_code,
3472 vec_dest, new_temp, bitpos);
3473 new_name = make_ssa_name (vec_dest, epilog_stmt);
3474 gimple_assign_set_lhs (epilog_stmt, new_name);
3475 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
3477 epilog_stmt = gimple_build_assign_with_ops (code, vec_dest,
3478 new_name, new_temp);
3479 new_temp = make_ssa_name (vec_dest, epilog_stmt);
3480 gimple_assign_set_lhs (epilog_stmt, new_temp);
3481 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
3484 extract_scalar_result = true;
3486 else
3488 tree rhs;
3490 /*** Case 3: Create:
3491 s = extract_field <v_out2, 0>
3492 for (offset = element_size;
3493 offset < vector_size;
3494 offset += element_size)
3496 Create: s' = extract_field <v_out2, offset>
3497 Create: s = op <s, s'> // For non SLP cases
3498 } */
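/* For illustration, for [a,b,c,d] in the non-SLP case this emits
   s = a; s = op (s, b); s = op (s, c); s = op (s, d): N extracts and
   N-1 scalar ops, matching the fallback cost in
   vect_model_reduction_cost.  */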
3500 if (vect_print_dump_info (REPORT_DETAILS))
3501 fprintf (vect_dump, "Reduce using scalar code. ");
3503 vec_size_in_bits = tree_low_cst (TYPE_SIZE (vectype), 1);
3504 FOR_EACH_VEC_ELT (gimple, new_phis, i, new_phi)
3506 vec_temp = PHI_RESULT (new_phi);
3507 rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, bitsize,
3508 bitsize_zero_node);
3509 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
3510 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
3511 gimple_assign_set_lhs (epilog_stmt, new_temp);
3512 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
3514 /* In SLP we don't need to apply reduction operation, so we just
3515 collect s' values in SCALAR_RESULTS. */
3516 if (slp_node)
3517 VEC_safe_push (tree, heap, scalar_results, new_temp);
3519 for (bit_offset = element_bitsize;
3520 bit_offset < vec_size_in_bits;
3521 bit_offset += element_bitsize)
3523 tree bitpos = bitsize_int (bit_offset);
3524 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp,
3525 bitsize, bitpos);
3527 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
3528 new_name = make_ssa_name (new_scalar_dest, epilog_stmt);
3529 gimple_assign_set_lhs (epilog_stmt, new_name);
3530 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
3532 if (slp_node)
3534 /* In SLP we don't need to apply reduction operation, so
3535 we just collect s' values in SCALAR_RESULTS. */
3536 new_temp = new_name;
3537 VEC_safe_push (tree, heap, scalar_results, new_name);
3539 else
3541 epilog_stmt = gimple_build_assign_with_ops (code,
3542 new_scalar_dest, new_name, new_temp);
3543 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
3544 gimple_assign_set_lhs (epilog_stmt, new_temp);
3545 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
3550 /* The only case where we need to reduce scalar results in SLP is
3551 unrolling. If the size of SCALAR_RESULTS is greater than
3552 GROUP_SIZE, we reduce them combining elements modulo
3553 GROUP_SIZE. */
3554 if (slp_node)
3556 tree res, first_res, new_res;
3557 gimple new_stmt;
3559 /* Reduce multiple scalar results in case of SLP unrolling. */
3560 for (j = group_size; VEC_iterate (tree, scalar_results, j, res);
3561 j++)
3563 first_res = VEC_index (tree, scalar_results, j % group_size);
3564 new_stmt = gimple_build_assign_with_ops (code,
3565 new_scalar_dest, first_res, res);
3566 new_res = make_ssa_name (new_scalar_dest, new_stmt);
3567 gimple_assign_set_lhs (new_stmt, new_res);
3568 gsi_insert_before (&exit_gsi, new_stmt, GSI_SAME_STMT);
3569 VEC_replace (tree, scalar_results, j % group_size, new_res);
3572 else
3573 /* Not SLP - we have one scalar to keep in SCALAR_RESULTS. */
3574 VEC_safe_push (tree, heap, scalar_results, new_temp);
3576 extract_scalar_result = false;
3580 /* 2.4 Extract the final scalar result. Create:
3581 s_out3 = extract_field <v_out2, bitpos> */
3583 if (extract_scalar_result)
3585 tree rhs;
3587 if (vect_print_dump_info (REPORT_DETAILS))
3588 fprintf (vect_dump, "extract scalar result");
3590 if (BYTES_BIG_ENDIAN)
3591 bitpos = size_binop (MULT_EXPR,
3592 bitsize_int (TYPE_VECTOR_SUBPARTS (vectype) - 1),
3593 TYPE_SIZE (scalar_type));
3594 else
3595 bitpos = bitsize_zero_node;
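      /* E.g. for a V4SI vector of 32-bit elements on a big-endian target
         the computation above gives bitpos = (4 - 1) * 32 = 96, so the
         BIT_FIELD_REF below extracts the scalar result from the highest
         bit position rather than from bit 0.  */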
3597 rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp, bitsize, bitpos);
3598 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
3599 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
3600 gimple_assign_set_lhs (epilog_stmt, new_temp);
3601 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
3602 VEC_safe_push (tree, heap, scalar_results, new_temp);
3605 vect_finalize_reduction:
3607 if (double_reduc)
3608 loop = loop->inner;
3610 /* 2.5 Adjust the final result by the initial value of the reduction
3611 variable. (When such adjustment is not needed, then
3612 'adjustment_def' is zero). For example, if code is PLUS we create:
3613 new_temp = loop_exit_def + adjustment_def */
3615 if (adjustment_def)
3617 gcc_assert (!slp_node);
3618 if (nested_in_vect_loop)
3620 new_phi = VEC_index (gimple, new_phis, 0);
3621 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) == VECTOR_TYPE);
3622 expr = build2 (code, vectype, PHI_RESULT (new_phi), adjustment_def);
3623 new_dest = vect_create_destination_var (scalar_dest, vectype);
3625 else
3627 new_temp = VEC_index (tree, scalar_results, 0);
3628 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
3629 expr = build2 (code, scalar_type, new_temp, adjustment_def);
3630 new_dest = vect_create_destination_var (scalar_dest, scalar_type);
3633 epilog_stmt = gimple_build_assign (new_dest, expr);
3634 new_temp = make_ssa_name (new_dest, epilog_stmt);
3635 gimple_assign_set_lhs (epilog_stmt, new_temp);
3636 SSA_NAME_DEF_STMT (new_temp) = epilog_stmt;
3637 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
3638 if (nested_in_vect_loop)
3640 set_vinfo_for_stmt (epilog_stmt,
3641 new_stmt_vec_info (epilog_stmt, loop_vinfo,
3642 NULL));
3643 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (epilog_stmt)) =
3644 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (new_phi));
3646 if (!double_reduc)
3647 VEC_quick_push (tree, scalar_results, new_temp);
3648 else
3649 VEC_replace (tree, scalar_results, 0, new_temp);
3651 else
3652 VEC_replace (tree, scalar_results, 0, new_temp);
3654 VEC_replace (gimple, new_phis, 0, epilog_stmt);
3657 /* 2.6 Handle the loop-exit phis. Replace the uses of scalar loop-exit
3658 phis with new adjusted scalar results, i.e., replace use <s_out0>
3659 with use <s_out4>.
3661 Transform:
3662 loop_exit:
3663 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
3664 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
3665 v_out2 = reduce <v_out1>
3666 s_out3 = extract_field <v_out2, 0>
3667 s_out4 = adjust_result <s_out3>
3668 use <s_out0>
3669 use <s_out0>
3671 into:
3673 loop_exit:
3674 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
3675 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
3676 v_out2 = reduce <v_out1>
3677 s_out3 = extract_field <v_out2, 0>
3678 s_out4 = adjust_result <s_out3>
3679 use <s_out4>
3680 use <s_out4> */
3682   /* In SLP we may have several statements in NEW_PHIS and REDUCTION_PHIS (in
3683      case GROUP_SIZE is greater than the vectorization factor).  Therefore, we
3684 need to match SCALAR_RESULTS with corresponding statements. The first
3685 (GROUP_SIZE / number of new vector stmts) scalar results correspond to
3686 the first vector stmt, etc.
3687 (RATIO is equal to (GROUP_SIZE / number of new vector stmts)). */
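  /* For example, with GROUP_SIZE equal to 8 and two vector stmts in
     NEW_PHIS, RATIO is 4: scalar results 0-3 are matched with the first
     vector stmt, and scalar results 4-7 with the second one.  */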
3688 if (group_size > VEC_length (gimple, new_phis))
3690 ratio = group_size / VEC_length (gimple, new_phis);
3691 gcc_assert (!(group_size % VEC_length (gimple, new_phis)));
3693 else
3694 ratio = 1;
3696 for (k = 0; k < group_size; k++)
3698 if (k % ratio == 0)
3700 epilog_stmt = VEC_index (gimple, new_phis, k / ratio);
3701 reduction_phi = VEC_index (gimple, reduction_phis, k / ratio);
3704 if (slp_node)
3706 gimple current_stmt = VEC_index (gimple,
3707 SLP_TREE_SCALAR_STMTS (slp_node), k);
3709 orig_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (current_stmt));
3710 /* SLP statements can't participate in patterns. */
3711 gcc_assert (!orig_stmt);
3712 scalar_dest = gimple_assign_lhs (current_stmt);
3715 phis = VEC_alloc (gimple, heap, 3);
3716 /* Find the loop-closed-use at the loop exit of the original scalar
3717 result. (The reduction result is expected to have two immediate uses -
3718 one at the latch block, and one at the loop exit). */
3719 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
3720 if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
3721 VEC_safe_push (gimple, heap, phis, USE_STMT (use_p));
3723 /* We expect to have found an exit_phi because of loop-closed-ssa
3724 form. */
3725 gcc_assert (!VEC_empty (gimple, phis));
3727 FOR_EACH_VEC_ELT (gimple, phis, i, exit_phi)
3729 if (outer_loop)
3731 stmt_vec_info exit_phi_vinfo = vinfo_for_stmt (exit_phi);
3732 gimple vect_phi;
3734 /* FORNOW. Currently not supporting the case that an inner-loop
3735 reduction is not used in the outer-loop (but only outside the
3736              outer-loop), unless it is a double reduction.  */
3737 gcc_assert ((STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
3738 && !STMT_VINFO_LIVE_P (exit_phi_vinfo))
3739 || double_reduc);
3741 STMT_VINFO_VEC_STMT (exit_phi_vinfo) = epilog_stmt;
3742 if (!double_reduc
3743 || STMT_VINFO_DEF_TYPE (exit_phi_vinfo)
3744 != vect_double_reduction_def)
3745 continue;
3747 /* Handle double reduction:
3749 stmt1: s1 = phi <s0, s2> - double reduction phi (outer loop)
3750 stmt2: s3 = phi <s1, s4> - (regular) reduc phi (inner loop)
3751 stmt3: s4 = use (s3) - (regular) reduc stmt (inner loop)
3752 stmt4: s2 = phi <s4> - double reduction stmt (outer loop)
3754 At that point the regular reduction (stmt2 and stmt3) is
3755 already vectorized, as well as the exit phi node, stmt4.
3756 Here we vectorize the phi node of double reduction, stmt1, and
3757 update all relevant statements. */
3759 /* Go through all the uses of s2 to find double reduction phi
3760 node, i.e., stmt1 above. */
3761 orig_name = PHI_RESULT (exit_phi);
3762 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
3764 stmt_vec_info use_stmt_vinfo = vinfo_for_stmt (use_stmt);
3765 stmt_vec_info new_phi_vinfo;
3766 tree vect_phi_init, preheader_arg, vect_phi_res, init_def;
3767 basic_block bb = gimple_bb (use_stmt);
3768 gimple use;
3770               /* Check that USE_STMT is really a double reduction phi
3771 node. */
3772 if (gimple_code (use_stmt) != GIMPLE_PHI
3773 || gimple_phi_num_args (use_stmt) != 2
3774 || !use_stmt_vinfo
3775 || STMT_VINFO_DEF_TYPE (use_stmt_vinfo)
3776 != vect_double_reduction_def
3777 || bb->loop_father != outer_loop)
3778 continue;
3780 /* Create vector phi node for double reduction:
3781 vs1 = phi <vs0, vs2>
3782 vs1 was created previously in this function by a call to
3783 vect_get_vec_def_for_operand and is stored in
3784 vec_initial_def;
3785 vs2 is defined by EPILOG_STMT, the vectorized EXIT_PHI;
3786 vs0 is created here. */
3788 /* Create vector phi node. */
3789 vect_phi = create_phi_node (vec_initial_def, bb);
3790 new_phi_vinfo = new_stmt_vec_info (vect_phi,
3791 loop_vec_info_for_loop (outer_loop), NULL);
3792 set_vinfo_for_stmt (vect_phi, new_phi_vinfo);
3794 /* Create vs0 - initial def of the double reduction phi. */
3795 preheader_arg = PHI_ARG_DEF_FROM_EDGE (use_stmt,
3796 loop_preheader_edge (outer_loop));
3797 init_def = get_initial_def_for_reduction (stmt,
3798 preheader_arg, NULL);
3799 vect_phi_init = vect_init_vector (use_stmt, init_def,
3800 vectype, NULL);
3802 /* Update phi node arguments with vs0 and vs2. */
3803 add_phi_arg (vect_phi, vect_phi_init,
3804 loop_preheader_edge (outer_loop),
3805 UNKNOWN_LOCATION);
3806 add_phi_arg (vect_phi, PHI_RESULT (epilog_stmt),
3807 loop_latch_edge (outer_loop), UNKNOWN_LOCATION);
3808 if (vect_print_dump_info (REPORT_DETAILS))
3810 fprintf (vect_dump, "created double reduction phi "
3811 "node: ");
3812 print_gimple_stmt (vect_dump, vect_phi, 0, TDF_SLIM);
3815 vect_phi_res = PHI_RESULT (vect_phi);
3817 /* Replace the use, i.e., set the correct vs1 in the regular
3818 reduction phi node. FORNOW, NCOPIES is always 1, so the
3819 loop is redundant. */
3820 use = reduction_phi;
3821 for (j = 0; j < ncopies; j++)
3823 edge pr_edge = loop_preheader_edge (loop);
3824 SET_PHI_ARG_DEF (use, pr_edge->dest_idx, vect_phi_res);
3825 use = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (use));
3831 VEC_free (gimple, heap, phis);
3832 if (nested_in_vect_loop)
3834 if (double_reduc)
3835 loop = outer_loop;
3836 else
3837 continue;
3840 phis = VEC_alloc (gimple, heap, 3);
3841 /* Find the loop-closed-use at the loop exit of the original scalar
3842 result. (The reduction result is expected to have two immediate uses,
3843 one at the latch block, and one at the loop exit). For double
3844 reductions we are looking for exit phis of the outer loop. */
3845 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
3847 if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
3848 VEC_safe_push (gimple, heap, phis, USE_STMT (use_p));
3849 else
3851 if (double_reduc && gimple_code (USE_STMT (use_p)) == GIMPLE_PHI)
3853 tree phi_res = PHI_RESULT (USE_STMT (use_p));
3855 FOR_EACH_IMM_USE_FAST (phi_use_p, phi_imm_iter, phi_res)
3857 if (!flow_bb_inside_loop_p (loop,
3858 gimple_bb (USE_STMT (phi_use_p))))
3859 VEC_safe_push (gimple, heap, phis,
3860 USE_STMT (phi_use_p));
3866 FOR_EACH_VEC_ELT (gimple, phis, i, exit_phi)
3868 /* Replace the uses: */
3869 orig_name = PHI_RESULT (exit_phi);
3870 scalar_result = VEC_index (tree, scalar_results, k);
3871 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
3872 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
3873 SET_USE (use_p, scalar_result);
3876 VEC_free (gimple, heap, phis);
3879 VEC_free (tree, heap, scalar_results);
3880 VEC_free (gimple, heap, new_phis);
3884 /* Function vectorizable_reduction.
3886 Check if STMT performs a reduction operation that can be vectorized.
3887 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
3888 stmt to replace it, put it in VEC_STMT, and insert it at GSI.
3889 Return FALSE if not a vectorizable STMT, TRUE otherwise.
3891 This function also handles reduction idioms (patterns) that have been
3892 recognized in advance during vect_pattern_recog. In this case, STMT may be
3893 of this form:
3894 X = pattern_expr (arg0, arg1, ..., X)
3895    and its STMT_VINFO_RELATED_STMT points to the last stmt in the original
3896 sequence that had been detected and replaced by the pattern-stmt (STMT).
3898 In some cases of reduction patterns, the type of the reduction variable X is
3899    different from the type of the other arguments of STMT.
3900 In such cases, the vectype that is used when transforming STMT into a vector
3901    stmt is different from the vectype that is used to determine the
3902 vectorization factor, because it consists of a different number of elements
3903 than the actual number of elements that are being operated upon in parallel.
3905 For example, consider an accumulation of shorts into an int accumulator.
3906 On some targets it's possible to vectorize this pattern operating on 8
3907 shorts at a time (hence, the vectype for purposes of determining the
3908 vectorization factor should be V8HI); on the other hand, the vectype that
3909 is used to create the vector form is actually V4SI (the type of the result).
3911 Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
3912 indicates what is the actual level of parallelism (V8HI in the example), so
3913 that the right vectorization factor would be derived. This vectype
3914 corresponds to the type of arguments to the reduction stmt, and should *NOT*
3915 be used to create the vectorized stmt. The right vectype for the vectorized
3916 stmt is obtained from the type of the result X:
3917 get_vectype_for_scalar_type (TREE_TYPE (X))
3919 This means that, contrary to "regular" reductions (or "regular" stmts in
3920 general), the following equation:
3921 STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (TREE_TYPE (X))
3922 does *NOT* necessarily hold for reduction patterns. */
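 /* As an illustration (a sketch, assuming a target that supports a
    widening-sum operation on V8HI), the situation above arises from a
    loop such as:

      short a[N]; int i, sum = 0;
      for (i = 0; i < N; i++)
        sum += a[i];   // S1: int_a = (int) a[i];  S2: sum = sum + int_a;

    Here the vectype used to determine the vectorization factor is V8HI
    (eight shorts are consumed per vector iteration), while the stmt
    created for the vectorized reduction produces V4SI partial sums.  */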
3924 bool
3925 vectorizable_reduction (gimple stmt, gimple_stmt_iterator *gsi,
3926 gimple *vec_stmt, slp_tree slp_node)
3928 tree vec_dest;
3929 tree scalar_dest;
3930 tree loop_vec_def0 = NULL_TREE, loop_vec_def1 = NULL_TREE;
3931 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3932 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
3933 tree vectype_in = NULL_TREE;
3934 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3935 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3936 enum tree_code code, orig_code, epilog_reduc_code;
3937 enum machine_mode vec_mode;
3938 int op_type;
3939 optab optab, reduc_optab;
3940 tree new_temp = NULL_TREE;
3941 tree def;
3942 gimple def_stmt;
3943 enum vect_def_type dt;
3944 gimple new_phi = NULL;
3945 tree scalar_type;
3946 bool is_simple_use;
3947 gimple orig_stmt;
3948 stmt_vec_info orig_stmt_info;
3949 tree expr = NULL_TREE;
3950 int i;
3951 int ncopies;
3952 int epilog_copies;
3953 stmt_vec_info prev_stmt_info, prev_phi_info;
3954 bool single_defuse_cycle = false;
3955 tree reduc_def = NULL_TREE;
3956 gimple new_stmt = NULL;
3957 int j;
3958 tree ops[3];
3959 bool nested_cycle = false, found_nested_cycle_def = false;
3960 gimple reduc_def_stmt = NULL;
3961   /* The default is that the reduction variable is the last operand in the statement.  */
3962 int reduc_index = 2;
3963 bool double_reduc = false, dummy;
3964 basic_block def_bb;
3965   struct loop *def_stmt_loop, *outer_loop = NULL;
3966 tree def_arg;
3967 gimple def_arg_stmt;
3968 VEC (tree, heap) *vec_oprnds0 = NULL, *vec_oprnds1 = NULL, *vect_defs = NULL;
3969 VEC (gimple, heap) *phis = NULL;
3970 int vec_num;
3971 tree def0, def1, tem;
3973 if (nested_in_vect_loop_p (loop, stmt))
3975 outer_loop = loop;
3976 loop = loop->inner;
3977 nested_cycle = true;
3980 /* 1. Is vectorizable reduction? */
3981 /* Not supportable if the reduction variable is used in the loop. */
3982 if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer)
3983 return false;
3985   /* Reductions that are not used even in an enclosing outer-loop
3986      are expected to be "live" (used out of the loop).  */
3987 if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope
3988 && !STMT_VINFO_LIVE_P (stmt_info))
3989 return false;
3991 /* Make sure it was already recognized as a reduction computation. */
3992 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def
3993 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_nested_cycle)
3994 return false;
3996 /* 2. Has this been recognized as a reduction pattern?
3998 Check if STMT represents a pattern that has been recognized
3999 in earlier analysis stages. For stmts that represent a pattern,
4000 the STMT_VINFO_RELATED_STMT field records the last stmt in
4001 the original sequence that constitutes the pattern. */
4003 orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
4004 if (orig_stmt)
4006 orig_stmt_info = vinfo_for_stmt (orig_stmt);
4007 gcc_assert (STMT_VINFO_RELATED_STMT (orig_stmt_info) == stmt);
4008 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
4009 gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
4012 /* 3. Check the operands of the operation. The first operands are defined
4013 inside the loop body. The last operand is the reduction variable,
4014 which is defined by the loop-header-phi. */
4016 gcc_assert (is_gimple_assign (stmt));
4018 /* Flatten RHS. */
4019 switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
4021 case GIMPLE_SINGLE_RHS:
4022 op_type = TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt));
4023 if (op_type == ternary_op)
4025 tree rhs = gimple_assign_rhs1 (stmt);
4026 ops[0] = TREE_OPERAND (rhs, 0);
4027 ops[1] = TREE_OPERAND (rhs, 1);
4028 ops[2] = TREE_OPERAND (rhs, 2);
4029 code = TREE_CODE (rhs);
4031 else
4032 return false;
4033 break;
4035 case GIMPLE_BINARY_RHS:
4036 code = gimple_assign_rhs_code (stmt);
4037 op_type = TREE_CODE_LENGTH (code);
4038 gcc_assert (op_type == binary_op);
4039 ops[0] = gimple_assign_rhs1 (stmt);
4040 ops[1] = gimple_assign_rhs2 (stmt);
4041 break;
4043 case GIMPLE_UNARY_RHS:
4044 return false;
4046 default:
4047 gcc_unreachable ();
4050 scalar_dest = gimple_assign_lhs (stmt);
4051 scalar_type = TREE_TYPE (scalar_dest);
4052 if (!POINTER_TYPE_P (scalar_type) && !INTEGRAL_TYPE_P (scalar_type)
4053 && !SCALAR_FLOAT_TYPE_P (scalar_type))
4054 return false;
4056 /* All uses but the last are expected to be defined in the loop.
4057      The last use is the reduction variable.  In case of a nested cycle this
4058 assumption is not true: we use reduc_index to record the index of the
4059 reduction variable. */
4060 for (i = 0; i < op_type-1; i++)
4062 /* The condition of COND_EXPR is checked in vectorizable_condition(). */
4063 if (i == 0 && code == COND_EXPR)
4064 continue;
4066 is_simple_use = vect_is_simple_use_1 (ops[i], loop_vinfo, NULL,
4067 &def_stmt, &def, &dt, &tem);
4068 if (!vectype_in)
4069 vectype_in = tem;
4070 gcc_assert (is_simple_use);
4071 if (dt != vect_internal_def
4072 && dt != vect_external_def
4073 && dt != vect_constant_def
4074 && dt != vect_induction_def
4075 && !(dt == vect_nested_cycle && nested_cycle))
4076 return false;
4078 if (dt == vect_nested_cycle)
4080 found_nested_cycle_def = true;
4081 reduc_def_stmt = def_stmt;
4082 reduc_index = i;
4086 is_simple_use = vect_is_simple_use_1 (ops[i], loop_vinfo, NULL, &def_stmt,
4087 &def, &dt, &tem);
4088 if (!vectype_in)
4089 vectype_in = tem;
4090 gcc_assert (is_simple_use);
4091 gcc_assert (dt == vect_reduction_def
4092 || dt == vect_nested_cycle
4093 || ((dt == vect_internal_def || dt == vect_external_def
4094 || dt == vect_constant_def || dt == vect_induction_def)
4095 && nested_cycle && found_nested_cycle_def));
4096 if (!found_nested_cycle_def)
4097 reduc_def_stmt = def_stmt;
4099 gcc_assert (gimple_code (reduc_def_stmt) == GIMPLE_PHI);
4100 if (orig_stmt)
4101 gcc_assert (orig_stmt == vect_is_simple_reduction (loop_vinfo,
4102 reduc_def_stmt,
4103 !nested_cycle,
4104 &dummy));
4105 else
4106 gcc_assert (stmt == vect_is_simple_reduction (loop_vinfo, reduc_def_stmt,
4107 !nested_cycle, &dummy));
4109 if (STMT_VINFO_LIVE_P (vinfo_for_stmt (reduc_def_stmt)))
4110 return false;
4112 if (slp_node)
4113 ncopies = 1;
4114 else
4115 ncopies = (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
4116 / TYPE_VECTOR_SUBPARTS (vectype_in));
4118 gcc_assert (ncopies >= 1);
4120 vec_mode = TYPE_MODE (vectype_in);
4122 if (code == COND_EXPR)
4124 if (!vectorizable_condition (stmt, gsi, NULL, ops[reduc_index], 0))
4126 if (vect_print_dump_info (REPORT_DETAILS))
4127 fprintf (vect_dump, "unsupported condition in reduction");
4129 return false;
4132 else
4134 /* 4. Supportable by target? */
4136 /* 4.1. check support for the operation in the loop */
4137 optab = optab_for_tree_code (code, vectype_in, optab_default);
4138 if (!optab)
4140 if (vect_print_dump_info (REPORT_DETAILS))
4141 fprintf (vect_dump, "no optab.");
4143 return false;
4146 if (optab_handler (optab, vec_mode) == CODE_FOR_nothing)
4148 if (vect_print_dump_info (REPORT_DETAILS))
4149 fprintf (vect_dump, "op not supported by target.");
4151 if (GET_MODE_SIZE (vec_mode) != UNITS_PER_WORD
4152 || LOOP_VINFO_VECT_FACTOR (loop_vinfo)
4153 < vect_min_worthwhile_factor (code))
4154 return false;
4156 if (vect_print_dump_info (REPORT_DETAILS))
4157 fprintf (vect_dump, "proceeding using word mode.");
4160 /* Worthwhile without SIMD support? */
4161 if (!VECTOR_MODE_P (TYPE_MODE (vectype_in))
4162 && LOOP_VINFO_VECT_FACTOR (loop_vinfo)
4163 < vect_min_worthwhile_factor (code))
4165 if (vect_print_dump_info (REPORT_DETAILS))
4166 fprintf (vect_dump, "not worthwhile without SIMD support.");
4168 return false;
4172 /* 4.2. Check support for the epilog operation.
4174 If STMT represents a reduction pattern, then the type of the
4175         reduction variable may be different from the type of the rest
4176 of the arguments. For example, consider the case of accumulation
4177         of shorts into an int accumulator.  The original code:
4178 S1: int_a = (int) short_a;
4179         orig_stmt-> S2: int_acc = plus <int_a, int_acc>;
4181 was replaced with:
4182 STMT: int_acc = widen_sum <short_a, int_acc>
4184 This means that:
4185 1. The tree-code that is used to create the vector operation in the
4186 epilog code (that reduces the partial results) is not the
4187 tree-code of STMT, but is rather the tree-code of the original
4188 stmt from the pattern that STMT is replacing. I.e, in the example
4189 above we want to use 'widen_sum' in the loop, but 'plus' in the
4190 epilog.
4191 2. The type (mode) we use to check available target support
4192 for the vector operation to be created in the *epilog*, is
4193 determined by the type of the reduction variable (in the example
4194         above we'd check this: optab_handler (plus_optab, vect_int_mode)).
4195 However the type (mode) we use to check available target support
4196 for the vector operation to be created *inside the loop*, is
4197 determined by the type of the other arguments to STMT (in the
4198 example we'd check this: optab_handler (widen_sum_optab,
4199 vect_short_mode)).
4201 This is contrary to "regular" reductions, in which the types of all
4202 the arguments are the same as the type of the reduction variable.
4203 For "regular" reductions we can therefore use the same vector type
4204 (and also the same tree-code) when generating the epilog code and
4205 when generating the code inside the loop. */
4207 if (orig_stmt)
4209 /* This is a reduction pattern: get the vectype from the type of the
4210 reduction variable, and get the tree-code from orig_stmt. */
4211 orig_code = gimple_assign_rhs_code (orig_stmt);
4212 gcc_assert (vectype_out);
4213 vec_mode = TYPE_MODE (vectype_out);
4215 else
4217       /* Regular reduction: the same vectype and tree-code as used for
4218 the vector code inside the loop can be used for the epilog code. */
4219 orig_code = code;
4222 if (nested_cycle)
4224 def_bb = gimple_bb (reduc_def_stmt);
4225 def_stmt_loop = def_bb->loop_father;
4226 def_arg = PHI_ARG_DEF_FROM_EDGE (reduc_def_stmt,
4227 loop_preheader_edge (def_stmt_loop));
4228 if (TREE_CODE (def_arg) == SSA_NAME
4229 && (def_arg_stmt = SSA_NAME_DEF_STMT (def_arg))
4230 && gimple_code (def_arg_stmt) == GIMPLE_PHI
4231 && flow_bb_inside_loop_p (outer_loop, gimple_bb (def_arg_stmt))
4232 && vinfo_for_stmt (def_arg_stmt)
4233 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_arg_stmt))
4234 == vect_double_reduction_def)
4235 double_reduc = true;
4238 epilog_reduc_code = ERROR_MARK;
4239 if (reduction_code_for_scalar_code (orig_code, &epilog_reduc_code))
4241 reduc_optab = optab_for_tree_code (epilog_reduc_code, vectype_out,
4242 optab_default);
4243 if (!reduc_optab)
4245 if (vect_print_dump_info (REPORT_DETAILS))
4246 fprintf (vect_dump, "no optab for reduction.");
4248 epilog_reduc_code = ERROR_MARK;
4251 if (reduc_optab
4252 && optab_handler (reduc_optab, vec_mode) == CODE_FOR_nothing)
4254 if (vect_print_dump_info (REPORT_DETAILS))
4255 fprintf (vect_dump, "reduc op not supported by target.");
4257 epilog_reduc_code = ERROR_MARK;
4260 else
4262 if (!nested_cycle || double_reduc)
4264 if (vect_print_dump_info (REPORT_DETAILS))
4265 fprintf (vect_dump, "no reduc code for scalar code.");
4267 return false;
4271 if (double_reduc && ncopies > 1)
4273 if (vect_print_dump_info (REPORT_DETAILS))
4274 fprintf (vect_dump, "multiple types in double reduction");
4276 return false;
4279 if (!vec_stmt) /* transformation not required. */
4281 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
4282 if (!vect_model_reduction_cost (stmt_info, epilog_reduc_code, ncopies))
4283 return false;
4284 return true;
4287 /** Transform. **/
4289 if (vect_print_dump_info (REPORT_DETAILS))
4290 fprintf (vect_dump, "transform reduction.");
4292 /* FORNOW: Multiple types are not supported for condition. */
4293 if (code == COND_EXPR)
4294 gcc_assert (ncopies == 1);
4296 /* Create the destination vector */
4297 vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
4299 /* In case the vectorization factor (VF) is bigger than the number
4300 of elements that we can fit in a vectype (nunits), we have to generate
4301      more than one vector stmt - i.e. - we need to "unroll" the
4302 vector stmt by a factor VF/nunits. For more details see documentation
4303 in vectorizable_operation. */
4305 /* If the reduction is used in an outer loop we need to generate
4306 VF intermediate results, like so (e.g. for ncopies=2):
4307 r0 = phi (init, r0)
4308 r1 = phi (init, r1)
4309 r0 = x0 + r0;
4310 r1 = x1 + r1;
4311 (i.e. we generate VF results in 2 registers).
4312 In this case we have a separate def-use cycle for each copy, and therefore
4313 for each copy we get the vector def for the reduction variable from the
4314 respective phi node created for this copy.
4316 Otherwise (the reduction is unused in the loop nest), we can combine
4317 together intermediate results, like so (e.g. for ncopies=2):
4318 r = phi (init, r)
4319 r = x0 + r;
4320 r = x1 + r;
4321 (i.e. we generate VF/2 results in a single register).
4322 In this case for each copy we get the vector def for the reduction variable
4323 from the vectorized reduction operation generated in the previous iteration.
4326 if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope)
4328 single_defuse_cycle = true;
4329 epilog_copies = 1;
4331 else
4332 epilog_copies = ncopies;
4334 prev_stmt_info = NULL;
4335 prev_phi_info = NULL;
4336 if (slp_node)
4338 vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
4339 gcc_assert (TYPE_VECTOR_SUBPARTS (vectype_out)
4340 == TYPE_VECTOR_SUBPARTS (vectype_in));
4342 else
4344 vec_num = 1;
4345 vec_oprnds0 = VEC_alloc (tree, heap, 1);
4346 if (op_type == ternary_op)
4347 vec_oprnds1 = VEC_alloc (tree, heap, 1);
4350 phis = VEC_alloc (gimple, heap, vec_num);
4351 vect_defs = VEC_alloc (tree, heap, vec_num);
4352 if (!slp_node)
4353 VEC_quick_push (tree, vect_defs, NULL_TREE);
4355 for (j = 0; j < ncopies; j++)
4357 if (j == 0 || !single_defuse_cycle)
4359 for (i = 0; i < vec_num; i++)
4361 /* Create the reduction-phi that defines the reduction
4362 operand. */
4363 new_phi = create_phi_node (vec_dest, loop->header);
4364 set_vinfo_for_stmt (new_phi,
4365 new_stmt_vec_info (new_phi, loop_vinfo,
4366 NULL));
4367 if (j == 0 || slp_node)
4368 VEC_quick_push (gimple, phis, new_phi);
4372 if (code == COND_EXPR)
4374 gcc_assert (!slp_node);
4375 vectorizable_condition (stmt, gsi, vec_stmt,
4376 PHI_RESULT (VEC_index (gimple, phis, 0)),
4377 reduc_index);
4378 /* Multiple types are not supported for condition. */
4379 break;
4382 /* Handle uses. */
4383 if (j == 0)
4385 tree op0, op1 = NULL_TREE;
4387 op0 = ops[!reduc_index];
4388 if (op_type == ternary_op)
4390 if (reduc_index == 0)
4391 op1 = ops[2];
4392 else
4393 op1 = ops[1];
4396 if (slp_node)
4397 vect_get_slp_defs (op0, op1, slp_node, &vec_oprnds0, &vec_oprnds1,
4398 -1);
4399 else
4401 loop_vec_def0 = vect_get_vec_def_for_operand (ops[!reduc_index],
4402 stmt, NULL);
4403 VEC_quick_push (tree, vec_oprnds0, loop_vec_def0);
4404 if (op_type == ternary_op)
4406 loop_vec_def1 = vect_get_vec_def_for_operand (op1, stmt,
4407 NULL);
4408 VEC_quick_push (tree, vec_oprnds1, loop_vec_def1);
4412 else
4414 if (!slp_node)
4416 enum vect_def_type dt = vect_unknown_def_type; /* Dummy */
4417 loop_vec_def0 = vect_get_vec_def_for_stmt_copy (dt, loop_vec_def0);
4418 VEC_replace (tree, vec_oprnds0, 0, loop_vec_def0);
4419 if (op_type == ternary_op)
4421 loop_vec_def1 = vect_get_vec_def_for_stmt_copy (dt,
4422 loop_vec_def1);
4423 VEC_replace (tree, vec_oprnds1, 0, loop_vec_def1);
4427 if (single_defuse_cycle)
4428 reduc_def = gimple_assign_lhs (new_stmt);
4430 STMT_VINFO_RELATED_STMT (prev_phi_info) = new_phi;
4433 FOR_EACH_VEC_ELT (tree, vec_oprnds0, i, def0)
4435 if (slp_node)
4436 reduc_def = PHI_RESULT (VEC_index (gimple, phis, i));
4437 else
4439 if (!single_defuse_cycle || j == 0)
4440 reduc_def = PHI_RESULT (new_phi);
4443 def1 = ((op_type == ternary_op)
4444 ? VEC_index (tree, vec_oprnds1, i) : NULL);
4445 if (op_type == binary_op)
4447 if (reduc_index == 0)
4448 expr = build2 (code, vectype_out, reduc_def, def0);
4449 else
4450 expr = build2 (code, vectype_out, def0, reduc_def);
4452 else
4454 if (reduc_index == 0)
4455 expr = build3 (code, vectype_out, reduc_def, def0, def1);
4456 else
4458 if (reduc_index == 1)
4459 expr = build3 (code, vectype_out, def0, reduc_def, def1);
4460 else
4461 expr = build3 (code, vectype_out, def0, def1, reduc_def);
4465 new_stmt = gimple_build_assign (vec_dest, expr);
4466 new_temp = make_ssa_name (vec_dest, new_stmt);
4467 gimple_assign_set_lhs (new_stmt, new_temp);
4468 vect_finish_stmt_generation (stmt, new_stmt, gsi);
4469 if (slp_node)
4471 VEC_quick_push (gimple, SLP_TREE_VEC_STMTS (slp_node), new_stmt);
4472 VEC_quick_push (tree, vect_defs, new_temp);
4474 else
4475 VEC_replace (tree, vect_defs, 0, new_temp);
4478 if (slp_node)
4479 continue;
4481 if (j == 0)
4482 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
4483 else
4484 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
4486 prev_stmt_info = vinfo_for_stmt (new_stmt);
4487 prev_phi_info = vinfo_for_stmt (new_phi);
4490 /* Finalize the reduction-phi (set its arguments) and create the
4491 epilog reduction code. */
4492 if ((!single_defuse_cycle || code == COND_EXPR) && !slp_node)
4494 new_temp = gimple_assign_lhs (*vec_stmt);
4495 VEC_replace (tree, vect_defs, 0, new_temp);
4498 vect_create_epilog_for_reduction (vect_defs, stmt, epilog_copies,
4499 epilog_reduc_code, phis, reduc_index,
4500 double_reduc, slp_node);
4502 VEC_free (gimple, heap, phis);
4503 VEC_free (tree, heap, vec_oprnds0);
4504 if (vec_oprnds1)
4505 VEC_free (tree, heap, vec_oprnds1);
4507 return true;
4510 /* Function vect_min_worthwhile_factor.
4512 For a loop where we could vectorize the operation indicated by CODE,
4513 return the minimum vectorization factor that makes it worthwhile
4514 to use generic vectors. */
4515 int
4516 vect_min_worthwhile_factor (enum tree_code code)
4518 switch (code)
4520 case PLUS_EXPR:
4521 case MINUS_EXPR:
4522 case NEGATE_EXPR:
4523 return 4;
4525 case BIT_AND_EXPR:
4526 case BIT_IOR_EXPR:
4527 case BIT_XOR_EXPR:
4528 case BIT_NOT_EXPR:
4529 return 2;
4531 default:
4532 return INT_MAX;
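 /* Thus, e.g., a loop containing only additions is considered
    worthwhile to vectorize with generic word-mode vectors only when the
    vectorization factor is at least 4, while tree codes not listed
    above (division, say) are never worthwhile without real SIMD
    support.  */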
4537 /* Function vectorizable_induction
4539 Check if PHI performs an induction computation that can be vectorized.
4540 If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
4541 phi to replace it, put it in VEC_STMT, and add it to the same basic block.
4542 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
4544 bool
4545 vectorizable_induction (gimple phi, gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
4546 gimple *vec_stmt)
4548 stmt_vec_info stmt_info = vinfo_for_stmt (phi);
4549 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4550 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4551 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4552 int nunits = TYPE_VECTOR_SUBPARTS (vectype);
4553 int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
4554 tree vec_def;
4556 gcc_assert (ncopies >= 1);
4557 /* FORNOW. This restriction should be relaxed. */
4558 if (nested_in_vect_loop_p (loop, phi) && ncopies > 1)
4560 if (vect_print_dump_info (REPORT_DETAILS))
4561 fprintf (vect_dump, "multiple types in nested loop.");
4562 return false;
4565 if (!STMT_VINFO_RELEVANT_P (stmt_info))
4566 return false;
4568 /* FORNOW: SLP not supported. */
4569 if (STMT_SLP_TYPE (stmt_info))
4570 return false;
4572 gcc_assert (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def);
4574 if (gimple_code (phi) != GIMPLE_PHI)
4575 return false;
4577 if (!vec_stmt) /* transformation not required. */
4579 STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
4580 if (vect_print_dump_info (REPORT_DETAILS))
4581 fprintf (vect_dump, "=== vectorizable_induction ===");
4582 vect_model_induction_cost (stmt_info, ncopies);
4583 return true;
4586 /** Transform. **/
4588 if (vect_print_dump_info (REPORT_DETAILS))
4589 fprintf (vect_dump, "transform induction phi.");
4591 vec_def = get_initial_def_for_induction (phi);
4592 *vec_stmt = SSA_NAME_DEF_STMT (vec_def);
4593 return true;
4596 /* Function vectorizable_live_operation.
4598 STMT computes a value that is used outside the loop. Check if
4599 it can be supported. */
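 /* A minimal sketch of such a case (t is live; its operands are
    loop-invariant):

      for (i = 0; i < N; i++)
        {
          a[i] = i;
          t = x + 1;   // x is defined before the loop
        }
      use (t);

    Since all operands of the live stmt are external defs or constants,
    the scalar stmt can remain in place, unvectorized, and the last
    value it computes is used after the loop.  */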
4601 bool
4602 vectorizable_live_operation (gimple stmt,
4603 gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
4604 gimple *vec_stmt ATTRIBUTE_UNUSED)
4606 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4607 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4608 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4609 int i;
4610 int op_type;
4611 tree op;
4612 tree def;
4613 gimple def_stmt;
4614 enum vect_def_type dt;
4615 enum tree_code code;
4616 enum gimple_rhs_class rhs_class;
4618 gcc_assert (STMT_VINFO_LIVE_P (stmt_info));
4620 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def)
4621 return false;
4623 if (!is_gimple_assign (stmt))
4624 return false;
4626 if (TREE_CODE (gimple_assign_lhs (stmt)) != SSA_NAME)
4627 return false;
4629 /* FORNOW. CHECKME. */
4630 if (nested_in_vect_loop_p (loop, stmt))
4631 return false;
4633 code = gimple_assign_rhs_code (stmt);
4634 op_type = TREE_CODE_LENGTH (code);
4635 rhs_class = get_gimple_rhs_class (code);
4636 gcc_assert (rhs_class != GIMPLE_UNARY_RHS || op_type == unary_op);
4637 gcc_assert (rhs_class != GIMPLE_BINARY_RHS || op_type == binary_op);
4639 /* FORNOW: support only if all uses are invariant. This means
4640 that the scalar operations can remain in place, unvectorized.
4641 The original last scalar value that they compute will be used. */
4643 for (i = 0; i < op_type; i++)
4645 if (rhs_class == GIMPLE_SINGLE_RHS)
4646 op = TREE_OPERAND (gimple_op (stmt, 1), i);
4647 else
4648 op = gimple_op (stmt, i + 1);
4649 if (op
4650 && !vect_is_simple_use (op, loop_vinfo, NULL, &def_stmt, &def, &dt))
4652 if (vect_print_dump_info (REPORT_DETAILS))
4653 fprintf (vect_dump, "use not simple.");
4654 return false;
4657 if (dt != vect_external_def && dt != vect_constant_def)
4658 return false;
4661 /* No transformation is required for the cases we currently support. */
4662 return true;
4665 /* Kill any debug uses outside LOOP of SSA names defined in STMT. */
4667 static void
4668 vect_loop_kill_debug_uses (struct loop *loop, gimple stmt)
4670 ssa_op_iter op_iter;
4671 imm_use_iterator imm_iter;
4672 def_operand_p def_p;
4673 gimple ustmt;
4675 FOR_EACH_PHI_OR_STMT_DEF (def_p, stmt, op_iter, SSA_OP_DEF)
4677 FOR_EACH_IMM_USE_STMT (ustmt, imm_iter, DEF_FROM_PTR (def_p))
4679 basic_block bb;
4681 if (!is_gimple_debug (ustmt))
4682 continue;
4684 bb = gimple_bb (ustmt);
4686 if (!flow_bb_inside_loop_p (loop, bb))
4688 if (gimple_debug_bind_p (ustmt))
4690 if (vect_print_dump_info (REPORT_DETAILS))
4691 fprintf (vect_dump, "killing debug use");
4693 gimple_debug_bind_reset_value (ustmt);
4694 update_stmt (ustmt);
4696 else
4697 gcc_unreachable ();
4703 /* Function vect_transform_loop.
4705 The analysis phase has determined that the loop is vectorizable.
4706    Vectorize the loop - create vectorized stmts to replace the scalar
4707 stmts in the loop, and update the loop exit condition. */
4709 void
4710 vect_transform_loop (loop_vec_info loop_vinfo)
4712 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4713 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
4714 int nbbs = loop->num_nodes;
4715 gimple_stmt_iterator si;
4716 int i;
4717 tree ratio = NULL;
4718 int vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
4719 bool strided_store;
4720 bool slp_scheduled = false;
4721 unsigned int nunits;
4722 tree cond_expr = NULL_TREE;
4723 gimple_seq cond_expr_stmt_list = NULL;
4724 bool do_peeling_for_loop_bound;
4726 if (vect_print_dump_info (REPORT_DETAILS))
4727     fprintf (vect_dump, "=== vect_transform_loop ===");
4729   /* Peel the loop if there are data refs with unknown alignment.
4730      Peeling can force the alignment of only one such data ref.  */
4732 if (LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo))
4733 vect_do_peeling_for_alignment (loop_vinfo);
4735 do_peeling_for_loop_bound
4736 = (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
4737 || (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
4738 && LOOP_VINFO_INT_NITERS (loop_vinfo) % vectorization_factor != 0)
4739 || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo));
4741 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
4742 || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
4743 vect_loop_versioning (loop_vinfo,
4744 !do_peeling_for_loop_bound,
4745 &cond_expr, &cond_expr_stmt_list);
4747 /* If the loop has a symbolic number of iterations 'n' (i.e. it's not a
4748      compile-time constant), or it is a constant that is not a multiple of the
4749 vectorization factor, then an epilog loop needs to be created.
4750 We therefore duplicate the loop: the original loop will be vectorized,
4751 and will compute the first (n/VF) iterations. The second copy of the loop
4752 will remain scalar and will compute the remaining (n%VF) iterations.
4753 (VF is the vectorization factor). */
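   /* For example, with n == 103 and VF == 8 the vectorized loop executes
      the first 103/8 = 12 vector iterations (96 scalar iterations), and
      the scalar epilog loop executes the remaining 103%8 = 7
      iterations.  */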
4755 if (do_peeling_for_loop_bound)
4756 vect_do_peeling_for_loop_bound (loop_vinfo, &ratio,
4757 cond_expr, cond_expr_stmt_list);
4758 else
4759 ratio = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
4760 LOOP_VINFO_INT_NITERS (loop_vinfo) / vectorization_factor);
4762 /* 1) Make sure the loop header has exactly two entries
4763 2) Make sure we have a preheader basic block. */
4765 gcc_assert (EDGE_COUNT (loop->header->preds) == 2);
4767 split_edge (loop_preheader_edge (loop));
4769   /* FORNOW: the vectorizer supports only loops whose body consists
4770      of one basic block (header + empty latch).  When the vectorizer
4771      supports more involved loop forms, the order in which the BBs are
4772      traversed needs to be reconsidered.  */
4774 for (i = 0; i < nbbs; i++)
4776 basic_block bb = bbs[i];
4777 stmt_vec_info stmt_info;
4778 gimple phi;
4780 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
4782 phi = gsi_stmt (si);
4783 if (vect_print_dump_info (REPORT_DETAILS))
4785 fprintf (vect_dump, "------>vectorizing phi: ");
4786 print_gimple_stmt (vect_dump, phi, 0, TDF_SLIM);
4788 stmt_info = vinfo_for_stmt (phi);
4789 if (!stmt_info)
4790 continue;
4792 if (MAY_HAVE_DEBUG_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
4793 vect_loop_kill_debug_uses (loop, phi);
4795 if (!STMT_VINFO_RELEVANT_P (stmt_info)
4796 && !STMT_VINFO_LIVE_P (stmt_info))
4797 continue;
4799 if ((TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info))
4800 != (unsigned HOST_WIDE_INT) vectorization_factor)
4801 && vect_print_dump_info (REPORT_DETAILS))
4802 fprintf (vect_dump, "multiple-types.");
4804 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def)
4806 if (vect_print_dump_info (REPORT_DETAILS))
4807 fprintf (vect_dump, "transform phi.");
4808 vect_transform_stmt (phi, NULL, NULL, NULL, NULL);
4812 for (si = gsi_start_bb (bb); !gsi_end_p (si);)
4814 gimple stmt = gsi_stmt (si);
4815 bool is_store;
4817 if (vect_print_dump_info (REPORT_DETAILS))
4819 fprintf (vect_dump, "------>vectorizing statement: ");
4820 print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
4823 stmt_info = vinfo_for_stmt (stmt);
4825 /* vector stmts created in the outer-loop during vectorization of
4826 stmts in an inner-loop may not have a stmt_info, and do not
4827 need to be vectorized. */
4828 if (!stmt_info)
4830 gsi_next (&si);
4831 continue;
4834 if (MAY_HAVE_DEBUG_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
4835 vect_loop_kill_debug_uses (loop, stmt);
4837 if (!STMT_VINFO_RELEVANT_P (stmt_info)
4838 && !STMT_VINFO_LIVE_P (stmt_info))
4840 gsi_next (&si);
4841 continue;
4844 gcc_assert (STMT_VINFO_VECTYPE (stmt_info));
4845 nunits =
4846 (unsigned int) TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
4847 if (!STMT_SLP_TYPE (stmt_info)
4848 && nunits != (unsigned int) vectorization_factor
4849 && vect_print_dump_info (REPORT_DETAILS))
4850             /* For SLP, VF is set according to the unrolling factor, not to the
4851                vector size; hence for SLP this print is not valid.  */
4852 fprintf (vect_dump, "multiple-types.");
4854 /* SLP. Schedule all the SLP instances when the first SLP stmt is
4855 reached. */
4856 if (STMT_SLP_TYPE (stmt_info))
4858 if (!slp_scheduled)
4860 slp_scheduled = true;
4862 if (vect_print_dump_info (REPORT_DETAILS))
4863 fprintf (vect_dump, "=== scheduling SLP instances ===");
4865 vect_schedule_slp (loop_vinfo, NULL);
4868 /* Hybrid SLP stmts must be vectorized in addition to SLP. */
4869 if (!vinfo_for_stmt (stmt) || PURE_SLP_STMT (stmt_info))
4871 gsi_next (&si);
4872 continue;
4876 /* -------- vectorize statement ------------ */
4877 if (vect_print_dump_info (REPORT_DETAILS))
4878 fprintf (vect_dump, "transform statement.");
4880 strided_store = false;
4881 is_store = vect_transform_stmt (stmt, &si, &strided_store, NULL, NULL);
4882 if (is_store)
4884 if (STMT_VINFO_STRIDED_ACCESS (stmt_info))
4886 /* Interleaving. If IS_STORE is TRUE, the vectorization of the
4887 interleaving chain was completed - free all the stores in
4888 the chain. */
4889 vect_remove_stores (DR_GROUP_FIRST_DR (stmt_info));
4890 gsi_remove (&si, true);
4891 continue;
4893 else
4895 /* Free the attached stmt_vec_info and remove the stmt. */
4896 free_stmt_vec_info (stmt);
4897 gsi_remove (&si, true);
4898 continue;
4901 gsi_next (&si);
4902 } /* stmts in BB */
4903 } /* BBs in loop */
4905 slpeel_make_loop_iterate_ntimes (loop, ratio);
4907 /* The memory tags and pointers in vectorized statements need to
4908 have their SSA forms updated. FIXME, why can't this be delayed
4909 until all the loops have been transformed? */
4910 update_ssa (TODO_update_ssa);
4912 if (vect_print_dump_info (REPORT_VECTORIZED_LOCATIONS))
4913 fprintf (vect_dump, "LOOP VECTORIZED.");
4914 if (loop->inner && vect_print_dump_info (REPORT_VECTORIZED_LOCATIONS))
4915 fprintf (vect_dump, "OUTER LOOP VECTORIZED.");