gcc/tree-vect-loop.c

/* Loop Vectorization
   Copyright (C) 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010
   Free Software Foundation, Inc.
   Contributed by Dorit Naishlos <dorit@il.ibm.com> and
   Ira Rosen <irar@il.ibm.com>

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free
Software Foundation; either version 3, or (at your option) any later
version.

GCC is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received a copy of the GNU General Public License
along with GCC; see the file COPYING3.  If not see
<http://www.gnu.org/licenses/>.  */

#include "config.h"
#include "system.h"
#include "coretypes.h"
#include "tm.h"
#include "ggc.h"
#include "tree.h"
#include "basic-block.h"
#include "diagnostic.h"
#include "tree-flow.h"
#include "tree-dump.h"
#include "cfgloop.h"
#include "cfglayout.h"
#include "expr.h"
#include "recog.h"
#include "optabs.h"
#include "params.h"
#include "toplev.h"
#include "tree-chrec.h"
#include "tree-scalar-evolution.h"
#include "tree-vectorizer.h"

/* Loop Vectorization Pass.

   This pass tries to vectorize loops.

   For example, the vectorizer transforms the following simple loop:

        short a[N]; short b[N]; short c[N]; int i;

        for (i=0; i<N; i++){
          a[i] = b[i] + c[i];
        }

   as if it were manually vectorized by rewriting the source code into:

        typedef int __attribute__((mode(V8HI))) v8hi;
        short a[N]; short b[N]; short c[N]; int i;
        v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
        v8hi va, vb, vc;

        for (i=0; i<N/8; i++){
          vb = pb[i];
          vc = pc[i];
          va = vb + vc;
          pa[i] = va;
        }

   The main entry to this pass is vectorize_loops(), in which
   the vectorizer applies a set of analyses on a given set of loops,
   followed by the actual vectorization transformation for the loops that
   had successfully passed the analysis phase.
   Throughout this pass we make a distinction between two types of
   data: scalars (which are represented by SSA_NAMES), and memory references
   ("data-refs").  These two types of data require different handling both
   during analysis and transformation.  The types of data-refs that the
   vectorizer currently supports are ARRAY_REFS whose base is an array DECL
   (not a pointer), and INDIRECT_REFS through pointers; both array and pointer
   accesses are required to have a simple (consecutive) access pattern.

   Analysis phase:
   ===============
   The driver for the analysis phase is vect_analyze_loop().
   It applies a set of analyses, some of which rely on the scalar evolution
   analyzer (scev) developed by Sebastian Pop.

   During the analysis phase the vectorizer records some information
   per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
   loop, as well as general information about the loop as a whole, which is
   recorded in a "loop_vec_info" struct attached to each loop.

   Transformation phase:
   =====================
   The loop transformation phase scans all the stmts in the loop, and
   creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
   the loop that needs to be vectorized.  It inserts the vector code sequence
   just before the scalar stmt S, and records a pointer to the vector code
   in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
   attached to S).  This pointer will be used for the vectorization of
   following stmts which use the def of stmt S.  Stmt S is removed if it
   writes to memory; otherwise, we rely on dead code elimination for
   removing it.

   For example, say stmt S1 was vectorized into stmt VS1:

   VS1: vb = px[i];
   S1:  b = x[i];       STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
   S2:  a = b;

   To vectorize stmt S2, the vectorizer first finds the stmt that defines
   the operand 'b' (S1), and gets the relevant vector def 'vb' from the
   vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)).  The
   resulting sequence would be:

   VS1: vb = px[i];
   S1:  b = x[i];       STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
   VS2: va = vb;
   S2:  a = b;          STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2

   Operands that are not SSA_NAMEs are data-refs that appear in
   load/store operations (like 'x[i]' in S1), and are handled differently.

   Target modeling:
   =================
   Currently the only target specific information that is used is the
   size of the vector (in bytes) - "UNITS_PER_SIMD_WORD".  Targets that can
   support different sizes of vectors, for now will need to specify one value
   for "UNITS_PER_SIMD_WORD".  More flexibility will be added in the future.

   Since we only vectorize operations whose vector form can be
   expressed using existing tree codes, to verify that an operation is
   supported, the vectorizer checks the relevant optab at the relevant
   machine_mode (e.g., optab_handler (add_optab, V8HImode)->insn_code).  If
   the value found is CODE_FOR_nothing, then there's no target support, and
   we can't vectorize the stmt.
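
   As an illustration (a sketch only, not code from this file; the helper
   name is hypothetical), such a support check for an addition on V8HImode
   vectors could look like:

     static bool
     example_v8hi_add_supported_p (void)
     {
       return optab_handler (add_optab, V8HImode)->insn_code
              != CODE_FOR_nothing;
     }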

   For additional information on this project see:
   http://gcc.gnu.org/projects/tree-ssa/vectorization.html
*/

/* Function vect_determine_vectorization_factor

   Determine the vectorization factor (VF).  VF is the number of data
   elements that are operated upon in parallel in a single iteration of the
   vectorized loop.  For example, when vectorizing a loop that operates on
   4-byte elements, on a target with a vector size (VS) of 16 bytes, the VF
   is set to 4, since 4 elements can fit in a single vector register.

   We currently support vectorization of loops in which all types operated
   upon are of the same size.  Therefore this function currently sets VF
   according to the size of the types operated upon, and fails if there are
   multiple sizes in the loop.

   VF is also the factor by which the loop iterations are strip-mined, e.g.:
   original loop:
        for (i=0; i<N; i++){
          a[i] = b[i] + c[i];
        }

   vectorized loop:
        for (i=0; i<N; i+=VF){
          a[i:VF] = b[i:VF] + c[i:VF];
        }
*/

static bool
vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
{
  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
  int nbbs = loop->num_nodes;
  gimple_stmt_iterator si;
  unsigned int vectorization_factor = 0;
  tree scalar_type;
  gimple phi;
  tree vectype;
  unsigned int nunits;
  stmt_vec_info stmt_info;
  int i;
  HOST_WIDE_INT dummy;

  if (vect_print_dump_info (REPORT_DETAILS))
    fprintf (vect_dump, "=== vect_determine_vectorization_factor ===");

  for (i = 0; i < nbbs; i++)
    {
      basic_block bb = bbs[i];

      for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
        {
          phi = gsi_stmt (si);
          stmt_info = vinfo_for_stmt (phi);
          if (vect_print_dump_info (REPORT_DETAILS))
            {
              fprintf (vect_dump, "==> examining phi: ");
              print_gimple_stmt (vect_dump, phi, 0, TDF_SLIM);
            }

          gcc_assert (stmt_info);

          if (STMT_VINFO_RELEVANT_P (stmt_info))
            {
              gcc_assert (!STMT_VINFO_VECTYPE (stmt_info));
              scalar_type = TREE_TYPE (PHI_RESULT (phi));

              if (vect_print_dump_info (REPORT_DETAILS))
                {
                  fprintf (vect_dump, "get vectype for scalar type: ");
                  print_generic_expr (vect_dump, scalar_type, TDF_SLIM);
                }

              vectype = get_vectype_for_scalar_type (scalar_type);
              if (!vectype)
                {
                  if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
                    {
                      fprintf (vect_dump,
                               "not vectorized: unsupported data-type ");
                      print_generic_expr (vect_dump, scalar_type, TDF_SLIM);
                    }
                  return false;
                }
              STMT_VINFO_VECTYPE (stmt_info) = vectype;

              if (vect_print_dump_info (REPORT_DETAILS))
                {
                  fprintf (vect_dump, "vectype: ");
                  print_generic_expr (vect_dump, vectype, TDF_SLIM);
                }

              nunits = TYPE_VECTOR_SUBPARTS (vectype);
              if (vect_print_dump_info (REPORT_DETAILS))
                fprintf (vect_dump, "nunits = %d", nunits);

              if (!vectorization_factor
                  || (nunits > vectorization_factor))
                vectorization_factor = nunits;
            }
        }

      for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
        {
          tree vf_vectype;
          gimple stmt = gsi_stmt (si);
          stmt_info = vinfo_for_stmt (stmt);

          if (vect_print_dump_info (REPORT_DETAILS))
            {
              fprintf (vect_dump, "==> examining statement: ");
              print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
            }

          gcc_assert (stmt_info);

          /* Skip stmts which do not need to be vectorized.  */
          if (!STMT_VINFO_RELEVANT_P (stmt_info)
              && !STMT_VINFO_LIVE_P (stmt_info))
            {
              if (vect_print_dump_info (REPORT_DETAILS))
                fprintf (vect_dump, "skip.");
              continue;
            }

          if (gimple_get_lhs (stmt) == NULL_TREE)
            {
              if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
                {
                  fprintf (vect_dump, "not vectorized: irregular stmt.");
                  print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
                }
              return false;
            }

          if (VECTOR_MODE_P (TYPE_MODE (gimple_expr_type (stmt))))
            {
              if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
                {
                  fprintf (vect_dump, "not vectorized: vector stmt in loop:");
                  print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
                }
              return false;
            }

          if (STMT_VINFO_VECTYPE (stmt_info))
            {
              /* The only case when a vectype had been already set is for
                 stmts that contain a dataref, or for "pattern-stmts" (stmts
                 generated by the vectorizer to represent/replace a certain
                 idiom).  */
              gcc_assert (STMT_VINFO_DATA_REF (stmt_info)
                          || is_pattern_stmt_p (stmt_info));
              vectype = STMT_VINFO_VECTYPE (stmt_info);
            }
          else
            {
              gcc_assert (!STMT_VINFO_DATA_REF (stmt_info)
                          && !is_pattern_stmt_p (stmt_info));

              scalar_type = TREE_TYPE (gimple_get_lhs (stmt));
              if (vect_print_dump_info (REPORT_DETAILS))
                {
                  fprintf (vect_dump, "get vectype for scalar type: ");
                  print_generic_expr (vect_dump, scalar_type, TDF_SLIM);
                }
              vectype = get_vectype_for_scalar_type (scalar_type);
              if (!vectype)
                {
                  if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
                    {
                      fprintf (vect_dump,
                               "not vectorized: unsupported data-type ");
                      print_generic_expr (vect_dump, scalar_type, TDF_SLIM);
                    }
                  return false;
                }
              STMT_VINFO_VECTYPE (stmt_info) = vectype;
            }

          /* The vectorization factor is according to the smallest
             scalar type (or the largest vector size, but we only
             support one vector size per loop).  */
          scalar_type = vect_get_smallest_scalar_type (stmt, &dummy,
                                                       &dummy);
          if (vect_print_dump_info (REPORT_DETAILS))
            {
              fprintf (vect_dump, "get vectype for scalar type: ");
              print_generic_expr (vect_dump, scalar_type, TDF_SLIM);
            }
          vf_vectype = get_vectype_for_scalar_type (scalar_type);
          if (!vf_vectype)
            {
              if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
                {
                  fprintf (vect_dump,
                           "not vectorized: unsupported data-type ");
                  print_generic_expr (vect_dump, scalar_type, TDF_SLIM);
                }
              return false;
            }

          if ((GET_MODE_SIZE (TYPE_MODE (vectype))
               != GET_MODE_SIZE (TYPE_MODE (vf_vectype))))
            {
              if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
                {
                  fprintf (vect_dump,
                           "not vectorized: different sized vector "
                           "types in statement, ");
                  print_generic_expr (vect_dump, vectype, TDF_SLIM);
                  fprintf (vect_dump, " and ");
                  print_generic_expr (vect_dump, vf_vectype, TDF_SLIM);
                }
              return false;
            }

          if (vect_print_dump_info (REPORT_DETAILS))
            {
              fprintf (vect_dump, "vectype: ");
              print_generic_expr (vect_dump, vf_vectype, TDF_SLIM);
            }

          nunits = TYPE_VECTOR_SUBPARTS (vf_vectype);
          if (vect_print_dump_info (REPORT_DETAILS))
            fprintf (vect_dump, "nunits = %d", nunits);

          if (!vectorization_factor
              || (nunits > vectorization_factor))
            vectorization_factor = nunits;
        }
    }

  /* TODO: Analyze cost.  Decide if worth while to vectorize.  */
  if (vect_print_dump_info (REPORT_DETAILS))
    fprintf (vect_dump, "vectorization factor = %d", vectorization_factor);
  if (vectorization_factor <= 1)
    {
      if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
        fprintf (vect_dump, "not vectorized: unsupported data-type");
      return false;
    }
  LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;

  return true;
}

/* Function vect_is_simple_iv_evolution.

   FORNOW: A simple evolution of an induction variable in the loop is
   considered a polynomial evolution with constant step.  */
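
/* For example (an illustrative sketch): for the counter of
   "for (i = 0; i < N; i++)" the scalar evolution analyzer computes the
   access function {0, +, 1}_1 (a chrec: initial value 0, step 1, in loop
   number 1), so this function would set *INIT to 0 and *STEP to 1 and
   return true.  A step that is itself a chrec (polynomial of degree >= 2)
   or a non-constant step makes the evolution "not simple".  */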

static bool
vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init,
                             tree * step)
{
  tree init_expr;
  tree step_expr;
  tree evolution_part = evolution_part_in_loop_num (access_fn, loop_nb);

  /* When there is no evolution in this loop, the evolution function
     is not "simple".  */
  if (evolution_part == NULL_TREE)
    return false;

  /* When the evolution is a polynomial of degree >= 2
     the evolution function is not "simple".  */
  if (tree_is_chrec (evolution_part))
    return false;

  step_expr = evolution_part;
  init_expr = unshare_expr (initial_condition_in_loop_num (access_fn,
                                                           loop_nb));

  if (vect_print_dump_info (REPORT_DETAILS))
    {
      fprintf (vect_dump, "step: ");
      print_generic_expr (vect_dump, step_expr, TDF_SLIM);
      fprintf (vect_dump, ", init: ");
      print_generic_expr (vect_dump, init_expr, TDF_SLIM);
    }

  *init = init_expr;
  *step = step_expr;

  if (TREE_CODE (step_expr) != INTEGER_CST)
    {
      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "step unknown.");
      return false;
    }

  return true;
}

/* Function vect_analyze_scalar_cycles_1.

   Examine the cross iteration def-use cycles of scalar variables
   in LOOP.  LOOP_VINFO represents the loop that is now being
   considered for vectorization (can be LOOP, or an outer-loop
   enclosing LOOP).  */

static void
vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, struct loop *loop)
{
  basic_block bb = loop->header;
  tree dummy;
  VEC(gimple,heap) *worklist = VEC_alloc (gimple, heap, 64);
  gimple_stmt_iterator gsi;
  bool double_reduc;

  if (vect_print_dump_info (REPORT_DETAILS))
    fprintf (vect_dump, "=== vect_analyze_scalar_cycles ===");

  /* First - identify all inductions.  Reduction detection assumes that all
     the inductions have been identified, therefore, this order must not be
     changed.  */
  for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
    {
      gimple phi = gsi_stmt (gsi);
      tree access_fn = NULL;
      tree def = PHI_RESULT (phi);
      stmt_vec_info stmt_vinfo = vinfo_for_stmt (phi);

      if (vect_print_dump_info (REPORT_DETAILS))
        {
          fprintf (vect_dump, "Analyze phi: ");
          print_gimple_stmt (vect_dump, phi, 0, TDF_SLIM);
        }

      /* Skip virtual phis.  The data dependences that are associated with
         virtual defs/uses (i.e., memory accesses) are analyzed elsewhere.  */
      if (!is_gimple_reg (SSA_NAME_VAR (def)))
        continue;

      STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_unknown_def_type;

      /* Analyze the evolution function.  */
      access_fn = analyze_scalar_evolution (loop, def);
      if (access_fn && vect_print_dump_info (REPORT_DETAILS))
        {
          fprintf (vect_dump, "Access function of PHI: ");
          print_generic_expr (vect_dump, access_fn, TDF_SLIM);
        }

      if (!access_fn
          || !vect_is_simple_iv_evolution (loop->num, access_fn,
                                           &dummy, &dummy))
        {
          VEC_safe_push (gimple, heap, worklist, phi);
          continue;
        }

      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "Detected induction.");
      STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_induction_def;
    }


  /* Second - identify all reductions and nested cycles.  */
  while (VEC_length (gimple, worklist) > 0)
    {
      gimple phi = VEC_pop (gimple, worklist);
      tree def = PHI_RESULT (phi);
      stmt_vec_info stmt_vinfo = vinfo_for_stmt (phi);
      gimple reduc_stmt;
      bool nested_cycle;

      if (vect_print_dump_info (REPORT_DETAILS))
        {
          fprintf (vect_dump, "Analyze phi: ");
          print_gimple_stmt (vect_dump, phi, 0, TDF_SLIM);
        }

      gcc_assert (is_gimple_reg (SSA_NAME_VAR (def)));
      gcc_assert (STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_unknown_def_type);

      nested_cycle = (loop != LOOP_VINFO_LOOP (loop_vinfo));
      reduc_stmt = vect_is_simple_reduction (loop_vinfo, phi, !nested_cycle,
                                             &double_reduc);
      if (reduc_stmt)
        {
          if (double_reduc)
            {
              if (vect_print_dump_info (REPORT_DETAILS))
                fprintf (vect_dump, "Detected double reduction.");

              STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_double_reduction_def;
              STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
                                                    vect_double_reduction_def;
            }
          else
            {
              if (nested_cycle)
                {
                  if (vect_print_dump_info (REPORT_DETAILS))
                    fprintf (vect_dump,
                             "Detected vectorizable nested cycle.");

                  STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_nested_cycle;
                  STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
                                                             vect_nested_cycle;
                }
              else
                {
                  if (vect_print_dump_info (REPORT_DETAILS))
                    fprintf (vect_dump, "Detected reduction.");

                  STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def;
                  STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
                                                           vect_reduction_def;
                  /* Store the reduction cycles for possible vectorization in
                     loop-aware SLP.  */
                  VEC_safe_push (gimple, heap,
                                 LOOP_VINFO_REDUCTIONS (loop_vinfo),
                                 reduc_stmt);
                }
            }
        }
      else
        if (vect_print_dump_info (REPORT_DETAILS))
          fprintf (vect_dump, "Unknown def-use cycle pattern.");
    }

  VEC_free (gimple, heap, worklist);
}

/* Function vect_analyze_scalar_cycles.

   Examine the cross iteration def-use cycles of scalar variables, by
   analyzing the loop-header PHIs of scalar variables; Classify each
   cycle as one of the following: invariant, induction, reduction, unknown.
   We do that for the loop represented by LOOP_VINFO, and also for its
   inner loop, if it exists.

   Examples for scalar cycles:

   Example1: reduction:

              loop1:
              for (i=0; i<N; i++)
                 sum += a[i];

   Example2: induction:

              loop2:
              for (i=0; i<N; i++)
                 a[i] = i;  */

static void
vect_analyze_scalar_cycles (loop_vec_info loop_vinfo)
{
  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);

  vect_analyze_scalar_cycles_1 (loop_vinfo, loop);

  /* When vectorizing an outer-loop, the inner-loop is executed sequentially.
     Reductions in such inner-loop therefore have different properties than
     the reductions in the nest that gets vectorized:
     1. When vectorized, they are executed in the same order as in the
        original scalar loop, so we can't change the order of computation
        when vectorizing them.
     2. FIXME: Inner-loop reductions can be used in the inner-loop, so the
        current checks are too strict.  */

  if (loop->inner)
    vect_analyze_scalar_cycles_1 (loop_vinfo, loop->inner);
}

/* Function vect_get_loop_niters.

   Determine how many times the loop is executed.
   If an expression that represents the number of iterations
   can be constructed, place it in NUMBER_OF_ITERATIONS.
   Return the loop exit condition.  */
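
/* As an illustrative sketch: for a counted loop such as
   "for (i = 0; i < n; i++)" the expression stored in *NUMBER_OF_ITERATIONS
   is derived from 'n' (via number_of_exit_cond_executions below), and the
   returned statement is the GIMPLE_COND that controls the loop exit.  When
   the count cannot be determined, *NUMBER_OF_ITERATIONS is left untouched
   and the exit condition is still returned.  */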

static gimple
vect_get_loop_niters (struct loop *loop, tree *number_of_iterations)
{
  tree niters;

  if (vect_print_dump_info (REPORT_DETAILS))
    fprintf (vect_dump, "=== get_loop_niters ===");

  niters = number_of_exit_cond_executions (loop);

  if (niters != NULL_TREE
      && niters != chrec_dont_know)
    {
      *number_of_iterations = niters;

      if (vect_print_dump_info (REPORT_DETAILS))
        {
          fprintf (vect_dump, "==> get_loop_niters:");
          print_generic_expr (vect_dump, *number_of_iterations, TDF_SLIM);
        }
    }

  return get_loop_exit_condition (loop);
}

/* Function bb_in_loop_p

   Used as predicate for dfs order traversal of the loop bbs.  */

static bool
bb_in_loop_p (const_basic_block bb, const void *data)
{
  const struct loop *const loop = (const struct loop *)data;
  if (flow_bb_inside_loop_p (loop, bb))
    return true;
  return false;
}

/* Function new_loop_vec_info.

   Create and initialize a new loop_vec_info struct for LOOP, as well as
   stmt_vec_info structs for all the stmts in LOOP.  */

static loop_vec_info
new_loop_vec_info (struct loop *loop)
{
  loop_vec_info res;
  basic_block *bbs;
  gimple_stmt_iterator si;
  unsigned int i, nbbs;

  res = (loop_vec_info) xcalloc (1, sizeof (struct _loop_vec_info));
  LOOP_VINFO_LOOP (res) = loop;

  bbs = get_loop_body (loop);

  /* Create/Update stmt_info for all stmts in the loop.  */
  for (i = 0; i < loop->num_nodes; i++)
    {
      basic_block bb = bbs[i];

      /* BBs in a nested inner-loop will have been already processed (because
         we will have called vect_analyze_loop_form for any nested inner-loop).
         Therefore, for stmts in an inner-loop we just want to update the
         STMT_VINFO_LOOP_VINFO field of their stmt_info to point to the new
         loop_info of the outer-loop we are currently considering to vectorize
         (instead of the loop_info of the inner-loop).
         For stmts in other BBs we need to create a stmt_info from scratch.  */
      if (bb->loop_father != loop)
        {
          /* Inner-loop bb.  */
          gcc_assert (loop->inner && bb->loop_father == loop->inner);
          for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
            {
              gimple phi = gsi_stmt (si);
              stmt_vec_info stmt_info = vinfo_for_stmt (phi);
              loop_vec_info inner_loop_vinfo =
                STMT_VINFO_LOOP_VINFO (stmt_info);
              gcc_assert (loop->inner == LOOP_VINFO_LOOP (inner_loop_vinfo));
              STMT_VINFO_LOOP_VINFO (stmt_info) = res;
            }
          for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
            {
              gimple stmt = gsi_stmt (si);
              stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
              loop_vec_info inner_loop_vinfo =
                STMT_VINFO_LOOP_VINFO (stmt_info);
              gcc_assert (loop->inner == LOOP_VINFO_LOOP (inner_loop_vinfo));
              STMT_VINFO_LOOP_VINFO (stmt_info) = res;
            }
        }
      else
        {
          /* bb in current nest.  */
          for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
            {
              gimple phi = gsi_stmt (si);
              gimple_set_uid (phi, 0);
              set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, res, NULL));
            }

          for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
            {
              gimple stmt = gsi_stmt (si);
              gimple_set_uid (stmt, 0);
              set_vinfo_for_stmt (stmt, new_stmt_vec_info (stmt, res, NULL));
            }
        }
    }

  /* CHECKME: We want to visit all BBs before their successors (except for
     latch blocks, for which this assertion wouldn't hold).  In the simple
     case of the loop forms we allow, a dfs order of the BBs would be the
     same as reversed postorder traversal, so we are safe.  */

  free (bbs);
  bbs = XCNEWVEC (basic_block, loop->num_nodes);
  nbbs = dfs_enumerate_from (loop->header, 0, bb_in_loop_p,
                             bbs, loop->num_nodes, loop);
  gcc_assert (nbbs == loop->num_nodes);

  LOOP_VINFO_BBS (res) = bbs;
  LOOP_VINFO_NITERS (res) = NULL;
  LOOP_VINFO_NITERS_UNCHANGED (res) = NULL;
  LOOP_VINFO_COST_MODEL_MIN_ITERS (res) = 0;
  LOOP_VINFO_VECTORIZABLE_P (res) = 0;
  LOOP_PEELING_FOR_ALIGNMENT (res) = 0;
  LOOP_VINFO_VECT_FACTOR (res) = 0;
  LOOP_VINFO_DATAREFS (res) = VEC_alloc (data_reference_p, heap, 10);
  LOOP_VINFO_DDRS (res) = VEC_alloc (ddr_p, heap, 10 * 10);
  LOOP_VINFO_UNALIGNED_DR (res) = NULL;
  LOOP_VINFO_MAY_MISALIGN_STMTS (res) =
    VEC_alloc (gimple, heap,
               PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIGNMENT_CHECKS));
  LOOP_VINFO_MAY_ALIAS_DDRS (res) =
    VEC_alloc (ddr_p, heap,
               PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS));
  LOOP_VINFO_STRIDED_STORES (res) = VEC_alloc (gimple, heap, 10);
  LOOP_VINFO_REDUCTIONS (res) = VEC_alloc (gimple, heap, 10);
  LOOP_VINFO_SLP_INSTANCES (res) = VEC_alloc (slp_instance, heap, 10);
  LOOP_VINFO_SLP_UNROLLING_FACTOR (res) = 1;

  return res;
}

/* Function destroy_loop_vec_info.

   Free LOOP_VINFO struct, as well as all the stmt_vec_info structs of all
   the stmts in the loop.  */

void
destroy_loop_vec_info (loop_vec_info loop_vinfo, bool clean_stmts)
{
  struct loop *loop;
  basic_block *bbs;
  int nbbs;
  gimple_stmt_iterator si;
  int j;
  VEC (slp_instance, heap) *slp_instances;
  slp_instance instance;

  if (!loop_vinfo)
    return;

  loop = LOOP_VINFO_LOOP (loop_vinfo);

  bbs = LOOP_VINFO_BBS (loop_vinfo);
  nbbs = loop->num_nodes;

  if (!clean_stmts)
    {
      free (LOOP_VINFO_BBS (loop_vinfo));
      free_data_refs (LOOP_VINFO_DATAREFS (loop_vinfo));
      free_dependence_relations (LOOP_VINFO_DDRS (loop_vinfo));
      VEC_free (gimple, heap, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo));

      free (loop_vinfo);
      loop->aux = NULL;
      return;
    }

  for (j = 0; j < nbbs; j++)
    {
      basic_block bb = bbs[j];
      for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
        free_stmt_vec_info (gsi_stmt (si));

      for (si = gsi_start_bb (bb); !gsi_end_p (si); )
        {
          gimple stmt = gsi_stmt (si);
          stmt_vec_info stmt_info = vinfo_for_stmt (stmt);

          if (stmt_info)
            {
              /* Check if this is a "pattern stmt" (introduced by the
                 vectorizer during the pattern recognition pass).  */
              bool remove_stmt_p = false;
              gimple orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
              if (orig_stmt)
                {
                  stmt_vec_info orig_stmt_info = vinfo_for_stmt (orig_stmt);
                  if (orig_stmt_info
                      && STMT_VINFO_IN_PATTERN_P (orig_stmt_info))
                    remove_stmt_p = true;
                }

              /* Free stmt_vec_info.  */
              free_stmt_vec_info (stmt);

              /* Remove dead "pattern stmts".  */
              if (remove_stmt_p)
                gsi_remove (&si, true);
            }
          gsi_next (&si);
        }
    }

  free (LOOP_VINFO_BBS (loop_vinfo));
  free_data_refs (LOOP_VINFO_DATAREFS (loop_vinfo));
  free_dependence_relations (LOOP_VINFO_DDRS (loop_vinfo));
  VEC_free (gimple, heap, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo));
  VEC_free (ddr_p, heap, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo));
  slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
  for (j = 0; VEC_iterate (slp_instance, slp_instances, j, instance); j++)
    vect_free_slp_instance (instance);

  VEC_free (slp_instance, heap, LOOP_VINFO_SLP_INSTANCES (loop_vinfo));
  VEC_free (gimple, heap, LOOP_VINFO_STRIDED_STORES (loop_vinfo));
  VEC_free (gimple, heap, LOOP_VINFO_REDUCTIONS (loop_vinfo));

  free (loop_vinfo);
  loop->aux = NULL;
}

/* Function vect_analyze_loop_1.

   Apply a set of analyses on LOOP, and create a loop_vec_info struct
   for it.  The different analyses will record information in the
   loop_vec_info struct.  This is a subset of the analyses applied in
   vect_analyze_loop, to be applied on an inner-loop nested in the loop
   that is now considered for (outer-loop) vectorization.  */

static loop_vec_info
vect_analyze_loop_1 (struct loop *loop)
{
  loop_vec_info loop_vinfo;

  if (vect_print_dump_info (REPORT_DETAILS))
    fprintf (vect_dump, "===== analyze_loop_nest_1 =====");

  /* Check the CFG characteristics of the loop (nesting, entry/exit,
     etc.).  */

  loop_vinfo = vect_analyze_loop_form (loop);
  if (!loop_vinfo)
    {
      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "bad inner-loop form.");
      return NULL;
    }

  return loop_vinfo;
}

/* Function vect_analyze_loop_form.

   Verify that certain CFG restrictions hold, including:
   - the loop has a pre-header
   - the loop has a single entry and exit
   - the loop exit condition is simple enough, and the number of iterations
     can be analyzed (a countable loop).  */
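
/* For example (an illustrative sketch): "for (i = 0; i < n; i++)" is
   countable because the trip count can be computed from 'n' before the
   loop runs, whereas a pointer-chasing loop such as
   "while (p) p = p->next;" is not, and would be rejected below with
   "number of iterations cannot be computed".  */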

loop_vec_info
vect_analyze_loop_form (struct loop *loop)
{
  loop_vec_info loop_vinfo;
  gimple loop_cond;
  tree number_of_iterations = NULL;
  loop_vec_info inner_loop_vinfo = NULL;

  if (vect_print_dump_info (REPORT_DETAILS))
    fprintf (vect_dump, "=== vect_analyze_loop_form ===");

  /* Different restrictions apply when we are considering an inner-most loop,
     vs. an outer (nested) loop.
     (FORNOW. May want to relax some of these restrictions in the future).  */

  if (!loop->inner)
    {
      /* Inner-most loop.  We currently require that the number of BBs is
         exactly 2 (the header and latch).  Vectorizable inner-most loops
         look like this:

                        (pre-header)
                           |
                          header <--------+
                           | |            |
                           | +--> latch --+
                           |
                        (exit-bb)  */

      if (loop->num_nodes != 2)
        {
          if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
            fprintf (vect_dump, "not vectorized: control flow in loop.");
          return NULL;
        }

      if (empty_block_p (loop->header))
        {
          if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
            fprintf (vect_dump, "not vectorized: empty loop.");
          return NULL;
        }
    }
  else
    {
      struct loop *innerloop = loop->inner;
      edge entryedge;

      /* Nested loop.  We currently require that the loop is doubly-nested,
         contains a single inner loop, and the number of BBs is exactly 5.
         Vectorizable outer-loops look like this:

                        (pre-header)
                           |
                          header <---+
                           |         |
                          inner-loop |
                           |         |
                          tail ------+
                           |
                        (exit-bb)

         The inner-loop has the properties expected of inner-most loops
         as described above.  */

      if ((loop->inner)->inner || (loop->inner)->next)
        {
          if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
            fprintf (vect_dump, "not vectorized: multiple nested loops.");
          return NULL;
        }

      /* Analyze the inner-loop.  */
      inner_loop_vinfo = vect_analyze_loop_1 (loop->inner);
      if (!inner_loop_vinfo)
        {
          if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
            fprintf (vect_dump, "not vectorized: Bad inner loop.");
          return NULL;
        }

      if (!expr_invariant_in_loop_p (loop,
                                     LOOP_VINFO_NITERS (inner_loop_vinfo)))
        {
          if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
            fprintf (vect_dump,
                     "not vectorized: inner-loop count not invariant.");
          destroy_loop_vec_info (inner_loop_vinfo, true);
          return NULL;
        }

      if (loop->num_nodes != 5)
        {
          if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
            fprintf (vect_dump, "not vectorized: control flow in loop.");
          destroy_loop_vec_info (inner_loop_vinfo, true);
          return NULL;
        }

      gcc_assert (EDGE_COUNT (innerloop->header->preds) == 2);
      entryedge = EDGE_PRED (innerloop->header, 0);
      if (EDGE_PRED (innerloop->header, 0)->src == innerloop->latch)
        entryedge = EDGE_PRED (innerloop->header, 1);

      if (entryedge->src != loop->header
          || !single_exit (innerloop)
          || single_exit (innerloop)->dest != EDGE_PRED (loop->latch, 0)->src)
        {
          if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
            fprintf (vect_dump,
                     "not vectorized: unsupported outerloop form.");
          destroy_loop_vec_info (inner_loop_vinfo, true);
          return NULL;
        }

      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "Considering outer-loop vectorization.");
    }

  if (!single_exit (loop)
      || EDGE_COUNT (loop->header->preds) != 2)
    {
      if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
        {
          if (!single_exit (loop))
            fprintf (vect_dump, "not vectorized: multiple exits.");
          else if (EDGE_COUNT (loop->header->preds) != 2)
            fprintf (vect_dump, "not vectorized: too many incoming edges.");
        }
      if (inner_loop_vinfo)
        destroy_loop_vec_info (inner_loop_vinfo, true);
      return NULL;
    }

  /* We assume that the loop exit condition is at the end of the loop, i.e.,
     that the loop is represented as a do-while (with a proper if-guard
     before the loop if needed), where the loop header contains all the
     executable statements, and the latch is empty.  */
  if (!empty_block_p (loop->latch)
      || !gimple_seq_empty_p (phi_nodes (loop->latch)))
    {
      if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
        fprintf (vect_dump, "not vectorized: unexpected loop form.");
      if (inner_loop_vinfo)
        destroy_loop_vec_info (inner_loop_vinfo, true);
      return NULL;
    }

  /* Make sure there exists a single-predecessor exit bb:  */
  if (!single_pred_p (single_exit (loop)->dest))
    {
      edge e = single_exit (loop);
      if (!(e->flags & EDGE_ABNORMAL))
        {
          split_loop_exit_edge (e);
          if (vect_print_dump_info (REPORT_DETAILS))
            fprintf (vect_dump, "split exit edge.");
        }
      else
        {
          if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
            fprintf (vect_dump, "not vectorized: abnormal loop exit edge.");
          if (inner_loop_vinfo)
            destroy_loop_vec_info (inner_loop_vinfo, true);
          return NULL;
        }
    }

  loop_cond = vect_get_loop_niters (loop, &number_of_iterations);
  if (!loop_cond)
    {
      if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
        fprintf (vect_dump, "not vectorized: complicated exit condition.");
      if (inner_loop_vinfo)
        destroy_loop_vec_info (inner_loop_vinfo, true);
      return NULL;
    }

  if (!number_of_iterations)
    {
      if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
        fprintf (vect_dump,
                 "not vectorized: number of iterations cannot be computed.");
      if (inner_loop_vinfo)
        destroy_loop_vec_info (inner_loop_vinfo, true);
      return NULL;
    }

  if (chrec_contains_undetermined (number_of_iterations))
    {
      if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
        fprintf (vect_dump, "Infinite number of iterations.");
      if (inner_loop_vinfo)
        destroy_loop_vec_info (inner_loop_vinfo, true);
      return NULL;
    }

  if (!NITERS_KNOWN_P (number_of_iterations))
    {
      if (vect_print_dump_info (REPORT_DETAILS))
        {
          fprintf (vect_dump, "Symbolic number of iterations is ");
          print_generic_expr (vect_dump, number_of_iterations, TDF_DETAILS);
        }
    }
  else if (TREE_INT_CST_LOW (number_of_iterations) == 0)
    {
      if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
        fprintf (vect_dump, "not vectorized: number of iterations = 0.");
      if (inner_loop_vinfo)
        destroy_loop_vec_info (inner_loop_vinfo, false);
      return NULL;
    }

  loop_vinfo = new_loop_vec_info (loop);
  LOOP_VINFO_NITERS (loop_vinfo) = number_of_iterations;
  LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = number_of_iterations;

  STMT_VINFO_TYPE (vinfo_for_stmt (loop_cond)) = loop_exit_ctrl_vec_info_type;

  /* CHECKME: May want to keep it around in the future.  */
  if (inner_loop_vinfo)
    destroy_loop_vec_info (inner_loop_vinfo, false);

  gcc_assert (!loop->aux);
  loop->aux = loop_vinfo;
  return loop_vinfo;
}

/* Function vect_analyze_loop_operations.

   Scan the loop stmts and make sure they are all vectorizable.  */

static bool
vect_analyze_loop_operations (loop_vec_info loop_vinfo)
{
  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
  int nbbs = loop->num_nodes;
  gimple_stmt_iterator si;
  unsigned int vectorization_factor = 0;
  int i;
  gimple phi;
  stmt_vec_info stmt_info;
  bool need_to_vectorize = false;
  int min_profitable_iters;
  int min_scalar_loop_bound;
  unsigned int th;
  bool only_slp_in_loop = true, ok;

  if (vect_print_dump_info (REPORT_DETAILS))
    fprintf (vect_dump, "=== vect_analyze_loop_operations ===");

  gcc_assert (LOOP_VINFO_VECT_FACTOR (loop_vinfo));
  vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);

  for (i = 0; i < nbbs; i++)
    {
      basic_block bb = bbs[i];

      for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
        {
          phi = gsi_stmt (si);
          ok = true;

          stmt_info = vinfo_for_stmt (phi);
          if (vect_print_dump_info (REPORT_DETAILS))
            {
              fprintf (vect_dump, "examining phi: ");
              print_gimple_stmt (vect_dump, phi, 0, TDF_SLIM);
            }

          if (! is_loop_header_bb_p (bb))
            {
              /* Inner-loop loop-closed exit phi in outer-loop vectorization
                 (i.e., a phi in the tail of the outer-loop).
                 FORNOW: we currently don't support the case that these phis
                 are not used in the outer-loop (unless it is double
                 reduction, i.e., this phi is vect_reduction_def), because
                 this case requires us to actually do something here.  */
              if ((!STMT_VINFO_RELEVANT_P (stmt_info)
                   || STMT_VINFO_LIVE_P (stmt_info))
                  && STMT_VINFO_DEF_TYPE (stmt_info)
                     != vect_double_reduction_def)
                {
                  if (vect_print_dump_info (REPORT_DETAILS))
                    fprintf (vect_dump,
                             "Unsupported loop-closed phi in outer-loop.");
                  return false;
                }
              continue;
            }

          gcc_assert (stmt_info);

          if (STMT_VINFO_LIVE_P (stmt_info))
            {
              /* FORNOW: not yet supported.  */
              if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
                fprintf (vect_dump,
                         "not vectorized: value used after loop.");
              return false;
            }

          if (STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope
              && STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
            {
              /* A scalar-dependence cycle that we don't support.  */
              if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
                fprintf (vect_dump,
                         "not vectorized: scalar dependence cycle.");
              return false;
            }

          if (STMT_VINFO_RELEVANT_P (stmt_info))
            {
              need_to_vectorize = true;
              if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def)
                ok = vectorizable_induction (phi, NULL, NULL);
            }

          if (!ok)
            {
              if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
                {
                  fprintf (vect_dump,
                           "not vectorized: relevant phi not supported: ");
                  print_gimple_stmt (vect_dump, phi, 0, TDF_SLIM);
                }
              return false;
            }
        }

      for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
        {
          gimple stmt = gsi_stmt (si);
          stmt_vec_info stmt_info = vinfo_for_stmt (stmt);

          gcc_assert (stmt_info);

          if (!vect_analyze_stmt (stmt, &need_to_vectorize, NULL))
            return false;

          if ((STMT_VINFO_RELEVANT_P (stmt_info)
               || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
              && !PURE_SLP_STMT (stmt_info))
            /* STMT needs both SLP and loop-based vectorization.  */
            only_slp_in_loop = false;
        }
    } /* bbs */

  /* All operations in the loop are either irrelevant (deal with loop
     control, or dead), or only used outside the loop and can be moved
     out of the loop (e.g. invariants, inductions).  The loop can be
     optimized away by scalar optimizations.  We're better off not
     touching this loop.  */
  if (!need_to_vectorize)
    {
      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump,
                 "All the computation can be taken out of the loop.");
      if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
        fprintf (vect_dump,
                 "not vectorized: redundant loop. no profit to vectorize.");
      return false;
    }

  /* If all the stmts in the loop can be SLPed, we perform only SLP, and the
     vectorization factor of the loop is the unrolling factor required by
     the SLP instances.  If that unrolling factor is 1, we say that we
     perform pure SLP on the loop; cross-iteration parallelism is not
     exploited.  */
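  /* For example (illustrative numbers only): if loop-based analysis chose
     vectorization_factor == 4 and the SLP instances require an unrolling
     factor of 2, the combined factor computed below is
     least_common_multiple (4, 2) == 4; with only SLP stmts in the loop it
     would simply be 2.  */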
  if (only_slp_in_loop)
    vectorization_factor = LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo);
  else
    vectorization_factor = least_common_multiple (vectorization_factor,
                                LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo));

  LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;

  if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
      && vect_print_dump_info (REPORT_DETAILS))
    fprintf (vect_dump,
             "vectorization_factor = %d, niters = " HOST_WIDE_INT_PRINT_DEC,
             vectorization_factor, LOOP_VINFO_INT_NITERS (loop_vinfo));

  if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
      && (LOOP_VINFO_INT_NITERS (loop_vinfo) < vectorization_factor))
    {
      if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
        fprintf (vect_dump, "not vectorized: iteration count too small.");
      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "not vectorized: iteration count smaller than "
                 "vectorization factor.");
      return false;
    }

  /* Analyze cost.  Decide if worth while to vectorize.  */

  /* Once VF is set, SLP costs should be updated since the number of created
     vector stmts depends on VF.  */
  vect_update_slp_costs_according_to_vf (loop_vinfo);

  min_profitable_iters = vect_estimate_min_profitable_iters (loop_vinfo);
  LOOP_VINFO_COST_MODEL_MIN_ITERS (loop_vinfo) = min_profitable_iters;

  if (min_profitable_iters < 0)
    {
      if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
        fprintf (vect_dump,
                 "not vectorized: vectorization not profitable.");
      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "not vectorized: vector version will never be "
                 "profitable.");
      return false;
    }

  min_scalar_loop_bound = ((PARAM_VALUE (PARAM_MIN_VECT_LOOP_BOUND)
                            * vectorization_factor) - 1);

  /* Use the cost model only if it is more conservative than the user
     specified threshold.  */

  th = (unsigned) min_scalar_loop_bound;
  if (min_profitable_iters
      && (!min_scalar_loop_bound
          || min_profitable_iters > min_scalar_loop_bound))
    th = (unsigned) min_profitable_iters;

  if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
      && LOOP_VINFO_INT_NITERS (loop_vinfo) <= th)
    {
      if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
        fprintf (vect_dump, "not vectorized: vectorization not "
                 "profitable.");
      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "not vectorized: iteration count smaller than "
                 "user specified loop bound parameter or minimum "
                 "profitable iterations (whichever is more conservative).");
      return false;
    }

  if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
      || LOOP_VINFO_INT_NITERS (loop_vinfo) % vectorization_factor != 0
      || LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo))
    {
      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "epilog loop required.");
      if (!vect_can_advance_ivs_p (loop_vinfo))
        {
          if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
            fprintf (vect_dump,
                     "not vectorized: can't create epilog loop 1.");
          return false;
        }
      if (!slpeel_can_duplicate_loop_p (loop, single_exit (loop)))
        {
          if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
            fprintf (vect_dump,
                     "not vectorized: can't create epilog loop 2.");
          return false;
        }
    }

  return true;
}

/* Function vect_analyze_loop.

   Apply a set of analyses on LOOP, and create a loop_vec_info struct
   for it.  The different analyses will record information in the
   loop_vec_info struct.  */
loop_vec_info
vect_analyze_loop (struct loop *loop)
{
  bool ok;
  loop_vec_info loop_vinfo;
  int max_vf = MAX_VECTORIZATION_FACTOR;
  int min_vf = 2;

  if (vect_print_dump_info (REPORT_DETAILS))
    fprintf (vect_dump, "===== analyze_loop_nest =====");

  if (loop_outer (loop)
      && loop_vec_info_for_loop (loop_outer (loop))
      && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop
                                      (loop_outer (loop))))
    {
      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "outer-loop already vectorized.");
      return NULL;
    }

  /* Check the CFG characteristics of the loop (nesting, entry/exit,
     etc.).  */

  loop_vinfo = vect_analyze_loop_form (loop);
  if (!loop_vinfo)
    {
      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "bad loop form.");
      return NULL;
    }

  /* Find all data references in the loop (which correspond to vdefs/vuses)
     and analyze their evolution in the loop.  Also adjust the minimal
     vectorization factor according to the loads and stores.

     FORNOW: Handle only simple, array references, whose
     alignment can be forced, and aligned pointer-references.  */

  ok = vect_analyze_data_refs (loop_vinfo, NULL, &min_vf);
  if (!ok)
    {
      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "bad data references.");
      destroy_loop_vec_info (loop_vinfo, true);
      return NULL;
    }

  /* Classify all cross-iteration scalar data-flow cycles.
     Cross-iteration cycles caused by virtual phis are analyzed
     separately.  */

  vect_analyze_scalar_cycles (loop_vinfo);

  vect_pattern_recog (loop_vinfo);

  /* Data-flow analysis to detect stmts that do not need to be
     vectorized.  */

  ok = vect_mark_stmts_to_be_vectorized (loop_vinfo);
  if (!ok)
    {
      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "unexpected pattern.");
      destroy_loop_vec_info (loop_vinfo, true);
      return NULL;
    }

  /* Analyze data dependences between the data-refs in the loop
     and adjust the maximum vectorization factor according to
     the dependences.
     FORNOW: fail at the first data dependence that we encounter.  */

  ok = vect_analyze_data_ref_dependences (loop_vinfo, NULL, &max_vf);
  if (!ok
      || max_vf < min_vf)
    {
      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "bad data dependence.");
      destroy_loop_vec_info (loop_vinfo, true);
      return NULL;
    }

  ok = vect_determine_vectorization_factor (loop_vinfo);
  if (!ok)
    {
      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "can't determine vectorization factor.");
      destroy_loop_vec_info (loop_vinfo, true);
      return NULL;
    }
  if (max_vf < LOOP_VINFO_VECT_FACTOR (loop_vinfo))
    {
      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "bad data dependence.");
      destroy_loop_vec_info (loop_vinfo, true);
      return NULL;
    }

  /* Analyze the alignment of the data-refs in the loop.
     Fail if a data reference is found that cannot be vectorized.  */

  ok = vect_analyze_data_refs_alignment (loop_vinfo, NULL);
  if (!ok)
    {
      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "bad data alignment.");
      destroy_loop_vec_info (loop_vinfo, true);
      return NULL;
    }

  /* Analyze the access patterns of the data-refs in the loop (consecutive,
     complex, etc.).  FORNOW: Only handle consecutive access pattern.  */

  ok = vect_analyze_data_ref_accesses (loop_vinfo, NULL);
  if (!ok)
    {
      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "bad data access.");
      destroy_loop_vec_info (loop_vinfo, true);
      return NULL;
    }

  /* Prune the list of ddrs to be tested at run-time by versioning for alias.
     It is important to call pruning after vect_analyze_data_ref_accesses,
     since we use grouping information gathered by interleaving analysis.  */
  ok = vect_prune_runtime_alias_test_list (loop_vinfo);
  if (!ok)
    {
      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "too long list of versioning for alias "
                 "run-time tests.");
      destroy_loop_vec_info (loop_vinfo, true);
      return NULL;
    }

  /* Check the SLP opportunities in the loop, analyze and build SLP trees.  */
  ok = vect_analyze_slp (loop_vinfo, NULL);
  if (ok)
    {
      /* Decide which possible SLP instances to SLP.  */
      vect_make_slp_decision (loop_vinfo);

      /* Find stmts that need to be both vectorized and SLPed.  */
      vect_detect_hybrid_slp (loop_vinfo);
    }

  /* This pass will decide on using loop versioning and/or loop peeling in
     order to enhance the alignment of data references in the loop.  */

  ok = vect_enhance_data_refs_alignment (loop_vinfo);
  if (!ok)
    {
      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "bad data alignment.");
      destroy_loop_vec_info (loop_vinfo, true);
      return NULL;
    }

  /* Scan all the operations in the loop and make sure they are
     vectorizable.  */

  ok = vect_analyze_loop_operations (loop_vinfo);
  if (!ok)
    {
      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "bad operation or unsupported loop bound.");
      destroy_loop_vec_info (loop_vinfo, true);
      return NULL;
    }

  LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1;

  return loop_vinfo;
}

/* Function reduction_code_for_scalar_code

   Input:
   CODE - tree_code of a reduction operation.

   Output:
   REDUC_CODE - the corresponding tree-code to be used to reduce the
      vector of partial results into a single scalar result (which
      will also reside in a vector) or ERROR_MARK if the operation is
      a supported reduction operation, but does not have such a tree-code.

   Return FALSE if CODE currently cannot be vectorized as a reduction.  */
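
/* For instance (an illustrative sketch of a call, not code from this
   file): for a sum reduction the caller would pass PLUS_EXPR and get
   REDUC_PLUS_EXPR back:

     enum tree_code reduc_code;
     if (reduction_code_for_scalar_code (PLUS_EXPR, &reduc_code))
       gcc_assert (reduc_code == REDUC_PLUS_EXPR);

   whereas MULT_EXPR is accepted but yields ERROR_MARK, meaning the final
   cross-element reduction has to be generated some other way.  */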

static bool
reduction_code_for_scalar_code (enum tree_code code,
                                enum tree_code *reduc_code)
{
  switch (code)
    {
      case MAX_EXPR:
        *reduc_code = REDUC_MAX_EXPR;
        return true;

      case MIN_EXPR:
        *reduc_code = REDUC_MIN_EXPR;
        return true;

      case PLUS_EXPR:
        *reduc_code = REDUC_PLUS_EXPR;
        return true;

      case MULT_EXPR:
      case MINUS_EXPR:
      case BIT_IOR_EXPR:
      case BIT_XOR_EXPR:
      case BIT_AND_EXPR:
        *reduc_code = ERROR_MARK;
        return true;

      default:
        return false;
    }
}

/* Error reporting helper for vect_is_simple_reduction below.  GIMPLE
   statement STMT is printed with a message MSG.  */

static void
report_vect_op (gimple stmt, const char *msg)
{
  fprintf (vect_dump, "%s", msg);
  print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
}

/* Function vect_is_simple_reduction

   (1) Detect a cross-iteration def-use cycle that represents a simple
   reduction computation.  We look for the following pattern:

   loop_header:
     a1 = phi < a0, a2 >
     a3 = ...
     a2 = operation (a3, a1)

   such that:
   1. operation is commutative and associative and it is safe to
      change the order of the computation (if CHECK_REDUCTION is true)
   2. no uses for a2 in the loop (a2 is used out of the loop)
   3. no uses of a1 in the loop besides the reduction operation.

   Condition 1 is tested here.
   Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.

   (2) Detect a cross-iteration def-use cycle in nested loops, i.e.,
   nested cycles, if CHECK_REDUCTION is false.

   (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double
   reductions:

   a1 = phi < a0, a2 >
   inner loop (def of a3)
   a2 = phi < a3 >  */
1617 gimple
1618 vect_is_simple_reduction (loop_vec_info loop_info, gimple phi,
1619 bool check_reduction, bool *double_reduc)
1621 struct loop *loop = (gimple_bb (phi))->loop_father;
1622 struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
1623 edge latch_e = loop_latch_edge (loop);
1624 tree loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
1625 gimple def_stmt, def1 = NULL, def2 = NULL;
1626 enum tree_code code;
1627 tree op1, op2, op3 = NULL_TREE, op4 = NULL_TREE;
1628 tree type;
1629 int nloop_uses;
1630 tree name;
1631 imm_use_iterator imm_iter;
1632 use_operand_p use_p;
1633 bool phi_def;
1635 *double_reduc = false;
1637 /* If CHECK_REDUCTION is true, we assume inner-most loop vectorization,
1638 otherwise, we assume outer loop vectorization. */
1639 gcc_assert ((check_reduction && loop == vect_loop)
1640 || (!check_reduction && flow_loop_nested_p (vect_loop, loop)));
1642 name = PHI_RESULT (phi);
1643 nloop_uses = 0;
1644 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name)
1646 gimple use_stmt = USE_STMT (use_p);
1647 if (is_gimple_debug (use_stmt))
1648 continue;
1649 if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt))
1650 && vinfo_for_stmt (use_stmt)
1651 && !is_pattern_stmt_p (vinfo_for_stmt (use_stmt)))
1652 nloop_uses++;
1653 if (nloop_uses > 1)
1655 if (vect_print_dump_info (REPORT_DETAILS))
1656 fprintf (vect_dump, "reduction used in loop.");
1657 return NULL;
1661 if (TREE_CODE (loop_arg) != SSA_NAME)
1663 if (vect_print_dump_info (REPORT_DETAILS))
1665 fprintf (vect_dump, "reduction: not ssa_name: ");
1666 print_generic_expr (vect_dump, loop_arg, TDF_SLIM);
1668 return NULL;
1671 def_stmt = SSA_NAME_DEF_STMT (loop_arg);
1672 if (!def_stmt)
1674 if (vect_print_dump_info (REPORT_DETAILS))
1675 fprintf (vect_dump, "reduction: no def_stmt.");
1676 return NULL;
1679 if (!is_gimple_assign (def_stmt) && gimple_code (def_stmt) != GIMPLE_PHI)
1681 if (vect_print_dump_info (REPORT_DETAILS))
1682 print_gimple_stmt (vect_dump, def_stmt, 0, TDF_SLIM);
1683 return NULL;
1686 if (is_gimple_assign (def_stmt))
1688 name = gimple_assign_lhs (def_stmt);
1689 phi_def = false;
1691 else
1693 name = PHI_RESULT (def_stmt);
1694 phi_def = true;
1697 nloop_uses = 0;
1698 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name)
1700 gimple use_stmt = USE_STMT (use_p);
1701 if (is_gimple_debug (use_stmt))
1702 continue;
1703 if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt))
1704 && vinfo_for_stmt (use_stmt)
1705 && !is_pattern_stmt_p (vinfo_for_stmt (use_stmt)))
1706 nloop_uses++;
1707 if (nloop_uses > 1)
1709 if (vect_print_dump_info (REPORT_DETAILS))
1710 fprintf (vect_dump, "reduction used in loop.");
1711 return NULL;
1715 /* If DEF_STMT is a phi node itself, we expect it to have a single argument
1716 defined in the inner loop. */
1717 if (phi_def)
1719 op1 = PHI_ARG_DEF (def_stmt, 0);
1721 if (gimple_phi_num_args (def_stmt) != 1
1722 || TREE_CODE (op1) != SSA_NAME)
1724 if (vect_print_dump_info (REPORT_DETAILS))
1725 fprintf (vect_dump, "unsupported phi node definition.");
1727 return NULL;
1730 def1 = SSA_NAME_DEF_STMT (op1);
1731 if (flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
1732 && loop->inner
1733 && flow_bb_inside_loop_p (loop->inner, gimple_bb (def1))
1734 && is_gimple_assign (def1))
1736 if (vect_print_dump_info (REPORT_DETAILS))
1737 report_vect_op (def_stmt, "detected double reduction: ");
1739 *double_reduc = true;
1740 return def_stmt;
1743 return NULL;
1746 code = gimple_assign_rhs_code (def_stmt);
1748 if (check_reduction
1749 && (!commutative_tree_code (code) || !associative_tree_code (code)))
1751 if (vect_print_dump_info (REPORT_DETAILS))
1752 report_vect_op (def_stmt, "reduction: not commutative/associative: ");
1753 return NULL;
1756 if (get_gimple_rhs_class (code) != GIMPLE_BINARY_RHS)
1758 if (code != COND_EXPR)
1760 if (vect_print_dump_info (REPORT_DETAILS))
1761 report_vect_op (def_stmt, "reduction: not binary operation: ");
1763 return NULL;
1766 op3 = TREE_OPERAND (gimple_assign_rhs1 (def_stmt), 0);
1767 if (COMPARISON_CLASS_P (op3))
1769 op4 = TREE_OPERAND (op3, 1);
1770 op3 = TREE_OPERAND (op3, 0);
1773 op1 = TREE_OPERAND (gimple_assign_rhs1 (def_stmt), 1);
1774 op2 = TREE_OPERAND (gimple_assign_rhs1 (def_stmt), 2);
1776 if (TREE_CODE (op1) != SSA_NAME && TREE_CODE (op2) != SSA_NAME)
1778 if (vect_print_dump_info (REPORT_DETAILS))
1779 report_vect_op (def_stmt, "reduction: uses not ssa_names: ");
1781 return NULL;
1784 else
1786 op1 = gimple_assign_rhs1 (def_stmt);
1787 op2 = gimple_assign_rhs2 (def_stmt);
1789 if (TREE_CODE (op1) != SSA_NAME || TREE_CODE (op2) != SSA_NAME)
1791 if (vect_print_dump_info (REPORT_DETAILS))
1792 report_vect_op (def_stmt, "reduction: uses not ssa_names: ");
1794 return NULL;
1798 type = TREE_TYPE (gimple_assign_lhs (def_stmt));
1799 if ((TREE_CODE (op1) == SSA_NAME
1800 && !types_compatible_p (type,TREE_TYPE (op1)))
1801 || (TREE_CODE (op2) == SSA_NAME
1802 && !types_compatible_p (type, TREE_TYPE (op2)))
1803 || (op3 && TREE_CODE (op3) == SSA_NAME
1804 && !types_compatible_p (type, TREE_TYPE (op3)))
1805 || (op4 && TREE_CODE (op4) == SSA_NAME
1806 && !types_compatible_p (type, TREE_TYPE (op4))))
1808 if (vect_print_dump_info (REPORT_DETAILS))
1810 fprintf (vect_dump, "reduction: multiple types: operation type: ");
1811 print_generic_expr (vect_dump, type, TDF_SLIM);
1812 fprintf (vect_dump, ", operands types: ");
1813 print_generic_expr (vect_dump, TREE_TYPE (op1), TDF_SLIM);
1814 fprintf (vect_dump, ",");
1815 print_generic_expr (vect_dump, TREE_TYPE (op2), TDF_SLIM);
1816 if (op3)
1818 fprintf (vect_dump, ",");
1819 print_generic_expr (vect_dump, TREE_TYPE (op3), TDF_SLIM);
1822 if (op4)
1824 fprintf (vect_dump, ",");
1825 print_generic_expr (vect_dump, TREE_TYPE (op4), TDF_SLIM);
1829 return NULL;
1832 /* Check that it's ok to change the order of the computation.
1833 Generally, when vectorizing a reduction we change the order of the
1834 computation. This may change the behavior of the program in some
1835 cases, so we need to check that this is ok. One exception is when
1836 vectorizing an outer-loop: the inner-loop is executed sequentially,
1837 and therefore vectorizing reductions in the inner-loop during
1838 outer-loop vectorization is safe. */
1840 /* CHECKME: check for !flag_finite_math_only too? */
1841 if (SCALAR_FLOAT_TYPE_P (type) && !flag_associative_math
1842 && check_reduction)
1844 /* Changing the order of operations changes the semantics. */
1845 if (vect_print_dump_info (REPORT_DETAILS))
1846 report_vect_op (def_stmt, "reduction: unsafe fp math optimization: ");
1847 return NULL;
1849 else if (INTEGRAL_TYPE_P (type) && TYPE_OVERFLOW_TRAPS (type)
1850 && check_reduction)
1852 /* Changing the order of operations changes the semantics. */
1853 if (vect_print_dump_info (REPORT_DETAILS))
1854 report_vect_op (def_stmt, "reduction: unsafe int math optimization: ");
1855 return NULL;
1857 else if (SAT_FIXED_POINT_TYPE_P (type) && check_reduction)
1859 /* Changing the order of operations changes the semantics. */
1860 if (vect_print_dump_info (REPORT_DETAILS))
1861 report_vect_op (def_stmt,
1862 "reduction: unsafe fixed-point math optimization: ");
1863 return NULL;
1866 /* Reduction is safe. We're dealing with one of the following:
1867 1) integer arithmetic and no trapv
1868 2) floating point arithmetic, and special flags permit this optimization
1869 3) nested cycle (i.e., outer loop vectorization). */
1870 if (TREE_CODE (op1) == SSA_NAME)
1871 def1 = SSA_NAME_DEF_STMT (op1);
1873 if (TREE_CODE (op2) == SSA_NAME)
1874 def2 = SSA_NAME_DEF_STMT (op2);
1876 if (code != COND_EXPR
1877 && (!def1 || !def2 || gimple_nop_p (def1) || gimple_nop_p (def2)))
1879 if (vect_print_dump_info (REPORT_DETAILS))
1880 report_vect_op (def_stmt, "reduction: no defs for operands: ");
1881 return NULL;
1884 /* Check that one def is the reduction def, defined by PHI,
1885 the other def is either defined in the loop ("vect_internal_def"),
1886 or it's an induction (defined by a loop-header phi-node). */
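   /* For instance (hypothetical GIMPLE), for the scalar reduction
      's = a[i] + s':

        s_1 = PHI <s_0 (preheader), s_2 (latch)>
        t_3 = a[i_4];
        s_2 = t_3 + s_1;

      taking DEF_STMT to be the addition, DEF2 (the def of s_1) is the
      reduction PHI and DEF1 (the load) is a vect_internal_def inside
      the loop, so the reduction is detected without swapping
      operands.  */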
1888 if (def2 && def2 == phi
1889 && (code == COND_EXPR
1890 || (def1 && flow_bb_inside_loop_p (loop, gimple_bb (def1))
1891 && (is_gimple_assign (def1)
1892 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1))
1893 == vect_induction_def
1894 || (gimple_code (def1) == GIMPLE_PHI
1895 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1))
1896 == vect_internal_def
1897 && !is_loop_header_bb_p (gimple_bb (def1)))))))
1899 if (vect_print_dump_info (REPORT_DETAILS))
1900 report_vect_op (def_stmt, "detected reduction: ");
1901 return def_stmt;
1903 else if (def1 && def1 == phi
1904 && (code == COND_EXPR
1905 || (def2 && flow_bb_inside_loop_p (loop, gimple_bb (def2))
1906 && (is_gimple_assign (def2)
1907 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2))
1908 == vect_induction_def
1909 || (gimple_code (def2) == GIMPLE_PHI
1910 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2))
1911 == vect_internal_def
1912 && !is_loop_header_bb_p (gimple_bb (def2)))))))
1914 if (check_reduction)
1916 /* Swap operands (just for simplicity - so that the rest of the code
1917 can assume that the reduction variable is always the last (second)
1918 argument). */
1919 if (vect_print_dump_info (REPORT_DETAILS))
1920 report_vect_op (def_stmt,
1921 "detected reduction: need to swap operands: ");
1923 swap_tree_operands (def_stmt, gimple_assign_rhs1_ptr (def_stmt),
1924 gimple_assign_rhs2_ptr (def_stmt));
1926 else
1928 if (vect_print_dump_info (REPORT_DETAILS))
1929 report_vect_op (def_stmt, "detected reduction: ");
1932 return def_stmt;
1934 else
1936 if (vect_print_dump_info (REPORT_DETAILS))
1937 report_vect_op (def_stmt, "reduction: unknown pattern: ");
1939 return NULL;
1944 /* Function vect_estimate_min_profitable_iters
1946 Return the number of iterations required for the vector version of the
1947 loop to be profitable relative to the cost of the scalar version of the
1948 loop.
1950 TODO: Take profile info into account before making vectorization
1951 decisions, if available. */
1953 int
1954 vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo)
1956 int i;
1957 int min_profitable_iters;
1958 int peel_iters_prologue;
1959 int peel_iters_epilogue;
1960 int vec_inside_cost = 0;
1961 int vec_outside_cost = 0;
1962 int scalar_single_iter_cost = 0;
1963 int scalar_outside_cost = 0;
1964 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1965 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1966 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1967 int nbbs = loop->num_nodes;
1968 int byte_misalign = LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo);
1969 int peel_guard_costs = 0;
1970 int innerloop_iters = 0, factor;
1971 VEC (slp_instance, heap) *slp_instances;
1972 slp_instance instance;
1974 /* Cost model disabled. */
1975 if (!flag_vect_cost_model)
1977 if (vect_print_dump_info (REPORT_COST))
1978 fprintf (vect_dump, "cost model disabled.");
1979 return 0;
1982 /* Requires loop versioning tests to handle misalignment. */
1983 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
1985 /* FIXME: Make cost depend on complexity of individual check. */
1986 vec_outside_cost +=
1987 VEC_length (gimple, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo));
1988 if (vect_print_dump_info (REPORT_COST))
1989 fprintf (vect_dump, "cost model: Adding cost of checks for loop "
1990 "versioning to treat misalignment.\n");
1993 /* Requires loop versioning with alias checks. */
1994 if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
1996 /* FIXME: Make cost depend on complexity of individual check. */
1997 vec_outside_cost +=
1998 VEC_length (ddr_p, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo));
1999 if (vect_print_dump_info (REPORT_COST))
2000 fprintf (vect_dump, "cost model: Adding cost of checks for loop "
2001 "versioning aliasing.\n");
2004 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
2005 || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
2006 vec_outside_cost += TARG_COND_TAKEN_BRANCH_COST;
2008 /* Count statements in scalar loop. Using this as scalar cost for a single
2009 iteration for now.
2011 TODO: Add outer loop support.
2013 TODO: Consider assigning different costs to different scalar
2014 statements. */
2016 /* FORNOW. */
2017 if (loop->inner)
2018 innerloop_iters = 50; /* FIXME */
2020 for (i = 0; i < nbbs; i++)
2022 gimple_stmt_iterator si;
2023 basic_block bb = bbs[i];
2025 if (bb->loop_father == loop->inner)
2026 factor = innerloop_iters;
2027 else
2028 factor = 1;
2030 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
2032 gimple stmt = gsi_stmt (si);
2033 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2034 /* Skip stmts that are not vectorized inside the loop. */
2035 if (!STMT_VINFO_RELEVANT_P (stmt_info)
2036 && (!STMT_VINFO_LIVE_P (stmt_info)
2037 || STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def))
2038 continue;
2039 scalar_single_iter_cost += cost_for_stmt (stmt) * factor;
2040 vec_inside_cost += STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info) * factor;
2041 /* FIXME: for stmts in the inner-loop in outer-loop vectorization,
2042 some of the "outside" costs are generated inside the outer-loop. */
2043 vec_outside_cost += STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info);
2047 /* Add additional cost for the peeled instructions in prologue and epilogue
2048 loop.
2050 FORNOW: If we don't know the value of peel_iters for prologue or epilogue
2051 at compile-time, we assume it's vf/2 (the worst case would be vf-1).
2053 TODO: Build an expression that represents peel_iters for prologue and
2054 epilogue to be used in a run-time test. */
2056 if (byte_misalign < 0)
2058 peel_iters_prologue = vf/2;
2059 if (vect_print_dump_info (REPORT_COST))
2060 fprintf (vect_dump, "cost model: "
2061 "prologue peel iters set to vf/2.");
2063 /* If peeling for alignment is unknown, the loop bound of the main loop
2064 becomes unknown. */
2065 peel_iters_epilogue = vf/2;
2066 if (vect_print_dump_info (REPORT_COST))
2067 fprintf (vect_dump, "cost model: "
2068 "epilogue peel iters set to vf/2 because "
2069 "peeling for alignment is unknown .");
2071 /* If peeled iterations are unknown, count a taken branch and a not taken
2072 branch per peeled loop. Even if scalar loop iterations are known,
2073 vector iterations are not known since peeled prologue iterations are
2074 not known. Hence guards remain the same. */
2075 peel_guard_costs += 2 * (TARG_COND_TAKEN_BRANCH_COST
2076 + TARG_COND_NOT_TAKEN_BRANCH_COST);
2078 else
2080 if (byte_misalign)
2082 struct data_reference *dr = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
2083 int element_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr))));
2084 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (DR_STMT (dr)));
2085 int nelements = TYPE_VECTOR_SUBPARTS (vectype);
2087 peel_iters_prologue = nelements - (byte_misalign / element_size);
2089 else
2090 peel_iters_prologue = 0;
2092 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
2094 peel_iters_epilogue = vf/2;
2095 if (vect_print_dump_info (REPORT_COST))
2096 fprintf (vect_dump, "cost model: "
2097 "epilogue peel iters set to vf/2 because "
2098 "loop iterations are unknown .");
2100 /* If peeled iterations are known but number of scalar loop
2101 iterations are unknown, count a taken branch per peeled loop. */
2102 peel_guard_costs += 2 * TARG_COND_TAKEN_BRANCH_COST;
2105 else
2107 int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
2108 peel_iters_prologue = niters < peel_iters_prologue ?
2109 niters : peel_iters_prologue;
2110 peel_iters_epilogue = (niters - peel_iters_prologue) % vf;
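   /* Worked example with hypothetical numbers: for a V4SI access with
      element_size = 4 and byte_misalign = 8, the prologue peels
      4 - 8/4 = 2 iterations; if additionally niters = 11 and vf = 4,
      the epilogue peels (11 - 2) % 4 = 1 iteration.  */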
2114 vec_outside_cost += (peel_iters_prologue * scalar_single_iter_cost)
2115 + (peel_iters_epilogue * scalar_single_iter_cost)
2116 + peel_guard_costs;
2118 /* FORNOW: The scalar outside cost is incremented in one of the
2119 following ways:
2121 1. The vectorizer checks for alignment and aliasing and generates
2122 a condition that allows dynamic vectorization. A cost model
2123 check is ANDED with the versioning condition. Hence scalar code
2124 path now has the added cost of the versioning check.
2126 if (cost > th & versioning_check)
2127 jmp to vector code
2129 Hence run-time scalar is incremented by not-taken branch cost.
2131 2. The vectorizer then checks if a prologue is required. If the
2132 cost model check was not done before during versioning, it has to
2133 be done before the prologue check.
2135 if (cost <= th)
2136 prologue = scalar_iters
2137 if (prologue == 0)
2138 jmp to vector code
2139 else
2140 execute prologue
2141 if (prologue == num_iters)
2142 go to exit
2144 Hence the run-time scalar cost is incremented by a taken branch,
2145 plus a not-taken branch, plus a taken branch cost.
2147 3. The vectorizer then checks if an epilogue is required. If the
2148 cost model check was not done before during prologue check, it
2149 has to be done with the epilogue check.
2151 if (prologue == 0)
2152 jmp to vector code
2153 else
2154 execute prologue
2155 if (prologue == num_iters)
2156 go to exit
2157 vector code:
2158 if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
2159 jmp to epilogue
2161 Hence the run-time scalar cost should be incremented by 2 taken
2162 branches.
2164 TODO: The back end may reorder the BBS's differently and reverse
2165 conditions/branch directions. Change the estimates below to
2166 something more reasonable. */
2168 /* If the number of iterations is known and we do not do versioning, we can
2169 decide whether to vectorize at compile time. Hence the scalar version
2170 does not carry cost model guard costs. */
2171 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2172 || LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
2173 || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
2175 /* Cost model check occurs at versioning. */
2176 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
2177 || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
2178 scalar_outside_cost += TARG_COND_NOT_TAKEN_BRANCH_COST;
2179 else
2181 /* Cost model check occurs at prologue generation. */
2182 if (LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
2183 scalar_outside_cost += 2 * TARG_COND_TAKEN_BRANCH_COST
2184 + TARG_COND_NOT_TAKEN_BRANCH_COST;
2185 /* Cost model check occurs at epilogue generation. */
2186 else
2187 scalar_outside_cost += 2 * TARG_COND_TAKEN_BRANCH_COST;
2191 /* Add SLP costs. */
2192 slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
2193 for (i = 0; VEC_iterate (slp_instance, slp_instances, i, instance); i++)
2195 vec_outside_cost += SLP_INSTANCE_OUTSIDE_OF_LOOP_COST (instance);
2196 vec_inside_cost += SLP_INSTANCE_INSIDE_OF_LOOP_COST (instance);
2199 /* Calculate number of iterations required to make the vector version
2200 profitable, relative to the loop bodies only. The following condition
2201 must hold true:
2202 SIC * niters + SOC > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC
2203 where
2204 SIC = scalar iteration cost, VIC = vector iteration cost,
2205 VOC = vector outside cost, VF = vectorization factor,
2206 PL_ITERS = prologue iterations, EP_ITERS= epilogue iterations
2207 SOC = scalar outside cost for run time cost model check. */
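   /* Worked example with hypothetical costs: SIC = 4, VIC = 6, VOC = 14,
      SOC = 0, VF = 4 and no peeling give
        min_profitable_iters = (14 * 4) / (4 * 4 - 6) = 56 / 10 = 5,
      and since 4 * 4 * 5 <= 6 * 5 + 14 * 4 the value is bumped to 6
      iterations before the vector loop wins over the scalar loop.  */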
2209 if ((scalar_single_iter_cost * vf) > vec_inside_cost)
2211 if (vec_outside_cost <= 0)
2212 min_profitable_iters = 1;
2213 else
2215 min_profitable_iters = ((vec_outside_cost - scalar_outside_cost) * vf
2216 - vec_inside_cost * peel_iters_prologue
2217 - vec_inside_cost * peel_iters_epilogue)
2218 / ((scalar_single_iter_cost * vf)
2219 - vec_inside_cost);
2221 if ((scalar_single_iter_cost * vf * min_profitable_iters)
2222 <= ((vec_inside_cost * min_profitable_iters)
2223 + ((vec_outside_cost - scalar_outside_cost) * vf)))
2224 min_profitable_iters++;
2227 /* vector version will never be profitable. */
2228 else
2230 if (vect_print_dump_info (REPORT_COST))
2231 fprintf (vect_dump, "cost model: the vector iteration cost = %d "
2232 "divided by the scalar iteration cost = %d "
2233 "is greater or equal to the vectorization factor = %d.",
2234 vec_inside_cost, scalar_single_iter_cost, vf);
2235 return -1;
2238 if (vect_print_dump_info (REPORT_COST))
2240 fprintf (vect_dump, "Cost model analysis: \n");
2241 fprintf (vect_dump, " Vector inside of loop cost: %d\n",
2242 vec_inside_cost);
2243 fprintf (vect_dump, " Vector outside of loop cost: %d\n",
2244 vec_outside_cost);
2245 fprintf (vect_dump, " Scalar iteration cost: %d\n",
2246 scalar_single_iter_cost);
2247 fprintf (vect_dump, " Scalar outside cost: %d\n", scalar_outside_cost);
2248 fprintf (vect_dump, " prologue iterations: %d\n",
2249 peel_iters_prologue);
2250 fprintf (vect_dump, " epilogue iterations: %d\n",
2251 peel_iters_epilogue);
2252 fprintf (vect_dump, " Calculated minimum iters for profitability: %d\n",
2253 min_profitable_iters);
2256 min_profitable_iters =
2257 min_profitable_iters < vf ? vf : min_profitable_iters;
2259 /* Because the condition we create is:
2260 if (niters <= min_profitable_iters)
2261 then skip the vectorized loop. */
2262 min_profitable_iters--;
2264 if (vect_print_dump_info (REPORT_COST))
2265 fprintf (vect_dump, " Profitability threshold = %d\n",
2266 min_profitable_iters);
2268 return min_profitable_iters;
2272 /* TODO: Close dependency between vect_model_*_cost and vectorizable_*
2273 functions. Design better to avoid maintenance issues. */
2275 /* Function vect_model_reduction_cost.
2277 Models cost for a reduction operation, including the vector ops
2278 generated within the strip-mine loop, the initial definition before
2279 the loop, and the epilogue code that must be generated. */
2281 static bool
2282 vect_model_reduction_cost (stmt_vec_info stmt_info, enum tree_code reduc_code,
2283 int ncopies)
2285 int outer_cost = 0;
2286 enum tree_code code;
2287 optab optab;
2288 tree vectype;
2289 gimple stmt, orig_stmt;
2290 tree reduction_op;
2291 enum machine_mode mode;
2292 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2293 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2296 /* Cost of reduction op inside loop. */
2297 STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info) += ncopies * TARG_VEC_STMT_COST;
2299 stmt = STMT_VINFO_STMT (stmt_info);
2301 switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
2303 case GIMPLE_SINGLE_RHS:
2304 gcc_assert (TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt)) == ternary_op);
2305 reduction_op = TREE_OPERAND (gimple_assign_rhs1 (stmt), 2);
2306 break;
2307 case GIMPLE_UNARY_RHS:
2308 reduction_op = gimple_assign_rhs1 (stmt);
2309 break;
2310 case GIMPLE_BINARY_RHS:
2311 reduction_op = gimple_assign_rhs2 (stmt);
2312 break;
2313 default:
2314 gcc_unreachable ();
2317 vectype = get_vectype_for_scalar_type (TREE_TYPE (reduction_op));
2318 if (!vectype)
2320 if (vect_print_dump_info (REPORT_COST))
2322 fprintf (vect_dump, "unsupported data-type ");
2323 print_generic_expr (vect_dump, TREE_TYPE (reduction_op), TDF_SLIM);
2325 return false;
2328 mode = TYPE_MODE (vectype);
2329 orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
2331 if (!orig_stmt)
2332 orig_stmt = STMT_VINFO_STMT (stmt_info);
2334 code = gimple_assign_rhs_code (orig_stmt);
2336 /* Add in cost for initial definition. */
2337 outer_cost += TARG_SCALAR_TO_VEC_COST;
2339 /* Determine cost of epilogue code.
2341 We have a reduction operator that will reduce the vector in one statement.
2342 Also requires scalar extract. */
2344 if (!nested_in_vect_loop_p (loop, orig_stmt))
2346 if (reduc_code != ERROR_MARK)
2347 outer_cost += TARG_VEC_STMT_COST + TARG_VEC_TO_SCALAR_COST;
2348 else
2350 int vec_size_in_bits = tree_low_cst (TYPE_SIZE (vectype), 1);
2351 tree bitsize =
2352 TYPE_SIZE (TREE_TYPE (gimple_assign_lhs (orig_stmt)));
2353 int element_bitsize = tree_low_cst (bitsize, 1);
2354 int nelements = vec_size_in_bits / element_bitsize;
2356 optab = optab_for_tree_code (code, vectype, optab_default);
2358 /* We have a whole vector shift available. */
2359 if (VECTOR_MODE_P (mode)
2360 && optab_handler (optab, mode)->insn_code != CODE_FOR_nothing
2361 && optab_handler (vec_shr_optab, mode)->insn_code != CODE_FOR_nothing)
2362 /* Final reduction via vector shifts and the reduction operator. Also
2363 requires scalar extract. */
2364 outer_cost += ((exact_log2(nelements) * 2) * TARG_VEC_STMT_COST
2365 + TARG_VEC_TO_SCALAR_COST);
2366 else
2367 /* Use extracts and reduction op for final reduction. For N elements,
2368 we have N extracts and N-1 reduction ops. */
2369 outer_cost += ((nelements + nelements - 1) * TARG_VEC_STMT_COST);
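   /* For instance (hypothetical), with nelements = 8 the shift scheme
      costs exact_log2 (8) * 2 = 6 vector stmts plus one vec_to_scalar
      extract, whereas the extract scheme costs 8 + 7 = 15 vector
      stmts.  */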
2373 STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info) = outer_cost;
2375 if (vect_print_dump_info (REPORT_COST))
2376 fprintf (vect_dump, "vect_model_reduction_cost: inside_cost = %d, "
2377 "outside_cost = %d .", STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info),
2378 STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info));
2380 return true;
2384 /* Function vect_model_induction_cost.
2386 Models cost for induction operations. */
2388 static void
2389 vect_model_induction_cost (stmt_vec_info stmt_info, int ncopies)
2391 /* loop cost for vec_loop. */
2392 STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info) = ncopies * TARG_VEC_STMT_COST;
2393 /* prologue cost for vec_init and vec_step. */
2394 STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info) = 2 * TARG_SCALAR_TO_VEC_COST;
2396 if (vect_print_dump_info (REPORT_COST))
2397 fprintf (vect_dump, "vect_model_induction_cost: inside_cost = %d, "
2398 "outside_cost = %d .", STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info),
2399 STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info));
2403 /* Function get_initial_def_for_induction
2405 Input:
2406 STMT - a stmt that performs an induction operation in the loop.
2407 IV_PHI - the initial value of the induction variable
2409 Output:
2410 Return a vector variable, initialized with the first VF values of
2411 the induction variable. E.g., for an iv with IV_PHI='X' and
2412 evolution S, for a vector of 4 units, we want to return:
2413 [X, X + S, X + 2*S, X + 3*S]. */
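/* A minimal standalone sketch (hypothetical helper, modeled on int
   lanes; not part of the vectorizer) of the two vectors that
   get_initial_def_for_induction builds for an iv with initial value X
   and step S:  */

static void
example_induction_vectors (int x, int s, int vf, int nunits,
                           int *vec_init, int *vec_step)
{
  int i;

  for (i = 0; i < nunits; i++)
    {
      /* vec_init = [X, X+S, ..., X+(NUNITS-1)*S].  */
      vec_init[i] = x + i * s;
      /* vec_step = [VF*S, VF*S, ..., VF*S] (the whole-vector step).  */
      vec_step[i] = vf * s;
    }
}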
2415 static tree
2416 get_initial_def_for_induction (gimple iv_phi)
2418 stmt_vec_info stmt_vinfo = vinfo_for_stmt (iv_phi);
2419 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
2420 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2421 tree scalar_type = TREE_TYPE (gimple_phi_result (iv_phi));
2422 tree vectype;
2423 int nunits;
2424 edge pe = loop_preheader_edge (loop);
2425 struct loop *iv_loop;
2426 basic_block new_bb;
2427 tree vec, vec_init, vec_step, t;
2428 tree access_fn;
2429 tree new_var;
2430 tree new_name;
2431 gimple init_stmt, induction_phi, new_stmt;
2432 tree induc_def, vec_def, vec_dest;
2433 tree init_expr, step_expr;
2434 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2435 int i;
2436 bool ok;
2437 int ncopies;
2438 tree expr;
2439 stmt_vec_info phi_info = vinfo_for_stmt (iv_phi);
2440 bool nested_in_vect_loop = false;
2441 gimple_seq stmts = NULL;
2442 imm_use_iterator imm_iter;
2443 use_operand_p use_p;
2444 gimple exit_phi;
2445 edge latch_e;
2446 tree loop_arg;
2447 gimple_stmt_iterator si;
2448 basic_block bb = gimple_bb (iv_phi);
2449 tree stepvectype;
2451 vectype = get_vectype_for_scalar_type (scalar_type);
2452 gcc_assert (vectype);
2453 nunits = TYPE_VECTOR_SUBPARTS (vectype);
2454 ncopies = vf / nunits;
2456 gcc_assert (phi_info);
2457 gcc_assert (ncopies >= 1);
2459 /* Find the first insertion point in the BB. */
2460 si = gsi_after_labels (bb);
2462 if (INTEGRAL_TYPE_P (scalar_type))
2463 step_expr = build_int_cst (scalar_type, 0);
2464 else if (POINTER_TYPE_P (scalar_type))
2465 step_expr = build_int_cst (sizetype, 0);
2466 else
2467 step_expr = build_real (scalar_type, dconst0);
2469 /* Is phi in an inner-loop, while vectorizing an enclosing outer-loop? */
2470 if (nested_in_vect_loop_p (loop, iv_phi))
2472 nested_in_vect_loop = true;
2473 iv_loop = loop->inner;
2475 else
2476 iv_loop = loop;
2477 gcc_assert (iv_loop == (gimple_bb (iv_phi))->loop_father);
2479 latch_e = loop_latch_edge (iv_loop);
2480 loop_arg = PHI_ARG_DEF_FROM_EDGE (iv_phi, latch_e);
2482 access_fn = analyze_scalar_evolution (iv_loop, PHI_RESULT (iv_phi));
2483 gcc_assert (access_fn);
2484 ok = vect_is_simple_iv_evolution (iv_loop->num, access_fn,
2485 &init_expr, &step_expr);
2486 gcc_assert (ok);
2487 pe = loop_preheader_edge (iv_loop);
2489 /* Create the vector that holds the initial_value of the induction. */
2490 if (nested_in_vect_loop)
2492 /* iv_loop is nested in the loop to be vectorized. init_expr has already
2493 been created during vectorization of previous stmts; we obtain it from
2494 the STMT_VINFO_VEC_STMT of the defining stmt. */
2495 tree iv_def = PHI_ARG_DEF_FROM_EDGE (iv_phi,
2496 loop_preheader_edge (iv_loop));
2497 vec_init = vect_get_vec_def_for_operand (iv_def, iv_phi, NULL);
2499 else
2501 /* iv_loop is the loop to be vectorized. Create:
2502 vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr) */
2503 new_var = vect_get_new_vect_var (scalar_type, vect_scalar_var, "var_");
2504 add_referenced_var (new_var);
2506 new_name = force_gimple_operand (init_expr, &stmts, false, new_var);
2507 if (stmts)
2509 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
2510 gcc_assert (!new_bb);
2513 t = NULL_TREE;
2514 t = tree_cons (NULL_TREE, init_expr, t);
2515 for (i = 1; i < nunits; i++)
2517 /* Create: new_name_i = new_name + step_expr */
2518 enum tree_code code = POINTER_TYPE_P (scalar_type)
2519 ? POINTER_PLUS_EXPR : PLUS_EXPR;
2520 init_stmt = gimple_build_assign_with_ops (code, new_var,
2521 new_name, step_expr);
2522 new_name = make_ssa_name (new_var, init_stmt);
2523 gimple_assign_set_lhs (init_stmt, new_name);
2525 new_bb = gsi_insert_on_edge_immediate (pe, init_stmt);
2526 gcc_assert (!new_bb);
2528 if (vect_print_dump_info (REPORT_DETAILS))
2530 fprintf (vect_dump, "created new init_stmt: ");
2531 print_gimple_stmt (vect_dump, init_stmt, 0, TDF_SLIM);
2533 t = tree_cons (NULL_TREE, new_name, t);
2535 /* Create a vector from [new_name_0, new_name_1, ..., new_name_nunits-1] */
2536 vec = build_constructor_from_list (vectype, nreverse (t));
2537 vec_init = vect_init_vector (iv_phi, vec, vectype, NULL);
2541 /* Create the vector that holds the step of the induction. */
2542 if (nested_in_vect_loop)
2543 /* iv_loop is nested in the loop to be vectorized. Generate:
2544 vec_step = [S, S, S, S] */
2545 new_name = step_expr;
2546 else
2548 /* iv_loop is the loop to be vectorized. Generate:
2549 vec_step = [VF*S, VF*S, VF*S, VF*S] */
2550 expr = build_int_cst (TREE_TYPE (step_expr), vf);
2551 new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
2552 expr, step_expr);
2555 t = NULL_TREE;
2556 for (i = 0; i < nunits; i++)
2557 t = tree_cons (NULL_TREE, unshare_expr (new_name), t);
2558 gcc_assert (CONSTANT_CLASS_P (new_name));
2559 stepvectype = get_vectype_for_scalar_type (TREE_TYPE (new_name));
2560 gcc_assert (stepvectype);
2561 vec = build_vector (stepvectype, t);
2562 vec_step = vect_init_vector (iv_phi, vec, stepvectype, NULL);
2565 /* Create the following def-use cycle:
2566 loop prolog:
2567 vec_init = ...
2568 vec_step = ...
2569 loop:
2570 vec_iv = PHI <vec_init, vec_loop>
2572 STMT
2574 vec_loop = vec_iv + vec_step; */
2576 /* Create the induction-phi that defines the induction-operand. */
2577 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
2578 add_referenced_var (vec_dest);
2579 induction_phi = create_phi_node (vec_dest, iv_loop->header);
2580 set_vinfo_for_stmt (induction_phi,
2581 new_stmt_vec_info (induction_phi, loop_vinfo, NULL));
2582 induc_def = PHI_RESULT (induction_phi);
2584 /* Create the iv update inside the loop */
2585 new_stmt = gimple_build_assign_with_ops (PLUS_EXPR, vec_dest,
2586 induc_def, vec_step);
2587 vec_def = make_ssa_name (vec_dest, new_stmt);
2588 gimple_assign_set_lhs (new_stmt, vec_def);
2589 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
2590 set_vinfo_for_stmt (new_stmt, new_stmt_vec_info (new_stmt, loop_vinfo,
2591 NULL));
2593 /* Set the arguments of the phi node: */
2594 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
2595 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
2596 UNKNOWN_LOCATION);
2599 /* In case the vectorization factor (VF) is bigger than the number
2600 of elements that we can fit in a vectype (nunits), we have to generate
2601 more than one vector stmt - i.e., we need to "unroll" the
2602 vector stmt by a factor VF/nunits. For more details see documentation
2603 in vectorizable_operation. */
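   /* For instance (hypothetical), VF = 8 with V4SI (nunits = 4) gives
      ncopies = 2: copy 0 carries [X, X+S, X+2*S, X+3*S] and copy 1 is
      copy 0 plus [4*S, 4*S, 4*S, 4*S], i.e. [X+4*S, ..., X+7*S].  */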
2605 if (ncopies > 1)
2607 stmt_vec_info prev_stmt_vinfo;
2608 /* FORNOW. This restriction should be relaxed. */
2609 gcc_assert (!nested_in_vect_loop);
2611 /* Create the vector that holds the step of the induction. */
2612 expr = build_int_cst (TREE_TYPE (step_expr), nunits);
2613 new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
2614 expr, step_expr);
2615 t = NULL_TREE;
2616 for (i = 0; i < nunits; i++)
2617 t = tree_cons (NULL_TREE, unshare_expr (new_name), t);
2618 gcc_assert (CONSTANT_CLASS_P (new_name));
2619 vec = build_vector (stepvectype, t);
2620 vec_step = vect_init_vector (iv_phi, vec, stepvectype, NULL);
2622 vec_def = induc_def;
2623 prev_stmt_vinfo = vinfo_for_stmt (induction_phi);
2624 for (i = 1; i < ncopies; i++)
2626 /* vec_i = vec_prev + vec_step */
2627 new_stmt = gimple_build_assign_with_ops (PLUS_EXPR, vec_dest,
2628 vec_def, vec_step);
2629 vec_def = make_ssa_name (vec_dest, new_stmt);
2630 gimple_assign_set_lhs (new_stmt, vec_def);
2632 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
2633 set_vinfo_for_stmt (new_stmt,
2634 new_stmt_vec_info (new_stmt, loop_vinfo, NULL));
2635 STMT_VINFO_RELATED_STMT (prev_stmt_vinfo) = new_stmt;
2636 prev_stmt_vinfo = vinfo_for_stmt (new_stmt);
2640 if (nested_in_vect_loop)
2642 /* Find the loop-closed exit-phi of the induction, and record
2643 the final vector of induction results: */
2644 exit_phi = NULL;
2645 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
2647 if (!flow_bb_inside_loop_p (iv_loop, gimple_bb (USE_STMT (use_p))))
2649 exit_phi = USE_STMT (use_p);
2650 break;
2653 if (exit_phi)
2655 stmt_vec_info stmt_vinfo = vinfo_for_stmt (exit_phi);
2656 /* FORNOW. Currently not supporting the case that an inner-loop induction
2657 is not used in the outer-loop (i.e. only outside the outer-loop). */
2658 gcc_assert (STMT_VINFO_RELEVANT_P (stmt_vinfo)
2659 && !STMT_VINFO_LIVE_P (stmt_vinfo));
2661 STMT_VINFO_VEC_STMT (stmt_vinfo) = new_stmt;
2662 if (vect_print_dump_info (REPORT_DETAILS))
2664 fprintf (vect_dump, "vector of inductions after inner-loop:");
2665 print_gimple_stmt (vect_dump, new_stmt, 0, TDF_SLIM);
2671 if (vect_print_dump_info (REPORT_DETAILS))
2673 fprintf (vect_dump, "transform induction: created def-use cycle: ");
2674 print_gimple_stmt (vect_dump, induction_phi, 0, TDF_SLIM);
2675 fprintf (vect_dump, "\n");
2676 print_gimple_stmt (vect_dump, SSA_NAME_DEF_STMT (vec_def), 0, TDF_SLIM);
2679 STMT_VINFO_VEC_STMT (phi_info) = induction_phi;
2680 return induc_def;
2684 /* Function get_initial_def_for_reduction
2686 Input:
2687 STMT - a stmt that performs a reduction operation in the loop.
2688 INIT_VAL - the initial value of the reduction variable
2690 Output:
2691 ADJUSTMENT_DEF - a tree that holds a value to be added to the final result
2692 of the reduction (used for adjusting the epilog - see below).
2693 Return a vector variable, initialized according to the operation that STMT
2694 performs. This vector will be used as the initial value of the
2695 vector of partial results.
2697 Option1 (adjust in epilog): Initialize the vector as follows:
2698 add/bit or/xor: [0,0,...,0,0]
2699 mult/bit and: [1,1,...,1,1]
2700 min/max/cond_expr: [init_val,init_val,..,init_val,init_val]
2701 and when necessary (e.g. add/mult case) let the caller know
2702 that it needs to adjust the result by init_val.
2704 Option2: Initialize the vector as follows:
2705 add/bit or/xor: [init_val,0,0,...,0]
2706 mult/bit and: [init_val,1,1,...,1]
2707 min/max/cond_expr: [init_val,init_val,...,init_val]
2708 and no adjustments are needed.
2710 For example, for the following code:
2712 s = init_val;
2713 for (i=0;i<n;i++)
2714 s = s + a[i];
2716 STMT is 's = s + a[i]', and the reduction variable is 's'.
2717 For a vector of 4 units, we want to return either [0,0,0,init_val],
2718 or [0,0,0,0] and let the caller know that it needs to adjust
2719 the result at the end by 'init_val'.
2721 FORNOW, we are using the 'adjust in epilog' scheme, because this way the
2722 initialization vector is simpler (same element in all entries), if
2723 ADJUSTMENT_DEF is not NULL, and Option2 otherwise.
2725 A cost model should help decide between these two schemes. */
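/* A standalone sketch (hypothetical helper, PLUS over ints only; not
   part of the vectorizer) of the two initialization schemes described
   above:  */

static void
example_plus_reduction_init (int init_val, int nunits, int *vec,
                             int *adjustment)
{
  int i;

  /* The neutral element of PLUS is 0 (it would be 1 for MULT).  */
  for (i = 0; i < nunits; i++)
    vec[i] = 0;

  if (adjustment)
    *adjustment = init_val;   /* Option1: adjust in the epilog.  */
  else
    vec[0] = init_val;        /* Option2: no adjustment needed.  */
}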
2727 tree
2728 get_initial_def_for_reduction (gimple stmt, tree init_val,
2729 tree *adjustment_def)
2731 stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
2732 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
2733 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2734 tree scalar_type = TREE_TYPE (init_val);
2735 tree vectype = get_vectype_for_scalar_type (scalar_type);
2736 int nunits;
2737 enum tree_code code = gimple_assign_rhs_code (stmt);
2738 tree def_for_init;
2739 tree init_def;
2740 tree t = NULL_TREE;
2741 int i;
2742 bool nested_in_vect_loop = false;
2743 tree init_value;
2744 REAL_VALUE_TYPE real_init_val = dconst0;
2745 int int_init_val = 0;
2746 gimple def_stmt = NULL;
2748 gcc_assert (vectype);
2749 nunits = TYPE_VECTOR_SUBPARTS (vectype);
2751 gcc_assert (POINTER_TYPE_P (scalar_type) || INTEGRAL_TYPE_P (scalar_type)
2752 || SCALAR_FLOAT_TYPE_P (scalar_type));
2754 if (nested_in_vect_loop_p (loop, stmt))
2755 nested_in_vect_loop = true;
2756 else
2757 gcc_assert (loop == (gimple_bb (stmt))->loop_father);
2759 /* In case of double reduction we only create a vector variable to be put
2760 in the reduction phi node. The actual statement creation is done in
2761 vect_create_epilog_for_reduction. */
2762 if (adjustment_def && nested_in_vect_loop
2763 && TREE_CODE (init_val) == SSA_NAME
2764 && (def_stmt = SSA_NAME_DEF_STMT (init_val))
2765 && gimple_code (def_stmt) == GIMPLE_PHI
2766 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2767 && vinfo_for_stmt (def_stmt)
2768 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2769 == vect_double_reduction_def)
2771 *adjustment_def = NULL;
2772 return vect_create_destination_var (init_val, vectype);
2775 if (TREE_CONSTANT (init_val))
2777 if (SCALAR_FLOAT_TYPE_P (scalar_type))
2778 init_value = build_real (scalar_type, TREE_REAL_CST (init_val));
2779 else
2780 init_value = build_int_cst (scalar_type, TREE_INT_CST_LOW (init_val));
2782 else
2783 init_value = init_val;
2785 switch (code)
2787 case WIDEN_SUM_EXPR:
2788 case DOT_PROD_EXPR:
2789 case PLUS_EXPR:
2790 case MINUS_EXPR:
2791 case BIT_IOR_EXPR:
2792 case BIT_XOR_EXPR:
2793 case MULT_EXPR:
2794 case BIT_AND_EXPR:
2795 /* ADJUSTMENT_DEF is NULL when called from
2796 vect_create_epilog_for_reduction to vectorize double reduction. */
2797 if (adjustment_def)
2799 if (nested_in_vect_loop)
2800 *adjustment_def = vect_get_vec_def_for_operand (init_val, stmt,
2801 NULL);
2802 else
2803 *adjustment_def = init_val;
2806 if (code == MULT_EXPR || code == BIT_AND_EXPR)
2808 real_init_val = dconst1;
2809 int_init_val = 1;
2812 if (SCALAR_FLOAT_TYPE_P (scalar_type))
2813 def_for_init = build_real (scalar_type, real_init_val);
2814 else
2815 def_for_init = build_int_cst (scalar_type, int_init_val);
2817 /* Create a vector of '0' or '1' in all but the first element. */
2818 for (i = nunits - 2; i >= 0; --i)
2819 t = tree_cons (NULL_TREE, def_for_init, t);
2821 /* Option1: the first element is '0' or '1' as well. */
2822 if (adjustment_def)
2824 t = tree_cons (NULL_TREE, def_for_init, t);
2825 init_def = build_vector (vectype, t);
2826 break;
2829 /* Option2: the first element is INIT_VAL. */
2830 t = tree_cons (NULL_TREE, init_value, t);
2831 if (TREE_CONSTANT (init_val))
2832 init_def = build_vector (vectype, t);
2833 else
2834 init_def = build_constructor_from_list (vectype, t);
2836 break;
2838 case MIN_EXPR:
2839 case MAX_EXPR:
2840 case COND_EXPR:
2841 if (adjustment_def)
2843 *adjustment_def = NULL_TREE;
2844 init_def = vect_get_vec_def_for_operand (init_val, stmt, NULL);
2845 break;
2848 for (i = nunits - 1; i >= 0; --i)
2849 t = tree_cons (NULL_TREE, init_value, t);
2851 if (TREE_CONSTANT (init_val))
2852 init_def = build_vector (vectype, t);
2853 else
2854 init_def = build_constructor_from_list (vectype, t);
2856 break;
2858 default:
2859 gcc_unreachable ();
2862 return init_def;
2866 /* Function vect_create_epilog_for_reduction
2868 Create code at the loop-epilog to finalize the result of a reduction
2869 computation.
2871 VECT_DEFS is list of vector of partial results, i.e., the lhs's of vector
2872 reduction statements.
2873 STMT is the scalar reduction stmt that is being vectorized.
2874 NCOPIES is > 1 in case the vectorization factor (VF) is bigger than the
2875 number of elements that we can fit in a vectype (nunits). In this case
2876 we have to generate more than one vector stmt - i.e., we need to "unroll"
2877 the vector stmt by a factor VF/nunits. For more details see documentation
2878 in vectorizable_operation.
2879 REDUC_CODE is the tree-code for the epilog reduction.
2880 REDUCTION_PHIS is a list of the phi-nodes that carry the reduction
2881 computation.
2882 REDUC_INDEX is the index of the operand in the right hand side of the
2883 statement that is defined by REDUCTION_PHI.
2884 DOUBLE_REDUC is TRUE if double reduction phi nodes should be handled.
2885 SLP_NODE is an SLP node containing a group of reduction statements. The
2886 first one in this group is STMT.
2888 This function:
2889 1. Creates the reduction def-use cycles: sets the arguments for
2890 REDUCTION_PHIS:
2891 The loop-entry argument is the vectorized initial-value of the reduction.
2892 The loop-latch argument is taken from VECT_DEFS - the vector of partial
2893 sums.
2894 2. "Reduces" each vector of partial results VECT_DEFS into a single result,
2895 by applying the operation specified by REDUC_CODE if available, or by
2896 other means (whole-vector shifts or a scalar loop).
2897 The function also creates a new phi node at the loop exit to preserve
2898 loop-closed form, as illustrated below.
2900 The flow at the entry to this function:
2902 loop:
2903 vec_def = phi <null, null> # REDUCTION_PHI
2904 VECT_DEF = vector_stmt # vectorized form of STMT
2905 s_loop = scalar_stmt # (scalar) STMT
2906 loop_exit:
2907 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
2908 use <s_out0>
2909 use <s_out0>
2911 The above is transformed by this function into:
2913 loop:
2914 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
2915 VECT_DEF = vector_stmt # vectorized form of STMT
2916 s_loop = scalar_stmt # (scalar) STMT
2917 loop_exit:
2918 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
2919 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
2920 v_out2 = reduce <v_out1>
2921 s_out3 = extract_field <v_out2, 0>
2922 s_out4 = adjust_result <s_out3>
2923 use <s_out4>
2924 use <s_out4>
2927 static void
2928 vect_create_epilog_for_reduction (VEC (tree, heap) *vect_defs, gimple stmt,
2929 int ncopies, enum tree_code reduc_code,
2930 VEC (gimple, heap) *reduction_phis,
2931 int reduc_index, bool double_reduc,
2932 slp_tree slp_node)
2934 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2935 stmt_vec_info prev_phi_info;
2936 tree vectype;
2937 enum machine_mode mode;
2938 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2939 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *outer_loop = NULL;
2940 basic_block exit_bb;
2941 tree scalar_dest;
2942 tree scalar_type;
2943 gimple new_phi = NULL, phi;
2944 gimple_stmt_iterator exit_gsi;
2945 tree vec_dest;
2946 tree new_temp = NULL_TREE, new_dest, new_name, new_scalar_dest;
2947 gimple epilog_stmt = NULL;
2948 enum tree_code code = gimple_assign_rhs_code (stmt);
2949 gimple exit_phi;
2950 tree bitsize, bitpos;
2951 tree adjustment_def = NULL;
2952 tree vec_initial_def = NULL;
2953 tree reduction_op, expr, def;
2954 tree orig_name, scalar_result;
2955 imm_use_iterator imm_iter;
2956 use_operand_p use_p;
2957 bool extract_scalar_result = false;
2958 gimple use_stmt, orig_stmt, reduction_phi = NULL;
2959 bool nested_in_vect_loop = false;
2960 VEC (gimple, heap) *new_phis = NULL;
2961 enum vect_def_type dt = vect_unknown_def_type;
2962 int j, i;
2963 VEC (tree, heap) *scalar_results = NULL;
2964 unsigned int group_size = 1, k, ratio;
2965 VEC (tree, heap) *vec_initial_defs = NULL;
2966 VEC (gimple, heap) *phis;
2968 if (slp_node)
2969 group_size = VEC_length (gimple, SLP_TREE_SCALAR_STMTS (slp_node));
2971 if (nested_in_vect_loop_p (loop, stmt))
2973 outer_loop = loop;
2974 loop = loop->inner;
2975 nested_in_vect_loop = true;
2976 gcc_assert (!slp_node);
2979 switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
2981 case GIMPLE_SINGLE_RHS:
2982 gcc_assert (TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt))
2983 == ternary_op);
2984 reduction_op = TREE_OPERAND (gimple_assign_rhs1 (stmt), reduc_index);
2985 break;
2986 case GIMPLE_UNARY_RHS:
2987 reduction_op = gimple_assign_rhs1 (stmt);
2988 break;
2989 case GIMPLE_BINARY_RHS:
2990 reduction_op = reduc_index ?
2991 gimple_assign_rhs2 (stmt) : gimple_assign_rhs1 (stmt);
2992 break;
2993 default:
2994 gcc_unreachable ();
2997 vectype = get_vectype_for_scalar_type (TREE_TYPE (reduction_op));
2998 gcc_assert (vectype);
2999 mode = TYPE_MODE (vectype);
3001 /* 1. Create the reduction def-use cycle:
3002 Set the arguments of REDUCTION_PHIS, i.e., transform
3004 loop:
3005 vec_def = phi <null, null> # REDUCTION_PHI
3006 VECT_DEF = vector_stmt # vectorized form of STMT
3009 into:
3011 loop:
3012 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
3013 VECT_DEF = vector_stmt # vectorized form of STMT
3016 (in case of SLP, do it for all the phis). */
3018 /* Get the loop-entry arguments. */
3019 if (slp_node)
3020 vect_get_slp_defs (slp_node, &vec_initial_defs, NULL, reduc_index);
3021 else
3023 vec_initial_defs = VEC_alloc (tree, heap, 1);
3024 /* For the case of reduction, vect_get_vec_def_for_operand returns
3025 the scalar def before the loop, that defines the initial value
3026 of the reduction variable. */
3027 vec_initial_def = vect_get_vec_def_for_operand (reduction_op, stmt,
3028 &adjustment_def);
3029 VEC_quick_push (tree, vec_initial_defs, vec_initial_def);
3032 /* Set phi nodes arguments. */
3033 for (i = 0; VEC_iterate (gimple, reduction_phis, i, phi); i++)
3035 tree vec_init_def = VEC_index (tree, vec_initial_defs, i);
3036 tree def = VEC_index (tree, vect_defs, i);
3037 for (j = 0; j < ncopies; j++)
3039 /* Set the loop-entry arg of the reduction-phi. */
3040 add_phi_arg (phi, vec_init_def, loop_preheader_edge (loop),
3041 UNKNOWN_LOCATION);
3043 /* Set the loop-latch arg for the reduction-phi. */
3044 if (j > 0)
3045 def = vect_get_vec_def_for_stmt_copy (vect_unknown_def_type, def);
3047 add_phi_arg (phi, def, loop_latch_edge (loop), UNKNOWN_LOCATION);
3049 if (vect_print_dump_info (REPORT_DETAILS))
3051 fprintf (vect_dump, "transform reduction: created def-use"
3052 " cycle: ");
3053 print_gimple_stmt (vect_dump, phi, 0, TDF_SLIM);
3054 fprintf (vect_dump, "\n");
3055 print_gimple_stmt (vect_dump, SSA_NAME_DEF_STMT (def), 0,
3056 TDF_SLIM);
3059 phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi));
3063 VEC_free (tree, heap, vec_initial_defs);
3065 /* 2. Create epilog code.
3066 The reduction epilog code operates across the elements of the vector
3067 of partial results computed by the vectorized loop.
3068 The reduction epilog code consists of:
3070 step 1: compute the scalar result in a vector (v_out2)
3071 step 2: extract the scalar result (s_out3) from the vector (v_out2)
3072 step 3: adjust the scalar result (s_out3) if needed.
3074 Step 1 can be accomplished using one of the following three schemes:
3075 (scheme 1) using reduc_code, if available.
3076 (scheme 2) using whole-vector shifts, if available.
3077 (scheme 3) using a scalar loop. In this case steps 1+2 above are
3078 combined.
3080 The overall epilog code looks like this:
3082 s_out0 = phi <s_loop> # original EXIT_PHI
3083 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
3084 v_out2 = reduce <v_out1> # step 1
3085 s_out3 = extract_field <v_out2, 0> # step 2
3086 s_out4 = adjust_result <s_out3> # step 3
3088 (step 3 is optional, and steps 1 and 2 may be combined).
3089 Lastly, the uses of s_out0 are replaced by s_out4. */
3092 /* 2.1 Create new loop-exit-phis to preserve loop-closed form:
3093 v_out1 = phi <VECT_DEF>
3094 Store them in NEW_PHIS. */
3096 exit_bb = single_exit (loop)->dest;
3097 prev_phi_info = NULL;
3098 new_phis = VEC_alloc (gimple, heap, VEC_length (tree, vect_defs));
3099 for (i = 0; VEC_iterate (tree, vect_defs, i, def); i++)
3101 for (j = 0; j < ncopies; j++)
3103 phi = create_phi_node (SSA_NAME_VAR (def), exit_bb);
3104 set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, loop_vinfo, NULL));
3105 if (j == 0)
3106 VEC_quick_push (gimple, new_phis, phi);
3107 else
3109 def = vect_get_vec_def_for_stmt_copy (dt, def);
3110 STMT_VINFO_RELATED_STMT (prev_phi_info) = phi;
3113 SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, def);
3114 prev_phi_info = vinfo_for_stmt (phi);
3118 exit_gsi = gsi_after_labels (exit_bb);
3120 /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
3121 (i.e. when reduc_code is not available) and in the final adjustment
3122 code (if needed). Also get the original scalar reduction variable as
3123 defined in the loop. In case STMT is a "pattern-stmt" (i.e. - it
3124 represents a reduction pattern), the tree-code and scalar-def are
3125 taken from the original stmt that the pattern-stmt (STMT) replaces.
3126 Otherwise (it is a regular reduction) - the tree-code and scalar-def
3127 are taken from STMT. */
3129 orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
3130 if (!orig_stmt)
3132 /* Regular reduction */
3133 orig_stmt = stmt;
3135 else
3137 /* Reduction pattern */
3138 stmt_vec_info stmt_vinfo = vinfo_for_stmt (orig_stmt);
3139 gcc_assert (STMT_VINFO_IN_PATTERN_P (stmt_vinfo));
3140 gcc_assert (STMT_VINFO_RELATED_STMT (stmt_vinfo) == stmt);
3143 code = gimple_assign_rhs_code (orig_stmt);
3144 /* For MINUS_EXPR the initial vector is [init_val,0,...,0], therefore,
3145 partial results are added and not subtracted. */
3146 if (code == MINUS_EXPR)
3147 code = PLUS_EXPR;
3149 scalar_dest = gimple_assign_lhs (orig_stmt);
3150 scalar_type = TREE_TYPE (scalar_dest);
3151 scalar_results = VEC_alloc (tree, heap, group_size);
3152 new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
3153 bitsize = TYPE_SIZE (scalar_type);
3155 /* In case this is a reduction in an inner-loop while vectorizing an outer
3156 loop - we don't need to extract a single scalar result at the end of the
3157 inner-loop (unless it is double reduction, i.e., the use of reduction is
3158 outside the outer-loop). The final vector of partial results will be used
3159 in the vectorized outer-loop, or reduced to a scalar result at the end of
3160 the outer-loop. */
3161 if (nested_in_vect_loop && !double_reduc)
3162 goto vect_finalize_reduction;
3164 /* 2.3 Create the reduction code, using one of the three schemes described
3165 above. In SLP we simply need to extract all the elements from the
3166 vector (without reducing them), so we use scalar shifts. */
3167 if (reduc_code != ERROR_MARK && !slp_node)
3169 tree tmp;
3171 /*** Case 1: Create:
3172 v_out2 = reduc_expr <v_out1> */
3174 if (vect_print_dump_info (REPORT_DETAILS))
3175 fprintf (vect_dump, "Reduce using direct vector reduction.");
3177 vec_dest = vect_create_destination_var (scalar_dest, vectype);
3178 new_phi = VEC_index (gimple, new_phis, 0);
3179 tmp = build1 (reduc_code, vectype, PHI_RESULT (new_phi));
3180 epilog_stmt = gimple_build_assign (vec_dest, tmp);
3181 new_temp = make_ssa_name (vec_dest, epilog_stmt);
3182 gimple_assign_set_lhs (epilog_stmt, new_temp);
3183 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
3185 extract_scalar_result = true;
3187 else
3189 enum tree_code shift_code = ERROR_MARK;
3190 bool have_whole_vector_shift = true;
3191 int bit_offset;
3192 int element_bitsize = tree_low_cst (bitsize, 1);
3193 int vec_size_in_bits = tree_low_cst (TYPE_SIZE (vectype), 1);
3194 tree vec_temp;
3196 if (optab_handler (vec_shr_optab, mode)->insn_code != CODE_FOR_nothing)
3197 shift_code = VEC_RSHIFT_EXPR;
3198 else
3199 have_whole_vector_shift = false;
3201 /* Regardless of whether we have a whole vector shift, if we're
3202 emulating the operation via tree-vect-generic, we don't want
3203 to use it. Only the first round of the reduction is likely
3204 to still be profitable via emulation. */
3205 /* ??? It might be better to emit a reduction tree code here, so that
3206 tree-vect-generic can expand the first round via bit tricks. */
3207 if (!VECTOR_MODE_P (mode))
3208 have_whole_vector_shift = false;
3209 else
3211 optab optab = optab_for_tree_code (code, vectype, optab_default);
3212 if (optab_handler (optab, mode)->insn_code == CODE_FOR_nothing)
3213 have_whole_vector_shift = false;
3216 if (have_whole_vector_shift && !slp_node)
3218 /*** Case 2: Create:
3219 for (offset = VS/2; offset >= element_size; offset/=2)
3221 Create: va' = vec_shift <va, offset>
3222 Create: va = vop <va, va'>
3223 } */
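   /* For a 4-lane PLUS reduction this computes (hypothetical lanes
      a, b, c, d):

        [a, b, c, d]
        shift by 2 lanes, add -> [a+c, b+d, _, _]
        shift by 1 lane, add  -> [a+b+c+d, _, _, _]

      i.e. log2 (nlanes) shift/op pairs leave the result in lane 0.  */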
3225 if (vect_print_dump_info (REPORT_DETAILS))
3226 fprintf (vect_dump, "Reduce using vector shifts");
3228 vec_dest = vect_create_destination_var (scalar_dest, vectype);
3229 new_phi = VEC_index (gimple, new_phis, 0);
3230 new_temp = PHI_RESULT (new_phi);
3231 for (bit_offset = vec_size_in_bits/2;
3232 bit_offset >= element_bitsize;
3233 bit_offset /= 2)
3235 tree bitpos = size_int (bit_offset);
3237 epilog_stmt = gimple_build_assign_with_ops (shift_code,
3238 vec_dest, new_temp, bitpos);
3239 new_name = make_ssa_name (vec_dest, epilog_stmt);
3240 gimple_assign_set_lhs (epilog_stmt, new_name);
3241 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
3243 epilog_stmt = gimple_build_assign_with_ops (code, vec_dest,
3244 new_name, new_temp);
3245 new_temp = make_ssa_name (vec_dest, epilog_stmt);
3246 gimple_assign_set_lhs (epilog_stmt, new_temp);
3247 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
3250 extract_scalar_result = true;
3252 else
3254 tree rhs;
3256 /*** Case 3: Create:
3257 s = extract_field <v_out2, 0>
3258 for (offset = element_size;
3259 offset < vector_size;
3260 offset += element_size;)
3262 Create: s' = extract_field <v_out2, offset>
3263 Create: s = op <s, s'> // For non SLP cases
3264 } */
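   /* For a 4-lane PLUS reduction in the non-SLP case this emits
      (hypothetical):

        s = v[0];  s = s + v[1];  s = s + v[2];  s = s + v[3];

      i.e. nlanes extracts and nlanes - 1 scalar ops.  */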
3266 if (vect_print_dump_info (REPORT_DETAILS))
3267 fprintf (vect_dump, "Reduce using scalar code. ");
3269 vec_size_in_bits = tree_low_cst (TYPE_SIZE (vectype), 1);
3270 for (i = 0; VEC_iterate (gimple, new_phis, i, new_phi); i++)
3272 vec_temp = PHI_RESULT (new_phi);
3273 rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, bitsize,
3274 bitsize_zero_node);
3275 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
3276 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
3277 gimple_assign_set_lhs (epilog_stmt, new_temp);
3278 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
3280 /* In SLP we don't need to apply reduction operation, so we just
3281 collect s' values in SCALAR_RESULTS. */
3282 if (slp_node)
3283 VEC_safe_push (tree, heap, scalar_results, new_temp);
3285 for (bit_offset = element_bitsize;
3286 bit_offset < vec_size_in_bits;
3287 bit_offset += element_bitsize)
3289 tree bitpos = bitsize_int (bit_offset);
3290 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp,
3291 bitsize, bitpos);
3293 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
3294 new_name = make_ssa_name (new_scalar_dest, epilog_stmt);
3295 gimple_assign_set_lhs (epilog_stmt, new_name);
3296 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
3298 if (slp_node)
3300 /* In SLP we don't need to apply reduction operation, so
3301 we just collect s' values in SCALAR_RESULTS. */
3302 new_temp = new_name;
3303 VEC_safe_push (tree, heap, scalar_results, new_name);
3305 else
3307 epilog_stmt = gimple_build_assign_with_ops (code,
3308 new_scalar_dest, new_name, new_temp);
3309 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
3310 gimple_assign_set_lhs (epilog_stmt, new_temp);
3311 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
3316 /* The only case where we need to reduce scalar results in SLP, is
3317 unrolling. If the size of SCALAR_RESULTS is greater than
3318 GROUP_SIZE, we reduce them combining elements modulo
3319 GROUP_SIZE. */
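   /* For instance (hypothetical), GROUP_SIZE = 2 with four collected
      results r0, r1, r2, r3 combines r0 = op (r0, r2) and
      r1 = op (r1, r3), leaving one result per group member.  */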
3320 if (slp_node)
3322 tree res, first_res, new_res;
3323 gimple new_stmt;
3325 /* Reduce multiple scalar results in case of SLP unrolling. */
3326 for (j = group_size; VEC_iterate (tree, scalar_results, j, res);
3327 j++)
3329 first_res = VEC_index (tree, scalar_results, j % group_size);
3330 new_stmt = gimple_build_assign_with_ops (code,
3331 new_scalar_dest, first_res, res);
3332 new_res = make_ssa_name (new_scalar_dest, new_stmt);
3333 gimple_assign_set_lhs (new_stmt, new_res);
3334 gsi_insert_before (&exit_gsi, new_stmt, GSI_SAME_STMT);
3335 VEC_replace (tree, scalar_results, j % group_size, new_res);
3338 else
3339 /* Not SLP - we have one scalar to keep in SCALAR_RESULTS. */
3340 VEC_safe_push (tree, heap, scalar_results, new_temp);
3342 extract_scalar_result = false;
3346 /* 2.4 Extract the final scalar result. Create:
3347 s_out3 = extract_field <v_out2, bitpos> */
3349 if (extract_scalar_result)
3351 tree rhs;
3353 if (vect_print_dump_info (REPORT_DETAILS))
3354 fprintf (vect_dump, "extract scalar result");
3356 if (BYTES_BIG_ENDIAN)
3357 bitpos = size_binop (MULT_EXPR,
3358 bitsize_int (TYPE_VECTOR_SUBPARTS (vectype) - 1),
3359 TYPE_SIZE (scalar_type));
3360 else
3361 bitpos = bitsize_zero_node;
3363 rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp, bitsize, bitpos);
3364 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
3365 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
3366 gimple_assign_set_lhs (epilog_stmt, new_temp);
3367 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
3368 VEC_safe_push (tree, heap, scalar_results, new_temp);
3371 vect_finalize_reduction:
3373 /* 2.5 Adjust the final result by the initial value of the reduction
3374 variable. (When such adjustment is not needed, then
3375 'adjustment_def' is zero). For example, if code is PLUS we create:
3376 new_temp = loop_exit_def + adjustment_def */
3378 if (adjustment_def)
3380 gcc_assert (!slp_node);
3381 if (nested_in_vect_loop)
3383 new_phi = VEC_index (gimple, new_phis, 0);
3384 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) == VECTOR_TYPE);
3385 expr = build2 (code, vectype, PHI_RESULT (new_phi), adjustment_def);
3386 new_dest = vect_create_destination_var (scalar_dest, vectype);
3388 else
3390 new_temp = VEC_index (tree, scalar_results, 0);
3391 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
3392 expr = build2 (code, scalar_type, new_temp, adjustment_def);
3393 new_dest = vect_create_destination_var (scalar_dest, scalar_type);
3396 epilog_stmt = gimple_build_assign (new_dest, expr);
3397 new_temp = make_ssa_name (new_dest, epilog_stmt);
3398 gimple_assign_set_lhs (epilog_stmt, new_temp);
3399 SSA_NAME_DEF_STMT (new_temp) = epilog_stmt;
3400 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
3401 if (nested_in_vect_loop)
3403 set_vinfo_for_stmt (epilog_stmt,
3404 new_stmt_vec_info (epilog_stmt, loop_vinfo,
3405 NULL));
3406 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (epilog_stmt)) =
3407 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (new_phi));
3409 if (!double_reduc)
3410 VEC_quick_push (tree, scalar_results, new_temp);
3411 else
3412 VEC_replace (tree, scalar_results, 0, new_temp);
3414 else
3415 VEC_replace (tree, scalar_results, 0, new_temp);
3417 VEC_replace (gimple, new_phis, 0, epilog_stmt);
3420 /* 2.6 Handle the loop-exit phis. Replace the uses of scalar loop-exit
3421 phis with new adjusted scalar results, i.e., replace use <s_out0>
3422 with use <s_out4>.
3424 Transform:
3425 loop_exit:
3426 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
3427 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
3428 v_out2 = reduce <v_out1>
3429 s_out3 = extract_field <v_out2, 0>
3430 s_out4 = adjust_result <s_out3>
3431 use <s_out0>
3432 use <s_out0>
3434 into:
3436 loop_exit:
3437 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
3438 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
3439 v_out2 = reduce <v_out1>
3440 s_out3 = extract_field <v_out2, 0>
3441 s_out4 = adjust_result <s_out3>
3442 use <s_out4>
3443 use <s_out4> */
3445 /* In SLP we may have several statements in NEW_PHIS and REDUCTION_PHIS (in
3446 case that GROUP_SIZE is greater than vectorization factor). Therefore, we
3447 need to match SCALAR_RESULTS with corresponding statements. The first
3448 (GROUP_SIZE / number of new vector stmts) scalar results correspond to
3449 the first vector stmt, etc.
3450 (RATIO is equal to (GROUP_SIZE / number of new vector stmts)). */
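   /* For instance (hypothetical), GROUP_SIZE = 4 with two vector stmts
      gives RATIO = 2: scalar results 0 and 1 belong to the first vector
      stmt, and results 2 and 3 to the second.  */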
3451 if (group_size > VEC_length (gimple, new_phis))
3453 ratio = group_size / VEC_length (gimple, new_phis);
3454 gcc_assert (!(group_size % VEC_length (gimple, new_phis)));
3456 else
3457 ratio = 1;
3459 for (k = 0; k < group_size; k++)
3461 if (k % ratio == 0)
3463 epilog_stmt = VEC_index (gimple, new_phis, k / ratio);
3464 reduction_phi = VEC_index (gimple, reduction_phis, k / ratio);
3467 if (slp_node)
3469 gimple current_stmt = VEC_index (gimple,
3470 SLP_TREE_SCALAR_STMTS (slp_node), k);
3472 orig_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (current_stmt));
3473 /* SLP statements can't participate in patterns. */
3474 gcc_assert (!orig_stmt);
3475 scalar_dest = gimple_assign_lhs (current_stmt);
3478 phis = VEC_alloc (gimple, heap, 3);
3479 /* Find the loop-closed-use at the loop exit of the original scalar
3480 result. (The reduction result is expected to have two immediate uses -
3481 one at the latch block, and one at the loop exit). */
3482 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
3483 if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
3484 VEC_safe_push (gimple, heap, phis, USE_STMT (use_p));
3486 /* We expect to have found an exit_phi because of loop-closed-ssa
3487 form. */
3488 gcc_assert (!VEC_empty (gimple, phis));
3490 for (i = 0; VEC_iterate (gimple, phis, i, exit_phi); i++)
3492 if (outer_loop)
3494 stmt_vec_info exit_phi_vinfo = vinfo_for_stmt (exit_phi);
3495 gimple vect_phi;
3497 /* FORNOW. Currently not supporting the case that an inner-loop
3498 reduction is not used in the outer-loop (but only outside the
3499 outer-loop), unless it is double reduction. */
3500 gcc_assert ((STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
3501 && !STMT_VINFO_LIVE_P (exit_phi_vinfo))
3502 || double_reduc);
3504 STMT_VINFO_VEC_STMT (exit_phi_vinfo) = epilog_stmt;
3505 if (!double_reduc
3506 || STMT_VINFO_DEF_TYPE (exit_phi_vinfo)
3507 != vect_double_reduction_def)
3508 continue;
3510 /* Handle double reduction:
3512 stmt1: s1 = phi <s0, s2> - double reduction phi (outer loop)
3513 stmt2: s3 = phi <s1, s4> - (regular) reduc phi (inner loop)
3514 stmt3: s4 = use (s3) - (regular) reduc stmt (inner loop)
3515 stmt4: s2 = phi <s4> - double reduction stmt (outer loop)
3517 At that point the regular reduction (stmt2 and stmt3) is
3518 already vectorized, as well as the exit phi node, stmt4.
3519 Here we vectorize the phi node of double reduction, stmt1, and
3520 update all relevant statements. */
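   /* A hypothetical source loop with exactly this shape:

        s = 0;                          s0
        for (i = 0; i < n; i++)         stmt1: s1 = phi <s0, s2>
          for (j = 0; j < m; j++)       stmt2: s3 = phi <s1, s4>
            s = s + a[i][j];            stmt3: s4 = use (s3)
                                        stmt4: s2 = phi <s4>  */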
3522 /* Go through all the uses of s2 to find double reduction phi
3523 node, i.e., stmt1 above. */
3524 orig_name = PHI_RESULT (exit_phi);
3525 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
3527 stmt_vec_info use_stmt_vinfo = vinfo_for_stmt (use_stmt);
3528 stmt_vec_info new_phi_vinfo;
3529 tree vect_phi_init, preheader_arg, vect_phi_res, init_def;
3530 basic_block bb = gimple_bb (use_stmt);
3531 gimple use;
3533 /* Check that USE_STMT is really a double reduction phi
3534 node. */
3535 if (gimple_code (use_stmt) != GIMPLE_PHI
3536 || gimple_phi_num_args (use_stmt) != 2
3537 || !use_stmt_vinfo
3538 || STMT_VINFO_DEF_TYPE (use_stmt_vinfo)
3539 != vect_double_reduction_def
3540 || bb->loop_father != outer_loop)
3541 continue;
3543 /* Create vector phi node for double reduction:
3544 vs1 = phi <vs0, vs2>
3545 vs1 was created previously in this function by a call to
3546 vect_get_vec_def_for_operand and is stored in
3547 vec_initial_def;
3548 vs2 is defined by EPILOG_STMT, the vectorized EXIT_PHI;
3549 vs0 is created here. */
3551 /* Create vector phi node. */
3552 vect_phi = create_phi_node (vec_initial_def, bb);
3553 new_phi_vinfo = new_stmt_vec_info (vect_phi,
3554 loop_vec_info_for_loop (outer_loop), NULL);
3555 set_vinfo_for_stmt (vect_phi, new_phi_vinfo);
3557 /* Create vs0 - initial def of the double reduction phi. */
3558 preheader_arg = PHI_ARG_DEF_FROM_EDGE (use_stmt,
3559 loop_preheader_edge (outer_loop));
3560 init_def = get_initial_def_for_reduction (stmt,
3561 preheader_arg, NULL);
3562 vect_phi_init = vect_init_vector (use_stmt, init_def,
3563 vectype, NULL);
3565 /* Update phi node arguments with vs0 and vs2. */
3566 add_phi_arg (vect_phi, vect_phi_init,
3567 loop_preheader_edge (outer_loop),
3568 UNKNOWN_LOCATION);
3569 add_phi_arg (vect_phi, PHI_RESULT (epilog_stmt),
3570 loop_latch_edge (outer_loop), UNKNOWN_LOCATION);
3571 if (vect_print_dump_info (REPORT_DETAILS))
3573 fprintf (vect_dump, "created double reduction phi "
3574 "node: ");
3575 print_gimple_stmt (vect_dump, vect_phi, 0, TDF_SLIM);
3578 vect_phi_res = PHI_RESULT (vect_phi);
3580 /* Replace the use, i.e., set the correct vs1 in the regular
3581 reduction phi node. FORNOW, NCOPIES is always 1, so the
3582 loop is redundant. */
3583 use = reduction_phi;
3584 for (j = 0; j < ncopies; j++)
3586 edge pr_edge = loop_preheader_edge (loop);
3587 SET_PHI_ARG_DEF (use, pr_edge->dest_idx, vect_phi_res);
3588 use = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (use));
3593 /* Replace the uses: */
3594 orig_name = PHI_RESULT (exit_phi);
3595 scalar_result = VEC_index (tree, scalar_results, k);
3596 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
3597 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
3598 SET_USE (use_p, scalar_result);
3601 VEC_free (gimple, heap, phis);
3604 VEC_free (tree, heap, scalar_results);
3605 VEC_free (gimple, heap, new_phis);
3609 /* Function vectorizable_reduction.
3611 Check if STMT performs a reduction operation that can be vectorized.
3612 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
3613 stmt to replace it, put it in VEC_STMT, and insert it at GSI.
3614 Return FALSE if not a vectorizable STMT, TRUE otherwise.
3616 This function also handles reduction idioms (patterns) that have been
3617 recognized in advance during vect_pattern_recog. In this case, STMT may be
3618 of this form:
3619 X = pattern_expr (arg0, arg1, ..., X)
3620 and its STMT_VINFO_RELATED_STMT points to the last stmt in the original
3621 sequence that had been detected and replaced by the pattern-stmt (STMT).
3623 In some cases of reduction patterns, the type of the reduction variable X is
3624 different from the type of the other arguments of STMT.
3625 In such cases, the vectype that is used when transforming STMT into a vector
3626 stmt is different from the vectype that is used to determine the
3627 vectorization factor, because it consists of a different number of elements
3628 than the actual number of elements that are being operated upon in parallel.
3630 For example, consider an accumulation of shorts into an int accumulator.
3631 On some targets it's possible to vectorize this pattern operating on 8
3632 shorts at a time (hence, the vectype for purposes of determining the
3633 vectorization factor should be V8HI); on the other hand, the vectype that
3634 is used to create the vector form is actually V4SI (the type of the result).
3636 Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
3637 indicates the actual level of parallelism (V8HI in the example), so
3638 that the right vectorization factor is derived. This vectype
3639 corresponds to the type of arguments to the reduction stmt, and should *NOT*
3640 be used to create the vectorized stmt. The right vectype for the vectorized
3641 stmt is obtained from the type of the result X:
3642 get_vectype_for_scalar_type (TREE_TYPE (X))
3644 This means that, contrary to "regular" reductions (or "regular" stmts in
3645 general), the following equation:
3646 STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (TREE_TYPE (X))
3647 does *NOT* necessarily hold for reduction patterns. */
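/* For instance, the short-to-int accumulation discussed above would come
   from source like this (a hedged sketch; types and sizes illustrative):

     short x[N]; int sum = 0;
     for (i = 0; i < N; i++)
       sum += x[i];    <-- pattern stmt: sum = widen_sum <x[i], sum>

   STMT_VINFO_VECTYPE is V8HI (8 shorts per vector iteration determine
   the vectorization factor), whereas
   get_vectype_for_scalar_type (TREE_TYPE (sum)) yields V4SI, the vectype
   actually used to create the vectorized stmt. */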
3649 bool
3650 vectorizable_reduction (gimple stmt, gimple_stmt_iterator *gsi,
3651 gimple *vec_stmt, slp_tree slp_node)
3653 tree vec_dest;
3654 tree scalar_dest;
3655 tree loop_vec_def0 = NULL_TREE, loop_vec_def1 = NULL_TREE;
3656 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3657 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
3658 tree vectype_in = NULL_TREE;
3659 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3660 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3661 enum tree_code code, orig_code, epilog_reduc_code;
3662 enum machine_mode vec_mode;
3663 int op_type;
3664 optab optab, reduc_optab;
3665 tree new_temp = NULL_TREE;
3666 tree def;
3667 gimple def_stmt;
3668 enum vect_def_type dt;
3669 gimple new_phi = NULL;
3670 tree scalar_type;
3671 bool is_simple_use;
3672 gimple orig_stmt;
3673 stmt_vec_info orig_stmt_info;
3674 tree expr = NULL_TREE;
3675 int i;
3676 int ncopies;
3677 int epilog_copies;
3678 stmt_vec_info prev_stmt_info, prev_phi_info;
3679 bool single_defuse_cycle = false;
3680 tree reduc_def = NULL_TREE;
3681 gimple new_stmt = NULL;
3682 int j;
3683 tree ops[3];
3684 bool nested_cycle = false, found_nested_cycle_def = false;
3685 gimple reduc_def_stmt = NULL;
3686 /* The default is that the reduction variable is the last operand in the statement. */
3687 int reduc_index = 2;
3688 bool double_reduc = false, dummy;
3689 basic_block def_bb;
3690 struct loop * def_stmt_loop, *outer_loop = NULL;
3691 tree def_arg;
3692 gimple def_arg_stmt;
3693 VEC (tree, heap) *vec_oprnds0 = NULL, *vec_oprnds1 = NULL, *vect_defs = NULL;
3694 VEC (gimple, heap) *phis = NULL;
3695 int vec_num;
3696 tree def0, def1;
3698 if (nested_in_vect_loop_p (loop, stmt))
3700 outer_loop = loop;
3701 loop = loop->inner;
3702 nested_cycle = true;
3705 /* 1. Is vectorizable reduction? */
3706 /* Not supportable if the reduction variable is used in the loop. */
3707 if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer)
3708 return false;
3710 /* Reductions that are not used even in an enclosing outer-loop
3711 are expected to be "live" (used out of the loop). */
3712 if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope
3713 && !STMT_VINFO_LIVE_P (stmt_info))
3714 return false;
3716 /* Make sure it was already recognized as a reduction computation. */
3717 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def
3718 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_nested_cycle)
3719 return false;
3721 /* 2. Has this been recognized as a reduction pattern?
3723 Check if STMT represents a pattern that has been recognized
3724 in earlier analysis stages. For stmts that represent a pattern,
3725 the STMT_VINFO_RELATED_STMT field records the last stmt in
3726 the original sequence that constitutes the pattern. */
3728 orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
3729 if (orig_stmt)
3731 orig_stmt_info = vinfo_for_stmt (orig_stmt);
3732 gcc_assert (STMT_VINFO_RELATED_STMT (orig_stmt_info) == stmt);
3733 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
3734 gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
3737 /* 3. Check the operands of the operation. The first operands are defined
3738 inside the loop body. The last operand is the reduction variable,
3739 which is defined by the loop-header-phi. */
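/* E.g., for a typical binary reduction stmt (a hedged sketch):
     s_2 = x_1 + s_1
   ops[0] == x_1 is defined inside the loop and ops[1] == s_1 by the
   loop-header phi, so the reduction variable comes last; in a nested
   cycle it may appear earlier, which REDUC_INDEX records below. */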
3741 gcc_assert (is_gimple_assign (stmt));
3743 /* Flatten RHS */
3744 switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
3746 case GIMPLE_SINGLE_RHS:
3747 op_type = TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt));
3748 if (op_type == ternary_op)
3750 tree rhs = gimple_assign_rhs1 (stmt);
3751 ops[0] = TREE_OPERAND (rhs, 0);
3752 ops[1] = TREE_OPERAND (rhs, 1);
3753 ops[2] = TREE_OPERAND (rhs, 2);
3754 code = TREE_CODE (rhs);
3756 else
3757 return false;
3758 break;
3760 case GIMPLE_BINARY_RHS:
3761 code = gimple_assign_rhs_code (stmt);
3762 op_type = TREE_CODE_LENGTH (code);
3763 gcc_assert (op_type == binary_op);
3764 ops[0] = gimple_assign_rhs1 (stmt);
3765 ops[1] = gimple_assign_rhs2 (stmt);
3766 break;
3768 case GIMPLE_UNARY_RHS:
3769 return false;
3771 default:
3772 gcc_unreachable ();
3775 scalar_dest = gimple_assign_lhs (stmt);
3776 scalar_type = TREE_TYPE (scalar_dest);
3777 if (!POINTER_TYPE_P (scalar_type) && !INTEGRAL_TYPE_P (scalar_type)
3778 && !SCALAR_FLOAT_TYPE_P (scalar_type))
3779 return false;
3781 /* All uses but the last are expected to be defined in the loop.
3782 The last use is the reduction variable. In case of a nested cycle this
3783 assumption does not hold: we use reduc_index to record the index of the
3784 reduction variable. */
3785 for (i = 0; i < op_type-1; i++)
3787 tree tem;
3789 /* The condition of COND_EXPR is checked in vectorizable_condition(). */
3790 if (i == 0 && code == COND_EXPR)
3791 continue;
3793 is_simple_use = vect_is_simple_use_1 (ops[i], loop_vinfo, NULL,
3794 &def_stmt, &def, &dt, &tem);
3795 if (!vectype_in)
3796 vectype_in = tem;
3797 gcc_assert (is_simple_use);
3798 if (dt != vect_internal_def
3799 && dt != vect_external_def
3800 && dt != vect_constant_def
3801 && dt != vect_induction_def
3802 && !(dt == vect_nested_cycle && nested_cycle))
3803 return false;
3805 if (dt == vect_nested_cycle)
3807 found_nested_cycle_def = true;
3808 reduc_def_stmt = def_stmt;
3809 reduc_index = i;
3813 is_simple_use = vect_is_simple_use (ops[i], loop_vinfo, NULL, &def_stmt,
3814 &def, &dt);
3815 gcc_assert (is_simple_use);
3816 gcc_assert (dt == vect_reduction_def
3817 || dt == vect_nested_cycle
3818 || ((dt == vect_internal_def || dt == vect_external_def
3819 || dt == vect_constant_def || dt == vect_induction_def)
3820 && nested_cycle && found_nested_cycle_def));
3821 if (!found_nested_cycle_def)
3822 reduc_def_stmt = def_stmt;
3824 gcc_assert (gimple_code (reduc_def_stmt) == GIMPLE_PHI);
3825 if (orig_stmt)
3826 gcc_assert (orig_stmt == vect_is_simple_reduction (loop_vinfo,
3827 reduc_def_stmt,
3828 !nested_cycle,
3829 &dummy));
3830 else
3831 gcc_assert (stmt == vect_is_simple_reduction (loop_vinfo, reduc_def_stmt,
3832 !nested_cycle, &dummy));
3834 if (STMT_VINFO_LIVE_P (vinfo_for_stmt (reduc_def_stmt)))
3835 return false;
3837 if (slp_node)
3838 ncopies = 1;
3839 else
3840 ncopies = (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
3841 / TYPE_VECTOR_SUBPARTS (vectype_in));
3843 gcc_assert (ncopies >= 1);
3845 vec_mode = TYPE_MODE (vectype_in);
3847 if (code == COND_EXPR)
3849 if (!vectorizable_condition (stmt, gsi, NULL, ops[reduc_index], 0))
3851 if (vect_print_dump_info (REPORT_DETAILS))
3852 fprintf (vect_dump, "unsupported condition in reduction");
3854 return false;
3857 else
3859 /* 4. Supportable by target? */
3861 /* 4.1. check support for the operation in the loop */
3862 optab = optab_for_tree_code (code, vectype_in, optab_default);
3863 if (!optab)
3865 if (vect_print_dump_info (REPORT_DETAILS))
3866 fprintf (vect_dump, "no optab.");
3868 return false;
3871 if (optab_handler (optab, vec_mode)->insn_code == CODE_FOR_nothing)
3873 if (vect_print_dump_info (REPORT_DETAILS))
3874 fprintf (vect_dump, "op not supported by target.");
3876 if (GET_MODE_SIZE (vec_mode) != UNITS_PER_WORD
3877 || LOOP_VINFO_VECT_FACTOR (loop_vinfo)
3878 < vect_min_worthwhile_factor (code))
3879 return false;
3881 if (vect_print_dump_info (REPORT_DETAILS))
3882 fprintf (vect_dump, "proceeding using word mode.");
3885 /* Worthwhile without SIMD support? */
3886 if (!VECTOR_MODE_P (TYPE_MODE (vectype_in))
3887 && LOOP_VINFO_VECT_FACTOR (loop_vinfo)
3888 < vect_min_worthwhile_factor (code))
3890 if (vect_print_dump_info (REPORT_DETAILS))
3891 fprintf (vect_dump, "not worthwhile without SIMD support.");
3893 return false;
3897 /* 4.2. Check support for the epilog operation.
3899 If STMT represents a reduction pattern, then the type of the
3900 reduction variable may be different from the type of the rest
3901 of the arguments. For example, consider the case of accumulation
3902 of shorts into an int accumulator; the original code:
3903 S1: int_a = (int) short_a;
3904 orig_stmt-> S2: int_acc = plus <int_a ,int_acc>;
3906 was replaced with:
3907 STMT: int_acc = widen_sum <short_a, int_acc>
3909 This means that:
3910 1. The tree-code that is used to create the vector operation in the
3911 epilog code (that reduces the partial results) is not the
3912 tree-code of STMT, but is rather the tree-code of the original
3913 stmt from the pattern that STMT is replacing. I.e., in the example
3914 above we want to use 'widen_sum' in the loop, but 'plus' in the
3915 epilog.
3916 2. The type (mode) we use to check available target support
3917 for the vector operation to be created in the *epilog*, is
3918 determined by the type of the reduction variable (in the example
3919 above we'd check this: plus_optab[vect_int_mode]).
3920 However the type (mode) we use to check available target support
3921 for the vector operation to be created *inside the loop*, is
3922 determined by the type of the other arguments to STMT (in the
3923 example we'd check this: widen_sum_optab[vect_short_mode]).
3925 This is contrary to "regular" reductions, in which the types of all
3926 the arguments are the same as the type of the reduction variable.
3927 For "regular" reductions we can therefore use the same vector type
3928 (and also the same tree-code) when generating the epilog code and
3929 when generating the code inside the loop. */
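/* Concretely, for the widen_sum example above the two checks would be
   (a hedged sketch):
     inside the loop: vec_acc = WIDEN_SUM_EXPR <vec_shorts, vec_acc>
       -> query the widen_sum optab with the mode of V8HI;
     in the epilog: the V4SI partial sums in vec_acc are combined into
       one scalar -> query the reduction optab derived from PLUS_EXPR
       (the code of ORIG_STMT) with the mode of V4SI. */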
3931 if (orig_stmt)
3933 /* This is a reduction pattern: get the vectype from the type of the
3934 reduction variable, and get the tree-code from orig_stmt. */
3935 orig_code = gimple_assign_rhs_code (orig_stmt);
3936 gcc_assert (vectype_out);
3937 vec_mode = TYPE_MODE (vectype_out);
3939 else
3941 /* Regular reduction: the same vectype and tree-code that are used for
3942 the vector code inside the loop can also be used for the epilog code. */
3943 orig_code = code;
3946 if (nested_cycle)
3948 def_bb = gimple_bb (reduc_def_stmt);
3949 def_stmt_loop = def_bb->loop_father;
3950 def_arg = PHI_ARG_DEF_FROM_EDGE (reduc_def_stmt,
3951 loop_preheader_edge (def_stmt_loop));
3952 if (TREE_CODE (def_arg) == SSA_NAME
3953 && (def_arg_stmt = SSA_NAME_DEF_STMT (def_arg))
3954 && gimple_code (def_arg_stmt) == GIMPLE_PHI
3955 && flow_bb_inside_loop_p (outer_loop, gimple_bb (def_arg_stmt))
3956 && vinfo_for_stmt (def_arg_stmt)
3957 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_arg_stmt))
3958 == vect_double_reduction_def)
3959 double_reduc = true;
3962 epilog_reduc_code = ERROR_MARK;
3963 if (reduction_code_for_scalar_code (orig_code, &epilog_reduc_code))
3965 reduc_optab = optab_for_tree_code (epilog_reduc_code, vectype_out,
3966 optab_default);
3967 if (!reduc_optab)
3969 if (vect_print_dump_info (REPORT_DETAILS))
3970 fprintf (vect_dump, "no optab for reduction.");
3972 epilog_reduc_code = ERROR_MARK;
3975 if (reduc_optab
3976 && optab_handler (reduc_optab, vec_mode)->insn_code
3977 == CODE_FOR_nothing)
3979 if (vect_print_dump_info (REPORT_DETAILS))
3980 fprintf (vect_dump, "reduc op not supported by target.");
3982 epilog_reduc_code = ERROR_MARK;
3985 else
3987 if (!nested_cycle || double_reduc)
3989 if (vect_print_dump_info (REPORT_DETAILS))
3990 fprintf (vect_dump, "no reduc code for scalar code.");
3992 return false;
3996 if (double_reduc && ncopies > 1)
3998 if (vect_print_dump_info (REPORT_DETAILS))
3999 fprintf (vect_dump, "multiple types in double reduction");
4001 return false;
4004 if (!vec_stmt) /* transformation not required. */
4006 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
4007 if (!vect_model_reduction_cost (stmt_info, epilog_reduc_code, ncopies))
4008 return false;
4009 return true;
4012 /** Transform. **/
4014 if (vect_print_dump_info (REPORT_DETAILS))
4015 fprintf (vect_dump, "transform reduction.");
4017 /* FORNOW: Multiple types are not supported for condition. */
4018 if (code == COND_EXPR)
4019 gcc_assert (ncopies == 1);
4021 /* Create the destination vector */
4022 vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
4024 /* In case the vectorization factor (VF) is bigger than the number
4025 of elements that we can fit in a vectype (nunits), we have to generate
4026 more than one vector stmt - i.e., we need to "unroll" the
4027 vector stmt by a factor VF/nunits. For more details see documentation
4028 in vectorizable_operation. */
4030 /* If the reduction is used in an outer loop we need to generate
4031 VF intermediate results, like so (e.g. for ncopies=2):
4032 r0 = phi (init, r0)
4033 r1 = phi (init, r1)
4034 r0 = x0 + r0;
4035 r1 = x1 + r1;
4036 (i.e. we generate VF results in 2 registers).
4037 In this case we have a separate def-use cycle for each copy, and therefore
4038 for each copy we get the vector def for the reduction variable from the
4039 respective phi node created for this copy.
4041 Otherwise (the reduction is unused in the loop nest), we can combine
4042 together intermediate results, like so (e.g. for ncopies=2):
4043 r = phi (init, r)
4044 r = x0 + r;
4045 r = x1 + r;
4046 (i.e. we generate VF/2 results in a single register).
4047 In this case for each copy we get the vector def for the reduction variable
4048 from the vectorized reduction operation generated in the previous iteration. */
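/* A worked example (hedged; numbers illustrative): with a vectorization
   factor of 8 and V4SI vectors, ncopies == 8 / 4 == 2.  If the reduction
   is used in an outer loop, two phis are kept and r0 and r1 remain
   separate partial results; otherwise SINGLE_DEFUSE_CYCLE chains both
   copies through a single phi and register. */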
4051 if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope)
4053 single_defuse_cycle = true;
4054 epilog_copies = 1;
4056 else
4057 epilog_copies = ncopies;
4059 prev_stmt_info = NULL;
4060 prev_phi_info = NULL;
4061 if (slp_node)
4063 vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
4064 gcc_assert (TYPE_VECTOR_SUBPARTS (vectype_out)
4065 == TYPE_VECTOR_SUBPARTS (vectype_in));
4067 else
4069 vec_num = 1;
4070 vec_oprnds0 = VEC_alloc (tree, heap, 1);
4071 if (op_type == ternary_op)
4072 vec_oprnds1 = VEC_alloc (tree, heap, 1);
4075 phis = VEC_alloc (gimple, heap, vec_num);
4076 vect_defs = VEC_alloc (tree, heap, vec_num);
4077 if (!slp_node)
4078 VEC_quick_push (tree, vect_defs, NULL_TREE);
4080 for (j = 0; j < ncopies; j++)
4082 if (j == 0 || !single_defuse_cycle)
4084 for (i = 0; i < vec_num; i++)
4086 /* Create the reduction-phi that defines the reduction
4087 operand. */
4088 new_phi = create_phi_node (vec_dest, loop->header);
4089 set_vinfo_for_stmt (new_phi,
4090 new_stmt_vec_info (new_phi, loop_vinfo,
4091 NULL));
4092 if (j == 0 || slp_node)
4093 VEC_quick_push (gimple, phis, new_phi);
4097 if (code == COND_EXPR)
4099 gcc_assert (!slp_node);
4100 vectorizable_condition (stmt, gsi, vec_stmt,
4101 PHI_RESULT (VEC_index (gimple, phis, 0)),
4102 reduc_index);
4103 /* Multiple types are not supported for condition. */
4104 break;
4107 /* Handle uses. */
4108 if (j == 0)
4110 if (slp_node)
4111 vect_get_slp_defs (slp_node, &vec_oprnds0, &vec_oprnds1, -1);
4112 else
4114 loop_vec_def0 = vect_get_vec_def_for_operand (ops[!reduc_index],
4115 stmt, NULL);
4116 VEC_quick_push (tree, vec_oprnds0, loop_vec_def0);
4117 if (op_type == ternary_op)
4119 if (reduc_index == 0)
4120 loop_vec_def1 = vect_get_vec_def_for_operand (ops[2], stmt,
4121 NULL);
4122 else
4123 loop_vec_def1 = vect_get_vec_def_for_operand (ops[1], stmt,
4124 NULL);
4126 VEC_quick_push (tree, vec_oprnds1, loop_vec_def1);
4130 else
4132 if (!slp_node)
4134 enum vect_def_type dt = vect_unknown_def_type; /* Dummy */
4135 loop_vec_def0 = vect_get_vec_def_for_stmt_copy (dt, loop_vec_def0);
4136 VEC_replace (tree, vec_oprnds0, 0, loop_vec_def0);
4137 if (op_type == ternary_op)
4139 loop_vec_def1 = vect_get_vec_def_for_stmt_copy (dt,
4140 loop_vec_def1);
4141 VEC_replace (tree, vec_oprnds1, 0, loop_vec_def1);
4145 if (single_defuse_cycle)
4146 reduc_def = gimple_assign_lhs (new_stmt);
4148 STMT_VINFO_RELATED_STMT (prev_phi_info) = new_phi;
4151 for (i = 0; VEC_iterate (tree, vec_oprnds0, i, def0); i++)
4153 if (slp_node)
4154 reduc_def = PHI_RESULT (VEC_index (gimple, phis, i));
4155 else
4157 if (!single_defuse_cycle || j == 0)
4158 reduc_def = PHI_RESULT (new_phi);
4161 def1 = ((op_type == ternary_op)
4162 ? VEC_index (tree, vec_oprnds1, i) : NULL);
4163 if (op_type == binary_op)
4165 if (reduc_index == 0)
4166 expr = build2 (code, vectype_out, reduc_def, def0);
4167 else
4168 expr = build2 (code, vectype_out, def0, reduc_def);
4170 else
4172 if (reduc_index == 0)
4173 expr = build3 (code, vectype_out, reduc_def, def0, def1);
4174 else
4176 if (reduc_index == 1)
4177 expr = build3 (code, vectype_out, def0, reduc_def, def1);
4178 else
4179 expr = build3 (code, vectype_out, def0, def1, reduc_def);
4183 new_stmt = gimple_build_assign (vec_dest, expr);
4184 new_temp = make_ssa_name (vec_dest, new_stmt);
4185 gimple_assign_set_lhs (new_stmt, new_temp);
4186 vect_finish_stmt_generation (stmt, new_stmt, gsi);
4187 if (slp_node)
4189 VEC_quick_push (gimple, SLP_TREE_VEC_STMTS (slp_node), new_stmt);
4190 VEC_quick_push (tree, vect_defs, new_temp);
4192 else
4193 VEC_replace (tree, vect_defs, 0, new_temp);
4196 if (slp_node)
4197 continue;
4199 if (j == 0)
4200 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
4201 else
4202 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
4204 prev_stmt_info = vinfo_for_stmt (new_stmt);
4205 prev_phi_info = vinfo_for_stmt (new_phi);
4208 /* Finalize the reduction-phi (set its arguments) and create the
4209 epilog reduction code. */
4210 if ((!single_defuse_cycle || code == COND_EXPR) && !slp_node)
4212 new_temp = gimple_assign_lhs (*vec_stmt);
4213 VEC_replace (tree, vect_defs, 0, new_temp);
4216 vect_create_epilog_for_reduction (vect_defs, stmt, epilog_copies,
4217 epilog_reduc_code, phis, reduc_index,
4218 double_reduc, slp_node);
4220 VEC_free (gimple, heap, phis);
4221 VEC_free (tree, heap, vec_oprnds0);
4222 if (vec_oprnds1)
4223 VEC_free (tree, heap, vec_oprnds1);
4225 return true;
4228 /* Function vect_min_worthwhile_factor.
4230 For a loop where we could vectorize the operation indicated by CODE,
4231 return the minimum vectorization factor that makes it worthwhile
4232 to use generic vectors. */
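/* E.g., PLUS_EXPR maps to 4 below: emulating a vector add on word-sized
   chunks only pays off once at least four additions are combined per
   vector operation, while the cheaper bitwise operations already pay off
   at a factor of two (a hedged reading of the constants below). */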
4233 int
4234 vect_min_worthwhile_factor (enum tree_code code)
4236 switch (code)
4238 case PLUS_EXPR:
4239 case MINUS_EXPR:
4240 case NEGATE_EXPR:
4241 return 4;
4243 case BIT_AND_EXPR:
4244 case BIT_IOR_EXPR:
4245 case BIT_XOR_EXPR:
4246 case BIT_NOT_EXPR:
4247 return 2;
4249 default:
4250 return INT_MAX;
4255 /* Function vectorizable_induction
4257 Check if PHI performs an induction computation that can be vectorized.
4258 If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
4259 phi to replace it, put it in VEC_STMT, and add it to the same basic block.
4260 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
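/* E.g. (a hedged sketch): for the induction
     i_1 = PHI <0 (preheader), i_2 (latch)>;  i_2 = i_1 + 1
   and a vectorization factor of 4, the vectorized phi starts at
   {0, 1, 2, 3} and each copy is stepped by {4, 4, 4, 4}, roughly what
   get_initial_def_for_induction produces. */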
4262 bool
4263 vectorizable_induction (gimple phi, gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
4264 gimple *vec_stmt)
4266 stmt_vec_info stmt_info = vinfo_for_stmt (phi);
4267 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4268 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4269 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4270 int nunits = TYPE_VECTOR_SUBPARTS (vectype);
4271 int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
4272 tree vec_def;
4274 gcc_assert (ncopies >= 1);
4275 /* FORNOW. This restriction should be relaxed. */
4276 if (nested_in_vect_loop_p (loop, phi) && ncopies > 1)
4278 if (vect_print_dump_info (REPORT_DETAILS))
4279 fprintf (vect_dump, "multiple types in nested loop.");
4280 return false;
4283 if (!STMT_VINFO_RELEVANT_P (stmt_info))
4284 return false;
4286 /* FORNOW: SLP not supported. */
4287 if (STMT_SLP_TYPE (stmt_info))
4288 return false;
4290 gcc_assert (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def);
4292 if (gimple_code (phi) != GIMPLE_PHI)
4293 return false;
4295 if (!vec_stmt) /* transformation not required. */
4297 STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
4298 if (vect_print_dump_info (REPORT_DETAILS))
4299 fprintf (vect_dump, "=== vectorizable_induction ===");
4300 vect_model_induction_cost (stmt_info, ncopies);
4301 return true;
4304 /** Transform. **/
4306 if (vect_print_dump_info (REPORT_DETAILS))
4307 fprintf (vect_dump, "transform induction phi.");
4309 vec_def = get_initial_def_for_induction (phi);
4310 *vec_stmt = SSA_NAME_DEF_STMT (vec_def);
4311 return true;
4314 /* Function vectorizable_live_operation.
4316 STMT computes a value that is used outside the loop. Check if
4317 it can be supported. */
4319 bool
4320 vectorizable_live_operation (gimple stmt,
4321 gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
4322 gimple *vec_stmt ATTRIBUTE_UNUSED)
4324 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4325 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4326 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4327 int i;
4328 int op_type;
4329 tree op;
4330 tree def;
4331 gimple def_stmt;
4332 enum vect_def_type dt;
4333 enum tree_code code;
4334 enum gimple_rhs_class rhs_class;
4336 gcc_assert (STMT_VINFO_LIVE_P (stmt_info));
4338 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def)
4339 return false;
4341 if (!is_gimple_assign (stmt))
4342 return false;
4344 if (TREE_CODE (gimple_assign_lhs (stmt)) != SSA_NAME)
4345 return false;
4347 /* FORNOW. CHECKME. */
4348 if (nested_in_vect_loop_p (loop, stmt))
4349 return false;
4351 code = gimple_assign_rhs_code (stmt);
4352 op_type = TREE_CODE_LENGTH (code);
4353 rhs_class = get_gimple_rhs_class (code);
4354 gcc_assert (rhs_class != GIMPLE_UNARY_RHS || op_type == unary_op);
4355 gcc_assert (rhs_class != GIMPLE_BINARY_RHS || op_type == binary_op);
4357 /* FORNOW: support only if all uses are invariant. This means
4358 that the scalar operations can remain in place, unvectorized.
4359 The original last scalar value that they compute will be used. */
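/* E.g. (a hedged sketch): for  s_1 = x_2 * y_3  where x_2 and y_3 are
   loop invariant and s_1 is used only after the loop, the stmt can stay
   scalar; the s_1 computed by the last scalar iteration is the value
   seen by the uses outside the loop. */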
4361 for (i = 0; i < op_type; i++)
4363 if (rhs_class == GIMPLE_SINGLE_RHS)
4364 op = TREE_OPERAND (gimple_op (stmt, 1), i);
4365 else
4366 op = gimple_op (stmt, i + 1);
4367 if (op
4368 && !vect_is_simple_use (op, loop_vinfo, NULL, &def_stmt, &def, &dt))
4370 if (vect_print_dump_info (REPORT_DETAILS))
4371 fprintf (vect_dump, "use not simple.");
4372 return false;
4375 if (dt != vect_external_def && dt != vect_constant_def)
4376 return false;
4379 /* No transformation is required for the cases we currently support. */
4380 return true;
4383 /* Kill any debug uses outside LOOP of SSA names defined in STMT. */
4385 static void
4386 vect_loop_kill_debug_uses (struct loop *loop, gimple stmt)
4388 ssa_op_iter op_iter;
4389 imm_use_iterator imm_iter;
4390 def_operand_p def_p;
4391 gimple ustmt;
4393 FOR_EACH_PHI_OR_STMT_DEF (def_p, stmt, op_iter, SSA_OP_DEF)
4395 FOR_EACH_IMM_USE_STMT (ustmt, imm_iter, DEF_FROM_PTR (def_p))
4397 basic_block bb;
4399 if (!is_gimple_debug (ustmt))
4400 continue;
4402 bb = gimple_bb (ustmt);
4404 if (!flow_bb_inside_loop_p (loop, bb))
4406 if (gimple_debug_bind_p (ustmt))
4408 if (vect_print_dump_info (REPORT_DETAILS))
4409 fprintf (vect_dump, "killing debug use");
4411 gimple_debug_bind_reset_value (ustmt);
4412 update_stmt (ustmt);
4414 else
4415 gcc_unreachable ();
4421 /* Function vect_transform_loop.
4423 The analysis phase has determined that the loop is vectorizable.
4424 Vectorize the loop - create vectorized stmts to replace the scalar
4425 stmts in the loop, and update the loop exit condition. */
4427 void
4428 vect_transform_loop (loop_vec_info loop_vinfo)
4430 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4431 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
4432 int nbbs = loop->num_nodes;
4433 gimple_stmt_iterator si;
4434 int i;
4435 tree ratio = NULL;
4436 int vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
4437 bool strided_store;
4438 bool slp_scheduled = false;
4439 unsigned int nunits;
4440 tree cond_expr = NULL_TREE;
4441 gimple_seq cond_expr_stmt_list = NULL;
4442 bool do_peeling_for_loop_bound;
4444 if (vect_print_dump_info (REPORT_DETAILS))
4445 fprintf (vect_dump, "=== vec_transform_loop ===");
4447 /* Peel the loop if there are data refs with unknown alignment.
4448 Only one data ref with unknown alignment is allowed. */
4450 if (LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo))
4451 vect_do_peeling_for_alignment (loop_vinfo);
4453 do_peeling_for_loop_bound
4454 = (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
4455 || (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
4456 && LOOP_VINFO_INT_NITERS (loop_vinfo) % vectorization_factor != 0));
4458 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
4459 || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
4460 vect_loop_versioning (loop_vinfo,
4461 !do_peeling_for_loop_bound,
4462 &cond_expr, &cond_expr_stmt_list);
4464 /* If the loop has a symbolic number of iterations 'n' (i.e. it's not a
4465 compile time constant), or it is a constant that is not divisible by the
4466 vectorization factor, then an epilog loop needs to be created.
4467 We therefore duplicate the loop: the original loop will be vectorized,
4468 and will compute the first (n/VF) iterations. The second copy of the loop
4469 will remain scalar and will compute the remaining (n%VF) iterations.
4470 (VF is the vectorization factor). */
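/* For example (hedged; numbers illustrative): with n == 1003 and VF == 4,
   the vectorized loop executes 250 iterations covering 1000 elements and
   the scalar epilog loop executes the remaining 3. */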
4472 if (do_peeling_for_loop_bound)
4473 vect_do_peeling_for_loop_bound (loop_vinfo, &ratio,
4474 cond_expr, cond_expr_stmt_list);
4475 else
4476 ratio = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
4477 LOOP_VINFO_INT_NITERS (loop_vinfo) / vectorization_factor);
4479 /* 1) Make sure the loop header has exactly two entries
4480 2) Make sure we have a preheader basic block. */
4482 gcc_assert (EDGE_COUNT (loop->header->preds) == 2);
4484 split_edge (loop_preheader_edge (loop));
4486 /* FORNOW: the vectorizer supports only loops whose body consists
4487 of one basic block (header + empty latch). When the vectorizer
4488 supports more involved loop forms, the order in which the BBs are
4489 traversed will need to be reconsidered. */
4491 for (i = 0; i < nbbs; i++)
4493 basic_block bb = bbs[i];
4494 stmt_vec_info stmt_info;
4495 gimple phi;
4497 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
4499 phi = gsi_stmt (si);
4500 if (vect_print_dump_info (REPORT_DETAILS))
4502 fprintf (vect_dump, "------>vectorizing phi: ");
4503 print_gimple_stmt (vect_dump, phi, 0, TDF_SLIM);
4505 stmt_info = vinfo_for_stmt (phi);
4506 if (!stmt_info)
4507 continue;
4509 if (MAY_HAVE_DEBUG_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
4510 vect_loop_kill_debug_uses (loop, phi);
4512 if (!STMT_VINFO_RELEVANT_P (stmt_info)
4513 && !STMT_VINFO_LIVE_P (stmt_info))
4514 continue;
4516 if ((TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info))
4517 != (unsigned HOST_WIDE_INT) vectorization_factor)
4518 && vect_print_dump_info (REPORT_DETAILS))
4519 fprintf (vect_dump, "multiple-types.");
4521 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def)
4523 if (vect_print_dump_info (REPORT_DETAILS))
4524 fprintf (vect_dump, "transform phi.");
4525 vect_transform_stmt (phi, NULL, NULL, NULL, NULL);
4529 for (si = gsi_start_bb (bb); !gsi_end_p (si);)
4531 gimple stmt = gsi_stmt (si);
4532 bool is_store;
4534 if (vect_print_dump_info (REPORT_DETAILS))
4536 fprintf (vect_dump, "------>vectorizing statement: ");
4537 print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
4540 stmt_info = vinfo_for_stmt (stmt);
4542 /* vector stmts created in the outer-loop during vectorization of
4543 stmts in an inner-loop may not have a stmt_info, and do not
4544 need to be vectorized. */
4545 if (!stmt_info)
4547 gsi_next (&si);
4548 continue;
4551 if (MAY_HAVE_DEBUG_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
4552 vect_loop_kill_debug_uses (loop, stmt);
4554 if (!STMT_VINFO_RELEVANT_P (stmt_info)
4555 && !STMT_VINFO_LIVE_P (stmt_info))
4557 gsi_next (&si);
4558 continue;
4561 gcc_assert (STMT_VINFO_VECTYPE (stmt_info));
4562 nunits =
4563 (unsigned int) TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
4564 if (!STMT_SLP_TYPE (stmt_info)
4565 && nunits != (unsigned int) vectorization_factor
4566 && vect_print_dump_info (REPORT_DETAILS))
4567 /* For SLP, VF is set according to the unrolling factor, and not to
4568 the vector size, hence for SLP this message is not valid. */
4569 fprintf (vect_dump, "multiple-types.");
4571 /* SLP. Schedule all the SLP instances when the first SLP stmt is
4572 reached. */
4573 if (STMT_SLP_TYPE (stmt_info))
4575 if (!slp_scheduled)
4577 slp_scheduled = true;
4579 if (vect_print_dump_info (REPORT_DETAILS))
4580 fprintf (vect_dump, "=== scheduling SLP instances ===");
4582 vect_schedule_slp (loop_vinfo, NULL);
4585 /* Hybrid SLP stmts must be vectorized in addition to SLP. */
4586 if (!vinfo_for_stmt (stmt) || PURE_SLP_STMT (stmt_info))
4588 gsi_next (&si);
4589 continue;
4593 /* -------- vectorize statement ------------ */
4594 if (vect_print_dump_info (REPORT_DETAILS))
4595 fprintf (vect_dump, "transform statement.");
4597 strided_store = false;
4598 is_store = vect_transform_stmt (stmt, &si, &strided_store, NULL, NULL);
4599 if (is_store)
4601 if (STMT_VINFO_STRIDED_ACCESS (stmt_info))
4603 /* Interleaving. If IS_STORE is TRUE, the vectorization of the
4604 interleaving chain was completed - free all the stores in
4605 the chain. */
4606 vect_remove_stores (DR_GROUP_FIRST_DR (stmt_info));
4607 gsi_remove (&si, true);
4608 continue;
4610 else
4612 /* Free the attached stmt_vec_info and remove the stmt. */
4613 free_stmt_vec_info (stmt);
4614 gsi_remove (&si, true);
4615 continue;
4618 gsi_next (&si);
4619 } /* stmts in BB */
4620 } /* BBs in loop */
4622 slpeel_make_loop_iterate_ntimes (loop, ratio);
4624 /* The memory tags and pointers in vectorized statements need to
4625 have their SSA forms updated. FIXME, why can't this be delayed
4626 until all the loops have been transformed? */
4627 update_ssa (TODO_update_ssa);
4629 if (vect_print_dump_info (REPORT_VECTORIZED_LOCATIONS))
4630 fprintf (vect_dump, "LOOP VECTORIZED.");
4631 if (loop->inner && vect_print_dump_info (REPORT_VECTORIZED_LOCATIONS))
4632 fprintf (vect_dump, "OUTER LOOP VECTORIZED.");