/* gcc/tree-vect-loop.c */
/* Loop Vectorization
   Copyright (C) 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010
   Free Software Foundation, Inc.
   Contributed by Dorit Naishlos <dorit@il.ibm.com> and
   Ira Rosen <irar@il.ibm.com>

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free
Software Foundation; either version 3, or (at your option) any later
version.

GCC is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received a copy of the GNU General Public License
along with GCC; see the file COPYING3.  If not see
<http://www.gnu.org/licenses/>.  */
23 #include "config.h"
24 #include "system.h"
25 #include "coretypes.h"
26 #include "tm.h"
27 #include "ggc.h"
28 #include "tree.h"
29 #include "basic-block.h"
30 #include "tree-pretty-print.h"
31 #include "gimple-pretty-print.h"
32 #include "tree-flow.h"
33 #include "tree-dump.h"
34 #include "cfgloop.h"
35 #include "cfglayout.h"
36 #include "expr.h"
37 #include "recog.h"
38 #include "optabs.h"
39 #include "params.h"
40 #include "diagnostic-core.h"
41 #include "tree-chrec.h"
42 #include "tree-scalar-evolution.h"
43 #include "tree-vectorizer.h"
44 #include "target.h"
/* Loop Vectorization Pass.

   This pass tries to vectorize loops.

   For example, the vectorizer transforms the following simple loop:

        short a[N]; short b[N]; short c[N]; int i;

        for (i=0; i<N; i++){
          a[i] = b[i] + c[i];
        }

   as if it was manually vectorized by rewriting the source code into:

        typedef int __attribute__((mode(V8HI))) v8hi;
        short a[N]; short b[N]; short c[N]; int i;
        v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
        v8hi va, vb, vc;

        for (i=0; i<N/8; i++){
          vb = pb[i];
          vc = pc[i];
          va = vb + vc;
          pa[i] = va;
        }

   The main entry to this pass is vectorize_loops(), in which
   the vectorizer applies a set of analyses on a given set of loops,
   followed by the actual vectorization transformation for the loops that
   had successfully passed the analysis phase.
   Throughout this pass we make a distinction between two types of
   data: scalars (which are represented by SSA_NAMES), and memory references
   ("data-refs").  These two types of data require different handling both
   during analysis and transformation.  The types of data-refs that the
   vectorizer currently supports are ARRAY_REFS whose base is an array DECL
   (not a pointer), and INDIRECT_REFS through pointers; both array and pointer
   accesses are required to have a simple (consecutive) access pattern.

   Analysis phase:
   ===============
   The driver for the analysis phase is vect_analyze_loop().
   It applies a set of analyses, some of which rely on the scalar evolution
   analyzer (scev) developed by Sebastian Pop.

   During the analysis phase the vectorizer records some information
   per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
   loop, as well as general information about the loop as a whole, which is
   recorded in a "loop_vec_info" struct attached to each loop.

   Transformation phase:
   =====================
   The loop transformation phase scans all the stmts in the loop, and
   creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
   the loop that needs to be vectorized.  It inserts the vector code sequence
   just before the scalar stmt S, and records a pointer to the vector code
   in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
   attached to S).  This pointer will be used for the vectorization of
   following stmts which use the def of stmt S.  Stmt S is removed if it
   writes to memory; otherwise, we rely on dead code elimination for
   removing it.

   For example, say stmt S1 was vectorized into stmt VS1:

   VS1: vb = px[i];
   S1:  b = x[i];    STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
   S2:  a = b;

   To vectorize stmt S2, the vectorizer first finds the stmt that defines
   the operand 'b' (S1), and gets the relevant vector def 'vb' from the
   vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)).  The
   resulting sequence would be:

   VS1: vb = px[i];
   S1:  b = x[i];    STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
   VS2: va = vb;
   S2:  a = b;       STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2

   Operands that are not SSA_NAMEs, are data-refs that appear in
   load/store operations (like 'x[i]' in S1), and are handled differently.

   Target modeling:
   =================
   Currently the only target specific information that is used is the
   size of the vector (in bytes) - "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".
   Targets that can support different sizes of vectors, for now will need
   to specify one value for "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".  More
   flexibility will be added in the future.

   Since we only vectorize operations whose vector form can be
   expressed using existing tree codes, to verify that an operation is
   supported, the vectorizer checks the relevant optab at the relevant
   machine_mode (e.g., optab_handler (add_optab, V8HImode)).  If
   the value found is CODE_FOR_nothing, then there's no target support, and
   we can't vectorize the stmt.

   For additional information on this project see:
   http://gcc.gnu.org/projects/tree-ssa/vectorization.html
*/
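/* For illustration only: the support check described above amounts to a
   query of the following shape (a sketch, not a function in this file):

     if (optab_handler (add_optab, V8HImode) == CODE_FOR_nothing)
       return false;    .. no target support, the stmt is not vectorized
*/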
/* Function vect_determine_vectorization_factor

   Determine the vectorization factor (VF).  VF is the number of data elements
   that are operated upon in parallel in a single iteration of the vectorized
   loop.  For example, when vectorizing a loop that operates on 4byte elements,
   on a target with vector size (VS) 16byte, the VF is set to 4, since 4
   elements can fit in a single vector register.

   We currently support vectorization of loops in which all types operated upon
   are of the same size.  Therefore this function currently sets VF according to
   the size of the types operated upon, and fails if there are multiple sizes
   in the loop.

   VF is also the factor by which the loop iterations are strip-mined, e.g.:
   original loop:
        for (i=0; i<N; i++){
          a[i] = b[i] + c[i];
        }

   vectorized loop:
        for (i=0; i<N; i+=VF){
          a[i:VF] = b[i:VF] + c[i:VF];
        }
*/
static bool
vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
{
  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
  int nbbs = loop->num_nodes;
  gimple_stmt_iterator si;
  unsigned int vectorization_factor = 0;
  tree scalar_type;
  gimple phi;
  tree vectype;
  unsigned int nunits;
  stmt_vec_info stmt_info;
  int i;
  HOST_WIDE_INT dummy;

  if (vect_print_dump_info (REPORT_DETAILS))
    fprintf (vect_dump, "=== vect_determine_vectorization_factor ===");

  for (i = 0; i < nbbs; i++)
    {
      basic_block bb = bbs[i];

      for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
        {
          phi = gsi_stmt (si);
          stmt_info = vinfo_for_stmt (phi);
          if (vect_print_dump_info (REPORT_DETAILS))
            {
              fprintf (vect_dump, "==> examining phi: ");
              print_gimple_stmt (vect_dump, phi, 0, TDF_SLIM);
            }

          gcc_assert (stmt_info);

          if (STMT_VINFO_RELEVANT_P (stmt_info))
            {
              gcc_assert (!STMT_VINFO_VECTYPE (stmt_info));
              scalar_type = TREE_TYPE (PHI_RESULT (phi));

              if (vect_print_dump_info (REPORT_DETAILS))
                {
                  fprintf (vect_dump, "get vectype for scalar type: ");
                  print_generic_expr (vect_dump, scalar_type, TDF_SLIM);
                }

              vectype = get_vectype_for_scalar_type (scalar_type);
              if (!vectype)
                {
                  if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
                    {
                      fprintf (vect_dump,
                               "not vectorized: unsupported data-type ");
                      print_generic_expr (vect_dump, scalar_type, TDF_SLIM);
                    }
                  return false;
                }
              STMT_VINFO_VECTYPE (stmt_info) = vectype;

              if (vect_print_dump_info (REPORT_DETAILS))
                {
                  fprintf (vect_dump, "vectype: ");
                  print_generic_expr (vect_dump, vectype, TDF_SLIM);
                }

              nunits = TYPE_VECTOR_SUBPARTS (vectype);
              if (vect_print_dump_info (REPORT_DETAILS))
                fprintf (vect_dump, "nunits = %d", nunits);

              if (!vectorization_factor
                  || (nunits > vectorization_factor))
                vectorization_factor = nunits;
            }
        }

      for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
        {
          tree vf_vectype;
          gimple stmt = gsi_stmt (si);
          stmt_info = vinfo_for_stmt (stmt);

          if (vect_print_dump_info (REPORT_DETAILS))
            {
              fprintf (vect_dump, "==> examining statement: ");
              print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
            }

          gcc_assert (stmt_info);

          /* Skip stmts which do not need to be vectorized.  */
          if (!STMT_VINFO_RELEVANT_P (stmt_info)
              && !STMT_VINFO_LIVE_P (stmt_info))
            {
              if (vect_print_dump_info (REPORT_DETAILS))
                fprintf (vect_dump, "skip.");
              continue;
            }

          if (gimple_get_lhs (stmt) == NULL_TREE)
            {
              if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
                {
                  fprintf (vect_dump, "not vectorized: irregular stmt.");
                  print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
                }
              return false;
            }

          if (VECTOR_MODE_P (TYPE_MODE (gimple_expr_type (stmt))))
            {
              if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
                {
                  fprintf (vect_dump, "not vectorized: vector stmt in loop:");
                  print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
                }
              return false;
            }

          if (STMT_VINFO_VECTYPE (stmt_info))
            {
              /* The only case when a vectype had been already set is for stmts
                 that contain a dataref, or for "pattern-stmts" (stmts generated
                 by the vectorizer to represent/replace a certain idiom).  */
              gcc_assert (STMT_VINFO_DATA_REF (stmt_info)
                          || is_pattern_stmt_p (stmt_info));
              vectype = STMT_VINFO_VECTYPE (stmt_info);
            }
          else
            {
              gcc_assert (!STMT_VINFO_DATA_REF (stmt_info)
                          && !is_pattern_stmt_p (stmt_info));

              scalar_type = TREE_TYPE (gimple_get_lhs (stmt));
              if (vect_print_dump_info (REPORT_DETAILS))
                {
                  fprintf (vect_dump, "get vectype for scalar type: ");
                  print_generic_expr (vect_dump, scalar_type, TDF_SLIM);
                }
              vectype = get_vectype_for_scalar_type (scalar_type);
              if (!vectype)
                {
                  if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
                    {
                      fprintf (vect_dump,
                               "not vectorized: unsupported data-type ");
                      print_generic_expr (vect_dump, scalar_type, TDF_SLIM);
                    }
                  return false;
                }

              STMT_VINFO_VECTYPE (stmt_info) = vectype;
            }

          /* The vectorization factor is according to the smallest
             scalar type (or the largest vector size, but we only
             support one vector size per loop).  */
          scalar_type = vect_get_smallest_scalar_type (stmt, &dummy,
                                                       &dummy);
          if (vect_print_dump_info (REPORT_DETAILS))
            {
              fprintf (vect_dump, "get vectype for scalar type: ");
              print_generic_expr (vect_dump, scalar_type, TDF_SLIM);
            }
          vf_vectype = get_vectype_for_scalar_type (scalar_type);
          if (!vf_vectype)
            {
              if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
                {
                  fprintf (vect_dump,
                           "not vectorized: unsupported data-type ");
                  print_generic_expr (vect_dump, scalar_type, TDF_SLIM);
                }
              return false;
            }

          if ((GET_MODE_SIZE (TYPE_MODE (vectype))
               != GET_MODE_SIZE (TYPE_MODE (vf_vectype))))
            {
              if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
                {
                  fprintf (vect_dump,
                           "not vectorized: different sized vector "
                           "types in statement, ");
                  print_generic_expr (vect_dump, vectype, TDF_SLIM);
                  fprintf (vect_dump, " and ");
                  print_generic_expr (vect_dump, vf_vectype, TDF_SLIM);
                }
              return false;
            }

          if (vect_print_dump_info (REPORT_DETAILS))
            {
              fprintf (vect_dump, "vectype: ");
              print_generic_expr (vect_dump, vf_vectype, TDF_SLIM);
            }

          nunits = TYPE_VECTOR_SUBPARTS (vf_vectype);
          if (vect_print_dump_info (REPORT_DETAILS))
            fprintf (vect_dump, "nunits = %d", nunits);

          if (!vectorization_factor
              || (nunits > vectorization_factor))
            vectorization_factor = nunits;
        }
    }

  /* TODO: Analyze cost.  Decide if worth while to vectorize.  */
  if (vect_print_dump_info (REPORT_DETAILS))
    fprintf (vect_dump, "vectorization factor = %d", vectorization_factor);
  if (vectorization_factor <= 1)
    {
      if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
        fprintf (vect_dump, "not vectorized: unsupported data-type");
      return false;
    }
  LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;

  return true;
}


/* Function vect_is_simple_iv_evolution.

   FORNOW: A simple evolution of an induction variable in the loop is
   considered a polynomial evolution with constant step.  */
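/* For example (notation only, not code in this file): for the IV of
   "for (i = 0; i < N; i += 2)" scev computes the access function
   {0, +, 2}_1 (a polynomial chrec of degree 1), so *INIT is 0 and *STEP
   is 2.  A chrec like {0, +, {1, +, 1}_1}_1, whose step itself evolves,
   is rejected by the tree_is_chrec check on the step.  */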
static bool
vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init,
                             tree * step)
{
  tree init_expr;
  tree step_expr;
  tree evolution_part = evolution_part_in_loop_num (access_fn, loop_nb);

  /* When there is no evolution in this loop, the evolution function
     is not "simple".  */
  if (evolution_part == NULL_TREE)
    return false;

  /* When the evolution is a polynomial of degree >= 2
     the evolution function is not "simple".  */
  if (tree_is_chrec (evolution_part))
    return false;

  step_expr = evolution_part;
  init_expr = unshare_expr (initial_condition_in_loop_num (access_fn, loop_nb));

  if (vect_print_dump_info (REPORT_DETAILS))
    {
      fprintf (vect_dump, "step: ");
      print_generic_expr (vect_dump, step_expr, TDF_SLIM);
      fprintf (vect_dump, ", init: ");
      print_generic_expr (vect_dump, init_expr, TDF_SLIM);
    }

  *init = init_expr;
  *step = step_expr;

  if (TREE_CODE (step_expr) != INTEGER_CST)
    {
      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "step unknown.");
      return false;
    }

  return true;
}

/* Function vect_analyze_scalar_cycles_1.

   Examine the cross iteration def-use cycles of scalar variables
   in LOOP.  LOOP_VINFO represents the loop that is now being
   considered for vectorization (can be LOOP, or an outer-loop
   enclosing LOOP).  */

static void
vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, struct loop *loop)
{
  basic_block bb = loop->header;
  tree dumy;
  VEC(gimple,heap) *worklist = VEC_alloc (gimple, heap, 64);
  gimple_stmt_iterator gsi;
  bool double_reduc;

  if (vect_print_dump_info (REPORT_DETAILS))
    fprintf (vect_dump, "=== vect_analyze_scalar_cycles ===");

  /* First - identify all inductions.  Reduction detection assumes that all the
     inductions have been identified, therefore, this order must not be
     changed.  */
  for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
    {
      gimple phi = gsi_stmt (gsi);
      tree access_fn = NULL;
      tree def = PHI_RESULT (phi);
      stmt_vec_info stmt_vinfo = vinfo_for_stmt (phi);

      if (vect_print_dump_info (REPORT_DETAILS))
        {
          fprintf (vect_dump, "Analyze phi: ");
          print_gimple_stmt (vect_dump, phi, 0, TDF_SLIM);
        }

      /* Skip virtual phi's.  The data dependences that are associated with
         virtual defs/uses (i.e., memory accesses) are analyzed elsewhere.  */
      if (!is_gimple_reg (SSA_NAME_VAR (def)))
        continue;

      STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_unknown_def_type;

      /* Analyze the evolution function.  */
      access_fn = analyze_scalar_evolution (loop, def);
      if (access_fn)
        STRIP_NOPS (access_fn);
      if (access_fn && vect_print_dump_info (REPORT_DETAILS))
        {
          fprintf (vect_dump, "Access function of PHI: ");
          print_generic_expr (vect_dump, access_fn, TDF_SLIM);
        }

      if (!access_fn
          || !vect_is_simple_iv_evolution (loop->num, access_fn, &dumy, &dumy))
        {
          VEC_safe_push (gimple, heap, worklist, phi);
          continue;
        }

      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "Detected induction.");
      STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_induction_def;
    }


  /* Second - identify all reductions and nested cycles.  */
  while (VEC_length (gimple, worklist) > 0)
    {
      gimple phi = VEC_pop (gimple, worklist);
      tree def = PHI_RESULT (phi);
      stmt_vec_info stmt_vinfo = vinfo_for_stmt (phi);
      gimple reduc_stmt;
      bool nested_cycle;

      if (vect_print_dump_info (REPORT_DETAILS))
        {
          fprintf (vect_dump, "Analyze phi: ");
          print_gimple_stmt (vect_dump, phi, 0, TDF_SLIM);
        }

      gcc_assert (is_gimple_reg (SSA_NAME_VAR (def)));
      gcc_assert (STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_unknown_def_type);

      nested_cycle = (loop != LOOP_VINFO_LOOP (loop_vinfo));
      reduc_stmt = vect_force_simple_reduction (loop_vinfo, phi, !nested_cycle,
                                                &double_reduc);
      if (reduc_stmt)
        {
          if (double_reduc)
            {
              if (vect_print_dump_info (REPORT_DETAILS))
                fprintf (vect_dump, "Detected double reduction.");

              STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_double_reduction_def;
              STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
                  vect_double_reduction_def;
            }
          else
            {
              if (nested_cycle)
                {
                  if (vect_print_dump_info (REPORT_DETAILS))
                    fprintf (vect_dump, "Detected vectorizable nested cycle.");

                  STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_nested_cycle;
                  STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
                      vect_nested_cycle;
                }
              else
                {
                  if (vect_print_dump_info (REPORT_DETAILS))
                    fprintf (vect_dump, "Detected reduction.");

                  STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def;
                  STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
                      vect_reduction_def;
                  /* Store the reduction cycles for possible vectorization in
                     loop-aware SLP.  */
                  VEC_safe_push (gimple, heap,
                                 LOOP_VINFO_REDUCTIONS (loop_vinfo),
                                 reduc_stmt);
                }
            }
        }
      else
        if (vect_print_dump_info (REPORT_DETAILS))
          fprintf (vect_dump, "Unknown def-use cycle pattern.");
    }

  VEC_free (gimple, heap, worklist);
}


/* Function vect_analyze_scalar_cycles.

   Examine the cross iteration def-use cycles of scalar variables, by
   analyzing the loop-header PHIs of scalar variables.  Classify each
   cycle as one of the following: invariant, induction, reduction, unknown.
   We do that for the loop represented by LOOP_VINFO, and also for its
   inner-loop, if it exists.

   Examples for scalar cycles:

   Example1: reduction:

              loop1:
              for (i=0; i<N; i++)
                 sum += a[i];

   Example2: induction:

              loop2:
              for (i=0; i<N; i++)
                 a[i] = i;  */

static void
vect_analyze_scalar_cycles (loop_vec_info loop_vinfo)
{
  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);

  vect_analyze_scalar_cycles_1 (loop_vinfo, loop);

  /* When vectorizing an outer-loop, the inner-loop is executed sequentially.
     Reductions in such inner-loop therefore have different properties than
     the reductions in the nest that gets vectorized:
     1. When vectorized, they are executed in the same order as in the original
        scalar loop, so we can't change the order of computation when
        vectorizing them.
     2. FIXME: Inner-loop reductions can be used in the inner-loop, so the
        current checks are too strict.  */

  if (loop->inner)
    vect_analyze_scalar_cycles_1 (loop_vinfo, loop->inner);
}


/* Function vect_get_loop_niters.

   Determine how many iterations the loop is executed.
   If an expression that represents the number of iterations
   can be constructed, place it in NUMBER_OF_ITERATIONS.
   Return the loop exit condition.  */
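/* For example (an assumed case): for "for (i = 0; i < n; i++)" with n
   loop-invariant, number_of_exit_cond_executions computes an expression
   based on 'n', which is stored in *NUMBER_OF_ITERATIONS, and the
   GIMPLE_COND that controls the loop exit is returned.  */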
static gimple
vect_get_loop_niters (struct loop *loop, tree *number_of_iterations)
{
  tree niters;

  if (vect_print_dump_info (REPORT_DETAILS))
    fprintf (vect_dump, "=== get_loop_niters ===");

  niters = number_of_exit_cond_executions (loop);

  if (niters != NULL_TREE
      && niters != chrec_dont_know)
    {
      *number_of_iterations = niters;

      if (vect_print_dump_info (REPORT_DETAILS))
        {
          fprintf (vect_dump, "==> get_loop_niters:");
          print_generic_expr (vect_dump, *number_of_iterations, TDF_SLIM);
        }
    }

  return get_loop_exit_condition (loop);
}


/* Function bb_in_loop_p

   Used as predicate for dfs order traversal of the loop bbs.  */

static bool
bb_in_loop_p (const_basic_block bb, const void *data)
{
  const struct loop *const loop = (const struct loop *)data;
  if (flow_bb_inside_loop_p (loop, bb))
    return true;
  return false;
}
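/* (bb_in_loop_p is handed to dfs_enumerate_from in new_loop_vec_info below,
   which is why it receives its loop argument through a const void *.)  */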
/* Function new_loop_vec_info.

   Create and initialize a new loop_vec_info struct for LOOP, as well as
   stmt_vec_info structs for all the stmts in LOOP.  */

static loop_vec_info
new_loop_vec_info (struct loop *loop)
{
  loop_vec_info res;
  basic_block *bbs;
  gimple_stmt_iterator si;
  unsigned int i, nbbs;

  res = (loop_vec_info) xcalloc (1, sizeof (struct _loop_vec_info));
  LOOP_VINFO_LOOP (res) = loop;

  bbs = get_loop_body (loop);

  /* Create/Update stmt_info for all stmts in the loop.  */
  for (i = 0; i < loop->num_nodes; i++)
    {
      basic_block bb = bbs[i];

      /* BBs in a nested inner-loop will have been already processed (because
         we will have called vect_analyze_loop_form for any nested inner-loop).
         Therefore, for stmts in an inner-loop we just want to update the
         STMT_VINFO_LOOP_VINFO field of their stmt_info to point to the new
         loop_info of the outer-loop we are currently considering to vectorize
         (instead of the loop_info of the inner-loop).
         For stmts in other BBs we need to create a stmt_info from scratch.  */
      if (bb->loop_father != loop)
        {
          /* Inner-loop bb.  */
          gcc_assert (loop->inner && bb->loop_father == loop->inner);
          for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
            {
              gimple phi = gsi_stmt (si);
              stmt_vec_info stmt_info = vinfo_for_stmt (phi);
              loop_vec_info inner_loop_vinfo =
                STMT_VINFO_LOOP_VINFO (stmt_info);
              gcc_assert (loop->inner == LOOP_VINFO_LOOP (inner_loop_vinfo));
              STMT_VINFO_LOOP_VINFO (stmt_info) = res;
            }
          for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
            {
              gimple stmt = gsi_stmt (si);
              stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
              loop_vec_info inner_loop_vinfo =
                STMT_VINFO_LOOP_VINFO (stmt_info);
              gcc_assert (loop->inner == LOOP_VINFO_LOOP (inner_loop_vinfo));
              STMT_VINFO_LOOP_VINFO (stmt_info) = res;
            }
        }
      else
        {
          /* bb in current nest.  */
          for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
            {
              gimple phi = gsi_stmt (si);
              gimple_set_uid (phi, 0);
              set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, res, NULL));
            }

          for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
            {
              gimple stmt = gsi_stmt (si);
              gimple_set_uid (stmt, 0);
              set_vinfo_for_stmt (stmt, new_stmt_vec_info (stmt, res, NULL));
            }
        }
    }

  /* CHECKME: We want to visit all BBs before their successors (except for
     latch blocks, for which this assertion wouldn't hold).  In the simple
     case of the loop forms we allow, a dfs order of the BBs would be the
     same as a reversed postorder traversal, so we are safe.  */

  free (bbs);
  bbs = XCNEWVEC (basic_block, loop->num_nodes);
  nbbs = dfs_enumerate_from (loop->header, 0, bb_in_loop_p,
                             bbs, loop->num_nodes, loop);
  gcc_assert (nbbs == loop->num_nodes);

  LOOP_VINFO_BBS (res) = bbs;
  LOOP_VINFO_NITERS (res) = NULL;
  LOOP_VINFO_NITERS_UNCHANGED (res) = NULL;
  LOOP_VINFO_COST_MODEL_MIN_ITERS (res) = 0;
  LOOP_VINFO_VECTORIZABLE_P (res) = 0;
  LOOP_PEELING_FOR_ALIGNMENT (res) = 0;
  LOOP_VINFO_VECT_FACTOR (res) = 0;
  LOOP_VINFO_LOOP_NEST (res) = VEC_alloc (loop_p, heap, 3);
  LOOP_VINFO_DATAREFS (res) = VEC_alloc (data_reference_p, heap, 10);
  LOOP_VINFO_DDRS (res) = VEC_alloc (ddr_p, heap, 10 * 10);
  LOOP_VINFO_UNALIGNED_DR (res) = NULL;
  LOOP_VINFO_MAY_MISALIGN_STMTS (res) =
    VEC_alloc (gimple, heap,
               PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIGNMENT_CHECKS));
  LOOP_VINFO_MAY_ALIAS_DDRS (res) =
    VEC_alloc (ddr_p, heap,
               PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS));
  LOOP_VINFO_STRIDED_STORES (res) = VEC_alloc (gimple, heap, 10);
  LOOP_VINFO_REDUCTIONS (res) = VEC_alloc (gimple, heap, 10);
  LOOP_VINFO_REDUCTION_CHAINS (res) = VEC_alloc (gimple, heap, 10);
  LOOP_VINFO_SLP_INSTANCES (res) = VEC_alloc (slp_instance, heap, 10);
  LOOP_VINFO_SLP_UNROLLING_FACTOR (res) = 1;
  LOOP_VINFO_PEELING_HTAB (res) = NULL;
  LOOP_VINFO_PEELING_FOR_GAPS (res) = false;

  return res;
}
/* Function destroy_loop_vec_info.

   Free LOOP_VINFO struct, as well as all the stmt_vec_info structs of all the
   stmts in the loop.  */

void
destroy_loop_vec_info (loop_vec_info loop_vinfo, bool clean_stmts)
{
  struct loop *loop;
  basic_block *bbs;
  int nbbs;
  gimple_stmt_iterator si;
  int j;
  VEC (slp_instance, heap) *slp_instances;
  slp_instance instance;

  if (!loop_vinfo)
    return;

  loop = LOOP_VINFO_LOOP (loop_vinfo);

  bbs = LOOP_VINFO_BBS (loop_vinfo);
  nbbs = loop->num_nodes;

  if (!clean_stmts)
    {
      free (LOOP_VINFO_BBS (loop_vinfo));
      free_data_refs (LOOP_VINFO_DATAREFS (loop_vinfo));
      free_dependence_relations (LOOP_VINFO_DDRS (loop_vinfo));
      VEC_free (loop_p, heap, LOOP_VINFO_LOOP_NEST (loop_vinfo));
      VEC_free (gimple, heap, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo));
      VEC_free (ddr_p, heap, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo));

      free (loop_vinfo);
      loop->aux = NULL;
      return;
    }

  for (j = 0; j < nbbs; j++)
    {
      basic_block bb = bbs[j];
      for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
        free_stmt_vec_info (gsi_stmt (si));

      for (si = gsi_start_bb (bb); !gsi_end_p (si); )
        {
          gimple stmt = gsi_stmt (si);
          stmt_vec_info stmt_info = vinfo_for_stmt (stmt);

          if (stmt_info)
            {
              /* Check if this is a "pattern stmt" (introduced by the
                 vectorizer during the pattern recognition pass).  */
              bool remove_stmt_p = false;
              gimple orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
              if (orig_stmt)
                {
                  stmt_vec_info orig_stmt_info = vinfo_for_stmt (orig_stmt);
                  if (orig_stmt_info
                      && STMT_VINFO_IN_PATTERN_P (orig_stmt_info))
                    remove_stmt_p = true;
                }

              /* Free stmt_vec_info.  */
              free_stmt_vec_info (stmt);

              /* Remove dead "pattern stmts".  */
              if (remove_stmt_p)
                gsi_remove (&si, true);
            }
          gsi_next (&si);
        }
    }

  free (LOOP_VINFO_BBS (loop_vinfo));
  free_data_refs (LOOP_VINFO_DATAREFS (loop_vinfo));
  free_dependence_relations (LOOP_VINFO_DDRS (loop_vinfo));
  VEC_free (loop_p, heap, LOOP_VINFO_LOOP_NEST (loop_vinfo));
  VEC_free (gimple, heap, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo));
  VEC_free (ddr_p, heap, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo));
  slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
  FOR_EACH_VEC_ELT (slp_instance, slp_instances, j, instance)
    vect_free_slp_instance (instance);

  VEC_free (slp_instance, heap, LOOP_VINFO_SLP_INSTANCES (loop_vinfo));
  VEC_free (gimple, heap, LOOP_VINFO_STRIDED_STORES (loop_vinfo));
  VEC_free (gimple, heap, LOOP_VINFO_REDUCTIONS (loop_vinfo));
  VEC_free (gimple, heap, LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo));

  if (LOOP_VINFO_PEELING_HTAB (loop_vinfo))
    htab_delete (LOOP_VINFO_PEELING_HTAB (loop_vinfo));

  free (loop_vinfo);
  loop->aux = NULL;
}
/* Function vect_analyze_loop_1.

   Apply a set of analyses on LOOP, and create a loop_vec_info struct
   for it.  The different analyses will record information in the
   loop_vec_info struct.  This is a subset of the analyses applied in
   vect_analyze_loop, to be applied on an inner-loop nested in the loop
   that is now considered for (outer-loop) vectorization.  */

static loop_vec_info
vect_analyze_loop_1 (struct loop *loop)
{
  loop_vec_info loop_vinfo;

  if (vect_print_dump_info (REPORT_DETAILS))
    fprintf (vect_dump, "===== analyze_loop_nest_1 =====");

  /* Check the CFG characteristics of the loop (nesting, entry/exit, etc.).  */

  loop_vinfo = vect_analyze_loop_form (loop);
  if (!loop_vinfo)
    {
      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "bad inner-loop form.");
      return NULL;
    }

  return loop_vinfo;
}
/* Function vect_analyze_loop_form.

   Verify that certain CFG restrictions hold, including:
   - the loop has a pre-header
   - the loop has a single entry and exit
   - the loop exit condition is simple enough, and the number of iterations
     can be analyzed (a countable loop).  */

loop_vec_info
vect_analyze_loop_form (struct loop *loop)
{
  loop_vec_info loop_vinfo;
  gimple loop_cond;
  tree number_of_iterations = NULL;
  loop_vec_info inner_loop_vinfo = NULL;

  if (vect_print_dump_info (REPORT_DETAILS))
    fprintf (vect_dump, "=== vect_analyze_loop_form ===");

  /* Different restrictions apply when we are considering an inner-most loop,
     vs. an outer (nested) loop.
     (FORNOW.  May want to relax some of these restrictions in the future).  */

  if (!loop->inner)
    {
      /* Inner-most loop.  We currently require that the number of BBs is
         exactly 2 (the header and latch).  Vectorizable inner-most loops
         look like this:

                        (pre-header)
                           |
                          header <--------+
                           | |            |
                           | +--> latch --+
                           |
                        (exit-bb)  */

      if (loop->num_nodes != 2)
        {
          if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
            fprintf (vect_dump, "not vectorized: control flow in loop.");
          return NULL;
        }

      if (empty_block_p (loop->header))
        {
          if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
            fprintf (vect_dump, "not vectorized: empty loop.");
          return NULL;
        }
    }
  else
    {
      struct loop *innerloop = loop->inner;
      edge entryedge;

      /* Nested loop.  We currently require that the loop is doubly-nested,
         contains a single inner loop, and the number of BBs is exactly 5.
         Vectorizable outer-loops look like this:

                        (pre-header)
                           |
                          header <---+
                           |         |
                          inner-loop |
                           |         |
                          tail ------+
                           |
                        (exit-bb)

         The inner-loop has the properties expected of inner-most loops
         as described above.  */

      if ((loop->inner)->inner || (loop->inner)->next)
        {
          if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
            fprintf (vect_dump, "not vectorized: multiple nested loops.");
          return NULL;
        }

      /* Analyze the inner-loop.  */
      inner_loop_vinfo = vect_analyze_loop_1 (loop->inner);
      if (!inner_loop_vinfo)
        {
          if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
            fprintf (vect_dump, "not vectorized: Bad inner loop.");
          return NULL;
        }

      if (!expr_invariant_in_loop_p (loop,
                                     LOOP_VINFO_NITERS (inner_loop_vinfo)))
        {
          if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
            fprintf (vect_dump,
                     "not vectorized: inner-loop count not invariant.");
          destroy_loop_vec_info (inner_loop_vinfo, true);
          return NULL;
        }

      if (loop->num_nodes != 5)
        {
          if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
            fprintf (vect_dump, "not vectorized: control flow in loop.");
          destroy_loop_vec_info (inner_loop_vinfo, true);
          return NULL;
        }

      gcc_assert (EDGE_COUNT (innerloop->header->preds) == 2);
      entryedge = EDGE_PRED (innerloop->header, 0);
      if (EDGE_PRED (innerloop->header, 0)->src == innerloop->latch)
        entryedge = EDGE_PRED (innerloop->header, 1);

      if (entryedge->src != loop->header
          || !single_exit (innerloop)
          || single_exit (innerloop)->dest != EDGE_PRED (loop->latch, 0)->src)
        {
          if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
            fprintf (vect_dump, "not vectorized: unsupported outerloop form.");
          destroy_loop_vec_info (inner_loop_vinfo, true);
          return NULL;
        }

      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "Considering outer-loop vectorization.");
    }

  if (!single_exit (loop)
      || EDGE_COUNT (loop->header->preds) != 2)
    {
      if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
        {
          if (!single_exit (loop))
            fprintf (vect_dump, "not vectorized: multiple exits.");
          else if (EDGE_COUNT (loop->header->preds) != 2)
            fprintf (vect_dump, "not vectorized: too many incoming edges.");
        }
      if (inner_loop_vinfo)
        destroy_loop_vec_info (inner_loop_vinfo, true);
      return NULL;
    }

  /* We assume that the loop exit condition is at the end of the loop, i.e.,
     that the loop is represented as a do-while (with a proper if-guard
     before the loop if needed), where the loop header contains all the
     executable statements, and the latch is empty.  */
  if (!empty_block_p (loop->latch)
      || !gimple_seq_empty_p (phi_nodes (loop->latch)))
    {
      if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
        fprintf (vect_dump, "not vectorized: unexpected loop form.");
      if (inner_loop_vinfo)
        destroy_loop_vec_info (inner_loop_vinfo, true);
      return NULL;
    }

  /* Make sure there exists a single-predecessor exit bb:  */
  if (!single_pred_p (single_exit (loop)->dest))
    {
      edge e = single_exit (loop);
      if (!(e->flags & EDGE_ABNORMAL))
        {
          split_loop_exit_edge (e);
          if (vect_print_dump_info (REPORT_DETAILS))
            fprintf (vect_dump, "split exit edge.");
        }
      else
        {
          if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
            fprintf (vect_dump, "not vectorized: abnormal loop exit edge.");
          if (inner_loop_vinfo)
            destroy_loop_vec_info (inner_loop_vinfo, true);
          return NULL;
        }
    }

  loop_cond = vect_get_loop_niters (loop, &number_of_iterations);
  if (!loop_cond)
    {
      if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
        fprintf (vect_dump, "not vectorized: complicated exit condition.");
      if (inner_loop_vinfo)
        destroy_loop_vec_info (inner_loop_vinfo, true);
      return NULL;
    }

  if (!number_of_iterations)
    {
      if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
        fprintf (vect_dump,
                 "not vectorized: number of iterations cannot be computed.");
      if (inner_loop_vinfo)
        destroy_loop_vec_info (inner_loop_vinfo, true);
      return NULL;
    }

  if (chrec_contains_undetermined (number_of_iterations))
    {
      if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
        fprintf (vect_dump, "Infinite number of iterations.");
      if (inner_loop_vinfo)
        destroy_loop_vec_info (inner_loop_vinfo, true);
      return NULL;
    }

  if (!NITERS_KNOWN_P (number_of_iterations))
    {
      if (vect_print_dump_info (REPORT_DETAILS))
        {
          fprintf (vect_dump, "Symbolic number of iterations is ");
          print_generic_expr (vect_dump, number_of_iterations, TDF_DETAILS);
        }
    }
  else if (TREE_INT_CST_LOW (number_of_iterations) == 0)
    {
      if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
        fprintf (vect_dump, "not vectorized: number of iterations = 0.");
      if (inner_loop_vinfo)
        destroy_loop_vec_info (inner_loop_vinfo, false);
      return NULL;
    }

  loop_vinfo = new_loop_vec_info (loop);
  LOOP_VINFO_NITERS (loop_vinfo) = number_of_iterations;
  LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = number_of_iterations;

  STMT_VINFO_TYPE (vinfo_for_stmt (loop_cond)) = loop_exit_ctrl_vec_info_type;

  /* CHECKME: May want to keep it around in the future.  */
  if (inner_loop_vinfo)
    destroy_loop_vec_info (inner_loop_vinfo, false);

  gcc_assert (!loop->aux);
  loop->aux = loop_vinfo;
  return loop_vinfo;
}
/* Get cost by calling cost target builtin.  */

static inline int
vect_get_cost (enum vect_cost_for_stmt type_of_cost)
{
  tree dummy_type = NULL;
  int dummy = 0;

  return targetm.vectorize.builtin_vectorization_cost (type_of_cost,
                                                       dummy_type, dummy);
}
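/* For example, vect_get_cost (scalar_stmt) returns the target's estimate
   for a single scalar statement (scalar_stmt being one of the
   vect_cost_for_stmt enumerators); such per-stmt costs feed the
   profitability computation in vect_estimate_min_profitable_iters,
   used below.  */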
/* Function vect_analyze_loop_operations.

   Scan the loop stmts and make sure they are all vectorizable.  */

static bool
vect_analyze_loop_operations (loop_vec_info loop_vinfo, bool slp)
{
  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
  int nbbs = loop->num_nodes;
  gimple_stmt_iterator si;
  unsigned int vectorization_factor = 0;
  int i;
  gimple phi;
  stmt_vec_info stmt_info;
  bool need_to_vectorize = false;
  int min_profitable_iters;
  int min_scalar_loop_bound;
  unsigned int th;
  bool only_slp_in_loop = true, ok;

  if (vect_print_dump_info (REPORT_DETAILS))
    fprintf (vect_dump, "=== vect_analyze_loop_operations ===");

  gcc_assert (LOOP_VINFO_VECT_FACTOR (loop_vinfo));
  vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
  if (slp)
    {
      /* If all the stmts in the loop can be SLPed, we perform only SLP, and
         the vectorization factor of the loop is the unrolling factor required
         by the SLP instances.  If that unrolling factor is 1, we say that we
         perform pure SLP on the loop: cross-iteration parallelism is not
         exploited.  */
      for (i = 0; i < nbbs; i++)
        {
          basic_block bb = bbs[i];
          for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
            {
              gimple stmt = gsi_stmt (si);
              stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
              gcc_assert (stmt_info);
              if ((STMT_VINFO_RELEVANT_P (stmt_info)
                   || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
                  && !PURE_SLP_STMT (stmt_info))
                /* STMT needs both SLP and loop-based vectorization.  */
                only_slp_in_loop = false;
            }
        }

      if (only_slp_in_loop)
        vectorization_factor = LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo);
      else
        vectorization_factor = least_common_multiple (vectorization_factor,
                                LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo));

      LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "Updating vectorization factor to %d ",
                 vectorization_factor);
    }

  for (i = 0; i < nbbs; i++)
    {
      basic_block bb = bbs[i];

      for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
        {
          phi = gsi_stmt (si);
          ok = true;

          stmt_info = vinfo_for_stmt (phi);
          if (vect_print_dump_info (REPORT_DETAILS))
            {
              fprintf (vect_dump, "examining phi: ");
              print_gimple_stmt (vect_dump, phi, 0, TDF_SLIM);
            }

          /* Inner-loop loop-closed exit phi in outer-loop vectorization
             (i.e., a phi in the tail of the outer-loop).  */
          if (! is_loop_header_bb_p (bb))
            {
              /* FORNOW: we currently don't support the case that these phis
                 are not used in the outerloop (unless it is a double
                 reduction, i.e., this phi is vect_reduction_def), because
                 this case would require actually doing something here.  */
              if ((!STMT_VINFO_RELEVANT_P (stmt_info)
                   || STMT_VINFO_LIVE_P (stmt_info))
                  && STMT_VINFO_DEF_TYPE (stmt_info)
                     != vect_double_reduction_def)
                {
                  if (vect_print_dump_info (REPORT_DETAILS))
                    fprintf (vect_dump,
                             "Unsupported loop-closed phi in outer-loop.");
                  return false;
                }

              /* If PHI is used in the outer loop, we check that its operand
                 is defined in the inner loop.  */
              if (STMT_VINFO_RELEVANT_P (stmt_info))
                {
                  tree phi_op;
                  gimple op_def_stmt;

                  if (gimple_phi_num_args (phi) != 1)
                    return false;

                  phi_op = PHI_ARG_DEF (phi, 0);
                  if (TREE_CODE (phi_op) != SSA_NAME)
                    return false;

                  op_def_stmt = SSA_NAME_DEF_STMT (phi_op);
                  if (!op_def_stmt || !vinfo_for_stmt (op_def_stmt))
                    return false;

                  if (STMT_VINFO_RELEVANT (vinfo_for_stmt (op_def_stmt))
                        != vect_used_in_outer
                      && STMT_VINFO_RELEVANT (vinfo_for_stmt (op_def_stmt))
                           != vect_used_in_outer_by_reduction)
                    return false;
                }

              continue;
            }

          gcc_assert (stmt_info);

          if (STMT_VINFO_LIVE_P (stmt_info))
            {
              /* FORNOW: not yet supported.  */
              if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
                fprintf (vect_dump, "not vectorized: value used after loop.");
              return false;
            }

          if (STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope
              && STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
            {
              /* A scalar-dependence cycle that we don't support.  */
              if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
                fprintf (vect_dump, "not vectorized: scalar dependence cycle.");
              return false;
            }

          if (STMT_VINFO_RELEVANT_P (stmt_info))
            {
              need_to_vectorize = true;
              if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def)
                ok = vectorizable_induction (phi, NULL, NULL);
            }

          if (!ok)
            {
              if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
                {
                  fprintf (vect_dump,
                           "not vectorized: relevant phi not supported: ");
                  print_gimple_stmt (vect_dump, phi, 0, TDF_SLIM);
                }
              return false;
            }
        }

      for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
        {
          gimple stmt = gsi_stmt (si);
          if (!vect_analyze_stmt (stmt, &need_to_vectorize, NULL))
            return false;
        }
    } /* bbs */

  /* All operations in the loop are either irrelevant (deal with loop
     control, or dead), or only used outside the loop and can be moved
     out of the loop (e.g. invariants, inductions).  The loop can be
     optimized away by scalar optimizations.  We're better off not
     touching this loop.  */
  if (!need_to_vectorize)
    {
      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump,
                 "All the computation can be taken out of the loop.");
      if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
        fprintf (vect_dump,
                 "not vectorized: redundant loop. no profit to vectorize.");
      return false;
    }

  if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
      && vect_print_dump_info (REPORT_DETAILS))
    fprintf (vect_dump,
             "vectorization_factor = %d, niters = " HOST_WIDE_INT_PRINT_DEC,
             vectorization_factor, LOOP_VINFO_INT_NITERS (loop_vinfo));

  if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
      && (LOOP_VINFO_INT_NITERS (loop_vinfo) < vectorization_factor))
    {
      if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
        fprintf (vect_dump, "not vectorized: iteration count too small.");
      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "not vectorized: iteration count smaller than "
                 "vectorization factor.");
      return false;
    }

  /* Analyze cost.  Decide if worth while to vectorize.  */

  /* Once VF is set, SLP costs should be updated since the number of created
     vector stmts depends on VF.  */
  vect_update_slp_costs_according_to_vf (loop_vinfo);

  min_profitable_iters = vect_estimate_min_profitable_iters (loop_vinfo);
  LOOP_VINFO_COST_MODEL_MIN_ITERS (loop_vinfo) = min_profitable_iters;

  if (min_profitable_iters < 0)
    {
      if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
        fprintf (vect_dump, "not vectorized: vectorization not profitable.");
      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "not vectorized: vector version will never be "
                 "profitable.");
      return false;
    }

  min_scalar_loop_bound = ((PARAM_VALUE (PARAM_MIN_VECT_LOOP_BOUND)
                            * vectorization_factor) - 1);
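  /* Worked example (illustrative numbers): with --param min-vect-loop-bound=2
     and a vectorization factor of 4, min_scalar_loop_bound is 2 * 4 - 1 = 7.
     The threshold TH computed below is the more conservative (larger) of this
     bound and the cost model's min_profitable_iters; loops known to run at
     most TH iterations are then rejected.  */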
  /* Use the cost model only if it is more conservative than user specified
     threshold.  */

  th = (unsigned) min_scalar_loop_bound;
  if (min_profitable_iters
      && (!min_scalar_loop_bound
          || min_profitable_iters > min_scalar_loop_bound))
    th = (unsigned) min_profitable_iters;

  if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
      && LOOP_VINFO_INT_NITERS (loop_vinfo) <= th)
    {
      if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
        fprintf (vect_dump, "not vectorized: vectorization not "
                 "profitable.");
      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "not vectorized: iteration count smaller than "
                 "user specified loop bound parameter or minimum "
                 "profitable iterations (whichever is more conservative).");
      return false;
    }

  if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
      || LOOP_VINFO_INT_NITERS (loop_vinfo) % vectorization_factor != 0
      || LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo))
    {
      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "epilog loop required.");
      if (!vect_can_advance_ivs_p (loop_vinfo))
        {
          if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
            fprintf (vect_dump,
                     "not vectorized: can't create epilog loop 1.");
          return false;
        }
      if (!slpeel_can_duplicate_loop_p (loop, single_exit (loop)))
        {
          if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
            fprintf (vect_dump,
                     "not vectorized: can't create epilog loop 2.");
          return false;
        }
    }

  return true;
}
/* Function vect_analyze_loop_2.

   Apply a set of analyses on LOOP, and create a loop_vec_info struct
   for it.  The different analyses will record information in the
   loop_vec_info struct.  */
static bool
vect_analyze_loop_2 (loop_vec_info loop_vinfo)
{
  bool ok, dummy, slp = false;
  int max_vf = MAX_VECTORIZATION_FACTOR;
  int min_vf = 2;

  /* Find all data references in the loop (which correspond to vdefs/vuses)
     and analyze their evolution in the loop.  Also adjust the minimal
     vectorization factor according to the loads and stores.

     FORNOW: Handle only simple array references whose
     alignment can be forced, and aligned pointer-references.  */

  ok = vect_analyze_data_refs (loop_vinfo, NULL, &min_vf);
  if (!ok)
    {
      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "bad data references.");
      return false;
    }

  /* Classify all cross-iteration scalar data-flow cycles.
     Cross-iteration cycles caused by virtual phis are analyzed separately.  */

  vect_analyze_scalar_cycles (loop_vinfo);

  vect_pattern_recog (loop_vinfo);

  /* Data-flow analysis to detect stmts that do not need to be vectorized.  */

  ok = vect_mark_stmts_to_be_vectorized (loop_vinfo);
  if (!ok)
    {
      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "unexpected pattern.");
      return false;
    }

  /* Analyze data dependences between the data-refs in the loop
     and adjust the maximum vectorization factor according to
     the dependences.
     FORNOW: fail at the first data dependence that we encounter.  */

  ok = vect_analyze_data_ref_dependences (loop_vinfo, NULL, &max_vf, &dummy);
  if (!ok
      || max_vf < min_vf)
    {
      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "bad data dependence.");
      return false;
    }

  ok = vect_determine_vectorization_factor (loop_vinfo);
  if (!ok)
    {
      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "can't determine vectorization factor.");
      return false;
    }
  if (max_vf < LOOP_VINFO_VECT_FACTOR (loop_vinfo))
    {
      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "bad data dependence.");
      return false;
    }

  /* Analyze the alignment of the data-refs in the loop.
     Fail if a data reference is found that cannot be vectorized.  */

  ok = vect_analyze_data_refs_alignment (loop_vinfo, NULL);
  if (!ok)
    {
      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "bad data alignment.");
      return false;
    }

  /* Analyze the access patterns of the data-refs in the loop (consecutive,
     complex, etc.).  FORNOW: Only handle consecutive access pattern.  */

  ok = vect_analyze_data_ref_accesses (loop_vinfo, NULL);
  if (!ok)
    {
      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "bad data access.");
      return false;
    }

  /* Prune the list of ddrs to be tested at run-time by versioning for alias.
     It is important to call pruning after vect_analyze_data_ref_accesses,
     since we use grouping information gathered by interleaving analysis.  */
  ok = vect_prune_runtime_alias_test_list (loop_vinfo);
  if (!ok)
    {
      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "too long list of versioning for alias "
                 "run-time tests.");
      return false;
    }

  /* This pass will decide on using loop versioning and/or loop peeling in
     order to enhance the alignment of data references in the loop.  */

  ok = vect_enhance_data_refs_alignment (loop_vinfo);
  if (!ok)
    {
      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "bad data alignment.");
      return false;
    }

  /* Check the SLP opportunities in the loop, analyze and build SLP trees.  */
  ok = vect_analyze_slp (loop_vinfo, NULL);
  if (ok)
    {
      /* Decide which possible SLP instances to SLP.  */
      slp = vect_make_slp_decision (loop_vinfo);

      /* Find stmts that need to be both vectorized and SLPed.  */
      vect_detect_hybrid_slp (loop_vinfo);
    }
  else
    return false;

  /* Scan all the operations in the loop and make sure they are
     vectorizable.  */

  ok = vect_analyze_loop_operations (loop_vinfo, slp);
  if (!ok)
    {
      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "bad operation or unsupported loop bound.");
      return false;
    }

  return true;
}
/* Function vect_analyze_loop.

   Apply a set of analyses on LOOP, and create a loop_vec_info struct
   for it.  The different analyses will record information in the
   loop_vec_info struct.  */
loop_vec_info
vect_analyze_loop (struct loop *loop)
{
  loop_vec_info loop_vinfo;
  unsigned int vector_sizes;

  /* Autodetect first vector size we try.  */
  current_vector_size = 0;
  vector_sizes = targetm.vectorize.autovectorize_vector_sizes ();

  if (vect_print_dump_info (REPORT_DETAILS))
    fprintf (vect_dump, "===== analyze_loop_nest =====");

  if (loop_outer (loop)
      && loop_vec_info_for_loop (loop_outer (loop))
      && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop))))
    {
      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "outer-loop already vectorized.");
      return NULL;
    }

  while (1)
    {
      /* Check the CFG characteristics of the loop (nesting, entry/exit).  */
      loop_vinfo = vect_analyze_loop_form (loop);
      if (!loop_vinfo)
        {
          if (vect_print_dump_info (REPORT_DETAILS))
            fprintf (vect_dump, "bad loop form.");
          return NULL;
        }

      if (vect_analyze_loop_2 (loop_vinfo))
        {
          LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1;

          return loop_vinfo;
        }

      destroy_loop_vec_info (loop_vinfo, true);

      vector_sizes &= ~current_vector_size;
      if (vector_sizes == 0
          || current_vector_size == 0)
        return NULL;
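      /* For example (target-dependent, illustration only): on a target
         advertising 32-byte and 16-byte vectors, autovectorize_vector_sizes
         returns the bit-mask 32 | 16.  The first, autodetected attempt uses
         the biggest size; if its analysis fails, that size's bit was cleared
         above and the next round retries with current_vector_size = 16.  */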
      /* Try the next biggest vector size.  */
      current_vector_size = 1 << floor_log2 (vector_sizes);
      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "***** Re-trying analysis with "
                 "vector size %d\n", current_vector_size);
    }
}
/* Function reduction_code_for_scalar_code

   Input:
   CODE - tree_code of a reduction operation.

   Output:
   REDUC_CODE - the corresponding tree-code to be used to reduce the
      vector of partial results into a single scalar result (which
      will also reside in a vector) or ERROR_MARK if the operation is
      a supported reduction operation, but does not have such a tree-code.

   Return FALSE if CODE currently cannot be vectorized as a reduction.  */
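/* E.g., for a sum reduction "sum += a[i]", CODE is PLUS_EXPR and the vector
   of partial sums is reduced with REDUC_PLUS_EXPR; for MULT_EXPR no such
   tree-code exists, so ERROR_MARK is returned and the reduction epilogue
   has to combine the vector elements by other means.  */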
static bool
reduction_code_for_scalar_code (enum tree_code code,
                                enum tree_code *reduc_code)
{
  switch (code)
    {
      case MAX_EXPR:
        *reduc_code = REDUC_MAX_EXPR;
        return true;

      case MIN_EXPR:
        *reduc_code = REDUC_MIN_EXPR;
        return true;

      case PLUS_EXPR:
        *reduc_code = REDUC_PLUS_EXPR;
        return true;

      case MULT_EXPR:
      case MINUS_EXPR:
      case BIT_IOR_EXPR:
      case BIT_XOR_EXPR:
      case BIT_AND_EXPR:
        *reduc_code = ERROR_MARK;
        return true;

      default:
        return false;
    }
}


/* Error reporting helper for vect_is_simple_reduction below.  GIMPLE statement
   STMT is printed with a message MSG.  */

static void
report_vect_op (gimple stmt, const char *msg)
{
  fprintf (vect_dump, "%s", msg);
  print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
}
/* Detect SLP reduction of the form:

   #a1 = phi <a5, a0>
   a2 = operation (a1)
   a3 = operation (a2)
   a4 = operation (a3)
   a5 = operation (a4)

   #a = phi <a5>

   PHI is the reduction phi node (#a1 = phi <a5, a0> above)
   FIRST_STMT is the first reduction stmt in the chain
   (a2 = operation (a1)).

   Return TRUE if a reduction chain was detected.  */

static bool
vect_is_slp_reduction (loop_vec_info loop_info, gimple phi, gimple first_stmt)
{
  struct loop *loop = (gimple_bb (phi))->loop_father;
  struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
  enum tree_code code;
  gimple current_stmt = NULL, use_stmt = NULL, first, next_stmt;
  stmt_vec_info use_stmt_info, current_stmt_info;
  tree lhs;
  imm_use_iterator imm_iter;
  use_operand_p use_p;
  int nloop_uses, size = 0, nuses;
  bool found = false;

  if (loop != vect_loop)
    return false;

  lhs = PHI_RESULT (phi);
  code = gimple_assign_rhs_code (first_stmt);
  while (1)
    {
      nloop_uses = 0;
      nuses = 0;
      FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
        {
          use_stmt = USE_STMT (use_p);
          nuses++;
          if (is_gimple_debug (use_stmt))
            continue;

          /* Check if we got back to the reduction phi.  */
          if (gimple_code (use_stmt) == GIMPLE_PHI
              && use_stmt == phi)
            {
              found = true;
              break;
            }

          if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt))
              && vinfo_for_stmt (use_stmt)
              && !is_pattern_stmt_p (vinfo_for_stmt (use_stmt))
              && use_stmt != first_stmt)
            nloop_uses++;

          if (nloop_uses > 1)
            return false;
        }

      /* We reached a statement with no uses.  */
      if (nuses == 0)
        return false;

      if (found)
        break;

      /* This is a loop exit phi, and we haven't reached the reduction phi.  */
      if (gimple_code (use_stmt) == GIMPLE_PHI)
        return false;

      if (!is_gimple_assign (use_stmt)
          || code != gimple_assign_rhs_code (use_stmt)
          || !flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
        return false;

      /* Insert USE_STMT into reduction chain.  */
      use_stmt_info = vinfo_for_stmt (use_stmt);
      if (current_stmt)
        {
          current_stmt_info = vinfo_for_stmt (current_stmt);
          GROUP_NEXT_ELEMENT (current_stmt_info) = use_stmt;
          GROUP_FIRST_ELEMENT (use_stmt_info)
            = GROUP_FIRST_ELEMENT (current_stmt_info);
        }
      else
        GROUP_FIRST_ELEMENT (use_stmt_info) = use_stmt;

      lhs = gimple_assign_lhs (use_stmt);
      current_stmt = use_stmt;
      size++;
    }

  if (!found || use_stmt != phi || size < 2)
    return false;

  /* Swap the operands, if needed, to make the reduction operand be the second
     operand.  */
  lhs = PHI_RESULT (phi);
  next_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (current_stmt));
  while (next_stmt)
    {
      if (get_gimple_rhs_class (code) == GIMPLE_BINARY_RHS)
        {
          if (gimple_assign_rhs2 (next_stmt) == lhs)
            {
              tree op = gimple_assign_rhs1 (next_stmt);
              gimple def_stmt = NULL;

              if (TREE_CODE (op) == SSA_NAME)
                def_stmt = SSA_NAME_DEF_STMT (op);

              /* Check that the other def is either defined in the loop
                 ("vect_internal_def"), or it's an induction (defined by a
                 loop-header phi-node).  */
              if (code == COND_EXPR
                  || (def_stmt
                      && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
                      && (is_gimple_assign (def_stmt)
                          || is_gimple_call (def_stmt)
                          || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
                               == vect_induction_def
                          || (gimple_code (def_stmt) == GIMPLE_PHI
                              && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
                                   == vect_internal_def
                              && !is_loop_header_bb_p (gimple_bb (def_stmt))))))
                {
                  lhs = gimple_assign_lhs (next_stmt);
                  next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
                  continue;
                }

              return false;
            }
          else
            {
              tree op = gimple_assign_rhs2 (next_stmt);
              gimple def_stmt = NULL;

              if (TREE_CODE (op) == SSA_NAME)
                def_stmt = SSA_NAME_DEF_STMT (op);

              /* Check that the other def is either defined in the loop
                 ("vect_internal_def"), or it's an induction (defined by a
                 loop-header phi-node).  */
              if (code == COND_EXPR
                  || (def_stmt
                      && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
                      && (is_gimple_assign (def_stmt)
                          || is_gimple_call (def_stmt)
                          || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
                               == vect_induction_def
                          || (gimple_code (def_stmt) == GIMPLE_PHI
                              && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
                                   == vect_internal_def
                              && !is_loop_header_bb_p (gimple_bb (def_stmt))))))
                {
                  if (vect_print_dump_info (REPORT_DETAILS))
                    {
                      fprintf (vect_dump, "swapping oprnds: ");
                      print_gimple_stmt (vect_dump, next_stmt, 0, TDF_SLIM);
                    }

                  swap_tree_operands (next_stmt,
                                      gimple_assign_rhs1_ptr (next_stmt),
                                      gimple_assign_rhs2_ptr (next_stmt));
                  mark_symbols_for_renaming (next_stmt);
                }
              else
                return false;
            }
        }

      lhs = gimple_assign_lhs (next_stmt);
      next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
    }

  /* Save the chain for further analysis in SLP detection.  */
  first = GROUP_FIRST_ELEMENT (vinfo_for_stmt (current_stmt));
  VEC_safe_push (gimple, heap, LOOP_VINFO_REDUCTION_CHAINS (loop_info), first);
  GROUP_SIZE (vinfo_for_stmt (first)) = size;

  return true;
}
1871 /* Function vect_is_simple_reduction_1
1873 (1) Detect a cross-iteration def-use cycle that represents a simple
1874 reduction computation. We look for the following pattern:
1876 loop_header:
1877 a1 = phi < a0, a2 >
1878 a3 = ...
1879 a2 = operation (a3, a1)
1881 such that:
1882 1. operation is commutative and associative and it is safe to
1883 change the order of the computation (if CHECK_REDUCTION is true)
1884 2. no uses of a2 in the loop (a2 is used out of the loop)
1885 3. no uses of a1 in the loop besides the reduction operation
1886 4. no uses of a1 outside the loop.
1888 Conditions 1,4 are tested here.
1889 Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.
1891 (2) Detect a cross-iteration def-use cycle in nested loops, i.e.,
1892 nested cycles, if CHECK_REDUCTION is false.
1894 (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double
1895 reductions:
1897 a1 = phi < a0, a2 >
1898 inner loop (def of a3)
1899 a2 = phi < a3 >
1901 If MODIFY is true it also tries to rework the code in-place to enable
1902 detection of more reduction patterns. For the time being we rewrite
1903 "res -= RHS" into "res += -RHS" when it seems worthwhile. */
1906 static gimple
1907 vect_is_simple_reduction_1 (loop_vec_info loop_info, gimple phi,
1908 bool check_reduction, bool *double_reduc,
1909 bool modify)
1911 struct loop *loop = (gimple_bb (phi))->loop_father;
1912 struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
1913 edge latch_e = loop_latch_edge (loop);
1914 tree loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
1915 gimple def_stmt, def1 = NULL, def2 = NULL;
1916 enum tree_code orig_code, code;
1917 tree op1, op2, op3 = NULL_TREE, op4 = NULL_TREE;
1918 tree type;
1919 int nloop_uses;
1920 tree name;
1921 imm_use_iterator imm_iter;
1922 use_operand_p use_p;
1923 bool phi_def;
1925 *double_reduc = false;
1927 /* If CHECK_REDUCTION is true, we assume inner-most loop vectorization,
1928 otherwise, we assume outer loop vectorization. */
1929 gcc_assert ((check_reduction && loop == vect_loop)
1930 || (!check_reduction && flow_loop_nested_p (vect_loop, loop)));
1932 name = PHI_RESULT (phi);
1933 nloop_uses = 0;
1934 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name)
1936 gimple use_stmt = USE_STMT (use_p);
1937 if (is_gimple_debug (use_stmt))
1938 continue;
1940 if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
1942 if (vect_print_dump_info (REPORT_DETAILS))
1943 fprintf (vect_dump, "intermediate value used outside loop.");
1945 return NULL;
1948 if (vinfo_for_stmt (use_stmt)
1949 && !is_pattern_stmt_p (vinfo_for_stmt (use_stmt)))
1950 nloop_uses++;
1951 if (nloop_uses > 1)
1953 if (vect_print_dump_info (REPORT_DETAILS))
1954 fprintf (vect_dump, "reduction used in loop.");
1955 return NULL;
1959 if (TREE_CODE (loop_arg) != SSA_NAME)
1961 if (vect_print_dump_info (REPORT_DETAILS))
1963 fprintf (vect_dump, "reduction: not ssa_name: ");
1964 print_generic_expr (vect_dump, loop_arg, TDF_SLIM);
1966 return NULL;
1969 def_stmt = SSA_NAME_DEF_STMT (loop_arg);
1970 if (!def_stmt)
1972 if (vect_print_dump_info (REPORT_DETAILS))
1973 fprintf (vect_dump, "reduction: no def_stmt.");
1974 return NULL;
1977 if (!is_gimple_assign (def_stmt) && gimple_code (def_stmt) != GIMPLE_PHI)
1979 if (vect_print_dump_info (REPORT_DETAILS))
1980 print_gimple_stmt (vect_dump, def_stmt, 0, TDF_SLIM);
1981 return NULL;
1984 if (is_gimple_assign (def_stmt))
1986 name = gimple_assign_lhs (def_stmt);
1987 phi_def = false;
1989 else
1991 name = PHI_RESULT (def_stmt);
1992 phi_def = true;
1995 nloop_uses = 0;
1996 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name)
1998 gimple use_stmt = USE_STMT (use_p);
1999 if (is_gimple_debug (use_stmt))
2000 continue;
2001 if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt))
2002 && vinfo_for_stmt (use_stmt)
2003 && !is_pattern_stmt_p (vinfo_for_stmt (use_stmt)))
2004 nloop_uses++;
2005 if (nloop_uses > 1)
2007 if (vect_print_dump_info (REPORT_DETAILS))
2008 fprintf (vect_dump, "reduction used in loop.");
2009 return NULL;
2013 /* If DEF_STMT is a phi node itself, we expect it to have a single argument
2014 defined in the inner loop. */
2015 if (phi_def)
2017 op1 = PHI_ARG_DEF (def_stmt, 0);
2019 if (gimple_phi_num_args (def_stmt) != 1
2020 || TREE_CODE (op1) != SSA_NAME)
2022 if (vect_print_dump_info (REPORT_DETAILS))
2023 fprintf (vect_dump, "unsupported phi node definition.");
2025 return NULL;
2028 def1 = SSA_NAME_DEF_STMT (op1);
2029 if (flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2030 && loop->inner
2031 && flow_bb_inside_loop_p (loop->inner, gimple_bb (def1))
2032 && is_gimple_assign (def1))
2034 if (vect_print_dump_info (REPORT_DETAILS))
2035 report_vect_op (def_stmt, "detected double reduction: ");
2037 *double_reduc = true;
2038 return def_stmt;
2041 return NULL;
2044 code = orig_code = gimple_assign_rhs_code (def_stmt);
2046 /* We can handle "res -= x[i]", which is non-associative, by
2047 simply rewriting it into "res += -x[i]". Avoid changing the
2048 gimple statement for the first simple tests and only do this
2049 if we're allowed to change the code at all. */
2050 if (code == MINUS_EXPR
2051 && modify
2052 && (op1 = gimple_assign_rhs1 (def_stmt))
2053 && TREE_CODE (op1) == SSA_NAME
2054 && SSA_NAME_DEF_STMT (op1) == phi)
2055 code = PLUS_EXPR;
2057 if (check_reduction
2058 && (!commutative_tree_code (code) || !associative_tree_code (code)))
2060 if (vect_print_dump_info (REPORT_DETAILS))
2061 report_vect_op (def_stmt, "reduction: not commutative/associative: ");
2062 return NULL;
2065 if (get_gimple_rhs_class (code) != GIMPLE_BINARY_RHS)
2067 if (code != COND_EXPR)
2069 if (vect_print_dump_info (REPORT_DETAILS))
2070 report_vect_op (def_stmt, "reduction: not binary operation: ");
2072 return NULL;
2075 op3 = TREE_OPERAND (gimple_assign_rhs1 (def_stmt), 0);
2076 if (COMPARISON_CLASS_P (op3))
2078 op4 = TREE_OPERAND (op3, 1);
2079 op3 = TREE_OPERAND (op3, 0);
2082 op1 = TREE_OPERAND (gimple_assign_rhs1 (def_stmt), 1);
2083 op2 = TREE_OPERAND (gimple_assign_rhs1 (def_stmt), 2);
2085 if (TREE_CODE (op1) != SSA_NAME && TREE_CODE (op2) != SSA_NAME)
2087 if (vect_print_dump_info (REPORT_DETAILS))
2088 report_vect_op (def_stmt, "reduction: uses not ssa_names: ");
2090 return NULL;
2093 else
2095 op1 = gimple_assign_rhs1 (def_stmt);
2096 op2 = gimple_assign_rhs2 (def_stmt);
2098 if (TREE_CODE (op1) != SSA_NAME || TREE_CODE (op2) != SSA_NAME)
2100 if (vect_print_dump_info (REPORT_DETAILS))
2101 report_vect_op (def_stmt, "reduction: uses not ssa_names: ");
2103 return NULL;
2107 type = TREE_TYPE (gimple_assign_lhs (def_stmt));
2108 if ((TREE_CODE (op1) == SSA_NAME
2109 && !types_compatible_p (type, TREE_TYPE (op1)))
2110 || (TREE_CODE (op2) == SSA_NAME
2111 && !types_compatible_p (type, TREE_TYPE (op2)))
2112 || (op3 && TREE_CODE (op3) == SSA_NAME
2113 && !types_compatible_p (type, TREE_TYPE (op3)))
2114 || (op4 && TREE_CODE (op4) == SSA_NAME
2115 && !types_compatible_p (type, TREE_TYPE (op4))))
2117 if (vect_print_dump_info (REPORT_DETAILS))
2119 fprintf (vect_dump, "reduction: multiple types: operation type: ");
2120 print_generic_expr (vect_dump, type, TDF_SLIM);
2121 fprintf (vect_dump, ", operands types: ");
2122 print_generic_expr (vect_dump, TREE_TYPE (op1), TDF_SLIM);
2123 fprintf (vect_dump, ",");
2124 print_generic_expr (vect_dump, TREE_TYPE (op2), TDF_SLIM);
2125 if (op3)
2127 fprintf (vect_dump, ",");
2128 print_generic_expr (vect_dump, TREE_TYPE (op3), TDF_SLIM);
2131 if (op4)
2133 fprintf (vect_dump, ",");
2134 print_generic_expr (vect_dump, TREE_TYPE (op4), TDF_SLIM);
2138 return NULL;
2141 /* Check that it's ok to change the order of the computation.
2142 Generally, when vectorizing a reduction we change the order of the
2143 computation. This may change the behavior of the program in some
2144 cases, so we need to check that this is ok. One exception is when
2145 vectorizing an outer-loop: the inner-loop is executed sequentially,
2146 and therefore vectorizing reductions in the inner-loop during
2147 outer-loop vectorization is safe. */
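/* E.g., a vectorized float summation keeps several partial sums (one
   per vector lane) that are only combined after the loop, whereas the
   scalar loop accumulates strictly left-to-right; the two orders can
   round differently, which is why this is allowed only under
   -fassociative-math.  */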
2149 /* CHECKME: check for !flag_finite_math_only too? */
2150 if (SCALAR_FLOAT_TYPE_P (type) && !flag_associative_math
2151 && check_reduction)
2153 /* Changing the order of operations changes the semantics. */
2154 if (vect_print_dump_info (REPORT_DETAILS))
2155 report_vect_op (def_stmt, "reduction: unsafe fp math optimization: ");
2156 return NULL;
2158 else if (INTEGRAL_TYPE_P (type) && TYPE_OVERFLOW_TRAPS (type)
2159 && check_reduction)
2161 /* Changing the order of operations changes the semantics. */
2162 if (vect_print_dump_info (REPORT_DETAILS))
2163 report_vect_op (def_stmt, "reduction: unsafe int math optimization: ");
2164 return NULL;
2166 else if (SAT_FIXED_POINT_TYPE_P (type) && check_reduction)
2168 /* Changing the order of operations changes the semantics. */
2169 if (vect_print_dump_info (REPORT_DETAILS))
2170 report_vect_op (def_stmt,
2171 "reduction: unsafe fixed-point math optimization: ");
2172 return NULL;
2175 /* If we detected "res -= x[i]" earlier, rewrite it into
2176 "res += -x[i]" now. If this turns out to be useless reassoc
2177 will clean it up again. */
2178 if (orig_code == MINUS_EXPR)
2180 tree rhs = gimple_assign_rhs2 (def_stmt);
2181 tree negrhs = make_ssa_name (SSA_NAME_VAR (rhs), NULL);
2182 gimple negate_stmt = gimple_build_assign_with_ops (NEGATE_EXPR, negrhs,
2183 rhs, NULL);
2184 gimple_stmt_iterator gsi = gsi_for_stmt (def_stmt);
2185 set_vinfo_for_stmt (negate_stmt, new_stmt_vec_info (negate_stmt,
2186 loop_info, NULL));
2187 gsi_insert_before (&gsi, negate_stmt, GSI_NEW_STMT);
2188 gimple_assign_set_rhs2 (def_stmt, negrhs);
2189 gimple_assign_set_rhs_code (def_stmt, PLUS_EXPR);
2190 update_stmt (def_stmt);
2193 /* Reduction is safe. We're dealing with one of the following:
2194 1) integer arithmetic and no trapv
2195 2) floating point arithmetic, and special flags permit this optimization
2196 3) nested cycle (i.e., outer loop vectorization). */
2197 if (TREE_CODE (op1) == SSA_NAME)
2198 def1 = SSA_NAME_DEF_STMT (op1);
2200 if (TREE_CODE (op2) == SSA_NAME)
2201 def2 = SSA_NAME_DEF_STMT (op2);
2203 if (code != COND_EXPR
2204 && (!def1 || !def2 || gimple_nop_p (def1) || gimple_nop_p (def2)))
2206 if (vect_print_dump_info (REPORT_DETAILS))
2207 report_vect_op (def_stmt, "reduction: no defs for operands: ");
2208 return NULL;
2211 /* Check that one def is the reduction def, defined by PHI,
2212 the other def is either defined in the loop ("vect_internal_def"),
2213 or it's an induction (defined by a loop-header phi-node). */
2215 if (def2 && def2 == phi
2216 && (code == COND_EXPR
2217 || (def1 && flow_bb_inside_loop_p (loop, gimple_bb (def1))
2218 && (is_gimple_assign (def1)
2219 || is_gimple_call (def1)
2220 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1))
2221 == vect_induction_def
2222 || (gimple_code (def1) == GIMPLE_PHI
2223 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1))
2224 == vect_internal_def
2225 && !is_loop_header_bb_p (gimple_bb (def1)))))))
2227 if (vect_print_dump_info (REPORT_DETAILS))
2228 report_vect_op (def_stmt, "detected reduction: ");
2229 return def_stmt;
2232 if (def1 && def1 == phi
2233 && (code == COND_EXPR
2234 || (def2 && flow_bb_inside_loop_p (loop, gimple_bb (def2))
2235 && (is_gimple_assign (def2)
2236 || is_gimple_call (def2)
2237 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2))
2238 == vect_induction_def
2239 || (gimple_code (def2) == GIMPLE_PHI
2240 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2))
2241 == vect_internal_def
2242 && !is_loop_header_bb_p (gimple_bb (def2)))))))
2244 if (check_reduction)
2246 /* Swap operands (just for simplicity - so that the rest of the code
2247 can assume that the reduction variable is always the last (second)
2248 argument). */
2249 if (vect_print_dump_info (REPORT_DETAILS))
2250 report_vect_op (def_stmt,
2251 "detected reduction: need to swap operands: ");
2253 swap_tree_operands (def_stmt, gimple_assign_rhs1_ptr (def_stmt),
2254 gimple_assign_rhs2_ptr (def_stmt));
2256 else
2258 if (vect_print_dump_info (REPORT_DETAILS))
2259 report_vect_op (def_stmt, "detected reduction: ");
2262 return def_stmt;
2265 /* Try to find SLP reduction chain. */
2266 if (vect_is_slp_reduction (loop_info, phi, def_stmt))
2268 if (vect_print_dump_info (REPORT_DETAILS))
2269 report_vect_op (def_stmt, "reduction: detected reduction chain: ");
2271 return def_stmt;
2274 if (vect_print_dump_info (REPORT_DETAILS))
2275 report_vect_op (def_stmt, "reduction: unknown pattern: ");
2277 return NULL;
2280 /* Wrapper around vect_is_simple_reduction_1, which won't modify code
2281 in-place. Arguments as there. */
2283 static gimple
2284 vect_is_simple_reduction (loop_vec_info loop_info, gimple phi,
2285 bool check_reduction, bool *double_reduc)
2287 return vect_is_simple_reduction_1 (loop_info, phi, check_reduction,
2288 double_reduc, false);
2291 /* Wrapper around vect_is_simple_reduction_1, which will modify code
2292 in-place if it enables detection of more reductions. Arguments
2293 as there. */
2295 gimple
2296 vect_force_simple_reduction (loop_vec_info loop_info, gimple phi,
2297 bool check_reduction, bool *double_reduc)
2299 return vect_is_simple_reduction_1 (loop_info, phi, check_reduction,
2300 double_reduc, true);
2303 /* Calculate the cost of one scalar iteration of the loop. */
2304 int
2305 vect_get_single_scalar_iteraion_cost (loop_vec_info loop_vinfo)
2307 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2308 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
2309 int nbbs = loop->num_nodes, factor, scalar_single_iter_cost = 0;
2310 int innerloop_iters, i, stmt_cost;
2312 /* Count statements in scalar loop. Using this as scalar cost for a single
2313 iteration for now.
2315 TODO: Add outer loop support.
2317 TODO: Consider assigning different costs to different scalar
2318 statements. */
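/* E.g. (a sketch), for a loop body a[i] = b[i] + c[i], one scalar
   iteration is costed below as two scalar_load's, one scalar_stmt for
   the addition and one scalar_store, each weighted by FACTOR when the
   statement sits in an inner loop.  */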
2320 /* FORNOW. */
2321 innerloop_iters = 1;
2322 if (loop->inner)
2323 innerloop_iters = 50; /* FIXME */
2325 for (i = 0; i < nbbs; i++)
2327 gimple_stmt_iterator si;
2328 basic_block bb = bbs[i];
2330 if (bb->loop_father == loop->inner)
2331 factor = innerloop_iters;
2332 else
2333 factor = 1;
2335 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
2337 gimple stmt = gsi_stmt (si);
2338 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2340 if (!is_gimple_assign (stmt) && !is_gimple_call (stmt))
2341 continue;
2343 /* Skip stmts that are not vectorized inside the loop. */
2344 if (stmt_info
2345 && !STMT_VINFO_RELEVANT_P (stmt_info)
2346 && (!STMT_VINFO_LIVE_P (stmt_info)
2347 || STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def))
2348 continue;
2350 if (STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt)))
2352 if (DR_IS_READ (STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt))))
2353 stmt_cost = vect_get_cost (scalar_load);
2354 else
2355 stmt_cost = vect_get_cost (scalar_store);
2357 else
2358 stmt_cost = vect_get_cost (scalar_stmt);
2360 scalar_single_iter_cost += stmt_cost * factor;
2363 return scalar_single_iter_cost;
2366 /* Calculate cost of peeling the loop PEEL_ITERS_PROLOGUE times. */
2367 int
2368 vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue,
2369 int *peel_iters_epilogue,
2370 int scalar_single_iter_cost)
2372 int peel_guard_costs = 0;
2373 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2375 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
2377 *peel_iters_epilogue = vf/2;
2378 if (vect_print_dump_info (REPORT_COST))
2379 fprintf (vect_dump, "cost model: "
2380 "epilogue peel iters set to vf/2 because "
2381 "loop iterations are unknown .");
2383 /* If peeled iterations are known but the number of scalar loop
2384 iterations is unknown, count a taken branch per peeled loop. */
2385 peel_guard_costs = 2 * vect_get_cost (cond_branch_taken);
2387 else
2389 int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
2390 peel_iters_prologue = niters < peel_iters_prologue ?
2391 niters : peel_iters_prologue;
2392 *peel_iters_epilogue = (niters - peel_iters_prologue) % vf;
2393 /* If we need to peel for gaps but no epilogue peeling turned out to be
2394 required, we still have to peel VF iterations. */
2395 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) && !*peel_iters_epilogue)
2396 *peel_iters_epilogue = vf;
2399 return (peel_iters_prologue * scalar_single_iter_cost)
2400 + (*peel_iters_epilogue * scalar_single_iter_cost)
2401 + peel_guard_costs;
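/* For instance (illustrative numbers): with NITERS = 17 known, VF = 4
   and PEEL_ITERS_PROLOGUE = 3, *PEEL_ITERS_EPILOGUE becomes
   (17 - 3) % 4 = 2 and the result is (3 + 2) * SCALAR_SINGLE_ITER_COST
   with no guard costs; with NITERS unknown, the epilogue count defaults
   to VF/2 and two taken-branch guard costs are added.  */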
2404 /* Function vect_estimate_min_profitable_iters
2406 Return the number of iterations required for the vector version of the
2407 loop to be profitable relative to the cost of the scalar version of the
2408 loop.
2410 TODO: Take profile info into account before making vectorization
2411 decisions, if available. */
2413 int
2414 vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo)
2416 int i;
2417 int min_profitable_iters;
2418 int peel_iters_prologue;
2419 int peel_iters_epilogue;
2420 int vec_inside_cost = 0;
2421 int vec_outside_cost = 0;
2422 int scalar_single_iter_cost = 0;
2423 int scalar_outside_cost = 0;
2424 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2425 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2426 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
2427 int nbbs = loop->num_nodes;
2428 int npeel = LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo);
2429 int peel_guard_costs = 0;
2430 int innerloop_iters = 0, factor;
2431 VEC (slp_instance, heap) *slp_instances;
2432 slp_instance instance;
2434 /* Cost model disabled. */
2435 if (!flag_vect_cost_model)
2437 if (vect_print_dump_info (REPORT_COST))
2438 fprintf (vect_dump, "cost model disabled.");
2439 return 0;
2442 /* Requires loop versioning tests to handle misalignment. */
2443 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
2445 /* FIXME: Make cost depend on complexity of individual check. */
2446 vec_outside_cost +=
2447 VEC_length (gimple, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo));
2448 if (vect_print_dump_info (REPORT_COST))
2449 fprintf (vect_dump, "cost model: Adding cost of checks for loop "
2450 "versioning to treat misalignment.\n");
2453 /* Requires loop versioning with alias checks. */
2454 if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
2456 /* FIXME: Make cost depend on complexity of individual check. */
2457 vec_outside_cost +=
2458 VEC_length (ddr_p, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo));
2459 if (vect_print_dump_info (REPORT_COST))
2460 fprintf (vect_dump, "cost model: Adding cost of checks for loop "
2461 "versioning aliasing.\n");
2464 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
2465 || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
2466 vec_outside_cost += vect_get_cost (cond_branch_taken);
2468 /* Accumulate the inside-of-loop and outside-of-loop costs of the vector
2469 statements; the scalar single-iteration cost is computed separately below.
2471 TODO: Add outer loop support.
2473 TODO: Consider assigning different costs to different vector
2474 statements. */
2476 /* FORNOW. */
2477 if (loop->inner)
2478 innerloop_iters = 50; /* FIXME */
2480 for (i = 0; i < nbbs; i++)
2482 gimple_stmt_iterator si;
2483 basic_block bb = bbs[i];
2485 if (bb->loop_father == loop->inner)
2486 factor = innerloop_iters;
2487 else
2488 factor = 1;
2490 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
2492 gimple stmt = gsi_stmt (si);
2493 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2494 /* Skip stmts that are not vectorized inside the loop. */
2495 if (!STMT_VINFO_RELEVANT_P (stmt_info)
2496 && (!STMT_VINFO_LIVE_P (stmt_info)
2497 || STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def))
2498 continue;
2499 vec_inside_cost += STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info) * factor;
2500 /* FIXME: for stmts in the inner-loop in outer-loop vectorization,
2501 some of the "outside" costs are generated inside the outer-loop. */
2502 vec_outside_cost += STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info);
2506 scalar_single_iter_cost = vect_get_single_scalar_iteraion_cost (loop_vinfo);
2508 /* Add additional cost for the peeled instructions in prologue and epilogue
2509 loop.
2511 FORNOW: If we don't know the value of peel_iters for prologue or epilogue
2512 at compile-time - we assume it's vf/2 (the worst would be vf-1).
2514 TODO: Build an expression that represents peel_iters for prologue and
2515 epilogue to be used in a run-time test. */
2517 if (npeel < 0)
2519 peel_iters_prologue = vf/2;
2520 if (vect_print_dump_info (REPORT_COST))
2521 fprintf (vect_dump, "cost model: "
2522 "prologue peel iters set to vf/2.");
2524 /* If peeling for alignment is unknown, the loop bound of the main loop
2525 becomes unknown. */
2526 peel_iters_epilogue = vf/2;
2527 if (vect_print_dump_info (REPORT_COST))
2528 fprintf (vect_dump, "cost model: "
2529 "epilogue peel iters set to vf/2 because "
2530 "peeling for alignment is unknown .");
2532 /* If peeled iterations are unknown, count a taken branch and a not taken
2533 branch per peeled loop. Even if scalar loop iterations are known,
2534 vector iterations are not known since peeled prologue iterations are
2535 not known. Hence guards remain the same. */
2536 peel_guard_costs += 2 * (vect_get_cost (cond_branch_taken)
2537 + vect_get_cost (cond_branch_not_taken));
2538 vec_outside_cost += (peel_iters_prologue * scalar_single_iter_cost)
2539 + (peel_iters_epilogue * scalar_single_iter_cost)
2540 + peel_guard_costs;
2542 else
2544 peel_iters_prologue = npeel;
2545 vec_outside_cost += vect_get_known_peeling_cost (loop_vinfo,
2546 peel_iters_prologue, &peel_iters_epilogue,
2547 scalar_single_iter_cost);
2550 /* FORNOW: The scalar outside cost is incremented in one of the
2551 following ways:
2553 1. The vectorizer checks for alignment and aliasing and generates
2554 a condition that allows dynamic vectorization. A cost model
2555 check is ANDed with the versioning condition. Hence the scalar code
2556 path now has the added cost of the versioning check.
2558 if (cost > th & versioning_check)
2559 jmp to vector code
2561 Hence the run-time scalar cost is incremented by a not-taken branch cost.
2563 2. The vectorizer then checks if a prologue is required. If the
2564 cost model check was not done before during versioning, it has to
2565 be done before the prologue check.
2567 if (cost <= th)
2568 prologue = scalar_iters
2569 if (prologue == 0)
2570 jmp to vector code
2571 else
2572 execute prologue
2573 if (prologue == num_iters)
2574 go to exit
2576 Hence the run-time scalar cost is incremented by a taken branch,
2577 plus a not-taken branch, plus a taken branch cost.
2579 3. The vectorizer then checks if an epilogue is required. If the
2580 cost model check was not done before during prologue check, it
2581 has to be done with the epilogue check.
2583 if (prologue == 0)
2584 jmp to vector code
2585 else
2586 execute prologue
2587 if (prologue == num_iters)
2588 go to exit
2589 vector code:
2590 if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
2591 jmp to epilogue
2593 Hence the run-time scalar cost should be incremented by 2 taken
2594 branches.
2596 TODO: The back end may reorder the BBS's differently and reverse
2597 conditions/branch directions. Change the estimates below to
2598 something more reasonable. */
2600 /* If the number of iterations is known and we do not do versioning, we can
2601 decide whether to vectorize at compile time. Hence the scalar version
2602 does not carry cost model guard costs. */
2603 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2604 || LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
2605 || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
2607 /* Cost model check occurs at versioning. */
2608 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
2609 || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
2610 scalar_outside_cost += vect_get_cost (cond_branch_not_taken);
2611 else
2613 /* Cost model check occurs at prologue generation. */
2614 if (LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
2615 scalar_outside_cost += 2 * vect_get_cost (cond_branch_taken)
2616 + vect_get_cost (cond_branch_not_taken);
2617 /* Cost model check occurs at epilogue generation. */
2618 else
2619 scalar_outside_cost += 2 * vect_get_cost (cond_branch_taken);
2623 /* Add SLP costs. */
2624 slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
2625 FOR_EACH_VEC_ELT (slp_instance, slp_instances, i, instance)
2627 vec_outside_cost += SLP_INSTANCE_OUTSIDE_OF_LOOP_COST (instance);
2628 vec_inside_cost += SLP_INSTANCE_INSIDE_OF_LOOP_COST (instance);
2631 /* Calculate number of iterations required to make the vector version
2632 profitable, relative to the loop bodies only. The following condition
2633 must hold true:
2634 SIC * niters + SOC > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC
2635 where
2636 SIC = scalar iteration cost, VIC = vector iteration cost,
2637 VOC = vector outside cost, VF = vectorization factor,
2638 PL_ITERS = prologue iterations, EP_ITERS = epilogue iterations,
2639 SOC = scalar outside cost for run time cost model check. */
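/* As a worked illustration (made-up costs): with SIC = 4, VIC = 6,
   VF = 4, VOC = 20, SOC = 0 and PL_ITERS = EP_ITERS = 2, the formula
   below gives ((20 - 0) * 4 - 6 * 2 - 6 * 2) / (4 * 4 - 6) = 56 / 10
   = 5, and the correction step bumps this to 6 because
   4 * 4 * 5 <= 6 * 5 + (20 - 0) * 4.  */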
2641 if ((scalar_single_iter_cost * vf) > vec_inside_cost)
2643 if (vec_outside_cost <= 0)
2644 min_profitable_iters = 1;
2645 else
2647 min_profitable_iters = ((vec_outside_cost - scalar_outside_cost) * vf
2648 - vec_inside_cost * peel_iters_prologue
2649 - vec_inside_cost * peel_iters_epilogue)
2650 / ((scalar_single_iter_cost * vf)
2651 - vec_inside_cost);
2653 if ((scalar_single_iter_cost * vf * min_profitable_iters)
2654 <= ((vec_inside_cost * min_profitable_iters)
2655 + ((vec_outside_cost - scalar_outside_cost) * vf)))
2656 min_profitable_iters++;
2659 /* vector version will never be profitable. */
2660 else
2662 if (vect_print_dump_info (REPORT_COST))
2663 fprintf (vect_dump, "cost model: the vector iteration cost = %d "
2664 "divided by the scalar iteration cost = %d "
2665 "is greater or equal to the vectorization factor = %d.",
2666 vec_inside_cost, scalar_single_iter_cost, vf);
2667 return -1;
2670 if (vect_print_dump_info (REPORT_COST))
2672 fprintf (vect_dump, "Cost model analysis: \n");
2673 fprintf (vect_dump, " Vector inside of loop cost: %d\n",
2674 vec_inside_cost);
2675 fprintf (vect_dump, " Vector outside of loop cost: %d\n",
2676 vec_outside_cost);
2677 fprintf (vect_dump, " Scalar iteration cost: %d\n",
2678 scalar_single_iter_cost);
2679 fprintf (vect_dump, " Scalar outside cost: %d\n", scalar_outside_cost);
2680 fprintf (vect_dump, " prologue iterations: %d\n",
2681 peel_iters_prologue);
2682 fprintf (vect_dump, " epilogue iterations: %d\n",
2683 peel_iters_epilogue);
2684 fprintf (vect_dump, " Calculated minimum iters for profitability: %d\n",
2685 min_profitable_iters);
2688 min_profitable_iters =
2689 min_profitable_iters < vf ? vf : min_profitable_iters;
2691 /* Because the condition we create is:
2692 if (niters <= min_profitable_iters)
2693 then skip the vectorized loop. */
2694 min_profitable_iters--;
2696 if (vect_print_dump_info (REPORT_COST))
2697 fprintf (vect_dump, " Profitability threshold = %d\n",
2698 min_profitable_iters);
2700 return min_profitable_iters;
2704 /* TODO: Close dependency between vect_model_*_cost and vectorizable_*
2705 functions. Design better to avoid maintenance issues. */
2707 /* Function vect_model_reduction_cost.
2709 Models cost for a reduction operation, including the vector ops
2710 generated within the strip-mine loop, the initial definition before
2711 the loop, and the epilogue code that must be generated. */
2713 static bool
2714 vect_model_reduction_cost (stmt_vec_info stmt_info, enum tree_code reduc_code,
2715 int ncopies)
2717 int outer_cost = 0;
2718 enum tree_code code;
2719 optab optab;
2720 tree vectype;
2721 gimple stmt, orig_stmt;
2722 tree reduction_op;
2723 enum machine_mode mode;
2724 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2725 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2728 /* Cost of reduction op inside loop. */
2729 STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info)
2730 += ncopies * vect_get_cost (vector_stmt);
2732 stmt = STMT_VINFO_STMT (stmt_info);
2734 switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
2736 case GIMPLE_SINGLE_RHS:
2737 gcc_assert (TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt)) == ternary_op);
2738 reduction_op = TREE_OPERAND (gimple_assign_rhs1 (stmt), 2);
2739 break;
2740 case GIMPLE_UNARY_RHS:
2741 reduction_op = gimple_assign_rhs1 (stmt);
2742 break;
2743 case GIMPLE_BINARY_RHS:
2744 reduction_op = gimple_assign_rhs2 (stmt);
2745 break;
2746 case GIMPLE_TERNARY_RHS:
2747 reduction_op = gimple_assign_rhs3 (stmt);
2748 break;
2749 default:
2750 gcc_unreachable ();
2753 vectype = get_vectype_for_scalar_type (TREE_TYPE (reduction_op));
2754 if (!vectype)
2756 if (vect_print_dump_info (REPORT_COST))
2758 fprintf (vect_dump, "unsupported data-type ");
2759 print_generic_expr (vect_dump, TREE_TYPE (reduction_op), TDF_SLIM);
2761 return false;
2764 mode = TYPE_MODE (vectype);
2765 orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
2767 if (!orig_stmt)
2768 orig_stmt = STMT_VINFO_STMT (stmt_info);
2770 code = gimple_assign_rhs_code (orig_stmt);
2772 /* Add in cost for initial definition. */
2773 outer_cost += vect_get_cost (scalar_to_vec);
2775 /* Determine cost of epilogue code.
2777 We have a reduction operator that will reduce the vector in one statement.
2778 Also requires scalar extract. */
2780 if (!nested_in_vect_loop_p (loop, orig_stmt))
2782 if (reduc_code != ERROR_MARK)
2783 outer_cost += vect_get_cost (vector_stmt)
2784 + vect_get_cost (vec_to_scalar);
2785 else
2787 int vec_size_in_bits = tree_low_cst (TYPE_SIZE (vectype), 1);
2788 tree bitsize =
2789 TYPE_SIZE (TREE_TYPE (gimple_assign_lhs (orig_stmt)));
2790 int element_bitsize = tree_low_cst (bitsize, 1);
2791 int nelements = vec_size_in_bits / element_bitsize;
2793 optab = optab_for_tree_code (code, vectype, optab_default);
2795 /* We have a whole vector shift available. */
2796 if (VECTOR_MODE_P (mode)
2797 && optab_handler (optab, mode) != CODE_FOR_nothing
2798 && optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
2799 /* Final reduction via vector shifts and the reduction operator. Also
2800 requires scalar extract. */
2801 outer_cost += ((exact_log2 (nelements) * 2)
2802 * vect_get_cost (vector_stmt)
2803 + vect_get_cost (vec_to_scalar));
2804 else
2805 /* Use extracts and reduction op for final reduction. For N elements,
2806 we have N extracts and N-1 reduction ops. */
2807 outer_cost += ((nelements + nelements - 1)
2808 * vect_get_cost (vector_stmt));
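/* E.g., for nelements = 4 (an illustration): the shift-based scheme
   above costs exact_log2 (4) * 2 = 4 vector stmts plus one
   vec_to_scalar extract, while the extract-based fallback costs
   4 + 3 = 7 vector stmts.  */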
2812 STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info) = outer_cost;
2814 if (vect_print_dump_info (REPORT_COST))
2815 fprintf (vect_dump, "vect_model_reduction_cost: inside_cost = %d, "
2816 "outside_cost = %d .", STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info),
2817 STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info));
2819 return true;
2823 /* Function vect_model_induction_cost.
2825 Models cost for induction operations. */
2827 static void
2828 vect_model_induction_cost (stmt_vec_info stmt_info, int ncopies)
2830 /* loop cost for vec_loop. */
2831 STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info)
2832 = ncopies * vect_get_cost (vector_stmt);
2833 /* prologue cost for vec_init and vec_step. */
2834 STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info)
2835 = 2 * vect_get_cost (scalar_to_vec);
2837 if (vect_print_dump_info (REPORT_COST))
2838 fprintf (vect_dump, "vect_model_induction_cost: inside_cost = %d, "
2839 "outside_cost = %d .", STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info),
2840 STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info));
2844 /* Function get_initial_def_for_induction
2846 Input:
2847 IV_PHI - the loop-header phi stmt of the induction variable, from
2848 which the initial value and step of the induction are derived.
2850 Output:
2851 Return a vector variable, initialized with the first VF values of
2852 the induction variable. E.g., for an iv with IV_PHI='X' and
2853 evolution S, for a vector of 4 units, we want to return:
2854 [X, X + S, X + 2*S, X + 3*S]. */
2856 static tree
2857 get_initial_def_for_induction (gimple iv_phi)
2859 stmt_vec_info stmt_vinfo = vinfo_for_stmt (iv_phi);
2860 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
2861 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2862 tree scalar_type;
2863 tree vectype;
2864 int nunits;
2865 edge pe = loop_preheader_edge (loop);
2866 struct loop *iv_loop;
2867 basic_block new_bb;
2868 tree vec, vec_init, vec_step, t;
2869 tree access_fn;
2870 tree new_var;
2871 tree new_name;
2872 gimple init_stmt, induction_phi, new_stmt;
2873 tree induc_def, vec_def, vec_dest;
2874 tree init_expr, step_expr;
2875 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2876 int i;
2877 bool ok;
2878 int ncopies;
2879 tree expr;
2880 stmt_vec_info phi_info = vinfo_for_stmt (iv_phi);
2881 bool nested_in_vect_loop = false;
2882 gimple_seq stmts = NULL;
2883 imm_use_iterator imm_iter;
2884 use_operand_p use_p;
2885 gimple exit_phi;
2886 edge latch_e;
2887 tree loop_arg;
2888 gimple_stmt_iterator si;
2889 basic_block bb = gimple_bb (iv_phi);
2890 tree stepvectype;
2891 tree resvectype;
2893 /* Is phi in an inner-loop, while vectorizing an enclosing outer-loop? */
2894 if (nested_in_vect_loop_p (loop, iv_phi))
2896 nested_in_vect_loop = true;
2897 iv_loop = loop->inner;
2899 else
2900 iv_loop = loop;
2901 gcc_assert (iv_loop == (gimple_bb (iv_phi))->loop_father);
2903 latch_e = loop_latch_edge (iv_loop);
2904 loop_arg = PHI_ARG_DEF_FROM_EDGE (iv_phi, latch_e);
2906 access_fn = analyze_scalar_evolution (iv_loop, PHI_RESULT (iv_phi));
2907 gcc_assert (access_fn);
2908 STRIP_NOPS (access_fn);
2909 ok = vect_is_simple_iv_evolution (iv_loop->num, access_fn,
2910 &init_expr, &step_expr);
2911 gcc_assert (ok);
2912 pe = loop_preheader_edge (iv_loop);
2914 scalar_type = TREE_TYPE (init_expr);
2915 vectype = get_vectype_for_scalar_type (scalar_type);
2916 resvectype = get_vectype_for_scalar_type (TREE_TYPE (PHI_RESULT (iv_phi)));
2917 gcc_assert (vectype);
2918 nunits = TYPE_VECTOR_SUBPARTS (vectype);
2919 ncopies = vf / nunits;
2921 gcc_assert (phi_info);
2922 gcc_assert (ncopies >= 1);
2924 /* Find the first insertion point in the BB. */
2925 si = gsi_after_labels (bb);
2927 /* Create the vector that holds the initial_value of the induction. */
2928 if (nested_in_vect_loop)
2930 /* iv_loop is nested in the loop to be vectorized. init_expr had already
2931 been created during vectorization of previous stmts. We obtain it
2932 from the STMT_VINFO_VEC_STMT of the defining stmt. */
2933 tree iv_def = PHI_ARG_DEF_FROM_EDGE (iv_phi,
2934 loop_preheader_edge (iv_loop));
2935 vec_init = vect_get_vec_def_for_operand (iv_def, iv_phi, NULL);
2937 else
2939 /* iv_loop is the loop to be vectorized. Create:
2940 vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr) */
2941 new_var = vect_get_new_vect_var (scalar_type, vect_scalar_var, "var_");
2942 add_referenced_var (new_var);
2944 new_name = force_gimple_operand (init_expr, &stmts, false, new_var);
2945 if (stmts)
2947 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
2948 gcc_assert (!new_bb);
2951 t = NULL_TREE;
2952 t = tree_cons (NULL_TREE, new_name, t);
2953 for (i = 1; i < nunits; i++)
2955 /* Create: new_name_i = new_name + step_expr */
2956 enum tree_code code = POINTER_TYPE_P (scalar_type)
2957 ? POINTER_PLUS_EXPR : PLUS_EXPR;
2958 init_stmt = gimple_build_assign_with_ops (code, new_var,
2959 new_name, step_expr);
2960 new_name = make_ssa_name (new_var, init_stmt);
2961 gimple_assign_set_lhs (init_stmt, new_name);
2963 new_bb = gsi_insert_on_edge_immediate (pe, init_stmt);
2964 gcc_assert (!new_bb);
2966 if (vect_print_dump_info (REPORT_DETAILS))
2968 fprintf (vect_dump, "created new init_stmt: ");
2969 print_gimple_stmt (vect_dump, init_stmt, 0, TDF_SLIM);
2971 t = tree_cons (NULL_TREE, new_name, t);
2973 /* Create a vector from [new_name_0, new_name_1, ..., new_name_nunits-1] */
2974 vec = build_constructor_from_list (vectype, nreverse (t));
2975 vec_init = vect_init_vector (iv_phi, vec, vectype, NULL);
2979 /* Create the vector that holds the step of the induction. */
2980 if (nested_in_vect_loop)
2981 /* iv_loop is nested in the loop to be vectorized. Generate:
2982 vec_step = [S, S, S, S] */
2983 new_name = step_expr;
2984 else
2986 /* iv_loop is the loop to be vectorized. Generate:
2987 vec_step = [VF*S, VF*S, VF*S, VF*S] */
2988 expr = build_int_cst (TREE_TYPE (step_expr), vf);
2989 new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
2990 expr, step_expr);
2993 t = unshare_expr (new_name);
2994 gcc_assert (CONSTANT_CLASS_P (new_name));
2995 stepvectype = get_vectype_for_scalar_type (TREE_TYPE (new_name));
2996 gcc_assert (stepvectype);
2997 vec = build_vector_from_val (stepvectype, t);
2998 vec_step = vect_init_vector (iv_phi, vec, stepvectype, NULL);
3001 /* Create the following def-use cycle:
3002 loop prolog:
3003 vec_init = ...
3004 vec_step = ...
3005 loop:
3006 vec_iv = PHI <vec_init, vec_loop>
3008 STMT
3010 vec_loop = vec_iv + vec_step; */
3012 /* Create the induction-phi that defines the induction-operand. */
3013 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
3014 add_referenced_var (vec_dest);
3015 induction_phi = create_phi_node (vec_dest, iv_loop->header);
3016 set_vinfo_for_stmt (induction_phi,
3017 new_stmt_vec_info (induction_phi, loop_vinfo, NULL));
3018 induc_def = PHI_RESULT (induction_phi);
3020 /* Create the iv update inside the loop */
3021 new_stmt = gimple_build_assign_with_ops (PLUS_EXPR, vec_dest,
3022 induc_def, vec_step);
3023 vec_def = make_ssa_name (vec_dest, new_stmt);
3024 gimple_assign_set_lhs (new_stmt, vec_def);
3025 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
3026 set_vinfo_for_stmt (new_stmt, new_stmt_vec_info (new_stmt, loop_vinfo,
3027 NULL));
3029 /* Set the arguments of the phi node: */
3030 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
3031 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
3032 UNKNOWN_LOCATION);
3035 /* If the vectorization factor (VF) is bigger than the number
3036 of elements that we can fit in a vectype (nunits), we have to generate
3037 more than one vector stmt, i.e., we need to "unroll" the
3038 vector stmt by a factor VF/nunits. For more details see the
3039 documentation in vectorizable_operation. */
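/* E.g. (a sketch), with VF = 8 and a 4-element vectype we have
   ncopies = 2, and the loop below creates a second vector IV
   vec_iv_2 = vec_iv_1 + [4*S, 4*S, 4*S, 4*S], so that the two vectors
   together cover eight consecutive values of the induction.  */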
3041 if (ncopies > 1)
3043 stmt_vec_info prev_stmt_vinfo;
3044 /* FORNOW. This restriction should be relaxed. */
3045 gcc_assert (!nested_in_vect_loop);
3047 /* Create the vector that holds the step of the induction. */
3048 expr = build_int_cst (TREE_TYPE (step_expr), nunits);
3049 new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
3050 expr, step_expr);
3051 t = unshare_expr (new_name);
3052 gcc_assert (CONSTANT_CLASS_P (new_name));
3053 vec = build_vector_from_val (stepvectype, t);
3054 vec_step = vect_init_vector (iv_phi, vec, stepvectype, NULL);
3056 vec_def = induc_def;
3057 prev_stmt_vinfo = vinfo_for_stmt (induction_phi);
3058 for (i = 1; i < ncopies; i++)
3060 /* vec_i = vec_prev + vec_step */
3061 new_stmt = gimple_build_assign_with_ops (PLUS_EXPR, vec_dest,
3062 vec_def, vec_step);
3063 vec_def = make_ssa_name (vec_dest, new_stmt);
3064 gimple_assign_set_lhs (new_stmt, vec_def);
3066 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
3067 if (!useless_type_conversion_p (resvectype, vectype))
3069 new_stmt = gimple_build_assign_with_ops
3070 (VIEW_CONVERT_EXPR,
3071 vect_get_new_vect_var (resvectype, vect_simple_var,
3072 "vec_iv_"),
3073 build1 (VIEW_CONVERT_EXPR, resvectype,
3074 gimple_assign_lhs (new_stmt)), NULL_TREE);
3075 gimple_assign_set_lhs (new_stmt,
3076 make_ssa_name
3077 (gimple_assign_lhs (new_stmt), new_stmt));
3078 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
3080 set_vinfo_for_stmt (new_stmt,
3081 new_stmt_vec_info (new_stmt, loop_vinfo, NULL));
3082 STMT_VINFO_RELATED_STMT (prev_stmt_vinfo) = new_stmt;
3083 prev_stmt_vinfo = vinfo_for_stmt (new_stmt);
3087 if (nested_in_vect_loop)
3089 /* Find the loop-closed exit-phi of the induction, and record
3090 the final vector of induction results: */
3091 exit_phi = NULL;
3092 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
3094 if (!flow_bb_inside_loop_p (iv_loop, gimple_bb (USE_STMT (use_p))))
3096 exit_phi = USE_STMT (use_p);
3097 break;
3100 if (exit_phi)
3102 stmt_vec_info stmt_vinfo = vinfo_for_stmt (exit_phi);
3103 /* FORNOW. Currently not supporting the case where an inner-loop induction
3104 is not used in the outer-loop (i.e., is used only outside the outer-loop). */
3105 gcc_assert (STMT_VINFO_RELEVANT_P (stmt_vinfo)
3106 && !STMT_VINFO_LIVE_P (stmt_vinfo));
3108 STMT_VINFO_VEC_STMT (stmt_vinfo) = new_stmt;
3109 if (vect_print_dump_info (REPORT_DETAILS))
3111 fprintf (vect_dump, "vector of inductions after inner-loop:");
3112 print_gimple_stmt (vect_dump, new_stmt, 0, TDF_SLIM);
3118 if (vect_print_dump_info (REPORT_DETAILS))
3120 fprintf (vect_dump, "transform induction: created def-use cycle: ");
3121 print_gimple_stmt (vect_dump, induction_phi, 0, TDF_SLIM);
3122 fprintf (vect_dump, "\n");
3123 print_gimple_stmt (vect_dump, SSA_NAME_DEF_STMT (vec_def), 0, TDF_SLIM);
3126 STMT_VINFO_VEC_STMT (phi_info) = induction_phi;
3127 if (!useless_type_conversion_p (resvectype, vectype))
3129 new_stmt = gimple_build_assign_with_ops
3130 (VIEW_CONVERT_EXPR,
3131 vect_get_new_vect_var (resvectype, vect_simple_var, "vec_iv_"),
3132 build1 (VIEW_CONVERT_EXPR, resvectype, induc_def), NULL_TREE);
3133 induc_def = make_ssa_name (gimple_assign_lhs (new_stmt), new_stmt);
3134 gimple_assign_set_lhs (new_stmt, induc_def);
3135 si = gsi_start_bb (bb);
3136 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
3137 set_vinfo_for_stmt (new_stmt,
3138 new_stmt_vec_info (new_stmt, loop_vinfo, NULL));
3139 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (new_stmt))
3140 = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (induction_phi));
3143 return induc_def;
3147 /* Function get_initial_def_for_reduction
3149 Input:
3150 STMT - a stmt that performs a reduction operation in the loop.
3151 INIT_VAL - the initial value of the reduction variable
3153 Output:
3154 ADJUSTMENT_DEF - a tree that holds a value to be added to the final result
3155 of the reduction (used for adjusting the epilog - see below).
3156 Return a vector variable, initialized according to the operation that STMT
3157 performs. This vector will be used as the initial value of the
3158 vector of partial results.
3160 Option1 (adjust in epilog): Initialize the vector as follows:
3161 add/bit or/xor: [0,0,...,0,0]
3162 mult/bit and: [1,1,...,1,1]
3163 min/max/cond_expr: [init_val,init_val,..,init_val,init_val]
3164 and when necessary (e.g. add/mult case) let the caller know
3165 that it needs to adjust the result by init_val.
3167 Option2: Initialize the vector as follows:
3168 add/bit or/xor: [init_val,0,0,...,0]
3169 mult/bit and: [init_val,1,1,...,1]
3170 min/max/cond_expr: [init_val,init_val,...,init_val]
3171 and no adjustments are needed.
3173 For example, for the following code:
3175 s = init_val;
3176 for (i=0;i<n;i++)
3177 s = s + a[i];
3179 STMT is 's = s + a[i]', and the reduction variable is 's'.
3180 For a vector of 4 units, we want to return either [0,0,0,init_val],
3181 or [0,0,0,0] and let the caller know that it needs to adjust
3182 the result at the end by 'init_val'.
3184 FORNOW, we use Option1 (adjust in epilog) when ADJUSTMENT_DEF is not
3185 NULL, because this way the initialization vector is simpler (the same
3186 element in all entries), and Option2 otherwise.
3188 A cost model should help decide between these two schemes. */
3190 tree
3191 get_initial_def_for_reduction (gimple stmt, tree init_val,
3192 tree *adjustment_def)
3194 stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
3195 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
3196 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3197 tree scalar_type = TREE_TYPE (init_val);
3198 tree vectype = get_vectype_for_scalar_type (scalar_type);
3199 int nunits;
3200 enum tree_code code = gimple_assign_rhs_code (stmt);
3201 tree def_for_init;
3202 tree init_def;
3203 tree t = NULL_TREE;
3204 int i;
3205 bool nested_in_vect_loop = false;
3206 tree init_value;
3207 REAL_VALUE_TYPE real_init_val = dconst0;
3208 int int_init_val = 0;
3209 gimple def_stmt = NULL;
3211 gcc_assert (vectype);
3212 nunits = TYPE_VECTOR_SUBPARTS (vectype);
3214 gcc_assert (POINTER_TYPE_P (scalar_type) || INTEGRAL_TYPE_P (scalar_type)
3215 || SCALAR_FLOAT_TYPE_P (scalar_type));
3217 if (nested_in_vect_loop_p (loop, stmt))
3218 nested_in_vect_loop = true;
3219 else
3220 gcc_assert (loop == (gimple_bb (stmt))->loop_father);
3222 /* In case of double reduction we only create a vector variable to be put
3223 in the reduction phi node. The actual statement creation is done in
3224 vect_create_epilog_for_reduction. */
3225 if (adjustment_def && nested_in_vect_loop
3226 && TREE_CODE (init_val) == SSA_NAME
3227 && (def_stmt = SSA_NAME_DEF_STMT (init_val))
3228 && gimple_code (def_stmt) == GIMPLE_PHI
3229 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
3230 && vinfo_for_stmt (def_stmt)
3231 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
3232 == vect_double_reduction_def)
3234 *adjustment_def = NULL;
3235 return vect_create_destination_var (init_val, vectype);
3238 if (TREE_CONSTANT (init_val))
3240 if (SCALAR_FLOAT_TYPE_P (scalar_type))
3241 init_value = build_real (scalar_type, TREE_REAL_CST (init_val));
3242 else
3243 init_value = build_int_cst (scalar_type, TREE_INT_CST_LOW (init_val));
3245 else
3246 init_value = init_val;
3248 switch (code)
3250 case WIDEN_SUM_EXPR:
3251 case DOT_PROD_EXPR:
3252 case PLUS_EXPR:
3253 case MINUS_EXPR:
3254 case BIT_IOR_EXPR:
3255 case BIT_XOR_EXPR:
3256 case MULT_EXPR:
3257 case BIT_AND_EXPR:
3258 /* ADJUSTMENT_DEF is NULL when called from
3259 vect_create_epilog_for_reduction to vectorize double reduction. */
3260 if (adjustment_def)
3262 if (nested_in_vect_loop)
3263 *adjustment_def = vect_get_vec_def_for_operand (init_val, stmt,
3264 NULL);
3265 else
3266 *adjustment_def = init_val;
3269 if (code == MULT_EXPR)
3271 real_init_val = dconst1;
3272 int_init_val = 1;
3275 if (code == BIT_AND_EXPR)
3276 int_init_val = -1;
3278 if (SCALAR_FLOAT_TYPE_P (scalar_type))
3279 def_for_init = build_real (scalar_type, real_init_val);
3280 else
3281 def_for_init = build_int_cst (scalar_type, int_init_val);
3283 /* Create all but the first element of the vector, each '0' or '1'. */
3284 for (i = nunits - 2; i >= 0; --i)
3285 t = tree_cons (NULL_TREE, def_for_init, t);
3287 /* Option1: the first element is '0' or '1' as well. */
3288 if (adjustment_def)
3290 t = tree_cons (NULL_TREE, def_for_init, t);
3291 init_def = build_vector (vectype, t);
3292 break;
3295 /* Option2: the first element is INIT_VAL. */
3296 t = tree_cons (NULL_TREE, init_value, t);
3297 if (TREE_CONSTANT (init_val))
3298 init_def = build_vector (vectype, t);
3299 else
3300 init_def = build_constructor_from_list (vectype, t);
3302 break;
3304 case MIN_EXPR:
3305 case MAX_EXPR:
3306 case COND_EXPR:
3307 if (adjustment_def)
3309 *adjustment_def = NULL_TREE;
3310 init_def = vect_get_vec_def_for_operand (init_val, stmt, NULL);
3311 break;
3314 init_def = build_vector_from_val (vectype, init_value);
3315 break;
3317 default:
3318 gcc_unreachable ();
3321 return init_def;
3325 /* Function vect_create_epilog_for_reduction
3327 Create code at the loop-epilog to finalize the result of a reduction
3328 computation.
3330 VECT_DEFS is list of vector of partial results, i.e., the lhs's of vector
3331 reduction statements.
3332 STMT is the scalar reduction stmt that is being vectorized.
3333 NCOPIES is > 1 in case the vectorization factor (VF) is bigger than the
3334 number of elements that we can fit in a vectype (nunits). In this case
3335 we have to generate more than one vector stmt - i.e - we need to "unroll"
3336 the vector stmt by a factor VF/nunits. For more details see documentation
3337 in vectorizable_operation.
3338 REDUC_CODE is the tree-code for the epilog reduction.
3339 REDUCTION_PHIS is a list of the phi-nodes that carry the reduction
3340 computation.
3341 REDUC_INDEX is the index of the operand in the right hand side of the
3342 statement that is defined by REDUCTION_PHI.
3343 DOUBLE_REDUC is TRUE if double reduction phi nodes should be handled.
3344 SLP_NODE is an SLP node containing a group of reduction statements. The
3345 first one in this group is STMT.
3347 This function:
3348 1. Creates the reduction def-use cycles: sets the arguments for
3349 REDUCTION_PHIS:
3350 The loop-entry argument is the vectorized initial-value of the reduction.
3351 The loop-latch argument is taken from VECT_DEFS - the vector of partial
3352 sums.
3353 2. "Reduces" each vector of partial results VECT_DEFS into a single result,
3354 by applying the operation specified by REDUC_CODE if available, or by
3355 other means (whole-vector shifts or a scalar loop).
3356 The function also creates a new phi node at the loop exit to preserve
3357 loop-closed form, as illustrated below.
3359 The flow at the entry to this function:
3361 loop:
3362 vec_def = phi <null, null> # REDUCTION_PHI
3363 VECT_DEF = vector_stmt # vectorized form of STMT
3364 s_loop = scalar_stmt # (scalar) STMT
3365 loop_exit:
3366 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
3367 use <s_out0>
3368 use <s_out0>
3370 The above is transformed by this function into:
3372 loop:
3373 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
3374 VECT_DEF = vector_stmt # vectorized form of STMT
3375 s_loop = scalar_stmt # (scalar) STMT
3376 loop_exit:
3377 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
3378 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
3379 v_out2 = reduce <v_out1>
3380 s_out3 = extract_field <v_out2, 0>
3381 s_out4 = adjust_result <s_out3>
3382 use <s_out4>
3383 use <s_out4>
3386 static void
3387 vect_create_epilog_for_reduction (VEC (tree, heap) *vect_defs, gimple stmt,
3388 int ncopies, enum tree_code reduc_code,
3389 VEC (gimple, heap) *reduction_phis,
3390 int reduc_index, bool double_reduc,
3391 slp_tree slp_node)
3393 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3394 stmt_vec_info prev_phi_info;
3395 tree vectype;
3396 enum machine_mode mode;
3397 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3398 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *outer_loop = NULL;
3399 basic_block exit_bb;
3400 tree scalar_dest;
3401 tree scalar_type;
3402 gimple new_phi = NULL, phi;
3403 gimple_stmt_iterator exit_gsi;
3404 tree vec_dest;
3405 tree new_temp = NULL_TREE, new_dest, new_name, new_scalar_dest;
3406 gimple epilog_stmt = NULL;
3407 enum tree_code code = gimple_assign_rhs_code (stmt);
3408 gimple exit_phi;
3409 tree bitsize, bitpos;
3410 tree adjustment_def = NULL;
3411 tree vec_initial_def = NULL;
3412 tree reduction_op, expr, def;
3413 tree orig_name, scalar_result;
3414 imm_use_iterator imm_iter, phi_imm_iter;
3415 use_operand_p use_p, phi_use_p;
3416 bool extract_scalar_result = false;
3417 gimple use_stmt, orig_stmt, reduction_phi = NULL;
3418 bool nested_in_vect_loop = false;
3419 VEC (gimple, heap) *new_phis = NULL;
3420 enum vect_def_type dt = vect_unknown_def_type;
3421 int j, i;
3422 VEC (tree, heap) *scalar_results = NULL;
3423 unsigned int group_size = 1, k, ratio;
3424 VEC (tree, heap) *vec_initial_defs = NULL;
3425 VEC (gimple, heap) *phis;
3426 bool slp_reduc = false;
3427 tree new_phi_result;
3429 if (slp_node)
3430 group_size = VEC_length (gimple, SLP_TREE_SCALAR_STMTS (slp_node));
3432 if (nested_in_vect_loop_p (loop, stmt))
3434 outer_loop = loop;
3435 loop = loop->inner;
3436 nested_in_vect_loop = true;
3437 gcc_assert (!slp_node);
3440 switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
3442 case GIMPLE_SINGLE_RHS:
3443 gcc_assert (TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt))
3444 == ternary_op);
3445 reduction_op = TREE_OPERAND (gimple_assign_rhs1 (stmt), reduc_index);
3446 break;
3447 case GIMPLE_UNARY_RHS:
3448 reduction_op = gimple_assign_rhs1 (stmt);
3449 break;
3450 case GIMPLE_BINARY_RHS:
3451 reduction_op = reduc_index ?
3452 gimple_assign_rhs2 (stmt) : gimple_assign_rhs1 (stmt);
3453 break;
3454 case GIMPLE_TERNARY_RHS:
3455 reduction_op = gimple_op (stmt, reduc_index + 1);
3456 break;
3457 default:
3458 gcc_unreachable ();
3461 vectype = get_vectype_for_scalar_type (TREE_TYPE (reduction_op));
3462 gcc_assert (vectype);
3463 mode = TYPE_MODE (vectype);
3465 /* 1. Create the reduction def-use cycle:
3466 Set the arguments of REDUCTION_PHIS, i.e., transform
3468 loop:
3469 vec_def = phi <null, null> # REDUCTION_PHI
3470 VECT_DEF = vector_stmt # vectorized form of STMT
3473 into:
3475 loop:
3476 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
3477 VECT_DEF = vector_stmt # vectorized form of STMT
3480 (in case of SLP, do it for all the phis). */
3482 /* Get the loop-entry arguments. */
3483 if (slp_node)
3484 vect_get_slp_defs (reduction_op, NULL_TREE, slp_node, &vec_initial_defs,
3485 NULL, reduc_index);
3486 else
3488 vec_initial_defs = VEC_alloc (tree, heap, 1);
3489 /* For the case of reduction, vect_get_vec_def_for_operand returns
3490 the scalar def before the loop that defines the initial value
3491 of the reduction variable. */
3492 vec_initial_def = vect_get_vec_def_for_operand (reduction_op, stmt,
3493 &adjustment_def);
3494 VEC_quick_push (tree, vec_initial_defs, vec_initial_def);
3497 /* Set phi nodes arguments. */
3498 FOR_EACH_VEC_ELT (gimple, reduction_phis, i, phi)
3500 tree vec_init_def = VEC_index (tree, vec_initial_defs, i);
3501 tree def = VEC_index (tree, vect_defs, i);
3502 for (j = 0; j < ncopies; j++)
3504 /* Set the loop-entry arg of the reduction-phi. */
3505 add_phi_arg (phi, vec_init_def, loop_preheader_edge (loop),
3506 UNKNOWN_LOCATION);
3508 /* Set the loop-latch arg for the reduction-phi. */
3509 if (j > 0)
3510 def = vect_get_vec_def_for_stmt_copy (vect_unknown_def_type, def);
3512 add_phi_arg (phi, def, loop_latch_edge (loop), UNKNOWN_LOCATION);
3514 if (vect_print_dump_info (REPORT_DETAILS))
3516 fprintf (vect_dump, "transform reduction: created def-use"
3517 " cycle: ");
3518 print_gimple_stmt (vect_dump, phi, 0, TDF_SLIM);
3519 fprintf (vect_dump, "\n");
3520 print_gimple_stmt (vect_dump, SSA_NAME_DEF_STMT (def), 0,
3521 TDF_SLIM);
3524 phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi));
3528 VEC_free (tree, heap, vec_initial_defs);
3530 /* 2. Create epilog code.
3531 The reduction epilog code operates across the elements of the vector
3532 of partial results computed by the vectorized loop.
3533 The reduction epilog code consists of:
3535 step 1: compute the scalar result in a vector (v_out2)
3536 step 2: extract the scalar result (s_out3) from the vector (v_out2)
3537 step 3: adjust the scalar result (s_out3) if needed.
3539 Step 1 can be accomplished using one of the following three schemes:
3540 (scheme 1) using reduc_code, if available.
3541 (scheme 2) using whole-vector shifts, if available.
3542 (scheme 3) using a scalar loop. In this case steps 1+2 above are
3543 combined.
3545 The overall epilog code looks like this:
3547 s_out0 = phi <s_loop> # original EXIT_PHI
3548 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
3549 v_out2 = reduce <v_out1> # step 1
3550 s_out3 = extract_field <v_out2, 0> # step 2
3551 s_out4 = adjust_result <s_out3> # step 3
3553 (step 3 is optional, and steps 1 and 2 may be combined).
3554 Lastly, the uses of s_out0 are replaced by s_out4. */
3557 /* 2.1 Create new loop-exit-phis to preserve loop-closed form:
3558 v_out1 = phi <VECT_DEF>
3559 Store them in NEW_PHIS. */
3561 exit_bb = single_exit (loop)->dest;
3562 prev_phi_info = NULL;
3563 new_phis = VEC_alloc (gimple, heap, VEC_length (tree, vect_defs));
3564 FOR_EACH_VEC_ELT (tree, vect_defs, i, def)
3566 for (j = 0; j < ncopies; j++)
3568 phi = create_phi_node (SSA_NAME_VAR (def), exit_bb);
3569 set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, loop_vinfo, NULL));
3570 if (j == 0)
3571 VEC_quick_push (gimple, new_phis, phi);
3572 else
3574 def = vect_get_vec_def_for_stmt_copy (dt, def);
3575 STMT_VINFO_RELATED_STMT (prev_phi_info) = phi;
3578 SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, def);
3579 prev_phi_info = vinfo_for_stmt (phi);
3583 /* The epilogue is created for the outer-loop, i.e., for the loop being
3584 vectorized. */
3585 if (double_reduc)
3587 loop = outer_loop;
3588 exit_bb = single_exit (loop)->dest;
3591 exit_gsi = gsi_after_labels (exit_bb);
3593 /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
3594 (i.e. when reduc_code is not available) and in the final adjustment
3595 code (if needed). Also get the original scalar reduction variable as
3596 defined in the loop. In case STMT is a "pattern-stmt" (i.e. - it
3597 represents a reduction pattern), the tree-code and scalar-def are
3598 taken from the original stmt that the pattern-stmt (STMT) replaces.
3599 Otherwise (it is a regular reduction) - the tree-code and scalar-def
3600 are taken from STMT. */
3602 orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
3603 if (!orig_stmt)
3605 /* Regular reduction */
3606 orig_stmt = stmt;
3608 else
3610 /* Reduction pattern */
3611 stmt_vec_info stmt_vinfo = vinfo_for_stmt (orig_stmt);
3612 gcc_assert (STMT_VINFO_IN_PATTERN_P (stmt_vinfo));
3613 gcc_assert (STMT_VINFO_RELATED_STMT (stmt_vinfo) == stmt);
3616 code = gimple_assign_rhs_code (orig_stmt);
3617 /* For MINUS_EXPR the initial vector is [init_val,0,...,0], therefore,
3618 partial results are added and not subtracted. */
3619 if (code == MINUS_EXPR)
3620 code = PLUS_EXPR;
3622 scalar_dest = gimple_assign_lhs (orig_stmt);
3623 scalar_type = TREE_TYPE (scalar_dest);
3624 scalar_results = VEC_alloc (tree, heap, group_size);
3625 new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
3626 bitsize = TYPE_SIZE (scalar_type);
3628 /* In case this is a reduction in an inner-loop while vectorizing an outer
3629 loop - we don't need to extract a single scalar result at the end of the
3630 inner-loop (unless it is a double reduction, i.e., the use of the
3631 reduction is outside the outer-loop). The final vector of partial results
3632 in the vectorized outer-loop, or reduced to a scalar result at the end of
3633 the outer-loop. */
3634 if (nested_in_vect_loop && !double_reduc)
3635 goto vect_finalize_reduction;
3637 /* SLP reduction without reduction chain, e.g.,
3638 # a1 = phi <a2, a0>
3639 # b1 = phi <b2, b0>
3640 a2 = operation (a1)
3641 b2 = operation (b1) */
3642 slp_reduc = (slp_node && !GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)));
3644 /* In case of reduction chain, e.g.,
3645 # a1 = phi <a3, a0>
3646 a2 = operation (a1)
3647 a3 = operation (a2),
3649 we may end up with more than one vector result. Here we reduce them to
3650 one vector. */
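/* Editor's illustration, not part of the vectorizer: source loops that
   give rise to the two shapes above.  A compile-guarded sketch with
   hypothetical arrays and bounds.  */
#if 0
int a[1024], b[1024], x[2048];
int suma = 0, sumb = 0, sum = 0, i, n = 1024;

for (i = 0; i < n; i++)
  {
    suma += a[i];                   /* SLP reduction: two independent      */
    sumb += b[i];                   /* accumulators, one phi each.         */
  }

for (i = 0; i < n; i++)
  sum = sum + x[2 * i] + x[2 * i + 1];  /* reduction chain: a1 -> a2 -> a3 */
#endif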
3651 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
3653 tree first_vect = PHI_RESULT (VEC_index (gimple, new_phis, 0));
3654 tree tmp;
3656 vec_dest = vect_create_destination_var (scalar_dest, vectype);
3657 for (k = 1; k < VEC_length (gimple, new_phis); k++)
3659 gimple next_phi = VEC_index (gimple, new_phis, k);
3660 tree second_vect = PHI_RESULT (next_phi);
3661 gimple new_vec_stmt;
3663 tmp = build2 (code, vectype, first_vect, second_vect);
3664 new_vec_stmt = gimple_build_assign (vec_dest, tmp);
3665 first_vect = make_ssa_name (vec_dest, new_vec_stmt);
3666 gimple_assign_set_lhs (new_vec_stmt, first_vect);
3667 gsi_insert_before (&exit_gsi, new_vec_stmt, GSI_SAME_STMT);
3670 new_phi_result = first_vect;
3672 else
3673 new_phi_result = PHI_RESULT (VEC_index (gimple, new_phis, 0));
3675 /* 2.3 Create the reduction code, using one of the three schemes described
3676 above. In SLP we simply need to extract all the elements from the
3677 vector (without reducing them), so we use scalar shifts. */
3678 if (reduc_code != ERROR_MARK && !slp_reduc)
3680 tree tmp;
3682 /*** Case 1: Create:
3683 v_out2 = reduc_expr <v_out1> */
3685 if (vect_print_dump_info (REPORT_DETAILS))
3686 fprintf (vect_dump, "Reduce using direct vector reduction.");
3688 vec_dest = vect_create_destination_var (scalar_dest, vectype);
3689 tmp = build1 (reduc_code, vectype, new_phi_result);
3690 epilog_stmt = gimple_build_assign (vec_dest, tmp);
3691 new_temp = make_ssa_name (vec_dest, epilog_stmt);
3692 gimple_assign_set_lhs (epilog_stmt, new_temp);
3693 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
3695 extract_scalar_result = true;
3697 else
3699 enum tree_code shift_code = ERROR_MARK;
3700 bool have_whole_vector_shift = true;
3701 int bit_offset;
3702 int element_bitsize = tree_low_cst (bitsize, 1);
3703 int vec_size_in_bits = tree_low_cst (TYPE_SIZE (vectype), 1);
3704 tree vec_temp;
3706 if (optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
3707 shift_code = VEC_RSHIFT_EXPR;
3708 else
3709 have_whole_vector_shift = false;
3711 /* Regardless of whether we have a whole vector shift, if we're
3712 emulating the operation via tree-vect-generic, we don't want
3713 to use it. Only the first round of the reduction is likely
3714 to still be profitable via emulation. */
3715 /* ??? It might be better to emit a reduction tree code here, so that
3716 tree-vect-generic can expand the first round via bit tricks. */
3717 if (!VECTOR_MODE_P (mode))
3718 have_whole_vector_shift = false;
3719 else
3721 optab optab = optab_for_tree_code (code, vectype, optab_default);
3722 if (optab_handler (optab, mode) == CODE_FOR_nothing)
3723 have_whole_vector_shift = false;
3726 if (have_whole_vector_shift && !slp_reduc)
3728 /*** Case 2: Create:
3729 for (offset = VS/2; offset >= element_size; offset/=2)
3730 {
3731 Create: va' = vec_shift <va, offset>
3732 Create: va = vop <va, va'>
3733 } */
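/* Editor's illustration, not part of the vectorizer: the halving loop of
   Case 2 modelled on an array of VF lanes (VF a power of two), with
   addition as the reduction op.  A compile-guarded sketch; the names are
   hypothetical.  */
#if 0
static int
shift_reduce_sketch (int *lanes, int vf)
{
  int off, i;

  for (off = vf / 2; off >= 1; off /= 2)   /* offsets in elements, not bits */
    for (i = 0; i < off; i++)
      lanes[i] += lanes[i + off];          /* va = vop <va, vec_shift <va>> */
  return lanes[0];                         /* result accumulates in lane 0  */
}
#endif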
3735 if (vect_print_dump_info (REPORT_DETAILS))
3736 fprintf (vect_dump, "Reduce using vector shifts");
3738 vec_dest = vect_create_destination_var (scalar_dest, vectype);
3739 new_temp = new_phi_result;
3740 for (bit_offset = vec_size_in_bits/2;
3741 bit_offset >= element_bitsize;
3742 bit_offset /= 2)
3744 tree bitpos = size_int (bit_offset);
3746 epilog_stmt = gimple_build_assign_with_ops (shift_code,
3747 vec_dest, new_temp, bitpos);
3748 new_name = make_ssa_name (vec_dest, epilog_stmt);
3749 gimple_assign_set_lhs (epilog_stmt, new_name);
3750 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
3752 epilog_stmt = gimple_build_assign_with_ops (code, vec_dest,
3753 new_name, new_temp);
3754 new_temp = make_ssa_name (vec_dest, epilog_stmt);
3755 gimple_assign_set_lhs (epilog_stmt, new_temp);
3756 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
3759 extract_scalar_result = true;
3761 else
3763 tree rhs;
3765 /*** Case 3: Create:
3766 s = extract_field <v_out2, 0>
3767 for (offset = element_size;
3768 offset < vector_size;
3769 offset += element_size)
3770 {
3771 Create: s' = extract_field <v_out2, offset>
3772 Create: s = op <s, s'> // For non-SLP cases
3773 } */
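/* Editor's illustration, not part of the vectorizer: Case 3 in plain C -
   extract every element and combine with the scalar op (here "+").  A
   compile-guarded sketch with hypothetical names.  */
#if 0
static int
scalar_reduce_sketch (const int *v_out2, int vf)
{
  int s = v_out2[0], i;             /* s = extract_field <v_out2, 0>       */

  for (i = 1; i < vf; i++)
    s += v_out2[i];                 /* s = op <s, extract_field <.., i>>   */
  return s;
}
#endif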
3775 if (vect_print_dump_info (REPORT_DETAILS))
3776 fprintf (vect_dump, "Reduce using scalar code. ");
3778 vec_size_in_bits = tree_low_cst (TYPE_SIZE (vectype), 1);
3779 FOR_EACH_VEC_ELT (gimple, new_phis, i, new_phi)
3781 vec_temp = PHI_RESULT (new_phi);
3782 rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, bitsize,
3783 bitsize_zero_node);
3784 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
3785 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
3786 gimple_assign_set_lhs (epilog_stmt, new_temp);
3787 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
3789 /* In SLP we don't need to apply reduction operation, so we just
3790 collect s' values in SCALAR_RESULTS. */
3791 if (slp_reduc)
3792 VEC_safe_push (tree, heap, scalar_results, new_temp);
3794 for (bit_offset = element_bitsize;
3795 bit_offset < vec_size_in_bits;
3796 bit_offset += element_bitsize)
3798 tree bitpos = bitsize_int (bit_offset);
3799 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp,
3800 bitsize, bitpos);
3802 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
3803 new_name = make_ssa_name (new_scalar_dest, epilog_stmt);
3804 gimple_assign_set_lhs (epilog_stmt, new_name);
3805 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
3807 if (slp_reduc)
3809 /* In SLP we don't need to apply reduction operation, so
3810 we just collect s' values in SCALAR_RESULTS. */
3811 new_temp = new_name;
3812 VEC_safe_push (tree, heap, scalar_results, new_name);
3814 else
3816 epilog_stmt = gimple_build_assign_with_ops (code,
3817 new_scalar_dest, new_name, new_temp);
3818 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
3819 gimple_assign_set_lhs (epilog_stmt, new_temp);
3820 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
3825 /* The only case where we need to reduce scalar results in SLP is
3826 unrolling. If the size of SCALAR_RESULTS is greater than
3827 GROUP_SIZE, we reduce them by combining elements modulo
3828 GROUP_SIZE. */
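/* Editor's illustration, not part of the vectorizer: the modulo-GROUP_SIZE
   folding described above, e.g. 8 unrolled scalar results folded back into
   a group of 4.  A compile-guarded sketch; names are hypothetical.  */
#if 0
static void
fold_slp_results_sketch (int *scalar_results, int nresults, int group_size)
{
  int j;

  for (j = group_size; j < nresults; j++)
    scalar_results[j % group_size] += scalar_results[j];
}
#endif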
3829 if (slp_reduc)
3831 tree res, first_res, new_res;
3832 gimple new_stmt;
3834 /* Reduce multiple scalar results in case of SLP unrolling. */
3835 for (j = group_size; VEC_iterate (tree, scalar_results, j, res);
3836 j++)
3838 first_res = VEC_index (tree, scalar_results, j % group_size);
3839 new_stmt = gimple_build_assign_with_ops (code,
3840 new_scalar_dest, first_res, res);
3841 new_res = make_ssa_name (new_scalar_dest, new_stmt);
3842 gimple_assign_set_lhs (new_stmt, new_res);
3843 gsi_insert_before (&exit_gsi, new_stmt, GSI_SAME_STMT);
3844 VEC_replace (tree, scalar_results, j % group_size, new_res);
3847 else
3848 /* Not SLP - we have one scalar to keep in SCALAR_RESULTS. */
3849 VEC_safe_push (tree, heap, scalar_results, new_temp);
3851 extract_scalar_result = false;
3855 /* 2.4 Extract the final scalar result. Create:
3856 s_out3 = extract_field <v_out2, bitpos> */
3858 if (extract_scalar_result)
3860 tree rhs;
3862 if (vect_print_dump_info (REPORT_DETAILS))
3863 fprintf (vect_dump, "extract scalar result");
3865 if (BYTES_BIG_ENDIAN)
3866 bitpos = size_binop (MULT_EXPR,
3867 bitsize_int (TYPE_VECTOR_SUBPARTS (vectype) - 1),
3868 TYPE_SIZE (scalar_type));
3869 else
3870 bitpos = bitsize_zero_node;
3872 rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp, bitsize, bitpos);
3873 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
3874 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
3875 gimple_assign_set_lhs (epilog_stmt, new_temp);
3876 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
3877 VEC_safe_push (tree, heap, scalar_results, new_temp);
3880 vect_finalize_reduction:
3882 if (double_reduc)
3883 loop = loop->inner;
3885 /* 2.5 Adjust the final result by the initial value of the reduction
3886 variable. (When such an adjustment is not needed, then
3887 'adjustment_def' is zero). For example, if code is PLUS we create:
3888 new_temp = loop_exit_def + adjustment_def */
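/* Editor's illustration, not part of the vectorizer: why the adjustment is
   needed, assuming the accumulator lanes are seeded with the neutral
   value.  For "s = 5; for (...) s += a[i];" the initial value 5 becomes
   ADJUSTMENT_DEF and is folded back in at the end.  Compile-guarded
   sketch.  */
#if 0
int a[1024], i, s;
int lanes[4] = { 0, 0, 0, 0 };     /* seeded with the neutral value, not 5 */

for (i = 0; i < 1024; i += 4)
  {
    lanes[0] += a[i];
    lanes[1] += a[i + 1];
    lanes[2] += a[i + 2];
    lanes[3] += a[i + 3];
  }
s = (lanes[0] + lanes[1] + lanes[2] + lanes[3]) + 5;   /* + adjustment_def */
#endif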
3890 if (adjustment_def)
3892 gcc_assert (!slp_reduc);
3893 if (nested_in_vect_loop)
3895 new_phi = VEC_index (gimple, new_phis, 0);
3896 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) == VECTOR_TYPE);
3897 expr = build2 (code, vectype, PHI_RESULT (new_phi), adjustment_def);
3898 new_dest = vect_create_destination_var (scalar_dest, vectype);
3900 else
3902 new_temp = VEC_index (tree, scalar_results, 0);
3903 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
3904 expr = build2 (code, scalar_type, new_temp, adjustment_def);
3905 new_dest = vect_create_destination_var (scalar_dest, scalar_type);
3908 epilog_stmt = gimple_build_assign (new_dest, expr);
3909 new_temp = make_ssa_name (new_dest, epilog_stmt);
3910 gimple_assign_set_lhs (epilog_stmt, new_temp);
3911 SSA_NAME_DEF_STMT (new_temp) = epilog_stmt;
3912 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
3913 if (nested_in_vect_loop)
3915 set_vinfo_for_stmt (epilog_stmt,
3916 new_stmt_vec_info (epilog_stmt, loop_vinfo,
3917 NULL));
3918 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (epilog_stmt)) =
3919 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (new_phi));
3921 if (!double_reduc)
3922 VEC_quick_push (tree, scalar_results, new_temp);
3923 else
3924 VEC_replace (tree, scalar_results, 0, new_temp);
3926 else
3927 VEC_replace (tree, scalar_results, 0, new_temp);
3929 VEC_replace (gimple, new_phis, 0, epilog_stmt);
3932 /* 2.6 Handle the loop-exit phis. Replace the uses of scalar loop-exit
3933 phis with new adjusted scalar results, i.e., replace use <s_out0>
3934 with use <s_out4>.
3936 Transform:
3937 loop_exit:
3938 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
3939 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
3940 v_out2 = reduce <v_out1>
3941 s_out3 = extract_field <v_out2, 0>
3942 s_out4 = adjust_result <s_out3>
3943 use <s_out0>
3944 use <s_out0>
3946 into:
3948 loop_exit:
3949 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
3950 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
3951 v_out2 = reduce <v_out1>
3952 s_out3 = extract_field <v_out2, 0>
3953 s_out4 = adjust_result <s_out3>
3954 use <s_out4>
3955 use <s_out4> */
3958 /* In an SLP reduction chain we reduce vector results into one vector if
3959 necessary, hence we set GROUP_SIZE to 1 here. SCALAR_DEST is the LHS of
3960 the last stmt in the reduction chain, since we are looking for the loop
3961 exit phi node. */
3962 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
3964 scalar_dest = gimple_assign_lhs (VEC_index (gimple,
3965 SLP_TREE_SCALAR_STMTS (slp_node),
3966 group_size - 1));
3967 group_size = 1;
3970 /* In SLP we may have several statements in NEW_PHIS and REDUCTION_PHIS (in
3971 case GROUP_SIZE is greater than the vectorization factor). Therefore, we
3972 need to match SCALAR_RESULTS with the corresponding statements. The first
3973 (GROUP_SIZE / number of new vector stmts) scalar results correspond to
3974 the first vector stmt, etc.
3975 (RATIO is equal to (GROUP_SIZE / number of new vector stmts)). */
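/* Editor's illustration, not part of the vectorizer: the RATIO mapping
   worked through for GROUP_SIZE = 8 and two new vector stmts.  A
   compile-guarded sketch.  */
#if 0
int group_size = 8, num_vec_stmts = 2, k, vec_stmt_index;
int ratio = group_size / num_vec_stmts;  /* = 4; assumed to divide exactly */

for (k = 0; k < group_size; k++)
  vec_stmt_index = k / ratio;    /* results 0..3 -> stmt 0, 4..7 -> stmt 1 */
#endif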
3976 if (group_size > VEC_length (gimple, new_phis))
3978 ratio = group_size / VEC_length (gimple, new_phis);
3979 gcc_assert (!(group_size % VEC_length (gimple, new_phis)));
3981 else
3982 ratio = 1;
3984 for (k = 0; k < group_size; k++)
3986 if (k % ratio == 0)
3988 epilog_stmt = VEC_index (gimple, new_phis, k / ratio);
3989 reduction_phi = VEC_index (gimple, reduction_phis, k / ratio);
3992 if (slp_reduc)
3994 gimple current_stmt = VEC_index (gimple,
3995 SLP_TREE_SCALAR_STMTS (slp_node), k);
3997 orig_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (current_stmt));
3998 /* SLP statements can't participate in patterns. */
3999 gcc_assert (!orig_stmt);
4000 scalar_dest = gimple_assign_lhs (current_stmt);
4003 phis = VEC_alloc (gimple, heap, 3);
4004 /* Find the loop-closed-use at the loop exit of the original scalar
4005 result. (The reduction result is expected to have two immediate uses -
4006 one at the latch block, and one at the loop exit). */
4007 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
4008 if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
4009 VEC_safe_push (gimple, heap, phis, USE_STMT (use_p));
4011 /* We expect to have found an exit_phi because of loop-closed-ssa
4012 form. */
4013 gcc_assert (!VEC_empty (gimple, phis));
4015 FOR_EACH_VEC_ELT (gimple, phis, i, exit_phi)
4017 if (outer_loop)
4019 stmt_vec_info exit_phi_vinfo = vinfo_for_stmt (exit_phi);
4020 gimple vect_phi;
4022 /* FORNOW. Currently not supporting the case that an inner-loop
4023 reduction is not used in the outer-loop (but only outside the
4024 outer-loop), unless it is a double reduction. */
4025 gcc_assert ((STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
4026 && !STMT_VINFO_LIVE_P (exit_phi_vinfo))
4027 || double_reduc);
4029 STMT_VINFO_VEC_STMT (exit_phi_vinfo) = epilog_stmt;
4030 if (!double_reduc
4031 || STMT_VINFO_DEF_TYPE (exit_phi_vinfo)
4032 != vect_double_reduction_def)
4033 continue;
4035 /* Handle double reduction:
4037 stmt1: s1 = phi <s0, s2> - double reduction phi (outer loop)
4038 stmt2: s3 = phi <s1, s4> - (regular) reduc phi (inner loop)
4039 stmt3: s4 = use (s3) - (regular) reduc stmt (inner loop)
4040 stmt4: s2 = phi <s4> - double reduction stmt (outer loop)
4042 At that point the regular reduction (stmt2 and stmt3) is
4043 already vectorized, as well as the exit phi node, stmt4.
4044 Here we vectorize the phi node of double reduction, stmt1, and
4045 update all relevant statements. */
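/* Editor's illustration, not part of the vectorizer: a source loop nest
   that produces the four stmts above - the scalar s accumulates across
   both loops, so the inner-loop reduction feeds the outer-loop phi.
   Compile-guarded sketch with a hypothetical array.  */
#if 0
int a[64][64], i, j, s = 0;                 /* s0 */

for (i = 0; i < 64; i++)                    /* outer: s1 = phi <s0, s2>     */
  for (j = 0; j < 64; j++)                  /* inner: s3 = phi <s1, s4>     */
    s = s + a[i][j];                        /* s4 = use (s3); s2 = phi <s4> */
#endif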
4047 /* Go through all the uses of s2 to find double reduction phi
4048 node, i.e., stmt1 above. */
4049 orig_name = PHI_RESULT (exit_phi);
4050 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
4052 stmt_vec_info use_stmt_vinfo = vinfo_for_stmt (use_stmt);
4053 stmt_vec_info new_phi_vinfo;
4054 tree vect_phi_init, preheader_arg, vect_phi_res, init_def;
4055 basic_block bb = gimple_bb (use_stmt);
4056 gimple use;
4058 /* Check that USE_STMT is really a double reduction phi
4059 node. */
4060 if (gimple_code (use_stmt) != GIMPLE_PHI
4061 || gimple_phi_num_args (use_stmt) != 2
4062 || !use_stmt_vinfo
4063 || STMT_VINFO_DEF_TYPE (use_stmt_vinfo)
4064 != vect_double_reduction_def
4065 || bb->loop_father != outer_loop)
4066 continue;
4068 /* Create vector phi node for double reduction:
4069 vs1 = phi <vs0, vs2>
4070 vs1 was created previously in this function by a call to
4071 vect_get_vec_def_for_operand and is stored in
4072 vec_initial_def;
4073 vs2 is defined by EPILOG_STMT, the vectorized EXIT_PHI;
4074 vs0 is created here. */
4076 /* Create vector phi node. */
4077 vect_phi = create_phi_node (vec_initial_def, bb);
4078 new_phi_vinfo = new_stmt_vec_info (vect_phi,
4079 loop_vec_info_for_loop (outer_loop), NULL);
4080 set_vinfo_for_stmt (vect_phi, new_phi_vinfo);
4082 /* Create vs0 - initial def of the double reduction phi. */
4083 preheader_arg = PHI_ARG_DEF_FROM_EDGE (use_stmt,
4084 loop_preheader_edge (outer_loop));
4085 init_def = get_initial_def_for_reduction (stmt,
4086 preheader_arg, NULL);
4087 vect_phi_init = vect_init_vector (use_stmt, init_def,
4088 vectype, NULL);
4090 /* Update phi node arguments with vs0 and vs2. */
4091 add_phi_arg (vect_phi, vect_phi_init,
4092 loop_preheader_edge (outer_loop),
4093 UNKNOWN_LOCATION);
4094 add_phi_arg (vect_phi, PHI_RESULT (epilog_stmt),
4095 loop_latch_edge (outer_loop), UNKNOWN_LOCATION);
4096 if (vect_print_dump_info (REPORT_DETAILS))
4098 fprintf (vect_dump, "created double reduction phi "
4099 "node: ");
4100 print_gimple_stmt (vect_dump, vect_phi, 0, TDF_SLIM);
4103 vect_phi_res = PHI_RESULT (vect_phi);
4105 /* Replace the use, i.e., set the correct vs1 in the regular
4106 reduction phi node. FORNOW, NCOPIES is always 1, so the
4107 loop is redundant. */
4108 use = reduction_phi;
4109 for (j = 0; j < ncopies; j++)
4111 edge pr_edge = loop_preheader_edge (loop);
4112 SET_PHI_ARG_DEF (use, pr_edge->dest_idx, vect_phi_res);
4113 use = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (use));
4119 VEC_free (gimple, heap, phis);
4120 if (nested_in_vect_loop)
4122 if (double_reduc)
4123 loop = outer_loop;
4124 else
4125 continue;
4128 phis = VEC_alloc (gimple, heap, 3);
4129 /* Find the loop-closed-use at the loop exit of the original scalar
4130 result. (The reduction result is expected to have two immediate uses,
4131 one at the latch block, and one at the loop exit). For double
4132 reductions we are looking for exit phis of the outer loop. */
4133 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
4135 if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
4136 VEC_safe_push (gimple, heap, phis, USE_STMT (use_p));
4137 else
4139 if (double_reduc && gimple_code (USE_STMT (use_p)) == GIMPLE_PHI)
4141 tree phi_res = PHI_RESULT (USE_STMT (use_p));
4143 FOR_EACH_IMM_USE_FAST (phi_use_p, phi_imm_iter, phi_res)
4145 if (!flow_bb_inside_loop_p (loop,
4146 gimple_bb (USE_STMT (phi_use_p))))
4147 VEC_safe_push (gimple, heap, phis,
4148 USE_STMT (phi_use_p));
4154 FOR_EACH_VEC_ELT (gimple, phis, i, exit_phi)
4156 /* Replace the uses: */
4157 orig_name = PHI_RESULT (exit_phi);
4158 scalar_result = VEC_index (tree, scalar_results, k);
4159 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
4160 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
4161 SET_USE (use_p, scalar_result);
4164 VEC_free (gimple, heap, phis);
4167 VEC_free (tree, heap, scalar_results);
4168 VEC_free (gimple, heap, new_phis);
4172 /* Function vectorizable_reduction.
4174 Check if STMT performs a reduction operation that can be vectorized.
4175 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
4176 stmt to replace it, put it in VEC_STMT, and insert it at GSI.
4177 Return FALSE if not a vectorizable STMT, TRUE otherwise.
4179 This function also handles reduction idioms (patterns) that have been
4180 recognized in advance during vect_pattern_recog. In this case, STMT may be
4181 of this form:
4182 X = pattern_expr (arg0, arg1, ..., X)
4183 and its STMT_VINFO_RELATED_STMT points to the last stmt in the original
4184 sequence that had been detected and replaced by the pattern-stmt (STMT).
4186 In some cases of reduction patterns, the type of the reduction variable X is
4187 different than the type of the other arguments of STMT.
4188 In such cases, the vectype that is used when transforming STMT into a vector
4189 stmt is different than the vectype that is used to determine the
4190 vectorization factor, because it consists of a different number of elements
4191 than the actual number of elements that are being operated upon in parallel.
4193 For example, consider an accumulation of shorts into an int accumulator.
4194 On some targets it's possible to vectorize this pattern operating on 8
4195 shorts at a time (hence, the vectype for purposes of determining the
4196 vectorization factor should be V8HI); on the other hand, the vectype that
4197 is used to create the vector form is actually V4SI (the type of the result).
4199 Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
4200 indicates the actual level of parallelism (V8HI in the example), so
4201 that the right vectorization factor would be derived. This vectype
4202 corresponds to the type of arguments to the reduction stmt, and should *NOT*
4203 be used to create the vectorized stmt. The right vectype for the vectorized
4204 stmt is obtained from the type of the result X:
4205 get_vectype_for_scalar_type (TREE_TYPE (X))
4207 This means that, contrary to "regular" reductions (or "regular" stmts in
4208 general), the following equation:
4209 STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (TREE_TYPE (X))
4210 does *NOT* necessarily hold for reduction patterns. */
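/* Editor's illustration, not part of the vectorizer: the shorts-into-int
   accumulation discussed above.  With 128-bit vectors the arguments come
   in as V8HI (so the vectorization factor is 8) while the result X is
   V4SI.  Compile-guarded sketch with hypothetical names.  */
#if 0
short s[1024];
int acc = 0, i;

for (i = 0; i < 1024; i++)
  acc += (int) s[i];            /* recognized as X = widen_sum <s[i], X>   */
#endif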
4212 bool
4213 vectorizable_reduction (gimple stmt, gimple_stmt_iterator *gsi,
4214 gimple *vec_stmt, slp_tree slp_node)
4216 tree vec_dest;
4217 tree scalar_dest;
4218 tree loop_vec_def0 = NULL_TREE, loop_vec_def1 = NULL_TREE;
4219 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4220 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
4221 tree vectype_in = NULL_TREE;
4222 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4223 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4224 enum tree_code code, orig_code, epilog_reduc_code;
4225 enum machine_mode vec_mode;
4226 int op_type;
4227 optab optab, reduc_optab;
4228 tree new_temp = NULL_TREE;
4229 tree def;
4230 gimple def_stmt;
4231 enum vect_def_type dt;
4232 gimple new_phi = NULL;
4233 tree scalar_type;
4234 bool is_simple_use;
4235 gimple orig_stmt;
4236 stmt_vec_info orig_stmt_info;
4237 tree expr = NULL_TREE;
4238 int i;
4239 int ncopies;
4240 int epilog_copies;
4241 stmt_vec_info prev_stmt_info, prev_phi_info;
4242 bool single_defuse_cycle = false;
4243 tree reduc_def = NULL_TREE;
4244 gimple new_stmt = NULL;
4245 int j;
4246 tree ops[3];
4247 bool nested_cycle = false, found_nested_cycle_def = false;
4248 gimple reduc_def_stmt = NULL;
4249 /* The default is that the reduction variable is the last in the statement. */
4250 int reduc_index = 2;
4251 bool double_reduc = false, dummy;
4252 basic_block def_bb;
4253 struct loop * def_stmt_loop, *outer_loop = NULL;
4254 tree def_arg;
4255 gimple def_arg_stmt;
4256 VEC (tree, heap) *vec_oprnds0 = NULL, *vec_oprnds1 = NULL, *vect_defs = NULL;
4257 VEC (gimple, heap) *phis = NULL;
4258 int vec_num;
4259 tree def0, def1, tem;
4261 /* In case of a reduction chain we switch to the first stmt in the chain, but
4262 we don't update STMT_INFO, since only the last stmt is marked as a
4263 reduction and has reduction properties. */
4264 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
4265 stmt = GROUP_FIRST_ELEMENT (stmt_info);
4267 if (nested_in_vect_loop_p (loop, stmt))
4269 outer_loop = loop;
4270 loop = loop->inner;
4271 nested_cycle = true;
4274 /* 1. Is vectorizable reduction? */
4275 /* Not supportable if the reduction variable is used in the loop, unless
4276 it's a reduction chain. */
4277 if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer
4278 && !GROUP_FIRST_ELEMENT (stmt_info))
4279 return false;
4281 /* Reductions that are not used even in an enclosing outer-loop
4282 are expected to be "live" (used out of the loop). */
4283 if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope
4284 && !STMT_VINFO_LIVE_P (stmt_info))
4285 return false;
4287 /* Make sure it was already recognized as a reduction computation. */
4288 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def
4289 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_nested_cycle)
4290 return false;
4292 /* 2. Has this been recognized as a reduction pattern?
4294 Check if STMT represents a pattern that has been recognized
4295 in earlier analysis stages. For stmts that represent a pattern,
4296 the STMT_VINFO_RELATED_STMT field records the last stmt in
4297 the original sequence that constitutes the pattern. */
4299 orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
4300 if (orig_stmt)
4302 orig_stmt_info = vinfo_for_stmt (orig_stmt);
4303 gcc_assert (STMT_VINFO_RELATED_STMT (orig_stmt_info) == stmt);
4304 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
4305 gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
4308 /* 3. Check the operands of the operation. The first operands are defined
4309 inside the loop body. The last operand is the reduction variable,
4310 which is defined by the loop-header-phi. */
4312 gcc_assert (is_gimple_assign (stmt));
4314 /* Flatten RHS. */
4315 switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
4317 case GIMPLE_SINGLE_RHS:
4318 op_type = TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt));
4319 if (op_type == ternary_op)
4321 tree rhs = gimple_assign_rhs1 (stmt);
4322 ops[0] = TREE_OPERAND (rhs, 0);
4323 ops[1] = TREE_OPERAND (rhs, 1);
4324 ops[2] = TREE_OPERAND (rhs, 2);
4325 code = TREE_CODE (rhs);
4327 else
4328 return false;
4329 break;
4331 case GIMPLE_BINARY_RHS:
4332 code = gimple_assign_rhs_code (stmt);
4333 op_type = TREE_CODE_LENGTH (code);
4334 gcc_assert (op_type == binary_op);
4335 ops[0] = gimple_assign_rhs1 (stmt);
4336 ops[1] = gimple_assign_rhs2 (stmt);
4337 break;
4339 case GIMPLE_TERNARY_RHS:
4340 code = gimple_assign_rhs_code (stmt);
4341 op_type = TREE_CODE_LENGTH (code);
4342 gcc_assert (op_type == ternary_op);
4343 ops[0] = gimple_assign_rhs1 (stmt);
4344 ops[1] = gimple_assign_rhs2 (stmt);
4345 ops[2] = gimple_assign_rhs3 (stmt);
4346 break;
4348 case GIMPLE_UNARY_RHS:
4349 return false;
4351 default:
4352 gcc_unreachable ();
4355 scalar_dest = gimple_assign_lhs (stmt);
4356 scalar_type = TREE_TYPE (scalar_dest);
4357 if (!POINTER_TYPE_P (scalar_type) && !INTEGRAL_TYPE_P (scalar_type)
4358 && !SCALAR_FLOAT_TYPE_P (scalar_type))
4359 return false;
4361 /* All uses but the last are expected to be defined in the loop.
4362 The last use is the reduction variable. In case of a nested cycle this
4363 assumption is not true: we use reduc_index to record the index of the
4364 reduction variable. */
4365 for (i = 0; i < op_type-1; i++)
4367 /* The condition of COND_EXPR is checked in vectorizable_condition(). */
4368 if (i == 0 && code == COND_EXPR)
4369 continue;
4371 is_simple_use = vect_is_simple_use_1 (ops[i], loop_vinfo, NULL,
4372 &def_stmt, &def, &dt, &tem);
4373 if (!vectype_in)
4374 vectype_in = tem;
4375 gcc_assert (is_simple_use);
4377 if (dt != vect_internal_def
4378 && dt != vect_external_def
4379 && dt != vect_constant_def
4380 && dt != vect_induction_def
4381 && !(dt == vect_nested_cycle && nested_cycle))
4382 return false;
4384 if (dt == vect_nested_cycle)
4386 found_nested_cycle_def = true;
4387 reduc_def_stmt = def_stmt;
4388 reduc_index = i;
4392 is_simple_use = vect_is_simple_use_1 (ops[i], loop_vinfo, NULL, &def_stmt,
4393 &def, &dt, &tem);
4394 if (!vectype_in)
4395 vectype_in = tem;
4396 gcc_assert (is_simple_use);
4397 gcc_assert (dt == vect_reduction_def
4398 || dt == vect_nested_cycle
4399 || ((dt == vect_internal_def || dt == vect_external_def
4400 || dt == vect_constant_def || dt == vect_induction_def)
4401 && nested_cycle && found_nested_cycle_def));
4402 if (!found_nested_cycle_def)
4403 reduc_def_stmt = def_stmt;
4405 gcc_assert (gimple_code (reduc_def_stmt) == GIMPLE_PHI);
4406 if (orig_stmt)
4407 gcc_assert (orig_stmt == vect_is_simple_reduction (loop_vinfo,
4408 reduc_def_stmt,
4409 !nested_cycle,
4410 &dummy));
4411 else
4413 gimple tmp = vect_is_simple_reduction (loop_vinfo, reduc_def_stmt,
4414 !nested_cycle, &dummy);
4415 /* We changed STMT to be the first stmt in reduction chain, hence we
4416 check that in this case the first element in the chain is STMT. */
4417 gcc_assert (stmt == tmp
4418 || GROUP_FIRST_ELEMENT (vinfo_for_stmt (tmp)) == stmt);
4421 if (STMT_VINFO_LIVE_P (vinfo_for_stmt (reduc_def_stmt)))
4422 return false;
4424 if (slp_node || PURE_SLP_STMT (stmt_info))
4425 ncopies = 1;
4426 else
4427 ncopies = (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
4428 / TYPE_VECTOR_SUBPARTS (vectype_in));
4430 gcc_assert (ncopies >= 1);
4432 vec_mode = TYPE_MODE (vectype_in);
4434 if (code == COND_EXPR)
4436 if (!vectorizable_condition (stmt, gsi, NULL, ops[reduc_index], 0))
4438 if (vect_print_dump_info (REPORT_DETAILS))
4439 fprintf (vect_dump, "unsupported condition in reduction");
4441 return false;
4444 else
4446 /* 4. Supportable by target? */
4448 /* 4.1. check support for the operation in the loop */
4449 optab = optab_for_tree_code (code, vectype_in, optab_default);
4450 if (!optab)
4452 if (vect_print_dump_info (REPORT_DETAILS))
4453 fprintf (vect_dump, "no optab.");
4455 return false;
4458 if (optab_handler (optab, vec_mode) == CODE_FOR_nothing)
4460 if (vect_print_dump_info (REPORT_DETAILS))
4461 fprintf (vect_dump, "op not supported by target.");
4463 if (GET_MODE_SIZE (vec_mode) != UNITS_PER_WORD
4464 || LOOP_VINFO_VECT_FACTOR (loop_vinfo)
4465 < vect_min_worthwhile_factor (code))
4466 return false;
4468 if (vect_print_dump_info (REPORT_DETAILS))
4469 fprintf (vect_dump, "proceeding using word mode.");
4472 /* Worthwhile without SIMD support? */
4473 if (!VECTOR_MODE_P (TYPE_MODE (vectype_in))
4474 && LOOP_VINFO_VECT_FACTOR (loop_vinfo)
4475 < vect_min_worthwhile_factor (code))
4477 if (vect_print_dump_info (REPORT_DETAILS))
4478 fprintf (vect_dump, "not worthwhile without SIMD support.");
4480 return false;
4484 /* 4.2. Check support for the epilog operation.
4486 If STMT represents a reduction pattern, then the type of the
4487 reduction variable may be different than the type of the rest
4488 of the arguments. For example, consider the case of accumulation
4489 of shorts into an int accumulator. The original code:
4490 S1: int_a = (int) short_a;
4491 orig_stmt-> S2: int_acc = plus <int_a ,int_acc>;
4493 was replaced with:
4494 STMT: int_acc = widen_sum <short_a, int_acc>
4496 This means that:
4497 1. The tree-code that is used to create the vector operation in the
4498 epilog code (that reduces the partial results) is not the
4499 tree-code of STMT, but is rather the tree-code of the original
4500 stmt from the pattern that STMT is replacing. I.e., in the example
4501 above we want to use 'widen_sum' in the loop, but 'plus' in the
4502 epilog.
4503 2. The type (mode) we use to check available target support
4504 for the vector operation to be created in the *epilog*, is
4505 determined by the type of the reduction variable (in the example
4506 above we'd check this: optab_handler (plus_optab, vect_int_mode)).
4507 However the type (mode) we use to check available target support
4508 for the vector operation to be created *inside the loop*, is
4509 determined by the type of the other arguments to STMT (in the
4510 example we'd check this: optab_handler (widen_sum_optab,
4511 vect_short_mode)).
4513 This is contrary to "regular" reductions, in which the types of all
4514 the arguments are the same as the type of the reduction variable.
4515 For "regular" reductions we can therefore use the same vector type
4516 (and also the same tree-code) when generating the epilog code and
4517 when generating the code inside the loop. */
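/* Editor's illustration, not part of the vectorizer: the two different
   operations for the widen_sum example in plain C - the loop body widens
   shorts into int lanes (support checked in the V8HI mode), while the
   epilog merely adds int lanes (support checked in the V4SI mode).  The
   lane assignment below is arbitrary; only the total matters.
   Compile-guarded sketch.  */
#if 0
short s[1024];
int lanes[4] = { 0, 0, 0, 0 };
int i, k, result;

for (i = 0; i + 8 <= 1024; i += 8)       /* in-loop: widen_sum on 8 shorts */
  for (k = 0; k < 8; k++)
    lanes[k % 4] += (int) s[i + k];
result = lanes[0] + lanes[1] + lanes[2] + lanes[3];  /* epilog: plain plus */
#endif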
4519 if (orig_stmt)
4521 /* This is a reduction pattern: get the vectype from the type of the
4522 reduction variable, and get the tree-code from orig_stmt. */
4523 orig_code = gimple_assign_rhs_code (orig_stmt);
4524 gcc_assert (vectype_out);
4525 vec_mode = TYPE_MODE (vectype_out);
4527 else
4529 /* Regular reduction: the same vectype and tree-code used for the
4530 vector code inside the loop can also be used for the epilog code. */
4531 orig_code = code;
4534 if (nested_cycle)
4536 def_bb = gimple_bb (reduc_def_stmt);
4537 def_stmt_loop = def_bb->loop_father;
4538 def_arg = PHI_ARG_DEF_FROM_EDGE (reduc_def_stmt,
4539 loop_preheader_edge (def_stmt_loop));
4540 if (TREE_CODE (def_arg) == SSA_NAME
4541 && (def_arg_stmt = SSA_NAME_DEF_STMT (def_arg))
4542 && gimple_code (def_arg_stmt) == GIMPLE_PHI
4543 && flow_bb_inside_loop_p (outer_loop, gimple_bb (def_arg_stmt))
4544 && vinfo_for_stmt (def_arg_stmt)
4545 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_arg_stmt))
4546 == vect_double_reduction_def)
4547 double_reduc = true;
4550 epilog_reduc_code = ERROR_MARK;
4551 if (reduction_code_for_scalar_code (orig_code, &epilog_reduc_code))
4553 reduc_optab = optab_for_tree_code (epilog_reduc_code, vectype_out,
4554 optab_default);
4555 if (!reduc_optab)
4557 if (vect_print_dump_info (REPORT_DETAILS))
4558 fprintf (vect_dump, "no optab for reduction.");
4560 epilog_reduc_code = ERROR_MARK;
4563 if (reduc_optab
4564 && optab_handler (reduc_optab, vec_mode) == CODE_FOR_nothing)
4566 if (vect_print_dump_info (REPORT_DETAILS))
4567 fprintf (vect_dump, "reduc op not supported by target.");
4569 epilog_reduc_code = ERROR_MARK;
4572 else
4574 if (!nested_cycle || double_reduc)
4576 if (vect_print_dump_info (REPORT_DETAILS))
4577 fprintf (vect_dump, "no reduc code for scalar code.");
4579 return false;
4583 if (double_reduc && ncopies > 1)
4585 if (vect_print_dump_info (REPORT_DETAILS))
4586 fprintf (vect_dump, "multiple types in double reduction");
4588 return false;
4591 if (!vec_stmt) /* transformation not required. */
4593 if (!vect_model_reduction_cost (stmt_info, epilog_reduc_code, ncopies))
4594 return false;
4595 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
4596 return true;
4599 /** Transform. **/
4601 if (vect_print_dump_info (REPORT_DETAILS))
4602 fprintf (vect_dump, "transform reduction.");
4604 /* FORNOW: Multiple types are not supported for condition. */
4605 if (code == COND_EXPR)
4606 gcc_assert (ncopies == 1);
4608 /* Create the destination vector */
4609 vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
4611 /* In case the vectorization factor (VF) is bigger than the number
4612 of elements that we can fit in a vectype (nunits), we have to generate
4613 more than one vector stmt - i.e., we need to "unroll" the
4614 vector stmt by a factor VF/nunits. For more details see documentation
4615 in vectorizable_operation. */
4617 /* If the reduction is used in an outer loop we need to generate
4618 VF intermediate results, like so (e.g. for ncopies=2):
4619 r0 = phi (init, r0)
4620 r1 = phi (init, r1)
4621 r0 = x0 + r0;
4622 r1 = x1 + r1;
4623 (i.e. we generate VF results in 2 registers).
4624 In this case we have a separate def-use cycle for each copy, and therefore
4625 for each copy we get the vector def for the reduction variable from the
4626 respective phi node created for this copy.
4628 Otherwise (the reduction is unused in the loop nest), we can combine
4629 together intermediate results, like so (e.g. for ncopies=2):
4630 r = phi (init, r)
4631 r = x0 + r;
4632 r = x1 + r;
4633 (i.e. we generate VF/2 results in a single register).
4634 In this case for each copy we get the vector def for the reduction variable
4635 from the vectorized reduction operation generated in the previous iteration.
4636 */
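/* Editor's illustration, not part of the vectorizer: the two variants
   above written as plain C accumulators for ncopies = 2.  Compile-guarded
   sketch with hypothetical names.  */
#if 0
int x[1024], i, n = 1024;
int r0 = 0, r1 = 0, r = 0;

for (i = 0; i + 2 <= n; i += 2)
  {
    r0 = x[i] + r0;            /* copy 0: its own def-use cycle (phi)     */
    r1 = x[i + 1] + r1;        /* copy 1: its own def-use cycle (phi)     */
  }

for (i = 0; i + 2 <= n; i += 2)
  {
    r = x[i] + r;              /* single def-use cycle: each copy takes   */
    r = x[i + 1] + r;          /* its def from the previous copy.         */
  }
#endif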
4638 if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope)
4640 single_defuse_cycle = true;
4641 epilog_copies = 1;
4643 else
4644 epilog_copies = ncopies;
4646 prev_stmt_info = NULL;
4647 prev_phi_info = NULL;
4648 if (slp_node)
4650 vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
4651 gcc_assert (TYPE_VECTOR_SUBPARTS (vectype_out)
4652 == TYPE_VECTOR_SUBPARTS (vectype_in));
4654 else
4656 vec_num = 1;
4657 vec_oprnds0 = VEC_alloc (tree, heap, 1);
4658 if (op_type == ternary_op)
4659 vec_oprnds1 = VEC_alloc (tree, heap, 1);
4662 phis = VEC_alloc (gimple, heap, vec_num);
4663 vect_defs = VEC_alloc (tree, heap, vec_num);
4664 if (!slp_node)
4665 VEC_quick_push (tree, vect_defs, NULL_TREE);
4667 for (j = 0; j < ncopies; j++)
4669 if (j == 0 || !single_defuse_cycle)
4671 for (i = 0; i < vec_num; i++)
4673 /* Create the reduction-phi that defines the reduction
4674 operand. */
4675 new_phi = create_phi_node (vec_dest, loop->header);
4676 set_vinfo_for_stmt (new_phi,
4677 new_stmt_vec_info (new_phi, loop_vinfo,
4678 NULL));
4679 if (j == 0 || slp_node)
4680 VEC_quick_push (gimple, phis, new_phi);
4684 if (code == COND_EXPR)
4686 gcc_assert (!slp_node);
4687 vectorizable_condition (stmt, gsi, vec_stmt,
4688 PHI_RESULT (VEC_index (gimple, phis, 0)),
4689 reduc_index);
4690 /* Multiple types are not supported for condition. */
4691 break;
4694 /* Handle uses. */
4695 if (j == 0)
4697 tree op0, op1 = NULL_TREE;
4699 op0 = ops[!reduc_index];
4700 if (op_type == ternary_op)
4702 if (reduc_index == 0)
4703 op1 = ops[2];
4704 else
4705 op1 = ops[1];
4708 if (slp_node)
4709 vect_get_slp_defs (op0, op1, slp_node, &vec_oprnds0, &vec_oprnds1,
4710 -1);
4711 else
4713 loop_vec_def0 = vect_get_vec_def_for_operand (ops[!reduc_index],
4714 stmt, NULL);
4715 VEC_quick_push (tree, vec_oprnds0, loop_vec_def0);
4716 if (op_type == ternary_op)
4718 loop_vec_def1 = vect_get_vec_def_for_operand (op1, stmt,
4719 NULL);
4720 VEC_quick_push (tree, vec_oprnds1, loop_vec_def1);
4724 else
4726 if (!slp_node)
4728 enum vect_def_type dt = vect_unknown_def_type; /* Dummy */
4729 loop_vec_def0 = vect_get_vec_def_for_stmt_copy (dt, loop_vec_def0);
4730 VEC_replace (tree, vec_oprnds0, 0, loop_vec_def0);
4731 if (op_type == ternary_op)
4733 loop_vec_def1 = vect_get_vec_def_for_stmt_copy (dt,
4734 loop_vec_def1);
4735 VEC_replace (tree, vec_oprnds1, 0, loop_vec_def1);
4739 if (single_defuse_cycle)
4740 reduc_def = gimple_assign_lhs (new_stmt);
4742 STMT_VINFO_RELATED_STMT (prev_phi_info) = new_phi;
4745 FOR_EACH_VEC_ELT (tree, vec_oprnds0, i, def0)
4747 if (slp_node)
4748 reduc_def = PHI_RESULT (VEC_index (gimple, phis, i));
4749 else
4751 if (!single_defuse_cycle || j == 0)
4752 reduc_def = PHI_RESULT (new_phi);
4755 def1 = ((op_type == ternary_op)
4756 ? VEC_index (tree, vec_oprnds1, i) : NULL);
4757 if (op_type == binary_op)
4759 if (reduc_index == 0)
4760 expr = build2 (code, vectype_out, reduc_def, def0);
4761 else
4762 expr = build2 (code, vectype_out, def0, reduc_def);
4764 else
4766 if (reduc_index == 0)
4767 expr = build3 (code, vectype_out, reduc_def, def0, def1);
4768 else
4770 if (reduc_index == 1)
4771 expr = build3 (code, vectype_out, def0, reduc_def, def1);
4772 else
4773 expr = build3 (code, vectype_out, def0, def1, reduc_def);
4777 new_stmt = gimple_build_assign (vec_dest, expr);
4778 new_temp = make_ssa_name (vec_dest, new_stmt);
4779 gimple_assign_set_lhs (new_stmt, new_temp);
4780 vect_finish_stmt_generation (stmt, new_stmt, gsi);
4782 if (slp_node)
4784 VEC_quick_push (gimple, SLP_TREE_VEC_STMTS (slp_node), new_stmt);
4785 VEC_quick_push (tree, vect_defs, new_temp);
4787 else
4788 VEC_replace (tree, vect_defs, 0, new_temp);
4791 if (slp_node)
4792 continue;
4794 if (j == 0)
4795 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
4796 else
4797 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
4799 prev_stmt_info = vinfo_for_stmt (new_stmt);
4800 prev_phi_info = vinfo_for_stmt (new_phi);
4803 /* Finalize the reduction-phi (set its arguments) and create the
4804 epilog reduction code. */
4805 if ((!single_defuse_cycle || code == COND_EXPR) && !slp_node)
4807 new_temp = gimple_assign_lhs (*vec_stmt);
4808 VEC_replace (tree, vect_defs, 0, new_temp);
4811 vect_create_epilog_for_reduction (vect_defs, stmt, epilog_copies,
4812 epilog_reduc_code, phis, reduc_index,
4813 double_reduc, slp_node);
4815 VEC_free (gimple, heap, phis);
4816 VEC_free (tree, heap, vec_oprnds0);
4817 if (vec_oprnds1)
4818 VEC_free (tree, heap, vec_oprnds1);
4820 return true;
4823 /* Function vect_min_worthwhile_factor.
4825 For a loop where we could vectorize the operation indicated by CODE,
4826 return the minimum vectorization factor that makes it worthwhile
4827 to use generic vectors. */
4828 int
4829 vect_min_worthwhile_factor (enum tree_code code)
4831 switch (code)
4833 case PLUS_EXPR:
4834 case MINUS_EXPR:
4835 case NEGATE_EXPR:
4836 return 4;
4838 case BIT_AND_EXPR:
4839 case BIT_IOR_EXPR:
4840 case BIT_XOR_EXPR:
4841 case BIT_NOT_EXPR:
4842 return 2;
4844 default:
4845 return INT_MAX;
4850 /* Function vectorizable_induction
4852 Check if PHI performs an induction computation that can be vectorized.
4853 If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
4854 phi to replace it, put it in VEC_STMT, and add it to the same basic block.
4855 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
4857 bool
4858 vectorizable_induction (gimple phi, gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
4859 gimple *vec_stmt)
4861 stmt_vec_info stmt_info = vinfo_for_stmt (phi);
4862 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4863 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4864 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4865 int nunits = TYPE_VECTOR_SUBPARTS (vectype);
4866 int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
4867 tree vec_def;
4869 gcc_assert (ncopies >= 1);
4870 /* FORNOW. This restriction should be relaxed. */
4871 if (nested_in_vect_loop_p (loop, phi) && ncopies > 1)
4873 if (vect_print_dump_info (REPORT_DETAILS))
4874 fprintf (vect_dump, "multiple types in nested loop.");
4875 return false;
4878 if (!STMT_VINFO_RELEVANT_P (stmt_info))
4879 return false;
4881 /* FORNOW: SLP not supported. */
4882 if (STMT_SLP_TYPE (stmt_info))
4883 return false;
4885 gcc_assert (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def);
4887 if (gimple_code (phi) != GIMPLE_PHI)
4888 return false;
4890 if (!vec_stmt) /* transformation not required. */
4892 STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
4893 if (vect_print_dump_info (REPORT_DETAILS))
4894 fprintf (vect_dump, "=== vectorizable_induction ===");
4895 vect_model_induction_cost (stmt_info, ncopies);
4896 return true;
4899 /** Transform. **/
4901 if (vect_print_dump_info (REPORT_DETAILS))
4902 fprintf (vect_dump, "transform induction phi.");
4904 vec_def = get_initial_def_for_induction (phi);
4905 *vec_stmt = SSA_NAME_DEF_STMT (vec_def);
4906 return true;
4909 /* Function vectorizable_live_operation.
4911 STMT computes a value that is used outside the loop. Check if
4912 it can be supported. */
4914 bool
4915 vectorizable_live_operation (gimple stmt,
4916 gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
4917 gimple *vec_stmt ATTRIBUTE_UNUSED)
4919 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4920 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4921 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4922 int i;
4923 int op_type;
4924 tree op;
4925 tree def;
4926 gimple def_stmt;
4927 enum vect_def_type dt;
4928 enum tree_code code;
4929 enum gimple_rhs_class rhs_class;
4931 gcc_assert (STMT_VINFO_LIVE_P (stmt_info));
4933 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def)
4934 return false;
4936 if (!is_gimple_assign (stmt))
4937 return false;
4939 if (TREE_CODE (gimple_assign_lhs (stmt)) != SSA_NAME)
4940 return false;
4942 /* FORNOW. CHECKME. */
4943 if (nested_in_vect_loop_p (loop, stmt))
4944 return false;
4946 code = gimple_assign_rhs_code (stmt);
4947 op_type = TREE_CODE_LENGTH (code);
4948 rhs_class = get_gimple_rhs_class (code);
4949 gcc_assert (rhs_class != GIMPLE_UNARY_RHS || op_type == unary_op);
4950 gcc_assert (rhs_class != GIMPLE_BINARY_RHS || op_type == binary_op);
4952 /* FORNOW: support only if all uses are invariant. This means
4953 that the scalar operations can remain in place, unvectorized.
4954 The original last scalar value that they compute will be used. */
4956 for (i = 0; i < op_type; i++)
4958 if (rhs_class == GIMPLE_SINGLE_RHS)
4959 op = TREE_OPERAND (gimple_op (stmt, 1), i);
4960 else
4961 op = gimple_op (stmt, i + 1);
4962 if (op
4963 && !vect_is_simple_use (op, loop_vinfo, NULL, &def_stmt, &def, &dt))
4965 if (vect_print_dump_info (REPORT_DETAILS))
4966 fprintf (vect_dump, "use not simple.");
4967 return false;
4970 if (dt != vect_external_def && dt != vect_constant_def)
4971 return false;
4974 /* No transformation is required for the cases we currently support. */
4975 return true;
4978 /* Kill any debug uses outside LOOP of SSA names defined in STMT. */
4980 static void
4981 vect_loop_kill_debug_uses (struct loop *loop, gimple stmt)
4983 ssa_op_iter op_iter;
4984 imm_use_iterator imm_iter;
4985 def_operand_p def_p;
4986 gimple ustmt;
4988 FOR_EACH_PHI_OR_STMT_DEF (def_p, stmt, op_iter, SSA_OP_DEF)
4990 FOR_EACH_IMM_USE_STMT (ustmt, imm_iter, DEF_FROM_PTR (def_p))
4992 basic_block bb;
4994 if (!is_gimple_debug (ustmt))
4995 continue;
4997 bb = gimple_bb (ustmt);
4999 if (!flow_bb_inside_loop_p (loop, bb))
5001 if (gimple_debug_bind_p (ustmt))
5003 if (vect_print_dump_info (REPORT_DETAILS))
5004 fprintf (vect_dump, "killing debug use");
5006 gimple_debug_bind_reset_value (ustmt);
5007 update_stmt (ustmt);
5009 else
5010 gcc_unreachable ();
5016 /* Function vect_transform_loop.
5018 The analysis phase has determined that the loop is vectorizable.
5019 Vectorize the loop - create vectorized stmts to replace the scalar
5020 stmts in the loop, and update the loop exit condition. */
5022 void
5023 vect_transform_loop (loop_vec_info loop_vinfo)
5025 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5026 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
5027 int nbbs = loop->num_nodes;
5028 gimple_stmt_iterator si;
5029 int i;
5030 tree ratio = NULL;
5031 int vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
5032 bool strided_store;
5033 bool slp_scheduled = false;
5034 unsigned int nunits;
5035 tree cond_expr = NULL_TREE;
5036 gimple_seq cond_expr_stmt_list = NULL;
5037 bool do_peeling_for_loop_bound;
5039 if (vect_print_dump_info (REPORT_DETAILS))
5040 fprintf (vect_dump, "=== vec_transform_loop ===");
5042 /* Peel the loop if there are data refs with unknown alignment.
5043 Only one store data ref with unknown alignment is allowed. */
5045 if (LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo))
5046 vect_do_peeling_for_alignment (loop_vinfo);
5048 do_peeling_for_loop_bound
5049 = (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
5050 || (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
5051 && LOOP_VINFO_INT_NITERS (loop_vinfo) % vectorization_factor != 0)
5052 || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo));
5054 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
5055 || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
5056 vect_loop_versioning (loop_vinfo,
5057 !do_peeling_for_loop_bound,
5058 &cond_expr, &cond_expr_stmt_list);
5060 /* If the loop has a symbolic number of iterations 'n' (i.e. it's not a
5061 compile time constant), or it is a constant that is not divisible by the
5062 vectorization factor, then an epilog loop needs to be created.
5063 We therefore duplicate the loop: the original loop will be vectorized,
5064 and will compute the first (n/VF) iterations. The second copy of the loop
5065 will remain scalar and will compute the remaining (n%VF) iterations.
5066 (VF is the vectorization factor). */
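/* Editor's illustration, not part of the vectorizer: the duplication in
   source form for VF = 4.  The first loop is the one that gets
   vectorized; the second is the scalar epilog.  Compile-guarded sketch
   with hypothetical arrays.  */
#if 0
int a[1000], b[1000], c[1000], i, n = 1000;

for (i = 0; i < (n / 4) * 4; i++)  /* first (n/VF)*VF iterations: vectorized */
  a[i] = b[i] + c[i];
for (; i < n; i++)                 /* remaining n%VF iterations: scalar      */
  a[i] = b[i] + c[i];
#endif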
5068 if (do_peeling_for_loop_bound)
5069 vect_do_peeling_for_loop_bound (loop_vinfo, &ratio,
5070 cond_expr, cond_expr_stmt_list);
5071 else
5072 ratio = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
5073 LOOP_VINFO_INT_NITERS (loop_vinfo) / vectorization_factor);
5075 /* 1) Make sure the loop header has exactly two entries
5076 2) Make sure we have a preheader basic block. */
5078 gcc_assert (EDGE_COUNT (loop->header->preds) == 2);
5080 split_edge (loop_preheader_edge (loop));
5082 /* FORNOW: the vectorizer supports only loops whose body consists
5083 of one basic block (header + empty latch). When the vectorizer
5084 supports more involved loop forms, the order in which the BBs are
5085 traversed needs to be reconsidered. */
5087 for (i = 0; i < nbbs; i++)
5089 basic_block bb = bbs[i];
5090 stmt_vec_info stmt_info;
5091 gimple phi;
5093 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
5095 phi = gsi_stmt (si);
5096 if (vect_print_dump_info (REPORT_DETAILS))
5098 fprintf (vect_dump, "------>vectorizing phi: ");
5099 print_gimple_stmt (vect_dump, phi, 0, TDF_SLIM);
5101 stmt_info = vinfo_for_stmt (phi);
5102 if (!stmt_info)
5103 continue;
5105 if (MAY_HAVE_DEBUG_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
5106 vect_loop_kill_debug_uses (loop, phi);
5108 if (!STMT_VINFO_RELEVANT_P (stmt_info)
5109 && !STMT_VINFO_LIVE_P (stmt_info))
5110 continue;
5112 if ((TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info))
5113 != (unsigned HOST_WIDE_INT) vectorization_factor)
5114 && vect_print_dump_info (REPORT_DETAILS))
5115 fprintf (vect_dump, "multiple-types.");
5117 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def)
5119 if (vect_print_dump_info (REPORT_DETAILS))
5120 fprintf (vect_dump, "transform phi.");
5121 vect_transform_stmt (phi, NULL, NULL, NULL, NULL);
5125 for (si = gsi_start_bb (bb); !gsi_end_p (si);)
5127 gimple stmt = gsi_stmt (si);
5128 bool is_store;
5130 if (vect_print_dump_info (REPORT_DETAILS))
5132 fprintf (vect_dump, "------>vectorizing statement: ");
5133 print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
5136 stmt_info = vinfo_for_stmt (stmt);
5138 /* vector stmts created in the outer-loop during vectorization of
5139 stmts in an inner-loop may not have a stmt_info, and do not
5140 need to be vectorized. */
5141 if (!stmt_info)
5143 gsi_next (&si);
5144 continue;
5147 if (MAY_HAVE_DEBUG_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
5148 vect_loop_kill_debug_uses (loop, stmt);
5150 if (!STMT_VINFO_RELEVANT_P (stmt_info)
5151 && !STMT_VINFO_LIVE_P (stmt_info))
5153 gsi_next (&si);
5154 continue;
5157 gcc_assert (STMT_VINFO_VECTYPE (stmt_info));
5158 nunits =
5159 (unsigned int) TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
5160 if (!STMT_SLP_TYPE (stmt_info)
5161 && nunits != (unsigned int) vectorization_factor
5162 && vect_print_dump_info (REPORT_DETAILS))
5163 /* For SLP, VF is set according to the unrolling factor, not to the
5164 vector size, hence this print is not valid for SLP. */
5165 fprintf (vect_dump, "multiple-types.");
5167 /* SLP. Schedule all the SLP instances when the first SLP stmt is
5168 reached. */
5169 if (STMT_SLP_TYPE (stmt_info))
5171 if (!slp_scheduled)
5173 slp_scheduled = true;
5175 if (vect_print_dump_info (REPORT_DETAILS))
5176 fprintf (vect_dump, "=== scheduling SLP instances ===");
5178 vect_schedule_slp (loop_vinfo, NULL);
5181 /* Hybrid SLP stmts must be vectorized in addition to SLP. */
5182 if (!vinfo_for_stmt (stmt) || PURE_SLP_STMT (stmt_info))
5184 gsi_next (&si);
5185 continue;
5189 /* -------- vectorize statement ------------ */
5190 if (vect_print_dump_info (REPORT_DETAILS))
5191 fprintf (vect_dump, "transform statement.");
5193 strided_store = false;
5194 is_store = vect_transform_stmt (stmt, &si, &strided_store, NULL, NULL);
5195 if (is_store)
5197 if (STMT_VINFO_STRIDED_ACCESS (stmt_info))
5199 /* Interleaving. If IS_STORE is TRUE, the vectorization of the
5200 interleaving chain was completed - free all the stores in
5201 the chain. */
5202 vect_remove_stores (GROUP_FIRST_ELEMENT (stmt_info));
5203 gsi_remove (&si, true);
5204 continue;
5206 else
5208 /* Free the attached stmt_vec_info and remove the stmt. */
5209 free_stmt_vec_info (stmt);
5210 gsi_remove (&si, true);
5211 continue;
5214 gsi_next (&si);
5215 } /* stmts in BB */
5216 } /* BBs in loop */
5218 slpeel_make_loop_iterate_ntimes (loop, ratio);
5220 /* The memory tags and pointers in vectorized statements need to
5221 have their SSA forms updated. FIXME, why can't this be delayed
5222 until all the loops have been transformed? */
5223 update_ssa (TODO_update_ssa);
5225 if (vect_print_dump_info (REPORT_VECTORIZED_LOCATIONS))
5226 fprintf (vect_dump, "LOOP VECTORIZED.");
5227 if (loop->inner && vect_print_dump_info (REPORT_VECTORIZED_LOCATIONS))
5228 fprintf (vect_dump, "OUTER LOOP VECTORIZED.");