[official-gcc.git] / gcc / tree-vect-loop.c
blob 908caed0b57137b36a12447401e73d92e1ee535b
1 /* Loop Vectorization
2 Copyright (C) 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012
3 Free Software Foundation, Inc.
4 Contributed by Dorit Naishlos <dorit@il.ibm.com> and
5 Ira Rosen <irar@il.ibm.com>
7 This file is part of GCC.
9 GCC is free software; you can redistribute it and/or modify it under
10 the terms of the GNU General Public License as published by the Free
11 Software Foundation; either version 3, or (at your option) any later
12 version.
14 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
15 WARRANTY; without even the implied warranty of MERCHANTABILITY or
16 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
17 for more details.
19 You should have received a copy of the GNU General Public License
20 along with GCC; see the file COPYING3. If not see
21 <http://www.gnu.org/licenses/>. */
23 #include "config.h"
24 #include "system.h"
25 #include "coretypes.h"
26 #include "dumpfile.h"
27 #include "tm.h"
28 #include "ggc.h"
29 #include "tree.h"
30 #include "basic-block.h"
31 #include "gimple-pretty-print.h"
32 #include "tree-flow.h"
33 #include "tree-pass.h"
34 #include "cfgloop.h"
35 #include "expr.h"
36 #include "recog.h"
37 #include "optabs.h"
38 #include "params.h"
39 #include "diagnostic-core.h"
40 #include "tree-chrec.h"
41 #include "tree-scalar-evolution.h"
42 #include "tree-vectorizer.h"
43 #include "target.h"
45 /* Loop Vectorization Pass.
47 This pass tries to vectorize loops.
49 For example, the vectorizer transforms the following simple loop:
51 short a[N]; short b[N]; short c[N]; int i;
53 for (i=0; i<N; i++){
54 a[i] = b[i] + c[i];
57 as if it were manually vectorized by rewriting the source code into:
59 typedef int __attribute__((mode(V8HI))) v8hi;
60 short a[N]; short b[N]; short c[N]; int i;
61 v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
62 v8hi va, vb, vc;
64 for (i=0; i<N/8; i++){
65 vb = pb[i];
66 vc = pc[i];
67 va = vb + vc;
68 pa[i] = va;
71 The main entry to this pass is vectorize_loops(), in which
72 the vectorizer applies a set of analyses on a given set of loops,
73 followed by the actual vectorization transformation for the loops that
74 had successfully passed the analysis phase.
75 Throughout this pass we make a distinction between two types of
76 data: scalars (which are represented by SSA_NAMES), and memory references
77 ("data-refs"). These two types of data require different handling both
78 during analysis and transformation. The types of data-refs that the
79 vectorizer currently supports are ARRAY_REFS whose base is an array DECL
80 (not a pointer), and INDIRECT_REFS through pointers; both array and pointer
81 accesses are required to have a simple (consecutive) access pattern.
83 Analysis phase:
84 ===============
85 The driver for the analysis phase is vect_analyze_loop().
86 It applies a set of analyses, some of which rely on the scalar evolution
87 analyzer (scev) developed by Sebastian Pop.
89 During the analysis phase the vectorizer records some information
90 per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
91 loop, as well as general information about the loop as a whole, which is
92 recorded in a "loop_vec_info" struct attached to each loop.
94 Transformation phase:
95 =====================
96 The loop transformation phase scans all the stmts in the loop, and
97 creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
98 the loop that needs to be vectorized. It inserts the vector code sequence
99 just before the scalar stmt S, and records a pointer to the vector code
100 in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
101 attached to S). This pointer will be used for the vectorization of following
102 stmts which use the def of stmt S. Stmt S is removed if it writes to memory;
103 otherwise, we rely on dead code elimination for removing it.
105 For example, say stmt S1 was vectorized into stmt VS1:
107 VS1: vb = px[i];
108 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
109 S2: a = b;
111 To vectorize stmt S2, the vectorizer first finds the stmt that defines
112 the operand 'b' (S1), and gets the relevant vector def 'vb' from the
113 vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)). The
114 resulting sequence would be:
116 VS1: vb = px[i];
117 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
118 VS2: va = vb;
119 S2: a = b; STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2
121 Operands that are not SSA_NAMEs are data-refs that appear in
122 load/store operations (like 'x[i]' in S1), and are handled differently.
124 Target modeling:
125 =================
126 Currently the only target specific information that is used is the
127 size of the vector (in bytes) - "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".
128 Targets that can support different vector sizes will, for now, need
129 to specify one value for "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD". More
130 flexibility will be added in the future.
132 Since we only vectorize operations whose vector form can be
133 expressed using existing tree codes, to verify that an operation is
134 supported, the vectorizer checks the relevant optab at the relevant
135 machine_mode (e.g., optab_handler (add_optab, V8HImode)). If
136 the value found is CODE_FOR_nothing, then there's no target support, and
137 we can't vectorize the stmt.
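   As an illustration only (a minimal sketch under the assumptions above,
   not a function defined in this file), such a support check could look
   like:

     static bool
     example_target_supports_v8hi_add (void)
     {
       return optab_handler (add_optab, V8HImode) != CODE_FOR_nothing;
     }

   where CODE_FOR_nothing indicates that the target provides no instruction
   implementing the optab for the requested mode.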
139 For additional information on this project see:
140 http://gcc.gnu.org/projects/tree-ssa/vectorization.html
143 static void vect_estimate_min_profitable_iters (loop_vec_info, int *, int *);
145 /* Function vect_determine_vectorization_factor
147 Determine the vectorization factor (VF). VF is the number of data elements
148 that are operated upon in parallel in a single iteration of the vectorized
149 loop. For example, when vectorizing a loop that operates on 4-byte elements,
150 on a target with a vector size (VS) of 16 bytes, the VF is set to 4, since 4
151 elements can fit in a single vector register.
153 We currently support vectorization of loops in which all types operated upon
154 are of the same size. Therefore this function currently sets VF according to
155 the size of the types operated upon, and fails if there are multiple sizes
156 in the loop.
158 VF is also the factor by which the loop iterations are strip-mined, e.g.:
159 original loop:
160 for (i=0; i<N; i++){
161 a[i] = b[i] + c[i];
164 vectorized loop:
165 for (i=0; i<N; i+=VF){
166 a[i:VF] = b[i:VF] + c[i:VF];
170 static bool
171 vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
173 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
174 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
175 int nbbs = loop->num_nodes;
176 gimple_stmt_iterator si;
177 unsigned int vectorization_factor = 0;
178 tree scalar_type;
179 gimple phi;
180 tree vectype;
181 unsigned int nunits;
182 stmt_vec_info stmt_info;
183 int i;
184 HOST_WIDE_INT dummy;
185 gimple stmt, pattern_stmt = NULL;
186 gimple_seq pattern_def_seq = NULL;
187 gimple_stmt_iterator pattern_def_si = gsi_none ();
188 bool analyze_pattern_stmt = false;
190 if (dump_enabled_p ())
191 dump_printf_loc (MSG_NOTE, vect_location,
192 "=== vect_determine_vectorization_factor ===");
194 for (i = 0; i < nbbs; i++)
196 basic_block bb = bbs[i];
198 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
200 phi = gsi_stmt (si);
201 stmt_info = vinfo_for_stmt (phi);
202 if (dump_enabled_p ())
204 dump_printf_loc (MSG_NOTE, vect_location, "==> examining phi: ");
205 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
208 gcc_assert (stmt_info);
210 if (STMT_VINFO_RELEVANT_P (stmt_info))
212 gcc_assert (!STMT_VINFO_VECTYPE (stmt_info));
213 scalar_type = TREE_TYPE (PHI_RESULT (phi));
215 if (dump_enabled_p ())
217 dump_printf_loc (MSG_NOTE, vect_location,
218 "get vectype for scalar type: ");
219 dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
222 vectype = get_vectype_for_scalar_type (scalar_type);
223 if (!vectype)
225 if (dump_enabled_p ())
227 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
228 "not vectorized: unsupported "
229 "data-type ");
230 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
231 scalar_type);
233 return false;
235 STMT_VINFO_VECTYPE (stmt_info) = vectype;
237 if (dump_enabled_p ())
239 dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
240 dump_generic_expr (MSG_NOTE, TDF_SLIM, vectype);
243 nunits = TYPE_VECTOR_SUBPARTS (vectype);
244 if (dump_enabled_p ())
245 dump_printf_loc (MSG_NOTE, vect_location, "nunits = %d", nunits);
247 if (!vectorization_factor
248 || (nunits > vectorization_factor))
249 vectorization_factor = nunits;
253 for (si = gsi_start_bb (bb); !gsi_end_p (si) || analyze_pattern_stmt;)
255 tree vf_vectype;
257 if (analyze_pattern_stmt)
258 stmt = pattern_stmt;
259 else
260 stmt = gsi_stmt (si);
262 stmt_info = vinfo_for_stmt (stmt);
264 if (dump_enabled_p ())
266 dump_printf_loc (MSG_NOTE, vect_location,
267 "==> examining statement: ");
268 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
271 gcc_assert (stmt_info);
273 /* Skip stmts which do not need to be vectorized. */
274 if (!STMT_VINFO_RELEVANT_P (stmt_info)
275 && !STMT_VINFO_LIVE_P (stmt_info))
277 if (STMT_VINFO_IN_PATTERN_P (stmt_info)
278 && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
279 && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
280 || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
282 stmt = pattern_stmt;
283 stmt_info = vinfo_for_stmt (pattern_stmt);
284 if (dump_enabled_p ())
286 dump_printf_loc (MSG_NOTE, vect_location,
287 "==> examining pattern statement: ");
288 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
291 else
293 if (dump_enabled_p ())
294 dump_printf_loc (MSG_NOTE, vect_location, "skip.");
295 gsi_next (&si);
296 continue;
299 else if (STMT_VINFO_IN_PATTERN_P (stmt_info)
300 && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
301 && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
302 || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
303 analyze_pattern_stmt = true;
305 /* If a pattern statement has def stmts, analyze them too. */
306 if (is_pattern_stmt_p (stmt_info))
308 if (pattern_def_seq == NULL)
310 pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
311 pattern_def_si = gsi_start (pattern_def_seq);
313 else if (!gsi_end_p (pattern_def_si))
314 gsi_next (&pattern_def_si);
315 if (pattern_def_seq != NULL)
317 gimple pattern_def_stmt = NULL;
318 stmt_vec_info pattern_def_stmt_info = NULL;
320 while (!gsi_end_p (pattern_def_si))
322 pattern_def_stmt = gsi_stmt (pattern_def_si);
323 pattern_def_stmt_info
324 = vinfo_for_stmt (pattern_def_stmt);
325 if (STMT_VINFO_RELEVANT_P (pattern_def_stmt_info)
326 || STMT_VINFO_LIVE_P (pattern_def_stmt_info))
327 break;
328 gsi_next (&pattern_def_si);
331 if (!gsi_end_p (pattern_def_si))
333 if (dump_enabled_p ())
335 dump_printf_loc (MSG_NOTE, vect_location,
336 "==> examining pattern def stmt: ");
337 dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
338 pattern_def_stmt, 0);
341 stmt = pattern_def_stmt;
342 stmt_info = pattern_def_stmt_info;
344 else
346 pattern_def_si = gsi_none ();
347 analyze_pattern_stmt = false;
350 else
351 analyze_pattern_stmt = false;
354 if (gimple_get_lhs (stmt) == NULL_TREE)
356 if (dump_enabled_p ())
358 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
359 "not vectorized: irregular stmt.");
360 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt,
363 return false;
366 if (VECTOR_MODE_P (TYPE_MODE (gimple_expr_type (stmt))))
368 if (dump_enabled_p ())
370 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
371 "not vectorized: vector stmt in loop:");
372 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
374 return false;
377 if (STMT_VINFO_VECTYPE (stmt_info))
379 /* The only case in which a vectype has already been set is for stmts
380 that contain a dataref, or for "pattern-stmts" (stmts
381 generated by the vectorizer to represent/replace a certain
382 idiom). */
383 gcc_assert (STMT_VINFO_DATA_REF (stmt_info)
384 || is_pattern_stmt_p (stmt_info)
385 || !gsi_end_p (pattern_def_si));
386 vectype = STMT_VINFO_VECTYPE (stmt_info);
388 else
390 gcc_assert (!STMT_VINFO_DATA_REF (stmt_info));
391 scalar_type = TREE_TYPE (gimple_get_lhs (stmt));
392 if (dump_enabled_p ())
394 dump_printf_loc (MSG_NOTE, vect_location,
395 "get vectype for scalar type: ");
396 dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
398 vectype = get_vectype_for_scalar_type (scalar_type);
399 if (!vectype)
401 if (dump_enabled_p ())
403 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
404 "not vectorized: unsupported "
405 "data-type ");
406 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
407 scalar_type);
409 return false;
412 STMT_VINFO_VECTYPE (stmt_info) = vectype;
415 /* The vectorization factor is determined by the smallest
416 scalar type (or the largest vector size, but we only
417 support one vector size per loop). */
418 scalar_type = vect_get_smallest_scalar_type (stmt, &dummy,
419 &dummy);
420 if (dump_enabled_p ())
422 dump_printf_loc (MSG_NOTE, vect_location,
423 "get vectype for scalar type: ");
424 dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
426 vf_vectype = get_vectype_for_scalar_type (scalar_type);
427 if (!vf_vectype)
429 if (dump_enabled_p ())
431 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
432 "not vectorized: unsupported data-type ");
433 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
434 scalar_type);
436 return false;
439 if ((GET_MODE_SIZE (TYPE_MODE (vectype))
440 != GET_MODE_SIZE (TYPE_MODE (vf_vectype))))
442 if (dump_enabled_p ())
444 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
445 "not vectorized: different sized vector "
446 "types in statement, ");
447 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
448 vectype);
449 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
450 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
451 vf_vectype);
453 return false;
456 if (dump_enabled_p ())
458 dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
459 dump_generic_expr (MSG_NOTE, TDF_SLIM, vf_vectype);
462 nunits = TYPE_VECTOR_SUBPARTS (vf_vectype);
463 if (dump_enabled_p ())
464 dump_printf_loc (MSG_NOTE, vect_location, "nunits = %d", nunits);
465 if (!vectorization_factor
466 || (nunits > vectorization_factor))
467 vectorization_factor = nunits;
469 if (!analyze_pattern_stmt && gsi_end_p (pattern_def_si))
471 pattern_def_seq = NULL;
472 gsi_next (&si);
477 /* TODO: Analyze cost. Decide if worth while to vectorize. */
478 if (dump_enabled_p ())
479 dump_printf_loc (MSG_NOTE, vect_location, "vectorization factor = %d",
480 vectorization_factor);
481 if (vectorization_factor <= 1)
483 if (dump_enabled_p ())
484 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
485 "not vectorized: unsupported data-type");
486 return false;
488 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
490 return true;
494 /* Function vect_is_simple_iv_evolution.
496 FORNOW: A simple evolution of an induction variable in the loop is
497 considered a polynomial evolution with constant step. */
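/* For illustration (an assumed example, not taken from this file): in a
   loop such as "for (i = 0; i < n; i++) ...", the scalar evolution of I is
   the polynomial chrec {0, +, 1}_1, i.e. initial value 0 with a constant
   step of 1 in loop number 1, which is "simple" in the sense used here.
   An evolution whose step is itself a chrec (degree >= 2) is rejected.  */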
499 static bool
500 vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init,
501 tree * step)
503 tree init_expr;
504 tree step_expr;
505 tree evolution_part = evolution_part_in_loop_num (access_fn, loop_nb);
507 /* When there is no evolution in this loop, the evolution function
508 is not "simple". */
509 if (evolution_part == NULL_TREE)
510 return false;
512 /* When the evolution is a polynomial of degree >= 2
513 the evolution function is not "simple". */
514 if (tree_is_chrec (evolution_part))
515 return false;
517 step_expr = evolution_part;
518 init_expr = unshare_expr (initial_condition_in_loop_num (access_fn, loop_nb));
520 if (dump_enabled_p ())
522 dump_printf_loc (MSG_NOTE, vect_location, "step: ");
523 dump_generic_expr (MSG_NOTE, TDF_SLIM, step_expr);
524 dump_printf (MSG_NOTE, ", init: ");
525 dump_generic_expr (MSG_NOTE, TDF_SLIM, init_expr);
528 *init = init_expr;
529 *step = step_expr;
531 if (TREE_CODE (step_expr) != INTEGER_CST)
533 if (dump_enabled_p ())
534 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
535 "step unknown.");
536 return false;
539 return true;
542 /* Function vect_analyze_scalar_cycles_1.
544 Examine the cross iteration def-use cycles of scalar variables
545 in LOOP. LOOP_VINFO represents the loop that is now being
546 considered for vectorization (can be LOOP, or an outer-loop
547 enclosing LOOP). */
549 static void
550 vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, struct loop *loop)
552 basic_block bb = loop->header;
553 tree dumy;
554 VEC(gimple,heap) *worklist = VEC_alloc (gimple, heap, 64);
555 gimple_stmt_iterator gsi;
556 bool double_reduc;
558 if (dump_enabled_p ())
559 dump_printf_loc (MSG_NOTE, vect_location,
560 "=== vect_analyze_scalar_cycles ===");
562 /* First - identify all inductions. Reduction detection assumes that all the
563 inductions have been identified, therefore, this order must not be
564 changed. */
565 for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
567 gimple phi = gsi_stmt (gsi);
568 tree access_fn = NULL;
569 tree def = PHI_RESULT (phi);
570 stmt_vec_info stmt_vinfo = vinfo_for_stmt (phi);
572 if (dump_enabled_p ())
574 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: ");
575 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
578 /* Skip virtual phi's. The data dependences that are associated with
579 virtual defs/uses (i.e., memory accesses) are analyzed elsewhere. */
580 if (virtual_operand_p (def))
581 continue;
583 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_unknown_def_type;
585 /* Analyze the evolution function. */
586 access_fn = analyze_scalar_evolution (loop, def);
587 if (access_fn)
589 STRIP_NOPS (access_fn);
590 if (dump_enabled_p ())
592 dump_printf_loc (MSG_NOTE, vect_location,
593 "Access function of PHI: ");
594 dump_generic_expr (MSG_NOTE, TDF_SLIM, access_fn);
596 STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo)
597 = evolution_part_in_loop_num (access_fn, loop->num);
600 if (!access_fn
601 || !vect_is_simple_iv_evolution (loop->num, access_fn, &dumy, &dumy))
603 VEC_safe_push (gimple, heap, worklist, phi);
604 continue;
607 gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo) != NULL_TREE);
609 if (dump_enabled_p ())
610 dump_printf_loc (MSG_NOTE, vect_location, "Detected induction.");
611 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_induction_def;
615 /* Second - identify all reductions and nested cycles. */
616 while (VEC_length (gimple, worklist) > 0)
618 gimple phi = VEC_pop (gimple, worklist);
619 tree def = PHI_RESULT (phi);
620 stmt_vec_info stmt_vinfo = vinfo_for_stmt (phi);
621 gimple reduc_stmt;
622 bool nested_cycle;
624 if (dump_enabled_p ())
626 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: ");
627 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
630 gcc_assert (!virtual_operand_p (def)
631 && STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_unknown_def_type);
633 nested_cycle = (loop != LOOP_VINFO_LOOP (loop_vinfo));
634 reduc_stmt = vect_force_simple_reduction (loop_vinfo, phi, !nested_cycle,
635 &double_reduc);
636 if (reduc_stmt)
638 if (double_reduc)
640 if (dump_enabled_p ())
641 dump_printf_loc (MSG_NOTE, vect_location,
642 "Detected double reduction.");
644 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_double_reduction_def;
645 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
646 vect_double_reduction_def;
648 else
650 if (nested_cycle)
652 if (dump_enabled_p ())
653 dump_printf_loc (MSG_NOTE, vect_location,
654 "Detected vectorizable nested cycle.");
656 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_nested_cycle;
657 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
658 vect_nested_cycle;
660 else
662 if (dump_enabled_p ())
663 dump_printf_loc (MSG_NOTE, vect_location,
664 "Detected reduction.");
666 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def;
667 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
668 vect_reduction_def;
669 /* Store the reduction cycles for possible vectorization in
670 loop-aware SLP. */
671 VEC_safe_push (gimple, heap,
672 LOOP_VINFO_REDUCTIONS (loop_vinfo),
673 reduc_stmt);
677 else
678 if (dump_enabled_p ())
679 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
680 "Unknown def-use cycle pattern.");
683 VEC_free (gimple, heap, worklist);
687 /* Function vect_analyze_scalar_cycles.
689 Examine the cross iteration def-use cycles of scalar variables, by
690 analyzing the loop-header PHIs of scalar variables. Classify each
691 cycle as one of the following: invariant, induction, reduction, unknown.
692 We do that for the loop represented by LOOP_VINFO, and also for its
693 inner-loop, if it exists.
694 Examples for scalar cycles:
696 Example1: reduction:
698 loop1:
699 for (i=0; i<N; i++)
700 sum += a[i];
702 Example2: induction:
704 loop2:
705 for (i=0; i<N; i++)
706 a[i] = i; */
708 static void
709 vect_analyze_scalar_cycles (loop_vec_info loop_vinfo)
711 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
713 vect_analyze_scalar_cycles_1 (loop_vinfo, loop);
715 /* When vectorizing an outer-loop, the inner-loop is executed sequentially.
716 Reductions in such an inner-loop therefore have different properties than
717 the reductions in the nest that gets vectorized:
718 1. When vectorized, they are executed in the same order as in the original
719 scalar loop, so we can't change the order of computation when
720 vectorizing them.
721 2. FIXME: Inner-loop reductions can be used in the inner-loop, so the
722 current checks are too strict. */
724 if (loop->inner)
725 vect_analyze_scalar_cycles_1 (loop_vinfo, loop->inner);
728 /* Function vect_get_loop_niters.
730 Determine how many iterations the loop executes.
731 If an expression that represents the number of iterations
732 can be constructed, place it in NUMBER_OF_ITERATIONS.
733 Return the loop exit condition. */
735 static gimple
736 vect_get_loop_niters (struct loop *loop, tree *number_of_iterations)
738 tree niters;
740 if (dump_enabled_p ())
741 dump_printf_loc (MSG_NOTE, vect_location,
742 "=== get_loop_niters ===");
743 niters = number_of_exit_cond_executions (loop);
745 if (niters != NULL_TREE
746 && niters != chrec_dont_know)
748 *number_of_iterations = niters;
750 if (dump_enabled_p ())
752 dump_printf_loc (MSG_NOTE, vect_location, "==> get_loop_niters:");
753 dump_generic_expr (MSG_NOTE, TDF_SLIM, *number_of_iterations);
757 return get_loop_exit_condition (loop);
761 /* Function bb_in_loop_p
763 Used as predicate for dfs order traversal of the loop bbs. */
765 static bool
766 bb_in_loop_p (const_basic_block bb, const void *data)
768 const struct loop *const loop = (const struct loop *)data;
769 if (flow_bb_inside_loop_p (loop, bb))
770 return true;
771 return false;
775 /* Function new_loop_vec_info.
777 Create and initialize a new loop_vec_info struct for LOOP, as well as
778 stmt_vec_info structs for all the stmts in LOOP. */
780 static loop_vec_info
781 new_loop_vec_info (struct loop *loop)
783 loop_vec_info res;
784 basic_block *bbs;
785 gimple_stmt_iterator si;
786 unsigned int i, nbbs;
788 res = (loop_vec_info) xcalloc (1, sizeof (struct _loop_vec_info));
789 LOOP_VINFO_LOOP (res) = loop;
791 bbs = get_loop_body (loop);
793 /* Create/Update stmt_info for all stmts in the loop. */
794 for (i = 0; i < loop->num_nodes; i++)
796 basic_block bb = bbs[i];
798 /* BBs in a nested inner-loop will have been already processed (because
799 we will have called vect_analyze_loop_form for any nested inner-loop).
800 Therefore, for stmts in an inner-loop we just want to update the
801 STMT_VINFO_LOOP_VINFO field of their stmt_info to point to the new
802 loop_info of the outer-loop we are currently considering to vectorize
803 (instead of the loop_info of the inner-loop).
804 For stmts in other BBs we need to create a stmt_info from scratch. */
805 if (bb->loop_father != loop)
807 /* Inner-loop bb. */
808 gcc_assert (loop->inner && bb->loop_father == loop->inner);
809 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
811 gimple phi = gsi_stmt (si);
812 stmt_vec_info stmt_info = vinfo_for_stmt (phi);
813 loop_vec_info inner_loop_vinfo =
814 STMT_VINFO_LOOP_VINFO (stmt_info);
815 gcc_assert (loop->inner == LOOP_VINFO_LOOP (inner_loop_vinfo));
816 STMT_VINFO_LOOP_VINFO (stmt_info) = res;
818 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
820 gimple stmt = gsi_stmt (si);
821 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
822 loop_vec_info inner_loop_vinfo =
823 STMT_VINFO_LOOP_VINFO (stmt_info);
824 gcc_assert (loop->inner == LOOP_VINFO_LOOP (inner_loop_vinfo));
825 STMT_VINFO_LOOP_VINFO (stmt_info) = res;
828 else
830 /* bb in current nest. */
831 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
833 gimple phi = gsi_stmt (si);
834 gimple_set_uid (phi, 0);
835 set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, res, NULL));
838 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
840 gimple stmt = gsi_stmt (si);
841 gimple_set_uid (stmt, 0);
842 set_vinfo_for_stmt (stmt, new_stmt_vec_info (stmt, res, NULL));
847 /* CHECKME: We want to visit all BBs before their successors (except for
848 latch blocks, for which this assertion wouldn't hold). In the simple
849 case of the loop forms we allow, a dfs order of the BBs would be the same
850 as reversed postorder traversal, so we are safe. */
852 free (bbs);
853 bbs = XCNEWVEC (basic_block, loop->num_nodes);
854 nbbs = dfs_enumerate_from (loop->header, 0, bb_in_loop_p,
855 bbs, loop->num_nodes, loop);
856 gcc_assert (nbbs == loop->num_nodes);
858 LOOP_VINFO_BBS (res) = bbs;
859 LOOP_VINFO_NITERS (res) = NULL;
860 LOOP_VINFO_NITERS_UNCHANGED (res) = NULL;
861 LOOP_VINFO_COST_MODEL_MIN_ITERS (res) = 0;
862 LOOP_VINFO_VECTORIZABLE_P (res) = 0;
863 LOOP_PEELING_FOR_ALIGNMENT (res) = 0;
864 LOOP_VINFO_VECT_FACTOR (res) = 0;
865 LOOP_VINFO_LOOP_NEST (res) = VEC_alloc (loop_p, heap, 3);
866 LOOP_VINFO_DATAREFS (res) = VEC_alloc (data_reference_p, heap, 10);
867 LOOP_VINFO_DDRS (res) = VEC_alloc (ddr_p, heap, 10 * 10);
868 LOOP_VINFO_UNALIGNED_DR (res) = NULL;
869 LOOP_VINFO_MAY_MISALIGN_STMTS (res) =
870 VEC_alloc (gimple, heap,
871 PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIGNMENT_CHECKS));
872 LOOP_VINFO_MAY_ALIAS_DDRS (res) =
873 VEC_alloc (ddr_p, heap,
874 PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS));
875 LOOP_VINFO_GROUPED_STORES (res) = VEC_alloc (gimple, heap, 10);
876 LOOP_VINFO_REDUCTIONS (res) = VEC_alloc (gimple, heap, 10);
877 LOOP_VINFO_REDUCTION_CHAINS (res) = VEC_alloc (gimple, heap, 10);
878 LOOP_VINFO_SLP_INSTANCES (res) = VEC_alloc (slp_instance, heap, 10);
879 LOOP_VINFO_SLP_UNROLLING_FACTOR (res) = 1;
880 LOOP_VINFO_PEELING_HTAB (res) = NULL;
881 LOOP_VINFO_TARGET_COST_DATA (res) = init_cost (loop);
882 LOOP_VINFO_PEELING_FOR_GAPS (res) = false;
883 LOOP_VINFO_OPERANDS_SWAPPED (res) = false;
885 return res;
889 /* Function destroy_loop_vec_info.
891 Free LOOP_VINFO struct, as well as all the stmt_vec_info structs of all the
892 stmts in the loop. */
894 void
895 destroy_loop_vec_info (loop_vec_info loop_vinfo, bool clean_stmts)
897 struct loop *loop;
898 basic_block *bbs;
899 int nbbs;
900 gimple_stmt_iterator si;
901 int j;
902 VEC (slp_instance, heap) *slp_instances;
903 slp_instance instance;
904 bool swapped;
906 if (!loop_vinfo)
907 return;
909 loop = LOOP_VINFO_LOOP (loop_vinfo);
911 bbs = LOOP_VINFO_BBS (loop_vinfo);
912 nbbs = loop->num_nodes;
913 swapped = LOOP_VINFO_OPERANDS_SWAPPED (loop_vinfo);
915 if (!clean_stmts)
917 free (LOOP_VINFO_BBS (loop_vinfo));
918 free_data_refs (LOOP_VINFO_DATAREFS (loop_vinfo));
919 free_dependence_relations (LOOP_VINFO_DDRS (loop_vinfo));
920 VEC_free (loop_p, heap, LOOP_VINFO_LOOP_NEST (loop_vinfo));
921 VEC_free (gimple, heap, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo));
922 VEC_free (ddr_p, heap, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo));
924 free (loop_vinfo);
925 loop->aux = NULL;
926 return;
929 for (j = 0; j < nbbs; j++)
931 basic_block bb = bbs[j];
932 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
933 free_stmt_vec_info (gsi_stmt (si));
935 for (si = gsi_start_bb (bb); !gsi_end_p (si); )
937 gimple stmt = gsi_stmt (si);
939 /* We may have broken canonical form by moving a constant
940 into RHS1 of a commutative op. Fix such occurrences. */
941 if (swapped && is_gimple_assign (stmt))
943 enum tree_code code = gimple_assign_rhs_code (stmt);
945 if ((code == PLUS_EXPR
946 || code == POINTER_PLUS_EXPR
947 || code == MULT_EXPR)
948 && CONSTANT_CLASS_P (gimple_assign_rhs1 (stmt)))
949 swap_tree_operands (stmt,
950 gimple_assign_rhs1_ptr (stmt),
951 gimple_assign_rhs2_ptr (stmt));
954 /* Free stmt_vec_info. */
955 free_stmt_vec_info (stmt);
956 gsi_next (&si);
960 free (LOOP_VINFO_BBS (loop_vinfo));
961 free_data_refs (LOOP_VINFO_DATAREFS (loop_vinfo));
962 free_dependence_relations (LOOP_VINFO_DDRS (loop_vinfo));
963 VEC_free (loop_p, heap, LOOP_VINFO_LOOP_NEST (loop_vinfo));
964 VEC_free (gimple, heap, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo));
965 VEC_free (ddr_p, heap, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo));
966 slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
967 FOR_EACH_VEC_ELT (slp_instance, slp_instances, j, instance)
968 vect_free_slp_instance (instance);
970 VEC_free (slp_instance, heap, LOOP_VINFO_SLP_INSTANCES (loop_vinfo));
971 VEC_free (gimple, heap, LOOP_VINFO_GROUPED_STORES (loop_vinfo));
972 VEC_free (gimple, heap, LOOP_VINFO_REDUCTIONS (loop_vinfo));
973 VEC_free (gimple, heap, LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo));
975 if (LOOP_VINFO_PEELING_HTAB (loop_vinfo))
976 htab_delete (LOOP_VINFO_PEELING_HTAB (loop_vinfo));
978 destroy_cost_data (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo));
980 free (loop_vinfo);
981 loop->aux = NULL;
985 /* Function vect_analyze_loop_1.
987 Apply a set of analyses on LOOP, and create a loop_vec_info struct
988 for it. The different analyses will record information in the
989 loop_vec_info struct. This is a subset of the analyses applied in
990 vect_analyze_loop, to be applied on an inner-loop nested in the loop
991 that is now considered for (outer-loop) vectorization. */
993 static loop_vec_info
994 vect_analyze_loop_1 (struct loop *loop)
996 loop_vec_info loop_vinfo;
998 if (dump_enabled_p ())
999 dump_printf_loc (MSG_NOTE, vect_location,
1000 "===== analyze_loop_nest_1 =====");
1002 /* Check the CFG characteristics of the loop (nesting, entry/exit, etc.). */
1004 loop_vinfo = vect_analyze_loop_form (loop);
1005 if (!loop_vinfo)
1007 if (dump_enabled_p ())
1008 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1009 "bad inner-loop form.");
1010 return NULL;
1013 return loop_vinfo;
1017 /* Function vect_analyze_loop_form.
1019 Verify that certain CFG restrictions hold, including:
1020 - the loop has a pre-header
1021 - the loop has a single entry and exit
1022 - the loop exit condition is simple enough, and the number of iterations
1023 can be analyzed (a countable loop). */
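/* As an illustrative sketch (assumed source, not from this file): a loop
   like "for (i = 0; i < n; i++) a[i] = b[i];" meets these restrictions -
   it has a single entry and exit and a computable iteration count -
   whereas a search loop such as "while (*p && *p != c) p++;" exits on a
   data-dependent condition, so its iteration count cannot be analyzed and
   it is rejected.  */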
1025 loop_vec_info
1026 vect_analyze_loop_form (struct loop *loop)
1028 loop_vec_info loop_vinfo;
1029 gimple loop_cond;
1030 tree number_of_iterations = NULL;
1031 loop_vec_info inner_loop_vinfo = NULL;
1033 if (dump_enabled_p ())
1034 dump_printf_loc (MSG_NOTE, vect_location,
1035 "=== vect_analyze_loop_form ===");
1037 /* Different restrictions apply when we are considering an inner-most loop,
1038 vs. an outer (nested) loop.
1039 (FORNOW. May want to relax some of these restrictions in the future). */
1041 if (!loop->inner)
1043 /* Inner-most loop. We currently require that the number of BBs is
1044 exactly 2 (the header and latch). Vectorizable inner-most loops
1045 look like this:
1047 (pre-header)
1049 header <--------+
1050 | | |
1051 | +--> latch --+
1053 (exit-bb) */
1055 if (loop->num_nodes != 2)
1057 if (dump_enabled_p ())
1058 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1059 "not vectorized: control flow in loop.");
1060 return NULL;
1063 if (empty_block_p (loop->header))
1065 if (dump_enabled_p ())
1066 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1067 "not vectorized: empty loop.");
1068 return NULL;
1071 else
1073 struct loop *innerloop = loop->inner;
1074 edge entryedge;
1076 /* Nested loop. We currently require that the loop is doubly-nested,
1077 contains a single inner loop, and the number of BBs is exactly 5.
1078 Vectorizable outer-loops look like this:
1080 (pre-header)
1082 header <---+
1084 inner-loop |
1086 tail ------+
1088 (exit-bb)
1090 The inner-loop has the properties expected of inner-most loops
1091 as described above. */
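/* For instance (an illustrative example, not from this file), an
   outer-loop of the accepted shape might come from source like:

     for (i = 0; i < n; i++)
       for (j = 0; j < m; j++)
         a[i][j] = b[i][j] + c[i][j];

   where the outer loop consists of a header, the single inner loop, and a
   tail block, and the inner loop's iteration count M is invariant in the
   outer loop.  */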
1093 if ((loop->inner)->inner || (loop->inner)->next)
1095 if (dump_enabled_p ())
1096 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1097 "not vectorized: multiple nested loops.");
1098 return NULL;
1101 /* Analyze the inner-loop. */
1102 inner_loop_vinfo = vect_analyze_loop_1 (loop->inner);
1103 if (!inner_loop_vinfo)
1105 if (dump_enabled_p ())
1106 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1107 "not vectorized: Bad inner loop.");
1108 return NULL;
1111 if (!expr_invariant_in_loop_p (loop,
1112 LOOP_VINFO_NITERS (inner_loop_vinfo)))
1114 if (dump_enabled_p ())
1115 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1116 "not vectorized: inner-loop count not invariant.");
1117 destroy_loop_vec_info (inner_loop_vinfo, true);
1118 return NULL;
1121 if (loop->num_nodes != 5)
1123 if (dump_enabled_p ())
1124 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1125 "not vectorized: control flow in loop.");
1126 destroy_loop_vec_info (inner_loop_vinfo, true);
1127 return NULL;
1130 gcc_assert (EDGE_COUNT (innerloop->header->preds) == 2);
1131 entryedge = EDGE_PRED (innerloop->header, 0);
1132 if (EDGE_PRED (innerloop->header, 0)->src == innerloop->latch)
1133 entryedge = EDGE_PRED (innerloop->header, 1);
1135 if (entryedge->src != loop->header
1136 || !single_exit (innerloop)
1137 || single_exit (innerloop)->dest != EDGE_PRED (loop->latch, 0)->src)
1139 if (dump_enabled_p ())
1140 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1141 "not vectorized: unsupported outerloop form.");
1142 destroy_loop_vec_info (inner_loop_vinfo, true);
1143 return NULL;
1146 if (dump_enabled_p ())
1147 dump_printf_loc (MSG_NOTE, vect_location,
1148 "Considering outer-loop vectorization.");
1151 if (!single_exit (loop)
1152 || EDGE_COUNT (loop->header->preds) != 2)
1154 if (dump_enabled_p ())
1156 if (!single_exit (loop))
1157 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1158 "not vectorized: multiple exits.");
1159 else if (EDGE_COUNT (loop->header->preds) != 2)
1160 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1161 "not vectorized: too many incoming edges.");
1163 if (inner_loop_vinfo)
1164 destroy_loop_vec_info (inner_loop_vinfo, true);
1165 return NULL;
1168 /* We assume that the loop exit condition is at the end of the loop, i.e.,
1169 that the loop is represented as a do-while (with a proper if-guard
1170 before the loop if needed), where the loop header contains all the
1171 executable statements, and the latch is empty. */
1172 if (!empty_block_p (loop->latch)
1173 || !gimple_seq_empty_p (phi_nodes (loop->latch)))
1175 if (dump_enabled_p ())
1176 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1177 "not vectorized: unexpected loop form.");
1178 if (inner_loop_vinfo)
1179 destroy_loop_vec_info (inner_loop_vinfo, true);
1180 return NULL;
1183 /* Make sure there exists a single-predecessor exit bb: */
1184 if (!single_pred_p (single_exit (loop)->dest))
1186 edge e = single_exit (loop);
1187 if (!(e->flags & EDGE_ABNORMAL))
1189 split_loop_exit_edge (e);
1190 if (dump_enabled_p ())
1191 dump_printf (MSG_NOTE, "split exit edge.");
1193 else
1195 if (dump_enabled_p ())
1196 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1197 "not vectorized: abnormal loop exit edge.");
1198 if (inner_loop_vinfo)
1199 destroy_loop_vec_info (inner_loop_vinfo, true);
1200 return NULL;
1204 loop_cond = vect_get_loop_niters (loop, &number_of_iterations);
1205 if (!loop_cond)
1207 if (dump_enabled_p ())
1208 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1209 "not vectorized: complicated exit condition.");
1210 if (inner_loop_vinfo)
1211 destroy_loop_vec_info (inner_loop_vinfo, true);
1212 return NULL;
1215 if (!number_of_iterations)
1217 if (dump_enabled_p ())
1218 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1219 "not vectorized: number of iterations cannot be "
1220 "computed.");
1221 if (inner_loop_vinfo)
1222 destroy_loop_vec_info (inner_loop_vinfo, true);
1223 return NULL;
1226 if (chrec_contains_undetermined (number_of_iterations))
1228 if (dump_enabled_p ())
1229 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1230 "Infinite number of iterations.");
1231 if (inner_loop_vinfo)
1232 destroy_loop_vec_info (inner_loop_vinfo, true);
1233 return NULL;
1236 if (!NITERS_KNOWN_P (number_of_iterations))
1238 if (dump_enabled_p ())
1240 dump_printf_loc (MSG_NOTE, vect_location,
1241 "Symbolic number of iterations is ");
1242 dump_generic_expr (MSG_NOTE, TDF_DETAILS, number_of_iterations);
1245 else if (TREE_INT_CST_LOW (number_of_iterations) == 0)
1247 if (dump_enabled_p ())
1248 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1249 "not vectorized: number of iterations = 0.");
1250 if (inner_loop_vinfo)
1251 destroy_loop_vec_info (inner_loop_vinfo, false);
1252 return NULL;
1255 loop_vinfo = new_loop_vec_info (loop);
1256 LOOP_VINFO_NITERS (loop_vinfo) = number_of_iterations;
1257 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = number_of_iterations;
1259 STMT_VINFO_TYPE (vinfo_for_stmt (loop_cond)) = loop_exit_ctrl_vec_info_type;
1261 /* CHECKME: May want to keep it around in the future. */
1262 if (inner_loop_vinfo)
1263 destroy_loop_vec_info (inner_loop_vinfo, false);
1265 gcc_assert (!loop->aux);
1266 loop->aux = loop_vinfo;
1267 return loop_vinfo;
1271 /* Function vect_analyze_loop_operations.
1273 Scan the loop stmts and make sure they are all vectorizable. */
1275 static bool
1276 vect_analyze_loop_operations (loop_vec_info loop_vinfo, bool slp)
1278 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1279 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1280 int nbbs = loop->num_nodes;
1281 gimple_stmt_iterator si;
1282 unsigned int vectorization_factor = 0;
1283 int i;
1284 gimple phi;
1285 stmt_vec_info stmt_info;
1286 bool need_to_vectorize = false;
1287 int min_profitable_iters;
1288 int min_scalar_loop_bound;
1289 unsigned int th;
1290 bool only_slp_in_loop = true, ok;
1291 HOST_WIDE_INT max_niter;
1292 HOST_WIDE_INT estimated_niter;
1293 int min_profitable_estimate;
1295 if (dump_enabled_p ())
1296 dump_printf_loc (MSG_NOTE, vect_location,
1297 "=== vect_analyze_loop_operations ===");
1299 gcc_assert (LOOP_VINFO_VECT_FACTOR (loop_vinfo));
1300 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1301 if (slp)
1303 /* If all the stmts in the loop can be SLPed, we perform only SLP, and
1304 the vectorization factor of the loop is the unrolling factor required by
1305 the SLP instances. If that unrolling factor is 1, we say that we
1306 perform pure SLP on the loop - cross-iteration parallelism is not
1307 exploited. */
1308 for (i = 0; i < nbbs; i++)
1310 basic_block bb = bbs[i];
1311 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1313 gimple stmt = gsi_stmt (si);
1314 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1315 gcc_assert (stmt_info);
1316 if ((STMT_VINFO_RELEVANT_P (stmt_info)
1317 || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1318 && !PURE_SLP_STMT (stmt_info))
1319 /* STMT needs both SLP and loop-based vectorization. */
1320 only_slp_in_loop = false;
1324 if (only_slp_in_loop)
1325 vectorization_factor = LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo);
1326 else
1327 vectorization_factor = least_common_multiple (vectorization_factor,
1328 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo));
1330 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
1331 if (dump_enabled_p ())
1332 dump_printf_loc (MSG_NOTE, vect_location,
1333 "Updating vectorization factor to %d ",
1334 vectorization_factor);
1337 for (i = 0; i < nbbs; i++)
1339 basic_block bb = bbs[i];
1341 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
1343 phi = gsi_stmt (si);
1344 ok = true;
1346 stmt_info = vinfo_for_stmt (phi);
1347 if (dump_enabled_p ())
1349 dump_printf_loc (MSG_NOTE, vect_location, "examining phi: ");
1350 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
1353 /* Inner-loop loop-closed exit phi in outer-loop vectorization
1354 (i.e., a phi in the tail of the outer-loop). */
1355 if (! is_loop_header_bb_p (bb))
1357 /* FORNOW: we currently don't support the case that these phis
1358 are not used in the outer-loop (unless it is a double reduction,
1359 i.e., this phi is vect_reduction_def), because this case
1360 requires us to actually do something here.
1361 if ((!STMT_VINFO_RELEVANT_P (stmt_info)
1362 || STMT_VINFO_LIVE_P (stmt_info))
1363 && STMT_VINFO_DEF_TYPE (stmt_info)
1364 != vect_double_reduction_def)
1366 if (dump_enabled_p ())
1367 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1368 "Unsupported loop-closed phi in "
1369 "outer-loop.");
1370 return false;
1373 /* If PHI is used in the outer loop, we check that its operand
1374 is defined in the inner loop. */
1375 if (STMT_VINFO_RELEVANT_P (stmt_info))
1377 tree phi_op;
1378 gimple op_def_stmt;
1380 if (gimple_phi_num_args (phi) != 1)
1381 return false;
1383 phi_op = PHI_ARG_DEF (phi, 0);
1384 if (TREE_CODE (phi_op) != SSA_NAME)
1385 return false;
1387 op_def_stmt = SSA_NAME_DEF_STMT (phi_op);
1388 if (!op_def_stmt
1389 || !flow_bb_inside_loop_p (loop, gimple_bb (op_def_stmt))
1390 || !vinfo_for_stmt (op_def_stmt))
1391 return false;
1393 if (STMT_VINFO_RELEVANT (vinfo_for_stmt (op_def_stmt))
1394 != vect_used_in_outer
1395 && STMT_VINFO_RELEVANT (vinfo_for_stmt (op_def_stmt))
1396 != vect_used_in_outer_by_reduction)
1397 return false;
1400 continue;
1403 gcc_assert (stmt_info);
1405 if (STMT_VINFO_LIVE_P (stmt_info))
1407 /* FORNOW: not yet supported. */
1408 if (dump_enabled_p ())
1409 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1410 "not vectorized: value used after loop.");
1411 return false;
1414 if (STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope
1415 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
1417 /* A scalar-dependence cycle that we don't support. */
1418 if (dump_enabled_p ())
1419 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1420 "not vectorized: scalar dependence cycle.");
1421 return false;
1424 if (STMT_VINFO_RELEVANT_P (stmt_info))
1426 need_to_vectorize = true;
1427 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def)
1428 ok = vectorizable_induction (phi, NULL, NULL);
1431 if (!ok)
1433 if (dump_enabled_p ())
1435 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1436 "not vectorized: relevant phi not "
1437 "supported: ");
1438 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, phi, 0);
1440 return false;
1444 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1446 gimple stmt = gsi_stmt (si);
1447 if (!vect_analyze_stmt (stmt, &need_to_vectorize, NULL))
1448 return false;
1450 } /* bbs */
1452 /* All operations in the loop are either irrelevant (they deal with loop
1453 control, or are dead), or only used outside the loop and can be moved
1454 out of the loop (e.g. invariants, inductions). The loop can be
1455 optimized away by scalar optimizations. We're better off not
1456 touching this loop. */
1457 if (!need_to_vectorize)
1459 if (dump_enabled_p ())
1460 dump_printf_loc (MSG_NOTE, vect_location,
1461 "All the computation can be taken out of the loop.");
1462 if (dump_enabled_p ())
1463 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1464 "not vectorized: redundant loop. no profit to "
1465 "vectorize.");
1466 return false;
1469 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && dump_enabled_p ())
1470 dump_printf_loc (MSG_NOTE, vect_location,
1471 "vectorization_factor = %d, niters = "
1472 HOST_WIDE_INT_PRINT_DEC, vectorization_factor,
1473 LOOP_VINFO_INT_NITERS (loop_vinfo));
1475 if ((LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1476 && (LOOP_VINFO_INT_NITERS (loop_vinfo) < vectorization_factor))
1477 || ((max_niter = max_stmt_executions_int (loop)) != -1
1478 && (unsigned HOST_WIDE_INT) max_niter < vectorization_factor))
1480 if (dump_enabled_p ())
1481 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1482 "not vectorized: iteration count too small.");
1483 if (dump_enabled_p ())
1484 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1485 "not vectorized: iteration count smaller than "
1486 "vectorization factor.");
1487 return false;
1490 /* Analyze cost. Decide if worth while to vectorize. */
1492 /* Once VF is set, SLP costs should be updated since the number of created
1493 vector stmts depends on VF. */
1494 vect_update_slp_costs_according_to_vf (loop_vinfo);
1496 vect_estimate_min_profitable_iters (loop_vinfo, &min_profitable_iters,
1497 &min_profitable_estimate);
1498 LOOP_VINFO_COST_MODEL_MIN_ITERS (loop_vinfo) = min_profitable_iters;
1500 if (min_profitable_iters < 0)
1502 if (dump_enabled_p ())
1503 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1504 "not vectorized: vectorization not profitable.");
1505 if (dump_enabled_p ())
1506 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1507 "not vectorized: vector version will never be "
1508 "profitable.");
1509 return false;
1512 min_scalar_loop_bound = ((PARAM_VALUE (PARAM_MIN_VECT_LOOP_BOUND)
1513 * vectorization_factor) - 1);
1516 /* Use the cost model only if it is more conservative than the user-specified
1517 threshold. */
1519 th = (unsigned) min_scalar_loop_bound;
1520 if (min_profitable_iters
1521 && (!min_scalar_loop_bound
1522 || min_profitable_iters > min_scalar_loop_bound))
1523 th = (unsigned) min_profitable_iters;
1525 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1526 && LOOP_VINFO_INT_NITERS (loop_vinfo) <= th)
1528 if (dump_enabled_p ())
1529 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1530 "not vectorized: vectorization not profitable.");
1531 if (dump_enabled_p ())
1532 dump_printf_loc (MSG_NOTE, vect_location,
1533 "not vectorized: iteration count smaller than user "
1534 "specified loop bound parameter or minimum profitable "
1535 "iterations (whichever is more conservative).");
1536 return false;
1539 if ((estimated_niter = estimated_stmt_executions_int (loop)) != -1
1540 && ((unsigned HOST_WIDE_INT) estimated_niter
1541 <= MAX (th, (unsigned)min_profitable_estimate)))
1543 if (dump_enabled_p ())
1544 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1545 "not vectorized: estimated iteration count too "
1546 "small.");
1547 if (dump_enabled_p ())
1548 dump_printf_loc (MSG_NOTE, vect_location,
1549 "not vectorized: estimated iteration count smaller "
1550 "than specified loop bound parameter or minimum "
1551 "profitable iterations (whichever is more "
1552 "conservative).");
1553 return false;
1556 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1557 || LOOP_VINFO_INT_NITERS (loop_vinfo) % vectorization_factor != 0
1558 || LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo))
1560 if (dump_enabled_p ())
1561 dump_printf_loc (MSG_NOTE, vect_location, "epilog loop required.");
1562 if (!vect_can_advance_ivs_p (loop_vinfo))
1564 if (dump_enabled_p ())
1565 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1566 "not vectorized: can't create epilog loop 1.");
1567 return false;
1569 if (!slpeel_can_duplicate_loop_p (loop, single_exit (loop)))
1571 if (dump_enabled_p ())
1572 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1573 "not vectorized: can't create epilog loop 2.");
1574 return false;
1578 return true;
1582 /* Function vect_analyze_loop_2.
1584 Apply a set of analyses on LOOP, and create a loop_vec_info struct
1585 for it. The different analyses will record information in the
1586 loop_vec_info struct. */
1587 static bool
1588 vect_analyze_loop_2 (loop_vec_info loop_vinfo)
1590 bool ok, slp = false;
1591 int max_vf = MAX_VECTORIZATION_FACTOR;
1592 int min_vf = 2;
1594 /* Find all data references in the loop (which correspond to vdefs/vuses)
1595 and analyze their evolution in the loop. Also adjust the minimal
1596 vectorization factor according to the loads and stores.
1598 FORNOW: Handle only simple array references, whose
1599 alignment can be forced, and aligned pointer-references. */
1601 ok = vect_analyze_data_refs (loop_vinfo, NULL, &min_vf);
1602 if (!ok)
1604 if (dump_enabled_p ())
1605 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1606 "bad data references.");
1607 return false;
1610 /* Classify all cross-iteration scalar data-flow cycles.
1611 Cross-iteration cycles caused by virtual phis are analyzed separately. */
1613 vect_analyze_scalar_cycles (loop_vinfo);
1615 vect_pattern_recog (loop_vinfo, NULL);
1617 /* Data-flow analysis to detect stmts that do not need to be vectorized. */
1619 ok = vect_mark_stmts_to_be_vectorized (loop_vinfo);
1620 if (!ok)
1622 if (dump_enabled_p ())
1623 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1624 "unexpected pattern.");
1625 return false;
1628 /* Analyze data dependences between the data-refs in the loop
1629 and adjust the maximum vectorization factor according to
1630 the dependences.
1631 FORNOW: fail at the first data dependence that we encounter. */
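/* For example (illustrative only): in "for (i = 0; i < n; i++)
   a[i + 4] = a[i] * 2;" the value stored in iteration I is read again in
   iteration I + 4, giving a dependence distance of 4, which would cap
   MAX_VF at 4 for this loop.  */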
1633 ok = vect_analyze_data_ref_dependences (loop_vinfo, NULL, &max_vf);
1634 if (!ok
1635 || max_vf < min_vf)
1637 if (dump_enabled_p ())
1638 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1639 "bad data dependence.");
1640 return false;
1643 ok = vect_determine_vectorization_factor (loop_vinfo);
1644 if (!ok)
1646 if (dump_enabled_p ())
1647 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1648 "can't determine vectorization factor.");
1649 return false;
1651 if (max_vf < LOOP_VINFO_VECT_FACTOR (loop_vinfo))
1653 if (dump_enabled_p ())
1654 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1655 "bad data dependence.");
1656 return false;
1659 /* Analyze the alignment of the data-refs in the loop.
1660 Fail if a data reference is found that cannot be vectorized. */
1662 ok = vect_analyze_data_refs_alignment (loop_vinfo, NULL);
1663 if (!ok)
1665 if (dump_enabled_p ())
1666 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1667 "bad data alignment.");
1668 return false;
1671 /* Analyze the access patterns of the data-refs in the loop (consecutive,
1672 complex, etc.). FORNOW: Only handle consecutive access pattern. */
1674 ok = vect_analyze_data_ref_accesses (loop_vinfo, NULL);
1675 if (!ok)
1677 if (dump_enabled_p ())
1678 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1679 "bad data access.");
1680 return false;
1683 /* Prune the list of ddrs to be tested at run-time by versioning for alias.
1684 It is important to call pruning after vect_analyze_data_ref_accesses,
1685 since we use grouping information gathered by interleaving analysis. */
1686 ok = vect_prune_runtime_alias_test_list (loop_vinfo);
1687 if (!ok)
1689 if (dump_enabled_p ())
1690 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1691 "too long list of versioning for alias "
1692 "run-time tests.");
1693 return false;
1696 /* This pass will decide on using loop versioning and/or loop peeling in
1697 order to enhance the alignment of data references in the loop. */
1699 ok = vect_enhance_data_refs_alignment (loop_vinfo);
1700 if (!ok)
1702 if (dump_enabled_p ())
1703 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1704 "bad data alignment.");
1705 return false;
1708 /* Check the SLP opportunities in the loop, analyze and build SLP trees. */
1709 ok = vect_analyze_slp (loop_vinfo, NULL);
1710 if (ok)
1712 /* Decide which possible SLP instances to SLP. */
1713 slp = vect_make_slp_decision (loop_vinfo);
1715 /* Find stmts that need to be both vectorized and SLPed. */
1716 vect_detect_hybrid_slp (loop_vinfo);
1718 else
1719 return false;
1721 /* Scan all the operations in the loop and make sure they are
1722 vectorizable. */
1724 ok = vect_analyze_loop_operations (loop_vinfo, slp);
1725 if (!ok)
1727 if (dump_enabled_p ())
1728 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1729 "bad operation or unsupported loop bound.");
1730 return false;
1733 return true;
1736 /* Function vect_analyze_loop.
1738 Apply a set of analyses on LOOP, and create a loop_vec_info struct
1739 for it. The different analyses will record information in the
1740 loop_vec_info struct. */
1741 loop_vec_info
1742 vect_analyze_loop (struct loop *loop)
1744 loop_vec_info loop_vinfo;
1745 unsigned int vector_sizes;
1747 /* Autodetect first vector size we try. */
1748 current_vector_size = 0;
1749 vector_sizes = targetm.vectorize.autovectorize_vector_sizes ();
1751 if (dump_enabled_p ())
1752 dump_printf_loc (MSG_NOTE, vect_location,
1753 "===== analyze_loop_nest =====");
1755 if (loop_outer (loop)
1756 && loop_vec_info_for_loop (loop_outer (loop))
1757 && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop))))
1759 if (dump_enabled_p ())
1760 dump_printf_loc (MSG_NOTE, vect_location,
1761 "outer-loop already vectorized.");
1762 return NULL;
1765 while (1)
1767 /* Check the CFG characteristics of the loop (nesting, entry/exit). */
1768 loop_vinfo = vect_analyze_loop_form (loop);
1769 if (!loop_vinfo)
1771 if (dump_enabled_p ())
1772 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1773 "bad loop form.");
1774 return NULL;
1777 if (vect_analyze_loop_2 (loop_vinfo))
1779 LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1;
1781 return loop_vinfo;
1784 destroy_loop_vec_info (loop_vinfo, true);
1786 vector_sizes &= ~current_vector_size;
1787 if (vector_sizes == 0
1788 || current_vector_size == 0)
1789 return NULL;
1791 /* Try the next biggest vector size. */
1792 current_vector_size = 1 << floor_log2 (vector_sizes);
1793 if (dump_enabled_p ())
1794 dump_printf_loc (MSG_NOTE, vect_location,
1795 "***** Re-trying analysis with "
1796 "vector size %d\n", current_vector_size);
1801 /* Function reduction_code_for_scalar_code
1803 Input:
1804 CODE - tree_code of a reduction operation.
1806 Output:
1807 REDUC_CODE - the corresponding tree-code to be used to reduce the
1808 vector of partial results into a single scalar result (which
1809 will also reside in a vector) or ERROR_MARK if the operation is
1810 a supported reduction operation, but does not have such a tree-code.
1812 Return FALSE if CODE currently cannot be vectorized as reduction. */
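/* For instance, a sum reduction built with PLUS_EXPR maps to
   REDUC_PLUS_EXPR, which sums the elements of the vector of partial
   results in the loop epilogue; MINUS_EXPR is accepted (the function
   returns true) but yields ERROR_MARK, so the epilogue must reduce the
   partial results without a dedicated tree code.  */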
1814 static bool
1815 reduction_code_for_scalar_code (enum tree_code code,
1816 enum tree_code *reduc_code)
1818 switch (code)
1820 case MAX_EXPR:
1821 *reduc_code = REDUC_MAX_EXPR;
1822 return true;
1824 case MIN_EXPR:
1825 *reduc_code = REDUC_MIN_EXPR;
1826 return true;
1828 case PLUS_EXPR:
1829 *reduc_code = REDUC_PLUS_EXPR;
1830 return true;
1832 case MULT_EXPR:
1833 case MINUS_EXPR:
1834 case BIT_IOR_EXPR:
1835 case BIT_XOR_EXPR:
1836 case BIT_AND_EXPR:
1837 *reduc_code = ERROR_MARK;
1838 return true;
1840 default:
1841 return false;
1846 /* Error reporting helper for vect_is_simple_reduction below. GIMPLE statement
1847 STMT is printed with a message MSG. */
1849 static void
1850 report_vect_op (int msg_type, gimple stmt, const char *msg)
1852 dump_printf_loc (msg_type, vect_location, "%s", msg);
1853 dump_gimple_stmt (msg_type, TDF_SLIM, stmt, 0);
1857 /* Detect SLP reduction of the form:
1859 #a1 = phi <a5, a0>
1860 a2 = operation (a1)
1861 a3 = operation (a2)
1862 a4 = operation (a3)
1863 a5 = operation (a4)
1865 #a = phi <a5>
1867 PHI is the reduction phi node (#a1 = phi <a5, a0> above)
1868 FIRST_STMT is the first reduction stmt in the chain
1869 (a2 = operation (a1)).
1871 Return TRUE if a reduction chain was detected. */
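/* As a hedged illustration (assumed source, not from this file), such a
   chain typically arises from a manually unrolled accumulation:

     for (i = 0; i < n; i++)
       {
         sum += a[4 * i];
         sum += a[4 * i + 1];
         sum += a[4 * i + 2];
         sum += a[4 * i + 3];
       }

   where each addition feeds the next and only the last feeds the
   loop-closing phi.  */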
1873 static bool
1874 vect_is_slp_reduction (loop_vec_info loop_info, gimple phi, gimple first_stmt)
1876 struct loop *loop = (gimple_bb (phi))->loop_father;
1877 struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
1878 enum tree_code code;
1879 gimple current_stmt = NULL, loop_use_stmt = NULL, first, next_stmt;
1880 stmt_vec_info use_stmt_info, current_stmt_info;
1881 tree lhs;
1882 imm_use_iterator imm_iter;
1883 use_operand_p use_p;
1884 int nloop_uses, size = 0, n_out_of_loop_uses;
1885 bool found = false;
1887 if (loop != vect_loop)
1888 return false;
1890 lhs = PHI_RESULT (phi);
1891 code = gimple_assign_rhs_code (first_stmt);
1892 while (1)
1894 nloop_uses = 0;
1895 n_out_of_loop_uses = 0;
1896 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
1898 gimple use_stmt = USE_STMT (use_p);
1899 if (is_gimple_debug (use_stmt))
1900 continue;
1902 use_stmt = USE_STMT (use_p);
1904 /* Check if we got back to the reduction phi. */
1905 if (use_stmt == phi)
1907 loop_use_stmt = use_stmt;
1908 found = true;
1909 break;
1912 if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
1914 if (vinfo_for_stmt (use_stmt)
1915 && !STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (use_stmt)))
1917 loop_use_stmt = use_stmt;
1918 nloop_uses++;
1921 else
1922 n_out_of_loop_uses++;
1924 /* There can be either a single use in the loop or two uses in
1925 phi nodes. */
1926 if (nloop_uses > 1 || (n_out_of_loop_uses && nloop_uses))
1927 return false;
1930 if (found)
1931 break;
1933 /* We reached a statement with no loop uses. */
1934 if (nloop_uses == 0)
1935 return false;
1937 /* This is a loop exit phi, and we haven't reached the reduction phi. */
1938 if (gimple_code (loop_use_stmt) == GIMPLE_PHI)
1939 return false;
1941 if (!is_gimple_assign (loop_use_stmt)
1942 || code != gimple_assign_rhs_code (loop_use_stmt)
1943 || !flow_bb_inside_loop_p (loop, gimple_bb (loop_use_stmt)))
1944 return false;
1946 /* Insert USE_STMT into reduction chain. */
1947 use_stmt_info = vinfo_for_stmt (loop_use_stmt);
1948 if (current_stmt)
1950 current_stmt_info = vinfo_for_stmt (current_stmt);
1951 GROUP_NEXT_ELEMENT (current_stmt_info) = loop_use_stmt;
1952 GROUP_FIRST_ELEMENT (use_stmt_info)
1953 = GROUP_FIRST_ELEMENT (current_stmt_info);
1955 else
1956 GROUP_FIRST_ELEMENT (use_stmt_info) = loop_use_stmt;
1958 lhs = gimple_assign_lhs (loop_use_stmt);
1959 current_stmt = loop_use_stmt;
1960 size++;
1963 if (!found || loop_use_stmt != phi || size < 2)
1964 return false;
1966 /* Swap the operands, if needed, to make the reduction operand be the second
1967 operand. */
1968 lhs = PHI_RESULT (phi);
1969 next_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (current_stmt));
1970 while (next_stmt)
1972 if (gimple_assign_rhs2 (next_stmt) == lhs)
1974 tree op = gimple_assign_rhs1 (next_stmt);
1975 gimple def_stmt = NULL;
1977 if (TREE_CODE (op) == SSA_NAME)
1978 def_stmt = SSA_NAME_DEF_STMT (op);
1980 /* Check that the other def is either defined in the loop
1981 ("vect_internal_def"), or it's an induction (defined by a
1982 loop-header phi-node). */
1983 if (def_stmt
1984 && gimple_bb (def_stmt)
1985 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
1986 && (is_gimple_assign (def_stmt)
1987 || is_gimple_call (def_stmt)
1988 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
1989 == vect_induction_def
1990 || (gimple_code (def_stmt) == GIMPLE_PHI
1991 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
1992 == vect_internal_def
1993 && !is_loop_header_bb_p (gimple_bb (def_stmt)))))
1995 lhs = gimple_assign_lhs (next_stmt);
1996 next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
1997 continue;
2000 return false;
2002 else
2004 tree op = gimple_assign_rhs2 (next_stmt);
2005 gimple def_stmt = NULL;
2007 if (TREE_CODE (op) == SSA_NAME)
2008 def_stmt = SSA_NAME_DEF_STMT (op);
2010 /* Check that the other def is either defined in the loop
2011 ("vect_internal_def"), or it's an induction (defined by a
2012 loop-header phi-node). */
2013 if (def_stmt
2014 && gimple_bb (def_stmt)
2015 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2016 && (is_gimple_assign (def_stmt)
2017 || is_gimple_call (def_stmt)
2018 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2019 == vect_induction_def
2020 || (gimple_code (def_stmt) == GIMPLE_PHI
2021 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2022 == vect_internal_def
2023 && !is_loop_header_bb_p (gimple_bb (def_stmt)))))
2025 if (dump_enabled_p ())
2027 dump_printf_loc (MSG_NOTE, vect_location, "swapping oprnds: ");
2028 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, next_stmt, 0);
2031 swap_tree_operands (next_stmt,
2032 gimple_assign_rhs1_ptr (next_stmt),
2033 gimple_assign_rhs2_ptr (next_stmt));
2034 update_stmt (next_stmt);
2036 if (CONSTANT_CLASS_P (gimple_assign_rhs1 (next_stmt)))
2037 LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true;
2039 else
2040 return false;
2043 lhs = gimple_assign_lhs (next_stmt);
2044 next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
2047 /* Save the chain for further analysis in SLP detection. */
2048 first = GROUP_FIRST_ELEMENT (vinfo_for_stmt (current_stmt));
2049 VEC_safe_push (gimple, heap, LOOP_VINFO_REDUCTION_CHAINS (loop_info), first);
2050 GROUP_SIZE (vinfo_for_stmt (first)) = size;
2052 return true;
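/* Illustrative sketch (an assumed source-level shape, not taken from the
   sources above): the chain detected here corresponds to code such as

     int s = 0;
     for (i = 0; i < n; i++)
       {
         s += a[4 * i];
         s += a[4 * i + 1];
         s += a[4 * i + 2];
         s += a[4 * i + 3];
       }

   Each statement uses the result of the previous one exactly once inside the
   loop, so GROUP_FIRST_ELEMENT/GROUP_NEXT_ELEMENT link the four statements
   into a reduction chain with GROUP_SIZE 4 for later SLP analysis.  */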
2056 /* Function vect_is_simple_reduction_1
2058 (1) Detect a cross-iteration def-use cycle that represents a simple
2059 reduction computation. We look for the following pattern:
2061 loop_header:
2062 a1 = phi < a0, a2 >
2063 a3 = ...
2064 a2 = operation (a3, a1)
2066 such that:
2067 1. operation is commutative and associative and it is safe to
2068 change the order of the computation (if CHECK_REDUCTION is true)
2069 2. no uses for a2 in the loop (a2 is used out of the loop)
2070 3. no uses of a1 in the loop besides the reduction operation
2071 4. no uses of a1 outside the loop.
2073 Conditions 1,4 are tested here.
2074 Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.
2076 (2) Detect a cross-iteration def-use cycle in nested loops, i.e.,
2077 nested cycles, if CHECK_REDUCTION is false.
2079 (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double
2080 reductions:
2082 a1 = phi < a0, a2 >
2083 inner loop (def of a3)
2084 a2 = phi < a3 >
2086 If MODIFY is true it tries also to rework the code in-place to enable
2087 detection of more reduction patterns. For the time being we rewrite
2088 "res -= RHS" into "rhs += -RHS" when it seems worthwhile.
2091 static gimple
2092 vect_is_simple_reduction_1 (loop_vec_info loop_info, gimple phi,
2093 bool check_reduction, bool *double_reduc,
2094 bool modify)
2096 struct loop *loop = (gimple_bb (phi))->loop_father;
2097 struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
2098 edge latch_e = loop_latch_edge (loop);
2099 tree loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
2100 gimple def_stmt, def1 = NULL, def2 = NULL;
2101 enum tree_code orig_code, code;
2102 tree op1, op2, op3 = NULL_TREE, op4 = NULL_TREE;
2103 tree type;
2104 int nloop_uses;
2105 tree name;
2106 imm_use_iterator imm_iter;
2107 use_operand_p use_p;
2108 bool phi_def;
2110 *double_reduc = false;
2112 /* If CHECK_REDUCTION is true, we assume inner-most loop vectorization,
2113 otherwise, we assume outer loop vectorization. */
2114 gcc_assert ((check_reduction && loop == vect_loop)
2115 || (!check_reduction && flow_loop_nested_p (vect_loop, loop)));
2117 name = PHI_RESULT (phi);
2118 nloop_uses = 0;
2119 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name)
2121 gimple use_stmt = USE_STMT (use_p);
2122 if (is_gimple_debug (use_stmt))
2123 continue;
2125 if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2127 if (dump_enabled_p ())
2128 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2129 "intermediate value used outside loop.");
2131 return NULL;
2134 if (vinfo_for_stmt (use_stmt)
2135 && !is_pattern_stmt_p (vinfo_for_stmt (use_stmt)))
2136 nloop_uses++;
2137 if (nloop_uses > 1)
2139 if (dump_enabled_p ())
2140 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2141 "reduction used in loop.");
2142 return NULL;
2146 if (TREE_CODE (loop_arg) != SSA_NAME)
2148 if (dump_enabled_p ())
2150 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2151 "reduction: not ssa_name: ");
2152 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, loop_arg);
2154 return NULL;
2157 def_stmt = SSA_NAME_DEF_STMT (loop_arg);
2158 if (!def_stmt)
2160 if (dump_enabled_p ())
2161 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2162 "reduction: no def_stmt.");
2163 return NULL;
2166 if (!is_gimple_assign (def_stmt) && gimple_code (def_stmt) != GIMPLE_PHI)
2168 if (dump_enabled_p ())
2169 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, def_stmt, 0);
2170 return NULL;
2173 if (is_gimple_assign (def_stmt))
2175 name = gimple_assign_lhs (def_stmt);
2176 phi_def = false;
2178 else
2180 name = PHI_RESULT (def_stmt);
2181 phi_def = true;
2184 nloop_uses = 0;
2185 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name)
2187 gimple use_stmt = USE_STMT (use_p);
2188 if (is_gimple_debug (use_stmt))
2189 continue;
2190 if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt))
2191 && vinfo_for_stmt (use_stmt)
2192 && !is_pattern_stmt_p (vinfo_for_stmt (use_stmt)))
2193 nloop_uses++;
2194 if (nloop_uses > 1)
2196 if (dump_enabled_p ())
2197 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2198 "reduction used in loop.");
2199 return NULL;
2203 /* If DEF_STMT is a phi node itself, we expect it to have a single argument
2204 defined in the inner loop. */
2205 if (phi_def)
2207 op1 = PHI_ARG_DEF (def_stmt, 0);
2209 if (gimple_phi_num_args (def_stmt) != 1
2210 || TREE_CODE (op1) != SSA_NAME)
2212 if (dump_enabled_p ())
2213 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2214 "unsupported phi node definition.");
2216 return NULL;
2219 def1 = SSA_NAME_DEF_STMT (op1);
2220 if (flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2221 && loop->inner
2222 && flow_bb_inside_loop_p (loop->inner, gimple_bb (def1))
2223 && is_gimple_assign (def1))
2225 if (dump_enabled_p ())
2226 report_vect_op (MSG_NOTE, def_stmt,
2227 "detected double reduction: ");
2229 *double_reduc = true;
2230 return def_stmt;
2233 return NULL;
2236 code = orig_code = gimple_assign_rhs_code (def_stmt);
2238 /* We can handle "res -= x[i]", which is non-associative by
2239 simply rewriting this into "res += -x[i]". Avoid changing
2240 gimple instruction for the first simple tests and only do this
2241 if we're allowed to change code at all. */
2242 if (code == MINUS_EXPR
2243 && modify
2244 && (op1 = gimple_assign_rhs1 (def_stmt))
2245 && TREE_CODE (op1) == SSA_NAME
2246 && SSA_NAME_DEF_STMT (op1) == phi)
2247 code = PLUS_EXPR;
2249 if (check_reduction
2250 && (!commutative_tree_code (code) || !associative_tree_code (code)))
2252 if (dump_enabled_p ())
2253 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2254 "reduction: not commutative/associative: ");
2255 return NULL;
2258 if (get_gimple_rhs_class (code) != GIMPLE_BINARY_RHS)
2260 if (code != COND_EXPR)
2262 if (dump_enabled_p ())
2263 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2264 "reduction: not binary operation: ");
2266 return NULL;
2269 op3 = gimple_assign_rhs1 (def_stmt);
2270 if (COMPARISON_CLASS_P (op3))
2272 op4 = TREE_OPERAND (op3, 1);
2273 op3 = TREE_OPERAND (op3, 0);
2276 op1 = gimple_assign_rhs2 (def_stmt);
2277 op2 = gimple_assign_rhs3 (def_stmt);
2279 if (TREE_CODE (op1) != SSA_NAME && TREE_CODE (op2) != SSA_NAME)
2281 if (dump_enabled_p ())
2282 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2283 "reduction: uses not ssa_names: ");
2285 return NULL;
2288 else
2290 op1 = gimple_assign_rhs1 (def_stmt);
2291 op2 = gimple_assign_rhs2 (def_stmt);
2293 if (TREE_CODE (op1) != SSA_NAME && TREE_CODE (op2) != SSA_NAME)
2295 if (dump_enabled_p ())
2296 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2297 "reduction: uses not ssa_names: ");
2299 return NULL;
2303 type = TREE_TYPE (gimple_assign_lhs (def_stmt));
2304 if ((TREE_CODE (op1) == SSA_NAME
2305 && !types_compatible_p (type,TREE_TYPE (op1)))
2306 || (TREE_CODE (op2) == SSA_NAME
2307 && !types_compatible_p (type, TREE_TYPE (op2)))
2308 || (op3 && TREE_CODE (op3) == SSA_NAME
2309 && !types_compatible_p (type, TREE_TYPE (op3)))
2310 || (op4 && TREE_CODE (op4) == SSA_NAME
2311 && !types_compatible_p (type, TREE_TYPE (op4))))
2313 if (dump_enabled_p ())
2315 dump_printf_loc (MSG_NOTE, vect_location,
2316 "reduction: multiple types: operation type: ");
2317 dump_generic_expr (MSG_NOTE, TDF_SLIM, type);
2318 dump_printf (MSG_NOTE, ", operands types: ");
2319 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2320 TREE_TYPE (op1));
2321 dump_printf (MSG_NOTE, ",");
2322 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2323 TREE_TYPE (op2));
2324 if (op3)
2326 dump_printf (MSG_NOTE, ",");
2327 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2328 TREE_TYPE (op3));
2331 if (op4)
2333 dump_printf (MSG_NOTE, ",");
2334 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2335 TREE_TYPE (op4));
2339 return NULL;
2342 /* Check that it's ok to change the order of the computation.
2343 Generally, when vectorizing a reduction we change the order of the
2344 computation. This may change the behavior of the program in some
2345 cases, so we need to check that this is ok. One exception is when
2346 vectorizing an outer-loop: the inner-loop is executed sequentially,
2347 and therefore vectorizing reductions in the inner-loop during
2348 outer-loop vectorization is safe. */
2350 /* CHECKME: check for !flag_finite_math_only too? */
2351 if (SCALAR_FLOAT_TYPE_P (type) && !flag_associative_math
2352 && check_reduction)
2354 /* Changing the order of operations changes the semantics. */
2355 if (dump_enabled_p ())
2356 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2357 "reduction: unsafe fp math optimization: ");
2358 return NULL;
2360 else if (INTEGRAL_TYPE_P (type) && TYPE_OVERFLOW_TRAPS (type)
2361 && check_reduction)
2363 /* Changing the order of operations changes the semantics. */
2364 if (dump_enabled_p ())
2365 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2366 "reduction: unsafe int math optimization: ");
2367 return NULL;
2369 else if (SAT_FIXED_POINT_TYPE_P (type) && check_reduction)
2371 /* Changing the order of operations changes the semantics. */
2372 if (dump_enabled_p ())
2373 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2374 "reduction: unsafe fixed-point math optimization: ");
2375 return NULL;
2378 /* If we detected "res -= x[i]" earlier, rewrite it into
2379 "res += -x[i]" now. If this turns out to be useless reassoc
2380 will clean it up again. */
2381 if (orig_code == MINUS_EXPR)
2383 tree rhs = gimple_assign_rhs2 (def_stmt);
2384 tree negrhs = make_ssa_name (TREE_TYPE (rhs), NULL);
2385 gimple negate_stmt = gimple_build_assign_with_ops (NEGATE_EXPR, negrhs,
2386 rhs, NULL);
2387 gimple_stmt_iterator gsi = gsi_for_stmt (def_stmt);
2388 set_vinfo_for_stmt (negate_stmt, new_stmt_vec_info (negate_stmt,
2389 loop_info, NULL));
2390 gsi_insert_before (&gsi, negate_stmt, GSI_NEW_STMT);
2391 gimple_assign_set_rhs2 (def_stmt, negrhs);
2392 gimple_assign_set_rhs_code (def_stmt, PLUS_EXPR);
2393 update_stmt (def_stmt);
2396 /* Reduction is safe. We're dealing with one of the following:
2397 1) integer arithmetic and no trapv
2398 2) floating point arithmetic, and special flags permit this optimization
2399 3) nested cycle (i.e., outer loop vectorization). */
2400 if (TREE_CODE (op1) == SSA_NAME)
2401 def1 = SSA_NAME_DEF_STMT (op1);
2403 if (TREE_CODE (op2) == SSA_NAME)
2404 def2 = SSA_NAME_DEF_STMT (op2);
2406 if (code != COND_EXPR
2407 && ((!def1 || gimple_nop_p (def1)) && (!def2 || gimple_nop_p (def2))))
2409 if (dump_enabled_p ())
2410 report_vect_op (MSG_NOTE, def_stmt, "reduction: no defs for operands: ");
2411 return NULL;
2414 /* Check that one def is the reduction def, defined by PHI,
2415 the other def is either defined in the loop ("vect_internal_def"),
2416 or it's an induction (defined by a loop-header phi-node). */
2418 if (def2 && def2 == phi
2419 && (code == COND_EXPR
2420 || !def1 || gimple_nop_p (def1)
2421 || (def1 && flow_bb_inside_loop_p (loop, gimple_bb (def1))
2422 && (is_gimple_assign (def1)
2423 || is_gimple_call (def1)
2424 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1))
2425 == vect_induction_def
2426 || (gimple_code (def1) == GIMPLE_PHI
2427 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1))
2428 == vect_internal_def
2429 && !is_loop_header_bb_p (gimple_bb (def1)))))))
2431 if (dump_enabled_p ())
2432 report_vect_op (MSG_NOTE, def_stmt, "detected reduction: ");
2433 return def_stmt;
2436 if (def1 && def1 == phi
2437 && (code == COND_EXPR
2438 || !def2 || gimple_nop_p (def2)
2439 || (def2 && flow_bb_inside_loop_p (loop, gimple_bb (def2))
2440 && (is_gimple_assign (def2)
2441 || is_gimple_call (def2)
2442 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2))
2443 == vect_induction_def
2444 || (gimple_code (def2) == GIMPLE_PHI
2445 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2))
2446 == vect_internal_def
2447 && !is_loop_header_bb_p (gimple_bb (def2)))))))
2449 if (check_reduction)
2451 /* Swap operands (just for simplicity - so that the rest of the code
2452 can assume that the reduction variable is always the last (second)
2453 argument). */
2454 if (dump_enabled_p ())
2455 report_vect_op (MSG_NOTE, def_stmt,
2456 "detected reduction: need to swap operands: ");
2458 swap_tree_operands (def_stmt, gimple_assign_rhs1_ptr (def_stmt),
2459 gimple_assign_rhs2_ptr (def_stmt));
2461 if (CONSTANT_CLASS_P (gimple_assign_rhs1 (def_stmt)))
2462 LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true;
2464 else
2466 if (dump_enabled_p ())
2467 report_vect_op (MSG_NOTE, def_stmt, "detected reduction: ");
2470 return def_stmt;
2473 /* Try to find SLP reduction chain. */
2474 if (check_reduction && vect_is_slp_reduction (loop_info, phi, def_stmt))
2476 if (dump_enabled_p ())
2477 report_vect_op (MSG_NOTE, def_stmt,
2478 "reduction: detected reduction chain: ");
2480 return def_stmt;
2483 if (dump_enabled_p ())
2484 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2485 "reduction: unknown pattern: ");
2487 return NULL;
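/* Illustrative sketch (a hypothetical input, following the MODIFY path of the
   function above): for

     s = init;
     for (i = 0; i < n; i++)
       s -= a[i];

   the latch definition is "s_2 = s_1 - a_i" with s_1 defined by PHI.  With
   MODIFY true, MINUS_EXPR is treated as PLUS_EXPR for the associativity
   checks, and the statement is then rewritten to "s_2 = s_1 + (-a_i)" by
   inserting a NEGATE_EXPR statement, so the loop is still recognized as a
   plain sum reduction.  */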
2490 /* Wrapper around vect_is_simple_reduction_1, that won't modify code
2491 in-place. Arguments as there. */
2493 static gimple
2494 vect_is_simple_reduction (loop_vec_info loop_info, gimple phi,
2495 bool check_reduction, bool *double_reduc)
2497 return vect_is_simple_reduction_1 (loop_info, phi, check_reduction,
2498 double_reduc, false);
2501 /* Wrapper around vect_is_simple_reduction_1, which will modify code
2502 in-place if it enables detection of more reductions. Arguments
2503 as there. */
2505 gimple
2506 vect_force_simple_reduction (loop_vec_info loop_info, gimple phi,
2507 bool check_reduction, bool *double_reduc)
2509 return vect_is_simple_reduction_1 (loop_info, phi, check_reduction,
2510 double_reduc, true);
2513 /* Calculate the cost of one scalar iteration of the loop. */
2515 vect_get_single_scalar_iteration_cost (loop_vec_info loop_vinfo)
2517 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2518 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
2519 int nbbs = loop->num_nodes, factor, scalar_single_iter_cost = 0;
2520 int innerloop_iters, i, stmt_cost;
2522 /* Count statements in scalar loop. Using this as scalar cost for a single
2523 iteration for now.
2525 TODO: Add outer loop support.
2527 TODO: Consider assigning different costs to different scalar
2528 statements. */
2530 /* FORNOW. */
2531 innerloop_iters = 1;
2532 if (loop->inner)
2533 innerloop_iters = 50; /* FIXME */
2535 for (i = 0; i < nbbs; i++)
2537 gimple_stmt_iterator si;
2538 basic_block bb = bbs[i];
2540 if (bb->loop_father == loop->inner)
2541 factor = innerloop_iters;
2542 else
2543 factor = 1;
2545 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
2547 gimple stmt = gsi_stmt (si);
2548 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2550 if (!is_gimple_assign (stmt) && !is_gimple_call (stmt))
2551 continue;
2553 /* Skip stmts that are not vectorized inside the loop. */
2554 if (stmt_info
2555 && !STMT_VINFO_RELEVANT_P (stmt_info)
2556 && (!STMT_VINFO_LIVE_P (stmt_info)
2557 || !VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
2558 && !STMT_VINFO_IN_PATTERN_P (stmt_info))
2559 continue;
2561 if (STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt)))
2563 if (DR_IS_READ (STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt))))
2564 stmt_cost = vect_get_stmt_cost (scalar_load);
2565 else
2566 stmt_cost = vect_get_stmt_cost (scalar_store);
2568 else
2569 stmt_cost = vect_get_stmt_cost (scalar_stmt);
2571 scalar_single_iter_cost += stmt_cost * factor;
2574 return scalar_single_iter_cost;
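/* Worked example (hypothetical numbers, assuming the default target cost
   hooks return 1 for scalar_load, scalar_stmt and scalar_store): a body of
   two loads, one addition and one store costs 2*1 + 1 + 1 = 4 per scalar
   iteration.  Statements in a nested inner loop would each be weighted by
   the FIXME factor of 50 above.  */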
2577 /* Calculate cost of peeling the loop PEEL_ITERS_PROLOGUE times. */
2579 vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue,
2580 int *peel_iters_epilogue,
2581 int scalar_single_iter_cost,
2582 stmt_vector_for_cost *prologue_cost_vec,
2583 stmt_vector_for_cost *epilogue_cost_vec)
2585 int retval = 0;
2586 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2588 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
2590 *peel_iters_epilogue = vf/2;
2591 if (dump_enabled_p ())
2592 dump_printf_loc (MSG_NOTE, vect_location,
2593 "cost model: epilogue peel iters set to vf/2 "
2594 "because loop iterations are unknown .");
2596 /* If peeled iterations are known but number of scalar loop
2597 iterations are unknown, count a taken branch per peeled loop. */
2598 retval = record_stmt_cost (prologue_cost_vec, 2, cond_branch_taken,
2599 NULL, 0, vect_prologue);
2601 else
2603 int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
2604 peel_iters_prologue = niters < peel_iters_prologue ?
2605 niters : peel_iters_prologue;
2606 *peel_iters_epilogue = (niters - peel_iters_prologue) % vf;
2607 /* If we need to peel for gaps, but no peeling is required, we have to
2608 peel VF iterations. */
2609 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) && !*peel_iters_epilogue)
2610 *peel_iters_epilogue = vf;
2613 if (peel_iters_prologue)
2614 retval += record_stmt_cost (prologue_cost_vec,
2615 peel_iters_prologue * scalar_single_iter_cost,
2616 scalar_stmt, NULL, 0, vect_prologue);
2617 if (*peel_iters_epilogue)
2618 retval += record_stmt_cost (epilogue_cost_vec,
2619 *peel_iters_epilogue * scalar_single_iter_cost,
2620 scalar_stmt, NULL, 0, vect_epilogue);
2621 return retval;
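/* Worked example (hypothetical numbers): with LOOP_VINFO_INT_NITERS == 100,
   vf == 4 and PEEL_ITERS_PROLOGUE == 3, the code above computes
   *PEEL_ITERS_EPILOGUE = (100 - 3) % 4 = 1 and records 3 and 1 scalar
   iterations' worth of statement cost in PROLOGUE_COST_VEC and
   EPILOGUE_COST_VEC respectively.  */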
2624 /* Function vect_estimate_min_profitable_iters
2626 Return the number of iterations required for the vector version of the
2627 loop to be profitable relative to the cost of the scalar version of the
2628 loop. */
2630 static void
2631 vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
2632 int *ret_min_profitable_niters,
2633 int *ret_min_profitable_estimate)
2635 int min_profitable_iters;
2636 int min_profitable_estimate;
2637 int peel_iters_prologue;
2638 int peel_iters_epilogue;
2639 unsigned vec_inside_cost = 0;
2640 int vec_outside_cost = 0;
2641 unsigned vec_prologue_cost = 0;
2642 unsigned vec_epilogue_cost = 0;
2643 int scalar_single_iter_cost = 0;
2644 int scalar_outside_cost = 0;
2645 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2646 int npeel = LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo);
2647 void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
2649 /* Cost model disabled. */
2650 if (!flag_vect_cost_model)
2652 dump_printf_loc (MSG_NOTE, vect_location, "cost model disabled.");
2653 *ret_min_profitable_niters = 0;
2654 *ret_min_profitable_estimate = 0;
2655 return;
2658 /* Requires loop versioning tests to handle misalignment. */
2659 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
2661 /* FIXME: Make cost depend on complexity of individual check. */
2662 unsigned len = VEC_length (gimple,
2663 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo));
2664 (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
2665 vect_prologue);
2666 dump_printf (MSG_NOTE,
2667 "cost model: Adding cost of checks for loop "
2668 "versioning to treat misalignment.\n");
2671 /* Requires loop versioning with alias checks. */
2672 if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
2674 /* FIXME: Make cost depend on complexity of individual check. */
2675 unsigned len = VEC_length (ddr_p, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo));
2676 (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
2677 vect_prologue);
2678 dump_printf (MSG_NOTE,
2679 "cost model: Adding cost of checks for loop "
2680 "versioning aliasing.\n");
2683 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
2684 || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
2685 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken, NULL, 0,
2686 vect_prologue);
2688 /* Count statements in scalar loop. Using this as scalar cost for a single
2689 iteration for now.
2691 TODO: Add outer loop support.
2693 TODO: Consider assigning different costs to different scalar
2694 statements. */
2696 scalar_single_iter_cost = vect_get_single_scalar_iteration_cost (loop_vinfo);
2698 /* Add additional cost for the peeled instructions in prologue and epilogue
2699 loop.
2701 FORNOW: If we don't know the value of peel_iters for prologue or epilogue
2702 at compile-time - we assume it's vf/2 (the worst would be vf-1).
2704 TODO: Build an expression that represents peel_iters for prologue and
2705 epilogue to be used in a run-time test. */
2707 if (npeel < 0)
2709 peel_iters_prologue = vf/2;
2710 dump_printf (MSG_NOTE, "cost model: "
2711 "prologue peel iters set to vf/2.");
2713 /* If peeling for alignment is unknown, the loop bound of the main loop
2714 becomes unknown. */
2715 peel_iters_epilogue = vf/2;
2716 dump_printf (MSG_NOTE, "cost model: "
2717 "epilogue peel iters set to vf/2 because "
2718 "peeling for alignment is unknown.");
2720 /* If peeled iterations are unknown, count a taken branch and a not taken
2721 branch per peeled loop. Even if scalar loop iterations are known,
2722 vector iterations are not known since peeled prologue iterations are
2723 not known. Hence guards remain the same. */
2724 (void) add_stmt_cost (target_cost_data, 2, cond_branch_taken,
2725 NULL, 0, vect_prologue);
2726 (void) add_stmt_cost (target_cost_data, 2, cond_branch_not_taken,
2727 NULL, 0, vect_prologue);
2728 /* FORNOW: Don't attempt to pass individual scalar instructions to
2729 the model; just assume linear cost for scalar iterations. */
2730 (void) add_stmt_cost (target_cost_data,
2731 peel_iters_prologue * scalar_single_iter_cost,
2732 scalar_stmt, NULL, 0, vect_prologue);
2733 (void) add_stmt_cost (target_cost_data,
2734 peel_iters_epilogue * scalar_single_iter_cost,
2735 scalar_stmt, NULL, 0, vect_epilogue);
2737 else
2739 stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
2740 stmt_info_for_cost *si;
2741 int j;
2742 void *data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
2744 prologue_cost_vec = VEC_alloc (stmt_info_for_cost, heap, 2);
2745 epilogue_cost_vec = VEC_alloc (stmt_info_for_cost, heap, 2);
2746 peel_iters_prologue = npeel;
2748 (void) vect_get_known_peeling_cost (loop_vinfo, peel_iters_prologue,
2749 &peel_iters_epilogue,
2750 scalar_single_iter_cost,
2751 &prologue_cost_vec,
2752 &epilogue_cost_vec);
2754 FOR_EACH_VEC_ELT (stmt_info_for_cost, prologue_cost_vec, j, si)
2756 struct _stmt_vec_info *stmt_info
2757 = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
2758 (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
2759 si->misalign, vect_prologue);
2762 FOR_EACH_VEC_ELT (stmt_info_for_cost, epilogue_cost_vec, j, si)
2764 struct _stmt_vec_info *stmt_info
2765 = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
2766 (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
2767 si->misalign, vect_epilogue);
2770 VEC_free (stmt_info_for_cost, heap, prologue_cost_vec);
2771 VEC_free (stmt_info_for_cost, heap, epilogue_cost_vec);
2774 /* FORNOW: The scalar outside cost is incremented in one of the
2775 following ways:
2777 1. The vectorizer checks for alignment and aliasing and generates
2778 a condition that allows dynamic vectorization. A cost model
2779 check is ANDED with the versioning condition. Hence scalar code
2780 path now has the added cost of the versioning check.
2782 if (cost > th & versioning_check)
2783 jmp to vector code
2785 Hence run-time scalar is incremented by not-taken branch cost.
2787 2. The vectorizer then checks if a prologue is required. If the
2788 cost model check was not done before during versioning, it has to
2789 be done before the prologue check.
2791 if (cost <= th)
2792 prologue = scalar_iters
2793 if (prologue == 0)
2794 jmp to vector code
2795 else
2796 execute prologue
2797 if (prologue == num_iters)
2798 go to exit
2800 Hence the run-time scalar cost is incremented by a taken branch,
2801 plus a not-taken branch, plus a taken branch cost.
2803 3. The vectorizer then checks if an epilogue is required. If the
2804 cost model check was not done before during prologue check, it
2805 has to be done with the epilogue check.
2807 if (prologue == 0)
2808 jmp to vector code
2809 else
2810 execute prologue
2811 if (prologue == num_iters)
2812 go to exit
2813 vector code:
2814 if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
2815 jmp to epilogue
2817 Hence the run-time scalar cost should be incremented by 2 taken
2818 branches.
2820 TODO: The back end may reorder the BBS's differently and reverse
2821 conditions/branch directions. Change the estimates below to
2822 something more reasonable. */
2824 /* If the number of iterations is known and we do not do versioning, we can
2825 decide whether to vectorize at compile time. Hence the scalar version
2826 does not carry cost model guard costs.
2827 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2828 || LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
2829 || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
2831 /* Cost model check occurs at versioning. */
2832 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
2833 || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
2834 scalar_outside_cost += vect_get_stmt_cost (cond_branch_not_taken);
2835 else
2837 /* Cost model check occurs at prologue generation. */
2838 if (LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
2839 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken)
2840 + vect_get_stmt_cost (cond_branch_not_taken);
2841 /* Cost model check occurs at epilogue generation. */
2842 else
2843 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken);
2847 /* Complete the target-specific cost calculations. */
2848 finish_cost (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo), &vec_prologue_cost,
2849 &vec_inside_cost, &vec_epilogue_cost);
2851 vec_outside_cost = (int)(vec_prologue_cost + vec_epilogue_cost);
2853 /* Calculate number of iterations required to make the vector version
2854 profitable, relative to the loop bodies only. The following condition
2855 must hold true:
2856 SIC * niters + SOC > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC
2857 where
2858 SIC = scalar iteration cost, VIC = vector iteration cost,
2859 VOC = vector outside cost, VF = vectorization factor,
2860 PL_ITERS = prologue iterations, EP_ITERS= epilogue iterations
2861 SOC = scalar outside cost for run time cost model check. */
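/* Worked example (purely hypothetical numbers): with SIC = 4, VIC = 6,
   VOC = 20, SOC = 0, VF = 4 and one prologue and one epilogue peel
   iteration, the formula below gives
   (20 * 4 - 6 * 1 - 6 * 1) / (4 * 4 - 6) = 68 / 10 = 6, which the follow-up
   check bumps to 7 because 4 * 4 * 6 <= 6 * 6 + 20 * 4.  */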
2863 if ((scalar_single_iter_cost * vf) > (int) vec_inside_cost)
2865 if (vec_outside_cost <= 0)
2866 min_profitable_iters = 1;
2867 else
2869 min_profitable_iters = ((vec_outside_cost - scalar_outside_cost) * vf
2870 - vec_inside_cost * peel_iters_prologue
2871 - vec_inside_cost * peel_iters_epilogue)
2872 / ((scalar_single_iter_cost * vf)
2873 - vec_inside_cost);
2875 if ((scalar_single_iter_cost * vf * min_profitable_iters)
2876 <= (((int) vec_inside_cost * min_profitable_iters)
2877 + (((int) vec_outside_cost - scalar_outside_cost) * vf)))
2878 min_profitable_iters++;
2881 /* vector version will never be profitable. */
2882 else
2884 if (dump_enabled_p ())
2885 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2886 "cost model: the vector iteration cost = %d "
2887 "divided by the scalar iteration cost = %d "
2888 "is greater or equal to the vectorization factor = %d.",
2889 vec_inside_cost, scalar_single_iter_cost, vf);
2890 *ret_min_profitable_niters = -1;
2891 *ret_min_profitable_estimate = -1;
2892 return;
2895 if (dump_enabled_p ())
2897 dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
2898 dump_printf (MSG_NOTE, " Vector inside of loop cost: %d\n",
2899 vec_inside_cost);
2900 dump_printf (MSG_NOTE, " Vector prologue cost: %d\n",
2901 vec_prologue_cost);
2902 dump_printf (MSG_NOTE, " Vector epilogue cost: %d\n",
2903 vec_epilogue_cost);
2904 dump_printf (MSG_NOTE, " Scalar iteration cost: %d\n",
2905 scalar_single_iter_cost);
2906 dump_printf (MSG_NOTE, " Scalar outside cost: %d\n",
2907 scalar_outside_cost);
2908 dump_printf (MSG_NOTE, " Vector outside cost: %d\n",
2909 vec_outside_cost);
2910 dump_printf (MSG_NOTE, " prologue iterations: %d\n",
2911 peel_iters_prologue);
2912 dump_printf (MSG_NOTE, " epilogue iterations: %d\n",
2913 peel_iters_epilogue);
2914 dump_printf (MSG_NOTE,
2915 " Calculated minimum iters for profitability: %d\n",
2916 min_profitable_iters);
2919 min_profitable_iters =
2920 min_profitable_iters < vf ? vf : min_profitable_iters;
2922 /* Because the condition we create is:
2923 if (niters <= min_profitable_iters)
2924 then skip the vectorized loop. */
2925 min_profitable_iters--;
2927 if (dump_enabled_p ())
2928 dump_printf_loc (MSG_NOTE, vect_location,
2929 " Runtime profitability threshold = %d\n", min_profitable_iters);
2931 *ret_min_profitable_niters = min_profitable_iters;
2933 /* Calculate number of iterations required to make the vector version
2934 profitable, relative to the loop bodies only.
2936 The non-vectorized variant costs SIC * niters and must win over the vector
2937 variant on the expected loop trip count. The following condition must hold true:
2938 SIC * niters > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC + SOC */
2940 if (vec_outside_cost <= 0)
2941 min_profitable_estimate = 1;
2942 else
2944 min_profitable_estimate = ((vec_outside_cost + scalar_outside_cost) * vf
2945 - vec_inside_cost * peel_iters_prologue
2946 - vec_inside_cost * peel_iters_epilogue)
2947 / ((scalar_single_iter_cost * vf)
2948 - vec_inside_cost);
2950 min_profitable_estimate --;
2951 min_profitable_estimate = MAX (min_profitable_estimate, min_profitable_iters);
2952 if (dump_enabled_p ())
2953 dump_printf_loc (MSG_NOTE, vect_location,
2954 " Static estimate profitability threshold = %d\n",
2955 min_profitable_estimate);
2957 *ret_min_profitable_estimate = min_profitable_estimate;
2961 /* TODO: Close dependency between vect_model_*_cost and vectorizable_*
2962 functions. Design better to avoid maintenance issues. */
2964 /* Function vect_model_reduction_cost.
2966 Models cost for a reduction operation, including the vector ops
2967 generated within the strip-mine loop, the initial definition before
2968 the loop, and the epilogue code that must be generated. */
2970 static bool
2971 vect_model_reduction_cost (stmt_vec_info stmt_info, enum tree_code reduc_code,
2972 int ncopies)
2974 int prologue_cost = 0, epilogue_cost = 0;
2975 enum tree_code code;
2976 optab optab;
2977 tree vectype;
2978 gimple stmt, orig_stmt;
2979 tree reduction_op;
2980 enum machine_mode mode;
2981 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2982 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2983 void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
2985 /* Cost of reduction op inside loop. */
2986 unsigned inside_cost = add_stmt_cost (target_cost_data, ncopies, vector_stmt,
2987 stmt_info, 0, vect_body);
2988 stmt = STMT_VINFO_STMT (stmt_info);
2990 switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
2992 case GIMPLE_SINGLE_RHS:
2993 gcc_assert (TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt)) == ternary_op);
2994 reduction_op = TREE_OPERAND (gimple_assign_rhs1 (stmt), 2);
2995 break;
2996 case GIMPLE_UNARY_RHS:
2997 reduction_op = gimple_assign_rhs1 (stmt);
2998 break;
2999 case GIMPLE_BINARY_RHS:
3000 reduction_op = gimple_assign_rhs2 (stmt);
3001 break;
3002 case GIMPLE_TERNARY_RHS:
3003 reduction_op = gimple_assign_rhs3 (stmt);
3004 break;
3005 default:
3006 gcc_unreachable ();
3009 vectype = get_vectype_for_scalar_type (TREE_TYPE (reduction_op));
3010 if (!vectype)
3012 if (dump_enabled_p ())
3014 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3015 "unsupported data-type ");
3016 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
3017 TREE_TYPE (reduction_op));
3019 return false;
3022 mode = TYPE_MODE (vectype);
3023 orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
3025 if (!orig_stmt)
3026 orig_stmt = STMT_VINFO_STMT (stmt_info);
3028 code = gimple_assign_rhs_code (orig_stmt);
3030 /* Add in cost for initial definition. */
3031 prologue_cost += add_stmt_cost (target_cost_data, 1, scalar_to_vec,
3032 stmt_info, 0, vect_prologue);
3034 /* Determine cost of epilogue code.
3036 We have a reduction operator that will reduce the vector in one statement.
3037 Also requires scalar extract. */
3039 if (!nested_in_vect_loop_p (loop, orig_stmt))
3041 if (reduc_code != ERROR_MARK)
3043 epilogue_cost += add_stmt_cost (target_cost_data, 1, vector_stmt,
3044 stmt_info, 0, vect_epilogue);
3045 epilogue_cost += add_stmt_cost (target_cost_data, 1, vec_to_scalar,
3046 stmt_info, 0, vect_epilogue);
3048 else
3050 int vec_size_in_bits = tree_low_cst (TYPE_SIZE (vectype), 1);
3051 tree bitsize =
3052 TYPE_SIZE (TREE_TYPE (gimple_assign_lhs (orig_stmt)));
3053 int element_bitsize = tree_low_cst (bitsize, 1);
3054 int nelements = vec_size_in_bits / element_bitsize;
3056 optab = optab_for_tree_code (code, vectype, optab_default);
3058 /* We have a whole vector shift available. */
3059 if (VECTOR_MODE_P (mode)
3060 && optab_handler (optab, mode) != CODE_FOR_nothing
3061 && optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
3063 /* Final reduction via vector shifts and the reduction operator.
3064 Also requires scalar extract. */
3065 epilogue_cost += add_stmt_cost (target_cost_data,
3066 exact_log2 (nelements) * 2,
3067 vector_stmt, stmt_info, 0,
3068 vect_epilogue);
3069 epilogue_cost += add_stmt_cost (target_cost_data, 1,
3070 vec_to_scalar, stmt_info, 0,
3071 vect_epilogue);
3073 else
3074 /* Use extracts and reduction op for final reduction. For N
3075 elements, we have N extracts and N-1 reduction ops. */
3076 epilogue_cost += add_stmt_cost (target_cost_data,
3077 nelements + nelements - 1,
3078 vector_stmt, stmt_info, 0,
3079 vect_epilogue);
3083 if (dump_enabled_p ())
3084 dump_printf (MSG_NOTE,
3085 "vect_model_reduction_cost: inside_cost = %d, "
3086 "prologue_cost = %d, epilogue_cost = %d .", inside_cost,
3087 prologue_cost, epilogue_cost);
3089 return true;
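/* Illustrative sketch (hypothetical configuration): for a V4SI sum reduction
   with NCOPIES == 1, not nested in an outer loop, and with REDUC_CODE !=
   ERROR_MARK, the cost recorded above is one vector_stmt in the loop body,
   one scalar_to_vec in the prologue for the initial definition, and one
   vector_stmt plus one vec_to_scalar in the epilogue.  If the target lacked
   the reduction optab but had a whole-vector shift, the epilogue would
   instead cost 2 * log2(4) vector statements plus the final extract.  */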
3093 /* Function vect_model_induction_cost.
3095 Models cost for induction operations. */
3097 static void
3098 vect_model_induction_cost (stmt_vec_info stmt_info, int ncopies)
3100 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3101 void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3102 unsigned inside_cost, prologue_cost;
3104 /* loop cost for vec_loop. */
3105 inside_cost = add_stmt_cost (target_cost_data, ncopies, vector_stmt,
3106 stmt_info, 0, vect_body);
3108 /* prologue cost for vec_init and vec_step. */
3109 prologue_cost = add_stmt_cost (target_cost_data, 2, scalar_to_vec,
3110 stmt_info, 0, vect_prologue);
3112 if (dump_enabled_p ())
3113 dump_printf_loc (MSG_NOTE, vect_location,
3114 "vect_model_induction_cost: inside_cost = %d, "
3115 "prologue_cost = %d .", inside_cost, prologue_cost);
3119 /* Function get_initial_def_for_induction
3121 Input:
3122 STMT - a stmt that performs an induction operation in the loop.
3123 IV_PHI - the initial value of the induction variable
3125 Output:
3126 Return a vector variable, initialized with the first VF values of
3127 the induction variable. E.g., for an iv with IV_PHI='X' and
3128 evolution S, for a vector of 4 units, we want to return:
3129 [X, X + S, X + 2*S, X + 3*S]. */
3131 static tree
3132 get_initial_def_for_induction (gimple iv_phi)
3134 stmt_vec_info stmt_vinfo = vinfo_for_stmt (iv_phi);
3135 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
3136 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3137 tree scalar_type;
3138 tree vectype;
3139 int nunits;
3140 edge pe = loop_preheader_edge (loop);
3141 struct loop *iv_loop;
3142 basic_block new_bb;
3143 tree vec, vec_init, vec_step, t;
3144 tree access_fn;
3145 tree new_var;
3146 tree new_name;
3147 gimple init_stmt, induction_phi, new_stmt;
3148 tree induc_def, vec_def, vec_dest;
3149 tree init_expr, step_expr;
3150 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3151 int i;
3152 bool ok;
3153 int ncopies;
3154 tree expr;
3155 stmt_vec_info phi_info = vinfo_for_stmt (iv_phi);
3156 bool nested_in_vect_loop = false;
3157 gimple_seq stmts = NULL;
3158 imm_use_iterator imm_iter;
3159 use_operand_p use_p;
3160 gimple exit_phi;
3161 edge latch_e;
3162 tree loop_arg;
3163 gimple_stmt_iterator si;
3164 basic_block bb = gimple_bb (iv_phi);
3165 tree stepvectype;
3166 tree resvectype;
3168 /* Is phi in an inner-loop, while vectorizing an enclosing outer-loop? */
3169 if (nested_in_vect_loop_p (loop, iv_phi))
3171 nested_in_vect_loop = true;
3172 iv_loop = loop->inner;
3174 else
3175 iv_loop = loop;
3176 gcc_assert (iv_loop == (gimple_bb (iv_phi))->loop_father);
3178 latch_e = loop_latch_edge (iv_loop);
3179 loop_arg = PHI_ARG_DEF_FROM_EDGE (iv_phi, latch_e);
3181 access_fn = analyze_scalar_evolution (iv_loop, PHI_RESULT (iv_phi));
3182 gcc_assert (access_fn);
3183 STRIP_NOPS (access_fn);
3184 ok = vect_is_simple_iv_evolution (iv_loop->num, access_fn,
3185 &init_expr, &step_expr);
3186 gcc_assert (ok);
3187 pe = loop_preheader_edge (iv_loop);
3189 scalar_type = TREE_TYPE (init_expr);
3190 vectype = get_vectype_for_scalar_type (scalar_type);
3191 resvectype = get_vectype_for_scalar_type (TREE_TYPE (PHI_RESULT (iv_phi)));
3192 gcc_assert (vectype);
3193 nunits = TYPE_VECTOR_SUBPARTS (vectype);
3194 ncopies = vf / nunits;
3196 gcc_assert (phi_info);
3197 gcc_assert (ncopies >= 1);
3199 /* Find the first insertion point in the BB. */
3200 si = gsi_after_labels (bb);
3202 /* Create the vector that holds the initial_value of the induction. */
3203 if (nested_in_vect_loop)
3205 /* iv_loop is nested in the loop to be vectorized. init_expr had already
3206 been created during vectorization of previous stmts. We obtain it
3207 from the STMT_VINFO_VEC_STMT of the defining stmt. */
3208 tree iv_def = PHI_ARG_DEF_FROM_EDGE (iv_phi,
3209 loop_preheader_edge (iv_loop));
3210 vec_init = vect_get_vec_def_for_operand (iv_def, iv_phi, NULL);
3212 else
3214 VEC(constructor_elt,gc) *v;
3216 /* iv_loop is the loop to be vectorized. Create:
3217 vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr) */
3218 new_var = vect_get_new_vect_var (scalar_type, vect_scalar_var, "var_");
3219 new_name = force_gimple_operand (init_expr, &stmts, false, new_var);
3220 if (stmts)
3222 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
3223 gcc_assert (!new_bb);
3226 v = VEC_alloc (constructor_elt, gc, nunits);
3227 CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, new_name);
3228 for (i = 1; i < nunits; i++)
3230 /* Create: new_name_i = new_name + step_expr */
3231 enum tree_code code = POINTER_TYPE_P (scalar_type)
3232 ? POINTER_PLUS_EXPR : PLUS_EXPR;
3233 init_stmt = gimple_build_assign_with_ops (code, new_var,
3234 new_name, step_expr);
3235 new_name = make_ssa_name (new_var, init_stmt);
3236 gimple_assign_set_lhs (init_stmt, new_name);
3238 new_bb = gsi_insert_on_edge_immediate (pe, init_stmt);
3239 gcc_assert (!new_bb);
3241 if (dump_enabled_p ())
3243 dump_printf_loc (MSG_NOTE, vect_location,
3244 "created new init_stmt: ");
3245 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, init_stmt, 0);
3247 CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, new_name);
3249 /* Create a vector from [new_name_0, new_name_1, ..., new_name_nunits-1] */
3250 vec = build_constructor (vectype, v);
3251 vec_init = vect_init_vector (iv_phi, vec, vectype, NULL);
3255 /* Create the vector that holds the step of the induction. */
3256 if (nested_in_vect_loop)
3257 /* iv_loop is nested in the loop to be vectorized. Generate:
3258 vec_step = [S, S, S, S] */
3259 new_name = step_expr;
3260 else
3262 /* iv_loop is the loop to be vectorized. Generate:
3263 vec_step = [VF*S, VF*S, VF*S, VF*S] */
3264 expr = build_int_cst (TREE_TYPE (step_expr), vf);
3265 new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
3266 expr, step_expr);
3269 t = unshare_expr (new_name);
3270 gcc_assert (CONSTANT_CLASS_P (new_name));
3271 stepvectype = get_vectype_for_scalar_type (TREE_TYPE (new_name));
3272 gcc_assert (stepvectype);
3273 vec = build_vector_from_val (stepvectype, t);
3274 vec_step = vect_init_vector (iv_phi, vec, stepvectype, NULL);
3277 /* Create the following def-use cycle:
3278 loop prolog:
3279 vec_init = ...
3280 vec_step = ...
3281 loop:
3282 vec_iv = PHI <vec_init, vec_loop>
3284 STMT
3286 vec_loop = vec_iv + vec_step; */
3288 /* Create the induction-phi that defines the induction-operand. */
3289 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
3290 induction_phi = create_phi_node (vec_dest, iv_loop->header);
3291 set_vinfo_for_stmt (induction_phi,
3292 new_stmt_vec_info (induction_phi, loop_vinfo, NULL));
3293 induc_def = PHI_RESULT (induction_phi);
3295 /* Create the iv update inside the loop */
3296 new_stmt = gimple_build_assign_with_ops (PLUS_EXPR, vec_dest,
3297 induc_def, vec_step);
3298 vec_def = make_ssa_name (vec_dest, new_stmt);
3299 gimple_assign_set_lhs (new_stmt, vec_def);
3300 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
3301 set_vinfo_for_stmt (new_stmt, new_stmt_vec_info (new_stmt, loop_vinfo,
3302 NULL));
3304 /* Set the arguments of the phi node: */
3305 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
3306 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
3307 UNKNOWN_LOCATION);
3310 /* In case that vectorization factor (VF) is bigger than the number
3311 of elements that we can fit in a vectype (nunits), we have to generate
3312 more than one vector stmt - i.e - we need to "unroll" the
3313 vector stmt by a factor VF/nunits. For more details see documentation
3314 in vectorizable_operation. */
3316 if (ncopies > 1)
3318 stmt_vec_info prev_stmt_vinfo;
3319 /* FORNOW. This restriction should be relaxed. */
3320 gcc_assert (!nested_in_vect_loop);
3322 /* Create the vector that holds the step of the induction. */
3323 expr = build_int_cst (TREE_TYPE (step_expr), nunits);
3324 new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
3325 expr, step_expr);
3326 t = unshare_expr (new_name);
3327 gcc_assert (CONSTANT_CLASS_P (new_name));
3328 vec = build_vector_from_val (stepvectype, t);
3329 vec_step = vect_init_vector (iv_phi, vec, stepvectype, NULL);
3331 vec_def = induc_def;
3332 prev_stmt_vinfo = vinfo_for_stmt (induction_phi);
3333 for (i = 1; i < ncopies; i++)
3335 /* vec_i = vec_prev + vec_step */
3336 new_stmt = gimple_build_assign_with_ops (PLUS_EXPR, vec_dest,
3337 vec_def, vec_step);
3338 vec_def = make_ssa_name (vec_dest, new_stmt);
3339 gimple_assign_set_lhs (new_stmt, vec_def);
3341 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
3342 if (!useless_type_conversion_p (resvectype, vectype))
3344 new_stmt = gimple_build_assign_with_ops
3345 (VIEW_CONVERT_EXPR,
3346 vect_get_new_vect_var (resvectype, vect_simple_var,
3347 "vec_iv_"),
3348 build1 (VIEW_CONVERT_EXPR, resvectype,
3349 gimple_assign_lhs (new_stmt)), NULL_TREE);
3350 gimple_assign_set_lhs (new_stmt,
3351 make_ssa_name
3352 (gimple_assign_lhs (new_stmt), new_stmt));
3353 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
3355 set_vinfo_for_stmt (new_stmt,
3356 new_stmt_vec_info (new_stmt, loop_vinfo, NULL));
3357 STMT_VINFO_RELATED_STMT (prev_stmt_vinfo) = new_stmt;
3358 prev_stmt_vinfo = vinfo_for_stmt (new_stmt);
3362 if (nested_in_vect_loop)
3364 /* Find the loop-closed exit-phi of the induction, and record
3365 the final vector of induction results: */
3366 exit_phi = NULL;
3367 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
3369 if (!flow_bb_inside_loop_p (iv_loop, gimple_bb (USE_STMT (use_p))))
3371 exit_phi = USE_STMT (use_p);
3372 break;
3375 if (exit_phi)
3377 stmt_vec_info stmt_vinfo = vinfo_for_stmt (exit_phi);
3378 /* FORNOW. Currently not supporting the case that an inner-loop induction
3379 is not used in the outer-loop (i.e. only outside the outer-loop). */
3380 gcc_assert (STMT_VINFO_RELEVANT_P (stmt_vinfo)
3381 && !STMT_VINFO_LIVE_P (stmt_vinfo));
3383 STMT_VINFO_VEC_STMT (stmt_vinfo) = new_stmt;
3384 if (dump_enabled_p ())
3386 dump_printf_loc (MSG_NOTE, vect_location,
3387 "vector of inductions after inner-loop:");
3388 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, new_stmt, 0);
3394 if (dump_enabled_p ())
3396 dump_printf_loc (MSG_NOTE, vect_location,
3397 "transform induction: created def-use cycle: ");
3398 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, induction_phi, 0);
3399 dump_printf (MSG_NOTE, "\n");
3400 dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
3401 SSA_NAME_DEF_STMT (vec_def), 0);
3404 STMT_VINFO_VEC_STMT (phi_info) = induction_phi;
3405 if (!useless_type_conversion_p (resvectype, vectype))
3407 new_stmt = gimple_build_assign_with_ops
3408 (VIEW_CONVERT_EXPR,
3409 vect_get_new_vect_var (resvectype, vect_simple_var, "vec_iv_"),
3410 build1 (VIEW_CONVERT_EXPR, resvectype, induc_def), NULL_TREE);
3411 induc_def = make_ssa_name (gimple_assign_lhs (new_stmt), new_stmt);
3412 gimple_assign_set_lhs (new_stmt, induc_def);
3413 si = gsi_start_bb (bb);
3414 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
3415 set_vinfo_for_stmt (new_stmt,
3416 new_stmt_vec_info (new_stmt, loop_vinfo, NULL));
3417 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (new_stmt))
3418 = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (induction_phi));
3421 return induc_def;
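/* Illustrative sketch (a hypothetical induction): for "i = i + 2" starting at
   3, with a vectorization factor of 4 and nunits == 4 (so NCOPIES == 1), the
   code above builds vec_init = {3, 5, 7, 9} in the preheader,
   vec_step = {8, 8, 8, 8} (VF * S = 4 * 2), and inside the loop

     vec_iv = PHI <vec_init, vec_loop>
     vec_loop = vec_iv + vec_step;  */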
3425 /* Function get_initial_def_for_reduction
3427 Input:
3428 STMT - a stmt that performs a reduction operation in the loop.
3429 INIT_VAL - the initial value of the reduction variable
3431 Output:
3432 ADJUSTMENT_DEF - a tree that holds a value to be added to the final result
3433 of the reduction (used for adjusting the epilog - see below).
3434 Return a vector variable, initialized according to the operation that STMT
3435 performs. This vector will be used as the initial value of the
3436 vector of partial results.
3438 Option1 (adjust in epilog): Initialize the vector as follows:
3439 add/bit or/xor: [0,0,...,0,0]
3440 mult/bit and: [1,1,...,1,1]
3441 min/max/cond_expr: [init_val,init_val,..,init_val,init_val]
3442 and when necessary (e.g. add/mult case) let the caller know
3443 that it needs to adjust the result by init_val.
3445 Option2: Initialize the vector as follows:
3446 add/bit or/xor: [init_val,0,0,...,0]
3447 mult/bit and: [init_val,1,1,...,1]
3448 min/max/cond_expr: [init_val,init_val,...,init_val]
3449 and no adjustments are needed.
3451 For example, for the following code:
3453 s = init_val;
3454 for (i=0;i<n;i++)
3455 s = s + a[i];
3457 STMT is 's = s + a[i]', and the reduction variable is 's'.
3458 For a vector of 4 units, we want to return either [0,0,0,init_val],
3459 or [0,0,0,0] and let the caller know that it needs to adjust
3460 the result at the end by 'init_val'.
3462 FORNOW, we use the 'adjust in epilog' scheme (Option1) when ADJUSTMENT_DEF
3463 is not NULL, because the initialization vector is then simpler (the same
3464 element in all entries); otherwise we use Option2.
3466 A cost model should help decide between these two schemes. */
3468 tree
3469 get_initial_def_for_reduction (gimple stmt, tree init_val,
3470 tree *adjustment_def)
3472 stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
3473 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
3474 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3475 tree scalar_type = TREE_TYPE (init_val);
3476 tree vectype = get_vectype_for_scalar_type (scalar_type);
3477 int nunits;
3478 enum tree_code code = gimple_assign_rhs_code (stmt);
3479 tree def_for_init;
3480 tree init_def;
3481 tree *elts;
3482 int i;
3483 bool nested_in_vect_loop = false;
3484 tree init_value;
3485 REAL_VALUE_TYPE real_init_val = dconst0;
3486 int int_init_val = 0;
3487 gimple def_stmt = NULL;
3489 gcc_assert (vectype);
3490 nunits = TYPE_VECTOR_SUBPARTS (vectype);
3492 gcc_assert (POINTER_TYPE_P (scalar_type) || INTEGRAL_TYPE_P (scalar_type)
3493 || SCALAR_FLOAT_TYPE_P (scalar_type));
3495 if (nested_in_vect_loop_p (loop, stmt))
3496 nested_in_vect_loop = true;
3497 else
3498 gcc_assert (loop == (gimple_bb (stmt))->loop_father);
3500 /* In case of double reduction we only create a vector variable to be put
3501 in the reduction phi node. The actual statement creation is done in
3502 vect_create_epilog_for_reduction. */
3503 if (adjustment_def && nested_in_vect_loop
3504 && TREE_CODE (init_val) == SSA_NAME
3505 && (def_stmt = SSA_NAME_DEF_STMT (init_val))
3506 && gimple_code (def_stmt) == GIMPLE_PHI
3507 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
3508 && vinfo_for_stmt (def_stmt)
3509 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
3510 == vect_double_reduction_def)
3512 *adjustment_def = NULL;
3513 return vect_create_destination_var (init_val, vectype);
3516 if (TREE_CONSTANT (init_val))
3518 if (SCALAR_FLOAT_TYPE_P (scalar_type))
3519 init_value = build_real (scalar_type, TREE_REAL_CST (init_val));
3520 else
3521 init_value = build_int_cst (scalar_type, TREE_INT_CST_LOW (init_val));
3523 else
3524 init_value = init_val;
3526 switch (code)
3528 case WIDEN_SUM_EXPR:
3529 case DOT_PROD_EXPR:
3530 case PLUS_EXPR:
3531 case MINUS_EXPR:
3532 case BIT_IOR_EXPR:
3533 case BIT_XOR_EXPR:
3534 case MULT_EXPR:
3535 case BIT_AND_EXPR:
3536 /* ADJUSTMENT_DEF is NULL when called from
3537 vect_create_epilog_for_reduction to vectorize double reduction. */
3538 if (adjustment_def)
3540 if (nested_in_vect_loop)
3541 *adjustment_def = vect_get_vec_def_for_operand (init_val, stmt,
3542 NULL);
3543 else
3544 *adjustment_def = init_val;
3547 if (code == MULT_EXPR)
3549 real_init_val = dconst1;
3550 int_init_val = 1;
3553 if (code == BIT_AND_EXPR)
3554 int_init_val = -1;
3556 if (SCALAR_FLOAT_TYPE_P (scalar_type))
3557 def_for_init = build_real (scalar_type, real_init_val);
3558 else
3559 def_for_init = build_int_cst (scalar_type, int_init_val);
3561 /* Create a vector of '0' or '1' except the first element. */
3562 elts = XALLOCAVEC (tree, nunits);
3563 for (i = nunits - 2; i >= 0; --i)
3564 elts[i + 1] = def_for_init;
3566 /* Option1: the first element is '0' or '1' as well. */
3567 if (adjustment_def)
3569 elts[0] = def_for_init;
3570 init_def = build_vector (vectype, elts);
3571 break;
3574 /* Option2: the first element is INIT_VAL. */
3575 elts[0] = init_val;
3576 if (TREE_CONSTANT (init_val))
3577 init_def = build_vector (vectype, elts);
3578 else
3580 VEC(constructor_elt,gc) *v;
3581 v = VEC_alloc (constructor_elt, gc, nunits);
3582 CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, init_val);
3583 for (i = 1; i < nunits; ++i)
3584 CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, elts[i]);
3585 init_def = build_constructor (vectype, v);
3588 break;
3590 case MIN_EXPR:
3591 case MAX_EXPR:
3592 case COND_EXPR:
3593 if (adjustment_def)
3595 *adjustment_def = NULL_TREE;
3596 init_def = vect_get_vec_def_for_operand (init_val, stmt, NULL);
3597 break;
3600 init_def = build_vector_from_val (vectype, init_value);
3601 break;
3603 default:
3604 gcc_unreachable ();
3607 return init_def;
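/* Illustrative sketch (hypothetical values): for "s = s + a[i]" with
   init_val == 5 and a 4-element vector type, Option1 (ADJUSTMENT_DEF
   non-NULL) returns {0, 0, 0, 0} and asks the caller to add 5 to the final
   scalar result, while Option2 returns {5, 0, 0, 0} and needs no adjustment.
   For a MULT_EXPR reduction the neutral element 1 is used instead of 0, and
   for MIN/MAX/COND_EXPR the whole vector is filled with init_val.  */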
3611 /* Function vect_create_epilog_for_reduction
3613 Create code at the loop-epilog to finalize the result of a reduction
3614 computation.
3616 VECT_DEFS is list of vector of partial results, i.e., the lhs's of vector
3617 reduction statements.
3618 STMT is the scalar reduction stmt that is being vectorized.
3619 NCOPIES is > 1 in case the vectorization factor (VF) is bigger than the
3620 number of elements that we can fit in a vectype (nunits). In this case
3621 we have to generate more than one vector stmt - i.e - we need to "unroll"
3622 the vector stmt by a factor VF/nunits. For more details see documentation
3623 in vectorizable_operation.
3624 REDUC_CODE is the tree-code for the epilog reduction.
3625 REDUCTION_PHIS is a list of the phi-nodes that carry the reduction
3626 computation.
3627 REDUC_INDEX is the index of the operand in the right hand side of the
3628 statement that is defined by REDUCTION_PHI.
3629 DOUBLE_REDUC is TRUE if double reduction phi nodes should be handled.
3630 SLP_NODE is an SLP node containing a group of reduction statements. The
3631 first one in this group is STMT.
3633 This function:
3634 1. Creates the reduction def-use cycles: sets the arguments for
3635 REDUCTION_PHIS:
3636 The loop-entry argument is the vectorized initial-value of the reduction.
3637 The loop-latch argument is taken from VECT_DEFS - the vector of partial
3638 sums.
3639 2. "Reduces" each vector of partial results VECT_DEFS into a single result,
3640 by applying the operation specified by REDUC_CODE if available, or by
3641 other means (whole-vector shifts or a scalar loop).
3642 The function also creates a new phi node at the loop exit to preserve
3643 loop-closed form, as illustrated below.
3645 The flow at the entry to this function:
3647 loop:
3648 vec_def = phi <null, null> # REDUCTION_PHI
3649 VECT_DEF = vector_stmt # vectorized form of STMT
3650 s_loop = scalar_stmt # (scalar) STMT
3651 loop_exit:
3652 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
3653 use <s_out0>
3654 use <s_out0>
3656 The above is transformed by this function into:
3658 loop:
3659 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
3660 VECT_DEF = vector_stmt # vectorized form of STMT
3661 s_loop = scalar_stmt # (scalar) STMT
3662 loop_exit:
3663 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
3664 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
3665 v_out2 = reduce <v_out1>
3666 s_out3 = extract_field <v_out2, 0>
3667 s_out4 = adjust_result <s_out3>
3668 use <s_out4>
3669 use <s_out4>
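/* For illustration only (example names, assuming a 4-element vector type):
   for the scalar reduction

     int sum = 0;
     for (i = 0; i < N; i++)
       sum += a[i];

   the vectorized loop leaves four partial sums in the lanes of VECT_DEF.
   The epilog built below combines them into a single scalar, conceptually:

     int partial[4];    // the lanes of v_out1 at the loop exit
     int s = partial[0] + partial[1] + partial[2] + partial[3];
                        // v_out2 = reduce, s_out3 = extract_field

   possibly followed by an adjustment with the scalar initial value
   (s_out4 = adjust_result).  */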
3672 static void
3673 vect_create_epilog_for_reduction (VEC (tree, heap) *vect_defs, gimple stmt,
3674 int ncopies, enum tree_code reduc_code,
3675 VEC (gimple, heap) *reduction_phis,
3676 int reduc_index, bool double_reduc,
3677 slp_tree slp_node)
3679 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3680 stmt_vec_info prev_phi_info;
3681 tree vectype;
3682 enum machine_mode mode;
3683 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3684 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *outer_loop = NULL;
3685 basic_block exit_bb;
3686 tree scalar_dest;
3687 tree scalar_type;
3688 gimple new_phi = NULL, phi;
3689 gimple_stmt_iterator exit_gsi;
3690 tree vec_dest;
3691 tree new_temp = NULL_TREE, new_dest, new_name, new_scalar_dest;
3692 gimple epilog_stmt = NULL;
3693 enum tree_code code = gimple_assign_rhs_code (stmt);
3694 gimple exit_phi;
3695 tree bitsize, bitpos;
3696 tree adjustment_def = NULL;
3697 tree vec_initial_def = NULL;
3698 tree reduction_op, expr, def;
3699 tree orig_name, scalar_result;
3700 imm_use_iterator imm_iter, phi_imm_iter;
3701 use_operand_p use_p, phi_use_p;
3702 bool extract_scalar_result = false;
3703 gimple use_stmt, orig_stmt, reduction_phi = NULL;
3704 bool nested_in_vect_loop = false;
3705 VEC (gimple, heap) *new_phis = NULL;
3706 VEC (gimple, heap) *inner_phis = NULL;
3707 enum vect_def_type dt = vect_unknown_def_type;
3708 int j, i;
3709 VEC (tree, heap) *scalar_results = NULL;
3710 unsigned int group_size = 1, k, ratio;
3711 VEC (tree, heap) *vec_initial_defs = NULL;
3712 VEC (gimple, heap) *phis;
3713 bool slp_reduc = false;
3714 tree new_phi_result;
3715 gimple inner_phi = NULL;
3717 if (slp_node)
3718 group_size = VEC_length (gimple, SLP_TREE_SCALAR_STMTS (slp_node));
3720 if (nested_in_vect_loop_p (loop, stmt))
3722 outer_loop = loop;
3723 loop = loop->inner;
3724 nested_in_vect_loop = true;
3725 gcc_assert (!slp_node);
3728 switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
3730 case GIMPLE_SINGLE_RHS:
3731 gcc_assert (TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt))
3732 == ternary_op);
3733 reduction_op = TREE_OPERAND (gimple_assign_rhs1 (stmt), reduc_index);
3734 break;
3735 case GIMPLE_UNARY_RHS:
3736 reduction_op = gimple_assign_rhs1 (stmt);
3737 break;
3738 case GIMPLE_BINARY_RHS:
3739 reduction_op = reduc_index ?
3740 gimple_assign_rhs2 (stmt) : gimple_assign_rhs1 (stmt);
3741 break;
3742 case GIMPLE_TERNARY_RHS:
3743 reduction_op = gimple_op (stmt, reduc_index + 1);
3744 break;
3745 default:
3746 gcc_unreachable ();
3749 vectype = get_vectype_for_scalar_type (TREE_TYPE (reduction_op));
3750 gcc_assert (vectype);
3751 mode = TYPE_MODE (vectype);
3753 /* 1. Create the reduction def-use cycle:
3754 Set the arguments of REDUCTION_PHIS, i.e., transform
3756 loop:
3757 vec_def = phi <null, null> # REDUCTION_PHI
3758 VECT_DEF = vector_stmt # vectorized form of STMT
3761 into:
3763 loop:
3764 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
3765 VECT_DEF = vector_stmt # vectorized form of STMT
3768 (in case of SLP, do it for all the phis). */
3770 /* Get the loop-entry arguments. */
3771 if (slp_node)
3772 vect_get_vec_defs (reduction_op, NULL_TREE, stmt, &vec_initial_defs,
3773 NULL, slp_node, reduc_index);
3774 else
3776 vec_initial_defs = VEC_alloc (tree, heap, 1);
3777 /* For the case of reduction, vect_get_vec_def_for_operand returns
3778 the scalar def before the loop, that defines the initial value
3779 of the reduction variable. */
3780 vec_initial_def = vect_get_vec_def_for_operand (reduction_op, stmt,
3781 &adjustment_def);
3782 VEC_quick_push (tree, vec_initial_defs, vec_initial_def);
3785 /* Set the phi nodes' arguments. */
3786 FOR_EACH_VEC_ELT (gimple, reduction_phis, i, phi)
3788 tree vec_init_def = VEC_index (tree, vec_initial_defs, i);
3789 tree def = VEC_index (tree, vect_defs, i);
3790 for (j = 0; j < ncopies; j++)
3792 /* Set the loop-entry arg of the reduction-phi. */
3793 add_phi_arg (phi, vec_init_def, loop_preheader_edge (loop),
3794 UNKNOWN_LOCATION);
3796 /* Set the loop-latch arg for the reduction-phi. */
3797 if (j > 0)
3798 def = vect_get_vec_def_for_stmt_copy (vect_unknown_def_type, def);
3800 add_phi_arg (phi, def, loop_latch_edge (loop), UNKNOWN_LOCATION);
3802 if (dump_enabled_p ())
3804 dump_printf_loc (MSG_NOTE, vect_location,
3805 "transform reduction: created def-use cycle: ");
3806 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
3807 dump_printf (MSG_NOTE, "\n");
3808 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, SSA_NAME_DEF_STMT (def), 0);
3811 phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi));
3815 VEC_free (tree, heap, vec_initial_defs);
3817 /* 2. Create epilog code.
3818 The reduction epilog code operates across the elements of the vector
3819 of partial results computed by the vectorized loop.
3820 The reduction epilog code consists of:
3822 step 1: compute the scalar result in a vector (v_out2)
3823 step 2: extract the scalar result (s_out3) from the vector (v_out2)
3824 step 3: adjust the scalar result (s_out3) if needed.
3826 Step 1 can be accomplished using one of the following three schemes:
3827 (scheme 1) using reduc_code, if available.
3828 (scheme 2) using whole-vector shifts, if available.
3829 (scheme 3) using a scalar loop. In this case steps 1+2 above are
3830 combined.
3832 The overall epilog code looks like this:
3834 s_out0 = phi <s_loop> # original EXIT_PHI
3835 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
3836 v_out2 = reduce <v_out1> # step 1
3837 s_out3 = extract_field <v_out2, 0> # step 2
3838 s_out4 = adjust_result <s_out3> # step 3
3840 (step 3 is optional, and steps 1 and 2 may be combined).
3841 Lastly, the uses of s_out0 are replaced by s_out4. */
3844 /* 2.1 Create new loop-exit-phis to preserve loop-closed form:
3845 v_out1 = phi <VECT_DEF>
3846 Store them in NEW_PHIS. */
3848 exit_bb = single_exit (loop)->dest;
3849 prev_phi_info = NULL;
3850 new_phis = VEC_alloc (gimple, heap, VEC_length (tree, vect_defs));
3851 FOR_EACH_VEC_ELT (tree, vect_defs, i, def)
3853 for (j = 0; j < ncopies; j++)
3855 tree new_def = copy_ssa_name (def, NULL);
3856 phi = create_phi_node (new_def, exit_bb);
3857 set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, loop_vinfo, NULL));
3858 if (j == 0)
3859 VEC_quick_push (gimple, new_phis, phi);
3860 else
3862 def = vect_get_vec_def_for_stmt_copy (dt, def);
3863 STMT_VINFO_RELATED_STMT (prev_phi_info) = phi;
3866 SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, def);
3867 prev_phi_info = vinfo_for_stmt (phi);
3871 /* The epilogue is created for the outer-loop, i.e., for the loop being
3872 vectorized. Create exit phis for the outer loop. */
3873 if (double_reduc)
3875 loop = outer_loop;
3876 exit_bb = single_exit (loop)->dest;
3877 inner_phis = VEC_alloc (gimple, heap, VEC_length (tree, vect_defs));
3878 FOR_EACH_VEC_ELT (gimple, new_phis, i, phi)
3880 tree new_result = copy_ssa_name (PHI_RESULT (phi), NULL);
3881 gimple outer_phi = create_phi_node (new_result, exit_bb);
3882 SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx,
3883 PHI_RESULT (phi));
3884 set_vinfo_for_stmt (outer_phi, new_stmt_vec_info (outer_phi,
3885 loop_vinfo, NULL));
3886 VEC_quick_push (gimple, inner_phis, phi);
3887 VEC_replace (gimple, new_phis, i, outer_phi);
3888 prev_phi_info = vinfo_for_stmt (outer_phi);
3889 while (STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi)))
3891 phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi));
3892 new_result = copy_ssa_name (PHI_RESULT (phi), NULL);
3893 outer_phi = create_phi_node (new_result, exit_bb);
3894 SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx,
3895 PHI_RESULT (phi));
3896 set_vinfo_for_stmt (outer_phi, new_stmt_vec_info (outer_phi,
3897 loop_vinfo, NULL));
3898 STMT_VINFO_RELATED_STMT (prev_phi_info) = outer_phi;
3899 prev_phi_info = vinfo_for_stmt (outer_phi);
3904 exit_gsi = gsi_after_labels (exit_bb);
3906 /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
3907 (i.e. when reduc_code is not available) and in the final adjustment
3908 code (if needed). Also get the original scalar reduction variable as
3909 defined in the loop. In case STMT is a "pattern-stmt" (i.e. - it
3910 represents a reduction pattern), the tree-code and scalar-def are
3911 taken from the original stmt that the pattern-stmt (STMT) replaces.
3912 Otherwise (it is a regular reduction) - the tree-code and scalar-def
3913 are taken from STMT. */
3915 orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
3916 if (!orig_stmt)
3918 /* Regular reduction */
3919 orig_stmt = stmt;
3921 else
3923 /* Reduction pattern */
3924 stmt_vec_info stmt_vinfo = vinfo_for_stmt (orig_stmt);
3925 gcc_assert (STMT_VINFO_IN_PATTERN_P (stmt_vinfo));
3926 gcc_assert (STMT_VINFO_RELATED_STMT (stmt_vinfo) == stmt);
3929 code = gimple_assign_rhs_code (orig_stmt);
3930 /* For MINUS_EXPR the initial vector is [init_val,0,...,0], therefore,
3931 partial results are added and not subtracted. */
3932 if (code == MINUS_EXPR)
3933 code = PLUS_EXPR;
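/* Worked example (illustrative values only): with code == MINUS_EXPR,
   an initial value of 10 and loop elements {1,2,3,4}, the scalar loop
   computes 10 - 1 - 2 - 3 - 4 == 0.  Starting the vector accumulator at
   [10,0,0,0] makes the lanes hold 9, -2, -3 and -4 after one vector
   iteration, and combining the lanes with PLUS_EXPR in the epilog gives
   9 + (-2) + (-3) + (-4) == 0, matching the scalar result.  */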
3935 scalar_dest = gimple_assign_lhs (orig_stmt);
3936 scalar_type = TREE_TYPE (scalar_dest);
3937 scalar_results = VEC_alloc (tree, heap, group_size);
3938 new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
3939 bitsize = TYPE_SIZE (scalar_type);
3941 /* In case this is a reduction in an inner-loop while vectorizing an outer
3942 loop - we don't need to extract a single scalar result at the end of the
3943 inner-loop (unless it is double reduction, i.e., the use of reduction is
3944 outside the outer-loop). The final vector of partial results will be used
3945 in the vectorized outer-loop, or reduced to a scalar result at the end of
3946 the outer-loop. */
3947 if (nested_in_vect_loop && !double_reduc)
3948 goto vect_finalize_reduction;
3950 /* SLP reduction without reduction chain, e.g.,
3951 # a1 = phi <a2, a0>
3952 # b1 = phi <b2, b0>
3953 a2 = operation (a1)
3954 b2 = operation (b1) */
3955 slp_reduc = (slp_node && !GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)));
3957 /* In case of reduction chain, e.g.,
3958 # a1 = phi <a3, a0>
3959 a2 = operation (a1)
3960 a3 = operation (a2),
3962 we may end up with more than one vector result. Here we reduce them to
3963 one vector. */
3964 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
3966 tree first_vect = PHI_RESULT (VEC_index (gimple, new_phis, 0));
3967 tree tmp;
3968 gimple new_vec_stmt = NULL;
3970 vec_dest = vect_create_destination_var (scalar_dest, vectype);
3971 for (k = 1; k < VEC_length (gimple, new_phis); k++)
3973 gimple next_phi = VEC_index (gimple, new_phis, k);
3974 tree second_vect = PHI_RESULT (next_phi);
3976 tmp = build2 (code, vectype, first_vect, second_vect);
3977 new_vec_stmt = gimple_build_assign (vec_dest, tmp);
3978 first_vect = make_ssa_name (vec_dest, new_vec_stmt);
3979 gimple_assign_set_lhs (new_vec_stmt, first_vect);
3980 gsi_insert_before (&exit_gsi, new_vec_stmt, GSI_SAME_STMT);
3983 new_phi_result = first_vect;
3984 if (new_vec_stmt)
3986 VEC_truncate (gimple, new_phis, 0);
3987 VEC_safe_push (gimple, heap, new_phis, new_vec_stmt);
3990 else
3991 new_phi_result = PHI_RESULT (VEC_index (gimple, new_phis, 0));
3993 /* 2.3 Create the reduction code, using one of the three schemes described
3994 above. In SLP we simply need to extract all the elements from the
3995 vector (without reducing them), so we use scalar shifts. */
3996 if (reduc_code != ERROR_MARK && !slp_reduc)
3998 tree tmp;
4000 /*** Case 1: Create:
4001 v_out2 = reduc_expr <v_out1> */
4003 if (dump_enabled_p ())
4004 dump_printf_loc (MSG_NOTE, vect_location,
4005 "Reduce using direct vector reduction.");
4007 vec_dest = vect_create_destination_var (scalar_dest, vectype);
4008 tmp = build1 (reduc_code, vectype, new_phi_result);
4009 epilog_stmt = gimple_build_assign (vec_dest, tmp);
4010 new_temp = make_ssa_name (vec_dest, epilog_stmt);
4011 gimple_assign_set_lhs (epilog_stmt, new_temp);
4012 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4014 extract_scalar_result = true;
4016 else
4018 enum tree_code shift_code = ERROR_MARK;
4019 bool have_whole_vector_shift = true;
4020 int bit_offset;
4021 int element_bitsize = tree_low_cst (bitsize, 1);
4022 int vec_size_in_bits = tree_low_cst (TYPE_SIZE (vectype), 1);
4023 tree vec_temp;
4025 if (optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
4026 shift_code = VEC_RSHIFT_EXPR;
4027 else
4028 have_whole_vector_shift = false;
4030 /* Regardless of whether we have a whole vector shift, if we're
4031 emulating the operation via tree-vect-generic, we don't want
4032 to use it. Only the first round of the reduction is likely
4033 to still be profitable via emulation. */
4034 /* ??? It might be better to emit a reduction tree code here, so that
4035 tree-vect-generic can expand the first round via bit tricks. */
4036 if (!VECTOR_MODE_P (mode))
4037 have_whole_vector_shift = false;
4038 else
4040 optab optab = optab_for_tree_code (code, vectype, optab_default);
4041 if (optab_handler (optab, mode) == CODE_FOR_nothing)
4042 have_whole_vector_shift = false;
4045 if (have_whole_vector_shift && !slp_reduc)
4047 /*** Case 2: Create:
4048 for (offset = VS/2; offset >= element_size; offset/=2)
4050 Create: va' = vec_shift <va, offset>
4051 Create: va = vop <va, va'>
4052 } */
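/* Worked example (illustrative values, little-endian lane numbering):
   for a sum reduction over a 4-element integer vector va = {1,2,3,4}:

     va' = vec_shift <va, 2 elements>   // {3,4,0,0}
     va  = va + va'                     // {4,6,3,4}
     va' = vec_shift <va, 1 element>    // {6,3,4,0}
     va  = va + va'                     // {10,9,7,4}

   After log2(nunits) == 2 rounds, one lane (lane 0 here) holds the full
   sum 10; section 2.4 below extracts it at the position appropriate for
   the target's endianness.  */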
4054 if (dump_enabled_p ())
4055 dump_printf_loc (MSG_NOTE, vect_location,
4056 "Reduce using vector shifts");
4058 vec_dest = vect_create_destination_var (scalar_dest, vectype);
4059 new_temp = new_phi_result;
4060 for (bit_offset = vec_size_in_bits/2;
4061 bit_offset >= element_bitsize;
4062 bit_offset /= 2)
4064 tree bitpos = size_int (bit_offset);
4066 epilog_stmt = gimple_build_assign_with_ops (shift_code,
4067 vec_dest, new_temp, bitpos);
4068 new_name = make_ssa_name (vec_dest, epilog_stmt);
4069 gimple_assign_set_lhs (epilog_stmt, new_name);
4070 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4072 epilog_stmt = gimple_build_assign_with_ops (code, vec_dest,
4073 new_name, new_temp);
4074 new_temp = make_ssa_name (vec_dest, epilog_stmt);
4075 gimple_assign_set_lhs (epilog_stmt, new_temp);
4076 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4079 extract_scalar_result = true;
4081 else
4083 tree rhs;
4085 /*** Case 3: Create:
4086 s = extract_field <v_out2, 0>
4087 for (offset = element_size;
4088 offset < vector_size;
4089 offset += element_size)
4091 Create: s' = extract_field <v_out2, offset>
4092 Create: s = op <s, s'> // For non SLP cases
4093 } */
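/* Worked example (illustrative values only): for a sum reduction over a
   4-element vector v = {1,2,3,4} this scheme expands to straight-line
   scalar code:

     s  = v[0];        // extract_field <v_out2, 0>
     s += v[1];        // s = op <s, s'>
     s += v[2];
     s += v[3];        // s == 10

   while in the SLP case the four extracted values are simply kept
   separate in SCALAR_RESULTS instead of being combined.  */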
4095 if (dump_enabled_p ())
4096 dump_printf_loc (MSG_NOTE, vect_location,
4097 "Reduce using scalar code. ");
4099 vec_size_in_bits = tree_low_cst (TYPE_SIZE (vectype), 1);
4100 FOR_EACH_VEC_ELT (gimple, new_phis, i, new_phi)
4102 if (gimple_code (new_phi) == GIMPLE_PHI)
4103 vec_temp = PHI_RESULT (new_phi);
4104 else
4105 vec_temp = gimple_assign_lhs (new_phi);
4106 rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, bitsize,
4107 bitsize_zero_node);
4108 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
4109 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
4110 gimple_assign_set_lhs (epilog_stmt, new_temp);
4111 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4113 /* In SLP we don't need to apply the reduction operation, so we
4114 just collect the s' values in SCALAR_RESULTS. */
4115 if (slp_reduc)
4116 VEC_safe_push (tree, heap, scalar_results, new_temp);
4118 for (bit_offset = element_bitsize;
4119 bit_offset < vec_size_in_bits;
4120 bit_offset += element_bitsize)
4122 tree bitpos = bitsize_int (bit_offset);
4123 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp,
4124 bitsize, bitpos);
4126 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
4127 new_name = make_ssa_name (new_scalar_dest, epilog_stmt);
4128 gimple_assign_set_lhs (epilog_stmt, new_name);
4129 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4131 if (slp_reduc)
4133 /* In SLP we don't need to apply reduction operation, so
4134 we just collect s' values in SCALAR_RESULTS. */
4135 new_temp = new_name;
4136 VEC_safe_push (tree, heap, scalar_results, new_name);
4138 else
4140 epilog_stmt = gimple_build_assign_with_ops (code,
4141 new_scalar_dest, new_name, new_temp);
4142 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
4143 gimple_assign_set_lhs (epilog_stmt, new_temp);
4144 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4149 /* The only case where we need to reduce scalar results in SLP is
4150 unrolling. If the size of SCALAR_RESULTS is greater than
4151 GROUP_SIZE, we reduce them by combining elements modulo
4152 GROUP_SIZE. */
4153 if (slp_reduc)
4155 tree res, first_res, new_res;
4156 gimple new_stmt;
4158 /* Reduce multiple scalar results in case of SLP unrolling. */
4159 for (j = group_size; VEC_iterate (tree, scalar_results, j, res);
4160 j++)
4162 first_res = VEC_index (tree, scalar_results, j % group_size);
4163 new_stmt = gimple_build_assign_with_ops (code,
4164 new_scalar_dest, first_res, res);
4165 new_res = make_ssa_name (new_scalar_dest, new_stmt);
4166 gimple_assign_set_lhs (new_stmt, new_res);
4167 gsi_insert_before (&exit_gsi, new_stmt, GSI_SAME_STMT);
4168 VEC_replace (tree, scalar_results, j % group_size, new_res);
4171 else
4172 /* Not SLP - we have one scalar to keep in SCALAR_RESULTS. */
4173 VEC_safe_push (tree, heap, scalar_results, new_temp);
4175 extract_scalar_result = false;
4179 /* 2.4 Extract the final scalar result. Create:
4180 s_out3 = extract_field <v_out2, bitpos> */
4182 if (extract_scalar_result)
4184 tree rhs;
4186 if (dump_enabled_p ())
4187 dump_printf_loc (MSG_NOTE, vect_location,
4188 "extract scalar result");
4190 if (BYTES_BIG_ENDIAN)
4191 bitpos = size_binop (MULT_EXPR,
4192 bitsize_int (TYPE_VECTOR_SUBPARTS (vectype) - 1),
4193 TYPE_SIZE (scalar_type));
4194 else
4195 bitpos = bitsize_zero_node;
4197 rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp, bitsize, bitpos);
4198 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
4199 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
4200 gimple_assign_set_lhs (epilog_stmt, new_temp);
4201 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4202 VEC_safe_push (tree, heap, scalar_results, new_temp);
4205 vect_finalize_reduction:
4207 if (double_reduc)
4208 loop = loop->inner;
4210 /* 2.5 Adjust the final result by the initial value of the reduction
4211 variable. (When such adjustment is not needed, then
4212 'adjustment_def' is zero). For example, if code is PLUS we create:
4213 new_temp = loop_exit_def + adjustment_def */
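/* For illustration only (example values): for

     int sum = 10;
     for (i = 0; i < N; i++)
       sum += a[i];

   the vector accumulator may be started at {0,0,0,0}, with the scalar
   initial value 10 remembered in ADJUSTMENT_DEF; once the lanes have been
   reduced to LOOP_EXIT_DEF, the code below emits the equivalent of
   new_temp = loop_exit_def + 10 to restore the scalar semantics.  */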
4215 if (adjustment_def)
4217 gcc_assert (!slp_reduc);
4218 if (nested_in_vect_loop)
4220 new_phi = VEC_index (gimple, new_phis, 0);
4221 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) == VECTOR_TYPE);
4222 expr = build2 (code, vectype, PHI_RESULT (new_phi), adjustment_def);
4223 new_dest = vect_create_destination_var (scalar_dest, vectype);
4225 else
4227 new_temp = VEC_index (tree, scalar_results, 0);
4228 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
4229 expr = build2 (code, scalar_type, new_temp, adjustment_def);
4230 new_dest = vect_create_destination_var (scalar_dest, scalar_type);
4233 epilog_stmt = gimple_build_assign (new_dest, expr);
4234 new_temp = make_ssa_name (new_dest, epilog_stmt);
4235 gimple_assign_set_lhs (epilog_stmt, new_temp);
4236 SSA_NAME_DEF_STMT (new_temp) = epilog_stmt;
4237 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4238 if (nested_in_vect_loop)
4240 set_vinfo_for_stmt (epilog_stmt,
4241 new_stmt_vec_info (epilog_stmt, loop_vinfo,
4242 NULL));
4243 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (epilog_stmt)) =
4244 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (new_phi));
4246 if (!double_reduc)
4247 VEC_quick_push (tree, scalar_results, new_temp);
4248 else
4249 VEC_replace (tree, scalar_results, 0, new_temp);
4251 else
4252 VEC_replace (tree, scalar_results, 0, new_temp);
4254 VEC_replace (gimple, new_phis, 0, epilog_stmt);
4257 /* 2.6 Handle the loop-exit phis. Replace the uses of scalar loop-exit
4258 phis with new adjusted scalar results, i.e., replace use <s_out0>
4259 with use <s_out4>.
4261 Transform:
4262 loop_exit:
4263 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
4264 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
4265 v_out2 = reduce <v_out1>
4266 s_out3 = extract_field <v_out2, 0>
4267 s_out4 = adjust_result <s_out3>
4268 use <s_out0>
4269 use <s_out0>
4271 into:
4273 loop_exit:
4274 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
4275 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
4276 v_out2 = reduce <v_out1>
4277 s_out3 = extract_field <v_out2, 0>
4278 s_out4 = adjust_result <s_out3>
4279 use <s_out4>
4280 use <s_out4> */
4283 /* In an SLP reduction chain we reduce the vector results into one vector
4284 if necessary, hence we set GROUP_SIZE to 1 here. SCALAR_DEST is the LHS of
4285 the last stmt in the reduction chain, since we are looking for the loop
4286 exit phi node. */
4287 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
4289 scalar_dest = gimple_assign_lhs (VEC_index (gimple,
4290 SLP_TREE_SCALAR_STMTS (slp_node),
4291 group_size - 1));
4292 group_size = 1;
4295 /* In SLP we may have several statements in NEW_PHIS and REDUCTION_PHIS (in
4296 case GROUP_SIZE is greater than the vectorization factor). Therefore, we
4297 need to match SCALAR_RESULTS with corresponding statements. The first
4298 (GROUP_SIZE / number of new vector stmts) scalar results correspond to
4299 the first vector stmt, etc.
4300 (RATIO is equal to (GROUP_SIZE / number of new vector stmts)). */
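/* Worked example (illustrative numbers): with GROUP_SIZE == 4 and two
   vector stmts in NEW_PHIS, RATIO == 2, so scalar results 0 and 1 are
   matched with the first vector stmt and reduction phi (k / ratio == 0)
   and scalar results 2 and 3 with the second (k / ratio == 1).  */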
4301 if (group_size > VEC_length (gimple, new_phis))
4303 ratio = group_size / VEC_length (gimple, new_phis);
4304 gcc_assert (!(group_size % VEC_length (gimple, new_phis)));
4306 else
4307 ratio = 1;
4309 for (k = 0; k < group_size; k++)
4311 if (k % ratio == 0)
4313 epilog_stmt = VEC_index (gimple, new_phis, k / ratio);
4314 reduction_phi = VEC_index (gimple, reduction_phis, k / ratio);
4315 if (double_reduc)
4316 inner_phi = VEC_index (gimple, inner_phis, k / ratio);
4319 if (slp_reduc)
4321 gimple current_stmt = VEC_index (gimple,
4322 SLP_TREE_SCALAR_STMTS (slp_node), k);
4324 orig_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (current_stmt));
4325 /* SLP statements can't participate in patterns. */
4326 gcc_assert (!orig_stmt);
4327 scalar_dest = gimple_assign_lhs (current_stmt);
4330 phis = VEC_alloc (gimple, heap, 3);
4331 /* Find the loop-closed-use at the loop exit of the original scalar
4332 result. (The reduction result is expected to have two immediate uses -
4333 one at the latch block, and one at the loop exit). */
4334 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
4335 if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
4336 VEC_safe_push (gimple, heap, phis, USE_STMT (use_p));
4338 /* We expect to have found an exit_phi because of loop-closed-ssa
4339 form. */
4340 gcc_assert (!VEC_empty (gimple, phis));
4342 FOR_EACH_VEC_ELT (gimple, phis, i, exit_phi)
4344 if (outer_loop)
4346 stmt_vec_info exit_phi_vinfo = vinfo_for_stmt (exit_phi);
4347 gimple vect_phi;
4349 /* FORNOW. Currently not supporting the case that an inner-loop
4350 reduction is not used in the outer-loop (but only outside the
4351 outer-loop), unless it is a double reduction. */
4352 gcc_assert ((STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
4353 && !STMT_VINFO_LIVE_P (exit_phi_vinfo))
4354 || double_reduc);
4356 STMT_VINFO_VEC_STMT (exit_phi_vinfo) = epilog_stmt;
4357 if (!double_reduc
4358 || STMT_VINFO_DEF_TYPE (exit_phi_vinfo)
4359 != vect_double_reduction_def)
4360 continue;
4362 /* Handle double reduction:
4364 stmt1: s1 = phi <s0, s2> - double reduction phi (outer loop)
4365 stmt2: s3 = phi <s1, s4> - (regular) reduc phi (inner loop)
4366 stmt3: s4 = use (s3) - (regular) reduc stmt (inner loop)
4367 stmt4: s2 = phi <s4> - double reduction stmt (outer loop)
4369 At that point the regular reduction (stmt2 and stmt3) is
4370 already vectorized, as well as the exit phi node, stmt4.
4371 Here we vectorize the phi node of double reduction, stmt1, and
4372 update all relevant statements. */
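/* A source-level form of such a double reduction (illustrative example,
   with the outer loop being the one that is vectorized) is:

     int res = init;
     for (i = 0; i < N; i++)       // outer loop
       for (j = 0; j < M; j++)     // inner loop
         res += a[i][j];

   Here the inner-loop header phi of RES corresponds to stmt2, the
   inner-loop addition to stmt3, the loop-closed phi after the inner loop
   to stmt4, and the outer-loop header phi to stmt1, which is the one
   vectorized below.  */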
4374 /* Go through all the uses of s2 to find double reduction phi
4375 node, i.e., stmt1 above. */
4376 orig_name = PHI_RESULT (exit_phi);
4377 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
4379 stmt_vec_info use_stmt_vinfo;
4380 stmt_vec_info new_phi_vinfo;
4381 tree vect_phi_init, preheader_arg, vect_phi_res, init_def;
4382 basic_block bb = gimple_bb (use_stmt);
4383 gimple use;
4385 /* Check that USE_STMT is really double reduction phi
4386 node. */
4387 if (gimple_code (use_stmt) != GIMPLE_PHI
4388 || gimple_phi_num_args (use_stmt) != 2
4389 || bb->loop_father != outer_loop)
4390 continue;
4391 use_stmt_vinfo = vinfo_for_stmt (use_stmt);
4392 if (!use_stmt_vinfo
4393 || STMT_VINFO_DEF_TYPE (use_stmt_vinfo)
4394 != vect_double_reduction_def)
4395 continue;
4397 /* Create vector phi node for double reduction:
4398 vs1 = phi <vs0, vs2>
4399 vs1 was created previously in this function by a call to
4400 vect_get_vec_def_for_operand and is stored in
4401 vec_initial_def;
4402 vs2 is defined by INNER_PHI, the vectorized EXIT_PHI;
4403 vs0 is created here. */
4405 /* Create vector phi node. */
4406 vect_phi = create_phi_node (vec_initial_def, bb);
4407 new_phi_vinfo = new_stmt_vec_info (vect_phi,
4408 loop_vec_info_for_loop (outer_loop), NULL);
4409 set_vinfo_for_stmt (vect_phi, new_phi_vinfo);
4411 /* Create vs0 - initial def of the double reduction phi. */
4412 preheader_arg = PHI_ARG_DEF_FROM_EDGE (use_stmt,
4413 loop_preheader_edge (outer_loop));
4414 init_def = get_initial_def_for_reduction (stmt,
4415 preheader_arg, NULL);
4416 vect_phi_init = vect_init_vector (use_stmt, init_def,
4417 vectype, NULL);
4419 /* Update phi node arguments with vs0 and vs2. */
4420 add_phi_arg (vect_phi, vect_phi_init,
4421 loop_preheader_edge (outer_loop),
4422 UNKNOWN_LOCATION);
4423 add_phi_arg (vect_phi, PHI_RESULT (inner_phi),
4424 loop_latch_edge (outer_loop), UNKNOWN_LOCATION);
4425 if (dump_enabled_p ())
4427 dump_printf_loc (MSG_NOTE, vect_location,
4428 "created double reduction phi node: ");
4429 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, vect_phi, 0);
4432 vect_phi_res = PHI_RESULT (vect_phi);
4434 /* Replace the use, i.e., set the correct vs1 in the regular
4435 reduction phi node. FORNOW, NCOPIES is always 1, so the
4436 loop is redundant. */
4437 use = reduction_phi;
4438 for (j = 0; j < ncopies; j++)
4440 edge pr_edge = loop_preheader_edge (loop);
4441 SET_PHI_ARG_DEF (use, pr_edge->dest_idx, vect_phi_res);
4442 use = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (use));
4448 VEC_free (gimple, heap, phis);
4449 if (nested_in_vect_loop)
4451 if (double_reduc)
4452 loop = outer_loop;
4453 else
4454 continue;
4457 phis = VEC_alloc (gimple, heap, 3);
4458 /* Find the loop-closed-use at the loop exit of the original scalar
4459 result. (The reduction result is expected to have two immediate uses,
4460 one at the latch block, and one at the loop exit). For double
4461 reductions we are looking for exit phis of the outer loop. */
4462 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
4464 if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
4465 VEC_safe_push (gimple, heap, phis, USE_STMT (use_p));
4466 else
4468 if (double_reduc && gimple_code (USE_STMT (use_p)) == GIMPLE_PHI)
4470 tree phi_res = PHI_RESULT (USE_STMT (use_p));
4472 FOR_EACH_IMM_USE_FAST (phi_use_p, phi_imm_iter, phi_res)
4474 if (!flow_bb_inside_loop_p (loop,
4475 gimple_bb (USE_STMT (phi_use_p))))
4476 VEC_safe_push (gimple, heap, phis,
4477 USE_STMT (phi_use_p));
4483 FOR_EACH_VEC_ELT (gimple, phis, i, exit_phi)
4485 /* Replace the uses: */
4486 orig_name = PHI_RESULT (exit_phi);
4487 scalar_result = VEC_index (tree, scalar_results, k);
4488 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
4489 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
4490 SET_USE (use_p, scalar_result);
4493 VEC_free (gimple, heap, phis);
4496 VEC_free (tree, heap, scalar_results);
4497 VEC_free (gimple, heap, new_phis);
4501 /* Function vectorizable_reduction.
4503 Check if STMT performs a reduction operation that can be vectorized.
4504 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
4505 stmt to replace it, put it in VEC_STMT, and insert it at GSI.
4506 Return FALSE if not a vectorizable STMT, TRUE otherwise.
4508 This function also handles reduction idioms (patterns) that have been
4509 recognized in advance during vect_pattern_recog. In this case, STMT may be
4510 of this form:
4511 X = pattern_expr (arg0, arg1, ..., X)
4512 and its STMT_VINFO_RELATED_STMT points to the last stmt in the original
4513 sequence that had been detected and replaced by the pattern-stmt (STMT).
4515 In some cases of reduction patterns, the type of the reduction variable X is
4516 different than the type of the other arguments of STMT.
4517 In such cases, the vectype that is used when transforming STMT into a vector
4518 stmt is different than the vectype that is used to determine the
4519 vectorization factor, because it consists of a different number of elements
4520 than the actual number of elements that are being operated upon in parallel.
4522 For example, consider an accumulation of shorts into an int accumulator.
4523 On some targets it's possible to vectorize this pattern operating on 8
4524 shorts at a time (hence, the vectype for purposes of determining the
4525 vectorization factor should be V8HI); on the other hand, the vectype that
4526 is used to create the vector form is actually V4SI (the type of the result).
4528 Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
4529 indicates what is the actual level of parallelism (V8HI in the example), so
4530 that the right vectorization factor would be derived. This vectype
4531 corresponds to the type of arguments to the reduction stmt, and should *NOT*
4532 be used to create the vectorized stmt. The right vectype for the vectorized
4533 stmt is obtained from the type of the result X:
4534 get_vectype_for_scalar_type (TREE_TYPE (X))
4536 This means that, contrary to "regular" reductions (or "regular" stmts in
4537 general), the following equation:
4538 STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (TREE_TYPE (X))
4539 does *NOT* necessarily hold for reduction patterns. */
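/* A concrete source form of the example above (illustrative only, for a
   target with V8HI/V4SI vectors and a widening-summation pattern):

     short b[N];
     int sum = 0;
     for (i = 0; i < N; i++)
       sum += (int) b[i];

   STMT_VINFO_VECTYPE is V8HI (eight shorts are consumed per vector
   iteration, which fixes the vectorization factor), while the vectorized
   statement itself produces V4SI values, the vectype obtained from
   TREE_TYPE (sum).  */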
4541 bool
4542 vectorizable_reduction (gimple stmt, gimple_stmt_iterator *gsi,
4543 gimple *vec_stmt, slp_tree slp_node)
4545 tree vec_dest;
4546 tree scalar_dest;
4547 tree loop_vec_def0 = NULL_TREE, loop_vec_def1 = NULL_TREE;
4548 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4549 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
4550 tree vectype_in = NULL_TREE;
4551 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4552 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4553 enum tree_code code, orig_code, epilog_reduc_code;
4554 enum machine_mode vec_mode;
4555 int op_type;
4556 optab optab, reduc_optab;
4557 tree new_temp = NULL_TREE;
4558 tree def;
4559 gimple def_stmt;
4560 enum vect_def_type dt;
4561 gimple new_phi = NULL;
4562 tree scalar_type;
4563 bool is_simple_use;
4564 gimple orig_stmt;
4565 stmt_vec_info orig_stmt_info;
4566 tree expr = NULL_TREE;
4567 int i;
4568 int ncopies;
4569 int epilog_copies;
4570 stmt_vec_info prev_stmt_info, prev_phi_info;
4571 bool single_defuse_cycle = false;
4572 tree reduc_def = NULL_TREE;
4573 gimple new_stmt = NULL;
4574 int j;
4575 tree ops[3];
4576 bool nested_cycle = false, found_nested_cycle_def = false;
4577 gimple reduc_def_stmt = NULL;
4578 /* The default is that the reduction variable is the last operand in the statement. */
4579 int reduc_index = 2;
4580 bool double_reduc = false, dummy;
4581 basic_block def_bb;
4582 struct loop * def_stmt_loop, *outer_loop = NULL;
4583 tree def_arg;
4584 gimple def_arg_stmt;
4585 VEC (tree, heap) *vec_oprnds0 = NULL, *vec_oprnds1 = NULL, *vect_defs = NULL;
4586 VEC (gimple, heap) *phis = NULL;
4587 int vec_num;
4588 tree def0, def1, tem, op0, op1 = NULL_TREE;
4590 /* In case of reduction chain we switch to the first stmt in the chain, but
4591 we don't update STMT_INFO, since only the last stmt is marked as reduction
4592 and has reduction properties. */
4593 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
4594 stmt = GROUP_FIRST_ELEMENT (stmt_info);
4596 if (nested_in_vect_loop_p (loop, stmt))
4598 outer_loop = loop;
4599 loop = loop->inner;
4600 nested_cycle = true;
4603 /* 1. Is vectorizable reduction? */
4604 /* Not supportable if the reduction variable is used in the loop, unless
4605 it's a reduction chain. */
4606 if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer
4607 && !GROUP_FIRST_ELEMENT (stmt_info))
4608 return false;
4610 /* Reductions that are not used even in an enclosing outer-loop
4611 are expected to be "live" (used out of the loop). */
4612 if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope
4613 && !STMT_VINFO_LIVE_P (stmt_info))
4614 return false;
4616 /* Make sure it was already recognized as a reduction computation. */
4617 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def
4618 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_nested_cycle)
4619 return false;
4621 /* 2. Has this been recognized as a reduction pattern?
4623 Check if STMT represents a pattern that has been recognized
4624 in earlier analysis stages. For stmts that represent a pattern,
4625 the STMT_VINFO_RELATED_STMT field records the last stmt in
4626 the original sequence that constitutes the pattern. */
4628 orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
4629 if (orig_stmt)
4631 orig_stmt_info = vinfo_for_stmt (orig_stmt);
4632 gcc_assert (STMT_VINFO_RELATED_STMT (orig_stmt_info) == stmt);
4633 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
4634 gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
4637 /* 3. Check the operands of the operation. The first operands are defined
4638 inside the loop body. The last operand is the reduction variable,
4639 which is defined by the loop-header-phi. */
4641 gcc_assert (is_gimple_assign (stmt));
4643 /* Flatten RHS. */
4644 switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
4646 case GIMPLE_SINGLE_RHS:
4647 op_type = TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt));
4648 if (op_type == ternary_op)
4650 tree rhs = gimple_assign_rhs1 (stmt);
4651 ops[0] = TREE_OPERAND (rhs, 0);
4652 ops[1] = TREE_OPERAND (rhs, 1);
4653 ops[2] = TREE_OPERAND (rhs, 2);
4654 code = TREE_CODE (rhs);
4656 else
4657 return false;
4658 break;
4660 case GIMPLE_BINARY_RHS:
4661 code = gimple_assign_rhs_code (stmt);
4662 op_type = TREE_CODE_LENGTH (code);
4663 gcc_assert (op_type == binary_op);
4664 ops[0] = gimple_assign_rhs1 (stmt);
4665 ops[1] = gimple_assign_rhs2 (stmt);
4666 break;
4668 case GIMPLE_TERNARY_RHS:
4669 code = gimple_assign_rhs_code (stmt);
4670 op_type = TREE_CODE_LENGTH (code);
4671 gcc_assert (op_type == ternary_op);
4672 ops[0] = gimple_assign_rhs1 (stmt);
4673 ops[1] = gimple_assign_rhs2 (stmt);
4674 ops[2] = gimple_assign_rhs3 (stmt);
4675 break;
4677 case GIMPLE_UNARY_RHS:
4678 return false;
4680 default:
4681 gcc_unreachable ();
4684 if (code == COND_EXPR && slp_node)
4685 return false;
4687 scalar_dest = gimple_assign_lhs (stmt);
4688 scalar_type = TREE_TYPE (scalar_dest);
4689 if (!POINTER_TYPE_P (scalar_type) && !INTEGRAL_TYPE_P (scalar_type)
4690 && !SCALAR_FLOAT_TYPE_P (scalar_type))
4691 return false;
4693 /* Do not try to vectorize bit-precision reductions. */
4694 if ((TYPE_PRECISION (scalar_type)
4695 != GET_MODE_PRECISION (TYPE_MODE (scalar_type))))
4696 return false;
4698 /* All uses but the last are expected to be defined in the loop.
4699 The last use is the reduction variable. In case of nested cycle this
4700 assumption is not true: we use reduc_index to record the index of the
4701 reduction variable. */
4702 for (i = 0; i < op_type-1; i++)
4704 /* The condition of COND_EXPR is checked in vectorizable_condition(). */
4705 if (i == 0 && code == COND_EXPR)
4706 continue;
4708 is_simple_use = vect_is_simple_use_1 (ops[i], stmt, loop_vinfo, NULL,
4709 &def_stmt, &def, &dt, &tem);
4710 if (!vectype_in)
4711 vectype_in = tem;
4712 gcc_assert (is_simple_use);
4714 if (dt != vect_internal_def
4715 && dt != vect_external_def
4716 && dt != vect_constant_def
4717 && dt != vect_induction_def
4718 && !(dt == vect_nested_cycle && nested_cycle))
4719 return false;
4721 if (dt == vect_nested_cycle)
4723 found_nested_cycle_def = true;
4724 reduc_def_stmt = def_stmt;
4725 reduc_index = i;
4729 is_simple_use = vect_is_simple_use_1 (ops[i], stmt, loop_vinfo, NULL,
4730 &def_stmt, &def, &dt, &tem);
4731 if (!vectype_in)
4732 vectype_in = tem;
4733 gcc_assert (is_simple_use);
4734 gcc_assert (dt == vect_reduction_def
4735 || dt == vect_nested_cycle
4736 || ((dt == vect_internal_def || dt == vect_external_def
4737 || dt == vect_constant_def || dt == vect_induction_def)
4738 && nested_cycle && found_nested_cycle_def));
4739 if (!found_nested_cycle_def)
4740 reduc_def_stmt = def_stmt;
4742 gcc_assert (gimple_code (reduc_def_stmt) == GIMPLE_PHI);
4743 if (orig_stmt)
4744 gcc_assert (orig_stmt == vect_is_simple_reduction (loop_vinfo,
4745 reduc_def_stmt,
4746 !nested_cycle,
4747 &dummy));
4748 else
4750 gimple tmp = vect_is_simple_reduction (loop_vinfo, reduc_def_stmt,
4751 !nested_cycle, &dummy);
4752 /* We changed STMT to be the first stmt in reduction chain, hence we
4753 check that in this case the first element in the chain is STMT. */
4754 gcc_assert (stmt == tmp
4755 || GROUP_FIRST_ELEMENT (vinfo_for_stmt (tmp)) == stmt);
4758 if (STMT_VINFO_LIVE_P (vinfo_for_stmt (reduc_def_stmt)))
4759 return false;
4761 if (slp_node || PURE_SLP_STMT (stmt_info))
4762 ncopies = 1;
4763 else
4764 ncopies = (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
4765 / TYPE_VECTOR_SUBPARTS (vectype_in));
4767 gcc_assert (ncopies >= 1);
4769 vec_mode = TYPE_MODE (vectype_in);
4771 if (code == COND_EXPR)
4773 if (!vectorizable_condition (stmt, gsi, NULL, ops[reduc_index], 0, NULL))
4775 if (dump_enabled_p ())
4776 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4777 "unsupported condition in reduction");
4779 return false;
4782 else
4784 /* 4. Supportable by target? */
4786 /* 4.1. check support for the operation in the loop */
4787 optab = optab_for_tree_code (code, vectype_in, optab_default);
4788 if (!optab)
4790 if (dump_enabled_p ())
4791 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4792 "no optab.");
4794 return false;
4797 if (optab_handler (optab, vec_mode) == CODE_FOR_nothing)
4799 if (dump_enabled_p ())
4800 dump_printf (MSG_NOTE, "op not supported by target.");
4802 if (GET_MODE_SIZE (vec_mode) != UNITS_PER_WORD
4803 || LOOP_VINFO_VECT_FACTOR (loop_vinfo)
4804 < vect_min_worthwhile_factor (code))
4805 return false;
4807 if (dump_enabled_p ())
4808 dump_printf (MSG_NOTE, "proceeding using word mode.");
4811 /* Worthwhile without SIMD support? */
4812 if (!VECTOR_MODE_P (TYPE_MODE (vectype_in))
4813 && LOOP_VINFO_VECT_FACTOR (loop_vinfo)
4814 < vect_min_worthwhile_factor (code))
4816 if (dump_enabled_p ())
4817 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4818 "not worthwhile without SIMD support.");
4820 return false;
4824 /* 4.2. Check support for the epilog operation.
4826 If STMT represents a reduction pattern, then the type of the
4827 reduction variable may be different than the type of the rest
4828 of the arguments. For example, consider the case of accumulation
4829 of shorts into an int accumulator; The original code:
4830 S1: int_a = (int) short_a;
4831 orig_stmt-> S2: int_acc = plus <int_a ,int_acc>;
4833 was replaced with:
4834 STMT: int_acc = widen_sum <short_a, int_acc>
4836 This means that:
4837 1. The tree-code that is used to create the vector operation in the
4838 epilog code (that reduces the partial results) is not the
4839 tree-code of STMT, but is rather the tree-code of the original
4840 stmt from the pattern that STMT is replacing. I.e, in the example
4841 above we want to use 'widen_sum' in the loop, but 'plus' in the
4842 epilog.
4843 2. The type (mode) we use to check available target support
4844 for the vector operation to be created in the *epilog*, is
4845 determined by the type of the reduction variable (in the example
4846 above we'd check this: optab_handler (plus_optab, vect_int_mode])).
4847 However the type (mode) we use to check available target support
4848 for the vector operation to be created *inside the loop*, is
4849 determined by the type of the other arguments to STMT (in the
4850 example we'd check this: optab_handler (widen_sum_optab,
4851 vect_short_mode)).
4853 This is contrary to "regular" reductions, in which the types of all
4854 the arguments are the same as the type of the reduction variable.
4855 For "regular" reductions we can therefore use the same vector type
4856 (and also the same tree-code) when generating the epilog code and
4857 when generating the code inside the loop. */
4859 if (orig_stmt)
4861 /* This is a reduction pattern: get the vectype from the type of the
4862 reduction variable, and get the tree-code from orig_stmt. */
4863 orig_code = gimple_assign_rhs_code (orig_stmt);
4864 gcc_assert (vectype_out);
4865 vec_mode = TYPE_MODE (vectype_out);
4867 else
4869 /* Regular reduction: use the same vectype and tree-code as used for
4870 the vector code inside the loop can be used for the epilog code. */
4871 orig_code = code;
4874 if (nested_cycle)
4876 def_bb = gimple_bb (reduc_def_stmt);
4877 def_stmt_loop = def_bb->loop_father;
4878 def_arg = PHI_ARG_DEF_FROM_EDGE (reduc_def_stmt,
4879 loop_preheader_edge (def_stmt_loop));
4880 if (TREE_CODE (def_arg) == SSA_NAME
4881 && (def_arg_stmt = SSA_NAME_DEF_STMT (def_arg))
4882 && gimple_code (def_arg_stmt) == GIMPLE_PHI
4883 && flow_bb_inside_loop_p (outer_loop, gimple_bb (def_arg_stmt))
4884 && vinfo_for_stmt (def_arg_stmt)
4885 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_arg_stmt))
4886 == vect_double_reduction_def)
4887 double_reduc = true;
4890 epilog_reduc_code = ERROR_MARK;
4891 if (reduction_code_for_scalar_code (orig_code, &epilog_reduc_code))
4893 reduc_optab = optab_for_tree_code (epilog_reduc_code, vectype_out,
4894 optab_default);
4895 if (!reduc_optab)
4897 if (dump_enabled_p ())
4898 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4899 "no optab for reduction.");
4901 epilog_reduc_code = ERROR_MARK;
4904 if (reduc_optab
4905 && optab_handler (reduc_optab, vec_mode) == CODE_FOR_nothing)
4907 if (dump_enabled_p ())
4908 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4909 "reduc op not supported by target.");
4911 epilog_reduc_code = ERROR_MARK;
4914 else
4916 if (!nested_cycle || double_reduc)
4918 if (dump_enabled_p ())
4919 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4920 "no reduc code for scalar code.");
4922 return false;
4926 if (double_reduc && ncopies > 1)
4928 if (dump_enabled_p ())
4929 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4930 "multiple types in double reduction");
4932 return false;
4935 /* In case of widening multiplication by a constant, we update the type
4936 of the constant to be the type of the other operand. We check that the
4937 constant fits the type in the pattern recognition pass. */
4938 if (code == DOT_PROD_EXPR
4939 && !types_compatible_p (TREE_TYPE (ops[0]), TREE_TYPE (ops[1])))
4941 if (TREE_CODE (ops[0]) == INTEGER_CST)
4942 ops[0] = fold_convert (TREE_TYPE (ops[1]), ops[0]);
4943 else if (TREE_CODE (ops[1]) == INTEGER_CST)
4944 ops[1] = fold_convert (TREE_TYPE (ops[0]), ops[1]);
4945 else
4947 if (dump_enabled_p ())
4948 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4949 "invalid types in dot-prod");
4951 return false;
4955 if (!vec_stmt) /* transformation not required. */
4957 if (!vect_model_reduction_cost (stmt_info, epilog_reduc_code, ncopies))
4958 return false;
4959 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
4960 return true;
4963 /** Transform. **/
4965 if (dump_enabled_p ())
4966 dump_printf_loc (MSG_NOTE, vect_location, "transform reduction.");
4968 /* FORNOW: Multiple types are not supported for condition. */
4969 if (code == COND_EXPR)
4970 gcc_assert (ncopies == 1);
4972 /* Create the destination vector */
4973 vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
4975 /* In case the vectorization factor (VF) is bigger than the number
4976 of elements that we can fit in a vectype (nunits), we have to generate
4977 more than one vector stmt - i.e - we need to "unroll" the
4978 vector stmt by a factor VF/nunits. For more details see documentation
4979 in vectorizable_operation. */
4981 /* If the reduction is used in an outer loop we need to generate
4982 VF intermediate results, like so (e.g. for ncopies=2):
4983 r0 = phi (init, r0)
4984 r1 = phi (init, r1)
4985 r0 = x0 + r0;
4986 r1 = x1 + r1;
4987 (i.e. we generate VF results in 2 registers).
4988 In this case we have a separate def-use cycle for each copy, and therefore
4989 for each copy we get the vector def for the reduction variable from the
4990 respective phi node created for this copy.
4992 Otherwise (the reduction is unused in the loop nest), we can combine
4993 together intermediate results, like so (e.g. for ncopies=2):
4994 r = phi (init, r)
4995 r = x0 + r;
4996 r = x1 + r;
4997 (i.e. we generate VF/2 results in a single register).
4998 In this case for each copy we get the vector def for the reduction variable
4999 from the vectorized reduction operation generated in the previous iteration.
5002 if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope)
5004 single_defuse_cycle = true;
5005 epilog_copies = 1;
5007 else
5008 epilog_copies = ncopies;
5010 prev_stmt_info = NULL;
5011 prev_phi_info = NULL;
5012 if (slp_node)
5014 vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
5015 gcc_assert (TYPE_VECTOR_SUBPARTS (vectype_out)
5016 == TYPE_VECTOR_SUBPARTS (vectype_in));
5018 else
5020 vec_num = 1;
5021 vec_oprnds0 = VEC_alloc (tree, heap, 1);
5022 if (op_type == ternary_op)
5023 vec_oprnds1 = VEC_alloc (tree, heap, 1);
5026 phis = VEC_alloc (gimple, heap, vec_num);
5027 vect_defs = VEC_alloc (tree, heap, vec_num);
5028 if (!slp_node)
5029 VEC_quick_push (tree, vect_defs, NULL_TREE);
5031 for (j = 0; j < ncopies; j++)
5033 if (j == 0 || !single_defuse_cycle)
5035 for (i = 0; i < vec_num; i++)
5037 /* Create the reduction-phi that defines the reduction
5038 operand. */
5039 new_phi = create_phi_node (vec_dest, loop->header);
5040 set_vinfo_for_stmt (new_phi,
5041 new_stmt_vec_info (new_phi, loop_vinfo,
5042 NULL));
5043 if (j == 0 || slp_node)
5044 VEC_quick_push (gimple, phis, new_phi);
5048 if (code == COND_EXPR)
5050 gcc_assert (!slp_node);
5051 vectorizable_condition (stmt, gsi, vec_stmt,
5052 PHI_RESULT (VEC_index (gimple, phis, 0)),
5053 reduc_index, NULL);
5054 /* Multiple types are not supported for condition. */
5055 break;
5058 /* Handle uses. */
5059 if (j == 0)
5061 op0 = ops[!reduc_index];
5062 if (op_type == ternary_op)
5064 if (reduc_index == 0)
5065 op1 = ops[2];
5066 else
5067 op1 = ops[1];
5070 if (slp_node)
5071 vect_get_vec_defs (op0, op1, stmt, &vec_oprnds0, &vec_oprnds1,
5072 slp_node, -1);
5073 else
5075 loop_vec_def0 = vect_get_vec_def_for_operand (ops[!reduc_index],
5076 stmt, NULL);
5077 VEC_quick_push (tree, vec_oprnds0, loop_vec_def0);
5078 if (op_type == ternary_op)
5080 loop_vec_def1 = vect_get_vec_def_for_operand (op1, stmt,
5081 NULL);
5082 VEC_quick_push (tree, vec_oprnds1, loop_vec_def1);
5086 else
5088 if (!slp_node)
5090 enum vect_def_type dt;
5091 gimple dummy_stmt;
5092 tree dummy;
5094 vect_is_simple_use (ops[!reduc_index], stmt, loop_vinfo, NULL,
5095 &dummy_stmt, &dummy, &dt);
5096 loop_vec_def0 = vect_get_vec_def_for_stmt_copy (dt,
5097 loop_vec_def0);
5098 VEC_replace (tree, vec_oprnds0, 0, loop_vec_def0);
5099 if (op_type == ternary_op)
5101 vect_is_simple_use (op1, stmt, loop_vinfo, NULL, &dummy_stmt,
5102 &dummy, &dt);
5103 loop_vec_def1 = vect_get_vec_def_for_stmt_copy (dt,
5104 loop_vec_def1);
5105 VEC_replace (tree, vec_oprnds1, 0, loop_vec_def1);
5109 if (single_defuse_cycle)
5110 reduc_def = gimple_assign_lhs (new_stmt);
5112 STMT_VINFO_RELATED_STMT (prev_phi_info) = new_phi;
5115 FOR_EACH_VEC_ELT (tree, vec_oprnds0, i, def0)
5117 if (slp_node)
5118 reduc_def = PHI_RESULT (VEC_index (gimple, phis, i));
5119 else
5121 if (!single_defuse_cycle || j == 0)
5122 reduc_def = PHI_RESULT (new_phi);
5125 def1 = ((op_type == ternary_op)
5126 ? VEC_index (tree, vec_oprnds1, i) : NULL);
5127 if (op_type == binary_op)
5129 if (reduc_index == 0)
5130 expr = build2 (code, vectype_out, reduc_def, def0);
5131 else
5132 expr = build2 (code, vectype_out, def0, reduc_def);
5134 else
5136 if (reduc_index == 0)
5137 expr = build3 (code, vectype_out, reduc_def, def0, def1);
5138 else
5140 if (reduc_index == 1)
5141 expr = build3 (code, vectype_out, def0, reduc_def, def1);
5142 else
5143 expr = build3 (code, vectype_out, def0, def1, reduc_def);
5147 new_stmt = gimple_build_assign (vec_dest, expr);
5148 new_temp = make_ssa_name (vec_dest, new_stmt);
5149 gimple_assign_set_lhs (new_stmt, new_temp);
5150 vect_finish_stmt_generation (stmt, new_stmt, gsi);
5152 if (slp_node)
5154 VEC_quick_push (gimple, SLP_TREE_VEC_STMTS (slp_node), new_stmt);
5155 VEC_quick_push (tree, vect_defs, new_temp);
5157 else
5158 VEC_replace (tree, vect_defs, 0, new_temp);
5161 if (slp_node)
5162 continue;
5164 if (j == 0)
5165 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
5166 else
5167 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
5169 prev_stmt_info = vinfo_for_stmt (new_stmt);
5170 prev_phi_info = vinfo_for_stmt (new_phi);
5173 /* Finalize the reduction-phi (set its arguments) and create the
5174 epilog reduction code. */
5175 if ((!single_defuse_cycle || code == COND_EXPR) && !slp_node)
5177 new_temp = gimple_assign_lhs (*vec_stmt);
5178 VEC_replace (tree, vect_defs, 0, new_temp);
5181 vect_create_epilog_for_reduction (vect_defs, stmt, epilog_copies,
5182 epilog_reduc_code, phis, reduc_index,
5183 double_reduc, slp_node);
5185 VEC_free (gimple, heap, phis);
5186 VEC_free (tree, heap, vec_oprnds0);
5187 if (vec_oprnds1)
5188 VEC_free (tree, heap, vec_oprnds1);
5190 return true;
5193 /* Function vect_min_worthwhile_factor.
5195 For a loop where we could vectorize the operation indicated by CODE,
5196 return the minimum vectorization factor that makes it worthwhile
5197 to use generic vectors. */
5198 int
5199 vect_min_worthwhile_factor (enum tree_code code)
5201 switch (code)
5203 case PLUS_EXPR:
5204 case MINUS_EXPR:
5205 case NEGATE_EXPR:
5206 return 4;
5208 case BIT_AND_EXPR:
5209 case BIT_IOR_EXPR:
5210 case BIT_XOR_EXPR:
5211 case BIT_NOT_EXPR:
5212 return 2;
5214 default:
5215 return INT_MAX;
5220 /* Function vectorizable_induction
5222 Check if PHI performs an induction computation that can be vectorized.
5223 If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
5224 phi to replace it, put it in VEC_STMT, and add it to the same basic block.
5225 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
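/* For illustration only (assuming a 4-lane vector type): the induction

     for (i = 0; i < N; i++)
       a[i] = i;

   is vectorized by building the initial vector {0, 1, 2, 3} and adding
   the step vector {4, 4, 4, 4} on each vector iteration; that work is
   done by get_initial_def_for_induction, invoked from the transform
   code below.  */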
5227 bool
5228 vectorizable_induction (gimple phi, gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
5229 gimple *vec_stmt)
5231 stmt_vec_info stmt_info = vinfo_for_stmt (phi);
5232 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5233 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5234 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5235 int nunits = TYPE_VECTOR_SUBPARTS (vectype);
5236 int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
5237 tree vec_def;
5239 gcc_assert (ncopies >= 1);
5240 /* FORNOW. These restrictions should be relaxed. */
5241 if (nested_in_vect_loop_p (loop, phi))
5243 imm_use_iterator imm_iter;
5244 use_operand_p use_p;
5245 gimple exit_phi;
5246 edge latch_e;
5247 tree loop_arg;
5249 if (ncopies > 1)
5251 if (dump_enabled_p ())
5252 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5253 "multiple types in nested loop.");
5254 return false;
5257 exit_phi = NULL;
5258 latch_e = loop_latch_edge (loop->inner);
5259 loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
5260 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
5262 if (!flow_bb_inside_loop_p (loop->inner,
5263 gimple_bb (USE_STMT (use_p))))
5265 exit_phi = USE_STMT (use_p);
5266 break;
5269 if (exit_phi)
5271 stmt_vec_info exit_phi_vinfo = vinfo_for_stmt (exit_phi);
5272 if (!(STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
5273 && !STMT_VINFO_LIVE_P (exit_phi_vinfo)))
5275 if (dump_enabled_p ())
5276 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5277 "inner-loop induction only used outside "
5278 "of the outer vectorized loop.");
5279 return false;
5284 if (!STMT_VINFO_RELEVANT_P (stmt_info))
5285 return false;
5287 /* FORNOW: SLP not supported. */
5288 if (STMT_SLP_TYPE (stmt_info))
5289 return false;
5291 gcc_assert (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def);
5293 if (gimple_code (phi) != GIMPLE_PHI)
5294 return false;
5296 if (!vec_stmt) /* transformation not required. */
5298 STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
5299 if (dump_enabled_p ())
5300 dump_printf_loc (MSG_NOTE, vect_location,
5301 "=== vectorizable_induction ===");
5302 vect_model_induction_cost (stmt_info, ncopies);
5303 return true;
5306 /** Transform. **/
5308 if (dump_enabled_p ())
5309 dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.");
5311 vec_def = get_initial_def_for_induction (phi);
5312 *vec_stmt = SSA_NAME_DEF_STMT (vec_def);
5313 return true;
5316 /* Function vectorizable_live_operation.
5318 STMT computes a value that is used outside the loop. Check if
5319 it can be supported. */
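/* For illustration only (example names): the only case accepted below is
   of the form

     for (i = 0; i < N; i++)
       last = x + y;            // x and y defined before the loop
     use (last);

   where every operand of the live stmt is loop-invariant or constant, so
   the scalar stmt can simply stay in place and its final value be used
   after the loop.  */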
5321 bool
5322 vectorizable_live_operation (gimple stmt,
5323 gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
5324 gimple *vec_stmt ATTRIBUTE_UNUSED)
5326 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5327 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5328 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5329 int i;
5330 int op_type;
5331 tree op;
5332 tree def;
5333 gimple def_stmt;
5334 enum vect_def_type dt;
5335 enum tree_code code;
5336 enum gimple_rhs_class rhs_class;
5338 gcc_assert (STMT_VINFO_LIVE_P (stmt_info));
5340 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def)
5341 return false;
5343 if (!is_gimple_assign (stmt))
5344 return false;
5346 if (TREE_CODE (gimple_assign_lhs (stmt)) != SSA_NAME)
5347 return false;
5349 /* FORNOW. CHECKME. */
5350 if (nested_in_vect_loop_p (loop, stmt))
5351 return false;
5353 code = gimple_assign_rhs_code (stmt);
5354 op_type = TREE_CODE_LENGTH (code);
5355 rhs_class = get_gimple_rhs_class (code);
5356 gcc_assert (rhs_class != GIMPLE_UNARY_RHS || op_type == unary_op);
5357 gcc_assert (rhs_class != GIMPLE_BINARY_RHS || op_type == binary_op);
5359 /* FORNOW: support only if all uses are invariant. This means
5360 that the scalar operations can remain in place, unvectorized.
5361 The original last scalar value that they compute will be used. */
5363 for (i = 0; i < op_type; i++)
5365 if (rhs_class == GIMPLE_SINGLE_RHS)
5366 op = TREE_OPERAND (gimple_op (stmt, 1), i);
5367 else
5368 op = gimple_op (stmt, i + 1);
5369 if (op
5370 && !vect_is_simple_use (op, stmt, loop_vinfo, NULL, &def_stmt, &def,
5371 &dt))
5373 if (dump_enabled_p ())
5374 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5375 "use not simple.");
5376 return false;
5379 if (dt != vect_external_def && dt != vect_constant_def)
5380 return false;
5383 /* No transformation is required for the cases we currently support. */
5384 return true;
5387 /* Kill any debug uses outside LOOP of SSA names defined in STMT. */
5389 static void
5390 vect_loop_kill_debug_uses (struct loop *loop, gimple stmt)
5392 ssa_op_iter op_iter;
5393 imm_use_iterator imm_iter;
5394 def_operand_p def_p;
5395 gimple ustmt;
5397 FOR_EACH_PHI_OR_STMT_DEF (def_p, stmt, op_iter, SSA_OP_DEF)
5399 FOR_EACH_IMM_USE_STMT (ustmt, imm_iter, DEF_FROM_PTR (def_p))
5401 basic_block bb;
5403 if (!is_gimple_debug (ustmt))
5404 continue;
5406 bb = gimple_bb (ustmt);
5408 if (!flow_bb_inside_loop_p (loop, bb))
5410 if (gimple_debug_bind_p (ustmt))
5412 if (dump_enabled_p ())
5413 dump_printf_loc (MSG_NOTE, vect_location,
5414 "killing debug use");
5416 gimple_debug_bind_reset_value (ustmt);
5417 update_stmt (ustmt);
5419 else
5420 gcc_unreachable ();
5426 /* Function vect_transform_loop.
5428 The analysis phase has determined that the loop is vectorizable.
5429 Vectorize the loop - create vectorized stmts to replace the scalar
5430 stmts in the loop, and update the loop exit condition. */
5432 void
5433 vect_transform_loop (loop_vec_info loop_vinfo)
5435 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5436 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
5437 int nbbs = loop->num_nodes;
5438 gimple_stmt_iterator si;
5439 int i;
5440 tree ratio = NULL;
5441 int vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
5442 bool grouped_store;
5443 bool slp_scheduled = false;
5444 unsigned int nunits;
5445 gimple stmt, pattern_stmt;
5446 gimple_seq pattern_def_seq = NULL;
5447 gimple_stmt_iterator pattern_def_si = gsi_none ();
5448 bool transform_pattern_stmt = false;
5449 bool check_profitability = false;
5450 int th;
5452 if (dump_enabled_p ())
 5453 dump_printf_loc (MSG_NOTE, vect_location, "=== vect_transform_loop ===");
 5455 /* Use the more conservative vectorization threshold.  If the number
 5456 of iterations is constant, assume the cost check has been performed
 5457 by our caller.  If the threshold already makes every loop that runs
 5458 at least the vectorization factor number of times profitable, the
 5459 runtime check is pointless, too. */
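   /* Editor's illustration (assumed numbers, not from the sources): if
      PARAM_MIN_VECT_LOOP_BOUND is taken to be 0 and the vectorization
      factor is 4, the first candidate threshold is 0 * 4 - 1 = -1, so TH
      ends up being the cost model's minimum iteration count; the runtime
      profitability check below is then emitted only if that bound is at
      least VF - 1 iterations and the iteration count is not known at
      compile time.  */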
5460 th = ((PARAM_VALUE (PARAM_MIN_VECT_LOOP_BOUND)
5461 * LOOP_VINFO_VECT_FACTOR (loop_vinfo)) - 1);
5462 th = MAX (th, LOOP_VINFO_COST_MODEL_MIN_ITERS (loop_vinfo));
5463 if (th >= LOOP_VINFO_VECT_FACTOR (loop_vinfo) - 1
5464 && !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
5466 if (dump_enabled_p ())
5467 dump_printf_loc (MSG_NOTE, vect_location,
5468 "Profitability threshold is %d loop iterations.", th);
5469 check_profitability = true;
 5472 /* Peel the loop if there are data refs with unknown alignment.
 5473 Only one data ref with unknown alignment in a store is allowed. */
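   /* Editor's illustration (assumed numbers, not from the sources): if the
      vector accesses to an array A must be 16-byte aligned, the elements are
      4 bytes wide and A happens to start 8 bytes past an aligned address,
      peeling (16 - 8) / 4 = 2 scalar iterations lets the vectorized loop
      start at an aligned address.  */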
5475 if (LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo))
5477 vect_do_peeling_for_alignment (loop_vinfo, th, check_profitability);
5478 check_profitability = false;
5481 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
5482 || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
5484 vect_loop_versioning (loop_vinfo, th, check_profitability);
5485 check_profitability = false;
 5488 /* If the loop has a symbolic number of iterations 'n' (i.e. it's not a
 5489 compile time constant), or it is a constant that is not a multiple of
 5490 the vectorization factor, then an epilog loop needs to be created.
 5491 We therefore duplicate the loop: the original loop will be vectorized,
 5492 and will execute the first (n/VF) vector iterations. The second copy of
 5493 the loop will remain scalar and will compute the remaining (n%VF) iterations.
 5494 (VF is the vectorization factor). */
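   /* Editor's illustration (assumed numbers, not from the sources): with
      n = 103 and VF = 8 the vectorized loop executes 103 / 8 = 12 vector
      iterations covering the first 96 scalar iterations, and the scalar
      epilog loop executes the remaining 103 % 8 = 7 iterations.  */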
5496 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
5497 || (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
5498 && LOOP_VINFO_INT_NITERS (loop_vinfo) % vectorization_factor != 0)
5499 || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
5500 vect_do_peeling_for_loop_bound (loop_vinfo, &ratio,
5501 th, check_profitability);
5502 else
5503 ratio = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
5504 LOOP_VINFO_INT_NITERS (loop_vinfo) / vectorization_factor);
5506 /* 1) Make sure the loop header has exactly two entries
5507 2) Make sure we have a preheader basic block. */
5509 gcc_assert (EDGE_COUNT (loop->header->preds) == 2);
5511 split_edge (loop_preheader_edge (loop));
 5513 /* FORNOW: the vectorizer supports only loops whose body consists
 5514 of one basic block (header + empty latch). When the vectorizer
 5515 supports more involved loop forms, the order in which the BBs are
 5516 traversed will need to be reconsidered. */
5518 for (i = 0; i < nbbs; i++)
5520 basic_block bb = bbs[i];
5521 stmt_vec_info stmt_info;
5522 gimple phi;
5524 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
5526 phi = gsi_stmt (si);
5527 if (dump_enabled_p ())
5529 dump_printf_loc (MSG_NOTE, vect_location,
5530 "------>vectorizing phi: ");
5531 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
5533 stmt_info = vinfo_for_stmt (phi);
5534 if (!stmt_info)
5535 continue;
5537 if (MAY_HAVE_DEBUG_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
5538 vect_loop_kill_debug_uses (loop, phi);
5540 if (!STMT_VINFO_RELEVANT_P (stmt_info)
5541 && !STMT_VINFO_LIVE_P (stmt_info))
5542 continue;
5544 if ((TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info))
5545 != (unsigned HOST_WIDE_INT) vectorization_factor)
5546 && dump_enabled_p ())
5547 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.");
5549 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def)
5551 if (dump_enabled_p ())
5552 dump_printf_loc (MSG_NOTE, vect_location, "transform phi.");
5553 vect_transform_stmt (phi, NULL, NULL, NULL, NULL);
5557 pattern_stmt = NULL;
5558 for (si = gsi_start_bb (bb); !gsi_end_p (si) || transform_pattern_stmt;)
5560 bool is_store;
5562 if (transform_pattern_stmt)
5563 stmt = pattern_stmt;
5564 else
5565 stmt = gsi_stmt (si);
5567 if (dump_enabled_p ())
5569 dump_printf_loc (MSG_NOTE, vect_location,
5570 "------>vectorizing statement: ");
5571 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
5574 stmt_info = vinfo_for_stmt (stmt);
5576 /* vector stmts created in the outer-loop during vectorization of
5577 stmts in an inner-loop may not have a stmt_info, and do not
5578 need to be vectorized. */
5579 if (!stmt_info)
5581 gsi_next (&si);
5582 continue;
5585 if (MAY_HAVE_DEBUG_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
5586 vect_loop_kill_debug_uses (loop, stmt);
5588 if (!STMT_VINFO_RELEVANT_P (stmt_info)
5589 && !STMT_VINFO_LIVE_P (stmt_info))
5591 if (STMT_VINFO_IN_PATTERN_P (stmt_info)
5592 && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
5593 && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
5594 || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
5596 stmt = pattern_stmt;
5597 stmt_info = vinfo_for_stmt (stmt);
5599 else
5601 gsi_next (&si);
5602 continue;
5605 else if (STMT_VINFO_IN_PATTERN_P (stmt_info)
5606 && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
5607 && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
5608 || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
5609 transform_pattern_stmt = true;
5611 /* If pattern statement has def stmts, vectorize them too. */
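   /* Editor's note (an explanatory sketch, not from the original sources):
      the pattern recognizer may attach a PATTERN_DEF_SEQ to a pattern stmt,
      holding auxiliary statements it created (for instance an intermediate
      type conversion feeding the main pattern stmt).  Any relevant stmts in
      that sequence are vectorized below before the pattern stmt itself.  */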
5612 if (is_pattern_stmt_p (stmt_info))
5614 if (pattern_def_seq == NULL)
5616 pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
5617 pattern_def_si = gsi_start (pattern_def_seq);
5619 else if (!gsi_end_p (pattern_def_si))
5620 gsi_next (&pattern_def_si);
5621 if (pattern_def_seq != NULL)
5623 gimple pattern_def_stmt = NULL;
5624 stmt_vec_info pattern_def_stmt_info = NULL;
5626 while (!gsi_end_p (pattern_def_si))
5628 pattern_def_stmt = gsi_stmt (pattern_def_si);
5629 pattern_def_stmt_info
5630 = vinfo_for_stmt (pattern_def_stmt);
5631 if (STMT_VINFO_RELEVANT_P (pattern_def_stmt_info)
5632 || STMT_VINFO_LIVE_P (pattern_def_stmt_info))
5633 break;
5634 gsi_next (&pattern_def_si);
5637 if (!gsi_end_p (pattern_def_si))
5639 if (dump_enabled_p ())
5641 dump_printf_loc (MSG_NOTE, vect_location,
5642 "==> vectorizing pattern def "
5643 "stmt: ");
5644 dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
5645 pattern_def_stmt, 0);
5648 stmt = pattern_def_stmt;
5649 stmt_info = pattern_def_stmt_info;
5651 else
5653 pattern_def_si = gsi_none ();
5654 transform_pattern_stmt = false;
5657 else
5658 transform_pattern_stmt = false;
5661 gcc_assert (STMT_VINFO_VECTYPE (stmt_info));
5662 nunits = (unsigned int) TYPE_VECTOR_SUBPARTS (
5663 STMT_VINFO_VECTYPE (stmt_info));
5664 if (!STMT_SLP_TYPE (stmt_info)
5665 && nunits != (unsigned int) vectorization_factor
5666 && dump_enabled_p ())
 5667 /* For SLP, VF is set according to the unrolling factor, and not
 5668 to the vector size; hence for SLP this print is not valid. */
5669 dump_printf_loc (MSG_NOTE, vect_location,
5670 "multiple-types.");
5672 /* SLP. Schedule all the SLP instances when the first SLP stmt is
5673 reached. */
5674 if (STMT_SLP_TYPE (stmt_info))
5676 if (!slp_scheduled)
5678 slp_scheduled = true;
5680 if (dump_enabled_p ())
5681 dump_printf_loc (MSG_NOTE, vect_location,
5682 "=== scheduling SLP instances ===");
5684 vect_schedule_slp (loop_vinfo, NULL);
5687 /* Hybrid SLP stmts must be vectorized in addition to SLP. */
5688 if (!vinfo_for_stmt (stmt) || PURE_SLP_STMT (stmt_info))
5690 if (!transform_pattern_stmt && gsi_end_p (pattern_def_si))
5692 pattern_def_seq = NULL;
5693 gsi_next (&si);
5695 continue;
5699 /* -------- vectorize statement ------------ */
5700 if (dump_enabled_p ())
5701 dump_printf_loc (MSG_NOTE, vect_location, "transform statement.");
5703 grouped_store = false;
5704 is_store = vect_transform_stmt (stmt, &si, &grouped_store, NULL, NULL);
5705 if (is_store)
5707 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
5709 /* Interleaving. If IS_STORE is TRUE, the vectorization of the
5710 interleaving chain was completed - free all the stores in
5711 the chain. */
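   /* Editor's note (an explanatory sketch, not from the original sources):
      e.g. for a group of stores a[2*i] = x; a[2*i+1] = y; the whole chain is
      emitted as vector stores only when its last member is transformed, so
      at that point every scalar store of the group can be removed at once.  */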
5712 gsi_next (&si);
5713 vect_remove_stores (GROUP_FIRST_ELEMENT (stmt_info));
5714 continue;
5716 else
5718 /* Free the attached stmt_vec_info and remove the stmt. */
5719 gimple store = gsi_stmt (si);
5720 free_stmt_vec_info (store);
5721 unlink_stmt_vdef (store);
5722 gsi_remove (&si, true);
5723 release_defs (store);
5724 continue;
5728 if (!transform_pattern_stmt && gsi_end_p (pattern_def_si))
5730 pattern_def_seq = NULL;
5731 gsi_next (&si);
5733 } /* stmts in BB */
5734 } /* BBs in loop */
5736 slpeel_make_loop_iterate_ntimes (loop, ratio);
5738 /* The memory tags and pointers in vectorized statements need to
5739 have their SSA forms updated. FIXME, why can't this be delayed
5740 until all the loops have been transformed? */
5741 update_ssa (TODO_update_ssa);
5743 if (dump_enabled_p ())
5744 dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, vect_location, "LOOP VECTORIZED.");
5745 if (loop->inner && dump_enabled_p ())
5746 dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, vect_location,
5747 "OUTER LOOP VECTORIZED.");