/* Loop Vectorization
   Copyright (C) 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010
   Free Software Foundation, Inc.
   Contributed by Dorit Naishlos <dorit@il.ibm.com> and
   Ira Rosen <irar@il.ibm.com>

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free
Software Foundation; either version 3, or (at your option) any later
version.

GCC is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received a copy of the GNU General Public License
along with GCC; see the file COPYING3.  If not see
<http://www.gnu.org/licenses/>.  */
#include "config.h"
#include "system.h"
#include "coretypes.h"
#include "tm.h"
#include "ggc.h"
#include "tree.h"
#include "basic-block.h"
#include "tree-pretty-print.h"
#include "gimple-pretty-print.h"
#include "tree-flow.h"
#include "tree-dump.h"
#include "cfgloop.h"
#include "cfglayout.h"
#include "expr.h"
#include "recog.h"
#include "optabs.h"
#include "params.h"
#include "diagnostic-core.h"
#include "toplev.h"
#include "tree-chrec.h"
#include "tree-scalar-evolution.h"
#include "tree-vectorizer.h"
#include "target.h"
/* Loop Vectorization Pass.

   This pass tries to vectorize loops.

   For example, the vectorizer transforms the following simple loop:

        short a[N]; short b[N]; short c[N]; int i;

        for (i=0; i<N; i++){
          a[i] = b[i] + c[i];
        }

   as if it were manually vectorized by rewriting the source code into:

        typedef int __attribute__((mode(V8HI))) v8hi;
        short a[N]; short b[N]; short c[N]; int i;
        v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
        v8hi va, vb, vc;

        for (i=0; i<N/8; i++){
          vb = pb[i];
          vc = pc[i];
          va = vb + vc;
          pa[i] = va;
        }

   The main entry to this pass is vectorize_loops(), in which
   the vectorizer applies a set of analyses on a given set of loops,
   followed by the actual vectorization transformation for the loops that
   had successfully passed the analysis phase.

   Throughout this pass we make a distinction between two types of
   data: scalars (which are represented by SSA_NAMEs), and memory references
   ("data-refs").  These two types of data require different handling both
   during analysis and transformation.  The types of data-refs that the
   vectorizer currently supports are ARRAY_REFS whose base is an array DECL
   (not a pointer), and INDIRECT_REFS through pointers; both array and pointer
   accesses are required to have a simple (consecutive) access pattern.

   Analysis phase:
   ===============
   The driver for the analysis phase is vect_analyze_loop().
   It applies a set of analyses, some of which rely on the scalar evolution
   analyzer (scev) developed by Sebastian Pop.

   During the analysis phase the vectorizer records some information
   per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
   loop, as well as general information about the loop as a whole, which is
   recorded in a "loop_vec_info" struct attached to each loop.

   Transformation phase:
   =====================
   The loop transformation phase scans all the stmts in the loop, and
   creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
   the loop that needs to be vectorized.  It inserts the vector code sequence
   just before the scalar stmt S, and records a pointer to the vector code
   in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
   attached to S).  This pointer will be used for the vectorization of
   following stmts which use the def of stmt S.  Stmt S is removed if it
   writes to memory; otherwise, we rely on dead code elimination for
   removing it.

   For example, say stmt S1 was vectorized into stmt VS1:

   VS1: vb = px[i];
   S1:  b = x[i];    STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
   S2:  a = b;

   To vectorize stmt S2, the vectorizer first finds the stmt that defines
   the operand 'b' (S1), and gets the relevant vector def 'vb' from the
   vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)).  The
   resulting sequence would be:

   VS1: vb = px[i];
   S1:  b = x[i];    STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
   VS2: va = vb;
   S2:  a = b;       STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2

   Operands that are not SSA_NAMEs are data-refs that appear in
   load/store operations (like 'x[i]' in S1), and are handled differently.

   Target modeling:
   =================
   Currently the only target specific information that is used is the
   size of the vector (in bytes) - "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".
   Targets that can support different sizes of vectors will, for now, need
   to specify one value for "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".  More
   flexibility will be added in the future.

   Since we only vectorize operations whose vector form can be
   expressed using existing tree codes, to verify that an operation is
   supported, the vectorizer checks the relevant optab at the relevant
   machine_mode (e.g., optab_handler (add_optab, V8HImode)).  If
   the value found is CODE_FOR_nothing, then there's no target support, and
   we can't vectorize the stmt.

   For additional information on this project see:
   http://gcc.gnu.org/projects/tree-ssa/vectorization.html
*/
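
/* As a concrete illustration of the optab check described above, a
   minimal support test for a vector addition could look like this
   (a sketch only; the real checks live in the vectorizable_* routines
   and handle many more cases):

     enum machine_mode vec_mode = TYPE_MODE (vectype);
     optab op = optab_for_tree_code (PLUS_EXPR, vectype, optab_default);

     if (!op || optab_handler (op, vec_mode) == CODE_FOR_nothing)
       return false;

   i.e., if no instruction implements the operation in the vector mode,
   the stmt cannot be vectorized.  */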
/* Function vect_determine_vectorization_factor

   Determine the vectorization factor (VF).  VF is the number of data elements
   that are operated upon in parallel in a single iteration of the vectorized
   loop.  For example, when vectorizing a loop that operates on 4-byte
   elements, on a target with vector size (VS) of 16 bytes, the VF is set to
   4, since 4 elements can fit in a single vector register.

   We currently support vectorization of loops in which all types operated
   upon are of the same size.  Therefore this function currently sets VF
   according to the size of the types operated upon, and fails if there are
   multiple sizes in the loop.

   VF is also the factor by which the loop iterations are strip-mined, e.g.:
   original loop:
        for (i=0; i<N; i++){
          a[i] = b[i] + c[i];
        }

   vectorized loop:
        for (i=0; i<N; i+=VF){
          a[i:VF] = b[i:VF] + c[i:VF];
        }
*/
static bool
vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
{
  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
  int nbbs = loop->num_nodes;
  gimple_stmt_iterator si;
  unsigned int vectorization_factor = 0;
  tree scalar_type;
  gimple phi;
  tree vectype;
  unsigned int nunits;
  stmt_vec_info stmt_info;
  int i;
  HOST_WIDE_INT dummy;

  if (vect_print_dump_info (REPORT_DETAILS))
    fprintf (vect_dump, "=== vect_determine_vectorization_factor ===");

  for (i = 0; i < nbbs; i++)
    {
      basic_block bb = bbs[i];

      for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
        {
          phi = gsi_stmt (si);
          stmt_info = vinfo_for_stmt (phi);
          if (vect_print_dump_info (REPORT_DETAILS))
            {
              fprintf (vect_dump, "==> examining phi: ");
              print_gimple_stmt (vect_dump, phi, 0, TDF_SLIM);
            }

          gcc_assert (stmt_info);

          if (STMT_VINFO_RELEVANT_P (stmt_info))
            {
              gcc_assert (!STMT_VINFO_VECTYPE (stmt_info));
              scalar_type = TREE_TYPE (PHI_RESULT (phi));

              if (vect_print_dump_info (REPORT_DETAILS))
                {
                  fprintf (vect_dump, "get vectype for scalar type: ");
                  print_generic_expr (vect_dump, scalar_type, TDF_SLIM);
                }

              vectype = get_vectype_for_scalar_type (scalar_type);
              if (!vectype)
                {
                  if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
                    {
                      fprintf (vect_dump,
                               "not vectorized: unsupported data-type ");
                      print_generic_expr (vect_dump, scalar_type, TDF_SLIM);
                    }
                  return false;
                }
              STMT_VINFO_VECTYPE (stmt_info) = vectype;

              if (vect_print_dump_info (REPORT_DETAILS))
                {
                  fprintf (vect_dump, "vectype: ");
                  print_generic_expr (vect_dump, vectype, TDF_SLIM);
                }

              nunits = TYPE_VECTOR_SUBPARTS (vectype);
              if (vect_print_dump_info (REPORT_DETAILS))
                fprintf (vect_dump, "nunits = %d", nunits);

              if (!vectorization_factor
                  || (nunits > vectorization_factor))
                vectorization_factor = nunits;
            }
        }

      for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
        {
          tree vf_vectype;
          gimple stmt = gsi_stmt (si);
          stmt_info = vinfo_for_stmt (stmt);

          if (vect_print_dump_info (REPORT_DETAILS))
            {
              fprintf (vect_dump, "==> examining statement: ");
              print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
            }

          gcc_assert (stmt_info);

          /* Skip stmts which do not need to be vectorized.  */
          if (!STMT_VINFO_RELEVANT_P (stmt_info)
              && !STMT_VINFO_LIVE_P (stmt_info))
            {
              if (vect_print_dump_info (REPORT_DETAILS))
                fprintf (vect_dump, "skip.");
              continue;
            }

          if (gimple_get_lhs (stmt) == NULL_TREE)
            {
              if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
                {
                  fprintf (vect_dump, "not vectorized: irregular stmt.");
                  print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
                }
              return false;
            }

          if (VECTOR_MODE_P (TYPE_MODE (gimple_expr_type (stmt))))
            {
              if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
                {
                  fprintf (vect_dump, "not vectorized: vector stmt in loop:");
                  print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
                }
              return false;
            }

          if (STMT_VINFO_VECTYPE (stmt_info))
            {
              /* The only case when a vectype had been already set is for
                 stmts that contain a dataref, or for "pattern-stmts" (stmts
                 generated by the vectorizer to represent/replace a certain
                 idiom).  */
              gcc_assert (STMT_VINFO_DATA_REF (stmt_info)
                          || is_pattern_stmt_p (stmt_info));
              vectype = STMT_VINFO_VECTYPE (stmt_info);
            }
          else
            {
              gcc_assert (!STMT_VINFO_DATA_REF (stmt_info)
                          && !is_pattern_stmt_p (stmt_info));

              scalar_type = TREE_TYPE (gimple_get_lhs (stmt));
              if (vect_print_dump_info (REPORT_DETAILS))
                {
                  fprintf (vect_dump, "get vectype for scalar type: ");
                  print_generic_expr (vect_dump, scalar_type, TDF_SLIM);
                }
              vectype = get_vectype_for_scalar_type (scalar_type);
              if (!vectype)
                {
                  if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
                    {
                      fprintf (vect_dump,
                               "not vectorized: unsupported data-type ");
                      print_generic_expr (vect_dump, scalar_type, TDF_SLIM);
                    }
                  return false;
                }

              STMT_VINFO_VECTYPE (stmt_info) = vectype;
            }

          /* The vectorization factor is according to the smallest
             scalar type (or the largest vector size, but we only
             support one vector size per loop).  */
          scalar_type = vect_get_smallest_scalar_type (stmt, &dummy,
                                                       &dummy);
          if (vect_print_dump_info (REPORT_DETAILS))
            {
              fprintf (vect_dump, "get vectype for scalar type: ");
              print_generic_expr (vect_dump, scalar_type, TDF_SLIM);
            }
          vf_vectype = get_vectype_for_scalar_type (scalar_type);
          if (!vf_vectype)
            {
              if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
                {
                  fprintf (vect_dump,
                           "not vectorized: unsupported data-type ");
                  print_generic_expr (vect_dump, scalar_type, TDF_SLIM);
                }
              return false;
            }

          if ((GET_MODE_SIZE (TYPE_MODE (vectype))
               != GET_MODE_SIZE (TYPE_MODE (vf_vectype))))
            {
              if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
                {
                  fprintf (vect_dump,
                           "not vectorized: different sized vector "
                           "types in statement, ");
                  print_generic_expr (vect_dump, vectype, TDF_SLIM);
                  fprintf (vect_dump, " and ");
                  print_generic_expr (vect_dump, vf_vectype, TDF_SLIM);
                }
              return false;
            }

          if (vect_print_dump_info (REPORT_DETAILS))
            {
              fprintf (vect_dump, "vectype: ");
              print_generic_expr (vect_dump, vf_vectype, TDF_SLIM);
            }

          nunits = TYPE_VECTOR_SUBPARTS (vf_vectype);
          if (vect_print_dump_info (REPORT_DETAILS))
            fprintf (vect_dump, "nunits = %d", nunits);

          if (!vectorization_factor
              || (nunits > vectorization_factor))
            vectorization_factor = nunits;
        }
    }

  /* TODO: Analyze cost.  Decide if worthwhile to vectorize.  */
  if (vect_print_dump_info (REPORT_DETAILS))
    fprintf (vect_dump, "vectorization factor = %d", vectorization_factor);
  if (vectorization_factor <= 1)
    {
      if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
        fprintf (vect_dump, "not vectorized: unsupported data-type");
      return false;
    }
  LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;

  return true;
}
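
/* For instance, on a target whose vectors are 16 bytes wide, a loop that
   operates only on "short" (2-byte) elements gets vectype V8HI, so
   TYPE_VECTOR_SUBPARTS is 8 and the function computes VF = 8; the same
   loop on "int" elements would get V4SI and VF = 4.  (A sketch of the
   arithmetic only - the exact modes depend on the target.)  */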
/* Function vect_is_simple_iv_evolution.

   FORNOW: A simple evolution of an induction variable in the loop is
   considered a polynomial evolution with constant step.  */

static bool
vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init,
                             tree * step)
{
  tree init_expr;
  tree step_expr;
  tree evolution_part = evolution_part_in_loop_num (access_fn, loop_nb);

  /* When there is no evolution in this loop, the evolution function
     is not "simple".  */
  if (evolution_part == NULL_TREE)
    return false;

  /* When the evolution is a polynomial of degree >= 2
     the evolution function is not "simple".  */
  if (tree_is_chrec (evolution_part))
    return false;

  step_expr = evolution_part;
  init_expr = unshare_expr (initial_condition_in_loop_num (access_fn, loop_nb));

  if (vect_print_dump_info (REPORT_DETAILS))
    {
      fprintf (vect_dump, "step: ");
      print_generic_expr (vect_dump, step_expr, TDF_SLIM);
      fprintf (vect_dump, ", init: ");
      print_generic_expr (vect_dump, init_expr, TDF_SLIM);
    }

  *init = init_expr;
  *step = step_expr;

  if (TREE_CODE (step_expr) != INTEGER_CST)
    {
      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "step unknown.");
      return false;
    }

  return true;
}
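
/* For example, for the counter of "for (i = 0; i < n; i++)" the scalar
   evolution analyzer returns the polynomial chrec {0, +, 1}_1, whose
   initial condition is 0 and whose evolution part (the step) is the
   constant 1 - a "simple" evolution by the definition above.  A chrec
   whose step is itself a chrec (degree >= 2) is rejected by the
   tree_is_chrec check, and a symbolic step by the INTEGER_CST check.
   (Illustrative values only.)  */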
/* Function vect_analyze_scalar_cycles_1.

   Examine the cross iteration def-use cycles of scalar variables
   in LOOP.  LOOP_VINFO represents the loop that is now being
   considered for vectorization (can be LOOP, or an outer-loop
   enclosing LOOP).  */

static void
vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, struct loop *loop)
{
  basic_block bb = loop->header;
  tree dummy;
  VEC(gimple,heap) *worklist = VEC_alloc (gimple, heap, 64);
  gimple_stmt_iterator gsi;
  bool double_reduc;

  if (vect_print_dump_info (REPORT_DETAILS))
    fprintf (vect_dump, "=== vect_analyze_scalar_cycles ===");

  /* First - identify all inductions.  Reduction detection assumes that all
     the inductions have been identified, therefore, this order must not be
     changed.  */
  for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
    {
      gimple phi = gsi_stmt (gsi);
      tree access_fn = NULL;
      tree def = PHI_RESULT (phi);
      stmt_vec_info stmt_vinfo = vinfo_for_stmt (phi);

      if (vect_print_dump_info (REPORT_DETAILS))
        {
          fprintf (vect_dump, "Analyze phi: ");
          print_gimple_stmt (vect_dump, phi, 0, TDF_SLIM);
        }

      /* Skip virtual phi's.  The data dependences that are associated with
         virtual defs/uses (i.e., memory accesses) are analyzed elsewhere.  */
      if (!is_gimple_reg (SSA_NAME_VAR (def)))
        continue;

      STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_unknown_def_type;

      /* Analyze the evolution function.  */
      access_fn = analyze_scalar_evolution (loop, def);
      if (access_fn && vect_print_dump_info (REPORT_DETAILS))
        {
          fprintf (vect_dump, "Access function of PHI: ");
          print_generic_expr (vect_dump, access_fn, TDF_SLIM);
        }

      if (!access_fn
          || !vect_is_simple_iv_evolution (loop->num, access_fn,
                                           &dummy, &dummy))
        {
          VEC_safe_push (gimple, heap, worklist, phi);
          continue;
        }

      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "Detected induction.");
      STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_induction_def;
    }


  /* Second - identify all reductions and nested cycles.  */
  while (VEC_length (gimple, worklist) > 0)
    {
      gimple phi = VEC_pop (gimple, worklist);
      tree def = PHI_RESULT (phi);
      stmt_vec_info stmt_vinfo = vinfo_for_stmt (phi);
      gimple reduc_stmt;
      bool nested_cycle;

      if (vect_print_dump_info (REPORT_DETAILS))
        {
          fprintf (vect_dump, "Analyze phi: ");
          print_gimple_stmt (vect_dump, phi, 0, TDF_SLIM);
        }

      gcc_assert (is_gimple_reg (SSA_NAME_VAR (def)));
      gcc_assert (STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_unknown_def_type);

      nested_cycle = (loop != LOOP_VINFO_LOOP (loop_vinfo));
      reduc_stmt = vect_force_simple_reduction (loop_vinfo, phi, !nested_cycle,
                                                &double_reduc);
      if (reduc_stmt)
        {
          if (double_reduc)
            {
              if (vect_print_dump_info (REPORT_DETAILS))
                fprintf (vect_dump, "Detected double reduction.");

              STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_double_reduction_def;
              STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
                                                    vect_double_reduction_def;
            }
          else
            {
              if (nested_cycle)
                {
                  if (vect_print_dump_info (REPORT_DETAILS))
                    fprintf (vect_dump, "Detected vectorizable nested cycle.");

                  STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_nested_cycle;
                  STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
                                                             vect_nested_cycle;
                }
              else
                {
                  if (vect_print_dump_info (REPORT_DETAILS))
                    fprintf (vect_dump, "Detected reduction.");

                  STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def;
                  STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
                                                           vect_reduction_def;
                  /* Store the reduction cycles for possible vectorization in
                     loop-aware SLP.  */
                  VEC_safe_push (gimple, heap,
                                 LOOP_VINFO_REDUCTIONS (loop_vinfo),
                                 reduc_stmt);
                }
            }
        }
      else
        if (vect_print_dump_info (REPORT_DETAILS))
          fprintf (vect_dump, "Unknown def-use cycle pattern.");
    }

  VEC_free (gimple, heap, worklist);
}
/* Function vect_analyze_scalar_cycles.

   Examine the cross iteration def-use cycles of scalar variables, by
   analyzing the loop-header PHIs of scalar variables.  Classify each
   cycle as one of the following: invariant, induction, reduction, unknown.
   We do that for the loop represented by LOOP_VINFO, and also for its
   inner-loop, if it exists.

   Examples for scalar cycles:

   Example1: reduction:

              loop1:
              for (i=0; i<N; i++)
                 sum += a[i];

   Example2: induction:

              loop2:
              for (i=0; i<N; i++)
                 a[i] = i;  */

static void
vect_analyze_scalar_cycles (loop_vec_info loop_vinfo)
{
  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);

  vect_analyze_scalar_cycles_1 (loop_vinfo, loop);

  /* When vectorizing an outer-loop, the inner-loop is executed sequentially.
     Reductions in such an inner-loop therefore have different properties
     than the reductions in the nest that gets vectorized:
     1. When vectorized, they are executed in the same order as in the
        original scalar loop, so we can't change the order of computation
        when vectorizing them.
     2. FIXME: Inner-loop reductions can be used in the inner-loop, so the
        current checks are too strict.  */

  if (loop->inner)
    vect_analyze_scalar_cycles_1 (loop_vinfo, loop->inner);
}
/* Function vect_get_loop_niters.

   Determine how many iterations the loop is executed.
   If an expression that represents the number of iterations
   can be constructed, place it in NUMBER_OF_ITERATIONS.
   Return the loop exit condition.  */

static gimple
vect_get_loop_niters (struct loop *loop, tree *number_of_iterations)
{
  tree niters;

  if (vect_print_dump_info (REPORT_DETAILS))
    fprintf (vect_dump, "=== get_loop_niters ===");

  niters = number_of_exit_cond_executions (loop);

  if (niters != NULL_TREE
      && niters != chrec_dont_know)
    {
      *number_of_iterations = niters;

      if (vect_print_dump_info (REPORT_DETAILS))
        {
          fprintf (vect_dump, "==> get_loop_niters:");
          print_generic_expr (vect_dump, *number_of_iterations, TDF_SLIM);
        }
    }

  return get_loop_exit_condition (loop);
}
/* Function bb_in_loop_p

   Used as predicate for dfs order traversal of the loop bbs.  */

static bool
bb_in_loop_p (const_basic_block bb, const void *data)
{
  const struct loop *const loop = (const struct loop *)data;
  if (flow_bb_inside_loop_p (loop, bb))
    return true;
  return false;
}
/* Function new_loop_vec_info.

   Create and initialize a new loop_vec_info struct for LOOP, as well as
   stmt_vec_info structs for all the stmts in LOOP.  */

static loop_vec_info
new_loop_vec_info (struct loop *loop)
{
  loop_vec_info res;
  basic_block *bbs;
  gimple_stmt_iterator si;
  unsigned int i, nbbs;

  res = (loop_vec_info) xcalloc (1, sizeof (struct _loop_vec_info));
  LOOP_VINFO_LOOP (res) = loop;

  bbs = get_loop_body (loop);

  /* Create/Update stmt_info for all stmts in the loop.  */
  for (i = 0; i < loop->num_nodes; i++)
    {
      basic_block bb = bbs[i];

      /* BBs in a nested inner-loop will have been already processed (because
         we will have called vect_analyze_loop_form for any nested inner-loop).
         Therefore, for stmts in an inner-loop we just want to update the
         STMT_VINFO_LOOP_VINFO field of their stmt_info to point to the new
         loop_info of the outer-loop we are currently considering to vectorize
         (instead of the loop_info of the inner-loop).
         For stmts in other BBs we need to create a stmt_info from scratch.  */
      if (bb->loop_father != loop)
        {
          /* Inner-loop bb.  */
          gcc_assert (loop->inner && bb->loop_father == loop->inner);
          for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
            {
              gimple phi = gsi_stmt (si);
              stmt_vec_info stmt_info = vinfo_for_stmt (phi);
              loop_vec_info inner_loop_vinfo =
                STMT_VINFO_LOOP_VINFO (stmt_info);
              gcc_assert (loop->inner == LOOP_VINFO_LOOP (inner_loop_vinfo));
              STMT_VINFO_LOOP_VINFO (stmt_info) = res;
            }
          for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
            {
              gimple stmt = gsi_stmt (si);
              stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
              loop_vec_info inner_loop_vinfo =
                STMT_VINFO_LOOP_VINFO (stmt_info);
              gcc_assert (loop->inner == LOOP_VINFO_LOOP (inner_loop_vinfo));
              STMT_VINFO_LOOP_VINFO (stmt_info) = res;
            }
        }
      else
        {
          /* bb in current nest.  */
          for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
            {
              gimple phi = gsi_stmt (si);
              gimple_set_uid (phi, 0);
              set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, res, NULL));
            }

          for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
            {
              gimple stmt = gsi_stmt (si);
              gimple_set_uid (stmt, 0);
              set_vinfo_for_stmt (stmt, new_stmt_vec_info (stmt, res, NULL));
            }
        }
    }

  /* CHECKME: We want to visit all BBs before their successors (except for
     latch blocks, for which this assertion wouldn't hold).  In the simple
     case of the loop forms we allow, a dfs order of the BBs would be the
     same as reversed postorder traversal, so we are safe.  */

  free (bbs);
  bbs = XCNEWVEC (basic_block, loop->num_nodes);
  nbbs = dfs_enumerate_from (loop->header, 0, bb_in_loop_p,
                             bbs, loop->num_nodes, loop);
  gcc_assert (nbbs == loop->num_nodes);

  LOOP_VINFO_BBS (res) = bbs;
  LOOP_VINFO_NITERS (res) = NULL;
  LOOP_VINFO_NITERS_UNCHANGED (res) = NULL;
  LOOP_VINFO_COST_MODEL_MIN_ITERS (res) = 0;
  LOOP_VINFO_VECTORIZABLE_P (res) = 0;
  LOOP_PEELING_FOR_ALIGNMENT (res) = 0;
  LOOP_VINFO_VECT_FACTOR (res) = 0;
  LOOP_VINFO_DATAREFS (res) = VEC_alloc (data_reference_p, heap, 10);
  LOOP_VINFO_DDRS (res) = VEC_alloc (ddr_p, heap, 10 * 10);
  LOOP_VINFO_UNALIGNED_DR (res) = NULL;
  LOOP_VINFO_MAY_MISALIGN_STMTS (res) =
    VEC_alloc (gimple, heap,
               PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIGNMENT_CHECKS));
  LOOP_VINFO_MAY_ALIAS_DDRS (res) =
    VEC_alloc (ddr_p, heap,
               PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS));
  LOOP_VINFO_STRIDED_STORES (res) = VEC_alloc (gimple, heap, 10);
  LOOP_VINFO_REDUCTIONS (res) = VEC_alloc (gimple, heap, 10);
  LOOP_VINFO_SLP_INSTANCES (res) = VEC_alloc (slp_instance, heap, 10);
  LOOP_VINFO_SLP_UNROLLING_FACTOR (res) = 1;
  LOOP_VINFO_PEELING_HTAB (res) = NULL;

  return res;
}
/* Function destroy_loop_vec_info.

   Free LOOP_VINFO struct, as well as all the stmt_vec_info structs of all the
   stmts in the loop.  */

void
destroy_loop_vec_info (loop_vec_info loop_vinfo, bool clean_stmts)
{
  struct loop *loop;
  basic_block *bbs;
  int nbbs;
  gimple_stmt_iterator si;
  int j;
  VEC (slp_instance, heap) *slp_instances;
  slp_instance instance;

  if (!loop_vinfo)
    return;

  loop = LOOP_VINFO_LOOP (loop_vinfo);

  bbs = LOOP_VINFO_BBS (loop_vinfo);
  nbbs = loop->num_nodes;

  if (!clean_stmts)
    {
      free (LOOP_VINFO_BBS (loop_vinfo));
      free_data_refs (LOOP_VINFO_DATAREFS (loop_vinfo));
      free_dependence_relations (LOOP_VINFO_DDRS (loop_vinfo));
      VEC_free (gimple, heap, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo));

      free (loop_vinfo);
      loop->aux = NULL;
      return;
    }

  for (j = 0; j < nbbs; j++)
    {
      basic_block bb = bbs[j];
      for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
        free_stmt_vec_info (gsi_stmt (si));

      for (si = gsi_start_bb (bb); !gsi_end_p (si); )
        {
          gimple stmt = gsi_stmt (si);
          stmt_vec_info stmt_info = vinfo_for_stmt (stmt);

          if (stmt_info)
            {
              /* Check if this is a "pattern stmt" (introduced by the
                 vectorizer during the pattern recognition pass).  */
              bool remove_stmt_p = false;
              gimple orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
              if (orig_stmt)
                {
                  stmt_vec_info orig_stmt_info = vinfo_for_stmt (orig_stmt);
                  if (orig_stmt_info
                      && STMT_VINFO_IN_PATTERN_P (orig_stmt_info))
                    remove_stmt_p = true;
                }

              /* Free stmt_vec_info.  */
              free_stmt_vec_info (stmt);

              /* Remove dead "pattern stmts".  */
              if (remove_stmt_p)
                gsi_remove (&si, true);
            }
          gsi_next (&si);
        }
    }

  free (LOOP_VINFO_BBS (loop_vinfo));
  free_data_refs (LOOP_VINFO_DATAREFS (loop_vinfo));
  free_dependence_relations (LOOP_VINFO_DDRS (loop_vinfo));
  VEC_free (gimple, heap, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo));
  VEC_free (ddr_p, heap, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo));
  slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
  FOR_EACH_VEC_ELT (slp_instance, slp_instances, j, instance)
    vect_free_slp_instance (instance);

  VEC_free (slp_instance, heap, LOOP_VINFO_SLP_INSTANCES (loop_vinfo));
  VEC_free (gimple, heap, LOOP_VINFO_STRIDED_STORES (loop_vinfo));
  VEC_free (gimple, heap, LOOP_VINFO_REDUCTIONS (loop_vinfo));

  if (LOOP_VINFO_PEELING_HTAB (loop_vinfo))
    htab_delete (LOOP_VINFO_PEELING_HTAB (loop_vinfo));

  free (loop_vinfo);
  loop->aux = NULL;
}
/* Function vect_analyze_loop_1.

   Apply a set of analyses on LOOP, and create a loop_vec_info struct
   for it.  The different analyses will record information in the
   loop_vec_info struct.  This is a subset of the analyses applied in
   vect_analyze_loop, to be applied on an inner-loop nested in the loop
   that is now considered for (outer-loop) vectorization.  */

static loop_vec_info
vect_analyze_loop_1 (struct loop *loop)
{
  loop_vec_info loop_vinfo;

  if (vect_print_dump_info (REPORT_DETAILS))
    fprintf (vect_dump, "===== analyze_loop_nest_1 =====");

  /* Check the CFG characteristics of the loop (nesting, entry/exit, etc.).  */

  loop_vinfo = vect_analyze_loop_form (loop);
  if (!loop_vinfo)
    {
      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "bad inner-loop form.");
      return NULL;
    }

  return loop_vinfo;
}
/* Function vect_analyze_loop_form.

   Verify that certain CFG restrictions hold, including:
   - the loop has a pre-header
   - the loop has a single entry and exit
   - the loop exit condition is simple enough, and the number of iterations
     can be analyzed (a countable loop).  */

loop_vec_info
vect_analyze_loop_form (struct loop *loop)
{
  loop_vec_info loop_vinfo;
  gimple loop_cond;
  tree number_of_iterations = NULL;
  loop_vec_info inner_loop_vinfo = NULL;

  if (vect_print_dump_info (REPORT_DETAILS))
    fprintf (vect_dump, "=== vect_analyze_loop_form ===");

  /* Different restrictions apply when we are considering an inner-most loop,
     vs. an outer (nested) loop.
     (FORNOW.  May want to relax some of these restrictions in the future.)  */

  if (!loop->inner)
    {
      /* Inner-most loop.  We currently require that the number of BBs is
         exactly 2 (the header and latch).  Vectorizable inner-most loops
         look like this:

                        (pre-header)
                           |
                          header <--------+
                           | |            |
                           | +--> latch --+
                           |
                        (exit-bb)  */

      if (loop->num_nodes != 2)
        {
          if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
            fprintf (vect_dump, "not vectorized: control flow in loop.");
          return NULL;
        }

      if (empty_block_p (loop->header))
        {
          if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
            fprintf (vect_dump, "not vectorized: empty loop.");
          return NULL;
        }
    }
  else
    {
      struct loop *innerloop = loop->inner;
      edge entryedge;

      /* Nested loop.  We currently require that the loop is doubly-nested,
         contains a single inner loop, and the number of BBs is exactly 5.
         Vectorizable outer-loops look like this:

                        (pre-header)
                           |
                          header <---+
                           |         |
                          inner-loop |
                           |         |
                          tail ------+
                           |
                        (exit-bb)

         The inner-loop has the properties expected of inner-most loops
         as described above.  */

      if ((loop->inner)->inner || (loop->inner)->next)
        {
          if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
            fprintf (vect_dump, "not vectorized: multiple nested loops.");
          return NULL;
        }

      /* Analyze the inner-loop.  */
      inner_loop_vinfo = vect_analyze_loop_1 (loop->inner);
      if (!inner_loop_vinfo)
        {
          if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
            fprintf (vect_dump, "not vectorized: Bad inner loop.");
          return NULL;
        }

      if (!expr_invariant_in_loop_p (loop,
                                     LOOP_VINFO_NITERS (inner_loop_vinfo)))
        {
          if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
            fprintf (vect_dump,
                     "not vectorized: inner-loop count not invariant.");
          destroy_loop_vec_info (inner_loop_vinfo, true);
          return NULL;
        }

      if (loop->num_nodes != 5)
        {
          if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
            fprintf (vect_dump, "not vectorized: control flow in loop.");
          destroy_loop_vec_info (inner_loop_vinfo, true);
          return NULL;
        }

      gcc_assert (EDGE_COUNT (innerloop->header->preds) == 2);
      entryedge = EDGE_PRED (innerloop->header, 0);
      if (EDGE_PRED (innerloop->header, 0)->src == innerloop->latch)
        entryedge = EDGE_PRED (innerloop->header, 1);

      if (entryedge->src != loop->header
          || !single_exit (innerloop)
          || single_exit (innerloop)->dest != EDGE_PRED (loop->latch, 0)->src)
        {
          if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
            fprintf (vect_dump, "not vectorized: unsupported outerloop form.");
          destroy_loop_vec_info (inner_loop_vinfo, true);
          return NULL;
        }

      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "Considering outer-loop vectorization.");
    }

  if (!single_exit (loop)
      || EDGE_COUNT (loop->header->preds) != 2)
    {
      if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
        {
          if (!single_exit (loop))
            fprintf (vect_dump, "not vectorized: multiple exits.");
          else if (EDGE_COUNT (loop->header->preds) != 2)
            fprintf (vect_dump, "not vectorized: too many incoming edges.");
        }
      if (inner_loop_vinfo)
        destroy_loop_vec_info (inner_loop_vinfo, true);
      return NULL;
    }

  /* We assume that the loop exit condition is at the end of the loop, i.e.,
     that the loop is represented as a do-while (with a proper if-guard
     before the loop if needed), where the loop header contains all the
     executable statements, and the latch is empty.  */
  if (!empty_block_p (loop->latch)
      || !gimple_seq_empty_p (phi_nodes (loop->latch)))
    {
      if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
        fprintf (vect_dump, "not vectorized: unexpected loop form.");
      if (inner_loop_vinfo)
        destroy_loop_vec_info (inner_loop_vinfo, true);
      return NULL;
    }

  /* Make sure there exists a single-predecessor exit bb:  */
  if (!single_pred_p (single_exit (loop)->dest))
    {
      edge e = single_exit (loop);
      if (!(e->flags & EDGE_ABNORMAL))
        {
          split_loop_exit_edge (e);
          if (vect_print_dump_info (REPORT_DETAILS))
            fprintf (vect_dump, "split exit edge.");
        }
      else
        {
          if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
            fprintf (vect_dump, "not vectorized: abnormal loop exit edge.");
          if (inner_loop_vinfo)
            destroy_loop_vec_info (inner_loop_vinfo, true);
          return NULL;
        }
    }

  loop_cond = vect_get_loop_niters (loop, &number_of_iterations);
  if (!loop_cond)
    {
      if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
        fprintf (vect_dump, "not vectorized: complicated exit condition.");
      if (inner_loop_vinfo)
        destroy_loop_vec_info (inner_loop_vinfo, true);
      return NULL;
    }

  if (!number_of_iterations)
    {
      if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
        fprintf (vect_dump,
                 "not vectorized: number of iterations cannot be computed.");
      if (inner_loop_vinfo)
        destroy_loop_vec_info (inner_loop_vinfo, true);
      return NULL;
    }

  if (chrec_contains_undetermined (number_of_iterations))
    {
      if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
        fprintf (vect_dump, "Infinite number of iterations.");
      if (inner_loop_vinfo)
        destroy_loop_vec_info (inner_loop_vinfo, true);
      return NULL;
    }

  if (!NITERS_KNOWN_P (number_of_iterations))
    {
      if (vect_print_dump_info (REPORT_DETAILS))
        {
          fprintf (vect_dump, "Symbolic number of iterations is ");
          print_generic_expr (vect_dump, number_of_iterations, TDF_DETAILS);
        }
    }
  else if (TREE_INT_CST_LOW (number_of_iterations) == 0)
    {
      if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
        fprintf (vect_dump, "not vectorized: number of iterations = 0.");
      if (inner_loop_vinfo)
        destroy_loop_vec_info (inner_loop_vinfo, false);
      return NULL;
    }

  loop_vinfo = new_loop_vec_info (loop);
  LOOP_VINFO_NITERS (loop_vinfo) = number_of_iterations;
  LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = number_of_iterations;

  STMT_VINFO_TYPE (vinfo_for_stmt (loop_cond)) = loop_exit_ctrl_vec_info_type;

  /* CHECKME: May want to keep it around in the future.  */
  if (inner_loop_vinfo)
    destroy_loop_vec_info (inner_loop_vinfo, false);

  gcc_assert (!loop->aux);
  loop->aux = loop_vinfo;
  return loop_vinfo;
}
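
/* For reference, a countable loop in the expected do-while shape looks
   roughly like this in GIMPLE (a hand-written sketch, not pass output;
   the SSA names are illustrative):

     header:
       i_1 = PHI <0(pre-header), i_2(latch)>
       ...loop body, all executable stmts...
       i_2 = i_1 + 1;
       if (i_2 < n_3)
         goto latch;
       else
         goto exit_bb;
     latch:
       goto header;

   The exit condition is the last stmt of the header and the latch is
   empty, matching the checks in vect_analyze_loop_form above.  */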
/* Get cost by calling cost target builtin.  */

static inline int
vect_get_cost (enum vect_cost_for_stmt type_of_cost)
{
  tree dummy_type = NULL;
  int dummy = 0;

  return targetm.vectorize.builtin_vectorization_cost (type_of_cost,
                                                       dummy_type, dummy);
}
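
/* For example, vect_get_cost (scalar_stmt) asks the target for the cost
   of one scalar statement; with the default hook
   (default_builtin_vectorization_cost) most kinds of statements cost 1,
   while, e.g., unaligned vector accesses and taken branches are costlier.
   (A rough characterization of the default model, not a guarantee -
   targets override this hook.)  */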
/* Function vect_analyze_loop_operations.

   Scan the loop stmts and make sure they are all vectorizable.  */

static bool
vect_analyze_loop_operations (loop_vec_info loop_vinfo)
{
  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
  int nbbs = loop->num_nodes;
  gimple_stmt_iterator si;
  unsigned int vectorization_factor = 0;
  int i;
  gimple phi;
  stmt_vec_info stmt_info;
  bool need_to_vectorize = false;
  int min_profitable_iters;
  int min_scalar_loop_bound;
  unsigned int th;
  bool only_slp_in_loop = true, ok;

  if (vect_print_dump_info (REPORT_DETAILS))
    fprintf (vect_dump, "=== vect_analyze_loop_operations ===");

  gcc_assert (LOOP_VINFO_VECT_FACTOR (loop_vinfo));
  vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);

  for (i = 0; i < nbbs; i++)
    {
      basic_block bb = bbs[i];

      for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
        {
          phi = gsi_stmt (si);
          ok = true;

          stmt_info = vinfo_for_stmt (phi);
          if (vect_print_dump_info (REPORT_DETAILS))
            {
              fprintf (vect_dump, "examining phi: ");
              print_gimple_stmt (vect_dump, phi, 0, TDF_SLIM);
            }

          if (! is_loop_header_bb_p (bb))
            {
              /* Inner-loop loop-closed exit phi in outer-loop vectorization
                 (i.e., a phi in the tail of the outer-loop).
                 FORNOW: we currently don't support the case that these phis
                 are not used in the outerloop (unless it is double reduction,
                 i.e., this phi is vect_reduction_def), because this case
                 requires to actually do something here.  */
              if ((!STMT_VINFO_RELEVANT_P (stmt_info)
                   || STMT_VINFO_LIVE_P (stmt_info))
                  && STMT_VINFO_DEF_TYPE (stmt_info)
                     != vect_double_reduction_def)
                {
                  if (vect_print_dump_info (REPORT_DETAILS))
                    fprintf (vect_dump,
                             "Unsupported loop-closed phi in outer-loop.");
                  return false;
                }
              continue;
            }

          gcc_assert (stmt_info);

          if (STMT_VINFO_LIVE_P (stmt_info))
            {
              /* FORNOW: not yet supported.  */
              if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
                fprintf (vect_dump, "not vectorized: value used after loop.");
              return false;
            }

          if (STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope
              && STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
            {
              /* A scalar-dependence cycle that we don't support.  */
              if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
                fprintf (vect_dump,
                         "not vectorized: scalar dependence cycle.");
              return false;
            }

          if (STMT_VINFO_RELEVANT_P (stmt_info))
            {
              need_to_vectorize = true;
              if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def)
                ok = vectorizable_induction (phi, NULL, NULL);
            }

          if (!ok)
            {
              if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
                {
                  fprintf (vect_dump,
                           "not vectorized: relevant phi not supported: ");
                  print_gimple_stmt (vect_dump, phi, 0, TDF_SLIM);
                }
              return false;
            }
        }

      for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
        {
          gimple stmt = gsi_stmt (si);
          stmt_vec_info stmt_info = vinfo_for_stmt (stmt);

          gcc_assert (stmt_info);

          if (!vect_analyze_stmt (stmt, &need_to_vectorize, NULL))
            return false;

          if ((STMT_VINFO_RELEVANT_P (stmt_info)
               || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
              && !PURE_SLP_STMT (stmt_info))
            /* STMT needs both SLP and loop-based vectorization.  */
            only_slp_in_loop = false;
        }
    } /* bbs */

  /* All operations in the loop are either irrelevant (deal with loop
     control, or dead), or only used outside the loop and can be moved
     out of the loop (e.g. invariants, inductions).  The loop can be
     optimized away by scalar optimizations.  We're better off not
     touching this loop.  */
  if (!need_to_vectorize)
    {
      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump,
                 "All the computation can be taken out of the loop.");
      if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
        fprintf (vect_dump,
                 "not vectorized: redundant loop. no profit to vectorize.");
      return false;
    }

  /* If all the stmts in the loop can be SLPed, we perform only SLP, and the
     vectorization factor of the loop is the unrolling factor required by
     the SLP instances.  If that unrolling factor is 1, we say that we
     perform pure SLP on the loop - cross iteration parallelism is not
     exploited.  */
  if (only_slp_in_loop)
    vectorization_factor = LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo);
  else
    vectorization_factor = least_common_multiple (vectorization_factor,
                                LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo));

  LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;

  if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
      && vect_print_dump_info (REPORT_DETAILS))
    fprintf (vect_dump,
             "vectorization_factor = %d, niters = " HOST_WIDE_INT_PRINT_DEC,
             vectorization_factor, LOOP_VINFO_INT_NITERS (loop_vinfo));

  if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
      && (LOOP_VINFO_INT_NITERS (loop_vinfo) < vectorization_factor))
    {
      if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
        fprintf (vect_dump, "not vectorized: iteration count too small.");
      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "not vectorized: iteration count smaller than "
                 "vectorization factor.");
      return false;
    }

  /* Analyze cost.  Decide if worthwhile to vectorize.  */

  /* Once VF is set, SLP costs should be updated since the number of created
     vector stmts depends on VF.  */
  vect_update_slp_costs_according_to_vf (loop_vinfo);

  min_profitable_iters = vect_estimate_min_profitable_iters (loop_vinfo);
  LOOP_VINFO_COST_MODEL_MIN_ITERS (loop_vinfo) = min_profitable_iters;

  if (min_profitable_iters < 0)
    {
      if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
        fprintf (vect_dump, "not vectorized: vectorization not profitable.");
      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "not vectorized: vector version will never be "
                 "profitable.");
      return false;
    }

  min_scalar_loop_bound = ((PARAM_VALUE (PARAM_MIN_VECT_LOOP_BOUND)
                            * vectorization_factor) - 1);

  /* Use the cost model only if it is more conservative than the user
     specified threshold.  */

  th = (unsigned) min_scalar_loop_bound;
  if (min_profitable_iters
      && (!min_scalar_loop_bound
          || min_profitable_iters > min_scalar_loop_bound))
    th = (unsigned) min_profitable_iters;

  if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
      && LOOP_VINFO_INT_NITERS (loop_vinfo) <= th)
    {
      if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
        fprintf (vect_dump, "not vectorized: vectorization not "
                 "profitable.");
      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "not vectorized: iteration count smaller than "
                 "user specified loop bound parameter or minimum "
                 "profitable iterations (whichever is more conservative).");
      return false;
    }

  if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
      || LOOP_VINFO_INT_NITERS (loop_vinfo) % vectorization_factor != 0
      || LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo))
    {
      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "epilog loop required.");
      if (!vect_can_advance_ivs_p (loop_vinfo))
        {
          if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
            fprintf (vect_dump,
                     "not vectorized: can't create epilog loop 1.");
          return false;
        }
      if (!slpeel_can_duplicate_loop_p (loop, single_exit (loop)))
        {
          if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
            fprintf (vect_dump,
                     "not vectorized: can't create epilog loop 2.");
          return false;
        }
    }

  return true;
}
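
/* A worked example of the threshold logic above (hypothetical numbers):
   suppose VF is 4, the cost model returns min_profitable_iters = 10, and
   --param min-vect-loop-bound=3, so min_scalar_loop_bound = 3*4 - 1 = 11.
   Since 10 > 11 does not hold, th stays at the user-derived bound 11,
   and a loop known to run at most 11 iterations is rejected as
   unprofitable.  With the default parameter value the cost model's
   estimate usually wins instead.  */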
/* Function vect_analyze_loop_2.

   Apply a set of analyses on LOOP, and create a loop_vec_info struct
   for it.  The different analyses will record information in the
   loop_vec_info struct.  */
static bool
vect_analyze_loop_2 (loop_vec_info loop_vinfo)
{
  bool ok, dummy;
  int max_vf = MAX_VECTORIZATION_FACTOR;
  int min_vf = 2;

  /* Find all data references in the loop (which correspond to vdefs/vuses)
     and analyze their evolution in the loop.  Also adjust the minimal
     vectorization factor according to the loads and stores.

     FORNOW: Handle only simple array references whose
     alignment can be forced, and aligned pointer-references.  */

  ok = vect_analyze_data_refs (loop_vinfo, NULL, &min_vf);
  if (!ok)
    {
      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "bad data references.");
      return false;
    }

  /* Classify all cross-iteration scalar data-flow cycles.
     Cross-iteration cycles caused by virtual phis are analyzed
     separately.  */

  vect_analyze_scalar_cycles (loop_vinfo);

  vect_pattern_recog (loop_vinfo);

  /* Data-flow analysis to detect stmts that do not need to be
     vectorized.  */

  ok = vect_mark_stmts_to_be_vectorized (loop_vinfo);
  if (!ok)
    {
      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "unexpected pattern.");
      return false;
    }

  /* Analyze data dependences between the data-refs in the loop
     and adjust the maximum vectorization factor according to
     the dependences.
     FORNOW: fail at the first data dependence that we encounter.  */

  ok = vect_analyze_data_ref_dependences (loop_vinfo, NULL, &max_vf, &dummy);
  if (!ok
      || max_vf < min_vf)
    {
      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "bad data dependence.");
      return false;
    }

  ok = vect_determine_vectorization_factor (loop_vinfo);
  if (!ok)
    {
      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "can't determine vectorization factor.");
      return false;
    }
  if (max_vf < LOOP_VINFO_VECT_FACTOR (loop_vinfo))
    {
      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "bad data dependence.");
      return false;
    }

  /* Analyze the alignment of the data-refs in the loop.
     Fail if a data reference is found that cannot be vectorized.  */

  ok = vect_analyze_data_refs_alignment (loop_vinfo, NULL);
  if (!ok)
    {
      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "bad data alignment.");
      return false;
    }

  /* Analyze the access patterns of the data-refs in the loop (consecutive,
     complex, etc.).  FORNOW: Only handle consecutive access pattern.  */

  ok = vect_analyze_data_ref_accesses (loop_vinfo, NULL);
  if (!ok)
    {
      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "bad data access.");
      return false;
    }

  /* Prune the list of ddrs to be tested at run-time by versioning for alias.
     It is important to call pruning after vect_analyze_data_ref_accesses,
     since we use grouping information gathered by interleaving analysis.  */
  ok = vect_prune_runtime_alias_test_list (loop_vinfo);
  if (!ok)
    {
      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "too long list of versioning for alias "
                            "run-time tests.");
      return false;
    }

  /* This pass will decide on using loop versioning and/or loop peeling in
     order to enhance the alignment of data references in the loop.  */

  ok = vect_enhance_data_refs_alignment (loop_vinfo);
  if (!ok)
    {
      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "bad data alignment.");
      return false;
    }

  /* Check the SLP opportunities in the loop, analyze and build SLP trees.  */
  ok = vect_analyze_slp (loop_vinfo, NULL);
  if (ok)
    {
      /* Decide which possible SLP instances to SLP.  */
      vect_make_slp_decision (loop_vinfo);

      /* Find stmts that need to be both vectorized and SLPed.  */
      vect_detect_hybrid_slp (loop_vinfo);
    }

  /* Scan all the operations in the loop and make sure they are
     vectorizable.  */

  ok = vect_analyze_loop_operations (loop_vinfo);
  if (!ok)
    {
      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "bad operation or unsupported loop bound.");
      return false;
    }

  return true;
}
/* Function vect_analyze_loop.

   Apply a set of analyses on LOOP, and create a loop_vec_info struct
   for it.  The different analyses will record information in the
   loop_vec_info struct.  */
loop_vec_info
vect_analyze_loop (struct loop *loop)
{
  loop_vec_info loop_vinfo;
  unsigned int vector_sizes;

  /* Autodetect first vector size we try.  */
  current_vector_size = 0;
  vector_sizes = targetm.vectorize.autovectorize_vector_sizes ();

  if (vect_print_dump_info (REPORT_DETAILS))
    fprintf (vect_dump, "===== analyze_loop_nest =====");

  if (loop_outer (loop)
      && loop_vec_info_for_loop (loop_outer (loop))
      && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop))))
    {
      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "outer-loop already vectorized.");
      return NULL;
    }

  while (1)
    {
      /* Check the CFG characteristics of the loop (nesting, entry/exit).  */
      loop_vinfo = vect_analyze_loop_form (loop);
      if (!loop_vinfo)
        {
          if (vect_print_dump_info (REPORT_DETAILS))
            fprintf (vect_dump, "bad loop form.");
          return NULL;
        }

      if (vect_analyze_loop_2 (loop_vinfo))
        {
          LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1;

          return loop_vinfo;
        }

      destroy_loop_vec_info (loop_vinfo, true);

      vector_sizes &= ~current_vector_size;
      if (vector_sizes == 0
          || current_vector_size == 0)
        return NULL;

      /* Try the next biggest vector size.  */
      current_vector_size = 1 << floor_log2 (vector_sizes);
      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "***** Re-trying analysis with "
                 "vector size %d\n", current_vector_size);
    }
}
/* Function reduction_code_for_scalar_code

   Input:
   CODE - tree_code of a reduction operation.

   Output:
   REDUC_CODE - the corresponding tree-code to be used to reduce the
      vector of partial results into a single scalar result (which
      will also reside in a vector) or ERROR_MARK if the operation is
      a supported reduction operation, but does not have such a tree-code.

   Return FALSE if CODE currently cannot be vectorized as reduction.  */

static bool
reduction_code_for_scalar_code (enum tree_code code,
                                enum tree_code *reduc_code)
{
  switch (code)
    {
      case MAX_EXPR:
        *reduc_code = REDUC_MAX_EXPR;
        return true;

      case MIN_EXPR:
        *reduc_code = REDUC_MIN_EXPR;
        return true;

      case PLUS_EXPR:
        *reduc_code = REDUC_PLUS_EXPR;
        return true;

      case MULT_EXPR:
      case MINUS_EXPR:
      case BIT_IOR_EXPR:
      case BIT_XOR_EXPR:
      case BIT_AND_EXPR:
        *reduc_code = ERROR_MARK;
        return true;

      default:
        return false;
    }
}
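
/* Typical use (a sketch): for a sum reduction the caller does

     enum tree_code reduc_code;
     if (reduction_code_for_scalar_code (PLUS_EXPR, &reduc_code))
       ... reduc_code == REDUC_PLUS_EXPR, so the vector of partial sums
           can be folded with a single reduction operation ...

   whereas for, say, BIT_AND_EXPR the function returns true with
   *reduc_code == ERROR_MARK, and the reduction epilogue must combine
   the vector elements by other means (element extraction).  */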
/* Error reporting helper for vect_is_simple_reduction below.  GIMPLE
   statement STMT is printed with a message MSG.  */

static void
report_vect_op (gimple stmt, const char *msg)
{
  fprintf (vect_dump, "%s", msg);
  print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
}
1633 /* Function vect_is_simple_reduction_1
1635 (1) Detect a cross-iteration def-use cycle that represents a simple
1636 reduction computation. We look for the following pattern:
1638 loop_header:
1639 a1 = phi < a0, a2 >
1640 a3 = ...
1641 a2 = operation (a3, a1)
1643 such that:
1644 1. operation is commutative and associative and it is safe to
1645 change the order of the computation (if CHECK_REDUCTION is true)
1646 2. no uses for a2 in the loop (a2 is used out of the loop)
1647 3. no uses of a1 in the loop besides the reduction operation.
1649 Condition 1 is tested here.
1650 Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.
1652 (2) Detect a cross-iteration def-use cycle in nested loops, i.e.,
1653 nested cycles, if CHECK_REDUCTION is false.
1655 (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double
1656 reductions:
1658 a1 = phi < a0, a2 >
1659 inner loop (def of a3)
1660 a2 = phi < a3 >
1662 If MODIFY is true it tries also to rework the code in-place to enable
1663 detection of more reduction patterns. For the time being we rewrite
1664 "res -= RHS" into "rhs += -RHS" when it seems worthwhile.
1667 static gimple
1668 vect_is_simple_reduction_1 (loop_vec_info loop_info, gimple phi,
1669 bool check_reduction, bool *double_reduc,
1670 bool modify)
1672 struct loop *loop = (gimple_bb (phi))->loop_father;
1673 struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
1674 edge latch_e = loop_latch_edge (loop);
1675 tree loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
1676 gimple def_stmt, def1 = NULL, def2 = NULL;
1677 enum tree_code orig_code, code;
1678 tree op1, op2, op3 = NULL_TREE, op4 = NULL_TREE;
1679 tree type;
1680 int nloop_uses;
1681 tree name;
1682 imm_use_iterator imm_iter;
1683 use_operand_p use_p;
1684 bool phi_def;
1686 *double_reduc = false;
1688 /* If CHECK_REDUCTION is true, we assume inner-most loop vectorization,
1689 otherwise, we assume outer loop vectorization. */
1690 gcc_assert ((check_reduction && loop == vect_loop)
1691 || (!check_reduction && flow_loop_nested_p (vect_loop, loop)));
1693 name = PHI_RESULT (phi);
1694 nloop_uses = 0;
1695 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name)
1697 gimple use_stmt = USE_STMT (use_p);
1698 if (is_gimple_debug (use_stmt))
1699 continue;
1700 if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt))
1701 && vinfo_for_stmt (use_stmt)
1702 && !is_pattern_stmt_p (vinfo_for_stmt (use_stmt)))
1703 nloop_uses++;
1704 if (nloop_uses > 1)
1706 if (vect_print_dump_info (REPORT_DETAILS))
1707 fprintf (vect_dump, "reduction used in loop.");
1708 return NULL;
1712 if (TREE_CODE (loop_arg) != SSA_NAME)
1714 if (vect_print_dump_info (REPORT_DETAILS))
1716 fprintf (vect_dump, "reduction: not ssa_name: ");
1717 print_generic_expr (vect_dump, loop_arg, TDF_SLIM);
1719 return NULL;
1722 def_stmt = SSA_NAME_DEF_STMT (loop_arg);
1723 if (!def_stmt)
1725 if (vect_print_dump_info (REPORT_DETAILS))
1726 fprintf (vect_dump, "reduction: no def_stmt.");
1727 return NULL;
1730 if (!is_gimple_assign (def_stmt) && gimple_code (def_stmt) != GIMPLE_PHI)
1732 if (vect_print_dump_info (REPORT_DETAILS))
1733 print_gimple_stmt (vect_dump, def_stmt, 0, TDF_SLIM);
1734 return NULL;
1737 if (is_gimple_assign (def_stmt))
1739 name = gimple_assign_lhs (def_stmt);
1740 phi_def = false;
1742 else
1744 name = PHI_RESULT (def_stmt);
1745 phi_def = true;
1748 nloop_uses = 0;
1749 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name)
1751 gimple use_stmt = USE_STMT (use_p);
1752 if (is_gimple_debug (use_stmt))
1753 continue;
1754 if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt))
1755 && vinfo_for_stmt (use_stmt)
1756 && !is_pattern_stmt_p (vinfo_for_stmt (use_stmt)))
1757 nloop_uses++;
1758 if (nloop_uses > 1)
1760 if (vect_print_dump_info (REPORT_DETAILS))
1761 fprintf (vect_dump, "reduction used in loop.");
1762 return NULL;
1766 /* If DEF_STMT is a phi node itself, we expect it to have a single argument
1767 defined in the inner loop. */
1768 if (phi_def)
1770 op1 = PHI_ARG_DEF (def_stmt, 0);
1772 if (gimple_phi_num_args (def_stmt) != 1
1773 || TREE_CODE (op1) != SSA_NAME)
1775 if (vect_print_dump_info (REPORT_DETAILS))
1776 fprintf (vect_dump, "unsupported phi node definition.");
1778 return NULL;
1781 def1 = SSA_NAME_DEF_STMT (op1);
1782 if (flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
1783 && loop->inner
1784 && flow_bb_inside_loop_p (loop->inner, gimple_bb (def1))
1785 && is_gimple_assign (def1))
1787 if (vect_print_dump_info (REPORT_DETAILS))
1788 report_vect_op (def_stmt, "detected double reduction: ");
1790 *double_reduc = true;
1791 return def_stmt;
1794 return NULL;
1797 code = orig_code = gimple_assign_rhs_code (def_stmt);
1799 /* We can handle "res -= x[i]", which is non-associative by
1800 simply rewriting this into "res += -x[i]". Avoid changing
1801 gimple instruction for the first simple tests and only do this
1802 if we're allowed to change code at all. */
1803 if (code == MINUS_EXPR
1804 && modify
1805 && (op1 = gimple_assign_rhs1 (def_stmt))
1806 && TREE_CODE (op1) == SSA_NAME
1807 && SSA_NAME_DEF_STMT (op1) == phi)
1808 code = PLUS_EXPR;
1810 if (check_reduction
1811 && (!commutative_tree_code (code) || !associative_tree_code (code)))
1813 if (vect_print_dump_info (REPORT_DETAILS))
1814 report_vect_op (def_stmt, "reduction: not commutative/associative: ");
1815 return NULL;
1818 if (get_gimple_rhs_class (code) != GIMPLE_BINARY_RHS)
1820 if (code != COND_EXPR)
1822 if (vect_print_dump_info (REPORT_DETAILS))
1823 report_vect_op (def_stmt, "reduction: not binary operation: ");
1825 return NULL;
1828 op3 = TREE_OPERAND (gimple_assign_rhs1 (def_stmt), 0);
1829 if (COMPARISON_CLASS_P (op3))
1831 op4 = TREE_OPERAND (op3, 1);
1832 op3 = TREE_OPERAND (op3, 0);
1835 op1 = TREE_OPERAND (gimple_assign_rhs1 (def_stmt), 1);
1836 op2 = TREE_OPERAND (gimple_assign_rhs1 (def_stmt), 2);
1838 if (TREE_CODE (op1) != SSA_NAME && TREE_CODE (op2) != SSA_NAME)
1840 if (vect_print_dump_info (REPORT_DETAILS))
1841 report_vect_op (def_stmt, "reduction: uses not ssa_names: ");
1843 return NULL;
1846 else
1848 op1 = gimple_assign_rhs1 (def_stmt);
1849 op2 = gimple_assign_rhs2 (def_stmt);
1851 if (TREE_CODE (op1) != SSA_NAME || TREE_CODE (op2) != SSA_NAME)
1853 if (vect_print_dump_info (REPORT_DETAILS))
1854 report_vect_op (def_stmt, "reduction: uses not ssa_names: ");
1856 return NULL;
1860 type = TREE_TYPE (gimple_assign_lhs (def_stmt));
1861 if ((TREE_CODE (op1) == SSA_NAME
1862 && !types_compatible_p (type,TREE_TYPE (op1)))
1863 || (TREE_CODE (op2) == SSA_NAME
1864 && !types_compatible_p (type, TREE_TYPE (op2)))
1865 || (op3 && TREE_CODE (op3) == SSA_NAME
1866 && !types_compatible_p (type, TREE_TYPE (op3)))
1867 || (op4 && TREE_CODE (op4) == SSA_NAME
1868 && !types_compatible_p (type, TREE_TYPE (op4))))
1870 if (vect_print_dump_info (REPORT_DETAILS))
1872 fprintf (vect_dump, "reduction: multiple types: operation type: ");
1873 print_generic_expr (vect_dump, type, TDF_SLIM);
1874 fprintf (vect_dump, ", operands types: ");
1875 print_generic_expr (vect_dump, TREE_TYPE (op1), TDF_SLIM);
1876 fprintf (vect_dump, ",");
1877 print_generic_expr (vect_dump, TREE_TYPE (op2), TDF_SLIM);
1878 if (op3)
1880 fprintf (vect_dump, ",");
1881 print_generic_expr (vect_dump, TREE_TYPE (op3), TDF_SLIM);
1884 if (op4)
1886 fprintf (vect_dump, ",");
1887 print_generic_expr (vect_dump, TREE_TYPE (op4), TDF_SLIM);
1891 return NULL;
1894 /* Check that it's ok to change the order of the computation.
1895 Generally, when vectorizing a reduction we change the order of the
1896 computation. This may change the behavior of the program in some
1897 cases, so we need to check that this is ok. One exception is when
1898 vectorizing an outer-loop: the inner-loop is executed sequentially,
1899 and therefore vectorizing reductions in the inner-loop during
1900 outer-loop vectorization is safe. */
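/* E.g., float addition is not associative: with a = 1e30f, b = -1e30f
   and c = 1.0f, (a + b) + c yields 1.0f, while a + (b + c) yields 0.0f
   because c is absorbed by the large magnitude of b. Hence the checks
   below refuse to reorder a float reduction unless -fassociative-math
   is in effect. */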
1902 /* CHECKME: check for !flag_finite_math_only too? */
1903 if (SCALAR_FLOAT_TYPE_P (type) && !flag_associative_math
1904 && check_reduction)
1906 /* Changing the order of operations changes the semantics. */
1907 if (vect_print_dump_info (REPORT_DETAILS))
1908 report_vect_op (def_stmt, "reduction: unsafe fp math optimization: ");
1909 return NULL;
1911 else if (INTEGRAL_TYPE_P (type) && TYPE_OVERFLOW_TRAPS (type)
1912 && check_reduction)
1914 /* Changing the order of operations changes the semantics. */
1915 if (vect_print_dump_info (REPORT_DETAILS))
1916 report_vect_op (def_stmt, "reduction: unsafe int math optimization: ");
1917 return NULL;
1919 else if (SAT_FIXED_POINT_TYPE_P (type) && check_reduction)
1921 /* Changing the order of operations changes the semantics. */
1922 if (vect_print_dump_info (REPORT_DETAILS))
1923 report_vect_op (def_stmt,
1924 "reduction: unsafe fixed-point math optimization: ");
1925 return NULL;
1928 /* If we detected "res -= x[i]" earlier, rewrite it into
1929 "res += -x[i]" now. If this turns out to be useless reassoc
1930 will clean it up again. */
1931 if (orig_code == MINUS_EXPR)
1933 tree rhs = gimple_assign_rhs2 (def_stmt);
1934 tree negrhs = make_ssa_name (SSA_NAME_VAR (rhs), NULL);
1935 gimple negate_stmt = gimple_build_assign_with_ops (NEGATE_EXPR, negrhs,
1936 rhs, NULL);
1937 gimple_stmt_iterator gsi = gsi_for_stmt (def_stmt);
1938 set_vinfo_for_stmt (negate_stmt, new_stmt_vec_info (negate_stmt,
1939 loop_info, NULL));
1940 gsi_insert_before (&gsi, negate_stmt, GSI_NEW_STMT);
1941 gimple_assign_set_rhs2 (def_stmt, negrhs);
1942 gimple_assign_set_rhs_code (def_stmt, PLUS_EXPR);
1943 update_stmt (def_stmt);
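/* Schematically (using hypothetical SSA names), the rewrite above turns

     res_2 = res_1 - x_3;      <-- MINUS_EXPR def_stmt

   into

     neg_4 = -x_3;             <-- newly inserted NEGATE_EXPR stmt
     res_2 = res_1 + neg_4;    <-- def_stmt, now a PLUS_EXPR

   so the reduction is carried out by the associative PLUS_EXPR. */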
1946 /* Reduction is safe. We're dealing with one of the following:
1947 1) integer arithmetic and no trapv
1948 2) floating point arithmetic, and special flags permit this optimization
1949 3) nested cycle (i.e., outer loop vectorization). */
1950 if (TREE_CODE (op1) == SSA_NAME)
1951 def1 = SSA_NAME_DEF_STMT (op1);
1953 if (TREE_CODE (op2) == SSA_NAME)
1954 def2 = SSA_NAME_DEF_STMT (op2);
1956 if (code != COND_EXPR
1957 && (!def1 || !def2 || gimple_nop_p (def1) || gimple_nop_p (def2)))
1959 if (vect_print_dump_info (REPORT_DETAILS))
1960 report_vect_op (def_stmt, "reduction: no defs for operands: ");
1961 return NULL;
1964 /* Check that one def is the reduction def, defined by PHI,
1965 the other def is either defined in the loop ("vect_internal_def"),
1966 or it's an induction (defined by a loop-header phi-node). */
1968 if (def2 && def2 == phi
1969 && (code == COND_EXPR
1970 || (def1 && flow_bb_inside_loop_p (loop, gimple_bb (def1))
1971 && (is_gimple_assign (def1)
1972 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1))
1973 == vect_induction_def
1974 || (gimple_code (def1) == GIMPLE_PHI
1975 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1))
1976 == vect_internal_def
1977 && !is_loop_header_bb_p (gimple_bb (def1)))))))
1979 if (vect_print_dump_info (REPORT_DETAILS))
1980 report_vect_op (def_stmt, "detected reduction: ");
1981 return def_stmt;
1983 else if (def1 && def1 == phi
1984 && (code == COND_EXPR
1985 || (def2 && flow_bb_inside_loop_p (loop, gimple_bb (def2))
1986 && (is_gimple_assign (def2)
1987 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2))
1988 == vect_induction_def
1989 || (gimple_code (def2) == GIMPLE_PHI
1990 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2))
1991 == vect_internal_def
1992 && !is_loop_header_bb_p (gimple_bb (def2)))))))
1994 if (check_reduction)
1996 /* Swap operands (just for simplicity - so that the rest of the code
1997 can assume that the reduction variable is always the last (second)
1998 argument). */
1999 if (vect_print_dump_info (REPORT_DETAILS))
2000 report_vect_op (def_stmt,
2001 "detected reduction: need to swap operands: ");
2003 swap_tree_operands (def_stmt, gimple_assign_rhs1_ptr (def_stmt),
2004 gimple_assign_rhs2_ptr (def_stmt));
2006 else
2008 if (vect_print_dump_info (REPORT_DETAILS))
2009 report_vect_op (def_stmt, "detected reduction: ");
2012 return def_stmt;
2014 else
2016 if (vect_print_dump_info (REPORT_DETAILS))
2017 report_vect_op (def_stmt, "reduction: unknown pattern: ");
2019 return NULL;
2023 /* Wrapper around vect_is_simple_reduction_1, that won't modify code
2024 in-place. Arguments as there. */
2026 static gimple
2027 vect_is_simple_reduction (loop_vec_info loop_info, gimple phi,
2028 bool check_reduction, bool *double_reduc)
2030 return vect_is_simple_reduction_1 (loop_info, phi, check_reduction,
2031 double_reduc, false);
2034 /* Wrapper around vect_is_simple_reduction_1, which will modify code
2035 in-place if it enables detection of more reductions. Arguments
2036 as there. */
2038 gimple
2039 vect_force_simple_reduction (loop_vec_info loop_info, gimple phi,
2040 bool check_reduction, bool *double_reduc)
2042 return vect_is_simple_reduction_1 (loop_info, phi, check_reduction,
2043 double_reduc, true);
2046 /* Calculate the cost of one scalar iteration of the loop. */
2047 int
2048 vect_get_single_scalar_iteration_cost (loop_vec_info loop_vinfo)
2050 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2051 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
2052 int nbbs = loop->num_nodes, factor, scalar_single_iter_cost = 0;
2053 int innerloop_iters, i, stmt_cost;
2055 /* Count statements in scalar loop. Using this as scalar cost for a single
2056 iteration for now.
2058 TODO: Add outer loop support.
2060 TODO: Consider assigning different costs to different scalar
2061 statements. */
2063 /* FORNOW. */
2064 innerloop_iters = 1;
2065 if (loop->inner)
2066 innerloop_iters = 50; /* FIXME */
2068 for (i = 0; i < nbbs; i++)
2070 gimple_stmt_iterator si;
2071 basic_block bb = bbs[i];
2073 if (bb->loop_father == loop->inner)
2074 factor = innerloop_iters;
2075 else
2076 factor = 1;
2078 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
2080 gimple stmt = gsi_stmt (si);
2081 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2083 if (!is_gimple_assign (stmt) && !is_gimple_call (stmt))
2084 continue;
2086 /* Skip stmts that are not vectorized inside the loop. */
2087 if (stmt_info
2088 && !STMT_VINFO_RELEVANT_P (stmt_info)
2089 && (!STMT_VINFO_LIVE_P (stmt_info)
2090 || STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def))
2091 continue;
2093 if (STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt)))
2095 if (DR_IS_READ (STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt))))
2096 stmt_cost = vect_get_cost (scalar_load);
2097 else
2098 stmt_cost = vect_get_cost (scalar_store);
2100 else
2101 stmt_cost = vect_get_cost (scalar_stmt);
2103 scalar_single_iter_cost += stmt_cost * factor;
2106 return scalar_single_iter_cost;
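/* E.g. (with hypothetical target costs scalar_load = scalar_store =
   scalar_stmt = 1): a single-bb loop body with two loads, one store and
   three other relevant assignments yields scalar_single_iter_cost = 6;
   stmts inside an inner loop are weighted by innerloop_iters instead
   of 1. */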
2109 /* Calculate cost of peeling the loop PEEL_ITERS_PROLOGUE times. */
2110 int
2111 vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue,
2112 int *peel_iters_epilogue,
2113 int scalar_single_iter_cost)
2115 int peel_guard_costs = 0;
2116 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2118 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
2120 *peel_iters_epilogue = vf/2;
2121 if (vect_print_dump_info (REPORT_COST))
2122 fprintf (vect_dump, "cost model: "
2123 "epilogue peel iters set to vf/2 because "
2124 "loop iterations are unknown .");
2126 /* If peeled iterations are known but the number of scalar loop
2127 iterations is unknown, count a taken branch per peeled loop. */
2128 peel_guard_costs = 2 * vect_get_cost (cond_branch_taken);
2130 else
2132 int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
2133 peel_iters_prologue = niters < peel_iters_prologue ?
2134 niters : peel_iters_prologue;
2135 *peel_iters_epilogue = (niters - peel_iters_prologue) % vf;
2138 return (peel_iters_prologue * scalar_single_iter_cost)
2139 + (*peel_iters_epilogue * scalar_single_iter_cost)
2140 + peel_guard_costs;
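/* E.g. (hypothetical values): with VF = 4, known NITERS = 10,
   PEEL_ITERS_PROLOGUE = 3 and SCALAR_SINGLE_ITER_COST = 2, the epilogue
   gets (10 - 3) % 4 = 3 iterations, so the cost returned is
   3*2 + 3*2 + 0 = 12; no guard cost is added because NITERS is known. */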
2143 /* Function vect_estimate_min_profitable_iters
2145 Return the number of iterations required for the vector version of the
2146 loop to be profitable relative to the cost of the scalar version of the
2147 loop.
2149 TODO: Take profile info into account before making vectorization
2150 decisions, if available. */
2152 int
2153 vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo)
2155 int i;
2156 int min_profitable_iters;
2157 int peel_iters_prologue;
2158 int peel_iters_epilogue;
2159 int vec_inside_cost = 0;
2160 int vec_outside_cost = 0;
2161 int scalar_single_iter_cost = 0;
2162 int scalar_outside_cost = 0;
2163 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2164 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2165 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
2166 int nbbs = loop->num_nodes;
2167 int npeel = LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo);
2168 int peel_guard_costs = 0;
2169 int innerloop_iters = 0, factor;
2170 VEC (slp_instance, heap) *slp_instances;
2171 slp_instance instance;
2173 /* Cost model disabled. */
2174 if (!flag_vect_cost_model)
2176 if (vect_print_dump_info (REPORT_COST))
2177 fprintf (vect_dump, "cost model disabled.");
2178 return 0;
2181 /* Requires loop versioning tests to handle misalignment. */
2182 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
2184 /* FIXME: Make cost depend on complexity of individual check. */
2185 vec_outside_cost +=
2186 VEC_length (gimple, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo));
2187 if (vect_print_dump_info (REPORT_COST))
2188 fprintf (vect_dump, "cost model: Adding cost of checks for loop "
2189 "versioning to treat misalignment.\n");
2192 /* Requires loop versioning with alias checks. */
2193 if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
2195 /* FIXME: Make cost depend on complexity of individual check. */
2196 vec_outside_cost +=
2197 VEC_length (ddr_p, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo));
2198 if (vect_print_dump_info (REPORT_COST))
2199 fprintf (vect_dump, "cost model: Adding cost of checks for loop "
2200 "versioning aliasing.\n");
2203 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
2204 || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
2205 vec_outside_cost += vect_get_cost (cond_branch_taken);
2207 /* Count statements in scalar loop. Using this as scalar cost for a single
2208 iteration for now.
2210 TODO: Add outer loop support.
2212 TODO: Consider assigning different costs to different scalar
2213 statements. */
2215 /* FORNOW. */
2216 if (loop->inner)
2217 innerloop_iters = 50; /* FIXME */
2219 for (i = 0; i < nbbs; i++)
2221 gimple_stmt_iterator si;
2222 basic_block bb = bbs[i];
2224 if (bb->loop_father == loop->inner)
2225 factor = innerloop_iters;
2226 else
2227 factor = 1;
2229 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
2231 gimple stmt = gsi_stmt (si);
2232 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2233 /* Skip stmts that are not vectorized inside the loop. */
2234 if (!STMT_VINFO_RELEVANT_P (stmt_info)
2235 && (!STMT_VINFO_LIVE_P (stmt_info)
2236 || STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def))
2237 continue;
2238 vec_inside_cost += STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info) * factor;
2239 /* FIXME: for stmts in the inner-loop in outer-loop vectorization,
2240 some of the "outside" costs are generated inside the outer-loop. */
2241 vec_outside_cost += STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info);
2245 scalar_single_iter_cost = vect_get_single_scalar_iteration_cost (loop_vinfo);
2247 /* Add additional cost for the peeled instructions in prologue and epilogue
2248 loop.
2250 FORNOW: If we don't know the value of peel_iters for prologue or epilogue
2251 at compile-time - we assume it's vf/2 (the worst would be vf-1).
2253 TODO: Build an expression that represents peel_iters for prologue and
2254 epilogue to be used in a run-time test. */
2256 if (npeel < 0)
2258 peel_iters_prologue = vf/2;
2259 if (vect_print_dump_info (REPORT_COST))
2260 fprintf (vect_dump, "cost model: "
2261 "prologue peel iters set to vf/2.");
2263 /* If peeling for alignment is unknown, the loop bound of the main loop
2264 becomes unknown. */
2265 peel_iters_epilogue = vf/2;
2266 if (vect_print_dump_info (REPORT_COST))
2267 fprintf (vect_dump, "cost model: "
2268 "epilogue peel iters set to vf/2 because "
2269 "peeling for alignment is unknown .");
2271 /* If peeled iterations are unknown, count a taken branch and a not taken
2272 branch per peeled loop. Even if scalar loop iterations are known,
2273 vector iterations are not known since peeled prologue iterations are
2274 not known. Hence guards remain the same. */
2275 peel_guard_costs += 2 * (vect_get_cost (cond_branch_taken)
2276 + vect_get_cost (cond_branch_not_taken));
2277 vec_outside_cost += (peel_iters_prologue * scalar_single_iter_cost)
2278 + (peel_iters_epilogue * scalar_single_iter_cost)
2279 + peel_guard_costs;
2281 else
2283 peel_iters_prologue = npeel;
2284 vec_outside_cost += vect_get_known_peeling_cost (loop_vinfo,
2285 peel_iters_prologue, &peel_iters_epilogue,
2286 scalar_single_iter_cost);
2289 /* FORNOW: The scalar outside cost is incremented in one of the
2290 following ways:
2292 1. The vectorizer checks for alignment and aliasing and generates
2293 a condition that allows dynamic vectorization. A cost model
2294 check is ANDed with the versioning condition. Hence the scalar code
2295 path now has the added cost of the versioning check.
2297 if (cost > th && versioning_check)
2298 jmp to vector code
2300 Hence the run-time scalar cost is incremented by a not-taken branch cost.
2302 2. The vectorizer then checks if a prologue is required. If the
2303 cost model check was not done before during versioning, it has to
2304 be done before the prologue check.
2306 if (cost <= th)
2307 prologue = scalar_iters
2308 if (prologue == 0)
2309 jmp to vector code
2310 else
2311 execute prologue
2312 if (prologue == num_iters)
2313 go to exit
2315 Hence the run-time scalar cost is incremented by a taken branch,
2316 plus a not-taken branch, plus a taken branch cost.
2318 3. The vectorizer then checks if an epilogue is required. If the
2319 cost model check was not done before during prologue check, it
2320 has to be done with the epilogue check.
2322 if (prologue == 0)
2323 jmp to vector code
2324 else
2325 execute prologue
2326 if (prologue == num_iters)
2327 go to exit
2328 vector code:
2329 if ((cost <= th) || (scalar_iters-prologue-epilogue == 0))
2330 jmp to epilogue
2332 Hence the run-time scalar cost should be incremented by 2 taken
2333 branches.
2335 TODO: The back end may reorder the BBS's differently and reverse
2336 conditions/branch directions. Change the estimates below to
2337 something more reasonable. */
2339 /* If the number of iterations is known and we do not do versioning, we can
2340 decide whether to vectorize at compile time. Hence the scalar version
2341 does not carry cost model guard costs. */
2342 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2343 || LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
2344 || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
2346 /* Cost model check occurs at versioning. */
2347 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
2348 || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
2349 scalar_outside_cost += vect_get_cost (cond_branch_not_taken);
2350 else
2352 /* Cost model check occurs at prologue generation. */
2353 if (LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
2354 scalar_outside_cost += 2 * vect_get_cost (cond_branch_taken)
2355 + vect_get_cost (cond_branch_not_taken);
2356 /* Cost model check occurs at epilogue generation. */
2357 else
2358 scalar_outside_cost += 2 * vect_get_cost (cond_branch_taken);
2362 /* Add SLP costs. */
2363 slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
2364 FOR_EACH_VEC_ELT (slp_instance, slp_instances, i, instance)
2366 vec_outside_cost += SLP_INSTANCE_OUTSIDE_OF_LOOP_COST (instance);
2367 vec_inside_cost += SLP_INSTANCE_INSIDE_OF_LOOP_COST (instance);
2370 /* Calculate number of iterations required to make the vector version
2371 profitable, relative to the loop bodies only. The following condition
2372 must hold true:
2373 SIC * niters + SOC > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC
2374 where
2375 SIC = scalar iteration cost, VIC = vector iteration cost,
2376 VOC = vector outside cost, VF = vectorization factor,
2377 PL_ITERS = prologue iterations, EP_ITERS= epilogue iterations
2378 SOC = scalar outside cost for run time cost model check. */
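/* E.g. (hypothetical costs): with SIC = 4, VIC = 8, VOC = 20, SOC = 4,
   VF = 4 and no peeling, the computation below gives
   ((20 - 4) * 4) / (4 * 4 - 8) = 8, and the re-check bumps this to 9
   because at exactly 8 iterations the two versions merely break even. */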
2380 if ((scalar_single_iter_cost * vf) > vec_inside_cost)
2382 if (vec_outside_cost <= 0)
2383 min_profitable_iters = 1;
2384 else
2386 min_profitable_iters = ((vec_outside_cost - scalar_outside_cost) * vf
2387 - vec_inside_cost * peel_iters_prologue
2388 - vec_inside_cost * peel_iters_epilogue)
2389 / ((scalar_single_iter_cost * vf)
2390 - vec_inside_cost);
2392 if ((scalar_single_iter_cost * vf * min_profitable_iters)
2393 <= ((vec_inside_cost * min_profitable_iters)
2394 + ((vec_outside_cost - scalar_outside_cost) * vf)))
2395 min_profitable_iters++;
2398 /* vector version will never be profitable. */
2399 else
2401 if (vect_print_dump_info (REPORT_COST))
2402 fprintf (vect_dump, "cost model: the vector iteration cost = %d "
2403 "divided by the scalar iteration cost = %d "
2404 "is greater or equal to the vectorization factor = %d.",
2405 vec_inside_cost, scalar_single_iter_cost, vf);
2406 return -1;
2409 if (vect_print_dump_info (REPORT_COST))
2411 fprintf (vect_dump, "Cost model analysis: \n");
2412 fprintf (vect_dump, " Vector inside of loop cost: %d\n",
2413 vec_inside_cost);
2414 fprintf (vect_dump, " Vector outside of loop cost: %d\n",
2415 vec_outside_cost);
2416 fprintf (vect_dump, " Scalar iteration cost: %d\n",
2417 scalar_single_iter_cost);
2418 fprintf (vect_dump, " Scalar outside cost: %d\n", scalar_outside_cost);
2419 fprintf (vect_dump, " prologue iterations: %d\n",
2420 peel_iters_prologue);
2421 fprintf (vect_dump, " epilogue iterations: %d\n",
2422 peel_iters_epilogue);
2423 fprintf (vect_dump, " Calculated minimum iters for profitability: %d\n",
2424 min_profitable_iters);
2427 min_profitable_iters =
2428 min_profitable_iters < vf ? vf : min_profitable_iters;
2430 /* Because the condition we create is:
2431 if (niters <= min_profitable_iters)
2432 then skip the vectorized loop. */
2433 min_profitable_iters--;
2435 if (vect_print_dump_info (REPORT_COST))
2436 fprintf (vect_dump, " Profitability threshold = %d\n",
2437 min_profitable_iters);
2439 return min_profitable_iters;
2443 /* TODO: There is a close dependency between the vect_model_*_cost and
2444 vectorizable_* functions. Design this better to avoid maintenance issues. */
2446 /* Function vect_model_reduction_cost.
2448 Models cost for a reduction operation, including the vector ops
2449 generated within the strip-mine loop, the initial definition before
2450 the loop, and the epilogue code that must be generated. */
2452 static bool
2453 vect_model_reduction_cost (stmt_vec_info stmt_info, enum tree_code reduc_code,
2454 int ncopies)
2456 int outer_cost = 0;
2457 enum tree_code code;
2458 optab optab;
2459 tree vectype;
2460 gimple stmt, orig_stmt;
2461 tree reduction_op;
2462 enum machine_mode mode;
2463 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2464 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2467 /* Cost of reduction op inside loop. */
2468 STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info)
2469 += ncopies * vect_get_cost (vector_stmt);
2471 stmt = STMT_VINFO_STMT (stmt_info);
2473 switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
2475 case GIMPLE_SINGLE_RHS:
2476 gcc_assert (TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt)) == ternary_op);
2477 reduction_op = TREE_OPERAND (gimple_assign_rhs1 (stmt), 2);
2478 break;
2479 case GIMPLE_UNARY_RHS:
2480 reduction_op = gimple_assign_rhs1 (stmt);
2481 break;
2482 case GIMPLE_BINARY_RHS:
2483 reduction_op = gimple_assign_rhs2 (stmt);
2484 break;
2485 default:
2486 gcc_unreachable ();
2489 vectype = get_vectype_for_scalar_type (TREE_TYPE (reduction_op));
2490 if (!vectype)
2492 if (vect_print_dump_info (REPORT_COST))
2494 fprintf (vect_dump, "unsupported data-type ");
2495 print_generic_expr (vect_dump, TREE_TYPE (reduction_op), TDF_SLIM);
2497 return false;
2500 mode = TYPE_MODE (vectype);
2501 orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
2503 if (!orig_stmt)
2504 orig_stmt = STMT_VINFO_STMT (stmt_info);
2506 code = gimple_assign_rhs_code (orig_stmt);
2508 /* Add in cost for initial definition. */
2509 outer_cost += vect_get_cost (scalar_to_vec);
2511 /* Determine cost of epilogue code.
2513 We have a reduction operator that will reduce the vector in one statement.
2514 Also requires scalar extract. */
2516 if (!nested_in_vect_loop_p (loop, orig_stmt))
2518 if (reduc_code != ERROR_MARK)
2519 outer_cost += vect_get_cost (vector_stmt)
2520 + vect_get_cost (vec_to_scalar);
2521 else
2523 int vec_size_in_bits = tree_low_cst (TYPE_SIZE (vectype), 1);
2524 tree bitsize =
2525 TYPE_SIZE (TREE_TYPE (gimple_assign_lhs (orig_stmt)));
2526 int element_bitsize = tree_low_cst (bitsize, 1);
2527 int nelements = vec_size_in_bits / element_bitsize;
2529 optab = optab_for_tree_code (code, vectype, optab_default);
2531 /* We have a whole vector shift available. */
2532 if (VECTOR_MODE_P (mode)
2533 && optab_handler (optab, mode) != CODE_FOR_nothing
2534 && optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
2535 /* Final reduction via vector shifts and the reduction operator. Also
2536 requires scalar extract. */
2537 outer_cost += ((exact_log2 (nelements) * 2)
2538 * vect_get_cost (vector_stmt)
2539 + vect_get_cost (vec_to_scalar));
2540 else
2541 /* Use extracts and reduction op for final reduction. For N elements,
2542 we have N extracts and N-1 reduction ops. */
2543 outer_cost += ((nelements + nelements - 1)
2544 * vect_get_cost (vector_stmt));
2548 STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info) = outer_cost;
2550 if (vect_print_dump_info (REPORT_COST))
2551 fprintf (vect_dump, "vect_model_reduction_cost: inside_cost = %d, "
2552 "outside_cost = %d .", STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info),
2553 STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info));
2555 return true;
2559 /* Function vect_model_induction_cost.
2561 Models cost for induction operations. */
2563 static void
2564 vect_model_induction_cost (stmt_vec_info stmt_info, int ncopies)
2566 /* loop cost for vec_loop. */
2567 STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info)
2568 = ncopies * vect_get_cost (vector_stmt);
2569 /* prologue cost for vec_init and vec_step. */
2570 STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info)
2571 = 2 * vect_get_cost (scalar_to_vec);
2573 if (vect_print_dump_info (REPORT_COST))
2574 fprintf (vect_dump, "vect_model_induction_cost: inside_cost = %d, "
2575 "outside_cost = %d .", STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info),
2576 STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info));
2580 /* Function get_initial_def_for_induction
2582 Input:
2583 STMT - a stmt that performs an induction operation in the loop.
2584 IV_PHI - the loop-header phi node of the induction variable
2586 Output:
2587 Return a vector variable, initialized with the first VF values of
2588 the induction variable. E.g., for an iv with IV_PHI='X' and
2589 evolution S, for a vector of 4 units, we want to return:
2590 [X, X + S, X + 2*S, X + 3*S]. */
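/* E.g. (a hypothetical instance): for an iv with X = 0, S = 1 and
   nunits = VF = 4, this builds vec_init = {0, 1, 2, 3} and (in the
   non-nested case below) vec_step = {4, 4, 4, 4}, so the vector iv
   takes the values {0,1,2,3}, {4,5,6,7}, {8,9,10,11}, ... */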
2592 static tree
2593 get_initial_def_for_induction (gimple iv_phi)
2595 stmt_vec_info stmt_vinfo = vinfo_for_stmt (iv_phi);
2596 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
2597 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2598 tree scalar_type = TREE_TYPE (gimple_phi_result (iv_phi));
2599 tree vectype;
2600 int nunits;
2601 edge pe = loop_preheader_edge (loop);
2602 struct loop *iv_loop;
2603 basic_block new_bb;
2604 tree vec, vec_init, vec_step, t;
2605 tree access_fn;
2606 tree new_var;
2607 tree new_name;
2608 gimple init_stmt, induction_phi, new_stmt;
2609 tree induc_def, vec_def, vec_dest;
2610 tree init_expr, step_expr;
2611 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2612 int i;
2613 bool ok;
2614 int ncopies;
2615 tree expr;
2616 stmt_vec_info phi_info = vinfo_for_stmt (iv_phi);
2617 bool nested_in_vect_loop = false;
2618 gimple_seq stmts = NULL;
2619 imm_use_iterator imm_iter;
2620 use_operand_p use_p;
2621 gimple exit_phi;
2622 edge latch_e;
2623 tree loop_arg;
2624 gimple_stmt_iterator si;
2625 basic_block bb = gimple_bb (iv_phi);
2626 tree stepvectype;
2628 vectype = get_vectype_for_scalar_type (scalar_type);
2629 gcc_assert (vectype);
2630 nunits = TYPE_VECTOR_SUBPARTS (vectype);
2631 ncopies = vf / nunits;
2633 gcc_assert (phi_info);
2634 gcc_assert (ncopies >= 1);
2636 /* Find the first insertion point in the BB. */
2637 si = gsi_after_labels (bb);
2639 if (INTEGRAL_TYPE_P (scalar_type))
2640 step_expr = build_int_cst (scalar_type, 0);
2641 else if (POINTER_TYPE_P (scalar_type))
2642 step_expr = size_zero_node;
2643 else
2644 step_expr = build_real (scalar_type, dconst0);
2646 /* Is phi in an inner-loop, while vectorizing an enclosing outer-loop? */
2647 if (nested_in_vect_loop_p (loop, iv_phi))
2649 nested_in_vect_loop = true;
2650 iv_loop = loop->inner;
2652 else
2653 iv_loop = loop;
2654 gcc_assert (iv_loop == (gimple_bb (iv_phi))->loop_father);
2656 latch_e = loop_latch_edge (iv_loop);
2657 loop_arg = PHI_ARG_DEF_FROM_EDGE (iv_phi, latch_e);
2659 access_fn = analyze_scalar_evolution (iv_loop, PHI_RESULT (iv_phi));
2660 gcc_assert (access_fn);
2661 ok = vect_is_simple_iv_evolution (iv_loop->num, access_fn,
2662 &init_expr, &step_expr);
2663 gcc_assert (ok);
2664 pe = loop_preheader_edge (iv_loop);
2666 /* Create the vector that holds the initial_value of the induction. */
2667 if (nested_in_vect_loop)
2669 /* iv_loop is nested in the loop to be vectorized. init_expr had already
2670 been created during vectorization of previous stmts. We obtain it
2671 from the STMT_VINFO_VEC_STMT of the defining stmt. */
2672 tree iv_def = PHI_ARG_DEF_FROM_EDGE (iv_phi,
2673 loop_preheader_edge (iv_loop));
2674 vec_init = vect_get_vec_def_for_operand (iv_def, iv_phi, NULL);
2676 else
2678 /* iv_loop is the loop to be vectorized. Create:
2679 vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr) */
2680 new_var = vect_get_new_vect_var (scalar_type, vect_scalar_var, "var_");
2681 add_referenced_var (new_var);
2683 new_name = force_gimple_operand (init_expr, &stmts, false, new_var);
2684 if (stmts)
2686 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
2687 gcc_assert (!new_bb);
2690 t = NULL_TREE;
2691 t = tree_cons (NULL_TREE, init_expr, t);
2692 for (i = 1; i < nunits; i++)
2694 /* Create: new_name_i = new_name + step_expr */
2695 enum tree_code code = POINTER_TYPE_P (scalar_type)
2696 ? POINTER_PLUS_EXPR : PLUS_EXPR;
2697 init_stmt = gimple_build_assign_with_ops (code, new_var,
2698 new_name, step_expr);
2699 new_name = make_ssa_name (new_var, init_stmt);
2700 gimple_assign_set_lhs (init_stmt, new_name);
2702 new_bb = gsi_insert_on_edge_immediate (pe, init_stmt);
2703 gcc_assert (!new_bb);
2705 if (vect_print_dump_info (REPORT_DETAILS))
2707 fprintf (vect_dump, "created new init_stmt: ");
2708 print_gimple_stmt (vect_dump, init_stmt, 0, TDF_SLIM);
2710 t = tree_cons (NULL_TREE, new_name, t);
2712 /* Create a vector from [new_name_0, new_name_1, ..., new_name_nunits-1] */
2713 vec = build_constructor_from_list (vectype, nreverse (t));
2714 vec_init = vect_init_vector (iv_phi, vec, vectype, NULL);
2718 /* Create the vector that holds the step of the induction. */
2719 if (nested_in_vect_loop)
2720 /* iv_loop is nested in the loop to be vectorized. Generate:
2721 vec_step = [S, S, S, S] */
2722 new_name = step_expr;
2723 else
2725 /* iv_loop is the loop to be vectorized. Generate:
2726 vec_step = [VF*S, VF*S, VF*S, VF*S] */
2727 expr = build_int_cst (TREE_TYPE (step_expr), vf);
2728 new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
2729 expr, step_expr);
2732 t = unshare_expr (new_name);
2733 gcc_assert (CONSTANT_CLASS_P (new_name));
2734 stepvectype = get_vectype_for_scalar_type (TREE_TYPE (new_name));
2735 gcc_assert (stepvectype);
2736 vec = build_vector_from_val (stepvectype, t);
2737 vec_step = vect_init_vector (iv_phi, vec, stepvectype, NULL);
2740 /* Create the following def-use cycle:
2741 loop prolog:
2742 vec_init = ...
2743 vec_step = ...
2744 loop:
2745 vec_iv = PHI <vec_init, vec_loop>
2747 STMT
2749 vec_loop = vec_iv + vec_step; */
2751 /* Create the induction-phi that defines the induction-operand. */
2752 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
2753 add_referenced_var (vec_dest);
2754 induction_phi = create_phi_node (vec_dest, iv_loop->header);
2755 set_vinfo_for_stmt (induction_phi,
2756 new_stmt_vec_info (induction_phi, loop_vinfo, NULL));
2757 induc_def = PHI_RESULT (induction_phi);
2759 /* Create the iv update inside the loop */
2760 new_stmt = gimple_build_assign_with_ops (PLUS_EXPR, vec_dest,
2761 induc_def, vec_step);
2762 vec_def = make_ssa_name (vec_dest, new_stmt);
2763 gimple_assign_set_lhs (new_stmt, vec_def);
2764 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
2765 set_vinfo_for_stmt (new_stmt, new_stmt_vec_info (new_stmt, loop_vinfo,
2766 NULL));
2768 /* Set the arguments of the phi node: */
2769 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
2770 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
2771 UNKNOWN_LOCATION);
2774 /* In case the vectorization factor (VF) is bigger than the number
2775 of elements that we can fit in a vectype (nunits), we have to generate
2776 more than one vector stmt - i.e., we need to "unroll" the
2777 vector stmt by a factor VF/nunits. For more details see documentation
2778 in vectorizable_operation. */
2780 if (ncopies > 1)
2782 stmt_vec_info prev_stmt_vinfo;
2783 /* FORNOW. This restriction should be relaxed. */
2784 gcc_assert (!nested_in_vect_loop);
2786 /* Create the vector that holds the step of the induction. */
2787 expr = build_int_cst (TREE_TYPE (step_expr), nunits);
2788 new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
2789 expr, step_expr);
2790 t = unshare_expr (new_name);
2791 gcc_assert (CONSTANT_CLASS_P (new_name));
2792 vec = build_vector_from_val (stepvectype, t);
2793 vec_step = vect_init_vector (iv_phi, vec, stepvectype, NULL);
2795 vec_def = induc_def;
2796 prev_stmt_vinfo = vinfo_for_stmt (induction_phi);
2797 for (i = 1; i < ncopies; i++)
2799 /* vec_i = vec_prev + vec_step */
2800 new_stmt = gimple_build_assign_with_ops (PLUS_EXPR, vec_dest,
2801 vec_def, vec_step);
2802 vec_def = make_ssa_name (vec_dest, new_stmt);
2803 gimple_assign_set_lhs (new_stmt, vec_def);
2805 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
2806 set_vinfo_for_stmt (new_stmt,
2807 new_stmt_vec_info (new_stmt, loop_vinfo, NULL));
2808 STMT_VINFO_RELATED_STMT (prev_stmt_vinfo) = new_stmt;
2809 prev_stmt_vinfo = vinfo_for_stmt (new_stmt);
2813 if (nested_in_vect_loop)
2815 /* Find the loop-closed exit-phi of the induction, and record
2816 the final vector of induction results: */
2817 exit_phi = NULL;
2818 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
2820 if (!flow_bb_inside_loop_p (iv_loop, gimple_bb (USE_STMT (use_p))))
2822 exit_phi = USE_STMT (use_p);
2823 break;
2826 if (exit_phi)
2828 stmt_vec_info stmt_vinfo = vinfo_for_stmt (exit_phi);
2829 /* FORNOW. Currently not supporting the case that an inner-loop induction
2830 is not used in the outer-loop (i.e. only outside the outer-loop). */
2831 gcc_assert (STMT_VINFO_RELEVANT_P (stmt_vinfo)
2832 && !STMT_VINFO_LIVE_P (stmt_vinfo));
2834 STMT_VINFO_VEC_STMT (stmt_vinfo) = new_stmt;
2835 if (vect_print_dump_info (REPORT_DETAILS))
2837 fprintf (vect_dump, "vector of inductions after inner-loop:");
2838 print_gimple_stmt (vect_dump, new_stmt, 0, TDF_SLIM);
2844 if (vect_print_dump_info (REPORT_DETAILS))
2846 fprintf (vect_dump, "transform induction: created def-use cycle: ");
2847 print_gimple_stmt (vect_dump, induction_phi, 0, TDF_SLIM);
2848 fprintf (vect_dump, "\n");
2849 print_gimple_stmt (vect_dump, SSA_NAME_DEF_STMT (vec_def), 0, TDF_SLIM);
2852 STMT_VINFO_VEC_STMT (phi_info) = induction_phi;
2853 return induc_def;
2857 /* Function get_initial_def_for_reduction
2859 Input:
2860 STMT - a stmt that performs a reduction operation in the loop.
2861 INIT_VAL - the initial value of the reduction variable
2863 Output:
2864 ADJUSTMENT_DEF - a tree that holds a value to be added to the final result
2865 of the reduction (used for adjusting the epilog - see below).
2866 Return a vector variable, initialized according to the operation that STMT
2867 performs. This vector will be used as the initial value of the
2868 vector of partial results.
2870 Option1 (adjust in epilog): Initialize the vector as follows:
2871 add/bit or/xor: [0,0,...,0,0]
2872 mult/bit and: [1,1,...,1,1]
2873 min/max/cond_expr: [init_val,init_val,..,init_val,init_val]
2874 and when necessary (e.g. add/mult case) let the caller know
2875 that it needs to adjust the result by init_val.
2877 Option2: Initialize the vector as follows:
2878 add/bit or/xor: [init_val,0,0,...,0]
2879 mult/bit and: [init_val,1,1,...,1]
2880 min/max/cond_expr: [init_val,init_val,...,init_val]
2881 and no adjustments are needed.
2883 For example, for the following code:
2885 s = init_val;
2886 for (i=0;i<n;i++)
2887 s = s + a[i];
2889 STMT is 's = s + a[i]', and the reduction variable is 's'.
2890 For a vector of 4 units, we want to return either [0,0,0,init_val],
2891 or [0,0,0,0] and let the caller know that it needs to adjust
2892 the result at the end by 'init_val'.
2894 FORNOW, we use Option1 (the 'adjust in epilog' scheme) when
2895 ADJUSTMENT_DEF is not NULL, because its initialization vector is
2896 simpler (same element in all entries), and Option2 otherwise.
2898 A cost model should help decide between these two schemes. */
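/* E.g. (another instance of the schemes above): for "s *= a[i]" with
   init_val = 5 and 4 units, Option1 returns {1,1,1,1} and sets
   ADJUSTMENT_DEF to 5, so the epilog multiplies the reduced result by 5,
   whereas Option2 returns {5,1,1,1} and needs no adjustment. */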
2900 tree
2901 get_initial_def_for_reduction (gimple stmt, tree init_val,
2902 tree *adjustment_def)
2904 stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
2905 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
2906 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2907 tree scalar_type = TREE_TYPE (init_val);
2908 tree vectype = get_vectype_for_scalar_type (scalar_type);
2909 int nunits;
2910 enum tree_code code = gimple_assign_rhs_code (stmt);
2911 tree def_for_init;
2912 tree init_def;
2913 tree t = NULL_TREE;
2914 int i;
2915 bool nested_in_vect_loop = false;
2916 tree init_value;
2917 REAL_VALUE_TYPE real_init_val = dconst0;
2918 int int_init_val = 0;
2919 gimple def_stmt = NULL;
2921 gcc_assert (vectype);
2922 nunits = TYPE_VECTOR_SUBPARTS (vectype);
2924 gcc_assert (POINTER_TYPE_P (scalar_type) || INTEGRAL_TYPE_P (scalar_type)
2925 || SCALAR_FLOAT_TYPE_P (scalar_type));
2927 if (nested_in_vect_loop_p (loop, stmt))
2928 nested_in_vect_loop = true;
2929 else
2930 gcc_assert (loop == (gimple_bb (stmt))->loop_father);
2932 /* In case of double reduction we only create a vector variable to be put
2933 in the reduction phi node. The actual statement creation is done in
2934 vect_create_epilog_for_reduction. */
2935 if (adjustment_def && nested_in_vect_loop
2936 && TREE_CODE (init_val) == SSA_NAME
2937 && (def_stmt = SSA_NAME_DEF_STMT (init_val))
2938 && gimple_code (def_stmt) == GIMPLE_PHI
2939 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2940 && vinfo_for_stmt (def_stmt)
2941 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2942 == vect_double_reduction_def)
2944 *adjustment_def = NULL;
2945 return vect_create_destination_var (init_val, vectype);
2948 if (TREE_CONSTANT (init_val))
2950 if (SCALAR_FLOAT_TYPE_P (scalar_type))
2951 init_value = build_real (scalar_type, TREE_REAL_CST (init_val));
2952 else
2953 init_value = build_int_cst (scalar_type, TREE_INT_CST_LOW (init_val));
2955 else
2956 init_value = init_val;
2958 switch (code)
2960 case WIDEN_SUM_EXPR:
2961 case DOT_PROD_EXPR:
2962 case PLUS_EXPR:
2963 case MINUS_EXPR:
2964 case BIT_IOR_EXPR:
2965 case BIT_XOR_EXPR:
2966 case MULT_EXPR:
2967 case BIT_AND_EXPR:
2968 /* ADJUSTMENT_DEF is NULL when called from
2969 vect_create_epilog_for_reduction to vectorize a double reduction. */
2970 if (adjustment_def)
2972 if (nested_in_vect_loop)
2973 *adjustment_def = vect_get_vec_def_for_operand (init_val, stmt,
2974 NULL);
2975 else
2976 *adjustment_def = init_val;
2979 if (code == MULT_EXPR)
2981 real_init_val = dconst1;
2982 int_init_val = 1;
2985 if (code == BIT_AND_EXPR)
2986 int_init_val = -1;
2988 if (SCALAR_FLOAT_TYPE_P (scalar_type))
2989 def_for_init = build_real (scalar_type, real_init_val);
2990 else
2991 def_for_init = build_int_cst (scalar_type, int_init_val);
2993 /* Create a vector of '0' or '1' for all but the first element. */
2994 for (i = nunits - 2; i >= 0; --i)
2995 t = tree_cons (NULL_TREE, def_for_init, t);
2997 /* Option1: the first element is '0' or '1' as well. */
2998 if (adjustment_def)
3000 t = tree_cons (NULL_TREE, def_for_init, t);
3001 init_def = build_vector (vectype, t);
3002 break;
3005 /* Option2: the first element is INIT_VAL. */
3006 t = tree_cons (NULL_TREE, init_value, t);
3007 if (TREE_CONSTANT (init_val))
3008 init_def = build_vector (vectype, t);
3009 else
3010 init_def = build_constructor_from_list (vectype, t);
3012 break;
3014 case MIN_EXPR:
3015 case MAX_EXPR:
3016 case COND_EXPR:
3017 if (adjustment_def)
3019 *adjustment_def = NULL_TREE;
3020 init_def = vect_get_vec_def_for_operand (init_val, stmt, NULL);
3021 break;
3024 init_def = build_vector_from_val (vectype, init_value);
3025 break;
3027 default:
3028 gcc_unreachable ();
3031 return init_def;
3035 /* Function vect_create_epilog_for_reduction
3037 Create code at the loop-epilog to finalize the result of a reduction
3038 computation.
3040 VECT_DEFS is list of vector of partial results, i.e., the lhs's of vector
3041 reduction statements.
3042 STMT is the scalar reduction stmt that is being vectorized.
3043 NCOPIES is > 1 in case the vectorization factor (VF) is bigger than the
3044 number of elements that we can fit in a vectype (nunits). In this case
3045 we have to generate more than one vector stmt - i.e., we need to "unroll"
3046 the vector stmt by a factor VF/nunits. For more details see documentation
3047 in vectorizable_operation.
3048 REDUC_CODE is the tree-code for the epilog reduction.
3049 REDUCTION_PHIS is a list of the phi-nodes that carry the reduction
3050 computation.
3051 REDUC_INDEX is the index of the operand in the right hand side of the
3052 statement that is defined by REDUCTION_PHI.
3053 DOUBLE_REDUC is TRUE if double reduction phi nodes should be handled.
3054 SLP_NODE is an SLP node containing a group of reduction statements. The
3055 first one in this group is STMT.
3057 This function:
3058 1. Creates the reduction def-use cycles: sets the arguments for
3059 REDUCTION_PHIS:
3060 The loop-entry argument is the vectorized initial-value of the reduction.
3061 The loop-latch argument is taken from VECT_DEFS - the vector of partial
3062 sums.
3063 2. "Reduces" each vector of partial results VECT_DEFS into a single result,
3064 by applying the operation specified by REDUC_CODE if available, or by
3065 other means (whole-vector shifts or a scalar loop).
3066 The function also creates a new phi node at the loop exit to preserve
3067 loop-closed form, as illustrated below.
3069 The flow at the entry to this function:
3071 loop:
3072 vec_def = phi <null, null> # REDUCTION_PHI
3073 VECT_DEF = vector_stmt # vectorized form of STMT
3074 s_loop = scalar_stmt # (scalar) STMT
3075 loop_exit:
3076 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
3077 use <s_out0>
3078 use <s_out0>
3080 The above is transformed by this function into:
3082 loop:
3083 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
3084 VECT_DEF = vector_stmt # vectorized form of STMT
3085 s_loop = scalar_stmt # (scalar) STMT
3086 loop_exit:
3087 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
3088 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
3089 v_out2 = reduce <v_out1>
3090 s_out3 = extract_field <v_out2, 0>
3091 s_out4 = adjust_result <s_out3>
3092 use <s_out4>
3093 use <s_out4>
3096 static void
3097 vect_create_epilog_for_reduction (VEC (tree, heap) *vect_defs, gimple stmt,
3098 int ncopies, enum tree_code reduc_code,
3099 VEC (gimple, heap) *reduction_phis,
3100 int reduc_index, bool double_reduc,
3101 slp_tree slp_node)
3103 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3104 stmt_vec_info prev_phi_info;
3105 tree vectype;
3106 enum machine_mode mode;
3107 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3108 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *outer_loop = NULL;
3109 basic_block exit_bb;
3110 tree scalar_dest;
3111 tree scalar_type;
3112 gimple new_phi = NULL, phi;
3113 gimple_stmt_iterator exit_gsi;
3114 tree vec_dest;
3115 tree new_temp = NULL_TREE, new_dest, new_name, new_scalar_dest;
3116 gimple epilog_stmt = NULL;
3117 enum tree_code code = gimple_assign_rhs_code (stmt);
3118 gimple exit_phi;
3119 tree bitsize, bitpos;
3120 tree adjustment_def = NULL;
3121 tree vec_initial_def = NULL;
3122 tree reduction_op, expr, def;
3123 tree orig_name, scalar_result;
3124 imm_use_iterator imm_iter, phi_imm_iter;
3125 use_operand_p use_p, phi_use_p;
3126 bool extract_scalar_result = false;
3127 gimple use_stmt, orig_stmt, reduction_phi = NULL;
3128 bool nested_in_vect_loop = false;
3129 VEC (gimple, heap) *new_phis = NULL;
3130 enum vect_def_type dt = vect_unknown_def_type;
3131 int j, i;
3132 VEC (tree, heap) *scalar_results = NULL;
3133 unsigned int group_size = 1, k, ratio;
3134 VEC (tree, heap) *vec_initial_defs = NULL;
3135 VEC (gimple, heap) *phis;
3137 if (slp_node)
3138 group_size = VEC_length (gimple, SLP_TREE_SCALAR_STMTS (slp_node));
3140 if (nested_in_vect_loop_p (loop, stmt))
3142 outer_loop = loop;
3143 loop = loop->inner;
3144 nested_in_vect_loop = true;
3145 gcc_assert (!slp_node);
3148 switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
3150 case GIMPLE_SINGLE_RHS:
3151 gcc_assert (TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt))
3152 == ternary_op);
3153 reduction_op = TREE_OPERAND (gimple_assign_rhs1 (stmt), reduc_index);
3154 break;
3155 case GIMPLE_UNARY_RHS:
3156 reduction_op = gimple_assign_rhs1 (stmt);
3157 break;
3158 case GIMPLE_BINARY_RHS:
3159 reduction_op = reduc_index ?
3160 gimple_assign_rhs2 (stmt) : gimple_assign_rhs1 (stmt);
3161 break;
3162 default:
3163 gcc_unreachable ();
3166 vectype = get_vectype_for_scalar_type (TREE_TYPE (reduction_op));
3167 gcc_assert (vectype);
3168 mode = TYPE_MODE (vectype);
3170 /* 1. Create the reduction def-use cycle:
3171 Set the arguments of REDUCTION_PHIS, i.e., transform
3173 loop:
3174 vec_def = phi <null, null> # REDUCTION_PHI
3175 VECT_DEF = vector_stmt # vectorized form of STMT
3178 into:
3180 loop:
3181 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
3182 VECT_DEF = vector_stmt # vectorized form of STMT
3185 (in case of SLP, do it for all the phis). */
3187 /* Get the loop-entry arguments. */
3188 if (slp_node)
3189 vect_get_slp_defs (reduction_op, NULL_TREE, slp_node, &vec_initial_defs,
3190 NULL, reduc_index);
3191 else
3193 vec_initial_defs = VEC_alloc (tree, heap, 1);
3194 /* For the case of reduction, vect_get_vec_def_for_operand returns
3195 the scalar def before the loop, that defines the initial value
3196 of the reduction variable. */
3197 vec_initial_def = vect_get_vec_def_for_operand (reduction_op, stmt,
3198 &adjustment_def);
3199 VEC_quick_push (tree, vec_initial_defs, vec_initial_def);
3202 /* Set phi nodes arguments. */
3203 FOR_EACH_VEC_ELT (gimple, reduction_phis, i, phi)
3205 tree vec_init_def = VEC_index (tree, vec_initial_defs, i);
3206 tree def = VEC_index (tree, vect_defs, i);
3207 for (j = 0; j < ncopies; j++)
3209 /* Set the loop-entry arg of the reduction-phi. */
3210 add_phi_arg (phi, vec_init_def, loop_preheader_edge (loop),
3211 UNKNOWN_LOCATION);
3213 /* Set the loop-latch arg for the reduction-phi. */
3214 if (j > 0)
3215 def = vect_get_vec_def_for_stmt_copy (vect_unknown_def_type, def);
3217 add_phi_arg (phi, def, loop_latch_edge (loop), UNKNOWN_LOCATION);
3219 if (vect_print_dump_info (REPORT_DETAILS))
3221 fprintf (vect_dump, "transform reduction: created def-use"
3222 " cycle: ");
3223 print_gimple_stmt (vect_dump, phi, 0, TDF_SLIM);
3224 fprintf (vect_dump, "\n");
3225 print_gimple_stmt (vect_dump, SSA_NAME_DEF_STMT (def), 0,
3226 TDF_SLIM);
3229 phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi));
3233 VEC_free (tree, heap, vec_initial_defs);
3235 /* 2. Create epilog code.
3236 The reduction epilog code operates across the elements of the vector
3237 of partial results computed by the vectorized loop.
3238 The reduction epilog code consists of:
3240 step 1: compute the scalar result in a vector (v_out2)
3241 step 2: extract the scalar result (s_out3) from the vector (v_out2)
3242 step 3: adjust the scalar result (s_out3) if needed.
3244 Step 1 can be accomplished using one of the following three schemes:
3245 (scheme 1) using reduc_code, if available.
3246 (scheme 2) using whole-vector shifts, if available.
3247 (scheme 3) using a scalar loop. In this case steps 1+2 above are
3248 combined.
3250 The overall epilog code looks like this:
3252 s_out0 = phi <s_loop> # original EXIT_PHI
3253 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
3254 v_out2 = reduce <v_out1> # step 1
3255 s_out3 = extract_field <v_out2, 0> # step 2
3256 s_out4 = adjust_result <s_out3> # step 3
3258 (step 3 is optional, and steps 1 and 2 may be combined).
3259 Lastly, the uses of s_out0 are replaced by s_out4. */
3262 /* 2.1 Create new loop-exit-phis to preserve loop-closed form:
3263 v_out1 = phi <VECT_DEF>
3264 Store them in NEW_PHIS. */
3266 exit_bb = single_exit (loop)->dest;
3267 prev_phi_info = NULL;
3268 new_phis = VEC_alloc (gimple, heap, VEC_length (tree, vect_defs));
3269 FOR_EACH_VEC_ELT (tree, vect_defs, i, def)
3271 for (j = 0; j < ncopies; j++)
3273 phi = create_phi_node (SSA_NAME_VAR (def), exit_bb);
3274 set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, loop_vinfo, NULL));
3275 if (j == 0)
3276 VEC_quick_push (gimple, new_phis, phi);
3277 else
3279 def = vect_get_vec_def_for_stmt_copy (dt, def);
3280 STMT_VINFO_RELATED_STMT (prev_phi_info) = phi;
3283 SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, def);
3284 prev_phi_info = vinfo_for_stmt (phi);
3288 /* The epilogue is created for the outer-loop, i.e., for the loop being
3289 vectorized. */
3290 if (double_reduc)
3292 loop = outer_loop;
3293 exit_bb = single_exit (loop)->dest;
3296 exit_gsi = gsi_after_labels (exit_bb);
3298 /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
3299 (i.e. when reduc_code is not available) and in the final adjustment
3300 code (if needed). Also get the original scalar reduction variable as
3301 defined in the loop. In case STMT is a "pattern-stmt" (i.e. - it
3302 represents a reduction pattern), the tree-code and scalar-def are
3303 taken from the original stmt that the pattern-stmt (STMT) replaces.
3304 Otherwise (it is a regular reduction) - the tree-code and scalar-def
3305 are taken from STMT. */
3307 orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
3308 if (!orig_stmt)
3310 /* Regular reduction */
3311 orig_stmt = stmt;
3313 else
3315 /* Reduction pattern */
3316 stmt_vec_info stmt_vinfo = vinfo_for_stmt (orig_stmt);
3317 gcc_assert (STMT_VINFO_IN_PATTERN_P (stmt_vinfo));
3318 gcc_assert (STMT_VINFO_RELATED_STMT (stmt_vinfo) == stmt);
3321 code = gimple_assign_rhs_code (orig_stmt);
3322 /* For MINUS_EXPR the initial vector is [init_val,0,...,0]; therefore,
3323 partial results are added and not subtracted. */
3324 if (code == MINUS_EXPR)
3325 code = PLUS_EXPR;
3327 scalar_dest = gimple_assign_lhs (orig_stmt);
3328 scalar_type = TREE_TYPE (scalar_dest);
3329 scalar_results = VEC_alloc (tree, heap, group_size);
3330 new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
3331 bitsize = TYPE_SIZE (scalar_type);
3333 /* In case this is a reduction in an inner-loop while vectorizing an outer
3334 loop - we don't need to extract a single scalar result at the end of the
3335 inner-loop (unless it is double reduction, i.e., the use of reduction is
3336 outside the outer-loop). The final vector of partial results will be used
3337 in the vectorized outer-loop, or reduced to a scalar result at the end of
3338 the outer-loop. */
3339 if (nested_in_vect_loop && !double_reduc)
3340 goto vect_finalize_reduction;
3342 /* 2.3 Create the reduction code, using one of the three schemes described
3343 above. In SLP we simply need to extract all the elements from the
3344 vector (without reducing them), so we use scalar shifts. */
3345 if (reduc_code != ERROR_MARK && !slp_node)
3347 tree tmp;
3349 /*** Case 1: Create:
3350 v_out2 = reduc_expr <v_out1> */
3352 if (vect_print_dump_info (REPORT_DETAILS))
3353 fprintf (vect_dump, "Reduce using direct vector reduction.");
3355 vec_dest = vect_create_destination_var (scalar_dest, vectype);
3356 new_phi = VEC_index (gimple, new_phis, 0);
3357 tmp = build1 (reduc_code, vectype, PHI_RESULT (new_phi));
3358 epilog_stmt = gimple_build_assign (vec_dest, tmp);
3359 new_temp = make_ssa_name (vec_dest, epilog_stmt);
3360 gimple_assign_set_lhs (epilog_stmt, new_temp);
3361 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
3363 extract_scalar_result = true;
3365 else
3367 enum tree_code shift_code = ERROR_MARK;
3368 bool have_whole_vector_shift = true;
3369 int bit_offset;
3370 int element_bitsize = tree_low_cst (bitsize, 1);
3371 int vec_size_in_bits = tree_low_cst (TYPE_SIZE (vectype), 1);
3372 tree vec_temp;
3374 if (optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
3375 shift_code = VEC_RSHIFT_EXPR;
3376 else
3377 have_whole_vector_shift = false;
3379 /* Regardless of whether we have a whole vector shift, if we're
3380 emulating the operation via tree-vect-generic, we don't want
3381 to use it. Only the first round of the reduction is likely
3382 to still be profitable via emulation. */
3383 /* ??? It might be better to emit a reduction tree code here, so that
3384 tree-vect-generic can expand the first round via bit tricks. */
3385 if (!VECTOR_MODE_P (mode))
3386 have_whole_vector_shift = false;
3387 else
3389 optab optab = optab_for_tree_code (code, vectype, optab_default);
3390 if (optab_handler (optab, mode) == CODE_FOR_nothing)
3391 have_whole_vector_shift = false;
3394 if (have_whole_vector_shift && !slp_node)
3396 /*** Case 2: Create:
3397 for (offset = VS/2; offset >= element_size; offset/=2)
3399 Create: va' = vec_shift <va, offset>
3400 Create: va = vop <va, va'>
3401 } */
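/* Schematically (4 elements, addition, ignoring endianness): starting
   from va = {a, b, c, d},

     va' = vec_shift <va, 2 elements> = {c, d, _, _}
     va  = vop <va, va'>              = {a+c, b+d, _, _}
     va' = vec_shift <va, 1 element>  = {b+d, _, _, _}
     va  = vop <va, va'>              = {a+b+c+d, _, _, _}

   after which the scalar result is extracted from element 0
   ("_" marks don't-care values). */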
3403 if (vect_print_dump_info (REPORT_DETAILS))
3404 fprintf (vect_dump, "Reduce using vector shifts");
3406 vec_dest = vect_create_destination_var (scalar_dest, vectype);
3407 new_phi = VEC_index (gimple, new_phis, 0);
3408 new_temp = PHI_RESULT (new_phi);
3409 for (bit_offset = vec_size_in_bits/2;
3410 bit_offset >= element_bitsize;
3411 bit_offset /= 2)
3413 tree bitpos = size_int (bit_offset);
3415 epilog_stmt = gimple_build_assign_with_ops (shift_code,
3416 vec_dest, new_temp, bitpos);
3417 new_name = make_ssa_name (vec_dest, epilog_stmt);
3418 gimple_assign_set_lhs (epilog_stmt, new_name);
3419 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
3421 epilog_stmt = gimple_build_assign_with_ops (code, vec_dest,
3422 new_name, new_temp);
3423 new_temp = make_ssa_name (vec_dest, epilog_stmt);
3424 gimple_assign_set_lhs (epilog_stmt, new_temp);
3425 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
3428 extract_scalar_result = true;
3430 else
3432 tree rhs;
3434 /*** Case 3: Create:
3435 s = extract_field <v_out2, 0>
3436 for (offset = element_size;
3437 offset < vector_size;
3438 offset += element_size;)
3440 Create: s' = extract_field <v_out2, offset>
3441 Create: s = op <s, s'> // For non SLP cases
3442 } */
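/* Schematically, for {a, b, c, d} this emits
     s = a; s = op <s, b>; s = op <s, c>; s = op <s, d>;
   i.e., N extracts and N-1 scalar ops; in the SLP case the extracted
   values are kept separate in SCALAR_RESULTS instead. */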
3444 if (vect_print_dump_info (REPORT_DETAILS))
3445 fprintf (vect_dump, "Reduce using scalar code. ");
3447 vec_size_in_bits = tree_low_cst (TYPE_SIZE (vectype), 1);
3448 FOR_EACH_VEC_ELT (gimple, new_phis, i, new_phi)
3450 vec_temp = PHI_RESULT (new_phi);
3451 rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, bitsize,
3452 bitsize_zero_node);
3453 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
3454 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
3455 gimple_assign_set_lhs (epilog_stmt, new_temp);
3456 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
3458 /* In SLP we don't need to apply the reduction operation, so we just
3459 collect s' values in SCALAR_RESULTS. */
3460 if (slp_node)
3461 VEC_safe_push (tree, heap, scalar_results, new_temp);
3463 for (bit_offset = element_bitsize;
3464 bit_offset < vec_size_in_bits;
3465 bit_offset += element_bitsize)
3467 tree bitpos = bitsize_int (bit_offset);
3468 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp,
3469 bitsize, bitpos);
3471 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
3472 new_name = make_ssa_name (new_scalar_dest, epilog_stmt);
3473 gimple_assign_set_lhs (epilog_stmt, new_name);
3474 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
3476 if (slp_node)
3478 /* In SLP we don't need to apply the reduction operation, so
3479 we just collect s' values in SCALAR_RESULTS. */
3480 new_temp = new_name;
3481 VEC_safe_push (tree, heap, scalar_results, new_name);
3483 else
3485 epilog_stmt = gimple_build_assign_with_ops (code,
3486 new_scalar_dest, new_name, new_temp);
3487 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
3488 gimple_assign_set_lhs (epilog_stmt, new_temp);
3489 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
3494 /* The only case where we need to reduce scalar results in SLP is
3495 unrolling. If the size of SCALAR_RESULTS is greater than
3496 GROUP_SIZE, we reduce them combining elements modulo
3497 GROUP_SIZE. */
3498 if (slp_node)
3500 tree res, first_res, new_res;
3501 gimple new_stmt;
3503 /* Reduce multiple scalar results in case of SLP unrolling. */
3504 for (j = group_size; VEC_iterate (tree, scalar_results, j, res);
3505 j++)
3507 first_res = VEC_index (tree, scalar_results, j % group_size);
3508 new_stmt = gimple_build_assign_with_ops (code,
3509 new_scalar_dest, first_res, res);
3510 new_res = make_ssa_name (new_scalar_dest, new_stmt);
3511 gimple_assign_set_lhs (new_stmt, new_res);
3512 gsi_insert_before (&exit_gsi, new_stmt, GSI_SAME_STMT);
3513 VEC_replace (tree, scalar_results, j % group_size, new_res);
3516 else
3517 /* Not SLP - we have one scalar to keep in SCALAR_RESULTS. */
3518 VEC_safe_push (tree, heap, scalar_results, new_temp);
3520 extract_scalar_result = false;
3524 /* 2.4 Extract the final scalar result. Create:
3525 s_out3 = extract_field <v_out2, bitpos> */
3527 if (extract_scalar_result)
3529 tree rhs;
3531 if (vect_print_dump_info (REPORT_DETAILS))
3532 fprintf (vect_dump, "extract scalar result");
3534 if (BYTES_BIG_ENDIAN)
3535 bitpos = size_binop (MULT_EXPR,
3536 bitsize_int (TYPE_VECTOR_SUBPARTS (vectype) - 1),
3537 TYPE_SIZE (scalar_type));
3538 else
3539 bitpos = bitsize_zero_node;
3541 rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp, bitsize, bitpos);
3542 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
3543 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
3544 gimple_assign_set_lhs (epilog_stmt, new_temp);
3545 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
3546 VEC_safe_push (tree, heap, scalar_results, new_temp);
3549 vect_finalize_reduction:
3551 if (double_reduc)
3552 loop = loop->inner;
3554 /* 2.5 Adjust the final result by the initial value of the reduction
3555 variable. (When such adjustment is not needed, then
3556 'adjustment_def' is zero). For example, if code is PLUS we create:
3557 new_temp = loop_exit_def + adjustment_def */
3559 if (adjustment_def)
3561 gcc_assert (!slp_node);
3562 if (nested_in_vect_loop)
3564 new_phi = VEC_index (gimple, new_phis, 0);
3565 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) == VECTOR_TYPE);
3566 expr = build2 (code, vectype, PHI_RESULT (new_phi), adjustment_def);
3567 new_dest = vect_create_destination_var (scalar_dest, vectype);
3569 else
3571 new_temp = VEC_index (tree, scalar_results, 0);
3572 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
3573 expr = build2 (code, scalar_type, new_temp, adjustment_def);
3574 new_dest = vect_create_destination_var (scalar_dest, scalar_type);
3577 epilog_stmt = gimple_build_assign (new_dest, expr);
3578 new_temp = make_ssa_name (new_dest, epilog_stmt);
3579 gimple_assign_set_lhs (epilog_stmt, new_temp);
3580 SSA_NAME_DEF_STMT (new_temp) = epilog_stmt;
3581 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
3582 if (nested_in_vect_loop)
3584 set_vinfo_for_stmt (epilog_stmt,
3585 new_stmt_vec_info (epilog_stmt, loop_vinfo,
3586 NULL));
3587 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (epilog_stmt)) =
3588 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (new_phi));
3590 if (!double_reduc)
3591 VEC_quick_push (tree, scalar_results, new_temp);
3592 else
3593 VEC_replace (tree, scalar_results, 0, new_temp);
3595 else
3596 VEC_replace (tree, scalar_results, 0, new_temp);
3598 VEC_replace (gimple, new_phis, 0, epilog_stmt);
3601 /* 2.6 Handle the loop-exit phis. Replace the uses of scalar loop-exit
3602 phis with new adjusted scalar results, i.e., replace use <s_out0>
3603 with use <s_out4>.
3605 Transform:
3606 loop_exit:
3607 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
3608 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
3609 v_out2 = reduce <v_out1>
3610 s_out3 = extract_field <v_out2, 0>
3611 s_out4 = adjust_result <s_out3>
3612 use <s_out0>
3613 use <s_out0>
3615 into:
3617 loop_exit:
3618 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
3619 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
3620 v_out2 = reduce <v_out1>
3621 s_out3 = extract_field <v_out2, 0>
3622 s_out4 = adjust_result <s_out3>
3623 use <s_out4>
3624 use <s_out4> */
3626 /* In SLP we may have several statements in NEW_PHIS and REDUCTION_PHIS (in
3627 case GROUP_SIZE is greater than the vectorization factor). Therefore, we
3628 need to match SCALAR_RESULTS with the corresponding statements. The first
3629 (GROUP_SIZE / number of new vector stmts) scalar results correspond to
3630 the first vector stmt, etc.
3631 (RATIO is equal to (GROUP_SIZE / number of new vector stmts)). */
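/* E.g., with GROUP_SIZE == 4 and two vector stmts in NEW_PHIS, RATIO
   is 2: scalar results 0 and 1 correspond to the first vector stmt,
   and scalar results 2 and 3 to the second. */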
3632 if (group_size > VEC_length (gimple, new_phis))
3634 ratio = group_size / VEC_length (gimple, new_phis);
3635 gcc_assert (!(group_size % VEC_length (gimple, new_phis)));
3637 else
3638 ratio = 1;
3640 for (k = 0; k < group_size; k++)
3642 if (k % ratio == 0)
3644 epilog_stmt = VEC_index (gimple, new_phis, k / ratio);
3645 reduction_phi = VEC_index (gimple, reduction_phis, k / ratio);
3648 if (slp_node)
3650 gimple current_stmt = VEC_index (gimple,
3651 SLP_TREE_SCALAR_STMTS (slp_node), k);
3653 orig_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (current_stmt));
3654 /* SLP statements can't participate in patterns. */
3655 gcc_assert (!orig_stmt);
3656 scalar_dest = gimple_assign_lhs (current_stmt);
3659 phis = VEC_alloc (gimple, heap, 3);
3660 /* Find the loop-closed-use at the loop exit of the original scalar
3661 result. (The reduction result is expected to have two immediate uses -
3662 one at the latch block, and one at the loop exit). */
3663 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
3664 if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
3665 VEC_safe_push (gimple, heap, phis, USE_STMT (use_p));
3667 /* We expect to have found an exit_phi because of loop-closed-ssa
3668 form. */
3669 gcc_assert (!VEC_empty (gimple, phis));
3671 FOR_EACH_VEC_ELT (gimple, phis, i, exit_phi)
3673 if (outer_loop)
3675 stmt_vec_info exit_phi_vinfo = vinfo_for_stmt (exit_phi);
3676 gimple vect_phi;
3678 /* FORNOW. Currently not supporting the case that an inner-loop
3679 reduction is not used in the outer-loop (but only outside the
3680 outer-loop), unless it is a double reduction. */
3681 gcc_assert ((STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
3682 && !STMT_VINFO_LIVE_P (exit_phi_vinfo))
3683 || double_reduc);
3685 STMT_VINFO_VEC_STMT (exit_phi_vinfo) = epilog_stmt;
3686 if (!double_reduc
3687 || STMT_VINFO_DEF_TYPE (exit_phi_vinfo)
3688 != vect_double_reduction_def)
3689 continue;
3691 /* Handle double reduction:
3693 stmt1: s1 = phi <s0, s2> - double reduction phi (outer loop)
3694 stmt2: s3 = phi <s1, s4> - (regular) reduc phi (inner loop)
3695 stmt3: s4 = use (s3) - (regular) reduc stmt (inner loop)
3696 stmt4: s2 = phi <s4> - double reduction stmt (outer loop)
3698 At that point the regular reduction (stmt2 and stmt3) is
3699 already vectorized, as well as the exit phi node, stmt4.
3700 Here we vectorize the phi node of double reduction, stmt1, and
3701 update all relevant statements. */
3703 /* Go through all the uses of s2 to find double reduction phi
3704 node, i.e., stmt1 above. */
3705 orig_name = PHI_RESULT (exit_phi);
3706 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
3708 stmt_vec_info use_stmt_vinfo = vinfo_for_stmt (use_stmt);
3709 stmt_vec_info new_phi_vinfo;
3710 tree vect_phi_init, preheader_arg, vect_phi_res, init_def;
3711 basic_block bb = gimple_bb (use_stmt);
3712 gimple use;
3714 /* Check that USE_STMT is really a double reduction phi
3715 node. */
3716 if (gimple_code (use_stmt) != GIMPLE_PHI
3717 || gimple_phi_num_args (use_stmt) != 2
3718 || !use_stmt_vinfo
3719 || STMT_VINFO_DEF_TYPE (use_stmt_vinfo)
3720 != vect_double_reduction_def
3721 || bb->loop_father != outer_loop)
3722 continue;
3724 /* Create vector phi node for double reduction:
3725 vs1 = phi <vs0, vs2>
3726 vs1 was created previously in this function by a call to
3727 vect_get_vec_def_for_operand and is stored in
3728 vec_initial_def;
3729 vs2 is defined by EPILOG_STMT, the vectorized EXIT_PHI;
3730 vs0 is created here. */
3732 /* Create vector phi node. */
3733 vect_phi = create_phi_node (vec_initial_def, bb);
3734 new_phi_vinfo = new_stmt_vec_info (vect_phi,
3735 loop_vec_info_for_loop (outer_loop), NULL);
3736 set_vinfo_for_stmt (vect_phi, new_phi_vinfo);
3738 /* Create vs0 - initial def of the double reduction phi. */
3739 preheader_arg = PHI_ARG_DEF_FROM_EDGE (use_stmt,
3740 loop_preheader_edge (outer_loop));
3741 init_def = get_initial_def_for_reduction (stmt,
3742 preheader_arg, NULL);
3743 vect_phi_init = vect_init_vector (use_stmt, init_def,
3744 vectype, NULL);
3746 /* Update phi node arguments with vs0 and vs2. */
3747 add_phi_arg (vect_phi, vect_phi_init,
3748 loop_preheader_edge (outer_loop),
3749 UNKNOWN_LOCATION);
3750 add_phi_arg (vect_phi, PHI_RESULT (epilog_stmt),
3751 loop_latch_edge (outer_loop), UNKNOWN_LOCATION);
3752 if (vect_print_dump_info (REPORT_DETAILS))
3754 fprintf (vect_dump, "created double reduction phi "
3755 "node: ");
3756 print_gimple_stmt (vect_dump, vect_phi, 0, TDF_SLIM);
3759 vect_phi_res = PHI_RESULT (vect_phi);
3761 /* Replace the use, i.e., set the correct vs1 in the regular
3762 reduction phi node. FORNOW, NCOPIES is always 1, so the
3763 loop is redundant. */
3764 use = reduction_phi;
3765 for (j = 0; j < ncopies; j++)
3767 edge pr_edge = loop_preheader_edge (loop);
3768 SET_PHI_ARG_DEF (use, pr_edge->dest_idx, vect_phi_res);
3769 use = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (use));
3775 VEC_free (gimple, heap, phis);
3776 if (nested_in_vect_loop)
3778 if (double_reduc)
3779 loop = outer_loop;
3780 else
3781 continue;
3784 phis = VEC_alloc (gimple, heap, 3);
3785 /* Find the loop-closed-use at the loop exit of the original scalar
3786 result. (The reduction result is expected to have two immediate uses,
3787 one at the latch block, and one at the loop exit). For double
3788 reductions we are looking for exit phis of the outer loop. */
3789 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
3791 if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
3792 VEC_safe_push (gimple, heap, phis, USE_STMT (use_p));
3793 else
3795 if (double_reduc && gimple_code (USE_STMT (use_p)) == GIMPLE_PHI)
3797 tree phi_res = PHI_RESULT (USE_STMT (use_p));
3799 FOR_EACH_IMM_USE_FAST (phi_use_p, phi_imm_iter, phi_res)
3801 if (!flow_bb_inside_loop_p (loop,
3802 gimple_bb (USE_STMT (phi_use_p))))
3803 VEC_safe_push (gimple, heap, phis,
3804 USE_STMT (phi_use_p));
3810 FOR_EACH_VEC_ELT (gimple, phis, i, exit_phi)
3812 /* Replace the uses: */
3813 orig_name = PHI_RESULT (exit_phi);
3814 scalar_result = VEC_index (tree, scalar_results, k);
3815 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
3816 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
3817 SET_USE (use_p, scalar_result);
3820 VEC_free (gimple, heap, phis);
3823 VEC_free (tree, heap, scalar_results);
3824 VEC_free (gimple, heap, new_phis);
3828 /* Function vectorizable_reduction.
3830 Check if STMT performs a reduction operation that can be vectorized.
3831 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
3832 stmt to replace it, put it in VEC_STMT, and insert it at GSI.
3833 Return FALSE if not a vectorizable STMT, TRUE otherwise.
3835 This function also handles reduction idioms (patterns) that have been
3836 recognized in advance during vect_pattern_recog. In this case, STMT may be
3837 of this form:
3838 X = pattern_expr (arg0, arg1, ..., X)
3839 and its STMT_VINFO_RELATED_STMT points to the last stmt in the original
3840 sequence that had been detected and replaced by the pattern-stmt (STMT).
3842 In some cases of reduction patterns, the type of the reduction variable X is
3843 different from the type of the other arguments of STMT.
3844 In such cases, the vectype that is used when transforming STMT into a vector
3845 stmt is different from the vectype that is used to determine the
3846 vectorization factor, because it consists of a different number of elements
3847 than the actual number of elements that are being operated upon in parallel.
3849 For example, consider an accumulation of shorts into an int accumulator.
3850 On some targets it's possible to vectorize this pattern operating on 8
3851 shorts at a time (hence, the vectype for purposes of determining the
3852 vectorization factor should be V8HI); on the other hand, the vectype that
3853 is used to create the vector form is actually V4SI (the type of the result).
3855 Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
3856 indicates the actual level of parallelism (V8HI in the example), so
3857 that the right vectorization factor is derived. This vectype
3858 corresponds to the type of arguments to the reduction stmt, and should *NOT*
3859 be used to create the vectorized stmt. The right vectype for the vectorized
3860 stmt is obtained from the type of the result X:
3861 get_vectype_for_scalar_type (TREE_TYPE (X))
3863 This means that, contrary to "regular" reductions (or "regular" stmts in
3864 general), the following equation:
3865 STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (TREE_TYPE (X))
3866 does *NOT* necessarily hold for reduction patterns. */
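/* Concretely, for STMT: int_acc = widen_sum <short_a, int_acc> from the
   example above, STMT_VINFO_VECTYPE is V8HI (derived from short_a),
   while get_vectype_for_scalar_type (TREE_TYPE (int_acc)) yields V4SI,
   the vectype actually used for the vectorized stmt. */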
3868 bool
3869 vectorizable_reduction (gimple stmt, gimple_stmt_iterator *gsi,
3870 gimple *vec_stmt, slp_tree slp_node)
3872 tree vec_dest;
3873 tree scalar_dest;
3874 tree loop_vec_def0 = NULL_TREE, loop_vec_def1 = NULL_TREE;
3875 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3876 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
3877 tree vectype_in = NULL_TREE;
3878 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3879 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3880 enum tree_code code, orig_code, epilog_reduc_code;
3881 enum machine_mode vec_mode;
3882 int op_type;
3883 optab optab, reduc_optab;
3884 tree new_temp = NULL_TREE;
3885 tree def;
3886 gimple def_stmt;
3887 enum vect_def_type dt;
3888 gimple new_phi = NULL;
3889 tree scalar_type;
3890 bool is_simple_use;
3891 gimple orig_stmt;
3892 stmt_vec_info orig_stmt_info;
3893 tree expr = NULL_TREE;
3894 int i;
3895 int ncopies;
3896 int epilog_copies;
3897 stmt_vec_info prev_stmt_info, prev_phi_info;
3898 bool single_defuse_cycle = false;
3899 tree reduc_def = NULL_TREE;
3900 gimple new_stmt = NULL;
3901 int j;
3902 tree ops[3];
3903 bool nested_cycle = false, found_nested_cycle_def = false;
3904 gimple reduc_def_stmt = NULL;
3905 /* The default is that the reduction variable is the last operand in the statement. */
3906 int reduc_index = 2;
3907 bool double_reduc = false, dummy;
3908 basic_block def_bb;
3909 struct loop * def_stmt_loop, *outer_loop = NULL;
3910 tree def_arg;
3911 gimple def_arg_stmt;
3912 VEC (tree, heap) *vec_oprnds0 = NULL, *vec_oprnds1 = NULL, *vect_defs = NULL;
3913 VEC (gimple, heap) *phis = NULL;
3914 int vec_num;
3915 tree def0, def1, tem;
3917 if (nested_in_vect_loop_p (loop, stmt))
3919 outer_loop = loop;
3920 loop = loop->inner;
3921 nested_cycle = true;
3924 /* 1. Is vectorizable reduction? */
3925 /* Not supportable if the reduction variable is used in the loop. */
3926 if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer)
3927 return false;
3929 /* Reductions that are not used even in an enclosing outer-loop
3930 are expected to be "live" (used out of the loop). */
3931 if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope
3932 && !STMT_VINFO_LIVE_P (stmt_info))
3933 return false;
3935 /* Make sure it was already recognized as a reduction computation. */
3936 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def
3937 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_nested_cycle)
3938 return false;
3940 /* 2. Has this been recognized as a reduction pattern?
3942 Check if STMT represents a pattern that has been recognized
3943 in earlier analysis stages. For stmts that represent a pattern,
3944 the STMT_VINFO_RELATED_STMT field records the last stmt in
3945 the original sequence that constitutes the pattern. */
3947 orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
3948 if (orig_stmt)
3950 orig_stmt_info = vinfo_for_stmt (orig_stmt);
3951 gcc_assert (STMT_VINFO_RELATED_STMT (orig_stmt_info) == stmt);
3952 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
3953 gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
3956 /* 3. Check the operands of the operation. The first operands are defined
3957 inside the loop body. The last operand is the reduction variable,
3958 which is defined by the loop-header-phi. */
3960 gcc_assert (is_gimple_assign (stmt));
3962 /* Flatten RHS. */
3963 switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
3965 case GIMPLE_SINGLE_RHS:
3966 op_type = TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt));
3967 if (op_type == ternary_op)
3969 tree rhs = gimple_assign_rhs1 (stmt);
3970 ops[0] = TREE_OPERAND (rhs, 0);
3971 ops[1] = TREE_OPERAND (rhs, 1);
3972 ops[2] = TREE_OPERAND (rhs, 2);
3973 code = TREE_CODE (rhs);
3975 else
3976 return false;
3977 break;
3979 case GIMPLE_BINARY_RHS:
3980 code = gimple_assign_rhs_code (stmt);
3981 op_type = TREE_CODE_LENGTH (code);
3982 gcc_assert (op_type == binary_op);
3983 ops[0] = gimple_assign_rhs1 (stmt);
3984 ops[1] = gimple_assign_rhs2 (stmt);
3985 break;
3987 case GIMPLE_UNARY_RHS:
3988 return false;
3990 default:
3991 gcc_unreachable ();
3994 scalar_dest = gimple_assign_lhs (stmt);
3995 scalar_type = TREE_TYPE (scalar_dest);
3996 if (!POINTER_TYPE_P (scalar_type) && !INTEGRAL_TYPE_P (scalar_type)
3997 && !SCALAR_FLOAT_TYPE_P (scalar_type))
3998 return false;
4000 /* All uses but the last are expected to be defined in the loop.
4001 The last use is the reduction variable. In case of a nested cycle this
4002 assumption does not hold: we use reduc_index to record the index of the
4003 reduction variable. */
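/* E.g., for the binary reduction "s = a[i] + s", ops[0] = a[i] is
   defined inside the loop, while the last operand ops[1] = s is the
   reduction variable defined by the loop-header phi. */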
4004 for (i = 0; i < op_type-1; i++)
4006 /* The condition of COND_EXPR is checked in vectorizable_condition(). */
4007 if (i == 0 && code == COND_EXPR)
4008 continue;
4010 is_simple_use = vect_is_simple_use_1 (ops[i], loop_vinfo, NULL,
4011 &def_stmt, &def, &dt, &tem);
4012 if (!vectype_in)
4013 vectype_in = tem;
4014 gcc_assert (is_simple_use);
4015 if (dt != vect_internal_def
4016 && dt != vect_external_def
4017 && dt != vect_constant_def
4018 && dt != vect_induction_def
4019 && !(dt == vect_nested_cycle && nested_cycle))
4020 return false;
4022 if (dt == vect_nested_cycle)
4024 found_nested_cycle_def = true;
4025 reduc_def_stmt = def_stmt;
4026 reduc_index = i;
4030 is_simple_use = vect_is_simple_use_1 (ops[i], loop_vinfo, NULL, &def_stmt,
4031 &def, &dt, &tem);
4032 if (!vectype_in)
4033 vectype_in = tem;
4034 gcc_assert (is_simple_use);
4035 gcc_assert (dt == vect_reduction_def
4036 || dt == vect_nested_cycle
4037 || ((dt == vect_internal_def || dt == vect_external_def
4038 || dt == vect_constant_def || dt == vect_induction_def)
4039 && nested_cycle && found_nested_cycle_def));
4040 if (!found_nested_cycle_def)
4041 reduc_def_stmt = def_stmt;
4043 gcc_assert (gimple_code (reduc_def_stmt) == GIMPLE_PHI);
4044 if (orig_stmt)
4045 gcc_assert (orig_stmt == vect_is_simple_reduction (loop_vinfo,
4046 reduc_def_stmt,
4047 !nested_cycle,
4048 &dummy));
4049 else
4050 gcc_assert (stmt == vect_is_simple_reduction (loop_vinfo, reduc_def_stmt,
4051 !nested_cycle, &dummy));
4053 if (STMT_VINFO_LIVE_P (vinfo_for_stmt (reduc_def_stmt)))
4054 return false;
4056 if (slp_node)
4057 ncopies = 1;
4058 else
4059 ncopies = (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
4060 / TYPE_VECTOR_SUBPARTS (vectype_in));
4062 gcc_assert (ncopies >= 1);
4064 vec_mode = TYPE_MODE (vectype_in);
4066 if (code == COND_EXPR)
4068 if (!vectorizable_condition (stmt, gsi, NULL, ops[reduc_index], 0))
4070 if (vect_print_dump_info (REPORT_DETAILS))
4071 fprintf (vect_dump, "unsupported condition in reduction");
4073 return false;
4076 else
4078 /* 4. Supportable by target? */
4080 /* 4.1. check support for the operation in the loop */
4081 optab = optab_for_tree_code (code, vectype_in, optab_default);
4082 if (!optab)
4084 if (vect_print_dump_info (REPORT_DETAILS))
4085 fprintf (vect_dump, "no optab.");
4087 return false;
4090 if (optab_handler (optab, vec_mode) == CODE_FOR_nothing)
4092 if (vect_print_dump_info (REPORT_DETAILS))
4093 fprintf (vect_dump, "op not supported by target.");
4095 if (GET_MODE_SIZE (vec_mode) != UNITS_PER_WORD
4096 || LOOP_VINFO_VECT_FACTOR (loop_vinfo)
4097 < vect_min_worthwhile_factor (code))
4098 return false;
4100 if (vect_print_dump_info (REPORT_DETAILS))
4101 fprintf (vect_dump, "proceeding using word mode.");
4104 /* Worthwhile without SIMD support? */
4105 if (!VECTOR_MODE_P (TYPE_MODE (vectype_in))
4106 && LOOP_VINFO_VECT_FACTOR (loop_vinfo)
4107 < vect_min_worthwhile_factor (code))
4109 if (vect_print_dump_info (REPORT_DETAILS))
4110 fprintf (vect_dump, "not worthwhile without SIMD support.");
4112 return false;
4116 /* 4.2. Check support for the epilog operation.
4118 If STMT represents a reduction pattern, then the type of the
4119 reduction variable may be different than the type of the rest
4120 of the arguments. For example, consider the case of accumulation
4121 of shorts into an int accumulator; the original code:
4122 S1: int_a = (int) short_a;
4123 orig_stmt-> S2: int_acc = plus <int_a ,int_acc>;
4125 was replaced with:
4126 STMT: int_acc = widen_sum <short_a, int_acc>
4128 This means that:
4129 1. The tree-code that is used to create the vector operation in the
4130 epilog code (that reduces the partial results) is not the
4131 tree-code of STMT, but is rather the tree-code of the original
4132 stmt from the pattern that STMT is replacing. I.e., in the example
4133 above we want to use 'widen_sum' in the loop, but 'plus' in the
4134 epilog.
4135 2. The type (mode) we use to check available target support
4136 for the vector operation to be created in the *epilog*, is
4137 determined by the type of the reduction variable (in the example
4138 above we'd check this: optab_handler (plus_optab, vect_int_mode)).
4139 However the type (mode) we use to check available target support
4140 for the vector operation to be created *inside the loop*, is
4141 determined by the type of the other arguments to STMT (in the
4142 example we'd check this: optab_handler (widen_sum_optab,
4143 vect_short_mode)).
4145 This is contrary to "regular" reductions, in which the types of all
4146 the arguments are the same as the type of the reduction variable.
4147 For "regular" reductions we can therefore use the same vector type
4148 (and also the same tree-code) when generating the epilog code and
4149 when generating the code inside the loop. */
4151 if (orig_stmt)
4153 /* This is a reduction pattern: get the vectype from the type of the
4154 reduction variable, and get the tree-code from orig_stmt. */
4155 orig_code = gimple_assign_rhs_code (orig_stmt);
4156 gcc_assert (vectype_out);
4157 vec_mode = TYPE_MODE (vectype_out);
4159 else
4161 /* Regular reduction: the same vectype and tree-code that are used for
4162 the vector code inside the loop can be used for the epilog code. */
4163 orig_code = code;
4166 if (nested_cycle)
4168 def_bb = gimple_bb (reduc_def_stmt);
4169 def_stmt_loop = def_bb->loop_father;
4170 def_arg = PHI_ARG_DEF_FROM_EDGE (reduc_def_stmt,
4171 loop_preheader_edge (def_stmt_loop));
4172 if (TREE_CODE (def_arg) == SSA_NAME
4173 && (def_arg_stmt = SSA_NAME_DEF_STMT (def_arg))
4174 && gimple_code (def_arg_stmt) == GIMPLE_PHI
4175 && flow_bb_inside_loop_p (outer_loop, gimple_bb (def_arg_stmt))
4176 && vinfo_for_stmt (def_arg_stmt)
4177 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_arg_stmt))
4178 == vect_double_reduction_def)
4179 double_reduc = true;
4182 epilog_reduc_code = ERROR_MARK;
4183 if (reduction_code_for_scalar_code (orig_code, &epilog_reduc_code))
4185 reduc_optab = optab_for_tree_code (epilog_reduc_code, vectype_out,
4186 optab_default);
4187 if (!reduc_optab)
4189 if (vect_print_dump_info (REPORT_DETAILS))
4190 fprintf (vect_dump, "no optab for reduction.");
4192 epilog_reduc_code = ERROR_MARK;
4195 if (reduc_optab
4196 && optab_handler (reduc_optab, vec_mode) == CODE_FOR_nothing)
4198 if (vect_print_dump_info (REPORT_DETAILS))
4199 fprintf (vect_dump, "reduc op not supported by target.");
4201 epilog_reduc_code = ERROR_MARK;
4204 else
4206 if (!nested_cycle || double_reduc)
4208 if (vect_print_dump_info (REPORT_DETAILS))
4209 fprintf (vect_dump, "no reduc code for scalar code.");
4211 return false;
4215 if (double_reduc && ncopies > 1)
4217 if (vect_print_dump_info (REPORT_DETAILS))
4218 fprintf (vect_dump, "multiple types in double reduction");
4220 return false;
4223 if (!vec_stmt) /* transformation not required. */
4225 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
4226 if (!vect_model_reduction_cost (stmt_info, epilog_reduc_code, ncopies))
4227 return false;
4228 return true;
4231 /** Transform. **/
4233 if (vect_print_dump_info (REPORT_DETAILS))
4234 fprintf (vect_dump, "transform reduction.");
4236 /* FORNOW: Multiple types are not supported for condition. */
4237 if (code == COND_EXPR)
4238 gcc_assert (ncopies == 1);
4240 /* Create the destination vector */
4241 vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
4243 /* In case the vectorization factor (VF) is bigger than the number
4244 of elements that we can fit in a vectype (nunits), we have to generate
4245 more than one vector stmt - i.e. - we need to "unroll" the
4246 vector stmt by a factor VF/nunits. For more details see documentation
4247 in vectorizable_operation. */
4249 /* If the reduction is used in an outer loop we need to generate
4250 VF intermediate results, like so (e.g. for ncopies=2):
4251 r0 = phi (init, r0)
4252 r1 = phi (init, r1)
4253 r0 = x0 + r0;
4254 r1 = x1 + r1;
4255 (i.e. we generate VF results in 2 registers).
4256 In this case we have a separate def-use cycle for each copy, and therefore
4257 for each copy we get the vector def for the reduction variable from the
4258 respective phi node created for this copy.
4260 Otherwise (the reduction is unused in the loop nest), we can combine
4261 together intermediate results, like so (e.g. for ncopies=2):
4262 r = phi (init, r)
4263 r = x0 + r;
4264 r = x1 + r;
4265 (i.e. we generate VF/2 results in a single register).
4266 In this case for each copy we get the vector def for the reduction variable
4267 from the vectorized reduction operation generated in the previous iteration. */
4270 if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope)
4272 single_defuse_cycle = true;
4273 epilog_copies = 1;
4275 else
4276 epilog_copies = ncopies;
4278 prev_stmt_info = NULL;
4279 prev_phi_info = NULL;
4280 if (slp_node)
4282 vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
4283 gcc_assert (TYPE_VECTOR_SUBPARTS (vectype_out)
4284 == TYPE_VECTOR_SUBPARTS (vectype_in));
4286 else
4288 vec_num = 1;
4289 vec_oprnds0 = VEC_alloc (tree, heap, 1);
4290 if (op_type == ternary_op)
4291 vec_oprnds1 = VEC_alloc (tree, heap, 1);
4294 phis = VEC_alloc (gimple, heap, vec_num);
4295 vect_defs = VEC_alloc (tree, heap, vec_num);
4296 if (!slp_node)
4297 VEC_quick_push (tree, vect_defs, NULL_TREE);
4299 for (j = 0; j < ncopies; j++)
4301 if (j == 0 || !single_defuse_cycle)
4303 for (i = 0; i < vec_num; i++)
4305 /* Create the reduction-phi that defines the reduction
4306 operand. */
4307 new_phi = create_phi_node (vec_dest, loop->header);
4308 set_vinfo_for_stmt (new_phi,
4309 new_stmt_vec_info (new_phi, loop_vinfo,
4310 NULL));
4311 if (j == 0 || slp_node)
4312 VEC_quick_push (gimple, phis, new_phi);
4316 if (code == COND_EXPR)
4318 gcc_assert (!slp_node);
4319 vectorizable_condition (stmt, gsi, vec_stmt,
4320 PHI_RESULT (VEC_index (gimple, phis, 0)),
4321 reduc_index);
4322 /* Multiple types are not supported for condition. */
4323 break;
4326 /* Handle uses. */
4327 if (j == 0)
4329 tree op0, op1 = NULL_TREE;
4331 op0 = ops[!reduc_index];
4332 if (op_type == ternary_op)
4334 if (reduc_index == 0)
4335 op1 = ops[2];
4336 else
4337 op1 = ops[1];
4340 if (slp_node)
4341 vect_get_slp_defs (op0, op1, slp_node, &vec_oprnds0, &vec_oprnds1,
4342 -1);
4343 else
4345 loop_vec_def0 = vect_get_vec_def_for_operand (ops[!reduc_index],
4346 stmt, NULL);
4347 VEC_quick_push (tree, vec_oprnds0, loop_vec_def0);
4348 if (op_type == ternary_op)
4350 loop_vec_def1 = vect_get_vec_def_for_operand (op1, stmt,
4351 NULL);
4352 VEC_quick_push (tree, vec_oprnds1, loop_vec_def1);
4356 else
4358 if (!slp_node)
4360 enum vect_def_type dt = vect_unknown_def_type; /* Dummy */
4361 loop_vec_def0 = vect_get_vec_def_for_stmt_copy (dt, loop_vec_def0);
4362 VEC_replace (tree, vec_oprnds0, 0, loop_vec_def0);
4363 if (op_type == ternary_op)
4365 loop_vec_def1 = vect_get_vec_def_for_stmt_copy (dt,
4366 loop_vec_def1);
4367 VEC_replace (tree, vec_oprnds1, 0, loop_vec_def1);
4371 if (single_defuse_cycle)
4372 reduc_def = gimple_assign_lhs (new_stmt);
4374 STMT_VINFO_RELATED_STMT (prev_phi_info) = new_phi;
4377 FOR_EACH_VEC_ELT (tree, vec_oprnds0, i, def0)
4379 if (slp_node)
4380 reduc_def = PHI_RESULT (VEC_index (gimple, phis, i));
4381 else
4383 if (!single_defuse_cycle || j == 0)
4384 reduc_def = PHI_RESULT (new_phi);
4387 def1 = ((op_type == ternary_op)
4388 ? VEC_index (tree, vec_oprnds1, i) : NULL);
4389 if (op_type == binary_op)
4391 if (reduc_index == 0)
4392 expr = build2 (code, vectype_out, reduc_def, def0);
4393 else
4394 expr = build2 (code, vectype_out, def0, reduc_def);
4396 else
4398 if (reduc_index == 0)
4399 expr = build3 (code, vectype_out, reduc_def, def0, def1);
4400 else
4402 if (reduc_index == 1)
4403 expr = build3 (code, vectype_out, def0, reduc_def, def1);
4404 else
4405 expr = build3 (code, vectype_out, def0, def1, reduc_def);
4409 new_stmt = gimple_build_assign (vec_dest, expr);
4410 new_temp = make_ssa_name (vec_dest, new_stmt);
4411 gimple_assign_set_lhs (new_stmt, new_temp);
4412 vect_finish_stmt_generation (stmt, new_stmt, gsi);
4413 if (slp_node)
4415 VEC_quick_push (gimple, SLP_TREE_VEC_STMTS (slp_node), new_stmt);
4416 VEC_quick_push (tree, vect_defs, new_temp);
4418 else
4419 VEC_replace (tree, vect_defs, 0, new_temp);
4422 if (slp_node)
4423 continue;
4425 if (j == 0)
4426 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
4427 else
4428 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
4430 prev_stmt_info = vinfo_for_stmt (new_stmt);
4431 prev_phi_info = vinfo_for_stmt (new_phi);
4434 /* Finalize the reduction-phi (set its arguments) and create the
4435 epilog reduction code. */
4436 if ((!single_defuse_cycle || code == COND_EXPR) && !slp_node)
4438 new_temp = gimple_assign_lhs (*vec_stmt);
4439 VEC_replace (tree, vect_defs, 0, new_temp);
4442 vect_create_epilog_for_reduction (vect_defs, stmt, epilog_copies,
4443 epilog_reduc_code, phis, reduc_index,
4444 double_reduc, slp_node);
4446 VEC_free (gimple, heap, phis);
4447 VEC_free (tree, heap, vec_oprnds0);
4448 if (vec_oprnds1)
4449 VEC_free (tree, heap, vec_oprnds1);
4451 return true;
4454 /* Function vect_min_worthwhile_factor.
4456 For a loop where we could vectorize the operation indicated by CODE,
4457 return the minimum vectorization factor that makes it worthwhile
4458 to use generic vectors. */
4459 int
4460 vect_min_worthwhile_factor (enum tree_code code)
4462 switch (code)
4464 case PLUS_EXPR:
4465 case MINUS_EXPR:
4466 case NEGATE_EXPR:
4467 return 4;
4469 case BIT_AND_EXPR:
4470 case BIT_IOR_EXPR:
4471 case BIT_XOR_EXPR:
4472 case BIT_NOT_EXPR:
4473 return 2;
4475 default:
4476 return INT_MAX;
4481 /* Function vectorizable_induction
4483 Check if PHI performs an induction computation that can be vectorized.
4484 If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
4485 phi to replace it, put it in VEC_STMT, and add it to the same basic block.
4486 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
4488 bool
4489 vectorizable_induction (gimple phi, gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
4490 gimple *vec_stmt)
4492 stmt_vec_info stmt_info = vinfo_for_stmt (phi);
4493 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4494 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4495 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4496 int nunits = TYPE_VECTOR_SUBPARTS (vectype);
4497 int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
4498 tree vec_def;
4500 gcc_assert (ncopies >= 1);
4501 /* FORNOW. This restriction should be relaxed. */
4502 if (nested_in_vect_loop_p (loop, phi) && ncopies > 1)
4504 if (vect_print_dump_info (REPORT_DETAILS))
4505 fprintf (vect_dump, "multiple types in nested loop.");
4506 return false;
4509 if (!STMT_VINFO_RELEVANT_P (stmt_info))
4510 return false;
4512 /* FORNOW: SLP not supported. */
4513 if (STMT_SLP_TYPE (stmt_info))
4514 return false;
4516 gcc_assert (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def);
4518 if (gimple_code (phi) != GIMPLE_PHI)
4519 return false;
4521 if (!vec_stmt) /* transformation not required. */
4523 STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
4524 if (vect_print_dump_info (REPORT_DETAILS))
4525 fprintf (vect_dump, "=== vectorizable_induction ===");
4526 vect_model_induction_cost (stmt_info, ncopies);
4527 return true;
4530 /** Transform. **/
4532 if (vect_print_dump_info (REPORT_DETAILS))
4533 fprintf (vect_dump, "transform induction phi.");
4535 vec_def = get_initial_def_for_induction (phi);
4536 *vec_stmt = SSA_NAME_DEF_STMT (vec_def);
4537 return true;
4540 /* Function vectorizable_live_operation.
4542 STMT computes a value that is used outside the loop. Check if
4543 it can be supported. */
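/* E.g., a stmt "x_1 = c_2 * 3" inside the loop, where c_2 is
   loop-invariant and x_1 is also used after the loop, is such a live
   operation: it can remain scalar, and the last value it computes is
   used directly. */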
4545 bool
4546 vectorizable_live_operation (gimple stmt,
4547 gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
4548 gimple *vec_stmt ATTRIBUTE_UNUSED)
4550 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4551 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4552 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4553 int i;
4554 int op_type;
4555 tree op;
4556 tree def;
4557 gimple def_stmt;
4558 enum vect_def_type dt;
4559 enum tree_code code;
4560 enum gimple_rhs_class rhs_class;
4562 gcc_assert (STMT_VINFO_LIVE_P (stmt_info));
4564 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def)
4565 return false;
4567 if (!is_gimple_assign (stmt))
4568 return false;
4570 if (TREE_CODE (gimple_assign_lhs (stmt)) != SSA_NAME)
4571 return false;
4573 /* FORNOW. CHECKME. */
4574 if (nested_in_vect_loop_p (loop, stmt))
4575 return false;
4577 code = gimple_assign_rhs_code (stmt);
4578 op_type = TREE_CODE_LENGTH (code);
4579 rhs_class = get_gimple_rhs_class (code);
4580 gcc_assert (rhs_class != GIMPLE_UNARY_RHS || op_type == unary_op);
4581 gcc_assert (rhs_class != GIMPLE_BINARY_RHS || op_type == binary_op);
4583 /* FORNOW: support only if all uses are invariant. This means
4584 that the scalar operations can remain in place, unvectorized.
4585 The original last scalar value that they compute will be used. */
4587 for (i = 0; i < op_type; i++)
4589 if (rhs_class == GIMPLE_SINGLE_RHS)
4590 op = TREE_OPERAND (gimple_op (stmt, 1), i);
4591 else
4592 op = gimple_op (stmt, i + 1);
4593 if (op
4594 && !vect_is_simple_use (op, loop_vinfo, NULL, &def_stmt, &def, &dt))
4596 if (vect_print_dump_info (REPORT_DETAILS))
4597 fprintf (vect_dump, "use not simple.");
4598 return false;
4601 if (dt != vect_external_def && dt != vect_constant_def)
4602 return false;
4605 /* No transformation is required for the cases we currently support. */
4606 return true;
4609 /* Kill any debug uses outside LOOP of SSA names defined in STMT. */
4611 static void
4612 vect_loop_kill_debug_uses (struct loop *loop, gimple stmt)
4614 ssa_op_iter op_iter;
4615 imm_use_iterator imm_iter;
4616 def_operand_p def_p;
4617 gimple ustmt;
4619 FOR_EACH_PHI_OR_STMT_DEF (def_p, stmt, op_iter, SSA_OP_DEF)
4621 FOR_EACH_IMM_USE_STMT (ustmt, imm_iter, DEF_FROM_PTR (def_p))
4623 basic_block bb;
4625 if (!is_gimple_debug (ustmt))
4626 continue;
4628 bb = gimple_bb (ustmt);
4630 if (!flow_bb_inside_loop_p (loop, bb))
4632 if (gimple_debug_bind_p (ustmt))
4634 if (vect_print_dump_info (REPORT_DETAILS))
4635 fprintf (vect_dump, "killing debug use");
4637 gimple_debug_bind_reset_value (ustmt);
4638 update_stmt (ustmt);
4640 else
4641 gcc_unreachable ();
4647 /* Function vect_transform_loop.
4649 The analysis phase has determined that the loop is vectorizable.
4650 Vectorize the loop - create vectorized stmts to replace the scalar
4651 stmts in the loop, and update the loop exit condition. */
4653 void
4654 vect_transform_loop (loop_vec_info loop_vinfo)
4656 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4657 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
4658 int nbbs = loop->num_nodes;
4659 gimple_stmt_iterator si;
4660 int i;
4661 tree ratio = NULL;
4662 int vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
4663 bool strided_store;
4664 bool slp_scheduled = false;
4665 unsigned int nunits;
4666 tree cond_expr = NULL_TREE;
4667 gimple_seq cond_expr_stmt_list = NULL;
4668 bool do_peeling_for_loop_bound;
4670 if (vect_print_dump_info (REPORT_DETAILS))
4671 fprintf (vect_dump, "=== vec_transform_loop ===");
4673 /* Peel the loop if there are data refs with unknown alignment.
4674 At most one store data ref with unknown alignment is allowed. */
4676 if (LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo))
4677 vect_do_peeling_for_alignment (loop_vinfo);
4679 do_peeling_for_loop_bound
4680 = (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
4681 || (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
4682 && LOOP_VINFO_INT_NITERS (loop_vinfo) % vectorization_factor != 0));
4684 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
4685 || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
4686 vect_loop_versioning (loop_vinfo,
4687 !do_peeling_for_loop_bound,
4688 &cond_expr, &cond_expr_stmt_list);
4690 /* If the loop has a symbolic number of iterations 'n' (i.e. it's not a
4691 compile-time constant), or it is a constant that is not divisible by the
4692 vectorization factor, then an epilog loop needs to be created.
4693 We therefore duplicate the loop: the original loop will be vectorized,
4694 and will compute the first (n/VF) iterations. The second copy of the loop
4695 will remain scalar and will compute the remaining (n%VF) iterations.
4696 (VF is the vectorization factor). */
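/* E.g., for n == 103 and VF == 4 the vectorized loop executes
   floor (103/4) == 25 iterations, covering the first 100 scalar
   iterations, and the remaining 3 are computed by the scalar epilog
   loop. */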
4698 if (do_peeling_for_loop_bound)
4699 vect_do_peeling_for_loop_bound (loop_vinfo, &ratio,
4700 cond_expr, cond_expr_stmt_list);
4701 else
4702 ratio = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
4703 LOOP_VINFO_INT_NITERS (loop_vinfo) / vectorization_factor);
4705 /* 1) Make sure the loop header has exactly two entries
4706 2) Make sure we have a preheader basic block. */
4708 gcc_assert (EDGE_COUNT (loop->header->preds) == 2);
4710 split_edge (loop_preheader_edge (loop));
4712 /* FORNOW: the vectorizer supports only loops whose body consists
4713 of one basic block (header + empty latch). When the vectorizer
4714 supports more involved loop forms, the order in which the BBs are
4715 traversed needs to be reconsidered. */
4717 for (i = 0; i < nbbs; i++)
4719 basic_block bb = bbs[i];
4720 stmt_vec_info stmt_info;
4721 gimple phi;
4723 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
4725 phi = gsi_stmt (si);
4726 if (vect_print_dump_info (REPORT_DETAILS))
4728 fprintf (vect_dump, "------>vectorizing phi: ");
4729 print_gimple_stmt (vect_dump, phi, 0, TDF_SLIM);
4731 stmt_info = vinfo_for_stmt (phi);
4732 if (!stmt_info)
4733 continue;
4735 if (MAY_HAVE_DEBUG_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
4736 vect_loop_kill_debug_uses (loop, phi);
4738 if (!STMT_VINFO_RELEVANT_P (stmt_info)
4739 && !STMT_VINFO_LIVE_P (stmt_info))
4740 continue;
4742 if ((TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info))
4743 != (unsigned HOST_WIDE_INT) vectorization_factor)
4744 && vect_print_dump_info (REPORT_DETAILS))
4745 fprintf (vect_dump, "multiple-types.");
4747 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def)
4749 if (vect_print_dump_info (REPORT_DETAILS))
4750 fprintf (vect_dump, "transform phi.");
4751 vect_transform_stmt (phi, NULL, NULL, NULL, NULL);
4755 for (si = gsi_start_bb (bb); !gsi_end_p (si);)
4757 gimple stmt = gsi_stmt (si);
4758 bool is_store;
4760 if (vect_print_dump_info (REPORT_DETAILS))
4762 fprintf (vect_dump, "------>vectorizing statement: ");
4763 print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
4766 stmt_info = vinfo_for_stmt (stmt);
4768 /* vector stmts created in the outer-loop during vectorization of
4769 stmts in an inner-loop may not have a stmt_info, and do not
4770 need to be vectorized. */
4771 if (!stmt_info)
4773 gsi_next (&si);
4774 continue;
4777 if (MAY_HAVE_DEBUG_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
4778 vect_loop_kill_debug_uses (loop, stmt);
4780 if (!STMT_VINFO_RELEVANT_P (stmt_info)
4781 && !STMT_VINFO_LIVE_P (stmt_info))
4783 gsi_next (&si);
4784 continue;
4787 gcc_assert (STMT_VINFO_VECTYPE (stmt_info));
4788 nunits =
4789 (unsigned int) TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
4790 if (!STMT_SLP_TYPE (stmt_info)
4791 && nunits != (unsigned int) vectorization_factor
4792 && vect_print_dump_info (REPORT_DETAILS))
4793 /* For SLP, VF is set according to the unrolling factor, not to the
4794 vector size, hence this print is not valid for SLP. */
4795 fprintf (vect_dump, "multiple-types.");
4797 /* SLP. Schedule all the SLP instances when the first SLP stmt is
4798 reached. */
4799 if (STMT_SLP_TYPE (stmt_info))
4801 if (!slp_scheduled)
4803 slp_scheduled = true;
4805 if (vect_print_dump_info (REPORT_DETAILS))
4806 fprintf (vect_dump, "=== scheduling SLP instances ===");
4808 vect_schedule_slp (loop_vinfo, NULL);
4811 /* Hybrid SLP stmts must be vectorized in addition to SLP. */
4812 if (!vinfo_for_stmt (stmt) || PURE_SLP_STMT (stmt_info))
4814 gsi_next (&si);
4815 continue;
4819 /* -------- vectorize statement ------------ */
4820 if (vect_print_dump_info (REPORT_DETAILS))
4821 fprintf (vect_dump, "transform statement.");
4823 strided_store = false;
4824 is_store = vect_transform_stmt (stmt, &si, &strided_store, NULL, NULL);
4825 if (is_store)
4827 if (STMT_VINFO_STRIDED_ACCESS (stmt_info))
4829 /* Interleaving. If IS_STORE is TRUE, the vectorization of the
4830 interleaving chain was completed - free all the stores in
4831 the chain. */
4832 vect_remove_stores (DR_GROUP_FIRST_DR (stmt_info));
4833 gsi_remove (&si, true);
4834 continue;
4836 else
4838 /* Free the attached stmt_vec_info and remove the stmt. */
4839 free_stmt_vec_info (stmt);
4840 gsi_remove (&si, true);
4841 continue;
4844 gsi_next (&si);
4845 } /* stmts in BB */
4846 } /* BBs in loop */
4848 slpeel_make_loop_iterate_ntimes (loop, ratio);
4850 /* The memory tags and pointers in vectorized statements need to
4851 have their SSA forms updated. FIXME, why can't this be delayed
4852 until all the loops have been transformed? */
4853 update_ssa (TODO_update_ssa);
4855 if (vect_print_dump_info (REPORT_VECTORIZED_LOCATIONS))
4856 fprintf (vect_dump, "LOOP VECTORIZED.");
4857 if (loop->inner && vect_print_dump_info (REPORT_VECTORIZED_LOCATIONS))
4858 fprintf (vect_dump, "OUTER LOOP VECTORIZED.");