gcc/tree-vect-loop.c
/* Loop Vectorization
   Copyright (C) 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010
   Free Software Foundation, Inc.
   Contributed by Dorit Naishlos <dorit@il.ibm.com> and
   Ira Rosen <irar@il.ibm.com>

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free
Software Foundation; either version 3, or (at your option) any later
version.

GCC is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received a copy of the GNU General Public License
along with GCC; see the file COPYING3.  If not see
<http://www.gnu.org/licenses/>.  */
#include "config.h"
#include "system.h"
#include "coretypes.h"
#include "tm.h"
#include "ggc.h"
#include "tree.h"
#include "basic-block.h"
#include "tree-pretty-print.h"
#include "gimple-pretty-print.h"
#include "tree-flow.h"
#include "tree-dump.h"
#include "cfgloop.h"
#include "cfglayout.h"
#include "expr.h"
#include "recog.h"
#include "optabs.h"
#include "params.h"
#include "toplev.h"
#include "tree-chrec.h"
#include "tree-scalar-evolution.h"
#include "tree-vectorizer.h"
#include "target.h"
/* Loop Vectorization Pass.

   This pass tries to vectorize loops.

   For example, the vectorizer transforms the following simple loop:

        short a[N]; short b[N]; short c[N]; int i;

        for (i=0; i<N; i++){
          a[i] = b[i] + c[i];
        }

   as if it was manually vectorized by rewriting the source code into:

        typedef int __attribute__((mode(V8HI))) v8hi;
        short a[N]; short b[N]; short c[N]; int i;
        v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
        v8hi va, vb, vc;

        for (i=0; i<N/8; i++){
          vb = pb[i];
          vc = pc[i];
          va = vb + vc;
          pa[i] = va;
        }

   The main entry to this pass is vectorize_loops(), in which
   the vectorizer applies a set of analyses on a given set of loops,
   followed by the actual vectorization transformation for the loops that
   had successfully passed the analysis phase.

   Throughout this pass we make a distinction between two types of
   data: scalars (which are represented by SSA_NAMES), and memory references
   ("data-refs").  These two types of data require different handling both
   during analysis and transformation.  The types of data-refs that the
   vectorizer currently supports are ARRAY_REFS whose base is an array DECL
   (not a pointer), and INDIRECT_REFS through pointers; both array and pointer
   accesses are required to have a simple (consecutive) access pattern.

   Analysis phase:
   ===============
   The driver for the analysis phase is vect_analyze_loop().
   It applies a set of analyses, some of which rely on the scalar evolution
   analyzer (scev) developed by Sebastian Pop.

   During the analysis phase the vectorizer records some information
   per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
   loop, as well as general information about the loop as a whole, which is
   recorded in a "loop_vec_info" struct attached to each loop.

   Transformation phase:
   =====================
   The loop transformation phase scans all the stmts in the loop, and
   creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
   the loop that needs to be vectorized.  It inserts the vector code sequence
   just before the scalar stmt S, and records a pointer to the vector code
   in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
   attached to S).  This pointer will be used for the vectorization of
   following stmts which use the def of stmt S.  Stmt S is removed if it
   writes to memory; otherwise, we rely on dead code elimination for
   removing it.

   For example, say stmt S1 was vectorized into stmt VS1:

   VS1: vb = px[i];
   S1:  b = x[i];       STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
   S2:  a = b;

   To vectorize stmt S2, the vectorizer first finds the stmt that defines
   the operand 'b' (S1), and gets the relevant vector def 'vb' from the
   vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)).  The
   resulting sequence would be:

   VS1: vb = px[i];
   S1:  b = x[i];       STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
   VS2: va = vb;
   S2:  a = b;          STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2

   Operands that are not SSA_NAMEs are data-refs that appear in
   load/store operations (like 'x[i]' in S1), and are handled differently.

   Target modeling:
   =================
   Currently the only target specific information that is used is the
   size of the vector (in bytes) - "UNITS_PER_SIMD_WORD".  Targets that can
   support different sizes of vectors, for now will need to specify one value
   for "UNITS_PER_SIMD_WORD".  More flexibility will be added in the future.

   Since we only vectorize operations whose vector form can be
   expressed using existing tree codes, to verify that an operation is
   supported, the vectorizer checks the relevant optab at the relevant
   machine_mode (e.g., optab_handler (add_optab, V8HImode)).  If
   the value found is CODE_FOR_nothing, then there's no target support, and
   we can't vectorize the stmt.
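
   For illustration, a minimal sketch of that support check, using the same
   optab and mode as the example above (any other optab/mode pair would be
   queried the same way):

        enum insn_code icode = optab_handler (add_optab, V8HImode);
        if (icode == CODE_FOR_nothing)
          return false;    (no target support - the stmt is not vectorized)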

   For additional information on this project see:
   http://gcc.gnu.org/projects/tree-ssa/vectorization.html
*/
/* Function vect_determine_vectorization_factor

   Determine the vectorization factor (VF).  VF is the number of data elements
   that are operated upon in parallel in a single iteration of the vectorized
   loop.  For example, when vectorizing a loop that operates on 4-byte
   elements, on a target with vector size (VS) of 16 bytes, the VF is set to
   4, since 4 elements can fit in a single vector register.

   We currently support vectorization of loops in which all types operated
   upon are of the same size.  Therefore this function currently sets VF
   according to the size of the types operated upon, and fails if there are
   multiple sizes in the loop.

   VF is also the factor by which the loop iterations are strip-mined, e.g.:
   original loop:
        for (i=0; i<N; i++){
          a[i] = b[i] + c[i];
        }

   vectorized loop:
        for (i=0; i<N; i+=VF){
          a[i:VF] = b[i:VF] + c[i:VF];
        }
*/
static bool
vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
{
  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
  int nbbs = loop->num_nodes;
  gimple_stmt_iterator si;
  unsigned int vectorization_factor = 0;
  tree scalar_type;
  gimple phi;
  tree vectype;
  unsigned int nunits;
  stmt_vec_info stmt_info;
  int i;
  HOST_WIDE_INT dummy;

  if (vect_print_dump_info (REPORT_DETAILS))
    fprintf (vect_dump, "=== vect_determine_vectorization_factor ===");

  for (i = 0; i < nbbs; i++)
    {
      basic_block bb = bbs[i];

      for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
        {
          phi = gsi_stmt (si);
          stmt_info = vinfo_for_stmt (phi);
          if (vect_print_dump_info (REPORT_DETAILS))
            {
              fprintf (vect_dump, "==> examining phi: ");
              print_gimple_stmt (vect_dump, phi, 0, TDF_SLIM);
            }

          gcc_assert (stmt_info);

          if (STMT_VINFO_RELEVANT_P (stmt_info))
            {
              gcc_assert (!STMT_VINFO_VECTYPE (stmt_info));
              scalar_type = TREE_TYPE (PHI_RESULT (phi));

              if (vect_print_dump_info (REPORT_DETAILS))
                {
                  fprintf (vect_dump, "get vectype for scalar type: ");
                  print_generic_expr (vect_dump, scalar_type, TDF_SLIM);
                }

              vectype = get_vectype_for_scalar_type (scalar_type);
              if (!vectype)
                {
                  if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
                    {
                      fprintf (vect_dump,
                               "not vectorized: unsupported data-type ");
                      print_generic_expr (vect_dump, scalar_type, TDF_SLIM);
                    }
                  return false;
                }
              STMT_VINFO_VECTYPE (stmt_info) = vectype;

              if (vect_print_dump_info (REPORT_DETAILS))
                {
                  fprintf (vect_dump, "vectype: ");
                  print_generic_expr (vect_dump, vectype, TDF_SLIM);
                }

              nunits = TYPE_VECTOR_SUBPARTS (vectype);
              if (vect_print_dump_info (REPORT_DETAILS))
                fprintf (vect_dump, "nunits = %d", nunits);

              if (!vectorization_factor
                  || (nunits > vectorization_factor))
                vectorization_factor = nunits;
            }
        }

      for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
        {
          tree vf_vectype;
          gimple stmt = gsi_stmt (si);
          stmt_info = vinfo_for_stmt (stmt);

          if (vect_print_dump_info (REPORT_DETAILS))
            {
              fprintf (vect_dump, "==> examining statement: ");
              print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
            }

          gcc_assert (stmt_info);

          /* Skip stmts which do not need to be vectorized.  */
          if (!STMT_VINFO_RELEVANT_P (stmt_info)
              && !STMT_VINFO_LIVE_P (stmt_info))
            {
              if (vect_print_dump_info (REPORT_DETAILS))
                fprintf (vect_dump, "skip.");
              continue;
            }

          if (gimple_get_lhs (stmt) == NULL_TREE)
            {
              if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
                {
                  fprintf (vect_dump, "not vectorized: irregular stmt.");
                  print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
                }
              return false;
            }

          if (VECTOR_MODE_P (TYPE_MODE (gimple_expr_type (stmt))))
            {
              if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
                {
                  fprintf (vect_dump, "not vectorized: vector stmt in loop:");
                  print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
                }
              return false;
            }

          if (STMT_VINFO_VECTYPE (stmt_info))
            {
              /* The only case when a vectype had been already set is for
                 stmts that contain a dataref, or for "pattern-stmts" (stmts
                 generated by the vectorizer to represent/replace a certain
                 idiom).  */
              gcc_assert (STMT_VINFO_DATA_REF (stmt_info)
                          || is_pattern_stmt_p (stmt_info));
              vectype = STMT_VINFO_VECTYPE (stmt_info);
            }
          else
            {
              gcc_assert (!STMT_VINFO_DATA_REF (stmt_info)
                          && !is_pattern_stmt_p (stmt_info));

              scalar_type = TREE_TYPE (gimple_get_lhs (stmt));
              if (vect_print_dump_info (REPORT_DETAILS))
                {
                  fprintf (vect_dump, "get vectype for scalar type: ");
                  print_generic_expr (vect_dump, scalar_type, TDF_SLIM);
                }
              vectype = get_vectype_for_scalar_type (scalar_type);
              if (!vectype)
                {
                  if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
                    {
                      fprintf (vect_dump,
                               "not vectorized: unsupported data-type ");
                      print_generic_expr (vect_dump, scalar_type, TDF_SLIM);
                    }
                  return false;
                }

              STMT_VINFO_VECTYPE (stmt_info) = vectype;
            }

          /* The vectorization factor is according to the smallest
             scalar type (or the largest vector size, but we only
             support one vector size per loop).  */
          scalar_type = vect_get_smallest_scalar_type (stmt, &dummy,
                                                       &dummy);
          if (vect_print_dump_info (REPORT_DETAILS))
            {
              fprintf (vect_dump, "get vectype for scalar type: ");
              print_generic_expr (vect_dump, scalar_type, TDF_SLIM);
            }
          vf_vectype = get_vectype_for_scalar_type (scalar_type);
          if (!vf_vectype)
            {
              if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
                {
                  fprintf (vect_dump,
                           "not vectorized: unsupported data-type ");
                  print_generic_expr (vect_dump, scalar_type, TDF_SLIM);
                }
              return false;
            }

          if ((GET_MODE_SIZE (TYPE_MODE (vectype))
               != GET_MODE_SIZE (TYPE_MODE (vf_vectype))))
            {
              if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
                {
                  fprintf (vect_dump,
                           "not vectorized: different sized vector "
                           "types in statement, ");
                  print_generic_expr (vect_dump, vectype, TDF_SLIM);
                  fprintf (vect_dump, " and ");
                  print_generic_expr (vect_dump, vf_vectype, TDF_SLIM);
                }
              return false;
            }

          if (vect_print_dump_info (REPORT_DETAILS))
            {
              fprintf (vect_dump, "vectype: ");
              print_generic_expr (vect_dump, vf_vectype, TDF_SLIM);
            }

          nunits = TYPE_VECTOR_SUBPARTS (vf_vectype);
          if (vect_print_dump_info (REPORT_DETAILS))
            fprintf (vect_dump, "nunits = %d", nunits);

          if (!vectorization_factor
              || (nunits > vectorization_factor))
            vectorization_factor = nunits;
        }
    }

  /* TODO: Analyze cost.  Decide if worth while to vectorize.  */
  if (vect_print_dump_info (REPORT_DETAILS))
    fprintf (vect_dump, "vectorization factor = %d", vectorization_factor);
  if (vectorization_factor <= 1)
    {
      if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
        fprintf (vect_dump, "not vectorized: unsupported data-type");
      return false;
    }
  LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;

  return true;
}
/* Function vect_is_simple_iv_evolution.

   FORNOW: A simple evolution of an induction variable in the loop is
   considered a polynomial evolution with constant step.  */
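/* For illustration (an assumed example, not taken from this file): for the
   classic IV "i" of "for (i = 0; i < N; i += 4)", scev reports the access
   function {0, +, 4}_loop; the function below then sets *INIT to 0 and *STEP
   to 4.  A second-degree chrec such as {0, +, {1, +, 1}_1}_1, whose step is
   itself a chrec, is rejected as not "simple".  */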
static bool
vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init,
                             tree * step)
{
  tree init_expr;
  tree step_expr;
  tree evolution_part = evolution_part_in_loop_num (access_fn, loop_nb);

  /* When there is no evolution in this loop, the evolution function
     is not "simple".  */
  if (evolution_part == NULL_TREE)
    return false;

  /* When the evolution is a polynomial of degree >= 2
     the evolution function is not "simple".  */
  if (tree_is_chrec (evolution_part))
    return false;

  step_expr = evolution_part;
  init_expr = unshare_expr (initial_condition_in_loop_num (access_fn,
                                                           loop_nb));

  if (vect_print_dump_info (REPORT_DETAILS))
    {
      fprintf (vect_dump, "step: ");
      print_generic_expr (vect_dump, step_expr, TDF_SLIM);
      fprintf (vect_dump, ",  init: ");
      print_generic_expr (vect_dump, init_expr, TDF_SLIM);
    }

  *init = init_expr;
  *step = step_expr;

  if (TREE_CODE (step_expr) != INTEGER_CST)
    {
      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "step unknown.");
      return false;
    }

  return true;
}
/* Function vect_analyze_scalar_cycles_1.

   Examine the cross iteration def-use cycles of scalar variables
   in LOOP.  LOOP_VINFO represents the loop that is now being
   considered for vectorization (can be LOOP, or an outer-loop
   enclosing LOOP).  */
static void
vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, struct loop *loop)
{
  basic_block bb = loop->header;
  tree dumy;
  VEC(gimple,heap) *worklist = VEC_alloc (gimple, heap, 64);
  gimple_stmt_iterator gsi;
  bool double_reduc;

  if (vect_print_dump_info (REPORT_DETAILS))
    fprintf (vect_dump, "=== vect_analyze_scalar_cycles ===");

  /* First - identify all inductions.  Reduction detection assumes that all
     the inductions have been identified, therefore, this order must not be
     changed.  */
  for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
    {
      gimple phi = gsi_stmt (gsi);
      tree access_fn = NULL;
      tree def = PHI_RESULT (phi);
      stmt_vec_info stmt_vinfo = vinfo_for_stmt (phi);

      if (vect_print_dump_info (REPORT_DETAILS))
        {
          fprintf (vect_dump, "Analyze phi: ");
          print_gimple_stmt (vect_dump, phi, 0, TDF_SLIM);
        }

      /* Skip virtual phi's.  The data dependences that are associated with
         virtual defs/uses (i.e., memory accesses) are analyzed elsewhere.  */
      if (!is_gimple_reg (SSA_NAME_VAR (def)))
        continue;

      STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_unknown_def_type;

      /* Analyze the evolution function.  */
      access_fn = analyze_scalar_evolution (loop, def);
      if (access_fn && vect_print_dump_info (REPORT_DETAILS))
        {
          fprintf (vect_dump, "Access function of PHI: ");
          print_generic_expr (vect_dump, access_fn, TDF_SLIM);
        }

      if (!access_fn
          || !vect_is_simple_iv_evolution (loop->num, access_fn,
                                           &dumy, &dumy))
        {
          VEC_safe_push (gimple, heap, worklist, phi);
          continue;
        }

      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "Detected induction.");
      STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_induction_def;
    }


  /* Second - identify all reductions and nested cycles.  */
  while (VEC_length (gimple, worklist) > 0)
    {
      gimple phi = VEC_pop (gimple, worklist);
      tree def = PHI_RESULT (phi);
      stmt_vec_info stmt_vinfo = vinfo_for_stmt (phi);
      gimple reduc_stmt;
      bool nested_cycle;

      if (vect_print_dump_info (REPORT_DETAILS))
        {
          fprintf (vect_dump, "Analyze phi: ");
          print_gimple_stmt (vect_dump, phi, 0, TDF_SLIM);
        }

      gcc_assert (is_gimple_reg (SSA_NAME_VAR (def)));
      gcc_assert (STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_unknown_def_type);

      nested_cycle = (loop != LOOP_VINFO_LOOP (loop_vinfo));
      reduc_stmt = vect_force_simple_reduction (loop_vinfo, phi,
                                                !nested_cycle,
                                                &double_reduc);
      if (reduc_stmt)
        {
          if (double_reduc)
            {
              if (vect_print_dump_info (REPORT_DETAILS))
                fprintf (vect_dump, "Detected double reduction.");

              STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_double_reduction_def;
              STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
                                                   vect_double_reduction_def;
            }
          else
            {
              if (nested_cycle)
                {
                  if (vect_print_dump_info (REPORT_DETAILS))
                    fprintf (vect_dump,
                             "Detected vectorizable nested cycle.");

                  STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_nested_cycle;
                  STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
                                                            vect_nested_cycle;
                }
              else
                {
                  if (vect_print_dump_info (REPORT_DETAILS))
                    fprintf (vect_dump, "Detected reduction.");

                  STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def;
                  STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
                                                          vect_reduction_def;
                  /* Store the reduction cycles for possible vectorization
                     in loop-aware SLP.  */
                  VEC_safe_push (gimple, heap,
                                 LOOP_VINFO_REDUCTIONS (loop_vinfo),
                                 reduc_stmt);
                }
            }
        }
      else
        if (vect_print_dump_info (REPORT_DETAILS))
          fprintf (vect_dump, "Unknown def-use cycle pattern.");
    }

  VEC_free (gimple, heap, worklist);
}
/* Function vect_analyze_scalar_cycles.

   Examine the cross iteration def-use cycles of scalar variables, by
   analyzing the loop-header PHIs of scalar variables; Classify each
   cycle as one of the following: invariant, induction, reduction, unknown.
   We do that for the loop represented by LOOP_VINFO, and also for its
   inner-loop, if it exists.

   Examples for scalar cycles:

   Example1: reduction:

              loop1:
              for (i=0; i<N; i++)
                 sum += a[i];

   Example2: induction:

              loop2:
              for (i=0; i<N; i++)
                 a[i] = i;  */
static void
vect_analyze_scalar_cycles (loop_vec_info loop_vinfo)
{
  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);

  vect_analyze_scalar_cycles_1 (loop_vinfo, loop);

  /* When vectorizing an outer-loop, the inner-loop is executed sequentially.
     Reductions in such inner-loop therefore have different properties than
     the reductions in the nest that gets vectorized:
     1. When vectorized, they are executed in the same order as in the
        original scalar loop, so we can't change the order of computation
        when vectorizing them.
     2. FIXME: Inner-loop reductions can be used in the inner-loop, so the
        current checks are too strict.  */

  if (loop->inner)
    vect_analyze_scalar_cycles_1 (loop_vinfo, loop->inner);
}
/* Function vect_get_loop_niters.

   Determine how many iterations the loop is executed.
   If an expression that represents the number of iterations
   can be constructed, place it in NUMBER_OF_ITERATIONS.
   Return the loop exit condition.  */
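/* For illustration (an assumed example, not from this file): for a loop like
   "for (i = 0; i < n; i++)" the computed NUMBER_OF_ITERATIONS is (roughly)
   the symbolic expression 'n', and the returned exit condition is the
   GIMPLE_COND that tests "i < n"; for a pointer-chasing exit test such as
   "while (p->next)" no such expression exists and NULL is propagated.  */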
static gimple
vect_get_loop_niters (struct loop *loop, tree *number_of_iterations)
{
  tree niters;

  if (vect_print_dump_info (REPORT_DETAILS))
    fprintf (vect_dump, "=== get_loop_niters ===");

  niters = number_of_exit_cond_executions (loop);

  if (niters != NULL_TREE
      && niters != chrec_dont_know)
    {
      *number_of_iterations = niters;

      if (vect_print_dump_info (REPORT_DETAILS))
        {
          fprintf (vect_dump, "==> get_loop_niters:");
          print_generic_expr (vect_dump, *number_of_iterations, TDF_SLIM);
        }
    }

  return get_loop_exit_condition (loop);
}
/* Function bb_in_loop_p

   Used as predicate for dfs order traversal of the loop bbs.  */

static bool
bb_in_loop_p (const_basic_block bb, const void *data)
{
  const struct loop *const loop = (const struct loop *)data;
  if (flow_bb_inside_loop_p (loop, bb))
    return true;
  return false;
}
/* Function new_loop_vec_info.

   Create and initialize a new loop_vec_info struct for LOOP, as well as
   stmt_vec_info structs for all the stmts in LOOP.  */
static loop_vec_info
new_loop_vec_info (struct loop *loop)
{
  loop_vec_info res;
  basic_block *bbs;
  gimple_stmt_iterator si;
  unsigned int i, nbbs;

  res = (loop_vec_info) xcalloc (1, sizeof (struct _loop_vec_info));
  LOOP_VINFO_LOOP (res) = loop;

  bbs = get_loop_body (loop);

  /* Create/Update stmt_info for all stmts in the loop.  */
  for (i = 0; i < loop->num_nodes; i++)
    {
      basic_block bb = bbs[i];

      /* BBs in a nested inner-loop will have been already processed (because
         we will have called vect_analyze_loop_form for any nested
         inner-loop).  Therefore, for stmts in an inner-loop we just want to
         update the STMT_VINFO_LOOP_VINFO field of their stmt_info to point
         to the new loop_info of the outer-loop we are currently considering
         to vectorize (instead of the loop_info of the inner-loop).
         For stmts in other BBs we need to create a stmt_info from
         scratch.  */
      if (bb->loop_father != loop)
        {
          /* Inner-loop bb.  */
          gcc_assert (loop->inner && bb->loop_father == loop->inner);
          for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
            {
              gimple phi = gsi_stmt (si);
              stmt_vec_info stmt_info = vinfo_for_stmt (phi);
              loop_vec_info inner_loop_vinfo =
                STMT_VINFO_LOOP_VINFO (stmt_info);
              gcc_assert (loop->inner == LOOP_VINFO_LOOP (inner_loop_vinfo));
              STMT_VINFO_LOOP_VINFO (stmt_info) = res;
            }
          for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
            {
              gimple stmt = gsi_stmt (si);
              stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
              loop_vec_info inner_loop_vinfo =
                STMT_VINFO_LOOP_VINFO (stmt_info);
              gcc_assert (loop->inner == LOOP_VINFO_LOOP (inner_loop_vinfo));
              STMT_VINFO_LOOP_VINFO (stmt_info) = res;
            }
        }
      else
        {
          /* bb in current nest.  */
          for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
            {
              gimple phi = gsi_stmt (si);
              gimple_set_uid (phi, 0);
              set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, res, NULL));
            }

          for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
            {
              gimple stmt = gsi_stmt (si);
              gimple_set_uid (stmt, 0);
              set_vinfo_for_stmt (stmt, new_stmt_vec_info (stmt, res, NULL));
            }
        }
    }

  /* CHECKME: We want to visit all BBs before their successors (except for
     latch blocks, for which this assertion wouldn't hold).  In the simple
     case of the loop forms we allow, a dfs order of the BBs would be the
     same as reversed postorder traversal, so we are safe.  */

  free (bbs);
  bbs = XCNEWVEC (basic_block, loop->num_nodes);
  nbbs = dfs_enumerate_from (loop->header, 0, bb_in_loop_p,
                             bbs, loop->num_nodes, loop);
  gcc_assert (nbbs == loop->num_nodes);

  LOOP_VINFO_BBS (res) = bbs;
  LOOP_VINFO_NITERS (res) = NULL;
  LOOP_VINFO_NITERS_UNCHANGED (res) = NULL;
  LOOP_VINFO_COST_MODEL_MIN_ITERS (res) = 0;
  LOOP_VINFO_VECTORIZABLE_P (res) = 0;
  LOOP_PEELING_FOR_ALIGNMENT (res) = 0;
  LOOP_VINFO_VECT_FACTOR (res) = 0;
  LOOP_VINFO_DATAREFS (res) = VEC_alloc (data_reference_p, heap, 10);
  LOOP_VINFO_DDRS (res) = VEC_alloc (ddr_p, heap, 10 * 10);
  LOOP_VINFO_UNALIGNED_DR (res) = NULL;
  LOOP_VINFO_MAY_MISALIGN_STMTS (res) =
    VEC_alloc (gimple, heap,
               PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIGNMENT_CHECKS));
  LOOP_VINFO_MAY_ALIAS_DDRS (res) =
    VEC_alloc (ddr_p, heap,
               PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS));
  LOOP_VINFO_STRIDED_STORES (res) = VEC_alloc (gimple, heap, 10);
  LOOP_VINFO_REDUCTIONS (res) = VEC_alloc (gimple, heap, 10);
  LOOP_VINFO_SLP_INSTANCES (res) = VEC_alloc (slp_instance, heap, 10);
  LOOP_VINFO_SLP_UNROLLING_FACTOR (res) = 1;
  LOOP_VINFO_PEELING_HTAB (res) = NULL;

  return res;
}
/* Function destroy_loop_vec_info.

   Free LOOP_VINFO struct, as well as all the stmt_vec_info structs of all
   the stmts in the loop.  */
void
destroy_loop_vec_info (loop_vec_info loop_vinfo, bool clean_stmts)
{
  struct loop *loop;
  basic_block *bbs;
  int nbbs;
  gimple_stmt_iterator si;
  int j;
  VEC (slp_instance, heap) *slp_instances;
  slp_instance instance;

  if (!loop_vinfo)
    return;

  loop = LOOP_VINFO_LOOP (loop_vinfo);

  bbs = LOOP_VINFO_BBS (loop_vinfo);
  nbbs = loop->num_nodes;

  if (!clean_stmts)
    {
      free (LOOP_VINFO_BBS (loop_vinfo));
      free_data_refs (LOOP_VINFO_DATAREFS (loop_vinfo));
      free_dependence_relations (LOOP_VINFO_DDRS (loop_vinfo));
      VEC_free (gimple, heap, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo));

      free (loop_vinfo);
      loop->aux = NULL;
      return;
    }

  for (j = 0; j < nbbs; j++)
    {
      basic_block bb = bbs[j];
      for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
        free_stmt_vec_info (gsi_stmt (si));

      for (si = gsi_start_bb (bb); !gsi_end_p (si); )
        {
          gimple stmt = gsi_stmt (si);
          stmt_vec_info stmt_info = vinfo_for_stmt (stmt);

          if (stmt_info)
            {
              /* Check if this is a "pattern stmt" (introduced by the
                 vectorizer during the pattern recognition pass).  */
              bool remove_stmt_p = false;
              gimple orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
              if (orig_stmt)
                {
                  stmt_vec_info orig_stmt_info = vinfo_for_stmt (orig_stmt);
                  if (orig_stmt_info
                      && STMT_VINFO_IN_PATTERN_P (orig_stmt_info))
                    remove_stmt_p = true;
                }

              /* Free stmt_vec_info.  */
              free_stmt_vec_info (stmt);

              /* Remove dead "pattern stmts".  */
              if (remove_stmt_p)
                gsi_remove (&si, true);
            }
          gsi_next (&si);
        }
    }

  free (LOOP_VINFO_BBS (loop_vinfo));
  free_data_refs (LOOP_VINFO_DATAREFS (loop_vinfo));
  free_dependence_relations (LOOP_VINFO_DDRS (loop_vinfo));
  VEC_free (gimple, heap, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo));
  VEC_free (ddr_p, heap, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo));
  slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
  for (j = 0; VEC_iterate (slp_instance, slp_instances, j, instance); j++)
    vect_free_slp_instance (instance);

  VEC_free (slp_instance, heap, LOOP_VINFO_SLP_INSTANCES (loop_vinfo));
  VEC_free (gimple, heap, LOOP_VINFO_STRIDED_STORES (loop_vinfo));
  VEC_free (gimple, heap, LOOP_VINFO_REDUCTIONS (loop_vinfo));

  if (LOOP_VINFO_PEELING_HTAB (loop_vinfo))
    htab_delete (LOOP_VINFO_PEELING_HTAB (loop_vinfo));

  free (loop_vinfo);
  loop->aux = NULL;
}
/* Function vect_analyze_loop_1.

   Apply a set of analyses on LOOP, and create a loop_vec_info struct
   for it.  The different analyses will record information in the
   loop_vec_info struct.  This is a subset of the analyses applied in
   vect_analyze_loop, to be applied on an inner-loop nested in the loop
   that is now considered for (outer-loop) vectorization.  */
static loop_vec_info
vect_analyze_loop_1 (struct loop *loop)
{
  loop_vec_info loop_vinfo;

  if (vect_print_dump_info (REPORT_DETAILS))
    fprintf (vect_dump, "===== analyze_loop_nest_1 =====");

  /* Check the CFG characteristics of the loop (nesting, entry/exit,
     etc.).  */

  loop_vinfo = vect_analyze_loop_form (loop);
  if (!loop_vinfo)
    {
      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "bad inner-loop form.");
      return NULL;
    }

  return loop_vinfo;
}
/* Function vect_analyze_loop_form.

   Verify that certain CFG restrictions hold, including:
   - the loop has a pre-header
   - the loop has a single entry and exit
   - the loop exit condition is simple enough, and the number of iterations
     can be analyzed (a countable loop).  */
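/* For illustration (assumed examples, not from this file): a countable loop
   is one like "for (i = 0; i < n; i++)", whose iteration count scev can
   express as a tree ('n'); a pointer-chasing loop such as
   "while (p) p = p->next;" has no such expression and fails the last
   restriction above.  */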
loop_vec_info
vect_analyze_loop_form (struct loop *loop)
{
  loop_vec_info loop_vinfo;
  gimple loop_cond;
  tree number_of_iterations = NULL;
  loop_vec_info inner_loop_vinfo = NULL;

  if (vect_print_dump_info (REPORT_DETAILS))
    fprintf (vect_dump, "=== vect_analyze_loop_form ===");

  /* Different restrictions apply when we are considering an inner-most loop,
     vs. an outer (nested) loop.
     (FORNOW.  May want to relax some of these restrictions in the
     future).  */

  if (!loop->inner)
    {
      /* Inner-most loop.  We currently require that the number of BBs is
         exactly 2 (the header and latch).  Vectorizable inner-most loops
         look like this:

                        (pre-header)
                           |
                          header <--------+
                           | |            |
                           | +--> latch --+
                           |
                        (exit-bb)  */

      if (loop->num_nodes != 2)
        {
          if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
            fprintf (vect_dump, "not vectorized: control flow in loop.");
          return NULL;
        }

      if (empty_block_p (loop->header))
        {
          if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
            fprintf (vect_dump, "not vectorized: empty loop.");
          return NULL;
        }
    }
  else
    {
      struct loop *innerloop = loop->inner;
      edge entryedge;

      /* Nested loop.  We currently require that the loop is doubly-nested,
         contains a single inner loop, and the number of BBs is exactly 5.
         Vectorizable outer-loops look like this:

                        (pre-header)
                           |
                          header <---+
                           |         |
                          inner-loop |
                           |         |
                          tail ------+
                           |
                        (exit-bb)

         The inner-loop has the properties expected of inner-most loops
         as described above.  */

      if ((loop->inner)->inner || (loop->inner)->next)
        {
          if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
            fprintf (vect_dump, "not vectorized: multiple nested loops.");
          return NULL;
        }

      /* Analyze the inner-loop.  */
      inner_loop_vinfo = vect_analyze_loop_1 (loop->inner);
      if (!inner_loop_vinfo)
        {
          if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
            fprintf (vect_dump, "not vectorized: Bad inner loop.");
          return NULL;
        }

      if (!expr_invariant_in_loop_p (loop,
                                     LOOP_VINFO_NITERS (inner_loop_vinfo)))
        {
          if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
            fprintf (vect_dump,
                     "not vectorized: inner-loop count not invariant.");
          destroy_loop_vec_info (inner_loop_vinfo, true);
          return NULL;
        }

      if (loop->num_nodes != 5)
        {
          if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
            fprintf (vect_dump, "not vectorized: control flow in loop.");
          destroy_loop_vec_info (inner_loop_vinfo, true);
          return NULL;
        }

      gcc_assert (EDGE_COUNT (innerloop->header->preds) == 2);
      entryedge = EDGE_PRED (innerloop->header, 0);
      if (EDGE_PRED (innerloop->header, 0)->src == innerloop->latch)
        entryedge = EDGE_PRED (innerloop->header, 1);

      if (entryedge->src != loop->header
          || !single_exit (innerloop)
          || single_exit (innerloop)->dest != EDGE_PRED (loop->latch, 0)->src)
        {
          if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
            fprintf (vect_dump,
                     "not vectorized: unsupported outerloop form.");
          destroy_loop_vec_info (inner_loop_vinfo, true);
          return NULL;
        }

      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "Considering outer-loop vectorization.");
    }

  if (!single_exit (loop)
      || EDGE_COUNT (loop->header->preds) != 2)
    {
      if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
        {
          if (!single_exit (loop))
            fprintf (vect_dump, "not vectorized: multiple exits.");
          else if (EDGE_COUNT (loop->header->preds) != 2)
            fprintf (vect_dump, "not vectorized: too many incoming edges.");
        }
      if (inner_loop_vinfo)
        destroy_loop_vec_info (inner_loop_vinfo, true);
      return NULL;
    }

  /* We assume that the loop exit condition is at the end of the loop, i.e.,
     that the loop is represented as a do-while (with a proper if-guard
     before the loop if needed), where the loop header contains all the
     executable statements, and the latch is empty.  */
  if (!empty_block_p (loop->latch)
      || !gimple_seq_empty_p (phi_nodes (loop->latch)))
    {
      if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
        fprintf (vect_dump, "not vectorized: unexpected loop form.");
      if (inner_loop_vinfo)
        destroy_loop_vec_info (inner_loop_vinfo, true);
      return NULL;
    }

  /* Make sure there exists a single-predecessor exit bb:  */
  if (!single_pred_p (single_exit (loop)->dest))
    {
      edge e = single_exit (loop);
      if (!(e->flags & EDGE_ABNORMAL))
        {
          split_loop_exit_edge (e);
          if (vect_print_dump_info (REPORT_DETAILS))
            fprintf (vect_dump, "split exit edge.");
        }
      else
        {
          if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
            fprintf (vect_dump, "not vectorized: abnormal loop exit edge.");
          if (inner_loop_vinfo)
            destroy_loop_vec_info (inner_loop_vinfo, true);
          return NULL;
        }
    }

  loop_cond = vect_get_loop_niters (loop, &number_of_iterations);
  if (!loop_cond)
    {
      if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
        fprintf (vect_dump, "not vectorized: complicated exit condition.");
      if (inner_loop_vinfo)
        destroy_loop_vec_info (inner_loop_vinfo, true);
      return NULL;
    }

  if (!number_of_iterations)
    {
      if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
        fprintf (vect_dump,
                 "not vectorized: number of iterations cannot be computed.");
      if (inner_loop_vinfo)
        destroy_loop_vec_info (inner_loop_vinfo, true);
      return NULL;
    }

  if (chrec_contains_undetermined (number_of_iterations))
    {
      if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
        fprintf (vect_dump, "Infinite number of iterations.");
      if (inner_loop_vinfo)
        destroy_loop_vec_info (inner_loop_vinfo, true);
      return NULL;
    }

  if (!NITERS_KNOWN_P (number_of_iterations))
    {
      if (vect_print_dump_info (REPORT_DETAILS))
        {
          fprintf (vect_dump, "Symbolic number of iterations is ");
          print_generic_expr (vect_dump, number_of_iterations, TDF_DETAILS);
        }
    }
  else if (TREE_INT_CST_LOW (number_of_iterations) == 0)
    {
      if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
        fprintf (vect_dump, "not vectorized: number of iterations = 0.");
      if (inner_loop_vinfo)
        destroy_loop_vec_info (inner_loop_vinfo, false);
      return NULL;
    }

  loop_vinfo = new_loop_vec_info (loop);
  LOOP_VINFO_NITERS (loop_vinfo) = number_of_iterations;
  LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = number_of_iterations;

  STMT_VINFO_TYPE (vinfo_for_stmt (loop_cond)) = loop_exit_ctrl_vec_info_type;

  /* CHECKME: May want to keep it around in the future.  */
  if (inner_loop_vinfo)
    destroy_loop_vec_info (inner_loop_vinfo, false);

  gcc_assert (!loop->aux);
  loop->aux = loop_vinfo;
  return loop_vinfo;
}
/* Get the cost of a statement of kind TYPE_OF_COST by calling the target's
   vectorization cost builtin.  */
static inline
int vect_get_cost (enum vect_cost_for_stmt type_of_cost)
{
  tree dummy_type = NULL;
  int dummy = 0;

  return targetm.vectorize.builtin_vectorization_cost (type_of_cost,
                                                       dummy_type, dummy);
}
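/* For illustration (an assumed usage note, not from this file): the cost
   model below compares sums of these per-statement costs, e.g.
   vect_get_cost (scalar_stmt) against vect_get_cost (vector_stmt), so only
   the relative magnitudes returned by the target hook matter, not their
   absolute values.  */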
/* Function vect_analyze_loop_operations.

   Scan the loop stmts and make sure they are all vectorizable.  */
static bool
vect_analyze_loop_operations (loop_vec_info loop_vinfo)
{
  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
  int nbbs = loop->num_nodes;
  gimple_stmt_iterator si;
  unsigned int vectorization_factor = 0;
  int i;
  gimple phi;
  stmt_vec_info stmt_info;
  bool need_to_vectorize = false;
  int min_profitable_iters;
  int min_scalar_loop_bound;
  unsigned int th;
  bool only_slp_in_loop = true, ok;

  if (vect_print_dump_info (REPORT_DETAILS))
    fprintf (vect_dump, "=== vect_analyze_loop_operations ===");

  gcc_assert (LOOP_VINFO_VECT_FACTOR (loop_vinfo));
  vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);

  for (i = 0; i < nbbs; i++)
    {
      basic_block bb = bbs[i];

      for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
        {
          phi = gsi_stmt (si);
          ok = true;

          stmt_info = vinfo_for_stmt (phi);
          if (vect_print_dump_info (REPORT_DETAILS))
            {
              fprintf (vect_dump, "examining phi: ");
              print_gimple_stmt (vect_dump, phi, 0, TDF_SLIM);
            }

          if (! is_loop_header_bb_p (bb))
            {
              /* Inner-loop loop-closed exit phi in outer-loop vectorization
                 (i.e., a phi in the tail of the outer-loop).
                 FORNOW: we currently don't support the case that these phis
                 are not used in the outerloop (unless it is double
                 reduction, i.e., this phi is vect_reduction_def), because
                 this case requires to actually do something here.  */
              if ((!STMT_VINFO_RELEVANT_P (stmt_info)
                   || STMT_VINFO_LIVE_P (stmt_info))
                  && STMT_VINFO_DEF_TYPE (stmt_info)
                     != vect_double_reduction_def)
                {
                  if (vect_print_dump_info (REPORT_DETAILS))
                    fprintf (vect_dump,
                             "Unsupported loop-closed phi in outer-loop.");
                  return false;
                }
              continue;
            }

          gcc_assert (stmt_info);

          if (STMT_VINFO_LIVE_P (stmt_info))
            {
              /* FORNOW: not yet supported.  */
              if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
                fprintf (vect_dump,
                         "not vectorized: value used after loop.");
              return false;
            }

          if (STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope
              && STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
            {
              /* A scalar-dependence cycle that we don't support.  */
              if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
                fprintf (vect_dump,
                         "not vectorized: scalar dependence cycle.");
              return false;
            }

          if (STMT_VINFO_RELEVANT_P (stmt_info))
            {
              need_to_vectorize = true;
              if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def)
                ok = vectorizable_induction (phi, NULL, NULL);
            }

          if (!ok)
            {
              if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
                {
                  fprintf (vect_dump,
                           "not vectorized: relevant phi not supported: ");
                  print_gimple_stmt (vect_dump, phi, 0, TDF_SLIM);
                }
              return false;
            }
        }

      for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
        {
          gimple stmt = gsi_stmt (si);
          stmt_vec_info stmt_info = vinfo_for_stmt (stmt);

          gcc_assert (stmt_info);

          if (!vect_analyze_stmt (stmt, &need_to_vectorize, NULL))
            return false;

          if ((STMT_VINFO_RELEVANT_P (stmt_info)
               || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
              && !PURE_SLP_STMT (stmt_info))
            /* STMT needs both SLP and loop-based vectorization.  */
            only_slp_in_loop = false;
        }
    } /* bbs */

  /* All operations in the loop are either irrelevant (deal with loop
     control, or dead), or only used outside the loop and can be moved
     out of the loop (e.g. invariants, inductions).  The loop can be
     optimized away by scalar optimizations.  We're better off not
     touching this loop.  */
  if (!need_to_vectorize)
    {
      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump,
                 "All the computation can be taken out of the loop.");
      if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
        fprintf (vect_dump,
                 "not vectorized: redundant loop. no profit to vectorize.");
      return false;
    }

  /* If all the stmts in the loop can be SLPed, we perform only SLP, and the
     vectorization factor of the loop is the unrolling factor required by
     the SLP instances.  If that unrolling factor is 1, we say that we
     perform pure SLP on the loop - cross iteration parallelism is not
     exploited.  */
  if (only_slp_in_loop)
    vectorization_factor = LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo);
  else
    vectorization_factor = least_common_multiple (vectorization_factor,
                                LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo));

  LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;

  if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
      && vect_print_dump_info (REPORT_DETAILS))
    fprintf (vect_dump,
             "vectorization_factor = %d, niters = " HOST_WIDE_INT_PRINT_DEC,
             vectorization_factor, LOOP_VINFO_INT_NITERS (loop_vinfo));

  if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
      && (LOOP_VINFO_INT_NITERS (loop_vinfo) < vectorization_factor))
    {
      if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
        fprintf (vect_dump, "not vectorized: iteration count too small.");
      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "not vectorized: iteration count smaller than "
                 "vectorization factor.");
      return false;
    }

  /* Analyze cost.  Decide if worth while to vectorize.  */

  /* Once VF is set, SLP costs should be updated since the number of created
     vector stmts depends on VF.  */
  vect_update_slp_costs_according_to_vf (loop_vinfo);

  min_profitable_iters = vect_estimate_min_profitable_iters (loop_vinfo);
  LOOP_VINFO_COST_MODEL_MIN_ITERS (loop_vinfo) = min_profitable_iters;

  if (min_profitable_iters < 0)
    {
      if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
        fprintf (vect_dump,
                 "not vectorized: vectorization not profitable.");
      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "not vectorized: vector version will never be "
                 "profitable.");
      return false;
    }

  min_scalar_loop_bound = ((PARAM_VALUE (PARAM_MIN_VECT_LOOP_BOUND)
                            * vectorization_factor) - 1);

  /* Use the cost model only if it is more conservative than user specified
     threshold.  */

  th = (unsigned) min_scalar_loop_bound;
  if (min_profitable_iters
      && (!min_scalar_loop_bound
          || min_profitable_iters > min_scalar_loop_bound))
    th = (unsigned) min_profitable_iters;

  if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
      && LOOP_VINFO_INT_NITERS (loop_vinfo) <= th)
    {
      if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
        fprintf (vect_dump, "not vectorized: vectorization not "
                 "profitable.");
      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "not vectorized: iteration count smaller than "
                 "user specified loop bound parameter or minimum "
                 "profitable iterations (whichever is more conservative).");
      return false;
    }

  if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
      || LOOP_VINFO_INT_NITERS (loop_vinfo) % vectorization_factor != 0
      || LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo))
    {
      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "epilog loop required.");
      if (!vect_can_advance_ivs_p (loop_vinfo))
        {
          if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
            fprintf (vect_dump,
                     "not vectorized: can't create epilog loop 1.");
          return false;
        }
      if (!slpeel_can_duplicate_loop_p (loop, single_exit (loop)))
        {
          if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
            fprintf (vect_dump,
                     "not vectorized: can't create epilog loop 2.");
          return false;
        }
    }

  return true;
}
/* Function vect_analyze_loop.

   Apply a set of analyses on LOOP, and create a loop_vec_info struct
   for it.  The different analyses will record information in the
   loop_vec_info struct.  */
loop_vec_info
vect_analyze_loop (struct loop *loop)
{
  bool ok;
  loop_vec_info loop_vinfo;
  int max_vf = MAX_VECTORIZATION_FACTOR;
  int min_vf = 2;

  if (vect_print_dump_info (REPORT_DETAILS))
    fprintf (vect_dump, "===== analyze_loop_nest =====");

  if (loop_outer (loop)
      && loop_vec_info_for_loop (loop_outer (loop))
      && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop
                                      (loop_outer (loop))))
    {
      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "outer-loop already vectorized.");
      return NULL;
    }

  /* Check the CFG characteristics of the loop (nesting, entry/exit,
     etc.).  */

  loop_vinfo = vect_analyze_loop_form (loop);
  if (!loop_vinfo)
    {
      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "bad loop form.");
      return NULL;
    }

  /* Find all data references in the loop (which correspond to vdefs/vuses)
     and analyze their evolution in the loop.  Also adjust the minimal
     vectorization factor according to the loads and stores.

     FORNOW: Handle only simple array references whose alignment can be
     forced, and aligned pointer-references.  */

  ok = vect_analyze_data_refs (loop_vinfo, NULL, &min_vf);
  if (!ok)
    {
      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "bad data references.");
      destroy_loop_vec_info (loop_vinfo, true);
      return NULL;
    }

  /* Classify all cross-iteration scalar data-flow cycles.
     Cross-iteration cycles caused by virtual phis are analyzed
     separately.  */

  vect_analyze_scalar_cycles (loop_vinfo);

  vect_pattern_recog (loop_vinfo);

  /* Data-flow analysis to detect stmts that do not need to be
     vectorized.  */

  ok = vect_mark_stmts_to_be_vectorized (loop_vinfo);
  if (!ok)
    {
      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "unexpected pattern.");
      destroy_loop_vec_info (loop_vinfo, true);
      return NULL;
    }

  /* Analyze data dependences between the data-refs in the loop
     and adjust the maximum vectorization factor according to
     the dependences.
     FORNOW: fail at the first data dependence that we encounter.  */

  ok = vect_analyze_data_ref_dependences (loop_vinfo, NULL, &max_vf);
  if (!ok
      || max_vf < min_vf)
    {
      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "bad data dependence.");
      destroy_loop_vec_info (loop_vinfo, true);
      return NULL;
    }

  ok = vect_determine_vectorization_factor (loop_vinfo);
  if (!ok)
    {
      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "can't determine vectorization factor.");
      destroy_loop_vec_info (loop_vinfo, true);
      return NULL;
    }
  if (max_vf < LOOP_VINFO_VECT_FACTOR (loop_vinfo))
    {
      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "bad data dependence.");
      destroy_loop_vec_info (loop_vinfo, true);
      return NULL;
    }

  /* Analyze the alignment of the data-refs in the loop.
     Fail if a data reference is found that cannot be vectorized.  */

  ok = vect_analyze_data_refs_alignment (loop_vinfo, NULL);
  if (!ok)
    {
      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "bad data alignment.");
      destroy_loop_vec_info (loop_vinfo, true);
      return NULL;
    }

  /* Analyze the access patterns of the data-refs in the loop (consecutive,
     complex, etc.).  FORNOW: Only handle consecutive access pattern.  */

  ok = vect_analyze_data_ref_accesses (loop_vinfo, NULL);
  if (!ok)
    {
      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "bad data access.");
      destroy_loop_vec_info (loop_vinfo, true);
      return NULL;
    }

  /* Prune the list of ddrs to be tested at run-time by versioning for
     alias.  It is important to call pruning after
     vect_analyze_data_ref_accesses, since we use grouping information
     gathered by interleaving analysis.  */
  ok = vect_prune_runtime_alias_test_list (loop_vinfo);
  if (!ok)
    {
      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "too long list of versioning for alias "
                            "run-time tests.");
      destroy_loop_vec_info (loop_vinfo, true);
      return NULL;
    }

  /* This pass will decide on using loop versioning and/or loop peeling in
     order to enhance the alignment of data references in the loop.  */

  ok = vect_enhance_data_refs_alignment (loop_vinfo);
  if (!ok)
    {
      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "bad data alignment.");
      destroy_loop_vec_info (loop_vinfo, true);
      return NULL;
    }

  /* Check the SLP opportunities in the loop, analyze and build SLP
     trees.  */
  ok = vect_analyze_slp (loop_vinfo, NULL);
  if (ok)
    {
      /* Decide which possible SLP instances to SLP.  */
      vect_make_slp_decision (loop_vinfo);

      /* Find stmts that need to be both vectorized and SLPed.  */
      vect_detect_hybrid_slp (loop_vinfo);
    }

  /* Scan all the operations in the loop and make sure they are
     vectorizable.  */

  ok = vect_analyze_loop_operations (loop_vinfo);
  if (!ok)
    {
      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "bad operation or unsupported loop bound.");
      destroy_loop_vec_info (loop_vinfo, true);
      return NULL;
    }

  LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1;

  return loop_vinfo;
}
/* Function reduction_code_for_scalar_code

   Input:
   CODE - tree_code of a reduction operation.

   Output:
   REDUC_CODE - the corresponding tree-code to be used to reduce the
      vector of partial results into a single scalar result (which
      will also reside in a vector) or ERROR_MARK if the operation is
      a supported reduction operation, but does not have such a tree-code.

   Return FALSE if CODE currently cannot be vectorized as a reduction.  */
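/* For illustration (an assumed example, not from this file): for the loop
   "for (i = 0; i < N; i++) sum += a[i];" the reduction code is PLUS_EXPR,
   and *REDUC_CODE is set to REDUC_PLUS_EXPR, which sums all elements of the
   final vector of partial sums into one scalar.  For codes like BIT_AND_EXPR
   *REDUC_CODE is ERROR_MARK, and the reduction epilogue is emitted with
   generic element shifts/extracts instead of a single reduction insn.  */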
static bool
reduction_code_for_scalar_code (enum tree_code code,
                                enum tree_code *reduc_code)
{
  switch (code)
    {
      case MAX_EXPR:
        *reduc_code = REDUC_MAX_EXPR;
        return true;

      case MIN_EXPR:
        *reduc_code = REDUC_MIN_EXPR;
        return true;

      case PLUS_EXPR:
        *reduc_code = REDUC_PLUS_EXPR;
        return true;

      case MULT_EXPR:
      case MINUS_EXPR:
      case BIT_IOR_EXPR:
      case BIT_XOR_EXPR:
      case BIT_AND_EXPR:
        *reduc_code = ERROR_MARK;
        return true;

      default:
        return false;
    }
}
/* Error reporting helper for vect_is_simple_reduction below.  GIMPLE
   statement STMT is printed with a message MSG.  */

static void
report_vect_op (gimple stmt, const char *msg)
{
  fprintf (vect_dump, "%s", msg);
  print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
}
/* Function vect_is_simple_reduction_1

   (1) Detect a cross-iteration def-use cycle that represents a simple
   reduction computation.  We look for the following pattern:

   loop_header:
     a1 = phi < a0, a2 >
     a3 = ...
     a2 = operation (a3, a1)

   such that:
   1. operation is commutative and associative and it is safe to
      change the order of the computation (if CHECK_REDUCTION is true)
   2. no uses for a2 in the loop (a2 is used out of the loop)
   3. no uses of a1 in the loop besides the reduction operation.

   Condition 1 is tested here.
   Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.

   (2) Detect a cross-iteration def-use cycle in nested loops, i.e.,
   nested cycles, if CHECK_REDUCTION is false.

   (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double
   reductions:

   a1 = phi < a0, a2 >
   inner loop (def of a3)
   a2 = phi < a3 >

   If MODIFY is true it tries also to rework the code in-place to enable
   detection of more reduction patterns.  For the time being we rewrite
   "res -= RHS" into "res += -RHS" when it seems worthwhile.  */
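/* For illustration (an assumed example, not from this file): with MODIFY
   true, "res -= a[i]" inside the loop is rewritten as

       neg = -a[i];
       res = res + neg;

   which turns the cycle into a commutative/associative PLUS_EXPR reduction
   that the pattern above can match.  */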
static gimple
vect_is_simple_reduction_1 (loop_vec_info loop_info, gimple phi,
                            bool check_reduction, bool *double_reduc,
                            bool modify)
{
  struct loop *loop = (gimple_bb (phi))->loop_father;
  struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
  edge latch_e = loop_latch_edge (loop);
  tree loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
  gimple def_stmt, def1 = NULL, def2 = NULL;
  enum tree_code orig_code, code;
  tree op1, op2, op3 = NULL_TREE, op4 = NULL_TREE;
  tree type;
  int nloop_uses;
  tree name;
  imm_use_iterator imm_iter;
  use_operand_p use_p;
  bool phi_def;

  *double_reduc = false;

  /* If CHECK_REDUCTION is true, we assume inner-most loop vectorization,
     otherwise, we assume outer loop vectorization.  */
  gcc_assert ((check_reduction && loop == vect_loop)
              || (!check_reduction && flow_loop_nested_p (vect_loop, loop)));

  name = PHI_RESULT (phi);
  nloop_uses = 0;
  FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name)
    {
      gimple use_stmt = USE_STMT (use_p);
      if (is_gimple_debug (use_stmt))
        continue;
      if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt))
          && vinfo_for_stmt (use_stmt)
          && !is_pattern_stmt_p (vinfo_for_stmt (use_stmt)))
        nloop_uses++;
      if (nloop_uses > 1)
        {
          if (vect_print_dump_info (REPORT_DETAILS))
            fprintf (vect_dump, "reduction used in loop.");
          return NULL;
        }
    }

  if (TREE_CODE (loop_arg) != SSA_NAME)
    {
      if (vect_print_dump_info (REPORT_DETAILS))
        {
          fprintf (vect_dump, "reduction: not ssa_name: ");
          print_generic_expr (vect_dump, loop_arg, TDF_SLIM);
        }
      return NULL;
    }

  def_stmt = SSA_NAME_DEF_STMT (loop_arg);
  if (!def_stmt)
    {
      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "reduction: no def_stmt.");
      return NULL;
    }

  if (!is_gimple_assign (def_stmt) && gimple_code (def_stmt) != GIMPLE_PHI)
    {
      if (vect_print_dump_info (REPORT_DETAILS))
        print_gimple_stmt (vect_dump, def_stmt, 0, TDF_SLIM);
      return NULL;
    }

  if (is_gimple_assign (def_stmt))
    {
      name = gimple_assign_lhs (def_stmt);
      phi_def = false;
    }
  else
    {
      name = PHI_RESULT (def_stmt);
      phi_def = true;
    }

  nloop_uses = 0;
  FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name)
    {
      gimple use_stmt = USE_STMT (use_p);
      if (is_gimple_debug (use_stmt))
        continue;
      if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt))
          && vinfo_for_stmt (use_stmt)
          && !is_pattern_stmt_p (vinfo_for_stmt (use_stmt)))
        nloop_uses++;
      if (nloop_uses > 1)
        {
          if (vect_print_dump_info (REPORT_DETAILS))
            fprintf (vect_dump, "reduction used in loop.");
          return NULL;
        }
    }

  /* If DEF_STMT is a phi node itself, we expect it to have a single argument
     defined in the inner loop.  */
  if (phi_def)
    {
      op1 = PHI_ARG_DEF (def_stmt, 0);

      if (gimple_phi_num_args (def_stmt) != 1
          || TREE_CODE (op1) != SSA_NAME)
        {
          if (vect_print_dump_info (REPORT_DETAILS))
            fprintf (vect_dump, "unsupported phi node definition.");

          return NULL;
        }

      def1 = SSA_NAME_DEF_STMT (op1);
      if (flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
          && loop->inner
          && flow_bb_inside_loop_p (loop->inner, gimple_bb (def1))
          && is_gimple_assign (def1))
        {
          if (vect_print_dump_info (REPORT_DETAILS))
            report_vect_op (def_stmt, "detected double reduction: ");

          *double_reduc = true;
          return def_stmt;
        }

      return NULL;
    }
  code = orig_code = gimple_assign_rhs_code (def_stmt);

  /* We can handle "res -= x[i]", which is non-associative, by
     simply rewriting this into "res += -x[i]".  Avoid changing
     the gimple instruction for the first simple tests and only do this
     if we're allowed to change code at all.  */
  if (code == MINUS_EXPR && modify)
    code = PLUS_EXPR;

  if (check_reduction
      && (!commutative_tree_code (code) || !associative_tree_code (code)))
    {
      if (vect_print_dump_info (REPORT_DETAILS))
        report_vect_op (def_stmt,
                        "reduction: not commutative/associative: ");
      return NULL;
    }

  if (get_gimple_rhs_class (code) != GIMPLE_BINARY_RHS)
    {
      if (code != COND_EXPR)
        {
          if (vect_print_dump_info (REPORT_DETAILS))
            report_vect_op (def_stmt, "reduction: not binary operation: ");

          return NULL;
        }

      op3 = TREE_OPERAND (gimple_assign_rhs1 (def_stmt), 0);
      if (COMPARISON_CLASS_P (op3))
        {
          op4 = TREE_OPERAND (op3, 1);
          op3 = TREE_OPERAND (op3, 0);
        }

      op1 = TREE_OPERAND (gimple_assign_rhs1 (def_stmt), 1);
      op2 = TREE_OPERAND (gimple_assign_rhs1 (def_stmt), 2);

      if (TREE_CODE (op1) != SSA_NAME && TREE_CODE (op2) != SSA_NAME)
        {
          if (vect_print_dump_info (REPORT_DETAILS))
            report_vect_op (def_stmt, "reduction: uses not ssa_names: ");

          return NULL;
        }
    }
  else
    {
      op1 = gimple_assign_rhs1 (def_stmt);
      op2 = gimple_assign_rhs2 (def_stmt);

      if (TREE_CODE (op1) != SSA_NAME || TREE_CODE (op2) != SSA_NAME)
        {
          if (vect_print_dump_info (REPORT_DETAILS))
            report_vect_op (def_stmt, "reduction: uses not ssa_names: ");

          return NULL;
        }
    }

  type = TREE_TYPE (gimple_assign_lhs (def_stmt));
  if ((TREE_CODE (op1) == SSA_NAME
       && !types_compatible_p (type, TREE_TYPE (op1)))
      || (TREE_CODE (op2) == SSA_NAME
          && !types_compatible_p (type, TREE_TYPE (op2)))
      || (op3 && TREE_CODE (op3) == SSA_NAME
          && !types_compatible_p (type, TREE_TYPE (op3)))
      || (op4 && TREE_CODE (op4) == SSA_NAME
          && !types_compatible_p (type, TREE_TYPE (op4))))
    {
      if (vect_print_dump_info (REPORT_DETAILS))
        {
          fprintf (vect_dump, "reduction: multiple types: operation type: ");
          print_generic_expr (vect_dump, type, TDF_SLIM);
          fprintf (vect_dump, ", operands types: ");
          print_generic_expr (vect_dump, TREE_TYPE (op1), TDF_SLIM);
          fprintf (vect_dump, ",");
          print_generic_expr (vect_dump, TREE_TYPE (op2), TDF_SLIM);
          if (op3)
            {
              fprintf (vect_dump, ",");
              print_generic_expr (vect_dump, TREE_TYPE (op3), TDF_SLIM);
            }

          if (op4)
            {
              fprintf (vect_dump, ",");
              print_generic_expr (vect_dump, TREE_TYPE (op4), TDF_SLIM);
            }
        }

      return NULL;
    }

  /* Check that it's ok to change the order of the computation.
     Generally, when vectorizing a reduction we change the order of the
     computation.  This may change the behavior of the program in some
     cases, so we need to check that this is ok.  One exception is when
     vectorizing an outer-loop: the inner-loop is executed sequentially,
     and therefore vectorizing reductions in the inner-loop during
     outer-loop vectorization is safe.  */

  /* CHECKME: check for !flag_finite_math_only too?  */
  if (SCALAR_FLOAT_TYPE_P (type) && !flag_associative_math
      && check_reduction)
    {
      /* Changing the order of operations changes the semantics.  */
      if (vect_print_dump_info (REPORT_DETAILS))
        report_vect_op (def_stmt,
                        "reduction: unsafe fp math optimization: ");
      return NULL;
    }
  else if (INTEGRAL_TYPE_P (type) && TYPE_OVERFLOW_TRAPS (type)
           && check_reduction)
    {
      /* Changing the order of operations changes the semantics.  */
      if (vect_print_dump_info (REPORT_DETAILS))
        report_vect_op (def_stmt,
                        "reduction: unsafe int math optimization: ");
      return NULL;
    }
  else if (SAT_FIXED_POINT_TYPE_P (type) && check_reduction)
    {
      /* Changing the order of operations changes the semantics.  */
      if (vect_print_dump_info (REPORT_DETAILS))
        report_vect_op (def_stmt,
                        "reduction: unsafe fixed-point math optimization: ");
      return NULL;
    }

  /* If we detected "res -= x[i]" earlier, rewrite it into
     "res += -x[i]" now.  If this turns out to be useless reassoc
     will clean it up again.  */
  if (orig_code == MINUS_EXPR)
    {
      tree rhs = gimple_assign_rhs2 (def_stmt);
      tree negrhs = make_ssa_name (SSA_NAME_VAR (rhs), NULL);
      gimple negate_stmt = gimple_build_assign_with_ops (NEGATE_EXPR, negrhs,
                                                         rhs, NULL);
      gimple_stmt_iterator gsi = gsi_for_stmt (def_stmt);
      set_vinfo_for_stmt (negate_stmt, new_stmt_vec_info (negate_stmt,
                                                          loop_info, NULL));
      gsi_insert_before (&gsi, negate_stmt, GSI_NEW_STMT);
      gimple_assign_set_rhs2 (def_stmt, negrhs);
      gimple_assign_set_rhs_code (def_stmt, PLUS_EXPR);
      update_stmt (def_stmt);
    }
1915 /* Reduction is safe. We're dealing with one of the following:
1916 1) integer arithmetic and no trapv
1917 2) floating point arithmetic, and special flags permit this optimization
1918 3) nested cycle (i.e., outer loop vectorization). */
1919 if (TREE_CODE (op1) == SSA_NAME)
1920 def1 = SSA_NAME_DEF_STMT (op1);
1922 if (TREE_CODE (op2) == SSA_NAME)
1923 def2 = SSA_NAME_DEF_STMT (op2);
1925 if (code != COND_EXPR
1926 && (!def1 || !def2 || gimple_nop_p (def1) || gimple_nop_p (def2)))
1928 if (vect_print_dump_info (REPORT_DETAILS))
1929 report_vect_op (def_stmt, "reduction: no defs for operands: ");
1930 return NULL;
1933 /* Check that one def is the reduction def, defined by PHI,
1934 the other def is either defined in the loop ("vect_internal_def"),
1935 or it's an induction (defined by a loop-header phi-node). */
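/* E.g., for the typical sum reduction
s_1 = PHI <s_0, s_2>
...
s_2 = s_1 + t_3
DEF_STMT is the addition: one of its operands is defined by the
loop-header PHI, and the other is computed by a stmt inside the loop. */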
1937 if (def2 && def2 == phi
1938 && (code == COND_EXPR
1939 || (def1 && flow_bb_inside_loop_p (loop, gimple_bb (def1))
1940 && (is_gimple_assign (def1)
1941 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1))
1942 == vect_induction_def
1943 || (gimple_code (def1) == GIMPLE_PHI
1944 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1))
1945 == vect_internal_def
1946 && !is_loop_header_bb_p (gimple_bb (def1)))))))
1948 if (vect_print_dump_info (REPORT_DETAILS))
1949 report_vect_op (def_stmt, "detected reduction: ");
1950 return def_stmt;
1952 else if (def1 && def1 == phi
1953 && (code == COND_EXPR
1954 || (def2 && flow_bb_inside_loop_p (loop, gimple_bb (def2))
1955 && (is_gimple_assign (def2)
1956 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2))
1957 == vect_induction_def
1958 || (gimple_code (def2) == GIMPLE_PHI
1959 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2))
1960 == vect_internal_def
1961 && !is_loop_header_bb_p (gimple_bb (def2)))))))
1963 if (check_reduction)
1965 /* Swap operands (just for simplicity - so that the rest of the code
1966 can assume that the reduction variable is always the last (second)
1967 argument). */
1968 if (vect_print_dump_info (REPORT_DETAILS))
1969 report_vect_op (def_stmt,
1970 "detected reduction: need to swap operands: ");
1972 swap_tree_operands (def_stmt, gimple_assign_rhs1_ptr (def_stmt),
1973 gimple_assign_rhs2_ptr (def_stmt));
1975 else
1977 if (vect_print_dump_info (REPORT_DETAILS))
1978 report_vect_op (def_stmt, "detected reduction: ");
1981 return def_stmt;
1983 else
1985 if (vect_print_dump_info (REPORT_DETAILS))
1986 report_vect_op (def_stmt, "reduction: unknown pattern: ");
1988 return NULL;
1992 /* Wrapper around vect_is_simple_reduction_1, that won't modify code
1993 in-place. Arguments as there. */
1995 static gimple
1996 vect_is_simple_reduction (loop_vec_info loop_info, gimple phi,
1997 bool check_reduction, bool *double_reduc)
1999 return vect_is_simple_reduction_1 (loop_info, phi, check_reduction,
2000 double_reduc, false);
2003 /* Wrapper around vect_is_simple_reduction_1, which will modify code
2004 in-place if it enables detection of more reductions. Arguments
2005 as there. */
2007 gimple
2008 vect_force_simple_reduction (loop_vec_info loop_info, gimple phi,
2009 bool check_reduction, bool *double_reduc)
2011 return vect_is_simple_reduction_1 (loop_info, phi, check_reduction,
2012 double_reduc, true);
2015 /* Calculate the cost of one scalar iteration of the loop. */
2016 static int
2017 vect_get_single_scalar_iteration_cost (loop_vec_info loop_vinfo)
2019 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2020 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
2021 int nbbs = loop->num_nodes, factor, scalar_single_iter_cost = 0;
2022 int innerloop_iters, i, stmt_cost;
2024 /* Count statements in scalar loop. Using this as scalar cost for a single
2025 iteration for now.
2027 TODO: Add outer loop support.
2029 TODO: Consider assigning different costs to different scalar
2030 statements. */
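/* E.g., a scalar body consisting of a load, an add and a store counts
vect_get_cost (scalar_load) + vect_get_cost (scalar_stmt)
+ vect_get_cost (scalar_store) for a single iteration. */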
2032 /* FORNOW. */
2033 if (loop->inner)
2034 innerloop_iters = 50; /* FIXME */
2036 for (i = 0; i < nbbs; i++)
2038 gimple_stmt_iterator si;
2039 basic_block bb = bbs[i];
2041 if (bb->loop_father == loop->inner)
2042 factor = innerloop_iters;
2043 else
2044 factor = 1;
2046 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
2048 gimple stmt = gsi_stmt (si);
2049 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2051 if (!is_gimple_assign (stmt) && !is_gimple_call (stmt))
2052 continue;
2054 /* Skip stmts that are not vectorized inside the loop. */
2055 if (stmt_info
2056 && !STMT_VINFO_RELEVANT_P (stmt_info)
2057 && (!STMT_VINFO_LIVE_P (stmt_info)
2058 || STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def))
2059 continue;
2061 if (STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt)))
2063 if (DR_IS_READ (STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt))))
2064 stmt_cost = vect_get_cost (scalar_load);
2065 else
2066 stmt_cost = vect_get_cost (scalar_store);
2068 else
2069 stmt_cost = vect_get_cost (scalar_stmt);
2071 scalar_single_iter_cost += stmt_cost * factor;
2074 return scalar_single_iter_cost;
2077 /* Calculate cost of peeling the loop PEEL_ITERS_PROLOGUE times. */
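/* For instance, with NITERS == 100, VF == 4 and PEEL_ITERS_PROLOGUE == 3,
*PEEL_ITERS_EPILOGUE becomes (100 - 3) % 4 == 1 and the returned cost is
(3 + 1) * SCALAR_SINGLE_ITER_COST; no guard cost is added, since the
iteration count is known at compile time. */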
2078 int
2079 vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue,
2080 int *peel_iters_epilogue,
2081 int scalar_single_iter_cost)
2083 int peel_guard_costs = 0;
2084 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2086 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
2088 *peel_iters_epilogue = vf/2;
2089 if (vect_print_dump_info (REPORT_COST))
2090 fprintf (vect_dump, "cost model: "
2091 "epilogue peel iters set to vf/2 because "
2092 "loop iterations are unknown .");
2094 /* If peeled iterations are known but number of scalar loop
2095 iterations are unknown, count a taken branch per peeled loop. */
2096 peel_guard_costs = 2 * vect_get_cost (cond_branch_taken);
2098 else
2100 int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
2101 peel_iters_prologue = niters < peel_iters_prologue ?
2102 niters : peel_iters_prologue;
2103 *peel_iters_epilogue = (niters - peel_iters_prologue) % vf;
2106 return (peel_iters_prologue * scalar_single_iter_cost)
2107 + (*peel_iters_epilogue * scalar_single_iter_cost)
2108 + peel_guard_costs;
2111 /* Function vect_estimate_min_profitable_iters
2113 Return the number of iterations required for the vector version of the
2114 loop to be profitable relative to the cost of the scalar version of the
2115 loop.
2117 TODO: Take profile info into account before making vectorization
2118 decisions, if available. */
2120 static int
2121 vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo)
2123 int i;
2124 int min_profitable_iters;
2125 int peel_iters_prologue;
2126 int peel_iters_epilogue;
2127 int vec_inside_cost = 0;
2128 int vec_outside_cost = 0;
2129 int scalar_single_iter_cost = 0;
2130 int scalar_outside_cost = 0;
2131 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2132 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2133 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
2134 int nbbs = loop->num_nodes;
2135 int npeel = LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo);
2136 int peel_guard_costs = 0;
2137 int innerloop_iters = 0, factor;
2138 VEC (slp_instance, heap) *slp_instances;
2139 slp_instance instance;
2141 /* Cost model disabled. */
2142 if (!flag_vect_cost_model)
2144 if (vect_print_dump_info (REPORT_COST))
2145 fprintf (vect_dump, "cost model disabled.");
2146 return 0;
2149 /* Requires loop versioning tests to handle misalignment. */
2150 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
2152 /* FIXME: Make cost depend on complexity of individual check. */
2153 vec_outside_cost +=
2154 VEC_length (gimple, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo));
2155 if (vect_print_dump_info (REPORT_COST))
2156 fprintf (vect_dump, "cost model: Adding cost of checks for loop "
2157 "versioning to treat misalignment.\n");
2160 /* Requires loop versioning with alias checks. */
2161 if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
2163 /* FIXME: Make cost depend on complexity of individual check. */
2164 vec_outside_cost +=
2165 VEC_length (ddr_p, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo));
2166 if (vect_print_dump_info (REPORT_COST))
2167 fprintf (vect_dump, "cost model: Adding cost of checks for loop "
2168 "versioning for aliasing.\n");
2171 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
2172 || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
2173 vec_outside_cost += vect_get_cost (cond_branch_taken);
2175 /* Add up the inside-of-loop and outside-of-loop costs recorded for the
2176 vectorized statements during the analysis phase.
2178 TODO: Add outer loop support. */
2183 /* FORNOW. */
2184 if (loop->inner)
2185 innerloop_iters = 50; /* FIXME */
2187 for (i = 0; i < nbbs; i++)
2189 gimple_stmt_iterator si;
2190 basic_block bb = bbs[i];
2192 if (bb->loop_father == loop->inner)
2193 factor = innerloop_iters;
2194 else
2195 factor = 1;
2197 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
2199 gimple stmt = gsi_stmt (si);
2200 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2201 /* Skip stmts that are not vectorized inside the loop. */
2202 if (!STMT_VINFO_RELEVANT_P (stmt_info)
2203 && (!STMT_VINFO_LIVE_P (stmt_info)
2204 || STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def))
2205 continue;
2206 vec_inside_cost += STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info) * factor;
2207 /* FIXME: for stmts in the inner-loop in outer-loop vectorization,
2208 some of the "outside" costs are generated inside the outer-loop. */
2209 vec_outside_cost += STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info);
2213 scalar_single_iter_cost = vect_get_single_scalar_iteration_cost (loop_vinfo);
2215 /* Add additional cost for the peeled instructions in prologue and epilogue
2216 loop.
2218 FORNOW: If we don't know the value of peel_iters for prologue or epilogue
2219 at compile-time - we assume it's vf/2 (the worst would be vf-1).
2221 TODO: Build an expression that represents peel_iters for prologue and
2222 epilogue to be used in a run-time test. */
2224 if (npeel < 0)
2226 peel_iters_prologue = vf/2;
2227 if (vect_print_dump_info (REPORT_COST))
2228 fprintf (vect_dump, "cost model: "
2229 "prologue peel iters set to vf/2.");
2231 /* If peeling for alignment is unknown, the loop bound of the main loop
2232 becomes unknown. */
2233 peel_iters_epilogue = vf/2;
2234 if (vect_print_dump_info (REPORT_COST))
2235 fprintf (vect_dump, "cost model: "
2236 "epilogue peel iters set to vf/2 because "
2237 "peeling for alignment is unknown .");
2239 /* If peeled iterations are unknown, count a taken branch and a not taken
2240 branch per peeled loop. Even if scalar loop iterations are known,
2241 vector iterations are not known since peeled prologue iterations are
2242 not known. Hence guards remain the same. */
2243 peel_guard_costs += 2 * (vect_get_cost (cond_branch_taken)
2244 + vect_get_cost (cond_branch_not_taken));
2245 vec_outside_cost += (peel_iters_prologue * scalar_single_iter_cost)
2246 + (peel_iters_epilogue * scalar_single_iter_cost)
2247 + peel_guard_costs;
2249 else
2251 peel_iters_prologue = npeel;
2252 vec_outside_cost += vect_get_known_peeling_cost (loop_vinfo,
2253 peel_iters_prologue, &peel_iters_epilogue,
2254 scalar_single_iter_cost);
2257 /* FORNOW: The scalar outside cost is incremented in one of the
2258 following ways:
2260 1. The vectorizer checks for alignment and aliasing and generates
2261 a condition that allows dynamic vectorization. A cost model
2262 check is ANDED with the versioning condition. Hence scalar code
2263 path now has the added cost of the versioning check.
2265 if (cost > th & versioning_check)
2266 jmp to vector code
2268 Hence the run-time scalar cost is incremented by a not-taken branch cost.
2270 2. The vectorizer then checks if a prologue is required. If the
2271 cost model check was not done before during versioning, it has to
2272 be done before the prologue check.
2274 if (cost <= th)
2275 prologue = scalar_iters
2276 if (prologue == 0)
2277 jmp to vector code
2278 else
2279 execute prologue
2280 if (prologue == num_iters)
2281 go to exit
2283 Hence the run-time scalar cost is incremented by a taken branch,
2284 plus a not-taken branch, plus a taken branch cost.
2286 3. The vectorizer then checks if an epilogue is required. If the
2287 cost model check was not done before during prologue check, it
2288 has to be done with the epilogue check.
2290 if (prologue == 0)
2291 jmp to vector code
2292 else
2293 execute prologue
2294 if (prologue == num_iters)
2295 go to exit
2296 vector code:
2297 if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
2298 jmp to epilogue
2300 Hence the run-time scalar cost should be incremented by 2 taken
2301 branches.
2303 TODO: The back end may reorder the BBS's differently and reverse
2304 conditions/branch directions. Change the estimates below to
2305 something more reasonable. */
2307 /* If the number of iterations is known and we do not do versioning, we can
2308 decide whether to vectorize at compile time. Hence the scalar version
2309 does not carry cost model guard costs. */
2310 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2311 || LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
2312 || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
2314 /* Cost model check occurs at versioning. */
2315 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
2316 || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
2317 scalar_outside_cost += vect_get_cost (cond_branch_not_taken);
2318 else
2320 /* Cost model check occurs at prologue generation. */
2321 if (LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
2322 scalar_outside_cost += 2 * vect_get_cost (cond_branch_taken)
2323 + vect_get_cost (cond_branch_not_taken);
2324 /* Cost model check occurs at epilogue generation. */
2325 else
2326 scalar_outside_cost += 2 * vect_get_cost (cond_branch_taken);
2330 /* Add SLP costs. */
2331 slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
2332 for (i = 0; VEC_iterate (slp_instance, slp_instances, i, instance); i++)
2334 vec_outside_cost += SLP_INSTANCE_OUTSIDE_OF_LOOP_COST (instance);
2335 vec_inside_cost += SLP_INSTANCE_INSIDE_OF_LOOP_COST (instance);
2338 /* Calculate number of iterations required to make the vector version
2339 profitable, relative to the loop bodies only. The following condition
2340 must hold true:
2341 SIC * niters + SOC > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC
2342 where
2343 SIC = scalar iteration cost, VIC = vector iteration cost,
2344 VOC = vector outside cost, VF = vectorization factor,
2345 PL_ITERS = prologue iterations, EP_ITERS= epilogue iterations
2346 SOC = scalar outside cost for run time cost model check. */
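/* For example, with SIC == 4, VIC == 8, VF == 4, VOC == 40, SOC == 0 and
no peeling, the condition reduces to 4 * niters > 2 * niters + 40, which
first holds at niters == 21. */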
2348 if ((scalar_single_iter_cost * vf) > vec_inside_cost)
2350 if (vec_outside_cost <= 0)
2351 min_profitable_iters = 1;
2352 else
2354 min_profitable_iters = ((vec_outside_cost - scalar_outside_cost) * vf
2355 - vec_inside_cost * peel_iters_prologue
2356 - vec_inside_cost * peel_iters_epilogue)
2357 / ((scalar_single_iter_cost * vf)
2358 - vec_inside_cost);
2360 if ((scalar_single_iter_cost * vf * min_profitable_iters)
2361 <= ((vec_inside_cost * min_profitable_iters)
2362 + ((vec_outside_cost - scalar_outside_cost) * vf)))
2363 min_profitable_iters++;
2366 /* vector version will never be profitable. */
2367 else
2369 if (vect_print_dump_info (REPORT_COST))
2370 fprintf (vect_dump, "cost model: the vector iteration cost = %d "
2371 "divided by the scalar iteration cost = %d "
2372 "is greater or equal to the vectorization factor = %d.",
2373 vec_inside_cost, scalar_single_iter_cost, vf);
2374 return -1;
2377 if (vect_print_dump_info (REPORT_COST))
2379 fprintf (vect_dump, "Cost model analysis: \n");
2380 fprintf (vect_dump, " Vector inside of loop cost: %d\n",
2381 vec_inside_cost);
2382 fprintf (vect_dump, " Vector outside of loop cost: %d\n",
2383 vec_outside_cost);
2384 fprintf (vect_dump, " Scalar iteration cost: %d\n",
2385 scalar_single_iter_cost);
2386 fprintf (vect_dump, " Scalar outside cost: %d\n", scalar_outside_cost);
2387 fprintf (vect_dump, " prologue iterations: %d\n",
2388 peel_iters_prologue);
2389 fprintf (vect_dump, " epilogue iterations: %d\n",
2390 peel_iters_epilogue);
2391 fprintf (vect_dump, " Calculated minimum iters for profitability: %d\n",
2392 min_profitable_iters);
2395 min_profitable_iters =
2396 min_profitable_iters < vf ? vf : min_profitable_iters;
2398 /* Because the condition we create is:
2399 if (niters <= min_profitable_iters)
2400 then skip the vectorized loop. */
2401 min_profitable_iters--;
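/* E.g., if the threshold was raised to VF == 4 above, the guard becomes
"if (niters <= 3)", so the vectorized loop is entered only when at least
one full vector iteration is available. */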
2403 if (vect_print_dump_info (REPORT_COST))
2404 fprintf (vect_dump, " Profitability threshold = %d\n",
2405 min_profitable_iters);
2407 return min_profitable_iters;
2411 /* TODO: Close dependency between vect_model_*_cost and vectorizable_*
2412 functions. Design better to avoid maintenance issues. */
2414 /* Function vect_model_reduction_cost.
2416 Models cost for a reduction operation, including the vector ops
2417 generated within the strip-mine loop, the initial definition before
2418 the loop, and the epilogue code that must be generated. */
2420 static bool
2421 vect_model_reduction_cost (stmt_vec_info stmt_info, enum tree_code reduc_code,
2422 int ncopies)
2424 int outer_cost = 0;
2425 enum tree_code code;
2426 optab optab;
2427 tree vectype;
2428 gimple stmt, orig_stmt;
2429 tree reduction_op;
2430 enum machine_mode mode;
2431 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2432 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2435 /* Cost of reduction op inside loop. */
2436 STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info)
2437 += ncopies * vect_get_cost (vector_stmt);
2439 stmt = STMT_VINFO_STMT (stmt_info);
2441 switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
2443 case GIMPLE_SINGLE_RHS:
2444 gcc_assert (TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt)) == ternary_op);
2445 reduction_op = TREE_OPERAND (gimple_assign_rhs1 (stmt), 2);
2446 break;
2447 case GIMPLE_UNARY_RHS:
2448 reduction_op = gimple_assign_rhs1 (stmt);
2449 break;
2450 case GIMPLE_BINARY_RHS:
2451 reduction_op = gimple_assign_rhs2 (stmt);
2452 break;
2453 default:
2454 gcc_unreachable ();
2457 vectype = get_vectype_for_scalar_type (TREE_TYPE (reduction_op));
2458 if (!vectype)
2460 if (vect_print_dump_info (REPORT_COST))
2462 fprintf (vect_dump, "unsupported data-type ");
2463 print_generic_expr (vect_dump, TREE_TYPE (reduction_op), TDF_SLIM);
2465 return false;
2468 mode = TYPE_MODE (vectype);
2469 orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
2471 if (!orig_stmt)
2472 orig_stmt = STMT_VINFO_STMT (stmt_info);
2474 code = gimple_assign_rhs_code (orig_stmt);
2476 /* Add in cost for initial definition. */
2477 outer_cost += vect_get_cost (scalar_to_vec);
2479 /* Determine cost of epilogue code.
2481 We have a reduction operator that will reduce the vector in one statement.
2482 Also requires scalar extract. */
2484 if (!nested_in_vect_loop_p (loop, orig_stmt))
2486 if (reduc_code != ERROR_MARK)
2487 outer_cost += vect_get_cost (vector_stmt)
2488 + vect_get_cost (vec_to_scalar);
2489 else
2491 int vec_size_in_bits = tree_low_cst (TYPE_SIZE (vectype), 1);
2492 tree bitsize =
2493 TYPE_SIZE (TREE_TYPE (gimple_assign_lhs (orig_stmt)));
2494 int element_bitsize = tree_low_cst (bitsize, 1);
2495 int nelements = vec_size_in_bits / element_bitsize;
2497 optab = optab_for_tree_code (code, vectype, optab_default);
2499 /* We have a whole vector shift available. */
2500 if (VECTOR_MODE_P (mode)
2501 && optab_handler (optab, mode) != CODE_FOR_nothing
2502 && optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
2503 /* Final reduction via vector shifts and the reduction operator. Also
2504 requires scalar extract. */
2505 outer_cost += ((exact_log2 (nelements) * 2)
2506 * vect_get_cost (vector_stmt)
2507 + vect_get_cost (vec_to_scalar));
2508 else
2509 /* Use extracts and reduction op for final reduction. For N elements,
2510 we have N extracts and N-1 reduction ops. */
2511 outer_cost += ((nelements + nelements - 1)
2512 * vect_get_cost (vector_stmt));
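/* E.g., for a V8HI reduction (nelements == 8) the whole-vector-shift scheme
above costs 2 * log2 (8) == 6 vector stmts plus the scalar extract, while
the extract-based fallback costs 8 + 7 == 15 vector stmts. */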
2516 STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info) = outer_cost;
2518 if (vect_print_dump_info (REPORT_COST))
2519 fprintf (vect_dump, "vect_model_reduction_cost: inside_cost = %d, "
2520 "outside_cost = %d .", STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info),
2521 STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info));
2523 return true;
2527 /* Function vect_model_induction_cost.
2529 Models cost for induction operations. */
2531 static void
2532 vect_model_induction_cost (stmt_vec_info stmt_info, int ncopies)
2534 /* loop cost for vec_loop. */
2535 STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info)
2536 = ncopies * vect_get_cost (vector_stmt);
2537 /* prologue cost for vec_init and vec_step. */
2538 STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info)
2539 = 2 * vect_get_cost (scalar_to_vec);
2541 if (vect_print_dump_info (REPORT_COST))
2542 fprintf (vect_dump, "vect_model_induction_cost: inside_cost = %d, "
2543 "outside_cost = %d .", STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info),
2544 STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info));
2548 /* Function get_initial_def_for_induction
2550 Input:
2551 STMT - a stmt that performs an induction operation in the loop.
2552 IV_PHI - the initial value of the induction variable
2554 Output:
2555 Return a vector variable, initialized with the first VF values of
2556 the induction variable. E.g., for an iv with IV_PHI='X' and
2557 evolution S, for a vector of 4 units, we want to return:
2558 [X, X + S, X + 2*S, X + 3*S]. */
2560 static tree
2561 get_initial_def_for_induction (gimple iv_phi)
2563 stmt_vec_info stmt_vinfo = vinfo_for_stmt (iv_phi);
2564 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
2565 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2566 tree scalar_type = TREE_TYPE (gimple_phi_result (iv_phi));
2567 tree vectype;
2568 int nunits;
2569 edge pe = loop_preheader_edge (loop);
2570 struct loop *iv_loop;
2571 basic_block new_bb;
2572 tree vec, vec_init, vec_step, t;
2573 tree access_fn;
2574 tree new_var;
2575 tree new_name;
2576 gimple init_stmt, induction_phi, new_stmt;
2577 tree induc_def, vec_def, vec_dest;
2578 tree init_expr, step_expr;
2579 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2580 int i;
2581 bool ok;
2582 int ncopies;
2583 tree expr;
2584 stmt_vec_info phi_info = vinfo_for_stmt (iv_phi);
2585 bool nested_in_vect_loop = false;
2586 gimple_seq stmts = NULL;
2587 imm_use_iterator imm_iter;
2588 use_operand_p use_p;
2589 gimple exit_phi;
2590 edge latch_e;
2591 tree loop_arg;
2592 gimple_stmt_iterator si;
2593 basic_block bb = gimple_bb (iv_phi);
2594 tree stepvectype;
2596 vectype = get_vectype_for_scalar_type (scalar_type);
2597 gcc_assert (vectype);
2598 nunits = TYPE_VECTOR_SUBPARTS (vectype);
2599 ncopies = vf / nunits;
2601 gcc_assert (phi_info);
2602 gcc_assert (ncopies >= 1);
2604 /* Find the first insertion point in the BB. */
2605 si = gsi_after_labels (bb);
2607 if (INTEGRAL_TYPE_P (scalar_type))
2608 step_expr = build_int_cst (scalar_type, 0);
2609 else if (POINTER_TYPE_P (scalar_type))
2610 step_expr = build_int_cst (sizetype, 0);
2611 else
2612 step_expr = build_real (scalar_type, dconst0);
2614 /* Is phi in an inner-loop, while vectorizing an enclosing outer-loop? */
2615 if (nested_in_vect_loop_p (loop, iv_phi))
2617 nested_in_vect_loop = true;
2618 iv_loop = loop->inner;
2620 else
2621 iv_loop = loop;
2622 gcc_assert (iv_loop == (gimple_bb (iv_phi))->loop_father);
2624 latch_e = loop_latch_edge (iv_loop);
2625 loop_arg = PHI_ARG_DEF_FROM_EDGE (iv_phi, latch_e);
2627 access_fn = analyze_scalar_evolution (iv_loop, PHI_RESULT (iv_phi));
2628 gcc_assert (access_fn);
2629 ok = vect_is_simple_iv_evolution (iv_loop->num, access_fn,
2630 &init_expr, &step_expr);
2631 gcc_assert (ok);
2632 pe = loop_preheader_edge (iv_loop);
2634 /* Create the vector that holds the initial_value of the induction. */
2635 if (nested_in_vect_loop)
2637 /* iv_loop is nested in the loop to be vectorized. init_expr has already
2638 been created during vectorization of previous stmts; we obtain it from
2639 the STMT_VINFO_VEC_STMT of the defining stmt. */
2640 tree iv_def = PHI_ARG_DEF_FROM_EDGE (iv_phi,
2641 loop_preheader_edge (iv_loop));
2642 vec_init = vect_get_vec_def_for_operand (iv_def, iv_phi, NULL);
2644 else
2646 /* iv_loop is the loop to be vectorized. Create:
2647 vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr) */
2648 new_var = vect_get_new_vect_var (scalar_type, vect_scalar_var, "var_");
2649 add_referenced_var (new_var);
2651 new_name = force_gimple_operand (init_expr, &stmts, false, new_var);
2652 if (stmts)
2654 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
2655 gcc_assert (!new_bb);
2658 t = NULL_TREE;
2659 t = tree_cons (NULL_TREE, init_expr, t);
2660 for (i = 1; i < nunits; i++)
2662 /* Create: new_name_i = new_name + step_expr */
2663 enum tree_code code = POINTER_TYPE_P (scalar_type)
2664 ? POINTER_PLUS_EXPR : PLUS_EXPR;
2665 init_stmt = gimple_build_assign_with_ops (code, new_var,
2666 new_name, step_expr);
2667 new_name = make_ssa_name (new_var, init_stmt);
2668 gimple_assign_set_lhs (init_stmt, new_name);
2670 new_bb = gsi_insert_on_edge_immediate (pe, init_stmt);
2671 gcc_assert (!new_bb);
2673 if (vect_print_dump_info (REPORT_DETAILS))
2675 fprintf (vect_dump, "created new init_stmt: ");
2676 print_gimple_stmt (vect_dump, init_stmt, 0, TDF_SLIM);
2678 t = tree_cons (NULL_TREE, new_name, t);
2680 /* Create a vector from [new_name_0, new_name_1, ..., new_name_nunits-1] */
2681 vec = build_constructor_from_list (vectype, nreverse (t));
2682 vec_init = vect_init_vector (iv_phi, vec, vectype, NULL);
2686 /* Create the vector that holds the step of the induction. */
2687 if (nested_in_vect_loop)
2688 /* iv_loop is nested in the loop to be vectorized. Generate:
2689 vec_step = [S, S, S, S] */
2690 new_name = step_expr;
2691 else
2693 /* iv_loop is the loop to be vectorized. Generate:
2694 vec_step = [VF*S, VF*S, VF*S, VF*S] */
2695 expr = build_int_cst (TREE_TYPE (step_expr), vf);
2696 new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
2697 expr, step_expr);
2700 t = NULL_TREE;
2701 for (i = 0; i < nunits; i++)
2702 t = tree_cons (NULL_TREE, unshare_expr (new_name), t);
2703 gcc_assert (CONSTANT_CLASS_P (new_name));
2704 stepvectype = get_vectype_for_scalar_type (TREE_TYPE (new_name));
2705 gcc_assert (stepvectype);
2706 vec = build_vector (stepvectype, t);
2707 vec_step = vect_init_vector (iv_phi, vec, stepvectype, NULL);
2710 /* Create the following def-use cycle:
2711 loop prolog:
2712 vec_init = ...
2713 vec_step = ...
2714 loop:
2715 vec_iv = PHI <vec_init, vec_loop>
2717 STMT
2719 vec_loop = vec_iv + vec_step; */
2721 /* Create the induction-phi that defines the induction-operand. */
2722 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
2723 add_referenced_var (vec_dest);
2724 induction_phi = create_phi_node (vec_dest, iv_loop->header);
2725 set_vinfo_for_stmt (induction_phi,
2726 new_stmt_vec_info (induction_phi, loop_vinfo, NULL));
2727 induc_def = PHI_RESULT (induction_phi);
2729 /* Create the iv update inside the loop */
2730 new_stmt = gimple_build_assign_with_ops (PLUS_EXPR, vec_dest,
2731 induc_def, vec_step);
2732 vec_def = make_ssa_name (vec_dest, new_stmt);
2733 gimple_assign_set_lhs (new_stmt, vec_def);
2734 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
2735 set_vinfo_for_stmt (new_stmt, new_stmt_vec_info (new_stmt, loop_vinfo,
2736 NULL));
2738 /* Set the arguments of the phi node: */
2739 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
2740 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
2741 UNKNOWN_LOCATION);
2744 /* In case the vectorization factor (VF) is bigger than the number
2745 of elements that we can fit in a vectype (nunits), we have to generate
2746 more than one vector stmt, i.e., we need to "unroll" the
2747 vector stmt by a factor of VF/nunits. For more details see documentation
2748 in vectorizable_operation. */
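/* E.g., for VF == 8 and V4SI (nunits == 4, so ncopies == 2), the first copy
is the PHI result [X, X+S, X+2*S, X+3*S], and the second is obtained by
adding [4*S, 4*S, 4*S, 4*S] to it, giving [X+4*S, X+5*S, X+6*S, X+7*S]. */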
2750 if (ncopies > 1)
2752 stmt_vec_info prev_stmt_vinfo;
2753 /* FORNOW. This restriction should be relaxed. */
2754 gcc_assert (!nested_in_vect_loop);
2756 /* Create the vector that holds the step of the induction. */
2757 expr = build_int_cst (TREE_TYPE (step_expr), nunits);
2758 new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
2759 expr, step_expr);
2760 t = NULL_TREE;
2761 for (i = 0; i < nunits; i++)
2762 t = tree_cons (NULL_TREE, unshare_expr (new_name), t);
2763 gcc_assert (CONSTANT_CLASS_P (new_name));
2764 vec = build_vector (stepvectype, t);
2765 vec_step = vect_init_vector (iv_phi, vec, stepvectype, NULL);
2767 vec_def = induc_def;
2768 prev_stmt_vinfo = vinfo_for_stmt (induction_phi);
2769 for (i = 1; i < ncopies; i++)
2771 /* vec_i = vec_prev + vec_step */
2772 new_stmt = gimple_build_assign_with_ops (PLUS_EXPR, vec_dest,
2773 vec_def, vec_step);
2774 vec_def = make_ssa_name (vec_dest, new_stmt);
2775 gimple_assign_set_lhs (new_stmt, vec_def);
2777 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
2778 set_vinfo_for_stmt (new_stmt,
2779 new_stmt_vec_info (new_stmt, loop_vinfo, NULL));
2780 STMT_VINFO_RELATED_STMT (prev_stmt_vinfo) = new_stmt;
2781 prev_stmt_vinfo = vinfo_for_stmt (new_stmt);
2785 if (nested_in_vect_loop)
2787 /* Find the loop-closed exit-phi of the induction, and record
2788 the final vector of induction results: */
2789 exit_phi = NULL;
2790 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
2792 if (!flow_bb_inside_loop_p (iv_loop, gimple_bb (USE_STMT (use_p))))
2794 exit_phi = USE_STMT (use_p);
2795 break;
2798 if (exit_phi)
2800 stmt_vec_info stmt_vinfo = vinfo_for_stmt (exit_phi);
2801 /* FORNOW. Currently not supporting the case that an inner-loop induction
2802 is not used in the outer-loop (i.e. only outside the outer-loop). */
2803 gcc_assert (STMT_VINFO_RELEVANT_P (stmt_vinfo)
2804 && !STMT_VINFO_LIVE_P (stmt_vinfo));
2806 STMT_VINFO_VEC_STMT (stmt_vinfo) = new_stmt;
2807 if (vect_print_dump_info (REPORT_DETAILS))
2809 fprintf (vect_dump, "vector of inductions after inner-loop:");
2810 print_gimple_stmt (vect_dump, new_stmt, 0, TDF_SLIM);
2816 if (vect_print_dump_info (REPORT_DETAILS))
2818 fprintf (vect_dump, "transform induction: created def-use cycle: ");
2819 print_gimple_stmt (vect_dump, induction_phi, 0, TDF_SLIM);
2820 fprintf (vect_dump, "\n");
2821 print_gimple_stmt (vect_dump, SSA_NAME_DEF_STMT (vec_def), 0, TDF_SLIM);
2824 STMT_VINFO_VEC_STMT (phi_info) = induction_phi;
2825 return induc_def;
2829 /* Function get_initial_def_for_reduction
2831 Input:
2832 STMT - a stmt that performs a reduction operation in the loop.
2833 INIT_VAL - the initial value of the reduction variable
2835 Output:
2836 ADJUSTMENT_DEF - a tree that holds a value to be added to the final result
2837 of the reduction (used for adjusting the epilog - see below).
2838 Return a vector variable, initialized according to the operation that STMT
2839 performs. This vector will be used as the initial value of the
2840 vector of partial results.
2842 Option1 (adjust in epilog): Initialize the vector as follows:
2843 add/bit or/xor: [0,0,...,0,0]
2844 mult/bit and: [1,1,...,1,1]
2845 min/max/cond_expr: [init_val,init_val,..,init_val,init_val]
2846 and when necessary (e.g. add/mult case) let the caller know
2847 that it needs to adjust the result by init_val.
2849 Option2: Initialize the vector as follows:
2850 add/bit or/xor: [init_val,0,0,...,0]
2851 mult/bit and: [init_val,1,1,...,1]
2852 min/max/cond_expr: [init_val,init_val,...,init_val]
2853 and no adjustments are needed.
2855 For example, for the following code:
2857 s = init_val;
2858 for (i=0;i<n;i++)
2859 s = s + a[i];
2861 STMT is 's = s + a[i]', and the reduction variable is 's'.
2862 For a vector of 4 units, we want to return either [0,0,0,init_val],
2863 or [0,0,0,0] and let the caller know that it needs to adjust
2864 the result at the end by 'init_val'.
2866 FORNOW, we are using the 'adjust in epilog' scheme, because this way the
2867 initialization vector is simpler (same element in all entries), if
2868 ADJUSTMENT_DEF is not NULL, and Option2 otherwise.
2870 A cost model should help decide between these two schemes. */
2872 tree
2873 get_initial_def_for_reduction (gimple stmt, tree init_val,
2874 tree *adjustment_def)
2876 stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
2877 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
2878 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2879 tree scalar_type = TREE_TYPE (init_val);
2880 tree vectype = get_vectype_for_scalar_type (scalar_type);
2881 int nunits;
2882 enum tree_code code = gimple_assign_rhs_code (stmt);
2883 tree def_for_init;
2884 tree init_def;
2885 tree t = NULL_TREE;
2886 int i;
2887 bool nested_in_vect_loop = false;
2888 tree init_value;
2889 REAL_VALUE_TYPE real_init_val = dconst0;
2890 int int_init_val = 0;
2891 gimple def_stmt = NULL;
2893 gcc_assert (vectype);
2894 nunits = TYPE_VECTOR_SUBPARTS (vectype);
2896 gcc_assert (POINTER_TYPE_P (scalar_type) || INTEGRAL_TYPE_P (scalar_type)
2897 || SCALAR_FLOAT_TYPE_P (scalar_type));
2899 if (nested_in_vect_loop_p (loop, stmt))
2900 nested_in_vect_loop = true;
2901 else
2902 gcc_assert (loop == (gimple_bb (stmt))->loop_father);
2904 /* In case of double reduction we only create a vector variable to be put
2905 in the reduction phi node. The actual statement creation is done in
2906 vect_create_epilog_for_reduction. */
2907 if (adjustment_def && nested_in_vect_loop
2908 && TREE_CODE (init_val) == SSA_NAME
2909 && (def_stmt = SSA_NAME_DEF_STMT (init_val))
2910 && gimple_code (def_stmt) == GIMPLE_PHI
2911 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2912 && vinfo_for_stmt (def_stmt)
2913 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2914 == vect_double_reduction_def)
2916 *adjustment_def = NULL;
2917 return vect_create_destination_var (init_val, vectype);
2920 if (TREE_CONSTANT (init_val))
2922 if (SCALAR_FLOAT_TYPE_P (scalar_type))
2923 init_value = build_real (scalar_type, TREE_REAL_CST (init_val));
2924 else
2925 init_value = build_int_cst (scalar_type, TREE_INT_CST_LOW (init_val));
2927 else
2928 init_value = init_val;
2930 switch (code)
2932 case WIDEN_SUM_EXPR:
2933 case DOT_PROD_EXPR:
2934 case PLUS_EXPR:
2935 case MINUS_EXPR:
2936 case BIT_IOR_EXPR:
2937 case BIT_XOR_EXPR:
2938 case MULT_EXPR:
2939 case BIT_AND_EXPR:
2940 /* ADJUSTMENT_DEF is NULL when called from
2941 vect_create_epilog_for_reduction to vectorize double reduction. */
2942 if (adjustment_def)
2944 if (nested_in_vect_loop)
2945 *adjustment_def = vect_get_vec_def_for_operand (init_val, stmt,
2946 NULL);
2947 else
2948 *adjustment_def = init_val;
2951 if (code == MULT_EXPR)
2953 real_init_val = dconst1;
2954 int_init_val = 1;
2957 if (code == BIT_AND_EXPR)
2958 int_init_val = -1;
2960 if (SCALAR_FLOAT_TYPE_P (scalar_type))
2961 def_for_init = build_real (scalar_type, real_init_val);
2962 else
2963 def_for_init = build_int_cst (scalar_type, int_init_val);
2965 /* Create a vector of '0' or '1' except the first element. */
2966 for (i = nunits - 2; i >= 0; --i)
2967 t = tree_cons (NULL_TREE, def_for_init, t);
2969 /* Option1: the first element is '0' or '1' as well. */
2970 if (adjustment_def)
2972 t = tree_cons (NULL_TREE, def_for_init, t);
2973 init_def = build_vector (vectype, t);
2974 break;
2977 /* Option2: the first element is INIT_VAL. */
2978 t = tree_cons (NULL_TREE, init_value, t);
2979 if (TREE_CONSTANT (init_val))
2980 init_def = build_vector (vectype, t);
2981 else
2982 init_def = build_constructor_from_list (vectype, t);
2984 break;
2986 case MIN_EXPR:
2987 case MAX_EXPR:
2988 case COND_EXPR:
2989 if (adjustment_def)
2991 *adjustment_def = NULL_TREE;
2992 init_def = vect_get_vec_def_for_operand (init_val, stmt, NULL);
2993 break;
2996 for (i = nunits - 1; i >= 0; --i)
2997 t = tree_cons (NULL_TREE, init_value, t);
2999 if (TREE_CONSTANT (init_val))
3000 init_def = build_vector (vectype, t);
3001 else
3002 init_def = build_constructor_from_list (vectype, t);
3004 break;
3006 default:
3007 gcc_unreachable ();
3010 return init_def;
3014 /* Function vect_create_epilog_for_reduction
3016 Create code at the loop-epilog to finalize the result of a reduction
3017 computation.
3019 VECT_DEFS is list of vector of partial results, i.e., the lhs's of vector
3020 reduction statements.
3021 STMT is the scalar reduction stmt that is being vectorized.
3022 NCOPIES is > 1 in case the vectorization factor (VF) is bigger than the
3023 number of elements that we can fit in a vectype (nunits). In this case
3024 we have to generate more than one vector stmt, i.e., we need to "unroll"
3025 the vector stmt by a factor VF/nunits. For more details see documentation
3026 in vectorizable_operation.
3027 REDUC_CODE is the tree-code for the epilog reduction.
3028 REDUCTION_PHIS is a list of the phi-nodes that carry the reduction
3029 computation.
3030 REDUC_INDEX is the index of the operand in the right hand side of the
3031 statement that is defined by REDUCTION_PHI.
3032 DOUBLE_REDUC is TRUE if double reduction phi nodes should be handled.
3033 SLP_NODE is an SLP node containing a group of reduction statements. The
3034 first one in this group is STMT.
3036 This function:
3037 1. Creates the reduction def-use cycles: sets the arguments for
3038 REDUCTION_PHIS:
3039 The loop-entry argument is the vectorized initial-value of the reduction.
3040 The loop-latch argument is taken from VECT_DEFS - the vector of partial
3041 sums.
3042 2. "Reduces" each vector of partial results VECT_DEFS into a single result,
3043 by applying the operation specified by REDUC_CODE if available, or by
3044 other means (whole-vector shifts or a scalar loop).
3045 The function also creates a new phi node at the loop exit to preserve
3046 loop-closed form, as illustrated below.
3048 The flow at the entry to this function:
3050 loop:
3051 vec_def = phi <null, null> # REDUCTION_PHI
3052 VECT_DEF = vector_stmt # vectorized form of STMT
3053 s_loop = scalar_stmt # (scalar) STMT
3054 loop_exit:
3055 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
3056 use <s_out0>
3057 use <s_out0>
3059 The above is transformed by this function into:
3061 loop:
3062 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
3063 VECT_DEF = vector_stmt # vectorized form of STMT
3064 s_loop = scalar_stmt # (scalar) STMT
3065 loop_exit:
3066 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
3067 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
3068 v_out2 = reduce <v_out1>
3069 s_out3 = extract_field <v_out2, 0>
3070 s_out4 = adjust_result <s_out3>
3071 use <s_out4>
3072 use <s_out4>
3075 static void
3076 vect_create_epilog_for_reduction (VEC (tree, heap) *vect_defs, gimple stmt,
3077 int ncopies, enum tree_code reduc_code,
3078 VEC (gimple, heap) *reduction_phis,
3079 int reduc_index, bool double_reduc,
3080 slp_tree slp_node)
3082 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3083 stmt_vec_info prev_phi_info;
3084 tree vectype;
3085 enum machine_mode mode;
3086 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3087 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *outer_loop = NULL;
3088 basic_block exit_bb;
3089 tree scalar_dest;
3090 tree scalar_type;
3091 gimple new_phi = NULL, phi;
3092 gimple_stmt_iterator exit_gsi;
3093 tree vec_dest;
3094 tree new_temp = NULL_TREE, new_dest, new_name, new_scalar_dest;
3095 gimple epilog_stmt = NULL;
3096 enum tree_code code = gimple_assign_rhs_code (stmt);
3097 gimple exit_phi;
3098 tree bitsize, bitpos;
3099 tree adjustment_def = NULL;
3100 tree vec_initial_def = NULL;
3101 tree reduction_op, expr, def;
3102 tree orig_name, scalar_result;
3103 imm_use_iterator imm_iter;
3104 use_operand_p use_p;
3105 bool extract_scalar_result = false;
3106 gimple use_stmt, orig_stmt, reduction_phi = NULL;
3107 bool nested_in_vect_loop = false;
3108 VEC (gimple, heap) *new_phis = NULL;
3109 enum vect_def_type dt = vect_unknown_def_type;
3110 int j, i;
3111 VEC (tree, heap) *scalar_results = NULL;
3112 unsigned int group_size = 1, k, ratio;
3113 VEC (tree, heap) *vec_initial_defs = NULL;
3114 VEC (gimple, heap) *phis;
3116 if (slp_node)
3117 group_size = VEC_length (gimple, SLP_TREE_SCALAR_STMTS (slp_node));
3119 if (nested_in_vect_loop_p (loop, stmt))
3121 outer_loop = loop;
3122 loop = loop->inner;
3123 nested_in_vect_loop = true;
3124 gcc_assert (!slp_node);
3127 switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
3129 case GIMPLE_SINGLE_RHS:
3130 gcc_assert (TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt))
3131 == ternary_op);
3132 reduction_op = TREE_OPERAND (gimple_assign_rhs1 (stmt), reduc_index);
3133 break;
3134 case GIMPLE_UNARY_RHS:
3135 reduction_op = gimple_assign_rhs1 (stmt);
3136 break;
3137 case GIMPLE_BINARY_RHS:
3138 reduction_op = reduc_index ?
3139 gimple_assign_rhs2 (stmt) : gimple_assign_rhs1 (stmt);
3140 break;
3141 default:
3142 gcc_unreachable ();
3145 vectype = get_vectype_for_scalar_type (TREE_TYPE (reduction_op));
3146 gcc_assert (vectype);
3147 mode = TYPE_MODE (vectype);
3149 /* 1. Create the reduction def-use cycle:
3150 Set the arguments of REDUCTION_PHIS, i.e., transform
3152 loop:
3153 vec_def = phi <null, null> # REDUCTION_PHI
3154 VECT_DEF = vector_stmt # vectorized form of STMT
3157 into:
3159 loop:
3160 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
3161 VECT_DEF = vector_stmt # vectorized form of STMT
3164 (in case of SLP, do it for all the phis). */
3166 /* Get the loop-entry arguments. */
3167 if (slp_node)
3168 vect_get_slp_defs (slp_node, &vec_initial_defs, NULL, reduc_index);
3169 else
3171 vec_initial_defs = VEC_alloc (tree, heap, 1);
3172 /* For the case of reduction, vect_get_vec_def_for_operand returns
3173 the scalar def before the loop, that defines the initial value
3174 of the reduction variable. */
3175 vec_initial_def = vect_get_vec_def_for_operand (reduction_op, stmt,
3176 &adjustment_def);
3177 VEC_quick_push (tree, vec_initial_defs, vec_initial_def);
3180 /* Set phi nodes arguments. */
3181 for (i = 0; VEC_iterate (gimple, reduction_phis, i, phi); i++)
3183 tree vec_init_def = VEC_index (tree, vec_initial_defs, i);
3184 tree def = VEC_index (tree, vect_defs, i);
3185 for (j = 0; j < ncopies; j++)
3187 /* Set the loop-entry arg of the reduction-phi. */
3188 add_phi_arg (phi, vec_init_def, loop_preheader_edge (loop),
3189 UNKNOWN_LOCATION);
3191 /* Set the loop-latch arg for the reduction-phi. */
3192 if (j > 0)
3193 def = vect_get_vec_def_for_stmt_copy (vect_unknown_def_type, def);
3195 add_phi_arg (phi, def, loop_latch_edge (loop), UNKNOWN_LOCATION);
3197 if (vect_print_dump_info (REPORT_DETAILS))
3199 fprintf (vect_dump, "transform reduction: created def-use"
3200 " cycle: ");
3201 print_gimple_stmt (vect_dump, phi, 0, TDF_SLIM);
3202 fprintf (vect_dump, "\n");
3203 print_gimple_stmt (vect_dump, SSA_NAME_DEF_STMT (def), 0,
3204 TDF_SLIM);
3207 phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi));
3211 VEC_free (tree, heap, vec_initial_defs);
3213 /* 2. Create epilog code.
3214 The reduction epilog code operates across the elements of the vector
3215 of partial results computed by the vectorized loop.
3216 The reduction epilog code consists of:
3218 step 1: compute the scalar result in a vector (v_out2)
3219 step 2: extract the scalar result (s_out3) from the vector (v_out2)
3220 step 3: adjust the scalar result (s_out3) if needed.
3222 Step 1 can be accomplished using one of the following three schemes:
3223 (scheme 1) using reduc_code, if available.
3224 (scheme 2) using whole-vector shifts, if available.
3225 (scheme 3) using a scalar loop. In this case steps 1+2 above are
3226 combined.
3228 The overall epilog code looks like this:
3230 s_out0 = phi <s_loop> # original EXIT_PHI
3231 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
3232 v_out2 = reduce <v_out1> # step 1
3233 s_out3 = extract_field <v_out2, 0> # step 2
3234 s_out4 = adjust_result <s_out3> # step 3
3236 (step 3 is optional, and steps 1 and 2 may be combined).
3237 Lastly, the uses of s_out0 are replaced by s_out4. */
3240 /* 2.1 Create new loop-exit-phis to preserve loop-closed form:
3241 v_out1 = phi <VECT_DEF>
3242 Store them in NEW_PHIS. */
3244 exit_bb = single_exit (loop)->dest;
3245 prev_phi_info = NULL;
3246 new_phis = VEC_alloc (gimple, heap, VEC_length (tree, vect_defs));
3247 for (i = 0; VEC_iterate (tree, vect_defs, i, def); i++)
3249 for (j = 0; j < ncopies; j++)
3251 phi = create_phi_node (SSA_NAME_VAR (def), exit_bb);
3252 set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, loop_vinfo, NULL));
3253 if (j == 0)
3254 VEC_quick_push (gimple, new_phis, phi);
3255 else
3257 def = vect_get_vec_def_for_stmt_copy (dt, def);
3258 STMT_VINFO_RELATED_STMT (prev_phi_info) = phi;
3261 SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, def);
3262 prev_phi_info = vinfo_for_stmt (phi);
3266 exit_gsi = gsi_after_labels (exit_bb);
3268 /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
3269 (i.e. when reduc_code is not available) and in the final adjustment
3270 code (if needed). Also get the original scalar reduction variable as
3271 defined in the loop. In case STMT is a "pattern-stmt" (i.e. - it
3272 represents a reduction pattern), the tree-code and scalar-def are
3273 taken from the original stmt that the pattern-stmt (STMT) replaces.
3274 Otherwise (it is a regular reduction) - the tree-code and scalar-def
3275 are taken from STMT. */
3277 orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
3278 if (!orig_stmt)
3280 /* Regular reduction */
3281 orig_stmt = stmt;
3283 else
3285 /* Reduction pattern */
3286 stmt_vec_info stmt_vinfo = vinfo_for_stmt (orig_stmt);
3287 gcc_assert (STMT_VINFO_IN_PATTERN_P (stmt_vinfo));
3288 gcc_assert (STMT_VINFO_RELATED_STMT (stmt_vinfo) == stmt);
3291 code = gimple_assign_rhs_code (orig_stmt);
3292 /* For MINUS_EXPR the initial vector is [init_val,0,...,0], therefore
3293 partial results are added and not subtracted. */
3294 if (code == MINUS_EXPR)
3295 code = PLUS_EXPR;
3297 scalar_dest = gimple_assign_lhs (orig_stmt);
3298 scalar_type = TREE_TYPE (scalar_dest);
3299 scalar_results = VEC_alloc (tree, heap, group_size);
3300 new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
3301 bitsize = TYPE_SIZE (scalar_type);
3303 /* In case this is a reduction in an inner-loop while vectorizing an outer
3304 loop - we don't need to extract a single scalar result at the end of the
3305 inner-loop (unless it is double reduction, i.e., the use of reduction is
3306 outside the outer-loop). The final vector of partial results will be used
3307 in the vectorized outer-loop, or reduced to a scalar result at the end of
3308 the outer-loop. */
3309 if (nested_in_vect_loop && !double_reduc)
3310 goto vect_finalize_reduction;
3312 /* 2.3 Create the reduction code, using one of the three schemes described
3313 above. In SLP we simply need to extract all the elements from the
3314 vector (without reducing them), so we use scalar shifts. */
3315 if (reduc_code != ERROR_MARK && !slp_node)
3317 tree tmp;
3319 /*** Case 1: Create:
3320 v_out2 = reduc_expr <v_out1> */
3322 if (vect_print_dump_info (REPORT_DETAILS))
3323 fprintf (vect_dump, "Reduce using direct vector reduction.");
3325 vec_dest = vect_create_destination_var (scalar_dest, vectype);
3326 new_phi = VEC_index (gimple, new_phis, 0);
3327 tmp = build1 (reduc_code, vectype, PHI_RESULT (new_phi));
3328 epilog_stmt = gimple_build_assign (vec_dest, tmp);
3329 new_temp = make_ssa_name (vec_dest, epilog_stmt);
3330 gimple_assign_set_lhs (epilog_stmt, new_temp);
3331 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
3333 extract_scalar_result = true;
3335 else
3337 enum tree_code shift_code = ERROR_MARK;
3338 bool have_whole_vector_shift = true;
3339 int bit_offset;
3340 int element_bitsize = tree_low_cst (bitsize, 1);
3341 int vec_size_in_bits = tree_low_cst (TYPE_SIZE (vectype), 1);
3342 tree vec_temp;
3344 if (optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
3345 shift_code = VEC_RSHIFT_EXPR;
3346 else
3347 have_whole_vector_shift = false;
3349 /* Regardless of whether we have a whole vector shift, if we're
3350 emulating the operation via tree-vect-generic, we don't want
3351 to use it. Only the first round of the reduction is likely
3352 to still be profitable via emulation. */
3353 /* ??? It might be better to emit a reduction tree code here, so that
3354 tree-vect-generic can expand the first round via bit tricks. */
3355 if (!VECTOR_MODE_P (mode))
3356 have_whole_vector_shift = false;
3357 else
3359 optab optab = optab_for_tree_code (code, vectype, optab_default);
3360 if (optab_handler (optab, mode) == CODE_FOR_nothing)
3361 have_whole_vector_shift = false;
3364 if (have_whole_vector_shift && !slp_node)
3366 /*** Case 2: Create:
3367 for (offset = VS/2; offset >= element_size; offset/=2)
3369 Create: va' = vec_shift <va, offset>
3370 Create: va = vop <va, va'>
3371 } */
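/* E.g., for a V4SI vector of partial results (vec_size_in_bits == 128,
element_bitsize == 32) this emits shifts by 64 and then 32 bits, i.e.
log2 (4) == 2 shift/op pairs. */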
3373 if (vect_print_dump_info (REPORT_DETAILS))
3374 fprintf (vect_dump, "Reduce using vector shifts");
3376 vec_dest = vect_create_destination_var (scalar_dest, vectype);
3377 new_phi = VEC_index (gimple, new_phis, 0);
3378 new_temp = PHI_RESULT (new_phi);
3379 for (bit_offset = vec_size_in_bits/2;
3380 bit_offset >= element_bitsize;
3381 bit_offset /= 2)
3383 tree bitpos = size_int (bit_offset);
3385 epilog_stmt = gimple_build_assign_with_ops (shift_code,
3386 vec_dest, new_temp, bitpos);
3387 new_name = make_ssa_name (vec_dest, epilog_stmt);
3388 gimple_assign_set_lhs (epilog_stmt, new_name);
3389 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
3391 epilog_stmt = gimple_build_assign_with_ops (code, vec_dest,
3392 new_name, new_temp);
3393 new_temp = make_ssa_name (vec_dest, epilog_stmt);
3394 gimple_assign_set_lhs (epilog_stmt, new_temp);
3395 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
3398 extract_scalar_result = true;
3400 else
3402 tree rhs;
3404 /*** Case 3: Create:
3405 s = extract_field <v_out2, 0>
3406 for (offset = element_size;
3407 offset < vector_size;
3408 offset += element_size;)
3410 Create: s' = extract_field <v_out2, offset>
3411 Create: s = op <s, s'> // For non SLP cases
3412 } */
3414 if (vect_print_dump_info (REPORT_DETAILS))
3415 fprintf (vect_dump, "Reduce using scalar code. ");
3417 vec_size_in_bits = tree_low_cst (TYPE_SIZE (vectype), 1);
3418 for (i = 0; VEC_iterate (gimple, new_phis, i, new_phi); i++)
3420 vec_temp = PHI_RESULT (new_phi);
3421 rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, bitsize,
3422 bitsize_zero_node);
3423 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
3424 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
3425 gimple_assign_set_lhs (epilog_stmt, new_temp);
3426 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
3428 /* In SLP we don't need to apply reduction operation, so we just
3429 collect s' values in SCALAR_RESULTS. */
3430 if (slp_node)
3431 VEC_safe_push (tree, heap, scalar_results, new_temp);
3433 for (bit_offset = element_bitsize;
3434 bit_offset < vec_size_in_bits;
3435 bit_offset += element_bitsize)
3437 tree bitpos = bitsize_int (bit_offset);
3438 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp,
3439 bitsize, bitpos);
3441 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
3442 new_name = make_ssa_name (new_scalar_dest, epilog_stmt);
3443 gimple_assign_set_lhs (epilog_stmt, new_name);
3444 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
3446 if (slp_node)
3448 /* In SLP we don't need to apply reduction operation, so
3449 we just collect s' values in SCALAR_RESULTS. */
3450 new_temp = new_name;
3451 VEC_safe_push (tree, heap, scalar_results, new_name);
3453 else
3455 epilog_stmt = gimple_build_assign_with_ops (code,
3456 new_scalar_dest, new_name, new_temp);
3457 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
3458 gimple_assign_set_lhs (epilog_stmt, new_temp);
3459 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
3464 /* The only case where we need to reduce scalar results in SLP is
3465 unrolling. If the size of SCALAR_RESULTS is greater than
3466 GROUP_SIZE, we reduce them combining elements modulo
3467 GROUP_SIZE. */
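/* E.g., with GROUP_SIZE == 2 and four scalar results s0..s3 (two-fold
unrolling), s0 is combined with s2 and s1 with s3. */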
3468 if (slp_node)
3470 tree res, first_res, new_res;
3471 gimple new_stmt;
3473 /* Reduce multiple scalar results in case of SLP unrolling. */
3474 for (j = group_size; VEC_iterate (tree, scalar_results, j, res);
3475 j++)
3477 first_res = VEC_index (tree, scalar_results, j % group_size);
3478 new_stmt = gimple_build_assign_with_ops (code,
3479 new_scalar_dest, first_res, res);
3480 new_res = make_ssa_name (new_scalar_dest, new_stmt);
3481 gimple_assign_set_lhs (new_stmt, new_res);
3482 gsi_insert_before (&exit_gsi, new_stmt, GSI_SAME_STMT);
3483 VEC_replace (tree, scalar_results, j % group_size, new_res);
3486 else
3487 /* Not SLP - we have one scalar to keep in SCALAR_RESULTS. */
3488 VEC_safe_push (tree, heap, scalar_results, new_temp);
3490 extract_scalar_result = false;
3494 /* 2.4 Extract the final scalar result. Create:
3495 s_out3 = extract_field <v_out2, bitpos> */
3497 if (extract_scalar_result)
3499 tree rhs;
3501 if (vect_print_dump_info (REPORT_DETAILS))
3502 fprintf (vect_dump, "extract scalar result");
3504 if (BYTES_BIG_ENDIAN)
3505 bitpos = size_binop (MULT_EXPR,
3506 bitsize_int (TYPE_VECTOR_SUBPARTS (vectype) - 1),
3507 TYPE_SIZE (scalar_type));
3508 else
3509 bitpos = bitsize_zero_node;
3511 rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp, bitsize, bitpos);
3512 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
3513 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
3514 gimple_assign_set_lhs (epilog_stmt, new_temp);
3515 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
3516 VEC_safe_push (tree, heap, scalar_results, new_temp);
3519 vect_finalize_reduction:
3521 /* 2.5 Adjust the final result by the initial value of the reduction
3522 variable. (When such adjustment is not needed, then
3523 'adjustment_def' is zero). For example, if code is PLUS we create:
3524 new_temp = loop_exit_def + adjustment_def */
3526 if (adjustment_def)
3528 gcc_assert (!slp_node);
3529 if (nested_in_vect_loop)
3531 new_phi = VEC_index (gimple, new_phis, 0);
3532 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) == VECTOR_TYPE);
3533 expr = build2 (code, vectype, PHI_RESULT (new_phi), adjustment_def);
3534 new_dest = vect_create_destination_var (scalar_dest, vectype);
3536 else
3538 new_temp = VEC_index (tree, scalar_results, 0);
3539 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
3540 expr = build2 (code, scalar_type, new_temp, adjustment_def);
3541 new_dest = vect_create_destination_var (scalar_dest, scalar_type);
3544 epilog_stmt = gimple_build_assign (new_dest, expr);
3545 new_temp = make_ssa_name (new_dest, epilog_stmt);
3546 gimple_assign_set_lhs (epilog_stmt, new_temp);
3547 SSA_NAME_DEF_STMT (new_temp) = epilog_stmt;
3548 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
3549 if (nested_in_vect_loop)
3551 set_vinfo_for_stmt (epilog_stmt,
3552 new_stmt_vec_info (epilog_stmt, loop_vinfo,
3553 NULL));
3554 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (epilog_stmt)) =
3555 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (new_phi));
3557 if (!double_reduc)
3558 VEC_quick_push (tree, scalar_results, new_temp);
3559 else
3560 VEC_replace (tree, scalar_results, 0, new_temp);
3562 else
3563 VEC_replace (tree, scalar_results, 0, new_temp);
3565 VEC_replace (gimple, new_phis, 0, epilog_stmt);
3568 /* 2.6 Handle the loop-exit phis. Replace the uses of scalar loop-exit
3569 phis with new adjusted scalar results, i.e., replace use <s_out0>
3570 with use <s_out4>.
3572 Transform:
3573 loop_exit:
3574 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
3575 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
3576 v_out2 = reduce <v_out1>
3577 s_out3 = extract_field <v_out2, 0>
3578 s_out4 = adjust_result <s_out3>
3579 use <s_out0>
3580 use <s_out0>
3582 into:
3584 loop_exit:
3585 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
3586 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
3587 v_out2 = reduce <v_out1>
3588 s_out3 = extract_field <v_out2, 0>
3589 s_out4 = adjust_result <s_out3>
3590 use <s_out4>
3591 use <s_out4> */
3593 /* In SLP we may have several statements in NEW_PHIS and REDUCTION_PHIS (in
3594 case GROUP_SIZE is greater than the vectorization factor). Therefore, we
3595 need to match SCALAR_RESULTS with the corresponding statements. The first
3596 (GROUP_SIZE / number of new vector stmts) scalar results correspond to
3597 the first vector stmt, etc.
3598 (RATIO is equal to (GROUP_SIZE / number of new vector stmts)). */
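/* An illustrative sketch (values assumed): with GROUP_SIZE == 4 and
   two new vector stmts, RATIO is 2, so scalar results 0 and 1
   correspond to the first vector stmt, and scalar results 2 and 3 to
   the second.  */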
3599 if (group_size > VEC_length (gimple, new_phis))
3601 ratio = group_size / VEC_length (gimple, new_phis);
3602 gcc_assert (!(group_size % VEC_length (gimple, new_phis)));
3604 else
3605 ratio = 1;
3607 for (k = 0; k < group_size; k++)
3609 if (k % ratio == 0)
3611 epilog_stmt = VEC_index (gimple, new_phis, k / ratio);
3612 reduction_phi = VEC_index (gimple, reduction_phis, k / ratio);
3615 if (slp_node)
3617 gimple current_stmt = VEC_index (gimple,
3618 SLP_TREE_SCALAR_STMTS (slp_node), k);
3620 orig_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (current_stmt));
3621 /* SLP statements can't participate in patterns. */
3622 gcc_assert (!orig_stmt);
3623 scalar_dest = gimple_assign_lhs (current_stmt);
3626 phis = VEC_alloc (gimple, heap, 3);
3627 /* Find the loop-closed-use at the loop exit of the original scalar
3628 result. (The reduction result is expected to have two immediate uses -
3629 one at the latch block, and one at the loop exit). */
3630 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
3631 if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
3632 VEC_safe_push (gimple, heap, phis, USE_STMT (use_p));
3634 /* We expect to have found an exit_phi because of loop-closed-ssa
3635 form. */
3636 gcc_assert (!VEC_empty (gimple, phis));
3638 for (i = 0; VEC_iterate (gimple, phis, i, exit_phi); i++)
3640 if (outer_loop)
3642 stmt_vec_info exit_phi_vinfo = vinfo_for_stmt (exit_phi);
3643 gimple vect_phi;
3645 /* FORNOW. Currently we do not support the case in which an inner-loop
3646 reduction is not used in the outer-loop (but only outside the
3647 outer-loop), unless it is a double reduction. */
3648 gcc_assert ((STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
3649 && !STMT_VINFO_LIVE_P (exit_phi_vinfo))
3650 || double_reduc);
3652 STMT_VINFO_VEC_STMT (exit_phi_vinfo) = epilog_stmt;
3653 if (!double_reduc
3654 || STMT_VINFO_DEF_TYPE (exit_phi_vinfo)
3655 != vect_double_reduction_def)
3656 continue;
3658 /* Handle double reduction:
3660 stmt1: s1 = phi <s0, s2> - double reduction phi (outer loop)
3661 stmt2: s3 = phi <s1, s4> - (regular) reduc phi (inner loop)
3662 stmt3: s4 = use (s3) - (regular) reduc stmt (inner loop)
3663 stmt4: s2 = phi <s4> - double reduction stmt (outer loop)
3665 At that point the regular reduction (stmt2 and stmt3) is
3666 already vectorized, as well as the exit phi node, stmt4.
3667 Here we vectorize the phi node of double reduction, stmt1, and
3668 update all relevant statements. */
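/* In source form, stmt1..stmt4 above correspond to a nested sum such
   as (an illustrative sketch, not taken from the sources):
     s = init;
     for (i = 0; i < n; i++)        <-- outer loop: stmt1, stmt4
       for (j = 0; j < m; j++)      <-- inner loop: stmt2, stmt3
         s += a[i][j];
   where the outer-loop phi of s is the double reduction phi node.  */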
3670 /* Go through all the uses of s2 to find the double reduction phi
3671 node, i.e., stmt1 above. */
3672 orig_name = PHI_RESULT (exit_phi);
3673 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
3675 stmt_vec_info use_stmt_vinfo = vinfo_for_stmt (use_stmt);
3676 stmt_vec_info new_phi_vinfo;
3677 tree vect_phi_init, preheader_arg, vect_phi_res, init_def;
3678 basic_block bb = gimple_bb (use_stmt);
3679 gimple use;
3681 /* Check that USE_STMT is really a double reduction phi
3682 node. */
3683 if (gimple_code (use_stmt) != GIMPLE_PHI
3684 || gimple_phi_num_args (use_stmt) != 2
3685 || !use_stmt_vinfo
3686 || STMT_VINFO_DEF_TYPE (use_stmt_vinfo)
3687 != vect_double_reduction_def
3688 || bb->loop_father != outer_loop)
3689 continue;
3691 /* Create vector phi node for double reduction:
3692 vs1 = phi <vs0, vs2>
3693 vs1 was created previously in this function by a call to
3694 vect_get_vec_def_for_operand and is stored in
3695 vec_initial_def;
3696 vs2 is defined by EPILOG_STMT, the vectorized EXIT_PHI;
3697 vs0 is created here. */
3699 /* Create vector phi node. */
3700 vect_phi = create_phi_node (vec_initial_def, bb);
3701 new_phi_vinfo = new_stmt_vec_info (vect_phi,
3702 loop_vec_info_for_loop (outer_loop), NULL);
3703 set_vinfo_for_stmt (vect_phi, new_phi_vinfo);
3705 /* Create vs0 - initial def of the double reduction phi. */
3706 preheader_arg = PHI_ARG_DEF_FROM_EDGE (use_stmt,
3707 loop_preheader_edge (outer_loop));
3708 init_def = get_initial_def_for_reduction (stmt,
3709 preheader_arg, NULL);
3710 vect_phi_init = vect_init_vector (use_stmt, init_def,
3711 vectype, NULL);
3713 /* Update phi node arguments with vs0 and vs2. */
3714 add_phi_arg (vect_phi, vect_phi_init,
3715 loop_preheader_edge (outer_loop),
3716 UNKNOWN_LOCATION);
3717 add_phi_arg (vect_phi, PHI_RESULT (epilog_stmt),
3718 loop_latch_edge (outer_loop), UNKNOWN_LOCATION);
3719 if (vect_print_dump_info (REPORT_DETAILS))
3721 fprintf (vect_dump, "created double reduction phi "
3722 "node: ");
3723 print_gimple_stmt (vect_dump, vect_phi, 0, TDF_SLIM);
3726 vect_phi_res = PHI_RESULT (vect_phi);
3728 /* Replace the use, i.e., set the correct vs1 in the regular
3729 reduction phi node. FORNOW, NCOPIES is always 1, so the
3730 loop is redundant. */
3731 use = reduction_phi;
3732 for (j = 0; j < ncopies; j++)
3734 edge pr_edge = loop_preheader_edge (loop);
3735 SET_PHI_ARG_DEF (use, pr_edge->dest_idx, vect_phi_res);
3736 use = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (use));
3741 /* Replace the uses: */
3742 orig_name = PHI_RESULT (exit_phi);
3743 scalar_result = VEC_index (tree, scalar_results, k);
3744 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
3745 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
3746 SET_USE (use_p, scalar_result);
3749 VEC_free (gimple, heap, phis);
3752 VEC_free (tree, heap, scalar_results);
3753 VEC_free (gimple, heap, new_phis);
3757 /* Function vectorizable_reduction.
3759 Check if STMT performs a reduction operation that can be vectorized.
3760 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
3761 stmt to replace it, put it in VEC_STMT, and insert it at GSI.
3762 Return FALSE if not a vectorizable STMT, TRUE otherwise.
3764 This function also handles reduction idioms (patterns) that have been
3765 recognized in advance during vect_pattern_recog. In this case, STMT may be
3766 of this form:
3767 X = pattern_expr (arg0, arg1, ..., X)
3768 and its STMT_VINFO_RELATED_STMT points to the last stmt in the original
3769 sequence that had been detected and replaced by the pattern-stmt (STMT).
3771 In some cases of reduction patterns, the type of the reduction variable X is
3772 different from the type of the other arguments of STMT.
3773 In such cases, the vectype that is used when transforming STMT into a vector
3774 stmt is different from the vectype that is used to determine the
3775 vectorization factor, because it consists of a different number of elements
3776 than the actual number of elements that are being operated upon in parallel.
3778 For example, consider an accumulation of shorts into an int accumulator.
3779 On some targets it's possible to vectorize this pattern operating on 8
3780 shorts at a time (hence, the vectype for purposes of determining the
3781 vectorization factor should be V8HI); on the other hand, the vectype that
3782 is used to create the vector form is actually V4SI (the type of the result).
3784 Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
3785 indicates the actual level of parallelism (V8HI in the example), so
3786 that the right vectorization factor can be derived. This vectype
3787 corresponds to the type of arguments to the reduction stmt, and should *NOT*
3788 be used to create the vectorized stmt. The right vectype for the vectorized
3789 stmt is obtained from the type of the result X:
3790 get_vectype_for_scalar_type (TREE_TYPE (X))
3792 This means that, contrary to "regular" reductions (or "regular" stmts in
3793 general), the following equation:
3794 STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (TREE_TYPE (X))
3795 does *NOT* necessarily hold for reduction patterns. */
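/* A concrete sketch of the above (types and sizes assumed for
   illustration): for
     short s[N];  int sum = 0;
     for (i = 0; i < N; i++)
       sum += s[i];
   recognized as a widening summation, STMT_VINFO_VECTYPE is V8HI
   (eight shorts determine the vectorization factor), while the
   vectorized stmt itself produces V4SI values, the vectype obtained
   from get_vectype_for_scalar_type (int).  */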
3797 bool
3798 vectorizable_reduction (gimple stmt, gimple_stmt_iterator *gsi,
3799 gimple *vec_stmt, slp_tree slp_node)
3801 tree vec_dest;
3802 tree scalar_dest;
3803 tree loop_vec_def0 = NULL_TREE, loop_vec_def1 = NULL_TREE;
3804 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3805 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
3806 tree vectype_in = NULL_TREE;
3807 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3808 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3809 enum tree_code code, orig_code, epilog_reduc_code;
3810 enum machine_mode vec_mode;
3811 int op_type;
3812 optab optab, reduc_optab;
3813 tree new_temp = NULL_TREE;
3814 tree def;
3815 gimple def_stmt;
3816 enum vect_def_type dt;
3817 gimple new_phi = NULL;
3818 tree scalar_type;
3819 bool is_simple_use;
3820 gimple orig_stmt;
3821 stmt_vec_info orig_stmt_info;
3822 tree expr = NULL_TREE;
3823 int i;
3824 int ncopies;
3825 int epilog_copies;
3826 stmt_vec_info prev_stmt_info, prev_phi_info;
3827 bool single_defuse_cycle = false;
3828 tree reduc_def = NULL_TREE;
3829 gimple new_stmt = NULL;
3830 int j;
3831 tree ops[3];
3832 bool nested_cycle = false, found_nested_cycle_def = false;
3833 gimple reduc_def_stmt = NULL;
3834 /* The default is that the reduction variable is the last operand in the statement. */
3835 int reduc_index = 2;
3836 bool double_reduc = false, dummy;
3837 basic_block def_bb;
3838 struct loop *def_stmt_loop, *outer_loop = NULL;
3839 tree def_arg;
3840 gimple def_arg_stmt;
3841 VEC (tree, heap) *vec_oprnds0 = NULL, *vec_oprnds1 = NULL, *vect_defs = NULL;
3842 VEC (gimple, heap) *phis = NULL;
3843 int vec_num;
3844 tree def0, def1;
3846 if (nested_in_vect_loop_p (loop, stmt))
3848 outer_loop = loop;
3849 loop = loop->inner;
3850 nested_cycle = true;
3853 /* 1. Is vectorizable reduction? */
3854 /* Not supportable if the reduction variable is used in the loop. */
3855 if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer)
3856 return false;
3858 /* Reductions that are not used even in an enclosing outer-loop
3859 are expected to be "live" (used out of the loop). */
3860 if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope
3861 && !STMT_VINFO_LIVE_P (stmt_info))
3862 return false;
3864 /* Make sure it was already recognized as a reduction computation. */
3865 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def
3866 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_nested_cycle)
3867 return false;
3869 /* 2. Has this been recognized as a reduction pattern?
3871 Check if STMT represents a pattern that has been recognized
3872 in earlier analysis stages. For stmts that represent a pattern,
3873 the STMT_VINFO_RELATED_STMT field records the last stmt in
3874 the original sequence that constitutes the pattern. */
3876 orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
3877 if (orig_stmt)
3879 orig_stmt_info = vinfo_for_stmt (orig_stmt);
3880 gcc_assert (STMT_VINFO_RELATED_STMT (orig_stmt_info) == stmt);
3881 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
3882 gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
3885 /* 3. Check the operands of the operation. The first operands are defined
3886 inside the loop body. The last operand is the reduction variable,
3887 which is defined by the loop-header-phi. */
3889 gcc_assert (is_gimple_assign (stmt));
3891 /* Flatten RHS. */
3892 switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
3894 case GIMPLE_SINGLE_RHS:
3895 op_type = TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt));
3896 if (op_type == ternary_op)
3898 tree rhs = gimple_assign_rhs1 (stmt);
3899 ops[0] = TREE_OPERAND (rhs, 0);
3900 ops[1] = TREE_OPERAND (rhs, 1);
3901 ops[2] = TREE_OPERAND (rhs, 2);
3902 code = TREE_CODE (rhs);
3904 else
3905 return false;
3906 break;
3908 case GIMPLE_BINARY_RHS:
3909 code = gimple_assign_rhs_code (stmt);
3910 op_type = TREE_CODE_LENGTH (code);
3911 gcc_assert (op_type == binary_op);
3912 ops[0] = gimple_assign_rhs1 (stmt);
3913 ops[1] = gimple_assign_rhs2 (stmt);
3914 break;
3916 case GIMPLE_UNARY_RHS:
3917 return false;
3919 default:
3920 gcc_unreachable ();
3923 scalar_dest = gimple_assign_lhs (stmt);
3924 scalar_type = TREE_TYPE (scalar_dest);
3925 if (!POINTER_TYPE_P (scalar_type) && !INTEGRAL_TYPE_P (scalar_type)
3926 && !SCALAR_FLOAT_TYPE_P (scalar_type))
3927 return false;
3929 /* All uses but the last are expected to be defined in the loop.
3930 The last use is the reduction variable. In case of a nested cycle this
3931 assumption does not hold: we use reduc_index to record the index of the
3932 reduction variable. */
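/* For example (an illustrative stmt): for the reduction
     s_next = a[i] + s_prev;
   ops[] is { a[i], s_prev }: a[i] is defined inside the loop body,
   and the last operand, s_prev, is the reduction variable defined by
   the loop-header phi.  */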
3933 for (i = 0; i < op_type - 1; i++)
3935 tree tem;
3937 /* The condition of COND_EXPR is checked in vectorizable_condition(). */
3938 if (i == 0 && code == COND_EXPR)
3939 continue;
3941 is_simple_use = vect_is_simple_use_1 (ops[i], loop_vinfo, NULL,
3942 &def_stmt, &def, &dt, &tem);
3943 if (!vectype_in)
3944 vectype_in = tem;
3945 gcc_assert (is_simple_use);
3946 if (dt != vect_internal_def
3947 && dt != vect_external_def
3948 && dt != vect_constant_def
3949 && dt != vect_induction_def
3950 && !(dt == vect_nested_cycle && nested_cycle))
3951 return false;
3953 if (dt == vect_nested_cycle)
3955 found_nested_cycle_def = true;
3956 reduc_def_stmt = def_stmt;
3957 reduc_index = i;
3961 is_simple_use = vect_is_simple_use (ops[i], loop_vinfo, NULL, &def_stmt,
3962 &def, &dt);
3963 gcc_assert (is_simple_use);
3964 gcc_assert (dt == vect_reduction_def
3965 || dt == vect_nested_cycle
3966 || ((dt == vect_internal_def || dt == vect_external_def
3967 || dt == vect_constant_def || dt == vect_induction_def)
3968 && nested_cycle && found_nested_cycle_def));
3969 if (!found_nested_cycle_def)
3970 reduc_def_stmt = def_stmt;
3972 gcc_assert (gimple_code (reduc_def_stmt) == GIMPLE_PHI);
3973 if (orig_stmt)
3974 gcc_assert (orig_stmt == vect_is_simple_reduction (loop_vinfo,
3975 reduc_def_stmt,
3976 !nested_cycle,
3977 &dummy));
3978 else
3979 gcc_assert (stmt == vect_is_simple_reduction (loop_vinfo, reduc_def_stmt,
3980 !nested_cycle, &dummy));
3982 if (STMT_VINFO_LIVE_P (vinfo_for_stmt (reduc_def_stmt)))
3983 return false;
3985 if (slp_node)
3986 ncopies = 1;
3987 else
3988 ncopies = (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
3989 / TYPE_VECTOR_SUBPARTS (vectype_in));
3991 gcc_assert (ncopies >= 1);
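/* For example (numbers assumed): with a vectorization factor of 8 and
   a V4SI vectype_in, ncopies is 8 / 4 == 2, i.e. each scalar stmt is
   replaced by two vector stmts per vectorized-loop iteration.  */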
3993 vec_mode = TYPE_MODE (vectype_in);
3995 if (code == COND_EXPR)
3997 if (!vectorizable_condition (stmt, gsi, NULL, ops[reduc_index], 0))
3999 if (vect_print_dump_info (REPORT_DETAILS))
4000 fprintf (vect_dump, "unsupported condition in reduction");
4002 return false;
4005 else
4007 /* 4. Supportable by target? */
4009 /* 4.1. Check support for the operation in the loop. */
4010 optab = optab_for_tree_code (code, vectype_in, optab_default);
4011 if (!optab)
4013 if (vect_print_dump_info (REPORT_DETAILS))
4014 fprintf (vect_dump, "no optab.");
4016 return false;
4019 if (optab_handler (optab, vec_mode) == CODE_FOR_nothing)
4021 if (vect_print_dump_info (REPORT_DETAILS))
4022 fprintf (vect_dump, "op not supported by target.");
4024 if (GET_MODE_SIZE (vec_mode) != UNITS_PER_WORD
4025 || LOOP_VINFO_VECT_FACTOR (loop_vinfo)
4026 < vect_min_worthwhile_factor (code))
4027 return false;
4029 if (vect_print_dump_info (REPORT_DETAILS))
4030 fprintf (vect_dump, "proceeding using word mode.");
4033 /* Worthwhile without SIMD support? */
4034 if (!VECTOR_MODE_P (TYPE_MODE (vectype_in))
4035 && LOOP_VINFO_VECT_FACTOR (loop_vinfo)
4036 < vect_min_worthwhile_factor (code))
4038 if (vect_print_dump_info (REPORT_DETAILS))
4039 fprintf (vect_dump, "not worthwhile without SIMD support.");
4041 return false;
4045 /* 4.2. Check support for the epilog operation.
4047 If STMT represents a reduction pattern, then the type of the
4048 reduction variable may be different from the type of the rest
4049 of the arguments. For example, consider the case of accumulation
4050 of shorts into an int accumulator. The original code:
4051 S1: int_a = (int) short_a;
4052 orig_stmt-> S2: int_acc = plus <int_a ,int_acc>;
4054 was replaced with:
4055 STMT: int_acc = widen_sum <short_a, int_acc>
4057 This means that:
4058 1. The tree-code that is used to create the vector operation in the
4059 epilog code (that reduces the partial results) is not the
4060 tree-code of STMT, but is rather the tree-code of the original
4061 stmt from the pattern that STMT is replacing. I.e., in the example
4062 above we want to use 'widen_sum' in the loop, but 'plus' in the
4063 epilog.
4064 2. The type (mode) we use to check available target support
4065 for the vector operation to be created in the *epilog*, is
4066 determined by the type of the reduction variable (in the example
4067 above we'd check this: optab_handler (plus_optab, vect_int_mode)).
4068 However the type (mode) we use to check available target support
4069 for the vector operation to be created *inside the loop*, is
4070 determined by the type of the other arguments to STMT (in the
4071 example we'd check this: optab_handler (widen_sum_optab,
4072 vect_short_mode)).
4074 This is contrary to "regular" reductions, in which the types of all
4075 the arguments are the same as the type of the reduction variable.
4076 For "regular" reductions we can therefore use the same vector type
4077 (and also the same tree-code) when generating the epilog code and
4078 when generating the code inside the loop. */
4080 if (orig_stmt)
4082 /* This is a reduction pattern: get the vectype from the type of the
4083 reduction variable, and get the tree-code from orig_stmt. */
4084 orig_code = gimple_assign_rhs_code (orig_stmt);
4085 gcc_assert (vectype_out);
4086 vec_mode = TYPE_MODE (vectype_out);
4088 else
4090 /* Regular reduction: the same vectype and tree-code as used for
4091 the vector code inside the loop can be used for the epilog code. */
4092 orig_code = code;
4095 if (nested_cycle)
4097 def_bb = gimple_bb (reduc_def_stmt);
4098 def_stmt_loop = def_bb->loop_father;
4099 def_arg = PHI_ARG_DEF_FROM_EDGE (reduc_def_stmt,
4100 loop_preheader_edge (def_stmt_loop));
4101 if (TREE_CODE (def_arg) == SSA_NAME
4102 && (def_arg_stmt = SSA_NAME_DEF_STMT (def_arg))
4103 && gimple_code (def_arg_stmt) == GIMPLE_PHI
4104 && flow_bb_inside_loop_p (outer_loop, gimple_bb (def_arg_stmt))
4105 && vinfo_for_stmt (def_arg_stmt)
4106 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_arg_stmt))
4107 == vect_double_reduction_def)
4108 double_reduc = true;
4111 epilog_reduc_code = ERROR_MARK;
4112 if (reduction_code_for_scalar_code (orig_code, &epilog_reduc_code))
4114 reduc_optab = optab_for_tree_code (epilog_reduc_code, vectype_out,
4115 optab_default);
4116 if (!reduc_optab)
4118 if (vect_print_dump_info (REPORT_DETAILS))
4119 fprintf (vect_dump, "no optab for reduction.");
4121 epilog_reduc_code = ERROR_MARK;
4124 if (reduc_optab
4125 && optab_handler (reduc_optab, vec_mode) == CODE_FOR_nothing)
4127 if (vect_print_dump_info (REPORT_DETAILS))
4128 fprintf (vect_dump, "reduc op not supported by target.");
4130 epilog_reduc_code = ERROR_MARK;
4133 else
4135 if (!nested_cycle || double_reduc)
4137 if (vect_print_dump_info (REPORT_DETAILS))
4138 fprintf (vect_dump, "no reduc code for scalar code.");
4140 return false;
4144 if (double_reduc && ncopies > 1)
4146 if (vect_print_dump_info (REPORT_DETAILS))
4147 fprintf (vect_dump, "multiple types in double reduction");
4149 return false;
4152 if (!vec_stmt) /* transformation not required. */
4154 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
4155 if (!vect_model_reduction_cost (stmt_info, epilog_reduc_code, ncopies))
4156 return false;
4157 return true;
4160 /** Transform. **/
4162 if (vect_print_dump_info (REPORT_DETAILS))
4163 fprintf (vect_dump, "transform reduction.");
4165 /* FORNOW: Multiple types are not supported for condition. */
4166 if (code == COND_EXPR)
4167 gcc_assert (ncopies == 1);
4169 /* Create the destination vector */
4170 vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
4172 /* In case the vectorization factor (VF) is bigger than the number
4173 of elements that we can fit in a vectype (nunits), we have to generate
4174 more than one vector stmt - i.e - we need to "unroll" the
4175 vector stmt by a factor VF/nunits. For more details see documentation
4176 in vectorizable_operation. */
4178 /* If the reduction is used in an outer loop we need to generate
4179 VF intermediate results, like so (e.g. for ncopies=2):
4180 r0 = phi (init, r0)
4181 r1 = phi (init, r1)
4182 r0 = x0 + r0;
4183 r1 = x1 + r1;
4184 (i.e. we generate VF results in 2 registers).
4185 In this case we have a separate def-use cycle for each copy, and therefore
4186 for each copy we get the vector def for the reduction variable from the
4187 respective phi node created for this copy.
4189 Otherwise (the reduction is unused in the loop nest), we can combine
4190 together intermediate results, like so (e.g. for ncopies=2):
4191 r = phi (init, r)
4192 r = x0 + r;
4193 r = x1 + r;
4194 (i.e. we generate VF/2 results in a single register).
4195 In this case for each copy we get the vector def for the reduction variable
4196 from the vectorized reduction operation generated in the previous iteration.
4199 if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope)
4201 single_defuse_cycle = true;
4202 epilog_copies = 1;
4204 else
4205 epilog_copies = ncopies;
4207 prev_stmt_info = NULL;
4208 prev_phi_info = NULL;
4209 if (slp_node)
4211 vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
4212 gcc_assert (TYPE_VECTOR_SUBPARTS (vectype_out)
4213 == TYPE_VECTOR_SUBPARTS (vectype_in));
4215 else
4217 vec_num = 1;
4218 vec_oprnds0 = VEC_alloc (tree, heap, 1);
4219 if (op_type == ternary_op)
4220 vec_oprnds1 = VEC_alloc (tree, heap, 1);
4223 phis = VEC_alloc (gimple, heap, vec_num);
4224 vect_defs = VEC_alloc (tree, heap, vec_num);
4225 if (!slp_node)
4226 VEC_quick_push (tree, vect_defs, NULL_TREE);
4228 for (j = 0; j < ncopies; j++)
4230 if (j == 0 || !single_defuse_cycle)
4232 for (i = 0; i < vec_num; i++)
4234 /* Create the reduction-phi that defines the reduction
4235 operand. */
4236 new_phi = create_phi_node (vec_dest, loop->header);
4237 set_vinfo_for_stmt (new_phi,
4238 new_stmt_vec_info (new_phi, loop_vinfo,
4239 NULL));
4240 if (j == 0 || slp_node)
4241 VEC_quick_push (gimple, phis, new_phi);
4245 if (code == COND_EXPR)
4247 gcc_assert (!slp_node);
4248 vectorizable_condition (stmt, gsi, vec_stmt,
4249 PHI_RESULT (VEC_index (gimple, phis, 0)),
4250 reduc_index);
4251 /* Multiple types are not supported for condition. */
4252 break;
4255 /* Handle uses. */
4256 if (j == 0)
4258 if (slp_node)
4259 vect_get_slp_defs (slp_node, &vec_oprnds0, &vec_oprnds1, -1);
4260 else
4262 loop_vec_def0 = vect_get_vec_def_for_operand (ops[!reduc_index],
4263 stmt, NULL);
4264 VEC_quick_push (tree, vec_oprnds0, loop_vec_def0);
4265 if (op_type == ternary_op)
4267 if (reduc_index == 0)
4268 loop_vec_def1 = vect_get_vec_def_for_operand (ops[2], stmt,
4269 NULL);
4270 else
4271 loop_vec_def1 = vect_get_vec_def_for_operand (ops[1], stmt,
4272 NULL);
4274 VEC_quick_push (tree, vec_oprnds1, loop_vec_def1);
4278 else
4280 if (!slp_node)
4282 enum vect_def_type dt = vect_unknown_def_type; /* Dummy */
4283 loop_vec_def0 = vect_get_vec_def_for_stmt_copy (dt, loop_vec_def0);
4284 VEC_replace (tree, vec_oprnds0, 0, loop_vec_def0);
4285 if (op_type == ternary_op)
4287 loop_vec_def1 = vect_get_vec_def_for_stmt_copy (dt,
4288 loop_vec_def1);
4289 VEC_replace (tree, vec_oprnds1, 0, loop_vec_def1);
4293 if (single_defuse_cycle)
4294 reduc_def = gimple_assign_lhs (new_stmt);
4296 STMT_VINFO_RELATED_STMT (prev_phi_info) = new_phi;
4299 for (i = 0; VEC_iterate (tree, vec_oprnds0, i, def0); i++)
4301 if (slp_node)
4302 reduc_def = PHI_RESULT (VEC_index (gimple, phis, i));
4303 else
4305 if (!single_defuse_cycle || j == 0)
4306 reduc_def = PHI_RESULT (new_phi);
4309 def1 = ((op_type == ternary_op)
4310 ? VEC_index (tree, vec_oprnds1, i) : NULL);
4311 if (op_type == binary_op)
4313 if (reduc_index == 0)
4314 expr = build2 (code, vectype_out, reduc_def, def0);
4315 else
4316 expr = build2 (code, vectype_out, def0, reduc_def);
4318 else
4320 if (reduc_index == 0)
4321 expr = build3 (code, vectype_out, reduc_def, def0, def1);
4322 else
4324 if (reduc_index == 1)
4325 expr = build3 (code, vectype_out, def0, reduc_def, def1);
4326 else
4327 expr = build3 (code, vectype_out, def0, def1, reduc_def);
4331 new_stmt = gimple_build_assign (vec_dest, expr);
4332 new_temp = make_ssa_name (vec_dest, new_stmt);
4333 gimple_assign_set_lhs (new_stmt, new_temp);
4334 vect_finish_stmt_generation (stmt, new_stmt, gsi);
4335 if (slp_node)
4337 VEC_quick_push (gimple, SLP_TREE_VEC_STMTS (slp_node), new_stmt);
4338 VEC_quick_push (tree, vect_defs, new_temp);
4340 else
4341 VEC_replace (tree, vect_defs, 0, new_temp);
4344 if (slp_node)
4345 continue;
4347 if (j == 0)
4348 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
4349 else
4350 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
4352 prev_stmt_info = vinfo_for_stmt (new_stmt);
4353 prev_phi_info = vinfo_for_stmt (new_phi);
4356 /* Finalize the reduction-phi (set its arguments) and create the
4357 epilog reduction code. */
4358 if ((!single_defuse_cycle || code == COND_EXPR) && !slp_node)
4360 new_temp = gimple_assign_lhs (*vec_stmt);
4361 VEC_replace (tree, vect_defs, 0, new_temp);
4364 vect_create_epilog_for_reduction (vect_defs, stmt, epilog_copies,
4365 epilog_reduc_code, phis, reduc_index,
4366 double_reduc, slp_node);
4368 VEC_free (gimple, heap, phis);
4369 VEC_free (tree, heap, vec_oprnds0);
4370 if (vec_oprnds1)
4371 VEC_free (tree, heap, vec_oprnds1);
4373 return true;
4376 /* Function vect_min_worthwhile_factor.
4378 For a loop where we could vectorize the operation indicated by CODE,
4379 return the minimum vectorization factor that makes it worthwhile
4380 to use generic vectors. */
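/* An illustrative reading of the thresholds below (the rationale is
   assumed here, not stated in the sources): bitwise AND/IOR/XOR/NOT on
   a general register holding packed elements need no cross-element
   fixup, so emulation pays off already at a factor of 2, whereas
   PLUS/MINUS/NEGATE must keep carries from crossing element boundaries
   and only pay off at a factor of 4.  */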
4381 int
4382 vect_min_worthwhile_factor (enum tree_code code)
4384 switch (code)
4386 case PLUS_EXPR:
4387 case MINUS_EXPR:
4388 case NEGATE_EXPR:
4389 return 4;
4391 case BIT_AND_EXPR:
4392 case BIT_IOR_EXPR:
4393 case BIT_XOR_EXPR:
4394 case BIT_NOT_EXPR:
4395 return 2;
4397 default:
4398 return INT_MAX;
4403 /* Function vectorizable_induction
4405 Check if PHI performs an induction computation that can be vectorized.
4406 If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
4407 phi to replace it, put it in VEC_STMT, and add it to the same basic block.
4408 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
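/* For example (an illustrative sketch, assuming VF == 4): the
   induction in
     for (i = 0; i < n; i++)
       a[i] = i;
   is vectorized by creating the initial vector {0, 1, 2, 3} and adding
   a step vector {4, 4, 4, 4} in each vectorized iteration; the initial
   def is built by get_initial_def_for_induction below.  */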
4410 bool
4411 vectorizable_induction (gimple phi, gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
4412 gimple *vec_stmt)
4414 stmt_vec_info stmt_info = vinfo_for_stmt (phi);
4415 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4416 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4417 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4418 int nunits = TYPE_VECTOR_SUBPARTS (vectype);
4419 int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
4420 tree vec_def;
4422 gcc_assert (ncopies >= 1);
4423 /* FORNOW. This restriction should be relaxed. */
4424 if (nested_in_vect_loop_p (loop, phi) && ncopies > 1)
4426 if (vect_print_dump_info (REPORT_DETAILS))
4427 fprintf (vect_dump, "multiple types in nested loop.");
4428 return false;
4431 if (!STMT_VINFO_RELEVANT_P (stmt_info))
4432 return false;
4434 /* FORNOW: SLP not supported. */
4435 if (STMT_SLP_TYPE (stmt_info))
4436 return false;
4438 gcc_assert (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def);
4440 if (gimple_code (phi) != GIMPLE_PHI)
4441 return false;
4443 if (!vec_stmt) /* transformation not required. */
4445 STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
4446 if (vect_print_dump_info (REPORT_DETAILS))
4447 fprintf (vect_dump, "=== vectorizable_induction ===");
4448 vect_model_induction_cost (stmt_info, ncopies);
4449 return true;
4452 /** Transform. **/
4454 if (vect_print_dump_info (REPORT_DETAILS))
4455 fprintf (vect_dump, "transform induction phi.");
4457 vec_def = get_initial_def_for_induction (phi);
4458 *vec_stmt = SSA_NAME_DEF_STMT (vec_def);
4459 return true;
4462 /* Function vectorizable_live_operation.
4464 STMT computes a value that is used outside the loop. Check if
4465 it can be supported. */
4467 bool
4468 vectorizable_live_operation (gimple stmt,
4469 gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
4470 gimple *vec_stmt ATTRIBUTE_UNUSED)
4472 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4473 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4474 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4475 int i;
4476 int op_type;
4477 tree op;
4478 tree def;
4479 gimple def_stmt;
4480 enum vect_def_type dt;
4481 enum tree_code code;
4482 enum gimple_rhs_class rhs_class;
4484 gcc_assert (STMT_VINFO_LIVE_P (stmt_info));
4486 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def)
4487 return false;
4489 if (!is_gimple_assign (stmt))
4490 return false;
4492 if (TREE_CODE (gimple_assign_lhs (stmt)) != SSA_NAME)
4493 return false;
4495 /* FORNOW. CHECKME. */
4496 if (nested_in_vect_loop_p (loop, stmt))
4497 return false;
4499 code = gimple_assign_rhs_code (stmt);
4500 op_type = TREE_CODE_LENGTH (code);
4501 rhs_class = get_gimple_rhs_class (code);
4502 gcc_assert (rhs_class != GIMPLE_UNARY_RHS || op_type == unary_op);
4503 gcc_assert (rhs_class != GIMPLE_BINARY_RHS || op_type == binary_op);
4505 /* FORNOW: support only if all uses are invariant. This means
4506 that the scalar operations can remain in place, unvectorized.
4507 The original last scalar value that they compute will be used. */
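/* For example (an illustrative case): with loop-invariant x and y in
     for (i = 0; i < n; i++)
       t = x + y;
     ... = use (t);                 <-- use after the loop
   the stmt computing t is "live" but can stay in place unvectorized;
   the value it holds after the last iteration is what gets used.  */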
4509 for (i = 0; i < op_type; i++)
4511 if (rhs_class == GIMPLE_SINGLE_RHS)
4512 op = TREE_OPERAND (gimple_op (stmt, 1), i);
4513 else
4514 op = gimple_op (stmt, i + 1);
4515 if (op
4516 && !vect_is_simple_use (op, loop_vinfo, NULL, &def_stmt, &def, &dt))
4518 if (vect_print_dump_info (REPORT_DETAILS))
4519 fprintf (vect_dump, "use not simple.");
4520 return false;
4523 if (dt != vect_external_def && dt != vect_constant_def)
4524 return false;
4527 /* No transformation is required for the cases we currently support. */
4528 return true;
4531 /* Kill any debug uses outside LOOP of SSA names defined in STMT. */
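/* For example (an illustrative bind): if STMT defines x_3 inside LOOP
   and a debug stmt outside LOOP still refers to it, as in
     # DEBUG x => x_3
   the bind is reset below to
     # DEBUG x => NULL
   so debuggers report the value as optimized out instead of reading a
   name that no longer has a meaningful definition there.  */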
4533 static void
4534 vect_loop_kill_debug_uses (struct loop *loop, gimple stmt)
4536 ssa_op_iter op_iter;
4537 imm_use_iterator imm_iter;
4538 def_operand_p def_p;
4539 gimple ustmt;
4541 FOR_EACH_PHI_OR_STMT_DEF (def_p, stmt, op_iter, SSA_OP_DEF)
4543 FOR_EACH_IMM_USE_STMT (ustmt, imm_iter, DEF_FROM_PTR (def_p))
4545 basic_block bb;
4547 if (!is_gimple_debug (ustmt))
4548 continue;
4550 bb = gimple_bb (ustmt);
4552 if (!flow_bb_inside_loop_p (loop, bb))
4554 if (gimple_debug_bind_p (ustmt))
4556 if (vect_print_dump_info (REPORT_DETAILS))
4557 fprintf (vect_dump, "killing debug use");
4559 gimple_debug_bind_reset_value (ustmt);
4560 update_stmt (ustmt);
4562 else
4563 gcc_unreachable ();
4569 /* Function vect_transform_loop.
4571 The analysis phase has determined that the loop is vectorizable.
4572 Vectorize the loop - create vectorized stmts to replace the scalar
4573 stmts in the loop, and update the loop exit condition. */
4575 void
4576 vect_transform_loop (loop_vec_info loop_vinfo)
4578 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4579 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
4580 int nbbs = loop->num_nodes;
4581 gimple_stmt_iterator si;
4582 int i;
4583 tree ratio = NULL;
4584 int vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
4585 bool strided_store;
4586 bool slp_scheduled = false;
4587 unsigned int nunits;
4588 tree cond_expr = NULL_TREE;
4589 gimple_seq cond_expr_stmt_list = NULL;
4590 bool do_peeling_for_loop_bound;
4592 if (vect_print_dump_info (REPORT_DETAILS))
4593 fprintf (vect_dump, "=== vect_transform_loop ===");
4595 /* Peel the loop if there are data refs with unknown alignment.
4596 Only one store data ref with unknown alignment is allowed. */
4598 if (LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo))
4599 vect_do_peeling_for_alignment (loop_vinfo);
4601 do_peeling_for_loop_bound
4602 = (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
4603 || (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
4604 && LOOP_VINFO_INT_NITERS (loop_vinfo) % vectorization_factor != 0));
4606 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
4607 || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
4608 vect_loop_versioning (loop_vinfo,
4609 !do_peeling_for_loop_bound,
4610 &cond_expr, &cond_expr_stmt_list);
4612 /* If the loop has a symbolic number of iterations 'n' (i.e. it's not a
4613 compile time constant), or it is a constant that is not evenly divisible
4614 by the vectorization factor, then an epilog loop needs to be created.
4615 We therefore duplicate the loop: the original loop will be vectorized,
4616 and will compute the first (n/VF) iterations. The second copy of the loop
4617 will remain scalar and will compute the remaining (n%VF) iterations.
4618 (VF is the vectorization factor). */
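/* A worked example (numbers assumed): with n == 103 and VF == 8, the
   vectorized loop runs 103 / 8 == 12 iterations, covering 96 scalar
   iterations, and the scalar epilog loop runs the remaining
   103 % 8 == 7 iterations.  */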
4620 if (do_peeling_for_loop_bound)
4621 vect_do_peeling_for_loop_bound (loop_vinfo, &ratio,
4622 cond_expr, cond_expr_stmt_list);
4623 else
4624 ratio = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
4625 LOOP_VINFO_INT_NITERS (loop_vinfo) / vectorization_factor);
4627 /* 1) Make sure the loop header has exactly two entries
4628 2) Make sure we have a preheader basic block. */
4630 gcc_assert (EDGE_COUNT (loop->header->preds) == 2);
4632 split_edge (loop_preheader_edge (loop));
4634 /* FORNOW: the vectorizer supports only loops whose body consists
4635 of one basic block (header + empty latch). When the vectorizer
4636 supports more involved loop forms, the order in which the BBs are
4637 traversed will need to be reconsidered. */
4639 for (i = 0; i < nbbs; i++)
4641 basic_block bb = bbs[i];
4642 stmt_vec_info stmt_info;
4643 gimple phi;
4645 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
4647 phi = gsi_stmt (si);
4648 if (vect_print_dump_info (REPORT_DETAILS))
4650 fprintf (vect_dump, "------>vectorizing phi: ");
4651 print_gimple_stmt (vect_dump, phi, 0, TDF_SLIM);
4653 stmt_info = vinfo_for_stmt (phi);
4654 if (!stmt_info)
4655 continue;
4657 if (MAY_HAVE_DEBUG_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
4658 vect_loop_kill_debug_uses (loop, phi);
4660 if (!STMT_VINFO_RELEVANT_P (stmt_info)
4661 && !STMT_VINFO_LIVE_P (stmt_info))
4662 continue;
4664 if ((TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info))
4665 != (unsigned HOST_WIDE_INT) vectorization_factor)
4666 && vect_print_dump_info (REPORT_DETAILS))
4667 fprintf (vect_dump, "multiple-types.");
4669 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def)
4671 if (vect_print_dump_info (REPORT_DETAILS))
4672 fprintf (vect_dump, "transform phi.");
4673 vect_transform_stmt (phi, NULL, NULL, NULL, NULL);
4677 for (si = gsi_start_bb (bb); !gsi_end_p (si);)
4679 gimple stmt = gsi_stmt (si);
4680 bool is_store;
4682 if (vect_print_dump_info (REPORT_DETAILS))
4684 fprintf (vect_dump, "------>vectorizing statement: ");
4685 print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
4688 stmt_info = vinfo_for_stmt (stmt);
4690 /* vector stmts created in the outer-loop during vectorization of
4691 stmts in an inner-loop may not have a stmt_info, and do not
4692 need to be vectorized. */
4693 if (!stmt_info)
4695 gsi_next (&si);
4696 continue;
4699 if (MAY_HAVE_DEBUG_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
4700 vect_loop_kill_debug_uses (loop, stmt);
4702 if (!STMT_VINFO_RELEVANT_P (stmt_info)
4703 && !STMT_VINFO_LIVE_P (stmt_info))
4705 gsi_next (&si);
4706 continue;
4709 gcc_assert (STMT_VINFO_VECTYPE (stmt_info));
4710 nunits =
4711 (unsigned int) TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
4712 if (!STMT_SLP_TYPE (stmt_info)
4713 && nunits != (unsigned int) vectorization_factor
4714 && vect_print_dump_info (REPORT_DETAILS))
4715 /* For SLP, VF is set according to the unrolling factor, and not
4716 to the vector size, hence for SLP this print is not valid. */
4717 fprintf (vect_dump, "multiple-types.");
4719 /* SLP. Schedule all the SLP instances when the first SLP stmt is
4720 reached. */
4721 if (STMT_SLP_TYPE (stmt_info))
4723 if (!slp_scheduled)
4725 slp_scheduled = true;
4727 if (vect_print_dump_info (REPORT_DETAILS))
4728 fprintf (vect_dump, "=== scheduling SLP instances ===");
4730 vect_schedule_slp (loop_vinfo, NULL);
4733 /* Hybrid SLP stmts must be vectorized in addition to SLP. */
4734 if (!vinfo_for_stmt (stmt) || PURE_SLP_STMT (stmt_info))
4736 gsi_next (&si);
4737 continue;
4741 /* -------- vectorize statement ------------ */
4742 if (vect_print_dump_info (REPORT_DETAILS))
4743 fprintf (vect_dump, "transform statement.");
4745 strided_store = false;
4746 is_store = vect_transform_stmt (stmt, &si, &strided_store, NULL, NULL);
4747 if (is_store)
4749 if (STMT_VINFO_STRIDED_ACCESS (stmt_info))
4751 /* Interleaving. If IS_STORE is TRUE, the vectorization of the
4752 interleaving chain was completed - free all the stores in
4753 the chain. */
4754 vect_remove_stores (DR_GROUP_FIRST_DR (stmt_info));
4755 gsi_remove (&si, true);
4756 continue;
4758 else
4760 /* Free the attached stmt_vec_info and remove the stmt. */
4761 free_stmt_vec_info (stmt);
4762 gsi_remove (&si, true);
4763 continue;
4766 gsi_next (&si);
4767 } /* stmts in BB */
4768 } /* BBs in loop */
4770 slpeel_make_loop_iterate_ntimes (loop, ratio);
4772 /* The memory tags and pointers in vectorized statements need to
4773 have their SSA forms updated. FIXME, why can't this be delayed
4774 until all the loops have been transformed? */
4775 update_ssa (TODO_update_ssa);
4777 if (vect_print_dump_info (REPORT_VECTORIZED_LOCATIONS))
4778 fprintf (vect_dump, "LOOP VECTORIZED.");
4779 if (loop->inner && vect_print_dump_info (REPORT_VECTORIZED_LOCATIONS))
4780 fprintf (vect_dump, "OUTER LOOP VECTORIZED.");