/* Loop Vectorization
   Copyright (C) 2003-2015 Free Software Foundation, Inc.
   Contributed by Dorit Naishlos <dorit@il.ibm.com> and
   Ira Rosen <irar@il.ibm.com>

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free
Software Foundation; either version 3, or (at your option) any later
version.

GCC is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received a copy of the GNU General Public License
along with GCC; see the file COPYING3.  If not see
<http://www.gnu.org/licenses/>.  */
#include "config.h"
#include "system.h"
#include "coretypes.h"
#include "dumpfile.h"
#include "tm.h"
#include "alias.h"
#include "symtab.h"
#include "tree.h"
#include "fold-const.h"
#include "stor-layout.h"
#include "predict.h"
#include "hard-reg-set.h"
#include "function.h"
#include "dominance.h"
#include "cfg.h"
#include "cfganal.h"
#include "basic-block.h"
#include "gimple-pretty-print.h"
#include "tree-ssa-alias.h"
#include "internal-fn.h"
#include "gimple-expr.h"
#include "gimple.h"
#include "gimplify.h"
#include "gimple-iterator.h"
#include "gimplify-me.h"
#include "gimple-ssa.h"
#include "tree-phinodes.h"
#include "ssa-iterators.h"
#include "stringpool.h"
#include "tree-ssanames.h"
#include "tree-ssa-loop-ivopts.h"
#include "tree-ssa-loop-manip.h"
#include "tree-ssa-loop-niter.h"
#include "tree-pass.h"
#include "cfgloop.h"
#include "rtl.h"
#include "flags.h"
#include "insn-config.h"
#include "expmed.h"
#include "dojump.h"
#include "explow.h"
#include "calls.h"
#include "emit-rtl.h"
#include "varasm.h"
#include "stmt.h"
#include "expr.h"
#include "recog.h"
#include "insn-codes.h"
#include "optabs.h"
#include "params.h"
#include "diagnostic-core.h"
#include "tree-chrec.h"
#include "tree-scalar-evolution.h"
#include "tree-vectorizer.h"
#include "target.h"
/* Loop Vectorization Pass.

   This pass tries to vectorize loops.

   For example, the vectorizer transforms the following simple loop:

        short a[N]; short b[N]; short c[N]; int i;

        for (i=0; i<N; i++){
          a[i] = b[i] + c[i];
        }

   as if it were manually vectorized by rewriting the source code into:

        typedef int __attribute__((mode(V8HI))) v8hi;
        short a[N]; short b[N]; short c[N]; int i;
        v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
        v8hi va, vb, vc;

        for (i=0; i<N/8; i++){
          vb = pb[i];
          vc = pc[i];
          va = vb + vc;
          pa[i] = va;
        }

   The main entry to this pass is vectorize_loops(), in which
   the vectorizer applies a set of analyses on a given set of loops,
   followed by the actual vectorization transformation for the loops that
   had successfully passed the analysis phase.
   Throughout this pass we make a distinction between two types of
   data: scalars (which are represented by SSA_NAMES), and memory references
   ("data-refs").  These two types of data require different handling both
   during analysis and transformation.  The types of data-refs that the
   vectorizer currently supports are ARRAY_REFS whose base is an array DECL
   (not a pointer), and INDIRECT_REFS through pointers; both array and
   pointer accesses are required to have a simple (consecutive) access
   pattern.

   Analysis phase:
   ===============
   The driver for the analysis phase is vect_analyze_loop().
   It applies a set of analyses, some of which rely on the scalar evolution
   analyzer (scev) developed by Sebastian Pop.

   During the analysis phase the vectorizer records some information
   per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
   loop, as well as general information about the loop as a whole, which is
   recorded in a "loop_vec_info" struct attached to each loop.

   Transformation phase:
   =====================
   The loop transformation phase scans all the stmts in the loop, and
   creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
   the loop that needs to be vectorized.  It inserts the vector code sequence
   just before the scalar stmt S, and records a pointer to the vector code
   in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
   attached to S).  This pointer will be used for the vectorization of
   following stmts which use the def of stmt S.  Stmt S is removed if it
   writes to memory; otherwise, we rely on dead code elimination for
   removing it.

   For example, say stmt S1 was vectorized into stmt VS1:

   VS1: vb = px[i];
   S1:  b = x[i];    STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
   S2:  a = b;

   To vectorize stmt S2, the vectorizer first finds the stmt that defines
   the operand 'b' (S1), and gets the relevant vector def 'vb' from the
   vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)).  The
   resulting sequence would be:

   VS1: vb = px[i];
   S1:  b = x[i];       STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
   VS2: va = vb;
   S2:  a = b;          STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2

   Operands that are not SSA_NAMEs are data-refs that appear in
   load/store operations (like 'x[i]' in S1), and are handled differently.

   Target modeling:
   =================
   Currently the only target-specific information that is used is the
   size of the vector (in bytes) - "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".
   Targets that can support different sizes of vectors will, for now, need
   to specify one value for "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".  More
   flexibility will be added in the future.

   Since we only vectorize operations whose vector form can be
   expressed using existing tree codes, to verify that an operation is
   supported, the vectorizer checks the relevant optab at the relevant
   machine_mode (e.g., optab_handler (add_optab, V8HImode)).  If
   the value found is CODE_FOR_nothing, then there's no target support, and
   we can't vectorize the stmt.

   For additional information on this project see:
   http://gcc.gnu.org/projects/tree-ssa/vectorization.html
*/
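
/* Illustrative sketch of the optab query described above (a hypothetical
   fragment, not code used by this pass), shown for an addition on eight
   16-bit elements:

     enum insn_code icode = optab_handler (add_optab, V8HImode);
     if (icode == CODE_FOR_nothing)
       ... no target support for a V8HI add, so the stmt is not vectorized ...

   The vectorizer issues such queries per statement, with the optab and
   machine_mode derived from the statement's tree code and vectype.  */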

static void vect_estimate_min_profitable_iters (loop_vec_info, int *, int *);
/* Function vect_determine_vectorization_factor

   Determine the vectorization factor (VF).  VF is the number of data
   elements that are operated upon in parallel in a single iteration of the
   vectorized loop.  For example, when vectorizing a loop that operates on
   4-byte elements, on a target with vector size (VS) of 16 bytes, the VF is
   set to 4, since 4 elements can fit in a single vector register.

   We currently support vectorization of loops in which all types operated
   upon are of the same size.  Therefore this function currently sets VF
   according to the size of the types operated upon, and fails if there are
   multiple sizes in the loop.

   VF is also the factor by which the loop iterations are strip-mined, e.g.:
   original loop:
        for (i=0; i<N; i++){
          a[i] = b[i] + c[i];
        }

   vectorized loop:
        for (i=0; i<N; i+=VF){
          a[i:VF] = b[i:VF] + c[i:VF];
        }  */
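
/* Illustration (hypothetical example, not part of the analysis below): with
   16-byte vectors, a loop that touches only 'short' (2-byte) elements gets
   vectype V8HI and VF = 8.  A statement that mixes 'short' and 'int' takes
   its VF from the smallest scalar type (short -> V8HI -> 8), while its own
   vectype may be V4SI; both vectypes must have the same size in bytes, or
   the statement is rejected as having different sized vector types.  */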
203 static bool
204 vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
206 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
207 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
208 int nbbs = loop->num_nodes;
209 unsigned int vectorization_factor = 0;
210 tree scalar_type;
211 gphi *phi;
212 tree vectype;
213 unsigned int nunits;
214 stmt_vec_info stmt_info;
215 int i;
216 HOST_WIDE_INT dummy;
217 gimple stmt, pattern_stmt = NULL;
218 gimple_seq pattern_def_seq = NULL;
219 gimple_stmt_iterator pattern_def_si = gsi_none ();
220 bool analyze_pattern_stmt = false;
222 if (dump_enabled_p ())
223 dump_printf_loc (MSG_NOTE, vect_location,
224 "=== vect_determine_vectorization_factor ===\n");
226 for (i = 0; i < nbbs; i++)
228 basic_block bb = bbs[i];
230 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
231 gsi_next (&si))
233 phi = si.phi ();
234 stmt_info = vinfo_for_stmt (phi);
235 if (dump_enabled_p ())
237 dump_printf_loc (MSG_NOTE, vect_location, "==> examining phi: ");
238 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
239 dump_printf (MSG_NOTE, "\n");
242 gcc_assert (stmt_info);
244 if (STMT_VINFO_RELEVANT_P (stmt_info))
246 gcc_assert (!STMT_VINFO_VECTYPE (stmt_info));
247 scalar_type = TREE_TYPE (PHI_RESULT (phi));
249 if (dump_enabled_p ())
251 dump_printf_loc (MSG_NOTE, vect_location,
252 "get vectype for scalar type: ");
253 dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
254 dump_printf (MSG_NOTE, "\n");
257 vectype = get_vectype_for_scalar_type (scalar_type);
258 if (!vectype)
260 if (dump_enabled_p ())
262 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
263 "not vectorized: unsupported "
264 "data-type ");
265 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
266 scalar_type);
267 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
269 return false;
271 STMT_VINFO_VECTYPE (stmt_info) = vectype;
273 if (dump_enabled_p ())
275 dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
276 dump_generic_expr (MSG_NOTE, TDF_SLIM, vectype);
277 dump_printf (MSG_NOTE, "\n");
280 nunits = TYPE_VECTOR_SUBPARTS (vectype);
281 if (dump_enabled_p ())
282 dump_printf_loc (MSG_NOTE, vect_location, "nunits = %d\n",
283 nunits);
285 if (!vectorization_factor
286 || (nunits > vectorization_factor))
287 vectorization_factor = nunits;
291 for (gimple_stmt_iterator si = gsi_start_bb (bb);
292 !gsi_end_p (si) || analyze_pattern_stmt;)
294 tree vf_vectype;
296 if (analyze_pattern_stmt)
297 stmt = pattern_stmt;
298 else
299 stmt = gsi_stmt (si);
301 stmt_info = vinfo_for_stmt (stmt);
303 if (dump_enabled_p ())
305 dump_printf_loc (MSG_NOTE, vect_location,
306 "==> examining statement: ");
307 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
308 dump_printf (MSG_NOTE, "\n");
311 gcc_assert (stmt_info);
313 /* Skip stmts which do not need to be vectorized. */
314 if ((!STMT_VINFO_RELEVANT_P (stmt_info)
315 && !STMT_VINFO_LIVE_P (stmt_info))
316 || gimple_clobber_p (stmt))
318 if (STMT_VINFO_IN_PATTERN_P (stmt_info)
319 && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
320 && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
321 || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
323 stmt = pattern_stmt;
324 stmt_info = vinfo_for_stmt (pattern_stmt);
325 if (dump_enabled_p ())
327 dump_printf_loc (MSG_NOTE, vect_location,
328 "==> examining pattern statement: ");
329 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
330 dump_printf (MSG_NOTE, "\n");
333 else
335 if (dump_enabled_p ())
336 dump_printf_loc (MSG_NOTE, vect_location, "skip.\n");
337 gsi_next (&si);
338 continue;
341 else if (STMT_VINFO_IN_PATTERN_P (stmt_info)
342 && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
343 && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
344 || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
345 analyze_pattern_stmt = true;
347 /* If a pattern statement has def stmts, analyze them too. */
348 if (is_pattern_stmt_p (stmt_info))
350 if (pattern_def_seq == NULL)
352 pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
353 pattern_def_si = gsi_start (pattern_def_seq);
355 else if (!gsi_end_p (pattern_def_si))
356 gsi_next (&pattern_def_si);
357 if (pattern_def_seq != NULL)
359 gimple pattern_def_stmt = NULL;
360 stmt_vec_info pattern_def_stmt_info = NULL;
362 while (!gsi_end_p (pattern_def_si))
364 pattern_def_stmt = gsi_stmt (pattern_def_si);
365 pattern_def_stmt_info
366 = vinfo_for_stmt (pattern_def_stmt);
367 if (STMT_VINFO_RELEVANT_P (pattern_def_stmt_info)
368 || STMT_VINFO_LIVE_P (pattern_def_stmt_info))
369 break;
370 gsi_next (&pattern_def_si);
373 if (!gsi_end_p (pattern_def_si))
375 if (dump_enabled_p ())
377 dump_printf_loc (MSG_NOTE, vect_location,
378 "==> examining pattern def stmt: ");
379 dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
380 pattern_def_stmt, 0);
381 dump_printf (MSG_NOTE, "\n");
384 stmt = pattern_def_stmt;
385 stmt_info = pattern_def_stmt_info;
387 else
389 pattern_def_si = gsi_none ();
390 analyze_pattern_stmt = false;
393 else
394 analyze_pattern_stmt = false;
397 if (gimple_get_lhs (stmt) == NULL_TREE
398 /* MASK_STORE has no lhs, but is ok. */
399 && (!is_gimple_call (stmt)
400 || !gimple_call_internal_p (stmt)
401 || gimple_call_internal_fn (stmt) != IFN_MASK_STORE))
403 if (is_gimple_call (stmt))
	      /* Ignore calls with no lhs.  These must be calls to
		 #pragma omp simd functions, and the vectorization factor
		 they really need can't be determined until
		 vectorizable_simd_clone_call.  */
409 if (!analyze_pattern_stmt && gsi_end_p (pattern_def_si))
411 pattern_def_seq = NULL;
412 gsi_next (&si);
414 continue;
416 if (dump_enabled_p ())
418 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
419 "not vectorized: irregular stmt.");
420 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt,
422 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
424 return false;
427 if (VECTOR_MODE_P (TYPE_MODE (gimple_expr_type (stmt))))
429 if (dump_enabled_p ())
431 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
432 "not vectorized: vector stmt in loop:");
433 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
434 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
436 return false;
439 if (STMT_VINFO_VECTYPE (stmt_info))
	      /* The only case in which a vectype has already been set is for
		 stmts that contain a dataref, or for "pattern-stmts" (stmts
		 generated by the vectorizer to represent/replace a certain
		 idiom).  */
445 gcc_assert (STMT_VINFO_DATA_REF (stmt_info)
446 || is_pattern_stmt_p (stmt_info)
447 || !gsi_end_p (pattern_def_si));
448 vectype = STMT_VINFO_VECTYPE (stmt_info);
450 else
452 gcc_assert (!STMT_VINFO_DATA_REF (stmt_info));
453 if (is_gimple_call (stmt)
454 && gimple_call_internal_p (stmt)
455 && gimple_call_internal_fn (stmt) == IFN_MASK_STORE)
456 scalar_type = TREE_TYPE (gimple_call_arg (stmt, 3));
457 else
458 scalar_type = TREE_TYPE (gimple_get_lhs (stmt));
459 if (dump_enabled_p ())
461 dump_printf_loc (MSG_NOTE, vect_location,
462 "get vectype for scalar type: ");
463 dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
464 dump_printf (MSG_NOTE, "\n");
466 vectype = get_vectype_for_scalar_type (scalar_type);
467 if (!vectype)
469 if (dump_enabled_p ())
471 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
472 "not vectorized: unsupported "
473 "data-type ");
474 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
475 scalar_type);
476 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
478 return false;
481 STMT_VINFO_VECTYPE (stmt_info) = vectype;
483 if (dump_enabled_p ())
485 dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
486 dump_generic_expr (MSG_NOTE, TDF_SLIM, vectype);
487 dump_printf (MSG_NOTE, "\n");
	  /* The vectorization factor is determined according to the smallest
	     scalar type (or the largest vector size, but we only
	     support one vector size per loop).  */
494 scalar_type = vect_get_smallest_scalar_type (stmt, &dummy,
495 &dummy);
496 if (dump_enabled_p ())
498 dump_printf_loc (MSG_NOTE, vect_location,
499 "get vectype for scalar type: ");
500 dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
501 dump_printf (MSG_NOTE, "\n");
503 vf_vectype = get_vectype_for_scalar_type (scalar_type);
504 if (!vf_vectype)
506 if (dump_enabled_p ())
508 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
509 "not vectorized: unsupported data-type ");
510 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
511 scalar_type);
512 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
514 return false;
517 if ((GET_MODE_SIZE (TYPE_MODE (vectype))
518 != GET_MODE_SIZE (TYPE_MODE (vf_vectype))))
520 if (dump_enabled_p ())
522 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
523 "not vectorized: different sized vector "
524 "types in statement, ");
525 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
526 vectype);
527 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
528 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
529 vf_vectype);
530 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
532 return false;
535 if (dump_enabled_p ())
537 dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
538 dump_generic_expr (MSG_NOTE, TDF_SLIM, vf_vectype);
539 dump_printf (MSG_NOTE, "\n");
542 nunits = TYPE_VECTOR_SUBPARTS (vf_vectype);
543 if (dump_enabled_p ())
544 dump_printf_loc (MSG_NOTE, vect_location, "nunits = %d\n", nunits);
545 if (!vectorization_factor
546 || (nunits > vectorization_factor))
547 vectorization_factor = nunits;
549 if (!analyze_pattern_stmt && gsi_end_p (pattern_def_si))
551 pattern_def_seq = NULL;
552 gsi_next (&si);
  /* TODO: Analyze cost.  Decide if worthwhile to vectorize.  */
558 if (dump_enabled_p ())
559 dump_printf_loc (MSG_NOTE, vect_location, "vectorization factor = %d\n",
560 vectorization_factor);
561 if (vectorization_factor <= 1)
563 if (dump_enabled_p ())
564 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
565 "not vectorized: unsupported data-type\n");
566 return false;
568 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
570 return true;
/* Function vect_is_simple_iv_evolution.

   FORNOW: A simple evolution of an induction variable in the loop is
   considered a polynomial evolution.  */
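
/* Illustration (hypothetical values): for the IV 'i' of
   'for (i = 0; i < n; i++)' the scalar evolution is typically the chrec
   {0, +, 1}_1; its evolution part in the loop is the constant 1, so the
   evolution is "simple" with *INIT = 0 and *STEP = 1.  An access function
   such as {0, +, {1, +, 1}_1}_1 (a degree-2 polynomial) is rejected because
   its evolution part is itself a chrec.  */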
579 static bool
580 vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init,
581 tree * step)
583 tree init_expr;
584 tree step_expr;
585 tree evolution_part = evolution_part_in_loop_num (access_fn, loop_nb);
586 basic_block bb;
588 /* When there is no evolution in this loop, the evolution function
589 is not "simple". */
590 if (evolution_part == NULL_TREE)
591 return false;
593 /* When the evolution is a polynomial of degree >= 2
594 the evolution function is not "simple". */
595 if (tree_is_chrec (evolution_part))
596 return false;
598 step_expr = evolution_part;
599 init_expr = unshare_expr (initial_condition_in_loop_num (access_fn, loop_nb));
601 if (dump_enabled_p ())
603 dump_printf_loc (MSG_NOTE, vect_location, "step: ");
604 dump_generic_expr (MSG_NOTE, TDF_SLIM, step_expr);
605 dump_printf (MSG_NOTE, ", init: ");
606 dump_generic_expr (MSG_NOTE, TDF_SLIM, init_expr);
607 dump_printf (MSG_NOTE, "\n");
610 *init = init_expr;
611 *step = step_expr;
613 if (TREE_CODE (step_expr) != INTEGER_CST
614 && (TREE_CODE (step_expr) != SSA_NAME
615 || ((bb = gimple_bb (SSA_NAME_DEF_STMT (step_expr)))
616 && flow_bb_inside_loop_p (get_loop (cfun, loop_nb), bb))
617 || (!INTEGRAL_TYPE_P (TREE_TYPE (step_expr))
618 && (!SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr))
619 || !flag_associative_math)))
620 && (TREE_CODE (step_expr) != REAL_CST
621 || !flag_associative_math))
623 if (dump_enabled_p ())
624 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
625 "step unknown.\n");
626 return false;
629 return true;
632 /* Function vect_analyze_scalar_cycles_1.
634 Examine the cross iteration def-use cycles of scalar variables
635 in LOOP. LOOP_VINFO represents the loop that is now being
636 considered for vectorization (can be LOOP, or an outer-loop
637 enclosing LOOP). */
639 static void
640 vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, struct loop *loop)
642 basic_block bb = loop->header;
643 tree init, step;
644 auto_vec<gimple, 64> worklist;
645 gphi_iterator gsi;
646 bool double_reduc;
648 if (dump_enabled_p ())
649 dump_printf_loc (MSG_NOTE, vect_location,
650 "=== vect_analyze_scalar_cycles ===\n");
  /* First - identify all inductions.  Reduction detection assumes that all
     the inductions have been identified; therefore, this order must not be
     changed.  */
655 for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
657 gphi *phi = gsi.phi ();
658 tree access_fn = NULL;
659 tree def = PHI_RESULT (phi);
660 stmt_vec_info stmt_vinfo = vinfo_for_stmt (phi);
662 if (dump_enabled_p ())
664 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: ");
665 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
666 dump_printf (MSG_NOTE, "\n");
      /* Skip virtual phis.  The data dependences that are associated with
	 virtual defs/uses (i.e., memory accesses) are analyzed elsewhere.  */
671 if (virtual_operand_p (def))
672 continue;
674 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_unknown_def_type;
676 /* Analyze the evolution function. */
677 access_fn = analyze_scalar_evolution (loop, def);
678 if (access_fn)
680 STRIP_NOPS (access_fn);
681 if (dump_enabled_p ())
683 dump_printf_loc (MSG_NOTE, vect_location,
684 "Access function of PHI: ");
685 dump_generic_expr (MSG_NOTE, TDF_SLIM, access_fn);
686 dump_printf (MSG_NOTE, "\n");
688 STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo)
689 = evolution_part_in_loop_num (access_fn, loop->num);
692 if (!access_fn
693 || !vect_is_simple_iv_evolution (loop->num, access_fn, &init, &step)
694 || (LOOP_VINFO_LOOP (loop_vinfo) != loop
695 && TREE_CODE (step) != INTEGER_CST))
697 worklist.safe_push (phi);
698 continue;
701 gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo) != NULL_TREE);
703 if (dump_enabled_p ())
704 dump_printf_loc (MSG_NOTE, vect_location, "Detected induction.\n");
705 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_induction_def;
709 /* Second - identify all reductions and nested cycles. */
710 while (worklist.length () > 0)
712 gimple phi = worklist.pop ();
713 tree def = PHI_RESULT (phi);
714 stmt_vec_info stmt_vinfo = vinfo_for_stmt (phi);
715 gimple reduc_stmt;
716 bool nested_cycle;
718 if (dump_enabled_p ())
720 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: ");
721 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
722 dump_printf (MSG_NOTE, "\n");
725 gcc_assert (!virtual_operand_p (def)
726 && STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_unknown_def_type);
728 nested_cycle = (loop != LOOP_VINFO_LOOP (loop_vinfo));
729 reduc_stmt = vect_force_simple_reduction (loop_vinfo, phi, !nested_cycle,
730 &double_reduc);
731 if (reduc_stmt)
733 if (double_reduc)
735 if (dump_enabled_p ())
736 dump_printf_loc (MSG_NOTE, vect_location,
737 "Detected double reduction.\n");
739 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_double_reduction_def;
740 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
741 vect_double_reduction_def;
743 else
745 if (nested_cycle)
747 if (dump_enabled_p ())
748 dump_printf_loc (MSG_NOTE, vect_location,
749 "Detected vectorizable nested cycle.\n");
751 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_nested_cycle;
752 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
753 vect_nested_cycle;
755 else
757 if (dump_enabled_p ())
758 dump_printf_loc (MSG_NOTE, vect_location,
759 "Detected reduction.\n");
761 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def;
762 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
763 vect_reduction_def;
764 /* Store the reduction cycles for possible vectorization in
765 loop-aware SLP. */
766 LOOP_VINFO_REDUCTIONS (loop_vinfo).safe_push (reduc_stmt);
770 else
771 if (dump_enabled_p ())
772 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
773 "Unknown def-use cycle pattern.\n");
/* Function vect_analyze_scalar_cycles.

   Examine the cross iteration def-use cycles of scalar variables, by
   analyzing the loop-header PHIs of scalar variables.  Classify each
   cycle as one of the following: invariant, induction, reduction, unknown.
   We do that for the loop represented by LOOP_VINFO, and also for its
   inner loop, if it exists.

   Examples for scalar cycles:

   Example1: reduction:

              loop1:
              for (i=0; i<N; i++)
                 sum += a[i];

   Example2: induction:

              loop2:
              for (i=0; i<N; i++)
                 a[i] = i;  */
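
/* A further illustration (hypothetical code): when analyzing the outer loop
   of a nest such as

              for (i=0; i<N; i++)
                {
                  s = 0;
                  for (j=0; j<M; j++)
                    s += a[i][j];
                  b[i] = s;
                }

   the cycle of 's' around the inner loop is classified as a nested cycle
   (it would be a plain reduction if the inner loop itself were the loop
   being vectorized), while a sum accumulated across both loops would be
   detected as a double reduction.  */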
799 static void
800 vect_analyze_scalar_cycles (loop_vec_info loop_vinfo)
802 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
804 vect_analyze_scalar_cycles_1 (loop_vinfo, loop);
806 /* When vectorizing an outer-loop, the inner-loop is executed sequentially.
807 Reductions in such inner-loop therefore have different properties than
808 the reductions in the nest that gets vectorized:
809 1. When vectorized, they are executed in the same order as in the original
810 scalar loop, so we can't change the order of computation when
811 vectorizing them.
812 2. FIXME: Inner-loop reductions can be used in the inner-loop, so the
813 current checks are too strict. */
815 if (loop->inner)
816 vect_analyze_scalar_cycles_1 (loop_vinfo, loop->inner);
819 /* Transfer group and reduction information from STMT to its pattern stmt. */
821 static void
822 vect_fixup_reduc_chain (gimple stmt)
824 gimple firstp = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt));
825 gimple stmtp;
826 gcc_assert (!GROUP_FIRST_ELEMENT (vinfo_for_stmt (firstp))
827 && GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)));
828 GROUP_SIZE (vinfo_for_stmt (firstp)) = GROUP_SIZE (vinfo_for_stmt (stmt));
831 stmtp = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt));
832 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmtp)) = firstp;
833 stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmt));
834 if (stmt)
835 GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmtp))
836 = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt));
838 while (stmt);
839 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmtp)) = vect_reduction_def;
842 /* Fixup scalar cycles that now have their stmts detected as patterns. */
844 static void
845 vect_fixup_scalar_cycles_with_patterns (loop_vec_info loop_vinfo)
847 gimple first;
848 unsigned i;
850 FOR_EACH_VEC_ELT (LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo), i, first)
851 if (STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (first)))
853 vect_fixup_reduc_chain (first);
854 LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo)[i]
855 = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (first));
/* Function vect_get_loop_niters.

   Determine the number of iterations the loop executes and place it
   in NUMBER_OF_ITERATIONS.  Place the number of latch iterations
   in NUMBER_OF_ITERATIONSM1.

   Return the loop exit condition.  */
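
/* For instance (hypothetical counts): for a loop whose body executes N
   times, the latch edge is taken N-1 times, so *NUMBER_OF_ITERATIONSM1 is
   set to N-1 and *NUMBER_OF_ITERATIONS to N, the number of times the loop
   header executes.  */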
868 static gcond *
869 vect_get_loop_niters (struct loop *loop, tree *number_of_iterations,
870 tree *number_of_iterationsm1)
872 tree niters;
874 if (dump_enabled_p ())
875 dump_printf_loc (MSG_NOTE, vect_location,
876 "=== get_loop_niters ===\n");
878 niters = number_of_latch_executions (loop);
879 *number_of_iterationsm1 = niters;
881 /* We want the number of loop header executions which is the number
882 of latch executions plus one.
883 ??? For UINT_MAX latch executions this number overflows to zero
884 for loops like do { n++; } while (n != 0); */
885 if (niters && !chrec_contains_undetermined (niters))
886 niters = fold_build2 (PLUS_EXPR, TREE_TYPE (niters), unshare_expr (niters),
887 build_int_cst (TREE_TYPE (niters), 1));
888 *number_of_iterations = niters;
890 return get_loop_exit_condition (loop);
894 /* Function bb_in_loop_p
896 Used as predicate for dfs order traversal of the loop bbs. */
898 static bool
899 bb_in_loop_p (const_basic_block bb, const void *data)
901 const struct loop *const loop = (const struct loop *)data;
902 if (flow_bb_inside_loop_p (loop, bb))
903 return true;
904 return false;
908 /* Function new_loop_vec_info.
910 Create and initialize a new loop_vec_info struct for LOOP, as well as
911 stmt_vec_info structs for all the stmts in LOOP. */
913 static loop_vec_info
914 new_loop_vec_info (struct loop *loop)
916 loop_vec_info res;
917 basic_block *bbs;
918 gimple_stmt_iterator si;
919 unsigned int i, nbbs;
921 res = (loop_vec_info) xcalloc (1, sizeof (struct _loop_vec_info));
922 LOOP_VINFO_LOOP (res) = loop;
924 bbs = get_loop_body (loop);
926 /* Create/Update stmt_info for all stmts in the loop. */
927 for (i = 0; i < loop->num_nodes; i++)
929 basic_block bb = bbs[i];
931 /* BBs in a nested inner-loop will have been already processed (because
932 we will have called vect_analyze_loop_form for any nested inner-loop).
933 Therefore, for stmts in an inner-loop we just want to update the
934 STMT_VINFO_LOOP_VINFO field of their stmt_info to point to the new
935 loop_info of the outer-loop we are currently considering to vectorize
936 (instead of the loop_info of the inner-loop).
937 For stmts in other BBs we need to create a stmt_info from scratch. */
938 if (bb->loop_father != loop)
940 /* Inner-loop bb. */
941 gcc_assert (loop->inner && bb->loop_father == loop->inner);
942 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
944 gimple phi = gsi_stmt (si);
945 stmt_vec_info stmt_info = vinfo_for_stmt (phi);
946 loop_vec_info inner_loop_vinfo =
947 STMT_VINFO_LOOP_VINFO (stmt_info);
948 gcc_assert (loop->inner == LOOP_VINFO_LOOP (inner_loop_vinfo));
949 STMT_VINFO_LOOP_VINFO (stmt_info) = res;
951 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
953 gimple stmt = gsi_stmt (si);
954 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
955 loop_vec_info inner_loop_vinfo =
956 STMT_VINFO_LOOP_VINFO (stmt_info);
957 gcc_assert (loop->inner == LOOP_VINFO_LOOP (inner_loop_vinfo));
958 STMT_VINFO_LOOP_VINFO (stmt_info) = res;
961 else
963 /* bb in current nest. */
964 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
966 gimple phi = gsi_stmt (si);
967 gimple_set_uid (phi, 0);
968 set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, res, NULL));
971 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
973 gimple stmt = gsi_stmt (si);
974 gimple_set_uid (stmt, 0);
975 set_vinfo_for_stmt (stmt, new_stmt_vec_info (stmt, res, NULL));
  /* CHECKME: We want to visit all BBs before their successors (except for
     latch blocks, for which this assertion wouldn't hold).  In the simple
     case of the loop forms we allow, a dfs order of the BBs would be the
     same as reversed postorder traversal, so we are safe.  */
985 free (bbs);
986 bbs = XCNEWVEC (basic_block, loop->num_nodes);
987 nbbs = dfs_enumerate_from (loop->header, 0, bb_in_loop_p,
988 bbs, loop->num_nodes, loop);
989 gcc_assert (nbbs == loop->num_nodes);
991 LOOP_VINFO_BBS (res) = bbs;
992 LOOP_VINFO_NITERSM1 (res) = NULL;
993 LOOP_VINFO_NITERS (res) = NULL;
994 LOOP_VINFO_NITERS_UNCHANGED (res) = NULL;
995 LOOP_VINFO_COST_MODEL_MIN_ITERS (res) = 0;
996 LOOP_VINFO_COST_MODEL_THRESHOLD (res) = 0;
997 LOOP_VINFO_VECTORIZABLE_P (res) = 0;
998 LOOP_VINFO_PEELING_FOR_ALIGNMENT (res) = 0;
999 LOOP_VINFO_VECT_FACTOR (res) = 0;
1000 LOOP_VINFO_LOOP_NEST (res).create (3);
1001 LOOP_VINFO_DATAREFS (res).create (10);
1002 LOOP_VINFO_DDRS (res).create (10 * 10);
1003 LOOP_VINFO_UNALIGNED_DR (res) = NULL;
1004 LOOP_VINFO_MAY_MISALIGN_STMTS (res).create (
1005 PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIGNMENT_CHECKS));
1006 LOOP_VINFO_MAY_ALIAS_DDRS (res).create (
1007 PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS));
1008 LOOP_VINFO_GROUPED_STORES (res).create (10);
1009 LOOP_VINFO_REDUCTIONS (res).create (10);
1010 LOOP_VINFO_REDUCTION_CHAINS (res).create (10);
1011 LOOP_VINFO_SLP_INSTANCES (res).create (10);
1012 LOOP_VINFO_SLP_UNROLLING_FACTOR (res) = 1;
1013 LOOP_VINFO_TARGET_COST_DATA (res) = init_cost (loop);
1014 LOOP_VINFO_PEELING_FOR_GAPS (res) = false;
1015 LOOP_VINFO_PEELING_FOR_NITER (res) = false;
1016 LOOP_VINFO_OPERANDS_SWAPPED (res) = false;
1018 return res;
1022 /* Function destroy_loop_vec_info.
1024 Free LOOP_VINFO struct, as well as all the stmt_vec_info structs of all the
1025 stmts in the loop. */
1027 void
1028 destroy_loop_vec_info (loop_vec_info loop_vinfo, bool clean_stmts)
1030 struct loop *loop;
1031 basic_block *bbs;
1032 int nbbs;
1033 gimple_stmt_iterator si;
1034 int j;
1035 vec<slp_instance> slp_instances;
1036 slp_instance instance;
1037 bool swapped;
1039 if (!loop_vinfo)
1040 return;
1042 loop = LOOP_VINFO_LOOP (loop_vinfo);
1044 bbs = LOOP_VINFO_BBS (loop_vinfo);
1045 nbbs = clean_stmts ? loop->num_nodes : 0;
1046 swapped = LOOP_VINFO_OPERANDS_SWAPPED (loop_vinfo);
1048 for (j = 0; j < nbbs; j++)
1050 basic_block bb = bbs[j];
1051 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
1052 free_stmt_vec_info (gsi_stmt (si));
1054 for (si = gsi_start_bb (bb); !gsi_end_p (si); )
1056 gimple stmt = gsi_stmt (si);
	  /* We may have broken canonical form by moving a constant
	     into RHS1 of a commutative op.  Fix such occurrences.  */
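	  /* E.g. if an earlier operand swap left a hypothetical stmt reading
	     '_1 = 5 + x_2', the operands are swapped back below to the
	     canonical constant-last form '_1 = x_2 + 5'.  */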
1060 if (swapped && is_gimple_assign (stmt))
1062 enum tree_code code = gimple_assign_rhs_code (stmt);
1064 if ((code == PLUS_EXPR
1065 || code == POINTER_PLUS_EXPR
1066 || code == MULT_EXPR)
1067 && CONSTANT_CLASS_P (gimple_assign_rhs1 (stmt)))
1068 swap_ssa_operands (stmt,
1069 gimple_assign_rhs1_ptr (stmt),
1070 gimple_assign_rhs2_ptr (stmt));
1073 /* Free stmt_vec_info. */
1074 free_stmt_vec_info (stmt);
1075 gsi_next (&si);
1079 free (LOOP_VINFO_BBS (loop_vinfo));
1080 vect_destroy_datarefs (loop_vinfo, NULL);
1081 free_dependence_relations (LOOP_VINFO_DDRS (loop_vinfo));
1082 LOOP_VINFO_LOOP_NEST (loop_vinfo).release ();
1083 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).release ();
1084 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).release ();
1085 slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
1086 FOR_EACH_VEC_ELT (slp_instances, j, instance)
1087 vect_free_slp_instance (instance);
1089 LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
1090 LOOP_VINFO_GROUPED_STORES (loop_vinfo).release ();
1091 LOOP_VINFO_REDUCTIONS (loop_vinfo).release ();
1092 LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).release ();
1094 delete LOOP_VINFO_PEELING_HTAB (loop_vinfo);
1095 LOOP_VINFO_PEELING_HTAB (loop_vinfo) = NULL;
1097 destroy_cost_data (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo));
1098 loop_vinfo->scalar_cost_vec.release ();
1100 free (loop_vinfo);
1101 loop->aux = NULL;
1105 /* Calculate the cost of one scalar iteration of the loop. */
1106 static void
1107 vect_get_single_scalar_iteration_cost (loop_vec_info loop_vinfo)
1109 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1110 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1111 int nbbs = loop->num_nodes, factor, scalar_single_iter_cost = 0;
1112 int innerloop_iters, i;
  /* Count statements in the scalar loop.  We use this as the scalar cost
     for a single iteration for now.

     TODO: Add outer loop support.

     TODO: Consider assigning different costs to different scalar
     statements.  */
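  /* Illustration of the counting scheme (hypothetical body): the GIMPLE for
     'a[i] = b[i] + c[i]' consists of two loads, one arithmetic stmt and one
     store, so it contributes two scalar_load costs, one scalar_stmt cost and
     one scalar_store cost per iteration, each weighted by FACTOR when it
     sits in the inner loop of a nest.  */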
1122 /* FORNOW. */
1123 innerloop_iters = 1;
1124 if (loop->inner)
1125 innerloop_iters = 50; /* FIXME */
1127 for (i = 0; i < nbbs; i++)
1129 gimple_stmt_iterator si;
1130 basic_block bb = bbs[i];
1132 if (bb->loop_father == loop->inner)
1133 factor = innerloop_iters;
1134 else
1135 factor = 1;
1137 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1139 gimple stmt = gsi_stmt (si);
1140 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1142 if (!is_gimple_assign (stmt) && !is_gimple_call (stmt))
1143 continue;
1145 /* Skip stmts that are not vectorized inside the loop. */
1146 if (stmt_info
1147 && !STMT_VINFO_RELEVANT_P (stmt_info)
1148 && (!STMT_VINFO_LIVE_P (stmt_info)
1149 || !VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1150 && !STMT_VINFO_IN_PATTERN_P (stmt_info))
1151 continue;
1153 vect_cost_for_stmt kind;
1154 if (STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt)))
1156 if (DR_IS_READ (STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt))))
1157 kind = scalar_load;
1158 else
1159 kind = scalar_store;
1161 else
1162 kind = scalar_stmt;
1164 scalar_single_iter_cost
1165 += record_stmt_cost (&LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1166 factor, kind, NULL, 0, vect_prologue);
1169 LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo)
1170 = scalar_single_iter_cost;
1174 /* Function vect_analyze_loop_1.
1176 Apply a set of analyses on LOOP, and create a loop_vec_info struct
1177 for it. The different analyses will record information in the
1178 loop_vec_info struct. This is a subset of the analyses applied in
1179 vect_analyze_loop, to be applied on an inner-loop nested in the loop
1180 that is now considered for (outer-loop) vectorization. */
1182 static loop_vec_info
1183 vect_analyze_loop_1 (struct loop *loop)
1185 loop_vec_info loop_vinfo;
1187 if (dump_enabled_p ())
1188 dump_printf_loc (MSG_NOTE, vect_location,
1189 "===== analyze_loop_nest_1 =====\n");
  /* Check the CFG characteristics of the loop (nesting, entry/exit, etc.).  */
1193 loop_vinfo = vect_analyze_loop_form (loop);
1194 if (!loop_vinfo)
1196 if (dump_enabled_p ())
1197 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1198 "bad inner-loop form.\n");
1199 return NULL;
1202 return loop_vinfo;
/* Function vect_analyze_loop_form.

   Verify that certain CFG restrictions hold, including:
   - the loop has a pre-header
   - the loop has a single entry and exit
   - the loop exit condition is simple enough, and the number of iterations
     can be analyzed (a countable loop).  */
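
/* For illustration: a counted loop like 'for (i = 0; i < n; i++) a[i] = 0;'
   has a single exit whose iteration count scev can compute, so it passes
   these checks, whereas a pointer-chasing loop such as
   'while (p) p = p->next;' has no computable iteration count and is
   rejected below as non-countable.  */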
1214 loop_vec_info
1215 vect_analyze_loop_form (struct loop *loop)
1217 loop_vec_info loop_vinfo;
1218 gcond *loop_cond;
1219 tree number_of_iterations = NULL, number_of_iterationsm1 = NULL;
1220 loop_vec_info inner_loop_vinfo = NULL;
1222 if (dump_enabled_p ())
1223 dump_printf_loc (MSG_NOTE, vect_location,
1224 "=== vect_analyze_loop_form ===\n");
1226 /* Different restrictions apply when we are considering an inner-most loop,
1227 vs. an outer (nested) loop.
1228 (FORNOW. May want to relax some of these restrictions in the future). */
1230 if (!loop->inner)
      /* Inner-most loop.  We currently require that the number of BBs is
	 exactly 2 (the header and latch).  Vectorizable inner-most loops
	 look like this:

			(pre-header)
			   |
			  header <--------+
			   | |            |
			   | +--> latch --+
			   |
			(exit-bb)   */
1244 if (loop->num_nodes != 2)
1246 if (dump_enabled_p ())
1247 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1248 "not vectorized: control flow in loop.\n");
1249 return NULL;
1252 if (empty_block_p (loop->header))
1254 if (dump_enabled_p ())
1255 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1256 "not vectorized: empty loop.\n");
1257 return NULL;
1260 else
1262 struct loop *innerloop = loop->inner;
1263 edge entryedge;
      /* Nested loop.  We currently require that the loop is doubly-nested,
	 contains a single inner loop, and the number of BBs is exactly 5.
	 Vectorizable outer-loops look like this:

			(pre-header)
			   |
			  header <---+
			   |         |
			  inner-loop |
			   |         |
			  tail ------+
			   |
			(exit-bb)

	 The inner-loop has the properties expected of inner-most loops
	 as described above.  */
1282 if ((loop->inner)->inner || (loop->inner)->next)
1284 if (dump_enabled_p ())
1285 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1286 "not vectorized: multiple nested loops.\n");
1287 return NULL;
1290 /* Analyze the inner-loop. */
1291 inner_loop_vinfo = vect_analyze_loop_1 (loop->inner);
1292 if (!inner_loop_vinfo)
1294 if (dump_enabled_p ())
1295 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1296 "not vectorized: Bad inner loop.\n");
1297 return NULL;
1300 if (!expr_invariant_in_loop_p (loop,
1301 LOOP_VINFO_NITERS (inner_loop_vinfo)))
1303 if (dump_enabled_p ())
1304 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1305 "not vectorized: inner-loop count not"
1306 " invariant.\n");
1307 destroy_loop_vec_info (inner_loop_vinfo, true);
1308 return NULL;
1311 if (loop->num_nodes != 5)
1313 if (dump_enabled_p ())
1314 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1315 "not vectorized: control flow in loop.\n");
1316 destroy_loop_vec_info (inner_loop_vinfo, true);
1317 return NULL;
1320 gcc_assert (EDGE_COUNT (innerloop->header->preds) == 2);
1321 entryedge = EDGE_PRED (innerloop->header, 0);
1322 if (EDGE_PRED (innerloop->header, 0)->src == innerloop->latch)
1323 entryedge = EDGE_PRED (innerloop->header, 1);
1325 if (entryedge->src != loop->header
1326 || !single_exit (innerloop)
1327 || single_exit (innerloop)->dest != EDGE_PRED (loop->latch, 0)->src)
1329 if (dump_enabled_p ())
1330 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1331 "not vectorized: unsupported outerloop form.\n");
1332 destroy_loop_vec_info (inner_loop_vinfo, true);
1333 return NULL;
1336 if (dump_enabled_p ())
1337 dump_printf_loc (MSG_NOTE, vect_location,
1338 "Considering outer-loop vectorization.\n");
1341 if (!single_exit (loop)
1342 || EDGE_COUNT (loop->header->preds) != 2)
1344 if (dump_enabled_p ())
1346 if (!single_exit (loop))
1347 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1348 "not vectorized: multiple exits.\n");
1349 else if (EDGE_COUNT (loop->header->preds) != 2)
1350 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1351 "not vectorized: too many incoming edges.\n");
1353 if (inner_loop_vinfo)
1354 destroy_loop_vec_info (inner_loop_vinfo, true);
1355 return NULL;
  /* We assume that the loop exit condition is at the end of the loop, i.e.,
     that the loop is represented as a do-while (with a proper if-guard
     before the loop if needed), where the loop header contains all the
     executable statements, and the latch is empty.  */
1362 if (!empty_block_p (loop->latch)
1363 || !gimple_seq_empty_p (phi_nodes (loop->latch)))
1365 if (dump_enabled_p ())
1366 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1367 "not vectorized: latch block not empty.\n");
1368 if (inner_loop_vinfo)
1369 destroy_loop_vec_info (inner_loop_vinfo, true);
1370 return NULL;
1373 /* Make sure there exists a single-predecessor exit bb: */
1374 if (!single_pred_p (single_exit (loop)->dest))
1376 edge e = single_exit (loop);
1377 if (!(e->flags & EDGE_ABNORMAL))
1379 split_loop_exit_edge (e);
1380 if (dump_enabled_p ())
1381 dump_printf (MSG_NOTE, "split exit edge.\n");
1383 else
1385 if (dump_enabled_p ())
1386 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1387 "not vectorized: abnormal loop exit edge.\n");
1388 if (inner_loop_vinfo)
1389 destroy_loop_vec_info (inner_loop_vinfo, true);
1390 return NULL;
1394 loop_cond = vect_get_loop_niters (loop, &number_of_iterations,
1395 &number_of_iterationsm1);
1396 if (!loop_cond)
1398 if (dump_enabled_p ())
1399 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1400 "not vectorized: complicated exit condition.\n");
1401 if (inner_loop_vinfo)
1402 destroy_loop_vec_info (inner_loop_vinfo, true);
1403 return NULL;
1406 if (!number_of_iterations
1407 || chrec_contains_undetermined (number_of_iterations))
1409 if (dump_enabled_p ())
1410 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1411 "not vectorized: number of iterations cannot be "
1412 "computed.\n");
1413 if (inner_loop_vinfo)
1414 destroy_loop_vec_info (inner_loop_vinfo, true);
1415 return NULL;
1418 if (integer_zerop (number_of_iterations))
1420 if (dump_enabled_p ())
1421 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1422 "not vectorized: number of iterations = 0.\n");
1423 if (inner_loop_vinfo)
1424 destroy_loop_vec_info (inner_loop_vinfo, true);
1425 return NULL;
1428 loop_vinfo = new_loop_vec_info (loop);
1429 LOOP_VINFO_NITERSM1 (loop_vinfo) = number_of_iterationsm1;
1430 LOOP_VINFO_NITERS (loop_vinfo) = number_of_iterations;
1431 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = number_of_iterations;
1433 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1435 if (dump_enabled_p ())
1437 dump_printf_loc (MSG_NOTE, vect_location,
1438 "Symbolic number of iterations is ");
1439 dump_generic_expr (MSG_NOTE, TDF_DETAILS, number_of_iterations);
1440 dump_printf (MSG_NOTE, "\n");
1444 STMT_VINFO_TYPE (vinfo_for_stmt (loop_cond)) = loop_exit_ctrl_vec_info_type;
  /* CHECKME: May want to keep it around in the future.  */
1447 if (inner_loop_vinfo)
1448 destroy_loop_vec_info (inner_loop_vinfo, false);
1450 gcc_assert (!loop->aux);
1451 loop->aux = loop_vinfo;
1452 return loop_vinfo;
/* Scan the loop stmts and, depending on whether there are any (non-)SLP
   statements, update the vectorization factor.  */
1458 static void
1459 vect_update_vf_for_slp (loop_vec_info loop_vinfo)
1461 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1462 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1463 int nbbs = loop->num_nodes;
1464 unsigned int vectorization_factor;
1465 int i;
1467 if (dump_enabled_p ())
1468 dump_printf_loc (MSG_NOTE, vect_location,
1469 "=== vect_update_vf_for_slp ===\n");
1471 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1472 gcc_assert (vectorization_factor != 0);
  /* If all the stmts in the loop can be SLPed, we perform only SLP, and the
     vectorization factor of the loop is the unrolling factor required by
     the SLP instances.  If that unrolling factor is 1, we say that we
     perform pure SLP on the loop - cross-iteration parallelism is not
     exploited.  */
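  /* Numeric illustration (hypothetical values): if the loop-based VF is 4
     and the SLP instances require an unrolling factor of 2, a hybrid loop
     ends up with VF = least_common_multiple (4, 2) = 4, whereas a pure-SLP
     loop simply takes VF = 2.  */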
1479 bool only_slp_in_loop = true;
1480 for (i = 0; i < nbbs; i++)
1482 basic_block bb = bbs[i];
1483 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1484 gsi_next (&si))
1486 gimple stmt = gsi_stmt (si);
1487 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1488 if (STMT_VINFO_IN_PATTERN_P (stmt_info)
1489 && STMT_VINFO_RELATED_STMT (stmt_info))
1491 stmt = STMT_VINFO_RELATED_STMT (stmt_info);
1492 stmt_info = vinfo_for_stmt (stmt);
1494 if ((STMT_VINFO_RELEVANT_P (stmt_info)
1495 || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1496 && !PURE_SLP_STMT (stmt_info))
1497 /* STMT needs both SLP and loop-based vectorization. */
1498 only_slp_in_loop = false;
1502 if (only_slp_in_loop)
1503 vectorization_factor = LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo);
1504 else
1505 vectorization_factor
1506 = least_common_multiple (vectorization_factor,
1507 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo));
1509 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
1510 if (dump_enabled_p ())
1511 dump_printf_loc (MSG_NOTE, vect_location,
1512 "Updating vectorization factor to %d\n",
1513 vectorization_factor);
1516 /* Function vect_analyze_loop_operations.
1518 Scan the loop stmts and make sure they are all vectorizable. */
1520 static bool
1521 vect_analyze_loop_operations (loop_vec_info loop_vinfo)
1523 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1524 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1525 int nbbs = loop->num_nodes;
1526 unsigned int vectorization_factor;
1527 int i;
1528 stmt_vec_info stmt_info;
1529 bool need_to_vectorize = false;
1530 int min_profitable_iters;
1531 int min_scalar_loop_bound;
1532 unsigned int th;
1533 bool ok;
1534 HOST_WIDE_INT max_niter;
1535 HOST_WIDE_INT estimated_niter;
1536 int min_profitable_estimate;
1538 if (dump_enabled_p ())
1539 dump_printf_loc (MSG_NOTE, vect_location,
1540 "=== vect_analyze_loop_operations ===\n");
1542 for (i = 0; i < nbbs; i++)
1544 basic_block bb = bbs[i];
1546 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
1547 gsi_next (&si))
1549 gphi *phi = si.phi ();
1550 ok = true;
1552 stmt_info = vinfo_for_stmt (phi);
1553 if (dump_enabled_p ())
1555 dump_printf_loc (MSG_NOTE, vect_location, "examining phi: ");
1556 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
1557 dump_printf (MSG_NOTE, "\n");
1560 /* Inner-loop loop-closed exit phi in outer-loop vectorization
1561 (i.e., a phi in the tail of the outer-loop). */
1562 if (! is_loop_header_bb_p (bb))
	      /* FORNOW: we currently don't support the case that these phis
		 are not used in the outer loop (unless it is a double
		 reduction, i.e., this phi is vect_reduction_def), because
		 this case requires us to actually do something here.  */
1568 if ((!STMT_VINFO_RELEVANT_P (stmt_info)
1569 || STMT_VINFO_LIVE_P (stmt_info))
1570 && STMT_VINFO_DEF_TYPE (stmt_info)
1571 != vect_double_reduction_def)
1573 if (dump_enabled_p ())
1574 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1575 "Unsupported loop-closed phi in "
1576 "outer-loop.\n");
1577 return false;
1580 /* If PHI is used in the outer loop, we check that its operand
1581 is defined in the inner loop. */
1582 if (STMT_VINFO_RELEVANT_P (stmt_info))
1584 tree phi_op;
1585 gimple op_def_stmt;
1587 if (gimple_phi_num_args (phi) != 1)
1588 return false;
1590 phi_op = PHI_ARG_DEF (phi, 0);
1591 if (TREE_CODE (phi_op) != SSA_NAME)
1592 return false;
1594 op_def_stmt = SSA_NAME_DEF_STMT (phi_op);
1595 if (gimple_nop_p (op_def_stmt)
1596 || !flow_bb_inside_loop_p (loop, gimple_bb (op_def_stmt))
1597 || !vinfo_for_stmt (op_def_stmt))
1598 return false;
1600 if (STMT_VINFO_RELEVANT (vinfo_for_stmt (op_def_stmt))
1601 != vect_used_in_outer
1602 && STMT_VINFO_RELEVANT (vinfo_for_stmt (op_def_stmt))
1603 != vect_used_in_outer_by_reduction)
1604 return false;
1607 continue;
1610 gcc_assert (stmt_info);
1612 if (STMT_VINFO_LIVE_P (stmt_info))
1614 /* FORNOW: not yet supported. */
1615 if (dump_enabled_p ())
1616 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1617 "not vectorized: value used after loop.\n");
1618 return false;
1621 if (STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope
1622 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
1624 /* A scalar-dependence cycle that we don't support. */
1625 if (dump_enabled_p ())
1626 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1627 "not vectorized: scalar dependence cycle.\n");
1628 return false;
1631 if (STMT_VINFO_RELEVANT_P (stmt_info))
1633 need_to_vectorize = true;
1634 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def)
1635 ok = vectorizable_induction (phi, NULL, NULL);
1638 if (!ok)
1640 if (dump_enabled_p ())
1642 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1643 "not vectorized: relevant phi not "
1644 "supported: ");
1645 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, phi, 0);
1646 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
1648 return false;
1652 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1653 gsi_next (&si))
1655 gimple stmt = gsi_stmt (si);
1656 if (!gimple_clobber_p (stmt)
1657 && !vect_analyze_stmt (stmt, &need_to_vectorize, NULL))
1658 return false;
1660 } /* bbs */
  /* All operations in the loop are either irrelevant (they deal with loop
     control, or are dead), or only used outside the loop and can be moved
     out of the loop (e.g. invariants, inductions).  The loop can be
     optimized away by scalar optimizations.  We're better off not
     touching this loop.  */
1667 if (!need_to_vectorize)
1669 if (dump_enabled_p ())
1670 dump_printf_loc (MSG_NOTE, vect_location,
1671 "All the computation can be taken out of the loop.\n");
1672 if (dump_enabled_p ())
1673 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1674 "not vectorized: redundant loop. no profit to "
1675 "vectorize.\n");
1676 return false;
1679 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1680 gcc_assert (vectorization_factor != 0);
1682 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && dump_enabled_p ())
1683 dump_printf_loc (MSG_NOTE, vect_location,
1684 "vectorization_factor = %d, niters = "
1685 HOST_WIDE_INT_PRINT_DEC "\n", vectorization_factor,
1686 LOOP_VINFO_INT_NITERS (loop_vinfo));
1688 if ((LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1689 && (LOOP_VINFO_INT_NITERS (loop_vinfo) < vectorization_factor))
1690 || ((max_niter = max_stmt_executions_int (loop)) != -1
1691 && (unsigned HOST_WIDE_INT) max_niter < vectorization_factor))
1693 if (dump_enabled_p ())
1694 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1695 "not vectorized: iteration count too small.\n");
1696 if (dump_enabled_p ())
1697 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1698 "not vectorized: iteration count smaller than "
1699 "vectorization factor.\n");
1700 return false;
  /* Analyze cost.  Decide if worthwhile to vectorize.  */
1705 vect_estimate_min_profitable_iters (loop_vinfo, &min_profitable_iters,
1706 &min_profitable_estimate);
1707 LOOP_VINFO_COST_MODEL_MIN_ITERS (loop_vinfo) = min_profitable_iters;
1709 if (min_profitable_iters < 0)
1711 if (dump_enabled_p ())
1712 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1713 "not vectorized: vectorization not profitable.\n");
1714 if (dump_enabled_p ())
1715 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1716 "not vectorized: vector version will never be "
1717 "profitable.\n");
1718 return false;
1721 min_scalar_loop_bound = ((PARAM_VALUE (PARAM_MIN_VECT_LOOP_BOUND)
1722 * vectorization_factor) - 1);
  /* Use the cost model only if it is more conservative than the
     user-specified threshold.  */
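  /* Worked example (hypothetical values): with --param min-vect-loop-bound=2
     and VF = 4, min_scalar_loop_bound is 2 * 4 - 1 = 7; if the cost model
     computed min_profitable_iters = 10, the larger (more conservative)
     value 10 becomes the threshold TH.  */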
1728 th = (unsigned) min_scalar_loop_bound;
1729 if (min_profitable_iters
1730 && (!min_scalar_loop_bound
1731 || min_profitable_iters > min_scalar_loop_bound))
1732 th = (unsigned) min_profitable_iters;
1734 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = th;
1736 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1737 && LOOP_VINFO_INT_NITERS (loop_vinfo) <= th)
1739 if (dump_enabled_p ())
1740 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1741 "not vectorized: vectorization not profitable.\n");
1742 if (dump_enabled_p ())
1743 dump_printf_loc (MSG_NOTE, vect_location,
1744 "not vectorized: iteration count smaller than user "
1745 "specified loop bound parameter or minimum profitable "
1746 "iterations (whichever is more conservative).\n");
1747 return false;
1750 if ((estimated_niter = estimated_stmt_executions_int (loop)) != -1
1751 && ((unsigned HOST_WIDE_INT) estimated_niter
1752 <= MAX (th, (unsigned)min_profitable_estimate)))
1754 if (dump_enabled_p ())
1755 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1756 "not vectorized: estimated iteration count too "
1757 "small.\n");
1758 if (dump_enabled_p ())
1759 dump_printf_loc (MSG_NOTE, vect_location,
1760 "not vectorized: estimated iteration count smaller "
1761 "than specified loop bound parameter or minimum "
1762 "profitable iterations (whichever is more "
1763 "conservative).\n");
1764 return false;
1767 return true;
1771 /* Function vect_analyze_loop_2.
1773 Apply a set of analyses on LOOP, and create a loop_vec_info struct
1774 for it. The different analyses will record information in the
1775 loop_vec_info struct. */
1776 static bool
1777 vect_analyze_loop_2 (loop_vec_info loop_vinfo)
1779 bool ok;
1780 int max_vf = MAX_VECTORIZATION_FACTOR;
1781 int min_vf = 2;
1782 unsigned int th;
1783 unsigned int n_stmts = 0;
  /* Find all data references in the loop (which correspond to vdefs/vuses)
     and analyze their evolution in the loop.  Also adjust the minimal
     vectorization factor according to the loads and stores.

     FORNOW: Handle only simple, array references, whose alignment can be
     forced, and aligned pointer-references.  */
1792 ok = vect_analyze_data_refs (loop_vinfo, NULL, &min_vf, &n_stmts);
1793 if (!ok)
1795 if (dump_enabled_p ())
1796 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1797 "bad data references.\n");
1798 return false;
1801 /* Classify all cross-iteration scalar data-flow cycles.
1802 Cross-iteration cycles caused by virtual phis are analyzed separately. */
1804 vect_analyze_scalar_cycles (loop_vinfo);
1806 vect_pattern_recog (loop_vinfo, NULL);
1808 vect_fixup_scalar_cycles_with_patterns (loop_vinfo);
1810 /* Analyze the access patterns of the data-refs in the loop (consecutive,
1811 complex, etc.). FORNOW: Only handle consecutive access pattern. */
1813 ok = vect_analyze_data_ref_accesses (loop_vinfo, NULL);
1814 if (!ok)
1816 if (dump_enabled_p ())
1817 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1818 "bad data access.\n");
1819 return false;
1822 /* Data-flow analysis to detect stmts that do not need to be vectorized. */
1824 ok = vect_mark_stmts_to_be_vectorized (loop_vinfo);
1825 if (!ok)
1827 if (dump_enabled_p ())
1828 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1829 "unexpected pattern.\n");
1830 return false;
1833 /* Analyze data dependences between the data-refs in the loop
1834 and adjust the maximum vectorization factor according to
1835 the dependences.
1836 FORNOW: fail at the first data dependence that we encounter. */
1838 ok = vect_analyze_data_ref_dependences (loop_vinfo, &max_vf);
1839 if (!ok
1840 || max_vf < min_vf)
1842 if (dump_enabled_p ())
1843 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1844 "bad data dependence.\n");
1845 return false;
1848 ok = vect_determine_vectorization_factor (loop_vinfo);
1849 if (!ok)
1851 if (dump_enabled_p ())
1852 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1853 "can't determine vectorization factor.\n");
1854 return false;
1856 if (max_vf < LOOP_VINFO_VECT_FACTOR (loop_vinfo))
1858 if (dump_enabled_p ())
1859 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1860 "bad data dependence.\n");
1861 return false;
1864 /* Check the SLP opportunities in the loop, analyze and build SLP trees. */
1865 ok = vect_analyze_slp (loop_vinfo, NULL, n_stmts);
1866 if (!ok)
1867 return false;
1869 /* If there are any SLP instances mark them as pure_slp. */
1870 bool slp = vect_make_slp_decision (loop_vinfo);
1871 if (slp)
1873 /* Find stmts that need to be both vectorized and SLPed. */
1874 vect_detect_hybrid_slp (loop_vinfo);
1876 /* Update the vectorization factor based on the SLP decision. */
1877 vect_update_vf_for_slp (loop_vinfo);
1880 /* Analyze the alignment of the data-refs in the loop.
1881 Fail if a data reference is found that cannot be vectorized. */
1883 ok = vect_analyze_data_refs_alignment (loop_vinfo, NULL);
1884 if (!ok)
1886 if (dump_enabled_p ())
1887 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1888 "bad data alignment.\n");
1889 return false;
1892 /* Prune the list of ddrs to be tested at run-time by versioning for alias.
1893 It is important to call pruning after vect_analyze_data_ref_accesses,
1894 since we use grouping information gathered by interleaving analysis. */
1895 ok = vect_prune_runtime_alias_test_list (loop_vinfo);
1896 if (!ok)
1898 if (dump_enabled_p ())
1899 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1900 "number of versioning for alias "
1901 "run-time tests exceeds %d "
1902 "(--param vect-max-version-for-alias-checks)\n",
1903 PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS));
1904 return false;
1907 /* Compute the scalar iteration cost. */
1908 vect_get_single_scalar_iteration_cost (loop_vinfo);
1910 /* This pass will decide on using loop versioning and/or loop peeling in
1911 order to enhance the alignment of data references in the loop. */
1913 ok = vect_enhance_data_refs_alignment (loop_vinfo);
1914 if (!ok)
1916 if (dump_enabled_p ())
1917 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1918 "bad data alignment.\n");
1919 return false;
1922 if (slp)
1924 /* Analyze operations in the SLP instances. Note this may
1925 remove unsupported SLP instances which makes the above
1926 SLP kind detection invalid. */
1927 unsigned old_size = LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length ();
1928 vect_slp_analyze_operations (LOOP_VINFO_SLP_INSTANCES (loop_vinfo),
1929 LOOP_VINFO_TARGET_COST_DATA (loop_vinfo));
1930 if (LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length () != old_size)
1931 return false;
1934 /* Scan all the remaining operations in the loop that are not subject
1935 to SLP and make sure they are vectorizable. */
1936 ok = vect_analyze_loop_operations (loop_vinfo);
1937 if (!ok)
1939 if (dump_enabled_p ())
1940 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1941 "bad operation or unsupported loop bound.\n");
1942 return false;
1945 /* Decide whether we need to create an epilogue loop to handle
1946 remaining scalar iterations. */
1947 th = ((LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) + 1)
1948 / LOOP_VINFO_VECT_FACTOR (loop_vinfo))
1949 * LOOP_VINFO_VECT_FACTOR (loop_vinfo);
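/* Editorial worked example (not part of the original source): with a
   cost-model threshold of 10 and a vectorization factor of 4, the
   expression above yields th = ((10 + 1) / 4) * 4 = 8 under C integer
   division, i.e. a multiple of the vectorization factor.  */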
1951 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1952 && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) > 0)
1954 if (ctz_hwi (LOOP_VINFO_INT_NITERS (loop_vinfo)
1955 - LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo))
1956 < exact_log2 (LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
1957 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
1959 else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
1960 || (tree_ctz (LOOP_VINFO_NITERS (loop_vinfo))
1961 < (unsigned)exact_log2 (LOOP_VINFO_VECT_FACTOR (loop_vinfo))
1962 /* In case of versioning, check if the maximum number of
1963 iterations is greater than th. If they are identical,
1964 the epilogue is unnecessary. */
1965 && ((!LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo)
1966 && !LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
1967 || (unsigned HOST_WIDE_INT)max_stmt_executions_int
1968 (LOOP_VINFO_LOOP (loop_vinfo)) > th)))
1969 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
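/* Illustrative example (editorial, not from the original source): with a
   known iteration count of 102, no peeling for alignment, a vectorization
   factor of 4 and no versioning required, tree_ctz gives 1 < 2 ==
   exact_log2 (4), so PEELING_FOR_NITER is set and a 2-iteration scalar
   epilogue will be needed; with 100 iterations (a multiple of 4)
   PEELING_FOR_NITER stays false.  */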
1971 /* If an epilogue loop is required make sure we can create one. */
1972 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
1973 || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo))
1975 if (dump_enabled_p ())
1976 dump_printf_loc (MSG_NOTE, vect_location, "epilog loop required\n");
1977 if (!vect_can_advance_ivs_p (loop_vinfo)
1978 || !slpeel_can_duplicate_loop_p (LOOP_VINFO_LOOP (loop_vinfo),
1979 single_exit (LOOP_VINFO_LOOP
1980 (loop_vinfo))))
1982 if (dump_enabled_p ())
1983 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1984 "not vectorized: can't create required "
1985 "epilog loop\n");
1986 return false;
1990 return true;
1993 /* Function vect_analyze_loop.
1995 Apply a set of analyses on LOOP, and create a loop_vec_info struct
1996 for it. The different analyses will record information in the
1997 loop_vec_info struct. */
1998 loop_vec_info
1999 vect_analyze_loop (struct loop *loop)
2001 loop_vec_info loop_vinfo;
2002 unsigned int vector_sizes;
2004 /* Autodetect first vector size we try. */
2005 current_vector_size = 0;
2006 vector_sizes = targetm.vectorize.autovectorize_vector_sizes ();
2008 if (dump_enabled_p ())
2009 dump_printf_loc (MSG_NOTE, vect_location,
2010 "===== analyze_loop_nest =====\n");
2012 if (loop_outer (loop)
2013 && loop_vec_info_for_loop (loop_outer (loop))
2014 && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop))))
2016 if (dump_enabled_p ())
2017 dump_printf_loc (MSG_NOTE, vect_location,
2018 "outer-loop already vectorized.\n");
2019 return NULL;
2022 while (1)
2024 /* Check the CFG characteristics of the loop (nesting, entry/exit). */
2025 loop_vinfo = vect_analyze_loop_form (loop);
2026 if (!loop_vinfo)
2028 if (dump_enabled_p ())
2029 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2030 "bad loop form.\n");
2031 return NULL;
2034 if (vect_analyze_loop_2 (loop_vinfo))
2036 LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1;
2038 return loop_vinfo;
2041 destroy_loop_vec_info (loop_vinfo, true);
2043 vector_sizes &= ~current_vector_size;
2044 if (vector_sizes == 0
2045 || current_vector_size == 0)
2046 return NULL;
2048 /* Try the next biggest vector size. */
2049 current_vector_size = 1 << floor_log2 (vector_sizes);
2050 if (dump_enabled_p ())
2051 dump_printf_loc (MSG_NOTE, vect_location,
2052 "***** Re-trying analysis with "
2053 "vector size %d\n", current_vector_size);
2058 /* Function reduction_code_for_scalar_code
2060 Input:
2061 CODE - tree_code of a reduction operation.
2063 Output:
2064 REDUC_CODE - the corresponding tree-code to be used to reduce the
2065 vector of partial results into a single scalar result, or ERROR_MARK
2066 if the operation is a supported reduction operation, but does not have
2067 such a tree-code.
2069 Return FALSE if CODE currently cannot be vectorized as reduction. */
2071 static bool
2072 reduction_code_for_scalar_code (enum tree_code code,
2073 enum tree_code *reduc_code)
2075 switch (code)
2077 case MAX_EXPR:
2078 *reduc_code = REDUC_MAX_EXPR;
2079 return true;
2081 case MIN_EXPR:
2082 *reduc_code = REDUC_MIN_EXPR;
2083 return true;
2085 case PLUS_EXPR:
2086 *reduc_code = REDUC_PLUS_EXPR;
2087 return true;
2089 case MULT_EXPR:
2090 case MINUS_EXPR:
2091 case BIT_IOR_EXPR:
2092 case BIT_XOR_EXPR:
2093 case BIT_AND_EXPR:
2094 *reduc_code = ERROR_MARK;
2095 return true;
2097 default:
2098 return false;
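/* Illustrative use of the helper above (editorial sketch, not from the
   original source):

     enum tree_code reduc_code;
     if (reduction_code_for_scalar_code (PLUS_EXPR, &reduc_code))
       gcc_assert (reduc_code == REDUC_PLUS_EXPR);

   For MULT_EXPR the call still succeeds but sets ERROR_MARK, signalling
   that the reduction epilogue must fall back to shifts/extracts rather
   than a single reduction instruction.  */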
2103 /* Error reporting helper for vect_is_simple_reduction below. GIMPLE statement
2104 STMT is printed with a message MSG. */
2106 static void
2107 report_vect_op (int msg_type, gimple stmt, const char *msg)
2109 dump_printf_loc (msg_type, vect_location, "%s", msg);
2110 dump_gimple_stmt (msg_type, TDF_SLIM, stmt, 0);
2111 dump_printf (msg_type, "\n");
2115 /* Detect SLP reduction of the form:
2117 #a1 = phi <a5, a0>
2118 a2 = operation (a1)
2119 a3 = operation (a2)
2120 a4 = operation (a3)
2121 a5 = operation (a4)
2123 #a = phi <a5>
2125 PHI is the reduction phi node (#a1 = phi <a5, a0> above)
2126 FIRST_STMT is the first reduction stmt in the chain
2127 (a2 = operation (a1)).
2129 Return TRUE if a reduction chain was detected. */
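/* For illustration (editorial note, not from the original source), such a
   chain typically comes from source like

     for (i = 0; i < n; i++)
       sum = sum + a[4*i] + a[4*i+1] + a[4*i+2] + a[4*i+3];

   which gimplifies into four chained PLUS_EXPR statements feeding the
   value back to the reduction phi.  */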
2131 static bool
2132 vect_is_slp_reduction (loop_vec_info loop_info, gimple phi, gimple first_stmt)
2134 struct loop *loop = (gimple_bb (phi))->loop_father;
2135 struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
2136 enum tree_code code;
2137 gimple current_stmt = NULL, loop_use_stmt = NULL, first, next_stmt;
2138 stmt_vec_info use_stmt_info, current_stmt_info;
2139 tree lhs;
2140 imm_use_iterator imm_iter;
2141 use_operand_p use_p;
2142 int nloop_uses, size = 0, n_out_of_loop_uses;
2143 bool found = false;
2145 if (loop != vect_loop)
2146 return false;
2148 lhs = PHI_RESULT (phi);
2149 code = gimple_assign_rhs_code (first_stmt);
2150 while (1)
2152 nloop_uses = 0;
2153 n_out_of_loop_uses = 0;
2154 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
2156 gimple use_stmt = USE_STMT (use_p);
2157 if (is_gimple_debug (use_stmt))
2158 continue;
2160 /* Check if we got back to the reduction phi. */
2161 if (use_stmt == phi)
2163 loop_use_stmt = use_stmt;
2164 found = true;
2165 break;
2168 if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2170 loop_use_stmt = use_stmt;
2171 nloop_uses++;
2173 else
2174 n_out_of_loop_uses++;
2176 /* There can be either a single use in the loop or two uses in
2177 phi nodes. */
2178 if (nloop_uses > 1 || (n_out_of_loop_uses && nloop_uses))
2179 return false;
2182 if (found)
2183 break;
2185 /* We reached a statement with no loop uses. */
2186 if (nloop_uses == 0)
2187 return false;
2189 /* This is a loop exit phi, and we haven't reached the reduction phi. */
2190 if (gimple_code (loop_use_stmt) == GIMPLE_PHI)
2191 return false;
2193 if (!is_gimple_assign (loop_use_stmt)
2194 || code != gimple_assign_rhs_code (loop_use_stmt)
2195 || !flow_bb_inside_loop_p (loop, gimple_bb (loop_use_stmt)))
2196 return false;
2198 /* Insert USE_STMT into reduction chain. */
2199 use_stmt_info = vinfo_for_stmt (loop_use_stmt);
2200 if (current_stmt)
2202 current_stmt_info = vinfo_for_stmt (current_stmt);
2203 GROUP_NEXT_ELEMENT (current_stmt_info) = loop_use_stmt;
2204 GROUP_FIRST_ELEMENT (use_stmt_info)
2205 = GROUP_FIRST_ELEMENT (current_stmt_info);
2207 else
2208 GROUP_FIRST_ELEMENT (use_stmt_info) = loop_use_stmt;
2210 lhs = gimple_assign_lhs (loop_use_stmt);
2211 current_stmt = loop_use_stmt;
2212 size++;
2215 if (!found || loop_use_stmt != phi || size < 2)
2216 return false;
2218 /* Swap the operands, if needed, to make the reduction operand be the second
2219 operand. */
2220 lhs = PHI_RESULT (phi);
2221 next_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (current_stmt));
2222 while (next_stmt)
2224 if (gimple_assign_rhs2 (next_stmt) == lhs)
2226 tree op = gimple_assign_rhs1 (next_stmt);
2227 gimple def_stmt = NULL;
2229 if (TREE_CODE (op) == SSA_NAME)
2230 def_stmt = SSA_NAME_DEF_STMT (op);
2232 /* Check that the other def is either defined in the loop
2233 ("vect_internal_def"), or it's an induction (defined by a
2234 loop-header phi-node). */
2235 if (def_stmt
2236 && gimple_bb (def_stmt)
2237 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2238 && (is_gimple_assign (def_stmt)
2239 || is_gimple_call (def_stmt)
2240 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2241 == vect_induction_def
2242 || (gimple_code (def_stmt) == GIMPLE_PHI
2243 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2244 == vect_internal_def
2245 && !is_loop_header_bb_p (gimple_bb (def_stmt)))))
2247 lhs = gimple_assign_lhs (next_stmt);
2248 next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
2249 continue;
2252 return false;
2254 else
2256 tree op = gimple_assign_rhs2 (next_stmt);
2257 gimple def_stmt = NULL;
2259 if (TREE_CODE (op) == SSA_NAME)
2260 def_stmt = SSA_NAME_DEF_STMT (op);
2262 /* Check that the other def is either defined in the loop
2263 ("vect_internal_def"), or it's an induction (defined by a
2264 loop-header phi-node). */
2265 if (def_stmt
2266 && gimple_bb (def_stmt)
2267 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2268 && (is_gimple_assign (def_stmt)
2269 || is_gimple_call (def_stmt)
2270 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2271 == vect_induction_def
2272 || (gimple_code (def_stmt) == GIMPLE_PHI
2273 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2274 == vect_internal_def
2275 && !is_loop_header_bb_p (gimple_bb (def_stmt)))))
2277 if (dump_enabled_p ())
2279 dump_printf_loc (MSG_NOTE, vect_location, "swapping oprnds: ");
2280 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, next_stmt, 0);
2281 dump_printf (MSG_NOTE, "\n");
2284 swap_ssa_operands (next_stmt,
2285 gimple_assign_rhs1_ptr (next_stmt),
2286 gimple_assign_rhs2_ptr (next_stmt));
2287 update_stmt (next_stmt);
2289 if (CONSTANT_CLASS_P (gimple_assign_rhs1 (next_stmt)))
2290 LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true;
2292 else
2293 return false;
2296 lhs = gimple_assign_lhs (next_stmt);
2297 next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
2300 /* Save the chain for further analysis in SLP detection. */
2301 first = GROUP_FIRST_ELEMENT (vinfo_for_stmt (current_stmt));
2302 LOOP_VINFO_REDUCTION_CHAINS (loop_info).safe_push (first);
2303 GROUP_SIZE (vinfo_for_stmt (first)) = size;
2305 return true;
2309 /* Function vect_is_simple_reduction_1
2311 (1) Detect a cross-iteration def-use cycle that represents a simple
2312 reduction computation. We look for the following pattern:
2314 loop_header:
2315 a1 = phi < a0, a2 >
2316 a3 = ...
2317 a2 = operation (a3, a1)
2321 a3 = ...
2322 loop_header:
2323 a1 = phi < a0, a2 >
2324 a2 = operation (a3, a1)
2326 such that:
2327 1. operation is commutative and associative and it is safe to
2328 change the order of the computation (if CHECK_REDUCTION is true)
2329 2. no uses for a2 in the loop (a2 is used out of the loop)
2330 3. no uses of a1 in the loop besides the reduction operation
2331 4. no uses of a1 outside the loop.
2333 Conditions 1,4 are tested here.
2334 Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.
2336 (2) Detect a cross-iteration def-use cycle in nested loops, i.e.,
2337 nested cycles, if CHECK_REDUCTION is false.
2339 (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double
2340 reductions:
2342 a1 = phi < a0, a2 >
2343 inner loop (def of a3)
2344 a2 = phi < a3 >
2346 If MODIFY is true it also tries to rework the code in-place to enable
2347 detection of more reduction patterns. For the time being we rewrite
2348 "res -= RHS" into "res += -RHS" when it seems worthwhile.  */
2351 static gimple
2352 vect_is_simple_reduction_1 (loop_vec_info loop_info, gimple phi,
2353 bool check_reduction, bool *double_reduc,
2354 bool modify)
2356 struct loop *loop = (gimple_bb (phi))->loop_father;
2357 struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
2358 edge latch_e = loop_latch_edge (loop);
2359 tree loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
2360 gimple def_stmt, def1 = NULL, def2 = NULL;
2361 enum tree_code orig_code, code;
2362 tree op1, op2, op3 = NULL_TREE, op4 = NULL_TREE;
2363 tree type;
2364 int nloop_uses;
2365 tree name;
2366 imm_use_iterator imm_iter;
2367 use_operand_p use_p;
2368 bool phi_def;
2370 *double_reduc = false;
2372 /* If CHECK_REDUCTION is true, we assume inner-most loop vectorization,
2373 otherwise, we assume outer loop vectorization. */
2374 gcc_assert ((check_reduction && loop == vect_loop)
2375 || (!check_reduction && flow_loop_nested_p (vect_loop, loop)));
2377 name = PHI_RESULT (phi);
2378 /* ??? If there are no uses of the PHI result the inner loop reduction
2379 won't be detected as possibly double-reduction by vectorizable_reduction
2380 because that tries to walk the PHI arg from the preheader edge which
2381 can be constant. See PR60382. */
2382 if (has_zero_uses (name))
2383 return NULL;
2384 nloop_uses = 0;
2385 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name)
2387 gimple use_stmt = USE_STMT (use_p);
2388 if (is_gimple_debug (use_stmt))
2389 continue;
2391 if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2393 if (dump_enabled_p ())
2394 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2395 "intermediate value used outside loop.\n");
2397 return NULL;
2400 nloop_uses++;
2401 if (nloop_uses > 1)
2403 if (dump_enabled_p ())
2404 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2405 "reduction used in loop.\n");
2406 return NULL;
2410 if (TREE_CODE (loop_arg) != SSA_NAME)
2412 if (dump_enabled_p ())
2414 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2415 "reduction: not ssa_name: ");
2416 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, loop_arg);
2417 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
2419 return NULL;
2422 def_stmt = SSA_NAME_DEF_STMT (loop_arg);
2423 if (!def_stmt)
2425 if (dump_enabled_p ())
2426 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2427 "reduction: no def_stmt.\n");
2428 return NULL;
2431 if (!is_gimple_assign (def_stmt) && gimple_code (def_stmt) != GIMPLE_PHI)
2433 if (dump_enabled_p ())
2435 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, def_stmt, 0);
2436 dump_printf (MSG_NOTE, "\n");
2438 return NULL;
2441 if (is_gimple_assign (def_stmt))
2443 name = gimple_assign_lhs (def_stmt);
2444 phi_def = false;
2446 else
2448 name = PHI_RESULT (def_stmt);
2449 phi_def = true;
2452 nloop_uses = 0;
2453 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name)
2455 gimple use_stmt = USE_STMT (use_p);
2456 if (is_gimple_debug (use_stmt))
2457 continue;
2458 if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2459 nloop_uses++;
2460 if (nloop_uses > 1)
2462 if (dump_enabled_p ())
2463 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2464 "reduction used in loop.\n");
2465 return NULL;
2469 /* If DEF_STMT is a phi node itself, we expect it to have a single argument
2470 defined in the inner loop. */
2471 if (phi_def)
2473 op1 = PHI_ARG_DEF (def_stmt, 0);
2475 if (gimple_phi_num_args (def_stmt) != 1
2476 || TREE_CODE (op1) != SSA_NAME)
2478 if (dump_enabled_p ())
2479 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2480 "unsupported phi node definition.\n");
2482 return NULL;
2485 def1 = SSA_NAME_DEF_STMT (op1);
2486 if (gimple_bb (def1)
2487 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2488 && loop->inner
2489 && flow_bb_inside_loop_p (loop->inner, gimple_bb (def1))
2490 && is_gimple_assign (def1))
2492 if (dump_enabled_p ())
2493 report_vect_op (MSG_NOTE, def_stmt,
2494 "detected double reduction: ");
2496 *double_reduc = true;
2497 return def_stmt;
2500 return NULL;
2503 code = orig_code = gimple_assign_rhs_code (def_stmt);
2505 /* We can handle "res -= x[i]", which is non-associative, by
2506 simply rewriting it into "res += -x[i]". Avoid changing the
2507 gimple instruction for the first simple tests and only do this
2508 if we're allowed to change code at all. */
2509 if (code == MINUS_EXPR
2510 && modify
2511 && (op1 = gimple_assign_rhs1 (def_stmt))
2512 && TREE_CODE (op1) == SSA_NAME
2513 && SSA_NAME_DEF_STMT (op1) == phi)
2514 code = PLUS_EXPR;
2516 if (check_reduction
2517 && (!commutative_tree_code (code) || !associative_tree_code (code)))
2519 if (dump_enabled_p ())
2520 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2521 "reduction: not commutative/associative: ");
2522 return NULL;
2525 if (get_gimple_rhs_class (code) != GIMPLE_BINARY_RHS)
2527 if (code != COND_EXPR)
2529 if (dump_enabled_p ())
2530 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2531 "reduction: not binary operation: ");
2533 return NULL;
2536 op3 = gimple_assign_rhs1 (def_stmt);
2537 if (COMPARISON_CLASS_P (op3))
2539 op4 = TREE_OPERAND (op3, 1);
2540 op3 = TREE_OPERAND (op3, 0);
2543 op1 = gimple_assign_rhs2 (def_stmt);
2544 op2 = gimple_assign_rhs3 (def_stmt);
2546 if (TREE_CODE (op1) != SSA_NAME && TREE_CODE (op2) != SSA_NAME)
2548 if (dump_enabled_p ())
2549 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2550 "reduction: uses not ssa_names: ");
2552 return NULL;
2555 else
2557 op1 = gimple_assign_rhs1 (def_stmt);
2558 op2 = gimple_assign_rhs2 (def_stmt);
2560 if (TREE_CODE (op1) != SSA_NAME && TREE_CODE (op2) != SSA_NAME)
2562 if (dump_enabled_p ())
2563 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2564 "reduction: uses not ssa_names: ");
2566 return NULL;
2570 type = TREE_TYPE (gimple_assign_lhs (def_stmt));
2571 if ((TREE_CODE (op1) == SSA_NAME
2572 && !types_compatible_p (type,TREE_TYPE (op1)))
2573 || (TREE_CODE (op2) == SSA_NAME
2574 && !types_compatible_p (type, TREE_TYPE (op2)))
2575 || (op3 && TREE_CODE (op3) == SSA_NAME
2576 && !types_compatible_p (type, TREE_TYPE (op3)))
2577 || (op4 && TREE_CODE (op4) == SSA_NAME
2578 && !types_compatible_p (type, TREE_TYPE (op4))))
2580 if (dump_enabled_p ())
2582 dump_printf_loc (MSG_NOTE, vect_location,
2583 "reduction: multiple types: operation type: ");
2584 dump_generic_expr (MSG_NOTE, TDF_SLIM, type);
2585 dump_printf (MSG_NOTE, ", operands types: ");
2586 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2587 TREE_TYPE (op1));
2588 dump_printf (MSG_NOTE, ",");
2589 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2590 TREE_TYPE (op2));
2591 if (op3)
2593 dump_printf (MSG_NOTE, ",");
2594 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2595 TREE_TYPE (op3));
2598 if (op4)
2600 dump_printf (MSG_NOTE, ",");
2601 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2602 TREE_TYPE (op4));
2604 dump_printf (MSG_NOTE, "\n");
2607 return NULL;
2610 /* Check that it's ok to change the order of the computation.
2611 Generally, when vectorizing a reduction we change the order of the
2612 computation. This may change the behavior of the program in some
2613 cases, so we need to check that this is ok. One exception is when
2614 vectorizing an outer-loop: the inner-loop is executed sequentially,
2615 and therefore vectorizing reductions in the inner-loop during
2616 outer-loop vectorization is safe. */
2618 /* CHECKME: check for !flag_finite_math_only too? */
2619 if (SCALAR_FLOAT_TYPE_P (type) && !flag_associative_math
2620 && check_reduction)
2622 /* Changing the order of operations changes the semantics. */
2623 if (dump_enabled_p ())
2624 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2625 "reduction: unsafe fp math optimization: ");
2626 return NULL;
2628 else if (INTEGRAL_TYPE_P (type) && TYPE_OVERFLOW_TRAPS (type)
2629 && check_reduction)
2631 /* Changing the order of operations changes the semantics. */
2632 if (dump_enabled_p ())
2633 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2634 "reduction: unsafe int math optimization: ");
2635 return NULL;
2637 else if (SAT_FIXED_POINT_TYPE_P (type) && check_reduction)
2639 /* Changing the order of operations changes the semantics. */
2640 if (dump_enabled_p ())
2641 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2642 "reduction: unsafe fixed-point math optimization: ");
2643 return NULL;
2646 /* If we detected "res -= x[i]" earlier, rewrite it into
2647 "res += -x[i]" now. If this turns out to be useless reassoc
2648 will clean it up again. */
2649 if (orig_code == MINUS_EXPR)
2651 tree rhs = gimple_assign_rhs2 (def_stmt);
2652 tree negrhs = make_ssa_name (TREE_TYPE (rhs));
2653 gimple negate_stmt = gimple_build_assign (negrhs, NEGATE_EXPR, rhs);
2654 gimple_stmt_iterator gsi = gsi_for_stmt (def_stmt);
2655 set_vinfo_for_stmt (negate_stmt, new_stmt_vec_info (negate_stmt,
2656 loop_info, NULL));
2657 gsi_insert_before (&gsi, negate_stmt, GSI_NEW_STMT);
2658 gimple_assign_set_rhs2 (def_stmt, negrhs);
2659 gimple_assign_set_rhs_code (def_stmt, PLUS_EXPR);
2660 update_stmt (def_stmt);
2663 /* Reduction is safe. We're dealing with one of the following:
2664 1) integer arithmetic and no trapv
2665 2) floating point arithmetic, and special flags permit this optimization
2666 3) nested cycle (i.e., outer loop vectorization). */
2667 if (TREE_CODE (op1) == SSA_NAME)
2668 def1 = SSA_NAME_DEF_STMT (op1);
2670 if (TREE_CODE (op2) == SSA_NAME)
2671 def2 = SSA_NAME_DEF_STMT (op2);
2673 if (code != COND_EXPR
2674 && ((!def1 || gimple_nop_p (def1)) && (!def2 || gimple_nop_p (def2))))
2676 if (dump_enabled_p ())
2677 report_vect_op (MSG_NOTE, def_stmt, "reduction: no defs for operands: ");
2678 return NULL;
2681 /* Check that one def is the reduction def, defined by PHI,
2682 the other def is either defined in the loop ("vect_internal_def"),
2683 or it's an induction (defined by a loop-header phi-node). */
2685 if (def2 && def2 == phi
2686 && (code == COND_EXPR
2687 || !def1 || gimple_nop_p (def1)
2688 || !flow_bb_inside_loop_p (loop, gimple_bb (def1))
2689 || (def1 && flow_bb_inside_loop_p (loop, gimple_bb (def1))
2690 && (is_gimple_assign (def1)
2691 || is_gimple_call (def1)
2692 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1))
2693 == vect_induction_def
2694 || (gimple_code (def1) == GIMPLE_PHI
2695 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1))
2696 == vect_internal_def
2697 && !is_loop_header_bb_p (gimple_bb (def1)))))))
2699 if (dump_enabled_p ())
2700 report_vect_op (MSG_NOTE, def_stmt, "detected reduction: ");
2701 return def_stmt;
2704 if (def1 && def1 == phi
2705 && (code == COND_EXPR
2706 || !def2 || gimple_nop_p (def2)
2707 || !flow_bb_inside_loop_p (loop, gimple_bb (def2))
2708 || (def2 && flow_bb_inside_loop_p (loop, gimple_bb (def2))
2709 && (is_gimple_assign (def2)
2710 || is_gimple_call (def2)
2711 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2))
2712 == vect_induction_def
2713 || (gimple_code (def2) == GIMPLE_PHI
2714 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2))
2715 == vect_internal_def
2716 && !is_loop_header_bb_p (gimple_bb (def2)))))))
2718 if (check_reduction)
2720 /* Swap operands (just for simplicity - so that the rest of the code
2721 can assume that the reduction variable is always the last (second)
2722 argument). */
2723 if (dump_enabled_p ())
2724 report_vect_op (MSG_NOTE, def_stmt,
2725 "detected reduction: need to swap operands: ");
2727 swap_ssa_operands (def_stmt, gimple_assign_rhs1_ptr (def_stmt),
2728 gimple_assign_rhs2_ptr (def_stmt));
2730 if (CONSTANT_CLASS_P (gimple_assign_rhs1 (def_stmt)))
2731 LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true;
2733 else
2735 if (dump_enabled_p ())
2736 report_vect_op (MSG_NOTE, def_stmt, "detected reduction: ");
2739 return def_stmt;
2742 /* Try to find SLP reduction chain. */
2743 if (check_reduction && vect_is_slp_reduction (loop_info, phi, def_stmt))
2745 if (dump_enabled_p ())
2746 report_vect_op (MSG_NOTE, def_stmt,
2747 "reduction: detected reduction chain: ");
2749 return def_stmt;
2752 if (dump_enabled_p ())
2753 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2754 "reduction: unknown pattern: ");
2756 return NULL;
2759 /* Wrapper around vect_is_simple_reduction_1, that won't modify code
2760 in-place. Arguments as there. */
2762 static gimple
2763 vect_is_simple_reduction (loop_vec_info loop_info, gimple phi,
2764 bool check_reduction, bool *double_reduc)
2766 return vect_is_simple_reduction_1 (loop_info, phi, check_reduction,
2767 double_reduc, false);
2770 /* Wrapper around vect_is_simple_reduction_1, which will modify code
2771 in-place if it enables detection of more reductions. Arguments
2772 as there. */
2774 gimple
2775 vect_force_simple_reduction (loop_vec_info loop_info, gimple phi,
2776 bool check_reduction, bool *double_reduc)
2778 return vect_is_simple_reduction_1 (loop_info, phi, check_reduction,
2779 double_reduc, true);
2782 /* Calculate cost of peeling the loop PEEL_ITERS_PROLOGUE times. */
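/* Editorial worked example (not from the original source): for a loop with
   102 known iterations, VF 4 and PEEL_ITERS_PROLOGUE == 1, the function
   below computes *peel_iters_epilogue = (102 - 1) % 4 = 1 and charges the
   scalar per-iteration costs once for the prologue and once for the
   epilogue iteration.  */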
2783 int
2784 vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue,
2785 int *peel_iters_epilogue,
2786 stmt_vector_for_cost *scalar_cost_vec,
2787 stmt_vector_for_cost *prologue_cost_vec,
2788 stmt_vector_for_cost *epilogue_cost_vec)
2790 int retval = 0;
2791 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2793 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
2795 *peel_iters_epilogue = vf/2;
2796 if (dump_enabled_p ())
2797 dump_printf_loc (MSG_NOTE, vect_location,
2798 "cost model: epilogue peel iters set to vf/2 "
2799 "because loop iterations are unknown .\n");
2801 /* If peeled iterations are known but the number of scalar loop
2802 iterations is unknown, count a taken branch per peeled loop. */
2803 retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
2804 NULL, 0, vect_prologue);
2805 retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
2806 NULL, 0, vect_epilogue);
2808 else
2810 int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
2811 peel_iters_prologue = niters < peel_iters_prologue ?
2812 niters : peel_iters_prologue;
2813 *peel_iters_epilogue = (niters - peel_iters_prologue) % vf;
2814 /* If we need to peel for gaps but the epilogue peel count computed above
2815 is zero, we still have to peel VF iterations. */
2816 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) && !*peel_iters_epilogue)
2817 *peel_iters_epilogue = vf;
2820 stmt_info_for_cost *si;
2821 int j;
2822 if (peel_iters_prologue)
2823 FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
2824 retval += record_stmt_cost (prologue_cost_vec,
2825 si->count * peel_iters_prologue,
2826 si->kind, NULL, si->misalign,
2827 vect_prologue);
2828 if (*peel_iters_epilogue)
2829 FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
2830 retval += record_stmt_cost (epilogue_cost_vec,
2831 si->count * *peel_iters_epilogue,
2832 si->kind, NULL, si->misalign,
2833 vect_epilogue);
2835 return retval;
2838 /* Function vect_estimate_min_profitable_iters
2840 Return the number of iterations required for the vector version of the
2841 loop to be profitable relative to the cost of the scalar version of the
2842 loop. */
2844 static void
2845 vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
2846 int *ret_min_profitable_niters,
2847 int *ret_min_profitable_estimate)
2849 int min_profitable_iters;
2850 int min_profitable_estimate;
2851 int peel_iters_prologue;
2852 int peel_iters_epilogue;
2853 unsigned vec_inside_cost = 0;
2854 int vec_outside_cost = 0;
2855 unsigned vec_prologue_cost = 0;
2856 unsigned vec_epilogue_cost = 0;
2857 int scalar_single_iter_cost = 0;
2858 int scalar_outside_cost = 0;
2859 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2860 int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
2861 void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
2863 /* Cost model disabled. */
2864 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
2866 dump_printf_loc (MSG_NOTE, vect_location, "cost model disabled.\n");
2867 *ret_min_profitable_niters = 0;
2868 *ret_min_profitable_estimate = 0;
2869 return;
2872 /* Requires loop versioning tests to handle misalignment. */
2873 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
2875 /* FIXME: Make cost depend on complexity of individual check. */
2876 unsigned len = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ();
2877 (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
2878 vect_prologue);
2879 dump_printf (MSG_NOTE,
2880 "cost model: Adding cost of checks for loop "
2881 "versioning to treat misalignment.\n");
2884 /* Requires loop versioning with alias checks. */
2885 if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
2887 /* FIXME: Make cost depend on complexity of individual check. */
2888 unsigned len = LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).length ();
2889 (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
2890 vect_prologue);
2891 dump_printf (MSG_NOTE,
2892 "cost model: Adding cost of checks for loop "
2893 "versioning aliasing.\n");
2896 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
2897 || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
2898 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken, NULL, 0,
2899 vect_prologue);
2901 /* Count statements in scalar loop. Using this as scalar cost for a single
2902 iteration for now.
2904 TODO: Add outer loop support.
2906 TODO: Consider assigning different costs to different scalar
2907 statements. */
2909 scalar_single_iter_cost
2910 = LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo);
2912 /* Add additional cost for the peeled instructions in prologue and epilogue
2913 loop.
2915 FORNOW: If we don't know the value of peel_iters for prologue or epilogue
2916 at compile time, we assume it's vf/2 (the worst case would be vf-1).
2918 TODO: Build an expression that represents peel_iters for prologue and
2919 epilogue to be used in a run-time test. */
2921 if (npeel < 0)
2923 peel_iters_prologue = vf/2;
2924 dump_printf (MSG_NOTE, "cost model: "
2925 "prologue peel iters set to vf/2.\n");
2927 /* If peeling for alignment is unknown, the loop bound of the main loop becomes
2928 unknown. */
2929 peel_iters_epilogue = vf/2;
2930 dump_printf (MSG_NOTE, "cost model: "
2931 "epilogue peel iters set to vf/2 because "
2932 "peeling for alignment is unknown.\n");
2934 /* If peeled iterations are unknown, count a taken branch and a not taken
2935 branch per peeled loop. Even if scalar loop iterations are known,
2936 vector iterations are not known since peeled prologue iterations are
2937 not known. Hence guards remain the same. */
2938 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
2939 NULL, 0, vect_prologue);
2940 (void) add_stmt_cost (target_cost_data, 1, cond_branch_not_taken,
2941 NULL, 0, vect_prologue);
2942 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
2943 NULL, 0, vect_epilogue);
2944 (void) add_stmt_cost (target_cost_data, 1, cond_branch_not_taken,
2945 NULL, 0, vect_epilogue);
2946 stmt_info_for_cost *si;
2947 int j;
2948 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
2950 struct _stmt_vec_info *stmt_info
2951 = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
2952 (void) add_stmt_cost (target_cost_data,
2953 si->count * peel_iters_prologue,
2954 si->kind, stmt_info, si->misalign,
2955 vect_prologue);
2956 (void) add_stmt_cost (target_cost_data,
2957 si->count * peel_iters_epilogue,
2958 si->kind, stmt_info, si->misalign,
2959 vect_epilogue);
2962 else
2964 stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
2965 stmt_info_for_cost *si;
2966 int j;
2967 void *data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
2969 prologue_cost_vec.create (2);
2970 epilogue_cost_vec.create (2);
2971 peel_iters_prologue = npeel;
2973 (void) vect_get_known_peeling_cost (loop_vinfo, peel_iters_prologue,
2974 &peel_iters_epilogue,
2975 &LOOP_VINFO_SCALAR_ITERATION_COST
2976 (loop_vinfo),
2977 &prologue_cost_vec,
2978 &epilogue_cost_vec);
2980 FOR_EACH_VEC_ELT (prologue_cost_vec, j, si)
2982 struct _stmt_vec_info *stmt_info
2983 = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
2984 (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
2985 si->misalign, vect_prologue);
2988 FOR_EACH_VEC_ELT (epilogue_cost_vec, j, si)
2990 struct _stmt_vec_info *stmt_info
2991 = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
2992 (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
2993 si->misalign, vect_epilogue);
2996 prologue_cost_vec.release ();
2997 epilogue_cost_vec.release ();
3000 /* FORNOW: The scalar outside cost is incremented in one of the
3001 following ways:
3003 1. The vectorizer checks for alignment and aliasing and generates
3004 a condition that allows dynamic vectorization. A cost model
3005 check is ANDED with the versioning condition. Hence scalar code
3006 path now has the added cost of the versioning check.
3008 if (cost > th & versioning_check)
3009 jmp to vector code
3011 Hence the run-time scalar cost is incremented by a not-taken branch cost.
3013 2. The vectorizer then checks if a prologue is required. If the
3014 cost model check was not done before during versioning, it has to
3015 be done before the prologue check.
3017 if (cost <= th)
3018 prologue = scalar_iters
3019 if (prologue == 0)
3020 jmp to vector code
3021 else
3022 execute prologue
3023 if (prologue == num_iters)
3024 go to exit
3026 Hence the run-time scalar cost is incremented by a taken branch,
3027 plus a not-taken branch, plus a taken branch cost.
3029 3. The vectorizer then checks if an epilogue is required. If the
3030 cost model check was not done before during prologue check, it
3031 has to be done with the epilogue check.
3033 if (prologue == 0)
3034 jmp to vector code
3035 else
3036 execute prologue
3037 if (prologue == num_iters)
3038 go to exit
3039 vector code:
3040 if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
3041 jmp to epilogue
3043 Hence the run-time scalar cost should be incremented by 2 taken
3044 branches.
3046 TODO: The back end may reorder the BBs differently and reverse
3047 conditions/branch directions. Change the estimates below to
3048 something more reasonable. */
3050 /* If the number of iterations is known and we do not do versioning, we can
3051 decide whether to vectorize at compile time. Hence the scalar version
3052 does not carry cost model guard costs. */
3053 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
3054 || LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
3055 || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
3057 /* Cost model check occurs at versioning. */
3058 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
3059 || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
3060 scalar_outside_cost += vect_get_stmt_cost (cond_branch_not_taken);
3061 else
3063 /* Cost model check occurs at prologue generation. */
3064 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
3065 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken)
3066 + vect_get_stmt_cost (cond_branch_not_taken);
3067 /* Cost model check occurs at epilogue generation. */
3068 else
3069 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken);
3073 /* Complete the target-specific cost calculations. */
3074 finish_cost (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo), &vec_prologue_cost,
3075 &vec_inside_cost, &vec_epilogue_cost);
3077 vec_outside_cost = (int)(vec_prologue_cost + vec_epilogue_cost);
3079 if (dump_enabled_p ())
3081 dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
3082 dump_printf (MSG_NOTE, " Vector inside of loop cost: %d\n",
3083 vec_inside_cost);
3084 dump_printf (MSG_NOTE, " Vector prologue cost: %d\n",
3085 vec_prologue_cost);
3086 dump_printf (MSG_NOTE, " Vector epilogue cost: %d\n",
3087 vec_epilogue_cost);
3088 dump_printf (MSG_NOTE, " Scalar iteration cost: %d\n",
3089 scalar_single_iter_cost);
3090 dump_printf (MSG_NOTE, " Scalar outside cost: %d\n",
3091 scalar_outside_cost);
3092 dump_printf (MSG_NOTE, " Vector outside cost: %d\n",
3093 vec_outside_cost);
3094 dump_printf (MSG_NOTE, " prologue iterations: %d\n",
3095 peel_iters_prologue);
3096 dump_printf (MSG_NOTE, " epilogue iterations: %d\n",
3097 peel_iters_epilogue);
3100 /* Calculate number of iterations required to make the vector version
3101 profitable, relative to the loop bodies only. The following condition
3102 must hold true:
3103 SIC * niters + SOC > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC
3104 where
3105 SIC = scalar iteration cost, VIC = vector iteration cost,
3106 VOC = vector outside cost, VF = vectorization factor,
3107 PL_ITERS = prologue iterations, EP_ITERS= epilogue iterations
3108 SOC = scalar outside cost for run time cost model check. */
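/* Editorial worked example (not part of the original source): with SIC = 4,
   VIC = 6, VF = 4, VOC = 20, SOC = 0 and no peeling, the computation below
   gives (20 * 4) / (4 * 4 - 6) = 8, and since 4 * 4 * 8 <= 6 * 8 + 20 * 4
   the value is bumped to 9 before being clamped to at least VF and
   decremented for the "niters <= threshold" runtime test.  */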
3110 if ((scalar_single_iter_cost * vf) > (int) vec_inside_cost)
3112 if (vec_outside_cost <= 0)
3113 min_profitable_iters = 1;
3114 else
3116 min_profitable_iters = ((vec_outside_cost - scalar_outside_cost) * vf
3117 - vec_inside_cost * peel_iters_prologue
3118 - vec_inside_cost * peel_iters_epilogue)
3119 / ((scalar_single_iter_cost * vf)
3120 - vec_inside_cost);
3122 if ((scalar_single_iter_cost * vf * min_profitable_iters)
3123 <= (((int) vec_inside_cost * min_profitable_iters)
3124 + (((int) vec_outside_cost - scalar_outside_cost) * vf)))
3125 min_profitable_iters++;
3128 /* vector version will never be profitable. */
3129 else
3131 if (LOOP_VINFO_LOOP (loop_vinfo)->force_vectorize)
3132 warning_at (vect_location, OPT_Wopenmp_simd, "vectorization "
3133 "did not happen for a simd loop");
3135 if (dump_enabled_p ())
3136 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3137 "cost model: the vector iteration cost = %d "
3138 "divided by the scalar iteration cost = %d "
3139 "is greater or equal to the vectorization factor = %d"
3140 ".\n",
3141 vec_inside_cost, scalar_single_iter_cost, vf);
3142 *ret_min_profitable_niters = -1;
3143 *ret_min_profitable_estimate = -1;
3144 return;
3147 dump_printf (MSG_NOTE,
3148 " Calculated minimum iters for profitability: %d\n",
3149 min_profitable_iters);
3151 min_profitable_iters =
3152 min_profitable_iters < vf ? vf : min_profitable_iters;
3154 /* Because the condition we create is:
3155 if (niters <= min_profitable_iters)
3156 then skip the vectorized loop. */
3157 min_profitable_iters--;
3159 if (dump_enabled_p ())
3160 dump_printf_loc (MSG_NOTE, vect_location,
3161 " Runtime profitability threshold = %d\n",
3162 min_profitable_iters);
3164 *ret_min_profitable_niters = min_profitable_iters;
3166 /* Calculate number of iterations required to make the vector version
3167 profitable, relative to the loop bodies only.
3169 Non-vectorized variant is SIC * niters and it must win over vector
3170 variant on the expected loop trip count. The following condition must hold true:
3171 SIC * niters > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC + SOC */
3173 if (vec_outside_cost <= 0)
3174 min_profitable_estimate = 1;
3175 else
3177 min_profitable_estimate = ((vec_outside_cost + scalar_outside_cost) * vf
3178 - vec_inside_cost * peel_iters_prologue
3179 - vec_inside_cost * peel_iters_epilogue)
3180 / ((scalar_single_iter_cost * vf)
3181 - vec_inside_cost);
3183 min_profitable_estimate--;
3184 min_profitable_estimate = MAX (min_profitable_estimate, min_profitable_iters);
3185 if (dump_enabled_p ())
3186 dump_printf_loc (MSG_NOTE, vect_location,
3187 " Static estimate profitability threshold = %d\n",
3188 min_profitable_estimate);
3190 *ret_min_profitable_estimate = min_profitable_estimate;
3193 /* Writes into SEL a mask for a vec_perm, equivalent to a vec_shr by OFFSET
3194 vector elements (not bits) for a vector of mode MODE. */
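/* Editorial example (not in the original source): for a 4-element mode and
   OFFSET == 1 the function below fills SEL with {1, 2, 3, 4}, i.e. a
   permutation that shifts every element down by one position, with
   element indices >= nelt selecting from the permutation's second
   operand.  */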
3195 static void
3196 calc_vec_perm_mask_for_shift (enum machine_mode mode, unsigned int offset,
3197 unsigned char *sel)
3199 unsigned int i, nelt = GET_MODE_NUNITS (mode);
3201 for (i = 0; i < nelt; i++)
3202 sel[i] = (i + offset) & (2*nelt - 1);
3205 /* Checks whether the target supports whole-vector shifts for vectors of mode
3206 MODE. This is the case if _either_ the platform handles vec_shr_optab, _or_
3207 it supports vec_perm_const with masks for all necessary shift amounts. */
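/* Editorial note (not from the original source): the loop below only probes
   the shift amounts nelt/2, nelt/4, ..., 1, i.e. the element counts needed
   when a vector is reduced by successive halving in the reduction
   epilogue.  */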
3208 static bool
3209 have_whole_vector_shift (enum machine_mode mode)
3211 if (optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
3212 return true;
3214 if (direct_optab_handler (vec_perm_const_optab, mode) == CODE_FOR_nothing)
3215 return false;
3217 unsigned int i, nelt = GET_MODE_NUNITS (mode);
3218 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
3220 for (i = nelt/2; i >= 1; i/=2)
3222 calc_vec_perm_mask_for_shift (mode, i, sel);
3223 if (!can_vec_perm_p (mode, false, sel))
3224 return false;
3226 return true;
3229 /* Return the reduction operand (with index REDUC_INDEX) of STMT. */
3231 static tree
3232 get_reduction_op (gimple stmt, int reduc_index)
3234 switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
3236 case GIMPLE_SINGLE_RHS:
3237 gcc_assert (TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt))
3238 == ternary_op);
3239 return TREE_OPERAND (gimple_assign_rhs1 (stmt), reduc_index);
3240 case GIMPLE_UNARY_RHS:
3241 return gimple_assign_rhs1 (stmt);
3242 case GIMPLE_BINARY_RHS:
3243 return (reduc_index
3244 ? gimple_assign_rhs2 (stmt) : gimple_assign_rhs1 (stmt));
3245 case GIMPLE_TERNARY_RHS:
3246 return gimple_op (stmt, reduc_index + 1);
3247 default:
3248 gcc_unreachable ();
3252 /* TODO: Close dependency between vect_model_*_cost and vectorizable_*
3253 functions. Design better to avoid maintenance issues. */
3255 /* Function vect_model_reduction_cost.
3257 Models cost for a reduction operation, including the vector ops
3258 generated within the strip-mine loop, the initial definition before
3259 the loop, and the epilogue code that must be generated. */
3261 static bool
3262 vect_model_reduction_cost (stmt_vec_info stmt_info, enum tree_code reduc_code,
3263 int ncopies, int reduc_index)
3265 int prologue_cost = 0, epilogue_cost = 0;
3266 enum tree_code code;
3267 optab optab;
3268 tree vectype;
3269 gimple stmt, orig_stmt;
3270 tree reduction_op;
3271 machine_mode mode;
3272 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3273 struct loop *loop = NULL;
3274 void *target_cost_data;
3276 if (loop_vinfo)
3278 loop = LOOP_VINFO_LOOP (loop_vinfo);
3279 target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3281 else
3282 target_cost_data = BB_VINFO_TARGET_COST_DATA (STMT_VINFO_BB_VINFO (stmt_info));
3284 /* Cost of reduction op inside loop. */
3285 unsigned inside_cost = add_stmt_cost (target_cost_data, ncopies, vector_stmt,
3286 stmt_info, 0, vect_body);
3287 stmt = STMT_VINFO_STMT (stmt_info);
3289 reduction_op = get_reduction_op (stmt, reduc_index);
3291 vectype = get_vectype_for_scalar_type (TREE_TYPE (reduction_op));
3292 if (!vectype)
3294 if (dump_enabled_p ())
3296 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3297 "unsupported data-type ");
3298 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
3299 TREE_TYPE (reduction_op));
3300 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3302 return false;
3305 mode = TYPE_MODE (vectype);
3306 orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
3308 if (!orig_stmt)
3309 orig_stmt = STMT_VINFO_STMT (stmt_info);
3311 code = gimple_assign_rhs_code (orig_stmt);
3313 /* Add in cost for initial definition. */
3314 prologue_cost += add_stmt_cost (target_cost_data, 1, scalar_to_vec,
3315 stmt_info, 0, vect_prologue);
3317 /* Determine cost of epilogue code.
3319 We have a reduction operator that will reduce the vector in one statement.
3320 Also requires scalar extract. */
3322 if (!loop || !nested_in_vect_loop_p (loop, orig_stmt))
3324 if (reduc_code != ERROR_MARK)
3326 epilogue_cost += add_stmt_cost (target_cost_data, 1, vector_stmt,
3327 stmt_info, 0, vect_epilogue);
3328 epilogue_cost += add_stmt_cost (target_cost_data, 1, vec_to_scalar,
3329 stmt_info, 0, vect_epilogue);
3331 else
3333 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
3334 tree bitsize =
3335 TYPE_SIZE (TREE_TYPE (gimple_assign_lhs (orig_stmt)));
3336 int element_bitsize = tree_to_uhwi (bitsize);
3337 int nelements = vec_size_in_bits / element_bitsize;
3339 optab = optab_for_tree_code (code, vectype, optab_default);
3341 /* We have a whole vector shift available. */
3342 if (VECTOR_MODE_P (mode)
3343 && optab_handler (optab, mode) != CODE_FOR_nothing
3344 && have_whole_vector_shift (mode))
3346 /* Final reduction via vector shifts and the reduction operator.
3347 Also requires scalar extract. */
3348 epilogue_cost += add_stmt_cost (target_cost_data,
3349 exact_log2 (nelements) * 2,
3350 vector_stmt, stmt_info, 0,
3351 vect_epilogue);
3352 epilogue_cost += add_stmt_cost (target_cost_data, 1,
3353 vec_to_scalar, stmt_info, 0,
3354 vect_epilogue);
3356 else
3357 /* Use extracts and reduction op for final reduction. For N
3358 elements, we have N extracts and N-1 reduction ops. */
3359 epilogue_cost += add_stmt_cost (target_cost_data,
3360 nelements + nelements - 1,
3361 vector_stmt, stmt_info, 0,
3362 vect_epilogue);
3366 if (dump_enabled_p ())
3367 dump_printf (MSG_NOTE,
3368 "vect_model_reduction_cost: inside_cost = %d, "
3369 "prologue_cost = %d, epilogue_cost = %d .\n", inside_cost,
3370 prologue_cost, epilogue_cost);
3372 return true;
3376 /* Function vect_model_induction_cost.
3378 Models cost for induction operations. */
3380 static void
3381 vect_model_induction_cost (stmt_vec_info stmt_info, int ncopies)
3383 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3384 void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3385 unsigned inside_cost, prologue_cost;
3387 /* loop cost for vec_loop. */
3388 inside_cost = add_stmt_cost (target_cost_data, ncopies, vector_stmt,
3389 stmt_info, 0, vect_body);
3391 /* prologue cost for vec_init and vec_step. */
3392 prologue_cost = add_stmt_cost (target_cost_data, 2, scalar_to_vec,
3393 stmt_info, 0, vect_prologue);
3395 if (dump_enabled_p ())
3396 dump_printf_loc (MSG_NOTE, vect_location,
3397 "vect_model_induction_cost: inside_cost = %d, "
3398 "prologue_cost = %d .\n", inside_cost, prologue_cost);
3402 /* Function get_initial_def_for_induction
3404 Input:
3405 STMT - a stmt that performs an induction operation in the loop.
3406 IV_PHI - the initial value of the induction variable
3408 Output:
3409 Return a vector variable, initialized with the first VF values of
3410 the induction variable. E.g., for an iv with IV_PHI='X' and
3411 evolution S, for a vector of 4 units, we want to return:
3412 [X, X + S, X + 2*S, X + 3*S]. */
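/* Editorial illustration (not part of the original source): for an
   induction with initial value 5, step 3 and VF 4, the non-nested case
   below builds vec_init = {5, 8, 11, 14} and vec_step = {12, 12, 12, 12},
   so adding vec_step once per vector iteration advances every lane by
   VF * S.  */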
3414 static tree
3415 get_initial_def_for_induction (gimple iv_phi)
3417 stmt_vec_info stmt_vinfo = vinfo_for_stmt (iv_phi);
3418 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
3419 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3420 tree vectype;
3421 int nunits;
3422 edge pe = loop_preheader_edge (loop);
3423 struct loop *iv_loop;
3424 basic_block new_bb;
3425 tree new_vec, vec_init, vec_step, t;
3426 tree new_var;
3427 tree new_name;
3428 gimple init_stmt, new_stmt;
3429 gphi *induction_phi;
3430 tree induc_def, vec_def, vec_dest;
3431 tree init_expr, step_expr;
3432 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3433 int i;
3434 int ncopies;
3435 tree expr;
3436 stmt_vec_info phi_info = vinfo_for_stmt (iv_phi);
3437 bool nested_in_vect_loop = false;
3438 gimple_seq stmts = NULL;
3439 imm_use_iterator imm_iter;
3440 use_operand_p use_p;
3441 gimple exit_phi;
3442 edge latch_e;
3443 tree loop_arg;
3444 gimple_stmt_iterator si;
3445 basic_block bb = gimple_bb (iv_phi);
3446 tree stepvectype;
3447 tree resvectype;
3449 /* Is phi in an inner-loop, while vectorizing an enclosing outer-loop? */
3450 if (nested_in_vect_loop_p (loop, iv_phi))
3452 nested_in_vect_loop = true;
3453 iv_loop = loop->inner;
3455 else
3456 iv_loop = loop;
3457 gcc_assert (iv_loop == (gimple_bb (iv_phi))->loop_father);
3459 latch_e = loop_latch_edge (iv_loop);
3460 loop_arg = PHI_ARG_DEF_FROM_EDGE (iv_phi, latch_e);
3462 step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (phi_info);
3463 gcc_assert (step_expr != NULL_TREE);
3465 pe = loop_preheader_edge (iv_loop);
3466 init_expr = PHI_ARG_DEF_FROM_EDGE (iv_phi,
3467 loop_preheader_edge (iv_loop));
3469 vectype = get_vectype_for_scalar_type (TREE_TYPE (init_expr));
3470 resvectype = get_vectype_for_scalar_type (TREE_TYPE (PHI_RESULT (iv_phi)));
3471 gcc_assert (vectype);
3472 nunits = TYPE_VECTOR_SUBPARTS (vectype);
3473 ncopies = vf / nunits;
3475 gcc_assert (phi_info);
3476 gcc_assert (ncopies >= 1);
3478 /* Convert the step to the desired type. */
3479 step_expr = force_gimple_operand (fold_convert (TREE_TYPE (vectype),
3480 step_expr),
3481 &stmts, true, NULL_TREE);
3482 if (stmts)
3484 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
3485 gcc_assert (!new_bb);
3488 /* Find the first insertion point in the BB. */
3489 si = gsi_after_labels (bb);
3491 /* Create the vector that holds the initial_value of the induction. */
3492 if (nested_in_vect_loop)
3494 /* iv_loop is nested in the loop to be vectorized. init_expr had already
3495 been created during vectorization of previous stmts. We obtain it
3496 from the STMT_VINFO_VEC_STMT of the defining stmt. */
3497 vec_init = vect_get_vec_def_for_operand (init_expr, iv_phi, NULL);
3498 /* If the initial value is not of proper type, convert it. */
3499 if (!useless_type_conversion_p (vectype, TREE_TYPE (vec_init)))
3501 new_stmt
3502 = gimple_build_assign (vect_get_new_vect_var (vectype,
3503 vect_simple_var,
3504 "vec_iv_"),
3505 VIEW_CONVERT_EXPR,
3506 build1 (VIEW_CONVERT_EXPR, vectype,
3507 vec_init));
3508 vec_init = make_ssa_name (gimple_assign_lhs (new_stmt), new_stmt);
3509 gimple_assign_set_lhs (new_stmt, vec_init);
3510 new_bb = gsi_insert_on_edge_immediate (loop_preheader_edge (iv_loop),
3511 new_stmt);
3512 gcc_assert (!new_bb);
3513 set_vinfo_for_stmt (new_stmt,
3514 new_stmt_vec_info (new_stmt, loop_vinfo, NULL));
3517 else
3519 vec<constructor_elt, va_gc> *v;
3521 /* iv_loop is the loop to be vectorized. Create:
3522 vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr) */
3523 new_var = vect_get_new_vect_var (TREE_TYPE (vectype),
3524 vect_scalar_var, "var_");
3525 new_name = force_gimple_operand (fold_convert (TREE_TYPE (vectype),
3526 init_expr),
3527 &stmts, false, new_var);
3528 if (stmts)
3530 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
3531 gcc_assert (!new_bb);
3534 vec_alloc (v, nunits);
3535 bool constant_p = is_gimple_min_invariant (new_name);
3536 CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, new_name);
3537 for (i = 1; i < nunits; i++)
3539 /* Create: new_name_i = new_name + step_expr */
3540 new_name = fold_build2 (PLUS_EXPR, TREE_TYPE (new_name),
3541 new_name, step_expr);
3542 if (!is_gimple_min_invariant (new_name))
3544 init_stmt = gimple_build_assign (new_var, new_name);
3545 new_name = make_ssa_name (new_var, init_stmt);
3546 gimple_assign_set_lhs (init_stmt, new_name);
3547 new_bb = gsi_insert_on_edge_immediate (pe, init_stmt);
3548 gcc_assert (!new_bb);
3549 if (dump_enabled_p ())
3551 dump_printf_loc (MSG_NOTE, vect_location,
3552 "created new init_stmt: ");
3553 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, init_stmt, 0);
3554 dump_printf (MSG_NOTE, "\n");
3556 constant_p = false;
3558 CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, new_name);
3560 /* Create a vector from [new_name_0, new_name_1, ..., new_name_nunits-1] */
3561 if (constant_p)
3562 new_vec = build_vector_from_ctor (vectype, v);
3563 else
3564 new_vec = build_constructor (vectype, v);
3565 vec_init = vect_init_vector (iv_phi, new_vec, vectype, NULL);
3569 /* Create the vector that holds the step of the induction. */
3570 if (nested_in_vect_loop)
3571 /* iv_loop is nested in the loop to be vectorized. Generate:
3572 vec_step = [S, S, S, S] */
3573 new_name = step_expr;
3574 else
3576 /* iv_loop is the loop to be vectorized. Generate:
3577 vec_step = [VF*S, VF*S, VF*S, VF*S] */
3578 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
3580 expr = build_int_cst (integer_type_node, vf);
3581 expr = fold_convert (TREE_TYPE (step_expr), expr);
3583 else
3584 expr = build_int_cst (TREE_TYPE (step_expr), vf);
3585 new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
3586 expr, step_expr);
3587 if (TREE_CODE (step_expr) == SSA_NAME)
3588 new_name = vect_init_vector (iv_phi, new_name,
3589 TREE_TYPE (step_expr), NULL);
3592 t = unshare_expr (new_name);
3593 gcc_assert (CONSTANT_CLASS_P (new_name)
3594 || TREE_CODE (new_name) == SSA_NAME);
3595 stepvectype = get_vectype_for_scalar_type (TREE_TYPE (new_name));
3596 gcc_assert (stepvectype);
3597 new_vec = build_vector_from_val (stepvectype, t);
3598 vec_step = vect_init_vector (iv_phi, new_vec, stepvectype, NULL);
3601 /* Create the following def-use cycle:
3602 loop prolog:
3603 vec_init = ...
3604 vec_step = ...
3605 loop:
3606 vec_iv = PHI <vec_init, vec_loop>
3608 STMT
3610 vec_loop = vec_iv + vec_step; */
3612 /* Create the induction-phi that defines the induction-operand. */
3613 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
3614 induction_phi = create_phi_node (vec_dest, iv_loop->header);
3615 set_vinfo_for_stmt (induction_phi,
3616 new_stmt_vec_info (induction_phi, loop_vinfo, NULL));
3617 induc_def = PHI_RESULT (induction_phi);
3619 /* Create the iv update inside the loop */
3620 new_stmt = gimple_build_assign (vec_dest, PLUS_EXPR, induc_def, vec_step);
3621 vec_def = make_ssa_name (vec_dest, new_stmt);
3622 gimple_assign_set_lhs (new_stmt, vec_def);
3623 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
3624 set_vinfo_for_stmt (new_stmt, new_stmt_vec_info (new_stmt, loop_vinfo,
3625 NULL));
3627 /* Set the arguments of the phi node: */
3628 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
3629 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
3630 UNKNOWN_LOCATION);
3633 /* In case the vectorization factor (VF) is bigger than the number
3634 of elements that we can fit in a vectype (nunits), we have to generate
3635 more than one vector stmt - i.e., we need to "unroll" the
3636 vector stmt by a factor of VF/nunits. For more details see the
3637 documentation in vectorizable_operation.  */
3639 if (ncopies > 1)
3641 stmt_vec_info prev_stmt_vinfo;
3642 /* FORNOW. This restriction should be relaxed. */
3643 gcc_assert (!nested_in_vect_loop);
3645 /* Create the vector that holds the step of the induction. */
3646 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
3648 expr = build_int_cst (integer_type_node, nunits);
3649 expr = fold_convert (TREE_TYPE (step_expr), expr);
3651 else
3652 expr = build_int_cst (TREE_TYPE (step_expr), nunits);
3653 new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
3654 expr, step_expr);
3655 if (TREE_CODE (step_expr) == SSA_NAME)
3656 new_name = vect_init_vector (iv_phi, new_name,
3657 TREE_TYPE (step_expr), NULL);
3658 t = unshare_expr (new_name);
3659 gcc_assert (CONSTANT_CLASS_P (new_name)
3660 || TREE_CODE (new_name) == SSA_NAME);
3661 new_vec = build_vector_from_val (stepvectype, t);
3662 vec_step = vect_init_vector (iv_phi, new_vec, stepvectype, NULL);
3664 vec_def = induc_def;
3665 prev_stmt_vinfo = vinfo_for_stmt (induction_phi);
3666 for (i = 1; i < ncopies; i++)
3668 /* vec_i = vec_prev + vec_step */
3669 new_stmt = gimple_build_assign (vec_dest, PLUS_EXPR,
3670 vec_def, vec_step);
3671 vec_def = make_ssa_name (vec_dest, new_stmt);
3672 gimple_assign_set_lhs (new_stmt, vec_def);
3674 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
3675 if (!useless_type_conversion_p (resvectype, vectype))
3677 new_stmt
3678 = gimple_build_assign
3679 (vect_get_new_vect_var (resvectype, vect_simple_var,
3680 "vec_iv_"),
3681 VIEW_CONVERT_EXPR,
3682 build1 (VIEW_CONVERT_EXPR, resvectype,
3683 gimple_assign_lhs (new_stmt)));
3684 gimple_assign_set_lhs (new_stmt,
3685 make_ssa_name
3686 (gimple_assign_lhs (new_stmt), new_stmt));
3687 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
3689 set_vinfo_for_stmt (new_stmt,
3690 new_stmt_vec_info (new_stmt, loop_vinfo, NULL));
3691 STMT_VINFO_RELATED_STMT (prev_stmt_vinfo) = new_stmt;
3692 prev_stmt_vinfo = vinfo_for_stmt (new_stmt);
3696 if (nested_in_vect_loop)
3698 /* Find the loop-closed exit-phi of the induction, and record
3699 the final vector of induction results: */
3700 exit_phi = NULL;
3701 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
3703 gimple use_stmt = USE_STMT (use_p);
3704 if (is_gimple_debug (use_stmt))
3705 continue;
3707 if (!flow_bb_inside_loop_p (iv_loop, gimple_bb (use_stmt)))
3709 exit_phi = use_stmt;
3710 break;
3713 if (exit_phi)
3715 stmt_vec_info stmt_vinfo = vinfo_for_stmt (exit_phi);
3716 /* FORNOW. Currently not supporting the case that an inner-loop induction
3717 is not used in the outer-loop (i.e. only outside the outer-loop). */
3718 gcc_assert (STMT_VINFO_RELEVANT_P (stmt_vinfo)
3719 && !STMT_VINFO_LIVE_P (stmt_vinfo));
3721 STMT_VINFO_VEC_STMT (stmt_vinfo) = new_stmt;
3722 if (dump_enabled_p ())
3724 dump_printf_loc (MSG_NOTE, vect_location,
3725 "vector of inductions after inner-loop:");
3726 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, new_stmt, 0);
3727 dump_printf (MSG_NOTE, "\n");
3733 if (dump_enabled_p ())
3735 dump_printf_loc (MSG_NOTE, vect_location,
3736 "transform induction: created def-use cycle: ");
3737 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, induction_phi, 0);
3738 dump_printf (MSG_NOTE, "\n");
3739 dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
3740 SSA_NAME_DEF_STMT (vec_def), 0);
3741 dump_printf (MSG_NOTE, "\n");
3744 STMT_VINFO_VEC_STMT (phi_info) = induction_phi;
3745 if (!useless_type_conversion_p (resvectype, vectype))
3747 new_stmt = gimple_build_assign (vect_get_new_vect_var (resvectype,
3748 vect_simple_var,
3749 "vec_iv_"),
3750 VIEW_CONVERT_EXPR,
3751 build1 (VIEW_CONVERT_EXPR, resvectype,
3752 induc_def));
3753 induc_def = make_ssa_name (gimple_assign_lhs (new_stmt), new_stmt);
3754 gimple_assign_set_lhs (new_stmt, induc_def);
3755 si = gsi_after_labels (bb);
3756 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
3757 set_vinfo_for_stmt (new_stmt,
3758 new_stmt_vec_info (new_stmt, loop_vinfo, NULL));
3759 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (new_stmt))
3760 = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (induction_phi));
3763 return induc_def;
3767 /* Function get_initial_def_for_reduction
3769 Input:
3770 STMT - a stmt that performs a reduction operation in the loop.
3771 INIT_VAL - the initial value of the reduction variable
3773 Output:
3774 ADJUSTMENT_DEF - a tree that holds a value to be added to the final result
3775 of the reduction (used for adjusting the epilog - see below).
3776 Return a vector variable, initialized according to the operation that STMT
3777 performs. This vector will be used as the initial value of the
3778 vector of partial results.
3780 Option1 (adjust in epilog): Initialize the vector as follows:
3781 add/bit or/xor: [0,0,...,0,0]
3782 mult/bit and: [1,1,...,1,1]
3783 min/max/cond_expr: [init_val,init_val,..,init_val,init_val]
3784 and when necessary (e.g. add/mult case) let the caller know
3785 that it needs to adjust the result by init_val.
3787 Option2: Initialize the vector as follows:
3788 add/bit or/xor: [init_val,0,0,...,0]
3789 mult/bit and: [init_val,1,1,...,1]
3790 min/max/cond_expr: [init_val,init_val,...,init_val]
3791 and no adjustments are needed.
3793 For example, for the following code:
3795 s = init_val;
3796 for (i=0;i<n;i++)
3797 s = s + a[i];
3799 STMT is 's = s + a[i]', and the reduction variable is 's'.
3800 For a vector of 4 units, we want to return either [0,0,0,init_val],
3801 or [0,0,0,0] and let the caller know that it needs to adjust
3802 the result at the end by 'init_val'.
3804 FORNOW, we use the 'adjust in epilog' scheme (Option1) when ADJUSTMENT_DEF
3805 is not NULL, because this way the initialization vector is simpler (same
3806 element in all entries), and Option2 otherwise.
3808 A cost model should help decide between these two schemes. */
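/* For instance, for the sum above with init_val == 5 and a 4-element
   vector:
   Option1 returns [0,0,0,0] and sets *ADJUSTMENT_DEF to 5, so the caller
   adds 5 back after reducing the partial sums in the epilog;
   Option2 returns [5,0,0,0] and no later adjustment is needed.
   For a MULT_EXPR reduction the neutral element 1 would be used in place
   of 0.  */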
3810 tree
3811 get_initial_def_for_reduction (gimple stmt, tree init_val,
3812 tree *adjustment_def)
3814 stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
3815 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
3816 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3817 tree scalar_type = TREE_TYPE (init_val);
3818 tree vectype = get_vectype_for_scalar_type (scalar_type);
3819 int nunits;
3820 enum tree_code code = gimple_assign_rhs_code (stmt);
3821 tree def_for_init;
3822 tree init_def;
3823 tree *elts;
3824 int i;
3825 bool nested_in_vect_loop = false;
3826 tree init_value;
3827 REAL_VALUE_TYPE real_init_val = dconst0;
3828 int int_init_val = 0;
3829 gimple def_stmt = NULL;
3831 gcc_assert (vectype);
3832 nunits = TYPE_VECTOR_SUBPARTS (vectype);
3834 gcc_assert (POINTER_TYPE_P (scalar_type) || INTEGRAL_TYPE_P (scalar_type)
3835 || SCALAR_FLOAT_TYPE_P (scalar_type));
3837 if (nested_in_vect_loop_p (loop, stmt))
3838 nested_in_vect_loop = true;
3839 else
3840 gcc_assert (loop == (gimple_bb (stmt))->loop_father);
3842 /* In case of double reduction we only create a vector variable to be put
3843 in the reduction phi node. The actual statement creation is done in
3844 vect_create_epilog_for_reduction. */
3845 if (adjustment_def && nested_in_vect_loop
3846 && TREE_CODE (init_val) == SSA_NAME
3847 && (def_stmt = SSA_NAME_DEF_STMT (init_val))
3848 && gimple_code (def_stmt) == GIMPLE_PHI
3849 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
3850 && vinfo_for_stmt (def_stmt)
3851 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
3852 == vect_double_reduction_def)
3854 *adjustment_def = NULL;
3855 return vect_create_destination_var (init_val, vectype);
3858 if (TREE_CONSTANT (init_val))
3860 if (SCALAR_FLOAT_TYPE_P (scalar_type))
3861 init_value = build_real (scalar_type, TREE_REAL_CST (init_val));
3862 else
3863 init_value = build_int_cst (scalar_type, TREE_INT_CST_LOW (init_val));
3865 else
3866 init_value = init_val;
3868 switch (code)
3870 case WIDEN_SUM_EXPR:
3871 case DOT_PROD_EXPR:
3872 case SAD_EXPR:
3873 case PLUS_EXPR:
3874 case MINUS_EXPR:
3875 case BIT_IOR_EXPR:
3876 case BIT_XOR_EXPR:
3877 case MULT_EXPR:
3878 case BIT_AND_EXPR:
3879 /* ADJUSTMENT_DEF is NULL when called from
3880 vect_create_epilog_for_reduction to vectorize double reduction. */
3881 if (adjustment_def)
3883 if (nested_in_vect_loop)
3884 *adjustment_def = vect_get_vec_def_for_operand (init_val, stmt,
3885 NULL);
3886 else
3887 *adjustment_def = init_val;
3890 if (code == MULT_EXPR)
3892 real_init_val = dconst1;
3893 int_init_val = 1;
3896 if (code == BIT_AND_EXPR)
3897 int_init_val = -1;
3899 if (SCALAR_FLOAT_TYPE_P (scalar_type))
3900 def_for_init = build_real (scalar_type, real_init_val);
3901 else
3902 def_for_init = build_int_cst (scalar_type, int_init_val);
3904 /* Create a vector of '0' or '1' except the first element. */
3905 elts = XALLOCAVEC (tree, nunits);
3906 for (i = nunits - 2; i >= 0; --i)
3907 elts[i + 1] = def_for_init;
3909 /* Option1: the first element is '0' or '1' as well. */
3910 if (adjustment_def)
3912 elts[0] = def_for_init;
3913 init_def = build_vector (vectype, elts);
3914 break;
3917 /* Option2: the first element is INIT_VAL. */
3918 elts[0] = init_val;
3919 if (TREE_CONSTANT (init_val))
3920 init_def = build_vector (vectype, elts);
3921 else
3923 vec<constructor_elt, va_gc> *v;
3924 vec_alloc (v, nunits);
3925 CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, init_val);
3926 for (i = 1; i < nunits; ++i)
3927 CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, elts[i]);
3928 init_def = build_constructor (vectype, v);
3931 break;
3933 case MIN_EXPR:
3934 case MAX_EXPR:
3935 case COND_EXPR:
3936 if (adjustment_def)
3938 *adjustment_def = NULL_TREE;
3939 init_def = vect_get_vec_def_for_operand (init_val, stmt, NULL);
3940 break;
3943 init_def = build_vector_from_val (vectype, init_value);
3944 break;
3946 default:
3947 gcc_unreachable ();
3950 return init_def;
3953 /* Function vect_create_epilog_for_reduction
3955 Create code at the loop-epilog to finalize the result of a reduction
3956 computation.
3958 VECT_DEFS is list of vector of partial results, i.e., the lhs's of vector
3959 reduction statements.
3960 STMT is the scalar reduction stmt that is being vectorized.
3961 NCOPIES is > 1 in case the vectorization factor (VF) is bigger than the
3962 number of elements that we can fit in a vectype (nunits). In this case
3963 we have to generate more than one vector stmt - i.e - we need to "unroll"
3964 the vector stmt by a factor VF/nunits. For more details see documentation
3965 in vectorizable_operation.
3966 REDUC_CODE is the tree-code for the epilog reduction.
3967 REDUCTION_PHIS is a list of the phi-nodes that carry the reduction
3968 computation.
3969 REDUC_INDEX is the index of the operand in the right hand side of the
3970 statement that is defined by REDUCTION_PHI.
3971 DOUBLE_REDUC is TRUE if double reduction phi nodes should be handled.
3972 SLP_NODE is an SLP node containing a group of reduction statements. The
3973 first one in this group is STMT.
3975 This function:
3976 1. Creates the reduction def-use cycles: sets the arguments for
3977 REDUCTION_PHIS:
3978 The loop-entry argument is the vectorized initial-value of the reduction.
3979 The loop-latch argument is taken from VECT_DEFS - the vector of partial
3980 sums.
3981 2. "Reduces" each vector of partial results VECT_DEFS into a single result,
3982 by applying the operation specified by REDUC_CODE if available, or by
3983 other means (whole-vector shifts or a scalar loop).
3984 The function also creates a new phi node at the loop exit to preserve
3985 loop-closed form, as illustrated below.
3987 The flow at the entry to this function:
3989 loop:
3990 vec_def = phi <null, null> # REDUCTION_PHI
3991 VECT_DEF = vector_stmt # vectorized form of STMT
3992 s_loop = scalar_stmt # (scalar) STMT
3993 loop_exit:
3994 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
3995 use <s_out0>
3996 use <s_out0>
3998 The above is transformed by this function into:
4000 loop:
4001 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
4002 VECT_DEF = vector_stmt # vectorized form of STMT
4003 s_loop = scalar_stmt # (scalar) STMT
4004 loop_exit:
4005 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
4006 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
4007 v_out2 = reduce <v_out1>
4008 s_out3 = extract_field <v_out2, 0>
4009 s_out4 = adjust_result <s_out3>
4010 use <s_out4>
4011 use <s_out4>
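/* Concretely, for a plain sum vectorized with V4SI under the 'adjust in
   epilog' scheme: v_out1 holds four partial sums [s0,s1,s2,s3], v_out2
   reduces them to s0+s1+s2+s3 (by a target reduction, by whole-vector
   shifts, or by scalar code - see step 2.3 below), s_out3 extracts that
   value from lane 0, and s_out4 adds back the initial value recorded in
   ADJUSTMENT_DEF.  */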
4014 static void
4015 vect_create_epilog_for_reduction (vec<tree> vect_defs, gimple stmt,
4016 int ncopies, enum tree_code reduc_code,
4017 vec<gimple> reduction_phis,
4018 int reduc_index, bool double_reduc,
4019 slp_tree slp_node)
4021 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4022 stmt_vec_info prev_phi_info;
4023 tree vectype;
4024 machine_mode mode;
4025 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4026 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *outer_loop = NULL;
4027 basic_block exit_bb;
4028 tree scalar_dest;
4029 tree scalar_type;
4030 gimple new_phi = NULL, phi;
4031 gimple_stmt_iterator exit_gsi;
4032 tree vec_dest;
4033 tree new_temp = NULL_TREE, new_dest, new_name, new_scalar_dest;
4034 gimple epilog_stmt = NULL;
4035 enum tree_code code = gimple_assign_rhs_code (stmt);
4036 gimple exit_phi;
4037 tree bitsize;
4038 tree adjustment_def = NULL;
4039 tree vec_initial_def = NULL;
4040 tree reduction_op, expr, def;
4041 tree orig_name, scalar_result;
4042 imm_use_iterator imm_iter, phi_imm_iter;
4043 use_operand_p use_p, phi_use_p;
4044 gimple use_stmt, orig_stmt, reduction_phi = NULL;
4045 bool nested_in_vect_loop = false;
4046 auto_vec<gimple> new_phis;
4047 auto_vec<gimple> inner_phis;
4048 enum vect_def_type dt = vect_unknown_def_type;
4049 int j, i;
4050 auto_vec<tree> scalar_results;
4051 unsigned int group_size = 1, k, ratio;
4052 auto_vec<tree> vec_initial_defs;
4053 auto_vec<gimple> phis;
4054 bool slp_reduc = false;
4055 tree new_phi_result;
4056 gimple inner_phi = NULL;
4058 if (slp_node)
4059 group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
4061 if (nested_in_vect_loop_p (loop, stmt))
4063 outer_loop = loop;
4064 loop = loop->inner;
4065 nested_in_vect_loop = true;
4066 gcc_assert (!slp_node);
4069 reduction_op = get_reduction_op (stmt, reduc_index);
4071 vectype = get_vectype_for_scalar_type (TREE_TYPE (reduction_op));
4072 gcc_assert (vectype);
4073 mode = TYPE_MODE (vectype);
4075 /* 1. Create the reduction def-use cycle:
4076 Set the arguments of REDUCTION_PHIS, i.e., transform
4078 loop:
4079 vec_def = phi <null, null> # REDUCTION_PHI
4080 VECT_DEF = vector_stmt # vectorized form of STMT
4083 into:
4085 loop:
4086 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
4087 VECT_DEF = vector_stmt # vectorized form of STMT
4090 (in case of SLP, do it for all the phis). */
4092 /* Get the loop-entry arguments. */
4093 if (slp_node)
4094 vect_get_vec_defs (reduction_op, NULL_TREE, stmt, &vec_initial_defs,
4095 NULL, slp_node, reduc_index);
4096 else
4098 vec_initial_defs.create (1);
4099 /* For the case of reduction, vect_get_vec_def_for_operand returns
4100 the scalar def before the loop, that defines the initial value
4101 of the reduction variable. */
4102 vec_initial_def = vect_get_vec_def_for_operand (reduction_op, stmt,
4103 &adjustment_def);
4104 vec_initial_defs.quick_push (vec_initial_def);
4107 /* Set phi nodes arguments. */
4108 FOR_EACH_VEC_ELT (reduction_phis, i, phi)
4110 tree vec_init_def, def;
4111 gimple_seq stmts;
4112 vec_init_def = force_gimple_operand (vec_initial_defs[i], &stmts,
4113 true, NULL_TREE);
4114 gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), stmts);
4115 def = vect_defs[i];
4116 for (j = 0; j < ncopies; j++)
4118 /* Set the loop-entry arg of the reduction-phi. */
4119 add_phi_arg (as_a <gphi *> (phi), vec_init_def,
4120 loop_preheader_edge (loop), UNKNOWN_LOCATION);
4122 /* Set the loop-latch arg for the reduction-phi. */
4123 if (j > 0)
4124 def = vect_get_vec_def_for_stmt_copy (vect_unknown_def_type, def);
4126 add_phi_arg (as_a <gphi *> (phi), def, loop_latch_edge (loop),
4127 UNKNOWN_LOCATION);
4129 if (dump_enabled_p ())
4131 dump_printf_loc (MSG_NOTE, vect_location,
4132 "transform reduction: created def-use cycle: ");
4133 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
4134 dump_printf (MSG_NOTE, "\n");
4135 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, SSA_NAME_DEF_STMT (def), 0);
4136 dump_printf (MSG_NOTE, "\n");
4139 phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi));
4143 /* 2. Create epilog code.
4144 The reduction epilog code operates across the elements of the vector
4145 of partial results computed by the vectorized loop.
4146 The reduction epilog code consists of:
4148 step 1: compute the scalar result in a vector (v_out2)
4149 step 2: extract the scalar result (s_out3) from the vector (v_out2)
4150 step 3: adjust the scalar result (s_out3) if needed.
4152 Step 1 can be accomplished using one of the following three schemes:
4153 (scheme 1) using reduc_code, if available.
4154 (scheme 2) using whole-vector shifts, if available.
4155 (scheme 3) using a scalar loop. In this case steps 1+2 above are
4156 combined.
4158 The overall epilog code looks like this:
4160 s_out0 = phi <s_loop> # original EXIT_PHI
4161 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
4162 v_out2 = reduce <v_out1> # step 1
4163 s_out3 = extract_field <v_out2, 0> # step 2
4164 s_out4 = adjust_result <s_out3> # step 3
4166 (step 3 is optional, and steps 1 and 2 may be combined).
4167 Lastly, the uses of s_out0 are replaced by s_out4. */
4170 /* 2.1 Create new loop-exit-phis to preserve loop-closed form:
4171 v_out1 = phi <VECT_DEF>
4172 Store them in NEW_PHIS. */
4174 exit_bb = single_exit (loop)->dest;
4175 prev_phi_info = NULL;
4176 new_phis.create (vect_defs.length ());
4177 FOR_EACH_VEC_ELT (vect_defs, i, def)
4179 for (j = 0; j < ncopies; j++)
4181 tree new_def = copy_ssa_name (def);
4182 phi = create_phi_node (new_def, exit_bb);
4183 set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, loop_vinfo, NULL));
4184 if (j == 0)
4185 new_phis.quick_push (phi);
4186 else
4188 def = vect_get_vec_def_for_stmt_copy (dt, def);
4189 STMT_VINFO_RELATED_STMT (prev_phi_info) = phi;
4192 SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, def);
4193 prev_phi_info = vinfo_for_stmt (phi);
4197 /* The epilogue is created for the outer-loop, i.e., for the loop being
4198 vectorized. Create exit phis for the outer loop. */
4199 if (double_reduc)
4201 loop = outer_loop;
4202 exit_bb = single_exit (loop)->dest;
4203 inner_phis.create (vect_defs.length ());
4204 FOR_EACH_VEC_ELT (new_phis, i, phi)
4206 tree new_result = copy_ssa_name (PHI_RESULT (phi));
4207 gphi *outer_phi = create_phi_node (new_result, exit_bb);
4208 SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx,
4209 PHI_RESULT (phi));
4210 set_vinfo_for_stmt (outer_phi, new_stmt_vec_info (outer_phi,
4211 loop_vinfo, NULL));
4212 inner_phis.quick_push (phi);
4213 new_phis[i] = outer_phi;
4214 prev_phi_info = vinfo_for_stmt (outer_phi);
4215 while (STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi)))
4217 phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi));
4218 new_result = copy_ssa_name (PHI_RESULT (phi));
4219 outer_phi = create_phi_node (new_result, exit_bb);
4220 SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx,
4221 PHI_RESULT (phi));
4222 set_vinfo_for_stmt (outer_phi, new_stmt_vec_info (outer_phi,
4223 loop_vinfo, NULL));
4224 STMT_VINFO_RELATED_STMT (prev_phi_info) = outer_phi;
4225 prev_phi_info = vinfo_for_stmt (outer_phi);
4230 exit_gsi = gsi_after_labels (exit_bb);
4232 /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
4233 (i.e. when reduc_code is not available) and in the final adjustment
4234 code (if needed). Also get the original scalar reduction variable as
4235 defined in the loop. In case STMT is a "pattern-stmt" (i.e. - it
4236 represents a reduction pattern), the tree-code and scalar-def are
4237 taken from the original stmt that the pattern-stmt (STMT) replaces.
4238 Otherwise (it is a regular reduction) - the tree-code and scalar-def
4239 are taken from STMT. */
4241 orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
4242 if (!orig_stmt)
4244 /* Regular reduction */
4245 orig_stmt = stmt;
4247 else
4249 /* Reduction pattern */
4250 stmt_vec_info stmt_vinfo = vinfo_for_stmt (orig_stmt);
4251 gcc_assert (STMT_VINFO_IN_PATTERN_P (stmt_vinfo));
4252 gcc_assert (STMT_VINFO_RELATED_STMT (stmt_vinfo) == stmt);
4255 code = gimple_assign_rhs_code (orig_stmt);
4256 /* For MINUS_EXPR the initial vector is [init_val,0,...,0], therefore,
4257 partial results are added and not subtracted. */
4258 if (code == MINUS_EXPR)
4259 code = PLUS_EXPR;
4261 scalar_dest = gimple_assign_lhs (orig_stmt);
4262 scalar_type = TREE_TYPE (scalar_dest);
4263 scalar_results.create (group_size);
4264 new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
4265 bitsize = TYPE_SIZE (scalar_type);
4267 /* In case this is a reduction in an inner-loop while vectorizing an outer
4268 loop - we don't need to extract a single scalar result at the end of the
4269 inner-loop (unless it is double reduction, i.e., the use of reduction is
4270 outside the outer-loop). The final vector of partial results will be used
4271 in the vectorized outer-loop, or reduced to a scalar result at the end of
4272 the outer-loop. */
4273 if (nested_in_vect_loop && !double_reduc)
4274 goto vect_finalize_reduction;
4276 /* SLP reduction without reduction chain, e.g.,
4277 # a1 = phi <a2, a0>
4278 # b1 = phi <b2, b0>
4279 a2 = operation (a1)
4280 b2 = operation (b1) */
4281 slp_reduc = (slp_node && !GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)));
4283 /* In case of reduction chain, e.g.,
4284 # a1 = phi <a3, a0>
4285 a2 = operation (a1)
4286 a3 = operation (a2),
4288 we may end up with more than one vector result. Here we reduce them to
4289 one vector. */
4290 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
4292 tree first_vect = PHI_RESULT (new_phis[0]);
4293 tree tmp;
4294 gassign *new_vec_stmt = NULL;
4296 vec_dest = vect_create_destination_var (scalar_dest, vectype);
4297 for (k = 1; k < new_phis.length (); k++)
4299 gimple next_phi = new_phis[k];
4300 tree second_vect = PHI_RESULT (next_phi);
4302 tmp = build2 (code, vectype, first_vect, second_vect);
4303 new_vec_stmt = gimple_build_assign (vec_dest, tmp);
4304 first_vect = make_ssa_name (vec_dest, new_vec_stmt);
4305 gimple_assign_set_lhs (new_vec_stmt, first_vect);
4306 gsi_insert_before (&exit_gsi, new_vec_stmt, GSI_SAME_STMT);
4309 new_phi_result = first_vect;
4310 if (new_vec_stmt)
4312 new_phis.truncate (0);
4313 new_phis.safe_push (new_vec_stmt);
4316 else
4317 new_phi_result = PHI_RESULT (new_phis[0]);
4319 /* 2.3 Create the reduction code, using one of the three schemes described
4320 above. In SLP we simply need to extract all the elements from the
4321 vector (without reducing them), so we use scalar shifts. */
4322 if (reduc_code != ERROR_MARK && !slp_reduc)
4324 tree tmp;
4325 tree vec_elem_type;
4327 /*** Case 1: Create:
4328 v_out2 = reduc_expr <v_out1> */
4330 if (dump_enabled_p ())
4331 dump_printf_loc (MSG_NOTE, vect_location,
4332 "Reduce using direct vector reduction.\n");
4334 vec_elem_type = TREE_TYPE (TREE_TYPE (new_phi_result));
4335 if (!useless_type_conversion_p (scalar_type, vec_elem_type))
4337 tree tmp_dest =
4338 vect_create_destination_var (scalar_dest, vec_elem_type);
4339 tmp = build1 (reduc_code, vec_elem_type, new_phi_result);
4340 epilog_stmt = gimple_build_assign (tmp_dest, tmp);
4341 new_temp = make_ssa_name (tmp_dest, epilog_stmt);
4342 gimple_assign_set_lhs (epilog_stmt, new_temp);
4343 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4345 tmp = build1 (NOP_EXPR, scalar_type, new_temp);
4347 else
4348 tmp = build1 (reduc_code, scalar_type, new_phi_result);
4349 epilog_stmt = gimple_build_assign (new_scalar_dest, tmp);
4350 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
4351 gimple_assign_set_lhs (epilog_stmt, new_temp);
4352 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4353 scalar_results.safe_push (new_temp);
4355 else
4357 bool reduce_with_shift = have_whole_vector_shift (mode);
4358 int element_bitsize = tree_to_uhwi (bitsize);
4359 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
4360 tree vec_temp;
4362 /* Regardless of whether we have a whole vector shift, if we're
4363 emulating the operation via tree-vect-generic, we don't want
4364 to use it. Only the first round of the reduction is likely
4365 to still be profitable via emulation. */
4366 /* ??? It might be better to emit a reduction tree code here, so that
4367 tree-vect-generic can expand the first round via bit tricks. */
4368 if (!VECTOR_MODE_P (mode))
4369 reduce_with_shift = false;
4370 else
4372 optab optab = optab_for_tree_code (code, vectype, optab_default);
4373 if (optab_handler (optab, mode) == CODE_FOR_nothing)
4374 reduce_with_shift = false;
4377 if (reduce_with_shift && !slp_reduc)
4379 int nelements = vec_size_in_bits / element_bitsize;
4380 unsigned char *sel = XALLOCAVEC (unsigned char, nelements);
4382 int elt_offset;
4384 tree zero_vec = build_zero_cst (vectype);
4385 /*** Case 2: Create:
4386 for (offset = nelements/2; offset >= 1; offset/=2)
4388 Create: va' = vec_shift <va, offset>
4389 Create: va = vop <va, va'>
4390 } */
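/* For a 4-element sum this takes log2(4) == 2 rounds.  Starting from
   va = {a0,a1,a2,a3}:
     round 1: va' = {a2,a3,0,0},       va = {a0+a2, a1+a3, ., .}
     round 2: va' = {a1+a3, ., ., 0},  va = {a0+a1+a2+a3, ., ., .}
   (assuming the permutation shifts elements towards lane 0, the lane
   extracted in step 2.4 below; '.' marks don't-care lanes).  */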
4392 tree rhs;
4394 if (dump_enabled_p ())
4395 dump_printf_loc (MSG_NOTE, vect_location,
4396 "Reduce using vector shifts\n");
4398 vec_dest = vect_create_destination_var (scalar_dest, vectype);
4399 new_temp = new_phi_result;
4400 for (elt_offset = nelements / 2;
4401 elt_offset >= 1;
4402 elt_offset /= 2)
4404 calc_vec_perm_mask_for_shift (mode, elt_offset, sel);
4405 tree mask = vect_gen_perm_mask_any (vectype, sel);
4406 epilog_stmt = gimple_build_assign (vec_dest, VEC_PERM_EXPR,
4407 new_temp, zero_vec, mask);
4408 new_name = make_ssa_name (vec_dest, epilog_stmt);
4409 gimple_assign_set_lhs (epilog_stmt, new_name);
4410 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4412 epilog_stmt = gimple_build_assign (vec_dest, code, new_name,
4413 new_temp);
4414 new_temp = make_ssa_name (vec_dest, epilog_stmt);
4415 gimple_assign_set_lhs (epilog_stmt, new_temp);
4416 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4419 /* 2.4 Extract the final scalar result. Create:
4420 s_out3 = extract_field <v_out2, bitpos> */
4422 if (dump_enabled_p ())
4423 dump_printf_loc (MSG_NOTE, vect_location,
4424 "extract scalar result\n");
4426 rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp,
4427 bitsize, bitsize_zero_node);
4428 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
4429 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
4430 gimple_assign_set_lhs (epilog_stmt, new_temp);
4431 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4432 scalar_results.safe_push (new_temp);
4434 else
4436 /*** Case 3: Create:
4437 s = extract_field <v_out2, 0>
4438 for (offset = element_size;
4439 offset < vector_size;
4440 offset += element_size;)
4442 Create: s' = extract_field <v_out2, offset>
4443 Create: s = op <s, s'> // For non SLP cases
4444 } */
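/* For a V4SI sum (32-bit elements) this emits roughly:
     s  = BIT_FIELD_REF <v_out2, 32, 0>
     s' = BIT_FIELD_REF <v_out2, 32, 32>;  s = s + s'
     s' = BIT_FIELD_REF <v_out2, 32, 64>;  s = s + s'
     s' = BIT_FIELD_REF <v_out2, 32, 96>;  s = s + s'
   In the SLP case the extracted values are pushed into SCALAR_RESULTS
   without being combined.  */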
4446 if (dump_enabled_p ())
4447 dump_printf_loc (MSG_NOTE, vect_location,
4448 "Reduce using scalar code.\n");
4450 vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
4451 FOR_EACH_VEC_ELT (new_phis, i, new_phi)
4453 int bit_offset;
4454 if (gimple_code (new_phi) == GIMPLE_PHI)
4455 vec_temp = PHI_RESULT (new_phi);
4456 else
4457 vec_temp = gimple_assign_lhs (new_phi);
4458 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, bitsize,
4459 bitsize_zero_node);
4460 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
4461 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
4462 gimple_assign_set_lhs (epilog_stmt, new_temp);
4463 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4465 /* In SLP we don't need to apply reduction operation, so we just
4466 collect s' values in SCALAR_RESULTS. */
4467 if (slp_reduc)
4468 scalar_results.safe_push (new_temp);
4470 for (bit_offset = element_bitsize;
4471 bit_offset < vec_size_in_bits;
4472 bit_offset += element_bitsize)
4474 tree bitpos = bitsize_int (bit_offset);
4475 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp,
4476 bitsize, bitpos);
4478 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
4479 new_name = make_ssa_name (new_scalar_dest, epilog_stmt);
4480 gimple_assign_set_lhs (epilog_stmt, new_name);
4481 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4483 if (slp_reduc)
4485 /* In SLP we don't need to apply reduction operation, so
4486 we just collect s' values in SCALAR_RESULTS. */
4487 new_temp = new_name;
4488 scalar_results.safe_push (new_name);
4490 else
4492 epilog_stmt = gimple_build_assign (new_scalar_dest, code,
4493 new_name, new_temp);
4494 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
4495 gimple_assign_set_lhs (epilog_stmt, new_temp);
4496 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4501 /* The only case where we need to reduce scalar results in SLP is
4502 unrolling.  If the size of SCALAR_RESULTS is greater than
4503 GROUP_SIZE, we reduce them by combining elements modulo
4504 GROUP_SIZE. */
4505 if (slp_reduc)
4507 tree res, first_res, new_res;
4508 gimple new_stmt;
4510 /* Reduce multiple scalar results in case of SLP unrolling. */
4511 for (j = group_size; scalar_results.iterate (j, &res);
4512 j++)
4514 first_res = scalar_results[j % group_size];
4515 new_stmt = gimple_build_assign (new_scalar_dest, code,
4516 first_res, res);
4517 new_res = make_ssa_name (new_scalar_dest, new_stmt);
4518 gimple_assign_set_lhs (new_stmt, new_res);
4519 gsi_insert_before (&exit_gsi, new_stmt, GSI_SAME_STMT);
4520 scalar_results[j % group_size] = new_res;
4523 else
4524 /* Not SLP - we have one scalar to keep in SCALAR_RESULTS. */
4525 scalar_results.safe_push (new_temp);
4529 vect_finalize_reduction:
4531 if (double_reduc)
4532 loop = loop->inner;
4534 /* 2.5 Adjust the final result by the initial value of the reduction
4535 variable. (When such adjustment is not needed, then
4536 'adjustment_def' is zero). For example, if code is PLUS we create:
4537 new_temp = loop_exit_def + adjustment_def */
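/* E.g., for the sum example in get_initial_def_for_reduction with
   init_val == 5, ADJUSTMENT_DEF is 5, so we emit
   new_temp = scalar_result + 5 (or the corresponding vector add in the
   nested-loop case below).  */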
4539 if (adjustment_def)
4541 gcc_assert (!slp_reduc);
4542 if (nested_in_vect_loop)
4544 new_phi = new_phis[0];
4545 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) == VECTOR_TYPE);
4546 expr = build2 (code, vectype, PHI_RESULT (new_phi), adjustment_def);
4547 new_dest = vect_create_destination_var (scalar_dest, vectype);
4549 else
4551 new_temp = scalar_results[0];
4552 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
4553 expr = build2 (code, scalar_type, new_temp, adjustment_def);
4554 new_dest = vect_create_destination_var (scalar_dest, scalar_type);
4557 epilog_stmt = gimple_build_assign (new_dest, expr);
4558 new_temp = make_ssa_name (new_dest, epilog_stmt);
4559 gimple_assign_set_lhs (epilog_stmt, new_temp);
4560 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4561 if (nested_in_vect_loop)
4563 set_vinfo_for_stmt (epilog_stmt,
4564 new_stmt_vec_info (epilog_stmt, loop_vinfo,
4565 NULL));
4566 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (epilog_stmt)) =
4567 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (new_phi));
4569 if (!double_reduc)
4570 scalar_results.quick_push (new_temp);
4571 else
4572 scalar_results[0] = new_temp;
4574 else
4575 scalar_results[0] = new_temp;
4577 new_phis[0] = epilog_stmt;
4580 /* 2.6 Handle the loop-exit phis. Replace the uses of scalar loop-exit
4581 phis with new adjusted scalar results, i.e., replace use <s_out0>
4582 with use <s_out4>.
4584 Transform:
4585 loop_exit:
4586 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
4587 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
4588 v_out2 = reduce <v_out1>
4589 s_out3 = extract_field <v_out2, 0>
4590 s_out4 = adjust_result <s_out3>
4591 use <s_out0>
4592 use <s_out0>
4594 into:
4596 loop_exit:
4597 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
4598 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
4599 v_out2 = reduce <v_out1>
4600 s_out3 = extract_field <v_out2, 0>
4601 s_out4 = adjust_result <s_out3>
4602 use <s_out4>
4603 use <s_out4> */
4606 /* In SLP reduction chain we reduce vector results into one vector if
4607 necessary, hence we set here GROUP_SIZE to 1. SCALAR_DEST is the LHS of
4608 the last stmt in the reduction chain, since we are looking for the loop
4609 exit phi node. */
4610 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
4612 gimple dest_stmt = SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1];
4613 /* Handle reduction patterns. */
4614 if (STMT_VINFO_RELATED_STMT (vinfo_for_stmt (dest_stmt)))
4615 dest_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (dest_stmt));
4617 scalar_dest = gimple_assign_lhs (dest_stmt);
4618 group_size = 1;
4621 /* In SLP we may have several statements in NEW_PHIS and REDUCTION_PHIS (in
4622 case that GROUP_SIZE is greater than vectorization factor). Therefore, we
4623 need to match SCALAR_RESULTS with corresponding statements. The first
4624 (GROUP_SIZE / number of new vector stmts) scalar results correspond to
4625 the first vector stmt, etc.
4626 (RATIO is equal to (GROUP_SIZE / number of new vector stmts)). */
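/* For example, with GROUP_SIZE == 4 and two vector stmts in NEW_PHIS,
   RATIO is 2: scalar results 0 and 1 are matched with the first vector
   stmt and scalar results 2 and 3 with the second.  */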
4627 if (group_size > new_phis.length ())
4629 ratio = group_size / new_phis.length ();
4630 gcc_assert (!(group_size % new_phis.length ()));
4632 else
4633 ratio = 1;
4635 for (k = 0; k < group_size; k++)
4637 if (k % ratio == 0)
4639 epilog_stmt = new_phis[k / ratio];
4640 reduction_phi = reduction_phis[k / ratio];
4641 if (double_reduc)
4642 inner_phi = inner_phis[k / ratio];
4645 if (slp_reduc)
4647 gimple current_stmt = SLP_TREE_SCALAR_STMTS (slp_node)[k];
4649 orig_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (current_stmt));
4650 /* SLP statements can't participate in patterns. */
4651 gcc_assert (!orig_stmt);
4652 scalar_dest = gimple_assign_lhs (current_stmt);
4655 phis.create (3);
4656 /* Find the loop-closed-use at the loop exit of the original scalar
4657 result. (The reduction result is expected to have two immediate uses -
4658 one at the latch block, and one at the loop exit). */
4659 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
4660 if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p)))
4661 && !is_gimple_debug (USE_STMT (use_p)))
4662 phis.safe_push (USE_STMT (use_p));
4664 /* While we expect to have found an exit_phi because of loop-closed-ssa
4665 form, we can end up without one if the scalar cycle is dead. */
4667 FOR_EACH_VEC_ELT (phis, i, exit_phi)
4669 if (outer_loop)
4671 stmt_vec_info exit_phi_vinfo = vinfo_for_stmt (exit_phi);
4672 gphi *vect_phi;
4674 /* FORNOW. Currently not supporting the case that an inner-loop
4675 reduction is not used in the outer-loop (but only outside the
4676 outer-loop), unless it is double reduction. */
4677 gcc_assert ((STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
4678 && !STMT_VINFO_LIVE_P (exit_phi_vinfo))
4679 || double_reduc);
4681 if (double_reduc)
4682 STMT_VINFO_VEC_STMT (exit_phi_vinfo) = inner_phi;
4683 else
4684 STMT_VINFO_VEC_STMT (exit_phi_vinfo) = epilog_stmt;
4685 if (!double_reduc
4686 || STMT_VINFO_DEF_TYPE (exit_phi_vinfo)
4687 != vect_double_reduction_def)
4688 continue;
4690 /* Handle double reduction:
4692 stmt1: s1 = phi <s0, s2> - double reduction phi (outer loop)
4693 stmt2: s3 = phi <s1, s4> - (regular) reduc phi (inner loop)
4694 stmt3: s4 = use (s3) - (regular) reduc stmt (inner loop)
4695 stmt4: s2 = phi <s4> - double reduction stmt (outer loop)
4697 At that point the regular reduction (stmt2 and stmt3) is
4698 already vectorized, as well as the exit phi node, stmt4.
4699 Here we vectorize the phi node of double reduction, stmt1, and
4700 update all relevant statements. */
4702 /* Go through all the uses of s2 to find double reduction phi
4703 node, i.e., stmt1 above. */
4704 orig_name = PHI_RESULT (exit_phi);
4705 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
4707 stmt_vec_info use_stmt_vinfo;
4708 stmt_vec_info new_phi_vinfo;
4709 tree vect_phi_init, preheader_arg, vect_phi_res, init_def;
4710 basic_block bb = gimple_bb (use_stmt);
4711 gimple use;
4713 /* Check that USE_STMT is really a double reduction phi
4714 node. */
4715 if (gimple_code (use_stmt) != GIMPLE_PHI
4716 || gimple_phi_num_args (use_stmt) != 2
4717 || bb->loop_father != outer_loop)
4718 continue;
4719 use_stmt_vinfo = vinfo_for_stmt (use_stmt);
4720 if (!use_stmt_vinfo
4721 || STMT_VINFO_DEF_TYPE (use_stmt_vinfo)
4722 != vect_double_reduction_def)
4723 continue;
4725 /* Create vector phi node for double reduction:
4726 vs1 = phi <vs0, vs2>
4727 vs1 was created previously in this function by a call to
4728 vect_get_vec_def_for_operand and is stored in
4729 vec_initial_def;
4730 vs2 is defined by INNER_PHI, the vectorized EXIT_PHI;
4731 vs0 is created here. */
4733 /* Create vector phi node. */
4734 vect_phi = create_phi_node (vec_initial_def, bb);
4735 new_phi_vinfo = new_stmt_vec_info (vect_phi,
4736 loop_vec_info_for_loop (outer_loop), NULL);
4737 set_vinfo_for_stmt (vect_phi, new_phi_vinfo);
4739 /* Create vs0 - initial def of the double reduction phi. */
4740 preheader_arg = PHI_ARG_DEF_FROM_EDGE (use_stmt,
4741 loop_preheader_edge (outer_loop));
4742 init_def = get_initial_def_for_reduction (stmt,
4743 preheader_arg, NULL);
4744 vect_phi_init = vect_init_vector (use_stmt, init_def,
4745 vectype, NULL);
4747 /* Update phi node arguments with vs0 and vs2. */
4748 add_phi_arg (vect_phi, vect_phi_init,
4749 loop_preheader_edge (outer_loop),
4750 UNKNOWN_LOCATION);
4751 add_phi_arg (vect_phi, PHI_RESULT (inner_phi),
4752 loop_latch_edge (outer_loop), UNKNOWN_LOCATION);
4753 if (dump_enabled_p ())
4755 dump_printf_loc (MSG_NOTE, vect_location,
4756 "created double reduction phi node: ");
4757 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, vect_phi, 0);
4758 dump_printf (MSG_NOTE, "\n");
4761 vect_phi_res = PHI_RESULT (vect_phi);
4763 /* Replace the use, i.e., set the correct vs1 in the regular
4764 reduction phi node. FORNOW, NCOPIES is always 1, so the
4765 loop is redundant. */
4766 use = reduction_phi;
4767 for (j = 0; j < ncopies; j++)
4769 edge pr_edge = loop_preheader_edge (loop);
4770 SET_PHI_ARG_DEF (use, pr_edge->dest_idx, vect_phi_res);
4771 use = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (use));
4777 phis.release ();
4778 if (nested_in_vect_loop)
4780 if (double_reduc)
4781 loop = outer_loop;
4782 else
4783 continue;
4786 phis.create (3);
4787 /* Find the loop-closed-use at the loop exit of the original scalar
4788 result. (The reduction result is expected to have two immediate uses,
4789 one at the latch block, and one at the loop exit). For double
4790 reductions we are looking for exit phis of the outer loop. */
4791 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
4793 if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
4795 if (!is_gimple_debug (USE_STMT (use_p)))
4796 phis.safe_push (USE_STMT (use_p));
4798 else
4800 if (double_reduc && gimple_code (USE_STMT (use_p)) == GIMPLE_PHI)
4802 tree phi_res = PHI_RESULT (USE_STMT (use_p));
4804 FOR_EACH_IMM_USE_FAST (phi_use_p, phi_imm_iter, phi_res)
4806 if (!flow_bb_inside_loop_p (loop,
4807 gimple_bb (USE_STMT (phi_use_p)))
4808 && !is_gimple_debug (USE_STMT (phi_use_p)))
4809 phis.safe_push (USE_STMT (phi_use_p));
4815 FOR_EACH_VEC_ELT (phis, i, exit_phi)
4817 /* Replace the uses: */
4818 orig_name = PHI_RESULT (exit_phi);
4819 scalar_result = scalar_results[k];
4820 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
4821 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
4822 SET_USE (use_p, scalar_result);
4825 phis.release ();
4830 /* Function vectorizable_reduction.
4832 Check if STMT performs a reduction operation that can be vectorized.
4833 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
4834 stmt to replace it, put it in VEC_STMT, and insert it at GSI.
4835 Return FALSE if not a vectorizable STMT, TRUE otherwise.
4837 This function also handles reduction idioms (patterns) that have been
4838 recognized in advance during vect_pattern_recog. In this case, STMT may be
4839 of this form:
4840 X = pattern_expr (arg0, arg1, ..., X)
4841 and its STMT_VINFO_RELATED_STMT points to the last stmt in the original
4842 sequence that had been detected and replaced by the pattern-stmt (STMT).
4844 In some cases of reduction patterns, the type of the reduction variable X is
4845 different than the type of the other arguments of STMT.
4846 In such cases, the vectype that is used when transforming STMT into a vector
4847 stmt is different than the vectype that is used to determine the
4848 vectorization factor, because it consists of a different number of elements
4849 than the actual number of elements that are being operated upon in parallel.
4851 For example, consider an accumulation of shorts into an int accumulator.
4852 On some targets it's possible to vectorize this pattern operating on 8
4853 shorts at a time (hence, the vectype for purposes of determining the
4854 vectorization factor should be V8HI); on the other hand, the vectype that
4855 is used to create the vector form is actually V4SI (the type of the result).
4857 Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
4858 indicates what is the actual level of parallelism (V8HI in the example), so
4859 that the right vectorization factor would be derived. This vectype
4860 corresponds to the type of arguments to the reduction stmt, and should *NOT*
4861 be used to create the vectorized stmt. The right vectype for the vectorized
4862 stmt is obtained from the type of the result X:
4863 get_vectype_for_scalar_type (TREE_TYPE (X))
4865 This means that, contrary to "regular" reductions (or "regular" stmts in
4866 general), the following equation:
4867 STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (TREE_TYPE (X))
4868 does *NOT* necessarily hold for reduction patterns. */
4870 bool
4871 vectorizable_reduction (gimple stmt, gimple_stmt_iterator *gsi,
4872 gimple *vec_stmt, slp_tree slp_node)
4874 tree vec_dest;
4875 tree scalar_dest;
4876 tree loop_vec_def0 = NULL_TREE, loop_vec_def1 = NULL_TREE;
4877 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4878 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
4879 tree vectype_in = NULL_TREE;
4880 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4881 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4882 enum tree_code code, orig_code, epilog_reduc_code;
4883 machine_mode vec_mode;
4884 int op_type;
4885 optab optab, reduc_optab;
4886 tree new_temp = NULL_TREE;
4887 tree def;
4888 gimple def_stmt;
4889 enum vect_def_type dt;
4890 gphi *new_phi = NULL;
4891 tree scalar_type;
4892 bool is_simple_use;
4893 gimple orig_stmt;
4894 stmt_vec_info orig_stmt_info;
4895 tree expr = NULL_TREE;
4896 int i;
4897 int ncopies;
4898 int epilog_copies;
4899 stmt_vec_info prev_stmt_info, prev_phi_info;
4900 bool single_defuse_cycle = false;
4901 tree reduc_def = NULL_TREE;
4902 gimple new_stmt = NULL;
4903 int j;
4904 tree ops[3];
4905 bool nested_cycle = false, found_nested_cycle_def = false;
4906 gimple reduc_def_stmt = NULL;
4907 bool double_reduc = false, dummy;
4908 basic_block def_bb;
4909 struct loop * def_stmt_loop, *outer_loop = NULL;
4910 tree def_arg;
4911 gimple def_arg_stmt;
4912 auto_vec<tree> vec_oprnds0;
4913 auto_vec<tree> vec_oprnds1;
4914 auto_vec<tree> vect_defs;
4915 auto_vec<gimple> phis;
4916 int vec_num;
4917 tree def0, def1, tem, op0, op1 = NULL_TREE;
4918 bool first_p = true;
4920 /* In case of reduction chain we switch to the first stmt in the chain, but
4921 we don't update STMT_INFO, since only the last stmt is marked as reduction
4922 and has reduction properties. */
4923 if (GROUP_FIRST_ELEMENT (stmt_info)
4924 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
4926 stmt = GROUP_FIRST_ELEMENT (stmt_info);
4927 first_p = false;
4930 if (nested_in_vect_loop_p (loop, stmt))
4932 outer_loop = loop;
4933 loop = loop->inner;
4934 nested_cycle = true;
4937 /* 1. Is vectorizable reduction? */
4938 /* Not supportable if the reduction variable is used in the loop, unless
4939 it's a reduction chain. */
4940 if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer
4941 && !GROUP_FIRST_ELEMENT (stmt_info))
4942 return false;
4944 /* Reductions that are not used even in an enclosing outer-loop,
4945 are expected to be "live" (used out of the loop). */
4946 if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope
4947 && !STMT_VINFO_LIVE_P (stmt_info))
4948 return false;
4950 /* Make sure it was already recognized as a reduction computation. */
4951 if (STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmt)) != vect_reduction_def
4952 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmt)) != vect_nested_cycle)
4953 return false;
4955 /* 2. Has this been recognized as a reduction pattern?
4957 Check if STMT represents a pattern that has been recognized
4958 in earlier analysis stages. For stmts that represent a pattern,
4959 the STMT_VINFO_RELATED_STMT field records the last stmt in
4960 the original sequence that constitutes the pattern. */
4962 orig_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt));
4963 if (orig_stmt)
4965 orig_stmt_info = vinfo_for_stmt (orig_stmt);
4966 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
4967 gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
4970 /* 3. Check the operands of the operation. The first operands are defined
4971 inside the loop body. The last operand is the reduction variable,
4972 which is defined by the loop-header-phi. */
4974 gcc_assert (is_gimple_assign (stmt));
4976 /* Flatten RHS. */
4977 switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
4979 case GIMPLE_SINGLE_RHS:
4980 op_type = TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt));
4981 if (op_type == ternary_op)
4983 tree rhs = gimple_assign_rhs1 (stmt);
4984 ops[0] = TREE_OPERAND (rhs, 0);
4985 ops[1] = TREE_OPERAND (rhs, 1);
4986 ops[2] = TREE_OPERAND (rhs, 2);
4987 code = TREE_CODE (rhs);
4989 else
4990 return false;
4991 break;
4993 case GIMPLE_BINARY_RHS:
4994 code = gimple_assign_rhs_code (stmt);
4995 op_type = TREE_CODE_LENGTH (code);
4996 gcc_assert (op_type == binary_op);
4997 ops[0] = gimple_assign_rhs1 (stmt);
4998 ops[1] = gimple_assign_rhs2 (stmt);
4999 break;
5001 case GIMPLE_TERNARY_RHS:
5002 code = gimple_assign_rhs_code (stmt);
5003 op_type = TREE_CODE_LENGTH (code);
5004 gcc_assert (op_type == ternary_op);
5005 ops[0] = gimple_assign_rhs1 (stmt);
5006 ops[1] = gimple_assign_rhs2 (stmt);
5007 ops[2] = gimple_assign_rhs3 (stmt);
5008 break;
5010 case GIMPLE_UNARY_RHS:
5011 return false;
5013 default:
5014 gcc_unreachable ();
5016 /* The default is that the reduction variable is the last operand in the statement. */
5017 int reduc_index = op_type - 1;
5019 if (code == COND_EXPR && slp_node)
5020 return false;
5022 scalar_dest = gimple_assign_lhs (stmt);
5023 scalar_type = TREE_TYPE (scalar_dest);
5024 if (!POINTER_TYPE_P (scalar_type) && !INTEGRAL_TYPE_P (scalar_type)
5025 && !SCALAR_FLOAT_TYPE_P (scalar_type))
5026 return false;
5028 /* Do not try to vectorize bit-precision reductions. */
5029 if ((TYPE_PRECISION (scalar_type)
5030 != GET_MODE_PRECISION (TYPE_MODE (scalar_type))))
5031 return false;
5033 /* All uses but the last are expected to be defined in the loop.
5034 The last use is the reduction variable. In case of nested cycle this
5035 assumption is not true: we use reduc_index to record the index of the
5036 reduction variable. */
5037 for (i = 0; i < op_type - 1; i++)
5039 /* The condition of COND_EXPR is checked in vectorizable_condition(). */
5040 if (i == 0 && code == COND_EXPR)
5041 continue;
5043 is_simple_use = vect_is_simple_use_1 (ops[i], stmt, loop_vinfo, NULL,
5044 &def_stmt, &def, &dt, &tem);
5045 if (!vectype_in)
5046 vectype_in = tem;
5047 gcc_assert (is_simple_use);
5049 if (dt != vect_internal_def
5050 && dt != vect_external_def
5051 && dt != vect_constant_def
5052 && dt != vect_induction_def
5053 && !(dt == vect_nested_cycle && nested_cycle))
5054 return false;
5056 if (dt == vect_nested_cycle)
5058 found_nested_cycle_def = true;
5059 reduc_def_stmt = def_stmt;
5060 reduc_index = i;
5064 is_simple_use = vect_is_simple_use_1 (ops[i], stmt, loop_vinfo, NULL,
5065 &def_stmt, &def, &dt, &tem);
5066 if (!vectype_in)
5067 vectype_in = tem;
5068 gcc_assert (is_simple_use);
5069 if (!found_nested_cycle_def)
5070 reduc_def_stmt = def_stmt;
5072 if (reduc_def_stmt && gimple_code (reduc_def_stmt) != GIMPLE_PHI)
5073 return false;
5075 if (!(dt == vect_reduction_def
5076 || dt == vect_nested_cycle
5077 || ((dt == vect_internal_def || dt == vect_external_def
5078 || dt == vect_constant_def || dt == vect_induction_def)
5079 && nested_cycle && found_nested_cycle_def)))
5081 /* For pattern recognized stmts, orig_stmt might be a reduction,
5082 but some helper statements for the pattern might not, or
5083 might be COND_EXPRs with reduction uses in the condition. */
5084 gcc_assert (orig_stmt);
5085 return false;
5088 gimple tmp = vect_is_simple_reduction (loop_vinfo, reduc_def_stmt,
5089 !nested_cycle, &dummy);
5090 if (orig_stmt)
5091 gcc_assert (tmp == orig_stmt
5092 || GROUP_FIRST_ELEMENT (vinfo_for_stmt (tmp)) == orig_stmt);
5093 else
5094 /* We changed STMT to be the first stmt in reduction chain, hence we
5095 check that in this case the first element in the chain is STMT. */
5096 gcc_assert (stmt == tmp
5097 || GROUP_FIRST_ELEMENT (vinfo_for_stmt (tmp)) == stmt);
5099 if (STMT_VINFO_LIVE_P (vinfo_for_stmt (reduc_def_stmt)))
5100 return false;
5102 if (slp_node || PURE_SLP_STMT (stmt_info))
5103 ncopies = 1;
5104 else
5105 ncopies = (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
5106 / TYPE_VECTOR_SUBPARTS (vectype_in));
5108 gcc_assert (ncopies >= 1);
5110 vec_mode = TYPE_MODE (vectype_in);
5112 if (code == COND_EXPR)
5114 if (!vectorizable_condition (stmt, gsi, NULL, ops[reduc_index], 0, NULL))
5116 if (dump_enabled_p ())
5117 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5118 "unsupported condition in reduction\n");
5120 return false;
5123 else
5125 /* 4. Supportable by target? */
5127 if (code == LSHIFT_EXPR || code == RSHIFT_EXPR
5128 || code == LROTATE_EXPR || code == RROTATE_EXPR)
5130 /* Shifts and rotates are only supported by vectorizable_shifts,
5131 not vectorizable_reduction. */
5132 if (dump_enabled_p ())
5133 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5134 "unsupported shift or rotation.\n");
5135 return false;
5138 /* 4.1. check support for the operation in the loop */
5139 optab = optab_for_tree_code (code, vectype_in, optab_default);
5140 if (!optab)
5142 if (dump_enabled_p ())
5143 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5144 "no optab.\n");
5146 return false;
5149 if (optab_handler (optab, vec_mode) == CODE_FOR_nothing)
5151 if (dump_enabled_p ())
5152 dump_printf (MSG_NOTE, "op not supported by target.\n");
5154 if (GET_MODE_SIZE (vec_mode) != UNITS_PER_WORD
5155 || LOOP_VINFO_VECT_FACTOR (loop_vinfo)
5156 < vect_min_worthwhile_factor (code))
5157 return false;
5159 if (dump_enabled_p ())
5160 dump_printf (MSG_NOTE, "proceeding using word mode.\n");
5163 /* Worthwhile without SIMD support? */
5164 if (!VECTOR_MODE_P (TYPE_MODE (vectype_in))
5165 && LOOP_VINFO_VECT_FACTOR (loop_vinfo)
5166 < vect_min_worthwhile_factor (code))
5168 if (dump_enabled_p ())
5169 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5170 "not worthwhile without SIMD support.\n");
5172 return false;
5176 /* 4.2. Check support for the epilog operation.
5178 If STMT represents a reduction pattern, then the type of the
5179 reduction variable may be different than the type of the rest
5180 of the arguments. For example, consider the case of accumulation
5181 of shorts into an int accumulator; The original code:
5182 S1: int_a = (int) short_a;
5183 orig_stmt-> S2: int_acc = plus <int_a ,int_acc>;
5185 was replaced with:
5186 STMT: int_acc = widen_sum <short_a, int_acc>
5188 This means that:
5189 1. The tree-code that is used to create the vector operation in the
5190 epilog code (that reduces the partial results) is not the
5191 tree-code of STMT, but is rather the tree-code of the original
5192 stmt from the pattern that STMT is replacing. I.e, in the example
5193 above we want to use 'widen_sum' in the loop, but 'plus' in the
5194 epilog.
5195 2. The type (mode) we use to check available target support
5196 for the vector operation to be created in the *epilog*, is
5197 determined by the type of the reduction variable (in the example
5198 above we'd check this: optab_handler (plus_optab, vect_int_mode])).
5199 However the type (mode) we use to check available target support
5200 for the vector operation to be created *inside the loop*, is
5201 determined by the type of the other arguments to STMT (in the
5202 example we'd check this: optab_handler (widen_sum_optab,
5203 vect_short_mode)).
5205 This is contrary to "regular" reductions, in which the types of all
5206 the arguments are the same as the type of the reduction variable.
5207 For "regular" reductions we can therefore use the same vector type
5208 (and also the same tree-code) when generating the epilog code and
5209 when generating the code inside the loop. */
5211 if (orig_stmt)
5213 /* This is a reduction pattern: get the vectype from the type of the
5214 reduction variable, and get the tree-code from orig_stmt. */
5215 orig_code = gimple_assign_rhs_code (orig_stmt);
5216 gcc_assert (vectype_out);
5217 vec_mode = TYPE_MODE (vectype_out);
5219 else
5221 /* Regular reduction: use the same vectype and tree-code as used for
5222 the vector code inside the loop can be used for the epilog code. */
5223 orig_code = code;
5226 if (nested_cycle)
5228 def_bb = gimple_bb (reduc_def_stmt);
5229 def_stmt_loop = def_bb->loop_father;
5230 def_arg = PHI_ARG_DEF_FROM_EDGE (reduc_def_stmt,
5231 loop_preheader_edge (def_stmt_loop));
5232 if (TREE_CODE (def_arg) == SSA_NAME
5233 && (def_arg_stmt = SSA_NAME_DEF_STMT (def_arg))
5234 && gimple_code (def_arg_stmt) == GIMPLE_PHI
5235 && flow_bb_inside_loop_p (outer_loop, gimple_bb (def_arg_stmt))
5236 && vinfo_for_stmt (def_arg_stmt)
5237 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_arg_stmt))
5238 == vect_double_reduction_def)
5239 double_reduc = true;
5242 epilog_reduc_code = ERROR_MARK;
5243 if (reduction_code_for_scalar_code (orig_code, &epilog_reduc_code))
5245 reduc_optab = optab_for_tree_code (epilog_reduc_code, vectype_out,
5246 optab_default);
5247 if (!reduc_optab)
5249 if (dump_enabled_p ())
5250 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5251 "no optab for reduction.\n");
5253 epilog_reduc_code = ERROR_MARK;
5255 else if (optab_handler (reduc_optab, vec_mode) == CODE_FOR_nothing)
5257 optab = scalar_reduc_to_vector (reduc_optab, vectype_out);
5258 if (optab_handler (optab, vec_mode) == CODE_FOR_nothing)
5260 if (dump_enabled_p ())
5261 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5262 "reduc op not supported by target.\n");
5264 epilog_reduc_code = ERROR_MARK;
5268 else
5270 if (!nested_cycle || double_reduc)
5272 if (dump_enabled_p ())
5273 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5274 "no reduc code for scalar code.\n");
5276 return false;
5280 if (double_reduc && ncopies > 1)
5282 if (dump_enabled_p ())
5283 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5284 "multiple types in double reduction\n");
5286 return false;
5289 /* In case of widening multiplication by a constant, we update the type
5290 of the constant to be the type of the other operand. We check that the
5291 constant fits the type in the pattern recognition pass. */
5292 if (code == DOT_PROD_EXPR
5293 && !types_compatible_p (TREE_TYPE (ops[0]), TREE_TYPE (ops[1])))
5295 if (TREE_CODE (ops[0]) == INTEGER_CST)
5296 ops[0] = fold_convert (TREE_TYPE (ops[1]), ops[0]);
5297 else if (TREE_CODE (ops[1]) == INTEGER_CST)
5298 ops[1] = fold_convert (TREE_TYPE (ops[0]), ops[1]);
5299 else
5301 if (dump_enabled_p ())
5302 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5303 "invalid types in dot-prod\n");
5305 return false;
5309 if (!vec_stmt) /* transformation not required. */
5311 if (first_p
5312 && !vect_model_reduction_cost (stmt_info, epilog_reduc_code, ncopies,
5313 reduc_index))
5314 return false;
5315 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
5316 return true;
5319 /** Transform. **/
5321 if (dump_enabled_p ())
5322 dump_printf_loc (MSG_NOTE, vect_location, "transform reduction.\n");
5324 /* FORNOW: Multiple types are not supported for condition. */
5325 if (code == COND_EXPR)
5326 gcc_assert (ncopies == 1);
5328 /* Create the destination vector */
5329 vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
5331 /* In case the vectorization factor (VF) is bigger than the number
5332 of elements that we can fit in a vectype (nunits), we have to generate
5333 more than one vector stmt - i.e - we need to "unroll" the
5334 vector stmt by a factor VF/nunits. For more details see documentation
5335 in vectorizable_operation. */
5337 /* If the reduction is used in an outer loop we need to generate
5338 VF intermediate results, like so (e.g. for ncopies=2):
5339 r0 = phi (init, r0)
5340 r1 = phi (init, r1)
5341 r0 = x0 + r0;
5342 r1 = x1 + r1;
5343 (i.e. we generate VF results in 2 registers).
5344 In this case we have a separate def-use cycle for each copy, and therefore
5345 for each copy we get the vector def for the reduction variable from the
5346 respective phi node created for this copy.
5348 Otherwise (the reduction is unused in the loop nest), we can combine
5349 together intermediate results, like so (e.g. for ncopies=2):
5350 r = phi (init, r)
5351 r = x0 + r;
5352 r = x1 + r;
5353 (i.e. we generate VF/2 results in a single register).
5354 In this case for each copy we get the vector def for the reduction variable
5355 from the vectorized reduction operation generated in the previous iteration.
5358 if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope)
5360 single_defuse_cycle = true;
5361 epilog_copies = 1;
5363 else
5364 epilog_copies = ncopies;
5366 prev_stmt_info = NULL;
5367 prev_phi_info = NULL;
5368 if (slp_node)
5369 vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
5370 else
5372 vec_num = 1;
5373 vec_oprnds0.create (1);
5374 if (op_type == ternary_op)
5375 vec_oprnds1.create (1);
5378 phis.create (vec_num);
5379 vect_defs.create (vec_num);
5380 if (!slp_node)
5381 vect_defs.quick_push (NULL_TREE);
5383 for (j = 0; j < ncopies; j++)
5385 if (j == 0 || !single_defuse_cycle)
5387 for (i = 0; i < vec_num; i++)
5389 /* Create the reduction-phi that defines the reduction
5390 operand. */
5391 new_phi = create_phi_node (vec_dest, loop->header);
5392 set_vinfo_for_stmt (new_phi,
5393 new_stmt_vec_info (new_phi, loop_vinfo,
5394 NULL));
5395 if (j == 0 || slp_node)
5396 phis.quick_push (new_phi);
5400 if (code == COND_EXPR)
5402 gcc_assert (!slp_node);
5403 vectorizable_condition (stmt, gsi, vec_stmt,
5404 PHI_RESULT (phis[0]),
5405 reduc_index, NULL);
5406 /* Multiple types are not supported for condition. */
5407 break;
5410 /* Handle uses. */
5411 if (j == 0)
5413 op0 = ops[!reduc_index];
5414 if (op_type == ternary_op)
5416 if (reduc_index == 0)
5417 op1 = ops[2];
5418 else
5419 op1 = ops[1];
5422 if (slp_node)
5423 vect_get_vec_defs (op0, op1, stmt, &vec_oprnds0, &vec_oprnds1,
5424 slp_node, -1);
5425 else
5427 loop_vec_def0 = vect_get_vec_def_for_operand (ops[!reduc_index],
5428 stmt, NULL);
5429 vec_oprnds0.quick_push (loop_vec_def0);
5430 if (op_type == ternary_op)
5432 loop_vec_def1 = vect_get_vec_def_for_operand (op1, stmt,
5433 NULL);
5434 vec_oprnds1.quick_push (loop_vec_def1);
5438 else
5440 if (!slp_node)
5442 enum vect_def_type dt;
5443 gimple dummy_stmt;
5444 tree dummy;
5446 vect_is_simple_use (ops[!reduc_index], stmt, loop_vinfo, NULL,
5447 &dummy_stmt, &dummy, &dt);
5448 loop_vec_def0 = vect_get_vec_def_for_stmt_copy (dt,
5449 loop_vec_def0);
5450 vec_oprnds0[0] = loop_vec_def0;
5451 if (op_type == ternary_op)
5453 vect_is_simple_use (op1, stmt, loop_vinfo, NULL, &dummy_stmt,
5454 &dummy, &dt);
5455 loop_vec_def1 = vect_get_vec_def_for_stmt_copy (dt,
5456 loop_vec_def1);
5457 vec_oprnds1[0] = loop_vec_def1;
5461 if (single_defuse_cycle)
5462 reduc_def = gimple_assign_lhs (new_stmt);
5464 STMT_VINFO_RELATED_STMT (prev_phi_info) = new_phi;
5467 FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
5469 if (slp_node)
5470 reduc_def = PHI_RESULT (phis[i]);
5471 else
5473 if (!single_defuse_cycle || j == 0)
5474 reduc_def = PHI_RESULT (new_phi);
5477 def1 = ((op_type == ternary_op)
5478 ? vec_oprnds1[i] : NULL);
5479 if (op_type == binary_op)
5481 if (reduc_index == 0)
5482 expr = build2 (code, vectype_out, reduc_def, def0);
5483 else
5484 expr = build2 (code, vectype_out, def0, reduc_def);
5486 else
5488 if (reduc_index == 0)
5489 expr = build3 (code, vectype_out, reduc_def, def0, def1);
5490 else
5492 if (reduc_index == 1)
5493 expr = build3 (code, vectype_out, def0, reduc_def, def1);
5494 else
5495 expr = build3 (code, vectype_out, def0, def1, reduc_def);
5499 new_stmt = gimple_build_assign (vec_dest, expr);
5500 new_temp = make_ssa_name (vec_dest, new_stmt);
5501 gimple_assign_set_lhs (new_stmt, new_temp);
5502 vect_finish_stmt_generation (stmt, new_stmt, gsi);
5504 if (slp_node)
5506 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
5507 vect_defs.quick_push (new_temp);
5509 else
5510 vect_defs[0] = new_temp;
5513 if (slp_node)
5514 continue;
5516 if (j == 0)
5517 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
5518 else
5519 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
5521 prev_stmt_info = vinfo_for_stmt (new_stmt);
5522 prev_phi_info = vinfo_for_stmt (new_phi);
5525 /* Finalize the reduction-phi (set its arguments) and create the
5526 epilog reduction code. */
5527 if ((!single_defuse_cycle || code == COND_EXPR) && !slp_node)
5529 new_temp = gimple_assign_lhs (*vec_stmt);
5530 vect_defs[0] = new_temp;
5533 vect_create_epilog_for_reduction (vect_defs, stmt, epilog_copies,
5534 epilog_reduc_code, phis, reduc_index,
5535 double_reduc, slp_node);
5537 return true;
5540 /* Function vect_min_worthwhile_factor.
5542 For a loop where we could vectorize the operation indicated by CODE,
5543 return the minimum vectorization factor that makes it worthwhile
5544 to use generic vectors. */
5546 vect_min_worthwhile_factor (enum tree_code code)
5548 switch (code)
5550 case PLUS_EXPR:
5551 case MINUS_EXPR:
5552 case NEGATE_EXPR:
5553 return 4;
5555 case BIT_AND_EXPR:
5556 case BIT_IOR_EXPR:
5557 case BIT_XOR_EXPR:
5558 case BIT_NOT_EXPR:
5559 return 2;
5561 default:
5562 return INT_MAX;
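/* A sketch of how a caller might apply the threshold above when only
   generic (emulated) vector operations are available (illustrative only):

     if (!VECTOR_MODE_P (TYPE_MODE (vectype))
         && vf < vect_min_worthwhile_factor (code))
       return false;

   so e.g. a PLUS_EXPR loop with vf == 2 would be rejected, since the
   function returns 4 for additive codes.  */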
5567 /* Function vectorizable_induction
5569 Check if PHI performs an induction computation that can be vectorized.
5570 If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
5571 phi to replace it, put it in VEC_STMT, and add it to the same basic block.
5572 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
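/* A source-level sketch of an induction this handles:

     for (i = 0; i < N; i++)
       a[i] = i;

   For VF == 4 the initial vector def is {0, 1, 2, 3} and each further copy
   is obtained by adding the step vector {4, 4, 4, 4} (see
   get_initial_def_for_induction).  */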
5574 bool
5575 vectorizable_induction (gimple phi, gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
5576 gimple *vec_stmt)
5578 stmt_vec_info stmt_info = vinfo_for_stmt (phi);
5579 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5580 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5581 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5582 int nunits = TYPE_VECTOR_SUBPARTS (vectype);
5583 int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
5584 tree vec_def;
5586 gcc_assert (ncopies >= 1);
5587 /* FORNOW. These restrictions should be relaxed. */
5588 if (nested_in_vect_loop_p (loop, phi))
5590 imm_use_iterator imm_iter;
5591 use_operand_p use_p;
5592 gimple exit_phi;
5593 edge latch_e;
5594 tree loop_arg;
5596 if (ncopies > 1)
5598 if (dump_enabled_p ())
5599 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5600 "multiple types in nested loop.\n");
5601 return false;
5604 exit_phi = NULL;
5605 latch_e = loop_latch_edge (loop->inner);
5606 loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
5607 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
5609 gimple use_stmt = USE_STMT (use_p);
5610 if (is_gimple_debug (use_stmt))
5611 continue;
5613 if (!flow_bb_inside_loop_p (loop->inner, gimple_bb (use_stmt)))
5615 exit_phi = use_stmt;
5616 break;
5619 if (exit_phi)
5621 stmt_vec_info exit_phi_vinfo = vinfo_for_stmt (exit_phi);
5622 if (!(STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
5623 && !STMT_VINFO_LIVE_P (exit_phi_vinfo)))
5625 if (dump_enabled_p ())
5626 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5627 "inner-loop induction only used outside "
5628 "of the outer vectorized loop.\n");
5629 return false;
5634 if (!STMT_VINFO_RELEVANT_P (stmt_info))
5635 return false;
5637 /* FORNOW: SLP not supported. */
5638 if (STMT_SLP_TYPE (stmt_info))
5639 return false;
5641 gcc_assert (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def);
5643 if (gimple_code (phi) != GIMPLE_PHI)
5644 return false;
5646 if (!vec_stmt) /* transformation not required. */
5648 STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
5649 if (dump_enabled_p ())
5650 dump_printf_loc (MSG_NOTE, vect_location,
5651 "=== vectorizable_induction ===\n");
5652 vect_model_induction_cost (stmt_info, ncopies);
5653 return true;
5656 /** Transform. **/
5658 if (dump_enabled_p ())
5659 dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n");
5661 vec_def = get_initial_def_for_induction (phi);
5662 *vec_stmt = SSA_NAME_DEF_STMT (vec_def);
5663 return true;
5666 /* Function vectorizable_live_operation.
5668 STMT computes a value that is used outside the loop. Check if
5669 it can be supported. */
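/* A sketch of a live operation with only invariant operands, which is the
   case currently supported (see the FORNOW note below):

     for (i = 0; i < N; i++)
       {
         t = x + y;        x, y defined before the loop
         a[i] += t;
       }
     use (t);

   Here t escapes the loop, but the scalar statement t = x + y can simply be
   left in place and its last value reused.  */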
5671 bool
5672 vectorizable_live_operation (gimple stmt,
5673 gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
5674 gimple *vec_stmt)
5676 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5677 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5678 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5679 int i;
5680 int op_type;
5681 tree op;
5682 tree def;
5683 gimple def_stmt;
5684 enum vect_def_type dt;
5685 enum tree_code code;
5686 enum gimple_rhs_class rhs_class;
5688 gcc_assert (STMT_VINFO_LIVE_P (stmt_info));
5690 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def)
5691 return false;
5693 if (!is_gimple_assign (stmt))
5695 if (gimple_call_internal_p (stmt)
5696 && gimple_call_internal_fn (stmt) == IFN_GOMP_SIMD_LANE
5697 && gimple_call_lhs (stmt)
5698 && loop->simduid
5699 && TREE_CODE (gimple_call_arg (stmt, 0)) == SSA_NAME
5700 && loop->simduid
5701 == SSA_NAME_VAR (gimple_call_arg (stmt, 0)))
5703 edge e = single_exit (loop);
5704 basic_block merge_bb = e->dest;
5705 imm_use_iterator imm_iter;
5706 use_operand_p use_p;
5707 tree lhs = gimple_call_lhs (stmt);
5709 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
5711 gimple use_stmt = USE_STMT (use_p);
5712 if (gimple_code (use_stmt) == GIMPLE_PHI
5713 && gimple_bb (use_stmt) == merge_bb)
5715 if (vec_stmt)
5717 tree vfm1
5718 = build_int_cst (unsigned_type_node,
5719 loop_vinfo->vectorization_factor - 1);
5720 SET_PHI_ARG_DEF (use_stmt, e->dest_idx, vfm1);
5722 return true;
5727 return false;
5730 if (TREE_CODE (gimple_assign_lhs (stmt)) != SSA_NAME)
5731 return false;
5733 /* FORNOW. CHECKME. */
5734 if (nested_in_vect_loop_p (loop, stmt))
5735 return false;
5737 code = gimple_assign_rhs_code (stmt);
5738 op_type = TREE_CODE_LENGTH (code);
5739 rhs_class = get_gimple_rhs_class (code);
5740 gcc_assert (rhs_class != GIMPLE_UNARY_RHS || op_type == unary_op);
5741 gcc_assert (rhs_class != GIMPLE_BINARY_RHS || op_type == binary_op);
5743 /* FORNOW: support only if all uses are invariant. This means
5744 that the scalar operations can remain in place, unvectorized.
5745 The original last scalar value that they compute will be used. */
5747 for (i = 0; i < op_type; i++)
5749 if (rhs_class == GIMPLE_SINGLE_RHS)
5750 op = TREE_OPERAND (gimple_op (stmt, 1), i);
5751 else
5752 op = gimple_op (stmt, i + 1);
5753 if (op
5754 && !vect_is_simple_use (op, stmt, loop_vinfo, NULL, &def_stmt, &def,
5755 &dt))
5757 if (dump_enabled_p ())
5758 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5759 "use not simple.\n");
5760 return false;
5763 if (dt != vect_external_def && dt != vect_constant_def)
5764 return false;
5767 /* No transformation is required for the cases we currently support. */
5768 return true;
5771 /* Kill any debug uses outside LOOP of SSA names defined in STMT. */
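/* For example (a sketch), if a block after the loop still contains

     # DEBUG D#1 => sum_5

   where sum_5 is defined only by a loop statement that is about to be
   replaced by vector code, the bind is reset below so the debugger reports
   the value as optimized away rather than showing a stale one.  */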
5773 static void
5774 vect_loop_kill_debug_uses (struct loop *loop, gimple stmt)
5776 ssa_op_iter op_iter;
5777 imm_use_iterator imm_iter;
5778 def_operand_p def_p;
5779 gimple ustmt;
5781 FOR_EACH_PHI_OR_STMT_DEF (def_p, stmt, op_iter, SSA_OP_DEF)
5783 FOR_EACH_IMM_USE_STMT (ustmt, imm_iter, DEF_FROM_PTR (def_p))
5785 basic_block bb;
5787 if (!is_gimple_debug (ustmt))
5788 continue;
5790 bb = gimple_bb (ustmt);
5792 if (!flow_bb_inside_loop_p (loop, bb))
5794 if (gimple_debug_bind_p (ustmt))
5796 if (dump_enabled_p ())
5797 dump_printf_loc (MSG_NOTE, vect_location,
5798 "killing debug use\n");
5800 gimple_debug_bind_reset_value (ustmt);
5801 update_stmt (ustmt);
5803 else
5804 gcc_unreachable ();
5811 /* This function builds ni_name = number of iterations. Statements
5812 are emitted on the loop preheader edge. */
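/* E.g. (sketch) if LOOP_VINFO_NITERS is the expression n_7 + 1, a statement
   such as

     niters_12 = n_7 + 1;

   is inserted on the preheader edge and niters_12 is returned; a constant
   number of iterations is simply returned unchanged.  */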
5814 static tree
5815 vect_build_loop_niters (loop_vec_info loop_vinfo)
5817 tree ni = unshare_expr (LOOP_VINFO_NITERS (loop_vinfo));
5818 if (TREE_CODE (ni) == INTEGER_CST)
5819 return ni;
5820 else
5822 tree ni_name, var;
5823 gimple_seq stmts = NULL;
5824 edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
5826 var = create_tmp_var (TREE_TYPE (ni), "niters");
5827 ni_name = force_gimple_operand (ni, &stmts, false, var);
5828 if (stmts)
5829 gsi_insert_seq_on_edge_immediate (pe, stmts);
5831 return ni_name;
5836 /* This function generates the following statements:
5838 ni_name = number of iterations loop executes
5839 ratio = ni_name / vf
5840 ratio_mult_vf_name = ratio * vf
5842 and places them on the loop preheader edge. */
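/* A worked example (sketch): for ni_name == 100 and vf == 8 this emits

     ratio          = 12      i.e. ((100 - 8) >> 3) + 1
     ratio_mult_vf  = 96      i.e. 12 << 3

   so the vectorized loop will iterate 12 times and the remaining 4
   iterations are left for the epilog loop.  */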
5844 static void
5845 vect_generate_tmps_on_preheader (loop_vec_info loop_vinfo,
5846 tree ni_name,
5847 tree *ratio_mult_vf_name_ptr,
5848 tree *ratio_name_ptr)
5850 tree ni_minus_gap_name;
5851 tree var;
5852 tree ratio_name;
5853 tree ratio_mult_vf_name;
5854 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
5855 edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
5856 tree log_vf;
5858 log_vf = build_int_cst (TREE_TYPE (ni_name), exact_log2 (vf));
5860 /* If an epilogue loop is required because of data accesses with gaps, we
5861 subtract one iteration from the total number of iterations here so
5862 that RATIO is computed correctly. */
5863 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
5865 ni_minus_gap_name = fold_build2 (MINUS_EXPR, TREE_TYPE (ni_name),
5866 ni_name,
5867 build_one_cst (TREE_TYPE (ni_name)));
5868 if (!is_gimple_val (ni_minus_gap_name))
5870 var = create_tmp_var (TREE_TYPE (ni_name), "ni_gap");
5871 gimple stmts = NULL;
5872 ni_minus_gap_name = force_gimple_operand (ni_minus_gap_name, &stmts,
5873 true, var);
5874 gsi_insert_seq_on_edge_immediate (pe, stmts);
5877 else
5878 ni_minus_gap_name = ni_name;
5880 /* Create: ratio = ni >> log2(vf) */
5881 /* ??? As we have ni == number of latch executions + 1, ni could
5882 have overflowed to zero. So avoid computing the ratio based on ni;
5883 instead compute it using the fact that we know the ratio will be at
5884 least one, thus via ((ni - vf) >> log2(vf)) + 1. */
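      /* E.g. (sketch) for a 32-bit unsigned ni that wrapped to 0 (i.e. 2^32
         iterations) and vf == 8:

           ni >> 3              == 0             wrong
           ((ni - 8) >> 3) + 1  == 0x20000000    == 2^32 / 8

         which is the ratio we want.  */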
5885 ratio_name
5886 = fold_build2 (PLUS_EXPR, TREE_TYPE (ni_name),
5887 fold_build2 (RSHIFT_EXPR, TREE_TYPE (ni_name),
5888 fold_build2 (MINUS_EXPR, TREE_TYPE (ni_name),
5889 ni_minus_gap_name,
5890 build_int_cst
5891 (TREE_TYPE (ni_name), vf)),
5892 log_vf),
5893 build_int_cst (TREE_TYPE (ni_name), 1));
5894 if (!is_gimple_val (ratio_name))
5896 var = create_tmp_var (TREE_TYPE (ni_name), "bnd");
5897 gimple stmts = NULL;
5898 ratio_name = force_gimple_operand (ratio_name, &stmts, true, var);
5899 gsi_insert_seq_on_edge_immediate (pe, stmts);
5901 *ratio_name_ptr = ratio_name;
5903 /* Create: ratio_mult_vf = ratio << log2 (vf). */
5905 if (ratio_mult_vf_name_ptr)
5907 ratio_mult_vf_name = fold_build2 (LSHIFT_EXPR, TREE_TYPE (ratio_name),
5908 ratio_name, log_vf);
5909 if (!is_gimple_val (ratio_mult_vf_name))
5911 var = create_tmp_var (TREE_TYPE (ni_name), "ratio_mult_vf");
5912 gimple stmts = NULL;
5913 ratio_mult_vf_name = force_gimple_operand (ratio_mult_vf_name, &stmts,
5914 true, var);
5915 gsi_insert_seq_on_edge_immediate (pe, stmts);
5917 *ratio_mult_vf_name_ptr = ratio_mult_vf_name;
5920 return;
5924 /* Function vect_transform_loop.
5926 The analysis phase has determined that the loop is vectorizable.
5927 Vectorize the loop: create vectorized stmts to replace the scalar
5928 stmts in the loop, and update the loop exit condition. */
5930 void
5931 vect_transform_loop (loop_vec_info loop_vinfo)
5933 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5934 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
5935 int nbbs = loop->num_nodes;
5936 int i;
5937 tree ratio = NULL;
5938 int vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
5939 bool grouped_store;
5940 bool slp_scheduled = false;
5941 gimple stmt, pattern_stmt;
5942 gimple_seq pattern_def_seq = NULL;
5943 gimple_stmt_iterator pattern_def_si = gsi_none ();
5944 bool transform_pattern_stmt = false;
5945 bool check_profitability = false;
5946 int th;
5947 /* Record number of iterations before we started tampering with the profile. */
5948 gcov_type expected_iterations = expected_loop_iterations_unbounded (loop);
5950 if (dump_enabled_p ())
5951 dump_printf_loc (MSG_NOTE, vect_location, "=== vec_transform_loop ===\n");
5953 /* If the profile is imprecise, we have a chance to fix it up. */
5954 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
5955 expected_iterations = LOOP_VINFO_INT_NITERS (loop_vinfo);
5957 /* Use the more conservative vectorization threshold. If the number
5958 of iterations is constant, assume the cost check has been performed
5959 by our caller. If the threshold makes all loops that run at least
5960 the vectorization factor number of times profitable, then checking
5961 is pointless, too. */
5962 th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
5963 if (th >= LOOP_VINFO_VECT_FACTOR (loop_vinfo) - 1
5964 && !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
5966 if (dump_enabled_p ())
5967 dump_printf_loc (MSG_NOTE, vect_location,
5968 "Profitability threshold is %d loop iterations.\n",
5969 th);
5970 check_profitability = true;
5973 /* Version the loop first, if required, so the profitability check
5974 comes first. */
5976 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
5977 || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
5979 vect_loop_versioning (loop_vinfo, th, check_profitability);
5980 check_profitability = false;
5983 tree ni_name = vect_build_loop_niters (loop_vinfo);
5984 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = ni_name;
5986 /* Peel the loop if there are data refs with unknown alignment.
5987 Only one data ref with unknown store is allowed. */
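/* As a sketch, if the chosen data ref needs K scalar iterations before its
   address becomes aligned, the peeling creates

     for (i = 0; i < K; i++)      scalar prolog
       ...body...
     for ( ; i < n'; i += VF)     vectorized loop, access now aligned
       ...body...

   where K is a compile-time constant or is computed at run time from the
   base address.  */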
5989 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo))
5991 vect_do_peeling_for_alignment (loop_vinfo, ni_name,
5992 th, check_profitability);
5993 check_profitability = false;
5994 /* The above adjusts LOOP_VINFO_NITERS, so cause ni_name to
5995 be re-computed. */
5996 ni_name = NULL_TREE;
5999 /* If the loop has a symbolic number of iterations 'n' (i.e. it's not a
6000 compile-time constant), or it is a constant that is not a multiple of the
6001 vectorization factor, then an epilog loop needs to be created.
6002 We therefore duplicate the loop: the original loop will be vectorized,
6003 and will compute the first (n/VF) iterations. The second copy of the loop
6004 will remain scalar and will compute the remaining (n%VF) iterations.
6005 (VF is the vectorization factor). */
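/* Sketched at the source level for VF == 4:

     for (i = 0; i < (n / 4) * 4; i += 4)
       ...vectorized body...
     for ( ; i < n; i++)
       ...original scalar body...

   The first loop executes ratio = n/4 times; the second handles the
   remaining n%4 iterations.  */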
6007 if (LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
6008 || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
6010 tree ratio_mult_vf;
6011 if (!ni_name)
6012 ni_name = vect_build_loop_niters (loop_vinfo);
6013 vect_generate_tmps_on_preheader (loop_vinfo, ni_name, &ratio_mult_vf,
6014 &ratio);
6015 vect_do_peeling_for_loop_bound (loop_vinfo, ni_name, ratio_mult_vf,
6016 th, check_profitability);
6018 else if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
6019 ratio = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
6020 LOOP_VINFO_INT_NITERS (loop_vinfo) / vectorization_factor);
6021 else
6023 if (!ni_name)
6024 ni_name = vect_build_loop_niters (loop_vinfo);
6025 vect_generate_tmps_on_preheader (loop_vinfo, ni_name, NULL, &ratio);
6028 /* 1) Make sure the loop header has exactly two entries
6029 2) Make sure we have a preheader basic block. */
6031 gcc_assert (EDGE_COUNT (loop->header->preds) == 2);
6033 split_edge (loop_preheader_edge (loop));
6035 /* FORNOW: the vectorizer supports only loops whose body consists
6036 of one basic block (header + empty latch). When the vectorizer
6037 supports more involved loop forms, the order in which the BBs are
6038 traversed will need to be reconsidered. */
6040 for (i = 0; i < nbbs; i++)
6042 basic_block bb = bbs[i];
6043 stmt_vec_info stmt_info;
6045 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
6046 gsi_next (&si))
6048 gphi *phi = si.phi ();
6049 if (dump_enabled_p ())
6051 dump_printf_loc (MSG_NOTE, vect_location,
6052 "------>vectorizing phi: ");
6053 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
6054 dump_printf (MSG_NOTE, "\n");
6056 stmt_info = vinfo_for_stmt (phi);
6057 if (!stmt_info)
6058 continue;
6060 if (MAY_HAVE_DEBUG_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
6061 vect_loop_kill_debug_uses (loop, phi);
6063 if (!STMT_VINFO_RELEVANT_P (stmt_info)
6064 && !STMT_VINFO_LIVE_P (stmt_info))
6065 continue;
6067 if (STMT_VINFO_VECTYPE (stmt_info)
6068 && (TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info))
6069 != (unsigned HOST_WIDE_INT) vectorization_factor)
6070 && dump_enabled_p ())
6071 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
6073 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def)
6075 if (dump_enabled_p ())
6076 dump_printf_loc (MSG_NOTE, vect_location, "transform phi.\n");
6077 vect_transform_stmt (phi, NULL, NULL, NULL, NULL);
6081 pattern_stmt = NULL;
6082 for (gimple_stmt_iterator si = gsi_start_bb (bb);
6083 !gsi_end_p (si) || transform_pattern_stmt;)
6085 bool is_store;
6087 if (transform_pattern_stmt)
6088 stmt = pattern_stmt;
6089 else
6091 stmt = gsi_stmt (si);
6092 /* During vectorization remove existing clobber stmts. */
6093 if (gimple_clobber_p (stmt))
6095 unlink_stmt_vdef (stmt);
6096 gsi_remove (&si, true);
6097 release_defs (stmt);
6098 continue;
6102 if (dump_enabled_p ())
6104 dump_printf_loc (MSG_NOTE, vect_location,
6105 "------>vectorizing statement: ");
6106 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
6107 dump_printf (MSG_NOTE, "\n");
6110 stmt_info = vinfo_for_stmt (stmt);
6112 /* vector stmts created in the outer-loop during vectorization of
6113 stmts in an inner-loop may not have a stmt_info, and do not
6114 need to be vectorized. */
6115 if (!stmt_info)
6117 gsi_next (&si);
6118 continue;
6121 if (MAY_HAVE_DEBUG_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
6122 vect_loop_kill_debug_uses (loop, stmt);
6124 if (!STMT_VINFO_RELEVANT_P (stmt_info)
6125 && !STMT_VINFO_LIVE_P (stmt_info))
6127 if (STMT_VINFO_IN_PATTERN_P (stmt_info)
6128 && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
6129 && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
6130 || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
6132 stmt = pattern_stmt;
6133 stmt_info = vinfo_for_stmt (stmt);
6135 else
6137 gsi_next (&si);
6138 continue;
6141 else if (STMT_VINFO_IN_PATTERN_P (stmt_info)
6142 && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
6143 && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
6144 || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
6145 transform_pattern_stmt = true;
6147 /* If pattern statement has def stmts, vectorize them too. */
6148 if (is_pattern_stmt_p (stmt_info))
6150 if (pattern_def_seq == NULL)
6152 pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
6153 pattern_def_si = gsi_start (pattern_def_seq);
6155 else if (!gsi_end_p (pattern_def_si))
6156 gsi_next (&pattern_def_si);
6157 if (pattern_def_seq != NULL)
6159 gimple pattern_def_stmt = NULL;
6160 stmt_vec_info pattern_def_stmt_info = NULL;
6162 while (!gsi_end_p (pattern_def_si))
6164 pattern_def_stmt = gsi_stmt (pattern_def_si);
6165 pattern_def_stmt_info
6166 = vinfo_for_stmt (pattern_def_stmt);
6167 if (STMT_VINFO_RELEVANT_P (pattern_def_stmt_info)
6168 || STMT_VINFO_LIVE_P (pattern_def_stmt_info))
6169 break;
6170 gsi_next (&pattern_def_si);
6173 if (!gsi_end_p (pattern_def_si))
6175 if (dump_enabled_p ())
6177 dump_printf_loc (MSG_NOTE, vect_location,
6178 "==> vectorizing pattern def "
6179 "stmt: ");
6180 dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
6181 pattern_def_stmt, 0);
6182 dump_printf (MSG_NOTE, "\n");
6185 stmt = pattern_def_stmt;
6186 stmt_info = pattern_def_stmt_info;
6188 else
6190 pattern_def_si = gsi_none ();
6191 transform_pattern_stmt = false;
6194 else
6195 transform_pattern_stmt = false;
6198 if (STMT_VINFO_VECTYPE (stmt_info))
6200 unsigned int nunits
6201 = (unsigned int)
6202 TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
6203 if (!STMT_SLP_TYPE (stmt_info)
6204 && nunits != (unsigned int) vectorization_factor
6205 && dump_enabled_p ())
6206 /* For SLP, VF is set according to the unrolling factor, not
6207 to the vector size, hence for SLP this diagnostic is not valid. */
6208 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
6211 /* SLP. Schedule all the SLP instances when the first SLP stmt is
6212 reached. */
6213 if (STMT_SLP_TYPE (stmt_info))
6215 if (!slp_scheduled)
6217 slp_scheduled = true;
6219 if (dump_enabled_p ())
6220 dump_printf_loc (MSG_NOTE, vect_location,
6221 "=== scheduling SLP instances ===\n");
6223 vect_schedule_slp (loop_vinfo, NULL);
6226 /* Hybrid SLP stmts must be vectorized in addition to SLP. */
6227 if (!vinfo_for_stmt (stmt) || PURE_SLP_STMT (stmt_info))
6229 if (!transform_pattern_stmt && gsi_end_p (pattern_def_si))
6231 pattern_def_seq = NULL;
6232 gsi_next (&si);
6234 continue;
6238 /* -------- vectorize statement ------------ */
6239 if (dump_enabled_p ())
6240 dump_printf_loc (MSG_NOTE, vect_location, "transform statement.\n");
6242 grouped_store = false;
6243 is_store = vect_transform_stmt (stmt, &si, &grouped_store, NULL, NULL);
6244 if (is_store)
6246 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
6248 /* Interleaving. If IS_STORE is TRUE, the vectorization of the
6249 interleaving chain was completed - free all the stores in
6250 the chain. */
6251 gsi_next (&si);
6252 vect_remove_stores (GROUP_FIRST_ELEMENT (stmt_info));
6254 else
6256 /* Free the attached stmt_vec_info and remove the stmt. */
6257 gimple store = gsi_stmt (si);
6258 free_stmt_vec_info (store);
6259 unlink_stmt_vdef (store);
6260 gsi_remove (&si, true);
6261 release_defs (store);
6264 /* Stores can only appear at the end of pattern statements. */
6265 gcc_assert (!transform_pattern_stmt);
6266 pattern_def_seq = NULL;
6268 else if (!transform_pattern_stmt && gsi_end_p (pattern_def_si))
6270 pattern_def_seq = NULL;
6271 gsi_next (&si);
6273 } /* stmts in BB */
6274 } /* BBs in loop */
6276 slpeel_make_loop_iterate_ntimes (loop, ratio);
6278 /* Reduce loop iterations by the vectorization factor. */
6279 scale_loop_profile (loop, GCOV_COMPUTE_SCALE (1, vectorization_factor),
6280 expected_iterations / vectorization_factor);
6281 loop->nb_iterations_upper_bound
6282 = wi::udiv_floor (loop->nb_iterations_upper_bound, vectorization_factor);
6283 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
6284 && loop->nb_iterations_upper_bound != 0)
6285 loop->nb_iterations_upper_bound = loop->nb_iterations_upper_bound - 1;
6286 if (loop->any_estimate)
6288 loop->nb_iterations_estimate
6289 = wi::udiv_floor (loop->nb_iterations_estimate, vectorization_factor);
6290 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
6291 && loop->nb_iterations_estimate != 0)
6292 loop->nb_iterations_estimate = loop->nb_iterations_estimate - 1;
6295 if (dump_enabled_p ())
6297 dump_printf_loc (MSG_NOTE, vect_location,
6298 "LOOP VECTORIZED\n");
6299 if (loop->inner)
6300 dump_printf_loc (MSG_NOTE, vect_location,
6301 "OUTER LOOP VECTORIZED\n");
6302 dump_printf (MSG_NOTE, "\n");