gcc/tree-vect-loop.c
1 /* Loop Vectorization
2 Copyright (C) 2003-2015 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com> and
4 Ira Rosen <irar@il.ibm.com>
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "dumpfile.h"
26 #include "backend.h"
27 #include "cfghooks.h"
28 #include "tree.h"
29 #include "gimple.h"
30 #include "rtl.h"
31 #include "ssa.h"
32 #include "alias.h"
33 #include "fold-const.h"
34 #include "stor-layout.h"
35 #include "cfganal.h"
36 #include "gimple-pretty-print.h"
37 #include "internal-fn.h"
38 #include "gimplify.h"
39 #include "gimple-iterator.h"
40 #include "gimplify-me.h"
41 #include "tree-ssa-loop-ivopts.h"
42 #include "tree-ssa-loop-manip.h"
43 #include "tree-ssa-loop-niter.h"
44 #include "tree-pass.h"
45 #include "cfgloop.h"
46 #include "flags.h"
47 #include "insn-codes.h"
48 #include "optabs-tree.h"
49 #include "params.h"
50 #include "diagnostic-core.h"
51 #include "tree-chrec.h"
52 #include "tree-scalar-evolution.h"
53 #include "tree-vectorizer.h"
54 #include "target.h"
56 /* Loop Vectorization Pass.
58 This pass tries to vectorize loops.
60 For example, the vectorizer transforms the following simple loop:
62 short a[N]; short b[N]; short c[N]; int i;
64 for (i=0; i<N; i++){
65 a[i] = b[i] + c[i];
68 as if it had been manually vectorized by rewriting the source code into:
70 typedef int __attribute__((mode(V8HI))) v8hi;
71 short a[N]; short b[N]; short c[N]; int i;
72 v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
73 v8hi va, vb, vc;
75 for (i=0; i<N/8; i++){
76 vb = pb[i];
77 vc = pc[i];
78 va = vb + vc;
79 pa[i] = va;
82 The main entry to this pass is vectorize_loops(), in which
83 the vectorizer applies a set of analyses on a given set of loops,
84 followed by the actual vectorization transformation for the loops that
85 had successfully passed the analysis phase.
86 Throughout this pass we make a distinction between two types of
87 data: scalars (which are represented by SSA_NAMES), and memory references
88 ("data-refs"). These two types of data require different handling both
89 during analysis and transformation. The types of data-refs that the
90 vectorizer currently supports are ARRAY_REFS whose base is an array DECL
91 (not a pointer), and INDIRECT_REFS through pointers; both array and pointer
92 accesses are required to have a simple (consecutive) access pattern.
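     For instance (illustrative only), the accesses in

       for (i=0; i<N; i++)
         a[i] = *(p + i);

     are a consecutive array store and a consecutive pointer load and fit
     this scheme, whereas an indirect access such as a[b[i]] does not.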
94 Analysis phase:
95 ===============
96 The driver for the analysis phase is vect_analyze_loop().
97 It applies a set of analyses, some of which rely on the scalar evolution
98 analyzer (scev) developed by Sebastian Pop.
100 During the analysis phase the vectorizer records some information
101 per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
102 loop, as well as general information about the loop as a whole, which is
103 recorded in a "loop_vec_info" struct attached to each loop.
105 Transformation phase:
106 =====================
107 The loop transformation phase scans all the stmts in the loop, and
108 creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
109 the loop that needs to be vectorized. It inserts the vector code sequence
110 just before the scalar stmt S, and records a pointer to the vector code
111 in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
112 attached to S). This pointer will be used for the vectorization of following
113 stmts which use the def of stmt S. Stmt S is removed if it writes to memory;
114 otherwise, we rely on dead code elimination for removing it.
116 For example, say stmt S1 was vectorized into stmt VS1:
118 VS1: vb = px[i];
119 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
120 S2: a = b;
122 To vectorize stmt S2, the vectorizer first finds the stmt that defines
123 the operand 'b' (S1), and gets the relevant vector def 'vb' from the
124 vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)). The
125 resulting sequence would be:
127 VS1: vb = px[i];
128 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
129 VS2: va = vb;
130 S2: a = b; STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2
132 Operands that are not SSA_NAMEs are data-refs that appear in
133 load/store operations (like 'x[i]' in S1), and are handled differently.
135 Target modeling:
136 =================
137 Currently the only target specific information that is used is the
138 size of the vector (in bytes) - "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".
139 Targets that can support vectors of different sizes will, for now, need
140 to specify one value for "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD". More
141 flexibility will be added in the future.
143 Since we only vectorize operations whose vector form can be
144 expressed using existing tree codes, to verify that an operation is
145 supported, the vectorizer checks the relevant optab at the relevant
146 machine_mode (e.g., optab_handler (add_optab, V8HImode)). If
147 the value found is CODE_FOR_nothing, then there's no target support, and
148 we can't vectorize the stmt.
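     As a minimal illustrative sketch (not the exact code; the real checks
     live in the vectorizable_* routines), a target-support check for an
     addition on a vector type VECTYPE looks roughly like this:

       optab op = optab_for_tree_code (PLUS_EXPR, vectype, optab_default);
       if (!op || optab_handler (op, TYPE_MODE (vectype)) == CODE_FOR_nothing)
         return false;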
150 For additional information on this project see:
151 http://gcc.gnu.org/projects/tree-ssa/vectorization.html
154 static void vect_estimate_min_profitable_iters (loop_vec_info, int *, int *);
156 /* Function vect_determine_vectorization_factor
158 Determine the vectorization factor (VF). VF is the number of data elements
159 that are operated upon in parallel in a single iteration of the vectorized
160 loop. For example, when vectorizing a loop that operates on 4-byte elements,
161 on a target with a vector size (VS) of 16 bytes, the VF is set to 4, since 4
162 elements can fit in a single vector register.
164 We currently support vectorization of loops in which all types operated upon
165 are of the same size. Therefore this function currently sets VF according to
166 the size of the types operated upon, and fails if there are multiple sizes
167 in the loop.
169 VF is also the factor by which the loop iterations are strip-mined, e.g.:
170 original loop:
171 for (i=0; i<N; i++){
172 a[i] = b[i] + c[i];
175 vectorized loop:
176 for (i=0; i<N; i+=VF){
177 a[i:VF] = b[i:VF] + c[i:VF];
181 static bool
182 vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
184 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
185 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
186 int nbbs = loop->num_nodes;
187 unsigned int vectorization_factor = 0;
188 tree scalar_type;
189 gphi *phi;
190 tree vectype;
191 unsigned int nunits;
192 stmt_vec_info stmt_info;
193 int i;
194 HOST_WIDE_INT dummy;
195 gimple *stmt, *pattern_stmt = NULL;
196 gimple_seq pattern_def_seq = NULL;
197 gimple_stmt_iterator pattern_def_si = gsi_none ();
198 bool analyze_pattern_stmt = false;
200 if (dump_enabled_p ())
201 dump_printf_loc (MSG_NOTE, vect_location,
202 "=== vect_determine_vectorization_factor ===\n");
204 for (i = 0; i < nbbs; i++)
206 basic_block bb = bbs[i];
208 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
209 gsi_next (&si))
211 phi = si.phi ();
212 stmt_info = vinfo_for_stmt (phi);
213 if (dump_enabled_p ())
215 dump_printf_loc (MSG_NOTE, vect_location, "==> examining phi: ");
216 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
217 dump_printf (MSG_NOTE, "\n");
220 gcc_assert (stmt_info);
222 if (STMT_VINFO_RELEVANT_P (stmt_info))
224 gcc_assert (!STMT_VINFO_VECTYPE (stmt_info));
225 scalar_type = TREE_TYPE (PHI_RESULT (phi));
227 if (dump_enabled_p ())
229 dump_printf_loc (MSG_NOTE, vect_location,
230 "get vectype for scalar type: ");
231 dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
232 dump_printf (MSG_NOTE, "\n");
235 vectype = get_vectype_for_scalar_type (scalar_type);
236 if (!vectype)
238 if (dump_enabled_p ())
240 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
241 "not vectorized: unsupported "
242 "data-type ");
243 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
244 scalar_type);
245 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
247 return false;
249 STMT_VINFO_VECTYPE (stmt_info) = vectype;
251 if (dump_enabled_p ())
253 dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
254 dump_generic_expr (MSG_NOTE, TDF_SLIM, vectype);
255 dump_printf (MSG_NOTE, "\n");
258 nunits = TYPE_VECTOR_SUBPARTS (vectype);
259 if (dump_enabled_p ())
260 dump_printf_loc (MSG_NOTE, vect_location, "nunits = %d\n",
261 nunits);
263 if (!vectorization_factor
264 || (nunits > vectorization_factor))
265 vectorization_factor = nunits;
269 for (gimple_stmt_iterator si = gsi_start_bb (bb);
270 !gsi_end_p (si) || analyze_pattern_stmt;)
272 tree vf_vectype;
274 if (analyze_pattern_stmt)
275 stmt = pattern_stmt;
276 else
277 stmt = gsi_stmt (si);
279 stmt_info = vinfo_for_stmt (stmt);
281 if (dump_enabled_p ())
283 dump_printf_loc (MSG_NOTE, vect_location,
284 "==> examining statement: ");
285 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
286 dump_printf (MSG_NOTE, "\n");
289 gcc_assert (stmt_info);
291 /* Skip stmts which do not need to be vectorized. */
292 if ((!STMT_VINFO_RELEVANT_P (stmt_info)
293 && !STMT_VINFO_LIVE_P (stmt_info))
294 || gimple_clobber_p (stmt))
296 if (STMT_VINFO_IN_PATTERN_P (stmt_info)
297 && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
298 && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
299 || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
301 stmt = pattern_stmt;
302 stmt_info = vinfo_for_stmt (pattern_stmt);
303 if (dump_enabled_p ())
305 dump_printf_loc (MSG_NOTE, vect_location,
306 "==> examining pattern statement: ");
307 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
308 dump_printf (MSG_NOTE, "\n");
311 else
313 if (dump_enabled_p ())
314 dump_printf_loc (MSG_NOTE, vect_location, "skip.\n");
315 gsi_next (&si);
316 continue;
319 else if (STMT_VINFO_IN_PATTERN_P (stmt_info)
320 && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
321 && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
322 || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
323 analyze_pattern_stmt = true;
325 /* If a pattern statement has def stmts, analyze them too. */
326 if (is_pattern_stmt_p (stmt_info))
328 if (pattern_def_seq == NULL)
330 pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
331 pattern_def_si = gsi_start (pattern_def_seq);
333 else if (!gsi_end_p (pattern_def_si))
334 gsi_next (&pattern_def_si);
335 if (pattern_def_seq != NULL)
337 gimple *pattern_def_stmt = NULL;
338 stmt_vec_info pattern_def_stmt_info = NULL;
340 while (!gsi_end_p (pattern_def_si))
342 pattern_def_stmt = gsi_stmt (pattern_def_si);
343 pattern_def_stmt_info
344 = vinfo_for_stmt (pattern_def_stmt);
345 if (STMT_VINFO_RELEVANT_P (pattern_def_stmt_info)
346 || STMT_VINFO_LIVE_P (pattern_def_stmt_info))
347 break;
348 gsi_next (&pattern_def_si);
351 if (!gsi_end_p (pattern_def_si))
353 if (dump_enabled_p ())
355 dump_printf_loc (MSG_NOTE, vect_location,
356 "==> examining pattern def stmt: ");
357 dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
358 pattern_def_stmt, 0);
359 dump_printf (MSG_NOTE, "\n");
362 stmt = pattern_def_stmt;
363 stmt_info = pattern_def_stmt_info;
365 else
367 pattern_def_si = gsi_none ();
368 analyze_pattern_stmt = false;
371 else
372 analyze_pattern_stmt = false;
375 if (gimple_get_lhs (stmt) == NULL_TREE
376 /* MASK_STORE has no lhs, but is ok. */
377 && (!is_gimple_call (stmt)
378 || !gimple_call_internal_p (stmt)
379 || gimple_call_internal_fn (stmt) != IFN_MASK_STORE))
381 if (is_gimple_call (stmt))
383 /* Ignore calls with no lhs. These must be calls to
384 #pragma omp simd functions, and what vectorization factor
385 they really need can't be determined until
386 vectorizable_simd_clone_call. */
387 if (!analyze_pattern_stmt && gsi_end_p (pattern_def_si))
389 pattern_def_seq = NULL;
390 gsi_next (&si);
392 continue;
394 if (dump_enabled_p ())
396 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
397 "not vectorized: irregular stmt.");
398 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt,
400 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
402 return false;
405 if (VECTOR_MODE_P (TYPE_MODE (gimple_expr_type (stmt))))
407 if (dump_enabled_p ())
409 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
410 "not vectorized: vector stmt in loop:");
411 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
412 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
414 return false;
417 if (STMT_VINFO_VECTYPE (stmt_info))
419 /* The only case when a vectype has already been set is for stmts
420 that contain a dataref, or for "pattern-stmts" (stmts
421 generated by the vectorizer to represent/replace a certain
422 idiom). */
423 gcc_assert (STMT_VINFO_DATA_REF (stmt_info)
424 || is_pattern_stmt_p (stmt_info)
425 || !gsi_end_p (pattern_def_si));
426 vectype = STMT_VINFO_VECTYPE (stmt_info);
428 else
430 gcc_assert (!STMT_VINFO_DATA_REF (stmt_info));
431 if (is_gimple_call (stmt)
432 && gimple_call_internal_p (stmt)
433 && gimple_call_internal_fn (stmt) == IFN_MASK_STORE)
434 scalar_type = TREE_TYPE (gimple_call_arg (stmt, 3));
435 else
436 scalar_type = TREE_TYPE (gimple_get_lhs (stmt));
437 if (dump_enabled_p ())
439 dump_printf_loc (MSG_NOTE, vect_location,
440 "get vectype for scalar type: ");
441 dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
442 dump_printf (MSG_NOTE, "\n");
444 vectype = get_vectype_for_scalar_type (scalar_type);
445 if (!vectype)
447 if (dump_enabled_p ())
449 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
450 "not vectorized: unsupported "
451 "data-type ");
452 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
453 scalar_type);
454 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
456 return false;
459 STMT_VINFO_VECTYPE (stmt_info) = vectype;
461 if (dump_enabled_p ())
463 dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
464 dump_generic_expr (MSG_NOTE, TDF_SLIM, vectype);
465 dump_printf (MSG_NOTE, "\n");
469 /* The vectorization factor is determined according to the smallest
470 scalar type (or the largest vector size, but we only
471 support one vector size per loop). */
472 scalar_type = vect_get_smallest_scalar_type (stmt, &dummy,
473 &dummy);
474 if (dump_enabled_p ())
476 dump_printf_loc (MSG_NOTE, vect_location,
477 "get vectype for scalar type: ");
478 dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
479 dump_printf (MSG_NOTE, "\n");
481 vf_vectype = get_vectype_for_scalar_type (scalar_type);
482 if (!vf_vectype)
484 if (dump_enabled_p ())
486 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
487 "not vectorized: unsupported data-type ");
488 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
489 scalar_type);
490 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
492 return false;
495 if ((GET_MODE_SIZE (TYPE_MODE (vectype))
496 != GET_MODE_SIZE (TYPE_MODE (vf_vectype))))
498 if (dump_enabled_p ())
500 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
501 "not vectorized: different sized vector "
502 "types in statement, ");
503 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
504 vectype);
505 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
506 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
507 vf_vectype);
508 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
510 return false;
513 if (dump_enabled_p ())
515 dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
516 dump_generic_expr (MSG_NOTE, TDF_SLIM, vf_vectype);
517 dump_printf (MSG_NOTE, "\n");
520 nunits = TYPE_VECTOR_SUBPARTS (vf_vectype);
521 if (dump_enabled_p ())
522 dump_printf_loc (MSG_NOTE, vect_location, "nunits = %d\n", nunits);
523 if (!vectorization_factor
524 || (nunits > vectorization_factor))
525 vectorization_factor = nunits;
527 if (!analyze_pattern_stmt && gsi_end_p (pattern_def_si))
529 pattern_def_seq = NULL;
530 gsi_next (&si);
535 /* TODO: Analyze cost. Decide if worthwhile to vectorize. */
536 if (dump_enabled_p ())
537 dump_printf_loc (MSG_NOTE, vect_location, "vectorization factor = %d\n",
538 vectorization_factor);
539 if (vectorization_factor <= 1)
541 if (dump_enabled_p ())
542 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
543 "not vectorized: unsupported data-type\n");
544 return false;
546 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
548 return true;
552 /* Function vect_is_simple_iv_evolution.
554 FORNOW: A simple evolution of an induction variable in the loop is
555 considered a polynomial evolution. */
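/* Illustrative example: for the IV i of "for (i = 0; i < n; i++)" the scev
   analyzer returns the chrec {0, +, 1}_loop, so INIT is 0 and STEP is 1.
   A step that is itself a chrec (an evolution of degree >= 2) is rejected
   below as not "simple".  */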
557 static bool
558 vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init,
559 tree * step)
561 tree init_expr;
562 tree step_expr;
563 tree evolution_part = evolution_part_in_loop_num (access_fn, loop_nb);
564 basic_block bb;
566 /* When there is no evolution in this loop, the evolution function
567 is not "simple". */
568 if (evolution_part == NULL_TREE)
569 return false;
571 /* When the evolution is a polynomial of degree >= 2
572 the evolution function is not "simple". */
573 if (tree_is_chrec (evolution_part))
574 return false;
576 step_expr = evolution_part;
577 init_expr = unshare_expr (initial_condition_in_loop_num (access_fn, loop_nb));
579 if (dump_enabled_p ())
581 dump_printf_loc (MSG_NOTE, vect_location, "step: ");
582 dump_generic_expr (MSG_NOTE, TDF_SLIM, step_expr);
583 dump_printf (MSG_NOTE, ", init: ");
584 dump_generic_expr (MSG_NOTE, TDF_SLIM, init_expr);
585 dump_printf (MSG_NOTE, "\n");
588 *init = init_expr;
589 *step = step_expr;
591 if (TREE_CODE (step_expr) != INTEGER_CST
592 && (TREE_CODE (step_expr) != SSA_NAME
593 || ((bb = gimple_bb (SSA_NAME_DEF_STMT (step_expr)))
594 && flow_bb_inside_loop_p (get_loop (cfun, loop_nb), bb))
595 || (!INTEGRAL_TYPE_P (TREE_TYPE (step_expr))
596 && (!SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr))
597 || !flag_associative_math)))
598 && (TREE_CODE (step_expr) != REAL_CST
599 || !flag_associative_math))
601 if (dump_enabled_p ())
602 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
603 "step unknown.\n");
604 return false;
607 return true;
610 /* Function vect_analyze_scalar_cycles_1.
612 Examine the cross iteration def-use cycles of scalar variables
613 in LOOP. LOOP_VINFO represents the loop that is now being
614 considered for vectorization (can be LOOP, or an outer-loop
615 enclosing LOOP). */
617 static void
618 vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, struct loop *loop)
620 basic_block bb = loop->header;
621 tree init, step;
622 auto_vec<gimple *, 64> worklist;
623 gphi_iterator gsi;
624 bool double_reduc;
626 if (dump_enabled_p ())
627 dump_printf_loc (MSG_NOTE, vect_location,
628 "=== vect_analyze_scalar_cycles ===\n");
630 /* First - identify all inductions. Reduction detection assumes that all the
631 inductions have been identified, therefore, this order must not be
632 changed. */
633 for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
635 gphi *phi = gsi.phi ();
636 tree access_fn = NULL;
637 tree def = PHI_RESULT (phi);
638 stmt_vec_info stmt_vinfo = vinfo_for_stmt (phi);
640 if (dump_enabled_p ())
642 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: ");
643 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
644 dump_printf (MSG_NOTE, "\n");
647 /* Skip virtual phi's. The data dependences that are associated with
648 virtual defs/uses (i.e., memory accesses) are analyzed elsewhere. */
649 if (virtual_operand_p (def))
650 continue;
652 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_unknown_def_type;
654 /* Analyze the evolution function. */
655 access_fn = analyze_scalar_evolution (loop, def);
656 if (access_fn)
658 STRIP_NOPS (access_fn);
659 if (dump_enabled_p ())
661 dump_printf_loc (MSG_NOTE, vect_location,
662 "Access function of PHI: ");
663 dump_generic_expr (MSG_NOTE, TDF_SLIM, access_fn);
664 dump_printf (MSG_NOTE, "\n");
666 STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo)
667 = evolution_part_in_loop_num (access_fn, loop->num);
670 if (!access_fn
671 || !vect_is_simple_iv_evolution (loop->num, access_fn, &init, &step)
672 || (LOOP_VINFO_LOOP (loop_vinfo) != loop
673 && TREE_CODE (step) != INTEGER_CST))
675 worklist.safe_push (phi);
676 continue;
679 gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo) != NULL_TREE);
681 if (dump_enabled_p ())
682 dump_printf_loc (MSG_NOTE, vect_location, "Detected induction.\n");
683 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_induction_def;
687 /* Second - identify all reductions and nested cycles. */
688 while (worklist.length () > 0)
690 gimple *phi = worklist.pop ();
691 tree def = PHI_RESULT (phi);
692 stmt_vec_info stmt_vinfo = vinfo_for_stmt (phi);
693 gimple *reduc_stmt;
694 bool nested_cycle;
696 if (dump_enabled_p ())
698 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: ");
699 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
700 dump_printf (MSG_NOTE, "\n");
703 gcc_assert (!virtual_operand_p (def)
704 && STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_unknown_def_type);
706 nested_cycle = (loop != LOOP_VINFO_LOOP (loop_vinfo));
707 reduc_stmt = vect_force_simple_reduction (loop_vinfo, phi, !nested_cycle,
708 &double_reduc, false);
709 if (reduc_stmt)
711 if (double_reduc)
713 if (dump_enabled_p ())
714 dump_printf_loc (MSG_NOTE, vect_location,
715 "Detected double reduction.\n");
717 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_double_reduction_def;
718 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
719 vect_double_reduction_def;
721 else
723 if (nested_cycle)
725 if (dump_enabled_p ())
726 dump_printf_loc (MSG_NOTE, vect_location,
727 "Detected vectorizable nested cycle.\n");
729 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_nested_cycle;
730 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
731 vect_nested_cycle;
733 else
735 if (dump_enabled_p ())
736 dump_printf_loc (MSG_NOTE, vect_location,
737 "Detected reduction.\n");
739 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def;
740 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
741 vect_reduction_def;
742 /* Store the reduction cycles for possible vectorization in
743 loop-aware SLP. */
744 LOOP_VINFO_REDUCTIONS (loop_vinfo).safe_push (reduc_stmt);
748 else
749 if (dump_enabled_p ())
750 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
751 "Unknown def-use cycle pattern.\n");
756 /* Function vect_analyze_scalar_cycles.
758 Examine the cross iteration def-use cycles of scalar variables, by
759 analyzing the loop-header PHIs of scalar variables. Classify each
760 cycle as one of the following: invariant, induction, reduction, unknown.
761 We do that for the loop represented by LOOP_VINFO, and also for its
762 inner-loop, if it exists.
763 Examples for scalar cycles:
765 Example1: reduction:
767 loop1:
768 for (i=0; i<N; i++)
769 sum += a[i];
771 Example2: induction:
773 loop2:
774 for (i=0; i<N; i++)
775 a[i] = i; */
777 static void
778 vect_analyze_scalar_cycles (loop_vec_info loop_vinfo)
780 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
782 vect_analyze_scalar_cycles_1 (loop_vinfo, loop);
784 /* When vectorizing an outer-loop, the inner-loop is executed sequentially.
785 Reductions in such inner-loop therefore have different properties than
786 the reductions in the nest that gets vectorized:
787 1. When vectorized, they are executed in the same order as in the original
788 scalar loop, so we can't change the order of computation when
789 vectorizing them.
790 2. FIXME: Inner-loop reductions can be used in the inner-loop, so the
791 current checks are too strict. */
793 if (loop->inner)
794 vect_analyze_scalar_cycles_1 (loop_vinfo, loop->inner);
797 /* Transfer group and reduction information from STMT to its pattern stmt. */
799 static void
800 vect_fixup_reduc_chain (gimple *stmt)
802 gimple *firstp = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt));
803 gimple *stmtp;
804 gcc_assert (!GROUP_FIRST_ELEMENT (vinfo_for_stmt (firstp))
805 && GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)));
806 GROUP_SIZE (vinfo_for_stmt (firstp)) = GROUP_SIZE (vinfo_for_stmt (stmt));
809 stmtp = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt));
810 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmtp)) = firstp;
811 stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmt));
812 if (stmt)
813 GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmtp))
814 = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt));
816 while (stmt);
817 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmtp)) = vect_reduction_def;
820 /* Fixup scalar cycles that now have their stmts detected as patterns. */
822 static void
823 vect_fixup_scalar_cycles_with_patterns (loop_vec_info loop_vinfo)
825 gimple *first;
826 unsigned i;
828 FOR_EACH_VEC_ELT (LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo), i, first)
829 if (STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (first)))
831 vect_fixup_reduc_chain (first);
832 LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo)[i]
833 = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (first));
837 /* Function vect_get_loop_niters.
839 Determine how many iterations the loop executes and place the count
840 in NUMBER_OF_ITERATIONS. Place the number of latch iterations
841 in NUMBER_OF_ITERATIONSM1.
843 Return the loop exit condition. */
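/* Illustrative example (assuming the do-while form the vectorizer requires):
   for a loop whose body executes N > 0 times, the latch runs N - 1 times, so
   NUMBER_OF_ITERATIONSM1 is N - 1 and NUMBER_OF_ITERATIONS is N.  */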
846 static gcond *
847 vect_get_loop_niters (struct loop *loop, tree *number_of_iterations,
848 tree *number_of_iterationsm1)
850 tree niters;
852 if (dump_enabled_p ())
853 dump_printf_loc (MSG_NOTE, vect_location,
854 "=== get_loop_niters ===\n");
856 niters = number_of_latch_executions (loop);
857 *number_of_iterationsm1 = niters;
859 /* We want the number of loop header executions which is the number
860 of latch executions plus one.
861 ??? For UINT_MAX latch executions this number overflows to zero
862 for loops like do { n++; } while (n != 0); */
863 if (niters && !chrec_contains_undetermined (niters))
864 niters = fold_build2 (PLUS_EXPR, TREE_TYPE (niters), unshare_expr (niters),
865 build_int_cst (TREE_TYPE (niters), 1));
866 *number_of_iterations = niters;
868 return get_loop_exit_condition (loop);
872 /* Function bb_in_loop_p
874 Used as predicate for dfs order traversal of the loop bbs. */
876 static bool
877 bb_in_loop_p (const_basic_block bb, const void *data)
879 const struct loop *const loop = (const struct loop *)data;
880 if (flow_bb_inside_loop_p (loop, bb))
881 return true;
882 return false;
886 /* Function new_loop_vec_info.
888 Create and initialize a new loop_vec_info struct for LOOP, as well as
889 stmt_vec_info structs for all the stmts in LOOP. */
891 static loop_vec_info
892 new_loop_vec_info (struct loop *loop)
894 loop_vec_info res;
895 basic_block *bbs;
896 gimple_stmt_iterator si;
897 unsigned int i, nbbs;
899 res = (loop_vec_info) xcalloc (1, sizeof (struct _loop_vec_info));
900 LOOP_VINFO_LOOP (res) = loop;
902 bbs = get_loop_body (loop);
904 /* Create/Update stmt_info for all stmts in the loop. */
905 for (i = 0; i < loop->num_nodes; i++)
907 basic_block bb = bbs[i];
909 /* BBs in a nested inner-loop will already have been processed (because
910 we will have called vect_analyze_loop_form for any nested inner-loop).
911 Therefore, for stmts in an inner-loop we just want to update the
912 STMT_VINFO_LOOP_VINFO field of their stmt_info to point to the new
913 loop_info of the outer-loop we are currently considering to vectorize
914 (instead of the loop_info of the inner-loop).
915 For stmts in other BBs we need to create a stmt_info from scratch. */
916 if (bb->loop_father != loop)
918 /* Inner-loop bb. */
919 gcc_assert (loop->inner && bb->loop_father == loop->inner);
920 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
922 gimple *phi = gsi_stmt (si);
923 stmt_vec_info stmt_info = vinfo_for_stmt (phi);
924 loop_vec_info inner_loop_vinfo =
925 STMT_VINFO_LOOP_VINFO (stmt_info);
926 gcc_assert (loop->inner == LOOP_VINFO_LOOP (inner_loop_vinfo));
927 STMT_VINFO_LOOP_VINFO (stmt_info) = res;
929 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
931 gimple *stmt = gsi_stmt (si);
932 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
933 loop_vec_info inner_loop_vinfo =
934 STMT_VINFO_LOOP_VINFO (stmt_info);
935 gcc_assert (loop->inner == LOOP_VINFO_LOOP (inner_loop_vinfo));
936 STMT_VINFO_LOOP_VINFO (stmt_info) = res;
939 else
941 /* bb in current nest. */
942 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
944 gimple *phi = gsi_stmt (si);
945 gimple_set_uid (phi, 0);
946 set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, res, NULL));
949 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
951 gimple *stmt = gsi_stmt (si);
952 gimple_set_uid (stmt, 0);
953 set_vinfo_for_stmt (stmt, new_stmt_vec_info (stmt, res, NULL));
958 /* CHECKME: We want to visit all BBs before their successors (except for
959 latch blocks, for which this assertion wouldn't hold). In the simple
960 case of the loop forms we allow, a dfs order of the BBs would be the same
961 as reversed postorder traversal, so we are safe. */
963 free (bbs);
964 bbs = XCNEWVEC (basic_block, loop->num_nodes);
965 nbbs = dfs_enumerate_from (loop->header, 0, bb_in_loop_p,
966 bbs, loop->num_nodes, loop);
967 gcc_assert (nbbs == loop->num_nodes);
969 LOOP_VINFO_BBS (res) = bbs;
970 LOOP_VINFO_NITERSM1 (res) = NULL;
971 LOOP_VINFO_NITERS (res) = NULL;
972 LOOP_VINFO_NITERS_UNCHANGED (res) = NULL;
973 LOOP_VINFO_COST_MODEL_MIN_ITERS (res) = 0;
974 LOOP_VINFO_COST_MODEL_THRESHOLD (res) = 0;
975 LOOP_VINFO_VECTORIZABLE_P (res) = 0;
976 LOOP_VINFO_PEELING_FOR_ALIGNMENT (res) = 0;
977 LOOP_VINFO_VECT_FACTOR (res) = 0;
978 LOOP_VINFO_LOOP_NEST (res).create (3);
979 LOOP_VINFO_DATAREFS (res).create (10);
980 LOOP_VINFO_DDRS (res).create (10 * 10);
981 LOOP_VINFO_UNALIGNED_DR (res) = NULL;
982 LOOP_VINFO_MAY_MISALIGN_STMTS (res).create (
983 PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIGNMENT_CHECKS));
984 LOOP_VINFO_MAY_ALIAS_DDRS (res).create (
985 PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS));
986 LOOP_VINFO_GROUPED_STORES (res).create (10);
987 LOOP_VINFO_REDUCTIONS (res).create (10);
988 LOOP_VINFO_REDUCTION_CHAINS (res).create (10);
989 LOOP_VINFO_SLP_INSTANCES (res).create (10);
990 LOOP_VINFO_SLP_UNROLLING_FACTOR (res) = 1;
991 LOOP_VINFO_TARGET_COST_DATA (res) = init_cost (loop);
992 LOOP_VINFO_PEELING_FOR_GAPS (res) = false;
993 LOOP_VINFO_PEELING_FOR_NITER (res) = false;
994 LOOP_VINFO_OPERANDS_SWAPPED (res) = false;
996 return res;
1000 /* Function destroy_loop_vec_info.
1002 Free LOOP_VINFO struct, as well as all the stmt_vec_info structs of all the
1003 stmts in the loop. */
1005 void
1006 destroy_loop_vec_info (loop_vec_info loop_vinfo, bool clean_stmts)
1008 struct loop *loop;
1009 basic_block *bbs;
1010 int nbbs;
1011 gimple_stmt_iterator si;
1012 int j;
1013 vec<slp_instance> slp_instances;
1014 slp_instance instance;
1015 bool swapped;
1017 if (!loop_vinfo)
1018 return;
1020 loop = LOOP_VINFO_LOOP (loop_vinfo);
1022 bbs = LOOP_VINFO_BBS (loop_vinfo);
1023 nbbs = clean_stmts ? loop->num_nodes : 0;
1024 swapped = LOOP_VINFO_OPERANDS_SWAPPED (loop_vinfo);
1026 for (j = 0; j < nbbs; j++)
1028 basic_block bb = bbs[j];
1029 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
1030 free_stmt_vec_info (gsi_stmt (si));
1032 for (si = gsi_start_bb (bb); !gsi_end_p (si); )
1034 gimple *stmt = gsi_stmt (si);
1036 /* We may have broken canonical form by moving a constant
1037 into RHS1 of a commutative op. Fix such occurrences. */
1038 if (swapped && is_gimple_assign (stmt))
1040 enum tree_code code = gimple_assign_rhs_code (stmt);
1042 if ((code == PLUS_EXPR
1043 || code == POINTER_PLUS_EXPR
1044 || code == MULT_EXPR)
1045 && CONSTANT_CLASS_P (gimple_assign_rhs1 (stmt)))
1046 swap_ssa_operands (stmt,
1047 gimple_assign_rhs1_ptr (stmt),
1048 gimple_assign_rhs2_ptr (stmt));
1051 /* Free stmt_vec_info. */
1052 free_stmt_vec_info (stmt);
1053 gsi_next (&si);
1057 free (LOOP_VINFO_BBS (loop_vinfo));
1058 vect_destroy_datarefs (loop_vinfo, NULL);
1059 free_dependence_relations (LOOP_VINFO_DDRS (loop_vinfo));
1060 LOOP_VINFO_LOOP_NEST (loop_vinfo).release ();
1061 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).release ();
1062 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).release ();
1063 slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
1064 FOR_EACH_VEC_ELT (slp_instances, j, instance)
1065 vect_free_slp_instance (instance);
1067 LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
1068 LOOP_VINFO_GROUPED_STORES (loop_vinfo).release ();
1069 LOOP_VINFO_REDUCTIONS (loop_vinfo).release ();
1070 LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).release ();
1072 delete LOOP_VINFO_PEELING_HTAB (loop_vinfo);
1073 LOOP_VINFO_PEELING_HTAB (loop_vinfo) = NULL;
1075 destroy_cost_data (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo));
1076 loop_vinfo->scalar_cost_vec.release ();
1078 free (loop_vinfo);
1079 loop->aux = NULL;
1083 /* Calculate the cost of one scalar iteration of the loop. */
1084 static void
1085 vect_get_single_scalar_iteration_cost (loop_vec_info loop_vinfo)
1087 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1088 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1089 int nbbs = loop->num_nodes, factor, scalar_single_iter_cost = 0;
1090 int innerloop_iters, i;
1092 /* Count statements in the scalar loop. We use this as the scalar cost of a
1093 single iteration for now.
1095 TODO: Add outer loop support.
1097 TODO: Consider assigning different costs to different scalar
1098 statements. */
1100 /* FORNOW. */
1101 innerloop_iters = 1;
1102 if (loop->inner)
1103 innerloop_iters = 50; /* FIXME */
1105 for (i = 0; i < nbbs; i++)
1107 gimple_stmt_iterator si;
1108 basic_block bb = bbs[i];
1110 if (bb->loop_father == loop->inner)
1111 factor = innerloop_iters;
1112 else
1113 factor = 1;
1115 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1117 gimple *stmt = gsi_stmt (si);
1118 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1120 if (!is_gimple_assign (stmt) && !is_gimple_call (stmt))
1121 continue;
1123 /* Skip stmts that are not vectorized inside the loop. */
1124 if (stmt_info
1125 && !STMT_VINFO_RELEVANT_P (stmt_info)
1126 && (!STMT_VINFO_LIVE_P (stmt_info)
1127 || !VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1128 && !STMT_VINFO_IN_PATTERN_P (stmt_info))
1129 continue;
1131 vect_cost_for_stmt kind;
1132 if (STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt)))
1134 if (DR_IS_READ (STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt))))
1135 kind = scalar_load;
1136 else
1137 kind = scalar_store;
1139 else
1140 kind = scalar_stmt;
1142 scalar_single_iter_cost
1143 += record_stmt_cost (&LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1144 factor, kind, NULL, 0, vect_prologue);
1147 LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo)
1148 = scalar_single_iter_cost;
1152 /* Function vect_analyze_loop_1.
1154 Apply a set of analyses on LOOP, and create a loop_vec_info struct
1155 for it. The different analyses will record information in the
1156 loop_vec_info struct. This is a subset of the analyses applied in
1157 vect_analyze_loop, to be applied on an inner-loop nested in the loop
1158 that is now considered for (outer-loop) vectorization. */
1160 static loop_vec_info
1161 vect_analyze_loop_1 (struct loop *loop)
1163 loop_vec_info loop_vinfo;
1165 if (dump_enabled_p ())
1166 dump_printf_loc (MSG_NOTE, vect_location,
1167 "===== analyze_loop_nest_1 =====\n");
1169 /* Check the CFG characteristics of the loop (nesting, entry/exit, etc.). */
1171 loop_vinfo = vect_analyze_loop_form (loop);
1172 if (!loop_vinfo)
1174 if (dump_enabled_p ())
1175 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1176 "bad inner-loop form.\n");
1177 return NULL;
1180 return loop_vinfo;
1184 /* Function vect_analyze_loop_form.
1186 Verify that certain CFG restrictions hold, including:
1187 - the loop has a pre-header
1188 - the loop has a single entry and exit
1189 - the loop exit condition is simple enough, and the number of iterations
1190 can be analyzed (a countable loop). */
1192 loop_vec_info
1193 vect_analyze_loop_form (struct loop *loop)
1195 loop_vec_info loop_vinfo;
1196 gcond *loop_cond;
1197 tree number_of_iterations = NULL, number_of_iterationsm1 = NULL;
1198 loop_vec_info inner_loop_vinfo = NULL;
1200 if (dump_enabled_p ())
1201 dump_printf_loc (MSG_NOTE, vect_location,
1202 "=== vect_analyze_loop_form ===\n");
1204 /* Different restrictions apply when we are considering an inner-most loop,
1205 vs. an outer (nested) loop.
1206 (FORNOW. May want to relax some of these restrictions in the future). */
1208 if (!loop->inner)
1210 /* Inner-most loop. We currently require that the number of BBs is
1211 exactly 2 (the header and latch). Vectorizable inner-most loops
1212 look like this:
1214                     (pre-header)
1215                        |
1216                      header <--------+
1217                       | |            |
1218                       | +--> latch --+
1219                       |
1220                     (exit-bb)   */
1222 if (loop->num_nodes != 2)
1224 if (dump_enabled_p ())
1225 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1226 "not vectorized: control flow in loop.\n");
1227 return NULL;
1230 if (empty_block_p (loop->header))
1232 if (dump_enabled_p ())
1233 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1234 "not vectorized: empty loop.\n");
1235 return NULL;
1238 else
1240 struct loop *innerloop = loop->inner;
1241 edge entryedge;
1243 /* Nested loop. We currently require that the loop is doubly-nested,
1244 contains a single inner loop, and the number of BBs is exactly 5.
1245 Vectorizable outer-loops look like this:
1247                     (pre-header)
1248                        |
1249                      header <---+
1250                       |         |
1251                     inner-loop  |
1252                       |         |
1253                      tail ------+
1254                       |
1255                     (exit-bb)
1257 The inner-loop has the properties expected of inner-most loops
1258 as described above. */
1260 if ((loop->inner)->inner || (loop->inner)->next)
1262 if (dump_enabled_p ())
1263 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1264 "not vectorized: multiple nested loops.\n");
1265 return NULL;
1268 /* Analyze the inner-loop. */
1269 inner_loop_vinfo = vect_analyze_loop_1 (loop->inner);
1270 if (!inner_loop_vinfo)
1272 if (dump_enabled_p ())
1273 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1274 "not vectorized: Bad inner loop.\n");
1275 return NULL;
1278 if (!expr_invariant_in_loop_p (loop,
1279 LOOP_VINFO_NITERS (inner_loop_vinfo)))
1281 if (dump_enabled_p ())
1282 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1283 "not vectorized: inner-loop count not"
1284 " invariant.\n");
1285 destroy_loop_vec_info (inner_loop_vinfo, true);
1286 return NULL;
1289 if (loop->num_nodes != 5)
1291 if (dump_enabled_p ())
1292 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1293 "not vectorized: control flow in loop.\n");
1294 destroy_loop_vec_info (inner_loop_vinfo, true);
1295 return NULL;
1298 gcc_assert (EDGE_COUNT (innerloop->header->preds) == 2);
1299 entryedge = EDGE_PRED (innerloop->header, 0);
1300 if (EDGE_PRED (innerloop->header, 0)->src == innerloop->latch)
1301 entryedge = EDGE_PRED (innerloop->header, 1);
1303 if (entryedge->src != loop->header
1304 || !single_exit (innerloop)
1305 || single_exit (innerloop)->dest != EDGE_PRED (loop->latch, 0)->src)
1307 if (dump_enabled_p ())
1308 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1309 "not vectorized: unsupported outerloop form.\n");
1310 destroy_loop_vec_info (inner_loop_vinfo, true);
1311 return NULL;
1314 if (dump_enabled_p ())
1315 dump_printf_loc (MSG_NOTE, vect_location,
1316 "Considering outer-loop vectorization.\n");
1319 if (!single_exit (loop)
1320 || EDGE_COUNT (loop->header->preds) != 2)
1322 if (dump_enabled_p ())
1324 if (!single_exit (loop))
1325 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1326 "not vectorized: multiple exits.\n");
1327 else if (EDGE_COUNT (loop->header->preds) != 2)
1328 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1329 "not vectorized: too many incoming edges.\n");
1331 if (inner_loop_vinfo)
1332 destroy_loop_vec_info (inner_loop_vinfo, true);
1333 return NULL;
1336 /* We assume that the loop exit condition is at the end of the loop, i.e.,
1337 that the loop is represented as a do-while (with a proper if-guard
1338 before the loop if needed), where the loop header contains all the
1339 executable statements, and the latch is empty. */
1340 if (!empty_block_p (loop->latch)
1341 || !gimple_seq_empty_p (phi_nodes (loop->latch)))
1343 if (dump_enabled_p ())
1344 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1345 "not vectorized: latch block not empty.\n");
1346 if (inner_loop_vinfo)
1347 destroy_loop_vec_info (inner_loop_vinfo, true);
1348 return NULL;
1351 /* Make sure there exists a single-predecessor exit bb: */
1352 if (!single_pred_p (single_exit (loop)->dest))
1354 edge e = single_exit (loop);
1355 if (!(e->flags & EDGE_ABNORMAL))
1357 split_loop_exit_edge (e);
1358 if (dump_enabled_p ())
1359 dump_printf (MSG_NOTE, "split exit edge.\n");
1361 else
1363 if (dump_enabled_p ())
1364 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1365 "not vectorized: abnormal loop exit edge.\n");
1366 if (inner_loop_vinfo)
1367 destroy_loop_vec_info (inner_loop_vinfo, true);
1368 return NULL;
1372 loop_cond = vect_get_loop_niters (loop, &number_of_iterations,
1373 &number_of_iterationsm1);
1374 if (!loop_cond)
1376 if (dump_enabled_p ())
1377 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1378 "not vectorized: complicated exit condition.\n");
1379 if (inner_loop_vinfo)
1380 destroy_loop_vec_info (inner_loop_vinfo, true);
1381 return NULL;
1384 if (!number_of_iterations
1385 || chrec_contains_undetermined (number_of_iterations))
1387 if (dump_enabled_p ())
1388 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1389 "not vectorized: number of iterations cannot be "
1390 "computed.\n");
1391 if (inner_loop_vinfo)
1392 destroy_loop_vec_info (inner_loop_vinfo, true);
1393 return NULL;
1396 if (integer_zerop (number_of_iterations))
1398 if (dump_enabled_p ())
1399 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1400 "not vectorized: number of iterations = 0.\n");
1401 if (inner_loop_vinfo)
1402 destroy_loop_vec_info (inner_loop_vinfo, true);
1403 return NULL;
1406 loop_vinfo = new_loop_vec_info (loop);
1407 LOOP_VINFO_NITERSM1 (loop_vinfo) = number_of_iterationsm1;
1408 LOOP_VINFO_NITERS (loop_vinfo) = number_of_iterations;
1409 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = number_of_iterations;
1411 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1413 if (dump_enabled_p ())
1415 dump_printf_loc (MSG_NOTE, vect_location,
1416 "Symbolic number of iterations is ");
1417 dump_generic_expr (MSG_NOTE, TDF_DETAILS, number_of_iterations);
1418 dump_printf (MSG_NOTE, "\n");
1422 STMT_VINFO_TYPE (vinfo_for_stmt (loop_cond)) = loop_exit_ctrl_vec_info_type;
1424 /* CHECKME: May want to keep it around in the future. */
1425 if (inner_loop_vinfo)
1426 destroy_loop_vec_info (inner_loop_vinfo, false);
1428 gcc_assert (!loop->aux);
1429 loop->aux = loop_vinfo;
1430 return loop_vinfo;
1433 /* Scan the loop stmts and, depending on whether there are any (non-)SLP
1434 statements, update the vectorization factor. */
1436 static void
1437 vect_update_vf_for_slp (loop_vec_info loop_vinfo)
1439 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1440 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1441 int nbbs = loop->num_nodes;
1442 unsigned int vectorization_factor;
1443 int i;
1445 if (dump_enabled_p ())
1446 dump_printf_loc (MSG_NOTE, vect_location,
1447 "=== vect_update_vf_for_slp ===\n");
1449 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1450 gcc_assert (vectorization_factor != 0);
1452 /* If all the stmts in the loop can be SLPed, we perform only SLP, and the
1453 vectorization factor of the loop is the unrolling factor required by
1454 the SLP instances. If that unrolling factor is 1, we say that we
1455 perform pure SLP on the loop - cross-iteration parallelism is not
1456 exploited. */
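  /* Worked example (illustrative numbers): if the vectorization factor
     computed so far is 4 and the SLP unrolling factor is 2, a loop with
     hybrid stmts keeps VF = lcm (4, 2) = 4, while a pure-SLP loop gets
     VF = 2.  */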
1457 bool only_slp_in_loop = true;
1458 for (i = 0; i < nbbs; i++)
1460 basic_block bb = bbs[i];
1461 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1462 gsi_next (&si))
1464 gimple *stmt = gsi_stmt (si);
1465 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1466 if (STMT_VINFO_IN_PATTERN_P (stmt_info)
1467 && STMT_VINFO_RELATED_STMT (stmt_info))
1469 stmt = STMT_VINFO_RELATED_STMT (stmt_info);
1470 stmt_info = vinfo_for_stmt (stmt);
1472 if ((STMT_VINFO_RELEVANT_P (stmt_info)
1473 || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1474 && !PURE_SLP_STMT (stmt_info))
1475 /* STMT needs both SLP and loop-based vectorization. */
1476 only_slp_in_loop = false;
1480 if (only_slp_in_loop)
1481 vectorization_factor = LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo);
1482 else
1483 vectorization_factor
1484 = least_common_multiple (vectorization_factor,
1485 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo));
1487 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
1488 if (dump_enabled_p ())
1489 dump_printf_loc (MSG_NOTE, vect_location,
1490 "Updating vectorization factor to %d\n",
1491 vectorization_factor);
1494 /* Function vect_analyze_loop_operations.
1496 Scan the loop stmts and make sure they are all vectorizable. */
1498 static bool
1499 vect_analyze_loop_operations (loop_vec_info loop_vinfo)
1501 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1502 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1503 int nbbs = loop->num_nodes;
1504 unsigned int vectorization_factor;
1505 int i;
1506 stmt_vec_info stmt_info;
1507 bool need_to_vectorize = false;
1508 int min_profitable_iters;
1509 int min_scalar_loop_bound;
1510 unsigned int th;
1511 bool ok;
1512 HOST_WIDE_INT max_niter;
1513 HOST_WIDE_INT estimated_niter;
1514 int min_profitable_estimate;
1516 if (dump_enabled_p ())
1517 dump_printf_loc (MSG_NOTE, vect_location,
1518 "=== vect_analyze_loop_operations ===\n");
1520 for (i = 0; i < nbbs; i++)
1522 basic_block bb = bbs[i];
1524 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
1525 gsi_next (&si))
1527 gphi *phi = si.phi ();
1528 ok = true;
1530 stmt_info = vinfo_for_stmt (phi);
1531 if (dump_enabled_p ())
1533 dump_printf_loc (MSG_NOTE, vect_location, "examining phi: ");
1534 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
1535 dump_printf (MSG_NOTE, "\n");
1538 /* Inner-loop loop-closed exit phi in outer-loop vectorization
1539 (i.e., a phi in the tail of the outer-loop). */
1540 if (! is_loop_header_bb_p (bb))
1542 /* FORNOW: we currently don't support the case that these phis
1543 are not used in the outer-loop (unless it is a double reduction,
1544 i.e., this phi is vect_reduction_def), because this case
1545 requires us to actually do something here. */
1546 if ((!STMT_VINFO_RELEVANT_P (stmt_info)
1547 || STMT_VINFO_LIVE_P (stmt_info))
1548 && STMT_VINFO_DEF_TYPE (stmt_info)
1549 != vect_double_reduction_def)
1551 if (dump_enabled_p ())
1552 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1553 "Unsupported loop-closed phi in "
1554 "outer-loop.\n");
1555 return false;
1558 /* If PHI is used in the outer loop, we check that its operand
1559 is defined in the inner loop. */
1560 if (STMT_VINFO_RELEVANT_P (stmt_info))
1562 tree phi_op;
1563 gimple *op_def_stmt;
1565 if (gimple_phi_num_args (phi) != 1)
1566 return false;
1568 phi_op = PHI_ARG_DEF (phi, 0);
1569 if (TREE_CODE (phi_op) != SSA_NAME)
1570 return false;
1572 op_def_stmt = SSA_NAME_DEF_STMT (phi_op);
1573 if (gimple_nop_p (op_def_stmt)
1574 || !flow_bb_inside_loop_p (loop, gimple_bb (op_def_stmt))
1575 || !vinfo_for_stmt (op_def_stmt))
1576 return false;
1578 if (STMT_VINFO_RELEVANT (vinfo_for_stmt (op_def_stmt))
1579 != vect_used_in_outer
1580 && STMT_VINFO_RELEVANT (vinfo_for_stmt (op_def_stmt))
1581 != vect_used_in_outer_by_reduction)
1582 return false;
1585 continue;
1588 gcc_assert (stmt_info);
1590 if (STMT_VINFO_LIVE_P (stmt_info))
1592 /* FORNOW: not yet supported. */
1593 if (dump_enabled_p ())
1594 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1595 "not vectorized: value used after loop.\n");
1596 return false;
1599 if (STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope
1600 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
1602 /* A scalar-dependence cycle that we don't support. */
1603 if (dump_enabled_p ())
1604 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1605 "not vectorized: scalar dependence cycle.\n");
1606 return false;
1609 if (STMT_VINFO_RELEVANT_P (stmt_info))
1611 need_to_vectorize = true;
1612 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def)
1613 ok = vectorizable_induction (phi, NULL, NULL);
1616 if (!ok)
1618 if (dump_enabled_p ())
1620 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1621 "not vectorized: relevant phi not "
1622 "supported: ");
1623 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, phi, 0);
1624 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
1626 return false;
1630 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1631 gsi_next (&si))
1633 gimple *stmt = gsi_stmt (si);
1634 if (!gimple_clobber_p (stmt)
1635 && !vect_analyze_stmt (stmt, &need_to_vectorize, NULL))
1636 return false;
1638 } /* bbs */
1640 /* All operations in the loop are either irrelevant (deal with loop
1641 control, or dead), or only used outside the loop and can be moved
1642 out of the loop (e.g. invariants, inductions). The loop can be
1643 optimized away by scalar optimizations. We're better off not
1644 touching this loop. */
1645 if (!need_to_vectorize)
1647 if (dump_enabled_p ())
1648 dump_printf_loc (MSG_NOTE, vect_location,
1649 "All the computation can be taken out of the loop.\n");
1650 if (dump_enabled_p ())
1651 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1652 "not vectorized: redundant loop. no profit to "
1653 "vectorize.\n");
1654 return false;
1657 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1658 gcc_assert (vectorization_factor != 0);
1660 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && dump_enabled_p ())
1661 dump_printf_loc (MSG_NOTE, vect_location,
1662 "vectorization_factor = %d, niters = "
1663 HOST_WIDE_INT_PRINT_DEC "\n", vectorization_factor,
1664 LOOP_VINFO_INT_NITERS (loop_vinfo));
1666 if ((LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1667 && (LOOP_VINFO_INT_NITERS (loop_vinfo) < vectorization_factor))
1668 || ((max_niter = max_stmt_executions_int (loop)) != -1
1669 && (unsigned HOST_WIDE_INT) max_niter < vectorization_factor))
1671 if (dump_enabled_p ())
1672 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1673 "not vectorized: iteration count too small.\n");
1674 if (dump_enabled_p ())
1675 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1676 "not vectorized: iteration count smaller than "
1677 "vectorization factor.\n");
1678 return false;
1681 /* Analyze cost. Decide if worthwhile to vectorize. */
1683 vect_estimate_min_profitable_iters (loop_vinfo, &min_profitable_iters,
1684 &min_profitable_estimate);
1685 LOOP_VINFO_COST_MODEL_MIN_ITERS (loop_vinfo) = min_profitable_iters;
1687 if (min_profitable_iters < 0)
1689 if (dump_enabled_p ())
1690 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1691 "not vectorized: vectorization not profitable.\n");
1692 if (dump_enabled_p ())
1693 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1694 "not vectorized: vector version will never be "
1695 "profitable.\n");
1696 return false;
1699 min_scalar_loop_bound = ((PARAM_VALUE (PARAM_MIN_VECT_LOOP_BOUND)
1700 * vectorization_factor) - 1);
1703 /* Use the cost model only if it is more conservative than the
1704 user-specified threshold. */
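  /* Illustrative numbers: with --param min-vect-loop-bound=2 and a
     vectorization factor of 4, min_scalar_loop_bound is 2*4 - 1 = 7; if the
     cost model computed min_profitable_iters = 10, the more conservative
     value 10 becomes the threshold.  */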
1706 th = (unsigned) min_scalar_loop_bound;
1707 if (min_profitable_iters
1708 && (!min_scalar_loop_bound
1709 || min_profitable_iters > min_scalar_loop_bound))
1710 th = (unsigned) min_profitable_iters;
1712 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = th;
1714 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1715 && LOOP_VINFO_INT_NITERS (loop_vinfo) <= th)
1717 if (dump_enabled_p ())
1718 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1719 "not vectorized: vectorization not profitable.\n");
1720 if (dump_enabled_p ())
1721 dump_printf_loc (MSG_NOTE, vect_location,
1722 "not vectorized: iteration count smaller than user "
1723 "specified loop bound parameter or minimum profitable "
1724 "iterations (whichever is more conservative).\n");
1725 return false;
1728 if ((estimated_niter = estimated_stmt_executions_int (loop)) != -1
1729 && ((unsigned HOST_WIDE_INT) estimated_niter
1730 <= MAX (th, (unsigned)min_profitable_estimate)))
1732 if (dump_enabled_p ())
1733 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1734 "not vectorized: estimated iteration count too "
1735 "small.\n");
1736 if (dump_enabled_p ())
1737 dump_printf_loc (MSG_NOTE, vect_location,
1738 "not vectorized: estimated iteration count smaller "
1739 "than specified loop bound parameter or minimum "
1740 "profitable iterations (whichever is more "
1741 "conservative).\n");
1742 return false;
1745 return true;
1749 /* Function vect_analyze_loop_2.
1751 Apply a set of analyses on LOOP, and create a loop_vec_info struct
1752 for it. The different analyses will record information in the
1753 loop_vec_info struct. */
1754 static bool
1755 vect_analyze_loop_2 (loop_vec_info loop_vinfo)
1757 bool ok;
1758 int max_vf = MAX_VECTORIZATION_FACTOR;
1759 int min_vf = 2;
1760 unsigned int th;
1761 unsigned int n_stmts = 0;
1763 /* Find all data references in the loop (which correspond to vdefs/vuses)
1764 and analyze their evolution in the loop. Also adjust the minimal
1765 vectorization factor according to the loads and stores.
1767 FORNOW: Handle only simple array references whose
1768 alignment can be forced, and aligned pointer-references. */
1770 ok = vect_analyze_data_refs (loop_vinfo, NULL, &min_vf, &n_stmts);
1771 if (!ok)
1773 if (dump_enabled_p ())
1774 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1775 "bad data references.\n");
1776 return false;
1779 /* Classify all cross-iteration scalar data-flow cycles.
1780 Cross-iteration cycles caused by virtual phis are analyzed separately. */
1782 vect_analyze_scalar_cycles (loop_vinfo);
1784 vect_pattern_recog (loop_vinfo, NULL);
1786 vect_fixup_scalar_cycles_with_patterns (loop_vinfo);
1788 /* Analyze the access patterns of the data-refs in the loop (consecutive,
1789 complex, etc.). FORNOW: Only handle consecutive access pattern. */
1791 ok = vect_analyze_data_ref_accesses (loop_vinfo, NULL);
1792 if (!ok)
1794 if (dump_enabled_p ())
1795 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1796 "bad data access.\n");
1797 return false;
1800 /* Data-flow analysis to detect stmts that do not need to be vectorized. */
1802 ok = vect_mark_stmts_to_be_vectorized (loop_vinfo);
1803 if (!ok)
1805 if (dump_enabled_p ())
1806 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1807 "unexpected pattern.\n");
1808 return false;
1811 /* Analyze data dependences between the data-refs in the loop
1812 and adjust the maximum vectorization factor according to
1813 the dependences.
1814 FORNOW: fail at the first data dependence that we encounter. */
1816 ok = vect_analyze_data_ref_dependences (loop_vinfo, &max_vf);
1817 if (!ok
1818 || max_vf < min_vf)
1820 if (dump_enabled_p ())
1821 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1822 "bad data dependence.\n");
1823 return false;
1826 ok = vect_determine_vectorization_factor (loop_vinfo);
1827 if (!ok)
1829 if (dump_enabled_p ())
1830 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1831 "can't determine vectorization factor.\n");
1832 return false;
1834 if (max_vf < LOOP_VINFO_VECT_FACTOR (loop_vinfo))
1836 if (dump_enabled_p ())
1837 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1838 "bad data dependence.\n");
1839 return false;
1842 /* Check the SLP opportunities in the loop, analyze and build SLP trees. */
1843 ok = vect_analyze_slp (loop_vinfo, NULL, n_stmts);
1844 if (!ok)
1845 return false;
1847 /* If there are any SLP instances mark them as pure_slp. */
1848 bool slp = vect_make_slp_decision (loop_vinfo);
1849 if (slp)
1851 /* Find stmts that need to be both vectorized and SLPed. */
1852 vect_detect_hybrid_slp (loop_vinfo);
1854 /* Update the vectorization factor based on the SLP decision. */
1855 vect_update_vf_for_slp (loop_vinfo);
1858 /* Analyze the alignment of the data-refs in the loop.
1859 Fail if a data reference is found that cannot be vectorized. */
1861 ok = vect_analyze_data_refs_alignment (loop_vinfo, NULL);
1862 if (!ok)
1864 if (dump_enabled_p ())
1865 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1866 "bad data alignment.\n");
1867 return false;
1870 /* Prune the list of ddrs to be tested at run-time by versioning for alias.
1871 It is important to call pruning after vect_analyze_data_ref_accesses,
1872 since we use grouping information gathered by interleaving analysis. */
1873 ok = vect_prune_runtime_alias_test_list (loop_vinfo);
1874 if (!ok)
1876 if (dump_enabled_p ())
1877 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1878 "number of versioning for alias "
1879 "run-time tests exceeds %d "
1880 "(--param vect-max-version-for-alias-checks)\n",
1881 PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS));
1882 return false;
1885 /* Compute the scalar iteration cost. */
1886 vect_get_single_scalar_iteration_cost (loop_vinfo);
1888 /* This pass will decide on using loop versioning and/or loop peeling in
1889 order to enhance the alignment of data references in the loop. */
1891 ok = vect_enhance_data_refs_alignment (loop_vinfo);
1892 if (!ok)
1894 if (dump_enabled_p ())
1895 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1896 "bad data alignment.\n");
1897 return false;
1900 if (slp)
1902 /* Analyze operations in the SLP instances. Note this may
1903 remove unsupported SLP instances which makes the above
1904 SLP kind detection invalid. */
1905 unsigned old_size = LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length ();
1906 vect_slp_analyze_operations (LOOP_VINFO_SLP_INSTANCES (loop_vinfo),
1907 LOOP_VINFO_TARGET_COST_DATA (loop_vinfo));
1908 if (LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length () != old_size)
1909 return false;
1912 /* Scan all the remaining operations in the loop that are not subject
1913 to SLP and make sure they are vectorizable. */
1914 ok = vect_analyze_loop_operations (loop_vinfo);
1915 if (!ok)
1917 if (dump_enabled_p ())
1918 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1919 "bad operation or unsupported loop bound.\n");
1920 return false;
1923 /* Decide whether we need to create an epilogue loop to handle
1924 remaining scalar iterations. */
1925 th = ((LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) + 1)
1926 / LOOP_VINFO_VECT_FACTOR (loop_vinfo))
1927 * LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1929 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1930 && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) > 0)
1932 if (ctz_hwi (LOOP_VINFO_INT_NITERS (loop_vinfo)
1933 - LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo))
1934 < exact_log2 (LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
1935 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
1937 else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
1938 || (tree_ctz (LOOP_VINFO_NITERS (loop_vinfo))
1939 < (unsigned)exact_log2 (LOOP_VINFO_VECT_FACTOR (loop_vinfo))
1940 /* In case of versioning, check if the maximum number of
1941 iterations is greater than th. If they are identical,
1942 the epilogue is unnecessary. */
1943 && ((!LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo)
1944 && !LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
1945 || (unsigned HOST_WIDE_INT)max_stmt_executions_int
1946 (LOOP_VINFO_LOOP (loop_vinfo)) > th)))
1947 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
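/* Illustrative example with made-up numbers: with VF = 8, a known
   iteration count of 100 and 3 iterations peeled for alignment,
   100 - 3 = 97 has ctz (97) = 0 < log2 (8) = 3, so an epilogue loop is
   needed for the leftover 97 % 8 = 1 iteration.  With 67 iterations and
   the same peeling, 67 - 3 = 64 is a multiple of 8 (ctz (64) = 6 >= 3)
   and no epilogue is required.  */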
1949 /* If an epilogue loop is required make sure we can create one. */
1950 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
1951 || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo))
1953 if (dump_enabled_p ())
1954 dump_printf_loc (MSG_NOTE, vect_location, "epilog loop required\n");
1955 if (!vect_can_advance_ivs_p (loop_vinfo)
1956 || !slpeel_can_duplicate_loop_p (LOOP_VINFO_LOOP (loop_vinfo),
1957 single_exit (LOOP_VINFO_LOOP
1958 (loop_vinfo))))
1960 if (dump_enabled_p ())
1961 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1962 "not vectorized: can't create required "
1963 "epilog loop\n");
1964 return false;
1968 return true;
1971 /* Function vect_analyze_loop.
1973 Apply a set of analyses on LOOP, and create a loop_vec_info struct
1974 for it. The different analyses will record information in the
1975 loop_vec_info struct. */
1976 loop_vec_info
1977 vect_analyze_loop (struct loop *loop)
1979 loop_vec_info loop_vinfo;
1980 unsigned int vector_sizes;
1982 /* Autodetect first vector size we try. */
1983 current_vector_size = 0;
1984 vector_sizes = targetm.vectorize.autovectorize_vector_sizes ();
1986 if (dump_enabled_p ())
1987 dump_printf_loc (MSG_NOTE, vect_location,
1988 "===== analyze_loop_nest =====\n");
1990 if (loop_outer (loop)
1991 && loop_vec_info_for_loop (loop_outer (loop))
1992 && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop))))
1994 if (dump_enabled_p ())
1995 dump_printf_loc (MSG_NOTE, vect_location,
1996 "outer-loop already vectorized.\n");
1997 return NULL;
2000 while (1)
2002 /* Check the CFG characteristics of the loop (nesting, entry/exit). */
2003 loop_vinfo = vect_analyze_loop_form (loop);
2004 if (!loop_vinfo)
2006 if (dump_enabled_p ())
2007 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2008 "bad loop form.\n");
2009 return NULL;
2012 if (vect_analyze_loop_2 (loop_vinfo))
2014 LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1;
2016 return loop_vinfo;
2019 destroy_loop_vec_info (loop_vinfo, true);
2021 vector_sizes &= ~current_vector_size;
2022 if (vector_sizes == 0
2023 || current_vector_size == 0)
2024 return NULL;
2026 /* Try the largest remaining vector size. */
2027 current_vector_size = 1 << floor_log2 (vector_sizes);
2028 if (dump_enabled_p ())
2029 dump_printf_loc (MSG_NOTE, vect_location,
2030 "***** Re-trying analysis with "
2031 "vector size %d\n", current_vector_size);
2036 /* Function reduction_code_for_scalar_code
2038 Input:
2039 CODE - tree_code of a reduction operations.
2041 Output:
2042 REDUC_CODE - the corresponding tree-code to be used to reduce the
2043 vector of partial results into a single scalar result, or ERROR_MARK
2044 if the operation is a supported reduction operation, but does not have
2045 such a tree-code.
2047 Return FALSE if CODE currently cannot be vectorized as reduction. */
2049 static bool
2050 reduction_code_for_scalar_code (enum tree_code code,
2051 enum tree_code *reduc_code)
2053 switch (code)
2055 case MAX_EXPR:
2056 *reduc_code = REDUC_MAX_EXPR;
2057 return true;
2059 case MIN_EXPR:
2060 *reduc_code = REDUC_MIN_EXPR;
2061 return true;
2063 case PLUS_EXPR:
2064 *reduc_code = REDUC_PLUS_EXPR;
2065 return true;
2067 case MULT_EXPR:
2068 case MINUS_EXPR:
2069 case BIT_IOR_EXPR:
2070 case BIT_XOR_EXPR:
2071 case BIT_AND_EXPR:
2072 *reduc_code = ERROR_MARK;
2073 return true;
2075 default:
2076 return false;
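/* As an illustration: for a loop computing  sum += a[i]  the scalar code
   is PLUS_EXPR and the epilogue can collapse the vector accumulator with
   a single REDUC_PLUS_EXPR; for MULT_EXPR the function still returns true
   but with ERROR_MARK, so the epilogue instead extracts the elements and
   combines them with scalar operations (see the costing in
   vect_model_reduction_cost below).  */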
2081 /* Error reporting helper for vect_is_simple_reduction below. GIMPLE statement
2082 STMT is printed with a message MSG. */
2084 static void
2085 report_vect_op (int msg_type, gimple *stmt, const char *msg)
2087 dump_printf_loc (msg_type, vect_location, "%s", msg);
2088 dump_gimple_stmt (msg_type, TDF_SLIM, stmt, 0);
2089 dump_printf (msg_type, "\n");
2093 /* Detect SLP reduction of the form:
2095 #a1 = phi <a5, a0>
2096 a2 = operation (a1)
2097 a3 = operation (a2)
2098 a4 = operation (a3)
2099 a5 = operation (a4)
2101 #a = phi <a5>
2103 PHI is the reduction phi node (#a1 = phi <a5, a0> above)
2104 FIRST_STMT is the first reduction stmt in the chain
2105 (a2 = operation (a1)).
2107 Return TRUE if a reduction chain was detected. */
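/* Illustration (hypothetical source code): a chain like the one above
   typically comes from a loop such as

     int s = 0;
     for (i = 0; i < n; i++)
       s = s + a[4*i] + a[4*i+1] + a[4*i+2] + a[4*i+3];

   where each '+' becomes one of the statements a2..a5 and the phi node
   #a1 carries s around the loop.  */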
2109 static bool
2110 vect_is_slp_reduction (loop_vec_info loop_info, gimple *phi,
2111 gimple *first_stmt)
2113 struct loop *loop = (gimple_bb (phi))->loop_father;
2114 struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
2115 enum tree_code code;
2116 gimple *current_stmt = NULL, *loop_use_stmt = NULL, *first, *next_stmt;
2117 stmt_vec_info use_stmt_info, current_stmt_info;
2118 tree lhs;
2119 imm_use_iterator imm_iter;
2120 use_operand_p use_p;
2121 int nloop_uses, size = 0, n_out_of_loop_uses;
2122 bool found = false;
2124 if (loop != vect_loop)
2125 return false;
2127 lhs = PHI_RESULT (phi);
2128 code = gimple_assign_rhs_code (first_stmt);
2129 while (1)
2131 nloop_uses = 0;
2132 n_out_of_loop_uses = 0;
2133 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
2135 gimple *use_stmt = USE_STMT (use_p);
2136 if (is_gimple_debug (use_stmt))
2137 continue;
2139 /* Check if we got back to the reduction phi. */
2140 if (use_stmt == phi)
2142 loop_use_stmt = use_stmt;
2143 found = true;
2144 break;
2147 if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2149 loop_use_stmt = use_stmt;
2150 nloop_uses++;
2152 else
2153 n_out_of_loop_uses++;
2155 /* There can be either a single use in the loop or two uses in
2156 phi nodes. */
2157 if (nloop_uses > 1 || (n_out_of_loop_uses && nloop_uses))
2158 return false;
2161 if (found)
2162 break;
2164 /* We reached a statement with no loop uses. */
2165 if (nloop_uses == 0)
2166 return false;
2168 /* This is a loop exit phi, and we haven't reached the reduction phi. */
2169 if (gimple_code (loop_use_stmt) == GIMPLE_PHI)
2170 return false;
2172 if (!is_gimple_assign (loop_use_stmt)
2173 || code != gimple_assign_rhs_code (loop_use_stmt)
2174 || !flow_bb_inside_loop_p (loop, gimple_bb (loop_use_stmt)))
2175 return false;
2177 /* Insert USE_STMT into reduction chain. */
2178 use_stmt_info = vinfo_for_stmt (loop_use_stmt);
2179 if (current_stmt)
2181 current_stmt_info = vinfo_for_stmt (current_stmt);
2182 GROUP_NEXT_ELEMENT (current_stmt_info) = loop_use_stmt;
2183 GROUP_FIRST_ELEMENT (use_stmt_info)
2184 = GROUP_FIRST_ELEMENT (current_stmt_info);
2186 else
2187 GROUP_FIRST_ELEMENT (use_stmt_info) = loop_use_stmt;
2189 lhs = gimple_assign_lhs (loop_use_stmt);
2190 current_stmt = loop_use_stmt;
2191 size++;
2194 if (!found || loop_use_stmt != phi || size < 2)
2195 return false;
2197 /* Swap the operands, if needed, to make the reduction operand be the second
2198 operand. */
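/* For example (illustrative): a chain statement written as
   a3 = a2 + b[i], where a2 is the value flowing around the cycle, is
   rewritten as  a3 = b[i] + a2  so that the reduction operand is the
   second (last) one.  */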
2199 lhs = PHI_RESULT (phi);
2200 next_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (current_stmt));
2201 while (next_stmt)
2203 if (gimple_assign_rhs2 (next_stmt) == lhs)
2205 tree op = gimple_assign_rhs1 (next_stmt);
2206 gimple *def_stmt = NULL;
2208 if (TREE_CODE (op) == SSA_NAME)
2209 def_stmt = SSA_NAME_DEF_STMT (op);
2211 /* Check that the other def is either defined in the loop
2212 ("vect_internal_def"), or it's an induction (defined by a
2213 loop-header phi-node). */
2214 if (def_stmt
2215 && gimple_bb (def_stmt)
2216 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2217 && (is_gimple_assign (def_stmt)
2218 || is_gimple_call (def_stmt)
2219 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2220 == vect_induction_def
2221 || (gimple_code (def_stmt) == GIMPLE_PHI
2222 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2223 == vect_internal_def
2224 && !is_loop_header_bb_p (gimple_bb (def_stmt)))))
2226 lhs = gimple_assign_lhs (next_stmt);
2227 next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
2228 continue;
2231 return false;
2233 else
2235 tree op = gimple_assign_rhs2 (next_stmt);
2236 gimple *def_stmt = NULL;
2238 if (TREE_CODE (op) == SSA_NAME)
2239 def_stmt = SSA_NAME_DEF_STMT (op);
2241 /* Check that the other def is either defined in the loop
2242 ("vect_internal_def"), or it's an induction (defined by a
2243 loop-header phi-node). */
2244 if (def_stmt
2245 && gimple_bb (def_stmt)
2246 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2247 && (is_gimple_assign (def_stmt)
2248 || is_gimple_call (def_stmt)
2249 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2250 == vect_induction_def
2251 || (gimple_code (def_stmt) == GIMPLE_PHI
2252 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2253 == vect_internal_def
2254 && !is_loop_header_bb_p (gimple_bb (def_stmt)))))
2256 if (dump_enabled_p ())
2258 dump_printf_loc (MSG_NOTE, vect_location, "swapping oprnds: ");
2259 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, next_stmt, 0);
2260 dump_printf (MSG_NOTE, "\n");
2263 swap_ssa_operands (next_stmt,
2264 gimple_assign_rhs1_ptr (next_stmt),
2265 gimple_assign_rhs2_ptr (next_stmt));
2266 update_stmt (next_stmt);
2268 if (CONSTANT_CLASS_P (gimple_assign_rhs1 (next_stmt)))
2269 LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true;
2271 else
2272 return false;
2275 lhs = gimple_assign_lhs (next_stmt);
2276 next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
2279 /* Save the chain for further analysis in SLP detection. */
2280 first = GROUP_FIRST_ELEMENT (vinfo_for_stmt (current_stmt));
2281 LOOP_VINFO_REDUCTION_CHAINS (loop_info).safe_push (first);
2282 GROUP_SIZE (vinfo_for_stmt (first)) = size;
2284 return true;
2288 /* Function vect_is_simple_reduction_1
2290 (1) Detect a cross-iteration def-use cycle that represents a simple
2291 reduction computation. We look for the following pattern:
2293 loop_header:
2294 a1 = phi < a0, a2 >
2295 a3 = ...
2296 a2 = operation (a3, a1)
2300 a3 = ...
2301 loop_header:
2302 a1 = phi < a0, a2 >
2303 a2 = operation (a3, a1)
2305 such that:
2306 1. operation is commutative and associative and it is safe to
2307 change the order of the computation (if CHECK_REDUCTION is true)
2308 2. no uses for a2 in the loop (a2 is used out of the loop)
2309 3. no uses of a1 in the loop besides the reduction operation
2310 4. no uses of a1 outside the loop.
2312 Conditions 1,4 are tested here.
2313 Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.
2315 (2) Detect a cross-iteration def-use cycle in nested loops, i.e.,
2316 nested cycles, if CHECK_REDUCTION is false.
2318 (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double
2319 reductions:
2321 a1 = phi < a0, a2 >
2322 inner loop (def of a3)
2323 a2 = phi < a3 >
2325 If MODIFY is true it tries also to rework the code in-place to enable
2326 detection of more reduction patterns. For the time being we rewrite
2327 "res -= RHS" into "rhs += -RHS" when it seems worthwhile.
2330 static gimple *
2331 vect_is_simple_reduction_1 (loop_vec_info loop_info, gimple *phi,
2332 bool check_reduction, bool *double_reduc,
2333 bool modify, bool need_wrapping_integral_overflow)
2335 struct loop *loop = (gimple_bb (phi))->loop_father;
2336 struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
2337 edge latch_e = loop_latch_edge (loop);
2338 tree loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
2339 gimple *def_stmt, *def1 = NULL, *def2 = NULL;
2340 enum tree_code orig_code, code;
2341 tree op1, op2, op3 = NULL_TREE, op4 = NULL_TREE;
2342 tree type;
2343 int nloop_uses;
2344 tree name;
2345 imm_use_iterator imm_iter;
2346 use_operand_p use_p;
2347 bool phi_def;
2349 *double_reduc = false;
2351 /* If CHECK_REDUCTION is true, we assume inner-most loop vectorization,
2352 otherwise, we assume outer loop vectorization. */
2353 gcc_assert ((check_reduction && loop == vect_loop)
2354 || (!check_reduction && flow_loop_nested_p (vect_loop, loop)));
2356 name = PHI_RESULT (phi);
2357 /* ??? If there are no uses of the PHI result the inner loop reduction
2358 won't be detected as possibly double-reduction by vectorizable_reduction
2359 because that tries to walk the PHI arg from the preheader edge which
2360 can be constant. See PR60382. */
2361 if (has_zero_uses (name))
2362 return NULL;
2363 nloop_uses = 0;
2364 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name)
2366 gimple *use_stmt = USE_STMT (use_p);
2367 if (is_gimple_debug (use_stmt))
2368 continue;
2370 if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2372 if (dump_enabled_p ())
2373 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2374 "intermediate value used outside loop.\n");
2376 return NULL;
2379 nloop_uses++;
2380 if (nloop_uses > 1)
2382 if (dump_enabled_p ())
2383 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2384 "reduction used in loop.\n");
2385 return NULL;
2389 if (TREE_CODE (loop_arg) != SSA_NAME)
2391 if (dump_enabled_p ())
2393 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2394 "reduction: not ssa_name: ");
2395 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, loop_arg);
2396 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
2398 return NULL;
2401 def_stmt = SSA_NAME_DEF_STMT (loop_arg);
2402 if (!def_stmt)
2404 if (dump_enabled_p ())
2405 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2406 "reduction: no def_stmt.\n");
2407 return NULL;
2410 if (!is_gimple_assign (def_stmt) && gimple_code (def_stmt) != GIMPLE_PHI)
2412 if (dump_enabled_p ())
2414 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, def_stmt, 0);
2415 dump_printf (MSG_NOTE, "\n");
2417 return NULL;
2420 if (is_gimple_assign (def_stmt))
2422 name = gimple_assign_lhs (def_stmt);
2423 phi_def = false;
2425 else
2427 name = PHI_RESULT (def_stmt);
2428 phi_def = true;
2431 nloop_uses = 0;
2432 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name)
2434 gimple *use_stmt = USE_STMT (use_p);
2435 if (is_gimple_debug (use_stmt))
2436 continue;
2437 if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2438 nloop_uses++;
2439 if (nloop_uses > 1)
2441 if (dump_enabled_p ())
2442 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2443 "reduction used in loop.\n");
2444 return NULL;
2448 /* If DEF_STMT is a phi node itself, we expect it to have a single argument
2449 defined in the inner loop. */
2450 if (phi_def)
2452 op1 = PHI_ARG_DEF (def_stmt, 0);
2454 if (gimple_phi_num_args (def_stmt) != 1
2455 || TREE_CODE (op1) != SSA_NAME)
2457 if (dump_enabled_p ())
2458 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2459 "unsupported phi node definition.\n");
2461 return NULL;
2464 def1 = SSA_NAME_DEF_STMT (op1);
2465 if (gimple_bb (def1)
2466 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2467 && loop->inner
2468 && flow_bb_inside_loop_p (loop->inner, gimple_bb (def1))
2469 && is_gimple_assign (def1))
2471 if (dump_enabled_p ())
2472 report_vect_op (MSG_NOTE, def_stmt,
2473 "detected double reduction: ");
2475 *double_reduc = true;
2476 return def_stmt;
2479 return NULL;
2482 code = orig_code = gimple_assign_rhs_code (def_stmt);
2484 /* We can handle "res -= x[i]", which is non-associative by
2485 simply rewriting this into "res += -x[i]". Avoid changing
2486 gimple instruction for the first simple tests and only do this
2487 if we're allowed to change code at all. */
2488 if (code == MINUS_EXPR
2489 && modify
2490 && (op1 = gimple_assign_rhs1 (def_stmt))
2491 && TREE_CODE (op1) == SSA_NAME
2492 && SSA_NAME_DEF_STMT (op1) == phi)
2493 code = PLUS_EXPR;
2495 if (check_reduction
2496 && (!commutative_tree_code (code) || !associative_tree_code (code)))
2498 if (dump_enabled_p ())
2499 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2500 "reduction: not commutative/associative: ");
2501 return NULL;
2504 if (get_gimple_rhs_class (code) != GIMPLE_BINARY_RHS)
2506 if (code != COND_EXPR)
2508 if (dump_enabled_p ())
2509 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2510 "reduction: not binary operation: ");
2512 return NULL;
2515 op3 = gimple_assign_rhs1 (def_stmt);
2516 if (COMPARISON_CLASS_P (op3))
2518 op4 = TREE_OPERAND (op3, 1);
2519 op3 = TREE_OPERAND (op3, 0);
2522 op1 = gimple_assign_rhs2 (def_stmt);
2523 op2 = gimple_assign_rhs3 (def_stmt);
2525 if (TREE_CODE (op1) != SSA_NAME && TREE_CODE (op2) != SSA_NAME)
2527 if (dump_enabled_p ())
2528 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2529 "reduction: uses not ssa_names: ");
2531 return NULL;
2534 else
2536 op1 = gimple_assign_rhs1 (def_stmt);
2537 op2 = gimple_assign_rhs2 (def_stmt);
2539 if (TREE_CODE (op1) != SSA_NAME && TREE_CODE (op2) != SSA_NAME)
2541 if (dump_enabled_p ())
2542 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2543 "reduction: uses not ssa_names: ");
2545 return NULL;
2549 type = TREE_TYPE (gimple_assign_lhs (def_stmt));
2550 if ((TREE_CODE (op1) == SSA_NAME
2551 && !types_compatible_p (type,TREE_TYPE (op1)))
2552 || (TREE_CODE (op2) == SSA_NAME
2553 && !types_compatible_p (type, TREE_TYPE (op2)))
2554 || (op3 && TREE_CODE (op3) == SSA_NAME
2555 && !types_compatible_p (type, TREE_TYPE (op3)))
2556 || (op4 && TREE_CODE (op4) == SSA_NAME
2557 && !types_compatible_p (type, TREE_TYPE (op4))))
2559 if (dump_enabled_p ())
2561 dump_printf_loc (MSG_NOTE, vect_location,
2562 "reduction: multiple types: operation type: ");
2563 dump_generic_expr (MSG_NOTE, TDF_SLIM, type);
2564 dump_printf (MSG_NOTE, ", operands types: ");
2565 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2566 TREE_TYPE (op1));
2567 dump_printf (MSG_NOTE, ",");
2568 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2569 TREE_TYPE (op2));
2570 if (op3)
2572 dump_printf (MSG_NOTE, ",");
2573 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2574 TREE_TYPE (op3));
2577 if (op4)
2579 dump_printf (MSG_NOTE, ",");
2580 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2581 TREE_TYPE (op4));
2583 dump_printf (MSG_NOTE, "\n");
2586 return NULL;
2589 /* Check that it's ok to change the order of the computation.
2590 Generally, when vectorizing a reduction we change the order of the
2591 computation. This may change the behavior of the program in some
2592 cases, so we need to check that this is ok. One exception is when
2593 vectorizing an outer-loop: the inner-loop is executed sequentially,
2594 and therefore vectorizing reductions in the inner-loop during
2595 outer-loop vectorization is safe. */
2597 /* CHECKME: check for !flag_finite_math_only too? */
2598 if (SCALAR_FLOAT_TYPE_P (type) && !flag_associative_math
2599 && check_reduction)
2601 /* Changing the order of operations changes the semantics. */
2602 if (dump_enabled_p ())
2603 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2604 "reduction: unsafe fp math optimization: ");
2605 return NULL;
2607 else if (INTEGRAL_TYPE_P (type) && check_reduction)
2609 if (!operation_no_trapping_overflow (type, code))
2611 /* Changing the order of operations changes the semantics. */
2612 if (dump_enabled_p ())
2613 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2614 "reduction: unsafe int math optimization"
2615 " (overflow traps): ");
2616 return NULL;
2618 if (need_wrapping_integral_overflow
2619 && !TYPE_OVERFLOW_WRAPS (type)
2620 && operation_can_overflow (code))
2622 /* Changing the order of operations changes the semantics. */
2623 if (dump_enabled_p ())
2624 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2625 "reduction: unsafe int math optimization"
2626 " (overflow doesn't wrap): ");
2627 return NULL;
2630 else if (SAT_FIXED_POINT_TYPE_P (type) && check_reduction)
2632 /* Changing the order of operations changes the semantics. */
2633 if (dump_enabled_p ())
2634 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2635 "reduction: unsafe fixed-point math optimization: ");
2636 return NULL;
2639 /* If we detected "res -= x[i]" earlier, rewrite it into
2640 "res += -x[i]" now. If this turns out to be useless reassoc
2641 will clean it up again. */
2642 if (orig_code == MINUS_EXPR)
2644 tree rhs = gimple_assign_rhs2 (def_stmt);
2645 tree negrhs = make_ssa_name (TREE_TYPE (rhs));
2646 gimple *negate_stmt = gimple_build_assign (negrhs, NEGATE_EXPR, rhs);
2647 gimple_stmt_iterator gsi = gsi_for_stmt (def_stmt);
2648 set_vinfo_for_stmt (negate_stmt, new_stmt_vec_info (negate_stmt,
2649 loop_info, NULL));
2650 gsi_insert_before (&gsi, negate_stmt, GSI_NEW_STMT);
2651 gimple_assign_set_rhs2 (def_stmt, negrhs);
2652 gimple_assign_set_rhs_code (def_stmt, PLUS_EXPR);
2653 update_stmt (def_stmt);
2656 /* Reduction is safe. We're dealing with one of the following:
2657 1) integer arithmetic and no trapv
2658 2) floating point arithmetic, and special flags permit this optimization
2659 3) nested cycle (i.e., outer loop vectorization). */
2660 if (TREE_CODE (op1) == SSA_NAME)
2661 def1 = SSA_NAME_DEF_STMT (op1);
2663 if (TREE_CODE (op2) == SSA_NAME)
2664 def2 = SSA_NAME_DEF_STMT (op2);
2666 if (code != COND_EXPR
2667 && ((!def1 || gimple_nop_p (def1)) && (!def2 || gimple_nop_p (def2))))
2669 if (dump_enabled_p ())
2670 report_vect_op (MSG_NOTE, def_stmt, "reduction: no defs for operands: ");
2671 return NULL;
2674 /* Check that one def is the reduction def, defined by PHI,
2675 the other def is either defined in the loop ("vect_internal_def"),
2676 or it's an induction (defined by a loop-header phi-node). */
2678 if (def2 && def2 == phi
2679 && (code == COND_EXPR
2680 || !def1 || gimple_nop_p (def1)
2681 || !flow_bb_inside_loop_p (loop, gimple_bb (def1))
2682 || (def1 && flow_bb_inside_loop_p (loop, gimple_bb (def1))
2683 && (is_gimple_assign (def1)
2684 || is_gimple_call (def1)
2685 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1))
2686 == vect_induction_def
2687 || (gimple_code (def1) == GIMPLE_PHI
2688 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1))
2689 == vect_internal_def
2690 && !is_loop_header_bb_p (gimple_bb (def1)))))))
2692 if (dump_enabled_p ())
2693 report_vect_op (MSG_NOTE, def_stmt, "detected reduction: ");
2694 return def_stmt;
2697 if (def1 && def1 == phi
2698 && (code == COND_EXPR
2699 || !def2 || gimple_nop_p (def2)
2700 || !flow_bb_inside_loop_p (loop, gimple_bb (def2))
2701 || (def2 && flow_bb_inside_loop_p (loop, gimple_bb (def2))
2702 && (is_gimple_assign (def2)
2703 || is_gimple_call (def2)
2704 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2))
2705 == vect_induction_def
2706 || (gimple_code (def2) == GIMPLE_PHI
2707 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2))
2708 == vect_internal_def
2709 && !is_loop_header_bb_p (gimple_bb (def2)))))))
2711 if (check_reduction)
2713 /* Swap operands (just for simplicity - so that the rest of the code
2714 can assume that the reduction variable is always the last (second)
2715 argument). */
2716 if (dump_enabled_p ())
2717 report_vect_op (MSG_NOTE, def_stmt,
2718 "detected reduction: need to swap operands: ");
2720 swap_ssa_operands (def_stmt, gimple_assign_rhs1_ptr (def_stmt),
2721 gimple_assign_rhs2_ptr (def_stmt));
2723 if (CONSTANT_CLASS_P (gimple_assign_rhs1 (def_stmt)))
2724 LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true;
2726 else
2728 if (dump_enabled_p ())
2729 report_vect_op (MSG_NOTE, def_stmt, "detected reduction: ");
2732 return def_stmt;
2735 /* Try to find SLP reduction chain. */
2736 if (check_reduction && vect_is_slp_reduction (loop_info, phi, def_stmt))
2738 if (dump_enabled_p ())
2739 report_vect_op (MSG_NOTE, def_stmt,
2740 "reduction: detected reduction chain: ");
2742 return def_stmt;
2745 if (dump_enabled_p ())
2746 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2747 "reduction: unknown pattern: ");
2749 return NULL;
2752 /* Wrapper around vect_is_simple_reduction_1, that won't modify code
2753 in-place. Arguments as there. */
2755 static gimple *
2756 vect_is_simple_reduction (loop_vec_info loop_info, gimple *phi,
2757 bool check_reduction, bool *double_reduc,
2758 bool need_wrapping_integral_overflow)
2760 return vect_is_simple_reduction_1 (loop_info, phi, check_reduction,
2761 double_reduc, false,
2762 need_wrapping_integral_overflow);
2765 /* Wrapper around vect_is_simple_reduction_1, which will modify code
2766 in-place if it enables detection of more reductions. Arguments
2767 as there. */
2769 gimple *
2770 vect_force_simple_reduction (loop_vec_info loop_info, gimple *phi,
2771 bool check_reduction, bool *double_reduc,
2772 bool need_wrapping_integral_overflow)
2774 return vect_is_simple_reduction_1 (loop_info, phi, check_reduction,
2775 double_reduc, true,
2776 need_wrapping_integral_overflow);
2779 /* Calculate cost of peeling the loop PEEL_ITERS_PROLOGUE times. */
2780 int
2781 vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue,
2782 int *peel_iters_epilogue,
2783 stmt_vector_for_cost *scalar_cost_vec,
2784 stmt_vector_for_cost *prologue_cost_vec,
2785 stmt_vector_for_cost *epilogue_cost_vec)
2787 int retval = 0;
2788 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2790 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
2792 *peel_iters_epilogue = vf/2;
2793 if (dump_enabled_p ())
2794 dump_printf_loc (MSG_NOTE, vect_location,
2795 "cost model: epilogue peel iters set to vf/2 "
2796 "because loop iterations are unknown .\n");
2798 /* If peeled iterations are known but the number of scalar loop
2799 iterations is unknown, count a taken branch per peeled loop. */
2800 retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
2801 NULL, 0, vect_prologue);
2802 retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
2803 NULL, 0, vect_epilogue);
2805 else
2807 int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
2808 peel_iters_prologue = niters < peel_iters_prologue ?
2809 niters : peel_iters_prologue;
2810 *peel_iters_epilogue = (niters - peel_iters_prologue) % vf;
2811 /* If we need to peel for gaps, but no peeling is required, we have to
2812 peel VF iterations. */
2813 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) && !*peel_iters_epilogue)
2814 *peel_iters_epilogue = vf;
2817 stmt_info_for_cost *si;
2818 int j;
2819 if (peel_iters_prologue)
2820 FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
2821 retval += record_stmt_cost (prologue_cost_vec,
2822 si->count * peel_iters_prologue,
2823 si->kind, NULL, si->misalign,
2824 vect_prologue);
2825 if (*peel_iters_epilogue)
2826 FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
2827 retval += record_stmt_cost (epilogue_cost_vec,
2828 si->count * *peel_iters_epilogue,
2829 si->kind, NULL, si->misalign,
2830 vect_epilogue);
2832 return retval;
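/* Worked example with illustrative numbers: with VF = 8, a known
   iteration count of 100 and PEEL_ITERS_PROLOGUE = 3, the epilogue gets
   (100 - 3) % 8 = 1 iteration, and both counts scale the per-statement
   entries of SCALAR_COST_VEC.  If the iteration count is unknown, the
   epilogue is assumed to run vf/2 = 4 iterations and a taken branch is
   charged for each of the two peel loops.  */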
2835 /* Function vect_estimate_min_profitable_iters
2837 Return the number of iterations required for the vector version of the
2838 loop to be profitable relative to the cost of the scalar version of the
2839 loop. */
2841 static void
2842 vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
2843 int *ret_min_profitable_niters,
2844 int *ret_min_profitable_estimate)
2846 int min_profitable_iters;
2847 int min_profitable_estimate;
2848 int peel_iters_prologue;
2849 int peel_iters_epilogue;
2850 unsigned vec_inside_cost = 0;
2851 int vec_outside_cost = 0;
2852 unsigned vec_prologue_cost = 0;
2853 unsigned vec_epilogue_cost = 0;
2854 int scalar_single_iter_cost = 0;
2855 int scalar_outside_cost = 0;
2856 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2857 int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
2858 void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
2860 /* Cost model disabled. */
2861 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
2863 dump_printf_loc (MSG_NOTE, vect_location, "cost model disabled.\n");
2864 *ret_min_profitable_niters = 0;
2865 *ret_min_profitable_estimate = 0;
2866 return;
2869 /* Requires loop versioning tests to handle misalignment. */
2870 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
2872 /* FIXME: Make cost depend on complexity of individual check. */
2873 unsigned len = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ();
2874 (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
2875 vect_prologue);
2876 dump_printf (MSG_NOTE,
2877 "cost model: Adding cost of checks for loop "
2878 "versioning to treat misalignment.\n");
2881 /* Requires loop versioning with alias checks. */
2882 if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
2884 /* FIXME: Make cost depend on complexity of individual check. */
2885 unsigned len = LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).length ();
2886 (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
2887 vect_prologue);
2888 dump_printf (MSG_NOTE,
2889 "cost model: Adding cost of checks for loop "
2890 "versioning aliasing.\n");
2893 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
2894 || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
2895 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken, NULL, 0,
2896 vect_prologue);
2898 /* Count statements in scalar loop. Using this as scalar cost for a single
2899 iteration for now.
2901 TODO: Add outer loop support.
2903 TODO: Consider assigning different costs to different scalar
2904 statements. */
2906 scalar_single_iter_cost
2907 = LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo);
2909 /* Add additional cost for the peeled instructions in prologue and epilogue
2910 loop.
2912 FORNOW: If we don't know the value of peel_iters for prologue or epilogue
2913 at compile-time - we assume it's vf/2 (the worst would be vf-1).
2915 TODO: Build an expression that represents peel_iters for prologue and
2916 epilogue to be used in a run-time test. */
2918 if (npeel < 0)
2920 peel_iters_prologue = vf/2;
2921 dump_printf (MSG_NOTE, "cost model: "
2922 "prologue peel iters set to vf/2.\n");
2924 /* If peeling for alignment is unknown, the loop bound of the main loop
2925 becomes unknown. */
2926 peel_iters_epilogue = vf/2;
2927 dump_printf (MSG_NOTE, "cost model: "
2928 "epilogue peel iters set to vf/2 because "
2929 "peeling for alignment is unknown.\n");
2931 /* If peeled iterations are unknown, count a taken branch and a not taken
2932 branch per peeled loop. Even if scalar loop iterations are known,
2933 vector iterations are not known since peeled prologue iterations are
2934 not known. Hence guards remain the same. */
2935 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
2936 NULL, 0, vect_prologue);
2937 (void) add_stmt_cost (target_cost_data, 1, cond_branch_not_taken,
2938 NULL, 0, vect_prologue);
2939 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
2940 NULL, 0, vect_epilogue);
2941 (void) add_stmt_cost (target_cost_data, 1, cond_branch_not_taken,
2942 NULL, 0, vect_epilogue);
2943 stmt_info_for_cost *si;
2944 int j;
2945 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
2947 struct _stmt_vec_info *stmt_info
2948 = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
2949 (void) add_stmt_cost (target_cost_data,
2950 si->count * peel_iters_prologue,
2951 si->kind, stmt_info, si->misalign,
2952 vect_prologue);
2953 (void) add_stmt_cost (target_cost_data,
2954 si->count * peel_iters_epilogue,
2955 si->kind, stmt_info, si->misalign,
2956 vect_epilogue);
2959 else
2961 stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
2962 stmt_info_for_cost *si;
2963 int j;
2964 void *data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
2966 prologue_cost_vec.create (2);
2967 epilogue_cost_vec.create (2);
2968 peel_iters_prologue = npeel;
2970 (void) vect_get_known_peeling_cost (loop_vinfo, peel_iters_prologue,
2971 &peel_iters_epilogue,
2972 &LOOP_VINFO_SCALAR_ITERATION_COST
2973 (loop_vinfo),
2974 &prologue_cost_vec,
2975 &epilogue_cost_vec);
2977 FOR_EACH_VEC_ELT (prologue_cost_vec, j, si)
2979 struct _stmt_vec_info *stmt_info
2980 = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
2981 (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
2982 si->misalign, vect_prologue);
2985 FOR_EACH_VEC_ELT (epilogue_cost_vec, j, si)
2987 struct _stmt_vec_info *stmt_info
2988 = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
2989 (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
2990 si->misalign, vect_epilogue);
2993 prologue_cost_vec.release ();
2994 epilogue_cost_vec.release ();
2997 /* FORNOW: The scalar outside cost is incremented in one of the
2998 following ways:
3000 1. The vectorizer checks for alignment and aliasing and generates
3001 a condition that allows dynamic vectorization. A cost model
3002 check is ANDED with the versioning condition. Hence scalar code
3003 path now has the added cost of the versioning check.
3005 if (cost > th & versioning_check)
3006 jmp to vector code
3008 Hence run-time scalar is incremented by not-taken branch cost.
3010 2. The vectorizer then checks if a prologue is required. If the
3011 cost model check was not done before during versioning, it has to
3012 be done before the prologue check.
3014 if (cost <= th)
3015 prologue = scalar_iters
3016 if (prologue == 0)
3017 jmp to vector code
3018 else
3019 execute prologue
3020 if (prologue == num_iters)
3021 go to exit
3023 Hence the run-time scalar cost is incremented by a taken branch,
3024 plus a not-taken branch, plus a taken branch cost.
3026 3. The vectorizer then checks if an epilogue is required. If the
3027 cost model check was not done before during prologue check, it
3028 has to be done with the epilogue check.
3030 if (prologue == 0)
3031 jmp to vector code
3032 else
3033 execute prologue
3034 if (prologue == num_iters)
3035 go to exit
3036 vector code:
3037 if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
3038 jmp to epilogue
3040 Hence the run-time scalar cost should be incremented by 2 taken
3041 branches.
3043 TODO: The back end may reorder the BBs differently and reverse
3044 conditions/branch directions. Change the estimates below to
3045 something more reasonable. */
3047 /* If the number of iterations is known and we do not do versioning, we can
3048 decide whether to vectorize at compile time. Hence the scalar version
3049 does not carry cost model guard costs. */
3050 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
3051 || LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
3052 || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
3054 /* Cost model check occurs at versioning. */
3055 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
3056 || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
3057 scalar_outside_cost += vect_get_stmt_cost (cond_branch_not_taken);
3058 else
3060 /* Cost model check occurs at prologue generation. */
3061 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
3062 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken)
3063 + vect_get_stmt_cost (cond_branch_not_taken);
3064 /* Cost model check occurs at epilogue generation. */
3065 else
3066 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken);
3070 /* Complete the target-specific cost calculations. */
3071 finish_cost (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo), &vec_prologue_cost,
3072 &vec_inside_cost, &vec_epilogue_cost);
3074 vec_outside_cost = (int)(vec_prologue_cost + vec_epilogue_cost);
3076 if (dump_enabled_p ())
3078 dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
3079 dump_printf (MSG_NOTE, " Vector inside of loop cost: %d\n",
3080 vec_inside_cost);
3081 dump_printf (MSG_NOTE, " Vector prologue cost: %d\n",
3082 vec_prologue_cost);
3083 dump_printf (MSG_NOTE, " Vector epilogue cost: %d\n",
3084 vec_epilogue_cost);
3085 dump_printf (MSG_NOTE, " Scalar iteration cost: %d\n",
3086 scalar_single_iter_cost);
3087 dump_printf (MSG_NOTE, " Scalar outside cost: %d\n",
3088 scalar_outside_cost);
3089 dump_printf (MSG_NOTE, " Vector outside cost: %d\n",
3090 vec_outside_cost);
3091 dump_printf (MSG_NOTE, " prologue iterations: %d\n",
3092 peel_iters_prologue);
3093 dump_printf (MSG_NOTE, " epilogue iterations: %d\n",
3094 peel_iters_epilogue);
3097 /* Calculate number of iterations required to make the vector version
3098 profitable, relative to the loop bodies only. The following condition
3099 must hold true:
3100 SIC * niters + SOC > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC
3101 where
3102 SIC = scalar iteration cost, VIC = vector iteration cost,
3103 VOC = vector outside cost, VF = vectorization factor,
3104 PL_ITERS = prologue iterations, EP_ITERS= epilogue iterations
3105 SOC = scalar outside cost for run time cost model check. */
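/* Worked example with made-up costs: SIC = 4, VIC = 8, VF = 8, VOC = 20
   and SOC = PL_ITERS = EP_ITERS = 0.  The computation below yields
   (20 * 8) / (4 * 8 - 8) = 160 / 24 = 6, which is then bumped to 7
   because 4 * 8 * 6 <= 8 * 6 + 20 * 8; i.e. the vector loop starts to
   win after about 7 iterations, before the clamping to VF further
   below.  */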
3107 if ((scalar_single_iter_cost * vf) > (int) vec_inside_cost)
3109 if (vec_outside_cost <= 0)
3110 min_profitable_iters = 1;
3111 else
3113 min_profitable_iters = ((vec_outside_cost - scalar_outside_cost) * vf
3114 - vec_inside_cost * peel_iters_prologue
3115 - vec_inside_cost * peel_iters_epilogue)
3116 / ((scalar_single_iter_cost * vf)
3117 - vec_inside_cost);
3119 if ((scalar_single_iter_cost * vf * min_profitable_iters)
3120 <= (((int) vec_inside_cost * min_profitable_iters)
3121 + (((int) vec_outside_cost - scalar_outside_cost) * vf)))
3122 min_profitable_iters++;
3125 /* vector version will never be profitable. */
3126 else
3128 if (LOOP_VINFO_LOOP (loop_vinfo)->force_vectorize)
3129 warning_at (vect_location, OPT_Wopenmp_simd, "vectorization "
3130 "did not happen for a simd loop");
3132 if (dump_enabled_p ())
3133 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3134 "cost model: the vector iteration cost = %d "
3135 "divided by the scalar iteration cost = %d "
3136 "is greater or equal to the vectorization factor = %d"
3137 ".\n",
3138 vec_inside_cost, scalar_single_iter_cost, vf);
3139 *ret_min_profitable_niters = -1;
3140 *ret_min_profitable_estimate = -1;
3141 return;
3144 dump_printf (MSG_NOTE,
3145 " Calculated minimum iters for profitability: %d\n",
3146 min_profitable_iters);
3148 min_profitable_iters =
3149 min_profitable_iters < vf ? vf : min_profitable_iters;
3151 /* Because the condition we create is:
3152 if (niters <= min_profitable_iters)
3153 then skip the vectorized loop. */
3154 min_profitable_iters--;
3156 if (dump_enabled_p ())
3157 dump_printf_loc (MSG_NOTE, vect_location,
3158 " Runtime profitability threshold = %d\n",
3159 min_profitable_iters);
3161 *ret_min_profitable_niters = min_profitable_iters;
3163 /* Calculate number of iterations required to make the vector version
3164 profitable, relative to the loop bodies only.
3166 Non-vectorized variant is SIC * niters and it must win over vector
3167 variant on the expected loop trip count. The following condition must hold true:
3168 SIC * niters > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC + SOC */
3170 if (vec_outside_cost <= 0)
3171 min_profitable_estimate = 1;
3172 else
3174 min_profitable_estimate = ((vec_outside_cost + scalar_outside_cost) * vf
3175 - vec_inside_cost * peel_iters_prologue
3176 - vec_inside_cost * peel_iters_epilogue)
3177 / ((scalar_single_iter_cost * vf)
3178 - vec_inside_cost);
3180 min_profitable_estimate--;
3181 min_profitable_estimate = MAX (min_profitable_estimate, min_profitable_iters);
3182 if (dump_enabled_p ())
3183 dump_printf_loc (MSG_NOTE, vect_location,
3184 " Static estimate profitability threshold = %d\n",
3185 min_profitable_estimate);
3187 *ret_min_profitable_estimate = min_profitable_estimate;
3190 /* Writes into SEL a mask for a vec_perm, equivalent to a vec_shr by OFFSET
3191 vector elements (not bits) for a vector of mode MODE. */
3192 static void
3193 calc_vec_perm_mask_for_shift (enum machine_mode mode, unsigned int offset,
3194 unsigned char *sel)
3196 unsigned int i, nelt = GET_MODE_NUNITS (mode);
3198 for (i = 0; i < nelt; i++)
3199 sel[i] = (i + offset) & (2*nelt - 1);
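/* For instance (illustrative): with a 4-element vector mode and
   OFFSET = 2 this yields sel = {2, 3, 4, 5}, a permutation taking the
   upper two elements of the first input and the lower two of the
   second, i.e. a whole-vector shift by two elements.  */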
3202 /* Checks whether the target supports whole-vector shifts for vectors of mode
3203 MODE. This is the case if _either_ the platform handles vec_shr_optab, _or_
3204 it supports vec_perm_const with masks for all necessary shift amounts. */
3205 static bool
3206 have_whole_vector_shift (enum machine_mode mode)
3208 if (optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
3209 return true;
3211 if (direct_optab_handler (vec_perm_const_optab, mode) == CODE_FOR_nothing)
3212 return false;
3214 unsigned int i, nelt = GET_MODE_NUNITS (mode);
3215 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
3217 for (i = nelt/2; i >= 1; i/=2)
3219 calc_vec_perm_mask_for_shift (mode, i, sel);
3220 if (!can_vec_perm_p (mode, false, sel))
3221 return false;
3223 return true;
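/* E.g. for an 8-element vector mode the loop above asks for permutation
   masks implementing shifts by 4, 2 and 1 elements -- exactly the shift
   amounts a log2-style reduction epilogue needs.  */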
3226 /* Return the reduction operand (with index REDUC_INDEX) of STMT. */
3228 static tree
3229 get_reduction_op (gimple *stmt, int reduc_index)
3231 switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
3233 case GIMPLE_SINGLE_RHS:
3234 gcc_assert (TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt))
3235 == ternary_op);
3236 return TREE_OPERAND (gimple_assign_rhs1 (stmt), reduc_index);
3237 case GIMPLE_UNARY_RHS:
3238 return gimple_assign_rhs1 (stmt);
3239 case GIMPLE_BINARY_RHS:
3240 return (reduc_index
3241 ? gimple_assign_rhs2 (stmt) : gimple_assign_rhs1 (stmt));
3242 case GIMPLE_TERNARY_RHS:
3243 return gimple_op (stmt, reduc_index + 1);
3244 default:
3245 gcc_unreachable ();
3249 /* TODO: Close dependency between vect_model_*_cost and vectorizable_*
3250 functions. Design better to avoid maintenance issues. */
3252 /* Function vect_model_reduction_cost.
3254 Models cost for a reduction operation, including the vector ops
3255 generated within the strip-mine loop, the initial definition before
3256 the loop, and the epilogue code that must be generated. */
3258 static bool
3259 vect_model_reduction_cost (stmt_vec_info stmt_info, enum tree_code reduc_code,
3260 int ncopies, int reduc_index)
3262 int prologue_cost = 0, epilogue_cost = 0;
3263 enum tree_code code;
3264 optab optab;
3265 tree vectype;
3266 gimple *stmt, *orig_stmt;
3267 tree reduction_op;
3268 machine_mode mode;
3269 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3270 struct loop *loop = NULL;
3271 void *target_cost_data;
3273 if (loop_vinfo)
3275 loop = LOOP_VINFO_LOOP (loop_vinfo);
3276 target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3278 else
3279 target_cost_data = BB_VINFO_TARGET_COST_DATA (STMT_VINFO_BB_VINFO (stmt_info));
3281 /* Cost of reduction op inside loop. */
3282 unsigned inside_cost = add_stmt_cost (target_cost_data, ncopies, vector_stmt,
3283 stmt_info, 0, vect_body);
3284 stmt = STMT_VINFO_STMT (stmt_info);
3286 reduction_op = get_reduction_op (stmt, reduc_index);
3288 vectype = get_vectype_for_scalar_type (TREE_TYPE (reduction_op));
3289 if (!vectype)
3291 if (dump_enabled_p ())
3293 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3294 "unsupported data-type ");
3295 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
3296 TREE_TYPE (reduction_op));
3297 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3299 return false;
3302 mode = TYPE_MODE (vectype);
3303 orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
3305 if (!orig_stmt)
3306 orig_stmt = STMT_VINFO_STMT (stmt_info);
3308 code = gimple_assign_rhs_code (orig_stmt);
3310 /* Add in cost for initial definition. */
3311 prologue_cost += add_stmt_cost (target_cost_data, 1, scalar_to_vec,
3312 stmt_info, 0, vect_prologue);
3314 /* Determine cost of epilogue code.
3316 We have a reduction operator that will reduce the vector in one statement.
3317 Also requires scalar extract. */
3319 if (!loop || !nested_in_vect_loop_p (loop, orig_stmt))
3321 if (reduc_code != ERROR_MARK)
3323 epilogue_cost += add_stmt_cost (target_cost_data, 1, vector_stmt,
3324 stmt_info, 0, vect_epilogue);
3325 epilogue_cost += add_stmt_cost (target_cost_data, 1, vec_to_scalar,
3326 stmt_info, 0, vect_epilogue);
3328 else
3330 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
3331 tree bitsize =
3332 TYPE_SIZE (TREE_TYPE (gimple_assign_lhs (orig_stmt)));
3333 int element_bitsize = tree_to_uhwi (bitsize);
3334 int nelements = vec_size_in_bits / element_bitsize;
3336 optab = optab_for_tree_code (code, vectype, optab_default);
3338 /* We have a whole vector shift available. */
3339 if (VECTOR_MODE_P (mode)
3340 && optab_handler (optab, mode) != CODE_FOR_nothing
3341 && have_whole_vector_shift (mode))
3343 /* Final reduction via vector shifts and the reduction operator.
3344 Also requires scalar extract. */
3345 epilogue_cost += add_stmt_cost (target_cost_data,
3346 exact_log2 (nelements) * 2,
3347 vector_stmt, stmt_info, 0,
3348 vect_epilogue);
3349 epilogue_cost += add_stmt_cost (target_cost_data, 1,
3350 vec_to_scalar, stmt_info, 0,
3351 vect_epilogue);
3353 else
3354 /* Use extracts and reduction op for final reduction. For N
3355 elements, we have N extracts and N-1 reduction ops. */
3356 epilogue_cost += add_stmt_cost (target_cost_data,
3357 nelements + nelements - 1,
3358 vector_stmt, stmt_info, 0,
3359 vect_epilogue);
3363 if (dump_enabled_p ())
3364 dump_printf (MSG_NOTE,
3365 "vect_model_reduction_cost: inside_cost = %d, "
3366 "prologue_cost = %d, epilogue_cost = %d .\n", inside_cost,
3367 prologue_cost, epilogue_cost);
3369 return true;
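/* Worked example (illustrative): for a 4-element vector accumulator with
   no direct reduc_code, the shift-based path costs exact_log2 (4) * 2 = 4
   vector statements (two shift/op rounds) plus one vec_to_scalar extract;
   without whole-vector shifts it costs 4 + 4 - 1 = 7 statements instead
   (four extracts and three scalar ops).  */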
3373 /* Function vect_model_induction_cost.
3375 Models cost for induction operations. */
3377 static void
3378 vect_model_induction_cost (stmt_vec_info stmt_info, int ncopies)
3380 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3381 void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3382 unsigned inside_cost, prologue_cost;
3384 /* loop cost for vec_loop. */
3385 inside_cost = add_stmt_cost (target_cost_data, ncopies, vector_stmt,
3386 stmt_info, 0, vect_body);
3388 /* prologue cost for vec_init and vec_step. */
3389 prologue_cost = add_stmt_cost (target_cost_data, 2, scalar_to_vec,
3390 stmt_info, 0, vect_prologue);
3392 if (dump_enabled_p ())
3393 dump_printf_loc (MSG_NOTE, vect_location,
3394 "vect_model_induction_cost: inside_cost = %d, "
3395 "prologue_cost = %d .\n", inside_cost, prologue_cost);
3399 /* Function get_initial_def_for_induction
3401 Input:
3402 STMT - a stmt that performs an induction operation in the loop.
3403 IV_PHI - the initial value of the induction variable
3405 Output:
3406 Return a vector variable, initialized with the first VF values of
3407 the induction variable. E.g., for an iv with IV_PHI='X' and
3408 evolution S, for a vector of 4 units, we want to return:
3409 [X, X + S, X + 2*S, X + 3*S]. */
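/* A concrete (hypothetical) instance: for an iv with X = x0 and step
   S = 3, vectorized with VF = 4, this builds vec_init = {x0, x0+3, x0+6,
   x0+9} in the preheader, a vec_step of {12, 12, 12, 12} (VF * S), and a
   loop phi that adds vec_step on every iteration of the vector loop.  */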
3411 static tree
3412 get_initial_def_for_induction (gimple *iv_phi)
3414 stmt_vec_info stmt_vinfo = vinfo_for_stmt (iv_phi);
3415 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
3416 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3417 tree vectype;
3418 int nunits;
3419 edge pe = loop_preheader_edge (loop);
3420 struct loop *iv_loop;
3421 basic_block new_bb;
3422 tree new_vec, vec_init, vec_step, t;
3423 tree new_var;
3424 tree new_name;
3425 gimple *init_stmt, *new_stmt;
3426 gphi *induction_phi;
3427 tree induc_def, vec_def, vec_dest;
3428 tree init_expr, step_expr;
3429 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3430 int i;
3431 int ncopies;
3432 tree expr;
3433 stmt_vec_info phi_info = vinfo_for_stmt (iv_phi);
3434 bool nested_in_vect_loop = false;
3435 gimple_seq stmts = NULL;
3436 imm_use_iterator imm_iter;
3437 use_operand_p use_p;
3438 gimple *exit_phi;
3439 edge latch_e;
3440 tree loop_arg;
3441 gimple_stmt_iterator si;
3442 basic_block bb = gimple_bb (iv_phi);
3443 tree stepvectype;
3444 tree resvectype;
3446 /* Is phi in an inner-loop, while vectorizing an enclosing outer-loop? */
3447 if (nested_in_vect_loop_p (loop, iv_phi))
3449 nested_in_vect_loop = true;
3450 iv_loop = loop->inner;
3452 else
3453 iv_loop = loop;
3454 gcc_assert (iv_loop == (gimple_bb (iv_phi))->loop_father);
3456 latch_e = loop_latch_edge (iv_loop);
3457 loop_arg = PHI_ARG_DEF_FROM_EDGE (iv_phi, latch_e);
3459 step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (phi_info);
3460 gcc_assert (step_expr != NULL_TREE);
3462 pe = loop_preheader_edge (iv_loop);
3463 init_expr = PHI_ARG_DEF_FROM_EDGE (iv_phi,
3464 loop_preheader_edge (iv_loop));
3466 vectype = get_vectype_for_scalar_type (TREE_TYPE (init_expr));
3467 resvectype = get_vectype_for_scalar_type (TREE_TYPE (PHI_RESULT (iv_phi)));
3468 gcc_assert (vectype);
3469 nunits = TYPE_VECTOR_SUBPARTS (vectype);
3470 ncopies = vf / nunits;
3472 gcc_assert (phi_info);
3473 gcc_assert (ncopies >= 1);
3475 /* Convert the step to the desired type. */
3476 step_expr = force_gimple_operand (fold_convert (TREE_TYPE (vectype),
3477 step_expr),
3478 &stmts, true, NULL_TREE);
3479 if (stmts)
3481 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
3482 gcc_assert (!new_bb);
3485 /* Find the first insertion point in the BB. */
3486 si = gsi_after_labels (bb);
3488 /* Create the vector that holds the initial_value of the induction. */
3489 if (nested_in_vect_loop)
3491 /* iv_loop is nested in the loop to be vectorized. init_expr had already
3492 been created during vectorization of previous stmts. We obtain it
3493 from the STMT_VINFO_VEC_STMT of the defining stmt. */
3494 vec_init = vect_get_vec_def_for_operand (init_expr, iv_phi, NULL);
3495 /* If the initial value is not of proper type, convert it. */
3496 if (!useless_type_conversion_p (vectype, TREE_TYPE (vec_init)))
3498 new_stmt
3499 = gimple_build_assign (vect_get_new_vect_var (vectype,
3500 vect_simple_var,
3501 "vec_iv_"),
3502 VIEW_CONVERT_EXPR,
3503 build1 (VIEW_CONVERT_EXPR, vectype,
3504 vec_init));
3505 vec_init = make_ssa_name (gimple_assign_lhs (new_stmt), new_stmt);
3506 gimple_assign_set_lhs (new_stmt, vec_init);
3507 new_bb = gsi_insert_on_edge_immediate (loop_preheader_edge (iv_loop),
3508 new_stmt);
3509 gcc_assert (!new_bb);
3510 set_vinfo_for_stmt (new_stmt,
3511 new_stmt_vec_info (new_stmt, loop_vinfo, NULL));
3514 else
3516 vec<constructor_elt, va_gc> *v;
3518 /* iv_loop is the loop to be vectorized. Create:
3519 vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr) */
3520 new_var = vect_get_new_vect_var (TREE_TYPE (vectype),
3521 vect_scalar_var, "var_");
3522 new_name = force_gimple_operand (fold_convert (TREE_TYPE (vectype),
3523 init_expr),
3524 &stmts, false, new_var);
3525 if (stmts)
3527 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
3528 gcc_assert (!new_bb);
3531 vec_alloc (v, nunits);
3532 bool constant_p = is_gimple_min_invariant (new_name);
3533 CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, new_name);
3534 for (i = 1; i < nunits; i++)
3536 /* Create: new_name_i = new_name + step_expr */
3537 new_name = fold_build2 (PLUS_EXPR, TREE_TYPE (new_name),
3538 new_name, step_expr);
3539 if (!is_gimple_min_invariant (new_name))
3541 init_stmt = gimple_build_assign (new_var, new_name);
3542 new_name = make_ssa_name (new_var, init_stmt);
3543 gimple_assign_set_lhs (init_stmt, new_name);
3544 new_bb = gsi_insert_on_edge_immediate (pe, init_stmt);
3545 gcc_assert (!new_bb);
3546 if (dump_enabled_p ())
3548 dump_printf_loc (MSG_NOTE, vect_location,
3549 "created new init_stmt: ");
3550 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, init_stmt, 0);
3551 dump_printf (MSG_NOTE, "\n");
3553 constant_p = false;
3555 CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, new_name);
3557 /* Create a vector from [new_name_0, new_name_1, ..., new_name_nunits-1] */
3558 if (constant_p)
3559 new_vec = build_vector_from_ctor (vectype, v);
3560 else
3561 new_vec = build_constructor (vectype, v);
3562 vec_init = vect_init_vector (iv_phi, new_vec, vectype, NULL);
3566 /* Create the vector that holds the step of the induction. */
3567 if (nested_in_vect_loop)
3568 /* iv_loop is nested in the loop to be vectorized. Generate:
3569 vec_step = [S, S, S, S] */
3570 new_name = step_expr;
3571 else
3573 /* iv_loop is the loop to be vectorized. Generate:
3574 vec_step = [VF*S, VF*S, VF*S, VF*S] */
3575 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
3577 expr = build_int_cst (integer_type_node, vf);
3578 expr = fold_convert (TREE_TYPE (step_expr), expr);
3580 else
3581 expr = build_int_cst (TREE_TYPE (step_expr), vf);
3582 new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
3583 expr, step_expr);
3584 if (TREE_CODE (step_expr) == SSA_NAME)
3585 new_name = vect_init_vector (iv_phi, new_name,
3586 TREE_TYPE (step_expr), NULL);
3589 t = unshare_expr (new_name);
3590 gcc_assert (CONSTANT_CLASS_P (new_name)
3591 || TREE_CODE (new_name) == SSA_NAME);
3592 stepvectype = get_vectype_for_scalar_type (TREE_TYPE (new_name));
3593 gcc_assert (stepvectype);
3594 new_vec = build_vector_from_val (stepvectype, t);
3595 vec_step = vect_init_vector (iv_phi, new_vec, stepvectype, NULL);
3598 /* Create the following def-use cycle:
3599 loop prolog:
3600 vec_init = ...
3601 vec_step = ...
3602 loop:
3603 vec_iv = PHI <vec_init, vec_loop>
3605 STMT
3607 vec_loop = vec_iv + vec_step; */
3609 /* Create the induction-phi that defines the induction-operand. */
3610 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
3611 induction_phi = create_phi_node (vec_dest, iv_loop->header);
3612 set_vinfo_for_stmt (induction_phi,
3613 new_stmt_vec_info (induction_phi, loop_vinfo, NULL));
3614 induc_def = PHI_RESULT (induction_phi);
3616 /* Create the iv update inside the loop */
3617 new_stmt = gimple_build_assign (vec_dest, PLUS_EXPR, induc_def, vec_step);
3618 vec_def = make_ssa_name (vec_dest, new_stmt);
3619 gimple_assign_set_lhs (new_stmt, vec_def);
3620 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
3621 set_vinfo_for_stmt (new_stmt, new_stmt_vec_info (new_stmt, loop_vinfo,
3622 NULL));
3624 /* Set the arguments of the phi node: */
3625 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
3626 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
3627 UNKNOWN_LOCATION);
3630 /* In case that vectorization factor (VF) is bigger than the number
3631 of elements that we can fit in a vectype (nunits), we have to generate
3632 more than one vector stmt - i.e., we need to "unroll" the
3633 vector stmt by a factor VF/nunits. For more details see documentation
3634 in vectorizable_operation. */
3636 if (ncopies > 1)
3638 stmt_vec_info prev_stmt_vinfo;
3639 /* FORNOW. This restriction should be relaxed. */
3640 gcc_assert (!nested_in_vect_loop);
3642 /* Create the vector that holds the step of the induction. */
3643 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
3645 expr = build_int_cst (integer_type_node, nunits);
3646 expr = fold_convert (TREE_TYPE (step_expr), expr);
3648 else
3649 expr = build_int_cst (TREE_TYPE (step_expr), nunits);
3650 new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
3651 expr, step_expr);
3652 if (TREE_CODE (step_expr) == SSA_NAME)
3653 new_name = vect_init_vector (iv_phi, new_name,
3654 TREE_TYPE (step_expr), NULL);
3655 t = unshare_expr (new_name);
3656 gcc_assert (CONSTANT_CLASS_P (new_name)
3657 || TREE_CODE (new_name) == SSA_NAME);
3658 new_vec = build_vector_from_val (stepvectype, t);
3659 vec_step = vect_init_vector (iv_phi, new_vec, stepvectype, NULL);
3661 vec_def = induc_def;
3662 prev_stmt_vinfo = vinfo_for_stmt (induction_phi);
3663 for (i = 1; i < ncopies; i++)
3665 /* vec_i = vec_prev + vec_step */
3666 new_stmt = gimple_build_assign (vec_dest, PLUS_EXPR,
3667 vec_def, vec_step);
3668 vec_def = make_ssa_name (vec_dest, new_stmt);
3669 gimple_assign_set_lhs (new_stmt, vec_def);
3671 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
3672 if (!useless_type_conversion_p (resvectype, vectype))
3674 new_stmt
3675 = gimple_build_assign
3676 (vect_get_new_vect_var (resvectype, vect_simple_var,
3677 "vec_iv_"),
3678 VIEW_CONVERT_EXPR,
3679 build1 (VIEW_CONVERT_EXPR, resvectype,
3680 gimple_assign_lhs (new_stmt)));
3681 gimple_assign_set_lhs (new_stmt,
3682 make_ssa_name
3683 (gimple_assign_lhs (new_stmt), new_stmt));
3684 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
3686 set_vinfo_for_stmt (new_stmt,
3687 new_stmt_vec_info (new_stmt, loop_vinfo, NULL));
3688 STMT_VINFO_RELATED_STMT (prev_stmt_vinfo) = new_stmt;
3689 prev_stmt_vinfo = vinfo_for_stmt (new_stmt);
3693 if (nested_in_vect_loop)
3695 /* Find the loop-closed exit-phi of the induction, and record
3696 the final vector of induction results: */
3697 exit_phi = NULL;
3698 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
3700 gimple *use_stmt = USE_STMT (use_p);
3701 if (is_gimple_debug (use_stmt))
3702 continue;
3704 if (!flow_bb_inside_loop_p (iv_loop, gimple_bb (use_stmt)))
3706 exit_phi = use_stmt;
3707 break;
3710 if (exit_phi)
3712 stmt_vec_info stmt_vinfo = vinfo_for_stmt (exit_phi);
3713 /* FORNOW. Currently not supporting the case that an inner-loop induction
3714 is not used in the outer-loop (i.e. only outside the outer-loop). */
3715 gcc_assert (STMT_VINFO_RELEVANT_P (stmt_vinfo)
3716 && !STMT_VINFO_LIVE_P (stmt_vinfo));
3718 STMT_VINFO_VEC_STMT (stmt_vinfo) = new_stmt;
3719 if (dump_enabled_p ())
3721 dump_printf_loc (MSG_NOTE, vect_location,
3722 "vector of inductions after inner-loop:");
3723 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, new_stmt, 0);
3724 dump_printf (MSG_NOTE, "\n");
3730 if (dump_enabled_p ())
3732 dump_printf_loc (MSG_NOTE, vect_location,
3733 "transform induction: created def-use cycle: ");
3734 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, induction_phi, 0);
3735 dump_printf (MSG_NOTE, "\n");
3736 dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
3737 SSA_NAME_DEF_STMT (vec_def), 0);
3738 dump_printf (MSG_NOTE, "\n");
3741 STMT_VINFO_VEC_STMT (phi_info) = induction_phi;
3742 if (!useless_type_conversion_p (resvectype, vectype))
3744 new_stmt = gimple_build_assign (vect_get_new_vect_var (resvectype,
3745 vect_simple_var,
3746 "vec_iv_"),
3747 VIEW_CONVERT_EXPR,
3748 build1 (VIEW_CONVERT_EXPR, resvectype,
3749 induc_def));
3750 induc_def = make_ssa_name (gimple_assign_lhs (new_stmt), new_stmt);
3751 gimple_assign_set_lhs (new_stmt, induc_def);
3752 si = gsi_after_labels (bb);
3753 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
3754 set_vinfo_for_stmt (new_stmt,
3755 new_stmt_vec_info (new_stmt, loop_vinfo, NULL));
3756 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (new_stmt))
3757 = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (induction_phi));
3760 return induc_def;
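/* A minimal illustrative sketch (made up for exposition, not part of the
   vectorizer itself) of the lane values produced by the induction code
   above, assuming NUNITS == 4 and NCOPIES == 2 (so VF == 8) and a scalar
   IV x, x + S, x + 2*S, ...  Copy 0 holds lanes 0..3 and copy 1 holds
   lanes 4..7 of iteration ITER; the loop update adds VF*S to every lane
   of copy 0, while the extra copy is the previous copy plus NUNITS*S.
   The function and its arguments are hypothetical.  */

static void
vect_iv_lanes_example (int x, int s, int iter, int copy0[4], int copy1[4])
{
  for (int lane = 0; lane < 4; lane++)
    {
      copy0[lane] = x + (iter * 8 + lane) * s;      /* vec_iv (the PHI).   */
      copy1[lane] = x + (iter * 8 + 4 + lane) * s;  /* vec_iv + [4*S,...]  */
    }
}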
3764 /* Function get_initial_def_for_reduction
3766 Input:
3767 STMT - a stmt that performs a reduction operation in the loop.
3768 INIT_VAL - the initial value of the reduction variable
3770 Output:
3771 ADJUSTMENT_DEF - a tree that holds a value to be added to the final result
3772 of the reduction (used for adjusting the epilog - see below).
3773 Return a vector variable, initialized according to the operation that STMT
3774 performs. This vector will be used as the initial value of the
3775 vector of partial results.
3777 Option1 (adjust in epilog): Initialize the vector as follows:
3778 add/bit or/xor: [0,0,...,0,0]
3779 mult/bit and: [1,1,...,1,1]
3780 min/max/cond_expr: [init_val,init_val,..,init_val,init_val]
3781 and when necessary (e.g. add/mult case) let the caller know
3782 that it needs to adjust the result by init_val.
3784 Option2: Initialize the vector as follows:
3785 add/bit or/xor: [init_val,0,0,...,0]
3786 mult/bit and: [init_val,1,1,...,1]
3787 min/max/cond_expr: [init_val,init_val,...,init_val]
3788 and no adjustments are needed.
3790 For example, for the following code:
3792 s = init_val;
3793 for (i=0;i<n;i++)
3794 s = s + a[i];
3796 STMT is 's = s + a[i]', and the reduction variable is 's'.
3797 For a vector of 4 units, we want to return either [0,0,0,init_val],
3798 or [0,0,0,0] and let the caller know that it needs to adjust
3799 the result at the end by 'init_val'.
3801 FORNOW, we use Option1 (the 'adjust in epilog' scheme) when ADJUSTMENT_DEF
3802 is not NULL, because its initialization vector is simpler (the same element
3803 in all entries), and Option2 otherwise.
3805 A cost model should help decide between these two schemes. */
3807 tree
3808 get_initial_def_for_reduction (gimple *stmt, tree init_val,
3809 tree *adjustment_def)
3811 stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
3812 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
3813 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3814 tree scalar_type = TREE_TYPE (init_val);
3815 tree vectype = get_vectype_for_scalar_type (scalar_type);
3816 int nunits;
3817 enum tree_code code = gimple_assign_rhs_code (stmt);
3818 tree def_for_init;
3819 tree init_def;
3820 tree *elts;
3821 int i;
3822 bool nested_in_vect_loop = false;
3823 tree init_value;
3824 REAL_VALUE_TYPE real_init_val = dconst0;
3825 int int_init_val = 0;
3826 gimple *def_stmt = NULL;
3828 gcc_assert (vectype);
3829 nunits = TYPE_VECTOR_SUBPARTS (vectype);
3831 gcc_assert (POINTER_TYPE_P (scalar_type) || INTEGRAL_TYPE_P (scalar_type)
3832 || SCALAR_FLOAT_TYPE_P (scalar_type));
3834 if (nested_in_vect_loop_p (loop, stmt))
3835 nested_in_vect_loop = true;
3836 else
3837 gcc_assert (loop == (gimple_bb (stmt))->loop_father);
3839 /* In case of double reduction we only create a vector variable to be put
3840 in the reduction phi node. The actual statement creation is done in
3841 vect_create_epilog_for_reduction. */
3842 if (adjustment_def && nested_in_vect_loop
3843 && TREE_CODE (init_val) == SSA_NAME
3844 && (def_stmt = SSA_NAME_DEF_STMT (init_val))
3845 && gimple_code (def_stmt) == GIMPLE_PHI
3846 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
3847 && vinfo_for_stmt (def_stmt)
3848 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
3849 == vect_double_reduction_def)
3851 *adjustment_def = NULL;
3852 return vect_create_destination_var (init_val, vectype);
3855 if (TREE_CONSTANT (init_val))
3857 if (SCALAR_FLOAT_TYPE_P (scalar_type))
3858 init_value = build_real (scalar_type, TREE_REAL_CST (init_val));
3859 else
3860 init_value = build_int_cst (scalar_type, TREE_INT_CST_LOW (init_val));
3862 else
3863 init_value = init_val;
3865 switch (code)
3867 case WIDEN_SUM_EXPR:
3868 case DOT_PROD_EXPR:
3869 case SAD_EXPR:
3870 case PLUS_EXPR:
3871 case MINUS_EXPR:
3872 case BIT_IOR_EXPR:
3873 case BIT_XOR_EXPR:
3874 case MULT_EXPR:
3875 case BIT_AND_EXPR:
3876 /* ADJUSTMENT_DEF is NULL when called from
3877 vect_create_epilog_for_reduction to vectorize double reduction. */
3878 if (adjustment_def)
3880 if (nested_in_vect_loop)
3881 *adjustment_def = vect_get_vec_def_for_operand (init_val, stmt,
3882 NULL);
3883 else
3884 *adjustment_def = init_val;
3887 if (code == MULT_EXPR)
3889 real_init_val = dconst1;
3890 int_init_val = 1;
3893 if (code == BIT_AND_EXPR)
3894 int_init_val = -1;
3896 if (SCALAR_FLOAT_TYPE_P (scalar_type))
3897 def_for_init = build_real (scalar_type, real_init_val);
3898 else
3899 def_for_init = build_int_cst (scalar_type, int_init_val);
3901 /* Create a vector of '0' or '1' in all elements except the first. */
3902 elts = XALLOCAVEC (tree, nunits);
3903 for (i = nunits - 2; i >= 0; --i)
3904 elts[i + 1] = def_for_init;
3906 /* Option1: the first element is '0' or '1' as well. */
3907 if (adjustment_def)
3909 elts[0] = def_for_init;
3910 init_def = build_vector (vectype, elts);
3911 break;
3914 /* Option2: the first element is INIT_VAL. */
3915 elts[0] = init_val;
3916 if (TREE_CONSTANT (init_val))
3917 init_def = build_vector (vectype, elts);
3918 else
3920 vec<constructor_elt, va_gc> *v;
3921 vec_alloc (v, nunits);
3922 CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, init_val);
3923 for (i = 1; i < nunits; ++i)
3924 CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, elts[i]);
3925 init_def = build_constructor (vectype, v);
3928 break;
3930 case MIN_EXPR:
3931 case MAX_EXPR:
3932 case COND_EXPR:
3933 if (adjustment_def)
3935 *adjustment_def = NULL_TREE;
3936 init_def = vect_get_vec_def_for_operand (init_val, stmt, NULL);
3937 break;
3940 init_def = build_vector_from_val (vectype, init_value);
3941 break;
3943 default:
3944 gcc_unreachable ();
3947 return init_def;
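/* A minimal illustrative sketch (made up for exposition, not used by the
   vectorizer) of Option1, the "adjust in epilog" scheme described above,
   for the example "s = init_val; for (i=0;i<n;i++) s = s + a[i];" with a
   4-element vector, assuming N is a multiple of 4: the partial sums start
   from the neutral element 0, and ADJUSTMENT_DEF (init_val) is added back
   once after the loop.  */

static int
reduc_adjust_in_epilog_example (const int *a, int n, int init_val)
{
  int partial[4] = { 0, 0, 0, 0 };     /* Option1 initial vector.       */
  for (int i = 0; i < n; i += 4)
    for (int lane = 0; lane < 4; lane++)
      partial[lane] += a[i + lane];    /* Vectorized loop body.         */
  int s = partial[0] + partial[1] + partial[2] + partial[3];
  return s + init_val;                 /* Adjust by ADJUSTMENT_DEF.     */
}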
3950 /* Function vect_create_epilog_for_reduction
3952 Create code at the loop-epilog to finalize the result of a reduction
3953 computation.
3955 VECT_DEFS is list of vector of partial results, i.e., the lhs's of vector
3956 reduction statements.
3957 STMT is the scalar reduction stmt that is being vectorized.
3958 NCOPIES is > 1 in case the vectorization factor (VF) is bigger than the
3959 number of elements that we can fit in a vectype (nunits). In this case
3960 we have to generate more than one vector stmt - i.e - we need to "unroll"
3961 the vector stmt by a factor VF/nunits. For more details see documentation
3962 in vectorizable_operation.
3963 REDUC_CODE is the tree-code for the epilog reduction.
3964 REDUCTION_PHIS is a list of the phi-nodes that carry the reduction
3965 computation.
3966 REDUC_INDEX is the index of the operand in the right hand side of the
3967 statement that is defined by REDUCTION_PHI.
3968 DOUBLE_REDUC is TRUE if double reduction phi nodes should be handled.
3969 SLP_NODE is an SLP node containing a group of reduction statements. The
3970 first one in this group is STMT.
3972 This function:
3973 1. Creates the reduction def-use cycles: sets the arguments for
3974 REDUCTION_PHIS:
3975 The loop-entry argument is the vectorized initial-value of the reduction.
3976 The loop-latch argument is taken from VECT_DEFS - the vector of partial
3977 sums.
3978 2. "Reduces" each vector of partial results VECT_DEFS into a single result,
3979 by applying the operation specified by REDUC_CODE if available, or by
3980 other means (whole-vector shifts or a scalar loop).
3981 The function also creates a new phi node at the loop exit to preserve
3982 loop-closed form, as illustrated below.
3984 The flow at the entry to this function:
3986 loop:
3987 vec_def = phi <null, null> # REDUCTION_PHI
3988 VECT_DEF = vector_stmt # vectorized form of STMT
3989 s_loop = scalar_stmt # (scalar) STMT
3990 loop_exit:
3991 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
3992 use <s_out0>
3993 use <s_out0>
3995 The above is transformed by this function into:
3997 loop:
3998 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
3999 VECT_DEF = vector_stmt # vectorized form of STMT
4000 s_loop = scalar_stmt # (scalar) STMT
4001 loop_exit:
4002 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
4003 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
4004 v_out2 = reduce <v_out1>
4005 s_out3 = extract_field <v_out2, 0>
4006 s_out4 = adjust_result <s_out3>
4007 use <s_out4>
4008 use <s_out4>
4011 static void
4012 vect_create_epilog_for_reduction (vec<tree> vect_defs, gimple *stmt,
4013 int ncopies, enum tree_code reduc_code,
4014 vec<gimple *> reduction_phis,
4015 int reduc_index, bool double_reduc,
4016 slp_tree slp_node)
4018 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4019 stmt_vec_info prev_phi_info;
4020 tree vectype;
4021 machine_mode mode;
4022 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4023 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *outer_loop = NULL;
4024 basic_block exit_bb;
4025 tree scalar_dest;
4026 tree scalar_type;
4027 gimple *new_phi = NULL, *phi;
4028 gimple_stmt_iterator exit_gsi;
4029 tree vec_dest;
4030 tree new_temp = NULL_TREE, new_dest, new_name, new_scalar_dest;
4031 gimple *epilog_stmt = NULL;
4032 enum tree_code code = gimple_assign_rhs_code (stmt);
4033 gimple *exit_phi;
4034 tree bitsize;
4035 tree adjustment_def = NULL;
4036 tree vec_initial_def = NULL;
4037 tree reduction_op, expr, def;
4038 tree orig_name, scalar_result;
4039 imm_use_iterator imm_iter, phi_imm_iter;
4040 use_operand_p use_p, phi_use_p;
4041 gimple *use_stmt, *orig_stmt, *reduction_phi = NULL;
4042 bool nested_in_vect_loop = false;
4043 auto_vec<gimple *> new_phis;
4044 auto_vec<gimple *> inner_phis;
4045 enum vect_def_type dt = vect_unknown_def_type;
4046 int j, i;
4047 auto_vec<tree> scalar_results;
4048 unsigned int group_size = 1, k, ratio;
4049 auto_vec<tree> vec_initial_defs;
4050 auto_vec<gimple *> phis;
4051 bool slp_reduc = false;
4052 tree new_phi_result;
4053 gimple *inner_phi = NULL;
4055 if (slp_node)
4056 group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
4058 if (nested_in_vect_loop_p (loop, stmt))
4060 outer_loop = loop;
4061 loop = loop->inner;
4062 nested_in_vect_loop = true;
4063 gcc_assert (!slp_node);
4066 reduction_op = get_reduction_op (stmt, reduc_index);
4068 vectype = get_vectype_for_scalar_type (TREE_TYPE (reduction_op));
4069 gcc_assert (vectype);
4070 mode = TYPE_MODE (vectype);
4072 /* 1. Create the reduction def-use cycle:
4073 Set the arguments of REDUCTION_PHIS, i.e., transform
4075 loop:
4076 vec_def = phi <null, null> # REDUCTION_PHI
4077 VECT_DEF = vector_stmt # vectorized form of STMT
4080 into:
4082 loop:
4083 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
4084 VECT_DEF = vector_stmt # vectorized form of STMT
4087 (in case of SLP, do it for all the phis). */
4089 /* Get the loop-entry arguments. */
4090 if (slp_node)
4091 vect_get_vec_defs (reduction_op, NULL_TREE, stmt, &vec_initial_defs,
4092 NULL, slp_node, reduc_index);
4093 else
4095 vec_initial_defs.create (1);
4096 /* For the case of reduction, vect_get_vec_def_for_operand returns
4097 the scalar def before the loop, that defines the initial value
4098 of the reduction variable. */
4099 vec_initial_def = vect_get_vec_def_for_operand (reduction_op, stmt,
4100 &adjustment_def);
4101 vec_initial_defs.quick_push (vec_initial_def);
4104 /* Set phi nodes arguments. */
4105 FOR_EACH_VEC_ELT (reduction_phis, i, phi)
4107 tree vec_init_def, def;
4108 gimple_seq stmts;
4109 vec_init_def = force_gimple_operand (vec_initial_defs[i], &stmts,
4110 true, NULL_TREE);
4111 gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), stmts);
4112 def = vect_defs[i];
4113 for (j = 0; j < ncopies; j++)
4115 /* Set the loop-entry arg of the reduction-phi. */
4116 add_phi_arg (as_a <gphi *> (phi), vec_init_def,
4117 loop_preheader_edge (loop), UNKNOWN_LOCATION);
4119 /* Set the loop-latch arg for the reduction-phi. */
4120 if (j > 0)
4121 def = vect_get_vec_def_for_stmt_copy (vect_unknown_def_type, def);
4123 add_phi_arg (as_a <gphi *> (phi), def, loop_latch_edge (loop),
4124 UNKNOWN_LOCATION);
4126 if (dump_enabled_p ())
4128 dump_printf_loc (MSG_NOTE, vect_location,
4129 "transform reduction: created def-use cycle: ");
4130 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
4131 dump_printf (MSG_NOTE, "\n");
4132 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, SSA_NAME_DEF_STMT (def), 0);
4133 dump_printf (MSG_NOTE, "\n");
4136 phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi));
4140 /* 2. Create epilog code.
4141 The reduction epilog code operates across the elements of the vector
4142 of partial results computed by the vectorized loop.
4143 The reduction epilog code consists of:
4145 step 1: compute the scalar result in a vector (v_out2)
4146 step 2: extract the scalar result (s_out3) from the vector (v_out2)
4147 step 3: adjust the scalar result (s_out3) if needed.
4149 Step 1 can be accomplished using one of the following three schemes:
4150 (scheme 1) using reduc_code, if available.
4151 (scheme 2) using whole-vector shifts, if available.
4152 (scheme 3) using a scalar loop. In this case steps 1+2 above are
4153 combined.
4155 The overall epilog code looks like this:
4157 s_out0 = phi <s_loop> # original EXIT_PHI
4158 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
4159 v_out2 = reduce <v_out1> # step 1
4160 s_out3 = extract_field <v_out2, 0> # step 2
4161 s_out4 = adjust_result <s_out3> # step 3
4163 (step 3 is optional, and steps 1 and 2 may be combined).
4164 Lastly, the uses of s_out0 are replaced by s_out4. */
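/* For instance (a made-up 4-element integer PLUS reduction, illustrating
   scheme 2): starting from v = {a0, a1, a2, a3},
     shift by 2:  v' = {a2, a3, 0, 0},      v = v + v' = {a0+a2, a1+a3, _, _}
     shift by 1:  v' = {a1+a3, _, _, 0},    v = v + v' = {a0+a1+a2+a3, _, _, _}
   and the scalar result is then extracted from element 0; the '_' lanes
   hold don't-care values.  */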
4167 /* 2.1 Create new loop-exit-phis to preserve loop-closed form:
4168 v_out1 = phi <VECT_DEF>
4169 Store them in NEW_PHIS. */
4171 exit_bb = single_exit (loop)->dest;
4172 prev_phi_info = NULL;
4173 new_phis.create (vect_defs.length ());
4174 FOR_EACH_VEC_ELT (vect_defs, i, def)
4176 for (j = 0; j < ncopies; j++)
4178 tree new_def = copy_ssa_name (def);
4179 phi = create_phi_node (new_def, exit_bb);
4180 set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, loop_vinfo, NULL));
4181 if (j == 0)
4182 new_phis.quick_push (phi);
4183 else
4185 def = vect_get_vec_def_for_stmt_copy (dt, def);
4186 STMT_VINFO_RELATED_STMT (prev_phi_info) = phi;
4189 SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, def);
4190 prev_phi_info = vinfo_for_stmt (phi);
4194 /* The epilogue is created for the outer-loop, i.e., for the loop being
4195 vectorized. Create exit phis for the outer loop. */
4196 if (double_reduc)
4198 loop = outer_loop;
4199 exit_bb = single_exit (loop)->dest;
4200 inner_phis.create (vect_defs.length ());
4201 FOR_EACH_VEC_ELT (new_phis, i, phi)
4203 tree new_result = copy_ssa_name (PHI_RESULT (phi));
4204 gphi *outer_phi = create_phi_node (new_result, exit_bb);
4205 SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx,
4206 PHI_RESULT (phi));
4207 set_vinfo_for_stmt (outer_phi, new_stmt_vec_info (outer_phi,
4208 loop_vinfo, NULL));
4209 inner_phis.quick_push (phi);
4210 new_phis[i] = outer_phi;
4211 prev_phi_info = vinfo_for_stmt (outer_phi);
4212 while (STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi)))
4214 phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi));
4215 new_result = copy_ssa_name (PHI_RESULT (phi));
4216 outer_phi = create_phi_node (new_result, exit_bb);
4217 SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx,
4218 PHI_RESULT (phi));
4219 set_vinfo_for_stmt (outer_phi, new_stmt_vec_info (outer_phi,
4220 loop_vinfo, NULL));
4221 STMT_VINFO_RELATED_STMT (prev_phi_info) = outer_phi;
4222 prev_phi_info = vinfo_for_stmt (outer_phi);
4227 exit_gsi = gsi_after_labels (exit_bb);
4229 /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
4230 (i.e. when reduc_code is not available) and in the final adjustment
4231 code (if needed). Also get the original scalar reduction variable as
4232 defined in the loop. In case STMT is a "pattern-stmt" (i.e. - it
4233 represents a reduction pattern), the tree-code and scalar-def are
4234 taken from the original stmt that the pattern-stmt (STMT) replaces.
4235 Otherwise (it is a regular reduction) - the tree-code and scalar-def
4236 are taken from STMT. */
4238 orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
4239 if (!orig_stmt)
4241 /* Regular reduction */
4242 orig_stmt = stmt;
4244 else
4246 /* Reduction pattern */
4247 stmt_vec_info stmt_vinfo = vinfo_for_stmt (orig_stmt);
4248 gcc_assert (STMT_VINFO_IN_PATTERN_P (stmt_vinfo));
4249 gcc_assert (STMT_VINFO_RELATED_STMT (stmt_vinfo) == stmt);
4252 code = gimple_assign_rhs_code (orig_stmt);
4253 /* For MINUS_EXPR the initial vector is [init_val,0,...,0], therefore,
4254 partial results are added and not subtracted. */
4255 if (code == MINUS_EXPR)
4256 code = PLUS_EXPR;
4258 scalar_dest = gimple_assign_lhs (orig_stmt);
4259 scalar_type = TREE_TYPE (scalar_dest);
4260 scalar_results.create (group_size);
4261 new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
4262 bitsize = TYPE_SIZE (scalar_type);
4264 /* In case this is a reduction in an inner-loop while vectorizing an outer
4265 loop - we don't need to extract a single scalar result at the end of the
4266 inner-loop (unless it is double reduction, i.e., the use of reduction is
4267 outside the outer-loop). The final vector of partial results will be used
4268 in the vectorized outer-loop, or reduced to a scalar result at the end of
4269 the outer-loop. */
4270 if (nested_in_vect_loop && !double_reduc)
4271 goto vect_finalize_reduction;
4273 /* SLP reduction without reduction chain, e.g.,
4274 # a1 = phi <a2, a0>
4275 # b1 = phi <b2, b0>
4276 a2 = operation (a1)
4277 b2 = operation (b1) */
4278 slp_reduc = (slp_node && !GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)));
4280 /* In case of reduction chain, e.g.,
4281 # a1 = phi <a3, a0>
4282 a2 = operation (a1)
4283 a3 = operation (a2),
4285 we may end up with more than one vector result. Here we reduce them to
4286 one vector. */
4287 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
4289 tree first_vect = PHI_RESULT (new_phis[0]);
4290 tree tmp;
4291 gassign *new_vec_stmt = NULL;
4293 vec_dest = vect_create_destination_var (scalar_dest, vectype);
4294 for (k = 1; k < new_phis.length (); k++)
4296 gimple *next_phi = new_phis[k];
4297 tree second_vect = PHI_RESULT (next_phi);
4299 tmp = build2 (code, vectype, first_vect, second_vect);
4300 new_vec_stmt = gimple_build_assign (vec_dest, tmp);
4301 first_vect = make_ssa_name (vec_dest, new_vec_stmt);
4302 gimple_assign_set_lhs (new_vec_stmt, first_vect);
4303 gsi_insert_before (&exit_gsi, new_vec_stmt, GSI_SAME_STMT);
4306 new_phi_result = first_vect;
4307 if (new_vec_stmt)
4309 new_phis.truncate (0);
4310 new_phis.safe_push (new_vec_stmt);
4313 else
4314 new_phi_result = PHI_RESULT (new_phis[0]);
4316 /* 2.3 Create the reduction code, using one of the three schemes described
4317 above. In SLP we simply need to extract all the elements from the
4318 vector (without reducing them), so we use scalar shifts. */
4319 if (reduc_code != ERROR_MARK && !slp_reduc)
4321 tree tmp;
4322 tree vec_elem_type;
4324 /*** Case 1: Create:
4325 v_out2 = reduc_expr <v_out1> */
4327 if (dump_enabled_p ())
4328 dump_printf_loc (MSG_NOTE, vect_location,
4329 "Reduce using direct vector reduction.\n");
4331 vec_elem_type = TREE_TYPE (TREE_TYPE (new_phi_result));
4332 if (!useless_type_conversion_p (scalar_type, vec_elem_type))
4334 tree tmp_dest =
4335 vect_create_destination_var (scalar_dest, vec_elem_type);
4336 tmp = build1 (reduc_code, vec_elem_type, new_phi_result);
4337 epilog_stmt = gimple_build_assign (tmp_dest, tmp);
4338 new_temp = make_ssa_name (tmp_dest, epilog_stmt);
4339 gimple_assign_set_lhs (epilog_stmt, new_temp);
4340 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4342 tmp = build1 (NOP_EXPR, scalar_type, new_temp);
4344 else
4345 tmp = build1 (reduc_code, scalar_type, new_phi_result);
4346 epilog_stmt = gimple_build_assign (new_scalar_dest, tmp);
4347 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
4348 gimple_assign_set_lhs (epilog_stmt, new_temp);
4349 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4350 scalar_results.safe_push (new_temp);
4352 else
4354 bool reduce_with_shift = have_whole_vector_shift (mode);
4355 int element_bitsize = tree_to_uhwi (bitsize);
4356 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
4357 tree vec_temp;
4359 /* Regardless of whether we have a whole vector shift, if we're
4360 emulating the operation via tree-vect-generic, we don't want
4361 to use it. Only the first round of the reduction is likely
4362 to still be profitable via emulation. */
4363 /* ??? It might be better to emit a reduction tree code here, so that
4364 tree-vect-generic can expand the first round via bit tricks. */
4365 if (!VECTOR_MODE_P (mode))
4366 reduce_with_shift = false;
4367 else
4369 optab optab = optab_for_tree_code (code, vectype, optab_default);
4370 if (optab_handler (optab, mode) == CODE_FOR_nothing)
4371 reduce_with_shift = false;
4374 if (reduce_with_shift && !slp_reduc)
4376 int nelements = vec_size_in_bits / element_bitsize;
4377 unsigned char *sel = XALLOCAVEC (unsigned char, nelements);
4379 int elt_offset;
4381 tree zero_vec = build_zero_cst (vectype);
4382 /*** Case 2: Create:
4383 for (offset = nelements/2; offset >= 1; offset/=2)
4385 Create: va' = vec_shift <va, offset>
4386 Create: va = vop <va, va'>
4387 } */
4389 tree rhs;
4391 if (dump_enabled_p ())
4392 dump_printf_loc (MSG_NOTE, vect_location,
4393 "Reduce using vector shifts\n");
4395 vec_dest = vect_create_destination_var (scalar_dest, vectype);
4396 new_temp = new_phi_result;
4397 for (elt_offset = nelements / 2;
4398 elt_offset >= 1;
4399 elt_offset /= 2)
4401 calc_vec_perm_mask_for_shift (mode, elt_offset, sel);
4402 tree mask = vect_gen_perm_mask_any (vectype, sel);
4403 epilog_stmt = gimple_build_assign (vec_dest, VEC_PERM_EXPR,
4404 new_temp, zero_vec, mask);
4405 new_name = make_ssa_name (vec_dest, epilog_stmt);
4406 gimple_assign_set_lhs (epilog_stmt, new_name);
4407 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4409 epilog_stmt = gimple_build_assign (vec_dest, code, new_name,
4410 new_temp);
4411 new_temp = make_ssa_name (vec_dest, epilog_stmt);
4412 gimple_assign_set_lhs (epilog_stmt, new_temp);
4413 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4416 /* 2.4 Extract the final scalar result. Create:
4417 s_out3 = extract_field <v_out2, bitpos> */
4419 if (dump_enabled_p ())
4420 dump_printf_loc (MSG_NOTE, vect_location,
4421 "extract scalar result\n");
4423 rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp,
4424 bitsize, bitsize_zero_node);
4425 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
4426 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
4427 gimple_assign_set_lhs (epilog_stmt, new_temp);
4428 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4429 scalar_results.safe_push (new_temp);
4431 else
4433 /*** Case 3: Create:
4434 s = extract_field <v_out2, 0>
4435 for (offset = element_size;
4436 offset < vector_size;
4437 offset += element_size;)
4439 Create: s' = extract_field <v_out2, offset>
4440 Create: s = op <s, s'> // For non SLP cases
4441 } */
4443 if (dump_enabled_p ())
4444 dump_printf_loc (MSG_NOTE, vect_location,
4445 "Reduce using scalar code.\n");
4447 vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
4448 FOR_EACH_VEC_ELT (new_phis, i, new_phi)
4450 int bit_offset;
4451 if (gimple_code (new_phi) == GIMPLE_PHI)
4452 vec_temp = PHI_RESULT (new_phi);
4453 else
4454 vec_temp = gimple_assign_lhs (new_phi);
4455 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, bitsize,
4456 bitsize_zero_node);
4457 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
4458 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
4459 gimple_assign_set_lhs (epilog_stmt, new_temp);
4460 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4462 /* In SLP we don't need to apply the reduction operation, so we just
4463 collect s' values in SCALAR_RESULTS. */
4464 if (slp_reduc)
4465 scalar_results.safe_push (new_temp);
4467 for (bit_offset = element_bitsize;
4468 bit_offset < vec_size_in_bits;
4469 bit_offset += element_bitsize)
4471 tree bitpos = bitsize_int (bit_offset);
4472 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp,
4473 bitsize, bitpos);
4475 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
4476 new_name = make_ssa_name (new_scalar_dest, epilog_stmt);
4477 gimple_assign_set_lhs (epilog_stmt, new_name);
4478 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4480 if (slp_reduc)
4482 /* In SLP we don't need to apply the reduction operation, so
4483 we just collect s' values in SCALAR_RESULTS. */
4484 new_temp = new_name;
4485 scalar_results.safe_push (new_name);
4487 else
4489 epilog_stmt = gimple_build_assign (new_scalar_dest, code,
4490 new_name, new_temp);
4491 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
4492 gimple_assign_set_lhs (epilog_stmt, new_temp);
4493 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4498 /* The only case where we need to reduce scalar results in SLP is
4499 unrolling. If the size of SCALAR_RESULTS is greater than
4500 GROUP_SIZE, we reduce them by combining elements modulo
4501 GROUP_SIZE. */
4502 if (slp_reduc)
4504 tree res, first_res, new_res;
4505 gimple *new_stmt;
4507 /* Reduce multiple scalar results in case of SLP unrolling. */
4508 for (j = group_size; scalar_results.iterate (j, &res);
4509 j++)
4511 first_res = scalar_results[j % group_size];
4512 new_stmt = gimple_build_assign (new_scalar_dest, code,
4513 first_res, res);
4514 new_res = make_ssa_name (new_scalar_dest, new_stmt);
4515 gimple_assign_set_lhs (new_stmt, new_res);
4516 gsi_insert_before (&exit_gsi, new_stmt, GSI_SAME_STMT);
4517 scalar_results[j % group_size] = new_res;
4520 else
4521 /* Not SLP - we have one scalar to keep in SCALAR_RESULTS. */
4522 scalar_results.safe_push (new_temp);
4526 vect_finalize_reduction:
4528 if (double_reduc)
4529 loop = loop->inner;
4531 /* 2.5 Adjust the final result by the initial value of the reduction
4532 variable. (When such adjustment is not needed, then
4533 'adjustment_def' is zero). For example, if code is PLUS we create:
4534 new_temp = loop_exit_def + adjustment_def */
4536 if (adjustment_def)
4538 gcc_assert (!slp_reduc);
4539 if (nested_in_vect_loop)
4541 new_phi = new_phis[0];
4542 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) == VECTOR_TYPE);
4543 expr = build2 (code, vectype, PHI_RESULT (new_phi), adjustment_def);
4544 new_dest = vect_create_destination_var (scalar_dest, vectype);
4546 else
4548 new_temp = scalar_results[0];
4549 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
4550 expr = build2 (code, scalar_type, new_temp, adjustment_def);
4551 new_dest = vect_create_destination_var (scalar_dest, scalar_type);
4554 epilog_stmt = gimple_build_assign (new_dest, expr);
4555 new_temp = make_ssa_name (new_dest, epilog_stmt);
4556 gimple_assign_set_lhs (epilog_stmt, new_temp);
4557 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4558 if (nested_in_vect_loop)
4560 set_vinfo_for_stmt (epilog_stmt,
4561 new_stmt_vec_info (epilog_stmt, loop_vinfo,
4562 NULL));
4563 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (epilog_stmt)) =
4564 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (new_phi));
4566 if (!double_reduc)
4567 scalar_results.quick_push (new_temp);
4568 else
4569 scalar_results[0] = new_temp;
4571 else
4572 scalar_results[0] = new_temp;
4574 new_phis[0] = epilog_stmt;
4577 /* 2.6 Handle the loop-exit phis. Replace the uses of scalar loop-exit
4578 phis with new adjusted scalar results, i.e., replace use <s_out0>
4579 with use <s_out4>.
4581 Transform:
4582 loop_exit:
4583 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
4584 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
4585 v_out2 = reduce <v_out1>
4586 s_out3 = extract_field <v_out2, 0>
4587 s_out4 = adjust_result <s_out3>
4588 use <s_out0>
4589 use <s_out0>
4591 into:
4593 loop_exit:
4594 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
4595 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
4596 v_out2 = reduce <v_out1>
4597 s_out3 = extract_field <v_out2, 0>
4598 s_out4 = adjust_result <s_out3>
4599 use <s_out4>
4600 use <s_out4> */
4603 /* In SLP reduction chain we reduce vector results into one vector if
4604 necessary, hence we set here GROUP_SIZE to 1. SCALAR_DEST is the LHS of
4605 the last stmt in the reduction chain, since we are looking for the loop
4606 exit phi node. */
4607 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
4609 gimple *dest_stmt = SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1];
4610 /* Handle reduction patterns. */
4611 if (STMT_VINFO_RELATED_STMT (vinfo_for_stmt (dest_stmt)))
4612 dest_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (dest_stmt));
4614 scalar_dest = gimple_assign_lhs (dest_stmt);
4615 group_size = 1;
4618 /* In SLP we may have several statements in NEW_PHIS and REDUCTION_PHIS (in
4619 case GROUP_SIZE is greater than the vectorization factor). Therefore, we
4620 need to match SCALAR_RESULTS with corresponding statements. The first
4621 (GROUP_SIZE / number of new vector stmts) scalar results correspond to
4622 the first vector stmt, etc.
4623 (RATIO is equal to (GROUP_SIZE / number of new vector stmts)). */
4624 if (group_size > new_phis.length ())
4626 ratio = group_size / new_phis.length ();
4627 gcc_assert (!(group_size % new_phis.length ()));
4629 else
4630 ratio = 1;
4632 for (k = 0; k < group_size; k++)
4634 if (k % ratio == 0)
4636 epilog_stmt = new_phis[k / ratio];
4637 reduction_phi = reduction_phis[k / ratio];
4638 if (double_reduc)
4639 inner_phi = inner_phis[k / ratio];
4642 if (slp_reduc)
4644 gimple *current_stmt = SLP_TREE_SCALAR_STMTS (slp_node)[k];
4646 orig_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (current_stmt));
4647 /* SLP statements can't participate in patterns. */
4648 gcc_assert (!orig_stmt);
4649 scalar_dest = gimple_assign_lhs (current_stmt);
4652 phis.create (3);
4653 /* Find the loop-closed-use at the loop exit of the original scalar
4654 result. (The reduction result is expected to have two immediate uses -
4655 one at the latch block, and one at the loop exit). */
4656 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
4657 if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p)))
4658 && !is_gimple_debug (USE_STMT (use_p)))
4659 phis.safe_push (USE_STMT (use_p));
4661 /* While we expect to have found an exit_phi because of loop-closed-ssa
4662 form we can end up without one if the scalar cycle is dead. */
4664 FOR_EACH_VEC_ELT (phis, i, exit_phi)
4666 if (outer_loop)
4668 stmt_vec_info exit_phi_vinfo = vinfo_for_stmt (exit_phi);
4669 gphi *vect_phi;
4671 /* FORNOW. Currently not supporting the case that an inner-loop
4672 reduction is not used in the outer-loop (but only outside the
4673 outer-loop), unless it is double reduction. */
4674 gcc_assert ((STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
4675 && !STMT_VINFO_LIVE_P (exit_phi_vinfo))
4676 || double_reduc);
4678 if (double_reduc)
4679 STMT_VINFO_VEC_STMT (exit_phi_vinfo) = inner_phi;
4680 else
4681 STMT_VINFO_VEC_STMT (exit_phi_vinfo) = epilog_stmt;
4682 if (!double_reduc
4683 || STMT_VINFO_DEF_TYPE (exit_phi_vinfo)
4684 != vect_double_reduction_def)
4685 continue;
4687 /* Handle double reduction:
4689 stmt1: s1 = phi <s0, s2> - double reduction phi (outer loop)
4690 stmt2: s3 = phi <s1, s4> - (regular) reduc phi (inner loop)
4691 stmt3: s4 = use (s3) - (regular) reduc stmt (inner loop)
4692 stmt4: s2 = phi <s4> - double reduction stmt (outer loop)
4694 At that point the regular reduction (stmt2 and stmt3) is
4695 already vectorized, as well as the exit phi node, stmt4.
4696 Here we vectorize the phi node of double reduction, stmt1, and
4697 update all relevant statements. */
4699 /* Go through all the uses of s2 to find double reduction phi
4700 node, i.e., stmt1 above. */
4701 orig_name = PHI_RESULT (exit_phi);
4702 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
4704 stmt_vec_info use_stmt_vinfo;
4705 stmt_vec_info new_phi_vinfo;
4706 tree vect_phi_init, preheader_arg, vect_phi_res, init_def;
4707 basic_block bb = gimple_bb (use_stmt);
4708 gimple *use;
4710 /* Check that USE_STMT is really double reduction phi
4711 node. */
4712 if (gimple_code (use_stmt) != GIMPLE_PHI
4713 || gimple_phi_num_args (use_stmt) != 2
4714 || bb->loop_father != outer_loop)
4715 continue;
4716 use_stmt_vinfo = vinfo_for_stmt (use_stmt);
4717 if (!use_stmt_vinfo
4718 || STMT_VINFO_DEF_TYPE (use_stmt_vinfo)
4719 != vect_double_reduction_def)
4720 continue;
4722 /* Create vector phi node for double reduction:
4723 vs1 = phi <vs0, vs2>
4724 vs1 was created previously in this function by a call to
4725 vect_get_vec_def_for_operand and is stored in
4726 vec_initial_def;
4727 vs2 is defined by INNER_PHI, the vectorized EXIT_PHI;
4728 vs0 is created here. */
4730 /* Create vector phi node. */
4731 vect_phi = create_phi_node (vec_initial_def, bb);
4732 new_phi_vinfo = new_stmt_vec_info (vect_phi,
4733 loop_vec_info_for_loop (outer_loop), NULL);
4734 set_vinfo_for_stmt (vect_phi, new_phi_vinfo);
4736 /* Create vs0 - initial def of the double reduction phi. */
4737 preheader_arg = PHI_ARG_DEF_FROM_EDGE (use_stmt,
4738 loop_preheader_edge (outer_loop));
4739 init_def = get_initial_def_for_reduction (stmt,
4740 preheader_arg, NULL);
4741 vect_phi_init = vect_init_vector (use_stmt, init_def,
4742 vectype, NULL);
4744 /* Update phi node arguments with vs0 and vs2. */
4745 add_phi_arg (vect_phi, vect_phi_init,
4746 loop_preheader_edge (outer_loop),
4747 UNKNOWN_LOCATION);
4748 add_phi_arg (vect_phi, PHI_RESULT (inner_phi),
4749 loop_latch_edge (outer_loop), UNKNOWN_LOCATION);
4750 if (dump_enabled_p ())
4752 dump_printf_loc (MSG_NOTE, vect_location,
4753 "created double reduction phi node: ");
4754 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, vect_phi, 0);
4755 dump_printf (MSG_NOTE, "\n");
4758 vect_phi_res = PHI_RESULT (vect_phi);
4760 /* Replace the use, i.e., set the correct vs1 in the regular
4761 reduction phi node. FORNOW, NCOPIES is always 1, so the
4762 loop is redundant. */
4763 use = reduction_phi;
4764 for (j = 0; j < ncopies; j++)
4766 edge pr_edge = loop_preheader_edge (loop);
4767 SET_PHI_ARG_DEF (use, pr_edge->dest_idx, vect_phi_res);
4768 use = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (use));
4774 phis.release ();
4775 if (nested_in_vect_loop)
4777 if (double_reduc)
4778 loop = outer_loop;
4779 else
4780 continue;
4783 phis.create (3);
4784 /* Find the loop-closed-use at the loop exit of the original scalar
4785 result. (The reduction result is expected to have two immediate uses,
4786 one at the latch block, and one at the loop exit). For double
4787 reductions we are looking for exit phis of the outer loop. */
4788 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
4790 if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
4792 if (!is_gimple_debug (USE_STMT (use_p)))
4793 phis.safe_push (USE_STMT (use_p));
4795 else
4797 if (double_reduc && gimple_code (USE_STMT (use_p)) == GIMPLE_PHI)
4799 tree phi_res = PHI_RESULT (USE_STMT (use_p));
4801 FOR_EACH_IMM_USE_FAST (phi_use_p, phi_imm_iter, phi_res)
4803 if (!flow_bb_inside_loop_p (loop,
4804 gimple_bb (USE_STMT (phi_use_p)))
4805 && !is_gimple_debug (USE_STMT (phi_use_p)))
4806 phis.safe_push (USE_STMT (phi_use_p));
4812 FOR_EACH_VEC_ELT (phis, i, exit_phi)
4814 /* Replace the uses: */
4815 orig_name = PHI_RESULT (exit_phi);
4816 scalar_result = scalar_results[k];
4817 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
4818 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
4819 SET_USE (use_p, scalar_result);
4822 phis.release ();
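/* A minimal illustrative sketch (made up for exposition, not used by the
   vectorizer) of scheme 3 above - "reduce using scalar code" - for a PLUS
   reduction over a 4-element vector of ints: each array element stands in
   for one BIT_FIELD_REF extraction and is combined with the running
   scalar result.  */

static int
reduc_epilog_scalar_code_example (const int partial[4])
{
  int s = partial[0];                  /* s = extract_field <v_out2, 0>  */
  for (int off = 1; off < 4; off++)    /* Remaining elements.            */
    s = s + partial[off];              /* s = op <s, s'>                 */
  return s;
}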
4827 /* Function vectorizable_reduction.
4829 Check if STMT performs a reduction operation that can be vectorized.
4830 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
4831 stmt to replace it, put it in VEC_STMT, and insert it at GSI.
4832 Return FALSE if not a vectorizable STMT, TRUE otherwise.
4834 This function also handles reduction idioms (patterns) that have been
4835 recognized in advance during vect_pattern_recog. In this case, STMT may be
4836 of this form:
4837 X = pattern_expr (arg0, arg1, ..., X)
4838 and its STMT_VINFO_RELATED_STMT points to the last stmt in the original
4839 sequence that had been detected and replaced by the pattern-stmt (STMT).
4841 In some cases of reduction patterns, the type of the reduction variable X is
4842 different than the type of the other arguments of STMT.
4843 In such cases, the vectype that is used when transforming STMT into a vector
4844 stmt is different than the vectype that is used to determine the
4845 vectorization factor, because it consists of a different number of elements
4846 than the actual number of elements that are being operated upon in parallel.
4848 For example, consider an accumulation of shorts into an int accumulator.
4849 On some targets it's possible to vectorize this pattern operating on 8
4850 shorts at a time (hence, the vectype for purposes of determining the
4851 vectorization factor should be V8HI); on the other hand, the vectype that
4852 is used to create the vector form is actually V4SI (the type of the result).
4854 Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
4855 indicates what is the actual level of parallelism (V8HI in the example), so
4856 that the right vectorization factor would be derived. This vectype
4857 corresponds to the type of arguments to the reduction stmt, and should *NOT*
4858 be used to create the vectorized stmt. The right vectype for the vectorized
4859 stmt is obtained from the type of the result X:
4860 get_vectype_for_scalar_type (TREE_TYPE (X))
4862 This means that, contrary to "regular" reductions (or "regular" stmts in
4863 general), the following equation:
4864 STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (TREE_TYPE (X))
4865 does *NOT* necessarily hold for reduction patterns. */
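/* As a concrete (made-up) instance of the pattern case above, consider

     short a[N]; int acc = 0;
     for (i = 0; i < N; i++)
       acc = acc + a[i];

   on a target with V8HI/V4SI vectors: the vectorization factor is derived
   from V8HI (8 shorts are processed per vector iteration), while the
   vectorized statement for the recognized pattern produces a V4SI value,
   i.e. STMT_VINFO_VECTYPE (V8HI) differs from
   get_vectype_for_scalar_type (TREE_TYPE (X)) (V4SI).  */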
4867 bool
4868 vectorizable_reduction (gimple *stmt, gimple_stmt_iterator *gsi,
4869 gimple **vec_stmt, slp_tree slp_node)
4871 tree vec_dest;
4872 tree scalar_dest;
4873 tree loop_vec_def0 = NULL_TREE, loop_vec_def1 = NULL_TREE;
4874 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4875 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
4876 tree vectype_in = NULL_TREE;
4877 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4878 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4879 enum tree_code code, orig_code, epilog_reduc_code;
4880 machine_mode vec_mode;
4881 int op_type;
4882 optab optab, reduc_optab;
4883 tree new_temp = NULL_TREE;
4884 tree def;
4885 gimple *def_stmt;
4886 enum vect_def_type dt;
4887 gphi *new_phi = NULL;
4888 tree scalar_type;
4889 bool is_simple_use;
4890 gimple *orig_stmt;
4891 stmt_vec_info orig_stmt_info;
4892 tree expr = NULL_TREE;
4893 int i;
4894 int ncopies;
4895 int epilog_copies;
4896 stmt_vec_info prev_stmt_info, prev_phi_info;
4897 bool single_defuse_cycle = false;
4898 tree reduc_def = NULL_TREE;
4899 gimple *new_stmt = NULL;
4900 int j;
4901 tree ops[3];
4902 bool nested_cycle = false, found_nested_cycle_def = false;
4903 gimple *reduc_def_stmt = NULL;
4904 bool double_reduc = false, dummy;
4905 basic_block def_bb;
4906 struct loop * def_stmt_loop, *outer_loop = NULL;
4907 tree def_arg;
4908 gimple *def_arg_stmt;
4909 auto_vec<tree> vec_oprnds0;
4910 auto_vec<tree> vec_oprnds1;
4911 auto_vec<tree> vect_defs;
4912 auto_vec<gimple *> phis;
4913 int vec_num;
4914 tree def0, def1, tem, op0, op1 = NULL_TREE;
4915 bool first_p = true;
4917 /* In case of reduction chain we switch to the first stmt in the chain, but
4918 we don't update STMT_INFO, since only the last stmt is marked as reduction
4919 and has reduction properties. */
4920 if (GROUP_FIRST_ELEMENT (stmt_info)
4921 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
4923 stmt = GROUP_FIRST_ELEMENT (stmt_info);
4924 first_p = false;
4927 if (nested_in_vect_loop_p (loop, stmt))
4929 outer_loop = loop;
4930 loop = loop->inner;
4931 nested_cycle = true;
4934 /* 1. Is vectorizable reduction? */
4935 /* Not supportable if the reduction variable is used in the loop, unless
4936 it's a reduction chain. */
4937 if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer
4938 && !GROUP_FIRST_ELEMENT (stmt_info))
4939 return false;
4941 /* Reductions that are not used even in an enclosing outer-loop
4942 are expected to be "live" (used out of the loop). */
4943 if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope
4944 && !STMT_VINFO_LIVE_P (stmt_info))
4945 return false;
4947 /* Make sure it was already recognized as a reduction computation. */
4948 if (STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmt)) != vect_reduction_def
4949 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmt)) != vect_nested_cycle)
4950 return false;
4952 /* 2. Has this been recognized as a reduction pattern?
4954 Check if STMT represents a pattern that has been recognized
4955 in earlier analysis stages. For stmts that represent a pattern,
4956 the STMT_VINFO_RELATED_STMT field records the last stmt in
4957 the original sequence that constitutes the pattern. */
4959 orig_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt));
4960 if (orig_stmt)
4962 orig_stmt_info = vinfo_for_stmt (orig_stmt);
4963 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
4964 gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
4967 /* 3. Check the operands of the operation. The first operands are defined
4968 inside the loop body. The last operand is the reduction variable,
4969 which is defined by the loop-header-phi. */
4971 gcc_assert (is_gimple_assign (stmt));
4973 /* Flatten RHS. */
4974 switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
4976 case GIMPLE_SINGLE_RHS:
4977 op_type = TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt));
4978 if (op_type == ternary_op)
4980 tree rhs = gimple_assign_rhs1 (stmt);
4981 ops[0] = TREE_OPERAND (rhs, 0);
4982 ops[1] = TREE_OPERAND (rhs, 1);
4983 ops[2] = TREE_OPERAND (rhs, 2);
4984 code = TREE_CODE (rhs);
4986 else
4987 return false;
4988 break;
4990 case GIMPLE_BINARY_RHS:
4991 code = gimple_assign_rhs_code (stmt);
4992 op_type = TREE_CODE_LENGTH (code);
4993 gcc_assert (op_type == binary_op);
4994 ops[0] = gimple_assign_rhs1 (stmt);
4995 ops[1] = gimple_assign_rhs2 (stmt);
4996 break;
4998 case GIMPLE_TERNARY_RHS:
4999 code = gimple_assign_rhs_code (stmt);
5000 op_type = TREE_CODE_LENGTH (code);
5001 gcc_assert (op_type == ternary_op);
5002 ops[0] = gimple_assign_rhs1 (stmt);
5003 ops[1] = gimple_assign_rhs2 (stmt);
5004 ops[2] = gimple_assign_rhs3 (stmt);
5005 break;
5007 case GIMPLE_UNARY_RHS:
5008 return false;
5010 default:
5011 gcc_unreachable ();
5013 /* The default is that the reduction variable is the last in statement. */
5014 int reduc_index = op_type - 1;
5016 if (code == COND_EXPR && slp_node)
5017 return false;
5019 scalar_dest = gimple_assign_lhs (stmt);
5020 scalar_type = TREE_TYPE (scalar_dest);
5021 if (!POINTER_TYPE_P (scalar_type) && !INTEGRAL_TYPE_P (scalar_type)
5022 && !SCALAR_FLOAT_TYPE_P (scalar_type))
5023 return false;
5025 /* Do not try to vectorize bit-precision reductions. */
5026 if ((TYPE_PRECISION (scalar_type)
5027 != GET_MODE_PRECISION (TYPE_MODE (scalar_type))))
5028 return false;
5030 /* All uses but the last are expected to be defined in the loop.
5031 The last use is the reduction variable. In case of nested cycle this
5032 assumption is not true: we use reduc_index to record the index of the
5033 reduction variable. */
5034 for (i = 0; i < op_type - 1; i++)
5036 /* The condition of COND_EXPR is checked in vectorizable_condition(). */
5037 if (i == 0 && code == COND_EXPR)
5038 continue;
5040 is_simple_use = vect_is_simple_use_1 (ops[i], stmt, loop_vinfo, NULL,
5041 &def_stmt, &def, &dt, &tem);
5042 if (!vectype_in)
5043 vectype_in = tem;
5044 gcc_assert (is_simple_use);
5046 if (dt != vect_internal_def
5047 && dt != vect_external_def
5048 && dt != vect_constant_def
5049 && dt != vect_induction_def
5050 && !(dt == vect_nested_cycle && nested_cycle))
5051 return false;
5053 if (dt == vect_nested_cycle)
5055 found_nested_cycle_def = true;
5056 reduc_def_stmt = def_stmt;
5057 reduc_index = i;
5061 is_simple_use = vect_is_simple_use_1 (ops[i], stmt, loop_vinfo, NULL,
5062 &def_stmt, &def, &dt, &tem);
5063 if (!vectype_in)
5064 vectype_in = tem;
5065 gcc_assert (is_simple_use);
5066 if (!found_nested_cycle_def)
5067 reduc_def_stmt = def_stmt;
5069 if (reduc_def_stmt && gimple_code (reduc_def_stmt) != GIMPLE_PHI)
5070 return false;
5072 if (!(dt == vect_reduction_def
5073 || dt == vect_nested_cycle
5074 || ((dt == vect_internal_def || dt == vect_external_def
5075 || dt == vect_constant_def || dt == vect_induction_def)
5076 && nested_cycle && found_nested_cycle_def)))
5078 /* For pattern recognized stmts, orig_stmt might be a reduction,
5079 but some helper statements for the pattern might not, or
5080 might be COND_EXPRs with reduction uses in the condition. */
5081 gcc_assert (orig_stmt);
5082 return false;
5085 gimple *tmp = vect_is_simple_reduction (loop_vinfo, reduc_def_stmt,
5086 !nested_cycle, &dummy, false);
5087 if (orig_stmt)
5088 gcc_assert (tmp == orig_stmt
5089 || GROUP_FIRST_ELEMENT (vinfo_for_stmt (tmp)) == orig_stmt);
5090 else
5091 /* We changed STMT to be the first stmt in reduction chain, hence we
5092 check that in this case the first element in the chain is STMT. */
5093 gcc_assert (stmt == tmp
5094 || GROUP_FIRST_ELEMENT (vinfo_for_stmt (tmp)) == stmt);
5096 if (STMT_VINFO_LIVE_P (vinfo_for_stmt (reduc_def_stmt)))
5097 return false;
5099 if (slp_node || PURE_SLP_STMT (stmt_info))
5100 ncopies = 1;
5101 else
5102 ncopies = (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
5103 / TYPE_VECTOR_SUBPARTS (vectype_in));
5105 gcc_assert (ncopies >= 1);
5107 vec_mode = TYPE_MODE (vectype_in);
5109 if (code == COND_EXPR)
5111 if (!vectorizable_condition (stmt, gsi, NULL, ops[reduc_index], 0, NULL))
5113 if (dump_enabled_p ())
5114 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5115 "unsupported condition in reduction\n");
5117 return false;
5120 else
5122 /* 4. Supportable by target? */
5124 if (code == LSHIFT_EXPR || code == RSHIFT_EXPR
5125 || code == LROTATE_EXPR || code == RROTATE_EXPR)
5127 /* Shifts and rotates are only supported by vectorizable_shift,
5128 not vectorizable_reduction. */
5129 if (dump_enabled_p ())
5130 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5131 "unsupported shift or rotation.\n");
5132 return false;
5135 /* 4.1. check support for the operation in the loop */
5136 optab = optab_for_tree_code (code, vectype_in, optab_default);
5137 if (!optab)
5139 if (dump_enabled_p ())
5140 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5141 "no optab.\n");
5143 return false;
5146 if (optab_handler (optab, vec_mode) == CODE_FOR_nothing)
5148 if (dump_enabled_p ())
5149 dump_printf (MSG_NOTE, "op not supported by target.\n");
5151 if (GET_MODE_SIZE (vec_mode) != UNITS_PER_WORD
5152 || LOOP_VINFO_VECT_FACTOR (loop_vinfo)
5153 < vect_min_worthwhile_factor (code))
5154 return false;
5156 if (dump_enabled_p ())
5157 dump_printf (MSG_NOTE, "proceeding using word mode.\n");
5160 /* Worthwhile without SIMD support? */
5161 if (!VECTOR_MODE_P (TYPE_MODE (vectype_in))
5162 && LOOP_VINFO_VECT_FACTOR (loop_vinfo)
5163 < vect_min_worthwhile_factor (code))
5165 if (dump_enabled_p ())
5166 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5167 "not worthwhile without SIMD support.\n");
5169 return false;
5173 /* 4.2. Check support for the epilog operation.
5175 If STMT represents a reduction pattern, then the type of the
5176 reduction variable may be different than the type of the rest
5177 of the arguments. For example, consider the case of accumulation
5178 of shorts into an int accumulator. The original code:
5179 S1: int_a = (int) short_a;
5180 orig_stmt-> S2: int_acc = plus <int_a ,int_acc>;
5182 was replaced with:
5183 STMT: int_acc = widen_sum <short_a, int_acc>
5185 This means that:
5186 1. The tree-code that is used to create the vector operation in the
5187 epilog code (that reduces the partial results) is not the
5188 tree-code of STMT, but is rather the tree-code of the original
5189 stmt from the pattern that STMT is replacing. I.e, in the example
5190 above we want to use 'widen_sum' in the loop, but 'plus' in the
5191 epilog.
5192 2. The type (mode) we use to check available target support
5193 for the vector operation to be created in the *epilog*, is
5194 determined by the type of the reduction variable (in the example
5195 above we'd check this: optab_handler (plus_optab, vect_int_mode])).
5196 However the type (mode) we use to check available target support
5197 for the vector operation to be created *inside the loop*, is
5198 determined by the type of the other arguments to STMT (in the
5199 example we'd check this: optab_handler (widen_sum_optab,
5200 vect_short_mode)).
5202 This is contrary to "regular" reductions, in which the types of all
5203 the arguments are the same as the type of the reduction variable.
5204 For "regular" reductions we can therefore use the same vector type
5205 (and also the same tree-code) when generating the epilog code and
5206 when generating the code inside the loop. */
5208 if (orig_stmt)
5210 /* This is a reduction pattern: get the vectype from the type of the
5211 reduction variable, and get the tree-code from orig_stmt. */
5212 orig_code = gimple_assign_rhs_code (orig_stmt);
5213 gcc_assert (vectype_out);
5214 vec_mode = TYPE_MODE (vectype_out);
5216 else
5218 /* Regular reduction: the same vectype and tree-code used for the
5219 vector code inside the loop can also be used for the epilog code. */
5220 orig_code = code;
5223 if (nested_cycle)
5225 def_bb = gimple_bb (reduc_def_stmt);
5226 def_stmt_loop = def_bb->loop_father;
5227 def_arg = PHI_ARG_DEF_FROM_EDGE (reduc_def_stmt,
5228 loop_preheader_edge (def_stmt_loop));
5229 if (TREE_CODE (def_arg) == SSA_NAME
5230 && (def_arg_stmt = SSA_NAME_DEF_STMT (def_arg))
5231 && gimple_code (def_arg_stmt) == GIMPLE_PHI
5232 && flow_bb_inside_loop_p (outer_loop, gimple_bb (def_arg_stmt))
5233 && vinfo_for_stmt (def_arg_stmt)
5234 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_arg_stmt))
5235 == vect_double_reduction_def)
5236 double_reduc = true;
5239 epilog_reduc_code = ERROR_MARK;
5240 if (reduction_code_for_scalar_code (orig_code, &epilog_reduc_code))
5242 reduc_optab = optab_for_tree_code (epilog_reduc_code, vectype_out,
5243 optab_default);
5244 if (!reduc_optab)
5246 if (dump_enabled_p ())
5247 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5248 "no optab for reduction.\n");
5250 epilog_reduc_code = ERROR_MARK;
5252 else if (optab_handler (reduc_optab, vec_mode) == CODE_FOR_nothing)
5254 optab = scalar_reduc_to_vector (reduc_optab, vectype_out);
5255 if (optab_handler (optab, vec_mode) == CODE_FOR_nothing)
5257 if (dump_enabled_p ())
5258 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5259 "reduc op not supported by target.\n");
5261 epilog_reduc_code = ERROR_MARK;
5265 else
5267 if (!nested_cycle || double_reduc)
5269 if (dump_enabled_p ())
5270 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5271 "no reduc code for scalar code.\n");
5273 return false;
5277 if (double_reduc && ncopies > 1)
5279 if (dump_enabled_p ())
5280 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5281 "multiple types in double reduction\n");
5283 return false;
5286 /* In case of widening multiplication by a constant, we update the type
5287 of the constant to be the type of the other operand. We check that the
5288 constant fits the type in the pattern recognition pass. */
5289 if (code == DOT_PROD_EXPR
5290 && !types_compatible_p (TREE_TYPE (ops[0]), TREE_TYPE (ops[1])))
5292 if (TREE_CODE (ops[0]) == INTEGER_CST)
5293 ops[0] = fold_convert (TREE_TYPE (ops[1]), ops[0]);
5294 else if (TREE_CODE (ops[1]) == INTEGER_CST)
5295 ops[1] = fold_convert (TREE_TYPE (ops[0]), ops[1]);
5296 else
5298 if (dump_enabled_p ())
5299 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5300 "invalid types in dot-prod\n");
5302 return false;
5306 if (!vec_stmt) /* transformation not required. */
5308 if (first_p
5309 && !vect_model_reduction_cost (stmt_info, epilog_reduc_code, ncopies,
5310 reduc_index))
5311 return false;
5312 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
5313 return true;
5316 /** Transform. **/
5318 if (dump_enabled_p ())
5319 dump_printf_loc (MSG_NOTE, vect_location, "transform reduction.\n");
5321 /* FORNOW: Multiple types are not supported for condition. */
5322 if (code == COND_EXPR)
5323 gcc_assert (ncopies == 1);
5325 /* Create the destination vector */
5326 vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
5328 /* In case the vectorization factor (VF) is bigger than the number
5329 of elements that we can fit in a vectype (nunits), we have to generate
5330 more than one vector stmt - i.e - we need to "unroll" the
5331 vector stmt by a factor VF/nunits. For more details see documentation
5332 in vectorizable_operation. */
5334 /* If the reduction is used in an outer loop we need to generate
5335 VF intermediate results, like so (e.g. for ncopies=2):
5336 r0 = phi (init, r0)
5337 r1 = phi (init, r1)
5338 r0 = x0 + r0;
5339 r1 = x1 + r1;
5340 (i.e. we generate VF results in 2 registers).
5341 In this case we have a separate def-use cycle for each copy, and therefore
5342 for each copy we get the vector def for the reduction variable from the
5343 respective phi node created for this copy.
5345 Otherwise (the reduction is unused in the loop nest), we can combine
5346 together intermediate results, like so (e.g. for ncopies=2):
5347 r = phi (init, r)
5348 r = x0 + r;
5349 r = x1 + r;
5350 (i.e. we generate VF/2 results in a single register).
5351 In this case for each copy we get the vector def for the reduction variable
5352 from the vectorized reduction operation generated in the previous iteration.
5355 if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope)
5357 single_defuse_cycle = true;
5358 epilog_copies = 1;
5360 else
5361 epilog_copies = ncopies;
5363 prev_stmt_info = NULL;
5364 prev_phi_info = NULL;
5365 if (slp_node)
5366 vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
5367 else
5369 vec_num = 1;
5370 vec_oprnds0.create (1);
5371 if (op_type == ternary_op)
5372 vec_oprnds1.create (1);
5375 phis.create (vec_num);
5376 vect_defs.create (vec_num);
5377 if (!slp_node)
5378 vect_defs.quick_push (NULL_TREE);
5380 for (j = 0; j < ncopies; j++)
5382 if (j == 0 || !single_defuse_cycle)
5384 for (i = 0; i < vec_num; i++)
5386 /* Create the reduction-phi that defines the reduction
5387 operand. */
5388 new_phi = create_phi_node (vec_dest, loop->header);
5389 set_vinfo_for_stmt (new_phi,
5390 new_stmt_vec_info (new_phi, loop_vinfo,
5391 NULL));
5392 if (j == 0 || slp_node)
5393 phis.quick_push (new_phi);
5397 if (code == COND_EXPR)
5399 gcc_assert (!slp_node);
5400 vectorizable_condition (stmt, gsi, vec_stmt,
5401 PHI_RESULT (phis[0]),
5402 reduc_index, NULL);
5403 /* Multiple types are not supported for condition. */
5404 break;
5407 /* Handle uses. */
5408 if (j == 0)
5410 op0 = ops[!reduc_index];
5411 if (op_type == ternary_op)
5413 if (reduc_index == 0)
5414 op1 = ops[2];
5415 else
5416 op1 = ops[1];
5419 if (slp_node)
5420 vect_get_vec_defs (op0, op1, stmt, &vec_oprnds0, &vec_oprnds1,
5421 slp_node, -1);
5422 else
5424 loop_vec_def0 = vect_get_vec_def_for_operand (ops[!reduc_index],
5425 stmt, NULL);
5426 vec_oprnds0.quick_push (loop_vec_def0);
5427 if (op_type == ternary_op)
5429 loop_vec_def1 = vect_get_vec_def_for_operand (op1, stmt,
5430 NULL);
5431 vec_oprnds1.quick_push (loop_vec_def1);
5435 else
5437 if (!slp_node)
5439 enum vect_def_type dt;
5440 gimple *dummy_stmt;
5441 tree dummy;
5443 vect_is_simple_use (ops[!reduc_index], stmt, loop_vinfo, NULL,
5444 &dummy_stmt, &dummy, &dt);
5445 loop_vec_def0 = vect_get_vec_def_for_stmt_copy (dt,
5446 loop_vec_def0);
5447 vec_oprnds0[0] = loop_vec_def0;
5448 if (op_type == ternary_op)
5450 vect_is_simple_use (op1, stmt, loop_vinfo, NULL, &dummy_stmt,
5451 &dummy, &dt);
5452 loop_vec_def1 = vect_get_vec_def_for_stmt_copy (dt,
5453 loop_vec_def1);
5454 vec_oprnds1[0] = loop_vec_def1;
5458 if (single_defuse_cycle)
5459 reduc_def = gimple_assign_lhs (new_stmt);
5461 STMT_VINFO_RELATED_STMT (prev_phi_info) = new_phi;
5464 FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
5466 if (slp_node)
5467 reduc_def = PHI_RESULT (phis[i]);
5468 else
5470 if (!single_defuse_cycle || j == 0)
5471 reduc_def = PHI_RESULT (new_phi);
5474 def1 = ((op_type == ternary_op)
5475 ? vec_oprnds1[i] : NULL);
5476 if (op_type == binary_op)
5478 if (reduc_index == 0)
5479 expr = build2 (code, vectype_out, reduc_def, def0);
5480 else
5481 expr = build2 (code, vectype_out, def0, reduc_def);
5483 else
5485 if (reduc_index == 0)
5486 expr = build3 (code, vectype_out, reduc_def, def0, def1);
5487 else
5489 if (reduc_index == 1)
5490 expr = build3 (code, vectype_out, def0, reduc_def, def1);
5491 else
5492 expr = build3 (code, vectype_out, def0, def1, reduc_def);
5496 new_stmt = gimple_build_assign (vec_dest, expr);
5497 new_temp = make_ssa_name (vec_dest, new_stmt);
5498 gimple_assign_set_lhs (new_stmt, new_temp);
5499 vect_finish_stmt_generation (stmt, new_stmt, gsi);
5501 if (slp_node)
5503 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
5504 vect_defs.quick_push (new_temp);
5506 else
5507 vect_defs[0] = new_temp;
5510 if (slp_node)
5511 continue;
5513 if (j == 0)
5514 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
5515 else
5516 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
5518 prev_stmt_info = vinfo_for_stmt (new_stmt);
5519 prev_phi_info = vinfo_for_stmt (new_phi);
5522 /* Finalize the reduction-phi (set its arguments) and create the
5523 epilog reduction code. */
5524 if ((!single_defuse_cycle || code == COND_EXPR) && !slp_node)
5526 new_temp = gimple_assign_lhs (*vec_stmt);
5527 vect_defs[0] = new_temp;
5530 vect_create_epilog_for_reduction (vect_defs, stmt, epilog_copies,
5531 epilog_reduc_code, phis, reduc_index,
5532 double_reduc, slp_node);
5534 return true;
5537 /* Function vect_min_worthwhile_factor.
5539 For a loop where we could vectorize the operation indicated by CODE,
5540 return the minimum vectorization factor that makes it worthwhile
5541 to use generic vectors. */
int
5543 vect_min_worthwhile_factor (enum tree_code code)
5545 switch (code)
5547 case PLUS_EXPR:
5548 case MINUS_EXPR:
5549 case NEGATE_EXPR:
5550 return 4;
5552 case BIT_AND_EXPR:
5553 case BIT_IOR_EXPR:
5554 case BIT_XOR_EXPR:
5555 case BIT_NOT_EXPR:
5556 return 2;
5558 default:
5559 return INT_MAX;
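/* Illustrative usage sketch (an assumption about a typical caller, not
   code taken from this file): when there is no target support for the
   operation, a caller could reject the generic-vector fallback with

     if (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
         < vect_min_worthwhile_factor (code))
       return false;

   e.g. PLUS_EXPR requires a vectorization factor of at least 4, while
   BIT_AND_EXPR only requires 2.  */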
5564 /* Function vectorizable_induction
5566 Check if PHI performs an induction computation that can be vectorized.
5567 If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
5568 phi to replace it, put it in VEC_STMT, and add it to the same basic block.
5569 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
5571 bool
5572 vectorizable_induction (gimple *phi,
5573 gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
5574 gimple **vec_stmt)
5576 stmt_vec_info stmt_info = vinfo_for_stmt (phi);
5577 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5578 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5579 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5580 int nunits = TYPE_VECTOR_SUBPARTS (vectype);
5581 int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
5582 tree vec_def;
5584 gcc_assert (ncopies >= 1);
5585 /* FORNOW. These restrictions should be relaxed. */
5586 if (nested_in_vect_loop_p (loop, phi))
5588 imm_use_iterator imm_iter;
5589 use_operand_p use_p;
5590 gimple *exit_phi;
5591 edge latch_e;
5592 tree loop_arg;
5594 if (ncopies > 1)
5596 if (dump_enabled_p ())
5597 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5598 "multiple types in nested loop.\n");
5599 return false;
5602 exit_phi = NULL;
5603 latch_e = loop_latch_edge (loop->inner);
5604 loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
5605 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
5607 gimple *use_stmt = USE_STMT (use_p);
5608 if (is_gimple_debug (use_stmt))
5609 continue;
5611 if (!flow_bb_inside_loop_p (loop->inner, gimple_bb (use_stmt)))
5613 exit_phi = use_stmt;
5614 break;
5617 if (exit_phi)
5619 stmt_vec_info exit_phi_vinfo = vinfo_for_stmt (exit_phi);
5620 if (!(STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
5621 && !STMT_VINFO_LIVE_P (exit_phi_vinfo)))
5623 if (dump_enabled_p ())
5624 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5625 "inner-loop induction only used outside "
5626 "of the outer vectorized loop.\n");
5627 return false;
5632 if (!STMT_VINFO_RELEVANT_P (stmt_info))
5633 return false;
5635 /* FORNOW: SLP not supported. */
5636 if (STMT_SLP_TYPE (stmt_info))
5637 return false;
5639 gcc_assert (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def);
5641 if (gimple_code (phi) != GIMPLE_PHI)
5642 return false;
5644 if (!vec_stmt) /* transformation not required. */
5646 STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
5647 if (dump_enabled_p ())
5648 dump_printf_loc (MSG_NOTE, vect_location,
5649 "=== vectorizable_induction ===\n");
5650 vect_model_induction_cost (stmt_info, ncopies);
5651 return true;
5654 /** Transform. **/
5656 if (dump_enabled_p ())
5657 dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n");
5659 vec_def = get_initial_def_for_induction (phi);
5660 *vec_stmt = SSA_NAME_DEF_STMT (vec_def);
5661 return true;
5664 /* Function vectorizable_live_operation.
5666 STMT computes a value that is used outside the loop. Check if
5667 it can be supported. */
5669 bool
5670 vectorizable_live_operation (gimple *stmt,
5671 gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
5672 gimple **vec_stmt)
5674 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5675 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5676 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5677 int i;
5678 int op_type;
5679 tree op;
5680 tree def;
5681 gimple *def_stmt;
5682 enum vect_def_type dt;
5683 enum tree_code code;
5684 enum gimple_rhs_class rhs_class;
5686 gcc_assert (STMT_VINFO_LIVE_P (stmt_info));
5688 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def)
5689 return false;
5691 if (!is_gimple_assign (stmt))
5693 if (gimple_call_internal_p (stmt)
5694 && gimple_call_internal_fn (stmt) == IFN_GOMP_SIMD_LANE
5695 && gimple_call_lhs (stmt)
5696 && loop->simduid
5697 && TREE_CODE (gimple_call_arg (stmt, 0)) == SSA_NAME
5698 && loop->simduid
5699 == SSA_NAME_VAR (gimple_call_arg (stmt, 0)))
5701 edge e = single_exit (loop);
5702 basic_block merge_bb = e->dest;
5703 imm_use_iterator imm_iter;
5704 use_operand_p use_p;
5705 tree lhs = gimple_call_lhs (stmt);
5707 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
5709 gimple *use_stmt = USE_STMT (use_p);
5710 if (gimple_code (use_stmt) == GIMPLE_PHI
5711 && gimple_bb (use_stmt) == merge_bb)
5713 if (vec_stmt)
5715 tree vfm1
5716 = build_int_cst (unsigned_type_node,
5717 loop_vinfo->vectorization_factor - 1);
5718 SET_PHI_ARG_DEF (use_stmt, e->dest_idx, vfm1);
5720 return true;
5725 return false;
5728 if (TREE_CODE (gimple_assign_lhs (stmt)) != SSA_NAME)
5729 return false;
5731 /* FORNOW. CHECKME. */
5732 if (nested_in_vect_loop_p (loop, stmt))
5733 return false;
5735 code = gimple_assign_rhs_code (stmt);
5736 op_type = TREE_CODE_LENGTH (code);
5737 rhs_class = get_gimple_rhs_class (code);
5738 gcc_assert (rhs_class != GIMPLE_UNARY_RHS || op_type == unary_op);
5739 gcc_assert (rhs_class != GIMPLE_BINARY_RHS || op_type == binary_op);
5741 /* FORNOW: support only if all uses are invariant. This means
5742 that the scalar operations can remain in place, unvectorized.
5743 The original last scalar value that they compute will be used. */
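/* Illustrative note (added example, not in the original sources): a live
   statement such as  last = x + 5  where x is defined before the loop has
   only external/constant operands, so it passes the checks below and is
   simply left in place; its final scalar value is what the code after the
   loop consumes.  */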
5745 for (i = 0; i < op_type; i++)
5747 if (rhs_class == GIMPLE_SINGLE_RHS)
5748 op = TREE_OPERAND (gimple_op (stmt, 1), i);
5749 else
5750 op = gimple_op (stmt, i + 1);
5751 if (op
5752 && !vect_is_simple_use (op, stmt, loop_vinfo, NULL, &def_stmt, &def,
5753 &dt))
5755 if (dump_enabled_p ())
5756 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5757 "use not simple.\n");
5758 return false;
5761 if (dt != vect_external_def && dt != vect_constant_def)
5762 return false;
5765 /* No transformation is required for the cases we currently support. */
5766 return true;
5769 /* Kill any debug uses outside LOOP of SSA names defined in STMT. */
5771 static void
5772 vect_loop_kill_debug_uses (struct loop *loop, gimple *stmt)
5774 ssa_op_iter op_iter;
5775 imm_use_iterator imm_iter;
5776 def_operand_p def_p;
5777 gimple *ustmt;
5779 FOR_EACH_PHI_OR_STMT_DEF (def_p, stmt, op_iter, SSA_OP_DEF)
5781 FOR_EACH_IMM_USE_STMT (ustmt, imm_iter, DEF_FROM_PTR (def_p))
5783 basic_block bb;
5785 if (!is_gimple_debug (ustmt))
5786 continue;
5788 bb = gimple_bb (ustmt);
5790 if (!flow_bb_inside_loop_p (loop, bb))
5792 if (gimple_debug_bind_p (ustmt))
5794 if (dump_enabled_p ())
5795 dump_printf_loc (MSG_NOTE, vect_location,
5796 "killing debug use\n");
5798 gimple_debug_bind_reset_value (ustmt);
5799 update_stmt (ustmt);
5801 else
5802 gcc_unreachable ();
5809 /* This function builds ni_name = number of iterations. Statements
5810 are emitted on the loop preheader edge. */
5812 static tree
5813 vect_build_loop_niters (loop_vec_info loop_vinfo)
5815 tree ni = unshare_expr (LOOP_VINFO_NITERS (loop_vinfo));
5816 if (TREE_CODE (ni) == INTEGER_CST)
5817 return ni;
5818 else
5820 tree ni_name, var;
5821 gimple_seq stmts = NULL;
5822 edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
5824 var = create_tmp_var (TREE_TYPE (ni), "niters");
5825 ni_name = force_gimple_operand (ni, &stmts, false, var);
5826 if (stmts)
5827 gsi_insert_seq_on_edge_immediate (pe, stmts);
5829 return ni_name;
5834 /* This function generates the following statements:
5836 ni_name = number of iterations the loop executes
5837 ratio = ni_name / vf
5838 ratio_mult_vf_name = ratio * vf
5840 and places them on the loop preheader edge. */
5842 static void
5843 vect_generate_tmps_on_preheader (loop_vec_info loop_vinfo,
5844 tree ni_name,
5845 tree *ratio_mult_vf_name_ptr,
5846 tree *ratio_name_ptr)
5848 tree ni_minus_gap_name;
5849 tree var;
5850 tree ratio_name;
5851 tree ratio_mult_vf_name;
5852 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
5853 edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
5854 tree log_vf;
5856 log_vf = build_int_cst (TREE_TYPE (ni_name), exact_log2 (vf));
5858 /* If an epilogue loop is required because of data accesses with gaps, we
5859 subtract one iteration from the total number of iterations here for
5860 correct calculation of RATIO. */
5861 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
5863 ni_minus_gap_name = fold_build2 (MINUS_EXPR, TREE_TYPE (ni_name),
5864 ni_name,
5865 build_one_cst (TREE_TYPE (ni_name)));
5866 if (!is_gimple_val (ni_minus_gap_name))
5868 var = create_tmp_var (TREE_TYPE (ni_name), "ni_gap");
5869 gimple *stmts = NULL;
5870 ni_minus_gap_name = force_gimple_operand (ni_minus_gap_name, &stmts,
5871 true, var);
5872 gsi_insert_seq_on_edge_immediate (pe, stmts);
5875 else
5876 ni_minus_gap_name = ni_name;
5878 /* Create: ratio = ni >> log2(vf) */
5879 /* ??? As we have ni == number of latch executions + 1, ni could
5880 have overflowed to zero. So avoid computing ratio based on ni
5881 but compute it using the fact that we know ratio will be at least
5882 one, thus via (ni - vf) >> log2(vf) + 1. */
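/* Worked example (added illustration, not in the original sources):
   with vf == 4, log2(vf) == 2 and ni == 10, both forms agree:
   10 >> 2 == 2 and ((10 - 4) >> 2) + 1 == 2.  If ni has wrapped to 0
   (2^32 latch executions in a 32-bit type), the naive 0 >> 2 would give
   0, whereas ((0u - 4u) >> 2) + 1 == 0x40000000, the correct ratio.  */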
5883 ratio_name
5884 = fold_build2 (PLUS_EXPR, TREE_TYPE (ni_name),
5885 fold_build2 (RSHIFT_EXPR, TREE_TYPE (ni_name),
5886 fold_build2 (MINUS_EXPR, TREE_TYPE (ni_name),
5887 ni_minus_gap_name,
5888 build_int_cst
5889 (TREE_TYPE (ni_name), vf)),
5890 log_vf),
5891 build_int_cst (TREE_TYPE (ni_name), 1));
5892 if (!is_gimple_val (ratio_name))
5894 var = create_tmp_var (TREE_TYPE (ni_name), "bnd");
5895 gimple *stmts = NULL;
5896 ratio_name = force_gimple_operand (ratio_name, &stmts, true, var);
5897 gsi_insert_seq_on_edge_immediate (pe, stmts);
5899 *ratio_name_ptr = ratio_name;
5901 /* Create: ratio_mult_vf = ratio << log2 (vf). */
5903 if (ratio_mult_vf_name_ptr)
5905 ratio_mult_vf_name = fold_build2 (LSHIFT_EXPR, TREE_TYPE (ratio_name),
5906 ratio_name, log_vf);
5907 if (!is_gimple_val (ratio_mult_vf_name))
5909 var = create_tmp_var (TREE_TYPE (ni_name), "ratio_mult_vf");
5910 gimple *stmts = NULL;
5911 ratio_mult_vf_name = force_gimple_operand (ratio_mult_vf_name, &stmts,
5912 true, var);
5913 gsi_insert_seq_on_edge_immediate (pe, stmts);
5915 *ratio_mult_vf_name_ptr = ratio_mult_vf_name;
5918 return;
5922 /* Function vect_transform_loop.
5924 The analysis phase has determined that the loop is vectorizable.
5925 Vectorize the loop - create vectorized stmts to replace the scalar
5926 stmts in the loop, and update the loop exit condition. */
5928 void
5929 vect_transform_loop (loop_vec_info loop_vinfo)
5931 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5932 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
5933 int nbbs = loop->num_nodes;
5934 int i;
5935 tree ratio = NULL;
5936 int vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
5937 bool grouped_store;
5938 bool slp_scheduled = false;
5939 gimple *stmt, *pattern_stmt;
5940 gimple_seq pattern_def_seq = NULL;
5941 gimple_stmt_iterator pattern_def_si = gsi_none ();
5942 bool transform_pattern_stmt = false;
5943 bool check_profitability = false;
5944 int th;
5945 /* Record number of iterations before we started tampering with the profile. */
5946 gcov_type expected_iterations = expected_loop_iterations_unbounded (loop);
5948 if (dump_enabled_p ())
5949 dump_printf_loc (MSG_NOTE, vect_location, "=== vec_transform_loop ===\n");
5951 /* If the profile is imprecise, we have a chance to fix it up. */
5952 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
5953 expected_iterations = LOOP_VINFO_INT_NITERS (loop_vinfo);
5955 /* Use the more conservative vectorization threshold. If the number
5956 of iterations is constant assume the cost check has been performed
5957 by our caller. If the threshold makes all loops profitable that
5958 run at least the vectorization factor number of times checking
5959 is pointless, too. */
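/* Illustrative note (added example, not in the original sources): with a
   vectorization factor of 8, a cost-model threshold below 7 makes every
   loop that reaches the vector code profitable, so no runtime check is
   emitted; a threshold of, say, 20 combined with an unknown iteration
   count does trigger the runtime profitability check below.  */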
5960 th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
5961 if (th >= LOOP_VINFO_VECT_FACTOR (loop_vinfo) - 1
5962 && !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
5964 if (dump_enabled_p ())
5965 dump_printf_loc (MSG_NOTE, vect_location,
5966 "Profitability threshold is %d loop iterations.\n",
5967 th);
5968 check_profitability = true;
5971 /* Version the loop first, if required, so the profitability check
5972 comes first. */
5974 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
5975 || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
5977 vect_loop_versioning (loop_vinfo, th, check_profitability);
5978 check_profitability = false;
5981 tree ni_name = vect_build_loop_niters (loop_vinfo);
5982 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = ni_name;
5984 /* Peel the loop if there are data refs with unknown alignment.
5985 Only one data ref with unknown store is allowed. */
5987 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo))
5989 vect_do_peeling_for_alignment (loop_vinfo, ni_name,
5990 th, check_profitability);
5991 check_profitability = false;
5992 /* The above adjusts LOOP_VINFO_NITERS, so ni_name must
5993 be re-computed. */
5994 ni_name = NULL_TREE;
5997 /* If the loop has a symbolic number of iterations 'n' (i.e. it's not a
5998 compile time constant), or it is a constant that is not divisible by the
5999 vectorization factor, then an epilog loop needs to be created.
6000 We therefore duplicate the loop: the original loop will be vectorized,
6001 and will compute the first (n/VF) iterations. The second copy of the loop
6002 will remain scalar and will compute the remaining (n%VF) iterations.
6003 (VF is the vectorization factor). */
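/* Illustrative note (added example, not in the original sources): with
   n == 1003 and VF == 4 the vectorized loop runs 1003/4 == 250 vector
   iterations, covering the first 1000 scalar iterations, and the scalar
   epilog loop executes the remaining 1003%4 == 3 iterations.  */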
6005 if (LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
6006 || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
6008 tree ratio_mult_vf;
6009 if (!ni_name)
6010 ni_name = vect_build_loop_niters (loop_vinfo);
6011 vect_generate_tmps_on_preheader (loop_vinfo, ni_name, &ratio_mult_vf,
6012 &ratio);
6013 vect_do_peeling_for_loop_bound (loop_vinfo, ni_name, ratio_mult_vf,
6014 th, check_profitability);
6016 else if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
6017 ratio = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
6018 LOOP_VINFO_INT_NITERS (loop_vinfo) / vectorization_factor);
6019 else
6021 if (!ni_name)
6022 ni_name = vect_build_loop_niters (loop_vinfo);
6023 vect_generate_tmps_on_preheader (loop_vinfo, ni_name, NULL, &ratio);
6026 /* 1) Make sure the loop header has exactly two entries
6027 2) Make sure we have a preheader basic block. */
6029 gcc_assert (EDGE_COUNT (loop->header->preds) == 2);
6031 split_edge (loop_preheader_edge (loop));
6033 /* FORNOW: the vectorizer supports only loops whose body consists
6034 of one basic block (header + empty latch). When the vectorizer
6035 supports more involved loop forms, the order in which the BBs are
6036 traversed will need to be reconsidered. */
6038 for (i = 0; i < nbbs; i++)
6040 basic_block bb = bbs[i];
6041 stmt_vec_info stmt_info;
6043 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
6044 gsi_next (&si))
6046 gphi *phi = si.phi ();
6047 if (dump_enabled_p ())
6049 dump_printf_loc (MSG_NOTE, vect_location,
6050 "------>vectorizing phi: ");
6051 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
6052 dump_printf (MSG_NOTE, "\n");
6054 stmt_info = vinfo_for_stmt (phi);
6055 if (!stmt_info)
6056 continue;
6058 if (MAY_HAVE_DEBUG_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
6059 vect_loop_kill_debug_uses (loop, phi);
6061 if (!STMT_VINFO_RELEVANT_P (stmt_info)
6062 && !STMT_VINFO_LIVE_P (stmt_info))
6063 continue;
6065 if (STMT_VINFO_VECTYPE (stmt_info)
6066 && (TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info))
6067 != (unsigned HOST_WIDE_INT) vectorization_factor)
6068 && dump_enabled_p ())
6069 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
6071 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def)
6073 if (dump_enabled_p ())
6074 dump_printf_loc (MSG_NOTE, vect_location, "transform phi.\n");
6075 vect_transform_stmt (phi, NULL, NULL, NULL, NULL);
6079 pattern_stmt = NULL;
6080 for (gimple_stmt_iterator si = gsi_start_bb (bb);
6081 !gsi_end_p (si) || transform_pattern_stmt;)
6083 bool is_store;
6085 if (transform_pattern_stmt)
6086 stmt = pattern_stmt;
6087 else
6089 stmt = gsi_stmt (si);
6090 /* During vectorization remove existing clobber stmts. */
6091 if (gimple_clobber_p (stmt))
6093 unlink_stmt_vdef (stmt);
6094 gsi_remove (&si, true);
6095 release_defs (stmt);
6096 continue;
6100 if (dump_enabled_p ())
6102 dump_printf_loc (MSG_NOTE, vect_location,
6103 "------>vectorizing statement: ");
6104 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
6105 dump_printf (MSG_NOTE, "\n");
6108 stmt_info = vinfo_for_stmt (stmt);
6110 /* vector stmts created in the outer-loop during vectorization of
6111 stmts in an inner-loop may not have a stmt_info, and do not
6112 need to be vectorized. */
6113 if (!stmt_info)
6115 gsi_next (&si);
6116 continue;
6119 if (MAY_HAVE_DEBUG_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
6120 vect_loop_kill_debug_uses (loop, stmt);
6122 if (!STMT_VINFO_RELEVANT_P (stmt_info)
6123 && !STMT_VINFO_LIVE_P (stmt_info))
6125 if (STMT_VINFO_IN_PATTERN_P (stmt_info)
6126 && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
6127 && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
6128 || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
6130 stmt = pattern_stmt;
6131 stmt_info = vinfo_for_stmt (stmt);
6133 else
6135 gsi_next (&si);
6136 continue;
6139 else if (STMT_VINFO_IN_PATTERN_P (stmt_info)
6140 && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
6141 && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
6142 || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
6143 transform_pattern_stmt = true;
6145 /* If the pattern statement has def stmts, vectorize them too. */
6146 if (is_pattern_stmt_p (stmt_info))
6148 if (pattern_def_seq == NULL)
6150 pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
6151 pattern_def_si = gsi_start (pattern_def_seq);
6153 else if (!gsi_end_p (pattern_def_si))
6154 gsi_next (&pattern_def_si);
6155 if (pattern_def_seq != NULL)
6157 gimple *pattern_def_stmt = NULL;
6158 stmt_vec_info pattern_def_stmt_info = NULL;
6160 while (!gsi_end_p (pattern_def_si))
6162 pattern_def_stmt = gsi_stmt (pattern_def_si);
6163 pattern_def_stmt_info
6164 = vinfo_for_stmt (pattern_def_stmt);
6165 if (STMT_VINFO_RELEVANT_P (pattern_def_stmt_info)
6166 || STMT_VINFO_LIVE_P (pattern_def_stmt_info))
6167 break;
6168 gsi_next (&pattern_def_si);
6171 if (!gsi_end_p (pattern_def_si))
6173 if (dump_enabled_p ())
6175 dump_printf_loc (MSG_NOTE, vect_location,
6176 "==> vectorizing pattern def "
6177 "stmt: ");
6178 dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
6179 pattern_def_stmt, 0);
6180 dump_printf (MSG_NOTE, "\n");
6183 stmt = pattern_def_stmt;
6184 stmt_info = pattern_def_stmt_info;
6186 else
6188 pattern_def_si = gsi_none ();
6189 transform_pattern_stmt = false;
6192 else
6193 transform_pattern_stmt = false;
6196 if (STMT_VINFO_VECTYPE (stmt_info))
6198 unsigned int nunits
6199 = (unsigned int)
6200 TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
6201 if (!STMT_SLP_TYPE (stmt_info)
6202 && nunits != (unsigned int) vectorization_factor
6203 && dump_enabled_p ())
6204 /* For SLP, VF is set according to the unrolling factor, not
6205 to the vector size, hence this message does not apply to SLP. */
6206 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
6209 /* SLP. Schedule all the SLP instances when the first SLP stmt is
6210 reached. */
6211 if (STMT_SLP_TYPE (stmt_info))
6213 if (!slp_scheduled)
6215 slp_scheduled = true;
6217 if (dump_enabled_p ())
6218 dump_printf_loc (MSG_NOTE, vect_location,
6219 "=== scheduling SLP instances ===\n");
6221 vect_schedule_slp (loop_vinfo, NULL);
6224 /* Hybrid SLP stmts must be vectorized in addition to SLP. */
6225 if (!vinfo_for_stmt (stmt) || PURE_SLP_STMT (stmt_info))
6227 if (!transform_pattern_stmt && gsi_end_p (pattern_def_si))
6229 pattern_def_seq = NULL;
6230 gsi_next (&si);
6232 continue;
6236 /* -------- vectorize statement ------------ */
6237 if (dump_enabled_p ())
6238 dump_printf_loc (MSG_NOTE, vect_location, "transform statement.\n");
6240 grouped_store = false;
6241 is_store = vect_transform_stmt (stmt, &si, &grouped_store, NULL, NULL);
6242 if (is_store)
6244 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
6246 /* Interleaving. If IS_STORE is TRUE, the vectorization of the
6247 interleaving chain was completed - free all the stores in
6248 the chain. */
6249 gsi_next (&si);
6250 vect_remove_stores (GROUP_FIRST_ELEMENT (stmt_info));
6252 else
6254 /* Free the attached stmt_vec_info and remove the stmt. */
6255 gimple *store = gsi_stmt (si);
6256 free_stmt_vec_info (store);
6257 unlink_stmt_vdef (store);
6258 gsi_remove (&si, true);
6259 release_defs (store);
6262 /* Stores can only appear at the end of pattern statements. */
6263 gcc_assert (!transform_pattern_stmt);
6264 pattern_def_seq = NULL;
6266 else if (!transform_pattern_stmt && gsi_end_p (pattern_def_si))
6268 pattern_def_seq = NULL;
6269 gsi_next (&si);
6271 } /* stmts in BB */
6272 } /* BBs in loop */
6274 slpeel_make_loop_iterate_ntimes (loop, ratio);
6276 /* Reduce loop iterations by the vectorization factor. */
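/* Illustrative note (added example, not in the original sources): an
   iteration upper bound of 1000 with a vectorization factor of 4 becomes
   250 below; if peeling for gaps is in effect, one further iteration is
   subtracted, giving 249.  */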
6277 scale_loop_profile (loop, GCOV_COMPUTE_SCALE (1, vectorization_factor),
6278 expected_iterations / vectorization_factor);
6279 loop->nb_iterations_upper_bound
6280 = wi::udiv_floor (loop->nb_iterations_upper_bound, vectorization_factor);
6281 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
6282 && loop->nb_iterations_upper_bound != 0)
6283 loop->nb_iterations_upper_bound = loop->nb_iterations_upper_bound - 1;
6284 if (loop->any_estimate)
6286 loop->nb_iterations_estimate
6287 = wi::udiv_floor (loop->nb_iterations_estimate, vectorization_factor);
6288 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
6289 && loop->nb_iterations_estimate != 0)
6290 loop->nb_iterations_estimate = loop->nb_iterations_estimate - 1;
6293 if (dump_enabled_p ())
6295 dump_printf_loc (MSG_NOTE, vect_location,
6296 "LOOP VECTORIZED\n");
6297 if (loop->inner)
6298 dump_printf_loc (MSG_NOTE, vect_location,
6299 "OUTER LOOP VECTORIZED\n");
6300 dump_printf (MSG_NOTE, "\n");