gcc/tree-vect-loop.c
1 /* Loop Vectorization
2 Copyright (C) 2003-2016 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com> and
4 Ira Rosen <irar@il.ibm.com>
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "rtl.h"
28 #include "tree.h"
29 #include "gimple.h"
30 #include "cfghooks.h"
31 #include "tree-pass.h"
32 #include "ssa.h"
33 #include "optabs-tree.h"
34 #include "diagnostic-core.h"
35 #include "fold-const.h"
36 #include "stor-layout.h"
37 #include "cfganal.h"
38 #include "gimplify.h"
39 #include "gimple-iterator.h"
40 #include "gimplify-me.h"
41 #include "tree-ssa-loop-ivopts.h"
42 #include "tree-ssa-loop-manip.h"
43 #include "tree-ssa-loop-niter.h"
44 #include "cfgloop.h"
45 #include "params.h"
46 #include "tree-scalar-evolution.h"
47 #include "tree-vectorizer.h"
48 #include "gimple-fold.h"
49 #include "cgraph.h"
50 #include "tree-cfg.h"
52 /* Loop Vectorization Pass.
54 This pass tries to vectorize loops.
56 For example, the vectorizer transforms the following simple loop:
58 short a[N]; short b[N]; short c[N]; int i;
60 for (i=0; i<N; i++){
61 a[i] = b[i] + c[i];
64 as if it were manually vectorized by rewriting the source code into:
66 typedef int __attribute__((mode(V8HI))) v8hi;
67 short a[N]; short b[N]; short c[N]; int i;
68 v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
69 v8hi va, vb, vc;
71 for (i=0; i<N/8; i++){
72 vb = pb[i];
73 vc = pc[i];
74 va = vb + vc;
75 pa[i] = va;
78 The main entry to this pass is vectorize_loops(), in which
79 the vectorizer applies a set of analyses on a given set of loops,
80 followed by the actual vectorization transformation for the loops that
81 had successfully passed the analysis phase.
82 Throughout this pass we make a distinction between two types of
83 data: scalars (which are represented by SSA_NAMES), and memory references
84 ("data-refs"). These two types of data require different handling both
85 during analysis and transformation. The types of data-refs that the
86 vectorizer currently supports are ARRAY_REFS whose base is an array DECL
87 (not a pointer), and INDIRECT_REFS through pointers; both array and pointer
88 accesses are required to have a simple (consecutive) access pattern.
90 Analysis phase:
91 ===============
92 The driver for the analysis phase is vect_analyze_loop().
93 It applies a set of analyses, some of which rely on the scalar evolution
94 analyzer (scev) developed by Sebastian Pop.
96 During the analysis phase the vectorizer records some information
97 per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
98 loop, as well as general information about the loop as a whole, which is
99 recorded in a "loop_vec_info" struct attached to each loop.
101 Transformation phase:
102 =====================
103 The loop transformation phase scans all the stmts in the loop, and
104 creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
105 the loop that needs to be vectorized. It inserts the vector code sequence
106 just before the scalar stmt S, and records a pointer to the vector code
107 in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
108 attached to S). This pointer will be used for the vectorization of following
109 stmts which use the def of stmt S. Stmt S is removed if it writes to memory;
110 otherwise, we rely on dead code elimination for removing it.
112 For example, say stmt S1 was vectorized into stmt VS1:
114 VS1: vb = px[i];
115 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
116 S2: a = b;
118 To vectorize stmt S2, the vectorizer first finds the stmt that defines
119 the operand 'b' (S1), and gets the relevant vector def 'vb' from the
120 vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)). The
121 resulting sequence would be:
123 VS1: vb = px[i];
124 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
125 VS2: va = vb;
126 S2: a = b; STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2
128 Operands that are not SSA_NAMEs, are data-refs that appear in
129 load/store operations (like 'x[i]' in S1), and are handled differently.
131 Target modeling:
132 =================
133 Currently the only target-specific information that is used is the
134 size of the vector (in bytes) - "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".
135 Targets that can support different sizes of vectors will, for now, need
136 to specify one value for "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD". More
137 flexibility will be added in the future.
139 Since we only vectorize operations whose vector form can be
140 expressed using existing tree codes, to verify that an operation is
141 supported, the vectorizer checks the relevant optab at the relevant
142 machine_mode (e.g, optab_handler (add_optab, V8HImode)). If
143 the value found is CODE_FOR_nothing, then there's no target support, and
144 we can't vectorize the stmt.
146 For additional information on this project see:
147 http://gcc.gnu.org/projects/tree-ssa/vectorization.html
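/* Editorial note (not part of the original source): a minimal, hedged sketch
   of the optab query described above.  The helper name below is hypothetical;
   optab_handler, add_optab, V8HImode and CODE_FOR_nothing are the existing
   GCC interfaces this comment refers to.  */

static bool
example_v8hi_add_supported_p (void)
{
  /* CODE_FOR_nothing means the target provides no instruction for a
     V8HImode addition, so a stmt needing it cannot be vectorized.  */
  return optab_handler (add_optab, V8HImode) != CODE_FOR_nothing;
}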
150 static void vect_estimate_min_profitable_iters (loop_vec_info, int *, int *);
152 /* Function vect_determine_vectorization_factor
154 Determine the vectorization factor (VF). VF is the number of data elements
155 that are operated upon in parallel in a single iteration of the vectorized
156 loop. For example, when vectorizing a loop that operates on 4-byte elements,
157 on a target with a vector size (VS) of 16 bytes, the VF is set to 4, since 4
158 elements can fit in a single vector register.
160 We currently support vectorization of loops in which all types operated upon
161 are of the same size. Therefore this function currently sets VF according to
162 the size of the types operated upon, and fails if there are multiple sizes
163 in the loop.
165 VF is also the factor by which the loop iterations are strip-mined, e.g.:
166 original loop:
167 for (i=0; i<N; i++){
168 a[i] = b[i] + c[i];
171 vectorized loop:
172 for (i=0; i<N; i+=VF){
173 a[i:VF] = b[i:VF] + c[i:VF];
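/* Editorial note (not part of the original source): a hedged sketch of the
   VF arithmetic described above.  For a fixed vector size in bytes and the
   smallest scalar type operated upon in the loop, VF is the number of such
   elements that fit in one vector; the real code below derives this as
   TYPE_VECTOR_SUBPARTS of the chosen vectype.  The helper is hypothetical.  */

static unsigned int
example_vf_from_sizes (unsigned int vector_size_bytes,
		       unsigned int scalar_size_bytes)
{
  /* E.g. 16-byte vectors and 4-byte elements give VF = 4;
     2-byte elements would give VF = 8.  */
  return vector_size_bytes / scalar_size_bytes;
}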
177 static bool
178 vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
180 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
181 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
182 unsigned nbbs = loop->num_nodes;
183 unsigned int vectorization_factor = 0;
184 tree scalar_type;
185 gphi *phi;
186 tree vectype;
187 unsigned int nunits;
188 stmt_vec_info stmt_info;
189 unsigned i;
190 HOST_WIDE_INT dummy;
191 gimple *stmt, *pattern_stmt = NULL;
192 gimple_seq pattern_def_seq = NULL;
193 gimple_stmt_iterator pattern_def_si = gsi_none ();
194 bool analyze_pattern_stmt = false;
195 bool bool_result;
196 auto_vec<stmt_vec_info> mask_producers;
198 if (dump_enabled_p ())
199 dump_printf_loc (MSG_NOTE, vect_location,
200 "=== vect_determine_vectorization_factor ===\n");
202 for (i = 0; i < nbbs; i++)
204 basic_block bb = bbs[i];
206 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
207 gsi_next (&si))
209 phi = si.phi ();
210 stmt_info = vinfo_for_stmt (phi);
211 if (dump_enabled_p ())
213 dump_printf_loc (MSG_NOTE, vect_location, "==> examining phi: ");
214 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
217 gcc_assert (stmt_info);
219 if (STMT_VINFO_RELEVANT_P (stmt_info)
220 || STMT_VINFO_LIVE_P (stmt_info))
222 gcc_assert (!STMT_VINFO_VECTYPE (stmt_info));
223 scalar_type = TREE_TYPE (PHI_RESULT (phi));
225 if (dump_enabled_p ())
227 dump_printf_loc (MSG_NOTE, vect_location,
228 "get vectype for scalar type: ");
229 dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
230 dump_printf (MSG_NOTE, "\n");
233 vectype = get_vectype_for_scalar_type (scalar_type);
234 if (!vectype)
236 if (dump_enabled_p ())
238 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
239 "not vectorized: unsupported "
240 "data-type ");
241 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
242 scalar_type);
243 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
245 return false;
247 STMT_VINFO_VECTYPE (stmt_info) = vectype;
249 if (dump_enabled_p ())
251 dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
252 dump_generic_expr (MSG_NOTE, TDF_SLIM, vectype);
253 dump_printf (MSG_NOTE, "\n");
256 nunits = TYPE_VECTOR_SUBPARTS (vectype);
257 if (dump_enabled_p ())
258 dump_printf_loc (MSG_NOTE, vect_location, "nunits = %d\n",
259 nunits);
261 if (!vectorization_factor
262 || (nunits > vectorization_factor))
263 vectorization_factor = nunits;
267 for (gimple_stmt_iterator si = gsi_start_bb (bb);
268 !gsi_end_p (si) || analyze_pattern_stmt;)
270 tree vf_vectype;
272 if (analyze_pattern_stmt)
273 stmt = pattern_stmt;
274 else
275 stmt = gsi_stmt (si);
277 stmt_info = vinfo_for_stmt (stmt);
279 if (dump_enabled_p ())
281 dump_printf_loc (MSG_NOTE, vect_location,
282 "==> examining statement: ");
283 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
286 gcc_assert (stmt_info);
288 /* Skip stmts which do not need to be vectorized. */
289 if ((!STMT_VINFO_RELEVANT_P (stmt_info)
290 && !STMT_VINFO_LIVE_P (stmt_info))
291 || gimple_clobber_p (stmt))
293 if (STMT_VINFO_IN_PATTERN_P (stmt_info)
294 && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
295 && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
296 || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
298 stmt = pattern_stmt;
299 stmt_info = vinfo_for_stmt (pattern_stmt);
300 if (dump_enabled_p ())
302 dump_printf_loc (MSG_NOTE, vect_location,
303 "==> examining pattern statement: ");
304 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
307 else
309 if (dump_enabled_p ())
310 dump_printf_loc (MSG_NOTE, vect_location, "skip.\n");
311 gsi_next (&si);
312 continue;
315 else if (STMT_VINFO_IN_PATTERN_P (stmt_info)
316 && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
317 && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
318 || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
319 analyze_pattern_stmt = true;
321 /* If a pattern statement has def stmts, analyze them too. */
322 if (is_pattern_stmt_p (stmt_info))
324 if (pattern_def_seq == NULL)
326 pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
327 pattern_def_si = gsi_start (pattern_def_seq);
329 else if (!gsi_end_p (pattern_def_si))
330 gsi_next (&pattern_def_si);
331 if (pattern_def_seq != NULL)
333 gimple *pattern_def_stmt = NULL;
334 stmt_vec_info pattern_def_stmt_info = NULL;
336 while (!gsi_end_p (pattern_def_si))
338 pattern_def_stmt = gsi_stmt (pattern_def_si);
339 pattern_def_stmt_info
340 = vinfo_for_stmt (pattern_def_stmt);
341 if (STMT_VINFO_RELEVANT_P (pattern_def_stmt_info)
342 || STMT_VINFO_LIVE_P (pattern_def_stmt_info))
343 break;
344 gsi_next (&pattern_def_si);
347 if (!gsi_end_p (pattern_def_si))
349 if (dump_enabled_p ())
351 dump_printf_loc (MSG_NOTE, vect_location,
352 "==> examining pattern def stmt: ");
353 dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
354 pattern_def_stmt, 0);
357 stmt = pattern_def_stmt;
358 stmt_info = pattern_def_stmt_info;
360 else
362 pattern_def_si = gsi_none ();
363 analyze_pattern_stmt = false;
366 else
367 analyze_pattern_stmt = false;
370 if (gimple_get_lhs (stmt) == NULL_TREE
371 /* MASK_STORE has no lhs, but is ok. */
372 && (!is_gimple_call (stmt)
373 || !gimple_call_internal_p (stmt)
374 || gimple_call_internal_fn (stmt) != IFN_MASK_STORE))
376 if (is_gimple_call (stmt))
378 /* Ignore calls with no lhs. These must be calls to
379 #pragma omp simd functions, and the vectorization factor
380 they really need can't be determined until
381 vectorizable_simd_clone_call. */
382 if (!analyze_pattern_stmt && gsi_end_p (pattern_def_si))
384 pattern_def_seq = NULL;
385 gsi_next (&si);
387 continue;
389 if (dump_enabled_p ())
391 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
392 "not vectorized: irregular stmt.");
393 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt,
396 return false;
399 if (VECTOR_MODE_P (TYPE_MODE (gimple_expr_type (stmt))))
401 if (dump_enabled_p ())
403 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
404 "not vectorized: vector stmt in loop:");
405 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
407 return false;
410 bool_result = false;
412 if (STMT_VINFO_VECTYPE (stmt_info))
414 /* The only case when a vectype has already been set is for stmts
415 that contain a dataref, or for "pattern-stmts" (stmts
416 generated by the vectorizer to represent/replace a certain
417 idiom). */
418 gcc_assert (STMT_VINFO_DATA_REF (stmt_info)
419 || is_pattern_stmt_p (stmt_info)
420 || !gsi_end_p (pattern_def_si));
421 vectype = STMT_VINFO_VECTYPE (stmt_info);
423 else
425 gcc_assert (!STMT_VINFO_DATA_REF (stmt_info));
426 if (is_gimple_call (stmt)
427 && gimple_call_internal_p (stmt)
428 && gimple_call_internal_fn (stmt) == IFN_MASK_STORE)
429 scalar_type = TREE_TYPE (gimple_call_arg (stmt, 3));
430 else
431 scalar_type = TREE_TYPE (gimple_get_lhs (stmt));
433 /* Bool ops don't participate in the vectorization factor
434 computation. For comparisons, use the compared types to
435 compute a factor. */
436 if (TREE_CODE (scalar_type) == BOOLEAN_TYPE
437 && is_gimple_assign (stmt)
438 && gimple_assign_rhs_code (stmt) != COND_EXPR)
440 if (STMT_VINFO_RELEVANT_P (stmt_info)
441 || STMT_VINFO_LIVE_P (stmt_info))
442 mask_producers.safe_push (stmt_info);
443 bool_result = true;
445 if (gimple_code (stmt) == GIMPLE_ASSIGN
446 && TREE_CODE_CLASS (gimple_assign_rhs_code (stmt))
447 == tcc_comparison
448 && TREE_CODE (TREE_TYPE (gimple_assign_rhs1 (stmt)))
449 != BOOLEAN_TYPE)
450 scalar_type = TREE_TYPE (gimple_assign_rhs1 (stmt));
451 else
453 if (!analyze_pattern_stmt && gsi_end_p (pattern_def_si))
455 pattern_def_seq = NULL;
456 gsi_next (&si);
458 continue;
462 if (dump_enabled_p ())
464 dump_printf_loc (MSG_NOTE, vect_location,
465 "get vectype for scalar type: ");
466 dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
467 dump_printf (MSG_NOTE, "\n");
469 vectype = get_vectype_for_scalar_type (scalar_type);
470 if (!vectype)
472 if (dump_enabled_p ())
474 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
475 "not vectorized: unsupported "
476 "data-type ");
477 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
478 scalar_type);
479 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
481 return false;
484 if (!bool_result)
485 STMT_VINFO_VECTYPE (stmt_info) = vectype;
487 if (dump_enabled_p ())
489 dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
490 dump_generic_expr (MSG_NOTE, TDF_SLIM, vectype);
491 dump_printf (MSG_NOTE, "\n");
495 /* Don't try to compute VF out of scalar types if the stmt
496 produces a boolean vector. Use the result vectype instead. */
497 if (VECTOR_BOOLEAN_TYPE_P (vectype))
498 vf_vectype = vectype;
499 else
501 /* The vectorization factor is according to the smallest
502 scalar type (or the largest vector size, but we only
503 support one vector size per loop). */
504 if (!bool_result)
505 scalar_type = vect_get_smallest_scalar_type (stmt, &dummy,
506 &dummy);
507 if (dump_enabled_p ())
509 dump_printf_loc (MSG_NOTE, vect_location,
510 "get vectype for scalar type: ");
511 dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
512 dump_printf (MSG_NOTE, "\n");
514 vf_vectype = get_vectype_for_scalar_type (scalar_type);
516 if (!vf_vectype)
518 if (dump_enabled_p ())
520 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
521 "not vectorized: unsupported data-type ");
522 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
523 scalar_type);
524 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
526 return false;
529 if ((GET_MODE_SIZE (TYPE_MODE (vectype))
530 != GET_MODE_SIZE (TYPE_MODE (vf_vectype))))
532 if (dump_enabled_p ())
534 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
535 "not vectorized: different sized vector "
536 "types in statement, ");
537 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
538 vectype);
539 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
540 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
541 vf_vectype);
542 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
544 return false;
547 if (dump_enabled_p ())
549 dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
550 dump_generic_expr (MSG_NOTE, TDF_SLIM, vf_vectype);
551 dump_printf (MSG_NOTE, "\n");
554 nunits = TYPE_VECTOR_SUBPARTS (vf_vectype);
555 if (dump_enabled_p ())
556 dump_printf_loc (MSG_NOTE, vect_location, "nunits = %d\n", nunits);
557 if (!vectorization_factor
558 || (nunits > vectorization_factor))
559 vectorization_factor = nunits;
561 if (!analyze_pattern_stmt && gsi_end_p (pattern_def_si))
563 pattern_def_seq = NULL;
564 gsi_next (&si);
569 /* TODO: Analyze cost. Decide if worth while to vectorize. */
570 if (dump_enabled_p ())
571 dump_printf_loc (MSG_NOTE, vect_location, "vectorization factor = %d\n",
572 vectorization_factor);
573 if (vectorization_factor <= 1)
575 if (dump_enabled_p ())
576 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
577 "not vectorized: unsupported data-type\n");
578 return false;
580 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
582 for (i = 0; i < mask_producers.length (); i++)
584 tree mask_type = NULL;
586 stmt = STMT_VINFO_STMT (mask_producers[i]);
588 if (gimple_code (stmt) == GIMPLE_ASSIGN
589 && TREE_CODE_CLASS (gimple_assign_rhs_code (stmt)) == tcc_comparison
590 && TREE_CODE (TREE_TYPE (gimple_assign_rhs1 (stmt))) != BOOLEAN_TYPE)
592 scalar_type = TREE_TYPE (gimple_assign_rhs1 (stmt));
593 mask_type = get_mask_type_for_scalar_type (scalar_type);
595 if (!mask_type)
597 if (dump_enabled_p ())
598 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
599 "not vectorized: unsupported mask\n");
600 return false;
603 else
605 tree rhs;
606 ssa_op_iter iter;
607 gimple *def_stmt;
608 enum vect_def_type dt;
610 FOR_EACH_SSA_TREE_OPERAND (rhs, stmt, iter, SSA_OP_USE)
612 if (!vect_is_simple_use (rhs, mask_producers[i]->vinfo,
613 &def_stmt, &dt, &vectype))
615 if (dump_enabled_p ())
617 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
618 "not vectorized: can't compute mask type "
619 "for statement, ");
620 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt,
623 return false;
626 /* No vectype probably means an external definition.
627 Allow it in case there is another operand which
628 allows us to determine the mask type. */
629 if (!vectype)
630 continue;
632 if (!mask_type)
633 mask_type = vectype;
634 else if (TYPE_VECTOR_SUBPARTS (mask_type)
635 != TYPE_VECTOR_SUBPARTS (vectype))
637 if (dump_enabled_p ())
639 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
640 "not vectorized: different sized masks "
641 "types in statement, ");
642 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
643 mask_type);
644 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
645 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
646 vectype);
647 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
649 return false;
651 else if (VECTOR_BOOLEAN_TYPE_P (mask_type)
652 != VECTOR_BOOLEAN_TYPE_P (vectype))
654 if (dump_enabled_p ())
656 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
657 "not vectorized: mixed mask and "
658 "nonmask vector types in statement, ");
659 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
660 mask_type);
661 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
662 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
663 vectype);
664 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
666 return false;
670 /* We may compare a boolean value loaded as a vector of integers.
671 Fix mask_type in such a case. */
672 if (mask_type
673 && !VECTOR_BOOLEAN_TYPE_P (mask_type)
674 && gimple_code (stmt) == GIMPLE_ASSIGN
675 && TREE_CODE_CLASS (gimple_assign_rhs_code (stmt)) == tcc_comparison)
676 mask_type = build_same_sized_truth_vector_type (mask_type);
679 /* A missing mask_type should mean a loop-invariant predicate.
680 This is probably a subject for optimization in
681 if-conversion. */
682 if (!mask_type)
684 if (dump_enabled_p ())
686 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
687 "not vectorized: can't compute mask type "
688 "for statement, ");
689 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt,
692 return false;
695 STMT_VINFO_VECTYPE (mask_producers[i]) = mask_type;
698 return true;
702 /* Function vect_is_simple_iv_evolution.
704 FORNOW: A simple evolution of an induction variable in the loop is
705 considered a polynomial evolution. */
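/* Editorial note (not part of the original source): a worked example of the
   "simple" case handled below.  For

     for (i = 0; i < n; i++)
       a[i] = ...;

   scev describes i as the linear chrec {0, +, 1}_loop: the initial value is 0
   and the evolution part (the step) is 1.  A step that is itself a chrec,
   i.e. a polynomial of degree >= 2 such as {0, +, {1, +, 1}_loop}_loop, is
   rejected as not simple.  */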
707 static bool
708 vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init,
709 tree * step)
711 tree init_expr;
712 tree step_expr;
713 tree evolution_part = evolution_part_in_loop_num (access_fn, loop_nb);
714 basic_block bb;
716 /* When there is no evolution in this loop, the evolution function
717 is not "simple". */
718 if (evolution_part == NULL_TREE)
719 return false;
721 /* When the evolution is a polynomial of degree >= 2
722 the evolution function is not "simple". */
723 if (tree_is_chrec (evolution_part))
724 return false;
726 step_expr = evolution_part;
727 init_expr = unshare_expr (initial_condition_in_loop_num (access_fn, loop_nb));
729 if (dump_enabled_p ())
731 dump_printf_loc (MSG_NOTE, vect_location, "step: ");
732 dump_generic_expr (MSG_NOTE, TDF_SLIM, step_expr);
733 dump_printf (MSG_NOTE, ", init: ");
734 dump_generic_expr (MSG_NOTE, TDF_SLIM, init_expr);
735 dump_printf (MSG_NOTE, "\n");
738 *init = init_expr;
739 *step = step_expr;
741 if (TREE_CODE (step_expr) != INTEGER_CST
742 && (TREE_CODE (step_expr) != SSA_NAME
743 || ((bb = gimple_bb (SSA_NAME_DEF_STMT (step_expr)))
744 && flow_bb_inside_loop_p (get_loop (cfun, loop_nb), bb))
745 || (!INTEGRAL_TYPE_P (TREE_TYPE (step_expr))
746 && (!SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr))
747 || !flag_associative_math)))
748 && (TREE_CODE (step_expr) != REAL_CST
749 || !flag_associative_math))
751 if (dump_enabled_p ())
752 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
753 "step unknown.\n");
754 return false;
757 return true;
760 /* Function vect_analyze_scalar_cycles_1.
762 Examine the cross iteration def-use cycles of scalar variables
763 in LOOP. LOOP_VINFO represents the loop that is now being
764 considered for vectorization (can be LOOP, or an outer-loop
765 enclosing LOOP). */
767 static void
768 vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, struct loop *loop)
770 basic_block bb = loop->header;
771 tree init, step;
772 auto_vec<gimple *, 64> worklist;
773 gphi_iterator gsi;
774 bool double_reduc;
776 if (dump_enabled_p ())
777 dump_printf_loc (MSG_NOTE, vect_location,
778 "=== vect_analyze_scalar_cycles ===\n");
780 /* First - identify all inductions. Reduction detection assumes that all the
781 inductions have been identified, therefore, this order must not be
782 changed. */
783 for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
785 gphi *phi = gsi.phi ();
786 tree access_fn = NULL;
787 tree def = PHI_RESULT (phi);
788 stmt_vec_info stmt_vinfo = vinfo_for_stmt (phi);
790 if (dump_enabled_p ())
792 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: ");
793 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
796 /* Skip virtual phi's. The data dependences that are associated with
797 virtual defs/uses (i.e., memory accesses) are analyzed elsewhere. */
798 if (virtual_operand_p (def))
799 continue;
801 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_unknown_def_type;
803 /* Analyze the evolution function. */
804 access_fn = analyze_scalar_evolution (loop, def);
805 if (access_fn)
807 STRIP_NOPS (access_fn);
808 if (dump_enabled_p ())
810 dump_printf_loc (MSG_NOTE, vect_location,
811 "Access function of PHI: ");
812 dump_generic_expr (MSG_NOTE, TDF_SLIM, access_fn);
813 dump_printf (MSG_NOTE, "\n");
815 STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
816 = initial_condition_in_loop_num (access_fn, loop->num);
817 STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo)
818 = evolution_part_in_loop_num (access_fn, loop->num);
821 if (!access_fn
822 || !vect_is_simple_iv_evolution (loop->num, access_fn, &init, &step)
823 || (LOOP_VINFO_LOOP (loop_vinfo) != loop
824 && TREE_CODE (step) != INTEGER_CST))
826 worklist.safe_push (phi);
827 continue;
830 gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
831 != NULL_TREE);
832 gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo) != NULL_TREE);
834 if (dump_enabled_p ())
835 dump_printf_loc (MSG_NOTE, vect_location, "Detected induction.\n");
836 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_induction_def;
840 /* Second - identify all reductions and nested cycles. */
841 while (worklist.length () > 0)
843 gimple *phi = worklist.pop ();
844 tree def = PHI_RESULT (phi);
845 stmt_vec_info stmt_vinfo = vinfo_for_stmt (phi);
846 gimple *reduc_stmt;
847 bool nested_cycle;
849 if (dump_enabled_p ())
851 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: ");
852 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
855 gcc_assert (!virtual_operand_p (def)
856 && STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_unknown_def_type);
858 nested_cycle = (loop != LOOP_VINFO_LOOP (loop_vinfo));
859 reduc_stmt = vect_force_simple_reduction (loop_vinfo, phi, !nested_cycle,
860 &double_reduc, false);
861 if (reduc_stmt)
863 if (double_reduc)
865 if (dump_enabled_p ())
866 dump_printf_loc (MSG_NOTE, vect_location,
867 "Detected double reduction.\n");
869 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_double_reduction_def;
870 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
871 vect_double_reduction_def;
873 else
875 if (nested_cycle)
877 if (dump_enabled_p ())
878 dump_printf_loc (MSG_NOTE, vect_location,
879 "Detected vectorizable nested cycle.\n");
881 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_nested_cycle;
882 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
883 vect_nested_cycle;
885 else
887 if (dump_enabled_p ())
888 dump_printf_loc (MSG_NOTE, vect_location,
889 "Detected reduction.\n");
891 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def;
892 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
893 vect_reduction_def;
894 /* Store the reduction cycles for possible vectorization in
895 loop-aware SLP. */
896 LOOP_VINFO_REDUCTIONS (loop_vinfo).safe_push (reduc_stmt);
900 else
901 if (dump_enabled_p ())
902 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
903 "Unknown def-use cycle pattern.\n");
908 /* Function vect_analyze_scalar_cycles.
910 Examine the cross iteration def-use cycles of scalar variables, by
911 analyzing the loop-header PHIs of scalar variables. Classify each
912 cycle as one of the following: invariant, induction, reduction, unknown.
913 We do that for the loop represented by LOOP_VINFO, and also for its
914 inner-loop, if it exists.
915 Examples for scalar cycles:
917 Example1: reduction:
919 loop1:
920 for (i=0; i<N; i++)
921 sum += a[i];
923 Example2: induction:
925 loop2:
926 for (i=0; i<N; i++)
927 a[i] = i; */
929 static void
930 vect_analyze_scalar_cycles (loop_vec_info loop_vinfo)
932 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
934 vect_analyze_scalar_cycles_1 (loop_vinfo, loop);
936 /* When vectorizing an outer-loop, the inner-loop is executed sequentially.
937 Reductions in such inner-loop therefore have different properties than
938 the reductions in the nest that gets vectorized:
939 1. When vectorized, they are executed in the same order as in the original
940 scalar loop, so we can't change the order of computation when
941 vectorizing them.
942 2. FIXME: Inner-loop reductions can be used in the inner-loop, so the
943 current checks are too strict. */
945 if (loop->inner)
946 vect_analyze_scalar_cycles_1 (loop_vinfo, loop->inner);
949 /* Transfer group and reduction information from STMT to its pattern stmt. */
951 static void
952 vect_fixup_reduc_chain (gimple *stmt)
954 gimple *firstp = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt));
955 gimple *stmtp;
956 gcc_assert (!GROUP_FIRST_ELEMENT (vinfo_for_stmt (firstp))
957 && GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)));
958 GROUP_SIZE (vinfo_for_stmt (firstp)) = GROUP_SIZE (vinfo_for_stmt (stmt));
961 stmtp = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt));
962 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmtp)) = firstp;
963 stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmt));
964 if (stmt)
965 GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmtp))
966 = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt));
968 while (stmt);
969 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmtp)) = vect_reduction_def;
972 /* Fixup scalar cycles that now have their stmts detected as patterns. */
974 static void
975 vect_fixup_scalar_cycles_with_patterns (loop_vec_info loop_vinfo)
977 gimple *first;
978 unsigned i;
980 FOR_EACH_VEC_ELT (LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo), i, first)
981 if (STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (first)))
983 gimple *next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (first));
984 while (next)
986 if (! STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (next)))
987 break;
988 next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
990 /* If not all stmts in the chain are patterns, try to handle
991 the chain without patterns. */
992 if (! next)
994 vect_fixup_reduc_chain (first);
995 LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo)[i]
996 = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (first));
1001 /* Function vect_get_loop_niters.
1003 Determine the number of iterations the loop executes and place it
1004 in NUMBER_OF_ITERATIONS. Place the number of latch iterations
1005 in NUMBER_OF_ITERATIONSM1.
1007 Return the loop exit condition. */
1010 static gcond *
1011 vect_get_loop_niters (struct loop *loop, tree *number_of_iterations,
1012 tree *number_of_iterationsm1)
1014 tree niters;
1016 if (dump_enabled_p ())
1017 dump_printf_loc (MSG_NOTE, vect_location,
1018 "=== get_loop_niters ===\n");
1020 niters = number_of_latch_executions (loop);
1021 *number_of_iterationsm1 = niters;
1023 /* We want the number of loop header executions which is the number
1024 of latch executions plus one.
1025 ??? For UINT_MAX latch executions this number overflows to zero
1026 for loops like do { n++; } while (n != 0); */
1027 if (niters && !chrec_contains_undetermined (niters))
1028 niters = fold_build2 (PLUS_EXPR, TREE_TYPE (niters), unshare_expr (niters),
1029 build_int_cst (TREE_TYPE (niters), 1));
1030 *number_of_iterations = niters;
1032 return get_loop_exit_condition (loop);
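/* Editorial note (not part of the original source): a worked example of the
   relation used above, with assumed numbers.  For a loop whose body runs 8
   times, number_of_latch_executions returns 7 (the back edge is taken 7
   times); adding 1 gives NUMBER_OF_ITERATIONS = 8, the header execution
   count the vectorizer later compares against the vectorization factor.
   The overflow caveat applies when the latch executes UINT_MAX times.  */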
1036 /* Function bb_in_loop_p
1038 Used as predicate for dfs order traversal of the loop bbs. */
1040 static bool
1041 bb_in_loop_p (const_basic_block bb, const void *data)
1043 const struct loop *const loop = (const struct loop *)data;
1044 if (flow_bb_inside_loop_p (loop, bb))
1045 return true;
1046 return false;
1050 /* Function new_loop_vec_info.
1052 Create and initialize a new loop_vec_info struct for LOOP, as well as
1053 stmt_vec_info structs for all the stmts in LOOP. */
1055 static loop_vec_info
1056 new_loop_vec_info (struct loop *loop)
1058 loop_vec_info res;
1059 basic_block *bbs;
1060 gimple_stmt_iterator si;
1061 unsigned int i, nbbs;
1063 res = (loop_vec_info) xcalloc (1, sizeof (struct _loop_vec_info));
1064 res->kind = vec_info::loop;
1065 LOOP_VINFO_LOOP (res) = loop;
1067 bbs = get_loop_body (loop);
1069 /* Create/Update stmt_info for all stmts in the loop. */
1070 for (i = 0; i < loop->num_nodes; i++)
1072 basic_block bb = bbs[i];
1074 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
1076 gimple *phi = gsi_stmt (si);
1077 gimple_set_uid (phi, 0);
1078 set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, res));
1081 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1083 gimple *stmt = gsi_stmt (si);
1084 gimple_set_uid (stmt, 0);
1085 set_vinfo_for_stmt (stmt, new_stmt_vec_info (stmt, res));
1089 /* CHECKME: We want to visit all BBs before their successors (except for
1090 latch blocks, for which this assertion wouldn't hold). In the simple
1091 case of the loop forms we allow, a dfs order of the BBs would be the same
1092 as reversed postorder traversal, so we are safe. */
1094 free (bbs);
1095 bbs = XCNEWVEC (basic_block, loop->num_nodes);
1096 nbbs = dfs_enumerate_from (loop->header, 0, bb_in_loop_p,
1097 bbs, loop->num_nodes, loop);
1098 gcc_assert (nbbs == loop->num_nodes);
1100 LOOP_VINFO_BBS (res) = bbs;
1101 LOOP_VINFO_NITERSM1 (res) = NULL;
1102 LOOP_VINFO_NITERS (res) = NULL;
1103 LOOP_VINFO_NITERS_UNCHANGED (res) = NULL;
1104 LOOP_VINFO_COST_MODEL_THRESHOLD (res) = 0;
1105 LOOP_VINFO_VECTORIZABLE_P (res) = 0;
1106 LOOP_VINFO_PEELING_FOR_ALIGNMENT (res) = 0;
1107 LOOP_VINFO_VECT_FACTOR (res) = 0;
1108 LOOP_VINFO_LOOP_NEST (res) = vNULL;
1109 LOOP_VINFO_DATAREFS (res) = vNULL;
1110 LOOP_VINFO_DDRS (res) = vNULL;
1111 LOOP_VINFO_UNALIGNED_DR (res) = NULL;
1112 LOOP_VINFO_MAY_MISALIGN_STMTS (res) = vNULL;
1113 LOOP_VINFO_MAY_ALIAS_DDRS (res) = vNULL;
1114 LOOP_VINFO_GROUPED_STORES (res) = vNULL;
1115 LOOP_VINFO_REDUCTIONS (res) = vNULL;
1116 LOOP_VINFO_REDUCTION_CHAINS (res) = vNULL;
1117 LOOP_VINFO_SLP_INSTANCES (res) = vNULL;
1118 LOOP_VINFO_SLP_UNROLLING_FACTOR (res) = 1;
1119 LOOP_VINFO_TARGET_COST_DATA (res) = init_cost (loop);
1120 LOOP_VINFO_PEELING_FOR_GAPS (res) = false;
1121 LOOP_VINFO_PEELING_FOR_NITER (res) = false;
1122 LOOP_VINFO_OPERANDS_SWAPPED (res) = false;
1124 return res;
1128 /* Function destroy_loop_vec_info.
1130 Free LOOP_VINFO struct, as well as all the stmt_vec_info structs of all the
1131 stmts in the loop. */
1133 void
1134 destroy_loop_vec_info (loop_vec_info loop_vinfo, bool clean_stmts)
1136 struct loop *loop;
1137 basic_block *bbs;
1138 int nbbs;
1139 gimple_stmt_iterator si;
1140 int j;
1141 vec<slp_instance> slp_instances;
1142 slp_instance instance;
1143 bool swapped;
1145 if (!loop_vinfo)
1146 return;
1148 loop = LOOP_VINFO_LOOP (loop_vinfo);
1150 bbs = LOOP_VINFO_BBS (loop_vinfo);
1151 nbbs = clean_stmts ? loop->num_nodes : 0;
1152 swapped = LOOP_VINFO_OPERANDS_SWAPPED (loop_vinfo);
1154 for (j = 0; j < nbbs; j++)
1156 basic_block bb = bbs[j];
1157 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
1158 free_stmt_vec_info (gsi_stmt (si));
1160 for (si = gsi_start_bb (bb); !gsi_end_p (si); )
1162 gimple *stmt = gsi_stmt (si);
1164 /* We may have broken canonical form by moving a constant
1165 into RHS1 of a commutative op. Fix such occurrences. */
1166 if (swapped && is_gimple_assign (stmt))
1168 enum tree_code code = gimple_assign_rhs_code (stmt);
1170 if ((code == PLUS_EXPR
1171 || code == POINTER_PLUS_EXPR
1172 || code == MULT_EXPR)
1173 && CONSTANT_CLASS_P (gimple_assign_rhs1 (stmt)))
1174 swap_ssa_operands (stmt,
1175 gimple_assign_rhs1_ptr (stmt),
1176 gimple_assign_rhs2_ptr (stmt));
1179 /* Free stmt_vec_info. */
1180 free_stmt_vec_info (stmt);
1181 gsi_next (&si);
1185 free (LOOP_VINFO_BBS (loop_vinfo));
1186 vect_destroy_datarefs (loop_vinfo);
1187 free_dependence_relations (LOOP_VINFO_DDRS (loop_vinfo));
1188 LOOP_VINFO_LOOP_NEST (loop_vinfo).release ();
1189 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).release ();
1190 LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).release ();
1191 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).release ();
1192 slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
1193 FOR_EACH_VEC_ELT (slp_instances, j, instance)
1194 vect_free_slp_instance (instance);
1196 LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
1197 LOOP_VINFO_GROUPED_STORES (loop_vinfo).release ();
1198 LOOP_VINFO_REDUCTIONS (loop_vinfo).release ();
1199 LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).release ();
1201 destroy_cost_data (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo));
1202 loop_vinfo->scalar_cost_vec.release ();
1204 free (loop_vinfo);
1205 loop->aux = NULL;
1209 /* Calculate the cost of one scalar iteration of the loop. */
1210 static void
1211 vect_compute_single_scalar_iteration_cost (loop_vec_info loop_vinfo)
1213 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1214 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1215 int nbbs = loop->num_nodes, factor, scalar_single_iter_cost = 0;
1216 int innerloop_iters, i;
1218 /* Count statements in scalar loop. Using this as scalar cost for a single
1219 iteration for now.
1221 TODO: Add outer loop support.
1223 TODO: Consider assigning different costs to different scalar
1224 statements. */
1226 /* FORNOW. */
1227 innerloop_iters = 1;
1228 if (loop->inner)
1229 innerloop_iters = 50; /* FIXME */
1231 for (i = 0; i < nbbs; i++)
1233 gimple_stmt_iterator si;
1234 basic_block bb = bbs[i];
1236 if (bb->loop_father == loop->inner)
1237 factor = innerloop_iters;
1238 else
1239 factor = 1;
1241 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1243 gimple *stmt = gsi_stmt (si);
1244 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1246 if (!is_gimple_assign (stmt) && !is_gimple_call (stmt))
1247 continue;
1249 /* Skip stmts that are not vectorized inside the loop. */
1250 if (stmt_info
1251 && !STMT_VINFO_RELEVANT_P (stmt_info)
1252 && (!STMT_VINFO_LIVE_P (stmt_info)
1253 || !VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1254 && !STMT_VINFO_IN_PATTERN_P (stmt_info))
1255 continue;
1257 vect_cost_for_stmt kind;
1258 if (STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt)))
1260 if (DR_IS_READ (STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt))))
1261 kind = scalar_load;
1262 else
1263 kind = scalar_store;
1265 else
1266 kind = scalar_stmt;
1268 scalar_single_iter_cost
1269 += record_stmt_cost (&LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1270 factor, kind, NULL, 0, vect_prologue);
1273 LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo)
1274 = scalar_single_iter_cost;
1278 /* Function vect_analyze_loop_form_1.
1280 Verify that certain CFG restrictions hold, including:
1281 - the loop has a pre-header
1282 - the loop has a single entry and exit
1283 - the loop exit condition is simple enough, and the number of iterations
1284 can be analyzed (a countable loop). */
1286 bool
1287 vect_analyze_loop_form_1 (struct loop *loop, gcond **loop_cond,
1288 tree *number_of_iterationsm1,
1289 tree *number_of_iterations, gcond **inner_loop_cond)
1291 if (dump_enabled_p ())
1292 dump_printf_loc (MSG_NOTE, vect_location,
1293 "=== vect_analyze_loop_form ===\n");
1295 /* Different restrictions apply when we are considering an inner-most loop,
1296 vs. an outer (nested) loop.
1297 (FORNOW. May want to relax some of these restrictions in the future). */
1299 if (!loop->inner)
1301 /* Inner-most loop. We currently require that the number of BBs is
1302 exactly 2 (the header and latch). Vectorizable inner-most loops
1303 look like this:
1305 (pre-header)
1307 header <--------+
1308 | | |
1309 | +--> latch --+
1311 (exit-bb) */
1313 if (loop->num_nodes != 2)
1315 if (dump_enabled_p ())
1316 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1317 "not vectorized: control flow in loop.\n");
1318 return false;
1321 if (empty_block_p (loop->header))
1323 if (dump_enabled_p ())
1324 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1325 "not vectorized: empty loop.\n");
1326 return false;
1329 else
1331 struct loop *innerloop = loop->inner;
1332 edge entryedge;
1334 /* Nested loop. We currently require that the loop is doubly-nested,
1335 contains a single inner loop, and the number of BBs is exactly 5.
1336 Vectorizable outer-loops look like this:
1338 (pre-header)
1340 header <---+
1342 inner-loop |
1344 tail ------+
1346 (exit-bb)
1348 The inner-loop has the properties expected of inner-most loops
1349 as described above. */
1351 if ((loop->inner)->inner || (loop->inner)->next)
1353 if (dump_enabled_p ())
1354 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1355 "not vectorized: multiple nested loops.\n");
1356 return false;
1359 if (loop->num_nodes != 5)
1361 if (dump_enabled_p ())
1362 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1363 "not vectorized: control flow in loop.\n");
1364 return false;
1367 entryedge = loop_preheader_edge (innerloop);
1368 if (entryedge->src != loop->header
1369 || !single_exit (innerloop)
1370 || single_exit (innerloop)->dest != EDGE_PRED (loop->latch, 0)->src)
1372 if (dump_enabled_p ())
1373 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1374 "not vectorized: unsupported outerloop form.\n");
1375 return false;
1378 /* Analyze the inner-loop. */
1379 tree inner_niterm1, inner_niter;
1380 if (! vect_analyze_loop_form_1 (loop->inner, inner_loop_cond,
1381 &inner_niterm1, &inner_niter, NULL))
1383 if (dump_enabled_p ())
1384 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1385 "not vectorized: Bad inner loop.\n");
1386 return false;
1389 if (!expr_invariant_in_loop_p (loop, inner_niter))
1391 if (dump_enabled_p ())
1392 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1393 "not vectorized: inner-loop count not"
1394 " invariant.\n");
1395 return false;
1398 if (dump_enabled_p ())
1399 dump_printf_loc (MSG_NOTE, vect_location,
1400 "Considering outer-loop vectorization.\n");
1403 if (!single_exit (loop)
1404 || EDGE_COUNT (loop->header->preds) != 2)
1406 if (dump_enabled_p ())
1408 if (!single_exit (loop))
1409 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1410 "not vectorized: multiple exits.\n");
1411 else if (EDGE_COUNT (loop->header->preds) != 2)
1412 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1413 "not vectorized: too many incoming edges.\n");
1415 return false;
1418 /* We assume that the loop exit condition is at the end of the loop, i.e.,
1419 that the loop is represented as a do-while (with a proper if-guard
1420 before the loop if needed), where the loop header contains all the
1421 executable statements, and the latch is empty. */
1422 if (!empty_block_p (loop->latch)
1423 || !gimple_seq_empty_p (phi_nodes (loop->latch)))
1425 if (dump_enabled_p ())
1426 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1427 "not vectorized: latch block not empty.\n");
1428 return false;
1431 /* Make sure there exists a single-predecessor exit bb: */
1432 if (!single_pred_p (single_exit (loop)->dest))
1434 edge e = single_exit (loop);
1435 if (!(e->flags & EDGE_ABNORMAL))
1437 split_loop_exit_edge (e);
1438 if (dump_enabled_p ())
1439 dump_printf (MSG_NOTE, "split exit edge.\n");
1441 else
1443 if (dump_enabled_p ())
1444 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1445 "not vectorized: abnormal loop exit edge.\n");
1446 return false;
1450 *loop_cond = vect_get_loop_niters (loop, number_of_iterations,
1451 number_of_iterationsm1);
1452 if (!*loop_cond)
1454 if (dump_enabled_p ())
1455 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1456 "not vectorized: complicated exit condition.\n");
1457 return false;
1460 if (!*number_of_iterations
1461 || chrec_contains_undetermined (*number_of_iterations))
1463 if (dump_enabled_p ())
1464 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1465 "not vectorized: number of iterations cannot be "
1466 "computed.\n");
1467 return false;
1470 if (integer_zerop (*number_of_iterations))
1472 if (dump_enabled_p ())
1473 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1474 "not vectorized: number of iterations = 0.\n");
1475 return false;
1478 return true;
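/* Editorial note (not part of the original source): examples, under the
   restrictions checked above, of loops that are rejected versus accepted.

     for (i = 0; i < n; i++) { if (a[i]) break; a[i] = 0; }  (multiple exits)
     while (p) p = p->next;                        (iteration count unknown)

   whereas a countable, single-exit loop such as

     for (i = 0; i < n; i++) a[i] = 0;

   satisfies the form checks and proceeds to the analyses driven by
   vect_analyze_loop_form below.  */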
1481 /* Analyze LOOP form and return a loop_vec_info if it is of suitable form. */
1483 loop_vec_info
1484 vect_analyze_loop_form (struct loop *loop)
1486 tree number_of_iterations, number_of_iterationsm1;
1487 gcond *loop_cond, *inner_loop_cond = NULL;
1489 if (! vect_analyze_loop_form_1 (loop, &loop_cond, &number_of_iterationsm1,
1490 &number_of_iterations, &inner_loop_cond))
1491 return NULL;
1493 loop_vec_info loop_vinfo = new_loop_vec_info (loop);
1494 LOOP_VINFO_NITERSM1 (loop_vinfo) = number_of_iterationsm1;
1495 LOOP_VINFO_NITERS (loop_vinfo) = number_of_iterations;
1496 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = number_of_iterations;
1498 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1500 if (dump_enabled_p ())
1502 dump_printf_loc (MSG_NOTE, vect_location,
1503 "Symbolic number of iterations is ");
1504 dump_generic_expr (MSG_NOTE, TDF_DETAILS, number_of_iterations);
1505 dump_printf (MSG_NOTE, "\n");
1509 STMT_VINFO_TYPE (vinfo_for_stmt (loop_cond)) = loop_exit_ctrl_vec_info_type;
1510 if (inner_loop_cond)
1511 STMT_VINFO_TYPE (vinfo_for_stmt (inner_loop_cond))
1512 = loop_exit_ctrl_vec_info_type;
1514 gcc_assert (!loop->aux);
1515 loop->aux = loop_vinfo;
1516 return loop_vinfo;
1521 /* Scan the loop stmts and, depending on whether there are any (non-)SLP
1522 statements, update the vectorization factor. */
1524 static void
1525 vect_update_vf_for_slp (loop_vec_info loop_vinfo)
1527 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1528 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1529 int nbbs = loop->num_nodes;
1530 unsigned int vectorization_factor;
1531 int i;
1533 if (dump_enabled_p ())
1534 dump_printf_loc (MSG_NOTE, vect_location,
1535 "=== vect_update_vf_for_slp ===\n");
1537 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1538 gcc_assert (vectorization_factor != 0);
1540 /* If all the stmts in the loop can be SLPed, we perform only SLP, and
1541 vectorization factor of the loop is the unrolling factor required by
1542 the SLP instances. If that unrolling factor is 1, we say that we
1543 perform pure SLP on the loop; cross-iteration parallelism is not
1544 exploited. */
1545 bool only_slp_in_loop = true;
1546 for (i = 0; i < nbbs; i++)
1548 basic_block bb = bbs[i];
1549 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1550 gsi_next (&si))
1552 gimple *stmt = gsi_stmt (si);
1553 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1554 if (STMT_VINFO_IN_PATTERN_P (stmt_info)
1555 && STMT_VINFO_RELATED_STMT (stmt_info))
1557 stmt = STMT_VINFO_RELATED_STMT (stmt_info);
1558 stmt_info = vinfo_for_stmt (stmt);
1560 if ((STMT_VINFO_RELEVANT_P (stmt_info)
1561 || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1562 && !PURE_SLP_STMT (stmt_info))
1563 /* STMT needs both SLP and loop-based vectorization. */
1564 only_slp_in_loop = false;
1568 if (only_slp_in_loop)
1569 vectorization_factor = LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo);
1570 else
1571 vectorization_factor
1572 = least_common_multiple (vectorization_factor,
1573 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo));
1575 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
1576 if (dump_enabled_p ())
1577 dump_printf_loc (MSG_NOTE, vect_location,
1578 "Updating vectorization factor to %d\n",
1579 vectorization_factor);
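/* Editorial note (not part of the original source): a worked example of the
   update above, with assumed numbers.  If the type-based vectorization factor
   is 4 and the SLP instances require an unrolling factor of 2, the combined
   factor is least_common_multiple (4, 2) = 4; an unrolling factor of 8 would
   raise it to 8.  If every relevant stmt is pure SLP, the factor becomes the
   SLP unrolling factor itself.  */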
1582 /* Function vect_analyze_loop_operations.
1584 Scan the loop stmts and make sure they are all vectorizable. */
1586 static bool
1587 vect_analyze_loop_operations (loop_vec_info loop_vinfo)
1589 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1590 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1591 int nbbs = loop->num_nodes;
1592 int i;
1593 stmt_vec_info stmt_info;
1594 bool need_to_vectorize = false;
1595 bool ok;
1597 if (dump_enabled_p ())
1598 dump_printf_loc (MSG_NOTE, vect_location,
1599 "=== vect_analyze_loop_operations ===\n");
1601 for (i = 0; i < nbbs; i++)
1603 basic_block bb = bbs[i];
1605 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
1606 gsi_next (&si))
1608 gphi *phi = si.phi ();
1609 ok = true;
1611 stmt_info = vinfo_for_stmt (phi);
1612 if (dump_enabled_p ())
1614 dump_printf_loc (MSG_NOTE, vect_location, "examining phi: ");
1615 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
1617 if (virtual_operand_p (gimple_phi_result (phi)))
1618 continue;
1620 /* Inner-loop loop-closed exit phi in outer-loop vectorization
1621 (i.e., a phi in the tail of the outer-loop). */
1622 if (! is_loop_header_bb_p (bb))
1624 /* FORNOW: we currently don't support the case that these phis
1625 are not used in the outerloop (unless it is double reduction,
1626 i.e., this phi is vect_reduction_def), because this case
1627 requires us to actually do something here. */
1628 if ((!STMT_VINFO_RELEVANT_P (stmt_info)
1629 || STMT_VINFO_LIVE_P (stmt_info))
1630 && STMT_VINFO_DEF_TYPE (stmt_info)
1631 != vect_double_reduction_def)
1633 if (dump_enabled_p ())
1634 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1635 "Unsupported loop-closed phi in "
1636 "outer-loop.\n");
1637 return false;
1640 /* If PHI is used in the outer loop, we check that its operand
1641 is defined in the inner loop. */
1642 if (STMT_VINFO_RELEVANT_P (stmt_info))
1644 tree phi_op;
1645 gimple *op_def_stmt;
1647 if (gimple_phi_num_args (phi) != 1)
1648 return false;
1650 phi_op = PHI_ARG_DEF (phi, 0);
1651 if (TREE_CODE (phi_op) != SSA_NAME)
1652 return false;
1654 op_def_stmt = SSA_NAME_DEF_STMT (phi_op);
1655 if (gimple_nop_p (op_def_stmt)
1656 || !flow_bb_inside_loop_p (loop, gimple_bb (op_def_stmt))
1657 || !vinfo_for_stmt (op_def_stmt))
1658 return false;
1660 if (STMT_VINFO_RELEVANT (vinfo_for_stmt (op_def_stmt))
1661 != vect_used_in_outer
1662 && STMT_VINFO_RELEVANT (vinfo_for_stmt (op_def_stmt))
1663 != vect_used_in_outer_by_reduction)
1664 return false;
1667 continue;
1670 gcc_assert (stmt_info);
1672 if ((STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope
1673 || STMT_VINFO_LIVE_P (stmt_info))
1674 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
1676 /* A scalar-dependence cycle that we don't support. */
1677 if (dump_enabled_p ())
1678 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1679 "not vectorized: scalar dependence cycle.\n");
1680 return false;
1683 if (STMT_VINFO_RELEVANT_P (stmt_info))
1685 need_to_vectorize = true;
1686 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def)
1687 ok = vectorizable_induction (phi, NULL, NULL);
1690 if (ok && STMT_VINFO_LIVE_P (stmt_info))
1691 ok = vectorizable_live_operation (phi, NULL, NULL, -1, NULL);
1693 if (!ok)
1695 if (dump_enabled_p ())
1697 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1698 "not vectorized: relevant phi not "
1699 "supported: ");
1700 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, phi, 0);
1702 return false;
1706 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1707 gsi_next (&si))
1709 gimple *stmt = gsi_stmt (si);
1710 if (!gimple_clobber_p (stmt)
1711 && !vect_analyze_stmt (stmt, &need_to_vectorize, NULL))
1712 return false;
1714 } /* bbs */
1716 /* All operations in the loop are either irrelevant (deal with loop
1717 control, or dead), or only used outside the loop and can be moved
1718 out of the loop (e.g. invariants, inductions). The loop can be
1719 optimized away by scalar optimizations. We're better off not
1720 touching this loop. */
1721 if (!need_to_vectorize)
1723 if (dump_enabled_p ())
1724 dump_printf_loc (MSG_NOTE, vect_location,
1725 "All the computation can be taken out of the loop.\n");
1726 if (dump_enabled_p ())
1727 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1728 "not vectorized: redundant loop. no profit to "
1729 "vectorize.\n");
1730 return false;
1733 return true;
1737 /* Function vect_analyze_loop_2.
1739 Apply a set of analyses on LOOP, and create a loop_vec_info struct
1740 for it. The different analyses will record information in the
1741 loop_vec_info struct. */
1742 static bool
1743 vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal)
1745 bool ok;
1746 int max_vf = MAX_VECTORIZATION_FACTOR;
1747 int min_vf = 2;
1748 unsigned int n_stmts = 0;
1750 /* The first group of checks is independent of the vector size. */
1751 fatal = true;
1753 /* Find all data references in the loop (which correspond to vdefs/vuses)
1754 and analyze their evolution in the loop. */
1756 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1758 loop_p loop = LOOP_VINFO_LOOP (loop_vinfo);
1759 if (!find_loop_nest (loop, &LOOP_VINFO_LOOP_NEST (loop_vinfo)))
1761 if (dump_enabled_p ())
1762 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1763 "not vectorized: loop nest containing two "
1764 "or more consecutive inner loops cannot be "
1765 "vectorized\n");
1766 return false;
1769 for (unsigned i = 0; i < loop->num_nodes; i++)
1770 for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
1771 !gsi_end_p (gsi); gsi_next (&gsi))
1773 gimple *stmt = gsi_stmt (gsi);
1774 if (is_gimple_debug (stmt))
1775 continue;
1776 ++n_stmts;
1777 if (!find_data_references_in_stmt (loop, stmt,
1778 &LOOP_VINFO_DATAREFS (loop_vinfo)))
1780 if (is_gimple_call (stmt) && loop->safelen)
1782 tree fndecl = gimple_call_fndecl (stmt), op;
1783 if (fndecl != NULL_TREE)
1785 cgraph_node *node = cgraph_node::get (fndecl);
1786 if (node != NULL && node->simd_clones != NULL)
1788 unsigned int j, n = gimple_call_num_args (stmt);
1789 for (j = 0; j < n; j++)
1791 op = gimple_call_arg (stmt, j);
1792 if (DECL_P (op)
1793 || (REFERENCE_CLASS_P (op)
1794 && get_base_address (op)))
1795 break;
1797 op = gimple_call_lhs (stmt);
1798 /* Ignore #pragma omp declare simd functions
1799 if they don't have data references in the
1800 call stmt itself. */
1801 if (j == n
1802 && !(op
1803 && (DECL_P (op)
1804 || (REFERENCE_CLASS_P (op)
1805 && get_base_address (op)))))
1806 continue;
1810 if (dump_enabled_p ())
1811 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1812 "not vectorized: loop contains function "
1813 "calls or data references that cannot "
1814 "be analyzed\n");
1815 return false;
1819 /* Analyze the data references and also adjust the minimal
1820 vectorization factor according to the loads and stores. */
1822 ok = vect_analyze_data_refs (loop_vinfo, &min_vf);
1823 if (!ok)
1825 if (dump_enabled_p ())
1826 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1827 "bad data references.\n");
1828 return false;
1831 /* Classify all cross-iteration scalar data-flow cycles.
1832 Cross-iteration cycles caused by virtual phis are analyzed separately. */
1833 vect_analyze_scalar_cycles (loop_vinfo);
1835 vect_pattern_recog (loop_vinfo);
1837 vect_fixup_scalar_cycles_with_patterns (loop_vinfo);
1839 /* Analyze the access patterns of the data-refs in the loop (consecutive,
1840 complex, etc.). FORNOW: Only handle consecutive access pattern. */
1842 ok = vect_analyze_data_ref_accesses (loop_vinfo);
1843 if (!ok)
1845 if (dump_enabled_p ())
1846 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1847 "bad data access.\n");
1848 return false;
1851 /* Data-flow analysis to detect stmts that do not need to be vectorized. */
1853 ok = vect_mark_stmts_to_be_vectorized (loop_vinfo);
1854 if (!ok)
1856 if (dump_enabled_p ())
1857 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1858 "unexpected pattern.\n");
1859 return false;
1862 /* While the rest of the analysis below depends on it in some way. */
1863 fatal = false;
1865 /* Analyze data dependences between the data-refs in the loop
1866 and adjust the maximum vectorization factor according to
1867 the dependences.
1868 FORNOW: fail at the first data dependence that we encounter. */
1870 ok = vect_analyze_data_ref_dependences (loop_vinfo, &max_vf);
1871 if (!ok
1872 || max_vf < min_vf)
1874 if (dump_enabled_p ())
1875 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1876 "bad data dependence.\n");
1877 return false;
1880 ok = vect_determine_vectorization_factor (loop_vinfo);
1881 if (!ok)
1883 if (dump_enabled_p ())
1884 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1885 "can't determine vectorization factor.\n");
1886 return false;
1888 if (max_vf < LOOP_VINFO_VECT_FACTOR (loop_vinfo))
1890 if (dump_enabled_p ())
1891 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1892 "bad data dependence.\n");
1893 return false;
1896 /* Compute the scalar iteration cost. */
1897 vect_compute_single_scalar_iteration_cost (loop_vinfo);
1899 int saved_vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1900 HOST_WIDE_INT estimated_niter;
1901 unsigned th;
1902 int min_scalar_loop_bound;
1904 /* Check the SLP opportunities in the loop, analyze and build SLP trees. */
1905 ok = vect_analyze_slp (loop_vinfo, n_stmts);
1906 if (!ok)
1907 return false;
1909 /* If there are any SLP instances mark them as pure_slp. */
1910 bool slp = vect_make_slp_decision (loop_vinfo);
1911 if (slp)
1913 /* Find stmts that need to be both vectorized and SLPed. */
1914 vect_detect_hybrid_slp (loop_vinfo);
1916 /* Update the vectorization factor based on the SLP decision. */
1917 vect_update_vf_for_slp (loop_vinfo);
1920 /* This is the point where we can re-start analysis with SLP forced off. */
1921 start_over:
1923 /* Now the vectorization factor is final. */
1924 unsigned vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1925 gcc_assert (vectorization_factor != 0);
1927 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && dump_enabled_p ())
1928 dump_printf_loc (MSG_NOTE, vect_location,
1929 "vectorization_factor = %d, niters = "
1930 HOST_WIDE_INT_PRINT_DEC "\n", vectorization_factor,
1931 LOOP_VINFO_INT_NITERS (loop_vinfo));
1933 HOST_WIDE_INT max_niter
1934 = likely_max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
1935 if ((LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1936 && (LOOP_VINFO_INT_NITERS (loop_vinfo) < vectorization_factor))
1937 || (max_niter != -1
1938 && (unsigned HOST_WIDE_INT) max_niter < vectorization_factor))
1940 if (dump_enabled_p ())
1941 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1942 "not vectorized: iteration count smaller than "
1943 "vectorization factor.\n");
1944 return false;
1947 /* Analyze the alignment of the data-refs in the loop.
1948 Fail if a data reference is found that cannot be vectorized. */
1950 ok = vect_analyze_data_refs_alignment (loop_vinfo);
1951 if (!ok)
1953 if (dump_enabled_p ())
1954 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1955 "bad data alignment.\n");
1956 return false;
1959 /* Prune the list of ddrs to be tested at run-time by versioning for alias.
1960 It is important to call pruning after vect_analyze_data_ref_accesses,
1961 since we use grouping information gathered by interleaving analysis. */
1962 ok = vect_prune_runtime_alias_test_list (loop_vinfo);
1963 if (!ok)
1964 return false;
1966 /* This pass will decide on using loop versioning and/or loop peeling in
1967 order to enhance the alignment of data references in the loop. */
1968 ok = vect_enhance_data_refs_alignment (loop_vinfo);
1969 if (!ok)
1971 if (dump_enabled_p ())
1972 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1973 "bad data alignment.\n");
1974 return false;
1977 if (slp)
1979 /* Analyze operations in the SLP instances. Note this may
1980 remove unsupported SLP instances which makes the above
1981 SLP kind detection invalid. */
1982 unsigned old_size = LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length ();
1983 vect_slp_analyze_operations (LOOP_VINFO_SLP_INSTANCES (loop_vinfo),
1984 LOOP_VINFO_TARGET_COST_DATA (loop_vinfo));
1985 if (LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length () != old_size)
1986 goto again;
1989 /* Scan all the remaining operations in the loop that are not subject
1990 to SLP and make sure they are vectorizable. */
1991 ok = vect_analyze_loop_operations (loop_vinfo);
1992 if (!ok)
1994 if (dump_enabled_p ())
1995 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1996 "bad operation or unsupported loop bound.\n");
1997 return false;
2000 /* Analyze cost. Decide if it is worthwhile to vectorize. */
2001 int min_profitable_estimate, min_profitable_iters;
2002 vect_estimate_min_profitable_iters (loop_vinfo, &min_profitable_iters,
2003 &min_profitable_estimate);
2005 if (min_profitable_iters < 0)
2007 if (dump_enabled_p ())
2008 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2009 "not vectorized: vectorization not profitable.\n");
2010 if (dump_enabled_p ())
2011 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2012 "not vectorized: vector version will never be "
2013 "profitable.\n");
2014 goto again;
2017 min_scalar_loop_bound = ((PARAM_VALUE (PARAM_MIN_VECT_LOOP_BOUND)
2018 * vectorization_factor) - 1);
2020 /* Use the cost model only if it is more conservative than user specified
2021 threshold. */
2022 th = (unsigned) min_scalar_loop_bound;
2023 if (min_profitable_iters
2024 && (!min_scalar_loop_bound
2025 || min_profitable_iters > min_scalar_loop_bound))
2026 th = (unsigned) min_profitable_iters;
2028 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = th;
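/* Illustrative example (assumed parameter value): with
   --param min-vect-loop-bound=2 and VF = 4 the user bound above is
   2 * 4 - 1 = 7, so TH becomes the larger of 7 and MIN_PROFITABLE_ITERS
   when both are non-zero.  */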
2030 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2031 && LOOP_VINFO_INT_NITERS (loop_vinfo) <= th)
2033 if (dump_enabled_p ())
2034 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2035 "not vectorized: vectorization not profitable.\n");
2036 if (dump_enabled_p ())
2037 dump_printf_loc (MSG_NOTE, vect_location,
2038 "not vectorized: iteration count smaller than user "
2039 "specified loop bound parameter or minimum profitable "
2040 "iterations (whichever is more conservative).\n");
2041 goto again;
2044 estimated_niter
2045 = estimated_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
2046 if (estimated_niter == -1)
2047 estimated_niter = max_niter;
2048 if (estimated_niter != -1
2049 && ((unsigned HOST_WIDE_INT) estimated_niter
2050 <= MAX (th, (unsigned)min_profitable_estimate)))
2052 if (dump_enabled_p ())
2053 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2054 "not vectorized: estimated iteration count too "
2055 "small.\n");
2056 if (dump_enabled_p ())
2057 dump_printf_loc (MSG_NOTE, vect_location,
2058 "not vectorized: estimated iteration count smaller "
2059 "than specified loop bound parameter or minimum "
2060 "profitable iterations (whichever is more "
2061 "conservative).\n");
2062 goto again;
2065 /* Decide whether we need to create an epilogue loop to handle
2066 remaining scalar iterations. */
2067 th = ((LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) + 1)
2068 / LOOP_VINFO_VECT_FACTOR (loop_vinfo))
2069 * LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2071 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2072 && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) > 0)
2074 if (ctz_hwi (LOOP_VINFO_INT_NITERS (loop_vinfo)
2075 - LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo))
2076 < exact_log2 (LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
2077 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
2079 else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
2080 || (tree_ctz (LOOP_VINFO_NITERS (loop_vinfo))
2081 < (unsigned)exact_log2 (LOOP_VINFO_VECT_FACTOR (loop_vinfo))
2082 /* In case of versioning, check if the maximum number of
2083 iterations is greater than th. If they are identical,
2084 the epilogue is unnecessary. */
2085 && ((!LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo)
2086 && !LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
2087 || (unsigned HOST_WIDE_INT) max_niter > th)))
2088 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
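/* In short: PEELING_FOR_NITER is set whenever the iterations remaining after
   any prologue peeling are not known to be a multiple of VF (subject to the
   versioning refinement above), so a scalar epilogue will consume the rest.  */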
2090 /* If an epilogue loop is required make sure we can create one. */
2091 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2092 || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo))
2094 if (dump_enabled_p ())
2095 dump_printf_loc (MSG_NOTE, vect_location, "epilog loop required\n");
2096 if (!vect_can_advance_ivs_p (loop_vinfo)
2097 || !slpeel_can_duplicate_loop_p (LOOP_VINFO_LOOP (loop_vinfo),
2098 single_exit (LOOP_VINFO_LOOP
2099 (loop_vinfo))))
2101 if (dump_enabled_p ())
2102 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2103 "not vectorized: can't create required "
2104 "epilog loop\n");
2105 goto again;
2109 gcc_assert (vectorization_factor
2110 == (unsigned)LOOP_VINFO_VECT_FACTOR (loop_vinfo));
2112 /* Ok to vectorize! */
2113 return true;
2115 again:
2116 /* Try again with SLP forced off but if we didn't do any SLP there is
2117 no point in re-trying. */
2118 if (!slp)
2119 return false;
2121 /* If there are reduction chains re-trying will fail anyway. */
2122 if (! LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).is_empty ())
2123 return false;
2125 /* Likewise if the grouped loads or stores in the SLP cannot be handled
2126 via interleaving or lane instructions. */
2127 slp_instance instance;
2128 slp_tree node;
2129 unsigned i, j;
2130 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
2132 stmt_vec_info vinfo;
2133 vinfo = vinfo_for_stmt
2134 (SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (instance))[0]);
2135 if (! STMT_VINFO_GROUPED_ACCESS (vinfo))
2136 continue;
2137 vinfo = vinfo_for_stmt (STMT_VINFO_GROUP_FIRST_ELEMENT (vinfo));
2138 unsigned int size = STMT_VINFO_GROUP_SIZE (vinfo);
2139 tree vectype = STMT_VINFO_VECTYPE (vinfo);
2140 if (! vect_store_lanes_supported (vectype, size)
2141 && ! vect_grouped_store_supported (vectype, size))
2142 return false;
2143 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), j, node)
2145 vinfo = vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (node)[0]);
2146 vinfo = vinfo_for_stmt (STMT_VINFO_GROUP_FIRST_ELEMENT (vinfo));
2147 bool single_element_p = !STMT_VINFO_GROUP_NEXT_ELEMENT (vinfo);
2148 size = STMT_VINFO_GROUP_SIZE (vinfo);
2149 vectype = STMT_VINFO_VECTYPE (vinfo);
2150 if (! vect_load_lanes_supported (vectype, size)
2151 && ! vect_grouped_load_supported (vectype, single_element_p,
2152 size))
2153 return false;
2157 if (dump_enabled_p ())
2158 dump_printf_loc (MSG_NOTE, vect_location,
2159 "re-trying with SLP disabled\n");
2161 /* Roll back state appropriately. No SLP this time. */
2162 slp = false;
2163 /* Restore vectorization factor as it was without SLP. */
2164 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = saved_vectorization_factor;
2165 /* Free the SLP instances. */
2166 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), j, instance)
2167 vect_free_slp_instance (instance);
2168 LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
2169 /* Reset SLP type to loop_vect on all stmts. */
2170 for (i = 0; i < LOOP_VINFO_LOOP (loop_vinfo)->num_nodes; ++i)
2172 basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
2173 for (gimple_stmt_iterator si = gsi_start_bb (bb);
2174 !gsi_end_p (si); gsi_next (&si))
2176 stmt_vec_info stmt_info = vinfo_for_stmt (gsi_stmt (si));
2177 STMT_SLP_TYPE (stmt_info) = loop_vect;
2178 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
2180 stmt_info = vinfo_for_stmt (STMT_VINFO_RELATED_STMT (stmt_info));
2181 STMT_SLP_TYPE (stmt_info) = loop_vect;
2182 for (gimple_stmt_iterator pi
2183 = gsi_start (STMT_VINFO_PATTERN_DEF_SEQ (stmt_info));
2184 !gsi_end_p (pi); gsi_next (&pi))
2186 gimple *pstmt = gsi_stmt (pi);
2187 STMT_SLP_TYPE (vinfo_for_stmt (pstmt)) = loop_vect;
2192 /* Free optimized alias test DDRS. */
2193 LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).release ();
2194 /* Reset target cost data. */
2195 destroy_cost_data (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo));
2196 LOOP_VINFO_TARGET_COST_DATA (loop_vinfo)
2197 = init_cost (LOOP_VINFO_LOOP (loop_vinfo));
2198 /* Reset assorted flags. */
2199 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
2200 LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = false;
2201 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = 0;
2203 goto start_over;
2206 /* Function vect_analyze_loop.
2208 Apply a set of analyses on LOOP, and create a loop_vec_info struct
2209 for it. The different analyses will record information in the
2210 loop_vec_info struct. */
2211 loop_vec_info
2212 vect_analyze_loop (struct loop *loop)
2214 loop_vec_info loop_vinfo;
2215 unsigned int vector_sizes;
2217 /* Autodetect first vector size we try. */
2218 current_vector_size = 0;
2219 vector_sizes = targetm.vectorize.autovectorize_vector_sizes ();
2221 if (dump_enabled_p ())
2222 dump_printf_loc (MSG_NOTE, vect_location,
2223 "===== analyze_loop_nest =====\n");
2225 if (loop_outer (loop)
2226 && loop_vec_info_for_loop (loop_outer (loop))
2227 && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop))))
2229 if (dump_enabled_p ())
2230 dump_printf_loc (MSG_NOTE, vect_location,
2231 "outer-loop already vectorized.\n");
2232 return NULL;
2235 while (1)
2237 /* Check the CFG characteristics of the loop (nesting, entry/exit). */
2238 loop_vinfo = vect_analyze_loop_form (loop);
2239 if (!loop_vinfo)
2241 if (dump_enabled_p ())
2242 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2243 "bad loop form.\n");
2244 return NULL;
2247 bool fatal = false;
2248 if (vect_analyze_loop_2 (loop_vinfo, fatal))
2250 LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1;
2252 return loop_vinfo;
2255 destroy_loop_vec_info (loop_vinfo, true);
2257 vector_sizes &= ~current_vector_size;
2258 if (fatal
2259 || vector_sizes == 0
2260 || current_vector_size == 0)
2261 return NULL;
2263 /* Try the next biggest vector size. */
2264 current_vector_size = 1 << floor_log2 (vector_sizes);
2265 if (dump_enabled_p ())
2266 dump_printf_loc (MSG_NOTE, vect_location,
2267 "***** Re-trying analysis with "
2268 "vector size %d\n", current_vector_size);
2273 /* Function reduction_code_for_scalar_code
2275 Input:
2276 CODE - tree_code of a reduction operation.
2278 Output:
2279 REDUC_CODE - the corresponding tree-code to be used to reduce the
2280 vector of partial results into a single scalar result, or ERROR_MARK
2281 if the operation is a supported reduction operation, but does not have
2282 such a tree-code.
2284 Return FALSE if CODE currently cannot be vectorized as a reduction. */
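/* For example, a loop accumulating s += a[i] maps PLUS_EXPR to REDUC_PLUS_EXPR,
   while s ^= a[i] (BIT_XOR_EXPR) is still a supported reduction but gets
   ERROR_MARK and is reduced in the epilogue by vector shifts or element
   extracts instead (see vect_model_reduction_cost below).  */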
2286 static bool
2287 reduction_code_for_scalar_code (enum tree_code code,
2288 enum tree_code *reduc_code)
2290 switch (code)
2292 case MAX_EXPR:
2293 *reduc_code = REDUC_MAX_EXPR;
2294 return true;
2296 case MIN_EXPR:
2297 *reduc_code = REDUC_MIN_EXPR;
2298 return true;
2300 case PLUS_EXPR:
2301 *reduc_code = REDUC_PLUS_EXPR;
2302 return true;
2304 case MULT_EXPR:
2305 case MINUS_EXPR:
2306 case BIT_IOR_EXPR:
2307 case BIT_XOR_EXPR:
2308 case BIT_AND_EXPR:
2309 *reduc_code = ERROR_MARK;
2310 return true;
2312 default:
2313 return false;
2318 /* Error reporting helper for vect_is_simple_reduction below. GIMPLE statement
2319 STMT is printed with a message MSG. */
2321 static void
2322 report_vect_op (int msg_type, gimple *stmt, const char *msg)
2324 dump_printf_loc (msg_type, vect_location, "%s", msg);
2325 dump_gimple_stmt (msg_type, TDF_SLIM, stmt, 0);
2329 /* Detect SLP reduction of the form:
2331 #a1 = phi <a5, a0>
2332 a2 = operation (a1)
2333 a3 = operation (a2)
2334 a4 = operation (a3)
2335 a5 = operation (a4)
2337 #a = phi <a5>
2339 PHI is the reduction phi node (#a1 = phi <a5, a0> above)
2340 FIRST_STMT is the first reduction stmt in the chain
2341 (a2 = operation (a1)).
2343 Return TRUE if a reduction chain was detected. */
2345 static bool
2346 vect_is_slp_reduction (loop_vec_info loop_info, gimple *phi,
2347 gimple *first_stmt)
2349 struct loop *loop = (gimple_bb (phi))->loop_father;
2350 struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
2351 enum tree_code code;
2352 gimple *current_stmt = NULL, *loop_use_stmt = NULL, *first, *next_stmt;
2353 stmt_vec_info use_stmt_info, current_stmt_info;
2354 tree lhs;
2355 imm_use_iterator imm_iter;
2356 use_operand_p use_p;
2357 int nloop_uses, size = 0, n_out_of_loop_uses;
2358 bool found = false;
2360 if (loop != vect_loop)
2361 return false;
2363 lhs = PHI_RESULT (phi);
2364 code = gimple_assign_rhs_code (first_stmt);
2365 while (1)
2367 nloop_uses = 0;
2368 n_out_of_loop_uses = 0;
2369 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
2371 gimple *use_stmt = USE_STMT (use_p);
2372 if (is_gimple_debug (use_stmt))
2373 continue;
2375 /* Check if we got back to the reduction phi. */
2376 if (use_stmt == phi)
2378 loop_use_stmt = use_stmt;
2379 found = true;
2380 break;
2383 if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2385 loop_use_stmt = use_stmt;
2386 nloop_uses++;
2388 else
2389 n_out_of_loop_uses++;
2391 /* There can be either a single use in the loop or two uses in
2392 phi nodes. */
2393 if (nloop_uses > 1 || (n_out_of_loop_uses && nloop_uses))
2394 return false;
2397 if (found)
2398 break;
2400 /* We reached a statement with no loop uses. */
2401 if (nloop_uses == 0)
2402 return false;
2404 /* This is a loop exit phi, and we haven't reached the reduction phi. */
2405 if (gimple_code (loop_use_stmt) == GIMPLE_PHI)
2406 return false;
2408 if (!is_gimple_assign (loop_use_stmt)
2409 || code != gimple_assign_rhs_code (loop_use_stmt)
2410 || !flow_bb_inside_loop_p (loop, gimple_bb (loop_use_stmt)))
2411 return false;
2413 /* Insert USE_STMT into reduction chain. */
2414 use_stmt_info = vinfo_for_stmt (loop_use_stmt);
2415 if (current_stmt)
2417 current_stmt_info = vinfo_for_stmt (current_stmt);
2418 GROUP_NEXT_ELEMENT (current_stmt_info) = loop_use_stmt;
2419 GROUP_FIRST_ELEMENT (use_stmt_info)
2420 = GROUP_FIRST_ELEMENT (current_stmt_info);
2422 else
2423 GROUP_FIRST_ELEMENT (use_stmt_info) = loop_use_stmt;
2425 lhs = gimple_assign_lhs (loop_use_stmt);
2426 current_stmt = loop_use_stmt;
2427 size++;
2430 if (!found || loop_use_stmt != phi || size < 2)
2431 return false;
2433 /* Swap the operands, if needed, to make the reduction operand be the second
2434 operand. */
2435 lhs = PHI_RESULT (phi);
2436 next_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (current_stmt));
2437 while (next_stmt)
2439 if (gimple_assign_rhs2 (next_stmt) == lhs)
2441 tree op = gimple_assign_rhs1 (next_stmt);
2442 gimple *def_stmt = NULL;
2444 if (TREE_CODE (op) == SSA_NAME)
2445 def_stmt = SSA_NAME_DEF_STMT (op);
2447 /* Check that the other def is either defined in the loop
2448 ("vect_internal_def"), or it's an induction (defined by a
2449 loop-header phi-node). */
2450 if (def_stmt
2451 && gimple_bb (def_stmt)
2452 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2453 && (is_gimple_assign (def_stmt)
2454 || is_gimple_call (def_stmt)
2455 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2456 == vect_induction_def
2457 || (gimple_code (def_stmt) == GIMPLE_PHI
2458 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2459 == vect_internal_def
2460 && !is_loop_header_bb_p (gimple_bb (def_stmt)))))
2462 lhs = gimple_assign_lhs (next_stmt);
2463 next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
2464 continue;
2467 return false;
2469 else
2471 tree op = gimple_assign_rhs2 (next_stmt);
2472 gimple *def_stmt = NULL;
2474 if (TREE_CODE (op) == SSA_NAME)
2475 def_stmt = SSA_NAME_DEF_STMT (op);
2477 /* Check that the other def is either defined in the loop
2478 ("vect_internal_def"), or it's an induction (defined by a
2479 loop-header phi-node). */
2480 if (def_stmt
2481 && gimple_bb (def_stmt)
2482 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2483 && (is_gimple_assign (def_stmt)
2484 || is_gimple_call (def_stmt)
2485 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2486 == vect_induction_def
2487 || (gimple_code (def_stmt) == GIMPLE_PHI
2488 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2489 == vect_internal_def
2490 && !is_loop_header_bb_p (gimple_bb (def_stmt)))))
2492 if (dump_enabled_p ())
2494 dump_printf_loc (MSG_NOTE, vect_location, "swapping oprnds: ");
2495 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, next_stmt, 0);
2498 swap_ssa_operands (next_stmt,
2499 gimple_assign_rhs1_ptr (next_stmt),
2500 gimple_assign_rhs2_ptr (next_stmt));
2501 update_stmt (next_stmt);
2503 if (CONSTANT_CLASS_P (gimple_assign_rhs1 (next_stmt)))
2504 LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true;
2506 else
2507 return false;
2510 lhs = gimple_assign_lhs (next_stmt);
2511 next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
2514 /* Save the chain for further analysis in SLP detection. */
2515 first = GROUP_FIRST_ELEMENT (vinfo_for_stmt (current_stmt));
2516 LOOP_VINFO_REDUCTION_CHAINS (loop_info).safe_push (first);
2517 GROUP_SIZE (vinfo_for_stmt (first)) = size;
2519 return true;
2523 /* Function vect_is_simple_reduction
2525 (1) Detect a cross-iteration def-use cycle that represents a simple
2526 reduction computation. We look for the following pattern:
2528 loop_header:
2529 a1 = phi < a0, a2 >
2530 a3 = ...
2531 a2 = operation (a3, a1)
2535 a3 = ...
2536 loop_header:
2537 a1 = phi < a0, a2 >
2538 a2 = operation (a3, a1)
2540 such that:
2541 1. operation is commutative and associative and it is safe to
2542 change the order of the computation (if CHECK_REDUCTION is true)
2543 2. no uses for a2 in the loop (a2 is used out of the loop)
2544 3. no uses of a1 in the loop besides the reduction operation
2545 4. no uses of a1 outside the loop.
2547 Conditions 1,4 are tested here.
2548 Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.
2550 (2) Detect a cross-iteration def-use cycle in nested loops, i.e.,
2551 nested cycles, if CHECK_REDUCTION is false.
2553 (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double
2554 reductions:
2556 a1 = phi < a0, a2 >
2557 inner loop (def of a3)
2558 a2 = phi < a3 >
2560 (4) Detect condition expressions, i.e.:
2561 for (int i = 0; i < N; i++)
2562 if (a[i] < val)
2563 ret_val = a[i];
2567 static gimple *
2568 vect_is_simple_reduction (loop_vec_info loop_info, gimple *phi,
2569 bool check_reduction, bool *double_reduc,
2570 bool need_wrapping_integral_overflow,
2571 enum vect_reduction_type *v_reduc_type)
2573 struct loop *loop = (gimple_bb (phi))->loop_father;
2574 struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
2575 edge latch_e = loop_latch_edge (loop);
2576 tree loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
2577 gimple *def_stmt, *def1 = NULL, *def2 = NULL, *phi_use_stmt = NULL;
2578 enum tree_code orig_code, code;
2579 tree op1, op2, op3 = NULL_TREE, op4 = NULL_TREE;
2580 tree type;
2581 int nloop_uses;
2582 tree name;
2583 imm_use_iterator imm_iter;
2584 use_operand_p use_p;
2585 bool phi_def;
2587 *double_reduc = false;
2588 *v_reduc_type = TREE_CODE_REDUCTION;
2590 /* If CHECK_REDUCTION is true, we assume inner-most loop vectorization,
2591 otherwise, we assume outer loop vectorization. */
2592 gcc_assert ((check_reduction && loop == vect_loop)
2593 || (!check_reduction && flow_loop_nested_p (vect_loop, loop)));
2595 name = PHI_RESULT (phi);
2596 /* ??? If there are no uses of the PHI result the inner loop reduction
2597 won't be detected as possibly double-reduction by vectorizable_reduction
2598 because that tries to walk the PHI arg from the preheader edge which
2599 can be constant. See PR60382. */
2600 if (has_zero_uses (name))
2601 return NULL;
2602 nloop_uses = 0;
2603 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name)
2605 gimple *use_stmt = USE_STMT (use_p);
2606 if (is_gimple_debug (use_stmt))
2607 continue;
2609 if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2611 if (dump_enabled_p ())
2612 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2613 "intermediate value used outside loop.\n");
2615 return NULL;
2618 nloop_uses++;
2619 if (nloop_uses > 1)
2621 if (dump_enabled_p ())
2622 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2623 "reduction used in loop.\n");
2624 return NULL;
2627 phi_use_stmt = use_stmt;
2630 if (TREE_CODE (loop_arg) != SSA_NAME)
2632 if (dump_enabled_p ())
2634 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2635 "reduction: not ssa_name: ");
2636 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, loop_arg);
2637 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
2639 return NULL;
2642 def_stmt = SSA_NAME_DEF_STMT (loop_arg);
2643 if (!def_stmt)
2645 if (dump_enabled_p ())
2646 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2647 "reduction: no def_stmt.\n");
2648 return NULL;
2651 if (!is_gimple_assign (def_stmt) && gimple_code (def_stmt) != GIMPLE_PHI)
2653 if (dump_enabled_p ())
2654 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, def_stmt, 0);
2655 return NULL;
2658 if (is_gimple_assign (def_stmt))
2660 name = gimple_assign_lhs (def_stmt);
2661 phi_def = false;
2663 else
2665 name = PHI_RESULT (def_stmt);
2666 phi_def = true;
2669 nloop_uses = 0;
2670 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name)
2672 gimple *use_stmt = USE_STMT (use_p);
2673 if (is_gimple_debug (use_stmt))
2674 continue;
2675 if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2676 nloop_uses++;
2677 if (nloop_uses > 1)
2679 if (dump_enabled_p ())
2680 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2681 "reduction used in loop.\n");
2682 return NULL;
2686 /* If DEF_STMT is a phi node itself, we expect it to have a single argument
2687 defined in the inner loop. */
2688 if (phi_def)
2690 op1 = PHI_ARG_DEF (def_stmt, 0);
2692 if (gimple_phi_num_args (def_stmt) != 1
2693 || TREE_CODE (op1) != SSA_NAME)
2695 if (dump_enabled_p ())
2696 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2697 "unsupported phi node definition.\n");
2699 return NULL;
2702 def1 = SSA_NAME_DEF_STMT (op1);
2703 if (gimple_bb (def1)
2704 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2705 && loop->inner
2706 && flow_bb_inside_loop_p (loop->inner, gimple_bb (def1))
2707 && is_gimple_assign (def1)
2708 && flow_bb_inside_loop_p (loop->inner, gimple_bb (phi_use_stmt)))
2710 if (dump_enabled_p ())
2711 report_vect_op (MSG_NOTE, def_stmt,
2712 "detected double reduction: ");
2714 *double_reduc = true;
2715 return def_stmt;
2718 return NULL;
2721 code = orig_code = gimple_assign_rhs_code (def_stmt);
2723 /* We can handle "res -= x[i]", which is non-associative by
2724 simply rewriting this into "res += -x[i]". Avoid changing
2725 the gimple instruction during the first simple tests and only do this
2726 if we're allowed to change code at all. */
2727 if (code == MINUS_EXPR
2728 && (op1 = gimple_assign_rhs1 (def_stmt))
2729 && TREE_CODE (op1) == SSA_NAME
2730 && SSA_NAME_DEF_STMT (op1) == phi)
2731 code = PLUS_EXPR;
2733 if (code == COND_EXPR)
2735 if (check_reduction)
2736 *v_reduc_type = COND_REDUCTION;
2738 else if (!commutative_tree_code (code) || !associative_tree_code (code))
2740 if (dump_enabled_p ())
2741 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2742 "reduction: not commutative/associative: ");
2743 return NULL;
2746 if (get_gimple_rhs_class (code) != GIMPLE_BINARY_RHS)
2748 if (code != COND_EXPR)
2750 if (dump_enabled_p ())
2751 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2752 "reduction: not binary operation: ");
2754 return NULL;
2757 op3 = gimple_assign_rhs1 (def_stmt);
2758 if (COMPARISON_CLASS_P (op3))
2760 op4 = TREE_OPERAND (op3, 1);
2761 op3 = TREE_OPERAND (op3, 0);
2764 op1 = gimple_assign_rhs2 (def_stmt);
2765 op2 = gimple_assign_rhs3 (def_stmt);
2767 if (TREE_CODE (op1) != SSA_NAME && TREE_CODE (op2) != SSA_NAME)
2769 if (dump_enabled_p ())
2770 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2771 "reduction: uses not ssa_names: ");
2773 return NULL;
2776 else
2778 op1 = gimple_assign_rhs1 (def_stmt);
2779 op2 = gimple_assign_rhs2 (def_stmt);
2781 if (TREE_CODE (op1) != SSA_NAME && TREE_CODE (op2) != SSA_NAME)
2783 if (dump_enabled_p ())
2784 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2785 "reduction: uses not ssa_names: ");
2787 return NULL;
2791 type = TREE_TYPE (gimple_assign_lhs (def_stmt));
2792 if ((TREE_CODE (op1) == SSA_NAME
2793 && !types_compatible_p (type,TREE_TYPE (op1)))
2794 || (TREE_CODE (op2) == SSA_NAME
2795 && !types_compatible_p (type, TREE_TYPE (op2)))
2796 || (op3 && TREE_CODE (op3) == SSA_NAME
2797 && !types_compatible_p (type, TREE_TYPE (op3)))
2798 || (op4 && TREE_CODE (op4) == SSA_NAME
2799 && !types_compatible_p (type, TREE_TYPE (op4))))
2801 if (dump_enabled_p ())
2803 dump_printf_loc (MSG_NOTE, vect_location,
2804 "reduction: multiple types: operation type: ");
2805 dump_generic_expr (MSG_NOTE, TDF_SLIM, type);
2806 dump_printf (MSG_NOTE, ", operands types: ");
2807 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2808 TREE_TYPE (op1));
2809 dump_printf (MSG_NOTE, ",");
2810 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2811 TREE_TYPE (op2));
2812 if (op3)
2814 dump_printf (MSG_NOTE, ",");
2815 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2816 TREE_TYPE (op3));
2819 if (op4)
2821 dump_printf (MSG_NOTE, ",");
2822 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2823 TREE_TYPE (op4));
2825 dump_printf (MSG_NOTE, "\n");
2828 return NULL;
2831 /* Check that it's ok to change the order of the computation.
2832 Generally, when vectorizing a reduction we change the order of the
2833 computation. This may change the behavior of the program in some
2834 cases, so we need to check that this is ok. One exception is when
2835 vectorizing an outer-loop: the inner-loop is executed sequentially,
2836 and therefore vectorizing reductions in the inner-loop during
2837 outer-loop vectorization is safe. */
2839 if (*v_reduc_type != COND_REDUCTION
2840 && check_reduction)
2842 /* CHECKME: check for !flag_finite_math_only too? */
2843 if (SCALAR_FLOAT_TYPE_P (type) && !flag_associative_math)
2845 /* Changing the order of operations changes the semantics. */
2846 if (dump_enabled_p ())
2847 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2848 "reduction: unsafe fp math optimization: ");
2849 return NULL;
2851 else if (INTEGRAL_TYPE_P (type))
2853 if (!operation_no_trapping_overflow (type, code))
2855 /* Changing the order of operations changes the semantics. */
2856 if (dump_enabled_p ())
2857 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2858 "reduction: unsafe int math optimization"
2859 " (overflow traps): ");
2860 return NULL;
2862 if (need_wrapping_integral_overflow
2863 && !TYPE_OVERFLOW_WRAPS (type)
2864 && operation_can_overflow (code))
2866 /* Changing the order of operations changes the semantics. */
2867 if (dump_enabled_p ())
2868 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2869 "reduction: unsafe int math optimization"
2870 " (overflow doesn't wrap): ");
2871 return NULL;
2874 else if (SAT_FIXED_POINT_TYPE_P (type))
2876 /* Changing the order of operations changes the semantics. */
2877 if (dump_enabled_p ())
2878 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2879 "reduction: unsafe fixed-point math optimization: ");
2880 return NULL;
2884 /* Reduction is safe. We're dealing with one of the following:
2885 1) integer arithmetic and no trapv
2886 2) floating point arithmetic, and special flags permit this optimization
2887 3) nested cycle (i.e., outer loop vectorization). */
2888 if (TREE_CODE (op1) == SSA_NAME)
2889 def1 = SSA_NAME_DEF_STMT (op1);
2891 if (TREE_CODE (op2) == SSA_NAME)
2892 def2 = SSA_NAME_DEF_STMT (op2);
2894 if (code != COND_EXPR
2895 && ((!def1 || gimple_nop_p (def1)) && (!def2 || gimple_nop_p (def2))))
2897 if (dump_enabled_p ())
2898 report_vect_op (MSG_NOTE, def_stmt, "reduction: no defs for operands: ");
2899 return NULL;
2902 /* Check that one def is the reduction def, defined by PHI,
2903 the other def is either defined in the loop ("vect_internal_def"),
2904 or it's an induction (defined by a loop-header phi-node). */
2906 if (def2 && def2 == phi
2907 && (code == COND_EXPR
2908 || !def1 || gimple_nop_p (def1)
2909 || !flow_bb_inside_loop_p (loop, gimple_bb (def1))
2910 || (def1 && flow_bb_inside_loop_p (loop, gimple_bb (def1))
2911 && (is_gimple_assign (def1)
2912 || is_gimple_call (def1)
2913 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1))
2914 == vect_induction_def
2915 || (gimple_code (def1) == GIMPLE_PHI
2916 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1))
2917 == vect_internal_def
2918 && !is_loop_header_bb_p (gimple_bb (def1)))))))
2920 if (dump_enabled_p ())
2921 report_vect_op (MSG_NOTE, def_stmt, "detected reduction: ");
2922 return def_stmt;
2925 if (def1 && def1 == phi
2926 && (code == COND_EXPR
2927 || !def2 || gimple_nop_p (def2)
2928 || !flow_bb_inside_loop_p (loop, gimple_bb (def2))
2929 || (def2 && flow_bb_inside_loop_p (loop, gimple_bb (def2))
2930 && (is_gimple_assign (def2)
2931 || is_gimple_call (def2)
2932 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2))
2933 == vect_induction_def
2934 || (gimple_code (def2) == GIMPLE_PHI
2935 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2))
2936 == vect_internal_def
2937 && !is_loop_header_bb_p (gimple_bb (def2)))))))
2939 if (check_reduction
2940 && orig_code != MINUS_EXPR)
2942 if (code == COND_EXPR)
2944 /* No currently known use where this case would be useful. */
2945 if (dump_enabled_p ())
2946 report_vect_op (MSG_NOTE, def_stmt,
2947 "detected reduction: cannot currently swap "
2948 "operands for cond_expr");
2949 return NULL;
2952 /* Swap operands (just for simplicity - so that the rest of the code
2953 can assume that the reduction variable is always the last (second)
2954 argument). */
2955 if (dump_enabled_p ())
2956 report_vect_op (MSG_NOTE, def_stmt,
2957 "detected reduction: need to swap operands: ");
2959 swap_ssa_operands (def_stmt, gimple_assign_rhs1_ptr (def_stmt),
2960 gimple_assign_rhs2_ptr (def_stmt));
2962 if (CONSTANT_CLASS_P (gimple_assign_rhs1 (def_stmt)))
2963 LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true;
2965 else
2967 if (dump_enabled_p ())
2968 report_vect_op (MSG_NOTE, def_stmt, "detected reduction: ");
2971 return def_stmt;
2974 /* Try to find SLP reduction chain. */
2975 if (check_reduction && code != COND_EXPR
2976 && vect_is_slp_reduction (loop_info, phi, def_stmt))
2978 if (dump_enabled_p ())
2979 report_vect_op (MSG_NOTE, def_stmt,
2980 "reduction: detected reduction chain: ");
2982 return def_stmt;
2985 if (dump_enabled_p ())
2986 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2987 "reduction: unknown pattern: ");
2989 return NULL;
2992 /* Wrapper around vect_is_simple_reduction, which will modify code
2993 in-place if it enables detection of more reductions. Arguments are
2994 the same as there. */
2996 gimple *
2997 vect_force_simple_reduction (loop_vec_info loop_info, gimple *phi,
2998 bool check_reduction, bool *double_reduc,
2999 bool need_wrapping_integral_overflow)
3001 enum vect_reduction_type v_reduc_type;
3002 return vect_is_simple_reduction (loop_info, phi, check_reduction,
3003 double_reduc,
3004 need_wrapping_integral_overflow,
3005 &v_reduc_type);
3008 /* Calculate cost of peeling the loop PEEL_ITERS_PROLOGUE times. */
3009 int
3010 vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue,
3011 int *peel_iters_epilogue,
3012 stmt_vector_for_cost *scalar_cost_vec,
3013 stmt_vector_for_cost *prologue_cost_vec,
3014 stmt_vector_for_cost *epilogue_cost_vec)
3016 int retval = 0;
3017 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3019 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
3021 *peel_iters_epilogue = vf/2;
3022 if (dump_enabled_p ())
3023 dump_printf_loc (MSG_NOTE, vect_location,
3024 "cost model: epilogue peel iters set to vf/2 "
3025 "because loop iterations are unknown .\n");
3027 /* If peeled iterations are known but number of scalar loop
3028 iterations is unknown, count a taken branch per peeled loop. */
3029 retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
3030 NULL, 0, vect_prologue);
3031 retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
3032 NULL, 0, vect_epilogue);
3034 else
3036 int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
3037 peel_iters_prologue = niters < peel_iters_prologue ?
3038 niters : peel_iters_prologue;
3039 *peel_iters_epilogue = (niters - peel_iters_prologue) % vf;
3040 /* If we need to peel for gaps, but no peeling is required, we have to
3041 peel VF iterations. */
3042 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) && !*peel_iters_epilogue)
3043 *peel_iters_epilogue = vf;
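/* For example, with VF = 4 and a known iteration count that is a multiple of
   4 the epilogue would otherwise be empty, but peeling for gaps forces a full
   VF-iteration scalar epilogue.  */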
3046 stmt_info_for_cost *si;
3047 int j;
3048 if (peel_iters_prologue)
3049 FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
3050 retval += record_stmt_cost (prologue_cost_vec,
3051 si->count * peel_iters_prologue,
3052 si->kind, NULL, si->misalign,
3053 vect_prologue);
3054 if (*peel_iters_epilogue)
3055 FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
3056 retval += record_stmt_cost (epilogue_cost_vec,
3057 si->count * *peel_iters_epilogue,
3058 si->kind, NULL, si->misalign,
3059 vect_epilogue);
3061 return retval;
3064 /* Function vect_estimate_min_profitable_iters
3066 Return the number of iterations required for the vector version of the
3067 loop to be profitable relative to the cost of the scalar version of the
3068 loop.
3070 *RET_MIN_PROFITABLE_NITERS is a cost model profitability threshold
3071 of iterations for vectorization. -1 value means loop vectorization
3072 is not profitable. This returned value may be used for dynamic
3073 profitability check.
3075 *RET_MIN_PROFITABLE_ESTIMATE is a profitability threshold to be used
3076 for static check against estimated number of iterations. */
3078 static void
3079 vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
3080 int *ret_min_profitable_niters,
3081 int *ret_min_profitable_estimate)
3083 int min_profitable_iters;
3084 int min_profitable_estimate;
3085 int peel_iters_prologue;
3086 int peel_iters_epilogue;
3087 unsigned vec_inside_cost = 0;
3088 int vec_outside_cost = 0;
3089 unsigned vec_prologue_cost = 0;
3090 unsigned vec_epilogue_cost = 0;
3091 int scalar_single_iter_cost = 0;
3092 int scalar_outside_cost = 0;
3093 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3094 int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
3095 void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3097 /* Cost model disabled. */
3098 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
3100 dump_printf_loc (MSG_NOTE, vect_location, "cost model disabled.\n");
3101 *ret_min_profitable_niters = 0;
3102 *ret_min_profitable_estimate = 0;
3103 return;
3106 /* Requires loop versioning tests to handle misalignment. */
3107 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
3109 /* FIXME: Make cost depend on complexity of individual check. */
3110 unsigned len = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ();
3111 (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
3112 vect_prologue);
3113 dump_printf (MSG_NOTE,
3114 "cost model: Adding cost of checks for loop "
3115 "versioning to treat misalignment.\n");
3118 /* Requires loop versioning with alias checks. */
3119 if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
3121 /* FIXME: Make cost depend on complexity of individual check. */
3122 unsigned len = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).length ();
3123 (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
3124 vect_prologue);
3125 dump_printf (MSG_NOTE,
3126 "cost model: Adding cost of checks for loop "
3127 "versioning aliasing.\n");
3130 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
3131 || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
3132 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken, NULL, 0,
3133 vect_prologue);
3135 /* Count statements in scalar loop. Using this as scalar cost for a single
3136 iteration for now.
3138 TODO: Add outer loop support.
3140 TODO: Consider assigning different costs to different scalar
3141 statements. */
3143 scalar_single_iter_cost
3144 = LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo);
3146 /* Add additional cost for the peeled instructions in prologue and epilogue
3147 loop.
3149 FORNOW: If we don't know the value of peel_iters for prologue or epilogue
3150 at compile-time - we assume it's vf/2 (the worst would be vf-1).
3152 TODO: Build an expression that represents peel_iters for prologue and
3153 epilogue to be used in a run-time test. */
3155 if (npeel < 0)
3157 peel_iters_prologue = vf/2;
3158 dump_printf (MSG_NOTE, "cost model: "
3159 "prologue peel iters set to vf/2.\n");
3161 /* If peeling for alignment is unknown, loop bound of main loop becomes
3162 unknown. */
3163 peel_iters_epilogue = vf/2;
3164 dump_printf (MSG_NOTE, "cost model: "
3165 "epilogue peel iters set to vf/2 because "
3166 "peeling for alignment is unknown.\n");
3168 /* If peeled iterations are unknown, count a taken branch and a not taken
3169 branch per peeled loop. Even if scalar loop iterations are known,
3170 vector iterations are not known since peeled prologue iterations are
3171 not known. Hence guards remain the same. */
3172 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
3173 NULL, 0, vect_prologue);
3174 (void) add_stmt_cost (target_cost_data, 1, cond_branch_not_taken,
3175 NULL, 0, vect_prologue);
3176 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
3177 NULL, 0, vect_epilogue);
3178 (void) add_stmt_cost (target_cost_data, 1, cond_branch_not_taken,
3179 NULL, 0, vect_epilogue);
3180 stmt_info_for_cost *si;
3181 int j;
3182 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
3184 struct _stmt_vec_info *stmt_info
3185 = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
3186 (void) add_stmt_cost (target_cost_data,
3187 si->count * peel_iters_prologue,
3188 si->kind, stmt_info, si->misalign,
3189 vect_prologue);
3190 (void) add_stmt_cost (target_cost_data,
3191 si->count * peel_iters_epilogue,
3192 si->kind, stmt_info, si->misalign,
3193 vect_epilogue);
3196 else
3198 stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
3199 stmt_info_for_cost *si;
3200 int j;
3201 void *data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3203 prologue_cost_vec.create (2);
3204 epilogue_cost_vec.create (2);
3205 peel_iters_prologue = npeel;
3207 (void) vect_get_known_peeling_cost (loop_vinfo, peel_iters_prologue,
3208 &peel_iters_epilogue,
3209 &LOOP_VINFO_SCALAR_ITERATION_COST
3210 (loop_vinfo),
3211 &prologue_cost_vec,
3212 &epilogue_cost_vec);
3214 FOR_EACH_VEC_ELT (prologue_cost_vec, j, si)
3216 struct _stmt_vec_info *stmt_info
3217 = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
3218 (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
3219 si->misalign, vect_prologue);
3222 FOR_EACH_VEC_ELT (epilogue_cost_vec, j, si)
3224 struct _stmt_vec_info *stmt_info
3225 = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
3226 (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
3227 si->misalign, vect_epilogue);
3230 prologue_cost_vec.release ();
3231 epilogue_cost_vec.release ();
3234 /* FORNOW: The scalar outside cost is incremented in one of the
3235 following ways:
3237 1. The vectorizer checks for alignment and aliasing and generates
3238 a condition that allows dynamic vectorization. A cost model
3239 check is ANDED with the versioning condition. Hence scalar code
3240 path now has the added cost of the versioning check.
3242 if (cost > th & versioning_check)
3243 jmp to vector code
3245 Hence run-time scalar is incremented by not-taken branch cost.
3247 2. The vectorizer then checks if a prologue is required. If the
3248 cost model check was not done before during versioning, it has to
3249 be done before the prologue check.
3251 if (cost <= th)
3252 prologue = scalar_iters
3253 if (prologue == 0)
3254 jmp to vector code
3255 else
3256 execute prologue
3257 if (prologue == num_iters)
3258 go to exit
3260 Hence the run-time scalar cost is incremented by a taken branch,
3261 plus a not-taken branch, plus a taken branch cost.
3263 3. The vectorizer then checks if an epilogue is required. If the
3264 cost model check was not done before during prologue check, it
3265 has to be done with the epilogue check.
3267 if (prologue == 0)
3268 jmp to vector code
3269 else
3270 execute prologue
3271 if (prologue == num_iters)
3272 go to exit
3273 vector code:
3274 if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
3275 jmp to epilogue
3277 Hence the run-time scalar cost should be incremented by 2 taken
3278 branches.
3280 TODO: The back end may reorder the BBs differently and reverse
3281 conditions/branch directions. Change the estimates below to
3282 something more reasonable. */
3284 /* If the number of iterations is known and we do not do versioning, we can
3285 decide whether to vectorize at compile time. Hence the scalar version
3286 does not carry cost model guard costs. */
3287 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
3288 || LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
3289 || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
3291 /* Cost model check occurs at versioning. */
3292 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
3293 || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
3294 scalar_outside_cost += vect_get_stmt_cost (cond_branch_not_taken);
3295 else
3297 /* Cost model check occurs at prologue generation. */
3298 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
3299 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken)
3300 + vect_get_stmt_cost (cond_branch_not_taken);
3301 /* Cost model check occurs at epilogue generation. */
3302 else
3303 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken);
3307 /* Complete the target-specific cost calculations. */
3308 finish_cost (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo), &vec_prologue_cost,
3309 &vec_inside_cost, &vec_epilogue_cost);
3311 vec_outside_cost = (int)(vec_prologue_cost + vec_epilogue_cost);
3313 if (dump_enabled_p ())
3315 dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
3316 dump_printf (MSG_NOTE, " Vector inside of loop cost: %d\n",
3317 vec_inside_cost);
3318 dump_printf (MSG_NOTE, " Vector prologue cost: %d\n",
3319 vec_prologue_cost);
3320 dump_printf (MSG_NOTE, " Vector epilogue cost: %d\n",
3321 vec_epilogue_cost);
3322 dump_printf (MSG_NOTE, " Scalar iteration cost: %d\n",
3323 scalar_single_iter_cost);
3324 dump_printf (MSG_NOTE, " Scalar outside cost: %d\n",
3325 scalar_outside_cost);
3326 dump_printf (MSG_NOTE, " Vector outside cost: %d\n",
3327 vec_outside_cost);
3328 dump_printf (MSG_NOTE, " prologue iterations: %d\n",
3329 peel_iters_prologue);
3330 dump_printf (MSG_NOTE, " epilogue iterations: %d\n",
3331 peel_iters_epilogue);
3334 /* Calculate number of iterations required to make the vector version
3335 profitable, relative to the loop bodies only. The following condition
3336 must hold true:
3337 SIC * niters + SOC > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC
3338 where
3339 SIC = scalar iteration cost, VIC = vector iteration cost,
3340 VOC = vector outside cost, VF = vectorization factor,
3341 PL_ITERS = prologue iterations, EP_ITERS= epilogue iterations
3342 SOC = scalar outside cost for run time cost model check. */
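/* Worked example with purely illustrative numbers: SIC = 4, VIC = 6, VF = 4,
   VOC = 20, SOC = 0 and no prologue/epilogue peeling gives
   (20 - 0) * 4 / (4 * 4 - 6) = 8 below, bumped to 9 by the check since
   4 * 4 * 8 == 6 * 8 + 20 * 4.  */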
3344 if ((scalar_single_iter_cost * vf) > (int) vec_inside_cost)
3346 if (vec_outside_cost <= 0)
3347 min_profitable_iters = 1;
3348 else
3350 min_profitable_iters = ((vec_outside_cost - scalar_outside_cost) * vf
3351 - vec_inside_cost * peel_iters_prologue
3352 - vec_inside_cost * peel_iters_epilogue)
3353 / ((scalar_single_iter_cost * vf)
3354 - vec_inside_cost);
3356 if ((scalar_single_iter_cost * vf * min_profitable_iters)
3357 <= (((int) vec_inside_cost * min_profitable_iters)
3358 + (((int) vec_outside_cost - scalar_outside_cost) * vf)))
3359 min_profitable_iters++;
3362 /* vector version will never be profitable. */
3363 else
3365 if (LOOP_VINFO_LOOP (loop_vinfo)->force_vectorize)
3366 warning_at (vect_location, OPT_Wopenmp_simd, "vectorization "
3367 "did not happen for a simd loop");
3369 if (dump_enabled_p ())
3370 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3371 "cost model: the vector iteration cost = %d "
3372 "divided by the scalar iteration cost = %d "
3373 "is greater or equal to the vectorization factor = %d"
3374 ".\n",
3375 vec_inside_cost, scalar_single_iter_cost, vf);
3376 *ret_min_profitable_niters = -1;
3377 *ret_min_profitable_estimate = -1;
3378 return;
3381 dump_printf (MSG_NOTE,
3382 " Calculated minimum iters for profitability: %d\n",
3383 min_profitable_iters);
3385 min_profitable_iters =
3386 min_profitable_iters < vf ? vf : min_profitable_iters;
3388 /* Because the condition we create is:
3389 if (niters <= min_profitable_iters)
3390 then skip the vectorized loop. */
3391 min_profitable_iters--;
3393 if (dump_enabled_p ())
3394 dump_printf_loc (MSG_NOTE, vect_location,
3395 " Runtime profitability threshold = %d\n",
3396 min_profitable_iters);
3398 *ret_min_profitable_niters = min_profitable_iters;
3400 /* Calculate number of iterations required to make the vector version
3401 profitable, relative to the loop bodies only.
3403 Non-vectorized variant costs SIC * niters and it must cost more than the vector
3404 variant on the expected loop trip count. The following condition must hold true:
3405 SIC * niters > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC + SOC */
3407 if (vec_outside_cost <= 0)
3408 min_profitable_estimate = 1;
3409 else
3411 min_profitable_estimate = ((vec_outside_cost + scalar_outside_cost) * vf
3412 - vec_inside_cost * peel_iters_prologue
3413 - vec_inside_cost * peel_iters_epilogue)
3414 / ((scalar_single_iter_cost * vf)
3415 - vec_inside_cost);
3417 min_profitable_estimate --;
3418 min_profitable_estimate = MAX (min_profitable_estimate, min_profitable_iters);
3419 if (dump_enabled_p ())
3420 dump_printf_loc (MSG_NOTE, vect_location,
3421 " Static estimate profitability threshold = %d\n",
3422 min_profitable_estimate);
3424 *ret_min_profitable_estimate = min_profitable_estimate;
3427 /* Writes into SEL a mask for a vec_perm, equivalent to a vec_shr by OFFSET
3428 vector elements (not bits) for a vector of mode MODE. */
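/* For example, for a 4-element mode and OFFSET 1 the selector is
   {1, 2, 3, 4}: indices >= NELT select from the second vec_perm operand, so
   the result is the input shifted down by one element.  */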
3429 static void
3430 calc_vec_perm_mask_for_shift (enum machine_mode mode, unsigned int offset,
3431 unsigned char *sel)
3433 unsigned int i, nelt = GET_MODE_NUNITS (mode);
3435 for (i = 0; i < nelt; i++)
3436 sel[i] = (i + offset) & (2*nelt - 1);
3439 /* Checks whether the target supports whole-vector shifts for vectors of mode
3440 MODE. This is the case if _either_ the platform handles vec_shr_optab, _or_
3441 it supports vec_perm_const with masks for all necessary shift amounts. */
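/* The reduction epilogue halves the vector repeatedly, so only shifts by
   power-of-two element counts (nelt/2, nelt/4, ..., 1) are ever needed; those
   are the masks checked below.  */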
3442 static bool
3443 have_whole_vector_shift (enum machine_mode mode)
3445 if (optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
3446 return true;
3448 if (direct_optab_handler (vec_perm_const_optab, mode) == CODE_FOR_nothing)
3449 return false;
3451 unsigned int i, nelt = GET_MODE_NUNITS (mode);
3452 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
3454 for (i = nelt/2; i >= 1; i/=2)
3456 calc_vec_perm_mask_for_shift (mode, i, sel);
3457 if (!can_vec_perm_p (mode, false, sel))
3458 return false;
3460 return true;
3463 /* Return the reduction operand (with index REDUC_INDEX) of STMT. */
3465 static tree
3466 get_reduction_op (gimple *stmt, int reduc_index)
3468 switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
3470 case GIMPLE_SINGLE_RHS:
3471 gcc_assert (TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt))
3472 == ternary_op);
3473 return TREE_OPERAND (gimple_assign_rhs1 (stmt), reduc_index);
3474 case GIMPLE_UNARY_RHS:
3475 return gimple_assign_rhs1 (stmt);
3476 case GIMPLE_BINARY_RHS:
3477 return (reduc_index
3478 ? gimple_assign_rhs2 (stmt) : gimple_assign_rhs1 (stmt));
3479 case GIMPLE_TERNARY_RHS:
3480 return gimple_op (stmt, reduc_index + 1);
3481 default:
3482 gcc_unreachable ();
3486 /* TODO: There is a close dependency between the vect_model_*_cost and
3487 vectorizable_* functions; a better design would avoid such maintenance issues. */
3489 /* Function vect_model_reduction_cost.
3491 Models cost for a reduction operation, including the vector ops
3492 generated within the strip-mine loop, the initial definition before
3493 the loop, and the epilogue code that must be generated. */
3495 static bool
3496 vect_model_reduction_cost (stmt_vec_info stmt_info, enum tree_code reduc_code,
3497 int ncopies, int reduc_index)
3499 int prologue_cost = 0, epilogue_cost = 0;
3500 enum tree_code code;
3501 optab optab;
3502 tree vectype;
3503 gimple *stmt, *orig_stmt;
3504 tree reduction_op;
3505 machine_mode mode;
3506 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3507 struct loop *loop = NULL;
3508 void *target_cost_data;
3510 if (loop_vinfo)
3512 loop = LOOP_VINFO_LOOP (loop_vinfo);
3513 target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3515 else
3516 target_cost_data = BB_VINFO_TARGET_COST_DATA (STMT_VINFO_BB_VINFO (stmt_info));
3518 /* Condition reductions generate two reductions in the loop. */
3519 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION)
3520 ncopies *= 2;
3522 /* Cost of reduction op inside loop. */
3523 unsigned inside_cost = add_stmt_cost (target_cost_data, ncopies, vector_stmt,
3524 stmt_info, 0, vect_body);
3525 stmt = STMT_VINFO_STMT (stmt_info);
3527 reduction_op = get_reduction_op (stmt, reduc_index);
3529 vectype = get_vectype_for_scalar_type (TREE_TYPE (reduction_op));
3530 if (!vectype)
3532 if (dump_enabled_p ())
3534 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3535 "unsupported data-type ");
3536 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
3537 TREE_TYPE (reduction_op));
3538 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3540 return false;
3543 mode = TYPE_MODE (vectype);
3544 orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
3546 if (!orig_stmt)
3547 orig_stmt = STMT_VINFO_STMT (stmt_info);
3549 code = gimple_assign_rhs_code (orig_stmt);
3551 /* Add in cost for initial definition.
3552 For cond reduction we have four vectors: initial index, step, initial
3553 result of the data reduction, initial value of the index reduction. */
3554 int prologue_stmts = STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
3555 == COND_REDUCTION ? 4 : 1;
3556 prologue_cost += add_stmt_cost (target_cost_data, prologue_stmts,
3557 scalar_to_vec, stmt_info, 0,
3558 vect_prologue);
3560 /* Determine cost of epilogue code.
3562 We have a reduction operator that will reduce the vector in one statement.
3563 Also requires scalar extract. */
3565 if (!loop || !nested_in_vect_loop_p (loop, orig_stmt))
3567 if (reduc_code != ERROR_MARK)
3569 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION)
3571 /* An EQ stmt and a COND_EXPR stmt. */
3572 epilogue_cost += add_stmt_cost (target_cost_data, 2,
3573 vector_stmt, stmt_info, 0,
3574 vect_epilogue);
3575 /* Reduction of the max index and a reduction of the found
3576 values. */
3577 epilogue_cost += add_stmt_cost (target_cost_data, 2,
3578 vec_to_scalar, stmt_info, 0,
3579 vect_epilogue);
3580 /* A broadcast of the max value. */
3581 epilogue_cost += add_stmt_cost (target_cost_data, 1,
3582 scalar_to_vec, stmt_info, 0,
3583 vect_epilogue);
3585 else
3587 epilogue_cost += add_stmt_cost (target_cost_data, 1, vector_stmt,
3588 stmt_info, 0, vect_epilogue);
3589 epilogue_cost += add_stmt_cost (target_cost_data, 1,
3590 vec_to_scalar, stmt_info, 0,
3591 vect_epilogue);
3594 else
3596 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
3597 tree bitsize =
3598 TYPE_SIZE (TREE_TYPE (gimple_assign_lhs (orig_stmt)));
3599 int element_bitsize = tree_to_uhwi (bitsize);
3600 int nelements = vec_size_in_bits / element_bitsize;
3602 optab = optab_for_tree_code (code, vectype, optab_default);
3604 /* We have a whole vector shift available. */
3605 if (VECTOR_MODE_P (mode)
3606 && optab_handler (optab, mode) != CODE_FOR_nothing
3607 && have_whole_vector_shift (mode))
3609 /* Final reduction via vector shifts and the reduction operator.
3610 Also requires scalar extract. */
3611 epilogue_cost += add_stmt_cost (target_cost_data,
3612 exact_log2 (nelements) * 2,
3613 vector_stmt, stmt_info, 0,
3614 vect_epilogue);
3615 epilogue_cost += add_stmt_cost (target_cost_data, 1,
3616 vec_to_scalar, stmt_info, 0,
3617 vect_epilogue);
3619 else
3620 /* Use extracts and reduction op for final reduction. For N
3621 elements, we have N extracts and N-1 reduction ops. */
3622 epilogue_cost += add_stmt_cost (target_cost_data,
3623 nelements + nelements - 1,
3624 vector_stmt, stmt_info, 0,
3625 vect_epilogue);
3629 if (dump_enabled_p ())
3630 dump_printf (MSG_NOTE,
3631 "vect_model_reduction_cost: inside_cost = %d, "
3632 "prologue_cost = %d, epilogue_cost = %d .\n", inside_cost,
3633 prologue_cost, epilogue_cost);
3635 return true;
3639 /* Function vect_model_induction_cost.
3641 Models cost for induction operations. */
3643 static void
3644 vect_model_induction_cost (stmt_vec_info stmt_info, int ncopies)
3646 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3647 void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3648 unsigned inside_cost, prologue_cost;
3650 /* loop cost for vec_loop. */
3651 inside_cost = add_stmt_cost (target_cost_data, ncopies, vector_stmt,
3652 stmt_info, 0, vect_body);
3654 /* prologue cost for vec_init and vec_step. */
3655 prologue_cost = add_stmt_cost (target_cost_data, 2, scalar_to_vec,
3656 stmt_info, 0, vect_prologue);
3658 if (dump_enabled_p ())
3659 dump_printf_loc (MSG_NOTE, vect_location,
3660 "vect_model_induction_cost: inside_cost = %d, "
3661 "prologue_cost = %d .\n", inside_cost, prologue_cost);
3665 /* Function get_initial_def_for_induction
3667 Input:
3668 STMT - a stmt that performs an induction operation in the loop.
3669 IV_PHI - the initial value of the induction variable
3671 Output:
3672 Return a vector variable, initialized with the first VF values of
3673 the induction variable. E.g., for an iv with IV_PHI='X' and
3674 evolution S, for a vector of 4 units, we want to return:
3675 [X, X + S, X + 2*S, X + 3*S]. */
3677 static tree
3678 get_initial_def_for_induction (gimple *iv_phi)
3680 stmt_vec_info stmt_vinfo = vinfo_for_stmt (iv_phi);
3681 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
3682 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3683 tree vectype;
3684 int nunits;
3685 edge pe = loop_preheader_edge (loop);
3686 struct loop *iv_loop;
3687 basic_block new_bb;
3688 tree new_vec, vec_init, vec_step, t;
3689 tree new_name;
3690 gimple *new_stmt;
3691 gphi *induction_phi;
3692 tree induc_def, vec_def, vec_dest;
3693 tree init_expr, step_expr;
3694 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3695 int i;
3696 int ncopies;
3697 tree expr;
3698 stmt_vec_info phi_info = vinfo_for_stmt (iv_phi);
3699 bool nested_in_vect_loop = false;
3700 gimple_seq stmts;
3701 imm_use_iterator imm_iter;
3702 use_operand_p use_p;
3703 gimple *exit_phi;
3704 edge latch_e;
3705 tree loop_arg;
3706 gimple_stmt_iterator si;
3707 basic_block bb = gimple_bb (iv_phi);
3708 tree stepvectype;
3709 tree resvectype;
3711 /* Is phi in an inner-loop, while vectorizing an enclosing outer-loop? */
3712 if (nested_in_vect_loop_p (loop, iv_phi))
3714 nested_in_vect_loop = true;
3715 iv_loop = loop->inner;
3717 else
3718 iv_loop = loop;
3719 gcc_assert (iv_loop == (gimple_bb (iv_phi))->loop_father);
3721 latch_e = loop_latch_edge (iv_loop);
3722 loop_arg = PHI_ARG_DEF_FROM_EDGE (iv_phi, latch_e);
3724 step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (phi_info);
3725 gcc_assert (step_expr != NULL_TREE);
3727 pe = loop_preheader_edge (iv_loop);
3728 init_expr = PHI_ARG_DEF_FROM_EDGE (iv_phi,
3729 loop_preheader_edge (iv_loop));
3731 vectype = get_vectype_for_scalar_type (TREE_TYPE (init_expr));
3732 resvectype = get_vectype_for_scalar_type (TREE_TYPE (PHI_RESULT (iv_phi)));
3733 gcc_assert (vectype);
3734 nunits = TYPE_VECTOR_SUBPARTS (vectype);
3735 ncopies = vf / nunits;
3737 gcc_assert (phi_info);
3738 gcc_assert (ncopies >= 1);
3740 /* Convert the step to the desired type. */
3741 stmts = NULL;
3742 step_expr = gimple_convert (&stmts, TREE_TYPE (vectype), step_expr);
3743 if (stmts)
3745 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
3746 gcc_assert (!new_bb);
3749 /* Find the first insertion point in the BB. */
3750 si = gsi_after_labels (bb);
3752 /* Create the vector that holds the initial_value of the induction. */
3753 if (nested_in_vect_loop)
3755 /* iv_loop is nested in the loop to be vectorized. The vectorized def of
3756 init_expr has already been created during vectorization of previous
3757 stmts; we obtain it from the STMT_VINFO_VEC_STMT of the defining stmt. */
3758 vec_init = vect_get_vec_def_for_operand (init_expr, iv_phi);
3759 /* If the initial value is not of proper type, convert it. */
3760 if (!useless_type_conversion_p (vectype, TREE_TYPE (vec_init)))
3762 new_stmt
3763 = gimple_build_assign (vect_get_new_ssa_name (vectype,
3764 vect_simple_var,
3765 "vec_iv_"),
3766 VIEW_CONVERT_EXPR,
3767 build1 (VIEW_CONVERT_EXPR, vectype,
3768 vec_init));
3769 vec_init = gimple_assign_lhs (new_stmt);
3770 new_bb = gsi_insert_on_edge_immediate (loop_preheader_edge (iv_loop),
3771 new_stmt);
3772 gcc_assert (!new_bb);
3773 set_vinfo_for_stmt (new_stmt,
3774 new_stmt_vec_info (new_stmt, loop_vinfo));
3777 else
3779 vec<constructor_elt, va_gc> *v;
3781 /* iv_loop is the loop to be vectorized. Create:
3782 vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr) */
3783 stmts = NULL;
3784 new_name = gimple_convert (&stmts, TREE_TYPE (vectype), init_expr);
3786 vec_alloc (v, nunits);
3787 bool constant_p = is_gimple_min_invariant (new_name);
3788 CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, new_name);
3789 for (i = 1; i < nunits; i++)
3791 /* Create: new_name_i = new_name + step_expr */
3792 new_name = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (new_name),
3793 new_name, step_expr);
3794 if (!is_gimple_min_invariant (new_name))
3795 constant_p = false;
3796 CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, new_name);
3798 if (stmts)
3800 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
3801 gcc_assert (!new_bb);
3804 /* Create a vector from [new_name_0, new_name_1, ..., new_name_nunits-1] */
3805 if (constant_p)
3806 new_vec = build_vector_from_ctor (vectype, v);
3807 else
3808 new_vec = build_constructor (vectype, v);
3809 vec_init = vect_init_vector (iv_phi, new_vec, vectype, NULL);
3813 /* Create the vector that holds the step of the induction. */
3814 if (nested_in_vect_loop)
3815 /* iv_loop is nested in the loop to be vectorized. Generate:
3816 vec_step = [S, S, S, S] */
3817 new_name = step_expr;
3818 else
3820 /* iv_loop is the loop to be vectorized. Generate:
3821 vec_step = [VF*S, VF*S, VF*S, VF*S] */
3822 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
3824 expr = build_int_cst (integer_type_node, vf);
3825 expr = fold_convert (TREE_TYPE (step_expr), expr);
3827 else
3828 expr = build_int_cst (TREE_TYPE (step_expr), vf);
3829 new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
3830 expr, step_expr);
3831 if (TREE_CODE (step_expr) == SSA_NAME)
3832 new_name = vect_init_vector (iv_phi, new_name,
3833 TREE_TYPE (step_expr), NULL);
3836 t = unshare_expr (new_name);
3837 gcc_assert (CONSTANT_CLASS_P (new_name)
3838 || TREE_CODE (new_name) == SSA_NAME);
3839 stepvectype = get_vectype_for_scalar_type (TREE_TYPE (new_name));
3840 gcc_assert (stepvectype);
3841 new_vec = build_vector_from_val (stepvectype, t);
3842 vec_step = vect_init_vector (iv_phi, new_vec, stepvectype, NULL);
3845 /* Create the following def-use cycle:
3846 loop prolog:
3847 vec_init = ...
3848 vec_step = ...
3849 loop:
3850 vec_iv = PHI <vec_init, vec_loop>
3852 STMT
3854 vec_loop = vec_iv + vec_step; */
3856 /* Create the induction-phi that defines the induction-operand. */
3857 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
3858 induction_phi = create_phi_node (vec_dest, iv_loop->header);
3859 set_vinfo_for_stmt (induction_phi,
3860 new_stmt_vec_info (induction_phi, loop_vinfo));
3861 induc_def = PHI_RESULT (induction_phi);
3863 /* Create the iv update inside the loop */
3864 new_stmt = gimple_build_assign (vec_dest, PLUS_EXPR, induc_def, vec_step);
3865 vec_def = make_ssa_name (vec_dest, new_stmt);
3866 gimple_assign_set_lhs (new_stmt, vec_def);
3867 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
3868 set_vinfo_for_stmt (new_stmt, new_stmt_vec_info (new_stmt, loop_vinfo));
3870 /* Set the arguments of the phi node: */
3871 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
3872 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
3873 UNKNOWN_LOCATION);
3876 /* In case the vectorization factor (VF) is bigger than the number
3877 of elements that we can fit in a vectype (nunits), we have to generate
3878 more than one vector stmt, i.e., we need to "unroll" the
3879 vector stmt by a factor of VF/nunits. For more details see the
3880 documentation in vectorizable_operation. */
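/* A concrete sketch of the unrolled case handled below (numbers are only
   illustrative): with VF = 8 but nunits = 4 we need ncopies = 2 vector
   inductions.  Continuing the X = 3, S = 2 example, the first copy starts
   at { 3, 5, 7, 9 } and the second copy is obtained by adding
   nunits * S = { 8, 8, 8, 8 }, giving { 11, 13, 15, 17 }.  */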
3882 if (ncopies > 1)
3884 stmt_vec_info prev_stmt_vinfo;
3885 /* FORNOW. This restriction should be relaxed. */
3886 gcc_assert (!nested_in_vect_loop);
3888 /* Create the vector that holds the step of the induction. */
3889 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
3891 expr = build_int_cst (integer_type_node, nunits);
3892 expr = fold_convert (TREE_TYPE (step_expr), expr);
3894 else
3895 expr = build_int_cst (TREE_TYPE (step_expr), nunits);
3896 new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
3897 expr, step_expr);
3898 if (TREE_CODE (step_expr) == SSA_NAME)
3899 new_name = vect_init_vector (iv_phi, new_name,
3900 TREE_TYPE (step_expr), NULL);
3901 t = unshare_expr (new_name);
3902 gcc_assert (CONSTANT_CLASS_P (new_name)
3903 || TREE_CODE (new_name) == SSA_NAME);
3904 new_vec = build_vector_from_val (stepvectype, t);
3905 vec_step = vect_init_vector (iv_phi, new_vec, stepvectype, NULL);
3907 vec_def = induc_def;
3908 prev_stmt_vinfo = vinfo_for_stmt (induction_phi);
3909 for (i = 1; i < ncopies; i++)
3911 /* vec_i = vec_prev + vec_step */
3912 new_stmt = gimple_build_assign (vec_dest, PLUS_EXPR,
3913 vec_def, vec_step);
3914 vec_def = make_ssa_name (vec_dest, new_stmt);
3915 gimple_assign_set_lhs (new_stmt, vec_def);
3917 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
3918 if (!useless_type_conversion_p (resvectype, vectype))
3920 new_stmt
3921 = gimple_build_assign
3922 (vect_get_new_vect_var (resvectype, vect_simple_var,
3923 "vec_iv_"),
3924 VIEW_CONVERT_EXPR,
3925 build1 (VIEW_CONVERT_EXPR, resvectype,
3926 gimple_assign_lhs (new_stmt)));
3927 gimple_assign_set_lhs (new_stmt,
3928 make_ssa_name
3929 (gimple_assign_lhs (new_stmt), new_stmt));
3930 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
3932 set_vinfo_for_stmt (new_stmt,
3933 new_stmt_vec_info (new_stmt, loop_vinfo));
3934 STMT_VINFO_RELATED_STMT (prev_stmt_vinfo) = new_stmt;
3935 prev_stmt_vinfo = vinfo_for_stmt (new_stmt);
3939 if (nested_in_vect_loop)
3941 /* Find the loop-closed exit-phi of the induction, and record
3942 the final vector of induction results: */
3943 exit_phi = NULL;
3944 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
3946 gimple *use_stmt = USE_STMT (use_p);
3947 if (is_gimple_debug (use_stmt))
3948 continue;
3950 if (!flow_bb_inside_loop_p (iv_loop, gimple_bb (use_stmt)))
3952 exit_phi = use_stmt;
3953 break;
3956 if (exit_phi)
3958 stmt_vec_info stmt_vinfo = vinfo_for_stmt (exit_phi);
3959 /* FORNOW. Currently not supporting the case that an inner-loop induction
3960 is not used in the outer-loop (i.e. only outside the outer-loop). */
3961 gcc_assert (STMT_VINFO_RELEVANT_P (stmt_vinfo)
3962 && !STMT_VINFO_LIVE_P (stmt_vinfo));
3964 STMT_VINFO_VEC_STMT (stmt_vinfo) = new_stmt;
3965 if (dump_enabled_p ())
3967 dump_printf_loc (MSG_NOTE, vect_location,
3968 "vector of inductions after inner-loop:");
3969 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, new_stmt, 0);
3975 if (dump_enabled_p ())
3977 dump_printf_loc (MSG_NOTE, vect_location,
3978 "transform induction: created def-use cycle: ");
3979 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, induction_phi, 0);
3980 dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
3981 SSA_NAME_DEF_STMT (vec_def), 0);
3984 STMT_VINFO_VEC_STMT (phi_info) = induction_phi;
3985 if (!useless_type_conversion_p (resvectype, vectype))
3987 new_stmt = gimple_build_assign (vect_get_new_vect_var (resvectype,
3988 vect_simple_var,
3989 "vec_iv_"),
3990 VIEW_CONVERT_EXPR,
3991 build1 (VIEW_CONVERT_EXPR, resvectype,
3992 induc_def));
3993 induc_def = make_ssa_name (gimple_assign_lhs (new_stmt), new_stmt);
3994 gimple_assign_set_lhs (new_stmt, induc_def);
3995 si = gsi_after_labels (bb);
3996 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
3997 set_vinfo_for_stmt (new_stmt,
3998 new_stmt_vec_info (new_stmt, loop_vinfo));
3999 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (new_stmt))
4000 = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (induction_phi));
4003 return induc_def;
4007 /* Function get_initial_def_for_reduction
4009 Input:
4010 STMT - a stmt that performs a reduction operation in the loop.
4011 INIT_VAL - the initial value of the reduction variable
4013 Output:
4014 ADJUSTMENT_DEF - a tree that holds a value to be added to the final result
4015 of the reduction (used for adjusting the epilog - see below).
4016 Return a vector variable, initialized according to the operation that STMT
4017 performs. This vector will be used as the initial value of the
4018 vector of partial results.
4020 Option1 (adjust in epilog): Initialize the vector as follows:
4021 add/bit or/xor: [0,0,...,0,0]
4022 mult/bit and: [1,1,...,1,1]
4023 min/max/cond_expr: [init_val,init_val,..,init_val,init_val]
4024 and when necessary (e.g. add/mult case) let the caller know
4025 that it needs to adjust the result by init_val.
4027 Option2: Initialize the vector as follows:
4028 add/bit or/xor: [init_val,0,0,...,0]
4029 mult/bit and: [init_val,1,1,...,1]
4030 min/max/cond_expr: [init_val,init_val,...,init_val]
4031 and no adjustments are needed.
4033 For example, for the following code:
4035 s = init_val;
4036 for (i=0;i<n;i++)
4037 s = s + a[i];
4039 STMT is 's = s + a[i]', and the reduction variable is 's'.
4040 For a vector of 4 units, we want to return either [0,0,0,init_val],
4041 or [0,0,0,0] and let the caller know that it needs to adjust
4042 the result at the end by 'init_val'.
4044 FORNOW, we use Option1 (the 'adjust in epilog' scheme) when ADJUSTMENT_DEF
4045 is not NULL, because its initialization vector is simpler (the same element
4046 in all entries), and Option2 otherwise.
4048 A cost model should help decide between these two schemes. */
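/* A worked sketch (values chosen arbitrarily): for a PLUS reduction with
   init_val = 5 and 4 vector lanes, Option1 builds { 0, 0, 0, 0 } and
   reports ADJUSTMENT_DEF = 5, while Option2 builds { 5, 0, 0, 0 } with no
   adjustment.  For a MULT reduction the neutral element is 1, so Option1
   builds { 1, 1, 1, 1 } (and the result is later adjusted by multiplying
   with 5) while Option2 builds { 5, 1, 1, 1 }.  For MIN/MAX/COND_EXPR
   every lane simply holds 5.  */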
4050 tree
4051 get_initial_def_for_reduction (gimple *stmt, tree init_val,
4052 tree *adjustment_def)
4054 stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
4055 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
4056 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4057 tree scalar_type = TREE_TYPE (init_val);
4058 tree vectype = get_vectype_for_scalar_type (scalar_type);
4059 int nunits;
4060 enum tree_code code = gimple_assign_rhs_code (stmt);
4061 tree def_for_init;
4062 tree init_def;
4063 tree *elts;
4064 int i;
4065 bool nested_in_vect_loop = false;
4066 REAL_VALUE_TYPE real_init_val = dconst0;
4067 int int_init_val = 0;
4068 gimple *def_stmt = NULL;
4069 gimple_seq stmts = NULL;
4071 gcc_assert (vectype);
4072 nunits = TYPE_VECTOR_SUBPARTS (vectype);
4074 gcc_assert (POINTER_TYPE_P (scalar_type) || INTEGRAL_TYPE_P (scalar_type)
4075 || SCALAR_FLOAT_TYPE_P (scalar_type));
4077 if (nested_in_vect_loop_p (loop, stmt))
4078 nested_in_vect_loop = true;
4079 else
4080 gcc_assert (loop == (gimple_bb (stmt))->loop_father);
4082 /* In case of double reduction we only create a vector variable to be put
4083 in the reduction phi node. The actual statement creation is done in
4084 vect_create_epilog_for_reduction. */
4085 if (adjustment_def && nested_in_vect_loop
4086 && TREE_CODE (init_val) == SSA_NAME
4087 && (def_stmt = SSA_NAME_DEF_STMT (init_val))
4088 && gimple_code (def_stmt) == GIMPLE_PHI
4089 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
4090 && vinfo_for_stmt (def_stmt)
4091 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
4092 == vect_double_reduction_def)
4094 *adjustment_def = NULL;
4095 return vect_create_destination_var (init_val, vectype);
4098 /* In case of a nested reduction do not use an adjustment def, as
4099 that case is not handled correctly by the epilogue generation
4100 when ncopies is not one. */
4101 if (adjustment_def && nested_in_vect_loop)
4103 *adjustment_def = NULL;
4104 return vect_get_vec_def_for_operand (init_val, stmt);
4107 switch (code)
4109 case WIDEN_SUM_EXPR:
4110 case DOT_PROD_EXPR:
4111 case SAD_EXPR:
4112 case PLUS_EXPR:
4113 case MINUS_EXPR:
4114 case BIT_IOR_EXPR:
4115 case BIT_XOR_EXPR:
4116 case MULT_EXPR:
4117 case BIT_AND_EXPR:
4118 /* ADJUSTMENT_DEF is NULL when called from
4119 vect_create_epilog_for_reduction to vectorize a double reduction. */
4120 if (adjustment_def)
4121 *adjustment_def = init_val;
4123 if (code == MULT_EXPR)
4125 real_init_val = dconst1;
4126 int_init_val = 1;
4129 if (code == BIT_AND_EXPR)
4130 int_init_val = -1;
4132 if (SCALAR_FLOAT_TYPE_P (scalar_type))
4133 def_for_init = build_real (scalar_type, real_init_val);
4134 else
4135 def_for_init = build_int_cst (scalar_type, int_init_val);
4137 /* Fill all elements but the first with '0' or '1'; the first element is set below. */
4138 elts = XALLOCAVEC (tree, nunits);
4139 for (i = nunits - 2; i >= 0; --i)
4140 elts[i + 1] = def_for_init;
4142 /* Option1: the first element is '0' or '1' as well. */
4143 if (adjustment_def)
4145 elts[0] = def_for_init;
4146 init_def = build_vector (vectype, elts);
4147 break;
4150 /* Option2: the first element is INIT_VAL. */
4151 elts[0] = init_val;
4152 if (TREE_CONSTANT (init_val))
4153 init_def = build_vector (vectype, elts);
4154 else
4156 vec<constructor_elt, va_gc> *v;
4157 vec_alloc (v, nunits);
4158 CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, init_val);
4159 for (i = 1; i < nunits; ++i)
4160 CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, elts[i]);
4161 init_def = build_constructor (vectype, v);
4164 break;
4166 case MIN_EXPR:
4167 case MAX_EXPR:
4168 case COND_EXPR:
4169 if (adjustment_def)
4171 *adjustment_def = NULL_TREE;
4172 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_vinfo) != COND_REDUCTION)
4174 init_def = vect_get_vec_def_for_operand (init_val, stmt);
4175 break;
4178 init_val = gimple_convert (&stmts, TREE_TYPE (vectype), init_val);
4179 if (! gimple_seq_empty_p (stmts))
4180 gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), stmts);
4181 init_def = build_vector_from_val (vectype, init_val);
4182 break;
4184 default:
4185 gcc_unreachable ();
4188 return init_def;
4191 /* Function vect_create_epilog_for_reduction
4193 Create code at the loop-epilog to finalize the result of a reduction
4194 computation.
4196 VECT_DEFS is list of vector of partial results, i.e., the lhs's of vector
4197 reduction statements.
4198 STMT is the scalar reduction stmt that is being vectorized.
4199 NCOPIES is > 1 in case the vectorization factor (VF) is bigger than the
4200 number of elements that we can fit in a vectype (nunits). In this case
4201 we have to generate more than one vector stmt, i.e., we need to "unroll"
4202 the vector stmt by a factor VF/nunits. For more details see documentation
4203 in vectorizable_operation.
4204 REDUC_CODE is the tree-code for the epilog reduction.
4205 REDUCTION_PHIS is a list of the phi-nodes that carry the reduction
4206 computation.
4207 REDUC_INDEX is the index of the operand in the right hand side of the
4208 statement that is defined by REDUCTION_PHI.
4209 DOUBLE_REDUC is TRUE if double reduction phi nodes should be handled.
4210 SLP_NODE is an SLP node containing a group of reduction statements. The
4211 first one in this group is STMT.
4212 INDUCTION_INDEX is the index of the loop for condition reductions.
4213 Otherwise it is undefined.
4215 This function:
4216 1. Creates the reduction def-use cycles: sets the arguments for
4217 REDUCTION_PHIS:
4218 The loop-entry argument is the vectorized initial-value of the reduction.
4219 The loop-latch argument is taken from VECT_DEFS - the vector of partial
4220 sums.
4221 2. "Reduces" each vector of partial results VECT_DEFS into a single result,
4222 by applying the operation specified by REDUC_CODE if available, or by
4223 other means (whole-vector shifts or a scalar loop).
4224 The function also creates a new phi node at the loop exit to preserve
4225 loop-closed form, as illustrated below.
4227 The flow at the entry to this function:
4229 loop:
4230 vec_def = phi <null, null> # REDUCTION_PHI
4231 VECT_DEF = vector_stmt # vectorized form of STMT
4232 s_loop = scalar_stmt # (scalar) STMT
4233 loop_exit:
4234 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
4235 use <s_out0>
4236 use <s_out0>
4238 The above is transformed by this function into:
4240 loop:
4241 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
4242 VECT_DEF = vector_stmt # vectorized form of STMT
4243 s_loop = scalar_stmt # (scalar) STMT
4244 loop_exit:
4245 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
4246 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
4247 v_out2 = reduce <v_out1>
4248 s_out3 = extract_field <v_out2, 0>
4249 s_out4 = adjust_result <s_out3>
4250 use <s_out4>
4251 use <s_out4>
4254 static void
4255 vect_create_epilog_for_reduction (vec<tree> vect_defs, gimple *stmt,
4256 int ncopies, enum tree_code reduc_code,
4257 vec<gimple *> reduction_phis,
4258 int reduc_index, bool double_reduc,
4259 slp_tree slp_node, tree induction_index)
4261 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4262 stmt_vec_info prev_phi_info;
4263 tree vectype;
4264 machine_mode mode;
4265 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4266 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *outer_loop = NULL;
4267 basic_block exit_bb;
4268 tree scalar_dest;
4269 tree scalar_type;
4270 gimple *new_phi = NULL, *phi;
4271 gimple_stmt_iterator exit_gsi;
4272 tree vec_dest;
4273 tree new_temp = NULL_TREE, new_dest, new_name, new_scalar_dest;
4274 gimple *epilog_stmt = NULL;
4275 enum tree_code code = gimple_assign_rhs_code (stmt);
4276 gimple *exit_phi;
4277 tree bitsize;
4278 tree adjustment_def = NULL;
4279 tree vec_initial_def = NULL;
4280 tree reduction_op, expr, def, initial_def = NULL;
4281 tree orig_name, scalar_result;
4282 imm_use_iterator imm_iter, phi_imm_iter;
4283 use_operand_p use_p, phi_use_p;
4284 gimple *use_stmt, *orig_stmt, *reduction_phi = NULL;
4285 bool nested_in_vect_loop = false;
4286 auto_vec<gimple *> new_phis;
4287 auto_vec<gimple *> inner_phis;
4288 enum vect_def_type dt = vect_unknown_def_type;
4289 int j, i;
4290 auto_vec<tree> scalar_results;
4291 unsigned int group_size = 1, k, ratio;
4292 auto_vec<tree> vec_initial_defs;
4293 auto_vec<gimple *> phis;
4294 bool slp_reduc = false;
4295 tree new_phi_result;
4296 gimple *inner_phi = NULL;
4298 if (slp_node)
4299 group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
4301 if (nested_in_vect_loop_p (loop, stmt))
4303 outer_loop = loop;
4304 loop = loop->inner;
4305 nested_in_vect_loop = true;
4306 gcc_assert (!slp_node);
4309 reduction_op = get_reduction_op (stmt, reduc_index);
4311 vectype = get_vectype_for_scalar_type (TREE_TYPE (reduction_op));
4312 gcc_assert (vectype);
4313 mode = TYPE_MODE (vectype);
4315 /* 1. Create the reduction def-use cycle:
4316 Set the arguments of REDUCTION_PHIS, i.e., transform
4318 loop:
4319 vec_def = phi <null, null> # REDUCTION_PHI
4320 VECT_DEF = vector_stmt # vectorized form of STMT
4323 into:
4325 loop:
4326 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
4327 VECT_DEF = vector_stmt # vectorized form of STMT
4330 (in case of SLP, do it for all the phis). */
4332 /* Get the loop-entry arguments. */
4333 enum vect_def_type initial_def_dt = vect_unknown_def_type;
4334 if (slp_node)
4335 vect_get_vec_defs (reduction_op, NULL_TREE, stmt, &vec_initial_defs,
4336 NULL, slp_node, reduc_index);
4337 else
4339 /* Get at the scalar def before the loop, that defines the initial value
4340 of the reduction variable. */
4341 gimple *def_stmt = SSA_NAME_DEF_STMT (reduction_op);
4342 initial_def = PHI_ARG_DEF_FROM_EDGE (def_stmt,
4343 loop_preheader_edge (loop));
4344 vect_is_simple_use (initial_def, loop_vinfo, &def_stmt, &initial_def_dt);
4345 vec_initial_def = get_initial_def_for_reduction (stmt, initial_def,
4346 &adjustment_def);
4347 vec_initial_defs.create (1);
4348 vec_initial_defs.quick_push (vec_initial_def);
4351 /* Set phi nodes arguments. */
4352 FOR_EACH_VEC_ELT (reduction_phis, i, phi)
4354 tree vec_init_def, def;
4355 gimple_seq stmts;
4356 vec_init_def = force_gimple_operand (vec_initial_defs[i], &stmts,
4357 true, NULL_TREE);
4358 if (stmts)
4359 gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), stmts);
4361 def = vect_defs[i];
4362 for (j = 0; j < ncopies; j++)
4364 if (j != 0)
4366 phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi));
4367 if (nested_in_vect_loop)
4368 vec_init_def
4369 = vect_get_vec_def_for_stmt_copy (initial_def_dt,
4370 vec_init_def);
4373 /* Set the loop-entry arg of the reduction-phi. */
4375 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
4376 == INTEGER_INDUC_COND_REDUCTION)
4378 /* Initialise the reduction phi to zero. This prevents non-zero
4379 initial values from interfering with the reduction op. */
4380 gcc_assert (ncopies == 1);
4381 gcc_assert (i == 0);
4383 tree vec_init_def_type = TREE_TYPE (vec_init_def);
4384 tree zero_vec = build_zero_cst (vec_init_def_type);
4386 add_phi_arg (as_a <gphi *> (phi), zero_vec,
4387 loop_preheader_edge (loop), UNKNOWN_LOCATION);
4389 else
4390 add_phi_arg (as_a <gphi *> (phi), vec_init_def,
4391 loop_preheader_edge (loop), UNKNOWN_LOCATION);
4393 /* Set the loop-latch arg for the reduction-phi. */
4394 if (j > 0)
4395 def = vect_get_vec_def_for_stmt_copy (vect_unknown_def_type, def);
4397 add_phi_arg (as_a <gphi *> (phi), def, loop_latch_edge (loop),
4398 UNKNOWN_LOCATION);
4400 if (dump_enabled_p ())
4402 dump_printf_loc (MSG_NOTE, vect_location,
4403 "transform reduction: created def-use cycle: ");
4404 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
4405 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, SSA_NAME_DEF_STMT (def), 0);
4410 /* 2. Create epilog code.
4411 The reduction epilog code operates across the elements of the vector
4412 of partial results computed by the vectorized loop.
4413 The reduction epilog code consists of:
4415 step 1: compute the scalar result in a vector (v_out2)
4416 step 2: extract the scalar result (s_out3) from the vector (v_out2)
4417 step 3: adjust the scalar result (s_out3) if needed.
4419 Step 1 can be accomplished using one of the following three schemes:
4420 (scheme 1) using reduc_code, if available.
4421 (scheme 2) using whole-vector shifts, if available.
4422 (scheme 3) using a scalar loop. In this case steps 1+2 above are
4423 combined.
4425 The overall epilog code looks like this:
4427 s_out0 = phi <s_loop> # original EXIT_PHI
4428 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
4429 v_out2 = reduce <v_out1> # step 1
4430 s_out3 = extract_field <v_out2, 0> # step 2
4431 s_out4 = adjust_result <s_out3> # step 3
4433 (step 3 is optional, and steps 1 and 2 may be combined).
4434 Lastly, the uses of s_out0 are replaced by s_out4. */
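/* As a concrete sketch (illustrative only), suppose v_out1 is a 4-lane
   integer vector { a, b, c, d } produced by a PLUS reduction:
   scheme 1 emits a single v_out2 = reduc_plus <v_out1> and extracts
   lane 0; scheme 2 needs log2(4) = 2 shift-and-add rounds, first
   { a+c, b+d, c, d }, then { a+b+c+d, ... }, followed by one extract;
   scheme 3 extracts all four lanes and combines them with three scalar
   additions.  */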
4437 /* 2.1 Create new loop-exit-phis to preserve loop-closed form:
4438 v_out1 = phi <VECT_DEF>
4439 Store them in NEW_PHIS. */
4441 exit_bb = single_exit (loop)->dest;
4442 prev_phi_info = NULL;
4443 new_phis.create (vect_defs.length ());
4444 FOR_EACH_VEC_ELT (vect_defs, i, def)
4446 for (j = 0; j < ncopies; j++)
4448 tree new_def = copy_ssa_name (def);
4449 phi = create_phi_node (new_def, exit_bb);
4450 set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, loop_vinfo));
4451 if (j == 0)
4452 new_phis.quick_push (phi);
4453 else
4455 def = vect_get_vec_def_for_stmt_copy (dt, def);
4456 STMT_VINFO_RELATED_STMT (prev_phi_info) = phi;
4459 SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, def);
4460 prev_phi_info = vinfo_for_stmt (phi);
4464 /* The epilogue is created for the outer-loop, i.e., for the loop being
4465 vectorized. Create exit phis for the outer loop. */
4466 if (double_reduc)
4468 loop = outer_loop;
4469 exit_bb = single_exit (loop)->dest;
4470 inner_phis.create (vect_defs.length ());
4471 FOR_EACH_VEC_ELT (new_phis, i, phi)
4473 tree new_result = copy_ssa_name (PHI_RESULT (phi));
4474 gphi *outer_phi = create_phi_node (new_result, exit_bb);
4475 SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx,
4476 PHI_RESULT (phi));
4477 set_vinfo_for_stmt (outer_phi, new_stmt_vec_info (outer_phi,
4478 loop_vinfo));
4479 inner_phis.quick_push (phi);
4480 new_phis[i] = outer_phi;
4481 prev_phi_info = vinfo_for_stmt (outer_phi);
4482 while (STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi)))
4484 phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi));
4485 new_result = copy_ssa_name (PHI_RESULT (phi));
4486 outer_phi = create_phi_node (new_result, exit_bb);
4487 SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx,
4488 PHI_RESULT (phi));
4489 set_vinfo_for_stmt (outer_phi, new_stmt_vec_info (outer_phi,
4490 loop_vinfo));
4491 STMT_VINFO_RELATED_STMT (prev_phi_info) = outer_phi;
4492 prev_phi_info = vinfo_for_stmt (outer_phi);
4497 exit_gsi = gsi_after_labels (exit_bb);
4499 /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
4500 (i.e. when reduc_code is not available) and in the final adjustment
4501 code (if needed). Also get the original scalar reduction variable as
4502 defined in the loop. In case STMT is a "pattern-stmt" (i.e. - it
4503 represents a reduction pattern), the tree-code and scalar-def are
4504 taken from the original stmt that the pattern-stmt (STMT) replaces.
4505 Otherwise (it is a regular reduction) - the tree-code and scalar-def
4506 are taken from STMT. */
4508 orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
4509 if (!orig_stmt)
4511 /* Regular reduction */
4512 orig_stmt = stmt;
4514 else
4516 /* Reduction pattern */
4517 stmt_vec_info stmt_vinfo = vinfo_for_stmt (orig_stmt);
4518 gcc_assert (STMT_VINFO_IN_PATTERN_P (stmt_vinfo));
4519 gcc_assert (STMT_VINFO_RELATED_STMT (stmt_vinfo) == stmt);
4522 code = gimple_assign_rhs_code (orig_stmt);
4523 /* For MINUS_EXPR the initial vector is [init_val,0,...,0], therefore,
4524 partial results are added and not subtracted. */
4525 if (code == MINUS_EXPR)
4526 code = PLUS_EXPR;
4528 scalar_dest = gimple_assign_lhs (orig_stmt);
4529 scalar_type = TREE_TYPE (scalar_dest);
4530 scalar_results.create (group_size);
4531 new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
4532 bitsize = TYPE_SIZE (scalar_type);
4534 /* In case this is a reduction in an inner-loop while vectorizing an outer
4535 loop - we don't need to extract a single scalar result at the end of the
4536 inner-loop (unless it is a double reduction, i.e., the use of the reduction
4537 is outside the outer-loop). The final vector of partial results will be used
4538 in the vectorized outer-loop, or reduced to a scalar result at the end of
4539 the outer-loop. */
4540 if (nested_in_vect_loop && !double_reduc)
4541 goto vect_finalize_reduction;
4543 /* SLP reduction without reduction chain, e.g.,
4544 # a1 = phi <a2, a0>
4545 # b1 = phi <b2, b0>
4546 a2 = operation (a1)
4547 b2 = operation (b1) */
4548 slp_reduc = (slp_node && !GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)));
4550 /* In case of reduction chain, e.g.,
4551 # a1 = phi <a3, a0>
4552 a2 = operation (a1)
4553 a3 = operation (a2),
4555 we may end up with more than one vector result. Here we reduce them to
4556 one vector. */
4557 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
4559 tree first_vect = PHI_RESULT (new_phis[0]);
4560 tree tmp;
4561 gassign *new_vec_stmt = NULL;
4563 vec_dest = vect_create_destination_var (scalar_dest, vectype);
4564 for (k = 1; k < new_phis.length (); k++)
4566 gimple *next_phi = new_phis[k];
4567 tree second_vect = PHI_RESULT (next_phi);
4569 tmp = build2 (code, vectype, first_vect, second_vect);
4570 new_vec_stmt = gimple_build_assign (vec_dest, tmp);
4571 first_vect = make_ssa_name (vec_dest, new_vec_stmt);
4572 gimple_assign_set_lhs (new_vec_stmt, first_vect);
4573 gsi_insert_before (&exit_gsi, new_vec_stmt, GSI_SAME_STMT);
4576 new_phi_result = first_vect;
4577 if (new_vec_stmt)
4579 new_phis.truncate (0);
4580 new_phis.safe_push (new_vec_stmt);
4583 else
4584 new_phi_result = PHI_RESULT (new_phis[0]);
4586 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION)
4588 /* For condition reductions, we have a vector (NEW_PHI_RESULT) containing
4589 various data values where the condition matched and another vector
4590 (INDUCTION_INDEX) containing all the indexes of those matches. We
4591 need to extract the last matching index (which will be the index with
4592 the highest value) and use this to index into the data vector.
4593 For the case where there were no matches, the data vector will contain
4594 all default values and the index vector will be all zeros. */
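/* A small worked sketch (lane values invented for illustration): suppose
   NEW_PHI_RESULT ended up as { 7, def, 9, def } (def being the default
   value) and INDUCTION_INDEX as { 1, 0, 3, 0 }, i.e. lanes 0 and 2
   matched at loop indexes 1 and 3.  The code below computes
   max_index = 3, keeps only the lane whose index equals 3 (giving
   { 0, 0, 9, 0 }) and reduces that down to the scalar result 9.  */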
4596 /* Get various versions of the type of the vector of indexes. */
4597 tree index_vec_type = TREE_TYPE (induction_index);
4598 gcc_checking_assert (TYPE_UNSIGNED (index_vec_type));
4599 tree index_scalar_type = TREE_TYPE (index_vec_type);
4600 tree index_vec_cmp_type = build_same_sized_truth_vector_type
4601 (index_vec_type);
4603 /* Get an unsigned integer version of the type of the data vector. */
4604 int scalar_precision = GET_MODE_PRECISION (TYPE_MODE (scalar_type));
4605 tree scalar_type_unsigned = make_unsigned_type (scalar_precision);
4606 tree vectype_unsigned = build_vector_type
4607 (scalar_type_unsigned, TYPE_VECTOR_SUBPARTS (vectype));
4609 /* First we need to create a vector (ZERO_VEC) of zeros and another
4610 vector (MAX_INDEX_VEC) filled with the last matching index, which we
4611 can create using a MAX reduction and then expanding.
4612 In the case where the loop never made any matches, the max index will
4613 be zero. */
4615 /* Vector of {0, 0, 0,...}. */
4616 tree zero_vec = make_ssa_name (vectype);
4617 tree zero_vec_rhs = build_zero_cst (vectype);
4618 gimple *zero_vec_stmt = gimple_build_assign (zero_vec, zero_vec_rhs);
4619 gsi_insert_before (&exit_gsi, zero_vec_stmt, GSI_SAME_STMT);
4621 /* Find maximum value from the vector of found indexes. */
4622 tree max_index = make_ssa_name (index_scalar_type);
4623 gimple *max_index_stmt = gimple_build_assign (max_index, REDUC_MAX_EXPR,
4624 induction_index);
4625 gsi_insert_before (&exit_gsi, max_index_stmt, GSI_SAME_STMT);
4627 /* Vector of {max_index, max_index, max_index,...}. */
4628 tree max_index_vec = make_ssa_name (index_vec_type);
4629 tree max_index_vec_rhs = build_vector_from_val (index_vec_type,
4630 max_index);
4631 gimple *max_index_vec_stmt = gimple_build_assign (max_index_vec,
4632 max_index_vec_rhs);
4633 gsi_insert_before (&exit_gsi, max_index_vec_stmt, GSI_SAME_STMT);
4635 /* Next we compare the new vector (MAX_INDEX_VEC) full of max indexes
4636 with the vector (INDUCTION_INDEX) of found indexes, choosing values
4637 from the data vector (NEW_PHI_RESULT) for matches, 0 (ZERO_VEC)
4638 otherwise. Only one value should match, resulting in a vector
4639 (VEC_COND) with one data value and the rest zeros.
4640 In the case where the loop never made any matches, every index will
4641 match, resulting in a vector with all data values (which will all be
4642 the default value). */
4644 /* Compare the max index vector to the vector of found indexes to find
4645 the position of the max value. */
4646 tree vec_compare = make_ssa_name (index_vec_cmp_type);
4647 gimple *vec_compare_stmt = gimple_build_assign (vec_compare, EQ_EXPR,
4648 induction_index,
4649 max_index_vec);
4650 gsi_insert_before (&exit_gsi, vec_compare_stmt, GSI_SAME_STMT);
4652 /* Use the compare to choose either values from the data vector or
4653 zero. */
4654 tree vec_cond = make_ssa_name (vectype);
4655 gimple *vec_cond_stmt = gimple_build_assign (vec_cond, VEC_COND_EXPR,
4656 vec_compare, new_phi_result,
4657 zero_vec);
4658 gsi_insert_before (&exit_gsi, vec_cond_stmt, GSI_SAME_STMT);
4660 /* Finally we need to extract the data value from the vector (VEC_COND)
4661 into a scalar (MATCHED_DATA_REDUC). Logically we want to do an OR
4662 reduction, but because this doesn't exist, we can use a MAX reduction
4663 instead. The data value might be signed or a float so we need to cast
4664 it first.
4665 In the case where the loop never made any matches, the data values are
4666 all identical, and so will reduce down correctly. */
4668 /* Make the matched data values unsigned. */
4669 tree vec_cond_cast = make_ssa_name (vectype_unsigned);
4670 tree vec_cond_cast_rhs = build1 (VIEW_CONVERT_EXPR, vectype_unsigned,
4671 vec_cond);
4672 gimple *vec_cond_cast_stmt = gimple_build_assign (vec_cond_cast,
4673 VIEW_CONVERT_EXPR,
4674 vec_cond_cast_rhs);
4675 gsi_insert_before (&exit_gsi, vec_cond_cast_stmt, GSI_SAME_STMT);
4677 /* Reduce down to a scalar value. */
4678 tree data_reduc = make_ssa_name (scalar_type_unsigned);
4679 optab ot = optab_for_tree_code (REDUC_MAX_EXPR, vectype_unsigned,
4680 optab_default);
4681 gcc_assert (optab_handler (ot, TYPE_MODE (vectype_unsigned))
4682 != CODE_FOR_nothing);
4683 gimple *data_reduc_stmt = gimple_build_assign (data_reduc,
4684 REDUC_MAX_EXPR,
4685 vec_cond_cast);
4686 gsi_insert_before (&exit_gsi, data_reduc_stmt, GSI_SAME_STMT);
4688 /* Convert the reduced value back to the result type and set as the
4689 result. */
4690 tree data_reduc_cast = build1 (VIEW_CONVERT_EXPR, scalar_type,
4691 data_reduc);
4692 epilog_stmt = gimple_build_assign (new_scalar_dest, data_reduc_cast);
4693 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
4694 gimple_assign_set_lhs (epilog_stmt, new_temp);
4695 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4696 scalar_results.safe_push (new_temp);
4699 /* 2.3 Create the reduction code, using one of the three schemes described
4700 above. In SLP we simply need to extract all the elements from the
4701 vector (without reducing them), so we use scalar shifts. */
4702 else if (reduc_code != ERROR_MARK && !slp_reduc)
4704 tree tmp;
4705 tree vec_elem_type;
4707 /*** Case 1: Create:
4708 v_out2 = reduc_expr <v_out1> */
4710 if (dump_enabled_p ())
4711 dump_printf_loc (MSG_NOTE, vect_location,
4712 "Reduce using direct vector reduction.\n");
4714 vec_elem_type = TREE_TYPE (TREE_TYPE (new_phi_result));
4715 if (!useless_type_conversion_p (scalar_type, vec_elem_type))
4717 tree tmp_dest =
4718 vect_create_destination_var (scalar_dest, vec_elem_type);
4719 tmp = build1 (reduc_code, vec_elem_type, new_phi_result);
4720 epilog_stmt = gimple_build_assign (tmp_dest, tmp);
4721 new_temp = make_ssa_name (tmp_dest, epilog_stmt);
4722 gimple_assign_set_lhs (epilog_stmt, new_temp);
4723 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4725 tmp = build1 (NOP_EXPR, scalar_type, new_temp);
4727 else
4728 tmp = build1 (reduc_code, scalar_type, new_phi_result);
4730 epilog_stmt = gimple_build_assign (new_scalar_dest, tmp);
4731 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
4732 gimple_assign_set_lhs (epilog_stmt, new_temp);
4733 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4735 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
4736 == INTEGER_INDUC_COND_REDUCTION)
4738 /* Earlier we set the initial value to be zero. Check the result
4739 and if it is zero then replace with the original initial
4740 value. */
4741 tree zero = build_zero_cst (scalar_type);
4742 tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp, zero);
4744 tmp = make_ssa_name (new_scalar_dest);
4745 epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
4746 initial_def, new_temp);
4747 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4748 new_temp = tmp;
4751 scalar_results.safe_push (new_temp);
4753 else
4755 bool reduce_with_shift = have_whole_vector_shift (mode);
4756 int element_bitsize = tree_to_uhwi (bitsize);
4757 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
4758 tree vec_temp;
4760 /* Regardless of whether we have a whole vector shift, if we're
4761 emulating the operation via tree-vect-generic, we don't want
4762 to use it. Only the first round of the reduction is likely
4763 to still be profitable via emulation. */
4764 /* ??? It might be better to emit a reduction tree code here, so that
4765 tree-vect-generic can expand the first round via bit tricks. */
4766 if (!VECTOR_MODE_P (mode))
4767 reduce_with_shift = false;
4768 else
4770 optab optab = optab_for_tree_code (code, vectype, optab_default);
4771 if (optab_handler (optab, mode) == CODE_FOR_nothing)
4772 reduce_with_shift = false;
4775 if (reduce_with_shift && !slp_reduc)
4777 int nelements = vec_size_in_bits / element_bitsize;
4778 unsigned char *sel = XALLOCAVEC (unsigned char, nelements);
4780 int elt_offset;
4782 tree zero_vec = build_zero_cst (vectype);
4783 /*** Case 2: Create:
4784 for (offset = nelements/2; offset >= 1; offset/=2)
4786 Create: va' = vec_shift <va, offset>
4787 Create: va = vop <va, va'>
4788 } */
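/* A hand-written equivalent of this scheme for a 4-lane integer PLUS
   reduction, using GNU C vector extensions (purely illustrative; the
   function name and types are made up, and __builtin_shuffle is what the
   VEC_PERM_EXPRs generated below correspond to):

   typedef int v4si __attribute__ ((vector_size (16)));

   static int
   reduce_plus_v4si (v4si v)
   {
     v4si zero = { 0, 0, 0, 0 };
     v = v + __builtin_shuffle (v, zero, (v4si) { 2, 3, 4, 5 });
     v = v + __builtin_shuffle (v, zero, (v4si) { 1, 4, 5, 6 });
     return v[0];
   }  */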
4790 tree rhs;
4792 if (dump_enabled_p ())
4793 dump_printf_loc (MSG_NOTE, vect_location,
4794 "Reduce using vector shifts\n");
4796 vec_dest = vect_create_destination_var (scalar_dest, vectype);
4797 new_temp = new_phi_result;
4798 for (elt_offset = nelements / 2;
4799 elt_offset >= 1;
4800 elt_offset /= 2)
4802 calc_vec_perm_mask_for_shift (mode, elt_offset, sel);
4803 tree mask = vect_gen_perm_mask_any (vectype, sel);
4804 epilog_stmt = gimple_build_assign (vec_dest, VEC_PERM_EXPR,
4805 new_temp, zero_vec, mask);
4806 new_name = make_ssa_name (vec_dest, epilog_stmt);
4807 gimple_assign_set_lhs (epilog_stmt, new_name);
4808 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4810 epilog_stmt = gimple_build_assign (vec_dest, code, new_name,
4811 new_temp);
4812 new_temp = make_ssa_name (vec_dest, epilog_stmt);
4813 gimple_assign_set_lhs (epilog_stmt, new_temp);
4814 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4817 /* 2.4 Extract the final scalar result. Create:
4818 s_out3 = extract_field <v_out2, bitpos> */
4820 if (dump_enabled_p ())
4821 dump_printf_loc (MSG_NOTE, vect_location,
4822 "extract scalar result\n");
4824 rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp,
4825 bitsize, bitsize_zero_node);
4826 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
4827 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
4828 gimple_assign_set_lhs (epilog_stmt, new_temp);
4829 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4830 scalar_results.safe_push (new_temp);
4832 else
4834 /*** Case 3: Create:
4835 s = extract_field <v_out2, 0>
4836 for (offset = element_size;
4837 offset < vector_size;
4838 offset += element_size;)
4840 Create: s' = extract_field <v_out2, offset>
4841 Create: s = op <s, s'> // For non SLP cases
4842 } */
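/* A sketch of what this scheme produces for a 4-lane vector { a, b, c, d }
   in the non-SLP case (names are illustrative):
     s  = BIT_FIELD_REF <v_out2, size, 0>        (= a)
     s' = BIT_FIELD_REF <v_out2, size, size>     (= b)   s = s op s'
     s' = BIT_FIELD_REF <v_out2, size, 2*size>   (= c)   s = s op s'
     s' = BIT_FIELD_REF <v_out2, size, 3*size>   (= d)   s = s op s'
   In the SLP case the four extracted values are pushed into
   SCALAR_RESULTS instead of being combined.  */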
4844 if (dump_enabled_p ())
4845 dump_printf_loc (MSG_NOTE, vect_location,
4846 "Reduce using scalar code.\n");
4848 vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
4849 FOR_EACH_VEC_ELT (new_phis, i, new_phi)
4851 int bit_offset;
4852 if (gimple_code (new_phi) == GIMPLE_PHI)
4853 vec_temp = PHI_RESULT (new_phi);
4854 else
4855 vec_temp = gimple_assign_lhs (new_phi);
4856 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, bitsize,
4857 bitsize_zero_node);
4858 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
4859 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
4860 gimple_assign_set_lhs (epilog_stmt, new_temp);
4861 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4863 /* In SLP we don't need to apply the reduction operation, so we just
4864 collect s' values in SCALAR_RESULTS. */
4865 if (slp_reduc)
4866 scalar_results.safe_push (new_temp);
4868 for (bit_offset = element_bitsize;
4869 bit_offset < vec_size_in_bits;
4870 bit_offset += element_bitsize)
4872 tree bitpos = bitsize_int (bit_offset);
4873 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp,
4874 bitsize, bitpos);
4876 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
4877 new_name = make_ssa_name (new_scalar_dest, epilog_stmt);
4878 gimple_assign_set_lhs (epilog_stmt, new_name);
4879 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4881 if (slp_reduc)
4883 /* In SLP we don't need to apply the reduction operation, so
4884 we just collect s' values in SCALAR_RESULTS. */
4885 new_temp = new_name;
4886 scalar_results.safe_push (new_name);
4888 else
4890 epilog_stmt = gimple_build_assign (new_scalar_dest, code,
4891 new_name, new_temp);
4892 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
4893 gimple_assign_set_lhs (epilog_stmt, new_temp);
4894 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4899 /* The only case where we need to reduce scalar results in SLP is
4900 unrolling. If the size of SCALAR_RESULTS is greater than
4901 GROUP_SIZE, we reduce them by combining elements modulo
4902 GROUP_SIZE. */
4903 if (slp_reduc)
4905 tree res, first_res, new_res;
4906 gimple *new_stmt;
4908 /* Reduce multiple scalar results in case of SLP unrolling. */
4909 for (j = group_size; scalar_results.iterate (j, &res);
4910 j++)
4912 first_res = scalar_results[j % group_size];
4913 new_stmt = gimple_build_assign (new_scalar_dest, code,
4914 first_res, res);
4915 new_res = make_ssa_name (new_scalar_dest, new_stmt);
4916 gimple_assign_set_lhs (new_stmt, new_res);
4917 gsi_insert_before (&exit_gsi, new_stmt, GSI_SAME_STMT);
4918 scalar_results[j % group_size] = new_res;
4921 else
4922 /* Not SLP - we have one scalar to keep in SCALAR_RESULTS. */
4923 scalar_results.safe_push (new_temp);
4927 vect_finalize_reduction:
4929 if (double_reduc)
4930 loop = loop->inner;
4932 /* 2.5 Adjust the final result by the initial value of the reduction
4933 variable. (When such an adjustment is not needed,
4934 'adjustment_def' is zero.) For example, if code is PLUS we create:
4935 new_temp = loop_exit_def + adjustment_def */
4937 if (adjustment_def)
4939 gcc_assert (!slp_reduc);
4940 if (nested_in_vect_loop)
4942 new_phi = new_phis[0];
4943 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) == VECTOR_TYPE);
4944 expr = build2 (code, vectype, PHI_RESULT (new_phi), adjustment_def);
4945 new_dest = vect_create_destination_var (scalar_dest, vectype);
4947 else
4949 new_temp = scalar_results[0];
4950 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
4951 expr = build2 (code, scalar_type, new_temp, adjustment_def);
4952 new_dest = vect_create_destination_var (scalar_dest, scalar_type);
4955 epilog_stmt = gimple_build_assign (new_dest, expr);
4956 new_temp = make_ssa_name (new_dest, epilog_stmt);
4957 gimple_assign_set_lhs (epilog_stmt, new_temp);
4958 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4959 if (nested_in_vect_loop)
4961 set_vinfo_for_stmt (epilog_stmt,
4962 new_stmt_vec_info (epilog_stmt, loop_vinfo));
4963 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (epilog_stmt)) =
4964 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (new_phi));
4966 if (!double_reduc)
4967 scalar_results.quick_push (new_temp);
4968 else
4969 scalar_results[0] = new_temp;
4971 else
4972 scalar_results[0] = new_temp;
4974 new_phis[0] = epilog_stmt;
4977 /* 2.6 Handle the loop-exit phis. Replace the uses of scalar loop-exit
4978 phis with new adjusted scalar results, i.e., replace use <s_out0>
4979 with use <s_out4>.
4981 Transform:
4982 loop_exit:
4983 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
4984 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
4985 v_out2 = reduce <v_out1>
4986 s_out3 = extract_field <v_out2, 0>
4987 s_out4 = adjust_result <s_out3>
4988 use <s_out0>
4989 use <s_out0>
4991 into:
4993 loop_exit:
4994 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
4995 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
4996 v_out2 = reduce <v_out1>
4997 s_out3 = extract_field <v_out2, 0>
4998 s_out4 = adjust_result <s_out3>
4999 use <s_out4>
5000 use <s_out4> */
5003 /* In SLP reduction chain we reduce vector results into one vector if
5004 necessary, hence we set here GROUP_SIZE to 1. SCALAR_DEST is the LHS of
5005 the last stmt in the reduction chain, since we are looking for the loop
5006 exit phi node. */
5007 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
5009 gimple *dest_stmt = SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1];
5010 /* Handle reduction patterns. */
5011 if (STMT_VINFO_RELATED_STMT (vinfo_for_stmt (dest_stmt)))
5012 dest_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (dest_stmt));
5014 scalar_dest = gimple_assign_lhs (dest_stmt);
5015 group_size = 1;
5018 /* In SLP we may have several statements in NEW_PHIS and REDUCTION_PHIS (in
5019 case GROUP_SIZE is greater than the vectorization factor). Therefore, we
5020 need to match SCALAR_RESULTS with the corresponding statements. The first
5021 (GROUP_SIZE / number of new vector stmts) scalar results correspond to
5022 the first vector stmt, etc.
5023 (RATIO is equal to (GROUP_SIZE / number of new vector stmts)). */
5024 if (group_size > new_phis.length ())
5026 ratio = group_size / new_phis.length ();
5027 gcc_assert (!(group_size % new_phis.length ()));
5029 else
5030 ratio = 1;
5032 for (k = 0; k < group_size; k++)
5034 if (k % ratio == 0)
5036 epilog_stmt = new_phis[k / ratio];
5037 reduction_phi = reduction_phis[k / ratio];
5038 if (double_reduc)
5039 inner_phi = inner_phis[k / ratio];
5042 if (slp_reduc)
5044 gimple *current_stmt = SLP_TREE_SCALAR_STMTS (slp_node)[k];
5046 orig_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (current_stmt));
5047 /* SLP statements can't participate in patterns. */
5048 gcc_assert (!orig_stmt);
5049 scalar_dest = gimple_assign_lhs (current_stmt);
5052 phis.create (3);
5053 /* Find the loop-closed-use at the loop exit of the original scalar
5054 result. (The reduction result is expected to have two immediate uses -
5055 one at the latch block, and one at the loop exit). */
5056 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
5057 if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p)))
5058 && !is_gimple_debug (USE_STMT (use_p)))
5059 phis.safe_push (USE_STMT (use_p));
5061 /* While we expect to have found an exit_phi because of loop-closed-ssa
5062 form, we can end up without one if the scalar cycle is dead. */
5064 FOR_EACH_VEC_ELT (phis, i, exit_phi)
5066 if (outer_loop)
5068 stmt_vec_info exit_phi_vinfo = vinfo_for_stmt (exit_phi);
5069 gphi *vect_phi;
5071 /* FORNOW. Currently not supporting the case that an inner-loop
5072 reduction is not used in the outer-loop (but only outside the
5073 outer-loop), unless it is a double reduction. */
5074 gcc_assert ((STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
5075 && !STMT_VINFO_LIVE_P (exit_phi_vinfo))
5076 || double_reduc);
5078 if (double_reduc)
5079 STMT_VINFO_VEC_STMT (exit_phi_vinfo) = inner_phi;
5080 else
5081 STMT_VINFO_VEC_STMT (exit_phi_vinfo) = epilog_stmt;
5082 if (!double_reduc
5083 || STMT_VINFO_DEF_TYPE (exit_phi_vinfo)
5084 != vect_double_reduction_def)
5085 continue;
5087 /* Handle double reduction:
5089 stmt1: s1 = phi <s0, s2> - double reduction phi (outer loop)
5090 stmt2: s3 = phi <s1, s4> - (regular) reduc phi (inner loop)
5091 stmt3: s4 = use (s3) - (regular) reduc stmt (inner loop)
5092 stmt4: s2 = phi <s4> - double reduction stmt (outer loop)
5094 At that point the regular reduction (stmt2 and stmt3) is
5095 already vectorized, as well as the exit phi node, stmt4.
5096 Here we vectorize the phi node of double reduction, stmt1, and
5097 update all relevant statements. */
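/* A source-level sketch of a double reduction that matches stmt1..stmt4
   above (variable names are illustrative):

     s = init;
     for (i = 0; i < n; i++)     outer loop: s1 = phi <s0, s2> is stmt1
       for (j = 0; j < m; j++)   inner loop: s3 = phi <s1, s4> is stmt2
         s = s + a[i][j];        inner reduction stmt is stmt3; the inner
                                 loop's exit phi s2 = phi <s4> is stmt4  */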
5099 /* Go through all the uses of s2 to find double reduction phi
5100 node, i.e., stmt1 above. */
5101 orig_name = PHI_RESULT (exit_phi);
5102 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
5104 stmt_vec_info use_stmt_vinfo;
5105 stmt_vec_info new_phi_vinfo;
5106 tree vect_phi_init, preheader_arg, vect_phi_res, init_def;
5107 basic_block bb = gimple_bb (use_stmt);
5108 gimple *use;
5110 /* Check that USE_STMT is really a double reduction phi
5111 node. */
5112 if (gimple_code (use_stmt) != GIMPLE_PHI
5113 || gimple_phi_num_args (use_stmt) != 2
5114 || bb->loop_father != outer_loop)
5115 continue;
5116 use_stmt_vinfo = vinfo_for_stmt (use_stmt);
5117 if (!use_stmt_vinfo
5118 || STMT_VINFO_DEF_TYPE (use_stmt_vinfo)
5119 != vect_double_reduction_def)
5120 continue;
5122 /* Create vector phi node for double reduction:
5123 vs1 = phi <vs0, vs2>
5124 vs1 was created previously in this function by a call to
5125 vect_get_vec_def_for_operand and is stored in
5126 vec_initial_def;
5127 vs2 is defined by INNER_PHI, the vectorized EXIT_PHI;
5128 vs0 is created here. */
5130 /* Create vector phi node. */
5131 vect_phi = create_phi_node (vec_initial_def, bb);
5132 new_phi_vinfo = new_stmt_vec_info (vect_phi,
5133 loop_vec_info_for_loop (outer_loop));
5134 set_vinfo_for_stmt (vect_phi, new_phi_vinfo);
5136 /* Create vs0 - initial def of the double reduction phi. */
5137 preheader_arg = PHI_ARG_DEF_FROM_EDGE (use_stmt,
5138 loop_preheader_edge (outer_loop));
5139 init_def = get_initial_def_for_reduction (stmt,
5140 preheader_arg, NULL);
5141 vect_phi_init = vect_init_vector (use_stmt, init_def,
5142 vectype, NULL);
5144 /* Update phi node arguments with vs0 and vs2. */
5145 add_phi_arg (vect_phi, vect_phi_init,
5146 loop_preheader_edge (outer_loop),
5147 UNKNOWN_LOCATION);
5148 add_phi_arg (vect_phi, PHI_RESULT (inner_phi),
5149 loop_latch_edge (outer_loop), UNKNOWN_LOCATION);
5150 if (dump_enabled_p ())
5152 dump_printf_loc (MSG_NOTE, vect_location,
5153 "created double reduction phi node: ");
5154 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, vect_phi, 0);
5157 vect_phi_res = PHI_RESULT (vect_phi);
5159 /* Replace the use, i.e., set the correct vs1 in the regular
5160 reduction phi node. FORNOW, NCOPIES is always 1, so the
5161 loop is redundant. */
5162 use = reduction_phi;
5163 for (j = 0; j < ncopies; j++)
5165 edge pr_edge = loop_preheader_edge (loop);
5166 SET_PHI_ARG_DEF (use, pr_edge->dest_idx, vect_phi_res);
5167 use = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (use));
5173 phis.release ();
5174 if (nested_in_vect_loop)
5176 if (double_reduc)
5177 loop = outer_loop;
5178 else
5179 continue;
5182 phis.create (3);
5183 /* Find the loop-closed-use at the loop exit of the original scalar
5184 result. (The reduction result is expected to have two immediate uses,
5185 one at the latch block, and one at the loop exit). For double
5186 reductions we are looking for exit phis of the outer loop. */
5187 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
5189 if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
5191 if (!is_gimple_debug (USE_STMT (use_p)))
5192 phis.safe_push (USE_STMT (use_p));
5194 else
5196 if (double_reduc && gimple_code (USE_STMT (use_p)) == GIMPLE_PHI)
5198 tree phi_res = PHI_RESULT (USE_STMT (use_p));
5200 FOR_EACH_IMM_USE_FAST (phi_use_p, phi_imm_iter, phi_res)
5202 if (!flow_bb_inside_loop_p (loop,
5203 gimple_bb (USE_STMT (phi_use_p)))
5204 && !is_gimple_debug (USE_STMT (phi_use_p)))
5205 phis.safe_push (USE_STMT (phi_use_p));
5211 FOR_EACH_VEC_ELT (phis, i, exit_phi)
5213 /* Replace the uses: */
5214 orig_name = PHI_RESULT (exit_phi);
5215 scalar_result = scalar_results[k];
5216 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
5217 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
5218 SET_USE (use_p, scalar_result);
5221 phis.release ();
5226 /* Function is_nonwrapping_integer_induction.
5228 Check if STMT (which is part of loop LOOP) is an integer induction that
5229 increments and cannot cause overflow (wrap). */
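/* For illustration: with base = 0, step = 1, a loop that runs at most 300
   iterations and a phi result of 8-bit unsigned precision, the maximum
   value 0 + 1 * 300 = 300 needs 9 bits, so the induction may wrap and the
   function below returns false.  For a type whose overflow is undefined
   (e.g. a plain signed induction without -fwrapv) it returns true without
   needing the iteration-count check.  */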
5231 static bool
5232 is_nonwrapping_integer_induction (gimple *stmt, struct loop *loop)
5234 stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
5235 tree base = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo);
5236 tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo);
5237 tree lhs_type = TREE_TYPE (gimple_phi_result (stmt));
5238 widest_int ni, max_loop_value, lhs_max;
5239 bool overflow = false;
5241 /* Make sure the loop is integer based. */
5242 if (TREE_CODE (base) != INTEGER_CST
5243 || TREE_CODE (step) != INTEGER_CST)
5244 return false;
5246 /* Check that the induction increments. */
5247 if (tree_int_cst_sgn (step) == -1)
5248 return false;
5250 /* Check that the maximum value reached by the induction will not wrap. */
5252 if (TYPE_OVERFLOW_UNDEFINED (lhs_type))
5253 return true;
5255 if (! max_stmt_executions (loop, &ni))
5256 return false;
5258 max_loop_value = wi::mul (wi::to_widest (step), ni, TYPE_SIGN (lhs_type),
5259 &overflow);
5260 if (overflow)
5261 return false;
5263 max_loop_value = wi::add (wi::to_widest (base), max_loop_value,
5264 TYPE_SIGN (lhs_type), &overflow);
5265 if (overflow)
5266 return false;
5268 return (wi::min_precision (max_loop_value, TYPE_SIGN (lhs_type))
5269 <= TYPE_PRECISION (lhs_type));
5272 /* Function vectorizable_reduction.
5274 Check if STMT performs a reduction operation that can be vectorized.
5275 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
5276 stmt to replace it, put it in VEC_STMT, and insert it at GSI.
5277 Return FALSE if not a vectorizable STMT, TRUE otherwise.
5279 This function also handles reduction idioms (patterns) that have been
5280 recognized in advance during vect_pattern_recog. In this case, STMT may be
5281 of this form:
5282 X = pattern_expr (arg0, arg1, ..., X)
5283 and its STMT_VINFO_RELATED_STMT points to the last stmt in the original
5284 sequence that had been detected and replaced by the pattern-stmt (STMT).
5286 This function also handles reduction of condition expressions, for example:
5287 for (int i = 0; i < N; i++)
5288 if (a[i] < value)
5289 last = a[i];
5290 This is handled by vectorising the loop and creating an additional vector
5291 containing the loop indexes for which "a[i] < value" was true. In the
5292 function epilogue this is reduced to a single max value and then used to
5293 index into the vector of results.
5295 In some cases of reduction patterns, the type of the reduction variable X is
5296 different than the type of the other arguments of STMT.
5297 In such cases, the vectype that is used when transforming STMT into a vector
5298 stmt is different than the vectype that is used to determine the
5299 vectorization factor, because it consists of a different number of elements
5300 than the actual number of elements that are being operated upon in parallel.
5302 For example, consider an accumulation of shorts into an int accumulator.
5303 On some targets it's possible to vectorize this pattern operating on 8
5304 shorts at a time (hence, the vectype for purposes of determining the
5305 vectorization factor should be V8HI); on the other hand, the vectype that
5306 is used to create the vector form is actually V4SI (the type of the result).
5308 Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
5309 indicates the actual level of parallelism (V8HI in the example), so
5310 that the right vectorization factor can be derived. This vectype
5311 corresponds to the type of arguments to the reduction stmt, and should *NOT*
5312 be used to create the vectorized stmt. The right vectype for the vectorized
5313 stmt is obtained from the type of the result X:
5314 get_vectype_for_scalar_type (TREE_TYPE (X))
5316 This means that, contrary to "regular" reductions (or "regular" stmts in
5317 general), the following equation:
5318 STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (TREE_TYPE (X))
5319 does *NOT* necessarily hold for reduction patterns. */
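/* A source-level sketch of such a pattern (illustrative only):

     short a[N];  int sum = 0;
     for (i = 0; i < N; i++)
       sum += a[i];

   Here the stmt arguments are shorts (so the VF-determining vectype may be
   V8HI) while the result X is an int, so the vectorized stmt itself is
   created with get_vectype_for_scalar_type (TREE_TYPE (X)), e.g. V4SI.  */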
5321 bool
5322 vectorizable_reduction (gimple *stmt, gimple_stmt_iterator *gsi,
5323 gimple **vec_stmt, slp_tree slp_node)
5325 tree vec_dest;
5326 tree scalar_dest;
5327 tree loop_vec_def0 = NULL_TREE, loop_vec_def1 = NULL_TREE;
5328 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5329 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
5330 tree vectype_in = NULL_TREE;
5331 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5332 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5333 enum tree_code code, orig_code, epilog_reduc_code;
5334 machine_mode vec_mode;
5335 int op_type;
5336 optab optab, reduc_optab;
5337 tree new_temp = NULL_TREE;
5338 gimple *def_stmt;
5339 enum vect_def_type dt;
5340 gphi *new_phi = NULL;
5341 tree scalar_type;
5342 bool is_simple_use;
5343 gimple *orig_stmt;
5344 stmt_vec_info orig_stmt_info;
5345 tree expr = NULL_TREE;
5346 int i;
5347 int ncopies;
5348 int epilog_copies;
5349 stmt_vec_info prev_stmt_info, prev_phi_info;
5350 bool single_defuse_cycle = false;
5351 tree reduc_def = NULL_TREE;
5352 gimple *new_stmt = NULL;
5353 int j;
5354 tree ops[3];
5355 bool nested_cycle = false, found_nested_cycle_def = false;
5356 gimple *reduc_def_stmt = NULL;
5357 bool double_reduc = false, dummy;
5358 basic_block def_bb;
5359 struct loop * def_stmt_loop, *outer_loop = NULL;
5360 tree def_arg;
5361 gimple *def_arg_stmt;
5362 auto_vec<tree> vec_oprnds0;
5363 auto_vec<tree> vec_oprnds1;
5364 auto_vec<tree> vect_defs;
5365 auto_vec<gimple *> phis;
5366 int vec_num;
5367 tree def0, def1, tem, op0, op1 = NULL_TREE;
5368 bool first_p = true;
5369 tree cr_index_scalar_type = NULL_TREE, cr_index_vector_type = NULL_TREE;
5370 gimple *cond_expr_induction_def_stmt = NULL;
5372 /* In case of reduction chain we switch to the first stmt in the chain, but
5373 we don't update STMT_INFO, since only the last stmt is marked as reduction
5374 and has reduction properties. */
5375 if (GROUP_FIRST_ELEMENT (stmt_info)
5376 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
5378 stmt = GROUP_FIRST_ELEMENT (stmt_info);
5379 first_p = false;
5382 if (nested_in_vect_loop_p (loop, stmt))
5384 outer_loop = loop;
5385 loop = loop->inner;
5386 nested_cycle = true;
5389 /* 1. Is vectorizable reduction? */
5390 /* Not supportable if the reduction variable is used in the loop, unless
5391 it's a reduction chain. */
5392 if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer
5393 && !GROUP_FIRST_ELEMENT (stmt_info))
5394 return false;
5396    /* Reductions that are not used even in an enclosing outer-loop
5397       are expected to be "live" (used out of the loop).  */
5398 if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope
5399 && !STMT_VINFO_LIVE_P (stmt_info))
5400 return false;
5402 /* Make sure it was already recognized as a reduction computation. */
5403 if (STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmt)) != vect_reduction_def
5404 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmt)) != vect_nested_cycle)
5405 return false;
5407 /* 2. Has this been recognized as a reduction pattern?
5409 Check if STMT represents a pattern that has been recognized
5410 in earlier analysis stages. For stmts that represent a pattern,
5411 the STMT_VINFO_RELATED_STMT field records the last stmt in
5412 the original sequence that constitutes the pattern. */
5414 orig_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt));
5415 if (orig_stmt)
5417 orig_stmt_info = vinfo_for_stmt (orig_stmt);
5418 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
5419 gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
5422 /* 3. Check the operands of the operation. The first operands are defined
5423 inside the loop body. The last operand is the reduction variable,
5424 which is defined by the loop-header-phi. */
5426 gcc_assert (is_gimple_assign (stmt));
5428 /* Flatten RHS. */
5429 switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
5431 case GIMPLE_SINGLE_RHS:
5432 op_type = TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt));
5433 if (op_type == ternary_op)
5435 tree rhs = gimple_assign_rhs1 (stmt);
5436 ops[0] = TREE_OPERAND (rhs, 0);
5437 ops[1] = TREE_OPERAND (rhs, 1);
5438 ops[2] = TREE_OPERAND (rhs, 2);
5439 code = TREE_CODE (rhs);
5441 else
5442 return false;
5443 break;
5445 case GIMPLE_BINARY_RHS:
5446 code = gimple_assign_rhs_code (stmt);
5447 op_type = TREE_CODE_LENGTH (code);
5448 gcc_assert (op_type == binary_op);
5449 ops[0] = gimple_assign_rhs1 (stmt);
5450 ops[1] = gimple_assign_rhs2 (stmt);
5451 break;
5453 case GIMPLE_TERNARY_RHS:
5454 code = gimple_assign_rhs_code (stmt);
5455 op_type = TREE_CODE_LENGTH (code);
5456 gcc_assert (op_type == ternary_op);
5457 ops[0] = gimple_assign_rhs1 (stmt);
5458 ops[1] = gimple_assign_rhs2 (stmt);
5459 ops[2] = gimple_assign_rhs3 (stmt);
5460 break;
5462 case GIMPLE_UNARY_RHS:
5463 return false;
5465 default:
5466 gcc_unreachable ();
5468    /* The default is that the reduction variable is the last operand in the statement.  */
5469 int reduc_index = op_type - 1;
5470 if (code == MINUS_EXPR)
5471 reduc_index = 0;
5473 if (code == COND_EXPR && slp_node)
5474 return false;
5476 scalar_dest = gimple_assign_lhs (stmt);
5477 scalar_type = TREE_TYPE (scalar_dest);
5478 if (!POINTER_TYPE_P (scalar_type) && !INTEGRAL_TYPE_P (scalar_type)
5479 && !SCALAR_FLOAT_TYPE_P (scalar_type))
5480 return false;
5482 /* Do not try to vectorize bit-precision reductions. */
5483 if ((TYPE_PRECISION (scalar_type)
5484 != GET_MODE_PRECISION (TYPE_MODE (scalar_type))))
5485 return false;
5487 /* All uses but the last are expected to be defined in the loop.
5488 The last use is the reduction variable. In case of nested cycle this
5489 assumption is not true: we use reduc_index to record the index of the
5490 reduction variable. */
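        /* For instance, for a scalar reduction sum_1 = a[i] + sum_0 the operands
           are ops[0] = a[i] and ops[1] = sum_0, and reduc_index is 1: the value
           defined by the loop-header-phi (sum_0) is the reduction variable.  */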
5491 for (i = 0; i < op_type; i++)
5493 if (i == reduc_index)
5494 continue;
5496 /* The condition of COND_EXPR is checked in vectorizable_condition(). */
5497 if (i == 0 && code == COND_EXPR)
5498 continue;
5500 is_simple_use = vect_is_simple_use (ops[i], loop_vinfo,
5501 &def_stmt, &dt, &tem);
5502 if (!vectype_in)
5503 vectype_in = tem;
5504 gcc_assert (is_simple_use);
5506 if (dt != vect_internal_def
5507 && dt != vect_external_def
5508 && dt != vect_constant_def
5509 && dt != vect_induction_def
5510 && !(dt == vect_nested_cycle && nested_cycle))
5511 return false;
5513 if (dt == vect_nested_cycle)
5515 found_nested_cycle_def = true;
5516 reduc_def_stmt = def_stmt;
5517 reduc_index = i;
5520 if (i == 1 && code == COND_EXPR && dt == vect_induction_def)
5521 cond_expr_induction_def_stmt = def_stmt;
5524 is_simple_use = vect_is_simple_use (ops[reduc_index], loop_vinfo,
5525 &def_stmt, &dt, &tem);
5526 if (!vectype_in)
5527 vectype_in = tem;
5528 gcc_assert (is_simple_use);
5529 if (!found_nested_cycle_def)
5530 reduc_def_stmt = def_stmt;
5532 if (reduc_def_stmt && gimple_code (reduc_def_stmt) != GIMPLE_PHI)
5533 return false;
5535 if (!(dt == vect_reduction_def
5536 || dt == vect_nested_cycle
5537 || ((dt == vect_internal_def || dt == vect_external_def
5538 || dt == vect_constant_def || dt == vect_induction_def)
5539 && nested_cycle && found_nested_cycle_def)))
5541 /* For pattern recognized stmts, orig_stmt might be a reduction,
5542 but some helper statements for the pattern might not, or
5543 might be COND_EXPRs with reduction uses in the condition. */
5544 gcc_assert (orig_stmt);
5545 return false;
5548 enum vect_reduction_type v_reduc_type;
5549 gimple *tmp = vect_is_simple_reduction (loop_vinfo, reduc_def_stmt,
5550 !nested_cycle, &dummy, false,
5551 &v_reduc_type);
5553 /* If we have a condition reduction, see if we can simplify it further. */
5554 if (v_reduc_type == COND_REDUCTION
5555 && cond_expr_induction_def_stmt != NULL
5556 && is_nonwrapping_integer_induction (cond_expr_induction_def_stmt, loop))
5558 if (dump_enabled_p ())
5559 dump_printf_loc (MSG_NOTE, vect_location,
5560 "condition expression based on integer induction.\n");
5561 STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) = INTEGER_INDUC_COND_REDUCTION;
5563 else
5564 STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) = v_reduc_type;
5566 if (orig_stmt)
5567 gcc_assert (tmp == orig_stmt
5568 || GROUP_FIRST_ELEMENT (vinfo_for_stmt (tmp)) == orig_stmt);
5569 else
5570 /* We changed STMT to be the first stmt in reduction chain, hence we
5571 check that in this case the first element in the chain is STMT. */
5572 gcc_assert (stmt == tmp
5573 || GROUP_FIRST_ELEMENT (vinfo_for_stmt (tmp)) == stmt);
5575 if (STMT_VINFO_LIVE_P (vinfo_for_stmt (reduc_def_stmt)))
5576 return false;
5578 if (slp_node)
5579 ncopies = 1;
5580 else
5581 ncopies = (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
5582 / TYPE_VECTOR_SUBPARTS (vectype_in));
5584 gcc_assert (ncopies >= 1);
5586 vec_mode = TYPE_MODE (vectype_in);
5588 if (code == COND_EXPR)
5590 /* Only call during the analysis stage, otherwise we'll lose
5591 STMT_VINFO_TYPE. */
5592 if (!vec_stmt && !vectorizable_condition (stmt, gsi, NULL,
5593 ops[reduc_index], 0, NULL))
5595 if (dump_enabled_p ())
5596 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5597 "unsupported condition in reduction\n");
5598 return false;
5601 else
5603 /* 4. Supportable by target? */
5605 if (code == LSHIFT_EXPR || code == RSHIFT_EXPR
5606 || code == LROTATE_EXPR || code == RROTATE_EXPR)
5608 /* Shifts and rotates are only supported by vectorizable_shifts,
5609 not vectorizable_reduction. */
5610 if (dump_enabled_p ())
5611 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5612 "unsupported shift or rotation.\n");
5613 return false;
5616 /* 4.1. check support for the operation in the loop */
5617 optab = optab_for_tree_code (code, vectype_in, optab_default);
5618 if (!optab)
5620 if (dump_enabled_p ())
5621 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5622 "no optab.\n");
5624 return false;
5627 if (optab_handler (optab, vec_mode) == CODE_FOR_nothing)
5629 if (dump_enabled_p ())
5630 dump_printf (MSG_NOTE, "op not supported by target.\n");
5632 if (GET_MODE_SIZE (vec_mode) != UNITS_PER_WORD
5633 || LOOP_VINFO_VECT_FACTOR (loop_vinfo)
5634 < vect_min_worthwhile_factor (code))
5635 return false;
5637 if (dump_enabled_p ())
5638 dump_printf (MSG_NOTE, "proceeding using word mode.\n");
5641 /* Worthwhile without SIMD support? */
5642 if (!VECTOR_MODE_P (TYPE_MODE (vectype_in))
5643 && LOOP_VINFO_VECT_FACTOR (loop_vinfo)
5644 < vect_min_worthwhile_factor (code))
5646 if (dump_enabled_p ())
5647 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5648 "not worthwhile without SIMD support.\n");
5650 return false;
5654 /* 4.2. Check support for the epilog operation.
5656 If STMT represents a reduction pattern, then the type of the
5657 reduction variable may be different than the type of the rest
5658 of the arguments. For example, consider the case of accumulation
5659       of shorts into an int accumulator.  The original code:
5660 S1: int_a = (int) short_a;
5661 orig_stmt-> S2: int_acc = plus <int_a ,int_acc>;
5663 was replaced with:
5664 STMT: int_acc = widen_sum <short_a, int_acc>
5666 This means that:
5667 1. The tree-code that is used to create the vector operation in the
5668 epilog code (that reduces the partial results) is not the
5669 tree-code of STMT, but is rather the tree-code of the original
5670 stmt from the pattern that STMT is replacing. I.e, in the example
5671 above we want to use 'widen_sum' in the loop, but 'plus' in the
5672 epilog.
5673 2. The type (mode) we use to check available target support
5674 for the vector operation to be created in the *epilog*, is
5675 determined by the type of the reduction variable (in the example
5676            above we'd check this: optab_handler (plus_optab, vect_int_mode)).
5677 However the type (mode) we use to check available target support
5678 for the vector operation to be created *inside the loop*, is
5679 determined by the type of the other arguments to STMT (in the
5680 example we'd check this: optab_handler (widen_sum_optab,
5681 vect_short_mode)).
5683 This is contrary to "regular" reductions, in which the types of all
5684 the arguments are the same as the type of the reduction variable.
5685 For "regular" reductions we can therefore use the same vector type
5686 (and also the same tree-code) when generating the epilog code and
5687 when generating the code inside the loop. */
5689 if (orig_stmt)
5691 /* This is a reduction pattern: get the vectype from the type of the
5692 reduction variable, and get the tree-code from orig_stmt. */
5693 gcc_assert (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5694 == TREE_CODE_REDUCTION);
5695 orig_code = gimple_assign_rhs_code (orig_stmt);
5696 gcc_assert (vectype_out);
5697 vec_mode = TYPE_MODE (vectype_out);
5699 else
5701      /* Regular reduction: the same vectype and tree-code that are used for
5702         the vector code inside the loop can also be used for the epilog code.  */
5703 orig_code = code;
5705 if (code == MINUS_EXPR)
5706 orig_code = PLUS_EXPR;
5708 /* For simple condition reductions, replace with the actual expression
5709 we want to base our reduction around. */
5710 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5711 == INTEGER_INDUC_COND_REDUCTION)
5712 orig_code = MAX_EXPR;
5715 if (nested_cycle)
5717 def_bb = gimple_bb (reduc_def_stmt);
5718 def_stmt_loop = def_bb->loop_father;
5719 def_arg = PHI_ARG_DEF_FROM_EDGE (reduc_def_stmt,
5720 loop_preheader_edge (def_stmt_loop));
5721 if (TREE_CODE (def_arg) == SSA_NAME
5722 && (def_arg_stmt = SSA_NAME_DEF_STMT (def_arg))
5723 && gimple_code (def_arg_stmt) == GIMPLE_PHI
5724 && flow_bb_inside_loop_p (outer_loop, gimple_bb (def_arg_stmt))
5725 && vinfo_for_stmt (def_arg_stmt)
5726 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_arg_stmt))
5727 == vect_double_reduction_def)
5728 double_reduc = true;
5731 epilog_reduc_code = ERROR_MARK;
5733 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == TREE_CODE_REDUCTION
5734 || STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5735 == INTEGER_INDUC_COND_REDUCTION)
5737 if (reduction_code_for_scalar_code (orig_code, &epilog_reduc_code))
5739 reduc_optab = optab_for_tree_code (epilog_reduc_code, vectype_out,
5740 optab_default);
5741 if (!reduc_optab)
5743 if (dump_enabled_p ())
5744 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5745 "no optab for reduction.\n");
5747 epilog_reduc_code = ERROR_MARK;
5749 else if (optab_handler (reduc_optab, vec_mode) == CODE_FOR_nothing)
5751 if (dump_enabled_p ())
5752 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5753 "reduc op not supported by target.\n");
5755 epilog_reduc_code = ERROR_MARK;
5758 /* When epilog_reduc_code is ERROR_MARK then a reduction will be
5759 generated in the epilog using multiple expressions. This does not
5760 work for condition reductions. */
5761 if (epilog_reduc_code == ERROR_MARK
5762 && STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5763 == INTEGER_INDUC_COND_REDUCTION)
5765 if (dump_enabled_p ())
5766 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5767 "no reduc code for scalar code.\n");
5768 return false;
5771 else
5773 if (!nested_cycle || double_reduc)
5775 if (dump_enabled_p ())
5776 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5777 "no reduc code for scalar code.\n");
5779 return false;
5783 else
5785 int scalar_precision = GET_MODE_PRECISION (TYPE_MODE (scalar_type));
5786 cr_index_scalar_type = make_unsigned_type (scalar_precision);
5787 cr_index_vector_type = build_vector_type
5788 (cr_index_scalar_type, TYPE_VECTOR_SUBPARTS (vectype_out));
5790 epilog_reduc_code = REDUC_MAX_EXPR;
5791 optab = optab_for_tree_code (REDUC_MAX_EXPR, cr_index_vector_type,
5792 optab_default);
5793 if (optab_handler (optab, TYPE_MODE (cr_index_vector_type))
5794 == CODE_FOR_nothing)
5796 if (dump_enabled_p ())
5797 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5798 "reduc max op not supported by target.\n");
5799 return false;
5803 if ((double_reduc
5804 || STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION
5805 || STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5806 == INTEGER_INDUC_COND_REDUCTION)
5807 && ncopies > 1)
5809 if (dump_enabled_p ())
5810 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5811 "multiple types in double reduction or condition "
5812 "reduction.\n");
5813 return false;
5816    /* In case of widening multiplication by a constant, we update the type
5817 of the constant to be the type of the other operand. We check that the
5818 constant fits the type in the pattern recognition pass. */
5819 if (code == DOT_PROD_EXPR
5820 && !types_compatible_p (TREE_TYPE (ops[0]), TREE_TYPE (ops[1])))
5822 if (TREE_CODE (ops[0]) == INTEGER_CST)
5823 ops[0] = fold_convert (TREE_TYPE (ops[1]), ops[0]);
5824 else if (TREE_CODE (ops[1]) == INTEGER_CST)
5825 ops[1] = fold_convert (TREE_TYPE (ops[0]), ops[1]);
5826 else
5828 if (dump_enabled_p ())
5829 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5830 "invalid types in dot-prod\n");
5832 return false;
5836 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION)
5838 widest_int ni;
5840 if (! max_loop_iterations (loop, &ni))
5842 if (dump_enabled_p ())
5843 dump_printf_loc (MSG_NOTE, vect_location,
5844 "loop count not known, cannot create cond "
5845 "reduction.\n");
5846 return false;
5848 /* Convert backedges to iterations. */
5849 ni += 1;
5851 /* The additional index will be the same type as the condition. Check
5852 that the loop can fit into this less one (because we'll use up the
5853 zero slot for when there are no matches). */
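        /* For example (illustrative values): if CR_INDEX_SCALAR_TYPE is an
           8-bit unsigned type, MAX_INDEX is 255 and the check below rejects
           loops of 255 or more iterations; indexes 1..254 are then used for
           matches and 0 stays reserved for "no match".  */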
5854 tree max_index = TYPE_MAX_VALUE (cr_index_scalar_type);
5855 if (wi::geu_p (ni, wi::to_widest (max_index)))
5857 if (dump_enabled_p ())
5858 dump_printf_loc (MSG_NOTE, vect_location,
5859 "loop size is greater than data size.\n");
5860 return false;
5864 if (!vec_stmt) /* transformation not required. */
5866 if (first_p
5867 && !vect_model_reduction_cost (stmt_info, epilog_reduc_code, ncopies,
5868 reduc_index))
5869 return false;
5870 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
5871 return true;
5874 /** Transform. **/
5876 if (dump_enabled_p ())
5877 dump_printf_loc (MSG_NOTE, vect_location, "transform reduction.\n");
5879 /* FORNOW: Multiple types are not supported for condition. */
5880 if (code == COND_EXPR)
5881 gcc_assert (ncopies == 1);
5883 /* Create the destination vector */
5884 vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
5886 /* In case the vectorization factor (VF) is bigger than the number
5887 of elements that we can fit in a vectype (nunits), we have to generate
5888 more than one vector stmt - i.e - we need to "unroll" the
5889 vector stmt by a factor VF/nunits. For more details see documentation
5890 in vectorizable_operation. */
5892 /* If the reduction is used in an outer loop we need to generate
5893 VF intermediate results, like so (e.g. for ncopies=2):
5894 r0 = phi (init, r0)
5895 r1 = phi (init, r1)
5896 r0 = x0 + r0;
5897 r1 = x1 + r1;
5898 (i.e. we generate VF results in 2 registers).
5899 In this case we have a separate def-use cycle for each copy, and therefore
5900 for each copy we get the vector def for the reduction variable from the
5901 respective phi node created for this copy.
5903 Otherwise (the reduction is unused in the loop nest), we can combine
5904 together intermediate results, like so (e.g. for ncopies=2):
5905 r = phi (init, r)
5906 r = x0 + r;
5907 r = x1 + r;
5908 (i.e. we generate VF/2 results in a single register).
5909 In this case for each copy we get the vector def for the reduction variable
5910 from the vectorized reduction operation generated in the previous iteration.
5913 if (STMT_VINFO_RELEVANT (stmt_info) <= vect_used_only_live)
5915 single_defuse_cycle = true;
5916 epilog_copies = 1;
5918 else
5919 epilog_copies = ncopies;
5921 prev_stmt_info = NULL;
5922 prev_phi_info = NULL;
5923 if (slp_node)
5924 vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
5925 else
5927 vec_num = 1;
5928 vec_oprnds0.create (1);
5929 if (op_type == ternary_op)
5930 vec_oprnds1.create (1);
5933 phis.create (vec_num);
5934 vect_defs.create (vec_num);
5935 if (!slp_node)
5936 vect_defs.quick_push (NULL_TREE);
5938 for (j = 0; j < ncopies; j++)
5940 if (j == 0 || !single_defuse_cycle)
5942 for (i = 0; i < vec_num; i++)
5944 /* Create the reduction-phi that defines the reduction
5945 operand. */
5946 new_phi = create_phi_node (vec_dest, loop->header);
5947 set_vinfo_for_stmt (new_phi,
5948 new_stmt_vec_info (new_phi, loop_vinfo));
5949 if (j == 0 || slp_node)
5950 phis.quick_push (new_phi);
5954 if (code == COND_EXPR)
5956 gcc_assert (!slp_node);
5957 vectorizable_condition (stmt, gsi, vec_stmt,
5958 PHI_RESULT (phis[0]),
5959 reduc_index, NULL);
5960 /* Multiple types are not supported for condition. */
5961 break;
5964 /* Handle uses. */
5965 if (j == 0)
5967 op0 = ops[!reduc_index];
5968 if (op_type == ternary_op)
5970 if (reduc_index == 0)
5971 op1 = ops[2];
5972 else
5973 op1 = ops[1];
5976 if (slp_node)
5977 vect_get_vec_defs (op0, op1, stmt, &vec_oprnds0, &vec_oprnds1,
5978 slp_node, -1);
5979 else
5981 loop_vec_def0 = vect_get_vec_def_for_operand (ops[!reduc_index],
5982 stmt);
5983 vec_oprnds0.quick_push (loop_vec_def0);
5984 if (op_type == ternary_op)
5986 loop_vec_def1 = vect_get_vec_def_for_operand (op1, stmt);
5987 vec_oprnds1.quick_push (loop_vec_def1);
5991 else
5993 if (!slp_node)
5995 enum vect_def_type dt;
5996 gimple *dummy_stmt;
5998 vect_is_simple_use (ops[!reduc_index], loop_vinfo,
5999 &dummy_stmt, &dt);
6000 loop_vec_def0 = vect_get_vec_def_for_stmt_copy (dt,
6001 loop_vec_def0);
6002 vec_oprnds0[0] = loop_vec_def0;
6003 if (op_type == ternary_op)
6005 vect_is_simple_use (op1, loop_vinfo, &dummy_stmt, &dt);
6006 loop_vec_def1 = vect_get_vec_def_for_stmt_copy (dt,
6007 loop_vec_def1);
6008 vec_oprnds1[0] = loop_vec_def1;
6012 if (single_defuse_cycle)
6013 reduc_def = gimple_assign_lhs (new_stmt);
6015 STMT_VINFO_RELATED_STMT (prev_phi_info) = new_phi;
6018 FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
6020 if (slp_node)
6021 reduc_def = PHI_RESULT (phis[i]);
6022 else
6024 if (!single_defuse_cycle || j == 0)
6025 reduc_def = PHI_RESULT (new_phi);
6028 def1 = ((op_type == ternary_op)
6029 ? vec_oprnds1[i] : NULL);
6030 if (op_type == binary_op)
6032 if (reduc_index == 0)
6033 expr = build2 (code, vectype_out, reduc_def, def0);
6034 else
6035 expr = build2 (code, vectype_out, def0, reduc_def);
6037 else
6039 if (reduc_index == 0)
6040 expr = build3 (code, vectype_out, reduc_def, def0, def1);
6041 else
6043 if (reduc_index == 1)
6044 expr = build3 (code, vectype_out, def0, reduc_def, def1);
6045 else
6046 expr = build3 (code, vectype_out, def0, def1, reduc_def);
6050 new_stmt = gimple_build_assign (vec_dest, expr);
6051 new_temp = make_ssa_name (vec_dest, new_stmt);
6052 gimple_assign_set_lhs (new_stmt, new_temp);
6053 vect_finish_stmt_generation (stmt, new_stmt, gsi);
6055 if (slp_node)
6057 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
6058 vect_defs.quick_push (new_temp);
6060 else
6061 vect_defs[0] = new_temp;
6064 if (slp_node)
6065 continue;
6067 if (j == 0)
6068 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
6069 else
6070 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
6072 prev_stmt_info = vinfo_for_stmt (new_stmt);
6073 prev_phi_info = vinfo_for_stmt (new_phi);
6076 tree indx_before_incr, indx_after_incr, cond_name = NULL;
6078 /* Finalize the reduction-phi (set its arguments) and create the
6079 epilog reduction code. */
6080 if ((!single_defuse_cycle || code == COND_EXPR) && !slp_node)
6082 new_temp = gimple_assign_lhs (*vec_stmt);
6083 vect_defs[0] = new_temp;
6085 /* For cond reductions we want to create a new vector (INDEX_COND_EXPR)
6086 which is updated with the current index of the loop for every match of
6087 the original loop's cond_expr (VEC_STMT). This results in a vector
6088 containing the last time the condition passed for that vector lane.
6089 The first match will be a 1 to allow 0 to be used for non-matching
6090 indexes. If there are no matches at all then the vector will be all
6091 zeroes. */
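     /* As a small illustration (arbitrary values): for a 4-lane vector and two
        vector iterations, if the condition was last true in lane 1 during the
        first vector iteration (1-based loop index 2) and in lane 0 during the
        second (index 5), the index vector ends up as {5, 2, 0, 0}: each lane
        holds the last 1-based loop index at which its condition was true, or
        0 if it never was.  */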
6092 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION)
6094 int nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
6095 int k;
6097 gcc_assert (gimple_assign_rhs_code (*vec_stmt) == VEC_COND_EXPR);
6099 /* First we create a simple vector induction variable which starts
6100 with the values {1,2,3,...} (SERIES_VECT) and increments by the
6101 vector size (STEP). */
6103 /* Create a {1,2,3,...} vector. */
6104 tree *vtemp = XALLOCAVEC (tree, nunits_out);
6105 for (k = 0; k < nunits_out; ++k)
6106 vtemp[k] = build_int_cst (cr_index_scalar_type, k + 1);
6107 tree series_vect = build_vector (cr_index_vector_type, vtemp);
6109 /* Create a vector of the step value. */
6110 tree step = build_int_cst (cr_index_scalar_type, nunits_out);
6111 tree vec_step = build_vector_from_val (cr_index_vector_type, step);
6113 /* Create an induction variable. */
6114 gimple_stmt_iterator incr_gsi;
6115 bool insert_after;
6116 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
6117 create_iv (series_vect, vec_step, NULL_TREE, loop, &incr_gsi,
6118 insert_after, &indx_before_incr, &indx_after_incr);
6120 /* Next create a new phi node vector (NEW_PHI_TREE) which starts
6121 filled with zeros (VEC_ZERO). */
6123 /* Create a vector of 0s. */
6124 tree zero = build_zero_cst (cr_index_scalar_type);
6125 tree vec_zero = build_vector_from_val (cr_index_vector_type, zero);
6127 /* Create a vector phi node. */
6128 tree new_phi_tree = make_ssa_name (cr_index_vector_type);
6129 new_phi = create_phi_node (new_phi_tree, loop->header);
6130 set_vinfo_for_stmt (new_phi,
6131 new_stmt_vec_info (new_phi, loop_vinfo));
6132 add_phi_arg (new_phi, vec_zero, loop_preheader_edge (loop),
6133 UNKNOWN_LOCATION);
6135 /* Now take the condition from the loops original cond_expr
6136 (VEC_STMT) and produce a new cond_expr (INDEX_COND_EXPR) which for
6137 every match uses values from the induction variable
6138 (INDEX_BEFORE_INCR) otherwise uses values from the phi node
6139 (NEW_PHI_TREE).
6140 Finally, we update the phi (NEW_PHI_TREE) to take the value of
6141 the new cond_expr (INDEX_COND_EXPR). */
6143 /* Duplicate the condition from vec_stmt. */
6144 tree ccompare = unshare_expr (gimple_assign_rhs1 (*vec_stmt));
6146 /* Create a conditional, where the condition is taken from vec_stmt
6147 (CCOMPARE), then is the induction index (INDEX_BEFORE_INCR) and
6148 else is the phi (NEW_PHI_TREE). */
6149 tree index_cond_expr = build3 (VEC_COND_EXPR, cr_index_vector_type,
6150 ccompare, indx_before_incr,
6151 new_phi_tree);
6152 cond_name = make_ssa_name (cr_index_vector_type);
6153 gimple *index_condition = gimple_build_assign (cond_name,
6154 index_cond_expr);
6155 gsi_insert_before (&incr_gsi, index_condition, GSI_SAME_STMT);
6156 stmt_vec_info index_vec_info = new_stmt_vec_info (index_condition,
6157 loop_vinfo);
6158 STMT_VINFO_VECTYPE (index_vec_info) = cr_index_vector_type;
6159 set_vinfo_for_stmt (index_condition, index_vec_info);
6161 /* Update the phi with the vec cond. */
6162 add_phi_arg (new_phi, cond_name, loop_latch_edge (loop),
6163 UNKNOWN_LOCATION);
6167 vect_create_epilog_for_reduction (vect_defs, stmt, epilog_copies,
6168 epilog_reduc_code, phis, reduc_index,
6169 double_reduc, slp_node, cond_name);
6171 return true;
6174 /* Function vect_min_worthwhile_factor.
6176 For a loop where we could vectorize the operation indicated by CODE,
6177 return the minimum vectorization factor that makes it worthwhile
6178 to use generic vectors. */
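   /* For instance, per the switch below a PLUS_EXPR or MINUS_EXPR is only
      considered worthwhile with generic vectors when at least 4 scalar
      operations are combined per vector operation, bitwise operations need
      only 2, and any other code is effectively never emulated (INT_MAX).  */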
6180 vect_min_worthwhile_factor (enum tree_code code)
6182 switch (code)
6184 case PLUS_EXPR:
6185 case MINUS_EXPR:
6186 case NEGATE_EXPR:
6187 return 4;
6189 case BIT_AND_EXPR:
6190 case BIT_IOR_EXPR:
6191 case BIT_XOR_EXPR:
6192 case BIT_NOT_EXPR:
6193 return 2;
6195 default:
6196 return INT_MAX;
6201 /* Function vectorizable_induction
6203 Check if PHI performs an induction computation that can be vectorized.
6204 If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
6205 phi to replace it, put it in VEC_STMT, and add it to the same basic block.
6206 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
6208 bool
6209 vectorizable_induction (gimple *phi,
6210 gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
6211 gimple **vec_stmt)
6213 stmt_vec_info stmt_info = vinfo_for_stmt (phi);
6214 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6215 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
6216 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6217 int nunits = TYPE_VECTOR_SUBPARTS (vectype);
6218 int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
6219 tree vec_def;
6221 gcc_assert (ncopies >= 1);
6222 /* FORNOW. These restrictions should be relaxed. */
6223 if (nested_in_vect_loop_p (loop, phi))
6225 imm_use_iterator imm_iter;
6226 use_operand_p use_p;
6227 gimple *exit_phi;
6228 edge latch_e;
6229 tree loop_arg;
6231 if (ncopies > 1)
6233 if (dump_enabled_p ())
6234 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6235 "multiple types in nested loop.\n");
6236 return false;
6239 exit_phi = NULL;
6240 latch_e = loop_latch_edge (loop->inner);
6241 loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
6242 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
6244 gimple *use_stmt = USE_STMT (use_p);
6245 if (is_gimple_debug (use_stmt))
6246 continue;
6248 if (!flow_bb_inside_loop_p (loop->inner, gimple_bb (use_stmt)))
6250 exit_phi = use_stmt;
6251 break;
6254 if (exit_phi)
6256 stmt_vec_info exit_phi_vinfo = vinfo_for_stmt (exit_phi);
6257 if (!(STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
6258 && !STMT_VINFO_LIVE_P (exit_phi_vinfo)))
6260 if (dump_enabled_p ())
6261 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6262 "inner-loop induction only used outside "
6263 "of the outer vectorized loop.\n");
6264 return false;
6269 if (!STMT_VINFO_RELEVANT_P (stmt_info))
6270 return false;
6272 /* FORNOW: SLP not supported. */
6273 if (STMT_SLP_TYPE (stmt_info))
6274 return false;
6276 gcc_assert (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def);
6278 if (gimple_code (phi) != GIMPLE_PHI)
6279 return false;
6281 if (!vec_stmt) /* transformation not required. */
6283 STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
6284 if (dump_enabled_p ())
6285 dump_printf_loc (MSG_NOTE, vect_location,
6286 "=== vectorizable_induction ===\n");
6287 vect_model_induction_cost (stmt_info, ncopies);
6288 return true;
6291 /** Transform. **/
6293 if (dump_enabled_p ())
6294 dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n");
6296 vec_def = get_initial_def_for_induction (phi);
6297 *vec_stmt = SSA_NAME_DEF_STMT (vec_def);
6298 return true;
6301 /* Function vectorizable_live_operation.
6303 STMT computes a value that is used outside the loop. Check if
6304 it can be supported. */
6306 bool
6307 vectorizable_live_operation (gimple *stmt,
6308 gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
6309 slp_tree slp_node, int slp_index,
6310 gimple **vec_stmt)
6312 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
6313 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
6314 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6315 imm_use_iterator imm_iter;
6316 tree lhs, lhs_type, bitsize, vec_bitsize;
6317 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6318 int nunits = TYPE_VECTOR_SUBPARTS (vectype);
6319 int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
6320 gimple *use_stmt;
6321 auto_vec<tree> vec_oprnds;
6323 gcc_assert (STMT_VINFO_LIVE_P (stmt_info));
6325 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def)
6326 return false;
6328 /* FORNOW. CHECKME. */
6329 if (nested_in_vect_loop_p (loop, stmt))
6330 return false;
6332 /* If STMT is not relevant and it is a simple assignment and its inputs are
6333 invariant then it can remain in place, unvectorized. The original last
6334 scalar value that it computes will be used. */
6335 if (!STMT_VINFO_RELEVANT_P (stmt_info))
6337 gcc_assert (is_simple_and_all_uses_invariant (stmt, loop_vinfo));
6338 if (dump_enabled_p ())
6339 dump_printf_loc (MSG_NOTE, vect_location,
6340 "statement is simple and uses invariant. Leaving in "
6341 "place.\n");
6342 return true;
6345 if (!vec_stmt)
6346 /* No transformation required. */
6347 return true;
6349 /* If stmt has a related stmt, then use that for getting the lhs. */
6350 if (is_pattern_stmt_p (stmt_info))
6351 stmt = STMT_VINFO_RELATED_STMT (stmt_info);
6353 lhs = (is_a <gphi *> (stmt)) ? gimple_phi_result (stmt)
6354 : gimple_get_lhs (stmt);
6355 lhs_type = TREE_TYPE (lhs);
6357 /* Find all uses of STMT outside the loop - there should be at least one. */
6358 auto_vec<gimple *, 4> worklist;
6359 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
6360 if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt))
6361 && !is_gimple_debug (use_stmt))
6362 worklist.safe_push (use_stmt);
6363 gcc_assert (worklist.length () >= 1);
6365 bitsize = TYPE_SIZE (TREE_TYPE (vectype));
6366 vec_bitsize = TYPE_SIZE (vectype);
6368 /* Get the vectorized lhs of STMT and the lane to use (counted in bits). */
6369 tree vec_lhs, bitstart;
6370 if (slp_node)
6372 gcc_assert (slp_index >= 0);
6374 int num_scalar = SLP_TREE_SCALAR_STMTS (slp_node).length ();
6375 int num_vec = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
6377 /* Get the last occurrence of the scalar index from the concatenation of
6378 all the slp vectors. Calculate which slp vector it is and the index
6379 within. */
6380 int pos = (num_vec * nunits) - num_scalar + slp_index;
6381 int vec_entry = pos / nunits;
6382 int vec_index = pos % nunits;
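      /* E.g. (illustrative numbers) with num_vec = 2, nunits = 4,
         num_scalar = 6 and slp_index = 5: pos = 2*4 - 6 + 5 = 7, so the last
         occurrence is lane 3 of SLP vector 1 (both counted from 0).  */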
6384 /* Get the correct slp vectorized stmt. */
6385 vec_lhs = gimple_get_lhs (SLP_TREE_VEC_STMTS (slp_node)[vec_entry]);
6387 /* Get entry to use. */
6388 bitstart = build_int_cst (unsigned_type_node, vec_index);
6389 bitstart = int_const_binop (MULT_EXPR, bitsize, bitstart);
6391 else
6393 enum vect_def_type dt = STMT_VINFO_DEF_TYPE (stmt_info);
6394 vec_lhs = vect_get_vec_def_for_operand_1 (stmt, dt);
6396 /* For multiple copies, get the last copy. */
6397 for (int i = 1; i < ncopies; ++i)
6398 vec_lhs = vect_get_vec_def_for_stmt_copy (vect_unknown_def_type,
6399 vec_lhs);
6401 /* Get the last lane in the vector. */
6402 bitstart = int_const_binop (MINUS_EXPR, vec_bitsize, bitsize);
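      /* E.g. for a V4SI vector BITSIZE is 32 and VEC_BITSIZE is 128, so the
         last lane starts at bit 96.  */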
6405 /* Create a new vectorized stmt for the uses of STMT and insert outside the
6406 loop. */
6407 gimple_seq stmts = NULL;
6408 tree new_tree = build3 (BIT_FIELD_REF, TREE_TYPE (vectype), vec_lhs, bitsize,
6409 bitstart);
6410 new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree), &stmts,
6411 true, NULL_TREE);
6412 if (stmts)
6413 gsi_insert_seq_on_edge_immediate (single_exit (loop), stmts);
6415 /* Replace all uses of the USE_STMT in the worklist with the newly inserted
6416 statement. */
6417 while (!worklist.is_empty ())
6419 use_stmt = worklist.pop ();
6420 replace_uses_by (gimple_phi_result (use_stmt), new_tree);
6421 update_stmt (use_stmt);
6424 return true;
6427 /* Kill any debug uses outside LOOP of SSA names defined in STMT. */
6429 static void
6430 vect_loop_kill_debug_uses (struct loop *loop, gimple *stmt)
6432 ssa_op_iter op_iter;
6433 imm_use_iterator imm_iter;
6434 def_operand_p def_p;
6435 gimple *ustmt;
6437 FOR_EACH_PHI_OR_STMT_DEF (def_p, stmt, op_iter, SSA_OP_DEF)
6439 FOR_EACH_IMM_USE_STMT (ustmt, imm_iter, DEF_FROM_PTR (def_p))
6441 basic_block bb;
6443 if (!is_gimple_debug (ustmt))
6444 continue;
6446 bb = gimple_bb (ustmt);
6448 if (!flow_bb_inside_loop_p (loop, bb))
6450 if (gimple_debug_bind_p (ustmt))
6452 if (dump_enabled_p ())
6453 dump_printf_loc (MSG_NOTE, vect_location,
6454 "killing debug use\n");
6456 gimple_debug_bind_reset_value (ustmt);
6457 update_stmt (ustmt);
6459 else
6460 gcc_unreachable ();
6467 /* This function builds ni_name = number of iterations. Statements
6468 are emitted on the loop preheader edge. */
6470 static tree
6471 vect_build_loop_niters (loop_vec_info loop_vinfo)
6473 tree ni = unshare_expr (LOOP_VINFO_NITERS (loop_vinfo));
6474 if (TREE_CODE (ni) == INTEGER_CST)
6475 return ni;
6476 else
6478 tree ni_name, var;
6479 gimple_seq stmts = NULL;
6480 edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
6482 var = create_tmp_var (TREE_TYPE (ni), "niters");
6483 ni_name = force_gimple_operand (ni, &stmts, false, var);
6484 if (stmts)
6485 gsi_insert_seq_on_edge_immediate (pe, stmts);
6487 return ni_name;
6492 /* This function generates the following statements:
6494 ni_name = number of iterations loop executes
6495 ratio = ni_name / vf
6496 ratio_mult_vf_name = ratio * vf
6498 and places them on the loop preheader edge. */
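   /* For instance (illustrative numbers): with ni_name = 1003 and vf = 4
      this yields ratio = 250 and ratio_mult_vf_name = 1000, so the
      vectorized loop runs 250 iterations and the scalar epilogue covers the
      remaining 3.  */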
6500 static void
6501 vect_generate_tmps_on_preheader (loop_vec_info loop_vinfo,
6502 tree ni_name,
6503 tree *ratio_mult_vf_name_ptr,
6504 tree *ratio_name_ptr)
6506 tree ni_minus_gap_name;
6507 tree var;
6508 tree ratio_name;
6509 tree ratio_mult_vf_name;
6510 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
6511 edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
6512 tree log_vf;
6514 log_vf = build_int_cst (TREE_TYPE (ni_name), exact_log2 (vf));
6516 /* If epilogue loop is required because of data accesses with gaps, we
6517 subtract one iteration from the total number of iterations here for
6518 correct calculation of RATIO. */
6519 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
6521 ni_minus_gap_name = fold_build2 (MINUS_EXPR, TREE_TYPE (ni_name),
6522 ni_name,
6523 build_one_cst (TREE_TYPE (ni_name)));
6524 if (!is_gimple_val (ni_minus_gap_name))
6526 var = create_tmp_var (TREE_TYPE (ni_name), "ni_gap");
6527 gimple *stmts = NULL;
6528 ni_minus_gap_name = force_gimple_operand (ni_minus_gap_name, &stmts,
6529 true, var);
6530 gsi_insert_seq_on_edge_immediate (pe, stmts);
6533 else
6534 ni_minus_gap_name = ni_name;
6536 /* Create: ratio = ni >> log2(vf) */
6537 /* ??? As we have ni == number of latch executions + 1, ni could
6538 have overflown to zero. So avoid computing ratio based on ni
6539 but compute it using the fact that we know ratio will be at least
6540 one, thus via (ni - vf) >> log2(vf) + 1. */
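  /* Illustration: for vf = 4, ni = 10 gives ((10 - 4) >> 2) + 1 = 2, the
     same as 10 >> 2; and if ni wrapped around to 0 in a 32-bit type,
     ((0 - 4) >> 2) + 1 = 0x40000000 = 2^32 / 4, which is still correct.  */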
6541 ratio_name
6542 = fold_build2 (PLUS_EXPR, TREE_TYPE (ni_name),
6543 fold_build2 (RSHIFT_EXPR, TREE_TYPE (ni_name),
6544 fold_build2 (MINUS_EXPR, TREE_TYPE (ni_name),
6545 ni_minus_gap_name,
6546 build_int_cst
6547 (TREE_TYPE (ni_name), vf)),
6548 log_vf),
6549 build_int_cst (TREE_TYPE (ni_name), 1));
6550 if (!is_gimple_val (ratio_name))
6552 var = create_tmp_var (TREE_TYPE (ni_name), "bnd");
6553 gimple *stmts = NULL;
6554 ratio_name = force_gimple_operand (ratio_name, &stmts, true, var);
6555 gsi_insert_seq_on_edge_immediate (pe, stmts);
6557 *ratio_name_ptr = ratio_name;
6559 /* Create: ratio_mult_vf = ratio << log2 (vf). */
6561 if (ratio_mult_vf_name_ptr)
6563 ratio_mult_vf_name = fold_build2 (LSHIFT_EXPR, TREE_TYPE (ratio_name),
6564 ratio_name, log_vf);
6565 if (!is_gimple_val (ratio_mult_vf_name))
6567 var = create_tmp_var (TREE_TYPE (ni_name), "ratio_mult_vf");
6568 gimple *stmts = NULL;
6569 ratio_mult_vf_name = force_gimple_operand (ratio_mult_vf_name, &stmts,
6570 true, var);
6571 gsi_insert_seq_on_edge_immediate (pe, stmts);
6573 *ratio_mult_vf_name_ptr = ratio_mult_vf_name;
6576 return;
6580 /* Function vect_transform_loop.
6582 The analysis phase has determined that the loop is vectorizable.
6583 Vectorize the loop - created vectorized stmts to replace the scalar
6584 stmts in the loop, and update the loop exit condition. */
6586 void
6587 vect_transform_loop (loop_vec_info loop_vinfo)
6589 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6590 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
6591 int nbbs = loop->num_nodes;
6592 int i;
6593 tree ratio = NULL;
6594 int vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
6595 bool grouped_store;
6596 bool slp_scheduled = false;
6597 gimple *stmt, *pattern_stmt;
6598 gimple_seq pattern_def_seq = NULL;
6599 gimple_stmt_iterator pattern_def_si = gsi_none ();
6600 bool transform_pattern_stmt = false;
6601 bool check_profitability = false;
6602 int th;
6603 /* Record number of iterations before we started tampering with the profile. */
6604 gcov_type expected_iterations = expected_loop_iterations_unbounded (loop);
6606 if (dump_enabled_p ())
6607 dump_printf_loc (MSG_NOTE, vect_location, "=== vec_transform_loop ===\n");
6609    /* If the profile is imprecise, we have a chance to fix it up.  */
6610 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
6611 expected_iterations = LOOP_VINFO_INT_NITERS (loop_vinfo);
6613 /* Use the more conservative vectorization threshold. If the number
6614 of iterations is constant assume the cost check has been performed
6615 by our caller. If the threshold makes all loops profitable that
6616 run at least the vectorization factor number of times checking
6617 is pointless, too. */
6618 th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
6619 if (th >= LOOP_VINFO_VECT_FACTOR (loop_vinfo) - 1
6620 && !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
6622 if (dump_enabled_p ())
6623 dump_printf_loc (MSG_NOTE, vect_location,
6624 "Profitability threshold is %d loop iterations.\n",
6625 th);
6626 check_profitability = true;
6629 /* Version the loop first, if required, so the profitability check
6630 comes first. */
6632 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
6633 || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
6635 vect_loop_versioning (loop_vinfo, th, check_profitability);
6636 check_profitability = false;
6639 tree ni_name = vect_build_loop_niters (loop_vinfo);
6640 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = ni_name;
6642 /* Peel the loop if there are data refs with unknown alignment.
6643       Only one store data ref with unknown alignment is allowed.  */
6645 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo))
6647 vect_do_peeling_for_alignment (loop_vinfo, ni_name,
6648 th, check_profitability);
6649 check_profitability = false;
6650 /* The above adjusts LOOP_VINFO_NITERS, so cause ni_name to
6651 be re-computed. */
6652 ni_name = NULL_TREE;
6655 /* If the loop has a symbolic number of iterations 'n' (i.e. it's not a
6656 compile time constant), or it is a constant that doesn't divide by the
6657 vectorization factor, then an epilog loop needs to be created.
6658 We therefore duplicate the loop: the original loop will be vectorized,
6659 and will compute the first (n/VF) iterations. The second copy of the loop
6660 will remain scalar and will compute the remaining (n%VF) iterations.
6661 (VF is the vectorization factor). */
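  /* Conceptually (scalar sketch of the resulting loop structure):

       for (i = 0; i < (n / VF) * VF; i += VF)   <-- vectorized copy
         ...loop body...
       for (; i < n; i++)                        <-- scalar epilog copy
         ...loop body...  */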
6663 if (LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
6664 || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
6666 tree ratio_mult_vf;
6667 if (!ni_name)
6668 ni_name = vect_build_loop_niters (loop_vinfo);
6669 vect_generate_tmps_on_preheader (loop_vinfo, ni_name, &ratio_mult_vf,
6670 &ratio);
6671 vect_do_peeling_for_loop_bound (loop_vinfo, ni_name, ratio_mult_vf,
6672 th, check_profitability);
6674 else if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
6675 ratio = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
6676 LOOP_VINFO_INT_NITERS (loop_vinfo) / vectorization_factor);
6677 else
6679 if (!ni_name)
6680 ni_name = vect_build_loop_niters (loop_vinfo);
6681 vect_generate_tmps_on_preheader (loop_vinfo, ni_name, NULL, &ratio);
6684 /* 1) Make sure the loop header has exactly two entries
6685 2) Make sure we have a preheader basic block. */
6687 gcc_assert (EDGE_COUNT (loop->header->preds) == 2);
6689 split_edge (loop_preheader_edge (loop));
6691    /* FORNOW: the vectorizer supports only loops whose body consists
6692       of one basic block (header + empty latch).  When the vectorizer
6693       supports more involved loop forms, the order in which the BBs are
6694       traversed will need to be reconsidered.  */
6696 for (i = 0; i < nbbs; i++)
6698 basic_block bb = bbs[i];
6699 stmt_vec_info stmt_info;
6701 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
6702 gsi_next (&si))
6704 gphi *phi = si.phi ();
6705 if (dump_enabled_p ())
6707 dump_printf_loc (MSG_NOTE, vect_location,
6708 "------>vectorizing phi: ");
6709 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
6711 stmt_info = vinfo_for_stmt (phi);
6712 if (!stmt_info)
6713 continue;
6715 if (MAY_HAVE_DEBUG_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
6716 vect_loop_kill_debug_uses (loop, phi);
6718 if (!STMT_VINFO_RELEVANT_P (stmt_info)
6719 && !STMT_VINFO_LIVE_P (stmt_info))
6720 continue;
6722 if (STMT_VINFO_VECTYPE (stmt_info)
6723 && (TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info))
6724 != (unsigned HOST_WIDE_INT) vectorization_factor)
6725 && dump_enabled_p ())
6726 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
6728 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def)
6730 if (dump_enabled_p ())
6731 dump_printf_loc (MSG_NOTE, vect_location, "transform phi.\n");
6732 vect_transform_stmt (phi, NULL, NULL, NULL, NULL);
6736 pattern_stmt = NULL;
6737 for (gimple_stmt_iterator si = gsi_start_bb (bb);
6738 !gsi_end_p (si) || transform_pattern_stmt;)
6740 bool is_store;
6742 if (transform_pattern_stmt)
6743 stmt = pattern_stmt;
6744 else
6746 stmt = gsi_stmt (si);
6747 /* During vectorization remove existing clobber stmts. */
6748 if (gimple_clobber_p (stmt))
6750 unlink_stmt_vdef (stmt);
6751 gsi_remove (&si, true);
6752 release_defs (stmt);
6753 continue;
6757 if (dump_enabled_p ())
6759 dump_printf_loc (MSG_NOTE, vect_location,
6760 "------>vectorizing statement: ");
6761 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
6764 stmt_info = vinfo_for_stmt (stmt);
6766 /* vector stmts created in the outer-loop during vectorization of
6767 stmts in an inner-loop may not have a stmt_info, and do not
6768 need to be vectorized. */
6769 if (!stmt_info)
6771 gsi_next (&si);
6772 continue;
6775 if (MAY_HAVE_DEBUG_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
6776 vect_loop_kill_debug_uses (loop, stmt);
6778 if (!STMT_VINFO_RELEVANT_P (stmt_info)
6779 && !STMT_VINFO_LIVE_P (stmt_info))
6781 if (STMT_VINFO_IN_PATTERN_P (stmt_info)
6782 && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
6783 && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
6784 || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
6786 stmt = pattern_stmt;
6787 stmt_info = vinfo_for_stmt (stmt);
6789 else
6791 gsi_next (&si);
6792 continue;
6795 else if (STMT_VINFO_IN_PATTERN_P (stmt_info)
6796 && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
6797 && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
6798 || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
6799 transform_pattern_stmt = true;
6801 /* If pattern statement has def stmts, vectorize them too. */
6802 if (is_pattern_stmt_p (stmt_info))
6804 if (pattern_def_seq == NULL)
6806 pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
6807 pattern_def_si = gsi_start (pattern_def_seq);
6809 else if (!gsi_end_p (pattern_def_si))
6810 gsi_next (&pattern_def_si);
6811 if (pattern_def_seq != NULL)
6813 gimple *pattern_def_stmt = NULL;
6814 stmt_vec_info pattern_def_stmt_info = NULL;
6816 while (!gsi_end_p (pattern_def_si))
6818 pattern_def_stmt = gsi_stmt (pattern_def_si);
6819 pattern_def_stmt_info
6820 = vinfo_for_stmt (pattern_def_stmt);
6821 if (STMT_VINFO_RELEVANT_P (pattern_def_stmt_info)
6822 || STMT_VINFO_LIVE_P (pattern_def_stmt_info))
6823 break;
6824 gsi_next (&pattern_def_si);
6827 if (!gsi_end_p (pattern_def_si))
6829 if (dump_enabled_p ())
6831 dump_printf_loc (MSG_NOTE, vect_location,
6832 "==> vectorizing pattern def "
6833 "stmt: ");
6834 dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
6835 pattern_def_stmt, 0);
6838 stmt = pattern_def_stmt;
6839 stmt_info = pattern_def_stmt_info;
6841 else
6843 pattern_def_si = gsi_none ();
6844 transform_pattern_stmt = false;
6847 else
6848 transform_pattern_stmt = false;
6851 if (STMT_VINFO_VECTYPE (stmt_info))
6853 unsigned int nunits
6854 = (unsigned int)
6855 TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
6856 if (!STMT_SLP_TYPE (stmt_info)
6857 && nunits != (unsigned int) vectorization_factor
6858 && dump_enabled_p ())
6859 /* For SLP VF is set according to unrolling factor, and not
6860 to vector size, hence for SLP this print is not valid. */
6861 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
6864 /* SLP. Schedule all the SLP instances when the first SLP stmt is
6865 reached. */
6866 if (STMT_SLP_TYPE (stmt_info))
6868 if (!slp_scheduled)
6870 slp_scheduled = true;
6872 if (dump_enabled_p ())
6873 dump_printf_loc (MSG_NOTE, vect_location,
6874 "=== scheduling SLP instances ===\n");
6876 vect_schedule_slp (loop_vinfo);
6879 /* Hybrid SLP stmts must be vectorized in addition to SLP. */
6880 if (!vinfo_for_stmt (stmt) || PURE_SLP_STMT (stmt_info))
6882 if (!transform_pattern_stmt && gsi_end_p (pattern_def_si))
6884 pattern_def_seq = NULL;
6885 gsi_next (&si);
6887 continue;
6891 /* -------- vectorize statement ------------ */
6892 if (dump_enabled_p ())
6893 dump_printf_loc (MSG_NOTE, vect_location, "transform statement.\n");
6895 grouped_store = false;
6896 is_store = vect_transform_stmt (stmt, &si, &grouped_store, NULL, NULL);
6897 if (is_store)
6899 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
6901 /* Interleaving. If IS_STORE is TRUE, the vectorization of the
6902 interleaving chain was completed - free all the stores in
6903 the chain. */
6904 gsi_next (&si);
6905 vect_remove_stores (GROUP_FIRST_ELEMENT (stmt_info));
6907 else
6909 /* Free the attached stmt_vec_info and remove the stmt. */
6910 gimple *store = gsi_stmt (si);
6911 free_stmt_vec_info (store);
6912 unlink_stmt_vdef (store);
6913 gsi_remove (&si, true);
6914 release_defs (store);
6917 /* Stores can only appear at the end of pattern statements. */
6918 gcc_assert (!transform_pattern_stmt);
6919 pattern_def_seq = NULL;
6921 else if (!transform_pattern_stmt && gsi_end_p (pattern_def_si))
6923 pattern_def_seq = NULL;
6924 gsi_next (&si);
6926 } /* stmts in BB */
6927 } /* BBs in loop */
6929 slpeel_make_loop_iterate_ntimes (loop, ratio);
6931 /* Reduce loop iterations by the vectorization factor. */
6932 scale_loop_profile (loop, GCOV_COMPUTE_SCALE (1, vectorization_factor),
6933 expected_iterations / vectorization_factor);
6934 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
6936 if (loop->nb_iterations_upper_bound != 0)
6937 loop->nb_iterations_upper_bound = loop->nb_iterations_upper_bound - 1;
6938 if (loop->nb_iterations_likely_upper_bound != 0)
6939 loop->nb_iterations_likely_upper_bound
6940 = loop->nb_iterations_likely_upper_bound - 1;
6942 loop->nb_iterations_upper_bound
6943 = wi::udiv_floor (loop->nb_iterations_upper_bound + 1,
6944 vectorization_factor) - 1;
6945 loop->nb_iterations_likely_upper_bound
6946 = wi::udiv_floor (loop->nb_iterations_likely_upper_bound + 1,
6947 vectorization_factor) - 1;
6949 if (loop->any_estimate)
6951 loop->nb_iterations_estimate
6952 = wi::udiv_floor (loop->nb_iterations_estimate, vectorization_factor);
6953 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
6954 && loop->nb_iterations_estimate != 0)
6955 loop->nb_iterations_estimate = loop->nb_iterations_estimate - 1;
6958 if (dump_enabled_p ())
6960 dump_printf_loc (MSG_NOTE, vect_location,
6961 "LOOP VECTORIZED\n");
6962 if (loop->inner)
6963 dump_printf_loc (MSG_NOTE, vect_location,
6964 "OUTER LOOP VECTORIZED\n");
6965 dump_printf (MSG_NOTE, "\n");
6968 /* Free SLP instances here because otherwise stmt reference counting
6969 won't work. */
6970 slp_instance instance;
6971 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
6972 vect_free_slp_instance (instance);
6973 LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
6974    /* Clear the safelen field since its value is invalid after vectorization:
6975       the vectorized loop can have loop-carried dependencies.  */
6976 loop->safelen = 0;
6979 /* The code below performs a simple optimization - reverting
6980    if-conversion for masked stores: if the mask of a store is zero, the
6981    store is not performed, nor are its stored-value producers if possible.
6982 For example,
6983 for (i=0; i<n; i++)
6984 if (c[i])
6986 p1[i] += 1;
6987 p2[i] = p3[i] +2;
6989 this transformation will produce the following semi-hammock:
6991 if (!mask__ifc__42.18_165 == { 0, 0, 0, 0, 0, 0, 0, 0 })
6993 vect__11.19_170 = MASK_LOAD (vectp_p1.20_168, 0B, mask__ifc__42.18_165);
6994 vect__12.22_172 = vect__11.19_170 + vect_cst__171;
6995 MASK_STORE (vectp_p1.23_175, 0B, mask__ifc__42.18_165, vect__12.22_172);
6996 vect__18.25_182 = MASK_LOAD (vectp_p3.26_180, 0B, mask__ifc__42.18_165);
6997 vect__19.28_184 = vect__18.25_182 + vect_cst__183;
6998 MASK_STORE (vectp_p2.29_187, 0B, mask__ifc__42.18_165, vect__19.28_184);
7002 void
7003 optimize_mask_stores (struct loop *loop)
7005 basic_block *bbs = get_loop_body (loop);
7006 unsigned nbbs = loop->num_nodes;
7007 unsigned i;
7008 basic_block bb;
7009 gimple_stmt_iterator gsi;
7010 gimple *stmt;
7011 auto_vec<gimple *> worklist;
7013 vect_location = find_loop_location (loop);
7014 /* Pick up all masked stores in loop if any. */
7015 for (i = 0; i < nbbs; i++)
7017 bb = bbs[i];
7018 for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
7019 gsi_next (&gsi))
7021 stmt = gsi_stmt (gsi);
7022 if (is_gimple_call (stmt)
7023 && gimple_call_internal_p (stmt)
7024 && gimple_call_internal_fn (stmt) == IFN_MASK_STORE)
7025 worklist.safe_push (stmt);
7029 free (bbs);
7030 if (worklist.is_empty ())
7031 return;
7033 /* Loop has masked stores. */
7034 while (!worklist.is_empty ())
7036 gimple *last, *last_store;
7037 edge e, efalse;
7038 tree mask;
7039 basic_block store_bb, join_bb;
7040 gimple_stmt_iterator gsi_to;
7041 tree vdef, new_vdef;
7042 gphi *phi;
7043 tree vectype;
7044 tree zero;
7046 last = worklist.pop ();
7047 mask = gimple_call_arg (last, 2);
7048 bb = gimple_bb (last);
7049 /* Create new bb. */
7050 e = split_block (bb, last);
7051 join_bb = e->dest;
7052 store_bb = create_empty_bb (bb);
7053 add_bb_to_loop (store_bb, loop);
7054 e->flags = EDGE_TRUE_VALUE;
7055 efalse = make_edge (bb, store_bb, EDGE_FALSE_VALUE);
7056 /* Put STORE_BB to likely part. */
7057 efalse->probability = PROB_UNLIKELY;
7058 store_bb->frequency = PROB_ALWAYS - EDGE_FREQUENCY (efalse);
7059 make_edge (store_bb, join_bb, EDGE_FALLTHRU);
7060 if (dom_info_available_p (CDI_DOMINATORS))
7061 set_immediate_dominator (CDI_DOMINATORS, store_bb, bb);
7062 if (dump_enabled_p ())
7063 dump_printf_loc (MSG_NOTE, vect_location,
7064 "Create new block %d to sink mask stores.",
7065 store_bb->index);
7066 /* Create vector comparison with boolean result. */
7067 vectype = TREE_TYPE (mask);
7068 zero = build_zero_cst (vectype);
7069 stmt = gimple_build_cond (EQ_EXPR, mask, zero, NULL_TREE, NULL_TREE);
7070 gsi = gsi_last_bb (bb);
7071 gsi_insert_after (&gsi, stmt, GSI_SAME_STMT);
7072 /* Create new PHI node for vdef of the last masked store:
7073 .MEM_2 = VDEF <.MEM_1>
7074 will be converted to
7075 .MEM.3 = VDEF <.MEM_1>
7076 and new PHI node will be created in join bb
7077 .MEM_2 = PHI <.MEM_1, .MEM_3>
7079 vdef = gimple_vdef (last);
7080 new_vdef = make_ssa_name (gimple_vop (cfun), last);
7081 gimple_set_vdef (last, new_vdef);
7082 phi = create_phi_node (vdef, join_bb);
7083 add_phi_arg (phi, new_vdef, EDGE_SUCC (store_bb, 0), UNKNOWN_LOCATION);
7085 /* Put all masked stores with the same mask to STORE_BB if possible. */
7086 while (true)
7088 gimple_stmt_iterator gsi_from;
7089 gimple *stmt1 = NULL;
7091 /* Move masked store to STORE_BB. */
7092 last_store = last;
7093 gsi = gsi_for_stmt (last);
7094 gsi_from = gsi;
7095 /* Shift GSI to the previous stmt for further traversal. */
7096 gsi_prev (&gsi);
7097 gsi_to = gsi_start_bb (store_bb);
7098 gsi_move_before (&gsi_from, &gsi_to);
7099 /* Setup GSI_TO to the non-empty block start. */
7100 gsi_to = gsi_start_bb (store_bb);
7101 if (dump_enabled_p ())
7103 dump_printf_loc (MSG_NOTE, vect_location,
7104 "Move stmt to created bb\n");
7105 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, last, 0);
7107 /* Move all stored value producers if possible. */
7108 while (!gsi_end_p (gsi))
7110 tree lhs;
7111 imm_use_iterator imm_iter;
7112 use_operand_p use_p;
7113 bool res;
7115 /* Skip debug statements. */
7116 if (is_gimple_debug (gsi_stmt (gsi)))
7118 gsi_prev (&gsi);
7119 continue;
7121 stmt1 = gsi_stmt (gsi);
7122 /* Do not consider statements writing to memory or having
7123 volatile operand. */
7124 if (gimple_vdef (stmt1)
7125 || gimple_has_volatile_ops (stmt1))
7126 break;
7127 gsi_from = gsi;
7128 gsi_prev (&gsi);
7129 lhs = gimple_get_lhs (stmt1);
7130 if (!lhs)
7131 break;
7133 /* LHS of vectorized stmt must be SSA_NAME. */
7134 if (TREE_CODE (lhs) != SSA_NAME)
7135 break;
7137 if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
7139 /* Remove dead scalar statement. */
7140 if (has_zero_uses (lhs))
7142 gsi_remove (&gsi_from, true);
7143 continue;
7147 /* Check that LHS does not have uses outside of STORE_BB. */
7148 res = true;
7149 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
7151 gimple *use_stmt;
7152 use_stmt = USE_STMT (use_p);
7153 if (is_gimple_debug (use_stmt))
7154 continue;
7155 if (gimple_bb (use_stmt) != store_bb)
7157 res = false;
7158 break;
7161 if (!res)
7162 break;
7164 if (gimple_vuse (stmt1)
7165 && gimple_vuse (stmt1) != gimple_vuse (last_store))
7166 break;
7168 /* Can move STMT1 to STORE_BB. */
7169 if (dump_enabled_p ())
7171 dump_printf_loc (MSG_NOTE, vect_location,
7172 "Move stmt to created bb\n");
7173 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt1, 0);
7175 gsi_move_before (&gsi_from, &gsi_to);
7176 /* Shift GSI_TO for further insertion. */
7177 gsi_prev (&gsi_to);
7179 /* Put other masked stores with the same mask to STORE_BB. */
7180 if (worklist.is_empty ()
7181 || gimple_call_arg (worklist.last (), 2) != mask
7182 || worklist.last () != stmt1)
7183 break;
7184 last = worklist.pop ();
7186 add_phi_arg (phi, gimple_vuse (last_store), e, UNKNOWN_LOCATION);