builtins.def: (_Float<N> and _Float<N>X BUILT_IN_CEIL): Add _Float<N> and _Float...
[official-gcc.git] / gcc / tree-vect-loop.c
blob810fa5f3ce9f225d2e22b4372e39c4f4eff5a68f
1 /* Loop Vectorization
2 Copyright (C) 2003-2017 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com> and
4 Ira Rosen <irar@il.ibm.com>
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "rtl.h"
28 #include "tree.h"
29 #include "gimple.h"
30 #include "cfghooks.h"
31 #include "tree-pass.h"
32 #include "ssa.h"
33 #include "optabs-tree.h"
34 #include "diagnostic-core.h"
35 #include "fold-const.h"
36 #include "stor-layout.h"
37 #include "cfganal.h"
38 #include "gimplify.h"
39 #include "gimple-iterator.h"
40 #include "gimplify-me.h"
41 #include "tree-ssa-loop-ivopts.h"
42 #include "tree-ssa-loop-manip.h"
43 #include "tree-ssa-loop-niter.h"
44 #include "tree-ssa-loop.h"
45 #include "cfgloop.h"
46 #include "params.h"
47 #include "tree-scalar-evolution.h"
48 #include "tree-vectorizer.h"
49 #include "gimple-fold.h"
50 #include "cgraph.h"
51 #include "tree-cfg.h"
52 #include "tree-if-conv.h"
53 #include "internal-fn.h"
54 #include "tree-vector-builder.h"
56 /* Loop Vectorization Pass.
58 This pass tries to vectorize loops.
60 For example, the vectorizer transforms the following simple loop:
62 short a[N]; short b[N]; short c[N]; int i;
64 for (i=0; i<N; i++){
65 a[i] = b[i] + c[i];
68 as if it was manually vectorized by rewriting the source code into:
70 typedef int __attribute__((mode(V8HI))) v8hi;
71 short a[N]; short b[N]; short c[N]; int i;
72 v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
73 v8hi va, vb, vc;
75 for (i=0; i<N/8; i++){
76 vb = pb[i];
77 vc = pc[i];
78 va = vb + vc;
79 pa[i] = va;
82 The main entry to this pass is vectorize_loops(), in which
83 the vectorizer applies a set of analyses on a given set of loops,
84 followed by the actual vectorization transformation for the loops that
85 had successfully passed the analysis phase.
86 Throughout this pass we make a distinction between two types of
87 data: scalars (which are represented by SSA_NAMES), and memory references
88 ("data-refs"). These two types of data require different handling both
89 during analysis and transformation. The types of data-refs that the
90 vectorizer currently supports are ARRAY_REFS which base is an array DECL
91 (not a pointer), and INDIRECT_REFS through pointers; both array and pointer
92 accesses are required to have a simple (consecutive) access pattern.
94 Analysis phase:
95 ===============
96 The driver for the analysis phase is vect_analyze_loop().
97 It applies a set of analyses, some of which rely on the scalar evolution
98 analyzer (scev) developed by Sebastian Pop.
100 During the analysis phase the vectorizer records some information
101 per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
102 loop, as well as general information about the loop as a whole, which is
103 recorded in a "loop_vec_info" struct attached to each loop.
105 Transformation phase:
106 =====================
107 The loop transformation phase scans all the stmts in the loop, and
108 creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
109 the loop that needs to be vectorized. It inserts the vector code sequence
110 just before the scalar stmt S, and records a pointer to the vector code
111 in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
112 attached to S). This pointer will be used for the vectorization of following
113 stmts which use the def of stmt S. Stmt S is removed if it writes to memory;
114 otherwise, we rely on dead code elimination for removing it.
116 For example, say stmt S1 was vectorized into stmt VS1:
118 VS1: vb = px[i];
119 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
120 S2: a = b;
122 To vectorize stmt S2, the vectorizer first finds the stmt that defines
123 the operand 'b' (S1), and gets the relevant vector def 'vb' from the
124 vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)). The
125 resulting sequence would be:
127 VS1: vb = px[i];
128 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
129 VS2: va = vb;
130 S2: a = b; STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2
132 Operands that are not SSA_NAMEs, are data-refs that appear in
133 load/store operations (like 'x[i]' in S1), and are handled differently.
135 Target modeling:
136 =================
137 Currently the only target specific information that is used is the
138 size of the vector (in bytes) - "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".
139 Targets that can support different sizes of vectors, for now will need
140 to specify one value for "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD". More
141 flexibility will be added in the future.
143 Since we only vectorize operations which vector form can be
144 expressed using existing tree codes, to verify that an operation is
145 supported, the vectorizer checks the relevant optab at the relevant
146 machine_mode (e.g, optab_handler (add_optab, V8HImode)). If
147 the value found is CODE_FOR_nothing, then there's no target support, and
148 we can't vectorize the stmt.
150 For additional information on this project see:
151 http://gcc.gnu.org/projects/tree-ssa/vectorization.html
154 static void vect_estimate_min_profitable_iters (loop_vec_info, int *, int *);
156 /* Function vect_determine_vectorization_factor
158 Determine the vectorization factor (VF). VF is the number of data elements
159 that are operated upon in parallel in a single iteration of the vectorized
160 loop. For example, when vectorizing a loop that operates on 4byte elements,
161 on a target with vector size (VS) 16byte, the VF is set to 4, since 4
162 elements can fit in a single vector register.
164 We currently support vectorization of loops in which all types operated upon
165 are of the same size. Therefore this function currently sets VF according to
166 the size of the types operated upon, and fails if there are multiple sizes
167 in the loop.
169 VF is also the factor by which the loop iterations are strip-mined, e.g.:
170 original loop:
171 for (i=0; i<N; i++){
172 a[i] = b[i] + c[i];
175 vectorized loop:
176 for (i=0; i<N; i+=VF){
177 a[i:VF] = b[i:VF] + c[i:VF];
181 static bool
182 vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
184 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
185 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
186 unsigned nbbs = loop->num_nodes;
187 unsigned int vectorization_factor = 0;
188 tree scalar_type = NULL_TREE;
189 gphi *phi;
190 tree vectype;
191 unsigned int nunits;
192 stmt_vec_info stmt_info;
193 unsigned i;
194 HOST_WIDE_INT dummy;
195 gimple *stmt, *pattern_stmt = NULL;
196 gimple_seq pattern_def_seq = NULL;
197 gimple_stmt_iterator pattern_def_si = gsi_none ();
198 bool analyze_pattern_stmt = false;
199 bool bool_result;
200 auto_vec<stmt_vec_info> mask_producers;
202 if (dump_enabled_p ())
203 dump_printf_loc (MSG_NOTE, vect_location,
204 "=== vect_determine_vectorization_factor ===\n");
206 for (i = 0; i < nbbs; i++)
208 basic_block bb = bbs[i];
210 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
211 gsi_next (&si))
213 phi = si.phi ();
214 stmt_info = vinfo_for_stmt (phi);
215 if (dump_enabled_p ())
217 dump_printf_loc (MSG_NOTE, vect_location, "==> examining phi: ");
218 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
221 gcc_assert (stmt_info);
223 if (STMT_VINFO_RELEVANT_P (stmt_info)
224 || STMT_VINFO_LIVE_P (stmt_info))
226 gcc_assert (!STMT_VINFO_VECTYPE (stmt_info));
227 scalar_type = TREE_TYPE (PHI_RESULT (phi));
229 if (dump_enabled_p ())
231 dump_printf_loc (MSG_NOTE, vect_location,
232 "get vectype for scalar type: ");
233 dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
234 dump_printf (MSG_NOTE, "\n");
237 vectype = get_vectype_for_scalar_type (scalar_type);
238 if (!vectype)
240 if (dump_enabled_p ())
242 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
243 "not vectorized: unsupported "
244 "data-type ");
245 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
246 scalar_type);
247 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
249 return false;
251 STMT_VINFO_VECTYPE (stmt_info) = vectype;
253 if (dump_enabled_p ())
255 dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
256 dump_generic_expr (MSG_NOTE, TDF_SLIM, vectype);
257 dump_printf (MSG_NOTE, "\n");
260 nunits = TYPE_VECTOR_SUBPARTS (vectype);
261 if (dump_enabled_p ())
262 dump_printf_loc (MSG_NOTE, vect_location, "nunits = %d\n",
263 nunits);
265 if (!vectorization_factor
266 || (nunits > vectorization_factor))
267 vectorization_factor = nunits;
271 for (gimple_stmt_iterator si = gsi_start_bb (bb);
272 !gsi_end_p (si) || analyze_pattern_stmt;)
274 tree vf_vectype;
276 if (analyze_pattern_stmt)
277 stmt = pattern_stmt;
278 else
279 stmt = gsi_stmt (si);
281 stmt_info = vinfo_for_stmt (stmt);
283 if (dump_enabled_p ())
285 dump_printf_loc (MSG_NOTE, vect_location,
286 "==> examining statement: ");
287 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
290 gcc_assert (stmt_info);
292 /* Skip stmts which do not need to be vectorized. */
293 if ((!STMT_VINFO_RELEVANT_P (stmt_info)
294 && !STMT_VINFO_LIVE_P (stmt_info))
295 || gimple_clobber_p (stmt))
297 if (STMT_VINFO_IN_PATTERN_P (stmt_info)
298 && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
299 && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
300 || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
302 stmt = pattern_stmt;
303 stmt_info = vinfo_for_stmt (pattern_stmt);
304 if (dump_enabled_p ())
306 dump_printf_loc (MSG_NOTE, vect_location,
307 "==> examining pattern statement: ");
308 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
311 else
313 if (dump_enabled_p ())
314 dump_printf_loc (MSG_NOTE, vect_location, "skip.\n");
315 gsi_next (&si);
316 continue;
319 else if (STMT_VINFO_IN_PATTERN_P (stmt_info)
320 && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
321 && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
322 || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
323 analyze_pattern_stmt = true;
325 /* If a pattern statement has def stmts, analyze them too. */
326 if (is_pattern_stmt_p (stmt_info))
328 if (pattern_def_seq == NULL)
330 pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
331 pattern_def_si = gsi_start (pattern_def_seq);
333 else if (!gsi_end_p (pattern_def_si))
334 gsi_next (&pattern_def_si);
335 if (pattern_def_seq != NULL)
337 gimple *pattern_def_stmt = NULL;
338 stmt_vec_info pattern_def_stmt_info = NULL;
340 while (!gsi_end_p (pattern_def_si))
342 pattern_def_stmt = gsi_stmt (pattern_def_si);
343 pattern_def_stmt_info
344 = vinfo_for_stmt (pattern_def_stmt);
345 if (STMT_VINFO_RELEVANT_P (pattern_def_stmt_info)
346 || STMT_VINFO_LIVE_P (pattern_def_stmt_info))
347 break;
348 gsi_next (&pattern_def_si);
351 if (!gsi_end_p (pattern_def_si))
353 if (dump_enabled_p ())
355 dump_printf_loc (MSG_NOTE, vect_location,
356 "==> examining pattern def stmt: ");
357 dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
358 pattern_def_stmt, 0);
361 stmt = pattern_def_stmt;
362 stmt_info = pattern_def_stmt_info;
364 else
366 pattern_def_si = gsi_none ();
367 analyze_pattern_stmt = false;
370 else
371 analyze_pattern_stmt = false;
374 if (gimple_get_lhs (stmt) == NULL_TREE
375 /* MASK_STORE has no lhs, but is ok. */
376 && (!is_gimple_call (stmt)
377 || !gimple_call_internal_p (stmt)
378 || gimple_call_internal_fn (stmt) != IFN_MASK_STORE))
380 if (is_gimple_call (stmt))
382 /* Ignore calls with no lhs. These must be calls to
383 #pragma omp simd functions, and what vectorization factor
384 it really needs can't be determined until
385 vectorizable_simd_clone_call. */
386 if (!analyze_pattern_stmt && gsi_end_p (pattern_def_si))
388 pattern_def_seq = NULL;
389 gsi_next (&si);
391 continue;
393 if (dump_enabled_p ())
395 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
396 "not vectorized: irregular stmt.");
397 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt,
400 return false;
403 if (VECTOR_MODE_P (TYPE_MODE (gimple_expr_type (stmt))))
405 if (dump_enabled_p ())
407 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
408 "not vectorized: vector stmt in loop:");
409 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
411 return false;
414 bool_result = false;
416 if (STMT_VINFO_VECTYPE (stmt_info))
418 /* The only case when a vectype had been already set is for stmts
419 that contain a dataref, or for "pattern-stmts" (stmts
420 generated by the vectorizer to represent/replace a certain
421 idiom). */
422 gcc_assert (STMT_VINFO_DATA_REF (stmt_info)
423 || is_pattern_stmt_p (stmt_info)
424 || !gsi_end_p (pattern_def_si));
425 vectype = STMT_VINFO_VECTYPE (stmt_info);
427 else
429 gcc_assert (!STMT_VINFO_DATA_REF (stmt_info));
430 if (gimple_call_internal_p (stmt, IFN_MASK_STORE))
431 scalar_type = TREE_TYPE (gimple_call_arg (stmt, 3));
432 else
433 scalar_type = TREE_TYPE (gimple_get_lhs (stmt));
435 /* Bool ops don't participate in vectorization factor
436 computation. For comparison use compared types to
437 compute a factor. */
438 if (VECT_SCALAR_BOOLEAN_TYPE_P (scalar_type)
439 && is_gimple_assign (stmt)
440 && gimple_assign_rhs_code (stmt) != COND_EXPR)
442 if (STMT_VINFO_RELEVANT_P (stmt_info)
443 || STMT_VINFO_LIVE_P (stmt_info))
444 mask_producers.safe_push (stmt_info);
445 bool_result = true;
447 if (TREE_CODE_CLASS (gimple_assign_rhs_code (stmt))
448 == tcc_comparison
449 && !VECT_SCALAR_BOOLEAN_TYPE_P
450 (TREE_TYPE (gimple_assign_rhs1 (stmt))))
451 scalar_type = TREE_TYPE (gimple_assign_rhs1 (stmt));
452 else
454 if (!analyze_pattern_stmt && gsi_end_p (pattern_def_si))
456 pattern_def_seq = NULL;
457 gsi_next (&si);
459 continue;
463 if (dump_enabled_p ())
465 dump_printf_loc (MSG_NOTE, vect_location,
466 "get vectype for scalar type: ");
467 dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
468 dump_printf (MSG_NOTE, "\n");
470 vectype = get_vectype_for_scalar_type (scalar_type);
471 if (!vectype)
473 if (dump_enabled_p ())
475 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
476 "not vectorized: unsupported "
477 "data-type ");
478 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
479 scalar_type);
480 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
482 return false;
485 if (!bool_result)
486 STMT_VINFO_VECTYPE (stmt_info) = vectype;
488 if (dump_enabled_p ())
490 dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
491 dump_generic_expr (MSG_NOTE, TDF_SLIM, vectype);
492 dump_printf (MSG_NOTE, "\n");
496 /* Don't try to compute VF out scalar types if we stmt
497 produces boolean vector. Use result vectype instead. */
498 if (VECTOR_BOOLEAN_TYPE_P (vectype))
499 vf_vectype = vectype;
500 else
502 /* The vectorization factor is according to the smallest
503 scalar type (or the largest vector size, but we only
504 support one vector size per loop). */
505 if (!bool_result)
506 scalar_type = vect_get_smallest_scalar_type (stmt, &dummy,
507 &dummy);
508 if (dump_enabled_p ())
510 dump_printf_loc (MSG_NOTE, vect_location,
511 "get vectype for scalar type: ");
512 dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
513 dump_printf (MSG_NOTE, "\n");
515 vf_vectype = get_vectype_for_scalar_type (scalar_type);
517 if (!vf_vectype)
519 if (dump_enabled_p ())
521 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
522 "not vectorized: unsupported data-type ");
523 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
524 scalar_type);
525 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
527 return false;
530 if ((GET_MODE_SIZE (TYPE_MODE (vectype))
531 != GET_MODE_SIZE (TYPE_MODE (vf_vectype))))
533 if (dump_enabled_p ())
535 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
536 "not vectorized: different sized vector "
537 "types in statement, ");
538 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
539 vectype);
540 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
541 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
542 vf_vectype);
543 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
545 return false;
548 if (dump_enabled_p ())
550 dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
551 dump_generic_expr (MSG_NOTE, TDF_SLIM, vf_vectype);
552 dump_printf (MSG_NOTE, "\n");
555 nunits = TYPE_VECTOR_SUBPARTS (vf_vectype);
556 if (dump_enabled_p ())
557 dump_printf_loc (MSG_NOTE, vect_location, "nunits = %d\n", nunits);
558 if (!vectorization_factor
559 || (nunits > vectorization_factor))
560 vectorization_factor = nunits;
562 if (!analyze_pattern_stmt && gsi_end_p (pattern_def_si))
564 pattern_def_seq = NULL;
565 gsi_next (&si);
570 /* TODO: Analyze cost. Decide if worth while to vectorize. */
571 if (dump_enabled_p ())
572 dump_printf_loc (MSG_NOTE, vect_location, "vectorization factor = %d\n",
573 vectorization_factor);
574 if (vectorization_factor <= 1)
576 if (dump_enabled_p ())
577 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
578 "not vectorized: unsupported data-type\n");
579 return false;
581 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
583 for (i = 0; i < mask_producers.length (); i++)
585 tree mask_type = NULL;
587 stmt = STMT_VINFO_STMT (mask_producers[i]);
589 if (is_gimple_assign (stmt)
590 && TREE_CODE_CLASS (gimple_assign_rhs_code (stmt)) == tcc_comparison
591 && !VECT_SCALAR_BOOLEAN_TYPE_P
592 (TREE_TYPE (gimple_assign_rhs1 (stmt))))
594 scalar_type = TREE_TYPE (gimple_assign_rhs1 (stmt));
595 mask_type = get_mask_type_for_scalar_type (scalar_type);
597 if (!mask_type)
599 if (dump_enabled_p ())
600 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
601 "not vectorized: unsupported mask\n");
602 return false;
605 else
607 tree rhs;
608 ssa_op_iter iter;
609 gimple *def_stmt;
610 enum vect_def_type dt;
612 FOR_EACH_SSA_TREE_OPERAND (rhs, stmt, iter, SSA_OP_USE)
614 if (!vect_is_simple_use (rhs, mask_producers[i]->vinfo,
615 &def_stmt, &dt, &vectype))
617 if (dump_enabled_p ())
619 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
620 "not vectorized: can't compute mask type "
621 "for statement, ");
622 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt,
625 return false;
628 /* No vectype probably means external definition.
629 Allow it in case there is another operand which
630 allows to determine mask type. */
631 if (!vectype)
632 continue;
634 if (!mask_type)
635 mask_type = vectype;
636 else if (TYPE_VECTOR_SUBPARTS (mask_type)
637 != TYPE_VECTOR_SUBPARTS (vectype))
639 if (dump_enabled_p ())
641 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
642 "not vectorized: different sized masks "
643 "types in statement, ");
644 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
645 mask_type);
646 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
647 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
648 vectype);
649 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
651 return false;
653 else if (VECTOR_BOOLEAN_TYPE_P (mask_type)
654 != VECTOR_BOOLEAN_TYPE_P (vectype))
656 if (dump_enabled_p ())
658 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
659 "not vectorized: mixed mask and "
660 "nonmask vector types in statement, ");
661 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
662 mask_type);
663 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
664 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
665 vectype);
666 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
668 return false;
672 /* We may compare boolean value loaded as vector of integers.
673 Fix mask_type in such case. */
674 if (mask_type
675 && !VECTOR_BOOLEAN_TYPE_P (mask_type)
676 && gimple_code (stmt) == GIMPLE_ASSIGN
677 && TREE_CODE_CLASS (gimple_assign_rhs_code (stmt)) == tcc_comparison)
678 mask_type = build_same_sized_truth_vector_type (mask_type);
681 /* No mask_type should mean loop invariant predicate.
682 This is probably a subject for optimization in
683 if-conversion. */
684 if (!mask_type)
686 if (dump_enabled_p ())
688 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
689 "not vectorized: can't compute mask type "
690 "for statement, ");
691 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt,
694 return false;
697 STMT_VINFO_VECTYPE (mask_producers[i]) = mask_type;
700 return true;
704 /* Function vect_is_simple_iv_evolution.
706 FORNOW: A simple evolution of an induction variables in the loop is
707 considered a polynomial evolution. */
709 static bool
710 vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init,
711 tree * step)
713 tree init_expr;
714 tree step_expr;
715 tree evolution_part = evolution_part_in_loop_num (access_fn, loop_nb);
716 basic_block bb;
718 /* When there is no evolution in this loop, the evolution function
719 is not "simple". */
720 if (evolution_part == NULL_TREE)
721 return false;
723 /* When the evolution is a polynomial of degree >= 2
724 the evolution function is not "simple". */
725 if (tree_is_chrec (evolution_part))
726 return false;
728 step_expr = evolution_part;
729 init_expr = unshare_expr (initial_condition_in_loop_num (access_fn, loop_nb));
731 if (dump_enabled_p ())
733 dump_printf_loc (MSG_NOTE, vect_location, "step: ");
734 dump_generic_expr (MSG_NOTE, TDF_SLIM, step_expr);
735 dump_printf (MSG_NOTE, ", init: ");
736 dump_generic_expr (MSG_NOTE, TDF_SLIM, init_expr);
737 dump_printf (MSG_NOTE, "\n");
740 *init = init_expr;
741 *step = step_expr;
743 if (TREE_CODE (step_expr) != INTEGER_CST
744 && (TREE_CODE (step_expr) != SSA_NAME
745 || ((bb = gimple_bb (SSA_NAME_DEF_STMT (step_expr)))
746 && flow_bb_inside_loop_p (get_loop (cfun, loop_nb), bb))
747 || (!INTEGRAL_TYPE_P (TREE_TYPE (step_expr))
748 && (!SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr))
749 || !flag_associative_math)))
750 && (TREE_CODE (step_expr) != REAL_CST
751 || !flag_associative_math))
753 if (dump_enabled_p ())
754 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
755 "step unknown.\n");
756 return false;
759 return true;
762 /* Function vect_analyze_scalar_cycles_1.
764 Examine the cross iteration def-use cycles of scalar variables
765 in LOOP. LOOP_VINFO represents the loop that is now being
766 considered for vectorization (can be LOOP, or an outer-loop
767 enclosing LOOP). */
769 static void
770 vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, struct loop *loop)
772 basic_block bb = loop->header;
773 tree init, step;
774 auto_vec<gimple *, 64> worklist;
775 gphi_iterator gsi;
776 bool double_reduc;
778 if (dump_enabled_p ())
779 dump_printf_loc (MSG_NOTE, vect_location,
780 "=== vect_analyze_scalar_cycles ===\n");
782 /* First - identify all inductions. Reduction detection assumes that all the
783 inductions have been identified, therefore, this order must not be
784 changed. */
785 for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
787 gphi *phi = gsi.phi ();
788 tree access_fn = NULL;
789 tree def = PHI_RESULT (phi);
790 stmt_vec_info stmt_vinfo = vinfo_for_stmt (phi);
792 if (dump_enabled_p ())
794 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: ");
795 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
798 /* Skip virtual phi's. The data dependences that are associated with
799 virtual defs/uses (i.e., memory accesses) are analyzed elsewhere. */
800 if (virtual_operand_p (def))
801 continue;
803 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_unknown_def_type;
805 /* Analyze the evolution function. */
806 access_fn = analyze_scalar_evolution (loop, def);
807 if (access_fn)
809 STRIP_NOPS (access_fn);
810 if (dump_enabled_p ())
812 dump_printf_loc (MSG_NOTE, vect_location,
813 "Access function of PHI: ");
814 dump_generic_expr (MSG_NOTE, TDF_SLIM, access_fn);
815 dump_printf (MSG_NOTE, "\n");
817 STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
818 = initial_condition_in_loop_num (access_fn, loop->num);
819 STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo)
820 = evolution_part_in_loop_num (access_fn, loop->num);
823 if (!access_fn
824 || !vect_is_simple_iv_evolution (loop->num, access_fn, &init, &step)
825 || (LOOP_VINFO_LOOP (loop_vinfo) != loop
826 && TREE_CODE (step) != INTEGER_CST))
828 worklist.safe_push (phi);
829 continue;
832 gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
833 != NULL_TREE);
834 gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo) != NULL_TREE);
836 if (dump_enabled_p ())
837 dump_printf_loc (MSG_NOTE, vect_location, "Detected induction.\n");
838 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_induction_def;
842 /* Second - identify all reductions and nested cycles. */
843 while (worklist.length () > 0)
845 gimple *phi = worklist.pop ();
846 tree def = PHI_RESULT (phi);
847 stmt_vec_info stmt_vinfo = vinfo_for_stmt (phi);
848 gimple *reduc_stmt;
850 if (dump_enabled_p ())
852 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: ");
853 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
856 gcc_assert (!virtual_operand_p (def)
857 && STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_unknown_def_type);
859 reduc_stmt = vect_force_simple_reduction (loop_vinfo, phi,
860 &double_reduc, false);
861 if (reduc_stmt)
863 if (double_reduc)
865 if (dump_enabled_p ())
866 dump_printf_loc (MSG_NOTE, vect_location,
867 "Detected double reduction.\n");
869 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_double_reduction_def;
870 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
871 vect_double_reduction_def;
873 else
875 if (loop != LOOP_VINFO_LOOP (loop_vinfo))
877 if (dump_enabled_p ())
878 dump_printf_loc (MSG_NOTE, vect_location,
879 "Detected vectorizable nested cycle.\n");
881 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_nested_cycle;
882 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
883 vect_nested_cycle;
885 else
887 if (dump_enabled_p ())
888 dump_printf_loc (MSG_NOTE, vect_location,
889 "Detected reduction.\n");
891 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def;
892 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
893 vect_reduction_def;
894 /* Store the reduction cycles for possible vectorization in
895 loop-aware SLP if it was not detected as reduction
896 chain. */
897 if (! GROUP_FIRST_ELEMENT (vinfo_for_stmt (reduc_stmt)))
898 LOOP_VINFO_REDUCTIONS (loop_vinfo).safe_push (reduc_stmt);
902 else
903 if (dump_enabled_p ())
904 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
905 "Unknown def-use cycle pattern.\n");
910 /* Function vect_analyze_scalar_cycles.
912 Examine the cross iteration def-use cycles of scalar variables, by
913 analyzing the loop-header PHIs of scalar variables. Classify each
914 cycle as one of the following: invariant, induction, reduction, unknown.
915 We do that for the loop represented by LOOP_VINFO, and also to its
916 inner-loop, if exists.
917 Examples for scalar cycles:
919 Example1: reduction:
921 loop1:
922 for (i=0; i<N; i++)
923 sum += a[i];
925 Example2: induction:
927 loop2:
928 for (i=0; i<N; i++)
929 a[i] = i; */
931 static void
932 vect_analyze_scalar_cycles (loop_vec_info loop_vinfo)
934 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
936 vect_analyze_scalar_cycles_1 (loop_vinfo, loop);
938 /* When vectorizing an outer-loop, the inner-loop is executed sequentially.
939 Reductions in such inner-loop therefore have different properties than
940 the reductions in the nest that gets vectorized:
941 1. When vectorized, they are executed in the same order as in the original
942 scalar loop, so we can't change the order of computation when
943 vectorizing them.
944 2. FIXME: Inner-loop reductions can be used in the inner-loop, so the
945 current checks are too strict. */
947 if (loop->inner)
948 vect_analyze_scalar_cycles_1 (loop_vinfo, loop->inner);
951 /* Transfer group and reduction information from STMT to its pattern stmt. */
953 static void
954 vect_fixup_reduc_chain (gimple *stmt)
956 gimple *firstp = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt));
957 gimple *stmtp;
958 gcc_assert (!GROUP_FIRST_ELEMENT (vinfo_for_stmt (firstp))
959 && GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)));
960 GROUP_SIZE (vinfo_for_stmt (firstp)) = GROUP_SIZE (vinfo_for_stmt (stmt));
963 stmtp = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt));
964 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmtp)) = firstp;
965 stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmt));
966 if (stmt)
967 GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmtp))
968 = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt));
970 while (stmt);
971 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmtp)) = vect_reduction_def;
974 /* Fixup scalar cycles that now have their stmts detected as patterns. */
976 static void
977 vect_fixup_scalar_cycles_with_patterns (loop_vec_info loop_vinfo)
979 gimple *first;
980 unsigned i;
982 FOR_EACH_VEC_ELT (LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo), i, first)
983 if (STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (first)))
985 gimple *next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (first));
986 while (next)
988 if (! STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (next)))
989 break;
990 next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
992 /* If not all stmt in the chain are patterns try to handle
993 the chain without patterns. */
994 if (! next)
996 vect_fixup_reduc_chain (first);
997 LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo)[i]
998 = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (first));
1003 /* Function vect_get_loop_niters.
1005 Determine how many iterations the loop is executed and place it
1006 in NUMBER_OF_ITERATIONS. Place the number of latch iterations
1007 in NUMBER_OF_ITERATIONSM1. Place the condition under which the
1008 niter information holds in ASSUMPTIONS.
1010 Return the loop exit condition. */
1013 static gcond *
1014 vect_get_loop_niters (struct loop *loop, tree *assumptions,
1015 tree *number_of_iterations, tree *number_of_iterationsm1)
1017 edge exit = single_exit (loop);
1018 struct tree_niter_desc niter_desc;
1019 tree niter_assumptions, niter, may_be_zero;
1020 gcond *cond = get_loop_exit_condition (loop);
1022 *assumptions = boolean_true_node;
1023 *number_of_iterationsm1 = chrec_dont_know;
1024 *number_of_iterations = chrec_dont_know;
1025 if (dump_enabled_p ())
1026 dump_printf_loc (MSG_NOTE, vect_location,
1027 "=== get_loop_niters ===\n");
1029 if (!exit)
1030 return cond;
1032 niter = chrec_dont_know;
1033 may_be_zero = NULL_TREE;
1034 niter_assumptions = boolean_true_node;
1035 if (!number_of_iterations_exit_assumptions (loop, exit, &niter_desc, NULL)
1036 || chrec_contains_undetermined (niter_desc.niter))
1037 return cond;
1039 niter_assumptions = niter_desc.assumptions;
1040 may_be_zero = niter_desc.may_be_zero;
1041 niter = niter_desc.niter;
1043 if (may_be_zero && integer_zerop (may_be_zero))
1044 may_be_zero = NULL_TREE;
1046 if (may_be_zero)
1048 if (COMPARISON_CLASS_P (may_be_zero))
1050 /* Try to combine may_be_zero with assumptions, this can simplify
1051 computation of niter expression. */
1052 if (niter_assumptions && !integer_nonzerop (niter_assumptions))
1053 niter_assumptions = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
1054 niter_assumptions,
1055 fold_build1 (TRUTH_NOT_EXPR,
1056 boolean_type_node,
1057 may_be_zero));
1058 else
1059 niter = fold_build3 (COND_EXPR, TREE_TYPE (niter), may_be_zero,
1060 build_int_cst (TREE_TYPE (niter), 0), niter);
1062 may_be_zero = NULL_TREE;
1064 else if (integer_nonzerop (may_be_zero))
1066 *number_of_iterationsm1 = build_int_cst (TREE_TYPE (niter), 0);
1067 *number_of_iterations = build_int_cst (TREE_TYPE (niter), 1);
1068 return cond;
1070 else
1071 return cond;
1074 *assumptions = niter_assumptions;
1075 *number_of_iterationsm1 = niter;
1077 /* We want the number of loop header executions which is the number
1078 of latch executions plus one.
1079 ??? For UINT_MAX latch executions this number overflows to zero
1080 for loops like do { n++; } while (n != 0); */
1081 if (niter && !chrec_contains_undetermined (niter))
1082 niter = fold_build2 (PLUS_EXPR, TREE_TYPE (niter), unshare_expr (niter),
1083 build_int_cst (TREE_TYPE (niter), 1));
1084 *number_of_iterations = niter;
1086 return cond;
1089 /* Function bb_in_loop_p
1091 Used as predicate for dfs order traversal of the loop bbs. */
1093 static bool
1094 bb_in_loop_p (const_basic_block bb, const void *data)
1096 const struct loop *const loop = (const struct loop *)data;
1097 if (flow_bb_inside_loop_p (loop, bb))
1098 return true;
1099 return false;
1103 /* Create and initialize a new loop_vec_info struct for LOOP_IN, as well as
1104 stmt_vec_info structs for all the stmts in LOOP_IN. */
1106 _loop_vec_info::_loop_vec_info (struct loop *loop_in)
1107 : vec_info (vec_info::loop, init_cost (loop_in)),
1108 loop (loop_in),
1109 bbs (XCNEWVEC (basic_block, loop->num_nodes)),
1110 num_itersm1 (NULL_TREE),
1111 num_iters (NULL_TREE),
1112 num_iters_unchanged (NULL_TREE),
1113 num_iters_assumptions (NULL_TREE),
1114 th (0),
1115 versioning_threshold (0),
1116 vectorization_factor (0),
1117 max_vectorization_factor (0),
1118 unaligned_dr (NULL),
1119 peeling_for_alignment (0),
1120 ptr_mask (0),
1121 slp_unrolling_factor (1),
1122 single_scalar_iteration_cost (0),
1123 vectorizable (false),
1124 peeling_for_gaps (false),
1125 peeling_for_niter (false),
1126 operands_swapped (false),
1127 no_data_dependencies (false),
1128 has_mask_store (false),
1129 scalar_loop (NULL),
1130 orig_loop_info (NULL)
1132 /* Create/Update stmt_info for all stmts in the loop. */
1133 basic_block *body = get_loop_body (loop);
1134 for (unsigned int i = 0; i < loop->num_nodes; i++)
1136 basic_block bb = body[i];
1137 gimple_stmt_iterator si;
1139 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
1141 gimple *phi = gsi_stmt (si);
1142 gimple_set_uid (phi, 0);
1143 set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, this));
1146 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1148 gimple *stmt = gsi_stmt (si);
1149 gimple_set_uid (stmt, 0);
1150 set_vinfo_for_stmt (stmt, new_stmt_vec_info (stmt, this));
1153 free (body);
1155 /* CHECKME: We want to visit all BBs before their successors (except for
1156 latch blocks, for which this assertion wouldn't hold). In the simple
1157 case of the loop forms we allow, a dfs order of the BBs would the same
1158 as reversed postorder traversal, so we are safe. */
1160 unsigned int nbbs = dfs_enumerate_from (loop->header, 0, bb_in_loop_p,
1161 bbs, loop->num_nodes, loop);
1162 gcc_assert (nbbs == loop->num_nodes);
1166 /* Free all memory used by the _loop_vec_info, as well as all the
1167 stmt_vec_info structs of all the stmts in the loop. */
1169 _loop_vec_info::~_loop_vec_info ()
1171 int nbbs;
1172 gimple_stmt_iterator si;
1173 int j;
1175 nbbs = loop->num_nodes;
1176 for (j = 0; j < nbbs; j++)
1178 basic_block bb = bbs[j];
1179 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
1180 free_stmt_vec_info (gsi_stmt (si));
1182 for (si = gsi_start_bb (bb); !gsi_end_p (si); )
1184 gimple *stmt = gsi_stmt (si);
1186 /* We may have broken canonical form by moving a constant
1187 into RHS1 of a commutative op. Fix such occurrences. */
1188 if (operands_swapped && is_gimple_assign (stmt))
1190 enum tree_code code = gimple_assign_rhs_code (stmt);
1192 if ((code == PLUS_EXPR
1193 || code == POINTER_PLUS_EXPR
1194 || code == MULT_EXPR)
1195 && CONSTANT_CLASS_P (gimple_assign_rhs1 (stmt)))
1196 swap_ssa_operands (stmt,
1197 gimple_assign_rhs1_ptr (stmt),
1198 gimple_assign_rhs2_ptr (stmt));
1199 else if (code == COND_EXPR
1200 && CONSTANT_CLASS_P (gimple_assign_rhs2 (stmt)))
1202 tree cond_expr = gimple_assign_rhs1 (stmt);
1203 enum tree_code cond_code = TREE_CODE (cond_expr);
1205 if (TREE_CODE_CLASS (cond_code) == tcc_comparison)
1207 bool honor_nans = HONOR_NANS (TREE_OPERAND (cond_expr,
1208 0));
1209 cond_code = invert_tree_comparison (cond_code,
1210 honor_nans);
1211 if (cond_code != ERROR_MARK)
1213 TREE_SET_CODE (cond_expr, cond_code);
1214 swap_ssa_operands (stmt,
1215 gimple_assign_rhs2_ptr (stmt),
1216 gimple_assign_rhs3_ptr (stmt));
1222 /* Free stmt_vec_info. */
1223 free_stmt_vec_info (stmt);
1224 gsi_next (&si);
1228 free (bbs);
1230 loop->aux = NULL;
1234 /* Calculate the cost of one scalar iteration of the loop. */
1235 static void
1236 vect_compute_single_scalar_iteration_cost (loop_vec_info loop_vinfo)
1238 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1239 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1240 int nbbs = loop->num_nodes, factor, scalar_single_iter_cost = 0;
1241 int innerloop_iters, i;
1243 /* Count statements in scalar loop. Using this as scalar cost for a single
1244 iteration for now.
1246 TODO: Add outer loop support.
1248 TODO: Consider assigning different costs to different scalar
1249 statements. */
1251 /* FORNOW. */
1252 innerloop_iters = 1;
1253 if (loop->inner)
1254 innerloop_iters = 50; /* FIXME */
1256 for (i = 0; i < nbbs; i++)
1258 gimple_stmt_iterator si;
1259 basic_block bb = bbs[i];
1261 if (bb->loop_father == loop->inner)
1262 factor = innerloop_iters;
1263 else
1264 factor = 1;
1266 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1268 gimple *stmt = gsi_stmt (si);
1269 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1271 if (!is_gimple_assign (stmt) && !is_gimple_call (stmt))
1272 continue;
1274 /* Skip stmts that are not vectorized inside the loop. */
1275 if (stmt_info
1276 && !STMT_VINFO_RELEVANT_P (stmt_info)
1277 && (!STMT_VINFO_LIVE_P (stmt_info)
1278 || !VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1279 && !STMT_VINFO_IN_PATTERN_P (stmt_info))
1280 continue;
1282 vect_cost_for_stmt kind;
1283 if (STMT_VINFO_DATA_REF (stmt_info))
1285 if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
1286 kind = scalar_load;
1287 else
1288 kind = scalar_store;
1290 else
1291 kind = scalar_stmt;
1293 scalar_single_iter_cost
1294 += record_stmt_cost (&LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1295 factor, kind, stmt_info, 0, vect_prologue);
1298 LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo)
1299 = scalar_single_iter_cost;
1303 /* Function vect_analyze_loop_form_1.
1305 Verify that certain CFG restrictions hold, including:
1306 - the loop has a pre-header
1307 - the loop has a single entry and exit
1308 - the loop exit condition is simple enough
1309 - the number of iterations can be analyzed, i.e, a countable loop. The
1310 niter could be analyzed under some assumptions. */
1312 bool
1313 vect_analyze_loop_form_1 (struct loop *loop, gcond **loop_cond,
1314 tree *assumptions, tree *number_of_iterationsm1,
1315 tree *number_of_iterations, gcond **inner_loop_cond)
1317 if (dump_enabled_p ())
1318 dump_printf_loc (MSG_NOTE, vect_location,
1319 "=== vect_analyze_loop_form ===\n");
1321 /* Different restrictions apply when we are considering an inner-most loop,
1322 vs. an outer (nested) loop.
1323 (FORNOW. May want to relax some of these restrictions in the future). */
1325 if (!loop->inner)
1327 /* Inner-most loop. We currently require that the number of BBs is
1328 exactly 2 (the header and latch). Vectorizable inner-most loops
1329 look like this:
1331 (pre-header)
1333 header <--------+
1334 | | |
1335 | +--> latch --+
1337 (exit-bb) */
1339 if (loop->num_nodes != 2)
1341 if (dump_enabled_p ())
1342 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1343 "not vectorized: control flow in loop.\n");
1344 return false;
1347 if (empty_block_p (loop->header))
1349 if (dump_enabled_p ())
1350 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1351 "not vectorized: empty loop.\n");
1352 return false;
1355 else
1357 struct loop *innerloop = loop->inner;
1358 edge entryedge;
1360 /* Nested loop. We currently require that the loop is doubly-nested,
1361 contains a single inner loop, and the number of BBs is exactly 5.
1362 Vectorizable outer-loops look like this:
1364 (pre-header)
1366 header <---+
1368 inner-loop |
1370 tail ------+
1372 (exit-bb)
1374 The inner-loop has the properties expected of inner-most loops
1375 as described above. */
1377 if ((loop->inner)->inner || (loop->inner)->next)
1379 if (dump_enabled_p ())
1380 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1381 "not vectorized: multiple nested loops.\n");
1382 return false;
1385 if (loop->num_nodes != 5)
1387 if (dump_enabled_p ())
1388 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1389 "not vectorized: control flow in loop.\n");
1390 return false;
1393 entryedge = loop_preheader_edge (innerloop);
1394 if (entryedge->src != loop->header
1395 || !single_exit (innerloop)
1396 || single_exit (innerloop)->dest != EDGE_PRED (loop->latch, 0)->src)
1398 if (dump_enabled_p ())
1399 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1400 "not vectorized: unsupported outerloop form.\n");
1401 return false;
1404 /* Analyze the inner-loop. */
1405 tree inner_niterm1, inner_niter, inner_assumptions;
1406 if (! vect_analyze_loop_form_1 (loop->inner, inner_loop_cond,
1407 &inner_assumptions, &inner_niterm1,
1408 &inner_niter, NULL)
1409 /* Don't support analyzing niter under assumptions for inner
1410 loop. */
1411 || !integer_onep (inner_assumptions))
1413 if (dump_enabled_p ())
1414 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1415 "not vectorized: Bad inner loop.\n");
1416 return false;
1419 if (!expr_invariant_in_loop_p (loop, inner_niter))
1421 if (dump_enabled_p ())
1422 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1423 "not vectorized: inner-loop count not"
1424 " invariant.\n");
1425 return false;
1428 if (dump_enabled_p ())
1429 dump_printf_loc (MSG_NOTE, vect_location,
1430 "Considering outer-loop vectorization.\n");
1433 if (!single_exit (loop)
1434 || EDGE_COUNT (loop->header->preds) != 2)
1436 if (dump_enabled_p ())
1438 if (!single_exit (loop))
1439 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1440 "not vectorized: multiple exits.\n");
1441 else if (EDGE_COUNT (loop->header->preds) != 2)
1442 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1443 "not vectorized: too many incoming edges.\n");
1445 return false;
1448 /* We assume that the loop exit condition is at the end of the loop. i.e,
1449 that the loop is represented as a do-while (with a proper if-guard
1450 before the loop if needed), where the loop header contains all the
1451 executable statements, and the latch is empty. */
1452 if (!empty_block_p (loop->latch)
1453 || !gimple_seq_empty_p (phi_nodes (loop->latch)))
1455 if (dump_enabled_p ())
1456 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1457 "not vectorized: latch block not empty.\n");
1458 return false;
1461 /* Make sure the exit is not abnormal. */
1462 edge e = single_exit (loop);
1463 if (e->flags & EDGE_ABNORMAL)
1465 if (dump_enabled_p ())
1466 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1467 "not vectorized: abnormal loop exit edge.\n");
1468 return false;
1471 *loop_cond = vect_get_loop_niters (loop, assumptions, number_of_iterations,
1472 number_of_iterationsm1);
1473 if (!*loop_cond)
1475 if (dump_enabled_p ())
1476 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1477 "not vectorized: complicated exit condition.\n");
1478 return false;
1481 if (integer_zerop (*assumptions)
1482 || !*number_of_iterations
1483 || chrec_contains_undetermined (*number_of_iterations))
1485 if (dump_enabled_p ())
1486 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1487 "not vectorized: number of iterations cannot be "
1488 "computed.\n");
1489 return false;
1492 if (integer_zerop (*number_of_iterations))
1494 if (dump_enabled_p ())
1495 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1496 "not vectorized: number of iterations = 0.\n");
1497 return false;
1500 return true;
1503 /* Analyze LOOP form and return a loop_vec_info if it is of suitable form. */
1505 loop_vec_info
1506 vect_analyze_loop_form (struct loop *loop)
1508 tree assumptions, number_of_iterations, number_of_iterationsm1;
1509 gcond *loop_cond, *inner_loop_cond = NULL;
1511 if (! vect_analyze_loop_form_1 (loop, &loop_cond,
1512 &assumptions, &number_of_iterationsm1,
1513 &number_of_iterations, &inner_loop_cond))
1514 return NULL;
1516 loop_vec_info loop_vinfo = new _loop_vec_info (loop);
1517 LOOP_VINFO_NITERSM1 (loop_vinfo) = number_of_iterationsm1;
1518 LOOP_VINFO_NITERS (loop_vinfo) = number_of_iterations;
1519 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = number_of_iterations;
1520 if (!integer_onep (assumptions))
1522 /* We consider to vectorize this loop by versioning it under
1523 some assumptions. In order to do this, we need to clear
1524 existing information computed by scev and niter analyzer. */
1525 scev_reset_htab ();
1526 free_numbers_of_iterations_estimates (loop);
1527 /* Also set flag for this loop so that following scev and niter
1528 analysis are done under the assumptions. */
1529 loop_constraint_set (loop, LOOP_C_FINITE);
1530 /* Also record the assumptions for versioning. */
1531 LOOP_VINFO_NITERS_ASSUMPTIONS (loop_vinfo) = assumptions;
1534 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1536 if (dump_enabled_p ())
1538 dump_printf_loc (MSG_NOTE, vect_location,
1539 "Symbolic number of iterations is ");
1540 dump_generic_expr (MSG_NOTE, TDF_DETAILS, number_of_iterations);
1541 dump_printf (MSG_NOTE, "\n");
1545 STMT_VINFO_TYPE (vinfo_for_stmt (loop_cond)) = loop_exit_ctrl_vec_info_type;
1546 if (inner_loop_cond)
1547 STMT_VINFO_TYPE (vinfo_for_stmt (inner_loop_cond))
1548 = loop_exit_ctrl_vec_info_type;
1550 gcc_assert (!loop->aux);
1551 loop->aux = loop_vinfo;
1552 return loop_vinfo;
1557 /* Scan the loop stmts and dependent on whether there are any (non-)SLP
1558 statements update the vectorization factor. */
1560 static void
1561 vect_update_vf_for_slp (loop_vec_info loop_vinfo)
1563 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1564 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1565 int nbbs = loop->num_nodes;
1566 unsigned int vectorization_factor;
1567 int i;
1569 if (dump_enabled_p ())
1570 dump_printf_loc (MSG_NOTE, vect_location,
1571 "=== vect_update_vf_for_slp ===\n");
1573 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1574 gcc_assert (vectorization_factor != 0);
1576 /* If all the stmts in the loop can be SLPed, we perform only SLP, and
1577 vectorization factor of the loop is the unrolling factor required by
1578 the SLP instances. If that unrolling factor is 1, we say, that we
1579 perform pure SLP on loop - cross iteration parallelism is not
1580 exploited. */
1581 bool only_slp_in_loop = true;
1582 for (i = 0; i < nbbs; i++)
1584 basic_block bb = bbs[i];
1585 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1586 gsi_next (&si))
1588 gimple *stmt = gsi_stmt (si);
1589 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1590 if (STMT_VINFO_IN_PATTERN_P (stmt_info)
1591 && STMT_VINFO_RELATED_STMT (stmt_info))
1593 stmt = STMT_VINFO_RELATED_STMT (stmt_info);
1594 stmt_info = vinfo_for_stmt (stmt);
1596 if ((STMT_VINFO_RELEVANT_P (stmt_info)
1597 || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1598 && !PURE_SLP_STMT (stmt_info))
1599 /* STMT needs both SLP and loop-based vectorization. */
1600 only_slp_in_loop = false;
1604 if (only_slp_in_loop)
1606 dump_printf_loc (MSG_NOTE, vect_location,
1607 "Loop contains only SLP stmts\n");
1608 vectorization_factor = LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo);
1610 else
1612 dump_printf_loc (MSG_NOTE, vect_location,
1613 "Loop contains SLP and non-SLP stmts\n");
1614 vectorization_factor
1615 = least_common_multiple (vectorization_factor,
1616 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo));
1619 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
1620 if (dump_enabled_p ())
1621 dump_printf_loc (MSG_NOTE, vect_location,
1622 "Updating vectorization factor to %d\n",
1623 vectorization_factor);
1626 /* Function vect_analyze_loop_operations.
1628 Scan the loop stmts and make sure they are all vectorizable. */
1630 static bool
1631 vect_analyze_loop_operations (loop_vec_info loop_vinfo)
1633 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1634 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1635 int nbbs = loop->num_nodes;
1636 int i;
1637 stmt_vec_info stmt_info;
1638 bool need_to_vectorize = false;
1639 bool ok;
1641 if (dump_enabled_p ())
1642 dump_printf_loc (MSG_NOTE, vect_location,
1643 "=== vect_analyze_loop_operations ===\n");
1645 for (i = 0; i < nbbs; i++)
1647 basic_block bb = bbs[i];
1649 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
1650 gsi_next (&si))
1652 gphi *phi = si.phi ();
1653 ok = true;
1655 stmt_info = vinfo_for_stmt (phi);
1656 if (dump_enabled_p ())
1658 dump_printf_loc (MSG_NOTE, vect_location, "examining phi: ");
1659 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
1661 if (virtual_operand_p (gimple_phi_result (phi)))
1662 continue;
1664 /* Inner-loop loop-closed exit phi in outer-loop vectorization
1665 (i.e., a phi in the tail of the outer-loop). */
1666 if (! is_loop_header_bb_p (bb))
1668 /* FORNOW: we currently don't support the case that these phis
1669 are not used in the outerloop (unless it is double reduction,
1670 i.e., this phi is vect_reduction_def), cause this case
1671 requires to actually do something here. */
1672 if (STMT_VINFO_LIVE_P (stmt_info)
1673 && STMT_VINFO_DEF_TYPE (stmt_info)
1674 != vect_double_reduction_def)
1676 if (dump_enabled_p ())
1677 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1678 "Unsupported loop-closed phi in "
1679 "outer-loop.\n");
1680 return false;
1683 /* If PHI is used in the outer loop, we check that its operand
1684 is defined in the inner loop. */
1685 if (STMT_VINFO_RELEVANT_P (stmt_info))
1687 tree phi_op;
1688 gimple *op_def_stmt;
1690 if (gimple_phi_num_args (phi) != 1)
1691 return false;
1693 phi_op = PHI_ARG_DEF (phi, 0);
1694 if (TREE_CODE (phi_op) != SSA_NAME)
1695 return false;
1697 op_def_stmt = SSA_NAME_DEF_STMT (phi_op);
1698 if (gimple_nop_p (op_def_stmt)
1699 || !flow_bb_inside_loop_p (loop, gimple_bb (op_def_stmt))
1700 || !vinfo_for_stmt (op_def_stmt))
1701 return false;
1703 if (STMT_VINFO_RELEVANT (vinfo_for_stmt (op_def_stmt))
1704 != vect_used_in_outer
1705 && STMT_VINFO_RELEVANT (vinfo_for_stmt (op_def_stmt))
1706 != vect_used_in_outer_by_reduction)
1707 return false;
1710 continue;
1713 gcc_assert (stmt_info);
1715 if ((STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope
1716 || STMT_VINFO_LIVE_P (stmt_info))
1717 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
1719 /* A scalar-dependence cycle that we don't support. */
1720 if (dump_enabled_p ())
1721 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1722 "not vectorized: scalar dependence cycle.\n");
1723 return false;
1726 if (STMT_VINFO_RELEVANT_P (stmt_info))
1728 need_to_vectorize = true;
1729 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
1730 && ! PURE_SLP_STMT (stmt_info))
1731 ok = vectorizable_induction (phi, NULL, NULL, NULL);
1732 else if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
1733 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
1734 && ! PURE_SLP_STMT (stmt_info))
1735 ok = vectorizable_reduction (phi, NULL, NULL, NULL, NULL);
1738 if (ok && STMT_VINFO_LIVE_P (stmt_info))
1739 ok = vectorizable_live_operation (phi, NULL, NULL, -1, NULL);
1741 if (!ok)
1743 if (dump_enabled_p ())
1745 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1746 "not vectorized: relevant phi not "
1747 "supported: ");
1748 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, phi, 0);
1750 return false;
1754 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1755 gsi_next (&si))
1757 gimple *stmt = gsi_stmt (si);
1758 if (!gimple_clobber_p (stmt)
1759 && !vect_analyze_stmt (stmt, &need_to_vectorize, NULL, NULL))
1760 return false;
1762 } /* bbs */
1764 /* All operations in the loop are either irrelevant (deal with loop
1765 control, or dead), or only used outside the loop and can be moved
1766 out of the loop (e.g. invariants, inductions). The loop can be
1767 optimized away by scalar optimizations. We're better off not
1768 touching this loop. */
1769 if (!need_to_vectorize)
1771 if (dump_enabled_p ())
1772 dump_printf_loc (MSG_NOTE, vect_location,
1773 "All the computation can be taken out of the loop.\n");
1774 if (dump_enabled_p ())
1775 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1776 "not vectorized: redundant loop. no profit to "
1777 "vectorize.\n");
1778 return false;
1781 return true;
1785 /* Function vect_analyze_loop_2.
1787 Apply a set of analyses on LOOP, and create a loop_vec_info struct
1788 for it. The different analyses will record information in the
1789 loop_vec_info struct. */
1790 static bool
1791 vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal)
1793 bool ok;
1794 int max_vf = MAX_VECTORIZATION_FACTOR;
1795 int min_vf = 2;
1796 unsigned int n_stmts = 0;
1798 /* The first group of checks is independent of the vector size. */
1799 fatal = true;
1801 /* Find all data references in the loop (which correspond to vdefs/vuses)
1802 and analyze their evolution in the loop. */
1804 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1806 loop_p loop = LOOP_VINFO_LOOP (loop_vinfo);
1807 if (!find_loop_nest (loop, &LOOP_VINFO_LOOP_NEST (loop_vinfo)))
1809 if (dump_enabled_p ())
1810 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1811 "not vectorized: loop nest containing two "
1812 "or more consecutive inner loops cannot be "
1813 "vectorized\n");
1814 return false;
1817 for (unsigned i = 0; i < loop->num_nodes; i++)
1818 for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
1819 !gsi_end_p (gsi); gsi_next (&gsi))
1821 gimple *stmt = gsi_stmt (gsi);
1822 if (is_gimple_debug (stmt))
1823 continue;
1824 ++n_stmts;
1825 if (!find_data_references_in_stmt (loop, stmt,
1826 &LOOP_VINFO_DATAREFS (loop_vinfo)))
1828 if (is_gimple_call (stmt) && loop->safelen)
1830 tree fndecl = gimple_call_fndecl (stmt), op;
1831 if (fndecl != NULL_TREE)
1833 cgraph_node *node = cgraph_node::get (fndecl);
1834 if (node != NULL && node->simd_clones != NULL)
1836 unsigned int j, n = gimple_call_num_args (stmt);
1837 for (j = 0; j < n; j++)
1839 op = gimple_call_arg (stmt, j);
1840 if (DECL_P (op)
1841 || (REFERENCE_CLASS_P (op)
1842 && get_base_address (op)))
1843 break;
1845 op = gimple_call_lhs (stmt);
1846 /* Ignore #pragma omp declare simd functions
1847 if they don't have data references in the
1848 call stmt itself. */
1849 if (j == n
1850 && !(op
1851 && (DECL_P (op)
1852 || (REFERENCE_CLASS_P (op)
1853 && get_base_address (op)))))
1854 continue;
1858 if (dump_enabled_p ())
1859 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1860 "not vectorized: loop contains function "
1861 "calls or data references that cannot "
1862 "be analyzed\n");
1863 return false;
1867 /* Analyze the data references and also adjust the minimal
1868 vectorization factor according to the loads and stores. */
1870 ok = vect_analyze_data_refs (loop_vinfo, &min_vf);
1871 if (!ok)
1873 if (dump_enabled_p ())
1874 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1875 "bad data references.\n");
1876 return false;
1879 /* Classify all cross-iteration scalar data-flow cycles.
1880 Cross-iteration cycles caused by virtual phis are analyzed separately. */
1881 vect_analyze_scalar_cycles (loop_vinfo);
1883 vect_pattern_recog (loop_vinfo);
1885 vect_fixup_scalar_cycles_with_patterns (loop_vinfo);
1887 /* Analyze the access patterns of the data-refs in the loop (consecutive,
1888 complex, etc.). FORNOW: Only handle consecutive access pattern. */
1890 ok = vect_analyze_data_ref_accesses (loop_vinfo);
1891 if (!ok)
1893 if (dump_enabled_p ())
1894 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1895 "bad data access.\n");
1896 return false;
1899 /* Data-flow analysis to detect stmts that do not need to be vectorized. */
1901 ok = vect_mark_stmts_to_be_vectorized (loop_vinfo);
1902 if (!ok)
1904 if (dump_enabled_p ())
1905 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1906 "unexpected pattern.\n");
1907 return false;
1910 /* While the rest of the analysis below depends on it in some way. */
1911 fatal = false;
1913 /* Analyze data dependences between the data-refs in the loop
1914 and adjust the maximum vectorization factor according to
1915 the dependences.
1916 FORNOW: fail at the first data dependence that we encounter. */
1918 ok = vect_analyze_data_ref_dependences (loop_vinfo, &max_vf);
1919 if (!ok
1920 || max_vf < min_vf)
1922 if (dump_enabled_p ())
1923 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1924 "bad data dependence.\n");
1925 return false;
1927 LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo) = max_vf;
1929 ok = vect_determine_vectorization_factor (loop_vinfo);
1930 if (!ok)
1932 if (dump_enabled_p ())
1933 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1934 "can't determine vectorization factor.\n");
1935 return false;
1937 if (max_vf < LOOP_VINFO_VECT_FACTOR (loop_vinfo))
1939 if (dump_enabled_p ())
1940 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1941 "bad data dependence.\n");
1942 return false;
1945 /* Compute the scalar iteration cost. */
1946 vect_compute_single_scalar_iteration_cost (loop_vinfo);
1948 int saved_vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1949 HOST_WIDE_INT estimated_niter;
1950 unsigned th;
1951 int min_scalar_loop_bound;
1953 /* Check the SLP opportunities in the loop, analyze and build SLP trees. */
1954 ok = vect_analyze_slp (loop_vinfo, n_stmts);
1955 if (!ok)
1956 return false;
1958 /* If there are any SLP instances mark them as pure_slp. */
1959 bool slp = vect_make_slp_decision (loop_vinfo);
1960 if (slp)
1962 /* Find stmts that need to be both vectorized and SLPed. */
1963 vect_detect_hybrid_slp (loop_vinfo);
1965 /* Update the vectorization factor based on the SLP decision. */
1966 vect_update_vf_for_slp (loop_vinfo);
1969 /* This is the point where we can re-start analysis with SLP forced off. */
1970 start_over:
1972 /* Now the vectorization factor is final. */
1973 unsigned vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1974 gcc_assert (vectorization_factor != 0);
1976 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && dump_enabled_p ())
1977 dump_printf_loc (MSG_NOTE, vect_location,
1978 "vectorization_factor = %d, niters = "
1979 HOST_WIDE_INT_PRINT_DEC "\n", vectorization_factor,
1980 LOOP_VINFO_INT_NITERS (loop_vinfo));
1982 HOST_WIDE_INT max_niter
1983 = likely_max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
1984 if ((LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1985 && (LOOP_VINFO_INT_NITERS (loop_vinfo) < vectorization_factor))
1986 || (max_niter != -1
1987 && (unsigned HOST_WIDE_INT) max_niter < vectorization_factor))
1989 if (dump_enabled_p ())
1990 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1991 "not vectorized: iteration count smaller than "
1992 "vectorization factor.\n");
1993 return false;
1996 /* Analyze the alignment of the data-refs in the loop.
1997 Fail if a data reference is found that cannot be vectorized. */
1999 ok = vect_analyze_data_refs_alignment (loop_vinfo);
2000 if (!ok)
2002 if (dump_enabled_p ())
2003 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2004 "bad data alignment.\n");
2005 return false;
2008 /* Prune the list of ddrs to be tested at run-time by versioning for alias.
2009 It is important to call pruning after vect_analyze_data_ref_accesses,
2010 since we use grouping information gathered by interleaving analysis. */
2011 ok = vect_prune_runtime_alias_test_list (loop_vinfo);
2012 if (!ok)
2013 return false;
2015 /* Do not invoke vect_enhance_data_refs_alignment for eplilogue
2016 vectorization. */
2017 if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2019 /* This pass will decide on using loop versioning and/or loop peeling in
2020 order to enhance the alignment of data references in the loop. */
2021 ok = vect_enhance_data_refs_alignment (loop_vinfo);
2022 if (!ok)
2024 if (dump_enabled_p ())
2025 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2026 "bad data alignment.\n");
2027 return false;
2031 if (slp)
2033 /* Analyze operations in the SLP instances. Note this may
2034 remove unsupported SLP instances which makes the above
2035 SLP kind detection invalid. */
2036 unsigned old_size = LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length ();
2037 vect_slp_analyze_operations (loop_vinfo);
2038 if (LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length () != old_size)
2039 goto again;
2042 /* Scan all the remaining operations in the loop that are not subject
2043 to SLP and make sure they are vectorizable. */
2044 ok = vect_analyze_loop_operations (loop_vinfo);
2045 if (!ok)
2047 if (dump_enabled_p ())
2048 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2049 "bad operation or unsupported loop bound.\n");
2050 return false;
2053 /* If epilog loop is required because of data accesses with gaps,
2054 one additional iteration needs to be peeled. Check if there is
2055 enough iterations for vectorization. */
2056 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2057 && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
2059 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2060 tree scalar_niters = LOOP_VINFO_NITERSM1 (loop_vinfo);
2062 if (wi::to_widest (scalar_niters) < vf)
2064 if (dump_enabled_p ())
2065 dump_printf_loc (MSG_NOTE, vect_location,
2066 "loop has no enough iterations to support"
2067 " peeling for gaps.\n");
2068 return false;
2072 /* Analyze cost. Decide if worth while to vectorize. */
2073 int min_profitable_estimate, min_profitable_iters;
2074 vect_estimate_min_profitable_iters (loop_vinfo, &min_profitable_iters,
2075 &min_profitable_estimate);
2077 if (min_profitable_iters < 0)
2079 if (dump_enabled_p ())
2080 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2081 "not vectorized: vectorization not profitable.\n");
2082 if (dump_enabled_p ())
2083 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2084 "not vectorized: vector version will never be "
2085 "profitable.\n");
2086 goto again;
2089 min_scalar_loop_bound = (PARAM_VALUE (PARAM_MIN_VECT_LOOP_BOUND)
2090 * vectorization_factor);
2092 /* Use the cost model only if it is more conservative than user specified
2093 threshold. */
2094 th = (unsigned) MAX (min_scalar_loop_bound, min_profitable_iters);
2096 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = th;
2098 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2099 && LOOP_VINFO_INT_NITERS (loop_vinfo) < th)
2101 if (dump_enabled_p ())
2102 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2103 "not vectorized: vectorization not profitable.\n");
2104 if (dump_enabled_p ())
2105 dump_printf_loc (MSG_NOTE, vect_location,
2106 "not vectorized: iteration count smaller than user "
2107 "specified loop bound parameter or minimum profitable "
2108 "iterations (whichever is more conservative).\n");
2109 goto again;
2112 estimated_niter
2113 = estimated_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
2114 if (estimated_niter == -1)
2115 estimated_niter = max_niter;
2116 if (estimated_niter != -1
2117 && ((unsigned HOST_WIDE_INT) estimated_niter
2118 < MAX (th, (unsigned) min_profitable_estimate)))
2120 if (dump_enabled_p ())
2121 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2122 "not vectorized: estimated iteration count too "
2123 "small.\n");
2124 if (dump_enabled_p ())
2125 dump_printf_loc (MSG_NOTE, vect_location,
2126 "not vectorized: estimated iteration count smaller "
2127 "than specified loop bound parameter or minimum "
2128 "profitable iterations (whichever is more "
2129 "conservative).\n");
2130 goto again;
2133 /* Decide whether we need to create an epilogue loop to handle
2134 remaining scalar iterations. */
2135 th = ((LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo)
2136 / LOOP_VINFO_VECT_FACTOR (loop_vinfo))
2137 * LOOP_VINFO_VECT_FACTOR (loop_vinfo));
2139 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2140 && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) > 0)
2142 if (ctz_hwi (LOOP_VINFO_INT_NITERS (loop_vinfo)
2143 - LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo))
2144 < exact_log2 (LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
2145 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
2147 else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
2148 || (tree_ctz (LOOP_VINFO_NITERS (loop_vinfo))
2149 < (unsigned)exact_log2 (LOOP_VINFO_VECT_FACTOR (loop_vinfo))
2150 /* In case of versioning, check if the maximum number of
2151 iterations is greater than th. If they are identical,
2152 the epilogue is unnecessary. */
2153 && (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
2154 || (unsigned HOST_WIDE_INT) max_niter > th)))
2155 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
2157 /* If an epilogue loop is required make sure we can create one. */
2158 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2159 || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo))
2161 if (dump_enabled_p ())
2162 dump_printf_loc (MSG_NOTE, vect_location, "epilog loop required\n");
2163 if (!vect_can_advance_ivs_p (loop_vinfo)
2164 || !slpeel_can_duplicate_loop_p (LOOP_VINFO_LOOP (loop_vinfo),
2165 single_exit (LOOP_VINFO_LOOP
2166 (loop_vinfo))))
2168 if (dump_enabled_p ())
2169 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2170 "not vectorized: can't create required "
2171 "epilog loop\n");
2172 goto again;
2176 /* During peeling, we need to check if number of loop iterations is
2177 enough for both peeled prolog loop and vector loop. This check
2178 can be merged along with threshold check of loop versioning, so
2179 increase threshold for this case if necessary. */
2180 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
2182 poly_uint64 niters_th;
2184 /* Niters for peeled prolog loop. */
2185 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
2187 struct data_reference *dr = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
2188 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (DR_STMT (dr)));
2190 niters_th = TYPE_VECTOR_SUBPARTS (vectype) - 1;
2192 else
2193 niters_th = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
2195 /* Niters for at least one iteration of vectorized loop. */
2196 niters_th += LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2197 /* One additional iteration because of peeling for gap. */
2198 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
2199 niters_th += 1;
2200 LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = niters_th;
2203 gcc_assert (vectorization_factor
2204 == (unsigned)LOOP_VINFO_VECT_FACTOR (loop_vinfo));
2206 /* Ok to vectorize! */
2207 return true;
2209 again:
2210 /* Try again with SLP forced off but if we didn't do any SLP there is
2211 no point in re-trying. */
2212 if (!slp)
2213 return false;
2215 /* If there are reduction chains re-trying will fail anyway. */
2216 if (! LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).is_empty ())
2217 return false;
2219 /* Likewise if the grouped loads or stores in the SLP cannot be handled
2220 via interleaving or lane instructions. */
2221 slp_instance instance;
2222 slp_tree node;
2223 unsigned i, j;
2224 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
2226 stmt_vec_info vinfo;
2227 vinfo = vinfo_for_stmt
2228 (SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (instance))[0]);
2229 if (! STMT_VINFO_GROUPED_ACCESS (vinfo))
2230 continue;
2231 vinfo = vinfo_for_stmt (STMT_VINFO_GROUP_FIRST_ELEMENT (vinfo));
2232 unsigned int size = STMT_VINFO_GROUP_SIZE (vinfo);
2233 tree vectype = STMT_VINFO_VECTYPE (vinfo);
2234 if (! vect_store_lanes_supported (vectype, size)
2235 && ! vect_grouped_store_supported (vectype, size))
2236 return false;
2237 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), j, node)
2239 vinfo = vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (node)[0]);
2240 vinfo = vinfo_for_stmt (STMT_VINFO_GROUP_FIRST_ELEMENT (vinfo));
2241 bool single_element_p = !STMT_VINFO_GROUP_NEXT_ELEMENT (vinfo);
2242 size = STMT_VINFO_GROUP_SIZE (vinfo);
2243 vectype = STMT_VINFO_VECTYPE (vinfo);
2244 if (! vect_load_lanes_supported (vectype, size)
2245 && ! vect_grouped_load_supported (vectype, single_element_p,
2246 size))
2247 return false;
2251 if (dump_enabled_p ())
2252 dump_printf_loc (MSG_NOTE, vect_location,
2253 "re-trying with SLP disabled\n");
2255 /* Roll back state appropriately. No SLP this time. */
2256 slp = false;
2257 /* Restore vectorization factor as it were without SLP. */
2258 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = saved_vectorization_factor;
2259 /* Free the SLP instances. */
2260 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), j, instance)
2261 vect_free_slp_instance (instance);
2262 LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
2263 /* Reset SLP type to loop_vect on all stmts. */
2264 for (i = 0; i < LOOP_VINFO_LOOP (loop_vinfo)->num_nodes; ++i)
2266 basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
2267 for (gimple_stmt_iterator si = gsi_start_phis (bb);
2268 !gsi_end_p (si); gsi_next (&si))
2270 stmt_vec_info stmt_info = vinfo_for_stmt (gsi_stmt (si));
2271 STMT_SLP_TYPE (stmt_info) = loop_vect;
2273 for (gimple_stmt_iterator si = gsi_start_bb (bb);
2274 !gsi_end_p (si); gsi_next (&si))
2276 stmt_vec_info stmt_info = vinfo_for_stmt (gsi_stmt (si));
2277 STMT_SLP_TYPE (stmt_info) = loop_vect;
2278 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
2280 stmt_info = vinfo_for_stmt (STMT_VINFO_RELATED_STMT (stmt_info));
2281 STMT_SLP_TYPE (stmt_info) = loop_vect;
2282 for (gimple_stmt_iterator pi
2283 = gsi_start (STMT_VINFO_PATTERN_DEF_SEQ (stmt_info));
2284 !gsi_end_p (pi); gsi_next (&pi))
2286 gimple *pstmt = gsi_stmt (pi);
2287 STMT_SLP_TYPE (vinfo_for_stmt (pstmt)) = loop_vect;
2292 /* Free optimized alias test DDRS. */
2293 LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).release ();
2294 LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).release ();
2295 /* Reset target cost data. */
2296 destroy_cost_data (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo));
2297 LOOP_VINFO_TARGET_COST_DATA (loop_vinfo)
2298 = init_cost (LOOP_VINFO_LOOP (loop_vinfo));
2299 /* Reset assorted flags. */
2300 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
2301 LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = false;
2302 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = 0;
2303 LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = 0;
2305 goto start_over;
2308 /* Function vect_analyze_loop.
2310 Apply a set of analyses on LOOP, and create a loop_vec_info struct
2311 for it. The different analyses will record information in the
2312 loop_vec_info struct. If ORIG_LOOP_VINFO is not NULL epilogue must
2313 be vectorized. */
2314 loop_vec_info
2315 vect_analyze_loop (struct loop *loop, loop_vec_info orig_loop_vinfo)
2317 loop_vec_info loop_vinfo;
2318 unsigned int vector_sizes;
2320 /* Autodetect first vector size we try. */
2321 current_vector_size = 0;
2322 vector_sizes = targetm.vectorize.autovectorize_vector_sizes ();
2324 if (dump_enabled_p ())
2325 dump_printf_loc (MSG_NOTE, vect_location,
2326 "===== analyze_loop_nest =====\n");
2328 if (loop_outer (loop)
2329 && loop_vec_info_for_loop (loop_outer (loop))
2330 && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop))))
2332 if (dump_enabled_p ())
2333 dump_printf_loc (MSG_NOTE, vect_location,
2334 "outer-loop already vectorized.\n");
2335 return NULL;
2338 while (1)
2340 /* Check the CFG characteristics of the loop (nesting, entry/exit). */
2341 loop_vinfo = vect_analyze_loop_form (loop);
2342 if (!loop_vinfo)
2344 if (dump_enabled_p ())
2345 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2346 "bad loop form.\n");
2347 return NULL;
2350 bool fatal = false;
2352 if (orig_loop_vinfo)
2353 LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = orig_loop_vinfo;
2355 if (vect_analyze_loop_2 (loop_vinfo, fatal))
2357 LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1;
2359 return loop_vinfo;
2362 delete loop_vinfo;
2364 vector_sizes &= ~current_vector_size;
2365 if (fatal
2366 || vector_sizes == 0
2367 || current_vector_size == 0)
2368 return NULL;
2370 /* Try the next biggest vector size. */
2371 current_vector_size = 1 << floor_log2 (vector_sizes);
2372 if (dump_enabled_p ())
2373 dump_printf_loc (MSG_NOTE, vect_location,
2374 "***** Re-trying analysis with "
2375 "vector size %d\n", current_vector_size);
2380 /* Function reduction_fn_for_scalar_code
2382 Input:
2383 CODE - tree_code of a reduction operations.
2385 Output:
2386 REDUC_FN - the corresponding internal function to be used to reduce the
2387 vector of partial results into a single scalar result, or IFN_LAST
2388 if the operation is a supported reduction operation, but does not have
2389 such an internal function.
2391 Return FALSE if CODE currently cannot be vectorized as reduction. */
2393 static bool
2394 reduction_fn_for_scalar_code (enum tree_code code, internal_fn *reduc_fn)
2396 switch (code)
2398 case MAX_EXPR:
2399 *reduc_fn = IFN_REDUC_MAX;
2400 return true;
2402 case MIN_EXPR:
2403 *reduc_fn = IFN_REDUC_MIN;
2404 return true;
2406 case PLUS_EXPR:
2407 *reduc_fn = IFN_REDUC_PLUS;
2408 return true;
2410 case MULT_EXPR:
2411 case MINUS_EXPR:
2412 case BIT_IOR_EXPR:
2413 case BIT_XOR_EXPR:
2414 case BIT_AND_EXPR:
2415 *reduc_fn = IFN_LAST;
2416 return true;
2418 default:
2419 return false;
2424 /* Error reporting helper for vect_is_simple_reduction below. GIMPLE statement
2425 STMT is printed with a message MSG. */
2427 static void
2428 report_vect_op (dump_flags_t msg_type, gimple *stmt, const char *msg)
2430 dump_printf_loc (msg_type, vect_location, "%s", msg);
2431 dump_gimple_stmt (msg_type, TDF_SLIM, stmt, 0);
2435 /* Detect SLP reduction of the form:
2437 #a1 = phi <a5, a0>
2438 a2 = operation (a1)
2439 a3 = operation (a2)
2440 a4 = operation (a3)
2441 a5 = operation (a4)
2443 #a = phi <a5>
2445 PHI is the reduction phi node (#a1 = phi <a5, a0> above)
2446 FIRST_STMT is the first reduction stmt in the chain
2447 (a2 = operation (a1)).
2449 Return TRUE if a reduction chain was detected. */
2451 static bool
2452 vect_is_slp_reduction (loop_vec_info loop_info, gimple *phi,
2453 gimple *first_stmt)
2455 struct loop *loop = (gimple_bb (phi))->loop_father;
2456 struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
2457 enum tree_code code;
2458 gimple *current_stmt = NULL, *loop_use_stmt = NULL, *first, *next_stmt;
2459 stmt_vec_info use_stmt_info, current_stmt_info;
2460 tree lhs;
2461 imm_use_iterator imm_iter;
2462 use_operand_p use_p;
2463 int nloop_uses, size = 0, n_out_of_loop_uses;
2464 bool found = false;
2466 if (loop != vect_loop)
2467 return false;
2469 lhs = PHI_RESULT (phi);
2470 code = gimple_assign_rhs_code (first_stmt);
2471 while (1)
2473 nloop_uses = 0;
2474 n_out_of_loop_uses = 0;
2475 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
2477 gimple *use_stmt = USE_STMT (use_p);
2478 if (is_gimple_debug (use_stmt))
2479 continue;
2481 /* Check if we got back to the reduction phi. */
2482 if (use_stmt == phi)
2484 loop_use_stmt = use_stmt;
2485 found = true;
2486 break;
2489 if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2491 loop_use_stmt = use_stmt;
2492 nloop_uses++;
2494 else
2495 n_out_of_loop_uses++;
2497 /* There are can be either a single use in the loop or two uses in
2498 phi nodes. */
2499 if (nloop_uses > 1 || (n_out_of_loop_uses && nloop_uses))
2500 return false;
2503 if (found)
2504 break;
2506 /* We reached a statement with no loop uses. */
2507 if (nloop_uses == 0)
2508 return false;
2510 /* This is a loop exit phi, and we haven't reached the reduction phi. */
2511 if (gimple_code (loop_use_stmt) == GIMPLE_PHI)
2512 return false;
2514 if (!is_gimple_assign (loop_use_stmt)
2515 || code != gimple_assign_rhs_code (loop_use_stmt)
2516 || !flow_bb_inside_loop_p (loop, gimple_bb (loop_use_stmt)))
2517 return false;
2519 /* Insert USE_STMT into reduction chain. */
2520 use_stmt_info = vinfo_for_stmt (loop_use_stmt);
2521 if (current_stmt)
2523 current_stmt_info = vinfo_for_stmt (current_stmt);
2524 GROUP_NEXT_ELEMENT (current_stmt_info) = loop_use_stmt;
2525 GROUP_FIRST_ELEMENT (use_stmt_info)
2526 = GROUP_FIRST_ELEMENT (current_stmt_info);
2528 else
2529 GROUP_FIRST_ELEMENT (use_stmt_info) = loop_use_stmt;
2531 lhs = gimple_assign_lhs (loop_use_stmt);
2532 current_stmt = loop_use_stmt;
2533 size++;
2536 if (!found || loop_use_stmt != phi || size < 2)
2537 return false;
2539 /* Swap the operands, if needed, to make the reduction operand be the second
2540 operand. */
2541 lhs = PHI_RESULT (phi);
2542 next_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (current_stmt));
2543 while (next_stmt)
2545 if (gimple_assign_rhs2 (next_stmt) == lhs)
2547 tree op = gimple_assign_rhs1 (next_stmt);
2548 gimple *def_stmt = NULL;
2550 if (TREE_CODE (op) == SSA_NAME)
2551 def_stmt = SSA_NAME_DEF_STMT (op);
2553 /* Check that the other def is either defined in the loop
2554 ("vect_internal_def"), or it's an induction (defined by a
2555 loop-header phi-node). */
2556 if (def_stmt
2557 && gimple_bb (def_stmt)
2558 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2559 && (is_gimple_assign (def_stmt)
2560 || is_gimple_call (def_stmt)
2561 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2562 == vect_induction_def
2563 || (gimple_code (def_stmt) == GIMPLE_PHI
2564 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2565 == vect_internal_def
2566 && !is_loop_header_bb_p (gimple_bb (def_stmt)))))
2568 lhs = gimple_assign_lhs (next_stmt);
2569 next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
2570 continue;
2573 return false;
2575 else
2577 tree op = gimple_assign_rhs2 (next_stmt);
2578 gimple *def_stmt = NULL;
2580 if (TREE_CODE (op) == SSA_NAME)
2581 def_stmt = SSA_NAME_DEF_STMT (op);
2583 /* Check that the other def is either defined in the loop
2584 ("vect_internal_def"), or it's an induction (defined by a
2585 loop-header phi-node). */
2586 if (def_stmt
2587 && gimple_bb (def_stmt)
2588 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2589 && (is_gimple_assign (def_stmt)
2590 || is_gimple_call (def_stmt)
2591 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2592 == vect_induction_def
2593 || (gimple_code (def_stmt) == GIMPLE_PHI
2594 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2595 == vect_internal_def
2596 && !is_loop_header_bb_p (gimple_bb (def_stmt)))))
2598 if (dump_enabled_p ())
2600 dump_printf_loc (MSG_NOTE, vect_location, "swapping oprnds: ");
2601 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, next_stmt, 0);
2604 swap_ssa_operands (next_stmt,
2605 gimple_assign_rhs1_ptr (next_stmt),
2606 gimple_assign_rhs2_ptr (next_stmt));
2607 update_stmt (next_stmt);
2609 if (CONSTANT_CLASS_P (gimple_assign_rhs1 (next_stmt)))
2610 LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true;
2612 else
2613 return false;
2616 lhs = gimple_assign_lhs (next_stmt);
2617 next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
2620 /* Save the chain for further analysis in SLP detection. */
2621 first = GROUP_FIRST_ELEMENT (vinfo_for_stmt (current_stmt));
2622 LOOP_VINFO_REDUCTION_CHAINS (loop_info).safe_push (first);
2623 GROUP_SIZE (vinfo_for_stmt (first)) = size;
2625 return true;
2629 /* Return true if the reduction PHI in LOOP with latch arg LOOP_ARG and
2630 reduction operation CODE has a handled computation expression. */
2632 bool
2633 check_reduction_path (location_t loc, loop_p loop, gphi *phi, tree loop_arg,
2634 enum tree_code code)
2636 auto_vec<std::pair<ssa_op_iter, use_operand_p> > path;
2637 auto_bitmap visited;
2638 tree lookfor = PHI_RESULT (phi);
2639 ssa_op_iter curri;
2640 use_operand_p curr = op_iter_init_phiuse (&curri, phi, SSA_OP_USE);
2641 while (USE_FROM_PTR (curr) != loop_arg)
2642 curr = op_iter_next_use (&curri);
2643 curri.i = curri.numops;
2646 path.safe_push (std::make_pair (curri, curr));
2647 tree use = USE_FROM_PTR (curr);
2648 if (use == lookfor)
2649 break;
2650 gimple *def = SSA_NAME_DEF_STMT (use);
2651 if (gimple_nop_p (def)
2652 || ! flow_bb_inside_loop_p (loop, gimple_bb (def)))
2654 pop:
2657 std::pair<ssa_op_iter, use_operand_p> x = path.pop ();
2658 curri = x.first;
2659 curr = x.second;
2661 curr = op_iter_next_use (&curri);
2662 /* Skip already visited or non-SSA operands (from iterating
2663 over PHI args). */
2664 while (curr != NULL_USE_OPERAND_P
2665 && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
2666 || ! bitmap_set_bit (visited,
2667 SSA_NAME_VERSION
2668 (USE_FROM_PTR (curr)))));
2670 while (curr == NULL_USE_OPERAND_P && ! path.is_empty ());
2671 if (curr == NULL_USE_OPERAND_P)
2672 break;
2674 else
2676 if (gimple_code (def) == GIMPLE_PHI)
2677 curr = op_iter_init_phiuse (&curri, as_a <gphi *>(def), SSA_OP_USE);
2678 else
2679 curr = op_iter_init_use (&curri, def, SSA_OP_USE);
2680 while (curr != NULL_USE_OPERAND_P
2681 && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
2682 || ! bitmap_set_bit (visited,
2683 SSA_NAME_VERSION
2684 (USE_FROM_PTR (curr)))))
2685 curr = op_iter_next_use (&curri);
2686 if (curr == NULL_USE_OPERAND_P)
2687 goto pop;
2690 while (1);
2691 if (dump_file && (dump_flags & TDF_DETAILS))
2693 dump_printf_loc (MSG_NOTE, loc, "reduction path: ");
2694 unsigned i;
2695 std::pair<ssa_op_iter, use_operand_p> *x;
2696 FOR_EACH_VEC_ELT (path, i, x)
2698 dump_generic_expr (MSG_NOTE, TDF_SLIM, USE_FROM_PTR (x->second));
2699 dump_printf (MSG_NOTE, " ");
2701 dump_printf (MSG_NOTE, "\n");
2704 /* Check whether the reduction path detected is valid. */
2705 bool fail = path.length () == 0;
2706 bool neg = false;
2707 for (unsigned i = 1; i < path.length (); ++i)
2709 gimple *use_stmt = USE_STMT (path[i].second);
2710 tree op = USE_FROM_PTR (path[i].second);
2711 if (! has_single_use (op)
2712 || ! is_gimple_assign (use_stmt))
2714 fail = true;
2715 break;
2717 if (gimple_assign_rhs_code (use_stmt) != code)
2719 if (code == PLUS_EXPR
2720 && gimple_assign_rhs_code (use_stmt) == MINUS_EXPR)
2722 /* Track whether we negate the reduction value each iteration. */
2723 if (gimple_assign_rhs2 (use_stmt) == op)
2724 neg = ! neg;
2726 else
2728 fail = true;
2729 break;
2733 return ! fail && ! neg;
2737 /* Function vect_is_simple_reduction
2739 (1) Detect a cross-iteration def-use cycle that represents a simple
2740 reduction computation. We look for the following pattern:
2742 loop_header:
2743 a1 = phi < a0, a2 >
2744 a3 = ...
2745 a2 = operation (a3, a1)
2749 a3 = ...
2750 loop_header:
2751 a1 = phi < a0, a2 >
2752 a2 = operation (a3, a1)
2754 such that:
2755 1. operation is commutative and associative and it is safe to
2756 change the order of the computation
2757 2. no uses for a2 in the loop (a2 is used out of the loop)
2758 3. no uses of a1 in the loop besides the reduction operation
2759 4. no uses of a1 outside the loop.
2761 Conditions 1,4 are tested here.
2762 Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.
2764 (2) Detect a cross-iteration def-use cycle in nested loops, i.e.,
2765 nested cycles.
2767 (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double
2768 reductions:
2770 a1 = phi < a0, a2 >
2771 inner loop (def of a3)
2772 a2 = phi < a3 >
2774 (4) Detect condition expressions, ie:
2775 for (int i = 0; i < N; i++)
2776 if (a[i] < val)
2777 ret_val = a[i];
2781 static gimple *
2782 vect_is_simple_reduction (loop_vec_info loop_info, gimple *phi,
2783 bool *double_reduc,
2784 bool need_wrapping_integral_overflow,
2785 enum vect_reduction_type *v_reduc_type)
2787 struct loop *loop = (gimple_bb (phi))->loop_father;
2788 struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
2789 gimple *def_stmt, *def1 = NULL, *def2 = NULL, *phi_use_stmt = NULL;
2790 enum tree_code orig_code, code;
2791 tree op1, op2, op3 = NULL_TREE, op4 = NULL_TREE;
2792 tree type;
2793 int nloop_uses;
2794 tree name;
2795 imm_use_iterator imm_iter;
2796 use_operand_p use_p;
2797 bool phi_def;
2799 *double_reduc = false;
2800 *v_reduc_type = TREE_CODE_REDUCTION;
2802 tree phi_name = PHI_RESULT (phi);
2803 /* ??? If there are no uses of the PHI result the inner loop reduction
2804 won't be detected as possibly double-reduction by vectorizable_reduction
2805 because that tries to walk the PHI arg from the preheader edge which
2806 can be constant. See PR60382. */
2807 if (has_zero_uses (phi_name))
2808 return NULL;
2809 nloop_uses = 0;
2810 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, phi_name)
2812 gimple *use_stmt = USE_STMT (use_p);
2813 if (is_gimple_debug (use_stmt))
2814 continue;
2816 if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2818 if (dump_enabled_p ())
2819 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2820 "intermediate value used outside loop.\n");
2822 return NULL;
2825 nloop_uses++;
2826 if (nloop_uses > 1)
2828 if (dump_enabled_p ())
2829 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2830 "reduction value used in loop.\n");
2831 return NULL;
2834 phi_use_stmt = use_stmt;
2837 edge latch_e = loop_latch_edge (loop);
2838 tree loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
2839 if (TREE_CODE (loop_arg) != SSA_NAME)
2841 if (dump_enabled_p ())
2843 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2844 "reduction: not ssa_name: ");
2845 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, loop_arg);
2846 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
2848 return NULL;
2851 def_stmt = SSA_NAME_DEF_STMT (loop_arg);
2852 if (is_gimple_assign (def_stmt))
2854 name = gimple_assign_lhs (def_stmt);
2855 phi_def = false;
2857 else if (gimple_code (def_stmt) == GIMPLE_PHI)
2859 name = PHI_RESULT (def_stmt);
2860 phi_def = true;
2862 else
2864 if (dump_enabled_p ())
2866 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2867 "reduction: unhandled reduction operation: ");
2868 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, def_stmt, 0);
2870 return NULL;
2873 if (! flow_bb_inside_loop_p (loop, gimple_bb (def_stmt)))
2874 return NULL;
2876 nloop_uses = 0;
2877 auto_vec<gphi *, 3> lcphis;
2878 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name)
2880 gimple *use_stmt = USE_STMT (use_p);
2881 if (is_gimple_debug (use_stmt))
2882 continue;
2883 if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2884 nloop_uses++;
2885 else
2886 /* We can have more than one loop-closed PHI. */
2887 lcphis.safe_push (as_a <gphi *> (use_stmt));
2888 if (nloop_uses > 1)
2890 if (dump_enabled_p ())
2891 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2892 "reduction used in loop.\n");
2893 return NULL;
2897 /* If DEF_STMT is a phi node itself, we expect it to have a single argument
2898 defined in the inner loop. */
2899 if (phi_def)
2901 op1 = PHI_ARG_DEF (def_stmt, 0);
2903 if (gimple_phi_num_args (def_stmt) != 1
2904 || TREE_CODE (op1) != SSA_NAME)
2906 if (dump_enabled_p ())
2907 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2908 "unsupported phi node definition.\n");
2910 return NULL;
2913 def1 = SSA_NAME_DEF_STMT (op1);
2914 if (gimple_bb (def1)
2915 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2916 && loop->inner
2917 && flow_bb_inside_loop_p (loop->inner, gimple_bb (def1))
2918 && is_gimple_assign (def1)
2919 && flow_bb_inside_loop_p (loop->inner, gimple_bb (phi_use_stmt)))
2921 if (dump_enabled_p ())
2922 report_vect_op (MSG_NOTE, def_stmt,
2923 "detected double reduction: ");
2925 *double_reduc = true;
2926 return def_stmt;
2929 return NULL;
2932 /* If we are vectorizing an inner reduction we are executing that
2933 in the original order only in case we are not dealing with a
2934 double reduction. */
2935 bool check_reduction = true;
2936 if (flow_loop_nested_p (vect_loop, loop))
2938 gphi *lcphi;
2939 unsigned i;
2940 check_reduction = false;
2941 FOR_EACH_VEC_ELT (lcphis, i, lcphi)
2942 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, gimple_phi_result (lcphi))
2944 gimple *use_stmt = USE_STMT (use_p);
2945 if (is_gimple_debug (use_stmt))
2946 continue;
2947 if (! flow_bb_inside_loop_p (vect_loop, gimple_bb (use_stmt)))
2948 check_reduction = true;
2952 bool nested_in_vect_loop = flow_loop_nested_p (vect_loop, loop);
2953 code = orig_code = gimple_assign_rhs_code (def_stmt);
2955 /* We can handle "res -= x[i]", which is non-associative by
2956 simply rewriting this into "res += -x[i]". Avoid changing
2957 gimple instruction for the first simple tests and only do this
2958 if we're allowed to change code at all. */
2959 if (code == MINUS_EXPR && gimple_assign_rhs2 (def_stmt) != phi_name)
2960 code = PLUS_EXPR;
2962 if (code == COND_EXPR)
2964 if (! nested_in_vect_loop)
2965 *v_reduc_type = COND_REDUCTION;
2967 op3 = gimple_assign_rhs1 (def_stmt);
2968 if (COMPARISON_CLASS_P (op3))
2970 op4 = TREE_OPERAND (op3, 1);
2971 op3 = TREE_OPERAND (op3, 0);
2973 if (op3 == phi_name || op4 == phi_name)
2975 if (dump_enabled_p ())
2976 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2977 "reduction: condition depends on previous"
2978 " iteration: ");
2979 return NULL;
2982 op1 = gimple_assign_rhs2 (def_stmt);
2983 op2 = gimple_assign_rhs3 (def_stmt);
2985 else if (!commutative_tree_code (code) || !associative_tree_code (code))
2987 if (dump_enabled_p ())
2988 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2989 "reduction: not commutative/associative: ");
2990 return NULL;
2992 else if (get_gimple_rhs_class (code) == GIMPLE_BINARY_RHS)
2994 op1 = gimple_assign_rhs1 (def_stmt);
2995 op2 = gimple_assign_rhs2 (def_stmt);
2997 else
2999 if (dump_enabled_p ())
3000 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3001 "reduction: not handled operation: ");
3002 return NULL;
3005 if (TREE_CODE (op1) != SSA_NAME && TREE_CODE (op2) != SSA_NAME)
3007 if (dump_enabled_p ())
3008 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3009 "reduction: both uses not ssa_names: ");
3011 return NULL;
3014 type = TREE_TYPE (gimple_assign_lhs (def_stmt));
3015 if ((TREE_CODE (op1) == SSA_NAME
3016 && !types_compatible_p (type,TREE_TYPE (op1)))
3017 || (TREE_CODE (op2) == SSA_NAME
3018 && !types_compatible_p (type, TREE_TYPE (op2)))
3019 || (op3 && TREE_CODE (op3) == SSA_NAME
3020 && !types_compatible_p (type, TREE_TYPE (op3)))
3021 || (op4 && TREE_CODE (op4) == SSA_NAME
3022 && !types_compatible_p (type, TREE_TYPE (op4))))
3024 if (dump_enabled_p ())
3026 dump_printf_loc (MSG_NOTE, vect_location,
3027 "reduction: multiple types: operation type: ");
3028 dump_generic_expr (MSG_NOTE, TDF_SLIM, type);
3029 dump_printf (MSG_NOTE, ", operands types: ");
3030 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3031 TREE_TYPE (op1));
3032 dump_printf (MSG_NOTE, ",");
3033 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3034 TREE_TYPE (op2));
3035 if (op3)
3037 dump_printf (MSG_NOTE, ",");
3038 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3039 TREE_TYPE (op3));
3042 if (op4)
3044 dump_printf (MSG_NOTE, ",");
3045 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3046 TREE_TYPE (op4));
3048 dump_printf (MSG_NOTE, "\n");
3051 return NULL;
3054 /* Check that it's ok to change the order of the computation.
3055 Generally, when vectorizing a reduction we change the order of the
3056 computation. This may change the behavior of the program in some
3057 cases, so we need to check that this is ok. One exception is when
3058 vectorizing an outer-loop: the inner-loop is executed sequentially,
3059 and therefore vectorizing reductions in the inner-loop during
3060 outer-loop vectorization is safe. */
3062 if (*v_reduc_type != COND_REDUCTION
3063 && check_reduction)
3065 /* CHECKME: check for !flag_finite_math_only too? */
3066 if (SCALAR_FLOAT_TYPE_P (type) && !flag_associative_math)
3068 /* Changing the order of operations changes the semantics. */
3069 if (dump_enabled_p ())
3070 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3071 "reduction: unsafe fp math optimization: ");
3072 return NULL;
3074 else if (INTEGRAL_TYPE_P (type))
3076 if (!operation_no_trapping_overflow (type, code))
3078 /* Changing the order of operations changes the semantics. */
3079 if (dump_enabled_p ())
3080 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3081 "reduction: unsafe int math optimization"
3082 " (overflow traps): ");
3083 return NULL;
3085 if (need_wrapping_integral_overflow
3086 && !TYPE_OVERFLOW_WRAPS (type)
3087 && operation_can_overflow (code))
3089 /* Changing the order of operations changes the semantics. */
3090 if (dump_enabled_p ())
3091 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3092 "reduction: unsafe int math optimization"
3093 " (overflow doesn't wrap): ");
3094 return NULL;
3097 else if (SAT_FIXED_POINT_TYPE_P (type))
3099 /* Changing the order of operations changes the semantics. */
3100 if (dump_enabled_p ())
3101 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3102 "reduction: unsafe fixed-point math optimization: ");
3103 return NULL;
3107 /* Reduction is safe. We're dealing with one of the following:
3108 1) integer arithmetic and no trapv
3109 2) floating point arithmetic, and special flags permit this optimization
3110 3) nested cycle (i.e., outer loop vectorization). */
3111 if (TREE_CODE (op1) == SSA_NAME)
3112 def1 = SSA_NAME_DEF_STMT (op1);
3114 if (TREE_CODE (op2) == SSA_NAME)
3115 def2 = SSA_NAME_DEF_STMT (op2);
3117 if (code != COND_EXPR
3118 && ((!def1 || gimple_nop_p (def1)) && (!def2 || gimple_nop_p (def2))))
3120 if (dump_enabled_p ())
3121 report_vect_op (MSG_NOTE, def_stmt, "reduction: no defs for operands: ");
3122 return NULL;
3125 /* Check that one def is the reduction def, defined by PHI,
3126 the other def is either defined in the loop ("vect_internal_def"),
3127 or it's an induction (defined by a loop-header phi-node). */
3129 if (def2 && def2 == phi
3130 && (code == COND_EXPR
3131 || !def1 || gimple_nop_p (def1)
3132 || !flow_bb_inside_loop_p (loop, gimple_bb (def1))
3133 || (def1 && flow_bb_inside_loop_p (loop, gimple_bb (def1))
3134 && (is_gimple_assign (def1)
3135 || is_gimple_call (def1)
3136 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1))
3137 == vect_induction_def
3138 || (gimple_code (def1) == GIMPLE_PHI
3139 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1))
3140 == vect_internal_def
3141 && !is_loop_header_bb_p (gimple_bb (def1)))))))
3143 if (dump_enabled_p ())
3144 report_vect_op (MSG_NOTE, def_stmt, "detected reduction: ");
3145 return def_stmt;
3148 if (def1 && def1 == phi
3149 && (code == COND_EXPR
3150 || !def2 || gimple_nop_p (def2)
3151 || !flow_bb_inside_loop_p (loop, gimple_bb (def2))
3152 || (def2 && flow_bb_inside_loop_p (loop, gimple_bb (def2))
3153 && (is_gimple_assign (def2)
3154 || is_gimple_call (def2)
3155 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2))
3156 == vect_induction_def
3157 || (gimple_code (def2) == GIMPLE_PHI
3158 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2))
3159 == vect_internal_def
3160 && !is_loop_header_bb_p (gimple_bb (def2)))))))
3162 if (! nested_in_vect_loop && orig_code != MINUS_EXPR)
3164 /* Check if we can swap operands (just for simplicity - so that
3165 the rest of the code can assume that the reduction variable
3166 is always the last (second) argument). */
3167 if (code == COND_EXPR)
3169 /* Swap cond_expr by inverting the condition. */
3170 tree cond_expr = gimple_assign_rhs1 (def_stmt);
3171 enum tree_code invert_code = ERROR_MARK;
3172 enum tree_code cond_code = TREE_CODE (cond_expr);
3174 if (TREE_CODE_CLASS (cond_code) == tcc_comparison)
3176 bool honor_nans = HONOR_NANS (TREE_OPERAND (cond_expr, 0));
3177 invert_code = invert_tree_comparison (cond_code, honor_nans);
3179 if (invert_code != ERROR_MARK)
3181 TREE_SET_CODE (cond_expr, invert_code);
3182 swap_ssa_operands (def_stmt,
3183 gimple_assign_rhs2_ptr (def_stmt),
3184 gimple_assign_rhs3_ptr (def_stmt));
3186 else
3188 if (dump_enabled_p ())
3189 report_vect_op (MSG_NOTE, def_stmt,
3190 "detected reduction: cannot swap operands "
3191 "for cond_expr");
3192 return NULL;
3195 else
3196 swap_ssa_operands (def_stmt, gimple_assign_rhs1_ptr (def_stmt),
3197 gimple_assign_rhs2_ptr (def_stmt));
3199 if (dump_enabled_p ())
3200 report_vect_op (MSG_NOTE, def_stmt,
3201 "detected reduction: need to swap operands: ");
3203 if (CONSTANT_CLASS_P (gimple_assign_rhs1 (def_stmt)))
3204 LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true;
3206 else
3208 if (dump_enabled_p ())
3209 report_vect_op (MSG_NOTE, def_stmt, "detected reduction: ");
3212 return def_stmt;
3215 /* Try to find SLP reduction chain. */
3216 if (! nested_in_vect_loop
3217 && code != COND_EXPR
3218 && orig_code != MINUS_EXPR
3219 && vect_is_slp_reduction (loop_info, phi, def_stmt))
3221 if (dump_enabled_p ())
3222 report_vect_op (MSG_NOTE, def_stmt,
3223 "reduction: detected reduction chain: ");
3225 return def_stmt;
3228 /* Dissolve group eventually half-built by vect_is_slp_reduction. */
3229 gimple *first = GROUP_FIRST_ELEMENT (vinfo_for_stmt (def_stmt));
3230 while (first)
3232 gimple *next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (first));
3233 GROUP_FIRST_ELEMENT (vinfo_for_stmt (first)) = NULL;
3234 GROUP_NEXT_ELEMENT (vinfo_for_stmt (first)) = NULL;
3235 first = next;
3238 /* Look for the expression computing loop_arg from loop PHI result. */
3239 if (check_reduction_path (vect_location, loop, as_a <gphi *> (phi), loop_arg,
3240 code))
3241 return def_stmt;
3243 if (dump_enabled_p ())
3245 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3246 "reduction: unknown pattern: ");
3249 return NULL;
3252 /* Wrapper around vect_is_simple_reduction, which will modify code
3253 in-place if it enables detection of more reductions. Arguments
3254 as there. */
3256 gimple *
3257 vect_force_simple_reduction (loop_vec_info loop_info, gimple *phi,
3258 bool *double_reduc,
3259 bool need_wrapping_integral_overflow)
3261 enum vect_reduction_type v_reduc_type;
3262 gimple *def = vect_is_simple_reduction (loop_info, phi, double_reduc,
3263 need_wrapping_integral_overflow,
3264 &v_reduc_type);
3265 if (def)
3267 stmt_vec_info reduc_def_info = vinfo_for_stmt (phi);
3268 STMT_VINFO_REDUC_TYPE (reduc_def_info) = v_reduc_type;
3269 STMT_VINFO_REDUC_DEF (reduc_def_info) = def;
3270 reduc_def_info = vinfo_for_stmt (def);
3271 STMT_VINFO_REDUC_DEF (reduc_def_info) = phi;
3273 return def;
3276 /* Calculate cost of peeling the loop PEEL_ITERS_PROLOGUE times. */
3278 vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue,
3279 int *peel_iters_epilogue,
3280 stmt_vector_for_cost *scalar_cost_vec,
3281 stmt_vector_for_cost *prologue_cost_vec,
3282 stmt_vector_for_cost *epilogue_cost_vec)
3284 int retval = 0;
3285 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3287 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
3289 *peel_iters_epilogue = vf/2;
3290 if (dump_enabled_p ())
3291 dump_printf_loc (MSG_NOTE, vect_location,
3292 "cost model: epilogue peel iters set to vf/2 "
3293 "because loop iterations are unknown .\n");
3295 /* If peeled iterations are known but number of scalar loop
3296 iterations are unknown, count a taken branch per peeled loop. */
3297 retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
3298 NULL, 0, vect_prologue);
3299 retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
3300 NULL, 0, vect_epilogue);
3302 else
3304 int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
3305 peel_iters_prologue = niters < peel_iters_prologue ?
3306 niters : peel_iters_prologue;
3307 *peel_iters_epilogue = (niters - peel_iters_prologue) % vf;
3308 /* If we need to peel for gaps, but no peeling is required, we have to
3309 peel VF iterations. */
3310 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) && !*peel_iters_epilogue)
3311 *peel_iters_epilogue = vf;
3314 stmt_info_for_cost *si;
3315 int j;
3316 if (peel_iters_prologue)
3317 FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
3319 stmt_vec_info stmt_info
3320 = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
3321 retval += record_stmt_cost (prologue_cost_vec,
3322 si->count * peel_iters_prologue,
3323 si->kind, stmt_info, si->misalign,
3324 vect_prologue);
3326 if (*peel_iters_epilogue)
3327 FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
3329 stmt_vec_info stmt_info
3330 = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
3331 retval += record_stmt_cost (epilogue_cost_vec,
3332 si->count * *peel_iters_epilogue,
3333 si->kind, stmt_info, si->misalign,
3334 vect_epilogue);
3337 return retval;
3340 /* Function vect_estimate_min_profitable_iters
3342 Return the number of iterations required for the vector version of the
3343 loop to be profitable relative to the cost of the scalar version of the
3344 loop.
3346 *RET_MIN_PROFITABLE_NITERS is a cost model profitability threshold
3347 of iterations for vectorization. -1 value means loop vectorization
3348 is not profitable. This returned value may be used for dynamic
3349 profitability check.
3351 *RET_MIN_PROFITABLE_ESTIMATE is a profitability threshold to be used
3352 for static check against estimated number of iterations. */
3354 static void
3355 vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
3356 int *ret_min_profitable_niters,
3357 int *ret_min_profitable_estimate)
3359 int min_profitable_iters;
3360 int min_profitable_estimate;
3361 int peel_iters_prologue;
3362 int peel_iters_epilogue;
3363 unsigned vec_inside_cost = 0;
3364 int vec_outside_cost = 0;
3365 unsigned vec_prologue_cost = 0;
3366 unsigned vec_epilogue_cost = 0;
3367 int scalar_single_iter_cost = 0;
3368 int scalar_outside_cost = 0;
3369 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3370 int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
3371 void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3373 /* Cost model disabled. */
3374 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
3376 dump_printf_loc (MSG_NOTE, vect_location, "cost model disabled.\n");
3377 *ret_min_profitable_niters = 0;
3378 *ret_min_profitable_estimate = 0;
3379 return;
3382 /* Requires loop versioning tests to handle misalignment. */
3383 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
3385 /* FIXME: Make cost depend on complexity of individual check. */
3386 unsigned len = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ();
3387 (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
3388 vect_prologue);
3389 dump_printf (MSG_NOTE,
3390 "cost model: Adding cost of checks for loop "
3391 "versioning to treat misalignment.\n");
3394 /* Requires loop versioning with alias checks. */
3395 if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
3397 /* FIXME: Make cost depend on complexity of individual check. */
3398 unsigned len = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).length ();
3399 (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
3400 vect_prologue);
3401 len = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).length ();
3402 if (len)
3403 /* Count LEN - 1 ANDs and LEN comparisons. */
3404 (void) add_stmt_cost (target_cost_data, len * 2 - 1, scalar_stmt,
3405 NULL, 0, vect_prologue);
3406 dump_printf (MSG_NOTE,
3407 "cost model: Adding cost of checks for loop "
3408 "versioning aliasing.\n");
3411 /* Requires loop versioning with niter checks. */
3412 if (LOOP_REQUIRES_VERSIONING_FOR_NITERS (loop_vinfo))
3414 /* FIXME: Make cost depend on complexity of individual check. */
3415 (void) add_stmt_cost (target_cost_data, 1, vector_stmt, NULL, 0,
3416 vect_prologue);
3417 dump_printf (MSG_NOTE,
3418 "cost model: Adding cost of checks for loop "
3419 "versioning niters.\n");
3422 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
3423 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken, NULL, 0,
3424 vect_prologue);
3426 /* Count statements in scalar loop. Using this as scalar cost for a single
3427 iteration for now.
3429 TODO: Add outer loop support.
3431 TODO: Consider assigning different costs to different scalar
3432 statements. */
3434 scalar_single_iter_cost
3435 = LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo);
3437 /* Add additional cost for the peeled instructions in prologue and epilogue
3438 loop.
3440 FORNOW: If we don't know the value of peel_iters for prologue or epilogue
3441 at compile-time - we assume it's vf/2 (the worst would be vf-1).
3443 TODO: Build an expression that represents peel_iters for prologue and
3444 epilogue to be used in a run-time test. */
3446 if (npeel < 0)
3448 peel_iters_prologue = vf/2;
3449 dump_printf (MSG_NOTE, "cost model: "
3450 "prologue peel iters set to vf/2.\n");
3452 /* If peeling for alignment is unknown, loop bound of main loop becomes
3453 unknown. */
3454 peel_iters_epilogue = vf/2;
3455 dump_printf (MSG_NOTE, "cost model: "
3456 "epilogue peel iters set to vf/2 because "
3457 "peeling for alignment is unknown.\n");
3459 /* If peeled iterations are unknown, count a taken branch and a not taken
3460 branch per peeled loop. Even if scalar loop iterations are known,
3461 vector iterations are not known since peeled prologue iterations are
3462 not known. Hence guards remain the same. */
3463 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
3464 NULL, 0, vect_prologue);
3465 (void) add_stmt_cost (target_cost_data, 1, cond_branch_not_taken,
3466 NULL, 0, vect_prologue);
3467 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
3468 NULL, 0, vect_epilogue);
3469 (void) add_stmt_cost (target_cost_data, 1, cond_branch_not_taken,
3470 NULL, 0, vect_epilogue);
3471 stmt_info_for_cost *si;
3472 int j;
3473 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
3475 struct _stmt_vec_info *stmt_info
3476 = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
3477 (void) add_stmt_cost (target_cost_data,
3478 si->count * peel_iters_prologue,
3479 si->kind, stmt_info, si->misalign,
3480 vect_prologue);
3481 (void) add_stmt_cost (target_cost_data,
3482 si->count * peel_iters_epilogue,
3483 si->kind, stmt_info, si->misalign,
3484 vect_epilogue);
3487 else
3489 stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
3490 stmt_info_for_cost *si;
3491 int j;
3492 void *data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3494 prologue_cost_vec.create (2);
3495 epilogue_cost_vec.create (2);
3496 peel_iters_prologue = npeel;
3498 (void) vect_get_known_peeling_cost (loop_vinfo, peel_iters_prologue,
3499 &peel_iters_epilogue,
3500 &LOOP_VINFO_SCALAR_ITERATION_COST
3501 (loop_vinfo),
3502 &prologue_cost_vec,
3503 &epilogue_cost_vec);
3505 FOR_EACH_VEC_ELT (prologue_cost_vec, j, si)
3507 struct _stmt_vec_info *stmt_info
3508 = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
3509 (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
3510 si->misalign, vect_prologue);
3513 FOR_EACH_VEC_ELT (epilogue_cost_vec, j, si)
3515 struct _stmt_vec_info *stmt_info
3516 = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
3517 (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
3518 si->misalign, vect_epilogue);
3521 prologue_cost_vec.release ();
3522 epilogue_cost_vec.release ();
3525 /* FORNOW: The scalar outside cost is incremented in one of the
3526 following ways:
3528 1. The vectorizer checks for alignment and aliasing and generates
3529 a condition that allows dynamic vectorization. A cost model
3530 check is ANDED with the versioning condition. Hence scalar code
3531 path now has the added cost of the versioning check.
3533 if (cost > th & versioning_check)
3534 jmp to vector code
3536 Hence run-time scalar is incremented by not-taken branch cost.
3538 2. The vectorizer then checks if a prologue is required. If the
3539 cost model check was not done before during versioning, it has to
3540 be done before the prologue check.
3542 if (cost <= th)
3543 prologue = scalar_iters
3544 if (prologue == 0)
3545 jmp to vector code
3546 else
3547 execute prologue
3548 if (prologue == num_iters)
3549 go to exit
3551 Hence the run-time scalar cost is incremented by a taken branch,
3552 plus a not-taken branch, plus a taken branch cost.
3554 3. The vectorizer then checks if an epilogue is required. If the
3555 cost model check was not done before during prologue check, it
3556 has to be done with the epilogue check.
3558 if (prologue == 0)
3559 jmp to vector code
3560 else
3561 execute prologue
3562 if (prologue == num_iters)
3563 go to exit
3564 vector code:
3565 if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
3566 jmp to epilogue
3568 Hence the run-time scalar cost should be incremented by 2 taken
3569 branches.
3571 TODO: The back end may reorder the BBS's differently and reverse
3572 conditions/branch directions. Change the estimates below to
3573 something more reasonable. */
3575 /* If the number of iterations is known and we do not do versioning, we can
3576 decide whether to vectorize at compile time. Hence the scalar version
3577 do not carry cost model guard costs. */
3578 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
3579 || LOOP_REQUIRES_VERSIONING (loop_vinfo))
3581 /* Cost model check occurs at versioning. */
3582 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
3583 scalar_outside_cost += vect_get_stmt_cost (cond_branch_not_taken);
3584 else
3586 /* Cost model check occurs at prologue generation. */
3587 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
3588 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken)
3589 + vect_get_stmt_cost (cond_branch_not_taken);
3590 /* Cost model check occurs at epilogue generation. */
3591 else
3592 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken);
3596 /* Complete the target-specific cost calculations. */
3597 finish_cost (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo), &vec_prologue_cost,
3598 &vec_inside_cost, &vec_epilogue_cost);
3600 vec_outside_cost = (int)(vec_prologue_cost + vec_epilogue_cost);
3602 if (dump_enabled_p ())
3604 dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
3605 dump_printf (MSG_NOTE, " Vector inside of loop cost: %d\n",
3606 vec_inside_cost);
3607 dump_printf (MSG_NOTE, " Vector prologue cost: %d\n",
3608 vec_prologue_cost);
3609 dump_printf (MSG_NOTE, " Vector epilogue cost: %d\n",
3610 vec_epilogue_cost);
3611 dump_printf (MSG_NOTE, " Scalar iteration cost: %d\n",
3612 scalar_single_iter_cost);
3613 dump_printf (MSG_NOTE, " Scalar outside cost: %d\n",
3614 scalar_outside_cost);
3615 dump_printf (MSG_NOTE, " Vector outside cost: %d\n",
3616 vec_outside_cost);
3617 dump_printf (MSG_NOTE, " prologue iterations: %d\n",
3618 peel_iters_prologue);
3619 dump_printf (MSG_NOTE, " epilogue iterations: %d\n",
3620 peel_iters_epilogue);
3623 /* Calculate number of iterations required to make the vector version
3624 profitable, relative to the loop bodies only. The following condition
3625 must hold true:
3626 SIC * niters + SOC > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC
3627 where
3628 SIC = scalar iteration cost, VIC = vector iteration cost,
3629 VOC = vector outside cost, VF = vectorization factor,
3630 PL_ITERS = prologue iterations, EP_ITERS= epilogue iterations
3631 SOC = scalar outside cost for run time cost model check. */
3633 if ((scalar_single_iter_cost * vf) > (int) vec_inside_cost)
3635 if (vec_outside_cost <= 0)
3636 min_profitable_iters = 0;
3637 else
3639 min_profitable_iters = ((vec_outside_cost - scalar_outside_cost) * vf
3640 - vec_inside_cost * peel_iters_prologue
3641 - vec_inside_cost * peel_iters_epilogue)
3642 / ((scalar_single_iter_cost * vf)
3643 - vec_inside_cost);
3645 if ((scalar_single_iter_cost * vf * min_profitable_iters)
3646 <= (((int) vec_inside_cost * min_profitable_iters)
3647 + (((int) vec_outside_cost - scalar_outside_cost) * vf)))
3648 min_profitable_iters++;
3651 /* vector version will never be profitable. */
3652 else
3654 if (LOOP_VINFO_LOOP (loop_vinfo)->force_vectorize)
3655 warning_at (vect_location, OPT_Wopenmp_simd, "vectorization "
3656 "did not happen for a simd loop");
3658 if (dump_enabled_p ())
3659 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3660 "cost model: the vector iteration cost = %d "
3661 "divided by the scalar iteration cost = %d "
3662 "is greater or equal to the vectorization factor = %d"
3663 ".\n",
3664 vec_inside_cost, scalar_single_iter_cost, vf);
3665 *ret_min_profitable_niters = -1;
3666 *ret_min_profitable_estimate = -1;
3667 return;
3670 dump_printf (MSG_NOTE,
3671 " Calculated minimum iters for profitability: %d\n",
3672 min_profitable_iters);
3674 /* We want the vectorized loop to execute at least once. */
3675 if (min_profitable_iters < (vf + peel_iters_prologue))
3676 min_profitable_iters = vf + peel_iters_prologue;
3678 if (dump_enabled_p ())
3679 dump_printf_loc (MSG_NOTE, vect_location,
3680 " Runtime profitability threshold = %d\n",
3681 min_profitable_iters);
3683 *ret_min_profitable_niters = min_profitable_iters;
3685 /* Calculate number of iterations required to make the vector version
3686 profitable, relative to the loop bodies only.
3688 Non-vectorized variant is SIC * niters and it must win over vector
3689 variant on the expected loop trip count. The following condition must hold true:
3690 SIC * niters > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC + SOC */
3692 if (vec_outside_cost <= 0)
3693 min_profitable_estimate = 0;
3694 else
3696 min_profitable_estimate = ((vec_outside_cost + scalar_outside_cost) * vf
3697 - vec_inside_cost * peel_iters_prologue
3698 - vec_inside_cost * peel_iters_epilogue)
3699 / ((scalar_single_iter_cost * vf)
3700 - vec_inside_cost);
3702 min_profitable_estimate = MAX (min_profitable_estimate, min_profitable_iters);
3703 if (dump_enabled_p ())
3704 dump_printf_loc (MSG_NOTE, vect_location,
3705 " Static estimate profitability threshold = %d\n",
3706 min_profitable_estimate);
3708 *ret_min_profitable_estimate = min_profitable_estimate;
3711 /* Writes into SEL a mask for a vec_perm, equivalent to a vec_shr by OFFSET
3712 vector elements (not bits) for a vector with NELT elements. */
3713 static void
3714 calc_vec_perm_mask_for_shift (unsigned int offset, unsigned int nelt,
3715 vec_perm_indices *sel)
3717 unsigned int i;
3719 for (i = 0; i < nelt; i++)
3720 sel->quick_push ((i + offset) & (2 * nelt - 1));
3723 /* Checks whether the target supports whole-vector shifts for vectors of mode
3724 MODE. This is the case if _either_ the platform handles vec_shr_optab, _or_
3725 it supports vec_perm_const with masks for all necessary shift amounts. */
3726 static bool
3727 have_whole_vector_shift (machine_mode mode)
3729 if (optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
3730 return true;
3732 if (direct_optab_handler (vec_perm_const_optab, mode) == CODE_FOR_nothing)
3733 return false;
3735 unsigned int i, nelt = GET_MODE_NUNITS (mode);
3736 auto_vec_perm_indices sel (nelt);
3738 for (i = nelt/2; i >= 1; i/=2)
3740 sel.truncate (0);
3741 calc_vec_perm_mask_for_shift (i, nelt, &sel);
3742 if (!can_vec_perm_p (mode, false, &sel))
3743 return false;
3745 return true;
3748 /* TODO: Close dependency between vect_model_*_cost and vectorizable_*
3749 functions. Design better to avoid maintenance issues. */
3751 /* Function vect_model_reduction_cost.
3753 Models cost for a reduction operation, including the vector ops
3754 generated within the strip-mine loop, the initial definition before
3755 the loop, and the epilogue code that must be generated. */
3757 static void
3758 vect_model_reduction_cost (stmt_vec_info stmt_info, internal_fn reduc_fn,
3759 int ncopies)
3761 int prologue_cost = 0, epilogue_cost = 0;
3762 enum tree_code code;
3763 optab optab;
3764 tree vectype;
3765 gimple *orig_stmt;
3766 machine_mode mode;
3767 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3768 struct loop *loop = NULL;
3769 void *target_cost_data;
3771 if (loop_vinfo)
3773 loop = LOOP_VINFO_LOOP (loop_vinfo);
3774 target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3776 else
3777 target_cost_data = BB_VINFO_TARGET_COST_DATA (STMT_VINFO_BB_VINFO (stmt_info));
3779 /* Condition reductions generate two reductions in the loop. */
3780 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION)
3781 ncopies *= 2;
3783 /* Cost of reduction op inside loop. */
3784 unsigned inside_cost = add_stmt_cost (target_cost_data, ncopies, vector_stmt,
3785 stmt_info, 0, vect_body);
3787 vectype = STMT_VINFO_VECTYPE (stmt_info);
3788 mode = TYPE_MODE (vectype);
3789 orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
3791 if (!orig_stmt)
3792 orig_stmt = STMT_VINFO_STMT (stmt_info);
3794 code = gimple_assign_rhs_code (orig_stmt);
3796 /* Add in cost for initial definition.
3797 For cond reduction we have four vectors: initial index, step, initial
3798 result of the data reduction, initial value of the index reduction. */
3799 int prologue_stmts = STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
3800 == COND_REDUCTION ? 4 : 1;
3801 prologue_cost += add_stmt_cost (target_cost_data, prologue_stmts,
3802 scalar_to_vec, stmt_info, 0,
3803 vect_prologue);
3805 /* Determine cost of epilogue code.
3807 We have a reduction operator that will reduce the vector in one statement.
3808 Also requires scalar extract. */
3810 if (!loop || !nested_in_vect_loop_p (loop, orig_stmt))
3812 if (reduc_fn != IFN_LAST)
3814 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION)
3816 /* An EQ stmt and an COND_EXPR stmt. */
3817 epilogue_cost += add_stmt_cost (target_cost_data, 2,
3818 vector_stmt, stmt_info, 0,
3819 vect_epilogue);
3820 /* Reduction of the max index and a reduction of the found
3821 values. */
3822 epilogue_cost += add_stmt_cost (target_cost_data, 2,
3823 vec_to_scalar, stmt_info, 0,
3824 vect_epilogue);
3825 /* A broadcast of the max value. */
3826 epilogue_cost += add_stmt_cost (target_cost_data, 1,
3827 scalar_to_vec, stmt_info, 0,
3828 vect_epilogue);
3830 else
3832 epilogue_cost += add_stmt_cost (target_cost_data, 1, vector_stmt,
3833 stmt_info, 0, vect_epilogue);
3834 epilogue_cost += add_stmt_cost (target_cost_data, 1,
3835 vec_to_scalar, stmt_info, 0,
3836 vect_epilogue);
3839 else if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION)
3841 unsigned nunits = TYPE_VECTOR_SUBPARTS (vectype);
3842 /* Extraction of scalar elements. */
3843 epilogue_cost += add_stmt_cost (target_cost_data, 2 * nunits,
3844 vec_to_scalar, stmt_info, 0,
3845 vect_epilogue);
3846 /* Scalar max reductions via COND_EXPR / MAX_EXPR. */
3847 epilogue_cost += add_stmt_cost (target_cost_data, 2 * nunits - 3,
3848 scalar_stmt, stmt_info, 0,
3849 vect_epilogue);
3851 else
3853 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
3854 tree bitsize =
3855 TYPE_SIZE (TREE_TYPE (gimple_assign_lhs (orig_stmt)));
3856 int element_bitsize = tree_to_uhwi (bitsize);
3857 int nelements = vec_size_in_bits / element_bitsize;
3859 if (code == COND_EXPR)
3860 code = MAX_EXPR;
3862 optab = optab_for_tree_code (code, vectype, optab_default);
3864 /* We have a whole vector shift available. */
3865 if (optab != unknown_optab
3866 && VECTOR_MODE_P (mode)
3867 && optab_handler (optab, mode) != CODE_FOR_nothing
3868 && have_whole_vector_shift (mode))
3870 /* Final reduction via vector shifts and the reduction operator.
3871 Also requires scalar extract. */
3872 epilogue_cost += add_stmt_cost (target_cost_data,
3873 exact_log2 (nelements) * 2,
3874 vector_stmt, stmt_info, 0,
3875 vect_epilogue);
3876 epilogue_cost += add_stmt_cost (target_cost_data, 1,
3877 vec_to_scalar, stmt_info, 0,
3878 vect_epilogue);
3880 else
3881 /* Use extracts and reduction op for final reduction. For N
3882 elements, we have N extracts and N-1 reduction ops. */
3883 epilogue_cost += add_stmt_cost (target_cost_data,
3884 nelements + nelements - 1,
3885 vector_stmt, stmt_info, 0,
3886 vect_epilogue);
3890 if (dump_enabled_p ())
3891 dump_printf (MSG_NOTE,
3892 "vect_model_reduction_cost: inside_cost = %d, "
3893 "prologue_cost = %d, epilogue_cost = %d .\n", inside_cost,
3894 prologue_cost, epilogue_cost);
3898 /* Function vect_model_induction_cost.
3900 Models cost for induction operations. */
3902 static void
3903 vect_model_induction_cost (stmt_vec_info stmt_info, int ncopies)
3905 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3906 void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3907 unsigned inside_cost, prologue_cost;
3909 if (PURE_SLP_STMT (stmt_info))
3910 return;
3912 /* loop cost for vec_loop. */
3913 inside_cost = add_stmt_cost (target_cost_data, ncopies, vector_stmt,
3914 stmt_info, 0, vect_body);
3916 /* prologue cost for vec_init and vec_step. */
3917 prologue_cost = add_stmt_cost (target_cost_data, 2, scalar_to_vec,
3918 stmt_info, 0, vect_prologue);
3920 if (dump_enabled_p ())
3921 dump_printf_loc (MSG_NOTE, vect_location,
3922 "vect_model_induction_cost: inside_cost = %d, "
3923 "prologue_cost = %d .\n", inside_cost, prologue_cost);
3928 /* Function get_initial_def_for_reduction
3930 Input:
3931 STMT - a stmt that performs a reduction operation in the loop.
3932 INIT_VAL - the initial value of the reduction variable
3934 Output:
3935 ADJUSTMENT_DEF - a tree that holds a value to be added to the final result
3936 of the reduction (used for adjusting the epilog - see below).
3937 Return a vector variable, initialized according to the operation that STMT
3938 performs. This vector will be used as the initial value of the
3939 vector of partial results.
3941 Option1 (adjust in epilog): Initialize the vector as follows:
3942 add/bit or/xor: [0,0,...,0,0]
3943 mult/bit and: [1,1,...,1,1]
3944 min/max/cond_expr: [init_val,init_val,..,init_val,init_val]
3945 and when necessary (e.g. add/mult case) let the caller know
3946 that it needs to adjust the result by init_val.
3948 Option2: Initialize the vector as follows:
3949 add/bit or/xor: [init_val,0,0,...,0]
3950 mult/bit and: [init_val,1,1,...,1]
3951 min/max/cond_expr: [init_val,init_val,...,init_val]
3952 and no adjustments are needed.
3954 For example, for the following code:
3956 s = init_val;
3957 for (i=0;i<n;i++)
3958 s = s + a[i];
3960 STMT is 's = s + a[i]', and the reduction variable is 's'.
3961 For a vector of 4 units, we want to return either [0,0,0,init_val],
3962 or [0,0,0,0] and let the caller know that it needs to adjust
3963 the result at the end by 'init_val'.
3965 FORNOW, we are using the 'adjust in epilog' scheme, because this way the
3966 initialization vector is simpler (same element in all entries), if
3967 ADJUSTMENT_DEF is not NULL, and Option2 otherwise.
3969 A cost model should help decide between these two schemes. */
3971 tree
3972 get_initial_def_for_reduction (gimple *stmt, tree init_val,
3973 tree *adjustment_def)
3975 stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
3976 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
3977 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3978 tree scalar_type = TREE_TYPE (init_val);
3979 tree vectype = get_vectype_for_scalar_type (scalar_type);
3980 enum tree_code code = gimple_assign_rhs_code (stmt);
3981 tree def_for_init;
3982 tree init_def;
3983 bool nested_in_vect_loop = false;
3984 REAL_VALUE_TYPE real_init_val = dconst0;
3985 int int_init_val = 0;
3986 gimple *def_stmt = NULL;
3987 gimple_seq stmts = NULL;
3989 gcc_assert (vectype);
3991 gcc_assert (POINTER_TYPE_P (scalar_type) || INTEGRAL_TYPE_P (scalar_type)
3992 || SCALAR_FLOAT_TYPE_P (scalar_type));
3994 if (nested_in_vect_loop_p (loop, stmt))
3995 nested_in_vect_loop = true;
3996 else
3997 gcc_assert (loop == (gimple_bb (stmt))->loop_father);
3999 /* In case of double reduction we only create a vector variable to be put
4000 in the reduction phi node. The actual statement creation is done in
4001 vect_create_epilog_for_reduction. */
4002 if (adjustment_def && nested_in_vect_loop
4003 && TREE_CODE (init_val) == SSA_NAME
4004 && (def_stmt = SSA_NAME_DEF_STMT (init_val))
4005 && gimple_code (def_stmt) == GIMPLE_PHI
4006 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
4007 && vinfo_for_stmt (def_stmt)
4008 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
4009 == vect_double_reduction_def)
4011 *adjustment_def = NULL;
4012 return vect_create_destination_var (init_val, vectype);
4015 /* In case of a nested reduction do not use an adjustment def as
4016 that case is not supported by the epilogue generation correctly
4017 if ncopies is not one. */
4018 if (adjustment_def && nested_in_vect_loop)
4020 *adjustment_def = NULL;
4021 return vect_get_vec_def_for_operand (init_val, stmt);
4024 switch (code)
4026 case WIDEN_SUM_EXPR:
4027 case DOT_PROD_EXPR:
4028 case SAD_EXPR:
4029 case PLUS_EXPR:
4030 case MINUS_EXPR:
4031 case BIT_IOR_EXPR:
4032 case BIT_XOR_EXPR:
4033 case MULT_EXPR:
4034 case BIT_AND_EXPR:
4036 /* ADJUSTMENT_DEF is NULL when called from
4037 vect_create_epilog_for_reduction to vectorize double reduction. */
4038 if (adjustment_def)
4039 *adjustment_def = init_val;
4041 if (code == MULT_EXPR)
4043 real_init_val = dconst1;
4044 int_init_val = 1;
4047 if (code == BIT_AND_EXPR)
4048 int_init_val = -1;
4050 if (SCALAR_FLOAT_TYPE_P (scalar_type))
4051 def_for_init = build_real (scalar_type, real_init_val);
4052 else
4053 def_for_init = build_int_cst (scalar_type, int_init_val);
4055 if (adjustment_def)
4056 /* Option1: the first element is '0' or '1' as well. */
4057 init_def = gimple_build_vector_from_val (&stmts, vectype,
4058 def_for_init);
4059 else
4061 /* Option2: the first element is INIT_VAL. */
4062 tree_vector_builder elts (vectype, 1, 2);
4063 elts.quick_push (init_val);
4064 elts.quick_push (def_for_init);
4065 init_def = gimple_build_vector (&stmts, &elts);
4068 break;
4070 case MIN_EXPR:
4071 case MAX_EXPR:
4072 case COND_EXPR:
4074 if (adjustment_def)
4076 *adjustment_def = NULL_TREE;
4077 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_vinfo) != COND_REDUCTION)
4079 init_def = vect_get_vec_def_for_operand (init_val, stmt);
4080 break;
4083 init_val = gimple_convert (&stmts, TREE_TYPE (vectype), init_val);
4084 init_def = gimple_build_vector_from_val (&stmts, vectype, init_val);
4086 break;
4088 default:
4089 gcc_unreachable ();
4092 if (stmts)
4093 gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), stmts);
4094 return init_def;
4097 /* Get at the initial defs for the reduction PHIs in SLP_NODE.
4098 NUMBER_OF_VECTORS is the number of vector defs to create. */
4100 static void
4101 get_initial_defs_for_reduction (slp_tree slp_node,
4102 vec<tree> *vec_oprnds,
4103 unsigned int number_of_vectors,
4104 enum tree_code code, bool reduc_chain)
4106 vec<gimple *> stmts = SLP_TREE_SCALAR_STMTS (slp_node);
4107 gimple *stmt = stmts[0];
4108 stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
4109 unsigned nunits;
4110 unsigned j, number_of_places_left_in_vector;
4111 tree vector_type, scalar_type;
4112 tree vop;
4113 int group_size = stmts.length ();
4114 unsigned int vec_num, i;
4115 unsigned number_of_copies = 1;
4116 vec<tree> voprnds;
4117 voprnds.create (number_of_vectors);
4118 tree neutral_op = NULL;
4119 struct loop *loop;
4121 vector_type = STMT_VINFO_VECTYPE (stmt_vinfo);
4122 scalar_type = TREE_TYPE (vector_type);
4123 nunits = TYPE_VECTOR_SUBPARTS (vector_type);
4125 gcc_assert (STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_reduction_def);
4127 loop = (gimple_bb (stmt))->loop_father;
4128 gcc_assert (loop);
4129 edge pe = loop_preheader_edge (loop);
4131 /* op is the reduction operand of the first stmt already. */
4132 /* For additional copies (see the explanation of NUMBER_OF_COPIES below)
4133 we need either neutral operands or the original operands. See
4134 get_initial_def_for_reduction() for details. */
4135 switch (code)
4137 case WIDEN_SUM_EXPR:
4138 case DOT_PROD_EXPR:
4139 case SAD_EXPR:
4140 case PLUS_EXPR:
4141 case MINUS_EXPR:
4142 case BIT_IOR_EXPR:
4143 case BIT_XOR_EXPR:
4144 neutral_op = build_zero_cst (scalar_type);
4145 break;
4147 case MULT_EXPR:
4148 neutral_op = build_one_cst (scalar_type);
4149 break;
4151 case BIT_AND_EXPR:
4152 neutral_op = build_all_ones_cst (scalar_type);
4153 break;
4155 /* For MIN/MAX we don't have an easy neutral operand but
4156 the initial values can be used fine here. Only for
4157 a reduction chain we have to force a neutral element. */
4158 case MAX_EXPR:
4159 case MIN_EXPR:
4160 if (! reduc_chain)
4161 neutral_op = NULL;
4162 else
4163 neutral_op = PHI_ARG_DEF_FROM_EDGE (stmt, pe);
4164 break;
4166 default:
4167 gcc_assert (! reduc_chain);
4168 neutral_op = NULL;
4171 /* NUMBER_OF_COPIES is the number of times we need to use the same values in
4172 created vectors. It is greater than 1 if unrolling is performed.
4174 For example, we have two scalar operands, s1 and s2 (e.g., group of
4175 strided accesses of size two), while NUNITS is four (i.e., four scalars
4176 of this type can be packed in a vector). The output vector will contain
4177 two copies of each scalar operand: {s1, s2, s1, s2}. (NUMBER_OF_COPIES
4178 will be 2).
4180 If GROUP_SIZE > NUNITS, the scalars will be split into several vectors
4181 containing the operands.
4183 For example, NUNITS is four as before, and the group size is 8
4184 (s1, s2, ..., s8). We will create two vectors {s1, s2, s3, s4} and
4185 {s5, s6, s7, s8}. */
4187 number_of_copies = nunits * number_of_vectors / group_size;
4189 number_of_places_left_in_vector = nunits;
4190 tree_vector_builder elts (vector_type, nunits, 1);
4191 elts.quick_grow (nunits);
4192 for (j = 0; j < number_of_copies; j++)
4194 for (i = group_size - 1; stmts.iterate (i, &stmt); i--)
4196 tree op;
4197 /* Get the def before the loop. In reduction chain we have only
4198 one initial value. */
4199 if ((j != (number_of_copies - 1)
4200 || (reduc_chain && i != 0))
4201 && neutral_op)
4202 op = neutral_op;
4203 else
4204 op = PHI_ARG_DEF_FROM_EDGE (stmt, pe);
4206 /* Create 'vect_ = {op0,op1,...,opn}'. */
4207 number_of_places_left_in_vector--;
4208 elts[number_of_places_left_in_vector] = op;
4210 if (number_of_places_left_in_vector == 0)
4212 gimple_seq ctor_seq = NULL;
4213 tree init = gimple_build_vector (&ctor_seq, &elts);
4214 if (ctor_seq != NULL)
4215 gsi_insert_seq_on_edge_immediate (pe, ctor_seq);
4216 voprnds.quick_push (init);
4218 number_of_places_left_in_vector = nunits;
4219 elts.new_vector (vector_type, nunits, 1);
4220 elts.quick_grow (nunits);
4225 /* Since the vectors are created in the reverse order, we should invert
4226 them. */
4227 vec_num = voprnds.length ();
4228 for (j = vec_num; j != 0; j--)
4230 vop = voprnds[j - 1];
4231 vec_oprnds->quick_push (vop);
4234 voprnds.release ();
4236 /* In case that VF is greater than the unrolling factor needed for the SLP
4237 group of stmts, NUMBER_OF_VECTORS to be created is greater than
4238 NUMBER_OF_SCALARS/NUNITS or NUNITS/NUMBER_OF_SCALARS, and hence we have
4239 to replicate the vectors. */
4240 tree neutral_vec = NULL;
4241 while (number_of_vectors > vec_oprnds->length ())
4243 if (neutral_op)
4245 if (!neutral_vec)
4247 gimple_seq ctor_seq = NULL;
4248 neutral_vec = gimple_build_vector_from_val
4249 (&ctor_seq, vector_type, neutral_op);
4250 if (ctor_seq != NULL)
4251 gsi_insert_seq_on_edge_immediate (pe, ctor_seq);
4253 vec_oprnds->quick_push (neutral_vec);
4255 else
4257 for (i = 0; vec_oprnds->iterate (i, &vop) && i < vec_num; i++)
4258 vec_oprnds->quick_push (vop);
4264 /* Function vect_create_epilog_for_reduction
4266 Create code at the loop-epilog to finalize the result of a reduction
4267 computation.
4269 VECT_DEFS is list of vector of partial results, i.e., the lhs's of vector
4270 reduction statements.
4271 STMT is the scalar reduction stmt that is being vectorized.
4272 NCOPIES is > 1 in case the vectorization factor (VF) is bigger than the
4273 number of elements that we can fit in a vectype (nunits). In this case
4274 we have to generate more than one vector stmt - i.e - we need to "unroll"
4275 the vector stmt by a factor VF/nunits. For more details see documentation
4276 in vectorizable_operation.
4277 REDUC_FN is the internal function for the epilog reduction.
4278 REDUCTION_PHIS is a list of the phi-nodes that carry the reduction
4279 computation.
4280 REDUC_INDEX is the index of the operand in the right hand side of the
4281 statement that is defined by REDUCTION_PHI.
4282 DOUBLE_REDUC is TRUE if double reduction phi nodes should be handled.
4283 SLP_NODE is an SLP node containing a group of reduction statements. The
4284 first one in this group is STMT.
4285 INDUC_VAL is for INTEGER_INDUC_COND_REDUCTION the value to use for the case
4286 when the COND_EXPR is never true in the loop. For MAX_EXPR, it needs to
4287 be smaller than any value of the IV in the loop, for MIN_EXPR larger than
4288 any value of the IV in the loop.
4289 INDUC_CODE is the code for epilog reduction if INTEGER_INDUC_COND_REDUCTION.
4291 This function:
4292 1. Creates the reduction def-use cycles: sets the arguments for
4293 REDUCTION_PHIS:
4294 The loop-entry argument is the vectorized initial-value of the reduction.
4295 The loop-latch argument is taken from VECT_DEFS - the vector of partial
4296 sums.
4297 2. "Reduces" each vector of partial results VECT_DEFS into a single result,
4298 by calling the function specified by REDUC_FN if available, or by
4299 other means (whole-vector shifts or a scalar loop).
4300 The function also creates a new phi node at the loop exit to preserve
4301 loop-closed form, as illustrated below.
4303 The flow at the entry to this function:
4305 loop:
4306 vec_def = phi <null, null> # REDUCTION_PHI
4307 VECT_DEF = vector_stmt # vectorized form of STMT
4308 s_loop = scalar_stmt # (scalar) STMT
4309 loop_exit:
4310 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
4311 use <s_out0>
4312 use <s_out0>
4314 The above is transformed by this function into:
4316 loop:
4317 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
4318 VECT_DEF = vector_stmt # vectorized form of STMT
4319 s_loop = scalar_stmt # (scalar) STMT
4320 loop_exit:
4321 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
4322 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
4323 v_out2 = reduce <v_out1>
4324 s_out3 = extract_field <v_out2, 0>
4325 s_out4 = adjust_result <s_out3>
4326 use <s_out4>
4327 use <s_out4>
4330 static void
4331 vect_create_epilog_for_reduction (vec<tree> vect_defs, gimple *stmt,
4332 gimple *reduc_def_stmt,
4333 int ncopies, internal_fn reduc_fn,
4334 vec<gimple *> reduction_phis,
4335 bool double_reduc,
4336 slp_tree slp_node,
4337 slp_instance slp_node_instance,
4338 tree induc_val, enum tree_code induc_code)
4340 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4341 stmt_vec_info prev_phi_info;
4342 tree vectype;
4343 machine_mode mode;
4344 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4345 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *outer_loop = NULL;
4346 basic_block exit_bb;
4347 tree scalar_dest;
4348 tree scalar_type;
4349 gimple *new_phi = NULL, *phi;
4350 gimple_stmt_iterator exit_gsi;
4351 tree vec_dest;
4352 tree new_temp = NULL_TREE, new_dest, new_name, new_scalar_dest;
4353 gimple *epilog_stmt = NULL;
4354 enum tree_code code = gimple_assign_rhs_code (stmt);
4355 gimple *exit_phi;
4356 tree bitsize;
4357 tree adjustment_def = NULL;
4358 tree vec_initial_def = NULL;
4359 tree expr, def, initial_def = NULL;
4360 tree orig_name, scalar_result;
4361 imm_use_iterator imm_iter, phi_imm_iter;
4362 use_operand_p use_p, phi_use_p;
4363 gimple *use_stmt, *orig_stmt, *reduction_phi = NULL;
4364 bool nested_in_vect_loop = false;
4365 auto_vec<gimple *> new_phis;
4366 auto_vec<gimple *> inner_phis;
4367 enum vect_def_type dt = vect_unknown_def_type;
4368 int j, i;
4369 auto_vec<tree> scalar_results;
4370 unsigned int group_size = 1, k, ratio;
4371 auto_vec<tree> vec_initial_defs;
4372 auto_vec<gimple *> phis;
4373 bool slp_reduc = false;
4374 tree new_phi_result;
4375 gimple *inner_phi = NULL;
4376 tree induction_index = NULL_TREE;
4378 if (slp_node)
4379 group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
4381 if (nested_in_vect_loop_p (loop, stmt))
4383 outer_loop = loop;
4384 loop = loop->inner;
4385 nested_in_vect_loop = true;
4386 gcc_assert (!slp_node);
4389 vectype = STMT_VINFO_VECTYPE (stmt_info);
4390 gcc_assert (vectype);
4391 mode = TYPE_MODE (vectype);
4393 /* 1. Create the reduction def-use cycle:
4394 Set the arguments of REDUCTION_PHIS, i.e., transform
4396 loop:
4397 vec_def = phi <null, null> # REDUCTION_PHI
4398 VECT_DEF = vector_stmt # vectorized form of STMT
4401 into:
4403 loop:
4404 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
4405 VECT_DEF = vector_stmt # vectorized form of STMT
4408 (in case of SLP, do it for all the phis). */
4410 /* Get the loop-entry arguments. */
4411 enum vect_def_type initial_def_dt = vect_unknown_def_type;
4412 if (slp_node)
4414 unsigned vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
4415 vec_initial_defs.reserve (vec_num);
4416 get_initial_defs_for_reduction (slp_node_instance->reduc_phis,
4417 &vec_initial_defs, vec_num, code,
4418 GROUP_FIRST_ELEMENT (stmt_info));
4420 else
4422 /* Get at the scalar def before the loop, that defines the initial value
4423 of the reduction variable. */
4424 gimple *def_stmt;
4425 initial_def = PHI_ARG_DEF_FROM_EDGE (reduc_def_stmt,
4426 loop_preheader_edge (loop));
4427 /* Optimize: if initial_def is for REDUC_MAX smaller than the base
4428 and we can't use zero for induc_val, use initial_def. Similarly
4429 for REDUC_MIN and initial_def larger than the base. */
4430 if (TREE_CODE (initial_def) == INTEGER_CST
4431 && (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
4432 == INTEGER_INDUC_COND_REDUCTION)
4433 && !integer_zerop (induc_val)
4434 && ((induc_code == MAX_EXPR
4435 && tree_int_cst_lt (initial_def, induc_val))
4436 || (induc_code == MIN_EXPR
4437 && tree_int_cst_lt (induc_val, initial_def))))
4438 induc_val = initial_def;
4439 vect_is_simple_use (initial_def, loop_vinfo, &def_stmt, &initial_def_dt);
4440 vec_initial_def = get_initial_def_for_reduction (stmt, initial_def,
4441 &adjustment_def);
4442 vec_initial_defs.create (1);
4443 vec_initial_defs.quick_push (vec_initial_def);
4446 /* Set phi nodes arguments. */
4447 FOR_EACH_VEC_ELT (reduction_phis, i, phi)
4449 tree vec_init_def = vec_initial_defs[i];
4450 tree def = vect_defs[i];
4451 for (j = 0; j < ncopies; j++)
4453 if (j != 0)
4455 phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi));
4456 if (nested_in_vect_loop)
4457 vec_init_def
4458 = vect_get_vec_def_for_stmt_copy (initial_def_dt,
4459 vec_init_def);
4462 /* Set the loop-entry arg of the reduction-phi. */
4464 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
4465 == INTEGER_INDUC_COND_REDUCTION)
4467 /* Initialise the reduction phi to zero. This prevents initial
4468 values of non-zero interferring with the reduction op. */
4469 gcc_assert (ncopies == 1);
4470 gcc_assert (i == 0);
4472 tree vec_init_def_type = TREE_TYPE (vec_init_def);
4473 tree induc_val_vec
4474 = build_vector_from_val (vec_init_def_type, induc_val);
4476 add_phi_arg (as_a <gphi *> (phi), induc_val_vec,
4477 loop_preheader_edge (loop), UNKNOWN_LOCATION);
4479 else
4480 add_phi_arg (as_a <gphi *> (phi), vec_init_def,
4481 loop_preheader_edge (loop), UNKNOWN_LOCATION);
4483 /* Set the loop-latch arg for the reduction-phi. */
4484 if (j > 0)
4485 def = vect_get_vec_def_for_stmt_copy (vect_unknown_def_type, def);
4487 add_phi_arg (as_a <gphi *> (phi), def, loop_latch_edge (loop),
4488 UNKNOWN_LOCATION);
4490 if (dump_enabled_p ())
4492 dump_printf_loc (MSG_NOTE, vect_location,
4493 "transform reduction: created def-use cycle: ");
4494 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
4495 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, SSA_NAME_DEF_STMT (def), 0);
4500 /* For cond reductions we want to create a new vector (INDEX_COND_EXPR)
4501 which is updated with the current index of the loop for every match of
4502 the original loop's cond_expr (VEC_STMT). This results in a vector
4503 containing the last time the condition passed for that vector lane.
4504 The first match will be a 1 to allow 0 to be used for non-matching
4505 indexes. If there are no matches at all then the vector will be all
4506 zeroes. */
4507 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION)
4509 tree indx_before_incr, indx_after_incr;
4510 int nunits_out = TYPE_VECTOR_SUBPARTS (vectype);
4511 int k;
4513 gimple *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info);
4514 gcc_assert (gimple_assign_rhs_code (vec_stmt) == VEC_COND_EXPR);
4516 int scalar_precision
4517 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (TREE_TYPE (vectype)));
4518 tree cr_index_scalar_type = make_unsigned_type (scalar_precision);
4519 tree cr_index_vector_type = build_vector_type
4520 (cr_index_scalar_type, TYPE_VECTOR_SUBPARTS (vectype));
4522 /* First we create a simple vector induction variable which starts
4523 with the values {1,2,3,...} (SERIES_VECT) and increments by the
4524 vector size (STEP). */
4526 /* Create a {1,2,3,...} vector. */
4527 tree_vector_builder vtemp (cr_index_vector_type, 1, 3);
4528 for (k = 0; k < 3; ++k)
4529 vtemp.quick_push (build_int_cst (cr_index_scalar_type, k + 1));
4530 tree series_vect = vtemp.build ();
4532 /* Create a vector of the step value. */
4533 tree step = build_int_cst (cr_index_scalar_type, nunits_out);
4534 tree vec_step = build_vector_from_val (cr_index_vector_type, step);
4536 /* Create an induction variable. */
4537 gimple_stmt_iterator incr_gsi;
4538 bool insert_after;
4539 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
4540 create_iv (series_vect, vec_step, NULL_TREE, loop, &incr_gsi,
4541 insert_after, &indx_before_incr, &indx_after_incr);
4543 /* Next create a new phi node vector (NEW_PHI_TREE) which starts
4544 filled with zeros (VEC_ZERO). */
4546 /* Create a vector of 0s. */
4547 tree zero = build_zero_cst (cr_index_scalar_type);
4548 tree vec_zero = build_vector_from_val (cr_index_vector_type, zero);
4550 /* Create a vector phi node. */
4551 tree new_phi_tree = make_ssa_name (cr_index_vector_type);
4552 new_phi = create_phi_node (new_phi_tree, loop->header);
4553 set_vinfo_for_stmt (new_phi,
4554 new_stmt_vec_info (new_phi, loop_vinfo));
4555 add_phi_arg (as_a <gphi *> (new_phi), vec_zero,
4556 loop_preheader_edge (loop), UNKNOWN_LOCATION);
4558 /* Now take the condition from the loops original cond_expr
4559 (VEC_STMT) and produce a new cond_expr (INDEX_COND_EXPR) which for
4560 every match uses values from the induction variable
4561 (INDEX_BEFORE_INCR) otherwise uses values from the phi node
4562 (NEW_PHI_TREE).
4563 Finally, we update the phi (NEW_PHI_TREE) to take the value of
4564 the new cond_expr (INDEX_COND_EXPR). */
4566 /* Duplicate the condition from vec_stmt. */
4567 tree ccompare = unshare_expr (gimple_assign_rhs1 (vec_stmt));
4569 /* Create a conditional, where the condition is taken from vec_stmt
4570 (CCOMPARE), then is the induction index (INDEX_BEFORE_INCR) and
4571 else is the phi (NEW_PHI_TREE). */
4572 tree index_cond_expr = build3 (VEC_COND_EXPR, cr_index_vector_type,
4573 ccompare, indx_before_incr,
4574 new_phi_tree);
4575 induction_index = make_ssa_name (cr_index_vector_type);
4576 gimple *index_condition = gimple_build_assign (induction_index,
4577 index_cond_expr);
4578 gsi_insert_before (&incr_gsi, index_condition, GSI_SAME_STMT);
4579 stmt_vec_info index_vec_info = new_stmt_vec_info (index_condition,
4580 loop_vinfo);
4581 STMT_VINFO_VECTYPE (index_vec_info) = cr_index_vector_type;
4582 set_vinfo_for_stmt (index_condition, index_vec_info);
4584 /* Update the phi with the vec cond. */
4585 add_phi_arg (as_a <gphi *> (new_phi), induction_index,
4586 loop_latch_edge (loop), UNKNOWN_LOCATION);
4589 /* 2. Create epilog code.
4590 The reduction epilog code operates across the elements of the vector
4591 of partial results computed by the vectorized loop.
4592 The reduction epilog code consists of:
4594 step 1: compute the scalar result in a vector (v_out2)
4595 step 2: extract the scalar result (s_out3) from the vector (v_out2)
4596 step 3: adjust the scalar result (s_out3) if needed.
4598 Step 1 can be accomplished using one the following three schemes:
4599 (scheme 1) using reduc_fn, if available.
4600 (scheme 2) using whole-vector shifts, if available.
4601 (scheme 3) using a scalar loop. In this case steps 1+2 above are
4602 combined.
4604 The overall epilog code looks like this:
4606 s_out0 = phi <s_loop> # original EXIT_PHI
4607 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
4608 v_out2 = reduce <v_out1> # step 1
4609 s_out3 = extract_field <v_out2, 0> # step 2
4610 s_out4 = adjust_result <s_out3> # step 3
4612 (step 3 is optional, and steps 1 and 2 may be combined).
4613 Lastly, the uses of s_out0 are replaced by s_out4. */
4616 /* 2.1 Create new loop-exit-phis to preserve loop-closed form:
4617 v_out1 = phi <VECT_DEF>
4618 Store them in NEW_PHIS. */
4620 exit_bb = single_exit (loop)->dest;
4621 prev_phi_info = NULL;
4622 new_phis.create (vect_defs.length ());
4623 FOR_EACH_VEC_ELT (vect_defs, i, def)
4625 for (j = 0; j < ncopies; j++)
4627 tree new_def = copy_ssa_name (def);
4628 phi = create_phi_node (new_def, exit_bb);
4629 set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, loop_vinfo));
4630 if (j == 0)
4631 new_phis.quick_push (phi);
4632 else
4634 def = vect_get_vec_def_for_stmt_copy (dt, def);
4635 STMT_VINFO_RELATED_STMT (prev_phi_info) = phi;
4638 SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, def);
4639 prev_phi_info = vinfo_for_stmt (phi);
4643 /* The epilogue is created for the outer-loop, i.e., for the loop being
4644 vectorized. Create exit phis for the outer loop. */
4645 if (double_reduc)
4647 loop = outer_loop;
4648 exit_bb = single_exit (loop)->dest;
4649 inner_phis.create (vect_defs.length ());
4650 FOR_EACH_VEC_ELT (new_phis, i, phi)
4652 tree new_result = copy_ssa_name (PHI_RESULT (phi));
4653 gphi *outer_phi = create_phi_node (new_result, exit_bb);
4654 SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx,
4655 PHI_RESULT (phi));
4656 set_vinfo_for_stmt (outer_phi, new_stmt_vec_info (outer_phi,
4657 loop_vinfo));
4658 inner_phis.quick_push (phi);
4659 new_phis[i] = outer_phi;
4660 prev_phi_info = vinfo_for_stmt (outer_phi);
4661 while (STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi)))
4663 phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi));
4664 new_result = copy_ssa_name (PHI_RESULT (phi));
4665 outer_phi = create_phi_node (new_result, exit_bb);
4666 SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx,
4667 PHI_RESULT (phi));
4668 set_vinfo_for_stmt (outer_phi, new_stmt_vec_info (outer_phi,
4669 loop_vinfo));
4670 STMT_VINFO_RELATED_STMT (prev_phi_info) = outer_phi;
4671 prev_phi_info = vinfo_for_stmt (outer_phi);
4676 exit_gsi = gsi_after_labels (exit_bb);
4678 /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
4679 (i.e. when reduc_fn is not available) and in the final adjustment
4680 code (if needed). Also get the original scalar reduction variable as
4681 defined in the loop. In case STMT is a "pattern-stmt" (i.e. - it
4682 represents a reduction pattern), the tree-code and scalar-def are
4683 taken from the original stmt that the pattern-stmt (STMT) replaces.
4684 Otherwise (it is a regular reduction) - the tree-code and scalar-def
4685 are taken from STMT. */
4687 orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
4688 if (!orig_stmt)
4690 /* Regular reduction */
4691 orig_stmt = stmt;
4693 else
4695 /* Reduction pattern */
4696 stmt_vec_info stmt_vinfo = vinfo_for_stmt (orig_stmt);
4697 gcc_assert (STMT_VINFO_IN_PATTERN_P (stmt_vinfo));
4698 gcc_assert (STMT_VINFO_RELATED_STMT (stmt_vinfo) == stmt);
4701 code = gimple_assign_rhs_code (orig_stmt);
4702 /* For MINUS_EXPR the initial vector is [init_val,0,...,0], therefore,
4703 partial results are added and not subtracted. */
4704 if (code == MINUS_EXPR)
4705 code = PLUS_EXPR;
4707 scalar_dest = gimple_assign_lhs (orig_stmt);
4708 scalar_type = TREE_TYPE (scalar_dest);
4709 scalar_results.create (group_size);
4710 new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
4711 bitsize = TYPE_SIZE (scalar_type);
4713 /* In case this is a reduction in an inner-loop while vectorizing an outer
4714 loop - we don't need to extract a single scalar result at the end of the
4715 inner-loop (unless it is double reduction, i.e., the use of reduction is
4716 outside the outer-loop). The final vector of partial results will be used
4717 in the vectorized outer-loop, or reduced to a scalar result at the end of
4718 the outer-loop. */
4719 if (nested_in_vect_loop && !double_reduc)
4720 goto vect_finalize_reduction;
4722 /* SLP reduction without reduction chain, e.g.,
4723 # a1 = phi <a2, a0>
4724 # b1 = phi <b2, b0>
4725 a2 = operation (a1)
4726 b2 = operation (b1) */
4727 slp_reduc = (slp_node && !GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)));
4729 /* In case of reduction chain, e.g.,
4730 # a1 = phi <a3, a0>
4731 a2 = operation (a1)
4732 a3 = operation (a2),
4734 we may end up with more than one vector result. Here we reduce them to
4735 one vector. */
4736 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
4738 tree first_vect = PHI_RESULT (new_phis[0]);
4739 gassign *new_vec_stmt = NULL;
4740 vec_dest = vect_create_destination_var (scalar_dest, vectype);
4741 for (k = 1; k < new_phis.length (); k++)
4743 gimple *next_phi = new_phis[k];
4744 tree second_vect = PHI_RESULT (next_phi);
4745 tree tem = make_ssa_name (vec_dest, new_vec_stmt);
4746 new_vec_stmt = gimple_build_assign (tem, code,
4747 first_vect, second_vect);
4748 gsi_insert_before (&exit_gsi, new_vec_stmt, GSI_SAME_STMT);
4749 first_vect = tem;
4752 new_phi_result = first_vect;
4753 if (new_vec_stmt)
4755 new_phis.truncate (0);
4756 new_phis.safe_push (new_vec_stmt);
4759 /* Likewise if we couldn't use a single defuse cycle. */
4760 else if (ncopies > 1)
4762 gcc_assert (new_phis.length () == 1);
4763 tree first_vect = PHI_RESULT (new_phis[0]);
4764 gassign *new_vec_stmt = NULL;
4765 vec_dest = vect_create_destination_var (scalar_dest, vectype);
4766 gimple *next_phi = new_phis[0];
4767 for (int k = 1; k < ncopies; ++k)
4769 next_phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (next_phi));
4770 tree second_vect = PHI_RESULT (next_phi);
4771 tree tem = make_ssa_name (vec_dest, new_vec_stmt);
4772 new_vec_stmt = gimple_build_assign (tem, code,
4773 first_vect, second_vect);
4774 gsi_insert_before (&exit_gsi, new_vec_stmt, GSI_SAME_STMT);
4775 first_vect = tem;
4777 new_phi_result = first_vect;
4778 new_phis.truncate (0);
4779 new_phis.safe_push (new_vec_stmt);
4781 else
4782 new_phi_result = PHI_RESULT (new_phis[0]);
4784 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION
4785 && reduc_fn != IFN_LAST)
4787 /* For condition reductions, we have a vector (NEW_PHI_RESULT) containing
4788 various data values where the condition matched and another vector
4789 (INDUCTION_INDEX) containing all the indexes of those matches. We
4790 need to extract the last matching index (which will be the index with
4791 highest value) and use this to index into the data vector.
4792 For the case where there were no matches, the data vector will contain
4793 all default values and the index vector will be all zeros. */
4795 /* Get various versions of the type of the vector of indexes. */
4796 tree index_vec_type = TREE_TYPE (induction_index);
4797 gcc_checking_assert (TYPE_UNSIGNED (index_vec_type));
4798 tree index_scalar_type = TREE_TYPE (index_vec_type);
4799 tree index_vec_cmp_type = build_same_sized_truth_vector_type
4800 (index_vec_type);
4802 /* Get an unsigned integer version of the type of the data vector. */
4803 int scalar_precision
4804 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
4805 tree scalar_type_unsigned = make_unsigned_type (scalar_precision);
4806 tree vectype_unsigned = build_vector_type
4807 (scalar_type_unsigned, TYPE_VECTOR_SUBPARTS (vectype));
4809 /* First we need to create a vector (ZERO_VEC) of zeros and another
4810 vector (MAX_INDEX_VEC) filled with the last matching index, which we
4811 can create using a MAX reduction and then expanding.
4812 In the case where the loop never made any matches, the max index will
4813 be zero. */
4815 /* Vector of {0, 0, 0,...}. */
4816 tree zero_vec = make_ssa_name (vectype);
4817 tree zero_vec_rhs = build_zero_cst (vectype);
4818 gimple *zero_vec_stmt = gimple_build_assign (zero_vec, zero_vec_rhs);
4819 gsi_insert_before (&exit_gsi, zero_vec_stmt, GSI_SAME_STMT);
4821 /* Find maximum value from the vector of found indexes. */
4822 tree max_index = make_ssa_name (index_scalar_type);
4823 gcall *max_index_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
4824 1, induction_index);
4825 gimple_call_set_lhs (max_index_stmt, max_index);
4826 gsi_insert_before (&exit_gsi, max_index_stmt, GSI_SAME_STMT);
4828 /* Vector of {max_index, max_index, max_index,...}. */
4829 tree max_index_vec = make_ssa_name (index_vec_type);
4830 tree max_index_vec_rhs = build_vector_from_val (index_vec_type,
4831 max_index);
4832 gimple *max_index_vec_stmt = gimple_build_assign (max_index_vec,
4833 max_index_vec_rhs);
4834 gsi_insert_before (&exit_gsi, max_index_vec_stmt, GSI_SAME_STMT);
4836 /* Next we compare the new vector (MAX_INDEX_VEC) full of max indexes
4837 with the vector (INDUCTION_INDEX) of found indexes, choosing values
4838 from the data vector (NEW_PHI_RESULT) for matches, 0 (ZERO_VEC)
4839 otherwise. Only one value should match, resulting in a vector
4840 (VEC_COND) with one data value and the rest zeros.
4841 In the case where the loop never made any matches, every index will
4842 match, resulting in a vector with all data values (which will all be
4843 the default value). */
4845 /* Compare the max index vector to the vector of found indexes to find
4846 the position of the max value. */
4847 tree vec_compare = make_ssa_name (index_vec_cmp_type);
4848 gimple *vec_compare_stmt = gimple_build_assign (vec_compare, EQ_EXPR,
4849 induction_index,
4850 max_index_vec);
4851 gsi_insert_before (&exit_gsi, vec_compare_stmt, GSI_SAME_STMT);
4853 /* Use the compare to choose either values from the data vector or
4854 zero. */
4855 tree vec_cond = make_ssa_name (vectype);
4856 gimple *vec_cond_stmt = gimple_build_assign (vec_cond, VEC_COND_EXPR,
4857 vec_compare, new_phi_result,
4858 zero_vec);
4859 gsi_insert_before (&exit_gsi, vec_cond_stmt, GSI_SAME_STMT);
4861 /* Finally we need to extract the data value from the vector (VEC_COND)
4862 into a scalar (MATCHED_DATA_REDUC). Logically we want to do a OR
4863 reduction, but because this doesn't exist, we can use a MAX reduction
4864 instead. The data value might be signed or a float so we need to cast
4865 it first.
4866 In the case where the loop never made any matches, the data values are
4867 all identical, and so will reduce down correctly. */
4869 /* Make the matched data values unsigned. */
4870 tree vec_cond_cast = make_ssa_name (vectype_unsigned);
4871 tree vec_cond_cast_rhs = build1 (VIEW_CONVERT_EXPR, vectype_unsigned,
4872 vec_cond);
4873 gimple *vec_cond_cast_stmt = gimple_build_assign (vec_cond_cast,
4874 VIEW_CONVERT_EXPR,
4875 vec_cond_cast_rhs);
4876 gsi_insert_before (&exit_gsi, vec_cond_cast_stmt, GSI_SAME_STMT);
4878 /* Reduce down to a scalar value. */
4879 tree data_reduc = make_ssa_name (scalar_type_unsigned);
4880 gcall *data_reduc_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
4881 1, vec_cond_cast);
4882 gimple_call_set_lhs (data_reduc_stmt, data_reduc);
4883 gsi_insert_before (&exit_gsi, data_reduc_stmt, GSI_SAME_STMT);
4885 /* Convert the reduced value back to the result type and set as the
4886 result. */
4887 gimple_seq stmts = NULL;
4888 new_temp = gimple_build (&stmts, VIEW_CONVERT_EXPR, scalar_type,
4889 data_reduc);
4890 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
4891 scalar_results.safe_push (new_temp);
4893 else if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION
4894 && reduc_fn == IFN_LAST)
4896 /* Condition reduction without supported IFN_REDUC_MAX. Generate
4897 idx = 0;
4898 idx_val = induction_index[0];
4899 val = data_reduc[0];
4900 for (idx = 0, val = init, i = 0; i < nelts; ++i)
4901 if (induction_index[i] > idx_val)
4902 val = data_reduc[i], idx_val = induction_index[i];
4903 return val; */
4905 tree data_eltype = TREE_TYPE (TREE_TYPE (new_phi_result));
4906 tree idx_eltype = TREE_TYPE (TREE_TYPE (induction_index));
4907 unsigned HOST_WIDE_INT el_size = tree_to_uhwi (TYPE_SIZE (idx_eltype));
4908 unsigned HOST_WIDE_INT v_size
4909 = el_size * TYPE_VECTOR_SUBPARTS (TREE_TYPE (induction_index));
4910 tree idx_val = NULL_TREE, val = NULL_TREE;
4911 for (unsigned HOST_WIDE_INT off = 0; off < v_size; off += el_size)
4913 tree old_idx_val = idx_val;
4914 tree old_val = val;
4915 idx_val = make_ssa_name (idx_eltype);
4916 epilog_stmt = gimple_build_assign (idx_val, BIT_FIELD_REF,
4917 build3 (BIT_FIELD_REF, idx_eltype,
4918 induction_index,
4919 bitsize_int (el_size),
4920 bitsize_int (off)));
4921 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4922 val = make_ssa_name (data_eltype);
4923 epilog_stmt = gimple_build_assign (val, BIT_FIELD_REF,
4924 build3 (BIT_FIELD_REF,
4925 data_eltype,
4926 new_phi_result,
4927 bitsize_int (el_size),
4928 bitsize_int (off)));
4929 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4930 if (off != 0)
4932 tree new_idx_val = idx_val;
4933 tree new_val = val;
4934 if (off != v_size - el_size)
4936 new_idx_val = make_ssa_name (idx_eltype);
4937 epilog_stmt = gimple_build_assign (new_idx_val,
4938 MAX_EXPR, idx_val,
4939 old_idx_val);
4940 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4942 new_val = make_ssa_name (data_eltype);
4943 epilog_stmt = gimple_build_assign (new_val,
4944 COND_EXPR,
4945 build2 (GT_EXPR,
4946 boolean_type_node,
4947 idx_val,
4948 old_idx_val),
4949 val, old_val);
4950 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4951 idx_val = new_idx_val;
4952 val = new_val;
4955 /* Convert the reduced value back to the result type and set as the
4956 result. */
4957 gimple_seq stmts = NULL;
4958 val = gimple_convert (&stmts, scalar_type, val);
4959 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
4960 scalar_results.safe_push (val);
4963 /* 2.3 Create the reduction code, using one of the three schemes described
4964 above. In SLP we simply need to extract all the elements from the
4965 vector (without reducing them), so we use scalar shifts. */
4966 else if (reduc_fn != IFN_LAST && !slp_reduc)
4968 tree tmp;
4969 tree vec_elem_type;
4971 /* Case 1: Create:
4972 v_out2 = reduc_expr <v_out1> */
4974 if (dump_enabled_p ())
4975 dump_printf_loc (MSG_NOTE, vect_location,
4976 "Reduce using direct vector reduction.\n");
4978 vec_elem_type = TREE_TYPE (TREE_TYPE (new_phi_result));
4979 if (!useless_type_conversion_p (scalar_type, vec_elem_type))
4981 tree tmp_dest
4982 = vect_create_destination_var (scalar_dest, vec_elem_type);
4983 epilog_stmt = gimple_build_call_internal (reduc_fn, 1,
4984 new_phi_result);
4985 gimple_set_lhs (epilog_stmt, tmp_dest);
4986 new_temp = make_ssa_name (tmp_dest, epilog_stmt);
4987 gimple_set_lhs (epilog_stmt, new_temp);
4988 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4990 epilog_stmt = gimple_build_assign (new_scalar_dest, NOP_EXPR,
4991 new_temp);
4993 else
4995 epilog_stmt = gimple_build_call_internal (reduc_fn, 1,
4996 new_phi_result);
4997 gimple_set_lhs (epilog_stmt, new_scalar_dest);
5000 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5001 gimple_set_lhs (epilog_stmt, new_temp);
5002 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5004 if ((STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5005 == INTEGER_INDUC_COND_REDUCTION)
5006 && !operand_equal_p (initial_def, induc_val, 0))
5008 /* Earlier we set the initial value to be a vector if induc_val
5009 values. Check the result and if it is induc_val then replace
5010 with the original initial value, unless induc_val is
5011 the same as initial_def already. */
5012 tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp,
5013 induc_val);
5015 tmp = make_ssa_name (new_scalar_dest);
5016 epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
5017 initial_def, new_temp);
5018 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5019 new_temp = tmp;
5022 scalar_results.safe_push (new_temp);
5024 else
5026 bool reduce_with_shift = have_whole_vector_shift (mode);
5027 int element_bitsize = tree_to_uhwi (bitsize);
5028 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
5029 tree vec_temp;
5031 /* COND reductions all do the final reduction with MAX_EXPR
5032 or MIN_EXPR. */
5033 if (code == COND_EXPR)
5035 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5036 == INTEGER_INDUC_COND_REDUCTION)
5037 code = induc_code;
5038 else
5039 code = MAX_EXPR;
5042 /* Regardless of whether we have a whole vector shift, if we're
5043 emulating the operation via tree-vect-generic, we don't want
5044 to use it. Only the first round of the reduction is likely
5045 to still be profitable via emulation. */
5046 /* ??? It might be better to emit a reduction tree code here, so that
5047 tree-vect-generic can expand the first round via bit tricks. */
5048 if (!VECTOR_MODE_P (mode))
5049 reduce_with_shift = false;
5050 else
5052 optab optab = optab_for_tree_code (code, vectype, optab_default);
5053 if (optab_handler (optab, mode) == CODE_FOR_nothing)
5054 reduce_with_shift = false;
5057 if (reduce_with_shift && !slp_reduc)
5059 int nelements = vec_size_in_bits / element_bitsize;
5060 auto_vec_perm_indices sel (nelements);
5062 int elt_offset;
5064 tree zero_vec = build_zero_cst (vectype);
5065 /* Case 2: Create:
5066 for (offset = nelements/2; offset >= 1; offset/=2)
5068 Create: va' = vec_shift <va, offset>
5069 Create: va = vop <va, va'>
5070 } */
5072 tree rhs;
5074 if (dump_enabled_p ())
5075 dump_printf_loc (MSG_NOTE, vect_location,
5076 "Reduce using vector shifts\n");
5078 vec_dest = vect_create_destination_var (scalar_dest, vectype);
5079 new_temp = new_phi_result;
5080 for (elt_offset = nelements / 2;
5081 elt_offset >= 1;
5082 elt_offset /= 2)
5084 sel.truncate (0);
5085 calc_vec_perm_mask_for_shift (elt_offset, nelements, &sel);
5086 tree mask = vect_gen_perm_mask_any (vectype, sel);
5087 epilog_stmt = gimple_build_assign (vec_dest, VEC_PERM_EXPR,
5088 new_temp, zero_vec, mask);
5089 new_name = make_ssa_name (vec_dest, epilog_stmt);
5090 gimple_assign_set_lhs (epilog_stmt, new_name);
5091 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5093 epilog_stmt = gimple_build_assign (vec_dest, code, new_name,
5094 new_temp);
5095 new_temp = make_ssa_name (vec_dest, epilog_stmt);
5096 gimple_assign_set_lhs (epilog_stmt, new_temp);
5097 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5100 /* 2.4 Extract the final scalar result. Create:
5101 s_out3 = extract_field <v_out2, bitpos> */
5103 if (dump_enabled_p ())
5104 dump_printf_loc (MSG_NOTE, vect_location,
5105 "extract scalar result\n");
5107 rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp,
5108 bitsize, bitsize_zero_node);
5109 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
5110 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5111 gimple_assign_set_lhs (epilog_stmt, new_temp);
5112 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5113 scalar_results.safe_push (new_temp);
5115 else
5117 /* Case 3: Create:
5118 s = extract_field <v_out2, 0>
5119 for (offset = element_size;
5120 offset < vector_size;
5121 offset += element_size;)
5123 Create: s' = extract_field <v_out2, offset>
5124 Create: s = op <s, s'> // For non SLP cases
5125 } */
5127 if (dump_enabled_p ())
5128 dump_printf_loc (MSG_NOTE, vect_location,
5129 "Reduce using scalar code.\n");
5131 vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
5132 FOR_EACH_VEC_ELT (new_phis, i, new_phi)
5134 int bit_offset;
5135 if (gimple_code (new_phi) == GIMPLE_PHI)
5136 vec_temp = PHI_RESULT (new_phi);
5137 else
5138 vec_temp = gimple_assign_lhs (new_phi);
5139 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, bitsize,
5140 bitsize_zero_node);
5141 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
5142 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5143 gimple_assign_set_lhs (epilog_stmt, new_temp);
5144 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5146 /* In SLP we don't need to apply reduction operation, so we just
5147 collect s' values in SCALAR_RESULTS. */
5148 if (slp_reduc)
5149 scalar_results.safe_push (new_temp);
5151 for (bit_offset = element_bitsize;
5152 bit_offset < vec_size_in_bits;
5153 bit_offset += element_bitsize)
5155 tree bitpos = bitsize_int (bit_offset);
5156 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp,
5157 bitsize, bitpos);
5159 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
5160 new_name = make_ssa_name (new_scalar_dest, epilog_stmt);
5161 gimple_assign_set_lhs (epilog_stmt, new_name);
5162 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5164 if (slp_reduc)
5166 /* In SLP we don't need to apply reduction operation, so
5167 we just collect s' values in SCALAR_RESULTS. */
5168 new_temp = new_name;
5169 scalar_results.safe_push (new_name);
5171 else
5173 epilog_stmt = gimple_build_assign (new_scalar_dest, code,
5174 new_name, new_temp);
5175 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5176 gimple_assign_set_lhs (epilog_stmt, new_temp);
5177 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5182 /* The only case where we need to reduce scalar results in SLP, is
5183 unrolling. If the size of SCALAR_RESULTS is greater than
5184 GROUP_SIZE, we reduce them combining elements modulo
5185 GROUP_SIZE. */
5186 if (slp_reduc)
5188 tree res, first_res, new_res;
5189 gimple *new_stmt;
5191 /* Reduce multiple scalar results in case of SLP unrolling. */
5192 for (j = group_size; scalar_results.iterate (j, &res);
5193 j++)
5195 first_res = scalar_results[j % group_size];
5196 new_stmt = gimple_build_assign (new_scalar_dest, code,
5197 first_res, res);
5198 new_res = make_ssa_name (new_scalar_dest, new_stmt);
5199 gimple_assign_set_lhs (new_stmt, new_res);
5200 gsi_insert_before (&exit_gsi, new_stmt, GSI_SAME_STMT);
5201 scalar_results[j % group_size] = new_res;
5204 else
5205 /* Not SLP - we have one scalar to keep in SCALAR_RESULTS. */
5206 scalar_results.safe_push (new_temp);
5209 if ((STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5210 == INTEGER_INDUC_COND_REDUCTION)
5211 && !operand_equal_p (initial_def, induc_val, 0))
5213 /* Earlier we set the initial value to be a vector if induc_val
5214 values. Check the result and if it is induc_val then replace
5215 with the original initial value, unless induc_val is
5216 the same as initial_def already. */
5217 tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp,
5218 induc_val);
5220 tree tmp = make_ssa_name (new_scalar_dest);
5221 epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
5222 initial_def, new_temp);
5223 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5224 scalar_results[0] = tmp;
5228 vect_finalize_reduction:
5230 if (double_reduc)
5231 loop = loop->inner;
5233 /* 2.5 Adjust the final result by the initial value of the reduction
5234 variable. (When such adjustment is not needed, then
5235 'adjustment_def' is zero). For example, if code is PLUS we create:
5236 new_temp = loop_exit_def + adjustment_def */
5238 if (adjustment_def)
5240 gcc_assert (!slp_reduc);
5241 if (nested_in_vect_loop)
5243 new_phi = new_phis[0];
5244 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) == VECTOR_TYPE);
5245 expr = build2 (code, vectype, PHI_RESULT (new_phi), adjustment_def);
5246 new_dest = vect_create_destination_var (scalar_dest, vectype);
5248 else
5250 new_temp = scalar_results[0];
5251 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
5252 expr = build2 (code, scalar_type, new_temp, adjustment_def);
5253 new_dest = vect_create_destination_var (scalar_dest, scalar_type);
5256 epilog_stmt = gimple_build_assign (new_dest, expr);
5257 new_temp = make_ssa_name (new_dest, epilog_stmt);
5258 gimple_assign_set_lhs (epilog_stmt, new_temp);
5259 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5260 if (nested_in_vect_loop)
5262 set_vinfo_for_stmt (epilog_stmt,
5263 new_stmt_vec_info (epilog_stmt, loop_vinfo));
5264 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (epilog_stmt)) =
5265 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (new_phi));
5267 if (!double_reduc)
5268 scalar_results.quick_push (new_temp);
5269 else
5270 scalar_results[0] = new_temp;
5272 else
5273 scalar_results[0] = new_temp;
5275 new_phis[0] = epilog_stmt;
5278 /* 2.6 Handle the loop-exit phis. Replace the uses of scalar loop-exit
5279 phis with new adjusted scalar results, i.e., replace use <s_out0>
5280 with use <s_out4>.
5282 Transform:
5283 loop_exit:
5284 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
5285 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
5286 v_out2 = reduce <v_out1>
5287 s_out3 = extract_field <v_out2, 0>
5288 s_out4 = adjust_result <s_out3>
5289 use <s_out0>
5290 use <s_out0>
5292 into:
5294 loop_exit:
5295 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
5296 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
5297 v_out2 = reduce <v_out1>
5298 s_out3 = extract_field <v_out2, 0>
5299 s_out4 = adjust_result <s_out3>
5300 use <s_out4>
5301 use <s_out4> */
5304 /* In SLP reduction chain we reduce vector results into one vector if
5305 necessary, hence we set here GROUP_SIZE to 1. SCALAR_DEST is the LHS of
5306 the last stmt in the reduction chain, since we are looking for the loop
5307 exit phi node. */
5308 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
5310 gimple *dest_stmt = SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1];
5311 /* Handle reduction patterns. */
5312 if (STMT_VINFO_RELATED_STMT (vinfo_for_stmt (dest_stmt)))
5313 dest_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (dest_stmt));
5315 scalar_dest = gimple_assign_lhs (dest_stmt);
5316 group_size = 1;
5319 /* In SLP we may have several statements in NEW_PHIS and REDUCTION_PHIS (in
5320 case that GROUP_SIZE is greater than vectorization factor). Therefore, we
5321 need to match SCALAR_RESULTS with corresponding statements. The first
5322 (GROUP_SIZE / number of new vector stmts) scalar results correspond to
5323 the first vector stmt, etc.
5324 (RATIO is equal to (GROUP_SIZE / number of new vector stmts)). */
5325 if (group_size > new_phis.length ())
5327 ratio = group_size / new_phis.length ();
5328 gcc_assert (!(group_size % new_phis.length ()));
5330 else
5331 ratio = 1;
5333 for (k = 0; k < group_size; k++)
5335 if (k % ratio == 0)
5337 epilog_stmt = new_phis[k / ratio];
5338 reduction_phi = reduction_phis[k / ratio];
5339 if (double_reduc)
5340 inner_phi = inner_phis[k / ratio];
5343 if (slp_reduc)
5345 gimple *current_stmt = SLP_TREE_SCALAR_STMTS (slp_node)[k];
5347 orig_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (current_stmt));
5348 /* SLP statements can't participate in patterns. */
5349 gcc_assert (!orig_stmt);
5350 scalar_dest = gimple_assign_lhs (current_stmt);
5353 phis.create (3);
5354 /* Find the loop-closed-use at the loop exit of the original scalar
5355 result. (The reduction result is expected to have two immediate uses -
5356 one at the latch block, and one at the loop exit). */
5357 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
5358 if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p)))
5359 && !is_gimple_debug (USE_STMT (use_p)))
5360 phis.safe_push (USE_STMT (use_p));
5362 /* While we expect to have found an exit_phi because of loop-closed-ssa
5363 form we can end up without one if the scalar cycle is dead. */
5365 FOR_EACH_VEC_ELT (phis, i, exit_phi)
5367 if (outer_loop)
5369 stmt_vec_info exit_phi_vinfo = vinfo_for_stmt (exit_phi);
5370 gphi *vect_phi;
5372 /* FORNOW. Currently not supporting the case that an inner-loop
5373 reduction is not used in the outer-loop (but only outside the
5374 outer-loop), unless it is double reduction. */
5375 gcc_assert ((STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
5376 && !STMT_VINFO_LIVE_P (exit_phi_vinfo))
5377 || double_reduc);
5379 if (double_reduc)
5380 STMT_VINFO_VEC_STMT (exit_phi_vinfo) = inner_phi;
5381 else
5382 STMT_VINFO_VEC_STMT (exit_phi_vinfo) = epilog_stmt;
5383 if (!double_reduc
5384 || STMT_VINFO_DEF_TYPE (exit_phi_vinfo)
5385 != vect_double_reduction_def)
5386 continue;
5388 /* Handle double reduction:
5390 stmt1: s1 = phi <s0, s2> - double reduction phi (outer loop)
5391 stmt2: s3 = phi <s1, s4> - (regular) reduc phi (inner loop)
5392 stmt3: s4 = use (s3) - (regular) reduc stmt (inner loop)
5393 stmt4: s2 = phi <s4> - double reduction stmt (outer loop)
5395 At that point the regular reduction (stmt2 and stmt3) is
5396 already vectorized, as well as the exit phi node, stmt4.
5397 Here we vectorize the phi node of double reduction, stmt1, and
5398 update all relevant statements. */
5400 /* Go through all the uses of s2 to find double reduction phi
5401 node, i.e., stmt1 above. */
5402 orig_name = PHI_RESULT (exit_phi);
5403 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
5405 stmt_vec_info use_stmt_vinfo;
5406 stmt_vec_info new_phi_vinfo;
5407 tree vect_phi_init, preheader_arg, vect_phi_res;
5408 basic_block bb = gimple_bb (use_stmt);
5409 gimple *use;
5411 /* Check that USE_STMT is really double reduction phi
5412 node. */
5413 if (gimple_code (use_stmt) != GIMPLE_PHI
5414 || gimple_phi_num_args (use_stmt) != 2
5415 || bb->loop_father != outer_loop)
5416 continue;
5417 use_stmt_vinfo = vinfo_for_stmt (use_stmt);
5418 if (!use_stmt_vinfo
5419 || STMT_VINFO_DEF_TYPE (use_stmt_vinfo)
5420 != vect_double_reduction_def)
5421 continue;
5423 /* Create vector phi node for double reduction:
5424 vs1 = phi <vs0, vs2>
5425 vs1 was created previously in this function by a call to
5426 vect_get_vec_def_for_operand and is stored in
5427 vec_initial_def;
5428 vs2 is defined by INNER_PHI, the vectorized EXIT_PHI;
5429 vs0 is created here. */
5431 /* Create vector phi node. */
5432 vect_phi = create_phi_node (vec_initial_def, bb);
5433 new_phi_vinfo = new_stmt_vec_info (vect_phi,
5434 loop_vec_info_for_loop (outer_loop));
5435 set_vinfo_for_stmt (vect_phi, new_phi_vinfo);
5437 /* Create vs0 - initial def of the double reduction phi. */
5438 preheader_arg = PHI_ARG_DEF_FROM_EDGE (use_stmt,
5439 loop_preheader_edge (outer_loop));
5440 vect_phi_init = get_initial_def_for_reduction
5441 (stmt, preheader_arg, NULL);
5443 /* Update phi node arguments with vs0 and vs2. */
5444 add_phi_arg (vect_phi, vect_phi_init,
5445 loop_preheader_edge (outer_loop),
5446 UNKNOWN_LOCATION);
5447 add_phi_arg (vect_phi, PHI_RESULT (inner_phi),
5448 loop_latch_edge (outer_loop), UNKNOWN_LOCATION);
5449 if (dump_enabled_p ())
5451 dump_printf_loc (MSG_NOTE, vect_location,
5452 "created double reduction phi node: ");
5453 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, vect_phi, 0);
5456 vect_phi_res = PHI_RESULT (vect_phi);
5458 /* Replace the use, i.e., set the correct vs1 in the regular
5459 reduction phi node. FORNOW, NCOPIES is always 1, so the
5460 loop is redundant. */
5461 use = reduction_phi;
5462 for (j = 0; j < ncopies; j++)
5464 edge pr_edge = loop_preheader_edge (loop);
5465 SET_PHI_ARG_DEF (use, pr_edge->dest_idx, vect_phi_res);
5466 use = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (use));
5472 phis.release ();
5473 if (nested_in_vect_loop)
5475 if (double_reduc)
5476 loop = outer_loop;
5477 else
5478 continue;
5481 phis.create (3);
5482 /* Find the loop-closed-use at the loop exit of the original scalar
5483 result. (The reduction result is expected to have two immediate uses,
5484 one at the latch block, and one at the loop exit). For double
5485 reductions we are looking for exit phis of the outer loop. */
5486 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
5488 if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
5490 if (!is_gimple_debug (USE_STMT (use_p)))
5491 phis.safe_push (USE_STMT (use_p));
5493 else
5495 if (double_reduc && gimple_code (USE_STMT (use_p)) == GIMPLE_PHI)
5497 tree phi_res = PHI_RESULT (USE_STMT (use_p));
5499 FOR_EACH_IMM_USE_FAST (phi_use_p, phi_imm_iter, phi_res)
5501 if (!flow_bb_inside_loop_p (loop,
5502 gimple_bb (USE_STMT (phi_use_p)))
5503 && !is_gimple_debug (USE_STMT (phi_use_p)))
5504 phis.safe_push (USE_STMT (phi_use_p));
5510 FOR_EACH_VEC_ELT (phis, i, exit_phi)
5512 /* Replace the uses: */
5513 orig_name = PHI_RESULT (exit_phi);
5514 scalar_result = scalar_results[k];
5515 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
5516 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
5517 SET_USE (use_p, scalar_result);
5520 phis.release ();
5525 /* Function is_nonwrapping_integer_induction.
5527 Check if STMT (which is part of loop LOOP) both increments and
5528 does not cause overflow. */
5530 static bool
5531 is_nonwrapping_integer_induction (gimple *stmt, struct loop *loop)
5533 stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
5534 tree base = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo);
5535 tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo);
5536 tree lhs_type = TREE_TYPE (gimple_phi_result (stmt));
5537 widest_int ni, max_loop_value, lhs_max;
5538 bool overflow = false;
5540 /* Make sure the loop is integer based. */
5541 if (TREE_CODE (base) != INTEGER_CST
5542 || TREE_CODE (step) != INTEGER_CST)
5543 return false;
5545 /* Check that the max size of the loop will not wrap. */
5547 if (TYPE_OVERFLOW_UNDEFINED (lhs_type))
5548 return true;
5550 if (! max_stmt_executions (loop, &ni))
5551 return false;
5553 max_loop_value = wi::mul (wi::to_widest (step), ni, TYPE_SIGN (lhs_type),
5554 &overflow);
5555 if (overflow)
5556 return false;
5558 max_loop_value = wi::add (wi::to_widest (base), max_loop_value,
5559 TYPE_SIGN (lhs_type), &overflow);
5560 if (overflow)
5561 return false;
5563 return (wi::min_precision (max_loop_value, TYPE_SIGN (lhs_type))
5564 <= TYPE_PRECISION (lhs_type));
5567 /* Function vectorizable_reduction.
5569 Check if STMT performs a reduction operation that can be vectorized.
5570 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
5571 stmt to replace it, put it in VEC_STMT, and insert it at GSI.
5572 Return FALSE if not a vectorizable STMT, TRUE otherwise.
5574 This function also handles reduction idioms (patterns) that have been
5575 recognized in advance during vect_pattern_recog. In this case, STMT may be
5576 of this form:
5577 X = pattern_expr (arg0, arg1, ..., X)
5578 and it's STMT_VINFO_RELATED_STMT points to the last stmt in the original
5579 sequence that had been detected and replaced by the pattern-stmt (STMT).
5581 This function also handles reduction of condition expressions, for example:
5582 for (int i = 0; i < N; i++)
5583 if (a[i] < value)
5584 last = a[i];
5585 This is handled by vectorising the loop and creating an additional vector
5586 containing the loop indexes for which "a[i] < value" was true. In the
5587 function epilogue this is reduced to a single max value and then used to
5588 index into the vector of results.
5590 In some cases of reduction patterns, the type of the reduction variable X is
5591 different than the type of the other arguments of STMT.
5592 In such cases, the vectype that is used when transforming STMT into a vector
5593 stmt is different than the vectype that is used to determine the
5594 vectorization factor, because it consists of a different number of elements
5595 than the actual number of elements that are being operated upon in parallel.
5597 For example, consider an accumulation of shorts into an int accumulator.
5598 On some targets it's possible to vectorize this pattern operating on 8
5599 shorts at a time (hence, the vectype for purposes of determining the
5600 vectorization factor should be V8HI); on the other hand, the vectype that
5601 is used to create the vector form is actually V4SI (the type of the result).
5603 Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
5604 indicates what is the actual level of parallelism (V8HI in the example), so
5605 that the right vectorization factor would be derived. This vectype
5606 corresponds to the type of arguments to the reduction stmt, and should *NOT*
5607 be used to create the vectorized stmt. The right vectype for the vectorized
5608 stmt is obtained from the type of the result X:
5609 get_vectype_for_scalar_type (TREE_TYPE (X))
5611 This means that, contrary to "regular" reductions (or "regular" stmts in
5612 general), the following equation:
5613 STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (TREE_TYPE (X))
5614 does *NOT* necessarily hold for reduction patterns. */
5616 bool
5617 vectorizable_reduction (gimple *stmt, gimple_stmt_iterator *gsi,
5618 gimple **vec_stmt, slp_tree slp_node,
5619 slp_instance slp_node_instance)
5621 tree vec_dest;
5622 tree scalar_dest;
5623 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5624 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
5625 tree vectype_in = NULL_TREE;
5626 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5627 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5628 enum tree_code code, orig_code;
5629 internal_fn reduc_fn;
5630 machine_mode vec_mode;
5631 int op_type;
5632 optab optab;
5633 tree new_temp = NULL_TREE;
5634 gimple *def_stmt;
5635 enum vect_def_type dt, cond_reduc_dt = vect_unknown_def_type;
5636 gimple *cond_reduc_def_stmt = NULL;
5637 enum tree_code cond_reduc_op_code = ERROR_MARK;
5638 tree scalar_type;
5639 bool is_simple_use;
5640 gimple *orig_stmt;
5641 stmt_vec_info orig_stmt_info = NULL;
5642 int i;
5643 int ncopies;
5644 int epilog_copies;
5645 stmt_vec_info prev_stmt_info, prev_phi_info;
5646 bool single_defuse_cycle = false;
5647 gimple *new_stmt = NULL;
5648 int j;
5649 tree ops[3];
5650 enum vect_def_type dts[3];
5651 bool nested_cycle = false, found_nested_cycle_def = false;
5652 bool double_reduc = false;
5653 basic_block def_bb;
5654 struct loop * def_stmt_loop, *outer_loop = NULL;
5655 tree def_arg;
5656 gimple *def_arg_stmt;
5657 auto_vec<tree> vec_oprnds0;
5658 auto_vec<tree> vec_oprnds1;
5659 auto_vec<tree> vec_oprnds2;
5660 auto_vec<tree> vect_defs;
5661 auto_vec<gimple *> phis;
5662 int vec_num;
5663 tree def0, tem;
5664 bool first_p = true;
5665 tree cr_index_scalar_type = NULL_TREE, cr_index_vector_type = NULL_TREE;
5666 tree cond_reduc_val = NULL_TREE;
5668 /* Make sure it was already recognized as a reduction computation. */
5669 if (STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmt)) != vect_reduction_def
5670 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmt)) != vect_nested_cycle)
5671 return false;
5673 if (nested_in_vect_loop_p (loop, stmt))
5675 outer_loop = loop;
5676 loop = loop->inner;
5677 nested_cycle = true;
5680 /* In case of reduction chain we switch to the first stmt in the chain, but
5681 we don't update STMT_INFO, since only the last stmt is marked as reduction
5682 and has reduction properties. */
5683 if (GROUP_FIRST_ELEMENT (stmt_info)
5684 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
5686 stmt = GROUP_FIRST_ELEMENT (stmt_info);
5687 first_p = false;
5690 if (gimple_code (stmt) == GIMPLE_PHI)
5692 /* Analysis is fully done on the reduction stmt invocation. */
5693 if (! vec_stmt)
5695 if (slp_node)
5696 slp_node_instance->reduc_phis = slp_node;
5698 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
5699 return true;
5702 gimple *reduc_stmt = STMT_VINFO_REDUC_DEF (stmt_info);
5703 if (STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (reduc_stmt)))
5704 reduc_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (reduc_stmt));
5706 gcc_assert (is_gimple_assign (reduc_stmt));
5707 for (unsigned k = 1; k < gimple_num_ops (reduc_stmt); ++k)
5709 tree op = gimple_op (reduc_stmt, k);
5710 if (op == gimple_phi_result (stmt))
5711 continue;
5712 if (k == 1
5713 && gimple_assign_rhs_code (reduc_stmt) == COND_EXPR)
5714 continue;
5715 tem = get_vectype_for_scalar_type (TREE_TYPE (op));
5716 if (! vectype_in
5717 || TYPE_VECTOR_SUBPARTS (tem) < TYPE_VECTOR_SUBPARTS (vectype_in))
5718 vectype_in = tem;
5719 break;
5721 gcc_assert (vectype_in);
5723 if (slp_node)
5724 ncopies = 1;
5725 else
5726 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
5728 use_operand_p use_p;
5729 gimple *use_stmt;
5730 if (ncopies > 1
5731 && (STMT_VINFO_RELEVANT (vinfo_for_stmt (reduc_stmt))
5732 <= vect_used_only_live)
5733 && single_imm_use (gimple_phi_result (stmt), &use_p, &use_stmt)
5734 && (use_stmt == reduc_stmt
5735 || (STMT_VINFO_RELATED_STMT (vinfo_for_stmt (use_stmt))
5736 == reduc_stmt)))
5737 single_defuse_cycle = true;
5739 /* Create the destination vector */
5740 scalar_dest = gimple_assign_lhs (reduc_stmt);
5741 vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
5743 if (slp_node)
5744 /* The size vect_schedule_slp_instance computes is off for us. */
5745 vec_num = ((LOOP_VINFO_VECT_FACTOR (loop_vinfo)
5746 * SLP_TREE_SCALAR_STMTS (slp_node).length ())
5747 / TYPE_VECTOR_SUBPARTS (vectype_in));
5748 else
5749 vec_num = 1;
5751 /* Generate the reduction PHIs upfront. */
5752 prev_phi_info = NULL;
5753 for (j = 0; j < ncopies; j++)
5755 if (j == 0 || !single_defuse_cycle)
5757 for (i = 0; i < vec_num; i++)
5759 /* Create the reduction-phi that defines the reduction
5760 operand. */
5761 gimple *new_phi = create_phi_node (vec_dest, loop->header);
5762 set_vinfo_for_stmt (new_phi,
5763 new_stmt_vec_info (new_phi, loop_vinfo));
5765 if (slp_node)
5766 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_phi);
5767 else
5769 if (j == 0)
5770 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_phi;
5771 else
5772 STMT_VINFO_RELATED_STMT (prev_phi_info) = new_phi;
5773 prev_phi_info = vinfo_for_stmt (new_phi);
5779 return true;
5782 /* 1. Is vectorizable reduction? */
5783 /* Not supportable if the reduction variable is used in the loop, unless
5784 it's a reduction chain. */
5785 if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer
5786 && !GROUP_FIRST_ELEMENT (stmt_info))
5787 return false;
5789 /* Reductions that are not used even in an enclosing outer-loop,
5790 are expected to be "live" (used out of the loop). */
5791 if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope
5792 && !STMT_VINFO_LIVE_P (stmt_info))
5793 return false;
5795 /* 2. Has this been recognized as a reduction pattern?
5797 Check if STMT represents a pattern that has been recognized
5798 in earlier analysis stages. For stmts that represent a pattern,
5799 the STMT_VINFO_RELATED_STMT field records the last stmt in
5800 the original sequence that constitutes the pattern. */
5802 orig_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt));
5803 if (orig_stmt)
5805 orig_stmt_info = vinfo_for_stmt (orig_stmt);
5806 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
5807 gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
5810 /* 3. Check the operands of the operation. The first operands are defined
5811 inside the loop body. The last operand is the reduction variable,
5812 which is defined by the loop-header-phi. */
5814 gcc_assert (is_gimple_assign (stmt));
5816 /* Flatten RHS. */
5817 switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
5819 case GIMPLE_BINARY_RHS:
5820 code = gimple_assign_rhs_code (stmt);
5821 op_type = TREE_CODE_LENGTH (code);
5822 gcc_assert (op_type == binary_op);
5823 ops[0] = gimple_assign_rhs1 (stmt);
5824 ops[1] = gimple_assign_rhs2 (stmt);
5825 break;
5827 case GIMPLE_TERNARY_RHS:
5828 code = gimple_assign_rhs_code (stmt);
5829 op_type = TREE_CODE_LENGTH (code);
5830 gcc_assert (op_type == ternary_op);
5831 ops[0] = gimple_assign_rhs1 (stmt);
5832 ops[1] = gimple_assign_rhs2 (stmt);
5833 ops[2] = gimple_assign_rhs3 (stmt);
5834 break;
5836 case GIMPLE_UNARY_RHS:
5837 return false;
5839 default:
5840 gcc_unreachable ();
5843 if (code == COND_EXPR && slp_node)
5844 return false;
5846 scalar_dest = gimple_assign_lhs (stmt);
5847 scalar_type = TREE_TYPE (scalar_dest);
5848 if (!POINTER_TYPE_P (scalar_type) && !INTEGRAL_TYPE_P (scalar_type)
5849 && !SCALAR_FLOAT_TYPE_P (scalar_type))
5850 return false;
5852 /* Do not try to vectorize bit-precision reductions. */
5853 if (!type_has_mode_precision_p (scalar_type))
5854 return false;
5856 /* All uses but the last are expected to be defined in the loop.
5857 The last use is the reduction variable. In case of nested cycle this
5858 assumption is not true: we use reduc_index to record the index of the
5859 reduction variable. */
5860 gimple *reduc_def_stmt = NULL;
5861 int reduc_index = -1;
5862 for (i = 0; i < op_type; i++)
5864 /* The condition of COND_EXPR is checked in vectorizable_condition(). */
5865 if (i == 0 && code == COND_EXPR)
5866 continue;
5868 is_simple_use = vect_is_simple_use (ops[i], loop_vinfo,
5869 &def_stmt, &dts[i], &tem);
5870 dt = dts[i];
5871 gcc_assert (is_simple_use);
5872 if (dt == vect_reduction_def)
5874 reduc_def_stmt = def_stmt;
5875 reduc_index = i;
5876 continue;
5878 else if (tem)
5880 /* To properly compute ncopies we are interested in the widest
5881 input type in case we're looking at a widening accumulation. */
5882 if (!vectype_in
5883 || TYPE_VECTOR_SUBPARTS (vectype_in) > TYPE_VECTOR_SUBPARTS (tem))
5884 vectype_in = tem;
5887 if (dt != vect_internal_def
5888 && dt != vect_external_def
5889 && dt != vect_constant_def
5890 && dt != vect_induction_def
5891 && !(dt == vect_nested_cycle && nested_cycle))
5892 return false;
5894 if (dt == vect_nested_cycle)
5896 found_nested_cycle_def = true;
5897 reduc_def_stmt = def_stmt;
5898 reduc_index = i;
5901 if (i == 1 && code == COND_EXPR)
5903 /* Record how value of COND_EXPR is defined. */
5904 if (dt == vect_constant_def)
5906 cond_reduc_dt = dt;
5907 cond_reduc_val = ops[i];
5909 if (dt == vect_induction_def
5910 && def_stmt != NULL
5911 && is_nonwrapping_integer_induction (def_stmt, loop))
5913 cond_reduc_dt = dt;
5914 cond_reduc_def_stmt = def_stmt;
5919 if (!vectype_in)
5920 vectype_in = vectype_out;
5922 /* When vectorizing a reduction chain w/o SLP the reduction PHI is not
5923 directy used in stmt. */
5924 if (reduc_index == -1)
5926 if (orig_stmt)
5927 reduc_def_stmt = STMT_VINFO_REDUC_DEF (orig_stmt_info);
5928 else
5929 reduc_def_stmt = STMT_VINFO_REDUC_DEF (stmt_info);
5932 if (! reduc_def_stmt || gimple_code (reduc_def_stmt) != GIMPLE_PHI)
5933 return false;
5935 if (!(reduc_index == -1
5936 || dts[reduc_index] == vect_reduction_def
5937 || dts[reduc_index] == vect_nested_cycle
5938 || ((dts[reduc_index] == vect_internal_def
5939 || dts[reduc_index] == vect_external_def
5940 || dts[reduc_index] == vect_constant_def
5941 || dts[reduc_index] == vect_induction_def)
5942 && nested_cycle && found_nested_cycle_def)))
5944 /* For pattern recognized stmts, orig_stmt might be a reduction,
5945 but some helper statements for the pattern might not, or
5946 might be COND_EXPRs with reduction uses in the condition. */
5947 gcc_assert (orig_stmt);
5948 return false;
5951 stmt_vec_info reduc_def_info = vinfo_for_stmt (reduc_def_stmt);
5952 enum vect_reduction_type v_reduc_type
5953 = STMT_VINFO_REDUC_TYPE (reduc_def_info);
5954 gimple *tmp = STMT_VINFO_REDUC_DEF (reduc_def_info);
5956 STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) = v_reduc_type;
5957 /* If we have a condition reduction, see if we can simplify it further. */
5958 if (v_reduc_type == COND_REDUCTION)
5960 if (cond_reduc_dt == vect_induction_def)
5962 stmt_vec_info cond_stmt_vinfo = vinfo_for_stmt (cond_reduc_def_stmt);
5963 tree base
5964 = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (cond_stmt_vinfo);
5965 tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (cond_stmt_vinfo);
5967 gcc_assert (TREE_CODE (base) == INTEGER_CST
5968 && TREE_CODE (step) == INTEGER_CST);
5969 cond_reduc_val = NULL_TREE;
5970 /* Find a suitable value, for MAX_EXPR below base, for MIN_EXPR
5971 above base; punt if base is the minimum value of the type for
5972 MAX_EXPR or maximum value of the type for MIN_EXPR for now. */
5973 if (tree_int_cst_sgn (step) == -1)
5975 cond_reduc_op_code = MIN_EXPR;
5976 if (tree_int_cst_sgn (base) == -1)
5977 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
5978 else if (tree_int_cst_lt (base,
5979 TYPE_MAX_VALUE (TREE_TYPE (base))))
5980 cond_reduc_val
5981 = int_const_binop (PLUS_EXPR, base, integer_one_node);
5983 else
5985 cond_reduc_op_code = MAX_EXPR;
5986 if (tree_int_cst_sgn (base) == 1)
5987 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
5988 else if (tree_int_cst_lt (TYPE_MIN_VALUE (TREE_TYPE (base)),
5989 base))
5990 cond_reduc_val
5991 = int_const_binop (MINUS_EXPR, base, integer_one_node);
5993 if (cond_reduc_val)
5995 if (dump_enabled_p ())
5996 dump_printf_loc (MSG_NOTE, vect_location,
5997 "condition expression based on "
5998 "integer induction.\n");
5999 STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
6000 = INTEGER_INDUC_COND_REDUCTION;
6004 /* Loop peeling modifies initial value of reduction PHI, which
6005 makes the reduction stmt to be transformed different to the
6006 original stmt analyzed. We need to record reduction code for
6007 CONST_COND_REDUCTION type reduction at analyzing stage, thus
6008 it can be used directly at transform stage. */
6009 if (STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info) == MAX_EXPR
6010 || STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info) == MIN_EXPR)
6012 /* Also set the reduction type to CONST_COND_REDUCTION. */
6013 gcc_assert (cond_reduc_dt == vect_constant_def);
6014 STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) = CONST_COND_REDUCTION;
6016 else if (cond_reduc_dt == vect_constant_def)
6018 enum vect_def_type cond_initial_dt;
6019 gimple *def_stmt = SSA_NAME_DEF_STMT (ops[reduc_index]);
6020 tree cond_initial_val
6021 = PHI_ARG_DEF_FROM_EDGE (def_stmt, loop_preheader_edge (loop));
6023 gcc_assert (cond_reduc_val != NULL_TREE);
6024 vect_is_simple_use (cond_initial_val, loop_vinfo,
6025 &def_stmt, &cond_initial_dt);
6026 if (cond_initial_dt == vect_constant_def
6027 && types_compatible_p (TREE_TYPE (cond_initial_val),
6028 TREE_TYPE (cond_reduc_val)))
6030 tree e = fold_binary (LE_EXPR, boolean_type_node,
6031 cond_initial_val, cond_reduc_val);
6032 if (e && (integer_onep (e) || integer_zerop (e)))
6034 if (dump_enabled_p ())
6035 dump_printf_loc (MSG_NOTE, vect_location,
6036 "condition expression based on "
6037 "compile time constant.\n");
6038 /* Record reduction code at analysis stage. */
6039 STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info)
6040 = integer_onep (e) ? MAX_EXPR : MIN_EXPR;
6041 STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
6042 = CONST_COND_REDUCTION;
6048 if (orig_stmt)
6049 gcc_assert (tmp == orig_stmt
6050 || GROUP_FIRST_ELEMENT (vinfo_for_stmt (tmp)) == orig_stmt);
6051 else
6052 /* We changed STMT to be the first stmt in reduction chain, hence we
6053 check that in this case the first element in the chain is STMT. */
6054 gcc_assert (stmt == tmp
6055 || GROUP_FIRST_ELEMENT (vinfo_for_stmt (tmp)) == stmt);
6057 if (STMT_VINFO_LIVE_P (vinfo_for_stmt (reduc_def_stmt)))
6058 return false;
6060 if (slp_node)
6061 ncopies = 1;
6062 else
6063 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
6065 gcc_assert (ncopies >= 1);
6067 vec_mode = TYPE_MODE (vectype_in);
6069 if (code == COND_EXPR)
6071 /* Only call during the analysis stage, otherwise we'll lose
6072 STMT_VINFO_TYPE. */
6073 if (!vec_stmt && !vectorizable_condition (stmt, gsi, NULL,
6074 ops[reduc_index], 0, NULL))
6076 if (dump_enabled_p ())
6077 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6078 "unsupported condition in reduction\n");
6079 return false;
6082 else
6084 /* 4. Supportable by target? */
6086 if (code == LSHIFT_EXPR || code == RSHIFT_EXPR
6087 || code == LROTATE_EXPR || code == RROTATE_EXPR)
6089 /* Shifts and rotates are only supported by vectorizable_shifts,
6090 not vectorizable_reduction. */
6091 if (dump_enabled_p ())
6092 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6093 "unsupported shift or rotation.\n");
6094 return false;
6097 /* 4.1. check support for the operation in the loop */
6098 optab = optab_for_tree_code (code, vectype_in, optab_default);
6099 if (!optab)
6101 if (dump_enabled_p ())
6102 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6103 "no optab.\n");
6105 return false;
6108 if (optab_handler (optab, vec_mode) == CODE_FOR_nothing)
6110 if (dump_enabled_p ())
6111 dump_printf (MSG_NOTE, "op not supported by target.\n");
6113 if (GET_MODE_SIZE (vec_mode) != UNITS_PER_WORD
6114 || !vect_worthwhile_without_simd_p (loop_vinfo, code))
6115 return false;
6117 if (dump_enabled_p ())
6118 dump_printf (MSG_NOTE, "proceeding using word mode.\n");
6121 /* Worthwhile without SIMD support? */
6122 if (!VECTOR_MODE_P (TYPE_MODE (vectype_in))
6123 && !vect_worthwhile_without_simd_p (loop_vinfo, code))
6125 if (dump_enabled_p ())
6126 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6127 "not worthwhile without SIMD support.\n");
6129 return false;
6133 /* 4.2. Check support for the epilog operation.
6135 If STMT represents a reduction pattern, then the type of the
6136 reduction variable may be different than the type of the rest
6137 of the arguments. For example, consider the case of accumulation
6138 of shorts into an int accumulator; The original code:
6139 S1: int_a = (int) short_a;
6140 orig_stmt-> S2: int_acc = plus <int_a ,int_acc>;
6142 was replaced with:
6143 STMT: int_acc = widen_sum <short_a, int_acc>
6145 This means that:
6146 1. The tree-code that is used to create the vector operation in the
6147 epilog code (that reduces the partial results) is not the
6148 tree-code of STMT, but is rather the tree-code of the original
6149 stmt from the pattern that STMT is replacing. I.e, in the example
6150 above we want to use 'widen_sum' in the loop, but 'plus' in the
6151 epilog.
6152 2. The type (mode) we use to check available target support
6153 for the vector operation to be created in the *epilog*, is
6154 determined by the type of the reduction variable (in the example
6155 above we'd check this: optab_handler (plus_optab, vect_int_mode])).
6156 However the type (mode) we use to check available target support
6157 for the vector operation to be created *inside the loop*, is
6158 determined by the type of the other arguments to STMT (in the
6159 example we'd check this: optab_handler (widen_sum_optab,
6160 vect_short_mode)).
6162 This is contrary to "regular" reductions, in which the types of all
6163 the arguments are the same as the type of the reduction variable.
6164 For "regular" reductions we can therefore use the same vector type
6165 (and also the same tree-code) when generating the epilog code and
6166 when generating the code inside the loop. */
6168 if (orig_stmt)
6170 /* This is a reduction pattern: get the vectype from the type of the
6171 reduction variable, and get the tree-code from orig_stmt. */
6172 gcc_assert (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
6173 == TREE_CODE_REDUCTION);
6174 orig_code = gimple_assign_rhs_code (orig_stmt);
6175 gcc_assert (vectype_out);
6176 vec_mode = TYPE_MODE (vectype_out);
6178 else
6180 /* Regular reduction: use the same vectype and tree-code as used for
6181 the vector code inside the loop can be used for the epilog code. */
6182 orig_code = code;
6184 if (code == MINUS_EXPR)
6185 orig_code = PLUS_EXPR;
6187 /* For simple condition reductions, replace with the actual expression
6188 we want to base our reduction around. */
6189 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == CONST_COND_REDUCTION)
6191 orig_code = STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info);
6192 gcc_assert (orig_code == MAX_EXPR || orig_code == MIN_EXPR);
6194 else if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
6195 == INTEGER_INDUC_COND_REDUCTION)
6196 orig_code = cond_reduc_op_code;
6199 if (nested_cycle)
6201 def_bb = gimple_bb (reduc_def_stmt);
6202 def_stmt_loop = def_bb->loop_father;
6203 def_arg = PHI_ARG_DEF_FROM_EDGE (reduc_def_stmt,
6204 loop_preheader_edge (def_stmt_loop));
6205 if (TREE_CODE (def_arg) == SSA_NAME
6206 && (def_arg_stmt = SSA_NAME_DEF_STMT (def_arg))
6207 && gimple_code (def_arg_stmt) == GIMPLE_PHI
6208 && flow_bb_inside_loop_p (outer_loop, gimple_bb (def_arg_stmt))
6209 && vinfo_for_stmt (def_arg_stmt)
6210 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_arg_stmt))
6211 == vect_double_reduction_def)
6212 double_reduc = true;
6215 reduc_fn = IFN_LAST;
6217 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) != COND_REDUCTION)
6219 if (reduction_fn_for_scalar_code (orig_code, &reduc_fn))
6221 if (reduc_fn != IFN_LAST
6222 && !direct_internal_fn_supported_p (reduc_fn, vectype_out,
6223 OPTIMIZE_FOR_SPEED))
6225 if (dump_enabled_p ())
6226 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6227 "reduc op not supported by target.\n");
6229 reduc_fn = IFN_LAST;
6232 else
6234 if (!nested_cycle || double_reduc)
6236 if (dump_enabled_p ())
6237 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6238 "no reduc code for scalar code.\n");
6240 return false;
6244 else
6246 int scalar_precision
6247 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
6248 cr_index_scalar_type = make_unsigned_type (scalar_precision);
6249 cr_index_vector_type = build_vector_type
6250 (cr_index_scalar_type, TYPE_VECTOR_SUBPARTS (vectype_out));
6252 if (direct_internal_fn_supported_p (IFN_REDUC_MAX, cr_index_vector_type,
6253 OPTIMIZE_FOR_SPEED))
6254 reduc_fn = IFN_REDUC_MAX;
6257 if ((double_reduc
6258 || STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) != TREE_CODE_REDUCTION)
6259 && ncopies > 1)
6261 if (dump_enabled_p ())
6262 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6263 "multiple types in double reduction or condition "
6264 "reduction.\n");
6265 return false;
6268 /* In case of widenning multiplication by a constant, we update the type
6269 of the constant to be the type of the other operand. We check that the
6270 constant fits the type in the pattern recognition pass. */
6271 if (code == DOT_PROD_EXPR
6272 && !types_compatible_p (TREE_TYPE (ops[0]), TREE_TYPE (ops[1])))
6274 if (TREE_CODE (ops[0]) == INTEGER_CST)
6275 ops[0] = fold_convert (TREE_TYPE (ops[1]), ops[0]);
6276 else if (TREE_CODE (ops[1]) == INTEGER_CST)
6277 ops[1] = fold_convert (TREE_TYPE (ops[0]), ops[1]);
6278 else
6280 if (dump_enabled_p ())
6281 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6282 "invalid types in dot-prod\n");
6284 return false;
6288 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION)
6290 widest_int ni;
6292 if (! max_loop_iterations (loop, &ni))
6294 if (dump_enabled_p ())
6295 dump_printf_loc (MSG_NOTE, vect_location,
6296 "loop count not known, cannot create cond "
6297 "reduction.\n");
6298 return false;
6300 /* Convert backedges to iterations. */
6301 ni += 1;
6303 /* The additional index will be the same type as the condition. Check
6304 that the loop can fit into this less one (because we'll use up the
6305 zero slot for when there are no matches). */
6306 tree max_index = TYPE_MAX_VALUE (cr_index_scalar_type);
6307 if (wi::geu_p (ni, wi::to_widest (max_index)))
6309 if (dump_enabled_p ())
6310 dump_printf_loc (MSG_NOTE, vect_location,
6311 "loop size is greater than data size.\n");
6312 return false;
6316 /* In case the vectorization factor (VF) is bigger than the number
6317 of elements that we can fit in a vectype (nunits), we have to generate
6318 more than one vector stmt - i.e - we need to "unroll" the
6319 vector stmt by a factor VF/nunits. For more details see documentation
6320 in vectorizable_operation. */
6322 /* If the reduction is used in an outer loop we need to generate
6323 VF intermediate results, like so (e.g. for ncopies=2):
6324 r0 = phi (init, r0)
6325 r1 = phi (init, r1)
6326 r0 = x0 + r0;
6327 r1 = x1 + r1;
6328 (i.e. we generate VF results in 2 registers).
6329 In this case we have a separate def-use cycle for each copy, and therefore
6330 for each copy we get the vector def for the reduction variable from the
6331 respective phi node created for this copy.
6333 Otherwise (the reduction is unused in the loop nest), we can combine
6334 together intermediate results, like so (e.g. for ncopies=2):
6335 r = phi (init, r)
6336 r = x0 + r;
6337 r = x1 + r;
6338 (i.e. we generate VF/2 results in a single register).
6339 In this case for each copy we get the vector def for the reduction variable
6340 from the vectorized reduction operation generated in the previous iteration.
6342 This only works when we see both the reduction PHI and its only consumer
6343 in vectorizable_reduction and there are no intermediate stmts
6344 participating. */
6345 use_operand_p use_p;
6346 gimple *use_stmt;
6347 if (ncopies > 1
6348 && (STMT_VINFO_RELEVANT (stmt_info) <= vect_used_only_live)
6349 && single_imm_use (gimple_phi_result (reduc_def_stmt), &use_p, &use_stmt)
6350 && (use_stmt == stmt
6351 || STMT_VINFO_RELATED_STMT (vinfo_for_stmt (use_stmt)) == stmt))
6353 single_defuse_cycle = true;
6354 epilog_copies = 1;
6356 else
6357 epilog_copies = ncopies;
6359 /* If the reduction stmt is one of the patterns that have lane
6360 reduction embedded we cannot handle the case of ! single_defuse_cycle. */
6361 if ((ncopies > 1
6362 && ! single_defuse_cycle)
6363 && (code == DOT_PROD_EXPR
6364 || code == WIDEN_SUM_EXPR
6365 || code == SAD_EXPR))
6367 if (dump_enabled_p ())
6368 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6369 "multi def-use cycle not possible for lane-reducing "
6370 "reduction operation\n");
6371 return false;
6374 if (!vec_stmt) /* transformation not required. */
6376 if (first_p)
6377 vect_model_reduction_cost (stmt_info, reduc_fn, ncopies);
6378 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
6379 return true;
6382 /* Transform. */
6384 if (dump_enabled_p ())
6385 dump_printf_loc (MSG_NOTE, vect_location, "transform reduction.\n");
6387 /* FORNOW: Multiple types are not supported for condition. */
6388 if (code == COND_EXPR)
6389 gcc_assert (ncopies == 1);
6391 /* Create the destination vector */
6392 vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
6394 prev_stmt_info = NULL;
6395 prev_phi_info = NULL;
6396 if (slp_node)
6397 vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
6398 else
6400 vec_num = 1;
6401 vec_oprnds0.create (1);
6402 vec_oprnds1.create (1);
6403 if (op_type == ternary_op)
6404 vec_oprnds2.create (1);
6407 phis.create (vec_num);
6408 vect_defs.create (vec_num);
6409 if (!slp_node)
6410 vect_defs.quick_push (NULL_TREE);
6412 if (slp_node)
6413 phis.splice (SLP_TREE_VEC_STMTS (slp_node_instance->reduc_phis));
6414 else
6415 phis.quick_push (STMT_VINFO_VEC_STMT (vinfo_for_stmt (reduc_def_stmt)));
6417 for (j = 0; j < ncopies; j++)
6419 if (code == COND_EXPR)
6421 gcc_assert (!slp_node);
6422 vectorizable_condition (stmt, gsi, vec_stmt,
6423 PHI_RESULT (phis[0]),
6424 reduc_index, NULL);
6425 /* Multiple types are not supported for condition. */
6426 break;
6429 /* Handle uses. */
6430 if (j == 0)
6432 if (slp_node)
6434 /* Get vec defs for all the operands except the reduction index,
6435 ensuring the ordering of the ops in the vector is kept. */
6436 auto_vec<tree, 3> slp_ops;
6437 auto_vec<vec<tree>, 3> vec_defs;
6439 slp_ops.quick_push (ops[0]);
6440 slp_ops.quick_push (ops[1]);
6441 if (op_type == ternary_op)
6442 slp_ops.quick_push (ops[2]);
6444 vect_get_slp_defs (slp_ops, slp_node, &vec_defs);
6446 vec_oprnds0.safe_splice (vec_defs[0]);
6447 vec_defs[0].release ();
6448 vec_oprnds1.safe_splice (vec_defs[1]);
6449 vec_defs[1].release ();
6450 if (op_type == ternary_op)
6452 vec_oprnds2.safe_splice (vec_defs[2]);
6453 vec_defs[2].release ();
6456 else
6458 vec_oprnds0.quick_push
6459 (vect_get_vec_def_for_operand (ops[0], stmt));
6460 vec_oprnds1.quick_push
6461 (vect_get_vec_def_for_operand (ops[1], stmt));
6462 if (op_type == ternary_op)
6463 vec_oprnds2.quick_push
6464 (vect_get_vec_def_for_operand (ops[2], stmt));
6467 else
6469 if (!slp_node)
6471 gcc_assert (reduc_index != -1 || ! single_defuse_cycle);
6473 if (single_defuse_cycle && reduc_index == 0)
6474 vec_oprnds0[0] = gimple_assign_lhs (new_stmt);
6475 else
6476 vec_oprnds0[0]
6477 = vect_get_vec_def_for_stmt_copy (dts[0], vec_oprnds0[0]);
6478 if (single_defuse_cycle && reduc_index == 1)
6479 vec_oprnds1[0] = gimple_assign_lhs (new_stmt);
6480 else
6481 vec_oprnds1[0]
6482 = vect_get_vec_def_for_stmt_copy (dts[1], vec_oprnds1[0]);
6483 if (op_type == ternary_op)
6485 if (single_defuse_cycle && reduc_index == 2)
6486 vec_oprnds2[0] = gimple_assign_lhs (new_stmt);
6487 else
6488 vec_oprnds2[0]
6489 = vect_get_vec_def_for_stmt_copy (dts[2], vec_oprnds2[0]);
6494 FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
6496 tree vop[3] = { def0, vec_oprnds1[i], NULL_TREE };
6497 if (op_type == ternary_op)
6498 vop[2] = vec_oprnds2[i];
6500 new_temp = make_ssa_name (vec_dest, new_stmt);
6501 new_stmt = gimple_build_assign (new_temp, code,
6502 vop[0], vop[1], vop[2]);
6503 vect_finish_stmt_generation (stmt, new_stmt, gsi);
6505 if (slp_node)
6507 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
6508 vect_defs.quick_push (new_temp);
6510 else
6511 vect_defs[0] = new_temp;
6514 if (slp_node)
6515 continue;
6517 if (j == 0)
6518 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
6519 else
6520 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
6522 prev_stmt_info = vinfo_for_stmt (new_stmt);
6525 /* Finalize the reduction-phi (set its arguments) and create the
6526 epilog reduction code. */
6527 if ((!single_defuse_cycle || code == COND_EXPR) && !slp_node)
6528 vect_defs[0] = gimple_assign_lhs (*vec_stmt);
6530 vect_create_epilog_for_reduction (vect_defs, stmt, reduc_def_stmt,
6531 epilog_copies, reduc_fn, phis,
6532 double_reduc, slp_node, slp_node_instance,
6533 cond_reduc_val, cond_reduc_op_code);
6535 return true;
6538 /* Function vect_min_worthwhile_factor.
6540 For a loop where we could vectorize the operation indicated by CODE,
6541 return the minimum vectorization factor that makes it worthwhile
6542 to use generic vectors. */
6544 vect_min_worthwhile_factor (enum tree_code code)
6546 switch (code)
6548 case PLUS_EXPR:
6549 case MINUS_EXPR:
6550 case NEGATE_EXPR:
6551 return 4;
6553 case BIT_AND_EXPR:
6554 case BIT_IOR_EXPR:
6555 case BIT_XOR_EXPR:
6556 case BIT_NOT_EXPR:
6557 return 2;
6559 default:
6560 return INT_MAX;
6564 /* Return true if VINFO indicates we are doing loop vectorization and if
6565 it is worth decomposing CODE operations into scalar operations for
6566 that loop's vectorization factor. */
6568 bool
6569 vect_worthwhile_without_simd_p (vec_info *vinfo, tree_code code)
6571 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
6572 return (loop_vinfo
6573 && (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
6574 >= vect_min_worthwhile_factor (code)));
6577 /* Function vectorizable_induction
6579 Check if PHI performs an induction computation that can be vectorized.
6580 If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
6581 phi to replace it, put it in VEC_STMT, and add it to the same basic block.
6582 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
6584 bool
6585 vectorizable_induction (gimple *phi,
6586 gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
6587 gimple **vec_stmt, slp_tree slp_node)
6589 stmt_vec_info stmt_info = vinfo_for_stmt (phi);
6590 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
6591 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6592 unsigned ncopies;
6593 bool nested_in_vect_loop = false;
6594 struct loop *iv_loop;
6595 tree vec_def;
6596 edge pe = loop_preheader_edge (loop);
6597 basic_block new_bb;
6598 tree new_vec, vec_init, vec_step, t;
6599 tree new_name;
6600 gimple *new_stmt;
6601 gphi *induction_phi;
6602 tree induc_def, vec_dest;
6603 tree init_expr, step_expr;
6604 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
6605 unsigned i;
6606 tree expr;
6607 gimple_seq stmts;
6608 imm_use_iterator imm_iter;
6609 use_operand_p use_p;
6610 gimple *exit_phi;
6611 edge latch_e;
6612 tree loop_arg;
6613 gimple_stmt_iterator si;
6614 basic_block bb = gimple_bb (phi);
6616 if (gimple_code (phi) != GIMPLE_PHI)
6617 return false;
6619 if (!STMT_VINFO_RELEVANT_P (stmt_info))
6620 return false;
6622 /* Make sure it was recognized as induction computation. */
6623 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
6624 return false;
6626 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6627 unsigned nunits = TYPE_VECTOR_SUBPARTS (vectype);
6629 if (slp_node)
6630 ncopies = 1;
6631 else
6632 ncopies = vect_get_num_copies (loop_vinfo, vectype);
6633 gcc_assert (ncopies >= 1);
6635 /* FORNOW. These restrictions should be relaxed. */
6636 if (nested_in_vect_loop_p (loop, phi))
6638 imm_use_iterator imm_iter;
6639 use_operand_p use_p;
6640 gimple *exit_phi;
6641 edge latch_e;
6642 tree loop_arg;
6644 if (ncopies > 1)
6646 if (dump_enabled_p ())
6647 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6648 "multiple types in nested loop.\n");
6649 return false;
6652 /* FORNOW: outer loop induction with SLP not supported. */
6653 if (STMT_SLP_TYPE (stmt_info))
6654 return false;
6656 exit_phi = NULL;
6657 latch_e = loop_latch_edge (loop->inner);
6658 loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
6659 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
6661 gimple *use_stmt = USE_STMT (use_p);
6662 if (is_gimple_debug (use_stmt))
6663 continue;
6665 if (!flow_bb_inside_loop_p (loop->inner, gimple_bb (use_stmt)))
6667 exit_phi = use_stmt;
6668 break;
6671 if (exit_phi)
6673 stmt_vec_info exit_phi_vinfo = vinfo_for_stmt (exit_phi);
6674 if (!(STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
6675 && !STMT_VINFO_LIVE_P (exit_phi_vinfo)))
6677 if (dump_enabled_p ())
6678 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6679 "inner-loop induction only used outside "
6680 "of the outer vectorized loop.\n");
6681 return false;
6685 nested_in_vect_loop = true;
6686 iv_loop = loop->inner;
6688 else
6689 iv_loop = loop;
6690 gcc_assert (iv_loop == (gimple_bb (phi))->loop_father);
6692 if (!vec_stmt) /* transformation not required. */
6694 STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
6695 if (dump_enabled_p ())
6696 dump_printf_loc (MSG_NOTE, vect_location,
6697 "=== vectorizable_induction ===\n");
6698 vect_model_induction_cost (stmt_info, ncopies);
6699 return true;
6702 /* Transform. */
6704 /* Compute a vector variable, initialized with the first VF values of
6705 the induction variable. E.g., for an iv with IV_PHI='X' and
6706 evolution S, for a vector of 4 units, we want to compute:
6707 [X, X + S, X + 2*S, X + 3*S]. */
6709 if (dump_enabled_p ())
6710 dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n");
6712 latch_e = loop_latch_edge (iv_loop);
6713 loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
6715 step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info);
6716 gcc_assert (step_expr != NULL_TREE);
6718 pe = loop_preheader_edge (iv_loop);
6719 init_expr = PHI_ARG_DEF_FROM_EDGE (phi,
6720 loop_preheader_edge (iv_loop));
6722 /* Convert the step to the desired type. */
6723 stmts = NULL;
6724 step_expr = gimple_convert (&stmts, TREE_TYPE (vectype), step_expr);
6725 if (stmts)
6727 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
6728 gcc_assert (!new_bb);
6731 /* Find the first insertion point in the BB. */
6732 si = gsi_after_labels (bb);
6734 /* For SLP induction we have to generate several IVs as for example
6735 with group size 3 we need [i, i, i, i + S] [i + S, i + S, i + 2*S, i + 2*S]
6736 [i + 2*S, i + 3*S, i + 3*S, i + 3*S]. The step is the same uniform
6737 [VF*S, VF*S, VF*S, VF*S] for all. */
6738 if (slp_node)
6740 /* Convert the init to the desired type. */
6741 stmts = NULL;
6742 init_expr = gimple_convert (&stmts, TREE_TYPE (vectype), init_expr);
6743 if (stmts)
6745 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
6746 gcc_assert (!new_bb);
6749 /* Generate [VF*S, VF*S, ... ]. */
6750 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
6752 expr = build_int_cst (integer_type_node, vf);
6753 expr = fold_convert (TREE_TYPE (step_expr), expr);
6755 else
6756 expr = build_int_cst (TREE_TYPE (step_expr), vf);
6757 new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
6758 expr, step_expr);
6759 if (! CONSTANT_CLASS_P (new_name))
6760 new_name = vect_init_vector (phi, new_name,
6761 TREE_TYPE (step_expr), NULL);
6762 new_vec = build_vector_from_val (vectype, new_name);
6763 vec_step = vect_init_vector (phi, new_vec, vectype, NULL);
6765 /* Now generate the IVs. */
6766 unsigned group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
6767 unsigned nvects = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
6768 unsigned elts = nunits * nvects;
6769 unsigned nivs = least_common_multiple (group_size, nunits) / nunits;
6770 gcc_assert (elts % group_size == 0);
6771 tree elt = init_expr;
6772 unsigned ivn;
6773 for (ivn = 0; ivn < nivs; ++ivn)
6775 tree_vector_builder elts (vectype, nunits, 1);
6776 stmts = NULL;
6777 for (unsigned eltn = 0; eltn < nunits; ++eltn)
6779 if (ivn*nunits + eltn >= group_size
6780 && (ivn*nunits + eltn) % group_size == 0)
6781 elt = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (elt),
6782 elt, step_expr);
6783 elts.quick_push (elt);
6785 vec_init = gimple_build_vector (&stmts, &elts);
6786 if (stmts)
6788 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
6789 gcc_assert (!new_bb);
6792 /* Create the induction-phi that defines the induction-operand. */
6793 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
6794 induction_phi = create_phi_node (vec_dest, iv_loop->header);
6795 set_vinfo_for_stmt (induction_phi,
6796 new_stmt_vec_info (induction_phi, loop_vinfo));
6797 induc_def = PHI_RESULT (induction_phi);
6799 /* Create the iv update inside the loop */
6800 vec_def = make_ssa_name (vec_dest);
6801 new_stmt = gimple_build_assign (vec_def, PLUS_EXPR, induc_def, vec_step);
6802 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
6803 set_vinfo_for_stmt (new_stmt, new_stmt_vec_info (new_stmt, loop_vinfo));
6805 /* Set the arguments of the phi node: */
6806 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
6807 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
6808 UNKNOWN_LOCATION);
6810 SLP_TREE_VEC_STMTS (slp_node).quick_push (induction_phi);
6813 /* Re-use IVs when we can. */
6814 if (ivn < nvects)
6816 unsigned vfp
6817 = least_common_multiple (group_size, nunits) / group_size;
6818 /* Generate [VF'*S, VF'*S, ... ]. */
6819 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
6821 expr = build_int_cst (integer_type_node, vfp);
6822 expr = fold_convert (TREE_TYPE (step_expr), expr);
6824 else
6825 expr = build_int_cst (TREE_TYPE (step_expr), vfp);
6826 new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
6827 expr, step_expr);
6828 if (! CONSTANT_CLASS_P (new_name))
6829 new_name = vect_init_vector (phi, new_name,
6830 TREE_TYPE (step_expr), NULL);
6831 new_vec = build_vector_from_val (vectype, new_name);
6832 vec_step = vect_init_vector (phi, new_vec, vectype, NULL);
6833 for (; ivn < nvects; ++ivn)
6835 gimple *iv = SLP_TREE_VEC_STMTS (slp_node)[ivn - nivs];
6836 tree def;
6837 if (gimple_code (iv) == GIMPLE_PHI)
6838 def = gimple_phi_result (iv);
6839 else
6840 def = gimple_assign_lhs (iv);
6841 new_stmt = gimple_build_assign (make_ssa_name (vectype),
6842 PLUS_EXPR,
6843 def, vec_step);
6844 if (gimple_code (iv) == GIMPLE_PHI)
6845 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
6846 else
6848 gimple_stmt_iterator tgsi = gsi_for_stmt (iv);
6849 gsi_insert_after (&tgsi, new_stmt, GSI_CONTINUE_LINKING);
6851 set_vinfo_for_stmt (new_stmt,
6852 new_stmt_vec_info (new_stmt, loop_vinfo));
6853 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
6857 return true;
6860 /* Create the vector that holds the initial_value of the induction. */
6861 if (nested_in_vect_loop)
6863 /* iv_loop is nested in the loop to be vectorized. init_expr had already
6864 been created during vectorization of previous stmts. We obtain it
6865 from the STMT_VINFO_VEC_STMT of the defining stmt. */
6866 vec_init = vect_get_vec_def_for_operand (init_expr, phi);
6867 /* If the initial value is not of proper type, convert it. */
6868 if (!useless_type_conversion_p (vectype, TREE_TYPE (vec_init)))
6870 new_stmt
6871 = gimple_build_assign (vect_get_new_ssa_name (vectype,
6872 vect_simple_var,
6873 "vec_iv_"),
6874 VIEW_CONVERT_EXPR,
6875 build1 (VIEW_CONVERT_EXPR, vectype,
6876 vec_init));
6877 vec_init = gimple_assign_lhs (new_stmt);
6878 new_bb = gsi_insert_on_edge_immediate (loop_preheader_edge (iv_loop),
6879 new_stmt);
6880 gcc_assert (!new_bb);
6881 set_vinfo_for_stmt (new_stmt,
6882 new_stmt_vec_info (new_stmt, loop_vinfo));
6885 else
6887 /* iv_loop is the loop to be vectorized. Create:
6888 vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr) */
6889 stmts = NULL;
6890 new_name = gimple_convert (&stmts, TREE_TYPE (vectype), init_expr);
6892 tree_vector_builder elts (vectype, nunits, 1);
6893 elts.quick_push (new_name);
6894 for (i = 1; i < nunits; i++)
6896 /* Create: new_name_i = new_name + step_expr */
6897 new_name = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (new_name),
6898 new_name, step_expr);
6899 elts.quick_push (new_name);
6901 /* Create a vector from [new_name_0, new_name_1, ...,
6902 new_name_nunits-1] */
6903 vec_init = gimple_build_vector (&stmts, &elts);
6904 if (stmts)
6906 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
6907 gcc_assert (!new_bb);
6912 /* Create the vector that holds the step of the induction. */
6913 if (nested_in_vect_loop)
6914 /* iv_loop is nested in the loop to be vectorized. Generate:
6915 vec_step = [S, S, S, S] */
6916 new_name = step_expr;
6917 else
6919 /* iv_loop is the loop to be vectorized. Generate:
6920 vec_step = [VF*S, VF*S, VF*S, VF*S] */
6921 gimple_seq seq = NULL;
6922 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
6924 expr = build_int_cst (integer_type_node, vf);
6925 expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
6927 else
6928 expr = build_int_cst (TREE_TYPE (step_expr), vf);
6929 new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
6930 expr, step_expr);
6931 if (seq)
6933 new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
6934 gcc_assert (!new_bb);
6938 t = unshare_expr (new_name);
6939 gcc_assert (CONSTANT_CLASS_P (new_name)
6940 || TREE_CODE (new_name) == SSA_NAME);
6941 new_vec = build_vector_from_val (vectype, t);
6942 vec_step = vect_init_vector (phi, new_vec, vectype, NULL);
6945 /* Create the following def-use cycle:
6946 loop prolog:
6947 vec_init = ...
6948 vec_step = ...
6949 loop:
6950 vec_iv = PHI <vec_init, vec_loop>
6952 STMT
6954 vec_loop = vec_iv + vec_step; */
6956 /* Create the induction-phi that defines the induction-operand. */
6957 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
6958 induction_phi = create_phi_node (vec_dest, iv_loop->header);
6959 set_vinfo_for_stmt (induction_phi,
6960 new_stmt_vec_info (induction_phi, loop_vinfo));
6961 induc_def = PHI_RESULT (induction_phi);
6963 /* Create the iv update inside the loop */
6964 vec_def = make_ssa_name (vec_dest);
6965 new_stmt = gimple_build_assign (vec_def, PLUS_EXPR, induc_def, vec_step);
6966 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
6967 set_vinfo_for_stmt (new_stmt, new_stmt_vec_info (new_stmt, loop_vinfo));
6969 /* Set the arguments of the phi node: */
6970 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
6971 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
6972 UNKNOWN_LOCATION);
6974 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = induction_phi;
6976 /* In case that vectorization factor (VF) is bigger than the number
6977 of elements that we can fit in a vectype (nunits), we have to generate
6978 more than one vector stmt - i.e - we need to "unroll" the
6979 vector stmt by a factor VF/nunits. For more details see documentation
6980 in vectorizable_operation. */
6982 if (ncopies > 1)
6984 gimple_seq seq = NULL;
6985 stmt_vec_info prev_stmt_vinfo;
6986 /* FORNOW. This restriction should be relaxed. */
6987 gcc_assert (!nested_in_vect_loop);
6989 /* Create the vector that holds the step of the induction. */
6990 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
6992 expr = build_int_cst (integer_type_node, nunits);
6993 expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
6995 else
6996 expr = build_int_cst (TREE_TYPE (step_expr), nunits);
6997 new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
6998 expr, step_expr);
6999 if (seq)
7001 new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
7002 gcc_assert (!new_bb);
7005 t = unshare_expr (new_name);
7006 gcc_assert (CONSTANT_CLASS_P (new_name)
7007 || TREE_CODE (new_name) == SSA_NAME);
7008 new_vec = build_vector_from_val (vectype, t);
7009 vec_step = vect_init_vector (phi, new_vec, vectype, NULL);
7011 vec_def = induc_def;
7012 prev_stmt_vinfo = vinfo_for_stmt (induction_phi);
7013 for (i = 1; i < ncopies; i++)
7015 /* vec_i = vec_prev + vec_step */
7016 new_stmt = gimple_build_assign (vec_dest, PLUS_EXPR,
7017 vec_def, vec_step);
7018 vec_def = make_ssa_name (vec_dest, new_stmt);
7019 gimple_assign_set_lhs (new_stmt, vec_def);
7021 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
7022 set_vinfo_for_stmt (new_stmt,
7023 new_stmt_vec_info (new_stmt, loop_vinfo));
7024 STMT_VINFO_RELATED_STMT (prev_stmt_vinfo) = new_stmt;
7025 prev_stmt_vinfo = vinfo_for_stmt (new_stmt);
7029 if (nested_in_vect_loop)
7031 /* Find the loop-closed exit-phi of the induction, and record
7032 the final vector of induction results: */
7033 exit_phi = NULL;
7034 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
7036 gimple *use_stmt = USE_STMT (use_p);
7037 if (is_gimple_debug (use_stmt))
7038 continue;
7040 if (!flow_bb_inside_loop_p (iv_loop, gimple_bb (use_stmt)))
7042 exit_phi = use_stmt;
7043 break;
7046 if (exit_phi)
7048 stmt_vec_info stmt_vinfo = vinfo_for_stmt (exit_phi);
7049 /* FORNOW. Currently not supporting the case that an inner-loop induction
7050 is not used in the outer-loop (i.e. only outside the outer-loop). */
7051 gcc_assert (STMT_VINFO_RELEVANT_P (stmt_vinfo)
7052 && !STMT_VINFO_LIVE_P (stmt_vinfo));
7054 STMT_VINFO_VEC_STMT (stmt_vinfo) = new_stmt;
7055 if (dump_enabled_p ())
7057 dump_printf_loc (MSG_NOTE, vect_location,
7058 "vector of inductions after inner-loop:");
7059 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, new_stmt, 0);
7065 if (dump_enabled_p ())
7067 dump_printf_loc (MSG_NOTE, vect_location,
7068 "transform induction: created def-use cycle: ");
7069 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, induction_phi, 0);
7070 dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
7071 SSA_NAME_DEF_STMT (vec_def), 0);
7074 return true;
7077 /* Function vectorizable_live_operation.
7079 STMT computes a value that is used outside the loop. Check if
7080 it can be supported. */
7082 bool
7083 vectorizable_live_operation (gimple *stmt,
7084 gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
7085 slp_tree slp_node, int slp_index,
7086 gimple **vec_stmt)
7088 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
7089 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
7090 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7091 imm_use_iterator imm_iter;
7092 tree lhs, lhs_type, bitsize, vec_bitsize;
7093 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
7094 int nunits = TYPE_VECTOR_SUBPARTS (vectype);
7095 int ncopies;
7096 gimple *use_stmt;
7097 auto_vec<tree> vec_oprnds;
7099 gcc_assert (STMT_VINFO_LIVE_P (stmt_info));
7101 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def)
7102 return false;
7104 /* FORNOW. CHECKME. */
7105 if (nested_in_vect_loop_p (loop, stmt))
7106 return false;
7108 /* If STMT is not relevant and it is a simple assignment and its inputs are
7109 invariant then it can remain in place, unvectorized. The original last
7110 scalar value that it computes will be used. */
7111 if (!STMT_VINFO_RELEVANT_P (stmt_info))
7113 gcc_assert (is_simple_and_all_uses_invariant (stmt, loop_vinfo));
7114 if (dump_enabled_p ())
7115 dump_printf_loc (MSG_NOTE, vect_location,
7116 "statement is simple and uses invariant. Leaving in "
7117 "place.\n");
7118 return true;
7121 if (slp_node)
7122 ncopies = 1;
7123 else
7124 ncopies = vect_get_num_copies (loop_vinfo, vectype);
7126 if (!vec_stmt)
7127 /* No transformation required. */
7128 return true;
7130 /* If stmt has a related stmt, then use that for getting the lhs. */
7131 if (is_pattern_stmt_p (stmt_info))
7132 stmt = STMT_VINFO_RELATED_STMT (stmt_info);
7134 lhs = (is_a <gphi *> (stmt)) ? gimple_phi_result (stmt)
7135 : gimple_get_lhs (stmt);
7136 lhs_type = TREE_TYPE (lhs);
7138 bitsize = (VECTOR_BOOLEAN_TYPE_P (vectype)
7139 ? bitsize_int (TYPE_PRECISION (TREE_TYPE (vectype)))
7140 : TYPE_SIZE (TREE_TYPE (vectype)));
7141 vec_bitsize = TYPE_SIZE (vectype);
7143 /* Get the vectorized lhs of STMT and the lane to use (counted in bits). */
7144 tree vec_lhs, bitstart;
7145 if (slp_node)
7147 gcc_assert (slp_index >= 0);
7149 int num_scalar = SLP_TREE_SCALAR_STMTS (slp_node).length ();
7150 int num_vec = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
7152 /* Get the last occurrence of the scalar index from the concatenation of
7153 all the slp vectors. Calculate which slp vector it is and the index
7154 within. */
7155 int pos = (num_vec * nunits) - num_scalar + slp_index;
7156 int vec_entry = pos / nunits;
7157 int vec_index = pos % nunits;
7159 /* Get the correct slp vectorized stmt. */
7160 vec_lhs = gimple_get_lhs (SLP_TREE_VEC_STMTS (slp_node)[vec_entry]);
7162 /* Get entry to use. */
7163 bitstart = bitsize_int (vec_index);
7164 bitstart = int_const_binop (MULT_EXPR, bitsize, bitstart);
7166 else
7168 enum vect_def_type dt = STMT_VINFO_DEF_TYPE (stmt_info);
7169 vec_lhs = vect_get_vec_def_for_operand_1 (stmt, dt);
7171 /* For multiple copies, get the last copy. */
7172 for (int i = 1; i < ncopies; ++i)
7173 vec_lhs = vect_get_vec_def_for_stmt_copy (vect_unknown_def_type,
7174 vec_lhs);
7176 /* Get the last lane in the vector. */
7177 bitstart = int_const_binop (MINUS_EXPR, vec_bitsize, bitsize);
7180 /* Create a new vectorized stmt for the uses of STMT and insert outside the
7181 loop. */
7182 gimple_seq stmts = NULL;
7183 tree bftype = TREE_TYPE (vectype);
7184 if (VECTOR_BOOLEAN_TYPE_P (vectype))
7185 bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1);
7186 tree new_tree = build3 (BIT_FIELD_REF, bftype, vec_lhs, bitsize, bitstart);
7187 new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree), &stmts,
7188 true, NULL_TREE);
7189 if (stmts)
7190 gsi_insert_seq_on_edge_immediate (single_exit (loop), stmts);
7192 /* Replace use of lhs with newly computed result. If the use stmt is a
7193 single arg PHI, just replace all uses of PHI result. It's necessary
7194 because lcssa PHI defining lhs may be before newly inserted stmt. */
7195 use_operand_p use_p;
7196 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
7197 if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt))
7198 && !is_gimple_debug (use_stmt))
7200 if (gimple_code (use_stmt) == GIMPLE_PHI
7201 && gimple_phi_num_args (use_stmt) == 1)
7203 replace_uses_by (gimple_phi_result (use_stmt), new_tree);
7205 else
7207 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
7208 SET_USE (use_p, new_tree);
7210 update_stmt (use_stmt);
7213 return true;
7216 /* Kill any debug uses outside LOOP of SSA names defined in STMT. */
7218 static void
7219 vect_loop_kill_debug_uses (struct loop *loop, gimple *stmt)
7221 ssa_op_iter op_iter;
7222 imm_use_iterator imm_iter;
7223 def_operand_p def_p;
7224 gimple *ustmt;
7226 FOR_EACH_PHI_OR_STMT_DEF (def_p, stmt, op_iter, SSA_OP_DEF)
7228 FOR_EACH_IMM_USE_STMT (ustmt, imm_iter, DEF_FROM_PTR (def_p))
7230 basic_block bb;
7232 if (!is_gimple_debug (ustmt))
7233 continue;
7235 bb = gimple_bb (ustmt);
7237 if (!flow_bb_inside_loop_p (loop, bb))
7239 if (gimple_debug_bind_p (ustmt))
7241 if (dump_enabled_p ())
7242 dump_printf_loc (MSG_NOTE, vect_location,
7243 "killing debug use\n");
7245 gimple_debug_bind_reset_value (ustmt);
7246 update_stmt (ustmt);
7248 else
7249 gcc_unreachable ();
7255 /* Given loop represented by LOOP_VINFO, return true if computation of
7256 LOOP_VINFO_NITERS (= LOOP_VINFO_NITERSM1 + 1) doesn't overflow, false
7257 otherwise. */
7259 static bool
7260 loop_niters_no_overflow (loop_vec_info loop_vinfo)
7262 /* Constant case. */
7263 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
7265 tree cst_niters = LOOP_VINFO_NITERS (loop_vinfo);
7266 tree cst_nitersm1 = LOOP_VINFO_NITERSM1 (loop_vinfo);
7268 gcc_assert (TREE_CODE (cst_niters) == INTEGER_CST);
7269 gcc_assert (TREE_CODE (cst_nitersm1) == INTEGER_CST);
7270 if (wi::to_widest (cst_nitersm1) < wi::to_widest (cst_niters))
7271 return true;
7274 widest_int max;
7275 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7276 /* Check the upper bound of loop niters. */
7277 if (get_max_loop_iterations (loop, &max))
7279 tree type = TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo));
7280 signop sgn = TYPE_SIGN (type);
7281 widest_int type_max = widest_int::from (wi::max_value (type), sgn);
7282 if (max < type_max)
7283 return true;
7285 return false;
7288 /* Scale profiling counters by estimation for LOOP which is vectorized
7289 by factor VF. */
7291 static void
7292 scale_profile_for_vect_loop (struct loop *loop, unsigned vf)
7294 edge preheader = loop_preheader_edge (loop);
7295 /* Reduce loop iterations by the vectorization factor. */
7296 gcov_type new_est_niter = niter_for_unrolled_loop (loop, vf);
7297 profile_count freq_h = loop->header->count, freq_e = preheader->count ();
7299 if (freq_h.nonzero_p ())
7301 profile_probability p;
7303 /* Avoid dropping loop body profile counter to 0 because of zero count
7304 in loop's preheader. */
7305 if (!(freq_e == profile_count::zero ()))
7306 freq_e = freq_e.force_nonzero ();
7307 p = freq_e.apply_scale (new_est_niter + 1, 1).probability_in (freq_h);
7308 scale_loop_frequencies (loop, p);
7311 edge exit_e = single_exit (loop);
7312 exit_e->probability = profile_probability::always ()
7313 .apply_scale (1, new_est_niter + 1);
7315 edge exit_l = single_pred_edge (loop->latch);
7316 profile_probability prob = exit_l->probability;
7317 exit_l->probability = exit_e->probability.invert ();
7318 if (prob.initialized_p () && exit_l->probability.initialized_p ())
7319 scale_bbs_frequencies (&loop->latch, 1, exit_l->probability / prob);
7322 /* Function vect_transform_loop.
7324 The analysis phase has determined that the loop is vectorizable.
7325 Vectorize the loop - created vectorized stmts to replace the scalar
7326 stmts in the loop, and update the loop exit condition.
7327 Returns scalar epilogue loop if any. */
7329 struct loop *
7330 vect_transform_loop (loop_vec_info loop_vinfo)
7332 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7333 struct loop *epilogue = NULL;
7334 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
7335 int nbbs = loop->num_nodes;
7336 int i;
7337 tree niters_vector = NULL;
7338 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
7339 bool grouped_store;
7340 bool slp_scheduled = false;
7341 gimple *stmt, *pattern_stmt;
7342 gimple_seq pattern_def_seq = NULL;
7343 gimple_stmt_iterator pattern_def_si = gsi_none ();
7344 bool transform_pattern_stmt = false;
7345 bool check_profitability = false;
7346 int th;
7348 if (dump_enabled_p ())
7349 dump_printf_loc (MSG_NOTE, vect_location, "=== vec_transform_loop ===\n");
7351 /* Use the more conservative vectorization threshold. If the number
7352 of iterations is constant assume the cost check has been performed
7353 by our caller. If the threshold makes all loops profitable that
7354 run at least the vectorization factor number of times checking
7355 is pointless, too. */
7356 th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
7357 if (th >= LOOP_VINFO_VECT_FACTOR (loop_vinfo)
7358 && !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
7360 if (dump_enabled_p ())
7361 dump_printf_loc (MSG_NOTE, vect_location,
7362 "Profitability threshold is %d loop iterations.\n",
7363 th);
7364 check_profitability = true;
7367 /* Make sure there exists a single-predecessor exit bb. Do this before
7368 versioning. */
7369 edge e = single_exit (loop);
7370 if (! single_pred_p (e->dest))
7372 split_loop_exit_edge (e);
7373 if (dump_enabled_p ())
7374 dump_printf (MSG_NOTE, "split exit edge\n");
7377 /* Version the loop first, if required, so the profitability check
7378 comes first. */
7380 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
7382 poly_uint64 versioning_threshold
7383 = LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo);
7384 if (check_profitability
7385 && ordered_p (poly_uint64 (th), versioning_threshold))
7387 versioning_threshold = ordered_max (poly_uint64 (th),
7388 versioning_threshold);
7389 check_profitability = false;
7391 vect_loop_versioning (loop_vinfo, th, check_profitability,
7392 versioning_threshold);
7393 check_profitability = false;
7396 /* Make sure there exists a single-predecessor exit bb also on the
7397 scalar loop copy. Do this after versioning but before peeling
7398 so CFG structure is fine for both scalar and if-converted loop
7399 to make slpeel_duplicate_current_defs_from_edges face matched
7400 loop closed PHI nodes on the exit. */
7401 if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))
7403 e = single_exit (LOOP_VINFO_SCALAR_LOOP (loop_vinfo));
7404 if (! single_pred_p (e->dest))
7406 split_loop_exit_edge (e);
7407 if (dump_enabled_p ())
7408 dump_printf (MSG_NOTE, "split exit edge of scalar loop\n");
7412 tree niters = vect_build_loop_niters (loop_vinfo);
7413 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = niters;
7414 tree nitersm1 = unshare_expr (LOOP_VINFO_NITERSM1 (loop_vinfo));
7415 bool niters_no_overflow = loop_niters_no_overflow (loop_vinfo);
7416 epilogue = vect_do_peeling (loop_vinfo, niters, nitersm1, &niters_vector, th,
7417 check_profitability, niters_no_overflow);
7418 if (niters_vector == NULL_TREE)
7420 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
7421 niters_vector
7422 = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
7423 LOOP_VINFO_INT_NITERS (loop_vinfo) / vf);
7424 else
7425 vect_gen_vector_loop_niters (loop_vinfo, niters, &niters_vector,
7426 niters_no_overflow);
7429 /* 1) Make sure the loop header has exactly two entries
7430 2) Make sure we have a preheader basic block. */
7432 gcc_assert (EDGE_COUNT (loop->header->preds) == 2);
7434 split_edge (loop_preheader_edge (loop));
7436 /* FORNOW: the vectorizer supports only loops which body consist
7437 of one basic block (header + empty latch). When the vectorizer will
7438 support more involved loop forms, the order by which the BBs are
7439 traversed need to be reconsidered. */
7441 for (i = 0; i < nbbs; i++)
7443 basic_block bb = bbs[i];
7444 stmt_vec_info stmt_info;
7446 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
7447 gsi_next (&si))
7449 gphi *phi = si.phi ();
7450 if (dump_enabled_p ())
7452 dump_printf_loc (MSG_NOTE, vect_location,
7453 "------>vectorizing phi: ");
7454 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
7456 stmt_info = vinfo_for_stmt (phi);
7457 if (!stmt_info)
7458 continue;
7460 if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
7461 vect_loop_kill_debug_uses (loop, phi);
7463 if (!STMT_VINFO_RELEVANT_P (stmt_info)
7464 && !STMT_VINFO_LIVE_P (stmt_info))
7465 continue;
7467 if (STMT_VINFO_VECTYPE (stmt_info)
7468 && (TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info))
7469 != (unsigned HOST_WIDE_INT) vf)
7470 && dump_enabled_p ())
7471 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
7473 if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
7474 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
7475 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
7476 && ! PURE_SLP_STMT (stmt_info))
7478 if (dump_enabled_p ())
7479 dump_printf_loc (MSG_NOTE, vect_location, "transform phi.\n");
7480 vect_transform_stmt (phi, NULL, NULL, NULL, NULL);
7484 pattern_stmt = NULL;
7485 for (gimple_stmt_iterator si = gsi_start_bb (bb);
7486 !gsi_end_p (si) || transform_pattern_stmt;)
7488 bool is_store;
7490 if (transform_pattern_stmt)
7491 stmt = pattern_stmt;
7492 else
7494 stmt = gsi_stmt (si);
7495 /* During vectorization remove existing clobber stmts. */
7496 if (gimple_clobber_p (stmt))
7498 unlink_stmt_vdef (stmt);
7499 gsi_remove (&si, true);
7500 release_defs (stmt);
7501 continue;
7505 if (dump_enabled_p ())
7507 dump_printf_loc (MSG_NOTE, vect_location,
7508 "------>vectorizing statement: ");
7509 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
7512 stmt_info = vinfo_for_stmt (stmt);
7514 /* vector stmts created in the outer-loop during vectorization of
7515 stmts in an inner-loop may not have a stmt_info, and do not
7516 need to be vectorized. */
7517 if (!stmt_info)
7519 gsi_next (&si);
7520 continue;
7523 if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
7524 vect_loop_kill_debug_uses (loop, stmt);
7526 if (!STMT_VINFO_RELEVANT_P (stmt_info)
7527 && !STMT_VINFO_LIVE_P (stmt_info))
7529 if (STMT_VINFO_IN_PATTERN_P (stmt_info)
7530 && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
7531 && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
7532 || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
7534 stmt = pattern_stmt;
7535 stmt_info = vinfo_for_stmt (stmt);
7537 else
7539 gsi_next (&si);
7540 continue;
7543 else if (STMT_VINFO_IN_PATTERN_P (stmt_info)
7544 && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
7545 && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
7546 || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
7547 transform_pattern_stmt = true;
7549 /* If pattern statement has def stmts, vectorize them too. */
7550 if (is_pattern_stmt_p (stmt_info))
7552 if (pattern_def_seq == NULL)
7554 pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
7555 pattern_def_si = gsi_start (pattern_def_seq);
7557 else if (!gsi_end_p (pattern_def_si))
7558 gsi_next (&pattern_def_si);
7559 if (pattern_def_seq != NULL)
7561 gimple *pattern_def_stmt = NULL;
7562 stmt_vec_info pattern_def_stmt_info = NULL;
7564 while (!gsi_end_p (pattern_def_si))
7566 pattern_def_stmt = gsi_stmt (pattern_def_si);
7567 pattern_def_stmt_info
7568 = vinfo_for_stmt (pattern_def_stmt);
7569 if (STMT_VINFO_RELEVANT_P (pattern_def_stmt_info)
7570 || STMT_VINFO_LIVE_P (pattern_def_stmt_info))
7571 break;
7572 gsi_next (&pattern_def_si);
7575 if (!gsi_end_p (pattern_def_si))
7577 if (dump_enabled_p ())
7579 dump_printf_loc (MSG_NOTE, vect_location,
7580 "==> vectorizing pattern def "
7581 "stmt: ");
7582 dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
7583 pattern_def_stmt, 0);
7586 stmt = pattern_def_stmt;
7587 stmt_info = pattern_def_stmt_info;
7589 else
7591 pattern_def_si = gsi_none ();
7592 transform_pattern_stmt = false;
7595 else
7596 transform_pattern_stmt = false;
7599 if (STMT_VINFO_VECTYPE (stmt_info))
7601 unsigned int nunits
7602 = (unsigned int)
7603 TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
7604 if (!STMT_SLP_TYPE (stmt_info)
7605 && nunits != (unsigned int) vf
7606 && dump_enabled_p ())
7607 /* For SLP VF is set according to unrolling factor, and not
7608 to vector size, hence for SLP this print is not valid. */
7609 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
7612 /* SLP. Schedule all the SLP instances when the first SLP stmt is
7613 reached. */
7614 if (STMT_SLP_TYPE (stmt_info))
7616 if (!slp_scheduled)
7618 slp_scheduled = true;
7620 if (dump_enabled_p ())
7621 dump_printf_loc (MSG_NOTE, vect_location,
7622 "=== scheduling SLP instances ===\n");
7624 vect_schedule_slp (loop_vinfo);
7627 /* Hybrid SLP stmts must be vectorized in addition to SLP. */
7628 if (!vinfo_for_stmt (stmt) || PURE_SLP_STMT (stmt_info))
7630 if (!transform_pattern_stmt && gsi_end_p (pattern_def_si))
7632 pattern_def_seq = NULL;
7633 gsi_next (&si);
7635 continue;
7639 /* -------- vectorize statement ------------ */
7640 if (dump_enabled_p ())
7641 dump_printf_loc (MSG_NOTE, vect_location, "transform statement.\n");
7643 grouped_store = false;
7644 is_store = vect_transform_stmt (stmt, &si, &grouped_store, NULL, NULL);
7645 if (is_store)
7647 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
7649 /* Interleaving. If IS_STORE is TRUE, the vectorization of the
7650 interleaving chain was completed - free all the stores in
7651 the chain. */
7652 gsi_next (&si);
7653 vect_remove_stores (GROUP_FIRST_ELEMENT (stmt_info));
7655 else
7657 /* Free the attached stmt_vec_info and remove the stmt. */
7658 gimple *store = gsi_stmt (si);
7659 free_stmt_vec_info (store);
7660 unlink_stmt_vdef (store);
7661 gsi_remove (&si, true);
7662 release_defs (store);
7665 /* Stores can only appear at the end of pattern statements. */
7666 gcc_assert (!transform_pattern_stmt);
7667 pattern_def_seq = NULL;
7669 else if (!transform_pattern_stmt && gsi_end_p (pattern_def_si))
7671 pattern_def_seq = NULL;
7672 gsi_next (&si);
7674 } /* stmts in BB */
7675 } /* BBs in loop */
7677 slpeel_make_loop_iterate_ntimes (loop, niters_vector);
7679 scale_profile_for_vect_loop (loop, vf);
7681 /* The minimum number of iterations performed by the epilogue. This
7682 is 1 when peeling for gaps because we always need a final scalar
7683 iteration. */
7684 int min_epilogue_iters = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0;
7685 /* +1 to convert latch counts to loop iteration counts,
7686 -min_epilogue_iters to remove iterations that cannot be performed
7687 by the vector code. */
7688 int bias = 1 - min_epilogue_iters;
7689 /* In these calculations the "- 1" converts loop iteration counts
7690 back to latch counts. */
7691 if (loop->any_upper_bound)
7692 loop->nb_iterations_upper_bound
7693 = wi::udiv_floor (loop->nb_iterations_upper_bound + bias, vf) - 1;
7694 if (loop->any_likely_upper_bound)
7695 loop->nb_iterations_likely_upper_bound
7696 = wi::udiv_floor (loop->nb_iterations_likely_upper_bound + bias, vf) - 1;
7697 if (loop->any_estimate)
7698 loop->nb_iterations_estimate
7699 = wi::udiv_floor (loop->nb_iterations_estimate + bias, vf) - 1;
7701 if (dump_enabled_p ())
7703 if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
7705 dump_printf_loc (MSG_NOTE, vect_location,
7706 "LOOP VECTORIZED\n");
7707 if (loop->inner)
7708 dump_printf_loc (MSG_NOTE, vect_location,
7709 "OUTER LOOP VECTORIZED\n");
7710 dump_printf (MSG_NOTE, "\n");
7712 else
7713 dump_printf_loc (MSG_NOTE, vect_location,
7714 "LOOP EPILOGUE VECTORIZED (VS=%d)\n",
7715 current_vector_size);
7718 /* Free SLP instances here because otherwise stmt reference counting
7719 won't work. */
7720 slp_instance instance;
7721 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
7722 vect_free_slp_instance (instance);
7723 LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
7724 /* Clear-up safelen field since its value is invalid after vectorization
7725 since vectorized loop can have loop-carried dependencies. */
7726 loop->safelen = 0;
7728 /* Don't vectorize epilogue for epilogue. */
7729 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
7730 epilogue = NULL;
7732 if (epilogue)
7734 unsigned int vector_sizes
7735 = targetm.vectorize.autovectorize_vector_sizes ();
7736 vector_sizes &= current_vector_size - 1;
7738 if (!PARAM_VALUE (PARAM_VECT_EPILOGUES_NOMASK))
7739 epilogue = NULL;
7740 else if (!vector_sizes)
7741 epilogue = NULL;
7742 else if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
7743 && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) >= 0)
7745 int smallest_vec_size = 1 << ctz_hwi (vector_sizes);
7746 int ratio = current_vector_size / smallest_vec_size;
7747 int eiters = LOOP_VINFO_INT_NITERS (loop_vinfo)
7748 - LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
7749 eiters = eiters % vf;
7751 epilogue->nb_iterations_upper_bound = eiters - 1;
7753 if (eiters < vf / ratio)
7754 epilogue = NULL;
7758 if (epilogue)
7760 epilogue->force_vectorize = loop->force_vectorize;
7761 epilogue->safelen = loop->safelen;
7762 epilogue->dont_vectorize = false;
7764 /* We may need to if-convert epilogue to vectorize it. */
7765 if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))
7766 tree_if_conversion (epilogue);
7769 return epilogue;
7772 /* The code below is trying to perform simple optimization - revert
7773 if-conversion for masked stores, i.e. if the mask of a store is zero
7774 do not perform it and all stored value producers also if possible.
7775 For example,
7776 for (i=0; i<n; i++)
7777 if (c[i])
7779 p1[i] += 1;
7780 p2[i] = p3[i] +2;
7782 this transformation will produce the following semi-hammock:
7784 if (!mask__ifc__42.18_165 == { 0, 0, 0, 0, 0, 0, 0, 0 })
7786 vect__11.19_170 = MASK_LOAD (vectp_p1.20_168, 0B, mask__ifc__42.18_165);
7787 vect__12.22_172 = vect__11.19_170 + vect_cst__171;
7788 MASK_STORE (vectp_p1.23_175, 0B, mask__ifc__42.18_165, vect__12.22_172);
7789 vect__18.25_182 = MASK_LOAD (vectp_p3.26_180, 0B, mask__ifc__42.18_165);
7790 vect__19.28_184 = vect__18.25_182 + vect_cst__183;
7791 MASK_STORE (vectp_p2.29_187, 0B, mask__ifc__42.18_165, vect__19.28_184);
7795 void
7796 optimize_mask_stores (struct loop *loop)
7798 basic_block *bbs = get_loop_body (loop);
7799 unsigned nbbs = loop->num_nodes;
7800 unsigned i;
7801 basic_block bb;
7802 struct loop *bb_loop;
7803 gimple_stmt_iterator gsi;
7804 gimple *stmt;
7805 auto_vec<gimple *> worklist;
7807 vect_location = find_loop_location (loop);
7808 /* Pick up all masked stores in loop if any. */
7809 for (i = 0; i < nbbs; i++)
7811 bb = bbs[i];
7812 for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
7813 gsi_next (&gsi))
7815 stmt = gsi_stmt (gsi);
7816 if (gimple_call_internal_p (stmt, IFN_MASK_STORE))
7817 worklist.safe_push (stmt);
7821 free (bbs);
7822 if (worklist.is_empty ())
7823 return;
7825 /* Loop has masked stores. */
7826 while (!worklist.is_empty ())
7828 gimple *last, *last_store;
7829 edge e, efalse;
7830 tree mask;
7831 basic_block store_bb, join_bb;
7832 gimple_stmt_iterator gsi_to;
7833 tree vdef, new_vdef;
7834 gphi *phi;
7835 tree vectype;
7836 tree zero;
7838 last = worklist.pop ();
7839 mask = gimple_call_arg (last, 2);
7840 bb = gimple_bb (last);
7841 /* Create then_bb and if-then structure in CFG, then_bb belongs to
7842 the same loop as if_bb. It could be different to LOOP when two
7843 level loop-nest is vectorized and mask_store belongs to the inner
7844 one. */
7845 e = split_block (bb, last);
7846 bb_loop = bb->loop_father;
7847 gcc_assert (loop == bb_loop || flow_loop_nested_p (loop, bb_loop));
7848 join_bb = e->dest;
7849 store_bb = create_empty_bb (bb);
7850 add_bb_to_loop (store_bb, bb_loop);
7851 e->flags = EDGE_TRUE_VALUE;
7852 efalse = make_edge (bb, store_bb, EDGE_FALSE_VALUE);
7853 /* Put STORE_BB to likely part. */
7854 efalse->probability = profile_probability::unlikely ();
7855 store_bb->count = efalse->count ();
7856 make_single_succ_edge (store_bb, join_bb, EDGE_FALLTHRU);
7857 if (dom_info_available_p (CDI_DOMINATORS))
7858 set_immediate_dominator (CDI_DOMINATORS, store_bb, bb);
7859 if (dump_enabled_p ())
7860 dump_printf_loc (MSG_NOTE, vect_location,
7861 "Create new block %d to sink mask stores.",
7862 store_bb->index);
7863 /* Create vector comparison with boolean result. */
7864 vectype = TREE_TYPE (mask);
7865 zero = build_zero_cst (vectype);
7866 stmt = gimple_build_cond (EQ_EXPR, mask, zero, NULL_TREE, NULL_TREE);
7867 gsi = gsi_last_bb (bb);
7868 gsi_insert_after (&gsi, stmt, GSI_SAME_STMT);
7869 /* Create new PHI node for vdef of the last masked store:
7870 .MEM_2 = VDEF <.MEM_1>
7871 will be converted to
7872 .MEM.3 = VDEF <.MEM_1>
7873 and new PHI node will be created in join bb
7874 .MEM_2 = PHI <.MEM_1, .MEM_3>
7876 vdef = gimple_vdef (last);
7877 new_vdef = make_ssa_name (gimple_vop (cfun), last);
7878 gimple_set_vdef (last, new_vdef);
7879 phi = create_phi_node (vdef, join_bb);
7880 add_phi_arg (phi, new_vdef, EDGE_SUCC (store_bb, 0), UNKNOWN_LOCATION);
7882 /* Put all masked stores with the same mask to STORE_BB if possible. */
7883 while (true)
7885 gimple_stmt_iterator gsi_from;
7886 gimple *stmt1 = NULL;
7888 /* Move masked store to STORE_BB. */
7889 last_store = last;
7890 gsi = gsi_for_stmt (last);
7891 gsi_from = gsi;
7892 /* Shift GSI to the previous stmt for further traversal. */
7893 gsi_prev (&gsi);
7894 gsi_to = gsi_start_bb (store_bb);
7895 gsi_move_before (&gsi_from, &gsi_to);
7896 /* Setup GSI_TO to the non-empty block start. */
7897 gsi_to = gsi_start_bb (store_bb);
7898 if (dump_enabled_p ())
7900 dump_printf_loc (MSG_NOTE, vect_location,
7901 "Move stmt to created bb\n");
7902 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, last, 0);
7904 /* Move all stored value producers if possible. */
7905 while (!gsi_end_p (gsi))
7907 tree lhs;
7908 imm_use_iterator imm_iter;
7909 use_operand_p use_p;
7910 bool res;
7912 /* Skip debug statements. */
7913 if (is_gimple_debug (gsi_stmt (gsi)))
7915 gsi_prev (&gsi);
7916 continue;
7918 stmt1 = gsi_stmt (gsi);
7919 /* Do not consider statements writing to memory or having
7920 volatile operand. */
7921 if (gimple_vdef (stmt1)
7922 || gimple_has_volatile_ops (stmt1))
7923 break;
7924 gsi_from = gsi;
7925 gsi_prev (&gsi);
7926 lhs = gimple_get_lhs (stmt1);
7927 if (!lhs)
7928 break;
7930 /* LHS of vectorized stmt must be SSA_NAME. */
7931 if (TREE_CODE (lhs) != SSA_NAME)
7932 break;
7934 if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
7936 /* Remove dead scalar statement. */
7937 if (has_zero_uses (lhs))
7939 gsi_remove (&gsi_from, true);
7940 continue;
7944 /* Check that LHS does not have uses outside of STORE_BB. */
7945 res = true;
7946 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
7948 gimple *use_stmt;
7949 use_stmt = USE_STMT (use_p);
7950 if (is_gimple_debug (use_stmt))
7951 continue;
7952 if (gimple_bb (use_stmt) != store_bb)
7954 res = false;
7955 break;
7958 if (!res)
7959 break;
7961 if (gimple_vuse (stmt1)
7962 && gimple_vuse (stmt1) != gimple_vuse (last_store))
7963 break;
7965 /* Can move STMT1 to STORE_BB. */
7966 if (dump_enabled_p ())
7968 dump_printf_loc (MSG_NOTE, vect_location,
7969 "Move stmt to created bb\n");
7970 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt1, 0);
7972 gsi_move_before (&gsi_from, &gsi_to);
7973 /* Shift GSI_TO for further insertion. */
7974 gsi_prev (&gsi_to);
7976 /* Put other masked stores with the same mask to STORE_BB. */
7977 if (worklist.is_empty ()
7978 || gimple_call_arg (worklist.last (), 2) != mask
7979 || worklist.last () != stmt1)
7980 break;
7981 last = worklist.pop ();
7983 add_phi_arg (phi, gimple_vuse (last_store), e, UNKNOWN_LOCATION);