Remove assert in get_def_bb_for_const
[official-gcc.git] / gcc / tree-vect-loop.c
blob 1471658da763d5f58df95023d0168f97b00fd5f3
1 /* Loop Vectorization
2 Copyright (C) 2003-2016 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com> and
4 Ira Rosen <irar@il.ibm.com>
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "rtl.h"
28 #include "tree.h"
29 #include "gimple.h"
30 #include "cfghooks.h"
31 #include "tree-pass.h"
32 #include "ssa.h"
33 #include "optabs-tree.h"
34 #include "diagnostic-core.h"
35 #include "fold-const.h"
36 #include "stor-layout.h"
37 #include "cfganal.h"
38 #include "gimplify.h"
39 #include "gimple-iterator.h"
40 #include "gimplify-me.h"
41 #include "tree-ssa-loop-ivopts.h"
42 #include "tree-ssa-loop-manip.h"
43 #include "tree-ssa-loop-niter.h"
44 #include "cfgloop.h"
45 #include "params.h"
46 #include "tree-scalar-evolution.h"
47 #include "tree-vectorizer.h"
48 #include "gimple-fold.h"
49 #include "cgraph.h"
51 /* Loop Vectorization Pass.
53 This pass tries to vectorize loops.
55 For example, the vectorizer transforms the following simple loop:
57 short a[N]; short b[N]; short c[N]; int i;
59 for (i=0; i<N; i++){
60 a[i] = b[i] + c[i];
63 as if it was manually vectorized by rewriting the source code into:
65 typedef int __attribute__((mode(V8HI))) v8hi;
66 short a[N]; short b[N]; short c[N]; int i;
67 v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
68 v8hi va, vb, vc;
70 for (i=0; i<N/8; i++){
71 vb = pb[i];
72 vc = pc[i];
73 va = vb + vc;
74 pa[i] = va;
77 The main entry to this pass is vectorize_loops(), in which
78 the vectorizer applies a set of analyses on a given set of loops,
79 followed by the actual vectorization transformation for the loops that
80 had successfully passed the analysis phase.
81 Throughout this pass we make a distinction between two types of
82 data: scalars (which are represented by SSA_NAMES), and memory references
83 ("data-refs"). These two types of data require different handling both
84 during analysis and transformation. The types of data-refs that the
85 vectorizer currently supports are ARRAY_REFS whose base is an array DECL
86 (not a pointer), and INDIRECT_REFS through pointers; both array and pointer
87 accesses are required to have a simple (consecutive) access pattern.
89 Analysis phase:
90 ===============
91 The driver for the analysis phase is vect_analyze_loop().
92 It applies a set of analyses, some of which rely on the scalar evolution
93 analyzer (scev) developed by Sebastian Pop.
95 During the analysis phase the vectorizer records some information
96 per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
97 loop, as well as general information about the loop as a whole, which is
98 recorded in a "loop_vec_info" struct attached to each loop.
100 Transformation phase:
101 =====================
102 The loop transformation phase scans all the stmts in the loop, and
103 creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
104 the loop that needs to be vectorized. It inserts the vector code sequence
105 just before the scalar stmt S, and records a pointer to the vector code
106 in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
107 attached to S). This pointer will be used for the vectorization of following
108 stmts which use the def of stmt S. Stmt S is removed if it writes to memory;
109 otherwise, we rely on dead code elimination for removing it.
111 For example, say stmt S1 was vectorized into stmt VS1:
113 VS1: vb = px[i];
114 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
115 S2: a = b;
117 To vectorize stmt S2, the vectorizer first finds the stmt that defines
118 the operand 'b' (S1), and gets the relevant vector def 'vb' from the
119 vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)). The
120 resulting sequence would be:
122 VS1: vb = px[i];
123 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
124 VS2: va = vb;
125 S2: a = b; STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2
127 Operands that are not SSA_NAMEs are data-refs that appear in
128 load/store operations (like 'x[i]' in S1), and are handled differently.
130 Target modeling:
131 =================
132 Currently the only target-specific information that is used is the
133 size of the vector (in bytes) - "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".
134 Targets that can support different vector sizes will, for now, need
135 to specify one value for "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD". More
136 flexibility will be added in the future.
138 Since we only vectorize operations whose vector form can be
139 expressed using existing tree codes, to verify that an operation is
140 supported, the vectorizer checks the relevant optab at the relevant
141 machine_mode (e.g., optab_handler (add_optab, V8HImode)). If
142 the value found is CODE_FOR_nothing, then there's no target support, and
143 we can't vectorize the stmt.
145 For additional information on this project see:
146 http://gcc.gnu.org/projects/tree-ssa/vectorization.html
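
     For instance, the target support check described above might look
     roughly like the following (an illustrative sketch, not code from this
     file; optab_for_tree_code, optab_handler, TYPE_MODE and CODE_FOR_nothing
     are existing GCC interfaces, but the real checks live in optabs-tree.c
     and the vectorizable_* routines):

        optab op = optab_for_tree_code (PLUS_EXPR, vectype, optab_default);
        if (!op
            || optab_handler (op, TYPE_MODE (vectype)) == CODE_FOR_nothing)
          return false;   -- no target support, the stmt can't be vectorized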
149 static void vect_estimate_min_profitable_iters (loop_vec_info, int *, int *);
151 /* Function vect_determine_vectorization_factor
153 Determine the vectorization factor (VF). VF is the number of data elements
154 that are operated upon in parallel in a single iteration of the vectorized
155 loop. For example, when vectorizing a loop that operates on 4-byte elements,
156 on a target with a vector size (VS) of 16 bytes, the VF is set to 4, since 4
157 elements can fit in a single vector register.
159 We currently support vectorization of loops in which all types operated upon
160 are of the same size. Therefore this function currently sets VF according to
161 the size of the types operated upon, and fails if there are multiple sizes
162 in the loop.
164 VF is also the factor by which the loop iterations are strip-mined, e.g.:
165 original loop:
166 for (i=0; i<N; i++){
167 a[i] = b[i] + c[i];
170 vectorized loop:
171 for (i=0; i<N; i+=VF){
172 a[i:VF] = b[i:VF] + c[i:VF];
176 static bool
177 vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
179 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
180 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
181 unsigned nbbs = loop->num_nodes;
182 unsigned int vectorization_factor = 0;
183 tree scalar_type;
184 gphi *phi;
185 tree vectype;
186 unsigned int nunits;
187 stmt_vec_info stmt_info;
188 unsigned i;
189 HOST_WIDE_INT dummy;
190 gimple *stmt, *pattern_stmt = NULL;
191 gimple_seq pattern_def_seq = NULL;
192 gimple_stmt_iterator pattern_def_si = gsi_none ();
193 bool analyze_pattern_stmt = false;
194 bool bool_result;
195 auto_vec<stmt_vec_info> mask_producers;
197 if (dump_enabled_p ())
198 dump_printf_loc (MSG_NOTE, vect_location,
199 "=== vect_determine_vectorization_factor ===\n");
201 for (i = 0; i < nbbs; i++)
203 basic_block bb = bbs[i];
205 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
206 gsi_next (&si))
208 phi = si.phi ();
209 stmt_info = vinfo_for_stmt (phi);
210 if (dump_enabled_p ())
212 dump_printf_loc (MSG_NOTE, vect_location, "==> examining phi: ");
213 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
214 dump_printf (MSG_NOTE, "\n");
217 gcc_assert (stmt_info);
219 if (STMT_VINFO_RELEVANT_P (stmt_info))
221 gcc_assert (!STMT_VINFO_VECTYPE (stmt_info));
222 scalar_type = TREE_TYPE (PHI_RESULT (phi));
224 if (dump_enabled_p ())
226 dump_printf_loc (MSG_NOTE, vect_location,
227 "get vectype for scalar type: ");
228 dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
229 dump_printf (MSG_NOTE, "\n");
232 vectype = get_vectype_for_scalar_type (scalar_type);
233 if (!vectype)
235 if (dump_enabled_p ())
237 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
238 "not vectorized: unsupported "
239 "data-type ");
240 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
241 scalar_type);
242 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
244 return false;
246 STMT_VINFO_VECTYPE (stmt_info) = vectype;
248 if (dump_enabled_p ())
250 dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
251 dump_generic_expr (MSG_NOTE, TDF_SLIM, vectype);
252 dump_printf (MSG_NOTE, "\n");
255 nunits = TYPE_VECTOR_SUBPARTS (vectype);
256 if (dump_enabled_p ())
257 dump_printf_loc (MSG_NOTE, vect_location, "nunits = %d\n",
258 nunits);
260 if (!vectorization_factor
261 || (nunits > vectorization_factor))
262 vectorization_factor = nunits;
266 for (gimple_stmt_iterator si = gsi_start_bb (bb);
267 !gsi_end_p (si) || analyze_pattern_stmt;)
269 tree vf_vectype;
271 if (analyze_pattern_stmt)
272 stmt = pattern_stmt;
273 else
274 stmt = gsi_stmt (si);
276 stmt_info = vinfo_for_stmt (stmt);
278 if (dump_enabled_p ())
280 dump_printf_loc (MSG_NOTE, vect_location,
281 "==> examining statement: ");
282 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
283 dump_printf (MSG_NOTE, "\n");
286 gcc_assert (stmt_info);
288 /* Skip stmts which do not need to be vectorized. */
289 if ((!STMT_VINFO_RELEVANT_P (stmt_info)
290 && !STMT_VINFO_LIVE_P (stmt_info))
291 || gimple_clobber_p (stmt))
293 if (STMT_VINFO_IN_PATTERN_P (stmt_info)
294 && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
295 && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
296 || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
298 stmt = pattern_stmt;
299 stmt_info = vinfo_for_stmt (pattern_stmt);
300 if (dump_enabled_p ())
302 dump_printf_loc (MSG_NOTE, vect_location,
303 "==> examining pattern statement: ");
304 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
305 dump_printf (MSG_NOTE, "\n");
308 else
310 if (dump_enabled_p ())
311 dump_printf_loc (MSG_NOTE, vect_location, "skip.\n");
312 gsi_next (&si);
313 continue;
316 else if (STMT_VINFO_IN_PATTERN_P (stmt_info)
317 && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
318 && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
319 || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
320 analyze_pattern_stmt = true;
322 /* If a pattern statement has def stmts, analyze them too. */
323 if (is_pattern_stmt_p (stmt_info))
325 if (pattern_def_seq == NULL)
327 pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
328 pattern_def_si = gsi_start (pattern_def_seq);
330 else if (!gsi_end_p (pattern_def_si))
331 gsi_next (&pattern_def_si);
332 if (pattern_def_seq != NULL)
334 gimple *pattern_def_stmt = NULL;
335 stmt_vec_info pattern_def_stmt_info = NULL;
337 while (!gsi_end_p (pattern_def_si))
339 pattern_def_stmt = gsi_stmt (pattern_def_si);
340 pattern_def_stmt_info
341 = vinfo_for_stmt (pattern_def_stmt);
342 if (STMT_VINFO_RELEVANT_P (pattern_def_stmt_info)
343 || STMT_VINFO_LIVE_P (pattern_def_stmt_info))
344 break;
345 gsi_next (&pattern_def_si);
348 if (!gsi_end_p (pattern_def_si))
350 if (dump_enabled_p ())
352 dump_printf_loc (MSG_NOTE, vect_location,
353 "==> examining pattern def stmt: ");
354 dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
355 pattern_def_stmt, 0);
356 dump_printf (MSG_NOTE, "\n");
359 stmt = pattern_def_stmt;
360 stmt_info = pattern_def_stmt_info;
362 else
364 pattern_def_si = gsi_none ();
365 analyze_pattern_stmt = false;
368 else
369 analyze_pattern_stmt = false;
372 if (gimple_get_lhs (stmt) == NULL_TREE
373 /* MASK_STORE has no lhs, but is ok. */
374 && (!is_gimple_call (stmt)
375 || !gimple_call_internal_p (stmt)
376 || gimple_call_internal_fn (stmt) != IFN_MASK_STORE))
378 if (is_gimple_call (stmt))
380 /* Ignore calls with no lhs. These must be calls to
381 #pragma omp simd functions, and what vectorization factor
382 it really needs can't be determined until
383 vectorizable_simd_clone_call. */
384 if (!analyze_pattern_stmt && gsi_end_p (pattern_def_si))
386 pattern_def_seq = NULL;
387 gsi_next (&si);
389 continue;
391 if (dump_enabled_p ())
393 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
394 "not vectorized: irregular stmt.");
395 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt,
397 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
399 return false;
402 if (VECTOR_MODE_P (TYPE_MODE (gimple_expr_type (stmt))))
404 if (dump_enabled_p ())
406 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
407 "not vectorized: vector stmt in loop:");
408 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
409 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
411 return false;
414 bool_result = false;
416 if (STMT_VINFO_VECTYPE (stmt_info))
418 /* The only case when a vectype had been already set is for stmts
419 that contain a dataref, or for "pattern-stmts" (stmts
420 generated by the vectorizer to represent/replace a certain
421 idiom). */
422 gcc_assert (STMT_VINFO_DATA_REF (stmt_info)
423 || is_pattern_stmt_p (stmt_info)
424 || !gsi_end_p (pattern_def_si));
425 vectype = STMT_VINFO_VECTYPE (stmt_info);
427 else
429 gcc_assert (!STMT_VINFO_DATA_REF (stmt_info));
430 if (is_gimple_call (stmt)
431 && gimple_call_internal_p (stmt)
432 && gimple_call_internal_fn (stmt) == IFN_MASK_STORE)
433 scalar_type = TREE_TYPE (gimple_call_arg (stmt, 3));
434 else
435 scalar_type = TREE_TYPE (gimple_get_lhs (stmt));
437 /* Bool ops don't participate in the vectorization factor
438 computation. For comparisons, use the compared types to
439 compute a factor. */
440 if (TREE_CODE (scalar_type) == BOOLEAN_TYPE
441 && is_gimple_assign (stmt)
442 && gimple_assign_rhs_code (stmt) != COND_EXPR)
444 if (STMT_VINFO_RELEVANT_P (stmt_info))
445 mask_producers.safe_push (stmt_info);
446 bool_result = true;
448 if (gimple_code (stmt) == GIMPLE_ASSIGN
449 && TREE_CODE_CLASS (gimple_assign_rhs_code (stmt))
450 == tcc_comparison
451 && TREE_CODE (TREE_TYPE (gimple_assign_rhs1 (stmt)))
452 != BOOLEAN_TYPE)
453 scalar_type = TREE_TYPE (gimple_assign_rhs1 (stmt));
454 else
456 if (!analyze_pattern_stmt && gsi_end_p (pattern_def_si))
458 pattern_def_seq = NULL;
459 gsi_next (&si);
461 continue;
465 if (dump_enabled_p ())
467 dump_printf_loc (MSG_NOTE, vect_location,
468 "get vectype for scalar type: ");
469 dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
470 dump_printf (MSG_NOTE, "\n");
472 vectype = get_vectype_for_scalar_type (scalar_type);
473 if (!vectype)
475 if (dump_enabled_p ())
477 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
478 "not vectorized: unsupported "
479 "data-type ");
480 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
481 scalar_type);
482 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
484 return false;
487 if (!bool_result)
488 STMT_VINFO_VECTYPE (stmt_info) = vectype;
490 if (dump_enabled_p ())
492 dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
493 dump_generic_expr (MSG_NOTE, TDF_SLIM, vectype);
494 dump_printf (MSG_NOTE, "\n");
498 /* Don't try to compute the VF from scalar types if the stmt
499 produces a boolean vector. Use the result vectype instead. */
500 if (VECTOR_BOOLEAN_TYPE_P (vectype))
501 vf_vectype = vectype;
502 else
504 /* The vectorization factor is according to the smallest
505 scalar type (or the largest vector size, but we only
506 support one vector size per loop). */
507 if (!bool_result)
508 scalar_type = vect_get_smallest_scalar_type (stmt, &dummy,
509 &dummy);
510 if (dump_enabled_p ())
512 dump_printf_loc (MSG_NOTE, vect_location,
513 "get vectype for scalar type: ");
514 dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
515 dump_printf (MSG_NOTE, "\n");
517 vf_vectype = get_vectype_for_scalar_type (scalar_type);
519 if (!vf_vectype)
521 if (dump_enabled_p ())
523 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
524 "not vectorized: unsupported data-type ");
525 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
526 scalar_type);
527 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
529 return false;
532 if ((GET_MODE_SIZE (TYPE_MODE (vectype))
533 != GET_MODE_SIZE (TYPE_MODE (vf_vectype))))
535 if (dump_enabled_p ())
537 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
538 "not vectorized: different sized vector "
539 "types in statement, ");
540 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
541 vectype);
542 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
543 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
544 vf_vectype);
545 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
547 return false;
550 if (dump_enabled_p ())
552 dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
553 dump_generic_expr (MSG_NOTE, TDF_SLIM, vf_vectype);
554 dump_printf (MSG_NOTE, "\n");
557 nunits = TYPE_VECTOR_SUBPARTS (vf_vectype);
558 if (dump_enabled_p ())
559 dump_printf_loc (MSG_NOTE, vect_location, "nunits = %d\n", nunits);
560 if (!vectorization_factor
561 || (nunits > vectorization_factor))
562 vectorization_factor = nunits;
564 if (!analyze_pattern_stmt && gsi_end_p (pattern_def_si))
566 pattern_def_seq = NULL;
567 gsi_next (&si);
572 /* TODO: Analyze cost. Decide if worth while to vectorize. */
573 if (dump_enabled_p ())
574 dump_printf_loc (MSG_NOTE, vect_location, "vectorization factor = %d\n",
575 vectorization_factor);
576 if (vectorization_factor <= 1)
578 if (dump_enabled_p ())
579 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
580 "not vectorized: unsupported data-type\n");
581 return false;
583 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
585 for (i = 0; i < mask_producers.length (); i++)
587 tree mask_type = NULL;
589 stmt = STMT_VINFO_STMT (mask_producers[i]);
591 if (gimple_code (stmt) == GIMPLE_ASSIGN
592 && TREE_CODE_CLASS (gimple_assign_rhs_code (stmt)) == tcc_comparison
593 && TREE_CODE (TREE_TYPE (gimple_assign_rhs1 (stmt))) != BOOLEAN_TYPE)
595 scalar_type = TREE_TYPE (gimple_assign_rhs1 (stmt));
596 mask_type = get_mask_type_for_scalar_type (scalar_type);
598 if (!mask_type)
600 if (dump_enabled_p ())
601 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
602 "not vectorized: unsupported mask\n");
603 return false;
606 else
608 tree rhs;
609 ssa_op_iter iter;
610 gimple *def_stmt;
611 enum vect_def_type dt;
613 FOR_EACH_SSA_TREE_OPERAND (rhs, stmt, iter, SSA_OP_USE)
615 if (!vect_is_simple_use (rhs, mask_producers[i]->vinfo,
616 &def_stmt, &dt, &vectype))
618 if (dump_enabled_p ())
620 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
621 "not vectorized: can't compute mask type "
622 "for statement, ");
623 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt,
625 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
627 return false;
630 /* No vectype probably means an external definition.
631 Allow it in case there is another operand from which
632 the mask type can be determined. */
633 if (!vectype)
634 continue;
636 if (!mask_type)
637 mask_type = vectype;
638 else if (TYPE_VECTOR_SUBPARTS (mask_type)
639 != TYPE_VECTOR_SUBPARTS (vectype))
641 if (dump_enabled_p ())
643 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
644 "not vectorized: different sized masks "
645 "types in statement, ");
646 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
647 mask_type);
648 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
649 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
650 vectype);
651 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
653 return false;
655 else if (VECTOR_BOOLEAN_TYPE_P (mask_type)
656 != VECTOR_BOOLEAN_TYPE_P (vectype))
658 if (dump_enabled_p ())
660 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
661 "not vectorized: mixed mask and "
662 "nonmask vector types in statement, ");
663 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
664 mask_type);
665 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
666 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
667 vectype);
668 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
670 return false;
674 /* We may compare boolean values loaded as a vector of integers.
675 Fix mask_type in such a case. */
676 if (mask_type
677 && !VECTOR_BOOLEAN_TYPE_P (mask_type)
678 && gimple_code (stmt) == GIMPLE_ASSIGN
679 && TREE_CODE_CLASS (gimple_assign_rhs_code (stmt)) == tcc_comparison)
680 mask_type = build_same_sized_truth_vector_type (mask_type);
683 /* A missing mask_type should mean a loop-invariant predicate.
684 This is probably a subject for optimization in
685 if-conversion. */
686 if (!mask_type)
688 if (dump_enabled_p ())
690 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
691 "not vectorized: can't compute mask type "
692 "for statement, ");
693 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt,
695 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
697 return false;
700 STMT_VINFO_VECTYPE (mask_producers[i]) = mask_type;
703 return true;
707 /* Function vect_is_simple_iv_evolution.
709 FORNOW: A simple evolution of an induction variable in the loop is
710 considered a polynomial evolution. */
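/* For illustration (an assumed example of typical scev output, not taken
   from this file): given

     for (i = 0; i < n; i++)
       p = p + 4;

   analyze_scalar_evolution describes p by the chrec {p_0, +, 4}_1, so *INIT
   is p_0 and *STEP is 4.  A degree-2 chrec such as {0, +, {1, +, 1}_1}_1 has
   a chrec as its evolution part and is rejected by the tree_is_chrec check
   below.  */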
712 static bool
713 vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init,
714 tree * step)
716 tree init_expr;
717 tree step_expr;
718 tree evolution_part = evolution_part_in_loop_num (access_fn, loop_nb);
719 basic_block bb;
721 /* When there is no evolution in this loop, the evolution function
722 is not "simple". */
723 if (evolution_part == NULL_TREE)
724 return false;
726 /* When the evolution is a polynomial of degree >= 2
727 the evolution function is not "simple". */
728 if (tree_is_chrec (evolution_part))
729 return false;
731 step_expr = evolution_part;
732 init_expr = unshare_expr (initial_condition_in_loop_num (access_fn, loop_nb));
734 if (dump_enabled_p ())
736 dump_printf_loc (MSG_NOTE, vect_location, "step: ");
737 dump_generic_expr (MSG_NOTE, TDF_SLIM, step_expr);
738 dump_printf (MSG_NOTE, ", init: ");
739 dump_generic_expr (MSG_NOTE, TDF_SLIM, init_expr);
740 dump_printf (MSG_NOTE, "\n");
743 *init = init_expr;
744 *step = step_expr;
746 if (TREE_CODE (step_expr) != INTEGER_CST
747 && (TREE_CODE (step_expr) != SSA_NAME
748 || ((bb = gimple_bb (SSA_NAME_DEF_STMT (step_expr)))
749 && flow_bb_inside_loop_p (get_loop (cfun, loop_nb), bb))
750 || (!INTEGRAL_TYPE_P (TREE_TYPE (step_expr))
751 && (!SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr))
752 || !flag_associative_math)))
753 && (TREE_CODE (step_expr) != REAL_CST
754 || !flag_associative_math))
756 if (dump_enabled_p ())
757 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
758 "step unknown.\n");
759 return false;
762 return true;
765 /* Function vect_analyze_scalar_cycles_1.
767 Examine the cross iteration def-use cycles of scalar variables
768 in LOOP. LOOP_VINFO represents the loop that is now being
769 considered for vectorization (can be LOOP, or an outer-loop
770 enclosing LOOP). */
772 static void
773 vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, struct loop *loop)
775 basic_block bb = loop->header;
776 tree init, step;
777 auto_vec<gimple *, 64> worklist;
778 gphi_iterator gsi;
779 bool double_reduc;
781 if (dump_enabled_p ())
782 dump_printf_loc (MSG_NOTE, vect_location,
783 "=== vect_analyze_scalar_cycles ===\n");
785 /* First - identify all inductions. Reduction detection assumes that all the
786 inductions have been identified, therefore, this order must not be
787 changed. */
788 for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
790 gphi *phi = gsi.phi ();
791 tree access_fn = NULL;
792 tree def = PHI_RESULT (phi);
793 stmt_vec_info stmt_vinfo = vinfo_for_stmt (phi);
795 if (dump_enabled_p ())
797 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: ");
798 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
799 dump_printf (MSG_NOTE, "\n");
802 /* Skip virtual phi's. The data dependences that are associated with
803 virtual defs/uses (i.e., memory accesses) are analyzed elsewhere. */
804 if (virtual_operand_p (def))
805 continue;
807 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_unknown_def_type;
809 /* Analyze the evolution function. */
810 access_fn = analyze_scalar_evolution (loop, def);
811 if (access_fn)
813 STRIP_NOPS (access_fn);
814 if (dump_enabled_p ())
816 dump_printf_loc (MSG_NOTE, vect_location,
817 "Access function of PHI: ");
818 dump_generic_expr (MSG_NOTE, TDF_SLIM, access_fn);
819 dump_printf (MSG_NOTE, "\n");
821 STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
822 = initial_condition_in_loop_num (access_fn, loop->num);
823 STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo)
824 = evolution_part_in_loop_num (access_fn, loop->num);
827 if (!access_fn
828 || !vect_is_simple_iv_evolution (loop->num, access_fn, &init, &step)
829 || (LOOP_VINFO_LOOP (loop_vinfo) != loop
830 && TREE_CODE (step) != INTEGER_CST))
832 worklist.safe_push (phi);
833 continue;
836 gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
837 != NULL_TREE);
838 gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo) != NULL_TREE);
840 if (dump_enabled_p ())
841 dump_printf_loc (MSG_NOTE, vect_location, "Detected induction.\n");
842 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_induction_def;
846 /* Second - identify all reductions and nested cycles. */
847 while (worklist.length () > 0)
849 gimple *phi = worklist.pop ();
850 tree def = PHI_RESULT (phi);
851 stmt_vec_info stmt_vinfo = vinfo_for_stmt (phi);
852 gimple *reduc_stmt;
853 bool nested_cycle;
855 if (dump_enabled_p ())
857 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: ");
858 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
859 dump_printf (MSG_NOTE, "\n");
862 gcc_assert (!virtual_operand_p (def)
863 && STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_unknown_def_type);
865 nested_cycle = (loop != LOOP_VINFO_LOOP (loop_vinfo));
866 reduc_stmt = vect_force_simple_reduction (loop_vinfo, phi, !nested_cycle,
867 &double_reduc, false);
868 if (reduc_stmt)
870 if (double_reduc)
872 if (dump_enabled_p ())
873 dump_printf_loc (MSG_NOTE, vect_location,
874 "Detected double reduction.\n");
876 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_double_reduction_def;
877 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
878 vect_double_reduction_def;
880 else
882 if (nested_cycle)
884 if (dump_enabled_p ())
885 dump_printf_loc (MSG_NOTE, vect_location,
886 "Detected vectorizable nested cycle.\n");
888 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_nested_cycle;
889 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
890 vect_nested_cycle;
892 else
894 if (dump_enabled_p ())
895 dump_printf_loc (MSG_NOTE, vect_location,
896 "Detected reduction.\n");
898 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def;
899 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
900 vect_reduction_def;
901 /* Store the reduction cycles for possible vectorization in
902 loop-aware SLP. */
903 LOOP_VINFO_REDUCTIONS (loop_vinfo).safe_push (reduc_stmt);
907 else
908 if (dump_enabled_p ())
909 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
910 "Unknown def-use cycle pattern.\n");
915 /* Function vect_analyze_scalar_cycles.
917 Examine the cross iteration def-use cycles of scalar variables, by
918 analyzing the loop-header PHIs of scalar variables. Classify each
919 cycle as one of the following: invariant, induction, reduction, unknown.
920 We do that for the loop represented by LOOP_VINFO, and also for its
921 inner-loop, if it exists.
922 Examples for scalar cycles:
924 Example1: reduction:
926 loop1:
927 for (i=0; i<N; i++)
928 sum += a[i];
930 Example2: induction:
932 loop2:
933 for (i=0; i<N; i++)
934 a[i] = i; */
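/* A further illustration (an assumption, not an example from this file):
   a cross-iteration cycle that is neither an induction nor a recognized
   reduction, e.g.

   loop3:
   for (i=0; i<N; i++)
     x = a[x];

   is left classified as vect_unknown_def_type and reported by
   vect_analyze_scalar_cycles_1 as an unknown def-use cycle pattern.  */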
936 static void
937 vect_analyze_scalar_cycles (loop_vec_info loop_vinfo)
939 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
941 vect_analyze_scalar_cycles_1 (loop_vinfo, loop);
943 /* When vectorizing an outer-loop, the inner-loop is executed sequentially.
944 Reductions in such an inner-loop therefore have different properties than
945 the reductions in the nest that gets vectorized:
946 1. When vectorized, they are executed in the same order as in the original
947 scalar loop, so we can't change the order of computation when
948 vectorizing them.
949 2. FIXME: Inner-loop reductions can be used in the inner-loop, so the
950 current checks are too strict. */
952 if (loop->inner)
953 vect_analyze_scalar_cycles_1 (loop_vinfo, loop->inner);
956 /* Transfer group and reduction information from STMT to its pattern stmt. */
958 static void
959 vect_fixup_reduc_chain (gimple *stmt)
961 gimple *firstp = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt));
962 gimple *stmtp;
963 gcc_assert (!GROUP_FIRST_ELEMENT (vinfo_for_stmt (firstp))
964 && GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)));
965 GROUP_SIZE (vinfo_for_stmt (firstp)) = GROUP_SIZE (vinfo_for_stmt (stmt));
968 stmtp = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt));
969 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmtp)) = firstp;
970 stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmt));
971 if (stmt)
972 GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmtp))
973 = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt));
975 while (stmt);
976 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmtp)) = vect_reduction_def;
979 /* Fixup scalar cycles that now have their stmts detected as patterns. */
981 static void
982 vect_fixup_scalar_cycles_with_patterns (loop_vec_info loop_vinfo)
984 gimple *first;
985 unsigned i;
987 FOR_EACH_VEC_ELT (LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo), i, first)
988 if (STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (first)))
990 gimple *next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (first));
991 while (next)
993 if (! STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (next)))
994 break;
995 next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
997 /* If not all stmts in the chain are patterns, try to handle
998 the chain without patterns. */
999 if (! next)
1001 vect_fixup_reduc_chain (first);
1002 LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo)[i]
1003 = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (first));
1008 /* Function vect_get_loop_niters.
1010 Determine how many iterations the loop is executed and place it
1011 in NUMBER_OF_ITERATIONS. Place the number of latch iterations
1012 in NUMBER_OF_ITERATIONSM1.
1014 Return the loop exit condition. */
1017 static gcond *
1018 vect_get_loop_niters (struct loop *loop, tree *number_of_iterations,
1019 tree *number_of_iterationsm1)
1021 tree niters;
1023 if (dump_enabled_p ())
1024 dump_printf_loc (MSG_NOTE, vect_location,
1025 "=== get_loop_niters ===\n");
1027 niters = number_of_latch_executions (loop);
1028 *number_of_iterationsm1 = niters;
1030 /* We want the number of loop header executions which is the number
1031 of latch executions plus one.
1032 ??? For UINT_MAX latch executions this number overflows to zero
1033 for loops like do { n++; } while (n != 0); */
1034 if (niters && !chrec_contains_undetermined (niters))
1035 niters = fold_build2 (PLUS_EXPR, TREE_TYPE (niters), unshare_expr (niters),
1036 build_int_cst (TREE_TYPE (niters), 1));
1037 *number_of_iterations = niters;
1039 return get_loop_exit_condition (loop);
1043 /* Function bb_in_loop_p
1045 Used as predicate for dfs order traversal of the loop bbs. */
1047 static bool
1048 bb_in_loop_p (const_basic_block bb, const void *data)
1050 const struct loop *const loop = (const struct loop *)data;
1051 if (flow_bb_inside_loop_p (loop, bb))
1052 return true;
1053 return false;
1057 /* Function new_loop_vec_info.
1059 Create and initialize a new loop_vec_info struct for LOOP, as well as
1060 stmt_vec_info structs for all the stmts in LOOP. */
1062 static loop_vec_info
1063 new_loop_vec_info (struct loop *loop)
1065 loop_vec_info res;
1066 basic_block *bbs;
1067 gimple_stmt_iterator si;
1068 unsigned int i, nbbs;
1070 res = (loop_vec_info) xcalloc (1, sizeof (struct _loop_vec_info));
1071 res->kind = vec_info::loop;
1072 LOOP_VINFO_LOOP (res) = loop;
1074 bbs = get_loop_body (loop);
1076 /* Create/Update stmt_info for all stmts in the loop. */
1077 for (i = 0; i < loop->num_nodes; i++)
1079 basic_block bb = bbs[i];
1081 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
1083 gimple *phi = gsi_stmt (si);
1084 gimple_set_uid (phi, 0);
1085 set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, res));
1088 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1090 gimple *stmt = gsi_stmt (si);
1091 gimple_set_uid (stmt, 0);
1092 set_vinfo_for_stmt (stmt, new_stmt_vec_info (stmt, res));
1096 /* CHECKME: We want to visit all BBs before their successors (except for
1097 latch blocks, for which this assertion wouldn't hold). In the simple
1098 case of the loop forms we allow, a dfs order of the BBs would be the same
1099 as a reversed postorder traversal, so we are safe. */
1101 free (bbs);
1102 bbs = XCNEWVEC (basic_block, loop->num_nodes);
1103 nbbs = dfs_enumerate_from (loop->header, 0, bb_in_loop_p,
1104 bbs, loop->num_nodes, loop);
1105 gcc_assert (nbbs == loop->num_nodes);
1107 LOOP_VINFO_BBS (res) = bbs;
1108 LOOP_VINFO_NITERSM1 (res) = NULL;
1109 LOOP_VINFO_NITERS (res) = NULL;
1110 LOOP_VINFO_NITERS_UNCHANGED (res) = NULL;
1111 LOOP_VINFO_COST_MODEL_THRESHOLD (res) = 0;
1112 LOOP_VINFO_VECTORIZABLE_P (res) = 0;
1113 LOOP_VINFO_PEELING_FOR_ALIGNMENT (res) = 0;
1114 LOOP_VINFO_VECT_FACTOR (res) = 0;
1115 LOOP_VINFO_LOOP_NEST (res) = vNULL;
1116 LOOP_VINFO_DATAREFS (res) = vNULL;
1117 LOOP_VINFO_DDRS (res) = vNULL;
1118 LOOP_VINFO_UNALIGNED_DR (res) = NULL;
1119 LOOP_VINFO_MAY_MISALIGN_STMTS (res) = vNULL;
1120 LOOP_VINFO_MAY_ALIAS_DDRS (res) = vNULL;
1121 LOOP_VINFO_GROUPED_STORES (res) = vNULL;
1122 LOOP_VINFO_REDUCTIONS (res) = vNULL;
1123 LOOP_VINFO_REDUCTION_CHAINS (res) = vNULL;
1124 LOOP_VINFO_SLP_INSTANCES (res) = vNULL;
1125 LOOP_VINFO_SLP_UNROLLING_FACTOR (res) = 1;
1126 LOOP_VINFO_TARGET_COST_DATA (res) = init_cost (loop);
1127 LOOP_VINFO_PEELING_FOR_GAPS (res) = false;
1128 LOOP_VINFO_PEELING_FOR_NITER (res) = false;
1129 LOOP_VINFO_OPERANDS_SWAPPED (res) = false;
1131 return res;
1135 /* Function destroy_loop_vec_info.
1137 Free LOOP_VINFO struct, as well as all the stmt_vec_info structs of all the
1138 stmts in the loop. */
1140 void
1141 destroy_loop_vec_info (loop_vec_info loop_vinfo, bool clean_stmts)
1143 struct loop *loop;
1144 basic_block *bbs;
1145 int nbbs;
1146 gimple_stmt_iterator si;
1147 int j;
1148 vec<slp_instance> slp_instances;
1149 slp_instance instance;
1150 bool swapped;
1152 if (!loop_vinfo)
1153 return;
1155 loop = LOOP_VINFO_LOOP (loop_vinfo);
1157 bbs = LOOP_VINFO_BBS (loop_vinfo);
1158 nbbs = clean_stmts ? loop->num_nodes : 0;
1159 swapped = LOOP_VINFO_OPERANDS_SWAPPED (loop_vinfo);
1161 for (j = 0; j < nbbs; j++)
1163 basic_block bb = bbs[j];
1164 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
1165 free_stmt_vec_info (gsi_stmt (si));
1167 for (si = gsi_start_bb (bb); !gsi_end_p (si); )
1169 gimple *stmt = gsi_stmt (si);
1171 /* We may have broken canonical form by moving a constant
1172 into RHS1 of a commutative op. Fix such occurrences. */
1173 if (swapped && is_gimple_assign (stmt))
1175 enum tree_code code = gimple_assign_rhs_code (stmt);
1177 if ((code == PLUS_EXPR
1178 || code == POINTER_PLUS_EXPR
1179 || code == MULT_EXPR)
1180 && CONSTANT_CLASS_P (gimple_assign_rhs1 (stmt)))
1181 swap_ssa_operands (stmt,
1182 gimple_assign_rhs1_ptr (stmt),
1183 gimple_assign_rhs2_ptr (stmt));
1186 /* Free stmt_vec_info. */
1187 free_stmt_vec_info (stmt);
1188 gsi_next (&si);
1192 free (LOOP_VINFO_BBS (loop_vinfo));
1193 vect_destroy_datarefs (loop_vinfo);
1194 free_dependence_relations (LOOP_VINFO_DDRS (loop_vinfo));
1195 LOOP_VINFO_LOOP_NEST (loop_vinfo).release ();
1196 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).release ();
1197 LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).release ();
1198 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).release ();
1199 slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
1200 FOR_EACH_VEC_ELT (slp_instances, j, instance)
1201 vect_free_slp_instance (instance);
1203 LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
1204 LOOP_VINFO_GROUPED_STORES (loop_vinfo).release ();
1205 LOOP_VINFO_REDUCTIONS (loop_vinfo).release ();
1206 LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).release ();
1208 destroy_cost_data (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo));
1209 loop_vinfo->scalar_cost_vec.release ();
1211 free (loop_vinfo);
1212 loop->aux = NULL;
1216 /* Calculate the cost of one scalar iteration of the loop. */
1217 static void
1218 vect_compute_single_scalar_iteration_cost (loop_vec_info loop_vinfo)
1220 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1221 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1222 int nbbs = loop->num_nodes, factor, scalar_single_iter_cost = 0;
1223 int innerloop_iters, i;
1225 /* Count statements in scalar loop. Using this as scalar cost for a single
1226 iteration for now.
1228 TODO: Add outer loop support.
1230 TODO: Consider assigning different costs to different scalar
1231 statements. */
1233 /* FORNOW. */
1234 innerloop_iters = 1;
1235 if (loop->inner)
1236 innerloop_iters = 50; /* FIXME */
1238 for (i = 0; i < nbbs; i++)
1240 gimple_stmt_iterator si;
1241 basic_block bb = bbs[i];
1243 if (bb->loop_father == loop->inner)
1244 factor = innerloop_iters;
1245 else
1246 factor = 1;
1248 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1250 gimple *stmt = gsi_stmt (si);
1251 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1253 if (!is_gimple_assign (stmt) && !is_gimple_call (stmt))
1254 continue;
1256 /* Skip stmts that are not vectorized inside the loop. */
1257 if (stmt_info
1258 && !STMT_VINFO_RELEVANT_P (stmt_info)
1259 && (!STMT_VINFO_LIVE_P (stmt_info)
1260 || !VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1261 && !STMT_VINFO_IN_PATTERN_P (stmt_info))
1262 continue;
1264 vect_cost_for_stmt kind;
1265 if (STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt)))
1267 if (DR_IS_READ (STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt))))
1268 kind = scalar_load;
1269 else
1270 kind = scalar_store;
1272 else
1273 kind = scalar_stmt;
1275 scalar_single_iter_cost
1276 += record_stmt_cost (&LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1277 factor, kind, NULL, 0, vect_prologue);
1280 LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo)
1281 = scalar_single_iter_cost;
1285 /* Function vect_analyze_loop_form_1.
1287 Verify that certain CFG restrictions hold, including:
1288 - the loop has a pre-header
1289 - the loop has a single entry and exit
1290 - the loop exit condition is simple enough, and the number of iterations
1291 can be analyzed (a countable loop). */
1293 bool
1294 vect_analyze_loop_form_1 (struct loop *loop, gcond **loop_cond,
1295 tree *number_of_iterationsm1,
1296 tree *number_of_iterations, gcond **inner_loop_cond)
1298 if (dump_enabled_p ())
1299 dump_printf_loc (MSG_NOTE, vect_location,
1300 "=== vect_analyze_loop_form ===\n");
1302 /* Different restrictions apply when we are considering an inner-most loop,
1303 vs. an outer (nested) loop.
1304 (FORNOW. May want to relax some of these restrictions in the future). */
1306 if (!loop->inner)
1308 /* Inner-most loop. We currently require that the number of BBs is
1309 exactly 2 (the header and latch). Vectorizable inner-most loops
1310 look like this:
1312 (pre-header)
1314 header <--------+
1315 | | |
1316 | +--> latch --+
1318 (exit-bb) */
1320 if (loop->num_nodes != 2)
1322 if (dump_enabled_p ())
1323 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1324 "not vectorized: control flow in loop.\n");
1325 return false;
1328 if (empty_block_p (loop->header))
1330 if (dump_enabled_p ())
1331 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1332 "not vectorized: empty loop.\n");
1333 return false;
1336 else
1338 struct loop *innerloop = loop->inner;
1339 edge entryedge;
1341 /* Nested loop. We currently require that the loop is doubly-nested,
1342 contains a single inner loop, and the number of BBs is exactly 5.
1343 Vectorizable outer-loops look like this:
1345 (pre-header)
1347 header <---+
1349 inner-loop |
1351 tail ------+
1353 (exit-bb)
1355 The inner-loop has the properties expected of inner-most loops
1356 as described above. */
1358 if ((loop->inner)->inner || (loop->inner)->next)
1360 if (dump_enabled_p ())
1361 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1362 "not vectorized: multiple nested loops.\n");
1363 return false;
1366 if (loop->num_nodes != 5)
1368 if (dump_enabled_p ())
1369 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1370 "not vectorized: control flow in loop.\n");
1371 return false;
1374 entryedge = loop_preheader_edge (innerloop);
1375 if (entryedge->src != loop->header
1376 || !single_exit (innerloop)
1377 || single_exit (innerloop)->dest != EDGE_PRED (loop->latch, 0)->src)
1379 if (dump_enabled_p ())
1380 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1381 "not vectorized: unsupported outerloop form.\n");
1382 return false;
1385 /* Analyze the inner-loop. */
1386 tree inner_niterm1, inner_niter;
1387 if (! vect_analyze_loop_form_1 (loop->inner, inner_loop_cond,
1388 &inner_niterm1, &inner_niter, NULL))
1390 if (dump_enabled_p ())
1391 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1392 "not vectorized: Bad inner loop.\n");
1393 return false;
1396 if (!expr_invariant_in_loop_p (loop, inner_niter))
1398 if (dump_enabled_p ())
1399 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1400 "not vectorized: inner-loop count not"
1401 " invariant.\n");
1402 return false;
1405 if (dump_enabled_p ())
1406 dump_printf_loc (MSG_NOTE, vect_location,
1407 "Considering outer-loop vectorization.\n");
1410 if (!single_exit (loop)
1411 || EDGE_COUNT (loop->header->preds) != 2)
1413 if (dump_enabled_p ())
1415 if (!single_exit (loop))
1416 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1417 "not vectorized: multiple exits.\n");
1418 else if (EDGE_COUNT (loop->header->preds) != 2)
1419 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1420 "not vectorized: too many incoming edges.\n");
1422 return false;
1425 /* We assume that the loop exit condition is at the end of the loop, i.e.,
1426 that the loop is represented as a do-while (with a proper if-guard
1427 before the loop if needed), where the loop header contains all the
1428 executable statements, and the latch is empty. */
1429 if (!empty_block_p (loop->latch)
1430 || !gimple_seq_empty_p (phi_nodes (loop->latch)))
1432 if (dump_enabled_p ())
1433 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1434 "not vectorized: latch block not empty.\n");
1435 return false;
1438 /* Make sure there exists a single-predecessor exit bb: */
1439 if (!single_pred_p (single_exit (loop)->dest))
1441 edge e = single_exit (loop);
1442 if (!(e->flags & EDGE_ABNORMAL))
1444 split_loop_exit_edge (e);
1445 if (dump_enabled_p ())
1446 dump_printf (MSG_NOTE, "split exit edge.\n");
1448 else
1450 if (dump_enabled_p ())
1451 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1452 "not vectorized: abnormal loop exit edge.\n");
1453 return false;
1457 *loop_cond = vect_get_loop_niters (loop, number_of_iterations,
1458 number_of_iterationsm1);
1459 if (!*loop_cond)
1461 if (dump_enabled_p ())
1462 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1463 "not vectorized: complicated exit condition.\n");
1464 return false;
1467 if (!*number_of_iterations
1468 || chrec_contains_undetermined (*number_of_iterations))
1470 if (dump_enabled_p ())
1471 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1472 "not vectorized: number of iterations cannot be "
1473 "computed.\n");
1474 return false;
1477 if (integer_zerop (*number_of_iterations))
1479 if (dump_enabled_p ())
1480 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1481 "not vectorized: number of iterations = 0.\n");
1482 return false;
1485 return true;
1488 /* Analyze LOOP form and return a loop_vec_info if it is of suitable form. */
1490 loop_vec_info
1491 vect_analyze_loop_form (struct loop *loop)
1493 tree number_of_iterations, number_of_iterationsm1;
1494 gcond *loop_cond, *inner_loop_cond = NULL;
1496 if (! vect_analyze_loop_form_1 (loop, &loop_cond, &number_of_iterationsm1,
1497 &number_of_iterations, &inner_loop_cond))
1498 return NULL;
1500 loop_vec_info loop_vinfo = new_loop_vec_info (loop);
1501 LOOP_VINFO_NITERSM1 (loop_vinfo) = number_of_iterationsm1;
1502 LOOP_VINFO_NITERS (loop_vinfo) = number_of_iterations;
1503 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = number_of_iterations;
1505 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1507 if (dump_enabled_p ())
1509 dump_printf_loc (MSG_NOTE, vect_location,
1510 "Symbolic number of iterations is ");
1511 dump_generic_expr (MSG_NOTE, TDF_DETAILS, number_of_iterations);
1512 dump_printf (MSG_NOTE, "\n");
1516 STMT_VINFO_TYPE (vinfo_for_stmt (loop_cond)) = loop_exit_ctrl_vec_info_type;
1517 if (inner_loop_cond)
1518 STMT_VINFO_TYPE (vinfo_for_stmt (inner_loop_cond))
1519 = loop_exit_ctrl_vec_info_type;
1521 gcc_assert (!loop->aux);
1522 loop->aux = loop_vinfo;
1523 return loop_vinfo;
1528 /* Scan the loop stmts and, depending on whether there are any (non-)SLP
1529 statements, update the vectorization factor. */
1531 static void
1532 vect_update_vf_for_slp (loop_vec_info loop_vinfo)
1534 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1535 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1536 int nbbs = loop->num_nodes;
1537 unsigned int vectorization_factor;
1538 int i;
1540 if (dump_enabled_p ())
1541 dump_printf_loc (MSG_NOTE, vect_location,
1542 "=== vect_update_vf_for_slp ===\n");
1544 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1545 gcc_assert (vectorization_factor != 0);
1547 /* If all the stmts in the loop can be SLPed, we perform only SLP, and
1548 the vectorization factor of the loop is the unrolling factor required by
1549 the SLP instances. If that unrolling factor is 1, we say that we
1550 perform pure SLP on the loop - cross-iteration parallelism is not
1551 exploited. */
1552 bool only_slp_in_loop = true;
1553 for (i = 0; i < nbbs; i++)
1555 basic_block bb = bbs[i];
1556 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1557 gsi_next (&si))
1559 gimple *stmt = gsi_stmt (si);
1560 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1561 if (STMT_VINFO_IN_PATTERN_P (stmt_info)
1562 && STMT_VINFO_RELATED_STMT (stmt_info))
1564 stmt = STMT_VINFO_RELATED_STMT (stmt_info);
1565 stmt_info = vinfo_for_stmt (stmt);
1567 if ((STMT_VINFO_RELEVANT_P (stmt_info)
1568 || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1569 && !PURE_SLP_STMT (stmt_info))
1570 /* STMT needs both SLP and loop-based vectorization. */
1571 only_slp_in_loop = false;
1575 if (only_slp_in_loop)
1576 vectorization_factor = LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo);
1577 else
1578 vectorization_factor
1579 = least_common_multiple (vectorization_factor,
1580 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo));
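/* For example (illustration only): a loop vectorization factor of 4
   combined with an SLP unrolling factor of 2 stays at 4, while a loop
   factor of 4 combined with an SLP unrolling factor of 8 becomes 8.  */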
1582 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
1583 if (dump_enabled_p ())
1584 dump_printf_loc (MSG_NOTE, vect_location,
1585 "Updating vectorization factor to %d\n",
1586 vectorization_factor);
1589 /* Function vect_analyze_loop_operations.
1591 Scan the loop stmts and make sure they are all vectorizable. */
1593 static bool
1594 vect_analyze_loop_operations (loop_vec_info loop_vinfo)
1596 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1597 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1598 int nbbs = loop->num_nodes;
1599 int i;
1600 stmt_vec_info stmt_info;
1601 bool need_to_vectorize = false;
1602 bool ok;
1604 if (dump_enabled_p ())
1605 dump_printf_loc (MSG_NOTE, vect_location,
1606 "=== vect_analyze_loop_operations ===\n");
1608 for (i = 0; i < nbbs; i++)
1610 basic_block bb = bbs[i];
1612 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
1613 gsi_next (&si))
1615 gphi *phi = si.phi ();
1616 ok = true;
1618 stmt_info = vinfo_for_stmt (phi);
1619 if (dump_enabled_p ())
1621 dump_printf_loc (MSG_NOTE, vect_location, "examining phi: ");
1622 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
1623 dump_printf (MSG_NOTE, "\n");
1625 if (virtual_operand_p (gimple_phi_result (phi)))
1626 continue;
1628 /* Inner-loop loop-closed exit phi in outer-loop vectorization
1629 (i.e., a phi in the tail of the outer-loop). */
1630 if (! is_loop_header_bb_p (bb))
1632 /* FORNOW: we currently don't support the case that these phis
1633 are not used in the outerloop (unless it is double reduction,
1634 i.e., this phi is vect_reduction_def), because this case
1635 requires us to actually do something here. */
1636 if ((!STMT_VINFO_RELEVANT_P (stmt_info)
1637 || STMT_VINFO_LIVE_P (stmt_info))
1638 && STMT_VINFO_DEF_TYPE (stmt_info)
1639 != vect_double_reduction_def)
1641 if (dump_enabled_p ())
1642 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1643 "Unsupported loop-closed phi in "
1644 "outer-loop.\n");
1645 return false;
1648 /* If PHI is used in the outer loop, we check that its operand
1649 is defined in the inner loop. */
1650 if (STMT_VINFO_RELEVANT_P (stmt_info))
1652 tree phi_op;
1653 gimple *op_def_stmt;
1655 if (gimple_phi_num_args (phi) != 1)
1656 return false;
1658 phi_op = PHI_ARG_DEF (phi, 0);
1659 if (TREE_CODE (phi_op) != SSA_NAME)
1660 return false;
1662 op_def_stmt = SSA_NAME_DEF_STMT (phi_op);
1663 if (gimple_nop_p (op_def_stmt)
1664 || !flow_bb_inside_loop_p (loop, gimple_bb (op_def_stmt))
1665 || !vinfo_for_stmt (op_def_stmt))
1666 return false;
1668 if (STMT_VINFO_RELEVANT (vinfo_for_stmt (op_def_stmt))
1669 != vect_used_in_outer
1670 && STMT_VINFO_RELEVANT (vinfo_for_stmt (op_def_stmt))
1671 != vect_used_in_outer_by_reduction)
1672 return false;
1675 continue;
1678 gcc_assert (stmt_info);
1680 if (STMT_VINFO_LIVE_P (stmt_info))
1682 /* FORNOW: not yet supported. */
1683 if (dump_enabled_p ())
1684 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1685 "not vectorized: value used after loop.\n");
1686 return false;
1689 if (STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope
1690 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
1692 /* A scalar-dependence cycle that we don't support. */
1693 if (dump_enabled_p ())
1694 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1695 "not vectorized: scalar dependence cycle.\n");
1696 return false;
1699 if (STMT_VINFO_RELEVANT_P (stmt_info))
1701 need_to_vectorize = true;
1702 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def)
1703 ok = vectorizable_induction (phi, NULL, NULL);
1706 if (!ok)
1708 if (dump_enabled_p ())
1710 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1711 "not vectorized: relevant phi not "
1712 "supported: ");
1713 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, phi, 0);
1714 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
1716 return false;
1720 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1721 gsi_next (&si))
1723 gimple *stmt = gsi_stmt (si);
1724 if (!gimple_clobber_p (stmt)
1725 && !vect_analyze_stmt (stmt, &need_to_vectorize, NULL))
1726 return false;
1728 } /* bbs */
1730 /* All operations in the loop are either irrelevant (they deal with loop
1731 control, or are dead), or only used outside the loop and can be moved
1732 out of the loop (e.g. invariants, inductions). The loop can be
1733 optimized away by scalar optimizations. We're better off not
1734 touching this loop. */
1735 if (!need_to_vectorize)
1737 if (dump_enabled_p ())
1738 dump_printf_loc (MSG_NOTE, vect_location,
1739 "All the computation can be taken out of the loop.\n");
1740 if (dump_enabled_p ())
1741 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1742 "not vectorized: redundant loop. no profit to "
1743 "vectorize.\n");
1744 return false;
1747 return true;
1751 /* Function vect_analyze_loop_2.
1753 Apply a set of analyses on LOOP, and create a loop_vec_info struct
1754 for it. The different analyses will record information in the
1755 loop_vec_info struct. */
1756 static bool
1757 vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal)
1759 bool ok;
1760 int max_vf = MAX_VECTORIZATION_FACTOR;
1761 int min_vf = 2;
1762 unsigned int n_stmts = 0;
1764 /* The first group of checks is independent of the vector size. */
1765 fatal = true;
1767 /* Find all data references in the loop (which correspond to vdefs/vuses)
1768 and analyze their evolution in the loop. */
1770 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1772 loop_p loop = LOOP_VINFO_LOOP (loop_vinfo);
1773 if (!find_loop_nest (loop, &LOOP_VINFO_LOOP_NEST (loop_vinfo)))
1775 if (dump_enabled_p ())
1776 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1777 "not vectorized: loop nest containing two "
1778 "or more consecutive inner loops cannot be "
1779 "vectorized\n");
1780 return false;
1783 for (unsigned i = 0; i < loop->num_nodes; i++)
1784 for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
1785 !gsi_end_p (gsi); gsi_next (&gsi))
1787 gimple *stmt = gsi_stmt (gsi);
1788 if (is_gimple_debug (stmt))
1789 continue;
1790 ++n_stmts;
1791 if (!find_data_references_in_stmt (loop, stmt,
1792 &LOOP_VINFO_DATAREFS (loop_vinfo)))
1794 if (is_gimple_call (stmt) && loop->safelen)
1796 tree fndecl = gimple_call_fndecl (stmt), op;
1797 if (fndecl != NULL_TREE)
1799 cgraph_node *node = cgraph_node::get (fndecl);
1800 if (node != NULL && node->simd_clones != NULL)
1802 unsigned int j, n = gimple_call_num_args (stmt);
1803 for (j = 0; j < n; j++)
1805 op = gimple_call_arg (stmt, j);
1806 if (DECL_P (op)
1807 || (REFERENCE_CLASS_P (op)
1808 && get_base_address (op)))
1809 break;
1811 op = gimple_call_lhs (stmt);
1812 /* Ignore #pragma omp declare simd functions
1813 if they don't have data references in the
1814 call stmt itself. */
1815 if (j == n
1816 && !(op
1817 && (DECL_P (op)
1818 || (REFERENCE_CLASS_P (op)
1819 && get_base_address (op)))))
1820 continue;
1824 if (dump_enabled_p ())
1825 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1826 "not vectorized: loop contains function "
1827 "calls or data references that cannot "
1828 "be analyzed\n");
1829 return false;
1833 /* Analyze the data references and also adjust the minimal
1834 vectorization factor according to the loads and stores. */
1836 ok = vect_analyze_data_refs (loop_vinfo, &min_vf);
1837 if (!ok)
1839 if (dump_enabled_p ())
1840 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1841 "bad data references.\n");
1842 return false;
1845 /* Classify all cross-iteration scalar data-flow cycles.
1846 Cross-iteration cycles caused by virtual phis are analyzed separately. */
1847 vect_analyze_scalar_cycles (loop_vinfo);
1849 vect_pattern_recog (loop_vinfo);
1851 vect_fixup_scalar_cycles_with_patterns (loop_vinfo);
1853 /* Analyze the access patterns of the data-refs in the loop (consecutive,
1854 complex, etc.). FORNOW: Only handle consecutive access pattern. */
1856 ok = vect_analyze_data_ref_accesses (loop_vinfo);
1857 if (!ok)
1859 if (dump_enabled_p ())
1860 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1861 "bad data access.\n");
1862 return false;
1865 /* Data-flow analysis to detect stmts that do not need to be vectorized. */
1867 ok = vect_mark_stmts_to_be_vectorized (loop_vinfo);
1868 if (!ok)
1870 if (dump_enabled_p ())
1871 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1872 "unexpected pattern.\n");
1873 return false;
1876 /* The rest of the analysis below depends on the vector size in some way, so from here on failures are no longer fatal. */
1877 fatal = false;
1879 /* Analyze data dependences between the data-refs in the loop
1880 and adjust the maximum vectorization factor according to
1881 the dependences.
1882 FORNOW: fail at the first data dependence that we encounter. */
1884 ok = vect_analyze_data_ref_dependences (loop_vinfo, &max_vf);
1885 if (!ok
1886 || max_vf < min_vf)
1888 if (dump_enabled_p ())
1889 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1890 "bad data dependence.\n");
1891 return false;
1894 ok = vect_determine_vectorization_factor (loop_vinfo);
1895 if (!ok)
1897 if (dump_enabled_p ())
1898 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1899 "can't determine vectorization factor.\n");
1900 return false;
1902 if (max_vf < LOOP_VINFO_VECT_FACTOR (loop_vinfo))
1904 if (dump_enabled_p ())
1905 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1906 "bad data dependence.\n");
1907 return false;
1910 /* Compute the scalar iteration cost. */
1911 vect_compute_single_scalar_iteration_cost (loop_vinfo);
1913 int saved_vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1914 HOST_WIDE_INT estimated_niter;
1915 unsigned th;
1916 int min_scalar_loop_bound;
1918 /* Check the SLP opportunities in the loop, analyze and build SLP trees. */
1919 ok = vect_analyze_slp (loop_vinfo, n_stmts);
1920 if (!ok)
1921 return false;
1923 /* If there are any SLP instances mark them as pure_slp. */
1924 bool slp = vect_make_slp_decision (loop_vinfo);
1925 if (slp)
1927 /* Find stmts that need to be both vectorized and SLPed. */
1928 vect_detect_hybrid_slp (loop_vinfo);
1930 /* Update the vectorization factor based on the SLP decision. */
1931 vect_update_vf_for_slp (loop_vinfo);
1934 /* This is the point where we can re-start analysis with SLP forced off. */
1935 start_over:
1937 /* Now the vectorization factor is final. */
1938 unsigned vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1939 gcc_assert (vectorization_factor != 0);
1941 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && dump_enabled_p ())
1942 dump_printf_loc (MSG_NOTE, vect_location,
1943 "vectorization_factor = %d, niters = "
1944 HOST_WIDE_INT_PRINT_DEC "\n", vectorization_factor,
1945 LOOP_VINFO_INT_NITERS (loop_vinfo));
1947 HOST_WIDE_INT max_niter
1948 = max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
1949 if ((LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1950 && (LOOP_VINFO_INT_NITERS (loop_vinfo) < vectorization_factor))
1951 || (max_niter != -1
1952 && (unsigned HOST_WIDE_INT) max_niter < vectorization_factor))
1954 if (dump_enabled_p ())
1955 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1956 "not vectorized: iteration count smaller than "
1957 "vectorization factor.\n");
1958 return false;
1961 /* Analyze the alignment of the data-refs in the loop.
1962 Fail if a data reference is found that cannot be vectorized. */
1964 ok = vect_analyze_data_refs_alignment (loop_vinfo);
1965 if (!ok)
1967 if (dump_enabled_p ())
1968 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1969 "bad data alignment.\n");
1970 return false;
1973 /* Prune the list of ddrs to be tested at run-time by versioning for alias.
1974 It is important to call pruning after vect_analyze_data_ref_accesses,
1975 since we use grouping information gathered by interleaving analysis. */
1976 ok = vect_prune_runtime_alias_test_list (loop_vinfo);
1977 if (!ok)
1979 if (dump_enabled_p ())
1980 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1981 "number of versioning for alias "
1982 "run-time tests exceeds %d "
1983 "(--param vect-max-version-for-alias-checks)\n",
1984 PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS));
1985 return false;
1988 /* This pass will decide on using loop versioning and/or loop peeling in
1989 order to enhance the alignment of data references in the loop. */
1990 ok = vect_enhance_data_refs_alignment (loop_vinfo);
1991 if (!ok)
1993 if (dump_enabled_p ())
1994 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1995 "bad data alignment.\n");
1996 return false;
1999 if (slp)
2001 /* Analyze operations in the SLP instances. Note this may
2002 remove unsupported SLP instances which makes the above
2003 SLP kind detection invalid. */
2004 unsigned old_size = LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length ();
2005 vect_slp_analyze_operations (LOOP_VINFO_SLP_INSTANCES (loop_vinfo),
2006 LOOP_VINFO_TARGET_COST_DATA (loop_vinfo));
2007 if (LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length () != old_size)
2008 goto again;
2011 /* Scan all the remaining operations in the loop that are not subject
2012 to SLP and make sure they are vectorizable. */
2013 ok = vect_analyze_loop_operations (loop_vinfo);
2014 if (!ok)
2016 if (dump_enabled_p ())
2017 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2018 "bad operation or unsupported loop bound.\n");
2019 return false;
2022 /* Analyze cost. Decide if worth while to vectorize. */
2023 int min_profitable_estimate, min_profitable_iters;
2024 vect_estimate_min_profitable_iters (loop_vinfo, &min_profitable_iters,
2025 &min_profitable_estimate);
2027 if (min_profitable_iters < 0)
2029 if (dump_enabled_p ())
2030 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2031 "not vectorized: vectorization not profitable.\n");
2032 if (dump_enabled_p ())
2033 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2034 "not vectorized: vector version will never be "
2035 "profitable.\n");
2036 goto again;
2039 min_scalar_loop_bound = ((PARAM_VALUE (PARAM_MIN_VECT_LOOP_BOUND)
2040 * vectorization_factor) - 1);
2042 /* Use the cost model only if it is more conservative than user specified
2043 threshold. */
2044 th = (unsigned) min_scalar_loop_bound;
2045 if (min_profitable_iters
2046 && (!min_scalar_loop_bound
2047 || min_profitable_iters > min_scalar_loop_bound))
2048 th = (unsigned) min_profitable_iters;
2050 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = th;
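/* For illustration, with made-up numbers: if PARAM_MIN_VECT_LOOP_BOUND is 2
   and the vectorization factor is 4, min_scalar_loop_bound is 2 * 4 - 1 = 7.
   A cost-model result of min_profitable_iters = 10 then yields th = 10 (the
   cost model is the more conservative bound), whereas a result of 5 leaves
   th = 7 (the user-specified bound wins).  */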
2052 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2053 && LOOP_VINFO_INT_NITERS (loop_vinfo) <= th)
2055 if (dump_enabled_p ())
2056 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2057 "not vectorized: vectorization not profitable.\n");
2058 if (dump_enabled_p ())
2059 dump_printf_loc (MSG_NOTE, vect_location,
2060 "not vectorized: iteration count smaller than user "
2061 "specified loop bound parameter or minimum profitable "
2062 "iterations (whichever is more conservative).\n");
2063 goto again;
2066 estimated_niter
2067 = estimated_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
2068 if (estimated_niter == -1)
2069 estimated_niter = max_niter;
2070 if (estimated_niter != -1
2071 && ((unsigned HOST_WIDE_INT) estimated_niter
2072 <= MAX (th, (unsigned)min_profitable_estimate)))
2074 if (dump_enabled_p ())
2075 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2076 "not vectorized: estimated iteration count too "
2077 "small.\n");
2078 if (dump_enabled_p ())
2079 dump_printf_loc (MSG_NOTE, vect_location,
2080 "not vectorized: estimated iteration count smaller "
2081 "than specified loop bound parameter or minimum "
2082 "profitable iterations (whichever is more "
2083 "conservative).\n");
2084 goto again;
2087 /* Decide whether we need to create an epilogue loop to handle
2088 remaining scalar iterations. */
2089 th = ((LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) + 1)
2090 / LOOP_VINFO_VECT_FACTOR (loop_vinfo))
2091 * LOOP_VINFO_VECT_FACTOR (loop_vinfo);
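/* E.g., with illustrative values: a cost-model threshold of 7 and a
   vectorization factor of 4 give th = ((7 + 1) / 4) * 4 = 8, i.e. the
   threshold plus one rounded down to a multiple of the vectorization
   factor.  */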
2093 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2094 && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) > 0)
2096 if (ctz_hwi (LOOP_VINFO_INT_NITERS (loop_vinfo)
2097 - LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo))
2098 < exact_log2 (LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
2099 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
2101 else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
2102 || (tree_ctz (LOOP_VINFO_NITERS (loop_vinfo))
2103 < (unsigned)exact_log2 (LOOP_VINFO_VECT_FACTOR (loop_vinfo))
2104 /* In case of versioning, check if the maximum number of
2105 iterations is greater than th. If they are identical,
2106 the epilogue is unnecessary. */
2107 && ((!LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo)
2108 && !LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
2109 || (unsigned HOST_WIDE_INT) max_niter > th)))
2110 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
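/* A worked example for the known-iteration-count case above, with
   illustrative numbers: for 100 iterations, 2 iterations peeled for
   alignment and a vectorization factor of 8, ctz (100 - 2) = ctz (98) = 1
   is smaller than log2 (8) = 3, i.e. the remaining 98 iterations are not a
   multiple of 8, so an epilogue loop is required.  */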
2112 /* If an epilogue loop is required make sure we can create one. */
2113 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2114 || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo))
2116 if (dump_enabled_p ())
2117 dump_printf_loc (MSG_NOTE, vect_location, "epilog loop required\n");
2118 if (!vect_can_advance_ivs_p (loop_vinfo)
2119 || !slpeel_can_duplicate_loop_p (LOOP_VINFO_LOOP (loop_vinfo),
2120 single_exit (LOOP_VINFO_LOOP
2121 (loop_vinfo))))
2123 if (dump_enabled_p ())
2124 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2125 "not vectorized: can't create required "
2126 "epilog loop\n");
2127 goto again;
2131 gcc_assert (vectorization_factor
2132 == (unsigned)LOOP_VINFO_VECT_FACTOR (loop_vinfo));
2134 /* Ok to vectorize! */
2135 return true;
2137 again:
2138 /* Try again with SLP forced off, but if we didn't do any SLP there is
2139 no point in re-trying. */
2140 if (!slp)
2141 return false;
2143 /* If there are reduction chains re-trying will fail anyway. */
2144 if (! LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).is_empty ())
2145 return false;
2147 /* Likewise if the grouped loads or stores in the SLP cannot be handled
2148 via interleaving or lane instructions. */
2149 slp_instance instance;
2150 slp_tree node;
2151 unsigned i, j;
2152 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
2154 stmt_vec_info vinfo;
2155 vinfo = vinfo_for_stmt
2156 (SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (instance))[0]);
2157 if (! STMT_VINFO_GROUPED_ACCESS (vinfo))
2158 continue;
2159 vinfo = vinfo_for_stmt (STMT_VINFO_GROUP_FIRST_ELEMENT (vinfo));
2160 unsigned int size = STMT_VINFO_GROUP_SIZE (vinfo);
2161 tree vectype = STMT_VINFO_VECTYPE (vinfo);
2162 if (! vect_store_lanes_supported (vectype, size)
2163 && ! vect_grouped_store_supported (vectype, size))
2164 return false;
2165 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), j, node)
2167 vinfo = vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (node)[0]);
2168 vinfo = vinfo_for_stmt (STMT_VINFO_GROUP_FIRST_ELEMENT (vinfo));
2169 size = STMT_VINFO_GROUP_SIZE (vinfo);
2170 vectype = STMT_VINFO_VECTYPE (vinfo);
2171 if (! vect_load_lanes_supported (vectype, size)
2172 && ! vect_grouped_load_supported (vectype, size))
2173 return false;
2177 if (dump_enabled_p ())
2178 dump_printf_loc (MSG_NOTE, vect_location,
2179 "re-trying with SLP disabled\n");
2181 /* Roll back state appropriately. No SLP this time. */
2182 slp = false;
2183 /* Restore the vectorization factor to what it was without SLP. */
2184 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = saved_vectorization_factor;
2185 /* Free the SLP instances. */
2186 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), j, instance)
2187 vect_free_slp_instance (instance);
2188 LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
2189 /* Reset SLP type to loop_vect on all stmts. */
2190 for (i = 0; i < LOOP_VINFO_LOOP (loop_vinfo)->num_nodes; ++i)
2192 basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
2193 for (gimple_stmt_iterator si = gsi_start_bb (bb);
2194 !gsi_end_p (si); gsi_next (&si))
2196 stmt_vec_info stmt_info = vinfo_for_stmt (gsi_stmt (si));
2197 STMT_SLP_TYPE (stmt_info) = loop_vect;
2198 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
2200 stmt_info = vinfo_for_stmt (STMT_VINFO_RELATED_STMT (stmt_info));
2201 STMT_SLP_TYPE (stmt_info) = loop_vect;
2202 for (gimple_stmt_iterator pi
2203 = gsi_start (STMT_VINFO_PATTERN_DEF_SEQ (stmt_info));
2204 !gsi_end_p (pi); gsi_next (&pi))
2206 gimple *pstmt = gsi_stmt (pi);
2207 STMT_SLP_TYPE (vinfo_for_stmt (pstmt)) = loop_vect;
2212 /* Free optimized alias test DDRS. */
2213 LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).release ();
2214 /* Reset target cost data. */
2215 destroy_cost_data (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo));
2216 LOOP_VINFO_TARGET_COST_DATA (loop_vinfo)
2217 = init_cost (LOOP_VINFO_LOOP (loop_vinfo));
2218 /* Reset assorted flags. */
2219 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
2220 LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = false;
2221 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = 0;
2223 goto start_over;
2226 /* Function vect_analyze_loop.
2228 Apply a set of analyses on LOOP, and create a loop_vec_info struct
2229 for it. The different analyses will record information in the
2230 loop_vec_info struct. */
2231 loop_vec_info
2232 vect_analyze_loop (struct loop *loop)
2234 loop_vec_info loop_vinfo;
2235 unsigned int vector_sizes;
2237 /* Autodetect first vector size we try. */
2238 current_vector_size = 0;
2239 vector_sizes = targetm.vectorize.autovectorize_vector_sizes ();
2241 if (dump_enabled_p ())
2242 dump_printf_loc (MSG_NOTE, vect_location,
2243 "===== analyze_loop_nest =====\n");
2245 if (loop_outer (loop)
2246 && loop_vec_info_for_loop (loop_outer (loop))
2247 && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop))))
2249 if (dump_enabled_p ())
2250 dump_printf_loc (MSG_NOTE, vect_location,
2251 "outer-loop already vectorized.\n");
2252 return NULL;
2255 while (1)
2257 /* Check the CFG characteristics of the loop (nesting, entry/exit). */
2258 loop_vinfo = vect_analyze_loop_form (loop);
2259 if (!loop_vinfo)
2261 if (dump_enabled_p ())
2262 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2263 "bad loop form.\n");
2264 return NULL;
2267 bool fatal = false;
2268 if (vect_analyze_loop_2 (loop_vinfo, fatal))
2270 LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1;
2272 return loop_vinfo;
2275 destroy_loop_vec_info (loop_vinfo, true);
2277 vector_sizes &= ~current_vector_size;
2278 if (fatal
2279 || vector_sizes == 0
2280 || current_vector_size == 0)
2281 return NULL;
2283 /* Try the next biggest vector size. */
2284 current_vector_size = 1 << floor_log2 (vector_sizes);
2285 if (dump_enabled_p ())
2286 dump_printf_loc (MSG_NOTE, vect_location,
2287 "***** Re-trying analysis with "
2288 "vector size %d\n", current_vector_size);
2293 /* Function reduction_code_for_scalar_code
2295 Input:
2296 CODE - tree_code of a reduction operations.
2298 Output:
2299 REDUC_CODE - the corresponding tree-code to be used to reduce the
2300 vector of partial results into a single scalar result, or ERROR_MARK
2301 if the operation is a supported reduction operation, but does not have
2302 such a tree-code.
2304 Return FALSE if CODE currently cannot be vectorized as reduction. */
2306 static bool
2307 reduction_code_for_scalar_code (enum tree_code code,
2308 enum tree_code *reduc_code)
2310 switch (code)
2312 case MAX_EXPR:
2313 *reduc_code = REDUC_MAX_EXPR;
2314 return true;
2316 case MIN_EXPR:
2317 *reduc_code = REDUC_MIN_EXPR;
2318 return true;
2320 case PLUS_EXPR:
2321 *reduc_code = REDUC_PLUS_EXPR;
2322 return true;
2324 case MULT_EXPR:
2325 case MINUS_EXPR:
2326 case BIT_IOR_EXPR:
2327 case BIT_XOR_EXPR:
2328 case BIT_AND_EXPR:
2329 *reduc_code = ERROR_MARK;
2330 return true;
2332 default:
2333 return false;
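/* For example, a PLUS_EXPR reduction maps to REDUC_PLUS_EXPR, which (when
   the target supports it) collapses a vector of partial sums into a single
   scalar in one statement, whereas codes such as BIT_AND_EXPR return
   ERROR_MARK above: the reduction is still vectorizable, but the final
   value typically has to be produced by the generic epilogue (vector
   shifts or element extracts) rather than by one reduction instruction.  */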
2338 /* Error reporting helper for vect_is_simple_reduction below. GIMPLE statement
2339 STMT is printed with a message MSG. */
2341 static void
2342 report_vect_op (int msg_type, gimple *stmt, const char *msg)
2344 dump_printf_loc (msg_type, vect_location, "%s", msg);
2345 dump_gimple_stmt (msg_type, TDF_SLIM, stmt, 0);
2346 dump_printf (msg_type, "\n");
2350 /* Detect SLP reduction of the form:
2352 #a1 = phi <a5, a0>
2353 a2 = operation (a1)
2354 a3 = operation (a2)
2355 a4 = operation (a3)
2356 a5 = operation (a4)
2358 #a = phi <a5>
2360 PHI is the reduction phi node (#a1 = phi <a5, a0> above)
2361 FIRST_STMT is the first reduction stmt in the chain
2362 (a2 = operation (a1)).
2364 Return TRUE if a reduction chain was detected. */
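/* As an illustrative source-level example, such a chain typically arises
   from a manually unrolled reduction like

     for (i = 0; i < n; i++)
       s = s + a[4*i] + a[4*i+1] + a[4*i+2] + a[4*i+3];

   where each addition uses the result of the previous one and only the
   final sum feeds back into the reduction phi.  */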
2366 static bool
2367 vect_is_slp_reduction (loop_vec_info loop_info, gimple *phi,
2368 gimple *first_stmt)
2370 struct loop *loop = (gimple_bb (phi))->loop_father;
2371 struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
2372 enum tree_code code;
2373 gimple *current_stmt = NULL, *loop_use_stmt = NULL, *first, *next_stmt;
2374 stmt_vec_info use_stmt_info, current_stmt_info;
2375 tree lhs;
2376 imm_use_iterator imm_iter;
2377 use_operand_p use_p;
2378 int nloop_uses, size = 0, n_out_of_loop_uses;
2379 bool found = false;
2381 if (loop != vect_loop)
2382 return false;
2384 lhs = PHI_RESULT (phi);
2385 code = gimple_assign_rhs_code (first_stmt);
2386 while (1)
2388 nloop_uses = 0;
2389 n_out_of_loop_uses = 0;
2390 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
2392 gimple *use_stmt = USE_STMT (use_p);
2393 if (is_gimple_debug (use_stmt))
2394 continue;
2396 /* Check if we got back to the reduction phi. */
2397 if (use_stmt == phi)
2399 loop_use_stmt = use_stmt;
2400 found = true;
2401 break;
2404 if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2406 loop_use_stmt = use_stmt;
2407 nloop_uses++;
2409 else
2410 n_out_of_loop_uses++;
2412 /* There can be either a single use in the loop or two uses in
2413 phi nodes. */
2414 if (nloop_uses > 1 || (n_out_of_loop_uses && nloop_uses))
2415 return false;
2418 if (found)
2419 break;
2421 /* We reached a statement with no loop uses. */
2422 if (nloop_uses == 0)
2423 return false;
2425 /* This is a loop exit phi, and we haven't reached the reduction phi. */
2426 if (gimple_code (loop_use_stmt) == GIMPLE_PHI)
2427 return false;
2429 if (!is_gimple_assign (loop_use_stmt)
2430 || code != gimple_assign_rhs_code (loop_use_stmt)
2431 || !flow_bb_inside_loop_p (loop, gimple_bb (loop_use_stmt)))
2432 return false;
2434 /* Insert USE_STMT into reduction chain. */
2435 use_stmt_info = vinfo_for_stmt (loop_use_stmt);
2436 if (current_stmt)
2438 current_stmt_info = vinfo_for_stmt (current_stmt);
2439 GROUP_NEXT_ELEMENT (current_stmt_info) = loop_use_stmt;
2440 GROUP_FIRST_ELEMENT (use_stmt_info)
2441 = GROUP_FIRST_ELEMENT (current_stmt_info);
2443 else
2444 GROUP_FIRST_ELEMENT (use_stmt_info) = loop_use_stmt;
2446 lhs = gimple_assign_lhs (loop_use_stmt);
2447 current_stmt = loop_use_stmt;
2448 size++;
2451 if (!found || loop_use_stmt != phi || size < 2)
2452 return false;
2454 /* Swap the operands, if needed, to make the reduction operand be the second
2455 operand. */
2456 lhs = PHI_RESULT (phi);
2457 next_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (current_stmt));
2458 while (next_stmt)
2460 if (gimple_assign_rhs2 (next_stmt) == lhs)
2462 tree op = gimple_assign_rhs1 (next_stmt);
2463 gimple *def_stmt = NULL;
2465 if (TREE_CODE (op) == SSA_NAME)
2466 def_stmt = SSA_NAME_DEF_STMT (op);
2468 /* Check that the other def is either defined in the loop
2469 ("vect_internal_def"), or it's an induction (defined by a
2470 loop-header phi-node). */
2471 if (def_stmt
2472 && gimple_bb (def_stmt)
2473 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2474 && (is_gimple_assign (def_stmt)
2475 || is_gimple_call (def_stmt)
2476 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2477 == vect_induction_def
2478 || (gimple_code (def_stmt) == GIMPLE_PHI
2479 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2480 == vect_internal_def
2481 && !is_loop_header_bb_p (gimple_bb (def_stmt)))))
2483 lhs = gimple_assign_lhs (next_stmt);
2484 next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
2485 continue;
2488 return false;
2490 else
2492 tree op = gimple_assign_rhs2 (next_stmt);
2493 gimple *def_stmt = NULL;
2495 if (TREE_CODE (op) == SSA_NAME)
2496 def_stmt = SSA_NAME_DEF_STMT (op);
2498 /* Check that the other def is either defined in the loop
2499 ("vect_internal_def"), or it's an induction (defined by a
2500 loop-header phi-node). */
2501 if (def_stmt
2502 && gimple_bb (def_stmt)
2503 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2504 && (is_gimple_assign (def_stmt)
2505 || is_gimple_call (def_stmt)
2506 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2507 == vect_induction_def
2508 || (gimple_code (def_stmt) == GIMPLE_PHI
2509 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2510 == vect_internal_def
2511 && !is_loop_header_bb_p (gimple_bb (def_stmt)))))
2513 if (dump_enabled_p ())
2515 dump_printf_loc (MSG_NOTE, vect_location, "swapping oprnds: ");
2516 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, next_stmt, 0);
2517 dump_printf (MSG_NOTE, "\n");
2520 swap_ssa_operands (next_stmt,
2521 gimple_assign_rhs1_ptr (next_stmt),
2522 gimple_assign_rhs2_ptr (next_stmt));
2523 update_stmt (next_stmt);
2525 if (CONSTANT_CLASS_P (gimple_assign_rhs1 (next_stmt)))
2526 LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true;
2528 else
2529 return false;
2532 lhs = gimple_assign_lhs (next_stmt);
2533 next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
2536 /* Save the chain for further analysis in SLP detection. */
2537 first = GROUP_FIRST_ELEMENT (vinfo_for_stmt (current_stmt));
2538 LOOP_VINFO_REDUCTION_CHAINS (loop_info).safe_push (first);
2539 GROUP_SIZE (vinfo_for_stmt (first)) = size;
2541 return true;
2545 /* Function vect_is_simple_reduction
2547 (1) Detect a cross-iteration def-use cycle that represents a simple
2548 reduction computation. We look for the following pattern:
2550 loop_header:
2551 a1 = phi < a0, a2 >
2552 a3 = ...
2553 a2 = operation (a3, a1)
     or
2557 a3 = ...
2558 loop_header:
2559 a1 = phi < a0, a2 >
2560 a2 = operation (a3, a1)
2562 such that:
2563 1. operation is commutative and associative and it is safe to
2564 change the order of the computation (if CHECK_REDUCTION is true)
2565 2. no uses for a2 in the loop (a2 is used out of the loop)
2566 3. no uses of a1 in the loop besides the reduction operation
2567 4. no uses of a1 outside the loop.
2569 Conditions 1,4 are tested here.
2570 Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.
2572 (2) Detect a cross-iteration def-use cycle in nested loops, i.e.,
2573 nested cycles, if CHECK_REDUCTION is false.
2575 (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double
2576 reductions:
2578 a1 = phi < a0, a2 >
2579 inner loop (def of a3)
2580 a2 = phi < a3 >
2582 (4) Detect condition expressions, i.e.:
2583 for (int i = 0; i < N; i++)
2584 if (a[i] < val)
2585 ret_val = a[i]; */
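/* For instance, the classic sum reduction

     for (i = 0; i < N; i++)
       sum += b[i];

   matches pattern (1) above: sum is the result of the loop-header phi (a1)
   and the addition that produces the new value of sum plays the role of
   a2 = operation (a3, a1).  */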
2589 static gimple *
2590 vect_is_simple_reduction (loop_vec_info loop_info, gimple *phi,
2591 bool check_reduction, bool *double_reduc,
2592 bool need_wrapping_integral_overflow,
2593 enum vect_reduction_type *v_reduc_type)
2595 struct loop *loop = (gimple_bb (phi))->loop_father;
2596 struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
2597 edge latch_e = loop_latch_edge (loop);
2598 tree loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
2599 gimple *def_stmt, *def1 = NULL, *def2 = NULL, *phi_use_stmt = NULL;
2600 enum tree_code orig_code, code;
2601 tree op1, op2, op3 = NULL_TREE, op4 = NULL_TREE;
2602 tree type;
2603 int nloop_uses;
2604 tree name;
2605 imm_use_iterator imm_iter;
2606 use_operand_p use_p;
2607 bool phi_def;
2609 *double_reduc = false;
2610 *v_reduc_type = TREE_CODE_REDUCTION;
2612 /* If CHECK_REDUCTION is true, we assume inner-most loop vectorization,
2613 otherwise, we assume outer loop vectorization. */
2614 gcc_assert ((check_reduction && loop == vect_loop)
2615 || (!check_reduction && flow_loop_nested_p (vect_loop, loop)));
2617 name = PHI_RESULT (phi);
2618 /* ??? If there are no uses of the PHI result the inner loop reduction
2619 won't be detected as possibly double-reduction by vectorizable_reduction
2620 because that tries to walk the PHI arg from the preheader edge which
2621 can be constant. See PR60382. */
2622 if (has_zero_uses (name))
2623 return NULL;
2624 nloop_uses = 0;
2625 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name)
2627 gimple *use_stmt = USE_STMT (use_p);
2628 if (is_gimple_debug (use_stmt))
2629 continue;
2631 if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2633 if (dump_enabled_p ())
2634 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2635 "intermediate value used outside loop.\n");
2637 return NULL;
2640 nloop_uses++;
2641 if (nloop_uses > 1)
2643 if (dump_enabled_p ())
2644 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2645 "reduction used in loop.\n");
2646 return NULL;
2649 phi_use_stmt = use_stmt;
2652 if (TREE_CODE (loop_arg) != SSA_NAME)
2654 if (dump_enabled_p ())
2656 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2657 "reduction: not ssa_name: ");
2658 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, loop_arg);
2659 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
2661 return NULL;
2664 def_stmt = SSA_NAME_DEF_STMT (loop_arg);
2665 if (!def_stmt)
2667 if (dump_enabled_p ())
2668 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2669 "reduction: no def_stmt.\n");
2670 return NULL;
2673 if (!is_gimple_assign (def_stmt) && gimple_code (def_stmt) != GIMPLE_PHI)
2675 if (dump_enabled_p ())
2677 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, def_stmt, 0);
2678 dump_printf (MSG_NOTE, "\n");
2680 return NULL;
2683 if (is_gimple_assign (def_stmt))
2685 name = gimple_assign_lhs (def_stmt);
2686 phi_def = false;
2688 else
2690 name = PHI_RESULT (def_stmt);
2691 phi_def = true;
2694 nloop_uses = 0;
2695 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name)
2697 gimple *use_stmt = USE_STMT (use_p);
2698 if (is_gimple_debug (use_stmt))
2699 continue;
2700 if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2701 nloop_uses++;
2702 if (nloop_uses > 1)
2704 if (dump_enabled_p ())
2705 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2706 "reduction used in loop.\n");
2707 return NULL;
2711 /* If DEF_STMT is a phi node itself, we expect it to have a single argument
2712 defined in the inner loop. */
2713 if (phi_def)
2715 op1 = PHI_ARG_DEF (def_stmt, 0);
2717 if (gimple_phi_num_args (def_stmt) != 1
2718 || TREE_CODE (op1) != SSA_NAME)
2720 if (dump_enabled_p ())
2721 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2722 "unsupported phi node definition.\n");
2724 return NULL;
2727 def1 = SSA_NAME_DEF_STMT (op1);
2728 if (gimple_bb (def1)
2729 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2730 && loop->inner
2731 && flow_bb_inside_loop_p (loop->inner, gimple_bb (def1))
2732 && is_gimple_assign (def1)
2733 && flow_bb_inside_loop_p (loop->inner, gimple_bb (phi_use_stmt)))
2735 if (dump_enabled_p ())
2736 report_vect_op (MSG_NOTE, def_stmt,
2737 "detected double reduction: ");
2739 *double_reduc = true;
2740 return def_stmt;
2743 return NULL;
2746 code = orig_code = gimple_assign_rhs_code (def_stmt);
2748 /* We can handle "res -= x[i]", which is non-associative by
2749 simply rewriting this into "res += -x[i]". Avoid changing
2750 gimple instruction for the first simple tests and only do this
2751 if we're allowed to change code at all. */
2752 if (code == MINUS_EXPR
2753 && (op1 = gimple_assign_rhs1 (def_stmt))
2754 && TREE_CODE (op1) == SSA_NAME
2755 && SSA_NAME_DEF_STMT (op1) == phi)
2756 code = PLUS_EXPR;
2758 if (code == COND_EXPR)
2760 if (check_reduction)
2761 *v_reduc_type = COND_REDUCTION;
2763 else if (!commutative_tree_code (code) || !associative_tree_code (code))
2765 if (dump_enabled_p ())
2766 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2767 "reduction: not commutative/associative: ");
2768 return NULL;
2771 if (get_gimple_rhs_class (code) != GIMPLE_BINARY_RHS)
2773 if (code != COND_EXPR)
2775 if (dump_enabled_p ())
2776 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2777 "reduction: not binary operation: ");
2779 return NULL;
2782 op3 = gimple_assign_rhs1 (def_stmt);
2783 if (COMPARISON_CLASS_P (op3))
2785 op4 = TREE_OPERAND (op3, 1);
2786 op3 = TREE_OPERAND (op3, 0);
2789 op1 = gimple_assign_rhs2 (def_stmt);
2790 op2 = gimple_assign_rhs3 (def_stmt);
2792 if (TREE_CODE (op1) != SSA_NAME && TREE_CODE (op2) != SSA_NAME)
2794 if (dump_enabled_p ())
2795 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2796 "reduction: uses not ssa_names: ");
2798 return NULL;
2801 else
2803 op1 = gimple_assign_rhs1 (def_stmt);
2804 op2 = gimple_assign_rhs2 (def_stmt);
2806 if (TREE_CODE (op1) != SSA_NAME && TREE_CODE (op2) != SSA_NAME)
2808 if (dump_enabled_p ())
2809 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2810 "reduction: uses not ssa_names: ");
2812 return NULL;
2816 type = TREE_TYPE (gimple_assign_lhs (def_stmt));
2817 if ((TREE_CODE (op1) == SSA_NAME
2818 && !types_compatible_p (type,TREE_TYPE (op1)))
2819 || (TREE_CODE (op2) == SSA_NAME
2820 && !types_compatible_p (type, TREE_TYPE (op2)))
2821 || (op3 && TREE_CODE (op3) == SSA_NAME
2822 && !types_compatible_p (type, TREE_TYPE (op3)))
2823 || (op4 && TREE_CODE (op4) == SSA_NAME
2824 && !types_compatible_p (type, TREE_TYPE (op4))))
2826 if (dump_enabled_p ())
2828 dump_printf_loc (MSG_NOTE, vect_location,
2829 "reduction: multiple types: operation type: ");
2830 dump_generic_expr (MSG_NOTE, TDF_SLIM, type);
2831 dump_printf (MSG_NOTE, ", operands types: ");
2832 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2833 TREE_TYPE (op1));
2834 dump_printf (MSG_NOTE, ",");
2835 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2836 TREE_TYPE (op2));
2837 if (op3)
2839 dump_printf (MSG_NOTE, ",");
2840 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2841 TREE_TYPE (op3));
2844 if (op4)
2846 dump_printf (MSG_NOTE, ",");
2847 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2848 TREE_TYPE (op4));
2850 dump_printf (MSG_NOTE, "\n");
2853 return NULL;
2856 /* Check that it's ok to change the order of the computation.
2857 Generally, when vectorizing a reduction we change the order of the
2858 computation. This may change the behavior of the program in some
2859 cases, so we need to check that this is ok. One exception is when
2860 vectorizing an outer-loop: the inner-loop is executed sequentially,
2861 and therefore vectorizing reductions in the inner-loop during
2862 outer-loop vectorization is safe. */
2864 if (*v_reduc_type != COND_REDUCTION
2865 && check_reduction)
2867 /* CHECKME: check for !flag_finite_math_only too? */
2868 if (SCALAR_FLOAT_TYPE_P (type) && !flag_associative_math)
2870 /* Changing the order of operations changes the semantics. */
2871 if (dump_enabled_p ())
2872 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2873 "reduction: unsafe fp math optimization: ");
2874 return NULL;
2876 else if (INTEGRAL_TYPE_P (type))
2878 if (!operation_no_trapping_overflow (type, code))
2880 /* Changing the order of operations changes the semantics. */
2881 if (dump_enabled_p ())
2882 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2883 "reduction: unsafe int math optimization"
2884 " (overflow traps): ");
2885 return NULL;
2887 if (need_wrapping_integral_overflow
2888 && !TYPE_OVERFLOW_WRAPS (type)
2889 && operation_can_overflow (code))
2891 /* Changing the order of operations changes the semantics. */
2892 if (dump_enabled_p ())
2893 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2894 "reduction: unsafe int math optimization"
2895 " (overflow doesn't wrap): ");
2896 return NULL;
2899 else if (SAT_FIXED_POINT_TYPE_P (type))
2901 /* Changing the order of operations changes the semantics. */
2902 if (dump_enabled_p ())
2903 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2904 "reduction: unsafe fixed-point math optimization: ");
2905 return NULL;
2909 /* Reduction is safe. We're dealing with one of the following:
2910 1) integer arithmetic and no trapv
2911 2) floating point arithmetic, and special flags permit this optimization
2912 3) nested cycle (i.e., outer loop vectorization). */
2913 if (TREE_CODE (op1) == SSA_NAME)
2914 def1 = SSA_NAME_DEF_STMT (op1);
2916 if (TREE_CODE (op2) == SSA_NAME)
2917 def2 = SSA_NAME_DEF_STMT (op2);
2919 if (code != COND_EXPR
2920 && ((!def1 || gimple_nop_p (def1)) && (!def2 || gimple_nop_p (def2))))
2922 if (dump_enabled_p ())
2923 report_vect_op (MSG_NOTE, def_stmt, "reduction: no defs for operands: ");
2924 return NULL;
2927 /* Check that one def is the reduction def, defined by PHI,
2928 the other def is either defined in the loop ("vect_internal_def"),
2929 or it's an induction (defined by a loop-header phi-node). */
2931 if (def2 && def2 == phi
2932 && (code == COND_EXPR
2933 || !def1 || gimple_nop_p (def1)
2934 || !flow_bb_inside_loop_p (loop, gimple_bb (def1))
2935 || (def1 && flow_bb_inside_loop_p (loop, gimple_bb (def1))
2936 && (is_gimple_assign (def1)
2937 || is_gimple_call (def1)
2938 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1))
2939 == vect_induction_def
2940 || (gimple_code (def1) == GIMPLE_PHI
2941 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1))
2942 == vect_internal_def
2943 && !is_loop_header_bb_p (gimple_bb (def1)))))))
2945 if (dump_enabled_p ())
2946 report_vect_op (MSG_NOTE, def_stmt, "detected reduction: ");
2947 return def_stmt;
2950 if (def1 && def1 == phi
2951 && (code == COND_EXPR
2952 || !def2 || gimple_nop_p (def2)
2953 || !flow_bb_inside_loop_p (loop, gimple_bb (def2))
2954 || (def2 && flow_bb_inside_loop_p (loop, gimple_bb (def2))
2955 && (is_gimple_assign (def2)
2956 || is_gimple_call (def2)
2957 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2))
2958 == vect_induction_def
2959 || (gimple_code (def2) == GIMPLE_PHI
2960 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2))
2961 == vect_internal_def
2962 && !is_loop_header_bb_p (gimple_bb (def2)))))))
2964 if (check_reduction
2965 && orig_code != MINUS_EXPR)
2967 if (code == COND_EXPR)
2969 /* No current known use where this case would be useful. */
2970 if (dump_enabled_p ())
2971 report_vect_op (MSG_NOTE, def_stmt,
2972 "detected reduction: cannot currently swap "
2973 "operands for cond_expr");
2974 return NULL;
2977 /* Swap operands (just for simplicity - so that the rest of the code
2978 can assume that the reduction variable is always the last (second)
2979 argument). */
2980 if (dump_enabled_p ())
2981 report_vect_op (MSG_NOTE, def_stmt,
2982 "detected reduction: need to swap operands: ");
2984 swap_ssa_operands (def_stmt, gimple_assign_rhs1_ptr (def_stmt),
2985 gimple_assign_rhs2_ptr (def_stmt));
2987 if (CONSTANT_CLASS_P (gimple_assign_rhs1 (def_stmt)))
2988 LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true;
2990 else
2992 if (dump_enabled_p ())
2993 report_vect_op (MSG_NOTE, def_stmt, "detected reduction: ");
2996 return def_stmt;
2999 /* Try to find SLP reduction chain. */
3000 if (check_reduction && code != COND_EXPR
3001 && vect_is_slp_reduction (loop_info, phi, def_stmt))
3003 if (dump_enabled_p ())
3004 report_vect_op (MSG_NOTE, def_stmt,
3005 "reduction: detected reduction chain: ");
3007 return def_stmt;
3010 if (dump_enabled_p ())
3011 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3012 "reduction: unknown pattern: ");
3014 return NULL;
3017 /* Wrapper around vect_is_simple_reduction, which will modify code
3018 in-place if it enables detection of more reductions. Arguments
3019 as there. */
3021 gimple *
3022 vect_force_simple_reduction (loop_vec_info loop_info, gimple *phi,
3023 bool check_reduction, bool *double_reduc,
3024 bool need_wrapping_integral_overflow)
3026 enum vect_reduction_type v_reduc_type;
3027 return vect_is_simple_reduction (loop_info, phi, check_reduction,
3028 double_reduc,
3029 need_wrapping_integral_overflow,
3030 &v_reduc_type);
3033 /* Calculate cost of peeling the loop PEEL_ITERS_PROLOGUE times. */
int
3035 vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue,
3036 int *peel_iters_epilogue,
3037 stmt_vector_for_cost *scalar_cost_vec,
3038 stmt_vector_for_cost *prologue_cost_vec,
3039 stmt_vector_for_cost *epilogue_cost_vec)
3041 int retval = 0;
3042 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3044 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
3046 *peel_iters_epilogue = vf/2;
3047 if (dump_enabled_p ())
3048 dump_printf_loc (MSG_NOTE, vect_location,
3049 "cost model: epilogue peel iters set to vf/2 "
3050 "because loop iterations are unknown .\n");
3052 /* If peeled iterations are known but number of scalar loop
3053 iterations are unknown, count a taken branch per peeled loop. */
3054 retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
3055 NULL, 0, vect_prologue);
3056 retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
3057 NULL, 0, vect_epilogue);
3059 else
3061 int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
3062 peel_iters_prologue = niters < peel_iters_prologue ?
3063 niters : peel_iters_prologue;
3064 *peel_iters_epilogue = (niters - peel_iters_prologue) % vf;
3065 /* If we need to peel for gaps, but no peeling is required, we have to
3066 peel VF iterations. */
3067 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) && !*peel_iters_epilogue)
3068 *peel_iters_epilogue = vf;
3071 stmt_info_for_cost *si;
3072 int j;
3073 if (peel_iters_prologue)
3074 FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
3075 retval += record_stmt_cost (prologue_cost_vec,
3076 si->count * peel_iters_prologue,
3077 si->kind, NULL, si->misalign,
3078 vect_prologue);
3079 if (*peel_iters_epilogue)
3080 FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
3081 retval += record_stmt_cost (epilogue_cost_vec,
3082 si->count * *peel_iters_epilogue,
3083 si->kind, NULL, si->misalign,
3084 vect_epilogue);
3086 return retval;
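/* As an illustrative example: for a known iteration count of 100, a
   prologue of 3 peeled iterations and a vectorization factor of 8, the
   code above computes *peel_iters_epilogue = (100 - 3) % 8 = 1, and each
   scalar statement cost is then charged 3 times into the prologue cost
   vector and once into the epilogue cost vector.  */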
3089 /* Function vect_estimate_min_profitable_iters
3091 Return the number of iterations required for the vector version of the
3092 loop to be profitable relative to the cost of the scalar version of the
3093 loop. */
3095 static void
3096 vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
3097 int *ret_min_profitable_niters,
3098 int *ret_min_profitable_estimate)
3100 int min_profitable_iters;
3101 int min_profitable_estimate;
3102 int peel_iters_prologue;
3103 int peel_iters_epilogue;
3104 unsigned vec_inside_cost = 0;
3105 int vec_outside_cost = 0;
3106 unsigned vec_prologue_cost = 0;
3107 unsigned vec_epilogue_cost = 0;
3108 int scalar_single_iter_cost = 0;
3109 int scalar_outside_cost = 0;
3110 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3111 int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
3112 void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3114 /* Cost model disabled. */
3115 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
3117 dump_printf_loc (MSG_NOTE, vect_location, "cost model disabled.\n");
3118 *ret_min_profitable_niters = 0;
3119 *ret_min_profitable_estimate = 0;
3120 return;
3123 /* Requires loop versioning tests to handle misalignment. */
3124 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
3126 /* FIXME: Make cost depend on complexity of individual check. */
3127 unsigned len = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ();
3128 (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
3129 vect_prologue);
3130 dump_printf (MSG_NOTE,
3131 "cost model: Adding cost of checks for loop "
3132 "versioning to treat misalignment.\n");
3135 /* Requires loop versioning with alias checks. */
3136 if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
3138 /* FIXME: Make cost depend on complexity of individual check. */
3139 unsigned len = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).length ();
3140 (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
3141 vect_prologue);
3142 dump_printf (MSG_NOTE,
3143 "cost model: Adding cost of checks for loop "
3144 "versioning aliasing.\n");
3147 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
3148 || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
3149 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken, NULL, 0,
3150 vect_prologue);
3152 /* Count statements in scalar loop. Using this as scalar cost for a single
3153 iteration for now.
3155 TODO: Add outer loop support.
3157 TODO: Consider assigning different costs to different scalar
3158 statements. */
3160 scalar_single_iter_cost
3161 = LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo);
3163 /* Add additional cost for the peeled instructions in prologue and epilogue
3164 loop.
3166 FORNOW: If we don't know the value of peel_iters for prologue or epilogue
3167 at compile-time - we assume it's vf/2 (the worst would be vf-1).
3169 TODO: Build an expression that represents peel_iters for prologue and
3170 epilogue to be used in a run-time test. */
3172 if (npeel < 0)
3174 peel_iters_prologue = vf/2;
3175 dump_printf (MSG_NOTE, "cost model: "
3176 "prologue peel iters set to vf/2.\n");
3178 /* If peeling for alignment is unknown, loop bound of main loop becomes
3179 unknown. */
3180 peel_iters_epilogue = vf/2;
3181 dump_printf (MSG_NOTE, "cost model: "
3182 "epilogue peel iters set to vf/2 because "
3183 "peeling for alignment is unknown.\n");
3185 /* If peeled iterations are unknown, count a taken branch and a not taken
3186 branch per peeled loop. Even if scalar loop iterations are known,
3187 vector iterations are not known since peeled prologue iterations are
3188 not known. Hence guards remain the same. */
3189 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
3190 NULL, 0, vect_prologue);
3191 (void) add_stmt_cost (target_cost_data, 1, cond_branch_not_taken,
3192 NULL, 0, vect_prologue);
3193 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
3194 NULL, 0, vect_epilogue);
3195 (void) add_stmt_cost (target_cost_data, 1, cond_branch_not_taken,
3196 NULL, 0, vect_epilogue);
3197 stmt_info_for_cost *si;
3198 int j;
3199 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
3201 struct _stmt_vec_info *stmt_info
3202 = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
3203 (void) add_stmt_cost (target_cost_data,
3204 si->count * peel_iters_prologue,
3205 si->kind, stmt_info, si->misalign,
3206 vect_prologue);
3207 (void) add_stmt_cost (target_cost_data,
3208 si->count * peel_iters_epilogue,
3209 si->kind, stmt_info, si->misalign,
3210 vect_epilogue);
3213 else
3215 stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
3216 stmt_info_for_cost *si;
3217 int j;
3218 void *data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3220 prologue_cost_vec.create (2);
3221 epilogue_cost_vec.create (2);
3222 peel_iters_prologue = npeel;
3224 (void) vect_get_known_peeling_cost (loop_vinfo, peel_iters_prologue,
3225 &peel_iters_epilogue,
3226 &LOOP_VINFO_SCALAR_ITERATION_COST
3227 (loop_vinfo),
3228 &prologue_cost_vec,
3229 &epilogue_cost_vec);
3231 FOR_EACH_VEC_ELT (prologue_cost_vec, j, si)
3233 struct _stmt_vec_info *stmt_info
3234 = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
3235 (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
3236 si->misalign, vect_prologue);
3239 FOR_EACH_VEC_ELT (epilogue_cost_vec, j, si)
3241 struct _stmt_vec_info *stmt_info
3242 = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
3243 (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
3244 si->misalign, vect_epilogue);
3247 prologue_cost_vec.release ();
3248 epilogue_cost_vec.release ();
3251 /* FORNOW: The scalar outside cost is incremented in one of the
3252 following ways:
3254 1. The vectorizer checks for alignment and aliasing and generates
3255 a condition that allows dynamic vectorization. A cost model
3256 check is ANDED with the versioning condition. Hence scalar code
3257 path now has the added cost of the versioning check.
3259 if (cost > th & versioning_check)
3260 jmp to vector code
3262 Hence run-time scalar is incremented by not-taken branch cost.
3264 2. The vectorizer then checks if a prologue is required. If the
3265 cost model check was not done before during versioning, it has to
3266 be done before the prologue check.
3268 if (cost <= th)
3269 prologue = scalar_iters
3270 if (prologue == 0)
3271 jmp to vector code
3272 else
3273 execute prologue
3274 if (prologue == num_iters)
3275 go to exit
3277 Hence the run-time scalar cost is incremented by a taken branch,
3278 plus a not-taken branch, plus a taken branch cost.
3280 3. The vectorizer then checks if an epilogue is required. If the
3281 cost model check was not done before during prologue check, it
3282 has to be done with the epilogue check.
3284 if (prologue == 0)
3285 jmp to vector code
3286 else
3287 execute prologue
3288 if (prologue == num_iters)
3289 go to exit
3290 vector code:
3291 if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
3292 jmp to epilogue
3294 Hence the run-time scalar cost should be incremented by 2 taken
3295 branches.
3297 TODO: The back end may reorder the BBs differently and reverse
3298 conditions/branch directions. Change the estimates below to
3299 something more reasonable. */
3301 /* If the number of iterations is known and we do not do versioning, we can
3302 decide whether to vectorize at compile time. Hence the scalar version
3303 does not carry cost model guard costs. */
3304 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
3305 || LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
3306 || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
3308 /* Cost model check occurs at versioning. */
3309 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
3310 || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
3311 scalar_outside_cost += vect_get_stmt_cost (cond_branch_not_taken);
3312 else
3314 /* Cost model check occurs at prologue generation. */
3315 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
3316 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken)
3317 + vect_get_stmt_cost (cond_branch_not_taken);
3318 /* Cost model check occurs at epilogue generation. */
3319 else
3320 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken);
3324 /* Complete the target-specific cost calculations. */
3325 finish_cost (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo), &vec_prologue_cost,
3326 &vec_inside_cost, &vec_epilogue_cost);
3328 vec_outside_cost = (int)(vec_prologue_cost + vec_epilogue_cost);
3330 if (dump_enabled_p ())
3332 dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
3333 dump_printf (MSG_NOTE, " Vector inside of loop cost: %d\n",
3334 vec_inside_cost);
3335 dump_printf (MSG_NOTE, " Vector prologue cost: %d\n",
3336 vec_prologue_cost);
3337 dump_printf (MSG_NOTE, " Vector epilogue cost: %d\n",
3338 vec_epilogue_cost);
3339 dump_printf (MSG_NOTE, " Scalar iteration cost: %d\n",
3340 scalar_single_iter_cost);
3341 dump_printf (MSG_NOTE, " Scalar outside cost: %d\n",
3342 scalar_outside_cost);
3343 dump_printf (MSG_NOTE, " Vector outside cost: %d\n",
3344 vec_outside_cost);
3345 dump_printf (MSG_NOTE, " prologue iterations: %d\n",
3346 peel_iters_prologue);
3347 dump_printf (MSG_NOTE, " epilogue iterations: %d\n",
3348 peel_iters_epilogue);
3351 /* Calculate number of iterations required to make the vector version
3352 profitable, relative to the loop bodies only. The following condition
3353 must hold true:
3354 SIC * niters + SOC > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC
3355 where
3356 SIC = scalar iteration cost, VIC = vector iteration cost,
3357 VOC = vector outside cost, VF = vectorization factor,
3358 PL_ITERS = prologue iterations, EP_ITERS= epilogue iterations
3359 SOC = scalar outside cost for run time cost model check. */
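/* A purely illustrative instance of the inequality above: with SIC = 4,
   VIC = 8, VF = 4, VOC = 20, SOC = 0 and no peeling, at niters = 10 the
   scalar side costs 4 * 10 = 40 and the vector side 8 * (10 / 4) + 20 = 40,
   so the two versions break even at 10 iterations and vectorization only
   pays off from 11 iterations onwards.  */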
3361 if ((scalar_single_iter_cost * vf) > (int) vec_inside_cost)
3363 if (vec_outside_cost <= 0)
3364 min_profitable_iters = 1;
3365 else
3367 min_profitable_iters = ((vec_outside_cost - scalar_outside_cost) * vf
3368 - vec_inside_cost * peel_iters_prologue
3369 - vec_inside_cost * peel_iters_epilogue)
3370 / ((scalar_single_iter_cost * vf)
3371 - vec_inside_cost);
3373 if ((scalar_single_iter_cost * vf * min_profitable_iters)
3374 <= (((int) vec_inside_cost * min_profitable_iters)
3375 + (((int) vec_outside_cost - scalar_outside_cost) * vf)))
3376 min_profitable_iters++;
3379 /* vector version will never be profitable. */
3380 else
3382 if (LOOP_VINFO_LOOP (loop_vinfo)->force_vectorize)
3383 warning_at (vect_location, OPT_Wopenmp_simd, "vectorization "
3384 "did not happen for a simd loop");
3386 if (dump_enabled_p ())
3387 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3388 "cost model: the vector iteration cost = %d "
3389 "divided by the scalar iteration cost = %d "
3390 "is greater or equal to the vectorization factor = %d"
3391 ".\n",
3392 vec_inside_cost, scalar_single_iter_cost, vf);
3393 *ret_min_profitable_niters = -1;
3394 *ret_min_profitable_estimate = -1;
3395 return;
3398 dump_printf (MSG_NOTE,
3399 " Calculated minimum iters for profitability: %d\n",
3400 min_profitable_iters);
3402 min_profitable_iters =
3403 min_profitable_iters < vf ? vf : min_profitable_iters;
3405 /* Because the condition we create is:
3406 if (niters <= min_profitable_iters)
3407 then skip the vectorized loop. */
3408 min_profitable_iters--;
3410 if (dump_enabled_p ())
3411 dump_printf_loc (MSG_NOTE, vect_location,
3412 " Runtime profitability threshold = %d\n",
3413 min_profitable_iters);
3415 *ret_min_profitable_niters = min_profitable_iters;
3417 /* Calculate number of iterations required to make the vector version
3418 profitable, relative to the loop bodies only.
3420 Non-vectorized variant is SIC * niters and it must win over vector
3421 variant on the expected loop trip count. The following condition must hold true:
3422 SIC * niters > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC + SOC */
3424 if (vec_outside_cost <= 0)
3425 min_profitable_estimate = 1;
3426 else
3428 min_profitable_estimate = ((vec_outside_cost + scalar_outside_cost) * vf
3429 - vec_inside_cost * peel_iters_prologue
3430 - vec_inside_cost * peel_iters_epilogue)
3431 / ((scalar_single_iter_cost * vf)
3432 - vec_inside_cost);
3434 min_profitable_estimate --;
3435 min_profitable_estimate = MAX (min_profitable_estimate, min_profitable_iters);
3436 if (dump_enabled_p ())
3437 dump_printf_loc (MSG_NOTE, vect_location,
3438 " Static estimate profitability threshold = %d\n",
3439 min_profitable_estimate);
3441 *ret_min_profitable_estimate = min_profitable_estimate;
3444 /* Writes into SEL a mask for a vec_perm, equivalent to a vec_shr by OFFSET
3445 vector elements (not bits) for a vector of mode MODE. */
3446 static void
3447 calc_vec_perm_mask_for_shift (enum machine_mode mode, unsigned int offset,
3448 unsigned char *sel)
3450 unsigned int i, nelt = GET_MODE_NUNITS (mode);
3452 for (i = 0; i < nelt; i++)
3453 sel[i] = (i + offset) & (2*nelt - 1);
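/* For example, with a vector mode of 8 elements and OFFSET = 2 this yields
   the selector { 2, 3, 4, 5, 6, 7, 8, 9 }: masking with 2*nelt - 1 keeps
   every index valid for a two-operand vec_perm, in which indices 0..7
   select elements of the first operand and 8..15 elements of the second.  */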
3456 /* Checks whether the target supports whole-vector shifts for vectors of mode
3457 MODE. This is the case if _either_ the platform handles vec_shr_optab, _or_
3458 it supports vec_perm_const with masks for all necessary shift amounts. */
3459 static bool
3460 have_whole_vector_shift (enum machine_mode mode)
3462 if (optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
3463 return true;
3465 if (direct_optab_handler (vec_perm_const_optab, mode) == CODE_FOR_nothing)
3466 return false;
3468 unsigned int i, nelt = GET_MODE_NUNITS (mode);
3469 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
3471 for (i = nelt/2; i >= 1; i/=2)
3473 calc_vec_perm_mask_for_shift (mode, i, sel);
3474 if (!can_vec_perm_p (mode, false, sel))
3475 return false;
3477 return true;
3480 /* Return the reduction operand (with index REDUC_INDEX) of STMT. */
3482 static tree
3483 get_reduction_op (gimple *stmt, int reduc_index)
3485 switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
3487 case GIMPLE_SINGLE_RHS:
3488 gcc_assert (TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt))
3489 == ternary_op);
3490 return TREE_OPERAND (gimple_assign_rhs1 (stmt), reduc_index);
3491 case GIMPLE_UNARY_RHS:
3492 return gimple_assign_rhs1 (stmt);
3493 case GIMPLE_BINARY_RHS:
3494 return (reduc_index
3495 ? gimple_assign_rhs2 (stmt) : gimple_assign_rhs1 (stmt));
3496 case GIMPLE_TERNARY_RHS:
3497 return gimple_op (stmt, reduc_index + 1);
3498 default:
3499 gcc_unreachable ();
3503 /* TODO: There is a close dependency between the vect_model_*_cost and
3504 vectorizable_* functions. Design this better to avoid maintenance issues. */
3506 /* Function vect_model_reduction_cost.
3508 Models cost for a reduction operation, including the vector ops
3509 generated within the strip-mine loop, the initial definition before
3510 the loop, and the epilogue code that must be generated. */
3512 static bool
3513 vect_model_reduction_cost (stmt_vec_info stmt_info, enum tree_code reduc_code,
3514 int ncopies, int reduc_index)
3516 int prologue_cost = 0, epilogue_cost = 0;
3517 enum tree_code code;
3518 optab optab;
3519 tree vectype;
3520 gimple *stmt, *orig_stmt;
3521 tree reduction_op;
3522 machine_mode mode;
3523 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3524 struct loop *loop = NULL;
3525 void *target_cost_data;
3527 if (loop_vinfo)
3529 loop = LOOP_VINFO_LOOP (loop_vinfo);
3530 target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3532 else
3533 target_cost_data = BB_VINFO_TARGET_COST_DATA (STMT_VINFO_BB_VINFO (stmt_info));
3535 /* Condition reductions generate two reductions in the loop. */
3536 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION)
3537 ncopies *= 2;
3539 /* Cost of reduction op inside loop. */
3540 unsigned inside_cost = add_stmt_cost (target_cost_data, ncopies, vector_stmt,
3541 stmt_info, 0, vect_body);
3542 stmt = STMT_VINFO_STMT (stmt_info);
3544 reduction_op = get_reduction_op (stmt, reduc_index);
3546 vectype = get_vectype_for_scalar_type (TREE_TYPE (reduction_op));
3547 if (!vectype)
3549 if (dump_enabled_p ())
3551 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3552 "unsupported data-type ");
3553 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
3554 TREE_TYPE (reduction_op));
3555 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3557 return false;
3560 mode = TYPE_MODE (vectype);
3561 orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
3563 if (!orig_stmt)
3564 orig_stmt = STMT_VINFO_STMT (stmt_info);
3566 code = gimple_assign_rhs_code (orig_stmt);
3568 /* Add in cost for initial definition.
3569 For cond reduction we have four vectors: initial index, step, initial
3570 result of the data reduction, initial value of the index reduction. */
3571 int prologue_stmts = STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
3572 == COND_REDUCTION ? 4 : 1;
3573 prologue_cost += add_stmt_cost (target_cost_data, prologue_stmts,
3574 scalar_to_vec, stmt_info, 0,
3575 vect_prologue);
3577 /* Determine cost of epilogue code.
3579 We have a reduction operator that will reduce the vector in one statement.
3580 Also requires scalar extract. */
3582 if (!loop || !nested_in_vect_loop_p (loop, orig_stmt))
3584 if (reduc_code != ERROR_MARK)
3586 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION)
3588 /* An EQ stmt and a COND_EXPR stmt. */
3589 epilogue_cost += add_stmt_cost (target_cost_data, 2,
3590 vector_stmt, stmt_info, 0,
3591 vect_epilogue);
3592 /* Reduction of the max index and a reduction of the found
3593 values. */
3594 epilogue_cost += add_stmt_cost (target_cost_data, 2,
3595 vec_to_scalar, stmt_info, 0,
3596 vect_epilogue);
3597 /* A broadcast of the max value. */
3598 epilogue_cost += add_stmt_cost (target_cost_data, 1,
3599 scalar_to_vec, stmt_info, 0,
3600 vect_epilogue);
3602 else
3604 epilogue_cost += add_stmt_cost (target_cost_data, 1, vector_stmt,
3605 stmt_info, 0, vect_epilogue);
3606 epilogue_cost += add_stmt_cost (target_cost_data, 1,
3607 vec_to_scalar, stmt_info, 0,
3608 vect_epilogue);
3611 else
3613 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
3614 tree bitsize =
3615 TYPE_SIZE (TREE_TYPE (gimple_assign_lhs (orig_stmt)));
3616 int element_bitsize = tree_to_uhwi (bitsize);
3617 int nelements = vec_size_in_bits / element_bitsize;
3619 optab = optab_for_tree_code (code, vectype, optab_default);
3621 /* We have a whole vector shift available. */
3622 if (VECTOR_MODE_P (mode)
3623 && optab_handler (optab, mode) != CODE_FOR_nothing
3624 && have_whole_vector_shift (mode))
3626 /* Final reduction via vector shifts and the reduction operator.
3627 Also requires scalar extract. */
3628 epilogue_cost += add_stmt_cost (target_cost_data,
3629 exact_log2 (nelements) * 2,
3630 vector_stmt, stmt_info, 0,
3631 vect_epilogue);
3632 epilogue_cost += add_stmt_cost (target_cost_data, 1,
3633 vec_to_scalar, stmt_info, 0,
3634 vect_epilogue);
3636 else
3637 /* Use extracts and reduction op for final reduction. For N
3638 elements, we have N extracts and N-1 reduction ops. */
3639 epilogue_cost += add_stmt_cost (target_cost_data,
3640 nelements + nelements - 1,
3641 vector_stmt, stmt_info, 0,
3642 vect_epilogue);
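/* E.g. for a 4-element vector this charges 4 + 3 = 7 vector_stmt costs:
   one extract per element plus three scalar reduction operations to
   combine the extracted values.  */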
3646 if (dump_enabled_p ())
3647 dump_printf (MSG_NOTE,
3648 "vect_model_reduction_cost: inside_cost = %d, "
3649 "prologue_cost = %d, epilogue_cost = %d .\n", inside_cost,
3650 prologue_cost, epilogue_cost);
3652 return true;
3656 /* Function vect_model_induction_cost.
3658 Models cost for induction operations. */
3660 static void
3661 vect_model_induction_cost (stmt_vec_info stmt_info, int ncopies)
3663 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3664 void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3665 unsigned inside_cost, prologue_cost;
3667 /* loop cost for vec_loop. */
3668 inside_cost = add_stmt_cost (target_cost_data, ncopies, vector_stmt,
3669 stmt_info, 0, vect_body);
3671 /* prologue cost for vec_init and vec_step. */
3672 prologue_cost = add_stmt_cost (target_cost_data, 2, scalar_to_vec,
3673 stmt_info, 0, vect_prologue);
3675 if (dump_enabled_p ())
3676 dump_printf_loc (MSG_NOTE, vect_location,
3677 "vect_model_induction_cost: inside_cost = %d, "
3678 "prologue_cost = %d .\n", inside_cost, prologue_cost);
3682 /* Function get_initial_def_for_induction
3684 Input:
3685 STMT - a stmt that performs an induction operation in the loop.
3686 IV_PHI - the initial value of the induction variable
3688 Output:
3689 Return a vector variable, initialized with the first VF values of
3690 the induction variable. E.g., for an iv with IV_PHI='X' and
3691 evolution S, for a vector of 4 units, we want to return:
3692 [X, X + S, X + 2*S, X + 3*S]. */
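/* For example, for the induction variable of

     for (i = 0; i < n; i++)
       ...

   with initial value 0, step 1 and a vectorization factor of 4, the
   returned vector is { 0, 1, 2, 3 }; the corresponding vector step used to
   advance it on each vector iteration would then be { 4, 4, 4, 4 }.  */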
3694 static tree
3695 get_initial_def_for_induction (gimple *iv_phi)
3697 stmt_vec_info stmt_vinfo = vinfo_for_stmt (iv_phi);
3698 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
3699 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3700 tree vectype;
3701 int nunits;
3702 edge pe = loop_preheader_edge (loop);
3703 struct loop *iv_loop;
3704 basic_block new_bb;
3705 tree new_vec, vec_init, vec_step, t;
3706 tree new_name;
3707 gimple *new_stmt;
3708 gphi *induction_phi;
3709 tree induc_def, vec_def, vec_dest;
3710 tree init_expr, step_expr;
3711 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3712 int i;
3713 int ncopies;
3714 tree expr;
3715 stmt_vec_info phi_info = vinfo_for_stmt (iv_phi);
3716 bool nested_in_vect_loop = false;
3717 gimple_seq stmts;
3718 imm_use_iterator imm_iter;
3719 use_operand_p use_p;
3720 gimple *exit_phi;
3721 edge latch_e;
3722 tree loop_arg;
3723 gimple_stmt_iterator si;
3724 basic_block bb = gimple_bb (iv_phi);
3725 tree stepvectype;
3726 tree resvectype;
3728 /* Is phi in an inner-loop, while vectorizing an enclosing outer-loop? */
3729 if (nested_in_vect_loop_p (loop, iv_phi))
3731 nested_in_vect_loop = true;
3732 iv_loop = loop->inner;
3734 else
3735 iv_loop = loop;
3736 gcc_assert (iv_loop == (gimple_bb (iv_phi))->loop_father);
3738 latch_e = loop_latch_edge (iv_loop);
3739 loop_arg = PHI_ARG_DEF_FROM_EDGE (iv_phi, latch_e);
3741 step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (phi_info);
3742 gcc_assert (step_expr != NULL_TREE);
3744 pe = loop_preheader_edge (iv_loop);
3745 init_expr = PHI_ARG_DEF_FROM_EDGE (iv_phi,
3746 loop_preheader_edge (iv_loop));
3748 vectype = get_vectype_for_scalar_type (TREE_TYPE (init_expr));
3749 resvectype = get_vectype_for_scalar_type (TREE_TYPE (PHI_RESULT (iv_phi)));
3750 gcc_assert (vectype);
3751 nunits = TYPE_VECTOR_SUBPARTS (vectype);
3752 ncopies = vf / nunits;
3754 gcc_assert (phi_info);
3755 gcc_assert (ncopies >= 1);
3757 /* Convert the step to the desired type. */
3758 stmts = NULL;
3759 step_expr = gimple_convert (&stmts, TREE_TYPE (vectype), step_expr);
3760 if (stmts)
3762 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
3763 gcc_assert (!new_bb);
3766 /* Find the first insertion point in the BB. */
3767 si = gsi_after_labels (bb);
3769 /* Create the vector that holds the initial_value of the induction. */
3770 if (nested_in_vect_loop)
3772 /* iv_loop is nested in the loop to be vectorized. init_expr has already
3773 been created during vectorization of previous stmts. We obtain it
3774 from the STMT_VINFO_VEC_STMT of the defining stmt. */
3775 vec_init = vect_get_vec_def_for_operand (init_expr, iv_phi);
3776 /* If the initial value is not of proper type, convert it. */
3777 if (!useless_type_conversion_p (vectype, TREE_TYPE (vec_init)))
3779 new_stmt
3780 = gimple_build_assign (vect_get_new_ssa_name (vectype,
3781 vect_simple_var,
3782 "vec_iv_"),
3783 VIEW_CONVERT_EXPR,
3784 build1 (VIEW_CONVERT_EXPR, vectype,
3785 vec_init));
3786 vec_init = gimple_assign_lhs (new_stmt);
3787 new_bb = gsi_insert_on_edge_immediate (loop_preheader_edge (iv_loop),
3788 new_stmt);
3789 gcc_assert (!new_bb);
3790 set_vinfo_for_stmt (new_stmt,
3791 new_stmt_vec_info (new_stmt, loop_vinfo));
3794 else
3796 vec<constructor_elt, va_gc> *v;
3798 /* iv_loop is the loop to be vectorized. Create:
3799 vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr) */
3800 stmts = NULL;
3801 new_name = gimple_convert (&stmts, TREE_TYPE (vectype), init_expr);
3803 vec_alloc (v, nunits);
3804 bool constant_p = is_gimple_min_invariant (new_name);
3805 CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, new_name);
3806 for (i = 1; i < nunits; i++)
3808 /* Create: new_name_i = new_name + step_expr */
3809 new_name = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (new_name),
3810 new_name, step_expr);
3811 if (!is_gimple_min_invariant (new_name))
3812 constant_p = false;
3813 CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, new_name);
3815 if (stmts)
3817 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
3818 gcc_assert (!new_bb);
3821 /* Create a vector from [new_name_0, new_name_1, ..., new_name_nunits-1] */
3822 if (constant_p)
3823 new_vec = build_vector_from_ctor (vectype, v);
3824 else
3825 new_vec = build_constructor (vectype, v);
3826 vec_init = vect_init_vector (iv_phi, new_vec, vectype, NULL);
3830 /* Create the vector that holds the step of the induction. */
3831 if (nested_in_vect_loop)
3832 /* iv_loop is nested in the loop to be vectorized. Generate:
3833 vec_step = [S, S, S, S] */
3834 new_name = step_expr;
3835 else
3837 /* iv_loop is the loop to be vectorized. Generate:
3838 vec_step = [VF*S, VF*S, VF*S, VF*S] */
3839 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
3841 expr = build_int_cst (integer_type_node, vf);
3842 expr = fold_convert (TREE_TYPE (step_expr), expr);
3844 else
3845 expr = build_int_cst (TREE_TYPE (step_expr), vf);
3846 new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
3847 expr, step_expr);
3848 if (TREE_CODE (step_expr) == SSA_NAME)
3849 new_name = vect_init_vector (iv_phi, new_name,
3850 TREE_TYPE (step_expr), NULL);
3853 t = unshare_expr (new_name);
3854 gcc_assert (CONSTANT_CLASS_P (new_name)
3855 || TREE_CODE (new_name) == SSA_NAME);
3856 stepvectype = get_vectype_for_scalar_type (TREE_TYPE (new_name));
3857 gcc_assert (stepvectype);
3858 new_vec = build_vector_from_val (stepvectype, t);
3859 vec_step = vect_init_vector (iv_phi, new_vec, stepvectype, NULL);
3862 /* Create the following def-use cycle:
3863 loop prolog:
3864 vec_init = ...
3865 vec_step = ...
3866 loop:
3867 vec_iv = PHI <vec_init, vec_loop>
3869 STMT
3871 vec_loop = vec_iv + vec_step; */
3873 /* Create the induction-phi that defines the induction-operand. */
3874 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
3875 induction_phi = create_phi_node (vec_dest, iv_loop->header);
3876 set_vinfo_for_stmt (induction_phi,
3877 new_stmt_vec_info (induction_phi, loop_vinfo));
3878 induc_def = PHI_RESULT (induction_phi);
3880 /* Create the iv update inside the loop */
3881 new_stmt = gimple_build_assign (vec_dest, PLUS_EXPR, induc_def, vec_step);
3882 vec_def = make_ssa_name (vec_dest, new_stmt);
3883 gimple_assign_set_lhs (new_stmt, vec_def);
3884 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
3885 set_vinfo_for_stmt (new_stmt, new_stmt_vec_info (new_stmt, loop_vinfo));
3887 /* Set the arguments of the phi node: */
3888 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
3889 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
3890 UNKNOWN_LOCATION);
3893 /* In case that vectorization factor (VF) is bigger than the number
3894 of elements that we can fit in a vectype (nunits), we have to generate
3895 more than one vector stmt - i.e - we need to "unroll" the
3896 vector stmt by a factor VF/nunits. For more details see documentation
3897 in vectorizable_operation. */
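/* For instance, with VF == 8 and 4-element vectors (ncopies == 2) the
   block below builds a second step vector {4*S, 4*S, 4*S, 4*S}, i.e.
   nunits * S, and adds it to the first copy {X, X+S, X+2*S, X+3*S} to
   obtain the second copy {X+4*S, X+5*S, X+6*S, X+7*S}.  */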
3899 if (ncopies > 1)
3901 stmt_vec_info prev_stmt_vinfo;
3902 /* FORNOW. This restriction should be relaxed. */
3903 gcc_assert (!nested_in_vect_loop);
3905 /* Create the vector that holds the step of the induction. */
3906 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
3908 expr = build_int_cst (integer_type_node, nunits);
3909 expr = fold_convert (TREE_TYPE (step_expr), expr);
3911 else
3912 expr = build_int_cst (TREE_TYPE (step_expr), nunits);
3913 new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
3914 expr, step_expr);
3915 if (TREE_CODE (step_expr) == SSA_NAME)
3916 new_name = vect_init_vector (iv_phi, new_name,
3917 TREE_TYPE (step_expr), NULL);
3918 t = unshare_expr (new_name);
3919 gcc_assert (CONSTANT_CLASS_P (new_name)
3920 || TREE_CODE (new_name) == SSA_NAME);
3921 new_vec = build_vector_from_val (stepvectype, t);
3922 vec_step = vect_init_vector (iv_phi, new_vec, stepvectype, NULL);
3924 vec_def = induc_def;
3925 prev_stmt_vinfo = vinfo_for_stmt (induction_phi);
3926 for (i = 1; i < ncopies; i++)
3928 /* vec_i = vec_prev + vec_step */
3929 new_stmt = gimple_build_assign (vec_dest, PLUS_EXPR,
3930 vec_def, vec_step);
3931 vec_def = make_ssa_name (vec_dest, new_stmt);
3932 gimple_assign_set_lhs (new_stmt, vec_def);
3934 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
3935 if (!useless_type_conversion_p (resvectype, vectype))
3937 new_stmt
3938 = gimple_build_assign
3939 (vect_get_new_vect_var (resvectype, vect_simple_var,
3940 "vec_iv_"),
3941 VIEW_CONVERT_EXPR,
3942 build1 (VIEW_CONVERT_EXPR, resvectype,
3943 gimple_assign_lhs (new_stmt)));
3944 gimple_assign_set_lhs (new_stmt,
3945 make_ssa_name
3946 (gimple_assign_lhs (new_stmt), new_stmt));
3947 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
3949 set_vinfo_for_stmt (new_stmt,
3950 new_stmt_vec_info (new_stmt, loop_vinfo));
3951 STMT_VINFO_RELATED_STMT (prev_stmt_vinfo) = new_stmt;
3952 prev_stmt_vinfo = vinfo_for_stmt (new_stmt);
3956 if (nested_in_vect_loop)
3958 /* Find the loop-closed exit-phi of the induction, and record
3959 the final vector of induction results: */
3960 exit_phi = NULL;
3961 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
3963 gimple *use_stmt = USE_STMT (use_p);
3964 if (is_gimple_debug (use_stmt))
3965 continue;
3967 if (!flow_bb_inside_loop_p (iv_loop, gimple_bb (use_stmt)))
3969 exit_phi = use_stmt;
3970 break;
3973 if (exit_phi)
3975 stmt_vec_info stmt_vinfo = vinfo_for_stmt (exit_phi);
3976 /* FORNOW. Currently not supporting the case that an inner-loop induction
3977 is not used in the outer-loop (i.e. only outside the outer-loop). */
3978 gcc_assert (STMT_VINFO_RELEVANT_P (stmt_vinfo)
3979 && !STMT_VINFO_LIVE_P (stmt_vinfo));
3981 STMT_VINFO_VEC_STMT (stmt_vinfo) = new_stmt;
3982 if (dump_enabled_p ())
3984 dump_printf_loc (MSG_NOTE, vect_location,
3985 "vector of inductions after inner-loop:");
3986 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, new_stmt, 0);
3987 dump_printf (MSG_NOTE, "\n");
3993 if (dump_enabled_p ())
3995 dump_printf_loc (MSG_NOTE, vect_location,
3996 "transform induction: created def-use cycle: ");
3997 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, induction_phi, 0);
3998 dump_printf (MSG_NOTE, "\n");
3999 dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
4000 SSA_NAME_DEF_STMT (vec_def), 0);
4001 dump_printf (MSG_NOTE, "\n");
4004 STMT_VINFO_VEC_STMT (phi_info) = induction_phi;
4005 if (!useless_type_conversion_p (resvectype, vectype))
4007 new_stmt = gimple_build_assign (vect_get_new_vect_var (resvectype,
4008 vect_simple_var,
4009 "vec_iv_"),
4010 VIEW_CONVERT_EXPR,
4011 build1 (VIEW_CONVERT_EXPR, resvectype,
4012 induc_def));
4013 induc_def = make_ssa_name (gimple_assign_lhs (new_stmt), new_stmt);
4014 gimple_assign_set_lhs (new_stmt, induc_def);
4015 si = gsi_after_labels (bb);
4016 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
4017 set_vinfo_for_stmt (new_stmt,
4018 new_stmt_vec_info (new_stmt, loop_vinfo));
4019 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (new_stmt))
4020 = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (induction_phi));
4023 return induc_def;
4027 /* Function get_initial_def_for_reduction
4029 Input:
4030 STMT - a stmt that performs a reduction operation in the loop.
4031 INIT_VAL - the initial value of the reduction variable
4033 Output:
4034 ADJUSTMENT_DEF - a tree that holds a value to be added to the final result
4035 of the reduction (used for adjusting the epilog - see below).
4036 Return a vector variable, initialized according to the operation that STMT
4037 performs. This vector will be used as the initial value of the
4038 vector of partial results.
4040 Option1 (adjust in epilog): Initialize the vector as follows:
4041 add/bit or/xor: [0,0,...,0,0]
4042 mult/bit and: [1,1,...,1,1]
4043 min/max/cond_expr: [init_val,init_val,..,init_val,init_val]
4044 and when necessary (e.g. add/mult case) let the caller know
4045 that it needs to adjust the result by init_val.
4047 Option2: Initialize the vector as follows:
4048 add/bit or/xor: [init_val,0,0,...,0]
4049 mult/bit and: [init_val,1,1,...,1]
4050 min/max/cond_expr: [init_val,init_val,...,init_val]
4051 and no adjustments are needed.
4053 For example, for the following code:
4055 s = init_val;
4056 for (i=0;i<n;i++)
4057 s = s + a[i];
4059 STMT is 's = s + a[i]', and the reduction variable is 's'.
4060 For a vector of 4 units, we want to return either [0,0,0,init_val],
4061 or [0,0,0,0] and let the caller know that it needs to adjust
4062 the result at the end by 'init_val'.
4064 FORNOW, we use Option1 (the 'adjust in epilog' scheme) when ADJUSTMENT_DEF
4065 is not NULL, because its initialization vector is simpler (the same
4066 element in all entries), and Option2 otherwise.
4068 A cost model should help decide between these two schemes. */
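/* As an illustration, a 4-element sum reduction with init_val = 10 uses
   either [0,0,0,0] with ADJUSTMENT_DEF = 10 (Option1) or [10,0,0,0] with
   no adjustment (Option2), while a MAX reduction starts from
   [10,10,10,10] in both schemes.  */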
4070 tree
4071 get_initial_def_for_reduction (gimple *stmt, tree init_val,
4072 tree *adjustment_def)
4074 stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
4075 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
4076 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4077 tree scalar_type = TREE_TYPE (init_val);
4078 tree vectype = get_vectype_for_scalar_type (scalar_type);
4079 int nunits;
4080 enum tree_code code = gimple_assign_rhs_code (stmt);
4081 tree def_for_init;
4082 tree init_def;
4083 tree *elts;
4084 int i;
4085 bool nested_in_vect_loop = false;
4086 REAL_VALUE_TYPE real_init_val = dconst0;
4087 int int_init_val = 0;
4088 gimple *def_stmt = NULL;
4089 gimple_seq stmts = NULL;
4091 gcc_assert (vectype);
4092 nunits = TYPE_VECTOR_SUBPARTS (vectype);
4094 gcc_assert (POINTER_TYPE_P (scalar_type) || INTEGRAL_TYPE_P (scalar_type)
4095 || SCALAR_FLOAT_TYPE_P (scalar_type));
4097 if (nested_in_vect_loop_p (loop, stmt))
4098 nested_in_vect_loop = true;
4099 else
4100 gcc_assert (loop == (gimple_bb (stmt))->loop_father);
4102 /* In case of double reduction we only create a vector variable to be put
4103 in the reduction phi node. The actual statement creation is done in
4104 vect_create_epilog_for_reduction. */
4105 if (adjustment_def && nested_in_vect_loop
4106 && TREE_CODE (init_val) == SSA_NAME
4107 && (def_stmt = SSA_NAME_DEF_STMT (init_val))
4108 && gimple_code (def_stmt) == GIMPLE_PHI
4109 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
4110 && vinfo_for_stmt (def_stmt)
4111 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
4112 == vect_double_reduction_def)
4114 *adjustment_def = NULL;
4115 return vect_create_destination_var (init_val, vectype);
4118 /* In case of a nested reduction do not use an adjustment def as
4119 that case is not supported by the epilogue generation correctly
4120 if ncopies is not one. */
4121 if (adjustment_def && nested_in_vect_loop)
4123 *adjustment_def = NULL;
4124 return vect_get_vec_def_for_operand (init_val, stmt);
4127 switch (code)
4129 case WIDEN_SUM_EXPR:
4130 case DOT_PROD_EXPR:
4131 case SAD_EXPR:
4132 case PLUS_EXPR:
4133 case MINUS_EXPR:
4134 case BIT_IOR_EXPR:
4135 case BIT_XOR_EXPR:
4136 case MULT_EXPR:
4137 case BIT_AND_EXPR:
4138 /* ADJUSTMENT_DEF is NULL when called from
4139 vect_create_epilog_for_reduction to vectorize a double reduction. */
4140 if (adjustment_def)
4141 *adjustment_def = init_val;
4143 if (code == MULT_EXPR)
4145 real_init_val = dconst1;
4146 int_init_val = 1;
4149 if (code == BIT_AND_EXPR)
4150 int_init_val = -1;
4152 if (SCALAR_FLOAT_TYPE_P (scalar_type))
4153 def_for_init = build_real (scalar_type, real_init_val);
4154 else
4155 def_for_init = build_int_cst (scalar_type, int_init_val);
4157 /* Create a vector of '0' or '1' except the first element. */
4158 elts = XALLOCAVEC (tree, nunits);
4159 for (i = nunits - 2; i >= 0; --i)
4160 elts[i + 1] = def_for_init;
4162 /* Option1: the first element is '0' or '1' as well. */
4163 if (adjustment_def)
4165 elts[0] = def_for_init;
4166 init_def = build_vector (vectype, elts);
4167 break;
4170 /* Option2: the first element is INIT_VAL. */
4171 elts[0] = init_val;
4172 if (TREE_CONSTANT (init_val))
4173 init_def = build_vector (vectype, elts);
4174 else
4176 vec<constructor_elt, va_gc> *v;
4177 vec_alloc (v, nunits);
4178 CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, init_val);
4179 for (i = 1; i < nunits; ++i)
4180 CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, elts[i]);
4181 init_def = build_constructor (vectype, v);
4184 break;
4186 case MIN_EXPR:
4187 case MAX_EXPR:
4188 case COND_EXPR:
4189 if (adjustment_def)
4191 *adjustment_def = NULL_TREE;
4192 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_vinfo) != COND_REDUCTION)
4194 init_def = vect_get_vec_def_for_operand (init_val, stmt);
4195 break;
4198 init_val = gimple_convert (&stmts, TREE_TYPE (vectype), init_val);
4199 if (! gimple_seq_empty_p (stmts))
4200 gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), stmts);
4201 init_def = build_vector_from_val (vectype, init_val);
4202 break;
4204 default:
4205 gcc_unreachable ();
4208 return init_def;
4211 /* Function vect_create_epilog_for_reduction
4213 Create code at the loop-epilog to finalize the result of a reduction
4214 computation.
4216 VECT_DEFS is the list of vectors of partial results, i.e., the lhs's of vector
4217 reduction statements.
4218 STMT is the scalar reduction stmt that is being vectorized.
4219 NCOPIES is > 1 in case the vectorization factor (VF) is bigger than the
4220 number of elements that we can fit in a vectype (nunits). In this case
4221 we have to generate more than one vector stmt - i.e - we need to "unroll"
4222 the vector stmt by a factor VF/nunits. For more details see documentation
4223 in vectorizable_operation.
4224 REDUC_CODE is the tree-code for the epilog reduction.
4225 REDUCTION_PHIS is a list of the phi-nodes that carry the reduction
4226 computation.
4227 REDUC_INDEX is the index of the operand in the right hand side of the
4228 statement that is defined by REDUCTION_PHI.
4229 DOUBLE_REDUC is TRUE if double reduction phi nodes should be handled.
4230 SLP_NODE is an SLP node containing a group of reduction statements. The
4231 first one in this group is STMT.
4232 INDUCTION_INDEX is the index of the loop for condition reductions.
4233 Otherwise it is undefined.
4235 This function:
4236 1. Creates the reduction def-use cycles: sets the arguments for
4237 REDUCTION_PHIS:
4238 The loop-entry argument is the vectorized initial-value of the reduction.
4239 The loop-latch argument is taken from VECT_DEFS - the vector of partial
4240 sums.
4241 2. "Reduces" each vector of partial results VECT_DEFS into a single result,
4242 by applying the operation specified by REDUC_CODE if available, or by
4243 other means (whole-vector shifts or a scalar loop).
4244 The function also creates a new phi node at the loop exit to preserve
4245 loop-closed form, as illustrated below.
4247 The flow at the entry to this function:
4249 loop:
4250 vec_def = phi <null, null> # REDUCTION_PHI
4251 VECT_DEF = vector_stmt # vectorized form of STMT
4252 s_loop = scalar_stmt # (scalar) STMT
4253 loop_exit:
4254 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
4255 use <s_out0>
4256 use <s_out0>
4258 The above is transformed by this function into:
4260 loop:
4261 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
4262 VECT_DEF = vector_stmt # vectorized form of STMT
4263 s_loop = scalar_stmt # (scalar) STMT
4264 loop_exit:
4265 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
4266 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
4267 v_out2 = reduce <v_out1>
4268 s_out3 = extract_field <v_out2, 0>
4269 s_out4 = adjust_result <s_out3>
4270 use <s_out4>
4271 use <s_out4>
4274 static void
4275 vect_create_epilog_for_reduction (vec<tree> vect_defs, gimple *stmt,
4276 int ncopies, enum tree_code reduc_code,
4277 vec<gimple *> reduction_phis,
4278 int reduc_index, bool double_reduc,
4279 slp_tree slp_node, tree induction_index)
4281 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4282 stmt_vec_info prev_phi_info;
4283 tree vectype;
4284 machine_mode mode;
4285 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4286 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *outer_loop = NULL;
4287 basic_block exit_bb;
4288 tree scalar_dest;
4289 tree scalar_type;
4290 gimple *new_phi = NULL, *phi;
4291 gimple_stmt_iterator exit_gsi;
4292 tree vec_dest;
4293 tree new_temp = NULL_TREE, new_dest, new_name, new_scalar_dest;
4294 gimple *epilog_stmt = NULL;
4295 enum tree_code code = gimple_assign_rhs_code (stmt);
4296 gimple *exit_phi;
4297 tree bitsize;
4298 tree adjustment_def = NULL;
4299 tree vec_initial_def = NULL;
4300 tree reduction_op, expr, def, initial_def = NULL;
4301 tree orig_name, scalar_result;
4302 imm_use_iterator imm_iter, phi_imm_iter;
4303 use_operand_p use_p, phi_use_p;
4304 gimple *use_stmt, *orig_stmt, *reduction_phi = NULL;
4305 bool nested_in_vect_loop = false;
4306 auto_vec<gimple *> new_phis;
4307 auto_vec<gimple *> inner_phis;
4308 enum vect_def_type dt = vect_unknown_def_type;
4309 int j, i;
4310 auto_vec<tree> scalar_results;
4311 unsigned int group_size = 1, k, ratio;
4312 auto_vec<tree> vec_initial_defs;
4313 auto_vec<gimple *> phis;
4314 bool slp_reduc = false;
4315 tree new_phi_result;
4316 gimple *inner_phi = NULL;
4318 if (slp_node)
4319 group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
4321 if (nested_in_vect_loop_p (loop, stmt))
4323 outer_loop = loop;
4324 loop = loop->inner;
4325 nested_in_vect_loop = true;
4326 gcc_assert (!slp_node);
4329 reduction_op = get_reduction_op (stmt, reduc_index);
4331 vectype = get_vectype_for_scalar_type (TREE_TYPE (reduction_op));
4332 gcc_assert (vectype);
4333 mode = TYPE_MODE (vectype);
4335 /* 1. Create the reduction def-use cycle:
4336 Set the arguments of REDUCTION_PHIS, i.e., transform
4338 loop:
4339 vec_def = phi <null, null> # REDUCTION_PHI
4340 VECT_DEF = vector_stmt # vectorized form of STMT
4343 into:
4345 loop:
4346 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
4347 VECT_DEF = vector_stmt # vectorized form of STMT
4350 (in case of SLP, do it for all the phis). */
4352 /* Get the loop-entry arguments. */
4353 enum vect_def_type initial_def_dt = vect_unknown_def_type;
4354 if (slp_node)
4355 vect_get_vec_defs (reduction_op, NULL_TREE, stmt, &vec_initial_defs,
4356 NULL, slp_node, reduc_index);
4357 else
4359 /* Get at the scalar def before the loop, that defines the initial value
4360 of the reduction variable. */
4361 gimple *def_stmt = SSA_NAME_DEF_STMT (reduction_op);
4362 initial_def = PHI_ARG_DEF_FROM_EDGE (def_stmt,
4363 loop_preheader_edge (loop));
4364 vect_is_simple_use (initial_def, loop_vinfo, &def_stmt, &initial_def_dt);
4365 vec_initial_def = get_initial_def_for_reduction (stmt, initial_def,
4366 &adjustment_def);
4367 vec_initial_defs.create (1);
4368 vec_initial_defs.quick_push (vec_initial_def);
4371 /* Set phi nodes arguments. */
4372 FOR_EACH_VEC_ELT (reduction_phis, i, phi)
4374 tree vec_init_def, def;
4375 gimple_seq stmts;
4376 vec_init_def = force_gimple_operand (vec_initial_defs[i], &stmts,
4377 true, NULL_TREE);
4378 gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), stmts);
4379 def = vect_defs[i];
4380 for (j = 0; j < ncopies; j++)
4382 if (j != 0)
4384 phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi));
4385 if (nested_in_vect_loop)
4386 vec_init_def
4387 = vect_get_vec_def_for_stmt_copy (initial_def_dt,
4388 vec_init_def);
4391 /* Set the loop-entry arg of the reduction-phi. */
4393 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
4394 == INTEGER_INDUC_COND_REDUCTION)
4396 /* Initialize the reduction phi to zero. This prevents non-zero
4397 initial values from interfering with the reduction op. */
4398 gcc_assert (ncopies == 1);
4399 gcc_assert (i == 0);
4401 tree vec_init_def_type = TREE_TYPE (vec_init_def);
4402 tree zero_vec = build_zero_cst (vec_init_def_type);
4404 add_phi_arg (as_a <gphi *> (phi), zero_vec,
4405 loop_preheader_edge (loop), UNKNOWN_LOCATION);
4407 else
4408 add_phi_arg (as_a <gphi *> (phi), vec_init_def,
4409 loop_preheader_edge (loop), UNKNOWN_LOCATION);
4411 /* Set the loop-latch arg for the reduction-phi. */
4412 if (j > 0)
4413 def = vect_get_vec_def_for_stmt_copy (vect_unknown_def_type, def);
4415 add_phi_arg (as_a <gphi *> (phi), def, loop_latch_edge (loop),
4416 UNKNOWN_LOCATION);
4418 if (dump_enabled_p ())
4420 dump_printf_loc (MSG_NOTE, vect_location,
4421 "transform reduction: created def-use cycle: ");
4422 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
4423 dump_printf (MSG_NOTE, "\n");
4424 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, SSA_NAME_DEF_STMT (def), 0);
4425 dump_printf (MSG_NOTE, "\n");
4430 /* 2. Create epilog code.
4431 The reduction epilog code operates across the elements of the vector
4432 of partial results computed by the vectorized loop.
4433 The reduction epilog code consists of:
4435 step 1: compute the scalar result in a vector (v_out2)
4436 step 2: extract the scalar result (s_out3) from the vector (v_out2)
4437 step 3: adjust the scalar result (s_out3) if needed.
4439 Step 1 can be accomplished using one of the following three schemes:
4440 (scheme 1) using reduc_code, if available.
4441 (scheme 2) using whole-vector shifts, if available.
4442 (scheme 3) using a scalar loop. In this case steps 1+2 above are
4443 combined.
4445 The overall epilog code looks like this:
4447 s_out0 = phi <s_loop> # original EXIT_PHI
4448 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
4449 v_out2 = reduce <v_out1> # step 1
4450 s_out3 = extract_field <v_out2, 0> # step 2
4451 s_out4 = adjust_result <s_out3> # step 3
4453 (step 3 is optional, and steps 1 and 2 may be combined).
4454 Lastly, the uses of s_out0 are replaced by s_out4. */
4457 /* 2.1 Create new loop-exit-phis to preserve loop-closed form:
4458 v_out1 = phi <VECT_DEF>
4459 Store them in NEW_PHIS. */
4461 exit_bb = single_exit (loop)->dest;
4462 prev_phi_info = NULL;
4463 new_phis.create (vect_defs.length ());
4464 FOR_EACH_VEC_ELT (vect_defs, i, def)
4466 for (j = 0; j < ncopies; j++)
4468 tree new_def = copy_ssa_name (def);
4469 phi = create_phi_node (new_def, exit_bb);
4470 set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, loop_vinfo));
4471 if (j == 0)
4472 new_phis.quick_push (phi);
4473 else
4475 def = vect_get_vec_def_for_stmt_copy (dt, def);
4476 STMT_VINFO_RELATED_STMT (prev_phi_info) = phi;
4479 SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, def);
4480 prev_phi_info = vinfo_for_stmt (phi);
4484 /* The epilogue is created for the outer-loop, i.e., for the loop being
4485 vectorized. Create exit phis for the outer loop. */
4486 if (double_reduc)
4488 loop = outer_loop;
4489 exit_bb = single_exit (loop)->dest;
4490 inner_phis.create (vect_defs.length ());
4491 FOR_EACH_VEC_ELT (new_phis, i, phi)
4493 tree new_result = copy_ssa_name (PHI_RESULT (phi));
4494 gphi *outer_phi = create_phi_node (new_result, exit_bb);
4495 SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx,
4496 PHI_RESULT (phi));
4497 set_vinfo_for_stmt (outer_phi, new_stmt_vec_info (outer_phi,
4498 loop_vinfo));
4499 inner_phis.quick_push (phi);
4500 new_phis[i] = outer_phi;
4501 prev_phi_info = vinfo_for_stmt (outer_phi);
4502 while (STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi)))
4504 phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi));
4505 new_result = copy_ssa_name (PHI_RESULT (phi));
4506 outer_phi = create_phi_node (new_result, exit_bb);
4507 SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx,
4508 PHI_RESULT (phi));
4509 set_vinfo_for_stmt (outer_phi, new_stmt_vec_info (outer_phi,
4510 loop_vinfo));
4511 STMT_VINFO_RELATED_STMT (prev_phi_info) = outer_phi;
4512 prev_phi_info = vinfo_for_stmt (outer_phi);
4517 exit_gsi = gsi_after_labels (exit_bb);
4519 /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
4520 (i.e. when reduc_code is not available) and in the final adjustment
4521 code (if needed). Also get the original scalar reduction variable as
4522 defined in the loop. In case STMT is a "pattern-stmt" (i.e. - it
4523 represents a reduction pattern), the tree-code and scalar-def are
4524 taken from the original stmt that the pattern-stmt (STMT) replaces.
4525 Otherwise (it is a regular reduction) - the tree-code and scalar-def
4526 are taken from STMT. */
4528 orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
4529 if (!orig_stmt)
4531 /* Regular reduction */
4532 orig_stmt = stmt;
4534 else
4536 /* Reduction pattern */
4537 stmt_vec_info stmt_vinfo = vinfo_for_stmt (orig_stmt);
4538 gcc_assert (STMT_VINFO_IN_PATTERN_P (stmt_vinfo));
4539 gcc_assert (STMT_VINFO_RELATED_STMT (stmt_vinfo) == stmt);
4542 code = gimple_assign_rhs_code (orig_stmt);
4543 /* For MINUS_EXPR the initial vector is [init_val,0,...,0], therefore,
4544 partial results are added and not subtracted. */
4545 if (code == MINUS_EXPR)
4546 code = PLUS_EXPR;
4548 scalar_dest = gimple_assign_lhs (orig_stmt);
4549 scalar_type = TREE_TYPE (scalar_dest);
4550 scalar_results.create (group_size);
4551 new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
4552 bitsize = TYPE_SIZE (scalar_type);
4554 /* In case this is a reduction in an inner-loop while vectorizing an outer
4555 loop - we don't need to extract a single scalar result at the end of the
4556 inner-loop (unless it is double reduction, i.e., the use of reduction is
4557 outside the outer-loop). The final vector of partial results will be used
4558 in the vectorized outer-loop, or reduced to a scalar result at the end of
4559 the outer-loop. */
4560 if (nested_in_vect_loop && !double_reduc)
4561 goto vect_finalize_reduction;
4563 /* SLP reduction without reduction chain, e.g.,
4564 # a1 = phi <a2, a0>
4565 # b1 = phi <b2, b0>
4566 a2 = operation (a1)
4567 b2 = operation (b1) */
4568 slp_reduc = (slp_node && !GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)));
4570 /* In case of reduction chain, e.g.,
4571 # a1 = phi <a3, a0>
4572 a2 = operation (a1)
4573 a3 = operation (a2),
4575 we may end up with more than one vector result. Here we reduce them to
4576 one vector. */
4577 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
4579 tree first_vect = PHI_RESULT (new_phis[0]);
4580 tree tmp;
4581 gassign *new_vec_stmt = NULL;
4583 vec_dest = vect_create_destination_var (scalar_dest, vectype);
4584 for (k = 1; k < new_phis.length (); k++)
4586 gimple *next_phi = new_phis[k];
4587 tree second_vect = PHI_RESULT (next_phi);
4589 tmp = build2 (code, vectype, first_vect, second_vect);
4590 new_vec_stmt = gimple_build_assign (vec_dest, tmp);
4591 first_vect = make_ssa_name (vec_dest, new_vec_stmt);
4592 gimple_assign_set_lhs (new_vec_stmt, first_vect);
4593 gsi_insert_before (&exit_gsi, new_vec_stmt, GSI_SAME_STMT);
4596 new_phi_result = first_vect;
4597 if (new_vec_stmt)
4599 new_phis.truncate (0);
4600 new_phis.safe_push (new_vec_stmt);
4603 else
4604 new_phi_result = PHI_RESULT (new_phis[0]);
4606 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION)
4608 /* For condition reductions, we have a vector (NEW_PHI_RESULT) containing
4609 various data values where the condition matched and another vector
4610 (INDUCTION_INDEX) containing all the indexes of those matches. We
4611 need to extract the last matching index (which will be the index with
4612 highest value) and use this to index into the data vector.
4613 For the case where there were no matches, the data vector will contain
4614 all default values and the index vector will be all zeros. */
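  /* For instance, if INDUCTION_INDEX is {2, 0, 5, 3}, the last match
     happened at loop index 5, which was recorded in lane 2, so lane 2 of
     NEW_PHI_RESULT holds the value the epilogue has to extract.  */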
4616 /* Get various versions of the type of the vector of indexes. */
4617 tree index_vec_type = TREE_TYPE (induction_index);
4618 gcc_checking_assert (TYPE_UNSIGNED (index_vec_type));
4619 tree index_scalar_type = TREE_TYPE (index_vec_type);
4620 tree index_vec_cmp_type = build_same_sized_truth_vector_type
4621 (index_vec_type);
4623 /* Get an unsigned integer version of the type of the data vector. */
4624 int scalar_precision = GET_MODE_PRECISION (TYPE_MODE (scalar_type));
4625 tree scalar_type_unsigned = make_unsigned_type (scalar_precision);
4626 tree vectype_unsigned = build_vector_type
4627 (scalar_type_unsigned, TYPE_VECTOR_SUBPARTS (vectype));
4629 /* First we need to create a vector (ZERO_VEC) of zeros and another
4630 vector (MAX_INDEX_VEC) filled with the last matching index, which we
4631 can create using a MAX reduction and then expanding.
4632 In the case where the loop never made any matches, the max index will
4633 be zero. */
4635 /* Vector of {0, 0, 0,...}. */
4636 tree zero_vec = make_ssa_name (vectype);
4637 tree zero_vec_rhs = build_zero_cst (vectype);
4638 gimple *zero_vec_stmt = gimple_build_assign (zero_vec, zero_vec_rhs);
4639 gsi_insert_before (&exit_gsi, zero_vec_stmt, GSI_SAME_STMT);
4641 /* Find maximum value from the vector of found indexes. */
4642 tree max_index = make_ssa_name (index_scalar_type);
4643 gimple *max_index_stmt = gimple_build_assign (max_index, REDUC_MAX_EXPR,
4644 induction_index);
4645 gsi_insert_before (&exit_gsi, max_index_stmt, GSI_SAME_STMT);
4647 /* Vector of {max_index, max_index, max_index,...}. */
4648 tree max_index_vec = make_ssa_name (index_vec_type);
4649 tree max_index_vec_rhs = build_vector_from_val (index_vec_type,
4650 max_index);
4651 gimple *max_index_vec_stmt = gimple_build_assign (max_index_vec,
4652 max_index_vec_rhs);
4653 gsi_insert_before (&exit_gsi, max_index_vec_stmt, GSI_SAME_STMT);
4655 /* Next we compare the new vector (MAX_INDEX_VEC) full of max indexes
4656 with the vector (INDUCTION_INDEX) of found indexes, choosing values
4657 from the data vector (NEW_PHI_RESULT) for matches, 0 (ZERO_VEC)
4658 otherwise. Only one value should match, resulting in a vector
4659 (VEC_COND) with one data value and the rest zeros.
4660 In the case where the loop never made any matches, every index will
4661 match, resulting in a vector with all data values (which will all be
4662 the default value). */
4664 /* Compare the max index vector to the vector of found indexes to find
4665 the position of the max value. */
4666 tree vec_compare = make_ssa_name (index_vec_cmp_type);
4667 gimple *vec_compare_stmt = gimple_build_assign (vec_compare, EQ_EXPR,
4668 induction_index,
4669 max_index_vec);
4670 gsi_insert_before (&exit_gsi, vec_compare_stmt, GSI_SAME_STMT);
4672 /* Use the compare to choose either values from the data vector or
4673 zero. */
4674 tree vec_cond = make_ssa_name (vectype);
4675 gimple *vec_cond_stmt = gimple_build_assign (vec_cond, VEC_COND_EXPR,
4676 vec_compare, new_phi_result,
4677 zero_vec);
4678 gsi_insert_before (&exit_gsi, vec_cond_stmt, GSI_SAME_STMT);
4680 /* Finally we need to extract the data value from the vector (VEC_COND)
4681 into a scalar (MATCHED_DATA_REDUC). Logically we want to do an OR
4682 reduction, but because this doesn't exist, we can use a MAX reduction
4683 instead. The data value might be signed or a float so we need to cast
4684 it first.
4685 In the case where the loop never made any matches, the data values are
4686 all identical, and so will reduce down correctly. */
4688 /* Make the matched data values unsigned. */
4689 tree vec_cond_cast = make_ssa_name (vectype_unsigned);
4690 tree vec_cond_cast_rhs = build1 (VIEW_CONVERT_EXPR, vectype_unsigned,
4691 vec_cond);
4692 gimple *vec_cond_cast_stmt = gimple_build_assign (vec_cond_cast,
4693 VIEW_CONVERT_EXPR,
4694 vec_cond_cast_rhs);
4695 gsi_insert_before (&exit_gsi, vec_cond_cast_stmt, GSI_SAME_STMT);
4697 /* Reduce down to a scalar value. */
4698 tree data_reduc = make_ssa_name (scalar_type_unsigned);
4699 optab ot = optab_for_tree_code (REDUC_MAX_EXPR, vectype_unsigned,
4700 optab_default);
4701 gcc_assert (optab_handler (ot, TYPE_MODE (vectype_unsigned))
4702 != CODE_FOR_nothing);
4703 gimple *data_reduc_stmt = gimple_build_assign (data_reduc,
4704 REDUC_MAX_EXPR,
4705 vec_cond_cast);
4706 gsi_insert_before (&exit_gsi, data_reduc_stmt, GSI_SAME_STMT);
4708 /* Convert the reduced value back to the result type and set as the
4709 result. */
4710 tree data_reduc_cast = build1 (VIEW_CONVERT_EXPR, scalar_type,
4711 data_reduc);
4712 epilog_stmt = gimple_build_assign (new_scalar_dest, data_reduc_cast);
4713 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
4714 gimple_assign_set_lhs (epilog_stmt, new_temp);
4715 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4716 scalar_results.safe_push (new_temp);
4719 /* 2.3 Create the reduction code, using one of the three schemes described
4720 above. In SLP we simply need to extract all the elements from the
4721 vector (without reducing them), so we use scalar shifts. */
4722 else if (reduc_code != ERROR_MARK && !slp_reduc)
4724 tree tmp;
4725 tree vec_elem_type;
4727 /*** Case 1: Create:
4728 v_out2 = reduc_expr <v_out1> */
4730 if (dump_enabled_p ())
4731 dump_printf_loc (MSG_NOTE, vect_location,
4732 "Reduce using direct vector reduction.\n");
4734 vec_elem_type = TREE_TYPE (TREE_TYPE (new_phi_result));
4735 if (!useless_type_conversion_p (scalar_type, vec_elem_type))
4737 tree tmp_dest =
4738 vect_create_destination_var (scalar_dest, vec_elem_type);
4739 tmp = build1 (reduc_code, vec_elem_type, new_phi_result);
4740 epilog_stmt = gimple_build_assign (tmp_dest, tmp);
4741 new_temp = make_ssa_name (tmp_dest, epilog_stmt);
4742 gimple_assign_set_lhs (epilog_stmt, new_temp);
4743 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4745 tmp = build1 (NOP_EXPR, scalar_type, new_temp);
4747 else
4748 tmp = build1 (reduc_code, scalar_type, new_phi_result);
4750 epilog_stmt = gimple_build_assign (new_scalar_dest, tmp);
4751 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
4752 gimple_assign_set_lhs (epilog_stmt, new_temp);
4753 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4755 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
4756 == INTEGER_INDUC_COND_REDUCTION)
4758 /* Earlier we set the initial value to be zero. Check the result
4759 and if it is zero then replace with the original initial
4760 value. */
4761 tree zero = build_zero_cst (scalar_type);
4762 tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp, zero);
4764 tmp = make_ssa_name (new_scalar_dest);
4765 epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
4766 initial_def, new_temp);
4767 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4768 new_temp = tmp;
4771 scalar_results.safe_push (new_temp);
4773 else
4775 bool reduce_with_shift = have_whole_vector_shift (mode);
4776 int element_bitsize = tree_to_uhwi (bitsize);
4777 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
4778 tree vec_temp;
4780 /* Regardless of whether we have a whole vector shift, if we're
4781 emulating the operation via tree-vect-generic, we don't want
4782 to use it. Only the first round of the reduction is likely
4783 to still be profitable via emulation. */
4784 /* ??? It might be better to emit a reduction tree code here, so that
4785 tree-vect-generic can expand the first round via bit tricks. */
4786 if (!VECTOR_MODE_P (mode))
4787 reduce_with_shift = false;
4788 else
4790 optab optab = optab_for_tree_code (code, vectype, optab_default);
4791 if (optab_handler (optab, mode) == CODE_FOR_nothing)
4792 reduce_with_shift = false;
4795 if (reduce_with_shift && !slp_reduc)
4797 int nelements = vec_size_in_bits / element_bitsize;
4798 unsigned char *sel = XALLOCAVEC (unsigned char, nelements);
4800 int elt_offset;
4802 tree zero_vec = build_zero_cst (vectype);
4803 /*** Case 2: Create:
4804 for (offset = nelements/2; offset >= 1; offset/=2)
4806 Create: va' = vec_shift <va, offset>
4807 Create: va = vop <va, va'>
4808 } */
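	      /* Schematically, for a 4-element addition:
		   {a0, a1, a2, a3}
		 + {a2, a3,  0,  0}        <- shift by 2
		 = {a0+a2, a1+a3, a2, a3}
		 + {a1+a3, a2, a3, 0}      <- shift by 1
		 = {a0+a1+a2+a3, ...},
		 after which the scalar result is extracted from the
		 first element.  */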
4810 tree rhs;
4812 if (dump_enabled_p ())
4813 dump_printf_loc (MSG_NOTE, vect_location,
4814 "Reduce using vector shifts\n");
4816 vec_dest = vect_create_destination_var (scalar_dest, vectype);
4817 new_temp = new_phi_result;
4818 for (elt_offset = nelements / 2;
4819 elt_offset >= 1;
4820 elt_offset /= 2)
4822 calc_vec_perm_mask_for_shift (mode, elt_offset, sel);
4823 tree mask = vect_gen_perm_mask_any (vectype, sel);
4824 epilog_stmt = gimple_build_assign (vec_dest, VEC_PERM_EXPR,
4825 new_temp, zero_vec, mask);
4826 new_name = make_ssa_name (vec_dest, epilog_stmt);
4827 gimple_assign_set_lhs (epilog_stmt, new_name);
4828 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4830 epilog_stmt = gimple_build_assign (vec_dest, code, new_name,
4831 new_temp);
4832 new_temp = make_ssa_name (vec_dest, epilog_stmt);
4833 gimple_assign_set_lhs (epilog_stmt, new_temp);
4834 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4837 /* 2.4 Extract the final scalar result. Create:
4838 s_out3 = extract_field <v_out2, bitpos> */
4840 if (dump_enabled_p ())
4841 dump_printf_loc (MSG_NOTE, vect_location,
4842 "extract scalar result\n");
4844 rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp,
4845 bitsize, bitsize_zero_node);
4846 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
4847 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
4848 gimple_assign_set_lhs (epilog_stmt, new_temp);
4849 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4850 scalar_results.safe_push (new_temp);
4852 else
4854 /*** Case 3: Create:
4855 s = extract_field <v_out2, 0>
4856 for (offset = element_size;
4857 offset < vector_size;
4858 offset += element_size;)
4860 Create: s' = extract_field <v_out2, offset>
4861 Create: s = op <s, s'> // For non SLP cases
4862 } */
4864 if (dump_enabled_p ())
4865 dump_printf_loc (MSG_NOTE, vect_location,
4866 "Reduce using scalar code.\n");
4868 vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
4869 FOR_EACH_VEC_ELT (new_phis, i, new_phi)
4871 int bit_offset;
4872 if (gimple_code (new_phi) == GIMPLE_PHI)
4873 vec_temp = PHI_RESULT (new_phi);
4874 else
4875 vec_temp = gimple_assign_lhs (new_phi);
4876 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, bitsize,
4877 bitsize_zero_node);
4878 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
4879 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
4880 gimple_assign_set_lhs (epilog_stmt, new_temp);
4881 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4883 /* In SLP we don't need to apply reduction operation, so we just
4884 collect s' values in SCALAR_RESULTS. */
4885 if (slp_reduc)
4886 scalar_results.safe_push (new_temp);
4888 for (bit_offset = element_bitsize;
4889 bit_offset < vec_size_in_bits;
4890 bit_offset += element_bitsize)
4892 tree bitpos = bitsize_int (bit_offset);
4893 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp,
4894 bitsize, bitpos);
4896 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
4897 new_name = make_ssa_name (new_scalar_dest, epilog_stmt);
4898 gimple_assign_set_lhs (epilog_stmt, new_name);
4899 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4901 if (slp_reduc)
4903 /* In SLP we don't need to apply reduction operation, so
4904 we just collect s' values in SCALAR_RESULTS. */
4905 new_temp = new_name;
4906 scalar_results.safe_push (new_name);
4908 else
4910 epilog_stmt = gimple_build_assign (new_scalar_dest, code,
4911 new_name, new_temp);
4912 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
4913 gimple_assign_set_lhs (epilog_stmt, new_temp);
4914 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4919 /* The only case where we need to reduce scalar results in SLP is
4920 unrolling. If the size of SCALAR_RESULTS is greater than
4921 GROUP_SIZE, we reduce them combining elements modulo
4922 GROUP_SIZE. */
4923 if (slp_reduc)
4925 tree res, first_res, new_res;
4926 gimple *new_stmt;
4928 /* Reduce multiple scalar results in case of SLP unrolling. */
4929 for (j = group_size; scalar_results.iterate (j, &res);
4930 j++)
4932 first_res = scalar_results[j % group_size];
4933 new_stmt = gimple_build_assign (new_scalar_dest, code,
4934 first_res, res);
4935 new_res = make_ssa_name (new_scalar_dest, new_stmt);
4936 gimple_assign_set_lhs (new_stmt, new_res);
4937 gsi_insert_before (&exit_gsi, new_stmt, GSI_SAME_STMT);
4938 scalar_results[j % group_size] = new_res;
4941 else
4942 /* Not SLP - we have one scalar to keep in SCALAR_RESULTS. */
4943 scalar_results.safe_push (new_temp);
4947 vect_finalize_reduction:
4949 if (double_reduc)
4950 loop = loop->inner;
4952 /* 2.5 Adjust the final result by the initial value of the reduction
4953 variable. (When such adjustment is not needed, then
4954 'adjustment_def' is zero). For example, if code is PLUS we create:
4955 new_temp = loop_exit_def + adjustment_def */
4957 if (adjustment_def)
4959 gcc_assert (!slp_reduc);
4960 if (nested_in_vect_loop)
4962 new_phi = new_phis[0];
4963 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) == VECTOR_TYPE);
4964 expr = build2 (code, vectype, PHI_RESULT (new_phi), adjustment_def);
4965 new_dest = vect_create_destination_var (scalar_dest, vectype);
4967 else
4969 new_temp = scalar_results[0];
4970 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
4971 expr = build2 (code, scalar_type, new_temp, adjustment_def);
4972 new_dest = vect_create_destination_var (scalar_dest, scalar_type);
4975 epilog_stmt = gimple_build_assign (new_dest, expr);
4976 new_temp = make_ssa_name (new_dest, epilog_stmt);
4977 gimple_assign_set_lhs (epilog_stmt, new_temp);
4978 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4979 if (nested_in_vect_loop)
4981 set_vinfo_for_stmt (epilog_stmt,
4982 new_stmt_vec_info (epilog_stmt, loop_vinfo));
4983 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (epilog_stmt)) =
4984 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (new_phi));
4986 if (!double_reduc)
4987 scalar_results.quick_push (new_temp);
4988 else
4989 scalar_results[0] = new_temp;
4991 else
4992 scalar_results[0] = new_temp;
4994 new_phis[0] = epilog_stmt;
4997 /* 2.6 Handle the loop-exit phis. Replace the uses of scalar loop-exit
4998 phis with new adjusted scalar results, i.e., replace use <s_out0>
4999 with use <s_out4>.
5001 Transform:
5002 loop_exit:
5003 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
5004 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
5005 v_out2 = reduce <v_out1>
5006 s_out3 = extract_field <v_out2, 0>
5007 s_out4 = adjust_result <s_out3>
5008 use <s_out0>
5009 use <s_out0>
5011 into:
5013 loop_exit:
5014 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
5015 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
5016 v_out2 = reduce <v_out1>
5017 s_out3 = extract_field <v_out2, 0>
5018 s_out4 = adjust_result <s_out3>
5019 use <s_out4>
5020 use <s_out4> */
5023 /* In an SLP reduction chain we reduce vector results into one vector if
5024 necessary, hence we set GROUP_SIZE to 1 here. SCALAR_DEST is the LHS of
5025 the last stmt in the reduction chain, since we are looking for the loop
5026 exit phi node. */
5027 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
5029 gimple *dest_stmt = SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1];
5030 /* Handle reduction patterns. */
5031 if (STMT_VINFO_RELATED_STMT (vinfo_for_stmt (dest_stmt)))
5032 dest_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (dest_stmt));
5034 scalar_dest = gimple_assign_lhs (dest_stmt);
5035 group_size = 1;
5038 /* In SLP we may have several statements in NEW_PHIS and REDUCTION_PHIS (in
5039 case that GROUP_SIZE is greater than vectorization factor). Therefore, we
5040 need to match SCALAR_RESULTS with corresponding statements. The first
5041 (GROUP_SIZE / number of new vector stmts) scalar results correspond to
5042 the first vector stmt, etc.
5043 (RATIO is equal to (GROUP_SIZE / number of new vector stmts)). */
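  /* E.g., GROUP_SIZE == 8 with two vector stmts gives RATIO == 4: scalar
     results 0-3 are matched with the first vector stmt and results 4-7
     with the second.  */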
5044 if (group_size > new_phis.length ())
5046 ratio = group_size / new_phis.length ();
5047 gcc_assert (!(group_size % new_phis.length ()));
5049 else
5050 ratio = 1;
5052 for (k = 0; k < group_size; k++)
5054 if (k % ratio == 0)
5056 epilog_stmt = new_phis[k / ratio];
5057 reduction_phi = reduction_phis[k / ratio];
5058 if (double_reduc)
5059 inner_phi = inner_phis[k / ratio];
5062 if (slp_reduc)
5064 gimple *current_stmt = SLP_TREE_SCALAR_STMTS (slp_node)[k];
5066 orig_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (current_stmt));
5067 /* SLP statements can't participate in patterns. */
5068 gcc_assert (!orig_stmt);
5069 scalar_dest = gimple_assign_lhs (current_stmt);
5072 phis.create (3);
5073 /* Find the loop-closed-use at the loop exit of the original scalar
5074 result. (The reduction result is expected to have two immediate uses -
5075 one at the latch block, and one at the loop exit). */
5076 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
5077 if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p)))
5078 && !is_gimple_debug (USE_STMT (use_p)))
5079 phis.safe_push (USE_STMT (use_p));
5081 /* While we expect to have found an exit_phi because of loop-closed-ssa
5082 form, we can end up without one if the scalar cycle is dead. */
5084 FOR_EACH_VEC_ELT (phis, i, exit_phi)
5086 if (outer_loop)
5088 stmt_vec_info exit_phi_vinfo = vinfo_for_stmt (exit_phi);
5089 gphi *vect_phi;
5091 /* FORNOW. Currently not supporting the case that an inner-loop
5092 reduction is not used in the outer-loop (but only outside the
5093 outer-loop), unless it is double reduction. */
5094 gcc_assert ((STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
5095 && !STMT_VINFO_LIVE_P (exit_phi_vinfo))
5096 || double_reduc);
5098 if (double_reduc)
5099 STMT_VINFO_VEC_STMT (exit_phi_vinfo) = inner_phi;
5100 else
5101 STMT_VINFO_VEC_STMT (exit_phi_vinfo) = epilog_stmt;
5102 if (!double_reduc
5103 || STMT_VINFO_DEF_TYPE (exit_phi_vinfo)
5104 != vect_double_reduction_def)
5105 continue;
5107 /* Handle double reduction:
5109 stmt1: s1 = phi <s0, s2> - double reduction phi (outer loop)
5110 stmt2: s3 = phi <s1, s4> - (regular) reduc phi (inner loop)
5111 stmt3: s4 = use (s3) - (regular) reduc stmt (inner loop)
5112 stmt4: s2 = phi <s4> - double reduction stmt (outer loop)
5114 At that point the regular reduction (stmt2 and stmt3) is
5115 already vectorized, as well as the exit phi node, stmt4.
5116 Here we vectorize the phi node of double reduction, stmt1, and
5117 update all relevant statements. */
5119 /* Go through all the uses of s2 to find double reduction phi
5120 node, i.e., stmt1 above. */
5121 orig_name = PHI_RESULT (exit_phi);
5122 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
5124 stmt_vec_info use_stmt_vinfo;
5125 stmt_vec_info new_phi_vinfo;
5126 tree vect_phi_init, preheader_arg, vect_phi_res, init_def;
5127 basic_block bb = gimple_bb (use_stmt);
5128 gimple *use;
5130 /* Check that USE_STMT is really a double reduction phi
5131 node. */
5132 if (gimple_code (use_stmt) != GIMPLE_PHI
5133 || gimple_phi_num_args (use_stmt) != 2
5134 || bb->loop_father != outer_loop)
5135 continue;
5136 use_stmt_vinfo = vinfo_for_stmt (use_stmt);
5137 if (!use_stmt_vinfo
5138 || STMT_VINFO_DEF_TYPE (use_stmt_vinfo)
5139 != vect_double_reduction_def)
5140 continue;
5142 /* Create vector phi node for double reduction:
5143 vs1 = phi <vs0, vs2>
5144 vs1 was created previously in this function by a call to
5145 vect_get_vec_def_for_operand and is stored in
5146 vec_initial_def;
5147 vs2 is defined by INNER_PHI, the vectorized EXIT_PHI;
5148 vs0 is created here. */
5150 /* Create vector phi node. */
5151 vect_phi = create_phi_node (vec_initial_def, bb);
5152 new_phi_vinfo = new_stmt_vec_info (vect_phi,
5153 loop_vec_info_for_loop (outer_loop));
5154 set_vinfo_for_stmt (vect_phi, new_phi_vinfo);
5156 /* Create vs0 - initial def of the double reduction phi. */
5157 preheader_arg = PHI_ARG_DEF_FROM_EDGE (use_stmt,
5158 loop_preheader_edge (outer_loop));
5159 init_def = get_initial_def_for_reduction (stmt,
5160 preheader_arg, NULL);
5161 vect_phi_init = vect_init_vector (use_stmt, init_def,
5162 vectype, NULL);
5164 /* Update phi node arguments with vs0 and vs2. */
5165 add_phi_arg (vect_phi, vect_phi_init,
5166 loop_preheader_edge (outer_loop),
5167 UNKNOWN_LOCATION);
5168 add_phi_arg (vect_phi, PHI_RESULT (inner_phi),
5169 loop_latch_edge (outer_loop), UNKNOWN_LOCATION);
5170 if (dump_enabled_p ())
5172 dump_printf_loc (MSG_NOTE, vect_location,
5173 "created double reduction phi node: ");
5174 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, vect_phi, 0);
5175 dump_printf (MSG_NOTE, "\n");
5178 vect_phi_res = PHI_RESULT (vect_phi);
5180 /* Replace the use, i.e., set the correct vs1 in the regular
5181 reduction phi node. FORNOW, NCOPIES is always 1, so the
5182 loop is redundant. */
5183 use = reduction_phi;
5184 for (j = 0; j < ncopies; j++)
5186 edge pr_edge = loop_preheader_edge (loop);
5187 SET_PHI_ARG_DEF (use, pr_edge->dest_idx, vect_phi_res);
5188 use = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (use));
5194 phis.release ();
5195 if (nested_in_vect_loop)
5197 if (double_reduc)
5198 loop = outer_loop;
5199 else
5200 continue;
5203 phis.create (3);
5204 /* Find the loop-closed-use at the loop exit of the original scalar
5205 result. (The reduction result is expected to have two immediate uses,
5206 one at the latch block, and one at the loop exit). For double
5207 reductions we are looking for exit phis of the outer loop. */
5208 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
5210 if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
5212 if (!is_gimple_debug (USE_STMT (use_p)))
5213 phis.safe_push (USE_STMT (use_p));
5215 else
5217 if (double_reduc && gimple_code (USE_STMT (use_p)) == GIMPLE_PHI)
5219 tree phi_res = PHI_RESULT (USE_STMT (use_p));
5221 FOR_EACH_IMM_USE_FAST (phi_use_p, phi_imm_iter, phi_res)
5223 if (!flow_bb_inside_loop_p (loop,
5224 gimple_bb (USE_STMT (phi_use_p)))
5225 && !is_gimple_debug (USE_STMT (phi_use_p)))
5226 phis.safe_push (USE_STMT (phi_use_p));
5232 FOR_EACH_VEC_ELT (phis, i, exit_phi)
5234 /* Replace the uses: */
5235 orig_name = PHI_RESULT (exit_phi);
5236 scalar_result = scalar_results[k];
5237 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
5238 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
5239 SET_USE (use_p, scalar_result);
5242 phis.release ();
5247 /* Function is_nonwrapping_integer_induction.
5249 Check whether the induction defined by STMT (which is part of loop LOOP)
5250 both increments and cannot cause overflow. */
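/* For instance, an unsigned char induction with base 0 and step 4 in a
   loop that executes at most 100 times can reach 400, which does not fit
   in 8 bits, so the check below fails; with step 2 the maximum is 200 and
   the check succeeds.  */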
5252 static bool
5253 is_nonwrapping_integer_induction (gimple *stmt, struct loop *loop)
5255 stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
5256 tree base = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo);
5257 tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo);
5258 tree lhs_type = TREE_TYPE (gimple_phi_result (stmt));
5259 widest_int ni, max_loop_value, lhs_max;
5260 bool overflow = false;
5262 /* Make sure the loop is integer based. */
5263 if (TREE_CODE (base) != INTEGER_CST
5264 || TREE_CODE (step) != INTEGER_CST)
5265 return false;
5267 /* Check that the induction increments. */
5268 if (tree_int_cst_sgn (step) == -1)
5269 return false;
5271 /* Check that the max size of the loop will not wrap. */
5273 if (TYPE_OVERFLOW_UNDEFINED (lhs_type))
5274 return true;
5276 if (! max_stmt_executions (loop, &ni))
5277 return false;
5279 max_loop_value = wi::mul (wi::to_widest (step), ni, TYPE_SIGN (lhs_type),
5280 &overflow);
5281 if (overflow)
5282 return false;
5284 max_loop_value = wi::add (wi::to_widest (base), max_loop_value,
5285 TYPE_SIGN (lhs_type), &overflow);
5286 if (overflow)
5287 return false;
5289 return (wi::min_precision (max_loop_value, TYPE_SIGN (lhs_type))
5290 <= TYPE_PRECISION (lhs_type));
5293 /* Function vectorizable_reduction.
5295 Check if STMT performs a reduction operation that can be vectorized.
5296 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
5297 stmt to replace it, put it in VEC_STMT, and insert it at GSI.
5298 Return FALSE if not a vectorizable STMT, TRUE otherwise.
5300 This function also handles reduction idioms (patterns) that have been
5301 recognized in advance during vect_pattern_recog. In this case, STMT may be
5302 of this form:
5303 X = pattern_expr (arg0, arg1, ..., X)
5304 and its STMT_VINFO_RELATED_STMT points to the last stmt in the original
5305 sequence that had been detected and replaced by the pattern-stmt (STMT).
5307 This function also handles reduction of condition expressions, for example:
5308 for (int i = 0; i < N; i++)
5309 if (a[i] < value)
5310 last = a[i];
5311 This is handled by vectorizing the loop and creating an additional vector
5312 containing the loop indexes for which "a[i] < value" was true. In the
5313 function epilogue this is reduced to a single max value and then used to
5314 index into the vector of results.
5316 In some cases of reduction patterns, the type of the reduction variable X is
5317 different than the type of the other arguments of STMT.
5318 In such cases, the vectype that is used when transforming STMT into a vector
5319 stmt is different than the vectype that is used to determine the
5320 vectorization factor, because it consists of a different number of elements
5321 than the actual number of elements that are being operated upon in parallel.
5323 For example, consider an accumulation of shorts into an int accumulator.
5324 On some targets it's possible to vectorize this pattern operating on 8
5325 shorts at a time (hence, the vectype for purposes of determining the
5326 vectorization factor should be V8HI); on the other hand, the vectype that
5327 is used to create the vector form is actually V4SI (the type of the result).
5329 Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
5331 indicates the actual level of parallelism (V8HI in the example), so that
5332 the right vectorization factor is derived. This vectype
5332 corresponds to the type of arguments to the reduction stmt, and should *NOT*
5333 be used to create the vectorized stmt. The right vectype for the vectorized
5334 stmt is obtained from the type of the result X:
5335 get_vectype_for_scalar_type (TREE_TYPE (X))
5337 This means that, contrary to "regular" reductions (or "regular" stmts in
5338 general), the following equation:
5339 STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (TREE_TYPE (X))
5340 does *NOT* necessarily hold for reduction patterns. */
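/* For illustration, a reduction pattern of the widening-sum kind described
   above could come from scalar source such as:

     short a[N];
     int sum = 0;
     for (i = 0; i < N; i++)
       sum += a[i];

   where the pattern stmt operates on shorts (so V8HI determines the
   vectorization factor) while the reduction result X and the vectorized
   stmt use int (V4SI).  */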
5342 bool
5343 vectorizable_reduction (gimple *stmt, gimple_stmt_iterator *gsi,
5344 gimple **vec_stmt, slp_tree slp_node)
5346 tree vec_dest;
5347 tree scalar_dest;
5348 tree loop_vec_def0 = NULL_TREE, loop_vec_def1 = NULL_TREE;
5349 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5350 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
5351 tree vectype_in = NULL_TREE;
5352 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5353 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5354 enum tree_code code, orig_code, epilog_reduc_code;
5355 machine_mode vec_mode;
5356 int op_type;
5357 optab optab, reduc_optab;
5358 tree new_temp = NULL_TREE;
5359 gimple *def_stmt;
5360 enum vect_def_type dt;
5361 gphi *new_phi = NULL;
5362 tree scalar_type;
5363 bool is_simple_use;
5364 gimple *orig_stmt;
5365 stmt_vec_info orig_stmt_info;
5366 tree expr = NULL_TREE;
5367 int i;
5368 int ncopies;
5369 int epilog_copies;
5370 stmt_vec_info prev_stmt_info, prev_phi_info;
5371 bool single_defuse_cycle = false;
5372 tree reduc_def = NULL_TREE;
5373 gimple *new_stmt = NULL;
5374 int j;
5375 tree ops[3];
5376 bool nested_cycle = false, found_nested_cycle_def = false;
5377 gimple *reduc_def_stmt = NULL;
5378 bool double_reduc = false, dummy;
5379 basic_block def_bb;
5380 struct loop * def_stmt_loop, *outer_loop = NULL;
5381 tree def_arg;
5382 gimple *def_arg_stmt;
5383 auto_vec<tree> vec_oprnds0;
5384 auto_vec<tree> vec_oprnds1;
5385 auto_vec<tree> vect_defs;
5386 auto_vec<gimple *> phis;
5387 int vec_num;
5388 tree def0, def1, tem, op0, op1 = NULL_TREE;
5389 bool first_p = true;
5390 tree cr_index_scalar_type = NULL_TREE, cr_index_vector_type = NULL_TREE;
5391 gimple *cond_expr_induction_def_stmt = NULL;
5393 /* In case of a reduction chain we switch to the first stmt in the chain, but
5394 we don't update STMT_INFO, since only the last stmt is marked as reduction
5395 and has reduction properties. */
5396 if (GROUP_FIRST_ELEMENT (stmt_info)
5397 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
5399 stmt = GROUP_FIRST_ELEMENT (stmt_info);
5400 first_p = false;
5403 if (nested_in_vect_loop_p (loop, stmt))
5405 outer_loop = loop;
5406 loop = loop->inner;
5407 nested_cycle = true;
5410 /* 1. Is vectorizable reduction? */
5411 /* Not supportable if the reduction variable is used in the loop, unless
5412 it's a reduction chain. */
5413 if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer
5414 && !GROUP_FIRST_ELEMENT (stmt_info))
5415 return false;
5417 /* Reductions that are not used even in an enclosing outer-loop
5418 are expected to be "live" (used out of the loop). */
5419 if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope
5420 && !STMT_VINFO_LIVE_P (stmt_info))
5421 return false;
5423 /* Make sure it was already recognized as a reduction computation. */
5424 if (STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmt)) != vect_reduction_def
5425 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmt)) != vect_nested_cycle)
5426 return false;
5428 /* 2. Has this been recognized as a reduction pattern?
5430 Check if STMT represents a pattern that has been recognized
5431 in earlier analysis stages. For stmts that represent a pattern,
5432 the STMT_VINFO_RELATED_STMT field records the last stmt in
5433 the original sequence that constitutes the pattern. */
5435 orig_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt));
5436 if (orig_stmt)
5438 orig_stmt_info = vinfo_for_stmt (orig_stmt);
5439 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
5440 gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
5443 /* 3. Check the operands of the operation. The first operands are defined
5444 inside the loop body. The last operand is the reduction variable,
5445 which is defined by the loop-header-phi. */
5447 gcc_assert (is_gimple_assign (stmt));
5449 /* Flatten RHS. */
5450 switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
5452 case GIMPLE_SINGLE_RHS:
5453 op_type = TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt));
5454 if (op_type == ternary_op)
5456 tree rhs = gimple_assign_rhs1 (stmt);
5457 ops[0] = TREE_OPERAND (rhs, 0);
5458 ops[1] = TREE_OPERAND (rhs, 1);
5459 ops[2] = TREE_OPERAND (rhs, 2);
5460 code = TREE_CODE (rhs);
5462 else
5463 return false;
5464 break;
5466 case GIMPLE_BINARY_RHS:
5467 code = gimple_assign_rhs_code (stmt);
5468 op_type = TREE_CODE_LENGTH (code);
5469 gcc_assert (op_type == binary_op);
5470 ops[0] = gimple_assign_rhs1 (stmt);
5471 ops[1] = gimple_assign_rhs2 (stmt);
5472 break;
5474 case GIMPLE_TERNARY_RHS:
5475 code = gimple_assign_rhs_code (stmt);
5476 op_type = TREE_CODE_LENGTH (code);
5477 gcc_assert (op_type == ternary_op);
5478 ops[0] = gimple_assign_rhs1 (stmt);
5479 ops[1] = gimple_assign_rhs2 (stmt);
5480 ops[2] = gimple_assign_rhs3 (stmt);
5481 break;
5483 case GIMPLE_UNARY_RHS:
5484 return false;
5486 default:
5487 gcc_unreachable ();
5489 /* The default is that the reduction variable is the last operand in the statement. */
5490 int reduc_index = op_type - 1;
5491 if (code == MINUS_EXPR)
5492 reduc_index = 0;
5494 if (code == COND_EXPR && slp_node)
5495 return false;
5497 scalar_dest = gimple_assign_lhs (stmt);
5498 scalar_type = TREE_TYPE (scalar_dest);
5499 if (!POINTER_TYPE_P (scalar_type) && !INTEGRAL_TYPE_P (scalar_type)
5500 && !SCALAR_FLOAT_TYPE_P (scalar_type))
5501 return false;
5503 /* Do not try to vectorize bit-precision reductions. */
5504 if ((TYPE_PRECISION (scalar_type)
5505 != GET_MODE_PRECISION (TYPE_MODE (scalar_type))))
5506 return false;
5508 /* All uses but the last are expected to be defined in the loop.
5509 The last use is the reduction variable. In case of nested cycle this
5510 assumption is not true: we use reduc_index to record the index of the
5511 reduction variable. */
5512 for (i = 0; i < op_type; i++)
5514 if (i == reduc_index)
5515 continue;
5517 /* The condition of COND_EXPR is checked in vectorizable_condition(). */
5518 if (i == 0 && code == COND_EXPR)
5519 continue;
5521 is_simple_use = vect_is_simple_use (ops[i], loop_vinfo,
5522 &def_stmt, &dt, &tem);
5523 if (!vectype_in)
5524 vectype_in = tem;
5525 gcc_assert (is_simple_use);
5527 if (dt != vect_internal_def
5528 && dt != vect_external_def
5529 && dt != vect_constant_def
5530 && dt != vect_induction_def
5531 && !(dt == vect_nested_cycle && nested_cycle))
5532 return false;
5534 if (dt == vect_nested_cycle)
5536 found_nested_cycle_def = true;
5537 reduc_def_stmt = def_stmt;
5538 reduc_index = i;
5541 if (i == 1 && code == COND_EXPR && dt == vect_induction_def)
5542 cond_expr_induction_def_stmt = def_stmt;
5545 is_simple_use = vect_is_simple_use (ops[reduc_index], loop_vinfo,
5546 &def_stmt, &dt, &tem);
5547 if (!vectype_in)
5548 vectype_in = tem;
5549 gcc_assert (is_simple_use);
5550 if (!found_nested_cycle_def)
5551 reduc_def_stmt = def_stmt;
5553 if (reduc_def_stmt && gimple_code (reduc_def_stmt) != GIMPLE_PHI)
5554 return false;
5556 if (!(dt == vect_reduction_def
5557 || dt == vect_nested_cycle
5558 || ((dt == vect_internal_def || dt == vect_external_def
5559 || dt == vect_constant_def || dt == vect_induction_def)
5560 && nested_cycle && found_nested_cycle_def)))
5562 /* For pattern recognized stmts, orig_stmt might be a reduction,
5563 but some helper statements for the pattern might not, or
5564 might be COND_EXPRs with reduction uses in the condition. */
5565 gcc_assert (orig_stmt);
5566 return false;
5569 enum vect_reduction_type v_reduc_type;
5570 gimple *tmp = vect_is_simple_reduction (loop_vinfo, reduc_def_stmt,
5571 !nested_cycle, &dummy, false,
5572 &v_reduc_type);
5574 /* If we have a condition reduction, see if we can simplify it further. */
5575 if (v_reduc_type == COND_REDUCTION
5576 && cond_expr_induction_def_stmt != NULL
5577 && is_nonwrapping_integer_induction (cond_expr_induction_def_stmt, loop))
5579 if (dump_enabled_p ())
5580 dump_printf_loc (MSG_NOTE, vect_location,
5581 "condition expression based on integer induction.\n");
5582 STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) = INTEGER_INDUC_COND_REDUCTION;
5584 else
5585 STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) = v_reduc_type;
5587 if (orig_stmt)
5588 gcc_assert (tmp == orig_stmt
5589 || GROUP_FIRST_ELEMENT (vinfo_for_stmt (tmp)) == orig_stmt);
5590 else
5591 /* We changed STMT to be the first stmt in reduction chain, hence we
5592 check that in this case the first element in the chain is STMT. */
5593 gcc_assert (stmt == tmp
5594 || GROUP_FIRST_ELEMENT (vinfo_for_stmt (tmp)) == stmt);
5596 if (STMT_VINFO_LIVE_P (vinfo_for_stmt (reduc_def_stmt)))
5597 return false;
5599 if (slp_node)
5600 ncopies = 1;
5601 else
5602 ncopies = (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
5603 / TYPE_VECTOR_SUBPARTS (vectype_in));
5605 gcc_assert (ncopies >= 1);
5607 vec_mode = TYPE_MODE (vectype_in);
5609 if (code == COND_EXPR)
5611 /* Only call during the analysis stage, otherwise we'll lose
5612 STMT_VINFO_TYPE. */
5613 if (!vec_stmt && !vectorizable_condition (stmt, gsi, NULL,
5614 ops[reduc_index], 0, NULL))
5616 if (dump_enabled_p ())
5617 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5618 "unsupported condition in reduction\n");
5619 return false;
5622 else
5624 /* 4. Supportable by target? */
5626 if (code == LSHIFT_EXPR || code == RSHIFT_EXPR
5627 || code == LROTATE_EXPR || code == RROTATE_EXPR)
5629 /* Shifts and rotates are only supported by vectorizable_shifts,
5630 not vectorizable_reduction. */
5631 if (dump_enabled_p ())
5632 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5633 "unsupported shift or rotation.\n");
5634 return false;
5637 /* 4.1. check support for the operation in the loop */
5638 optab = optab_for_tree_code (code, vectype_in, optab_default);
5639 if (!optab)
5641 if (dump_enabled_p ())
5642 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5643 "no optab.\n");
5645 return false;
5648 if (optab_handler (optab, vec_mode) == CODE_FOR_nothing)
5650 if (dump_enabled_p ())
5651 dump_printf (MSG_NOTE, "op not supported by target.\n");
5653 if (GET_MODE_SIZE (vec_mode) != UNITS_PER_WORD
5654 || LOOP_VINFO_VECT_FACTOR (loop_vinfo)
5655 < vect_min_worthwhile_factor (code))
5656 return false;
5658 if (dump_enabled_p ())
5659 dump_printf (MSG_NOTE, "proceeding using word mode.\n");
5662 /* Worthwhile without SIMD support? */
5663 if (!VECTOR_MODE_P (TYPE_MODE (vectype_in))
5664 && LOOP_VINFO_VECT_FACTOR (loop_vinfo)
5665 < vect_min_worthwhile_factor (code))
5667 if (dump_enabled_p ())
5668 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5669 "not worthwhile without SIMD support.\n");
5671 return false;
5675 /* 4.2. Check support for the epilog operation.
5677 If STMT represents a reduction pattern, then the type of the
5678 reduction variable may be different than the type of the rest
5679 of the arguments. For example, consider the case of accumulation
5680 of shorts into an int accumulator.  The original code:
5681 S1: int_a = (int) short_a;
5682 orig_stmt-> S2: int_acc = plus <int_a ,int_acc>;
5684 was replaced with:
5685 STMT: int_acc = widen_sum <short_a, int_acc>
5687 This means that:
5688 1. The tree-code that is used to create the vector operation in the
5689 epilog code (that reduces the partial results) is not the
5690 tree-code of STMT, but is rather the tree-code of the original
5691 stmt from the pattern that STMT is replacing. I.e., in the example
5692 above we want to use 'widen_sum' in the loop, but 'plus' in the
5693 epilog.
5694 2. The type (mode) we use to check available target support
5695 for the vector operation to be created in the *epilog*, is
5696 determined by the type of the reduction variable (in the example
5697 above we'd check this: optab_handler (plus_optab, vect_int_mode])).
5698 However the type (mode) we use to check available target support
5699 for the vector operation to be created *inside the loop*, is
5700 determined by the type of the other arguments to STMT (in the
5701 example we'd check this: optab_handler (widen_sum_optab,
5702 vect_short_mode)).
5704 This is contrary to "regular" reductions, in which the types of all
5705 the arguments are the same as the type of the reduction variable.
5706 For "regular" reductions we can therefore use the same vector type
5707 (and also the same tree-code) when generating the epilog code and
5708 when generating the code inside the loop. */
5710 if (orig_stmt)
5712 /* This is a reduction pattern: get the vectype from the type of the
5713 reduction variable, and get the tree-code from orig_stmt. */
5714 gcc_assert (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5715 == TREE_CODE_REDUCTION);
5716 orig_code = gimple_assign_rhs_code (orig_stmt);
5717 gcc_assert (vectype_out);
5718 vec_mode = TYPE_MODE (vectype_out);
5720 else
5722 /* Regular reduction: the same vectype and tree-code that are used for
5723 the vector code inside the loop can also be used for the epilog code. */
5724 orig_code = code;
5726 if (code == MINUS_EXPR)
5727 orig_code = PLUS_EXPR;
5729 /* For simple condition reductions, replace with the actual expression
5730 we want to base our reduction around. */
5731 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5732 == INTEGER_INDUC_COND_REDUCTION)
5733 orig_code = MAX_EXPR;
5736 if (nested_cycle)
5738 def_bb = gimple_bb (reduc_def_stmt);
5739 def_stmt_loop = def_bb->loop_father;
5740 def_arg = PHI_ARG_DEF_FROM_EDGE (reduc_def_stmt,
5741 loop_preheader_edge (def_stmt_loop));
5742 if (TREE_CODE (def_arg) == SSA_NAME
5743 && (def_arg_stmt = SSA_NAME_DEF_STMT (def_arg))
5744 && gimple_code (def_arg_stmt) == GIMPLE_PHI
5745 && flow_bb_inside_loop_p (outer_loop, gimple_bb (def_arg_stmt))
5746 && vinfo_for_stmt (def_arg_stmt)
5747 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_arg_stmt))
5748 == vect_double_reduction_def)
5749 double_reduc = true;
5752 epilog_reduc_code = ERROR_MARK;
5754 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == TREE_CODE_REDUCTION
5755 || STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5756 == INTEGER_INDUC_COND_REDUCTION)
5758 if (reduction_code_for_scalar_code (orig_code, &epilog_reduc_code))
5760 reduc_optab = optab_for_tree_code (epilog_reduc_code, vectype_out,
5761 optab_default);
5762 if (!reduc_optab)
5764 if (dump_enabled_p ())
5765 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5766 "no optab for reduction.\n");
5768 epilog_reduc_code = ERROR_MARK;
5770 else if (optab_handler (reduc_optab, vec_mode) == CODE_FOR_nothing)
5772 if (dump_enabled_p ())
5773 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5774 "reduc op not supported by target.\n");
5776 epilog_reduc_code = ERROR_MARK;
5779 /* When epilog_reduc_code is ERROR_MARK then a reduction will be
5780 generated in the epilog using multiple expressions. This does not
5781 work for condition reductions. */
5782 if (epilog_reduc_code == ERROR_MARK
5783 && STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5784 == INTEGER_INDUC_COND_REDUCTION)
5786 if (dump_enabled_p ())
5787 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5788 "no reduc code for scalar code.\n");
5789 return false;
5792 else
5794 if (!nested_cycle || double_reduc)
5796 if (dump_enabled_p ())
5797 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5798 "no reduc code for scalar code.\n");
5800 return false;
5804 else
5806 int scalar_precision = GET_MODE_PRECISION (TYPE_MODE (scalar_type));
5807 cr_index_scalar_type = make_unsigned_type (scalar_precision);
5808 cr_index_vector_type = build_vector_type
5809 (cr_index_scalar_type, TYPE_VECTOR_SUBPARTS (vectype_out));
5811 epilog_reduc_code = REDUC_MAX_EXPR;
5812 optab = optab_for_tree_code (REDUC_MAX_EXPR, cr_index_vector_type,
5813 optab_default);
5814 if (optab_handler (optab, TYPE_MODE (cr_index_vector_type))
5815 == CODE_FOR_nothing)
5817 if (dump_enabled_p ())
5818 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5819 "reduc max op not supported by target.\n");
5820 return false;
5824 if ((double_reduc
5825 || STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION
5826 || STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5827 == INTEGER_INDUC_COND_REDUCTION)
5828 && ncopies > 1)
5830 if (dump_enabled_p ())
5831 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5832 "multiple types in double reduction or condition "
5833 "reduction.\n");
5834 return false;
5837 /* In case of widening multiplication by a constant, we update the type
5838 of the constant to be the type of the other operand. We check that the
5839 constant fits the type in the pattern recognition pass. */
5840 if (code == DOT_PROD_EXPR
5841 && !types_compatible_p (TREE_TYPE (ops[0]), TREE_TYPE (ops[1])))
5843 if (TREE_CODE (ops[0]) == INTEGER_CST)
5844 ops[0] = fold_convert (TREE_TYPE (ops[1]), ops[0]);
5845 else if (TREE_CODE (ops[1]) == INTEGER_CST)
5846 ops[1] = fold_convert (TREE_TYPE (ops[0]), ops[1]);
5847 else
5849 if (dump_enabled_p ())
5850 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5851 "invalid types in dot-prod\n");
5853 return false;
5857 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION)
5859 widest_int ni;
5861 if (! max_loop_iterations (loop, &ni))
5863 if (dump_enabled_p ())
5864 dump_printf_loc (MSG_NOTE, vect_location,
5865 "loop count not known, cannot create cond "
5866 "reduction.\n");
5867 return false;
5869 /* Convert backedges to iterations. */
5870 ni += 1;
5872 /* The additional index will be the same type as the condition. Check
5873 that the loop iteration count fits into this type less one (we use up
5874 the zero slot for when there are no matches). */
5875 tree max_index = TYPE_MAX_VALUE (cr_index_scalar_type);
5876 if (wi::geu_p (ni, wi::to_widest (max_index)))
5878 if (dump_enabled_p ())
5879 dump_printf_loc (MSG_NOTE, vect_location,
5880 "loop size is greater than data size.\n");
5881 return false;
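/* Worked example (illustrative): for 16-bit scalar data the induction
   index type created above is a 16-bit unsigned type, so MAX_INDEX is
   65535.  Since index 0 is reserved for "no match", a loop that may run
   65535 or more iterations is rejected here, while one bounded by 65534
   iterations passes the check.  */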
5885 if (!vec_stmt) /* transformation not required. */
5887 if (first_p
5888 && !vect_model_reduction_cost (stmt_info, epilog_reduc_code, ncopies,
5889 reduc_index))
5890 return false;
5891 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
5892 return true;
5895 /** Transform. **/
5897 if (dump_enabled_p ())
5898 dump_printf_loc (MSG_NOTE, vect_location, "transform reduction.\n");
5900 /* FORNOW: Multiple types are not supported for condition. */
5901 if (code == COND_EXPR)
5902 gcc_assert (ncopies == 1);
5904 /* Create the destination vector */
5905 vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
5907 /* In case the vectorization factor (VF) is bigger than the number
5908 of elements that we can fit in a vectype (nunits), we have to generate
5909 more than one vector stmt - i.e. we need to "unroll" the
5910 vector stmt by a factor VF/nunits. For more details see documentation
5911 in vectorizable_operation. */
5913 /* If the reduction is used in an outer loop we need to generate
5914 VF intermediate results, like so (e.g. for ncopies=2):
5915 r0 = phi (init, r0)
5916 r1 = phi (init, r1)
5917 r0 = x0 + r0;
5918 r1 = x1 + r1;
5919 (i.e. we generate VF results in 2 registers).
5920 In this case we have a separate def-use cycle for each copy, and therefore
5921 for each copy we get the vector def for the reduction variable from the
5922 respective phi node created for this copy.
5924 Otherwise (the reduction is unused in the loop nest), we can combine
5925 together intermediate results, like so (e.g. for ncopies=2):
5926 r = phi (init, r)
5927 r = x0 + r;
5928 r = x1 + r;
5929 (i.e. we generate VF/2 results in a single register).
5930 In this case for each copy we get the vector def for the reduction variable
5931 from the vectorized reduction operation generated in the previous iteration.
5934 if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope)
5936 single_defuse_cycle = true;
5937 epilog_copies = 1;
5939 else
5940 epilog_copies = ncopies;
5942 prev_stmt_info = NULL;
5943 prev_phi_info = NULL;
5944 if (slp_node)
5945 vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
5946 else
5948 vec_num = 1;
5949 vec_oprnds0.create (1);
5950 if (op_type == ternary_op)
5951 vec_oprnds1.create (1);
5954 phis.create (vec_num);
5955 vect_defs.create (vec_num);
5956 if (!slp_node)
5957 vect_defs.quick_push (NULL_TREE);
5959 for (j = 0; j < ncopies; j++)
5961 if (j == 0 || !single_defuse_cycle)
5963 for (i = 0; i < vec_num; i++)
5965 /* Create the reduction-phi that defines the reduction
5966 operand. */
5967 new_phi = create_phi_node (vec_dest, loop->header);
5968 set_vinfo_for_stmt (new_phi,
5969 new_stmt_vec_info (new_phi, loop_vinfo));
5970 if (j == 0 || slp_node)
5971 phis.quick_push (new_phi);
5975 if (code == COND_EXPR)
5977 gcc_assert (!slp_node);
5978 vectorizable_condition (stmt, gsi, vec_stmt,
5979 PHI_RESULT (phis[0]),
5980 reduc_index, NULL);
5981 /* Multiple types are not supported for condition. */
5982 break;
5985 /* Handle uses. */
5986 if (j == 0)
5988 op0 = ops[!reduc_index];
5989 if (op_type == ternary_op)
5991 if (reduc_index == 0)
5992 op1 = ops[2];
5993 else
5994 op1 = ops[1];
5997 if (slp_node)
5998 vect_get_vec_defs (op0, op1, stmt, &vec_oprnds0, &vec_oprnds1,
5999 slp_node, -1);
6000 else
6002 loop_vec_def0 = vect_get_vec_def_for_operand (ops[!reduc_index],
6003 stmt);
6004 vec_oprnds0.quick_push (loop_vec_def0);
6005 if (op_type == ternary_op)
6007 loop_vec_def1 = vect_get_vec_def_for_operand (op1, stmt);
6008 vec_oprnds1.quick_push (loop_vec_def1);
6012 else
6014 if (!slp_node)
6016 enum vect_def_type dt;
6017 gimple *dummy_stmt;
6019 vect_is_simple_use (ops[!reduc_index], loop_vinfo,
6020 &dummy_stmt, &dt);
6021 loop_vec_def0 = vect_get_vec_def_for_stmt_copy (dt,
6022 loop_vec_def0);
6023 vec_oprnds0[0] = loop_vec_def0;
6024 if (op_type == ternary_op)
6026 vect_is_simple_use (op1, loop_vinfo, &dummy_stmt, &dt);
6027 loop_vec_def1 = vect_get_vec_def_for_stmt_copy (dt,
6028 loop_vec_def1);
6029 vec_oprnds1[0] = loop_vec_def1;
6033 if (single_defuse_cycle)
6034 reduc_def = gimple_assign_lhs (new_stmt);
6036 STMT_VINFO_RELATED_STMT (prev_phi_info) = new_phi;
6039 FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
6041 if (slp_node)
6042 reduc_def = PHI_RESULT (phis[i]);
6043 else
6045 if (!single_defuse_cycle || j == 0)
6046 reduc_def = PHI_RESULT (new_phi);
6049 def1 = ((op_type == ternary_op)
6050 ? vec_oprnds1[i] : NULL);
6051 if (op_type == binary_op)
6053 if (reduc_index == 0)
6054 expr = build2 (code, vectype_out, reduc_def, def0);
6055 else
6056 expr = build2 (code, vectype_out, def0, reduc_def);
6058 else
6060 if (reduc_index == 0)
6061 expr = build3 (code, vectype_out, reduc_def, def0, def1);
6062 else
6064 if (reduc_index == 1)
6065 expr = build3 (code, vectype_out, def0, reduc_def, def1);
6066 else
6067 expr = build3 (code, vectype_out, def0, def1, reduc_def);
6071 new_stmt = gimple_build_assign (vec_dest, expr);
6072 new_temp = make_ssa_name (vec_dest, new_stmt);
6073 gimple_assign_set_lhs (new_stmt, new_temp);
6074 vect_finish_stmt_generation (stmt, new_stmt, gsi);
6076 if (slp_node)
6078 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
6079 vect_defs.quick_push (new_temp);
6081 else
6082 vect_defs[0] = new_temp;
6085 if (slp_node)
6086 continue;
6088 if (j == 0)
6089 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
6090 else
6091 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
6093 prev_stmt_info = vinfo_for_stmt (new_stmt);
6094 prev_phi_info = vinfo_for_stmt (new_phi);
6097 tree indx_before_incr, indx_after_incr, cond_name = NULL;
6099 /* Finalize the reduction-phi (set its arguments) and create the
6100 epilog reduction code. */
6101 if ((!single_defuse_cycle || code == COND_EXPR) && !slp_node)
6103 new_temp = gimple_assign_lhs (*vec_stmt);
6104 vect_defs[0] = new_temp;
6106 /* For cond reductions we want to create a new vector (INDEX_COND_EXPR)
6107 which is updated with the current index of the loop for every match of
6108 the original loop's cond_expr (VEC_STMT). This results in a vector
6109 containing the last time the condition passed for that vector lane.
6110 The first match will be a 1 to allow 0 to be used for non-matching
6111 indexes. If there are no matches at all then the vector will be all
6112 zeroes. */
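/* Worked example (illustrative, 4 lanes): the induction variable starts
   at {1,2,3,4} and is bumped by 4 each vector iteration.  If the
   condition holds in lanes 0 and 2 of the first vector iteration and in
   lane 1 of the second, the phi evolves
   {0,0,0,0} -> {1,0,3,0} -> {1,6,3,0}; the epilog REDUC_MAX then yields
   6, the index of the last match, while an all-zero vector means no
   iteration matched.  */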
6113 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION)
6115 int nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
6116 int k;
6118 gcc_assert (gimple_assign_rhs_code (*vec_stmt) == VEC_COND_EXPR);
6120 /* First we create a simple vector induction variable which starts
6121 with the values {1,2,3,...} (SERIES_VECT) and increments by the
6122 vector size (STEP). */
6124 /* Create a {1,2,3,...} vector. */
6125 tree *vtemp = XALLOCAVEC (tree, nunits_out);
6126 for (k = 0; k < nunits_out; ++k)
6127 vtemp[k] = build_int_cst (cr_index_scalar_type, k + 1);
6128 tree series_vect = build_vector (cr_index_vector_type, vtemp);
6130 /* Create a vector of the step value. */
6131 tree step = build_int_cst (cr_index_scalar_type, nunits_out);
6132 tree vec_step = build_vector_from_val (cr_index_vector_type, step);
6134 /* Create an induction variable. */
6135 gimple_stmt_iterator incr_gsi;
6136 bool insert_after;
6137 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
6138 create_iv (series_vect, vec_step, NULL_TREE, loop, &incr_gsi,
6139 insert_after, &indx_before_incr, &indx_after_incr);
6141 /* Next create a new phi node vector (NEW_PHI_TREE) which starts
6142 filled with zeros (VEC_ZERO). */
6144 /* Create a vector of 0s. */
6145 tree zero = build_zero_cst (cr_index_scalar_type);
6146 tree vec_zero = build_vector_from_val (cr_index_vector_type, zero);
6148 /* Create a vector phi node. */
6149 tree new_phi_tree = make_ssa_name (cr_index_vector_type);
6150 new_phi = create_phi_node (new_phi_tree, loop->header);
6151 set_vinfo_for_stmt (new_phi,
6152 new_stmt_vec_info (new_phi, loop_vinfo));
6153 add_phi_arg (new_phi, vec_zero, loop_preheader_edge (loop),
6154 UNKNOWN_LOCATION);
6156 /* Now take the condition from the loop's original cond_expr
6157 (VEC_STMT) and produce a new cond_expr (INDEX_COND_EXPR) which for
6158 every match uses values from the induction variable
6159 (INDEX_BEFORE_INCR) otherwise uses values from the phi node
6160 (NEW_PHI_TREE).
6161 Finally, we update the phi (NEW_PHI_TREE) to take the value of
6162 the new cond_expr (INDEX_COND_EXPR). */
6164 /* Duplicate the condition from vec_stmt. */
6165 tree ccompare = unshare_expr (gimple_assign_rhs1 (*vec_stmt));
6167 /* Create a conditional, where the condition is taken from vec_stmt
6168 (CCOMPARE), whose then-value is the induction index (INDEX_BEFORE_INCR)
6169 and whose else-value is the phi (NEW_PHI_TREE).
6170 tree index_cond_expr = build3 (VEC_COND_EXPR, cr_index_vector_type,
6171 ccompare, indx_before_incr,
6172 new_phi_tree);
6173 cond_name = make_ssa_name (cr_index_vector_type);
6174 gimple *index_condition = gimple_build_assign (cond_name,
6175 index_cond_expr);
6176 gsi_insert_before (&incr_gsi, index_condition, GSI_SAME_STMT);
6177 stmt_vec_info index_vec_info = new_stmt_vec_info (index_condition,
6178 loop_vinfo);
6179 STMT_VINFO_VECTYPE (index_vec_info) = cr_index_vector_type;
6180 set_vinfo_for_stmt (index_condition, index_vec_info);
6182 /* Update the phi with the vec cond. */
6183 add_phi_arg (new_phi, cond_name, loop_latch_edge (loop),
6184 UNKNOWN_LOCATION);
6188 vect_create_epilog_for_reduction (vect_defs, stmt, epilog_copies,
6189 epilog_reduc_code, phis, reduc_index,
6190 double_reduc, slp_node, cond_name);
6192 return true;
6195 /* Function vect_min_worthwhile_factor.
6197 For a loop where we could vectorize the operation indicated by CODE,
6198 return the minimum vectorization factor that makes it worthwhile
6199 to use generic vectors. */
6200 int
6201 vect_min_worthwhile_factor (enum tree_code code)
6203 switch (code)
6205 case PLUS_EXPR:
6206 case MINUS_EXPR:
6207 case NEGATE_EXPR:
6208 return 4;
6210 case BIT_AND_EXPR:
6211 case BIT_IOR_EXPR:
6212 case BIT_XOR_EXPR:
6213 case BIT_NOT_EXPR:
6214 return 2;
6216 default:
6217 return INT_MAX;
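/* Usage sketch (illustrative): in vectorizable_reduction above, an
   operation that is not directly supported by the target may still be
   vectorized in word mode, but only if
   LOOP_VINFO_VECT_FACTOR >= vect_min_worthwhile_factor (code),
   e.g. at least 4 for a PLUS_EXPR reduction and at least 2 for a
   BIT_AND_EXPR one.  */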
6222 /* Function vectorizable_induction
6224 Check if PHI performs an induction computation that can be vectorized.
6225 If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
6226 phi to replace it, put it in VEC_STMT, and add it to the same basic block.
6227 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
6229 bool
6230 vectorizable_induction (gimple *phi,
6231 gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
6232 gimple **vec_stmt)
6234 stmt_vec_info stmt_info = vinfo_for_stmt (phi);
6235 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6236 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
6237 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6238 int nunits = TYPE_VECTOR_SUBPARTS (vectype);
6239 int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
6240 tree vec_def;
6242 gcc_assert (ncopies >= 1);
6243 /* FORNOW. These restrictions should be relaxed. */
6244 if (nested_in_vect_loop_p (loop, phi))
6246 imm_use_iterator imm_iter;
6247 use_operand_p use_p;
6248 gimple *exit_phi;
6249 edge latch_e;
6250 tree loop_arg;
6252 if (ncopies > 1)
6254 if (dump_enabled_p ())
6255 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6256 "multiple types in nested loop.\n");
6257 return false;
6260 exit_phi = NULL;
6261 latch_e = loop_latch_edge (loop->inner);
6262 loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
6263 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
6265 gimple *use_stmt = USE_STMT (use_p);
6266 if (is_gimple_debug (use_stmt))
6267 continue;
6269 if (!flow_bb_inside_loop_p (loop->inner, gimple_bb (use_stmt)))
6271 exit_phi = use_stmt;
6272 break;
6275 if (exit_phi)
6277 stmt_vec_info exit_phi_vinfo = vinfo_for_stmt (exit_phi);
6278 if (!(STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
6279 && !STMT_VINFO_LIVE_P (exit_phi_vinfo)))
6281 if (dump_enabled_p ())
6282 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6283 "inner-loop induction only used outside "
6284 "of the outer vectorized loop.\n");
6285 return false;
6290 if (!STMT_VINFO_RELEVANT_P (stmt_info))
6291 return false;
6293 /* FORNOW: SLP not supported. */
6294 if (STMT_SLP_TYPE (stmt_info))
6295 return false;
6297 gcc_assert (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def);
6299 if (gimple_code (phi) != GIMPLE_PHI)
6300 return false;
6302 if (!vec_stmt) /* transformation not required. */
6304 STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
6305 if (dump_enabled_p ())
6306 dump_printf_loc (MSG_NOTE, vect_location,
6307 "=== vectorizable_induction ===\n");
6308 vect_model_induction_cost (stmt_info, ncopies);
6309 return true;
6312 /** Transform. **/
6314 if (dump_enabled_p ())
6315 dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n");
6317 vec_def = get_initial_def_for_induction (phi);
6318 *vec_stmt = SSA_NAME_DEF_STMT (vec_def);
6319 return true;
6322 /* Function vectorizable_live_operation.
6324 STMT computes a value that is used outside the loop. Check if
6325 it can be supported. */
6327 bool
6328 vectorizable_live_operation (gimple *stmt,
6329 gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
6330 gimple **vec_stmt)
6332 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
6333 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
6334 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6335 tree op;
6336 gimple *def_stmt;
6337 ssa_op_iter iter;
6339 gcc_assert (STMT_VINFO_LIVE_P (stmt_info));
6341 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def)
6342 return false;
6344 if (!is_gimple_assign (stmt))
6346 if (gimple_call_internal_p (stmt)
6347 && gimple_call_internal_fn (stmt) == IFN_GOMP_SIMD_LANE
6348 && gimple_call_lhs (stmt)
6349 && loop->simduid
6350 && TREE_CODE (gimple_call_arg (stmt, 0)) == SSA_NAME
6351 && loop->simduid
6352 == SSA_NAME_VAR (gimple_call_arg (stmt, 0)))
6354 edge e = single_exit (loop);
6355 basic_block merge_bb = e->dest;
6356 imm_use_iterator imm_iter;
6357 use_operand_p use_p;
6358 tree lhs = gimple_call_lhs (stmt);
6360 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
6362 gimple *use_stmt = USE_STMT (use_p);
6363 if (gimple_code (use_stmt) == GIMPLE_PHI
6364 && gimple_bb (use_stmt) == merge_bb)
6366 if (vec_stmt)
6368 tree vfm1
6369 = build_int_cst (unsigned_type_node,
6370 loop_vinfo->vectorization_factor - 1);
6371 SET_PHI_ARG_DEF (use_stmt, e->dest_idx, vfm1);
6373 return true;
6378 return false;
6381 if (TREE_CODE (gimple_assign_lhs (stmt)) != SSA_NAME)
6382 return false;
6384 /* FORNOW. CHECKME. */
6385 if (nested_in_vect_loop_p (loop, stmt))
6386 return false;
6388 /* FORNOW: support only if all uses are invariant. This means
6389 that the scalar operations can remain in place, unvectorized.
6390 The original last scalar value that they compute will be used. */
6391 FOR_EACH_SSA_TREE_OPERAND (op, stmt, iter, SSA_OP_USE)
6393 enum vect_def_type dt = vect_uninitialized_def;
6395 if (!vect_is_simple_use (op, loop_vinfo, &def_stmt, &dt))
6397 if (dump_enabled_p ())
6398 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6399 "use not simple.\n");
6400 return false;
6403 if (dt != vect_external_def && dt != vect_constant_def)
6404 return false;
6407 /* No transformation is required for the cases we currently support. */
6408 return true;
6411 /* Kill any debug uses outside LOOP of SSA names defined in STMT. */
6413 static void
6414 vect_loop_kill_debug_uses (struct loop *loop, gimple *stmt)
6416 ssa_op_iter op_iter;
6417 imm_use_iterator imm_iter;
6418 def_operand_p def_p;
6419 gimple *ustmt;
6421 FOR_EACH_PHI_OR_STMT_DEF (def_p, stmt, op_iter, SSA_OP_DEF)
6423 FOR_EACH_IMM_USE_STMT (ustmt, imm_iter, DEF_FROM_PTR (def_p))
6425 basic_block bb;
6427 if (!is_gimple_debug (ustmt))
6428 continue;
6430 bb = gimple_bb (ustmt);
6432 if (!flow_bb_inside_loop_p (loop, bb))
6434 if (gimple_debug_bind_p (ustmt))
6436 if (dump_enabled_p ())
6437 dump_printf_loc (MSG_NOTE, vect_location,
6438 "killing debug use\n");
6440 gimple_debug_bind_reset_value (ustmt);
6441 update_stmt (ustmt);
6443 else
6444 gcc_unreachable ();
6451 /* This function builds ni_name = number of iterations. Statements
6452 are emitted on the loop preheader edge. */
6454 static tree
6455 vect_build_loop_niters (loop_vec_info loop_vinfo)
6457 tree ni = unshare_expr (LOOP_VINFO_NITERS (loop_vinfo));
6458 if (TREE_CODE (ni) == INTEGER_CST)
6459 return ni;
6460 else
6462 tree ni_name, var;
6463 gimple_seq stmts = NULL;
6464 edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
6466 var = create_tmp_var (TREE_TYPE (ni), "niters");
6467 ni_name = force_gimple_operand (ni, &stmts, false, var);
6468 if (stmts)
6469 gsi_insert_seq_on_edge_immediate (pe, stmts);
6471 return ni_name;
6476 /* This function generates the following statements:
6478 ni_name = number of iterations loop executes
6479 ratio = ni_name / vf
6480 ratio_mult_vf_name = ratio * vf
6482 and places them on the loop preheader edge. */
6484 static void
6485 vect_generate_tmps_on_preheader (loop_vec_info loop_vinfo,
6486 tree ni_name,
6487 tree *ratio_mult_vf_name_ptr,
6488 tree *ratio_name_ptr)
6490 tree ni_minus_gap_name;
6491 tree var;
6492 tree ratio_name;
6493 tree ratio_mult_vf_name;
6494 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
6495 edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
6496 tree log_vf;
6498 log_vf = build_int_cst (TREE_TYPE (ni_name), exact_log2 (vf));
6500 /* If an epilogue loop is required because of data accesses with gaps, we
6501 subtract one iteration from the total number of iterations here for
6502 correct calculation of RATIO. */
6503 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
6505 ni_minus_gap_name = fold_build2 (MINUS_EXPR, TREE_TYPE (ni_name),
6506 ni_name,
6507 build_one_cst (TREE_TYPE (ni_name)));
6508 if (!is_gimple_val (ni_minus_gap_name))
6510 var = create_tmp_var (TREE_TYPE (ni_name), "ni_gap");
6511 gimple *stmts = NULL;
6512 ni_minus_gap_name = force_gimple_operand (ni_minus_gap_name, &stmts,
6513 true, var);
6514 gsi_insert_seq_on_edge_immediate (pe, stmts);
6517 else
6518 ni_minus_gap_name = ni_name;
6520 /* Create: ratio = ni >> log2(vf) */
6521 /* ??? As we have ni == number of latch executions + 1, ni could
6522 have overflowed to zero. So avoid computing ratio based on ni
6523 but compute it using the fact that we know ratio will be at least
6524 one, thus via (ni - vf) >> log2(vf) + 1. */
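/* Worked example (illustrative): with vf = 8 and ni = 1000 the expression
   below computes ((1000 - 8) >> 3) + 1 = 124 + 1 = 125, i.e. 1000 / 8.
   It also survives the wrap-around mentioned above: if an 8-bit niter
   type makes ni wrap to 0 for a loop of 256 iterations, the naive
   0 >> 3 would give 0, whereas
   (((0 - 8) & 0xff) >> 3) + 1 = 31 + 1 = 32 = 256 / 8.  */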
6525 ratio_name
6526 = fold_build2 (PLUS_EXPR, TREE_TYPE (ni_name),
6527 fold_build2 (RSHIFT_EXPR, TREE_TYPE (ni_name),
6528 fold_build2 (MINUS_EXPR, TREE_TYPE (ni_name),
6529 ni_minus_gap_name,
6530 build_int_cst
6531 (TREE_TYPE (ni_name), vf)),
6532 log_vf),
6533 build_int_cst (TREE_TYPE (ni_name), 1));
6534 if (!is_gimple_val (ratio_name))
6536 var = create_tmp_var (TREE_TYPE (ni_name), "bnd");
6537 gimple *stmts = NULL;
6538 ratio_name = force_gimple_operand (ratio_name, &stmts, true, var);
6539 gsi_insert_seq_on_edge_immediate (pe, stmts);
6541 *ratio_name_ptr = ratio_name;
6543 /* Create: ratio_mult_vf = ratio << log2 (vf). */
6545 if (ratio_mult_vf_name_ptr)
6547 ratio_mult_vf_name = fold_build2 (LSHIFT_EXPR, TREE_TYPE (ratio_name),
6548 ratio_name, log_vf);
6549 if (!is_gimple_val (ratio_mult_vf_name))
6551 var = create_tmp_var (TREE_TYPE (ni_name), "ratio_mult_vf");
6552 gimple *stmts = NULL;
6553 ratio_mult_vf_name = force_gimple_operand (ratio_mult_vf_name, &stmts,
6554 true, var);
6555 gsi_insert_seq_on_edge_immediate (pe, stmts);
6557 *ratio_mult_vf_name_ptr = ratio_mult_vf_name;
6560 return;
6564 /* Function vect_transform_loop.
6566 The analysis phase has determined that the loop is vectorizable.
6567 Vectorize the loop - created vectorized stmts to replace the scalar
6568 stmts in the loop, and update the loop exit condition. */
6570 void
6571 vect_transform_loop (loop_vec_info loop_vinfo)
6573 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6574 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
6575 int nbbs = loop->num_nodes;
6576 int i;
6577 tree ratio = NULL;
6578 int vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
6579 bool grouped_store;
6580 bool slp_scheduled = false;
6581 gimple *stmt, *pattern_stmt;
6582 gimple_seq pattern_def_seq = NULL;
6583 gimple_stmt_iterator pattern_def_si = gsi_none ();
6584 bool transform_pattern_stmt = false;
6585 bool check_profitability = false;
6586 int th;
6587 /* Record number of iterations before we started tampering with the profile. */
6588 gcov_type expected_iterations = expected_loop_iterations_unbounded (loop);
6590 if (dump_enabled_p ())
6591 dump_printf_loc (MSG_NOTE, vect_location, "=== vec_transform_loop ===\n");
6593 /* If the profile is imprecise, we have a chance to fix it up. */
6594 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
6595 expected_iterations = LOOP_VINFO_INT_NITERS (loop_vinfo);
6597 /* Use the more conservative vectorization threshold. If the number
6598 of iterations is constant assume the cost check has been performed
6599 by our caller. If the threshold makes all loops profitable that
6600 run at least the vectorization factor number of times checking
6601 is pointless, too. */
6602 th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
6603 if (th >= LOOP_VINFO_VECT_FACTOR (loop_vinfo) - 1
6604 && !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
6606 if (dump_enabled_p ())
6607 dump_printf_loc (MSG_NOTE, vect_location,
6608 "Profitability threshold is %d loop iterations.\n",
6609 th);
6610 check_profitability = true;
6613 /* Version the loop first, if required, so the profitability check
6614 comes first. */
6616 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
6617 || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
6619 vect_loop_versioning (loop_vinfo, th, check_profitability);
6620 check_profitability = false;
6623 tree ni_name = vect_build_loop_niters (loop_vinfo);
6624 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = ni_name;
6626 /* Peel the loop if there are data refs with unknown alignment.
6627 Only one data ref with unknown store is allowed. */
6629 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo))
6631 vect_do_peeling_for_alignment (loop_vinfo, ni_name,
6632 th, check_profitability);
6633 check_profitability = false;
6634 /* The above adjusts LOOP_VINFO_NITERS, so cause ni_name to
6635 be re-computed. */
6636 ni_name = NULL_TREE;
6639 /* If the loop has a symbolic number of iterations 'n' (i.e. it's not a
6640 compile time constant), or it is a constant that is not divisible by the
6641 vectorization factor, then an epilog loop needs to be created.
6642 We therefore duplicate the loop: the original loop will be vectorized,
6643 and will compute the first (n/VF) iterations. The second copy of the loop
6644 will remain scalar and will compute the remaining (n%VF) iterations.
6645 (VF is the vectorization factor). */
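/* Worked example (illustrative): for n = 103 and VF = 8 the vectorized
   loop executes floor (103 / 8) = 12 vector iterations covering 96
   scalar iterations, and the scalar epilog loop executes the remaining
   103 % 8 = 7 iterations.  */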
6647 if (LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
6648 || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
6650 tree ratio_mult_vf;
6651 if (!ni_name)
6652 ni_name = vect_build_loop_niters (loop_vinfo);
6653 vect_generate_tmps_on_preheader (loop_vinfo, ni_name, &ratio_mult_vf,
6654 &ratio);
6655 vect_do_peeling_for_loop_bound (loop_vinfo, ni_name, ratio_mult_vf,
6656 th, check_profitability);
6658 else if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
6659 ratio = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
6660 LOOP_VINFO_INT_NITERS (loop_vinfo) / vectorization_factor);
6661 else
6663 if (!ni_name)
6664 ni_name = vect_build_loop_niters (loop_vinfo);
6665 vect_generate_tmps_on_preheader (loop_vinfo, ni_name, NULL, &ratio);
6668 /* 1) Make sure the loop header has exactly two entries
6669 2) Make sure we have a preheader basic block. */
6671 gcc_assert (EDGE_COUNT (loop->header->preds) == 2);
6673 split_edge (loop_preheader_edge (loop));
6675 /* FORNOW: the vectorizer supports only loops whose body consists
6676 of one basic block (header + empty latch). When the vectorizer
6677 supports more involved loop forms, the order in which the BBs are
6678 traversed will need to be reconsidered. */
6680 for (i = 0; i < nbbs; i++)
6682 basic_block bb = bbs[i];
6683 stmt_vec_info stmt_info;
6685 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
6686 gsi_next (&si))
6688 gphi *phi = si.phi ();
6689 if (dump_enabled_p ())
6691 dump_printf_loc (MSG_NOTE, vect_location,
6692 "------>vectorizing phi: ");
6693 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
6694 dump_printf (MSG_NOTE, "\n");
6696 stmt_info = vinfo_for_stmt (phi);
6697 if (!stmt_info)
6698 continue;
6700 if (MAY_HAVE_DEBUG_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
6701 vect_loop_kill_debug_uses (loop, phi);
6703 if (!STMT_VINFO_RELEVANT_P (stmt_info)
6704 && !STMT_VINFO_LIVE_P (stmt_info))
6705 continue;
6707 if (STMT_VINFO_VECTYPE (stmt_info)
6708 && (TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info))
6709 != (unsigned HOST_WIDE_INT) vectorization_factor)
6710 && dump_enabled_p ())
6711 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
6713 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def)
6715 if (dump_enabled_p ())
6716 dump_printf_loc (MSG_NOTE, vect_location, "transform phi.\n");
6717 vect_transform_stmt (phi, NULL, NULL, NULL, NULL);
6721 pattern_stmt = NULL;
6722 for (gimple_stmt_iterator si = gsi_start_bb (bb);
6723 !gsi_end_p (si) || transform_pattern_stmt;)
6725 bool is_store;
6727 if (transform_pattern_stmt)
6728 stmt = pattern_stmt;
6729 else
6731 stmt = gsi_stmt (si);
6732 /* During vectorization remove existing clobber stmts. */
6733 if (gimple_clobber_p (stmt))
6735 unlink_stmt_vdef (stmt);
6736 gsi_remove (&si, true);
6737 release_defs (stmt);
6738 continue;
6742 if (dump_enabled_p ())
6744 dump_printf_loc (MSG_NOTE, vect_location,
6745 "------>vectorizing statement: ");
6746 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
6747 dump_printf (MSG_NOTE, "\n");
6750 stmt_info = vinfo_for_stmt (stmt);
6752 /* vector stmts created in the outer-loop during vectorization of
6753 stmts in an inner-loop may not have a stmt_info, and do not
6754 need to be vectorized. */
6755 if (!stmt_info)
6757 gsi_next (&si);
6758 continue;
6761 if (MAY_HAVE_DEBUG_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
6762 vect_loop_kill_debug_uses (loop, stmt);
6764 if (!STMT_VINFO_RELEVANT_P (stmt_info)
6765 && !STMT_VINFO_LIVE_P (stmt_info))
6767 if (STMT_VINFO_IN_PATTERN_P (stmt_info)
6768 && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
6769 && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
6770 || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
6772 stmt = pattern_stmt;
6773 stmt_info = vinfo_for_stmt (stmt);
6775 else
6777 gsi_next (&si);
6778 continue;
6781 else if (STMT_VINFO_IN_PATTERN_P (stmt_info)
6782 && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
6783 && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
6784 || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
6785 transform_pattern_stmt = true;
6787 /* If pattern statement has def stmts, vectorize them too. */
6788 if (is_pattern_stmt_p (stmt_info))
6790 if (pattern_def_seq == NULL)
6792 pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
6793 pattern_def_si = gsi_start (pattern_def_seq);
6795 else if (!gsi_end_p (pattern_def_si))
6796 gsi_next (&pattern_def_si);
6797 if (pattern_def_seq != NULL)
6799 gimple *pattern_def_stmt = NULL;
6800 stmt_vec_info pattern_def_stmt_info = NULL;
6802 while (!gsi_end_p (pattern_def_si))
6804 pattern_def_stmt = gsi_stmt (pattern_def_si);
6805 pattern_def_stmt_info
6806 = vinfo_for_stmt (pattern_def_stmt);
6807 if (STMT_VINFO_RELEVANT_P (pattern_def_stmt_info)
6808 || STMT_VINFO_LIVE_P (pattern_def_stmt_info))
6809 break;
6810 gsi_next (&pattern_def_si);
6813 if (!gsi_end_p (pattern_def_si))
6815 if (dump_enabled_p ())
6817 dump_printf_loc (MSG_NOTE, vect_location,
6818 "==> vectorizing pattern def "
6819 "stmt: ");
6820 dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
6821 pattern_def_stmt, 0);
6822 dump_printf (MSG_NOTE, "\n");
6825 stmt = pattern_def_stmt;
6826 stmt_info = pattern_def_stmt_info;
6828 else
6830 pattern_def_si = gsi_none ();
6831 transform_pattern_stmt = false;
6834 else
6835 transform_pattern_stmt = false;
6838 if (STMT_VINFO_VECTYPE (stmt_info))
6840 unsigned int nunits
6841 = (unsigned int)
6842 TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
6843 if (!STMT_SLP_TYPE (stmt_info)
6844 && nunits != (unsigned int) vectorization_factor
6845 && dump_enabled_p ())
6846 /* For SLP, VF is set according to the unrolling factor, not
6847 the vector size, hence this message is not valid for SLP. */
6848 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
6851 /* SLP. Schedule all the SLP instances when the first SLP stmt is
6852 reached. */
6853 if (STMT_SLP_TYPE (stmt_info))
6855 if (!slp_scheduled)
6857 slp_scheduled = true;
6859 if (dump_enabled_p ())
6860 dump_printf_loc (MSG_NOTE, vect_location,
6861 "=== scheduling SLP instances ===\n");
6863 vect_schedule_slp (loop_vinfo);
6866 /* Hybrid SLP stmts must be vectorized in addition to SLP. */
6867 if (!vinfo_for_stmt (stmt) || PURE_SLP_STMT (stmt_info))
6869 if (!transform_pattern_stmt && gsi_end_p (pattern_def_si))
6871 pattern_def_seq = NULL;
6872 gsi_next (&si);
6874 continue;
6878 /* -------- vectorize statement ------------ */
6879 if (dump_enabled_p ())
6880 dump_printf_loc (MSG_NOTE, vect_location, "transform statement.\n");
6882 grouped_store = false;
6883 is_store = vect_transform_stmt (stmt, &si, &grouped_store, NULL, NULL);
6884 if (is_store)
6886 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
6888 /* Interleaving. If IS_STORE is TRUE, the vectorization of the
6889 interleaving chain was completed - free all the stores in
6890 the chain. */
6891 gsi_next (&si);
6892 vect_remove_stores (GROUP_FIRST_ELEMENT (stmt_info));
6894 else
6896 /* Free the attached stmt_vec_info and remove the stmt. */
6897 gimple *store = gsi_stmt (si);
6898 free_stmt_vec_info (store);
6899 unlink_stmt_vdef (store);
6900 gsi_remove (&si, true);
6901 release_defs (store);
6904 /* Stores can only appear at the end of pattern statements. */
6905 gcc_assert (!transform_pattern_stmt);
6906 pattern_def_seq = NULL;
6908 else if (!transform_pattern_stmt && gsi_end_p (pattern_def_si))
6910 pattern_def_seq = NULL;
6911 gsi_next (&si);
6913 } /* stmts in BB */
6914 } /* BBs in loop */
6916 slpeel_make_loop_iterate_ntimes (loop, ratio);
6918 /* Reduce loop iterations by the vectorization factor. */
6919 scale_loop_profile (loop, GCOV_COMPUTE_SCALE (1, vectorization_factor),
6920 expected_iterations / vectorization_factor);
6921 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
6922 && loop->nb_iterations_upper_bound != 0)
6923 loop->nb_iterations_upper_bound = loop->nb_iterations_upper_bound - 1;
6924 loop->nb_iterations_upper_bound
6925 = wi::udiv_floor (loop->nb_iterations_upper_bound + 1,
6926 vectorization_factor) - 1;
6928 if (loop->any_estimate)
6930 loop->nb_iterations_estimate
6931 = wi::udiv_floor (loop->nb_iterations_estimate, vectorization_factor);
6932 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
6933 && loop->nb_iterations_estimate != 0)
6934 loop->nb_iterations_estimate = loop->nb_iterations_estimate - 1;
6937 if (dump_enabled_p ())
6939 dump_printf_loc (MSG_NOTE, vect_location,
6940 "LOOP VECTORIZED\n");
6941 if (loop->inner)
6942 dump_printf_loc (MSG_NOTE, vect_location,
6943 "OUTER LOOP VECTORIZED\n");
6944 dump_printf (MSG_NOTE, "\n");
6947 /* Free SLP instances here because otherwise stmt reference counting
6948 won't work. */
6949 slp_instance instance;
6950 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
6951 vect_free_slp_instance (instance);
6952 LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
6955 /* The code below performs a simple optimization: revert if-conversion
6956 for masked stores, i.e. if the mask of a store is zero, skip the store
6957 and, where possible, the statements that produce the stored values.
6958 For example,
6959 for (i=0; i<n; i++)
6960 if (c[i])
6962 p1[i] += 1;
6963 p2[i] = p3[i] +2;
6965 this transformation will produce the following semi-hammock:
6967 if (!mask__ifc__42.18_165 == { 0, 0, 0, 0, 0, 0, 0, 0 })
6969 vect__11.19_170 = MASK_LOAD (vectp_p1.20_168, 0B, mask__ifc__42.18_165);
6970 vect__12.22_172 = vect__11.19_170 + vect_cst__171;
6971 MASK_STORE (vectp_p1.23_175, 0B, mask__ifc__42.18_165, vect__12.22_172);
6972 vect__18.25_182 = MASK_LOAD (vectp_p3.26_180, 0B, mask__ifc__42.18_165);
6973 vect__19.28_184 = vect__18.25_182 + vect_cst__183;
6974 MASK_STORE (vectp_p2.29_187, 0B, mask__ifc__42.18_165, vect__19.28_184);
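/* Sketch of the control flow produced by the code below (a paraphrase of
   what follows, no additional behaviour):

     bb:        if (mask == {0, ..., 0}) goto join_bb; else goto store_bb;
     store_bb:  MASK_STORE (...) plus, where legal, the statements
                producing the stored values, sunk from BB;
     join_bb:   .MEM_2 = PHI <.MEM_1 (bb), .MEM_3 (store_bb)>  */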
6978 void
6979 optimize_mask_stores (struct loop *loop)
6981 basic_block *bbs = get_loop_body (loop);
6982 unsigned nbbs = loop->num_nodes;
6983 unsigned i;
6984 basic_block bb;
6985 gimple_stmt_iterator gsi;
6986 gimple *stmt;
6987 auto_vec<gimple *> worklist;
6989 vect_location = find_loop_location (loop);
6990 /* Pick up all masked stores in loop if any. */
6991 for (i = 0; i < nbbs; i++)
6993 bb = bbs[i];
6994 for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
6995 gsi_next (&gsi))
6997 stmt = gsi_stmt (gsi);
6998 if (is_gimple_call (stmt)
6999 && gimple_call_internal_p (stmt)
7000 && gimple_call_internal_fn (stmt) == IFN_MASK_STORE)
7001 worklist.safe_push (stmt);
7005 free (bbs);
7006 if (worklist.is_empty ())
7007 return;
7009 /* Loop has masked stores. */
7010 while (!worklist.is_empty ())
7012 gimple *last, *last_store;
7013 edge e, efalse;
7014 tree mask;
7015 basic_block store_bb, join_bb;
7016 gimple_stmt_iterator gsi_to;
7017 tree vdef, new_vdef;
7018 gphi *phi;
7019 tree vectype;
7020 tree zero;
7022 last = worklist.pop ();
7023 mask = gimple_call_arg (last, 2);
7024 bb = gimple_bb (last);
7025 /* Create new bb. */
7026 e = split_block (bb, last);
7027 join_bb = e->dest;
7028 store_bb = create_empty_bb (bb);
7029 add_bb_to_loop (store_bb, loop);
7030 e->flags = EDGE_TRUE_VALUE;
7031 efalse = make_edge (bb, store_bb, EDGE_FALSE_VALUE);
7032 /* Put STORE_BB to likely part. */
7033 efalse->probability = PROB_UNLIKELY;
7034 store_bb->frequency = PROB_ALWAYS - EDGE_FREQUENCY (efalse);
7035 make_edge (store_bb, join_bb, EDGE_FALLTHRU);
7036 if (dom_info_available_p (CDI_DOMINATORS))
7037 set_immediate_dominator (CDI_DOMINATORS, store_bb, bb);
7038 if (dump_enabled_p ())
7039 dump_printf_loc (MSG_NOTE, vect_location,
7040 "Create new block %d to sink mask stores.",
7041 store_bb->index);
7042 /* Create vector comparison with boolean result. */
7043 vectype = TREE_TYPE (mask);
7044 zero = build_zero_cst (vectype);
7045 stmt = gimple_build_cond (EQ_EXPR, mask, zero, NULL_TREE, NULL_TREE);
7046 gsi = gsi_last_bb (bb);
7047 gsi_insert_after (&gsi, stmt, GSI_SAME_STMT);
7048 /* Create new PHI node for vdef of the last masked store:
7049 .MEM_2 = VDEF <.MEM_1>
7050 will be converted to
7051 .MEM.3 = VDEF <.MEM_1>
7052 and new PHI node will be created in join bb
7053 .MEM_2 = PHI <.MEM_1, .MEM_3>
7055 vdef = gimple_vdef (last);
7056 new_vdef = make_ssa_name (gimple_vop (cfun), last);
7057 gimple_set_vdef (last, new_vdef);
7058 phi = create_phi_node (vdef, join_bb);
7059 add_phi_arg (phi, new_vdef, EDGE_SUCC (store_bb, 0), UNKNOWN_LOCATION);
7061 /* Put all masked stores with the same mask to STORE_BB if possible. */
7062 while (true)
7064 gimple_stmt_iterator gsi_from;
7065 gimple *stmt1 = NULL;
7067 /* Move masked store to STORE_BB. */
7068 last_store = last;
7069 gsi = gsi_for_stmt (last);
7070 gsi_from = gsi;
7071 /* Shift GSI to the previous stmt for further traversal. */
7072 gsi_prev (&gsi);
7073 gsi_to = gsi_start_bb (store_bb);
7074 gsi_move_before (&gsi_from, &gsi_to);
7075 /* Set up GSI_TO at the start of the now non-empty block. */
7076 gsi_to = gsi_start_bb (store_bb);
7077 if (dump_enabled_p ())
7079 dump_printf_loc (MSG_NOTE, vect_location,
7080 "Move stmt to created bb\n");
7081 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, last, 0);
7083 /* Move all stored value producers if possible. */
7084 while (!gsi_end_p (gsi))
7086 tree lhs;
7087 imm_use_iterator imm_iter;
7088 use_operand_p use_p;
7089 bool res;
7091 /* Skip debug statements. */
7092 if (is_gimple_debug (gsi_stmt (gsi)))
7094 gsi_prev (&gsi);
7095 continue;
7097 stmt1 = gsi_stmt (gsi);
7098 /* Do not consider statements writing to memory or having
7099 volatile operand. */
7100 if (gimple_vdef (stmt1)
7101 || gimple_has_volatile_ops (stmt1))
7102 break;
7103 gsi_from = gsi;
7104 gsi_prev (&gsi);
7105 lhs = gimple_get_lhs (stmt1);
7106 if (!lhs)
7107 break;
7109 /* LHS of vectorized stmt must be SSA_NAME. */
7110 if (TREE_CODE (lhs) != SSA_NAME)
7111 break;
7113 if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
7115 /* Remove dead scalar statement. */
7116 if (has_zero_uses (lhs))
7118 gsi_remove (&gsi_from, true);
7119 continue;
7123 /* Check that LHS does not have uses outside of STORE_BB. */
7124 res = true;
7125 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
7127 gimple *use_stmt;
7128 use_stmt = USE_STMT (use_p);
7129 if (is_gimple_debug (use_stmt))
7130 continue;
7131 if (gimple_bb (use_stmt) != store_bb)
7133 res = false;
7134 break;
7137 if (!res)
7138 break;
7140 if (gimple_vuse (stmt1)
7141 && gimple_vuse (stmt1) != gimple_vuse (last_store))
7142 break;
7144 /* Can move STMT1 to STORE_BB. */
7145 if (dump_enabled_p ())
7147 dump_printf_loc (MSG_NOTE, vect_location,
7148 "Move stmt to created bb\n");
7149 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt1, 0);
7151 gsi_move_before (&gsi_from, &gsi_to);
7152 /* Shift GSI_TO for further insertion. */
7153 gsi_prev (&gsi_to);
7155 /* Put other masked stores with the same mask to STORE_BB. */
7156 if (worklist.is_empty ()
7157 || gimple_call_arg (worklist.last (), 2) != mask
7158 || worklist.last () != stmt1)
7159 break;
7160 last = worklist.pop ();
7162 add_phi_arg (phi, gimple_vuse (last_store), e, UNKNOWN_LOCATION);