2015-11-26 Richard Biener <rguenther@suse.de>
[official-gcc.git] / gcc / tree-vect-loop.c
blob7fb16f592605301d6d074b3ae17f83b3b9c15923
1 /* Loop Vectorization
2 Copyright (C) 2003-2015 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com> and
4 Ira Rosen <irar@il.ibm.com>
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "rtl.h"
28 #include "tree.h"
29 #include "gimple.h"
30 #include "cfghooks.h"
31 #include "tree-pass.h"
32 #include "ssa.h"
33 #include "optabs-tree.h"
34 #include "diagnostic-core.h"
35 #include "fold-const.h"
36 #include "stor-layout.h"
37 #include "cfganal.h"
38 #include "gimplify.h"
39 #include "gimple-iterator.h"
40 #include "gimplify-me.h"
41 #include "tree-ssa-loop-ivopts.h"
42 #include "tree-ssa-loop-manip.h"
43 #include "tree-ssa-loop-niter.h"
44 #include "cfgloop.h"
45 #include "params.h"
46 #include "tree-scalar-evolution.h"
47 #include "tree-vectorizer.h"
48 #include "gimple-fold.h"
49 #include "cgraph.h"
51 /* Loop Vectorization Pass.
53 This pass tries to vectorize loops.
55 For example, the vectorizer transforms the following simple loop:
57 short a[N]; short b[N]; short c[N]; int i;
59 for (i=0; i<N; i++){
60 a[i] = b[i] + c[i];
63 as if it was manually vectorized by rewriting the source code into:
65 typedef int __attribute__((mode(V8HI))) v8hi;
66 short a[N]; short b[N]; short c[N]; int i;
67 v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
68 v8hi va, vb, vc;
70 for (i=0; i<N/8; i++){
71 vb = pb[i];
72 vc = pc[i];
73 va = vb + vc;
74 pa[i] = va;
77 The main entry to this pass is vectorize_loops(), in which
78 the vectorizer applies a set of analyses on a given set of loops,
79 followed by the actual vectorization transformation for the loops that
80 had successfully passed the analysis phase.
81 Throughout this pass we make a distinction between two types of
82 data: scalars (which are represented by SSA_NAMES), and memory references
83 ("data-refs"). These two types of data require different handling both
84 during analysis and transformation. The types of data-refs that the
85 vectorizer currently supports are ARRAY_REFS which base is an array DECL
86 (not a pointer), and INDIRECT_REFS through pointers; both array and pointer
87 accesses are required to have a simple (consecutive) access pattern.
89 Analysis phase:
90 ===============
91 The driver for the analysis phase is vect_analyze_loop().
92 It applies a set of analyses, some of which rely on the scalar evolution
93 analyzer (scev) developed by Sebastian Pop.
95 During the analysis phase the vectorizer records some information
96 per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
97 loop, as well as general information about the loop as a whole, which is
98 recorded in a "loop_vec_info" struct attached to each loop.
100 Transformation phase:
101 =====================
102 The loop transformation phase scans all the stmts in the loop, and
103 creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
104 the loop that needs to be vectorized. It inserts the vector code sequence
105 just before the scalar stmt S, and records a pointer to the vector code
106 in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
107 attached to S). This pointer will be used for the vectorization of following
108 stmts which use the def of stmt S. Stmt S is removed if it writes to memory;
109 otherwise, we rely on dead code elimination for removing it.
111 For example, say stmt S1 was vectorized into stmt VS1:
113 VS1: vb = px[i];
114 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
115 S2: a = b;
117 To vectorize stmt S2, the vectorizer first finds the stmt that defines
118 the operand 'b' (S1), and gets the relevant vector def 'vb' from the
119 vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)). The
120 resulting sequence would be:
122 VS1: vb = px[i];
123 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
124 VS2: va = vb;
125 S2: a = b; STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2
127 Operands that are not SSA_NAMEs, are data-refs that appear in
128 load/store operations (like 'x[i]' in S1), and are handled differently.
130 Target modeling:
131 =================
132 Currently the only target specific information that is used is the
133 size of the vector (in bytes) - "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".
134 Targets that can support different sizes of vectors, for now will need
135 to specify one value for "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD". More
136 flexibility will be added in the future.
138 Since we only vectorize operations which vector form can be
139 expressed using existing tree codes, to verify that an operation is
140 supported, the vectorizer checks the relevant optab at the relevant
141 machine_mode (e.g, optab_handler (add_optab, V8HImode)). If
142 the value found is CODE_FOR_nothing, then there's no target support, and
143 we can't vectorize the stmt.
145 For additional information on this project see:
146 http://gcc.gnu.org/projects/tree-ssa/vectorization.html
149 static void vect_estimate_min_profitable_iters (loop_vec_info, int *, int *);
151 /* Function vect_determine_vectorization_factor
153 Determine the vectorization factor (VF). VF is the number of data elements
154 that are operated upon in parallel in a single iteration of the vectorized
155 loop. For example, when vectorizing a loop that operates on 4byte elements,
156 on a target with vector size (VS) 16byte, the VF is set to 4, since 4
157 elements can fit in a single vector register.
159 We currently support vectorization of loops in which all types operated upon
160 are of the same size. Therefore this function currently sets VF according to
161 the size of the types operated upon, and fails if there are multiple sizes
162 in the loop.
164 VF is also the factor by which the loop iterations are strip-mined, e.g.:
165 original loop:
166 for (i=0; i<N; i++){
167 a[i] = b[i] + c[i];
170 vectorized loop:
171 for (i=0; i<N; i+=VF){
172 a[i:VF] = b[i:VF] + c[i:VF];
176 static bool
177 vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
179 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
180 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
181 unsigned nbbs = loop->num_nodes;
182 unsigned int vectorization_factor = 0;
183 tree scalar_type;
184 gphi *phi;
185 tree vectype;
186 unsigned int nunits;
187 stmt_vec_info stmt_info;
188 unsigned i;
189 HOST_WIDE_INT dummy;
190 gimple *stmt, *pattern_stmt = NULL;
191 gimple_seq pattern_def_seq = NULL;
192 gimple_stmt_iterator pattern_def_si = gsi_none ();
193 bool analyze_pattern_stmt = false;
194 bool bool_result;
195 auto_vec<stmt_vec_info> mask_producers;
197 if (dump_enabled_p ())
198 dump_printf_loc (MSG_NOTE, vect_location,
199 "=== vect_determine_vectorization_factor ===\n");
201 for (i = 0; i < nbbs; i++)
203 basic_block bb = bbs[i];
205 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
206 gsi_next (&si))
208 phi = si.phi ();
209 stmt_info = vinfo_for_stmt (phi);
210 if (dump_enabled_p ())
212 dump_printf_loc (MSG_NOTE, vect_location, "==> examining phi: ");
213 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
214 dump_printf (MSG_NOTE, "\n");
217 gcc_assert (stmt_info);
219 if (STMT_VINFO_RELEVANT_P (stmt_info))
221 gcc_assert (!STMT_VINFO_VECTYPE (stmt_info));
222 scalar_type = TREE_TYPE (PHI_RESULT (phi));
224 if (dump_enabled_p ())
226 dump_printf_loc (MSG_NOTE, vect_location,
227 "get vectype for scalar type: ");
228 dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
229 dump_printf (MSG_NOTE, "\n");
232 vectype = get_vectype_for_scalar_type (scalar_type);
233 if (!vectype)
235 if (dump_enabled_p ())
237 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
238 "not vectorized: unsupported "
239 "data-type ");
240 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
241 scalar_type);
242 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
244 return false;
246 STMT_VINFO_VECTYPE (stmt_info) = vectype;
248 if (dump_enabled_p ())
250 dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
251 dump_generic_expr (MSG_NOTE, TDF_SLIM, vectype);
252 dump_printf (MSG_NOTE, "\n");
255 nunits = TYPE_VECTOR_SUBPARTS (vectype);
256 if (dump_enabled_p ())
257 dump_printf_loc (MSG_NOTE, vect_location, "nunits = %d\n",
258 nunits);
260 if (!vectorization_factor
261 || (nunits > vectorization_factor))
262 vectorization_factor = nunits;
266 for (gimple_stmt_iterator si = gsi_start_bb (bb);
267 !gsi_end_p (si) || analyze_pattern_stmt;)
269 tree vf_vectype;
271 if (analyze_pattern_stmt)
272 stmt = pattern_stmt;
273 else
274 stmt = gsi_stmt (si);
276 stmt_info = vinfo_for_stmt (stmt);
278 if (dump_enabled_p ())
280 dump_printf_loc (MSG_NOTE, vect_location,
281 "==> examining statement: ");
282 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
283 dump_printf (MSG_NOTE, "\n");
286 gcc_assert (stmt_info);
288 /* Skip stmts which do not need to be vectorized. */
289 if ((!STMT_VINFO_RELEVANT_P (stmt_info)
290 && !STMT_VINFO_LIVE_P (stmt_info))
291 || gimple_clobber_p (stmt))
293 if (STMT_VINFO_IN_PATTERN_P (stmt_info)
294 && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
295 && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
296 || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
298 stmt = pattern_stmt;
299 stmt_info = vinfo_for_stmt (pattern_stmt);
300 if (dump_enabled_p ())
302 dump_printf_loc (MSG_NOTE, vect_location,
303 "==> examining pattern statement: ");
304 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
305 dump_printf (MSG_NOTE, "\n");
308 else
310 if (dump_enabled_p ())
311 dump_printf_loc (MSG_NOTE, vect_location, "skip.\n");
312 gsi_next (&si);
313 continue;
316 else if (STMT_VINFO_IN_PATTERN_P (stmt_info)
317 && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
318 && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
319 || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
320 analyze_pattern_stmt = true;
322 /* If a pattern statement has def stmts, analyze them too. */
323 if (is_pattern_stmt_p (stmt_info))
325 if (pattern_def_seq == NULL)
327 pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
328 pattern_def_si = gsi_start (pattern_def_seq);
330 else if (!gsi_end_p (pattern_def_si))
331 gsi_next (&pattern_def_si);
332 if (pattern_def_seq != NULL)
334 gimple *pattern_def_stmt = NULL;
335 stmt_vec_info pattern_def_stmt_info = NULL;
337 while (!gsi_end_p (pattern_def_si))
339 pattern_def_stmt = gsi_stmt (pattern_def_si);
340 pattern_def_stmt_info
341 = vinfo_for_stmt (pattern_def_stmt);
342 if (STMT_VINFO_RELEVANT_P (pattern_def_stmt_info)
343 || STMT_VINFO_LIVE_P (pattern_def_stmt_info))
344 break;
345 gsi_next (&pattern_def_si);
348 if (!gsi_end_p (pattern_def_si))
350 if (dump_enabled_p ())
352 dump_printf_loc (MSG_NOTE, vect_location,
353 "==> examining pattern def stmt: ");
354 dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
355 pattern_def_stmt, 0);
356 dump_printf (MSG_NOTE, "\n");
359 stmt = pattern_def_stmt;
360 stmt_info = pattern_def_stmt_info;
362 else
364 pattern_def_si = gsi_none ();
365 analyze_pattern_stmt = false;
368 else
369 analyze_pattern_stmt = false;
372 if (gimple_get_lhs (stmt) == NULL_TREE
373 /* MASK_STORE has no lhs, but is ok. */
374 && (!is_gimple_call (stmt)
375 || !gimple_call_internal_p (stmt)
376 || gimple_call_internal_fn (stmt) != IFN_MASK_STORE))
378 if (is_gimple_call (stmt))
380 /* Ignore calls with no lhs. These must be calls to
381 #pragma omp simd functions, and what vectorization factor
382 it really needs can't be determined until
383 vectorizable_simd_clone_call. */
384 if (!analyze_pattern_stmt && gsi_end_p (pattern_def_si))
386 pattern_def_seq = NULL;
387 gsi_next (&si);
389 continue;
391 if (dump_enabled_p ())
393 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
394 "not vectorized: irregular stmt.");
395 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt,
397 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
399 return false;
402 if (VECTOR_MODE_P (TYPE_MODE (gimple_expr_type (stmt))))
404 if (dump_enabled_p ())
406 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
407 "not vectorized: vector stmt in loop:");
408 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
409 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
411 return false;
414 bool_result = false;
416 if (STMT_VINFO_VECTYPE (stmt_info))
418 /* The only case when a vectype had been already set is for stmts
419 that contain a dataref, or for "pattern-stmts" (stmts
420 generated by the vectorizer to represent/replace a certain
421 idiom). */
422 gcc_assert (STMT_VINFO_DATA_REF (stmt_info)
423 || is_pattern_stmt_p (stmt_info)
424 || !gsi_end_p (pattern_def_si));
425 vectype = STMT_VINFO_VECTYPE (stmt_info);
427 else
429 gcc_assert (!STMT_VINFO_DATA_REF (stmt_info));
430 if (is_gimple_call (stmt)
431 && gimple_call_internal_p (stmt)
432 && gimple_call_internal_fn (stmt) == IFN_MASK_STORE)
433 scalar_type = TREE_TYPE (gimple_call_arg (stmt, 3));
434 else
435 scalar_type = TREE_TYPE (gimple_get_lhs (stmt));
437 /* Bool ops don't participate in vectorization factor
438 computation. For comparison use compared types to
439 compute a factor. */
440 if (TREE_CODE (scalar_type) == BOOLEAN_TYPE)
442 if (STMT_VINFO_RELEVANT_P (stmt_info))
443 mask_producers.safe_push (stmt_info);
444 bool_result = true;
446 if (gimple_code (stmt) == GIMPLE_ASSIGN
447 && TREE_CODE_CLASS (gimple_assign_rhs_code (stmt))
448 == tcc_comparison
449 && TREE_CODE (TREE_TYPE (gimple_assign_rhs1 (stmt)))
450 != BOOLEAN_TYPE)
451 scalar_type = TREE_TYPE (gimple_assign_rhs1 (stmt));
452 else
454 if (!analyze_pattern_stmt && gsi_end_p (pattern_def_si))
456 pattern_def_seq = NULL;
457 gsi_next (&si);
459 continue;
463 if (dump_enabled_p ())
465 dump_printf_loc (MSG_NOTE, vect_location,
466 "get vectype for scalar type: ");
467 dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
468 dump_printf (MSG_NOTE, "\n");
470 vectype = get_vectype_for_scalar_type (scalar_type);
471 if (!vectype)
473 if (dump_enabled_p ())
475 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
476 "not vectorized: unsupported "
477 "data-type ");
478 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
479 scalar_type);
480 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
482 return false;
485 if (!bool_result)
486 STMT_VINFO_VECTYPE (stmt_info) = vectype;
488 if (dump_enabled_p ())
490 dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
491 dump_generic_expr (MSG_NOTE, TDF_SLIM, vectype);
492 dump_printf (MSG_NOTE, "\n");
496 /* Don't try to compute VF out scalar types if we stmt
497 produces boolean vector. Use result vectype instead. */
498 if (VECTOR_BOOLEAN_TYPE_P (vectype))
499 vf_vectype = vectype;
500 else
502 /* The vectorization factor is according to the smallest
503 scalar type (or the largest vector size, but we only
504 support one vector size per loop). */
505 if (!bool_result)
506 scalar_type = vect_get_smallest_scalar_type (stmt, &dummy,
507 &dummy);
508 if (dump_enabled_p ())
510 dump_printf_loc (MSG_NOTE, vect_location,
511 "get vectype for scalar type: ");
512 dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
513 dump_printf (MSG_NOTE, "\n");
515 vf_vectype = get_vectype_for_scalar_type (scalar_type);
517 if (!vf_vectype)
519 if (dump_enabled_p ())
521 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
522 "not vectorized: unsupported data-type ");
523 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
524 scalar_type);
525 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
527 return false;
530 if ((GET_MODE_SIZE (TYPE_MODE (vectype))
531 != GET_MODE_SIZE (TYPE_MODE (vf_vectype))))
533 if (dump_enabled_p ())
535 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
536 "not vectorized: different sized vector "
537 "types in statement, ");
538 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
539 vectype);
540 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
541 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
542 vf_vectype);
543 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
545 return false;
548 if (dump_enabled_p ())
550 dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
551 dump_generic_expr (MSG_NOTE, TDF_SLIM, vf_vectype);
552 dump_printf (MSG_NOTE, "\n");
555 nunits = TYPE_VECTOR_SUBPARTS (vf_vectype);
556 if (dump_enabled_p ())
557 dump_printf_loc (MSG_NOTE, vect_location, "nunits = %d\n", nunits);
558 if (!vectorization_factor
559 || (nunits > vectorization_factor))
560 vectorization_factor = nunits;
562 if (!analyze_pattern_stmt && gsi_end_p (pattern_def_si))
564 pattern_def_seq = NULL;
565 gsi_next (&si);
570 /* TODO: Analyze cost. Decide if worth while to vectorize. */
571 if (dump_enabled_p ())
572 dump_printf_loc (MSG_NOTE, vect_location, "vectorization factor = %d\n",
573 vectorization_factor);
574 if (vectorization_factor <= 1)
576 if (dump_enabled_p ())
577 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
578 "not vectorized: unsupported data-type\n");
579 return false;
581 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
583 for (i = 0; i < mask_producers.length (); i++)
585 tree mask_type = NULL;
587 stmt = STMT_VINFO_STMT (mask_producers[i]);
589 if (gimple_code (stmt) == GIMPLE_ASSIGN
590 && TREE_CODE_CLASS (gimple_assign_rhs_code (stmt)) == tcc_comparison
591 && TREE_CODE (TREE_TYPE (gimple_assign_rhs1 (stmt))) != BOOLEAN_TYPE)
593 scalar_type = TREE_TYPE (gimple_assign_rhs1 (stmt));
594 mask_type = get_mask_type_for_scalar_type (scalar_type);
596 if (!mask_type)
598 if (dump_enabled_p ())
599 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
600 "not vectorized: unsupported mask\n");
601 return false;
604 else
606 tree rhs;
607 ssa_op_iter iter;
608 gimple *def_stmt;
609 enum vect_def_type dt;
611 FOR_EACH_SSA_TREE_OPERAND (rhs, stmt, iter, SSA_OP_USE)
613 if (!vect_is_simple_use (rhs, mask_producers[i]->vinfo,
614 &def_stmt, &dt, &vectype))
616 if (dump_enabled_p ())
618 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
619 "not vectorized: can't compute mask type "
620 "for statement, ");
621 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt,
623 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
625 return false;
628 /* No vectype probably means external definition.
629 Allow it in case there is another operand which
630 allows to determine mask type. */
631 if (!vectype)
632 continue;
634 if (!mask_type)
635 mask_type = vectype;
636 else if (TYPE_VECTOR_SUBPARTS (mask_type)
637 != TYPE_VECTOR_SUBPARTS (vectype))
639 if (dump_enabled_p ())
641 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
642 "not vectorized: different sized masks "
643 "types in statement, ");
644 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
645 mask_type);
646 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
647 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
648 vectype);
649 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
651 return false;
653 else if (VECTOR_BOOLEAN_TYPE_P (mask_type)
654 != VECTOR_BOOLEAN_TYPE_P (vectype))
656 if (dump_enabled_p ())
658 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
659 "not vectorized: mixed mask and "
660 "nonmask vector types in statement, ");
661 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
662 mask_type);
663 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
664 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
665 vectype);
666 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
668 return false;
672 /* We may compare boolean value loaded as vector of integers.
673 Fix mask_type in such case. */
674 if (mask_type
675 && !VECTOR_BOOLEAN_TYPE_P (mask_type)
676 && gimple_code (stmt) == GIMPLE_ASSIGN
677 && TREE_CODE_CLASS (gimple_assign_rhs_code (stmt)) == tcc_comparison)
678 mask_type = build_same_sized_truth_vector_type (mask_type);
681 /* No mask_type should mean loop invariant predicate.
682 This is probably a subject for optimization in
683 if-conversion. */
684 if (!mask_type)
686 if (dump_enabled_p ())
688 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
689 "not vectorized: can't compute mask type "
690 "for statement, ");
691 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt,
693 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
695 return false;
698 STMT_VINFO_VECTYPE (mask_producers[i]) = mask_type;
701 return true;
705 /* Function vect_is_simple_iv_evolution.
707 FORNOW: A simple evolution of an induction variables in the loop is
708 considered a polynomial evolution. */
710 static bool
711 vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init,
712 tree * step)
714 tree init_expr;
715 tree step_expr;
716 tree evolution_part = evolution_part_in_loop_num (access_fn, loop_nb);
717 basic_block bb;
719 /* When there is no evolution in this loop, the evolution function
720 is not "simple". */
721 if (evolution_part == NULL_TREE)
722 return false;
724 /* When the evolution is a polynomial of degree >= 2
725 the evolution function is not "simple". */
726 if (tree_is_chrec (evolution_part))
727 return false;
729 step_expr = evolution_part;
730 init_expr = unshare_expr (initial_condition_in_loop_num (access_fn, loop_nb));
732 if (dump_enabled_p ())
734 dump_printf_loc (MSG_NOTE, vect_location, "step: ");
735 dump_generic_expr (MSG_NOTE, TDF_SLIM, step_expr);
736 dump_printf (MSG_NOTE, ", init: ");
737 dump_generic_expr (MSG_NOTE, TDF_SLIM, init_expr);
738 dump_printf (MSG_NOTE, "\n");
741 *init = init_expr;
742 *step = step_expr;
744 if (TREE_CODE (step_expr) != INTEGER_CST
745 && (TREE_CODE (step_expr) != SSA_NAME
746 || ((bb = gimple_bb (SSA_NAME_DEF_STMT (step_expr)))
747 && flow_bb_inside_loop_p (get_loop (cfun, loop_nb), bb))
748 || (!INTEGRAL_TYPE_P (TREE_TYPE (step_expr))
749 && (!SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr))
750 || !flag_associative_math)))
751 && (TREE_CODE (step_expr) != REAL_CST
752 || !flag_associative_math))
754 if (dump_enabled_p ())
755 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
756 "step unknown.\n");
757 return false;
760 return true;
763 /* Function vect_analyze_scalar_cycles_1.
765 Examine the cross iteration def-use cycles of scalar variables
766 in LOOP. LOOP_VINFO represents the loop that is now being
767 considered for vectorization (can be LOOP, or an outer-loop
768 enclosing LOOP). */
770 static void
771 vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, struct loop *loop)
773 basic_block bb = loop->header;
774 tree init, step;
775 auto_vec<gimple *, 64> worklist;
776 gphi_iterator gsi;
777 bool double_reduc;
779 if (dump_enabled_p ())
780 dump_printf_loc (MSG_NOTE, vect_location,
781 "=== vect_analyze_scalar_cycles ===\n");
783 /* First - identify all inductions. Reduction detection assumes that all the
784 inductions have been identified, therefore, this order must not be
785 changed. */
786 for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
788 gphi *phi = gsi.phi ();
789 tree access_fn = NULL;
790 tree def = PHI_RESULT (phi);
791 stmt_vec_info stmt_vinfo = vinfo_for_stmt (phi);
793 if (dump_enabled_p ())
795 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: ");
796 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
797 dump_printf (MSG_NOTE, "\n");
800 /* Skip virtual phi's. The data dependences that are associated with
801 virtual defs/uses (i.e., memory accesses) are analyzed elsewhere. */
802 if (virtual_operand_p (def))
803 continue;
805 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_unknown_def_type;
807 /* Analyze the evolution function. */
808 access_fn = analyze_scalar_evolution (loop, def);
809 if (access_fn)
811 STRIP_NOPS (access_fn);
812 if (dump_enabled_p ())
814 dump_printf_loc (MSG_NOTE, vect_location,
815 "Access function of PHI: ");
816 dump_generic_expr (MSG_NOTE, TDF_SLIM, access_fn);
817 dump_printf (MSG_NOTE, "\n");
819 STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
820 = initial_condition_in_loop_num (access_fn, loop->num);
821 STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo)
822 = evolution_part_in_loop_num (access_fn, loop->num);
825 if (!access_fn
826 || !vect_is_simple_iv_evolution (loop->num, access_fn, &init, &step)
827 || (LOOP_VINFO_LOOP (loop_vinfo) != loop
828 && TREE_CODE (step) != INTEGER_CST))
830 worklist.safe_push (phi);
831 continue;
834 gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
835 != NULL_TREE);
836 gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo) != NULL_TREE);
838 if (dump_enabled_p ())
839 dump_printf_loc (MSG_NOTE, vect_location, "Detected induction.\n");
840 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_induction_def;
844 /* Second - identify all reductions and nested cycles. */
845 while (worklist.length () > 0)
847 gimple *phi = worklist.pop ();
848 tree def = PHI_RESULT (phi);
849 stmt_vec_info stmt_vinfo = vinfo_for_stmt (phi);
850 gimple *reduc_stmt;
851 bool nested_cycle;
853 if (dump_enabled_p ())
855 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: ");
856 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
857 dump_printf (MSG_NOTE, "\n");
860 gcc_assert (!virtual_operand_p (def)
861 && STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_unknown_def_type);
863 nested_cycle = (loop != LOOP_VINFO_LOOP (loop_vinfo));
864 reduc_stmt = vect_force_simple_reduction (loop_vinfo, phi, !nested_cycle,
865 &double_reduc, false);
866 if (reduc_stmt)
868 if (double_reduc)
870 if (dump_enabled_p ())
871 dump_printf_loc (MSG_NOTE, vect_location,
872 "Detected double reduction.\n");
874 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_double_reduction_def;
875 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
876 vect_double_reduction_def;
878 else
880 if (nested_cycle)
882 if (dump_enabled_p ())
883 dump_printf_loc (MSG_NOTE, vect_location,
884 "Detected vectorizable nested cycle.\n");
886 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_nested_cycle;
887 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
888 vect_nested_cycle;
890 else
892 if (dump_enabled_p ())
893 dump_printf_loc (MSG_NOTE, vect_location,
894 "Detected reduction.\n");
896 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def;
897 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
898 vect_reduction_def;
899 /* Store the reduction cycles for possible vectorization in
900 loop-aware SLP. */
901 LOOP_VINFO_REDUCTIONS (loop_vinfo).safe_push (reduc_stmt);
905 else
906 if (dump_enabled_p ())
907 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
908 "Unknown def-use cycle pattern.\n");
913 /* Function vect_analyze_scalar_cycles.
915 Examine the cross iteration def-use cycles of scalar variables, by
916 analyzing the loop-header PHIs of scalar variables. Classify each
917 cycle as one of the following: invariant, induction, reduction, unknown.
918 We do that for the loop represented by LOOP_VINFO, and also to its
919 inner-loop, if exists.
920 Examples for scalar cycles:
922 Example1: reduction:
924 loop1:
925 for (i=0; i<N; i++)
926 sum += a[i];
928 Example2: induction:
930 loop2:
931 for (i=0; i<N; i++)
932 a[i] = i; */
934 static void
935 vect_analyze_scalar_cycles (loop_vec_info loop_vinfo)
937 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
939 vect_analyze_scalar_cycles_1 (loop_vinfo, loop);
941 /* When vectorizing an outer-loop, the inner-loop is executed sequentially.
942 Reductions in such inner-loop therefore have different properties than
943 the reductions in the nest that gets vectorized:
944 1. When vectorized, they are executed in the same order as in the original
945 scalar loop, so we can't change the order of computation when
946 vectorizing them.
947 2. FIXME: Inner-loop reductions can be used in the inner-loop, so the
948 current checks are too strict. */
950 if (loop->inner)
951 vect_analyze_scalar_cycles_1 (loop_vinfo, loop->inner);
954 /* Transfer group and reduction information from STMT to its pattern stmt. */
956 static void
957 vect_fixup_reduc_chain (gimple *stmt)
959 gimple *firstp = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt));
960 gimple *stmtp;
961 gcc_assert (!GROUP_FIRST_ELEMENT (vinfo_for_stmt (firstp))
962 && GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)));
963 GROUP_SIZE (vinfo_for_stmt (firstp)) = GROUP_SIZE (vinfo_for_stmt (stmt));
966 stmtp = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt));
967 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmtp)) = firstp;
968 stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmt));
969 if (stmt)
970 GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmtp))
971 = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt));
973 while (stmt);
974 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmtp)) = vect_reduction_def;
977 /* Fixup scalar cycles that now have their stmts detected as patterns. */
979 static void
980 vect_fixup_scalar_cycles_with_patterns (loop_vec_info loop_vinfo)
982 gimple *first;
983 unsigned i;
985 FOR_EACH_VEC_ELT (LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo), i, first)
986 if (STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (first)))
988 vect_fixup_reduc_chain (first);
989 LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo)[i]
990 = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (first));
994 /* Function vect_get_loop_niters.
996 Determine how many iterations the loop is executed and place it
997 in NUMBER_OF_ITERATIONS. Place the number of latch iterations
998 in NUMBER_OF_ITERATIONSM1.
1000 Return the loop exit condition. */
1003 static gcond *
1004 vect_get_loop_niters (struct loop *loop, tree *number_of_iterations,
1005 tree *number_of_iterationsm1)
1007 tree niters;
1009 if (dump_enabled_p ())
1010 dump_printf_loc (MSG_NOTE, vect_location,
1011 "=== get_loop_niters ===\n");
1013 niters = number_of_latch_executions (loop);
1014 *number_of_iterationsm1 = niters;
1016 /* We want the number of loop header executions which is the number
1017 of latch executions plus one.
1018 ??? For UINT_MAX latch executions this number overflows to zero
1019 for loops like do { n++; } while (n != 0); */
1020 if (niters && !chrec_contains_undetermined (niters))
1021 niters = fold_build2 (PLUS_EXPR, TREE_TYPE (niters), unshare_expr (niters),
1022 build_int_cst (TREE_TYPE (niters), 1));
1023 *number_of_iterations = niters;
1025 return get_loop_exit_condition (loop);
1029 /* Function bb_in_loop_p
1031 Used as predicate for dfs order traversal of the loop bbs. */
1033 static bool
1034 bb_in_loop_p (const_basic_block bb, const void *data)
1036 const struct loop *const loop = (const struct loop *)data;
1037 if (flow_bb_inside_loop_p (loop, bb))
1038 return true;
1039 return false;
1043 /* Function new_loop_vec_info.
1045 Create and initialize a new loop_vec_info struct for LOOP, as well as
1046 stmt_vec_info structs for all the stmts in LOOP. */
1048 static loop_vec_info
1049 new_loop_vec_info (struct loop *loop)
1051 loop_vec_info res;
1052 basic_block *bbs;
1053 gimple_stmt_iterator si;
1054 unsigned int i, nbbs;
1056 res = (loop_vec_info) xcalloc (1, sizeof (struct _loop_vec_info));
1057 res->kind = vec_info::loop;
1058 LOOP_VINFO_LOOP (res) = loop;
1060 bbs = get_loop_body (loop);
1062 /* Create/Update stmt_info for all stmts in the loop. */
1063 for (i = 0; i < loop->num_nodes; i++)
1065 basic_block bb = bbs[i];
1067 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
1069 gimple *phi = gsi_stmt (si);
1070 gimple_set_uid (phi, 0);
1071 set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, res));
1074 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1076 gimple *stmt = gsi_stmt (si);
1077 gimple_set_uid (stmt, 0);
1078 set_vinfo_for_stmt (stmt, new_stmt_vec_info (stmt, res));
1082 /* CHECKME: We want to visit all BBs before their successors (except for
1083 latch blocks, for which this assertion wouldn't hold). In the simple
1084 case of the loop forms we allow, a dfs order of the BBs would the same
1085 as reversed postorder traversal, so we are safe. */
1087 free (bbs);
1088 bbs = XCNEWVEC (basic_block, loop->num_nodes);
1089 nbbs = dfs_enumerate_from (loop->header, 0, bb_in_loop_p,
1090 bbs, loop->num_nodes, loop);
1091 gcc_assert (nbbs == loop->num_nodes);
1093 LOOP_VINFO_BBS (res) = bbs;
1094 LOOP_VINFO_NITERSM1 (res) = NULL;
1095 LOOP_VINFO_NITERS (res) = NULL;
1096 LOOP_VINFO_NITERS_UNCHANGED (res) = NULL;
1097 LOOP_VINFO_COST_MODEL_THRESHOLD (res) = 0;
1098 LOOP_VINFO_VECTORIZABLE_P (res) = 0;
1099 LOOP_VINFO_PEELING_FOR_ALIGNMENT (res) = 0;
1100 LOOP_VINFO_VECT_FACTOR (res) = 0;
1101 LOOP_VINFO_LOOP_NEST (res) = vNULL;
1102 LOOP_VINFO_DATAREFS (res) = vNULL;
1103 LOOP_VINFO_DDRS (res) = vNULL;
1104 LOOP_VINFO_UNALIGNED_DR (res) = NULL;
1105 LOOP_VINFO_MAY_MISALIGN_STMTS (res) = vNULL;
1106 LOOP_VINFO_MAY_ALIAS_DDRS (res) = vNULL;
1107 LOOP_VINFO_GROUPED_STORES (res) = vNULL;
1108 LOOP_VINFO_REDUCTIONS (res) = vNULL;
1109 LOOP_VINFO_REDUCTION_CHAINS (res) = vNULL;
1110 LOOP_VINFO_SLP_INSTANCES (res) = vNULL;
1111 LOOP_VINFO_SLP_UNROLLING_FACTOR (res) = 1;
1112 LOOP_VINFO_TARGET_COST_DATA (res) = init_cost (loop);
1113 LOOP_VINFO_PEELING_FOR_GAPS (res) = false;
1114 LOOP_VINFO_PEELING_FOR_NITER (res) = false;
1115 LOOP_VINFO_OPERANDS_SWAPPED (res) = false;
1117 return res;
1121 /* Function destroy_loop_vec_info.
1123 Free LOOP_VINFO struct, as well as all the stmt_vec_info structs of all the
1124 stmts in the loop. */
1126 void
1127 destroy_loop_vec_info (loop_vec_info loop_vinfo, bool clean_stmts)
1129 struct loop *loop;
1130 basic_block *bbs;
1131 int nbbs;
1132 gimple_stmt_iterator si;
1133 int j;
1134 vec<slp_instance> slp_instances;
1135 slp_instance instance;
1136 bool swapped;
1138 if (!loop_vinfo)
1139 return;
1141 loop = LOOP_VINFO_LOOP (loop_vinfo);
1143 bbs = LOOP_VINFO_BBS (loop_vinfo);
1144 nbbs = clean_stmts ? loop->num_nodes : 0;
1145 swapped = LOOP_VINFO_OPERANDS_SWAPPED (loop_vinfo);
1147 for (j = 0; j < nbbs; j++)
1149 basic_block bb = bbs[j];
1150 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
1151 free_stmt_vec_info (gsi_stmt (si));
1153 for (si = gsi_start_bb (bb); !gsi_end_p (si); )
1155 gimple *stmt = gsi_stmt (si);
1157 /* We may have broken canonical form by moving a constant
1158 into RHS1 of a commutative op. Fix such occurrences. */
1159 if (swapped && is_gimple_assign (stmt))
1161 enum tree_code code = gimple_assign_rhs_code (stmt);
1163 if ((code == PLUS_EXPR
1164 || code == POINTER_PLUS_EXPR
1165 || code == MULT_EXPR)
1166 && CONSTANT_CLASS_P (gimple_assign_rhs1 (stmt)))
1167 swap_ssa_operands (stmt,
1168 gimple_assign_rhs1_ptr (stmt),
1169 gimple_assign_rhs2_ptr (stmt));
1172 /* Free stmt_vec_info. */
1173 free_stmt_vec_info (stmt);
1174 gsi_next (&si);
1178 free (LOOP_VINFO_BBS (loop_vinfo));
1179 vect_destroy_datarefs (loop_vinfo);
1180 free_dependence_relations (LOOP_VINFO_DDRS (loop_vinfo));
1181 LOOP_VINFO_LOOP_NEST (loop_vinfo).release ();
1182 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).release ();
1183 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).release ();
1184 slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
1185 FOR_EACH_VEC_ELT (slp_instances, j, instance)
1186 vect_free_slp_instance (instance);
1188 LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
1189 LOOP_VINFO_GROUPED_STORES (loop_vinfo).release ();
1190 LOOP_VINFO_REDUCTIONS (loop_vinfo).release ();
1191 LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).release ();
1193 destroy_cost_data (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo));
1194 loop_vinfo->scalar_cost_vec.release ();
1196 free (loop_vinfo);
1197 loop->aux = NULL;
1201 /* Calculate the cost of one scalar iteration of the loop. */
1202 static void
1203 vect_compute_single_scalar_iteration_cost (loop_vec_info loop_vinfo)
1205 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1206 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1207 int nbbs = loop->num_nodes, factor, scalar_single_iter_cost = 0;
1208 int innerloop_iters, i;
1210 /* Count statements in scalar loop. Using this as scalar cost for a single
1211 iteration for now.
1213 TODO: Add outer loop support.
1215 TODO: Consider assigning different costs to different scalar
1216 statements. */
1218 /* FORNOW. */
1219 innerloop_iters = 1;
1220 if (loop->inner)
1221 innerloop_iters = 50; /* FIXME */
1223 for (i = 0; i < nbbs; i++)
1225 gimple_stmt_iterator si;
1226 basic_block bb = bbs[i];
1228 if (bb->loop_father == loop->inner)
1229 factor = innerloop_iters;
1230 else
1231 factor = 1;
1233 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1235 gimple *stmt = gsi_stmt (si);
1236 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1238 if (!is_gimple_assign (stmt) && !is_gimple_call (stmt))
1239 continue;
1241 /* Skip stmts that are not vectorized inside the loop. */
1242 if (stmt_info
1243 && !STMT_VINFO_RELEVANT_P (stmt_info)
1244 && (!STMT_VINFO_LIVE_P (stmt_info)
1245 || !VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1246 && !STMT_VINFO_IN_PATTERN_P (stmt_info))
1247 continue;
1249 vect_cost_for_stmt kind;
1250 if (STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt)))
1252 if (DR_IS_READ (STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt))))
1253 kind = scalar_load;
1254 else
1255 kind = scalar_store;
1257 else
1258 kind = scalar_stmt;
1260 scalar_single_iter_cost
1261 += record_stmt_cost (&LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1262 factor, kind, NULL, 0, vect_prologue);
1265 LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo)
1266 = scalar_single_iter_cost;
1270 /* Function vect_analyze_loop_form_1.
1272 Verify that certain CFG restrictions hold, including:
1273 - the loop has a pre-header
1274 - the loop has a single entry and exit
1275 - the loop exit condition is simple enough, and the number of iterations
1276 can be analyzed (a countable loop). */
1278 bool
1279 vect_analyze_loop_form_1 (struct loop *loop, gcond **loop_cond,
1280 tree *number_of_iterationsm1,
1281 tree *number_of_iterations, gcond **inner_loop_cond)
1283 if (dump_enabled_p ())
1284 dump_printf_loc (MSG_NOTE, vect_location,
1285 "=== vect_analyze_loop_form ===\n");
1287 /* Different restrictions apply when we are considering an inner-most loop,
1288 vs. an outer (nested) loop.
1289 (FORNOW. May want to relax some of these restrictions in the future). */
1291 if (!loop->inner)
1293 /* Inner-most loop. We currently require that the number of BBs is
1294 exactly 2 (the header and latch). Vectorizable inner-most loops
1295 look like this:
1297 (pre-header)
1299 header <--------+
1300 | | |
1301 | +--> latch --+
1303 (exit-bb) */
1305 if (loop->num_nodes != 2)
1307 if (dump_enabled_p ())
1308 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1309 "not vectorized: control flow in loop.\n");
1310 return false;
1313 if (empty_block_p (loop->header))
1315 if (dump_enabled_p ())
1316 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1317 "not vectorized: empty loop.\n");
1318 return false;
1321 else
1323 struct loop *innerloop = loop->inner;
1324 edge entryedge;
1326 /* Nested loop. We currently require that the loop is doubly-nested,
1327 contains a single inner loop, and the number of BBs is exactly 5.
1328 Vectorizable outer-loops look like this:
1330 (pre-header)
1332 header <---+
1334 inner-loop |
1336 tail ------+
1338 (exit-bb)
1340 The inner-loop has the properties expected of inner-most loops
1341 as described above. */
1343 if ((loop->inner)->inner || (loop->inner)->next)
1345 if (dump_enabled_p ())
1346 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1347 "not vectorized: multiple nested loops.\n");
1348 return false;
1351 if (loop->num_nodes != 5)
1353 if (dump_enabled_p ())
1354 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1355 "not vectorized: control flow in loop.\n");
1356 return false;
1359 entryedge = loop_preheader_edge (innerloop);
1360 if (entryedge->src != loop->header
1361 || !single_exit (innerloop)
1362 || single_exit (innerloop)->dest != EDGE_PRED (loop->latch, 0)->src)
1364 if (dump_enabled_p ())
1365 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1366 "not vectorized: unsupported outerloop form.\n");
1367 return false;
1370 /* Analyze the inner-loop. */
1371 tree inner_niterm1, inner_niter;
1372 if (! vect_analyze_loop_form_1 (loop->inner, inner_loop_cond,
1373 &inner_niterm1, &inner_niter, NULL))
1375 if (dump_enabled_p ())
1376 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1377 "not vectorized: Bad inner loop.\n");
1378 return false;
1381 if (!expr_invariant_in_loop_p (loop, inner_niter))
1383 if (dump_enabled_p ())
1384 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1385 "not vectorized: inner-loop count not"
1386 " invariant.\n");
1387 return false;
1390 if (dump_enabled_p ())
1391 dump_printf_loc (MSG_NOTE, vect_location,
1392 "Considering outer-loop vectorization.\n");
1395 if (!single_exit (loop)
1396 || EDGE_COUNT (loop->header->preds) != 2)
1398 if (dump_enabled_p ())
1400 if (!single_exit (loop))
1401 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1402 "not vectorized: multiple exits.\n");
1403 else if (EDGE_COUNT (loop->header->preds) != 2)
1404 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1405 "not vectorized: too many incoming edges.\n");
1407 return false;
1410 /* We assume that the loop exit condition is at the end of the loop. i.e,
1411 that the loop is represented as a do-while (with a proper if-guard
1412 before the loop if needed), where the loop header contains all the
1413 executable statements, and the latch is empty. */
1414 if (!empty_block_p (loop->latch)
1415 || !gimple_seq_empty_p (phi_nodes (loop->latch)))
1417 if (dump_enabled_p ())
1418 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1419 "not vectorized: latch block not empty.\n");
1420 return false;
1423 /* Make sure there exists a single-predecessor exit bb: */
1424 if (!single_pred_p (single_exit (loop)->dest))
1426 edge e = single_exit (loop);
1427 if (!(e->flags & EDGE_ABNORMAL))
1429 split_loop_exit_edge (e);
1430 if (dump_enabled_p ())
1431 dump_printf (MSG_NOTE, "split exit edge.\n");
1433 else
1435 if (dump_enabled_p ())
1436 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1437 "not vectorized: abnormal loop exit edge.\n");
1438 return false;
1442 *loop_cond = vect_get_loop_niters (loop, number_of_iterations,
1443 number_of_iterationsm1);
1444 if (!*loop_cond)
1446 if (dump_enabled_p ())
1447 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1448 "not vectorized: complicated exit condition.\n");
1449 return false;
1452 if (!*number_of_iterations
1453 || chrec_contains_undetermined (*number_of_iterations))
1455 if (dump_enabled_p ())
1456 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1457 "not vectorized: number of iterations cannot be "
1458 "computed.\n");
1459 return false;
1462 if (integer_zerop (*number_of_iterations))
1464 if (dump_enabled_p ())
1465 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1466 "not vectorized: number of iterations = 0.\n");
1467 return false;
1470 return true;
1473 /* Analyze LOOP form and return a loop_vec_info if it is of suitable form. */
1475 loop_vec_info
1476 vect_analyze_loop_form (struct loop *loop)
1478 tree number_of_iterations, number_of_iterationsm1;
1479 gcond *loop_cond, *inner_loop_cond = NULL;
1481 if (! vect_analyze_loop_form_1 (loop, &loop_cond, &number_of_iterationsm1,
1482 &number_of_iterations, &inner_loop_cond))
1483 return NULL;
1485 loop_vec_info loop_vinfo = new_loop_vec_info (loop);
1486 LOOP_VINFO_NITERSM1 (loop_vinfo) = number_of_iterationsm1;
1487 LOOP_VINFO_NITERS (loop_vinfo) = number_of_iterations;
1488 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = number_of_iterations;
1490 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1492 if (dump_enabled_p ())
1494 dump_printf_loc (MSG_NOTE, vect_location,
1495 "Symbolic number of iterations is ");
1496 dump_generic_expr (MSG_NOTE, TDF_DETAILS, number_of_iterations);
1497 dump_printf (MSG_NOTE, "\n");
1501 STMT_VINFO_TYPE (vinfo_for_stmt (loop_cond)) = loop_exit_ctrl_vec_info_type;
1502 if (inner_loop_cond)
1503 STMT_VINFO_TYPE (vinfo_for_stmt (inner_loop_cond))
1504 = loop_exit_ctrl_vec_info_type;
1506 gcc_assert (!loop->aux);
1507 loop->aux = loop_vinfo;
1508 return loop_vinfo;
1513 /* Scan the loop stmts and dependent on whether there are any (non-)SLP
1514 statements update the vectorization factor. */
1516 static void
1517 vect_update_vf_for_slp (loop_vec_info loop_vinfo)
1519 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1520 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1521 int nbbs = loop->num_nodes;
1522 unsigned int vectorization_factor;
1523 int i;
1525 if (dump_enabled_p ())
1526 dump_printf_loc (MSG_NOTE, vect_location,
1527 "=== vect_update_vf_for_slp ===\n");
1529 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1530 gcc_assert (vectorization_factor != 0);
1532 /* If all the stmts in the loop can be SLPed, we perform only SLP, and
1533 vectorization factor of the loop is the unrolling factor required by
1534 the SLP instances. If that unrolling factor is 1, we say, that we
1535 perform pure SLP on loop - cross iteration parallelism is not
1536 exploited. */
1537 bool only_slp_in_loop = true;
1538 for (i = 0; i < nbbs; i++)
1540 basic_block bb = bbs[i];
1541 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1542 gsi_next (&si))
1544 gimple *stmt = gsi_stmt (si);
1545 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1546 if (STMT_VINFO_IN_PATTERN_P (stmt_info)
1547 && STMT_VINFO_RELATED_STMT (stmt_info))
1549 stmt = STMT_VINFO_RELATED_STMT (stmt_info);
1550 stmt_info = vinfo_for_stmt (stmt);
1552 if ((STMT_VINFO_RELEVANT_P (stmt_info)
1553 || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1554 && !PURE_SLP_STMT (stmt_info))
1555 /* STMT needs both SLP and loop-based vectorization. */
1556 only_slp_in_loop = false;
1560 if (only_slp_in_loop)
1561 vectorization_factor = LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo);
1562 else
1563 vectorization_factor
1564 = least_common_multiple (vectorization_factor,
1565 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo));
1567 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
1568 if (dump_enabled_p ())
1569 dump_printf_loc (MSG_NOTE, vect_location,
1570 "Updating vectorization factor to %d\n",
1571 vectorization_factor);
1574 /* Function vect_analyze_loop_operations.
1576 Scan the loop stmts and make sure they are all vectorizable. */
1578 static bool
1579 vect_analyze_loop_operations (loop_vec_info loop_vinfo)
1581 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1582 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1583 int nbbs = loop->num_nodes;
1584 int i;
1585 stmt_vec_info stmt_info;
1586 bool need_to_vectorize = false;
1587 bool ok;
1589 if (dump_enabled_p ())
1590 dump_printf_loc (MSG_NOTE, vect_location,
1591 "=== vect_analyze_loop_operations ===\n");
1593 for (i = 0; i < nbbs; i++)
1595 basic_block bb = bbs[i];
1597 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
1598 gsi_next (&si))
1600 gphi *phi = si.phi ();
1601 ok = true;
1603 stmt_info = vinfo_for_stmt (phi);
1604 if (dump_enabled_p ())
1606 dump_printf_loc (MSG_NOTE, vect_location, "examining phi: ");
1607 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
1608 dump_printf (MSG_NOTE, "\n");
1610 if (virtual_operand_p (gimple_phi_result (phi)))
1611 continue;
1613 /* Inner-loop loop-closed exit phi in outer-loop vectorization
1614 (i.e., a phi in the tail of the outer-loop). */
1615 if (! is_loop_header_bb_p (bb))
1617 /* FORNOW: we currently don't support the case that these phis
1618 are not used in the outerloop (unless it is double reduction,
1619 i.e., this phi is vect_reduction_def), cause this case
1620 requires to actually do something here. */
1621 if ((!STMT_VINFO_RELEVANT_P (stmt_info)
1622 || STMT_VINFO_LIVE_P (stmt_info))
1623 && STMT_VINFO_DEF_TYPE (stmt_info)
1624 != vect_double_reduction_def)
1626 if (dump_enabled_p ())
1627 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1628 "Unsupported loop-closed phi in "
1629 "outer-loop.\n");
1630 return false;
1633 /* If PHI is used in the outer loop, we check that its operand
1634 is defined in the inner loop. */
1635 if (STMT_VINFO_RELEVANT_P (stmt_info))
1637 tree phi_op;
1638 gimple *op_def_stmt;
1640 if (gimple_phi_num_args (phi) != 1)
1641 return false;
1643 phi_op = PHI_ARG_DEF (phi, 0);
1644 if (TREE_CODE (phi_op) != SSA_NAME)
1645 return false;
1647 op_def_stmt = SSA_NAME_DEF_STMT (phi_op);
1648 if (gimple_nop_p (op_def_stmt)
1649 || !flow_bb_inside_loop_p (loop, gimple_bb (op_def_stmt))
1650 || !vinfo_for_stmt (op_def_stmt))
1651 return false;
1653 if (STMT_VINFO_RELEVANT (vinfo_for_stmt (op_def_stmt))
1654 != vect_used_in_outer
1655 && STMT_VINFO_RELEVANT (vinfo_for_stmt (op_def_stmt))
1656 != vect_used_in_outer_by_reduction)
1657 return false;
1660 continue;
1663 gcc_assert (stmt_info);
1665 if (STMT_VINFO_LIVE_P (stmt_info))
1667 /* FORNOW: not yet supported. */
1668 if (dump_enabled_p ())
1669 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1670 "not vectorized: value used after loop.\n");
1671 return false;
1674 if (STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope
1675 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
1677 /* A scalar-dependence cycle that we don't support. */
1678 if (dump_enabled_p ())
1679 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1680 "not vectorized: scalar dependence cycle.\n");
1681 return false;
1684 if (STMT_VINFO_RELEVANT_P (stmt_info))
1686 need_to_vectorize = true;
1687 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def)
1688 ok = vectorizable_induction (phi, NULL, NULL);
1691 if (!ok)
1693 if (dump_enabled_p ())
1695 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1696 "not vectorized: relevant phi not "
1697 "supported: ");
1698 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, phi, 0);
1699 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
1701 return false;
1705 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1706 gsi_next (&si))
1708 gimple *stmt = gsi_stmt (si);
1709 if (!gimple_clobber_p (stmt)
1710 && !vect_analyze_stmt (stmt, &need_to_vectorize, NULL))
1711 return false;
1713 } /* bbs */
1715 /* All operations in the loop are either irrelevant (deal with loop
1716 control, or dead), or only used outside the loop and can be moved
1717 out of the loop (e.g. invariants, inductions). The loop can be
1718 optimized away by scalar optimizations. We're better off not
1719 touching this loop. */
1720 if (!need_to_vectorize)
1722 if (dump_enabled_p ())
1723 dump_printf_loc (MSG_NOTE, vect_location,
1724 "All the computation can be taken out of the loop.\n");
1725 if (dump_enabled_p ())
1726 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1727 "not vectorized: redundant loop. no profit to "
1728 "vectorize.\n");
1729 return false;
1732 return true;
1736 /* Function vect_analyze_loop_2.
1738 Apply a set of analyses on LOOP, and create a loop_vec_info struct
1739 for it. The different analyses will record information in the
1740 loop_vec_info struct. */
1741 static bool
1742 vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal)
1744 bool ok;
1745 int max_vf = MAX_VECTORIZATION_FACTOR;
1746 int min_vf = 2;
1747 unsigned int n_stmts = 0;
1749 /* The first group of checks is independent of the vector size. */
1750 fatal = true;
1752 /* Find all data references in the loop (which correspond to vdefs/vuses)
1753 and analyze their evolution in the loop. */
1755 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1757 loop_p loop = LOOP_VINFO_LOOP (loop_vinfo);
1758 if (!find_loop_nest (loop, &LOOP_VINFO_LOOP_NEST (loop_vinfo)))
1760 if (dump_enabled_p ())
1761 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1762 "not vectorized: loop contains function calls"
1763 " or data references that cannot be analyzed\n");
1764 return false;
1767 for (unsigned i = 0; i < loop->num_nodes; i++)
1768 for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
1769 !gsi_end_p (gsi); gsi_next (&gsi))
1771 gimple *stmt = gsi_stmt (gsi);
1772 if (is_gimple_debug (stmt))
1773 continue;
1774 ++n_stmts;
1775 if (!find_data_references_in_stmt (loop, stmt,
1776 &LOOP_VINFO_DATAREFS (loop_vinfo)))
1778 if (is_gimple_call (stmt) && loop->safelen)
1780 tree fndecl = gimple_call_fndecl (stmt), op;
1781 if (fndecl != NULL_TREE)
1783 cgraph_node *node = cgraph_node::get (fndecl);
1784 if (node != NULL && node->simd_clones != NULL)
1786 unsigned int j, n = gimple_call_num_args (stmt);
1787 for (j = 0; j < n; j++)
1789 op = gimple_call_arg (stmt, j);
1790 if (DECL_P (op)
1791 || (REFERENCE_CLASS_P (op)
1792 && get_base_address (op)))
1793 break;
1795 op = gimple_call_lhs (stmt);
1796 /* Ignore #pragma omp declare simd functions
1797 if they don't have data references in the
1798 call stmt itself. */
1799 if (j == n
1800 && !(op
1801 && (DECL_P (op)
1802 || (REFERENCE_CLASS_P (op)
1803 && get_base_address (op)))))
1804 continue;
1808 if (dump_enabled_p ())
1809 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1810 "not vectorized: loop contains function "
1811 "calls or data references that cannot "
1812 "be analyzed\n");
1813 return false;
1817 /* Analyze the data references and also adjust the minimal
1818 vectorization factor according to the loads and stores. */
1820 ok = vect_analyze_data_refs (loop_vinfo, &min_vf);
1821 if (!ok)
1823 if (dump_enabled_p ())
1824 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1825 "bad data references.\n");
1826 return false;
1829 /* Classify all cross-iteration scalar data-flow cycles.
1830 Cross-iteration cycles caused by virtual phis are analyzed separately. */
1831 vect_analyze_scalar_cycles (loop_vinfo);
1833 vect_pattern_recog (loop_vinfo);
1835 vect_fixup_scalar_cycles_with_patterns (loop_vinfo);
1837 /* Analyze the access patterns of the data-refs in the loop (consecutive,
1838 complex, etc.). FORNOW: Only handle consecutive access pattern. */
1840 ok = vect_analyze_data_ref_accesses (loop_vinfo);
1841 if (!ok)
1843 if (dump_enabled_p ())
1844 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1845 "bad data access.\n");
1846 return false;
1849 /* Data-flow analysis to detect stmts that do not need to be vectorized. */
1851 ok = vect_mark_stmts_to_be_vectorized (loop_vinfo);
1852 if (!ok)
1854 if (dump_enabled_p ())
1855 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1856 "unexpected pattern.\n");
1857 return false;
1860 /* While the rest of the analysis below depends on it in some way. */
1861 fatal = false;
1863 /* Analyze data dependences between the data-refs in the loop
1864 and adjust the maximum vectorization factor according to
1865 the dependences.
1866 FORNOW: fail at the first data dependence that we encounter. */
1868 ok = vect_analyze_data_ref_dependences (loop_vinfo, &max_vf);
1869 if (!ok
1870 || max_vf < min_vf)
1872 if (dump_enabled_p ())
1873 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1874 "bad data dependence.\n");
1875 return false;
1878 ok = vect_determine_vectorization_factor (loop_vinfo);
1879 if (!ok)
1881 if (dump_enabled_p ())
1882 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1883 "can't determine vectorization factor.\n");
1884 return false;
1886 if (max_vf < LOOP_VINFO_VECT_FACTOR (loop_vinfo))
1888 if (dump_enabled_p ())
1889 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1890 "bad data dependence.\n");
1891 return false;
1894 /* Compute the scalar iteration cost. */
1895 vect_compute_single_scalar_iteration_cost (loop_vinfo);
1897 int saved_vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1898 HOST_WIDE_INT estimated_niter;
1899 unsigned th;
1900 int min_scalar_loop_bound;
1902 /* Check the SLP opportunities in the loop, analyze and build SLP trees. */
1903 ok = vect_analyze_slp (loop_vinfo, n_stmts);
1904 if (!ok)
1905 return false;
1907 /* If there are any SLP instances mark them as pure_slp. */
1908 bool slp = vect_make_slp_decision (loop_vinfo);
1909 if (slp)
1911 /* Find stmts that need to be both vectorized and SLPed. */
1912 vect_detect_hybrid_slp (loop_vinfo);
1914 /* Update the vectorization factor based on the SLP decision. */
1915 vect_update_vf_for_slp (loop_vinfo);
1918 /* This is the point where we can re-start analysis with SLP forced off. */
1919 start_over:
1921 /* Now the vectorization factor is final. */
1922 unsigned vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1923 gcc_assert (vectorization_factor != 0);
1925 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && dump_enabled_p ())
1926 dump_printf_loc (MSG_NOTE, vect_location,
1927 "vectorization_factor = %d, niters = "
1928 HOST_WIDE_INT_PRINT_DEC "\n", vectorization_factor,
1929 LOOP_VINFO_INT_NITERS (loop_vinfo));
1931 HOST_WIDE_INT max_niter
1932 = max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
1933 if ((LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1934 && (LOOP_VINFO_INT_NITERS (loop_vinfo) < vectorization_factor))
1935 || (max_niter != -1
1936 && (unsigned HOST_WIDE_INT) max_niter < vectorization_factor))
1938 if (dump_enabled_p ())
1939 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1940 "not vectorized: iteration count smaller than "
1941 "vectorization factor.\n");
1942 return false;
1945 /* Analyze the alignment of the data-refs in the loop.
1946 Fail if a data reference is found that cannot be vectorized. */
1948 ok = vect_analyze_data_refs_alignment (loop_vinfo);
1949 if (!ok)
1951 if (dump_enabled_p ())
1952 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1953 "bad data alignment.\n");
1954 return false;
1957 /* Prune the list of ddrs to be tested at run-time by versioning for alias.
1958 It is important to call pruning after vect_analyze_data_ref_accesses,
1959 since we use grouping information gathered by interleaving analysis. */
1960 ok = vect_prune_runtime_alias_test_list (loop_vinfo);
1961 if (!ok)
1963 if (dump_enabled_p ())
1964 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1965 "number of versioning for alias "
1966 "run-time tests exceeds %d "
1967 "(--param vect-max-version-for-alias-checks)\n",
1968 PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS));
1969 return false;
1972 /* This pass will decide on using loop versioning and/or loop peeling in
1973 order to enhance the alignment of data references in the loop. */
1974 ok = vect_enhance_data_refs_alignment (loop_vinfo);
1975 if (!ok)
1977 if (dump_enabled_p ())
1978 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1979 "bad data alignment.\n");
1980 return false;
1983 if (slp)
1985 /* Analyze operations in the SLP instances. Note this may
1986 remove unsupported SLP instances which makes the above
1987 SLP kind detection invalid. */
1988 unsigned old_size = LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length ();
1989 vect_slp_analyze_operations (LOOP_VINFO_SLP_INSTANCES (loop_vinfo),
1990 LOOP_VINFO_TARGET_COST_DATA (loop_vinfo));
1991 if (LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length () != old_size)
1992 goto again;
1995 /* Scan all the remaining operations in the loop that are not subject
1996 to SLP and make sure they are vectorizable. */
1997 ok = vect_analyze_loop_operations (loop_vinfo);
1998 if (!ok)
2000 if (dump_enabled_p ())
2001 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2002 "bad operation or unsupported loop bound.\n");
2003 return false;
2006 /* Analyze cost. Decide if worth while to vectorize. */
2007 int min_profitable_estimate, min_profitable_iters;
2008 vect_estimate_min_profitable_iters (loop_vinfo, &min_profitable_iters,
2009 &min_profitable_estimate);
2011 if (min_profitable_iters < 0)
2013 if (dump_enabled_p ())
2014 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2015 "not vectorized: vectorization not profitable.\n");
2016 if (dump_enabled_p ())
2017 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2018 "not vectorized: vector version will never be "
2019 "profitable.\n");
2020 goto again;
2023 min_scalar_loop_bound = ((PARAM_VALUE (PARAM_MIN_VECT_LOOP_BOUND)
2024 * vectorization_factor) - 1);
2026 /* Use the cost model only if it is more conservative than user specified
2027 threshold. */
2028 th = (unsigned) min_scalar_loop_bound;
2029 if (min_profitable_iters
2030 && (!min_scalar_loop_bound
2031 || min_profitable_iters > min_scalar_loop_bound))
2032 th = (unsigned) min_profitable_iters;
2034 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = th;
2036 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2037 && LOOP_VINFO_INT_NITERS (loop_vinfo) <= th)
2039 if (dump_enabled_p ())
2040 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2041 "not vectorized: vectorization not profitable.\n");
2042 if (dump_enabled_p ())
2043 dump_printf_loc (MSG_NOTE, vect_location,
2044 "not vectorized: iteration count smaller than user "
2045 "specified loop bound parameter or minimum profitable "
2046 "iterations (whichever is more conservative).\n");
2047 goto again;
2050 estimated_niter
2051 = estimated_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
2052 if (estimated_niter != -1
2053 && ((unsigned HOST_WIDE_INT) estimated_niter
2054 <= MAX (th, (unsigned)min_profitable_estimate)))
2056 if (dump_enabled_p ())
2057 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2058 "not vectorized: estimated iteration count too "
2059 "small.\n");
2060 if (dump_enabled_p ())
2061 dump_printf_loc (MSG_NOTE, vect_location,
2062 "not vectorized: estimated iteration count smaller "
2063 "than specified loop bound parameter or minimum "
2064 "profitable iterations (whichever is more "
2065 "conservative).\n");
2066 goto again;
2069 /* Decide whether we need to create an epilogue loop to handle
2070 remaining scalar iterations. */
2071 th = ((LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) + 1)
2072 / LOOP_VINFO_VECT_FACTOR (loop_vinfo))
2073 * LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2075 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2076 && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) > 0)
2078 if (ctz_hwi (LOOP_VINFO_INT_NITERS (loop_vinfo)
2079 - LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo))
2080 < exact_log2 (LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
2081 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
2083 else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
2084 || (tree_ctz (LOOP_VINFO_NITERS (loop_vinfo))
2085 < (unsigned)exact_log2 (LOOP_VINFO_VECT_FACTOR (loop_vinfo))
2086 /* In case of versioning, check if the maximum number of
2087 iterations is greater than th. If they are identical,
2088 the epilogue is unnecessary. */
2089 && ((!LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo)
2090 && !LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
2091 || (unsigned HOST_WIDE_INT) max_niter > th)))
2092 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
2094 /* If an epilogue loop is required make sure we can create one. */
2095 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2096 || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo))
2098 if (dump_enabled_p ())
2099 dump_printf_loc (MSG_NOTE, vect_location, "epilog loop required\n");
2100 if (!vect_can_advance_ivs_p (loop_vinfo)
2101 || !slpeel_can_duplicate_loop_p (LOOP_VINFO_LOOP (loop_vinfo),
2102 single_exit (LOOP_VINFO_LOOP
2103 (loop_vinfo))))
2105 if (dump_enabled_p ())
2106 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2107 "not vectorized: can't create required "
2108 "epilog loop\n");
2109 goto again;
2113 gcc_assert (vectorization_factor
2114 == (unsigned)LOOP_VINFO_VECT_FACTOR (loop_vinfo));
2116 /* Ok to vectorize! */
2117 return true;
2119 again:
2120 /* Try again with SLP forced off but if we didn't do any SLP there is
2121 no point in re-trying. */
2122 if (!slp)
2123 return false;
2125 /* Likewise if the grouped loads or stores in the SLP cannot be handled
2126 via interleaving or lane instructions or if there were any SLP
2127 reductions. */
2128 slp_instance instance;
2129 slp_tree node;
2130 unsigned i, j;
2131 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
2133 stmt_vec_info vinfo;
2134 vinfo = vinfo_for_stmt
2135 (SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (instance))[0]);
2136 if (! STMT_VINFO_GROUPED_ACCESS (vinfo))
2137 return false;
2138 vinfo = vinfo_for_stmt (STMT_VINFO_GROUP_FIRST_ELEMENT (vinfo));
2139 unsigned int size = STMT_VINFO_GROUP_SIZE (vinfo);
2140 tree vectype = STMT_VINFO_VECTYPE (vinfo);
2141 if (! vect_store_lanes_supported (vectype, size)
2142 && ! vect_grouped_store_supported (vectype, size))
2143 return false;
2144 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), j, node)
2146 vinfo = vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (node)[0]);
2147 vinfo = vinfo_for_stmt (STMT_VINFO_GROUP_FIRST_ELEMENT (vinfo));
2148 size = STMT_VINFO_GROUP_SIZE (vinfo);
2149 vectype = STMT_VINFO_VECTYPE (vinfo);
2150 if (! vect_load_lanes_supported (vectype, size)
2151 && ! vect_grouped_load_supported (vectype, size))
2152 return false;
2156 if (dump_enabled_p ())
2157 dump_printf_loc (MSG_NOTE, vect_location,
2158 "re-trying with SLP disabled\n");
2160 /* Roll back state appropriately. No SLP this time. */
2161 slp = false;
2162 /* Restore vectorization factor as it were without SLP. */
2163 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = saved_vectorization_factor;
2164 /* Free the SLP instances. */
2165 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), j, instance)
2166 vect_free_slp_instance (instance);
2167 LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
2168 /* Reset SLP type to loop_vect on all stmts. */
2169 for (i = 0; i < LOOP_VINFO_LOOP (loop_vinfo)->num_nodes; ++i)
2171 basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
2172 for (gimple_stmt_iterator si = gsi_start_bb (bb);
2173 !gsi_end_p (si); gsi_next (&si))
2175 stmt_vec_info stmt_info = vinfo_for_stmt (gsi_stmt (si));
2176 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
2178 gcc_assert (STMT_SLP_TYPE (stmt_info) == loop_vect);
2179 stmt_info = vinfo_for_stmt (STMT_VINFO_RELATED_STMT (stmt_info));
2181 STMT_SLP_TYPE (stmt_info) = loop_vect;
2184 /* Free optimized alias test DDRS. */
2185 LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).release ();
2186 /* Reset target cost data. */
2187 destroy_cost_data (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo));
2188 LOOP_VINFO_TARGET_COST_DATA (loop_vinfo)
2189 = init_cost (LOOP_VINFO_LOOP (loop_vinfo));
2190 /* Reset assorted flags. */
2191 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
2192 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = 0;
2194 goto start_over;
2197 /* Function vect_analyze_loop.
2199 Apply a set of analyses on LOOP, and create a loop_vec_info struct
2200 for it. The different analyses will record information in the
2201 loop_vec_info struct. */
2202 loop_vec_info
2203 vect_analyze_loop (struct loop *loop)
2205 loop_vec_info loop_vinfo;
2206 unsigned int vector_sizes;
2208 /* Autodetect first vector size we try. */
2209 current_vector_size = 0;
2210 vector_sizes = targetm.vectorize.autovectorize_vector_sizes ();
2212 if (dump_enabled_p ())
2213 dump_printf_loc (MSG_NOTE, vect_location,
2214 "===== analyze_loop_nest =====\n");
2216 if (loop_outer (loop)
2217 && loop_vec_info_for_loop (loop_outer (loop))
2218 && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop))))
2220 if (dump_enabled_p ())
2221 dump_printf_loc (MSG_NOTE, vect_location,
2222 "outer-loop already vectorized.\n");
2223 return NULL;
2226 while (1)
2228 /* Check the CFG characteristics of the loop (nesting, entry/exit). */
2229 loop_vinfo = vect_analyze_loop_form (loop);
2230 if (!loop_vinfo)
2232 if (dump_enabled_p ())
2233 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2234 "bad loop form.\n");
2235 return NULL;
2238 bool fatal = false;
2239 if (vect_analyze_loop_2 (loop_vinfo, fatal))
2241 LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1;
2243 return loop_vinfo;
2246 destroy_loop_vec_info (loop_vinfo, true);
2248 vector_sizes &= ~current_vector_size;
2249 if (fatal
2250 || vector_sizes == 0
2251 || current_vector_size == 0)
2252 return NULL;
2254 /* Try the next biggest vector size. */
2255 current_vector_size = 1 << floor_log2 (vector_sizes);
2256 if (dump_enabled_p ())
2257 dump_printf_loc (MSG_NOTE, vect_location,
2258 "***** Re-trying analysis with "
2259 "vector size %d\n", current_vector_size);
2264 /* Function reduction_code_for_scalar_code
2266 Input:
2267 CODE - tree_code of a reduction operations.
2269 Output:
2270 REDUC_CODE - the corresponding tree-code to be used to reduce the
2271 vector of partial results into a single scalar result, or ERROR_MARK
2272 if the operation is a supported reduction operation, but does not have
2273 such a tree-code.
2275 Return FALSE if CODE currently cannot be vectorized as reduction. */
2277 static bool
2278 reduction_code_for_scalar_code (enum tree_code code,
2279 enum tree_code *reduc_code)
2281 switch (code)
2283 case MAX_EXPR:
2284 *reduc_code = REDUC_MAX_EXPR;
2285 return true;
2287 case MIN_EXPR:
2288 *reduc_code = REDUC_MIN_EXPR;
2289 return true;
2291 case PLUS_EXPR:
2292 *reduc_code = REDUC_PLUS_EXPR;
2293 return true;
2295 case MULT_EXPR:
2296 case MINUS_EXPR:
2297 case BIT_IOR_EXPR:
2298 case BIT_XOR_EXPR:
2299 case BIT_AND_EXPR:
2300 *reduc_code = ERROR_MARK;
2301 return true;
2303 default:
2304 return false;
2309 /* Error reporting helper for vect_is_simple_reduction below. GIMPLE statement
2310 STMT is printed with a message MSG. */
2312 static void
2313 report_vect_op (int msg_type, gimple *stmt, const char *msg)
2315 dump_printf_loc (msg_type, vect_location, "%s", msg);
2316 dump_gimple_stmt (msg_type, TDF_SLIM, stmt, 0);
2317 dump_printf (msg_type, "\n");
2321 /* Detect SLP reduction of the form:
2323 #a1 = phi <a5, a0>
2324 a2 = operation (a1)
2325 a3 = operation (a2)
2326 a4 = operation (a3)
2327 a5 = operation (a4)
2329 #a = phi <a5>
2331 PHI is the reduction phi node (#a1 = phi <a5, a0> above)
2332 FIRST_STMT is the first reduction stmt in the chain
2333 (a2 = operation (a1)).
2335 Return TRUE if a reduction chain was detected. */
2337 static bool
2338 vect_is_slp_reduction (loop_vec_info loop_info, gimple *phi,
2339 gimple *first_stmt)
2341 struct loop *loop = (gimple_bb (phi))->loop_father;
2342 struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
2343 enum tree_code code;
2344 gimple *current_stmt = NULL, *loop_use_stmt = NULL, *first, *next_stmt;
2345 stmt_vec_info use_stmt_info, current_stmt_info;
2346 tree lhs;
2347 imm_use_iterator imm_iter;
2348 use_operand_p use_p;
2349 int nloop_uses, size = 0, n_out_of_loop_uses;
2350 bool found = false;
2352 if (loop != vect_loop)
2353 return false;
2355 lhs = PHI_RESULT (phi);
2356 code = gimple_assign_rhs_code (first_stmt);
2357 while (1)
2359 nloop_uses = 0;
2360 n_out_of_loop_uses = 0;
2361 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
2363 gimple *use_stmt = USE_STMT (use_p);
2364 if (is_gimple_debug (use_stmt))
2365 continue;
2367 /* Check if we got back to the reduction phi. */
2368 if (use_stmt == phi)
2370 loop_use_stmt = use_stmt;
2371 found = true;
2372 break;
2375 if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2377 loop_use_stmt = use_stmt;
2378 nloop_uses++;
2380 else
2381 n_out_of_loop_uses++;
2383 /* There are can be either a single use in the loop or two uses in
2384 phi nodes. */
2385 if (nloop_uses > 1 || (n_out_of_loop_uses && nloop_uses))
2386 return false;
2389 if (found)
2390 break;
2392 /* We reached a statement with no loop uses. */
2393 if (nloop_uses == 0)
2394 return false;
2396 /* This is a loop exit phi, and we haven't reached the reduction phi. */
2397 if (gimple_code (loop_use_stmt) == GIMPLE_PHI)
2398 return false;
2400 if (!is_gimple_assign (loop_use_stmt)
2401 || code != gimple_assign_rhs_code (loop_use_stmt)
2402 || !flow_bb_inside_loop_p (loop, gimple_bb (loop_use_stmt)))
2403 return false;
2405 /* Insert USE_STMT into reduction chain. */
2406 use_stmt_info = vinfo_for_stmt (loop_use_stmt);
2407 if (current_stmt)
2409 current_stmt_info = vinfo_for_stmt (current_stmt);
2410 GROUP_NEXT_ELEMENT (current_stmt_info) = loop_use_stmt;
2411 GROUP_FIRST_ELEMENT (use_stmt_info)
2412 = GROUP_FIRST_ELEMENT (current_stmt_info);
2414 else
2415 GROUP_FIRST_ELEMENT (use_stmt_info) = loop_use_stmt;
2417 lhs = gimple_assign_lhs (loop_use_stmt);
2418 current_stmt = loop_use_stmt;
2419 size++;
2422 if (!found || loop_use_stmt != phi || size < 2)
2423 return false;
2425 /* Swap the operands, if needed, to make the reduction operand be the second
2426 operand. */
2427 lhs = PHI_RESULT (phi);
2428 next_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (current_stmt));
2429 while (next_stmt)
2431 if (gimple_assign_rhs2 (next_stmt) == lhs)
2433 tree op = gimple_assign_rhs1 (next_stmt);
2434 gimple *def_stmt = NULL;
2436 if (TREE_CODE (op) == SSA_NAME)
2437 def_stmt = SSA_NAME_DEF_STMT (op);
2439 /* Check that the other def is either defined in the loop
2440 ("vect_internal_def"), or it's an induction (defined by a
2441 loop-header phi-node). */
2442 if (def_stmt
2443 && gimple_bb (def_stmt)
2444 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2445 && (is_gimple_assign (def_stmt)
2446 || is_gimple_call (def_stmt)
2447 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2448 == vect_induction_def
2449 || (gimple_code (def_stmt) == GIMPLE_PHI
2450 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2451 == vect_internal_def
2452 && !is_loop_header_bb_p (gimple_bb (def_stmt)))))
2454 lhs = gimple_assign_lhs (next_stmt);
2455 next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
2456 continue;
2459 return false;
2461 else
2463 tree op = gimple_assign_rhs2 (next_stmt);
2464 gimple *def_stmt = NULL;
2466 if (TREE_CODE (op) == SSA_NAME)
2467 def_stmt = SSA_NAME_DEF_STMT (op);
2469 /* Check that the other def is either defined in the loop
2470 ("vect_internal_def"), or it's an induction (defined by a
2471 loop-header phi-node). */
2472 if (def_stmt
2473 && gimple_bb (def_stmt)
2474 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2475 && (is_gimple_assign (def_stmt)
2476 || is_gimple_call (def_stmt)
2477 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2478 == vect_induction_def
2479 || (gimple_code (def_stmt) == GIMPLE_PHI
2480 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2481 == vect_internal_def
2482 && !is_loop_header_bb_p (gimple_bb (def_stmt)))))
2484 if (dump_enabled_p ())
2486 dump_printf_loc (MSG_NOTE, vect_location, "swapping oprnds: ");
2487 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, next_stmt, 0);
2488 dump_printf (MSG_NOTE, "\n");
2491 swap_ssa_operands (next_stmt,
2492 gimple_assign_rhs1_ptr (next_stmt),
2493 gimple_assign_rhs2_ptr (next_stmt));
2494 update_stmt (next_stmt);
2496 if (CONSTANT_CLASS_P (gimple_assign_rhs1 (next_stmt)))
2497 LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true;
2499 else
2500 return false;
2503 lhs = gimple_assign_lhs (next_stmt);
2504 next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
2507 /* Save the chain for further analysis in SLP detection. */
2508 first = GROUP_FIRST_ELEMENT (vinfo_for_stmt (current_stmt));
2509 LOOP_VINFO_REDUCTION_CHAINS (loop_info).safe_push (first);
2510 GROUP_SIZE (vinfo_for_stmt (first)) = size;
2512 return true;
2516 /* Function vect_is_simple_reduction_1
2518 (1) Detect a cross-iteration def-use cycle that represents a simple
2519 reduction computation. We look for the following pattern:
2521 loop_header:
2522 a1 = phi < a0, a2 >
2523 a3 = ...
2524 a2 = operation (a3, a1)
2528 a3 = ...
2529 loop_header:
2530 a1 = phi < a0, a2 >
2531 a2 = operation (a3, a1)
2533 such that:
2534 1. operation is commutative and associative and it is safe to
2535 change the order of the computation (if CHECK_REDUCTION is true)
2536 2. no uses for a2 in the loop (a2 is used out of the loop)
2537 3. no uses of a1 in the loop besides the reduction operation
2538 4. no uses of a1 outside the loop.
2540 Conditions 1,4 are tested here.
2541 Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.
2543 (2) Detect a cross-iteration def-use cycle in nested loops, i.e.,
2544 nested cycles, if CHECK_REDUCTION is false.
2546 (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double
2547 reductions:
2549 a1 = phi < a0, a2 >
2550 inner loop (def of a3)
2551 a2 = phi < a3 >
2553 (4) Detect condition expressions, ie:
2554 for (int i = 0; i < N; i++)
2555 if (a[i] < val)
2556 ret_val = a[i];
2560 static gimple *
2561 vect_is_simple_reduction (loop_vec_info loop_info, gimple *phi,
2562 bool check_reduction, bool *double_reduc,
2563 bool need_wrapping_integral_overflow,
2564 enum vect_reduction_type *v_reduc_type)
2566 struct loop *loop = (gimple_bb (phi))->loop_father;
2567 struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
2568 edge latch_e = loop_latch_edge (loop);
2569 tree loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
2570 gimple *def_stmt, *def1 = NULL, *def2 = NULL;
2571 enum tree_code orig_code, code;
2572 tree op1, op2, op3 = NULL_TREE, op4 = NULL_TREE;
2573 tree type;
2574 int nloop_uses;
2575 tree name;
2576 imm_use_iterator imm_iter;
2577 use_operand_p use_p;
2578 bool phi_def;
2580 *double_reduc = false;
2581 *v_reduc_type = TREE_CODE_REDUCTION;
2583 /* If CHECK_REDUCTION is true, we assume inner-most loop vectorization,
2584 otherwise, we assume outer loop vectorization. */
2585 gcc_assert ((check_reduction && loop == vect_loop)
2586 || (!check_reduction && flow_loop_nested_p (vect_loop, loop)));
2588 name = PHI_RESULT (phi);
2589 /* ??? If there are no uses of the PHI result the inner loop reduction
2590 won't be detected as possibly double-reduction by vectorizable_reduction
2591 because that tries to walk the PHI arg from the preheader edge which
2592 can be constant. See PR60382. */
2593 if (has_zero_uses (name))
2594 return NULL;
2595 nloop_uses = 0;
2596 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name)
2598 gimple *use_stmt = USE_STMT (use_p);
2599 if (is_gimple_debug (use_stmt))
2600 continue;
2602 if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2604 if (dump_enabled_p ())
2605 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2606 "intermediate value used outside loop.\n");
2608 return NULL;
2611 nloop_uses++;
2612 if (nloop_uses > 1)
2614 if (dump_enabled_p ())
2615 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2616 "reduction used in loop.\n");
2617 return NULL;
2621 if (TREE_CODE (loop_arg) != SSA_NAME)
2623 if (dump_enabled_p ())
2625 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2626 "reduction: not ssa_name: ");
2627 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, loop_arg);
2628 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
2630 return NULL;
2633 def_stmt = SSA_NAME_DEF_STMT (loop_arg);
2634 if (!def_stmt)
2636 if (dump_enabled_p ())
2637 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2638 "reduction: no def_stmt.\n");
2639 return NULL;
2642 if (!is_gimple_assign (def_stmt) && gimple_code (def_stmt) != GIMPLE_PHI)
2644 if (dump_enabled_p ())
2646 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, def_stmt, 0);
2647 dump_printf (MSG_NOTE, "\n");
2649 return NULL;
2652 if (is_gimple_assign (def_stmt))
2654 name = gimple_assign_lhs (def_stmt);
2655 phi_def = false;
2657 else
2659 name = PHI_RESULT (def_stmt);
2660 phi_def = true;
2663 nloop_uses = 0;
2664 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name)
2666 gimple *use_stmt = USE_STMT (use_p);
2667 if (is_gimple_debug (use_stmt))
2668 continue;
2669 if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2670 nloop_uses++;
2671 if (nloop_uses > 1)
2673 if (dump_enabled_p ())
2674 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2675 "reduction used in loop.\n");
2676 return NULL;
2680 /* If DEF_STMT is a phi node itself, we expect it to have a single argument
2681 defined in the inner loop. */
2682 if (phi_def)
2684 op1 = PHI_ARG_DEF (def_stmt, 0);
2686 if (gimple_phi_num_args (def_stmt) != 1
2687 || TREE_CODE (op1) != SSA_NAME)
2689 if (dump_enabled_p ())
2690 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2691 "unsupported phi node definition.\n");
2693 return NULL;
2696 def1 = SSA_NAME_DEF_STMT (op1);
2697 if (gimple_bb (def1)
2698 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2699 && loop->inner
2700 && flow_bb_inside_loop_p (loop->inner, gimple_bb (def1))
2701 && is_gimple_assign (def1))
2703 if (dump_enabled_p ())
2704 report_vect_op (MSG_NOTE, def_stmt,
2705 "detected double reduction: ");
2707 *double_reduc = true;
2708 return def_stmt;
2711 return NULL;
2714 code = orig_code = gimple_assign_rhs_code (def_stmt);
2716 /* We can handle "res -= x[i]", which is non-associative by
2717 simply rewriting this into "res += -x[i]". Avoid changing
2718 gimple instruction for the first simple tests and only do this
2719 if we're allowed to change code at all. */
2720 if (code == MINUS_EXPR
2721 && (op1 = gimple_assign_rhs1 (def_stmt))
2722 && TREE_CODE (op1) == SSA_NAME
2723 && SSA_NAME_DEF_STMT (op1) == phi)
2724 code = PLUS_EXPR;
2726 if (check_reduction)
2728 if (code == COND_EXPR)
2729 *v_reduc_type = COND_REDUCTION;
2730 else if (!commutative_tree_code (code) || !associative_tree_code (code))
2732 if (dump_enabled_p ())
2733 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2734 "reduction: not commutative/associative: ");
2735 return NULL;
2739 if (get_gimple_rhs_class (code) != GIMPLE_BINARY_RHS)
2741 if (code != COND_EXPR)
2743 if (dump_enabled_p ())
2744 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2745 "reduction: not binary operation: ");
2747 return NULL;
2750 op3 = gimple_assign_rhs1 (def_stmt);
2751 if (COMPARISON_CLASS_P (op3))
2753 op4 = TREE_OPERAND (op3, 1);
2754 op3 = TREE_OPERAND (op3, 0);
2757 op1 = gimple_assign_rhs2 (def_stmt);
2758 op2 = gimple_assign_rhs3 (def_stmt);
2760 if (TREE_CODE (op1) != SSA_NAME && TREE_CODE (op2) != SSA_NAME)
2762 if (dump_enabled_p ())
2763 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2764 "reduction: uses not ssa_names: ");
2766 return NULL;
2769 else
2771 op1 = gimple_assign_rhs1 (def_stmt);
2772 op2 = gimple_assign_rhs2 (def_stmt);
2774 if (TREE_CODE (op1) != SSA_NAME && TREE_CODE (op2) != SSA_NAME)
2776 if (dump_enabled_p ())
2777 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2778 "reduction: uses not ssa_names: ");
2780 return NULL;
2784 type = TREE_TYPE (gimple_assign_lhs (def_stmt));
2785 if ((TREE_CODE (op1) == SSA_NAME
2786 && !types_compatible_p (type,TREE_TYPE (op1)))
2787 || (TREE_CODE (op2) == SSA_NAME
2788 && !types_compatible_p (type, TREE_TYPE (op2)))
2789 || (op3 && TREE_CODE (op3) == SSA_NAME
2790 && !types_compatible_p (type, TREE_TYPE (op3)))
2791 || (op4 && TREE_CODE (op4) == SSA_NAME
2792 && !types_compatible_p (type, TREE_TYPE (op4))))
2794 if (dump_enabled_p ())
2796 dump_printf_loc (MSG_NOTE, vect_location,
2797 "reduction: multiple types: operation type: ");
2798 dump_generic_expr (MSG_NOTE, TDF_SLIM, type);
2799 dump_printf (MSG_NOTE, ", operands types: ");
2800 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2801 TREE_TYPE (op1));
2802 dump_printf (MSG_NOTE, ",");
2803 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2804 TREE_TYPE (op2));
2805 if (op3)
2807 dump_printf (MSG_NOTE, ",");
2808 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2809 TREE_TYPE (op3));
2812 if (op4)
2814 dump_printf (MSG_NOTE, ",");
2815 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2816 TREE_TYPE (op4));
2818 dump_printf (MSG_NOTE, "\n");
2821 return NULL;
2824 /* Check that it's ok to change the order of the computation.
2825 Generally, when vectorizing a reduction we change the order of the
2826 computation. This may change the behavior of the program in some
2827 cases, so we need to check that this is ok. One exception is when
2828 vectorizing an outer-loop: the inner-loop is executed sequentially,
2829 and therefore vectorizing reductions in the inner-loop during
2830 outer-loop vectorization is safe. */
2832 if (*v_reduc_type != COND_REDUCTION)
2834 /* CHECKME: check for !flag_finite_math_only too? */
2835 if (SCALAR_FLOAT_TYPE_P (type) && !flag_associative_math
2836 && check_reduction)
2838 /* Changing the order of operations changes the semantics. */
2839 if (dump_enabled_p ())
2840 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2841 "reduction: unsafe fp math optimization: ");
2842 return NULL;
2844 else if (INTEGRAL_TYPE_P (type) && check_reduction)
2846 if (!operation_no_trapping_overflow (type, code))
2848 /* Changing the order of operations changes the semantics. */
2849 if (dump_enabled_p ())
2850 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2851 "reduction: unsafe int math optimization"
2852 " (overflow traps): ");
2853 return NULL;
2855 if (need_wrapping_integral_overflow
2856 && !TYPE_OVERFLOW_WRAPS (type)
2857 && operation_can_overflow (code))
2859 /* Changing the order of operations changes the semantics. */
2860 if (dump_enabled_p ())
2861 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2862 "reduction: unsafe int math optimization"
2863 " (overflow doesn't wrap): ");
2864 return NULL;
2867 else if (SAT_FIXED_POINT_TYPE_P (type) && check_reduction)
2869 /* Changing the order of operations changes the semantics. */
2870 if (dump_enabled_p ())
2871 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2872 "reduction: unsafe fixed-point math optimization: ");
2873 return NULL;
2877 /* Reduction is safe. We're dealing with one of the following:
2878 1) integer arithmetic and no trapv
2879 2) floating point arithmetic, and special flags permit this optimization
2880 3) nested cycle (i.e., outer loop vectorization). */
2881 if (TREE_CODE (op1) == SSA_NAME)
2882 def1 = SSA_NAME_DEF_STMT (op1);
2884 if (TREE_CODE (op2) == SSA_NAME)
2885 def2 = SSA_NAME_DEF_STMT (op2);
2887 if (code != COND_EXPR
2888 && ((!def1 || gimple_nop_p (def1)) && (!def2 || gimple_nop_p (def2))))
2890 if (dump_enabled_p ())
2891 report_vect_op (MSG_NOTE, def_stmt, "reduction: no defs for operands: ");
2892 return NULL;
2895 /* Check that one def is the reduction def, defined by PHI,
2896 the other def is either defined in the loop ("vect_internal_def"),
2897 or it's an induction (defined by a loop-header phi-node). */
2899 if (def2 && def2 == phi
2900 && (code == COND_EXPR
2901 || !def1 || gimple_nop_p (def1)
2902 || !flow_bb_inside_loop_p (loop, gimple_bb (def1))
2903 || (def1 && flow_bb_inside_loop_p (loop, gimple_bb (def1))
2904 && (is_gimple_assign (def1)
2905 || is_gimple_call (def1)
2906 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1))
2907 == vect_induction_def
2908 || (gimple_code (def1) == GIMPLE_PHI
2909 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1))
2910 == vect_internal_def
2911 && !is_loop_header_bb_p (gimple_bb (def1)))))))
2913 if (dump_enabled_p ())
2914 report_vect_op (MSG_NOTE, def_stmt, "detected reduction: ");
2915 return def_stmt;
2918 if (def1 && def1 == phi
2919 && (code == COND_EXPR
2920 || !def2 || gimple_nop_p (def2)
2921 || !flow_bb_inside_loop_p (loop, gimple_bb (def2))
2922 || (def2 && flow_bb_inside_loop_p (loop, gimple_bb (def2))
2923 && (is_gimple_assign (def2)
2924 || is_gimple_call (def2)
2925 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2))
2926 == vect_induction_def
2927 || (gimple_code (def2) == GIMPLE_PHI
2928 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2))
2929 == vect_internal_def
2930 && !is_loop_header_bb_p (gimple_bb (def2)))))))
2932 if (check_reduction
2933 && orig_code != MINUS_EXPR)
2935 if (code == COND_EXPR)
2937 /* No current known use where this case would be useful. */
2938 if (dump_enabled_p ())
2939 report_vect_op (MSG_NOTE, def_stmt,
2940 "detected reduction: cannot currently swap "
2941 "operands for cond_expr");
2942 return NULL;
2945 /* Swap operands (just for simplicity - so that the rest of the code
2946 can assume that the reduction variable is always the last (second)
2947 argument). */
2948 if (dump_enabled_p ())
2949 report_vect_op (MSG_NOTE, def_stmt,
2950 "detected reduction: need to swap operands: ");
2952 swap_ssa_operands (def_stmt, gimple_assign_rhs1_ptr (def_stmt),
2953 gimple_assign_rhs2_ptr (def_stmt));
2955 if (CONSTANT_CLASS_P (gimple_assign_rhs1 (def_stmt)))
2956 LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true;
2958 else
2960 if (dump_enabled_p ())
2961 report_vect_op (MSG_NOTE, def_stmt, "detected reduction: ");
2964 return def_stmt;
2967 /* Try to find SLP reduction chain. */
2968 if (check_reduction && code != COND_EXPR
2969 && vect_is_slp_reduction (loop_info, phi, def_stmt))
2971 if (dump_enabled_p ())
2972 report_vect_op (MSG_NOTE, def_stmt,
2973 "reduction: detected reduction chain: ");
2975 return def_stmt;
2978 if (dump_enabled_p ())
2979 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2980 "reduction: unknown pattern: ");
2982 return NULL;
2985 /* Wrapper around vect_is_simple_reduction_1, which will modify code
2986 in-place if it enables detection of more reductions. Arguments
2987 as there. */
2989 gimple *
2990 vect_force_simple_reduction (loop_vec_info loop_info, gimple *phi,
2991 bool check_reduction, bool *double_reduc,
2992 bool need_wrapping_integral_overflow)
2994 enum vect_reduction_type v_reduc_type;
2995 return vect_is_simple_reduction (loop_info, phi, check_reduction,
2996 double_reduc,
2997 need_wrapping_integral_overflow,
2998 &v_reduc_type);
3001 /* Calculate cost of peeling the loop PEEL_ITERS_PROLOGUE times. */
3003 vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue,
3004 int *peel_iters_epilogue,
3005 stmt_vector_for_cost *scalar_cost_vec,
3006 stmt_vector_for_cost *prologue_cost_vec,
3007 stmt_vector_for_cost *epilogue_cost_vec)
3009 int retval = 0;
3010 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3012 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
3014 *peel_iters_epilogue = vf/2;
3015 if (dump_enabled_p ())
3016 dump_printf_loc (MSG_NOTE, vect_location,
3017 "cost model: epilogue peel iters set to vf/2 "
3018 "because loop iterations are unknown .\n");
3020 /* If peeled iterations are known but number of scalar loop
3021 iterations are unknown, count a taken branch per peeled loop. */
3022 retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
3023 NULL, 0, vect_prologue);
3024 retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
3025 NULL, 0, vect_epilogue);
3027 else
3029 int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
3030 peel_iters_prologue = niters < peel_iters_prologue ?
3031 niters : peel_iters_prologue;
3032 *peel_iters_epilogue = (niters - peel_iters_prologue) % vf;
3033 /* If we need to peel for gaps, but no peeling is required, we have to
3034 peel VF iterations. */
3035 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) && !*peel_iters_epilogue)
3036 *peel_iters_epilogue = vf;
3039 stmt_info_for_cost *si;
3040 int j;
3041 if (peel_iters_prologue)
3042 FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
3043 retval += record_stmt_cost (prologue_cost_vec,
3044 si->count * peel_iters_prologue,
3045 si->kind, NULL, si->misalign,
3046 vect_prologue);
3047 if (*peel_iters_epilogue)
3048 FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
3049 retval += record_stmt_cost (epilogue_cost_vec,
3050 si->count * *peel_iters_epilogue,
3051 si->kind, NULL, si->misalign,
3052 vect_epilogue);
3054 return retval;
3057 /* Function vect_estimate_min_profitable_iters
3059 Return the number of iterations required for the vector version of the
3060 loop to be profitable relative to the cost of the scalar version of the
3061 loop. */
3063 static void
3064 vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
3065 int *ret_min_profitable_niters,
3066 int *ret_min_profitable_estimate)
3068 int min_profitable_iters;
3069 int min_profitable_estimate;
3070 int peel_iters_prologue;
3071 int peel_iters_epilogue;
3072 unsigned vec_inside_cost = 0;
3073 int vec_outside_cost = 0;
3074 unsigned vec_prologue_cost = 0;
3075 unsigned vec_epilogue_cost = 0;
3076 int scalar_single_iter_cost = 0;
3077 int scalar_outside_cost = 0;
3078 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3079 int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
3080 void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3082 /* Cost model disabled. */
3083 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
3085 dump_printf_loc (MSG_NOTE, vect_location, "cost model disabled.\n");
3086 *ret_min_profitable_niters = 0;
3087 *ret_min_profitable_estimate = 0;
3088 return;
3091 /* Requires loop versioning tests to handle misalignment. */
3092 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
3094 /* FIXME: Make cost depend on complexity of individual check. */
3095 unsigned len = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ();
3096 (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
3097 vect_prologue);
3098 dump_printf (MSG_NOTE,
3099 "cost model: Adding cost of checks for loop "
3100 "versioning to treat misalignment.\n");
3103 /* Requires loop versioning with alias checks. */
3104 if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
3106 /* FIXME: Make cost depend on complexity of individual check. */
3107 unsigned len = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).length ();
3108 (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
3109 vect_prologue);
3110 dump_printf (MSG_NOTE,
3111 "cost model: Adding cost of checks for loop "
3112 "versioning aliasing.\n");
3115 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
3116 || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
3117 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken, NULL, 0,
3118 vect_prologue);
3120 /* Count statements in scalar loop. Using this as scalar cost for a single
3121 iteration for now.
3123 TODO: Add outer loop support.
3125 TODO: Consider assigning different costs to different scalar
3126 statements. */
3128 scalar_single_iter_cost
3129 = LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo);
3131 /* Add additional cost for the peeled instructions in prologue and epilogue
3132 loop.
3134 FORNOW: If we don't know the value of peel_iters for prologue or epilogue
3135 at compile-time - we assume it's vf/2 (the worst would be vf-1).
3137 TODO: Build an expression that represents peel_iters for prologue and
3138 epilogue to be used in a run-time test. */
3140 if (npeel < 0)
3142 peel_iters_prologue = vf/2;
3143 dump_printf (MSG_NOTE, "cost model: "
3144 "prologue peel iters set to vf/2.\n");
3146 /* If peeling for alignment is unknown, loop bound of main loop becomes
3147 unknown. */
3148 peel_iters_epilogue = vf/2;
3149 dump_printf (MSG_NOTE, "cost model: "
3150 "epilogue peel iters set to vf/2 because "
3151 "peeling for alignment is unknown.\n");
3153 /* If peeled iterations are unknown, count a taken branch and a not taken
3154 branch per peeled loop. Even if scalar loop iterations are known,
3155 vector iterations are not known since peeled prologue iterations are
3156 not known. Hence guards remain the same. */
3157 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
3158 NULL, 0, vect_prologue);
3159 (void) add_stmt_cost (target_cost_data, 1, cond_branch_not_taken,
3160 NULL, 0, vect_prologue);
3161 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
3162 NULL, 0, vect_epilogue);
3163 (void) add_stmt_cost (target_cost_data, 1, cond_branch_not_taken,
3164 NULL, 0, vect_epilogue);
3165 stmt_info_for_cost *si;
3166 int j;
3167 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
3169 struct _stmt_vec_info *stmt_info
3170 = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
3171 (void) add_stmt_cost (target_cost_data,
3172 si->count * peel_iters_prologue,
3173 si->kind, stmt_info, si->misalign,
3174 vect_prologue);
3175 (void) add_stmt_cost (target_cost_data,
3176 si->count * peel_iters_epilogue,
3177 si->kind, stmt_info, si->misalign,
3178 vect_epilogue);
3181 else
3183 stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
3184 stmt_info_for_cost *si;
3185 int j;
3186 void *data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3188 prologue_cost_vec.create (2);
3189 epilogue_cost_vec.create (2);
3190 peel_iters_prologue = npeel;
3192 (void) vect_get_known_peeling_cost (loop_vinfo, peel_iters_prologue,
3193 &peel_iters_epilogue,
3194 &LOOP_VINFO_SCALAR_ITERATION_COST
3195 (loop_vinfo),
3196 &prologue_cost_vec,
3197 &epilogue_cost_vec);
3199 FOR_EACH_VEC_ELT (prologue_cost_vec, j, si)
3201 struct _stmt_vec_info *stmt_info
3202 = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
3203 (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
3204 si->misalign, vect_prologue);
3207 FOR_EACH_VEC_ELT (epilogue_cost_vec, j, si)
3209 struct _stmt_vec_info *stmt_info
3210 = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
3211 (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
3212 si->misalign, vect_epilogue);
3215 prologue_cost_vec.release ();
3216 epilogue_cost_vec.release ();
3219 /* FORNOW: The scalar outside cost is incremented in one of the
3220 following ways:
3222 1. The vectorizer checks for alignment and aliasing and generates
3223 a condition that allows dynamic vectorization. A cost model
3224 check is ANDED with the versioning condition. Hence scalar code
3225 path now has the added cost of the versioning check.
3227 if (cost > th & versioning_check)
3228 jmp to vector code
3230 Hence run-time scalar is incremented by not-taken branch cost.
3232 2. The vectorizer then checks if a prologue is required. If the
3233 cost model check was not done before during versioning, it has to
3234 be done before the prologue check.
3236 if (cost <= th)
3237 prologue = scalar_iters
3238 if (prologue == 0)
3239 jmp to vector code
3240 else
3241 execute prologue
3242 if (prologue == num_iters)
3243 go to exit
3245 Hence the run-time scalar cost is incremented by a taken branch,
3246 plus a not-taken branch, plus a taken branch cost.
3248 3. The vectorizer then checks if an epilogue is required. If the
3249 cost model check was not done before during prologue check, it
3250 has to be done with the epilogue check.
3252 if (prologue == 0)
3253 jmp to vector code
3254 else
3255 execute prologue
3256 if (prologue == num_iters)
3257 go to exit
3258 vector code:
3259 if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
3260 jmp to epilogue
3262 Hence the run-time scalar cost should be incremented by 2 taken
3263 branches.
3265 TODO: The back end may reorder the BBS's differently and reverse
3266 conditions/branch directions. Change the estimates below to
3267 something more reasonable. */
3269 /* If the number of iterations is known and we do not do versioning, we can
3270 decide whether to vectorize at compile time. Hence the scalar version
3271 do not carry cost model guard costs. */
3272 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
3273 || LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
3274 || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
3276 /* Cost model check occurs at versioning. */
3277 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
3278 || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
3279 scalar_outside_cost += vect_get_stmt_cost (cond_branch_not_taken);
3280 else
3282 /* Cost model check occurs at prologue generation. */
3283 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
3284 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken)
3285 + vect_get_stmt_cost (cond_branch_not_taken);
3286 /* Cost model check occurs at epilogue generation. */
3287 else
3288 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken);
3292 /* Complete the target-specific cost calculations. */
3293 finish_cost (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo), &vec_prologue_cost,
3294 &vec_inside_cost, &vec_epilogue_cost);
3296 vec_outside_cost = (int)(vec_prologue_cost + vec_epilogue_cost);
3298 if (dump_enabled_p ())
3300 dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
3301 dump_printf (MSG_NOTE, " Vector inside of loop cost: %d\n",
3302 vec_inside_cost);
3303 dump_printf (MSG_NOTE, " Vector prologue cost: %d\n",
3304 vec_prologue_cost);
3305 dump_printf (MSG_NOTE, " Vector epilogue cost: %d\n",
3306 vec_epilogue_cost);
3307 dump_printf (MSG_NOTE, " Scalar iteration cost: %d\n",
3308 scalar_single_iter_cost);
3309 dump_printf (MSG_NOTE, " Scalar outside cost: %d\n",
3310 scalar_outside_cost);
3311 dump_printf (MSG_NOTE, " Vector outside cost: %d\n",
3312 vec_outside_cost);
3313 dump_printf (MSG_NOTE, " prologue iterations: %d\n",
3314 peel_iters_prologue);
3315 dump_printf (MSG_NOTE, " epilogue iterations: %d\n",
3316 peel_iters_epilogue);
3319 /* Calculate number of iterations required to make the vector version
3320 profitable, relative to the loop bodies only. The following condition
3321 must hold true:
3322 SIC * niters + SOC > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC
3323 where
3324 SIC = scalar iteration cost, VIC = vector iteration cost,
3325 VOC = vector outside cost, VF = vectorization factor,
3326 PL_ITERS = prologue iterations, EP_ITERS= epilogue iterations
3327 SOC = scalar outside cost for run time cost model check. */
3329 if ((scalar_single_iter_cost * vf) > (int) vec_inside_cost)
3331 if (vec_outside_cost <= 0)
3332 min_profitable_iters = 1;
3333 else
3335 min_profitable_iters = ((vec_outside_cost - scalar_outside_cost) * vf
3336 - vec_inside_cost * peel_iters_prologue
3337 - vec_inside_cost * peel_iters_epilogue)
3338 / ((scalar_single_iter_cost * vf)
3339 - vec_inside_cost);
3341 if ((scalar_single_iter_cost * vf * min_profitable_iters)
3342 <= (((int) vec_inside_cost * min_profitable_iters)
3343 + (((int) vec_outside_cost - scalar_outside_cost) * vf)))
3344 min_profitable_iters++;
3347 /* vector version will never be profitable. */
3348 else
3350 if (LOOP_VINFO_LOOP (loop_vinfo)->force_vectorize)
3351 warning_at (vect_location, OPT_Wopenmp_simd, "vectorization "
3352 "did not happen for a simd loop");
3354 if (dump_enabled_p ())
3355 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3356 "cost model: the vector iteration cost = %d "
3357 "divided by the scalar iteration cost = %d "
3358 "is greater or equal to the vectorization factor = %d"
3359 ".\n",
3360 vec_inside_cost, scalar_single_iter_cost, vf);
3361 *ret_min_profitable_niters = -1;
3362 *ret_min_profitable_estimate = -1;
3363 return;
3366 dump_printf (MSG_NOTE,
3367 " Calculated minimum iters for profitability: %d\n",
3368 min_profitable_iters);
3370 min_profitable_iters =
3371 min_profitable_iters < vf ? vf : min_profitable_iters;
3373 /* Because the condition we create is:
3374 if (niters <= min_profitable_iters)
3375 then skip the vectorized loop. */
3376 min_profitable_iters--;
3378 if (dump_enabled_p ())
3379 dump_printf_loc (MSG_NOTE, vect_location,
3380 " Runtime profitability threshold = %d\n",
3381 min_profitable_iters);
3383 *ret_min_profitable_niters = min_profitable_iters;
3385 /* Calculate number of iterations required to make the vector version
3386 profitable, relative to the loop bodies only.
3388 Non-vectorized variant is SIC * niters and it must win over vector
3389 variant on the expected loop trip count. The following condition must hold true:
3390 SIC * niters > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC + SOC */
3392 if (vec_outside_cost <= 0)
3393 min_profitable_estimate = 1;
3394 else
3396 min_profitable_estimate = ((vec_outside_cost + scalar_outside_cost) * vf
3397 - vec_inside_cost * peel_iters_prologue
3398 - vec_inside_cost * peel_iters_epilogue)
3399 / ((scalar_single_iter_cost * vf)
3400 - vec_inside_cost);
3402 min_profitable_estimate --;
3403 min_profitable_estimate = MAX (min_profitable_estimate, min_profitable_iters);
3404 if (dump_enabled_p ())
3405 dump_printf_loc (MSG_NOTE, vect_location,
3406 " Static estimate profitability threshold = %d\n",
3407 min_profitable_iters);
3409 *ret_min_profitable_estimate = min_profitable_estimate;
3412 /* Writes into SEL a mask for a vec_perm, equivalent to a vec_shr by OFFSET
3413 vector elements (not bits) for a vector of mode MODE. */
3414 static void
3415 calc_vec_perm_mask_for_shift (enum machine_mode mode, unsigned int offset,
3416 unsigned char *sel)
3418 unsigned int i, nelt = GET_MODE_NUNITS (mode);
3420 for (i = 0; i < nelt; i++)
3421 sel[i] = (i + offset) & (2*nelt - 1);
3424 /* Checks whether the target supports whole-vector shifts for vectors of mode
3425 MODE. This is the case if _either_ the platform handles vec_shr_optab, _or_
3426 it supports vec_perm_const with masks for all necessary shift amounts. */
3427 static bool
3428 have_whole_vector_shift (enum machine_mode mode)
3430 if (optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
3431 return true;
3433 if (direct_optab_handler (vec_perm_const_optab, mode) == CODE_FOR_nothing)
3434 return false;
3436 unsigned int i, nelt = GET_MODE_NUNITS (mode);
3437 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
3439 for (i = nelt/2; i >= 1; i/=2)
3441 calc_vec_perm_mask_for_shift (mode, i, sel);
3442 if (!can_vec_perm_p (mode, false, sel))
3443 return false;
3445 return true;
3448 /* Return the reduction operand (with index REDUC_INDEX) of STMT. */
3450 static tree
3451 get_reduction_op (gimple *stmt, int reduc_index)
3453 switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
3455 case GIMPLE_SINGLE_RHS:
3456 gcc_assert (TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt))
3457 == ternary_op);
3458 return TREE_OPERAND (gimple_assign_rhs1 (stmt), reduc_index);
3459 case GIMPLE_UNARY_RHS:
3460 return gimple_assign_rhs1 (stmt);
3461 case GIMPLE_BINARY_RHS:
3462 return (reduc_index
3463 ? gimple_assign_rhs2 (stmt) : gimple_assign_rhs1 (stmt));
3464 case GIMPLE_TERNARY_RHS:
3465 return gimple_op (stmt, reduc_index + 1);
3466 default:
3467 gcc_unreachable ();
3471 /* TODO: Close dependency between vect_model_*_cost and vectorizable_*
3472 functions. Design better to avoid maintenance issues. */
3474 /* Function vect_model_reduction_cost.
3476 Models cost for a reduction operation, including the vector ops
3477 generated within the strip-mine loop, the initial definition before
3478 the loop, and the epilogue code that must be generated. */
3480 static bool
3481 vect_model_reduction_cost (stmt_vec_info stmt_info, enum tree_code reduc_code,
3482 int ncopies, int reduc_index)
3484 int prologue_cost = 0, epilogue_cost = 0;
3485 enum tree_code code;
3486 optab optab;
3487 tree vectype;
3488 gimple *stmt, *orig_stmt;
3489 tree reduction_op;
3490 machine_mode mode;
3491 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3492 struct loop *loop = NULL;
3493 void *target_cost_data;
3495 if (loop_vinfo)
3497 loop = LOOP_VINFO_LOOP (loop_vinfo);
3498 target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3500 else
3501 target_cost_data = BB_VINFO_TARGET_COST_DATA (STMT_VINFO_BB_VINFO (stmt_info));
3503 /* Condition reductions generate two reductions in the loop. */
3504 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION)
3505 ncopies *= 2;
3507 /* Cost of reduction op inside loop. */
3508 unsigned inside_cost = add_stmt_cost (target_cost_data, ncopies, vector_stmt,
3509 stmt_info, 0, vect_body);
3510 stmt = STMT_VINFO_STMT (stmt_info);
3512 reduction_op = get_reduction_op (stmt, reduc_index);
3514 vectype = get_vectype_for_scalar_type (TREE_TYPE (reduction_op));
3515 if (!vectype)
3517 if (dump_enabled_p ())
3519 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3520 "unsupported data-type ");
3521 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
3522 TREE_TYPE (reduction_op));
3523 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3525 return false;
3528 mode = TYPE_MODE (vectype);
3529 orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
3531 if (!orig_stmt)
3532 orig_stmt = STMT_VINFO_STMT (stmt_info);
3534 code = gimple_assign_rhs_code (orig_stmt);
3536 /* Add in cost for initial definition.
3537 For cond reduction we have four vectors: initial index, step, initial
3538 result of the data reduction, initial value of the index reduction. */
3539 int prologue_stmts = STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
3540 == COND_REDUCTION ? 4 : 1;
3541 prologue_cost += add_stmt_cost (target_cost_data, prologue_stmts,
3542 scalar_to_vec, stmt_info, 0,
3543 vect_prologue);
3545 /* Determine cost of epilogue code.
3547 We have a reduction operator that will reduce the vector in one statement.
3548 Also requires scalar extract. */
3550 if (!loop || !nested_in_vect_loop_p (loop, orig_stmt))
3552 if (reduc_code != ERROR_MARK)
3554 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION)
3556 /* An EQ stmt and an COND_EXPR stmt. */
3557 epilogue_cost += add_stmt_cost (target_cost_data, 2,
3558 vector_stmt, stmt_info, 0,
3559 vect_epilogue);
3560 /* Reduction of the max index and a reduction of the found
3561 values. */
3562 epilogue_cost += add_stmt_cost (target_cost_data, 2,
3563 vec_to_scalar, stmt_info, 0,
3564 vect_epilogue);
3565 /* A broadcast of the max value. */
3566 epilogue_cost += add_stmt_cost (target_cost_data, 1,
3567 scalar_to_vec, stmt_info, 0,
3568 vect_epilogue);
3570 else
3572 epilogue_cost += add_stmt_cost (target_cost_data, 1, vector_stmt,
3573 stmt_info, 0, vect_epilogue);
3574 epilogue_cost += add_stmt_cost (target_cost_data, 1,
3575 vec_to_scalar, stmt_info, 0,
3576 vect_epilogue);
3579 else
3581 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
3582 tree bitsize =
3583 TYPE_SIZE (TREE_TYPE (gimple_assign_lhs (orig_stmt)));
3584 int element_bitsize = tree_to_uhwi (bitsize);
3585 int nelements = vec_size_in_bits / element_bitsize;
3587 optab = optab_for_tree_code (code, vectype, optab_default);
3589 /* We have a whole vector shift available. */
3590 if (VECTOR_MODE_P (mode)
3591 && optab_handler (optab, mode) != CODE_FOR_nothing
3592 && have_whole_vector_shift (mode))
3594 /* Final reduction via vector shifts and the reduction operator.
3595 Also requires scalar extract. */
3596 epilogue_cost += add_stmt_cost (target_cost_data,
3597 exact_log2 (nelements) * 2,
3598 vector_stmt, stmt_info, 0,
3599 vect_epilogue);
3600 epilogue_cost += add_stmt_cost (target_cost_data, 1,
3601 vec_to_scalar, stmt_info, 0,
3602 vect_epilogue);
3604 else
3605 /* Use extracts and reduction op for final reduction. For N
3606 elements, we have N extracts and N-1 reduction ops. */
3607 epilogue_cost += add_stmt_cost (target_cost_data,
3608 nelements + nelements - 1,
3609 vector_stmt, stmt_info, 0,
3610 vect_epilogue);
3614 if (dump_enabled_p ())
3615 dump_printf (MSG_NOTE,
3616 "vect_model_reduction_cost: inside_cost = %d, "
3617 "prologue_cost = %d, epilogue_cost = %d .\n", inside_cost,
3618 prologue_cost, epilogue_cost);
3620 return true;
3624 /* Function vect_model_induction_cost.
3626 Models cost for induction operations. */
3628 static void
3629 vect_model_induction_cost (stmt_vec_info stmt_info, int ncopies)
3631 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3632 void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3633 unsigned inside_cost, prologue_cost;
3635 /* loop cost for vec_loop. */
3636 inside_cost = add_stmt_cost (target_cost_data, ncopies, vector_stmt,
3637 stmt_info, 0, vect_body);
3639 /* prologue cost for vec_init and vec_step. */
3640 prologue_cost = add_stmt_cost (target_cost_data, 2, scalar_to_vec,
3641 stmt_info, 0, vect_prologue);
3643 if (dump_enabled_p ())
3644 dump_printf_loc (MSG_NOTE, vect_location,
3645 "vect_model_induction_cost: inside_cost = %d, "
3646 "prologue_cost = %d .\n", inside_cost, prologue_cost);
3650 /* Function get_initial_def_for_induction
3652 Input:
3653 STMT - a stmt that performs an induction operation in the loop.
3654 IV_PHI - the initial value of the induction variable
3656 Output:
3657 Return a vector variable, initialized with the first VF values of
3658 the induction variable. E.g., for an iv with IV_PHI='X' and
3659 evolution S, for a vector of 4 units, we want to return:
3660 [X, X + S, X + 2*S, X + 3*S]. */
3662 static tree
3663 get_initial_def_for_induction (gimple *iv_phi)
3665 stmt_vec_info stmt_vinfo = vinfo_for_stmt (iv_phi);
3666 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
3667 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3668 tree vectype;
3669 int nunits;
3670 edge pe = loop_preheader_edge (loop);
3671 struct loop *iv_loop;
3672 basic_block new_bb;
3673 tree new_vec, vec_init, vec_step, t;
3674 tree new_name;
3675 gimple *new_stmt;
3676 gphi *induction_phi;
3677 tree induc_def, vec_def, vec_dest;
3678 tree init_expr, step_expr;
3679 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3680 int i;
3681 int ncopies;
3682 tree expr;
3683 stmt_vec_info phi_info = vinfo_for_stmt (iv_phi);
3684 bool nested_in_vect_loop = false;
3685 gimple_seq stmts;
3686 imm_use_iterator imm_iter;
3687 use_operand_p use_p;
3688 gimple *exit_phi;
3689 edge latch_e;
3690 tree loop_arg;
3691 gimple_stmt_iterator si;
3692 basic_block bb = gimple_bb (iv_phi);
3693 tree stepvectype;
3694 tree resvectype;
3696 /* Is phi in an inner-loop, while vectorizing an enclosing outer-loop? */
3697 if (nested_in_vect_loop_p (loop, iv_phi))
3699 nested_in_vect_loop = true;
3700 iv_loop = loop->inner;
3702 else
3703 iv_loop = loop;
3704 gcc_assert (iv_loop == (gimple_bb (iv_phi))->loop_father);
3706 latch_e = loop_latch_edge (iv_loop);
3707 loop_arg = PHI_ARG_DEF_FROM_EDGE (iv_phi, latch_e);
3709 step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (phi_info);
3710 gcc_assert (step_expr != NULL_TREE);
3712 pe = loop_preheader_edge (iv_loop);
3713 init_expr = PHI_ARG_DEF_FROM_EDGE (iv_phi,
3714 loop_preheader_edge (iv_loop));
3716 vectype = get_vectype_for_scalar_type (TREE_TYPE (init_expr));
3717 resvectype = get_vectype_for_scalar_type (TREE_TYPE (PHI_RESULT (iv_phi)));
3718 gcc_assert (vectype);
3719 nunits = TYPE_VECTOR_SUBPARTS (vectype);
3720 ncopies = vf / nunits;
3722 gcc_assert (phi_info);
3723 gcc_assert (ncopies >= 1);
3725 /* Convert the step to the desired type. */
3726 stmts = NULL;
3727 step_expr = gimple_convert (&stmts, TREE_TYPE (vectype), step_expr);
3728 if (stmts)
3730 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
3731 gcc_assert (!new_bb);
3734 /* Find the first insertion point in the BB. */
3735 si = gsi_after_labels (bb);
3737 /* Create the vector that holds the initial_value of the induction. */
3738 if (nested_in_vect_loop)
3740 /* iv_loop is nested in the loop to be vectorized. init_expr had already
3741 been created during vectorization of previous stmts. We obtain it
3742 from the STMT_VINFO_VEC_STMT of the defining stmt. */
3743 vec_init = vect_get_vec_def_for_operand (init_expr, iv_phi);
3744 /* If the initial value is not of proper type, convert it. */
3745 if (!useless_type_conversion_p (vectype, TREE_TYPE (vec_init)))
3747 new_stmt
3748 = gimple_build_assign (vect_get_new_ssa_name (vectype,
3749 vect_simple_var,
3750 "vec_iv_"),
3751 VIEW_CONVERT_EXPR,
3752 build1 (VIEW_CONVERT_EXPR, vectype,
3753 vec_init));
3754 vec_init = gimple_assign_lhs (new_stmt);
3755 new_bb = gsi_insert_on_edge_immediate (loop_preheader_edge (iv_loop),
3756 new_stmt);
3757 gcc_assert (!new_bb);
3758 set_vinfo_for_stmt (new_stmt,
3759 new_stmt_vec_info (new_stmt, loop_vinfo));
3762 else
3764 vec<constructor_elt, va_gc> *v;
3766 /* iv_loop is the loop to be vectorized. Create:
3767 vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr) */
3768 stmts = NULL;
3769 new_name = gimple_convert (&stmts, TREE_TYPE (vectype), init_expr);
3771 vec_alloc (v, nunits);
3772 bool constant_p = is_gimple_min_invariant (new_name);
3773 CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, new_name);
3774 for (i = 1; i < nunits; i++)
3776 /* Create: new_name_i = new_name + step_expr */
3777 new_name = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (new_name),
3778 new_name, step_expr);
3779 if (!is_gimple_min_invariant (new_name))
3780 constant_p = false;
3781 CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, new_name);
3783 if (stmts)
3785 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
3786 gcc_assert (!new_bb);
3789 /* Create a vector from [new_name_0, new_name_1, ..., new_name_nunits-1] */
3790 if (constant_p)
3791 new_vec = build_vector_from_ctor (vectype, v);
3792 else
3793 new_vec = build_constructor (vectype, v);
3794 vec_init = vect_init_vector (iv_phi, new_vec, vectype, NULL);
3798 /* Create the vector that holds the step of the induction. */
3799 if (nested_in_vect_loop)
3800 /* iv_loop is nested in the loop to be vectorized. Generate:
3801 vec_step = [S, S, S, S] */
3802 new_name = step_expr;
3803 else
3805 /* iv_loop is the loop to be vectorized. Generate:
3806 vec_step = [VF*S, VF*S, VF*S, VF*S] */
3807 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
3809 expr = build_int_cst (integer_type_node, vf);
3810 expr = fold_convert (TREE_TYPE (step_expr), expr);
3812 else
3813 expr = build_int_cst (TREE_TYPE (step_expr), vf);
3814 new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
3815 expr, step_expr);
3816 if (TREE_CODE (step_expr) == SSA_NAME)
3817 new_name = vect_init_vector (iv_phi, new_name,
3818 TREE_TYPE (step_expr), NULL);
3821 t = unshare_expr (new_name);
3822 gcc_assert (CONSTANT_CLASS_P (new_name)
3823 || TREE_CODE (new_name) == SSA_NAME);
3824 stepvectype = get_vectype_for_scalar_type (TREE_TYPE (new_name));
3825 gcc_assert (stepvectype);
3826 new_vec = build_vector_from_val (stepvectype, t);
3827 vec_step = vect_init_vector (iv_phi, new_vec, stepvectype, NULL);
3830 /* Create the following def-use cycle:
3831 loop prolog:
3832 vec_init = ...
3833 vec_step = ...
3834 loop:
3835 vec_iv = PHI <vec_init, vec_loop>
3837 STMT
3839 vec_loop = vec_iv + vec_step; */
3841 /* Create the induction-phi that defines the induction-operand. */
3842 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
3843 induction_phi = create_phi_node (vec_dest, iv_loop->header);
3844 set_vinfo_for_stmt (induction_phi,
3845 new_stmt_vec_info (induction_phi, loop_vinfo));
3846 induc_def = PHI_RESULT (induction_phi);
3848 /* Create the iv update inside the loop */
3849 new_stmt = gimple_build_assign (vec_dest, PLUS_EXPR, induc_def, vec_step);
3850 vec_def = make_ssa_name (vec_dest, new_stmt);
3851 gimple_assign_set_lhs (new_stmt, vec_def);
3852 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
3853 set_vinfo_for_stmt (new_stmt, new_stmt_vec_info (new_stmt, loop_vinfo));
3855 /* Set the arguments of the phi node: */
3856 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
3857 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
3858 UNKNOWN_LOCATION);
3861 /* In case that vectorization factor (VF) is bigger than the number
3862 of elements that we can fit in a vectype (nunits), we have to generate
3863 more than one vector stmt - i.e - we need to "unroll" the
3864 vector stmt by a factor VF/nunits. For more details see documentation
3865 in vectorizable_operation. */
3867 if (ncopies > 1)
3869 stmt_vec_info prev_stmt_vinfo;
3870 /* FORNOW. This restriction should be relaxed. */
3871 gcc_assert (!nested_in_vect_loop);
3873 /* Create the vector that holds the step of the induction. */
3874 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
3876 expr = build_int_cst (integer_type_node, nunits);
3877 expr = fold_convert (TREE_TYPE (step_expr), expr);
3879 else
3880 expr = build_int_cst (TREE_TYPE (step_expr), nunits);
3881 new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
3882 expr, step_expr);
3883 if (TREE_CODE (step_expr) == SSA_NAME)
3884 new_name = vect_init_vector (iv_phi, new_name,
3885 TREE_TYPE (step_expr), NULL);
3886 t = unshare_expr (new_name);
3887 gcc_assert (CONSTANT_CLASS_P (new_name)
3888 || TREE_CODE (new_name) == SSA_NAME);
3889 new_vec = build_vector_from_val (stepvectype, t);
3890 vec_step = vect_init_vector (iv_phi, new_vec, stepvectype, NULL);
3892 vec_def = induc_def;
3893 prev_stmt_vinfo = vinfo_for_stmt (induction_phi);
3894 for (i = 1; i < ncopies; i++)
3896 /* vec_i = vec_prev + vec_step */
3897 new_stmt = gimple_build_assign (vec_dest, PLUS_EXPR,
3898 vec_def, vec_step);
3899 vec_def = make_ssa_name (vec_dest, new_stmt);
3900 gimple_assign_set_lhs (new_stmt, vec_def);
3902 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
3903 if (!useless_type_conversion_p (resvectype, vectype))
3905 new_stmt
3906 = gimple_build_assign
3907 (vect_get_new_vect_var (resvectype, vect_simple_var,
3908 "vec_iv_"),
3909 VIEW_CONVERT_EXPR,
3910 build1 (VIEW_CONVERT_EXPR, resvectype,
3911 gimple_assign_lhs (new_stmt)));
3912 gimple_assign_set_lhs (new_stmt,
3913 make_ssa_name
3914 (gimple_assign_lhs (new_stmt), new_stmt));
3915 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
3917 set_vinfo_for_stmt (new_stmt,
3918 new_stmt_vec_info (new_stmt, loop_vinfo));
3919 STMT_VINFO_RELATED_STMT (prev_stmt_vinfo) = new_stmt;
3920 prev_stmt_vinfo = vinfo_for_stmt (new_stmt);
3924 if (nested_in_vect_loop)
3926 /* Find the loop-closed exit-phi of the induction, and record
3927 the final vector of induction results: */
3928 exit_phi = NULL;
3929 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
3931 gimple *use_stmt = USE_STMT (use_p);
3932 if (is_gimple_debug (use_stmt))
3933 continue;
3935 if (!flow_bb_inside_loop_p (iv_loop, gimple_bb (use_stmt)))
3937 exit_phi = use_stmt;
3938 break;
3941 if (exit_phi)
3943 stmt_vec_info stmt_vinfo = vinfo_for_stmt (exit_phi);
3944 /* FORNOW. Currently not supporting the case that an inner-loop induction
3945 is not used in the outer-loop (i.e. only outside the outer-loop). */
3946 gcc_assert (STMT_VINFO_RELEVANT_P (stmt_vinfo)
3947 && !STMT_VINFO_LIVE_P (stmt_vinfo));
3949 STMT_VINFO_VEC_STMT (stmt_vinfo) = new_stmt;
3950 if (dump_enabled_p ())
3952 dump_printf_loc (MSG_NOTE, vect_location,
3953 "vector of inductions after inner-loop:");
3954 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, new_stmt, 0);
3955 dump_printf (MSG_NOTE, "\n");
3961 if (dump_enabled_p ())
3963 dump_printf_loc (MSG_NOTE, vect_location,
3964 "transform induction: created def-use cycle: ");
3965 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, induction_phi, 0);
3966 dump_printf (MSG_NOTE, "\n");
3967 dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
3968 SSA_NAME_DEF_STMT (vec_def), 0);
3969 dump_printf (MSG_NOTE, "\n");
3972 STMT_VINFO_VEC_STMT (phi_info) = induction_phi;
3973 if (!useless_type_conversion_p (resvectype, vectype))
3975 new_stmt = gimple_build_assign (vect_get_new_vect_var (resvectype,
3976 vect_simple_var,
3977 "vec_iv_"),
3978 VIEW_CONVERT_EXPR,
3979 build1 (VIEW_CONVERT_EXPR, resvectype,
3980 induc_def));
3981 induc_def = make_ssa_name (gimple_assign_lhs (new_stmt), new_stmt);
3982 gimple_assign_set_lhs (new_stmt, induc_def);
3983 si = gsi_after_labels (bb);
3984 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
3985 set_vinfo_for_stmt (new_stmt,
3986 new_stmt_vec_info (new_stmt, loop_vinfo));
3987 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (new_stmt))
3988 = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (induction_phi));
3991 return induc_def;
3995 /* Function get_initial_def_for_reduction
3997 Input:
3998 STMT - a stmt that performs a reduction operation in the loop.
3999 INIT_VAL - the initial value of the reduction variable
4001 Output:
4002 ADJUSTMENT_DEF - a tree that holds a value to be added to the final result
4003 of the reduction (used for adjusting the epilog - see below).
4004 Return a vector variable, initialized according to the operation that STMT
4005 performs. This vector will be used as the initial value of the
4006 vector of partial results.
4008 Option1 (adjust in epilog): Initialize the vector as follows:
4009 add/bit or/xor: [0,0,...,0,0]
4010 mult/bit and: [1,1,...,1,1]
4011 min/max/cond_expr: [init_val,init_val,..,init_val,init_val]
4012 and when necessary (e.g. add/mult case) let the caller know
4013 that it needs to adjust the result by init_val.
4015 Option2: Initialize the vector as follows:
4016 add/bit or/xor: [init_val,0,0,...,0]
4017 mult/bit and: [init_val,1,1,...,1]
4018 min/max/cond_expr: [init_val,init_val,...,init_val]
4019 and no adjustments are needed.
4021 For example, for the following code:
4023 s = init_val;
4024 for (i=0;i<n;i++)
4025 s = s + a[i];
4027 STMT is 's = s + a[i]', and the reduction variable is 's'.
4028 For a vector of 4 units, we want to return either [0,0,0,init_val],
4029 or [0,0,0,0] and let the caller know that it needs to adjust
4030 the result at the end by 'init_val'.
4032 FORNOW, we are using the 'adjust in epilog' scheme, because this way the
4033 initialization vector is simpler (same element in all entries), if
4034 ADJUSTMENT_DEF is not NULL, and Option2 otherwise.
4036 A cost model should help decide between these two schemes. */
4038 tree
4039 get_initial_def_for_reduction (gimple *stmt, tree init_val,
4040 tree *adjustment_def)
4042 stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
4043 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
4044 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4045 tree scalar_type = TREE_TYPE (init_val);
4046 tree vectype = get_vectype_for_scalar_type (scalar_type);
4047 int nunits;
4048 enum tree_code code = gimple_assign_rhs_code (stmt);
4049 tree def_for_init;
4050 tree init_def;
4051 tree *elts;
4052 int i;
4053 bool nested_in_vect_loop = false;
4054 tree init_value;
4055 REAL_VALUE_TYPE real_init_val = dconst0;
4056 int int_init_val = 0;
4057 gimple *def_stmt = NULL;
4059 gcc_assert (vectype);
4060 nunits = TYPE_VECTOR_SUBPARTS (vectype);
4062 gcc_assert (POINTER_TYPE_P (scalar_type) || INTEGRAL_TYPE_P (scalar_type)
4063 || SCALAR_FLOAT_TYPE_P (scalar_type));
4065 if (nested_in_vect_loop_p (loop, stmt))
4066 nested_in_vect_loop = true;
4067 else
4068 gcc_assert (loop == (gimple_bb (stmt))->loop_father);
4070 /* In case of double reduction we only create a vector variable to be put
4071 in the reduction phi node. The actual statement creation is done in
4072 vect_create_epilog_for_reduction. */
4073 if (adjustment_def && nested_in_vect_loop
4074 && TREE_CODE (init_val) == SSA_NAME
4075 && (def_stmt = SSA_NAME_DEF_STMT (init_val))
4076 && gimple_code (def_stmt) == GIMPLE_PHI
4077 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
4078 && vinfo_for_stmt (def_stmt)
4079 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
4080 == vect_double_reduction_def)
4082 *adjustment_def = NULL;
4083 return vect_create_destination_var (init_val, vectype);
4086 if (TREE_CONSTANT (init_val))
4088 if (SCALAR_FLOAT_TYPE_P (scalar_type))
4089 init_value = build_real (scalar_type, TREE_REAL_CST (init_val));
4090 else
4091 init_value = build_int_cst (scalar_type, TREE_INT_CST_LOW (init_val));
4093 else
4094 init_value = init_val;
4096 switch (code)
4098 case WIDEN_SUM_EXPR:
4099 case DOT_PROD_EXPR:
4100 case SAD_EXPR:
4101 case PLUS_EXPR:
4102 case MINUS_EXPR:
4103 case BIT_IOR_EXPR:
4104 case BIT_XOR_EXPR:
4105 case MULT_EXPR:
4106 case BIT_AND_EXPR:
4107 /* ADJUSMENT_DEF is NULL when called from
4108 vect_create_epilog_for_reduction to vectorize double reduction. */
4109 if (adjustment_def)
4111 if (nested_in_vect_loop)
4112 *adjustment_def = vect_get_vec_def_for_operand (init_val, stmt);
4113 else
4114 *adjustment_def = init_val;
4117 if (code == MULT_EXPR)
4119 real_init_val = dconst1;
4120 int_init_val = 1;
4123 if (code == BIT_AND_EXPR)
4124 int_init_val = -1;
4126 if (SCALAR_FLOAT_TYPE_P (scalar_type))
4127 def_for_init = build_real (scalar_type, real_init_val);
4128 else
4129 def_for_init = build_int_cst (scalar_type, int_init_val);
4131 /* Create a vector of '0' or '1' except the first element. */
4132 elts = XALLOCAVEC (tree, nunits);
4133 for (i = nunits - 2; i >= 0; --i)
4134 elts[i + 1] = def_for_init;
4136 /* Option1: the first element is '0' or '1' as well. */
4137 if (adjustment_def)
4139 elts[0] = def_for_init;
4140 init_def = build_vector (vectype, elts);
4141 break;
4144 /* Option2: the first element is INIT_VAL. */
4145 elts[0] = init_val;
4146 if (TREE_CONSTANT (init_val))
4147 init_def = build_vector (vectype, elts);
4148 else
4150 vec<constructor_elt, va_gc> *v;
4151 vec_alloc (v, nunits);
4152 CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, init_val);
4153 for (i = 1; i < nunits; ++i)
4154 CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, elts[i]);
4155 init_def = build_constructor (vectype, v);
4158 break;
4160 case MIN_EXPR:
4161 case MAX_EXPR:
4162 case COND_EXPR:
4163 if (adjustment_def)
4165 *adjustment_def = NULL_TREE;
4166 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_vinfo) != COND_REDUCTION)
4168 init_def = vect_get_vec_def_for_operand (init_val, stmt);
4169 break;
4172 init_def = build_vector_from_val (vectype, init_value);
4173 break;
4175 default:
4176 gcc_unreachable ();
4179 return init_def;
4182 /* Function vect_create_epilog_for_reduction
4184 Create code at the loop-epilog to finalize the result of a reduction
4185 computation.
4187 VECT_DEFS is list of vector of partial results, i.e., the lhs's of vector
4188 reduction statements.
4189 STMT is the scalar reduction stmt that is being vectorized.
4190 NCOPIES is > 1 in case the vectorization factor (VF) is bigger than the
4191 number of elements that we can fit in a vectype (nunits). In this case
4192 we have to generate more than one vector stmt - i.e - we need to "unroll"
4193 the vector stmt by a factor VF/nunits. For more details see documentation
4194 in vectorizable_operation.
4195 REDUC_CODE is the tree-code for the epilog reduction.
4196 REDUCTION_PHIS is a list of the phi-nodes that carry the reduction
4197 computation.
4198 REDUC_INDEX is the index of the operand in the right hand side of the
4199 statement that is defined by REDUCTION_PHI.
4200 DOUBLE_REDUC is TRUE if double reduction phi nodes should be handled.
4201 SLP_NODE is an SLP node containing a group of reduction statements. The
4202 first one in this group is STMT.
4203 INDUCTION_INDEX is the index of the loop for condition reductions.
4204 Otherwise it is undefined.
4206 This function:
4207 1. Creates the reduction def-use cycles: sets the arguments for
4208 REDUCTION_PHIS:
4209 The loop-entry argument is the vectorized initial-value of the reduction.
4210 The loop-latch argument is taken from VECT_DEFS - the vector of partial
4211 sums.
4212 2. "Reduces" each vector of partial results VECT_DEFS into a single result,
4213 by applying the operation specified by REDUC_CODE if available, or by
4214 other means (whole-vector shifts or a scalar loop).
4215 The function also creates a new phi node at the loop exit to preserve
4216 loop-closed form, as illustrated below.
4218 The flow at the entry to this function:
4220 loop:
4221 vec_def = phi <null, null> # REDUCTION_PHI
4222 VECT_DEF = vector_stmt # vectorized form of STMT
4223 s_loop = scalar_stmt # (scalar) STMT
4224 loop_exit:
4225 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
4226 use <s_out0>
4227 use <s_out0>
4229 The above is transformed by this function into:
4231 loop:
4232 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
4233 VECT_DEF = vector_stmt # vectorized form of STMT
4234 s_loop = scalar_stmt # (scalar) STMT
4235 loop_exit:
4236 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
4237 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
4238 v_out2 = reduce <v_out1>
4239 s_out3 = extract_field <v_out2, 0>
4240 s_out4 = adjust_result <s_out3>
4241 use <s_out4>
4242 use <s_out4>
4245 static void
4246 vect_create_epilog_for_reduction (vec<tree> vect_defs, gimple *stmt,
4247 int ncopies, enum tree_code reduc_code,
4248 vec<gimple *> reduction_phis,
4249 int reduc_index, bool double_reduc,
4250 slp_tree slp_node, tree induction_index)
4252 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4253 stmt_vec_info prev_phi_info;
4254 tree vectype;
4255 machine_mode mode;
4256 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4257 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *outer_loop = NULL;
4258 basic_block exit_bb;
4259 tree scalar_dest;
4260 tree scalar_type;
4261 gimple *new_phi = NULL, *phi;
4262 gimple_stmt_iterator exit_gsi;
4263 tree vec_dest;
4264 tree new_temp = NULL_TREE, new_dest, new_name, new_scalar_dest;
4265 gimple *epilog_stmt = NULL;
4266 enum tree_code code = gimple_assign_rhs_code (stmt);
4267 gimple *exit_phi;
4268 tree bitsize;
4269 tree adjustment_def = NULL;
4270 tree vec_initial_def = NULL;
4271 tree reduction_op, expr, def, initial_def = NULL;
4272 tree orig_name, scalar_result;
4273 imm_use_iterator imm_iter, phi_imm_iter;
4274 use_operand_p use_p, phi_use_p;
4275 gimple *use_stmt, *orig_stmt, *reduction_phi = NULL;
4276 bool nested_in_vect_loop = false;
4277 auto_vec<gimple *> new_phis;
4278 auto_vec<gimple *> inner_phis;
4279 enum vect_def_type dt = vect_unknown_def_type;
4280 int j, i;
4281 auto_vec<tree> scalar_results;
4282 unsigned int group_size = 1, k, ratio;
4283 auto_vec<tree> vec_initial_defs;
4284 auto_vec<gimple *> phis;
4285 bool slp_reduc = false;
4286 tree new_phi_result;
4287 gimple *inner_phi = NULL;
4289 if (slp_node)
4290 group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
4292 if (nested_in_vect_loop_p (loop, stmt))
4294 outer_loop = loop;
4295 loop = loop->inner;
4296 nested_in_vect_loop = true;
4297 gcc_assert (!slp_node);
4300 reduction_op = get_reduction_op (stmt, reduc_index);
4302 vectype = get_vectype_for_scalar_type (TREE_TYPE (reduction_op));
4303 gcc_assert (vectype);
4304 mode = TYPE_MODE (vectype);
4306 /* 1. Create the reduction def-use cycle:
4307 Set the arguments of REDUCTION_PHIS, i.e., transform
4309 loop:
4310 vec_def = phi <null, null> # REDUCTION_PHI
4311 VECT_DEF = vector_stmt # vectorized form of STMT
4314 into:
4316 loop:
4317 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
4318 VECT_DEF = vector_stmt # vectorized form of STMT
4321 (in case of SLP, do it for all the phis). */
4323 /* Get the loop-entry arguments. */
4324 if (slp_node)
4325 vect_get_vec_defs (reduction_op, NULL_TREE, stmt, &vec_initial_defs,
4326 NULL, slp_node, reduc_index);
4327 else
4329 /* Get at the scalar def before the loop, that defines the initial value
4330 of the reduction variable. */
4331 gimple *def_stmt = SSA_NAME_DEF_STMT (reduction_op);
4332 initial_def = PHI_ARG_DEF_FROM_EDGE (def_stmt,
4333 loop_preheader_edge (loop));
4334 vec_initial_defs.create (1);
4335 vec_initial_def = get_initial_def_for_reduction (stmt, initial_def,
4336 &adjustment_def);
4337 vec_initial_defs.quick_push (vec_initial_def);
4340 /* Set phi nodes arguments. */
4341 FOR_EACH_VEC_ELT (reduction_phis, i, phi)
4343 tree vec_init_def, def;
4344 gimple_seq stmts;
4345 vec_init_def = force_gimple_operand (vec_initial_defs[i], &stmts,
4346 true, NULL_TREE);
4347 gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), stmts);
4348 def = vect_defs[i];
4349 for (j = 0; j < ncopies; j++)
4351 /* Set the loop-entry arg of the reduction-phi. */
4353 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
4354 == INTEGER_INDUC_COND_REDUCTION)
4356 /* Initialise the reduction phi to zero. This prevents initial
4357 values of non-zero interferring with the reduction op. */
4358 gcc_assert (ncopies == 1);
4359 gcc_assert (i == 0);
4361 tree vec_init_def_type = TREE_TYPE (vec_init_def);
4362 tree zero_vec = build_zero_cst (vec_init_def_type);
4364 add_phi_arg (as_a <gphi *> (phi), zero_vec,
4365 loop_preheader_edge (loop), UNKNOWN_LOCATION);
4367 else
4368 add_phi_arg (as_a <gphi *> (phi), vec_init_def,
4369 loop_preheader_edge (loop), UNKNOWN_LOCATION);
4371 /* Set the loop-latch arg for the reduction-phi. */
4372 if (j > 0)
4373 def = vect_get_vec_def_for_stmt_copy (vect_unknown_def_type, def);
4375 add_phi_arg (as_a <gphi *> (phi), def, loop_latch_edge (loop),
4376 UNKNOWN_LOCATION);
4378 if (dump_enabled_p ())
4380 dump_printf_loc (MSG_NOTE, vect_location,
4381 "transform reduction: created def-use cycle: ");
4382 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
4383 dump_printf (MSG_NOTE, "\n");
4384 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, SSA_NAME_DEF_STMT (def), 0);
4385 dump_printf (MSG_NOTE, "\n");
4388 phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi));
4392 /* 2. Create epilog code.
4393 The reduction epilog code operates across the elements of the vector
4394 of partial results computed by the vectorized loop.
4395 The reduction epilog code consists of:
4397 step 1: compute the scalar result in a vector (v_out2)
4398 step 2: extract the scalar result (s_out3) from the vector (v_out2)
4399 step 3: adjust the scalar result (s_out3) if needed.
4401 Step 1 can be accomplished using one the following three schemes:
4402 (scheme 1) using reduc_code, if available.
4403 (scheme 2) using whole-vector shifts, if available.
4404 (scheme 3) using a scalar loop. In this case steps 1+2 above are
4405 combined.
4407 The overall epilog code looks like this:
4409 s_out0 = phi <s_loop> # original EXIT_PHI
4410 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
4411 v_out2 = reduce <v_out1> # step 1
4412 s_out3 = extract_field <v_out2, 0> # step 2
4413 s_out4 = adjust_result <s_out3> # step 3
4415 (step 3 is optional, and steps 1 and 2 may be combined).
4416 Lastly, the uses of s_out0 are replaced by s_out4. */
4419 /* 2.1 Create new loop-exit-phis to preserve loop-closed form:
4420 v_out1 = phi <VECT_DEF>
4421 Store them in NEW_PHIS. */
4423 exit_bb = single_exit (loop)->dest;
4424 prev_phi_info = NULL;
4425 new_phis.create (vect_defs.length ());
4426 FOR_EACH_VEC_ELT (vect_defs, i, def)
4428 for (j = 0; j < ncopies; j++)
4430 tree new_def = copy_ssa_name (def);
4431 phi = create_phi_node (new_def, exit_bb);
4432 set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, loop_vinfo));
4433 if (j == 0)
4434 new_phis.quick_push (phi);
4435 else
4437 def = vect_get_vec_def_for_stmt_copy (dt, def);
4438 STMT_VINFO_RELATED_STMT (prev_phi_info) = phi;
4441 SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, def);
4442 prev_phi_info = vinfo_for_stmt (phi);
4446 /* The epilogue is created for the outer-loop, i.e., for the loop being
4447 vectorized. Create exit phis for the outer loop. */
4448 if (double_reduc)
4450 loop = outer_loop;
4451 exit_bb = single_exit (loop)->dest;
4452 inner_phis.create (vect_defs.length ());
4453 FOR_EACH_VEC_ELT (new_phis, i, phi)
4455 tree new_result = copy_ssa_name (PHI_RESULT (phi));
4456 gphi *outer_phi = create_phi_node (new_result, exit_bb);
4457 SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx,
4458 PHI_RESULT (phi));
4459 set_vinfo_for_stmt (outer_phi, new_stmt_vec_info (outer_phi,
4460 loop_vinfo));
4461 inner_phis.quick_push (phi);
4462 new_phis[i] = outer_phi;
4463 prev_phi_info = vinfo_for_stmt (outer_phi);
4464 while (STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi)))
4466 phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi));
4467 new_result = copy_ssa_name (PHI_RESULT (phi));
4468 outer_phi = create_phi_node (new_result, exit_bb);
4469 SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx,
4470 PHI_RESULT (phi));
4471 set_vinfo_for_stmt (outer_phi, new_stmt_vec_info (outer_phi,
4472 loop_vinfo));
4473 STMT_VINFO_RELATED_STMT (prev_phi_info) = outer_phi;
4474 prev_phi_info = vinfo_for_stmt (outer_phi);
4479 exit_gsi = gsi_after_labels (exit_bb);
4481 /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
4482 (i.e. when reduc_code is not available) and in the final adjustment
4483 code (if needed). Also get the original scalar reduction variable as
4484 defined in the loop. In case STMT is a "pattern-stmt" (i.e. - it
4485 represents a reduction pattern), the tree-code and scalar-def are
4486 taken from the original stmt that the pattern-stmt (STMT) replaces.
4487 Otherwise (it is a regular reduction) - the tree-code and scalar-def
4488 are taken from STMT. */
4490 orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
4491 if (!orig_stmt)
4493 /* Regular reduction */
4494 orig_stmt = stmt;
4496 else
4498 /* Reduction pattern */
4499 stmt_vec_info stmt_vinfo = vinfo_for_stmt (orig_stmt);
4500 gcc_assert (STMT_VINFO_IN_PATTERN_P (stmt_vinfo));
4501 gcc_assert (STMT_VINFO_RELATED_STMT (stmt_vinfo) == stmt);
4504 code = gimple_assign_rhs_code (orig_stmt);
4505 /* For MINUS_EXPR the initial vector is [init_val,0,...,0], therefore,
4506 partial results are added and not subtracted. */
4507 if (code == MINUS_EXPR)
4508 code = PLUS_EXPR;
4510 scalar_dest = gimple_assign_lhs (orig_stmt);
4511 scalar_type = TREE_TYPE (scalar_dest);
4512 scalar_results.create (group_size);
4513 new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
4514 bitsize = TYPE_SIZE (scalar_type);
4516 /* In case this is a reduction in an inner-loop while vectorizing an outer
4517 loop - we don't need to extract a single scalar result at the end of the
4518 inner-loop (unless it is double reduction, i.e., the use of reduction is
4519 outside the outer-loop). The final vector of partial results will be used
4520 in the vectorized outer-loop, or reduced to a scalar result at the end of
4521 the outer-loop. */
4522 if (nested_in_vect_loop && !double_reduc)
4523 goto vect_finalize_reduction;
4525 /* SLP reduction without reduction chain, e.g.,
4526 # a1 = phi <a2, a0>
4527 # b1 = phi <b2, b0>
4528 a2 = operation (a1)
4529 b2 = operation (b1) */
4530 slp_reduc = (slp_node && !GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)));
4532 /* In case of reduction chain, e.g.,
4533 # a1 = phi <a3, a0>
4534 a2 = operation (a1)
4535 a3 = operation (a2),
4537 we may end up with more than one vector result. Here we reduce them to
4538 one vector. */
4539 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
4541 tree first_vect = PHI_RESULT (new_phis[0]);
4542 tree tmp;
4543 gassign *new_vec_stmt = NULL;
4545 vec_dest = vect_create_destination_var (scalar_dest, vectype);
4546 for (k = 1; k < new_phis.length (); k++)
4548 gimple *next_phi = new_phis[k];
4549 tree second_vect = PHI_RESULT (next_phi);
4551 tmp = build2 (code, vectype, first_vect, second_vect);
4552 new_vec_stmt = gimple_build_assign (vec_dest, tmp);
4553 first_vect = make_ssa_name (vec_dest, new_vec_stmt);
4554 gimple_assign_set_lhs (new_vec_stmt, first_vect);
4555 gsi_insert_before (&exit_gsi, new_vec_stmt, GSI_SAME_STMT);
4558 new_phi_result = first_vect;
4559 if (new_vec_stmt)
4561 new_phis.truncate (0);
4562 new_phis.safe_push (new_vec_stmt);
4565 else
4566 new_phi_result = PHI_RESULT (new_phis[0]);
4568 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION)
4570 /* For condition reductions, we have a vector (NEW_PHI_RESULT) containing
4571 various data values where the condition matched and another vector
4572 (INDUCTION_INDEX) containing all the indexes of those matches. We
4573 need to extract the last matching index (which will be the index with
4574 highest value) and use this to index into the data vector.
4575 For the case where there were no matches, the data vector will contain
4576 all default values and the index vector will be all zeros. */
4578 /* Get various versions of the type of the vector of indexes. */
4579 tree index_vec_type = TREE_TYPE (induction_index);
4580 gcc_checking_assert (TYPE_UNSIGNED (index_vec_type));
4581 tree index_scalar_type = TREE_TYPE (index_vec_type);
4582 tree index_vec_cmp_type = build_same_sized_truth_vector_type
4583 (index_vec_type);
4585 /* Get an unsigned integer version of the type of the data vector. */
4586 int scalar_precision = GET_MODE_PRECISION (TYPE_MODE (scalar_type));
4587 tree scalar_type_unsigned = make_unsigned_type (scalar_precision);
4588 tree vectype_unsigned = build_vector_type
4589 (scalar_type_unsigned, TYPE_VECTOR_SUBPARTS (vectype));
4591 /* First we need to create a vector (ZERO_VEC) of zeros and another
4592 vector (MAX_INDEX_VEC) filled with the last matching index, which we
4593 can create using a MAX reduction and then expanding.
4594 In the case where the loop never made any matches, the max index will
4595 be zero. */
4597 /* Vector of {0, 0, 0,...}. */
4598 tree zero_vec = make_ssa_name (vectype);
4599 tree zero_vec_rhs = build_zero_cst (vectype);
4600 gimple *zero_vec_stmt = gimple_build_assign (zero_vec, zero_vec_rhs);
4601 gsi_insert_before (&exit_gsi, zero_vec_stmt, GSI_SAME_STMT);
4603 /* Find maximum value from the vector of found indexes. */
4604 tree max_index = make_ssa_name (index_scalar_type);
4605 gimple *max_index_stmt = gimple_build_assign (max_index, REDUC_MAX_EXPR,
4606 induction_index);
4607 gsi_insert_before (&exit_gsi, max_index_stmt, GSI_SAME_STMT);
4609 /* Vector of {max_index, max_index, max_index,...}. */
4610 tree max_index_vec = make_ssa_name (index_vec_type);
4611 tree max_index_vec_rhs = build_vector_from_val (index_vec_type,
4612 max_index);
4613 gimple *max_index_vec_stmt = gimple_build_assign (max_index_vec,
4614 max_index_vec_rhs);
4615 gsi_insert_before (&exit_gsi, max_index_vec_stmt, GSI_SAME_STMT);
4617 /* Next we compare the new vector (MAX_INDEX_VEC) full of max indexes
4618 with the vector (INDUCTION_INDEX) of found indexes, choosing values
4619 from the data vector (NEW_PHI_RESULT) for matches, 0 (ZERO_VEC)
4620 otherwise. Only one value should match, resulting in a vector
4621 (VEC_COND) with one data value and the rest zeros.
4622 In the case where the loop never made any matches, every index will
4623 match, resulting in a vector with all data values (which will all be
4624 the default value). */
4626 /* Compare the max index vector to the vector of found indexes to find
4627 the position of the max value. */
4628 tree vec_compare = make_ssa_name (index_vec_cmp_type);
4629 gimple *vec_compare_stmt = gimple_build_assign (vec_compare, EQ_EXPR,
4630 induction_index,
4631 max_index_vec);
4632 gsi_insert_before (&exit_gsi, vec_compare_stmt, GSI_SAME_STMT);
4634 /* Use the compare to choose either values from the data vector or
4635 zero. */
4636 tree vec_cond = make_ssa_name (vectype);
4637 gimple *vec_cond_stmt = gimple_build_assign (vec_cond, VEC_COND_EXPR,
4638 vec_compare, new_phi_result,
4639 zero_vec);
4640 gsi_insert_before (&exit_gsi, vec_cond_stmt, GSI_SAME_STMT);
4642 /* Finally we need to extract the data value from the vector (VEC_COND)
4643 into a scalar (MATCHED_DATA_REDUC). Logically we want to do a OR
4644 reduction, but because this doesn't exist, we can use a MAX reduction
4645 instead. The data value might be signed or a float so we need to cast
4646 it first.
4647 In the case where the loop never made any matches, the data values are
4648 all identical, and so will reduce down correctly. */
4650 /* Make the matched data values unsigned. */
4651 tree vec_cond_cast = make_ssa_name (vectype_unsigned);
4652 tree vec_cond_cast_rhs = build1 (VIEW_CONVERT_EXPR, vectype_unsigned,
4653 vec_cond);
4654 gimple *vec_cond_cast_stmt = gimple_build_assign (vec_cond_cast,
4655 VIEW_CONVERT_EXPR,
4656 vec_cond_cast_rhs);
4657 gsi_insert_before (&exit_gsi, vec_cond_cast_stmt, GSI_SAME_STMT);
4659 /* Reduce down to a scalar value. */
4660 tree data_reduc = make_ssa_name (scalar_type_unsigned);
4661 optab ot = optab_for_tree_code (REDUC_MAX_EXPR, vectype_unsigned,
4662 optab_default);
4663 gcc_assert (optab_handler (ot, TYPE_MODE (vectype_unsigned))
4664 != CODE_FOR_nothing);
4665 gimple *data_reduc_stmt = gimple_build_assign (data_reduc,
4666 REDUC_MAX_EXPR,
4667 vec_cond_cast);
4668 gsi_insert_before (&exit_gsi, data_reduc_stmt, GSI_SAME_STMT);
4670 /* Convert the reduced value back to the result type and set as the
4671 result. */
4672 tree data_reduc_cast = build1 (VIEW_CONVERT_EXPR, scalar_type,
4673 data_reduc);
4674 epilog_stmt = gimple_build_assign (new_scalar_dest, data_reduc_cast);
4675 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
4676 gimple_assign_set_lhs (epilog_stmt, new_temp);
4677 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4678 scalar_results.safe_push (new_temp);
4681 /* 2.3 Create the reduction code, using one of the three schemes described
4682 above. In SLP we simply need to extract all the elements from the
4683 vector (without reducing them), so we use scalar shifts. */
4684 else if (reduc_code != ERROR_MARK && !slp_reduc)
4686 tree tmp;
4687 tree vec_elem_type;
4689 /*** Case 1: Create:
4690 v_out2 = reduc_expr <v_out1> */
4692 if (dump_enabled_p ())
4693 dump_printf_loc (MSG_NOTE, vect_location,
4694 "Reduce using direct vector reduction.\n");
4696 vec_elem_type = TREE_TYPE (TREE_TYPE (new_phi_result));
4697 if (!useless_type_conversion_p (scalar_type, vec_elem_type))
4699 tree tmp_dest =
4700 vect_create_destination_var (scalar_dest, vec_elem_type);
4701 tmp = build1 (reduc_code, vec_elem_type, new_phi_result);
4702 epilog_stmt = gimple_build_assign (tmp_dest, tmp);
4703 new_temp = make_ssa_name (tmp_dest, epilog_stmt);
4704 gimple_assign_set_lhs (epilog_stmt, new_temp);
4705 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4707 tmp = build1 (NOP_EXPR, scalar_type, new_temp);
4709 else
4710 tmp = build1 (reduc_code, scalar_type, new_phi_result);
4712 epilog_stmt = gimple_build_assign (new_scalar_dest, tmp);
4713 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
4714 gimple_assign_set_lhs (epilog_stmt, new_temp);
4715 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4717 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
4718 == INTEGER_INDUC_COND_REDUCTION)
4720 /* Earlier we set the initial value to be zero. Check the result
4721 and if it is zero then replace with the original initial
4722 value. */
4723 tree zero = build_zero_cst (scalar_type);
4724 tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp, zero);
4726 tmp = make_ssa_name (new_scalar_dest);
4727 epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
4728 initial_def, new_temp);
4729 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4730 new_temp = tmp;
4733 scalar_results.safe_push (new_temp);
4735 else
4737 bool reduce_with_shift = have_whole_vector_shift (mode);
4738 int element_bitsize = tree_to_uhwi (bitsize);
4739 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
4740 tree vec_temp;
4742 /* Regardless of whether we have a whole vector shift, if we're
4743 emulating the operation via tree-vect-generic, we don't want
4744 to use it. Only the first round of the reduction is likely
4745 to still be profitable via emulation. */
4746 /* ??? It might be better to emit a reduction tree code here, so that
4747 tree-vect-generic can expand the first round via bit tricks. */
4748 if (!VECTOR_MODE_P (mode))
4749 reduce_with_shift = false;
4750 else
4752 optab optab = optab_for_tree_code (code, vectype, optab_default);
4753 if (optab_handler (optab, mode) == CODE_FOR_nothing)
4754 reduce_with_shift = false;
4757 if (reduce_with_shift && !slp_reduc)
4759 int nelements = vec_size_in_bits / element_bitsize;
4760 unsigned char *sel = XALLOCAVEC (unsigned char, nelements);
4762 int elt_offset;
4764 tree zero_vec = build_zero_cst (vectype);
4765 /*** Case 2: Create:
4766 for (offset = nelements/2; offset >= 1; offset/=2)
4768 Create: va' = vec_shift <va, offset>
4769 Create: va = vop <va, va'>
4770 } */
4772 tree rhs;
4774 if (dump_enabled_p ())
4775 dump_printf_loc (MSG_NOTE, vect_location,
4776 "Reduce using vector shifts\n");
4778 vec_dest = vect_create_destination_var (scalar_dest, vectype);
4779 new_temp = new_phi_result;
4780 for (elt_offset = nelements / 2;
4781 elt_offset >= 1;
4782 elt_offset /= 2)
4784 calc_vec_perm_mask_for_shift (mode, elt_offset, sel);
4785 tree mask = vect_gen_perm_mask_any (vectype, sel);
4786 epilog_stmt = gimple_build_assign (vec_dest, VEC_PERM_EXPR,
4787 new_temp, zero_vec, mask);
4788 new_name = make_ssa_name (vec_dest, epilog_stmt);
4789 gimple_assign_set_lhs (epilog_stmt, new_name);
4790 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4792 epilog_stmt = gimple_build_assign (vec_dest, code, new_name,
4793 new_temp);
4794 new_temp = make_ssa_name (vec_dest, epilog_stmt);
4795 gimple_assign_set_lhs (epilog_stmt, new_temp);
4796 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4799 /* 2.4 Extract the final scalar result. Create:
4800 s_out3 = extract_field <v_out2, bitpos> */
4802 if (dump_enabled_p ())
4803 dump_printf_loc (MSG_NOTE, vect_location,
4804 "extract scalar result\n");
4806 rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp,
4807 bitsize, bitsize_zero_node);
4808 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
4809 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
4810 gimple_assign_set_lhs (epilog_stmt, new_temp);
4811 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4812 scalar_results.safe_push (new_temp);
4814 else
4816 /*** Case 3: Create:
4817 s = extract_field <v_out2, 0>
4818 for (offset = element_size;
4819 offset < vector_size;
4820 offset += element_size;)
4822 Create: s' = extract_field <v_out2, offset>
4823 Create: s = op <s, s'> // For non SLP cases
4824 } */
4826 if (dump_enabled_p ())
4827 dump_printf_loc (MSG_NOTE, vect_location,
4828 "Reduce using scalar code.\n");
4830 vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
4831 FOR_EACH_VEC_ELT (new_phis, i, new_phi)
4833 int bit_offset;
4834 if (gimple_code (new_phi) == GIMPLE_PHI)
4835 vec_temp = PHI_RESULT (new_phi);
4836 else
4837 vec_temp = gimple_assign_lhs (new_phi);
4838 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, bitsize,
4839 bitsize_zero_node);
4840 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
4841 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
4842 gimple_assign_set_lhs (epilog_stmt, new_temp);
4843 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4845 /* In SLP we don't need to apply reduction operation, so we just
4846 collect s' values in SCALAR_RESULTS. */
4847 if (slp_reduc)
4848 scalar_results.safe_push (new_temp);
4850 for (bit_offset = element_bitsize;
4851 bit_offset < vec_size_in_bits;
4852 bit_offset += element_bitsize)
4854 tree bitpos = bitsize_int (bit_offset);
4855 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp,
4856 bitsize, bitpos);
4858 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
4859 new_name = make_ssa_name (new_scalar_dest, epilog_stmt);
4860 gimple_assign_set_lhs (epilog_stmt, new_name);
4861 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4863 if (slp_reduc)
4865 /* In SLP we don't need to apply reduction operation, so
4866 we just collect s' values in SCALAR_RESULTS. */
4867 new_temp = new_name;
4868 scalar_results.safe_push (new_name);
4870 else
4872 epilog_stmt = gimple_build_assign (new_scalar_dest, code,
4873 new_name, new_temp);
4874 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
4875 gimple_assign_set_lhs (epilog_stmt, new_temp);
4876 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4881 /* The only case where we need to reduce scalar results in SLP, is
4882 unrolling. If the size of SCALAR_RESULTS is greater than
4883 GROUP_SIZE, we reduce them combining elements modulo
4884 GROUP_SIZE. */
4885 if (slp_reduc)
4887 tree res, first_res, new_res;
4888 gimple *new_stmt;
4890 /* Reduce multiple scalar results in case of SLP unrolling. */
4891 for (j = group_size; scalar_results.iterate (j, &res);
4892 j++)
4894 first_res = scalar_results[j % group_size];
4895 new_stmt = gimple_build_assign (new_scalar_dest, code,
4896 first_res, res);
4897 new_res = make_ssa_name (new_scalar_dest, new_stmt);
4898 gimple_assign_set_lhs (new_stmt, new_res);
4899 gsi_insert_before (&exit_gsi, new_stmt, GSI_SAME_STMT);
4900 scalar_results[j % group_size] = new_res;
4903 else
4904 /* Not SLP - we have one scalar to keep in SCALAR_RESULTS. */
4905 scalar_results.safe_push (new_temp);
4909 vect_finalize_reduction:
4911 if (double_reduc)
4912 loop = loop->inner;
4914 /* 2.5 Adjust the final result by the initial value of the reduction
4915 variable. (When such adjustment is not needed, then
4916 'adjustment_def' is zero). For example, if code is PLUS we create:
4917 new_temp = loop_exit_def + adjustment_def */
4919 if (adjustment_def)
4921 gcc_assert (!slp_reduc);
4922 if (nested_in_vect_loop)
4924 new_phi = new_phis[0];
4925 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) == VECTOR_TYPE);
4926 expr = build2 (code, vectype, PHI_RESULT (new_phi), adjustment_def);
4927 new_dest = vect_create_destination_var (scalar_dest, vectype);
4929 else
4931 new_temp = scalar_results[0];
4932 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
4933 expr = build2 (code, scalar_type, new_temp, adjustment_def);
4934 new_dest = vect_create_destination_var (scalar_dest, scalar_type);
4937 epilog_stmt = gimple_build_assign (new_dest, expr);
4938 new_temp = make_ssa_name (new_dest, epilog_stmt);
4939 gimple_assign_set_lhs (epilog_stmt, new_temp);
4940 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4941 if (nested_in_vect_loop)
4943 set_vinfo_for_stmt (epilog_stmt,
4944 new_stmt_vec_info (epilog_stmt, loop_vinfo));
4945 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (epilog_stmt)) =
4946 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (new_phi));
4948 if (!double_reduc)
4949 scalar_results.quick_push (new_temp);
4950 else
4951 scalar_results[0] = new_temp;
4953 else
4954 scalar_results[0] = new_temp;
4956 new_phis[0] = epilog_stmt;
4959 /* 2.6 Handle the loop-exit phis. Replace the uses of scalar loop-exit
4960 phis with new adjusted scalar results, i.e., replace use <s_out0>
4961 with use <s_out4>.
4963 Transform:
4964 loop_exit:
4965 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
4966 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
4967 v_out2 = reduce <v_out1>
4968 s_out3 = extract_field <v_out2, 0>
4969 s_out4 = adjust_result <s_out3>
4970 use <s_out0>
4971 use <s_out0>
4973 into:
4975 loop_exit:
4976 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
4977 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
4978 v_out2 = reduce <v_out1>
4979 s_out3 = extract_field <v_out2, 0>
4980 s_out4 = adjust_result <s_out3>
4981 use <s_out4>
4982 use <s_out4> */
4985 /* In SLP reduction chain we reduce vector results into one vector if
4986 necessary, hence we set here GROUP_SIZE to 1. SCALAR_DEST is the LHS of
4987 the last stmt in the reduction chain, since we are looking for the loop
4988 exit phi node. */
4989 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
4991 gimple *dest_stmt = SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1];
4992 /* Handle reduction patterns. */
4993 if (STMT_VINFO_RELATED_STMT (vinfo_for_stmt (dest_stmt)))
4994 dest_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (dest_stmt));
4996 scalar_dest = gimple_assign_lhs (dest_stmt);
4997 group_size = 1;
5000 /* In SLP we may have several statements in NEW_PHIS and REDUCTION_PHIS (in
5001 case that GROUP_SIZE is greater than vectorization factor). Therefore, we
5002 need to match SCALAR_RESULTS with corresponding statements. The first
5003 (GROUP_SIZE / number of new vector stmts) scalar results correspond to
5004 the first vector stmt, etc.
5005 (RATIO is equal to (GROUP_SIZE / number of new vector stmts)). */
5006 if (group_size > new_phis.length ())
5008 ratio = group_size / new_phis.length ();
5009 gcc_assert (!(group_size % new_phis.length ()));
5011 else
5012 ratio = 1;
5014 for (k = 0; k < group_size; k++)
5016 if (k % ratio == 0)
5018 epilog_stmt = new_phis[k / ratio];
5019 reduction_phi = reduction_phis[k / ratio];
5020 if (double_reduc)
5021 inner_phi = inner_phis[k / ratio];
5024 if (slp_reduc)
5026 gimple *current_stmt = SLP_TREE_SCALAR_STMTS (slp_node)[k];
5028 orig_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (current_stmt));
5029 /* SLP statements can't participate in patterns. */
5030 gcc_assert (!orig_stmt);
5031 scalar_dest = gimple_assign_lhs (current_stmt);
5034 phis.create (3);
5035 /* Find the loop-closed-use at the loop exit of the original scalar
5036 result. (The reduction result is expected to have two immediate uses -
5037 one at the latch block, and one at the loop exit). */
5038 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
5039 if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p)))
5040 && !is_gimple_debug (USE_STMT (use_p)))
5041 phis.safe_push (USE_STMT (use_p));
5043 /* While we expect to have found an exit_phi because of loop-closed-ssa
5044 form we can end up without one if the scalar cycle is dead. */
5046 FOR_EACH_VEC_ELT (phis, i, exit_phi)
5048 if (outer_loop)
5050 stmt_vec_info exit_phi_vinfo = vinfo_for_stmt (exit_phi);
5051 gphi *vect_phi;
5053 /* FORNOW. Currently not supporting the case that an inner-loop
5054 reduction is not used in the outer-loop (but only outside the
5055 outer-loop), unless it is double reduction. */
5056 gcc_assert ((STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
5057 && !STMT_VINFO_LIVE_P (exit_phi_vinfo))
5058 || double_reduc);
5060 if (double_reduc)
5061 STMT_VINFO_VEC_STMT (exit_phi_vinfo) = inner_phi;
5062 else
5063 STMT_VINFO_VEC_STMT (exit_phi_vinfo) = epilog_stmt;
5064 if (!double_reduc
5065 || STMT_VINFO_DEF_TYPE (exit_phi_vinfo)
5066 != vect_double_reduction_def)
5067 continue;
5069 /* Handle double reduction:
5071 stmt1: s1 = phi <s0, s2> - double reduction phi (outer loop)
5072 stmt2: s3 = phi <s1, s4> - (regular) reduc phi (inner loop)
5073 stmt3: s4 = use (s3) - (regular) reduc stmt (inner loop)
5074 stmt4: s2 = phi <s4> - double reduction stmt (outer loop)
5076 At that point the regular reduction (stmt2 and stmt3) is
5077 already vectorized, as well as the exit phi node, stmt4.
5078 Here we vectorize the phi node of double reduction, stmt1, and
5079 update all relevant statements. */
5081 /* Go through all the uses of s2 to find double reduction phi
5082 node, i.e., stmt1 above. */
5083 orig_name = PHI_RESULT (exit_phi);
5084 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
5086 stmt_vec_info use_stmt_vinfo;
5087 stmt_vec_info new_phi_vinfo;
5088 tree vect_phi_init, preheader_arg, vect_phi_res, init_def;
5089 basic_block bb = gimple_bb (use_stmt);
5090 gimple *use;
5092 /* Check that USE_STMT is really double reduction phi
5093 node. */
5094 if (gimple_code (use_stmt) != GIMPLE_PHI
5095 || gimple_phi_num_args (use_stmt) != 2
5096 || bb->loop_father != outer_loop)
5097 continue;
5098 use_stmt_vinfo = vinfo_for_stmt (use_stmt);
5099 if (!use_stmt_vinfo
5100 || STMT_VINFO_DEF_TYPE (use_stmt_vinfo)
5101 != vect_double_reduction_def)
5102 continue;
5104 /* Create vector phi node for double reduction:
5105 vs1 = phi <vs0, vs2>
5106 vs1 was created previously in this function by a call to
5107 vect_get_vec_def_for_operand and is stored in
5108 vec_initial_def;
5109 vs2 is defined by INNER_PHI, the vectorized EXIT_PHI;
5110 vs0 is created here. */
5112 /* Create vector phi node. */
5113 vect_phi = create_phi_node (vec_initial_def, bb);
5114 new_phi_vinfo = new_stmt_vec_info (vect_phi,
5115 loop_vec_info_for_loop (outer_loop));
5116 set_vinfo_for_stmt (vect_phi, new_phi_vinfo);
5118 /* Create vs0 - initial def of the double reduction phi. */
5119 preheader_arg = PHI_ARG_DEF_FROM_EDGE (use_stmt,
5120 loop_preheader_edge (outer_loop));
5121 init_def = get_initial_def_for_reduction (stmt,
5122 preheader_arg, NULL);
5123 vect_phi_init = vect_init_vector (use_stmt, init_def,
5124 vectype, NULL);
5126 /* Update phi node arguments with vs0 and vs2. */
5127 add_phi_arg (vect_phi, vect_phi_init,
5128 loop_preheader_edge (outer_loop),
5129 UNKNOWN_LOCATION);
5130 add_phi_arg (vect_phi, PHI_RESULT (inner_phi),
5131 loop_latch_edge (outer_loop), UNKNOWN_LOCATION);
5132 if (dump_enabled_p ())
5134 dump_printf_loc (MSG_NOTE, vect_location,
5135 "created double reduction phi node: ");
5136 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, vect_phi, 0);
5137 dump_printf (MSG_NOTE, "\n");
5140 vect_phi_res = PHI_RESULT (vect_phi);
5142 /* Replace the use, i.e., set the correct vs1 in the regular
5143 reduction phi node. FORNOW, NCOPIES is always 1, so the
5144 loop is redundant. */
5145 use = reduction_phi;
5146 for (j = 0; j < ncopies; j++)
5148 edge pr_edge = loop_preheader_edge (loop);
5149 SET_PHI_ARG_DEF (use, pr_edge->dest_idx, vect_phi_res);
5150 use = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (use));
5156 phis.release ();
5157 if (nested_in_vect_loop)
5159 if (double_reduc)
5160 loop = outer_loop;
5161 else
5162 continue;
5165 phis.create (3);
5166 /* Find the loop-closed-use at the loop exit of the original scalar
5167 result. (The reduction result is expected to have two immediate uses,
5168 one at the latch block, and one at the loop exit). For double
5169 reductions we are looking for exit phis of the outer loop. */
5170 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
5172 if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
5174 if (!is_gimple_debug (USE_STMT (use_p)))
5175 phis.safe_push (USE_STMT (use_p));
5177 else
5179 if (double_reduc && gimple_code (USE_STMT (use_p)) == GIMPLE_PHI)
5181 tree phi_res = PHI_RESULT (USE_STMT (use_p));
5183 FOR_EACH_IMM_USE_FAST (phi_use_p, phi_imm_iter, phi_res)
5185 if (!flow_bb_inside_loop_p (loop,
5186 gimple_bb (USE_STMT (phi_use_p)))
5187 && !is_gimple_debug (USE_STMT (phi_use_p)))
5188 phis.safe_push (USE_STMT (phi_use_p));
5194 FOR_EACH_VEC_ELT (phis, i, exit_phi)
5196 /* Replace the uses: */
5197 orig_name = PHI_RESULT (exit_phi);
5198 scalar_result = scalar_results[k];
5199 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
5200 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
5201 SET_USE (use_p, scalar_result);
5204 phis.release ();
5209 /* Function is_nonwrapping_integer_induction.
5211 Check if STMT (which is part of loop LOOP) both increments and
5212 does not cause overflow. */
5214 static bool
5215 is_nonwrapping_integer_induction (gimple *stmt, struct loop *loop)
5217 stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
5218 tree base = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo);
5219 tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo);
5220 tree lhs_type = TREE_TYPE (gimple_phi_result (stmt));
5221 widest_int ni, max_loop_value, lhs_max;
5222 bool overflow = false;
5224 /* Make sure the loop is integer based. */
5225 if (TREE_CODE (base) != INTEGER_CST
5226 || TREE_CODE (step) != INTEGER_CST)
5227 return false;
5229 /* Check that the induction increments. */
5230 if (tree_int_cst_sgn (step) == -1)
5231 return false;
5233 /* Check that the max size of the loop will not wrap. */
5235 if (TYPE_OVERFLOW_UNDEFINED (lhs_type))
5236 return true;
5238 if (! max_stmt_executions (loop, &ni))
5239 return false;
5241 max_loop_value = wi::mul (wi::to_widest (step), ni, TYPE_SIGN (lhs_type),
5242 &overflow);
5243 if (overflow)
5244 return false;
5246 max_loop_value = wi::add (wi::to_widest (base), max_loop_value,
5247 TYPE_SIGN (lhs_type), &overflow);
5248 if (overflow)
5249 return false;
5251 return (wi::min_precision (max_loop_value, TYPE_SIGN (lhs_type))
5252 <= TYPE_PRECISION (lhs_type));
5255 /* Function vectorizable_reduction.
5257 Check if STMT performs a reduction operation that can be vectorized.
5258 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
5259 stmt to replace it, put it in VEC_STMT, and insert it at GSI.
5260 Return FALSE if not a vectorizable STMT, TRUE otherwise.
5262 This function also handles reduction idioms (patterns) that have been
5263 recognized in advance during vect_pattern_recog. In this case, STMT may be
5264 of this form:
5265 X = pattern_expr (arg0, arg1, ..., X)
5266 and it's STMT_VINFO_RELATED_STMT points to the last stmt in the original
5267 sequence that had been detected and replaced by the pattern-stmt (STMT).
5269 This function also handles reduction of condition expressions, for example:
5270 for (int i = 0; i < N; i++)
5271 if (a[i] < value)
5272 last = a[i];
5273 This is handled by vectorising the loop and creating an additional vector
5274 containing the loop indexes for which "a[i] < value" was true. In the
5275 function epilogue this is reduced to a single max value and then used to
5276 index into the vector of results.
5278 In some cases of reduction patterns, the type of the reduction variable X is
5279 different than the type of the other arguments of STMT.
5280 In such cases, the vectype that is used when transforming STMT into a vector
5281 stmt is different than the vectype that is used to determine the
5282 vectorization factor, because it consists of a different number of elements
5283 than the actual number of elements that are being operated upon in parallel.
5285 For example, consider an accumulation of shorts into an int accumulator.
5286 On some targets it's possible to vectorize this pattern operating on 8
5287 shorts at a time (hence, the vectype for purposes of determining the
5288 vectorization factor should be V8HI); on the other hand, the vectype that
5289 is used to create the vector form is actually V4SI (the type of the result).
5291 Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
5292 indicates what is the actual level of parallelism (V8HI in the example), so
5293 that the right vectorization factor would be derived. This vectype
5294 corresponds to the type of arguments to the reduction stmt, and should *NOT*
5295 be used to create the vectorized stmt. The right vectype for the vectorized
5296 stmt is obtained from the type of the result X:
5297 get_vectype_for_scalar_type (TREE_TYPE (X))
5299 This means that, contrary to "regular" reductions (or "regular" stmts in
5300 general), the following equation:
5301 STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (TREE_TYPE (X))
5302 does *NOT* necessarily hold for reduction patterns. */
5304 bool
5305 vectorizable_reduction (gimple *stmt, gimple_stmt_iterator *gsi,
5306 gimple **vec_stmt, slp_tree slp_node)
5308 tree vec_dest;
5309 tree scalar_dest;
5310 tree loop_vec_def0 = NULL_TREE, loop_vec_def1 = NULL_TREE;
5311 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5312 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
5313 tree vectype_in = NULL_TREE;
5314 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5315 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5316 enum tree_code code, orig_code, epilog_reduc_code;
5317 machine_mode vec_mode;
5318 int op_type;
5319 optab optab, reduc_optab;
5320 tree new_temp = NULL_TREE;
5321 gimple *def_stmt;
5322 enum vect_def_type dt;
5323 gphi *new_phi = NULL;
5324 tree scalar_type;
5325 bool is_simple_use;
5326 gimple *orig_stmt;
5327 stmt_vec_info orig_stmt_info;
5328 tree expr = NULL_TREE;
5329 int i;
5330 int ncopies;
5331 int epilog_copies;
5332 stmt_vec_info prev_stmt_info, prev_phi_info;
5333 bool single_defuse_cycle = false;
5334 tree reduc_def = NULL_TREE;
5335 gimple *new_stmt = NULL;
5336 int j;
5337 tree ops[3];
5338 bool nested_cycle = false, found_nested_cycle_def = false;
5339 gimple *reduc_def_stmt = NULL;
5340 bool double_reduc = false, dummy;
5341 basic_block def_bb;
5342 struct loop * def_stmt_loop, *outer_loop = NULL;
5343 tree def_arg;
5344 gimple *def_arg_stmt;
5345 auto_vec<tree> vec_oprnds0;
5346 auto_vec<tree> vec_oprnds1;
5347 auto_vec<tree> vect_defs;
5348 auto_vec<gimple *> phis;
5349 int vec_num;
5350 tree def0, def1, tem, op0, op1 = NULL_TREE;
5351 bool first_p = true;
5352 tree cr_index_scalar_type = NULL_TREE, cr_index_vector_type = NULL_TREE;
5353 gimple *cond_expr_induction_def_stmt = NULL;
5355 /* In case of reduction chain we switch to the first stmt in the chain, but
5356 we don't update STMT_INFO, since only the last stmt is marked as reduction
5357 and has reduction properties. */
5358 if (GROUP_FIRST_ELEMENT (stmt_info)
5359 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
5361 stmt = GROUP_FIRST_ELEMENT (stmt_info);
5362 first_p = false;
5365 if (nested_in_vect_loop_p (loop, stmt))
5367 outer_loop = loop;
5368 loop = loop->inner;
5369 nested_cycle = true;
5372 /* 1. Is vectorizable reduction? */
5373 /* Not supportable if the reduction variable is used in the loop, unless
5374 it's a reduction chain. */
5375 if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer
5376 && !GROUP_FIRST_ELEMENT (stmt_info))
5377 return false;
5379 /* Reductions that are not used even in an enclosing outer-loop,
5380 are expected to be "live" (used out of the loop). */
5381 if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope
5382 && !STMT_VINFO_LIVE_P (stmt_info))
5383 return false;
5385 /* Make sure it was already recognized as a reduction computation. */
5386 if (STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmt)) != vect_reduction_def
5387 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmt)) != vect_nested_cycle)
5388 return false;
5390 /* 2. Has this been recognized as a reduction pattern?
5392 Check if STMT represents a pattern that has been recognized
5393 in earlier analysis stages. For stmts that represent a pattern,
5394 the STMT_VINFO_RELATED_STMT field records the last stmt in
5395 the original sequence that constitutes the pattern. */
5397 orig_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt));
5398 if (orig_stmt)
5400 orig_stmt_info = vinfo_for_stmt (orig_stmt);
5401 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
5402 gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
5405 /* 3. Check the operands of the operation. The first operands are defined
5406 inside the loop body. The last operand is the reduction variable,
5407 which is defined by the loop-header-phi. */
5409 gcc_assert (is_gimple_assign (stmt));
5411 /* Flatten RHS. */
5412 switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
5414 case GIMPLE_SINGLE_RHS:
5415 op_type = TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt));
5416 if (op_type == ternary_op)
5418 tree rhs = gimple_assign_rhs1 (stmt);
5419 ops[0] = TREE_OPERAND (rhs, 0);
5420 ops[1] = TREE_OPERAND (rhs, 1);
5421 ops[2] = TREE_OPERAND (rhs, 2);
5422 code = TREE_CODE (rhs);
5424 else
5425 return false;
5426 break;
5428 case GIMPLE_BINARY_RHS:
5429 code = gimple_assign_rhs_code (stmt);
5430 op_type = TREE_CODE_LENGTH (code);
5431 gcc_assert (op_type == binary_op);
5432 ops[0] = gimple_assign_rhs1 (stmt);
5433 ops[1] = gimple_assign_rhs2 (stmt);
5434 break;
5436 case GIMPLE_TERNARY_RHS:
5437 code = gimple_assign_rhs_code (stmt);
5438 op_type = TREE_CODE_LENGTH (code);
5439 gcc_assert (op_type == ternary_op);
5440 ops[0] = gimple_assign_rhs1 (stmt);
5441 ops[1] = gimple_assign_rhs2 (stmt);
5442 ops[2] = gimple_assign_rhs3 (stmt);
5443 break;
5445 case GIMPLE_UNARY_RHS:
5446 return false;
5448 default:
5449 gcc_unreachable ();
5451 /* The default is that the reduction variable is the last in statement. */
5452 int reduc_index = op_type - 1;
5453 if (code == MINUS_EXPR)
5454 reduc_index = 0;
5456 if (code == COND_EXPR && slp_node)
5457 return false;
5459 scalar_dest = gimple_assign_lhs (stmt);
5460 scalar_type = TREE_TYPE (scalar_dest);
5461 if (!POINTER_TYPE_P (scalar_type) && !INTEGRAL_TYPE_P (scalar_type)
5462 && !SCALAR_FLOAT_TYPE_P (scalar_type))
5463 return false;
5465 /* Do not try to vectorize bit-precision reductions. */
5466 if ((TYPE_PRECISION (scalar_type)
5467 != GET_MODE_PRECISION (TYPE_MODE (scalar_type))))
5468 return false;
5470 /* All uses but the last are expected to be defined in the loop.
5471 The last use is the reduction variable. In case of nested cycle this
5472 assumption is not true: we use reduc_index to record the index of the
5473 reduction variable. */
5474 for (i = 0; i < op_type; i++)
5476 if (i == reduc_index)
5477 continue;
5479 /* The condition of COND_EXPR is checked in vectorizable_condition(). */
5480 if (i == 0 && code == COND_EXPR)
5481 continue;
5483 is_simple_use = vect_is_simple_use (ops[i], loop_vinfo,
5484 &def_stmt, &dt, &tem);
5485 if (!vectype_in)
5486 vectype_in = tem;
5487 gcc_assert (is_simple_use);
5489 if (dt != vect_internal_def
5490 && dt != vect_external_def
5491 && dt != vect_constant_def
5492 && dt != vect_induction_def
5493 && !(dt == vect_nested_cycle && nested_cycle))
5494 return false;
5496 if (dt == vect_nested_cycle)
5498 found_nested_cycle_def = true;
5499 reduc_def_stmt = def_stmt;
5500 reduc_index = i;
5503 if (i == 1 && code == COND_EXPR && dt == vect_induction_def)
5504 cond_expr_induction_def_stmt = def_stmt;
5507 is_simple_use = vect_is_simple_use (ops[reduc_index], loop_vinfo,
5508 &def_stmt, &dt, &tem);
5509 if (!vectype_in)
5510 vectype_in = tem;
5511 gcc_assert (is_simple_use);
5512 if (!found_nested_cycle_def)
5513 reduc_def_stmt = def_stmt;
5515 if (reduc_def_stmt && gimple_code (reduc_def_stmt) != GIMPLE_PHI)
5516 return false;
5518 if (!(dt == vect_reduction_def
5519 || dt == vect_nested_cycle
5520 || ((dt == vect_internal_def || dt == vect_external_def
5521 || dt == vect_constant_def || dt == vect_induction_def)
5522 && nested_cycle && found_nested_cycle_def)))
5524 /* For pattern recognized stmts, orig_stmt might be a reduction,
5525 but some helper statements for the pattern might not, or
5526 might be COND_EXPRs with reduction uses in the condition. */
5527 gcc_assert (orig_stmt);
5528 return false;
5531 enum vect_reduction_type v_reduc_type;
5532 gimple *tmp = vect_is_simple_reduction (loop_vinfo, reduc_def_stmt,
5533 !nested_cycle, &dummy, false,
5534 &v_reduc_type);
5536 /* If we have a condition reduction, see if we can simplify it further. */
5537 if (v_reduc_type == COND_REDUCTION
5538 && cond_expr_induction_def_stmt != NULL
5539 && is_nonwrapping_integer_induction (cond_expr_induction_def_stmt, loop))
5541 if (dump_enabled_p ())
5542 dump_printf_loc (MSG_NOTE, vect_location,
5543 "condition expression based on integer induction.\n");
5544 STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) = INTEGER_INDUC_COND_REDUCTION;
5546 else
5547 STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) = v_reduc_type;
5549 if (orig_stmt)
5550 gcc_assert (tmp == orig_stmt
5551 || GROUP_FIRST_ELEMENT (vinfo_for_stmt (tmp)) == orig_stmt);
5552 else
5553 /* We changed STMT to be the first stmt in reduction chain, hence we
5554 check that in this case the first element in the chain is STMT. */
5555 gcc_assert (stmt == tmp
5556 || GROUP_FIRST_ELEMENT (vinfo_for_stmt (tmp)) == stmt);
5558 if (STMT_VINFO_LIVE_P (vinfo_for_stmt (reduc_def_stmt)))
5559 return false;
5561 if (slp_node || PURE_SLP_STMT (stmt_info))
5562 ncopies = 1;
5563 else
5564 ncopies = (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
5565 / TYPE_VECTOR_SUBPARTS (vectype_in));
5567 gcc_assert (ncopies >= 1);
5569 vec_mode = TYPE_MODE (vectype_in);
5571 if (code == COND_EXPR)
5573 /* Only call during the analysis stage, otherwise we'll lose
5574 STMT_VINFO_TYPE. */
5575 if (!vec_stmt && !vectorizable_condition (stmt, gsi, NULL,
5576 ops[reduc_index], 0, NULL))
5578 if (dump_enabled_p ())
5579 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5580 "unsupported condition in reduction\n");
5581 return false;
5584 else
5586 /* 4. Supportable by target? */
5588 if (code == LSHIFT_EXPR || code == RSHIFT_EXPR
5589 || code == LROTATE_EXPR || code == RROTATE_EXPR)
5591 /* Shifts and rotates are only supported by vectorizable_shifts,
5592 not vectorizable_reduction. */
5593 if (dump_enabled_p ())
5594 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5595 "unsupported shift or rotation.\n");
5596 return false;
5599 /* 4.1. check support for the operation in the loop */
5600 optab = optab_for_tree_code (code, vectype_in, optab_default);
5601 if (!optab)
5603 if (dump_enabled_p ())
5604 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5605 "no optab.\n");
5607 return false;
5610 if (optab_handler (optab, vec_mode) == CODE_FOR_nothing)
5612 if (dump_enabled_p ())
5613 dump_printf (MSG_NOTE, "op not supported by target.\n");
5615 if (GET_MODE_SIZE (vec_mode) != UNITS_PER_WORD
5616 || LOOP_VINFO_VECT_FACTOR (loop_vinfo)
5617 < vect_min_worthwhile_factor (code))
5618 return false;
5620 if (dump_enabled_p ())
5621 dump_printf (MSG_NOTE, "proceeding using word mode.\n");
5624 /* Worthwhile without SIMD support? */
5625 if (!VECTOR_MODE_P (TYPE_MODE (vectype_in))
5626 && LOOP_VINFO_VECT_FACTOR (loop_vinfo)
5627 < vect_min_worthwhile_factor (code))
5629 if (dump_enabled_p ())
5630 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5631 "not worthwhile without SIMD support.\n");
5633 return false;
5637 /* 4.2. Check support for the epilog operation.
5639 If STMT represents a reduction pattern, then the type of the
5640 reduction variable may be different than the type of the rest
5641 of the arguments. For example, consider the case of accumulation
5642 of shorts into an int accumulator; The original code:
5643 S1: int_a = (int) short_a;
5644 orig_stmt-> S2: int_acc = plus <int_a ,int_acc>;
5646 was replaced with:
5647 STMT: int_acc = widen_sum <short_a, int_acc>
5649 This means that:
5650 1. The tree-code that is used to create the vector operation in the
5651 epilog code (that reduces the partial results) is not the
5652 tree-code of STMT, but is rather the tree-code of the original
5653 stmt from the pattern that STMT is replacing. I.e, in the example
5654 above we want to use 'widen_sum' in the loop, but 'plus' in the
5655 epilog.
5656 2. The type (mode) we use to check available target support
5657 for the vector operation to be created in the *epilog*, is
5658 determined by the type of the reduction variable (in the example
5659 above we'd check this: optab_handler (plus_optab, vect_int_mode])).
5660 However the type (mode) we use to check available target support
5661 for the vector operation to be created *inside the loop*, is
5662 determined by the type of the other arguments to STMT (in the
5663 example we'd check this: optab_handler (widen_sum_optab,
5664 vect_short_mode)).
5666 This is contrary to "regular" reductions, in which the types of all
5667 the arguments are the same as the type of the reduction variable.
5668 For "regular" reductions we can therefore use the same vector type
5669 (and also the same tree-code) when generating the epilog code and
5670 when generating the code inside the loop. */
5672 if (orig_stmt)
5674 /* This is a reduction pattern: get the vectype from the type of the
5675 reduction variable, and get the tree-code from orig_stmt. */
5676 gcc_assert (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5677 == TREE_CODE_REDUCTION);
5678 orig_code = gimple_assign_rhs_code (orig_stmt);
5679 gcc_assert (vectype_out);
5680 vec_mode = TYPE_MODE (vectype_out);
5682 else
5684 /* Regular reduction: use the same vectype and tree-code as used for
5685 the vector code inside the loop can be used for the epilog code. */
5686 orig_code = code;
5688 if (code == MINUS_EXPR)
5689 orig_code = PLUS_EXPR;
5691 /* For simple condition reductions, replace with the actual expression
5692 we want to base our reduction around. */
5693 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5694 == INTEGER_INDUC_COND_REDUCTION)
5695 orig_code = MAX_EXPR;
5698 if (nested_cycle)
5700 def_bb = gimple_bb (reduc_def_stmt);
5701 def_stmt_loop = def_bb->loop_father;
5702 def_arg = PHI_ARG_DEF_FROM_EDGE (reduc_def_stmt,
5703 loop_preheader_edge (def_stmt_loop));
5704 if (TREE_CODE (def_arg) == SSA_NAME
5705 && (def_arg_stmt = SSA_NAME_DEF_STMT (def_arg))
5706 && gimple_code (def_arg_stmt) == GIMPLE_PHI
5707 && flow_bb_inside_loop_p (outer_loop, gimple_bb (def_arg_stmt))
5708 && vinfo_for_stmt (def_arg_stmt)
5709 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_arg_stmt))
5710 == vect_double_reduction_def)
5711 double_reduc = true;
5714 epilog_reduc_code = ERROR_MARK;
5716 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == TREE_CODE_REDUCTION
5717 || STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5718 == INTEGER_INDUC_COND_REDUCTION)
5720 if (reduction_code_for_scalar_code (orig_code, &epilog_reduc_code))
5722 reduc_optab = optab_for_tree_code (epilog_reduc_code, vectype_out,
5723 optab_default);
5724 if (!reduc_optab)
5726 if (dump_enabled_p ())
5727 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5728 "no optab for reduction.\n");
5730 epilog_reduc_code = ERROR_MARK;
5732 else if (optab_handler (reduc_optab, vec_mode) == CODE_FOR_nothing)
5734 optab = scalar_reduc_to_vector (reduc_optab, vectype_out);
5735 if (optab_handler (optab, vec_mode) == CODE_FOR_nothing)
5737 if (dump_enabled_p ())
5738 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5739 "reduc op not supported by target.\n");
5741 epilog_reduc_code = ERROR_MARK;
5745 /* When epilog_reduc_code is ERROR_MARK then a reduction will be
5746 generated in the epilog using multiple expressions. This does not
5747 work for condition reductions. */
5748 if (epilog_reduc_code == ERROR_MARK
5749 && STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5750 == INTEGER_INDUC_COND_REDUCTION)
5752 if (dump_enabled_p ())
5753 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5754 "no reduc code for scalar code.\n");
5755 return false;
5758 else
5760 if (!nested_cycle || double_reduc)
5762 if (dump_enabled_p ())
5763 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5764 "no reduc code for scalar code.\n");
5766 return false;
5770 else
5772 int scalar_precision = GET_MODE_PRECISION (TYPE_MODE (scalar_type));
5773 cr_index_scalar_type = make_unsigned_type (scalar_precision);
5774 cr_index_vector_type = build_vector_type
5775 (cr_index_scalar_type, TYPE_VECTOR_SUBPARTS (vectype_out));
5777 epilog_reduc_code = REDUC_MAX_EXPR;
5778 optab = optab_for_tree_code (REDUC_MAX_EXPR, cr_index_vector_type,
5779 optab_default);
5780 if (optab_handler (optab, TYPE_MODE (cr_index_vector_type))
5781 == CODE_FOR_nothing)
5783 if (dump_enabled_p ())
5784 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5785 "reduc max op not supported by target.\n");
5786 return false;
5790 if ((double_reduc
5791 || STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION
5792 || STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5793 == INTEGER_INDUC_COND_REDUCTION)
5794 && ncopies > 1)
5796 if (dump_enabled_p ())
5797 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5798 "multiple types in double reduction or condition "
5799 "reduction.\n");
5800 return false;
5803 /* In case of widenning multiplication by a constant, we update the type
5804 of the constant to be the type of the other operand. We check that the
5805 constant fits the type in the pattern recognition pass. */
5806 if (code == DOT_PROD_EXPR
5807 && !types_compatible_p (TREE_TYPE (ops[0]), TREE_TYPE (ops[1])))
5809 if (TREE_CODE (ops[0]) == INTEGER_CST)
5810 ops[0] = fold_convert (TREE_TYPE (ops[1]), ops[0]);
5811 else if (TREE_CODE (ops[1]) == INTEGER_CST)
5812 ops[1] = fold_convert (TREE_TYPE (ops[0]), ops[1]);
5813 else
5815 if (dump_enabled_p ())
5816 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5817 "invalid types in dot-prod\n");
5819 return false;
5823 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION)
5825 widest_int ni;
5827 if (! max_loop_iterations (loop, &ni))
5829 if (dump_enabled_p ())
5830 dump_printf_loc (MSG_NOTE, vect_location,
5831 "loop count not known, cannot create cond "
5832 "reduction.\n");
5833 return false;
5835 /* Convert backedges to iterations. */
5836 ni += 1;
5838 /* The additional index will be the same type as the condition. Check
5839 that the loop can fit into this less one (because we'll use up the
5840 zero slot for when there are no matches). */
5841 tree max_index = TYPE_MAX_VALUE (cr_index_scalar_type);
5842 if (wi::geu_p (ni, wi::to_widest (max_index)))
5844 if (dump_enabled_p ())
5845 dump_printf_loc (MSG_NOTE, vect_location,
5846 "loop size is greater than data size.\n");
5847 return false;
5851 if (!vec_stmt) /* transformation not required. */
5853 if (first_p
5854 && !vect_model_reduction_cost (stmt_info, epilog_reduc_code, ncopies,
5855 reduc_index))
5856 return false;
5857 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
5858 return true;
5861 /** Transform. **/
5863 if (dump_enabled_p ())
5864 dump_printf_loc (MSG_NOTE, vect_location, "transform reduction.\n");
5866 /* FORNOW: Multiple types are not supported for condition. */
5867 if (code == COND_EXPR)
5868 gcc_assert (ncopies == 1);
5870 /* Create the destination vector */
5871 vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
5873 /* In case the vectorization factor (VF) is bigger than the number
5874 of elements that we can fit in a vectype (nunits), we have to generate
5875 more than one vector stmt - i.e - we need to "unroll" the
5876 vector stmt by a factor VF/nunits. For more details see documentation
5877 in vectorizable_operation. */
5879 /* If the reduction is used in an outer loop we need to generate
5880 VF intermediate results, like so (e.g. for ncopies=2):
5881 r0 = phi (init, r0)
5882 r1 = phi (init, r1)
5883 r0 = x0 + r0;
5884 r1 = x1 + r1;
5885 (i.e. we generate VF results in 2 registers).
5886 In this case we have a separate def-use cycle for each copy, and therefore
5887 for each copy we get the vector def for the reduction variable from the
5888 respective phi node created for this copy.
5890 Otherwise (the reduction is unused in the loop nest), we can combine
5891 together intermediate results, like so (e.g. for ncopies=2):
5892 r = phi (init, r)
5893 r = x0 + r;
5894 r = x1 + r;
5895 (i.e. we generate VF/2 results in a single register).
5896 In this case for each copy we get the vector def for the reduction variable
5897 from the vectorized reduction operation generated in the previous iteration.
5900 if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope)
5902 single_defuse_cycle = true;
5903 epilog_copies = 1;
5905 else
5906 epilog_copies = ncopies;
5908 prev_stmt_info = NULL;
5909 prev_phi_info = NULL;
5910 if (slp_node)
5911 vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
5912 else
5914 vec_num = 1;
5915 vec_oprnds0.create (1);
5916 if (op_type == ternary_op)
5917 vec_oprnds1.create (1);
5920 phis.create (vec_num);
5921 vect_defs.create (vec_num);
5922 if (!slp_node)
5923 vect_defs.quick_push (NULL_TREE);
5925 for (j = 0; j < ncopies; j++)
5927 if (j == 0 || !single_defuse_cycle)
5929 for (i = 0; i < vec_num; i++)
5931 /* Create the reduction-phi that defines the reduction
5932 operand. */
5933 new_phi = create_phi_node (vec_dest, loop->header);
5934 set_vinfo_for_stmt (new_phi,
5935 new_stmt_vec_info (new_phi, loop_vinfo));
5936 if (j == 0 || slp_node)
5937 phis.quick_push (new_phi);
5941 if (code == COND_EXPR)
5943 gcc_assert (!slp_node);
5944 vectorizable_condition (stmt, gsi, vec_stmt,
5945 PHI_RESULT (phis[0]),
5946 reduc_index, NULL);
5947 /* Multiple types are not supported for condition. */
5948 break;
5951 /* Handle uses. */
5952 if (j == 0)
5954 op0 = ops[!reduc_index];
5955 if (op_type == ternary_op)
5957 if (reduc_index == 0)
5958 op1 = ops[2];
5959 else
5960 op1 = ops[1];
5963 if (slp_node)
5964 vect_get_vec_defs (op0, op1, stmt, &vec_oprnds0, &vec_oprnds1,
5965 slp_node, -1);
5966 else
5968 loop_vec_def0 = vect_get_vec_def_for_operand (ops[!reduc_index],
5969 stmt);
5970 vec_oprnds0.quick_push (loop_vec_def0);
5971 if (op_type == ternary_op)
5973 loop_vec_def1 = vect_get_vec_def_for_operand (op1, stmt);
5974 vec_oprnds1.quick_push (loop_vec_def1);
5978 else
5980 if (!slp_node)
5982 enum vect_def_type dt;
5983 gimple *dummy_stmt;
5985 vect_is_simple_use (ops[!reduc_index], loop_vinfo,
5986 &dummy_stmt, &dt);
5987 loop_vec_def0 = vect_get_vec_def_for_stmt_copy (dt,
5988 loop_vec_def0);
5989 vec_oprnds0[0] = loop_vec_def0;
5990 if (op_type == ternary_op)
5992 vect_is_simple_use (op1, loop_vinfo, &dummy_stmt, &dt);
5993 loop_vec_def1 = vect_get_vec_def_for_stmt_copy (dt,
5994 loop_vec_def1);
5995 vec_oprnds1[0] = loop_vec_def1;
5999 if (single_defuse_cycle)
6000 reduc_def = gimple_assign_lhs (new_stmt);
6002 STMT_VINFO_RELATED_STMT (prev_phi_info) = new_phi;
6005 FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
6007 if (slp_node)
6008 reduc_def = PHI_RESULT (phis[i]);
6009 else
6011 if (!single_defuse_cycle || j == 0)
6012 reduc_def = PHI_RESULT (new_phi);
6015 def1 = ((op_type == ternary_op)
6016 ? vec_oprnds1[i] : NULL);
6017 if (op_type == binary_op)
6019 if (reduc_index == 0)
6020 expr = build2 (code, vectype_out, reduc_def, def0);
6021 else
6022 expr = build2 (code, vectype_out, def0, reduc_def);
6024 else
6026 if (reduc_index == 0)
6027 expr = build3 (code, vectype_out, reduc_def, def0, def1);
6028 else
6030 if (reduc_index == 1)
6031 expr = build3 (code, vectype_out, def0, reduc_def, def1);
6032 else
6033 expr = build3 (code, vectype_out, def0, def1, reduc_def);
6037 new_stmt = gimple_build_assign (vec_dest, expr);
6038 new_temp = make_ssa_name (vec_dest, new_stmt);
6039 gimple_assign_set_lhs (new_stmt, new_temp);
6040 vect_finish_stmt_generation (stmt, new_stmt, gsi);
6042 if (slp_node)
6044 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
6045 vect_defs.quick_push (new_temp);
6047 else
6048 vect_defs[0] = new_temp;
6051 if (slp_node)
6052 continue;
6054 if (j == 0)
6055 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
6056 else
6057 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
6059 prev_stmt_info = vinfo_for_stmt (new_stmt);
6060 prev_phi_info = vinfo_for_stmt (new_phi);
6063 tree indx_before_incr, indx_after_incr, cond_name = NULL;
6065 /* Finalize the reduction-phi (set its arguments) and create the
6066 epilog reduction code. */
6067 if ((!single_defuse_cycle || code == COND_EXPR) && !slp_node)
6069 new_temp = gimple_assign_lhs (*vec_stmt);
6070 vect_defs[0] = new_temp;
6072 /* For cond reductions we want to create a new vector (INDEX_COND_EXPR)
6073 which is updated with the current index of the loop for every match of
6074 the original loop's cond_expr (VEC_STMT). This results in a vector
6075 containing the last time the condition passed for that vector lane.
6076 The first match will be a 1 to allow 0 to be used for non-matching
6077 indexes. If there are no matches at all then the vector will be all
6078 zeroes. */
6079 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION)
6081 int nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
6082 int k;
6084 gcc_assert (gimple_assign_rhs_code (*vec_stmt) == VEC_COND_EXPR);
6086 /* First we create a simple vector induction variable which starts
6087 with the values {1,2,3,...} (SERIES_VECT) and increments by the
6088 vector size (STEP). */
6090 /* Create a {1,2,3,...} vector. */
6091 tree *vtemp = XALLOCAVEC (tree, nunits_out);
6092 for (k = 0; k < nunits_out; ++k)
6093 vtemp[k] = build_int_cst (cr_index_scalar_type, k + 1);
6094 tree series_vect = build_vector (cr_index_vector_type, vtemp);
6096 /* Create a vector of the step value. */
6097 tree step = build_int_cst (cr_index_scalar_type, nunits_out);
6098 tree vec_step = build_vector_from_val (cr_index_vector_type, step);
6100 /* Create an induction variable. */
6101 gimple_stmt_iterator incr_gsi;
6102 bool insert_after;
6103 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
6104 create_iv (series_vect, vec_step, NULL_TREE, loop, &incr_gsi,
6105 insert_after, &indx_before_incr, &indx_after_incr);
6107 /* Next create a new phi node vector (NEW_PHI_TREE) which starts
6108 filled with zeros (VEC_ZERO). */
6110 /* Create a vector of 0s. */
6111 tree zero = build_zero_cst (cr_index_scalar_type);
6112 tree vec_zero = build_vector_from_val (cr_index_vector_type, zero);
6114 /* Create a vector phi node. */
6115 tree new_phi_tree = make_ssa_name (cr_index_vector_type);
6116 new_phi = create_phi_node (new_phi_tree, loop->header);
6117 set_vinfo_for_stmt (new_phi,
6118 new_stmt_vec_info (new_phi, loop_vinfo));
6119 add_phi_arg (new_phi, vec_zero, loop_preheader_edge (loop),
6120 UNKNOWN_LOCATION);
6122 /* Now take the condition from the loops original cond_expr
6123 (VEC_STMT) and produce a new cond_expr (INDEX_COND_EXPR) which for
6124 every match uses values from the induction variable
6125 (INDEX_BEFORE_INCR) otherwise uses values from the phi node
6126 (NEW_PHI_TREE).
6127 Finally, we update the phi (NEW_PHI_TREE) to take the value of
6128 the new cond_expr (INDEX_COND_EXPR). */
6130 /* Turn the condition from vec_stmt into an ssa name. */
6131 gimple_stmt_iterator vec_stmt_gsi = gsi_for_stmt (*vec_stmt);
6132 tree ccompare = gimple_assign_rhs1 (*vec_stmt);
6133 tree ccompare_name = make_ssa_name (TREE_TYPE (ccompare));
6134 gimple *ccompare_stmt = gimple_build_assign (ccompare_name,
6135 ccompare);
6136 gsi_insert_before (&vec_stmt_gsi, ccompare_stmt, GSI_SAME_STMT);
6137 gimple_assign_set_rhs1 (*vec_stmt, ccompare_name);
6138 update_stmt (*vec_stmt);
6140 /* Create a conditional, where the condition is taken from vec_stmt
6141 (CCOMPARE_NAME), then is the induction index (INDEX_BEFORE_INCR)
6142 and else is the phi (NEW_PHI_TREE). */
6143 tree index_cond_expr = build3 (VEC_COND_EXPR, cr_index_vector_type,
6144 ccompare_name, indx_before_incr,
6145 new_phi_tree);
6146 cond_name = make_ssa_name (cr_index_vector_type);
6147 gimple *index_condition = gimple_build_assign (cond_name,
6148 index_cond_expr);
6149 gsi_insert_before (&incr_gsi, index_condition, GSI_SAME_STMT);
6150 stmt_vec_info index_vec_info = new_stmt_vec_info (index_condition,
6151 loop_vinfo);
6152 STMT_VINFO_VECTYPE (index_vec_info) = cr_index_vector_type;
6153 set_vinfo_for_stmt (index_condition, index_vec_info);
6155 /* Update the phi with the vec cond. */
6156 add_phi_arg (new_phi, cond_name, loop_latch_edge (loop),
6157 UNKNOWN_LOCATION);
6161 vect_create_epilog_for_reduction (vect_defs, stmt, epilog_copies,
6162 epilog_reduc_code, phis, reduc_index,
6163 double_reduc, slp_node, cond_name);
6165 return true;
6168 /* Function vect_min_worthwhile_factor.
6170 For a loop where we could vectorize the operation indicated by CODE,
6171 return the minimum vectorization factor that makes it worthwhile
6172 to use generic vectors. */
6174 vect_min_worthwhile_factor (enum tree_code code)
6176 switch (code)
6178 case PLUS_EXPR:
6179 case MINUS_EXPR:
6180 case NEGATE_EXPR:
6181 return 4;
6183 case BIT_AND_EXPR:
6184 case BIT_IOR_EXPR:
6185 case BIT_XOR_EXPR:
6186 case BIT_NOT_EXPR:
6187 return 2;
6189 default:
6190 return INT_MAX;
6195 /* Function vectorizable_induction
6197 Check if PHI performs an induction computation that can be vectorized.
6198 If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
6199 phi to replace it, put it in VEC_STMT, and add it to the same basic block.
6200 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
6202 bool
6203 vectorizable_induction (gimple *phi,
6204 gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
6205 gimple **vec_stmt)
6207 stmt_vec_info stmt_info = vinfo_for_stmt (phi);
6208 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6209 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
6210 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6211 int nunits = TYPE_VECTOR_SUBPARTS (vectype);
6212 int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
6213 tree vec_def;
6215 gcc_assert (ncopies >= 1);
6216 /* FORNOW. These restrictions should be relaxed. */
6217 if (nested_in_vect_loop_p (loop, phi))
6219 imm_use_iterator imm_iter;
6220 use_operand_p use_p;
6221 gimple *exit_phi;
6222 edge latch_e;
6223 tree loop_arg;
6225 if (ncopies > 1)
6227 if (dump_enabled_p ())
6228 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6229 "multiple types in nested loop.\n");
6230 return false;
6233 exit_phi = NULL;
6234 latch_e = loop_latch_edge (loop->inner);
6235 loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
6236 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
6238 gimple *use_stmt = USE_STMT (use_p);
6239 if (is_gimple_debug (use_stmt))
6240 continue;
6242 if (!flow_bb_inside_loop_p (loop->inner, gimple_bb (use_stmt)))
6244 exit_phi = use_stmt;
6245 break;
6248 if (exit_phi)
6250 stmt_vec_info exit_phi_vinfo = vinfo_for_stmt (exit_phi);
6251 if (!(STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
6252 && !STMT_VINFO_LIVE_P (exit_phi_vinfo)))
6254 if (dump_enabled_p ())
6255 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6256 "inner-loop induction only used outside "
6257 "of the outer vectorized loop.\n");
6258 return false;
6263 if (!STMT_VINFO_RELEVANT_P (stmt_info))
6264 return false;
6266 /* FORNOW: SLP not supported. */
6267 if (STMT_SLP_TYPE (stmt_info))
6268 return false;
6270 gcc_assert (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def);
6272 if (gimple_code (phi) != GIMPLE_PHI)
6273 return false;
6275 if (!vec_stmt) /* transformation not required. */
6277 STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
6278 if (dump_enabled_p ())
6279 dump_printf_loc (MSG_NOTE, vect_location,
6280 "=== vectorizable_induction ===\n");
6281 vect_model_induction_cost (stmt_info, ncopies);
6282 return true;
6285 /** Transform. **/
6287 if (dump_enabled_p ())
6288 dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n");
6290 vec_def = get_initial_def_for_induction (phi);
6291 *vec_stmt = SSA_NAME_DEF_STMT (vec_def);
6292 return true;
6295 /* Function vectorizable_live_operation.
6297 STMT computes a value that is used outside the loop. Check if
6298 it can be supported. */
6300 bool
6301 vectorizable_live_operation (gimple *stmt,
6302 gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
6303 gimple **vec_stmt)
6305 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
6306 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
6307 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6308 tree op;
6309 gimple *def_stmt;
6310 ssa_op_iter iter;
6312 gcc_assert (STMT_VINFO_LIVE_P (stmt_info));
6314 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def)
6315 return false;
6317 if (!is_gimple_assign (stmt))
6319 if (gimple_call_internal_p (stmt)
6320 && gimple_call_internal_fn (stmt) == IFN_GOMP_SIMD_LANE
6321 && gimple_call_lhs (stmt)
6322 && loop->simduid
6323 && TREE_CODE (gimple_call_arg (stmt, 0)) == SSA_NAME
6324 && loop->simduid
6325 == SSA_NAME_VAR (gimple_call_arg (stmt, 0)))
6327 edge e = single_exit (loop);
6328 basic_block merge_bb = e->dest;
6329 imm_use_iterator imm_iter;
6330 use_operand_p use_p;
6331 tree lhs = gimple_call_lhs (stmt);
6333 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
6335 gimple *use_stmt = USE_STMT (use_p);
6336 if (gimple_code (use_stmt) == GIMPLE_PHI
6337 && gimple_bb (use_stmt) == merge_bb)
6339 if (vec_stmt)
6341 tree vfm1
6342 = build_int_cst (unsigned_type_node,
6343 loop_vinfo->vectorization_factor - 1);
6344 SET_PHI_ARG_DEF (use_stmt, e->dest_idx, vfm1);
6346 return true;
6351 return false;
6354 if (TREE_CODE (gimple_assign_lhs (stmt)) != SSA_NAME)
6355 return false;
6357 /* FORNOW. CHECKME. */
6358 if (nested_in_vect_loop_p (loop, stmt))
6359 return false;
6361 /* FORNOW: support only if all uses are invariant. This means
6362 that the scalar operations can remain in place, unvectorized.
6363 The original last scalar value that they compute will be used. */
6364 FOR_EACH_SSA_TREE_OPERAND (op, stmt, iter, SSA_OP_USE)
6366 enum vect_def_type dt = vect_uninitialized_def;
6368 if (!vect_is_simple_use (op, loop_vinfo, &def_stmt, &dt))
6370 if (dump_enabled_p ())
6371 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6372 "use not simple.\n");
6373 return false;
6376 if (dt != vect_external_def && dt != vect_constant_def)
6377 return false;
6380 /* No transformation is required for the cases we currently support. */
6381 return true;
6384 /* Kill any debug uses outside LOOP of SSA names defined in STMT. */
6386 static void
6387 vect_loop_kill_debug_uses (struct loop *loop, gimple *stmt)
6389 ssa_op_iter op_iter;
6390 imm_use_iterator imm_iter;
6391 def_operand_p def_p;
6392 gimple *ustmt;
6394 FOR_EACH_PHI_OR_STMT_DEF (def_p, stmt, op_iter, SSA_OP_DEF)
6396 FOR_EACH_IMM_USE_STMT (ustmt, imm_iter, DEF_FROM_PTR (def_p))
6398 basic_block bb;
6400 if (!is_gimple_debug (ustmt))
6401 continue;
6403 bb = gimple_bb (ustmt);
6405 if (!flow_bb_inside_loop_p (loop, bb))
6407 if (gimple_debug_bind_p (ustmt))
6409 if (dump_enabled_p ())
6410 dump_printf_loc (MSG_NOTE, vect_location,
6411 "killing debug use\n");
6413 gimple_debug_bind_reset_value (ustmt);
6414 update_stmt (ustmt);
6416 else
6417 gcc_unreachable ();
6424 /* This function builds ni_name = number of iterations. Statements
6425 are emitted on the loop preheader edge. */
6427 static tree
6428 vect_build_loop_niters (loop_vec_info loop_vinfo)
6430 tree ni = unshare_expr (LOOP_VINFO_NITERS (loop_vinfo));
6431 if (TREE_CODE (ni) == INTEGER_CST)
6432 return ni;
6433 else
6435 tree ni_name, var;
6436 gimple_seq stmts = NULL;
6437 edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
6439 var = create_tmp_var (TREE_TYPE (ni), "niters");
6440 ni_name = force_gimple_operand (ni, &stmts, false, var);
6441 if (stmts)
6442 gsi_insert_seq_on_edge_immediate (pe, stmts);
6444 return ni_name;
6449 /* This function generates the following statements:
6451 ni_name = number of iterations loop executes
6452 ratio = ni_name / vf
6453 ratio_mult_vf_name = ratio * vf
6455 and places them on the loop preheader edge. */
6457 static void
6458 vect_generate_tmps_on_preheader (loop_vec_info loop_vinfo,
6459 tree ni_name,
6460 tree *ratio_mult_vf_name_ptr,
6461 tree *ratio_name_ptr)
6463 tree ni_minus_gap_name;
6464 tree var;
6465 tree ratio_name;
6466 tree ratio_mult_vf_name;
6467 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
6468 edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
6469 tree log_vf;
6471 log_vf = build_int_cst (TREE_TYPE (ni_name), exact_log2 (vf));
6473 /* If epilogue loop is required because of data accesses with gaps, we
6474 subtract one iteration from the total number of iterations here for
6475 correct calculation of RATIO. */
6476 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
6478 ni_minus_gap_name = fold_build2 (MINUS_EXPR, TREE_TYPE (ni_name),
6479 ni_name,
6480 build_one_cst (TREE_TYPE (ni_name)));
6481 if (!is_gimple_val (ni_minus_gap_name))
6483 var = create_tmp_var (TREE_TYPE (ni_name), "ni_gap");
6484 gimple *stmts = NULL;
6485 ni_minus_gap_name = force_gimple_operand (ni_minus_gap_name, &stmts,
6486 true, var);
6487 gsi_insert_seq_on_edge_immediate (pe, stmts);
6490 else
6491 ni_minus_gap_name = ni_name;
6493 /* Create: ratio = ni >> log2(vf) */
6494 /* ??? As we have ni == number of latch executions + 1, ni could
6495 have overflown to zero. So avoid computing ratio based on ni
6496 but compute it using the fact that we know ratio will be at least
6497 one, thus via (ni - vf) >> log2(vf) + 1. */
6498 ratio_name
6499 = fold_build2 (PLUS_EXPR, TREE_TYPE (ni_name),
6500 fold_build2 (RSHIFT_EXPR, TREE_TYPE (ni_name),
6501 fold_build2 (MINUS_EXPR, TREE_TYPE (ni_name),
6502 ni_minus_gap_name,
6503 build_int_cst
6504 (TREE_TYPE (ni_name), vf)),
6505 log_vf),
6506 build_int_cst (TREE_TYPE (ni_name), 1));
6507 if (!is_gimple_val (ratio_name))
6509 var = create_tmp_var (TREE_TYPE (ni_name), "bnd");
6510 gimple *stmts = NULL;
6511 ratio_name = force_gimple_operand (ratio_name, &stmts, true, var);
6512 gsi_insert_seq_on_edge_immediate (pe, stmts);
6514 *ratio_name_ptr = ratio_name;
6516 /* Create: ratio_mult_vf = ratio << log2 (vf). */
6518 if (ratio_mult_vf_name_ptr)
6520 ratio_mult_vf_name = fold_build2 (LSHIFT_EXPR, TREE_TYPE (ratio_name),
6521 ratio_name, log_vf);
6522 if (!is_gimple_val (ratio_mult_vf_name))
6524 var = create_tmp_var (TREE_TYPE (ni_name), "ratio_mult_vf");
6525 gimple *stmts = NULL;
6526 ratio_mult_vf_name = force_gimple_operand (ratio_mult_vf_name, &stmts,
6527 true, var);
6528 gsi_insert_seq_on_edge_immediate (pe, stmts);
6530 *ratio_mult_vf_name_ptr = ratio_mult_vf_name;
6533 return;
6537 /* Function vect_transform_loop.
6539 The analysis phase has determined that the loop is vectorizable.
6540 Vectorize the loop - created vectorized stmts to replace the scalar
6541 stmts in the loop, and update the loop exit condition. */
6543 void
6544 vect_transform_loop (loop_vec_info loop_vinfo)
6546 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6547 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
6548 int nbbs = loop->num_nodes;
6549 int i;
6550 tree ratio = NULL;
6551 int vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
6552 bool grouped_store;
6553 bool slp_scheduled = false;
6554 gimple *stmt, *pattern_stmt;
6555 gimple_seq pattern_def_seq = NULL;
6556 gimple_stmt_iterator pattern_def_si = gsi_none ();
6557 bool transform_pattern_stmt = false;
6558 bool check_profitability = false;
6559 int th;
6560 /* Record number of iterations before we started tampering with the profile. */
6561 gcov_type expected_iterations = expected_loop_iterations_unbounded (loop);
6563 if (dump_enabled_p ())
6564 dump_printf_loc (MSG_NOTE, vect_location, "=== vec_transform_loop ===\n");
6566 /* If profile is inprecise, we have chance to fix it up. */
6567 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
6568 expected_iterations = LOOP_VINFO_INT_NITERS (loop_vinfo);
6570 /* Use the more conservative vectorization threshold. If the number
6571 of iterations is constant assume the cost check has been performed
6572 by our caller. If the threshold makes all loops profitable that
6573 run at least the vectorization factor number of times checking
6574 is pointless, too. */
6575 th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
6576 if (th >= LOOP_VINFO_VECT_FACTOR (loop_vinfo) - 1
6577 && !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
6579 if (dump_enabled_p ())
6580 dump_printf_loc (MSG_NOTE, vect_location,
6581 "Profitability threshold is %d loop iterations.\n",
6582 th);
6583 check_profitability = true;
6586 /* Version the loop first, if required, so the profitability check
6587 comes first. */
6589 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
6590 || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
6592 vect_loop_versioning (loop_vinfo, th, check_profitability);
6593 check_profitability = false;
6596 tree ni_name = vect_build_loop_niters (loop_vinfo);
6597 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = ni_name;
6599 /* Peel the loop if there are data refs with unknown alignment.
6600 Only one data ref with unknown store is allowed. */
6602 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo))
6604 vect_do_peeling_for_alignment (loop_vinfo, ni_name,
6605 th, check_profitability);
6606 check_profitability = false;
6607 /* The above adjusts LOOP_VINFO_NITERS, so cause ni_name to
6608 be re-computed. */
6609 ni_name = NULL_TREE;
6612 /* If the loop has a symbolic number of iterations 'n' (i.e. it's not a
6613 compile time constant), or it is a constant that doesn't divide by the
6614 vectorization factor, then an epilog loop needs to be created.
6615 We therefore duplicate the loop: the original loop will be vectorized,
6616 and will compute the first (n/VF) iterations. The second copy of the loop
6617 will remain scalar and will compute the remaining (n%VF) iterations.
6618 (VF is the vectorization factor). */
6620 if (LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
6621 || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
6623 tree ratio_mult_vf;
6624 if (!ni_name)
6625 ni_name = vect_build_loop_niters (loop_vinfo);
6626 vect_generate_tmps_on_preheader (loop_vinfo, ni_name, &ratio_mult_vf,
6627 &ratio);
6628 vect_do_peeling_for_loop_bound (loop_vinfo, ni_name, ratio_mult_vf,
6629 th, check_profitability);
6631 else if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
6632 ratio = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
6633 LOOP_VINFO_INT_NITERS (loop_vinfo) / vectorization_factor);
6634 else
6636 if (!ni_name)
6637 ni_name = vect_build_loop_niters (loop_vinfo);
6638 vect_generate_tmps_on_preheader (loop_vinfo, ni_name, NULL, &ratio);
6641 /* 1) Make sure the loop header has exactly two entries
6642 2) Make sure we have a preheader basic block. */
6644 gcc_assert (EDGE_COUNT (loop->header->preds) == 2);
6646 split_edge (loop_preheader_edge (loop));
6648 /* FORNOW: the vectorizer supports only loops which body consist
6649 of one basic block (header + empty latch). When the vectorizer will
6650 support more involved loop forms, the order by which the BBs are
6651 traversed need to be reconsidered. */
6653 for (i = 0; i < nbbs; i++)
6655 basic_block bb = bbs[i];
6656 stmt_vec_info stmt_info;
6658 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
6659 gsi_next (&si))
6661 gphi *phi = si.phi ();
6662 if (dump_enabled_p ())
6664 dump_printf_loc (MSG_NOTE, vect_location,
6665 "------>vectorizing phi: ");
6666 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
6667 dump_printf (MSG_NOTE, "\n");
6669 stmt_info = vinfo_for_stmt (phi);
6670 if (!stmt_info)
6671 continue;
6673 if (MAY_HAVE_DEBUG_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
6674 vect_loop_kill_debug_uses (loop, phi);
6676 if (!STMT_VINFO_RELEVANT_P (stmt_info)
6677 && !STMT_VINFO_LIVE_P (stmt_info))
6678 continue;
6680 if (STMT_VINFO_VECTYPE (stmt_info)
6681 && (TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info))
6682 != (unsigned HOST_WIDE_INT) vectorization_factor)
6683 && dump_enabled_p ())
6684 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
6686 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def)
6688 if (dump_enabled_p ())
6689 dump_printf_loc (MSG_NOTE, vect_location, "transform phi.\n");
6690 vect_transform_stmt (phi, NULL, NULL, NULL, NULL);
6694 pattern_stmt = NULL;
6695 for (gimple_stmt_iterator si = gsi_start_bb (bb);
6696 !gsi_end_p (si) || transform_pattern_stmt;)
6698 bool is_store;
6700 if (transform_pattern_stmt)
6701 stmt = pattern_stmt;
6702 else
6704 stmt = gsi_stmt (si);
6705 /* During vectorization remove existing clobber stmts. */
6706 if (gimple_clobber_p (stmt))
6708 unlink_stmt_vdef (stmt);
6709 gsi_remove (&si, true);
6710 release_defs (stmt);
6711 continue;
6715 if (dump_enabled_p ())
6717 dump_printf_loc (MSG_NOTE, vect_location,
6718 "------>vectorizing statement: ");
6719 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
6720 dump_printf (MSG_NOTE, "\n");
6723 stmt_info = vinfo_for_stmt (stmt);
6725 /* vector stmts created in the outer-loop during vectorization of
6726 stmts in an inner-loop may not have a stmt_info, and do not
6727 need to be vectorized. */
6728 if (!stmt_info)
6730 gsi_next (&si);
6731 continue;
6734 if (MAY_HAVE_DEBUG_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
6735 vect_loop_kill_debug_uses (loop, stmt);
6737 if (!STMT_VINFO_RELEVANT_P (stmt_info)
6738 && !STMT_VINFO_LIVE_P (stmt_info))
6740 if (STMT_VINFO_IN_PATTERN_P (stmt_info)
6741 && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
6742 && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
6743 || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
6745 stmt = pattern_stmt;
6746 stmt_info = vinfo_for_stmt (stmt);
6748 else
6750 gsi_next (&si);
6751 continue;
6754 else if (STMT_VINFO_IN_PATTERN_P (stmt_info)
6755 && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
6756 && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
6757 || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
6758 transform_pattern_stmt = true;
6760 /* If pattern statement has def stmts, vectorize them too. */
6761 if (is_pattern_stmt_p (stmt_info))
6763 if (pattern_def_seq == NULL)
6765 pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
6766 pattern_def_si = gsi_start (pattern_def_seq);
6768 else if (!gsi_end_p (pattern_def_si))
6769 gsi_next (&pattern_def_si);
6770 if (pattern_def_seq != NULL)
6772 gimple *pattern_def_stmt = NULL;
6773 stmt_vec_info pattern_def_stmt_info = NULL;
6775 while (!gsi_end_p (pattern_def_si))
6777 pattern_def_stmt = gsi_stmt (pattern_def_si);
6778 pattern_def_stmt_info
6779 = vinfo_for_stmt (pattern_def_stmt);
6780 if (STMT_VINFO_RELEVANT_P (pattern_def_stmt_info)
6781 || STMT_VINFO_LIVE_P (pattern_def_stmt_info))
6782 break;
6783 gsi_next (&pattern_def_si);
6786 if (!gsi_end_p (pattern_def_si))
6788 if (dump_enabled_p ())
6790 dump_printf_loc (MSG_NOTE, vect_location,
6791 "==> vectorizing pattern def "
6792 "stmt: ");
6793 dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
6794 pattern_def_stmt, 0);
6795 dump_printf (MSG_NOTE, "\n");
6798 stmt = pattern_def_stmt;
6799 stmt_info = pattern_def_stmt_info;
6801 else
6803 pattern_def_si = gsi_none ();
6804 transform_pattern_stmt = false;
6807 else
6808 transform_pattern_stmt = false;
6811 if (STMT_VINFO_VECTYPE (stmt_info))
6813 unsigned int nunits
6814 = (unsigned int)
6815 TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
6816 if (!STMT_SLP_TYPE (stmt_info)
6817 && nunits != (unsigned int) vectorization_factor
6818 && dump_enabled_p ())
6819 /* For SLP VF is set according to unrolling factor, and not
6820 to vector size, hence for SLP this print is not valid. */
6821 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
6824 /* SLP. Schedule all the SLP instances when the first SLP stmt is
6825 reached. */
6826 if (STMT_SLP_TYPE (stmt_info))
6828 if (!slp_scheduled)
6830 slp_scheduled = true;
6832 if (dump_enabled_p ())
6833 dump_printf_loc (MSG_NOTE, vect_location,
6834 "=== scheduling SLP instances ===\n");
6836 vect_schedule_slp (loop_vinfo);
6839 /* Hybrid SLP stmts must be vectorized in addition to SLP. */
6840 if (!vinfo_for_stmt (stmt) || PURE_SLP_STMT (stmt_info))
6842 if (!transform_pattern_stmt && gsi_end_p (pattern_def_si))
6844 pattern_def_seq = NULL;
6845 gsi_next (&si);
6847 continue;
6851 /* -------- vectorize statement ------------ */
6852 if (dump_enabled_p ())
6853 dump_printf_loc (MSG_NOTE, vect_location, "transform statement.\n");
6855 grouped_store = false;
6856 is_store = vect_transform_stmt (stmt, &si, &grouped_store, NULL, NULL);
6857 if (is_store)
6859 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
6861 /* Interleaving. If IS_STORE is TRUE, the vectorization of the
6862 interleaving chain was completed - free all the stores in
6863 the chain. */
6864 gsi_next (&si);
6865 vect_remove_stores (GROUP_FIRST_ELEMENT (stmt_info));
6867 else
6869 /* Free the attached stmt_vec_info and remove the stmt. */
6870 gimple *store = gsi_stmt (si);
6871 free_stmt_vec_info (store);
6872 unlink_stmt_vdef (store);
6873 gsi_remove (&si, true);
6874 release_defs (store);
6877 /* Stores can only appear at the end of pattern statements. */
6878 gcc_assert (!transform_pattern_stmt);
6879 pattern_def_seq = NULL;
6881 else if (!transform_pattern_stmt && gsi_end_p (pattern_def_si))
6883 pattern_def_seq = NULL;
6884 gsi_next (&si);
6886 } /* stmts in BB */
6887 } /* BBs in loop */
6889 slpeel_make_loop_iterate_ntimes (loop, ratio);
6891 /* Reduce loop iterations by the vectorization factor. */
6892 scale_loop_profile (loop, GCOV_COMPUTE_SCALE (1, vectorization_factor),
6893 expected_iterations / vectorization_factor);
6894 loop->nb_iterations_upper_bound
6895 = wi::udiv_floor (loop->nb_iterations_upper_bound, vectorization_factor);
6896 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
6897 && loop->nb_iterations_upper_bound != 0)
6898 loop->nb_iterations_upper_bound = loop->nb_iterations_upper_bound - 1;
6899 if (loop->any_estimate)
6901 loop->nb_iterations_estimate
6902 = wi::udiv_floor (loop->nb_iterations_estimate, vectorization_factor);
6903 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
6904 && loop->nb_iterations_estimate != 0)
6905 loop->nb_iterations_estimate = loop->nb_iterations_estimate - 1;
6908 if (dump_enabled_p ())
6910 dump_printf_loc (MSG_NOTE, vect_location,
6911 "LOOP VECTORIZED\n");
6912 if (loop->inner)
6913 dump_printf_loc (MSG_NOTE, vect_location,
6914 "OUTER LOOP VECTORIZED\n");
6915 dump_printf (MSG_NOTE, "\n");