1 /* Loop Vectorization
2 Copyright (C) 2003-2015 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com> and
4 Ira Rosen <irar@il.ibm.com>
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "rtl.h"
28 #include "tree.h"
29 #include "gimple.h"
30 #include "cfghooks.h"
31 #include "tree-pass.h"
32 #include "ssa.h"
33 #include "optabs-tree.h"
34 #include "diagnostic-core.h"
35 #include "fold-const.h"
36 #include "stor-layout.h"
37 #include "cfganal.h"
38 #include "gimplify.h"
39 #include "gimple-iterator.h"
40 #include "gimplify-me.h"
41 #include "tree-ssa-loop-ivopts.h"
42 #include "tree-ssa-loop-manip.h"
43 #include "tree-ssa-loop-niter.h"
44 #include "cfgloop.h"
45 #include "params.h"
46 #include "tree-scalar-evolution.h"
47 #include "tree-vectorizer.h"
48 #include "gimple-fold.h"
49 #include "cgraph.h"
51 /* Loop Vectorization Pass.
53 This pass tries to vectorize loops.
55 For example, the vectorizer transforms the following simple loop:
57 short a[N]; short b[N]; short c[N]; int i;
59 for (i=0; i<N; i++){
60 a[i] = b[i] + c[i];
   }
63 as if it was manually vectorized by rewriting the source code into:
65 typedef int __attribute__((mode(V8HI))) v8hi;
66 short a[N]; short b[N]; short c[N]; int i;
67 v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
68 v8hi va, vb, vc;
70 for (i=0; i<N/8; i++){
71 vb = pb[i];
72 vc = pc[i];
73 va = vb + vc;
74 pa[i] = va;
   }
77 The main entry to this pass is vectorize_loops(), in which
78 the vectorizer applies a set of analyses on a given set of loops,
79 followed by the actual vectorization transformation for the loops that
80 had successfully passed the analysis phase.
81 Throughout this pass we make a distinction between two types of
82 data: scalars (which are represented by SSA_NAMES), and memory references
83 ("data-refs"). These two types of data require different handling both
84 during analysis and transformation. The types of data-refs that the
85 vectorizer currently supports are ARRAY_REFS whose base is an array DECL
86 (not a pointer), and INDIRECT_REFS through pointers; both array and pointer
87 accesses are required to have a simple (consecutive) access pattern.
89 Analysis phase:
90 ===============
91 The driver for the analysis phase is vect_analyze_loop().
92 It applies a set of analyses, some of which rely on the scalar evolution
93 analyzer (scev) developed by Sebastian Pop.
95 During the analysis phase the vectorizer records some information
96 per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
97 loop, as well as general information about the loop as a whole, which is
98 recorded in a "loop_vec_info" struct attached to each loop.
100 Transformation phase:
101 =====================
102 The loop transformation phase scans all the stmts in the loop, and
103 creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
104 the loop that needs to be vectorized. It inserts the vector code sequence
105 just before the scalar stmt S, and records a pointer to the vector code
106 in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
107 attached to S). This pointer will be used for the vectorization of following
108 stmts which use the def of stmt S. Stmt S is removed if it writes to memory;
109 otherwise, we rely on dead code elimination for removing it.
111 For example, say stmt S1 was vectorized into stmt VS1:
113 VS1: vb = px[i];
114 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
115 S2: a = b;
117 To vectorize stmt S2, the vectorizer first finds the stmt that defines
118 the operand 'b' (S1), and gets the relevant vector def 'vb' from the
119 vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)). The
120 resulting sequence would be:
122 VS1: vb = px[i];
123 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
124 VS2: va = vb;
125 S2: a = b; STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2
127 Operands that are not SSA_NAMEs are data-refs that appear in
128 load/store operations (like 'x[i]' in S1) and are handled differently.
130 Target modeling:
131 =================
132 Currently the only target specific information that is used is the
133 size of the vector (in bytes) - "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".
134 Targets that can support different vector sizes will, for now, need
135 to specify one value for "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD". More
136 flexibility will be added in the future.
138 Since we only vectorize operations whose vector form can be
139 expressed using existing tree codes, to verify that an operation is
140 supported, the vectorizer checks the relevant optab at the relevant
141 machine_mode (e.g., optab_handler (add_optab, V8HImode)). If
142 the value found is CODE_FOR_nothing, then there's no target support, and
143 we can't vectorize the stmt.
145 For additional information on this project see:
146 http://gcc.gnu.org/projects/tree-ssa/vectorization.html */
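/* A minimal sketch of the optab query described above, assuming the
   statement is an addition on V8HImode vectors; CODE_FOR_nothing means
   there is no target support and the stmt cannot be vectorized:

     if (optab_handler (add_optab, V8HImode) == CODE_FOR_nothing)
       return false;  */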
149 static void vect_estimate_min_profitable_iters (loop_vec_info, int *, int *);
151 /* Function vect_determine_vectorization_factor
153 Determine the vectorization factor (VF). VF is the number of data elements
154 that are operated upon in parallel in a single iteration of the vectorized
155 loop. For example, when vectorizing a loop that operates on 4-byte elements,
156 on a target with a vector size (VS) of 16 bytes, the VF is set to 4, since 4
157 elements can fit in a single vector register.
159 We currently support vectorization of loops in which all types operated upon
160 are of the same size. Therefore this function currently sets VF according to
161 the size of the types operated upon, and fails if there are multiple sizes
162 in the loop.
164 VF is also the factor by which the loop iterations are strip-mined, e.g.:
165 original loop:
166 for (i=0; i<N; i++){
167 a[i] = b[i] + c[i];
    }
170 vectorized loop:
171 for (i=0; i<N; i+=VF){
172 a[i:VF] = b[i:VF] + c[i:VF];
    }
*/
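/* A short worked example, assuming a 16-byte vector size: for a loop that
   operates on 2-byte shorts the chosen vectype has TYPE_VECTOR_SUBPARTS == 8,
   so VF is set to 8 and the strip-mined loop advances by 8 elements:

     for (i=0; i<N; i+=8)
       a[i:8] = b[i:8] + c[i:8];  */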
176 static bool
177 vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
179 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
180 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
181 unsigned nbbs = loop->num_nodes;
182 unsigned int vectorization_factor = 0;
183 tree scalar_type;
184 gphi *phi;
185 tree vectype;
186 unsigned int nunits;
187 stmt_vec_info stmt_info;
188 unsigned i;
189 HOST_WIDE_INT dummy;
190 gimple *stmt, *pattern_stmt = NULL;
191 gimple_seq pattern_def_seq = NULL;
192 gimple_stmt_iterator pattern_def_si = gsi_none ();
193 bool analyze_pattern_stmt = false;
194 bool bool_result;
195 auto_vec<stmt_vec_info> mask_producers;
197 if (dump_enabled_p ())
198 dump_printf_loc (MSG_NOTE, vect_location,
199 "=== vect_determine_vectorization_factor ===\n");
201 for (i = 0; i < nbbs; i++)
203 basic_block bb = bbs[i];
205 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
206 gsi_next (&si))
208 phi = si.phi ();
209 stmt_info = vinfo_for_stmt (phi);
210 if (dump_enabled_p ())
212 dump_printf_loc (MSG_NOTE, vect_location, "==> examining phi: ");
213 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
214 dump_printf (MSG_NOTE, "\n");
217 gcc_assert (stmt_info);
219 if (STMT_VINFO_RELEVANT_P (stmt_info))
221 gcc_assert (!STMT_VINFO_VECTYPE (stmt_info));
222 scalar_type = TREE_TYPE (PHI_RESULT (phi));
224 if (dump_enabled_p ())
226 dump_printf_loc (MSG_NOTE, vect_location,
227 "get vectype for scalar type: ");
228 dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
229 dump_printf (MSG_NOTE, "\n");
232 vectype = get_vectype_for_scalar_type (scalar_type);
233 if (!vectype)
235 if (dump_enabled_p ())
237 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
238 "not vectorized: unsupported "
239 "data-type ");
240 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
241 scalar_type);
242 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
244 return false;
246 STMT_VINFO_VECTYPE (stmt_info) = vectype;
248 if (dump_enabled_p ())
250 dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
251 dump_generic_expr (MSG_NOTE, TDF_SLIM, vectype);
252 dump_printf (MSG_NOTE, "\n");
255 nunits = TYPE_VECTOR_SUBPARTS (vectype);
256 if (dump_enabled_p ())
257 dump_printf_loc (MSG_NOTE, vect_location, "nunits = %d\n",
258 nunits);
260 if (!vectorization_factor
261 || (nunits > vectorization_factor))
262 vectorization_factor = nunits;
266 for (gimple_stmt_iterator si = gsi_start_bb (bb);
267 !gsi_end_p (si) || analyze_pattern_stmt;)
269 tree vf_vectype;
271 if (analyze_pattern_stmt)
272 stmt = pattern_stmt;
273 else
274 stmt = gsi_stmt (si);
276 stmt_info = vinfo_for_stmt (stmt);
278 if (dump_enabled_p ())
280 dump_printf_loc (MSG_NOTE, vect_location,
281 "==> examining statement: ");
282 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
283 dump_printf (MSG_NOTE, "\n");
286 gcc_assert (stmt_info);
288 /* Skip stmts which do not need to be vectorized. */
289 if ((!STMT_VINFO_RELEVANT_P (stmt_info)
290 && !STMT_VINFO_LIVE_P (stmt_info))
291 || gimple_clobber_p (stmt))
293 if (STMT_VINFO_IN_PATTERN_P (stmt_info)
294 && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
295 && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
296 || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
298 stmt = pattern_stmt;
299 stmt_info = vinfo_for_stmt (pattern_stmt);
300 if (dump_enabled_p ())
302 dump_printf_loc (MSG_NOTE, vect_location,
303 "==> examining pattern statement: ");
304 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
305 dump_printf (MSG_NOTE, "\n");
308 else
310 if (dump_enabled_p ())
311 dump_printf_loc (MSG_NOTE, vect_location, "skip.\n");
312 gsi_next (&si);
313 continue;
316 else if (STMT_VINFO_IN_PATTERN_P (stmt_info)
317 && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
318 && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
319 || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
320 analyze_pattern_stmt = true;
322 /* If a pattern statement has def stmts, analyze them too. */
323 if (is_pattern_stmt_p (stmt_info))
325 if (pattern_def_seq == NULL)
327 pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
328 pattern_def_si = gsi_start (pattern_def_seq);
330 else if (!gsi_end_p (pattern_def_si))
331 gsi_next (&pattern_def_si);
332 if (pattern_def_seq != NULL)
334 gimple *pattern_def_stmt = NULL;
335 stmt_vec_info pattern_def_stmt_info = NULL;
337 while (!gsi_end_p (pattern_def_si))
339 pattern_def_stmt = gsi_stmt (pattern_def_si);
340 pattern_def_stmt_info
341 = vinfo_for_stmt (pattern_def_stmt);
342 if (STMT_VINFO_RELEVANT_P (pattern_def_stmt_info)
343 || STMT_VINFO_LIVE_P (pattern_def_stmt_info))
344 break;
345 gsi_next (&pattern_def_si);
348 if (!gsi_end_p (pattern_def_si))
350 if (dump_enabled_p ())
352 dump_printf_loc (MSG_NOTE, vect_location,
353 "==> examining pattern def stmt: ");
354 dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
355 pattern_def_stmt, 0);
356 dump_printf (MSG_NOTE, "\n");
359 stmt = pattern_def_stmt;
360 stmt_info = pattern_def_stmt_info;
362 else
364 pattern_def_si = gsi_none ();
365 analyze_pattern_stmt = false;
368 else
369 analyze_pattern_stmt = false;
372 if (gimple_get_lhs (stmt) == NULL_TREE
373 /* MASK_STORE has no lhs, but is ok. */
374 && (!is_gimple_call (stmt)
375 || !gimple_call_internal_p (stmt)
376 || gimple_call_internal_fn (stmt) != IFN_MASK_STORE))
378 if (is_gimple_call (stmt))
380 /* Ignore calls with no lhs. These must be calls to
381 #pragma omp simd functions; the vectorization factor
382 they really need can't be determined until
383 vectorizable_simd_clone_call. */
384 if (!analyze_pattern_stmt && gsi_end_p (pattern_def_si))
386 pattern_def_seq = NULL;
387 gsi_next (&si);
389 continue;
391 if (dump_enabled_p ())
393 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
394 "not vectorized: irregular stmt.");
395 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt,
397 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
399 return false;
402 if (VECTOR_MODE_P (TYPE_MODE (gimple_expr_type (stmt))))
404 if (dump_enabled_p ())
406 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
407 "not vectorized: vector stmt in loop:");
408 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
409 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
411 return false;
414 bool_result = false;
416 if (STMT_VINFO_VECTYPE (stmt_info))
418 /* The only case in which a vectype has already been set is for stmts
419 that contain a dataref, or for "pattern-stmts" (stmts
420 generated by the vectorizer to represent/replace a certain
421 idiom). */
422 gcc_assert (STMT_VINFO_DATA_REF (stmt_info)
423 || is_pattern_stmt_p (stmt_info)
424 || !gsi_end_p (pattern_def_si));
425 vectype = STMT_VINFO_VECTYPE (stmt_info);
427 else
429 gcc_assert (!STMT_VINFO_DATA_REF (stmt_info));
430 if (is_gimple_call (stmt)
431 && gimple_call_internal_p (stmt)
432 && gimple_call_internal_fn (stmt) == IFN_MASK_STORE)
433 scalar_type = TREE_TYPE (gimple_call_arg (stmt, 3));
434 else
435 scalar_type = TREE_TYPE (gimple_get_lhs (stmt));
437 /* Bool ops don't participate in vectorization factor
438 computation. For comparisons, use the compared types to
439 compute a factor. */
440 if (TREE_CODE (scalar_type) == BOOLEAN_TYPE)
442 if (STMT_VINFO_RELEVANT_P (stmt_info))
443 mask_producers.safe_push (stmt_info);
444 bool_result = true;
446 if (gimple_code (stmt) == GIMPLE_ASSIGN
447 && TREE_CODE_CLASS (gimple_assign_rhs_code (stmt))
448 == tcc_comparison
449 && TREE_CODE (TREE_TYPE (gimple_assign_rhs1 (stmt)))
450 != BOOLEAN_TYPE)
451 scalar_type = TREE_TYPE (gimple_assign_rhs1 (stmt));
452 else
454 if (!analyze_pattern_stmt && gsi_end_p (pattern_def_si))
456 pattern_def_seq = NULL;
457 gsi_next (&si);
459 continue;
463 if (dump_enabled_p ())
465 dump_printf_loc (MSG_NOTE, vect_location,
466 "get vectype for scalar type: ");
467 dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
468 dump_printf (MSG_NOTE, "\n");
470 vectype = get_vectype_for_scalar_type (scalar_type);
471 if (!vectype)
473 if (dump_enabled_p ())
475 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
476 "not vectorized: unsupported "
477 "data-type ");
478 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
479 scalar_type);
480 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
482 return false;
485 if (!bool_result)
486 STMT_VINFO_VECTYPE (stmt_info) = vectype;
488 if (dump_enabled_p ())
490 dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
491 dump_generic_expr (MSG_NOTE, TDF_SLIM, vectype);
492 dump_printf (MSG_NOTE, "\n");
496 /* Don't try to compute the VF from scalar types if the stmt
497 produces a boolean vector. Use the result vectype instead. */
498 if (VECTOR_BOOLEAN_TYPE_P (vectype))
499 vf_vectype = vectype;
500 else
502 /* The vectorization factor is determined by the smallest
503 scalar type (or the largest vector size, but we only
504 support one vector size per loop). */
505 if (!bool_result)
506 scalar_type = vect_get_smallest_scalar_type (stmt, &dummy,
507 &dummy);
508 if (dump_enabled_p ())
510 dump_printf_loc (MSG_NOTE, vect_location,
511 "get vectype for scalar type: ");
512 dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
513 dump_printf (MSG_NOTE, "\n");
515 vf_vectype = get_vectype_for_scalar_type (scalar_type);
517 if (!vf_vectype)
519 if (dump_enabled_p ())
521 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
522 "not vectorized: unsupported data-type ");
523 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
524 scalar_type);
525 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
527 return false;
530 if ((GET_MODE_SIZE (TYPE_MODE (vectype))
531 != GET_MODE_SIZE (TYPE_MODE (vf_vectype))))
533 if (dump_enabled_p ())
535 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
536 "not vectorized: different sized vector "
537 "types in statement, ");
538 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
539 vectype);
540 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
541 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
542 vf_vectype);
543 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
545 return false;
548 if (dump_enabled_p ())
550 dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
551 dump_generic_expr (MSG_NOTE, TDF_SLIM, vf_vectype);
552 dump_printf (MSG_NOTE, "\n");
555 nunits = TYPE_VECTOR_SUBPARTS (vf_vectype);
556 if (dump_enabled_p ())
557 dump_printf_loc (MSG_NOTE, vect_location, "nunits = %d\n", nunits);
558 if (!vectorization_factor
559 || (nunits > vectorization_factor))
560 vectorization_factor = nunits;
562 if (!analyze_pattern_stmt && gsi_end_p (pattern_def_si))
564 pattern_def_seq = NULL;
565 gsi_next (&si);
570 /* TODO: Analyze cost. Decide if worthwhile to vectorize. */
571 if (dump_enabled_p ())
572 dump_printf_loc (MSG_NOTE, vect_location, "vectorization factor = %d\n",
573 vectorization_factor);
574 if (vectorization_factor <= 1)
576 if (dump_enabled_p ())
577 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
578 "not vectorized: unsupported data-type\n");
579 return false;
581 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
583 for (i = 0; i < mask_producers.length (); i++)
585 tree mask_type = NULL;
587 stmt = STMT_VINFO_STMT (mask_producers[i]);
589 if (gimple_code (stmt) == GIMPLE_ASSIGN
590 && TREE_CODE_CLASS (gimple_assign_rhs_code (stmt)) == tcc_comparison
591 && TREE_CODE (TREE_TYPE (gimple_assign_rhs1 (stmt))) != BOOLEAN_TYPE)
593 scalar_type = TREE_TYPE (gimple_assign_rhs1 (stmt));
594 mask_type = get_mask_type_for_scalar_type (scalar_type);
596 if (!mask_type)
598 if (dump_enabled_p ())
599 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
600 "not vectorized: unsupported mask\n");
601 return false;
604 else
606 tree rhs;
607 ssa_op_iter iter;
608 gimple *def_stmt;
609 enum vect_def_type dt;
611 FOR_EACH_SSA_TREE_OPERAND (rhs, stmt, iter, SSA_OP_USE)
613 if (!vect_is_simple_use (rhs, mask_producers[i]->vinfo,
614 &def_stmt, &dt, &vectype))
616 if (dump_enabled_p ())
618 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
619 "not vectorized: can't compute mask type "
620 "for statement, ");
621 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt,
623 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
625 return false;
628 /* No vectype probably means external definition.
629 Allow it in case there is another operand from which
630 the mask type can be determined. */
631 if (!vectype)
632 continue;
634 if (!mask_type)
635 mask_type = vectype;
636 else if (TYPE_VECTOR_SUBPARTS (mask_type)
637 != TYPE_VECTOR_SUBPARTS (vectype))
639 if (dump_enabled_p ())
641 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
642 "not vectorized: different sized masks "
643 "types in statement, ");
644 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
645 mask_type);
646 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
647 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
648 vectype);
649 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
651 return false;
653 else if (VECTOR_BOOLEAN_TYPE_P (mask_type)
654 != VECTOR_BOOLEAN_TYPE_P (vectype))
656 if (dump_enabled_p ())
658 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
659 "not vectorized: mixed mask and "
660 "nonmask vector types in statement, ");
661 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
662 mask_type);
663 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
664 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
665 vectype);
666 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
668 return false;
672 /* We may compare a boolean value loaded as a vector of integers.
673 Fix mask_type in such a case. */
674 if (mask_type
675 && !VECTOR_BOOLEAN_TYPE_P (mask_type)
676 && gimple_code (stmt) == GIMPLE_ASSIGN
677 && TREE_CODE_CLASS (gimple_assign_rhs_code (stmt)) == tcc_comparison)
678 mask_type = build_same_sized_truth_vector_type (mask_type);
681 /* No mask_type should mean a loop-invariant predicate.
682 This is probably a subject for optimization in
683 if-conversion. */
684 if (!mask_type)
686 if (dump_enabled_p ())
688 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
689 "not vectorized: can't compute mask type "
690 "for statement, ");
691 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt,
693 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
695 return false;
698 STMT_VINFO_VECTYPE (mask_producers[i]) = mask_type;
701 return true;
705 /* Function vect_is_simple_iv_evolution.
707 FORNOW: A simple evolution of an induction variable in the loop is
708 considered a polynomial evolution. */
710 static bool
711 vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init,
712 tree * step)
714 tree init_expr;
715 tree step_expr;
716 tree evolution_part = evolution_part_in_loop_num (access_fn, loop_nb);
717 basic_block bb;
719 /* When there is no evolution in this loop, the evolution function
720 is not "simple". */
721 if (evolution_part == NULL_TREE)
722 return false;
724 /* When the evolution is a polynomial of degree >= 2
725 the evolution function is not "simple". */
726 if (tree_is_chrec (evolution_part))
727 return false;
729 step_expr = evolution_part;
730 init_expr = unshare_expr (initial_condition_in_loop_num (access_fn, loop_nb));
732 if (dump_enabled_p ())
734 dump_printf_loc (MSG_NOTE, vect_location, "step: ");
735 dump_generic_expr (MSG_NOTE, TDF_SLIM, step_expr);
736 dump_printf (MSG_NOTE, ", init: ");
737 dump_generic_expr (MSG_NOTE, TDF_SLIM, init_expr);
738 dump_printf (MSG_NOTE, "\n");
741 *init = init_expr;
742 *step = step_expr;
744 if (TREE_CODE (step_expr) != INTEGER_CST
745 && (TREE_CODE (step_expr) != SSA_NAME
746 || ((bb = gimple_bb (SSA_NAME_DEF_STMT (step_expr)))
747 && flow_bb_inside_loop_p (get_loop (cfun, loop_nb), bb))
748 || (!INTEGRAL_TYPE_P (TREE_TYPE (step_expr))
749 && (!SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr))
750 || !flag_associative_math)))
751 && (TREE_CODE (step_expr) != REAL_CST
752 || !flag_associative_math))
754 if (dump_enabled_p ())
755 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
756 "step unknown.\n");
757 return false;
760 return true;
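/* An illustrative example of the evolutions accepted above, using the usual
   scev chrec notation {init, +, step}_loopnum:

     for (i = 0; i < n; i++)      access_fn of i is {0, +, 1}_1
       sum = sum + a[i];

   The evolution part is the constant step 1, so *INIT = 0 and *STEP = 1; an
   evolution whose step is itself a chrec (a polynomial of degree >= 2) is
   rejected.  */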
763 /* Function vect_analyze_scalar_cycles_1.
765 Examine the cross iteration def-use cycles of scalar variables
766 in LOOP. LOOP_VINFO represents the loop that is now being
767 considered for vectorization (can be LOOP, or an outer-loop
768 enclosing LOOP). */
770 static void
771 vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, struct loop *loop)
773 basic_block bb = loop->header;
774 tree init, step;
775 auto_vec<gimple *, 64> worklist;
776 gphi_iterator gsi;
777 bool double_reduc;
779 if (dump_enabled_p ())
780 dump_printf_loc (MSG_NOTE, vect_location,
781 "=== vect_analyze_scalar_cycles ===\n");
783 /* First - identify all inductions. Reduction detection assumes that all the
784 inductions have been identified; therefore, this order must not be
785 changed. */
786 for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
788 gphi *phi = gsi.phi ();
789 tree access_fn = NULL;
790 tree def = PHI_RESULT (phi);
791 stmt_vec_info stmt_vinfo = vinfo_for_stmt (phi);
793 if (dump_enabled_p ())
795 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: ");
796 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
797 dump_printf (MSG_NOTE, "\n");
800 /* Skip virtual phi's. The data dependences that are associated with
801 virtual defs/uses (i.e., memory accesses) are analyzed elsewhere. */
802 if (virtual_operand_p (def))
803 continue;
805 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_unknown_def_type;
807 /* Analyze the evolution function. */
808 access_fn = analyze_scalar_evolution (loop, def);
809 if (access_fn)
811 STRIP_NOPS (access_fn);
812 if (dump_enabled_p ())
814 dump_printf_loc (MSG_NOTE, vect_location,
815 "Access function of PHI: ");
816 dump_generic_expr (MSG_NOTE, TDF_SLIM, access_fn);
817 dump_printf (MSG_NOTE, "\n");
819 STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
820 = initial_condition_in_loop_num (access_fn, loop->num);
821 STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo)
822 = evolution_part_in_loop_num (access_fn, loop->num);
825 if (!access_fn
826 || !vect_is_simple_iv_evolution (loop->num, access_fn, &init, &step)
827 || (LOOP_VINFO_LOOP (loop_vinfo) != loop
828 && TREE_CODE (step) != INTEGER_CST))
830 worklist.safe_push (phi);
831 continue;
834 gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
835 != NULL_TREE);
836 gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo) != NULL_TREE);
838 if (dump_enabled_p ())
839 dump_printf_loc (MSG_NOTE, vect_location, "Detected induction.\n");
840 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_induction_def;
844 /* Second - identify all reductions and nested cycles. */
845 while (worklist.length () > 0)
847 gimple *phi = worklist.pop ();
848 tree def = PHI_RESULT (phi);
849 stmt_vec_info stmt_vinfo = vinfo_for_stmt (phi);
850 gimple *reduc_stmt;
851 bool nested_cycle;
853 if (dump_enabled_p ())
855 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: ");
856 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
857 dump_printf (MSG_NOTE, "\n");
860 gcc_assert (!virtual_operand_p (def)
861 && STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_unknown_def_type);
863 nested_cycle = (loop != LOOP_VINFO_LOOP (loop_vinfo));
864 reduc_stmt = vect_force_simple_reduction (loop_vinfo, phi, !nested_cycle,
865 &double_reduc, false);
866 if (reduc_stmt)
868 if (double_reduc)
870 if (dump_enabled_p ())
871 dump_printf_loc (MSG_NOTE, vect_location,
872 "Detected double reduction.\n");
874 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_double_reduction_def;
875 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
876 vect_double_reduction_def;
878 else
880 if (nested_cycle)
882 if (dump_enabled_p ())
883 dump_printf_loc (MSG_NOTE, vect_location,
884 "Detected vectorizable nested cycle.\n");
886 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_nested_cycle;
887 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
888 vect_nested_cycle;
890 else
892 if (dump_enabled_p ())
893 dump_printf_loc (MSG_NOTE, vect_location,
894 "Detected reduction.\n");
896 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def;
897 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
898 vect_reduction_def;
899 /* Store the reduction cycles for possible vectorization in
900 loop-aware SLP. */
901 LOOP_VINFO_REDUCTIONS (loop_vinfo).safe_push (reduc_stmt);
905 else
906 if (dump_enabled_p ())
907 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
908 "Unknown def-use cycle pattern.\n");
913 /* Function vect_analyze_scalar_cycles.
915 Examine the cross iteration def-use cycles of scalar variables, by
916 analyzing the loop-header PHIs of scalar variables. Classify each
917 cycle as one of the following: invariant, induction, reduction, unknown.
918 We do that for the loop represented by LOOP_VINFO, and also for its
919 inner-loop, if it exists.
920 Examples for scalar cycles:
922 Example1: reduction:
924 loop1:
925 for (i=0; i<N; i++)
926 sum += a[i];
928 Example2: induction:
930 loop2:
931 for (i=0; i<N; i++)
932 a[i] = i; */
934 static void
935 vect_analyze_scalar_cycles (loop_vec_info loop_vinfo)
937 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
939 vect_analyze_scalar_cycles_1 (loop_vinfo, loop);
941 /* When vectorizing an outer-loop, the inner-loop is executed sequentially.
942 Reductions in such an inner-loop therefore have different properties than
943 the reductions in the nest that gets vectorized:
944 1. When vectorized, they are executed in the same order as in the original
945 scalar loop, so we can't change the order of computation when
946 vectorizing them.
947 2. FIXME: Inner-loop reductions can be used in the inner-loop, so the
948 current checks are too strict. */
950 if (loop->inner)
951 vect_analyze_scalar_cycles_1 (loop_vinfo, loop->inner);
954 /* Transfer group and reduction information from STMT to its pattern stmt. */
956 static void
957 vect_fixup_reduc_chain (gimple *stmt)
959 gimple *firstp = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt));
960 gimple *stmtp;
961 gcc_assert (!GROUP_FIRST_ELEMENT (vinfo_for_stmt (firstp))
962 && GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)));
963 GROUP_SIZE (vinfo_for_stmt (firstp)) = GROUP_SIZE (vinfo_for_stmt (stmt));
966 stmtp = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt));
967 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmtp)) = firstp;
968 stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmt));
969 if (stmt)
970 GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmtp))
971 = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt));
973 while (stmt);
974 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmtp)) = vect_reduction_def;
977 /* Fixup scalar cycles that now have their stmts detected as patterns. */
979 static void
980 vect_fixup_scalar_cycles_with_patterns (loop_vec_info loop_vinfo)
982 gimple *first;
983 unsigned i;
985 FOR_EACH_VEC_ELT (LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo), i, first)
986 if (STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (first)))
988 vect_fixup_reduc_chain (first);
989 LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo)[i]
990 = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (first));
994 /* Function vect_get_loop_niters.
996 Determine how many iterations the loop executes and place that
997 in NUMBER_OF_ITERATIONS. Place the number of latch iterations
998 in NUMBER_OF_ITERATIONSM1.
1000 Return the loop exit condition. */
1003 static gcond *
1004 vect_get_loop_niters (struct loop *loop, tree *number_of_iterations,
1005 tree *number_of_iterationsm1)
1007 tree niters;
1009 if (dump_enabled_p ())
1010 dump_printf_loc (MSG_NOTE, vect_location,
1011 "=== get_loop_niters ===\n");
1013 niters = number_of_latch_executions (loop);
1014 *number_of_iterationsm1 = niters;
1016 /* We want the number of loop header executions which is the number
1017 of latch executions plus one.
1018 ??? For UINT_MAX latch executions this number overflows to zero
1019 for loops like do { n++; } while (n != 0); */
1020 if (niters && !chrec_contains_undetermined (niters))
1021 niters = fold_build2 (PLUS_EXPR, TREE_TYPE (niters), unshare_expr (niters),
1022 build_int_cst (TREE_TYPE (niters), 1));
1023 *number_of_iterations = niters;
1025 return get_loop_exit_condition (loop);
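/* For illustration: for a countable loop whose body executes 16 times,
   number_of_latch_executions returns 15, so the function above sets
   *NUMBER_OF_ITERATIONSM1 = 15 and *NUMBER_OF_ITERATIONS = 15 + 1 = 16
   (modulo the UINT_MAX overflow caveat noted in the code).  */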
1029 /* Function bb_in_loop_p
1031 Used as predicate for dfs order traversal of the loop bbs. */
1033 static bool
1034 bb_in_loop_p (const_basic_block bb, const void *data)
1036 const struct loop *const loop = (const struct loop *)data;
1037 if (flow_bb_inside_loop_p (loop, bb))
1038 return true;
1039 return false;
1043 /* Function new_loop_vec_info.
1045 Create and initialize a new loop_vec_info struct for LOOP, as well as
1046 stmt_vec_info structs for all the stmts in LOOP. */
1048 static loop_vec_info
1049 new_loop_vec_info (struct loop *loop)
1051 loop_vec_info res;
1052 basic_block *bbs;
1053 gimple_stmt_iterator si;
1054 unsigned int i, nbbs;
1056 res = (loop_vec_info) xcalloc (1, sizeof (struct _loop_vec_info));
1057 res->kind = vec_info::loop;
1058 LOOP_VINFO_LOOP (res) = loop;
1060 bbs = get_loop_body (loop);
1062 /* Create/Update stmt_info for all stmts in the loop. */
1063 for (i = 0; i < loop->num_nodes; i++)
1065 basic_block bb = bbs[i];
1067 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
1069 gimple *phi = gsi_stmt (si);
1070 gimple_set_uid (phi, 0);
1071 set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, res));
1074 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1076 gimple *stmt = gsi_stmt (si);
1077 gimple_set_uid (stmt, 0);
1078 set_vinfo_for_stmt (stmt, new_stmt_vec_info (stmt, res));
1082 /* CHECKME: We want to visit all BBs before their successors (except for
1083 latch blocks, for which this assertion wouldn't hold). In the simple
1084 case of the loop forms we allow, a dfs order of the BBs would be the same
1085 as reversed postorder traversal, so we are safe. */
1087 free (bbs);
1088 bbs = XCNEWVEC (basic_block, loop->num_nodes);
1089 nbbs = dfs_enumerate_from (loop->header, 0, bb_in_loop_p,
1090 bbs, loop->num_nodes, loop);
1091 gcc_assert (nbbs == loop->num_nodes);
1093 LOOP_VINFO_BBS (res) = bbs;
1094 LOOP_VINFO_NITERSM1 (res) = NULL;
1095 LOOP_VINFO_NITERS (res) = NULL;
1096 LOOP_VINFO_NITERS_UNCHANGED (res) = NULL;
1097 LOOP_VINFO_COST_MODEL_THRESHOLD (res) = 0;
1098 LOOP_VINFO_VECTORIZABLE_P (res) = 0;
1099 LOOP_VINFO_PEELING_FOR_ALIGNMENT (res) = 0;
1100 LOOP_VINFO_VECT_FACTOR (res) = 0;
1101 LOOP_VINFO_LOOP_NEST (res) = vNULL;
1102 LOOP_VINFO_DATAREFS (res) = vNULL;
1103 LOOP_VINFO_DDRS (res) = vNULL;
1104 LOOP_VINFO_UNALIGNED_DR (res) = NULL;
1105 LOOP_VINFO_MAY_MISALIGN_STMTS (res) = vNULL;
1106 LOOP_VINFO_MAY_ALIAS_DDRS (res) = vNULL;
1107 LOOP_VINFO_GROUPED_STORES (res) = vNULL;
1108 LOOP_VINFO_REDUCTIONS (res) = vNULL;
1109 LOOP_VINFO_REDUCTION_CHAINS (res) = vNULL;
1110 LOOP_VINFO_SLP_INSTANCES (res) = vNULL;
1111 LOOP_VINFO_SLP_UNROLLING_FACTOR (res) = 1;
1112 LOOP_VINFO_TARGET_COST_DATA (res) = init_cost (loop);
1113 LOOP_VINFO_PEELING_FOR_GAPS (res) = false;
1114 LOOP_VINFO_PEELING_FOR_NITER (res) = false;
1115 LOOP_VINFO_OPERANDS_SWAPPED (res) = false;
1117 return res;
1121 /* Function destroy_loop_vec_info.
1123 Free LOOP_VINFO struct, as well as all the stmt_vec_info structs of all the
1124 stmts in the loop. */
1126 void
1127 destroy_loop_vec_info (loop_vec_info loop_vinfo, bool clean_stmts)
1129 struct loop *loop;
1130 basic_block *bbs;
1131 int nbbs;
1132 gimple_stmt_iterator si;
1133 int j;
1134 vec<slp_instance> slp_instances;
1135 slp_instance instance;
1136 bool swapped;
1138 if (!loop_vinfo)
1139 return;
1141 loop = LOOP_VINFO_LOOP (loop_vinfo);
1143 bbs = LOOP_VINFO_BBS (loop_vinfo);
1144 nbbs = clean_stmts ? loop->num_nodes : 0;
1145 swapped = LOOP_VINFO_OPERANDS_SWAPPED (loop_vinfo);
1147 for (j = 0; j < nbbs; j++)
1149 basic_block bb = bbs[j];
1150 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
1151 free_stmt_vec_info (gsi_stmt (si));
1153 for (si = gsi_start_bb (bb); !gsi_end_p (si); )
1155 gimple *stmt = gsi_stmt (si);
1157 /* We may have broken canonical form by moving a constant
1158 into RHS1 of a commutative op. Fix such occurrences. */
1159 if (swapped && is_gimple_assign (stmt))
1161 enum tree_code code = gimple_assign_rhs_code (stmt);
1163 if ((code == PLUS_EXPR
1164 || code == POINTER_PLUS_EXPR
1165 || code == MULT_EXPR)
1166 && CONSTANT_CLASS_P (gimple_assign_rhs1 (stmt)))
1167 swap_ssa_operands (stmt,
1168 gimple_assign_rhs1_ptr (stmt),
1169 gimple_assign_rhs2_ptr (stmt));
1172 /* Free stmt_vec_info. */
1173 free_stmt_vec_info (stmt);
1174 gsi_next (&si);
1178 free (LOOP_VINFO_BBS (loop_vinfo));
1179 vect_destroy_datarefs (loop_vinfo);
1180 free_dependence_relations (LOOP_VINFO_DDRS (loop_vinfo));
1181 LOOP_VINFO_LOOP_NEST (loop_vinfo).release ();
1182 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).release ();
1183 LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).release ();
1184 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).release ();
1185 slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
1186 FOR_EACH_VEC_ELT (slp_instances, j, instance)
1187 vect_free_slp_instance (instance);
1189 LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
1190 LOOP_VINFO_GROUPED_STORES (loop_vinfo).release ();
1191 LOOP_VINFO_REDUCTIONS (loop_vinfo).release ();
1192 LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).release ();
1194 destroy_cost_data (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo));
1195 loop_vinfo->scalar_cost_vec.release ();
1197 free (loop_vinfo);
1198 loop->aux = NULL;
1202 /* Calculate the cost of one scalar iteration of the loop. */
1203 static void
1204 vect_compute_single_scalar_iteration_cost (loop_vec_info loop_vinfo)
1206 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1207 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1208 int nbbs = loop->num_nodes, factor, scalar_single_iter_cost = 0;
1209 int innerloop_iters, i;
1211 /* Count statements in the scalar loop. Use this as the scalar cost of a single
1212 iteration for now.
1214 TODO: Add outer loop support.
1216 TODO: Consider assigning different costs to different scalar
1217 statements. */
1219 /* FORNOW. */
1220 innerloop_iters = 1;
1221 if (loop->inner)
1222 innerloop_iters = 50; /* FIXME */
1224 for (i = 0; i < nbbs; i++)
1226 gimple_stmt_iterator si;
1227 basic_block bb = bbs[i];
1229 if (bb->loop_father == loop->inner)
1230 factor = innerloop_iters;
1231 else
1232 factor = 1;
1234 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1236 gimple *stmt = gsi_stmt (si);
1237 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1239 if (!is_gimple_assign (stmt) && !is_gimple_call (stmt))
1240 continue;
1242 /* Skip stmts that are not vectorized inside the loop. */
1243 if (stmt_info
1244 && !STMT_VINFO_RELEVANT_P (stmt_info)
1245 && (!STMT_VINFO_LIVE_P (stmt_info)
1246 || !VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1247 && !STMT_VINFO_IN_PATTERN_P (stmt_info))
1248 continue;
1250 vect_cost_for_stmt kind;
1251 if (STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt)))
1253 if (DR_IS_READ (STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt))))
1254 kind = scalar_load;
1255 else
1256 kind = scalar_store;
1258 else
1259 kind = scalar_stmt;
1261 scalar_single_iter_cost
1262 += record_stmt_cost (&LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1263 factor, kind, NULL, 0, vect_prologue);
1266 LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo)
1267 = scalar_single_iter_cost;
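/* For illustration, assuming a single-block loop body (factor == 1) such as

     t1 = b[i];        recorded as scalar_load
     t2 = t1 + 5;      recorded as scalar_stmt
     a[i] = t2;        recorded as scalar_store

   each statement is recorded once via record_stmt_cost; statements in a
   contained inner loop would instead be recorded with factor == 50.  */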
1271 /* Function vect_analyze_loop_form_1.
1273 Verify that certain CFG restrictions hold, including:
1274 - the loop has a pre-header
1275 - the loop has a single entry and exit
1276 - the loop exit condition is simple enough, and the number of iterations
1277 can be analyzed (a countable loop). */
1279 bool
1280 vect_analyze_loop_form_1 (struct loop *loop, gcond **loop_cond,
1281 tree *number_of_iterationsm1,
1282 tree *number_of_iterations, gcond **inner_loop_cond)
1284 if (dump_enabled_p ())
1285 dump_printf_loc (MSG_NOTE, vect_location,
1286 "=== vect_analyze_loop_form ===\n");
1288 /* Different restrictions apply when we are considering an inner-most loop,
1289 vs. an outer (nested) loop.
1290 (FORNOW. May want to relax some of these restrictions in the future). */
1292 if (!loop->inner)
1294 /* Inner-most loop. We currently require that the number of BBs is
1295 exactly 2 (the header and latch). Vectorizable inner-most loops
1296 look like this:
1298 (pre-header)
1300 header <--------+
1301 | | |
1302 | +--> latch --+
1304 (exit-bb) */
1306 if (loop->num_nodes != 2)
1308 if (dump_enabled_p ())
1309 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1310 "not vectorized: control flow in loop.\n");
1311 return false;
1314 if (empty_block_p (loop->header))
1316 if (dump_enabled_p ())
1317 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1318 "not vectorized: empty loop.\n");
1319 return false;
1322 else
1324 struct loop *innerloop = loop->inner;
1325 edge entryedge;
1327 /* Nested loop. We currently require that the loop is doubly-nested,
1328 contains a single inner loop, and the number of BBs is exactly 5.
1329 Vectorizable outer-loops look like this:
1331 (pre-header)
1333 header <---+
1335 inner-loop |
1337 tail ------+
1339 (exit-bb)
1341 The inner-loop has the properties expected of inner-most loops
1342 as described above. */
1344 if ((loop->inner)->inner || (loop->inner)->next)
1346 if (dump_enabled_p ())
1347 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1348 "not vectorized: multiple nested loops.\n");
1349 return false;
1352 if (loop->num_nodes != 5)
1354 if (dump_enabled_p ())
1355 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1356 "not vectorized: control flow in loop.\n");
1357 return false;
1360 entryedge = loop_preheader_edge (innerloop);
1361 if (entryedge->src != loop->header
1362 || !single_exit (innerloop)
1363 || single_exit (innerloop)->dest != EDGE_PRED (loop->latch, 0)->src)
1365 if (dump_enabled_p ())
1366 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1367 "not vectorized: unsupported outerloop form.\n");
1368 return false;
1371 /* Analyze the inner-loop. */
1372 tree inner_niterm1, inner_niter;
1373 if (! vect_analyze_loop_form_1 (loop->inner, inner_loop_cond,
1374 &inner_niterm1, &inner_niter, NULL))
1376 if (dump_enabled_p ())
1377 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1378 "not vectorized: Bad inner loop.\n");
1379 return false;
1382 if (!expr_invariant_in_loop_p (loop, inner_niter))
1384 if (dump_enabled_p ())
1385 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1386 "not vectorized: inner-loop count not"
1387 " invariant.\n");
1388 return false;
1391 if (dump_enabled_p ())
1392 dump_printf_loc (MSG_NOTE, vect_location,
1393 "Considering outer-loop vectorization.\n");
1396 if (!single_exit (loop)
1397 || EDGE_COUNT (loop->header->preds) != 2)
1399 if (dump_enabled_p ())
1401 if (!single_exit (loop))
1402 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1403 "not vectorized: multiple exits.\n");
1404 else if (EDGE_COUNT (loop->header->preds) != 2)
1405 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1406 "not vectorized: too many incoming edges.\n");
1408 return false;
1411 /* We assume that the loop exit condition is at the end of the loop, i.e.,
1412 that the loop is represented as a do-while (with a proper if-guard
1413 before the loop if needed), where the loop header contains all the
1414 executable statements, and the latch is empty. */
1415 if (!empty_block_p (loop->latch)
1416 || !gimple_seq_empty_p (phi_nodes (loop->latch)))
1418 if (dump_enabled_p ())
1419 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1420 "not vectorized: latch block not empty.\n");
1421 return false;
1424 /* Make sure there exists a single-predecessor exit bb: */
1425 if (!single_pred_p (single_exit (loop)->dest))
1427 edge e = single_exit (loop);
1428 if (!(e->flags & EDGE_ABNORMAL))
1430 split_loop_exit_edge (e);
1431 if (dump_enabled_p ())
1432 dump_printf (MSG_NOTE, "split exit edge.\n");
1434 else
1436 if (dump_enabled_p ())
1437 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1438 "not vectorized: abnormal loop exit edge.\n");
1439 return false;
1443 *loop_cond = vect_get_loop_niters (loop, number_of_iterations,
1444 number_of_iterationsm1);
1445 if (!*loop_cond)
1447 if (dump_enabled_p ())
1448 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1449 "not vectorized: complicated exit condition.\n");
1450 return false;
1453 if (!*number_of_iterations
1454 || chrec_contains_undetermined (*number_of_iterations))
1456 if (dump_enabled_p ())
1457 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1458 "not vectorized: number of iterations cannot be "
1459 "computed.\n");
1460 return false;
1463 if (integer_zerop (*number_of_iterations))
1465 if (dump_enabled_p ())
1466 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1467 "not vectorized: number of iterations = 0.\n");
1468 return false;
1471 return true;
1474 /* Analyze LOOP form and return a loop_vec_info if it is of suitable form. */
1476 loop_vec_info
1477 vect_analyze_loop_form (struct loop *loop)
1479 tree number_of_iterations, number_of_iterationsm1;
1480 gcond *loop_cond, *inner_loop_cond = NULL;
1482 if (! vect_analyze_loop_form_1 (loop, &loop_cond, &number_of_iterationsm1,
1483 &number_of_iterations, &inner_loop_cond))
1484 return NULL;
1486 loop_vec_info loop_vinfo = new_loop_vec_info (loop);
1487 LOOP_VINFO_NITERSM1 (loop_vinfo) = number_of_iterationsm1;
1488 LOOP_VINFO_NITERS (loop_vinfo) = number_of_iterations;
1489 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = number_of_iterations;
1491 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1493 if (dump_enabled_p ())
1495 dump_printf_loc (MSG_NOTE, vect_location,
1496 "Symbolic number of iterations is ");
1497 dump_generic_expr (MSG_NOTE, TDF_DETAILS, number_of_iterations);
1498 dump_printf (MSG_NOTE, "\n");
1502 STMT_VINFO_TYPE (vinfo_for_stmt (loop_cond)) = loop_exit_ctrl_vec_info_type;
1503 if (inner_loop_cond)
1504 STMT_VINFO_TYPE (vinfo_for_stmt (inner_loop_cond))
1505 = loop_exit_ctrl_vec_info_type;
1507 gcc_assert (!loop->aux);
1508 loop->aux = loop_vinfo;
1509 return loop_vinfo;
1514 /* Scan the loop stmts and, depending on whether there are any (non-)SLP
1515 statements, update the vectorization factor. */
1517 static void
1518 vect_update_vf_for_slp (loop_vec_info loop_vinfo)
1520 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1521 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1522 int nbbs = loop->num_nodes;
1523 unsigned int vectorization_factor;
1524 int i;
1526 if (dump_enabled_p ())
1527 dump_printf_loc (MSG_NOTE, vect_location,
1528 "=== vect_update_vf_for_slp ===\n");
1530 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1531 gcc_assert (vectorization_factor != 0);
1533 /* If all the stmts in the loop can be SLPed, we perform only SLP, and
1534 the vectorization factor of the loop is the unrolling factor required by
1535 the SLP instances. If that unrolling factor is 1, we say that we
1536 perform pure SLP on the loop - cross-iteration parallelism is not
1537 exploited. */
1538 bool only_slp_in_loop = true;
1539 for (i = 0; i < nbbs; i++)
1541 basic_block bb = bbs[i];
1542 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1543 gsi_next (&si))
1545 gimple *stmt = gsi_stmt (si);
1546 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1547 if (STMT_VINFO_IN_PATTERN_P (stmt_info)
1548 && STMT_VINFO_RELATED_STMT (stmt_info))
1550 stmt = STMT_VINFO_RELATED_STMT (stmt_info);
1551 stmt_info = vinfo_for_stmt (stmt);
1553 if ((STMT_VINFO_RELEVANT_P (stmt_info)
1554 || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1555 && !PURE_SLP_STMT (stmt_info))
1556 /* STMT needs both SLP and loop-based vectorization. */
1557 only_slp_in_loop = false;
1561 if (only_slp_in_loop)
1562 vectorization_factor = LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo);
1563 else
1564 vectorization_factor
1565 = least_common_multiple (vectorization_factor,
1566 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo));
1568 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
1569 if (dump_enabled_p ())
1570 dump_printf_loc (MSG_NOTE, vect_location,
1571 "Updating vectorization factor to %d\n",
1572 vectorization_factor);
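/* For illustration, with hypothetical values: a loop VF of 4 and an SLP
   unrolling factor of 2 keep a hybrid loop at VF = lcm (4, 2) = 4, while an
   unrolling factor of 3 would raise it to lcm (4, 3) = 12; if every stmt is
   pure SLP, VF simply becomes the SLP unrolling factor itself.  */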
1575 /* Function vect_analyze_loop_operations.
1577 Scan the loop stmts and make sure they are all vectorizable. */
1579 static bool
1580 vect_analyze_loop_operations (loop_vec_info loop_vinfo)
1582 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1583 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1584 int nbbs = loop->num_nodes;
1585 int i;
1586 stmt_vec_info stmt_info;
1587 bool need_to_vectorize = false;
1588 bool ok;
1590 if (dump_enabled_p ())
1591 dump_printf_loc (MSG_NOTE, vect_location,
1592 "=== vect_analyze_loop_operations ===\n");
1594 for (i = 0; i < nbbs; i++)
1596 basic_block bb = bbs[i];
1598 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
1599 gsi_next (&si))
1601 gphi *phi = si.phi ();
1602 ok = true;
1604 stmt_info = vinfo_for_stmt (phi);
1605 if (dump_enabled_p ())
1607 dump_printf_loc (MSG_NOTE, vect_location, "examining phi: ");
1608 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
1609 dump_printf (MSG_NOTE, "\n");
1611 if (virtual_operand_p (gimple_phi_result (phi)))
1612 continue;
1614 /* Inner-loop loop-closed exit phi in outer-loop vectorization
1615 (i.e., a phi in the tail of the outer-loop). */
1616 if (! is_loop_header_bb_p (bb))
1618 /* FORNOW: we currently don't support the case that these phis
1619 are not used in the outerloop (unless it is a double reduction,
1620 i.e., this phi is vect_reduction_def), because this case
1621 requires us to actually do something here. */
1622 if ((!STMT_VINFO_RELEVANT_P (stmt_info)
1623 || STMT_VINFO_LIVE_P (stmt_info))
1624 && STMT_VINFO_DEF_TYPE (stmt_info)
1625 != vect_double_reduction_def)
1627 if (dump_enabled_p ())
1628 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1629 "Unsupported loop-closed phi in "
1630 "outer-loop.\n");
1631 return false;
1634 /* If PHI is used in the outer loop, we check that its operand
1635 is defined in the inner loop. */
1636 if (STMT_VINFO_RELEVANT_P (stmt_info))
1638 tree phi_op;
1639 gimple *op_def_stmt;
1641 if (gimple_phi_num_args (phi) != 1)
1642 return false;
1644 phi_op = PHI_ARG_DEF (phi, 0);
1645 if (TREE_CODE (phi_op) != SSA_NAME)
1646 return false;
1648 op_def_stmt = SSA_NAME_DEF_STMT (phi_op);
1649 if (gimple_nop_p (op_def_stmt)
1650 || !flow_bb_inside_loop_p (loop, gimple_bb (op_def_stmt))
1651 || !vinfo_for_stmt (op_def_stmt))
1652 return false;
1654 if (STMT_VINFO_RELEVANT (vinfo_for_stmt (op_def_stmt))
1655 != vect_used_in_outer
1656 && STMT_VINFO_RELEVANT (vinfo_for_stmt (op_def_stmt))
1657 != vect_used_in_outer_by_reduction)
1658 return false;
1661 continue;
1664 gcc_assert (stmt_info);
1666 if (STMT_VINFO_LIVE_P (stmt_info))
1668 /* FORNOW: not yet supported. */
1669 if (dump_enabled_p ())
1670 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1671 "not vectorized: value used after loop.\n");
1672 return false;
1675 if (STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope
1676 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
1678 /* A scalar-dependence cycle that we don't support. */
1679 if (dump_enabled_p ())
1680 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1681 "not vectorized: scalar dependence cycle.\n");
1682 return false;
1685 if (STMT_VINFO_RELEVANT_P (stmt_info))
1687 need_to_vectorize = true;
1688 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def)
1689 ok = vectorizable_induction (phi, NULL, NULL);
1692 if (!ok)
1694 if (dump_enabled_p ())
1696 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1697 "not vectorized: relevant phi not "
1698 "supported: ");
1699 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, phi, 0);
1700 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
1702 return false;
1706 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1707 gsi_next (&si))
1709 gimple *stmt = gsi_stmt (si);
1710 if (!gimple_clobber_p (stmt)
1711 && !vect_analyze_stmt (stmt, &need_to_vectorize, NULL))
1712 return false;
1714 } /* bbs */
1716 /* All operations in the loop are either irrelevant (they deal with loop
1717 control, or are dead), or are only used outside the loop and can be moved
1718 out of the loop (e.g. invariants, inductions). The loop can be
1719 optimized away by scalar optimizations. We're better off not
1720 touching this loop. */
1721 if (!need_to_vectorize)
1723 if (dump_enabled_p ())
1724 dump_printf_loc (MSG_NOTE, vect_location,
1725 "All the computation can be taken out of the loop.\n");
1726 if (dump_enabled_p ())
1727 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1728 "not vectorized: redundant loop. no profit to "
1729 "vectorize.\n");
1730 return false;
1733 return true;
1737 /* Function vect_analyze_loop_2.
1739 Apply a set of analyses on LOOP, and create a loop_vec_info struct
1740 for it. The different analyses will record information in the
1741 loop_vec_info struct. */
1742 static bool
1743 vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal)
1745 bool ok;
1746 int max_vf = MAX_VECTORIZATION_FACTOR;
1747 int min_vf = 2;
1748 unsigned int n_stmts = 0;
1750 /* The first group of checks is independent of the vector size. */
1751 fatal = true;
1753 /* Find all data references in the loop (which correspond to vdefs/vuses)
1754 and analyze their evolution in the loop. */
1756 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1758 loop_p loop = LOOP_VINFO_LOOP (loop_vinfo);
1759 if (!find_loop_nest (loop, &LOOP_VINFO_LOOP_NEST (loop_vinfo)))
1761 if (dump_enabled_p ())
1762 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1763 "not vectorized: loop contains function calls"
1764 " or data references that cannot be analyzed\n");
1765 return false;
1768 for (unsigned i = 0; i < loop->num_nodes; i++)
1769 for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
1770 !gsi_end_p (gsi); gsi_next (&gsi))
1772 gimple *stmt = gsi_stmt (gsi);
1773 if (is_gimple_debug (stmt))
1774 continue;
1775 ++n_stmts;
1776 if (!find_data_references_in_stmt (loop, stmt,
1777 &LOOP_VINFO_DATAREFS (loop_vinfo)))
1779 if (is_gimple_call (stmt) && loop->safelen)
1781 tree fndecl = gimple_call_fndecl (stmt), op;
1782 if (fndecl != NULL_TREE)
1784 cgraph_node *node = cgraph_node::get (fndecl);
1785 if (node != NULL && node->simd_clones != NULL)
1787 unsigned int j, n = gimple_call_num_args (stmt);
1788 for (j = 0; j < n; j++)
1790 op = gimple_call_arg (stmt, j);
1791 if (DECL_P (op)
1792 || (REFERENCE_CLASS_P (op)
1793 && get_base_address (op)))
1794 break;
1796 op = gimple_call_lhs (stmt);
1797 /* Ignore #pragma omp declare simd functions
1798 if they don't have data references in the
1799 call stmt itself. */
1800 if (j == n
1801 && !(op
1802 && (DECL_P (op)
1803 || (REFERENCE_CLASS_P (op)
1804 && get_base_address (op)))))
1805 continue;
1809 if (dump_enabled_p ())
1810 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1811 "not vectorized: loop contains function "
1812 "calls or data references that cannot "
1813 "be analyzed\n");
1814 return false;
1818 /* Analyze the data references and also adjust the minimal
1819 vectorization factor according to the loads and stores. */
1821 ok = vect_analyze_data_refs (loop_vinfo, &min_vf);
1822 if (!ok)
1824 if (dump_enabled_p ())
1825 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1826 "bad data references.\n");
1827 return false;
1830 /* Classify all cross-iteration scalar data-flow cycles.
1831 Cross-iteration cycles caused by virtual phis are analyzed separately. */
1832 vect_analyze_scalar_cycles (loop_vinfo);
1834 vect_pattern_recog (loop_vinfo);
1836 vect_fixup_scalar_cycles_with_patterns (loop_vinfo);
1838 /* Analyze the access patterns of the data-refs in the loop (consecutive,
1839 complex, etc.). FORNOW: Only handle consecutive access pattern. */
1841 ok = vect_analyze_data_ref_accesses (loop_vinfo);
1842 if (!ok)
1844 if (dump_enabled_p ())
1845 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1846 "bad data access.\n");
1847 return false;
1850 /* Data-flow analysis to detect stmts that do not need to be vectorized. */
1852 ok = vect_mark_stmts_to_be_vectorized (loop_vinfo);
1853 if (!ok)
1855 if (dump_enabled_p ())
1856 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1857 "unexpected pattern.\n");
1858 return false;
1861 /* The rest of the analysis below depends on the vector size in some way. */
1862 fatal = false;
1864 /* Analyze data dependences between the data-refs in the loop
1865 and adjust the maximum vectorization factor according to
1866 the dependences.
1867 FORNOW: fail at the first data dependence that we encounter. */
1869 ok = vect_analyze_data_ref_dependences (loop_vinfo, &max_vf);
1870 if (!ok
1871 || max_vf < min_vf)
1873 if (dump_enabled_p ())
1874 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1875 "bad data dependence.\n");
1876 return false;
1879 ok = vect_determine_vectorization_factor (loop_vinfo);
1880 if (!ok)
1882 if (dump_enabled_p ())
1883 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1884 "can't determine vectorization factor.\n");
1885 return false;
1887 if (max_vf < LOOP_VINFO_VECT_FACTOR (loop_vinfo))
1889 if (dump_enabled_p ())
1890 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1891 "bad data dependence.\n");
1892 return false;
1895 /* Compute the scalar iteration cost. */
1896 vect_compute_single_scalar_iteration_cost (loop_vinfo);
1898 int saved_vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1899 HOST_WIDE_INT estimated_niter;
1900 unsigned th;
1901 int min_scalar_loop_bound;
1903 /* Check the SLP opportunities in the loop, analyze and build SLP trees. */
1904 ok = vect_analyze_slp (loop_vinfo, n_stmts);
1905 if (!ok)
1906 return false;
1908 /* If there are any SLP instances mark them as pure_slp. */
1909 bool slp = vect_make_slp_decision (loop_vinfo);
1910 if (slp)
1912 /* Find stmts that need to be both vectorized and SLPed. */
1913 vect_detect_hybrid_slp (loop_vinfo);
1915 /* Update the vectorization factor based on the SLP decision. */
1916 vect_update_vf_for_slp (loop_vinfo);
1919 /* This is the point where we can re-start analysis with SLP forced off. */
1920 start_over:
1922 /* Now the vectorization factor is final. */
1923 unsigned vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1924 gcc_assert (vectorization_factor != 0);
1926 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && dump_enabled_p ())
1927 dump_printf_loc (MSG_NOTE, vect_location,
1928 "vectorization_factor = %d, niters = "
1929 HOST_WIDE_INT_PRINT_DEC "\n", vectorization_factor,
1930 LOOP_VINFO_INT_NITERS (loop_vinfo));
1932 HOST_WIDE_INT max_niter
1933 = max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
1934 if ((LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1935 && (LOOP_VINFO_INT_NITERS (loop_vinfo) < vectorization_factor))
1936 || (max_niter != -1
1937 && (unsigned HOST_WIDE_INT) max_niter < vectorization_factor))
1939 if (dump_enabled_p ())
1940 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1941 "not vectorized: iteration count smaller than "
1942 "vectorization factor.\n");
1943 return false;
1946 /* Analyze the alignment of the data-refs in the loop.
1947 Fail if a data reference is found that cannot be vectorized. */
1949 ok = vect_analyze_data_refs_alignment (loop_vinfo);
1950 if (!ok)
1952 if (dump_enabled_p ())
1953 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1954 "bad data alignment.\n");
1955 return false;
1958 /* Prune the list of ddrs to be tested at run-time by versioning for alias.
1959 It is important to call pruning after vect_analyze_data_ref_accesses,
1960 since we use grouping information gathered by interleaving analysis. */
1961 ok = vect_prune_runtime_alias_test_list (loop_vinfo);
1962 if (!ok)
1964 if (dump_enabled_p ())
1965 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1966 "number of versioning for alias "
1967 "run-time tests exceeds %d "
1968 "(--param vect-max-version-for-alias-checks)\n",
1969 PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS));
1970 return false;
1973 /* This pass will decide on using loop versioning and/or loop peeling in
1974 order to enhance the alignment of data references in the loop. */
1975 ok = vect_enhance_data_refs_alignment (loop_vinfo);
1976 if (!ok)
1978 if (dump_enabled_p ())
1979 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1980 "bad data alignment.\n");
1981 return false;
1984 if (slp)
1986 /* Analyze operations in the SLP instances. Note this may
1987 remove unsupported SLP instances which makes the above
1988 SLP kind detection invalid. */
1989 unsigned old_size = LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length ();
1990 vect_slp_analyze_operations (LOOP_VINFO_SLP_INSTANCES (loop_vinfo),
1991 LOOP_VINFO_TARGET_COST_DATA (loop_vinfo));
1992 if (LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length () != old_size)
1993 goto again;
1996 /* Scan all the remaining operations in the loop that are not subject
1997 to SLP and make sure they are vectorizable. */
1998 ok = vect_analyze_loop_operations (loop_vinfo);
1999 if (!ok)
2001 if (dump_enabled_p ())
2002 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2003 "bad operation or unsupported loop bound.\n");
2004 return false;
2007 /* Analyze cost. Decide if it is worthwhile to vectorize. */
2008 int min_profitable_estimate, min_profitable_iters;
2009 vect_estimate_min_profitable_iters (loop_vinfo, &min_profitable_iters,
2010 &min_profitable_estimate);
2012 if (min_profitable_iters < 0)
2014 if (dump_enabled_p ())
2015 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2016 "not vectorized: vectorization not profitable.\n");
2017 if (dump_enabled_p ())
2018 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2019 "not vectorized: vector version will never be "
2020 "profitable.\n");
2021 goto again;
2024 min_scalar_loop_bound = ((PARAM_VALUE (PARAM_MIN_VECT_LOOP_BOUND)
2025 * vectorization_factor) - 1);
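/* Illustration with hypothetical values: --param min-vect-loop-bound=2 and a
   vectorization factor of 4 give min_scalar_loop_bound = 2 * 4 - 1 = 7, so a
   loop with a known trip count of 7 or fewer iterations will be rejected by
   the threshold check below.  */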
2027 /* Use the cost model only if it is more conservative than user specified
2028 threshold. */
2029 th = (unsigned) min_scalar_loop_bound;
2030 if (min_profitable_iters
2031 && (!min_scalar_loop_bound
2032 || min_profitable_iters > min_scalar_loop_bound))
2033 th = (unsigned) min_profitable_iters;
2035 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = th;
2037 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2038 && LOOP_VINFO_INT_NITERS (loop_vinfo) <= th)
2040 if (dump_enabled_p ())
2041 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2042 "not vectorized: vectorization not profitable.\n");
2043 if (dump_enabled_p ())
2044 dump_printf_loc (MSG_NOTE, vect_location,
2045 "not vectorized: iteration count smaller than user "
2046 "specified loop bound parameter or minimum profitable "
2047 "iterations (whichever is more conservative).\n");
2048 goto again;
2051 estimated_niter
2052 = estimated_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
2053 if (estimated_niter != -1
2054 && ((unsigned HOST_WIDE_INT) estimated_niter
2055 <= MAX (th, (unsigned)min_profitable_estimate)))
2057 if (dump_enabled_p ())
2058 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2059 "not vectorized: estimated iteration count too "
2060 "small.\n");
2061 if (dump_enabled_p ())
2062 dump_printf_loc (MSG_NOTE, vect_location,
2063 "not vectorized: estimated iteration count smaller "
2064 "than specified loop bound parameter or minimum "
2065 "profitable iterations (whichever is more "
2066 "conservative).\n");
2067 goto again;
2070 /* Decide whether we need to create an epilogue loop to handle
2071 remaining scalar iterations. */
2072 th = ((LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) + 1)
2073 / LOOP_VINFO_VECT_FACTOR (loop_vinfo))
2074 * LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2076 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2077 && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) > 0)
2079 if (ctz_hwi (LOOP_VINFO_INT_NITERS (loop_vinfo)
2080 - LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo))
2081 < exact_log2 (LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
2082 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
2084 else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
2085 || (tree_ctz (LOOP_VINFO_NITERS (loop_vinfo))
2086 < (unsigned)exact_log2 (LOOP_VINFO_VECT_FACTOR (loop_vinfo))
2087 /* In case of versioning, check if the maximum number of
2088 iterations is greater than th. If they are identical,
2089 the epilogue is unnecessary. */
2090 && ((!LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo)
2091 && !LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
2092 || (unsigned HOST_WIDE_INT) max_niter > th)))
2093 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
2095 /* If an epilogue loop is required make sure we can create one. */
2096 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2097 || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo))
2099 if (dump_enabled_p ())
2100 dump_printf_loc (MSG_NOTE, vect_location, "epilog loop required\n");
2101 if (!vect_can_advance_ivs_p (loop_vinfo)
2102 || !slpeel_can_duplicate_loop_p (LOOP_VINFO_LOOP (loop_vinfo),
2103 single_exit (LOOP_VINFO_LOOP
2104 (loop_vinfo))))
2106 if (dump_enabled_p ())
2107 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2108 "not vectorized: can't create required "
2109 "epilog loop\n");
2110 goto again;
2114 gcc_assert (vectorization_factor
2115 == (unsigned)LOOP_VINFO_VECT_FACTOR (loop_vinfo));
2117 /* Ok to vectorize! */
2118 return true;
2120 again:
2121 /* Try again with SLP forced off but if we didn't do any SLP there is
2122 no point in re-trying. */
2123 if (!slp)
2124 return false;
2126 /* Likewise if the grouped loads or stores in the SLP cannot be handled
2127 via interleaving or lane instructions or if there were any SLP
2128 reductions. */
2129 slp_instance instance;
2130 slp_tree node;
2131 unsigned i, j;
2132 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
2134 stmt_vec_info vinfo;
2135 vinfo = vinfo_for_stmt
2136 (SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (instance))[0]);
2137 if (! STMT_VINFO_GROUPED_ACCESS (vinfo))
2138 return false;
2139 vinfo = vinfo_for_stmt (STMT_VINFO_GROUP_FIRST_ELEMENT (vinfo));
2140 unsigned int size = STMT_VINFO_GROUP_SIZE (vinfo);
2141 tree vectype = STMT_VINFO_VECTYPE (vinfo);
2142 if (! vect_store_lanes_supported (vectype, size)
2143 && ! vect_grouped_store_supported (vectype, size))
2144 return false;
2145 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), j, node)
2147 vinfo = vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (node)[0]);
2148 vinfo = vinfo_for_stmt (STMT_VINFO_GROUP_FIRST_ELEMENT (vinfo));
2149 size = STMT_VINFO_GROUP_SIZE (vinfo);
2150 vectype = STMT_VINFO_VECTYPE (vinfo);
2151 if (! vect_load_lanes_supported (vectype, size)
2152 && ! vect_grouped_load_supported (vectype, size))
2153 return false;
2157 if (dump_enabled_p ())
2158 dump_printf_loc (MSG_NOTE, vect_location,
2159 "re-trying with SLP disabled\n");
2161 /* Roll back state appropriately. No SLP this time. */
2162 slp = false;
2163 /* Restore the vectorization factor as it was without SLP. */
2164 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = saved_vectorization_factor;
2165 /* Free the SLP instances. */
2166 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), j, instance)
2167 vect_free_slp_instance (instance);
2168 LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
2169 /* Reset SLP type to loop_vect on all stmts. */
2170 for (i = 0; i < LOOP_VINFO_LOOP (loop_vinfo)->num_nodes; ++i)
2172 basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
2173 for (gimple_stmt_iterator si = gsi_start_bb (bb);
2174 !gsi_end_p (si); gsi_next (&si))
2176 stmt_vec_info stmt_info = vinfo_for_stmt (gsi_stmt (si));
2177 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
2179 gcc_assert (STMT_SLP_TYPE (stmt_info) == loop_vect);
2180 stmt_info = vinfo_for_stmt (STMT_VINFO_RELATED_STMT (stmt_info));
2181 for (gimple_stmt_iterator pi
2182 = gsi_start (STMT_VINFO_PATTERN_DEF_SEQ (stmt_info));
2183 !gsi_end_p (pi); gsi_next (&pi))
2185 gimple *pstmt = gsi_stmt (pi);
2186 STMT_SLP_TYPE (vinfo_for_stmt (pstmt)) = loop_vect;
2189 STMT_SLP_TYPE (stmt_info) = loop_vect;
2192 /* Free optimized alias test DDRS. */
2193 LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).release ();
2194 /* Reset target cost data. */
2195 destroy_cost_data (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo));
2196 LOOP_VINFO_TARGET_COST_DATA (loop_vinfo)
2197 = init_cost (LOOP_VINFO_LOOP (loop_vinfo));
2198 /* Reset assorted flags. */
2199 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
2200 LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = false;
2201 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = 0;
2203 goto start_over;
2206 /* Function vect_analyze_loop.
2208 Apply a set of analyses on LOOP, and create a loop_vec_info struct
2209 for it. The different analyses will record information in the
2210 loop_vec_info struct. */
2211 loop_vec_info
2212 vect_analyze_loop (struct loop *loop)
2214 loop_vec_info loop_vinfo;
2215 unsigned int vector_sizes;
2217 /* Autodetect first vector size we try. */
2218 current_vector_size = 0;
2219 vector_sizes = targetm.vectorize.autovectorize_vector_sizes ();
2221 if (dump_enabled_p ())
2222 dump_printf_loc (MSG_NOTE, vect_location,
2223 "===== analyze_loop_nest =====\n");
2225 if (loop_outer (loop)
2226 && loop_vec_info_for_loop (loop_outer (loop))
2227 && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop))))
2229 if (dump_enabled_p ())
2230 dump_printf_loc (MSG_NOTE, vect_location,
2231 "outer-loop already vectorized.\n");
2232 return NULL;
2235 while (1)
2237 /* Check the CFG characteristics of the loop (nesting, entry/exit). */
2238 loop_vinfo = vect_analyze_loop_form (loop);
2239 if (!loop_vinfo)
2241 if (dump_enabled_p ())
2242 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2243 "bad loop form.\n");
2244 return NULL;
2247 bool fatal = false;
2248 if (vect_analyze_loop_2 (loop_vinfo, fatal))
2250 LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1;
2252 return loop_vinfo;
2255 destroy_loop_vec_info (loop_vinfo, true);
2257 vector_sizes &= ~current_vector_size;
2258 if (fatal
2259 || vector_sizes == 0
2260 || current_vector_size == 0)
2261 return NULL;
2263 /* Try the largest of the remaining vector sizes. */
2264 current_vector_size = 1 << floor_log2 (vector_sizes);
2265 if (dump_enabled_p ())
2266 dump_printf_loc (MSG_NOTE, vect_location,
2267 "***** Re-trying analysis with "
2268 "vector size %d\n", current_vector_size);
2273 /* Function reduction_code_for_scalar_code
2275 Input:
2276 CODE - tree_code of a reduction operation.
2278 Output:
2279 REDUC_CODE - the corresponding tree-code to be used to reduce the
2280 vector of partial results into a single scalar result, or ERROR_MARK
2281 if the operation is a supported reduction operation, but does not have
2282 such a tree-code.
2284 Return FALSE if CODE currently cannot be vectorized as a reduction. */
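/* For example, a PLUS_EXPR sum reduction maps to REDUC_PLUS_EXPR for the
   epilogue, whereas MULT_EXPR is accepted as a reduction but has no single
   reduction tree code, so ERROR_MARK is returned and the epilogue is instead
   generated with whole-vector shifts or element extracts (see
   vect_model_reduction_cost below).  */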
2286 static bool
2287 reduction_code_for_scalar_code (enum tree_code code,
2288 enum tree_code *reduc_code)
2290 switch (code)
2292 case MAX_EXPR:
2293 *reduc_code = REDUC_MAX_EXPR;
2294 return true;
2296 case MIN_EXPR:
2297 *reduc_code = REDUC_MIN_EXPR;
2298 return true;
2300 case PLUS_EXPR:
2301 *reduc_code = REDUC_PLUS_EXPR;
2302 return true;
2304 case MULT_EXPR:
2305 case MINUS_EXPR:
2306 case BIT_IOR_EXPR:
2307 case BIT_XOR_EXPR:
2308 case BIT_AND_EXPR:
2309 *reduc_code = ERROR_MARK;
2310 return true;
2312 default:
2313 return false;
2318 /* Error reporting helper for vect_is_simple_reduction below. GIMPLE statement
2319 STMT is printed with a message MSG. */
2321 static void
2322 report_vect_op (int msg_type, gimple *stmt, const char *msg)
2324 dump_printf_loc (msg_type, vect_location, "%s", msg);
2325 dump_gimple_stmt (msg_type, TDF_SLIM, stmt, 0);
2326 dump_printf (msg_type, "\n");
2330 /* Detect SLP reduction of the form:
2332 #a1 = phi <a5, a0>
2333 a2 = operation (a1)
2334 a3 = operation (a2)
2335 a4 = operation (a3)
2336 a5 = operation (a4)
2338 #a = phi <a5>
2340 PHI is the reduction phi node (#a1 = phi <a5, a0> above)
2341 FIRST_STMT is the first reduction stmt in the chain
2342 (a2 = operation (a1)).
2344 Return TRUE if a reduction chain was detected. */
2346 static bool
2347 vect_is_slp_reduction (loop_vec_info loop_info, gimple *phi,
2348 gimple *first_stmt)
2350 struct loop *loop = (gimple_bb (phi))->loop_father;
2351 struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
2352 enum tree_code code;
2353 gimple *current_stmt = NULL, *loop_use_stmt = NULL, *first, *next_stmt;
2354 stmt_vec_info use_stmt_info, current_stmt_info;
2355 tree lhs;
2356 imm_use_iterator imm_iter;
2357 use_operand_p use_p;
2358 int nloop_uses, size = 0, n_out_of_loop_uses;
2359 bool found = false;
2361 if (loop != vect_loop)
2362 return false;
2364 lhs = PHI_RESULT (phi);
2365 code = gimple_assign_rhs_code (first_stmt);
2366 while (1)
2368 nloop_uses = 0;
2369 n_out_of_loop_uses = 0;
2370 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
2372 gimple *use_stmt = USE_STMT (use_p);
2373 if (is_gimple_debug (use_stmt))
2374 continue;
2376 /* Check if we got back to the reduction phi. */
2377 if (use_stmt == phi)
2379 loop_use_stmt = use_stmt;
2380 found = true;
2381 break;
2384 if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2386 loop_use_stmt = use_stmt;
2387 nloop_uses++;
2389 else
2390 n_out_of_loop_uses++;
2392 /* There can be either a single use in the loop or two uses in
2393 phi nodes. */
2394 if (nloop_uses > 1 || (n_out_of_loop_uses && nloop_uses))
2395 return false;
2398 if (found)
2399 break;
2401 /* We reached a statement with no loop uses. */
2402 if (nloop_uses == 0)
2403 return false;
2405 /* This is a loop exit phi, and we haven't reached the reduction phi. */
2406 if (gimple_code (loop_use_stmt) == GIMPLE_PHI)
2407 return false;
2409 if (!is_gimple_assign (loop_use_stmt)
2410 || code != gimple_assign_rhs_code (loop_use_stmt)
2411 || !flow_bb_inside_loop_p (loop, gimple_bb (loop_use_stmt)))
2412 return false;
2414 /* Insert USE_STMT into reduction chain. */
2415 use_stmt_info = vinfo_for_stmt (loop_use_stmt);
2416 if (current_stmt)
2418 current_stmt_info = vinfo_for_stmt (current_stmt);
2419 GROUP_NEXT_ELEMENT (current_stmt_info) = loop_use_stmt;
2420 GROUP_FIRST_ELEMENT (use_stmt_info)
2421 = GROUP_FIRST_ELEMENT (current_stmt_info);
2423 else
2424 GROUP_FIRST_ELEMENT (use_stmt_info) = loop_use_stmt;
2426 lhs = gimple_assign_lhs (loop_use_stmt);
2427 current_stmt = loop_use_stmt;
2428 size++;
2431 if (!found || loop_use_stmt != phi || size < 2)
2432 return false;
2434 /* Swap the operands, if needed, to make the reduction operand be the second
2435 operand. */
2436 lhs = PHI_RESULT (phi);
2437 next_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (current_stmt));
2438 while (next_stmt)
2440 if (gimple_assign_rhs2 (next_stmt) == lhs)
2442 tree op = gimple_assign_rhs1 (next_stmt);
2443 gimple *def_stmt = NULL;
2445 if (TREE_CODE (op) == SSA_NAME)
2446 def_stmt = SSA_NAME_DEF_STMT (op);
2448 /* Check that the other def is either defined in the loop
2449 ("vect_internal_def"), or it's an induction (defined by a
2450 loop-header phi-node). */
2451 if (def_stmt
2452 && gimple_bb (def_stmt)
2453 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2454 && (is_gimple_assign (def_stmt)
2455 || is_gimple_call (def_stmt)
2456 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2457 == vect_induction_def
2458 || (gimple_code (def_stmt) == GIMPLE_PHI
2459 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2460 == vect_internal_def
2461 && !is_loop_header_bb_p (gimple_bb (def_stmt)))))
2463 lhs = gimple_assign_lhs (next_stmt);
2464 next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
2465 continue;
2468 return false;
2470 else
2472 tree op = gimple_assign_rhs2 (next_stmt);
2473 gimple *def_stmt = NULL;
2475 if (TREE_CODE (op) == SSA_NAME)
2476 def_stmt = SSA_NAME_DEF_STMT (op);
2478 /* Check that the other def is either defined in the loop
2479 ("vect_internal_def"), or it's an induction (defined by a
2480 loop-header phi-node). */
2481 if (def_stmt
2482 && gimple_bb (def_stmt)
2483 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2484 && (is_gimple_assign (def_stmt)
2485 || is_gimple_call (def_stmt)
2486 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2487 == vect_induction_def
2488 || (gimple_code (def_stmt) == GIMPLE_PHI
2489 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2490 == vect_internal_def
2491 && !is_loop_header_bb_p (gimple_bb (def_stmt)))))
2493 if (dump_enabled_p ())
2495 dump_printf_loc (MSG_NOTE, vect_location, "swapping oprnds: ");
2496 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, next_stmt, 0);
2497 dump_printf (MSG_NOTE, "\n");
2500 swap_ssa_operands (next_stmt,
2501 gimple_assign_rhs1_ptr (next_stmt),
2502 gimple_assign_rhs2_ptr (next_stmt));
2503 update_stmt (next_stmt);
2505 if (CONSTANT_CLASS_P (gimple_assign_rhs1 (next_stmt)))
2506 LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true;
2508 else
2509 return false;
2512 lhs = gimple_assign_lhs (next_stmt);
2513 next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
2516 /* Save the chain for further analysis in SLP detection. */
2517 first = GROUP_FIRST_ELEMENT (vinfo_for_stmt (current_stmt));
2518 LOOP_VINFO_REDUCTION_CHAINS (loop_info).safe_push (first);
2519 GROUP_SIZE (vinfo_for_stmt (first)) = size;
2521 return true;
2525 /* Function vect_is_simple_reduction
2527 (1) Detect a cross-iteration def-use cycle that represents a simple
2528 reduction computation. We look for the following pattern:
2530 loop_header:
2531 a1 = phi < a0, a2 >
2532 a3 = ...
2533 a2 = operation (a3, a1)
2537 a3 = ...
2538 loop_header:
2539 a1 = phi < a0, a2 >
2540 a2 = operation (a3, a1)
2542 such that:
2543 1. operation is commutative and associative and it is safe to
2544 change the order of the computation (if CHECK_REDUCTION is true)
2545 2. no uses for a2 in the loop (a2 is used out of the loop)
2546 3. no uses of a1 in the loop besides the reduction operation
2547 4. no uses of a1 outside the loop.
2549 Conditions 1,4 are tested here.
2550 Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.
2552 (2) Detect a cross-iteration def-use cycle in nested loops, i.e.,
2553 nested cycles, if CHECK_REDUCTION is false.
2555 (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double
2556 reductions:
2558 a1 = phi < a0, a2 >
2559 inner loop (def of a3)
2560 a2 = phi < a3 >
2562 (4) Detect condition expressions, i.e.:
2563 for (int i = 0; i < N; i++)
2564 if (a[i] < val)
2565 ret_val = a[i];
2569 static gimple *
2570 vect_is_simple_reduction (loop_vec_info loop_info, gimple *phi,
2571 bool check_reduction, bool *double_reduc,
2572 bool need_wrapping_integral_overflow,
2573 enum vect_reduction_type *v_reduc_type)
2575 struct loop *loop = (gimple_bb (phi))->loop_father;
2576 struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
2577 edge latch_e = loop_latch_edge (loop);
2578 tree loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
2579 gimple *def_stmt, *def1 = NULL, *def2 = NULL;
2580 enum tree_code orig_code, code;
2581 tree op1, op2, op3 = NULL_TREE, op4 = NULL_TREE;
2582 tree type;
2583 int nloop_uses;
2584 tree name;
2585 imm_use_iterator imm_iter;
2586 use_operand_p use_p;
2587 bool phi_def;
2589 *double_reduc = false;
2590 *v_reduc_type = TREE_CODE_REDUCTION;
2592 /* If CHECK_REDUCTION is true, we assume inner-most loop vectorization,
2593 otherwise, we assume outer loop vectorization. */
2594 gcc_assert ((check_reduction && loop == vect_loop)
2595 || (!check_reduction && flow_loop_nested_p (vect_loop, loop)));
2597 name = PHI_RESULT (phi);
2598 /* ??? If there are no uses of the PHI result the inner loop reduction
2599 won't be detected as possibly double-reduction by vectorizable_reduction
2600 because that tries to walk the PHI arg from the preheader edge which
2601 can be constant. See PR60382. */
2602 if (has_zero_uses (name))
2603 return NULL;
2604 nloop_uses = 0;
2605 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name)
2607 gimple *use_stmt = USE_STMT (use_p);
2608 if (is_gimple_debug (use_stmt))
2609 continue;
2611 if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2613 if (dump_enabled_p ())
2614 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2615 "intermediate value used outside loop.\n");
2617 return NULL;
2620 nloop_uses++;
2621 if (nloop_uses > 1)
2623 if (dump_enabled_p ())
2624 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2625 "reduction used in loop.\n");
2626 return NULL;
2630 if (TREE_CODE (loop_arg) != SSA_NAME)
2632 if (dump_enabled_p ())
2634 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2635 "reduction: not ssa_name: ");
2636 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, loop_arg);
2637 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
2639 return NULL;
2642 def_stmt = SSA_NAME_DEF_STMT (loop_arg);
2643 if (!def_stmt)
2645 if (dump_enabled_p ())
2646 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2647 "reduction: no def_stmt.\n");
2648 return NULL;
2651 if (!is_gimple_assign (def_stmt) && gimple_code (def_stmt) != GIMPLE_PHI)
2653 if (dump_enabled_p ())
2655 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, def_stmt, 0);
2656 dump_printf (MSG_NOTE, "\n");
2658 return NULL;
2661 if (is_gimple_assign (def_stmt))
2663 name = gimple_assign_lhs (def_stmt);
2664 phi_def = false;
2666 else
2668 name = PHI_RESULT (def_stmt);
2669 phi_def = true;
2672 nloop_uses = 0;
2673 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name)
2675 gimple *use_stmt = USE_STMT (use_p);
2676 if (is_gimple_debug (use_stmt))
2677 continue;
2678 if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2679 nloop_uses++;
2680 if (nloop_uses > 1)
2682 if (dump_enabled_p ())
2683 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2684 "reduction used in loop.\n");
2685 return NULL;
2689 /* If DEF_STMT is a phi node itself, we expect it to have a single argument
2690 defined in the inner loop. */
2691 if (phi_def)
2693 op1 = PHI_ARG_DEF (def_stmt, 0);
2695 if (gimple_phi_num_args (def_stmt) != 1
2696 || TREE_CODE (op1) != SSA_NAME)
2698 if (dump_enabled_p ())
2699 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2700 "unsupported phi node definition.\n");
2702 return NULL;
2705 def1 = SSA_NAME_DEF_STMT (op1);
2706 if (gimple_bb (def1)
2707 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2708 && loop->inner
2709 && flow_bb_inside_loop_p (loop->inner, gimple_bb (def1))
2710 && is_gimple_assign (def1))
2712 if (dump_enabled_p ())
2713 report_vect_op (MSG_NOTE, def_stmt,
2714 "detected double reduction: ");
2716 *double_reduc = true;
2717 return def_stmt;
2720 return NULL;
2723 code = orig_code = gimple_assign_rhs_code (def_stmt);
2725 /* We can handle "res -= x[i]", which is non-associative by
2726 simply rewriting this into "res += -x[i]". Avoid changing
2727 the gimple instruction for the first simple tests and only do this
2728 if we're allowed to change code at all. */
2729 if (code == MINUS_EXPR
2730 && (op1 = gimple_assign_rhs1 (def_stmt))
2731 && TREE_CODE (op1) == SSA_NAME
2732 && SSA_NAME_DEF_STMT (op1) == phi)
2733 code = PLUS_EXPR;
2735 if (check_reduction)
2737 if (code == COND_EXPR)
2738 *v_reduc_type = COND_REDUCTION;
2739 else if (!commutative_tree_code (code) || !associative_tree_code (code))
2741 if (dump_enabled_p ())
2742 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2743 "reduction: not commutative/associative: ");
2744 return NULL;
2748 if (get_gimple_rhs_class (code) != GIMPLE_BINARY_RHS)
2750 if (code != COND_EXPR)
2752 if (dump_enabled_p ())
2753 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2754 "reduction: not binary operation: ");
2756 return NULL;
2759 op3 = gimple_assign_rhs1 (def_stmt);
2760 if (COMPARISON_CLASS_P (op3))
2762 op4 = TREE_OPERAND (op3, 1);
2763 op3 = TREE_OPERAND (op3, 0);
2766 op1 = gimple_assign_rhs2 (def_stmt);
2767 op2 = gimple_assign_rhs3 (def_stmt);
2769 if (TREE_CODE (op1) != SSA_NAME && TREE_CODE (op2) != SSA_NAME)
2771 if (dump_enabled_p ())
2772 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2773 "reduction: uses not ssa_names: ");
2775 return NULL;
2778 else
2780 op1 = gimple_assign_rhs1 (def_stmt);
2781 op2 = gimple_assign_rhs2 (def_stmt);
2783 if (TREE_CODE (op1) != SSA_NAME && TREE_CODE (op2) != SSA_NAME)
2785 if (dump_enabled_p ())
2786 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2787 "reduction: uses not ssa_names: ");
2789 return NULL;
2793 type = TREE_TYPE (gimple_assign_lhs (def_stmt));
2794 if ((TREE_CODE (op1) == SSA_NAME
2795 && !types_compatible_p (type, TREE_TYPE (op1)))
2796 || (TREE_CODE (op2) == SSA_NAME
2797 && !types_compatible_p (type, TREE_TYPE (op2)))
2798 || (op3 && TREE_CODE (op3) == SSA_NAME
2799 && !types_compatible_p (type, TREE_TYPE (op3)))
2800 || (op4 && TREE_CODE (op4) == SSA_NAME
2801 && !types_compatible_p (type, TREE_TYPE (op4))))
2803 if (dump_enabled_p ())
2805 dump_printf_loc (MSG_NOTE, vect_location,
2806 "reduction: multiple types: operation type: ");
2807 dump_generic_expr (MSG_NOTE, TDF_SLIM, type);
2808 dump_printf (MSG_NOTE, ", operands types: ");
2809 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2810 TREE_TYPE (op1));
2811 dump_printf (MSG_NOTE, ",");
2812 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2813 TREE_TYPE (op2));
2814 if (op3)
2816 dump_printf (MSG_NOTE, ",");
2817 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2818 TREE_TYPE (op3));
2821 if (op4)
2823 dump_printf (MSG_NOTE, ",");
2824 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2825 TREE_TYPE (op4));
2827 dump_printf (MSG_NOTE, "\n");
2830 return NULL;
2833 /* Check that it's ok to change the order of the computation.
2834 Generally, when vectorizing a reduction we change the order of the
2835 computation. This may change the behavior of the program in some
2836 cases, so we need to check that this is ok. One exception is when
2837 vectorizing an outer-loop: the inner-loop is executed sequentially,
2838 and therefore vectorizing reductions in the inner-loop during
2839 outer-loop vectorization is safe. */
2841 if (*v_reduc_type != COND_REDUCTION)
2843 /* CHECKME: check for !flag_finite_math_only too? */
2844 if (SCALAR_FLOAT_TYPE_P (type) && !flag_associative_math
2845 && check_reduction)
2847 /* Changing the order of operations changes the semantics. */
2848 if (dump_enabled_p ())
2849 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2850 "reduction: unsafe fp math optimization: ");
2851 return NULL;
2853 else if (INTEGRAL_TYPE_P (type) && check_reduction)
2855 if (!operation_no_trapping_overflow (type, code))
2857 /* Changing the order of operations changes the semantics. */
2858 if (dump_enabled_p ())
2859 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2860 "reduction: unsafe int math optimization"
2861 " (overflow traps): ");
2862 return NULL;
2864 if (need_wrapping_integral_overflow
2865 && !TYPE_OVERFLOW_WRAPS (type)
2866 && operation_can_overflow (code))
2868 /* Changing the order of operations changes the semantics. */
2869 if (dump_enabled_p ())
2870 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2871 "reduction: unsafe int math optimization"
2872 " (overflow doesn't wrap): ");
2873 return NULL;
2876 else if (SAT_FIXED_POINT_TYPE_P (type) && check_reduction)
2878 /* Changing the order of operations changes the semantics. */
2879 if (dump_enabled_p ())
2880 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2881 "reduction: unsafe fixed-point math optimization: ");
2882 return NULL;
2886 /* Reduction is safe. We're dealing with one of the following:
2887 1) integer arithmetic and no trapv
2888 2) floating point arithmetic, and special flags permit this optimization
2889 3) nested cycle (i.e., outer loop vectorization). */
2890 if (TREE_CODE (op1) == SSA_NAME)
2891 def1 = SSA_NAME_DEF_STMT (op1);
2893 if (TREE_CODE (op2) == SSA_NAME)
2894 def2 = SSA_NAME_DEF_STMT (op2);
2896 if (code != COND_EXPR
2897 && ((!def1 || gimple_nop_p (def1)) && (!def2 || gimple_nop_p (def2))))
2899 if (dump_enabled_p ())
2900 report_vect_op (MSG_NOTE, def_stmt, "reduction: no defs for operands: ");
2901 return NULL;
2904 /* Check that one def is the reduction def, defined by PHI,
2905 the other def is either defined in the loop ("vect_internal_def"),
2906 or it's an induction (defined by a loop-header phi-node). */
2908 if (def2 && def2 == phi
2909 && (code == COND_EXPR
2910 || !def1 || gimple_nop_p (def1)
2911 || !flow_bb_inside_loop_p (loop, gimple_bb (def1))
2912 || (def1 && flow_bb_inside_loop_p (loop, gimple_bb (def1))
2913 && (is_gimple_assign (def1)
2914 || is_gimple_call (def1)
2915 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1))
2916 == vect_induction_def
2917 || (gimple_code (def1) == GIMPLE_PHI
2918 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1))
2919 == vect_internal_def
2920 && !is_loop_header_bb_p (gimple_bb (def1)))))))
2922 if (dump_enabled_p ())
2923 report_vect_op (MSG_NOTE, def_stmt, "detected reduction: ");
2924 return def_stmt;
2927 if (def1 && def1 == phi
2928 && (code == COND_EXPR
2929 || !def2 || gimple_nop_p (def2)
2930 || !flow_bb_inside_loop_p (loop, gimple_bb (def2))
2931 || (def2 && flow_bb_inside_loop_p (loop, gimple_bb (def2))
2932 && (is_gimple_assign (def2)
2933 || is_gimple_call (def2)
2934 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2))
2935 == vect_induction_def
2936 || (gimple_code (def2) == GIMPLE_PHI
2937 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2))
2938 == vect_internal_def
2939 && !is_loop_header_bb_p (gimple_bb (def2)))))))
2941 if (check_reduction
2942 && orig_code != MINUS_EXPR)
2944 if (code == COND_EXPR)
2946 /* No current known use where this case would be useful. */
2947 if (dump_enabled_p ())
2948 report_vect_op (MSG_NOTE, def_stmt,
2949 "detected reduction: cannot currently swap "
2950 "operands for cond_expr");
2951 return NULL;
2954 /* Swap operands (just for simplicity - so that the rest of the code
2955 can assume that the reduction variable is always the last (second)
2956 argument). */
2957 if (dump_enabled_p ())
2958 report_vect_op (MSG_NOTE, def_stmt,
2959 "detected reduction: need to swap operands: ");
2961 swap_ssa_operands (def_stmt, gimple_assign_rhs1_ptr (def_stmt),
2962 gimple_assign_rhs2_ptr (def_stmt));
2964 if (CONSTANT_CLASS_P (gimple_assign_rhs1 (def_stmt)))
2965 LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true;
2967 else
2969 if (dump_enabled_p ())
2970 report_vect_op (MSG_NOTE, def_stmt, "detected reduction: ");
2973 return def_stmt;
2976 /* Try to find SLP reduction chain. */
2977 if (check_reduction && code != COND_EXPR
2978 && vect_is_slp_reduction (loop_info, phi, def_stmt))
2980 if (dump_enabled_p ())
2981 report_vect_op (MSG_NOTE, def_stmt,
2982 "reduction: detected reduction chain: ");
2984 return def_stmt;
2987 if (dump_enabled_p ())
2988 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2989 "reduction: unknown pattern: ");
2991 return NULL;
2994 /* Wrapper around vect_is_simple_reduction, which will modify code
2995 in-place if it enables detection of more reductions. The arguments
2996 are the same as for that function. */
2998 gimple *
2999 vect_force_simple_reduction (loop_vec_info loop_info, gimple *phi,
3000 bool check_reduction, bool *double_reduc,
3001 bool need_wrapping_integral_overflow)
3003 enum vect_reduction_type v_reduc_type;
3004 return vect_is_simple_reduction (loop_info, phi, check_reduction,
3005 double_reduc,
3006 need_wrapping_integral_overflow,
3007 &v_reduc_type);
3010 /* Calculate cost of peeling the loop PEEL_ITERS_PROLOGUE times. */
3011 int
3012 vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue,
3013 int *peel_iters_epilogue,
3014 stmt_vector_for_cost *scalar_cost_vec,
3015 stmt_vector_for_cost *prologue_cost_vec,
3016 stmt_vector_for_cost *epilogue_cost_vec)
3018 int retval = 0;
3019 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3021 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
3023 *peel_iters_epilogue = vf/2;
3024 if (dump_enabled_p ())
3025 dump_printf_loc (MSG_NOTE, vect_location,
3026 "cost model: epilogue peel iters set to vf/2 "
3027 "because loop iterations are unknown .\n");
3029 /* If peeled iterations are known but the number of scalar loop
3030 iterations is unknown, count a taken branch per peeled loop. */
3031 retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
3032 NULL, 0, vect_prologue);
3033 retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
3034 NULL, 0, vect_epilogue);
3036 else
3038 int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
3039 peel_iters_prologue = niters < peel_iters_prologue ?
3040 niters : peel_iters_prologue;
3041 *peel_iters_epilogue = (niters - peel_iters_prologue) % vf;
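/* Illustration with hypothetical values: 100 known iterations, a prologue
   peel of 3 and vf == 8 give an epilogue peel of (100 - 3) % 8 == 1
   iteration.  */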
3042 /* If we need to peel for gaps, but no peeling is required, we have to
3043 peel VF iterations. */
3044 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) && !*peel_iters_epilogue)
3045 *peel_iters_epilogue = vf;
3048 stmt_info_for_cost *si;
3049 int j;
3050 if (peel_iters_prologue)
3051 FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
3052 retval += record_stmt_cost (prologue_cost_vec,
3053 si->count * peel_iters_prologue,
3054 si->kind, NULL, si->misalign,
3055 vect_prologue);
3056 if (*peel_iters_epilogue)
3057 FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
3058 retval += record_stmt_cost (epilogue_cost_vec,
3059 si->count * *peel_iters_epilogue,
3060 si->kind, NULL, si->misalign,
3061 vect_epilogue);
3063 return retval;
3066 /* Function vect_estimate_min_profitable_iters
3068 Return the number of iterations required for the vector version of the
3069 loop to be profitable relative to the cost of the scalar version of the
3070 loop. */
3072 static void
3073 vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
3074 int *ret_min_profitable_niters,
3075 int *ret_min_profitable_estimate)
3077 int min_profitable_iters;
3078 int min_profitable_estimate;
3079 int peel_iters_prologue;
3080 int peel_iters_epilogue;
3081 unsigned vec_inside_cost = 0;
3082 int vec_outside_cost = 0;
3083 unsigned vec_prologue_cost = 0;
3084 unsigned vec_epilogue_cost = 0;
3085 int scalar_single_iter_cost = 0;
3086 int scalar_outside_cost = 0;
3087 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3088 int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
3089 void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3091 /* Cost model disabled. */
3092 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
3094 dump_printf_loc (MSG_NOTE, vect_location, "cost model disabled.\n");
3095 *ret_min_profitable_niters = 0;
3096 *ret_min_profitable_estimate = 0;
3097 return;
3100 /* Requires loop versioning tests to handle misalignment. */
3101 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
3103 /* FIXME: Make cost depend on complexity of individual check. */
3104 unsigned len = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ();
3105 (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
3106 vect_prologue);
3107 dump_printf (MSG_NOTE,
3108 "cost model: Adding cost of checks for loop "
3109 "versioning to treat misalignment.\n");
3112 /* Requires loop versioning with alias checks. */
3113 if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
3115 /* FIXME: Make cost depend on complexity of individual check. */
3116 unsigned len = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).length ();
3117 (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
3118 vect_prologue);
3119 dump_printf (MSG_NOTE,
3120 "cost model: Adding cost of checks for loop "
3121 "versioning aliasing.\n");
3124 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
3125 || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
3126 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken, NULL, 0,
3127 vect_prologue);
3129 /* Count statements in scalar loop. Using this as scalar cost for a single
3130 iteration for now.
3132 TODO: Add outer loop support.
3134 TODO: Consider assigning different costs to different scalar
3135 statements. */
3137 scalar_single_iter_cost
3138 = LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo);
3140 /* Add additional cost for the peeled instructions in prologue and epilogue
3141 loop.
3143 FORNOW: If we don't know the value of peel_iters for prologue or epilogue
3144 at compile-time - we assume it's vf/2 (the worst would be vf-1).
3146 TODO: Build an expression that represents peel_iters for prologue and
3147 epilogue to be used in a run-time test. */
3149 if (npeel < 0)
3151 peel_iters_prologue = vf/2;
3152 dump_printf (MSG_NOTE, "cost model: "
3153 "prologue peel iters set to vf/2.\n");
3155 /* If peeling for alignment is unknown, loop bound of main loop becomes
3156 unknown. */
3157 peel_iters_epilogue = vf/2;
3158 dump_printf (MSG_NOTE, "cost model: "
3159 "epilogue peel iters set to vf/2 because "
3160 "peeling for alignment is unknown.\n");
3162 /* If peeled iterations are unknown, count a taken branch and a not taken
3163 branch per peeled loop. Even if scalar loop iterations are known,
3164 vector iterations are not known since peeled prologue iterations are
3165 not known. Hence guards remain the same. */
3166 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
3167 NULL, 0, vect_prologue);
3168 (void) add_stmt_cost (target_cost_data, 1, cond_branch_not_taken,
3169 NULL, 0, vect_prologue);
3170 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
3171 NULL, 0, vect_epilogue);
3172 (void) add_stmt_cost (target_cost_data, 1, cond_branch_not_taken,
3173 NULL, 0, vect_epilogue);
3174 stmt_info_for_cost *si;
3175 int j;
3176 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
3178 struct _stmt_vec_info *stmt_info
3179 = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
3180 (void) add_stmt_cost (target_cost_data,
3181 si->count * peel_iters_prologue,
3182 si->kind, stmt_info, si->misalign,
3183 vect_prologue);
3184 (void) add_stmt_cost (target_cost_data,
3185 si->count * peel_iters_epilogue,
3186 si->kind, stmt_info, si->misalign,
3187 vect_epilogue);
3190 else
3192 stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
3193 stmt_info_for_cost *si;
3194 int j;
3195 void *data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3197 prologue_cost_vec.create (2);
3198 epilogue_cost_vec.create (2);
3199 peel_iters_prologue = npeel;
3201 (void) vect_get_known_peeling_cost (loop_vinfo, peel_iters_prologue,
3202 &peel_iters_epilogue,
3203 &LOOP_VINFO_SCALAR_ITERATION_COST
3204 (loop_vinfo),
3205 &prologue_cost_vec,
3206 &epilogue_cost_vec);
3208 FOR_EACH_VEC_ELT (prologue_cost_vec, j, si)
3210 struct _stmt_vec_info *stmt_info
3211 = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
3212 (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
3213 si->misalign, vect_prologue);
3216 FOR_EACH_VEC_ELT (epilogue_cost_vec, j, si)
3218 struct _stmt_vec_info *stmt_info
3219 = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
3220 (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
3221 si->misalign, vect_epilogue);
3224 prologue_cost_vec.release ();
3225 epilogue_cost_vec.release ();
3228 /* FORNOW: The scalar outside cost is incremented in one of the
3229 following ways:
3231 1. The vectorizer checks for alignment and aliasing and generates
3232 a condition that allows dynamic vectorization. A cost model
3233 check is ANDED with the versioning condition. Hence scalar code
3234 path now has the added cost of the versioning check.
3236 if (cost > th & versioning_check)
3237 jmp to vector code
3239 Hence run-time scalar is incremented by not-taken branch cost.
3241 2. The vectorizer then checks if a prologue is required. If the
3242 cost model check was not done before during versioning, it has to
3243 be done before the prologue check.
3245 if (cost <= th)
3246 prologue = scalar_iters
3247 if (prologue == 0)
3248 jmp to vector code
3249 else
3250 execute prologue
3251 if (prologue == num_iters)
3252 go to exit
3254 Hence the run-time scalar cost is incremented by a taken branch,
3255 plus a not-taken branch, plus a taken branch cost.
3257 3. The vectorizer then checks if an epilogue is required. If the
3258 cost model check was not done before during prologue check, it
3259 has to be done with the epilogue check.
3261 if (prologue == 0)
3262 jmp to vector code
3263 else
3264 execute prologue
3265 if (prologue == num_iters)
3266 go to exit
3267 vector code:
3268 if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
3269 jmp to epilogue
3271 Hence the run-time scalar cost should be incremented by 2 taken
3272 branches.
3274 TODO: The back end may reorder the BBS's differently and reverse
3275 conditions/branch directions. Change the estimates below to
3276 something more reasonable. */
3278 /* If the number of iterations is known and we do not do versioning, we can
3279 decide whether to vectorize at compile time. Hence the scalar version
3280 does not carry cost model guard costs. */
3281 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
3282 || LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
3283 || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
3285 /* Cost model check occurs at versioning. */
3286 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
3287 || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
3288 scalar_outside_cost += vect_get_stmt_cost (cond_branch_not_taken);
3289 else
3291 /* Cost model check occurs at prologue generation. */
3292 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
3293 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken)
3294 + vect_get_stmt_cost (cond_branch_not_taken);
3295 /* Cost model check occurs at epilogue generation. */
3296 else
3297 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken);
3301 /* Complete the target-specific cost calculations. */
3302 finish_cost (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo), &vec_prologue_cost,
3303 &vec_inside_cost, &vec_epilogue_cost);
3305 vec_outside_cost = (int)(vec_prologue_cost + vec_epilogue_cost);
3307 if (dump_enabled_p ())
3309 dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
3310 dump_printf (MSG_NOTE, " Vector inside of loop cost: %d\n",
3311 vec_inside_cost);
3312 dump_printf (MSG_NOTE, " Vector prologue cost: %d\n",
3313 vec_prologue_cost);
3314 dump_printf (MSG_NOTE, " Vector epilogue cost: %d\n",
3315 vec_epilogue_cost);
3316 dump_printf (MSG_NOTE, " Scalar iteration cost: %d\n",
3317 scalar_single_iter_cost);
3318 dump_printf (MSG_NOTE, " Scalar outside cost: %d\n",
3319 scalar_outside_cost);
3320 dump_printf (MSG_NOTE, " Vector outside cost: %d\n",
3321 vec_outside_cost);
3322 dump_printf (MSG_NOTE, " prologue iterations: %d\n",
3323 peel_iters_prologue);
3324 dump_printf (MSG_NOTE, " epilogue iterations: %d\n",
3325 peel_iters_epilogue);
3328 /* Calculate number of iterations required to make the vector version
3329 profitable, relative to the loop bodies only. The following condition
3330 must hold true:
3331 SIC * niters + SOC > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC
3332 where
3333 SIC = scalar iteration cost, VIC = vector iteration cost,
3334 VOC = vector outside cost, VF = vectorization factor,
3335 PL_ITERS = prologue iterations, EP_ITERS= epilogue iterations
3336 SOC = scalar outside cost for run time cost model check. */
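/* Worked illustration with hypothetical costs: SIC = 4, VIC = 8, VF = 4,
   VOC = 20, SOC = 0 and no peeling.  The condition
   4 * niters > 8 * (niters / 4) + 20 requires niters > 10, and the
   computation below indeed produces an initial min_profitable_iters of 11
   before the later adjustments.  */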
3338 if ((scalar_single_iter_cost * vf) > (int) vec_inside_cost)
3340 if (vec_outside_cost <= 0)
3341 min_profitable_iters = 1;
3342 else
3344 min_profitable_iters = ((vec_outside_cost - scalar_outside_cost) * vf
3345 - vec_inside_cost * peel_iters_prologue
3346 - vec_inside_cost * peel_iters_epilogue)
3347 / ((scalar_single_iter_cost * vf)
3348 - vec_inside_cost);
3350 if ((scalar_single_iter_cost * vf * min_profitable_iters)
3351 <= (((int) vec_inside_cost * min_profitable_iters)
3352 + (((int) vec_outside_cost - scalar_outside_cost) * vf)))
3353 min_profitable_iters++;
3356 /* vector version will never be profitable. */
3357 else
3359 if (LOOP_VINFO_LOOP (loop_vinfo)->force_vectorize)
3360 warning_at (vect_location, OPT_Wopenmp_simd, "vectorization "
3361 "did not happen for a simd loop");
3363 if (dump_enabled_p ())
3364 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3365 "cost model: the vector iteration cost = %d "
3366 "divided by the scalar iteration cost = %d "
3367 "is greater or equal to the vectorization factor = %d"
3368 ".\n",
3369 vec_inside_cost, scalar_single_iter_cost, vf);
3370 *ret_min_profitable_niters = -1;
3371 *ret_min_profitable_estimate = -1;
3372 return;
3375 dump_printf (MSG_NOTE,
3376 " Calculated minimum iters for profitability: %d\n",
3377 min_profitable_iters);
3379 min_profitable_iters =
3380 min_profitable_iters < vf ? vf : min_profitable_iters;
3382 /* Because the condition we create is:
3383 if (niters <= min_profitable_iters)
3384 then skip the vectorized loop. */
3385 min_profitable_iters--;
3387 if (dump_enabled_p ())
3388 dump_printf_loc (MSG_NOTE, vect_location,
3389 " Runtime profitability threshold = %d\n",
3390 min_profitable_iters);
3392 *ret_min_profitable_niters = min_profitable_iters;
3394 /* Calculate number of iterations required to make the vector version
3395 profitable, relative to the loop bodies only.
3397 The non-vectorized variant costs SIC * niters and it must win over the vector
3398 variant on the expected loop trip count. The following condition must hold true:
3399 SIC * niters > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC + SOC */
3401 if (vec_outside_cost <= 0)
3402 min_profitable_estimate = 1;
3403 else
3405 min_profitable_estimate = ((vec_outside_cost + scalar_outside_cost) * vf
3406 - vec_inside_cost * peel_iters_prologue
3407 - vec_inside_cost * peel_iters_epilogue)
3408 / ((scalar_single_iter_cost * vf)
3409 - vec_inside_cost);
3411 min_profitable_estimate--;
3412 min_profitable_estimate = MAX (min_profitable_estimate, min_profitable_iters);
3413 if (dump_enabled_p ())
3414 dump_printf_loc (MSG_NOTE, vect_location,
3415 " Static estimate profitability threshold = %d\n",
3416 min_profitable_estimate);
3418 *ret_min_profitable_estimate = min_profitable_estimate;
3421 /* Writes into SEL a mask for a vec_perm, equivalent to a vec_shr by OFFSET
3422 vector elements (not bits) for a vector of mode MODE. */
3423 static void
3424 calc_vec_perm_mask_for_shift (enum machine_mode mode, unsigned int offset,
3425 unsigned char *sel)
3427 unsigned int i, nelt = GET_MODE_NUNITS (mode);
3429 for (i = 0; i < nelt; i++)
3430 sel[i] = (i + offset) & (2*nelt - 1);
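/* For example, for an 8-element vector and OFFSET == 3 the mask built above
   is {3, 4, ..., 10}: the last five elements of the first vec_perm operand
   followed by the first three elements of the second operand.  */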
3433 /* Checks whether the target supports whole-vector shifts for vectors of mode
3434 MODE. This is the case if _either_ the platform handles vec_shr_optab, _or_
3435 it supports vec_perm_const with masks for all necessary shift amounts. */
3436 static bool
3437 have_whole_vector_shift (enum machine_mode mode)
3439 if (optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
3440 return true;
3442 if (direct_optab_handler (vec_perm_const_optab, mode) == CODE_FOR_nothing)
3443 return false;
3445 unsigned int i, nelt = GET_MODE_NUNITS (mode);
3446 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
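/* The loop below checks the shift amounts nelt/2, nelt/4, ..., 1; these
   correspond to the repeated halving steps a reduction epilogue performs
   (cf. the exact_log2 (nelements) * 2 estimate in
   vect_model_reduction_cost).  */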
3448 for (i = nelt/2; i >= 1; i/=2)
3450 calc_vec_perm_mask_for_shift (mode, i, sel);
3451 if (!can_vec_perm_p (mode, false, sel))
3452 return false;
3454 return true;
3457 /* Return the reduction operand (with index REDUC_INDEX) of STMT. */
3459 static tree
3460 get_reduction_op (gimple *stmt, int reduc_index)
3462 switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
3464 case GIMPLE_SINGLE_RHS:
3465 gcc_assert (TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt))
3466 == ternary_op);
3467 return TREE_OPERAND (gimple_assign_rhs1 (stmt), reduc_index);
3468 case GIMPLE_UNARY_RHS:
3469 return gimple_assign_rhs1 (stmt);
3470 case GIMPLE_BINARY_RHS:
3471 return (reduc_index
3472 ? gimple_assign_rhs2 (stmt) : gimple_assign_rhs1 (stmt));
3473 case GIMPLE_TERNARY_RHS:
3474 return gimple_op (stmt, reduc_index + 1);
3475 default:
3476 gcc_unreachable ();
3480 /* TODO: Close dependency between vect_model_*_cost and vectorizable_*
3481 functions. Design better to avoid maintenance issues. */
3483 /* Function vect_model_reduction_cost.
3485 Models cost for a reduction operation, including the vector ops
3486 generated within the strip-mine loop, the initial definition before
3487 the loop, and the epilogue code that must be generated. */
3489 static bool
3490 vect_model_reduction_cost (stmt_vec_info stmt_info, enum tree_code reduc_code,
3491 int ncopies, int reduc_index)
3493 int prologue_cost = 0, epilogue_cost = 0;
3494 enum tree_code code;
3495 optab optab;
3496 tree vectype;
3497 gimple *stmt, *orig_stmt;
3498 tree reduction_op;
3499 machine_mode mode;
3500 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3501 struct loop *loop = NULL;
3502 void *target_cost_data;
3504 if (loop_vinfo)
3506 loop = LOOP_VINFO_LOOP (loop_vinfo);
3507 target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3509 else
3510 target_cost_data = BB_VINFO_TARGET_COST_DATA (STMT_VINFO_BB_VINFO (stmt_info));
3512 /* Condition reductions generate two reductions in the loop. */
3513 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION)
3514 ncopies *= 2;
3516 /* Cost of reduction op inside loop. */
3517 unsigned inside_cost = add_stmt_cost (target_cost_data, ncopies, vector_stmt,
3518 stmt_info, 0, vect_body);
3519 stmt = STMT_VINFO_STMT (stmt_info);
3521 reduction_op = get_reduction_op (stmt, reduc_index);
3523 vectype = get_vectype_for_scalar_type (TREE_TYPE (reduction_op));
3524 if (!vectype)
3526 if (dump_enabled_p ())
3528 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3529 "unsupported data-type ");
3530 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
3531 TREE_TYPE (reduction_op));
3532 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3534 return false;
3537 mode = TYPE_MODE (vectype);
3538 orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
3540 if (!orig_stmt)
3541 orig_stmt = STMT_VINFO_STMT (stmt_info);
3543 code = gimple_assign_rhs_code (orig_stmt);
3545 /* Add in cost for initial definition.
3546 For cond reduction we have four vectors: initial index, step, initial
3547 result of the data reduction, initial value of the index reduction. */
3548 int prologue_stmts = STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
3549 == COND_REDUCTION ? 4 : 1;
3550 prologue_cost += add_stmt_cost (target_cost_data, prologue_stmts,
3551 scalar_to_vec, stmt_info, 0,
3552 vect_prologue);
3554 /* Determine cost of epilogue code.
3556 We have a reduction operator that will reduce the vector in one statement.
3557 Also requires scalar extract. */
3559 if (!loop || !nested_in_vect_loop_p (loop, orig_stmt))
3561 if (reduc_code != ERROR_MARK)
3563 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION)
3565 /* An EQ stmt and a COND_EXPR stmt. */
3566 epilogue_cost += add_stmt_cost (target_cost_data, 2,
3567 vector_stmt, stmt_info, 0,
3568 vect_epilogue);
3569 /* Reduction of the max index and a reduction of the found
3570 values. */
3571 epilogue_cost += add_stmt_cost (target_cost_data, 2,
3572 vec_to_scalar, stmt_info, 0,
3573 vect_epilogue);
3574 /* A broadcast of the max value. */
3575 epilogue_cost += add_stmt_cost (target_cost_data, 1,
3576 scalar_to_vec, stmt_info, 0,
3577 vect_epilogue);
3579 else
3581 epilogue_cost += add_stmt_cost (target_cost_data, 1, vector_stmt,
3582 stmt_info, 0, vect_epilogue);
3583 epilogue_cost += add_stmt_cost (target_cost_data, 1,
3584 vec_to_scalar, stmt_info, 0,
3585 vect_epilogue);
3588 else
3590 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
3591 tree bitsize =
3592 TYPE_SIZE (TREE_TYPE (gimple_assign_lhs (orig_stmt)));
3593 int element_bitsize = tree_to_uhwi (bitsize);
3594 int nelements = vec_size_in_bits / element_bitsize;
3596 optab = optab_for_tree_code (code, vectype, optab_default);
3598 /* We have a whole vector shift available. */
3599 if (VECTOR_MODE_P (mode)
3600 && optab_handler (optab, mode) != CODE_FOR_nothing
3601 && have_whole_vector_shift (mode))
3603 /* Final reduction via vector shifts and the reduction operator.
3604 Also requires scalar extract. */
3605 epilogue_cost += add_stmt_cost (target_cost_data,
3606 exact_log2 (nelements) * 2,
3607 vector_stmt, stmt_info, 0,
3608 vect_epilogue);
3609 epilogue_cost += add_stmt_cost (target_cost_data, 1,
3610 vec_to_scalar, stmt_info, 0,
3611 vect_epilogue);
3613 else
3614 /* Use extracts and reduction op for final reduction. For N
3615 elements, we have N extracts and N-1 reduction ops. */
3616 epilogue_cost += add_stmt_cost (target_cost_data,
3617 nelements + nelements - 1,
3618 vector_stmt, stmt_info, 0,
3619 vect_epilogue);
3623 if (dump_enabled_p ())
3624 dump_printf (MSG_NOTE,
3625 "vect_model_reduction_cost: inside_cost = %d, "
3626 "prologue_cost = %d, epilogue_cost = %d .\n", inside_cost,
3627 prologue_cost, epilogue_cost);
3629 return true;
3633 /* Function vect_model_induction_cost.
3635 Models cost for induction operations. */
3637 static void
3638 vect_model_induction_cost (stmt_vec_info stmt_info, int ncopies)
3640 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3641 void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3642 unsigned inside_cost, prologue_cost;
3644 /* loop cost for vec_loop. */
3645 inside_cost = add_stmt_cost (target_cost_data, ncopies, vector_stmt,
3646 stmt_info, 0, vect_body);
3648 /* prologue cost for vec_init and vec_step. */
3649 prologue_cost = add_stmt_cost (target_cost_data, 2, scalar_to_vec,
3650 stmt_info, 0, vect_prologue);
3652 if (dump_enabled_p ())
3653 dump_printf_loc (MSG_NOTE, vect_location,
3654 "vect_model_induction_cost: inside_cost = %d, "
3655 "prologue_cost = %d .\n", inside_cost, prologue_cost);
3659 /* Function get_initial_def_for_induction
3661 Input:
3662 STMT - a stmt that performs an induction operation in the loop.
3663 IV_PHI - the initial value of the induction variable
3665 Output:
3666 Return a vector variable, initialized with the first VF values of
3667 the induction variable. E.g., for an iv with IV_PHI='X' and
3668 evolution S, for a vector of 4 units, we want to return:
3669 [X, X + S, X + 2*S, X + 3*S]. */
3671 static tree
3672 get_initial_def_for_induction (gimple *iv_phi)
3674 stmt_vec_info stmt_vinfo = vinfo_for_stmt (iv_phi);
3675 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
3676 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3677 tree vectype;
3678 int nunits;
3679 edge pe = loop_preheader_edge (loop);
3680 struct loop *iv_loop;
3681 basic_block new_bb;
3682 tree new_vec, vec_init, vec_step, t;
3683 tree new_name;
3684 gimple *new_stmt;
3685 gphi *induction_phi;
3686 tree induc_def, vec_def, vec_dest;
3687 tree init_expr, step_expr;
3688 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3689 int i;
3690 int ncopies;
3691 tree expr;
3692 stmt_vec_info phi_info = vinfo_for_stmt (iv_phi);
3693 bool nested_in_vect_loop = false;
3694 gimple_seq stmts;
3695 imm_use_iterator imm_iter;
3696 use_operand_p use_p;
3697 gimple *exit_phi;
3698 edge latch_e;
3699 tree loop_arg;
3700 gimple_stmt_iterator si;
3701 basic_block bb = gimple_bb (iv_phi);
3702 tree stepvectype;
3703 tree resvectype;
3705 /* Is phi in an inner-loop, while vectorizing an enclosing outer-loop? */
3706 if (nested_in_vect_loop_p (loop, iv_phi))
3708 nested_in_vect_loop = true;
3709 iv_loop = loop->inner;
3711 else
3712 iv_loop = loop;
3713 gcc_assert (iv_loop == (gimple_bb (iv_phi))->loop_father);
3715 latch_e = loop_latch_edge (iv_loop);
3716 loop_arg = PHI_ARG_DEF_FROM_EDGE (iv_phi, latch_e);
3718 step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (phi_info);
3719 gcc_assert (step_expr != NULL_TREE);
3721 pe = loop_preheader_edge (iv_loop);
3722 init_expr = PHI_ARG_DEF_FROM_EDGE (iv_phi,
3723 loop_preheader_edge (iv_loop));
3725 vectype = get_vectype_for_scalar_type (TREE_TYPE (init_expr));
3726 resvectype = get_vectype_for_scalar_type (TREE_TYPE (PHI_RESULT (iv_phi)));
3727 gcc_assert (vectype);
3728 nunits = TYPE_VECTOR_SUBPARTS (vectype);
3729 ncopies = vf / nunits;
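  /* E.g., with a vectorization factor of 8 and a 4-element vectype we need
     ncopies = 2 vector IVs; the extra copy is created further below in the
     "ncopies > 1" unrolling code.  */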
3731 gcc_assert (phi_info);
3732 gcc_assert (ncopies >= 1);
3734 /* Convert the step to the desired type. */
3735 stmts = NULL;
3736 step_expr = gimple_convert (&stmts, TREE_TYPE (vectype), step_expr);
3737 if (stmts)
3739 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
3740 gcc_assert (!new_bb);
3743 /* Find the first insertion point in the BB. */
3744 si = gsi_after_labels (bb);
3746 /* Create the vector that holds the initial_value of the induction. */
3747 if (nested_in_vect_loop)
3749 /* iv_loop is nested in the loop to be vectorized. init_expr had already
3750 been created during vectorization of previous stmts. We obtain it
3751 from the STMT_VINFO_VEC_STMT of the defining stmt. */
3752 vec_init = vect_get_vec_def_for_operand (init_expr, iv_phi);
3753 /* If the initial value is not of proper type, convert it. */
3754 if (!useless_type_conversion_p (vectype, TREE_TYPE (vec_init)))
3756 new_stmt
3757 = gimple_build_assign (vect_get_new_ssa_name (vectype,
3758 vect_simple_var,
3759 "vec_iv_"),
3760 VIEW_CONVERT_EXPR,
3761 build1 (VIEW_CONVERT_EXPR, vectype,
3762 vec_init));
3763 vec_init = gimple_assign_lhs (new_stmt);
3764 new_bb = gsi_insert_on_edge_immediate (loop_preheader_edge (iv_loop),
3765 new_stmt);
3766 gcc_assert (!new_bb);
3767 set_vinfo_for_stmt (new_stmt,
3768 new_stmt_vec_info (new_stmt, loop_vinfo));
3771 else
3773 vec<constructor_elt, va_gc> *v;
3775 /* iv_loop is the loop to be vectorized. Create:
3776 vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr) */
3777 stmts = NULL;
3778 new_name = gimple_convert (&stmts, TREE_TYPE (vectype), init_expr);
3780 vec_alloc (v, nunits);
3781 bool constant_p = is_gimple_min_invariant (new_name);
3782 CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, new_name);
3783 for (i = 1; i < nunits; i++)
3785 /* Create: new_name_i = new_name + step_expr */
3786 new_name = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (new_name),
3787 new_name, step_expr);
3788 if (!is_gimple_min_invariant (new_name))
3789 constant_p = false;
3790 CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, new_name);
3792 if (stmts)
3794 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
3795 gcc_assert (!new_bb);
3798 /* Create a vector from [new_name_0, new_name_1, ..., new_name_nunits-1] */
3799 if (constant_p)
3800 new_vec = build_vector_from_ctor (vectype, v);
3801 else
3802 new_vec = build_constructor (vectype, v);
3803 vec_init = vect_init_vector (iv_phi, new_vec, vectype, NULL);
3807 /* Create the vector that holds the step of the induction. */
3808 if (nested_in_vect_loop)
3809 /* iv_loop is nested in the loop to be vectorized. Generate:
3810 vec_step = [S, S, S, S] */
3811 new_name = step_expr;
3812 else
3814 /* iv_loop is the loop to be vectorized. Generate:
3815 vec_step = [VF*S, VF*S, VF*S, VF*S] */
3816 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
3818 expr = build_int_cst (integer_type_node, vf);
3819 expr = fold_convert (TREE_TYPE (step_expr), expr);
3821 else
3822 expr = build_int_cst (TREE_TYPE (step_expr), vf);
3823 new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
3824 expr, step_expr);
3825 if (TREE_CODE (step_expr) == SSA_NAME)
3826 new_name = vect_init_vector (iv_phi, new_name,
3827 TREE_TYPE (step_expr), NULL);
3830 t = unshare_expr (new_name);
3831 gcc_assert (CONSTANT_CLASS_P (new_name)
3832 || TREE_CODE (new_name) == SSA_NAME);
3833 stepvectype = get_vectype_for_scalar_type (TREE_TYPE (new_name));
3834 gcc_assert (stepvectype);
3835 new_vec = build_vector_from_val (stepvectype, t);
3836 vec_step = vect_init_vector (iv_phi, new_vec, stepvectype, NULL);
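  /* In the non-nested case the step was scaled by VF above because one
     iteration of the vectorized loop covers VF scalar iterations, so the
     vector IV must advance by VF * S; in the nested case the inner-loop IV
     still advances by S per inner-loop iteration, hence vec_step = [S,...].  */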
3839 /* Create the following def-use cycle:
3840 loop prolog:
3841 vec_init = ...
3842 vec_step = ...
3843 loop:
3844 vec_iv = PHI <vec_init, vec_loop>
3846 STMT
3848 vec_loop = vec_iv + vec_step; */
3850 /* Create the induction-phi that defines the induction-operand. */
3851 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
3852 induction_phi = create_phi_node (vec_dest, iv_loop->header);
3853 set_vinfo_for_stmt (induction_phi,
3854 new_stmt_vec_info (induction_phi, loop_vinfo));
3855 induc_def = PHI_RESULT (induction_phi);
3857 /* Create the iv update inside the loop */
3858 new_stmt = gimple_build_assign (vec_dest, PLUS_EXPR, induc_def, vec_step);
3859 vec_def = make_ssa_name (vec_dest, new_stmt);
3860 gimple_assign_set_lhs (new_stmt, vec_def);
3861 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
3862 set_vinfo_for_stmt (new_stmt, new_stmt_vec_info (new_stmt, loop_vinfo));
3864 /* Set the arguments of the phi node: */
3865 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
3866 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
3867 UNKNOWN_LOCATION);
3870   /* In case the vectorization factor (VF) is bigger than the number
3871      of elements that we can fit in a vectype (nunits), we have to
3872      generate more than one vector stmt, i.e., we need to "unroll" the
3873      vector stmt by a factor of VF/nunits.  For more details see the
3874      documentation in vectorizable_operation.  */
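  /* E.g., with VF = 8 and 4-element vectors the code below creates a second
     vector IV: the phi above starts at [X, X+S, X+2*S, X+3*S] and steps by
     8*S each iteration, while the extra copy is obtained by adding
     [4*S, 4*S, 4*S, 4*S] (nunits * S) to it.  */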
3876 if (ncopies > 1)
3878 stmt_vec_info prev_stmt_vinfo;
3879 /* FORNOW. This restriction should be relaxed. */
3880 gcc_assert (!nested_in_vect_loop);
3882 /* Create the vector that holds the step of the induction. */
3883 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
3885 expr = build_int_cst (integer_type_node, nunits);
3886 expr = fold_convert (TREE_TYPE (step_expr), expr);
3888 else
3889 expr = build_int_cst (TREE_TYPE (step_expr), nunits);
3890 new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
3891 expr, step_expr);
3892 if (TREE_CODE (step_expr) == SSA_NAME)
3893 new_name = vect_init_vector (iv_phi, new_name,
3894 TREE_TYPE (step_expr), NULL);
3895 t = unshare_expr (new_name);
3896 gcc_assert (CONSTANT_CLASS_P (new_name)
3897 || TREE_CODE (new_name) == SSA_NAME);
3898 new_vec = build_vector_from_val (stepvectype, t);
3899 vec_step = vect_init_vector (iv_phi, new_vec, stepvectype, NULL);
3901 vec_def = induc_def;
3902 prev_stmt_vinfo = vinfo_for_stmt (induction_phi);
3903 for (i = 1; i < ncopies; i++)
3905 /* vec_i = vec_prev + vec_step */
3906 new_stmt = gimple_build_assign (vec_dest, PLUS_EXPR,
3907 vec_def, vec_step);
3908 vec_def = make_ssa_name (vec_dest, new_stmt);
3909 gimple_assign_set_lhs (new_stmt, vec_def);
3911 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
3912 if (!useless_type_conversion_p (resvectype, vectype))
3914 new_stmt
3915 = gimple_build_assign
3916 (vect_get_new_vect_var (resvectype, vect_simple_var,
3917 "vec_iv_"),
3918 VIEW_CONVERT_EXPR,
3919 build1 (VIEW_CONVERT_EXPR, resvectype,
3920 gimple_assign_lhs (new_stmt)));
3921 gimple_assign_set_lhs (new_stmt,
3922 make_ssa_name
3923 (gimple_assign_lhs (new_stmt), new_stmt));
3924 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
3926 set_vinfo_for_stmt (new_stmt,
3927 new_stmt_vec_info (new_stmt, loop_vinfo));
3928 STMT_VINFO_RELATED_STMT (prev_stmt_vinfo) = new_stmt;
3929 prev_stmt_vinfo = vinfo_for_stmt (new_stmt);
3933 if (nested_in_vect_loop)
3935 /* Find the loop-closed exit-phi of the induction, and record
3936 the final vector of induction results: */
3937 exit_phi = NULL;
3938 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
3940 gimple *use_stmt = USE_STMT (use_p);
3941 if (is_gimple_debug (use_stmt))
3942 continue;
3944 if (!flow_bb_inside_loop_p (iv_loop, gimple_bb (use_stmt)))
3946 exit_phi = use_stmt;
3947 break;
3950 if (exit_phi)
3952 stmt_vec_info stmt_vinfo = vinfo_for_stmt (exit_phi);
3953	  /* FORNOW.  We do not yet support the case where an inner-loop
3954	     induction is used only outside the outer-loop.  */
3955 gcc_assert (STMT_VINFO_RELEVANT_P (stmt_vinfo)
3956 && !STMT_VINFO_LIVE_P (stmt_vinfo));
3958 STMT_VINFO_VEC_STMT (stmt_vinfo) = new_stmt;
3959 if (dump_enabled_p ())
3961 dump_printf_loc (MSG_NOTE, vect_location,
3962 "vector of inductions after inner-loop:");
3963 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, new_stmt, 0);
3964 dump_printf (MSG_NOTE, "\n");
3970 if (dump_enabled_p ())
3972 dump_printf_loc (MSG_NOTE, vect_location,
3973 "transform induction: created def-use cycle: ");
3974 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, induction_phi, 0);
3975 dump_printf (MSG_NOTE, "\n");
3976 dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
3977 SSA_NAME_DEF_STMT (vec_def), 0);
3978 dump_printf (MSG_NOTE, "\n");
3981 STMT_VINFO_VEC_STMT (phi_info) = induction_phi;
3982 if (!useless_type_conversion_p (resvectype, vectype))
3984 new_stmt = gimple_build_assign (vect_get_new_vect_var (resvectype,
3985 vect_simple_var,
3986 "vec_iv_"),
3987 VIEW_CONVERT_EXPR,
3988 build1 (VIEW_CONVERT_EXPR, resvectype,
3989 induc_def));
3990 induc_def = make_ssa_name (gimple_assign_lhs (new_stmt), new_stmt);
3991 gimple_assign_set_lhs (new_stmt, induc_def);
3992 si = gsi_after_labels (bb);
3993 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
3994 set_vinfo_for_stmt (new_stmt,
3995 new_stmt_vec_info (new_stmt, loop_vinfo));
3996 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (new_stmt))
3997 = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (induction_phi));
4000 return induc_def;
4004 /* Function get_initial_def_for_reduction
4006 Input:
4007 STMT - a stmt that performs a reduction operation in the loop.
4008 INIT_VAL - the initial value of the reduction variable
4010 Output:
4011 ADJUSTMENT_DEF - a tree that holds a value to be added to the final result
4012 of the reduction (used for adjusting the epilog - see below).
4013 Return a vector variable, initialized according to the operation that STMT
4014 performs. This vector will be used as the initial value of the
4015 vector of partial results.
4017 Option1 (adjust in epilog): Initialize the vector as follows:
4018 add/bit or/xor: [0,0,...,0,0]
4019 mult/bit and: [1,1,...,1,1]
4020 min/max/cond_expr: [init_val,init_val,..,init_val,init_val]
4021 and when necessary (e.g. add/mult case) let the caller know
4022 that it needs to adjust the result by init_val.
4024 Option2: Initialize the vector as follows:
4025 add/bit or/xor: [init_val,0,0,...,0]
4026 mult/bit and: [init_val,1,1,...,1]
4027 min/max/cond_expr: [init_val,init_val,...,init_val]
4028 and no adjustments are needed.
4030 For example, for the following code:
4032 s = init_val;
4033 for (i=0;i<n;i++)
4034 s = s + a[i];
4036 STMT is 's = s + a[i]', and the reduction variable is 's'.
4037   For a vector of 4 units, we want to return either [init_val,0,0,0]
4038   (Option2), or [0,0,0,0] and let the caller know that it needs to
4039   adjust the result at the end by 'init_val' (Option1).
4041   FORNOW, we use Option1 ('adjust in epilog') when ADJUSTMENT_DEF is not
4042   NULL, because this way the initialization vector is simpler (the same
4043   element in all entries), and Option2 otherwise.
4045 A cost model should help decide between these two schemes. */
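/* Worked example: for a sum reduction with init_val = 5 and 4-element
   vectors, Option1 returns {0, 0, 0, 0} and sets ADJUSTMENT_DEF so that the
   caller adds 5 to the final result, while Option2 returns {5, 0, 0, 0} and
   needs no adjustment.  */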
4047 tree
4048 get_initial_def_for_reduction (gimple *stmt, tree init_val,
4049 tree *adjustment_def)
4051 stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
4052 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
4053 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4054 tree scalar_type = TREE_TYPE (init_val);
4055 tree vectype = get_vectype_for_scalar_type (scalar_type);
4056 int nunits;
4057 enum tree_code code = gimple_assign_rhs_code (stmt);
4058 tree def_for_init;
4059 tree init_def;
4060 tree *elts;
4061 int i;
4062 bool nested_in_vect_loop = false;
4063 tree init_value;
4064 REAL_VALUE_TYPE real_init_val = dconst0;
4065 int int_init_val = 0;
4066 gimple *def_stmt = NULL;
4068 gcc_assert (vectype);
4069 nunits = TYPE_VECTOR_SUBPARTS (vectype);
4071 gcc_assert (POINTER_TYPE_P (scalar_type) || INTEGRAL_TYPE_P (scalar_type)
4072 || SCALAR_FLOAT_TYPE_P (scalar_type));
4074 if (nested_in_vect_loop_p (loop, stmt))
4075 nested_in_vect_loop = true;
4076 else
4077 gcc_assert (loop == (gimple_bb (stmt))->loop_father);
4079 /* In case of double reduction we only create a vector variable to be put
4080 in the reduction phi node. The actual statement creation is done in
4081 vect_create_epilog_for_reduction. */
4082 if (adjustment_def && nested_in_vect_loop
4083 && TREE_CODE (init_val) == SSA_NAME
4084 && (def_stmt = SSA_NAME_DEF_STMT (init_val))
4085 && gimple_code (def_stmt) == GIMPLE_PHI
4086 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
4087 && vinfo_for_stmt (def_stmt)
4088 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
4089 == vect_double_reduction_def)
4091 *adjustment_def = NULL;
4092 return vect_create_destination_var (init_val, vectype);
4095 if (TREE_CONSTANT (init_val))
4097 if (SCALAR_FLOAT_TYPE_P (scalar_type))
4098 init_value = build_real (scalar_type, TREE_REAL_CST (init_val));
4099 else
4100 init_value = build_int_cst (scalar_type, TREE_INT_CST_LOW (init_val));
4102 else
4103 init_value = init_val;
4105 switch (code)
4107 case WIDEN_SUM_EXPR:
4108 case DOT_PROD_EXPR:
4109 case SAD_EXPR:
4110 case PLUS_EXPR:
4111 case MINUS_EXPR:
4112 case BIT_IOR_EXPR:
4113 case BIT_XOR_EXPR:
4114 case MULT_EXPR:
4115 case BIT_AND_EXPR:
4116       /* ADJUSTMENT_DEF is NULL when called from
4117 vect_create_epilog_for_reduction to vectorize double reduction. */
4118 if (adjustment_def)
4120 if (nested_in_vect_loop)
4121 *adjustment_def = vect_get_vec_def_for_operand (init_val, stmt);
4122 else
4123 *adjustment_def = init_val;
4126 if (code == MULT_EXPR)
4128 real_init_val = dconst1;
4129 int_init_val = 1;
4132 if (code == BIT_AND_EXPR)
4133 int_init_val = -1;
4135 if (SCALAR_FLOAT_TYPE_P (scalar_type))
4136 def_for_init = build_real (scalar_type, real_init_val);
4137 else
4138 def_for_init = build_int_cst (scalar_type, int_init_val);
4140       /* Fill all elements except the first with '0' or '1' (DEF_FOR_INIT).  */
4141 elts = XALLOCAVEC (tree, nunits);
4142 for (i = nunits - 2; i >= 0; --i)
4143 elts[i + 1] = def_for_init;
4145 /* Option1: the first element is '0' or '1' as well. */
4146 if (adjustment_def)
4148 elts[0] = def_for_init;
4149 init_def = build_vector (vectype, elts);
4150 break;
4153 /* Option2: the first element is INIT_VAL. */
4154 elts[0] = init_val;
4155 if (TREE_CONSTANT (init_val))
4156 init_def = build_vector (vectype, elts);
4157 else
4159 vec<constructor_elt, va_gc> *v;
4160 vec_alloc (v, nunits);
4161 CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, init_val);
4162 for (i = 1; i < nunits; ++i)
4163 CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, elts[i]);
4164 init_def = build_constructor (vectype, v);
4167 break;
4169 case MIN_EXPR:
4170 case MAX_EXPR:
4171 case COND_EXPR:
4172 if (adjustment_def)
4174 *adjustment_def = NULL_TREE;
4175 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_vinfo) != COND_REDUCTION)
4177 init_def = vect_get_vec_def_for_operand (init_val, stmt);
4178 break;
4181 init_def = build_vector_from_val (vectype, init_value);
4182 break;
4184 default:
4185 gcc_unreachable ();
4188 return init_def;
4191 /* Function vect_create_epilog_for_reduction
4193 Create code at the loop-epilog to finalize the result of a reduction
4194 computation.
4196 VECT_DEFS is list of vector of partial results, i.e., the lhs's of vector
4197 reduction statements.
4198 STMT is the scalar reduction stmt that is being vectorized.
4199 NCOPIES is > 1 in case the vectorization factor (VF) is bigger than the
4200 number of elements that we can fit in a vectype (nunits). In this case
4201 we have to generate more than one vector stmt - i.e - we need to "unroll"
4202 the vector stmt by a factor VF/nunits. For more details see documentation
4203 in vectorizable_operation.
4204 REDUC_CODE is the tree-code for the epilog reduction.
4205 REDUCTION_PHIS is a list of the phi-nodes that carry the reduction
4206 computation.
4207 REDUC_INDEX is the index of the operand in the right hand side of the
4208 statement that is defined by REDUCTION_PHI.
4209 DOUBLE_REDUC is TRUE if double reduction phi nodes should be handled.
4210 SLP_NODE is an SLP node containing a group of reduction statements. The
4211 first one in this group is STMT.
4212 INDUCTION_INDEX is the index of the loop for condition reductions.
4213 Otherwise it is undefined.
4215 This function:
4216 1. Creates the reduction def-use cycles: sets the arguments for
4217 REDUCTION_PHIS:
4218 The loop-entry argument is the vectorized initial-value of the reduction.
4219 The loop-latch argument is taken from VECT_DEFS - the vector of partial
4220 sums.
4221 2. "Reduces" each vector of partial results VECT_DEFS into a single result,
4222 by applying the operation specified by REDUC_CODE if available, or by
4223 other means (whole-vector shifts or a scalar loop).
4224 The function also creates a new phi node at the loop exit to preserve
4225 loop-closed form, as illustrated below.
4227 The flow at the entry to this function:
4229 loop:
4230 vec_def = phi <null, null> # REDUCTION_PHI
4231 VECT_DEF = vector_stmt # vectorized form of STMT
4232 s_loop = scalar_stmt # (scalar) STMT
4233 loop_exit:
4234 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
4235 use <s_out0>
4236 use <s_out0>
4238 The above is transformed by this function into:
4240 loop:
4241 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
4242 VECT_DEF = vector_stmt # vectorized form of STMT
4243 s_loop = scalar_stmt # (scalar) STMT
4244 loop_exit:
4245 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
4246 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
4247 v_out2 = reduce <v_out1>
4248 s_out3 = extract_field <v_out2, 0>
4249 s_out4 = adjust_result <s_out3>
4250 use <s_out4>
4251 use <s_out4>
4254 static void
4255 vect_create_epilog_for_reduction (vec<tree> vect_defs, gimple *stmt,
4256 int ncopies, enum tree_code reduc_code,
4257 vec<gimple *> reduction_phis,
4258 int reduc_index, bool double_reduc,
4259 slp_tree slp_node, tree induction_index)
4261 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4262 stmt_vec_info prev_phi_info;
4263 tree vectype;
4264 machine_mode mode;
4265 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4266 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *outer_loop = NULL;
4267 basic_block exit_bb;
4268 tree scalar_dest;
4269 tree scalar_type;
4270 gimple *new_phi = NULL, *phi;
4271 gimple_stmt_iterator exit_gsi;
4272 tree vec_dest;
4273 tree new_temp = NULL_TREE, new_dest, new_name, new_scalar_dest;
4274 gimple *epilog_stmt = NULL;
4275 enum tree_code code = gimple_assign_rhs_code (stmt);
4276 gimple *exit_phi;
4277 tree bitsize;
4278 tree adjustment_def = NULL;
4279 tree vec_initial_def = NULL;
4280 tree reduction_op, expr, def, initial_def = NULL;
4281 tree orig_name, scalar_result;
4282 imm_use_iterator imm_iter, phi_imm_iter;
4283 use_operand_p use_p, phi_use_p;
4284 gimple *use_stmt, *orig_stmt, *reduction_phi = NULL;
4285 bool nested_in_vect_loop = false;
4286 auto_vec<gimple *> new_phis;
4287 auto_vec<gimple *> inner_phis;
4288 enum vect_def_type dt = vect_unknown_def_type;
4289 int j, i;
4290 auto_vec<tree> scalar_results;
4291 unsigned int group_size = 1, k, ratio;
4292 auto_vec<tree> vec_initial_defs;
4293 auto_vec<gimple *> phis;
4294 bool slp_reduc = false;
4295 tree new_phi_result;
4296 gimple *inner_phi = NULL;
4298 if (slp_node)
4299 group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
4301 if (nested_in_vect_loop_p (loop, stmt))
4303 outer_loop = loop;
4304 loop = loop->inner;
4305 nested_in_vect_loop = true;
4306 gcc_assert (!slp_node);
4309 reduction_op = get_reduction_op (stmt, reduc_index);
4311 vectype = get_vectype_for_scalar_type (TREE_TYPE (reduction_op));
4312 gcc_assert (vectype);
4313 mode = TYPE_MODE (vectype);
4315 /* 1. Create the reduction def-use cycle:
4316 Set the arguments of REDUCTION_PHIS, i.e., transform
4318 loop:
4319 vec_def = phi <null, null> # REDUCTION_PHI
4320 VECT_DEF = vector_stmt # vectorized form of STMT
4323 into:
4325 loop:
4326 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
4327 VECT_DEF = vector_stmt # vectorized form of STMT
4330 (in case of SLP, do it for all the phis). */
4332 /* Get the loop-entry arguments. */
4333 if (slp_node)
4334 vect_get_vec_defs (reduction_op, NULL_TREE, stmt, &vec_initial_defs,
4335 NULL, slp_node, reduc_index);
4336 else
4338 /* Get at the scalar def before the loop, that defines the initial value
4339 of the reduction variable. */
4340 gimple *def_stmt = SSA_NAME_DEF_STMT (reduction_op);
4341 initial_def = PHI_ARG_DEF_FROM_EDGE (def_stmt,
4342 loop_preheader_edge (loop));
4343 vec_initial_defs.create (1);
4344 vec_initial_def = get_initial_def_for_reduction (stmt, initial_def,
4345 &adjustment_def);
4346 vec_initial_defs.quick_push (vec_initial_def);
4349 /* Set phi nodes arguments. */
4350 FOR_EACH_VEC_ELT (reduction_phis, i, phi)
4352 tree vec_init_def, def;
4353 gimple_seq stmts;
4354 vec_init_def = force_gimple_operand (vec_initial_defs[i], &stmts,
4355 true, NULL_TREE);
4356 gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), stmts);
4357 def = vect_defs[i];
4358 for (j = 0; j < ncopies; j++)
4360 /* Set the loop-entry arg of the reduction-phi. */
4362 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
4363 == INTEGER_INDUC_COND_REDUCTION)
4365	      /* Initialize the reduction phi to zero.  This prevents non-zero
4366		 initial values from interfering with the reduction op.  */
4367 gcc_assert (ncopies == 1);
4368 gcc_assert (i == 0);
4370 tree vec_init_def_type = TREE_TYPE (vec_init_def);
4371 tree zero_vec = build_zero_cst (vec_init_def_type);
4373 add_phi_arg (as_a <gphi *> (phi), zero_vec,
4374 loop_preheader_edge (loop), UNKNOWN_LOCATION);
4376 else
4377 add_phi_arg (as_a <gphi *> (phi), vec_init_def,
4378 loop_preheader_edge (loop), UNKNOWN_LOCATION);
4380 /* Set the loop-latch arg for the reduction-phi. */
4381 if (j > 0)
4382 def = vect_get_vec_def_for_stmt_copy (vect_unknown_def_type, def);
4384 add_phi_arg (as_a <gphi *> (phi), def, loop_latch_edge (loop),
4385 UNKNOWN_LOCATION);
4387 if (dump_enabled_p ())
4389 dump_printf_loc (MSG_NOTE, vect_location,
4390 "transform reduction: created def-use cycle: ");
4391 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
4392 dump_printf (MSG_NOTE, "\n");
4393 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, SSA_NAME_DEF_STMT (def), 0);
4394 dump_printf (MSG_NOTE, "\n");
4397 phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi));
4401 /* 2. Create epilog code.
4402 The reduction epilog code operates across the elements of the vector
4403 of partial results computed by the vectorized loop.
4404 The reduction epilog code consists of:
4406 step 1: compute the scalar result in a vector (v_out2)
4407 step 2: extract the scalar result (s_out3) from the vector (v_out2)
4408 step 3: adjust the scalar result (s_out3) if needed.
4410      Step 1 can be accomplished using one of the following three schemes:
4411 (scheme 1) using reduc_code, if available.
4412 (scheme 2) using whole-vector shifts, if available.
4413 (scheme 3) using a scalar loop. In this case steps 1+2 above are
4414 combined.
4416 The overall epilog code looks like this:
4418 s_out0 = phi <s_loop> # original EXIT_PHI
4419 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
4420 v_out2 = reduce <v_out1> # step 1
4421 s_out3 = extract_field <v_out2, 0> # step 2
4422 s_out4 = adjust_result <s_out3> # step 3
4424 (step 3 is optional, and steps 1 and 2 may be combined).
4425 Lastly, the uses of s_out0 are replaced by s_out4. */
4428 /* 2.1 Create new loop-exit-phis to preserve loop-closed form:
4429 v_out1 = phi <VECT_DEF>
4430 Store them in NEW_PHIS. */
4432 exit_bb = single_exit (loop)->dest;
4433 prev_phi_info = NULL;
4434 new_phis.create (vect_defs.length ());
4435 FOR_EACH_VEC_ELT (vect_defs, i, def)
4437 for (j = 0; j < ncopies; j++)
4439 tree new_def = copy_ssa_name (def);
4440 phi = create_phi_node (new_def, exit_bb);
4441 set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, loop_vinfo));
4442 if (j == 0)
4443 new_phis.quick_push (phi);
4444 else
4446 def = vect_get_vec_def_for_stmt_copy (dt, def);
4447 STMT_VINFO_RELATED_STMT (prev_phi_info) = phi;
4450 SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, def);
4451 prev_phi_info = vinfo_for_stmt (phi);
4455 /* The epilogue is created for the outer-loop, i.e., for the loop being
4456 vectorized. Create exit phis for the outer loop. */
4457 if (double_reduc)
4459 loop = outer_loop;
4460 exit_bb = single_exit (loop)->dest;
4461 inner_phis.create (vect_defs.length ());
4462 FOR_EACH_VEC_ELT (new_phis, i, phi)
4464 tree new_result = copy_ssa_name (PHI_RESULT (phi));
4465 gphi *outer_phi = create_phi_node (new_result, exit_bb);
4466 SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx,
4467 PHI_RESULT (phi));
4468 set_vinfo_for_stmt (outer_phi, new_stmt_vec_info (outer_phi,
4469 loop_vinfo));
4470 inner_phis.quick_push (phi);
4471 new_phis[i] = outer_phi;
4472 prev_phi_info = vinfo_for_stmt (outer_phi);
4473 while (STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi)))
4475 phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi));
4476 new_result = copy_ssa_name (PHI_RESULT (phi));
4477 outer_phi = create_phi_node (new_result, exit_bb);
4478 SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx,
4479 PHI_RESULT (phi));
4480 set_vinfo_for_stmt (outer_phi, new_stmt_vec_info (outer_phi,
4481 loop_vinfo));
4482 STMT_VINFO_RELATED_STMT (prev_phi_info) = outer_phi;
4483 prev_phi_info = vinfo_for_stmt (outer_phi);
4488 exit_gsi = gsi_after_labels (exit_bb);
4490 /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
4491 (i.e. when reduc_code is not available) and in the final adjustment
4492 code (if needed). Also get the original scalar reduction variable as
4493 defined in the loop. In case STMT is a "pattern-stmt" (i.e. - it
4494 represents a reduction pattern), the tree-code and scalar-def are
4495 taken from the original stmt that the pattern-stmt (STMT) replaces.
4496 Otherwise (it is a regular reduction) - the tree-code and scalar-def
4497 are taken from STMT. */
4499 orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
4500 if (!orig_stmt)
4502 /* Regular reduction */
4503 orig_stmt = stmt;
4505 else
4507 /* Reduction pattern */
4508 stmt_vec_info stmt_vinfo = vinfo_for_stmt (orig_stmt);
4509 gcc_assert (STMT_VINFO_IN_PATTERN_P (stmt_vinfo));
4510 gcc_assert (STMT_VINFO_RELATED_STMT (stmt_vinfo) == stmt);
4513 code = gimple_assign_rhs_code (orig_stmt);
4514 /* For MINUS_EXPR the initial vector is [init_val,0,...,0], therefore,
4515 partial results are added and not subtracted. */
4516 if (code == MINUS_EXPR)
4517 code = PLUS_EXPR;
4519 scalar_dest = gimple_assign_lhs (orig_stmt);
4520 scalar_type = TREE_TYPE (scalar_dest);
4521 scalar_results.create (group_size);
4522 new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
4523 bitsize = TYPE_SIZE (scalar_type);
4525 /* In case this is a reduction in an inner-loop while vectorizing an outer
4526 loop - we don't need to extract a single scalar result at the end of the
4527 inner-loop (unless it is double reduction, i.e., the use of reduction is
4528 outside the outer-loop). The final vector of partial results will be used
4529 in the vectorized outer-loop, or reduced to a scalar result at the end of
4530 the outer-loop. */
4531 if (nested_in_vect_loop && !double_reduc)
4532 goto vect_finalize_reduction;
4534 /* SLP reduction without reduction chain, e.g.,
4535 # a1 = phi <a2, a0>
4536 # b1 = phi <b2, b0>
4537 a2 = operation (a1)
4538 b2 = operation (b1) */
4539 slp_reduc = (slp_node && !GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)));
4541 /* In case of reduction chain, e.g.,
4542 # a1 = phi <a3, a0>
4543 a2 = operation (a1)
4544 a3 = operation (a2),
4546 we may end up with more than one vector result. Here we reduce them to
4547 one vector. */
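  /* E.g., for the chain a1 -> a2 -> a3 above, vectorized with two vector
     stmts, the two partial-result vectors are combined with the reduction
     operation below before the scalar epilogue reduction is emitted.  */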
4548 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
4550 tree first_vect = PHI_RESULT (new_phis[0]);
4551 tree tmp;
4552 gassign *new_vec_stmt = NULL;
4554 vec_dest = vect_create_destination_var (scalar_dest, vectype);
4555 for (k = 1; k < new_phis.length (); k++)
4557 gimple *next_phi = new_phis[k];
4558 tree second_vect = PHI_RESULT (next_phi);
4560 tmp = build2 (code, vectype, first_vect, second_vect);
4561 new_vec_stmt = gimple_build_assign (vec_dest, tmp);
4562 first_vect = make_ssa_name (vec_dest, new_vec_stmt);
4563 gimple_assign_set_lhs (new_vec_stmt, first_vect);
4564 gsi_insert_before (&exit_gsi, new_vec_stmt, GSI_SAME_STMT);
4567 new_phi_result = first_vect;
4568 if (new_vec_stmt)
4570 new_phis.truncate (0);
4571 new_phis.safe_push (new_vec_stmt);
4574 else
4575 new_phi_result = PHI_RESULT (new_phis[0]);
4577 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION)
4579 /* For condition reductions, we have a vector (NEW_PHI_RESULT) containing
4580 various data values where the condition matched and another vector
4581 (INDUCTION_INDEX) containing all the indexes of those matches. We
4582 need to extract the last matching index (which will be the index with
4583 highest value) and use this to index into the data vector.
4584 For the case where there were no matches, the data vector will contain
4585 all default values and the index vector will be all zeros. */
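      /* Worked example: with index vector {0, 3, 7, 2} and data vector
	 {d0, a, b, c}, the maximum index is 7, so the comparison below selects
	 lane 2, the VEC_COND yields {0, 0, b, 0}, and the final unsigned MAX
	 reduction extracts b.  */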
4587 /* Get various versions of the type of the vector of indexes. */
4588 tree index_vec_type = TREE_TYPE (induction_index);
4589 gcc_checking_assert (TYPE_UNSIGNED (index_vec_type));
4590 tree index_scalar_type = TREE_TYPE (index_vec_type);
4591 tree index_vec_cmp_type = build_same_sized_truth_vector_type
4592 (index_vec_type);
4594 /* Get an unsigned integer version of the type of the data vector. */
4595 int scalar_precision = GET_MODE_PRECISION (TYPE_MODE (scalar_type));
4596 tree scalar_type_unsigned = make_unsigned_type (scalar_precision);
4597 tree vectype_unsigned = build_vector_type
4598 (scalar_type_unsigned, TYPE_VECTOR_SUBPARTS (vectype));
4600 /* First we need to create a vector (ZERO_VEC) of zeros and another
4601 vector (MAX_INDEX_VEC) filled with the last matching index, which we
4602 can create using a MAX reduction and then expanding.
4603 In the case where the loop never made any matches, the max index will
4604 be zero. */
4606 /* Vector of {0, 0, 0,...}. */
4607 tree zero_vec = make_ssa_name (vectype);
4608 tree zero_vec_rhs = build_zero_cst (vectype);
4609 gimple *zero_vec_stmt = gimple_build_assign (zero_vec, zero_vec_rhs);
4610 gsi_insert_before (&exit_gsi, zero_vec_stmt, GSI_SAME_STMT);
4612 /* Find maximum value from the vector of found indexes. */
4613 tree max_index = make_ssa_name (index_scalar_type);
4614 gimple *max_index_stmt = gimple_build_assign (max_index, REDUC_MAX_EXPR,
4615 induction_index);
4616 gsi_insert_before (&exit_gsi, max_index_stmt, GSI_SAME_STMT);
4618 /* Vector of {max_index, max_index, max_index,...}. */
4619 tree max_index_vec = make_ssa_name (index_vec_type);
4620 tree max_index_vec_rhs = build_vector_from_val (index_vec_type,
4621 max_index);
4622 gimple *max_index_vec_stmt = gimple_build_assign (max_index_vec,
4623 max_index_vec_rhs);
4624 gsi_insert_before (&exit_gsi, max_index_vec_stmt, GSI_SAME_STMT);
4626 /* Next we compare the new vector (MAX_INDEX_VEC) full of max indexes
4627 with the vector (INDUCTION_INDEX) of found indexes, choosing values
4628 from the data vector (NEW_PHI_RESULT) for matches, 0 (ZERO_VEC)
4629 otherwise. Only one value should match, resulting in a vector
4630 (VEC_COND) with one data value and the rest zeros.
4631 In the case where the loop never made any matches, every index will
4632 match, resulting in a vector with all data values (which will all be
4633 the default value). */
4635 /* Compare the max index vector to the vector of found indexes to find
4636 the position of the max value. */
4637 tree vec_compare = make_ssa_name (index_vec_cmp_type);
4638 gimple *vec_compare_stmt = gimple_build_assign (vec_compare, EQ_EXPR,
4639 induction_index,
4640 max_index_vec);
4641 gsi_insert_before (&exit_gsi, vec_compare_stmt, GSI_SAME_STMT);
4643 /* Use the compare to choose either values from the data vector or
4644 zero. */
4645 tree vec_cond = make_ssa_name (vectype);
4646 gimple *vec_cond_stmt = gimple_build_assign (vec_cond, VEC_COND_EXPR,
4647 vec_compare, new_phi_result,
4648 zero_vec);
4649 gsi_insert_before (&exit_gsi, vec_cond_stmt, GSI_SAME_STMT);
4651 /* Finally we need to extract the data value from the vector (VEC_COND)
4652	 into a scalar (MATCHED_DATA_REDUC).  Logically we want to do an OR
4653 reduction, but because this doesn't exist, we can use a MAX reduction
4654 instead. The data value might be signed or a float so we need to cast
4655 it first.
4656 In the case where the loop never made any matches, the data values are
4657 all identical, and so will reduce down correctly. */
4659 /* Make the matched data values unsigned. */
4660 tree vec_cond_cast = make_ssa_name (vectype_unsigned);
4661 tree vec_cond_cast_rhs = build1 (VIEW_CONVERT_EXPR, vectype_unsigned,
4662 vec_cond);
4663 gimple *vec_cond_cast_stmt = gimple_build_assign (vec_cond_cast,
4664 VIEW_CONVERT_EXPR,
4665 vec_cond_cast_rhs);
4666 gsi_insert_before (&exit_gsi, vec_cond_cast_stmt, GSI_SAME_STMT);
4668 /* Reduce down to a scalar value. */
4669 tree data_reduc = make_ssa_name (scalar_type_unsigned);
4670 optab ot = optab_for_tree_code (REDUC_MAX_EXPR, vectype_unsigned,
4671 optab_default);
4672 gcc_assert (optab_handler (ot, TYPE_MODE (vectype_unsigned))
4673 != CODE_FOR_nothing);
4674 gimple *data_reduc_stmt = gimple_build_assign (data_reduc,
4675 REDUC_MAX_EXPR,
4676 vec_cond_cast);
4677 gsi_insert_before (&exit_gsi, data_reduc_stmt, GSI_SAME_STMT);
4679 /* Convert the reduced value back to the result type and set as the
4680 result. */
4681 tree data_reduc_cast = build1 (VIEW_CONVERT_EXPR, scalar_type,
4682 data_reduc);
4683 epilog_stmt = gimple_build_assign (new_scalar_dest, data_reduc_cast);
4684 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
4685 gimple_assign_set_lhs (epilog_stmt, new_temp);
4686 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4687 scalar_results.safe_push (new_temp);
4690 /* 2.3 Create the reduction code, using one of the three schemes described
4691 above. In SLP we simply need to extract all the elements from the
4692 vector (without reducing them), so we use scalar shifts. */
4693 else if (reduc_code != ERROR_MARK && !slp_reduc)
4695 tree tmp;
4696 tree vec_elem_type;
4698 /*** Case 1: Create:
4699 v_out2 = reduc_expr <v_out1> */
4701 if (dump_enabled_p ())
4702 dump_printf_loc (MSG_NOTE, vect_location,
4703 "Reduce using direct vector reduction.\n");
4705 vec_elem_type = TREE_TYPE (TREE_TYPE (new_phi_result));
4706 if (!useless_type_conversion_p (scalar_type, vec_elem_type))
4708 tree tmp_dest =
4709 vect_create_destination_var (scalar_dest, vec_elem_type);
4710 tmp = build1 (reduc_code, vec_elem_type, new_phi_result);
4711 epilog_stmt = gimple_build_assign (tmp_dest, tmp);
4712 new_temp = make_ssa_name (tmp_dest, epilog_stmt);
4713 gimple_assign_set_lhs (epilog_stmt, new_temp);
4714 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4716 tmp = build1 (NOP_EXPR, scalar_type, new_temp);
4718 else
4719 tmp = build1 (reduc_code, scalar_type, new_phi_result);
4721 epilog_stmt = gimple_build_assign (new_scalar_dest, tmp);
4722 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
4723 gimple_assign_set_lhs (epilog_stmt, new_temp);
4724 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4726 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
4727 == INTEGER_INDUC_COND_REDUCTION)
4729 /* Earlier we set the initial value to be zero. Check the result
4730 and if it is zero then replace with the original initial
4731 value. */
4732 tree zero = build_zero_cst (scalar_type);
4733 tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp, zero);
4735 tmp = make_ssa_name (new_scalar_dest);
4736 epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
4737 initial_def, new_temp);
4738 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4739 new_temp = tmp;
4742 scalar_results.safe_push (new_temp);
4744 else
4746 bool reduce_with_shift = have_whole_vector_shift (mode);
4747 int element_bitsize = tree_to_uhwi (bitsize);
4748 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
4749 tree vec_temp;
4751 /* Regardless of whether we have a whole vector shift, if we're
4752 emulating the operation via tree-vect-generic, we don't want
4753 to use it. Only the first round of the reduction is likely
4754 to still be profitable via emulation. */
4755 /* ??? It might be better to emit a reduction tree code here, so that
4756 tree-vect-generic can expand the first round via bit tricks. */
4757 if (!VECTOR_MODE_P (mode))
4758 reduce_with_shift = false;
4759 else
4761 optab optab = optab_for_tree_code (code, vectype, optab_default);
4762 if (optab_handler (optab, mode) == CODE_FOR_nothing)
4763 reduce_with_shift = false;
4766 if (reduce_with_shift && !slp_reduc)
4768 int nelements = vec_size_in_bits / element_bitsize;
4769 unsigned char *sel = XALLOCAVEC (unsigned char, nelements);
4771 int elt_offset;
4773 tree zero_vec = build_zero_cst (vectype);
4774 /*** Case 2: Create:
4775 for (offset = nelements/2; offset >= 1; offset/=2)
4777 Create: va' = vec_shift <va, offset>
4778 Create: va = vop <va, va'>
4779 } */
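	  /* E.g., for a 4-element vector the loop below performs two rounds
	     (offset 2, then offset 1); after the last round the element at
	     bit offset 0 holds the complete reduction and is extracted in
	     step 2.4 just below.  */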
4781 tree rhs;
4783 if (dump_enabled_p ())
4784 dump_printf_loc (MSG_NOTE, vect_location,
4785 "Reduce using vector shifts\n");
4787 vec_dest = vect_create_destination_var (scalar_dest, vectype);
4788 new_temp = new_phi_result;
4789 for (elt_offset = nelements / 2;
4790 elt_offset >= 1;
4791 elt_offset /= 2)
4793 calc_vec_perm_mask_for_shift (mode, elt_offset, sel);
4794 tree mask = vect_gen_perm_mask_any (vectype, sel);
4795 epilog_stmt = gimple_build_assign (vec_dest, VEC_PERM_EXPR,
4796 new_temp, zero_vec, mask);
4797 new_name = make_ssa_name (vec_dest, epilog_stmt);
4798 gimple_assign_set_lhs (epilog_stmt, new_name);
4799 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4801 epilog_stmt = gimple_build_assign (vec_dest, code, new_name,
4802 new_temp);
4803 new_temp = make_ssa_name (vec_dest, epilog_stmt);
4804 gimple_assign_set_lhs (epilog_stmt, new_temp);
4805 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4808 /* 2.4 Extract the final scalar result. Create:
4809 s_out3 = extract_field <v_out2, bitpos> */
4811 if (dump_enabled_p ())
4812 dump_printf_loc (MSG_NOTE, vect_location,
4813 "extract scalar result\n");
4815 rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp,
4816 bitsize, bitsize_zero_node);
4817 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
4818 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
4819 gimple_assign_set_lhs (epilog_stmt, new_temp);
4820 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4821 scalar_results.safe_push (new_temp);
4823 else
4825 /*** Case 3: Create:
4826 s = extract_field <v_out2, 0>
4827 for (offset = element_size;
4828 offset < vector_size;
4829 offset += element_size;)
4831 Create: s' = extract_field <v_out2, offset>
4832 Create: s = op <s, s'> // For non SLP cases
4833 } */
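	  /* E.g., for one V4SI partial result this emits four BIT_FIELD_REF
	     extracts and, in the non-SLP case, three scalar reduction ops
	     chaining the extracted values together.  */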
4835 if (dump_enabled_p ())
4836 dump_printf_loc (MSG_NOTE, vect_location,
4837 "Reduce using scalar code.\n");
4839 vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
4840 FOR_EACH_VEC_ELT (new_phis, i, new_phi)
4842 int bit_offset;
4843 if (gimple_code (new_phi) == GIMPLE_PHI)
4844 vec_temp = PHI_RESULT (new_phi);
4845 else
4846 vec_temp = gimple_assign_lhs (new_phi);
4847 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, bitsize,
4848 bitsize_zero_node);
4849 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
4850 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
4851 gimple_assign_set_lhs (epilog_stmt, new_temp);
4852 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4854	      /* In SLP we don't need to apply the reduction operation, so we
4855		 just collect the s' values in SCALAR_RESULTS.  */
4856 if (slp_reduc)
4857 scalar_results.safe_push (new_temp);
4859 for (bit_offset = element_bitsize;
4860 bit_offset < vec_size_in_bits;
4861 bit_offset += element_bitsize)
4863 tree bitpos = bitsize_int (bit_offset);
4864 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp,
4865 bitsize, bitpos);
4867 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
4868 new_name = make_ssa_name (new_scalar_dest, epilog_stmt);
4869 gimple_assign_set_lhs (epilog_stmt, new_name);
4870 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4872 if (slp_reduc)
4874		  /* In SLP we don't need to apply the reduction operation,
4875		     so we just collect the s' values in SCALAR_RESULTS.  */
4876 new_temp = new_name;
4877 scalar_results.safe_push (new_name);
4879 else
4881 epilog_stmt = gimple_build_assign (new_scalar_dest, code,
4882 new_name, new_temp);
4883 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
4884 gimple_assign_set_lhs (epilog_stmt, new_temp);
4885 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4890      /* The only case where we need to reduce scalar results in SLP is
4891	 unrolling.  If the size of SCALAR_RESULTS is greater than
4892	 GROUP_SIZE, we reduce them by combining elements modulo
4893	 GROUP_SIZE.  */
4894 if (slp_reduc)
4896 tree res, first_res, new_res;
4897 gimple *new_stmt;
4899 /* Reduce multiple scalar results in case of SLP unrolling. */
4900 for (j = group_size; scalar_results.iterate (j, &res);
4901 j++)
4903 first_res = scalar_results[j % group_size];
4904 new_stmt = gimple_build_assign (new_scalar_dest, code,
4905 first_res, res);
4906 new_res = make_ssa_name (new_scalar_dest, new_stmt);
4907 gimple_assign_set_lhs (new_stmt, new_res);
4908 gsi_insert_before (&exit_gsi, new_stmt, GSI_SAME_STMT);
4909 scalar_results[j % group_size] = new_res;
4912 else
4913 /* Not SLP - we have one scalar to keep in SCALAR_RESULTS. */
4914 scalar_results.safe_push (new_temp);
4918 vect_finalize_reduction:
4920 if (double_reduc)
4921 loop = loop->inner;
4923 /* 2.5 Adjust the final result by the initial value of the reduction
4924 variable. (When such adjustment is not needed, then
4925 'adjustment_def' is zero). For example, if code is PLUS we create:
4926 new_temp = loop_exit_def + adjustment_def */
4928 if (adjustment_def)
4930 gcc_assert (!slp_reduc);
4931 if (nested_in_vect_loop)
4933 new_phi = new_phis[0];
4934 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) == VECTOR_TYPE);
4935 expr = build2 (code, vectype, PHI_RESULT (new_phi), adjustment_def);
4936 new_dest = vect_create_destination_var (scalar_dest, vectype);
4938 else
4940 new_temp = scalar_results[0];
4941 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
4942 expr = build2 (code, scalar_type, new_temp, adjustment_def);
4943 new_dest = vect_create_destination_var (scalar_dest, scalar_type);
4946 epilog_stmt = gimple_build_assign (new_dest, expr);
4947 new_temp = make_ssa_name (new_dest, epilog_stmt);
4948 gimple_assign_set_lhs (epilog_stmt, new_temp);
4949 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4950 if (nested_in_vect_loop)
4952 set_vinfo_for_stmt (epilog_stmt,
4953 new_stmt_vec_info (epilog_stmt, loop_vinfo));
4954 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (epilog_stmt)) =
4955 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (new_phi));
4957 if (!double_reduc)
4958 scalar_results.quick_push (new_temp);
4959 else
4960 scalar_results[0] = new_temp;
4962 else
4963 scalar_results[0] = new_temp;
4965 new_phis[0] = epilog_stmt;
4968 /* 2.6 Handle the loop-exit phis. Replace the uses of scalar loop-exit
4969 phis with new adjusted scalar results, i.e., replace use <s_out0>
4970 with use <s_out4>.
4972 Transform:
4973 loop_exit:
4974 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
4975 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
4976 v_out2 = reduce <v_out1>
4977 s_out3 = extract_field <v_out2, 0>
4978 s_out4 = adjust_result <s_out3>
4979 use <s_out0>
4980 use <s_out0>
4982 into:
4984 loop_exit:
4985 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
4986 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
4987 v_out2 = reduce <v_out1>
4988 s_out3 = extract_field <v_out2, 0>
4989 s_out4 = adjust_result <s_out3>
4990 use <s_out4>
4991 use <s_out4> */
4994   /* For an SLP reduction chain we reduce the vector results into one vector
4995      if necessary, hence we set GROUP_SIZE to 1 here.  SCALAR_DEST is the LHS
4996      of the last stmt in the reduction chain, since we are looking for the
4997      loop-exit phi node.  */
4998 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
5000 gimple *dest_stmt = SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1];
5001 /* Handle reduction patterns. */
5002 if (STMT_VINFO_RELATED_STMT (vinfo_for_stmt (dest_stmt)))
5003 dest_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (dest_stmt));
5005 scalar_dest = gimple_assign_lhs (dest_stmt);
5006 group_size = 1;
5009   /* In SLP we may have several statements in NEW_PHIS and REDUCTION_PHIS (in
5010      case GROUP_SIZE is greater than the vectorization factor).  Therefore, we
5011      need to match SCALAR_RESULTS with the corresponding statements.  The first
5012      (GROUP_SIZE / number of new vector stmts) scalar results correspond to
5013      the first vector stmt, etc.
5014      (RATIO is equal to (GROUP_SIZE / number of new vector stmts)).  */
5015 if (group_size > new_phis.length ())
5017 ratio = group_size / new_phis.length ();
5018 gcc_assert (!(group_size % new_phis.length ()));
5020 else
5021 ratio = 1;
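  /* E.g., with GROUP_SIZE = 4 scalar reductions and two new vector stmts,
     RATIO = 2: scalar results 0 and 1 belong to the first vector stmt,
     results 2 and 3 to the second.  */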
5023 for (k = 0; k < group_size; k++)
5025 if (k % ratio == 0)
5027 epilog_stmt = new_phis[k / ratio];
5028 reduction_phi = reduction_phis[k / ratio];
5029 if (double_reduc)
5030 inner_phi = inner_phis[k / ratio];
5033 if (slp_reduc)
5035 gimple *current_stmt = SLP_TREE_SCALAR_STMTS (slp_node)[k];
5037 orig_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (current_stmt));
5038 /* SLP statements can't participate in patterns. */
5039 gcc_assert (!orig_stmt);
5040 scalar_dest = gimple_assign_lhs (current_stmt);
5043 phis.create (3);
5044 /* Find the loop-closed-use at the loop exit of the original scalar
5045 result. (The reduction result is expected to have two immediate uses -
5046 one at the latch block, and one at the loop exit). */
5047 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
5048 if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p)))
5049 && !is_gimple_debug (USE_STMT (use_p)))
5050 phis.safe_push (USE_STMT (use_p));
5052 /* While we expect to have found an exit_phi because of loop-closed-ssa
5053 form we can end up without one if the scalar cycle is dead. */
5055 FOR_EACH_VEC_ELT (phis, i, exit_phi)
5057 if (outer_loop)
5059 stmt_vec_info exit_phi_vinfo = vinfo_for_stmt (exit_phi);
5060 gphi *vect_phi;
5062	  /* FORNOW.  We do not yet support the case where an inner-loop
5063	     reduction is used only outside the outer-loop, unless it is
5064	     a double reduction.  */
5065 gcc_assert ((STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
5066 && !STMT_VINFO_LIVE_P (exit_phi_vinfo))
5067 || double_reduc);
5069 if (double_reduc)
5070 STMT_VINFO_VEC_STMT (exit_phi_vinfo) = inner_phi;
5071 else
5072 STMT_VINFO_VEC_STMT (exit_phi_vinfo) = epilog_stmt;
5073 if (!double_reduc
5074 || STMT_VINFO_DEF_TYPE (exit_phi_vinfo)
5075 != vect_double_reduction_def)
5076 continue;
5078 /* Handle double reduction:
5080 stmt1: s1 = phi <s0, s2> - double reduction phi (outer loop)
5081 stmt2: s3 = phi <s1, s4> - (regular) reduc phi (inner loop)
5082 stmt3: s4 = use (s3) - (regular) reduc stmt (inner loop)
5083 stmt4: s2 = phi <s4> - double reduction stmt (outer loop)
5085 At that point the regular reduction (stmt2 and stmt3) is
5086 already vectorized, as well as the exit phi node, stmt4.
5087 Here we vectorize the phi node of double reduction, stmt1, and
5088 update all relevant statements. */
5090 /* Go through all the uses of s2 to find double reduction phi
5091 node, i.e., stmt1 above. */
5092 orig_name = PHI_RESULT (exit_phi);
5093 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
5095 stmt_vec_info use_stmt_vinfo;
5096 stmt_vec_info new_phi_vinfo;
5097 tree vect_phi_init, preheader_arg, vect_phi_res, init_def;
5098 basic_block bb = gimple_bb (use_stmt);
5099 gimple *use;
5101 /* Check that USE_STMT is really double reduction phi
5102 node. */
5103 if (gimple_code (use_stmt) != GIMPLE_PHI
5104 || gimple_phi_num_args (use_stmt) != 2
5105 || bb->loop_father != outer_loop)
5106 continue;
5107 use_stmt_vinfo = vinfo_for_stmt (use_stmt);
5108 if (!use_stmt_vinfo
5109 || STMT_VINFO_DEF_TYPE (use_stmt_vinfo)
5110 != vect_double_reduction_def)
5111 continue;
5113 /* Create vector phi node for double reduction:
5114 vs1 = phi <vs0, vs2>
5115 vs1 was created previously in this function by a call to
5116 vect_get_vec_def_for_operand and is stored in
5117 vec_initial_def;
5118 vs2 is defined by INNER_PHI, the vectorized EXIT_PHI;
5119 vs0 is created here. */
5121 /* Create vector phi node. */
5122 vect_phi = create_phi_node (vec_initial_def, bb);
5123 new_phi_vinfo = new_stmt_vec_info (vect_phi,
5124 loop_vec_info_for_loop (outer_loop));
5125 set_vinfo_for_stmt (vect_phi, new_phi_vinfo);
5127 /* Create vs0 - initial def of the double reduction phi. */
5128 preheader_arg = PHI_ARG_DEF_FROM_EDGE (use_stmt,
5129 loop_preheader_edge (outer_loop));
5130 init_def = get_initial_def_for_reduction (stmt,
5131 preheader_arg, NULL);
5132 vect_phi_init = vect_init_vector (use_stmt, init_def,
5133 vectype, NULL);
5135 /* Update phi node arguments with vs0 and vs2. */
5136 add_phi_arg (vect_phi, vect_phi_init,
5137 loop_preheader_edge (outer_loop),
5138 UNKNOWN_LOCATION);
5139 add_phi_arg (vect_phi, PHI_RESULT (inner_phi),
5140 loop_latch_edge (outer_loop), UNKNOWN_LOCATION);
5141 if (dump_enabled_p ())
5143 dump_printf_loc (MSG_NOTE, vect_location,
5144 "created double reduction phi node: ");
5145 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, vect_phi, 0);
5146 dump_printf (MSG_NOTE, "\n");
5149 vect_phi_res = PHI_RESULT (vect_phi);
5151 /* Replace the use, i.e., set the correct vs1 in the regular
5152 reduction phi node. FORNOW, NCOPIES is always 1, so the
5153 loop is redundant. */
5154 use = reduction_phi;
5155 for (j = 0; j < ncopies; j++)
5157 edge pr_edge = loop_preheader_edge (loop);
5158 SET_PHI_ARG_DEF (use, pr_edge->dest_idx, vect_phi_res);
5159 use = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (use));
5165 phis.release ();
5166 if (nested_in_vect_loop)
5168 if (double_reduc)
5169 loop = outer_loop;
5170 else
5171 continue;
5174 phis.create (3);
5175 /* Find the loop-closed-use at the loop exit of the original scalar
5176 result. (The reduction result is expected to have two immediate uses,
5177 one at the latch block, and one at the loop exit). For double
5178 reductions we are looking for exit phis of the outer loop. */
5179 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
5181 if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
5183 if (!is_gimple_debug (USE_STMT (use_p)))
5184 phis.safe_push (USE_STMT (use_p));
5186 else
5188 if (double_reduc && gimple_code (USE_STMT (use_p)) == GIMPLE_PHI)
5190 tree phi_res = PHI_RESULT (USE_STMT (use_p));
5192 FOR_EACH_IMM_USE_FAST (phi_use_p, phi_imm_iter, phi_res)
5194 if (!flow_bb_inside_loop_p (loop,
5195 gimple_bb (USE_STMT (phi_use_p)))
5196 && !is_gimple_debug (USE_STMT (phi_use_p)))
5197 phis.safe_push (USE_STMT (phi_use_p));
5203 FOR_EACH_VEC_ELT (phis, i, exit_phi)
5205 /* Replace the uses: */
5206 orig_name = PHI_RESULT (exit_phi);
5207 scalar_result = scalar_results[k];
5208 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
5209 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
5210 SET_USE (use_p, scalar_result);
5213 phis.release ();
5218 /* Function is_nonwrapping_integer_induction.
5220   Check that the induction defined by STMT (which is part of loop LOOP)
5221   increments and cannot overflow (wrap around).  */
5223 static bool
5224 is_nonwrapping_integer_induction (gimple *stmt, struct loop *loop)
5226 stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
5227 tree base = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo);
5228 tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo);
5229 tree lhs_type = TREE_TYPE (gimple_phi_result (stmt));
5230 widest_int ni, max_loop_value, lhs_max;
5231 bool overflow = false;
5233 /* Make sure the loop is integer based. */
5234 if (TREE_CODE (base) != INTEGER_CST
5235 || TREE_CODE (step) != INTEGER_CST)
5236 return false;
5238 /* Check that the induction increments. */
5239 if (tree_int_cst_sgn (step) == -1)
5240 return false;
5242 /* Check that the max size of the loop will not wrap. */
5244 if (TYPE_OVERFLOW_UNDEFINED (lhs_type))
5245 return true;
5247 if (! max_stmt_executions (loop, &ni))
5248 return false;
5250 max_loop_value = wi::mul (wi::to_widest (step), ni, TYPE_SIGN (lhs_type),
5251 &overflow);
5252 if (overflow)
5253 return false;
5255 max_loop_value = wi::add (wi::to_widest (base), max_loop_value,
5256 TYPE_SIGN (lhs_type), &overflow);
5257 if (overflow)
5258 return false;
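  /* The induction cannot wrap if BASE + STEP * NITER still fits in the
     precision of the result type.  E.g., an unsigned char IV starting at 100
     with step 1 over at most 27 iterations reaches 127, which needs only
     7 bits and therefore cannot wrap an 8-bit type.  */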
5260 return (wi::min_precision (max_loop_value, TYPE_SIGN (lhs_type))
5261 <= TYPE_PRECISION (lhs_type));
5264 /* Function vectorizable_reduction.
5266 Check if STMT performs a reduction operation that can be vectorized.
5267 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
5268 stmt to replace it, put it in VEC_STMT, and insert it at GSI.
5269 Return FALSE if not a vectorizable STMT, TRUE otherwise.
5271 This function also handles reduction idioms (patterns) that have been
5272 recognized in advance during vect_pattern_recog. In this case, STMT may be
5273 of this form:
5274 X = pattern_expr (arg0, arg1, ..., X)
5275   and its STMT_VINFO_RELATED_STMT points to the last stmt in the original
5276 sequence that had been detected and replaced by the pattern-stmt (STMT).
5278 This function also handles reduction of condition expressions, for example:
5279 for (int i = 0; i < N; i++)
5280 if (a[i] < value)
5281 last = a[i];
5282   This is handled by vectorizing the loop and creating an additional vector
5283 containing the loop indexes for which "a[i] < value" was true. In the
5284 function epilogue this is reduced to a single max value and then used to
5285 index into the vector of results.
5287 In some cases of reduction patterns, the type of the reduction variable X is
5288 different than the type of the other arguments of STMT.
5289 In such cases, the vectype that is used when transforming STMT into a vector
5290 stmt is different than the vectype that is used to determine the
5291 vectorization factor, because it consists of a different number of elements
5292 than the actual number of elements that are being operated upon in parallel.
5294 For example, consider an accumulation of shorts into an int accumulator.
5295 On some targets it's possible to vectorize this pattern operating on 8
5296 shorts at a time (hence, the vectype for purposes of determining the
5297 vectorization factor should be V8HI); on the other hand, the vectype that
5298 is used to create the vector form is actually V4SI (the type of the result).
5300 Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
5301   indicates the actual level of parallelism (V8HI in the example), so that
5302   the right vectorization factor is derived.  This vectype
5303 corresponds to the type of arguments to the reduction stmt, and should *NOT*
5304 be used to create the vectorized stmt. The right vectype for the vectorized
5305 stmt is obtained from the type of the result X:
5306 get_vectype_for_scalar_type (TREE_TYPE (X))
5308 This means that, contrary to "regular" reductions (or "regular" stmts in
5309 general), the following equation:
5310 STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (TREE_TYPE (X))
5311 does *NOT* necessarily hold for reduction patterns. */
5313 bool
5314 vectorizable_reduction (gimple *stmt, gimple_stmt_iterator *gsi,
5315 gimple **vec_stmt, slp_tree slp_node)
5317 tree vec_dest;
5318 tree scalar_dest;
5319 tree loop_vec_def0 = NULL_TREE, loop_vec_def1 = NULL_TREE;
5320 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5321 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
5322 tree vectype_in = NULL_TREE;
5323 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5324 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5325 enum tree_code code, orig_code, epilog_reduc_code;
5326 machine_mode vec_mode;
5327 int op_type;
5328 optab optab, reduc_optab;
5329 tree new_temp = NULL_TREE;
5330 gimple *def_stmt;
5331 enum vect_def_type dt;
5332 gphi *new_phi = NULL;
5333 tree scalar_type;
5334 bool is_simple_use;
5335 gimple *orig_stmt;
5336 stmt_vec_info orig_stmt_info;
5337 tree expr = NULL_TREE;
5338 int i;
5339 int ncopies;
5340 int epilog_copies;
5341 stmt_vec_info prev_stmt_info, prev_phi_info;
5342 bool single_defuse_cycle = false;
5343 tree reduc_def = NULL_TREE;
5344 gimple *new_stmt = NULL;
5345 int j;
5346 tree ops[3];
5347 bool nested_cycle = false, found_nested_cycle_def = false;
5348 gimple *reduc_def_stmt = NULL;
5349 bool double_reduc = false, dummy;
5350 basic_block def_bb;
5351 struct loop * def_stmt_loop, *outer_loop = NULL;
5352 tree def_arg;
5353 gimple *def_arg_stmt;
5354 auto_vec<tree> vec_oprnds0;
5355 auto_vec<tree> vec_oprnds1;
5356 auto_vec<tree> vect_defs;
5357 auto_vec<gimple *> phis;
5358 int vec_num;
5359 tree def0, def1, tem, op0, op1 = NULL_TREE;
5360 bool first_p = true;
5361 tree cr_index_scalar_type = NULL_TREE, cr_index_vector_type = NULL_TREE;
5362 gimple *cond_expr_induction_def_stmt = NULL;
5364   /* In case of a reduction chain we switch to the first stmt in the chain, but
5365      we don't update STMT_INFO, since only the last stmt is marked as a reduction
5366      and has reduction properties.  */
5367 if (GROUP_FIRST_ELEMENT (stmt_info)
5368 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
5370 stmt = GROUP_FIRST_ELEMENT (stmt_info);
5371 first_p = false;
5374 if (nested_in_vect_loop_p (loop, stmt))
5376 outer_loop = loop;
5377 loop = loop->inner;
5378 nested_cycle = true;
5381 /* 1. Is vectorizable reduction? */
5382 /* Not supportable if the reduction variable is used in the loop, unless
5383 it's a reduction chain. */
5384 if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer
5385 && !GROUP_FIRST_ELEMENT (stmt_info))
5386 return false;
5388   /* Reductions that are not used even in an enclosing outer-loop
5389 are expected to be "live" (used out of the loop). */
5390 if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope
5391 && !STMT_VINFO_LIVE_P (stmt_info))
5392 return false;
5394 /* Make sure it was already recognized as a reduction computation. */
5395 if (STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmt)) != vect_reduction_def
5396 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmt)) != vect_nested_cycle)
5397 return false;
5399 /* 2. Has this been recognized as a reduction pattern?
5401 Check if STMT represents a pattern that has been recognized
5402 in earlier analysis stages. For stmts that represent a pattern,
5403 the STMT_VINFO_RELATED_STMT field records the last stmt in
5404 the original sequence that constitutes the pattern. */
5406 orig_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt));
5407 if (orig_stmt)
5409 orig_stmt_info = vinfo_for_stmt (orig_stmt);
5410 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
5411 gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
5414 /* 3. Check the operands of the operation. The first operands are defined
5415 inside the loop body. The last operand is the reduction variable,
5416 which is defined by the loop-header-phi. */
5418 gcc_assert (is_gimple_assign (stmt));
5420 /* Flatten RHS. */
5421 switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
5423 case GIMPLE_SINGLE_RHS:
5424 op_type = TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt));
5425 if (op_type == ternary_op)
5427 tree rhs = gimple_assign_rhs1 (stmt);
5428 ops[0] = TREE_OPERAND (rhs, 0);
5429 ops[1] = TREE_OPERAND (rhs, 1);
5430 ops[2] = TREE_OPERAND (rhs, 2);
5431 code = TREE_CODE (rhs);
5433 else
5434 return false;
5435 break;
5437 case GIMPLE_BINARY_RHS:
5438 code = gimple_assign_rhs_code (stmt);
5439 op_type = TREE_CODE_LENGTH (code);
5440 gcc_assert (op_type == binary_op);
5441 ops[0] = gimple_assign_rhs1 (stmt);
5442 ops[1] = gimple_assign_rhs2 (stmt);
5443 break;
5445 case GIMPLE_TERNARY_RHS:
5446 code = gimple_assign_rhs_code (stmt);
5447 op_type = TREE_CODE_LENGTH (code);
5448 gcc_assert (op_type == ternary_op);
5449 ops[0] = gimple_assign_rhs1 (stmt);
5450 ops[1] = gimple_assign_rhs2 (stmt);
5451 ops[2] = gimple_assign_rhs3 (stmt);
5452 break;
5454 case GIMPLE_UNARY_RHS:
5455 return false;
5457 default:
5458 gcc_unreachable ();
5460   /* The default is that the reduction variable is the last operand in the statement.  */
5461 int reduc_index = op_type - 1;
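  /* For a MINUS_EXPR reduction, e.g. acc = acc - a[i], the accumulator is
     the first operand rather than the last, so use index 0 instead.  */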
5462 if (code == MINUS_EXPR)
5463 reduc_index = 0;
5465 if (code == COND_EXPR && slp_node)
5466 return false;
5468 scalar_dest = gimple_assign_lhs (stmt);
5469 scalar_type = TREE_TYPE (scalar_dest);
5470 if (!POINTER_TYPE_P (scalar_type) && !INTEGRAL_TYPE_P (scalar_type)
5471 && !SCALAR_FLOAT_TYPE_P (scalar_type))
5472 return false;
5474 /* Do not try to vectorize bit-precision reductions. */
5475 if ((TYPE_PRECISION (scalar_type)
5476 != GET_MODE_PRECISION (TYPE_MODE (scalar_type))))
5477 return false;
5479 /* All uses but the last are expected to be defined in the loop.
5480 The last use is the reduction variable. In case of nested cycle this
5481 assumption is not true: we use reduc_index to record the index of the
5482 reduction variable. */
5483 for (i = 0; i < op_type; i++)
5485 if (i == reduc_index)
5486 continue;
5488 /* The condition of COND_EXPR is checked in vectorizable_condition(). */
5489 if (i == 0 && code == COND_EXPR)
5490 continue;
5492 is_simple_use = vect_is_simple_use (ops[i], loop_vinfo,
5493 &def_stmt, &dt, &tem);
5494 if (!vectype_in)
5495 vectype_in = tem;
5496 gcc_assert (is_simple_use);
5498 if (dt != vect_internal_def
5499 && dt != vect_external_def
5500 && dt != vect_constant_def
5501 && dt != vect_induction_def
5502 && !(dt == vect_nested_cycle && nested_cycle))
5503 return false;
5505 if (dt == vect_nested_cycle)
5507 found_nested_cycle_def = true;
5508 reduc_def_stmt = def_stmt;
5509 reduc_index = i;
5512 if (i == 1 && code == COND_EXPR && dt == vect_induction_def)
5513 cond_expr_induction_def_stmt = def_stmt;
5516 is_simple_use = vect_is_simple_use (ops[reduc_index], loop_vinfo,
5517 &def_stmt, &dt, &tem);
5518 if (!vectype_in)
5519 vectype_in = tem;
5520 gcc_assert (is_simple_use);
5521 if (!found_nested_cycle_def)
5522 reduc_def_stmt = def_stmt;
5524 if (reduc_def_stmt && gimple_code (reduc_def_stmt) != GIMPLE_PHI)
5525 return false;
5527 if (!(dt == vect_reduction_def
5528 || dt == vect_nested_cycle
5529 || ((dt == vect_internal_def || dt == vect_external_def
5530 || dt == vect_constant_def || dt == vect_induction_def)
5531 && nested_cycle && found_nested_cycle_def)))
5533 /* For pattern recognized stmts, orig_stmt might be a reduction,
5534 but some helper statements for the pattern might not, or
5535 might be COND_EXPRs with reduction uses in the condition. */
5536 gcc_assert (orig_stmt);
5537 return false;
5540 enum vect_reduction_type v_reduc_type;
5541 gimple *tmp = vect_is_simple_reduction (loop_vinfo, reduc_def_stmt,
5542 !nested_cycle, &dummy, false,
5543 &v_reduc_type);
5545 /* If we have a condition reduction, see if we can simplify it further. */
5546 if (v_reduc_type == COND_REDUCTION
5547 && cond_expr_induction_def_stmt != NULL
5548 && is_nonwrapping_integer_induction (cond_expr_induction_def_stmt, loop))
5550 if (dump_enabled_p ())
5551 dump_printf_loc (MSG_NOTE, vect_location,
5552 "condition expression based on integer induction.\n");
5553 STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) = INTEGER_INDUC_COND_REDUCTION;
5555 else
5556 STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) = v_reduc_type;
5558 if (orig_stmt)
5559 gcc_assert (tmp == orig_stmt
5560 || GROUP_FIRST_ELEMENT (vinfo_for_stmt (tmp)) == orig_stmt);
5561 else
5562 /* We changed STMT to be the first stmt in reduction chain, hence we
5563 check that in this case the first element in the chain is STMT. */
5564 gcc_assert (stmt == tmp
5565 || GROUP_FIRST_ELEMENT (vinfo_for_stmt (tmp)) == stmt);
5567 if (STMT_VINFO_LIVE_P (vinfo_for_stmt (reduc_def_stmt)))
5568 return false;
5570 if (slp_node || PURE_SLP_STMT (stmt_info))
5571 ncopies = 1;
5572 else
5573 ncopies = (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
5574 / TYPE_VECTOR_SUBPARTS (vectype_in));
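    /* E.g. (a sketch) with a vectorization factor of 8 and a four-element
       input vectype such as V4SI, two copies of the vector statement are
       needed to cover one iteration of the vectorized loop.  */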
5576 gcc_assert (ncopies >= 1);
5578 vec_mode = TYPE_MODE (vectype_in);
5580 if (code == COND_EXPR)
5582 /* Only call during the analysis stage, otherwise we'll lose
5583 STMT_VINFO_TYPE. */
5584 if (!vec_stmt && !vectorizable_condition (stmt, gsi, NULL,
5585 ops[reduc_index], 0, NULL))
5587 if (dump_enabled_p ())
5588 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5589 "unsupported condition in reduction\n");
5590 return false;
5593 else
5595 /* 4. Supportable by target? */
5597 if (code == LSHIFT_EXPR || code == RSHIFT_EXPR
5598 || code == LROTATE_EXPR || code == RROTATE_EXPR)
5600         /* Shifts and rotates are only supported by vectorizable_shift,
5601 not vectorizable_reduction. */
5602 if (dump_enabled_p ())
5603 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5604 "unsupported shift or rotation.\n");
5605 return false;
5608 /* 4.1. check support for the operation in the loop */
5609 optab = optab_for_tree_code (code, vectype_in, optab_default);
5610 if (!optab)
5612 if (dump_enabled_p ())
5613 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5614 "no optab.\n");
5616 return false;
5619 if (optab_handler (optab, vec_mode) == CODE_FOR_nothing)
5621 if (dump_enabled_p ())
5622 dump_printf (MSG_NOTE, "op not supported by target.\n");
5624 if (GET_MODE_SIZE (vec_mode) != UNITS_PER_WORD
5625 || LOOP_VINFO_VECT_FACTOR (loop_vinfo)
5626 < vect_min_worthwhile_factor (code))
5627 return false;
5629 if (dump_enabled_p ())
5630 dump_printf (MSG_NOTE, "proceeding using word mode.\n");
5633 /* Worthwhile without SIMD support? */
5634 if (!VECTOR_MODE_P (TYPE_MODE (vectype_in))
5635 && LOOP_VINFO_VECT_FACTOR (loop_vinfo)
5636 < vect_min_worthwhile_factor (code))
5638 if (dump_enabled_p ())
5639 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5640 "not worthwhile without SIMD support.\n");
5642 return false;
5646 /* 4.2. Check support for the epilog operation.
5648 If STMT represents a reduction pattern, then the type of the
5649 reduction variable may be different than the type of the rest
5650 of the arguments. For example, consider the case of accumulation
5651         of shorts into an int accumulator; the original code:
5652 S1: int_a = (int) short_a;
5653 orig_stmt-> S2: int_acc = plus <int_a ,int_acc>;
5655 was replaced with:
5656 STMT: int_acc = widen_sum <short_a, int_acc>
5658 This means that:
5659 1. The tree-code that is used to create the vector operation in the
5660 epilog code (that reduces the partial results) is not the
5661 tree-code of STMT, but is rather the tree-code of the original
5662            stmt from the pattern that STMT is replacing.  I.e., in the example
5663 above we want to use 'widen_sum' in the loop, but 'plus' in the
5664 epilog.
5665 2. The type (mode) we use to check available target support
5666 for the vector operation to be created in the *epilog*, is
5667 determined by the type of the reduction variable (in the example
5668            above we'd check this: optab_handler (plus_optab, vect_int_mode)).
5669 However the type (mode) we use to check available target support
5670 for the vector operation to be created *inside the loop*, is
5671 determined by the type of the other arguments to STMT (in the
5672 example we'd check this: optab_handler (widen_sum_optab,
5673 vect_short_mode)).
5675 This is contrary to "regular" reductions, in which the types of all
5676 the arguments are the same as the type of the reduction variable.
5677 For "regular" reductions we can therefore use the same vector type
5678 (and also the same tree-code) when generating the epilog code and
5679 when generating the code inside the loop. */
5681 if (orig_stmt)
5683 /* This is a reduction pattern: get the vectype from the type of the
5684 reduction variable, and get the tree-code from orig_stmt. */
5685 gcc_assert (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5686 == TREE_CODE_REDUCTION);
5687 orig_code = gimple_assign_rhs_code (orig_stmt);
5688 gcc_assert (vectype_out);
5689 vec_mode = TYPE_MODE (vectype_out);
5691 else
5693       /* Regular reduction: the same vectype and tree-code that are used for
5694          the vector code inside the loop can also be used for the epilog code.  */
5695 orig_code = code;
5697 if (code == MINUS_EXPR)
5698 orig_code = PLUS_EXPR;
5700 /* For simple condition reductions, replace with the actual expression
5701 we want to base our reduction around. */
5702 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5703 == INTEGER_INDUC_COND_REDUCTION)
5704 orig_code = MAX_EXPR;
5707 if (nested_cycle)
5709 def_bb = gimple_bb (reduc_def_stmt);
5710 def_stmt_loop = def_bb->loop_father;
5711 def_arg = PHI_ARG_DEF_FROM_EDGE (reduc_def_stmt,
5712 loop_preheader_edge (def_stmt_loop));
5713 if (TREE_CODE (def_arg) == SSA_NAME
5714 && (def_arg_stmt = SSA_NAME_DEF_STMT (def_arg))
5715 && gimple_code (def_arg_stmt) == GIMPLE_PHI
5716 && flow_bb_inside_loop_p (outer_loop, gimple_bb (def_arg_stmt))
5717 && vinfo_for_stmt (def_arg_stmt)
5718 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_arg_stmt))
5719 == vect_double_reduction_def)
5720 double_reduc = true;
5723 epilog_reduc_code = ERROR_MARK;
5725 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == TREE_CODE_REDUCTION
5726 || STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5727 == INTEGER_INDUC_COND_REDUCTION)
5729 if (reduction_code_for_scalar_code (orig_code, &epilog_reduc_code))
5731 reduc_optab = optab_for_tree_code (epilog_reduc_code, vectype_out,
5732 optab_default);
5733 if (!reduc_optab)
5735 if (dump_enabled_p ())
5736 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5737 "no optab for reduction.\n");
5739 epilog_reduc_code = ERROR_MARK;
5741 else if (optab_handler (reduc_optab, vec_mode) == CODE_FOR_nothing)
5743 optab = scalar_reduc_to_vector (reduc_optab, vectype_out);
5744 if (optab_handler (optab, vec_mode) == CODE_FOR_nothing)
5746 if (dump_enabled_p ())
5747 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5748 "reduc op not supported by target.\n");
5750 epilog_reduc_code = ERROR_MARK;
5754 /* When epilog_reduc_code is ERROR_MARK then a reduction will be
5755 generated in the epilog using multiple expressions. This does not
5756 work for condition reductions. */
5757 if (epilog_reduc_code == ERROR_MARK
5758 && STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5759 == INTEGER_INDUC_COND_REDUCTION)
5761 if (dump_enabled_p ())
5762 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5763 "no reduc code for scalar code.\n");
5764 return false;
5767 else
5769 if (!nested_cycle || double_reduc)
5771 if (dump_enabled_p ())
5772 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5773 "no reduc code for scalar code.\n");
5775 return false;
5779 else
5781 int scalar_precision = GET_MODE_PRECISION (TYPE_MODE (scalar_type));
5782 cr_index_scalar_type = make_unsigned_type (scalar_precision);
5783 cr_index_vector_type = build_vector_type
5784 (cr_index_scalar_type, TYPE_VECTOR_SUBPARTS (vectype_out));
5786 epilog_reduc_code = REDUC_MAX_EXPR;
5787 optab = optab_for_tree_code (REDUC_MAX_EXPR, cr_index_vector_type,
5788 optab_default);
5789 if (optab_handler (optab, TYPE_MODE (cr_index_vector_type))
5790 == CODE_FOR_nothing)
5792 if (dump_enabled_p ())
5793 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5794 "reduc max op not supported by target.\n");
5795 return false;
5799 if ((double_reduc
5800 || STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION
5801 || STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5802 == INTEGER_INDUC_COND_REDUCTION)
5803 && ncopies > 1)
5805 if (dump_enabled_p ())
5806 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5807 "multiple types in double reduction or condition "
5808 "reduction.\n");
5809 return false;
5812   /* In case of a widening multiplication by a constant, we update the type
5813 of the constant to be the type of the other operand. We check that the
5814 constant fits the type in the pattern recognition pass. */
5815 if (code == DOT_PROD_EXPR
5816 && !types_compatible_p (TREE_TYPE (ops[0]), TREE_TYPE (ops[1])))
5818 if (TREE_CODE (ops[0]) == INTEGER_CST)
5819 ops[0] = fold_convert (TREE_TYPE (ops[1]), ops[0]);
5820 else if (TREE_CODE (ops[1]) == INTEGER_CST)
5821 ops[1] = fold_convert (TREE_TYPE (ops[0]), ops[1]);
5822 else
5824 if (dump_enabled_p ())
5825 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5826 "invalid types in dot-prod\n");
5828 return false;
5832 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION)
5834 widest_int ni;
5836 if (! max_loop_iterations (loop, &ni))
5838 if (dump_enabled_p ())
5839 dump_printf_loc (MSG_NOTE, vect_location,
5840 "loop count not known, cannot create cond "
5841 "reduction.\n");
5842 return false;
5844 /* Convert backedges to iterations. */
5845 ni += 1;
5847       /* The additional index will be the same type as the condition.  Check
5848          that the number of loop iterations fits into this type less one
5849          (the zero slot is reserved for the case where there is no match).  */
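      /* E.g. (a rough example) if the reduction result is an 8-bit type,
         the index type is an unsigned 8-bit type, so the check below
         rejects loops that may iterate 255 times or more, index 0 being
         reserved for the no-match case.  */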
5850 tree max_index = TYPE_MAX_VALUE (cr_index_scalar_type);
5851 if (wi::geu_p (ni, wi::to_widest (max_index)))
5853 if (dump_enabled_p ())
5854 dump_printf_loc (MSG_NOTE, vect_location,
5855 "loop size is greater than data size.\n");
5856 return false;
5860 if (!vec_stmt) /* transformation not required. */
5862 if (first_p
5863 && !vect_model_reduction_cost (stmt_info, epilog_reduc_code, ncopies,
5864 reduc_index))
5865 return false;
5866 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
5867 return true;
5870 /** Transform. **/
5872 if (dump_enabled_p ())
5873 dump_printf_loc (MSG_NOTE, vect_location, "transform reduction.\n");
5875 /* FORNOW: Multiple types are not supported for condition. */
5876 if (code == COND_EXPR)
5877 gcc_assert (ncopies == 1);
5879 /* Create the destination vector */
5880 vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
5882 /* In case the vectorization factor (VF) is bigger than the number
5883 of elements that we can fit in a vectype (nunits), we have to generate
5884 more than one vector stmt - i.e - we need to "unroll" the
5885 vector stmt by a factor VF/nunits. For more details see documentation
5886 in vectorizable_operation. */
5888 /* If the reduction is used in an outer loop we need to generate
5889 VF intermediate results, like so (e.g. for ncopies=2):
5890 r0 = phi (init, r0)
5891 r1 = phi (init, r1)
5892 r0 = x0 + r0;
5893 r1 = x1 + r1;
5894 (i.e. we generate VF results in 2 registers).
5895 In this case we have a separate def-use cycle for each copy, and therefore
5896 for each copy we get the vector def for the reduction variable from the
5897 respective phi node created for this copy.
5899 Otherwise (the reduction is unused in the loop nest), we can combine
5900 together intermediate results, like so (e.g. for ncopies=2):
5901 r = phi (init, r)
5902 r = x0 + r;
5903 r = x1 + r;
5904 (i.e. we generate VF/2 results in a single register).
5905 In this case for each copy we get the vector def for the reduction variable
5906 from the vectorized reduction operation generated in the previous iteration.
5909 if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope)
5911 single_defuse_cycle = true;
5912 epilog_copies = 1;
5914 else
5915 epilog_copies = ncopies;
5917 prev_stmt_info = NULL;
5918 prev_phi_info = NULL;
5919 if (slp_node)
5920 vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
5921 else
5923 vec_num = 1;
5924 vec_oprnds0.create (1);
5925 if (op_type == ternary_op)
5926 vec_oprnds1.create (1);
5929 phis.create (vec_num);
5930 vect_defs.create (vec_num);
5931 if (!slp_node)
5932 vect_defs.quick_push (NULL_TREE);
5934 for (j = 0; j < ncopies; j++)
5936 if (j == 0 || !single_defuse_cycle)
5938 for (i = 0; i < vec_num; i++)
5940 /* Create the reduction-phi that defines the reduction
5941 operand. */
5942 new_phi = create_phi_node (vec_dest, loop->header);
5943 set_vinfo_for_stmt (new_phi,
5944 new_stmt_vec_info (new_phi, loop_vinfo));
5945 if (j == 0 || slp_node)
5946 phis.quick_push (new_phi);
5950 if (code == COND_EXPR)
5952 gcc_assert (!slp_node);
5953 vectorizable_condition (stmt, gsi, vec_stmt,
5954 PHI_RESULT (phis[0]),
5955 reduc_index, NULL);
5956 /* Multiple types are not supported for condition. */
5957 break;
5960 /* Handle uses. */
5961 if (j == 0)
5963 op0 = ops[!reduc_index];
5964 if (op_type == ternary_op)
5966 if (reduc_index == 0)
5967 op1 = ops[2];
5968 else
5969 op1 = ops[1];
5972 if (slp_node)
5973 vect_get_vec_defs (op0, op1, stmt, &vec_oprnds0, &vec_oprnds1,
5974 slp_node, -1);
5975 else
5977 loop_vec_def0 = vect_get_vec_def_for_operand (ops[!reduc_index],
5978 stmt);
5979 vec_oprnds0.quick_push (loop_vec_def0);
5980 if (op_type == ternary_op)
5982 loop_vec_def1 = vect_get_vec_def_for_operand (op1, stmt);
5983 vec_oprnds1.quick_push (loop_vec_def1);
5987 else
5989 if (!slp_node)
5991 enum vect_def_type dt;
5992 gimple *dummy_stmt;
5994 vect_is_simple_use (ops[!reduc_index], loop_vinfo,
5995 &dummy_stmt, &dt);
5996 loop_vec_def0 = vect_get_vec_def_for_stmt_copy (dt,
5997 loop_vec_def0);
5998 vec_oprnds0[0] = loop_vec_def0;
5999 if (op_type == ternary_op)
6001 vect_is_simple_use (op1, loop_vinfo, &dummy_stmt, &dt);
6002 loop_vec_def1 = vect_get_vec_def_for_stmt_copy (dt,
6003 loop_vec_def1);
6004 vec_oprnds1[0] = loop_vec_def1;
6008 if (single_defuse_cycle)
6009 reduc_def = gimple_assign_lhs (new_stmt);
6011 STMT_VINFO_RELATED_STMT (prev_phi_info) = new_phi;
6014 FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
6016 if (slp_node)
6017 reduc_def = PHI_RESULT (phis[i]);
6018 else
6020 if (!single_defuse_cycle || j == 0)
6021 reduc_def = PHI_RESULT (new_phi);
6024 def1 = ((op_type == ternary_op)
6025 ? vec_oprnds1[i] : NULL);
6026 if (op_type == binary_op)
6028 if (reduc_index == 0)
6029 expr = build2 (code, vectype_out, reduc_def, def0);
6030 else
6031 expr = build2 (code, vectype_out, def0, reduc_def);
6033 else
6035 if (reduc_index == 0)
6036 expr = build3 (code, vectype_out, reduc_def, def0, def1);
6037 else
6039 if (reduc_index == 1)
6040 expr = build3 (code, vectype_out, def0, reduc_def, def1);
6041 else
6042 expr = build3 (code, vectype_out, def0, def1, reduc_def);
6046 new_stmt = gimple_build_assign (vec_dest, expr);
6047 new_temp = make_ssa_name (vec_dest, new_stmt);
6048 gimple_assign_set_lhs (new_stmt, new_temp);
6049 vect_finish_stmt_generation (stmt, new_stmt, gsi);
6051 if (slp_node)
6053 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
6054 vect_defs.quick_push (new_temp);
6056 else
6057 vect_defs[0] = new_temp;
6060 if (slp_node)
6061 continue;
6063 if (j == 0)
6064 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
6065 else
6066 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
6068 prev_stmt_info = vinfo_for_stmt (new_stmt);
6069 prev_phi_info = vinfo_for_stmt (new_phi);
6072 tree indx_before_incr, indx_after_incr, cond_name = NULL;
6074 /* Finalize the reduction-phi (set its arguments) and create the
6075 epilog reduction code. */
6076 if ((!single_defuse_cycle || code == COND_EXPR) && !slp_node)
6078 new_temp = gimple_assign_lhs (*vec_stmt);
6079 vect_defs[0] = new_temp;
6081 /* For cond reductions we want to create a new vector (INDEX_COND_EXPR)
6082 which is updated with the current index of the loop for every match of
6083 the original loop's cond_expr (VEC_STMT). This results in a vector
6084 containing the last time the condition passed for that vector lane.
6085 The first match will be a 1 to allow 0 to be used for non-matching
6086 indexes. If there are no matches at all then the vector will be all
6087 zeroes. */
6088 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION)
6090 int nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
6091 int k;
6093 gcc_assert (gimple_assign_rhs_code (*vec_stmt) == VEC_COND_EXPR);
6095 /* First we create a simple vector induction variable which starts
6096 with the values {1,2,3,...} (SERIES_VECT) and increments by the
6097 vector size (STEP). */
6099 /* Create a {1,2,3,...} vector. */
6100 tree *vtemp = XALLOCAVEC (tree, nunits_out);
6101 for (k = 0; k < nunits_out; ++k)
6102 vtemp[k] = build_int_cst (cr_index_scalar_type, k + 1);
6103 tree series_vect = build_vector (cr_index_vector_type, vtemp);
6105 /* Create a vector of the step value. */
6106 tree step = build_int_cst (cr_index_scalar_type, nunits_out);
6107 tree vec_step = build_vector_from_val (cr_index_vector_type, step);
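      /* For instance, with nunits_out == 4 the induction values are
         {1, 2, 3, 4} on the first vector iteration, {5, 6, 7, 8} on the
         second, and so on.  */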
6109 /* Create an induction variable. */
6110 gimple_stmt_iterator incr_gsi;
6111 bool insert_after;
6112 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
6113 create_iv (series_vect, vec_step, NULL_TREE, loop, &incr_gsi,
6114 insert_after, &indx_before_incr, &indx_after_incr);
6116 /* Next create a new phi node vector (NEW_PHI_TREE) which starts
6117 filled with zeros (VEC_ZERO). */
6119 /* Create a vector of 0s. */
6120 tree zero = build_zero_cst (cr_index_scalar_type);
6121 tree vec_zero = build_vector_from_val (cr_index_vector_type, zero);
6123 /* Create a vector phi node. */
6124 tree new_phi_tree = make_ssa_name (cr_index_vector_type);
6125 new_phi = create_phi_node (new_phi_tree, loop->header);
6126 set_vinfo_for_stmt (new_phi,
6127 new_stmt_vec_info (new_phi, loop_vinfo));
6128 add_phi_arg (new_phi, vec_zero, loop_preheader_edge (loop),
6129 UNKNOWN_LOCATION);
6131       /* Now take the condition from the loop's original cond_expr
6132 (VEC_STMT) and produce a new cond_expr (INDEX_COND_EXPR) which for
6133 every match uses values from the induction variable
6134 (INDEX_BEFORE_INCR) otherwise uses values from the phi node
6135 (NEW_PHI_TREE).
6136 Finally, we update the phi (NEW_PHI_TREE) to take the value of
6137 the new cond_expr (INDEX_COND_EXPR). */
6139 /* Turn the condition from vec_stmt into an ssa name. */
6140 gimple_stmt_iterator vec_stmt_gsi = gsi_for_stmt (*vec_stmt);
6141 tree ccompare = gimple_assign_rhs1 (*vec_stmt);
6142 tree ccompare_name = make_ssa_name (TREE_TYPE (ccompare));
6143 gimple *ccompare_stmt = gimple_build_assign (ccompare_name,
6144 ccompare);
6145 gsi_insert_before (&vec_stmt_gsi, ccompare_stmt, GSI_SAME_STMT);
6146 gimple_assign_set_rhs1 (*vec_stmt, ccompare_name);
6147 update_stmt (*vec_stmt);
6149       /* Create a conditional where the condition is taken from vec_stmt
6150          (CCOMPARE_NAME), the "then" value is the induction index
6151          (INDEX_BEFORE_INCR) and the "else" value is the phi (NEW_PHI_TREE).  */
6152 tree index_cond_expr = build3 (VEC_COND_EXPR, cr_index_vector_type,
6153 ccompare_name, indx_before_incr,
6154 new_phi_tree);
6155 cond_name = make_ssa_name (cr_index_vector_type);
6156 gimple *index_condition = gimple_build_assign (cond_name,
6157 index_cond_expr);
6158 gsi_insert_before (&incr_gsi, index_condition, GSI_SAME_STMT);
6159 stmt_vec_info index_vec_info = new_stmt_vec_info (index_condition,
6160 loop_vinfo);
6161 STMT_VINFO_VECTYPE (index_vec_info) = cr_index_vector_type;
6162 set_vinfo_for_stmt (index_condition, index_vec_info);
6164 /* Update the phi with the vec cond. */
6165 add_phi_arg (new_phi, cond_name, loop_latch_edge (loop),
6166 UNKNOWN_LOCATION);
6170 vect_create_epilog_for_reduction (vect_defs, stmt, epilog_copies,
6171 epilog_reduc_code, phis, reduc_index,
6172 double_reduc, slp_node, cond_name);
6174 return true;
6177 /* Function vect_min_worthwhile_factor.
6179 For a loop where we could vectorize the operation indicated by CODE,
6180 return the minimum vectorization factor that makes it worthwhile
6181 to use generic vectors. */
6183 vect_min_worthwhile_factor (enum tree_code code)
6185 switch (code)
6187 case PLUS_EXPR:
6188 case MINUS_EXPR:
6189 case NEGATE_EXPR:
6190 return 4;
6192 case BIT_AND_EXPR:
6193 case BIT_IOR_EXPR:
6194 case BIT_XOR_EXPR:
6195 case BIT_NOT_EXPR:
6196 return 2;
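    /* For any other code, falling back to generic (word-mode) vectors is
       assumed never to be worthwhile, so return a factor that can never
       be reached.  */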
6198 default:
6199 return INT_MAX;
6204 /* Function vectorizable_induction
6206 Check if PHI performs an induction computation that can be vectorized.
6207 If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
6208 phi to replace it, put it in VEC_STMT, and add it to the same basic block.
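   As a rough illustration, for a loop like

     for (i = 0; i < N; i++)
       a[i] = i;

   and a vectorization factor of 4, the scalar phi for the induction "i"
   is replaced by a vector phi whose initial value is {0, 1, 2, 3} and
   which is incremented by {4, 4, 4, 4} on each iteration of the
   vectorized loop.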
6209 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
6211 bool
6212 vectorizable_induction (gimple *phi,
6213 gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
6214 gimple **vec_stmt)
6216 stmt_vec_info stmt_info = vinfo_for_stmt (phi);
6217 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6218 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
6219 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6220 int nunits = TYPE_VECTOR_SUBPARTS (vectype);
6221 int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
6222 tree vec_def;
6224 gcc_assert (ncopies >= 1);
6225 /* FORNOW. These restrictions should be relaxed. */
6226 if (nested_in_vect_loop_p (loop, phi))
6228 imm_use_iterator imm_iter;
6229 use_operand_p use_p;
6230 gimple *exit_phi;
6231 edge latch_e;
6232 tree loop_arg;
6234 if (ncopies > 1)
6236 if (dump_enabled_p ())
6237 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6238 "multiple types in nested loop.\n");
6239 return false;
6242 exit_phi = NULL;
6243 latch_e = loop_latch_edge (loop->inner);
6244 loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
6245 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
6247 gimple *use_stmt = USE_STMT (use_p);
6248 if (is_gimple_debug (use_stmt))
6249 continue;
6251 if (!flow_bb_inside_loop_p (loop->inner, gimple_bb (use_stmt)))
6253 exit_phi = use_stmt;
6254 break;
6257 if (exit_phi)
6259 stmt_vec_info exit_phi_vinfo = vinfo_for_stmt (exit_phi);
6260 if (!(STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
6261 && !STMT_VINFO_LIVE_P (exit_phi_vinfo)))
6263 if (dump_enabled_p ())
6264 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6265 "inner-loop induction only used outside "
6266 "of the outer vectorized loop.\n");
6267 return false;
6272 if (!STMT_VINFO_RELEVANT_P (stmt_info))
6273 return false;
6275 /* FORNOW: SLP not supported. */
6276 if (STMT_SLP_TYPE (stmt_info))
6277 return false;
6279 gcc_assert (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def);
6281 if (gimple_code (phi) != GIMPLE_PHI)
6282 return false;
6284 if (!vec_stmt) /* transformation not required. */
6286 STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
6287 if (dump_enabled_p ())
6288 dump_printf_loc (MSG_NOTE, vect_location,
6289 "=== vectorizable_induction ===\n");
6290 vect_model_induction_cost (stmt_info, ncopies);
6291 return true;
6294 /** Transform. **/
6296 if (dump_enabled_p ())
6297 dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n");
6299 vec_def = get_initial_def_for_induction (phi);
6300 *vec_stmt = SSA_NAME_DEF_STMT (vec_def);
6301 return true;
6304 /* Function vectorizable_live_operation.
6306 STMT computes a value that is used outside the loop. Check if
6307 it can be supported. */
6309 bool
6310 vectorizable_live_operation (gimple *stmt,
6311 gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
6312 gimple **vec_stmt)
6314 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
6315 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
6316 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6317 tree op;
6318 gimple *def_stmt;
6319 ssa_op_iter iter;
6321 gcc_assert (STMT_VINFO_LIVE_P (stmt_info));
6323 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def)
6324 return false;
6326 if (!is_gimple_assign (stmt))
6328 if (gimple_call_internal_p (stmt)
6329 && gimple_call_internal_fn (stmt) == IFN_GOMP_SIMD_LANE
6330 && gimple_call_lhs (stmt)
6331 && loop->simduid
6332 && TREE_CODE (gimple_call_arg (stmt, 0)) == SSA_NAME
6333 && loop->simduid
6334 == SSA_NAME_VAR (gimple_call_arg (stmt, 0)))
6336 edge e = single_exit (loop);
6337 basic_block merge_bb = e->dest;
6338 imm_use_iterator imm_iter;
6339 use_operand_p use_p;
6340 tree lhs = gimple_call_lhs (stmt);
6342 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
6344 gimple *use_stmt = USE_STMT (use_p);
6345 if (gimple_code (use_stmt) == GIMPLE_PHI
6346 && gimple_bb (use_stmt) == merge_bb)
6348 if (vec_stmt)
6350 tree vfm1
6351 = build_int_cst (unsigned_type_node,
6352 loop_vinfo->vectorization_factor - 1);
6353 SET_PHI_ARG_DEF (use_stmt, e->dest_idx, vfm1);
6355 return true;
6360 return false;
6363 if (TREE_CODE (gimple_assign_lhs (stmt)) != SSA_NAME)
6364 return false;
6366 /* FORNOW. CHECKME. */
6367 if (nested_in_vect_loop_p (loop, stmt))
6368 return false;
6370 /* FORNOW: support only if all uses are invariant. This means
6371 that the scalar operations can remain in place, unvectorized.
6372 The original last scalar value that they compute will be used. */
6373 FOR_EACH_SSA_TREE_OPERAND (op, stmt, iter, SSA_OP_USE)
6375 enum vect_def_type dt = vect_uninitialized_def;
6377 if (!vect_is_simple_use (op, loop_vinfo, &def_stmt, &dt))
6379 if (dump_enabled_p ())
6380 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6381 "use not simple.\n");
6382 return false;
6385 if (dt != vect_external_def && dt != vect_constant_def)
6386 return false;
6389 /* No transformation is required for the cases we currently support. */
6390 return true;
6393 /* Kill any debug uses outside LOOP of SSA names defined in STMT. */
6395 static void
6396 vect_loop_kill_debug_uses (struct loop *loop, gimple *stmt)
6398 ssa_op_iter op_iter;
6399 imm_use_iterator imm_iter;
6400 def_operand_p def_p;
6401 gimple *ustmt;
6403 FOR_EACH_PHI_OR_STMT_DEF (def_p, stmt, op_iter, SSA_OP_DEF)
6405 FOR_EACH_IMM_USE_STMT (ustmt, imm_iter, DEF_FROM_PTR (def_p))
6407 basic_block bb;
6409 if (!is_gimple_debug (ustmt))
6410 continue;
6412 bb = gimple_bb (ustmt);
6414 if (!flow_bb_inside_loop_p (loop, bb))
6416 if (gimple_debug_bind_p (ustmt))
6418 if (dump_enabled_p ())
6419 dump_printf_loc (MSG_NOTE, vect_location,
6420 "killing debug use\n");
6422 gimple_debug_bind_reset_value (ustmt);
6423 update_stmt (ustmt);
6425 else
6426 gcc_unreachable ();
6433 /* This function builds ni_name = number of iterations. Statements
6434 are emitted on the loop preheader edge. */
6436 static tree
6437 vect_build_loop_niters (loop_vec_info loop_vinfo)
6439 tree ni = unshare_expr (LOOP_VINFO_NITERS (loop_vinfo));
6440 if (TREE_CODE (ni) == INTEGER_CST)
6441 return ni;
6442 else
6444 tree ni_name, var;
6445 gimple_seq stmts = NULL;
6446 edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
6448 var = create_tmp_var (TREE_TYPE (ni), "niters");
6449 ni_name = force_gimple_operand (ni, &stmts, false, var);
6450 if (stmts)
6451 gsi_insert_seq_on_edge_immediate (pe, stmts);
6453 return ni_name;
6458 /* This function generates the following statements:
6460    ni_name = number of iterations the loop executes
6461 ratio = ni_name / vf
6462 ratio_mult_vf_name = ratio * vf
6464 and places them on the loop preheader edge. */
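/* For example (a rough sketch that ignores the PEELING_FOR_GAPS
   adjustment): with ni_name == 25 and vf == 8 this computes ratio == 3
   and ratio_mult_vf_name == 24, leaving one iteration for the scalar
   epilogue loop.  */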
6466 static void
6467 vect_generate_tmps_on_preheader (loop_vec_info loop_vinfo,
6468 tree ni_name,
6469 tree *ratio_mult_vf_name_ptr,
6470 tree *ratio_name_ptr)
6472 tree ni_minus_gap_name;
6473 tree var;
6474 tree ratio_name;
6475 tree ratio_mult_vf_name;
6476 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
6477 edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
6478 tree log_vf;
6480 log_vf = build_int_cst (TREE_TYPE (ni_name), exact_log2 (vf));
6482 /* If epilogue loop is required because of data accesses with gaps, we
6483 subtract one iteration from the total number of iterations here for
6484 correct calculation of RATIO. */
6485 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
6487 ni_minus_gap_name = fold_build2 (MINUS_EXPR, TREE_TYPE (ni_name),
6488 ni_name,
6489 build_one_cst (TREE_TYPE (ni_name)));
6490 if (!is_gimple_val (ni_minus_gap_name))
6492 var = create_tmp_var (TREE_TYPE (ni_name), "ni_gap");
6493 gimple *stmts = NULL;
6494 ni_minus_gap_name = force_gimple_operand (ni_minus_gap_name, &stmts,
6495 true, var);
6496 gsi_insert_seq_on_edge_immediate (pe, stmts);
6499 else
6500 ni_minus_gap_name = ni_name;
6502 /* Create: ratio = ni >> log2(vf) */
6503 /* ??? As we have ni == number of latch executions + 1, ni could
6504 have overflown to zero. So avoid computing ratio based on ni
6505 but compute it using the fact that we know ratio will be at least
6506 one, thus via (ni - vf) >> log2(vf) + 1. */
6507 ratio_name
6508 = fold_build2 (PLUS_EXPR, TREE_TYPE (ni_name),
6509 fold_build2 (RSHIFT_EXPR, TREE_TYPE (ni_name),
6510 fold_build2 (MINUS_EXPR, TREE_TYPE (ni_name),
6511 ni_minus_gap_name,
6512 build_int_cst
6513 (TREE_TYPE (ni_name), vf)),
6514 log_vf),
6515 build_int_cst (TREE_TYPE (ni_name), 1));
6516 if (!is_gimple_val (ratio_name))
6518 var = create_tmp_var (TREE_TYPE (ni_name), "bnd");
6519 gimple *stmts = NULL;
6520 ratio_name = force_gimple_operand (ratio_name, &stmts, true, var);
6521 gsi_insert_seq_on_edge_immediate (pe, stmts);
6523 *ratio_name_ptr = ratio_name;
6525 /* Create: ratio_mult_vf = ratio << log2 (vf). */
6527 if (ratio_mult_vf_name_ptr)
6529 ratio_mult_vf_name = fold_build2 (LSHIFT_EXPR, TREE_TYPE (ratio_name),
6530 ratio_name, log_vf);
6531 if (!is_gimple_val (ratio_mult_vf_name))
6533 var = create_tmp_var (TREE_TYPE (ni_name), "ratio_mult_vf");
6534 gimple *stmts = NULL;
6535 ratio_mult_vf_name = force_gimple_operand (ratio_mult_vf_name, &stmts,
6536 true, var);
6537 gsi_insert_seq_on_edge_immediate (pe, stmts);
6539 *ratio_mult_vf_name_ptr = ratio_mult_vf_name;
6542 return;
6546 /* Function vect_transform_loop.
6548 The analysis phase has determined that the loop is vectorizable.
6549    Vectorize the loop - create vectorized stmts to replace the scalar
6550 stmts in the loop, and update the loop exit condition. */
6552 void
6553 vect_transform_loop (loop_vec_info loop_vinfo)
6555 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6556 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
6557 int nbbs = loop->num_nodes;
6558 int i;
6559 tree ratio = NULL;
6560 int vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
6561 bool grouped_store;
6562 bool slp_scheduled = false;
6563 gimple *stmt, *pattern_stmt;
6564 gimple_seq pattern_def_seq = NULL;
6565 gimple_stmt_iterator pattern_def_si = gsi_none ();
6566 bool transform_pattern_stmt = false;
6567 bool check_profitability = false;
6568 int th;
6569 /* Record number of iterations before we started tampering with the profile. */
6570 gcov_type expected_iterations = expected_loop_iterations_unbounded (loop);
6572 if (dump_enabled_p ())
6573 dump_printf_loc (MSG_NOTE, vect_location, "=== vec_transform_loop ===\n");
6575   /* If the profile is imprecise, we have a chance to fix it up.  */
6576 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
6577 expected_iterations = LOOP_VINFO_INT_NITERS (loop_vinfo);
6579 /* Use the more conservative vectorization threshold. If the number
6580 of iterations is constant assume the cost check has been performed
6581 by our caller. If the threshold makes all loops profitable that
6582 run at least the vectorization factor number of times checking
6583 is pointless, too. */
6584 th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
6585 if (th >= LOOP_VINFO_VECT_FACTOR (loop_vinfo) - 1
6586 && !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
6588 if (dump_enabled_p ())
6589 dump_printf_loc (MSG_NOTE, vect_location,
6590 "Profitability threshold is %d loop iterations.\n",
6591 th);
6592 check_profitability = true;
6595 /* Version the loop first, if required, so the profitability check
6596 comes first. */
6598 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
6599 || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
6601 vect_loop_versioning (loop_vinfo, th, check_profitability);
6602 check_profitability = false;
6605 tree ni_name = vect_build_loop_niters (loop_vinfo);
6606 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = ni_name;
6608 /* Peel the loop if there are data refs with unknown alignment.
6609 Only one data ref with unknown store is allowed. */
6611 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo))
6613 vect_do_peeling_for_alignment (loop_vinfo, ni_name,
6614 th, check_profitability);
6615 check_profitability = false;
6616 /* The above adjusts LOOP_VINFO_NITERS, so cause ni_name to
6617 be re-computed. */
6618 ni_name = NULL_TREE;
6621 /* If the loop has a symbolic number of iterations 'n' (i.e. it's not a
6622      compile time constant), or it is a constant that is not divisible by the
6623 vectorization factor, then an epilog loop needs to be created.
6624 We therefore duplicate the loop: the original loop will be vectorized,
6625 and will compute the first (n/VF) iterations. The second copy of the loop
6626 will remain scalar and will compute the remaining (n%VF) iterations.
6627 (VF is the vectorization factor). */
6629 if (LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
6630 || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
6632 tree ratio_mult_vf;
6633 if (!ni_name)
6634 ni_name = vect_build_loop_niters (loop_vinfo);
6635 vect_generate_tmps_on_preheader (loop_vinfo, ni_name, &ratio_mult_vf,
6636 &ratio);
6637 vect_do_peeling_for_loop_bound (loop_vinfo, ni_name, ratio_mult_vf,
6638 th, check_profitability);
6640 else if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
6641 ratio = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
6642 LOOP_VINFO_INT_NITERS (loop_vinfo) / vectorization_factor);
6643 else
6645 if (!ni_name)
6646 ni_name = vect_build_loop_niters (loop_vinfo);
6647 vect_generate_tmps_on_preheader (loop_vinfo, ni_name, NULL, &ratio);
6650 /* 1) Make sure the loop header has exactly two entries
6651 2) Make sure we have a preheader basic block. */
6653 gcc_assert (EDGE_COUNT (loop->header->preds) == 2);
6655 split_edge (loop_preheader_edge (loop));
6657   /* FORNOW: the vectorizer supports only loops whose body consists
6658      of one basic block (header + empty latch).  When the vectorizer
6659      supports more involved loop forms, the order in which the BBs are
6660      traversed will need to be reconsidered.  */
6662 for (i = 0; i < nbbs; i++)
6664 basic_block bb = bbs[i];
6665 stmt_vec_info stmt_info;
6667 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
6668 gsi_next (&si))
6670 gphi *phi = si.phi ();
6671 if (dump_enabled_p ())
6673 dump_printf_loc (MSG_NOTE, vect_location,
6674 "------>vectorizing phi: ");
6675 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
6676 dump_printf (MSG_NOTE, "\n");
6678 stmt_info = vinfo_for_stmt (phi);
6679 if (!stmt_info)
6680 continue;
6682 if (MAY_HAVE_DEBUG_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
6683 vect_loop_kill_debug_uses (loop, phi);
6685 if (!STMT_VINFO_RELEVANT_P (stmt_info)
6686 && !STMT_VINFO_LIVE_P (stmt_info))
6687 continue;
6689 if (STMT_VINFO_VECTYPE (stmt_info)
6690 && (TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info))
6691 != (unsigned HOST_WIDE_INT) vectorization_factor)
6692 && dump_enabled_p ())
6693 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
6695 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def)
6697 if (dump_enabled_p ())
6698 dump_printf_loc (MSG_NOTE, vect_location, "transform phi.\n");
6699 vect_transform_stmt (phi, NULL, NULL, NULL, NULL);
6703 pattern_stmt = NULL;
6704 for (gimple_stmt_iterator si = gsi_start_bb (bb);
6705 !gsi_end_p (si) || transform_pattern_stmt;)
6707 bool is_store;
6709 if (transform_pattern_stmt)
6710 stmt = pattern_stmt;
6711 else
6713 stmt = gsi_stmt (si);
6714 /* During vectorization remove existing clobber stmts. */
6715 if (gimple_clobber_p (stmt))
6717 unlink_stmt_vdef (stmt);
6718 gsi_remove (&si, true);
6719 release_defs (stmt);
6720 continue;
6724 if (dump_enabled_p ())
6726 dump_printf_loc (MSG_NOTE, vect_location,
6727 "------>vectorizing statement: ");
6728 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
6729 dump_printf (MSG_NOTE, "\n");
6732 stmt_info = vinfo_for_stmt (stmt);
6734 /* vector stmts created in the outer-loop during vectorization of
6735 stmts in an inner-loop may not have a stmt_info, and do not
6736 need to be vectorized. */
6737 if (!stmt_info)
6739 gsi_next (&si);
6740 continue;
6743 if (MAY_HAVE_DEBUG_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
6744 vect_loop_kill_debug_uses (loop, stmt);
6746 if (!STMT_VINFO_RELEVANT_P (stmt_info)
6747 && !STMT_VINFO_LIVE_P (stmt_info))
6749 if (STMT_VINFO_IN_PATTERN_P (stmt_info)
6750 && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
6751 && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
6752 || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
6754 stmt = pattern_stmt;
6755 stmt_info = vinfo_for_stmt (stmt);
6757 else
6759 gsi_next (&si);
6760 continue;
6763 else if (STMT_VINFO_IN_PATTERN_P (stmt_info)
6764 && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
6765 && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
6766 || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
6767 transform_pattern_stmt = true;
6769 /* If pattern statement has def stmts, vectorize them too. */
6770 if (is_pattern_stmt_p (stmt_info))
6772 if (pattern_def_seq == NULL)
6774 pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
6775 pattern_def_si = gsi_start (pattern_def_seq);
6777 else if (!gsi_end_p (pattern_def_si))
6778 gsi_next (&pattern_def_si);
6779 if (pattern_def_seq != NULL)
6781 gimple *pattern_def_stmt = NULL;
6782 stmt_vec_info pattern_def_stmt_info = NULL;
6784 while (!gsi_end_p (pattern_def_si))
6786 pattern_def_stmt = gsi_stmt (pattern_def_si);
6787 pattern_def_stmt_info
6788 = vinfo_for_stmt (pattern_def_stmt);
6789 if (STMT_VINFO_RELEVANT_P (pattern_def_stmt_info)
6790 || STMT_VINFO_LIVE_P (pattern_def_stmt_info))
6791 break;
6792 gsi_next (&pattern_def_si);
6795 if (!gsi_end_p (pattern_def_si))
6797 if (dump_enabled_p ())
6799 dump_printf_loc (MSG_NOTE, vect_location,
6800 "==> vectorizing pattern def "
6801 "stmt: ");
6802 dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
6803 pattern_def_stmt, 0);
6804 dump_printf (MSG_NOTE, "\n");
6807 stmt = pattern_def_stmt;
6808 stmt_info = pattern_def_stmt_info;
6810 else
6812 pattern_def_si = gsi_none ();
6813 transform_pattern_stmt = false;
6816 else
6817 transform_pattern_stmt = false;
6820 if (STMT_VINFO_VECTYPE (stmt_info))
6822 unsigned int nunits
6823 = (unsigned int)
6824 TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
6825 if (!STMT_SLP_TYPE (stmt_info)
6826 && nunits != (unsigned int) vectorization_factor
6827 && dump_enabled_p ())
6828                /* For SLP, VF is set according to the unrolling factor, not
6829                   to the vector size, hence for SLP this message is not valid.  */
6830 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
6833 /* SLP. Schedule all the SLP instances when the first SLP stmt is
6834 reached. */
6835 if (STMT_SLP_TYPE (stmt_info))
6837 if (!slp_scheduled)
6839 slp_scheduled = true;
6841 if (dump_enabled_p ())
6842 dump_printf_loc (MSG_NOTE, vect_location,
6843 "=== scheduling SLP instances ===\n");
6845 vect_schedule_slp (loop_vinfo);
6848 /* Hybrid SLP stmts must be vectorized in addition to SLP. */
6849 if (!vinfo_for_stmt (stmt) || PURE_SLP_STMT (stmt_info))
6851 if (!transform_pattern_stmt && gsi_end_p (pattern_def_si))
6853 pattern_def_seq = NULL;
6854 gsi_next (&si);
6856 continue;
6860 /* -------- vectorize statement ------------ */
6861 if (dump_enabled_p ())
6862 dump_printf_loc (MSG_NOTE, vect_location, "transform statement.\n");
6864 grouped_store = false;
6865 is_store = vect_transform_stmt (stmt, &si, &grouped_store, NULL, NULL);
6866 if (is_store)
6868 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
6870 /* Interleaving. If IS_STORE is TRUE, the vectorization of the
6871 interleaving chain was completed - free all the stores in
6872 the chain. */
6873 gsi_next (&si);
6874 vect_remove_stores (GROUP_FIRST_ELEMENT (stmt_info));
6876 else
6878 /* Free the attached stmt_vec_info and remove the stmt. */
6879 gimple *store = gsi_stmt (si);
6880 free_stmt_vec_info (store);
6881 unlink_stmt_vdef (store);
6882 gsi_remove (&si, true);
6883 release_defs (store);
6886 /* Stores can only appear at the end of pattern statements. */
6887 gcc_assert (!transform_pattern_stmt);
6888 pattern_def_seq = NULL;
6890 else if (!transform_pattern_stmt && gsi_end_p (pattern_def_si))
6892 pattern_def_seq = NULL;
6893 gsi_next (&si);
6895 } /* stmts in BB */
6896 } /* BBs in loop */
6898 slpeel_make_loop_iterate_ntimes (loop, ratio);
6900 /* Reduce loop iterations by the vectorization factor. */
6901 scale_loop_profile (loop, GCOV_COMPUTE_SCALE (1, vectorization_factor),
6902 expected_iterations / vectorization_factor);
6903 loop->nb_iterations_upper_bound
6904 = wi::udiv_floor (loop->nb_iterations_upper_bound, vectorization_factor);
6905 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
6906 && loop->nb_iterations_upper_bound != 0)
6907 loop->nb_iterations_upper_bound = loop->nb_iterations_upper_bound - 1;
6908 if (loop->any_estimate)
6910 loop->nb_iterations_estimate
6911 = wi::udiv_floor (loop->nb_iterations_estimate, vectorization_factor);
6912 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
6913 && loop->nb_iterations_estimate != 0)
6914 loop->nb_iterations_estimate = loop->nb_iterations_estimate - 1;
6917 if (dump_enabled_p ())
6919 dump_printf_loc (MSG_NOTE, vect_location,
6920 "LOOP VECTORIZED\n");
6921 if (loop->inner)
6922 dump_printf_loc (MSG_NOTE, vect_location,
6923 "OUTER LOOP VECTORIZED\n");
6924 dump_printf (MSG_NOTE, "\n");