re PR bootstrap/77993 (bootstrap failure on PowerPC/Linux)
[official-gcc.git] / gcc / tree-vect-loop.c
blob1cd9c7221b0c77bb22648661a18bd99de88c27ce
1 /* Loop Vectorization
2 Copyright (C) 2003-2016 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com> and
4 Ira Rosen <irar@il.ibm.com>
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "rtl.h"
28 #include "tree.h"
29 #include "gimple.h"
30 #include "cfghooks.h"
31 #include "tree-pass.h"
32 #include "ssa.h"
33 #include "optabs-tree.h"
34 #include "diagnostic-core.h"
35 #include "fold-const.h"
36 #include "stor-layout.h"
37 #include "cfganal.h"
38 #include "gimplify.h"
39 #include "gimple-iterator.h"
40 #include "gimplify-me.h"
41 #include "tree-ssa-loop-ivopts.h"
42 #include "tree-ssa-loop-manip.h"
43 #include "tree-ssa-loop-niter.h"
44 #include "tree-ssa-loop.h"
45 #include "cfgloop.h"
46 #include "params.h"
47 #include "tree-scalar-evolution.h"
48 #include "tree-vectorizer.h"
49 #include "gimple-fold.h"
50 #include "cgraph.h"
51 #include "tree-cfg.h"
53 /* Loop Vectorization Pass.
55 This pass tries to vectorize loops.
57 For example, the vectorizer transforms the following simple loop:
59 short a[N]; short b[N]; short c[N]; int i;
61 for (i=0; i<N; i++){
62 a[i] = b[i] + c[i];
65 as if it was manually vectorized by rewriting the source code into:
67 typedef int __attribute__((mode(V8HI))) v8hi;
68 short a[N]; short b[N]; short c[N]; int i;
69 v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
70 v8hi va, vb, vc;
72 for (i=0; i<N/8; i++){
73 vb = pb[i];
74 vc = pc[i];
75 va = vb + vc;
76 pa[i] = va;
79 The main entry to this pass is vectorize_loops(), in which
80 the vectorizer applies a set of analyses on a given set of loops,
81 followed by the actual vectorization transformation for the loops that
82 had successfully passed the analysis phase.
83 Throughout this pass we make a distinction between two types of
84 data: scalars (which are represented by SSA_NAMES), and memory references
85 ("data-refs"). These two types of data require different handling both
86 during analysis and transformation. The types of data-refs that the
87 vectorizer currently supports are ARRAY_REFS which base is an array DECL
88 (not a pointer), and INDIRECT_REFS through pointers; both array and pointer
89 accesses are required to have a simple (consecutive) access pattern.
91 Analysis phase:
92 ===============
93 The driver for the analysis phase is vect_analyze_loop().
94 It applies a set of analyses, some of which rely on the scalar evolution
95 analyzer (scev) developed by Sebastian Pop.
97 During the analysis phase the vectorizer records some information
98 per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
99 loop, as well as general information about the loop as a whole, which is
100 recorded in a "loop_vec_info" struct attached to each loop.
102 Transformation phase:
103 =====================
104 The loop transformation phase scans all the stmts in the loop, and
105 creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
106 the loop that needs to be vectorized. It inserts the vector code sequence
107 just before the scalar stmt S, and records a pointer to the vector code
108 in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
109 attached to S). This pointer will be used for the vectorization of following
110 stmts which use the def of stmt S. Stmt S is removed if it writes to memory;
111 otherwise, we rely on dead code elimination for removing it.
113 For example, say stmt S1 was vectorized into stmt VS1:
115 VS1: vb = px[i];
116 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
117 S2: a = b;
119 To vectorize stmt S2, the vectorizer first finds the stmt that defines
120 the operand 'b' (S1), and gets the relevant vector def 'vb' from the
121 vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)). The
122 resulting sequence would be:
124 VS1: vb = px[i];
125 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
126 VS2: va = vb;
127 S2: a = b; STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2
129 Operands that are not SSA_NAMEs, are data-refs that appear in
130 load/store operations (like 'x[i]' in S1), and are handled differently.
132 Target modeling:
133 =================
134 Currently the only target specific information that is used is the
135 size of the vector (in bytes) - "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".
136 Targets that can support different sizes of vectors, for now will need
137 to specify one value for "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD". More
138 flexibility will be added in the future.
140 Since we only vectorize operations which vector form can be
141 expressed using existing tree codes, to verify that an operation is
142 supported, the vectorizer checks the relevant optab at the relevant
143 machine_mode (e.g, optab_handler (add_optab, V8HImode)). If
144 the value found is CODE_FOR_nothing, then there's no target support, and
145 we can't vectorize the stmt.
147 For additional information on this project see:
148 http://gcc.gnu.org/projects/tree-ssa/vectorization.html
151 static void vect_estimate_min_profitable_iters (loop_vec_info, int *, int *);
153 /* Function vect_determine_vectorization_factor
155 Determine the vectorization factor (VF). VF is the number of data elements
156 that are operated upon in parallel in a single iteration of the vectorized
157 loop. For example, when vectorizing a loop that operates on 4byte elements,
158 on a target with vector size (VS) 16byte, the VF is set to 4, since 4
159 elements can fit in a single vector register.
161 We currently support vectorization of loops in which all types operated upon
162 are of the same size. Therefore this function currently sets VF according to
163 the size of the types operated upon, and fails if there are multiple sizes
164 in the loop.
166 VF is also the factor by which the loop iterations are strip-mined, e.g.:
167 original loop:
168 for (i=0; i<N; i++){
169 a[i] = b[i] + c[i];
172 vectorized loop:
173 for (i=0; i<N; i+=VF){
174 a[i:VF] = b[i:VF] + c[i:VF];
178 static bool
179 vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
181 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
182 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
183 unsigned nbbs = loop->num_nodes;
184 unsigned int vectorization_factor = 0;
185 tree scalar_type;
186 gphi *phi;
187 tree vectype;
188 unsigned int nunits;
189 stmt_vec_info stmt_info;
190 unsigned i;
191 HOST_WIDE_INT dummy;
192 gimple *stmt, *pattern_stmt = NULL;
193 gimple_seq pattern_def_seq = NULL;
194 gimple_stmt_iterator pattern_def_si = gsi_none ();
195 bool analyze_pattern_stmt = false;
196 bool bool_result;
197 auto_vec<stmt_vec_info> mask_producers;
199 if (dump_enabled_p ())
200 dump_printf_loc (MSG_NOTE, vect_location,
201 "=== vect_determine_vectorization_factor ===\n");
203 for (i = 0; i < nbbs; i++)
205 basic_block bb = bbs[i];
207 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
208 gsi_next (&si))
210 phi = si.phi ();
211 stmt_info = vinfo_for_stmt (phi);
212 if (dump_enabled_p ())
214 dump_printf_loc (MSG_NOTE, vect_location, "==> examining phi: ");
215 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
218 gcc_assert (stmt_info);
220 if (STMT_VINFO_RELEVANT_P (stmt_info)
221 || STMT_VINFO_LIVE_P (stmt_info))
223 gcc_assert (!STMT_VINFO_VECTYPE (stmt_info));
224 scalar_type = TREE_TYPE (PHI_RESULT (phi));
226 if (dump_enabled_p ())
228 dump_printf_loc (MSG_NOTE, vect_location,
229 "get vectype for scalar type: ");
230 dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
231 dump_printf (MSG_NOTE, "\n");
234 vectype = get_vectype_for_scalar_type (scalar_type);
235 if (!vectype)
237 if (dump_enabled_p ())
239 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
240 "not vectorized: unsupported "
241 "data-type ");
242 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
243 scalar_type);
244 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
246 return false;
248 STMT_VINFO_VECTYPE (stmt_info) = vectype;
250 if (dump_enabled_p ())
252 dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
253 dump_generic_expr (MSG_NOTE, TDF_SLIM, vectype);
254 dump_printf (MSG_NOTE, "\n");
257 nunits = TYPE_VECTOR_SUBPARTS (vectype);
258 if (dump_enabled_p ())
259 dump_printf_loc (MSG_NOTE, vect_location, "nunits = %d\n",
260 nunits);
262 if (!vectorization_factor
263 || (nunits > vectorization_factor))
264 vectorization_factor = nunits;
268 for (gimple_stmt_iterator si = gsi_start_bb (bb);
269 !gsi_end_p (si) || analyze_pattern_stmt;)
271 tree vf_vectype;
273 if (analyze_pattern_stmt)
274 stmt = pattern_stmt;
275 else
276 stmt = gsi_stmt (si);
278 stmt_info = vinfo_for_stmt (stmt);
280 if (dump_enabled_p ())
282 dump_printf_loc (MSG_NOTE, vect_location,
283 "==> examining statement: ");
284 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
287 gcc_assert (stmt_info);
289 /* Skip stmts which do not need to be vectorized. */
290 if ((!STMT_VINFO_RELEVANT_P (stmt_info)
291 && !STMT_VINFO_LIVE_P (stmt_info))
292 || gimple_clobber_p (stmt))
294 if (STMT_VINFO_IN_PATTERN_P (stmt_info)
295 && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
296 && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
297 || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
299 stmt = pattern_stmt;
300 stmt_info = vinfo_for_stmt (pattern_stmt);
301 if (dump_enabled_p ())
303 dump_printf_loc (MSG_NOTE, vect_location,
304 "==> examining pattern statement: ");
305 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
308 else
310 if (dump_enabled_p ())
311 dump_printf_loc (MSG_NOTE, vect_location, "skip.\n");
312 gsi_next (&si);
313 continue;
316 else if (STMT_VINFO_IN_PATTERN_P (stmt_info)
317 && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
318 && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
319 || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
320 analyze_pattern_stmt = true;
322 /* If a pattern statement has def stmts, analyze them too. */
323 if (is_pattern_stmt_p (stmt_info))
325 if (pattern_def_seq == NULL)
327 pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
328 pattern_def_si = gsi_start (pattern_def_seq);
330 else if (!gsi_end_p (pattern_def_si))
331 gsi_next (&pattern_def_si);
332 if (pattern_def_seq != NULL)
334 gimple *pattern_def_stmt = NULL;
335 stmt_vec_info pattern_def_stmt_info = NULL;
337 while (!gsi_end_p (pattern_def_si))
339 pattern_def_stmt = gsi_stmt (pattern_def_si);
340 pattern_def_stmt_info
341 = vinfo_for_stmt (pattern_def_stmt);
342 if (STMT_VINFO_RELEVANT_P (pattern_def_stmt_info)
343 || STMT_VINFO_LIVE_P (pattern_def_stmt_info))
344 break;
345 gsi_next (&pattern_def_si);
348 if (!gsi_end_p (pattern_def_si))
350 if (dump_enabled_p ())
352 dump_printf_loc (MSG_NOTE, vect_location,
353 "==> examining pattern def stmt: ");
354 dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
355 pattern_def_stmt, 0);
358 stmt = pattern_def_stmt;
359 stmt_info = pattern_def_stmt_info;
361 else
363 pattern_def_si = gsi_none ();
364 analyze_pattern_stmt = false;
367 else
368 analyze_pattern_stmt = false;
371 if (gimple_get_lhs (stmt) == NULL_TREE
372 /* MASK_STORE has no lhs, but is ok. */
373 && (!is_gimple_call (stmt)
374 || !gimple_call_internal_p (stmt)
375 || gimple_call_internal_fn (stmt) != IFN_MASK_STORE))
377 if (is_gimple_call (stmt))
379 /* Ignore calls with no lhs. These must be calls to
380 #pragma omp simd functions, and what vectorization factor
381 it really needs can't be determined until
382 vectorizable_simd_clone_call. */
383 if (!analyze_pattern_stmt && gsi_end_p (pattern_def_si))
385 pattern_def_seq = NULL;
386 gsi_next (&si);
388 continue;
390 if (dump_enabled_p ())
392 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
393 "not vectorized: irregular stmt.");
394 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt,
397 return false;
400 if (VECTOR_MODE_P (TYPE_MODE (gimple_expr_type (stmt))))
402 if (dump_enabled_p ())
404 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
405 "not vectorized: vector stmt in loop:");
406 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
408 return false;
411 bool_result = false;
413 if (STMT_VINFO_VECTYPE (stmt_info))
415 /* The only case when a vectype had been already set is for stmts
416 that contain a dataref, or for "pattern-stmts" (stmts
417 generated by the vectorizer to represent/replace a certain
418 idiom). */
419 gcc_assert (STMT_VINFO_DATA_REF (stmt_info)
420 || is_pattern_stmt_p (stmt_info)
421 || !gsi_end_p (pattern_def_si));
422 vectype = STMT_VINFO_VECTYPE (stmt_info);
424 else
426 gcc_assert (!STMT_VINFO_DATA_REF (stmt_info));
427 if (gimple_call_internal_p (stmt, IFN_MASK_STORE))
428 scalar_type = TREE_TYPE (gimple_call_arg (stmt, 3));
429 else
430 scalar_type = TREE_TYPE (gimple_get_lhs (stmt));
432 /* Bool ops don't participate in vectorization factor
433 computation. For comparison use compared types to
434 compute a factor. */
435 if (TREE_CODE (scalar_type) == BOOLEAN_TYPE
436 && is_gimple_assign (stmt)
437 && gimple_assign_rhs_code (stmt) != COND_EXPR)
439 if (STMT_VINFO_RELEVANT_P (stmt_info)
440 || STMT_VINFO_LIVE_P (stmt_info))
441 mask_producers.safe_push (stmt_info);
442 bool_result = true;
444 if (gimple_code (stmt) == GIMPLE_ASSIGN
445 && TREE_CODE_CLASS (gimple_assign_rhs_code (stmt))
446 == tcc_comparison
447 && TREE_CODE (TREE_TYPE (gimple_assign_rhs1 (stmt)))
448 != BOOLEAN_TYPE)
449 scalar_type = TREE_TYPE (gimple_assign_rhs1 (stmt));
450 else
452 if (!analyze_pattern_stmt && gsi_end_p (pattern_def_si))
454 pattern_def_seq = NULL;
455 gsi_next (&si);
457 continue;
461 if (dump_enabled_p ())
463 dump_printf_loc (MSG_NOTE, vect_location,
464 "get vectype for scalar type: ");
465 dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
466 dump_printf (MSG_NOTE, "\n");
468 vectype = get_vectype_for_scalar_type (scalar_type);
469 if (!vectype)
471 if (dump_enabled_p ())
473 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
474 "not vectorized: unsupported "
475 "data-type ");
476 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
477 scalar_type);
478 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
480 return false;
483 if (!bool_result)
484 STMT_VINFO_VECTYPE (stmt_info) = vectype;
486 if (dump_enabled_p ())
488 dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
489 dump_generic_expr (MSG_NOTE, TDF_SLIM, vectype);
490 dump_printf (MSG_NOTE, "\n");
494 /* Don't try to compute VF out scalar types if we stmt
495 produces boolean vector. Use result vectype instead. */
496 if (VECTOR_BOOLEAN_TYPE_P (vectype))
497 vf_vectype = vectype;
498 else
500 /* The vectorization factor is according to the smallest
501 scalar type (or the largest vector size, but we only
502 support one vector size per loop). */
503 if (!bool_result)
504 scalar_type = vect_get_smallest_scalar_type (stmt, &dummy,
505 &dummy);
506 if (dump_enabled_p ())
508 dump_printf_loc (MSG_NOTE, vect_location,
509 "get vectype for scalar type: ");
510 dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
511 dump_printf (MSG_NOTE, "\n");
513 vf_vectype = get_vectype_for_scalar_type (scalar_type);
515 if (!vf_vectype)
517 if (dump_enabled_p ())
519 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
520 "not vectorized: unsupported data-type ");
521 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
522 scalar_type);
523 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
525 return false;
528 if ((GET_MODE_SIZE (TYPE_MODE (vectype))
529 != GET_MODE_SIZE (TYPE_MODE (vf_vectype))))
531 if (dump_enabled_p ())
533 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
534 "not vectorized: different sized vector "
535 "types in statement, ");
536 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
537 vectype);
538 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
539 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
540 vf_vectype);
541 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
543 return false;
546 if (dump_enabled_p ())
548 dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
549 dump_generic_expr (MSG_NOTE, TDF_SLIM, vf_vectype);
550 dump_printf (MSG_NOTE, "\n");
553 nunits = TYPE_VECTOR_SUBPARTS (vf_vectype);
554 if (dump_enabled_p ())
555 dump_printf_loc (MSG_NOTE, vect_location, "nunits = %d\n", nunits);
556 if (!vectorization_factor
557 || (nunits > vectorization_factor))
558 vectorization_factor = nunits;
560 if (!analyze_pattern_stmt && gsi_end_p (pattern_def_si))
562 pattern_def_seq = NULL;
563 gsi_next (&si);
568 /* TODO: Analyze cost. Decide if worth while to vectorize. */
569 if (dump_enabled_p ())
570 dump_printf_loc (MSG_NOTE, vect_location, "vectorization factor = %d\n",
571 vectorization_factor);
572 if (vectorization_factor <= 1)
574 if (dump_enabled_p ())
575 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
576 "not vectorized: unsupported data-type\n");
577 return false;
579 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
581 for (i = 0; i < mask_producers.length (); i++)
583 tree mask_type = NULL;
585 stmt = STMT_VINFO_STMT (mask_producers[i]);
587 if (gimple_code (stmt) == GIMPLE_ASSIGN
588 && TREE_CODE_CLASS (gimple_assign_rhs_code (stmt)) == tcc_comparison
589 && TREE_CODE (TREE_TYPE (gimple_assign_rhs1 (stmt))) != BOOLEAN_TYPE)
591 scalar_type = TREE_TYPE (gimple_assign_rhs1 (stmt));
592 mask_type = get_mask_type_for_scalar_type (scalar_type);
594 if (!mask_type)
596 if (dump_enabled_p ())
597 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
598 "not vectorized: unsupported mask\n");
599 return false;
602 else
604 tree rhs;
605 ssa_op_iter iter;
606 gimple *def_stmt;
607 enum vect_def_type dt;
609 FOR_EACH_SSA_TREE_OPERAND (rhs, stmt, iter, SSA_OP_USE)
611 if (!vect_is_simple_use (rhs, mask_producers[i]->vinfo,
612 &def_stmt, &dt, &vectype))
614 if (dump_enabled_p ())
616 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
617 "not vectorized: can't compute mask type "
618 "for statement, ");
619 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt,
622 return false;
625 /* No vectype probably means external definition.
626 Allow it in case there is another operand which
627 allows to determine mask type. */
628 if (!vectype)
629 continue;
631 if (!mask_type)
632 mask_type = vectype;
633 else if (TYPE_VECTOR_SUBPARTS (mask_type)
634 != TYPE_VECTOR_SUBPARTS (vectype))
636 if (dump_enabled_p ())
638 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
639 "not vectorized: different sized masks "
640 "types in statement, ");
641 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
642 mask_type);
643 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
644 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
645 vectype);
646 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
648 return false;
650 else if (VECTOR_BOOLEAN_TYPE_P (mask_type)
651 != VECTOR_BOOLEAN_TYPE_P (vectype))
653 if (dump_enabled_p ())
655 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
656 "not vectorized: mixed mask and "
657 "nonmask vector types in statement, ");
658 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
659 mask_type);
660 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
661 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
662 vectype);
663 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
665 return false;
669 /* We may compare boolean value loaded as vector of integers.
670 Fix mask_type in such case. */
671 if (mask_type
672 && !VECTOR_BOOLEAN_TYPE_P (mask_type)
673 && gimple_code (stmt) == GIMPLE_ASSIGN
674 && TREE_CODE_CLASS (gimple_assign_rhs_code (stmt)) == tcc_comparison)
675 mask_type = build_same_sized_truth_vector_type (mask_type);
678 /* No mask_type should mean loop invariant predicate.
679 This is probably a subject for optimization in
680 if-conversion. */
681 if (!mask_type)
683 if (dump_enabled_p ())
685 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
686 "not vectorized: can't compute mask type "
687 "for statement, ");
688 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt,
691 return false;
694 STMT_VINFO_VECTYPE (mask_producers[i]) = mask_type;
697 return true;
701 /* Function vect_is_simple_iv_evolution.
703 FORNOW: A simple evolution of an induction variables in the loop is
704 considered a polynomial evolution. */
706 static bool
707 vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init,
708 tree * step)
710 tree init_expr;
711 tree step_expr;
712 tree evolution_part = evolution_part_in_loop_num (access_fn, loop_nb);
713 basic_block bb;
715 /* When there is no evolution in this loop, the evolution function
716 is not "simple". */
717 if (evolution_part == NULL_TREE)
718 return false;
720 /* When the evolution is a polynomial of degree >= 2
721 the evolution function is not "simple". */
722 if (tree_is_chrec (evolution_part))
723 return false;
725 step_expr = evolution_part;
726 init_expr = unshare_expr (initial_condition_in_loop_num (access_fn, loop_nb));
728 if (dump_enabled_p ())
730 dump_printf_loc (MSG_NOTE, vect_location, "step: ");
731 dump_generic_expr (MSG_NOTE, TDF_SLIM, step_expr);
732 dump_printf (MSG_NOTE, ", init: ");
733 dump_generic_expr (MSG_NOTE, TDF_SLIM, init_expr);
734 dump_printf (MSG_NOTE, "\n");
737 *init = init_expr;
738 *step = step_expr;
740 if (TREE_CODE (step_expr) != INTEGER_CST
741 && (TREE_CODE (step_expr) != SSA_NAME
742 || ((bb = gimple_bb (SSA_NAME_DEF_STMT (step_expr)))
743 && flow_bb_inside_loop_p (get_loop (cfun, loop_nb), bb))
744 || (!INTEGRAL_TYPE_P (TREE_TYPE (step_expr))
745 && (!SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr))
746 || !flag_associative_math)))
747 && (TREE_CODE (step_expr) != REAL_CST
748 || !flag_associative_math))
750 if (dump_enabled_p ())
751 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
752 "step unknown.\n");
753 return false;
756 return true;
759 /* Function vect_analyze_scalar_cycles_1.
761 Examine the cross iteration def-use cycles of scalar variables
762 in LOOP. LOOP_VINFO represents the loop that is now being
763 considered for vectorization (can be LOOP, or an outer-loop
764 enclosing LOOP). */
766 static void
767 vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, struct loop *loop)
769 basic_block bb = loop->header;
770 tree init, step;
771 auto_vec<gimple *, 64> worklist;
772 gphi_iterator gsi;
773 bool double_reduc;
775 if (dump_enabled_p ())
776 dump_printf_loc (MSG_NOTE, vect_location,
777 "=== vect_analyze_scalar_cycles ===\n");
779 /* First - identify all inductions. Reduction detection assumes that all the
780 inductions have been identified, therefore, this order must not be
781 changed. */
782 for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
784 gphi *phi = gsi.phi ();
785 tree access_fn = NULL;
786 tree def = PHI_RESULT (phi);
787 stmt_vec_info stmt_vinfo = vinfo_for_stmt (phi);
789 if (dump_enabled_p ())
791 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: ");
792 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
795 /* Skip virtual phi's. The data dependences that are associated with
796 virtual defs/uses (i.e., memory accesses) are analyzed elsewhere. */
797 if (virtual_operand_p (def))
798 continue;
800 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_unknown_def_type;
802 /* Analyze the evolution function. */
803 access_fn = analyze_scalar_evolution (loop, def);
804 if (access_fn)
806 STRIP_NOPS (access_fn);
807 if (dump_enabled_p ())
809 dump_printf_loc (MSG_NOTE, vect_location,
810 "Access function of PHI: ");
811 dump_generic_expr (MSG_NOTE, TDF_SLIM, access_fn);
812 dump_printf (MSG_NOTE, "\n");
814 STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
815 = initial_condition_in_loop_num (access_fn, loop->num);
816 STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo)
817 = evolution_part_in_loop_num (access_fn, loop->num);
820 if (!access_fn
821 || !vect_is_simple_iv_evolution (loop->num, access_fn, &init, &step)
822 || (LOOP_VINFO_LOOP (loop_vinfo) != loop
823 && TREE_CODE (step) != INTEGER_CST))
825 worklist.safe_push (phi);
826 continue;
829 gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
830 != NULL_TREE);
831 gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo) != NULL_TREE);
833 if (dump_enabled_p ())
834 dump_printf_loc (MSG_NOTE, vect_location, "Detected induction.\n");
835 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_induction_def;
839 /* Second - identify all reductions and nested cycles. */
840 while (worklist.length () > 0)
842 gimple *phi = worklist.pop ();
843 tree def = PHI_RESULT (phi);
844 stmt_vec_info stmt_vinfo = vinfo_for_stmt (phi);
845 gimple *reduc_stmt;
846 bool nested_cycle;
848 if (dump_enabled_p ())
850 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: ");
851 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
854 gcc_assert (!virtual_operand_p (def)
855 && STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_unknown_def_type);
857 nested_cycle = (loop != LOOP_VINFO_LOOP (loop_vinfo));
858 reduc_stmt = vect_force_simple_reduction (loop_vinfo, phi, !nested_cycle,
859 &double_reduc, false);
860 if (reduc_stmt)
862 if (double_reduc)
864 if (dump_enabled_p ())
865 dump_printf_loc (MSG_NOTE, vect_location,
866 "Detected double reduction.\n");
868 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_double_reduction_def;
869 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
870 vect_double_reduction_def;
872 else
874 if (nested_cycle)
876 if (dump_enabled_p ())
877 dump_printf_loc (MSG_NOTE, vect_location,
878 "Detected vectorizable nested cycle.\n");
880 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_nested_cycle;
881 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
882 vect_nested_cycle;
884 else
886 if (dump_enabled_p ())
887 dump_printf_loc (MSG_NOTE, vect_location,
888 "Detected reduction.\n");
890 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def;
891 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
892 vect_reduction_def;
893 /* Store the reduction cycles for possible vectorization in
894 loop-aware SLP. */
895 LOOP_VINFO_REDUCTIONS (loop_vinfo).safe_push (reduc_stmt);
899 else
900 if (dump_enabled_p ())
901 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
902 "Unknown def-use cycle pattern.\n");
907 /* Function vect_analyze_scalar_cycles.
909 Examine the cross iteration def-use cycles of scalar variables, by
910 analyzing the loop-header PHIs of scalar variables. Classify each
911 cycle as one of the following: invariant, induction, reduction, unknown.
912 We do that for the loop represented by LOOP_VINFO, and also to its
913 inner-loop, if exists.
914 Examples for scalar cycles:
916 Example1: reduction:
918 loop1:
919 for (i=0; i<N; i++)
920 sum += a[i];
922 Example2: induction:
924 loop2:
925 for (i=0; i<N; i++)
926 a[i] = i; */
928 static void
929 vect_analyze_scalar_cycles (loop_vec_info loop_vinfo)
931 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
933 vect_analyze_scalar_cycles_1 (loop_vinfo, loop);
935 /* When vectorizing an outer-loop, the inner-loop is executed sequentially.
936 Reductions in such inner-loop therefore have different properties than
937 the reductions in the nest that gets vectorized:
938 1. When vectorized, they are executed in the same order as in the original
939 scalar loop, so we can't change the order of computation when
940 vectorizing them.
941 2. FIXME: Inner-loop reductions can be used in the inner-loop, so the
942 current checks are too strict. */
944 if (loop->inner)
945 vect_analyze_scalar_cycles_1 (loop_vinfo, loop->inner);
948 /* Transfer group and reduction information from STMT to its pattern stmt. */
950 static void
951 vect_fixup_reduc_chain (gimple *stmt)
953 gimple *firstp = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt));
954 gimple *stmtp;
955 gcc_assert (!GROUP_FIRST_ELEMENT (vinfo_for_stmt (firstp))
956 && GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)));
957 GROUP_SIZE (vinfo_for_stmt (firstp)) = GROUP_SIZE (vinfo_for_stmt (stmt));
960 stmtp = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt));
961 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmtp)) = firstp;
962 stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmt));
963 if (stmt)
964 GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmtp))
965 = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt));
967 while (stmt);
968 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmtp)) = vect_reduction_def;
971 /* Fixup scalar cycles that now have their stmts detected as patterns. */
973 static void
974 vect_fixup_scalar_cycles_with_patterns (loop_vec_info loop_vinfo)
976 gimple *first;
977 unsigned i;
979 FOR_EACH_VEC_ELT (LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo), i, first)
980 if (STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (first)))
982 gimple *next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (first));
983 while (next)
985 if (! STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (next)))
986 break;
987 next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
989 /* If not all stmt in the chain are patterns try to handle
990 the chain without patterns. */
991 if (! next)
993 vect_fixup_reduc_chain (first);
994 LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo)[i]
995 = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (first));
1000 /* Function vect_get_loop_niters.
1002 Determine how many iterations the loop is executed and place it
1003 in NUMBER_OF_ITERATIONS. Place the number of latch iterations
1004 in NUMBER_OF_ITERATIONSM1. Place the condition under which the
1005 niter information holds in ASSUMPTIONS.
1007 Return the loop exit condition. */
1010 static gcond *
1011 vect_get_loop_niters (struct loop *loop, tree *assumptions,
1012 tree *number_of_iterations, tree *number_of_iterationsm1)
1014 edge exit = single_exit (loop);
1015 struct tree_niter_desc niter_desc;
1016 tree niter_assumptions, niter, may_be_zero;
1017 gcond *cond = get_loop_exit_condition (loop);
1019 *assumptions = boolean_true_node;
1020 *number_of_iterationsm1 = chrec_dont_know;
1021 *number_of_iterations = chrec_dont_know;
1022 if (dump_enabled_p ())
1023 dump_printf_loc (MSG_NOTE, vect_location,
1024 "=== get_loop_niters ===\n");
1026 if (!exit)
1027 return cond;
1029 niter = chrec_dont_know;
1030 may_be_zero = NULL_TREE;
1031 niter_assumptions = boolean_true_node;
1032 if (!number_of_iterations_exit_assumptions (loop, exit, &niter_desc, NULL)
1033 || chrec_contains_undetermined (niter_desc.niter))
1034 return cond;
1036 niter_assumptions = niter_desc.assumptions;
1037 may_be_zero = niter_desc.may_be_zero;
1038 niter = niter_desc.niter;
1040 if (may_be_zero && integer_zerop (may_be_zero))
1041 may_be_zero = NULL_TREE;
1043 if (may_be_zero)
1045 if (COMPARISON_CLASS_P (may_be_zero))
1047 /* Try to combine may_be_zero with assumptions, this can simplify
1048 computation of niter expression. */
1049 if (niter_assumptions && !integer_nonzerop (niter_assumptions))
1050 niter_assumptions = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
1051 niter_assumptions,
1052 fold_build1 (TRUTH_NOT_EXPR,
1053 boolean_type_node,
1054 may_be_zero));
1055 else
1056 niter = fold_build3 (COND_EXPR, TREE_TYPE (niter), may_be_zero,
1057 build_int_cst (TREE_TYPE (niter), 0), niter);
1059 may_be_zero = NULL_TREE;
1061 else if (integer_nonzerop (may_be_zero))
1063 *number_of_iterationsm1 = build_int_cst (TREE_TYPE (niter), 0);
1064 *number_of_iterations = build_int_cst (TREE_TYPE (niter), 1);
1065 return cond;
1067 else
1068 return cond;
1071 *assumptions = niter_assumptions;
1072 *number_of_iterationsm1 = niter;
1074 /* We want the number of loop header executions which is the number
1075 of latch executions plus one.
1076 ??? For UINT_MAX latch executions this number overflows to zero
1077 for loops like do { n++; } while (n != 0); */
1078 if (niter && !chrec_contains_undetermined (niter))
1079 niter = fold_build2 (PLUS_EXPR, TREE_TYPE (niter), unshare_expr (niter),
1080 build_int_cst (TREE_TYPE (niter), 1));
1081 *number_of_iterations = niter;
1083 return cond;
1086 /* Function bb_in_loop_p
1088 Used as predicate for dfs order traversal of the loop bbs. */
1090 static bool
1091 bb_in_loop_p (const_basic_block bb, const void *data)
1093 const struct loop *const loop = (const struct loop *)data;
1094 if (flow_bb_inside_loop_p (loop, bb))
1095 return true;
1096 return false;
1100 /* Function new_loop_vec_info.
1102 Create and initialize a new loop_vec_info struct for LOOP, as well as
1103 stmt_vec_info structs for all the stmts in LOOP. */
1105 static loop_vec_info
1106 new_loop_vec_info (struct loop *loop)
1108 loop_vec_info res;
1109 basic_block *bbs;
1110 gimple_stmt_iterator si;
1111 unsigned int i, nbbs;
1113 res = (loop_vec_info) xcalloc (1, sizeof (struct _loop_vec_info));
1114 res->kind = vec_info::loop;
1115 LOOP_VINFO_LOOP (res) = loop;
1117 bbs = get_loop_body (loop);
1119 /* Create/Update stmt_info for all stmts in the loop. */
1120 for (i = 0; i < loop->num_nodes; i++)
1122 basic_block bb = bbs[i];
1124 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
1126 gimple *phi = gsi_stmt (si);
1127 gimple_set_uid (phi, 0);
1128 set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, res));
1131 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1133 gimple *stmt = gsi_stmt (si);
1134 gimple_set_uid (stmt, 0);
1135 set_vinfo_for_stmt (stmt, new_stmt_vec_info (stmt, res));
1139 /* CHECKME: We want to visit all BBs before their successors (except for
1140 latch blocks, for which this assertion wouldn't hold). In the simple
1141 case of the loop forms we allow, a dfs order of the BBs would the same
1142 as reversed postorder traversal, so we are safe. */
1144 free (bbs);
1145 bbs = XCNEWVEC (basic_block, loop->num_nodes);
1146 nbbs = dfs_enumerate_from (loop->header, 0, bb_in_loop_p,
1147 bbs, loop->num_nodes, loop);
1148 gcc_assert (nbbs == loop->num_nodes);
1150 LOOP_VINFO_BBS (res) = bbs;
1151 LOOP_VINFO_NITERSM1 (res) = NULL;
1152 LOOP_VINFO_NITERS (res) = NULL;
1153 LOOP_VINFO_NITERS_UNCHANGED (res) = NULL;
1154 LOOP_VINFO_NITERS_ASSUMPTIONS (res) = NULL;
1155 LOOP_VINFO_COST_MODEL_THRESHOLD (res) = 0;
1156 LOOP_VINFO_VECTORIZABLE_P (res) = 0;
1157 LOOP_VINFO_PEELING_FOR_ALIGNMENT (res) = 0;
1158 LOOP_VINFO_VECT_FACTOR (res) = 0;
1159 LOOP_VINFO_LOOP_NEST (res) = vNULL;
1160 LOOP_VINFO_DATAREFS (res) = vNULL;
1161 LOOP_VINFO_DDRS (res) = vNULL;
1162 LOOP_VINFO_UNALIGNED_DR (res) = NULL;
1163 LOOP_VINFO_MAY_MISALIGN_STMTS (res) = vNULL;
1164 LOOP_VINFO_MAY_ALIAS_DDRS (res) = vNULL;
1165 LOOP_VINFO_GROUPED_STORES (res) = vNULL;
1166 LOOP_VINFO_REDUCTIONS (res) = vNULL;
1167 LOOP_VINFO_REDUCTION_CHAINS (res) = vNULL;
1168 LOOP_VINFO_SLP_INSTANCES (res) = vNULL;
1169 LOOP_VINFO_SLP_UNROLLING_FACTOR (res) = 1;
1170 LOOP_VINFO_TARGET_COST_DATA (res) = init_cost (loop);
1171 LOOP_VINFO_PEELING_FOR_GAPS (res) = false;
1172 LOOP_VINFO_PEELING_FOR_NITER (res) = false;
1173 LOOP_VINFO_OPERANDS_SWAPPED (res) = false;
1175 return res;
1179 /* Function destroy_loop_vec_info.
1181 Free LOOP_VINFO struct, as well as all the stmt_vec_info structs of all the
1182 stmts in the loop. */
1184 void
1185 destroy_loop_vec_info (loop_vec_info loop_vinfo, bool clean_stmts)
1187 struct loop *loop;
1188 basic_block *bbs;
1189 int nbbs;
1190 gimple_stmt_iterator si;
1191 int j;
1192 vec<slp_instance> slp_instances;
1193 slp_instance instance;
1194 bool swapped;
1196 if (!loop_vinfo)
1197 return;
1199 loop = LOOP_VINFO_LOOP (loop_vinfo);
1201 bbs = LOOP_VINFO_BBS (loop_vinfo);
1202 nbbs = clean_stmts ? loop->num_nodes : 0;
1203 swapped = LOOP_VINFO_OPERANDS_SWAPPED (loop_vinfo);
1205 for (j = 0; j < nbbs; j++)
1207 basic_block bb = bbs[j];
1208 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
1209 free_stmt_vec_info (gsi_stmt (si));
1211 for (si = gsi_start_bb (bb); !gsi_end_p (si); )
1213 gimple *stmt = gsi_stmt (si);
1215 /* We may have broken canonical form by moving a constant
1216 into RHS1 of a commutative op. Fix such occurrences. */
1217 if (swapped && is_gimple_assign (stmt))
1219 enum tree_code code = gimple_assign_rhs_code (stmt);
1221 if ((code == PLUS_EXPR
1222 || code == POINTER_PLUS_EXPR
1223 || code == MULT_EXPR)
1224 && CONSTANT_CLASS_P (gimple_assign_rhs1 (stmt)))
1225 swap_ssa_operands (stmt,
1226 gimple_assign_rhs1_ptr (stmt),
1227 gimple_assign_rhs2_ptr (stmt));
1228 else if (code == COND_EXPR
1229 && CONSTANT_CLASS_P (gimple_assign_rhs2 (stmt)))
1231 tree cond_expr = gimple_assign_rhs1 (stmt);
1232 enum tree_code cond_code = TREE_CODE (cond_expr);
1234 if (TREE_CODE_CLASS (cond_code) == tcc_comparison)
1236 bool honor_nans = HONOR_NANS (TREE_OPERAND (cond_expr,
1237 0));
1238 cond_code = invert_tree_comparison (cond_code,
1239 honor_nans);
1240 if (cond_code != ERROR_MARK)
1242 TREE_SET_CODE (cond_expr, cond_code);
1243 swap_ssa_operands (stmt,
1244 gimple_assign_rhs2_ptr (stmt),
1245 gimple_assign_rhs3_ptr (stmt));
1251 /* Free stmt_vec_info. */
1252 free_stmt_vec_info (stmt);
1253 gsi_next (&si);
1257 free (LOOP_VINFO_BBS (loop_vinfo));
1258 vect_destroy_datarefs (loop_vinfo);
1259 free_dependence_relations (LOOP_VINFO_DDRS (loop_vinfo));
1260 LOOP_VINFO_LOOP_NEST (loop_vinfo).release ();
1261 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).release ();
1262 LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).release ();
1263 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).release ();
1264 slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
1265 FOR_EACH_VEC_ELT (slp_instances, j, instance)
1266 vect_free_slp_instance (instance);
1268 LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
1269 LOOP_VINFO_GROUPED_STORES (loop_vinfo).release ();
1270 LOOP_VINFO_REDUCTIONS (loop_vinfo).release ();
1271 LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).release ();
1273 destroy_cost_data (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo));
1274 loop_vinfo->scalar_cost_vec.release ();
1276 free (loop_vinfo);
1277 loop->aux = NULL;
1281 /* Calculate the cost of one scalar iteration of the loop. */
1282 static void
1283 vect_compute_single_scalar_iteration_cost (loop_vec_info loop_vinfo)
1285 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1286 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1287 int nbbs = loop->num_nodes, factor, scalar_single_iter_cost = 0;
1288 int innerloop_iters, i;
1290 /* Count statements in scalar loop. Using this as scalar cost for a single
1291 iteration for now.
1293 TODO: Add outer loop support.
1295 TODO: Consider assigning different costs to different scalar
1296 statements. */
1298 /* FORNOW. */
1299 innerloop_iters = 1;
1300 if (loop->inner)
1301 innerloop_iters = 50; /* FIXME */
1303 for (i = 0; i < nbbs; i++)
1305 gimple_stmt_iterator si;
1306 basic_block bb = bbs[i];
1308 if (bb->loop_father == loop->inner)
1309 factor = innerloop_iters;
1310 else
1311 factor = 1;
1313 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1315 gimple *stmt = gsi_stmt (si);
1316 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1318 if (!is_gimple_assign (stmt) && !is_gimple_call (stmt))
1319 continue;
1321 /* Skip stmts that are not vectorized inside the loop. */
1322 if (stmt_info
1323 && !STMT_VINFO_RELEVANT_P (stmt_info)
1324 && (!STMT_VINFO_LIVE_P (stmt_info)
1325 || !VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1326 && !STMT_VINFO_IN_PATTERN_P (stmt_info))
1327 continue;
1329 vect_cost_for_stmt kind;
1330 if (STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt)))
1332 if (DR_IS_READ (STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt))))
1333 kind = scalar_load;
1334 else
1335 kind = scalar_store;
1337 else
1338 kind = scalar_stmt;
1340 scalar_single_iter_cost
1341 += record_stmt_cost (&LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1342 factor, kind, NULL, 0, vect_prologue);
1345 LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo)
1346 = scalar_single_iter_cost;
1350 /* Function vect_analyze_loop_form_1.
1352 Verify that certain CFG restrictions hold, including:
1353 - the loop has a pre-header
1354 - the loop has a single entry and exit
1355 - the loop exit condition is simple enough
1356 - the number of iterations can be analyzed, i.e, a countable loop. The
1357 niter could be analyzed under some assumptions. */
1359 bool
1360 vect_analyze_loop_form_1 (struct loop *loop, gcond **loop_cond,
1361 tree *assumptions, tree *number_of_iterationsm1,
1362 tree *number_of_iterations, gcond **inner_loop_cond)
1364 if (dump_enabled_p ())
1365 dump_printf_loc (MSG_NOTE, vect_location,
1366 "=== vect_analyze_loop_form ===\n");
1368 /* Different restrictions apply when we are considering an inner-most loop,
1369 vs. an outer (nested) loop.
1370 (FORNOW. May want to relax some of these restrictions in the future). */
1372 if (!loop->inner)
1374 /* Inner-most loop. We currently require that the number of BBs is
1375 exactly 2 (the header and latch). Vectorizable inner-most loops
1376 look like this:
1378 (pre-header)
1380 header <--------+
1381 | | |
1382 | +--> latch --+
1384 (exit-bb) */
1386 if (loop->num_nodes != 2)
1388 if (dump_enabled_p ())
1389 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1390 "not vectorized: control flow in loop.\n");
1391 return false;
1394 if (empty_block_p (loop->header))
1396 if (dump_enabled_p ())
1397 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1398 "not vectorized: empty loop.\n");
1399 return false;
1402 else
1404 struct loop *innerloop = loop->inner;
1405 edge entryedge;
1407 /* Nested loop. We currently require that the loop is doubly-nested,
1408 contains a single inner loop, and the number of BBs is exactly 5.
1409 Vectorizable outer-loops look like this:
1411 (pre-header)
1413 header <---+
1415 inner-loop |
1417 tail ------+
1419 (exit-bb)
1421 The inner-loop has the properties expected of inner-most loops
1422 as described above. */
1424 if ((loop->inner)->inner || (loop->inner)->next)
1426 if (dump_enabled_p ())
1427 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1428 "not vectorized: multiple nested loops.\n");
1429 return false;
1432 if (loop->num_nodes != 5)
1434 if (dump_enabled_p ())
1435 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1436 "not vectorized: control flow in loop.\n");
1437 return false;
1440 entryedge = loop_preheader_edge (innerloop);
1441 if (entryedge->src != loop->header
1442 || !single_exit (innerloop)
1443 || single_exit (innerloop)->dest != EDGE_PRED (loop->latch, 0)->src)
1445 if (dump_enabled_p ())
1446 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1447 "not vectorized: unsupported outerloop form.\n");
1448 return false;
1451 /* Analyze the inner-loop. */
1452 tree inner_niterm1, inner_niter, inner_assumptions;
1453 if (! vect_analyze_loop_form_1 (loop->inner, inner_loop_cond,
1454 &inner_assumptions, &inner_niterm1,
1455 &inner_niter, NULL)
1456 /* Don't support analyzing niter under assumptions for inner
1457 loop. */
1458 || !integer_onep (inner_assumptions))
1460 if (dump_enabled_p ())
1461 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1462 "not vectorized: Bad inner loop.\n");
1463 return false;
1466 if (!expr_invariant_in_loop_p (loop, inner_niter))
1468 if (dump_enabled_p ())
1469 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1470 "not vectorized: inner-loop count not"
1471 " invariant.\n");
1472 return false;
1475 if (dump_enabled_p ())
1476 dump_printf_loc (MSG_NOTE, vect_location,
1477 "Considering outer-loop vectorization.\n");
1480 if (!single_exit (loop)
1481 || EDGE_COUNT (loop->header->preds) != 2)
1483 if (dump_enabled_p ())
1485 if (!single_exit (loop))
1486 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1487 "not vectorized: multiple exits.\n");
1488 else if (EDGE_COUNT (loop->header->preds) != 2)
1489 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1490 "not vectorized: too many incoming edges.\n");
1492 return false;
1495 /* We assume that the loop exit condition is at the end of the loop. i.e,
1496 that the loop is represented as a do-while (with a proper if-guard
1497 before the loop if needed), where the loop header contains all the
1498 executable statements, and the latch is empty. */
1499 if (!empty_block_p (loop->latch)
1500 || !gimple_seq_empty_p (phi_nodes (loop->latch)))
1502 if (dump_enabled_p ())
1503 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1504 "not vectorized: latch block not empty.\n");
1505 return false;
1508 /* Make sure the exit is not abnormal. */
1509 edge e = single_exit (loop);
1510 if (e->flags & EDGE_ABNORMAL)
1512 if (dump_enabled_p ())
1513 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1514 "not vectorized: abnormal loop exit edge.\n");
1515 return false;
1518 *loop_cond = vect_get_loop_niters (loop, assumptions, number_of_iterations,
1519 number_of_iterationsm1);
1520 if (!*loop_cond)
1522 if (dump_enabled_p ())
1523 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1524 "not vectorized: complicated exit condition.\n");
1525 return false;
1528 if (integer_zerop (*assumptions)
1529 || !*number_of_iterations
1530 || chrec_contains_undetermined (*number_of_iterations))
1532 if (dump_enabled_p ())
1533 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1534 "not vectorized: number of iterations cannot be "
1535 "computed.\n");
1536 return false;
1539 if (integer_zerop (*number_of_iterations))
1541 if (dump_enabled_p ())
1542 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1543 "not vectorized: number of iterations = 0.\n");
1544 return false;
1547 return true;
1550 /* Analyze LOOP form and return a loop_vec_info if it is of suitable form. */
1552 loop_vec_info
1553 vect_analyze_loop_form (struct loop *loop)
1555 tree assumptions, number_of_iterations, number_of_iterationsm1;
1556 gcond *loop_cond, *inner_loop_cond = NULL;
1558 if (! vect_analyze_loop_form_1 (loop, &loop_cond,
1559 &assumptions, &number_of_iterationsm1,
1560 &number_of_iterations, &inner_loop_cond))
1561 return NULL;
1563 loop_vec_info loop_vinfo = new_loop_vec_info (loop);
1564 LOOP_VINFO_NITERSM1 (loop_vinfo) = number_of_iterationsm1;
1565 LOOP_VINFO_NITERS (loop_vinfo) = number_of_iterations;
1566 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = number_of_iterations;
1567 if (!integer_onep (assumptions))
1569 /* We consider to vectorize this loop by versioning it under
1570 some assumptions. In order to do this, we need to clear
1571 existing information computed by scev and niter analyzer. */
1572 scev_reset_htab ();
1573 free_numbers_of_iterations_estimates_loop (loop);
1574 /* Also set flag for this loop so that following scev and niter
1575 analysis are done under the assumptions. */
1576 loop_constraint_set (loop, LOOP_C_FINITE);
1577 /* Also record the assumptions for versioning. */
1578 LOOP_VINFO_NITERS_ASSUMPTIONS (loop_vinfo) = assumptions;
1581 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1583 if (dump_enabled_p ())
1585 dump_printf_loc (MSG_NOTE, vect_location,
1586 "Symbolic number of iterations is ");
1587 dump_generic_expr (MSG_NOTE, TDF_DETAILS, number_of_iterations);
1588 dump_printf (MSG_NOTE, "\n");
1592 STMT_VINFO_TYPE (vinfo_for_stmt (loop_cond)) = loop_exit_ctrl_vec_info_type;
1593 if (inner_loop_cond)
1594 STMT_VINFO_TYPE (vinfo_for_stmt (inner_loop_cond))
1595 = loop_exit_ctrl_vec_info_type;
1597 gcc_assert (!loop->aux);
1598 loop->aux = loop_vinfo;
1599 return loop_vinfo;
1604 /* Scan the loop stmts and dependent on whether there are any (non-)SLP
1605 statements update the vectorization factor. */
1607 static void
1608 vect_update_vf_for_slp (loop_vec_info loop_vinfo)
1610 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1611 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1612 int nbbs = loop->num_nodes;
1613 unsigned int vectorization_factor;
1614 int i;
1616 if (dump_enabled_p ())
1617 dump_printf_loc (MSG_NOTE, vect_location,
1618 "=== vect_update_vf_for_slp ===\n");
1620 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1621 gcc_assert (vectorization_factor != 0);
1623 /* If all the stmts in the loop can be SLPed, we perform only SLP, and
1624 vectorization factor of the loop is the unrolling factor required by
1625 the SLP instances. If that unrolling factor is 1, we say, that we
1626 perform pure SLP on loop - cross iteration parallelism is not
1627 exploited. */
1628 bool only_slp_in_loop = true;
1629 for (i = 0; i < nbbs; i++)
1631 basic_block bb = bbs[i];
1632 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1633 gsi_next (&si))
1635 gimple *stmt = gsi_stmt (si);
1636 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1637 if (STMT_VINFO_IN_PATTERN_P (stmt_info)
1638 && STMT_VINFO_RELATED_STMT (stmt_info))
1640 stmt = STMT_VINFO_RELATED_STMT (stmt_info);
1641 stmt_info = vinfo_for_stmt (stmt);
1643 if ((STMT_VINFO_RELEVANT_P (stmt_info)
1644 || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1645 && !PURE_SLP_STMT (stmt_info))
1646 /* STMT needs both SLP and loop-based vectorization. */
1647 only_slp_in_loop = false;
1651 if (only_slp_in_loop)
1652 vectorization_factor = LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo);
1653 else
1654 vectorization_factor
1655 = least_common_multiple (vectorization_factor,
1656 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo));
1658 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
1659 if (dump_enabled_p ())
1660 dump_printf_loc (MSG_NOTE, vect_location,
1661 "Updating vectorization factor to %d\n",
1662 vectorization_factor);
1665 /* Function vect_analyze_loop_operations.
1667 Scan the loop stmts and make sure they are all vectorizable. */
1669 static bool
1670 vect_analyze_loop_operations (loop_vec_info loop_vinfo)
1672 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1673 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1674 int nbbs = loop->num_nodes;
1675 int i;
1676 stmt_vec_info stmt_info;
1677 bool need_to_vectorize = false;
1678 bool ok;
1680 if (dump_enabled_p ())
1681 dump_printf_loc (MSG_NOTE, vect_location,
1682 "=== vect_analyze_loop_operations ===\n");
1684 for (i = 0; i < nbbs; i++)
1686 basic_block bb = bbs[i];
1688 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
1689 gsi_next (&si))
1691 gphi *phi = si.phi ();
1692 ok = true;
1694 stmt_info = vinfo_for_stmt (phi);
1695 if (dump_enabled_p ())
1697 dump_printf_loc (MSG_NOTE, vect_location, "examining phi: ");
1698 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
1700 if (virtual_operand_p (gimple_phi_result (phi)))
1701 continue;
1703 /* Inner-loop loop-closed exit phi in outer-loop vectorization
1704 (i.e., a phi in the tail of the outer-loop). */
1705 if (! is_loop_header_bb_p (bb))
1707 /* FORNOW: we currently don't support the case that these phis
1708 are not used in the outerloop (unless it is double reduction,
1709 i.e., this phi is vect_reduction_def), cause this case
1710 requires to actually do something here. */
1711 if ((!STMT_VINFO_RELEVANT_P (stmt_info)
1712 || STMT_VINFO_LIVE_P (stmt_info))
1713 && STMT_VINFO_DEF_TYPE (stmt_info)
1714 != vect_double_reduction_def)
1716 if (dump_enabled_p ())
1717 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1718 "Unsupported loop-closed phi in "
1719 "outer-loop.\n");
1720 return false;
1723 /* If PHI is used in the outer loop, we check that its operand
1724 is defined in the inner loop. */
1725 if (STMT_VINFO_RELEVANT_P (stmt_info))
1727 tree phi_op;
1728 gimple *op_def_stmt;
1730 if (gimple_phi_num_args (phi) != 1)
1731 return false;
1733 phi_op = PHI_ARG_DEF (phi, 0);
1734 if (TREE_CODE (phi_op) != SSA_NAME)
1735 return false;
1737 op_def_stmt = SSA_NAME_DEF_STMT (phi_op);
1738 if (gimple_nop_p (op_def_stmt)
1739 || !flow_bb_inside_loop_p (loop, gimple_bb (op_def_stmt))
1740 || !vinfo_for_stmt (op_def_stmt))
1741 return false;
1743 if (STMT_VINFO_RELEVANT (vinfo_for_stmt (op_def_stmt))
1744 != vect_used_in_outer
1745 && STMT_VINFO_RELEVANT (vinfo_for_stmt (op_def_stmt))
1746 != vect_used_in_outer_by_reduction)
1747 return false;
1750 continue;
1753 gcc_assert (stmt_info);
1755 if ((STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope
1756 || STMT_VINFO_LIVE_P (stmt_info))
1757 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
1759 /* A scalar-dependence cycle that we don't support. */
1760 if (dump_enabled_p ())
1761 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1762 "not vectorized: scalar dependence cycle.\n");
1763 return false;
1766 if (STMT_VINFO_RELEVANT_P (stmt_info))
1768 need_to_vectorize = true;
1769 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def)
1770 ok = vectorizable_induction (phi, NULL, NULL);
1773 if (ok && STMT_VINFO_LIVE_P (stmt_info))
1774 ok = vectorizable_live_operation (phi, NULL, NULL, -1, NULL);
1776 if (!ok)
1778 if (dump_enabled_p ())
1780 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1781 "not vectorized: relevant phi not "
1782 "supported: ");
1783 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, phi, 0);
1785 return false;
1789 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1790 gsi_next (&si))
1792 gimple *stmt = gsi_stmt (si);
1793 if (!gimple_clobber_p (stmt)
1794 && !vect_analyze_stmt (stmt, &need_to_vectorize, NULL))
1795 return false;
1797 } /* bbs */
1799 /* All operations in the loop are either irrelevant (deal with loop
1800 control, or dead), or only used outside the loop and can be moved
1801 out of the loop (e.g. invariants, inductions). The loop can be
1802 optimized away by scalar optimizations. We're better off not
1803 touching this loop. */
1804 if (!need_to_vectorize)
1806 if (dump_enabled_p ())
1807 dump_printf_loc (MSG_NOTE, vect_location,
1808 "All the computation can be taken out of the loop.\n");
1809 if (dump_enabled_p ())
1810 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1811 "not vectorized: redundant loop. no profit to "
1812 "vectorize.\n");
1813 return false;
1816 return true;
1820 /* Function vect_analyze_loop_2.
1822 Apply a set of analyses on LOOP, and create a loop_vec_info struct
1823 for it. The different analyses will record information in the
1824 loop_vec_info struct. */
1825 static bool
1826 vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal)
1828 bool ok;
1829 int max_vf = MAX_VECTORIZATION_FACTOR;
1830 int min_vf = 2;
1831 unsigned int n_stmts = 0;
1833 /* The first group of checks is independent of the vector size. */
1834 fatal = true;
1836 /* Find all data references in the loop (which correspond to vdefs/vuses)
1837 and analyze their evolution in the loop. */
1839 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1841 loop_p loop = LOOP_VINFO_LOOP (loop_vinfo);
1842 if (!find_loop_nest (loop, &LOOP_VINFO_LOOP_NEST (loop_vinfo)))
1844 if (dump_enabled_p ())
1845 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1846 "not vectorized: loop nest containing two "
1847 "or more consecutive inner loops cannot be "
1848 "vectorized\n");
1849 return false;
1852 for (unsigned i = 0; i < loop->num_nodes; i++)
1853 for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
1854 !gsi_end_p (gsi); gsi_next (&gsi))
1856 gimple *stmt = gsi_stmt (gsi);
1857 if (is_gimple_debug (stmt))
1858 continue;
1859 ++n_stmts;
1860 if (!find_data_references_in_stmt (loop, stmt,
1861 &LOOP_VINFO_DATAREFS (loop_vinfo)))
1863 if (is_gimple_call (stmt) && loop->safelen)
1865 tree fndecl = gimple_call_fndecl (stmt), op;
1866 if (fndecl != NULL_TREE)
1868 cgraph_node *node = cgraph_node::get (fndecl);
1869 if (node != NULL && node->simd_clones != NULL)
1871 unsigned int j, n = gimple_call_num_args (stmt);
1872 for (j = 0; j < n; j++)
1874 op = gimple_call_arg (stmt, j);
1875 if (DECL_P (op)
1876 || (REFERENCE_CLASS_P (op)
1877 && get_base_address (op)))
1878 break;
1880 op = gimple_call_lhs (stmt);
1881 /* Ignore #pragma omp declare simd functions
1882 if they don't have data references in the
1883 call stmt itself. */
1884 if (j == n
1885 && !(op
1886 && (DECL_P (op)
1887 || (REFERENCE_CLASS_P (op)
1888 && get_base_address (op)))))
1889 continue;
1893 if (dump_enabled_p ())
1894 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1895 "not vectorized: loop contains function "
1896 "calls or data references that cannot "
1897 "be analyzed\n");
1898 return false;
1902 /* Analyze the data references and also adjust the minimal
1903 vectorization factor according to the loads and stores. */
1905 ok = vect_analyze_data_refs (loop_vinfo, &min_vf);
1906 if (!ok)
1908 if (dump_enabled_p ())
1909 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1910 "bad data references.\n");
1911 return false;
1914 /* Classify all cross-iteration scalar data-flow cycles.
1915 Cross-iteration cycles caused by virtual phis are analyzed separately. */
1916 vect_analyze_scalar_cycles (loop_vinfo);
1918 vect_pattern_recog (loop_vinfo);
1920 vect_fixup_scalar_cycles_with_patterns (loop_vinfo);
1922 /* Analyze the access patterns of the data-refs in the loop (consecutive,
1923 complex, etc.). FORNOW: Only handle consecutive access pattern. */
1925 ok = vect_analyze_data_ref_accesses (loop_vinfo);
1926 if (!ok)
1928 if (dump_enabled_p ())
1929 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1930 "bad data access.\n");
1931 return false;
1934 /* Data-flow analysis to detect stmts that do not need to be vectorized. */
1936 ok = vect_mark_stmts_to_be_vectorized (loop_vinfo);
1937 if (!ok)
1939 if (dump_enabled_p ())
1940 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1941 "unexpected pattern.\n");
1942 return false;
1945 /* While the rest of the analysis below depends on it in some way. */
1946 fatal = false;
1948 /* Analyze data dependences between the data-refs in the loop
1949 and adjust the maximum vectorization factor according to
1950 the dependences.
1951 FORNOW: fail at the first data dependence that we encounter. */
1953 ok = vect_analyze_data_ref_dependences (loop_vinfo, &max_vf);
1954 if (!ok
1955 || max_vf < min_vf)
1957 if (dump_enabled_p ())
1958 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1959 "bad data dependence.\n");
1960 return false;
1963 ok = vect_determine_vectorization_factor (loop_vinfo);
1964 if (!ok)
1966 if (dump_enabled_p ())
1967 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1968 "can't determine vectorization factor.\n");
1969 return false;
1971 if (max_vf < LOOP_VINFO_VECT_FACTOR (loop_vinfo))
1973 if (dump_enabled_p ())
1974 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1975 "bad data dependence.\n");
1976 return false;
1979 /* Compute the scalar iteration cost. */
1980 vect_compute_single_scalar_iteration_cost (loop_vinfo);
1982 int saved_vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1983 HOST_WIDE_INT estimated_niter;
1984 unsigned th;
1985 int min_scalar_loop_bound;
1987 /* Check the SLP opportunities in the loop, analyze and build SLP trees. */
1988 ok = vect_analyze_slp (loop_vinfo, n_stmts);
1989 if (!ok)
1990 return false;
1992 /* If there are any SLP instances mark them as pure_slp. */
1993 bool slp = vect_make_slp_decision (loop_vinfo);
1994 if (slp)
1996 /* Find stmts that need to be both vectorized and SLPed. */
1997 vect_detect_hybrid_slp (loop_vinfo);
1999 /* Update the vectorization factor based on the SLP decision. */
2000 vect_update_vf_for_slp (loop_vinfo);
2003 /* This is the point where we can re-start analysis with SLP forced off. */
2004 start_over:
2006 /* Now the vectorization factor is final. */
2007 unsigned vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2008 gcc_assert (vectorization_factor != 0);
2010 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && dump_enabled_p ())
2011 dump_printf_loc (MSG_NOTE, vect_location,
2012 "vectorization_factor = %d, niters = "
2013 HOST_WIDE_INT_PRINT_DEC "\n", vectorization_factor,
2014 LOOP_VINFO_INT_NITERS (loop_vinfo));
2016 HOST_WIDE_INT max_niter
2017 = likely_max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
2018 if ((LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2019 && (LOOP_VINFO_INT_NITERS (loop_vinfo) < vectorization_factor))
2020 || (max_niter != -1
2021 && (unsigned HOST_WIDE_INT) max_niter < vectorization_factor))
2023 if (dump_enabled_p ())
2024 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2025 "not vectorized: iteration count smaller than "
2026 "vectorization factor.\n");
2027 return false;
2030 /* Analyze the alignment of the data-refs in the loop.
2031 Fail if a data reference is found that cannot be vectorized. */
2033 ok = vect_analyze_data_refs_alignment (loop_vinfo);
2034 if (!ok)
2036 if (dump_enabled_p ())
2037 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2038 "bad data alignment.\n");
2039 return false;
2042 /* Prune the list of ddrs to be tested at run-time by versioning for alias.
2043 It is important to call pruning after vect_analyze_data_ref_accesses,
2044 since we use grouping information gathered by interleaving analysis. */
2045 ok = vect_prune_runtime_alias_test_list (loop_vinfo);
2046 if (!ok)
2047 return false;
2049 /* This pass will decide on using loop versioning and/or loop peeling in
2050 order to enhance the alignment of data references in the loop. */
2051 ok = vect_enhance_data_refs_alignment (loop_vinfo);
2052 if (!ok)
2054 if (dump_enabled_p ())
2055 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2056 "bad data alignment.\n");
2057 return false;
2060 if (slp)
2062 /* Analyze operations in the SLP instances. Note this may
2063 remove unsupported SLP instances which makes the above
2064 SLP kind detection invalid. */
2065 unsigned old_size = LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length ();
2066 vect_slp_analyze_operations (LOOP_VINFO_SLP_INSTANCES (loop_vinfo),
2067 LOOP_VINFO_TARGET_COST_DATA (loop_vinfo));
2068 if (LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length () != old_size)
2069 goto again;
2072 /* Scan all the remaining operations in the loop that are not subject
2073 to SLP and make sure they are vectorizable. */
2074 ok = vect_analyze_loop_operations (loop_vinfo);
2075 if (!ok)
2077 if (dump_enabled_p ())
2078 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2079 "bad operation or unsupported loop bound.\n");
2080 return false;
2083 /* If epilog loop is required because of data accesses with gaps,
2084 one additional iteration needs to be peeled. Check if there is
2085 enough iterations for vectorization. */
2086 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2087 && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
2089 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2090 tree scalar_niters = LOOP_VINFO_NITERSM1 (loop_vinfo);
2092 if (wi::to_widest (scalar_niters) < vf)
2094 if (dump_enabled_p ())
2095 dump_printf_loc (MSG_NOTE, vect_location,
2096 "loop has no enough iterations to support"
2097 " peeling for gaps.\n");
2098 return false;
2102 /* Analyze cost. Decide if worth while to vectorize. */
2103 int min_profitable_estimate, min_profitable_iters;
2104 vect_estimate_min_profitable_iters (loop_vinfo, &min_profitable_iters,
2105 &min_profitable_estimate);
2107 if (min_profitable_iters < 0)
2109 if (dump_enabled_p ())
2110 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2111 "not vectorized: vectorization not profitable.\n");
2112 if (dump_enabled_p ())
2113 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2114 "not vectorized: vector version will never be "
2115 "profitable.\n");
2116 goto again;
2119 min_scalar_loop_bound = ((PARAM_VALUE (PARAM_MIN_VECT_LOOP_BOUND)
2120 * vectorization_factor) - 1);
2122 /* Use the cost model only if it is more conservative than user specified
2123 threshold. */
2124 th = (unsigned) min_scalar_loop_bound;
2125 if (min_profitable_iters
2126 && (!min_scalar_loop_bound
2127 || min_profitable_iters > min_scalar_loop_bound))
2128 th = (unsigned) min_profitable_iters;
2130 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = th;
2132 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2133 && LOOP_VINFO_INT_NITERS (loop_vinfo) <= th)
2135 if (dump_enabled_p ())
2136 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2137 "not vectorized: vectorization not profitable.\n");
2138 if (dump_enabled_p ())
2139 dump_printf_loc (MSG_NOTE, vect_location,
2140 "not vectorized: iteration count smaller than user "
2141 "specified loop bound parameter or minimum profitable "
2142 "iterations (whichever is more conservative).\n");
2143 goto again;
2146 estimated_niter
2147 = estimated_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
2148 if (estimated_niter == -1)
2149 estimated_niter = max_niter;
2150 if (estimated_niter != -1
2151 && ((unsigned HOST_WIDE_INT) estimated_niter
2152 <= MAX (th, (unsigned)min_profitable_estimate)))
2154 if (dump_enabled_p ())
2155 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2156 "not vectorized: estimated iteration count too "
2157 "small.\n");
2158 if (dump_enabled_p ())
2159 dump_printf_loc (MSG_NOTE, vect_location,
2160 "not vectorized: estimated iteration count smaller "
2161 "than specified loop bound parameter or minimum "
2162 "profitable iterations (whichever is more "
2163 "conservative).\n");
2164 goto again;
2167 /* Decide whether we need to create an epilogue loop to handle
2168 remaining scalar iterations. */
2169 th = ((LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) + 1)
2170 / LOOP_VINFO_VECT_FACTOR (loop_vinfo))
2171 * LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2173 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2174 && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) > 0)
2176 if (ctz_hwi (LOOP_VINFO_INT_NITERS (loop_vinfo)
2177 - LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo))
2178 < exact_log2 (LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
2179 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
2181 else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
2182 || (tree_ctz (LOOP_VINFO_NITERS (loop_vinfo))
2183 < (unsigned)exact_log2 (LOOP_VINFO_VECT_FACTOR (loop_vinfo))
2184 /* In case of versioning, check if the maximum number of
2185 iterations is greater than th. If they are identical,
2186 the epilogue is unnecessary. */
2187 && (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
2188 || (unsigned HOST_WIDE_INT) max_niter > th)))
2189 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
2191 /* If an epilogue loop is required make sure we can create one. */
2192 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2193 || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo))
2195 if (dump_enabled_p ())
2196 dump_printf_loc (MSG_NOTE, vect_location, "epilog loop required\n");
2197 if (!vect_can_advance_ivs_p (loop_vinfo)
2198 || !slpeel_can_duplicate_loop_p (LOOP_VINFO_LOOP (loop_vinfo),
2199 single_exit (LOOP_VINFO_LOOP
2200 (loop_vinfo))))
2202 if (dump_enabled_p ())
2203 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2204 "not vectorized: can't create required "
2205 "epilog loop\n");
2206 goto again;
2210 gcc_assert (vectorization_factor
2211 == (unsigned)LOOP_VINFO_VECT_FACTOR (loop_vinfo));
2213 /* Ok to vectorize! */
2214 return true;
2216 again:
2217 /* Try again with SLP forced off but if we didn't do any SLP there is
2218 no point in re-trying. */
2219 if (!slp)
2220 return false;
2222 /* If there are reduction chains re-trying will fail anyway. */
2223 if (! LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).is_empty ())
2224 return false;
2226 /* Likewise if the grouped loads or stores in the SLP cannot be handled
2227 via interleaving or lane instructions. */
2228 slp_instance instance;
2229 slp_tree node;
2230 unsigned i, j;
2231 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
2233 stmt_vec_info vinfo;
2234 vinfo = vinfo_for_stmt
2235 (SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (instance))[0]);
2236 if (! STMT_VINFO_GROUPED_ACCESS (vinfo))
2237 continue;
2238 vinfo = vinfo_for_stmt (STMT_VINFO_GROUP_FIRST_ELEMENT (vinfo));
2239 unsigned int size = STMT_VINFO_GROUP_SIZE (vinfo);
2240 tree vectype = STMT_VINFO_VECTYPE (vinfo);
2241 if (! vect_store_lanes_supported (vectype, size)
2242 && ! vect_grouped_store_supported (vectype, size))
2243 return false;
2244 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), j, node)
2246 vinfo = vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (node)[0]);
2247 vinfo = vinfo_for_stmt (STMT_VINFO_GROUP_FIRST_ELEMENT (vinfo));
2248 bool single_element_p = !STMT_VINFO_GROUP_NEXT_ELEMENT (vinfo);
2249 size = STMT_VINFO_GROUP_SIZE (vinfo);
2250 vectype = STMT_VINFO_VECTYPE (vinfo);
2251 if (! vect_load_lanes_supported (vectype, size)
2252 && ! vect_grouped_load_supported (vectype, single_element_p,
2253 size))
2254 return false;
2258 if (dump_enabled_p ())
2259 dump_printf_loc (MSG_NOTE, vect_location,
2260 "re-trying with SLP disabled\n");
2262 /* Roll back state appropriately. No SLP this time. */
2263 slp = false;
2264 /* Restore vectorization factor as it were without SLP. */
2265 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = saved_vectorization_factor;
2266 /* Free the SLP instances. */
2267 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), j, instance)
2268 vect_free_slp_instance (instance);
2269 LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
2270 /* Reset SLP type to loop_vect on all stmts. */
2271 for (i = 0; i < LOOP_VINFO_LOOP (loop_vinfo)->num_nodes; ++i)
2273 basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
2274 for (gimple_stmt_iterator si = gsi_start_bb (bb);
2275 !gsi_end_p (si); gsi_next (&si))
2277 stmt_vec_info stmt_info = vinfo_for_stmt (gsi_stmt (si));
2278 STMT_SLP_TYPE (stmt_info) = loop_vect;
2279 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
2281 stmt_info = vinfo_for_stmt (STMT_VINFO_RELATED_STMT (stmt_info));
2282 STMT_SLP_TYPE (stmt_info) = loop_vect;
2283 for (gimple_stmt_iterator pi
2284 = gsi_start (STMT_VINFO_PATTERN_DEF_SEQ (stmt_info));
2285 !gsi_end_p (pi); gsi_next (&pi))
2287 gimple *pstmt = gsi_stmt (pi);
2288 STMT_SLP_TYPE (vinfo_for_stmt (pstmt)) = loop_vect;
2293 /* Free optimized alias test DDRS. */
2294 LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).release ();
2295 /* Reset target cost data. */
2296 destroy_cost_data (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo));
2297 LOOP_VINFO_TARGET_COST_DATA (loop_vinfo)
2298 = init_cost (LOOP_VINFO_LOOP (loop_vinfo));
2299 /* Reset assorted flags. */
2300 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
2301 LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = false;
2302 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = 0;
2304 goto start_over;
2307 /* Function vect_analyze_loop.
2309 Apply a set of analyses on LOOP, and create a loop_vec_info struct
2310 for it. The different analyses will record information in the
2311 loop_vec_info struct. */
2312 loop_vec_info
2313 vect_analyze_loop (struct loop *loop)
2315 loop_vec_info loop_vinfo;
2316 unsigned int vector_sizes;
2318 /* Autodetect first vector size we try. */
2319 current_vector_size = 0;
2320 vector_sizes = targetm.vectorize.autovectorize_vector_sizes ();
2322 if (dump_enabled_p ())
2323 dump_printf_loc (MSG_NOTE, vect_location,
2324 "===== analyze_loop_nest =====\n");
2326 if (loop_outer (loop)
2327 && loop_vec_info_for_loop (loop_outer (loop))
2328 && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop))))
2330 if (dump_enabled_p ())
2331 dump_printf_loc (MSG_NOTE, vect_location,
2332 "outer-loop already vectorized.\n");
2333 return NULL;
2336 while (1)
2338 /* Check the CFG characteristics of the loop (nesting, entry/exit). */
2339 loop_vinfo = vect_analyze_loop_form (loop);
2340 if (!loop_vinfo)
2342 if (dump_enabled_p ())
2343 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2344 "bad loop form.\n");
2345 return NULL;
2348 bool fatal = false;
2349 if (vect_analyze_loop_2 (loop_vinfo, fatal))
2351 LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1;
2353 return loop_vinfo;
2356 destroy_loop_vec_info (loop_vinfo, true);
2358 vector_sizes &= ~current_vector_size;
2359 if (fatal
2360 || vector_sizes == 0
2361 || current_vector_size == 0)
2362 return NULL;
2364 /* Try the next biggest vector size. */
2365 current_vector_size = 1 << floor_log2 (vector_sizes);
2366 if (dump_enabled_p ())
2367 dump_printf_loc (MSG_NOTE, vect_location,
2368 "***** Re-trying analysis with "
2369 "vector size %d\n", current_vector_size);
2374 /* Function reduction_code_for_scalar_code
2376 Input:
2377 CODE - tree_code of a reduction operations.
2379 Output:
2380 REDUC_CODE - the corresponding tree-code to be used to reduce the
2381 vector of partial results into a single scalar result, or ERROR_MARK
2382 if the operation is a supported reduction operation, but does not have
2383 such a tree-code.
2385 Return FALSE if CODE currently cannot be vectorized as reduction. */
2387 static bool
2388 reduction_code_for_scalar_code (enum tree_code code,
2389 enum tree_code *reduc_code)
2391 switch (code)
2393 case MAX_EXPR:
2394 *reduc_code = REDUC_MAX_EXPR;
2395 return true;
2397 case MIN_EXPR:
2398 *reduc_code = REDUC_MIN_EXPR;
2399 return true;
2401 case PLUS_EXPR:
2402 *reduc_code = REDUC_PLUS_EXPR;
2403 return true;
2405 case MULT_EXPR:
2406 case MINUS_EXPR:
2407 case BIT_IOR_EXPR:
2408 case BIT_XOR_EXPR:
2409 case BIT_AND_EXPR:
2410 *reduc_code = ERROR_MARK;
2411 return true;
2413 default:
2414 return false;
2419 /* Error reporting helper for vect_is_simple_reduction below. GIMPLE statement
2420 STMT is printed with a message MSG. */
2422 static void
2423 report_vect_op (int msg_type, gimple *stmt, const char *msg)
2425 dump_printf_loc (msg_type, vect_location, "%s", msg);
2426 dump_gimple_stmt (msg_type, TDF_SLIM, stmt, 0);
2430 /* Detect SLP reduction of the form:
2432 #a1 = phi <a5, a0>
2433 a2 = operation (a1)
2434 a3 = operation (a2)
2435 a4 = operation (a3)
2436 a5 = operation (a4)
2438 #a = phi <a5>
2440 PHI is the reduction phi node (#a1 = phi <a5, a0> above)
2441 FIRST_STMT is the first reduction stmt in the chain
2442 (a2 = operation (a1)).
2444 Return TRUE if a reduction chain was detected. */
2446 static bool
2447 vect_is_slp_reduction (loop_vec_info loop_info, gimple *phi,
2448 gimple *first_stmt)
2450 struct loop *loop = (gimple_bb (phi))->loop_father;
2451 struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
2452 enum tree_code code;
2453 gimple *current_stmt = NULL, *loop_use_stmt = NULL, *first, *next_stmt;
2454 stmt_vec_info use_stmt_info, current_stmt_info;
2455 tree lhs;
2456 imm_use_iterator imm_iter;
2457 use_operand_p use_p;
2458 int nloop_uses, size = 0, n_out_of_loop_uses;
2459 bool found = false;
2461 if (loop != vect_loop)
2462 return false;
2464 lhs = PHI_RESULT (phi);
2465 code = gimple_assign_rhs_code (first_stmt);
2466 while (1)
2468 nloop_uses = 0;
2469 n_out_of_loop_uses = 0;
2470 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
2472 gimple *use_stmt = USE_STMT (use_p);
2473 if (is_gimple_debug (use_stmt))
2474 continue;
2476 /* Check if we got back to the reduction phi. */
2477 if (use_stmt == phi)
2479 loop_use_stmt = use_stmt;
2480 found = true;
2481 break;
2484 if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2486 loop_use_stmt = use_stmt;
2487 nloop_uses++;
2489 else
2490 n_out_of_loop_uses++;
2492 /* There are can be either a single use in the loop or two uses in
2493 phi nodes. */
2494 if (nloop_uses > 1 || (n_out_of_loop_uses && nloop_uses))
2495 return false;
2498 if (found)
2499 break;
2501 /* We reached a statement with no loop uses. */
2502 if (nloop_uses == 0)
2503 return false;
2505 /* This is a loop exit phi, and we haven't reached the reduction phi. */
2506 if (gimple_code (loop_use_stmt) == GIMPLE_PHI)
2507 return false;
2509 if (!is_gimple_assign (loop_use_stmt)
2510 || code != gimple_assign_rhs_code (loop_use_stmt)
2511 || !flow_bb_inside_loop_p (loop, gimple_bb (loop_use_stmt)))
2512 return false;
2514 /* Insert USE_STMT into reduction chain. */
2515 use_stmt_info = vinfo_for_stmt (loop_use_stmt);
2516 if (current_stmt)
2518 current_stmt_info = vinfo_for_stmt (current_stmt);
2519 GROUP_NEXT_ELEMENT (current_stmt_info) = loop_use_stmt;
2520 GROUP_FIRST_ELEMENT (use_stmt_info)
2521 = GROUP_FIRST_ELEMENT (current_stmt_info);
2523 else
2524 GROUP_FIRST_ELEMENT (use_stmt_info) = loop_use_stmt;
2526 lhs = gimple_assign_lhs (loop_use_stmt);
2527 current_stmt = loop_use_stmt;
2528 size++;
2531 if (!found || loop_use_stmt != phi || size < 2)
2532 return false;
2534 /* Swap the operands, if needed, to make the reduction operand be the second
2535 operand. */
2536 lhs = PHI_RESULT (phi);
2537 next_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (current_stmt));
2538 while (next_stmt)
2540 if (gimple_assign_rhs2 (next_stmt) == lhs)
2542 tree op = gimple_assign_rhs1 (next_stmt);
2543 gimple *def_stmt = NULL;
2545 if (TREE_CODE (op) == SSA_NAME)
2546 def_stmt = SSA_NAME_DEF_STMT (op);
2548 /* Check that the other def is either defined in the loop
2549 ("vect_internal_def"), or it's an induction (defined by a
2550 loop-header phi-node). */
2551 if (def_stmt
2552 && gimple_bb (def_stmt)
2553 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2554 && (is_gimple_assign (def_stmt)
2555 || is_gimple_call (def_stmt)
2556 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2557 == vect_induction_def
2558 || (gimple_code (def_stmt) == GIMPLE_PHI
2559 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2560 == vect_internal_def
2561 && !is_loop_header_bb_p (gimple_bb (def_stmt)))))
2563 lhs = gimple_assign_lhs (next_stmt);
2564 next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
2565 continue;
2568 return false;
2570 else
2572 tree op = gimple_assign_rhs2 (next_stmt);
2573 gimple *def_stmt = NULL;
2575 if (TREE_CODE (op) == SSA_NAME)
2576 def_stmt = SSA_NAME_DEF_STMT (op);
2578 /* Check that the other def is either defined in the loop
2579 ("vect_internal_def"), or it's an induction (defined by a
2580 loop-header phi-node). */
2581 if (def_stmt
2582 && gimple_bb (def_stmt)
2583 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2584 && (is_gimple_assign (def_stmt)
2585 || is_gimple_call (def_stmt)
2586 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2587 == vect_induction_def
2588 || (gimple_code (def_stmt) == GIMPLE_PHI
2589 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2590 == vect_internal_def
2591 && !is_loop_header_bb_p (gimple_bb (def_stmt)))))
2593 if (dump_enabled_p ())
2595 dump_printf_loc (MSG_NOTE, vect_location, "swapping oprnds: ");
2596 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, next_stmt, 0);
2599 swap_ssa_operands (next_stmt,
2600 gimple_assign_rhs1_ptr (next_stmt),
2601 gimple_assign_rhs2_ptr (next_stmt));
2602 update_stmt (next_stmt);
2604 if (CONSTANT_CLASS_P (gimple_assign_rhs1 (next_stmt)))
2605 LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true;
2607 else
2608 return false;
2611 lhs = gimple_assign_lhs (next_stmt);
2612 next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
2615 /* Save the chain for further analysis in SLP detection. */
2616 first = GROUP_FIRST_ELEMENT (vinfo_for_stmt (current_stmt));
2617 LOOP_VINFO_REDUCTION_CHAINS (loop_info).safe_push (first);
2618 GROUP_SIZE (vinfo_for_stmt (first)) = size;
2620 return true;
2624 /* Function vect_is_simple_reduction_1
2626 (1) Detect a cross-iteration def-use cycle that represents a simple
2627 reduction computation. We look for the following pattern:
2629 loop_header:
2630 a1 = phi < a0, a2 >
2631 a3 = ...
2632 a2 = operation (a3, a1)
2636 a3 = ...
2637 loop_header:
2638 a1 = phi < a0, a2 >
2639 a2 = operation (a3, a1)
2641 such that:
2642 1. operation is commutative and associative and it is safe to
2643 change the order of the computation (if CHECK_REDUCTION is true)
2644 2. no uses for a2 in the loop (a2 is used out of the loop)
2645 3. no uses of a1 in the loop besides the reduction operation
2646 4. no uses of a1 outside the loop.
2648 Conditions 1,4 are tested here.
2649 Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.
2651 (2) Detect a cross-iteration def-use cycle in nested loops, i.e.,
2652 nested cycles, if CHECK_REDUCTION is false.
2654 (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double
2655 reductions:
2657 a1 = phi < a0, a2 >
2658 inner loop (def of a3)
2659 a2 = phi < a3 >
2661 (4) Detect condition expressions, ie:
2662 for (int i = 0; i < N; i++)
2663 if (a[i] < val)
2664 ret_val = a[i];
2668 static gimple *
2669 vect_is_simple_reduction (loop_vec_info loop_info, gimple *phi,
2670 bool check_reduction, bool *double_reduc,
2671 bool need_wrapping_integral_overflow,
2672 enum vect_reduction_type *v_reduc_type)
2674 struct loop *loop = (gimple_bb (phi))->loop_father;
2675 struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
2676 edge latch_e = loop_latch_edge (loop);
2677 tree loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
2678 gimple *def_stmt, *def1 = NULL, *def2 = NULL, *phi_use_stmt = NULL;
2679 enum tree_code orig_code, code;
2680 tree op1, op2, op3 = NULL_TREE, op4 = NULL_TREE;
2681 tree type;
2682 int nloop_uses;
2683 tree name;
2684 imm_use_iterator imm_iter;
2685 use_operand_p use_p;
2686 bool phi_def;
2688 *double_reduc = false;
2689 *v_reduc_type = TREE_CODE_REDUCTION;
2691 /* If CHECK_REDUCTION is true, we assume inner-most loop vectorization,
2692 otherwise, we assume outer loop vectorization. */
2693 gcc_assert ((check_reduction && loop == vect_loop)
2694 || (!check_reduction && flow_loop_nested_p (vect_loop, loop)));
2696 name = PHI_RESULT (phi);
2697 /* ??? If there are no uses of the PHI result the inner loop reduction
2698 won't be detected as possibly double-reduction by vectorizable_reduction
2699 because that tries to walk the PHI arg from the preheader edge which
2700 can be constant. See PR60382. */
2701 if (has_zero_uses (name))
2702 return NULL;
2703 nloop_uses = 0;
2704 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name)
2706 gimple *use_stmt = USE_STMT (use_p);
2707 if (is_gimple_debug (use_stmt))
2708 continue;
2710 if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2712 if (dump_enabled_p ())
2713 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2714 "intermediate value used outside loop.\n");
2716 return NULL;
2719 nloop_uses++;
2720 if (nloop_uses > 1)
2722 if (dump_enabled_p ())
2723 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2724 "reduction used in loop.\n");
2725 return NULL;
2728 phi_use_stmt = use_stmt;
2731 if (TREE_CODE (loop_arg) != SSA_NAME)
2733 if (dump_enabled_p ())
2735 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2736 "reduction: not ssa_name: ");
2737 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, loop_arg);
2738 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
2740 return NULL;
2743 def_stmt = SSA_NAME_DEF_STMT (loop_arg);
2744 if (!def_stmt)
2746 if (dump_enabled_p ())
2747 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2748 "reduction: no def_stmt.\n");
2749 return NULL;
2752 if (!is_gimple_assign (def_stmt) && gimple_code (def_stmt) != GIMPLE_PHI)
2754 if (dump_enabled_p ())
2755 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, def_stmt, 0);
2756 return NULL;
2759 if (is_gimple_assign (def_stmt))
2761 name = gimple_assign_lhs (def_stmt);
2762 phi_def = false;
2764 else
2766 name = PHI_RESULT (def_stmt);
2767 phi_def = true;
2770 nloop_uses = 0;
2771 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name)
2773 gimple *use_stmt = USE_STMT (use_p);
2774 if (is_gimple_debug (use_stmt))
2775 continue;
2776 if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2777 nloop_uses++;
2778 if (nloop_uses > 1)
2780 if (dump_enabled_p ())
2781 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2782 "reduction used in loop.\n");
2783 return NULL;
2787 /* If DEF_STMT is a phi node itself, we expect it to have a single argument
2788 defined in the inner loop. */
2789 if (phi_def)
2791 op1 = PHI_ARG_DEF (def_stmt, 0);
2793 if (gimple_phi_num_args (def_stmt) != 1
2794 || TREE_CODE (op1) != SSA_NAME)
2796 if (dump_enabled_p ())
2797 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2798 "unsupported phi node definition.\n");
2800 return NULL;
2803 def1 = SSA_NAME_DEF_STMT (op1);
2804 if (gimple_bb (def1)
2805 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2806 && loop->inner
2807 && flow_bb_inside_loop_p (loop->inner, gimple_bb (def1))
2808 && is_gimple_assign (def1)
2809 && flow_bb_inside_loop_p (loop->inner, gimple_bb (phi_use_stmt)))
2811 if (dump_enabled_p ())
2812 report_vect_op (MSG_NOTE, def_stmt,
2813 "detected double reduction: ");
2815 *double_reduc = true;
2816 return def_stmt;
2819 return NULL;
2822 code = orig_code = gimple_assign_rhs_code (def_stmt);
2824 /* We can handle "res -= x[i]", which is non-associative by
2825 simply rewriting this into "res += -x[i]". Avoid changing
2826 gimple instruction for the first simple tests and only do this
2827 if we're allowed to change code at all. */
2828 if (code == MINUS_EXPR
2829 && (op1 = gimple_assign_rhs1 (def_stmt))
2830 && TREE_CODE (op1) == SSA_NAME
2831 && SSA_NAME_DEF_STMT (op1) == phi)
2832 code = PLUS_EXPR;
2834 if (code == COND_EXPR)
2836 if (check_reduction)
2837 *v_reduc_type = COND_REDUCTION;
2839 else if (!commutative_tree_code (code) || !associative_tree_code (code))
2841 if (dump_enabled_p ())
2842 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2843 "reduction: not commutative/associative: ");
2844 return NULL;
2847 if (get_gimple_rhs_class (code) != GIMPLE_BINARY_RHS)
2849 if (code != COND_EXPR)
2851 if (dump_enabled_p ())
2852 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2853 "reduction: not binary operation: ");
2855 return NULL;
2858 op3 = gimple_assign_rhs1 (def_stmt);
2859 if (COMPARISON_CLASS_P (op3))
2861 op4 = TREE_OPERAND (op3, 1);
2862 op3 = TREE_OPERAND (op3, 0);
2865 op1 = gimple_assign_rhs2 (def_stmt);
2866 op2 = gimple_assign_rhs3 (def_stmt);
2868 if (TREE_CODE (op1) != SSA_NAME && TREE_CODE (op2) != SSA_NAME)
2870 if (dump_enabled_p ())
2871 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2872 "reduction: uses not ssa_names: ");
2874 return NULL;
2877 else
2879 op1 = gimple_assign_rhs1 (def_stmt);
2880 op2 = gimple_assign_rhs2 (def_stmt);
2882 if (TREE_CODE (op1) != SSA_NAME && TREE_CODE (op2) != SSA_NAME)
2884 if (dump_enabled_p ())
2885 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2886 "reduction: uses not ssa_names: ");
2888 return NULL;
2892 type = TREE_TYPE (gimple_assign_lhs (def_stmt));
2893 if ((TREE_CODE (op1) == SSA_NAME
2894 && !types_compatible_p (type,TREE_TYPE (op1)))
2895 || (TREE_CODE (op2) == SSA_NAME
2896 && !types_compatible_p (type, TREE_TYPE (op2)))
2897 || (op3 && TREE_CODE (op3) == SSA_NAME
2898 && !types_compatible_p (type, TREE_TYPE (op3)))
2899 || (op4 && TREE_CODE (op4) == SSA_NAME
2900 && !types_compatible_p (type, TREE_TYPE (op4))))
2902 if (dump_enabled_p ())
2904 dump_printf_loc (MSG_NOTE, vect_location,
2905 "reduction: multiple types: operation type: ");
2906 dump_generic_expr (MSG_NOTE, TDF_SLIM, type);
2907 dump_printf (MSG_NOTE, ", operands types: ");
2908 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2909 TREE_TYPE (op1));
2910 dump_printf (MSG_NOTE, ",");
2911 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2912 TREE_TYPE (op2));
2913 if (op3)
2915 dump_printf (MSG_NOTE, ",");
2916 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2917 TREE_TYPE (op3));
2920 if (op4)
2922 dump_printf (MSG_NOTE, ",");
2923 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2924 TREE_TYPE (op4));
2926 dump_printf (MSG_NOTE, "\n");
2929 return NULL;
2932 /* Check that it's ok to change the order of the computation.
2933 Generally, when vectorizing a reduction we change the order of the
2934 computation. This may change the behavior of the program in some
2935 cases, so we need to check that this is ok. One exception is when
2936 vectorizing an outer-loop: the inner-loop is executed sequentially,
2937 and therefore vectorizing reductions in the inner-loop during
2938 outer-loop vectorization is safe. */
2940 if (*v_reduc_type != COND_REDUCTION
2941 && check_reduction)
2943 /* CHECKME: check for !flag_finite_math_only too? */
2944 if (SCALAR_FLOAT_TYPE_P (type) && !flag_associative_math)
2946 /* Changing the order of operations changes the semantics. */
2947 if (dump_enabled_p ())
2948 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2949 "reduction: unsafe fp math optimization: ");
2950 return NULL;
2952 else if (INTEGRAL_TYPE_P (type))
2954 if (!operation_no_trapping_overflow (type, code))
2956 /* Changing the order of operations changes the semantics. */
2957 if (dump_enabled_p ())
2958 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2959 "reduction: unsafe int math optimization"
2960 " (overflow traps): ");
2961 return NULL;
2963 if (need_wrapping_integral_overflow
2964 && !TYPE_OVERFLOW_WRAPS (type)
2965 && operation_can_overflow (code))
2967 /* Changing the order of operations changes the semantics. */
2968 if (dump_enabled_p ())
2969 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2970 "reduction: unsafe int math optimization"
2971 " (overflow doesn't wrap): ");
2972 return NULL;
2975 else if (SAT_FIXED_POINT_TYPE_P (type))
2977 /* Changing the order of operations changes the semantics. */
2978 if (dump_enabled_p ())
2979 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2980 "reduction: unsafe fixed-point math optimization: ");
2981 return NULL;
2985 /* Reduction is safe. We're dealing with one of the following:
2986 1) integer arithmetic and no trapv
2987 2) floating point arithmetic, and special flags permit this optimization
2988 3) nested cycle (i.e., outer loop vectorization). */
2989 if (TREE_CODE (op1) == SSA_NAME)
2990 def1 = SSA_NAME_DEF_STMT (op1);
2992 if (TREE_CODE (op2) == SSA_NAME)
2993 def2 = SSA_NAME_DEF_STMT (op2);
2995 if (code != COND_EXPR
2996 && ((!def1 || gimple_nop_p (def1)) && (!def2 || gimple_nop_p (def2))))
2998 if (dump_enabled_p ())
2999 report_vect_op (MSG_NOTE, def_stmt, "reduction: no defs for operands: ");
3000 return NULL;
3003 /* Check that one def is the reduction def, defined by PHI,
3004 the other def is either defined in the loop ("vect_internal_def"),
3005 or it's an induction (defined by a loop-header phi-node). */
3007 if (def2 && def2 == phi
3008 && (code == COND_EXPR
3009 || !def1 || gimple_nop_p (def1)
3010 || !flow_bb_inside_loop_p (loop, gimple_bb (def1))
3011 || (def1 && flow_bb_inside_loop_p (loop, gimple_bb (def1))
3012 && (is_gimple_assign (def1)
3013 || is_gimple_call (def1)
3014 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1))
3015 == vect_induction_def
3016 || (gimple_code (def1) == GIMPLE_PHI
3017 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1))
3018 == vect_internal_def
3019 && !is_loop_header_bb_p (gimple_bb (def1)))))))
3021 if (dump_enabled_p ())
3022 report_vect_op (MSG_NOTE, def_stmt, "detected reduction: ");
3023 return def_stmt;
3026 if (def1 && def1 == phi
3027 && (code == COND_EXPR
3028 || !def2 || gimple_nop_p (def2)
3029 || !flow_bb_inside_loop_p (loop, gimple_bb (def2))
3030 || (def2 && flow_bb_inside_loop_p (loop, gimple_bb (def2))
3031 && (is_gimple_assign (def2)
3032 || is_gimple_call (def2)
3033 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2))
3034 == vect_induction_def
3035 || (gimple_code (def2) == GIMPLE_PHI
3036 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2))
3037 == vect_internal_def
3038 && !is_loop_header_bb_p (gimple_bb (def2)))))))
3040 if (check_reduction && orig_code != MINUS_EXPR)
3042 /* Check if we can swap operands (just for simplicity - so that
3043 the rest of the code can assume that the reduction variable
3044 is always the last (second) argument). */
3045 if (code == COND_EXPR)
3047 /* Swap cond_expr by inverting the condition. */
3048 tree cond_expr = gimple_assign_rhs1 (def_stmt);
3049 enum tree_code invert_code = ERROR_MARK;
3050 enum tree_code cond_code = TREE_CODE (cond_expr);
3052 if (TREE_CODE_CLASS (cond_code) == tcc_comparison)
3054 bool honor_nans = HONOR_NANS (TREE_OPERAND (cond_expr, 0));
3055 invert_code = invert_tree_comparison (cond_code, honor_nans);
3057 if (invert_code != ERROR_MARK)
3059 TREE_SET_CODE (cond_expr, invert_code);
3060 swap_ssa_operands (def_stmt,
3061 gimple_assign_rhs2_ptr (def_stmt),
3062 gimple_assign_rhs3_ptr (def_stmt));
3064 else
3066 if (dump_enabled_p ())
3067 report_vect_op (MSG_NOTE, def_stmt,
3068 "detected reduction: cannot swap operands "
3069 "for cond_expr");
3070 return NULL;
3073 else
3074 swap_ssa_operands (def_stmt, gimple_assign_rhs1_ptr (def_stmt),
3075 gimple_assign_rhs2_ptr (def_stmt));
3077 if (dump_enabled_p ())
3078 report_vect_op (MSG_NOTE, def_stmt,
3079 "detected reduction: need to swap operands: ");
3081 if (CONSTANT_CLASS_P (gimple_assign_rhs1 (def_stmt)))
3082 LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true;
3084 else
3086 if (dump_enabled_p ())
3087 report_vect_op (MSG_NOTE, def_stmt, "detected reduction: ");
3090 return def_stmt;
3093 /* Try to find SLP reduction chain. */
3094 if (check_reduction && code != COND_EXPR
3095 && vect_is_slp_reduction (loop_info, phi, def_stmt))
3097 if (dump_enabled_p ())
3098 report_vect_op (MSG_NOTE, def_stmt,
3099 "reduction: detected reduction chain: ");
3101 return def_stmt;
3104 if (dump_enabled_p ())
3105 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3106 "reduction: unknown pattern: ");
3108 return NULL;
3111 /* Wrapper around vect_is_simple_reduction_1, which will modify code
3112 in-place if it enables detection of more reductions. Arguments
3113 as there. */
3115 gimple *
3116 vect_force_simple_reduction (loop_vec_info loop_info, gimple *phi,
3117 bool check_reduction, bool *double_reduc,
3118 bool need_wrapping_integral_overflow)
3120 enum vect_reduction_type v_reduc_type;
3121 return vect_is_simple_reduction (loop_info, phi, check_reduction,
3122 double_reduc,
3123 need_wrapping_integral_overflow,
3124 &v_reduc_type);
3127 /* Calculate cost of peeling the loop PEEL_ITERS_PROLOGUE times. */
3129 vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue,
3130 int *peel_iters_epilogue,
3131 stmt_vector_for_cost *scalar_cost_vec,
3132 stmt_vector_for_cost *prologue_cost_vec,
3133 stmt_vector_for_cost *epilogue_cost_vec)
3135 int retval = 0;
3136 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3138 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
3140 *peel_iters_epilogue = vf/2;
3141 if (dump_enabled_p ())
3142 dump_printf_loc (MSG_NOTE, vect_location,
3143 "cost model: epilogue peel iters set to vf/2 "
3144 "because loop iterations are unknown .\n");
3146 /* If peeled iterations are known but number of scalar loop
3147 iterations are unknown, count a taken branch per peeled loop. */
3148 retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
3149 NULL, 0, vect_prologue);
3150 retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
3151 NULL, 0, vect_epilogue);
3153 else
3155 int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
3156 peel_iters_prologue = niters < peel_iters_prologue ?
3157 niters : peel_iters_prologue;
3158 *peel_iters_epilogue = (niters - peel_iters_prologue) % vf;
3159 /* If we need to peel for gaps, but no peeling is required, we have to
3160 peel VF iterations. */
3161 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) && !*peel_iters_epilogue)
3162 *peel_iters_epilogue = vf;
3165 stmt_info_for_cost *si;
3166 int j;
3167 if (peel_iters_prologue)
3168 FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
3169 retval += record_stmt_cost (prologue_cost_vec,
3170 si->count * peel_iters_prologue,
3171 si->kind, NULL, si->misalign,
3172 vect_prologue);
3173 if (*peel_iters_epilogue)
3174 FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
3175 retval += record_stmt_cost (epilogue_cost_vec,
3176 si->count * *peel_iters_epilogue,
3177 si->kind, NULL, si->misalign,
3178 vect_epilogue);
3180 return retval;
3183 /* Function vect_estimate_min_profitable_iters
3185 Return the number of iterations required for the vector version of the
3186 loop to be profitable relative to the cost of the scalar version of the
3187 loop.
3189 *RET_MIN_PROFITABLE_NITERS is a cost model profitability threshold
3190 of iterations for vectorization. -1 value means loop vectorization
3191 is not profitable. This returned value may be used for dynamic
3192 profitability check.
3194 *RET_MIN_PROFITABLE_ESTIMATE is a profitability threshold to be used
3195 for static check against estimated number of iterations. */
3197 static void
3198 vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
3199 int *ret_min_profitable_niters,
3200 int *ret_min_profitable_estimate)
3202 int min_profitable_iters;
3203 int min_profitable_estimate;
3204 int peel_iters_prologue;
3205 int peel_iters_epilogue;
3206 unsigned vec_inside_cost = 0;
3207 int vec_outside_cost = 0;
3208 unsigned vec_prologue_cost = 0;
3209 unsigned vec_epilogue_cost = 0;
3210 int scalar_single_iter_cost = 0;
3211 int scalar_outside_cost = 0;
3212 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3213 int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
3214 void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3216 /* Cost model disabled. */
3217 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
3219 dump_printf_loc (MSG_NOTE, vect_location, "cost model disabled.\n");
3220 *ret_min_profitable_niters = 0;
3221 *ret_min_profitable_estimate = 0;
3222 return;
3225 /* Requires loop versioning tests to handle misalignment. */
3226 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
3228 /* FIXME: Make cost depend on complexity of individual check. */
3229 unsigned len = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ();
3230 (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
3231 vect_prologue);
3232 dump_printf (MSG_NOTE,
3233 "cost model: Adding cost of checks for loop "
3234 "versioning to treat misalignment.\n");
3237 /* Requires loop versioning with alias checks. */
3238 if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
3240 /* FIXME: Make cost depend on complexity of individual check. */
3241 unsigned len = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).length ();
3242 (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
3243 vect_prologue);
3244 dump_printf (MSG_NOTE,
3245 "cost model: Adding cost of checks for loop "
3246 "versioning aliasing.\n");
3249 /* Requires loop versioning with niter checks. */
3250 if (LOOP_REQUIRES_VERSIONING_FOR_NITERS (loop_vinfo))
3252 /* FIXME: Make cost depend on complexity of individual check. */
3253 (void) add_stmt_cost (target_cost_data, 1, vector_stmt, NULL, 0,
3254 vect_prologue);
3255 dump_printf (MSG_NOTE,
3256 "cost model: Adding cost of checks for loop "
3257 "versioning niters.\n");
3260 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
3261 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken, NULL, 0,
3262 vect_prologue);
3264 /* Count statements in scalar loop. Using this as scalar cost for a single
3265 iteration for now.
3267 TODO: Add outer loop support.
3269 TODO: Consider assigning different costs to different scalar
3270 statements. */
3272 scalar_single_iter_cost
3273 = LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo);
3275 /* Add additional cost for the peeled instructions in prologue and epilogue
3276 loop.
3278 FORNOW: If we don't know the value of peel_iters for prologue or epilogue
3279 at compile-time - we assume it's vf/2 (the worst would be vf-1).
3281 TODO: Build an expression that represents peel_iters for prologue and
3282 epilogue to be used in a run-time test. */
3284 if (npeel < 0)
3286 peel_iters_prologue = vf/2;
3287 dump_printf (MSG_NOTE, "cost model: "
3288 "prologue peel iters set to vf/2.\n");
3290 /* If peeling for alignment is unknown, loop bound of main loop becomes
3291 unknown. */
3292 peel_iters_epilogue = vf/2;
3293 dump_printf (MSG_NOTE, "cost model: "
3294 "epilogue peel iters set to vf/2 because "
3295 "peeling for alignment is unknown.\n");
3297 /* If peeled iterations are unknown, count a taken branch and a not taken
3298 branch per peeled loop. Even if scalar loop iterations are known,
3299 vector iterations are not known since peeled prologue iterations are
3300 not known. Hence guards remain the same. */
3301 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
3302 NULL, 0, vect_prologue);
3303 (void) add_stmt_cost (target_cost_data, 1, cond_branch_not_taken,
3304 NULL, 0, vect_prologue);
3305 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
3306 NULL, 0, vect_epilogue);
3307 (void) add_stmt_cost (target_cost_data, 1, cond_branch_not_taken,
3308 NULL, 0, vect_epilogue);
3309 stmt_info_for_cost *si;
3310 int j;
3311 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
3313 struct _stmt_vec_info *stmt_info
3314 = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
3315 (void) add_stmt_cost (target_cost_data,
3316 si->count * peel_iters_prologue,
3317 si->kind, stmt_info, si->misalign,
3318 vect_prologue);
3319 (void) add_stmt_cost (target_cost_data,
3320 si->count * peel_iters_epilogue,
3321 si->kind, stmt_info, si->misalign,
3322 vect_epilogue);
3325 else
3327 stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
3328 stmt_info_for_cost *si;
3329 int j;
3330 void *data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3332 prologue_cost_vec.create (2);
3333 epilogue_cost_vec.create (2);
3334 peel_iters_prologue = npeel;
3336 (void) vect_get_known_peeling_cost (loop_vinfo, peel_iters_prologue,
3337 &peel_iters_epilogue,
3338 &LOOP_VINFO_SCALAR_ITERATION_COST
3339 (loop_vinfo),
3340 &prologue_cost_vec,
3341 &epilogue_cost_vec);
3343 FOR_EACH_VEC_ELT (prologue_cost_vec, j, si)
3345 struct _stmt_vec_info *stmt_info
3346 = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
3347 (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
3348 si->misalign, vect_prologue);
3351 FOR_EACH_VEC_ELT (epilogue_cost_vec, j, si)
3353 struct _stmt_vec_info *stmt_info
3354 = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
3355 (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
3356 si->misalign, vect_epilogue);
3359 prologue_cost_vec.release ();
3360 epilogue_cost_vec.release ();
3363 /* FORNOW: The scalar outside cost is incremented in one of the
3364 following ways:
3366 1. The vectorizer checks for alignment and aliasing and generates
3367 a condition that allows dynamic vectorization. A cost model
3368 check is ANDED with the versioning condition. Hence scalar code
3369 path now has the added cost of the versioning check.
3371 if (cost > th & versioning_check)
3372 jmp to vector code
3374 Hence run-time scalar is incremented by not-taken branch cost.
3376 2. The vectorizer then checks if a prologue is required. If the
3377 cost model check was not done before during versioning, it has to
3378 be done before the prologue check.
3380 if (cost <= th)
3381 prologue = scalar_iters
3382 if (prologue == 0)
3383 jmp to vector code
3384 else
3385 execute prologue
3386 if (prologue == num_iters)
3387 go to exit
3389 Hence the run-time scalar cost is incremented by a taken branch,
3390 plus a not-taken branch, plus a taken branch cost.
3392 3. The vectorizer then checks if an epilogue is required. If the
3393 cost model check was not done before during prologue check, it
3394 has to be done with the epilogue check.
3396 if (prologue == 0)
3397 jmp to vector code
3398 else
3399 execute prologue
3400 if (prologue == num_iters)
3401 go to exit
3402 vector code:
3403 if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
3404 jmp to epilogue
3406 Hence the run-time scalar cost should be incremented by 2 taken
3407 branches.
3409 TODO: The back end may reorder the BBS's differently and reverse
3410 conditions/branch directions. Change the estimates below to
3411 something more reasonable. */
3413 /* If the number of iterations is known and we do not do versioning, we can
3414 decide whether to vectorize at compile time. Hence the scalar version
3415 do not carry cost model guard costs. */
3416 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
3417 || LOOP_REQUIRES_VERSIONING (loop_vinfo))
3419 /* Cost model check occurs at versioning. */
3420 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
3421 scalar_outside_cost += vect_get_stmt_cost (cond_branch_not_taken);
3422 else
3424 /* Cost model check occurs at prologue generation. */
3425 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
3426 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken)
3427 + vect_get_stmt_cost (cond_branch_not_taken);
3428 /* Cost model check occurs at epilogue generation. */
3429 else
3430 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken);
3434 /* Complete the target-specific cost calculations. */
3435 finish_cost (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo), &vec_prologue_cost,
3436 &vec_inside_cost, &vec_epilogue_cost);
3438 vec_outside_cost = (int)(vec_prologue_cost + vec_epilogue_cost);
3440 if (dump_enabled_p ())
3442 dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
3443 dump_printf (MSG_NOTE, " Vector inside of loop cost: %d\n",
3444 vec_inside_cost);
3445 dump_printf (MSG_NOTE, " Vector prologue cost: %d\n",
3446 vec_prologue_cost);
3447 dump_printf (MSG_NOTE, " Vector epilogue cost: %d\n",
3448 vec_epilogue_cost);
3449 dump_printf (MSG_NOTE, " Scalar iteration cost: %d\n",
3450 scalar_single_iter_cost);
3451 dump_printf (MSG_NOTE, " Scalar outside cost: %d\n",
3452 scalar_outside_cost);
3453 dump_printf (MSG_NOTE, " Vector outside cost: %d\n",
3454 vec_outside_cost);
3455 dump_printf (MSG_NOTE, " prologue iterations: %d\n",
3456 peel_iters_prologue);
3457 dump_printf (MSG_NOTE, " epilogue iterations: %d\n",
3458 peel_iters_epilogue);
3461 /* Calculate number of iterations required to make the vector version
3462 profitable, relative to the loop bodies only. The following condition
3463 must hold true:
3464 SIC * niters + SOC > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC
3465 where
3466 SIC = scalar iteration cost, VIC = vector iteration cost,
3467 VOC = vector outside cost, VF = vectorization factor,
3468 PL_ITERS = prologue iterations, EP_ITERS= epilogue iterations
3469 SOC = scalar outside cost for run time cost model check. */
3471 if ((scalar_single_iter_cost * vf) > (int) vec_inside_cost)
3473 if (vec_outside_cost <= 0)
3474 min_profitable_iters = 1;
3475 else
3477 min_profitable_iters = ((vec_outside_cost - scalar_outside_cost) * vf
3478 - vec_inside_cost * peel_iters_prologue
3479 - vec_inside_cost * peel_iters_epilogue)
3480 / ((scalar_single_iter_cost * vf)
3481 - vec_inside_cost);
3483 if ((scalar_single_iter_cost * vf * min_profitable_iters)
3484 <= (((int) vec_inside_cost * min_profitable_iters)
3485 + (((int) vec_outside_cost - scalar_outside_cost) * vf)))
3486 min_profitable_iters++;
3489 /* vector version will never be profitable. */
3490 else
3492 if (LOOP_VINFO_LOOP (loop_vinfo)->force_vectorize)
3493 warning_at (vect_location, OPT_Wopenmp_simd, "vectorization "
3494 "did not happen for a simd loop");
3496 if (dump_enabled_p ())
3497 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3498 "cost model: the vector iteration cost = %d "
3499 "divided by the scalar iteration cost = %d "
3500 "is greater or equal to the vectorization factor = %d"
3501 ".\n",
3502 vec_inside_cost, scalar_single_iter_cost, vf);
3503 *ret_min_profitable_niters = -1;
3504 *ret_min_profitable_estimate = -1;
3505 return;
3508 dump_printf (MSG_NOTE,
3509 " Calculated minimum iters for profitability: %d\n",
3510 min_profitable_iters);
3512 min_profitable_iters =
3513 min_profitable_iters < vf ? vf : min_profitable_iters;
3515 /* Because the condition we create is:
3516 if (niters <= min_profitable_iters)
3517 then skip the vectorized loop. */
3518 min_profitable_iters--;
3520 if (dump_enabled_p ())
3521 dump_printf_loc (MSG_NOTE, vect_location,
3522 " Runtime profitability threshold = %d\n",
3523 min_profitable_iters);
3525 *ret_min_profitable_niters = min_profitable_iters;
3527 /* Calculate number of iterations required to make the vector version
3528 profitable, relative to the loop bodies only.
3530 Non-vectorized variant is SIC * niters and it must win over vector
3531 variant on the expected loop trip count. The following condition must hold true:
3532 SIC * niters > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC + SOC */
3534 if (vec_outside_cost <= 0)
3535 min_profitable_estimate = 1;
3536 else
3538 min_profitable_estimate = ((vec_outside_cost + scalar_outside_cost) * vf
3539 - vec_inside_cost * peel_iters_prologue
3540 - vec_inside_cost * peel_iters_epilogue)
3541 / ((scalar_single_iter_cost * vf)
3542 - vec_inside_cost);
3544 min_profitable_estimate --;
3545 min_profitable_estimate = MAX (min_profitable_estimate, min_profitable_iters);
3546 if (dump_enabled_p ())
3547 dump_printf_loc (MSG_NOTE, vect_location,
3548 " Static estimate profitability threshold = %d\n",
3549 min_profitable_estimate);
3551 *ret_min_profitable_estimate = min_profitable_estimate;
3554 /* Writes into SEL a mask for a vec_perm, equivalent to a vec_shr by OFFSET
3555 vector elements (not bits) for a vector of mode MODE. */
3556 static void
3557 calc_vec_perm_mask_for_shift (enum machine_mode mode, unsigned int offset,
3558 unsigned char *sel)
3560 unsigned int i, nelt = GET_MODE_NUNITS (mode);
3562 for (i = 0; i < nelt; i++)
3563 sel[i] = (i + offset) & (2*nelt - 1);
3566 /* Checks whether the target supports whole-vector shifts for vectors of mode
3567 MODE. This is the case if _either_ the platform handles vec_shr_optab, _or_
3568 it supports vec_perm_const with masks for all necessary shift amounts. */
3569 static bool
3570 have_whole_vector_shift (enum machine_mode mode)
3572 if (optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
3573 return true;
3575 if (direct_optab_handler (vec_perm_const_optab, mode) == CODE_FOR_nothing)
3576 return false;
3578 unsigned int i, nelt = GET_MODE_NUNITS (mode);
3579 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
3581 for (i = nelt/2; i >= 1; i/=2)
3583 calc_vec_perm_mask_for_shift (mode, i, sel);
3584 if (!can_vec_perm_p (mode, false, sel))
3585 return false;
3587 return true;
3590 /* Return the reduction operand (with index REDUC_INDEX) of STMT. */
3592 static tree
3593 get_reduction_op (gimple *stmt, int reduc_index)
3595 switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
3597 case GIMPLE_SINGLE_RHS:
3598 gcc_assert (TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt))
3599 == ternary_op);
3600 return TREE_OPERAND (gimple_assign_rhs1 (stmt), reduc_index);
3601 case GIMPLE_UNARY_RHS:
3602 return gimple_assign_rhs1 (stmt);
3603 case GIMPLE_BINARY_RHS:
3604 return (reduc_index
3605 ? gimple_assign_rhs2 (stmt) : gimple_assign_rhs1 (stmt));
3606 case GIMPLE_TERNARY_RHS:
3607 return gimple_op (stmt, reduc_index + 1);
3608 default:
3609 gcc_unreachable ();
3613 /* TODO: Close dependency between vect_model_*_cost and vectorizable_*
3614 functions. Design better to avoid maintenance issues. */
3616 /* Function vect_model_reduction_cost.
3618 Models cost for a reduction operation, including the vector ops
3619 generated within the strip-mine loop, the initial definition before
3620 the loop, and the epilogue code that must be generated. */
3622 static bool
3623 vect_model_reduction_cost (stmt_vec_info stmt_info, enum tree_code reduc_code,
3624 int ncopies, int reduc_index)
3626 int prologue_cost = 0, epilogue_cost = 0;
3627 enum tree_code code;
3628 optab optab;
3629 tree vectype;
3630 gimple *stmt, *orig_stmt;
3631 tree reduction_op;
3632 machine_mode mode;
3633 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3634 struct loop *loop = NULL;
3635 void *target_cost_data;
3637 if (loop_vinfo)
3639 loop = LOOP_VINFO_LOOP (loop_vinfo);
3640 target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3642 else
3643 target_cost_data = BB_VINFO_TARGET_COST_DATA (STMT_VINFO_BB_VINFO (stmt_info));
3645 /* Condition reductions generate two reductions in the loop. */
3646 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION)
3647 ncopies *= 2;
3649 /* Cost of reduction op inside loop. */
3650 unsigned inside_cost = add_stmt_cost (target_cost_data, ncopies, vector_stmt,
3651 stmt_info, 0, vect_body);
3652 stmt = STMT_VINFO_STMT (stmt_info);
3654 reduction_op = get_reduction_op (stmt, reduc_index);
3656 vectype = get_vectype_for_scalar_type (TREE_TYPE (reduction_op));
3657 if (!vectype)
3659 if (dump_enabled_p ())
3661 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3662 "unsupported data-type ");
3663 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
3664 TREE_TYPE (reduction_op));
3665 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3667 return false;
3670 mode = TYPE_MODE (vectype);
3671 orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
3673 if (!orig_stmt)
3674 orig_stmt = STMT_VINFO_STMT (stmt_info);
3676 code = gimple_assign_rhs_code (orig_stmt);
3678 /* Add in cost for initial definition.
3679 For cond reduction we have four vectors: initial index, step, initial
3680 result of the data reduction, initial value of the index reduction. */
3681 int prologue_stmts = STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
3682 == COND_REDUCTION ? 4 : 1;
3683 prologue_cost += add_stmt_cost (target_cost_data, prologue_stmts,
3684 scalar_to_vec, stmt_info, 0,
3685 vect_prologue);
3687 /* Determine cost of epilogue code.
3689 We have a reduction operator that will reduce the vector in one statement.
3690 Also requires scalar extract. */
3692 if (!loop || !nested_in_vect_loop_p (loop, orig_stmt))
3694 if (reduc_code != ERROR_MARK)
3696 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION)
3698 /* An EQ stmt and an COND_EXPR stmt. */
3699 epilogue_cost += add_stmt_cost (target_cost_data, 2,
3700 vector_stmt, stmt_info, 0,
3701 vect_epilogue);
3702 /* Reduction of the max index and a reduction of the found
3703 values. */
3704 epilogue_cost += add_stmt_cost (target_cost_data, 2,
3705 vec_to_scalar, stmt_info, 0,
3706 vect_epilogue);
3707 /* A broadcast of the max value. */
3708 epilogue_cost += add_stmt_cost (target_cost_data, 1,
3709 scalar_to_vec, stmt_info, 0,
3710 vect_epilogue);
3712 else
3714 epilogue_cost += add_stmt_cost (target_cost_data, 1, vector_stmt,
3715 stmt_info, 0, vect_epilogue);
3716 epilogue_cost += add_stmt_cost (target_cost_data, 1,
3717 vec_to_scalar, stmt_info, 0,
3718 vect_epilogue);
3721 else
3723 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
3724 tree bitsize =
3725 TYPE_SIZE (TREE_TYPE (gimple_assign_lhs (orig_stmt)));
3726 int element_bitsize = tree_to_uhwi (bitsize);
3727 int nelements = vec_size_in_bits / element_bitsize;
3729 optab = optab_for_tree_code (code, vectype, optab_default);
3731 /* We have a whole vector shift available. */
3732 if (VECTOR_MODE_P (mode)
3733 && optab_handler (optab, mode) != CODE_FOR_nothing
3734 && have_whole_vector_shift (mode))
3736 /* Final reduction via vector shifts and the reduction operator.
3737 Also requires scalar extract. */
3738 epilogue_cost += add_stmt_cost (target_cost_data,
3739 exact_log2 (nelements) * 2,
3740 vector_stmt, stmt_info, 0,
3741 vect_epilogue);
3742 epilogue_cost += add_stmt_cost (target_cost_data, 1,
3743 vec_to_scalar, stmt_info, 0,
3744 vect_epilogue);
3746 else
3747 /* Use extracts and reduction op for final reduction. For N
3748 elements, we have N extracts and N-1 reduction ops. */
3749 epilogue_cost += add_stmt_cost (target_cost_data,
3750 nelements + nelements - 1,
3751 vector_stmt, stmt_info, 0,
3752 vect_epilogue);
3756 if (dump_enabled_p ())
3757 dump_printf (MSG_NOTE,
3758 "vect_model_reduction_cost: inside_cost = %d, "
3759 "prologue_cost = %d, epilogue_cost = %d .\n", inside_cost,
3760 prologue_cost, epilogue_cost);
3762 return true;
3766 /* Function vect_model_induction_cost.
3768 Models cost for induction operations. */
3770 static void
3771 vect_model_induction_cost (stmt_vec_info stmt_info, int ncopies)
3773 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3774 void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3775 unsigned inside_cost, prologue_cost;
3777 /* loop cost for vec_loop. */
3778 inside_cost = add_stmt_cost (target_cost_data, ncopies, vector_stmt,
3779 stmt_info, 0, vect_body);
3781 /* prologue cost for vec_init and vec_step. */
3782 prologue_cost = add_stmt_cost (target_cost_data, 2, scalar_to_vec,
3783 stmt_info, 0, vect_prologue);
3785 if (dump_enabled_p ())
3786 dump_printf_loc (MSG_NOTE, vect_location,
3787 "vect_model_induction_cost: inside_cost = %d, "
3788 "prologue_cost = %d .\n", inside_cost, prologue_cost);
3792 /* Function get_initial_def_for_induction
3794 Input:
3795 STMT - a stmt that performs an induction operation in the loop.
3796 IV_PHI - the initial value of the induction variable
3798 Output:
3799 Return a vector variable, initialized with the first VF values of
3800 the induction variable. E.g., for an iv with IV_PHI='X' and
3801 evolution S, for a vector of 4 units, we want to return:
3802 [X, X + S, X + 2*S, X + 3*S]. */
3804 static tree
3805 get_initial_def_for_induction (gimple *iv_phi)
3807 stmt_vec_info stmt_vinfo = vinfo_for_stmt (iv_phi);
3808 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
3809 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3810 tree vectype;
3811 int nunits;
3812 edge pe = loop_preheader_edge (loop);
3813 struct loop *iv_loop;
3814 basic_block new_bb;
3815 tree new_vec, vec_init, vec_step, t;
3816 tree new_name;
3817 gimple *new_stmt;
3818 gphi *induction_phi;
3819 tree induc_def, vec_def, vec_dest;
3820 tree init_expr, step_expr;
3821 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3822 int i;
3823 int ncopies;
3824 tree expr;
3825 stmt_vec_info phi_info = vinfo_for_stmt (iv_phi);
3826 bool nested_in_vect_loop = false;
3827 gimple_seq stmts;
3828 imm_use_iterator imm_iter;
3829 use_operand_p use_p;
3830 gimple *exit_phi;
3831 edge latch_e;
3832 tree loop_arg;
3833 gimple_stmt_iterator si;
3834 basic_block bb = gimple_bb (iv_phi);
3835 tree stepvectype;
3836 tree resvectype;
3838 /* Is phi in an inner-loop, while vectorizing an enclosing outer-loop? */
3839 if (nested_in_vect_loop_p (loop, iv_phi))
3841 nested_in_vect_loop = true;
3842 iv_loop = loop->inner;
3844 else
3845 iv_loop = loop;
3846 gcc_assert (iv_loop == (gimple_bb (iv_phi))->loop_father);
3848 latch_e = loop_latch_edge (iv_loop);
3849 loop_arg = PHI_ARG_DEF_FROM_EDGE (iv_phi, latch_e);
3851 step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (phi_info);
3852 gcc_assert (step_expr != NULL_TREE);
3854 pe = loop_preheader_edge (iv_loop);
3855 init_expr = PHI_ARG_DEF_FROM_EDGE (iv_phi,
3856 loop_preheader_edge (iv_loop));
3858 vectype = get_vectype_for_scalar_type (TREE_TYPE (init_expr));
3859 resvectype = get_vectype_for_scalar_type (TREE_TYPE (PHI_RESULT (iv_phi)));
3860 gcc_assert (vectype);
3861 nunits = TYPE_VECTOR_SUBPARTS (vectype);
3862 ncopies = vf / nunits;
3864 gcc_assert (phi_info);
3865 gcc_assert (ncopies >= 1);
3867 /* Convert the step to the desired type. */
3868 stmts = NULL;
3869 step_expr = gimple_convert (&stmts, TREE_TYPE (vectype), step_expr);
3870 if (stmts)
3872 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
3873 gcc_assert (!new_bb);
3876 /* Find the first insertion point in the BB. */
3877 si = gsi_after_labels (bb);
3879 /* Create the vector that holds the initial_value of the induction. */
3880 if (nested_in_vect_loop)
3882 /* iv_loop is nested in the loop to be vectorized. init_expr had already
3883 been created during vectorization of previous stmts. We obtain it
3884 from the STMT_VINFO_VEC_STMT of the defining stmt. */
3885 vec_init = vect_get_vec_def_for_operand (init_expr, iv_phi);
3886 /* If the initial value is not of proper type, convert it. */
3887 if (!useless_type_conversion_p (vectype, TREE_TYPE (vec_init)))
3889 new_stmt
3890 = gimple_build_assign (vect_get_new_ssa_name (vectype,
3891 vect_simple_var,
3892 "vec_iv_"),
3893 VIEW_CONVERT_EXPR,
3894 build1 (VIEW_CONVERT_EXPR, vectype,
3895 vec_init));
3896 vec_init = gimple_assign_lhs (new_stmt);
3897 new_bb = gsi_insert_on_edge_immediate (loop_preheader_edge (iv_loop),
3898 new_stmt);
3899 gcc_assert (!new_bb);
3900 set_vinfo_for_stmt (new_stmt,
3901 new_stmt_vec_info (new_stmt, loop_vinfo));
3904 else
3906 vec<constructor_elt, va_gc> *v;
3908 /* iv_loop is the loop to be vectorized. Create:
3909 vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr) */
3910 stmts = NULL;
3911 new_name = gimple_convert (&stmts, TREE_TYPE (vectype), init_expr);
3913 vec_alloc (v, nunits);
3914 bool constant_p = is_gimple_min_invariant (new_name);
3915 CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, new_name);
3916 for (i = 1; i < nunits; i++)
3918 /* Create: new_name_i = new_name + step_expr */
3919 new_name = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (new_name),
3920 new_name, step_expr);
3921 if (!is_gimple_min_invariant (new_name))
3922 constant_p = false;
3923 CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, new_name);
3925 if (stmts)
3927 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
3928 gcc_assert (!new_bb);
3931 /* Create a vector from [new_name_0, new_name_1, ..., new_name_nunits-1] */
3932 if (constant_p)
3933 new_vec = build_vector_from_ctor (vectype, v);
3934 else
3935 new_vec = build_constructor (vectype, v);
3936 vec_init = vect_init_vector (iv_phi, new_vec, vectype, NULL);
3940 /* Create the vector that holds the step of the induction. */
3941 if (nested_in_vect_loop)
3942 /* iv_loop is nested in the loop to be vectorized. Generate:
3943 vec_step = [S, S, S, S] */
3944 new_name = step_expr;
3945 else
3947 /* iv_loop is the loop to be vectorized. Generate:
3948 vec_step = [VF*S, VF*S, VF*S, VF*S] */
3949 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
3951 expr = build_int_cst (integer_type_node, vf);
3952 expr = fold_convert (TREE_TYPE (step_expr), expr);
3954 else
3955 expr = build_int_cst (TREE_TYPE (step_expr), vf);
3956 new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
3957 expr, step_expr);
3958 if (TREE_CODE (step_expr) == SSA_NAME)
3959 new_name = vect_init_vector (iv_phi, new_name,
3960 TREE_TYPE (step_expr), NULL);
3963 t = unshare_expr (new_name);
3964 gcc_assert (CONSTANT_CLASS_P (new_name)
3965 || TREE_CODE (new_name) == SSA_NAME);
3966 stepvectype = get_vectype_for_scalar_type (TREE_TYPE (new_name));
3967 gcc_assert (stepvectype);
3968 new_vec = build_vector_from_val (stepvectype, t);
3969 vec_step = vect_init_vector (iv_phi, new_vec, stepvectype, NULL);
3972 /* Create the following def-use cycle:
3973 loop prolog:
3974 vec_init = ...
3975 vec_step = ...
3976 loop:
3977 vec_iv = PHI <vec_init, vec_loop>
3979 STMT
3981 vec_loop = vec_iv + vec_step; */
3983 /* Create the induction-phi that defines the induction-operand. */
3984 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
3985 induction_phi = create_phi_node (vec_dest, iv_loop->header);
3986 set_vinfo_for_stmt (induction_phi,
3987 new_stmt_vec_info (induction_phi, loop_vinfo));
3988 induc_def = PHI_RESULT (induction_phi);
3990 /* Create the iv update inside the loop */
3991 new_stmt = gimple_build_assign (vec_dest, PLUS_EXPR, induc_def, vec_step);
3992 vec_def = make_ssa_name (vec_dest, new_stmt);
3993 gimple_assign_set_lhs (new_stmt, vec_def);
3994 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
3995 set_vinfo_for_stmt (new_stmt, new_stmt_vec_info (new_stmt, loop_vinfo));
3997 /* Set the arguments of the phi node: */
3998 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
3999 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
4000 UNKNOWN_LOCATION);
4003 /* In case that vectorization factor (VF) is bigger than the number
4004 of elements that we can fit in a vectype (nunits), we have to generate
4005 more than one vector stmt - i.e - we need to "unroll" the
4006 vector stmt by a factor VF/nunits. For more details see documentation
4007 in vectorizable_operation. */
4009 if (ncopies > 1)
4011 stmt_vec_info prev_stmt_vinfo;
4012 /* FORNOW. This restriction should be relaxed. */
4013 gcc_assert (!nested_in_vect_loop);
4015 /* Create the vector that holds the step of the induction. */
4016 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
4018 expr = build_int_cst (integer_type_node, nunits);
4019 expr = fold_convert (TREE_TYPE (step_expr), expr);
4021 else
4022 expr = build_int_cst (TREE_TYPE (step_expr), nunits);
4023 new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
4024 expr, step_expr);
4025 if (TREE_CODE (step_expr) == SSA_NAME)
4026 new_name = vect_init_vector (iv_phi, new_name,
4027 TREE_TYPE (step_expr), NULL);
4028 t = unshare_expr (new_name);
4029 gcc_assert (CONSTANT_CLASS_P (new_name)
4030 || TREE_CODE (new_name) == SSA_NAME);
4031 new_vec = build_vector_from_val (stepvectype, t);
4032 vec_step = vect_init_vector (iv_phi, new_vec, stepvectype, NULL);
4034 vec_def = induc_def;
4035 prev_stmt_vinfo = vinfo_for_stmt (induction_phi);
4036 for (i = 1; i < ncopies; i++)
4038 /* vec_i = vec_prev + vec_step */
4039 new_stmt = gimple_build_assign (vec_dest, PLUS_EXPR,
4040 vec_def, vec_step);
4041 vec_def = make_ssa_name (vec_dest, new_stmt);
4042 gimple_assign_set_lhs (new_stmt, vec_def);
4044 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
4045 if (!useless_type_conversion_p (resvectype, vectype))
4047 new_stmt
4048 = gimple_build_assign
4049 (vect_get_new_vect_var (resvectype, vect_simple_var,
4050 "vec_iv_"),
4051 VIEW_CONVERT_EXPR,
4052 build1 (VIEW_CONVERT_EXPR, resvectype,
4053 gimple_assign_lhs (new_stmt)));
4054 gimple_assign_set_lhs (new_stmt,
4055 make_ssa_name
4056 (gimple_assign_lhs (new_stmt), new_stmt));
4057 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
4059 set_vinfo_for_stmt (new_stmt,
4060 new_stmt_vec_info (new_stmt, loop_vinfo));
4061 STMT_VINFO_RELATED_STMT (prev_stmt_vinfo) = new_stmt;
4062 prev_stmt_vinfo = vinfo_for_stmt (new_stmt);
4066 if (nested_in_vect_loop)
4068 /* Find the loop-closed exit-phi of the induction, and record
4069 the final vector of induction results: */
4070 exit_phi = NULL;
4071 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
4073 gimple *use_stmt = USE_STMT (use_p);
4074 if (is_gimple_debug (use_stmt))
4075 continue;
4077 if (!flow_bb_inside_loop_p (iv_loop, gimple_bb (use_stmt)))
4079 exit_phi = use_stmt;
4080 break;
4083 if (exit_phi)
4085 stmt_vec_info stmt_vinfo = vinfo_for_stmt (exit_phi);
4086 /* FORNOW. Currently not supporting the case that an inner-loop induction
4087 is not used in the outer-loop (i.e. only outside the outer-loop). */
4088 gcc_assert (STMT_VINFO_RELEVANT_P (stmt_vinfo)
4089 && !STMT_VINFO_LIVE_P (stmt_vinfo));
4091 STMT_VINFO_VEC_STMT (stmt_vinfo) = new_stmt;
4092 if (dump_enabled_p ())
4094 dump_printf_loc (MSG_NOTE, vect_location,
4095 "vector of inductions after inner-loop:");
4096 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, new_stmt, 0);
4102 if (dump_enabled_p ())
4104 dump_printf_loc (MSG_NOTE, vect_location,
4105 "transform induction: created def-use cycle: ");
4106 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, induction_phi, 0);
4107 dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
4108 SSA_NAME_DEF_STMT (vec_def), 0);
4111 STMT_VINFO_VEC_STMT (phi_info) = induction_phi;
4112 if (!useless_type_conversion_p (resvectype, vectype))
4114 new_stmt = gimple_build_assign (vect_get_new_vect_var (resvectype,
4115 vect_simple_var,
4116 "vec_iv_"),
4117 VIEW_CONVERT_EXPR,
4118 build1 (VIEW_CONVERT_EXPR, resvectype,
4119 induc_def));
4120 induc_def = make_ssa_name (gimple_assign_lhs (new_stmt), new_stmt);
4121 gimple_assign_set_lhs (new_stmt, induc_def);
4122 si = gsi_after_labels (bb);
4123 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
4124 set_vinfo_for_stmt (new_stmt,
4125 new_stmt_vec_info (new_stmt, loop_vinfo));
4126 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (new_stmt))
4127 = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (induction_phi));
4130 return induc_def;
4134 /* Function get_initial_def_for_reduction
4136 Input:
4137 STMT - a stmt that performs a reduction operation in the loop.
4138 INIT_VAL - the initial value of the reduction variable
4140 Output:
4141 ADJUSTMENT_DEF - a tree that holds a value to be added to the final result
4142 of the reduction (used for adjusting the epilog - see below).
4143 Return a vector variable, initialized according to the operation that STMT
4144 performs. This vector will be used as the initial value of the
4145 vector of partial results.
4147 Option1 (adjust in epilog): Initialize the vector as follows:
4148 add/bit or/xor: [0,0,...,0,0]
4149 mult/bit and: [1,1,...,1,1]
4150 min/max/cond_expr: [init_val,init_val,..,init_val,init_val]
4151 and when necessary (e.g. add/mult case) let the caller know
4152 that it needs to adjust the result by init_val.
4154 Option2: Initialize the vector as follows:
4155 add/bit or/xor: [init_val,0,0,...,0]
4156 mult/bit and: [init_val,1,1,...,1]
4157 min/max/cond_expr: [init_val,init_val,...,init_val]
4158 and no adjustments are needed.
4160 For example, for the following code:
4162 s = init_val;
4163 for (i=0;i<n;i++)
4164 s = s + a[i];
4166 STMT is 's = s + a[i]', and the reduction variable is 's'.
4167 For a vector of 4 units, we want to return either [0,0,0,init_val],
4168 or [0,0,0,0] and let the caller know that it needs to adjust
4169 the result at the end by 'init_val'.
4171 FORNOW, we are using the 'adjust in epilog' scheme, because this way the
4172 initialization vector is simpler (same element in all entries), if
4173 ADJUSTMENT_DEF is not NULL, and Option2 otherwise.
4175 A cost model should help decide between these two schemes. */
4177 tree
4178 get_initial_def_for_reduction (gimple *stmt, tree init_val,
4179 tree *adjustment_def)
4181 stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
4182 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
4183 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4184 tree scalar_type = TREE_TYPE (init_val);
4185 tree vectype = get_vectype_for_scalar_type (scalar_type);
4186 int nunits;
4187 enum tree_code code = gimple_assign_rhs_code (stmt);
4188 tree def_for_init;
4189 tree init_def;
4190 tree *elts;
4191 int i;
4192 bool nested_in_vect_loop = false;
4193 REAL_VALUE_TYPE real_init_val = dconst0;
4194 int int_init_val = 0;
4195 gimple *def_stmt = NULL;
4196 gimple_seq stmts = NULL;
4198 gcc_assert (vectype);
4199 nunits = TYPE_VECTOR_SUBPARTS (vectype);
4201 gcc_assert (POINTER_TYPE_P (scalar_type) || INTEGRAL_TYPE_P (scalar_type)
4202 || SCALAR_FLOAT_TYPE_P (scalar_type));
4204 if (nested_in_vect_loop_p (loop, stmt))
4205 nested_in_vect_loop = true;
4206 else
4207 gcc_assert (loop == (gimple_bb (stmt))->loop_father);
4209 /* In case of double reduction we only create a vector variable to be put
4210 in the reduction phi node. The actual statement creation is done in
4211 vect_create_epilog_for_reduction. */
4212 if (adjustment_def && nested_in_vect_loop
4213 && TREE_CODE (init_val) == SSA_NAME
4214 && (def_stmt = SSA_NAME_DEF_STMT (init_val))
4215 && gimple_code (def_stmt) == GIMPLE_PHI
4216 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
4217 && vinfo_for_stmt (def_stmt)
4218 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
4219 == vect_double_reduction_def)
4221 *adjustment_def = NULL;
4222 return vect_create_destination_var (init_val, vectype);
4225 /* In case of a nested reduction do not use an adjustment def as
4226 that case is not supported by the epilogue generation correctly
4227 if ncopies is not one. */
4228 if (adjustment_def && nested_in_vect_loop)
4230 *adjustment_def = NULL;
4231 return vect_get_vec_def_for_operand (init_val, stmt);
4234 switch (code)
4236 case WIDEN_SUM_EXPR:
4237 case DOT_PROD_EXPR:
4238 case SAD_EXPR:
4239 case PLUS_EXPR:
4240 case MINUS_EXPR:
4241 case BIT_IOR_EXPR:
4242 case BIT_XOR_EXPR:
4243 case MULT_EXPR:
4244 case BIT_AND_EXPR:
4245 /* ADJUSMENT_DEF is NULL when called from
4246 vect_create_epilog_for_reduction to vectorize double reduction. */
4247 if (adjustment_def)
4248 *adjustment_def = init_val;
4250 if (code == MULT_EXPR)
4252 real_init_val = dconst1;
4253 int_init_val = 1;
4256 if (code == BIT_AND_EXPR)
4257 int_init_val = -1;
4259 if (SCALAR_FLOAT_TYPE_P (scalar_type))
4260 def_for_init = build_real (scalar_type, real_init_val);
4261 else
4262 def_for_init = build_int_cst (scalar_type, int_init_val);
4264 /* Create a vector of '0' or '1' except the first element. */
4265 elts = XALLOCAVEC (tree, nunits);
4266 for (i = nunits - 2; i >= 0; --i)
4267 elts[i + 1] = def_for_init;
4269 /* Option1: the first element is '0' or '1' as well. */
4270 if (adjustment_def)
4272 elts[0] = def_for_init;
4273 init_def = build_vector (vectype, elts);
4274 break;
4277 /* Option2: the first element is INIT_VAL. */
4278 elts[0] = init_val;
4279 if (TREE_CONSTANT (init_val))
4280 init_def = build_vector (vectype, elts);
4281 else
4283 vec<constructor_elt, va_gc> *v;
4284 vec_alloc (v, nunits);
4285 CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, init_val);
4286 for (i = 1; i < nunits; ++i)
4287 CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, elts[i]);
4288 init_def = build_constructor (vectype, v);
4291 break;
4293 case MIN_EXPR:
4294 case MAX_EXPR:
4295 case COND_EXPR:
4296 if (adjustment_def)
4298 *adjustment_def = NULL_TREE;
4299 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_vinfo) != COND_REDUCTION)
4301 init_def = vect_get_vec_def_for_operand (init_val, stmt);
4302 break;
4305 init_val = gimple_convert (&stmts, TREE_TYPE (vectype), init_val);
4306 if (! gimple_seq_empty_p (stmts))
4307 gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), stmts);
4308 init_def = build_vector_from_val (vectype, init_val);
4309 break;
4311 default:
4312 gcc_unreachable ();
4315 return init_def;
4318 /* Function vect_create_epilog_for_reduction
4320 Create code at the loop-epilog to finalize the result of a reduction
4321 computation.
4323 VECT_DEFS is list of vector of partial results, i.e., the lhs's of vector
4324 reduction statements.
4325 STMT is the scalar reduction stmt that is being vectorized.
4326 NCOPIES is > 1 in case the vectorization factor (VF) is bigger than the
4327 number of elements that we can fit in a vectype (nunits). In this case
4328 we have to generate more than one vector stmt - i.e - we need to "unroll"
4329 the vector stmt by a factor VF/nunits. For more details see documentation
4330 in vectorizable_operation.
4331 REDUC_CODE is the tree-code for the epilog reduction.
4332 REDUCTION_PHIS is a list of the phi-nodes that carry the reduction
4333 computation.
4334 REDUC_INDEX is the index of the operand in the right hand side of the
4335 statement that is defined by REDUCTION_PHI.
4336 DOUBLE_REDUC is TRUE if double reduction phi nodes should be handled.
4337 SLP_NODE is an SLP node containing a group of reduction statements. The
4338 first one in this group is STMT.
4339 INDUCTION_INDEX is the index of the loop for condition reductions.
4340 Otherwise it is undefined.
4342 This function:
4343 1. Creates the reduction def-use cycles: sets the arguments for
4344 REDUCTION_PHIS:
4345 The loop-entry argument is the vectorized initial-value of the reduction.
4346 The loop-latch argument is taken from VECT_DEFS - the vector of partial
4347 sums.
4348 2. "Reduces" each vector of partial results VECT_DEFS into a single result,
4349 by applying the operation specified by REDUC_CODE if available, or by
4350 other means (whole-vector shifts or a scalar loop).
4351 The function also creates a new phi node at the loop exit to preserve
4352 loop-closed form, as illustrated below.
4354 The flow at the entry to this function:
4356 loop:
4357 vec_def = phi <null, null> # REDUCTION_PHI
4358 VECT_DEF = vector_stmt # vectorized form of STMT
4359 s_loop = scalar_stmt # (scalar) STMT
4360 loop_exit:
4361 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
4362 use <s_out0>
4363 use <s_out0>
4365 The above is transformed by this function into:
4367 loop:
4368 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
4369 VECT_DEF = vector_stmt # vectorized form of STMT
4370 s_loop = scalar_stmt # (scalar) STMT
4371 loop_exit:
4372 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
4373 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
4374 v_out2 = reduce <v_out1>
4375 s_out3 = extract_field <v_out2, 0>
4376 s_out4 = adjust_result <s_out3>
4377 use <s_out4>
4378 use <s_out4>
4381 static void
4382 vect_create_epilog_for_reduction (vec<tree> vect_defs, gimple *stmt,
4383 int ncopies, enum tree_code reduc_code,
4384 vec<gimple *> reduction_phis,
4385 int reduc_index, bool double_reduc,
4386 slp_tree slp_node, tree induction_index)
4388 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4389 stmt_vec_info prev_phi_info;
4390 tree vectype;
4391 machine_mode mode;
4392 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4393 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *outer_loop = NULL;
4394 basic_block exit_bb;
4395 tree scalar_dest;
4396 tree scalar_type;
4397 gimple *new_phi = NULL, *phi;
4398 gimple_stmt_iterator exit_gsi;
4399 tree vec_dest;
4400 tree new_temp = NULL_TREE, new_dest, new_name, new_scalar_dest;
4401 gimple *epilog_stmt = NULL;
4402 enum tree_code code = gimple_assign_rhs_code (stmt);
4403 gimple *exit_phi;
4404 tree bitsize;
4405 tree adjustment_def = NULL;
4406 tree vec_initial_def = NULL;
4407 tree reduction_op, expr, def, initial_def = NULL;
4408 tree orig_name, scalar_result;
4409 imm_use_iterator imm_iter, phi_imm_iter;
4410 use_operand_p use_p, phi_use_p;
4411 gimple *use_stmt, *orig_stmt, *reduction_phi = NULL;
4412 bool nested_in_vect_loop = false;
4413 auto_vec<gimple *> new_phis;
4414 auto_vec<gimple *> inner_phis;
4415 enum vect_def_type dt = vect_unknown_def_type;
4416 int j, i;
4417 auto_vec<tree> scalar_results;
4418 unsigned int group_size = 1, k, ratio;
4419 auto_vec<tree> vec_initial_defs;
4420 auto_vec<gimple *> phis;
4421 bool slp_reduc = false;
4422 tree new_phi_result;
4423 gimple *inner_phi = NULL;
4425 if (slp_node)
4426 group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
4428 if (nested_in_vect_loop_p (loop, stmt))
4430 outer_loop = loop;
4431 loop = loop->inner;
4432 nested_in_vect_loop = true;
4433 gcc_assert (!slp_node);
4436 reduction_op = get_reduction_op (stmt, reduc_index);
4438 vectype = get_vectype_for_scalar_type (TREE_TYPE (reduction_op));
4439 gcc_assert (vectype);
4440 mode = TYPE_MODE (vectype);
4442 /* 1. Create the reduction def-use cycle:
4443 Set the arguments of REDUCTION_PHIS, i.e., transform
4445 loop:
4446 vec_def = phi <null, null> # REDUCTION_PHI
4447 VECT_DEF = vector_stmt # vectorized form of STMT
4450 into:
4452 loop:
4453 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
4454 VECT_DEF = vector_stmt # vectorized form of STMT
4457 (in case of SLP, do it for all the phis). */
4459 /* Get the loop-entry arguments. */
4460 enum vect_def_type initial_def_dt = vect_unknown_def_type;
4461 if (slp_node)
4462 vect_get_vec_defs (reduction_op, NULL_TREE, stmt, &vec_initial_defs,
4463 NULL, slp_node, reduc_index);
4464 else
4466 /* Get at the scalar def before the loop, that defines the initial value
4467 of the reduction variable. */
4468 gimple *def_stmt = SSA_NAME_DEF_STMT (reduction_op);
4469 initial_def = PHI_ARG_DEF_FROM_EDGE (def_stmt,
4470 loop_preheader_edge (loop));
4471 vect_is_simple_use (initial_def, loop_vinfo, &def_stmt, &initial_def_dt);
4472 vec_initial_def = get_initial_def_for_reduction (stmt, initial_def,
4473 &adjustment_def);
4474 vec_initial_defs.create (1);
4475 vec_initial_defs.quick_push (vec_initial_def);
4478 /* Set phi nodes arguments. */
4479 FOR_EACH_VEC_ELT (reduction_phis, i, phi)
4481 tree vec_init_def, def;
4482 gimple_seq stmts;
4483 vec_init_def = force_gimple_operand (vec_initial_defs[i], &stmts,
4484 true, NULL_TREE);
4485 if (stmts)
4486 gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), stmts);
4488 def = vect_defs[i];
4489 for (j = 0; j < ncopies; j++)
4491 if (j != 0)
4493 phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi));
4494 if (nested_in_vect_loop)
4495 vec_init_def
4496 = vect_get_vec_def_for_stmt_copy (initial_def_dt,
4497 vec_init_def);
4500 /* Set the loop-entry arg of the reduction-phi. */
4502 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
4503 == INTEGER_INDUC_COND_REDUCTION)
4505 /* Initialise the reduction phi to zero. This prevents initial
4506 values of non-zero interferring with the reduction op. */
4507 gcc_assert (ncopies == 1);
4508 gcc_assert (i == 0);
4510 tree vec_init_def_type = TREE_TYPE (vec_init_def);
4511 tree zero_vec = build_zero_cst (vec_init_def_type);
4513 add_phi_arg (as_a <gphi *> (phi), zero_vec,
4514 loop_preheader_edge (loop), UNKNOWN_LOCATION);
4516 else
4517 add_phi_arg (as_a <gphi *> (phi), vec_init_def,
4518 loop_preheader_edge (loop), UNKNOWN_LOCATION);
4520 /* Set the loop-latch arg for the reduction-phi. */
4521 if (j > 0)
4522 def = vect_get_vec_def_for_stmt_copy (vect_unknown_def_type, def);
4524 add_phi_arg (as_a <gphi *> (phi), def, loop_latch_edge (loop),
4525 UNKNOWN_LOCATION);
4527 if (dump_enabled_p ())
4529 dump_printf_loc (MSG_NOTE, vect_location,
4530 "transform reduction: created def-use cycle: ");
4531 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
4532 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, SSA_NAME_DEF_STMT (def), 0);
4537 /* 2. Create epilog code.
4538 The reduction epilog code operates across the elements of the vector
4539 of partial results computed by the vectorized loop.
4540 The reduction epilog code consists of:
4542 step 1: compute the scalar result in a vector (v_out2)
4543 step 2: extract the scalar result (s_out3) from the vector (v_out2)
4544 step 3: adjust the scalar result (s_out3) if needed.
4546 Step 1 can be accomplished using one the following three schemes:
4547 (scheme 1) using reduc_code, if available.
4548 (scheme 2) using whole-vector shifts, if available.
4549 (scheme 3) using a scalar loop. In this case steps 1+2 above are
4550 combined.
4552 The overall epilog code looks like this:
4554 s_out0 = phi <s_loop> # original EXIT_PHI
4555 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
4556 v_out2 = reduce <v_out1> # step 1
4557 s_out3 = extract_field <v_out2, 0> # step 2
4558 s_out4 = adjust_result <s_out3> # step 3
4560 (step 3 is optional, and steps 1 and 2 may be combined).
4561 Lastly, the uses of s_out0 are replaced by s_out4. */
4564 /* 2.1 Create new loop-exit-phis to preserve loop-closed form:
4565 v_out1 = phi <VECT_DEF>
4566 Store them in NEW_PHIS. */
4568 exit_bb = single_exit (loop)->dest;
4569 prev_phi_info = NULL;
4570 new_phis.create (vect_defs.length ());
4571 FOR_EACH_VEC_ELT (vect_defs, i, def)
4573 for (j = 0; j < ncopies; j++)
4575 tree new_def = copy_ssa_name (def);
4576 phi = create_phi_node (new_def, exit_bb);
4577 set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, loop_vinfo));
4578 if (j == 0)
4579 new_phis.quick_push (phi);
4580 else
4582 def = vect_get_vec_def_for_stmt_copy (dt, def);
4583 STMT_VINFO_RELATED_STMT (prev_phi_info) = phi;
4586 SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, def);
4587 prev_phi_info = vinfo_for_stmt (phi);
4591 /* The epilogue is created for the outer-loop, i.e., for the loop being
4592 vectorized. Create exit phis for the outer loop. */
4593 if (double_reduc)
4595 loop = outer_loop;
4596 exit_bb = single_exit (loop)->dest;
4597 inner_phis.create (vect_defs.length ());
4598 FOR_EACH_VEC_ELT (new_phis, i, phi)
4600 tree new_result = copy_ssa_name (PHI_RESULT (phi));
4601 gphi *outer_phi = create_phi_node (new_result, exit_bb);
4602 SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx,
4603 PHI_RESULT (phi));
4604 set_vinfo_for_stmt (outer_phi, new_stmt_vec_info (outer_phi,
4605 loop_vinfo));
4606 inner_phis.quick_push (phi);
4607 new_phis[i] = outer_phi;
4608 prev_phi_info = vinfo_for_stmt (outer_phi);
4609 while (STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi)))
4611 phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi));
4612 new_result = copy_ssa_name (PHI_RESULT (phi));
4613 outer_phi = create_phi_node (new_result, exit_bb);
4614 SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx,
4615 PHI_RESULT (phi));
4616 set_vinfo_for_stmt (outer_phi, new_stmt_vec_info (outer_phi,
4617 loop_vinfo));
4618 STMT_VINFO_RELATED_STMT (prev_phi_info) = outer_phi;
4619 prev_phi_info = vinfo_for_stmt (outer_phi);
4624 exit_gsi = gsi_after_labels (exit_bb);
4626 /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
4627 (i.e. when reduc_code is not available) and in the final adjustment
4628 code (if needed). Also get the original scalar reduction variable as
4629 defined in the loop. In case STMT is a "pattern-stmt" (i.e. - it
4630 represents a reduction pattern), the tree-code and scalar-def are
4631 taken from the original stmt that the pattern-stmt (STMT) replaces.
4632 Otherwise (it is a regular reduction) - the tree-code and scalar-def
4633 are taken from STMT. */
4635 orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
4636 if (!orig_stmt)
4638 /* Regular reduction */
4639 orig_stmt = stmt;
4641 else
4643 /* Reduction pattern */
4644 stmt_vec_info stmt_vinfo = vinfo_for_stmt (orig_stmt);
4645 gcc_assert (STMT_VINFO_IN_PATTERN_P (stmt_vinfo));
4646 gcc_assert (STMT_VINFO_RELATED_STMT (stmt_vinfo) == stmt);
4649 code = gimple_assign_rhs_code (orig_stmt);
4650 /* For MINUS_EXPR the initial vector is [init_val,0,...,0], therefore,
4651 partial results are added and not subtracted. */
4652 if (code == MINUS_EXPR)
4653 code = PLUS_EXPR;
4655 scalar_dest = gimple_assign_lhs (orig_stmt);
4656 scalar_type = TREE_TYPE (scalar_dest);
4657 scalar_results.create (group_size);
4658 new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
4659 bitsize = TYPE_SIZE (scalar_type);
4661 /* In case this is a reduction in an inner-loop while vectorizing an outer
4662 loop - we don't need to extract a single scalar result at the end of the
4663 inner-loop (unless it is double reduction, i.e., the use of reduction is
4664 outside the outer-loop). The final vector of partial results will be used
4665 in the vectorized outer-loop, or reduced to a scalar result at the end of
4666 the outer-loop. */
4667 if (nested_in_vect_loop && !double_reduc)
4668 goto vect_finalize_reduction;
4670 /* SLP reduction without reduction chain, e.g.,
4671 # a1 = phi <a2, a0>
4672 # b1 = phi <b2, b0>
4673 a2 = operation (a1)
4674 b2 = operation (b1) */
4675 slp_reduc = (slp_node && !GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)));
4677 /* In case of reduction chain, e.g.,
4678 # a1 = phi <a3, a0>
4679 a2 = operation (a1)
4680 a3 = operation (a2),
4682 we may end up with more than one vector result. Here we reduce them to
4683 one vector. */
4684 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
4686 tree first_vect = PHI_RESULT (new_phis[0]);
4687 tree tmp;
4688 gassign *new_vec_stmt = NULL;
4690 vec_dest = vect_create_destination_var (scalar_dest, vectype);
4691 for (k = 1; k < new_phis.length (); k++)
4693 gimple *next_phi = new_phis[k];
4694 tree second_vect = PHI_RESULT (next_phi);
4696 tmp = build2 (code, vectype, first_vect, second_vect);
4697 new_vec_stmt = gimple_build_assign (vec_dest, tmp);
4698 first_vect = make_ssa_name (vec_dest, new_vec_stmt);
4699 gimple_assign_set_lhs (new_vec_stmt, first_vect);
4700 gsi_insert_before (&exit_gsi, new_vec_stmt, GSI_SAME_STMT);
4703 new_phi_result = first_vect;
4704 if (new_vec_stmt)
4706 new_phis.truncate (0);
4707 new_phis.safe_push (new_vec_stmt);
4710 else
4711 new_phi_result = PHI_RESULT (new_phis[0]);
4713 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION)
4715 /* For condition reductions, we have a vector (NEW_PHI_RESULT) containing
4716 various data values where the condition matched and another vector
4717 (INDUCTION_INDEX) containing all the indexes of those matches. We
4718 need to extract the last matching index (which will be the index with
4719 highest value) and use this to index into the data vector.
4720 For the case where there were no matches, the data vector will contain
4721 all default values and the index vector will be all zeros. */
4723 /* Get various versions of the type of the vector of indexes. */
4724 tree index_vec_type = TREE_TYPE (induction_index);
4725 gcc_checking_assert (TYPE_UNSIGNED (index_vec_type));
4726 tree index_scalar_type = TREE_TYPE (index_vec_type);
4727 tree index_vec_cmp_type = build_same_sized_truth_vector_type
4728 (index_vec_type);
4730 /* Get an unsigned integer version of the type of the data vector. */
4731 int scalar_precision = GET_MODE_PRECISION (TYPE_MODE (scalar_type));
4732 tree scalar_type_unsigned = make_unsigned_type (scalar_precision);
4733 tree vectype_unsigned = build_vector_type
4734 (scalar_type_unsigned, TYPE_VECTOR_SUBPARTS (vectype));
4736 /* First we need to create a vector (ZERO_VEC) of zeros and another
4737 vector (MAX_INDEX_VEC) filled with the last matching index, which we
4738 can create using a MAX reduction and then expanding.
4739 In the case where the loop never made any matches, the max index will
4740 be zero. */
4742 /* Vector of {0, 0, 0,...}. */
4743 tree zero_vec = make_ssa_name (vectype);
4744 tree zero_vec_rhs = build_zero_cst (vectype);
4745 gimple *zero_vec_stmt = gimple_build_assign (zero_vec, zero_vec_rhs);
4746 gsi_insert_before (&exit_gsi, zero_vec_stmt, GSI_SAME_STMT);
4748 /* Find maximum value from the vector of found indexes. */
4749 tree max_index = make_ssa_name (index_scalar_type);
4750 gimple *max_index_stmt = gimple_build_assign (max_index, REDUC_MAX_EXPR,
4751 induction_index);
4752 gsi_insert_before (&exit_gsi, max_index_stmt, GSI_SAME_STMT);
4754 /* Vector of {max_index, max_index, max_index,...}. */
4755 tree max_index_vec = make_ssa_name (index_vec_type);
4756 tree max_index_vec_rhs = build_vector_from_val (index_vec_type,
4757 max_index);
4758 gimple *max_index_vec_stmt = gimple_build_assign (max_index_vec,
4759 max_index_vec_rhs);
4760 gsi_insert_before (&exit_gsi, max_index_vec_stmt, GSI_SAME_STMT);
4762 /* Next we compare the new vector (MAX_INDEX_VEC) full of max indexes
4763 with the vector (INDUCTION_INDEX) of found indexes, choosing values
4764 from the data vector (NEW_PHI_RESULT) for matches, 0 (ZERO_VEC)
4765 otherwise. Only one value should match, resulting in a vector
4766 (VEC_COND) with one data value and the rest zeros.
4767 In the case where the loop never made any matches, every index will
4768 match, resulting in a vector with all data values (which will all be
4769 the default value). */
4771 /* Compare the max index vector to the vector of found indexes to find
4772 the position of the max value. */
4773 tree vec_compare = make_ssa_name (index_vec_cmp_type);
4774 gimple *vec_compare_stmt = gimple_build_assign (vec_compare, EQ_EXPR,
4775 induction_index,
4776 max_index_vec);
4777 gsi_insert_before (&exit_gsi, vec_compare_stmt, GSI_SAME_STMT);
4779 /* Use the compare to choose either values from the data vector or
4780 zero. */
4781 tree vec_cond = make_ssa_name (vectype);
4782 gimple *vec_cond_stmt = gimple_build_assign (vec_cond, VEC_COND_EXPR,
4783 vec_compare, new_phi_result,
4784 zero_vec);
4785 gsi_insert_before (&exit_gsi, vec_cond_stmt, GSI_SAME_STMT);
4787 /* Finally we need to extract the data value from the vector (VEC_COND)
4788 into a scalar (MATCHED_DATA_REDUC). Logically we want to do a OR
4789 reduction, but because this doesn't exist, we can use a MAX reduction
4790 instead. The data value might be signed or a float so we need to cast
4791 it first.
4792 In the case where the loop never made any matches, the data values are
4793 all identical, and so will reduce down correctly. */
4795 /* Make the matched data values unsigned. */
4796 tree vec_cond_cast = make_ssa_name (vectype_unsigned);
4797 tree vec_cond_cast_rhs = build1 (VIEW_CONVERT_EXPR, vectype_unsigned,
4798 vec_cond);
4799 gimple *vec_cond_cast_stmt = gimple_build_assign (vec_cond_cast,
4800 VIEW_CONVERT_EXPR,
4801 vec_cond_cast_rhs);
4802 gsi_insert_before (&exit_gsi, vec_cond_cast_stmt, GSI_SAME_STMT);
4804 /* Reduce down to a scalar value. */
4805 tree data_reduc = make_ssa_name (scalar_type_unsigned);
4806 optab ot = optab_for_tree_code (REDUC_MAX_EXPR, vectype_unsigned,
4807 optab_default);
4808 gcc_assert (optab_handler (ot, TYPE_MODE (vectype_unsigned))
4809 != CODE_FOR_nothing);
4810 gimple *data_reduc_stmt = gimple_build_assign (data_reduc,
4811 REDUC_MAX_EXPR,
4812 vec_cond_cast);
4813 gsi_insert_before (&exit_gsi, data_reduc_stmt, GSI_SAME_STMT);
4815 /* Convert the reduced value back to the result type and set as the
4816 result. */
4817 tree data_reduc_cast = build1 (VIEW_CONVERT_EXPR, scalar_type,
4818 data_reduc);
4819 epilog_stmt = gimple_build_assign (new_scalar_dest, data_reduc_cast);
4820 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
4821 gimple_assign_set_lhs (epilog_stmt, new_temp);
4822 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4823 scalar_results.safe_push (new_temp);
4826 /* 2.3 Create the reduction code, using one of the three schemes described
4827 above. In SLP we simply need to extract all the elements from the
4828 vector (without reducing them), so we use scalar shifts. */
4829 else if (reduc_code != ERROR_MARK && !slp_reduc)
4831 tree tmp;
4832 tree vec_elem_type;
4834 /*** Case 1: Create:
4835 v_out2 = reduc_expr <v_out1> */
4837 if (dump_enabled_p ())
4838 dump_printf_loc (MSG_NOTE, vect_location,
4839 "Reduce using direct vector reduction.\n");
4841 vec_elem_type = TREE_TYPE (TREE_TYPE (new_phi_result));
4842 if (!useless_type_conversion_p (scalar_type, vec_elem_type))
4844 tree tmp_dest =
4845 vect_create_destination_var (scalar_dest, vec_elem_type);
4846 tmp = build1 (reduc_code, vec_elem_type, new_phi_result);
4847 epilog_stmt = gimple_build_assign (tmp_dest, tmp);
4848 new_temp = make_ssa_name (tmp_dest, epilog_stmt);
4849 gimple_assign_set_lhs (epilog_stmt, new_temp);
4850 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4852 tmp = build1 (NOP_EXPR, scalar_type, new_temp);
4854 else
4855 tmp = build1 (reduc_code, scalar_type, new_phi_result);
4857 epilog_stmt = gimple_build_assign (new_scalar_dest, tmp);
4858 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
4859 gimple_assign_set_lhs (epilog_stmt, new_temp);
4860 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4862 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
4863 == INTEGER_INDUC_COND_REDUCTION)
4865 /* Earlier we set the initial value to be zero. Check the result
4866 and if it is zero then replace with the original initial
4867 value. */
4868 tree zero = build_zero_cst (scalar_type);
4869 tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp, zero);
4871 tmp = make_ssa_name (new_scalar_dest);
4872 epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
4873 initial_def, new_temp);
4874 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4875 new_temp = tmp;
4878 scalar_results.safe_push (new_temp);
4880 else
4882 bool reduce_with_shift = have_whole_vector_shift (mode);
4883 int element_bitsize = tree_to_uhwi (bitsize);
4884 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
4885 tree vec_temp;
4887 /* Regardless of whether we have a whole vector shift, if we're
4888 emulating the operation via tree-vect-generic, we don't want
4889 to use it. Only the first round of the reduction is likely
4890 to still be profitable via emulation. */
4891 /* ??? It might be better to emit a reduction tree code here, so that
4892 tree-vect-generic can expand the first round via bit tricks. */
4893 if (!VECTOR_MODE_P (mode))
4894 reduce_with_shift = false;
4895 else
4897 optab optab = optab_for_tree_code (code, vectype, optab_default);
4898 if (optab_handler (optab, mode) == CODE_FOR_nothing)
4899 reduce_with_shift = false;
4902 if (reduce_with_shift && !slp_reduc)
4904 int nelements = vec_size_in_bits / element_bitsize;
4905 unsigned char *sel = XALLOCAVEC (unsigned char, nelements);
4907 int elt_offset;
4909 tree zero_vec = build_zero_cst (vectype);
4910 /*** Case 2: Create:
4911 for (offset = nelements/2; offset >= 1; offset/=2)
4913 Create: va' = vec_shift <va, offset>
4914 Create: va = vop <va, va'>
4915 } */
4917 tree rhs;
4919 if (dump_enabled_p ())
4920 dump_printf_loc (MSG_NOTE, vect_location,
4921 "Reduce using vector shifts\n");
4923 vec_dest = vect_create_destination_var (scalar_dest, vectype);
4924 new_temp = new_phi_result;
4925 for (elt_offset = nelements / 2;
4926 elt_offset >= 1;
4927 elt_offset /= 2)
4929 calc_vec_perm_mask_for_shift (mode, elt_offset, sel);
4930 tree mask = vect_gen_perm_mask_any (vectype, sel);
4931 epilog_stmt = gimple_build_assign (vec_dest, VEC_PERM_EXPR,
4932 new_temp, zero_vec, mask);
4933 new_name = make_ssa_name (vec_dest, epilog_stmt);
4934 gimple_assign_set_lhs (epilog_stmt, new_name);
4935 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4937 epilog_stmt = gimple_build_assign (vec_dest, code, new_name,
4938 new_temp);
4939 new_temp = make_ssa_name (vec_dest, epilog_stmt);
4940 gimple_assign_set_lhs (epilog_stmt, new_temp);
4941 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4944 /* 2.4 Extract the final scalar result. Create:
4945 s_out3 = extract_field <v_out2, bitpos> */
4947 if (dump_enabled_p ())
4948 dump_printf_loc (MSG_NOTE, vect_location,
4949 "extract scalar result\n");
4951 rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp,
4952 bitsize, bitsize_zero_node);
4953 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
4954 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
4955 gimple_assign_set_lhs (epilog_stmt, new_temp);
4956 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4957 scalar_results.safe_push (new_temp);
4959 else
4961 /*** Case 3: Create:
4962 s = extract_field <v_out2, 0>
4963 for (offset = element_size;
4964 offset < vector_size;
4965 offset += element_size;)
4967 Create: s' = extract_field <v_out2, offset>
4968 Create: s = op <s, s'> // For non SLP cases
4969 } */
4971 if (dump_enabled_p ())
4972 dump_printf_loc (MSG_NOTE, vect_location,
4973 "Reduce using scalar code.\n");
4975 vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
4976 FOR_EACH_VEC_ELT (new_phis, i, new_phi)
4978 int bit_offset;
4979 if (gimple_code (new_phi) == GIMPLE_PHI)
4980 vec_temp = PHI_RESULT (new_phi);
4981 else
4982 vec_temp = gimple_assign_lhs (new_phi);
4983 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, bitsize,
4984 bitsize_zero_node);
4985 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
4986 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
4987 gimple_assign_set_lhs (epilog_stmt, new_temp);
4988 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4990 /* In SLP we don't need to apply reduction operation, so we just
4991 collect s' values in SCALAR_RESULTS. */
4992 if (slp_reduc)
4993 scalar_results.safe_push (new_temp);
4995 for (bit_offset = element_bitsize;
4996 bit_offset < vec_size_in_bits;
4997 bit_offset += element_bitsize)
4999 tree bitpos = bitsize_int (bit_offset);
5000 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp,
5001 bitsize, bitpos);
5003 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
5004 new_name = make_ssa_name (new_scalar_dest, epilog_stmt);
5005 gimple_assign_set_lhs (epilog_stmt, new_name);
5006 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5008 if (slp_reduc)
5010 /* In SLP we don't need to apply reduction operation, so
5011 we just collect s' values in SCALAR_RESULTS. */
5012 new_temp = new_name;
5013 scalar_results.safe_push (new_name);
5015 else
5017 epilog_stmt = gimple_build_assign (new_scalar_dest, code,
5018 new_name, new_temp);
5019 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5020 gimple_assign_set_lhs (epilog_stmt, new_temp);
5021 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5026 /* The only case where we need to reduce scalar results in SLP, is
5027 unrolling. If the size of SCALAR_RESULTS is greater than
5028 GROUP_SIZE, we reduce them combining elements modulo
5029 GROUP_SIZE. */
5030 if (slp_reduc)
5032 tree res, first_res, new_res;
5033 gimple *new_stmt;
5035 /* Reduce multiple scalar results in case of SLP unrolling. */
5036 for (j = group_size; scalar_results.iterate (j, &res);
5037 j++)
5039 first_res = scalar_results[j % group_size];
5040 new_stmt = gimple_build_assign (new_scalar_dest, code,
5041 first_res, res);
5042 new_res = make_ssa_name (new_scalar_dest, new_stmt);
5043 gimple_assign_set_lhs (new_stmt, new_res);
5044 gsi_insert_before (&exit_gsi, new_stmt, GSI_SAME_STMT);
5045 scalar_results[j % group_size] = new_res;
5048 else
5049 /* Not SLP - we have one scalar to keep in SCALAR_RESULTS. */
5050 scalar_results.safe_push (new_temp);
5054 vect_finalize_reduction:
5056 if (double_reduc)
5057 loop = loop->inner;
5059 /* 2.5 Adjust the final result by the initial value of the reduction
5060 variable. (When such adjustment is not needed, then
5061 'adjustment_def' is zero). For example, if code is PLUS we create:
5062 new_temp = loop_exit_def + adjustment_def */
5064 if (adjustment_def)
5066 gcc_assert (!slp_reduc);
5067 if (nested_in_vect_loop)
5069 new_phi = new_phis[0];
5070 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) == VECTOR_TYPE);
5071 expr = build2 (code, vectype, PHI_RESULT (new_phi), adjustment_def);
5072 new_dest = vect_create_destination_var (scalar_dest, vectype);
5074 else
5076 new_temp = scalar_results[0];
5077 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
5078 expr = build2 (code, scalar_type, new_temp, adjustment_def);
5079 new_dest = vect_create_destination_var (scalar_dest, scalar_type);
5082 epilog_stmt = gimple_build_assign (new_dest, expr);
5083 new_temp = make_ssa_name (new_dest, epilog_stmt);
5084 gimple_assign_set_lhs (epilog_stmt, new_temp);
5085 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5086 if (nested_in_vect_loop)
5088 set_vinfo_for_stmt (epilog_stmt,
5089 new_stmt_vec_info (epilog_stmt, loop_vinfo));
5090 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (epilog_stmt)) =
5091 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (new_phi));
5093 if (!double_reduc)
5094 scalar_results.quick_push (new_temp);
5095 else
5096 scalar_results[0] = new_temp;
5098 else
5099 scalar_results[0] = new_temp;
5101 new_phis[0] = epilog_stmt;
5104 /* 2.6 Handle the loop-exit phis. Replace the uses of scalar loop-exit
5105 phis with new adjusted scalar results, i.e., replace use <s_out0>
5106 with use <s_out4>.
5108 Transform:
5109 loop_exit:
5110 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
5111 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
5112 v_out2 = reduce <v_out1>
5113 s_out3 = extract_field <v_out2, 0>
5114 s_out4 = adjust_result <s_out3>
5115 use <s_out0>
5116 use <s_out0>
5118 into:
5120 loop_exit:
5121 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
5122 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
5123 v_out2 = reduce <v_out1>
5124 s_out3 = extract_field <v_out2, 0>
5125 s_out4 = adjust_result <s_out3>
5126 use <s_out4>
5127 use <s_out4> */
5130 /* In SLP reduction chain we reduce vector results into one vector if
5131 necessary, hence we set here GROUP_SIZE to 1. SCALAR_DEST is the LHS of
5132 the last stmt in the reduction chain, since we are looking for the loop
5133 exit phi node. */
5134 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
5136 gimple *dest_stmt = SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1];
5137 /* Handle reduction patterns. */
5138 if (STMT_VINFO_RELATED_STMT (vinfo_for_stmt (dest_stmt)))
5139 dest_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (dest_stmt));
5141 scalar_dest = gimple_assign_lhs (dest_stmt);
5142 group_size = 1;
5145 /* In SLP we may have several statements in NEW_PHIS and REDUCTION_PHIS (in
5146 case that GROUP_SIZE is greater than vectorization factor). Therefore, we
5147 need to match SCALAR_RESULTS with corresponding statements. The first
5148 (GROUP_SIZE / number of new vector stmts) scalar results correspond to
5149 the first vector stmt, etc.
5150 (RATIO is equal to (GROUP_SIZE / number of new vector stmts)). */
5151 if (group_size > new_phis.length ())
5153 ratio = group_size / new_phis.length ();
5154 gcc_assert (!(group_size % new_phis.length ()));
5156 else
5157 ratio = 1;
5159 for (k = 0; k < group_size; k++)
5161 if (k % ratio == 0)
5163 epilog_stmt = new_phis[k / ratio];
5164 reduction_phi = reduction_phis[k / ratio];
5165 if (double_reduc)
5166 inner_phi = inner_phis[k / ratio];
5169 if (slp_reduc)
5171 gimple *current_stmt = SLP_TREE_SCALAR_STMTS (slp_node)[k];
5173 orig_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (current_stmt));
5174 /* SLP statements can't participate in patterns. */
5175 gcc_assert (!orig_stmt);
5176 scalar_dest = gimple_assign_lhs (current_stmt);
5179 phis.create (3);
5180 /* Find the loop-closed-use at the loop exit of the original scalar
5181 result. (The reduction result is expected to have two immediate uses -
5182 one at the latch block, and one at the loop exit). */
5183 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
5184 if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p)))
5185 && !is_gimple_debug (USE_STMT (use_p)))
5186 phis.safe_push (USE_STMT (use_p));
5188 /* While we expect to have found an exit_phi because of loop-closed-ssa
5189 form we can end up without one if the scalar cycle is dead. */
5191 FOR_EACH_VEC_ELT (phis, i, exit_phi)
5193 if (outer_loop)
5195 stmt_vec_info exit_phi_vinfo = vinfo_for_stmt (exit_phi);
5196 gphi *vect_phi;
5198 /* FORNOW. Currently not supporting the case that an inner-loop
5199 reduction is not used in the outer-loop (but only outside the
5200 outer-loop), unless it is double reduction. */
5201 gcc_assert ((STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
5202 && !STMT_VINFO_LIVE_P (exit_phi_vinfo))
5203 || double_reduc);
5205 if (double_reduc)
5206 STMT_VINFO_VEC_STMT (exit_phi_vinfo) = inner_phi;
5207 else
5208 STMT_VINFO_VEC_STMT (exit_phi_vinfo) = epilog_stmt;
5209 if (!double_reduc
5210 || STMT_VINFO_DEF_TYPE (exit_phi_vinfo)
5211 != vect_double_reduction_def)
5212 continue;
5214 /* Handle double reduction:
5216 stmt1: s1 = phi <s0, s2> - double reduction phi (outer loop)
5217 stmt2: s3 = phi <s1, s4> - (regular) reduc phi (inner loop)
5218 stmt3: s4 = use (s3) - (regular) reduc stmt (inner loop)
5219 stmt4: s2 = phi <s4> - double reduction stmt (outer loop)
5221 At that point the regular reduction (stmt2 and stmt3) is
5222 already vectorized, as well as the exit phi node, stmt4.
5223 Here we vectorize the phi node of double reduction, stmt1, and
5224 update all relevant statements. */
5226 /* Go through all the uses of s2 to find double reduction phi
5227 node, i.e., stmt1 above. */
5228 orig_name = PHI_RESULT (exit_phi);
5229 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
5231 stmt_vec_info use_stmt_vinfo;
5232 stmt_vec_info new_phi_vinfo;
5233 tree vect_phi_init, preheader_arg, vect_phi_res, init_def;
5234 basic_block bb = gimple_bb (use_stmt);
5235 gimple *use;
5237 /* Check that USE_STMT is really double reduction phi
5238 node. */
5239 if (gimple_code (use_stmt) != GIMPLE_PHI
5240 || gimple_phi_num_args (use_stmt) != 2
5241 || bb->loop_father != outer_loop)
5242 continue;
5243 use_stmt_vinfo = vinfo_for_stmt (use_stmt);
5244 if (!use_stmt_vinfo
5245 || STMT_VINFO_DEF_TYPE (use_stmt_vinfo)
5246 != vect_double_reduction_def)
5247 continue;
5249 /* Create vector phi node for double reduction:
5250 vs1 = phi <vs0, vs2>
5251 vs1 was created previously in this function by a call to
5252 vect_get_vec_def_for_operand and is stored in
5253 vec_initial_def;
5254 vs2 is defined by INNER_PHI, the vectorized EXIT_PHI;
5255 vs0 is created here. */
5257 /* Create vector phi node. */
5258 vect_phi = create_phi_node (vec_initial_def, bb);
5259 new_phi_vinfo = new_stmt_vec_info (vect_phi,
5260 loop_vec_info_for_loop (outer_loop));
5261 set_vinfo_for_stmt (vect_phi, new_phi_vinfo);
5263 /* Create vs0 - initial def of the double reduction phi. */
5264 preheader_arg = PHI_ARG_DEF_FROM_EDGE (use_stmt,
5265 loop_preheader_edge (outer_loop));
5266 init_def = get_initial_def_for_reduction (stmt,
5267 preheader_arg, NULL);
5268 vect_phi_init = vect_init_vector (use_stmt, init_def,
5269 vectype, NULL);
5271 /* Update phi node arguments with vs0 and vs2. */
5272 add_phi_arg (vect_phi, vect_phi_init,
5273 loop_preheader_edge (outer_loop),
5274 UNKNOWN_LOCATION);
5275 add_phi_arg (vect_phi, PHI_RESULT (inner_phi),
5276 loop_latch_edge (outer_loop), UNKNOWN_LOCATION);
5277 if (dump_enabled_p ())
5279 dump_printf_loc (MSG_NOTE, vect_location,
5280 "created double reduction phi node: ");
5281 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, vect_phi, 0);
5284 vect_phi_res = PHI_RESULT (vect_phi);
5286 /* Replace the use, i.e., set the correct vs1 in the regular
5287 reduction phi node. FORNOW, NCOPIES is always 1, so the
5288 loop is redundant. */
5289 use = reduction_phi;
5290 for (j = 0; j < ncopies; j++)
5292 edge pr_edge = loop_preheader_edge (loop);
5293 SET_PHI_ARG_DEF (use, pr_edge->dest_idx, vect_phi_res);
5294 use = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (use));
5300 phis.release ();
5301 if (nested_in_vect_loop)
5303 if (double_reduc)
5304 loop = outer_loop;
5305 else
5306 continue;
5309 phis.create (3);
5310 /* Find the loop-closed-use at the loop exit of the original scalar
5311 result. (The reduction result is expected to have two immediate uses,
5312 one at the latch block, and one at the loop exit). For double
5313 reductions we are looking for exit phis of the outer loop. */
5314 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
5316 if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
5318 if (!is_gimple_debug (USE_STMT (use_p)))
5319 phis.safe_push (USE_STMT (use_p));
5321 else
5323 if (double_reduc && gimple_code (USE_STMT (use_p)) == GIMPLE_PHI)
5325 tree phi_res = PHI_RESULT (USE_STMT (use_p));
5327 FOR_EACH_IMM_USE_FAST (phi_use_p, phi_imm_iter, phi_res)
5329 if (!flow_bb_inside_loop_p (loop,
5330 gimple_bb (USE_STMT (phi_use_p)))
5331 && !is_gimple_debug (USE_STMT (phi_use_p)))
5332 phis.safe_push (USE_STMT (phi_use_p));
5338 FOR_EACH_VEC_ELT (phis, i, exit_phi)
5340 /* Replace the uses: */
5341 orig_name = PHI_RESULT (exit_phi);
5342 scalar_result = scalar_results[k];
5343 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
5344 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
5345 SET_USE (use_p, scalar_result);
5348 phis.release ();
5353 /* Function is_nonwrapping_integer_induction.
5355 Check if STMT (which is part of loop LOOP) both increments and
5356 does not cause overflow. */
5358 static bool
5359 is_nonwrapping_integer_induction (gimple *stmt, struct loop *loop)
5361 stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
5362 tree base = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo);
5363 tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo);
5364 tree lhs_type = TREE_TYPE (gimple_phi_result (stmt));
5365 widest_int ni, max_loop_value, lhs_max;
5366 bool overflow = false;
5368 /* Make sure the loop is integer based. */
5369 if (TREE_CODE (base) != INTEGER_CST
5370 || TREE_CODE (step) != INTEGER_CST)
5371 return false;
5373 /* Check that the induction increments. */
5374 if (tree_int_cst_sgn (step) == -1)
5375 return false;
5377 /* Check that the max size of the loop will not wrap. */
5379 if (TYPE_OVERFLOW_UNDEFINED (lhs_type))
5380 return true;
5382 if (! max_stmt_executions (loop, &ni))
5383 return false;
5385 max_loop_value = wi::mul (wi::to_widest (step), ni, TYPE_SIGN (lhs_type),
5386 &overflow);
5387 if (overflow)
5388 return false;
5390 max_loop_value = wi::add (wi::to_widest (base), max_loop_value,
5391 TYPE_SIGN (lhs_type), &overflow);
5392 if (overflow)
5393 return false;
5395 return (wi::min_precision (max_loop_value, TYPE_SIGN (lhs_type))
5396 <= TYPE_PRECISION (lhs_type));
5399 /* Function vectorizable_reduction.
5401 Check if STMT performs a reduction operation that can be vectorized.
5402 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
5403 stmt to replace it, put it in VEC_STMT, and insert it at GSI.
5404 Return FALSE if not a vectorizable STMT, TRUE otherwise.
5406 This function also handles reduction idioms (patterns) that have been
5407 recognized in advance during vect_pattern_recog. In this case, STMT may be
5408 of this form:
5409 X = pattern_expr (arg0, arg1, ..., X)
5410 and it's STMT_VINFO_RELATED_STMT points to the last stmt in the original
5411 sequence that had been detected and replaced by the pattern-stmt (STMT).
5413 This function also handles reduction of condition expressions, for example:
5414 for (int i = 0; i < N; i++)
5415 if (a[i] < value)
5416 last = a[i];
5417 This is handled by vectorising the loop and creating an additional vector
5418 containing the loop indexes for which "a[i] < value" was true. In the
5419 function epilogue this is reduced to a single max value and then used to
5420 index into the vector of results.
5422 In some cases of reduction patterns, the type of the reduction variable X is
5423 different than the type of the other arguments of STMT.
5424 In such cases, the vectype that is used when transforming STMT into a vector
5425 stmt is different than the vectype that is used to determine the
5426 vectorization factor, because it consists of a different number of elements
5427 than the actual number of elements that are being operated upon in parallel.
5429 For example, consider an accumulation of shorts into an int accumulator.
5430 On some targets it's possible to vectorize this pattern operating on 8
5431 shorts at a time (hence, the vectype for purposes of determining the
5432 vectorization factor should be V8HI); on the other hand, the vectype that
5433 is used to create the vector form is actually V4SI (the type of the result).
5435 Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
5436 indicates what is the actual level of parallelism (V8HI in the example), so
5437 that the right vectorization factor would be derived. This vectype
5438 corresponds to the type of arguments to the reduction stmt, and should *NOT*
5439 be used to create the vectorized stmt. The right vectype for the vectorized
5440 stmt is obtained from the type of the result X:
5441 get_vectype_for_scalar_type (TREE_TYPE (X))
5443 This means that, contrary to "regular" reductions (or "regular" stmts in
5444 general), the following equation:
5445 STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (TREE_TYPE (X))
5446 does *NOT* necessarily hold for reduction patterns. */
5448 bool
5449 vectorizable_reduction (gimple *stmt, gimple_stmt_iterator *gsi,
5450 gimple **vec_stmt, slp_tree slp_node)
5452 tree vec_dest;
5453 tree scalar_dest;
5454 tree loop_vec_def0 = NULL_TREE, loop_vec_def1 = NULL_TREE;
5455 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5456 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
5457 tree vectype_in = NULL_TREE;
5458 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5459 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5460 enum tree_code code, orig_code, epilog_reduc_code;
5461 machine_mode vec_mode;
5462 int op_type;
5463 optab optab, reduc_optab;
5464 tree new_temp = NULL_TREE;
5465 gimple *def_stmt;
5466 enum vect_def_type dt, cond_reduc_dt = vect_unknown_def_type;
5467 gphi *new_phi = NULL;
5468 tree scalar_type;
5469 bool is_simple_use;
5470 gimple *orig_stmt;
5471 stmt_vec_info orig_stmt_info;
5472 tree expr = NULL_TREE;
5473 int i;
5474 int ncopies;
5475 int epilog_copies;
5476 stmt_vec_info prev_stmt_info, prev_phi_info;
5477 bool single_defuse_cycle = false;
5478 tree reduc_def = NULL_TREE;
5479 gimple *new_stmt = NULL;
5480 int j;
5481 tree ops[3];
5482 bool nested_cycle = false, found_nested_cycle_def = false;
5483 gimple *reduc_def_stmt = NULL;
5484 bool double_reduc = false, dummy;
5485 basic_block def_bb;
5486 struct loop * def_stmt_loop, *outer_loop = NULL;
5487 tree def_arg;
5488 gimple *def_arg_stmt;
5489 auto_vec<tree> vec_oprnds0;
5490 auto_vec<tree> vec_oprnds1;
5491 auto_vec<tree> vect_defs;
5492 auto_vec<gimple *> phis;
5493 int vec_num;
5494 tree def0, def1, tem, op1 = NULL_TREE;
5495 bool first_p = true;
5496 tree cr_index_scalar_type = NULL_TREE, cr_index_vector_type = NULL_TREE;
5497 tree cond_reduc_val = NULL_TREE;
5499 /* In case of reduction chain we switch to the first stmt in the chain, but
5500 we don't update STMT_INFO, since only the last stmt is marked as reduction
5501 and has reduction properties. */
5502 if (GROUP_FIRST_ELEMENT (stmt_info)
5503 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
5505 stmt = GROUP_FIRST_ELEMENT (stmt_info);
5506 first_p = false;
5509 if (nested_in_vect_loop_p (loop, stmt))
5511 outer_loop = loop;
5512 loop = loop->inner;
5513 nested_cycle = true;
5516 /* 1. Is vectorizable reduction? */
5517 /* Not supportable if the reduction variable is used in the loop, unless
5518 it's a reduction chain. */
5519 if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer
5520 && !GROUP_FIRST_ELEMENT (stmt_info))
5521 return false;
5523 /* Reductions that are not used even in an enclosing outer-loop,
5524 are expected to be "live" (used out of the loop). */
5525 if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope
5526 && !STMT_VINFO_LIVE_P (stmt_info))
5527 return false;
5529 /* Make sure it was already recognized as a reduction computation. */
5530 if (STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmt)) != vect_reduction_def
5531 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmt)) != vect_nested_cycle)
5532 return false;
5534 /* 2. Has this been recognized as a reduction pattern?
5536 Check if STMT represents a pattern that has been recognized
5537 in earlier analysis stages. For stmts that represent a pattern,
5538 the STMT_VINFO_RELATED_STMT field records the last stmt in
5539 the original sequence that constitutes the pattern. */
5541 orig_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt));
5542 if (orig_stmt)
5544 orig_stmt_info = vinfo_for_stmt (orig_stmt);
5545 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
5546 gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
5549 /* 3. Check the operands of the operation. The first operands are defined
5550 inside the loop body. The last operand is the reduction variable,
5551 which is defined by the loop-header-phi. */
5553 gcc_assert (is_gimple_assign (stmt));
5555 /* Flatten RHS. */
5556 switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
5558 case GIMPLE_SINGLE_RHS:
5559 op_type = TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt));
5560 if (op_type == ternary_op)
5562 tree rhs = gimple_assign_rhs1 (stmt);
5563 ops[0] = TREE_OPERAND (rhs, 0);
5564 ops[1] = TREE_OPERAND (rhs, 1);
5565 ops[2] = TREE_OPERAND (rhs, 2);
5566 code = TREE_CODE (rhs);
5568 else
5569 return false;
5570 break;
5572 case GIMPLE_BINARY_RHS:
5573 code = gimple_assign_rhs_code (stmt);
5574 op_type = TREE_CODE_LENGTH (code);
5575 gcc_assert (op_type == binary_op);
5576 ops[0] = gimple_assign_rhs1 (stmt);
5577 ops[1] = gimple_assign_rhs2 (stmt);
5578 break;
5580 case GIMPLE_TERNARY_RHS:
5581 code = gimple_assign_rhs_code (stmt);
5582 op_type = TREE_CODE_LENGTH (code);
5583 gcc_assert (op_type == ternary_op);
5584 ops[0] = gimple_assign_rhs1 (stmt);
5585 ops[1] = gimple_assign_rhs2 (stmt);
5586 ops[2] = gimple_assign_rhs3 (stmt);
5587 break;
5589 case GIMPLE_UNARY_RHS:
5590 return false;
5592 default:
5593 gcc_unreachable ();
5595 /* The default is that the reduction variable is the last in statement. */
5596 int reduc_index = op_type - 1;
5597 if (code == MINUS_EXPR)
5598 reduc_index = 0;
5600 if (code == COND_EXPR && slp_node)
5601 return false;
5603 scalar_dest = gimple_assign_lhs (stmt);
5604 scalar_type = TREE_TYPE (scalar_dest);
5605 if (!POINTER_TYPE_P (scalar_type) && !INTEGRAL_TYPE_P (scalar_type)
5606 && !SCALAR_FLOAT_TYPE_P (scalar_type))
5607 return false;
5609 /* Do not try to vectorize bit-precision reductions. */
5610 if ((TYPE_PRECISION (scalar_type)
5611 != GET_MODE_PRECISION (TYPE_MODE (scalar_type))))
5612 return false;
5614 /* All uses but the last are expected to be defined in the loop.
5615 The last use is the reduction variable. In case of nested cycle this
5616 assumption is not true: we use reduc_index to record the index of the
5617 reduction variable. */
5618 for (i = 0; i < op_type; i++)
5620 if (i == reduc_index)
5621 continue;
5623 /* The condition of COND_EXPR is checked in vectorizable_condition(). */
5624 if (i == 0 && code == COND_EXPR)
5625 continue;
5627 is_simple_use = vect_is_simple_use (ops[i], loop_vinfo,
5628 &def_stmt, &dt, &tem);
5629 if (!vectype_in)
5630 vectype_in = tem;
5631 gcc_assert (is_simple_use);
5633 if (dt != vect_internal_def
5634 && dt != vect_external_def
5635 && dt != vect_constant_def
5636 && dt != vect_induction_def
5637 && !(dt == vect_nested_cycle && nested_cycle))
5638 return false;
5640 if (dt == vect_nested_cycle)
5642 found_nested_cycle_def = true;
5643 reduc_def_stmt = def_stmt;
5644 reduc_index = i;
5647 if (i == 1 && code == COND_EXPR)
5649 /* Record how value of COND_EXPR is defined. */
5650 if (dt == vect_constant_def)
5652 cond_reduc_dt = dt;
5653 cond_reduc_val = ops[i];
5655 if (dt == vect_induction_def && def_stmt != NULL
5656 && is_nonwrapping_integer_induction (def_stmt, loop))
5657 cond_reduc_dt = dt;
5661 is_simple_use = vect_is_simple_use (ops[reduc_index], loop_vinfo,
5662 &def_stmt, &dt, &tem);
5663 if (!vectype_in)
5664 vectype_in = tem;
5665 gcc_assert (is_simple_use);
5666 if (!found_nested_cycle_def)
5667 reduc_def_stmt = def_stmt;
5669 if (reduc_def_stmt && gimple_code (reduc_def_stmt) != GIMPLE_PHI)
5670 return false;
5672 if (!(dt == vect_reduction_def
5673 || dt == vect_nested_cycle
5674 || ((dt == vect_internal_def || dt == vect_external_def
5675 || dt == vect_constant_def || dt == vect_induction_def)
5676 && nested_cycle && found_nested_cycle_def)))
5678 /* For pattern recognized stmts, orig_stmt might be a reduction,
5679 but some helper statements for the pattern might not, or
5680 might be COND_EXPRs with reduction uses in the condition. */
5681 gcc_assert (orig_stmt);
5682 return false;
5685 enum vect_reduction_type v_reduc_type;
5686 gimple *tmp = vect_is_simple_reduction (loop_vinfo, reduc_def_stmt,
5687 !nested_cycle, &dummy, false,
5688 &v_reduc_type);
5690 STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) = v_reduc_type;
5691 /* If we have a condition reduction, see if we can simplify it further. */
5692 if (v_reduc_type == COND_REDUCTION)
5694 if (cond_reduc_dt == vect_induction_def)
5696 if (dump_enabled_p ())
5697 dump_printf_loc (MSG_NOTE, vect_location,
5698 "condition expression based on "
5699 "integer induction.\n");
5700 STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5701 = INTEGER_INDUC_COND_REDUCTION;
5704 /* Loop peeling modifies initial value of reduction PHI, which
5705 makes the reduction stmt to be transformed different to the
5706 original stmt analyzed. We need to record reduction code for
5707 CONST_COND_REDUCTION type reduction at analyzing stage, thus
5708 it can be used directly at transform stage. */
5709 if (STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info) == MAX_EXPR
5710 || STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info) == MIN_EXPR)
5712 /* Also set the reduction type to CONST_COND_REDUCTION. */
5713 gcc_assert (cond_reduc_dt == vect_constant_def);
5714 STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) = CONST_COND_REDUCTION;
5716 else if (cond_reduc_dt == vect_constant_def)
5718 enum vect_def_type cond_initial_dt;
5719 gimple *def_stmt = SSA_NAME_DEF_STMT (ops[reduc_index]);
5720 tree cond_initial_val
5721 = PHI_ARG_DEF_FROM_EDGE (def_stmt, loop_preheader_edge (loop));
5723 gcc_assert (cond_reduc_val != NULL_TREE);
5724 vect_is_simple_use (cond_initial_val, loop_vinfo,
5725 &def_stmt, &cond_initial_dt);
5726 if (cond_initial_dt == vect_constant_def
5727 && types_compatible_p (TREE_TYPE (cond_initial_val),
5728 TREE_TYPE (cond_reduc_val)))
5730 tree e = fold_build2 (LE_EXPR, boolean_type_node,
5731 cond_initial_val, cond_reduc_val);
5732 if (e && (integer_onep (e) || integer_zerop (e)))
5734 if (dump_enabled_p ())
5735 dump_printf_loc (MSG_NOTE, vect_location,
5736 "condition expression based on "
5737 "compile time constant.\n");
5738 /* Record reduction code at analysis stage. */
5739 STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info)
5740 = integer_onep (e) ? MAX_EXPR : MIN_EXPR;
5741 STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5742 = CONST_COND_REDUCTION;
5748 if (orig_stmt)
5749 gcc_assert (tmp == orig_stmt
5750 || GROUP_FIRST_ELEMENT (vinfo_for_stmt (tmp)) == orig_stmt);
5751 else
5752 /* We changed STMT to be the first stmt in reduction chain, hence we
5753 check that in this case the first element in the chain is STMT. */
5754 gcc_assert (stmt == tmp
5755 || GROUP_FIRST_ELEMENT (vinfo_for_stmt (tmp)) == stmt);
5757 if (STMT_VINFO_LIVE_P (vinfo_for_stmt (reduc_def_stmt)))
5758 return false;
5760 if (slp_node)
5761 ncopies = 1;
5762 else
5763 ncopies = (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
5764 / TYPE_VECTOR_SUBPARTS (vectype_in));
5766 gcc_assert (ncopies >= 1);
5768 vec_mode = TYPE_MODE (vectype_in);
5770 if (code == COND_EXPR)
5772 /* Only call during the analysis stage, otherwise we'll lose
5773 STMT_VINFO_TYPE. */
5774 if (!vec_stmt && !vectorizable_condition (stmt, gsi, NULL,
5775 ops[reduc_index], 0, NULL))
5777 if (dump_enabled_p ())
5778 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5779 "unsupported condition in reduction\n");
5780 return false;
5783 else
5785 /* 4. Supportable by target? */
5787 if (code == LSHIFT_EXPR || code == RSHIFT_EXPR
5788 || code == LROTATE_EXPR || code == RROTATE_EXPR)
5790 /* Shifts and rotates are only supported by vectorizable_shifts,
5791 not vectorizable_reduction. */
5792 if (dump_enabled_p ())
5793 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5794 "unsupported shift or rotation.\n");
5795 return false;
5798 /* 4.1. check support for the operation in the loop */
5799 optab = optab_for_tree_code (code, vectype_in, optab_default);
5800 if (!optab)
5802 if (dump_enabled_p ())
5803 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5804 "no optab.\n");
5806 return false;
5809 if (optab_handler (optab, vec_mode) == CODE_FOR_nothing)
5811 if (dump_enabled_p ())
5812 dump_printf (MSG_NOTE, "op not supported by target.\n");
5814 if (GET_MODE_SIZE (vec_mode) != UNITS_PER_WORD
5815 || LOOP_VINFO_VECT_FACTOR (loop_vinfo)
5816 < vect_min_worthwhile_factor (code))
5817 return false;
5819 if (dump_enabled_p ())
5820 dump_printf (MSG_NOTE, "proceeding using word mode.\n");
5823 /* Worthwhile without SIMD support? */
5824 if (!VECTOR_MODE_P (TYPE_MODE (vectype_in))
5825 && LOOP_VINFO_VECT_FACTOR (loop_vinfo)
5826 < vect_min_worthwhile_factor (code))
5828 if (dump_enabled_p ())
5829 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5830 "not worthwhile without SIMD support.\n");
5832 return false;
5836 /* 4.2. Check support for the epilog operation.
5838 If STMT represents a reduction pattern, then the type of the
5839 reduction variable may be different than the type of the rest
5840 of the arguments. For example, consider the case of accumulation
5841 of shorts into an int accumulator; The original code:
5842 S1: int_a = (int) short_a;
5843 orig_stmt-> S2: int_acc = plus <int_a ,int_acc>;
5845 was replaced with:
5846 STMT: int_acc = widen_sum <short_a, int_acc>
5848 This means that:
5849 1. The tree-code that is used to create the vector operation in the
5850 epilog code (that reduces the partial results) is not the
5851 tree-code of STMT, but is rather the tree-code of the original
5852 stmt from the pattern that STMT is replacing. I.e, in the example
5853 above we want to use 'widen_sum' in the loop, but 'plus' in the
5854 epilog.
5855 2. The type (mode) we use to check available target support
5856 for the vector operation to be created in the *epilog*, is
5857 determined by the type of the reduction variable (in the example
5858 above we'd check this: optab_handler (plus_optab, vect_int_mode])).
5859 However the type (mode) we use to check available target support
5860 for the vector operation to be created *inside the loop*, is
5861 determined by the type of the other arguments to STMT (in the
5862 example we'd check this: optab_handler (widen_sum_optab,
5863 vect_short_mode)).
5865 This is contrary to "regular" reductions, in which the types of all
5866 the arguments are the same as the type of the reduction variable.
5867 For "regular" reductions we can therefore use the same vector type
5868 (and also the same tree-code) when generating the epilog code and
5869 when generating the code inside the loop. */
5871 if (orig_stmt)
5873 /* This is a reduction pattern: get the vectype from the type of the
5874 reduction variable, and get the tree-code from orig_stmt. */
5875 gcc_assert (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5876 == TREE_CODE_REDUCTION);
5877 orig_code = gimple_assign_rhs_code (orig_stmt);
5878 gcc_assert (vectype_out);
5879 vec_mode = TYPE_MODE (vectype_out);
5881 else
5883 /* Regular reduction: use the same vectype and tree-code as used for
5884 the vector code inside the loop can be used for the epilog code. */
5885 orig_code = code;
5887 if (code == MINUS_EXPR)
5888 orig_code = PLUS_EXPR;
5890 /* For simple condition reductions, replace with the actual expression
5891 we want to base our reduction around. */
5892 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == CONST_COND_REDUCTION)
5894 orig_code = STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info);
5895 gcc_assert (orig_code == MAX_EXPR || orig_code == MIN_EXPR);
5897 else if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5898 == INTEGER_INDUC_COND_REDUCTION)
5899 orig_code = MAX_EXPR;
5902 if (nested_cycle)
5904 def_bb = gimple_bb (reduc_def_stmt);
5905 def_stmt_loop = def_bb->loop_father;
5906 def_arg = PHI_ARG_DEF_FROM_EDGE (reduc_def_stmt,
5907 loop_preheader_edge (def_stmt_loop));
5908 if (TREE_CODE (def_arg) == SSA_NAME
5909 && (def_arg_stmt = SSA_NAME_DEF_STMT (def_arg))
5910 && gimple_code (def_arg_stmt) == GIMPLE_PHI
5911 && flow_bb_inside_loop_p (outer_loop, gimple_bb (def_arg_stmt))
5912 && vinfo_for_stmt (def_arg_stmt)
5913 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_arg_stmt))
5914 == vect_double_reduction_def)
5915 double_reduc = true;
5918 epilog_reduc_code = ERROR_MARK;
5920 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) != COND_REDUCTION)
5922 if (reduction_code_for_scalar_code (orig_code, &epilog_reduc_code))
5924 reduc_optab = optab_for_tree_code (epilog_reduc_code, vectype_out,
5925 optab_default);
5926 if (!reduc_optab)
5928 if (dump_enabled_p ())
5929 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5930 "no optab for reduction.\n");
5932 epilog_reduc_code = ERROR_MARK;
5934 else if (optab_handler (reduc_optab, vec_mode) == CODE_FOR_nothing)
5936 if (dump_enabled_p ())
5937 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5938 "reduc op not supported by target.\n");
5940 epilog_reduc_code = ERROR_MARK;
5943 /* When epilog_reduc_code is ERROR_MARK then a reduction will be
5944 generated in the epilog using multiple expressions. This does not
5945 work for condition reductions. */
5946 if (epilog_reduc_code == ERROR_MARK
5947 && (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5948 == INTEGER_INDUC_COND_REDUCTION
5949 || STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5950 == CONST_COND_REDUCTION))
5952 if (dump_enabled_p ())
5953 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5954 "no reduc code for scalar code.\n");
5955 return false;
5958 else
5960 if (!nested_cycle || double_reduc)
5962 if (dump_enabled_p ())
5963 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5964 "no reduc code for scalar code.\n");
5966 return false;
5970 else
5972 int scalar_precision = GET_MODE_PRECISION (TYPE_MODE (scalar_type));
5973 cr_index_scalar_type = make_unsigned_type (scalar_precision);
5974 cr_index_vector_type = build_vector_type
5975 (cr_index_scalar_type, TYPE_VECTOR_SUBPARTS (vectype_out));
5977 epilog_reduc_code = REDUC_MAX_EXPR;
5978 optab = optab_for_tree_code (REDUC_MAX_EXPR, cr_index_vector_type,
5979 optab_default);
5980 if (optab_handler (optab, TYPE_MODE (cr_index_vector_type))
5981 == CODE_FOR_nothing)
5983 if (dump_enabled_p ())
5984 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5985 "reduc max op not supported by target.\n");
5986 return false;
5990 if ((double_reduc
5991 || STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) != TREE_CODE_REDUCTION)
5992 && ncopies > 1)
5994 if (dump_enabled_p ())
5995 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5996 "multiple types in double reduction or condition "
5997 "reduction.\n");
5998 return false;
6001 /* In case of widenning multiplication by a constant, we update the type
6002 of the constant to be the type of the other operand. We check that the
6003 constant fits the type in the pattern recognition pass. */
6004 if (code == DOT_PROD_EXPR
6005 && !types_compatible_p (TREE_TYPE (ops[0]), TREE_TYPE (ops[1])))
6007 if (TREE_CODE (ops[0]) == INTEGER_CST)
6008 ops[0] = fold_convert (TREE_TYPE (ops[1]), ops[0]);
6009 else if (TREE_CODE (ops[1]) == INTEGER_CST)
6010 ops[1] = fold_convert (TREE_TYPE (ops[0]), ops[1]);
6011 else
6013 if (dump_enabled_p ())
6014 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6015 "invalid types in dot-prod\n");
6017 return false;
6021 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION)
6023 widest_int ni;
6025 if (! max_loop_iterations (loop, &ni))
6027 if (dump_enabled_p ())
6028 dump_printf_loc (MSG_NOTE, vect_location,
6029 "loop count not known, cannot create cond "
6030 "reduction.\n");
6031 return false;
6033 /* Convert backedges to iterations. */
6034 ni += 1;
6036 /* The additional index will be the same type as the condition. Check
6037 that the loop can fit into this less one (because we'll use up the
6038 zero slot for when there are no matches). */
6039 tree max_index = TYPE_MAX_VALUE (cr_index_scalar_type);
6040 if (wi::geu_p (ni, wi::to_widest (max_index)))
6042 if (dump_enabled_p ())
6043 dump_printf_loc (MSG_NOTE, vect_location,
6044 "loop size is greater than data size.\n");
6045 return false;
6049 if (!vec_stmt) /* transformation not required. */
6051 if (first_p
6052 && !vect_model_reduction_cost (stmt_info, epilog_reduc_code, ncopies,
6053 reduc_index))
6054 return false;
6055 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
6056 return true;
6059 /** Transform. **/
6061 if (dump_enabled_p ())
6062 dump_printf_loc (MSG_NOTE, vect_location, "transform reduction.\n");
6064 /* FORNOW: Multiple types are not supported for condition. */
6065 if (code == COND_EXPR)
6066 gcc_assert (ncopies == 1);
6068 /* Create the destination vector */
6069 vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
6071 /* In case the vectorization factor (VF) is bigger than the number
6072 of elements that we can fit in a vectype (nunits), we have to generate
6073 more than one vector stmt - i.e - we need to "unroll" the
6074 vector stmt by a factor VF/nunits. For more details see documentation
6075 in vectorizable_operation. */
6077 /* If the reduction is used in an outer loop we need to generate
6078 VF intermediate results, like so (e.g. for ncopies=2):
6079 r0 = phi (init, r0)
6080 r1 = phi (init, r1)
6081 r0 = x0 + r0;
6082 r1 = x1 + r1;
6083 (i.e. we generate VF results in 2 registers).
6084 In this case we have a separate def-use cycle for each copy, and therefore
6085 for each copy we get the vector def for the reduction variable from the
6086 respective phi node created for this copy.
6088 Otherwise (the reduction is unused in the loop nest), we can combine
6089 together intermediate results, like so (e.g. for ncopies=2):
6090 r = phi (init, r)
6091 r = x0 + r;
6092 r = x1 + r;
6093 (i.e. we generate VF/2 results in a single register).
6094 In this case for each copy we get the vector def for the reduction variable
6095 from the vectorized reduction operation generated in the previous iteration.
6098 if (STMT_VINFO_RELEVANT (stmt_info) <= vect_used_only_live)
6100 single_defuse_cycle = true;
6101 epilog_copies = 1;
6103 else
6104 epilog_copies = ncopies;
6106 prev_stmt_info = NULL;
6107 prev_phi_info = NULL;
6108 if (slp_node)
6109 vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
6110 else
6112 vec_num = 1;
6113 vec_oprnds0.create (1);
6114 if (op_type == ternary_op)
6115 vec_oprnds1.create (1);
6118 phis.create (vec_num);
6119 vect_defs.create (vec_num);
6120 if (!slp_node)
6121 vect_defs.quick_push (NULL_TREE);
6123 for (j = 0; j < ncopies; j++)
6125 if (j == 0 || !single_defuse_cycle)
6127 for (i = 0; i < vec_num; i++)
6129 /* Create the reduction-phi that defines the reduction
6130 operand. */
6131 new_phi = create_phi_node (vec_dest, loop->header);
6132 set_vinfo_for_stmt (new_phi,
6133 new_stmt_vec_info (new_phi, loop_vinfo));
6134 if (j == 0 || slp_node)
6135 phis.quick_push (new_phi);
6139 if (code == COND_EXPR)
6141 gcc_assert (!slp_node);
6142 vectorizable_condition (stmt, gsi, vec_stmt,
6143 PHI_RESULT (phis[0]),
6144 reduc_index, NULL);
6145 /* Multiple types are not supported for condition. */
6146 break;
6149 /* Handle uses. */
6150 if (j == 0)
6152 if (slp_node)
6154 /* Get vec defs for all the operands except the reduction index,
6155 ensuring the ordering of the ops in the vector is kept. */
6156 auto_vec<tree, 3> slp_ops;
6157 auto_vec<vec<tree>, 3> vec_defs;
6159 slp_ops.quick_push ((reduc_index == 0) ? NULL : ops[0]);
6160 slp_ops.quick_push ((reduc_index == 1) ? NULL : ops[1]);
6161 if (op_type == ternary_op)
6162 slp_ops.quick_push ((reduc_index == 2) ? NULL : ops[2]);
6164 vect_get_slp_defs (slp_ops, slp_node, &vec_defs, -1);
6166 vec_oprnds0.safe_splice (vec_defs[(reduc_index == 0) ? 1 : 0]);
6167 if (op_type == ternary_op)
6168 vec_oprnds1.safe_splice (vec_defs[(reduc_index == 2) ? 1 : 2]);
6170 else
6172 loop_vec_def0 = vect_get_vec_def_for_operand (ops[!reduc_index],
6173 stmt);
6174 vec_oprnds0.quick_push (loop_vec_def0);
6175 if (op_type == ternary_op)
6177 op1 = (reduc_index == 0) ? ops[2] : ops[1];
6178 loop_vec_def1 = vect_get_vec_def_for_operand (op1, stmt);
6179 vec_oprnds1.quick_push (loop_vec_def1);
6183 else
6185 if (!slp_node)
6187 enum vect_def_type dt;
6188 gimple *dummy_stmt;
6190 vect_is_simple_use (ops[!reduc_index], loop_vinfo,
6191 &dummy_stmt, &dt);
6192 loop_vec_def0 = vect_get_vec_def_for_stmt_copy (dt,
6193 loop_vec_def0);
6194 vec_oprnds0[0] = loop_vec_def0;
6195 if (op_type == ternary_op)
6197 vect_is_simple_use (op1, loop_vinfo, &dummy_stmt, &dt);
6198 loop_vec_def1 = vect_get_vec_def_for_stmt_copy (dt,
6199 loop_vec_def1);
6200 vec_oprnds1[0] = loop_vec_def1;
6204 if (single_defuse_cycle)
6205 reduc_def = gimple_assign_lhs (new_stmt);
6207 STMT_VINFO_RELATED_STMT (prev_phi_info) = new_phi;
6210 FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
6212 if (slp_node)
6213 reduc_def = PHI_RESULT (phis[i]);
6214 else
6216 if (!single_defuse_cycle || j == 0)
6217 reduc_def = PHI_RESULT (new_phi);
6220 def1 = ((op_type == ternary_op)
6221 ? vec_oprnds1[i] : NULL);
6222 if (op_type == binary_op)
6224 if (reduc_index == 0)
6225 expr = build2 (code, vectype_out, reduc_def, def0);
6226 else
6227 expr = build2 (code, vectype_out, def0, reduc_def);
6229 else
6231 if (reduc_index == 0)
6232 expr = build3 (code, vectype_out, reduc_def, def0, def1);
6233 else
6235 if (reduc_index == 1)
6236 expr = build3 (code, vectype_out, def0, reduc_def, def1);
6237 else
6238 expr = build3 (code, vectype_out, def0, def1, reduc_def);
6242 new_stmt = gimple_build_assign (vec_dest, expr);
6243 new_temp = make_ssa_name (vec_dest, new_stmt);
6244 gimple_assign_set_lhs (new_stmt, new_temp);
6245 vect_finish_stmt_generation (stmt, new_stmt, gsi);
6247 if (slp_node)
6249 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
6250 vect_defs.quick_push (new_temp);
6252 else
6253 vect_defs[0] = new_temp;
6256 if (slp_node)
6257 continue;
6259 if (j == 0)
6260 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
6261 else
6262 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
6264 prev_stmt_info = vinfo_for_stmt (new_stmt);
6265 prev_phi_info = vinfo_for_stmt (new_phi);
6268 tree indx_before_incr, indx_after_incr, cond_name = NULL;
6270 /* Finalize the reduction-phi (set its arguments) and create the
6271 epilog reduction code. */
6272 if ((!single_defuse_cycle || code == COND_EXPR) && !slp_node)
6274 new_temp = gimple_assign_lhs (*vec_stmt);
6275 vect_defs[0] = new_temp;
6277 /* For cond reductions we want to create a new vector (INDEX_COND_EXPR)
6278 which is updated with the current index of the loop for every match of
6279 the original loop's cond_expr (VEC_STMT). This results in a vector
6280 containing the last time the condition passed for that vector lane.
6281 The first match will be a 1 to allow 0 to be used for non-matching
6282 indexes. If there are no matches at all then the vector will be all
6283 zeroes. */
6284 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION)
6286 int nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
6287 int k;
6289 gcc_assert (gimple_assign_rhs_code (*vec_stmt) == VEC_COND_EXPR);
6291 /* First we create a simple vector induction variable which starts
6292 with the values {1,2,3,...} (SERIES_VECT) and increments by the
6293 vector size (STEP). */
6295 /* Create a {1,2,3,...} vector. */
6296 tree *vtemp = XALLOCAVEC (tree, nunits_out);
6297 for (k = 0; k < nunits_out; ++k)
6298 vtemp[k] = build_int_cst (cr_index_scalar_type, k + 1);
6299 tree series_vect = build_vector (cr_index_vector_type, vtemp);
6301 /* Create a vector of the step value. */
6302 tree step = build_int_cst (cr_index_scalar_type, nunits_out);
6303 tree vec_step = build_vector_from_val (cr_index_vector_type, step);
6305 /* Create an induction variable. */
6306 gimple_stmt_iterator incr_gsi;
6307 bool insert_after;
6308 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
6309 create_iv (series_vect, vec_step, NULL_TREE, loop, &incr_gsi,
6310 insert_after, &indx_before_incr, &indx_after_incr);
6312 /* Next create a new phi node vector (NEW_PHI_TREE) which starts
6313 filled with zeros (VEC_ZERO). */
6315 /* Create a vector of 0s. */
6316 tree zero = build_zero_cst (cr_index_scalar_type);
6317 tree vec_zero = build_vector_from_val (cr_index_vector_type, zero);
6319 /* Create a vector phi node. */
6320 tree new_phi_tree = make_ssa_name (cr_index_vector_type);
6321 new_phi = create_phi_node (new_phi_tree, loop->header);
6322 set_vinfo_for_stmt (new_phi,
6323 new_stmt_vec_info (new_phi, loop_vinfo));
6324 add_phi_arg (new_phi, vec_zero, loop_preheader_edge (loop),
6325 UNKNOWN_LOCATION);
6327 /* Now take the condition from the loops original cond_expr
6328 (VEC_STMT) and produce a new cond_expr (INDEX_COND_EXPR) which for
6329 every match uses values from the induction variable
6330 (INDEX_BEFORE_INCR) otherwise uses values from the phi node
6331 (NEW_PHI_TREE).
6332 Finally, we update the phi (NEW_PHI_TREE) to take the value of
6333 the new cond_expr (INDEX_COND_EXPR). */
6335 /* Duplicate the condition from vec_stmt. */
6336 tree ccompare = unshare_expr (gimple_assign_rhs1 (*vec_stmt));
6338 /* Create a conditional, where the condition is taken from vec_stmt
6339 (CCOMPARE), then is the induction index (INDEX_BEFORE_INCR) and
6340 else is the phi (NEW_PHI_TREE). */
6341 tree index_cond_expr = build3 (VEC_COND_EXPR, cr_index_vector_type,
6342 ccompare, indx_before_incr,
6343 new_phi_tree);
6344 cond_name = make_ssa_name (cr_index_vector_type);
6345 gimple *index_condition = gimple_build_assign (cond_name,
6346 index_cond_expr);
6347 gsi_insert_before (&incr_gsi, index_condition, GSI_SAME_STMT);
6348 stmt_vec_info index_vec_info = new_stmt_vec_info (index_condition,
6349 loop_vinfo);
6350 STMT_VINFO_VECTYPE (index_vec_info) = cr_index_vector_type;
6351 set_vinfo_for_stmt (index_condition, index_vec_info);
6353 /* Update the phi with the vec cond. */
6354 add_phi_arg (new_phi, cond_name, loop_latch_edge (loop),
6355 UNKNOWN_LOCATION);
6359 vect_create_epilog_for_reduction (vect_defs, stmt, epilog_copies,
6360 epilog_reduc_code, phis, reduc_index,
6361 double_reduc, slp_node, cond_name);
6363 return true;
6366 /* Function vect_min_worthwhile_factor.
6368 For a loop where we could vectorize the operation indicated by CODE,
6369 return the minimum vectorization factor that makes it worthwhile
6370 to use generic vectors. */
6372 vect_min_worthwhile_factor (enum tree_code code)
6374 switch (code)
6376 case PLUS_EXPR:
6377 case MINUS_EXPR:
6378 case NEGATE_EXPR:
6379 return 4;
6381 case BIT_AND_EXPR:
6382 case BIT_IOR_EXPR:
6383 case BIT_XOR_EXPR:
6384 case BIT_NOT_EXPR:
6385 return 2;
6387 default:
6388 return INT_MAX;
6393 /* Function vectorizable_induction
6395 Check if PHI performs an induction computation that can be vectorized.
6396 If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
6397 phi to replace it, put it in VEC_STMT, and add it to the same basic block.
6398 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
6400 bool
6401 vectorizable_induction (gimple *phi,
6402 gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
6403 gimple **vec_stmt)
6405 stmt_vec_info stmt_info = vinfo_for_stmt (phi);
6406 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6407 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
6408 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6409 int nunits = TYPE_VECTOR_SUBPARTS (vectype);
6410 int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
6411 tree vec_def;
6413 gcc_assert (ncopies >= 1);
6414 /* FORNOW. These restrictions should be relaxed. */
6415 if (nested_in_vect_loop_p (loop, phi))
6417 imm_use_iterator imm_iter;
6418 use_operand_p use_p;
6419 gimple *exit_phi;
6420 edge latch_e;
6421 tree loop_arg;
6423 if (ncopies > 1)
6425 if (dump_enabled_p ())
6426 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6427 "multiple types in nested loop.\n");
6428 return false;
6431 exit_phi = NULL;
6432 latch_e = loop_latch_edge (loop->inner);
6433 loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
6434 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
6436 gimple *use_stmt = USE_STMT (use_p);
6437 if (is_gimple_debug (use_stmt))
6438 continue;
6440 if (!flow_bb_inside_loop_p (loop->inner, gimple_bb (use_stmt)))
6442 exit_phi = use_stmt;
6443 break;
6446 if (exit_phi)
6448 stmt_vec_info exit_phi_vinfo = vinfo_for_stmt (exit_phi);
6449 if (!(STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
6450 && !STMT_VINFO_LIVE_P (exit_phi_vinfo)))
6452 if (dump_enabled_p ())
6453 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6454 "inner-loop induction only used outside "
6455 "of the outer vectorized loop.\n");
6456 return false;
6461 if (!STMT_VINFO_RELEVANT_P (stmt_info))
6462 return false;
6464 /* FORNOW: SLP not supported. */
6465 if (STMT_SLP_TYPE (stmt_info))
6466 return false;
6468 gcc_assert (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def);
6470 if (gimple_code (phi) != GIMPLE_PHI)
6471 return false;
6473 if (!vec_stmt) /* transformation not required. */
6475 STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
6476 if (dump_enabled_p ())
6477 dump_printf_loc (MSG_NOTE, vect_location,
6478 "=== vectorizable_induction ===\n");
6479 vect_model_induction_cost (stmt_info, ncopies);
6480 return true;
6483 /** Transform. **/
6485 if (dump_enabled_p ())
6486 dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n");
6488 vec_def = get_initial_def_for_induction (phi);
6489 *vec_stmt = SSA_NAME_DEF_STMT (vec_def);
6490 return true;
6493 /* Function vectorizable_live_operation.
6495 STMT computes a value that is used outside the loop. Check if
6496 it can be supported. */
6498 bool
6499 vectorizable_live_operation (gimple *stmt,
6500 gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
6501 slp_tree slp_node, int slp_index,
6502 gimple **vec_stmt)
6504 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
6505 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
6506 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6507 imm_use_iterator imm_iter;
6508 tree lhs, lhs_type, bitsize, vec_bitsize;
6509 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6510 int nunits = TYPE_VECTOR_SUBPARTS (vectype);
6511 int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
6512 gimple *use_stmt;
6513 auto_vec<tree> vec_oprnds;
6515 gcc_assert (STMT_VINFO_LIVE_P (stmt_info));
6517 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def)
6518 return false;
6520 /* FORNOW. CHECKME. */
6521 if (nested_in_vect_loop_p (loop, stmt))
6522 return false;
6524 /* If STMT is not relevant and it is a simple assignment and its inputs are
6525 invariant then it can remain in place, unvectorized. The original last
6526 scalar value that it computes will be used. */
6527 if (!STMT_VINFO_RELEVANT_P (stmt_info))
6529 gcc_assert (is_simple_and_all_uses_invariant (stmt, loop_vinfo));
6530 if (dump_enabled_p ())
6531 dump_printf_loc (MSG_NOTE, vect_location,
6532 "statement is simple and uses invariant. Leaving in "
6533 "place.\n");
6534 return true;
6537 if (!vec_stmt)
6538 /* No transformation required. */
6539 return true;
6541 /* If stmt has a related stmt, then use that for getting the lhs. */
6542 if (is_pattern_stmt_p (stmt_info))
6543 stmt = STMT_VINFO_RELATED_STMT (stmt_info);
6545 lhs = (is_a <gphi *> (stmt)) ? gimple_phi_result (stmt)
6546 : gimple_get_lhs (stmt);
6547 lhs_type = TREE_TYPE (lhs);
6549 bitsize = TYPE_SIZE (TREE_TYPE (vectype));
6550 vec_bitsize = TYPE_SIZE (vectype);
6552 /* Get the vectorized lhs of STMT and the lane to use (counted in bits). */
6553 tree vec_lhs, bitstart;
6554 if (slp_node)
6556 gcc_assert (slp_index >= 0);
6558 int num_scalar = SLP_TREE_SCALAR_STMTS (slp_node).length ();
6559 int num_vec = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
6561 /* Get the last occurrence of the scalar index from the concatenation of
6562 all the slp vectors. Calculate which slp vector it is and the index
6563 within. */
6564 int pos = (num_vec * nunits) - num_scalar + slp_index;
6565 int vec_entry = pos / nunits;
6566 int vec_index = pos % nunits;
6568 /* Get the correct slp vectorized stmt. */
6569 vec_lhs = gimple_get_lhs (SLP_TREE_VEC_STMTS (slp_node)[vec_entry]);
6571 /* Get entry to use. */
6572 bitstart = build_int_cst (unsigned_type_node, vec_index);
6573 bitstart = int_const_binop (MULT_EXPR, bitsize, bitstart);
6575 else
6577 enum vect_def_type dt = STMT_VINFO_DEF_TYPE (stmt_info);
6578 vec_lhs = vect_get_vec_def_for_operand_1 (stmt, dt);
6580 /* For multiple copies, get the last copy. */
6581 for (int i = 1; i < ncopies; ++i)
6582 vec_lhs = vect_get_vec_def_for_stmt_copy (vect_unknown_def_type,
6583 vec_lhs);
6585 /* Get the last lane in the vector. */
6586 bitstart = int_const_binop (MINUS_EXPR, vec_bitsize, bitsize);
6589 /* Create a new vectorized stmt for the uses of STMT and insert outside the
6590 loop. */
6591 gimple_seq stmts = NULL;
6592 tree new_tree = build3 (BIT_FIELD_REF, TREE_TYPE (vectype), vec_lhs, bitsize,
6593 bitstart);
6594 new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree), &stmts,
6595 true, NULL_TREE);
6596 if (stmts)
6597 gsi_insert_seq_on_edge_immediate (single_exit (loop), stmts);
6599 /* Replace use of lhs with newly computed result. If the use stmt is a
6600 single arg PHI, just replace all uses of PHI result. It's necessary
6601 because lcssa PHI defining lhs may be before newly inserted stmt. */
6602 use_operand_p use_p;
6603 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
6604 if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt))
6605 && !is_gimple_debug (use_stmt))
6607 if (gimple_code (use_stmt) == GIMPLE_PHI
6608 && gimple_phi_num_args (use_stmt) == 1)
6610 replace_uses_by (gimple_phi_result (use_stmt), new_tree);
6612 else
6614 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
6615 SET_USE (use_p, new_tree);
6617 update_stmt (use_stmt);
6620 return true;
6623 /* Kill any debug uses outside LOOP of SSA names defined in STMT. */
6625 static void
6626 vect_loop_kill_debug_uses (struct loop *loop, gimple *stmt)
6628 ssa_op_iter op_iter;
6629 imm_use_iterator imm_iter;
6630 def_operand_p def_p;
6631 gimple *ustmt;
6633 FOR_EACH_PHI_OR_STMT_DEF (def_p, stmt, op_iter, SSA_OP_DEF)
6635 FOR_EACH_IMM_USE_STMT (ustmt, imm_iter, DEF_FROM_PTR (def_p))
6637 basic_block bb;
6639 if (!is_gimple_debug (ustmt))
6640 continue;
6642 bb = gimple_bb (ustmt);
6644 if (!flow_bb_inside_loop_p (loop, bb))
6646 if (gimple_debug_bind_p (ustmt))
6648 if (dump_enabled_p ())
6649 dump_printf_loc (MSG_NOTE, vect_location,
6650 "killing debug use\n");
6652 gimple_debug_bind_reset_value (ustmt);
6653 update_stmt (ustmt);
6655 else
6656 gcc_unreachable ();
6662 /* Given loop represented by LOOP_VINFO, return true if computation of
6663 LOOP_VINFO_NITERS (= LOOP_VINFO_NITERSM1 + 1) doesn't overflow, false
6664 otherwise. */
6666 static bool
6667 loop_niters_no_overflow (loop_vec_info loop_vinfo)
6669 /* Constant case. */
6670 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
6672 tree cst_niters = LOOP_VINFO_NITERS (loop_vinfo);
6673 tree cst_nitersm1 = LOOP_VINFO_NITERSM1 (loop_vinfo);
6675 gcc_assert (TREE_CODE (cst_niters) == INTEGER_CST);
6676 gcc_assert (TREE_CODE (cst_nitersm1) == INTEGER_CST);
6677 if (wi::to_widest (cst_nitersm1) < wi::to_widest (cst_niters))
6678 return true;
6681 widest_int max;
6682 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6683 /* Check the upper bound of loop niters. */
6684 if (get_max_loop_iterations (loop, &max))
6686 tree type = TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo));
6687 signop sgn = TYPE_SIGN (type);
6688 widest_int type_max = widest_int::from (wi::max_value (type), sgn);
6689 if (max < type_max)
6690 return true;
6692 return false;
6695 /* Function vect_transform_loop.
6697 The analysis phase has determined that the loop is vectorizable.
6698 Vectorize the loop - created vectorized stmts to replace the scalar
6699 stmts in the loop, and update the loop exit condition. */
6701 void
6702 vect_transform_loop (loop_vec_info loop_vinfo)
6704 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6705 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
6706 int nbbs = loop->num_nodes;
6707 int i;
6708 tree niters_vector = NULL;
6709 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
6710 bool grouped_store;
6711 bool slp_scheduled = false;
6712 gimple *stmt, *pattern_stmt;
6713 gimple_seq pattern_def_seq = NULL;
6714 gimple_stmt_iterator pattern_def_si = gsi_none ();
6715 bool transform_pattern_stmt = false;
6716 bool check_profitability = false;
6717 int th;
6718 /* Record number of iterations before we started tampering with the profile. */
6719 gcov_type expected_iterations = expected_loop_iterations_unbounded (loop);
6721 if (dump_enabled_p ())
6722 dump_printf_loc (MSG_NOTE, vect_location, "=== vec_transform_loop ===\n");
6724 /* If profile is inprecise, we have chance to fix it up. */
6725 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
6726 expected_iterations = LOOP_VINFO_INT_NITERS (loop_vinfo);
6728 /* Use the more conservative vectorization threshold. If the number
6729 of iterations is constant assume the cost check has been performed
6730 by our caller. If the threshold makes all loops profitable that
6731 run at least the vectorization factor number of times checking
6732 is pointless, too. */
6733 th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
6734 if (th >= LOOP_VINFO_VECT_FACTOR (loop_vinfo) - 1
6735 && !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
6737 if (dump_enabled_p ())
6738 dump_printf_loc (MSG_NOTE, vect_location,
6739 "Profitability threshold is %d loop iterations.\n",
6740 th);
6741 check_profitability = true;
6744 /* Make sure there exists a single-predecessor exit bb. Do this before
6745 versioning. */
6746 edge e = single_exit (loop);
6747 if (! single_pred_p (e->dest))
6749 split_loop_exit_edge (e);
6750 if (dump_enabled_p ())
6751 dump_printf (MSG_NOTE, "split exit edge\n");
6754 /* Version the loop first, if required, so the profitability check
6755 comes first. */
6757 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
6759 vect_loop_versioning (loop_vinfo, th, check_profitability);
6760 check_profitability = false;
6763 /* Make sure there exists a single-predecessor exit bb also on the
6764 scalar loop copy. Do this after versioning but before peeling
6765 so CFG structure is fine for both scalar and if-converted loop
6766 to make slpeel_duplicate_current_defs_from_edges face matched
6767 loop closed PHI nodes on the exit. */
6768 if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))
6770 e = single_exit (LOOP_VINFO_SCALAR_LOOP (loop_vinfo));
6771 if (! single_pred_p (e->dest))
6773 split_loop_exit_edge (e);
6774 if (dump_enabled_p ())
6775 dump_printf (MSG_NOTE, "split exit edge of scalar loop\n");
6779 tree niters = vect_build_loop_niters (loop_vinfo);
6780 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = niters;
6781 tree nitersm1 = unshare_expr (LOOP_VINFO_NITERSM1 (loop_vinfo));
6782 bool niters_no_overflow = loop_niters_no_overflow (loop_vinfo);
6783 vect_do_peeling (loop_vinfo, niters, nitersm1, &niters_vector, th,
6784 check_profitability, niters_no_overflow);
6785 if (niters_vector == NULL_TREE)
6787 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
6788 niters_vector
6789 = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
6790 LOOP_VINFO_INT_NITERS (loop_vinfo) / vf);
6791 else
6792 vect_gen_vector_loop_niters (loop_vinfo, niters, &niters_vector,
6793 niters_no_overflow);
6796 /* 1) Make sure the loop header has exactly two entries
6797 2) Make sure we have a preheader basic block. */
6799 gcc_assert (EDGE_COUNT (loop->header->preds) == 2);
6801 split_edge (loop_preheader_edge (loop));
6803 /* FORNOW: the vectorizer supports only loops which body consist
6804 of one basic block (header + empty latch). When the vectorizer will
6805 support more involved loop forms, the order by which the BBs are
6806 traversed need to be reconsidered. */
6808 for (i = 0; i < nbbs; i++)
6810 basic_block bb = bbs[i];
6811 stmt_vec_info stmt_info;
6813 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
6814 gsi_next (&si))
6816 gphi *phi = si.phi ();
6817 if (dump_enabled_p ())
6819 dump_printf_loc (MSG_NOTE, vect_location,
6820 "------>vectorizing phi: ");
6821 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
6823 stmt_info = vinfo_for_stmt (phi);
6824 if (!stmt_info)
6825 continue;
6827 if (MAY_HAVE_DEBUG_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
6828 vect_loop_kill_debug_uses (loop, phi);
6830 if (!STMT_VINFO_RELEVANT_P (stmt_info)
6831 && !STMT_VINFO_LIVE_P (stmt_info))
6832 continue;
6834 if (STMT_VINFO_VECTYPE (stmt_info)
6835 && (TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info))
6836 != (unsigned HOST_WIDE_INT) vf)
6837 && dump_enabled_p ())
6838 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
6840 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def)
6842 if (dump_enabled_p ())
6843 dump_printf_loc (MSG_NOTE, vect_location, "transform phi.\n");
6844 vect_transform_stmt (phi, NULL, NULL, NULL, NULL);
6848 pattern_stmt = NULL;
6849 for (gimple_stmt_iterator si = gsi_start_bb (bb);
6850 !gsi_end_p (si) || transform_pattern_stmt;)
6852 bool is_store;
6854 if (transform_pattern_stmt)
6855 stmt = pattern_stmt;
6856 else
6858 stmt = gsi_stmt (si);
6859 /* During vectorization remove existing clobber stmts. */
6860 if (gimple_clobber_p (stmt))
6862 unlink_stmt_vdef (stmt);
6863 gsi_remove (&si, true);
6864 release_defs (stmt);
6865 continue;
6869 if (dump_enabled_p ())
6871 dump_printf_loc (MSG_NOTE, vect_location,
6872 "------>vectorizing statement: ");
6873 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
6876 stmt_info = vinfo_for_stmt (stmt);
6878 /* vector stmts created in the outer-loop during vectorization of
6879 stmts in an inner-loop may not have a stmt_info, and do not
6880 need to be vectorized. */
6881 if (!stmt_info)
6883 gsi_next (&si);
6884 continue;
6887 if (MAY_HAVE_DEBUG_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
6888 vect_loop_kill_debug_uses (loop, stmt);
6890 if (!STMT_VINFO_RELEVANT_P (stmt_info)
6891 && !STMT_VINFO_LIVE_P (stmt_info))
6893 if (STMT_VINFO_IN_PATTERN_P (stmt_info)
6894 && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
6895 && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
6896 || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
6898 stmt = pattern_stmt;
6899 stmt_info = vinfo_for_stmt (stmt);
6901 else
6903 gsi_next (&si);
6904 continue;
6907 else if (STMT_VINFO_IN_PATTERN_P (stmt_info)
6908 && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
6909 && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
6910 || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
6911 transform_pattern_stmt = true;
6913 /* If pattern statement has def stmts, vectorize them too. */
6914 if (is_pattern_stmt_p (stmt_info))
6916 if (pattern_def_seq == NULL)
6918 pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
6919 pattern_def_si = gsi_start (pattern_def_seq);
6921 else if (!gsi_end_p (pattern_def_si))
6922 gsi_next (&pattern_def_si);
6923 if (pattern_def_seq != NULL)
6925 gimple *pattern_def_stmt = NULL;
6926 stmt_vec_info pattern_def_stmt_info = NULL;
6928 while (!gsi_end_p (pattern_def_si))
6930 pattern_def_stmt = gsi_stmt (pattern_def_si);
6931 pattern_def_stmt_info
6932 = vinfo_for_stmt (pattern_def_stmt);
6933 if (STMT_VINFO_RELEVANT_P (pattern_def_stmt_info)
6934 || STMT_VINFO_LIVE_P (pattern_def_stmt_info))
6935 break;
6936 gsi_next (&pattern_def_si);
6939 if (!gsi_end_p (pattern_def_si))
6941 if (dump_enabled_p ())
6943 dump_printf_loc (MSG_NOTE, vect_location,
6944 "==> vectorizing pattern def "
6945 "stmt: ");
6946 dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
6947 pattern_def_stmt, 0);
6950 stmt = pattern_def_stmt;
6951 stmt_info = pattern_def_stmt_info;
6953 else
6955 pattern_def_si = gsi_none ();
6956 transform_pattern_stmt = false;
6959 else
6960 transform_pattern_stmt = false;
6963 if (STMT_VINFO_VECTYPE (stmt_info))
6965 unsigned int nunits
6966 = (unsigned int)
6967 TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
6968 if (!STMT_SLP_TYPE (stmt_info)
6969 && nunits != (unsigned int) vf
6970 && dump_enabled_p ())
6971 /* For SLP VF is set according to unrolling factor, and not
6972 to vector size, hence for SLP this print is not valid. */
6973 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
6976 /* SLP. Schedule all the SLP instances when the first SLP stmt is
6977 reached. */
6978 if (STMT_SLP_TYPE (stmt_info))
6980 if (!slp_scheduled)
6982 slp_scheduled = true;
6984 if (dump_enabled_p ())
6985 dump_printf_loc (MSG_NOTE, vect_location,
6986 "=== scheduling SLP instances ===\n");
6988 vect_schedule_slp (loop_vinfo);
6991 /* Hybrid SLP stmts must be vectorized in addition to SLP. */
6992 if (!vinfo_for_stmt (stmt) || PURE_SLP_STMT (stmt_info))
6994 if (!transform_pattern_stmt && gsi_end_p (pattern_def_si))
6996 pattern_def_seq = NULL;
6997 gsi_next (&si);
6999 continue;
7003 /* -------- vectorize statement ------------ */
7004 if (dump_enabled_p ())
7005 dump_printf_loc (MSG_NOTE, vect_location, "transform statement.\n");
7007 grouped_store = false;
7008 is_store = vect_transform_stmt (stmt, &si, &grouped_store, NULL, NULL);
7009 if (is_store)
7011 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
7013 /* Interleaving. If IS_STORE is TRUE, the vectorization of the
7014 interleaving chain was completed - free all the stores in
7015 the chain. */
7016 gsi_next (&si);
7017 vect_remove_stores (GROUP_FIRST_ELEMENT (stmt_info));
7019 else
7021 /* Free the attached stmt_vec_info and remove the stmt. */
7022 gimple *store = gsi_stmt (si);
7023 free_stmt_vec_info (store);
7024 unlink_stmt_vdef (store);
7025 gsi_remove (&si, true);
7026 release_defs (store);
7029 /* Stores can only appear at the end of pattern statements. */
7030 gcc_assert (!transform_pattern_stmt);
7031 pattern_def_seq = NULL;
7033 else if (!transform_pattern_stmt && gsi_end_p (pattern_def_si))
7035 pattern_def_seq = NULL;
7036 gsi_next (&si);
7038 } /* stmts in BB */
7039 } /* BBs in loop */
7041 slpeel_make_loop_iterate_ntimes (loop, niters_vector);
7043 /* Reduce loop iterations by the vectorization factor. */
7044 scale_loop_profile (loop, GCOV_COMPUTE_SCALE (1, vf),
7045 expected_iterations / vf);
7046 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
7048 if (loop->nb_iterations_upper_bound != 0)
7049 loop->nb_iterations_upper_bound = loop->nb_iterations_upper_bound - 1;
7050 if (loop->nb_iterations_likely_upper_bound != 0)
7051 loop->nb_iterations_likely_upper_bound
7052 = loop->nb_iterations_likely_upper_bound - 1;
7054 loop->nb_iterations_upper_bound
7055 = wi::udiv_floor (loop->nb_iterations_upper_bound + 1, vf) - 1;
7056 loop->nb_iterations_likely_upper_bound
7057 = wi::udiv_floor (loop->nb_iterations_likely_upper_bound + 1, vf) - 1;
7059 if (loop->any_estimate)
7061 loop->nb_iterations_estimate
7062 = wi::udiv_floor (loop->nb_iterations_estimate, vf);
7063 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
7064 && loop->nb_iterations_estimate != 0)
7065 loop->nb_iterations_estimate = loop->nb_iterations_estimate - 1;
7068 if (dump_enabled_p ())
7070 dump_printf_loc (MSG_NOTE, vect_location,
7071 "LOOP VECTORIZED\n");
7072 if (loop->inner)
7073 dump_printf_loc (MSG_NOTE, vect_location,
7074 "OUTER LOOP VECTORIZED\n");
7075 dump_printf (MSG_NOTE, "\n");
7078 /* Free SLP instances here because otherwise stmt reference counting
7079 won't work. */
7080 slp_instance instance;
7081 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
7082 vect_free_slp_instance (instance);
7083 LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
7084 /* Clear-up safelen field since its value is invalid after vectorization
7085 since vectorized loop can have loop-carried dependencies. */
7086 loop->safelen = 0;
7089 /* The code below is trying to perform simple optimization - revert
7090 if-conversion for masked stores, i.e. if the mask of a store is zero
7091 do not perform it and all stored value producers also if possible.
7092 For example,
7093 for (i=0; i<n; i++)
7094 if (c[i])
7096 p1[i] += 1;
7097 p2[i] = p3[i] +2;
7099 this transformation will produce the following semi-hammock:
7101 if (!mask__ifc__42.18_165 == { 0, 0, 0, 0, 0, 0, 0, 0 })
7103 vect__11.19_170 = MASK_LOAD (vectp_p1.20_168, 0B, mask__ifc__42.18_165);
7104 vect__12.22_172 = vect__11.19_170 + vect_cst__171;
7105 MASK_STORE (vectp_p1.23_175, 0B, mask__ifc__42.18_165, vect__12.22_172);
7106 vect__18.25_182 = MASK_LOAD (vectp_p3.26_180, 0B, mask__ifc__42.18_165);
7107 vect__19.28_184 = vect__18.25_182 + vect_cst__183;
7108 MASK_STORE (vectp_p2.29_187, 0B, mask__ifc__42.18_165, vect__19.28_184);
7112 void
7113 optimize_mask_stores (struct loop *loop)
7115 basic_block *bbs = get_loop_body (loop);
7116 unsigned nbbs = loop->num_nodes;
7117 unsigned i;
7118 basic_block bb;
7119 gimple_stmt_iterator gsi;
7120 gimple *stmt;
7121 auto_vec<gimple *> worklist;
7123 vect_location = find_loop_location (loop);
7124 /* Pick up all masked stores in loop if any. */
7125 for (i = 0; i < nbbs; i++)
7127 bb = bbs[i];
7128 for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
7129 gsi_next (&gsi))
7131 stmt = gsi_stmt (gsi);
7132 if (gimple_call_internal_p (stmt, IFN_MASK_STORE))
7133 worklist.safe_push (stmt);
7137 free (bbs);
7138 if (worklist.is_empty ())
7139 return;
7141 /* Loop has masked stores. */
7142 while (!worklist.is_empty ())
7144 gimple *last, *last_store;
7145 edge e, efalse;
7146 tree mask;
7147 basic_block store_bb, join_bb;
7148 gimple_stmt_iterator gsi_to;
7149 tree vdef, new_vdef;
7150 gphi *phi;
7151 tree vectype;
7152 tree zero;
7154 last = worklist.pop ();
7155 mask = gimple_call_arg (last, 2);
7156 bb = gimple_bb (last);
7157 /* Create new bb. */
7158 e = split_block (bb, last);
7159 join_bb = e->dest;
7160 store_bb = create_empty_bb (bb);
7161 add_bb_to_loop (store_bb, loop);
7162 e->flags = EDGE_TRUE_VALUE;
7163 efalse = make_edge (bb, store_bb, EDGE_FALSE_VALUE);
7164 /* Put STORE_BB to likely part. */
7165 efalse->probability = PROB_UNLIKELY;
7166 store_bb->frequency = PROB_ALWAYS - EDGE_FREQUENCY (efalse);
7167 make_edge (store_bb, join_bb, EDGE_FALLTHRU);
7168 if (dom_info_available_p (CDI_DOMINATORS))
7169 set_immediate_dominator (CDI_DOMINATORS, store_bb, bb);
7170 if (dump_enabled_p ())
7171 dump_printf_loc (MSG_NOTE, vect_location,
7172 "Create new block %d to sink mask stores.",
7173 store_bb->index);
7174 /* Create vector comparison with boolean result. */
7175 vectype = TREE_TYPE (mask);
7176 zero = build_zero_cst (vectype);
7177 stmt = gimple_build_cond (EQ_EXPR, mask, zero, NULL_TREE, NULL_TREE);
7178 gsi = gsi_last_bb (bb);
7179 gsi_insert_after (&gsi, stmt, GSI_SAME_STMT);
7180 /* Create new PHI node for vdef of the last masked store:
7181 .MEM_2 = VDEF <.MEM_1>
7182 will be converted to
7183 .MEM.3 = VDEF <.MEM_1>
7184 and new PHI node will be created in join bb
7185 .MEM_2 = PHI <.MEM_1, .MEM_3>
7187 vdef = gimple_vdef (last);
7188 new_vdef = make_ssa_name (gimple_vop (cfun), last);
7189 gimple_set_vdef (last, new_vdef);
7190 phi = create_phi_node (vdef, join_bb);
7191 add_phi_arg (phi, new_vdef, EDGE_SUCC (store_bb, 0), UNKNOWN_LOCATION);
7193 /* Put all masked stores with the same mask to STORE_BB if possible. */
7194 while (true)
7196 gimple_stmt_iterator gsi_from;
7197 gimple *stmt1 = NULL;
7199 /* Move masked store to STORE_BB. */
7200 last_store = last;
7201 gsi = gsi_for_stmt (last);
7202 gsi_from = gsi;
7203 /* Shift GSI to the previous stmt for further traversal. */
7204 gsi_prev (&gsi);
7205 gsi_to = gsi_start_bb (store_bb);
7206 gsi_move_before (&gsi_from, &gsi_to);
7207 /* Setup GSI_TO to the non-empty block start. */
7208 gsi_to = gsi_start_bb (store_bb);
7209 if (dump_enabled_p ())
7211 dump_printf_loc (MSG_NOTE, vect_location,
7212 "Move stmt to created bb\n");
7213 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, last, 0);
7215 /* Move all stored value producers if possible. */
7216 while (!gsi_end_p (gsi))
7218 tree lhs;
7219 imm_use_iterator imm_iter;
7220 use_operand_p use_p;
7221 bool res;
7223 /* Skip debug statements. */
7224 if (is_gimple_debug (gsi_stmt (gsi)))
7226 gsi_prev (&gsi);
7227 continue;
7229 stmt1 = gsi_stmt (gsi);
7230 /* Do not consider statements writing to memory or having
7231 volatile operand. */
7232 if (gimple_vdef (stmt1)
7233 || gimple_has_volatile_ops (stmt1))
7234 break;
7235 gsi_from = gsi;
7236 gsi_prev (&gsi);
7237 lhs = gimple_get_lhs (stmt1);
7238 if (!lhs)
7239 break;
7241 /* LHS of vectorized stmt must be SSA_NAME. */
7242 if (TREE_CODE (lhs) != SSA_NAME)
7243 break;
7245 if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
7247 /* Remove dead scalar statement. */
7248 if (has_zero_uses (lhs))
7250 gsi_remove (&gsi_from, true);
7251 continue;
7255 /* Check that LHS does not have uses outside of STORE_BB. */
7256 res = true;
7257 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
7259 gimple *use_stmt;
7260 use_stmt = USE_STMT (use_p);
7261 if (is_gimple_debug (use_stmt))
7262 continue;
7263 if (gimple_bb (use_stmt) != store_bb)
7265 res = false;
7266 break;
7269 if (!res)
7270 break;
7272 if (gimple_vuse (stmt1)
7273 && gimple_vuse (stmt1) != gimple_vuse (last_store))
7274 break;
7276 /* Can move STMT1 to STORE_BB. */
7277 if (dump_enabled_p ())
7279 dump_printf_loc (MSG_NOTE, vect_location,
7280 "Move stmt to created bb\n");
7281 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt1, 0);
7283 gsi_move_before (&gsi_from, &gsi_to);
7284 /* Shift GSI_TO for further insertion. */
7285 gsi_prev (&gsi_to);
7287 /* Put other masked stores with the same mask to STORE_BB. */
7288 if (worklist.is_empty ()
7289 || gimple_call_arg (worklist.last (), 2) != mask
7290 || worklist.last () != stmt1)
7291 break;
7292 last = worklist.pop ();
7294 add_phi_arg (phi, gimple_vuse (last_store), e, UNKNOWN_LOCATION);