[official-gcc.git] / gcc / tree-vect-loop.c
1 /* Loop Vectorization
2 Copyright (C) 2003-2016 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com> and
4 Ira Rosen <irar@il.ibm.com>
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "rtl.h"
28 #include "tree.h"
29 #include "gimple.h"
30 #include "cfghooks.h"
31 #include "tree-pass.h"
32 #include "ssa.h"
33 #include "optabs-tree.h"
34 #include "diagnostic-core.h"
35 #include "fold-const.h"
36 #include "stor-layout.h"
37 #include "cfganal.h"
38 #include "gimplify.h"
39 #include "gimple-iterator.h"
40 #include "gimplify-me.h"
41 #include "tree-ssa-loop-ivopts.h"
42 #include "tree-ssa-loop-manip.h"
43 #include "tree-ssa-loop-niter.h"
44 #include "cfgloop.h"
45 #include "params.h"
46 #include "tree-scalar-evolution.h"
47 #include "tree-vectorizer.h"
48 #include "gimple-fold.h"
49 #include "cgraph.h"
51 /* Loop Vectorization Pass.
53 This pass tries to vectorize loops.
55 For example, the vectorizer transforms the following simple loop:
57 short a[N]; short b[N]; short c[N]; int i;
59 for (i=0; i<N; i++){
60 a[i] = b[i] + c[i];
61 }
63 as if it had been manually vectorized by rewriting the source code into:
65 typedef int __attribute__((mode(V8HI))) v8hi;
66 short a[N]; short b[N]; short c[N]; int i;
67 v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
68 v8hi va, vb, vc;
70 for (i=0; i<N/8; i++){
71 vb = pb[i];
72 vc = pc[i];
73 va = vb + vc;
74 pa[i] = va;
75 }
77 The main entry to this pass is vectorize_loops(), in which
78 the vectorizer applies a set of analyses on a given set of loops,
79 followed by the actual vectorization transformation for the loops that
80 had successfully passed the analysis phase.
81 Throughout this pass we make a distinction between two types of
82 data: scalars (which are represented by SSA_NAMES), and memory references
83 ("data-refs"). These two types of data require different handling both
84 during analysis and transformation. The types of data-refs that the
85 vectorizer currently supports are ARRAY_REFS whose base is an array DECL
86 (not a pointer), and INDIRECT_REFS through pointers; both array and pointer
87 accesses are required to have a simple (consecutive) access pattern.
89 Analysis phase:
90 ===============
91 The driver for the analysis phase is vect_analyze_loop().
92 It applies a set of analyses, some of which rely on the scalar evolution
93 analyzer (scev) developed by Sebastian Pop.
95 During the analysis phase the vectorizer records some information
96 per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
97 loop, as well as general information about the loop as a whole, which is
98 recorded in a "loop_vec_info" struct attached to each loop.
100 Transformation phase:
101 =====================
102 The loop transformation phase scans all the stmts in the loop, and
103 creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
104 the loop that needs to be vectorized. It inserts the vector code sequence
105 just before the scalar stmt S, and records a pointer to the vector code
106 in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
107 attached to S). This pointer will be used for the vectorization of following
108 stmts which use the def of stmt S. Stmt S is removed if it writes to memory;
109 otherwise, we rely on dead code elimination for removing it.
111 For example, say stmt S1 was vectorized into stmt VS1:
113 VS1: vb = px[i];
114 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
115 S2: a = b;
117 To vectorize stmt S2, the vectorizer first finds the stmt that defines
118 the operand 'b' (S1), and gets the relevant vector def 'vb' from the
119 vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)). The
120 resulting sequence would be:
122 VS1: vb = px[i];
123 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
124 VS2: va = vb;
125 S2: a = b; STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2
127 Operands that are not SSA_NAMEs are data-refs that appear in
128 load/store operations (like 'x[i]' in S1), and are handled differently.
130 Target modeling:
131 =================
132 Currently the only target-specific information that is used is the
133 size of the vector (in bytes) - "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".
134 Targets that can support multiple vector sizes will, for now, need to
135 specify a single value for "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".  More
136 flexibility will be added in the future.
138 Since we only vectorize operations whose vector form can be
139 expressed using existing tree codes, to verify that an operation is
140 supported, the vectorizer checks the relevant optab at the relevant
141 machine_mode (e.g., optab_handler (add_optab, V8HImode)).  If
142 the value found is CODE_FOR_nothing, then there's no target support, and
143 we can't vectorize the stmt.
145 For additional information on this project see:
146 http://gcc.gnu.org/projects/tree-ssa/vectorization.html
147 */
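/* Illustrative sketch (added for exposition, not code used by the pass):
   the target-support check described under "Target modeling" above boils
   down to querying the optab for the operation at its machine_mode and
   treating CODE_FOR_nothing as "no target support", roughly:

       if (optab_handler (add_optab, V8HImode) == CODE_FOR_nothing)
         return false;

   add_optab and V8HImode are just the example from the comment; the actual
   checks are performed elsewhere in the vectorizer.  */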
149 static void vect_estimate_min_profitable_iters (loop_vec_info, int *, int *);
151 /* Function vect_determine_vectorization_factor
153 Determine the vectorization factor (VF). VF is the number of data elements
154 that are operated upon in parallel in a single iteration of the vectorized
155 loop.  For example, when vectorizing a loop that operates on 4-byte elements,
156 on a target with a vector size (VS) of 16 bytes, the VF is set to 4, since 4
157 elements can fit in a single vector register.
159 We currently support vectorization of loops in which all types operated upon
160 are of the same size. Therefore this function currently sets VF according to
161 the size of the types operated upon, and fails if there are multiple sizes
162 in the loop.
164 VF is also the factor by which the loop iterations are strip-mined, e.g.:
165 original loop:
166 for (i=0; i<N; i++){
167 a[i] = b[i] + c[i];
168 }
170 vectorized loop:
171 for (i=0; i<N; i+=VF){
172 a[i:VF] = b[i:VF] + c[i:VF];
173 }
174 */
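/* Worked example of the arithmetic above (exposition only): with a 16-byte
   vector size and 4-byte ints, one vector register holds 16/4 = 4 elements,
   so VF = 4 and the strip-mined loop advances i by 4:

       for (i=0; i<N; i+=4)
         a[i:4] = b[i:4] + c[i:4];

   The a[i:4] notation follows the pseudo-code used in the comment above and
   is not real C.  */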
176 static bool
177 vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
179 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
180 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
181 unsigned nbbs = loop->num_nodes;
182 unsigned int vectorization_factor = 0;
183 tree scalar_type;
184 gphi *phi;
185 tree vectype;
186 unsigned int nunits;
187 stmt_vec_info stmt_info;
188 unsigned i;
189 HOST_WIDE_INT dummy;
190 gimple *stmt, *pattern_stmt = NULL;
191 gimple_seq pattern_def_seq = NULL;
192 gimple_stmt_iterator pattern_def_si = gsi_none ();
193 bool analyze_pattern_stmt = false;
194 bool bool_result;
195 auto_vec<stmt_vec_info> mask_producers;
197 if (dump_enabled_p ())
198 dump_printf_loc (MSG_NOTE, vect_location,
199 "=== vect_determine_vectorization_factor ===\n");
201 for (i = 0; i < nbbs; i++)
203 basic_block bb = bbs[i];
205 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
206 gsi_next (&si))
208 phi = si.phi ();
209 stmt_info = vinfo_for_stmt (phi);
210 if (dump_enabled_p ())
212 dump_printf_loc (MSG_NOTE, vect_location, "==> examining phi: ");
213 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
214 dump_printf (MSG_NOTE, "\n");
217 gcc_assert (stmt_info);
219 if (STMT_VINFO_RELEVANT_P (stmt_info)
220 || STMT_VINFO_LIVE_P (stmt_info))
222 gcc_assert (!STMT_VINFO_VECTYPE (stmt_info));
223 scalar_type = TREE_TYPE (PHI_RESULT (phi));
225 if (dump_enabled_p ())
227 dump_printf_loc (MSG_NOTE, vect_location,
228 "get vectype for scalar type: ");
229 dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
230 dump_printf (MSG_NOTE, "\n");
233 vectype = get_vectype_for_scalar_type (scalar_type);
234 if (!vectype)
236 if (dump_enabled_p ())
238 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
239 "not vectorized: unsupported "
240 "data-type ");
241 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
242 scalar_type);
243 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
245 return false;
247 STMT_VINFO_VECTYPE (stmt_info) = vectype;
249 if (dump_enabled_p ())
251 dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
252 dump_generic_expr (MSG_NOTE, TDF_SLIM, vectype);
253 dump_printf (MSG_NOTE, "\n");
256 nunits = TYPE_VECTOR_SUBPARTS (vectype);
257 if (dump_enabled_p ())
258 dump_printf_loc (MSG_NOTE, vect_location, "nunits = %d\n",
259 nunits);
261 if (!vectorization_factor
262 || (nunits > vectorization_factor))
263 vectorization_factor = nunits;
267 for (gimple_stmt_iterator si = gsi_start_bb (bb);
268 !gsi_end_p (si) || analyze_pattern_stmt;)
270 tree vf_vectype;
272 if (analyze_pattern_stmt)
273 stmt = pattern_stmt;
274 else
275 stmt = gsi_stmt (si);
277 stmt_info = vinfo_for_stmt (stmt);
279 if (dump_enabled_p ())
281 dump_printf_loc (MSG_NOTE, vect_location,
282 "==> examining statement: ");
283 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
284 dump_printf (MSG_NOTE, "\n");
287 gcc_assert (stmt_info);
289 /* Skip stmts which do not need to be vectorized. */
290 if ((!STMT_VINFO_RELEVANT_P (stmt_info)
291 && !STMT_VINFO_LIVE_P (stmt_info))
292 || gimple_clobber_p (stmt))
294 if (STMT_VINFO_IN_PATTERN_P (stmt_info)
295 && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
296 && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
297 || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
299 stmt = pattern_stmt;
300 stmt_info = vinfo_for_stmt (pattern_stmt);
301 if (dump_enabled_p ())
303 dump_printf_loc (MSG_NOTE, vect_location,
304 "==> examining pattern statement: ");
305 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
306 dump_printf (MSG_NOTE, "\n");
309 else
311 if (dump_enabled_p ())
312 dump_printf_loc (MSG_NOTE, vect_location, "skip.\n");
313 gsi_next (&si);
314 continue;
317 else if (STMT_VINFO_IN_PATTERN_P (stmt_info)
318 && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
319 && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
320 || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
321 analyze_pattern_stmt = true;
323 /* If a pattern statement has def stmts, analyze them too. */
324 if (is_pattern_stmt_p (stmt_info))
326 if (pattern_def_seq == NULL)
328 pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
329 pattern_def_si = gsi_start (pattern_def_seq);
331 else if (!gsi_end_p (pattern_def_si))
332 gsi_next (&pattern_def_si);
333 if (pattern_def_seq != NULL)
335 gimple *pattern_def_stmt = NULL;
336 stmt_vec_info pattern_def_stmt_info = NULL;
338 while (!gsi_end_p (pattern_def_si))
340 pattern_def_stmt = gsi_stmt (pattern_def_si);
341 pattern_def_stmt_info
342 = vinfo_for_stmt (pattern_def_stmt);
343 if (STMT_VINFO_RELEVANT_P (pattern_def_stmt_info)
344 || STMT_VINFO_LIVE_P (pattern_def_stmt_info))
345 break;
346 gsi_next (&pattern_def_si);
349 if (!gsi_end_p (pattern_def_si))
351 if (dump_enabled_p ())
353 dump_printf_loc (MSG_NOTE, vect_location,
354 "==> examining pattern def stmt: ");
355 dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
356 pattern_def_stmt, 0);
357 dump_printf (MSG_NOTE, "\n");
360 stmt = pattern_def_stmt;
361 stmt_info = pattern_def_stmt_info;
363 else
365 pattern_def_si = gsi_none ();
366 analyze_pattern_stmt = false;
369 else
370 analyze_pattern_stmt = false;
373 if (gimple_get_lhs (stmt) == NULL_TREE
374 /* MASK_STORE has no lhs, but is ok. */
375 && (!is_gimple_call (stmt)
376 || !gimple_call_internal_p (stmt)
377 || gimple_call_internal_fn (stmt) != IFN_MASK_STORE))
379 if (is_gimple_call (stmt))
381 /* Ignore calls with no lhs. These must be calls to
382 #pragma omp simd functions, and what vectorization factor
383 such a call really needs can't be determined until
384 vectorizable_simd_clone_call. */
385 if (!analyze_pattern_stmt && gsi_end_p (pattern_def_si))
387 pattern_def_seq = NULL;
388 gsi_next (&si);
390 continue;
392 if (dump_enabled_p ())
394 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
395 "not vectorized: irregular stmt.");
396 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt,
398 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
400 return false;
403 if (VECTOR_MODE_P (TYPE_MODE (gimple_expr_type (stmt))))
405 if (dump_enabled_p ())
407 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
408 "not vectorized: vector stmt in loop:");
409 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
410 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
412 return false;
415 bool_result = false;
417 if (STMT_VINFO_VECTYPE (stmt_info))
419 /* The only case when a vectype has already been set is for stmts
420 that contain a dataref, or for "pattern-stmts" (stmts
421 generated by the vectorizer to represent/replace a certain
422 idiom). */
423 gcc_assert (STMT_VINFO_DATA_REF (stmt_info)
424 || is_pattern_stmt_p (stmt_info)
425 || !gsi_end_p (pattern_def_si));
426 vectype = STMT_VINFO_VECTYPE (stmt_info);
428 else
430 gcc_assert (!STMT_VINFO_DATA_REF (stmt_info));
431 if (is_gimple_call (stmt)
432 && gimple_call_internal_p (stmt)
433 && gimple_call_internal_fn (stmt) == IFN_MASK_STORE)
434 scalar_type = TREE_TYPE (gimple_call_arg (stmt, 3));
435 else
436 scalar_type = TREE_TYPE (gimple_get_lhs (stmt));
438 /* Bool ops don't participate in vectorization factor
439 computation.  For comparisons, use the compared types to
440 compute a factor. */
441 if (TREE_CODE (scalar_type) == BOOLEAN_TYPE
442 && is_gimple_assign (stmt)
443 && gimple_assign_rhs_code (stmt) != COND_EXPR)
445 if (STMT_VINFO_RELEVANT_P (stmt_info)
446 || STMT_VINFO_LIVE_P (stmt_info))
447 mask_producers.safe_push (stmt_info);
448 bool_result = true;
450 if (gimple_code (stmt) == GIMPLE_ASSIGN
451 && TREE_CODE_CLASS (gimple_assign_rhs_code (stmt))
452 == tcc_comparison
453 && TREE_CODE (TREE_TYPE (gimple_assign_rhs1 (stmt)))
454 != BOOLEAN_TYPE)
455 scalar_type = TREE_TYPE (gimple_assign_rhs1 (stmt));
456 else
458 if (!analyze_pattern_stmt && gsi_end_p (pattern_def_si))
460 pattern_def_seq = NULL;
461 gsi_next (&si);
463 continue;
467 if (dump_enabled_p ())
469 dump_printf_loc (MSG_NOTE, vect_location,
470 "get vectype for scalar type: ");
471 dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
472 dump_printf (MSG_NOTE, "\n");
474 vectype = get_vectype_for_scalar_type (scalar_type);
475 if (!vectype)
477 if (dump_enabled_p ())
479 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
480 "not vectorized: unsupported "
481 "data-type ");
482 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
483 scalar_type);
484 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
486 return false;
489 if (!bool_result)
490 STMT_VINFO_VECTYPE (stmt_info) = vectype;
492 if (dump_enabled_p ())
494 dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
495 dump_generic_expr (MSG_NOTE, TDF_SLIM, vectype);
496 dump_printf (MSG_NOTE, "\n");
500 /* Don't try to compute VF from scalar types if the stmt
501 produces a boolean vector.  Use the result vectype instead. */
502 if (VECTOR_BOOLEAN_TYPE_P (vectype))
503 vf_vectype = vectype;
504 else
506 /* The vectorization factor is determined by the smallest
507 scalar type (or the largest vector size, but we only
508 support one vector size per loop). */
509 if (!bool_result)
510 scalar_type = vect_get_smallest_scalar_type (stmt, &dummy,
511 &dummy);
512 if (dump_enabled_p ())
514 dump_printf_loc (MSG_NOTE, vect_location,
515 "get vectype for scalar type: ");
516 dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
517 dump_printf (MSG_NOTE, "\n");
519 vf_vectype = get_vectype_for_scalar_type (scalar_type);
521 if (!vf_vectype)
523 if (dump_enabled_p ())
525 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
526 "not vectorized: unsupported data-type ");
527 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
528 scalar_type);
529 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
531 return false;
534 if ((GET_MODE_SIZE (TYPE_MODE (vectype))
535 != GET_MODE_SIZE (TYPE_MODE (vf_vectype))))
537 if (dump_enabled_p ())
539 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
540 "not vectorized: different sized vector "
541 "types in statement, ");
542 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
543 vectype);
544 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
545 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
546 vf_vectype);
547 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
549 return false;
552 if (dump_enabled_p ())
554 dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
555 dump_generic_expr (MSG_NOTE, TDF_SLIM, vf_vectype);
556 dump_printf (MSG_NOTE, "\n");
559 nunits = TYPE_VECTOR_SUBPARTS (vf_vectype);
560 if (dump_enabled_p ())
561 dump_printf_loc (MSG_NOTE, vect_location, "nunits = %d\n", nunits);
562 if (!vectorization_factor
563 || (nunits > vectorization_factor))
564 vectorization_factor = nunits;
566 if (!analyze_pattern_stmt && gsi_end_p (pattern_def_si))
568 pattern_def_seq = NULL;
569 gsi_next (&si);
574 /* TODO: Analyze cost.  Decide if it is worthwhile to vectorize. */
575 if (dump_enabled_p ())
576 dump_printf_loc (MSG_NOTE, vect_location, "vectorization factor = %d\n",
577 vectorization_factor);
578 if (vectorization_factor <= 1)
580 if (dump_enabled_p ())
581 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
582 "not vectorized: unsupported data-type\n");
583 return false;
585 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
587 for (i = 0; i < mask_producers.length (); i++)
589 tree mask_type = NULL;
591 stmt = STMT_VINFO_STMT (mask_producers[i]);
593 if (gimple_code (stmt) == GIMPLE_ASSIGN
594 && TREE_CODE_CLASS (gimple_assign_rhs_code (stmt)) == tcc_comparison
595 && TREE_CODE (TREE_TYPE (gimple_assign_rhs1 (stmt))) != BOOLEAN_TYPE)
597 scalar_type = TREE_TYPE (gimple_assign_rhs1 (stmt));
598 mask_type = get_mask_type_for_scalar_type (scalar_type);
600 if (!mask_type)
602 if (dump_enabled_p ())
603 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
604 "not vectorized: unsupported mask\n");
605 return false;
608 else
610 tree rhs;
611 ssa_op_iter iter;
612 gimple *def_stmt;
613 enum vect_def_type dt;
615 FOR_EACH_SSA_TREE_OPERAND (rhs, stmt, iter, SSA_OP_USE)
617 if (!vect_is_simple_use (rhs, mask_producers[i]->vinfo,
618 &def_stmt, &dt, &vectype))
620 if (dump_enabled_p ())
622 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
623 "not vectorized: can't compute mask type "
624 "for statement, ");
625 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt,
627 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
629 return false;
632 /* No vectype probably means external definition.
633 Allow it in case there is another operand from which the
634 mask type can be determined. */
635 if (!vectype)
636 continue;
638 if (!mask_type)
639 mask_type = vectype;
640 else if (TYPE_VECTOR_SUBPARTS (mask_type)
641 != TYPE_VECTOR_SUBPARTS (vectype))
643 if (dump_enabled_p ())
645 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
646 "not vectorized: different sized masks "
647 "types in statement, ");
648 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
649 mask_type);
650 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
651 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
652 vectype);
653 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
655 return false;
657 else if (VECTOR_BOOLEAN_TYPE_P (mask_type)
658 != VECTOR_BOOLEAN_TYPE_P (vectype))
660 if (dump_enabled_p ())
662 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
663 "not vectorized: mixed mask and "
664 "nonmask vector types in statement, ");
665 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
666 mask_type);
667 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
668 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
669 vectype);
670 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
672 return false;
676 /* We may compare boolean values loaded as a vector of integers.
677 Fix mask_type in such a case. */
678 if (mask_type
679 && !VECTOR_BOOLEAN_TYPE_P (mask_type)
680 && gimple_code (stmt) == GIMPLE_ASSIGN
681 && TREE_CODE_CLASS (gimple_assign_rhs_code (stmt)) == tcc_comparison)
682 mask_type = build_same_sized_truth_vector_type (mask_type);
685 /* No mask_type should mean loop invariant predicate.
686 This is probably a subject for optimization in
687 if-conversion. */
688 if (!mask_type)
690 if (dump_enabled_p ())
692 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
693 "not vectorized: can't compute mask type "
694 "for statement, ");
695 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt,
697 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
699 return false;
702 STMT_VINFO_VECTYPE (mask_producers[i]) = mask_type;
705 return true;
709 /* Function vect_is_simple_iv_evolution.
711 FORNOW: A simple evolution of an induction variable in the loop is
712 considered a polynomial evolution. */
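/* For exposition (a hedged sketch, not code used here): for the counter in
   "for (i = 0; i < n; i++)" scev reports the evolution {0, +, 1}_loop, so
   INIT becomes 0 and STEP becomes 1.  A step that is itself a chrec, i.e. a
   polynomial evolution of degree >= 2, is rejected as not "simple" by the
   checks below.  */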
714 static bool
715 vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init,
716 tree * step)
718 tree init_expr;
719 tree step_expr;
720 tree evolution_part = evolution_part_in_loop_num (access_fn, loop_nb);
721 basic_block bb;
723 /* When there is no evolution in this loop, the evolution function
724 is not "simple". */
725 if (evolution_part == NULL_TREE)
726 return false;
728 /* When the evolution is a polynomial of degree >= 2
729 the evolution function is not "simple". */
730 if (tree_is_chrec (evolution_part))
731 return false;
733 step_expr = evolution_part;
734 init_expr = unshare_expr (initial_condition_in_loop_num (access_fn, loop_nb));
736 if (dump_enabled_p ())
738 dump_printf_loc (MSG_NOTE, vect_location, "step: ");
739 dump_generic_expr (MSG_NOTE, TDF_SLIM, step_expr);
740 dump_printf (MSG_NOTE, ", init: ");
741 dump_generic_expr (MSG_NOTE, TDF_SLIM, init_expr);
742 dump_printf (MSG_NOTE, "\n");
745 *init = init_expr;
746 *step = step_expr;
748 if (TREE_CODE (step_expr) != INTEGER_CST
749 && (TREE_CODE (step_expr) != SSA_NAME
750 || ((bb = gimple_bb (SSA_NAME_DEF_STMT (step_expr)))
751 && flow_bb_inside_loop_p (get_loop (cfun, loop_nb), bb))
752 || (!INTEGRAL_TYPE_P (TREE_TYPE (step_expr))
753 && (!SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr))
754 || !flag_associative_math)))
755 && (TREE_CODE (step_expr) != REAL_CST
756 || !flag_associative_math))
758 if (dump_enabled_p ())
759 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
760 "step unknown.\n");
761 return false;
764 return true;
767 /* Function vect_analyze_scalar_cycles_1.
769 Examine the cross iteration def-use cycles of scalar variables
770 in LOOP. LOOP_VINFO represents the loop that is now being
771 considered for vectorization (can be LOOP, or an outer-loop
772 enclosing LOOP). */
774 static void
775 vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, struct loop *loop)
777 basic_block bb = loop->header;
778 tree init, step;
779 auto_vec<gimple *, 64> worklist;
780 gphi_iterator gsi;
781 bool double_reduc;
783 if (dump_enabled_p ())
784 dump_printf_loc (MSG_NOTE, vect_location,
785 "=== vect_analyze_scalar_cycles ===\n");
787 /* First - identify all inductions. Reduction detection assumes that all the
788 inductions have been identified, therefore, this order must not be
789 changed. */
790 for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
792 gphi *phi = gsi.phi ();
793 tree access_fn = NULL;
794 tree def = PHI_RESULT (phi);
795 stmt_vec_info stmt_vinfo = vinfo_for_stmt (phi);
797 if (dump_enabled_p ())
799 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: ");
800 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
801 dump_printf (MSG_NOTE, "\n");
804 /* Skip virtual phi's. The data dependences that are associated with
805 virtual defs/uses (i.e., memory accesses) are analyzed elsewhere. */
806 if (virtual_operand_p (def))
807 continue;
809 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_unknown_def_type;
811 /* Analyze the evolution function. */
812 access_fn = analyze_scalar_evolution (loop, def);
813 if (access_fn)
815 STRIP_NOPS (access_fn);
816 if (dump_enabled_p ())
818 dump_printf_loc (MSG_NOTE, vect_location,
819 "Access function of PHI: ");
820 dump_generic_expr (MSG_NOTE, TDF_SLIM, access_fn);
821 dump_printf (MSG_NOTE, "\n");
823 STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
824 = initial_condition_in_loop_num (access_fn, loop->num);
825 STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo)
826 = evolution_part_in_loop_num (access_fn, loop->num);
829 if (!access_fn
830 || !vect_is_simple_iv_evolution (loop->num, access_fn, &init, &step)
831 || (LOOP_VINFO_LOOP (loop_vinfo) != loop
832 && TREE_CODE (step) != INTEGER_CST))
834 worklist.safe_push (phi);
835 continue;
838 gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
839 != NULL_TREE);
840 gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo) != NULL_TREE);
842 if (dump_enabled_p ())
843 dump_printf_loc (MSG_NOTE, vect_location, "Detected induction.\n");
844 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_induction_def;
848 /* Second - identify all reductions and nested cycles. */
849 while (worklist.length () > 0)
851 gimple *phi = worklist.pop ();
852 tree def = PHI_RESULT (phi);
853 stmt_vec_info stmt_vinfo = vinfo_for_stmt (phi);
854 gimple *reduc_stmt;
855 bool nested_cycle;
857 if (dump_enabled_p ())
859 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: ");
860 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
861 dump_printf (MSG_NOTE, "\n");
864 gcc_assert (!virtual_operand_p (def)
865 && STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_unknown_def_type);
867 nested_cycle = (loop != LOOP_VINFO_LOOP (loop_vinfo));
868 reduc_stmt = vect_force_simple_reduction (loop_vinfo, phi, !nested_cycle,
869 &double_reduc, false);
870 if (reduc_stmt)
872 if (double_reduc)
874 if (dump_enabled_p ())
875 dump_printf_loc (MSG_NOTE, vect_location,
876 "Detected double reduction.\n");
878 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_double_reduction_def;
879 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
880 vect_double_reduction_def;
882 else
884 if (nested_cycle)
886 if (dump_enabled_p ())
887 dump_printf_loc (MSG_NOTE, vect_location,
888 "Detected vectorizable nested cycle.\n");
890 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_nested_cycle;
891 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
892 vect_nested_cycle;
894 else
896 if (dump_enabled_p ())
897 dump_printf_loc (MSG_NOTE, vect_location,
898 "Detected reduction.\n");
900 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def;
901 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
902 vect_reduction_def;
903 /* Store the reduction cycles for possible vectorization in
904 loop-aware SLP. */
905 LOOP_VINFO_REDUCTIONS (loop_vinfo).safe_push (reduc_stmt);
909 else
910 if (dump_enabled_p ())
911 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
912 "Unknown def-use cycle pattern.\n");
917 /* Function vect_analyze_scalar_cycles.
919 Examine the cross iteration def-use cycles of scalar variables, by
920 analyzing the loop-header PHIs of scalar variables. Classify each
921 cycle as one of the following: invariant, induction, reduction, unknown.
922 We do that for the loop represented by LOOP_VINFO, and also for its
923 inner-loop, if one exists.
924 Examples for scalar cycles:
926 Example1: reduction:
928 loop1:
929 for (i=0; i<N; i++)
930 sum += a[i];
932 Example2: induction:
934 loop2:
935 for (i=0; i<N; i++)
936 a[i] = i; */
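/* For exposition (hedged sketch, not part of the pass): what is actually
   inspected are the loop-header PHIs of such variables.  For the reduction
   in Example1 the header PHI and its cycle look roughly like

       sum_1 = PHI <sum_init (preheader), sum_2 (latch)>
       ...
       sum_2 = sum_1 + a[i];

   and sum_1 is classified as vect_reduction_def, while the PHI for "i" in
   Example2 is classified as vect_induction_def.  */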
938 static void
939 vect_analyze_scalar_cycles (loop_vec_info loop_vinfo)
941 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
943 vect_analyze_scalar_cycles_1 (loop_vinfo, loop);
945 /* When vectorizing an outer-loop, the inner-loop is executed sequentially.
946 Reductions in such an inner-loop therefore have different properties than
947 the reductions in the nest that gets vectorized:
948 1. When vectorized, they are executed in the same order as in the original
949 scalar loop, so we can't change the order of computation when
950 vectorizing them.
951 2. FIXME: Inner-loop reductions can be used in the inner-loop, so the
952 current checks are too strict. */
954 if (loop->inner)
955 vect_analyze_scalar_cycles_1 (loop_vinfo, loop->inner);
958 /* Transfer group and reduction information from STMT to its pattern stmt. */
960 static void
961 vect_fixup_reduc_chain (gimple *stmt)
963 gimple *firstp = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt));
964 gimple *stmtp;
965 gcc_assert (!GROUP_FIRST_ELEMENT (vinfo_for_stmt (firstp))
966 && GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)));
967 GROUP_SIZE (vinfo_for_stmt (firstp)) = GROUP_SIZE (vinfo_for_stmt (stmt));
970 stmtp = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt));
971 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmtp)) = firstp;
972 stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmt));
973 if (stmt)
974 GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmtp))
975 = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt));
977 while (stmt);
978 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmtp)) = vect_reduction_def;
981 /* Fixup scalar cycles that now have their stmts detected as patterns. */
983 static void
984 vect_fixup_scalar_cycles_with_patterns (loop_vec_info loop_vinfo)
986 gimple *first;
987 unsigned i;
989 FOR_EACH_VEC_ELT (LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo), i, first)
990 if (STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (first)))
992 gimple *next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (first));
993 while (next)
995 if (! STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (next)))
996 break;
997 next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
999 /* If not all stmts in the chain are patterns, try to handle
1000 the chain without patterns. */
1001 if (! next)
1003 vect_fixup_reduc_chain (first);
1004 LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo)[i]
1005 = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (first));
1010 /* Function vect_get_loop_niters.
1012 Determine how many iterations the loop is executed and place it
1013 in NUMBER_OF_ITERATIONS. Place the number of latch iterations
1014 in NUMBER_OF_ITERATIONSM1.
1016 Return the loop exit condition. */
1019 static gcond *
1020 vect_get_loop_niters (struct loop *loop, tree *number_of_iterations,
1021 tree *number_of_iterationsm1)
1023 tree niters;
1025 if (dump_enabled_p ())
1026 dump_printf_loc (MSG_NOTE, vect_location,
1027 "=== get_loop_niters ===\n");
1029 niters = number_of_latch_executions (loop);
1030 *number_of_iterationsm1 = niters;
1032 /* We want the number of loop header executions which is the number
1033 of latch executions plus one.
1034 ??? For UINT_MAX latch executions this number overflows to zero
1035 for loops like do { n++; } while (n != 0); */
1036 if (niters && !chrec_contains_undetermined (niters))
1037 niters = fold_build2 (PLUS_EXPR, TREE_TYPE (niters), unshare_expr (niters),
1038 build_int_cst (TREE_TYPE (niters), 1));
1039 *number_of_iterations = niters;
1041 return get_loop_exit_condition (loop);
1045 /* Function bb_in_loop_p
1047 Used as predicate for dfs order traversal of the loop bbs. */
1049 static bool
1050 bb_in_loop_p (const_basic_block bb, const void *data)
1052 const struct loop *const loop = (const struct loop *)data;
1053 if (flow_bb_inside_loop_p (loop, bb))
1054 return true;
1055 return false;
1059 /* Function new_loop_vec_info.
1061 Create and initialize a new loop_vec_info struct for LOOP, as well as
1062 stmt_vec_info structs for all the stmts in LOOP. */
1064 static loop_vec_info
1065 new_loop_vec_info (struct loop *loop)
1067 loop_vec_info res;
1068 basic_block *bbs;
1069 gimple_stmt_iterator si;
1070 unsigned int i, nbbs;
1072 res = (loop_vec_info) xcalloc (1, sizeof (struct _loop_vec_info));
1073 res->kind = vec_info::loop;
1074 LOOP_VINFO_LOOP (res) = loop;
1076 bbs = get_loop_body (loop);
1078 /* Create/Update stmt_info for all stmts in the loop. */
1079 for (i = 0; i < loop->num_nodes; i++)
1081 basic_block bb = bbs[i];
1083 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
1085 gimple *phi = gsi_stmt (si);
1086 gimple_set_uid (phi, 0);
1087 set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, res));
1090 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1092 gimple *stmt = gsi_stmt (si);
1093 gimple_set_uid (stmt, 0);
1094 set_vinfo_for_stmt (stmt, new_stmt_vec_info (stmt, res));
1098 /* CHECKME: We want to visit all BBs before their successors (except for
1099 latch blocks, for which this assertion wouldn't hold). In the simple
1100 case of the loop forms we allow, a dfs order of the BBs would be the same
1101 as reversed postorder traversal, so we are safe. */
1103 free (bbs);
1104 bbs = XCNEWVEC (basic_block, loop->num_nodes);
1105 nbbs = dfs_enumerate_from (loop->header, 0, bb_in_loop_p,
1106 bbs, loop->num_nodes, loop);
1107 gcc_assert (nbbs == loop->num_nodes);
1109 LOOP_VINFO_BBS (res) = bbs;
1110 LOOP_VINFO_NITERSM1 (res) = NULL;
1111 LOOP_VINFO_NITERS (res) = NULL;
1112 LOOP_VINFO_NITERS_UNCHANGED (res) = NULL;
1113 LOOP_VINFO_COST_MODEL_THRESHOLD (res) = 0;
1114 LOOP_VINFO_VECTORIZABLE_P (res) = 0;
1115 LOOP_VINFO_PEELING_FOR_ALIGNMENT (res) = 0;
1116 LOOP_VINFO_VECT_FACTOR (res) = 0;
1117 LOOP_VINFO_LOOP_NEST (res) = vNULL;
1118 LOOP_VINFO_DATAREFS (res) = vNULL;
1119 LOOP_VINFO_DDRS (res) = vNULL;
1120 LOOP_VINFO_UNALIGNED_DR (res) = NULL;
1121 LOOP_VINFO_MAY_MISALIGN_STMTS (res) = vNULL;
1122 LOOP_VINFO_MAY_ALIAS_DDRS (res) = vNULL;
1123 LOOP_VINFO_GROUPED_STORES (res) = vNULL;
1124 LOOP_VINFO_REDUCTIONS (res) = vNULL;
1125 LOOP_VINFO_REDUCTION_CHAINS (res) = vNULL;
1126 LOOP_VINFO_SLP_INSTANCES (res) = vNULL;
1127 LOOP_VINFO_SLP_UNROLLING_FACTOR (res) = 1;
1128 LOOP_VINFO_TARGET_COST_DATA (res) = init_cost (loop);
1129 LOOP_VINFO_PEELING_FOR_GAPS (res) = false;
1130 LOOP_VINFO_PEELING_FOR_NITER (res) = false;
1131 LOOP_VINFO_OPERANDS_SWAPPED (res) = false;
1133 return res;
1137 /* Function destroy_loop_vec_info.
1139 Free LOOP_VINFO struct, as well as all the stmt_vec_info structs of all the
1140 stmts in the loop. */
1142 void
1143 destroy_loop_vec_info (loop_vec_info loop_vinfo, bool clean_stmts)
1145 struct loop *loop;
1146 basic_block *bbs;
1147 int nbbs;
1148 gimple_stmt_iterator si;
1149 int j;
1150 vec<slp_instance> slp_instances;
1151 slp_instance instance;
1152 bool swapped;
1154 if (!loop_vinfo)
1155 return;
1157 loop = LOOP_VINFO_LOOP (loop_vinfo);
1159 bbs = LOOP_VINFO_BBS (loop_vinfo);
1160 nbbs = clean_stmts ? loop->num_nodes : 0;
1161 swapped = LOOP_VINFO_OPERANDS_SWAPPED (loop_vinfo);
1163 for (j = 0; j < nbbs; j++)
1165 basic_block bb = bbs[j];
1166 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
1167 free_stmt_vec_info (gsi_stmt (si));
1169 for (si = gsi_start_bb (bb); !gsi_end_p (si); )
1171 gimple *stmt = gsi_stmt (si);
1173 /* We may have broken canonical form by moving a constant
1174 into RHS1 of a commutative op. Fix such occurrences. */
1175 if (swapped && is_gimple_assign (stmt))
1177 enum tree_code code = gimple_assign_rhs_code (stmt);
1179 if ((code == PLUS_EXPR
1180 || code == POINTER_PLUS_EXPR
1181 || code == MULT_EXPR)
1182 && CONSTANT_CLASS_P (gimple_assign_rhs1 (stmt)))
1183 swap_ssa_operands (stmt,
1184 gimple_assign_rhs1_ptr (stmt),
1185 gimple_assign_rhs2_ptr (stmt));
1188 /* Free stmt_vec_info. */
1189 free_stmt_vec_info (stmt);
1190 gsi_next (&si);
1194 free (LOOP_VINFO_BBS (loop_vinfo));
1195 vect_destroy_datarefs (loop_vinfo);
1196 free_dependence_relations (LOOP_VINFO_DDRS (loop_vinfo));
1197 LOOP_VINFO_LOOP_NEST (loop_vinfo).release ();
1198 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).release ();
1199 LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).release ();
1200 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).release ();
1201 slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
1202 FOR_EACH_VEC_ELT (slp_instances, j, instance)
1203 vect_free_slp_instance (instance);
1205 LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
1206 LOOP_VINFO_GROUPED_STORES (loop_vinfo).release ();
1207 LOOP_VINFO_REDUCTIONS (loop_vinfo).release ();
1208 LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).release ();
1210 destroy_cost_data (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo));
1211 loop_vinfo->scalar_cost_vec.release ();
1213 free (loop_vinfo);
1214 loop->aux = NULL;
1218 /* Calculate the cost of one scalar iteration of the loop. */
1219 static void
1220 vect_compute_single_scalar_iteration_cost (loop_vec_info loop_vinfo)
1222 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1223 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1224 int nbbs = loop->num_nodes, factor, scalar_single_iter_cost = 0;
1225 int innerloop_iters, i;
1227 /* Count statements in scalar loop. Using this as scalar cost for a single
1228 iteration for now.
1230 TODO: Add outer loop support.
1232 TODO: Consider assigning different costs to different scalar
1233 statements. */
1235 /* FORNOW. */
1236 innerloop_iters = 1;
1237 if (loop->inner)
1238 innerloop_iters = 50; /* FIXME */
1240 for (i = 0; i < nbbs; i++)
1242 gimple_stmt_iterator si;
1243 basic_block bb = bbs[i];
1245 if (bb->loop_father == loop->inner)
1246 factor = innerloop_iters;
1247 else
1248 factor = 1;
1250 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1252 gimple *stmt = gsi_stmt (si);
1253 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1255 if (!is_gimple_assign (stmt) && !is_gimple_call (stmt))
1256 continue;
1258 /* Skip stmts that are not vectorized inside the loop. */
1259 if (stmt_info
1260 && !STMT_VINFO_RELEVANT_P (stmt_info)
1261 && (!STMT_VINFO_LIVE_P (stmt_info)
1262 || !VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1263 && !STMT_VINFO_IN_PATTERN_P (stmt_info))
1264 continue;
1266 vect_cost_for_stmt kind;
1267 if (STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt)))
1269 if (DR_IS_READ (STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt))))
1270 kind = scalar_load;
1271 else
1272 kind = scalar_store;
1274 else
1275 kind = scalar_stmt;
1277 scalar_single_iter_cost
1278 += record_stmt_cost (&LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1279 factor, kind, NULL, 0, vect_prologue);
1282 LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo)
1283 = scalar_single_iter_cost;
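/* Worked example (illustrative only, assuming a target that charges one
   unit per scalar statement): a single-bb loop body with one load, one add
   and one store is recorded as one scalar_load, one scalar_stmt and one
   scalar_store, each with factor 1, giving a single scalar iteration cost
   of 3 units.  Statements in an inner loop would instead be counted with
   the FIXME factor of 50 above.  */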
1287 /* Function vect_analyze_loop_form_1.
1289 Verify that certain CFG restrictions hold, including:
1290 - the loop has a pre-header
1291 - the loop has a single entry and exit
1292 - the loop exit condition is simple enough, and the number of iterations
1293 can be analyzed (a countable loop). */
1295 bool
1296 vect_analyze_loop_form_1 (struct loop *loop, gcond **loop_cond,
1297 tree *number_of_iterationsm1,
1298 tree *number_of_iterations, gcond **inner_loop_cond)
1300 if (dump_enabled_p ())
1301 dump_printf_loc (MSG_NOTE, vect_location,
1302 "=== vect_analyze_loop_form ===\n");
1304 /* Different restrictions apply when we are considering an inner-most loop,
1305 vs. an outer (nested) loop.
1306 (FORNOW. May want to relax some of these restrictions in the future). */
1308 if (!loop->inner)
1310 /* Inner-most loop. We currently require that the number of BBs is
1311 exactly 2 (the header and latch). Vectorizable inner-most loops
1312 look like this:
1314 (pre-header)
1316 header <--------+
1317 | | |
1318 | +--> latch --+
1320 (exit-bb) */
1322 if (loop->num_nodes != 2)
1324 if (dump_enabled_p ())
1325 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1326 "not vectorized: control flow in loop.\n");
1327 return false;
1330 if (empty_block_p (loop->header))
1332 if (dump_enabled_p ())
1333 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1334 "not vectorized: empty loop.\n");
1335 return false;
1338 else
1340 struct loop *innerloop = loop->inner;
1341 edge entryedge;
1343 /* Nested loop. We currently require that the loop is doubly-nested,
1344 contains a single inner loop, and the number of BBs is exactly 5.
1345 Vectorizable outer-loops look like this:
1347 (pre-header)
1349 header <---+
1351 inner-loop |
1353 tail ------+
1355 (exit-bb)
1357 The inner-loop has the properties expected of inner-most loops
1358 as described above. */
1360 if ((loop->inner)->inner || (loop->inner)->next)
1362 if (dump_enabled_p ())
1363 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1364 "not vectorized: multiple nested loops.\n");
1365 return false;
1368 if (loop->num_nodes != 5)
1370 if (dump_enabled_p ())
1371 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1372 "not vectorized: control flow in loop.\n");
1373 return false;
1376 entryedge = loop_preheader_edge (innerloop);
1377 if (entryedge->src != loop->header
1378 || !single_exit (innerloop)
1379 || single_exit (innerloop)->dest != EDGE_PRED (loop->latch, 0)->src)
1381 if (dump_enabled_p ())
1382 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1383 "not vectorized: unsupported outerloop form.\n");
1384 return false;
1387 /* Analyze the inner-loop. */
1388 tree inner_niterm1, inner_niter;
1389 if (! vect_analyze_loop_form_1 (loop->inner, inner_loop_cond,
1390 &inner_niterm1, &inner_niter, NULL))
1392 if (dump_enabled_p ())
1393 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1394 "not vectorized: Bad inner loop.\n");
1395 return false;
1398 if (!expr_invariant_in_loop_p (loop, inner_niter))
1400 if (dump_enabled_p ())
1401 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1402 "not vectorized: inner-loop count not"
1403 " invariant.\n");
1404 return false;
1407 if (dump_enabled_p ())
1408 dump_printf_loc (MSG_NOTE, vect_location,
1409 "Considering outer-loop vectorization.\n");
1412 if (!single_exit (loop)
1413 || EDGE_COUNT (loop->header->preds) != 2)
1415 if (dump_enabled_p ())
1417 if (!single_exit (loop))
1418 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1419 "not vectorized: multiple exits.\n");
1420 else if (EDGE_COUNT (loop->header->preds) != 2)
1421 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1422 "not vectorized: too many incoming edges.\n");
1424 return false;
1427 /* We assume that the loop exit condition is at the end of the loop, i.e.,
1428 that the loop is represented as a do-while (with a proper if-guard
1429 before the loop if needed), where the loop header contains all the
1430 executable statements, and the latch is empty. */
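/* Illustrative sketch of that shape (exposition only):

       if (n > 0)
         {
           i = 0;
           do
             {
               a[i] = b[i] + c[i];
               i++;
             }
           while (i < n);
         }

   i.e. the exit test is the last statement of the loop header and the latch
   carries no statements.  */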
1431 if (!empty_block_p (loop->latch)
1432 || !gimple_seq_empty_p (phi_nodes (loop->latch)))
1434 if (dump_enabled_p ())
1435 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1436 "not vectorized: latch block not empty.\n");
1437 return false;
1440 /* Make sure there exists a single-predecessor exit bb: */
1441 if (!single_pred_p (single_exit (loop)->dest))
1443 edge e = single_exit (loop);
1444 if (!(e->flags & EDGE_ABNORMAL))
1446 split_loop_exit_edge (e);
1447 if (dump_enabled_p ())
1448 dump_printf (MSG_NOTE, "split exit edge.\n");
1450 else
1452 if (dump_enabled_p ())
1453 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1454 "not vectorized: abnormal loop exit edge.\n");
1455 return false;
1459 *loop_cond = vect_get_loop_niters (loop, number_of_iterations,
1460 number_of_iterationsm1);
1461 if (!*loop_cond)
1463 if (dump_enabled_p ())
1464 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1465 "not vectorized: complicated exit condition.\n");
1466 return false;
1469 if (!*number_of_iterations
1470 || chrec_contains_undetermined (*number_of_iterations))
1472 if (dump_enabled_p ())
1473 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1474 "not vectorized: number of iterations cannot be "
1475 "computed.\n");
1476 return false;
1479 if (integer_zerop (*number_of_iterations))
1481 if (dump_enabled_p ())
1482 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1483 "not vectorized: number of iterations = 0.\n");
1484 return false;
1487 return true;
1490 /* Analyze LOOP form and return a loop_vec_info if it is of suitable form. */
1492 loop_vec_info
1493 vect_analyze_loop_form (struct loop *loop)
1495 tree number_of_iterations, number_of_iterationsm1;
1496 gcond *loop_cond, *inner_loop_cond = NULL;
1498 if (! vect_analyze_loop_form_1 (loop, &loop_cond, &number_of_iterationsm1,
1499 &number_of_iterations, &inner_loop_cond))
1500 return NULL;
1502 loop_vec_info loop_vinfo = new_loop_vec_info (loop);
1503 LOOP_VINFO_NITERSM1 (loop_vinfo) = number_of_iterationsm1;
1504 LOOP_VINFO_NITERS (loop_vinfo) = number_of_iterations;
1505 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = number_of_iterations;
1507 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1509 if (dump_enabled_p ())
1511 dump_printf_loc (MSG_NOTE, vect_location,
1512 "Symbolic number of iterations is ");
1513 dump_generic_expr (MSG_NOTE, TDF_DETAILS, number_of_iterations);
1514 dump_printf (MSG_NOTE, "\n");
1518 STMT_VINFO_TYPE (vinfo_for_stmt (loop_cond)) = loop_exit_ctrl_vec_info_type;
1519 if (inner_loop_cond)
1520 STMT_VINFO_TYPE (vinfo_for_stmt (inner_loop_cond))
1521 = loop_exit_ctrl_vec_info_type;
1523 gcc_assert (!loop->aux);
1524 loop->aux = loop_vinfo;
1525 return loop_vinfo;
1530 /* Scan the loop stmts and, depending on whether there are any (non-)SLP
1531 statements, update the vectorization factor. */
1533 static void
1534 vect_update_vf_for_slp (loop_vec_info loop_vinfo)
1536 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1537 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1538 int nbbs = loop->num_nodes;
1539 unsigned int vectorization_factor;
1540 int i;
1542 if (dump_enabled_p ())
1543 dump_printf_loc (MSG_NOTE, vect_location,
1544 "=== vect_update_vf_for_slp ===\n");
1546 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1547 gcc_assert (vectorization_factor != 0);
1549 /* If all the stmts in the loop can be SLPed, we perform only SLP, and the
1550 vectorization factor of the loop is the unrolling factor required by
1551 the SLP instances.  If that unrolling factor is 1, we say that we
1552 perform pure SLP on the loop - cross-iteration parallelism is not
1553 exploited. */
1554 bool only_slp_in_loop = true;
1555 for (i = 0; i < nbbs; i++)
1557 basic_block bb = bbs[i];
1558 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1559 gsi_next (&si))
1561 gimple *stmt = gsi_stmt (si);
1562 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1563 if (STMT_VINFO_IN_PATTERN_P (stmt_info)
1564 && STMT_VINFO_RELATED_STMT (stmt_info))
1566 stmt = STMT_VINFO_RELATED_STMT (stmt_info);
1567 stmt_info = vinfo_for_stmt (stmt);
1569 if ((STMT_VINFO_RELEVANT_P (stmt_info)
1570 || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1571 && !PURE_SLP_STMT (stmt_info))
1572 /* STMT needs both SLP and loop-based vectorization. */
1573 only_slp_in_loop = false;
1577 if (only_slp_in_loop)
1578 vectorization_factor = LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo);
1579 else
1580 vectorization_factor
1581 = least_common_multiple (vectorization_factor,
1582 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo));
1584 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
1585 if (dump_enabled_p ())
1586 dump_printf_loc (MSG_NOTE, vect_location,
1587 "Updating vectorization factor to %d\n",
1588 vectorization_factor);
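/* For exposition (hedged example): if the loop-based vectorization factor
   was 8 but the SLP instances only require an unrolling factor of 2, a
   hybrid loop keeps VF = least_common_multiple (8, 2) = 8, whereas a loop
   in which every statement is purely SLPed drops its VF to 2.  */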
1591 /* Function vect_analyze_loop_operations.
1593 Scan the loop stmts and make sure they are all vectorizable. */
1595 static bool
1596 vect_analyze_loop_operations (loop_vec_info loop_vinfo)
1598 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1599 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1600 int nbbs = loop->num_nodes;
1601 int i;
1602 stmt_vec_info stmt_info;
1603 bool need_to_vectorize = false;
1604 bool ok;
1606 if (dump_enabled_p ())
1607 dump_printf_loc (MSG_NOTE, vect_location,
1608 "=== vect_analyze_loop_operations ===\n");
1610 for (i = 0; i < nbbs; i++)
1612 basic_block bb = bbs[i];
1614 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
1615 gsi_next (&si))
1617 gphi *phi = si.phi ();
1618 ok = true;
1620 stmt_info = vinfo_for_stmt (phi);
1621 if (dump_enabled_p ())
1623 dump_printf_loc (MSG_NOTE, vect_location, "examining phi: ");
1624 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
1625 dump_printf (MSG_NOTE, "\n");
1627 if (virtual_operand_p (gimple_phi_result (phi)))
1628 continue;
1630 /* Inner-loop loop-closed exit phi in outer-loop vectorization
1631 (i.e., a phi in the tail of the outer-loop). */
1632 if (! is_loop_header_bb_p (bb))
1634 /* FORNOW: we currently don't support the case that these phis
1635 are not used in the outerloop (unless it is a double reduction,
1636 i.e., this phi is vect_reduction_def), because this case
1637 requires us to actually do something here. */
1638 if ((!STMT_VINFO_RELEVANT_P (stmt_info)
1639 || STMT_VINFO_LIVE_P (stmt_info))
1640 && STMT_VINFO_DEF_TYPE (stmt_info)
1641 != vect_double_reduction_def)
1643 if (dump_enabled_p ())
1644 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1645 "Unsupported loop-closed phi in "
1646 "outer-loop.\n");
1647 return false;
1650 /* If PHI is used in the outer loop, we check that its operand
1651 is defined in the inner loop. */
1652 if (STMT_VINFO_RELEVANT_P (stmt_info))
1654 tree phi_op;
1655 gimple *op_def_stmt;
1657 if (gimple_phi_num_args (phi) != 1)
1658 return false;
1660 phi_op = PHI_ARG_DEF (phi, 0);
1661 if (TREE_CODE (phi_op) != SSA_NAME)
1662 return false;
1664 op_def_stmt = SSA_NAME_DEF_STMT (phi_op);
1665 if (gimple_nop_p (op_def_stmt)
1666 || !flow_bb_inside_loop_p (loop, gimple_bb (op_def_stmt))
1667 || !vinfo_for_stmt (op_def_stmt))
1668 return false;
1670 if (STMT_VINFO_RELEVANT (vinfo_for_stmt (op_def_stmt))
1671 != vect_used_in_outer
1672 && STMT_VINFO_RELEVANT (vinfo_for_stmt (op_def_stmt))
1673 != vect_used_in_outer_by_reduction)
1674 return false;
1677 continue;
1680 gcc_assert (stmt_info);
1682 if (STMT_VINFO_LIVE_P (stmt_info))
1684 /* FORNOW: not yet supported. */
1685 if (dump_enabled_p ())
1686 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1687 "not vectorized: value used after loop.\n");
1688 return false;
1691 if (STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope
1692 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
1694 /* A scalar-dependence cycle that we don't support. */
1695 if (dump_enabled_p ())
1696 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1697 "not vectorized: scalar dependence cycle.\n");
1698 return false;
1701 if (STMT_VINFO_RELEVANT_P (stmt_info))
1703 need_to_vectorize = true;
1704 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def)
1705 ok = vectorizable_induction (phi, NULL, NULL);
1708 if (!ok)
1710 if (dump_enabled_p ())
1712 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1713 "not vectorized: relevant phi not "
1714 "supported: ");
1715 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, phi, 0);
1716 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
1718 return false;
1722 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1723 gsi_next (&si))
1725 gimple *stmt = gsi_stmt (si);
1726 if (!gimple_clobber_p (stmt)
1727 && !vect_analyze_stmt (stmt, &need_to_vectorize, NULL))
1728 return false;
1730 } /* bbs */
1732 /* All operations in the loop are either irrelevant (deal with loop
1733 control, or dead), or only used outside the loop and can be moved
1734 out of the loop (e.g. invariants, inductions). The loop can be
1735 optimized away by scalar optimizations. We're better off not
1736 touching this loop. */
1737 if (!need_to_vectorize)
1739 if (dump_enabled_p ())
1740 dump_printf_loc (MSG_NOTE, vect_location,
1741 "All the computation can be taken out of the loop.\n");
1742 if (dump_enabled_p ())
1743 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1744 "not vectorized: redundant loop. no profit to "
1745 "vectorize.\n");
1746 return false;
1749 return true;
1753 /* Function vect_analyze_loop_2.
1755 Apply a set of analyses on LOOP, and create a loop_vec_info struct
1756 for it. The different analyses will record information in the
1757 loop_vec_info struct. */
1758 static bool
1759 vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal)
1761 bool ok;
1762 int max_vf = MAX_VECTORIZATION_FACTOR;
1763 int min_vf = 2;
1764 unsigned int n_stmts = 0;
1766 /* The first group of checks is independent of the vector size. */
1767 fatal = true;
1769 /* Find all data references in the loop (which correspond to vdefs/vuses)
1770 and analyze their evolution in the loop. */
1772 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1774 loop_p loop = LOOP_VINFO_LOOP (loop_vinfo);
1775 if (!find_loop_nest (loop, &LOOP_VINFO_LOOP_NEST (loop_vinfo)))
1777 if (dump_enabled_p ())
1778 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1779 "not vectorized: loop nest containing two "
1780 "or more consecutive inner loops cannot be "
1781 "vectorized\n");
1782 return false;
1785 for (unsigned i = 0; i < loop->num_nodes; i++)
1786 for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
1787 !gsi_end_p (gsi); gsi_next (&gsi))
1789 gimple *stmt = gsi_stmt (gsi);
1790 if (is_gimple_debug (stmt))
1791 continue;
1792 ++n_stmts;
1793 if (!find_data_references_in_stmt (loop, stmt,
1794 &LOOP_VINFO_DATAREFS (loop_vinfo)))
1796 if (is_gimple_call (stmt) && loop->safelen)
1798 tree fndecl = gimple_call_fndecl (stmt), op;
1799 if (fndecl != NULL_TREE)
1801 cgraph_node *node = cgraph_node::get (fndecl);
1802 if (node != NULL && node->simd_clones != NULL)
1804 unsigned int j, n = gimple_call_num_args (stmt);
1805 for (j = 0; j < n; j++)
1807 op = gimple_call_arg (stmt, j);
1808 if (DECL_P (op)
1809 || (REFERENCE_CLASS_P (op)
1810 && get_base_address (op)))
1811 break;
1813 op = gimple_call_lhs (stmt);
1814 /* Ignore #pragma omp declare simd functions
1815 if they don't have data references in the
1816 call stmt itself. */
1817 if (j == n
1818 && !(op
1819 && (DECL_P (op)
1820 || (REFERENCE_CLASS_P (op)
1821 && get_base_address (op)))))
1822 continue;
1826 if (dump_enabled_p ())
1827 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1828 "not vectorized: loop contains function "
1829 "calls or data references that cannot "
1830 "be analyzed\n");
1831 return false;
1835 /* Analyze the data references and also adjust the minimal
1836 vectorization factor according to the loads and stores. */
1838 ok = vect_analyze_data_refs (loop_vinfo, &min_vf);
1839 if (!ok)
1841 if (dump_enabled_p ())
1842 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1843 "bad data references.\n");
1844 return false;
1847 /* Classify all cross-iteration scalar data-flow cycles.
1848 Cross-iteration cycles caused by virtual phis are analyzed separately. */
1849 vect_analyze_scalar_cycles (loop_vinfo);
1851 vect_pattern_recog (loop_vinfo);
1853 vect_fixup_scalar_cycles_with_patterns (loop_vinfo);
1855 /* Analyze the access patterns of the data-refs in the loop (consecutive,
1856 complex, etc.). FORNOW: Only handle consecutive access pattern. */
1858 ok = vect_analyze_data_ref_accesses (loop_vinfo);
1859 if (!ok)
1861 if (dump_enabled_p ())
1862 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1863 "bad data access.\n");
1864 return false;
1867 /* Data-flow analysis to detect stmts that do not need to be vectorized. */
1869 ok = vect_mark_stmts_to_be_vectorized (loop_vinfo);
1870 if (!ok)
1872 if (dump_enabled_p ())
1873 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1874 "unexpected pattern.\n");
1875 return false;
1878 /* While the rest of the analysis below depends on it in some way. */
1879 fatal = false;
1881 /* Analyze data dependences between the data-refs in the loop
1882 and adjust the maximum vectorization factor according to
1883 the dependences.
1884 FORNOW: fail at the first data dependence that we encounter. */
1886 ok = vect_analyze_data_ref_dependences (loop_vinfo, &max_vf);
1887 if (!ok
1888 || max_vf < min_vf)
1890 if (dump_enabled_p ())
1891 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1892 "bad data dependence.\n");
1893 return false;
1896 ok = vect_determine_vectorization_factor (loop_vinfo);
1897 if (!ok)
1899 if (dump_enabled_p ())
1900 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1901 "can't determine vectorization factor.\n");
1902 return false;
1904 if (max_vf < LOOP_VINFO_VECT_FACTOR (loop_vinfo))
1906 if (dump_enabled_p ())
1907 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1908 "bad data dependence.\n");
1909 return false;
1912 /* Compute the scalar iteration cost. */
1913 vect_compute_single_scalar_iteration_cost (loop_vinfo);
1915 int saved_vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1916 HOST_WIDE_INT estimated_niter;
1917 unsigned th;
1918 int min_scalar_loop_bound;
1920 /* Check the SLP opportunities in the loop, analyze and build SLP trees. */
1921 ok = vect_analyze_slp (loop_vinfo, n_stmts);
1922 if (!ok)
1923 return false;
1925 /* If there are any SLP instances mark them as pure_slp. */
1926 bool slp = vect_make_slp_decision (loop_vinfo);
1927 if (slp)
1929 /* Find stmts that need to be both vectorized and SLPed. */
1930 vect_detect_hybrid_slp (loop_vinfo);
1932 /* Update the vectorization factor based on the SLP decision. */
1933 vect_update_vf_for_slp (loop_vinfo);
1936 /* This is the point where we can re-start analysis with SLP forced off. */
1937 start_over:
1939 /* Now the vectorization factor is final. */
1940 unsigned vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1941 gcc_assert (vectorization_factor != 0);
1943 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && dump_enabled_p ())
1944 dump_printf_loc (MSG_NOTE, vect_location,
1945 "vectorization_factor = %d, niters = "
1946 HOST_WIDE_INT_PRINT_DEC "\n", vectorization_factor,
1947 LOOP_VINFO_INT_NITERS (loop_vinfo));
1949 HOST_WIDE_INT max_niter
1950 = max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
1951 if ((LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1952 && (LOOP_VINFO_INT_NITERS (loop_vinfo) < vectorization_factor))
1953 || (max_niter != -1
1954 && (unsigned HOST_WIDE_INT) max_niter < vectorization_factor))
1956 if (dump_enabled_p ())
1957 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1958 "not vectorized: iteration count smaller than "
1959 "vectorization factor.\n");
1960 return false;
1963 /* Analyze the alignment of the data-refs in the loop.
1964 Fail if a data reference is found that cannot be vectorized. */
1966 ok = vect_analyze_data_refs_alignment (loop_vinfo);
1967 if (!ok)
1969 if (dump_enabled_p ())
1970 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1971 "bad data alignment.\n");
1972 return false;
1975 /* Prune the list of ddrs to be tested at run-time by versioning for alias.
1976 It is important to call pruning after vect_analyze_data_ref_accesses,
1977 since we use grouping information gathered by interleaving analysis. */
1978 ok = vect_prune_runtime_alias_test_list (loop_vinfo);
1979 if (!ok)
1981 if (dump_enabled_p ())
1982 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1983 "number of versioning for alias "
1984 "run-time tests exceeds %d "
1985 "(--param vect-max-version-for-alias-checks)\n",
1986 PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS));
1987 return false;
1990 /* This pass will decide on using loop versioning and/or loop peeling in
1991 order to enhance the alignment of data references in the loop. */
1992 ok = vect_enhance_data_refs_alignment (loop_vinfo);
1993 if (!ok)
1995 if (dump_enabled_p ())
1996 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1997 "bad data alignment.\n");
1998 return false;
2001 if (slp)
2003 /* Analyze operations in the SLP instances. Note this may
2004 remove unsupported SLP instances which makes the above
2005 SLP kind detection invalid. */
2006 unsigned old_size = LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length ();
2007 vect_slp_analyze_operations (LOOP_VINFO_SLP_INSTANCES (loop_vinfo),
2008 LOOP_VINFO_TARGET_COST_DATA (loop_vinfo));
2009 if (LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length () != old_size)
2010 goto again;
2013 /* Scan all the remaining operations in the loop that are not subject
2014 to SLP and make sure they are vectorizable. */
2015 ok = vect_analyze_loop_operations (loop_vinfo);
2016 if (!ok)
2018 if (dump_enabled_p ())
2019 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2020 "bad operation or unsupported loop bound.\n");
2021 return false;
2024 /* Analyze cost. Decide if worthwhile to vectorize. */
2025 int min_profitable_estimate, min_profitable_iters;
2026 vect_estimate_min_profitable_iters (loop_vinfo, &min_profitable_iters,
2027 &min_profitable_estimate);
2029 if (min_profitable_iters < 0)
2031 if (dump_enabled_p ())
2032 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2033 "not vectorized: vectorization not profitable.\n");
2034 if (dump_enabled_p ())
2035 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2036 "not vectorized: vector version will never be "
2037 "profitable.\n");
2038 goto again;
2041 min_scalar_loop_bound = ((PARAM_VALUE (PARAM_MIN_VECT_LOOP_BOUND)
2042 * vectorization_factor) - 1);
2044 /* Use the cost model only if it is more conservative than the user-specified
2045 threshold. */
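/* Illustrative example (made-up values): with --param min-vect-loop-bound=2
   and a vectorization factor of 4, min_scalar_loop_bound is 2 * 4 - 1 = 7;
   if the cost model computed min_profitable_iters = 10, the larger value 10
   becomes the runtime threshold TH below.  */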
2046 th = (unsigned) min_scalar_loop_bound;
2047 if (min_profitable_iters
2048 && (!min_scalar_loop_bound
2049 || min_profitable_iters > min_scalar_loop_bound))
2050 th = (unsigned) min_profitable_iters;
2052 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = th;
2054 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2055 && LOOP_VINFO_INT_NITERS (loop_vinfo) <= th)
2057 if (dump_enabled_p ())
2058 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2059 "not vectorized: vectorization not profitable.\n");
2060 if (dump_enabled_p ())
2061 dump_printf_loc (MSG_NOTE, vect_location,
2062 "not vectorized: iteration count smaller than user "
2063 "specified loop bound parameter or minimum profitable "
2064 "iterations (whichever is more conservative).\n");
2065 goto again;
2068 estimated_niter
2069 = estimated_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
2070 if (estimated_niter != -1
2071 && ((unsigned HOST_WIDE_INT) estimated_niter
2072 <= MAX (th, (unsigned)min_profitable_estimate)))
2074 if (dump_enabled_p ())
2075 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2076 "not vectorized: estimated iteration count too "
2077 "small.\n");
2078 if (dump_enabled_p ())
2079 dump_printf_loc (MSG_NOTE, vect_location,
2080 "not vectorized: estimated iteration count smaller "
2081 "than specified loop bound parameter or minimum "
2082 "profitable iterations (whichever is more "
2083 "conservative).\n");
2084 goto again;
2087 /* Decide whether we need to create an epilogue loop to handle
2088 remaining scalar iterations. */
2089 th = ((LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) + 1)
2090 / LOOP_VINFO_VECT_FACTOR (loop_vinfo))
2091 * LOOP_VINFO_VECT_FACTOR (loop_vinfo);
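/* E.g. with a cost model threshold of 10 and a vectorization factor of 4
   this computes TH = ((10 + 1) / 4) * 4 = 8, i.e. (threshold + 1) rounded
   down to a multiple of the vectorization factor.  */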
2093 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2094 && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) > 0)
2096 if (ctz_hwi (LOOP_VINFO_INT_NITERS (loop_vinfo)
2097 - LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo))
2098 < exact_log2 (LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
2099 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
2101 else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
2102 || (tree_ctz (LOOP_VINFO_NITERS (loop_vinfo))
2103 < (unsigned)exact_log2 (LOOP_VINFO_VECT_FACTOR (loop_vinfo))
2104 /* In case of versioning, check if the maximum number of
2105 iterations is greater than th. If they are identical,
2106 the epilogue is unnecessary. */
2107 && ((!LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo)
2108 && !LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
2109 || (unsigned HOST_WIDE_INT) max_niter > th)))
2110 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
2112 /* If an epilogue loop is required make sure we can create one. */
2113 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2114 || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo))
2116 if (dump_enabled_p ())
2117 dump_printf_loc (MSG_NOTE, vect_location, "epilog loop required\n");
2118 if (!vect_can_advance_ivs_p (loop_vinfo)
2119 || !slpeel_can_duplicate_loop_p (LOOP_VINFO_LOOP (loop_vinfo),
2120 single_exit (LOOP_VINFO_LOOP
2121 (loop_vinfo))))
2123 if (dump_enabled_p ())
2124 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2125 "not vectorized: can't create required "
2126 "epilog loop\n");
2127 goto again;
2131 gcc_assert (vectorization_factor
2132 == (unsigned)LOOP_VINFO_VECT_FACTOR (loop_vinfo));
2134 /* Ok to vectorize! */
2135 return true;
2137 again:
2138 /* Try again with SLP forced off, but if we didn't do any SLP there is
2139 no point in re-trying. */
2140 if (!slp)
2141 return false;
2143 /* If there are reduction chains re-trying will fail anyway. */
2144 if (! LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).is_empty ())
2145 return false;
2147 /* Likewise if the grouped loads or stores in the SLP cannot be handled
2148 via interleaving or lane instructions. */
2149 slp_instance instance;
2150 slp_tree node;
2151 unsigned i, j;
2152 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
2154 stmt_vec_info vinfo;
2155 vinfo = vinfo_for_stmt
2156 (SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (instance))[0]);
2157 if (! STMT_VINFO_GROUPED_ACCESS (vinfo))
2158 continue;
2159 vinfo = vinfo_for_stmt (STMT_VINFO_GROUP_FIRST_ELEMENT (vinfo));
2160 unsigned int size = STMT_VINFO_GROUP_SIZE (vinfo);
2161 tree vectype = STMT_VINFO_VECTYPE (vinfo);
2162 if (! vect_store_lanes_supported (vectype, size)
2163 && ! vect_grouped_store_supported (vectype, size))
2164 return false;
2165 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), j, node)
2167 vinfo = vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (node)[0]);
2168 vinfo = vinfo_for_stmt (STMT_VINFO_GROUP_FIRST_ELEMENT (vinfo));
2169 size = STMT_VINFO_GROUP_SIZE (vinfo);
2170 vectype = STMT_VINFO_VECTYPE (vinfo);
2171 if (! vect_load_lanes_supported (vectype, size)
2172 && ! vect_grouped_load_supported (vectype, size))
2173 return false;
2177 if (dump_enabled_p ())
2178 dump_printf_loc (MSG_NOTE, vect_location,
2179 "re-trying with SLP disabled\n");
2181 /* Roll back state appropriately. No SLP this time. */
2182 slp = false;
2183 /* Restore the vectorization factor to what it was without SLP. */
2184 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = saved_vectorization_factor;
2185 /* Free the SLP instances. */
2186 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), j, instance)
2187 vect_free_slp_instance (instance);
2188 LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
2189 /* Reset SLP type to loop_vect on all stmts. */
2190 for (i = 0; i < LOOP_VINFO_LOOP (loop_vinfo)->num_nodes; ++i)
2192 basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
2193 for (gimple_stmt_iterator si = gsi_start_bb (bb);
2194 !gsi_end_p (si); gsi_next (&si))
2196 stmt_vec_info stmt_info = vinfo_for_stmt (gsi_stmt (si));
2197 STMT_SLP_TYPE (stmt_info) = loop_vect;
2198 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
2200 stmt_info = vinfo_for_stmt (STMT_VINFO_RELATED_STMT (stmt_info));
2201 STMT_SLP_TYPE (stmt_info) = loop_vect;
2202 for (gimple_stmt_iterator pi
2203 = gsi_start (STMT_VINFO_PATTERN_DEF_SEQ (stmt_info));
2204 !gsi_end_p (pi); gsi_next (&pi))
2206 gimple *pstmt = gsi_stmt (pi);
2207 STMT_SLP_TYPE (vinfo_for_stmt (pstmt)) = loop_vect;
2212 /* Free optimized alias test DDRS. */
2213 LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).release ();
2214 /* Reset target cost data. */
2215 destroy_cost_data (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo));
2216 LOOP_VINFO_TARGET_COST_DATA (loop_vinfo)
2217 = init_cost (LOOP_VINFO_LOOP (loop_vinfo));
2218 /* Reset assorted flags. */
2219 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
2220 LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = false;
2221 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = 0;
2223 goto start_over;
2226 /* Function vect_analyze_loop.
2228 Apply a set of analyses on LOOP, and create a loop_vec_info struct
2229 for it. The different analyses will record information in the
2230 loop_vec_info struct. */
2231 loop_vec_info
2232 vect_analyze_loop (struct loop *loop)
2234 loop_vec_info loop_vinfo;
2235 unsigned int vector_sizes;
2237 /* Autodetect first vector size we try. */
2238 current_vector_size = 0;
2239 vector_sizes = targetm.vectorize.autovectorize_vector_sizes ();
2241 if (dump_enabled_p ())
2242 dump_printf_loc (MSG_NOTE, vect_location,
2243 "===== analyze_loop_nest =====\n");
2245 if (loop_outer (loop)
2246 && loop_vec_info_for_loop (loop_outer (loop))
2247 && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop))))
2249 if (dump_enabled_p ())
2250 dump_printf_loc (MSG_NOTE, vect_location,
2251 "outer-loop already vectorized.\n");
2252 return NULL;
2255 while (1)
2257 /* Check the CFG characteristics of the loop (nesting, entry/exit). */
2258 loop_vinfo = vect_analyze_loop_form (loop);
2259 if (!loop_vinfo)
2261 if (dump_enabled_p ())
2262 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2263 "bad loop form.\n");
2264 return NULL;
2267 bool fatal = false;
2268 if (vect_analyze_loop_2 (loop_vinfo, fatal))
2270 LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1;
2272 return loop_vinfo;
2275 destroy_loop_vec_info (loop_vinfo, true);
2277 vector_sizes &= ~current_vector_size;
2278 if (fatal
2279 || vector_sizes == 0
2280 || current_vector_size == 0)
2281 return NULL;
2283 /* Try the next biggest vector size. */
2284 current_vector_size = 1 << floor_log2 (vector_sizes);
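/* E.g. if the target advertises 32-byte and 16-byte vectors and the
   autodetected 32-byte analysis failed above, vector_sizes is now 16 and
   we retry the whole analysis with current_vector_size == 16.  */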
2285 if (dump_enabled_p ())
2286 dump_printf_loc (MSG_NOTE, vect_location,
2287 "***** Re-trying analysis with "
2288 "vector size %d\n", current_vector_size);
2293 /* Function reduction_code_for_scalar_code
2295 Input:
2296 CODE - tree_code of a reduction operation.
2298 Output:
2299 REDUC_CODE - the corresponding tree-code to be used to reduce the
2300 vector of partial results into a single scalar result, or ERROR_MARK
2301 if the operation is a supported reduction operation, but does not have
2302 such a tree-code.
2304 Return FALSE if CODE currently cannot be vectorized as a reduction. */
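/* For example, a PLUS_EXPR reduction reduces the vector of partial sums
   with REDUC_PLUS_EXPR in the epilogue, while MULT_EXPR has no such
   tree-code, so ERROR_MARK is returned and the epilogue instead combines
   element extracts (or whole-vector shifts) with the scalar operation.  */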
2306 static bool
2307 reduction_code_for_scalar_code (enum tree_code code,
2308 enum tree_code *reduc_code)
2310 switch (code)
2312 case MAX_EXPR:
2313 *reduc_code = REDUC_MAX_EXPR;
2314 return true;
2316 case MIN_EXPR:
2317 *reduc_code = REDUC_MIN_EXPR;
2318 return true;
2320 case PLUS_EXPR:
2321 *reduc_code = REDUC_PLUS_EXPR;
2322 return true;
2324 case MULT_EXPR:
2325 case MINUS_EXPR:
2326 case BIT_IOR_EXPR:
2327 case BIT_XOR_EXPR:
2328 case BIT_AND_EXPR:
2329 *reduc_code = ERROR_MARK;
2330 return true;
2332 default:
2333 return false;
2338 /* Error reporting helper for vect_is_simple_reduction below. GIMPLE statement
2339 STMT is printed with a message MSG. */
2341 static void
2342 report_vect_op (int msg_type, gimple *stmt, const char *msg)
2344 dump_printf_loc (msg_type, vect_location, "%s", msg);
2345 dump_gimple_stmt (msg_type, TDF_SLIM, stmt, 0);
2346 dump_printf (msg_type, "\n");
2350 /* Detect SLP reduction of the form:
2352 #a1 = phi <a5, a0>
2353 a2 = operation (a1)
2354 a3 = operation (a2)
2355 a4 = operation (a3)
2356 a5 = operation (a4)
2358 #a = phi <a5>
2360 PHI is the reduction phi node (#a1 = phi <a5, a0> above)
2361 FIRST_STMT is the first reduction stmt in the chain
2362 (a2 = operation (a1)).
2364 Return TRUE if a reduction chain was detected. */
2366 static bool
2367 vect_is_slp_reduction (loop_vec_info loop_info, gimple *phi,
2368 gimple *first_stmt)
2370 struct loop *loop = (gimple_bb (phi))->loop_father;
2371 struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
2372 enum tree_code code;
2373 gimple *current_stmt = NULL, *loop_use_stmt = NULL, *first, *next_stmt;
2374 stmt_vec_info use_stmt_info, current_stmt_info;
2375 tree lhs;
2376 imm_use_iterator imm_iter;
2377 use_operand_p use_p;
2378 int nloop_uses, size = 0, n_out_of_loop_uses;
2379 bool found = false;
2381 if (loop != vect_loop)
2382 return false;
2384 lhs = PHI_RESULT (phi);
2385 code = gimple_assign_rhs_code (first_stmt);
2386 while (1)
2388 nloop_uses = 0;
2389 n_out_of_loop_uses = 0;
2390 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
2392 gimple *use_stmt = USE_STMT (use_p);
2393 if (is_gimple_debug (use_stmt))
2394 continue;
2396 /* Check if we got back to the reduction phi. */
2397 if (use_stmt == phi)
2399 loop_use_stmt = use_stmt;
2400 found = true;
2401 break;
2404 if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2406 loop_use_stmt = use_stmt;
2407 nloop_uses++;
2409 else
2410 n_out_of_loop_uses++;
2412 /* There can be either a single use in the loop or two uses in
2413 phi nodes. */
2414 if (nloop_uses > 1 || (n_out_of_loop_uses && nloop_uses))
2415 return false;
2418 if (found)
2419 break;
2421 /* We reached a statement with no loop uses. */
2422 if (nloop_uses == 0)
2423 return false;
2425 /* This is a loop exit phi, and we haven't reached the reduction phi. */
2426 if (gimple_code (loop_use_stmt) == GIMPLE_PHI)
2427 return false;
2429 if (!is_gimple_assign (loop_use_stmt)
2430 || code != gimple_assign_rhs_code (loop_use_stmt)
2431 || !flow_bb_inside_loop_p (loop, gimple_bb (loop_use_stmt)))
2432 return false;
2434 /* Insert USE_STMT into reduction chain. */
2435 use_stmt_info = vinfo_for_stmt (loop_use_stmt);
2436 if (current_stmt)
2438 current_stmt_info = vinfo_for_stmt (current_stmt);
2439 GROUP_NEXT_ELEMENT (current_stmt_info) = loop_use_stmt;
2440 GROUP_FIRST_ELEMENT (use_stmt_info)
2441 = GROUP_FIRST_ELEMENT (current_stmt_info);
2443 else
2444 GROUP_FIRST_ELEMENT (use_stmt_info) = loop_use_stmt;
2446 lhs = gimple_assign_lhs (loop_use_stmt);
2447 current_stmt = loop_use_stmt;
2448 size++;
2451 if (!found || loop_use_stmt != phi || size < 2)
2452 return false;
2454 /* Swap the operands, if needed, to make the reduction operand be the second
2455 operand. */
2456 lhs = PHI_RESULT (phi);
2457 next_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (current_stmt));
2458 while (next_stmt)
2460 if (gimple_assign_rhs2 (next_stmt) == lhs)
2462 tree op = gimple_assign_rhs1 (next_stmt);
2463 gimple *def_stmt = NULL;
2465 if (TREE_CODE (op) == SSA_NAME)
2466 def_stmt = SSA_NAME_DEF_STMT (op);
2468 /* Check that the other def is either defined in the loop
2469 ("vect_internal_def"), or it's an induction (defined by a
2470 loop-header phi-node). */
2471 if (def_stmt
2472 && gimple_bb (def_stmt)
2473 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2474 && (is_gimple_assign (def_stmt)
2475 || is_gimple_call (def_stmt)
2476 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2477 == vect_induction_def
2478 || (gimple_code (def_stmt) == GIMPLE_PHI
2479 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2480 == vect_internal_def
2481 && !is_loop_header_bb_p (gimple_bb (def_stmt)))))
2483 lhs = gimple_assign_lhs (next_stmt);
2484 next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
2485 continue;
2488 return false;
2490 else
2492 tree op = gimple_assign_rhs2 (next_stmt);
2493 gimple *def_stmt = NULL;
2495 if (TREE_CODE (op) == SSA_NAME)
2496 def_stmt = SSA_NAME_DEF_STMT (op);
2498 /* Check that the other def is either defined in the loop
2499 ("vect_internal_def"), or it's an induction (defined by a
2500 loop-header phi-node). */
2501 if (def_stmt
2502 && gimple_bb (def_stmt)
2503 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2504 && (is_gimple_assign (def_stmt)
2505 || is_gimple_call (def_stmt)
2506 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2507 == vect_induction_def
2508 || (gimple_code (def_stmt) == GIMPLE_PHI
2509 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2510 == vect_internal_def
2511 && !is_loop_header_bb_p (gimple_bb (def_stmt)))))
2513 if (dump_enabled_p ())
2515 dump_printf_loc (MSG_NOTE, vect_location, "swapping oprnds: ");
2516 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, next_stmt, 0);
2517 dump_printf (MSG_NOTE, "\n");
2520 swap_ssa_operands (next_stmt,
2521 gimple_assign_rhs1_ptr (next_stmt),
2522 gimple_assign_rhs2_ptr (next_stmt));
2523 update_stmt (next_stmt);
2525 if (CONSTANT_CLASS_P (gimple_assign_rhs1 (next_stmt)))
2526 LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true;
2528 else
2529 return false;
2532 lhs = gimple_assign_lhs (next_stmt);
2533 next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
2536 /* Save the chain for further analysis in SLP detection. */
2537 first = GROUP_FIRST_ELEMENT (vinfo_for_stmt (current_stmt));
2538 LOOP_VINFO_REDUCTION_CHAINS (loop_info).safe_push (first);
2539 GROUP_SIZE (vinfo_for_stmt (first)) = size;
2541 return true;
2545 /* Function vect_is_simple_reduction
2547 (1) Detect a cross-iteration def-use cycle that represents a simple
2548 reduction computation. We look for the following pattern:
2550 loop_header:
2551 a1 = phi < a0, a2 >
2552 a3 = ...
2553 a2 = operation (a3, a1)
2557 a3 = ...
2558 loop_header:
2559 a1 = phi < a0, a2 >
2560 a2 = operation (a3, a1)
2562 such that:
2563 1. operation is commutative and associative and it is safe to
2564 change the order of the computation (if CHECK_REDUCTION is true)
2565 2. no uses for a2 in the loop (a2 is used out of the loop)
2566 3. no uses of a1 in the loop besides the reduction operation
2567 4. no uses of a1 outside the loop.
2569 Conditions 1,4 are tested here.
2570 Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.
2572 (2) Detect a cross-iteration def-use cycle in nested loops, i.e.,
2573 nested cycles, if CHECK_REDUCTION is false.
2575 (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double
2576 reductions:
2578 a1 = phi < a0, a2 >
2579 inner loop (def of a3)
2580 a2 = phi < a3 >
2582 (4) Detect condition expressions, i.e.:
2583 for (int i = 0; i < N; i++)
2584 if (a[i] < val)
2585 ret_val = a[i];
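   As a concrete instance of pattern (1), the C loop

     for (i = 0; i < N; i++)
       sum += a[i];

   has a1 = phi <a0, a2> in the loop header, a3 = a[i] (the load) and
   a2 = a3 + a1 as the reduction statement.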
2589 static gimple *
2590 vect_is_simple_reduction (loop_vec_info loop_info, gimple *phi,
2591 bool check_reduction, bool *double_reduc,
2592 bool need_wrapping_integral_overflow,
2593 enum vect_reduction_type *v_reduc_type)
2595 struct loop *loop = (gimple_bb (phi))->loop_father;
2596 struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
2597 edge latch_e = loop_latch_edge (loop);
2598 tree loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
2599 gimple *def_stmt, *def1 = NULL, *def2 = NULL, *phi_use_stmt = NULL;
2600 enum tree_code orig_code, code;
2601 tree op1, op2, op3 = NULL_TREE, op4 = NULL_TREE;
2602 tree type;
2603 int nloop_uses;
2604 tree name;
2605 imm_use_iterator imm_iter;
2606 use_operand_p use_p;
2607 bool phi_def;
2609 *double_reduc = false;
2610 *v_reduc_type = TREE_CODE_REDUCTION;
2612 /* If CHECK_REDUCTION is true, we assume inner-most loop vectorization,
2613 otherwise, we assume outer loop vectorization. */
2614 gcc_assert ((check_reduction && loop == vect_loop)
2615 || (!check_reduction && flow_loop_nested_p (vect_loop, loop)));
2617 name = PHI_RESULT (phi);
2618 /* ??? If there are no uses of the PHI result the inner loop reduction
2619 won't be detected as possibly double-reduction by vectorizable_reduction
2620 because that tries to walk the PHI arg from the preheader edge which
2621 can be constant. See PR60382. */
2622 if (has_zero_uses (name))
2623 return NULL;
2624 nloop_uses = 0;
2625 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name)
2627 gimple *use_stmt = USE_STMT (use_p);
2628 if (is_gimple_debug (use_stmt))
2629 continue;
2631 if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2633 if (dump_enabled_p ())
2634 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2635 "intermediate value used outside loop.\n");
2637 return NULL;
2640 nloop_uses++;
2641 if (nloop_uses > 1)
2643 if (dump_enabled_p ())
2644 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2645 "reduction used in loop.\n");
2646 return NULL;
2649 phi_use_stmt = use_stmt;
2652 if (TREE_CODE (loop_arg) != SSA_NAME)
2654 if (dump_enabled_p ())
2656 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2657 "reduction: not ssa_name: ");
2658 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, loop_arg);
2659 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
2661 return NULL;
2664 def_stmt = SSA_NAME_DEF_STMT (loop_arg);
2665 if (!def_stmt)
2667 if (dump_enabled_p ())
2668 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2669 "reduction: no def_stmt.\n");
2670 return NULL;
2673 if (!is_gimple_assign (def_stmt) && gimple_code (def_stmt) != GIMPLE_PHI)
2675 if (dump_enabled_p ())
2677 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, def_stmt, 0);
2678 dump_printf (MSG_NOTE, "\n");
2680 return NULL;
2683 if (is_gimple_assign (def_stmt))
2685 name = gimple_assign_lhs (def_stmt);
2686 phi_def = false;
2688 else
2690 name = PHI_RESULT (def_stmt);
2691 phi_def = true;
2694 nloop_uses = 0;
2695 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name)
2697 gimple *use_stmt = USE_STMT (use_p);
2698 if (is_gimple_debug (use_stmt))
2699 continue;
2700 if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2701 nloop_uses++;
2702 if (nloop_uses > 1)
2704 if (dump_enabled_p ())
2705 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2706 "reduction used in loop.\n");
2707 return NULL;
2711 /* If DEF_STMT is a phi node itself, we expect it to have a single argument
2712 defined in the inner loop. */
2713 if (phi_def)
2715 op1 = PHI_ARG_DEF (def_stmt, 0);
2717 if (gimple_phi_num_args (def_stmt) != 1
2718 || TREE_CODE (op1) != SSA_NAME)
2720 if (dump_enabled_p ())
2721 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2722 "unsupported phi node definition.\n");
2724 return NULL;
2727 def1 = SSA_NAME_DEF_STMT (op1);
2728 if (gimple_bb (def1)
2729 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2730 && loop->inner
2731 && flow_bb_inside_loop_p (loop->inner, gimple_bb (def1))
2732 && is_gimple_assign (def1)
2733 && flow_bb_inside_loop_p (loop->inner, gimple_bb (phi_use_stmt)))
2735 if (dump_enabled_p ())
2736 report_vect_op (MSG_NOTE, def_stmt,
2737 "detected double reduction: ");
2739 *double_reduc = true;
2740 return def_stmt;
2743 return NULL;
2746 code = orig_code = gimple_assign_rhs_code (def_stmt);
2748 /* We can handle "res -= x[i]", which is non-associative, by
2749 simply rewriting it into "res += -x[i]". Avoid changing the
2750 gimple statement for the first simple tests and only do this
2751 if we're allowed to change code at all. */
2752 if (code == MINUS_EXPR
2753 && (op1 = gimple_assign_rhs1 (def_stmt))
2754 && TREE_CODE (op1) == SSA_NAME
2755 && SSA_NAME_DEF_STMT (op1) == phi)
2756 code = PLUS_EXPR;
2758 if (code == COND_EXPR)
2760 if (check_reduction)
2761 *v_reduc_type = COND_REDUCTION;
2763 else if (!commutative_tree_code (code) || !associative_tree_code (code))
2765 if (dump_enabled_p ())
2766 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2767 "reduction: not commutative/associative: ");
2768 return NULL;
2771 if (get_gimple_rhs_class (code) != GIMPLE_BINARY_RHS)
2773 if (code != COND_EXPR)
2775 if (dump_enabled_p ())
2776 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2777 "reduction: not binary operation: ");
2779 return NULL;
2782 op3 = gimple_assign_rhs1 (def_stmt);
2783 if (COMPARISON_CLASS_P (op3))
2785 op4 = TREE_OPERAND (op3, 1);
2786 op3 = TREE_OPERAND (op3, 0);
2789 op1 = gimple_assign_rhs2 (def_stmt);
2790 op2 = gimple_assign_rhs3 (def_stmt);
2792 if (TREE_CODE (op1) != SSA_NAME && TREE_CODE (op2) != SSA_NAME)
2794 if (dump_enabled_p ())
2795 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2796 "reduction: uses not ssa_names: ");
2798 return NULL;
2801 else
2803 op1 = gimple_assign_rhs1 (def_stmt);
2804 op2 = gimple_assign_rhs2 (def_stmt);
2806 if (TREE_CODE (op1) != SSA_NAME && TREE_CODE (op2) != SSA_NAME)
2808 if (dump_enabled_p ())
2809 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2810 "reduction: uses not ssa_names: ");
2812 return NULL;
2816 type = TREE_TYPE (gimple_assign_lhs (def_stmt));
2817 if ((TREE_CODE (op1) == SSA_NAME
2818 && !types_compatible_p (type, TREE_TYPE (op1)))
2819 || (TREE_CODE (op2) == SSA_NAME
2820 && !types_compatible_p (type, TREE_TYPE (op2)))
2821 || (op3 && TREE_CODE (op3) == SSA_NAME
2822 && !types_compatible_p (type, TREE_TYPE (op3)))
2823 || (op4 && TREE_CODE (op4) == SSA_NAME
2824 && !types_compatible_p (type, TREE_TYPE (op4))))
2826 if (dump_enabled_p ())
2828 dump_printf_loc (MSG_NOTE, vect_location,
2829 "reduction: multiple types: operation type: ");
2830 dump_generic_expr (MSG_NOTE, TDF_SLIM, type);
2831 dump_printf (MSG_NOTE, ", operands types: ");
2832 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2833 TREE_TYPE (op1));
2834 dump_printf (MSG_NOTE, ",");
2835 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2836 TREE_TYPE (op2));
2837 if (op3)
2839 dump_printf (MSG_NOTE, ",");
2840 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2841 TREE_TYPE (op3));
2844 if (op4)
2846 dump_printf (MSG_NOTE, ",");
2847 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2848 TREE_TYPE (op4));
2850 dump_printf (MSG_NOTE, "\n");
2853 return NULL;
2856 /* Check that it's ok to change the order of the computation.
2857 Generally, when vectorizing a reduction we change the order of the
2858 computation. This may change the behavior of the program in some
2859 cases, so we need to check that this is ok. One exception is when
2860 vectorizing an outer-loop: the inner-loop is executed sequentially,
2861 and therefore vectorizing reductions in the inner-loop during
2862 outer-loop vectorization is safe. */
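/* For instance, vectorizing a float summation with VF = 4 accumulates four
   partial sums in separate vector lanes and only combines them after the
   loop, which can round differently from the original left-to-right order;
   hence the -fassociative-math requirement checked below for floating
   point types.  */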
2864 if (*v_reduc_type != COND_REDUCTION
2865 && check_reduction)
2867 /* CHECKME: check for !flag_finite_math_only too? */
2868 if (SCALAR_FLOAT_TYPE_P (type) && !flag_associative_math)
2870 /* Changing the order of operations changes the semantics. */
2871 if (dump_enabled_p ())
2872 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2873 "reduction: unsafe fp math optimization: ");
2874 return NULL;
2876 else if (INTEGRAL_TYPE_P (type))
2878 if (!operation_no_trapping_overflow (type, code))
2880 /* Changing the order of operations changes the semantics. */
2881 if (dump_enabled_p ())
2882 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2883 "reduction: unsafe int math optimization"
2884 " (overflow traps): ");
2885 return NULL;
2887 if (need_wrapping_integral_overflow
2888 && !TYPE_OVERFLOW_WRAPS (type)
2889 && operation_can_overflow (code))
2891 /* Changing the order of operations changes the semantics. */
2892 if (dump_enabled_p ())
2893 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2894 "reduction: unsafe int math optimization"
2895 " (overflow doesn't wrap): ");
2896 return NULL;
2899 else if (SAT_FIXED_POINT_TYPE_P (type))
2901 /* Changing the order of operations changes the semantics. */
2902 if (dump_enabled_p ())
2903 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2904 "reduction: unsafe fixed-point math optimization: ");
2905 return NULL;
2909 /* Reduction is safe. We're dealing with one of the following:
2910 1) integer arithmetic and no trapv
2911 2) floating point arithmetic, and special flags permit this optimization
2912 3) nested cycle (i.e., outer loop vectorization). */
2913 if (TREE_CODE (op1) == SSA_NAME)
2914 def1 = SSA_NAME_DEF_STMT (op1);
2916 if (TREE_CODE (op2) == SSA_NAME)
2917 def2 = SSA_NAME_DEF_STMT (op2);
2919 if (code != COND_EXPR
2920 && ((!def1 || gimple_nop_p (def1)) && (!def2 || gimple_nop_p (def2))))
2922 if (dump_enabled_p ())
2923 report_vect_op (MSG_NOTE, def_stmt, "reduction: no defs for operands: ");
2924 return NULL;
2927 /* Check that one def is the reduction def, defined by PHI,
2928 the other def is either defined in the loop ("vect_internal_def"),
2929 or it's an induction (defined by a loop-header phi-node). */
2931 if (def2 && def2 == phi
2932 && (code == COND_EXPR
2933 || !def1 || gimple_nop_p (def1)
2934 || !flow_bb_inside_loop_p (loop, gimple_bb (def1))
2935 || (def1 && flow_bb_inside_loop_p (loop, gimple_bb (def1))
2936 && (is_gimple_assign (def1)
2937 || is_gimple_call (def1)
2938 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1))
2939 == vect_induction_def
2940 || (gimple_code (def1) == GIMPLE_PHI
2941 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1))
2942 == vect_internal_def
2943 && !is_loop_header_bb_p (gimple_bb (def1)))))))
2945 if (dump_enabled_p ())
2946 report_vect_op (MSG_NOTE, def_stmt, "detected reduction: ");
2947 return def_stmt;
2950 if (def1 && def1 == phi
2951 && (code == COND_EXPR
2952 || !def2 || gimple_nop_p (def2)
2953 || !flow_bb_inside_loop_p (loop, gimple_bb (def2))
2954 || (def2 && flow_bb_inside_loop_p (loop, gimple_bb (def2))
2955 && (is_gimple_assign (def2)
2956 || is_gimple_call (def2)
2957 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2))
2958 == vect_induction_def
2959 || (gimple_code (def2) == GIMPLE_PHI
2960 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2))
2961 == vect_internal_def
2962 && !is_loop_header_bb_p (gimple_bb (def2)))))))
2964 if (check_reduction
2965 && orig_code != MINUS_EXPR)
2967 if (code == COND_EXPR)
2969 /* No current known use where this case would be useful. */
2970 if (dump_enabled_p ())
2971 report_vect_op (MSG_NOTE, def_stmt,
2972 "detected reduction: cannot currently swap "
2973 "operands for cond_expr");
2974 return NULL;
2977 /* Swap operands (just for simplicity - so that the rest of the code
2978 can assume that the reduction variable is always the last (second)
2979 argument). */
2980 if (dump_enabled_p ())
2981 report_vect_op (MSG_NOTE, def_stmt,
2982 "detected reduction: need to swap operands: ");
2984 swap_ssa_operands (def_stmt, gimple_assign_rhs1_ptr (def_stmt),
2985 gimple_assign_rhs2_ptr (def_stmt));
2987 if (CONSTANT_CLASS_P (gimple_assign_rhs1 (def_stmt)))
2988 LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true;
2990 else
2992 if (dump_enabled_p ())
2993 report_vect_op (MSG_NOTE, def_stmt, "detected reduction: ");
2996 return def_stmt;
2999 /* Try to find SLP reduction chain. */
3000 if (check_reduction && code != COND_EXPR
3001 && vect_is_slp_reduction (loop_info, phi, def_stmt))
3003 if (dump_enabled_p ())
3004 report_vect_op (MSG_NOTE, def_stmt,
3005 "reduction: detected reduction chain: ");
3007 return def_stmt;
3010 if (dump_enabled_p ())
3011 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3012 "reduction: unknown pattern: ");
3014 return NULL;
3017 /* Wrapper around vect_is_simple_reduction, which will modify code
3018 in-place if it enables detection of more reductions. Arguments
3019 are as for that function. */
3021 gimple *
3022 vect_force_simple_reduction (loop_vec_info loop_info, gimple *phi,
3023 bool check_reduction, bool *double_reduc,
3024 bool need_wrapping_integral_overflow)
3026 enum vect_reduction_type v_reduc_type;
3027 return vect_is_simple_reduction (loop_info, phi, check_reduction,
3028 double_reduc,
3029 need_wrapping_integral_overflow,
3030 &v_reduc_type);
3033 /* Calculate cost of peeling the loop PEEL_ITERS_PROLOGUE times. */
3034 int
3035 vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue,
3036 int *peel_iters_epilogue,
3037 stmt_vector_for_cost *scalar_cost_vec,
3038 stmt_vector_for_cost *prologue_cost_vec,
3039 stmt_vector_for_cost *epilogue_cost_vec)
3041 int retval = 0;
3042 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3044 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
3046 *peel_iters_epilogue = vf/2;
3047 if (dump_enabled_p ())
3048 dump_printf_loc (MSG_NOTE, vect_location,
3049 "cost model: epilogue peel iters set to vf/2 "
3050 "because loop iterations are unknown .\n");
3052 /* If peeled iterations are known but number of scalar loop
3053 iterations are unknown, count a taken branch per peeled loop. */
3054 retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
3055 NULL, 0, vect_prologue);
3056 retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
3057 NULL, 0, vect_epilogue);
3059 else
3061 int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
3062 peel_iters_prologue = niters < peel_iters_prologue ?
3063 niters : peel_iters_prologue;
3064 *peel_iters_epilogue = (niters - peel_iters_prologue) % vf;
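/* E.g. with niters = 100, vf = 4 and a prologue of 3 peeled iterations
   this leaves (100 - 3) % 4 = 1 epilogue iteration.  */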
3065 /* If we need to peel for gaps, but no peeling is required, we have to
3066 peel VF iterations. */
3067 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) && !*peel_iters_epilogue)
3068 *peel_iters_epilogue = vf;
3071 stmt_info_for_cost *si;
3072 int j;
3073 if (peel_iters_prologue)
3074 FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
3075 retval += record_stmt_cost (prologue_cost_vec,
3076 si->count * peel_iters_prologue,
3077 si->kind, NULL, si->misalign,
3078 vect_prologue);
3079 if (*peel_iters_epilogue)
3080 FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
3081 retval += record_stmt_cost (epilogue_cost_vec,
3082 si->count * *peel_iters_epilogue,
3083 si->kind, NULL, si->misalign,
3084 vect_epilogue);
3086 return retval;
3089 /* Function vect_estimate_min_profitable_iters
3091 Return the number of iterations required for the vector version of the
3092 loop to be profitable relative to the cost of the scalar version of the
3093 loop. */
3095 static void
3096 vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
3097 int *ret_min_profitable_niters,
3098 int *ret_min_profitable_estimate)
3100 int min_profitable_iters;
3101 int min_profitable_estimate;
3102 int peel_iters_prologue;
3103 int peel_iters_epilogue;
3104 unsigned vec_inside_cost = 0;
3105 int vec_outside_cost = 0;
3106 unsigned vec_prologue_cost = 0;
3107 unsigned vec_epilogue_cost = 0;
3108 int scalar_single_iter_cost = 0;
3109 int scalar_outside_cost = 0;
3110 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3111 int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
3112 void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3114 /* Cost model disabled. */
3115 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
3117 dump_printf_loc (MSG_NOTE, vect_location, "cost model disabled.\n");
3118 *ret_min_profitable_niters = 0;
3119 *ret_min_profitable_estimate = 0;
3120 return;
3123 /* Requires loop versioning tests to handle misalignment. */
3124 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
3126 /* FIXME: Make cost depend on complexity of individual check. */
3127 unsigned len = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ();
3128 (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
3129 vect_prologue);
3130 dump_printf (MSG_NOTE,
3131 "cost model: Adding cost of checks for loop "
3132 "versioning to treat misalignment.\n");
3135 /* Requires loop versioning with alias checks. */
3136 if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
3138 /* FIXME: Make cost depend on complexity of individual check. */
3139 unsigned len = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).length ();
3140 (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
3141 vect_prologue);
3142 dump_printf (MSG_NOTE,
3143 "cost model: Adding cost of checks for loop "
3144 "versioning aliasing.\n");
3147 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
3148 || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
3149 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken, NULL, 0,
3150 vect_prologue);
3152 /* Count statements in scalar loop. Using this as scalar cost for a single
3153 iteration for now.
3155 TODO: Add outer loop support.
3157 TODO: Consider assigning different costs to different scalar
3158 statements. */
3160 scalar_single_iter_cost
3161 = LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo);
3163 /* Add additional cost for the peeled instructions in prologue and epilogue
3164 loop.
3166 FORNOW: If we don't know the value of peel_iters for prologue or epilogue
3167 at compile time, we assume it's vf/2 (the worst case would be vf-1).
3169 TODO: Build an expression that represents peel_iters for prologue and
3170 epilogue to be used in a run-time test. */
3172 if (npeel < 0)
3174 peel_iters_prologue = vf/2;
3175 dump_printf (MSG_NOTE, "cost model: "
3176 "prologue peel iters set to vf/2.\n");
3178 /* If peeling for alignment is unknown, loop bound of main loop becomes
3179 unknown. */
3180 peel_iters_epilogue = vf/2;
3181 dump_printf (MSG_NOTE, "cost model: "
3182 "epilogue peel iters set to vf/2 because "
3183 "peeling for alignment is unknown.\n");
3185 /* If peeled iterations are unknown, count a taken branch and a not taken
3186 branch per peeled loop. Even if scalar loop iterations are known,
3187 vector iterations are not known since peeled prologue iterations are
3188 not known. Hence guards remain the same. */
3189 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
3190 NULL, 0, vect_prologue);
3191 (void) add_stmt_cost (target_cost_data, 1, cond_branch_not_taken,
3192 NULL, 0, vect_prologue);
3193 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
3194 NULL, 0, vect_epilogue);
3195 (void) add_stmt_cost (target_cost_data, 1, cond_branch_not_taken,
3196 NULL, 0, vect_epilogue);
3197 stmt_info_for_cost *si;
3198 int j;
3199 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
3201 struct _stmt_vec_info *stmt_info
3202 = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
3203 (void) add_stmt_cost (target_cost_data,
3204 si->count * peel_iters_prologue,
3205 si->kind, stmt_info, si->misalign,
3206 vect_prologue);
3207 (void) add_stmt_cost (target_cost_data,
3208 si->count * peel_iters_epilogue,
3209 si->kind, stmt_info, si->misalign,
3210 vect_epilogue);
3213 else
3215 stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
3216 stmt_info_for_cost *si;
3217 int j;
3218 void *data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3220 prologue_cost_vec.create (2);
3221 epilogue_cost_vec.create (2);
3222 peel_iters_prologue = npeel;
3224 (void) vect_get_known_peeling_cost (loop_vinfo, peel_iters_prologue,
3225 &peel_iters_epilogue,
3226 &LOOP_VINFO_SCALAR_ITERATION_COST
3227 (loop_vinfo),
3228 &prologue_cost_vec,
3229 &epilogue_cost_vec);
3231 FOR_EACH_VEC_ELT (prologue_cost_vec, j, si)
3233 struct _stmt_vec_info *stmt_info
3234 = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
3235 (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
3236 si->misalign, vect_prologue);
3239 FOR_EACH_VEC_ELT (epilogue_cost_vec, j, si)
3241 struct _stmt_vec_info *stmt_info
3242 = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
3243 (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
3244 si->misalign, vect_epilogue);
3247 prologue_cost_vec.release ();
3248 epilogue_cost_vec.release ();
3251 /* FORNOW: The scalar outside cost is incremented in one of the
3252 following ways:
3254 1. The vectorizer checks for alignment and aliasing and generates
3255 a condition that allows dynamic vectorization. A cost model
3256 check is ANDED with the versioning condition. Hence scalar code
3257 path now has the added cost of the versioning check.
3259 if (cost > th & versioning_check)
3260 jmp to vector code
3262 Hence the run-time scalar cost is incremented by a not-taken branch cost.
3264 2. The vectorizer then checks if a prologue is required. If the
3265 cost model check was not done before during versioning, it has to
3266 be done before the prologue check.
3268 if (cost <= th)
3269 prologue = scalar_iters
3270 if (prologue == 0)
3271 jmp to vector code
3272 else
3273 execute prologue
3274 if (prologue == num_iters)
3275 go to exit
3277 Hence the run-time scalar cost is incremented by a taken branch,
3278 plus a not-taken branch, plus a taken branch cost.
3280 3. The vectorizer then checks if an epilogue is required. If the
3281 cost model check was not done before during prologue check, it
3282 has to be done with the epilogue check.
3284 if (prologue == 0)
3285 jmp to vector code
3286 else
3287 execute prologue
3288 if (prologue == num_iters)
3289 go to exit
3290 vector code:
3291 if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
3292 jmp to epilogue
3294 Hence the run-time scalar cost should be incremented by 2 taken
3295 branches.
3297 TODO: The back end may reorder the BBs differently and reverse
3298 conditions/branch directions. Change the estimates below to
3299 something more reasonable. */
3301 /* If the number of iterations is known and we do not do versioning, we can
3302 decide whether to vectorize at compile time. Hence the scalar version
3303 does not carry cost model guard costs. */
3304 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
3305 || LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
3306 || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
3308 /* Cost model check occurs at versioning. */
3309 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
3310 || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
3311 scalar_outside_cost += vect_get_stmt_cost (cond_branch_not_taken);
3312 else
3314 /* Cost model check occurs at prologue generation. */
3315 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
3316 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken)
3317 + vect_get_stmt_cost (cond_branch_not_taken);
3318 /* Cost model check occurs at epilogue generation. */
3319 else
3320 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken);
3324 /* Complete the target-specific cost calculations. */
3325 finish_cost (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo), &vec_prologue_cost,
3326 &vec_inside_cost, &vec_epilogue_cost);
3328 vec_outside_cost = (int)(vec_prologue_cost + vec_epilogue_cost);
3330 if (dump_enabled_p ())
3332 dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
3333 dump_printf (MSG_NOTE, " Vector inside of loop cost: %d\n",
3334 vec_inside_cost);
3335 dump_printf (MSG_NOTE, " Vector prologue cost: %d\n",
3336 vec_prologue_cost);
3337 dump_printf (MSG_NOTE, " Vector epilogue cost: %d\n",
3338 vec_epilogue_cost);
3339 dump_printf (MSG_NOTE, " Scalar iteration cost: %d\n",
3340 scalar_single_iter_cost);
3341 dump_printf (MSG_NOTE, " Scalar outside cost: %d\n",
3342 scalar_outside_cost);
3343 dump_printf (MSG_NOTE, " Vector outside cost: %d\n",
3344 vec_outside_cost);
3345 dump_printf (MSG_NOTE, " prologue iterations: %d\n",
3346 peel_iters_prologue);
3347 dump_printf (MSG_NOTE, " epilogue iterations: %d\n",
3348 peel_iters_epilogue);
3351 /* Calculate number of iterations required to make the vector version
3352 profitable, relative to the loop bodies only. The following condition
3353 must hold true:
3354 SIC * niters + SOC > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC
3355 where
3356 SIC = scalar iteration cost, VIC = vector iteration cost,
3357 VOC = vector outside cost, VF = vectorization factor,
3358 PL_ITERS = prologue iterations, EP_ITERS = epilogue iterations
3359 SOC = scalar outside cost for run time cost model check. */
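/* Illustrative example (made-up costs): with SIC = 4, VIC = 6, VF = 4,
   VOC = 16 and SOC = PL_ITERS = EP_ITERS = 0, the computation below gives
   (16 - 0) * 4 / (4 * 4 - 6) = 6, and the correction that follows bumps
   this to 7, i.e. at least 7 iterations are needed before the vector loop
   amortizes its outside cost.  */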
3361 if ((scalar_single_iter_cost * vf) > (int) vec_inside_cost)
3363 if (vec_outside_cost <= 0)
3364 min_profitable_iters = 1;
3365 else
3367 min_profitable_iters = ((vec_outside_cost - scalar_outside_cost) * vf
3368 - vec_inside_cost * peel_iters_prologue
3369 - vec_inside_cost * peel_iters_epilogue)
3370 / ((scalar_single_iter_cost * vf)
3371 - vec_inside_cost);
3373 if ((scalar_single_iter_cost * vf * min_profitable_iters)
3374 <= (((int) vec_inside_cost * min_profitable_iters)
3375 + (((int) vec_outside_cost - scalar_outside_cost) * vf)))
3376 min_profitable_iters++;
3379 /* vector version will never be profitable. */
3380 else
3382 if (LOOP_VINFO_LOOP (loop_vinfo)->force_vectorize)
3383 warning_at (vect_location, OPT_Wopenmp_simd, "vectorization "
3384 "did not happen for a simd loop");
3386 if (dump_enabled_p ())
3387 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3388 "cost model: the vector iteration cost = %d "
3389 "divided by the scalar iteration cost = %d "
3390 "is greater or equal to the vectorization factor = %d"
3391 ".\n",
3392 vec_inside_cost, scalar_single_iter_cost, vf);
3393 *ret_min_profitable_niters = -1;
3394 *ret_min_profitable_estimate = -1;
3395 return;
3398 dump_printf (MSG_NOTE,
3399 " Calculated minimum iters for profitability: %d\n",
3400 min_profitable_iters);
3402 min_profitable_iters =
3403 min_profitable_iters < vf ? vf : min_profitable_iters;
3405 /* Because the condition we create is:
3406 if (niters <= min_profitable_iters)
3407 then skip the vectorized loop. */
3408 min_profitable_iters--;
3410 if (dump_enabled_p ())
3411 dump_printf_loc (MSG_NOTE, vect_location,
3412 " Runtime profitability threshold = %d\n",
3413 min_profitable_iters);
3415 *ret_min_profitable_niters = min_profitable_iters;
3417 /* Calculate number of iterations required to make the vector version
3418 profitable, relative to the loop bodies only.
3420 The non-vectorized variant costs SIC * niters and it must win over the vector
3421 variant on the expected loop trip count. The following condition must hold true:
3422 SIC * niters > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC + SOC */
3424 if (vec_outside_cost <= 0)
3425 min_profitable_estimate = 1;
3426 else
3428 min_profitable_estimate = ((vec_outside_cost + scalar_outside_cost) * vf
3429 - vec_inside_cost * peel_iters_prologue
3430 - vec_inside_cost * peel_iters_epilogue)
3431 / ((scalar_single_iter_cost * vf)
3432 - vec_inside_cost);
3434 min_profitable_estimate--;
3435 min_profitable_estimate = MAX (min_profitable_estimate, min_profitable_iters);
3436 if (dump_enabled_p ())
3437 dump_printf_loc (MSG_NOTE, vect_location,
3438 " Static estimate profitability threshold = %d\n",
3439 min_profitable_estimate);
3441 *ret_min_profitable_estimate = min_profitable_estimate;
3444 /* Writes into SEL a mask for a vec_perm, equivalent to a vec_shr by OFFSET
3445 vector elements (not bits) for a vector of mode MODE. */
3446 static void
3447 calc_vec_perm_mask_for_shift (enum machine_mode mode, unsigned int offset,
3448 unsigned char *sel)
3450 unsigned int i, nelt = GET_MODE_NUNITS (mode);
3452 for (i = 0; i < nelt; i++)
3453 sel[i] = (i + offset) & (2*nelt - 1);
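/* E.g. for a 4-element vector (nelt == 4) and OFFSET == 1 this yields
   sel = {1, 2, 3, 4}: elements 1-3 of the first input followed by element 0
   of the second input, i.e. a whole-vector shift down by one element when
   the second input is zero.  */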
3456 /* Checks whether the target supports whole-vector shifts for vectors of mode
3457 MODE. This is the case if _either_ the platform handles vec_shr_optab, _or_
3458 it supports vec_perm_const with masks for all necessary shift amounts. */
3459 static bool
3460 have_whole_vector_shift (enum machine_mode mode)
3462 if (optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
3463 return true;
3465 if (direct_optab_handler (vec_perm_const_optab, mode) == CODE_FOR_nothing)
3466 return false;
3468 unsigned int i, nelt = GET_MODE_NUNITS (mode);
3469 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
3471 for (i = nelt/2; i >= 1; i/=2)
3473 calc_vec_perm_mask_for_shift (mode, i, sel);
3474 if (!can_vec_perm_p (mode, false, sel))
3475 return false;
3477 return true;
3480 /* Return the reduction operand (with index REDUC_INDEX) of STMT. */
3482 static tree
3483 get_reduction_op (gimple *stmt, int reduc_index)
3485 switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
3487 case GIMPLE_SINGLE_RHS:
3488 gcc_assert (TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt))
3489 == ternary_op);
3490 return TREE_OPERAND (gimple_assign_rhs1 (stmt), reduc_index);
3491 case GIMPLE_UNARY_RHS:
3492 return gimple_assign_rhs1 (stmt);
3493 case GIMPLE_BINARY_RHS:
3494 return (reduc_index
3495 ? gimple_assign_rhs2 (stmt) : gimple_assign_rhs1 (stmt));
3496 case GIMPLE_TERNARY_RHS:
3497 return gimple_op (stmt, reduc_index + 1);
3498 default:
3499 gcc_unreachable ();
3503 /* TODO: Close dependency between vect_model_*_cost and vectorizable_*
3504 functions. Design better to avoid maintenance issues. */
3506 /* Function vect_model_reduction_cost.
3508 Models cost for a reduction operation, including the vector ops
3509 generated within the strip-mine loop, the initial definition before
3510 the loop, and the epilogue code that must be generated. */
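/* For example (sketch): an integer MAX reduction with a single vector copy
   and REDUC_MAX_EXPR available costs one vector_stmt in the loop body, one
   scalar_to_vec in the prologue for the initial value, and one vector_stmt
   plus one vec_to_scalar in the epilogue.  */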
3512 static bool
3513 vect_model_reduction_cost (stmt_vec_info stmt_info, enum tree_code reduc_code,
3514 int ncopies, int reduc_index)
3516 int prologue_cost = 0, epilogue_cost = 0;
3517 enum tree_code code;
3518 optab optab;
3519 tree vectype;
3520 gimple *stmt, *orig_stmt;
3521 tree reduction_op;
3522 machine_mode mode;
3523 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3524 struct loop *loop = NULL;
3525 void *target_cost_data;
3527 if (loop_vinfo)
3529 loop = LOOP_VINFO_LOOP (loop_vinfo);
3530 target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3532 else
3533 target_cost_data = BB_VINFO_TARGET_COST_DATA (STMT_VINFO_BB_VINFO (stmt_info));
3535 /* Condition reductions generate two reductions in the loop. */
3536 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION)
3537 ncopies *= 2;
3539 /* Cost of reduction op inside loop. */
3540 unsigned inside_cost = add_stmt_cost (target_cost_data, ncopies, vector_stmt,
3541 stmt_info, 0, vect_body);
3542 stmt = STMT_VINFO_STMT (stmt_info);
3544 reduction_op = get_reduction_op (stmt, reduc_index);
3546 vectype = get_vectype_for_scalar_type (TREE_TYPE (reduction_op));
3547 if (!vectype)
3549 if (dump_enabled_p ())
3551 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3552 "unsupported data-type ");
3553 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
3554 TREE_TYPE (reduction_op));
3555 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3557 return false;
3560 mode = TYPE_MODE (vectype);
3561 orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
3563 if (!orig_stmt)
3564 orig_stmt = STMT_VINFO_STMT (stmt_info);
3566 code = gimple_assign_rhs_code (orig_stmt);
3568 /* Add in cost for initial definition.
3569 For cond reduction we have four vectors: initial index, step, initial
3570 result of the data reduction, initial value of the index reduction. */
3571 int prologue_stmts = STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
3572 == COND_REDUCTION ? 4 : 1;
3573 prologue_cost += add_stmt_cost (target_cost_data, prologue_stmts,
3574 scalar_to_vec, stmt_info, 0,
3575 vect_prologue);
3577 /* Determine cost of epilogue code.
3579 We have a reduction operator that will reduce the vector in one statement.
3580 Also requires scalar extract. */
3582 if (!loop || !nested_in_vect_loop_p (loop, orig_stmt))
3584 if (reduc_code != ERROR_MARK)
3586 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION)
3588 /* An EQ stmt and a COND_EXPR stmt. */
3589 epilogue_cost += add_stmt_cost (target_cost_data, 2,
3590 vector_stmt, stmt_info, 0,
3591 vect_epilogue);
3592 /* Reduction of the max index and a reduction of the found
3593 values. */
3594 epilogue_cost += add_stmt_cost (target_cost_data, 2,
3595 vec_to_scalar, stmt_info, 0,
3596 vect_epilogue);
3597 /* A broadcast of the max value. */
3598 epilogue_cost += add_stmt_cost (target_cost_data, 1,
3599 scalar_to_vec, stmt_info, 0,
3600 vect_epilogue);
3602 else
3604 epilogue_cost += add_stmt_cost (target_cost_data, 1, vector_stmt,
3605 stmt_info, 0, vect_epilogue);
3606 epilogue_cost += add_stmt_cost (target_cost_data, 1,
3607 vec_to_scalar, stmt_info, 0,
3608 vect_epilogue);
3611 else
3613 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
3614 tree bitsize =
3615 TYPE_SIZE (TREE_TYPE (gimple_assign_lhs (orig_stmt)));
3616 int element_bitsize = tree_to_uhwi (bitsize);
3617 int nelements = vec_size_in_bits / element_bitsize;
3619 optab = optab_for_tree_code (code, vectype, optab_default);
3621 /* We have a whole vector shift available. */
3622 if (VECTOR_MODE_P (mode)
3623 && optab_handler (optab, mode) != CODE_FOR_nothing
3624 && have_whole_vector_shift (mode))
3626 /* Final reduction via vector shifts and the reduction operator.
3627 Also requires scalar extract. */
3628 epilogue_cost += add_stmt_cost (target_cost_data,
3629 exact_log2 (nelements) * 2,
3630 vector_stmt, stmt_info, 0,
3631 vect_epilogue);
3632 epilogue_cost += add_stmt_cost (target_cost_data, 1,
3633 vec_to_scalar, stmt_info, 0,
3634 vect_epilogue);
3636 else
3637 /* Use extracts and reduction op for final reduction. For N
3638 elements, we have N extracts and N-1 reduction ops. */
3639 epilogue_cost += add_stmt_cost (target_cost_data,
3640 nelements + nelements - 1,
3641 vector_stmt, stmt_info, 0,
3642 vect_epilogue);
3646 if (dump_enabled_p ())
3647 dump_printf (MSG_NOTE,
3648 "vect_model_reduction_cost: inside_cost = %d, "
3649 "prologue_cost = %d, epilogue_cost = %d .\n", inside_cost,
3650 prologue_cost, epilogue_cost);
3652 return true;
3656 /* Function vect_model_induction_cost.
3658 Models cost for induction operations. */
3660 static void
3661 vect_model_induction_cost (stmt_vec_info stmt_info, int ncopies)
3663 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3664 void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3665 unsigned inside_cost, prologue_cost;
3667 /* loop cost for vec_loop. */
3668 inside_cost = add_stmt_cost (target_cost_data, ncopies, vector_stmt,
3669 stmt_info, 0, vect_body);
3671 /* prologue cost for vec_init and vec_step. */
3672 prologue_cost = add_stmt_cost (target_cost_data, 2, scalar_to_vec,
3673 stmt_info, 0, vect_prologue);
3675 if (dump_enabled_p ())
3676 dump_printf_loc (MSG_NOTE, vect_location,
3677 "vect_model_induction_cost: inside_cost = %d, "
3678 "prologue_cost = %d .\n", inside_cost, prologue_cost);
3682 /* Function get_initial_def_for_induction
3684 Input:
3685 STMT - a stmt that performs an induction operation in the loop.
3686 IV_PHI - the initial value of the induction variable
3688 Output:
3689 Return a vector variable, initialized with the first VF values of
3690 the induction variable. E.g., for an iv with IV_PHI='X' and
3691 evolution S, for a vector of 4 units, we want to return:
3692 [X, X + S, X + 2*S, X + 3*S]. */
3694 static tree
3695 get_initial_def_for_induction (gimple *iv_phi)
3697 stmt_vec_info stmt_vinfo = vinfo_for_stmt (iv_phi);
3698 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
3699 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3700 tree vectype;
3701 int nunits;
3702 edge pe = loop_preheader_edge (loop);
3703 struct loop *iv_loop;
3704 basic_block new_bb;
3705 tree new_vec, vec_init, vec_step, t;
3706 tree new_name;
3707 gimple *new_stmt;
3708 gphi *induction_phi;
3709 tree induc_def, vec_def, vec_dest;
3710 tree init_expr, step_expr;
3711 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3712 int i;
3713 int ncopies;
3714 tree expr;
3715 stmt_vec_info phi_info = vinfo_for_stmt (iv_phi);
3716 bool nested_in_vect_loop = false;
3717 gimple_seq stmts;
3718 imm_use_iterator imm_iter;
3719 use_operand_p use_p;
3720 gimple *exit_phi;
3721 edge latch_e;
3722 tree loop_arg;
3723 gimple_stmt_iterator si;
3724 basic_block bb = gimple_bb (iv_phi);
3725 tree stepvectype;
3726 tree resvectype;
3728 /* Is phi in an inner-loop, while vectorizing an enclosing outer-loop? */
3729 if (nested_in_vect_loop_p (loop, iv_phi))
3731 nested_in_vect_loop = true;
3732 iv_loop = loop->inner;
3734 else
3735 iv_loop = loop;
3736 gcc_assert (iv_loop == (gimple_bb (iv_phi))->loop_father);
3738 latch_e = loop_latch_edge (iv_loop);
3739 loop_arg = PHI_ARG_DEF_FROM_EDGE (iv_phi, latch_e);
3741 step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (phi_info);
3742 gcc_assert (step_expr != NULL_TREE);
3744 pe = loop_preheader_edge (iv_loop);
3745 init_expr = PHI_ARG_DEF_FROM_EDGE (iv_phi,
3746 loop_preheader_edge (iv_loop));
3748 vectype = get_vectype_for_scalar_type (TREE_TYPE (init_expr));
3749 resvectype = get_vectype_for_scalar_type (TREE_TYPE (PHI_RESULT (iv_phi)));
3750 gcc_assert (vectype);
3751 nunits = TYPE_VECTOR_SUBPARTS (vectype);
3752 ncopies = vf / nunits;
3754 gcc_assert (phi_info);
3755 gcc_assert (ncopies >= 1);
3757 /* Convert the step to the desired type. */
3758 stmts = NULL;
3759 step_expr = gimple_convert (&stmts, TREE_TYPE (vectype), step_expr);
3760 if (stmts)
3762 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
3763 gcc_assert (!new_bb);
3766 /* Find the first insertion point in the BB. */
3767 si = gsi_after_labels (bb);
3769 /* Create the vector that holds the initial_value of the induction. */
3770 if (nested_in_vect_loop)
3772 /* iv_loop is nested in the loop to be vectorized. init_expr had already
3773 been created during vectorization of previous stmts. We obtain it
3774 from the STMT_VINFO_VEC_STMT of the defining stmt. */
3775 vec_init = vect_get_vec_def_for_operand (init_expr, iv_phi);
3776 /* If the initial value is not of proper type, convert it. */
3777 if (!useless_type_conversion_p (vectype, TREE_TYPE (vec_init)))
3779 new_stmt
3780 = gimple_build_assign (vect_get_new_ssa_name (vectype,
3781 vect_simple_var,
3782 "vec_iv_"),
3783 VIEW_CONVERT_EXPR,
3784 build1 (VIEW_CONVERT_EXPR, vectype,
3785 vec_init));
3786 vec_init = gimple_assign_lhs (new_stmt);
3787 new_bb = gsi_insert_on_edge_immediate (loop_preheader_edge (iv_loop),
3788 new_stmt);
3789 gcc_assert (!new_bb);
3790 set_vinfo_for_stmt (new_stmt,
3791 new_stmt_vec_info (new_stmt, loop_vinfo));
3794 else
3796 vec<constructor_elt, va_gc> *v;
3798 /* iv_loop is the loop to be vectorized. Create:
3799 vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr) */
3800 stmts = NULL;
3801 new_name = gimple_convert (&stmts, TREE_TYPE (vectype), init_expr);
3803 vec_alloc (v, nunits);
3804 bool constant_p = is_gimple_min_invariant (new_name);
3805 CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, new_name);
3806 for (i = 1; i < nunits; i++)
3808 /* Create: new_name_i = new_name + step_expr */
3809 new_name = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (new_name),
3810 new_name, step_expr);
3811 if (!is_gimple_min_invariant (new_name))
3812 constant_p = false;
3813 CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, new_name);
3815 if (stmts)
3817 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
3818 gcc_assert (!new_bb);
3821 /* Create a vector from [new_name_0, new_name_1, ..., new_name_nunits-1] */
3822 if (constant_p)
3823 new_vec = build_vector_from_ctor (vectype, v);
3824 else
3825 new_vec = build_constructor (vectype, v);
3826 vec_init = vect_init_vector (iv_phi, new_vec, vectype, NULL);
3830 /* Create the vector that holds the step of the induction. */
3831 if (nested_in_vect_loop)
3832 /* iv_loop is nested in the loop to be vectorized. Generate:
3833 vec_step = [S, S, S, S] */
3834 new_name = step_expr;
3835 else
3837 /* iv_loop is the loop to be vectorized. Generate:
3838 vec_step = [VF*S, VF*S, VF*S, VF*S] */
3839 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
3841 expr = build_int_cst (integer_type_node, vf);
3842 expr = fold_convert (TREE_TYPE (step_expr), expr);
3844 else
3845 expr = build_int_cst (TREE_TYPE (step_expr), vf);
3846 new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
3847 expr, step_expr);
3848 if (TREE_CODE (step_expr) == SSA_NAME)
3849 new_name = vect_init_vector (iv_phi, new_name,
3850 TREE_TYPE (step_expr), NULL);
3853 t = unshare_expr (new_name);
3854 gcc_assert (CONSTANT_CLASS_P (new_name)
3855 || TREE_CODE (new_name) == SSA_NAME);
3856 stepvectype = get_vectype_for_scalar_type (TREE_TYPE (new_name));
3857 gcc_assert (stepvectype);
3858 new_vec = build_vector_from_val (stepvectype, t);
3859 vec_step = vect_init_vector (iv_phi, new_vec, stepvectype, NULL);
3862 /* Create the following def-use cycle:
3863 loop prolog:
3864 vec_init = ...
3865 vec_step = ...
3866 loop:
3867 vec_iv = PHI <vec_init, vec_loop>
3869 STMT
3871 vec_loop = vec_iv + vec_step; */
3873 /* Create the induction-phi that defines the induction-operand. */
3874 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
3875 induction_phi = create_phi_node (vec_dest, iv_loop->header);
3876 set_vinfo_for_stmt (induction_phi,
3877 new_stmt_vec_info (induction_phi, loop_vinfo));
3878 induc_def = PHI_RESULT (induction_phi);
3880 /* Create the iv update inside the loop */
3881 new_stmt = gimple_build_assign (vec_dest, PLUS_EXPR, induc_def, vec_step);
3882 vec_def = make_ssa_name (vec_dest, new_stmt);
3883 gimple_assign_set_lhs (new_stmt, vec_def);
3884 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
3885 set_vinfo_for_stmt (new_stmt, new_stmt_vec_info (new_stmt, loop_vinfo));
3887 /* Set the arguments of the phi node: */
3888 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
3889 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
3890 UNKNOWN_LOCATION);
3893 /* In case the vectorization factor (VF) is bigger than the number
3894 of elements that we can fit in a vectype (nunits), we have to generate
3895 more than one vector stmt - i.e., we need to "unroll" the
3896 vector stmt by a factor VF/nunits. For more details see documentation
3897 in vectorizable_operation. */
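   /* Illustrative example (numbers assumed): with VF = 8 and a 4-element
      vectype, ncopies = 2.  The first copy is the vector built above,
      [X, X+S, X+2*S, X+3*S]; each further copy adds [4*S, 4*S, 4*S, 4*S]
      to the previous one, giving [X+4*S, X+5*S, X+6*S, X+7*S].  */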
3899 if (ncopies > 1)
3901 stmt_vec_info prev_stmt_vinfo;
3902 /* FORNOW. This restriction should be relaxed. */
3903 gcc_assert (!nested_in_vect_loop);
3905 /* Create the vector that holds the step of the induction. */
3906 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
3908 expr = build_int_cst (integer_type_node, nunits);
3909 expr = fold_convert (TREE_TYPE (step_expr), expr);
3911 else
3912 expr = build_int_cst (TREE_TYPE (step_expr), nunits);
3913 new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
3914 expr, step_expr);
3915 if (TREE_CODE (step_expr) == SSA_NAME)
3916 new_name = vect_init_vector (iv_phi, new_name,
3917 TREE_TYPE (step_expr), NULL);
3918 t = unshare_expr (new_name);
3919 gcc_assert (CONSTANT_CLASS_P (new_name)
3920 || TREE_CODE (new_name) == SSA_NAME);
3921 new_vec = build_vector_from_val (stepvectype, t);
3922 vec_step = vect_init_vector (iv_phi, new_vec, stepvectype, NULL);
3924 vec_def = induc_def;
3925 prev_stmt_vinfo = vinfo_for_stmt (induction_phi);
3926 for (i = 1; i < ncopies; i++)
3928 /* vec_i = vec_prev + vec_step */
3929 new_stmt = gimple_build_assign (vec_dest, PLUS_EXPR,
3930 vec_def, vec_step);
3931 vec_def = make_ssa_name (vec_dest, new_stmt);
3932 gimple_assign_set_lhs (new_stmt, vec_def);
3934 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
3935 if (!useless_type_conversion_p (resvectype, vectype))
3937 new_stmt
3938 = gimple_build_assign
3939 (vect_get_new_vect_var (resvectype, vect_simple_var,
3940 "vec_iv_"),
3941 VIEW_CONVERT_EXPR,
3942 build1 (VIEW_CONVERT_EXPR, resvectype,
3943 gimple_assign_lhs (new_stmt)));
3944 gimple_assign_set_lhs (new_stmt,
3945 make_ssa_name
3946 (gimple_assign_lhs (new_stmt), new_stmt));
3947 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
3949 set_vinfo_for_stmt (new_stmt,
3950 new_stmt_vec_info (new_stmt, loop_vinfo));
3951 STMT_VINFO_RELATED_STMT (prev_stmt_vinfo) = new_stmt;
3952 prev_stmt_vinfo = vinfo_for_stmt (new_stmt);
3956 if (nested_in_vect_loop)
3958 /* Find the loop-closed exit-phi of the induction, and record
3959 the final vector of induction results: */
3960 exit_phi = NULL;
3961 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
3963 gimple *use_stmt = USE_STMT (use_p);
3964 if (is_gimple_debug (use_stmt))
3965 continue;
3967 if (!flow_bb_inside_loop_p (iv_loop, gimple_bb (use_stmt)))
3969 exit_phi = use_stmt;
3970 break;
3973 if (exit_phi)
3975 stmt_vec_info stmt_vinfo = vinfo_for_stmt (exit_phi);
3976 /* FORNOW. Currently not supporting the case that an inner-loop induction
3977 is not used in the outer-loop (i.e. only outside the outer-loop). */
3978 gcc_assert (STMT_VINFO_RELEVANT_P (stmt_vinfo)
3979 && !STMT_VINFO_LIVE_P (stmt_vinfo));
3981 STMT_VINFO_VEC_STMT (stmt_vinfo) = new_stmt;
3982 if (dump_enabled_p ())
3984 dump_printf_loc (MSG_NOTE, vect_location,
3985 "vector of inductions after inner-loop:");
3986 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, new_stmt, 0);
3987 dump_printf (MSG_NOTE, "\n");
3993 if (dump_enabled_p ())
3995 dump_printf_loc (MSG_NOTE, vect_location,
3996 "transform induction: created def-use cycle: ");
3997 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, induction_phi, 0);
3998 dump_printf (MSG_NOTE, "\n");
3999 dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
4000 SSA_NAME_DEF_STMT (vec_def), 0);
4001 dump_printf (MSG_NOTE, "\n");
4004 STMT_VINFO_VEC_STMT (phi_info) = induction_phi;
4005 if (!useless_type_conversion_p (resvectype, vectype))
4007 new_stmt = gimple_build_assign (vect_get_new_vect_var (resvectype,
4008 vect_simple_var,
4009 "vec_iv_"),
4010 VIEW_CONVERT_EXPR,
4011 build1 (VIEW_CONVERT_EXPR, resvectype,
4012 induc_def));
4013 induc_def = make_ssa_name (gimple_assign_lhs (new_stmt), new_stmt);
4014 gimple_assign_set_lhs (new_stmt, induc_def);
4015 si = gsi_after_labels (bb);
4016 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
4017 set_vinfo_for_stmt (new_stmt,
4018 new_stmt_vec_info (new_stmt, loop_vinfo));
4019 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (new_stmt))
4020 = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (induction_phi));
4023 return induc_def;
4027 /* Function get_initial_def_for_reduction
4029 Input:
4030 STMT - a stmt that performs a reduction operation in the loop.
4031 INIT_VAL - the initial value of the reduction variable
4033 Output:
4034 ADJUSTMENT_DEF - a tree that holds a value to be added to the final result
4035 of the reduction (used for adjusting the epilog - see below).
4036 Return a vector variable, initialized according to the operation that STMT
4037 performs. This vector will be used as the initial value of the
4038 vector of partial results.
4040 Option1 (adjust in epilog): Initialize the vector as follows:
4041 add/bit or/xor: [0,0,...,0,0]
4042 mult/bit and: [1,1,...,1,1]
4043 min/max/cond_expr: [init_val,init_val,..,init_val,init_val]
4044 and when necessary (e.g. add/mult case) let the caller know
4045 that it needs to adjust the result by init_val.
4047 Option2: Initialize the vector as follows:
4048 add/bit or/xor: [init_val,0,0,...,0]
4049 mult/bit and: [init_val,1,1,...,1]
4050 min/max/cond_expr: [init_val,init_val,...,init_val]
4051 and no adjustments are needed.
4053 For example, for the following code:
4055 s = init_val;
4056 for (i=0;i<n;i++)
4057 s = s + a[i];
4059 STMT is 's = s + a[i]', and the reduction variable is 's'.
4060 For a vector of 4 units, we want to return either [0,0,0,init_val],
4061 or [0,0,0,0] and let the caller know that it needs to adjust
4062 the result at the end by 'init_val'.
4064 FORNOW, we use Option1 (the 'adjust in epilog' scheme) when ADJUSTMENT_DEF
4065 is not NULL, because this way the initialization vector is simpler (same
4066 element in all entries), and Option2 otherwise.
4068 A cost model should help decide between these two schemes. */
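   Illustrative example (values assumed): for a PLUS_EXPR reduction with
   init_val = 5 and a vector of 4 units:
     Option1 returns [0, 0, 0, 0] and sets ADJUSTMENT_DEF to 5, so the
     epilog adds 5 to the reduced sum;
     Option2 returns [5, 0, 0, 0] and no adjustment is needed.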
4070 tree
4071 get_initial_def_for_reduction (gimple *stmt, tree init_val,
4072 tree *adjustment_def)
4074 stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
4075 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
4076 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4077 tree scalar_type = TREE_TYPE (init_val);
4078 tree vectype = get_vectype_for_scalar_type (scalar_type);
4079 int nunits;
4080 enum tree_code code = gimple_assign_rhs_code (stmt);
4081 tree def_for_init;
4082 tree init_def;
4083 tree *elts;
4084 int i;
4085 bool nested_in_vect_loop = false;
4086 REAL_VALUE_TYPE real_init_val = dconst0;
4087 int int_init_val = 0;
4088 gimple *def_stmt = NULL;
4089 gimple_seq stmts = NULL;
4091 gcc_assert (vectype);
4092 nunits = TYPE_VECTOR_SUBPARTS (vectype);
4094 gcc_assert (POINTER_TYPE_P (scalar_type) || INTEGRAL_TYPE_P (scalar_type)
4095 || SCALAR_FLOAT_TYPE_P (scalar_type));
4097 if (nested_in_vect_loop_p (loop, stmt))
4098 nested_in_vect_loop = true;
4099 else
4100 gcc_assert (loop == (gimple_bb (stmt))->loop_father);
4102 /* In case of double reduction we only create a vector variable to be put
4103 in the reduction phi node. The actual statement creation is done in
4104 vect_create_epilog_for_reduction. */
4105 if (adjustment_def && nested_in_vect_loop
4106 && TREE_CODE (init_val) == SSA_NAME
4107 && (def_stmt = SSA_NAME_DEF_STMT (init_val))
4108 && gimple_code (def_stmt) == GIMPLE_PHI
4109 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
4110 && vinfo_for_stmt (def_stmt)
4111 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
4112 == vect_double_reduction_def)
4114 *adjustment_def = NULL;
4115 return vect_create_destination_var (init_val, vectype);
4118 /* In case of a nested reduction do not use an adjustment def, as
4119 the epilogue generation does not handle that case correctly
4120 if ncopies is not one. */
4121 if (adjustment_def && nested_in_vect_loop)
4123 *adjustment_def = NULL;
4124 return vect_get_vec_def_for_operand (init_val, stmt);
4127 switch (code)
4129 case WIDEN_SUM_EXPR:
4130 case DOT_PROD_EXPR:
4131 case SAD_EXPR:
4132 case PLUS_EXPR:
4133 case MINUS_EXPR:
4134 case BIT_IOR_EXPR:
4135 case BIT_XOR_EXPR:
4136 case MULT_EXPR:
4137 case BIT_AND_EXPR:
4138 /* ADJUSTMENT_DEF is NULL when called from
4139 vect_create_epilog_for_reduction to vectorize double reduction. */
4140 if (adjustment_def)
4141 *adjustment_def = init_val;
4143 if (code == MULT_EXPR)
4145 real_init_val = dconst1;
4146 int_init_val = 1;
4149 if (code == BIT_AND_EXPR)
4150 int_init_val = -1;
4152 if (SCALAR_FLOAT_TYPE_P (scalar_type))
4153 def_for_init = build_real (scalar_type, real_init_val);
4154 else
4155 def_for_init = build_int_cst (scalar_type, int_init_val);
4157 /* Fill all elements of the vector but the first with '0' or '1'. */
4158 elts = XALLOCAVEC (tree, nunits);
4159 for (i = nunits - 2; i >= 0; --i)
4160 elts[i + 1] = def_for_init;
4162 /* Option1: the first element is '0' or '1' as well. */
4163 if (adjustment_def)
4165 elts[0] = def_for_init;
4166 init_def = build_vector (vectype, elts);
4167 break;
4170 /* Option2: the first element is INIT_VAL. */
4171 elts[0] = init_val;
4172 if (TREE_CONSTANT (init_val))
4173 init_def = build_vector (vectype, elts);
4174 else
4176 vec<constructor_elt, va_gc> *v;
4177 vec_alloc (v, nunits);
4178 CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, init_val);
4179 for (i = 1; i < nunits; ++i)
4180 CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, elts[i]);
4181 init_def = build_constructor (vectype, v);
4184 break;
4186 case MIN_EXPR:
4187 case MAX_EXPR:
4188 case COND_EXPR:
4189 if (adjustment_def)
4191 *adjustment_def = NULL_TREE;
4192 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_vinfo) != COND_REDUCTION)
4194 init_def = vect_get_vec_def_for_operand (init_val, stmt);
4195 break;
4198 init_val = gimple_convert (&stmts, TREE_TYPE (vectype), init_val);
4199 if (! gimple_seq_empty_p (stmts))
4200 gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), stmts);
4201 init_def = build_vector_from_val (vectype, init_val);
4202 break;
4204 default:
4205 gcc_unreachable ();
4208 return init_def;
4211 /* Function vect_create_epilog_for_reduction
4213 Create code at the loop-epilog to finalize the result of a reduction
4214 computation.
4216 VECT_DEFS is list of vector of partial results, i.e., the lhs's of vector
4217 reduction statements.
4218 STMT is the scalar reduction stmt that is being vectorized.
4219 NCOPIES is > 1 in case the vectorization factor (VF) is bigger than the
4220 number of elements that we can fit in a vectype (nunits). In this case
4221 we have to generate more than one vector stmt - i.e., we need to "unroll"
4222 the vector stmt by a factor VF/nunits. For more details see documentation
4223 in vectorizable_operation.
4224 REDUC_CODE is the tree-code for the epilog reduction.
4225 REDUCTION_PHIS is a list of the phi-nodes that carry the reduction
4226 computation.
4227 REDUC_INDEX is the index of the operand in the right hand side of the
4228 statement that is defined by REDUCTION_PHI.
4229 DOUBLE_REDUC is TRUE if double reduction phi nodes should be handled.
4230 SLP_NODE is an SLP node containing a group of reduction statements. The
4231 first one in this group is STMT.
4232 INDUCTION_INDEX is the index of the loop for condition reductions.
4233 Otherwise it is undefined.
4235 This function:
4236 1. Creates the reduction def-use cycles: sets the arguments for
4237 REDUCTION_PHIS:
4238 The loop-entry argument is the vectorized initial-value of the reduction.
4239 The loop-latch argument is taken from VECT_DEFS - the vector of partial
4240 sums.
4241 2. "Reduces" each vector of partial results VECT_DEFS into a single result,
4242 by applying the operation specified by REDUC_CODE if available, or by
4243 other means (whole-vector shifts or a scalar loop).
4244 The function also creates a new phi node at the loop exit to preserve
4245 loop-closed form, as illustrated below.
4247 The flow at the entry to this function:
4249 loop:
4250 vec_def = phi <null, null> # REDUCTION_PHI
4251 VECT_DEF = vector_stmt # vectorized form of STMT
4252 s_loop = scalar_stmt # (scalar) STMT
4253 loop_exit:
4254 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
4255 use <s_out0>
4256 use <s_out0>
4258 The above is transformed by this function into:
4260 loop:
4261 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
4262 VECT_DEF = vector_stmt # vectorized form of STMT
4263 s_loop = scalar_stmt # (scalar) STMT
4264 loop_exit:
4265 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
4266 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
4267 v_out2 = reduce <v_out1>
4268 s_out3 = extract_field <v_out2, 0>
4269 s_out4 = adjust_result <s_out3>
4270 use <s_out4>
4271 use <s_out4>  */
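/* Illustrative sketch (operands assumed): for a PLUS_EXPR reduction over
   VECT_DEF = {a0, a1, a2, a3}, step 1 computes a0+a1+a2+a3 either via
   REDUC_CODE (which already yields a scalar), via whole-vector shifts
   and adds, or via a sequence of scalar extracts and adds; step 2
   extracts the scalar result (element 0 in the shift scheme); step 3,
   when get_initial_def_for_reduction used Option1, adds the initial
   value of the reduction back in.  */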
4274 static void
4275 vect_create_epilog_for_reduction (vec<tree> vect_defs, gimple *stmt,
4276 int ncopies, enum tree_code reduc_code,
4277 vec<gimple *> reduction_phis,
4278 int reduc_index, bool double_reduc,
4279 slp_tree slp_node, tree induction_index)
4281 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4282 stmt_vec_info prev_phi_info;
4283 tree vectype;
4284 machine_mode mode;
4285 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4286 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *outer_loop = NULL;
4287 basic_block exit_bb;
4288 tree scalar_dest;
4289 tree scalar_type;
4290 gimple *new_phi = NULL, *phi;
4291 gimple_stmt_iterator exit_gsi;
4292 tree vec_dest;
4293 tree new_temp = NULL_TREE, new_dest, new_name, new_scalar_dest;
4294 gimple *epilog_stmt = NULL;
4295 enum tree_code code = gimple_assign_rhs_code (stmt);
4296 gimple *exit_phi;
4297 tree bitsize;
4298 tree adjustment_def = NULL;
4299 tree vec_initial_def = NULL;
4300 tree reduction_op, expr, def, initial_def = NULL;
4301 tree orig_name, scalar_result;
4302 imm_use_iterator imm_iter, phi_imm_iter;
4303 use_operand_p use_p, phi_use_p;
4304 gimple *use_stmt, *orig_stmt, *reduction_phi = NULL;
4305 bool nested_in_vect_loop = false;
4306 auto_vec<gimple *> new_phis;
4307 auto_vec<gimple *> inner_phis;
4308 enum vect_def_type dt = vect_unknown_def_type;
4309 int j, i;
4310 auto_vec<tree> scalar_results;
4311 unsigned int group_size = 1, k, ratio;
4312 auto_vec<tree> vec_initial_defs;
4313 auto_vec<gimple *> phis;
4314 bool slp_reduc = false;
4315 tree new_phi_result;
4316 gimple *inner_phi = NULL;
4318 if (slp_node)
4319 group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
4321 if (nested_in_vect_loop_p (loop, stmt))
4323 outer_loop = loop;
4324 loop = loop->inner;
4325 nested_in_vect_loop = true;
4326 gcc_assert (!slp_node);
4329 reduction_op = get_reduction_op (stmt, reduc_index);
4331 vectype = get_vectype_for_scalar_type (TREE_TYPE (reduction_op));
4332 gcc_assert (vectype);
4333 mode = TYPE_MODE (vectype);
4335 /* 1. Create the reduction def-use cycle:
4336 Set the arguments of REDUCTION_PHIS, i.e., transform
4338 loop:
4339 vec_def = phi <null, null> # REDUCTION_PHI
4340 VECT_DEF = vector_stmt # vectorized form of STMT
4343 into:
4345 loop:
4346 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
4347 VECT_DEF = vector_stmt # vectorized form of STMT
4350 (in case of SLP, do it for all the phis). */
4352 /* Get the loop-entry arguments. */
4353 enum vect_def_type initial_def_dt = vect_unknown_def_type;
4354 if (slp_node)
4355 vect_get_vec_defs (reduction_op, NULL_TREE, stmt, &vec_initial_defs,
4356 NULL, slp_node, reduc_index);
4357 else
4359 /* Get at the scalar def before the loop, that defines the initial value
4360 of the reduction variable. */
4361 gimple *def_stmt = SSA_NAME_DEF_STMT (reduction_op);
4362 initial_def = PHI_ARG_DEF_FROM_EDGE (def_stmt,
4363 loop_preheader_edge (loop));
4364 vect_is_simple_use (initial_def, loop_vinfo, &def_stmt, &initial_def_dt);
4365 vec_initial_def = get_initial_def_for_reduction (stmt, initial_def,
4366 &adjustment_def);
4367 vec_initial_defs.create (1);
4368 vec_initial_defs.quick_push (vec_initial_def);
4371 /* Set phi nodes arguments. */
4372 FOR_EACH_VEC_ELT (reduction_phis, i, phi)
4374 tree vec_init_def, def;
4375 gimple_seq stmts;
4376 vec_init_def = force_gimple_operand (vec_initial_defs[i], &stmts,
4377 true, NULL_TREE);
4378 gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), stmts);
4379 def = vect_defs[i];
4380 for (j = 0; j < ncopies; j++)
4382 if (j != 0)
4384 phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi));
4385 if (nested_in_vect_loop)
4386 vec_init_def
4387 = vect_get_vec_def_for_stmt_copy (initial_def_dt,
4388 vec_init_def);
4391 /* Set the loop-entry arg of the reduction-phi. */
4393 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
4394 == INTEGER_INDUC_COND_REDUCTION)
4396 /* Initialise the reduction phi to zero. This prevents non-zero
4397 initial values from interfering with the reduction op. */
4398 gcc_assert (ncopies == 1);
4399 gcc_assert (i == 0);
4401 tree vec_init_def_type = TREE_TYPE (vec_init_def);
4402 tree zero_vec = build_zero_cst (vec_init_def_type);
4404 add_phi_arg (as_a <gphi *> (phi), zero_vec,
4405 loop_preheader_edge (loop), UNKNOWN_LOCATION);
4407 else
4408 add_phi_arg (as_a <gphi *> (phi), vec_init_def,
4409 loop_preheader_edge (loop), UNKNOWN_LOCATION);
4411 /* Set the loop-latch arg for the reduction-phi. */
4412 if (j > 0)
4413 def = vect_get_vec_def_for_stmt_copy (vect_unknown_def_type, def);
4415 add_phi_arg (as_a <gphi *> (phi), def, loop_latch_edge (loop),
4416 UNKNOWN_LOCATION);
4418 if (dump_enabled_p ())
4420 dump_printf_loc (MSG_NOTE, vect_location,
4421 "transform reduction: created def-use cycle: ");
4422 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
4423 dump_printf (MSG_NOTE, "\n");
4424 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, SSA_NAME_DEF_STMT (def), 0);
4425 dump_printf (MSG_NOTE, "\n");
4430 /* 2. Create epilog code.
4431 The reduction epilog code operates across the elements of the vector
4432 of partial results computed by the vectorized loop.
4433 The reduction epilog code consists of:
4435 step 1: compute the scalar result in a vector (v_out2)
4436 step 2: extract the scalar result (s_out3) from the vector (v_out2)
4437 step 3: adjust the scalar result (s_out3) if needed.
4439 Step 1 can be accomplished using one of the following three schemes:
4440 (scheme 1) using reduc_code, if available.
4441 (scheme 2) using whole-vector shifts, if available.
4442 (scheme 3) using a scalar loop. In this case steps 1+2 above are
4443 combined.
4445 The overall epilog code looks like this:
4447 s_out0 = phi <s_loop> # original EXIT_PHI
4448 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
4449 v_out2 = reduce <v_out1> # step 1
4450 s_out3 = extract_field <v_out2, 0> # step 2
4451 s_out4 = adjust_result <s_out3> # step 3
4453 (step 3 is optional, and steps 1 and 2 may be combined).
4454 Lastly, the uses of s_out0 are replaced by s_out4. */
4457 /* 2.1 Create new loop-exit-phis to preserve loop-closed form:
4458 v_out1 = phi <VECT_DEF>
4459 Store them in NEW_PHIS. */
4461 exit_bb = single_exit (loop)->dest;
4462 prev_phi_info = NULL;
4463 new_phis.create (vect_defs.length ());
4464 FOR_EACH_VEC_ELT (vect_defs, i, def)
4466 for (j = 0; j < ncopies; j++)
4468 tree new_def = copy_ssa_name (def);
4469 phi = create_phi_node (new_def, exit_bb);
4470 set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, loop_vinfo));
4471 if (j == 0)
4472 new_phis.quick_push (phi);
4473 else
4475 def = vect_get_vec_def_for_stmt_copy (dt, def);
4476 STMT_VINFO_RELATED_STMT (prev_phi_info) = phi;
4479 SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, def);
4480 prev_phi_info = vinfo_for_stmt (phi);
4484 /* The epilogue is created for the outer-loop, i.e., for the loop being
4485 vectorized. Create exit phis for the outer loop. */
4486 if (double_reduc)
4488 loop = outer_loop;
4489 exit_bb = single_exit (loop)->dest;
4490 inner_phis.create (vect_defs.length ());
4491 FOR_EACH_VEC_ELT (new_phis, i, phi)
4493 tree new_result = copy_ssa_name (PHI_RESULT (phi));
4494 gphi *outer_phi = create_phi_node (new_result, exit_bb);
4495 SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx,
4496 PHI_RESULT (phi));
4497 set_vinfo_for_stmt (outer_phi, new_stmt_vec_info (outer_phi,
4498 loop_vinfo));
4499 inner_phis.quick_push (phi);
4500 new_phis[i] = outer_phi;
4501 prev_phi_info = vinfo_for_stmt (outer_phi);
4502 while (STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi)))
4504 phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi));
4505 new_result = copy_ssa_name (PHI_RESULT (phi));
4506 outer_phi = create_phi_node (new_result, exit_bb);
4507 SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx,
4508 PHI_RESULT (phi));
4509 set_vinfo_for_stmt (outer_phi, new_stmt_vec_info (outer_phi,
4510 loop_vinfo));
4511 STMT_VINFO_RELATED_STMT (prev_phi_info) = outer_phi;
4512 prev_phi_info = vinfo_for_stmt (outer_phi);
4517 exit_gsi = gsi_after_labels (exit_bb);
4519 /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
4520 (i.e. when reduc_code is not available) and in the final adjustment
4521 code (if needed). Also get the original scalar reduction variable as
4522 defined in the loop. In case STMT is a "pattern-stmt" (i.e. - it
4523 represents a reduction pattern), the tree-code and scalar-def are
4524 taken from the original stmt that the pattern-stmt (STMT) replaces.
4525 Otherwise (it is a regular reduction) - the tree-code and scalar-def
4526 are taken from STMT. */
4528 orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
4529 if (!orig_stmt)
4531 /* Regular reduction */
4532 orig_stmt = stmt;
4534 else
4536 /* Reduction pattern */
4537 stmt_vec_info stmt_vinfo = vinfo_for_stmt (orig_stmt);
4538 gcc_assert (STMT_VINFO_IN_PATTERN_P (stmt_vinfo));
4539 gcc_assert (STMT_VINFO_RELATED_STMT (stmt_vinfo) == stmt);
4542 code = gimple_assign_rhs_code (orig_stmt);
4543 /* For MINUS_EXPR the initial vector is [init_val,0,...,0], therefore,
4544 partial results are added and not subtracted. */
4545 if (code == MINUS_EXPR)
4546 code = PLUS_EXPR;
4548 scalar_dest = gimple_assign_lhs (orig_stmt);
4549 scalar_type = TREE_TYPE (scalar_dest);
4550 scalar_results.create (group_size);
4551 new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
4552 bitsize = TYPE_SIZE (scalar_type);
4554 /* In case this is a reduction in an inner-loop while vectorizing an outer
4555 loop - we don't need to extract a single scalar result at the end of the
4556 inner-loop (unless it is double reduction, i.e., the use of reduction is
4557 outside the outer-loop). The final vector of partial results will be used
4558 in the vectorized outer-loop, or reduced to a scalar result at the end of
4559 the outer-loop. */
4560 if (nested_in_vect_loop && !double_reduc)
4561 goto vect_finalize_reduction;
4563 /* SLP reduction without reduction chain, e.g.,
4564 # a1 = phi <a2, a0>
4565 # b1 = phi <b2, b0>
4566 a2 = operation (a1)
4567 b2 = operation (b1) */
4568 slp_reduc = (slp_node && !GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)));
4570 /* In case of reduction chain, e.g.,
4571 # a1 = phi <a3, a0>
4572 a2 = operation (a1)
4573 a3 = operation (a2),
4575 we may end up with more than one vector result. Here we reduce them to
4576 one vector. */
4577 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
4579 tree first_vect = PHI_RESULT (new_phis[0]);
4580 tree tmp;
4581 gassign *new_vec_stmt = NULL;
4583 vec_dest = vect_create_destination_var (scalar_dest, vectype);
4584 for (k = 1; k < new_phis.length (); k++)
4586 gimple *next_phi = new_phis[k];
4587 tree second_vect = PHI_RESULT (next_phi);
4589 tmp = build2 (code, vectype, first_vect, second_vect);
4590 new_vec_stmt = gimple_build_assign (vec_dest, tmp);
4591 first_vect = make_ssa_name (vec_dest, new_vec_stmt);
4592 gimple_assign_set_lhs (new_vec_stmt, first_vect);
4593 gsi_insert_before (&exit_gsi, new_vec_stmt, GSI_SAME_STMT);
4596 new_phi_result = first_vect;
4597 if (new_vec_stmt)
4599 new_phis.truncate (0);
4600 new_phis.safe_push (new_vec_stmt);
4603 else
4604 new_phi_result = PHI_RESULT (new_phis[0]);
4606 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION)
4608 /* For condition reductions, we have a vector (NEW_PHI_RESULT) containing
4609 various data values where the condition matched and another vector
4610 (INDUCTION_INDEX) containing all the indexes of those matches. We
4611 need to extract the last matching index (which will be the index with
4612 highest value) and use this to index into the data vector.
4613 For the case where there were no matches, the data vector will contain
4614 all default values and the index vector will be all zeros. */
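   /* Illustrative example (lane values assumed): with
        NEW_PHI_RESULT  = { d0, d1, d2, d3 }
        INDUCTION_INDEX = {  0,  2,  0,  4 }
      the MAX reduction of the indexes gives 4, the comparison below selects
      only lane 3, the VEC_COND yields { 0, 0, 0, d3 } and the final unsigned
      MAX reduction extracts d3 as the scalar result.  */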
4616 /* Get various versions of the type of the vector of indexes. */
4617 tree index_vec_type = TREE_TYPE (induction_index);
4618 gcc_checking_assert (TYPE_UNSIGNED (index_vec_type));
4619 tree index_scalar_type = TREE_TYPE (index_vec_type);
4620 tree index_vec_cmp_type = build_same_sized_truth_vector_type
4621 (index_vec_type);
4623 /* Get an unsigned integer version of the type of the data vector. */
4624 int scalar_precision = GET_MODE_PRECISION (TYPE_MODE (scalar_type));
4625 tree scalar_type_unsigned = make_unsigned_type (scalar_precision);
4626 tree vectype_unsigned = build_vector_type
4627 (scalar_type_unsigned, TYPE_VECTOR_SUBPARTS (vectype));
4629 /* First we need to create a vector (ZERO_VEC) of zeros and another
4630 vector (MAX_INDEX_VEC) filled with the last matching index, which we
4631 can create using a MAX reduction and then expanding.
4632 In the case where the loop never made any matches, the max index will
4633 be zero. */
4635 /* Vector of {0, 0, 0,...}. */
4636 tree zero_vec = make_ssa_name (vectype);
4637 tree zero_vec_rhs = build_zero_cst (vectype);
4638 gimple *zero_vec_stmt = gimple_build_assign (zero_vec, zero_vec_rhs);
4639 gsi_insert_before (&exit_gsi, zero_vec_stmt, GSI_SAME_STMT);
4641 /* Find maximum value from the vector of found indexes. */
4642 tree max_index = make_ssa_name (index_scalar_type);
4643 gimple *max_index_stmt = gimple_build_assign (max_index, REDUC_MAX_EXPR,
4644 induction_index);
4645 gsi_insert_before (&exit_gsi, max_index_stmt, GSI_SAME_STMT);
4647 /* Vector of {max_index, max_index, max_index,...}. */
4648 tree max_index_vec = make_ssa_name (index_vec_type);
4649 tree max_index_vec_rhs = build_vector_from_val (index_vec_type,
4650 max_index);
4651 gimple *max_index_vec_stmt = gimple_build_assign (max_index_vec,
4652 max_index_vec_rhs);
4653 gsi_insert_before (&exit_gsi, max_index_vec_stmt, GSI_SAME_STMT);
4655 /* Next we compare the new vector (MAX_INDEX_VEC) full of max indexes
4656 with the vector (INDUCTION_INDEX) of found indexes, choosing values
4657 from the data vector (NEW_PHI_RESULT) for matches, 0 (ZERO_VEC)
4658 otherwise. Only one value should match, resulting in a vector
4659 (VEC_COND) with one data value and the rest zeros.
4660 In the case where the loop never made any matches, every index will
4661 match, resulting in a vector with all data values (which will all be
4662 the default value). */
4664 /* Compare the max index vector to the vector of found indexes to find
4665 the position of the max value. */
4666 tree vec_compare = make_ssa_name (index_vec_cmp_type);
4667 gimple *vec_compare_stmt = gimple_build_assign (vec_compare, EQ_EXPR,
4668 induction_index,
4669 max_index_vec);
4670 gsi_insert_before (&exit_gsi, vec_compare_stmt, GSI_SAME_STMT);
4672 /* Use the compare to choose either values from the data vector or
4673 zero. */
4674 tree vec_cond = make_ssa_name (vectype);
4675 gimple *vec_cond_stmt = gimple_build_assign (vec_cond, VEC_COND_EXPR,
4676 vec_compare, new_phi_result,
4677 zero_vec);
4678 gsi_insert_before (&exit_gsi, vec_cond_stmt, GSI_SAME_STMT);
4680 /* Finally we need to extract the data value from the vector (VEC_COND)
4681 into a scalar (MATCHED_DATA_REDUC). Logically we want to do an OR
4682 reduction, but because this doesn't exist, we can use a MAX reduction
4683 instead. The data value might be signed or a float so we need to cast
4684 it first.
4685 In the case where the loop never made any matches, the data values are
4686 all identical, and so will reduce down correctly. */
4688 /* Make the matched data values unsigned. */
4689 tree vec_cond_cast = make_ssa_name (vectype_unsigned);
4690 tree vec_cond_cast_rhs = build1 (VIEW_CONVERT_EXPR, vectype_unsigned,
4691 vec_cond);
4692 gimple *vec_cond_cast_stmt = gimple_build_assign (vec_cond_cast,
4693 VIEW_CONVERT_EXPR,
4694 vec_cond_cast_rhs);
4695 gsi_insert_before (&exit_gsi, vec_cond_cast_stmt, GSI_SAME_STMT);
4697 /* Reduce down to a scalar value. */
4698 tree data_reduc = make_ssa_name (scalar_type_unsigned);
4699 optab ot = optab_for_tree_code (REDUC_MAX_EXPR, vectype_unsigned,
4700 optab_default);
4701 gcc_assert (optab_handler (ot, TYPE_MODE (vectype_unsigned))
4702 != CODE_FOR_nothing);
4703 gimple *data_reduc_stmt = gimple_build_assign (data_reduc,
4704 REDUC_MAX_EXPR,
4705 vec_cond_cast);
4706 gsi_insert_before (&exit_gsi, data_reduc_stmt, GSI_SAME_STMT);
4708 /* Convert the reduced value back to the result type and set as the
4709 result. */
4710 tree data_reduc_cast = build1 (VIEW_CONVERT_EXPR, scalar_type,
4711 data_reduc);
4712 epilog_stmt = gimple_build_assign (new_scalar_dest, data_reduc_cast);
4713 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
4714 gimple_assign_set_lhs (epilog_stmt, new_temp);
4715 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4716 scalar_results.safe_push (new_temp);
4719 /* 2.3 Create the reduction code, using one of the three schemes described
4720 above. In SLP we simply need to extract all the elements from the
4721 vector (without reducing them), so we use scalar shifts. */
4722 else if (reduc_code != ERROR_MARK && !slp_reduc)
4724 tree tmp;
4725 tree vec_elem_type;
4727 /*** Case 1: Create:
4728 v_out2 = reduc_expr <v_out1> */
4730 if (dump_enabled_p ())
4731 dump_printf_loc (MSG_NOTE, vect_location,
4732 "Reduce using direct vector reduction.\n");
4734 vec_elem_type = TREE_TYPE (TREE_TYPE (new_phi_result));
4735 if (!useless_type_conversion_p (scalar_type, vec_elem_type))
4737 tree tmp_dest =
4738 vect_create_destination_var (scalar_dest, vec_elem_type);
4739 tmp = build1 (reduc_code, vec_elem_type, new_phi_result);
4740 epilog_stmt = gimple_build_assign (tmp_dest, tmp);
4741 new_temp = make_ssa_name (tmp_dest, epilog_stmt);
4742 gimple_assign_set_lhs (epilog_stmt, new_temp);
4743 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4745 tmp = build1 (NOP_EXPR, scalar_type, new_temp);
4747 else
4748 tmp = build1 (reduc_code, scalar_type, new_phi_result);
4750 epilog_stmt = gimple_build_assign (new_scalar_dest, tmp);
4751 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
4752 gimple_assign_set_lhs (epilog_stmt, new_temp);
4753 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4755 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
4756 == INTEGER_INDUC_COND_REDUCTION)
4758 /* Earlier we set the initial value to be zero. Check the result
4759 and if it is zero then replace with the original initial
4760 value. */
4761 tree zero = build_zero_cst (scalar_type);
4762 tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp, zero);
4764 tmp = make_ssa_name (new_scalar_dest);
4765 epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
4766 initial_def, new_temp);
4767 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4768 new_temp = tmp;
4771 scalar_results.safe_push (new_temp);
4773 else
4775 bool reduce_with_shift = have_whole_vector_shift (mode);
4776 int element_bitsize = tree_to_uhwi (bitsize);
4777 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
4778 tree vec_temp;
4780 /* Regardless of whether we have a whole vector shift, if we're
4781 emulating the operation via tree-vect-generic, we don't want
4782 to use it. Only the first round of the reduction is likely
4783 to still be profitable via emulation. */
4784 /* ??? It might be better to emit a reduction tree code here, so that
4785 tree-vect-generic can expand the first round via bit tricks. */
4786 if (!VECTOR_MODE_P (mode))
4787 reduce_with_shift = false;
4788 else
4790 optab optab = optab_for_tree_code (code, vectype, optab_default);
4791 if (optab_handler (optab, mode) == CODE_FOR_nothing)
4792 reduce_with_shift = false;
4795 if (reduce_with_shift && !slp_reduc)
4797 int nelements = vec_size_in_bits / element_bitsize;
4798 unsigned char *sel = XALLOCAVEC (unsigned char, nelements);
4800 int elt_offset;
4802 tree zero_vec = build_zero_cst (vectype);
4803 /*** Case 2: Create:
4804 for (offset = nelements/2; offset >= 1; offset/=2)
4806 Create: va' = vec_shift <va, offset>
4807 Create: va = vop <va, va'>
4808 } */
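   /* Illustrative sketch for a 4-element sum { a0, a1, a2, a3 }
      (lane values assumed, 'x' denotes a don't-care lane):
        shift by 2: { a2, a3, 0, 0 },   add -> { a0+a2, a1+a3, x, x }
        shift by 1: { a1+a3, x, x, 0 }, add -> { a0+a1+a2+a3, x, x, x }
      after which the result is extracted from element 0 below.  */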
4810 tree rhs;
4812 if (dump_enabled_p ())
4813 dump_printf_loc (MSG_NOTE, vect_location,
4814 "Reduce using vector shifts\n");
4816 vec_dest = vect_create_destination_var (scalar_dest, vectype);
4817 new_temp = new_phi_result;
4818 for (elt_offset = nelements / 2;
4819 elt_offset >= 1;
4820 elt_offset /= 2)
4822 calc_vec_perm_mask_for_shift (mode, elt_offset, sel);
4823 tree mask = vect_gen_perm_mask_any (vectype, sel);
4824 epilog_stmt = gimple_build_assign (vec_dest, VEC_PERM_EXPR,
4825 new_temp, zero_vec, mask);
4826 new_name = make_ssa_name (vec_dest, epilog_stmt);
4827 gimple_assign_set_lhs (epilog_stmt, new_name);
4828 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4830 epilog_stmt = gimple_build_assign (vec_dest, code, new_name,
4831 new_temp);
4832 new_temp = make_ssa_name (vec_dest, epilog_stmt);
4833 gimple_assign_set_lhs (epilog_stmt, new_temp);
4834 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4837 /* 2.4 Extract the final scalar result. Create:
4838 s_out3 = extract_field <v_out2, bitpos> */
4840 if (dump_enabled_p ())
4841 dump_printf_loc (MSG_NOTE, vect_location,
4842 "extract scalar result\n");
4844 rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp,
4845 bitsize, bitsize_zero_node);
4846 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
4847 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
4848 gimple_assign_set_lhs (epilog_stmt, new_temp);
4849 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4850 scalar_results.safe_push (new_temp);
4852 else
4854 /*** Case 3: Create:
4855 s = extract_field <v_out2, 0>
4856 for (offset = element_size;
4857 offset < vector_size;
4858 offset += element_size;)
4860 Create: s' = extract_field <v_out2, offset>
4861 Create: s = op <s, s'> // For non SLP cases
4862 } */
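   /* E.g., for a 4-element vector this emits four BIT_FIELD_REF extracts
      and, in the non-SLP case, three scalar ops combining them (the SLP
      case keeps the extracted values separate in SCALAR_RESULTS).  */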
4864 if (dump_enabled_p ())
4865 dump_printf_loc (MSG_NOTE, vect_location,
4866 "Reduce using scalar code.\n");
4868 vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
4869 FOR_EACH_VEC_ELT (new_phis, i, new_phi)
4871 int bit_offset;
4872 if (gimple_code (new_phi) == GIMPLE_PHI)
4873 vec_temp = PHI_RESULT (new_phi);
4874 else
4875 vec_temp = gimple_assign_lhs (new_phi);
4876 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, bitsize,
4877 bitsize_zero_node);
4878 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
4879 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
4880 gimple_assign_set_lhs (epilog_stmt, new_temp);
4881 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4883 /* In SLP we don't need to apply the reduction operation, so we just
4884 collect s' values in SCALAR_RESULTS. */
4885 if (slp_reduc)
4886 scalar_results.safe_push (new_temp);
4888 for (bit_offset = element_bitsize;
4889 bit_offset < vec_size_in_bits;
4890 bit_offset += element_bitsize)
4892 tree bitpos = bitsize_int (bit_offset);
4893 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp,
4894 bitsize, bitpos);
4896 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
4897 new_name = make_ssa_name (new_scalar_dest, epilog_stmt);
4898 gimple_assign_set_lhs (epilog_stmt, new_name);
4899 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4901 if (slp_reduc)
4903 /* In SLP we don't need to apply the reduction operation, so
4904 we just collect s' values in SCALAR_RESULTS. */
4905 new_temp = new_name;
4906 scalar_results.safe_push (new_name);
4908 else
4910 epilog_stmt = gimple_build_assign (new_scalar_dest, code,
4911 new_name, new_temp);
4912 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
4913 gimple_assign_set_lhs (epilog_stmt, new_temp);
4914 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4919 /* The only case where we need to reduce scalar results in SLP is
4920 unrolling. If the size of SCALAR_RESULTS is greater than
4921 GROUP_SIZE, we reduce them combining elements modulo
4922 GROUP_SIZE. */
4923 if (slp_reduc)
4925 tree res, first_res, new_res;
4926 gimple *new_stmt;
4928 /* Reduce multiple scalar results in case of SLP unrolling. */
4929 for (j = group_size; scalar_results.iterate (j, &res);
4930 j++)
4932 first_res = scalar_results[j % group_size];
4933 new_stmt = gimple_build_assign (new_scalar_dest, code,
4934 first_res, res);
4935 new_res = make_ssa_name (new_scalar_dest, new_stmt);
4936 gimple_assign_set_lhs (new_stmt, new_res);
4937 gsi_insert_before (&exit_gsi, new_stmt, GSI_SAME_STMT);
4938 scalar_results[j % group_size] = new_res;
4941 else
4942 /* Not SLP - we have one scalar to keep in SCALAR_RESULTS. */
4943 scalar_results.safe_push (new_temp);
4947 vect_finalize_reduction:
4949 if (double_reduc)
4950 loop = loop->inner;
4952 /* 2.5 Adjust the final result by the initial value of the reduction
4953 variable. (When such adjustment is not needed, then
4954 'adjustment_def' is zero). For example, if code is PLUS we create:
4955 new_temp = loop_exit_def + adjustment_def */
4957 if (adjustment_def)
4959 gcc_assert (!slp_reduc);
4960 if (nested_in_vect_loop)
4962 new_phi = new_phis[0];
4963 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) == VECTOR_TYPE);
4964 expr = build2 (code, vectype, PHI_RESULT (new_phi), adjustment_def);
4965 new_dest = vect_create_destination_var (scalar_dest, vectype);
4967 else
4969 new_temp = scalar_results[0];
4970 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
4971 expr = build2 (code, scalar_type, new_temp, adjustment_def);
4972 new_dest = vect_create_destination_var (scalar_dest, scalar_type);
4975 epilog_stmt = gimple_build_assign (new_dest, expr);
4976 new_temp = make_ssa_name (new_dest, epilog_stmt);
4977 gimple_assign_set_lhs (epilog_stmt, new_temp);
4978 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4979 if (nested_in_vect_loop)
4981 set_vinfo_for_stmt (epilog_stmt,
4982 new_stmt_vec_info (epilog_stmt, loop_vinfo));
4983 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (epilog_stmt)) =
4984 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (new_phi));
4986 if (!double_reduc)
4987 scalar_results.quick_push (new_temp);
4988 else
4989 scalar_results[0] = new_temp;
4991 else
4992 scalar_results[0] = new_temp;
4994 new_phis[0] = epilog_stmt;
4997 /* 2.6 Handle the loop-exit phis. Replace the uses of scalar loop-exit
4998 phis with new adjusted scalar results, i.e., replace use <s_out0>
4999 with use <s_out4>.
5001 Transform:
5002 loop_exit:
5003 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
5004 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
5005 v_out2 = reduce <v_out1>
5006 s_out3 = extract_field <v_out2, 0>
5007 s_out4 = adjust_result <s_out3>
5008 use <s_out0>
5009 use <s_out0>
5011 into:
5013 loop_exit:
5014 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
5015 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
5016 v_out2 = reduce <v_out1>
5017 s_out3 = extract_field <v_out2, 0>
5018 s_out4 = adjust_result <s_out3>
5019 use <s_out4>
5020 use <s_out4> */
5023 /* In SLP reduction chain we reduce vector results into one vector if
5024 necessary, hence we set here GROUP_SIZE to 1. SCALAR_DEST is the LHS of
5025 the last stmt in the reduction chain, since we are looking for the loop
5026 exit phi node. */
5027 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
5029 gimple *dest_stmt = SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1];
5030 /* Handle reduction patterns. */
5031 if (STMT_VINFO_RELATED_STMT (vinfo_for_stmt (dest_stmt)))
5032 dest_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (dest_stmt));
5034 scalar_dest = gimple_assign_lhs (dest_stmt);
5035 group_size = 1;
5038 /* In SLP we may have several statements in NEW_PHIS and REDUCTION_PHIS (in
5039 case that GROUP_SIZE is greater than the vectorization factor). Therefore, we
5040 need to match SCALAR_RESULTS with corresponding statements. The first
5041 (GROUP_SIZE / number of new vector stmts) scalar results correspond to
5042 the first vector stmt, etc.
5043 (RATIO is equal to (GROUP_SIZE / number of new vector stmts)). */
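   /* E.g., assuming GROUP_SIZE = 4 and two vector stmts in NEW_PHIS,
      RATIO = 2: scalar results 0 and 1 are matched with the first vector
      stmt and scalar results 2 and 3 with the second.  */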
5044 if (group_size > new_phis.length ())
5046 ratio = group_size / new_phis.length ();
5047 gcc_assert (!(group_size % new_phis.length ()));
5049 else
5050 ratio = 1;
5052 for (k = 0; k < group_size; k++)
5054 if (k % ratio == 0)
5056 epilog_stmt = new_phis[k / ratio];
5057 reduction_phi = reduction_phis[k / ratio];
5058 if (double_reduc)
5059 inner_phi = inner_phis[k / ratio];
5062 if (slp_reduc)
5064 gimple *current_stmt = SLP_TREE_SCALAR_STMTS (slp_node)[k];
5066 orig_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (current_stmt));
5067 /* SLP statements can't participate in patterns. */
5068 gcc_assert (!orig_stmt);
5069 scalar_dest = gimple_assign_lhs (current_stmt);
5072 phis.create (3);
5073 /* Find the loop-closed-use at the loop exit of the original scalar
5074 result. (The reduction result is expected to have two immediate uses -
5075 one at the latch block, and one at the loop exit). */
5076 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
5077 if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p)))
5078 && !is_gimple_debug (USE_STMT (use_p)))
5079 phis.safe_push (USE_STMT (use_p));
5081 /* While we expect to have found an exit_phi because of loop-closed-ssa
5082 form we can end up without one if the scalar cycle is dead. */
5084 FOR_EACH_VEC_ELT (phis, i, exit_phi)
5086 if (outer_loop)
5088 stmt_vec_info exit_phi_vinfo = vinfo_for_stmt (exit_phi);
5089 gphi *vect_phi;
5091 /* FORNOW. Currently not supporting the case that an inner-loop
5092 reduction is not used in the outer-loop (but only outside the
5093 outer-loop), unless it is double reduction. */
5094 gcc_assert ((STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
5095 && !STMT_VINFO_LIVE_P (exit_phi_vinfo))
5096 || double_reduc);
5098 if (double_reduc)
5099 STMT_VINFO_VEC_STMT (exit_phi_vinfo) = inner_phi;
5100 else
5101 STMT_VINFO_VEC_STMT (exit_phi_vinfo) = epilog_stmt;
5102 if (!double_reduc
5103 || STMT_VINFO_DEF_TYPE (exit_phi_vinfo)
5104 != vect_double_reduction_def)
5105 continue;
5107 /* Handle double reduction:
5109 stmt1: s1 = phi <s0, s2> - double reduction phi (outer loop)
5110 stmt2: s3 = phi <s1, s4> - (regular) reduc phi (inner loop)
5111 stmt3: s4 = use (s3) - (regular) reduc stmt (inner loop)
5112 stmt4: s2 = phi <s4> - double reduction stmt (outer loop)
5114 At that point the regular reduction (stmt2 and stmt3) is
5115 already vectorized, as well as the exit phi node, stmt4.
5116 Here we vectorize the phi node of double reduction, stmt1, and
5117 update all relevant statements. */
5119 /* Go through all the uses of s2 to find double reduction phi
5120 node, i.e., stmt1 above. */
5121 orig_name = PHI_RESULT (exit_phi);
5122 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
5124 stmt_vec_info use_stmt_vinfo;
5125 stmt_vec_info new_phi_vinfo;
5126 tree vect_phi_init, preheader_arg, vect_phi_res, init_def;
5127 basic_block bb = gimple_bb (use_stmt);
5128 gimple *use;
5130 /* Check that USE_STMT is really double reduction phi
5131 node. */
5132 if (gimple_code (use_stmt) != GIMPLE_PHI
5133 || gimple_phi_num_args (use_stmt) != 2
5134 || bb->loop_father != outer_loop)
5135 continue;
5136 use_stmt_vinfo = vinfo_for_stmt (use_stmt);
5137 if (!use_stmt_vinfo
5138 || STMT_VINFO_DEF_TYPE (use_stmt_vinfo)
5139 != vect_double_reduction_def)
5140 continue;
5142 /* Create vector phi node for double reduction:
5143 vs1 = phi <vs0, vs2>
5144 vs1 was created previously in this function by a call to
5145 vect_get_vec_def_for_operand and is stored in
5146 vec_initial_def;
5147 vs2 is defined by INNER_PHI, the vectorized EXIT_PHI;
5148 vs0 is created here. */
5150 /* Create vector phi node. */
5151 vect_phi = create_phi_node (vec_initial_def, bb);
5152 new_phi_vinfo = new_stmt_vec_info (vect_phi,
5153 loop_vec_info_for_loop (outer_loop));
5154 set_vinfo_for_stmt (vect_phi, new_phi_vinfo);
5156 /* Create vs0 - initial def of the double reduction phi. */
5157 preheader_arg = PHI_ARG_DEF_FROM_EDGE (use_stmt,
5158 loop_preheader_edge (outer_loop));
5159 init_def = get_initial_def_for_reduction (stmt,
5160 preheader_arg, NULL);
5161 vect_phi_init = vect_init_vector (use_stmt, init_def,
5162 vectype, NULL);
5164 /* Update phi node arguments with vs0 and vs2. */
5165 add_phi_arg (vect_phi, vect_phi_init,
5166 loop_preheader_edge (outer_loop),
5167 UNKNOWN_LOCATION);
5168 add_phi_arg (vect_phi, PHI_RESULT (inner_phi),
5169 loop_latch_edge (outer_loop), UNKNOWN_LOCATION);
5170 if (dump_enabled_p ())
5172 dump_printf_loc (MSG_NOTE, vect_location,
5173 "created double reduction phi node: ");
5174 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, vect_phi, 0);
5175 dump_printf (MSG_NOTE, "\n");
5178 vect_phi_res = PHI_RESULT (vect_phi);
5180 /* Replace the use, i.e., set the correct vs1 in the regular
5181 reduction phi node. FORNOW, NCOPIES is always 1, so the
5182 loop is redundant. */
5183 use = reduction_phi;
5184 for (j = 0; j < ncopies; j++)
5186 edge pr_edge = loop_preheader_edge (loop);
5187 SET_PHI_ARG_DEF (use, pr_edge->dest_idx, vect_phi_res);
5188 use = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (use));
5194 phis.release ();
5195 if (nested_in_vect_loop)
5197 if (double_reduc)
5198 loop = outer_loop;
5199 else
5200 continue;
5203 phis.create (3);
5204 /* Find the loop-closed-use at the loop exit of the original scalar
5205 result. (The reduction result is expected to have two immediate uses,
5206 one at the latch block, and one at the loop exit). For double
5207 reductions we are looking for exit phis of the outer loop. */
5208 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
5210 if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
5212 if (!is_gimple_debug (USE_STMT (use_p)))
5213 phis.safe_push (USE_STMT (use_p));
5215 else
5217 if (double_reduc && gimple_code (USE_STMT (use_p)) == GIMPLE_PHI)
5219 tree phi_res = PHI_RESULT (USE_STMT (use_p));
5221 FOR_EACH_IMM_USE_FAST (phi_use_p, phi_imm_iter, phi_res)
5223 if (!flow_bb_inside_loop_p (loop,
5224 gimple_bb (USE_STMT (phi_use_p)))
5225 && !is_gimple_debug (USE_STMT (phi_use_p)))
5226 phis.safe_push (USE_STMT (phi_use_p));
5232 FOR_EACH_VEC_ELT (phis, i, exit_phi)
5234 /* Replace the uses: */
5235 orig_name = PHI_RESULT (exit_phi);
5236 scalar_result = scalar_results[k];
5237 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
5238 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
5239 SET_USE (use_p, scalar_result);
5242 phis.release ();
5247 /* Function is_nonwrapping_integer_induction.
5249 Check that STMT (which is part of loop LOOP) is an induction that
5250 only increments and cannot cause overflow. */
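/* Illustrative example (numbers assumed): for base = 0, step = 4, an
   unsigned 16-bit result type and at most 1000 iterations, the maximum
   value is 0 + 4 * 1000 = 4000, which needs only 12 bits of precision
   and therefore cannot wrap in the 16-bit type.  */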
5252 static bool
5253 is_nonwrapping_integer_induction (gimple *stmt, struct loop *loop)
5255 stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
5256 tree base = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo);
5257 tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo);
5258 tree lhs_type = TREE_TYPE (gimple_phi_result (stmt));
5259 widest_int ni, max_loop_value, lhs_max;
5260 bool overflow = false;
5262 /* Make sure the loop is integer based. */
5263 if (TREE_CODE (base) != INTEGER_CST
5264 || TREE_CODE (step) != INTEGER_CST)
5265 return false;
5267 /* Check that the induction increments. */
5268 if (tree_int_cst_sgn (step) == -1)
5269 return false;
5271 /* Check that the max size of the loop will not wrap. */
5273 if (TYPE_OVERFLOW_UNDEFINED (lhs_type))
5274 return true;
5276 if (! max_stmt_executions (loop, &ni))
5277 return false;
5279 max_loop_value = wi::mul (wi::to_widest (step), ni, TYPE_SIGN (lhs_type),
5280 &overflow);
5281 if (overflow)
5282 return false;
5284 max_loop_value = wi::add (wi::to_widest (base), max_loop_value,
5285 TYPE_SIGN (lhs_type), &overflow);
5286 if (overflow)
5287 return false;
5289 return (wi::min_precision (max_loop_value, TYPE_SIGN (lhs_type))
5290 <= TYPE_PRECISION (lhs_type));
5293 /* Function vectorizable_reduction.
5295 Check if STMT performs a reduction operation that can be vectorized.
5296 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
5297 stmt to replace it, put it in VEC_STMT, and insert it at GSI.
5298 Return FALSE if not a vectorizable STMT, TRUE otherwise.
5300 This function also handles reduction idioms (patterns) that have been
5301 recognized in advance during vect_pattern_recog. In this case, STMT may be
5302 of this form:
5303 X = pattern_expr (arg0, arg1, ..., X)
5304 and its STMT_VINFO_RELATED_STMT points to the last stmt in the original
5305 sequence that had been detected and replaced by the pattern-stmt (STMT).
5307 This function also handles reduction of condition expressions, for example:
5308 for (int i = 0; i < N; i++)
5309 if (a[i] < value)
5310 last = a[i];
5311 This is handled by vectorising the loop and creating an additional vector
5312 containing the loop indexes for which "a[i] < value" was true. In the
5313 function epilogue this is reduced to a single max value and then used to
5314 index into the vector of results.
5316 In some cases of reduction patterns, the type of the reduction variable X is
5317 different than the type of the other arguments of STMT.
5318 In such cases, the vectype that is used when transforming STMT into a vector
5319 stmt is different than the vectype that is used to determine the
5320 vectorization factor, because it consists of a different number of elements
5321 than the actual number of elements that are being operated upon in parallel.
5323 For example, consider an accumulation of shorts into an int accumulator.
5324 On some targets it's possible to vectorize this pattern operating on 8
5325 shorts at a time (hence, the vectype for purposes of determining the
5326 vectorization factor should be V8HI); on the other hand, the vectype that
5327 is used to create the vector form is actually V4SI (the type of the result).
5329 Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
5330 indicates the actual level of parallelism (V8HI in the example), so
5331 that the right vectorization factor would be derived. This vectype
5332 corresponds to the type of arguments to the reduction stmt, and should *NOT*
5333 be used to create the vectorized stmt. The right vectype for the vectorized
5334 stmt is obtained from the type of the result X:
5335 get_vectype_for_scalar_type (TREE_TYPE (X))
5337 This means that, contrary to "regular" reductions (or "regular" stmts in
5338 general), the following equation:
5339 STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (TREE_TYPE (X))
5340 does *NOT* necessarily hold for reduction patterns. */
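/* For instance (pattern assumed), a widening summation of shorts into an
   int accumulator may have been replaced by the pattern stmt
     X = WIDEN_SUM_EXPR <short_arg, X>
   where the V8HI vectype of short_arg determines the vectorization factor
   while the vectorized stmt itself is created with the V4SI vectype of X,
   as described above.  */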
5342 bool
5343 vectorizable_reduction (gimple *stmt, gimple_stmt_iterator *gsi,
5344 gimple **vec_stmt, slp_tree slp_node)
5346 tree vec_dest;
5347 tree scalar_dest;
5348 tree loop_vec_def0 = NULL_TREE, loop_vec_def1 = NULL_TREE;
5349 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5350 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
5351 tree vectype_in = NULL_TREE;
5352 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5353 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5354 enum tree_code code, orig_code, epilog_reduc_code;
5355 machine_mode vec_mode;
5356 int op_type;
5357 optab optab, reduc_optab;
5358 tree new_temp = NULL_TREE;
5359 gimple *def_stmt;
5360 enum vect_def_type dt;
5361 gphi *new_phi = NULL;
5362 tree scalar_type;
5363 bool is_simple_use;
5364 gimple *orig_stmt;
5365 stmt_vec_info orig_stmt_info;
5366 tree expr = NULL_TREE;
5367 int i;
5368 int ncopies;
5369 int epilog_copies;
5370 stmt_vec_info prev_stmt_info, prev_phi_info;
5371 bool single_defuse_cycle = false;
5372 tree reduc_def = NULL_TREE;
5373 gimple *new_stmt = NULL;
5374 int j;
5375 tree ops[3];
5376 bool nested_cycle = false, found_nested_cycle_def = false;
5377 gimple *reduc_def_stmt = NULL;
5378 bool double_reduc = false, dummy;
5379 basic_block def_bb;
5380 struct loop * def_stmt_loop, *outer_loop = NULL;
5381 tree def_arg;
5382 gimple *def_arg_stmt;
5383 auto_vec<tree> vec_oprnds0;
5384 auto_vec<tree> vec_oprnds1;
5385 auto_vec<tree> vect_defs;
5386 auto_vec<gimple *> phis;
5387 int vec_num;
5388 tree def0, def1, tem, op0, op1 = NULL_TREE;
5389 bool first_p = true;
5390 tree cr_index_scalar_type = NULL_TREE, cr_index_vector_type = NULL_TREE;
5391 gimple *cond_expr_induction_def_stmt = NULL;
5393 /* In case of reduction chain we switch to the first stmt in the chain, but
5394 we don't update STMT_INFO, since only the last stmt is marked as reduction
5395 and has reduction properties. */
5396 if (GROUP_FIRST_ELEMENT (stmt_info)
5397 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
5399 stmt = GROUP_FIRST_ELEMENT (stmt_info);
5400 first_p = false;
5403 if (nested_in_vect_loop_p (loop, stmt))
5405 outer_loop = loop;
5406 loop = loop->inner;
5407 nested_cycle = true;
5410 /* 1. Is vectorizable reduction? */
5411 /* Not supportable if the reduction variable is used in the loop, unless
5412 it's a reduction chain. */
5413 if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer
5414 && !GROUP_FIRST_ELEMENT (stmt_info))
5415 return false;
5417 /* Reductions that are not used even in an enclosing outer-loop
5418 are expected to be "live" (used out of the loop). */
5419 if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope
5420 && !STMT_VINFO_LIVE_P (stmt_info))
5421 return false;
5423 /* Make sure it was already recognized as a reduction computation. */
5424 if (STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmt)) != vect_reduction_def
5425 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmt)) != vect_nested_cycle)
5426 return false;
5428 /* 2. Has this been recognized as a reduction pattern?
5430 Check if STMT represents a pattern that has been recognized
5431 in earlier analysis stages. For stmts that represent a pattern,
5432 the STMT_VINFO_RELATED_STMT field records the last stmt in
5433 the original sequence that constitutes the pattern. */
5435 orig_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt));
5436 if (orig_stmt)
5438 orig_stmt_info = vinfo_for_stmt (orig_stmt);
5439 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
5440 gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
5443 /* 3. Check the operands of the operation. The first operands are defined
5444 inside the loop body. The last operand is the reduction variable,
5445 which is defined by the loop-header-phi. */
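/* For example (illustration only), a plain sum reduction has the form

     sum_1 = _5 + sum_0;

   where _5 is defined by a statement inside the loop body and sum_0 is
   the result of the loop-header phi, so the reduction variable is the
   last operand.  */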
5447 gcc_assert (is_gimple_assign (stmt));
5449 /* Flatten RHS. */
5450 switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
5452 case GIMPLE_SINGLE_RHS:
5453 op_type = TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt));
5454 if (op_type == ternary_op)
5456 tree rhs = gimple_assign_rhs1 (stmt);
5457 ops[0] = TREE_OPERAND (rhs, 0);
5458 ops[1] = TREE_OPERAND (rhs, 1);
5459 ops[2] = TREE_OPERAND (rhs, 2);
5460 code = TREE_CODE (rhs);
5462 else
5463 return false;
5464 break;
5466 case GIMPLE_BINARY_RHS:
5467 code = gimple_assign_rhs_code (stmt);
5468 op_type = TREE_CODE_LENGTH (code);
5469 gcc_assert (op_type == binary_op);
5470 ops[0] = gimple_assign_rhs1 (stmt);
5471 ops[1] = gimple_assign_rhs2 (stmt);
5472 break;
5474 case GIMPLE_TERNARY_RHS:
5475 code = gimple_assign_rhs_code (stmt);
5476 op_type = TREE_CODE_LENGTH (code);
5477 gcc_assert (op_type == ternary_op);
5478 ops[0] = gimple_assign_rhs1 (stmt);
5479 ops[1] = gimple_assign_rhs2 (stmt);
5480 ops[2] = gimple_assign_rhs3 (stmt);
5481 break;
5483 case GIMPLE_UNARY_RHS:
5484 return false;
5486 default:
5487 gcc_unreachable ();
5489 /* The default is that the reduction variable is the last operand in the statement. */
5490 int reduc_index = op_type - 1;
5491 if (code == MINUS_EXPR)
5492 reduc_index = 0;
5494 if (code == COND_EXPR && slp_node)
5495 return false;
5497 scalar_dest = gimple_assign_lhs (stmt);
5498 scalar_type = TREE_TYPE (scalar_dest);
5499 if (!POINTER_TYPE_P (scalar_type) && !INTEGRAL_TYPE_P (scalar_type)
5500 && !SCALAR_FLOAT_TYPE_P (scalar_type))
5501 return false;
5503 /* Do not try to vectorize bit-precision reductions. */
5504 if ((TYPE_PRECISION (scalar_type)
5505 != GET_MODE_PRECISION (TYPE_MODE (scalar_type))))
5506 return false;
5508 /* All uses but the last are expected to be defined in the loop.
5509 The last use is the reduction variable. In case of nested cycle this
5510 assumption is not true: we use reduc_index to record the index of the
5511 reduction variable. */
5512 for (i = 0; i < op_type; i++)
5514 if (i == reduc_index)
5515 continue;
5517 /* The condition of COND_EXPR is checked in vectorizable_condition(). */
5518 if (i == 0 && code == COND_EXPR)
5519 continue;
5521 is_simple_use = vect_is_simple_use (ops[i], loop_vinfo,
5522 &def_stmt, &dt, &tem);
5523 if (!vectype_in)
5524 vectype_in = tem;
5525 gcc_assert (is_simple_use);
5527 if (dt != vect_internal_def
5528 && dt != vect_external_def
5529 && dt != vect_constant_def
5530 && dt != vect_induction_def
5531 && !(dt == vect_nested_cycle && nested_cycle))
5532 return false;
5534 if (dt == vect_nested_cycle)
5536 found_nested_cycle_def = true;
5537 reduc_def_stmt = def_stmt;
5538 reduc_index = i;
5541 if (i == 1 && code == COND_EXPR && dt == vect_induction_def)
5542 cond_expr_induction_def_stmt = def_stmt;
5545 is_simple_use = vect_is_simple_use (ops[reduc_index], loop_vinfo,
5546 &def_stmt, &dt, &tem);
5547 if (!vectype_in)
5548 vectype_in = tem;
5549 gcc_assert (is_simple_use);
5550 if (!found_nested_cycle_def)
5551 reduc_def_stmt = def_stmt;
5553 if (reduc_def_stmt && gimple_code (reduc_def_stmt) != GIMPLE_PHI)
5554 return false;
5556 if (!(dt == vect_reduction_def
5557 || dt == vect_nested_cycle
5558 || ((dt == vect_internal_def || dt == vect_external_def
5559 || dt == vect_constant_def || dt == vect_induction_def)
5560 && nested_cycle && found_nested_cycle_def)))
5562 /* For pattern recognized stmts, orig_stmt might be a reduction,
5563 but some helper statements for the pattern might not, or
5564 might be COND_EXPRs with reduction uses in the condition. */
5565 gcc_assert (orig_stmt);
5566 return false;
5569 enum vect_reduction_type v_reduc_type;
5570 gimple *tmp = vect_is_simple_reduction (loop_vinfo, reduc_def_stmt,
5571 !nested_cycle, &dummy, false,
5572 &v_reduc_type);
5574 /* If we have a condition reduction, see if we can simplify it further. */
5575 if (v_reduc_type == COND_REDUCTION
5576 && cond_expr_induction_def_stmt != NULL
5577 && is_nonwrapping_integer_induction (cond_expr_induction_def_stmt, loop))
5579 if (dump_enabled_p ())
5580 dump_printf_loc (MSG_NOTE, vect_location,
5581 "condition expression based on integer induction.\n");
5582 STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) = INTEGER_INDUC_COND_REDUCTION;
5584 else
5585 STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) = v_reduc_type;
5587 if (orig_stmt)
5588 gcc_assert (tmp == orig_stmt
5589 || GROUP_FIRST_ELEMENT (vinfo_for_stmt (tmp)) == orig_stmt);
5590 else
5591 /* We changed STMT to be the first stmt in reduction chain, hence we
5592 check that in this case the first element in the chain is STMT. */
5593 gcc_assert (stmt == tmp
5594 || GROUP_FIRST_ELEMENT (vinfo_for_stmt (tmp)) == stmt);
5596 if (STMT_VINFO_LIVE_P (vinfo_for_stmt (reduc_def_stmt)))
5597 return false;
5599 if (slp_node || PURE_SLP_STMT (stmt_info))
5600 ncopies = 1;
5601 else
5602 ncopies = (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
5603 / TYPE_VECTOR_SUBPARTS (vectype_in));
5605 gcc_assert (ncopies >= 1);
5607 vec_mode = TYPE_MODE (vectype_in);
5609 if (code == COND_EXPR)
5611 /* Only call during the analysis stage, otherwise we'll lose
5612 STMT_VINFO_TYPE. */
5613 if (!vec_stmt && !vectorizable_condition (stmt, gsi, NULL,
5614 ops[reduc_index], 0, NULL))
5616 if (dump_enabled_p ())
5617 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5618 "unsupported condition in reduction\n");
5619 return false;
5622 else
5624 /* 4. Supportable by target? */
5626 if (code == LSHIFT_EXPR || code == RSHIFT_EXPR
5627 || code == LROTATE_EXPR || code == RROTATE_EXPR)
5629 /* Shifts and rotates are only supported by vectorizable_shift,
5630 not vectorizable_reduction. */
5631 if (dump_enabled_p ())
5632 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5633 "unsupported shift or rotation.\n");
5634 return false;
5637 /* 4.1. check support for the operation in the loop */
5638 optab = optab_for_tree_code (code, vectype_in, optab_default);
5639 if (!optab)
5641 if (dump_enabled_p ())
5642 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5643 "no optab.\n");
5645 return false;
5648 if (optab_handler (optab, vec_mode) == CODE_FOR_nothing)
5650 if (dump_enabled_p ())
5651 dump_printf (MSG_NOTE, "op not supported by target.\n");
5653 if (GET_MODE_SIZE (vec_mode) != UNITS_PER_WORD
5654 || LOOP_VINFO_VECT_FACTOR (loop_vinfo)
5655 < vect_min_worthwhile_factor (code))
5656 return false;
5658 if (dump_enabled_p ())
5659 dump_printf (MSG_NOTE, "proceeding using word mode.\n");
5662 /* Worthwhile without SIMD support? */
5663 if (!VECTOR_MODE_P (TYPE_MODE (vectype_in))
5664 && LOOP_VINFO_VECT_FACTOR (loop_vinfo)
5665 < vect_min_worthwhile_factor (code))
5667 if (dump_enabled_p ())
5668 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5669 "not worthwhile without SIMD support.\n");
5671 return false;
5675 /* 4.2. Check support for the epilog operation.
5677 If STMT represents a reduction pattern, then the type of the
5678 reduction variable may be different than the type of the rest
5679 of the arguments. For example, consider the case of accumulation
5680 of shorts into an int accumulator; the original code:
5681 S1: int_a = (int) short_a;
5682 orig_stmt-> S2: int_acc = plus <int_a ,int_acc>;
5684 was replaced with:
5685 STMT: int_acc = widen_sum <short_a, int_acc>
5687 This means that:
5688 1. The tree-code that is used to create the vector operation in the
5689 epilog code (that reduces the partial results) is not the
5690 tree-code of STMT, but is rather the tree-code of the original
5691 stmt from the pattern that STMT is replacing. I.e., in the example
5692 above we want to use 'widen_sum' in the loop, but 'plus' in the
5693 epilog.
5694 2. The type (mode) we use to check available target support
5695 for the vector operation to be created in the *epilog*, is
5696 determined by the type of the reduction variable (in the example
5697 above we'd check this: optab_handler (plus_optab, vect_int_mode)).
5698 However the type (mode) we use to check available target support
5699 for the vector operation to be created *inside the loop*, is
5700 determined by the type of the other arguments to STMT (in the
5701 example we'd check this: optab_handler (widen_sum_optab,
5702 vect_short_mode)).
5704 This is contrary to "regular" reductions, in which the types of all
5705 the arguments are the same as the type of the reduction variable.
5706 For "regular" reductions we can therefore use the same vector type
5707 (and also the same tree-code) when generating the epilog code and
5708 when generating the code inside the loop. */
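/* A rough sketch of the two support checks described above (conceptual
   pseudo-code, the operation names are only illustrative):

     loop body:  v4si_acc = WIDEN_SUM (v8hi_chunk, v4si_acc);
                 (support checked on the mode of vectype_in, V8HI)
     epilogue:   int_result = REDUC_PLUS (v4si_acc);
                 (support checked on the mode of vectype_out, V4SI)

   Only the epilog uses ORIG_CODE (PLUS_EXPR in the example); the loop
   itself keeps the code of STMT.  */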
5710 if (orig_stmt)
5712 /* This is a reduction pattern: get the vectype from the type of the
5713 reduction variable, and get the tree-code from orig_stmt. */
5714 gcc_assert (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5715 == TREE_CODE_REDUCTION);
5716 orig_code = gimple_assign_rhs_code (orig_stmt);
5717 gcc_assert (vectype_out);
5718 vec_mode = TYPE_MODE (vectype_out);
5720 else
5722 /* Regular reduction: the same vectype and tree-code that are used for
5723 the vector code inside the loop can also be used for the epilog code. */
5724 orig_code = code;
5726 if (code == MINUS_EXPR)
5727 orig_code = PLUS_EXPR;
5729 /* For simple condition reductions, replace with the actual expression
5730 we want to base our reduction around. */
5731 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5732 == INTEGER_INDUC_COND_REDUCTION)
5733 orig_code = MAX_EXPR;
5736 if (nested_cycle)
5738 def_bb = gimple_bb (reduc_def_stmt);
5739 def_stmt_loop = def_bb->loop_father;
5740 def_arg = PHI_ARG_DEF_FROM_EDGE (reduc_def_stmt,
5741 loop_preheader_edge (def_stmt_loop));
5742 if (TREE_CODE (def_arg) == SSA_NAME
5743 && (def_arg_stmt = SSA_NAME_DEF_STMT (def_arg))
5744 && gimple_code (def_arg_stmt) == GIMPLE_PHI
5745 && flow_bb_inside_loop_p (outer_loop, gimple_bb (def_arg_stmt))
5746 && vinfo_for_stmt (def_arg_stmt)
5747 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_arg_stmt))
5748 == vect_double_reduction_def)
5749 double_reduc = true;
5752 epilog_reduc_code = ERROR_MARK;
5754 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == TREE_CODE_REDUCTION
5755 || STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5756 == INTEGER_INDUC_COND_REDUCTION)
5758 if (reduction_code_for_scalar_code (orig_code, &epilog_reduc_code))
5760 reduc_optab = optab_for_tree_code (epilog_reduc_code, vectype_out,
5761 optab_default);
5762 if (!reduc_optab)
5764 if (dump_enabled_p ())
5765 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5766 "no optab for reduction.\n");
5768 epilog_reduc_code = ERROR_MARK;
5770 else if (optab_handler (reduc_optab, vec_mode) == CODE_FOR_nothing)
5772 if (dump_enabled_p ())
5773 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5774 "reduc op not supported by target.\n");
5776 epilog_reduc_code = ERROR_MARK;
5779 /* When epilog_reduc_code is ERROR_MARK then a reduction will be
5780 generated in the epilog using multiple expressions. This does not
5781 work for condition reductions. */
5782 if (epilog_reduc_code == ERROR_MARK
5783 && STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5784 == INTEGER_INDUC_COND_REDUCTION)
5786 if (dump_enabled_p ())
5787 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5788 "no reduc code for scalar code.\n");
5789 return false;
5792 else
5794 if (!nested_cycle || double_reduc)
5796 if (dump_enabled_p ())
5797 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5798 "no reduc code for scalar code.\n");
5800 return false;
5804 else
5806 int scalar_precision = GET_MODE_PRECISION (TYPE_MODE (scalar_type));
5807 cr_index_scalar_type = make_unsigned_type (scalar_precision);
5808 cr_index_vector_type = build_vector_type
5809 (cr_index_scalar_type, TYPE_VECTOR_SUBPARTS (vectype_out));
5811 epilog_reduc_code = REDUC_MAX_EXPR;
5812 optab = optab_for_tree_code (REDUC_MAX_EXPR, cr_index_vector_type,
5813 optab_default);
5814 if (optab_handler (optab, TYPE_MODE (cr_index_vector_type))
5815 == CODE_FOR_nothing)
5817 if (dump_enabled_p ())
5818 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5819 "reduc max op not supported by target.\n");
5820 return false;
5824 if ((double_reduc
5825 || STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION
5826 || STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5827 == INTEGER_INDUC_COND_REDUCTION)
5828 && ncopies > 1)
5830 if (dump_enabled_p ())
5831 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5832 "multiple types in double reduction or condition "
5833 "reduction.\n");
5834 return false;
5837 /* In case of widening multiplication by a constant, we update the type
5838 of the constant to be the type of the other operand. We check that the
5839 constant fits the type in the pattern recognition pass. */
5840 if (code == DOT_PROD_EXPR
5841 && !types_compatible_p (TREE_TYPE (ops[0]), TREE_TYPE (ops[1])))
5843 if (TREE_CODE (ops[0]) == INTEGER_CST)
5844 ops[0] = fold_convert (TREE_TYPE (ops[1]), ops[0]);
5845 else if (TREE_CODE (ops[1]) == INTEGER_CST)
5846 ops[1] = fold_convert (TREE_TYPE (ops[0]), ops[1]);
5847 else
5849 if (dump_enabled_p ())
5850 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5851 "invalid types in dot-prod\n");
5853 return false;
5857 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION)
5859 widest_int ni;
5861 if (! max_loop_iterations (loop, &ni))
5863 if (dump_enabled_p ())
5864 dump_printf_loc (MSG_NOTE, vect_location,
5865 "loop count not known, cannot create cond "
5866 "reduction.\n");
5867 return false;
5869 /* Convert backedges to iterations. */
5870 ni += 1;
5872 /* The additional index will have the same type as the condition. Check
5873 that the loop iteration count fits into this type less one (because the
5874 zero value is reserved for lanes in which no match occurred).  */
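/* For illustration (hypothetical numbers): if CR_INDEX_SCALAR_TYPE were an
   8-bit unsigned type, MAX_INDEX would be 255 and the loop could run at
   most 254 iterations, since 0 is reserved for "no match" and the matched
   iterations are numbered 1, 2, ....  */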
5875 tree max_index = TYPE_MAX_VALUE (cr_index_scalar_type);
5876 if (wi::geu_p (ni, wi::to_widest (max_index)))
5878 if (dump_enabled_p ())
5879 dump_printf_loc (MSG_NOTE, vect_location,
5880 "loop size is greater than data size.\n");
5881 return false;
5885 if (!vec_stmt) /* transformation not required. */
5887 if (first_p
5888 && !vect_model_reduction_cost (stmt_info, epilog_reduc_code, ncopies,
5889 reduc_index))
5890 return false;
5891 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
5892 return true;
5895 /** Transform. **/
5897 if (dump_enabled_p ())
5898 dump_printf_loc (MSG_NOTE, vect_location, "transform reduction.\n");
5900 /* FORNOW: Multiple types are not supported for condition. */
5901 if (code == COND_EXPR)
5902 gcc_assert (ncopies == 1);
5904 /* Create the destination vector */
5905 vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
5907 /* In case the vectorization factor (VF) is bigger than the number
5908 of elements that we can fit in a vectype (nunits), we have to generate
5909 more than one vector stmt - i.e., we need to "unroll" the
5910 vector stmt by a factor VF/nunits. For more details see documentation
5911 in vectorizable_operation. */
5913 /* If the reduction is used in an outer loop we need to generate
5914 VF intermediate results, like so (e.g. for ncopies=2):
5915 r0 = phi (init, r0)
5916 r1 = phi (init, r1)
5917 r0 = x0 + r0;
5918 r1 = x1 + r1;
5919 (i.e. we generate VF results in 2 registers).
5920 In this case we have a separate def-use cycle for each copy, and therefore
5921 for each copy we get the vector def for the reduction variable from the
5922 respective phi node created for this copy.
5924 Otherwise (the reduction is unused in the loop nest), we can combine
5925 together intermediate results, like so (e.g. for ncopies=2):
5926 r = phi (init, r)
5927 r = x0 + r;
5928 r = x1 + r;
5929 (i.e. we generate VF/2 results in a single register).
5930 In this case for each copy we get the vector def for the reduction variable
5931 from the vectorized reduction operation generated in the previous iteration.
5934 if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope)
5936 single_defuse_cycle = true;
5937 epilog_copies = 1;
5939 else
5940 epilog_copies = ncopies;
5942 prev_stmt_info = NULL;
5943 prev_phi_info = NULL;
5944 if (slp_node)
5945 vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
5946 else
5948 vec_num = 1;
5949 vec_oprnds0.create (1);
5950 if (op_type == ternary_op)
5951 vec_oprnds1.create (1);
5954 phis.create (vec_num);
5955 vect_defs.create (vec_num);
5956 if (!slp_node)
5957 vect_defs.quick_push (NULL_TREE);
5959 for (j = 0; j < ncopies; j++)
5961 if (j == 0 || !single_defuse_cycle)
5963 for (i = 0; i < vec_num; i++)
5965 /* Create the reduction-phi that defines the reduction
5966 operand. */
5967 new_phi = create_phi_node (vec_dest, loop->header);
5968 set_vinfo_for_stmt (new_phi,
5969 new_stmt_vec_info (new_phi, loop_vinfo));
5970 if (j == 0 || slp_node)
5971 phis.quick_push (new_phi);
5975 if (code == COND_EXPR)
5977 gcc_assert (!slp_node);
5978 vectorizable_condition (stmt, gsi, vec_stmt,
5979 PHI_RESULT (phis[0]),
5980 reduc_index, NULL);
5981 /* Multiple types are not supported for condition. */
5982 break;
5985 /* Handle uses. */
5986 if (j == 0)
5988 op0 = ops[!reduc_index];
5989 if (op_type == ternary_op)
5991 if (reduc_index == 0)
5992 op1 = ops[2];
5993 else
5994 op1 = ops[1];
5997 if (slp_node)
5998 vect_get_vec_defs (op0, op1, stmt, &vec_oprnds0, &vec_oprnds1,
5999 slp_node, -1);
6000 else
6002 loop_vec_def0 = vect_get_vec_def_for_operand (ops[!reduc_index],
6003 stmt);
6004 vec_oprnds0.quick_push (loop_vec_def0);
6005 if (op_type == ternary_op)
6007 loop_vec_def1 = vect_get_vec_def_for_operand (op1, stmt);
6008 vec_oprnds1.quick_push (loop_vec_def1);
6012 else
6014 if (!slp_node)
6016 enum vect_def_type dt;
6017 gimple *dummy_stmt;
6019 vect_is_simple_use (ops[!reduc_index], loop_vinfo,
6020 &dummy_stmt, &dt);
6021 loop_vec_def0 = vect_get_vec_def_for_stmt_copy (dt,
6022 loop_vec_def0);
6023 vec_oprnds0[0] = loop_vec_def0;
6024 if (op_type == ternary_op)
6026 vect_is_simple_use (op1, loop_vinfo, &dummy_stmt, &dt);
6027 loop_vec_def1 = vect_get_vec_def_for_stmt_copy (dt,
6028 loop_vec_def1);
6029 vec_oprnds1[0] = loop_vec_def1;
6033 if (single_defuse_cycle)
6034 reduc_def = gimple_assign_lhs (new_stmt);
6036 STMT_VINFO_RELATED_STMT (prev_phi_info) = new_phi;
6039 FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
6041 if (slp_node)
6042 reduc_def = PHI_RESULT (phis[i]);
6043 else
6045 if (!single_defuse_cycle || j == 0)
6046 reduc_def = PHI_RESULT (new_phi);
6049 def1 = ((op_type == ternary_op)
6050 ? vec_oprnds1[i] : NULL);
6051 if (op_type == binary_op)
6053 if (reduc_index == 0)
6054 expr = build2 (code, vectype_out, reduc_def, def0);
6055 else
6056 expr = build2 (code, vectype_out, def0, reduc_def);
6058 else
6060 if (reduc_index == 0)
6061 expr = build3 (code, vectype_out, reduc_def, def0, def1);
6062 else
6064 if (reduc_index == 1)
6065 expr = build3 (code, vectype_out, def0, reduc_def, def1);
6066 else
6067 expr = build3 (code, vectype_out, def0, def1, reduc_def);
6071 new_stmt = gimple_build_assign (vec_dest, expr);
6072 new_temp = make_ssa_name (vec_dest, new_stmt);
6073 gimple_assign_set_lhs (new_stmt, new_temp);
6074 vect_finish_stmt_generation (stmt, new_stmt, gsi);
6076 if (slp_node)
6078 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
6079 vect_defs.quick_push (new_temp);
6081 else
6082 vect_defs[0] = new_temp;
6085 if (slp_node)
6086 continue;
6088 if (j == 0)
6089 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
6090 else
6091 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
6093 prev_stmt_info = vinfo_for_stmt (new_stmt);
6094 prev_phi_info = vinfo_for_stmt (new_phi);
6097 tree indx_before_incr, indx_after_incr, cond_name = NULL;
6099 /* Finalize the reduction-phi (set its arguments) and create the
6100 epilog reduction code. */
6101 if ((!single_defuse_cycle || code == COND_EXPR) && !slp_node)
6103 new_temp = gimple_assign_lhs (*vec_stmt);
6104 vect_defs[0] = new_temp;
6106 /* For cond reductions we want to create a new vector (INDEX_COND_EXPR)
6107 which is updated with the current index of the loop for every match of
6108 the original loop's cond_expr (VEC_STMT). This results in a vector
6109 containing the last time the condition passed for that vector lane.
6110 The first match will be a 1 to allow 0 to be used for non-matching
6111 indexes. If there are no matches at all then the vector will be all
6112 zeroes. */
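/* A small illustration (made-up values, 4 lanes): with the scalar condition
   a[i] < value and indexes starting at 1, the index vector might evolve as

     after vector iteration 1:  { 1, 0, 3,  0 }   matches at i = 0 and i = 2
     after vector iteration 2:  { 1, 6, 3,  0 }   new match at i = 5
     after vector iteration 3:  { 1, 6, 11, 12 }  new matches at i = 10, 11

   so the epilogue's maximum (12 here) identifies the lane holding the last
   match, and an all-zero vector means the condition never held.  */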
6113 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION)
6115 int nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
6116 int k;
6118 gcc_assert (gimple_assign_rhs_code (*vec_stmt) == VEC_COND_EXPR);
6120 /* First we create a simple vector induction variable which starts
6121 with the values {1,2,3,...} (SERIES_VECT) and increments by the
6122 vector size (STEP). */
6124 /* Create a {1,2,3,...} vector. */
6125 tree *vtemp = XALLOCAVEC (tree, nunits_out);
6126 for (k = 0; k < nunits_out; ++k)
6127 vtemp[k] = build_int_cst (cr_index_scalar_type, k + 1);
6128 tree series_vect = build_vector (cr_index_vector_type, vtemp);
6130 /* Create a vector of the step value. */
6131 tree step = build_int_cst (cr_index_scalar_type, nunits_out);
6132 tree vec_step = build_vector_from_val (cr_index_vector_type, step);
6134 /* Create an induction variable. */
6135 gimple_stmt_iterator incr_gsi;
6136 bool insert_after;
6137 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
6138 create_iv (series_vect, vec_step, NULL_TREE, loop, &incr_gsi,
6139 insert_after, &indx_before_incr, &indx_after_incr);
6141 /* Next create a new phi node vector (NEW_PHI_TREE) which starts
6142 filled with zeros (VEC_ZERO). */
6144 /* Create a vector of 0s. */
6145 tree zero = build_zero_cst (cr_index_scalar_type);
6146 tree vec_zero = build_vector_from_val (cr_index_vector_type, zero);
6148 /* Create a vector phi node. */
6149 tree new_phi_tree = make_ssa_name (cr_index_vector_type);
6150 new_phi = create_phi_node (new_phi_tree, loop->header);
6151 set_vinfo_for_stmt (new_phi,
6152 new_stmt_vec_info (new_phi, loop_vinfo));
6153 add_phi_arg (new_phi, vec_zero, loop_preheader_edge (loop),
6154 UNKNOWN_LOCATION);
6156 /* Now take the condition from the loop's original cond_expr
6157 (VEC_STMT) and produce a new cond_expr (INDEX_COND_EXPR) which for
6158 every match uses values from the induction variable
6159 (INDEX_BEFORE_INCR) otherwise uses values from the phi node
6160 (NEW_PHI_TREE).
6161 Finally, we update the phi (NEW_PHI_TREE) to take the value of
6162 the new cond_expr (INDEX_COND_EXPR). */
6164 /* Turn the condition from vec_stmt into an ssa name. */
6165 gimple_stmt_iterator vec_stmt_gsi = gsi_for_stmt (*vec_stmt);
6166 tree ccompare = gimple_assign_rhs1 (*vec_stmt);
6167 tree ccompare_name = make_ssa_name (TREE_TYPE (ccompare));
6168 gimple *ccompare_stmt = gimple_build_assign (ccompare_name,
6169 ccompare);
6170 gsi_insert_before (&vec_stmt_gsi, ccompare_stmt, GSI_SAME_STMT);
6171 gimple_assign_set_rhs1 (*vec_stmt, ccompare_name);
6172 update_stmt (*vec_stmt);
6174 /* Create a conditional, where the condition is taken from vec_stmt
6175 (CCOMPARE_NAME), the then-value is the induction index
6176 (INDEX_BEFORE_INCR) and the else-value is the phi (NEW_PHI_TREE). */
6177 tree index_cond_expr = build3 (VEC_COND_EXPR, cr_index_vector_type,
6178 ccompare_name, indx_before_incr,
6179 new_phi_tree);
6180 cond_name = make_ssa_name (cr_index_vector_type);
6181 gimple *index_condition = gimple_build_assign (cond_name,
6182 index_cond_expr);
6183 gsi_insert_before (&incr_gsi, index_condition, GSI_SAME_STMT);
6184 stmt_vec_info index_vec_info = new_stmt_vec_info (index_condition,
6185 loop_vinfo);
6186 STMT_VINFO_VECTYPE (index_vec_info) = cr_index_vector_type;
6187 set_vinfo_for_stmt (index_condition, index_vec_info);
6189 /* Update the phi with the vec cond. */
6190 add_phi_arg (new_phi, cond_name, loop_latch_edge (loop),
6191 UNKNOWN_LOCATION);
6195 vect_create_epilog_for_reduction (vect_defs, stmt, epilog_copies,
6196 epilog_reduc_code, phis, reduc_index,
6197 double_reduc, slp_node, cond_name);
6199 return true;
6202 /* Function vect_min_worthwhile_factor.
6204 For a loop where we could vectorize the operation indicated by CODE,
6205 return the minimum vectorization factor that makes it worthwhile
6206 to use generic vectors. */
6208 vect_min_worthwhile_factor (enum tree_code code)
6210 switch (code)
6212 case PLUS_EXPR:
6213 case MINUS_EXPR:
6214 case NEGATE_EXPR:
6215 return 4;
6217 case BIT_AND_EXPR:
6218 case BIT_IOR_EXPR:
6219 case BIT_XOR_EXPR:
6220 case BIT_NOT_EXPR:
6221 return 2;
6223 default:
6224 return INT_MAX;
6229 /* Function vectorizable_induction
6231 Check if PHI performs an induction computation that can be vectorized.
6232 If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
6233 phi to replace it, put it in VEC_STMT, and add it to the same basic block.
6234 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
6236 bool
6237 vectorizable_induction (gimple *phi,
6238 gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
6239 gimple **vec_stmt)
6241 stmt_vec_info stmt_info = vinfo_for_stmt (phi);
6242 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6243 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
6244 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6245 int nunits = TYPE_VECTOR_SUBPARTS (vectype);
6246 int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
6247 tree vec_def;
6249 gcc_assert (ncopies >= 1);
6250 /* FORNOW. These restrictions should be relaxed. */
6251 if (nested_in_vect_loop_p (loop, phi))
6253 imm_use_iterator imm_iter;
6254 use_operand_p use_p;
6255 gimple *exit_phi;
6256 edge latch_e;
6257 tree loop_arg;
6259 if (ncopies > 1)
6261 if (dump_enabled_p ())
6262 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6263 "multiple types in nested loop.\n");
6264 return false;
6267 exit_phi = NULL;
6268 latch_e = loop_latch_edge (loop->inner);
6269 loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
6270 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
6272 gimple *use_stmt = USE_STMT (use_p);
6273 if (is_gimple_debug (use_stmt))
6274 continue;
6276 if (!flow_bb_inside_loop_p (loop->inner, gimple_bb (use_stmt)))
6278 exit_phi = use_stmt;
6279 break;
6282 if (exit_phi)
6284 stmt_vec_info exit_phi_vinfo = vinfo_for_stmt (exit_phi);
6285 if (!(STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
6286 && !STMT_VINFO_LIVE_P (exit_phi_vinfo)))
6288 if (dump_enabled_p ())
6289 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6290 "inner-loop induction only used outside "
6291 "of the outer vectorized loop.\n");
6292 return false;
6297 if (!STMT_VINFO_RELEVANT_P (stmt_info))
6298 return false;
6300 /* FORNOW: SLP not supported. */
6301 if (STMT_SLP_TYPE (stmt_info))
6302 return false;
6304 gcc_assert (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def);
6306 if (gimple_code (phi) != GIMPLE_PHI)
6307 return false;
6309 if (!vec_stmt) /* transformation not required. */
6311 STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
6312 if (dump_enabled_p ())
6313 dump_printf_loc (MSG_NOTE, vect_location,
6314 "=== vectorizable_induction ===\n");
6315 vect_model_induction_cost (stmt_info, ncopies);
6316 return true;
6319 /** Transform. **/
6321 if (dump_enabled_p ())
6322 dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n");
6324 vec_def = get_initial_def_for_induction (phi);
6325 *vec_stmt = SSA_NAME_DEF_STMT (vec_def);
6326 return true;
6329 /* Function vectorizable_live_operation.
6331 STMT computes a value that is used outside the loop. Check if
6332 it can be supported. */
6334 bool
6335 vectorizable_live_operation (gimple *stmt,
6336 gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
6337 gimple **vec_stmt)
6339 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
6340 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
6341 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6342 tree op;
6343 gimple *def_stmt;
6344 ssa_op_iter iter;
6346 gcc_assert (STMT_VINFO_LIVE_P (stmt_info));
6348 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def)
6349 return false;
6351 if (!is_gimple_assign (stmt))
6353 if (gimple_call_internal_p (stmt)
6354 && gimple_call_internal_fn (stmt) == IFN_GOMP_SIMD_LANE
6355 && gimple_call_lhs (stmt)
6356 && loop->simduid
6357 && TREE_CODE (gimple_call_arg (stmt, 0)) == SSA_NAME
6358 && loop->simduid
6359 == SSA_NAME_VAR (gimple_call_arg (stmt, 0)))
6361 edge e = single_exit (loop);
6362 basic_block merge_bb = e->dest;
6363 imm_use_iterator imm_iter;
6364 use_operand_p use_p;
6365 tree lhs = gimple_call_lhs (stmt);
6367 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
6369 gimple *use_stmt = USE_STMT (use_p);
6370 if (gimple_code (use_stmt) == GIMPLE_PHI
6371 && gimple_bb (use_stmt) == merge_bb)
6373 if (vec_stmt)
6375 tree vfm1
6376 = build_int_cst (unsigned_type_node,
6377 loop_vinfo->vectorization_factor - 1);
6378 SET_PHI_ARG_DEF (use_stmt, e->dest_idx, vfm1);
6380 return true;
6385 return false;
6388 if (TREE_CODE (gimple_assign_lhs (stmt)) != SSA_NAME)
6389 return false;
6391 /* FORNOW. CHECKME. */
6392 if (nested_in_vect_loop_p (loop, stmt))
6393 return false;
6395 /* FORNOW: support only if all uses are invariant. This means
6396 that the scalar operations can remain in place, unvectorized.
6397 The original last scalar value that they compute will be used. */
6398 FOR_EACH_SSA_TREE_OPERAND (op, stmt, iter, SSA_OP_USE)
6400 enum vect_def_type dt = vect_uninitialized_def;
6402 if (!vect_is_simple_use (op, loop_vinfo, &def_stmt, &dt))
6404 if (dump_enabled_p ())
6405 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6406 "use not simple.\n");
6407 return false;
6410 if (dt != vect_external_def && dt != vect_constant_def)
6411 return false;
6414 /* No transformation is required for the cases we currently support. */
6415 return true;
6418 /* Kill any debug uses outside LOOP of SSA names defined in STMT. */
6420 static void
6421 vect_loop_kill_debug_uses (struct loop *loop, gimple *stmt)
6423 ssa_op_iter op_iter;
6424 imm_use_iterator imm_iter;
6425 def_operand_p def_p;
6426 gimple *ustmt;
6428 FOR_EACH_PHI_OR_STMT_DEF (def_p, stmt, op_iter, SSA_OP_DEF)
6430 FOR_EACH_IMM_USE_STMT (ustmt, imm_iter, DEF_FROM_PTR (def_p))
6432 basic_block bb;
6434 if (!is_gimple_debug (ustmt))
6435 continue;
6437 bb = gimple_bb (ustmt);
6439 if (!flow_bb_inside_loop_p (loop, bb))
6441 if (gimple_debug_bind_p (ustmt))
6443 if (dump_enabled_p ())
6444 dump_printf_loc (MSG_NOTE, vect_location,
6445 "killing debug use\n");
6447 gimple_debug_bind_reset_value (ustmt);
6448 update_stmt (ustmt);
6450 else
6451 gcc_unreachable ();
6458 /* This function builds ni_name = number of iterations. Statements
6459 are emitted on the loop preheader edge. */
6461 static tree
6462 vect_build_loop_niters (loop_vec_info loop_vinfo)
6464 tree ni = unshare_expr (LOOP_VINFO_NITERS (loop_vinfo));
6465 if (TREE_CODE (ni) == INTEGER_CST)
6466 return ni;
6467 else
6469 tree ni_name, var;
6470 gimple_seq stmts = NULL;
6471 edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
6473 var = create_tmp_var (TREE_TYPE (ni), "niters");
6474 ni_name = force_gimple_operand (ni, &stmts, false, var);
6475 if (stmts)
6476 gsi_insert_seq_on_edge_immediate (pe, stmts);
6478 return ni_name;
6483 /* This function generates the following statements:
6485 ni_name = number of iterations loop executes
6486 ratio = ni_name / vf
6487 ratio_mult_vf_name = ratio * vf
6489 and places them on the loop preheader edge. */
6491 static void
6492 vect_generate_tmps_on_preheader (loop_vec_info loop_vinfo,
6493 tree ni_name,
6494 tree *ratio_mult_vf_name_ptr,
6495 tree *ratio_name_ptr)
6497 tree ni_minus_gap_name;
6498 tree var;
6499 tree ratio_name;
6500 tree ratio_mult_vf_name;
6501 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
6502 edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
6503 tree log_vf;
6505 log_vf = build_int_cst (TREE_TYPE (ni_name), exact_log2 (vf));
6507 /* If epilogue loop is required because of data accesses with gaps, we
6508 subtract one iteration from the total number of iterations here for
6509 correct calculation of RATIO. */
6510 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
6512 ni_minus_gap_name = fold_build2 (MINUS_EXPR, TREE_TYPE (ni_name),
6513 ni_name,
6514 build_one_cst (TREE_TYPE (ni_name)));
6515 if (!is_gimple_val (ni_minus_gap_name))
6517 var = create_tmp_var (TREE_TYPE (ni_name), "ni_gap");
6518 gimple *stmts = NULL;
6519 ni_minus_gap_name = force_gimple_operand (ni_minus_gap_name, &stmts,
6520 true, var);
6521 gsi_insert_seq_on_edge_immediate (pe, stmts);
6524 else
6525 ni_minus_gap_name = ni_name;
6527 /* Create: ratio = ni >> log2(vf) */
6528 /* ??? As we have ni == number of latch executions + 1, ni could
6529 have overflown to zero. So avoid computing ratio based on ni
6530 but compute it using the fact that we know ratio will be at least
6531 one, thus via (ni - vf) >> log2(vf) + 1. */
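/* A worked example of the formula above (illustrative numbers): with vf = 4,
   ni = 10 gives ((10 - 4) >> 2) + 1 = 2, and ni = 4 gives ((4 - 4) >> 2) + 1
   = 1, both equal to ni / vf rounded down.  If the real iteration count is
   256 but ni wrapped to 0 in an unsigned 8-bit type, (0 - 4) is 252 and
   (252 >> 2) + 1 = 64 = 256 / 4, which is why the subtraction-based form is
   used rather than ni >> log2(vf).  */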
6532 ratio_name
6533 = fold_build2 (PLUS_EXPR, TREE_TYPE (ni_name),
6534 fold_build2 (RSHIFT_EXPR, TREE_TYPE (ni_name),
6535 fold_build2 (MINUS_EXPR, TREE_TYPE (ni_name),
6536 ni_minus_gap_name,
6537 build_int_cst
6538 (TREE_TYPE (ni_name), vf)),
6539 log_vf),
6540 build_int_cst (TREE_TYPE (ni_name), 1));
6541 if (!is_gimple_val (ratio_name))
6543 var = create_tmp_var (TREE_TYPE (ni_name), "bnd");
6544 gimple *stmts = NULL;
6545 ratio_name = force_gimple_operand (ratio_name, &stmts, true, var);
6546 gsi_insert_seq_on_edge_immediate (pe, stmts);
6548 *ratio_name_ptr = ratio_name;
6550 /* Create: ratio_mult_vf = ratio << log2 (vf). */
6552 if (ratio_mult_vf_name_ptr)
6554 ratio_mult_vf_name = fold_build2 (LSHIFT_EXPR, TREE_TYPE (ratio_name),
6555 ratio_name, log_vf);
6556 if (!is_gimple_val (ratio_mult_vf_name))
6558 var = create_tmp_var (TREE_TYPE (ni_name), "ratio_mult_vf");
6559 gimple *stmts = NULL;
6560 ratio_mult_vf_name = force_gimple_operand (ratio_mult_vf_name, &stmts,
6561 true, var);
6562 gsi_insert_seq_on_edge_immediate (pe, stmts);
6564 *ratio_mult_vf_name_ptr = ratio_mult_vf_name;
6567 return;
6571 /* Function vect_transform_loop.
6573 The analysis phase has determined that the loop is vectorizable.
6574 Vectorize the loop - create vectorized stmts to replace the scalar
6575 stmts in the loop, and update the loop exit condition. */
6577 void
6578 vect_transform_loop (loop_vec_info loop_vinfo)
6580 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6581 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
6582 int nbbs = loop->num_nodes;
6583 int i;
6584 tree ratio = NULL;
6585 int vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
6586 bool grouped_store;
6587 bool slp_scheduled = false;
6588 gimple *stmt, *pattern_stmt;
6589 gimple_seq pattern_def_seq = NULL;
6590 gimple_stmt_iterator pattern_def_si = gsi_none ();
6591 bool transform_pattern_stmt = false;
6592 bool check_profitability = false;
6593 int th;
6594 /* Record number of iterations before we started tampering with the profile. */
6595 gcov_type expected_iterations = expected_loop_iterations_unbounded (loop);
6597 if (dump_enabled_p ())
6598 dump_printf_loc (MSG_NOTE, vect_location, "=== vec_transform_loop ===\n");
6600 /* If the profile is imprecise, we have a chance to fix it up. */
6601 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
6602 expected_iterations = LOOP_VINFO_INT_NITERS (loop_vinfo);
6604 /* Use the more conservative vectorization threshold. If the number
6605 of iterations is constant, assume the cost check has been performed
6606 by our caller. If the threshold makes all loops profitable that
6607 run at least the vectorization factor number of times, checking
6608 is pointless, too. */
6609 th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
6610 if (th >= LOOP_VINFO_VECT_FACTOR (loop_vinfo) - 1
6611 && !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
6613 if (dump_enabled_p ())
6614 dump_printf_loc (MSG_NOTE, vect_location,
6615 "Profitability threshold is %d loop iterations.\n",
6616 th);
6617 check_profitability = true;
6620 /* Version the loop first, if required, so the profitability check
6621 comes first. */
6623 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
6624 || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
6626 vect_loop_versioning (loop_vinfo, th, check_profitability);
6627 check_profitability = false;
6630 tree ni_name = vect_build_loop_niters (loop_vinfo);
6631 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = ni_name;
6633 /* Peel the loop if there are data refs with unknown alignment.
6634 Only one data ref with unknown store is allowed. */
6636 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo))
6638 vect_do_peeling_for_alignment (loop_vinfo, ni_name,
6639 th, check_profitability);
6640 check_profitability = false;
6641 /* The above adjusts LOOP_VINFO_NITERS, so ni_name has to
6642 be re-computed. */
6643 ni_name = NULL_TREE;
6646 /* If the loop has a symbolic number of iterations 'n' (i.e. it's not a
6647 compile time constant), or it is a constant that is not divisible by the
6648 vectorization factor, then an epilog loop needs to be created.
6649 We therefore duplicate the loop: the original loop will be vectorized,
6650 and will compute the first (n/VF) iterations. The second copy of the loop
6651 will remain scalar and will compute the remaining (n%VF) iterations.
6652 (VF is the vectorization factor). */
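/* For example (illustrative numbers): with VF = 8 and n = 103, the
   vectorized loop executes 103 / 8 = 12 iterations, covering the first 96
   scalar iterations, and the scalar epilog loop executes the remaining
   103 % 8 = 7 iterations.  */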
6654 if (LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
6655 || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
6657 tree ratio_mult_vf;
6658 if (!ni_name)
6659 ni_name = vect_build_loop_niters (loop_vinfo);
6660 vect_generate_tmps_on_preheader (loop_vinfo, ni_name, &ratio_mult_vf,
6661 &ratio);
6662 vect_do_peeling_for_loop_bound (loop_vinfo, ni_name, ratio_mult_vf,
6663 th, check_profitability);
6665 else if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
6666 ratio = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
6667 LOOP_VINFO_INT_NITERS (loop_vinfo) / vectorization_factor);
6668 else
6670 if (!ni_name)
6671 ni_name = vect_build_loop_niters (loop_vinfo);
6672 vect_generate_tmps_on_preheader (loop_vinfo, ni_name, NULL, &ratio);
6675 /* 1) Make sure the loop header has exactly two entries
6676 2) Make sure we have a preheader basic block. */
6678 gcc_assert (EDGE_COUNT (loop->header->preds) == 2);
6680 split_edge (loop_preheader_edge (loop));
6682 /* FORNOW: the vectorizer supports only loops whose body consists
6683 of one basic block (header + empty latch). When the vectorizer
6684 supports more involved loop forms, the order in which the BBs are
6685 traversed will need to be reconsidered. */
6687 for (i = 0; i < nbbs; i++)
6689 basic_block bb = bbs[i];
6690 stmt_vec_info stmt_info;
6692 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
6693 gsi_next (&si))
6695 gphi *phi = si.phi ();
6696 if (dump_enabled_p ())
6698 dump_printf_loc (MSG_NOTE, vect_location,
6699 "------>vectorizing phi: ");
6700 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
6701 dump_printf (MSG_NOTE, "\n");
6703 stmt_info = vinfo_for_stmt (phi);
6704 if (!stmt_info)
6705 continue;
6707 if (MAY_HAVE_DEBUG_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
6708 vect_loop_kill_debug_uses (loop, phi);
6710 if (!STMT_VINFO_RELEVANT_P (stmt_info)
6711 && !STMT_VINFO_LIVE_P (stmt_info))
6712 continue;
6714 if (STMT_VINFO_VECTYPE (stmt_info)
6715 && (TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info))
6716 != (unsigned HOST_WIDE_INT) vectorization_factor)
6717 && dump_enabled_p ())
6718 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
6720 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def)
6722 if (dump_enabled_p ())
6723 dump_printf_loc (MSG_NOTE, vect_location, "transform phi.\n");
6724 vect_transform_stmt (phi, NULL, NULL, NULL, NULL);
6728 pattern_stmt = NULL;
6729 for (gimple_stmt_iterator si = gsi_start_bb (bb);
6730 !gsi_end_p (si) || transform_pattern_stmt;)
6732 bool is_store;
6734 if (transform_pattern_stmt)
6735 stmt = pattern_stmt;
6736 else
6738 stmt = gsi_stmt (si);
6739 /* During vectorization remove existing clobber stmts. */
6740 if (gimple_clobber_p (stmt))
6742 unlink_stmt_vdef (stmt);
6743 gsi_remove (&si, true);
6744 release_defs (stmt);
6745 continue;
6749 if (dump_enabled_p ())
6751 dump_printf_loc (MSG_NOTE, vect_location,
6752 "------>vectorizing statement: ");
6753 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
6754 dump_printf (MSG_NOTE, "\n");
6757 stmt_info = vinfo_for_stmt (stmt);
6759 /* vector stmts created in the outer-loop during vectorization of
6760 stmts in an inner-loop may not have a stmt_info, and do not
6761 need to be vectorized. */
6762 if (!stmt_info)
6764 gsi_next (&si);
6765 continue;
6768 if (MAY_HAVE_DEBUG_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
6769 vect_loop_kill_debug_uses (loop, stmt);
6771 if (!STMT_VINFO_RELEVANT_P (stmt_info)
6772 && !STMT_VINFO_LIVE_P (stmt_info))
6774 if (STMT_VINFO_IN_PATTERN_P (stmt_info)
6775 && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
6776 && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
6777 || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
6779 stmt = pattern_stmt;
6780 stmt_info = vinfo_for_stmt (stmt);
6782 else
6784 gsi_next (&si);
6785 continue;
6788 else if (STMT_VINFO_IN_PATTERN_P (stmt_info)
6789 && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
6790 && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
6791 || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
6792 transform_pattern_stmt = true;
6794 /* If pattern statement has def stmts, vectorize them too. */
6795 if (is_pattern_stmt_p (stmt_info))
6797 if (pattern_def_seq == NULL)
6799 pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
6800 pattern_def_si = gsi_start (pattern_def_seq);
6802 else if (!gsi_end_p (pattern_def_si))
6803 gsi_next (&pattern_def_si);
6804 if (pattern_def_seq != NULL)
6806 gimple *pattern_def_stmt = NULL;
6807 stmt_vec_info pattern_def_stmt_info = NULL;
6809 while (!gsi_end_p (pattern_def_si))
6811 pattern_def_stmt = gsi_stmt (pattern_def_si);
6812 pattern_def_stmt_info
6813 = vinfo_for_stmt (pattern_def_stmt);
6814 if (STMT_VINFO_RELEVANT_P (pattern_def_stmt_info)
6815 || STMT_VINFO_LIVE_P (pattern_def_stmt_info))
6816 break;
6817 gsi_next (&pattern_def_si);
6820 if (!gsi_end_p (pattern_def_si))
6822 if (dump_enabled_p ())
6824 dump_printf_loc (MSG_NOTE, vect_location,
6825 "==> vectorizing pattern def "
6826 "stmt: ");
6827 dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
6828 pattern_def_stmt, 0);
6829 dump_printf (MSG_NOTE, "\n");
6832 stmt = pattern_def_stmt;
6833 stmt_info = pattern_def_stmt_info;
6835 else
6837 pattern_def_si = gsi_none ();
6838 transform_pattern_stmt = false;
6841 else
6842 transform_pattern_stmt = false;
6845 if (STMT_VINFO_VECTYPE (stmt_info))
6847 unsigned int nunits
6848 = (unsigned int)
6849 TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
6850 if (!STMT_SLP_TYPE (stmt_info)
6851 && nunits != (unsigned int) vectorization_factor
6852 && dump_enabled_p ())
6853 /* For SLP, VF is set according to the unrolling factor, and not
6854 to the vector size, hence for SLP this print is not valid. */
6855 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
6858 /* SLP. Schedule all the SLP instances when the first SLP stmt is
6859 reached. */
6860 if (STMT_SLP_TYPE (stmt_info))
6862 if (!slp_scheduled)
6864 slp_scheduled = true;
6866 if (dump_enabled_p ())
6867 dump_printf_loc (MSG_NOTE, vect_location,
6868 "=== scheduling SLP instances ===\n");
6870 vect_schedule_slp (loop_vinfo);
6873 /* Hybrid SLP stmts must be vectorized in addition to SLP. */
6874 if (!vinfo_for_stmt (stmt) || PURE_SLP_STMT (stmt_info))
6876 if (!transform_pattern_stmt && gsi_end_p (pattern_def_si))
6878 pattern_def_seq = NULL;
6879 gsi_next (&si);
6881 continue;
6885 /* -------- vectorize statement ------------ */
6886 if (dump_enabled_p ())
6887 dump_printf_loc (MSG_NOTE, vect_location, "transform statement.\n");
6889 grouped_store = false;
6890 is_store = vect_transform_stmt (stmt, &si, &grouped_store, NULL, NULL);
6891 if (is_store)
6893 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
6895 /* Interleaving. If IS_STORE is TRUE, the vectorization of the
6896 interleaving chain was completed - free all the stores in
6897 the chain. */
6898 gsi_next (&si);
6899 vect_remove_stores (GROUP_FIRST_ELEMENT (stmt_info));
6901 else
6903 /* Free the attached stmt_vec_info and remove the stmt. */
6904 gimple *store = gsi_stmt (si);
6905 free_stmt_vec_info (store);
6906 unlink_stmt_vdef (store);
6907 gsi_remove (&si, true);
6908 release_defs (store);
6911 /* Stores can only appear at the end of pattern statements. */
6912 gcc_assert (!transform_pattern_stmt);
6913 pattern_def_seq = NULL;
6915 else if (!transform_pattern_stmt && gsi_end_p (pattern_def_si))
6917 pattern_def_seq = NULL;
6918 gsi_next (&si);
6920 } /* stmts in BB */
6921 } /* BBs in loop */
6923 slpeel_make_loop_iterate_ntimes (loop, ratio);
6925 /* Reduce loop iterations by the vectorization factor. */
6926 scale_loop_profile (loop, GCOV_COMPUTE_SCALE (1, vectorization_factor),
6927 expected_iterations / vectorization_factor);
6928 loop->nb_iterations_upper_bound
6929 = wi::udiv_floor (loop->nb_iterations_upper_bound, vectorization_factor);
6930 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
6931 && loop->nb_iterations_upper_bound != 0)
6932 loop->nb_iterations_upper_bound = loop->nb_iterations_upper_bound - 1;
6933 if (loop->any_estimate)
6935 loop->nb_iterations_estimate
6936 = wi::udiv_floor (loop->nb_iterations_estimate, vectorization_factor);
6937 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
6938 && loop->nb_iterations_estimate != 0)
6939 loop->nb_iterations_estimate = loop->nb_iterations_estimate - 1;
6942 if (dump_enabled_p ())
6944 dump_printf_loc (MSG_NOTE, vect_location,
6945 "LOOP VECTORIZED\n");
6946 if (loop->inner)
6947 dump_printf_loc (MSG_NOTE, vect_location,
6948 "OUTER LOOP VECTORIZED\n");
6949 dump_printf (MSG_NOTE, "\n");
6952 /* Free SLP instances here because otherwise stmt reference counting
6953 won't work. */
6954 slp_instance instance;
6955 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
6956 vect_free_slp_instance (instance);
6957 LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
6960 /* The code below tries to perform a simple optimization - reverting
6961 if-conversion for masked stores, i.e. if the mask of a store is zero,
6962 do not perform the store and, if possible, skip the stored value producers too.
6963 For example,
6964 for (i=0; i<n; i++)
6965 if (c[i])
6967 p1[i] += 1;
6968 p2[i] = p3[i] +2;
6970 this transformation will produce the following semi-hammock:
6972 if (!mask__ifc__42.18_165 == { 0, 0, 0, 0, 0, 0, 0, 0 })
6974 vect__11.19_170 = MASK_LOAD (vectp_p1.20_168, 0B, mask__ifc__42.18_165);
6975 vect__12.22_172 = vect__11.19_170 + vect_cst__171;
6976 MASK_STORE (vectp_p1.23_175, 0B, mask__ifc__42.18_165, vect__12.22_172);
6977 vect__18.25_182 = MASK_LOAD (vectp_p3.26_180, 0B, mask__ifc__42.18_165);
6978 vect__19.28_184 = vect__18.25_182 + vect_cst__183;
6979 MASK_STORE (vectp_p2.29_187, 0B, mask__ifc__42.18_165, vect__19.28_184);
6983 void
6984 optimize_mask_stores (struct loop *loop)
6986 basic_block *bbs = get_loop_body (loop);
6987 unsigned nbbs = loop->num_nodes;
6988 unsigned i;
6989 basic_block bb;
6990 gimple_stmt_iterator gsi;
6991 gimple *stmt;
6992 auto_vec<gimple *> worklist;
6994 vect_location = find_loop_location (loop);
6995 /* Pick up all masked stores in loop if any. */
6996 for (i = 0; i < nbbs; i++)
6998 bb = bbs[i];
6999 for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
7000 gsi_next (&gsi))
7002 stmt = gsi_stmt (gsi);
7003 if (is_gimple_call (stmt)
7004 && gimple_call_internal_p (stmt)
7005 && gimple_call_internal_fn (stmt) == IFN_MASK_STORE)
7006 worklist.safe_push (stmt);
7010 free (bbs);
7011 if (worklist.is_empty ())
7012 return;
7014 /* Loop has masked stores. */
7015 while (!worklist.is_empty ())
7017 gimple *last, *last_store;
7018 edge e, efalse;
7019 tree mask;
7020 basic_block store_bb, join_bb;
7021 gimple_stmt_iterator gsi_to;
7022 tree vdef, new_vdef;
7023 gphi *phi;
7024 tree vectype;
7025 tree zero;
7027 last = worklist.pop ();
7028 mask = gimple_call_arg (last, 2);
7029 bb = gimple_bb (last);
7030 /* Create new bb. */
7031 e = split_block (bb, last);
7032 join_bb = e->dest;
7033 store_bb = create_empty_bb (bb);
7034 add_bb_to_loop (store_bb, loop);
7035 e->flags = EDGE_TRUE_VALUE;
7036 efalse = make_edge (bb, store_bb, EDGE_FALSE_VALUE);
7038 /* Put STORE_BB into the likely part. */
7038 efalse->probability = PROB_UNLIKELY;
7039 store_bb->frequency = PROB_ALWAYS - EDGE_FREQUENCY (efalse);
7040 make_edge (store_bb, join_bb, EDGE_FALLTHRU);
7041 if (dom_info_available_p (CDI_DOMINATORS))
7042 set_immediate_dominator (CDI_DOMINATORS, store_bb, bb);
7043 if (dump_enabled_p ())
7044 dump_printf_loc (MSG_NOTE, vect_location,
7045 "Create new block %d to sink mask stores.",
7046 store_bb->index);
7047 /* Create vector comparison with boolean result. */
7048 vectype = TREE_TYPE (mask);
7049 zero = build_zero_cst (vectype);
7050 stmt = gimple_build_cond (EQ_EXPR, mask, zero, NULL_TREE, NULL_TREE);
7051 gsi = gsi_last_bb (bb);
7052 gsi_insert_after (&gsi, stmt, GSI_SAME_STMT);
7053 /* Create new PHI node for vdef of the last masked store:
7054 .MEM_2 = VDEF <.MEM_1>
7055 will be converted to
7056 .MEM.3 = VDEF <.MEM_1>
7057 and new PHI node will be created in join bb
7058 .MEM_2 = PHI <.MEM_1, .MEM_3>
7060 vdef = gimple_vdef (last);
7061 new_vdef = make_ssa_name (gimple_vop (cfun), last);
7062 gimple_set_vdef (last, new_vdef);
7063 phi = create_phi_node (vdef, join_bb);
7064 add_phi_arg (phi, new_vdef, EDGE_SUCC (store_bb, 0), UNKNOWN_LOCATION);
7066 /* Put all masked stores with the same mask to STORE_BB if possible. */
7067 while (true)
7069 gimple_stmt_iterator gsi_from;
7070 gimple *stmt1 = NULL;
7072 /* Move masked store to STORE_BB. */
7073 last_store = last;
7074 gsi = gsi_for_stmt (last);
7075 gsi_from = gsi;
7076 /* Shift GSI to the previous stmt for further traversal. */
7077 gsi_prev (&gsi);
7078 gsi_to = gsi_start_bb (store_bb);
7079 gsi_move_before (&gsi_from, &gsi_to);
7080 /* Setup GSI_TO to the non-empty block start. */
7081 gsi_to = gsi_start_bb (store_bb);
7082 if (dump_enabled_p ())
7084 dump_printf_loc (MSG_NOTE, vect_location,
7085 "Move stmt to created bb\n");
7086 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, last, 0);
7088 /* Move all stored value producers if possible. */
7089 while (!gsi_end_p (gsi))
7091 tree lhs;
7092 imm_use_iterator imm_iter;
7093 use_operand_p use_p;
7094 bool res;
7096 /* Skip debug statements. */
7097 if (is_gimple_debug (gsi_stmt (gsi)))
7099 gsi_prev (&gsi);
7100 continue;
7102 stmt1 = gsi_stmt (gsi);
7103 /* Do not consider statements writing to memory or having a
7104 volatile operand. */
7105 if (gimple_vdef (stmt1)
7106 || gimple_has_volatile_ops (stmt1))
7107 break;
7108 gsi_from = gsi;
7109 gsi_prev (&gsi);
7110 lhs = gimple_get_lhs (stmt1);
7111 if (!lhs)
7112 break;
7114 /* LHS of vectorized stmt must be SSA_NAME. */
7115 if (TREE_CODE (lhs) != SSA_NAME)
7116 break;
7118 if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
7120 /* Remove dead scalar statement. */
7121 if (has_zero_uses (lhs))
7123 gsi_remove (&gsi_from, true);
7124 continue;
7128 /* Check that LHS does not have uses outside of STORE_BB. */
7129 res = true;
7130 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
7132 gimple *use_stmt;
7133 use_stmt = USE_STMT (use_p);
7134 if (is_gimple_debug (use_stmt))
7135 continue;
7136 if (gimple_bb (use_stmt) != store_bb)
7138 res = false;
7139 break;
7142 if (!res)
7143 break;
7145 if (gimple_vuse (stmt1)
7146 && gimple_vuse (stmt1) != gimple_vuse (last_store))
7147 break;
7149 /* Can move STMT1 to STORE_BB. */
7150 if (dump_enabled_p ())
7152 dump_printf_loc (MSG_NOTE, vect_location,
7153 "Move stmt to created bb\n");
7154 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt1, 0);
7156 gsi_move_before (&gsi_from, &gsi_to);
7157 /* Shift GSI_TO for further insertion. */
7158 gsi_prev (&gsi_to);
7160 /* Put other masked stores with the same mask to STORE_BB. */
7161 if (worklist.is_empty ()
7162 || gimple_call_arg (worklist.last (), 2) != mask
7163 || worklist.last () != stmt1)
7164 break;
7165 last = worklist.pop ();
7167 add_phi_arg (phi, gimple_vuse (last_store), e, UNKNOWN_LOCATION);