PR middle-end/83164
gcc/tree-vect-loop.c
1 /* Loop Vectorization
2 Copyright (C) 2003-2017 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com> and
4 Ira Rosen <irar@il.ibm.com>
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "rtl.h"
28 #include "tree.h"
29 #include "gimple.h"
30 #include "cfghooks.h"
31 #include "tree-pass.h"
32 #include "ssa.h"
33 #include "optabs-tree.h"
34 #include "diagnostic-core.h"
35 #include "fold-const.h"
36 #include "stor-layout.h"
37 #include "cfganal.h"
38 #include "gimplify.h"
39 #include "gimple-iterator.h"
40 #include "gimplify-me.h"
41 #include "tree-ssa-loop-ivopts.h"
42 #include "tree-ssa-loop-manip.h"
43 #include "tree-ssa-loop-niter.h"
44 #include "tree-ssa-loop.h"
45 #include "cfgloop.h"
46 #include "params.h"
47 #include "tree-scalar-evolution.h"
48 #include "tree-vectorizer.h"
49 #include "gimple-fold.h"
50 #include "cgraph.h"
51 #include "tree-cfg.h"
52 #include "tree-if-conv.h"
53 #include "internal-fn.h"
55 /* Loop Vectorization Pass.
57 This pass tries to vectorize loops.
59 For example, the vectorizer transforms the following simple loop:
61 short a[N]; short b[N]; short c[N]; int i;
63 for (i=0; i<N; i++){
64 a[i] = b[i] + c[i];
67 as if it had been manually vectorized by rewriting the source code into:
69 typedef int __attribute__((mode(V8HI))) v8hi;
70 short a[N]; short b[N]; short c[N]; int i;
71 v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
72 v8hi va, vb, vc;
74 for (i=0; i<N/8; i++){
75 vb = pb[i];
76 vc = pc[i];
77 va = vb + vc;
78 pa[i] = va;
81 The main entry to this pass is vectorize_loops(), in which
82 the vectorizer applies a set of analyses on a given set of loops,
83 followed by the actual vectorization transformation for the loops that
84 had successfully passed the analysis phase.
85 Throughout this pass we make a distinction between two types of
86 data: scalars (which are represented by SSA_NAMES), and memory references
87 ("data-refs"). These two types of data require different handling both
88 during analysis and transformation. The types of data-refs that the
89 vectorizer currently supports are ARRAY_REFS whose base is an array DECL
90 (not a pointer), and INDIRECT_REFS through pointers; both array and pointer
91 accesses are required to have a simple (consecutive) access pattern.
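For instance (an editorial illustration, not part of the original comment;
the array names and the bound N are placeholders), the first loop below
uses only consecutive accesses, while the second uses a strided load that
is not a simple consecutive pattern:

	for (i = 0; i < N; i++)
	  b[i] = a[i];
	for (i = 0; i < N; i++)
	  b[i] = a[2*i];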
93 Analysis phase:
94 ===============
95 The driver for the analysis phase is vect_analyze_loop().
96 It applies a set of analyses, some of which rely on the scalar evolution
97 analyzer (scev) developed by Sebastian Pop.
99 During the analysis phase the vectorizer records some information
100 per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
101 loop, as well as general information about the loop as a whole, which is
102 recorded in a "loop_vec_info" struct attached to each loop.
104 Transformation phase:
105 =====================
106 The loop transformation phase scans all the stmts in the loop, and
107 creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
108 the loop that needs to be vectorized. It inserts the vector code sequence
109 just before the scalar stmt S, and records a pointer to the vector code
110 in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
111 attached to S). This pointer will be used for the vectorization of following
112 stmts which use the def of stmt S. Stmt S is removed if it writes to memory;
113 otherwise, we rely on dead code elimination for removing it.
115 For example, say stmt S1 was vectorized into stmt VS1:
117 VS1: vb = px[i];
118 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
119 S2: a = b;
121 To vectorize stmt S2, the vectorizer first finds the stmt that defines
122 the operand 'b' (S1), and gets the relevant vector def 'vb' from the
123 vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)). The
124 resulting sequence would be:
126 VS1: vb = px[i];
127 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
128 VS2: va = vb;
129 S2: a = b; STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2
131 Operands that are not SSA_NAMEs are data-refs that appear in
132 load/store operations (like 'x[i]' in S1), and are handled differently.
134 Target modeling:
135 =================
136 Currently the only target specific information that is used is the
137 size of the vector (in bytes) - "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".
138 Targets that can support different sizes of vectors will, for now, need
139 to specify one value for "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD". More
140 flexibility will be added in the future.
142 Since we only vectorize operations whose vector form can be
143 expressed using existing tree codes, to verify that an operation is
144 supported, the vectorizer checks the relevant optab at the relevant
145 machine_mode (e.g, optab_handler (add_optab, V8HImode)). If
146 the value found is CODE_FOR_nothing, then there's no target support, and
147 we can't vectorize the stmt.
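As an editorial sketch only, reusing the add_optab/V8HImode example from
above, such a support check has the shape:

	if (optab_handler (add_optab, V8HImode) == CODE_FOR_nothing)
	  return false;

i.e. a CODE_FOR_nothing handler means the stmt cannot be vectorized.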
149 For additional information on this project see:
150 http://gcc.gnu.org/projects/tree-ssa/vectorization.html
153 static void vect_estimate_min_profitable_iters (loop_vec_info, int *, int *);
155 /* Function vect_determine_vectorization_factor
157 Determine the vectorization factor (VF). VF is the number of data elements
158 that are operated upon in parallel in a single iteration of the vectorized
159 loop. For example, when vectorizing a loop that operates on 4-byte elements,
160 on a target with a vector size (VS) of 16 bytes, the VF is set to 4, since 4
161 elements can fit in a single vector register.
163 We currently support vectorization of loops in which all types operated upon
164 are of the same size. Therefore this function currently sets VF according to
165 the size of the types operated upon, and fails if there are multiple sizes
166 in the loop.
168 VF is also the factor by which the loop iterations are strip-mined, e.g.:
169 original loop:
170 for (i=0; i<N; i++){
171 a[i] = b[i] + c[i];
174 vectorized loop:
175 for (i=0; i<N; i+=VF){
176 a[i:VF] = b[i:VF] + c[i:VF];
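As an editorial illustration only, for VF == 4 the strip-mined loop
corresponds to the following scalar-equivalent C, with a remainder loop
for the iterations left over when N is not a multiple of VF:

	for (i = 0; i + 4 <= N; i += 4)
	  {
	    a[i]   = b[i]   + c[i];
	    a[i+1] = b[i+1] + c[i+1];
	    a[i+2] = b[i+2] + c[i+2];
	    a[i+3] = b[i+3] + c[i+3];
	  }
	for (; i < N; i++)
	  a[i] = b[i] + c[i];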
180 static bool
181 vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
183 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
184 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
185 unsigned nbbs = loop->num_nodes;
186 unsigned int vectorization_factor = 0;
187 tree scalar_type = NULL_TREE;
188 gphi *phi;
189 tree vectype;
190 unsigned int nunits;
191 stmt_vec_info stmt_info;
192 unsigned i;
193 HOST_WIDE_INT dummy;
194 gimple *stmt, *pattern_stmt = NULL;
195 gimple_seq pattern_def_seq = NULL;
196 gimple_stmt_iterator pattern_def_si = gsi_none ();
197 bool analyze_pattern_stmt = false;
198 bool bool_result;
199 auto_vec<stmt_vec_info> mask_producers;
201 if (dump_enabled_p ())
202 dump_printf_loc (MSG_NOTE, vect_location,
203 "=== vect_determine_vectorization_factor ===\n");
205 for (i = 0; i < nbbs; i++)
207 basic_block bb = bbs[i];
209 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
210 gsi_next (&si))
212 phi = si.phi ();
213 stmt_info = vinfo_for_stmt (phi);
214 if (dump_enabled_p ())
216 dump_printf_loc (MSG_NOTE, vect_location, "==> examining phi: ");
217 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
220 gcc_assert (stmt_info);
222 if (STMT_VINFO_RELEVANT_P (stmt_info)
223 || STMT_VINFO_LIVE_P (stmt_info))
225 gcc_assert (!STMT_VINFO_VECTYPE (stmt_info));
226 scalar_type = TREE_TYPE (PHI_RESULT (phi));
228 if (dump_enabled_p ())
230 dump_printf_loc (MSG_NOTE, vect_location,
231 "get vectype for scalar type: ");
232 dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
233 dump_printf (MSG_NOTE, "\n");
236 vectype = get_vectype_for_scalar_type (scalar_type);
237 if (!vectype)
239 if (dump_enabled_p ())
241 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
242 "not vectorized: unsupported "
243 "data-type ");
244 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
245 scalar_type);
246 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
248 return false;
250 STMT_VINFO_VECTYPE (stmt_info) = vectype;
252 if (dump_enabled_p ())
254 dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
255 dump_generic_expr (MSG_NOTE, TDF_SLIM, vectype);
256 dump_printf (MSG_NOTE, "\n");
259 nunits = TYPE_VECTOR_SUBPARTS (vectype);
260 if (dump_enabled_p ())
261 dump_printf_loc (MSG_NOTE, vect_location, "nunits = %d\n",
262 nunits);
264 if (!vectorization_factor
265 || (nunits > vectorization_factor))
266 vectorization_factor = nunits;
270 for (gimple_stmt_iterator si = gsi_start_bb (bb);
271 !gsi_end_p (si) || analyze_pattern_stmt;)
273 tree vf_vectype;
275 if (analyze_pattern_stmt)
276 stmt = pattern_stmt;
277 else
278 stmt = gsi_stmt (si);
280 stmt_info = vinfo_for_stmt (stmt);
282 if (dump_enabled_p ())
284 dump_printf_loc (MSG_NOTE, vect_location,
285 "==> examining statement: ");
286 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
289 gcc_assert (stmt_info);
291 /* Skip stmts which do not need to be vectorized. */
292 if ((!STMT_VINFO_RELEVANT_P (stmt_info)
293 && !STMT_VINFO_LIVE_P (stmt_info))
294 || gimple_clobber_p (stmt))
296 if (STMT_VINFO_IN_PATTERN_P (stmt_info)
297 && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
298 && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
299 || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
301 stmt = pattern_stmt;
302 stmt_info = vinfo_for_stmt (pattern_stmt);
303 if (dump_enabled_p ())
305 dump_printf_loc (MSG_NOTE, vect_location,
306 "==> examining pattern statement: ");
307 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
310 else
312 if (dump_enabled_p ())
313 dump_printf_loc (MSG_NOTE, vect_location, "skip.\n");
314 gsi_next (&si);
315 continue;
318 else if (STMT_VINFO_IN_PATTERN_P (stmt_info)
319 && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
320 && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
321 || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
322 analyze_pattern_stmt = true;
324 /* If a pattern statement has def stmts, analyze them too. */
325 if (is_pattern_stmt_p (stmt_info))
327 if (pattern_def_seq == NULL)
329 pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
330 pattern_def_si = gsi_start (pattern_def_seq);
332 else if (!gsi_end_p (pattern_def_si))
333 gsi_next (&pattern_def_si);
334 if (pattern_def_seq != NULL)
336 gimple *pattern_def_stmt = NULL;
337 stmt_vec_info pattern_def_stmt_info = NULL;
339 while (!gsi_end_p (pattern_def_si))
341 pattern_def_stmt = gsi_stmt (pattern_def_si);
342 pattern_def_stmt_info
343 = vinfo_for_stmt (pattern_def_stmt);
344 if (STMT_VINFO_RELEVANT_P (pattern_def_stmt_info)
345 || STMT_VINFO_LIVE_P (pattern_def_stmt_info))
346 break;
347 gsi_next (&pattern_def_si);
350 if (!gsi_end_p (pattern_def_si))
352 if (dump_enabled_p ())
354 dump_printf_loc (MSG_NOTE, vect_location,
355 "==> examining pattern def stmt: ");
356 dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
357 pattern_def_stmt, 0);
360 stmt = pattern_def_stmt;
361 stmt_info = pattern_def_stmt_info;
363 else
365 pattern_def_si = gsi_none ();
366 analyze_pattern_stmt = false;
369 else
370 analyze_pattern_stmt = false;
373 if (gimple_get_lhs (stmt) == NULL_TREE
374 /* MASK_STORE has no lhs, but is ok. */
375 && (!is_gimple_call (stmt)
376 || !gimple_call_internal_p (stmt)
377 || gimple_call_internal_fn (stmt) != IFN_MASK_STORE))
379 if (is_gimple_call (stmt))
381 /* Ignore calls with no lhs. These must be calls to
382 #pragma omp simd functions, and what vectorization factor
383 it really needs can't be determined until
384 vectorizable_simd_clone_call. */
385 if (!analyze_pattern_stmt && gsi_end_p (pattern_def_si))
387 pattern_def_seq = NULL;
388 gsi_next (&si);
390 continue;
392 if (dump_enabled_p ())
394 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
395 "not vectorized: irregular stmt.");
396 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt,
399 return false;
402 if (VECTOR_MODE_P (TYPE_MODE (gimple_expr_type (stmt))))
404 if (dump_enabled_p ())
406 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
407 "not vectorized: vector stmt in loop:");
408 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
410 return false;
413 bool_result = false;
415 if (STMT_VINFO_VECTYPE (stmt_info))
417 /* The only case when a vectype has already been set is for stmts
418 that contain a dataref, or for "pattern-stmts" (stmts
419 generated by the vectorizer to represent/replace a certain
420 idiom). */
421 gcc_assert (STMT_VINFO_DATA_REF (stmt_info)
422 || is_pattern_stmt_p (stmt_info)
423 || !gsi_end_p (pattern_def_si));
424 vectype = STMT_VINFO_VECTYPE (stmt_info);
426 else
428 gcc_assert (!STMT_VINFO_DATA_REF (stmt_info));
429 if (gimple_call_internal_p (stmt, IFN_MASK_STORE))
430 scalar_type = TREE_TYPE (gimple_call_arg (stmt, 3));
431 else
432 scalar_type = TREE_TYPE (gimple_get_lhs (stmt));
434 /* Bool ops don't participate in vectorization factor
435 computation. For comparisons, use the compared types to
436 compute a factor. */
437 if (VECT_SCALAR_BOOLEAN_TYPE_P (scalar_type)
438 && is_gimple_assign (stmt)
439 && gimple_assign_rhs_code (stmt) != COND_EXPR)
441 if (STMT_VINFO_RELEVANT_P (stmt_info)
442 || STMT_VINFO_LIVE_P (stmt_info))
443 mask_producers.safe_push (stmt_info);
444 bool_result = true;
446 if (TREE_CODE_CLASS (gimple_assign_rhs_code (stmt))
447 == tcc_comparison
448 && !VECT_SCALAR_BOOLEAN_TYPE_P
449 (TREE_TYPE (gimple_assign_rhs1 (stmt))))
450 scalar_type = TREE_TYPE (gimple_assign_rhs1 (stmt));
451 else
453 if (!analyze_pattern_stmt && gsi_end_p (pattern_def_si))
455 pattern_def_seq = NULL;
456 gsi_next (&si);
458 continue;
462 if (dump_enabled_p ())
464 dump_printf_loc (MSG_NOTE, vect_location,
465 "get vectype for scalar type: ");
466 dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
467 dump_printf (MSG_NOTE, "\n");
469 vectype = get_vectype_for_scalar_type (scalar_type);
470 if (!vectype)
472 if (dump_enabled_p ())
474 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
475 "not vectorized: unsupported "
476 "data-type ");
477 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
478 scalar_type);
479 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
481 return false;
484 if (!bool_result)
485 STMT_VINFO_VECTYPE (stmt_info) = vectype;
487 if (dump_enabled_p ())
489 dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
490 dump_generic_expr (MSG_NOTE, TDF_SLIM, vectype);
491 dump_printf (MSG_NOTE, "\n");
495 /* Don't try to compute the VF from scalar types if the stmt
496 produces a boolean vector. Use the result vectype instead. */
497 if (VECTOR_BOOLEAN_TYPE_P (vectype))
498 vf_vectype = vectype;
499 else
501 /* The vectorization factor is according to the smallest
502 scalar type (or the largest vector size, but we only
503 support one vector size per loop). */
504 if (!bool_result)
505 scalar_type = vect_get_smallest_scalar_type (stmt, &dummy,
506 &dummy);
507 if (dump_enabled_p ())
509 dump_printf_loc (MSG_NOTE, vect_location,
510 "get vectype for scalar type: ");
511 dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
512 dump_printf (MSG_NOTE, "\n");
514 vf_vectype = get_vectype_for_scalar_type (scalar_type);
516 if (!vf_vectype)
518 if (dump_enabled_p ())
520 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
521 "not vectorized: unsupported data-type ");
522 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
523 scalar_type);
524 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
526 return false;
529 if ((GET_MODE_SIZE (TYPE_MODE (vectype))
530 != GET_MODE_SIZE (TYPE_MODE (vf_vectype))))
532 if (dump_enabled_p ())
534 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
535 "not vectorized: different sized vector "
536 "types in statement, ");
537 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
538 vectype);
539 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
540 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
541 vf_vectype);
542 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
544 return false;
547 if (dump_enabled_p ())
549 dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
550 dump_generic_expr (MSG_NOTE, TDF_SLIM, vf_vectype);
551 dump_printf (MSG_NOTE, "\n");
554 nunits = TYPE_VECTOR_SUBPARTS (vf_vectype);
555 if (dump_enabled_p ())
556 dump_printf_loc (MSG_NOTE, vect_location, "nunits = %d\n", nunits);
557 if (!vectorization_factor
558 || (nunits > vectorization_factor))
559 vectorization_factor = nunits;
561 if (!analyze_pattern_stmt && gsi_end_p (pattern_def_si))
563 pattern_def_seq = NULL;
564 gsi_next (&si);
569 /* TODO: Analyze cost. Decide if worth while to vectorize. */
570 if (dump_enabled_p ())
571 dump_printf_loc (MSG_NOTE, vect_location, "vectorization factor = %d\n",
572 vectorization_factor);
573 if (vectorization_factor <= 1)
575 if (dump_enabled_p ())
576 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
577 "not vectorized: unsupported data-type\n");
578 return false;
580 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
582 for (i = 0; i < mask_producers.length (); i++)
584 tree mask_type = NULL;
586 stmt = STMT_VINFO_STMT (mask_producers[i]);
588 if (is_gimple_assign (stmt)
589 && TREE_CODE_CLASS (gimple_assign_rhs_code (stmt)) == tcc_comparison
590 && !VECT_SCALAR_BOOLEAN_TYPE_P
591 (TREE_TYPE (gimple_assign_rhs1 (stmt))))
593 scalar_type = TREE_TYPE (gimple_assign_rhs1 (stmt));
594 mask_type = get_mask_type_for_scalar_type (scalar_type);
596 if (!mask_type)
598 if (dump_enabled_p ())
599 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
600 "not vectorized: unsupported mask\n");
601 return false;
604 else
606 tree rhs;
607 ssa_op_iter iter;
608 gimple *def_stmt;
609 enum vect_def_type dt;
611 FOR_EACH_SSA_TREE_OPERAND (rhs, stmt, iter, SSA_OP_USE)
613 if (!vect_is_simple_use (rhs, mask_producers[i]->vinfo,
614 &def_stmt, &dt, &vectype))
616 if (dump_enabled_p ())
618 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
619 "not vectorized: can't compute mask type "
620 "for statement, ");
621 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt,
624 return false;
627 /* No vectype probably means external definition.
628 Allow it in case there is another operand which
629 allows us to determine the mask type. */
630 if (!vectype)
631 continue;
633 if (!mask_type)
634 mask_type = vectype;
635 else if (TYPE_VECTOR_SUBPARTS (mask_type)
636 != TYPE_VECTOR_SUBPARTS (vectype))
638 if (dump_enabled_p ())
640 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
641 "not vectorized: different sized masks "
642 "types in statement, ");
643 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
644 mask_type);
645 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
646 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
647 vectype);
648 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
650 return false;
652 else if (VECTOR_BOOLEAN_TYPE_P (mask_type)
653 != VECTOR_BOOLEAN_TYPE_P (vectype))
655 if (dump_enabled_p ())
657 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
658 "not vectorized: mixed mask and "
659 "nonmask vector types in statement, ");
660 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
661 mask_type);
662 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
663 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
664 vectype);
665 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
667 return false;
671 /* We may compare a boolean value loaded as a vector of integers.
672 Fix mask_type in such a case. */
673 if (mask_type
674 && !VECTOR_BOOLEAN_TYPE_P (mask_type)
675 && gimple_code (stmt) == GIMPLE_ASSIGN
676 && TREE_CODE_CLASS (gimple_assign_rhs_code (stmt)) == tcc_comparison)
677 mask_type = build_same_sized_truth_vector_type (mask_type);
680 /* No mask_type should mean loop invariant predicate.
681 This is probably a subject for optimization in
682 if-conversion. */
683 if (!mask_type)
685 if (dump_enabled_p ())
687 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
688 "not vectorized: can't compute mask type "
689 "for statement, ");
690 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt,
693 return false;
696 STMT_VINFO_VECTYPE (mask_producers[i]) = mask_type;
699 return true;
703 /* Function vect_is_simple_iv_evolution.
705 FORNOW: A simple evolution of an induction variable in the loop is
706 considered a polynomial evolution. */
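/* Editorial illustration, not part of the original sources: for

	for (i = 0; i < n; i++)
	  a[i] = 0;

   the scalar evolution of 'i' is printed as {0, +, 1}_1 - start at 0,
   add 1 on every iteration of loop number 1 (the loop number is only an
   assumed example).  Such a constant-step evolution is "simple" in the
   sense checked below, whereas a step that itself varies inside the
   loop (a nested chrec) is rejected.  */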
708 static bool
709 vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init,
710 tree * step)
712 tree init_expr;
713 tree step_expr;
714 tree evolution_part = evolution_part_in_loop_num (access_fn, loop_nb);
715 basic_block bb;
717 /* When there is no evolution in this loop, the evolution function
718 is not "simple". */
719 if (evolution_part == NULL_TREE)
720 return false;
722 /* When the evolution is a polynomial of degree >= 2
723 the evolution function is not "simple". */
724 if (tree_is_chrec (evolution_part))
725 return false;
727 step_expr = evolution_part;
728 init_expr = unshare_expr (initial_condition_in_loop_num (access_fn, loop_nb));
730 if (dump_enabled_p ())
732 dump_printf_loc (MSG_NOTE, vect_location, "step: ");
733 dump_generic_expr (MSG_NOTE, TDF_SLIM, step_expr);
734 dump_printf (MSG_NOTE, ", init: ");
735 dump_generic_expr (MSG_NOTE, TDF_SLIM, init_expr);
736 dump_printf (MSG_NOTE, "\n");
739 *init = init_expr;
740 *step = step_expr;
742 if (TREE_CODE (step_expr) != INTEGER_CST
743 && (TREE_CODE (step_expr) != SSA_NAME
744 || ((bb = gimple_bb (SSA_NAME_DEF_STMT (step_expr)))
745 && flow_bb_inside_loop_p (get_loop (cfun, loop_nb), bb))
746 || (!INTEGRAL_TYPE_P (TREE_TYPE (step_expr))
747 && (!SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr))
748 || !flag_associative_math)))
749 && (TREE_CODE (step_expr) != REAL_CST
750 || !flag_associative_math))
752 if (dump_enabled_p ())
753 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
754 "step unknown.\n");
755 return false;
758 return true;
761 /* Function vect_analyze_scalar_cycles_1.
763 Examine the cross iteration def-use cycles of scalar variables
764 in LOOP. LOOP_VINFO represents the loop that is now being
765 considered for vectorization (can be LOOP, or an outer-loop
766 enclosing LOOP). */
768 static void
769 vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, struct loop *loop)
771 basic_block bb = loop->header;
772 tree init, step;
773 auto_vec<gimple *, 64> worklist;
774 gphi_iterator gsi;
775 bool double_reduc;
777 if (dump_enabled_p ())
778 dump_printf_loc (MSG_NOTE, vect_location,
779 "=== vect_analyze_scalar_cycles ===\n");
781 /* First - identify all inductions. Reduction detection assumes that all the
782 inductions have been identified, therefore, this order must not be
783 changed. */
784 for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
786 gphi *phi = gsi.phi ();
787 tree access_fn = NULL;
788 tree def = PHI_RESULT (phi);
789 stmt_vec_info stmt_vinfo = vinfo_for_stmt (phi);
791 if (dump_enabled_p ())
793 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: ");
794 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
797 /* Skip virtual phi's. The data dependences that are associated with
798 virtual defs/uses (i.e., memory accesses) are analyzed elsewhere. */
799 if (virtual_operand_p (def))
800 continue;
802 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_unknown_def_type;
804 /* Analyze the evolution function. */
805 access_fn = analyze_scalar_evolution (loop, def);
806 if (access_fn)
808 STRIP_NOPS (access_fn);
809 if (dump_enabled_p ())
811 dump_printf_loc (MSG_NOTE, vect_location,
812 "Access function of PHI: ");
813 dump_generic_expr (MSG_NOTE, TDF_SLIM, access_fn);
814 dump_printf (MSG_NOTE, "\n");
816 STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
817 = initial_condition_in_loop_num (access_fn, loop->num);
818 STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo)
819 = evolution_part_in_loop_num (access_fn, loop->num);
822 if (!access_fn
823 || !vect_is_simple_iv_evolution (loop->num, access_fn, &init, &step)
824 || (LOOP_VINFO_LOOP (loop_vinfo) != loop
825 && TREE_CODE (step) != INTEGER_CST))
827 worklist.safe_push (phi);
828 continue;
831 gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
832 != NULL_TREE);
833 gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo) != NULL_TREE);
835 if (dump_enabled_p ())
836 dump_printf_loc (MSG_NOTE, vect_location, "Detected induction.\n");
837 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_induction_def;
841 /* Second - identify all reductions and nested cycles. */
842 while (worklist.length () > 0)
844 gimple *phi = worklist.pop ();
845 tree def = PHI_RESULT (phi);
846 stmt_vec_info stmt_vinfo = vinfo_for_stmt (phi);
847 gimple *reduc_stmt;
849 if (dump_enabled_p ())
851 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: ");
852 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
855 gcc_assert (!virtual_operand_p (def)
856 && STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_unknown_def_type);
858 reduc_stmt = vect_force_simple_reduction (loop_vinfo, phi,
859 &double_reduc, false);
860 if (reduc_stmt)
862 if (double_reduc)
864 if (dump_enabled_p ())
865 dump_printf_loc (MSG_NOTE, vect_location,
866 "Detected double reduction.\n");
868 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_double_reduction_def;
869 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
870 vect_double_reduction_def;
872 else
874 if (loop != LOOP_VINFO_LOOP (loop_vinfo))
876 if (dump_enabled_p ())
877 dump_printf_loc (MSG_NOTE, vect_location,
878 "Detected vectorizable nested cycle.\n");
880 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_nested_cycle;
881 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
882 vect_nested_cycle;
884 else
886 if (dump_enabled_p ())
887 dump_printf_loc (MSG_NOTE, vect_location,
888 "Detected reduction.\n");
890 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def;
891 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
892 vect_reduction_def;
893 /* Store the reduction cycles for possible vectorization in
894 loop-aware SLP if it was not detected as reduction
895 chain. */
896 if (! GROUP_FIRST_ELEMENT (vinfo_for_stmt (reduc_stmt)))
897 LOOP_VINFO_REDUCTIONS (loop_vinfo).safe_push (reduc_stmt);
901 else
902 if (dump_enabled_p ())
903 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
904 "Unknown def-use cycle pattern.\n");
909 /* Function vect_analyze_scalar_cycles.
911 Examine the cross iteration def-use cycles of scalar variables, by
912 analyzing the loop-header PHIs of scalar variables. Classify each
913 cycle as one of the following: invariant, induction, reduction, unknown.
914 We do that for the loop represented by LOOP_VINFO, and also for its
915 inner-loop, if it exists.
916 Examples for scalar cycles:
918 Example1: reduction:
920 loop1:
921 for (i=0; i<N; i++)
922 sum += a[i];
924 Example2: induction:
926 loop2:
927 for (i=0; i<N; i++)
928 a[i] = i; */
930 static void
931 vect_analyze_scalar_cycles (loop_vec_info loop_vinfo)
933 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
935 vect_analyze_scalar_cycles_1 (loop_vinfo, loop);
937 /* When vectorizing an outer-loop, the inner-loop is executed sequentially.
938 Reductions in such inner-loop therefore have different properties than
939 the reductions in the nest that gets vectorized:
940 1. When vectorized, they are executed in the same order as in the original
941 scalar loop, so we can't change the order of computation when
942 vectorizing them.
943 2. FIXME: Inner-loop reductions can be used in the inner-loop, so the
944 current checks are too strict. */
946 if (loop->inner)
947 vect_analyze_scalar_cycles_1 (loop_vinfo, loop->inner);
950 /* Transfer group and reduction information from STMT to its pattern stmt. */
952 static void
953 vect_fixup_reduc_chain (gimple *stmt)
955 gimple *firstp = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt));
956 gimple *stmtp;
957 gcc_assert (!GROUP_FIRST_ELEMENT (vinfo_for_stmt (firstp))
958 && GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)));
959 GROUP_SIZE (vinfo_for_stmt (firstp)) = GROUP_SIZE (vinfo_for_stmt (stmt));
962 stmtp = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt));
963 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmtp)) = firstp;
964 stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmt));
965 if (stmt)
966 GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmtp))
967 = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt));
969 while (stmt);
970 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmtp)) = vect_reduction_def;
973 /* Fixup scalar cycles that now have their stmts detected as patterns. */
975 static void
976 vect_fixup_scalar_cycles_with_patterns (loop_vec_info loop_vinfo)
978 gimple *first;
979 unsigned i;
981 FOR_EACH_VEC_ELT (LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo), i, first)
982 if (STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (first)))
984 gimple *next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (first));
985 while (next)
987 if (! STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (next)))
988 break;
989 next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
991 /* If not all stmts in the chain are patterns, try to handle
992 the chain without patterns. */
993 if (! next)
995 vect_fixup_reduc_chain (first);
996 LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo)[i]
997 = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (first));
1002 /* Function vect_get_loop_niters.
1004 Determine how many iterations the loop executes and place the count
1005 in NUMBER_OF_ITERATIONS. Place the number of latch iterations
1006 in NUMBER_OF_ITERATIONSM1. Place the condition under which the
1007 niter information holds in ASSUMPTIONS.
1009 Return the loop exit condition. */
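/* Editorial illustration, not part of the original sources: for a loop
   such as

	for (i = 0; i < n; i++)
	  ...

   with n > 0, the latch is executed n - 1 times, so NUMBER_OF_ITERATIONSM1
   is n - 1 while NUMBER_OF_ITERATIONS (the number of header executions)
   is n; the "+ 1" adjustment happens near the end of this function.  */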
1012 static gcond *
1013 vect_get_loop_niters (struct loop *loop, tree *assumptions,
1014 tree *number_of_iterations, tree *number_of_iterationsm1)
1016 edge exit = single_exit (loop);
1017 struct tree_niter_desc niter_desc;
1018 tree niter_assumptions, niter, may_be_zero;
1019 gcond *cond = get_loop_exit_condition (loop);
1021 *assumptions = boolean_true_node;
1022 *number_of_iterationsm1 = chrec_dont_know;
1023 *number_of_iterations = chrec_dont_know;
1024 if (dump_enabled_p ())
1025 dump_printf_loc (MSG_NOTE, vect_location,
1026 "=== get_loop_niters ===\n");
1028 if (!exit)
1029 return cond;
1031 niter = chrec_dont_know;
1032 may_be_zero = NULL_TREE;
1033 niter_assumptions = boolean_true_node;
1034 if (!number_of_iterations_exit_assumptions (loop, exit, &niter_desc, NULL)
1035 || chrec_contains_undetermined (niter_desc.niter))
1036 return cond;
1038 niter_assumptions = niter_desc.assumptions;
1039 may_be_zero = niter_desc.may_be_zero;
1040 niter = niter_desc.niter;
1042 if (may_be_zero && integer_zerop (may_be_zero))
1043 may_be_zero = NULL_TREE;
1045 if (may_be_zero)
1047 if (COMPARISON_CLASS_P (may_be_zero))
1049 /* Try to combine may_be_zero with assumptions, this can simplify
1050 computation of niter expression. */
1051 if (niter_assumptions && !integer_nonzerop (niter_assumptions))
1052 niter_assumptions = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
1053 niter_assumptions,
1054 fold_build1 (TRUTH_NOT_EXPR,
1055 boolean_type_node,
1056 may_be_zero));
1057 else
1058 niter = fold_build3 (COND_EXPR, TREE_TYPE (niter), may_be_zero,
1059 build_int_cst (TREE_TYPE (niter), 0), niter);
1061 may_be_zero = NULL_TREE;
1063 else if (integer_nonzerop (may_be_zero))
1065 *number_of_iterationsm1 = build_int_cst (TREE_TYPE (niter), 0);
1066 *number_of_iterations = build_int_cst (TREE_TYPE (niter), 1);
1067 return cond;
1069 else
1070 return cond;
1073 *assumptions = niter_assumptions;
1074 *number_of_iterationsm1 = niter;
1076 /* We want the number of loop header executions which is the number
1077 of latch executions plus one.
1078 ??? For UINT_MAX latch executions this number overflows to zero
1079 for loops like do { n++; } while (n != 0); */
1080 if (niter && !chrec_contains_undetermined (niter))
1081 niter = fold_build2 (PLUS_EXPR, TREE_TYPE (niter), unshare_expr (niter),
1082 build_int_cst (TREE_TYPE (niter), 1));
1083 *number_of_iterations = niter;
1085 return cond;
1088 /* Function bb_in_loop_p
1090 Used as predicate for dfs order traversal of the loop bbs. */
1092 static bool
1093 bb_in_loop_p (const_basic_block bb, const void *data)
1095 const struct loop *const loop = (const struct loop *)data;
1096 if (flow_bb_inside_loop_p (loop, bb))
1097 return true;
1098 return false;
1102 /* Create and initialize a new loop_vec_info struct for LOOP_IN, as well as
1103 stmt_vec_info structs for all the stmts in LOOP_IN. */
1105 _loop_vec_info::_loop_vec_info (struct loop *loop_in)
1106 : vec_info (vec_info::loop, init_cost (loop_in)),
1107 loop (loop_in),
1108 bbs (XCNEWVEC (basic_block, loop->num_nodes)),
1109 num_itersm1 (NULL_TREE),
1110 num_iters (NULL_TREE),
1111 num_iters_unchanged (NULL_TREE),
1112 num_iters_assumptions (NULL_TREE),
1113 th (0),
1114 vectorization_factor (0),
1115 max_vectorization_factor (0),
1116 unaligned_dr (NULL),
1117 peeling_for_alignment (0),
1118 ptr_mask (0),
1119 slp_unrolling_factor (1),
1120 single_scalar_iteration_cost (0),
1121 vectorizable (false),
1122 peeling_for_gaps (false),
1123 peeling_for_niter (false),
1124 operands_swapped (false),
1125 no_data_dependencies (false),
1126 has_mask_store (false),
1127 scalar_loop (NULL),
1128 orig_loop_info (NULL)
1130 /* Create/Update stmt_info for all stmts in the loop. */
1131 basic_block *body = get_loop_body (loop);
1132 for (unsigned int i = 0; i < loop->num_nodes; i++)
1134 basic_block bb = body[i];
1135 gimple_stmt_iterator si;
1137 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
1139 gimple *phi = gsi_stmt (si);
1140 gimple_set_uid (phi, 0);
1141 set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, this));
1144 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1146 gimple *stmt = gsi_stmt (si);
1147 gimple_set_uid (stmt, 0);
1148 set_vinfo_for_stmt (stmt, new_stmt_vec_info (stmt, this));
1151 free (body);
1153 /* CHECKME: We want to visit all BBs before their successors (except for
1154 latch blocks, for which this assertion wouldn't hold). In the simple
1155 case of the loop forms we allow, a dfs order of the BBs would be the same
1156 as reversed postorder traversal, so we are safe. */
1158 unsigned int nbbs = dfs_enumerate_from (loop->header, 0, bb_in_loop_p,
1159 bbs, loop->num_nodes, loop);
1160 gcc_assert (nbbs == loop->num_nodes);
1164 /* Free all memory used by the _loop_vec_info, as well as all the
1165 stmt_vec_info structs of all the stmts in the loop. */
1167 _loop_vec_info::~_loop_vec_info ()
1169 int nbbs;
1170 gimple_stmt_iterator si;
1171 int j;
1173 nbbs = loop->num_nodes;
1174 for (j = 0; j < nbbs; j++)
1176 basic_block bb = bbs[j];
1177 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
1178 free_stmt_vec_info (gsi_stmt (si));
1180 for (si = gsi_start_bb (bb); !gsi_end_p (si); )
1182 gimple *stmt = gsi_stmt (si);
1184 /* We may have broken canonical form by moving a constant
1185 into RHS1 of a commutative op. Fix such occurrences. */
1186 if (operands_swapped && is_gimple_assign (stmt))
1188 enum tree_code code = gimple_assign_rhs_code (stmt);
1190 if ((code == PLUS_EXPR
1191 || code == POINTER_PLUS_EXPR
1192 || code == MULT_EXPR)
1193 && CONSTANT_CLASS_P (gimple_assign_rhs1 (stmt)))
1194 swap_ssa_operands (stmt,
1195 gimple_assign_rhs1_ptr (stmt),
1196 gimple_assign_rhs2_ptr (stmt));
1197 else if (code == COND_EXPR
1198 && CONSTANT_CLASS_P (gimple_assign_rhs2 (stmt)))
1200 tree cond_expr = gimple_assign_rhs1 (stmt);
1201 enum tree_code cond_code = TREE_CODE (cond_expr);
1203 if (TREE_CODE_CLASS (cond_code) == tcc_comparison)
1205 bool honor_nans = HONOR_NANS (TREE_OPERAND (cond_expr,
1206 0));
1207 cond_code = invert_tree_comparison (cond_code,
1208 honor_nans);
1209 if (cond_code != ERROR_MARK)
1211 TREE_SET_CODE (cond_expr, cond_code);
1212 swap_ssa_operands (stmt,
1213 gimple_assign_rhs2_ptr (stmt),
1214 gimple_assign_rhs3_ptr (stmt));
1220 /* Free stmt_vec_info. */
1221 free_stmt_vec_info (stmt);
1222 gsi_next (&si);
1226 free (bbs);
1228 loop->aux = NULL;
1232 /* Calculate the cost of one scalar iteration of the loop. */
1233 static void
1234 vect_compute_single_scalar_iteration_cost (loop_vec_info loop_vinfo)
1236 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1237 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1238 int nbbs = loop->num_nodes, factor, scalar_single_iter_cost = 0;
1239 int innerloop_iters, i;
1241 /* Count statements in the scalar loop. Use this as the scalar cost of a
1242 single iteration for now.
1244 TODO: Add outer loop support.
1246 TODO: Consider assigning different costs to different scalar
1247 statements. */
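/* Editorial illustration, with assumed weights: a body containing one
   load, one arithmetic statement and one store contributes one
   scalar_load, one scalar_stmt and one scalar_store per iteration, each
   scaled by FACTOR (50 for statements inside the inner loop, 1
   otherwise, per the FIXME below).  */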
1249 /* FORNOW. */
1250 innerloop_iters = 1;
1251 if (loop->inner)
1252 innerloop_iters = 50; /* FIXME */
1254 for (i = 0; i < nbbs; i++)
1256 gimple_stmt_iterator si;
1257 basic_block bb = bbs[i];
1259 if (bb->loop_father == loop->inner)
1260 factor = innerloop_iters;
1261 else
1262 factor = 1;
1264 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1266 gimple *stmt = gsi_stmt (si);
1267 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1269 if (!is_gimple_assign (stmt) && !is_gimple_call (stmt))
1270 continue;
1272 /* Skip stmts that are not vectorized inside the loop. */
1273 if (stmt_info
1274 && !STMT_VINFO_RELEVANT_P (stmt_info)
1275 && (!STMT_VINFO_LIVE_P (stmt_info)
1276 || !VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1277 && !STMT_VINFO_IN_PATTERN_P (stmt_info))
1278 continue;
1280 vect_cost_for_stmt kind;
1281 if (STMT_VINFO_DATA_REF (stmt_info))
1283 if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
1284 kind = scalar_load;
1285 else
1286 kind = scalar_store;
1288 else
1289 kind = scalar_stmt;
1291 scalar_single_iter_cost
1292 += record_stmt_cost (&LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1293 factor, kind, stmt_info, 0, vect_prologue);
1296 LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo)
1297 = scalar_single_iter_cost;
1301 /* Function vect_analyze_loop_form_1.
1303 Verify that certain CFG restrictions hold, including:
1304 - the loop has a pre-header
1305 - the loop has a single entry and exit
1306 - the loop exit condition is simple enough
1307 - the number of iterations can be analyzed, i.e., a countable loop. The
1308 niter may be analyzable only under some assumptions. */
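/* Editorial illustration, not part of the original sources: a loop like

	for (i = 0; i < n; i++)
	  a[i] = b[i];

   is countable - its iteration count can be expressed in terms of n -
   whereas a pointer-chasing loop like

	while (p)
	  p = p->next;

   has no computable iteration count and fails the niter analysis used
   below.  */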
1310 bool
1311 vect_analyze_loop_form_1 (struct loop *loop, gcond **loop_cond,
1312 tree *assumptions, tree *number_of_iterationsm1,
1313 tree *number_of_iterations, gcond **inner_loop_cond)
1315 if (dump_enabled_p ())
1316 dump_printf_loc (MSG_NOTE, vect_location,
1317 "=== vect_analyze_loop_form ===\n");
1319 /* Different restrictions apply when we are considering an inner-most loop,
1320 vs. an outer (nested) loop.
1321 (FORNOW. May want to relax some of these restrictions in the future). */
1323 if (!loop->inner)
1325 /* Inner-most loop. We currently require that the number of BBs is
1326 exactly 2 (the header and latch). Vectorizable inner-most loops
1327 look like this:
1329 (pre-header)
1331 header <--------+
1332 | | |
1333 | +--> latch --+
1335 (exit-bb) */
1337 if (loop->num_nodes != 2)
1339 if (dump_enabled_p ())
1340 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1341 "not vectorized: control flow in loop.\n");
1342 return false;
1345 if (empty_block_p (loop->header))
1347 if (dump_enabled_p ())
1348 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1349 "not vectorized: empty loop.\n");
1350 return false;
1353 else
1355 struct loop *innerloop = loop->inner;
1356 edge entryedge;
1358 /* Nested loop. We currently require that the loop is doubly-nested,
1359 contains a single inner loop, and the number of BBs is exactly 5.
1360 Vectorizable outer-loops look like this:
1362 (pre-header)
1364 header <---+
1366 inner-loop |
1368 tail ------+
1370 (exit-bb)
1372 The inner-loop has the properties expected of inner-most loops
1373 as described above. */
1375 if ((loop->inner)->inner || (loop->inner)->next)
1377 if (dump_enabled_p ())
1378 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1379 "not vectorized: multiple nested loops.\n");
1380 return false;
1383 if (loop->num_nodes != 5)
1385 if (dump_enabled_p ())
1386 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1387 "not vectorized: control flow in loop.\n");
1388 return false;
1391 entryedge = loop_preheader_edge (innerloop);
1392 if (entryedge->src != loop->header
1393 || !single_exit (innerloop)
1394 || single_exit (innerloop)->dest != EDGE_PRED (loop->latch, 0)->src)
1396 if (dump_enabled_p ())
1397 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1398 "not vectorized: unsupported outerloop form.\n");
1399 return false;
1402 /* Analyze the inner-loop. */
1403 tree inner_niterm1, inner_niter, inner_assumptions;
1404 if (! vect_analyze_loop_form_1 (loop->inner, inner_loop_cond,
1405 &inner_assumptions, &inner_niterm1,
1406 &inner_niter, NULL)
1407 /* Don't support analyzing niter under assumptions for inner
1408 loop. */
1409 || !integer_onep (inner_assumptions))
1411 if (dump_enabled_p ())
1412 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1413 "not vectorized: Bad inner loop.\n");
1414 return false;
1417 if (!expr_invariant_in_loop_p (loop, inner_niter))
1419 if (dump_enabled_p ())
1420 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1421 "not vectorized: inner-loop count not"
1422 " invariant.\n");
1423 return false;
1426 if (dump_enabled_p ())
1427 dump_printf_loc (MSG_NOTE, vect_location,
1428 "Considering outer-loop vectorization.\n");
1431 if (!single_exit (loop)
1432 || EDGE_COUNT (loop->header->preds) != 2)
1434 if (dump_enabled_p ())
1436 if (!single_exit (loop))
1437 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1438 "not vectorized: multiple exits.\n");
1439 else if (EDGE_COUNT (loop->header->preds) != 2)
1440 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1441 "not vectorized: too many incoming edges.\n");
1443 return false;
1446 /* We assume that the loop exit condition is at the end of the loop, i.e.,
1447 that the loop is represented as a do-while (with a proper if-guard
1448 before the loop if needed), where the loop header contains all the
1449 executable statements, and the latch is empty. */
1450 if (!empty_block_p (loop->latch)
1451 || !gimple_seq_empty_p (phi_nodes (loop->latch)))
1453 if (dump_enabled_p ())
1454 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1455 "not vectorized: latch block not empty.\n");
1456 return false;
1459 /* Make sure the exit is not abnormal. */
1460 edge e = single_exit (loop);
1461 if (e->flags & EDGE_ABNORMAL)
1463 if (dump_enabled_p ())
1464 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1465 "not vectorized: abnormal loop exit edge.\n");
1466 return false;
1469 *loop_cond = vect_get_loop_niters (loop, assumptions, number_of_iterations,
1470 number_of_iterationsm1);
1471 if (!*loop_cond)
1473 if (dump_enabled_p ())
1474 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1475 "not vectorized: complicated exit condition.\n");
1476 return false;
1479 if (integer_zerop (*assumptions)
1480 || !*number_of_iterations
1481 || chrec_contains_undetermined (*number_of_iterations))
1483 if (dump_enabled_p ())
1484 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1485 "not vectorized: number of iterations cannot be "
1486 "computed.\n");
1487 return false;
1490 if (integer_zerop (*number_of_iterations))
1492 if (dump_enabled_p ())
1493 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1494 "not vectorized: number of iterations = 0.\n");
1495 return false;
1498 return true;
1501 /* Analyze LOOP form and return a loop_vec_info if it is of suitable form. */
1503 loop_vec_info
1504 vect_analyze_loop_form (struct loop *loop)
1506 tree assumptions, number_of_iterations, number_of_iterationsm1;
1507 gcond *loop_cond, *inner_loop_cond = NULL;
1509 if (! vect_analyze_loop_form_1 (loop, &loop_cond,
1510 &assumptions, &number_of_iterationsm1,
1511 &number_of_iterations, &inner_loop_cond))
1512 return NULL;
1514 loop_vec_info loop_vinfo = new _loop_vec_info (loop);
1515 LOOP_VINFO_NITERSM1 (loop_vinfo) = number_of_iterationsm1;
1516 LOOP_VINFO_NITERS (loop_vinfo) = number_of_iterations;
1517 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = number_of_iterations;
1518 if (!integer_onep (assumptions))
1520 /* We consider vectorizing this loop by versioning it under
1521 some assumptions. In order to do this, we need to clear
1522 existing information computed by scev and niter analyzer. */
1523 scev_reset_htab ();
1524 free_numbers_of_iterations_estimates (loop);
1525 /* Also set flag for this loop so that following scev and niter
1526 analysis are done under the assumptions. */
1527 loop_constraint_set (loop, LOOP_C_FINITE);
1528 /* Also record the assumptions for versioning. */
1529 LOOP_VINFO_NITERS_ASSUMPTIONS (loop_vinfo) = assumptions;
1532 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1534 if (dump_enabled_p ())
1536 dump_printf_loc (MSG_NOTE, vect_location,
1537 "Symbolic number of iterations is ");
1538 dump_generic_expr (MSG_NOTE, TDF_DETAILS, number_of_iterations);
1539 dump_printf (MSG_NOTE, "\n");
1543 STMT_VINFO_TYPE (vinfo_for_stmt (loop_cond)) = loop_exit_ctrl_vec_info_type;
1544 if (inner_loop_cond)
1545 STMT_VINFO_TYPE (vinfo_for_stmt (inner_loop_cond))
1546 = loop_exit_ctrl_vec_info_type;
1548 gcc_assert (!loop->aux);
1549 loop->aux = loop_vinfo;
1550 return loop_vinfo;
1555 /* Scan the loop stmts and, depending on whether there are any (non-)SLP
1556 statements, update the vectorization factor. */
1558 static void
1559 vect_update_vf_for_slp (loop_vec_info loop_vinfo)
1561 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1562 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1563 int nbbs = loop->num_nodes;
1564 unsigned int vectorization_factor;
1565 int i;
1567 if (dump_enabled_p ())
1568 dump_printf_loc (MSG_NOTE, vect_location,
1569 "=== vect_update_vf_for_slp ===\n");
1571 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1572 gcc_assert (vectorization_factor != 0);
1574 /* If all the stmts in the loop can be SLPed, we perform only SLP, and
1575 vectorization factor of the loop is the unrolling factor required by
1576 the SLP instances. If that unrolling factor is 1, we say that we
1577 perform pure SLP on the loop - cross-iteration parallelism is not
1578 exploited. */
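/* Editorial worked example, with assumed numbers: if the vectorization
   factor computed earlier is 4 and the SLP instances require an
   unrolling factor of 2, the mixed SLP/non-SLP case below keeps
   VF = lcm (4, 2) = 4; an SLP unrolling factor of 8 would instead give
   VF = lcm (4, 8) = 8.  */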
1579 bool only_slp_in_loop = true;
1580 for (i = 0; i < nbbs; i++)
1582 basic_block bb = bbs[i];
1583 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1584 gsi_next (&si))
1586 gimple *stmt = gsi_stmt (si);
1587 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1588 if (STMT_VINFO_IN_PATTERN_P (stmt_info)
1589 && STMT_VINFO_RELATED_STMT (stmt_info))
1591 stmt = STMT_VINFO_RELATED_STMT (stmt_info);
1592 stmt_info = vinfo_for_stmt (stmt);
1594 if ((STMT_VINFO_RELEVANT_P (stmt_info)
1595 || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1596 && !PURE_SLP_STMT (stmt_info))
1597 /* STMT needs both SLP and loop-based vectorization. */
1598 only_slp_in_loop = false;
1602 if (only_slp_in_loop)
1604 dump_printf_loc (MSG_NOTE, vect_location,
1605 "Loop contains only SLP stmts\n");
1606 vectorization_factor = LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo);
1608 else
1610 dump_printf_loc (MSG_NOTE, vect_location,
1611 "Loop contains SLP and non-SLP stmts\n");
1612 vectorization_factor
1613 = least_common_multiple (vectorization_factor,
1614 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo));
1617 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
1618 if (dump_enabled_p ())
1619 dump_printf_loc (MSG_NOTE, vect_location,
1620 "Updating vectorization factor to %d\n",
1621 vectorization_factor);
1624 /* Function vect_analyze_loop_operations.
1626 Scan the loop stmts and make sure they are all vectorizable. */
1628 static bool
1629 vect_analyze_loop_operations (loop_vec_info loop_vinfo)
1631 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1632 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1633 int nbbs = loop->num_nodes;
1634 int i;
1635 stmt_vec_info stmt_info;
1636 bool need_to_vectorize = false;
1637 bool ok;
1639 if (dump_enabled_p ())
1640 dump_printf_loc (MSG_NOTE, vect_location,
1641 "=== vect_analyze_loop_operations ===\n");
1643 for (i = 0; i < nbbs; i++)
1645 basic_block bb = bbs[i];
1647 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
1648 gsi_next (&si))
1650 gphi *phi = si.phi ();
1651 ok = true;
1653 stmt_info = vinfo_for_stmt (phi);
1654 if (dump_enabled_p ())
1656 dump_printf_loc (MSG_NOTE, vect_location, "examining phi: ");
1657 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
1659 if (virtual_operand_p (gimple_phi_result (phi)))
1660 continue;
1662 /* Inner-loop loop-closed exit phi in outer-loop vectorization
1663 (i.e., a phi in the tail of the outer-loop). */
1664 if (! is_loop_header_bb_p (bb))
1666 /* FORNOW: we currently don't support the case that these phis
1667 are not used in the outerloop (unless it is double reduction,
1668 i.e., this phi is vect_reduction_def), because this case
1669 requires us to actually do something here. */
1670 if (STMT_VINFO_LIVE_P (stmt_info)
1671 && STMT_VINFO_DEF_TYPE (stmt_info)
1672 != vect_double_reduction_def)
1674 if (dump_enabled_p ())
1675 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1676 "Unsupported loop-closed phi in "
1677 "outer-loop.\n");
1678 return false;
1681 /* If PHI is used in the outer loop, we check that its operand
1682 is defined in the inner loop. */
1683 if (STMT_VINFO_RELEVANT_P (stmt_info))
1685 tree phi_op;
1686 gimple *op_def_stmt;
1688 if (gimple_phi_num_args (phi) != 1)
1689 return false;
1691 phi_op = PHI_ARG_DEF (phi, 0);
1692 if (TREE_CODE (phi_op) != SSA_NAME)
1693 return false;
1695 op_def_stmt = SSA_NAME_DEF_STMT (phi_op);
1696 if (gimple_nop_p (op_def_stmt)
1697 || !flow_bb_inside_loop_p (loop, gimple_bb (op_def_stmt))
1698 || !vinfo_for_stmt (op_def_stmt))
1699 return false;
1701 if (STMT_VINFO_RELEVANT (vinfo_for_stmt (op_def_stmt))
1702 != vect_used_in_outer
1703 && STMT_VINFO_RELEVANT (vinfo_for_stmt (op_def_stmt))
1704 != vect_used_in_outer_by_reduction)
1705 return false;
1708 continue;
1711 gcc_assert (stmt_info);
1713 if ((STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope
1714 || STMT_VINFO_LIVE_P (stmt_info))
1715 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
1717 /* A scalar-dependence cycle that we don't support. */
1718 if (dump_enabled_p ())
1719 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1720 "not vectorized: scalar dependence cycle.\n");
1721 return false;
1724 if (STMT_VINFO_RELEVANT_P (stmt_info))
1726 need_to_vectorize = true;
1727 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
1728 && ! PURE_SLP_STMT (stmt_info))
1729 ok = vectorizable_induction (phi, NULL, NULL, NULL);
1730 else if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
1731 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
1732 && ! PURE_SLP_STMT (stmt_info))
1733 ok = vectorizable_reduction (phi, NULL, NULL, NULL, NULL);
1736 if (ok && STMT_VINFO_LIVE_P (stmt_info))
1737 ok = vectorizable_live_operation (phi, NULL, NULL, -1, NULL);
1739 if (!ok)
1741 if (dump_enabled_p ())
1743 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1744 "not vectorized: relevant phi not "
1745 "supported: ");
1746 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, phi, 0);
1748 return false;
1752 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1753 gsi_next (&si))
1755 gimple *stmt = gsi_stmt (si);
1756 if (!gimple_clobber_p (stmt)
1757 && !vect_analyze_stmt (stmt, &need_to_vectorize, NULL, NULL))
1758 return false;
1760 } /* bbs */
1762 /* All operations in the loop are either irrelevant (deal with loop
1763 control, or dead), or only used outside the loop and can be moved
1764 out of the loop (e.g. invariants, inductions). The loop can be
1765 optimized away by scalar optimizations. We're better off not
1766 touching this loop. */
1767 if (!need_to_vectorize)
1769 if (dump_enabled_p ())
1770 dump_printf_loc (MSG_NOTE, vect_location,
1771 "All the computation can be taken out of the loop.\n");
1772 if (dump_enabled_p ())
1773 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1774 "not vectorized: redundant loop. no profit to "
1775 "vectorize.\n");
1776 return false;
1779 return true;
1783 /* Function vect_analyze_loop_2.
1785 Apply a set of analyses on LOOP, and create a loop_vec_info struct
1786 for it. The different analyses will record information in the
1787 loop_vec_info struct. */
1788 static bool
1789 vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal)
1791 bool ok;
1792 int max_vf = MAX_VECTORIZATION_FACTOR;
1793 int min_vf = 2;
1794 unsigned int n_stmts = 0;
1796 /* The first group of checks is independent of the vector size. */
1797 fatal = true;
1799 /* Find all data references in the loop (which correspond to vdefs/vuses)
1800 and analyze their evolution in the loop. */
1802 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1804 loop_p loop = LOOP_VINFO_LOOP (loop_vinfo);
1805 if (!find_loop_nest (loop, &LOOP_VINFO_LOOP_NEST (loop_vinfo)))
1807 if (dump_enabled_p ())
1808 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1809 "not vectorized: loop nest containing two "
1810 "or more consecutive inner loops cannot be "
1811 "vectorized\n");
1812 return false;
1815 for (unsigned i = 0; i < loop->num_nodes; i++)
1816 for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
1817 !gsi_end_p (gsi); gsi_next (&gsi))
1819 gimple *stmt = gsi_stmt (gsi);
1820 if (is_gimple_debug (stmt))
1821 continue;
1822 ++n_stmts;
1823 if (!find_data_references_in_stmt (loop, stmt,
1824 &LOOP_VINFO_DATAREFS (loop_vinfo)))
1826 if (is_gimple_call (stmt) && loop->safelen)
1828 tree fndecl = gimple_call_fndecl (stmt), op;
1829 if (fndecl != NULL_TREE)
1831 cgraph_node *node = cgraph_node::get (fndecl);
1832 if (node != NULL && node->simd_clones != NULL)
1834 unsigned int j, n = gimple_call_num_args (stmt);
1835 for (j = 0; j < n; j++)
1837 op = gimple_call_arg (stmt, j);
1838 if (DECL_P (op)
1839 || (REFERENCE_CLASS_P (op)
1840 && get_base_address (op)))
1841 break;
1843 op = gimple_call_lhs (stmt);
1844 /* Ignore #pragma omp declare simd functions
1845 if they don't have data references in the
1846 call stmt itself. */
1847 if (j == n
1848 && !(op
1849 && (DECL_P (op)
1850 || (REFERENCE_CLASS_P (op)
1851 && get_base_address (op)))))
1852 continue;
1856 if (dump_enabled_p ())
1857 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1858 "not vectorized: loop contains function "
1859 "calls or data references that cannot "
1860 "be analyzed\n");
1861 return false;
1865 /* Analyze the data references and also adjust the minimal
1866 vectorization factor according to the loads and stores. */
1868 ok = vect_analyze_data_refs (loop_vinfo, &min_vf);
1869 if (!ok)
1871 if (dump_enabled_p ())
1872 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1873 "bad data references.\n");
1874 return false;
1877 /* Classify all cross-iteration scalar data-flow cycles.
1878 Cross-iteration cycles caused by virtual phis are analyzed separately. */
1879 vect_analyze_scalar_cycles (loop_vinfo);
1881 vect_pattern_recog (loop_vinfo);
1883 vect_fixup_scalar_cycles_with_patterns (loop_vinfo);
1885 /* Analyze the access patterns of the data-refs in the loop (consecutive,
1886 complex, etc.). FORNOW: Only handle consecutive access pattern. */
1888 ok = vect_analyze_data_ref_accesses (loop_vinfo);
1889 if (!ok)
1891 if (dump_enabled_p ())
1892 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1893 "bad data access.\n");
1894 return false;
1897 /* Data-flow analysis to detect stmts that do not need to be vectorized. */
1899 ok = vect_mark_stmts_to_be_vectorized (loop_vinfo);
1900 if (!ok)
1902 if (dump_enabled_p ())
1903 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1904 "unexpected pattern.\n");
1905 return false;
1908 /* While the rest of the analysis below depends on it in some way. */
1909 fatal = false;
1911 /* Analyze data dependences between the data-refs in the loop
1912 and adjust the maximum vectorization factor according to
1913 the dependences.
1914 FORNOW: fail at the first data dependence that we encounter. */
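/* Editorial illustration, not part of the original sources: in

	for (i = 0; i < n; i++)
	  a[i + 3] = a[i] * 2;

   the value stored in iteration i is read again in iteration i + 3, so
   at most 3 consecutive iterations can safely execute in lock-step and
   the maximum vectorization factor is capped accordingly.  */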
1916 ok = vect_analyze_data_ref_dependences (loop_vinfo, &max_vf);
1917 if (!ok
1918 || max_vf < min_vf)
1920 if (dump_enabled_p ())
1921 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1922 "bad data dependence.\n");
1923 return false;
1925 LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo) = max_vf;
1927 ok = vect_determine_vectorization_factor (loop_vinfo);
1928 if (!ok)
1930 if (dump_enabled_p ())
1931 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1932 "can't determine vectorization factor.\n");
1933 return false;
1935 if (max_vf < LOOP_VINFO_VECT_FACTOR (loop_vinfo))
1937 if (dump_enabled_p ())
1938 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1939 "bad data dependence.\n");
1940 return false;
1943 /* Compute the scalar iteration cost. */
1944 vect_compute_single_scalar_iteration_cost (loop_vinfo);
1946 int saved_vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1947 HOST_WIDE_INT estimated_niter;
1948 unsigned th;
1949 int min_scalar_loop_bound;
1951 /* Check the SLP opportunities in the loop, analyze and build SLP trees. */
1952 ok = vect_analyze_slp (loop_vinfo, n_stmts);
1953 if (!ok)
1954 return false;
1956 /* If there are any SLP instances mark them as pure_slp. */
1957 bool slp = vect_make_slp_decision (loop_vinfo);
1958 if (slp)
1960 /* Find stmts that need to be both vectorized and SLPed. */
1961 vect_detect_hybrid_slp (loop_vinfo);
1963 /* Update the vectorization factor based on the SLP decision. */
1964 vect_update_vf_for_slp (loop_vinfo);
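/* The SLP instances chosen above imply their own unrolling factor, which
may differ from the vectorization factor computed earlier, so the factor
is re-derived here before the iteration-count and cost checks below. */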
1967 /* This is the point where we can re-start analysis with SLP forced off. */
1968 start_over:
1970 /* Now the vectorization factor is final. */
1971 unsigned vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1972 gcc_assert (vectorization_factor != 0);
1974 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && dump_enabled_p ())
1975 dump_printf_loc (MSG_NOTE, vect_location,
1976 "vectorization_factor = %d, niters = "
1977 HOST_WIDE_INT_PRINT_DEC "\n", vectorization_factor,
1978 LOOP_VINFO_INT_NITERS (loop_vinfo));
1980 HOST_WIDE_INT max_niter
1981 = likely_max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
1982 if ((LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1983 && (LOOP_VINFO_INT_NITERS (loop_vinfo) < vectorization_factor))
1984 || (max_niter != -1
1985 && (unsigned HOST_WIDE_INT) max_niter < vectorization_factor))
1987 if (dump_enabled_p ())
1988 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1989 "not vectorized: iteration count smaller than "
1990 "vectorization factor.\n");
1991 return false;
1994 /* Analyze the alignment of the data-refs in the loop.
1995 Fail if a data reference is found that cannot be vectorized. */
1997 ok = vect_analyze_data_refs_alignment (loop_vinfo);
1998 if (!ok)
2000 if (dump_enabled_p ())
2001 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2002 "bad data alignment.\n");
2003 return false;
2006 /* Prune the list of ddrs to be tested at run-time by versioning for alias.
2007 It is important to call pruning after vect_analyze_data_ref_accesses,
2008 since we use grouping information gathered by interleaving analysis. */
2009 ok = vect_prune_runtime_alias_test_list (loop_vinfo);
2010 if (!ok)
2011 return false;
2013 /* Do not invoke vect_enhance_data_refs_alignment for epilogue
2014 vectorization. */
2015 if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2017 /* This pass will decide on using loop versioning and/or loop peeling in
2018 order to enhance the alignment of data references in the loop. */
2019 ok = vect_enhance_data_refs_alignment (loop_vinfo);
2020 if (!ok)
2022 if (dump_enabled_p ())
2023 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2024 "bad data alignment.\n");
2025 return false;
2029 if (slp)
2031 /* Analyze operations in the SLP instances. Note this may
2032 remove unsupported SLP instances which makes the above
2033 SLP kind detection invalid. */
2034 unsigned old_size = LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length ();
2035 vect_slp_analyze_operations (loop_vinfo);
2036 if (LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length () != old_size)
2037 goto again;
2040 /* Scan all the remaining operations in the loop that are not subject
2041 to SLP and make sure they are vectorizable. */
2042 ok = vect_analyze_loop_operations (loop_vinfo);
2043 if (!ok)
2045 if (dump_enabled_p ())
2046 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2047 "bad operation or unsupported loop bound.\n");
2048 return false;
2051 /* If epilog loop is required because of data accesses with gaps,
2052 one additional iteration needs to be peeled. Check if there are
2053 enough iterations for vectorization. */
2054 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2055 && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
2057 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2058 tree scalar_niters = LOOP_VINFO_NITERSM1 (loop_vinfo);
2060 if (wi::to_widest (scalar_niters) < vf)
2062 if (dump_enabled_p ())
2063 dump_printf_loc (MSG_NOTE, vect_location,
2064 "loop has no enough iterations to support"
2065 " peeling for gaps.\n");
2066 return false;
2070 /* Analyze cost. Decide if it is worthwhile to vectorize. */
2071 int min_profitable_estimate, min_profitable_iters;
2072 vect_estimate_min_profitable_iters (loop_vinfo, &min_profitable_iters,
2073 &min_profitable_estimate);
2075 if (min_profitable_iters < 0)
2077 if (dump_enabled_p ())
2078 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2079 "not vectorized: vectorization not profitable.\n");
2080 if (dump_enabled_p ())
2081 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2082 "not vectorized: vector version will never be "
2083 "profitable.\n");
2084 goto again;
2087 min_scalar_loop_bound = (PARAM_VALUE (PARAM_MIN_VECT_LOOP_BOUND)
2088 * vectorization_factor);
2090 /* Use the cost model only if it is more conservative than the user-specified
2091 threshold. */
2092 th = (unsigned) MAX (min_scalar_loop_bound, min_profitable_iters);
2094 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = th;
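/* Illustrative example (arbitrary numbers): with --param
min-vect-loop-bound=2 and a vectorization factor of 4,
min_scalar_loop_bound is 8; if the cost model computed
min_profitable_iters = 11, the recorded threshold TH becomes 11. */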
2096 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2097 && LOOP_VINFO_INT_NITERS (loop_vinfo) < th)
2099 if (dump_enabled_p ())
2100 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2101 "not vectorized: vectorization not profitable.\n");
2102 if (dump_enabled_p ())
2103 dump_printf_loc (MSG_NOTE, vect_location,
2104 "not vectorized: iteration count smaller than user "
2105 "specified loop bound parameter or minimum profitable "
2106 "iterations (whichever is more conservative).\n");
2107 goto again;
2110 estimated_niter
2111 = estimated_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
2112 if (estimated_niter == -1)
2113 estimated_niter = max_niter;
2114 if (estimated_niter != -1
2115 && ((unsigned HOST_WIDE_INT) estimated_niter
2116 < MAX (th, (unsigned) min_profitable_estimate)))
2118 if (dump_enabled_p ())
2119 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2120 "not vectorized: estimated iteration count too "
2121 "small.\n");
2122 if (dump_enabled_p ())
2123 dump_printf_loc (MSG_NOTE, vect_location,
2124 "not vectorized: estimated iteration count smaller "
2125 "than specified loop bound parameter or minimum "
2126 "profitable iterations (whichever is more "
2127 "conservative).\n");
2128 goto again;
2131 /* Decide whether we need to create an epilogue loop to handle
2132 remaining scalar iterations. */
2133 th = ((LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo)
2134 / LOOP_VINFO_VECT_FACTOR (loop_vinfo))
2135 * LOOP_VINFO_VECT_FACTOR (loop_vinfo));
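/* This rounds the cost-model threshold down to a multiple of the
vectorization factor, e.g. a threshold of 11 with a factor of 4 gives
TH = 8 (illustrative numbers). */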
2137 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2138 && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) > 0)
2140 if (ctz_hwi (LOOP_VINFO_INT_NITERS (loop_vinfo)
2141 - LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo))
2142 < exact_log2 (LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
2143 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
2145 else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
2146 || (tree_ctz (LOOP_VINFO_NITERS (loop_vinfo))
2147 < (unsigned)exact_log2 (LOOP_VINFO_VECT_FACTOR (loop_vinfo))
2148 /* In case of versioning, check if the maximum number of
2149 iterations is greater than th. If they are identical,
2150 the epilogue is unnecessary. */
2151 && (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
2152 || (unsigned HOST_WIDE_INT) max_niter > th)))
2153 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
2155 /* If an epilogue loop is required make sure we can create one. */
2156 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2157 || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo))
2159 if (dump_enabled_p ())
2160 dump_printf_loc (MSG_NOTE, vect_location, "epilog loop required\n");
2161 if (!vect_can_advance_ivs_p (loop_vinfo)
2162 || !slpeel_can_duplicate_loop_p (LOOP_VINFO_LOOP (loop_vinfo),
2163 single_exit (LOOP_VINFO_LOOP
2164 (loop_vinfo))))
2166 if (dump_enabled_p ())
2167 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2168 "not vectorized: can't create required "
2169 "epilog loop\n");
2170 goto again;
2174 /* During peeling, we need to check if the number of loop iterations is
2175 enough for both the peeled prolog loop and the vector loop. This check
2176 can be merged along with threshold check of loop versioning, so
2177 increase threshold for this case if necessary. */
2178 if (LOOP_REQUIRES_VERSIONING (loop_vinfo)
2179 && (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2180 || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)))
2182 unsigned niters_th;
2184 /* Niters for peeled prolog loop. */
2185 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
2187 struct data_reference *dr = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
2188 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (DR_STMT (dr)));
2190 niters_th = TYPE_VECTOR_SUBPARTS (vectype) - 1;
2192 else
2193 niters_th = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
2195 /* Niters for at least one iteration of the vectorized loop. */
2196 niters_th += LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2197 /* One additional iteration because of peeling for gaps. */
2198 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
2199 niters_th++;
2200 if (LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) < niters_th)
2201 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = niters_th;
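/* Illustrative example: with a vectorization factor of 4, unknown peeling
for alignment on a 4-element vector type (up to 3 prologue iterations)
and peeling for gaps, niters_th = 3 + 4 + 1 = 8, and the cost-model
threshold is raised to 8 if it was lower. */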
2204 gcc_assert (vectorization_factor
2205 == (unsigned)LOOP_VINFO_VECT_FACTOR (loop_vinfo));
2207 /* Ok to vectorize! */
2208 return true;
2210 again:
2211 /* Try again with SLP forced off, but if we didn't do any SLP there is
2212 no point in re-trying. */
2213 if (!slp)
2214 return false;
2216 /* If there are reduction chains re-trying will fail anyway. */
2217 if (! LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).is_empty ())
2218 return false;
2220 /* Likewise if the grouped loads or stores in the SLP cannot be handled
2221 via interleaving or lane instructions. */
2222 slp_instance instance;
2223 slp_tree node;
2224 unsigned i, j;
2225 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
2227 stmt_vec_info vinfo;
2228 vinfo = vinfo_for_stmt
2229 (SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (instance))[0]);
2230 if (! STMT_VINFO_GROUPED_ACCESS (vinfo))
2231 continue;
2232 vinfo = vinfo_for_stmt (STMT_VINFO_GROUP_FIRST_ELEMENT (vinfo));
2233 unsigned int size = STMT_VINFO_GROUP_SIZE (vinfo);
2234 tree vectype = STMT_VINFO_VECTYPE (vinfo);
2235 if (! vect_store_lanes_supported (vectype, size)
2236 && ! vect_grouped_store_supported (vectype, size))
2237 return false;
2238 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), j, node)
2240 vinfo = vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (node)[0]);
2241 vinfo = vinfo_for_stmt (STMT_VINFO_GROUP_FIRST_ELEMENT (vinfo));
2242 bool single_element_p = !STMT_VINFO_GROUP_NEXT_ELEMENT (vinfo);
2243 size = STMT_VINFO_GROUP_SIZE (vinfo);
2244 vectype = STMT_VINFO_VECTYPE (vinfo);
2245 if (! vect_load_lanes_supported (vectype, size)
2246 && ! vect_grouped_load_supported (vectype, single_element_p,
2247 size))
2248 return false;
2252 if (dump_enabled_p ())
2253 dump_printf_loc (MSG_NOTE, vect_location,
2254 "re-trying with SLP disabled\n");
2256 /* Roll back state appropriately. No SLP this time. */
2257 slp = false;
2258 /* Restore the vectorization factor as it was without SLP. */
2259 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = saved_vectorization_factor;
2260 /* Free the SLP instances. */
2261 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), j, instance)
2262 vect_free_slp_instance (instance);
2263 LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
2264 /* Reset SLP type to loop_vect on all stmts. */
2265 for (i = 0; i < LOOP_VINFO_LOOP (loop_vinfo)->num_nodes; ++i)
2267 basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
2268 for (gimple_stmt_iterator si = gsi_start_phis (bb);
2269 !gsi_end_p (si); gsi_next (&si))
2271 stmt_vec_info stmt_info = vinfo_for_stmt (gsi_stmt (si));
2272 STMT_SLP_TYPE (stmt_info) = loop_vect;
2274 for (gimple_stmt_iterator si = gsi_start_bb (bb);
2275 !gsi_end_p (si); gsi_next (&si))
2277 stmt_vec_info stmt_info = vinfo_for_stmt (gsi_stmt (si));
2278 STMT_SLP_TYPE (stmt_info) = loop_vect;
2279 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
2281 stmt_info = vinfo_for_stmt (STMT_VINFO_RELATED_STMT (stmt_info));
2282 STMT_SLP_TYPE (stmt_info) = loop_vect;
2283 for (gimple_stmt_iterator pi
2284 = gsi_start (STMT_VINFO_PATTERN_DEF_SEQ (stmt_info));
2285 !gsi_end_p (pi); gsi_next (&pi))
2287 gimple *pstmt = gsi_stmt (pi);
2288 STMT_SLP_TYPE (vinfo_for_stmt (pstmt)) = loop_vect;
2293 /* Free optimized alias test DDRS. */
2294 LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).release ();
2295 LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).release ();
2296 /* Reset target cost data. */
2297 destroy_cost_data (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo));
2298 LOOP_VINFO_TARGET_COST_DATA (loop_vinfo)
2299 = init_cost (LOOP_VINFO_LOOP (loop_vinfo));
2300 /* Reset assorted flags. */
2301 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
2302 LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = false;
2303 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = 0;
2305 goto start_over;
2308 /* Function vect_analyze_loop.
2310 Apply a set of analyses on LOOP, and create a loop_vec_info struct
2311 for it. The different analyses will record information in the
2312 loop_vec_info struct. If ORIG_LOOP_VINFO is not NULL, LOOP is the
2313 epilogue of an already-vectorized loop and must itself be vectorized. */
2314 loop_vec_info
2315 vect_analyze_loop (struct loop *loop, loop_vec_info orig_loop_vinfo)
2317 loop_vec_info loop_vinfo;
2318 unsigned int vector_sizes;
2320 /* Autodetect first vector size we try. */
2321 current_vector_size = 0;
2322 vector_sizes = targetm.vectorize.autovectorize_vector_sizes ();
2324 if (dump_enabled_p ())
2325 dump_printf_loc (MSG_NOTE, vect_location,
2326 "===== analyze_loop_nest =====\n");
2328 if (loop_outer (loop)
2329 && loop_vec_info_for_loop (loop_outer (loop))
2330 && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop))))
2332 if (dump_enabled_p ())
2333 dump_printf_loc (MSG_NOTE, vect_location,
2334 "outer-loop already vectorized.\n");
2335 return NULL;
2338 while (1)
2340 /* Check the CFG characteristics of the loop (nesting, entry/exit). */
2341 loop_vinfo = vect_analyze_loop_form (loop);
2342 if (!loop_vinfo)
2344 if (dump_enabled_p ())
2345 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2346 "bad loop form.\n");
2347 return NULL;
2350 bool fatal = false;
2352 if (orig_loop_vinfo)
2353 LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = orig_loop_vinfo;
2355 if (vect_analyze_loop_2 (loop_vinfo, fatal))
2357 LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1;
2359 return loop_vinfo;
2362 delete loop_vinfo;
2364 vector_sizes &= ~current_vector_size;
2365 if (fatal
2366 || vector_sizes == 0
2367 || current_vector_size == 0)
2368 return NULL;
2370 /* Try the next biggest vector size. */
2371 current_vector_size = 1 << floor_log2 (vector_sizes);
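/* For example (illustrative), if the target advertises vector sizes of
16, 8 and 4 bytes (mask 0x1c) and the analysis with 16 bytes just
failed, the mask becomes 0xc and the next attempt uses
current_vector_size = 8. */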
2372 if (dump_enabled_p ())
2373 dump_printf_loc (MSG_NOTE, vect_location,
2374 "***** Re-trying analysis with "
2375 "vector size %d\n", current_vector_size);
2380 /* Function reduction_fn_for_scalar_code
2382 Input:
2383 CODE - tree_code of a reduction operation.
2385 Output:
2386 REDUC_FN - the corresponding internal function to be used to reduce the
2387 vector of partial results into a single scalar result, or IFN_LAST
2388 if the operation is a supported reduction operation, but does not have
2389 such an internal function.
2391 Return FALSE if CODE currently cannot be vectorized as a reduction. */
2393 static bool
2394 reduction_fn_for_scalar_code (enum tree_code code, internal_fn *reduc_fn)
2396 switch (code)
2398 case MAX_EXPR:
2399 *reduc_fn = IFN_REDUC_MAX;
2400 return true;
2402 case MIN_EXPR:
2403 *reduc_fn = IFN_REDUC_MIN;
2404 return true;
2406 case PLUS_EXPR:
2407 *reduc_fn = IFN_REDUC_PLUS;
2408 return true;
2410 case MULT_EXPR:
2411 case MINUS_EXPR:
2412 case BIT_IOR_EXPR:
2413 case BIT_XOR_EXPR:
2414 case BIT_AND_EXPR:
2415 *reduc_fn = IFN_LAST;
2416 return true;
2418 default:
2419 return false;
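/* For example, reduction_fn_for_scalar_code (MAX_EXPR, &fn) sets fn to
IFN_REDUC_MAX and returns true, while a code such as RDIV_EXPR falls
through to the default case and is rejected. */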
2424 /* Error reporting helper for vect_is_simple_reduction below. GIMPLE statement
2425 STMT is printed with a message MSG. */
2427 static void
2428 report_vect_op (dump_flags_t msg_type, gimple *stmt, const char *msg)
2430 dump_printf_loc (msg_type, vect_location, "%s", msg);
2431 dump_gimple_stmt (msg_type, TDF_SLIM, stmt, 0);
2435 /* Detect SLP reduction of the form:
2437 #a1 = phi <a5, a0>
2438 a2 = operation (a1)
2439 a3 = operation (a2)
2440 a4 = operation (a3)
2441 a5 = operation (a4)
2443 #a = phi <a5>
2445 PHI is the reduction phi node (#a1 = phi <a5, a0> above)
2446 FIRST_STMT is the first reduction stmt in the chain
2447 (a2 = operation (a1)).
2449 Return TRUE if a reduction chain was detected. */
2451 static bool
2452 vect_is_slp_reduction (loop_vec_info loop_info, gimple *phi,
2453 gimple *first_stmt)
2455 struct loop *loop = (gimple_bb (phi))->loop_father;
2456 struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
2457 enum tree_code code;
2458 gimple *current_stmt = NULL, *loop_use_stmt = NULL, *first, *next_stmt;
2459 stmt_vec_info use_stmt_info, current_stmt_info;
2460 tree lhs;
2461 imm_use_iterator imm_iter;
2462 use_operand_p use_p;
2463 int nloop_uses, size = 0, n_out_of_loop_uses;
2464 bool found = false;
2466 if (loop != vect_loop)
2467 return false;
2469 lhs = PHI_RESULT (phi);
2470 code = gimple_assign_rhs_code (first_stmt);
2471 while (1)
2473 nloop_uses = 0;
2474 n_out_of_loop_uses = 0;
2475 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
2477 gimple *use_stmt = USE_STMT (use_p);
2478 if (is_gimple_debug (use_stmt))
2479 continue;
2481 /* Check if we got back to the reduction phi. */
2482 if (use_stmt == phi)
2484 loop_use_stmt = use_stmt;
2485 found = true;
2486 break;
2489 if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2491 loop_use_stmt = use_stmt;
2492 nloop_uses++;
2494 else
2495 n_out_of_loop_uses++;
2497 /* There can be either a single use in the loop or two uses in
2498 phi nodes. */
2499 if (nloop_uses > 1 || (n_out_of_loop_uses && nloop_uses))
2500 return false;
2503 if (found)
2504 break;
2506 /* We reached a statement with no loop uses. */
2507 if (nloop_uses == 0)
2508 return false;
2510 /* This is a loop exit phi, and we haven't reached the reduction phi. */
2511 if (gimple_code (loop_use_stmt) == GIMPLE_PHI)
2512 return false;
2514 if (!is_gimple_assign (loop_use_stmt)
2515 || code != gimple_assign_rhs_code (loop_use_stmt)
2516 || !flow_bb_inside_loop_p (loop, gimple_bb (loop_use_stmt)))
2517 return false;
2519 /* Insert USE_STMT into reduction chain. */
2520 use_stmt_info = vinfo_for_stmt (loop_use_stmt);
2521 if (current_stmt)
2523 current_stmt_info = vinfo_for_stmt (current_stmt);
2524 GROUP_NEXT_ELEMENT (current_stmt_info) = loop_use_stmt;
2525 GROUP_FIRST_ELEMENT (use_stmt_info)
2526 = GROUP_FIRST_ELEMENT (current_stmt_info);
2528 else
2529 GROUP_FIRST_ELEMENT (use_stmt_info) = loop_use_stmt;
2531 lhs = gimple_assign_lhs (loop_use_stmt);
2532 current_stmt = loop_use_stmt;
2533 size++;
2536 if (!found || loop_use_stmt != phi || size < 2)
2537 return false;
2539 /* Swap the operands, if needed, to make the reduction operand be the second
2540 operand. */
2541 lhs = PHI_RESULT (phi);
2542 next_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (current_stmt));
2543 while (next_stmt)
2545 if (gimple_assign_rhs2 (next_stmt) == lhs)
2547 tree op = gimple_assign_rhs1 (next_stmt);
2548 gimple *def_stmt = NULL;
2550 if (TREE_CODE (op) == SSA_NAME)
2551 def_stmt = SSA_NAME_DEF_STMT (op);
2553 /* Check that the other def is either defined in the loop
2554 ("vect_internal_def"), or it's an induction (defined by a
2555 loop-header phi-node). */
2556 if (def_stmt
2557 && gimple_bb (def_stmt)
2558 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2559 && (is_gimple_assign (def_stmt)
2560 || is_gimple_call (def_stmt)
2561 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2562 == vect_induction_def
2563 || (gimple_code (def_stmt) == GIMPLE_PHI
2564 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2565 == vect_internal_def
2566 && !is_loop_header_bb_p (gimple_bb (def_stmt)))))
2568 lhs = gimple_assign_lhs (next_stmt);
2569 next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
2570 continue;
2573 return false;
2575 else
2577 tree op = gimple_assign_rhs2 (next_stmt);
2578 gimple *def_stmt = NULL;
2580 if (TREE_CODE (op) == SSA_NAME)
2581 def_stmt = SSA_NAME_DEF_STMT (op);
2583 /* Check that the other def is either defined in the loop
2584 ("vect_internal_def"), or it's an induction (defined by a
2585 loop-header phi-node). */
2586 if (def_stmt
2587 && gimple_bb (def_stmt)
2588 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2589 && (is_gimple_assign (def_stmt)
2590 || is_gimple_call (def_stmt)
2591 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2592 == vect_induction_def
2593 || (gimple_code (def_stmt) == GIMPLE_PHI
2594 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2595 == vect_internal_def
2596 && !is_loop_header_bb_p (gimple_bb (def_stmt)))))
2598 if (dump_enabled_p ())
2600 dump_printf_loc (MSG_NOTE, vect_location, "swapping oprnds: ");
2601 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, next_stmt, 0);
2604 swap_ssa_operands (next_stmt,
2605 gimple_assign_rhs1_ptr (next_stmt),
2606 gimple_assign_rhs2_ptr (next_stmt));
2607 update_stmt (next_stmt);
2609 if (CONSTANT_CLASS_P (gimple_assign_rhs1 (next_stmt)))
2610 LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true;
2612 else
2613 return false;
2616 lhs = gimple_assign_lhs (next_stmt);
2617 next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
2620 /* Save the chain for further analysis in SLP detection. */
2621 first = GROUP_FIRST_ELEMENT (vinfo_for_stmt (current_stmt));
2622 LOOP_VINFO_REDUCTION_CHAINS (loop_info).safe_push (first);
2623 GROUP_SIZE (vinfo_for_stmt (first)) = size;
2625 return true;
2629 /* Function vect_is_simple_reduction
2631 (1) Detect a cross-iteration def-use cycle that represents a simple
2632 reduction computation. We look for the following pattern:
2634 loop_header:
2635 a1 = phi < a0, a2 >
2636 a3 = ...
2637 a2 = operation (a3, a1)
2639 or
2641 a3 = ...
2642 loop_header:
2643 a1 = phi < a0, a2 >
2644 a2 = operation (a3, a1)
2646 such that:
2647 1. operation is commutative and associative and it is safe to
2648 change the order of the computation
2649 2. no uses for a2 in the loop (a2 is used out of the loop)
2650 3. no uses of a1 in the loop besides the reduction operation
2651 4. no uses of a1 outside the loop.
2653 Conditions 1,4 are tested here.
2654 Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.
2656 (2) Detect a cross-iteration def-use cycle in nested loops, i.e.,
2657 nested cycles.
2659 (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double
2660 reductions:
2662 a1 = phi < a0, a2 >
2663 inner loop (def of a3)
2664 a2 = phi < a3 >
2666 (4) Detect condition expressions, i.e.:
2667 for (int i = 0; i < N; i++)
2668 if (a[i] < val)
2669 ret_val = a[i];
2673 static gimple *
2674 vect_is_simple_reduction (loop_vec_info loop_info, gimple *phi,
2675 bool *double_reduc,
2676 bool need_wrapping_integral_overflow,
2677 enum vect_reduction_type *v_reduc_type)
2679 struct loop *loop = (gimple_bb (phi))->loop_father;
2680 struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
2681 gimple *def_stmt, *def1 = NULL, *def2 = NULL, *phi_use_stmt = NULL;
2682 enum tree_code orig_code, code;
2683 tree op1, op2, op3 = NULL_TREE, op4 = NULL_TREE;
2684 tree type;
2685 int nloop_uses;
2686 tree name;
2687 imm_use_iterator imm_iter;
2688 use_operand_p use_p;
2689 bool phi_def;
2691 *double_reduc = false;
2692 *v_reduc_type = TREE_CODE_REDUCTION;
2694 tree phi_name = PHI_RESULT (phi);
2695 /* ??? If there are no uses of the PHI result the inner loop reduction
2696 won't be detected as possibly double-reduction by vectorizable_reduction
2697 because that tries to walk the PHI arg from the preheader edge which
2698 can be constant. See PR60382. */
2699 if (has_zero_uses (phi_name))
2700 return NULL;
2701 nloop_uses = 0;
2702 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, phi_name)
2704 gimple *use_stmt = USE_STMT (use_p);
2705 if (is_gimple_debug (use_stmt))
2706 continue;
2708 if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2710 if (dump_enabled_p ())
2711 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2712 "intermediate value used outside loop.\n");
2714 return NULL;
2717 nloop_uses++;
2718 if (nloop_uses > 1)
2720 if (dump_enabled_p ())
2721 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2722 "reduction value used in loop.\n");
2723 return NULL;
2726 phi_use_stmt = use_stmt;
2729 edge latch_e = loop_latch_edge (loop);
2730 tree loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
2731 if (TREE_CODE (loop_arg) != SSA_NAME)
2733 if (dump_enabled_p ())
2735 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2736 "reduction: not ssa_name: ");
2737 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, loop_arg);
2738 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
2740 return NULL;
2743 def_stmt = SSA_NAME_DEF_STMT (loop_arg);
2744 if (is_gimple_assign (def_stmt))
2746 name = gimple_assign_lhs (def_stmt);
2747 phi_def = false;
2749 else if (gimple_code (def_stmt) == GIMPLE_PHI)
2751 name = PHI_RESULT (def_stmt);
2752 phi_def = true;
2754 else
2756 if (dump_enabled_p ())
2758 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2759 "reduction: unhandled reduction operation: ");
2760 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, def_stmt, 0);
2762 return NULL;
2765 if (! flow_bb_inside_loop_p (loop, gimple_bb (def_stmt)))
2766 return NULL;
2768 nloop_uses = 0;
2769 auto_vec<gphi *, 3> lcphis;
2770 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name)
2772 gimple *use_stmt = USE_STMT (use_p);
2773 if (is_gimple_debug (use_stmt))
2774 continue;
2775 if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2776 nloop_uses++;
2777 else
2778 /* We can have more than one loop-closed PHI. */
2779 lcphis.safe_push (as_a <gphi *> (use_stmt));
2780 if (nloop_uses > 1)
2782 if (dump_enabled_p ())
2783 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2784 "reduction used in loop.\n");
2785 return NULL;
2789 /* If DEF_STMT is a phi node itself, we expect it to have a single argument
2790 defined in the inner loop. */
2791 if (phi_def)
2793 op1 = PHI_ARG_DEF (def_stmt, 0);
2795 if (gimple_phi_num_args (def_stmt) != 1
2796 || TREE_CODE (op1) != SSA_NAME)
2798 if (dump_enabled_p ())
2799 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2800 "unsupported phi node definition.\n");
2802 return NULL;
2805 def1 = SSA_NAME_DEF_STMT (op1);
2806 if (gimple_bb (def1)
2807 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2808 && loop->inner
2809 && flow_bb_inside_loop_p (loop->inner, gimple_bb (def1))
2810 && is_gimple_assign (def1)
2811 && flow_bb_inside_loop_p (loop->inner, gimple_bb (phi_use_stmt)))
2813 if (dump_enabled_p ())
2814 report_vect_op (MSG_NOTE, def_stmt,
2815 "detected double reduction: ");
2817 *double_reduc = true;
2818 return def_stmt;
2821 return NULL;
2824 /* If we are vectorizing an inner reduction, we execute it in the
2825 original order only when we are not dealing with a double
2826 reduction. */
2827 bool check_reduction = true;
2828 if (flow_loop_nested_p (vect_loop, loop))
2830 gphi *lcphi;
2831 unsigned i;
2832 check_reduction = false;
2833 FOR_EACH_VEC_ELT (lcphis, i, lcphi)
2834 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, gimple_phi_result (lcphi))
2836 gimple *use_stmt = USE_STMT (use_p);
2837 if (is_gimple_debug (use_stmt))
2838 continue;
2839 if (! flow_bb_inside_loop_p (vect_loop, gimple_bb (use_stmt)))
2840 check_reduction = true;
2844 bool nested_in_vect_loop = flow_loop_nested_p (vect_loop, loop);
2845 code = orig_code = gimple_assign_rhs_code (def_stmt);
2847 /* We can handle "res -= x[i]", which is non-associative, by simply
2848 rewriting this into "res += -x[i]". Avoid changing the gimple
2849 instruction for the first simple tests and only do this
2850 if we're allowed to change code at all. */
2851 if (code == MINUS_EXPR && gimple_assign_rhs2 (def_stmt) != phi_name)
2852 code = PLUS_EXPR;
2854 if (code == COND_EXPR)
2856 if (! nested_in_vect_loop)
2857 *v_reduc_type = COND_REDUCTION;
2859 op3 = gimple_assign_rhs1 (def_stmt);
2860 if (COMPARISON_CLASS_P (op3))
2862 op4 = TREE_OPERAND (op3, 1);
2863 op3 = TREE_OPERAND (op3, 0);
2865 if (op3 == phi_name || op4 == phi_name)
2867 if (dump_enabled_p ())
2868 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2869 "reduction: condition depends on previous"
2870 " iteration: ");
2871 return NULL;
2874 op1 = gimple_assign_rhs2 (def_stmt);
2875 op2 = gimple_assign_rhs3 (def_stmt);
2877 else if (!commutative_tree_code (code) || !associative_tree_code (code))
2879 if (dump_enabled_p ())
2880 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2881 "reduction: not commutative/associative: ");
2882 return NULL;
2884 else if (get_gimple_rhs_class (code) == GIMPLE_BINARY_RHS)
2886 op1 = gimple_assign_rhs1 (def_stmt);
2887 op2 = gimple_assign_rhs2 (def_stmt);
2889 else
2891 if (dump_enabled_p ())
2892 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2893 "reduction: not handled operation: ");
2894 return NULL;
2897 if (TREE_CODE (op1) != SSA_NAME && TREE_CODE (op2) != SSA_NAME)
2899 if (dump_enabled_p ())
2900 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2901 "reduction: both uses not ssa_names: ");
2903 return NULL;
2906 type = TREE_TYPE (gimple_assign_lhs (def_stmt));
2907 if ((TREE_CODE (op1) == SSA_NAME
2908 && !types_compatible_p (type,TREE_TYPE (op1)))
2909 || (TREE_CODE (op2) == SSA_NAME
2910 && !types_compatible_p (type, TREE_TYPE (op2)))
2911 || (op3 && TREE_CODE (op3) == SSA_NAME
2912 && !types_compatible_p (type, TREE_TYPE (op3)))
2913 || (op4 && TREE_CODE (op4) == SSA_NAME
2914 && !types_compatible_p (type, TREE_TYPE (op4))))
2916 if (dump_enabled_p ())
2918 dump_printf_loc (MSG_NOTE, vect_location,
2919 "reduction: multiple types: operation type: ");
2920 dump_generic_expr (MSG_NOTE, TDF_SLIM, type);
2921 dump_printf (MSG_NOTE, ", operands types: ");
2922 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2923 TREE_TYPE (op1));
2924 dump_printf (MSG_NOTE, ",");
2925 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2926 TREE_TYPE (op2));
2927 if (op3)
2929 dump_printf (MSG_NOTE, ",");
2930 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2931 TREE_TYPE (op3));
2934 if (op4)
2936 dump_printf (MSG_NOTE, ",");
2937 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2938 TREE_TYPE (op4));
2940 dump_printf (MSG_NOTE, "\n");
2943 return NULL;
2946 /* Check that it's ok to change the order of the computation.
2947 Generally, when vectorizing a reduction we change the order of the
2948 computation. This may change the behavior of the program in some
2949 cases, so we need to check that this is ok. One exception is when
2950 vectorizing an outer-loop: the inner-loop is executed sequentially,
2951 and therefore vectorizing reductions in the inner-loop during
2952 outer-loop vectorization is safe. */
2954 if (*v_reduc_type != COND_REDUCTION
2955 && check_reduction)
2957 /* CHECKME: check for !flag_finite_math_only too? */
2958 if (SCALAR_FLOAT_TYPE_P (type) && !flag_associative_math)
2960 /* Changing the order of operations changes the semantics. */
2961 if (dump_enabled_p ())
2962 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2963 "reduction: unsafe fp math optimization: ");
2964 return NULL;
2966 else if (INTEGRAL_TYPE_P (type))
2968 if (!operation_no_trapping_overflow (type, code))
2970 /* Changing the order of operations changes the semantics. */
2971 if (dump_enabled_p ())
2972 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2973 "reduction: unsafe int math optimization"
2974 " (overflow traps): ");
2975 return NULL;
2977 if (need_wrapping_integral_overflow
2978 && !TYPE_OVERFLOW_WRAPS (type)
2979 && operation_can_overflow (code))
2981 /* Changing the order of operations changes the semantics. */
2982 if (dump_enabled_p ())
2983 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2984 "reduction: unsafe int math optimization"
2985 " (overflow doesn't wrap): ");
2986 return NULL;
2989 else if (SAT_FIXED_POINT_TYPE_P (type))
2991 /* Changing the order of operations changes the semantics. */
2992 if (dump_enabled_p ())
2993 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2994 "reduction: unsafe fixed-point math optimization: ");
2995 return NULL;
2999 /* Reduction is safe. We're dealing with one of the following:
3000 1) integer arithmetic and no trapv
3001 2) floating point arithmetic, and special flags permit this optimization
3002 3) nested cycle (i.e., outer loop vectorization). */
3003 if (TREE_CODE (op1) == SSA_NAME)
3004 def1 = SSA_NAME_DEF_STMT (op1);
3006 if (TREE_CODE (op2) == SSA_NAME)
3007 def2 = SSA_NAME_DEF_STMT (op2);
3009 if (code != COND_EXPR
3010 && ((!def1 || gimple_nop_p (def1)) && (!def2 || gimple_nop_p (def2))))
3012 if (dump_enabled_p ())
3013 report_vect_op (MSG_NOTE, def_stmt, "reduction: no defs for operands: ");
3014 return NULL;
3017 /* Check that one def is the reduction def, defined by PHI,
3018 the other def is either defined in the loop ("vect_internal_def"),
3019 or it's an induction (defined by a loop-header phi-node). */
3021 if (def2 && def2 == phi
3022 && (code == COND_EXPR
3023 || !def1 || gimple_nop_p (def1)
3024 || !flow_bb_inside_loop_p (loop, gimple_bb (def1))
3025 || (def1 && flow_bb_inside_loop_p (loop, gimple_bb (def1))
3026 && (is_gimple_assign (def1)
3027 || is_gimple_call (def1)
3028 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1))
3029 == vect_induction_def
3030 || (gimple_code (def1) == GIMPLE_PHI
3031 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1))
3032 == vect_internal_def
3033 && !is_loop_header_bb_p (gimple_bb (def1)))))))
3035 if (dump_enabled_p ())
3036 report_vect_op (MSG_NOTE, def_stmt, "detected reduction: ");
3037 return def_stmt;
3040 if (def1 && def1 == phi
3041 && (code == COND_EXPR
3042 || !def2 || gimple_nop_p (def2)
3043 || !flow_bb_inside_loop_p (loop, gimple_bb (def2))
3044 || (def2 && flow_bb_inside_loop_p (loop, gimple_bb (def2))
3045 && (is_gimple_assign (def2)
3046 || is_gimple_call (def2)
3047 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2))
3048 == vect_induction_def
3049 || (gimple_code (def2) == GIMPLE_PHI
3050 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2))
3051 == vect_internal_def
3052 && !is_loop_header_bb_p (gimple_bb (def2)))))))
3054 if (! nested_in_vect_loop && orig_code != MINUS_EXPR)
3056 /* Check if we can swap operands (just for simplicity - so that
3057 the rest of the code can assume that the reduction variable
3058 is always the last (second) argument). */
3059 if (code == COND_EXPR)
3061 /* Swap cond_expr by inverting the condition. */
3062 tree cond_expr = gimple_assign_rhs1 (def_stmt);
3063 enum tree_code invert_code = ERROR_MARK;
3064 enum tree_code cond_code = TREE_CODE (cond_expr);
3066 if (TREE_CODE_CLASS (cond_code) == tcc_comparison)
3068 bool honor_nans = HONOR_NANS (TREE_OPERAND (cond_expr, 0));
3069 invert_code = invert_tree_comparison (cond_code, honor_nans);
3071 if (invert_code != ERROR_MARK)
3073 TREE_SET_CODE (cond_expr, invert_code);
3074 swap_ssa_operands (def_stmt,
3075 gimple_assign_rhs2_ptr (def_stmt),
3076 gimple_assign_rhs3_ptr (def_stmt));
3078 else
3080 if (dump_enabled_p ())
3081 report_vect_op (MSG_NOTE, def_stmt,
3082 "detected reduction: cannot swap operands "
3083 "for cond_expr");
3084 return NULL;
3087 else
3088 swap_ssa_operands (def_stmt, gimple_assign_rhs1_ptr (def_stmt),
3089 gimple_assign_rhs2_ptr (def_stmt));
3091 if (dump_enabled_p ())
3092 report_vect_op (MSG_NOTE, def_stmt,
3093 "detected reduction: need to swap operands: ");
3095 if (CONSTANT_CLASS_P (gimple_assign_rhs1 (def_stmt)))
3096 LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true;
3098 else
3100 if (dump_enabled_p ())
3101 report_vect_op (MSG_NOTE, def_stmt, "detected reduction: ");
3104 return def_stmt;
3107 /* Try to find SLP reduction chain. */
3108 if (! nested_in_vect_loop
3109 && code != COND_EXPR
3110 && orig_code != MINUS_EXPR
3111 && vect_is_slp_reduction (loop_info, phi, def_stmt))
3113 if (dump_enabled_p ())
3114 report_vect_op (MSG_NOTE, def_stmt,
3115 "reduction: detected reduction chain: ");
3117 return def_stmt;
3120 /* Dissolve any group half-built by vect_is_slp_reduction. */
3121 gimple *first = GROUP_FIRST_ELEMENT (vinfo_for_stmt (def_stmt));
3122 while (first)
3124 gimple *next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (first));
3125 GROUP_FIRST_ELEMENT (vinfo_for_stmt (first)) = NULL;
3126 GROUP_NEXT_ELEMENT (vinfo_for_stmt (first)) = NULL;
3127 first = next;
3130 /* Look for the expression computing loop_arg from loop PHI result. */
3131 auto_vec<std::pair<ssa_op_iter, use_operand_p> > path;
3132 auto_bitmap visited;
3133 tree lookfor = PHI_RESULT (phi);
3134 ssa_op_iter curri;
3135 use_operand_p curr = op_iter_init_phiuse (&curri, as_a <gphi *>(phi),
3136 SSA_OP_USE);
3137 while (USE_FROM_PTR (curr) != loop_arg)
3138 curr = op_iter_next_use (&curri);
3139 curri.i = curri.numops;
3140 do
3142 path.safe_push (std::make_pair (curri, curr));
3143 tree use = USE_FROM_PTR (curr);
3144 if (use == lookfor)
3145 break;
3146 gimple *def = SSA_NAME_DEF_STMT (use);
3147 if (gimple_nop_p (def)
3148 || ! flow_bb_inside_loop_p (loop, gimple_bb (def)))
3150 pop:
3151 do
3153 std::pair<ssa_op_iter, use_operand_p> x = path.pop ();
3154 curri = x.first;
3155 curr = x.second;
3156 do
3157 curr = op_iter_next_use (&curri);
3158 /* Skip already visited or non-SSA operands (from iterating
3159 over PHI args). */
3160 while (curr != NULL_USE_OPERAND_P
3161 && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
3162 || ! bitmap_set_bit (visited,
3163 SSA_NAME_VERSION
3164 (USE_FROM_PTR (curr)))));
3166 while (curr == NULL_USE_OPERAND_P && ! path.is_empty ());
3167 if (curr == NULL_USE_OPERAND_P)
3168 break;
3170 else
3172 if (gimple_code (def) == GIMPLE_PHI)
3173 curr = op_iter_init_phiuse (&curri, as_a <gphi *>(def), SSA_OP_USE);
3174 else
3175 curr = op_iter_init_use (&curri, def, SSA_OP_USE);
3176 while (curr != NULL_USE_OPERAND_P
3177 && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
3178 || ! bitmap_set_bit (visited,
3179 SSA_NAME_VERSION
3180 (USE_FROM_PTR (curr)))))
3181 curr = op_iter_next_use (&curri);
3182 if (curr == NULL_USE_OPERAND_P)
3183 goto pop;
3186 while (1);
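/* If the walk succeeded, PATH now holds the chain of uses leading from
the latch definition of the PHI back to the PHI result; this is the
candidate reduction path that is dumped and validated below. */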
3187 if (dump_file && (dump_flags & TDF_DETAILS))
3189 dump_printf_loc (MSG_NOTE, vect_location,
3190 "reduction path: ");
3191 unsigned i;
3192 std::pair<ssa_op_iter, use_operand_p> *x;
3193 FOR_EACH_VEC_ELT (path, i, x)
3195 dump_generic_expr (MSG_NOTE, TDF_SLIM, USE_FROM_PTR (x->second));
3196 dump_printf (MSG_NOTE, " ");
3198 dump_printf (MSG_NOTE, "\n");
3201 /* Check whether the reduction path detected is valid. */
3202 bool fail = path.length () == 0;
3203 bool neg = false;
3204 for (unsigned i = 1; i < path.length (); ++i)
3206 gimple *use_stmt = USE_STMT (path[i].second);
3207 tree op = USE_FROM_PTR (path[i].second);
3208 if (! has_single_use (op)
3209 || ! is_gimple_assign (use_stmt))
3211 fail = true;
3212 break;
3214 if (gimple_assign_rhs_code (use_stmt) != code)
3216 if (code == PLUS_EXPR
3217 && gimple_assign_rhs_code (use_stmt) == MINUS_EXPR)
3219 /* Track whether we negate the reduction value each iteration. */
3220 if (gimple_assign_rhs2 (use_stmt) == op)
3221 neg = ! neg;
3223 else
3225 fail = true;
3226 break;
3230 if (! fail && ! neg)
3231 return def_stmt;
3233 if (dump_enabled_p ())
3235 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3236 "reduction: unknown pattern: ");
3239 return NULL;
3242 /* Wrapper around vect_is_simple_reduction, which will modify code
3243 in-place if it enables detection of more reductions. Arguments
3244 as there. */
3246 gimple *
3247 vect_force_simple_reduction (loop_vec_info loop_info, gimple *phi,
3248 bool *double_reduc,
3249 bool need_wrapping_integral_overflow)
3251 enum vect_reduction_type v_reduc_type;
3252 gimple *def = vect_is_simple_reduction (loop_info, phi, double_reduc,
3253 need_wrapping_integral_overflow,
3254 &v_reduc_type);
3255 if (def)
3257 stmt_vec_info reduc_def_info = vinfo_for_stmt (phi);
3258 STMT_VINFO_REDUC_TYPE (reduc_def_info) = v_reduc_type;
3259 STMT_VINFO_REDUC_DEF (reduc_def_info) = def;
3260 reduc_def_info = vinfo_for_stmt (def);
3261 STMT_VINFO_REDUC_DEF (reduc_def_info) = phi;
3263 return def;
3266 /* Calculate cost of peeling the loop PEEL_ITERS_PROLOGUE times. */
3267 int
3268 vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue,
3269 int *peel_iters_epilogue,
3270 stmt_vector_for_cost *scalar_cost_vec,
3271 stmt_vector_for_cost *prologue_cost_vec,
3272 stmt_vector_for_cost *epilogue_cost_vec)
3274 int retval = 0;
3275 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3277 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
3279 *peel_iters_epilogue = vf/2;
3280 if (dump_enabled_p ())
3281 dump_printf_loc (MSG_NOTE, vect_location,
3282 "cost model: epilogue peel iters set to vf/2 "
3283 "because loop iterations are unknown .\n");
3285 /* If peeled iterations are known but the number of scalar loop
3286 iterations is unknown, count a taken branch per peeled loop. */
3287 retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
3288 NULL, 0, vect_prologue);
3289 retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
3290 NULL, 0, vect_epilogue);
3292 else
3294 int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
3295 peel_iters_prologue = niters < peel_iters_prologue ?
3296 niters : peel_iters_prologue;
3297 *peel_iters_epilogue = (niters - peel_iters_prologue) % vf;
3298 /* If we need to peel for gaps but no epilogue peeling would otherwise
3299 be required, we have to peel VF iterations. */
3300 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) && !*peel_iters_epilogue)
3301 *peel_iters_epilogue = vf;
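/* Illustrative example: with niters = 100, a prologue peel of 3 and a
vectorization factor of 4, the epilogue gets (100 - 3) % 4 = 1
iteration; with peeling for gaps and an otherwise empty epilogue it
would get a full 4 iterations instead. */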
3304 stmt_info_for_cost *si;
3305 int j;
3306 if (peel_iters_prologue)
3307 FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
3309 stmt_vec_info stmt_info
3310 = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
3311 retval += record_stmt_cost (prologue_cost_vec,
3312 si->count * peel_iters_prologue,
3313 si->kind, stmt_info, si->misalign,
3314 vect_prologue);
3316 if (*peel_iters_epilogue)
3317 FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
3319 stmt_vec_info stmt_info
3320 = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
3321 retval += record_stmt_cost (epilogue_cost_vec,
3322 si->count * *peel_iters_epilogue,
3323 si->kind, stmt_info, si->misalign,
3324 vect_epilogue);
3327 return retval;
3330 /* Function vect_estimate_min_profitable_iters
3332 Return the number of iterations required for the vector version of the
3333 loop to be profitable relative to the cost of the scalar version of the
3334 loop.
3336 *RET_MIN_PROFITABLE_NITERS is a cost model profitability threshold
3337 of iterations for vectorization. A value of -1 means loop vectorization
3338 is not profitable. This returned value may be used for a dynamic
3339 profitability check.
3341 *RET_MIN_PROFITABLE_ESTIMATE is a profitability threshold to be used
3342 for static check against estimated number of iterations. */
3344 static void
3345 vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
3346 int *ret_min_profitable_niters,
3347 int *ret_min_profitable_estimate)
3349 int min_profitable_iters;
3350 int min_profitable_estimate;
3351 int peel_iters_prologue;
3352 int peel_iters_epilogue;
3353 unsigned vec_inside_cost = 0;
3354 int vec_outside_cost = 0;
3355 unsigned vec_prologue_cost = 0;
3356 unsigned vec_epilogue_cost = 0;
3357 int scalar_single_iter_cost = 0;
3358 int scalar_outside_cost = 0;
3359 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3360 int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
3361 void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3363 /* Cost model disabled. */
3364 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
3366 dump_printf_loc (MSG_NOTE, vect_location, "cost model disabled.\n");
3367 *ret_min_profitable_niters = 0;
3368 *ret_min_profitable_estimate = 0;
3369 return;
3372 /* Requires loop versioning tests to handle misalignment. */
3373 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
3375 /* FIXME: Make cost depend on complexity of individual check. */
3376 unsigned len = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ();
3377 (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
3378 vect_prologue);
3379 dump_printf (MSG_NOTE,
3380 "cost model: Adding cost of checks for loop "
3381 "versioning to treat misalignment.\n");
3384 /* Requires loop versioning with alias checks. */
3385 if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
3387 /* FIXME: Make cost depend on complexity of individual check. */
3388 unsigned len = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).length ();
3389 (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
3390 vect_prologue);
3391 len = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).length ();
3392 if (len)
3393 /* Count LEN - 1 ANDs and LEN comparisons. */
3394 (void) add_stmt_cost (target_cost_data, len * 2 - 1, scalar_stmt,
3395 NULL, 0, vect_prologue);
3396 dump_printf (MSG_NOTE,
3397 "cost model: Adding cost of checks for loop "
3398 "versioning aliasing.\n");
3401 /* Requires loop versioning with niter checks. */
3402 if (LOOP_REQUIRES_VERSIONING_FOR_NITERS (loop_vinfo))
3404 /* FIXME: Make cost depend on complexity of individual check. */
3405 (void) add_stmt_cost (target_cost_data, 1, vector_stmt, NULL, 0,
3406 vect_prologue);
3407 dump_printf (MSG_NOTE,
3408 "cost model: Adding cost of checks for loop "
3409 "versioning niters.\n");
3412 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
3413 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken, NULL, 0,
3414 vect_prologue);
3416 /* Count statements in scalar loop. Using this as scalar cost for a single
3417 iteration for now.
3419 TODO: Add outer loop support.
3421 TODO: Consider assigning different costs to different scalar
3422 statements. */
3424 scalar_single_iter_cost
3425 = LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo);
3427 /* Add additional cost for the peeled instructions in prologue and epilogue
3428 loop.
3430 FORNOW: If we don't know the value of peel_iters for prologue or epilogue
3431 at compile-time - we assume it's vf/2 (the worst would be vf-1).
3433 TODO: Build an expression that represents peel_iters for prologue and
3434 epilogue to be used in a run-time test. */
3436 if (npeel < 0)
3438 peel_iters_prologue = vf/2;
3439 dump_printf (MSG_NOTE, "cost model: "
3440 "prologue peel iters set to vf/2.\n");
3442 /* If peeling for alignment is unknown, the loop bound of the main loop
3443 becomes unknown. */
3444 peel_iters_epilogue = vf/2;
3445 dump_printf (MSG_NOTE, "cost model: "
3446 "epilogue peel iters set to vf/2 because "
3447 "peeling for alignment is unknown.\n");
3449 /* If peeled iterations are unknown, count a taken branch and a not taken
3450 branch per peeled loop. Even if scalar loop iterations are known,
3451 vector iterations are not known since peeled prologue iterations are
3452 not known. Hence guards remain the same. */
3453 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
3454 NULL, 0, vect_prologue);
3455 (void) add_stmt_cost (target_cost_data, 1, cond_branch_not_taken,
3456 NULL, 0, vect_prologue);
3457 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
3458 NULL, 0, vect_epilogue);
3459 (void) add_stmt_cost (target_cost_data, 1, cond_branch_not_taken,
3460 NULL, 0, vect_epilogue);
3461 stmt_info_for_cost *si;
3462 int j;
3463 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
3465 struct _stmt_vec_info *stmt_info
3466 = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
3467 (void) add_stmt_cost (target_cost_data,
3468 si->count * peel_iters_prologue,
3469 si->kind, stmt_info, si->misalign,
3470 vect_prologue);
3471 (void) add_stmt_cost (target_cost_data,
3472 si->count * peel_iters_epilogue,
3473 si->kind, stmt_info, si->misalign,
3474 vect_epilogue);
3477 else
3479 stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
3480 stmt_info_for_cost *si;
3481 int j;
3482 void *data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3484 prologue_cost_vec.create (2);
3485 epilogue_cost_vec.create (2);
3486 peel_iters_prologue = npeel;
3488 (void) vect_get_known_peeling_cost (loop_vinfo, peel_iters_prologue,
3489 &peel_iters_epilogue,
3490 &LOOP_VINFO_SCALAR_ITERATION_COST
3491 (loop_vinfo),
3492 &prologue_cost_vec,
3493 &epilogue_cost_vec);
3495 FOR_EACH_VEC_ELT (prologue_cost_vec, j, si)
3497 struct _stmt_vec_info *stmt_info
3498 = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
3499 (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
3500 si->misalign, vect_prologue);
3503 FOR_EACH_VEC_ELT (epilogue_cost_vec, j, si)
3505 struct _stmt_vec_info *stmt_info
3506 = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
3507 (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
3508 si->misalign, vect_epilogue);
3511 prologue_cost_vec.release ();
3512 epilogue_cost_vec.release ();
3515 /* FORNOW: The scalar outside cost is incremented in one of the
3516 following ways:
3518 1. The vectorizer checks for alignment and aliasing and generates
3519 a condition that allows dynamic vectorization. A cost model
3520 check is ANDED with the versioning condition. Hence scalar code
3521 path now has the added cost of the versioning check.
3523 if (cost > th & versioning_check)
3524 jmp to vector code
3526 Hence the run-time scalar cost is incremented by a not-taken branch cost.
3528 2. The vectorizer then checks if a prologue is required. If the
3529 cost model check was not done before during versioning, it has to
3530 be done before the prologue check.
3532 if (cost <= th)
3533 prologue = scalar_iters
3534 if (prologue == 0)
3535 jmp to vector code
3536 else
3537 execute prologue
3538 if (prologue == num_iters)
3539 go to exit
3541 Hence the run-time scalar cost is incremented by a taken branch,
3542 plus a not-taken branch, plus a taken branch cost.
3544 3. The vectorizer then checks if an epilogue is required. If the
3545 cost model check was not done before during prologue check, it
3546 has to be done with the epilogue check.
3548 if (prologue == 0)
3549 jmp to vector code
3550 else
3551 execute prologue
3552 if (prologue == num_iters)
3553 go to exit
3554 vector code:
3555 if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
3556 jmp to epilogue
3558 Hence the run-time scalar cost should be incremented by 2 taken
3559 branches.
3561 TODO: The back end may reorder the BBS's differently and reverse
3562 conditions/branch directions. Change the estimates below to
3563 something more reasonable. */
3565 /* If the number of iterations is known and we do not do versioning, we can
3566 decide whether to vectorize at compile time. Hence the scalar version
3567 does not carry cost model guard costs. */
3568 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
3569 || LOOP_REQUIRES_VERSIONING (loop_vinfo))
3571 /* Cost model check occurs at versioning. */
3572 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
3573 scalar_outside_cost += vect_get_stmt_cost (cond_branch_not_taken);
3574 else
3576 /* Cost model check occurs at prologue generation. */
3577 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
3578 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken)
3579 + vect_get_stmt_cost (cond_branch_not_taken);
3580 /* Cost model check occurs at epilogue generation. */
3581 else
3582 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken);
3586 /* Complete the target-specific cost calculations. */
3587 finish_cost (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo), &vec_prologue_cost,
3588 &vec_inside_cost, &vec_epilogue_cost);
3590 vec_outside_cost = (int)(vec_prologue_cost + vec_epilogue_cost);
3592 if (dump_enabled_p ())
3594 dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
3595 dump_printf (MSG_NOTE, " Vector inside of loop cost: %d\n",
3596 vec_inside_cost);
3597 dump_printf (MSG_NOTE, " Vector prologue cost: %d\n",
3598 vec_prologue_cost);
3599 dump_printf (MSG_NOTE, " Vector epilogue cost: %d\n",
3600 vec_epilogue_cost);
3601 dump_printf (MSG_NOTE, " Scalar iteration cost: %d\n",
3602 scalar_single_iter_cost);
3603 dump_printf (MSG_NOTE, " Scalar outside cost: %d\n",
3604 scalar_outside_cost);
3605 dump_printf (MSG_NOTE, " Vector outside cost: %d\n",
3606 vec_outside_cost);
3607 dump_printf (MSG_NOTE, " prologue iterations: %d\n",
3608 peel_iters_prologue);
3609 dump_printf (MSG_NOTE, " epilogue iterations: %d\n",
3610 peel_iters_epilogue);
3613 /* Calculate number of iterations required to make the vector version
3614 profitable, relative to the loop bodies only. The following condition
3615 must hold true:
3616 SIC * niters + SOC > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC
3617 where
3618 SIC = scalar iteration cost, VIC = vector iteration cost,
3619 VOC = vector outside cost, VF = vectorization factor,
3620 PL_ITERS = prologue iterations, EP_ITERS = epilogue iterations
3621 SOC = scalar outside cost for run time cost model check. */
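/* A worked example with arbitrary costs: SIC = 4, VIC = 6, VOC = 20,
SOC = 6, VF = 4 and no peeling gives ((20 - 6) * 4) / (4 * 4 - 6) = 5,
which the correction below bumps to 6; the threshold is then raised to
at least VF plus the prologue peel iterations further below. */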
3623 if ((scalar_single_iter_cost * vf) > (int) vec_inside_cost)
3625 if (vec_outside_cost <= 0)
3626 min_profitable_iters = 0;
3627 else
3629 min_profitable_iters = ((vec_outside_cost - scalar_outside_cost) * vf
3630 - vec_inside_cost * peel_iters_prologue
3631 - vec_inside_cost * peel_iters_epilogue)
3632 / ((scalar_single_iter_cost * vf)
3633 - vec_inside_cost);
3635 if ((scalar_single_iter_cost * vf * min_profitable_iters)
3636 <= (((int) vec_inside_cost * min_profitable_iters)
3637 + (((int) vec_outside_cost - scalar_outside_cost) * vf)))
3638 min_profitable_iters++;
3641 /* vector version will never be profitable. */
3642 else
3644 if (LOOP_VINFO_LOOP (loop_vinfo)->force_vectorize)
3645 warning_at (vect_location, OPT_Wopenmp_simd, "vectorization "
3646 "did not happen for a simd loop");
3648 if (dump_enabled_p ())
3649 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3650 "cost model: the vector iteration cost = %d "
3651 "divided by the scalar iteration cost = %d "
3652 "is greater or equal to the vectorization factor = %d"
3653 ".\n",
3654 vec_inside_cost, scalar_single_iter_cost, vf);
3655 *ret_min_profitable_niters = -1;
3656 *ret_min_profitable_estimate = -1;
3657 return;
3660 dump_printf (MSG_NOTE,
3661 " Calculated minimum iters for profitability: %d\n",
3662 min_profitable_iters);
3664 /* We want the vectorized loop to execute at least once. */
3665 if (min_profitable_iters < (vf + peel_iters_prologue))
3666 min_profitable_iters = vf + peel_iters_prologue;
3668 if (dump_enabled_p ())
3669 dump_printf_loc (MSG_NOTE, vect_location,
3670 " Runtime profitability threshold = %d\n",
3671 min_profitable_iters);
3673 *ret_min_profitable_niters = min_profitable_iters;
3675 /* Calculate number of iterations required to make the vector version
3676 profitable, relative to the loop bodies only.
3678 The cost of the non-vectorized variant is SIC * niters and it must win over
3679 the vector variant on the expected loop trip count. The following condition must hold true:
3680 SIC * niters > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC + SOC */
3682 if (vec_outside_cost <= 0)
3683 min_profitable_estimate = 0;
3684 else
3686 min_profitable_estimate = ((vec_outside_cost + scalar_outside_cost) * vf
3687 - vec_inside_cost * peel_iters_prologue
3688 - vec_inside_cost * peel_iters_epilogue)
3689 / ((scalar_single_iter_cost * vf)
3690 - vec_inside_cost);
3692 min_profitable_estimate = MAX (min_profitable_estimate, min_profitable_iters);
3693 if (dump_enabled_p ())
3694 dump_printf_loc (MSG_NOTE, vect_location,
3695 " Static estimate profitability threshold = %d\n",
3696 min_profitable_estimate);
3698 *ret_min_profitable_estimate = min_profitable_estimate;
3701 /* Writes into SEL a mask for a vec_perm, equivalent to a vec_shr by OFFSET
3702 vector elements (not bits) for a vector with NELT elements. */
3703 static void
3704 calc_vec_perm_mask_for_shift (unsigned int offset, unsigned int nelt,
3705 vec_perm_indices *sel)
3707 unsigned int i;
3709 for (i = 0; i < nelt; i++)
3710 sel->quick_push ((i + offset) & (2 * nelt - 1));
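/* For instance, OFFSET = 1 and NELT = 4 produces the selector
{1, 2, 3, 4}: elements 1..3 of the first input followed by element 0 of
the second, i.e. a whole-vector shift by one element. */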
3713 /* Checks whether the target supports whole-vector shifts for vectors of mode
3714 MODE. This is the case if _either_ the platform handles vec_shr_optab, _or_
3715 it supports vec_perm_const with masks for all necessary shift amounts. */
3716 static bool
3717 have_whole_vector_shift (machine_mode mode)
3719 if (optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
3720 return true;
3722 if (direct_optab_handler (vec_perm_const_optab, mode) == CODE_FOR_nothing)
3723 return false;
3725 unsigned int i, nelt = GET_MODE_NUNITS (mode);
3726 auto_vec_perm_indices sel (nelt);
3728 for (i = nelt/2; i >= 1; i/=2)
3730 sel.truncate (0);
3731 calc_vec_perm_mask_for_shift (i, nelt, &sel);
3732 if (!can_vec_perm_p (mode, false, &sel))
3733 return false;
3735 return true;
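/* For example, for a vector mode with eight elements this checks that shifts
   by 4, 2 and 1 elements are supported as constant permutations - exactly
   the shift amounts the reduction epilogue below generates.  */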
3738 /* TODO: There is a close dependency between the vect_model_*_cost and the
3739 vectorizable_* functions. Design this better to avoid maintenance issues. */
3741 /* Function vect_model_reduction_cost.
3743 Models cost for a reduction operation, including the vector ops
3744 generated within the strip-mine loop, the initial definition before
3745 the loop, and the epilogue code that must be generated. */
3747 static void
3748 vect_model_reduction_cost (stmt_vec_info stmt_info, internal_fn reduc_fn,
3749 int ncopies)
3751 int prologue_cost = 0, epilogue_cost = 0;
3752 enum tree_code code;
3753 optab optab;
3754 tree vectype;
3755 gimple *orig_stmt;
3756 machine_mode mode;
3757 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3758 struct loop *loop = NULL;
3759 void *target_cost_data;
3761 if (loop_vinfo)
3763 loop = LOOP_VINFO_LOOP (loop_vinfo);
3764 target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3766 else
3767 target_cost_data = BB_VINFO_TARGET_COST_DATA (STMT_VINFO_BB_VINFO (stmt_info));
3769 /* Condition reductions generate two reductions in the loop. */
3770 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION)
3771 ncopies *= 2;
3773 /* Cost of reduction op inside loop. */
3774 unsigned inside_cost = add_stmt_cost (target_cost_data, ncopies, vector_stmt,
3775 stmt_info, 0, vect_body);
3777 vectype = STMT_VINFO_VECTYPE (stmt_info);
3778 mode = TYPE_MODE (vectype);
3779 orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
3781 if (!orig_stmt)
3782 orig_stmt = STMT_VINFO_STMT (stmt_info);
3784 code = gimple_assign_rhs_code (orig_stmt);
3786 /* Add in cost for initial definition.
3787 For cond reduction we have four vectors: initial index, step, initial
3788 result of the data reduction, initial value of the index reduction. */
3789 int prologue_stmts = STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
3790 == COND_REDUCTION ? 4 : 1;
3791 prologue_cost += add_stmt_cost (target_cost_data, prologue_stmts,
3792 scalar_to_vec, stmt_info, 0,
3793 vect_prologue);
3795 /* Determine cost of epilogue code.
3797 We have a reduction operator that will reduce the vector in one statement.
3798 Also requires scalar extract. */
3800 if (!loop || !nested_in_vect_loop_p (loop, orig_stmt))
3802 if (reduc_fn != IFN_LAST)
3804 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION)
3806 /* An EQ stmt and a COND_EXPR stmt. */
3807 epilogue_cost += add_stmt_cost (target_cost_data, 2,
3808 vector_stmt, stmt_info, 0,
3809 vect_epilogue);
3810 /* Reduction of the max index and a reduction of the found
3811 values. */
3812 epilogue_cost += add_stmt_cost (target_cost_data, 2,
3813 vec_to_scalar, stmt_info, 0,
3814 vect_epilogue);
3815 /* A broadcast of the max value. */
3816 epilogue_cost += add_stmt_cost (target_cost_data, 1,
3817 scalar_to_vec, stmt_info, 0,
3818 vect_epilogue);
3820 else
3822 epilogue_cost += add_stmt_cost (target_cost_data, 1, vector_stmt,
3823 stmt_info, 0, vect_epilogue);
3824 epilogue_cost += add_stmt_cost (target_cost_data, 1,
3825 vec_to_scalar, stmt_info, 0,
3826 vect_epilogue);
3829 else if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION)
3831 unsigned nunits = TYPE_VECTOR_SUBPARTS (vectype);
3832 /* Extraction of scalar elements. */
3833 epilogue_cost += add_stmt_cost (target_cost_data, 2 * nunits,
3834 vec_to_scalar, stmt_info, 0,
3835 vect_epilogue);
3836 /* Scalar max reductions via COND_EXPR / MAX_EXPR. */
3837 epilogue_cost += add_stmt_cost (target_cost_data, 2 * nunits - 3,
3838 scalar_stmt, stmt_info, 0,
3839 vect_epilogue);
3841 else
3843 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
3844 tree bitsize =
3845 TYPE_SIZE (TREE_TYPE (gimple_assign_lhs (orig_stmt)));
3846 int element_bitsize = tree_to_uhwi (bitsize);
3847 int nelements = vec_size_in_bits / element_bitsize;
3849 if (code == COND_EXPR)
3850 code = MAX_EXPR;
3852 optab = optab_for_tree_code (code, vectype, optab_default);
3854 /* We have a whole vector shift available. */
3855 if (optab != unknown_optab
3856 && VECTOR_MODE_P (mode)
3857 && optab_handler (optab, mode) != CODE_FOR_nothing
3858 && have_whole_vector_shift (mode))
3860 /* Final reduction via vector shifts and the reduction operator.
3861 Also requires scalar extract. */
3862 epilogue_cost += add_stmt_cost (target_cost_data,
3863 exact_log2 (nelements) * 2,
3864 vector_stmt, stmt_info, 0,
3865 vect_epilogue);
3866 epilogue_cost += add_stmt_cost (target_cost_data, 1,
3867 vec_to_scalar, stmt_info, 0,
3868 vect_epilogue);
3870 else
3871 /* Use extracts and reduction op for final reduction. For N
3872 elements, we have N extracts and N-1 reduction ops. */
3873 epilogue_cost += add_stmt_cost (target_cost_data,
3874 nelements + nelements - 1,
3875 vector_stmt, stmt_info, 0,
3876 vect_epilogue);
3880 if (dump_enabled_p ())
3881 dump_printf (MSG_NOTE,
3882 "vect_model_reduction_cost: inside_cost = %d, "
3883 "prologue_cost = %d, epilogue_cost = %d .\n", inside_cost,
3884 prologue_cost, epilogue_cost);
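/* As an illustration of the accounting above: for a COND_REDUCTION with
   NCOPIES == 1 and a direct reduc_fn available, this records 2 vector_stmt
   in the loop body (NCOPIES is doubled), 4 scalar_to_vec in the prologue,
   and 2 vector_stmt + 2 vec_to_scalar + 1 scalar_to_vec in the epilogue;
   the target's add_stmt_cost hook assigns the actual costs.  */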
3888 /* Function vect_model_induction_cost.
3890 Models cost for induction operations. */
3892 static void
3893 vect_model_induction_cost (stmt_vec_info stmt_info, int ncopies)
3895 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3896 void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3897 unsigned inside_cost, prologue_cost;
3899 if (PURE_SLP_STMT (stmt_info))
3900 return;
3902 /* loop cost for vec_loop. */
3903 inside_cost = add_stmt_cost (target_cost_data, ncopies, vector_stmt,
3904 stmt_info, 0, vect_body);
3906 /* prologue cost for vec_init and vec_step. */
3907 prologue_cost = add_stmt_cost (target_cost_data, 2, scalar_to_vec,
3908 stmt_info, 0, vect_prologue);
3910 if (dump_enabled_p ())
3911 dump_printf_loc (MSG_NOTE, vect_location,
3912 "vect_model_induction_cost: inside_cost = %d, "
3913 "prologue_cost = %d .\n", inside_cost, prologue_cost);
3918 /* Function get_initial_def_for_reduction
3920 Input:
3921 STMT - a stmt that performs a reduction operation in the loop.
3922 INIT_VAL - the initial value of the reduction variable
3924 Output:
3925 ADJUSTMENT_DEF - a tree that holds a value to be added to the final result
3926 of the reduction (used for adjusting the epilog - see below).
3927 Return a vector variable, initialized according to the operation that STMT
3928 performs. This vector will be used as the initial value of the
3929 vector of partial results.
3931 Option1 (adjust in epilog): Initialize the vector as follows:
3932 add/bit or/xor: [0,0,...,0,0]
3933 mult/bit and: [1,1,...,1,1]
3934 min/max/cond_expr: [init_val,init_val,..,init_val,init_val]
3935 and when necessary (e.g. add/mult case) let the caller know
3936 that it needs to adjust the result by init_val.
3938 Option2: Initialize the vector as follows:
3939 add/bit or/xor: [init_val,0,0,...,0]
3940 mult/bit and: [init_val,1,1,...,1]
3941 min/max/cond_expr: [init_val,init_val,...,init_val]
3942 and no adjustments are needed.
3944 For example, for the following code:
3946 s = init_val;
3947 for (i=0;i<n;i++)
3948 s = s + a[i];
3950 STMT is 's = s + a[i]', and the reduction variable is 's'.
3951 For a vector of 4 units, we want to return either [0,0,0,init_val],
3952 or [0,0,0,0] and let the caller know that it needs to adjust
3953 the result at the end by 'init_val'.
3955 FORNOW, we use the 'adjust in epilog' scheme (Option1) when ADJUSTMENT_DEF
3956 is not NULL, because this way the initialization vector is simpler (same
3957 element in all entries), and Option2 otherwise.
3959 A cost model should help decide between these two schemes. */
3961 tree
3962 get_initial_def_for_reduction (gimple *stmt, tree init_val,
3963 tree *adjustment_def)
3965 stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
3966 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
3967 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3968 tree scalar_type = TREE_TYPE (init_val);
3969 tree vectype = get_vectype_for_scalar_type (scalar_type);
3970 int nunits;
3971 enum tree_code code = gimple_assign_rhs_code (stmt);
3972 tree def_for_init;
3973 tree init_def;
3974 int i;
3975 bool nested_in_vect_loop = false;
3976 REAL_VALUE_TYPE real_init_val = dconst0;
3977 int int_init_val = 0;
3978 gimple *def_stmt = NULL;
3979 gimple_seq stmts = NULL;
3981 gcc_assert (vectype);
3982 nunits = TYPE_VECTOR_SUBPARTS (vectype);
3984 gcc_assert (POINTER_TYPE_P (scalar_type) || INTEGRAL_TYPE_P (scalar_type)
3985 || SCALAR_FLOAT_TYPE_P (scalar_type));
3987 if (nested_in_vect_loop_p (loop, stmt))
3988 nested_in_vect_loop = true;
3989 else
3990 gcc_assert (loop == (gimple_bb (stmt))->loop_father);
3992 /* In case of double reduction we only create a vector variable to be put
3993 in the reduction phi node. The actual statement creation is done in
3994 vect_create_epilog_for_reduction. */
3995 if (adjustment_def && nested_in_vect_loop
3996 && TREE_CODE (init_val) == SSA_NAME
3997 && (def_stmt = SSA_NAME_DEF_STMT (init_val))
3998 && gimple_code (def_stmt) == GIMPLE_PHI
3999 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
4000 && vinfo_for_stmt (def_stmt)
4001 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
4002 == vect_double_reduction_def)
4004 *adjustment_def = NULL;
4005 return vect_create_destination_var (init_val, vectype);
4008 /* In case of a nested reduction do not use an adjustment def, as
4009 that case is not handled correctly by the epilogue generation
4010 when ncopies is not one. */
4011 if (adjustment_def && nested_in_vect_loop)
4013 *adjustment_def = NULL;
4014 return vect_get_vec_def_for_operand (init_val, stmt);
4017 switch (code)
4019 case WIDEN_SUM_EXPR:
4020 case DOT_PROD_EXPR:
4021 case SAD_EXPR:
4022 case PLUS_EXPR:
4023 case MINUS_EXPR:
4024 case BIT_IOR_EXPR:
4025 case BIT_XOR_EXPR:
4026 case MULT_EXPR:
4027 case BIT_AND_EXPR:
4029 /* ADJUSTMENT_DEF is NULL when called from
4030 vect_create_epilog_for_reduction to vectorize double reduction. */
4031 if (adjustment_def)
4032 *adjustment_def = init_val;
4034 if (code == MULT_EXPR)
4036 real_init_val = dconst1;
4037 int_init_val = 1;
4040 if (code == BIT_AND_EXPR)
4041 int_init_val = -1;
4043 if (SCALAR_FLOAT_TYPE_P (scalar_type))
4044 def_for_init = build_real (scalar_type, real_init_val);
4045 else
4046 def_for_init = build_int_cst (scalar_type, int_init_val);
4048 if (adjustment_def)
4049 /* Option1: the first element is '0' or '1' as well. */
4050 init_def = gimple_build_vector_from_val (&stmts, vectype,
4051 def_for_init);
4052 else
4054 /* Option2: the first element is INIT_VAL. */
4055 auto_vec<tree, 32> elts (nunits);
4056 elts.quick_push (init_val);
4057 for (i = 1; i < nunits; ++i)
4058 elts.quick_push (def_for_init);
4059 init_def = gimple_build_vector (&stmts, vectype, elts);
4062 break;
4064 case MIN_EXPR:
4065 case MAX_EXPR:
4066 case COND_EXPR:
4068 if (adjustment_def)
4070 *adjustment_def = NULL_TREE;
4071 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_vinfo) != COND_REDUCTION)
4073 init_def = vect_get_vec_def_for_operand (init_val, stmt);
4074 break;
4077 init_val = gimple_convert (&stmts, TREE_TYPE (vectype), init_val);
4078 init_def = gimple_build_vector_from_val (&stmts, vectype, init_val);
4080 break;
4082 default:
4083 gcc_unreachable ();
4086 if (stmts)
4087 gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), stmts);
4088 return init_def;
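/* As a concrete illustration: for a multiplication reduction with
   init_val == 5 and a four-element vector, Option1 returns {1, 1, 1, 1} and
   sets *ADJUSTMENT_DEF to 5, so the epilogue multiplies the reduced result
   by 5; Option2 returns {5, 1, 1, 1} and needs no adjustment.  */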
4091 /* Get at the initial defs for the reduction PHIs in SLP_NODE.
4092 NUMBER_OF_VECTORS is the number of vector defs to create. */
4094 static void
4095 get_initial_defs_for_reduction (slp_tree slp_node,
4096 vec<tree> *vec_oprnds,
4097 unsigned int number_of_vectors,
4098 enum tree_code code, bool reduc_chain)
4100 vec<gimple *> stmts = SLP_TREE_SCALAR_STMTS (slp_node);
4101 gimple *stmt = stmts[0];
4102 stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
4103 unsigned nunits;
4104 unsigned j, number_of_places_left_in_vector;
4105 tree vector_type, scalar_type;
4106 tree vop;
4107 int group_size = stmts.length ();
4108 unsigned int vec_num, i;
4109 unsigned number_of_copies = 1;
4110 vec<tree> voprnds;
4111 voprnds.create (number_of_vectors);
4112 tree neutral_op = NULL;
4113 struct loop *loop;
4115 vector_type = STMT_VINFO_VECTYPE (stmt_vinfo);
4116 scalar_type = TREE_TYPE (vector_type);
4117 nunits = TYPE_VECTOR_SUBPARTS (vector_type);
4119 gcc_assert (STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_reduction_def);
4121 loop = (gimple_bb (stmt))->loop_father;
4122 gcc_assert (loop);
4123 edge pe = loop_preheader_edge (loop);
4125 /* op is the reduction operand of the first stmt already. */
4126 /* For additional copies (see the explanation of NUMBER_OF_COPIES below)
4127 we need either neutral operands or the original operands. See
4128 get_initial_def_for_reduction() for details. */
4129 switch (code)
4131 case WIDEN_SUM_EXPR:
4132 case DOT_PROD_EXPR:
4133 case SAD_EXPR:
4134 case PLUS_EXPR:
4135 case MINUS_EXPR:
4136 case BIT_IOR_EXPR:
4137 case BIT_XOR_EXPR:
4138 neutral_op = build_zero_cst (scalar_type);
4139 break;
4141 case MULT_EXPR:
4142 neutral_op = build_one_cst (scalar_type);
4143 break;
4145 case BIT_AND_EXPR:
4146 neutral_op = build_all_ones_cst (scalar_type);
4147 break;
4149 /* For MIN/MAX we don't have an easy neutral operand but
4150 the initial values can be used fine here. Only for
4151 a reduction chain we have to force a neutral element. */
4152 case MAX_EXPR:
4153 case MIN_EXPR:
4154 if (! reduc_chain)
4155 neutral_op = NULL;
4156 else
4157 neutral_op = PHI_ARG_DEF_FROM_EDGE (stmt, pe);
4158 break;
4160 default:
4161 gcc_assert (! reduc_chain);
4162 neutral_op = NULL;
4165 /* NUMBER_OF_COPIES is the number of times we need to use the same values in
4166 created vectors. It is greater than 1 if unrolling is performed.
4168 For example, we have two scalar operands, s1 and s2 (e.g., group of
4169 strided accesses of size two), while NUNITS is four (i.e., four scalars
4170 of this type can be packed in a vector). The output vector will contain
4171 two copies of each scalar operand: {s1, s2, s1, s2}. (NUMBER_OF_COPIES
4172 will be 2).
4174 If GROUP_SIZE > NUNITS, the scalars will be split into several vectors
4175 containing the operands.
4177 For example, NUNITS is four as before, and the group size is 8
4178 (s1, s2, ..., s8). We will create two vectors {s1, s2, s3, s4} and
4179 {s5, s6, s7, s8}. */
4181 number_of_copies = nunits * number_of_vectors / group_size;
4183 number_of_places_left_in_vector = nunits;
4184 auto_vec<tree, 32> elts (nunits);
4185 elts.quick_grow (nunits);
4186 for (j = 0; j < number_of_copies; j++)
4188 for (i = group_size - 1; stmts.iterate (i, &stmt); i--)
4190 tree op;
4191 /* Get the def before the loop. In reduction chain we have only
4192 one initial value. */
4193 if ((j != (number_of_copies - 1)
4194 || (reduc_chain && i != 0))
4195 && neutral_op)
4196 op = neutral_op;
4197 else
4198 op = PHI_ARG_DEF_FROM_EDGE (stmt, pe);
4200 /* Create 'vect_ = {op0,op1,...,opn}'. */
4201 number_of_places_left_in_vector--;
4202 elts[number_of_places_left_in_vector] = op;
4204 if (number_of_places_left_in_vector == 0)
4206 gimple_seq ctor_seq = NULL;
4207 tree init = gimple_build_vector (&ctor_seq, vector_type, elts);
4208 if (ctor_seq != NULL)
4209 gsi_insert_seq_on_edge_immediate (pe, ctor_seq);
4210 voprnds.quick_push (init);
4212 number_of_places_left_in_vector = nunits;
4217 /* Since the vectors are created in the reverse order, we should invert
4218 them. */
4219 vec_num = voprnds.length ();
4220 for (j = vec_num; j != 0; j--)
4222 vop = voprnds[j - 1];
4223 vec_oprnds->quick_push (vop);
4226 voprnds.release ();
4228 /* In case that VF is greater than the unrolling factor needed for the SLP
4229 group of stmts, NUMBER_OF_VECTORS to be created is greater than
4230 NUMBER_OF_SCALARS/NUNITS or NUNITS/NUMBER_OF_SCALARS, and hence we have
4231 to replicate the vectors. */
4232 tree neutral_vec = NULL;
4233 while (number_of_vectors > vec_oprnds->length ())
4235 if (neutral_op)
4237 if (!neutral_vec)
4239 gimple_seq ctor_seq = NULL;
4240 neutral_vec = gimple_build_vector_from_val
4241 (&ctor_seq, vector_type, neutral_op);
4242 if (ctor_seq != NULL)
4243 gsi_insert_seq_on_edge_immediate (pe, ctor_seq);
4245 vec_oprnds->quick_push (neutral_vec);
4247 else
4249 for (i = 0; vec_oprnds->iterate (i, &vop) && i < vec_num; i++)
4250 vec_oprnds->quick_push (vop);
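/* As an illustration: for two independent SLP sum reductions
   (GROUP_SIZE == 2) with NUNITS == 4 and a single vector to create,
   NUMBER_OF_COPIES is 2 and the lanes are filled back to front, so the
   initial vector is {init_a, init_b, 0, 0} - the real initial values in the
   first GROUP_SIZE lanes and the neutral element (0 for PLUS) in the rest.  */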
4256 /* Function vect_create_epilog_for_reduction
4258 Create code at the loop-epilog to finalize the result of a reduction
4259 computation.
4261 VECT_DEFS is list of vector of partial results, i.e., the lhs's of vector
4262 reduction statements.
4263 STMT is the scalar reduction stmt that is being vectorized.
4264 NCOPIES is > 1 in case the vectorization factor (VF) is bigger than the
4265 number of elements that we can fit in a vectype (nunits). In this case
4266 we have to generate more than one vector stmt - i.e. we need to "unroll"
4267 the vector stmt by a factor VF/nunits. For more details see documentation
4268 in vectorizable_operation.
4269 REDUC_FN is the internal function for the epilog reduction.
4270 REDUCTION_PHIS is a list of the phi-nodes that carry the reduction
4271 computation.
4272 REDUC_INDEX is the index of the operand in the right hand side of the
4273 statement that is defined by REDUCTION_PHI.
4274 DOUBLE_REDUC is TRUE if double reduction phi nodes should be handled.
4275 SLP_NODE is an SLP node containing a group of reduction statements. The
4276 first one in this group is STMT.
4278 This function:
4279 1. Creates the reduction def-use cycles: sets the arguments for
4280 REDUCTION_PHIS:
4281 The loop-entry argument is the vectorized initial-value of the reduction.
4282 The loop-latch argument is taken from VECT_DEFS - the vector of partial
4283 sums.
4284 2. "Reduces" each vector of partial results VECT_DEFS into a single result,
4285 by calling the function specified by REDUC_FN if available, or by
4286 other means (whole-vector shifts or a scalar loop).
4287 The function also creates a new phi node at the loop exit to preserve
4288 loop-closed form, as illustrated below.
4290 The flow at the entry to this function:
4292 loop:
4293 vec_def = phi <null, null> # REDUCTION_PHI
4294 VECT_DEF = vector_stmt # vectorized form of STMT
4295 s_loop = scalar_stmt # (scalar) STMT
4296 loop_exit:
4297 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
4298 use <s_out0>
4299 use <s_out0>
4301 The above is transformed by this function into:
4303 loop:
4304 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
4305 VECT_DEF = vector_stmt # vectorized form of STMT
4306 s_loop = scalar_stmt # (scalar) STMT
4307 loop_exit:
4308 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
4309 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
4310 v_out2 = reduce <v_out1>
4311 s_out3 = extract_field <v_out2, 0>
4312 s_out4 = adjust_result <s_out3>
4313 use <s_out4>
4314 use <s_out4>
4317 static void
4318 vect_create_epilog_for_reduction (vec<tree> vect_defs, gimple *stmt,
4319 gimple *reduc_def_stmt,
4320 int ncopies, internal_fn reduc_fn,
4321 vec<gimple *> reduction_phis,
4322 bool double_reduc,
4323 slp_tree slp_node,
4324 slp_instance slp_node_instance)
4326 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4327 stmt_vec_info prev_phi_info;
4328 tree vectype;
4329 machine_mode mode;
4330 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4331 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *outer_loop = NULL;
4332 basic_block exit_bb;
4333 tree scalar_dest;
4334 tree scalar_type;
4335 gimple *new_phi = NULL, *phi;
4336 gimple_stmt_iterator exit_gsi;
4337 tree vec_dest;
4338 tree new_temp = NULL_TREE, new_dest, new_name, new_scalar_dest;
4339 gimple *epilog_stmt = NULL;
4340 enum tree_code code = gimple_assign_rhs_code (stmt);
4341 gimple *exit_phi;
4342 tree bitsize;
4343 tree adjustment_def = NULL;
4344 tree vec_initial_def = NULL;
4345 tree expr, def, initial_def = NULL;
4346 tree orig_name, scalar_result;
4347 imm_use_iterator imm_iter, phi_imm_iter;
4348 use_operand_p use_p, phi_use_p;
4349 gimple *use_stmt, *orig_stmt, *reduction_phi = NULL;
4350 bool nested_in_vect_loop = false;
4351 auto_vec<gimple *> new_phis;
4352 auto_vec<gimple *> inner_phis;
4353 enum vect_def_type dt = vect_unknown_def_type;
4354 int j, i;
4355 auto_vec<tree> scalar_results;
4356 unsigned int group_size = 1, k, ratio;
4357 auto_vec<tree> vec_initial_defs;
4358 auto_vec<gimple *> phis;
4359 bool slp_reduc = false;
4360 tree new_phi_result;
4361 gimple *inner_phi = NULL;
4362 tree induction_index = NULL_TREE;
4364 if (slp_node)
4365 group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
4367 if (nested_in_vect_loop_p (loop, stmt))
4369 outer_loop = loop;
4370 loop = loop->inner;
4371 nested_in_vect_loop = true;
4372 gcc_assert (!slp_node);
4375 vectype = STMT_VINFO_VECTYPE (stmt_info);
4376 gcc_assert (vectype);
4377 mode = TYPE_MODE (vectype);
4379 /* 1. Create the reduction def-use cycle:
4380 Set the arguments of REDUCTION_PHIS, i.e., transform
4382 loop:
4383 vec_def = phi <null, null> # REDUCTION_PHI
4384 VECT_DEF = vector_stmt # vectorized form of STMT
4387 into:
4389 loop:
4390 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
4391 VECT_DEF = vector_stmt # vectorized form of STMT
4394 (in case of SLP, do it for all the phis). */
4396 /* Get the loop-entry arguments. */
4397 enum vect_def_type initial_def_dt = vect_unknown_def_type;
4398 if (slp_node)
4400 unsigned vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
4401 vec_initial_defs.reserve (vec_num);
4402 get_initial_defs_for_reduction (slp_node_instance->reduc_phis,
4403 &vec_initial_defs, vec_num, code,
4404 GROUP_FIRST_ELEMENT (stmt_info));
4406 else
4408 /* Get at the scalar def before the loop, that defines the initial value
4409 of the reduction variable. */
4410 gimple *def_stmt;
4411 initial_def = PHI_ARG_DEF_FROM_EDGE (reduc_def_stmt,
4412 loop_preheader_edge (loop));
4413 vect_is_simple_use (initial_def, loop_vinfo, &def_stmt, &initial_def_dt);
4414 vec_initial_def = get_initial_def_for_reduction (stmt, initial_def,
4415 &adjustment_def);
4416 vec_initial_defs.create (1);
4417 vec_initial_defs.quick_push (vec_initial_def);
4420 /* Set phi nodes arguments. */
4421 FOR_EACH_VEC_ELT (reduction_phis, i, phi)
4423 tree vec_init_def = vec_initial_defs[i];
4424 tree def = vect_defs[i];
4425 for (j = 0; j < ncopies; j++)
4427 if (j != 0)
4429 phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi));
4430 if (nested_in_vect_loop)
4431 vec_init_def
4432 = vect_get_vec_def_for_stmt_copy (initial_def_dt,
4433 vec_init_def);
4436 /* Set the loop-entry arg of the reduction-phi. */
4438 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
4439 == INTEGER_INDUC_COND_REDUCTION)
4441 /* Initialise the reduction phi to zero. This prevents non-zero
4442 initial values from interfering with the reduction op. */
4443 gcc_assert (ncopies == 1);
4444 gcc_assert (i == 0);
4446 tree vec_init_def_type = TREE_TYPE (vec_init_def);
4447 tree zero_vec = build_zero_cst (vec_init_def_type);
4449 add_phi_arg (as_a <gphi *> (phi), zero_vec,
4450 loop_preheader_edge (loop), UNKNOWN_LOCATION);
4452 else
4453 add_phi_arg (as_a <gphi *> (phi), vec_init_def,
4454 loop_preheader_edge (loop), UNKNOWN_LOCATION);
4456 /* Set the loop-latch arg for the reduction-phi. */
4457 if (j > 0)
4458 def = vect_get_vec_def_for_stmt_copy (vect_unknown_def_type, def);
4460 add_phi_arg (as_a <gphi *> (phi), def, loop_latch_edge (loop),
4461 UNKNOWN_LOCATION);
4463 if (dump_enabled_p ())
4465 dump_printf_loc (MSG_NOTE, vect_location,
4466 "transform reduction: created def-use cycle: ");
4467 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
4468 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, SSA_NAME_DEF_STMT (def), 0);
4473 /* For cond reductions we want to create a new vector (INDEX_COND_EXPR)
4474 which is updated with the current index of the loop for every match of
4475 the original loop's cond_expr (VEC_STMT). This results in a vector
4476 containing the last time the condition passed for that vector lane.
4477 The first match will be a 1 to allow 0 to be used for non-matching
4478 indexes. If there are no matches at all then the vector will be all
4479 zeroes. */
4480 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION)
4482 tree indx_before_incr, indx_after_incr;
4483 int nunits_out = TYPE_VECTOR_SUBPARTS (vectype);
4484 int k;
4486 gimple *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info);
4487 gcc_assert (gimple_assign_rhs_code (vec_stmt) == VEC_COND_EXPR);
4489 int scalar_precision
4490 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (TREE_TYPE (vectype)));
4491 tree cr_index_scalar_type = make_unsigned_type (scalar_precision);
4492 tree cr_index_vector_type = build_vector_type
4493 (cr_index_scalar_type, TYPE_VECTOR_SUBPARTS (vectype));
4495 /* First we create a simple vector induction variable which starts
4496 with the values {1,2,3,...} (SERIES_VECT) and increments by the
4497 vector size (STEP). */
4499 /* Create a {1,2,3,...} vector. */
4500 auto_vec<tree, 32> vtemp (nunits_out);
4501 for (k = 0; k < nunits_out; ++k)
4502 vtemp.quick_push (build_int_cst (cr_index_scalar_type, k + 1));
4503 tree series_vect = build_vector (cr_index_vector_type, vtemp);
4505 /* Create a vector of the step value. */
4506 tree step = build_int_cst (cr_index_scalar_type, nunits_out);
4507 tree vec_step = build_vector_from_val (cr_index_vector_type, step);
4509 /* Create an induction variable. */
4510 gimple_stmt_iterator incr_gsi;
4511 bool insert_after;
4512 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
4513 create_iv (series_vect, vec_step, NULL_TREE, loop, &incr_gsi,
4514 insert_after, &indx_before_incr, &indx_after_incr);
4516 /* Next create a new phi node vector (NEW_PHI_TREE) which starts
4517 filled with zeros (VEC_ZERO). */
4519 /* Create a vector of 0s. */
4520 tree zero = build_zero_cst (cr_index_scalar_type);
4521 tree vec_zero = build_vector_from_val (cr_index_vector_type, zero);
4523 /* Create a vector phi node. */
4524 tree new_phi_tree = make_ssa_name (cr_index_vector_type);
4525 new_phi = create_phi_node (new_phi_tree, loop->header);
4526 set_vinfo_for_stmt (new_phi,
4527 new_stmt_vec_info (new_phi, loop_vinfo));
4528 add_phi_arg (as_a <gphi *> (new_phi), vec_zero,
4529 loop_preheader_edge (loop), UNKNOWN_LOCATION);
4531 /* Now take the condition from the loop's original cond_expr
4532 (VEC_STMT) and produce a new cond_expr (INDEX_COND_EXPR) which for
4533 every match uses values from the induction variable
4534 (INDEX_BEFORE_INCR) otherwise uses values from the phi node
4535 (NEW_PHI_TREE).
4536 Finally, we update the phi (NEW_PHI_TREE) to take the value of
4537 the new cond_expr (INDEX_COND_EXPR). */
4539 /* Duplicate the condition from vec_stmt. */
4540 tree ccompare = unshare_expr (gimple_assign_rhs1 (vec_stmt));
4542 /* Create a conditional, where the condition is taken from vec_stmt
4543 (CCOMPARE), the 'then' value is the induction index (INDEX_BEFORE_INCR)
4544 and the 'else' value is the phi (NEW_PHI_TREE). */
4545 tree index_cond_expr = build3 (VEC_COND_EXPR, cr_index_vector_type,
4546 ccompare, indx_before_incr,
4547 new_phi_tree);
4548 induction_index = make_ssa_name (cr_index_vector_type);
4549 gimple *index_condition = gimple_build_assign (induction_index,
4550 index_cond_expr);
4551 gsi_insert_before (&incr_gsi, index_condition, GSI_SAME_STMT);
4552 stmt_vec_info index_vec_info = new_stmt_vec_info (index_condition,
4553 loop_vinfo);
4554 STMT_VINFO_VECTYPE (index_vec_info) = cr_index_vector_type;
4555 set_vinfo_for_stmt (index_condition, index_vec_info);
4557 /* Update the phi with the vec cond. */
4558 add_phi_arg (as_a <gphi *> (new_phi), induction_index,
4559 loop_latch_edge (loop), UNKNOWN_LOCATION);
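/* As an illustration: for a four-lane vector the induction variable starts
   at {1, 2, 3, 4} and steps by {4, 4, 4, 4}.  If the condition matches only
   in lanes 0 and 2 of the first vector iteration, the index vector becomes
   {1, 0, 3, 0} and stays that way, i.e. it records for each lane the 1-based
   scalar iteration of its last match, or 0 if the lane never matched.  */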
4562 /* 2. Create epilog code.
4563 The reduction epilog code operates across the elements of the vector
4564 of partial results computed by the vectorized loop.
4565 The reduction epilog code consists of:
4567 step 1: compute the scalar result in a vector (v_out2)
4568 step 2: extract the scalar result (s_out3) from the vector (v_out2)
4569 step 3: adjust the scalar result (s_out3) if needed.
4571 Step 1 can be accomplished using one of the following three schemes:
4572 (scheme 1) using reduc_fn, if available.
4573 (scheme 2) using whole-vector shifts, if available.
4574 (scheme 3) using a scalar loop. In this case steps 1+2 above are
4575 combined.
4577 The overall epilog code looks like this:
4579 s_out0 = phi <s_loop> # original EXIT_PHI
4580 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
4581 v_out2 = reduce <v_out1> # step 1
4582 s_out3 = extract_field <v_out2, 0> # step 2
4583 s_out4 = adjust_result <s_out3> # step 3
4585 (step 3 is optional, and steps 1 and 2 may be combined).
4586 Lastly, the uses of s_out0 are replaced by s_out4. */
4589 /* 2.1 Create new loop-exit-phis to preserve loop-closed form:
4590 v_out1 = phi <VECT_DEF>
4591 Store them in NEW_PHIS. */
4593 exit_bb = single_exit (loop)->dest;
4594 prev_phi_info = NULL;
4595 new_phis.create (vect_defs.length ());
4596 FOR_EACH_VEC_ELT (vect_defs, i, def)
4598 for (j = 0; j < ncopies; j++)
4600 tree new_def = copy_ssa_name (def);
4601 phi = create_phi_node (new_def, exit_bb);
4602 set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, loop_vinfo));
4603 if (j == 0)
4604 new_phis.quick_push (phi);
4605 else
4607 def = vect_get_vec_def_for_stmt_copy (dt, def);
4608 STMT_VINFO_RELATED_STMT (prev_phi_info) = phi;
4611 SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, def);
4612 prev_phi_info = vinfo_for_stmt (phi);
4616 /* The epilogue is created for the outer-loop, i.e., for the loop being
4617 vectorized. Create exit phis for the outer loop. */
4618 if (double_reduc)
4620 loop = outer_loop;
4621 exit_bb = single_exit (loop)->dest;
4622 inner_phis.create (vect_defs.length ());
4623 FOR_EACH_VEC_ELT (new_phis, i, phi)
4625 tree new_result = copy_ssa_name (PHI_RESULT (phi));
4626 gphi *outer_phi = create_phi_node (new_result, exit_bb);
4627 SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx,
4628 PHI_RESULT (phi));
4629 set_vinfo_for_stmt (outer_phi, new_stmt_vec_info (outer_phi,
4630 loop_vinfo));
4631 inner_phis.quick_push (phi);
4632 new_phis[i] = outer_phi;
4633 prev_phi_info = vinfo_for_stmt (outer_phi);
4634 while (STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi)))
4636 phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi));
4637 new_result = copy_ssa_name (PHI_RESULT (phi));
4638 outer_phi = create_phi_node (new_result, exit_bb);
4639 SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx,
4640 PHI_RESULT (phi));
4641 set_vinfo_for_stmt (outer_phi, new_stmt_vec_info (outer_phi,
4642 loop_vinfo));
4643 STMT_VINFO_RELATED_STMT (prev_phi_info) = outer_phi;
4644 prev_phi_info = vinfo_for_stmt (outer_phi);
4649 exit_gsi = gsi_after_labels (exit_bb);
4651 /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
4652 (i.e. when reduc_fn is not available) and in the final adjustment
4653 code (if needed). Also get the original scalar reduction variable as
4654 defined in the loop. In case STMT is a "pattern-stmt" (i.e. - it
4655 represents a reduction pattern), the tree-code and scalar-def are
4656 taken from the original stmt that the pattern-stmt (STMT) replaces.
4657 Otherwise (it is a regular reduction) - the tree-code and scalar-def
4658 are taken from STMT. */
4660 orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
4661 if (!orig_stmt)
4663 /* Regular reduction */
4664 orig_stmt = stmt;
4666 else
4668 /* Reduction pattern */
4669 stmt_vec_info stmt_vinfo = vinfo_for_stmt (orig_stmt);
4670 gcc_assert (STMT_VINFO_IN_PATTERN_P (stmt_vinfo));
4671 gcc_assert (STMT_VINFO_RELATED_STMT (stmt_vinfo) == stmt);
4674 code = gimple_assign_rhs_code (orig_stmt);
4675 /* For MINUS_EXPR the initial vector is [init_val,0,...,0], therefore,
4676 partial results are added and not subtracted. */
4677 if (code == MINUS_EXPR)
4678 code = PLUS_EXPR;
4680 scalar_dest = gimple_assign_lhs (orig_stmt);
4681 scalar_type = TREE_TYPE (scalar_dest);
4682 scalar_results.create (group_size);
4683 new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
4684 bitsize = TYPE_SIZE (scalar_type);
4686 /* In case this is a reduction in an inner-loop while vectorizing an outer
4687 loop - we don't need to extract a single scalar result at the end of the
4688 inner-loop (unless it is double reduction, i.e., the use of reduction is
4689 outside the outer-loop). The final vector of partial results will be used
4690 in the vectorized outer-loop, or reduced to a scalar result at the end of
4691 the outer-loop. */
4692 if (nested_in_vect_loop && !double_reduc)
4693 goto vect_finalize_reduction;
4695 /* SLP reduction without reduction chain, e.g.,
4696 # a1 = phi <a2, a0>
4697 # b1 = phi <b2, b0>
4698 a2 = operation (a1)
4699 b2 = operation (b1) */
4700 slp_reduc = (slp_node && !GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)));
4702 /* In case of reduction chain, e.g.,
4703 # a1 = phi <a3, a0>
4704 a2 = operation (a1)
4705 a3 = operation (a2),
4707 we may end up with more than one vector result. Here we reduce them to
4708 one vector. */
4709 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
4711 tree first_vect = PHI_RESULT (new_phis[0]);
4712 gassign *new_vec_stmt = NULL;
4713 vec_dest = vect_create_destination_var (scalar_dest, vectype);
4714 for (k = 1; k < new_phis.length (); k++)
4716 gimple *next_phi = new_phis[k];
4717 tree second_vect = PHI_RESULT (next_phi);
4718 tree tem = make_ssa_name (vec_dest, new_vec_stmt);
4719 new_vec_stmt = gimple_build_assign (tem, code,
4720 first_vect, second_vect);
4721 gsi_insert_before (&exit_gsi, new_vec_stmt, GSI_SAME_STMT);
4722 first_vect = tem;
4725 new_phi_result = first_vect;
4726 if (new_vec_stmt)
4728 new_phis.truncate (0);
4729 new_phis.safe_push (new_vec_stmt);
4732 /* Likewise if we couldn't use a single def-use cycle. */
4733 else if (ncopies > 1)
4735 gcc_assert (new_phis.length () == 1);
4736 tree first_vect = PHI_RESULT (new_phis[0]);
4737 gassign *new_vec_stmt = NULL;
4738 vec_dest = vect_create_destination_var (scalar_dest, vectype);
4739 gimple *next_phi = new_phis[0];
4740 for (int k = 1; k < ncopies; ++k)
4742 next_phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (next_phi));
4743 tree second_vect = PHI_RESULT (next_phi);
4744 tree tem = make_ssa_name (vec_dest, new_vec_stmt);
4745 new_vec_stmt = gimple_build_assign (tem, code,
4746 first_vect, second_vect);
4747 gsi_insert_before (&exit_gsi, new_vec_stmt, GSI_SAME_STMT);
4748 first_vect = tem;
4750 new_phi_result = first_vect;
4751 new_phis.truncate (0);
4752 new_phis.safe_push (new_vec_stmt);
4754 else
4755 new_phi_result = PHI_RESULT (new_phis[0]);
4757 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION
4758 && reduc_fn != IFN_LAST)
4760 /* For condition reductions, we have a vector (NEW_PHI_RESULT) containing
4761 various data values where the condition matched and another vector
4762 (INDUCTION_INDEX) containing all the indexes of those matches. We
4763 need to extract the last matching index (which will be the index with
4764 highest value) and use this to index into the data vector.
4765 For the case where there were no matches, the data vector will contain
4766 all default values and the index vector will be all zeros. */
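/* For example, with data vector {d0, d1, d2, d3} and index vector
   {1, 0, 7, 0}: the MAX reduction of the indexes gives 7, the EQ compare
   selects only lane 2, the VEC_COND produces {0, 0, d2, 0}, and the unsigned
   MAX reduction of that yields d2 - the value from the last matching lane.
   If no lane ever matched, all indexes are 0, every lane compares equal, and
   the reduction returns the common default value.  */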
4768 /* Get various versions of the type of the vector of indexes. */
4769 tree index_vec_type = TREE_TYPE (induction_index);
4770 gcc_checking_assert (TYPE_UNSIGNED (index_vec_type));
4771 tree index_scalar_type = TREE_TYPE (index_vec_type);
4772 tree index_vec_cmp_type = build_same_sized_truth_vector_type
4773 (index_vec_type);
4775 /* Get an unsigned integer version of the type of the data vector. */
4776 int scalar_precision
4777 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
4778 tree scalar_type_unsigned = make_unsigned_type (scalar_precision);
4779 tree vectype_unsigned = build_vector_type
4780 (scalar_type_unsigned, TYPE_VECTOR_SUBPARTS (vectype));
4782 /* First we need to create a vector (ZERO_VEC) of zeros and another
4783 vector (MAX_INDEX_VEC) filled with the last matching index, which we
4784 can create using a MAX reduction and then expanding.
4785 In the case where the loop never made any matches, the max index will
4786 be zero. */
4788 /* Vector of {0, 0, 0,...}. */
4789 tree zero_vec = make_ssa_name (vectype);
4790 tree zero_vec_rhs = build_zero_cst (vectype);
4791 gimple *zero_vec_stmt = gimple_build_assign (zero_vec, zero_vec_rhs);
4792 gsi_insert_before (&exit_gsi, zero_vec_stmt, GSI_SAME_STMT);
4794 /* Find maximum value from the vector of found indexes. */
4795 tree max_index = make_ssa_name (index_scalar_type);
4796 gcall *max_index_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
4797 1, induction_index);
4798 gimple_call_set_lhs (max_index_stmt, max_index);
4799 gsi_insert_before (&exit_gsi, max_index_stmt, GSI_SAME_STMT);
4801 /* Vector of {max_index, max_index, max_index,...}. */
4802 tree max_index_vec = make_ssa_name (index_vec_type);
4803 tree max_index_vec_rhs = build_vector_from_val (index_vec_type,
4804 max_index);
4805 gimple *max_index_vec_stmt = gimple_build_assign (max_index_vec,
4806 max_index_vec_rhs);
4807 gsi_insert_before (&exit_gsi, max_index_vec_stmt, GSI_SAME_STMT);
4809 /* Next we compare the new vector (MAX_INDEX_VEC) full of max indexes
4810 with the vector (INDUCTION_INDEX) of found indexes, choosing values
4811 from the data vector (NEW_PHI_RESULT) for matches, 0 (ZERO_VEC)
4812 otherwise. Only one value should match, resulting in a vector
4813 (VEC_COND) with one data value and the rest zeros.
4814 In the case where the loop never made any matches, every index will
4815 match, resulting in a vector with all data values (which will all be
4816 the default value). */
4818 /* Compare the max index vector to the vector of found indexes to find
4819 the position of the max value. */
4820 tree vec_compare = make_ssa_name (index_vec_cmp_type);
4821 gimple *vec_compare_stmt = gimple_build_assign (vec_compare, EQ_EXPR,
4822 induction_index,
4823 max_index_vec);
4824 gsi_insert_before (&exit_gsi, vec_compare_stmt, GSI_SAME_STMT);
4826 /* Use the compare to choose either values from the data vector or
4827 zero. */
4828 tree vec_cond = make_ssa_name (vectype);
4829 gimple *vec_cond_stmt = gimple_build_assign (vec_cond, VEC_COND_EXPR,
4830 vec_compare, new_phi_result,
4831 zero_vec);
4832 gsi_insert_before (&exit_gsi, vec_cond_stmt, GSI_SAME_STMT);
4834 /* Finally we need to extract the data value from the vector (VEC_COND)
4835 into a scalar (MATCHED_DATA_REDUC). Logically we want to do an OR
4836 reduction, but because this doesn't exist, we can use a MAX reduction
4837 instead. The data value might be signed or a float, so we need to cast
4838 it first.
4839 In the case where the loop never made any matches, the data values are
4840 all identical, and so will reduce down correctly. */
4842 /* Make the matched data values unsigned. */
4843 tree vec_cond_cast = make_ssa_name (vectype_unsigned);
4844 tree vec_cond_cast_rhs = build1 (VIEW_CONVERT_EXPR, vectype_unsigned,
4845 vec_cond);
4846 gimple *vec_cond_cast_stmt = gimple_build_assign (vec_cond_cast,
4847 VIEW_CONVERT_EXPR,
4848 vec_cond_cast_rhs);
4849 gsi_insert_before (&exit_gsi, vec_cond_cast_stmt, GSI_SAME_STMT);
4851 /* Reduce down to a scalar value. */
4852 tree data_reduc = make_ssa_name (scalar_type_unsigned);
4853 gcall *data_reduc_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
4854 1, vec_cond_cast);
4855 gimple_call_set_lhs (data_reduc_stmt, data_reduc);
4856 gsi_insert_before (&exit_gsi, data_reduc_stmt, GSI_SAME_STMT);
4858 /* Convert the reduced value back to the result type and set as the
4859 result. */
4860 gimple_seq stmts = NULL;
4861 new_temp = gimple_build (&stmts, VIEW_CONVERT_EXPR, scalar_type,
4862 data_reduc);
4863 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
4864 scalar_results.safe_push (new_temp);
4866 else if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION
4867 && reduc_fn == IFN_LAST)
4869 /* Condition reduction without supported IFN_REDUC_MAX. Generate
4870 the equivalent of:
4871 idx_val = induction_index[0];
4872 val = data_reduc[0];
4873 for (i = 1; i < nelts; ++i)
4874 if (induction_index[i] > idx_val)
4875 val = data_reduc[i], idx_val = induction_index[i];
4876 return val; */
4878 tree data_eltype = TREE_TYPE (TREE_TYPE (new_phi_result));
4879 tree idx_eltype = TREE_TYPE (TREE_TYPE (induction_index));
4880 unsigned HOST_WIDE_INT el_size = tree_to_uhwi (TYPE_SIZE (idx_eltype));
4881 unsigned HOST_WIDE_INT v_size
4882 = el_size * TYPE_VECTOR_SUBPARTS (TREE_TYPE (induction_index));
4883 tree idx_val = NULL_TREE, val = NULL_TREE;
4884 for (unsigned HOST_WIDE_INT off = 0; off < v_size; off += el_size)
4886 tree old_idx_val = idx_val;
4887 tree old_val = val;
4888 idx_val = make_ssa_name (idx_eltype);
4889 epilog_stmt = gimple_build_assign (idx_val, BIT_FIELD_REF,
4890 build3 (BIT_FIELD_REF, idx_eltype,
4891 induction_index,
4892 bitsize_int (el_size),
4893 bitsize_int (off)));
4894 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4895 val = make_ssa_name (data_eltype);
4896 epilog_stmt = gimple_build_assign (val, BIT_FIELD_REF,
4897 build3 (BIT_FIELD_REF,
4898 data_eltype,
4899 new_phi_result,
4900 bitsize_int (el_size),
4901 bitsize_int (off)));
4902 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4903 if (off != 0)
4905 tree new_idx_val = idx_val;
4906 tree new_val = val;
4907 if (off != v_size - el_size)
4909 new_idx_val = make_ssa_name (idx_eltype);
4910 epilog_stmt = gimple_build_assign (new_idx_val,
4911 MAX_EXPR, idx_val,
4912 old_idx_val);
4913 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4915 new_val = make_ssa_name (data_eltype);
4916 epilog_stmt = gimple_build_assign (new_val,
4917 COND_EXPR,
4918 build2 (GT_EXPR,
4919 boolean_type_node,
4920 idx_val,
4921 old_idx_val),
4922 val, old_val);
4923 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4924 idx_val = new_idx_val;
4925 val = new_val;
4928 /* Convert the reduced value back to the result type and set as the
4929 result. */
4930 gimple_seq stmts = NULL;
4931 val = gimple_convert (&stmts, scalar_type, val);
4932 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
4933 scalar_results.safe_push (val);
4936 /* 2.3 Create the reduction code, using one of the three schemes described
4937 above. In SLP we simply need to extract all the elements from the
4938 vector (without reducing them), so we use scalar shifts. */
4939 else if (reduc_fn != IFN_LAST && !slp_reduc)
4941 tree tmp;
4942 tree vec_elem_type;
4944 /* Case 1: Create:
4945 v_out2 = reduc_expr <v_out1> */
4947 if (dump_enabled_p ())
4948 dump_printf_loc (MSG_NOTE, vect_location,
4949 "Reduce using direct vector reduction.\n");
4951 vec_elem_type = TREE_TYPE (TREE_TYPE (new_phi_result));
4952 if (!useless_type_conversion_p (scalar_type, vec_elem_type))
4954 tree tmp_dest
4955 = vect_create_destination_var (scalar_dest, vec_elem_type);
4956 epilog_stmt = gimple_build_call_internal (reduc_fn, 1,
4957 new_phi_result);
4958 gimple_set_lhs (epilog_stmt, tmp_dest);
4959 new_temp = make_ssa_name (tmp_dest, epilog_stmt);
4960 gimple_set_lhs (epilog_stmt, new_temp);
4961 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4963 epilog_stmt = gimple_build_assign (new_scalar_dest, NOP_EXPR,
4964 new_temp);
4966 else
4968 epilog_stmt = gimple_build_call_internal (reduc_fn, 1,
4969 new_phi_result);
4970 gimple_set_lhs (epilog_stmt, new_scalar_dest);
4973 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
4974 gimple_set_lhs (epilog_stmt, new_temp);
4975 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4977 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
4978 == INTEGER_INDUC_COND_REDUCTION)
4980 /* Earlier we set the initial value to be zero. Check the result
4981 and if it is zero then replace with the original initial
4982 value. */
4983 tree zero = build_zero_cst (scalar_type);
4984 tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp, zero);
4986 tmp = make_ssa_name (new_scalar_dest);
4987 epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
4988 initial_def, new_temp);
4989 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4990 new_temp = tmp;
4993 scalar_results.safe_push (new_temp);
4995 else
4997 bool reduce_with_shift = have_whole_vector_shift (mode);
4998 int element_bitsize = tree_to_uhwi (bitsize);
4999 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
5000 tree vec_temp;
5002 /* COND reductions all do the final reduction with MAX_EXPR. */
5003 if (code == COND_EXPR)
5004 code = MAX_EXPR;
5006 /* Regardless of whether we have a whole vector shift, if we're
5007 emulating the operation via tree-vect-generic, we don't want
5008 to use it. Only the first round of the reduction is likely
5009 to still be profitable via emulation. */
5010 /* ??? It might be better to emit a reduction tree code here, so that
5011 tree-vect-generic can expand the first round via bit tricks. */
5012 if (!VECTOR_MODE_P (mode))
5013 reduce_with_shift = false;
5014 else
5016 optab optab = optab_for_tree_code (code, vectype, optab_default);
5017 if (optab_handler (optab, mode) == CODE_FOR_nothing)
5018 reduce_with_shift = false;
5021 if (reduce_with_shift && !slp_reduc)
5023 int nelements = vec_size_in_bits / element_bitsize;
5024 auto_vec_perm_indices sel (nelements);
5026 int elt_offset;
5028 tree zero_vec = build_zero_cst (vectype);
5029 /* Case 2: Create:
5030 for (offset = nelements/2; offset >= 1; offset/=2)
5032 Create: va' = vec_shift <va, offset>
5033 Create: va = vop <va, va'>
5034 } */
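/* As an illustration, for a four-element vector of partial results
   {a, b, c, d}:  shifting by two elements (zero-filled from ZERO_VEC) gives
   {c, d, 0, 0} and the vop yields {a+c, b+d, c, d}; shifting that by one
   element gives {b+d, c, d, 0} and the vop yields {a+b+c+d, ...}, so the
   full reduction ends up in lane 0, which step 2.4 below extracts.  */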
5036 tree rhs;
5038 if (dump_enabled_p ())
5039 dump_printf_loc (MSG_NOTE, vect_location,
5040 "Reduce using vector shifts\n");
5042 vec_dest = vect_create_destination_var (scalar_dest, vectype);
5043 new_temp = new_phi_result;
5044 for (elt_offset = nelements / 2;
5045 elt_offset >= 1;
5046 elt_offset /= 2)
5048 sel.truncate (0);
5049 calc_vec_perm_mask_for_shift (elt_offset, nelements, &sel);
5050 tree mask = vect_gen_perm_mask_any (vectype, sel);
5051 epilog_stmt = gimple_build_assign (vec_dest, VEC_PERM_EXPR,
5052 new_temp, zero_vec, mask);
5053 new_name = make_ssa_name (vec_dest, epilog_stmt);
5054 gimple_assign_set_lhs (epilog_stmt, new_name);
5055 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5057 epilog_stmt = gimple_build_assign (vec_dest, code, new_name,
5058 new_temp);
5059 new_temp = make_ssa_name (vec_dest, epilog_stmt);
5060 gimple_assign_set_lhs (epilog_stmt, new_temp);
5061 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5064 /* 2.4 Extract the final scalar result. Create:
5065 s_out3 = extract_field <v_out2, bitpos> */
5067 if (dump_enabled_p ())
5068 dump_printf_loc (MSG_NOTE, vect_location,
5069 "extract scalar result\n");
5071 rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp,
5072 bitsize, bitsize_zero_node);
5073 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
5074 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5075 gimple_assign_set_lhs (epilog_stmt, new_temp);
5076 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5077 scalar_results.safe_push (new_temp);
5079 else
5081 /* Case 3: Create:
5082 s = extract_field <v_out2, 0>
5083 for (offset = element_size;
5084 offset < vector_size;
5085 offset += element_size;)
5087 Create: s' = extract_field <v_out2, offset>
5088 Create: s = op <s, s'> // For non SLP cases
5089 } */
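/* For the same {a, b, c, d} example this emits four BIT_FIELD_REF extracts
   and, in the non-SLP case, three scalar ops:
   s = a; s = s op b; s = s op c; s = s op d.  In the SLP case the four
   extracted values are simply collected in SCALAR_RESULTS.  */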
5091 if (dump_enabled_p ())
5092 dump_printf_loc (MSG_NOTE, vect_location,
5093 "Reduce using scalar code.\n");
5095 vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
5096 FOR_EACH_VEC_ELT (new_phis, i, new_phi)
5098 int bit_offset;
5099 if (gimple_code (new_phi) == GIMPLE_PHI)
5100 vec_temp = PHI_RESULT (new_phi);
5101 else
5102 vec_temp = gimple_assign_lhs (new_phi);
5103 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, bitsize,
5104 bitsize_zero_node);
5105 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
5106 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5107 gimple_assign_set_lhs (epilog_stmt, new_temp);
5108 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5110 /* In SLP we don't need to apply the reduction operation, so we just
5111 collect the s' values in SCALAR_RESULTS. */
5112 if (slp_reduc)
5113 scalar_results.safe_push (new_temp);
5115 for (bit_offset = element_bitsize;
5116 bit_offset < vec_size_in_bits;
5117 bit_offset += element_bitsize)
5119 tree bitpos = bitsize_int (bit_offset);
5120 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp,
5121 bitsize, bitpos);
5123 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
5124 new_name = make_ssa_name (new_scalar_dest, epilog_stmt);
5125 gimple_assign_set_lhs (epilog_stmt, new_name);
5126 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5128 if (slp_reduc)
5130 /* In SLP we don't need to apply the reduction operation, so
5131 we just collect the s' values in SCALAR_RESULTS. */
5132 new_temp = new_name;
5133 scalar_results.safe_push (new_name);
5135 else
5137 epilog_stmt = gimple_build_assign (new_scalar_dest, code,
5138 new_name, new_temp);
5139 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5140 gimple_assign_set_lhs (epilog_stmt, new_temp);
5141 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5146 /* The only case where we need to reduce scalar results in SLP is
5147 unrolling. If the size of SCALAR_RESULTS is greater than
5148 GROUP_SIZE, we reduce them combining elements modulo
5149 GROUP_SIZE. */
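/* For example, with GROUP_SIZE == 2 and four scalar results
   {r0, r1, r2, r3} (two unrolled copies per reduction), the loop below
   combines them as r0 = r0 op r2 and r1 = r1 op r3, leaving one result per
   original reduction in the first GROUP_SIZE slots.  */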
5150 if (slp_reduc)
5152 tree res, first_res, new_res;
5153 gimple *new_stmt;
5155 /* Reduce multiple scalar results in case of SLP unrolling. */
5156 for (j = group_size; scalar_results.iterate (j, &res);
5157 j++)
5159 first_res = scalar_results[j % group_size];
5160 new_stmt = gimple_build_assign (new_scalar_dest, code,
5161 first_res, res);
5162 new_res = make_ssa_name (new_scalar_dest, new_stmt);
5163 gimple_assign_set_lhs (new_stmt, new_res);
5164 gsi_insert_before (&exit_gsi, new_stmt, GSI_SAME_STMT);
5165 scalar_results[j % group_size] = new_res;
5168 else
5169 /* Not SLP - we have one scalar to keep in SCALAR_RESULTS. */
5170 scalar_results.safe_push (new_temp);
5173 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5174 == INTEGER_INDUC_COND_REDUCTION)
5176 /* Earlier we set the initial value to be zero. Check the result
5177 and if it is zero then replace with the original initial
5178 value. */
5179 tree zero = build_zero_cst (scalar_type);
5180 tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp, zero);
5182 tree tmp = make_ssa_name (new_scalar_dest);
5183 epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
5184 initial_def, new_temp);
5185 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5186 scalar_results[0] = tmp;
5190 vect_finalize_reduction:
5192 if (double_reduc)
5193 loop = loop->inner;
5195 /* 2.5 Adjust the final result by the initial value of the reduction
5196 variable. (When such adjustment is not needed, then
5197 'adjustment_def' is zero). For example, if code is PLUS we create:
5198 new_temp = loop_exit_def + adjustment_def */
5200 if (adjustment_def)
5202 gcc_assert (!slp_reduc);
5203 if (nested_in_vect_loop)
5205 new_phi = new_phis[0];
5206 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) == VECTOR_TYPE);
5207 expr = build2 (code, vectype, PHI_RESULT (new_phi), adjustment_def);
5208 new_dest = vect_create_destination_var (scalar_dest, vectype);
5210 else
5212 new_temp = scalar_results[0];
5213 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
5214 expr = build2 (code, scalar_type, new_temp, adjustment_def);
5215 new_dest = vect_create_destination_var (scalar_dest, scalar_type);
5218 epilog_stmt = gimple_build_assign (new_dest, expr);
5219 new_temp = make_ssa_name (new_dest, epilog_stmt);
5220 gimple_assign_set_lhs (epilog_stmt, new_temp);
5221 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5222 if (nested_in_vect_loop)
5224 set_vinfo_for_stmt (epilog_stmt,
5225 new_stmt_vec_info (epilog_stmt, loop_vinfo));
5226 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (epilog_stmt)) =
5227 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (new_phi));
5229 if (!double_reduc)
5230 scalar_results.quick_push (new_temp);
5231 else
5232 scalar_results[0] = new_temp;
5234 else
5235 scalar_results[0] = new_temp;
5237 new_phis[0] = epilog_stmt;
5240 /* 2.6 Handle the loop-exit phis. Replace the uses of scalar loop-exit
5241 phis with new adjusted scalar results, i.e., replace use <s_out0>
5242 with use <s_out4>.
5244 Transform:
5245 loop_exit:
5246 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
5247 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
5248 v_out2 = reduce <v_out1>
5249 s_out3 = extract_field <v_out2, 0>
5250 s_out4 = adjust_result <s_out3>
5251 use <s_out0>
5252 use <s_out0>
5254 into:
5256 loop_exit:
5257 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
5258 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
5259 v_out2 = reduce <v_out1>
5260 s_out3 = extract_field <v_out2, 0>
5261 s_out4 = adjust_result <s_out3>
5262 use <s_out4>
5263 use <s_out4> */
5266 /* In an SLP reduction chain we reduce the vector results into one vector if
5267 necessary, hence we set GROUP_SIZE to 1 here. SCALAR_DEST is the LHS of
5268 the last stmt in the reduction chain, since we are looking for the loop
5269 exit phi node. */
5270 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
5272 gimple *dest_stmt = SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1];
5273 /* Handle reduction patterns. */
5274 if (STMT_VINFO_RELATED_STMT (vinfo_for_stmt (dest_stmt)))
5275 dest_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (dest_stmt));
5277 scalar_dest = gimple_assign_lhs (dest_stmt);
5278 group_size = 1;
5281 /* In SLP we may have several statements in NEW_PHIS and REDUCTION_PHIS (in
5282 case that GROUP_SIZE is greater than the vectorization factor). Therefore, we
5283 need to match SCALAR_RESULTS with corresponding statements. The first
5284 (GROUP_SIZE / number of new vector stmts) scalar results correspond to
5285 the first vector stmt, etc.
5286 (RATIO is equal to (GROUP_SIZE / number of new vector stmts)). */
5287 if (group_size > new_phis.length ())
5289 ratio = group_size / new_phis.length ();
5290 gcc_assert (!(group_size % new_phis.length ()));
5292 else
5293 ratio = 1;
5295 for (k = 0; k < group_size; k++)
5297 if (k % ratio == 0)
5299 epilog_stmt = new_phis[k / ratio];
5300 reduction_phi = reduction_phis[k / ratio];
5301 if (double_reduc)
5302 inner_phi = inner_phis[k / ratio];
5305 if (slp_reduc)
5307 gimple *current_stmt = SLP_TREE_SCALAR_STMTS (slp_node)[k];
5309 orig_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (current_stmt));
5310 /* SLP statements can't participate in patterns. */
5311 gcc_assert (!orig_stmt);
5312 scalar_dest = gimple_assign_lhs (current_stmt);
5315 phis.create (3);
5316 /* Find the loop-closed-use at the loop exit of the original scalar
5317 result. (The reduction result is expected to have two immediate uses -
5318 one at the latch block, and one at the loop exit). */
5319 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
5320 if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p)))
5321 && !is_gimple_debug (USE_STMT (use_p)))
5322 phis.safe_push (USE_STMT (use_p));
5324 /* While we expect to have found an exit_phi because of loop-closed-ssa
5325 form we can end up without one if the scalar cycle is dead. */
5327 FOR_EACH_VEC_ELT (phis, i, exit_phi)
5329 if (outer_loop)
5331 stmt_vec_info exit_phi_vinfo = vinfo_for_stmt (exit_phi);
5332 gphi *vect_phi;
5334 /* FORNOW. Currently not supporting the case that an inner-loop
5335 reduction is not used in the outer-loop (but only outside the
5336 outer-loop), unless it is a double reduction. */
5337 gcc_assert ((STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
5338 && !STMT_VINFO_LIVE_P (exit_phi_vinfo))
5339 || double_reduc);
5341 if (double_reduc)
5342 STMT_VINFO_VEC_STMT (exit_phi_vinfo) = inner_phi;
5343 else
5344 STMT_VINFO_VEC_STMT (exit_phi_vinfo) = epilog_stmt;
5345 if (!double_reduc
5346 || STMT_VINFO_DEF_TYPE (exit_phi_vinfo)
5347 != vect_double_reduction_def)
5348 continue;
5350 /* Handle double reduction:
5352 stmt1: s1 = phi <s0, s2> - double reduction phi (outer loop)
5353 stmt2: s3 = phi <s1, s4> - (regular) reduc phi (inner loop)
5354 stmt3: s4 = use (s3) - (regular) reduc stmt (inner loop)
5355 stmt4: s2 = phi <s4> - double reduction stmt (outer loop)
5357 At that point the regular reduction (stmt2 and stmt3) is
5358 already vectorized, as well as the exit phi node, stmt4.
5359 Here we vectorize the phi node of double reduction, stmt1, and
5360 update all relevant statements. */
5362 /* Go through all the uses of s2 to find double reduction phi
5363 node, i.e., stmt1 above. */
5364 orig_name = PHI_RESULT (exit_phi);
5365 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
5367 stmt_vec_info use_stmt_vinfo;
5368 stmt_vec_info new_phi_vinfo;
5369 tree vect_phi_init, preheader_arg, vect_phi_res;
5370 basic_block bb = gimple_bb (use_stmt);
5371 gimple *use;
5373 /* Check that USE_STMT is really a double reduction phi
5374 node. */
5375 if (gimple_code (use_stmt) != GIMPLE_PHI
5376 || gimple_phi_num_args (use_stmt) != 2
5377 || bb->loop_father != outer_loop)
5378 continue;
5379 use_stmt_vinfo = vinfo_for_stmt (use_stmt);
5380 if (!use_stmt_vinfo
5381 || STMT_VINFO_DEF_TYPE (use_stmt_vinfo)
5382 != vect_double_reduction_def)
5383 continue;
5385 /* Create vector phi node for double reduction:
5386 vs1 = phi <vs0, vs2>
5387 vs1 was created previously in this function by a call to
5388 vect_get_vec_def_for_operand and is stored in
5389 vec_initial_def;
5390 vs2 is defined by INNER_PHI, the vectorized EXIT_PHI;
5391 vs0 is created here. */
5393 /* Create vector phi node. */
5394 vect_phi = create_phi_node (vec_initial_def, bb);
5395 new_phi_vinfo = new_stmt_vec_info (vect_phi,
5396 loop_vec_info_for_loop (outer_loop));
5397 set_vinfo_for_stmt (vect_phi, new_phi_vinfo);
5399 /* Create vs0 - initial def of the double reduction phi. */
5400 preheader_arg = PHI_ARG_DEF_FROM_EDGE (use_stmt,
5401 loop_preheader_edge (outer_loop));
5402 vect_phi_init = get_initial_def_for_reduction
5403 (stmt, preheader_arg, NULL);
5405 /* Update phi node arguments with vs0 and vs2. */
5406 add_phi_arg (vect_phi, vect_phi_init,
5407 loop_preheader_edge (outer_loop),
5408 UNKNOWN_LOCATION);
5409 add_phi_arg (vect_phi, PHI_RESULT (inner_phi),
5410 loop_latch_edge (outer_loop), UNKNOWN_LOCATION);
5411 if (dump_enabled_p ())
5413 dump_printf_loc (MSG_NOTE, vect_location,
5414 "created double reduction phi node: ");
5415 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, vect_phi, 0);
5418 vect_phi_res = PHI_RESULT (vect_phi);
5420 /* Replace the use, i.e., set the correct vs1 in the regular
5421 reduction phi node. FORNOW, NCOPIES is always 1, so the
5422 loop is redundant. */
5423 use = reduction_phi;
5424 for (j = 0; j < ncopies; j++)
5426 edge pr_edge = loop_preheader_edge (loop);
5427 SET_PHI_ARG_DEF (use, pr_edge->dest_idx, vect_phi_res);
5428 use = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (use));
5434 phis.release ();
5435 if (nested_in_vect_loop)
5437 if (double_reduc)
5438 loop = outer_loop;
5439 else
5440 continue;
5443 phis.create (3);
5444 /* Find the loop-closed-use at the loop exit of the original scalar
5445 result. (The reduction result is expected to have two immediate uses,
5446 one at the latch block, and one at the loop exit). For double
5447 reductions we are looking for exit phis of the outer loop. */
5448 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
5450 if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
5452 if (!is_gimple_debug (USE_STMT (use_p)))
5453 phis.safe_push (USE_STMT (use_p));
5455 else
5457 if (double_reduc && gimple_code (USE_STMT (use_p)) == GIMPLE_PHI)
5459 tree phi_res = PHI_RESULT (USE_STMT (use_p));
5461 FOR_EACH_IMM_USE_FAST (phi_use_p, phi_imm_iter, phi_res)
5463 if (!flow_bb_inside_loop_p (loop,
5464 gimple_bb (USE_STMT (phi_use_p)))
5465 && !is_gimple_debug (USE_STMT (phi_use_p)))
5466 phis.safe_push (USE_STMT (phi_use_p));
5472 FOR_EACH_VEC_ELT (phis, i, exit_phi)
5474 /* Replace the uses: */
5475 orig_name = PHI_RESULT (exit_phi);
5476 scalar_result = scalar_results[k];
5477 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
5478 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
5479 SET_USE (use_p, scalar_result);
5482 phis.release ();
5487 /* Function is_nonwrapping_integer_induction.
5489 Check if STMT (which is part of loop LOOP) is an induction that both
5490 increments and does not cause overflow. */
5492 static bool
5493 is_nonwrapping_integer_induction (gimple *stmt, struct loop *loop)
5495 stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
5496 tree base = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo);
5497 tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo);
5498 tree lhs_type = TREE_TYPE (gimple_phi_result (stmt));
5499 widest_int ni, max_loop_value, lhs_max;
5500 bool overflow = false;
5502 /* Make sure the loop is integer based. */
5503 if (TREE_CODE (base) != INTEGER_CST
5504 || TREE_CODE (step) != INTEGER_CST)
5505 return false;
5507 /* Check that the induction increments. */
5508 if (tree_int_cst_sgn (step) == -1)
5509 return false;
5511 /* Check that the max size of the loop will not wrap. */
5513 if (TYPE_OVERFLOW_UNDEFINED (lhs_type))
5514 return true;
5516 if (! max_stmt_executions (loop, &ni))
5517 return false;
5519 max_loop_value = wi::mul (wi::to_widest (step), ni, TYPE_SIGN (lhs_type),
5520 &overflow);
5521 if (overflow)
5522 return false;
5524 max_loop_value = wi::add (wi::to_widest (base), max_loop_value,
5525 TYPE_SIGN (lhs_type), &overflow);
5526 if (overflow)
5527 return false;
5529 return (wi::min_precision (max_loop_value, TYPE_SIGN (lhs_type))
5530 <= TYPE_PRECISION (lhs_type));
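/* A worked example of the check above, assuming a 32-bit unsigned IV with
   BASE == 10, STEP == 4 and at most 100 executions of the statement:
   max_loop_value == 10 + 4 * 100 == 410, which needs only 9 bits and is
   therefore well within the 32-bit precision, so the induction is known
   not to wrap.  */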
5533 /* Function vectorizable_reduction.
5535 Check if STMT performs a reduction operation that can be vectorized.
5536 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
5537 stmt to replace it, put it in VEC_STMT, and insert it at GSI.
5538 Return FALSE if not a vectorizable STMT, TRUE otherwise.
5540 This function also handles reduction idioms (patterns) that have been
5541 recognized in advance during vect_pattern_recog. In this case, STMT may be
5542 of this form:
5543 X = pattern_expr (arg0, arg1, ..., X)
5544 and its STMT_VINFO_RELATED_STMT points to the last stmt in the original
5545 sequence that had been detected and replaced by the pattern-stmt (STMT).
5547 This function also handles reduction of condition expressions, for example:
5548 for (int i = 0; i < N; i++)
5549 if (a[i] < value)
5550 last = a[i];
5551 This is handled by vectorising the loop and creating an additional vector
5552 containing the loop indexes for which "a[i] < value" was true. In the
5553 function epilogue this is reduced to a single max value and then used to
5554 index into the vector of results.
5556 In some cases of reduction patterns, the type of the reduction variable X is
5557 different than the type of the other arguments of STMT.
5558 In such cases, the vectype that is used when transforming STMT into a vector
5559 stmt is different than the vectype that is used to determine the
5560 vectorization factor, because it consists of a different number of elements
5561 than the actual number of elements that are being operated upon in parallel.
5563 For example, consider an accumulation of shorts into an int accumulator.
5564 On some targets it's possible to vectorize this pattern operating on 8
5565 shorts at a time (hence, the vectype for purposes of determining the
5566 vectorization factor should be V8HI); on the other hand, the vectype that
5567 is used to create the vector form is actually V4SI (the type of the result).
5569 Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
5570 indicates what is the actual level of parallelism (V8HI in the example), so
5571 that the right vectorization factor would be derived. This vectype
5572 corresponds to the type of arguments to the reduction stmt, and should *NOT*
5573 be used to create the vectorized stmt. The right vectype for the vectorized
5574 stmt is obtained from the type of the result X:
5575 get_vectype_for_scalar_type (TREE_TYPE (X))
5577 This means that, contrary to "regular" reductions (or "regular" stmts in
5578 general), the following equation:
5579 STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (TREE_TYPE (X))
5580 does *NOT* necessarily hold for reduction patterns. */
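/* A small source-level sketch of the pattern case described above,
   assuming a target with V8HI and V4SI vectors:

     short b[N];
     int sum = 0;
     for (i = 0; i < N; i++)
       sum += b[i];              // recognized as a widening-sum pattern

   The vectorization factor is derived from V8HI (the type of b[i]),
   while the vectorized reduction stmt itself produces V4SI values
   (the type of sum).  */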
5582 bool
5583 vectorizable_reduction (gimple *stmt, gimple_stmt_iterator *gsi,
5584 gimple **vec_stmt, slp_tree slp_node,
5585 slp_instance slp_node_instance)
5587 tree vec_dest;
5588 tree scalar_dest;
5589 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5590 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
5591 tree vectype_in = NULL_TREE;
5592 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5593 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5594 enum tree_code code, orig_code;
5595 internal_fn reduc_fn;
5596 machine_mode vec_mode;
5597 int op_type;
5598 optab optab;
5599 tree new_temp = NULL_TREE;
5600 gimple *def_stmt;
5601 enum vect_def_type dt, cond_reduc_dt = vect_unknown_def_type;
5602 tree scalar_type;
5603 bool is_simple_use;
5604 gimple *orig_stmt;
5605 stmt_vec_info orig_stmt_info = NULL;
5606 int i;
5607 int ncopies;
5608 int epilog_copies;
5609 stmt_vec_info prev_stmt_info, prev_phi_info;
5610 bool single_defuse_cycle = false;
5611 gimple *new_stmt = NULL;
5612 int j;
5613 tree ops[3];
5614 enum vect_def_type dts[3];
5615 bool nested_cycle = false, found_nested_cycle_def = false;
5616 bool double_reduc = false;
5617 basic_block def_bb;
5618 struct loop * def_stmt_loop, *outer_loop = NULL;
5619 tree def_arg;
5620 gimple *def_arg_stmt;
5621 auto_vec<tree> vec_oprnds0;
5622 auto_vec<tree> vec_oprnds1;
5623 auto_vec<tree> vec_oprnds2;
5624 auto_vec<tree> vect_defs;
5625 auto_vec<gimple *> phis;
5626 int vec_num;
5627 tree def0, tem;
5628 bool first_p = true;
5629 tree cr_index_scalar_type = NULL_TREE, cr_index_vector_type = NULL_TREE;
5630 tree cond_reduc_val = NULL_TREE;
5632 /* Make sure it was already recognized as a reduction computation. */
5633 if (STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmt)) != vect_reduction_def
5634 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmt)) != vect_nested_cycle)
5635 return false;
5637 if (nested_in_vect_loop_p (loop, stmt))
5639 outer_loop = loop;
5640 loop = loop->inner;
5641 nested_cycle = true;
5644 /* In case of reduction chain we switch to the first stmt in the chain, but
5645 we don't update STMT_INFO, since only the last stmt is marked as reduction
5646 and has reduction properties. */
5647 if (GROUP_FIRST_ELEMENT (stmt_info)
5648 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
5650 stmt = GROUP_FIRST_ELEMENT (stmt_info);
5651 first_p = false;
5654 if (gimple_code (stmt) == GIMPLE_PHI)
5656 /* Analysis is fully done on the reduction stmt invocation. */
5657 if (! vec_stmt)
5659 if (slp_node)
5660 slp_node_instance->reduc_phis = slp_node;
5662 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
5663 return true;
5666 gimple *reduc_stmt = STMT_VINFO_REDUC_DEF (stmt_info);
5667 if (STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (reduc_stmt)))
5668 reduc_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (reduc_stmt));
5670 gcc_assert (is_gimple_assign (reduc_stmt));
5671 for (unsigned k = 1; k < gimple_num_ops (reduc_stmt); ++k)
5673 tree op = gimple_op (reduc_stmt, k);
5674 if (op == gimple_phi_result (stmt))
5675 continue;
5676 if (k == 1
5677 && gimple_assign_rhs_code (reduc_stmt) == COND_EXPR)
5678 continue;
5679 tem = get_vectype_for_scalar_type (TREE_TYPE (op));
5680 if (! vectype_in
5681 || TYPE_VECTOR_SUBPARTS (tem) < TYPE_VECTOR_SUBPARTS (vectype_in))
5682 vectype_in = tem;
5683 break;
5685 gcc_assert (vectype_in);
5687 if (slp_node)
5688 ncopies = 1;
5689 else
5690 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
5692 use_operand_p use_p;
5693 gimple *use_stmt;
5694 if (ncopies > 1
5695 && (STMT_VINFO_RELEVANT (vinfo_for_stmt (reduc_stmt))
5696 <= vect_used_only_live)
5697 && single_imm_use (gimple_phi_result (stmt), &use_p, &use_stmt)
5698 && (use_stmt == reduc_stmt
5699 || (STMT_VINFO_RELATED_STMT (vinfo_for_stmt (use_stmt))
5700 == reduc_stmt)))
5701 single_defuse_cycle = true;
5703 /* Create the destination vector */
5704 scalar_dest = gimple_assign_lhs (reduc_stmt);
5705 vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
5707 if (slp_node)
5708 /* The size vect_schedule_slp_instance computes is off for us. */
5709 vec_num = ((LOOP_VINFO_VECT_FACTOR (loop_vinfo)
5710 * SLP_TREE_SCALAR_STMTS (slp_node).length ())
5711 / TYPE_VECTOR_SUBPARTS (vectype_in));
5712 else
5713 vec_num = 1;
5715 /* Generate the reduction PHIs upfront. */
5716 prev_phi_info = NULL;
5717 for (j = 0; j < ncopies; j++)
5719 if (j == 0 || !single_defuse_cycle)
5721 for (i = 0; i < vec_num; i++)
5723 /* Create the reduction-phi that defines the reduction
5724 operand. */
5725 gimple *new_phi = create_phi_node (vec_dest, loop->header);
5726 set_vinfo_for_stmt (new_phi,
5727 new_stmt_vec_info (new_phi, loop_vinfo));
5729 if (slp_node)
5730 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_phi);
5731 else
5733 if (j == 0)
5734 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_phi;
5735 else
5736 STMT_VINFO_RELATED_STMT (prev_phi_info) = new_phi;
5737 prev_phi_info = vinfo_for_stmt (new_phi);
5743 return true;
5746 /* 1. Is vectorizable reduction? */
5747 /* Not supportable if the reduction variable is used in the loop, unless
5748 it's a reduction chain. */
5749 if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer
5750 && !GROUP_FIRST_ELEMENT (stmt_info))
5751 return false;
5753 /* Reductions that are not used even in an enclosing outer-loop
5754 are expected to be "live" (used out of the loop). */
5755 if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope
5756 && !STMT_VINFO_LIVE_P (stmt_info))
5757 return false;
5759 /* 2. Has this been recognized as a reduction pattern?
5761 Check if STMT represents a pattern that has been recognized
5762 in earlier analysis stages. For stmts that represent a pattern,
5763 the STMT_VINFO_RELATED_STMT field records the last stmt in
5764 the original sequence that constitutes the pattern. */
5766 orig_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt));
5767 if (orig_stmt)
5769 orig_stmt_info = vinfo_for_stmt (orig_stmt);
5770 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
5771 gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
5774 /* 3. Check the operands of the operation. The first operands are defined
5775 inside the loop body. The last operand is the reduction variable,
5776 which is defined by the loop-header-phi. */
5778 gcc_assert (is_gimple_assign (stmt));
5780 /* Flatten RHS. */
5781 switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
5783 case GIMPLE_BINARY_RHS:
5784 code = gimple_assign_rhs_code (stmt);
5785 op_type = TREE_CODE_LENGTH (code);
5786 gcc_assert (op_type == binary_op);
5787 ops[0] = gimple_assign_rhs1 (stmt);
5788 ops[1] = gimple_assign_rhs2 (stmt);
5789 break;
5791 case GIMPLE_TERNARY_RHS:
5792 code = gimple_assign_rhs_code (stmt);
5793 op_type = TREE_CODE_LENGTH (code);
5794 gcc_assert (op_type == ternary_op);
5795 ops[0] = gimple_assign_rhs1 (stmt);
5796 ops[1] = gimple_assign_rhs2 (stmt);
5797 ops[2] = gimple_assign_rhs3 (stmt);
5798 break;
5800 case GIMPLE_UNARY_RHS:
5801 return false;
5803 default:
5804 gcc_unreachable ();
5807 if (code == COND_EXPR && slp_node)
5808 return false;
5810 scalar_dest = gimple_assign_lhs (stmt);
5811 scalar_type = TREE_TYPE (scalar_dest);
5812 if (!POINTER_TYPE_P (scalar_type) && !INTEGRAL_TYPE_P (scalar_type)
5813 && !SCALAR_FLOAT_TYPE_P (scalar_type))
5814 return false;
5816 /* Do not try to vectorize bit-precision reductions. */
5817 if (!type_has_mode_precision_p (scalar_type))
5818 return false;
5820 /* All uses but the last are expected to be defined in the loop.
5821 The last use is the reduction variable. In case of nested cycle this
5822 assumption is not true: we use reduc_index to record the index of the
5823 reduction variable. */
5824 gimple *reduc_def_stmt = NULL;
5825 int reduc_index = -1;
5826 for (i = 0; i < op_type; i++)
5828 /* The condition of COND_EXPR is checked in vectorizable_condition(). */
5829 if (i == 0 && code == COND_EXPR)
5830 continue;
5832 is_simple_use = vect_is_simple_use (ops[i], loop_vinfo,
5833 &def_stmt, &dts[i], &tem);
5834 dt = dts[i];
5835 gcc_assert (is_simple_use);
5836 if (dt == vect_reduction_def)
5838 reduc_def_stmt = def_stmt;
5839 reduc_index = i;
5840 continue;
5842 else if (tem)
5844 /* To properly compute ncopies we are interested in the widest
5845 input type in case we're looking at a widening accumulation. */
5846 if (!vectype_in
5847 || TYPE_VECTOR_SUBPARTS (vectype_in) > TYPE_VECTOR_SUBPARTS (tem))
5848 vectype_in = tem;
5851 if (dt != vect_internal_def
5852 && dt != vect_external_def
5853 && dt != vect_constant_def
5854 && dt != vect_induction_def
5855 && !(dt == vect_nested_cycle && nested_cycle))
5856 return false;
5858 if (dt == vect_nested_cycle)
5860 found_nested_cycle_def = true;
5861 reduc_def_stmt = def_stmt;
5862 reduc_index = i;
5865 if (i == 1 && code == COND_EXPR)
5867 /* Record how value of COND_EXPR is defined. */
5868 if (dt == vect_constant_def)
5870 cond_reduc_dt = dt;
5871 cond_reduc_val = ops[i];
5873 if (dt == vect_induction_def && def_stmt != NULL
5874 && is_nonwrapping_integer_induction (def_stmt, loop))
5875 cond_reduc_dt = dt;
5879 if (!vectype_in)
5880 vectype_in = vectype_out;
5882 /* When vectorizing a reduction chain w/o SLP the reduction PHI is not
5883 directly used in stmt. */
5884 if (reduc_index == -1)
5886 if (orig_stmt)
5887 reduc_def_stmt = STMT_VINFO_REDUC_DEF (orig_stmt_info);
5888 else
5889 reduc_def_stmt = STMT_VINFO_REDUC_DEF (stmt_info);
5892 if (! reduc_def_stmt || gimple_code (reduc_def_stmt) != GIMPLE_PHI)
5893 return false;
5895 if (!(reduc_index == -1
5896 || dts[reduc_index] == vect_reduction_def
5897 || dts[reduc_index] == vect_nested_cycle
5898 || ((dts[reduc_index] == vect_internal_def
5899 || dts[reduc_index] == vect_external_def
5900 || dts[reduc_index] == vect_constant_def
5901 || dts[reduc_index] == vect_induction_def)
5902 && nested_cycle && found_nested_cycle_def)))
5904 /* For pattern recognized stmts, orig_stmt might be a reduction,
5905 but some helper statements for the pattern might not, or
5906 might be COND_EXPRs with reduction uses in the condition. */
5907 gcc_assert (orig_stmt);
5908 return false;
5911 stmt_vec_info reduc_def_info = vinfo_for_stmt (reduc_def_stmt);
5912 enum vect_reduction_type v_reduc_type
5913 = STMT_VINFO_REDUC_TYPE (reduc_def_info);
5914 gimple *tmp = STMT_VINFO_REDUC_DEF (reduc_def_info);
5916 STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) = v_reduc_type;
5917 /* If we have a condition reduction, see if we can simplify it further. */
5918 if (v_reduc_type == COND_REDUCTION)
5920 if (cond_reduc_dt == vect_induction_def)
5922 if (dump_enabled_p ())
5923 dump_printf_loc (MSG_NOTE, vect_location,
5924 "condition expression based on "
5925 "integer induction.\n");
5926 STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5927 = INTEGER_INDUC_COND_REDUCTION;
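/* A sketch of the kind of loop this covers (assuming i is a nonwrapping
   integer induction):

     int last = -1;
     for (int i = 0; i < N; i++)
       if (a[i] < bound)
         last = i;

   Because the stored value is the induction variable itself, the epilog
   can, roughly speaking, reduce the collected lane values with MAX_EXPR.  */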
5930 /* Loop peeling modifies the initial value of the reduction PHI, which
5931 makes the reduction stmt that is going to be transformed differ from
5932 the original stmt that was analyzed. We need to record the reduction
5933 code for a CONST_COND_REDUCTION type reduction at analysis time, so
5934 that it can be used directly at transform time. */
5935 if (STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info) == MAX_EXPR
5936 || STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info) == MIN_EXPR)
5938 /* Also set the reduction type to CONST_COND_REDUCTION. */
5939 gcc_assert (cond_reduc_dt == vect_constant_def);
5940 STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) = CONST_COND_REDUCTION;
5942 else if (cond_reduc_dt == vect_constant_def)
5944 enum vect_def_type cond_initial_dt;
5945 gimple *def_stmt = SSA_NAME_DEF_STMT (ops[reduc_index]);
5946 tree cond_initial_val
5947 = PHI_ARG_DEF_FROM_EDGE (def_stmt, loop_preheader_edge (loop));
5949 gcc_assert (cond_reduc_val != NULL_TREE);
5950 vect_is_simple_use (cond_initial_val, loop_vinfo,
5951 &def_stmt, &cond_initial_dt);
5952 if (cond_initial_dt == vect_constant_def
5953 && types_compatible_p (TREE_TYPE (cond_initial_val),
5954 TREE_TYPE (cond_reduc_val)))
5956 tree e = fold_binary (LE_EXPR, boolean_type_node,
5957 cond_initial_val, cond_reduc_val);
5958 if (e && (integer_onep (e) || integer_zerop (e)))
5960 if (dump_enabled_p ())
5961 dump_printf_loc (MSG_NOTE, vect_location,
5962 "condition expression based on "
5963 "compile time constant.\n");
5964 /* Record reduction code at analysis stage. */
5965 STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info)
5966 = integer_onep (e) ? MAX_EXPR : MIN_EXPR;
5967 STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5968 = CONST_COND_REDUCTION;
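/* For instance (a sketch), the following qualifies as a
   CONST_COND_REDUCTION:

     int last = -1;
     for (i = 0; i < N; i++)
       if (a[i] < bound)
         last = 7;

   Both the initial value (-1) and the reduced value (7) are compile-time
   constants and -1 <= 7, so a MAX_EXPR over the lanes gives the right
   answer in the epilog.  */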
5974 if (orig_stmt)
5975 gcc_assert (tmp == orig_stmt
5976 || GROUP_FIRST_ELEMENT (vinfo_for_stmt (tmp)) == orig_stmt);
5977 else
5978 /* We changed STMT to be the first stmt in reduction chain, hence we
5979 check that in this case the first element in the chain is STMT. */
5980 gcc_assert (stmt == tmp
5981 || GROUP_FIRST_ELEMENT (vinfo_for_stmt (tmp)) == stmt);
5983 if (STMT_VINFO_LIVE_P (vinfo_for_stmt (reduc_def_stmt)))
5984 return false;
5986 if (slp_node)
5987 ncopies = 1;
5988 else
5989 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
5991 gcc_assert (ncopies >= 1);
5993 vec_mode = TYPE_MODE (vectype_in);
5995 if (code == COND_EXPR)
5997 /* Only call during the analysis stage, otherwise we'll lose
5998 STMT_VINFO_TYPE. */
5999 if (!vec_stmt && !vectorizable_condition (stmt, gsi, NULL,
6000 ops[reduc_index], 0, NULL))
6002 if (dump_enabled_p ())
6003 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6004 "unsupported condition in reduction\n");
6005 return false;
6008 else
6010 /* 4. Supportable by target? */
6012 if (code == LSHIFT_EXPR || code == RSHIFT_EXPR
6013 || code == LROTATE_EXPR || code == RROTATE_EXPR)
6015 /* Shifts and rotates are only supported by vectorizable_shift,
6016 not vectorizable_reduction. */
6017 if (dump_enabled_p ())
6018 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6019 "unsupported shift or rotation.\n");
6020 return false;
6023 /* 4.1. check support for the operation in the loop */
6024 optab = optab_for_tree_code (code, vectype_in, optab_default);
6025 if (!optab)
6027 if (dump_enabled_p ())
6028 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6029 "no optab.\n");
6031 return false;
6034 if (optab_handler (optab, vec_mode) == CODE_FOR_nothing)
6036 if (dump_enabled_p ())
6037 dump_printf (MSG_NOTE, "op not supported by target.\n");
6039 if (GET_MODE_SIZE (vec_mode) != UNITS_PER_WORD
6040 || !vect_worthwhile_without_simd_p (loop_vinfo, code))
6041 return false;
6043 if (dump_enabled_p ())
6044 dump_printf (MSG_NOTE, "proceeding using word mode.\n");
6047 /* Worthwhile without SIMD support? */
6048 if (!VECTOR_MODE_P (TYPE_MODE (vectype_in))
6049 && !vect_worthwhile_without_simd_p (loop_vinfo, code))
6051 if (dump_enabled_p ())
6052 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6053 "not worthwhile without SIMD support.\n");
6055 return false;
6059 /* 4.2. Check support for the epilog operation.
6061 If STMT represents a reduction pattern, then the type of the
6062 reduction variable may be different than the type of the rest
6063 of the arguments. For example, consider the case of accumulation
6064 of shorts into an int accumulator. The original code:
6065 S1: int_a = (int) short_a;
6066 orig_stmt-> S2: int_acc = plus <int_a ,int_acc>;
6068 was replaced with:
6069 STMT: int_acc = widen_sum <short_a, int_acc>
6071 This means that:
6072 1. The tree-code that is used to create the vector operation in the
6073 epilog code (that reduces the partial results) is not the
6074 tree-code of STMT, but is rather the tree-code of the original
6075 stmt from the pattern that STMT is replacing. I.e., in the example
6076 above we want to use 'widen_sum' in the loop, but 'plus' in the
6077 epilog.
6078 2. The type (mode) we use to check available target support
6079 for the vector operation to be created in the *epilog*, is
6080 determined by the type of the reduction variable (in the example
6081 above we'd check this: optab_handler (plus_optab, vect_int_mode])).
6082 However the type (mode) we use to check available target support
6083 for the vector operation to be created *inside the loop*, is
6084 determined by the type of the other arguments to STMT (in the
6085 example we'd check this: optab_handler (widen_sum_optab,
6086 vect_short_mode)).
6088 This is contrary to "regular" reductions, in which the types of all
6089 the arguments are the same as the type of the reduction variable.
6090 For "regular" reductions we can therefore use the same vector type
6091 (and also the same tree-code) when generating the epilog code and
6092 when generating the code inside the loop. */
6094 if (orig_stmt)
6096 /* This is a reduction pattern: get the vectype from the type of the
6097 reduction variable, and get the tree-code from orig_stmt. */
6098 gcc_assert (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
6099 == TREE_CODE_REDUCTION);
6100 orig_code = gimple_assign_rhs_code (orig_stmt);
6101 gcc_assert (vectype_out);
6102 vec_mode = TYPE_MODE (vectype_out);
6104 else
6106 /* Regular reduction: the same vectype and tree-code as used for
6107 the vector code inside the loop can be used for the epilog code. */
6108 orig_code = code;
6110 if (code == MINUS_EXPR)
6111 orig_code = PLUS_EXPR;
6113 /* For simple condition reductions, replace with the actual expression
6114 we want to base our reduction around. */
6115 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == CONST_COND_REDUCTION)
6117 orig_code = STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info);
6118 gcc_assert (orig_code == MAX_EXPR || orig_code == MIN_EXPR);
6120 else if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
6121 == INTEGER_INDUC_COND_REDUCTION)
6122 orig_code = MAX_EXPR;
6125 if (nested_cycle)
6127 def_bb = gimple_bb (reduc_def_stmt);
6128 def_stmt_loop = def_bb->loop_father;
6129 def_arg = PHI_ARG_DEF_FROM_EDGE (reduc_def_stmt,
6130 loop_preheader_edge (def_stmt_loop));
6131 if (TREE_CODE (def_arg) == SSA_NAME
6132 && (def_arg_stmt = SSA_NAME_DEF_STMT (def_arg))
6133 && gimple_code (def_arg_stmt) == GIMPLE_PHI
6134 && flow_bb_inside_loop_p (outer_loop, gimple_bb (def_arg_stmt))
6135 && vinfo_for_stmt (def_arg_stmt)
6136 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_arg_stmt))
6137 == vect_double_reduction_def)
6138 double_reduc = true;
6141 reduc_fn = IFN_LAST;
6143 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) != COND_REDUCTION)
6145 if (reduction_fn_for_scalar_code (orig_code, &reduc_fn))
6147 if (reduc_fn != IFN_LAST
6148 && !direct_internal_fn_supported_p (reduc_fn, vectype_out,
6149 OPTIMIZE_FOR_SPEED))
6151 if (dump_enabled_p ())
6152 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6153 "reduc op not supported by target.\n");
6155 reduc_fn = IFN_LAST;
6158 else
6160 if (!nested_cycle || double_reduc)
6162 if (dump_enabled_p ())
6163 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6164 "no reduc code for scalar code.\n");
6166 return false;
6170 else
6172 int scalar_precision
6173 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
6174 cr_index_scalar_type = make_unsigned_type (scalar_precision);
6175 cr_index_vector_type = build_vector_type
6176 (cr_index_scalar_type, TYPE_VECTOR_SUBPARTS (vectype_out));
6178 if (direct_internal_fn_supported_p (IFN_REDUC_MAX, cr_index_vector_type,
6179 OPTIMIZE_FOR_SPEED))
6180 reduc_fn = IFN_REDUC_MAX;
6183 if ((double_reduc
6184 || STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) != TREE_CODE_REDUCTION)
6185 && ncopies > 1)
6187 if (dump_enabled_p ())
6188 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6189 "multiple types in double reduction or condition "
6190 "reduction.\n");
6191 return false;
6194 /* In case of widening multiplication by a constant, we update the type
6195 of the constant to be the type of the other operand. We check that the
6196 constant fits the type in the pattern recognition pass. */
6197 if (code == DOT_PROD_EXPR
6198 && !types_compatible_p (TREE_TYPE (ops[0]), TREE_TYPE (ops[1])))
6200 if (TREE_CODE (ops[0]) == INTEGER_CST)
6201 ops[0] = fold_convert (TREE_TYPE (ops[1]), ops[0]);
6202 else if (TREE_CODE (ops[1]) == INTEGER_CST)
6203 ops[1] = fold_convert (TREE_TYPE (ops[0]), ops[1]);
6204 else
6206 if (dump_enabled_p ())
6207 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6208 "invalid types in dot-prod\n");
6210 return false;
6214 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION)
6216 widest_int ni;
6218 if (! max_loop_iterations (loop, &ni))
6220 if (dump_enabled_p ())
6221 dump_printf_loc (MSG_NOTE, vect_location,
6222 "loop count not known, cannot create cond "
6223 "reduction.\n");
6224 return false;
6226 /* Convert backedges to iterations. */
6227 ni += 1;
6229 /* The additional index will be the same type as the condition. Check
6230 that the loop iteration count can fit into this type less one (because
6231 we'll use up the zero slot for when there are no matches). */
6232 tree max_index = TYPE_MAX_VALUE (cr_index_scalar_type);
6233 if (wi::geu_p (ni, wi::to_widest (max_index)))
6235 if (dump_enabled_p ())
6236 dump_printf_loc (MSG_NOTE, vect_location,
6237 "loop size is greater than data size.\n");
6238 return false;
6242 /* In case the vectorization factor (VF) is bigger than the number
6243 of elements that we can fit in a vectype (nunits), we have to generate
6244 more than one vector stmt, i.e. we need to "unroll" the
6245 vector stmt by a factor VF/nunits. For more details see documentation
6246 in vectorizable_operation. */
6248 /* If the reduction is used in an outer loop we need to generate
6249 VF intermediate results, like so (e.g. for ncopies=2):
6250 r0 = phi (init, r0)
6251 r1 = phi (init, r1)
6252 r0 = x0 + r0;
6253 r1 = x1 + r1;
6254 (i.e. we generate VF results in 2 registers).
6255 In this case we have a separate def-use cycle for each copy, and therefore
6256 for each copy we get the vector def for the reduction variable from the
6257 respective phi node created for this copy.
6259 Otherwise (the reduction is unused in the loop nest), we can combine
6260 together intermediate results, like so (e.g. for ncopies=2):
6261 r = phi (init, r)
6262 r = x0 + r;
6263 r = x1 + r;
6264 (i.e. we generate VF/2 results in a single register).
6265 In this case for each copy we get the vector def for the reduction variable
6266 from the vectorized reduction operation generated in the previous iteration.
6268 This only works when we see both the reduction PHI and its only consumer
6269 in vectorizable_reduction and there are no intermediate stmts
6270 participating. */
6271 use_operand_p use_p;
6272 gimple *use_stmt;
6273 if (ncopies > 1
6274 && (STMT_VINFO_RELEVANT (stmt_info) <= vect_used_only_live)
6275 && single_imm_use (gimple_phi_result (reduc_def_stmt), &use_p, &use_stmt)
6276 && (use_stmt == stmt
6277 || STMT_VINFO_RELATED_STMT (vinfo_for_stmt (use_stmt)) == stmt))
6279 single_defuse_cycle = true;
6280 epilog_copies = 1;
6282 else
6283 epilog_copies = ncopies;
6285 /* If the reduction stmt is one of the patterns that have lane
6286 reduction embedded we cannot handle the case of ! single_defuse_cycle. */
6287 if ((ncopies > 1
6288 && ! single_defuse_cycle)
6289 && (code == DOT_PROD_EXPR
6290 || code == WIDEN_SUM_EXPR
6291 || code == SAD_EXPR))
6293 if (dump_enabled_p ())
6294 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6295 "multi def-use cycle not possible for lane-reducing "
6296 "reduction operation\n");
6297 return false;
6300 if (!vec_stmt) /* transformation not required. */
6302 if (first_p)
6303 vect_model_reduction_cost (stmt_info, reduc_fn, ncopies);
6304 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
6305 return true;
6308 /* Transform. */
6310 if (dump_enabled_p ())
6311 dump_printf_loc (MSG_NOTE, vect_location, "transform reduction.\n");
6313 /* FORNOW: Multiple types are not supported for condition. */
6314 if (code == COND_EXPR)
6315 gcc_assert (ncopies == 1);
6317 /* Create the destination vector */
6318 vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
6320 prev_stmt_info = NULL;
6321 prev_phi_info = NULL;
6322 if (slp_node)
6323 vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
6324 else
6326 vec_num = 1;
6327 vec_oprnds0.create (1);
6328 vec_oprnds1.create (1);
6329 if (op_type == ternary_op)
6330 vec_oprnds2.create (1);
6333 phis.create (vec_num);
6334 vect_defs.create (vec_num);
6335 if (!slp_node)
6336 vect_defs.quick_push (NULL_TREE);
6338 if (slp_node)
6339 phis.splice (SLP_TREE_VEC_STMTS (slp_node_instance->reduc_phis));
6340 else
6341 phis.quick_push (STMT_VINFO_VEC_STMT (vinfo_for_stmt (reduc_def_stmt)));
6343 for (j = 0; j < ncopies; j++)
6345 if (code == COND_EXPR)
6347 gcc_assert (!slp_node);
6348 vectorizable_condition (stmt, gsi, vec_stmt,
6349 PHI_RESULT (phis[0]),
6350 reduc_index, NULL);
6351 /* Multiple types are not supported for condition. */
6352 break;
6355 /* Handle uses. */
6356 if (j == 0)
6358 if (slp_node)
6360 /* Get vec defs for all the operands except the reduction index,
6361 ensuring the ordering of the ops in the vector is kept. */
6362 auto_vec<tree, 3> slp_ops;
6363 auto_vec<vec<tree>, 3> vec_defs;
6365 slp_ops.quick_push (ops[0]);
6366 slp_ops.quick_push (ops[1]);
6367 if (op_type == ternary_op)
6368 slp_ops.quick_push (ops[2]);
6370 vect_get_slp_defs (slp_ops, slp_node, &vec_defs);
6372 vec_oprnds0.safe_splice (vec_defs[0]);
6373 vec_defs[0].release ();
6374 vec_oprnds1.safe_splice (vec_defs[1]);
6375 vec_defs[1].release ();
6376 if (op_type == ternary_op)
6378 vec_oprnds2.safe_splice (vec_defs[2]);
6379 vec_defs[2].release ();
6382 else
6384 vec_oprnds0.quick_push
6385 (vect_get_vec_def_for_operand (ops[0], stmt));
6386 vec_oprnds1.quick_push
6387 (vect_get_vec_def_for_operand (ops[1], stmt));
6388 if (op_type == ternary_op)
6389 vec_oprnds2.quick_push
6390 (vect_get_vec_def_for_operand (ops[2], stmt));
6393 else
6395 if (!slp_node)
6397 gcc_assert (reduc_index != -1 || ! single_defuse_cycle);
6399 if (single_defuse_cycle && reduc_index == 0)
6400 vec_oprnds0[0] = gimple_assign_lhs (new_stmt);
6401 else
6402 vec_oprnds0[0]
6403 = vect_get_vec_def_for_stmt_copy (dts[0], vec_oprnds0[0]);
6404 if (single_defuse_cycle && reduc_index == 1)
6405 vec_oprnds1[0] = gimple_assign_lhs (new_stmt);
6406 else
6407 vec_oprnds1[0]
6408 = vect_get_vec_def_for_stmt_copy (dts[1], vec_oprnds1[0]);
6409 if (op_type == ternary_op)
6411 if (single_defuse_cycle && reduc_index == 2)
6412 vec_oprnds2[0] = gimple_assign_lhs (new_stmt);
6413 else
6414 vec_oprnds2[0]
6415 = vect_get_vec_def_for_stmt_copy (dts[2], vec_oprnds2[0]);
6420 FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
6422 tree vop[3] = { def0, vec_oprnds1[i], NULL_TREE };
6423 if (op_type == ternary_op)
6424 vop[2] = vec_oprnds2[i];
6426 new_temp = make_ssa_name (vec_dest, new_stmt);
6427 new_stmt = gimple_build_assign (new_temp, code,
6428 vop[0], vop[1], vop[2]);
6429 vect_finish_stmt_generation (stmt, new_stmt, gsi);
6431 if (slp_node)
6433 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
6434 vect_defs.quick_push (new_temp);
6436 else
6437 vect_defs[0] = new_temp;
6440 if (slp_node)
6441 continue;
6443 if (j == 0)
6444 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
6445 else
6446 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
6448 prev_stmt_info = vinfo_for_stmt (new_stmt);
6451 /* Finalize the reduction-phi (set its arguments) and create the
6452 epilog reduction code. */
6453 if ((!single_defuse_cycle || code == COND_EXPR) && !slp_node)
6454 vect_defs[0] = gimple_assign_lhs (*vec_stmt);
6456 vect_create_epilog_for_reduction (vect_defs, stmt, reduc_def_stmt,
6457 epilog_copies, reduc_fn, phis,
6458 double_reduc, slp_node, slp_node_instance);
6460 return true;
6463 /* Function vect_min_worthwhile_factor.
6465 For a loop where we could vectorize the operation indicated by CODE,
6466 return the minimum vectorization factor that makes it worthwhile
6467 to use generic vectors. */
6469 vect_min_worthwhile_factor (enum tree_code code)
6471 switch (code)
6473 case PLUS_EXPR:
6474 case MINUS_EXPR:
6475 case NEGATE_EXPR:
6476 return 4;
6478 case BIT_AND_EXPR:
6479 case BIT_IOR_EXPR:
6480 case BIT_XOR_EXPR:
6481 case BIT_NOT_EXPR:
6482 return 2;
6484 default:
6485 return INT_MAX;
6489 /* Return true if VINFO indicates we are doing loop vectorization and if
6490 it is worth decomposing CODE operations into scalar operations for
6491 that loop's vectorization factor. */
6493 bool
6494 vect_worthwhile_without_simd_p (vec_info *vinfo, tree_code code)
6496 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
6497 return (loop_vinfo
6498 && (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
6499 >= vect_min_worthwhile_factor (code)));
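/* For instance, assuming a loop with vectorization factor 4: PLUS_EXPR
   (minimum worthwhile factor 4) is still considered worth emulating with
   word-mode "generic vectors", whereas any code not listed in
   vect_min_worthwhile_factor (minimum factor INT_MAX) is not.  */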
6502 /* Function vectorizable_induction
6504 Check if PHI performs an induction computation that can be vectorized.
6505 If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
6506 phi to replace it, put it in VEC_STMT, and add it to the same basic block.
6507 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
6509 bool
6510 vectorizable_induction (gimple *phi,
6511 gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
6512 gimple **vec_stmt, slp_tree slp_node)
6514 stmt_vec_info stmt_info = vinfo_for_stmt (phi);
6515 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
6516 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6517 unsigned ncopies;
6518 bool nested_in_vect_loop = false;
6519 struct loop *iv_loop;
6520 tree vec_def;
6521 edge pe = loop_preheader_edge (loop);
6522 basic_block new_bb;
6523 tree new_vec, vec_init, vec_step, t;
6524 tree new_name;
6525 gimple *new_stmt;
6526 gphi *induction_phi;
6527 tree induc_def, vec_dest;
6528 tree init_expr, step_expr;
6529 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
6530 unsigned i;
6531 tree expr;
6532 gimple_seq stmts;
6533 imm_use_iterator imm_iter;
6534 use_operand_p use_p;
6535 gimple *exit_phi;
6536 edge latch_e;
6537 tree loop_arg;
6538 gimple_stmt_iterator si;
6539 basic_block bb = gimple_bb (phi);
6541 if (gimple_code (phi) != GIMPLE_PHI)
6542 return false;
6544 if (!STMT_VINFO_RELEVANT_P (stmt_info))
6545 return false;
6547 /* Make sure it was recognized as induction computation. */
6548 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
6549 return false;
6551 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6552 unsigned nunits = TYPE_VECTOR_SUBPARTS (vectype);
6554 if (slp_node)
6555 ncopies = 1;
6556 else
6557 ncopies = vect_get_num_copies (loop_vinfo, vectype);
6558 gcc_assert (ncopies >= 1);
6560 /* FORNOW. These restrictions should be relaxed. */
6561 if (nested_in_vect_loop_p (loop, phi))
6563 imm_use_iterator imm_iter;
6564 use_operand_p use_p;
6565 gimple *exit_phi;
6566 edge latch_e;
6567 tree loop_arg;
6569 if (ncopies > 1)
6571 if (dump_enabled_p ())
6572 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6573 "multiple types in nested loop.\n");
6574 return false;
6577 /* FORNOW: outer loop induction with SLP not supported. */
6578 if (STMT_SLP_TYPE (stmt_info))
6579 return false;
6581 exit_phi = NULL;
6582 latch_e = loop_latch_edge (loop->inner);
6583 loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
6584 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
6586 gimple *use_stmt = USE_STMT (use_p);
6587 if (is_gimple_debug (use_stmt))
6588 continue;
6590 if (!flow_bb_inside_loop_p (loop->inner, gimple_bb (use_stmt)))
6592 exit_phi = use_stmt;
6593 break;
6596 if (exit_phi)
6598 stmt_vec_info exit_phi_vinfo = vinfo_for_stmt (exit_phi);
6599 if (!(STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
6600 && !STMT_VINFO_LIVE_P (exit_phi_vinfo)))
6602 if (dump_enabled_p ())
6603 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6604 "inner-loop induction only used outside "
6605 "of the outer vectorized loop.\n");
6606 return false;
6610 nested_in_vect_loop = true;
6611 iv_loop = loop->inner;
6613 else
6614 iv_loop = loop;
6615 gcc_assert (iv_loop == (gimple_bb (phi))->loop_father);
6617 if (!vec_stmt) /* transformation not required. */
6619 STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
6620 if (dump_enabled_p ())
6621 dump_printf_loc (MSG_NOTE, vect_location,
6622 "=== vectorizable_induction ===\n");
6623 vect_model_induction_cost (stmt_info, ncopies);
6624 return true;
6627 /* Transform. */
6629 /* Compute a vector variable, initialized with the first VF values of
6630 the induction variable. E.g., for an iv with IV_PHI='X' and
6631 evolution S, for a vector of 4 units, we want to compute:
6632 [X, X + S, X + 2*S, X + 3*S]. */
6634 if (dump_enabled_p ())
6635 dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n");
6637 latch_e = loop_latch_edge (iv_loop);
6638 loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
6640 step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info);
6641 gcc_assert (step_expr != NULL_TREE);
6643 pe = loop_preheader_edge (iv_loop);
6644 init_expr = PHI_ARG_DEF_FROM_EDGE (phi,
6645 loop_preheader_edge (iv_loop));
6647 /* Convert the step to the desired type. */
6648 stmts = NULL;
6649 step_expr = gimple_convert (&stmts, TREE_TYPE (vectype), step_expr);
6650 if (stmts)
6652 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
6653 gcc_assert (!new_bb);
6656 /* Find the first insertion point in the BB. */
6657 si = gsi_after_labels (bb);
6659 /* For SLP induction we have to generate several IVs; for example,
6660 with group size 3 we need [i, i, i, i + S], [i + S, i + S, i + 2*S, i + 2*S],
6661 [i + 2*S, i + 3*S, i + 3*S, i + 3*S]. The step is the same uniform
6662 [VF*S, VF*S, VF*S, VF*S] for all. */
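/* Continuing the group-size-3 sketch above, assuming nunits == 4:
   least_common_multiple (3, 4) == 12, so nivs == 12 / 4 == 3 distinct
   initial vectors are built below; any further copies (when nvects > nivs)
   reuse them by adding a uniform step vector.  */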
6663 if (slp_node)
6665 /* Convert the init to the desired type. */
6666 stmts = NULL;
6667 init_expr = gimple_convert (&stmts, TREE_TYPE (vectype), init_expr);
6668 if (stmts)
6670 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
6671 gcc_assert (!new_bb);
6674 /* Generate [VF*S, VF*S, ... ]. */
6675 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
6677 expr = build_int_cst (integer_type_node, vf);
6678 expr = fold_convert (TREE_TYPE (step_expr), expr);
6680 else
6681 expr = build_int_cst (TREE_TYPE (step_expr), vf);
6682 new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
6683 expr, step_expr);
6684 if (! CONSTANT_CLASS_P (new_name))
6685 new_name = vect_init_vector (phi, new_name,
6686 TREE_TYPE (step_expr), NULL);
6687 new_vec = build_vector_from_val (vectype, new_name);
6688 vec_step = vect_init_vector (phi, new_vec, vectype, NULL);
6690 /* Now generate the IVs. */
6691 unsigned group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
6692 unsigned nvects = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
6693 unsigned elts = nunits * nvects;
6694 unsigned nivs = least_common_multiple (group_size, nunits) / nunits;
6695 gcc_assert (elts % group_size == 0);
6696 tree elt = init_expr;
6697 unsigned ivn;
6698 for (ivn = 0; ivn < nivs; ++ivn)
6700 auto_vec<tree, 32> elts (nunits);
6701 stmts = NULL;
6702 for (unsigned eltn = 0; eltn < nunits; ++eltn)
6704 if (ivn*nunits + eltn >= group_size
6705 && (ivn*nunits + eltn) % group_size == 0)
6706 elt = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (elt),
6707 elt, step_expr);
6708 elts.quick_push (elt);
6710 vec_init = gimple_build_vector (&stmts, vectype, elts);
6711 if (stmts)
6713 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
6714 gcc_assert (!new_bb);
6717 /* Create the induction-phi that defines the induction-operand. */
6718 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
6719 induction_phi = create_phi_node (vec_dest, iv_loop->header);
6720 set_vinfo_for_stmt (induction_phi,
6721 new_stmt_vec_info (induction_phi, loop_vinfo));
6722 induc_def = PHI_RESULT (induction_phi);
6724 /* Create the iv update inside the loop */
6725 vec_def = make_ssa_name (vec_dest);
6726 new_stmt = gimple_build_assign (vec_def, PLUS_EXPR, induc_def, vec_step);
6727 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
6728 set_vinfo_for_stmt (new_stmt, new_stmt_vec_info (new_stmt, loop_vinfo));
6730 /* Set the arguments of the phi node: */
6731 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
6732 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
6733 UNKNOWN_LOCATION);
6735 SLP_TREE_VEC_STMTS (slp_node).quick_push (induction_phi);
6738 /* Re-use IVs when we can. */
6739 if (ivn < nvects)
6741 unsigned vfp
6742 = least_common_multiple (group_size, nunits) / group_size;
6743 /* Generate [VF'*S, VF'*S, ... ]. */
6744 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
6746 expr = build_int_cst (integer_type_node, vfp);
6747 expr = fold_convert (TREE_TYPE (step_expr), expr);
6749 else
6750 expr = build_int_cst (TREE_TYPE (step_expr), vfp);
6751 new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
6752 expr, step_expr);
6753 if (! CONSTANT_CLASS_P (new_name))
6754 new_name = vect_init_vector (phi, new_name,
6755 TREE_TYPE (step_expr), NULL);
6756 new_vec = build_vector_from_val (vectype, new_name);
6757 vec_step = vect_init_vector (phi, new_vec, vectype, NULL);
6758 for (; ivn < nvects; ++ivn)
6760 gimple *iv = SLP_TREE_VEC_STMTS (slp_node)[ivn - nivs];
6761 tree def;
6762 if (gimple_code (iv) == GIMPLE_PHI)
6763 def = gimple_phi_result (iv);
6764 else
6765 def = gimple_assign_lhs (iv);
6766 new_stmt = gimple_build_assign (make_ssa_name (vectype),
6767 PLUS_EXPR,
6768 def, vec_step);
6769 if (gimple_code (iv) == GIMPLE_PHI)
6770 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
6771 else
6773 gimple_stmt_iterator tgsi = gsi_for_stmt (iv);
6774 gsi_insert_after (&tgsi, new_stmt, GSI_CONTINUE_LINKING);
6776 set_vinfo_for_stmt (new_stmt,
6777 new_stmt_vec_info (new_stmt, loop_vinfo));
6778 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
6782 return true;
6785 /* Create the vector that holds the initial_value of the induction. */
6786 if (nested_in_vect_loop)
6788 /* iv_loop is nested in the loop to be vectorized. init_expr had already
6789 been created during vectorization of previous stmts. We obtain it
6790 from the STMT_VINFO_VEC_STMT of the defining stmt. */
6791 vec_init = vect_get_vec_def_for_operand (init_expr, phi);
6792 /* If the initial value is not of proper type, convert it. */
6793 if (!useless_type_conversion_p (vectype, TREE_TYPE (vec_init)))
6795 new_stmt
6796 = gimple_build_assign (vect_get_new_ssa_name (vectype,
6797 vect_simple_var,
6798 "vec_iv_"),
6799 VIEW_CONVERT_EXPR,
6800 build1 (VIEW_CONVERT_EXPR, vectype,
6801 vec_init));
6802 vec_init = gimple_assign_lhs (new_stmt);
6803 new_bb = gsi_insert_on_edge_immediate (loop_preheader_edge (iv_loop),
6804 new_stmt);
6805 gcc_assert (!new_bb);
6806 set_vinfo_for_stmt (new_stmt,
6807 new_stmt_vec_info (new_stmt, loop_vinfo));
6810 else
6812 /* iv_loop is the loop to be vectorized. Create:
6813 vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr) */
6814 stmts = NULL;
6815 new_name = gimple_convert (&stmts, TREE_TYPE (vectype), init_expr);
6817 auto_vec<tree, 32> elts (nunits);
6818 elts.quick_push (new_name);
6819 for (i = 1; i < nunits; i++)
6821 /* Create: new_name_i = new_name + step_expr */
6822 new_name = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (new_name),
6823 new_name, step_expr);
6824 elts.quick_push (new_name);
6826 /* Create a vector from [new_name_0, new_name_1, ...,
6827 new_name_nunits-1] */
6828 vec_init = gimple_build_vector (&stmts, vectype, elts);
6829 if (stmts)
6831 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
6832 gcc_assert (!new_bb);
6837 /* Create the vector that holds the step of the induction. */
6838 if (nested_in_vect_loop)
6839 /* iv_loop is nested in the loop to be vectorized. Generate:
6840 vec_step = [S, S, S, S] */
6841 new_name = step_expr;
6842 else
6844 /* iv_loop is the loop to be vectorized. Generate:
6845 vec_step = [VF*S, VF*S, VF*S, VF*S] */
6846 gimple_seq seq = NULL;
6847 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
6849 expr = build_int_cst (integer_type_node, vf);
6850 expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
6852 else
6853 expr = build_int_cst (TREE_TYPE (step_expr), vf);
6854 new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
6855 expr, step_expr);
6856 if (seq)
6858 new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
6859 gcc_assert (!new_bb);
6863 t = unshare_expr (new_name);
6864 gcc_assert (CONSTANT_CLASS_P (new_name)
6865 || TREE_CODE (new_name) == SSA_NAME);
6866 new_vec = build_vector_from_val (vectype, t);
6867 vec_step = vect_init_vector (phi, new_vec, vectype, NULL);
6870 /* Create the following def-use cycle:
6871 loop prolog:
6872 vec_init = ...
6873 vec_step = ...
6874 loop:
6875 vec_iv = PHI <vec_init, vec_loop>
6877 STMT
6879 vec_loop = vec_iv + vec_step; */
6881 /* Create the induction-phi that defines the induction-operand. */
6882 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
6883 induction_phi = create_phi_node (vec_dest, iv_loop->header);
6884 set_vinfo_for_stmt (induction_phi,
6885 new_stmt_vec_info (induction_phi, loop_vinfo));
6886 induc_def = PHI_RESULT (induction_phi);
6888 /* Create the iv update inside the loop */
6889 vec_def = make_ssa_name (vec_dest);
6890 new_stmt = gimple_build_assign (vec_def, PLUS_EXPR, induc_def, vec_step);
6891 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
6892 set_vinfo_for_stmt (new_stmt, new_stmt_vec_info (new_stmt, loop_vinfo));
6894 /* Set the arguments of the phi node: */
6895 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
6896 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
6897 UNKNOWN_LOCATION);
6899 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = induction_phi;
6901 /* In case that vectorization factor (VF) is bigger than the number
6902 of elements that we can fit in a vectype (nunits), we have to generate
6903 more than one vector stmt, i.e. we need to "unroll" the
6904 vector stmt by a factor VF/nunits. For more details see documentation
6905 in vectorizable_operation. */
6907 if (ncopies > 1)
6909 gimple_seq seq = NULL;
6910 stmt_vec_info prev_stmt_vinfo;
6911 /* FORNOW. This restriction should be relaxed. */
6912 gcc_assert (!nested_in_vect_loop);
6914 /* Create the vector that holds the step of the induction. */
6915 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
6917 expr = build_int_cst (integer_type_node, nunits);
6918 expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
6920 else
6921 expr = build_int_cst (TREE_TYPE (step_expr), nunits);
6922 new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
6923 expr, step_expr);
6924 if (seq)
6926 new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
6927 gcc_assert (!new_bb);
6930 t = unshare_expr (new_name);
6931 gcc_assert (CONSTANT_CLASS_P (new_name)
6932 || TREE_CODE (new_name) == SSA_NAME);
6933 new_vec = build_vector_from_val (vectype, t);
6934 vec_step = vect_init_vector (phi, new_vec, vectype, NULL);
6936 vec_def = induc_def;
6937 prev_stmt_vinfo = vinfo_for_stmt (induction_phi);
6938 for (i = 1; i < ncopies; i++)
6940 /* vec_i = vec_prev + vec_step */
6941 new_stmt = gimple_build_assign (vec_dest, PLUS_EXPR,
6942 vec_def, vec_step);
6943 vec_def = make_ssa_name (vec_dest, new_stmt);
6944 gimple_assign_set_lhs (new_stmt, vec_def);
6946 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
6947 set_vinfo_for_stmt (new_stmt,
6948 new_stmt_vec_info (new_stmt, loop_vinfo));
6949 STMT_VINFO_RELATED_STMT (prev_stmt_vinfo) = new_stmt;
6950 prev_stmt_vinfo = vinfo_for_stmt (new_stmt);
6954 if (nested_in_vect_loop)
6956 /* Find the loop-closed exit-phi of the induction, and record
6957 the final vector of induction results: */
6958 exit_phi = NULL;
6959 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
6961 gimple *use_stmt = USE_STMT (use_p);
6962 if (is_gimple_debug (use_stmt))
6963 continue;
6965 if (!flow_bb_inside_loop_p (iv_loop, gimple_bb (use_stmt)))
6967 exit_phi = use_stmt;
6968 break;
6971 if (exit_phi)
6973 stmt_vec_info stmt_vinfo = vinfo_for_stmt (exit_phi);
6974 /* FORNOW. Currently not supporting the case that an inner-loop induction
6975 is not used in the outer-loop (i.e. only outside the outer-loop). */
6976 gcc_assert (STMT_VINFO_RELEVANT_P (stmt_vinfo)
6977 && !STMT_VINFO_LIVE_P (stmt_vinfo));
6979 STMT_VINFO_VEC_STMT (stmt_vinfo) = new_stmt;
6980 if (dump_enabled_p ())
6982 dump_printf_loc (MSG_NOTE, vect_location,
6983 "vector of inductions after inner-loop:");
6984 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, new_stmt, 0);
6990 if (dump_enabled_p ())
6992 dump_printf_loc (MSG_NOTE, vect_location,
6993 "transform induction: created def-use cycle: ");
6994 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, induction_phi, 0);
6995 dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
6996 SSA_NAME_DEF_STMT (vec_def), 0);
6999 return true;
7002 /* Function vectorizable_live_operation.
7004 STMT computes a value that is used outside the loop. Check if
7005 it can be supported. */
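/* For example (a sketch), the value of last left live after this loop:

     for (i = 0; i < N; i++)
       last = a[i];
     use (last);

   is produced after vectorization by extracting the last lane of the last
   vector of loaded values, using the BIT_FIELD_REF built below.  */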
7007 bool
7008 vectorizable_live_operation (gimple *stmt,
7009 gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
7010 slp_tree slp_node, int slp_index,
7011 gimple **vec_stmt)
7013 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
7014 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
7015 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7016 imm_use_iterator imm_iter;
7017 tree lhs, lhs_type, bitsize, vec_bitsize;
7018 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
7019 int nunits = TYPE_VECTOR_SUBPARTS (vectype);
7020 int ncopies;
7021 gimple *use_stmt;
7022 auto_vec<tree> vec_oprnds;
7024 gcc_assert (STMT_VINFO_LIVE_P (stmt_info));
7026 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def)
7027 return false;
7029 /* FORNOW. CHECKME. */
7030 if (nested_in_vect_loop_p (loop, stmt))
7031 return false;
7033 /* If STMT is not relevant and it is a simple assignment and its inputs are
7034 invariant then it can remain in place, unvectorized. The original last
7035 scalar value that it computes will be used. */
7036 if (!STMT_VINFO_RELEVANT_P (stmt_info))
7038 gcc_assert (is_simple_and_all_uses_invariant (stmt, loop_vinfo));
7039 if (dump_enabled_p ())
7040 dump_printf_loc (MSG_NOTE, vect_location,
7041 "statement is simple and uses invariant. Leaving in "
7042 "place.\n");
7043 return true;
7046 if (slp_node)
7047 ncopies = 1;
7048 else
7049 ncopies = vect_get_num_copies (loop_vinfo, vectype);
7051 if (!vec_stmt)
7052 /* No transformation required. */
7053 return true;
7055 /* If stmt has a related stmt, then use that for getting the lhs. */
7056 if (is_pattern_stmt_p (stmt_info))
7057 stmt = STMT_VINFO_RELATED_STMT (stmt_info);
7059 lhs = (is_a <gphi *> (stmt)) ? gimple_phi_result (stmt)
7060 : gimple_get_lhs (stmt);
7061 lhs_type = TREE_TYPE (lhs);
7063 bitsize = (VECTOR_BOOLEAN_TYPE_P (vectype)
7064 ? bitsize_int (TYPE_PRECISION (TREE_TYPE (vectype)))
7065 : TYPE_SIZE (TREE_TYPE (vectype)));
7066 vec_bitsize = TYPE_SIZE (vectype);
7068 /* Get the vectorized lhs of STMT and the lane to use (counted in bits). */
7069 tree vec_lhs, bitstart;
7070 if (slp_node)
7072 gcc_assert (slp_index >= 0);
7074 int num_scalar = SLP_TREE_SCALAR_STMTS (slp_node).length ();
7075 int num_vec = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
7077 /* Get the last occurrence of the scalar index from the concatenation of
7078 all the slp vectors. Calculate which slp vector it is and the index
7079 within. */
7080 int pos = (num_vec * nunits) - num_scalar + slp_index;
7081 int vec_entry = pos / nunits;
7082 int vec_index = pos % nunits;
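/* For instance, assuming num_scalar == 2, nunits == 4, num_vec == 2
   and slp_index == 1: pos == 2 * 4 - 2 + 1 == 7, so the live value
   sits in lane 3 (vec_index) of vectorized stmt 1 (vec_entry).  */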
7084 /* Get the correct slp vectorized stmt. */
7085 vec_lhs = gimple_get_lhs (SLP_TREE_VEC_STMTS (slp_node)[vec_entry]);
7087 /* Get entry to use. */
7088 bitstart = bitsize_int (vec_index);
7089 bitstart = int_const_binop (MULT_EXPR, bitsize, bitstart);
7091 else
7093 enum vect_def_type dt = STMT_VINFO_DEF_TYPE (stmt_info);
7094 vec_lhs = vect_get_vec_def_for_operand_1 (stmt, dt);
7096 /* For multiple copies, get the last copy. */
7097 for (int i = 1; i < ncopies; ++i)
7098 vec_lhs = vect_get_vec_def_for_stmt_copy (vect_unknown_def_type,
7099 vec_lhs);
7101 /* Get the last lane in the vector. */
7102 bitstart = int_const_binop (MINUS_EXPR, vec_bitsize, bitsize);
7105 /* Create a new vectorized stmt for the uses of STMT and insert outside the
7106 loop. */
7107 gimple_seq stmts = NULL;
7108 tree bftype = TREE_TYPE (vectype);
7109 if (VECTOR_BOOLEAN_TYPE_P (vectype))
7110 bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1);
7111 tree new_tree = build3 (BIT_FIELD_REF, bftype, vec_lhs, bitsize, bitstart);
7112 new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree), &stmts,
7113 true, NULL_TREE);
7114 if (stmts)
7115 gsi_insert_seq_on_edge_immediate (single_exit (loop), stmts);
7117 /* Replace the use of lhs with the newly computed result. If the use stmt
7118 is a single-arg PHI, just replace all uses of the PHI result. This is
7119 necessary because the lcssa PHI defining lhs may precede the new stmt. */
7120 use_operand_p use_p;
7121 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
7122 if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt))
7123 && !is_gimple_debug (use_stmt))
7125 if (gimple_code (use_stmt) == GIMPLE_PHI
7126 && gimple_phi_num_args (use_stmt) == 1)
7128 replace_uses_by (gimple_phi_result (use_stmt), new_tree);
7130 else
7132 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
7133 SET_USE (use_p, new_tree);
7135 update_stmt (use_stmt);
7138 return true;
7141 /* Kill any debug uses outside LOOP of SSA names defined in STMT. */
7143 static void
7144 vect_loop_kill_debug_uses (struct loop *loop, gimple *stmt)
7146 ssa_op_iter op_iter;
7147 imm_use_iterator imm_iter;
7148 def_operand_p def_p;
7149 gimple *ustmt;
7151 FOR_EACH_PHI_OR_STMT_DEF (def_p, stmt, op_iter, SSA_OP_DEF)
7153 FOR_EACH_IMM_USE_STMT (ustmt, imm_iter, DEF_FROM_PTR (def_p))
7155 basic_block bb;
7157 if (!is_gimple_debug (ustmt))
7158 continue;
7160 bb = gimple_bb (ustmt);
7162 if (!flow_bb_inside_loop_p (loop, bb))
7164 if (gimple_debug_bind_p (ustmt))
7166 if (dump_enabled_p ())
7167 dump_printf_loc (MSG_NOTE, vect_location,
7168 "killing debug use\n");
7170 gimple_debug_bind_reset_value (ustmt);
7171 update_stmt (ustmt);
7173 else
7174 gcc_unreachable ();
7180 /* Given loop represented by LOOP_VINFO, return true if computation of
7181 LOOP_VINFO_NITERS (= LOOP_VINFO_NITERSM1 + 1) doesn't overflow, false
7182 otherwise. */
7184 static bool
7185 loop_niters_no_overflow (loop_vec_info loop_vinfo)
7187 /* Constant case. */
7188 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
7190 tree cst_niters = LOOP_VINFO_NITERS (loop_vinfo);
7191 tree cst_nitersm1 = LOOP_VINFO_NITERSM1 (loop_vinfo);
7193 gcc_assert (TREE_CODE (cst_niters) == INTEGER_CST);
7194 gcc_assert (TREE_CODE (cst_nitersm1) == INTEGER_CST);
7195 if (wi::to_widest (cst_nitersm1) < wi::to_widest (cst_niters))
7196 return true;
7199 widest_int max;
7200 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7201 /* Check the upper bound of loop niters. */
7202 if (get_max_loop_iterations (loop, &max))
7204 tree type = TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo));
7205 signop sgn = TYPE_SIGN (type);
7206 widest_int type_max = widest_int::from (wi::max_value (type), sgn);
7207 if (max < type_max)
7208 return true;
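  /* Otherwise NITERSM1 + 1 may wrap around.  Hypothetical example: if the
     niters type is an unsigned 32-bit type and the latch can execute up to
     0xffffffff times, NITERSM1 + 1 overflows to 0, so neither check above
     succeeds and we conservatively return false.  */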
7210 return false;
7213 /* Scale the profile counters of LOOP, which is vectorized by factor VF,
7214    according to the estimated iteration count of the vectorized loop. */
7216 static void
7217 scale_profile_for_vect_loop (struct loop *loop, unsigned vf)
7219 edge preheader = loop_preheader_edge (loop);
7220 /* Reduce loop iterations by the vectorization factor. */
7221 gcov_type new_est_niter = niter_for_unrolled_loop (loop, vf);
7222 profile_count freq_h = loop->header->count, freq_e = preheader->count ();
7224 if (freq_h.nonzero_p ())
7226 profile_probability p;
7228 /* Avoid dropping loop body profile counter to 0 because of zero count
7229 in loop's preheader. */
7230 if (!(freq_e == profile_count::zero ()))
7231 freq_e = freq_e.force_nonzero ();
7232 p = freq_e.apply_scale (new_est_niter + 1, 1).probability_in (freq_h);
7233 scale_loop_frequencies (loop, p);
7236 edge exit_e = single_exit (loop);
7237 exit_e->probability = profile_probability::always ()
7238 .apply_scale (1, new_est_niter + 1);
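  /* Illustration (hypothetical count): if new_est_niter is 3, the exit edge
     gets probability 1/4, i.e. the vectorized loop is expected to iterate
     about four times per entry.  */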
7240 edge exit_l = single_pred_edge (loop->latch);
7241 profile_probability prob = exit_l->probability;
7242 exit_l->probability = exit_e->probability.invert ();
7243 if (prob.initialized_p () && exit_l->probability.initialized_p ())
7244 scale_bbs_frequencies (&loop->latch, 1, exit_l->probability / prob);
7247 /* Function vect_transform_loop.
7249 The analysis phase has determined that the loop is vectorizable.
7250    Vectorize the loop - create vectorized stmts to replace the scalar
7251    stmts in the loop, and update the loop exit condition.
7252    Returns the scalar epilogue loop, if any. */
7254 struct loop *
7255 vect_transform_loop (loop_vec_info loop_vinfo)
7257 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7258 struct loop *epilogue = NULL;
7259 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
7260 int nbbs = loop->num_nodes;
7261 int i;
7262 tree niters_vector = NULL;
7263 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
7264 bool grouped_store;
7265 bool slp_scheduled = false;
7266 gimple *stmt, *pattern_stmt;
7267 gimple_seq pattern_def_seq = NULL;
7268 gimple_stmt_iterator pattern_def_si = gsi_none ();
7269 bool transform_pattern_stmt = false;
7270 bool check_profitability = false;
7271 int th;
7273 if (dump_enabled_p ())
7274 dump_printf_loc (MSG_NOTE, vect_location, "=== vec_transform_loop ===\n");
7276   /* Use the more conservative vectorization threshold.  If the number
7277      of iterations is constant, assume the cost check has been performed
7278      by our caller.  If the threshold makes all loops profitable that
7279      run at least the vectorization factor number of times, checking
7280      is pointless, too. */
7281 th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
7282 if (th >= LOOP_VINFO_VECT_FACTOR (loop_vinfo)
7283 && !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
7285 if (dump_enabled_p ())
7286 dump_printf_loc (MSG_NOTE, vect_location,
7287 "Profitability threshold is %d loop iterations.\n",
7288 th);
7289 check_profitability = true;
7292 /* Make sure there exists a single-predecessor exit bb. Do this before
7293 versioning. */
7294 edge e = single_exit (loop);
7295 if (! single_pred_p (e->dest))
7297 split_loop_exit_edge (e);
7298 if (dump_enabled_p ())
7299 dump_printf (MSG_NOTE, "split exit edge\n");
7302 /* Version the loop first, if required, so the profitability check
7303 comes first. */
7305 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
7307 vect_loop_versioning (loop_vinfo, th, check_profitability);
7308 check_profitability = false;
7311   /* Make sure a single-predecessor exit bb also exists on the scalar
7312      loop copy.  Do this after versioning but before peeling so the CFG
7313      structure is the same for both the scalar and the if-converted loop,
7314      which lets slpeel_duplicate_current_defs_from_edges see matching
7315      loop-closed PHI nodes on the exit. */
7316 if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))
7318 e = single_exit (LOOP_VINFO_SCALAR_LOOP (loop_vinfo));
7319 if (! single_pred_p (e->dest))
7321 split_loop_exit_edge (e);
7322 if (dump_enabled_p ())
7323 dump_printf (MSG_NOTE, "split exit edge of scalar loop\n");
7327 tree niters = vect_build_loop_niters (loop_vinfo);
7328 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = niters;
7329 tree nitersm1 = unshare_expr (LOOP_VINFO_NITERSM1 (loop_vinfo));
7330 bool niters_no_overflow = loop_niters_no_overflow (loop_vinfo);
7331 epilogue = vect_do_peeling (loop_vinfo, niters, nitersm1, &niters_vector, th,
7332 check_profitability, niters_no_overflow);
7333 if (niters_vector == NULL_TREE)
7335 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
7336 niters_vector
7337 = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
7338 LOOP_VINFO_INT_NITERS (loop_vinfo) / vf);
7339 else
7340 vect_gen_vector_loop_niters (loop_vinfo, niters, &niters_vector,
7341 niters_no_overflow);
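      /* Illustrative numbers only: with a known NITERS of 17 and vf = 4, the
	 branch above builds niters_vector as the constant 4; the leftover
	 scalar iteration is handled by the epilogue created by
	 vect_do_peeling.  */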
7344 /* 1) Make sure the loop header has exactly two entries
7345 2) Make sure we have a preheader basic block. */
7347 gcc_assert (EDGE_COUNT (loop->header->preds) == 2);
7349 split_edge (loop_preheader_edge (loop));
7351   /* FORNOW: the vectorizer supports only loops whose body consists
7352      of one basic block (header + empty latch).  When the vectorizer
7353      supports more involved loop forms, the order in which the BBs are
7354      traversed will need to be reconsidered. */
7356 for (i = 0; i < nbbs; i++)
7358 basic_block bb = bbs[i];
7359 stmt_vec_info stmt_info;
7361 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
7362 gsi_next (&si))
7364 gphi *phi = si.phi ();
7365 if (dump_enabled_p ())
7367 dump_printf_loc (MSG_NOTE, vect_location,
7368 "------>vectorizing phi: ");
7369 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
7371 stmt_info = vinfo_for_stmt (phi);
7372 if (!stmt_info)
7373 continue;
7375 if (MAY_HAVE_DEBUG_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
7376 vect_loop_kill_debug_uses (loop, phi);
7378 if (!STMT_VINFO_RELEVANT_P (stmt_info)
7379 && !STMT_VINFO_LIVE_P (stmt_info))
7380 continue;
7382 if (STMT_VINFO_VECTYPE (stmt_info)
7383 && (TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info))
7384 != (unsigned HOST_WIDE_INT) vf)
7385 && dump_enabled_p ())
7386 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
7388 if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
7389 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
7390 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
7391 && ! PURE_SLP_STMT (stmt_info))
7393 if (dump_enabled_p ())
7394 dump_printf_loc (MSG_NOTE, vect_location, "transform phi.\n");
7395 vect_transform_stmt (phi, NULL, NULL, NULL, NULL);
7399 pattern_stmt = NULL;
7400 for (gimple_stmt_iterator si = gsi_start_bb (bb);
7401 !gsi_end_p (si) || transform_pattern_stmt;)
7403 bool is_store;
7405 if (transform_pattern_stmt)
7406 stmt = pattern_stmt;
7407 else
7409 stmt = gsi_stmt (si);
7410 /* During vectorization remove existing clobber stmts. */
7411 if (gimple_clobber_p (stmt))
7413 unlink_stmt_vdef (stmt);
7414 gsi_remove (&si, true);
7415 release_defs (stmt);
7416 continue;
7420 if (dump_enabled_p ())
7422 dump_printf_loc (MSG_NOTE, vect_location,
7423 "------>vectorizing statement: ");
7424 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
7427 stmt_info = vinfo_for_stmt (stmt);
7429 /* vector stmts created in the outer-loop during vectorization of
7430 stmts in an inner-loop may not have a stmt_info, and do not
7431 need to be vectorized. */
7432 if (!stmt_info)
7434 gsi_next (&si);
7435 continue;
7438 if (MAY_HAVE_DEBUG_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
7439 vect_loop_kill_debug_uses (loop, stmt);
7441 if (!STMT_VINFO_RELEVANT_P (stmt_info)
7442 && !STMT_VINFO_LIVE_P (stmt_info))
7444 if (STMT_VINFO_IN_PATTERN_P (stmt_info)
7445 && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
7446 && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
7447 || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
7449 stmt = pattern_stmt;
7450 stmt_info = vinfo_for_stmt (stmt);
7452 else
7454 gsi_next (&si);
7455 continue;
7458 else if (STMT_VINFO_IN_PATTERN_P (stmt_info)
7459 && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
7460 && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
7461 || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
7462 transform_pattern_stmt = true;
7464 /* If pattern statement has def stmts, vectorize them too. */
7465 if (is_pattern_stmt_p (stmt_info))
7467 if (pattern_def_seq == NULL)
7469 pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
7470 pattern_def_si = gsi_start (pattern_def_seq);
7472 else if (!gsi_end_p (pattern_def_si))
7473 gsi_next (&pattern_def_si);
7474 if (pattern_def_seq != NULL)
7476 gimple *pattern_def_stmt = NULL;
7477 stmt_vec_info pattern_def_stmt_info = NULL;
7479 while (!gsi_end_p (pattern_def_si))
7481 pattern_def_stmt = gsi_stmt (pattern_def_si);
7482 pattern_def_stmt_info
7483 = vinfo_for_stmt (pattern_def_stmt);
7484 if (STMT_VINFO_RELEVANT_P (pattern_def_stmt_info)
7485 || STMT_VINFO_LIVE_P (pattern_def_stmt_info))
7486 break;
7487 gsi_next (&pattern_def_si);
7490 if (!gsi_end_p (pattern_def_si))
7492 if (dump_enabled_p ())
7494 dump_printf_loc (MSG_NOTE, vect_location,
7495 "==> vectorizing pattern def "
7496 "stmt: ");
7497 dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
7498 pattern_def_stmt, 0);
7501 stmt = pattern_def_stmt;
7502 stmt_info = pattern_def_stmt_info;
7504 else
7506 pattern_def_si = gsi_none ();
7507 transform_pattern_stmt = false;
7510 else
7511 transform_pattern_stmt = false;
7514 if (STMT_VINFO_VECTYPE (stmt_info))
7516 unsigned int nunits
7517 = (unsigned int)
7518 TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
7519 if (!STMT_SLP_TYPE (stmt_info)
7520 && nunits != (unsigned int) vf
7521 && dump_enabled_p ())
7522 	    /* For SLP, VF is set according to the unrolling factor and not
7523 	       the vector size, hence this diagnostic is not valid for SLP. */
7524 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
7527 /* SLP. Schedule all the SLP instances when the first SLP stmt is
7528 reached. */
7529 if (STMT_SLP_TYPE (stmt_info))
7531 if (!slp_scheduled)
7533 slp_scheduled = true;
7535 if (dump_enabled_p ())
7536 dump_printf_loc (MSG_NOTE, vect_location,
7537 "=== scheduling SLP instances ===\n");
7539 vect_schedule_slp (loop_vinfo);
7542 /* Hybrid SLP stmts must be vectorized in addition to SLP. */
7543 if (!vinfo_for_stmt (stmt) || PURE_SLP_STMT (stmt_info))
7545 if (!transform_pattern_stmt && gsi_end_p (pattern_def_si))
7547 pattern_def_seq = NULL;
7548 gsi_next (&si);
7550 continue;
7554 /* -------- vectorize statement ------------ */
7555 if (dump_enabled_p ())
7556 dump_printf_loc (MSG_NOTE, vect_location, "transform statement.\n");
7558 grouped_store = false;
7559 is_store = vect_transform_stmt (stmt, &si, &grouped_store, NULL, NULL);
7560 if (is_store)
7562 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
7564 /* Interleaving. If IS_STORE is TRUE, the vectorization of the
7565 interleaving chain was completed - free all the stores in
7566 the chain. */
7567 gsi_next (&si);
7568 vect_remove_stores (GROUP_FIRST_ELEMENT (stmt_info));
7570 else
7572 /* Free the attached stmt_vec_info and remove the stmt. */
7573 gimple *store = gsi_stmt (si);
7574 free_stmt_vec_info (store);
7575 unlink_stmt_vdef (store);
7576 gsi_remove (&si, true);
7577 release_defs (store);
7580 /* Stores can only appear at the end of pattern statements. */
7581 gcc_assert (!transform_pattern_stmt);
7582 pattern_def_seq = NULL;
7584 else if (!transform_pattern_stmt && gsi_end_p (pattern_def_si))
7586 pattern_def_seq = NULL;
7587 gsi_next (&si);
7589 } /* stmts in BB */
7590 } /* BBs in loop */
7592 slpeel_make_loop_iterate_ntimes (loop, niters_vector);
7594 scale_profile_for_vect_loop (loop, vf);
7596 /* The minimum number of iterations performed by the epilogue. This
7597 is 1 when peeling for gaps because we always need a final scalar
7598 iteration. */
7599 int min_epilogue_iters = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0;
7600 /* +1 to convert latch counts to loop iteration counts,
7601 -min_epilogue_iters to remove iterations that cannot be performed
7602 by the vector code. */
7603 int bias = 1 - min_epilogue_iters;
7604 /* In these calculations the "- 1" converts loop iteration counts
7605 back to latch counts. */
7606 if (loop->any_upper_bound)
7607 loop->nb_iterations_upper_bound
7608 = wi::udiv_floor (loop->nb_iterations_upper_bound + bias, vf) - 1;
7609 if (loop->any_likely_upper_bound)
7610 loop->nb_iterations_likely_upper_bound
7611 = wi::udiv_floor (loop->nb_iterations_likely_upper_bound + bias, vf) - 1;
7612 if (loop->any_estimate)
7613 loop->nb_iterations_estimate
7614 = wi::udiv_floor (loop->nb_iterations_estimate + bias, vf) - 1;
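  /* Worked example with made-up numbers: without peeling for gaps, bias = 1;
     if the latch-count upper bound was 10 (at most 11 iterations) and vf = 4,
     the new bound is 11 / 4 - 1 = 1, i.e. at most two iterations of the
     vectorized loop.  */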
7616 if (dump_enabled_p ())
7618 if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
7620 dump_printf_loc (MSG_NOTE, vect_location,
7621 "LOOP VECTORIZED\n");
7622 if (loop->inner)
7623 dump_printf_loc (MSG_NOTE, vect_location,
7624 "OUTER LOOP VECTORIZED\n");
7625 dump_printf (MSG_NOTE, "\n");
7627 else
7628 dump_printf_loc (MSG_NOTE, vect_location,
7629 "LOOP EPILOGUE VECTORIZED (VS=%d)\n",
7630 current_vector_size);
7633 /* Free SLP instances here because otherwise stmt reference counting
7634 won't work. */
7635 slp_instance instance;
7636 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
7637 vect_free_slp_instance (instance);
7638 LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
7639   /* Clear the safelen field since its value is invalid after vectorization:
7640      the vectorized loop can have loop-carried dependencies. */
7641 loop->safelen = 0;
7643   /* Don't vectorize the epilogue of an epilogue loop. */
7644 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
7645 epilogue = NULL;
7647 if (epilogue)
7649 unsigned int vector_sizes
7650 = targetm.vectorize.autovectorize_vector_sizes ();
7651 vector_sizes &= current_vector_size - 1;
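      /* This keeps only the vector sizes strictly smaller than the size just
	 used; those are the candidates for vectorizing the epilogue.  */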
7653 if (!PARAM_VALUE (PARAM_VECT_EPILOGUES_NOMASK))
7654 epilogue = NULL;
7655 else if (!vector_sizes)
7656 epilogue = NULL;
7657 else if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
7658 && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) >= 0)
7660 int smallest_vec_size = 1 << ctz_hwi (vector_sizes);
7661 int ratio = current_vector_size / smallest_vec_size;
7662 int eiters = LOOP_VINFO_INT_NITERS (loop_vinfo)
7663 - LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
7664 eiters = eiters % vf;
7666 epilogue->nb_iterations_upper_bound = eiters - 1;
7668 if (eiters < vf / ratio)
7669 epilogue = NULL;
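	  /* Sketch with hypothetical numbers: if current_vector_size is 32,
	     a 16-byte size is also supported (ratio = 2) and vf = 8, an
	     epilogue with fewer than 8 / 2 = 4 remaining iterations is not
	     worth vectorizing and is dropped here.  */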
7673 if (epilogue)
7675 epilogue->force_vectorize = loop->force_vectorize;
7676 epilogue->safelen = loop->safelen;
7677 epilogue->dont_vectorize = false;
7679 /* We may need to if-convert epilogue to vectorize it. */
7680 if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))
7681 tree_if_conversion (epilogue);
7684 return epilogue;
7687 /* The code below performs a simple optimization: it reverts if-conversion
7688    for masked stores.  If the mask of a store is zero, do not perform the
7689    store, nor, where possible, the statements producing the stored values.
7690 For example,
7691 for (i=0; i<n; i++)
7692 if (c[i])
7694 p1[i] += 1;
7695 p2[i] = p3[i] +2;
7697 this transformation will produce the following semi-hammock:
7699 if (!mask__ifc__42.18_165 == { 0, 0, 0, 0, 0, 0, 0, 0 })
7701 vect__11.19_170 = MASK_LOAD (vectp_p1.20_168, 0B, mask__ifc__42.18_165);
7702 vect__12.22_172 = vect__11.19_170 + vect_cst__171;
7703 MASK_STORE (vectp_p1.23_175, 0B, mask__ifc__42.18_165, vect__12.22_172);
7704 vect__18.25_182 = MASK_LOAD (vectp_p3.26_180, 0B, mask__ifc__42.18_165);
7705 vect__19.28_184 = vect__18.25_182 + vect_cst__183;
7706 MASK_STORE (vectp_p2.29_187, 0B, mask__ifc__42.18_165, vect__19.28_184);
7710 void
7711 optimize_mask_stores (struct loop *loop)
7713 basic_block *bbs = get_loop_body (loop);
7714 unsigned nbbs = loop->num_nodes;
7715 unsigned i;
7716 basic_block bb;
7717 struct loop *bb_loop;
7718 gimple_stmt_iterator gsi;
7719 gimple *stmt;
7720 auto_vec<gimple *> worklist;
7722 vect_location = find_loop_location (loop);
7723 /* Pick up all masked stores in loop if any. */
7724 for (i = 0; i < nbbs; i++)
7726 bb = bbs[i];
7727 for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
7728 gsi_next (&gsi))
7730 stmt = gsi_stmt (gsi);
7731 if (gimple_call_internal_p (stmt, IFN_MASK_STORE))
7732 worklist.safe_push (stmt);
7736 free (bbs);
7737 if (worklist.is_empty ())
7738 return;
7740 /* Loop has masked stores. */
7741 while (!worklist.is_empty ())
7743 gimple *last, *last_store;
7744 edge e, efalse;
7745 tree mask;
7746 basic_block store_bb, join_bb;
7747 gimple_stmt_iterator gsi_to;
7748 tree vdef, new_vdef;
7749 gphi *phi;
7750 tree vectype;
7751 tree zero;
7753 last = worklist.pop ();
7754 mask = gimple_call_arg (last, 2);
7755 bb = gimple_bb (last);
7756       /* Create then_bb and an if-then structure in the CFG; then_bb belongs
7757 	 to the same loop as if_bb.  That loop can differ from LOOP when a
7758 	 two-level loop nest is vectorized and the mask store belongs to the
7759 	 inner loop. */
7760 e = split_block (bb, last);
7761 bb_loop = bb->loop_father;
7762 gcc_assert (loop == bb_loop || flow_loop_nested_p (loop, bb_loop));
7763 join_bb = e->dest;
7764 store_bb = create_empty_bb (bb);
7765 add_bb_to_loop (store_bb, bb_loop);
7766 e->flags = EDGE_TRUE_VALUE;
7767 efalse = make_edge (bb, store_bb, EDGE_FALSE_VALUE);
7768       /* Make the edge into STORE_BB the unlikely one. */
7769 efalse->probability = profile_probability::unlikely ();
7770 store_bb->count = efalse->count ();
7771 make_single_succ_edge (store_bb, join_bb, EDGE_FALLTHRU);
7772 if (dom_info_available_p (CDI_DOMINATORS))
7773 set_immediate_dominator (CDI_DOMINATORS, store_bb, bb);
7774 if (dump_enabled_p ())
7775 dump_printf_loc (MSG_NOTE, vect_location,
7776 "Create new block %d to sink mask stores.",
7777 store_bb->index);
7778 /* Create vector comparison with boolean result. */
7779 vectype = TREE_TYPE (mask);
7780 zero = build_zero_cst (vectype);
7781 stmt = gimple_build_cond (EQ_EXPR, mask, zero, NULL_TREE, NULL_TREE);
7782 gsi = gsi_last_bb (bb);
7783 gsi_insert_after (&gsi, stmt, GSI_SAME_STMT);
7784       /* Create a new PHI node for the vdef of the last masked store:
7785 	 .MEM_2 = VDEF <.MEM_1>
7786 	 will be converted to
7787 	 .MEM_3 = VDEF <.MEM_1>
7788 	 and a new PHI node will be created in the join bb
7789 	 .MEM_2 = PHI <.MEM_1, .MEM_3>
7791 vdef = gimple_vdef (last);
7792 new_vdef = make_ssa_name (gimple_vop (cfun), last);
7793 gimple_set_vdef (last, new_vdef);
7794 phi = create_phi_node (vdef, join_bb);
7795 add_phi_arg (phi, new_vdef, EDGE_SUCC (store_bb, 0), UNKNOWN_LOCATION);
7797       /* Move all masked stores with the same mask to STORE_BB if possible. */
7798 while (true)
7800 gimple_stmt_iterator gsi_from;
7801 gimple *stmt1 = NULL;
7803 /* Move masked store to STORE_BB. */
7804 last_store = last;
7805 gsi = gsi_for_stmt (last);
7806 gsi_from = gsi;
7807 /* Shift GSI to the previous stmt for further traversal. */
7808 gsi_prev (&gsi);
7809 gsi_to = gsi_start_bb (store_bb);
7810 gsi_move_before (&gsi_from, &gsi_to);
7811 	  /* Set GSI_TO to the start of the now non-empty block. */
7812 gsi_to = gsi_start_bb (store_bb);
7813 if (dump_enabled_p ())
7815 dump_printf_loc (MSG_NOTE, vect_location,
7816 "Move stmt to created bb\n");
7817 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, last, 0);
7819 /* Move all stored value producers if possible. */
7820 while (!gsi_end_p (gsi))
7822 tree lhs;
7823 imm_use_iterator imm_iter;
7824 use_operand_p use_p;
7825 bool res;
7827 /* Skip debug statements. */
7828 if (is_gimple_debug (gsi_stmt (gsi)))
7830 gsi_prev (&gsi);
7831 continue;
7833 stmt1 = gsi_stmt (gsi);
7834 	      /* Do not consider statements writing to memory or having a
7835 		 volatile operand. */
7836 if (gimple_vdef (stmt1)
7837 || gimple_has_volatile_ops (stmt1))
7838 break;
7839 gsi_from = gsi;
7840 gsi_prev (&gsi);
7841 lhs = gimple_get_lhs (stmt1);
7842 if (!lhs)
7843 break;
7845 /* LHS of vectorized stmt must be SSA_NAME. */
7846 if (TREE_CODE (lhs) != SSA_NAME)
7847 break;
7849 if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
7851 /* Remove dead scalar statement. */
7852 if (has_zero_uses (lhs))
7854 gsi_remove (&gsi_from, true);
7855 continue;
7859 /* Check that LHS does not have uses outside of STORE_BB. */
7860 res = true;
7861 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
7863 gimple *use_stmt;
7864 use_stmt = USE_STMT (use_p);
7865 if (is_gimple_debug (use_stmt))
7866 continue;
7867 if (gimple_bb (use_stmt) != store_bb)
7869 res = false;
7870 break;
7873 if (!res)
7874 break;
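	      /* Presumably STMT1 can only be moved when it reads from the
		 same memory state as the store being sunk; a different VUSE
		 would mean another memory access sits in between.  */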
7876 if (gimple_vuse (stmt1)
7877 && gimple_vuse (stmt1) != gimple_vuse (last_store))
7878 break;
7880 /* Can move STMT1 to STORE_BB. */
7881 if (dump_enabled_p ())
7883 dump_printf_loc (MSG_NOTE, vect_location,
7884 "Move stmt to created bb\n");
7885 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt1, 0);
7887 gsi_move_before (&gsi_from, &gsi_to);
7888 /* Shift GSI_TO for further insertion. */
7889 gsi_prev (&gsi_to);
7891 /* Put other masked stores with the same mask to STORE_BB. */
7892 if (worklist.is_empty ()
7893 || gimple_call_arg (worklist.last (), 2) != mask
7894 || worklist.last () != stmt1)
7895 break;
7896 last = worklist.pop ();
7898 add_phi_arg (phi, gimple_vuse (last_store), e, UNKNOWN_LOCATION);