Avoid ICE for nested inductions (PR 83914)
[official-gcc.git] / gcc / tree-vect-loop.c
blob 8b2ecf84e3f652e2a31cbd5bcbb0c34d59f0e6b8
1 /* Loop Vectorization
2 Copyright (C) 2003-2018 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com> and
4 Ira Rosen <irar@il.ibm.com>
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "rtl.h"
28 #include "tree.h"
29 #include "gimple.h"
30 #include "cfghooks.h"
31 #include "tree-pass.h"
32 #include "ssa.h"
33 #include "optabs-tree.h"
34 #include "diagnostic-core.h"
35 #include "fold-const.h"
36 #include "stor-layout.h"
37 #include "cfganal.h"
38 #include "gimplify.h"
39 #include "gimple-iterator.h"
40 #include "gimplify-me.h"
41 #include "tree-ssa-loop-ivopts.h"
42 #include "tree-ssa-loop-manip.h"
43 #include "tree-ssa-loop-niter.h"
44 #include "tree-ssa-loop.h"
45 #include "cfgloop.h"
46 #include "params.h"
47 #include "tree-scalar-evolution.h"
48 #include "tree-vectorizer.h"
49 #include "gimple-fold.h"
50 #include "cgraph.h"
51 #include "tree-cfg.h"
52 #include "tree-if-conv.h"
53 #include "internal-fn.h"
54 #include "tree-vector-builder.h"
55 #include "vec-perm-indices.h"
57 /* Loop Vectorization Pass.
59 This pass tries to vectorize loops.
61 For example, the vectorizer transforms the following simple loop:
63 short a[N]; short b[N]; short c[N]; int i;
65 for (i=0; i<N; i++){
66 a[i] = b[i] + c[i];
69 as if it were manually vectorized by rewriting the source code into:
71 typedef int __attribute__((mode(V8HI))) v8hi;
72 short a[N]; short b[N]; short c[N]; int i;
73 v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
74 v8hi va, vb, vc;
76 for (i=0; i<N/8; i++){
77 vb = pb[i];
78 vc = pc[i];
79 va = vb + vc;
80 pa[i] = va;
83 The main entry to this pass is vectorize_loops(), in which
84 the vectorizer applies a set of analyses on a given set of loops,
85 followed by the actual vectorization transformation for the loops that
86 had successfully passed the analysis phase.
87 Throughout this pass we make a distinction between two types of
88 data: scalars (which are represented by SSA_NAMES), and memory references
89 ("data-refs"). These two types of data require different handling both
90 during analysis and transformation. The types of data-refs that the
91 vectorizer currently supports are ARRAY_REFS whose base is an array DECL
92 (not a pointer), and INDIRECT_REFS through pointers; both array and pointer
93 accesses are required to have a simple (consecutive) access pattern.
95 Analysis phase:
96 ===============
97 The driver for the analysis phase is vect_analyze_loop().
98 It applies a set of analyses, some of which rely on the scalar evolution
99 analyzer (scev) developed by Sebastian Pop.
101 During the analysis phase the vectorizer records some information
102 per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
103 loop, as well as general information about the loop as a whole, which is
104 recorded in a "loop_vec_info" struct attached to each loop.
106 Transformation phase:
107 =====================
108 The loop transformation phase scans all the stmts in the loop, and
109 creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
110 the loop that needs to be vectorized. It inserts the vector code sequence
111 just before the scalar stmt S, and records a pointer to the vector code
112 in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
113 attached to S). This pointer will be used for the vectorization of following
114 stmts which use the def of stmt S. Stmt S is removed if it writes to memory;
115 otherwise, we rely on dead code elimination for removing it.
117 For example, say stmt S1 was vectorized into stmt VS1:
119 VS1: vb = px[i];
120 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
121 S2: a = b;
123 To vectorize stmt S2, the vectorizer first finds the stmt that defines
124 the operand 'b' (S1), and gets the relevant vector def 'vb' from the
125 vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)). The
126 resulting sequence would be:
128 VS1: vb = px[i];
129 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
130 VS2: va = vb;
131 S2: a = b; STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2
133 Operands that are not SSA_NAMEs are data-refs that appear in
134 load/store operations (like 'x[i]' in S1), and are handled differently.
136 Target modeling:
137 =================
138 Currently the only target specific information that is used is the
139 size of the vector (in bytes) - "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".
140 Targets that can support different vector sizes will, for now, need
141 to specify one value for "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD". More
142 flexibility will be added in the future.
144 Since we only vectorize operations whose vector form can be
145 expressed using existing tree codes, to verify that an operation is
146 supported, the vectorizer checks the relevant optab at the relevant
147 machine_mode (e.g., optab_handler (add_optab, V8HImode)). If
148 the value found is CODE_FOR_nothing, then there's no target support, and
149 we can't vectorize the stmt.
151 For additional information on this project see:
152 http://gcc.gnu.org/projects/tree-ssa/vectorization.html
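   As an illustrative sketch (not taken verbatim from this file's logic), the
   kind of target-support query described above boils down to

     machine_mode vmode = TYPE_MODE (vectype);
     bool supported = optab_handler (add_optab, vmode) != CODE_FOR_nothing;

   where "vectype" stands for whatever vector type was chosen for the stmt;
   a result of CODE_FOR_nothing means the target has no instruction for that
   operation/mode pair, so the stmt cannot be vectorized.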
155 static void vect_estimate_min_profitable_iters (loop_vec_info, int *, int *);
157 /* Function vect_determine_vectorization_factor
159 Determine the vectorization factor (VF). VF is the number of data elements
160 that are operated upon in parallel in a single iteration of the vectorized
161 loop. For example, when vectorizing a loop that operates on 4-byte elements,
162 on a target with a vector size (VS) of 16 bytes, the VF is set to 4, since 4
163 elements can fit in a single vector register.
165 We currently support vectorization of loops in which all types operated upon
166 are of the same size. Therefore this function currently sets VF according to
167 the size of the types operated upon, and fails if there are multiple sizes
168 in the loop.
170 VF is also the factor by which the loop iterations are strip-mined, e.g.:
171 original loop:
172 for (i=0; i<N; i++){
173 a[i] = b[i] + c[i];
176 vectorized loop:
177 for (i=0; i<N; i+=VF){
178 a[i:VF] = b[i:VF] + c[i:VF];
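   A worked example of the above (illustrative numbers only): with a 16-byte
   vector size, a loop whose smallest scalar type is a 4-byte int gets
   VF = 16/4 = 4, so each vector iteration handles a[i:4]; if the smallest
   scalar type in the loop were a 2-byte short, the VF would be 16/2 = 8.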
182 static bool
183 vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
185 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
186 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
187 unsigned nbbs = loop->num_nodes;
188 poly_uint64 vectorization_factor = 1;
189 tree scalar_type = NULL_TREE;
190 gphi *phi;
191 tree vectype;
192 stmt_vec_info stmt_info;
193 unsigned i;
194 HOST_WIDE_INT dummy;
195 gimple *stmt, *pattern_stmt = NULL;
196 gimple_seq pattern_def_seq = NULL;
197 gimple_stmt_iterator pattern_def_si = gsi_none ();
198 bool analyze_pattern_stmt = false;
199 bool bool_result;
200 auto_vec<stmt_vec_info> mask_producers;
202 if (dump_enabled_p ())
203 dump_printf_loc (MSG_NOTE, vect_location,
204 "=== vect_determine_vectorization_factor ===\n");
206 for (i = 0; i < nbbs; i++)
208 basic_block bb = bbs[i];
210 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
211 gsi_next (&si))
213 phi = si.phi ();
214 stmt_info = vinfo_for_stmt (phi);
215 if (dump_enabled_p ())
217 dump_printf_loc (MSG_NOTE, vect_location, "==> examining phi: ");
218 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
221 gcc_assert (stmt_info);
223 if (STMT_VINFO_RELEVANT_P (stmt_info)
224 || STMT_VINFO_LIVE_P (stmt_info))
226 gcc_assert (!STMT_VINFO_VECTYPE (stmt_info));
227 scalar_type = TREE_TYPE (PHI_RESULT (phi));
229 if (dump_enabled_p ())
231 dump_printf_loc (MSG_NOTE, vect_location,
232 "get vectype for scalar type: ");
233 dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
234 dump_printf (MSG_NOTE, "\n");
237 vectype = get_vectype_for_scalar_type (scalar_type);
238 if (!vectype)
240 if (dump_enabled_p ())
242 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
243 "not vectorized: unsupported "
244 "data-type ");
245 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
246 scalar_type);
247 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
249 return false;
251 STMT_VINFO_VECTYPE (stmt_info) = vectype;
253 if (dump_enabled_p ())
255 dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
256 dump_generic_expr (MSG_NOTE, TDF_SLIM, vectype);
257 dump_printf (MSG_NOTE, "\n");
260 if (dump_enabled_p ())
262 dump_printf_loc (MSG_NOTE, vect_location, "nunits = ");
263 dump_dec (MSG_NOTE, TYPE_VECTOR_SUBPARTS (vectype));
264 dump_printf (MSG_NOTE, "\n");
267 vect_update_max_nunits (&vectorization_factor, vectype);
271 for (gimple_stmt_iterator si = gsi_start_bb (bb);
272 !gsi_end_p (si) || analyze_pattern_stmt;)
274 tree vf_vectype;
276 if (analyze_pattern_stmt)
277 stmt = pattern_stmt;
278 else
279 stmt = gsi_stmt (si);
281 stmt_info = vinfo_for_stmt (stmt);
283 if (dump_enabled_p ())
285 dump_printf_loc (MSG_NOTE, vect_location,
286 "==> examining statement: ");
287 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
290 gcc_assert (stmt_info);
292 /* Skip stmts which do not need to be vectorized. */
293 if ((!STMT_VINFO_RELEVANT_P (stmt_info)
294 && !STMT_VINFO_LIVE_P (stmt_info))
295 || gimple_clobber_p (stmt))
297 if (STMT_VINFO_IN_PATTERN_P (stmt_info)
298 && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
299 && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
300 || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
302 stmt = pattern_stmt;
303 stmt_info = vinfo_for_stmt (pattern_stmt);
304 if (dump_enabled_p ())
306 dump_printf_loc (MSG_NOTE, vect_location,
307 "==> examining pattern statement: ");
308 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
311 else
313 if (dump_enabled_p ())
314 dump_printf_loc (MSG_NOTE, vect_location, "skip.\n");
315 gsi_next (&si);
316 continue;
319 else if (STMT_VINFO_IN_PATTERN_P (stmt_info)
320 && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
321 && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
322 || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
323 analyze_pattern_stmt = true;
325 /* If a pattern statement has def stmts, analyze them too. */
326 if (is_pattern_stmt_p (stmt_info))
328 if (pattern_def_seq == NULL)
330 pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
331 pattern_def_si = gsi_start (pattern_def_seq);
333 else if (!gsi_end_p (pattern_def_si))
334 gsi_next (&pattern_def_si);
335 if (pattern_def_seq != NULL)
337 gimple *pattern_def_stmt = NULL;
338 stmt_vec_info pattern_def_stmt_info = NULL;
340 while (!gsi_end_p (pattern_def_si))
342 pattern_def_stmt = gsi_stmt (pattern_def_si);
343 pattern_def_stmt_info
344 = vinfo_for_stmt (pattern_def_stmt);
345 if (STMT_VINFO_RELEVANT_P (pattern_def_stmt_info)
346 || STMT_VINFO_LIVE_P (pattern_def_stmt_info))
347 break;
348 gsi_next (&pattern_def_si);
351 if (!gsi_end_p (pattern_def_si))
353 if (dump_enabled_p ())
355 dump_printf_loc (MSG_NOTE, vect_location,
356 "==> examining pattern def stmt: ");
357 dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
358 pattern_def_stmt, 0);
361 stmt = pattern_def_stmt;
362 stmt_info = pattern_def_stmt_info;
364 else
366 pattern_def_si = gsi_none ();
367 analyze_pattern_stmt = false;
370 else
371 analyze_pattern_stmt = false;
374 if (gimple_get_lhs (stmt) == NULL_TREE
375 /* MASK_STORE has no lhs, but is ok. */
376 && (!is_gimple_call (stmt)
377 || !gimple_call_internal_p (stmt)
378 || gimple_call_internal_fn (stmt) != IFN_MASK_STORE))
380 if (is_gimple_call (stmt))
382 /* Ignore calls with no lhs. These must be calls to
383 #pragma omp simd functions, and what vectorization factor
384 it really needs can't be determined until
385 vectorizable_simd_clone_call. */
386 if (!analyze_pattern_stmt && gsi_end_p (pattern_def_si))
388 pattern_def_seq = NULL;
389 gsi_next (&si);
391 continue;
393 if (dump_enabled_p ())
395 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
396 "not vectorized: irregular stmt.");
397 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt,
400 return false;
403 if (VECTOR_MODE_P (TYPE_MODE (gimple_expr_type (stmt))))
405 if (dump_enabled_p ())
407 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
408 "not vectorized: vector stmt in loop:");
409 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
411 return false;
414 bool_result = false;
416 if (STMT_VINFO_VECTYPE (stmt_info))
418 /* The only case when a vectype has already been set is for stmts
419 that contain a dataref, or for "pattern-stmts" (stmts
420 generated by the vectorizer to represent/replace a certain
421 idiom). */
422 gcc_assert (STMT_VINFO_DATA_REF (stmt_info)
423 || is_pattern_stmt_p (stmt_info)
424 || !gsi_end_p (pattern_def_si));
425 vectype = STMT_VINFO_VECTYPE (stmt_info);
427 else
429 gcc_assert (!STMT_VINFO_DATA_REF (stmt_info));
430 if (gimple_call_internal_p (stmt, IFN_MASK_STORE))
431 scalar_type = TREE_TYPE (gimple_call_arg (stmt, 3));
432 else
433 scalar_type = TREE_TYPE (gimple_get_lhs (stmt));
435 /* Bool ops don't participate in vectorization factor
436 computation. For comparisons, use the compared types to
437 compute a factor. */
438 if (VECT_SCALAR_BOOLEAN_TYPE_P (scalar_type)
439 && is_gimple_assign (stmt)
440 && gimple_assign_rhs_code (stmt) != COND_EXPR)
442 if (STMT_VINFO_RELEVANT_P (stmt_info)
443 || STMT_VINFO_LIVE_P (stmt_info))
444 mask_producers.safe_push (stmt_info);
445 bool_result = true;
447 if (TREE_CODE_CLASS (gimple_assign_rhs_code (stmt))
448 == tcc_comparison
449 && !VECT_SCALAR_BOOLEAN_TYPE_P
450 (TREE_TYPE (gimple_assign_rhs1 (stmt))))
451 scalar_type = TREE_TYPE (gimple_assign_rhs1 (stmt));
452 else
454 if (!analyze_pattern_stmt && gsi_end_p (pattern_def_si))
456 pattern_def_seq = NULL;
457 gsi_next (&si);
459 continue;
463 if (dump_enabled_p ())
465 dump_printf_loc (MSG_NOTE, vect_location,
466 "get vectype for scalar type: ");
467 dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
468 dump_printf (MSG_NOTE, "\n");
470 vectype = get_vectype_for_scalar_type (scalar_type);
471 if (!vectype)
473 if (dump_enabled_p ())
475 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
476 "not vectorized: unsupported "
477 "data-type ");
478 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
479 scalar_type);
480 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
482 return false;
485 if (!bool_result)
486 STMT_VINFO_VECTYPE (stmt_info) = vectype;
488 if (dump_enabled_p ())
490 dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
491 dump_generic_expr (MSG_NOTE, TDF_SLIM, vectype);
492 dump_printf (MSG_NOTE, "\n");
496 /* Don't try to compute the VF from scalar types if the stmt
497 produces a boolean vector. Use the result vectype instead. */
498 if (VECTOR_BOOLEAN_TYPE_P (vectype))
499 vf_vectype = vectype;
500 else
502 /* The vectorization factor is according to the smallest
503 scalar type (or the largest vector size, but we only
504 support one vector size per loop). */
505 if (!bool_result)
506 scalar_type = vect_get_smallest_scalar_type (stmt, &dummy,
507 &dummy);
508 if (dump_enabled_p ())
510 dump_printf_loc (MSG_NOTE, vect_location,
511 "get vectype for scalar type: ");
512 dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
513 dump_printf (MSG_NOTE, "\n");
515 vf_vectype = get_vectype_for_scalar_type (scalar_type);
517 if (!vf_vectype)
519 if (dump_enabled_p ())
521 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
522 "not vectorized: unsupported data-type ");
523 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
524 scalar_type);
525 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
527 return false;
530 if (maybe_ne (GET_MODE_SIZE (TYPE_MODE (vectype)),
531 GET_MODE_SIZE (TYPE_MODE (vf_vectype))))
533 if (dump_enabled_p ())
535 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
536 "not vectorized: different sized vector "
537 "types in statement, ");
538 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
539 vectype);
540 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
541 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
542 vf_vectype);
543 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
545 return false;
548 if (dump_enabled_p ())
550 dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
551 dump_generic_expr (MSG_NOTE, TDF_SLIM, vf_vectype);
552 dump_printf (MSG_NOTE, "\n");
555 if (dump_enabled_p ())
557 dump_printf_loc (MSG_NOTE, vect_location, "nunits = ");
558 dump_dec (MSG_NOTE, TYPE_VECTOR_SUBPARTS (vf_vectype));
559 dump_printf (MSG_NOTE, "\n");
562 vect_update_max_nunits (&vectorization_factor, vf_vectype);
564 if (!analyze_pattern_stmt && gsi_end_p (pattern_def_si))
566 pattern_def_seq = NULL;
567 gsi_next (&si);
572 /* TODO: Analyze cost. Decide if worth while to vectorize. */
573 if (dump_enabled_p ())
575 dump_printf_loc (MSG_NOTE, vect_location, "vectorization factor = ");
576 dump_dec (MSG_NOTE, vectorization_factor);
577 dump_printf (MSG_NOTE, "\n");
580 if (known_le (vectorization_factor, 1U))
582 if (dump_enabled_p ())
583 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
584 "not vectorized: unsupported data-type\n");
585 return false;
587 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
589 for (i = 0; i < mask_producers.length (); i++)
591 tree mask_type = NULL;
593 stmt = STMT_VINFO_STMT (mask_producers[i]);
595 if (is_gimple_assign (stmt)
596 && TREE_CODE_CLASS (gimple_assign_rhs_code (stmt)) == tcc_comparison
597 && !VECT_SCALAR_BOOLEAN_TYPE_P
598 (TREE_TYPE (gimple_assign_rhs1 (stmt))))
600 scalar_type = TREE_TYPE (gimple_assign_rhs1 (stmt));
601 mask_type = get_mask_type_for_scalar_type (scalar_type);
603 if (!mask_type)
605 if (dump_enabled_p ())
606 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
607 "not vectorized: unsupported mask\n");
608 return false;
611 else
613 tree rhs;
614 ssa_op_iter iter;
615 gimple *def_stmt;
616 enum vect_def_type dt;
618 FOR_EACH_SSA_TREE_OPERAND (rhs, stmt, iter, SSA_OP_USE)
620 if (!vect_is_simple_use (rhs, mask_producers[i]->vinfo,
621 &def_stmt, &dt, &vectype))
623 if (dump_enabled_p ())
625 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
626 "not vectorized: can't compute mask type "
627 "for statement, ");
628 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt,
631 return false;
634 /* No vectype probably means external definition.
635 Allow it in case there is another operand which
636 allows us to determine the mask type. */
637 if (!vectype)
638 continue;
640 if (!mask_type)
641 mask_type = vectype;
642 else if (maybe_ne (TYPE_VECTOR_SUBPARTS (mask_type),
643 TYPE_VECTOR_SUBPARTS (vectype)))
645 if (dump_enabled_p ())
647 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
648 "not vectorized: different sized masks "
649 "types in statement, ");
650 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
651 mask_type);
652 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
653 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
654 vectype);
655 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
657 return false;
659 else if (VECTOR_BOOLEAN_TYPE_P (mask_type)
660 != VECTOR_BOOLEAN_TYPE_P (vectype))
662 if (dump_enabled_p ())
664 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
665 "not vectorized: mixed mask and "
666 "nonmask vector types in statement, ");
667 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
668 mask_type);
669 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
670 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
671 vectype);
672 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
674 return false;
678 /* We may compare a boolean value loaded as a vector of integers.
679 Fix mask_type in that case. */
680 if (mask_type
681 && !VECTOR_BOOLEAN_TYPE_P (mask_type)
682 && gimple_code (stmt) == GIMPLE_ASSIGN
683 && TREE_CODE_CLASS (gimple_assign_rhs_code (stmt)) == tcc_comparison)
684 mask_type = build_same_sized_truth_vector_type (mask_type);
687 /* A missing mask_type should mean a loop-invariant predicate.
688 This is probably a subject for optimization in
689 if-conversion. */
690 if (!mask_type)
692 if (dump_enabled_p ())
694 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
695 "not vectorized: can't compute mask type "
696 "for statement, ");
697 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt,
700 return false;
703 STMT_VINFO_VECTYPE (mask_producers[i]) = mask_type;
706 return true;
710 /* Function vect_is_simple_iv_evolution.
712 FORNOW: A simple evolution of an induction variable in the loop is
713 considered a polynomial evolution. */
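/* For instance (illustrative example, not derived from a particular caller):
   for

     for (i = 0; i < n; i++)
       p = p + 4;

   the scev access function of P is the chrec {p_0, +, 4}_1, whose evolution
   part is the INTEGER_CST 4 and whose initial condition is p_0, so the
   evolution is "simple" in the sense used here.  */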
715 static bool
716 vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init,
717 tree * step)
719 tree init_expr;
720 tree step_expr;
721 tree evolution_part = evolution_part_in_loop_num (access_fn, loop_nb);
722 basic_block bb;
724 /* When there is no evolution in this loop, the evolution function
725 is not "simple". */
726 if (evolution_part == NULL_TREE)
727 return false;
729 /* When the evolution is a polynomial of degree >= 2
730 the evolution function is not "simple". */
731 if (tree_is_chrec (evolution_part))
732 return false;
734 step_expr = evolution_part;
735 init_expr = unshare_expr (initial_condition_in_loop_num (access_fn, loop_nb));
737 if (dump_enabled_p ())
739 dump_printf_loc (MSG_NOTE, vect_location, "step: ");
740 dump_generic_expr (MSG_NOTE, TDF_SLIM, step_expr);
741 dump_printf (MSG_NOTE, ", init: ");
742 dump_generic_expr (MSG_NOTE, TDF_SLIM, init_expr);
743 dump_printf (MSG_NOTE, "\n");
746 *init = init_expr;
747 *step = step_expr;
749 if (TREE_CODE (step_expr) != INTEGER_CST
750 && (TREE_CODE (step_expr) != SSA_NAME
751 || ((bb = gimple_bb (SSA_NAME_DEF_STMT (step_expr)))
752 && flow_bb_inside_loop_p (get_loop (cfun, loop_nb), bb))
753 || (!INTEGRAL_TYPE_P (TREE_TYPE (step_expr))
754 && (!SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr))
755 || !flag_associative_math)))
756 && (TREE_CODE (step_expr) != REAL_CST
757 || !flag_associative_math))
759 if (dump_enabled_p ())
760 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
761 "step unknown.\n");
762 return false;
765 return true;
768 /* Function vect_analyze_scalar_cycles_1.
770 Examine the cross iteration def-use cycles of scalar variables
771 in LOOP. LOOP_VINFO represents the loop that is now being
772 considered for vectorization (can be LOOP, or an outer-loop
773 enclosing LOOP). */
775 static void
776 vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, struct loop *loop)
778 basic_block bb = loop->header;
779 tree init, step;
780 auto_vec<gimple *, 64> worklist;
781 gphi_iterator gsi;
782 bool double_reduc;
784 if (dump_enabled_p ())
785 dump_printf_loc (MSG_NOTE, vect_location,
786 "=== vect_analyze_scalar_cycles ===\n");
788 /* First - identify all inductions. Reduction detection assumes that all the
789 inductions have been identified, therefore, this order must not be
790 changed. */
791 for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
793 gphi *phi = gsi.phi ();
794 tree access_fn = NULL;
795 tree def = PHI_RESULT (phi);
796 stmt_vec_info stmt_vinfo = vinfo_for_stmt (phi);
798 if (dump_enabled_p ())
800 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: ");
801 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
804 /* Skip virtual phi's. The data dependences that are associated with
805 virtual defs/uses (i.e., memory accesses) are analyzed elsewhere. */
806 if (virtual_operand_p (def))
807 continue;
809 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_unknown_def_type;
811 /* Analyze the evolution function. */
812 access_fn = analyze_scalar_evolution (loop, def);
813 if (access_fn)
815 STRIP_NOPS (access_fn);
816 if (dump_enabled_p ())
818 dump_printf_loc (MSG_NOTE, vect_location,
819 "Access function of PHI: ");
820 dump_generic_expr (MSG_NOTE, TDF_SLIM, access_fn);
821 dump_printf (MSG_NOTE, "\n");
823 STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
824 = initial_condition_in_loop_num (access_fn, loop->num);
825 STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo)
826 = evolution_part_in_loop_num (access_fn, loop->num);
829 if (!access_fn
830 || !vect_is_simple_iv_evolution (loop->num, access_fn, &init, &step)
831 || (LOOP_VINFO_LOOP (loop_vinfo) != loop
832 && TREE_CODE (step) != INTEGER_CST))
834 worklist.safe_push (phi);
835 continue;
838 gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
839 != NULL_TREE);
840 gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo) != NULL_TREE);
842 if (dump_enabled_p ())
843 dump_printf_loc (MSG_NOTE, vect_location, "Detected induction.\n");
844 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_induction_def;
848 /* Second - identify all reductions and nested cycles. */
849 while (worklist.length () > 0)
851 gimple *phi = worklist.pop ();
852 tree def = PHI_RESULT (phi);
853 stmt_vec_info stmt_vinfo = vinfo_for_stmt (phi);
854 gimple *reduc_stmt;
856 if (dump_enabled_p ())
858 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: ");
859 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
862 gcc_assert (!virtual_operand_p (def)
863 && STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_unknown_def_type);
865 reduc_stmt = vect_force_simple_reduction (loop_vinfo, phi,
866 &double_reduc, false);
867 if (reduc_stmt)
869 if (double_reduc)
871 if (dump_enabled_p ())
872 dump_printf_loc (MSG_NOTE, vect_location,
873 "Detected double reduction.\n");
875 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_double_reduction_def;
876 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
877 vect_double_reduction_def;
879 else
881 if (loop != LOOP_VINFO_LOOP (loop_vinfo))
883 if (dump_enabled_p ())
884 dump_printf_loc (MSG_NOTE, vect_location,
885 "Detected vectorizable nested cycle.\n");
887 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_nested_cycle;
888 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
889 vect_nested_cycle;
891 else
893 if (dump_enabled_p ())
894 dump_printf_loc (MSG_NOTE, vect_location,
895 "Detected reduction.\n");
897 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def;
898 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
899 vect_reduction_def;
900 /* Store the reduction cycles for possible vectorization in
901 loop-aware SLP if it was not detected as reduction
902 chain. */
903 if (! GROUP_FIRST_ELEMENT (vinfo_for_stmt (reduc_stmt)))
904 LOOP_VINFO_REDUCTIONS (loop_vinfo).safe_push (reduc_stmt);
908 else
909 if (dump_enabled_p ())
910 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
911 "Unknown def-use cycle pattern.\n");
916 /* Function vect_analyze_scalar_cycles.
918 Examine the cross iteration def-use cycles of scalar variables, by
919 analyzing the loop-header PHIs of scalar variables. Classify each
920 cycle as one of the following: invariant, induction, reduction, unknown.
921 We do that for the loop represented by LOOP_VINFO, and also to its
922 inner-loop, if it exists.
923 Examples for scalar cycles:
925 Example1: reduction:
927 loop1:
928 for (i=0; i<N; i++)
929 sum += a[i];
931 Example2: induction:
933 loop2:
934 for (i=0; i<N; i++)
935 a[i] = i; */
937 static void
938 vect_analyze_scalar_cycles (loop_vec_info loop_vinfo)
940 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
942 vect_analyze_scalar_cycles_1 (loop_vinfo, loop);
944 /* When vectorizing an outer-loop, the inner-loop is executed sequentially.
945 Reductions in such inner-loop therefore have different properties than
946 the reductions in the nest that gets vectorized:
947 1. When vectorized, they are executed in the same order as in the original
948 scalar loop, so we can't change the order of computation when
949 vectorizing them.
950 2. FIXME: Inner-loop reductions can be used in the inner-loop, so the
951 current checks are too strict. */
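  /* For example (illustrative):

       for (i = 0; i < N; i++)        <-- outer loop considered here
         for (j = 0; j < M; j++)
           s[i] += a[i][j];           <-- inner-loop reduction, executed
                                          sequentially as described above.  */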
953 if (loop->inner)
954 vect_analyze_scalar_cycles_1 (loop_vinfo, loop->inner);
957 /* Transfer group and reduction information from STMT to its pattern stmt. */
959 static void
960 vect_fixup_reduc_chain (gimple *stmt)
962 gimple *firstp = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt));
963 gimple *stmtp;
964 gcc_assert (!GROUP_FIRST_ELEMENT (vinfo_for_stmt (firstp))
965 && GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)));
966 GROUP_SIZE (vinfo_for_stmt (firstp)) = GROUP_SIZE (vinfo_for_stmt (stmt));
969 stmtp = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt));
970 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmtp)) = firstp;
971 stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmt));
972 if (stmt)
973 GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmtp))
974 = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt));
976 while (stmt);
977 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmtp)) = vect_reduction_def;
980 /* Fixup scalar cycles that now have their stmts detected as patterns. */
982 static void
983 vect_fixup_scalar_cycles_with_patterns (loop_vec_info loop_vinfo)
985 gimple *first;
986 unsigned i;
988 FOR_EACH_VEC_ELT (LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo), i, first)
989 if (STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (first)))
991 gimple *next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (first));
992 while (next)
994 if (! STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (next)))
995 break;
996 next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
998 /* If not all stmts in the chain are patterns, try to handle
999 the chain without patterns. */
1000 if (! next)
1002 vect_fixup_reduc_chain (first);
1003 LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo)[i]
1004 = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (first));
1009 /* Function vect_get_loop_niters.
1011 Determine how many iterations the loop is executed and place it
1012 in NUMBER_OF_ITERATIONS. Place the number of latch iterations
1013 in NUMBER_OF_ITERATIONSM1. Place the condition under which the
1014 niter information holds in ASSUMPTIONS.
1016 Return the loop exit condition. */
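/* For example (illustrative): for a countable loop whose header executes
   N times, NUMBER_OF_ITERATIONSM1 is the latch count N-1 and
   NUMBER_OF_ITERATIONS is the header count N.  */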
1019 static gcond *
1020 vect_get_loop_niters (struct loop *loop, tree *assumptions,
1021 tree *number_of_iterations, tree *number_of_iterationsm1)
1023 edge exit = single_exit (loop);
1024 struct tree_niter_desc niter_desc;
1025 tree niter_assumptions, niter, may_be_zero;
1026 gcond *cond = get_loop_exit_condition (loop);
1028 *assumptions = boolean_true_node;
1029 *number_of_iterationsm1 = chrec_dont_know;
1030 *number_of_iterations = chrec_dont_know;
1031 if (dump_enabled_p ())
1032 dump_printf_loc (MSG_NOTE, vect_location,
1033 "=== get_loop_niters ===\n");
1035 if (!exit)
1036 return cond;
1038 niter = chrec_dont_know;
1039 may_be_zero = NULL_TREE;
1040 niter_assumptions = boolean_true_node;
1041 if (!number_of_iterations_exit_assumptions (loop, exit, &niter_desc, NULL)
1042 || chrec_contains_undetermined (niter_desc.niter))
1043 return cond;
1045 niter_assumptions = niter_desc.assumptions;
1046 may_be_zero = niter_desc.may_be_zero;
1047 niter = niter_desc.niter;
1049 if (may_be_zero && integer_zerop (may_be_zero))
1050 may_be_zero = NULL_TREE;
1052 if (may_be_zero)
1054 if (COMPARISON_CLASS_P (may_be_zero))
1056 /* Try to combine may_be_zero with the assumptions; this can simplify
1057 computation of niter expression. */
1058 if (niter_assumptions && !integer_nonzerop (niter_assumptions))
1059 niter_assumptions = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
1060 niter_assumptions,
1061 fold_build1 (TRUTH_NOT_EXPR,
1062 boolean_type_node,
1063 may_be_zero));
1064 else
1065 niter = fold_build3 (COND_EXPR, TREE_TYPE (niter), may_be_zero,
1066 build_int_cst (TREE_TYPE (niter), 0), niter);
1068 may_be_zero = NULL_TREE;
1070 else if (integer_nonzerop (may_be_zero))
1072 *number_of_iterationsm1 = build_int_cst (TREE_TYPE (niter), 0);
1073 *number_of_iterations = build_int_cst (TREE_TYPE (niter), 1);
1074 return cond;
1076 else
1077 return cond;
1080 *assumptions = niter_assumptions;
1081 *number_of_iterationsm1 = niter;
1083 /* We want the number of loop header executions which is the number
1084 of latch executions plus one.
1085 ??? For UINT_MAX latch executions this number overflows to zero
1086 for loops like do { n++; } while (n != 0); */
1087 if (niter && !chrec_contains_undetermined (niter))
1088 niter = fold_build2 (PLUS_EXPR, TREE_TYPE (niter), unshare_expr (niter),
1089 build_int_cst (TREE_TYPE (niter), 1));
1090 *number_of_iterations = niter;
1092 return cond;
1095 /* Function bb_in_loop_p
1097 Used as predicate for dfs order traversal of the loop bbs. */
1099 static bool
1100 bb_in_loop_p (const_basic_block bb, const void *data)
1102 const struct loop *const loop = (const struct loop *)data;
1103 if (flow_bb_inside_loop_p (loop, bb))
1104 return true;
1105 return false;
1109 /* Create and initialize a new loop_vec_info struct for LOOP_IN, as well as
1110 stmt_vec_info structs for all the stmts in LOOP_IN. */
1112 _loop_vec_info::_loop_vec_info (struct loop *loop_in)
1113 : vec_info (vec_info::loop, init_cost (loop_in)),
1114 loop (loop_in),
1115 bbs (XCNEWVEC (basic_block, loop->num_nodes)),
1116 num_itersm1 (NULL_TREE),
1117 num_iters (NULL_TREE),
1118 num_iters_unchanged (NULL_TREE),
1119 num_iters_assumptions (NULL_TREE),
1120 th (0),
1121 versioning_threshold (0),
1122 vectorization_factor (0),
1123 max_vectorization_factor (0),
1124 mask_skip_niters (NULL_TREE),
1125 mask_compare_type (NULL_TREE),
1126 unaligned_dr (NULL),
1127 peeling_for_alignment (0),
1128 ptr_mask (0),
1129 slp_unrolling_factor (1),
1130 single_scalar_iteration_cost (0),
1131 vectorizable (false),
1132 can_fully_mask_p (true),
1133 fully_masked_p (false),
1134 peeling_for_gaps (false),
1135 peeling_for_niter (false),
1136 operands_swapped (false),
1137 no_data_dependencies (false),
1138 has_mask_store (false),
1139 scalar_loop (NULL),
1140 orig_loop_info (NULL)
1142 /* Create/Update stmt_info for all stmts in the loop. */
1143 basic_block *body = get_loop_body (loop);
1144 for (unsigned int i = 0; i < loop->num_nodes; i++)
1146 basic_block bb = body[i];
1147 gimple_stmt_iterator si;
1149 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
1151 gimple *phi = gsi_stmt (si);
1152 gimple_set_uid (phi, 0);
1153 set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, this));
1156 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1158 gimple *stmt = gsi_stmt (si);
1159 gimple_set_uid (stmt, 0);
1160 set_vinfo_for_stmt (stmt, new_stmt_vec_info (stmt, this));
1163 free (body);
1165 /* CHECKME: We want to visit all BBs before their successors (except for
1166 latch blocks, for which this assertion wouldn't hold). In the simple
1167 case of the loop forms we allow, a dfs order of the BBs would be the same
1168 as a reverse postorder traversal, so we are safe. */
1170 unsigned int nbbs = dfs_enumerate_from (loop->header, 0, bb_in_loop_p,
1171 bbs, loop->num_nodes, loop);
1172 gcc_assert (nbbs == loop->num_nodes);
1175 /* Free all levels of MASKS. */
1177 void
1178 release_vec_loop_masks (vec_loop_masks *masks)
1180 rgroup_masks *rgm;
1181 unsigned int i;
1182 FOR_EACH_VEC_ELT (*masks, i, rgm)
1183 rgm->masks.release ();
1184 masks->release ();
1187 /* Free all memory used by the _loop_vec_info, as well as all the
1188 stmt_vec_info structs of all the stmts in the loop. */
1190 _loop_vec_info::~_loop_vec_info ()
1192 int nbbs;
1193 gimple_stmt_iterator si;
1194 int j;
1196 nbbs = loop->num_nodes;
1197 for (j = 0; j < nbbs; j++)
1199 basic_block bb = bbs[j];
1200 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
1201 free_stmt_vec_info (gsi_stmt (si));
1203 for (si = gsi_start_bb (bb); !gsi_end_p (si); )
1205 gimple *stmt = gsi_stmt (si);
1207 /* We may have broken canonical form by moving a constant
1208 into RHS1 of a commutative op. Fix such occurrences. */
1209 if (operands_swapped && is_gimple_assign (stmt))
1211 enum tree_code code = gimple_assign_rhs_code (stmt);
1213 if ((code == PLUS_EXPR
1214 || code == POINTER_PLUS_EXPR
1215 || code == MULT_EXPR)
1216 && CONSTANT_CLASS_P (gimple_assign_rhs1 (stmt)))
1217 swap_ssa_operands (stmt,
1218 gimple_assign_rhs1_ptr (stmt),
1219 gimple_assign_rhs2_ptr (stmt));
1220 else if (code == COND_EXPR
1221 && CONSTANT_CLASS_P (gimple_assign_rhs2 (stmt)))
1223 tree cond_expr = gimple_assign_rhs1 (stmt);
1224 enum tree_code cond_code = TREE_CODE (cond_expr);
1226 if (TREE_CODE_CLASS (cond_code) == tcc_comparison)
1228 bool honor_nans = HONOR_NANS (TREE_OPERAND (cond_expr,
1229 0));
1230 cond_code = invert_tree_comparison (cond_code,
1231 honor_nans);
1232 if (cond_code != ERROR_MARK)
1234 TREE_SET_CODE (cond_expr, cond_code);
1235 swap_ssa_operands (stmt,
1236 gimple_assign_rhs2_ptr (stmt),
1237 gimple_assign_rhs3_ptr (stmt));
1243 /* Free stmt_vec_info. */
1244 free_stmt_vec_info (stmt);
1245 gsi_next (&si);
1249 free (bbs);
1251 release_vec_loop_masks (&masks);
1253 loop->aux = NULL;
1256 /* Return true if we can use CMP_TYPE as the comparison type to produce
1257 all masks required to mask LOOP_VINFO. */
1259 static bool
1260 can_produce_all_loop_masks_p (loop_vec_info loop_vinfo, tree cmp_type)
1262 rgroup_masks *rgm;
1263 unsigned int i;
1264 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), i, rgm)
1265 if (rgm->mask_type != NULL_TREE
1266 && !direct_internal_fn_supported_p (IFN_WHILE_ULT,
1267 cmp_type, rgm->mask_type,
1268 OPTIMIZE_FOR_SPEED))
1269 return false;
1270 return true;
1273 /* Calculate the maximum number of scalars per iteration for every
1274 rgroup in LOOP_VINFO. */
1276 static unsigned int
1277 vect_get_max_nscalars_per_iter (loop_vec_info loop_vinfo)
1279 unsigned int res = 1;
1280 unsigned int i;
1281 rgroup_masks *rgm;
1282 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), i, rgm)
1283 res = MAX (res, rgm->max_nscalars_per_iter);
1284 return res;
1287 /* Each statement in LOOP_VINFO can be masked where necessary. Check
1288 whether we can actually generate the masks required. Return true if so,
1289 storing the type of the scalar IV in LOOP_VINFO_MASK_COMPARE_TYPE. */
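/* A worked example of the sizing below (illustrative numbers only): with at
   most 1000 latch iterations the header runs at most 1001 times; if the
   widest rgroup needs 2 scalars per iteration, the masks must count up to
   2 * 1001 = 2002, which needs 11 bits, so any supported integer mode of at
   least 11 bits is a candidate comparison type for WHILE_ULT.  */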
1291 static bool
1292 vect_verify_full_masking (loop_vec_info loop_vinfo)
1294 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1295 unsigned int min_ni_width;
1297 /* Get the maximum number of iterations that is representable
1298 in the counter type. */
1299 tree ni_type = TREE_TYPE (LOOP_VINFO_NITERSM1 (loop_vinfo));
1300 widest_int max_ni = wi::to_widest (TYPE_MAX_VALUE (ni_type)) + 1;
1302 /* Get a more refined estimate for the number of iterations. */
1303 widest_int max_back_edges;
1304 if (max_loop_iterations (loop, &max_back_edges))
1305 max_ni = wi::smin (max_ni, max_back_edges + 1);
1307 /* Account for rgroup masks, in which each bit is replicated N times. */
1308 max_ni *= vect_get_max_nscalars_per_iter (loop_vinfo);
1310 /* Work out how many bits we need to represent the limit. */
1311 min_ni_width = wi::min_precision (max_ni, UNSIGNED);
1313 /* Find a scalar mode for which WHILE_ULT is supported. */
1314 opt_scalar_int_mode cmp_mode_iter;
1315 tree cmp_type = NULL_TREE;
1316 FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)
1318 unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ());
1319 if (cmp_bits >= min_ni_width
1320 && targetm.scalar_mode_supported_p (cmp_mode_iter.require ()))
1322 tree this_type = build_nonstandard_integer_type (cmp_bits, true);
1323 if (this_type
1324 && can_produce_all_loop_masks_p (loop_vinfo, this_type))
1326 /* Although we could stop as soon as we find a valid mode,
1327 it's often better to continue until we hit Pmode, since the
1328 operands to the WHILE are more likely to be reusable in
1329 address calculations. */
1330 cmp_type = this_type;
1331 if (cmp_bits >= GET_MODE_BITSIZE (Pmode))
1332 break;
1337 if (!cmp_type)
1338 return false;
1340 LOOP_VINFO_MASK_COMPARE_TYPE (loop_vinfo) = cmp_type;
1341 return true;
1344 /* Calculate the cost of one scalar iteration of the loop. */
1345 static void
1346 vect_compute_single_scalar_iteration_cost (loop_vec_info loop_vinfo)
1348 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1349 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1350 int nbbs = loop->num_nodes, factor, scalar_single_iter_cost = 0;
1351 int innerloop_iters, i;
1353 /* Count statements in the scalar loop. Use this as the scalar cost for a single
1354 iteration for now.
1356 TODO: Add outer loop support.
1358 TODO: Consider assigning different costs to different scalar
1359 statements. */
1361 /* FORNOW. */
1362 innerloop_iters = 1;
1363 if (loop->inner)
1364 innerloop_iters = 50; /* FIXME */
1366 for (i = 0; i < nbbs; i++)
1368 gimple_stmt_iterator si;
1369 basic_block bb = bbs[i];
1371 if (bb->loop_father == loop->inner)
1372 factor = innerloop_iters;
1373 else
1374 factor = 1;
1376 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1378 gimple *stmt = gsi_stmt (si);
1379 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1381 if (!is_gimple_assign (stmt) && !is_gimple_call (stmt))
1382 continue;
1384 /* Skip stmts that are not vectorized inside the loop. */
1385 if (stmt_info
1386 && !STMT_VINFO_RELEVANT_P (stmt_info)
1387 && (!STMT_VINFO_LIVE_P (stmt_info)
1388 || !VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1389 && !STMT_VINFO_IN_PATTERN_P (stmt_info))
1390 continue;
1392 vect_cost_for_stmt kind;
1393 if (STMT_VINFO_DATA_REF (stmt_info))
1395 if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
1396 kind = scalar_load;
1397 else
1398 kind = scalar_store;
1400 else
1401 kind = scalar_stmt;
1403 scalar_single_iter_cost
1404 += record_stmt_cost (&LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1405 factor, kind, stmt_info, 0, vect_prologue);
1408 LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo)
1409 = scalar_single_iter_cost;
1413 /* Function vect_analyze_loop_form_1.
1415 Verify that certain CFG restrictions hold, including:
1416 - the loop has a pre-header
1417 - the loop has a single entry and exit
1418 - the loop exit condition is simple enough
1419 - the number of iterations can be analyzed, i.e., a countable loop. The
1420 niter could be analyzed under some assumptions. */
1422 bool
1423 vect_analyze_loop_form_1 (struct loop *loop, gcond **loop_cond,
1424 tree *assumptions, tree *number_of_iterationsm1,
1425 tree *number_of_iterations, gcond **inner_loop_cond)
1427 if (dump_enabled_p ())
1428 dump_printf_loc (MSG_NOTE, vect_location,
1429 "=== vect_analyze_loop_form ===\n");
1431 /* Different restrictions apply when we are considering an inner-most loop,
1432 vs. an outer (nested) loop.
1433 (FORNOW. May want to relax some of these restrictions in the future). */
1435 if (!loop->inner)
1437 /* Inner-most loop. We currently require that the number of BBs is
1438 exactly 2 (the header and latch). Vectorizable inner-most loops
1439 look like this:
1441 (pre-header)
1443 header <--------+
1444 | | |
1445 | +--> latch --+
1447 (exit-bb) */
1449 if (loop->num_nodes != 2)
1451 if (dump_enabled_p ())
1452 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1453 "not vectorized: control flow in loop.\n");
1454 return false;
1457 if (empty_block_p (loop->header))
1459 if (dump_enabled_p ())
1460 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1461 "not vectorized: empty loop.\n");
1462 return false;
1465 else
1467 struct loop *innerloop = loop->inner;
1468 edge entryedge;
1470 /* Nested loop. We currently require that the loop is doubly-nested,
1471 contains a single inner loop, and the number of BBs is exactly 5.
1472 Vectorizable outer-loops look like this:
1474 (pre-header)
1476 header <---+
1478 inner-loop |
1480 tail ------+
1482 (exit-bb)
1484 The inner-loop has the properties expected of inner-most loops
1485 as described above. */
1487 if ((loop->inner)->inner || (loop->inner)->next)
1489 if (dump_enabled_p ())
1490 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1491 "not vectorized: multiple nested loops.\n");
1492 return false;
1495 if (loop->num_nodes != 5)
1497 if (dump_enabled_p ())
1498 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1499 "not vectorized: control flow in loop.\n");
1500 return false;
1503 entryedge = loop_preheader_edge (innerloop);
1504 if (entryedge->src != loop->header
1505 || !single_exit (innerloop)
1506 || single_exit (innerloop)->dest != EDGE_PRED (loop->latch, 0)->src)
1508 if (dump_enabled_p ())
1509 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1510 "not vectorized: unsupported outerloop form.\n");
1511 return false;
1514 /* Analyze the inner-loop. */
1515 tree inner_niterm1, inner_niter, inner_assumptions;
1516 if (! vect_analyze_loop_form_1 (loop->inner, inner_loop_cond,
1517 &inner_assumptions, &inner_niterm1,
1518 &inner_niter, NULL)
1519 /* Don't support analyzing niter under assumptions for inner
1520 loop. */
1521 || !integer_onep (inner_assumptions))
1523 if (dump_enabled_p ())
1524 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1525 "not vectorized: Bad inner loop.\n");
1526 return false;
1529 if (!expr_invariant_in_loop_p (loop, inner_niter))
1531 if (dump_enabled_p ())
1532 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1533 "not vectorized: inner-loop count not"
1534 " invariant.\n");
1535 return false;
1538 if (dump_enabled_p ())
1539 dump_printf_loc (MSG_NOTE, vect_location,
1540 "Considering outer-loop vectorization.\n");
1543 if (!single_exit (loop)
1544 || EDGE_COUNT (loop->header->preds) != 2)
1546 if (dump_enabled_p ())
1548 if (!single_exit (loop))
1549 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1550 "not vectorized: multiple exits.\n");
1551 else if (EDGE_COUNT (loop->header->preds) != 2)
1552 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1553 "not vectorized: too many incoming edges.\n");
1555 return false;
1558 /* We assume that the loop exit condition is at the end of the loop, i.e.,
1559 that the loop is represented as a do-while (with a proper if-guard
1560 before the loop if needed), where the loop header contains all the
1561 executable statements, and the latch is empty. */
1562 if (!empty_block_p (loop->latch)
1563 || !gimple_seq_empty_p (phi_nodes (loop->latch)))
1565 if (dump_enabled_p ())
1566 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1567 "not vectorized: latch block not empty.\n");
1568 return false;
1571 /* Make sure the exit is not abnormal. */
1572 edge e = single_exit (loop);
1573 if (e->flags & EDGE_ABNORMAL)
1575 if (dump_enabled_p ())
1576 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1577 "not vectorized: abnormal loop exit edge.\n");
1578 return false;
1581 *loop_cond = vect_get_loop_niters (loop, assumptions, number_of_iterations,
1582 number_of_iterationsm1);
1583 if (!*loop_cond)
1585 if (dump_enabled_p ())
1586 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1587 "not vectorized: complicated exit condition.\n");
1588 return false;
1591 if (integer_zerop (*assumptions)
1592 || !*number_of_iterations
1593 || chrec_contains_undetermined (*number_of_iterations))
1595 if (dump_enabled_p ())
1596 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1597 "not vectorized: number of iterations cannot be "
1598 "computed.\n");
1599 return false;
1602 if (integer_zerop (*number_of_iterations))
1604 if (dump_enabled_p ())
1605 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1606 "not vectorized: number of iterations = 0.\n");
1607 return false;
1610 return true;
1613 /* Analyze LOOP form and return a loop_vec_info if it is of suitable form. */
1615 loop_vec_info
1616 vect_analyze_loop_form (struct loop *loop)
1618 tree assumptions, number_of_iterations, number_of_iterationsm1;
1619 gcond *loop_cond, *inner_loop_cond = NULL;
1621 if (! vect_analyze_loop_form_1 (loop, &loop_cond,
1622 &assumptions, &number_of_iterationsm1,
1623 &number_of_iterations, &inner_loop_cond))
1624 return NULL;
1626 loop_vec_info loop_vinfo = new _loop_vec_info (loop);
1627 LOOP_VINFO_NITERSM1 (loop_vinfo) = number_of_iterationsm1;
1628 LOOP_VINFO_NITERS (loop_vinfo) = number_of_iterations;
1629 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = number_of_iterations;
1630 if (!integer_onep (assumptions))
1632 /* We consider vectorizing this loop by versioning it under
1633 some assumptions. In order to do this, we need to clear
1634 existing information computed by scev and niter analyzer. */
1635 scev_reset_htab ();
1636 free_numbers_of_iterations_estimates (loop);
1637 /* Also set flag for this loop so that following scev and niter
1638 analysis are done under the assumptions. */
1639 loop_constraint_set (loop, LOOP_C_FINITE);
1640 /* Also record the assumptions for versioning. */
1641 LOOP_VINFO_NITERS_ASSUMPTIONS (loop_vinfo) = assumptions;
1644 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1646 if (dump_enabled_p ())
1648 dump_printf_loc (MSG_NOTE, vect_location,
1649 "Symbolic number of iterations is ");
1650 dump_generic_expr (MSG_NOTE, TDF_DETAILS, number_of_iterations);
1651 dump_printf (MSG_NOTE, "\n");
1655 STMT_VINFO_TYPE (vinfo_for_stmt (loop_cond)) = loop_exit_ctrl_vec_info_type;
1656 if (inner_loop_cond)
1657 STMT_VINFO_TYPE (vinfo_for_stmt (inner_loop_cond))
1658 = loop_exit_ctrl_vec_info_type;
1660 gcc_assert (!loop->aux);
1661 loop->aux = loop_vinfo;
1662 return loop_vinfo;
1667 /* Scan the loop stmts and, depending on whether there are any (non-)SLP
1668 statements, update the vectorization factor. */
1670 static void
1671 vect_update_vf_for_slp (loop_vec_info loop_vinfo)
1673 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1674 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1675 int nbbs = loop->num_nodes;
1676 poly_uint64 vectorization_factor;
1677 int i;
1679 if (dump_enabled_p ())
1680 dump_printf_loc (MSG_NOTE, vect_location,
1681 "=== vect_update_vf_for_slp ===\n");
1683 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1684 gcc_assert (known_ne (vectorization_factor, 0U));
1686 /* If all the stmts in the loop can be SLPed, we perform only SLP, and
1687 the vectorization factor of the loop is the unrolling factor required by
1688 the SLP instances. If that unrolling factor is 1, we say that we
1689 perform pure SLP on the loop; cross-iteration parallelism is not
1690 exploited. */
1691 bool only_slp_in_loop = true;
1692 for (i = 0; i < nbbs; i++)
1694 basic_block bb = bbs[i];
1695 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1696 gsi_next (&si))
1698 gimple *stmt = gsi_stmt (si);
1699 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1700 if (STMT_VINFO_IN_PATTERN_P (stmt_info)
1701 && STMT_VINFO_RELATED_STMT (stmt_info))
1703 stmt = STMT_VINFO_RELATED_STMT (stmt_info);
1704 stmt_info = vinfo_for_stmt (stmt);
1706 if ((STMT_VINFO_RELEVANT_P (stmt_info)
1707 || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1708 && !PURE_SLP_STMT (stmt_info))
1709 /* STMT needs both SLP and loop-based vectorization. */
1710 only_slp_in_loop = false;
1714 if (only_slp_in_loop)
1716 dump_printf_loc (MSG_NOTE, vect_location,
1717 "Loop contains only SLP stmts\n");
1718 vectorization_factor = LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo);
1720 else
1722 dump_printf_loc (MSG_NOTE, vect_location,
1723 "Loop contains SLP and non-SLP stmts\n");
1724 /* Both the vectorization factor and unroll factor have the form
1725 current_vector_size * X for some rational X, so they must have
1726 a common multiple. */
1727 vectorization_factor
1728 = force_common_multiple (vectorization_factor,
1729 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo));
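      /* For instance (illustrative numbers): a loop vectorization factor of 4
         combined with an SLP unrolling factor of 2 gives
         force_common_multiple (4, 2) = 4, whereas an unrolling factor of 8
         would raise the vectorization factor to 8.  */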
1732 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
1733 if (dump_enabled_p ())
1735 dump_printf_loc (MSG_NOTE, vect_location,
1736 "Updating vectorization factor to ");
1737 dump_dec (MSG_NOTE, vectorization_factor);
1738 dump_printf (MSG_NOTE, ".\n");
1742 /* Function vect_analyze_loop_operations.
1744 Scan the loop stmts and make sure they are all vectorizable. */
1746 static bool
1747 vect_analyze_loop_operations (loop_vec_info loop_vinfo)
1749 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1750 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1751 int nbbs = loop->num_nodes;
1752 int i;
1753 stmt_vec_info stmt_info;
1754 bool need_to_vectorize = false;
1755 bool ok;
1757 if (dump_enabled_p ())
1758 dump_printf_loc (MSG_NOTE, vect_location,
1759 "=== vect_analyze_loop_operations ===\n");
1761 for (i = 0; i < nbbs; i++)
1763 basic_block bb = bbs[i];
1765 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
1766 gsi_next (&si))
1768 gphi *phi = si.phi ();
1769 ok = true;
1771 stmt_info = vinfo_for_stmt (phi);
1772 if (dump_enabled_p ())
1774 dump_printf_loc (MSG_NOTE, vect_location, "examining phi: ");
1775 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
1777 if (virtual_operand_p (gimple_phi_result (phi)))
1778 continue;
1780 /* Inner-loop loop-closed exit phi in outer-loop vectorization
1781 (i.e., a phi in the tail of the outer-loop). */
1782 if (! is_loop_header_bb_p (bb))
1784 /* FORNOW: we currently don't support the case that these phis
1785 are not used in the outer loop (unless it is a double reduction,
1786 i.e., this phi is vect_reduction_def), because this case
1787 would require us to actually do something here. */
1788 if (STMT_VINFO_LIVE_P (stmt_info)
1789 && STMT_VINFO_DEF_TYPE (stmt_info)
1790 != vect_double_reduction_def)
1792 if (dump_enabled_p ())
1793 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1794 "Unsupported loop-closed phi in "
1795 "outer-loop.\n");
1796 return false;
1799 /* If PHI is used in the outer loop, we check that its operand
1800 is defined in the inner loop. */
1801 if (STMT_VINFO_RELEVANT_P (stmt_info))
1803 tree phi_op;
1804 gimple *op_def_stmt;
1806 if (gimple_phi_num_args (phi) != 1)
1807 return false;
1809 phi_op = PHI_ARG_DEF (phi, 0);
1810 if (TREE_CODE (phi_op) != SSA_NAME)
1811 return false;
1813 op_def_stmt = SSA_NAME_DEF_STMT (phi_op);
1814 if (gimple_nop_p (op_def_stmt)
1815 || !flow_bb_inside_loop_p (loop, gimple_bb (op_def_stmt))
1816 || !vinfo_for_stmt (op_def_stmt))
1817 return false;
1819 if (STMT_VINFO_RELEVANT (vinfo_for_stmt (op_def_stmt))
1820 != vect_used_in_outer
1821 && STMT_VINFO_RELEVANT (vinfo_for_stmt (op_def_stmt))
1822 != vect_used_in_outer_by_reduction)
1823 return false;
1826 continue;
1829 gcc_assert (stmt_info);
1831 if ((STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope
1832 || STMT_VINFO_LIVE_P (stmt_info))
1833 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
1835 /* A scalar-dependence cycle that we don't support. */
1836 if (dump_enabled_p ())
1837 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1838 "not vectorized: scalar dependence cycle.\n");
1839 return false;
1842 if (STMT_VINFO_RELEVANT_P (stmt_info))
1844 need_to_vectorize = true;
1845 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
1846 && ! PURE_SLP_STMT (stmt_info))
1847 ok = vectorizable_induction (phi, NULL, NULL, NULL);
1848 else if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
1849 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
1850 && ! PURE_SLP_STMT (stmt_info))
1851 ok = vectorizable_reduction (phi, NULL, NULL, NULL, NULL);
1854 /* SLP PHIs are tested by vect_slp_analyze_node_operations. */
1855 if (ok
1856 && STMT_VINFO_LIVE_P (stmt_info)
1857 && !PURE_SLP_STMT (stmt_info))
1858 ok = vectorizable_live_operation (phi, NULL, NULL, -1, NULL);
1860 if (!ok)
1862 if (dump_enabled_p ())
1864 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1865 "not vectorized: relevant phi not "
1866 "supported: ");
1867 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, phi, 0);
1869 return false;
1873 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1874 gsi_next (&si))
1876 gimple *stmt = gsi_stmt (si);
1877 if (!gimple_clobber_p (stmt)
1878 && !vect_analyze_stmt (stmt, &need_to_vectorize, NULL, NULL))
1879 return false;
1881 } /* bbs */
1883 /* All operations in the loop are either irrelevant (they deal with loop
1884 control, or are dead), or are only used outside the loop and can be moved
1885 out of the loop (e.g. invariants, inductions). The loop can be
1886 optimized away by scalar optimizations. We're better off not
1887 touching this loop. */
1888 if (!need_to_vectorize)
1890 if (dump_enabled_p ())
1891 dump_printf_loc (MSG_NOTE, vect_location,
1892 "All the computation can be taken out of the loop.\n");
1893 if (dump_enabled_p ())
1894 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1895 "not vectorized: redundant loop. no profit to "
1896 "vectorize.\n");
1897 return false;
1900 return true;
1903 /* Analyze the cost of the loop described by LOOP_VINFO. Decide if it
1904 is worthwhile to vectorize. Return 1 if definitely yes, 0 if
1905 definitely no, or -1 if it's worth retrying. */
1907 static int
1908 vect_analyze_loop_costing (loop_vec_info loop_vinfo)
1910 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1911 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
1913 /* Only fully-masked loops can have iteration counts less than the
1914 vectorization factor. */
1915 if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
1917 HOST_WIDE_INT max_niter;
1919 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1920 max_niter = LOOP_VINFO_INT_NITERS (loop_vinfo);
1921 else
1922 max_niter = max_stmt_executions_int (loop);
1924 if (max_niter != -1
1925 && (unsigned HOST_WIDE_INT) max_niter < assumed_vf)
1927 if (dump_enabled_p ())
1928 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1929 "not vectorized: iteration count smaller than "
1930 "vectorization factor.\n");
1931 return 0;
1935 int min_profitable_iters, min_profitable_estimate;
1936 vect_estimate_min_profitable_iters (loop_vinfo, &min_profitable_iters,
1937 &min_profitable_estimate);
1939 if (min_profitable_iters < 0)
1941 if (dump_enabled_p ())
1942 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1943 "not vectorized: vectorization not profitable.\n");
1944 if (dump_enabled_p ())
1945 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1946 "not vectorized: vector version will never be "
1947 "profitable.\n");
1948 return -1;
1951 int min_scalar_loop_bound = (PARAM_VALUE (PARAM_MIN_VECT_LOOP_BOUND)
1952 * assumed_vf);
1954 /* Use the cost model only if it is more conservative than the user-specified
1955 threshold. */
1956 unsigned int th = (unsigned) MAX (min_scalar_loop_bound,
1957 min_profitable_iters);
1959 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = th;
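/* Editor's worked example (hypothetical numbers): with
   --param min-vect-loop-bound=4 and assumed_vf == 8,
   min_scalar_loop_bound is 4 * 8 == 32; if the target cost model reports
   min_profitable_iters == 20, then th == MAX (32, 20) == 32, so a loop
   known to run fewer than 32 iterations is rejected just below.  */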
1961 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1962 && LOOP_VINFO_INT_NITERS (loop_vinfo) < th)
1964 if (dump_enabled_p ())
1965 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1966 "not vectorized: vectorization not profitable.\n");
1967 if (dump_enabled_p ())
1968 dump_printf_loc (MSG_NOTE, vect_location,
1969 "not vectorized: iteration count smaller than user "
1970 "specified loop bound parameter or minimum profitable "
1971 "iterations (whichever is more conservative).\n");
1972 return 0;
1975 HOST_WIDE_INT estimated_niter = estimated_stmt_executions_int (loop);
1976 if (estimated_niter == -1)
1977 estimated_niter = likely_max_stmt_executions_int (loop);
1978 if (estimated_niter != -1
1979 && ((unsigned HOST_WIDE_INT) estimated_niter
1980 < MAX (th, (unsigned) min_profitable_estimate)))
1982 if (dump_enabled_p ())
1983 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1984 "not vectorized: estimated iteration count too "
1985 "small.\n");
1986 if (dump_enabled_p ())
1987 dump_printf_loc (MSG_NOTE, vect_location,
1988 "not vectorized: estimated iteration count smaller "
1989 "than specified loop bound parameter or minimum "
1990 "profitable iterations (whichever is more "
1991 "conservative).\n");
1992 return -1;
1995 return 1;
1999 /* Function vect_analyze_loop_2.
2001 Apply a set of analyses on LOOP, and create a loop_vec_info struct
2002 for it. The different analyses will record information in the
2003 loop_vec_info struct. */
2004 static bool
2005 vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal)
2007 bool ok;
2008 int res;
2009 unsigned int max_vf = MAX_VECTORIZATION_FACTOR;
2010 poly_uint64 min_vf = 2;
2011 unsigned int n_stmts = 0;
2013 /* The first group of checks is independent of the vector size. */
2014 fatal = true;
2016 /* Find all data references in the loop (which correspond to vdefs/vuses)
2017 and analyze their evolution in the loop. */
2019 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
2021 loop_p loop = LOOP_VINFO_LOOP (loop_vinfo);
2022 if (!find_loop_nest (loop, &LOOP_VINFO_LOOP_NEST (loop_vinfo)))
2024 if (dump_enabled_p ())
2025 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2026 "not vectorized: loop nest containing two "
2027 "or more consecutive inner loops cannot be "
2028 "vectorized\n");
2029 return false;
2032 for (unsigned i = 0; i < loop->num_nodes; i++)
2033 for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
2034 !gsi_end_p (gsi); gsi_next (&gsi))
2036 gimple *stmt = gsi_stmt (gsi);
2037 if (is_gimple_debug (stmt))
2038 continue;
2039 ++n_stmts;
2040 if (!find_data_references_in_stmt (loop, stmt,
2041 &LOOP_VINFO_DATAREFS (loop_vinfo)))
2043 if (is_gimple_call (stmt) && loop->safelen)
2045 tree fndecl = gimple_call_fndecl (stmt), op;
2046 if (fndecl != NULL_TREE)
2048 cgraph_node *node = cgraph_node::get (fndecl);
2049 if (node != NULL && node->simd_clones != NULL)
2051 unsigned int j, n = gimple_call_num_args (stmt);
2052 for (j = 0; j < n; j++)
2054 op = gimple_call_arg (stmt, j);
2055 if (DECL_P (op)
2056 || (REFERENCE_CLASS_P (op)
2057 && get_base_address (op)))
2058 break;
2060 op = gimple_call_lhs (stmt);
2061 /* Ignore #pragma omp declare simd functions
2062 if they don't have data references in the
2063 call stmt itself. */
2064 if (j == n
2065 && !(op
2066 && (DECL_P (op)
2067 || (REFERENCE_CLASS_P (op)
2068 && get_base_address (op)))))
2069 continue;
2073 if (dump_enabled_p ())
2074 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2075 "not vectorized: loop contains function "
2076 "calls or data references that cannot "
2077 "be analyzed\n");
2078 return false;
2082 /* Analyze the data references and also adjust the minimal
2083 vectorization factor according to the loads and stores. */
2085 ok = vect_analyze_data_refs (loop_vinfo, &min_vf);
2086 if (!ok)
2088 if (dump_enabled_p ())
2089 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2090 "bad data references.\n");
2091 return false;
2094 /* Classify all cross-iteration scalar data-flow cycles.
2095 Cross-iteration cycles caused by virtual phis are analyzed separately. */
2096 vect_analyze_scalar_cycles (loop_vinfo);
2098 vect_pattern_recog (loop_vinfo);
2100 vect_fixup_scalar_cycles_with_patterns (loop_vinfo);
2102 /* Analyze the access patterns of the data-refs in the loop (consecutive,
2103 complex, etc.). FORNOW: Only handle consecutive access pattern. */
2105 ok = vect_analyze_data_ref_accesses (loop_vinfo);
2106 if (!ok)
2108 if (dump_enabled_p ())
2109 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2110 "bad data access.\n");
2111 return false;
2114 /* Data-flow analysis to detect stmts that do not need to be vectorized. */
2116 ok = vect_mark_stmts_to_be_vectorized (loop_vinfo);
2117 if (!ok)
2119 if (dump_enabled_p ())
2120 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2121 "unexpected pattern.\n");
2122 return false;
2125 /* The rest of the analysis below depends on the vector size in some way, so from here on a failure is no longer fatal: it can be retried with a different vector size. */
2126 fatal = false;
2128 /* Analyze data dependences between the data-refs in the loop
2129 and adjust the maximum vectorization factor according to
2130 the dependences.
2131 FORNOW: fail at the first data dependence that we encounter. */
2133 ok = vect_analyze_data_ref_dependences (loop_vinfo, &max_vf);
2134 if (!ok
2135 || (max_vf != MAX_VECTORIZATION_FACTOR
2136 && maybe_lt (max_vf, min_vf)))
2138 if (dump_enabled_p ())
2139 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2140 "bad data dependence.\n");
2141 return false;
2143 LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo) = max_vf;
2145 ok = vect_determine_vectorization_factor (loop_vinfo);
2146 if (!ok)
2148 if (dump_enabled_p ())
2149 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2150 "can't determine vectorization factor.\n");
2151 return false;
2153 if (max_vf != MAX_VECTORIZATION_FACTOR
2154 && maybe_lt (max_vf, LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
2156 if (dump_enabled_p ())
2157 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2158 "bad data dependence.\n");
2159 return false;
2162 /* Compute the scalar iteration cost. */
2163 vect_compute_single_scalar_iteration_cost (loop_vinfo);
2165 poly_uint64 saved_vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2166 unsigned th;
2168 /* Check the SLP opportunities in the loop, analyze and build SLP trees. */
2169 ok = vect_analyze_slp (loop_vinfo, n_stmts);
2170 if (!ok)
2171 return false;
2173 /* If there are any SLP instances mark them as pure_slp. */
2174 bool slp = vect_make_slp_decision (loop_vinfo);
2175 if (slp)
2177 /* Find stmts that need to be both vectorized and SLPed. */
2178 vect_detect_hybrid_slp (loop_vinfo);
2180 /* Update the vectorization factor based on the SLP decision. */
2181 vect_update_vf_for_slp (loop_vinfo);
2184 bool saved_can_fully_mask_p = LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo);
2186 /* We don't expect to have to roll back to anything other than an empty
2187 set of rgroups. */
2188 gcc_assert (LOOP_VINFO_MASKS (loop_vinfo).is_empty ());
2190 /* This is the point where we can re-start analysis with SLP forced off. */
2191 start_over:
2193 /* Now the vectorization factor is final. */
2194 poly_uint64 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2195 gcc_assert (known_ne (vectorization_factor, 0U));
2197 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && dump_enabled_p ())
2199 dump_printf_loc (MSG_NOTE, vect_location,
2200 "vectorization_factor = ");
2201 dump_dec (MSG_NOTE, vectorization_factor);
2202 dump_printf (MSG_NOTE, ", niters = " HOST_WIDE_INT_PRINT_DEC "\n",
2203 LOOP_VINFO_INT_NITERS (loop_vinfo));
2206 HOST_WIDE_INT max_niter
2207 = likely_max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
2209 /* Analyze the alignment of the data-refs in the loop.
2210 Fail if a data reference is found that cannot be vectorized. */
2212 ok = vect_analyze_data_refs_alignment (loop_vinfo);
2213 if (!ok)
2215 if (dump_enabled_p ())
2216 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2217 "bad data alignment.\n");
2218 return false;
2221 /* Prune the list of ddrs to be tested at run-time by versioning for alias.
2222 It is important to call pruning after vect_analyze_data_ref_accesses,
2223 since we use grouping information gathered by interleaving analysis. */
2224 ok = vect_prune_runtime_alias_test_list (loop_vinfo);
2225 if (!ok)
2226 return false;
2228 /* Do not invoke vect_enhance_data_refs_alignment for epilogue
2229 vectorization. */
2230 if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2232 /* This pass will decide on using loop versioning and/or loop peeling in
2233 order to enhance the alignment of data references in the loop. */
2234 ok = vect_enhance_data_refs_alignment (loop_vinfo);
2235 if (!ok)
2237 if (dump_enabled_p ())
2238 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2239 "bad data alignment.\n");
2240 return false;
2244 if (slp)
2246 /* Analyze operations in the SLP instances. Note this may
2247 remove unsupported SLP instances which makes the above
2248 SLP kind detection invalid. */
2249 unsigned old_size = LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length ();
2250 vect_slp_analyze_operations (loop_vinfo);
2251 if (LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length () != old_size)
2252 goto again;
2255 /* Scan all the remaining operations in the loop that are not subject
2256 to SLP and make sure they are vectorizable. */
2257 ok = vect_analyze_loop_operations (loop_vinfo);
2258 if (!ok)
2260 if (dump_enabled_p ())
2261 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2262 "bad operation or unsupported loop bound.\n");
2263 return false;
2266 /* Decide whether to use a fully-masked loop for this vectorization
2267 factor. */
2268 LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
2269 = (LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo)
2270 && vect_verify_full_masking (loop_vinfo));
2271 if (dump_enabled_p ())
2273 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
2274 dump_printf_loc (MSG_NOTE, vect_location,
2275 "using a fully-masked loop.\n");
2276 else
2277 dump_printf_loc (MSG_NOTE, vect_location,
2278 "not using a fully-masked loop.\n");
2281 /* If an epilogue loop is required because of data accesses with gaps,
2282 one additional iteration needs to be peeled. Check if there are
2283 enough iterations for vectorization. */
2284 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2285 && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2286 && !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
2288 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2289 tree scalar_niters = LOOP_VINFO_NITERSM1 (loop_vinfo);
2291 if (known_lt (wi::to_widest (scalar_niters), vf))
2293 if (dump_enabled_p ())
2294 dump_printf_loc (MSG_NOTE, vect_location,
2295 "loop has no enough iterations to support"
2296 " peeling for gaps.\n");
2297 return false;
2301 /* Check that the costing of the loop makes vectorizing worthwhile. */
2302 res = vect_analyze_loop_costing (loop_vinfo);
2303 if (res < 0)
2304 goto again;
2305 if (!res)
2307 if (dump_enabled_p ())
2308 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2309 "Loop costings not worthwhile.\n");
2310 return false;
2313 /* Decide whether we need to create an epilogue loop to handle
2314 remaining scalar iterations. */
2315 th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
2317 unsigned HOST_WIDE_INT const_vf;
2318 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
2319 /* The main loop handles all iterations. */
2320 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
2321 else if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2322 && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) > 0)
2324 if (!multiple_p (LOOP_VINFO_INT_NITERS (loop_vinfo)
2325 - LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo),
2326 LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
2327 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
2329 else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
2330 || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&const_vf)
2331 || ((tree_ctz (LOOP_VINFO_NITERS (loop_vinfo))
2332 < (unsigned) exact_log2 (const_vf))
2333 /* In case of versioning, check if the maximum number of
2334 iterations is greater than th. If they are identical,
2335 the epilogue is unnecessary. */
2336 && (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
2337 || ((unsigned HOST_WIDE_INT) max_niter
2338 > (th / const_vf) * const_vf))))
2339 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
2341 /* If an epilogue loop is required make sure we can create one. */
2342 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2343 || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo))
2345 if (dump_enabled_p ())
2346 dump_printf_loc (MSG_NOTE, vect_location, "epilog loop required\n");
2347 if (!vect_can_advance_ivs_p (loop_vinfo)
2348 || !slpeel_can_duplicate_loop_p (LOOP_VINFO_LOOP (loop_vinfo),
2349 single_exit (LOOP_VINFO_LOOP
2350 (loop_vinfo))))
2352 if (dump_enabled_p ())
2353 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2354 "not vectorized: can't create required "
2355 "epilog loop\n");
2356 goto again;
2360 /* During peeling, we need to check if the number of loop iterations is
2361 enough for both the peeled prolog loop and the vector loop. This check
2362 can be merged along with threshold check of loop versioning, so
2363 increase threshold for this case if necessary. */
2364 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
2366 poly_uint64 niters_th = 0;
2368 if (!vect_use_loop_mask_for_alignment_p (loop_vinfo))
2370 /* Niters for peeled prolog loop. */
2371 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
2373 struct data_reference *dr = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
2374 tree vectype
2375 = STMT_VINFO_VECTYPE (vinfo_for_stmt (DR_STMT (dr)));
2376 niters_th += TYPE_VECTOR_SUBPARTS (vectype) - 1;
2378 else
2379 niters_th += LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
2382 /* Niters for at least one iteration of vectorized loop. */
2383 if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
2384 niters_th += LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2385 /* One additional iteration because of peeling for gap. */
2386 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
2387 niters_th += 1;
2388 LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = niters_th;
2391 gcc_assert (known_eq (vectorization_factor,
2392 LOOP_VINFO_VECT_FACTOR (loop_vinfo)));
2394 /* Ok to vectorize! */
2395 return true;
2397 again:
2398 /* Try again with SLP forced off but if we didn't do any SLP there is
2399 no point in re-trying. */
2400 if (!slp)
2401 return false;
2403 /* If there are reduction chains re-trying will fail anyway. */
2404 if (! LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).is_empty ())
2405 return false;
2407 /* Likewise if the grouped loads or stores in the SLP cannot be handled
2408 via interleaving or lane instructions. */
2409 slp_instance instance;
2410 slp_tree node;
2411 unsigned i, j;
2412 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
2414 stmt_vec_info vinfo;
2415 vinfo = vinfo_for_stmt
2416 (SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (instance))[0]);
2417 if (! STMT_VINFO_GROUPED_ACCESS (vinfo))
2418 continue;
2419 vinfo = vinfo_for_stmt (STMT_VINFO_GROUP_FIRST_ELEMENT (vinfo));
2420 unsigned int size = STMT_VINFO_GROUP_SIZE (vinfo);
2421 tree vectype = STMT_VINFO_VECTYPE (vinfo);
2422 if (! vect_store_lanes_supported (vectype, size, false)
2423 && ! vect_grouped_store_supported (vectype, size))
2424 return false;
2425 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), j, node)
2427 vinfo = vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (node)[0]);
2428 vinfo = vinfo_for_stmt (STMT_VINFO_GROUP_FIRST_ELEMENT (vinfo));
2429 bool single_element_p = !STMT_VINFO_GROUP_NEXT_ELEMENT (vinfo);
2430 size = STMT_VINFO_GROUP_SIZE (vinfo);
2431 vectype = STMT_VINFO_VECTYPE (vinfo);
2432 if (! vect_load_lanes_supported (vectype, size, false)
2433 && ! vect_grouped_load_supported (vectype, single_element_p,
2434 size))
2435 return false;
2439 if (dump_enabled_p ())
2440 dump_printf_loc (MSG_NOTE, vect_location,
2441 "re-trying with SLP disabled\n");
2443 /* Roll back state appropriately. No SLP this time. */
2444 slp = false;
2445 /* Restore the vectorization factor to what it was without SLP. */
2446 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = saved_vectorization_factor;
2447 /* Free the SLP instances. */
2448 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), j, instance)
2449 vect_free_slp_instance (instance);
2450 LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
2451 /* Reset SLP type to loop_vect on all stmts. */
2452 for (i = 0; i < LOOP_VINFO_LOOP (loop_vinfo)->num_nodes; ++i)
2454 basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
2455 for (gimple_stmt_iterator si = gsi_start_phis (bb);
2456 !gsi_end_p (si); gsi_next (&si))
2458 stmt_vec_info stmt_info = vinfo_for_stmt (gsi_stmt (si));
2459 STMT_SLP_TYPE (stmt_info) = loop_vect;
2461 for (gimple_stmt_iterator si = gsi_start_bb (bb);
2462 !gsi_end_p (si); gsi_next (&si))
2464 stmt_vec_info stmt_info = vinfo_for_stmt (gsi_stmt (si));
2465 STMT_SLP_TYPE (stmt_info) = loop_vect;
2466 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
2468 stmt_info = vinfo_for_stmt (STMT_VINFO_RELATED_STMT (stmt_info));
2469 STMT_SLP_TYPE (stmt_info) = loop_vect;
2470 for (gimple_stmt_iterator pi
2471 = gsi_start (STMT_VINFO_PATTERN_DEF_SEQ (stmt_info));
2472 !gsi_end_p (pi); gsi_next (&pi))
2474 gimple *pstmt = gsi_stmt (pi);
2475 STMT_SLP_TYPE (vinfo_for_stmt (pstmt)) = loop_vect;
2480 /* Free optimized alias test DDRS. */
2481 LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).truncate (0);
2482 LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).release ();
2483 LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).release ();
2484 /* Reset target cost data. */
2485 destroy_cost_data (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo));
2486 LOOP_VINFO_TARGET_COST_DATA (loop_vinfo)
2487 = init_cost (LOOP_VINFO_LOOP (loop_vinfo));
2488 /* Reset accumulated rgroup information. */
2489 release_vec_loop_masks (&LOOP_VINFO_MASKS (loop_vinfo));
2490 /* Reset assorted flags. */
2491 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
2492 LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = false;
2493 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = 0;
2494 LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = 0;
2495 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = saved_can_fully_mask_p;
2497 goto start_over;
2500 /* Function vect_analyze_loop.
2502 Apply a set of analyses on LOOP, and create a loop_vec_info struct
2503 for it. The different analyses will record information in the
2504 loop_vec_info struct. If ORIG_LOOP_VINFO is not NULL epilogue must
2505 be vectorized. */
2506 loop_vec_info
2507 vect_analyze_loop (struct loop *loop, loop_vec_info orig_loop_vinfo)
2509 loop_vec_info loop_vinfo;
2510 auto_vector_sizes vector_sizes;
2512 /* Autodetect first vector size we try. */
2513 current_vector_size = 0;
2514 targetm.vectorize.autovectorize_vector_sizes (&vector_sizes);
2515 unsigned int next_size = 0;
2517 if (dump_enabled_p ())
2518 dump_printf_loc (MSG_NOTE, vect_location,
2519 "===== analyze_loop_nest =====\n");
2521 if (loop_outer (loop)
2522 && loop_vec_info_for_loop (loop_outer (loop))
2523 && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop))))
2525 if (dump_enabled_p ())
2526 dump_printf_loc (MSG_NOTE, vect_location,
2527 "outer-loop already vectorized.\n");
2528 return NULL;
2531 poly_uint64 autodetected_vector_size = 0;
2532 while (1)
2534 /* Check the CFG characteristics of the loop (nesting, entry/exit). */
2535 loop_vinfo = vect_analyze_loop_form (loop);
2536 if (!loop_vinfo)
2538 if (dump_enabled_p ())
2539 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2540 "bad loop form.\n");
2541 return NULL;
2544 bool fatal = false;
2546 if (orig_loop_vinfo)
2547 LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = orig_loop_vinfo;
2549 if (vect_analyze_loop_2 (loop_vinfo, fatal))
2551 LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1;
2553 return loop_vinfo;
2556 delete loop_vinfo;
2558 if (next_size == 0)
2559 autodetected_vector_size = current_vector_size;
2561 if (next_size < vector_sizes.length ()
2562 && known_eq (vector_sizes[next_size], autodetected_vector_size))
2563 next_size += 1;
2565 if (fatal
2566 || next_size == vector_sizes.length ()
2567 || known_eq (current_vector_size, 0U))
2568 return NULL;
2570 /* Try the next biggest vector size. */
2571 current_vector_size = vector_sizes[next_size++];
2572 if (dump_enabled_p ())
2574 dump_printf_loc (MSG_NOTE, vect_location,
2575 "***** Re-trying analysis with "
2576 "vector size ");
2577 dump_dec (MSG_NOTE, current_vector_size);
2578 dump_printf (MSG_NOTE, "\n");
2583 /* Return true if there is an in-order reduction function for CODE, storing
2584 it in *REDUC_FN if so. */
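/* Editor's illustration: the canonical in-order reduction is a
   floating-point accumulation compiled without -fassociative-math:

     float s = 0.0f;
     for (int i = 0; i < n; i++)
       s += a[i];

   FP addition is not associative, so the partial sums may not be reordered
   and the vector elements have to be folded into the accumulator one at a
   time, which is what IFN_FOLD_LEFT_PLUS expresses.  */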
2586 static bool
2587 fold_left_reduction_fn (tree_code code, internal_fn *reduc_fn)
2589 switch (code)
2591 case PLUS_EXPR:
2592 *reduc_fn = IFN_FOLD_LEFT_PLUS;
2593 return true;
2595 default:
2596 return false;
2600 /* Function reduction_fn_for_scalar_code
2602 Input:
2603 CODE - tree_code of a reduction operations.
2605 Output:
2606 REDUC_FN - the corresponding internal function to be used to reduce the
2607 vector of partial results into a single scalar result, or IFN_LAST
2608 if the operation is a supported reduction operation, but does not have
2609 such an internal function.
2611 Return FALSE if CODE currently cannot be vectorized as reduction. */
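/* Editor's illustration: for a max reduction such as

     for (int i = 0; i < n; i++)
       m = a[i] > m ? a[i] : m;

   the vector loop keeps a vector of partial maxima and IFN_REDUC_MAX
   collapses it to the final scalar after the loop.  For MULT_EXPR and
   MINUS_EXPR we return IFN_LAST: the reduction is still vectorizable, but
   the epilogue is typically open-coded with shifts/permutes instead of a
   single internal function.  */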
2613 static bool
2614 reduction_fn_for_scalar_code (enum tree_code code, internal_fn *reduc_fn)
2616 switch (code)
2618 case MAX_EXPR:
2619 *reduc_fn = IFN_REDUC_MAX;
2620 return true;
2622 case MIN_EXPR:
2623 *reduc_fn = IFN_REDUC_MIN;
2624 return true;
2626 case PLUS_EXPR:
2627 *reduc_fn = IFN_REDUC_PLUS;
2628 return true;
2630 case BIT_AND_EXPR:
2631 *reduc_fn = IFN_REDUC_AND;
2632 return true;
2634 case BIT_IOR_EXPR:
2635 *reduc_fn = IFN_REDUC_IOR;
2636 return true;
2638 case BIT_XOR_EXPR:
2639 *reduc_fn = IFN_REDUC_XOR;
2640 return true;
2642 case MULT_EXPR:
2643 case MINUS_EXPR:
2644 *reduc_fn = IFN_LAST;
2645 return true;
2647 default:
2648 return false;
2652 /* If there is a neutral value X such that SLP reduction NODE would not
2653 be affected by the introduction of additional X elements, return that X,
2654 otherwise return null. CODE is the code of the reduction. REDUC_CHAIN
2655 is true if the SLP statements perform a single reduction, false if each
2656 statement performs an independent reduction. */
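/* Editor's note: the neutral value is the identity element of CODE, i.e.
   one that can be appended to the vector without changing the result:
   extra zeros do not change a sum, extra ones do not change a product, and
   extra all-ones values do not change a bitwise AND, which is why the
   cases below return 0, 1 and ~0 respectively.  */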
2658 static tree
2659 neutral_op_for_slp_reduction (slp_tree slp_node, tree_code code,
2660 bool reduc_chain)
2662 vec<gimple *> stmts = SLP_TREE_SCALAR_STMTS (slp_node);
2663 gimple *stmt = stmts[0];
2664 stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
2665 tree vector_type = STMT_VINFO_VECTYPE (stmt_vinfo);
2666 tree scalar_type = TREE_TYPE (vector_type);
2667 struct loop *loop = gimple_bb (stmt)->loop_father;
2668 gcc_assert (loop);
2670 switch (code)
2672 case WIDEN_SUM_EXPR:
2673 case DOT_PROD_EXPR:
2674 case SAD_EXPR:
2675 case PLUS_EXPR:
2676 case MINUS_EXPR:
2677 case BIT_IOR_EXPR:
2678 case BIT_XOR_EXPR:
2679 return build_zero_cst (scalar_type);
2681 case MULT_EXPR:
2682 return build_one_cst (scalar_type);
2684 case BIT_AND_EXPR:
2685 return build_all_ones_cst (scalar_type);
2687 case MAX_EXPR:
2688 case MIN_EXPR:
2689 /* For MIN/MAX the initial values are neutral. A reduction chain
2690 has only a single initial value, so that value is neutral for
2691 all statements. */
2692 if (reduc_chain)
2693 return PHI_ARG_DEF_FROM_EDGE (stmt, loop_preheader_edge (loop));
2694 return NULL_TREE;
2696 default:
2697 return NULL_TREE;
2701 /* Error reporting helper for vect_is_simple_reduction below. GIMPLE statement
2702 STMT is printed with a message MSG. */
2704 static void
2705 report_vect_op (dump_flags_t msg_type, gimple *stmt, const char *msg)
2707 dump_printf_loc (msg_type, vect_location, "%s", msg);
2708 dump_gimple_stmt (msg_type, TDF_SLIM, stmt, 0);
2712 /* Detect SLP reduction of the form:
2714 #a1 = phi <a5, a0>
2715 a2 = operation (a1)
2716 a3 = operation (a2)
2717 a4 = operation (a3)
2718 a5 = operation (a4)
2720 #a = phi <a5>
2722 PHI is the reduction phi node (#a1 = phi <a5, a0> above)
2723 FIRST_STMT is the first reduction stmt in the chain
2724 (a2 = operation (a1)).
2726 Return TRUE if a reduction chain was detected. */
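/* Editor's illustration: such a chain typically comes from a manually
   unrolled accumulation, e.g.

     for (int i = 0; i < n; i += 4)
       sum = sum + a[i] + a[i + 1] + a[i + 2] + a[i + 3];

   which gimplifies into a2 = a1 + ..., a3 = a2 + ..., a4 = a3 + ...,
   a5 = a4 + ... feeding the reduction PHI, as in the pattern above.  */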
2728 static bool
2729 vect_is_slp_reduction (loop_vec_info loop_info, gimple *phi,
2730 gimple *first_stmt)
2732 struct loop *loop = (gimple_bb (phi))->loop_father;
2733 struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
2734 enum tree_code code;
2735 gimple *current_stmt = NULL, *loop_use_stmt = NULL, *first, *next_stmt;
2736 stmt_vec_info use_stmt_info, current_stmt_info;
2737 tree lhs;
2738 imm_use_iterator imm_iter;
2739 use_operand_p use_p;
2740 int nloop_uses, size = 0, n_out_of_loop_uses;
2741 bool found = false;
2743 if (loop != vect_loop)
2744 return false;
2746 lhs = PHI_RESULT (phi);
2747 code = gimple_assign_rhs_code (first_stmt);
2748 while (1)
2750 nloop_uses = 0;
2751 n_out_of_loop_uses = 0;
2752 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
2754 gimple *use_stmt = USE_STMT (use_p);
2755 if (is_gimple_debug (use_stmt))
2756 continue;
2758 /* Check if we got back to the reduction phi. */
2759 if (use_stmt == phi)
2761 loop_use_stmt = use_stmt;
2762 found = true;
2763 break;
2766 if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2768 loop_use_stmt = use_stmt;
2769 nloop_uses++;
2771 else
2772 n_out_of_loop_uses++;
2774 /* There can be either a single use in the loop or two uses in
2775 phi nodes. */
2776 if (nloop_uses > 1 || (n_out_of_loop_uses && nloop_uses))
2777 return false;
2780 if (found)
2781 break;
2783 /* We reached a statement with no loop uses. */
2784 if (nloop_uses == 0)
2785 return false;
2787 /* This is a loop exit phi, and we haven't reached the reduction phi. */
2788 if (gimple_code (loop_use_stmt) == GIMPLE_PHI)
2789 return false;
2791 if (!is_gimple_assign (loop_use_stmt)
2792 || code != gimple_assign_rhs_code (loop_use_stmt)
2793 || !flow_bb_inside_loop_p (loop, gimple_bb (loop_use_stmt)))
2794 return false;
2796 /* Insert USE_STMT into reduction chain. */
2797 use_stmt_info = vinfo_for_stmt (loop_use_stmt);
2798 if (current_stmt)
2800 current_stmt_info = vinfo_for_stmt (current_stmt);
2801 GROUP_NEXT_ELEMENT (current_stmt_info) = loop_use_stmt;
2802 GROUP_FIRST_ELEMENT (use_stmt_info)
2803 = GROUP_FIRST_ELEMENT (current_stmt_info);
2805 else
2806 GROUP_FIRST_ELEMENT (use_stmt_info) = loop_use_stmt;
2808 lhs = gimple_assign_lhs (loop_use_stmt);
2809 current_stmt = loop_use_stmt;
2810 size++;
2813 if (!found || loop_use_stmt != phi || size < 2)
2814 return false;
2816 /* Swap the operands, if needed, to make the reduction operand be the second
2817 operand. */
2818 lhs = PHI_RESULT (phi);
2819 next_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (current_stmt));
2820 while (next_stmt)
2822 if (gimple_assign_rhs2 (next_stmt) == lhs)
2824 tree op = gimple_assign_rhs1 (next_stmt);
2825 gimple *def_stmt = NULL;
2827 if (TREE_CODE (op) == SSA_NAME)
2828 def_stmt = SSA_NAME_DEF_STMT (op);
2830 /* Check that the other def is either defined in the loop
2831 ("vect_internal_def"), or it's an induction (defined by a
2832 loop-header phi-node). */
2833 if (def_stmt
2834 && gimple_bb (def_stmt)
2835 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2836 && (is_gimple_assign (def_stmt)
2837 || is_gimple_call (def_stmt)
2838 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2839 == vect_induction_def
2840 || (gimple_code (def_stmt) == GIMPLE_PHI
2841 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2842 == vect_internal_def
2843 && !is_loop_header_bb_p (gimple_bb (def_stmt)))))
2845 lhs = gimple_assign_lhs (next_stmt);
2846 next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
2847 continue;
2850 return false;
2852 else
2854 tree op = gimple_assign_rhs2 (next_stmt);
2855 gimple *def_stmt = NULL;
2857 if (TREE_CODE (op) == SSA_NAME)
2858 def_stmt = SSA_NAME_DEF_STMT (op);
2860 /* Check that the other def is either defined in the loop
2861 ("vect_internal_def"), or it's an induction (defined by a
2862 loop-header phi-node). */
2863 if (def_stmt
2864 && gimple_bb (def_stmt)
2865 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2866 && (is_gimple_assign (def_stmt)
2867 || is_gimple_call (def_stmt)
2868 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2869 == vect_induction_def
2870 || (gimple_code (def_stmt) == GIMPLE_PHI
2871 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2872 == vect_internal_def
2873 && !is_loop_header_bb_p (gimple_bb (def_stmt)))))
2875 if (dump_enabled_p ())
2877 dump_printf_loc (MSG_NOTE, vect_location, "swapping oprnds: ");
2878 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, next_stmt, 0);
2881 swap_ssa_operands (next_stmt,
2882 gimple_assign_rhs1_ptr (next_stmt),
2883 gimple_assign_rhs2_ptr (next_stmt));
2884 update_stmt (next_stmt);
2886 if (CONSTANT_CLASS_P (gimple_assign_rhs1 (next_stmt)))
2887 LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true;
2889 else
2890 return false;
2893 lhs = gimple_assign_lhs (next_stmt);
2894 next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
2897 /* Save the chain for further analysis in SLP detection. */
2898 first = GROUP_FIRST_ELEMENT (vinfo_for_stmt (current_stmt));
2899 LOOP_VINFO_REDUCTION_CHAINS (loop_info).safe_push (first);
2900 GROUP_SIZE (vinfo_for_stmt (first)) = size;
2902 return true;
2905 /* Return true if we need an in-order reduction for operation CODE
2906 on type TYPE. NEED_WRAPPING_INTEGRAL_OVERFLOW is true if integer
2907 overflow must wrap. */
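/* Editor's illustration: besides non-associative FP math, an in-order
   reduction is also forced when reassociation could introduce a trapping
   intermediate overflow, e.g. a signed accumulation compiled with -ftrapv:

     int s = 0;
     for (int i = 0; i < n; i++)
       s += a[i];

   Reordering the additions could trap on a partial sum even though the
   original left-to-right evaluation would not.  */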
2909 static bool
2910 needs_fold_left_reduction_p (tree type, tree_code code,
2911 bool need_wrapping_integral_overflow)
2913 /* CHECKME: check for !flag_finite_math_only too? */
2914 if (SCALAR_FLOAT_TYPE_P (type))
2915 switch (code)
2917 case MIN_EXPR:
2918 case MAX_EXPR:
2919 return false;
2921 default:
2922 return !flag_associative_math;
2925 if (INTEGRAL_TYPE_P (type))
2927 if (!operation_no_trapping_overflow (type, code))
2928 return true;
2929 if (need_wrapping_integral_overflow
2930 && !TYPE_OVERFLOW_WRAPS (type)
2931 && operation_can_overflow (code))
2932 return true;
2933 return false;
2936 if (SAT_FIXED_POINT_TYPE_P (type))
2937 return true;
2939 return false;
2942 /* Return true if the reduction PHI in LOOP with latch arg LOOP_ARG and
2943 reduction operation CODE has a handled computation expression. */
2945 bool
2946 check_reduction_path (location_t loc, loop_p loop, gphi *phi, tree loop_arg,
2947 enum tree_code code)
2949 auto_vec<std::pair<ssa_op_iter, use_operand_p> > path;
2950 auto_bitmap visited;
2951 tree lookfor = PHI_RESULT (phi);
2952 ssa_op_iter curri;
2953 use_operand_p curr = op_iter_init_phiuse (&curri, phi, SSA_OP_USE);
2954 while (USE_FROM_PTR (curr) != loop_arg)
2955 curr = op_iter_next_use (&curri);
2956 curri.i = curri.numops;
2959 path.safe_push (std::make_pair (curri, curr));
2960 tree use = USE_FROM_PTR (curr);
2961 if (use == lookfor)
2962 break;
2963 gimple *def = SSA_NAME_DEF_STMT (use);
2964 if (gimple_nop_p (def)
2965 || ! flow_bb_inside_loop_p (loop, gimple_bb (def)))
2967 pop:
2970 std::pair<ssa_op_iter, use_operand_p> x = path.pop ();
2971 curri = x.first;
2972 curr = x.second;
2974 curr = op_iter_next_use (&curri);
2975 /* Skip already visited or non-SSA operands (from iterating
2976 over PHI args). */
2977 while (curr != NULL_USE_OPERAND_P
2978 && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
2979 || ! bitmap_set_bit (visited,
2980 SSA_NAME_VERSION
2981 (USE_FROM_PTR (curr)))));
2983 while (curr == NULL_USE_OPERAND_P && ! path.is_empty ());
2984 if (curr == NULL_USE_OPERAND_P)
2985 break;
2987 else
2989 if (gimple_code (def) == GIMPLE_PHI)
2990 curr = op_iter_init_phiuse (&curri, as_a <gphi *>(def), SSA_OP_USE);
2991 else
2992 curr = op_iter_init_use (&curri, def, SSA_OP_USE);
2993 while (curr != NULL_USE_OPERAND_P
2994 && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
2995 || ! bitmap_set_bit (visited,
2996 SSA_NAME_VERSION
2997 (USE_FROM_PTR (curr)))))
2998 curr = op_iter_next_use (&curri);
2999 if (curr == NULL_USE_OPERAND_P)
3000 goto pop;
3003 while (1);
3004 if (dump_file && (dump_flags & TDF_DETAILS))
3006 dump_printf_loc (MSG_NOTE, loc, "reduction path: ");
3007 unsigned i;
3008 std::pair<ssa_op_iter, use_operand_p> *x;
3009 FOR_EACH_VEC_ELT (path, i, x)
3011 dump_generic_expr (MSG_NOTE, TDF_SLIM, USE_FROM_PTR (x->second));
3012 dump_printf (MSG_NOTE, " ");
3014 dump_printf (MSG_NOTE, "\n");
3017 /* Check whether the reduction path detected is valid. */
3018 bool fail = path.length () == 0;
3019 bool neg = false;
3020 for (unsigned i = 1; i < path.length (); ++i)
3022 gimple *use_stmt = USE_STMT (path[i].second);
3023 tree op = USE_FROM_PTR (path[i].second);
3024 if (! has_single_use (op)
3025 || ! is_gimple_assign (use_stmt))
3027 fail = true;
3028 break;
3030 if (gimple_assign_rhs_code (use_stmt) != code)
3032 if (code == PLUS_EXPR
3033 && gimple_assign_rhs_code (use_stmt) == MINUS_EXPR)
3035 /* Track whether we negate the reduction value each iteration. */
3036 if (gimple_assign_rhs2 (use_stmt) == op)
3037 neg = ! neg;
3039 else
3041 fail = true;
3042 break;
3046 return ! fail && ! neg;
3050 /* Function vect_is_simple_reduction
3052 (1) Detect a cross-iteration def-use cycle that represents a simple
3053 reduction computation. We look for the following pattern:
3055 loop_header:
3056 a1 = phi < a0, a2 >
3057 a3 = ...
3058 a2 = operation (a3, a1)
3060 or
3062 a3 = ...
3063 loop_header:
3064 a1 = phi < a0, a2 >
3065 a2 = operation (a3, a1)
3067 such that:
3068 1. operation is commutative and associative and it is safe to
3069 change the order of the computation
3070 2. no uses for a2 in the loop (a2 is used out of the loop)
3071 3. no uses of a1 in the loop besides the reduction operation
3072 4. no uses of a1 outside the loop.
3074 Conditions 1,4 are tested here.
3075 Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.
3077 (2) Detect a cross-iteration def-use cycle in nested loops, i.e.,
3078 nested cycles.
3080 (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double
3081 reductions:
3083 a1 = phi < a0, a2 >
3084 inner loop (def of a3)
3085 a2 = phi < a3 >
3087 (4) Detect condition expressions, i.e.:
3088 for (int i = 0; i < N; i++)
3089 if (a[i] < val)
3090 ret_val = a[i];
3094 static gimple *
3095 vect_is_simple_reduction (loop_vec_info loop_info, gimple *phi,
3096 bool *double_reduc,
3097 bool need_wrapping_integral_overflow,
3098 enum vect_reduction_type *v_reduc_type)
3100 struct loop *loop = (gimple_bb (phi))->loop_father;
3101 struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
3102 gimple *def_stmt, *def1 = NULL, *def2 = NULL, *phi_use_stmt = NULL;
3103 enum tree_code orig_code, code;
3104 tree op1, op2, op3 = NULL_TREE, op4 = NULL_TREE;
3105 tree type;
3106 int nloop_uses;
3107 tree name;
3108 imm_use_iterator imm_iter;
3109 use_operand_p use_p;
3110 bool phi_def;
3112 *double_reduc = false;
3113 *v_reduc_type = TREE_CODE_REDUCTION;
3115 tree phi_name = PHI_RESULT (phi);
3116 /* ??? If there are no uses of the PHI result the inner loop reduction
3117 won't be detected as possibly double-reduction by vectorizable_reduction
3118 because that tries to walk the PHI arg from the preheader edge which
3119 can be constant. See PR60382. */
3120 if (has_zero_uses (phi_name))
3121 return NULL;
3122 nloop_uses = 0;
3123 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, phi_name)
3125 gimple *use_stmt = USE_STMT (use_p);
3126 if (is_gimple_debug (use_stmt))
3127 continue;
3129 if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
3131 if (dump_enabled_p ())
3132 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3133 "intermediate value used outside loop.\n");
3135 return NULL;
3138 nloop_uses++;
3139 if (nloop_uses > 1)
3141 if (dump_enabled_p ())
3142 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3143 "reduction value used in loop.\n");
3144 return NULL;
3147 phi_use_stmt = use_stmt;
3150 edge latch_e = loop_latch_edge (loop);
3151 tree loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
3152 if (TREE_CODE (loop_arg) != SSA_NAME)
3154 if (dump_enabled_p ())
3156 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3157 "reduction: not ssa_name: ");
3158 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, loop_arg);
3159 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3161 return NULL;
3164 def_stmt = SSA_NAME_DEF_STMT (loop_arg);
3165 if (is_gimple_assign (def_stmt))
3167 name = gimple_assign_lhs (def_stmt);
3168 phi_def = false;
3170 else if (gimple_code (def_stmt) == GIMPLE_PHI)
3172 name = PHI_RESULT (def_stmt);
3173 phi_def = true;
3175 else
3177 if (dump_enabled_p ())
3179 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3180 "reduction: unhandled reduction operation: ");
3181 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, def_stmt, 0);
3183 return NULL;
3186 if (! flow_bb_inside_loop_p (loop, gimple_bb (def_stmt)))
3187 return NULL;
3189 nloop_uses = 0;
3190 auto_vec<gphi *, 3> lcphis;
3191 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name)
3193 gimple *use_stmt = USE_STMT (use_p);
3194 if (is_gimple_debug (use_stmt))
3195 continue;
3196 if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
3197 nloop_uses++;
3198 else
3199 /* We can have more than one loop-closed PHI. */
3200 lcphis.safe_push (as_a <gphi *> (use_stmt));
3201 if (nloop_uses > 1)
3203 if (dump_enabled_p ())
3204 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3205 "reduction used in loop.\n");
3206 return NULL;
3210 /* If DEF_STMT is a phi node itself, we expect it to have a single argument
3211 defined in the inner loop. */
3212 if (phi_def)
3214 op1 = PHI_ARG_DEF (def_stmt, 0);
3216 if (gimple_phi_num_args (def_stmt) != 1
3217 || TREE_CODE (op1) != SSA_NAME)
3219 if (dump_enabled_p ())
3220 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3221 "unsupported phi node definition.\n");
3223 return NULL;
3226 def1 = SSA_NAME_DEF_STMT (op1);
3227 if (gimple_bb (def1)
3228 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
3229 && loop->inner
3230 && flow_bb_inside_loop_p (loop->inner, gimple_bb (def1))
3231 && is_gimple_assign (def1)
3232 && flow_bb_inside_loop_p (loop->inner, gimple_bb (phi_use_stmt)))
3234 if (dump_enabled_p ())
3235 report_vect_op (MSG_NOTE, def_stmt,
3236 "detected double reduction: ");
3238 *double_reduc = true;
3239 return def_stmt;
3242 return NULL;
3245 /* If we are vectorizing an inner reduction, we execute it in the
3246 original order only when we are not dealing with a double
3247 reduction. */
3248 bool check_reduction = true;
3249 if (flow_loop_nested_p (vect_loop, loop))
3251 gphi *lcphi;
3252 unsigned i;
3253 check_reduction = false;
3254 FOR_EACH_VEC_ELT (lcphis, i, lcphi)
3255 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, gimple_phi_result (lcphi))
3257 gimple *use_stmt = USE_STMT (use_p);
3258 if (is_gimple_debug (use_stmt))
3259 continue;
3260 if (! flow_bb_inside_loop_p (vect_loop, gimple_bb (use_stmt)))
3261 check_reduction = true;
3265 bool nested_in_vect_loop = flow_loop_nested_p (vect_loop, loop);
3266 code = orig_code = gimple_assign_rhs_code (def_stmt);
3268 /* We can handle "res -= x[i]", which is non-associative, by
3269 simply rewriting it into "res += -x[i]". Avoid changing the
3270 gimple instruction for the first simple tests and only do this
3271 if we're allowed to change the code at all. */
3272 if (code == MINUS_EXPR && gimple_assign_rhs2 (def_stmt) != phi_name)
3273 code = PLUS_EXPR;
3275 if (code == COND_EXPR)
3277 if (! nested_in_vect_loop)
3278 *v_reduc_type = COND_REDUCTION;
3280 op3 = gimple_assign_rhs1 (def_stmt);
3281 if (COMPARISON_CLASS_P (op3))
3283 op4 = TREE_OPERAND (op3, 1);
3284 op3 = TREE_OPERAND (op3, 0);
3286 if (op3 == phi_name || op4 == phi_name)
3288 if (dump_enabled_p ())
3289 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3290 "reduction: condition depends on previous"
3291 " iteration: ");
3292 return NULL;
3295 op1 = gimple_assign_rhs2 (def_stmt);
3296 op2 = gimple_assign_rhs3 (def_stmt);
3298 else if (!commutative_tree_code (code) || !associative_tree_code (code))
3300 if (dump_enabled_p ())
3301 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3302 "reduction: not commutative/associative: ");
3303 return NULL;
3305 else if (get_gimple_rhs_class (code) == GIMPLE_BINARY_RHS)
3307 op1 = gimple_assign_rhs1 (def_stmt);
3308 op2 = gimple_assign_rhs2 (def_stmt);
3310 else
3312 if (dump_enabled_p ())
3313 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3314 "reduction: not handled operation: ");
3315 return NULL;
3318 if (TREE_CODE (op1) != SSA_NAME && TREE_CODE (op2) != SSA_NAME)
3320 if (dump_enabled_p ())
3321 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3322 "reduction: both uses not ssa_names: ");
3324 return NULL;
3327 type = TREE_TYPE (gimple_assign_lhs (def_stmt));
3328 if ((TREE_CODE (op1) == SSA_NAME
3329 && !types_compatible_p (type,TREE_TYPE (op1)))
3330 || (TREE_CODE (op2) == SSA_NAME
3331 && !types_compatible_p (type, TREE_TYPE (op2)))
3332 || (op3 && TREE_CODE (op3) == SSA_NAME
3333 && !types_compatible_p (type, TREE_TYPE (op3)))
3334 || (op4 && TREE_CODE (op4) == SSA_NAME
3335 && !types_compatible_p (type, TREE_TYPE (op4))))
3337 if (dump_enabled_p ())
3339 dump_printf_loc (MSG_NOTE, vect_location,
3340 "reduction: multiple types: operation type: ");
3341 dump_generic_expr (MSG_NOTE, TDF_SLIM, type);
3342 dump_printf (MSG_NOTE, ", operands types: ");
3343 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3344 TREE_TYPE (op1));
3345 dump_printf (MSG_NOTE, ",");
3346 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3347 TREE_TYPE (op2));
3348 if (op3)
3350 dump_printf (MSG_NOTE, ",");
3351 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3352 TREE_TYPE (op3));
3355 if (op4)
3357 dump_printf (MSG_NOTE, ",");
3358 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3359 TREE_TYPE (op4));
3361 dump_printf (MSG_NOTE, "\n");
3364 return NULL;
3367 /* Check whether it's ok to change the order of the computation.
3368 Generally, when vectorizing a reduction we change the order of the
3369 computation. This may change the behavior of the program in some
3370 cases, so we need to check that this is ok. One exception is when
3371 vectorizing an outer-loop: the inner-loop is executed sequentially,
3372 and therefore vectorizing reductions in the inner-loop during
3373 outer-loop vectorization is safe. */
3374 if (check_reduction
3375 && *v_reduc_type == TREE_CODE_REDUCTION
3376 && needs_fold_left_reduction_p (type, code,
3377 need_wrapping_integral_overflow))
3378 *v_reduc_type = FOLD_LEFT_REDUCTION;
3380 /* Reduction is safe. We're dealing with one of the following:
3381 1) integer arithmetic and no trapv
3382 2) floating point arithmetic, and special flags permit this optimization
3383 3) nested cycle (i.e., outer loop vectorization). */
3384 if (TREE_CODE (op1) == SSA_NAME)
3385 def1 = SSA_NAME_DEF_STMT (op1);
3387 if (TREE_CODE (op2) == SSA_NAME)
3388 def2 = SSA_NAME_DEF_STMT (op2);
3390 if (code != COND_EXPR
3391 && ((!def1 || gimple_nop_p (def1)) && (!def2 || gimple_nop_p (def2))))
3393 if (dump_enabled_p ())
3394 report_vect_op (MSG_NOTE, def_stmt, "reduction: no defs for operands: ");
3395 return NULL;
3398 /* Check that one def is the reduction def, defined by PHI,
3399 the other def is either defined in the loop ("vect_internal_def"),
3400 or it's an induction (defined by a loop-header phi-node). */
3402 if (def2 && def2 == phi
3403 && (code == COND_EXPR
3404 || !def1 || gimple_nop_p (def1)
3405 || !flow_bb_inside_loop_p (loop, gimple_bb (def1))
3406 || (def1 && flow_bb_inside_loop_p (loop, gimple_bb (def1))
3407 && (is_gimple_assign (def1)
3408 || is_gimple_call (def1)
3409 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1))
3410 == vect_induction_def
3411 || (gimple_code (def1) == GIMPLE_PHI
3412 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1))
3413 == vect_internal_def
3414 && !is_loop_header_bb_p (gimple_bb (def1)))))))
3416 if (dump_enabled_p ())
3417 report_vect_op (MSG_NOTE, def_stmt, "detected reduction: ");
3418 return def_stmt;
3421 if (def1 && def1 == phi
3422 && (code == COND_EXPR
3423 || !def2 || gimple_nop_p (def2)
3424 || !flow_bb_inside_loop_p (loop, gimple_bb (def2))
3425 || (def2 && flow_bb_inside_loop_p (loop, gimple_bb (def2))
3426 && (is_gimple_assign (def2)
3427 || is_gimple_call (def2)
3428 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2))
3429 == vect_induction_def
3430 || (gimple_code (def2) == GIMPLE_PHI
3431 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2))
3432 == vect_internal_def
3433 && !is_loop_header_bb_p (gimple_bb (def2)))))))
3435 if (! nested_in_vect_loop && orig_code != MINUS_EXPR)
3437 /* Check if we can swap operands (just for simplicity - so that
3438 the rest of the code can assume that the reduction variable
3439 is always the last (second) argument). */
3440 if (code == COND_EXPR)
3442 /* Swap cond_expr by inverting the condition. */
3443 tree cond_expr = gimple_assign_rhs1 (def_stmt);
3444 enum tree_code invert_code = ERROR_MARK;
3445 enum tree_code cond_code = TREE_CODE (cond_expr);
3447 if (TREE_CODE_CLASS (cond_code) == tcc_comparison)
3449 bool honor_nans = HONOR_NANS (TREE_OPERAND (cond_expr, 0));
3450 invert_code = invert_tree_comparison (cond_code, honor_nans);
3452 if (invert_code != ERROR_MARK)
3454 TREE_SET_CODE (cond_expr, invert_code);
3455 swap_ssa_operands (def_stmt,
3456 gimple_assign_rhs2_ptr (def_stmt),
3457 gimple_assign_rhs3_ptr (def_stmt));
3459 else
3461 if (dump_enabled_p ())
3462 report_vect_op (MSG_NOTE, def_stmt,
3463 "detected reduction: cannot swap operands "
3464 "for cond_expr");
3465 return NULL;
3468 else
3469 swap_ssa_operands (def_stmt, gimple_assign_rhs1_ptr (def_stmt),
3470 gimple_assign_rhs2_ptr (def_stmt));
3472 if (dump_enabled_p ())
3473 report_vect_op (MSG_NOTE, def_stmt,
3474 "detected reduction: need to swap operands: ");
3476 if (CONSTANT_CLASS_P (gimple_assign_rhs1 (def_stmt)))
3477 LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true;
3479 else
3481 if (dump_enabled_p ())
3482 report_vect_op (MSG_NOTE, def_stmt, "detected reduction: ");
3485 return def_stmt;
3488 /* Try to find SLP reduction chain. */
3489 if (! nested_in_vect_loop
3490 && code != COND_EXPR
3491 && orig_code != MINUS_EXPR
3492 && vect_is_slp_reduction (loop_info, phi, def_stmt))
3494 if (dump_enabled_p ())
3495 report_vect_op (MSG_NOTE, def_stmt,
3496 "reduction: detected reduction chain: ");
3498 return def_stmt;
3501 /* Dissolve the group possibly half-built by vect_is_slp_reduction. */
3502 gimple *first = GROUP_FIRST_ELEMENT (vinfo_for_stmt (def_stmt));
3503 while (first)
3505 gimple *next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (first));
3506 GROUP_FIRST_ELEMENT (vinfo_for_stmt (first)) = NULL;
3507 GROUP_NEXT_ELEMENT (vinfo_for_stmt (first)) = NULL;
3508 first = next;
3511 /* Look for the expression computing loop_arg from loop PHI result. */
3512 if (check_reduction_path (vect_location, loop, as_a <gphi *> (phi), loop_arg,
3513 code))
3514 return def_stmt;
3516 if (dump_enabled_p ())
3518 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3519 "reduction: unknown pattern: ");
3522 return NULL;
3525 /* Wrapper around vect_is_simple_reduction, which will modify code
3526 in-place if it enables detection of more reductions. Arguments
3527 as there. */
3529 gimple *
3530 vect_force_simple_reduction (loop_vec_info loop_info, gimple *phi,
3531 bool *double_reduc,
3532 bool need_wrapping_integral_overflow)
3534 enum vect_reduction_type v_reduc_type;
3535 gimple *def = vect_is_simple_reduction (loop_info, phi, double_reduc,
3536 need_wrapping_integral_overflow,
3537 &v_reduc_type);
3538 if (def)
3540 stmt_vec_info reduc_def_info = vinfo_for_stmt (phi);
3541 STMT_VINFO_REDUC_TYPE (reduc_def_info) = v_reduc_type;
3542 STMT_VINFO_REDUC_DEF (reduc_def_info) = def;
3543 reduc_def_info = vinfo_for_stmt (def);
3544 STMT_VINFO_REDUC_TYPE (reduc_def_info) = v_reduc_type;
3545 STMT_VINFO_REDUC_DEF (reduc_def_info) = phi;
3547 return def;
3550 /* Calculate cost of peeling the loop PEEL_ITERS_PROLOGUE times. */
3551 int
3552 vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue,
3553 int *peel_iters_epilogue,
3554 stmt_vector_for_cost *scalar_cost_vec,
3555 stmt_vector_for_cost *prologue_cost_vec,
3556 stmt_vector_for_cost *epilogue_cost_vec)
3558 int retval = 0;
3559 int assumed_vf = vect_vf_for_cost (loop_vinfo);
3561 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
3563 *peel_iters_epilogue = assumed_vf / 2;
3564 if (dump_enabled_p ())
3565 dump_printf_loc (MSG_NOTE, vect_location,
3566 "cost model: epilogue peel iters set to vf/2 "
3567 "because loop iterations are unknown .\n");
3569 /* If peeled iterations are known but the number of scalar loop
3570 iterations is unknown, count a taken branch per peeled loop. */
3571 retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
3572 NULL, 0, vect_prologue);
3573 retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
3574 NULL, 0, vect_epilogue);
3576 else
3578 int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
3579 peel_iters_prologue = niters < peel_iters_prologue ?
3580 niters : peel_iters_prologue;
3581 *peel_iters_epilogue = (niters - peel_iters_prologue) % assumed_vf;
3582 /* If we need to peel for gaps, but no epilogue peeling would otherwise be
3583 required, we have to peel VF iterations. */
3584 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) && !*peel_iters_epilogue)
3585 *peel_iters_epilogue = assumed_vf;
3588 stmt_info_for_cost *si;
3589 int j;
3590 if (peel_iters_prologue)
3591 FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
3593 stmt_vec_info stmt_info
3594 = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
3595 retval += record_stmt_cost (prologue_cost_vec,
3596 si->count * peel_iters_prologue,
3597 si->kind, stmt_info, si->misalign,
3598 vect_prologue);
3600 if (*peel_iters_epilogue)
3601 FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
3603 stmt_vec_info stmt_info
3604 = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
3605 retval += record_stmt_cost (epilogue_cost_vec,
3606 si->count * *peel_iters_epilogue,
3607 si->kind, stmt_info, si->misalign,
3608 vect_epilogue);
3611 return retval;
3614 /* Function vect_estimate_min_profitable_iters
3616 Return the number of iterations required for the vector version of the
3617 loop to be profitable relative to the cost of the scalar version of the
3618 loop.
3620 *RET_MIN_PROFITABLE_NITERS is a cost model profitability threshold
3621 of iterations for vectorization. A value of -1 means loop vectorization
3622 is not profitable. This returned value may be used for a dynamic
3623 profitability check.
3625 *RET_MIN_PROFITABLE_ESTIMATE is a profitability threshold to be used
3626 for static check against estimated number of iterations. */
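/* Editor's sketch of the comparison behind these thresholds (ignoring
   peeling and rounding): vectorization wins for an iteration count N
   roughly when

     scalar_single_iter_cost * N
       > vec_outside_cost + vec_inside_cost * (N / assumed_vf)

   and *RET_MIN_PROFITABLE_NITERS is the smallest such N derived from the
   costs accumulated below.  */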
3628 static void
3629 vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
3630 int *ret_min_profitable_niters,
3631 int *ret_min_profitable_estimate)
3633 int min_profitable_iters;
3634 int min_profitable_estimate;
3635 int peel_iters_prologue;
3636 int peel_iters_epilogue;
3637 unsigned vec_inside_cost = 0;
3638 int vec_outside_cost = 0;
3639 unsigned vec_prologue_cost = 0;
3640 unsigned vec_epilogue_cost = 0;
3641 int scalar_single_iter_cost = 0;
3642 int scalar_outside_cost = 0;
3643 int assumed_vf = vect_vf_for_cost (loop_vinfo);
3644 int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
3645 void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3647 /* Cost model disabled. */
3648 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
3650 dump_printf_loc (MSG_NOTE, vect_location, "cost model disabled.\n");
3651 *ret_min_profitable_niters = 0;
3652 *ret_min_profitable_estimate = 0;
3653 return;
3656 /* Requires loop versioning tests to handle misalignment. */
3657 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
3659 /* FIXME: Make cost depend on complexity of individual check. */
3660 unsigned len = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ();
3661 (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
3662 vect_prologue);
3663 dump_printf (MSG_NOTE,
3664 "cost model: Adding cost of checks for loop "
3665 "versioning to treat misalignment.\n");
3668 /* Requires loop versioning with alias checks. */
3669 if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
3671 /* FIXME: Make cost depend on complexity of individual check. */
3672 unsigned len = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).length ();
3673 (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
3674 vect_prologue);
3675 len = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).length ();
3676 if (len)
3677 /* Count LEN - 1 ANDs and LEN comparisons. */
3678 (void) add_stmt_cost (target_cost_data, len * 2 - 1, scalar_stmt,
3679 NULL, 0, vect_prologue);
3680 len = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).length ();
3681 if (len)
3683 /* Count LEN - 1 ANDs and LEN comparisons. */
3684 unsigned int nstmts = len * 2 - 1;
3685 /* +1 for each bias that needs adding. */
3686 for (unsigned int i = 0; i < len; ++i)
3687 if (!LOOP_VINFO_LOWER_BOUNDS (loop_vinfo)[i].unsigned_p)
3688 nstmts += 1;
3689 (void) add_stmt_cost (target_cost_data, nstmts, scalar_stmt,
3690 NULL, 0, vect_prologue);
3692 dump_printf (MSG_NOTE,
3693 "cost model: Adding cost of checks for loop "
3694 "versioning aliasing.\n");
3697 /* Requires loop versioning with niter checks. */
3698 if (LOOP_REQUIRES_VERSIONING_FOR_NITERS (loop_vinfo))
3700 /* FIXME: Make cost depend on complexity of individual check. */
3701 (void) add_stmt_cost (target_cost_data, 1, vector_stmt, NULL, 0,
3702 vect_prologue);
3703 dump_printf (MSG_NOTE,
3704 "cost model: Adding cost of checks for loop "
3705 "versioning niters.\n");
3708 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
3709 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken, NULL, 0,
3710 vect_prologue);
3712 /* Count statements in scalar loop. Using this as scalar cost for a single
3713 iteration for now.
3715 TODO: Add outer loop support.
3717 TODO: Consider assigning different costs to different scalar
3718 statements. */
3720 scalar_single_iter_cost
3721 = LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo);
3723 /* Add additional cost for the peeled instructions in prologue and epilogue
3724 loop. (For fully-masked loops there will be no peeling.)
3726 FORNOW: If we don't know the value of peel_iters for prologue or epilogue
3727 at compile-time - we assume it's vf/2 (the worst would be vf-1).
3729 TODO: Build an expression that represents peel_iters for prologue and
3730 epilogue to be used in a run-time test. */
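/* Editor's example: with assumed_vf == 8 and unknown peel counts, the code
   below charges 8 / 2 == 4 scalar iterations to the prologue and 4 to the
   epilogue, plus the guard branches around them.  */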
3732 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
3734 peel_iters_prologue = 0;
3735 peel_iters_epilogue = 0;
3737 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
3739 /* We need to peel exactly one iteration. */
3740 peel_iters_epilogue += 1;
3741 stmt_info_for_cost *si;
3742 int j;
3743 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
3744 j, si)
3746 struct _stmt_vec_info *stmt_info
3747 = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
3748 (void) add_stmt_cost (target_cost_data, si->count,
3749 si->kind, stmt_info, si->misalign,
3750 vect_epilogue);
3754 else if (npeel < 0)
3756 peel_iters_prologue = assumed_vf / 2;
3757 dump_printf (MSG_NOTE, "cost model: "
3758 "prologue peel iters set to vf/2.\n");
3760 /* If peeling for alignment is unknown, the loop bound of the main loop
3761 becomes unknown. */
3762 peel_iters_epilogue = assumed_vf / 2;
3763 dump_printf (MSG_NOTE, "cost model: "
3764 "epilogue peel iters set to vf/2 because "
3765 "peeling for alignment is unknown.\n");
3767 /* If peeled iterations are unknown, count a taken branch and a not taken
3768 branch per peeled loop. Even if scalar loop iterations are known,
3769 vector iterations are not known since peeled prologue iterations are
3770 not known. Hence guards remain the same. */
3771 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
3772 NULL, 0, vect_prologue);
3773 (void) add_stmt_cost (target_cost_data, 1, cond_branch_not_taken,
3774 NULL, 0, vect_prologue);
3775 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
3776 NULL, 0, vect_epilogue);
3777 (void) add_stmt_cost (target_cost_data, 1, cond_branch_not_taken,
3778 NULL, 0, vect_epilogue);
3779 stmt_info_for_cost *si;
3780 int j;
3781 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
3783 struct _stmt_vec_info *stmt_info
3784 = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
3785 (void) add_stmt_cost (target_cost_data,
3786 si->count * peel_iters_prologue,
3787 si->kind, stmt_info, si->misalign,
3788 vect_prologue);
3789 (void) add_stmt_cost (target_cost_data,
3790 si->count * peel_iters_epilogue,
3791 si->kind, stmt_info, si->misalign,
3792 vect_epilogue);
3795 else
3797 stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
3798 stmt_info_for_cost *si;
3799 int j;
3800 void *data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3802 prologue_cost_vec.create (2);
3803 epilogue_cost_vec.create (2);
3804 peel_iters_prologue = npeel;
3806 (void) vect_get_known_peeling_cost (loop_vinfo, peel_iters_prologue,
3807 &peel_iters_epilogue,
3808 &LOOP_VINFO_SCALAR_ITERATION_COST
3809 (loop_vinfo),
3810 &prologue_cost_vec,
3811 &epilogue_cost_vec);
3813 FOR_EACH_VEC_ELT (prologue_cost_vec, j, si)
3815 struct _stmt_vec_info *stmt_info
3816 = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
3817 (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
3818 si->misalign, vect_prologue);
3821 FOR_EACH_VEC_ELT (epilogue_cost_vec, j, si)
3823 struct _stmt_vec_info *stmt_info
3824 = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
3825 (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
3826 si->misalign, vect_epilogue);
3829 prologue_cost_vec.release ();
3830 epilogue_cost_vec.release ();
3833 /* FORNOW: The scalar outside cost is incremented in one of the
3834 following ways:
3836 1. The vectorizer checks for alignment and aliasing and generates
3837 a condition that allows dynamic vectorization. A cost model
3838 check is ANDED with the versioning condition. Hence scalar code
3839 path now has the added cost of the versioning check.
3841 if (cost > th & versioning_check)
3842 jmp to vector code
3844 Hence run-time scalar is incremented by not-taken branch cost.
3846 2. The vectorizer then checks if a prologue is required. If the
3847 cost model check was not done before during versioning, it has to
3848 be done before the prologue check.
3850 if (cost <= th)
3851 prologue = scalar_iters
3852 if (prologue == 0)
3853 jmp to vector code
3854 else
3855 execute prologue
3856 if (prologue == num_iters)
3857 go to exit
3859 Hence the run-time scalar cost is incremented by a taken branch,
3860 plus a not-taken branch, plus a taken branch cost.
3862 3. The vectorizer then checks if an epilogue is required. If the
3863 cost model check was not done before during prologue check, it
3864 has to be done with the epilogue check.
3866 if (prologue == 0)
3867 jmp to vector code
3868 else
3869 execute prologue
3870 if (prologue == num_iters)
3871 go to exit
3872 vector code:
3873 if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
3874 jmp to epilogue
3876 Hence the run-time scalar cost should be incremented by 2 taken
3877 branches.
3879 TODO: The back end may reorder the BBs differently and reverse
3880 conditions/branch directions. Change the estimates below to
3881 something more reasonable. */
3883 /* If the number of iterations is known and we do not do versioning, we can
3884 decide whether to vectorize at compile time. Hence the scalar version
3885 does not carry cost model guard costs. */
3886 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
3887 || LOOP_REQUIRES_VERSIONING (loop_vinfo))
3889 /* Cost model check occurs at versioning. */
3890 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
3891 scalar_outside_cost += vect_get_stmt_cost (cond_branch_not_taken);
3892 else
3894 /* Cost model check occurs at prologue generation. */
3895 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
3896 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken)
3897 + vect_get_stmt_cost (cond_branch_not_taken);
3898 /* Cost model check occurs at epilogue generation. */
3899 else
3900 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken);
3904 /* Complete the target-specific cost calculations. */
3905 finish_cost (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo), &vec_prologue_cost,
3906 &vec_inside_cost, &vec_epilogue_cost);
3908 vec_outside_cost = (int)(vec_prologue_cost + vec_epilogue_cost);
3910 if (dump_enabled_p ())
3912 dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
3913 dump_printf (MSG_NOTE, " Vector inside of loop cost: %d\n",
3914 vec_inside_cost);
3915 dump_printf (MSG_NOTE, " Vector prologue cost: %d\n",
3916 vec_prologue_cost);
3917 dump_printf (MSG_NOTE, " Vector epilogue cost: %d\n",
3918 vec_epilogue_cost);
3919 dump_printf (MSG_NOTE, " Scalar iteration cost: %d\n",
3920 scalar_single_iter_cost);
3921 dump_printf (MSG_NOTE, " Scalar outside cost: %d\n",
3922 scalar_outside_cost);
3923 dump_printf (MSG_NOTE, " Vector outside cost: %d\n",
3924 vec_outside_cost);
3925 dump_printf (MSG_NOTE, " prologue iterations: %d\n",
3926 peel_iters_prologue);
3927 dump_printf (MSG_NOTE, " epilogue iterations: %d\n",
3928 peel_iters_epilogue);
3931 /* Calculate number of iterations required to make the vector version
3932 profitable, relative to the loop bodies only. The following condition
3933 must hold true:
3934 SIC * niters + SOC > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC
3935 where
3936 SIC = scalar iteration cost, VIC = vector iteration cost,
3937 VOC = vector outside cost, VF = vectorization factor,
3938 PL_ITERS = prologue iterations, EP_ITERS = epilogue iterations
3939 SOC = scalar outside cost for run time cost model check. */
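/* Worked example with hypothetical costs: SIC = 4, VIC = 6, VF = 4,
   VOC = 20, SOC = 6 and no peeling. The code below computes
   ((20 - 6) * 4 - 0 - 0) / (4 * 4 - 6) = 5 and then rounds up to 6,
   since at niters = 5 the scalar cost (4 * 5 + 6 = 26) is still below
   the vector cost (6 * 5/4 + 20 = 27.5), so vectorization does not yet
   pay off.  */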
3941 if ((scalar_single_iter_cost * assumed_vf) > (int) vec_inside_cost)
3943 min_profitable_iters = ((vec_outside_cost - scalar_outside_cost)
3944 * assumed_vf
3945 - vec_inside_cost * peel_iters_prologue
3946 - vec_inside_cost * peel_iters_epilogue);
3947 if (min_profitable_iters <= 0)
3948 min_profitable_iters = 0;
3949 else
3951 min_profitable_iters /= ((scalar_single_iter_cost * assumed_vf)
3952 - vec_inside_cost);
3954 if ((scalar_single_iter_cost * assumed_vf * min_profitable_iters)
3955 <= (((int) vec_inside_cost * min_profitable_iters)
3956 + (((int) vec_outside_cost - scalar_outside_cost)
3957 * assumed_vf)))
3958 min_profitable_iters++;
3961 /* vector version will never be profitable. */
3962 else
3964 if (LOOP_VINFO_LOOP (loop_vinfo)->force_vectorize)
3965 warning_at (vect_location, OPT_Wopenmp_simd, "vectorization "
3966 "did not happen for a simd loop");
3968 if (dump_enabled_p ())
3969 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3970 "cost model: the vector iteration cost = %d "
3971 "divided by the scalar iteration cost = %d "
3972 "is greater or equal to the vectorization factor = %d"
3973 ".\n",
3974 vec_inside_cost, scalar_single_iter_cost, assumed_vf);
3975 *ret_min_profitable_niters = -1;
3976 *ret_min_profitable_estimate = -1;
3977 return;
3980 dump_printf (MSG_NOTE,
3981 " Calculated minimum iters for profitability: %d\n",
3982 min_profitable_iters);
3984 if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
3985 && min_profitable_iters < (assumed_vf + peel_iters_prologue))
3986 /* We want the vectorized loop to execute at least once. */
3987 min_profitable_iters = assumed_vf + peel_iters_prologue;
3989 if (dump_enabled_p ())
3990 dump_printf_loc (MSG_NOTE, vect_location,
3991 " Runtime profitability threshold = %d\n",
3992 min_profitable_iters);
3994 *ret_min_profitable_niters = min_profitable_iters;
3996 /* Calculate number of iterations required to make the vector version
3997 profitable, relative to the loop bodies only.
3999 Non-vectorized variant is SIC * niters and it must win over vector
4000 variant on the expected loop trip count. The following condition must hold true:
4001 SIC * niters > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC + SOC */
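/* Solving the condition above for niters (with the same integer
   truncation as the code below) gives
   niters > ((VOC + SOC) * VF - VIC * (PL_ITERS + EP_ITERS))
	    / (SIC * VF - VIC).  */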
4003 if (vec_outside_cost <= 0)
4004 min_profitable_estimate = 0;
4005 else
4007 min_profitable_estimate = ((vec_outside_cost + scalar_outside_cost)
4008 * assumed_vf
4009 - vec_inside_cost * peel_iters_prologue
4010 - vec_inside_cost * peel_iters_epilogue)
4011 / ((scalar_single_iter_cost * assumed_vf)
4012 - vec_inside_cost);
4014 min_profitable_estimate = MAX (min_profitable_estimate, min_profitable_iters);
4015 if (dump_enabled_p ())
4016 dump_printf_loc (MSG_NOTE, vect_location,
4017 " Static estimate profitability threshold = %d\n",
4018 min_profitable_estimate);
4020 *ret_min_profitable_estimate = min_profitable_estimate;
4023 /* Writes into SEL a mask for a vec_perm, equivalent to a vec_shr by OFFSET
4024 vector elements (not bits) for a vector with NELT elements. */
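/* For example, OFFSET = 2 with NELT = 8 encodes the stepped series
   { 2, 3, 4, ... }, i.e. the selector { 2, 3, 4, 5, 6, 7, 8, 9 };
   indices 8 and 9 (>= NELT) select from the second vec_perm operand,
   which is what produces the whole-vector shift effect.  */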
4025 static void
4026 calc_vec_perm_mask_for_shift (unsigned int offset, unsigned int nelt,
4027 vec_perm_builder *sel)
4029 /* The encoding is a single stepped pattern. Any wrap-around is handled
4030 by vec_perm_indices. */
4031 sel->new_vector (nelt, 1, 3);
4032 for (unsigned int i = 0; i < 3; i++)
4033 sel->quick_push (i + offset);
4036 /* Checks whether the target supports whole-vector shifts for vectors of mode
4037 MODE. This is the case if _either_ the platform handles vec_shr_optab, _or_
4038 it supports vec_perm_const with masks for all necessary shift amounts. */
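/* For example, for 8-element vectors the necessary shift amounts are 4, 2
   and 1 elements, matching the successive halving steps of the reduction
   epilogue.  */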
4039 static bool
4040 have_whole_vector_shift (machine_mode mode)
4042 if (optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
4043 return true;
4045 /* Variable-length vectors should be handled via the optab. */
4046 unsigned int nelt;
4047 if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
4048 return false;
4050 vec_perm_builder sel;
4051 vec_perm_indices indices;
4052 for (unsigned int i = nelt / 2; i >= 1; i /= 2)
4054 calc_vec_perm_mask_for_shift (i, nelt, &sel);
4055 indices.new_vector (sel, 2, nelt);
4056 if (!can_vec_perm_const_p (mode, indices, false))
4057 return false;
4059 return true;
4062 /* TODO: Close dependency between vect_model_*_cost and vectorizable_*
4063 functions. Design better to avoid maintenance issues. */
4065 /* Function vect_model_reduction_cost.
4067 Models cost for a reduction operation, including the vector ops
4068 generated within the strip-mine loop, the initial definition before
4069 the loop, and the epilogue code that must be generated. */
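/* For instance, a plain sum reduction with a direct REDUC_FN and
   NCOPIES == 1 (not nested in an outer loop) costs one scalar_to_vec in
   the prologue, one vector_stmt in the body, and one vector_stmt plus
   one vec_to_scalar in the epilogue.  */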
4071 static void
4072 vect_model_reduction_cost (stmt_vec_info stmt_info, internal_fn reduc_fn,
4073 int ncopies)
4075 int prologue_cost = 0, epilogue_cost = 0, inside_cost;
4076 enum tree_code code;
4077 optab optab;
4078 tree vectype;
4079 gimple *orig_stmt;
4080 machine_mode mode;
4081 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4082 struct loop *loop = NULL;
4083 void *target_cost_data;
4085 if (loop_vinfo)
4087 loop = LOOP_VINFO_LOOP (loop_vinfo);
4088 target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
4090 else
4091 target_cost_data = BB_VINFO_TARGET_COST_DATA (STMT_VINFO_BB_VINFO (stmt_info));
4093 /* Condition reductions generate two reductions in the loop. */
4094 vect_reduction_type reduction_type
4095 = STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info);
4096 if (reduction_type == COND_REDUCTION)
4097 ncopies *= 2;
4099 vectype = STMT_VINFO_VECTYPE (stmt_info);
4100 mode = TYPE_MODE (vectype);
4101 orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
4103 if (!orig_stmt)
4104 orig_stmt = STMT_VINFO_STMT (stmt_info);
4106 code = gimple_assign_rhs_code (orig_stmt);
4108 if (reduction_type == EXTRACT_LAST_REDUCTION
4109 || reduction_type == FOLD_LEFT_REDUCTION)
4111 /* No extra instructions needed in the prologue. */
4112 prologue_cost = 0;
4114 if (reduction_type == EXTRACT_LAST_REDUCTION || reduc_fn != IFN_LAST)
4115 /* Count one reduction-like operation per vector. */
4116 inside_cost = add_stmt_cost (target_cost_data, ncopies, vec_to_scalar,
4117 stmt_info, 0, vect_body);
4118 else
4120 /* Use NELEMENTS extracts and NELEMENTS scalar ops. */
4121 unsigned int nelements = ncopies * vect_nunits_for_cost (vectype);
4122 inside_cost = add_stmt_cost (target_cost_data, nelements,
4123 vec_to_scalar, stmt_info, 0,
4124 vect_body);
4125 inside_cost += add_stmt_cost (target_cost_data, nelements,
4126 scalar_stmt, stmt_info, 0,
4127 vect_body);
4130 else
4132 /* Add in cost for initial definition.
4133 For cond reduction we have four vectors: initial index, step,
4134 initial result of the data reduction, initial value of the index
4135 reduction. */
4136 int prologue_stmts = reduction_type == COND_REDUCTION ? 4 : 1;
4137 prologue_cost += add_stmt_cost (target_cost_data, prologue_stmts,
4138 scalar_to_vec, stmt_info, 0,
4139 vect_prologue);
4141 /* Cost of reduction op inside loop. */
4142 inside_cost = add_stmt_cost (target_cost_data, ncopies, vector_stmt,
4143 stmt_info, 0, vect_body);
4146 /* Determine cost of epilogue code.
4148 We have a reduction operator that will reduce the vector in one statement.
4149 Also requires scalar extract. */
4151 if (!loop || !nested_in_vect_loop_p (loop, orig_stmt))
4153 if (reduc_fn != IFN_LAST)
4155 if (reduction_type == COND_REDUCTION)
4157 /* An EQ stmt and a COND_EXPR stmt. */
4158 epilogue_cost += add_stmt_cost (target_cost_data, 2,
4159 vector_stmt, stmt_info, 0,
4160 vect_epilogue);
4161 /* Reduction of the max index and a reduction of the found
4162 values. */
4163 epilogue_cost += add_stmt_cost (target_cost_data, 2,
4164 vec_to_scalar, stmt_info, 0,
4165 vect_epilogue);
4166 /* A broadcast of the max value. */
4167 epilogue_cost += add_stmt_cost (target_cost_data, 1,
4168 scalar_to_vec, stmt_info, 0,
4169 vect_epilogue);
4171 else
4173 epilogue_cost += add_stmt_cost (target_cost_data, 1, vector_stmt,
4174 stmt_info, 0, vect_epilogue);
4175 epilogue_cost += add_stmt_cost (target_cost_data, 1,
4176 vec_to_scalar, stmt_info, 0,
4177 vect_epilogue);
4180 else if (reduction_type == COND_REDUCTION)
4182 unsigned estimated_nunits = vect_nunits_for_cost (vectype);
4183 /* Extraction of scalar elements. */
4184 epilogue_cost += add_stmt_cost (target_cost_data,
4185 2 * estimated_nunits,
4186 vec_to_scalar, stmt_info, 0,
4187 vect_epilogue);
4188 /* Scalar max reductions via COND_EXPR / MAX_EXPR. */
4189 epilogue_cost += add_stmt_cost (target_cost_data,
4190 2 * estimated_nunits - 3,
4191 scalar_stmt, stmt_info, 0,
4192 vect_epilogue);
4194 else if (reduction_type == EXTRACT_LAST_REDUCTION
4195 || reduction_type == FOLD_LEFT_REDUCTION)
4196 /* No extra instructions needed in the epilogue. */
4198 else
4200 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
4201 tree bitsize =
4202 TYPE_SIZE (TREE_TYPE (gimple_assign_lhs (orig_stmt)));
4203 int element_bitsize = tree_to_uhwi (bitsize);
4204 int nelements = vec_size_in_bits / element_bitsize;
4206 if (code == COND_EXPR)
4207 code = MAX_EXPR;
4209 optab = optab_for_tree_code (code, vectype, optab_default);
4211 /* We have a whole vector shift available. */
4212 if (optab != unknown_optab
4213 && VECTOR_MODE_P (mode)
4214 && optab_handler (optab, mode) != CODE_FOR_nothing
4215 && have_whole_vector_shift (mode))
4217 /* Final reduction via vector shifts and the reduction operator.
4218 Also requires scalar extract. */
4219 epilogue_cost += add_stmt_cost (target_cost_data,
4220 exact_log2 (nelements) * 2,
4221 vector_stmt, stmt_info, 0,
4222 vect_epilogue);
4223 epilogue_cost += add_stmt_cost (target_cost_data, 1,
4224 vec_to_scalar, stmt_info, 0,
4225 vect_epilogue);
4227 else
4228 /* Use extracts and reduction op for final reduction. For N
4229 elements, we have N extracts and N-1 reduction ops. */
4230 epilogue_cost += add_stmt_cost (target_cost_data,
4231 nelements + nelements - 1,
4232 vector_stmt, stmt_info, 0,
4233 vect_epilogue);
4237 if (dump_enabled_p ())
4238 dump_printf (MSG_NOTE,
4239 "vect_model_reduction_cost: inside_cost = %d, "
4240 "prologue_cost = %d, epilogue_cost = %d .\n", inside_cost,
4241 prologue_cost, epilogue_cost);
4245 /* Function vect_model_induction_cost.
4247 Models cost for induction operations. */
4249 static void
4250 vect_model_induction_cost (stmt_vec_info stmt_info, int ncopies)
4252 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4253 void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
4254 unsigned inside_cost, prologue_cost;
4256 if (PURE_SLP_STMT (stmt_info))
4257 return;
4259 /* loop cost for vec_loop. */
4260 inside_cost = add_stmt_cost (target_cost_data, ncopies, vector_stmt,
4261 stmt_info, 0, vect_body);
4263 /* prologue cost for vec_init and vec_step. */
4264 prologue_cost = add_stmt_cost (target_cost_data, 2, scalar_to_vec,
4265 stmt_info, 0, vect_prologue);
4267 if (dump_enabled_p ())
4268 dump_printf_loc (MSG_NOTE, vect_location,
4269 "vect_model_induction_cost: inside_cost = %d, "
4270 "prologue_cost = %d .\n", inside_cost, prologue_cost);
4275 /* Function get_initial_def_for_reduction
4277 Input:
4278 STMT - a stmt that performs a reduction operation in the loop.
4279 INIT_VAL - the initial value of the reduction variable
4281 Output:
4282 ADJUSTMENT_DEF - a tree that holds a value to be added to the final result
4283 of the reduction (used for adjusting the epilog - see below).
4284 Return a vector variable, initialized according to the operation that STMT
4285 performs. This vector will be used as the initial value of the
4286 vector of partial results.
4288 Option1 (adjust in epilog): Initialize the vector as follows:
4289 add/bit or/xor: [0,0,...,0,0]
4290 mult/bit and: [1,1,...,1,1]
4291 min/max/cond_expr: [init_val,init_val,..,init_val,init_val]
4292 and when necessary (e.g. add/mult case) let the caller know
4293 that it needs to adjust the result by init_val.
4295 Option2: Initialize the vector as follows:
4296 add/bit or/xor: [init_val,0,0,...,0]
4297 mult/bit and: [init_val,1,1,...,1]
4298 min/max/cond_expr: [init_val,init_val,...,init_val]
4299 and no adjustments are needed.
4301 For example, for the following code:
4303 s = init_val;
4304 for (i=0;i<n;i++)
4305 s = s + a[i];
4307 STMT is 's = s + a[i]', and the reduction variable is 's'.
4308 For a vector of 4 units, we want to return either [0,0,0,init_val],
4309 or [0,0,0,0] and let the caller know that it needs to adjust
4310 the result at the end by 'init_val'.
4312 FORNOW, we use Option1 (the 'adjust in epilog' scheme) when ADJUSTMENT_DEF
4313 is not NULL, because its initialization vector is simpler (same element in
4314 all entries), and Option2 otherwise.
4316 A cost model should help decide between these two schemes. */
4318 tree
4319 get_initial_def_for_reduction (gimple *stmt, tree init_val,
4320 tree *adjustment_def)
4322 stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
4323 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
4324 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4325 tree scalar_type = TREE_TYPE (init_val);
4326 tree vectype = get_vectype_for_scalar_type (scalar_type);
4327 enum tree_code code = gimple_assign_rhs_code (stmt);
4328 tree def_for_init;
4329 tree init_def;
4330 bool nested_in_vect_loop = false;
4331 REAL_VALUE_TYPE real_init_val = dconst0;
4332 int int_init_val = 0;
4333 gimple *def_stmt = NULL;
4334 gimple_seq stmts = NULL;
4336 gcc_assert (vectype);
4338 gcc_assert (POINTER_TYPE_P (scalar_type) || INTEGRAL_TYPE_P (scalar_type)
4339 || SCALAR_FLOAT_TYPE_P (scalar_type));
4341 if (nested_in_vect_loop_p (loop, stmt))
4342 nested_in_vect_loop = true;
4343 else
4344 gcc_assert (loop == (gimple_bb (stmt))->loop_father);
4346 /* In case of double reduction we only create a vector variable to be put
4347 in the reduction phi node. The actual statement creation is done in
4348 vect_create_epilog_for_reduction. */
4349 if (adjustment_def && nested_in_vect_loop
4350 && TREE_CODE (init_val) == SSA_NAME
4351 && (def_stmt = SSA_NAME_DEF_STMT (init_val))
4352 && gimple_code (def_stmt) == GIMPLE_PHI
4353 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
4354 && vinfo_for_stmt (def_stmt)
4355 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
4356 == vect_double_reduction_def)
4358 *adjustment_def = NULL;
4359 return vect_create_destination_var (init_val, vectype);
4362 vect_reduction_type reduction_type
4363 = STMT_VINFO_VEC_REDUCTION_TYPE (stmt_vinfo);
4365 /* In case of a nested reduction do not use an adjustment def, as
4366 the epilogue generation does not handle that case correctly
4367 when ncopies is not one. */
4368 if (adjustment_def && nested_in_vect_loop)
4370 *adjustment_def = NULL;
4371 return vect_get_vec_def_for_operand (init_val, stmt);
4374 switch (code)
4376 case WIDEN_SUM_EXPR:
4377 case DOT_PROD_EXPR:
4378 case SAD_EXPR:
4379 case PLUS_EXPR:
4380 case MINUS_EXPR:
4381 case BIT_IOR_EXPR:
4382 case BIT_XOR_EXPR:
4383 case MULT_EXPR:
4384 case BIT_AND_EXPR:
4386 /* ADJUSTMENT_DEF is NULL when called from
4387 vect_create_epilog_for_reduction to vectorize double reduction. */
4388 if (adjustment_def)
4389 *adjustment_def = init_val;
4391 if (code == MULT_EXPR)
4393 real_init_val = dconst1;
4394 int_init_val = 1;
4397 if (code == BIT_AND_EXPR)
4398 int_init_val = -1;
4400 if (SCALAR_FLOAT_TYPE_P (scalar_type))
4401 def_for_init = build_real (scalar_type, real_init_val);
4402 else
4403 def_for_init = build_int_cst (scalar_type, int_init_val);
4405 if (adjustment_def)
4406 /* Option1: the first element is '0' or '1' as well. */
4407 init_def = gimple_build_vector_from_val (&stmts, vectype,
4408 def_for_init);
4409 else if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant ())
4411 /* Option2 (variable length): the first element is INIT_VAL. */
4412 init_def = build_vector_from_val (vectype, def_for_init);
4413 gcall *call = gimple_build_call_internal (IFN_VEC_SHL_INSERT,
4414 2, init_def, init_val);
4415 init_def = make_ssa_name (vectype);
4416 gimple_call_set_lhs (call, init_def);
4417 gimple_seq_add_stmt (&stmts, call);
4419 else
4421 /* Option2: the first element is INIT_VAL. */
4422 tree_vector_builder elts (vectype, 1, 2);
4423 elts.quick_push (init_val);
4424 elts.quick_push (def_for_init);
4425 init_def = gimple_build_vector (&stmts, &elts);
4428 break;
4430 case MIN_EXPR:
4431 case MAX_EXPR:
4432 case COND_EXPR:
4434 if (adjustment_def)
4436 *adjustment_def = NULL_TREE;
4437 if (reduction_type != COND_REDUCTION
4438 && reduction_type != EXTRACT_LAST_REDUCTION)
4440 init_def = vect_get_vec_def_for_operand (init_val, stmt);
4441 break;
4444 init_val = gimple_convert (&stmts, TREE_TYPE (vectype), init_val);
4445 init_def = gimple_build_vector_from_val (&stmts, vectype, init_val);
4447 break;
4449 default:
4450 gcc_unreachable ();
4453 if (stmts)
4454 gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), stmts);
4455 return init_def;
4458 /* Get at the initial defs for the reduction PHIs in SLP_NODE.
4459 NUMBER_OF_VECTORS is the number of vector defs to create.
4460 If NEUTRAL_OP is nonnull, introducing extra elements of that
4461 value will not change the result. */
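/* For example, 0 is a neutral value for PLUS_EXPR, BIT_IOR_EXPR and
   BIT_XOR_EXPR, 1 for MULT_EXPR and all-ones for BIT_AND_EXPR; MIN_EXPR
   and MAX_EXPR have no universal neutral value, so for them NEUTRAL_OP
   may be null and the initial scalar values are used directly.  */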
4463 static void
4464 get_initial_defs_for_reduction (slp_tree slp_node,
4465 vec<tree> *vec_oprnds,
4466 unsigned int number_of_vectors,
4467 bool reduc_chain, tree neutral_op)
4469 vec<gimple *> stmts = SLP_TREE_SCALAR_STMTS (slp_node);
4470 gimple *stmt = stmts[0];
4471 stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
4472 unsigned HOST_WIDE_INT nunits;
4473 unsigned j, number_of_places_left_in_vector;
4474 tree vector_type;
4475 tree vop;
4476 int group_size = stmts.length ();
4477 unsigned int vec_num, i;
4478 unsigned number_of_copies = 1;
4479 vec<tree> voprnds;
4480 voprnds.create (number_of_vectors);
4481 struct loop *loop;
4482 auto_vec<tree, 16> permute_results;
4484 vector_type = STMT_VINFO_VECTYPE (stmt_vinfo);
4486 gcc_assert (STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_reduction_def);
4488 loop = (gimple_bb (stmt))->loop_father;
4489 gcc_assert (loop);
4490 edge pe = loop_preheader_edge (loop);
4492 gcc_assert (!reduc_chain || neutral_op);
4494 /* NUMBER_OF_COPIES is the number of times we need to use the same values in
4495 created vectors. It is greater than 1 if unrolling is performed.
4497 For example, we have two scalar operands, s1 and s2 (e.g., group of
4498 strided accesses of size two), while NUNITS is four (i.e., four scalars
4499 of this type can be packed in a vector). The output vector will contain
4500 two copies of each scalar operand: {s1, s2, s1, s2}. (NUMBER_OF_COPIES
4501 will be 2).
4503 If GROUP_SIZE > NUNITS, the scalars will be split into several vectors
4504 containing the operands.
4506 For example, NUNITS is four as before, and the group size is 8
4507 (s1, s2, ..., s8). We will create two vectors {s1, s2, s3, s4} and
4508 {s5, s6, s7, s8}. */
4510 if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits))
4511 nunits = group_size;
4513 number_of_copies = nunits * number_of_vectors / group_size;
4515 number_of_places_left_in_vector = nunits;
4516 bool constant_p = true;
4517 tree_vector_builder elts (vector_type, nunits, 1);
4518 elts.quick_grow (nunits);
4519 for (j = 0; j < number_of_copies; j++)
4521 for (i = group_size - 1; stmts.iterate (i, &stmt); i--)
4523 tree op;
4524 /* Get the def before the loop. In reduction chain we have only
4525 one initial value. */
4526 if ((j != (number_of_copies - 1)
4527 || (reduc_chain && i != 0))
4528 && neutral_op)
4529 op = neutral_op;
4530 else
4531 op = PHI_ARG_DEF_FROM_EDGE (stmt, pe);
4533 /* Create 'vect_ = {op0,op1,...,opn}'. */
4534 number_of_places_left_in_vector--;
4535 elts[number_of_places_left_in_vector] = op;
4536 if (!CONSTANT_CLASS_P (op))
4537 constant_p = false;
4539 if (number_of_places_left_in_vector == 0)
4541 gimple_seq ctor_seq = NULL;
4542 tree init;
4543 if (constant_p && !neutral_op
4544 ? multiple_p (TYPE_VECTOR_SUBPARTS (vector_type), nunits)
4545 : known_eq (TYPE_VECTOR_SUBPARTS (vector_type), nunits))
4546 /* Build the vector directly from ELTS. */
4547 init = gimple_build_vector (&ctor_seq, &elts);
4548 else if (neutral_op)
4550 /* Build a vector of the neutral value and shift the
4551 other elements into place. */
4552 init = gimple_build_vector_from_val (&ctor_seq, vector_type,
4553 neutral_op);
4554 int k = nunits;
4555 while (k > 0 && elts[k - 1] == neutral_op)
4556 k -= 1;
4557 while (k > 0)
4559 k -= 1;
4560 gcall *call = gimple_build_call_internal
4561 (IFN_VEC_SHL_INSERT, 2, init, elts[k]);
4562 init = make_ssa_name (vector_type);
4563 gimple_call_set_lhs (call, init);
4564 gimple_seq_add_stmt (&ctor_seq, call);
4567 else
4569 /* First time round, duplicate ELTS to fill the
4570 required number of vectors, then cherry pick the
4571 appropriate result for each iteration. */
4572 if (vec_oprnds->is_empty ())
4573 duplicate_and_interleave (&ctor_seq, vector_type, elts,
4574 number_of_vectors,
4575 permute_results);
4576 init = permute_results[number_of_vectors - j - 1];
4578 if (ctor_seq != NULL)
4579 gsi_insert_seq_on_edge_immediate (pe, ctor_seq);
4580 voprnds.quick_push (init);
4582 number_of_places_left_in_vector = nunits;
4583 elts.new_vector (vector_type, nunits, 1);
4584 elts.quick_grow (nunits);
4585 constant_p = true;
4590 /* Since the vectors are created in the reverse order, reverse them
4591 here. */
4592 vec_num = voprnds.length ();
4593 for (j = vec_num; j != 0; j--)
4595 vop = voprnds[j - 1];
4596 vec_oprnds->quick_push (vop);
4599 voprnds.release ();
4601 /* In case that VF is greater than the unrolling factor needed for the SLP
4602 group of stmts, NUMBER_OF_VECTORS to be created is greater than
4603 NUMBER_OF_SCALARS/NUNITS or NUNITS/NUMBER_OF_SCALARS, and hence we have
4604 to replicate the vectors. */
4605 tree neutral_vec = NULL;
4606 while (number_of_vectors > vec_oprnds->length ())
4608 if (neutral_op)
4610 if (!neutral_vec)
4612 gimple_seq ctor_seq = NULL;
4613 neutral_vec = gimple_build_vector_from_val
4614 (&ctor_seq, vector_type, neutral_op);
4615 if (ctor_seq != NULL)
4616 gsi_insert_seq_on_edge_immediate (pe, ctor_seq);
4618 vec_oprnds->quick_push (neutral_vec);
4620 else
4622 for (i = 0; vec_oprnds->iterate (i, &vop) && i < vec_num; i++)
4623 vec_oprnds->quick_push (vop);
4629 /* Function vect_create_epilog_for_reduction
4631 Create code at the loop-epilog to finalize the result of a reduction
4632 computation.
4634 VECT_DEFS is list of vector of partial results, i.e., the lhs's of vector
4635 reduction statements.
4636 STMT is the scalar reduction stmt that is being vectorized.
4637 NCOPIES is > 1 in case the vectorization factor (VF) is bigger than the
4638 number of elements that we can fit in a vectype (nunits). In this case
4639 we have to generate more than one vector stmt - i.e - we need to "unroll"
4640 the vector stmt by a factor VF/nunits. For more details see documentation
4641 in vectorizable_operation.
4642 REDUC_FN is the internal function for the epilog reduction.
4643 REDUCTION_PHIS is a list of the phi-nodes that carry the reduction
4644 computation.
4645 REDUC_INDEX is the index of the operand in the right hand side of the
4646 statement that is defined by REDUCTION_PHI.
4647 DOUBLE_REDUC is TRUE if double reduction phi nodes should be handled.
4648 SLP_NODE is an SLP node containing a group of reduction statements. The
4649 first one in this group is STMT.
4650 INDUC_VAL is for INTEGER_INDUC_COND_REDUCTION the value to use for the case
4651 when the COND_EXPR is never true in the loop. For MAX_EXPR, it needs to
4652 be smaller than any value of the IV in the loop, for MIN_EXPR larger than
4653 any value of the IV in the loop.
4654 INDUC_CODE is the code for epilog reduction if INTEGER_INDUC_COND_REDUCTION.
4655 NEUTRAL_OP is the value given by neutral_op_for_slp_reduction; it is
4656 null if this is not an SLP reduction
4658 This function:
4659 1. Creates the reduction def-use cycles: sets the arguments for
4660 REDUCTION_PHIS:
4661 The loop-entry argument is the vectorized initial-value of the reduction.
4662 The loop-latch argument is taken from VECT_DEFS - the vector of partial
4663 sums.
4664 2. "Reduces" each vector of partial results VECT_DEFS into a single result,
4665 by calling the function specified by REDUC_FN if available, or by
4666 other means (whole-vector shifts or a scalar loop).
4667 The function also creates a new phi node at the loop exit to preserve
4668 loop-closed form, as illustrated below.
4670 The flow at the entry to this function:
4672 loop:
4673 vec_def = phi <null, null> # REDUCTION_PHI
4674 VECT_DEF = vector_stmt # vectorized form of STMT
4675 s_loop = scalar_stmt # (scalar) STMT
4676 loop_exit:
4677 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
4678 use <s_out0>
4679 use <s_out0>
4681 The above is transformed by this function into:
4683 loop:
4684 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
4685 VECT_DEF = vector_stmt # vectorized form of STMT
4686 s_loop = scalar_stmt # (scalar) STMT
4687 loop_exit:
4688 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
4689 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
4690 v_out2 = reduce <v_out1>
4691 s_out3 = extract_field <v_out2, 0>
4692 s_out4 = adjust_result <s_out3>
4693 use <s_out4>
4694 use <s_out4>
4697 static void
4698 vect_create_epilog_for_reduction (vec<tree> vect_defs, gimple *stmt,
4699 gimple *reduc_def_stmt,
4700 int ncopies, internal_fn reduc_fn,
4701 vec<gimple *> reduction_phis,
4702 bool double_reduc,
4703 slp_tree slp_node,
4704 slp_instance slp_node_instance,
4705 tree induc_val, enum tree_code induc_code,
4706 tree neutral_op)
4708 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4709 stmt_vec_info prev_phi_info;
4710 tree vectype;
4711 machine_mode mode;
4712 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4713 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *outer_loop = NULL;
4714 basic_block exit_bb;
4715 tree scalar_dest;
4716 tree scalar_type;
4717 gimple *new_phi = NULL, *phi;
4718 gimple_stmt_iterator exit_gsi;
4719 tree vec_dest;
4720 tree new_temp = NULL_TREE, new_dest, new_name, new_scalar_dest;
4721 gimple *epilog_stmt = NULL;
4722 enum tree_code code = gimple_assign_rhs_code (stmt);
4723 gimple *exit_phi;
4724 tree bitsize;
4725 tree adjustment_def = NULL;
4726 tree vec_initial_def = NULL;
4727 tree expr, def, initial_def = NULL;
4728 tree orig_name, scalar_result;
4729 imm_use_iterator imm_iter, phi_imm_iter;
4730 use_operand_p use_p, phi_use_p;
4731 gimple *use_stmt, *orig_stmt, *reduction_phi = NULL;
4732 bool nested_in_vect_loop = false;
4733 auto_vec<gimple *> new_phis;
4734 auto_vec<gimple *> inner_phis;
4735 enum vect_def_type dt = vect_unknown_def_type;
4736 int j, i;
4737 auto_vec<tree> scalar_results;
4738 unsigned int group_size = 1, k, ratio;
4739 auto_vec<tree> vec_initial_defs;
4740 auto_vec<gimple *> phis;
4741 bool slp_reduc = false;
4742 bool direct_slp_reduc;
4743 tree new_phi_result;
4744 gimple *inner_phi = NULL;
4745 tree induction_index = NULL_TREE;
4747 if (slp_node)
4748 group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
4750 if (nested_in_vect_loop_p (loop, stmt))
4752 outer_loop = loop;
4753 loop = loop->inner;
4754 nested_in_vect_loop = true;
4755 gcc_assert (!slp_node);
4758 vectype = STMT_VINFO_VECTYPE (stmt_info);
4759 gcc_assert (vectype);
4760 mode = TYPE_MODE (vectype);
4762 /* 1. Create the reduction def-use cycle:
4763 Set the arguments of REDUCTION_PHIS, i.e., transform
4765 loop:
4766 vec_def = phi <null, null> # REDUCTION_PHI
4767 VECT_DEF = vector_stmt # vectorized form of STMT
4770 into:
4772 loop:
4773 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
4774 VECT_DEF = vector_stmt # vectorized form of STMT
4777 (in case of SLP, do it for all the phis). */
4779 /* Get the loop-entry arguments. */
4780 enum vect_def_type initial_def_dt = vect_unknown_def_type;
4781 if (slp_node)
4783 unsigned vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
4784 vec_initial_defs.reserve (vec_num);
4785 get_initial_defs_for_reduction (slp_node_instance->reduc_phis,
4786 &vec_initial_defs, vec_num,
4787 GROUP_FIRST_ELEMENT (stmt_info),
4788 neutral_op);
4790 else
4792 /* Get at the scalar def before the loop, that defines the initial value
4793 of the reduction variable. */
4794 gimple *def_stmt;
4795 initial_def = PHI_ARG_DEF_FROM_EDGE (reduc_def_stmt,
4796 loop_preheader_edge (loop));
4797 /* Optimize: if initial_def is for REDUC_MAX smaller than the base
4798 and we can't use zero for induc_val, use initial_def. Similarly
4799 for REDUC_MIN and initial_def larger than the base. */
4800 if (TREE_CODE (initial_def) == INTEGER_CST
4801 && (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
4802 == INTEGER_INDUC_COND_REDUCTION)
4803 && !integer_zerop (induc_val)
4804 && ((induc_code == MAX_EXPR
4805 && tree_int_cst_lt (initial_def, induc_val))
4806 || (induc_code == MIN_EXPR
4807 && tree_int_cst_lt (induc_val, initial_def))))
4808 induc_val = initial_def;
4809 vect_is_simple_use (initial_def, loop_vinfo, &def_stmt, &initial_def_dt);
4810 vec_initial_def = get_initial_def_for_reduction (stmt, initial_def,
4811 &adjustment_def);
4812 vec_initial_defs.create (1);
4813 vec_initial_defs.quick_push (vec_initial_def);
4816 /* Set phi nodes arguments. */
4817 FOR_EACH_VEC_ELT (reduction_phis, i, phi)
4819 tree vec_init_def = vec_initial_defs[i];
4820 tree def = vect_defs[i];
4821 for (j = 0; j < ncopies; j++)
4823 if (j != 0)
4825 phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi));
4826 if (nested_in_vect_loop)
4827 vec_init_def
4828 = vect_get_vec_def_for_stmt_copy (initial_def_dt,
4829 vec_init_def);
4832 /* Set the loop-entry arg of the reduction-phi. */
4834 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
4835 == INTEGER_INDUC_COND_REDUCTION)
4837 /* Initialise the reduction phi to zero. This prevents non-zero
4838 initial values from interfering with the reduction op. */
4839 gcc_assert (ncopies == 1);
4840 gcc_assert (i == 0);
4842 tree vec_init_def_type = TREE_TYPE (vec_init_def);
4843 tree induc_val_vec
4844 = build_vector_from_val (vec_init_def_type, induc_val);
4846 add_phi_arg (as_a <gphi *> (phi), induc_val_vec,
4847 loop_preheader_edge (loop), UNKNOWN_LOCATION);
4849 else
4850 add_phi_arg (as_a <gphi *> (phi), vec_init_def,
4851 loop_preheader_edge (loop), UNKNOWN_LOCATION);
4853 /* Set the loop-latch arg for the reduction-phi. */
4854 if (j > 0)
4855 def = vect_get_vec_def_for_stmt_copy (vect_unknown_def_type, def);
4857 add_phi_arg (as_a <gphi *> (phi), def, loop_latch_edge (loop),
4858 UNKNOWN_LOCATION);
4860 if (dump_enabled_p ())
4862 dump_printf_loc (MSG_NOTE, vect_location,
4863 "transform reduction: created def-use cycle: ");
4864 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
4865 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, SSA_NAME_DEF_STMT (def), 0);
4870 /* For cond reductions we want to create a new vector (INDEX_COND_EXPR)
4871 which is updated with the current index of the loop for every match of
4872 the original loop's cond_expr (VEC_STMT). This results in a vector
4873 containing the last time the condition passed for that vector lane.
4874 The first match will be a 1 to allow 0 to be used for non-matching
4875 indexes. If there are no matches at all then the vector will be all
4876 zeroes. */
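/* Illustrative example with a 4-element vector: the IV starts at
   { 1, 2, 3, 4 } and steps by 4 each vector iteration. If the condition
   matches in lanes 1 and 3 of the first iteration and only in lane 1 of
   the second, the vector becomes { 0, 2, 0, 4 } and then { 0, 6, 0, 4 }:
   each lane holds the 1-based position of its last match, or 0 if it
   never matched.  */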
4877 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION)
4879 tree indx_before_incr, indx_after_incr;
4880 poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype);
4882 gimple *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info);
4883 gcc_assert (gimple_assign_rhs_code (vec_stmt) == VEC_COND_EXPR);
4885 int scalar_precision
4886 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (TREE_TYPE (vectype)));
4887 tree cr_index_scalar_type = make_unsigned_type (scalar_precision);
4888 tree cr_index_vector_type = build_vector_type
4889 (cr_index_scalar_type, TYPE_VECTOR_SUBPARTS (vectype));
4891 /* First we create a simple vector induction variable which starts
4892 with the values {1,2,3,...} (SERIES_VECT) and increments by the
4893 vector size (STEP). */
4895 /* Create a {1,2,3,...} vector. */
4896 tree series_vect = build_index_vector (cr_index_vector_type, 1, 1);
4898 /* Create a vector of the step value. */
4899 tree step = build_int_cst (cr_index_scalar_type, nunits_out);
4900 tree vec_step = build_vector_from_val (cr_index_vector_type, step);
4902 /* Create an induction variable. */
4903 gimple_stmt_iterator incr_gsi;
4904 bool insert_after;
4905 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
4906 create_iv (series_vect, vec_step, NULL_TREE, loop, &incr_gsi,
4907 insert_after, &indx_before_incr, &indx_after_incr);
4909 /* Next create a new phi node vector (NEW_PHI_TREE) which starts
4910 filled with zeros (VEC_ZERO). */
4912 /* Create a vector of 0s. */
4913 tree zero = build_zero_cst (cr_index_scalar_type);
4914 tree vec_zero = build_vector_from_val (cr_index_vector_type, zero);
4916 /* Create a vector phi node. */
4917 tree new_phi_tree = make_ssa_name (cr_index_vector_type);
4918 new_phi = create_phi_node (new_phi_tree, loop->header);
4919 set_vinfo_for_stmt (new_phi,
4920 new_stmt_vec_info (new_phi, loop_vinfo));
4921 add_phi_arg (as_a <gphi *> (new_phi), vec_zero,
4922 loop_preheader_edge (loop), UNKNOWN_LOCATION);
4924 /* Now take the condition from the loops original cond_expr
4925 (VEC_STMT) and produce a new cond_expr (INDEX_COND_EXPR) which for
4926 every match uses values from the induction variable
4927 (INDEX_BEFORE_INCR) otherwise uses values from the phi node
4928 (NEW_PHI_TREE).
4929 Finally, we update the phi (NEW_PHI_TREE) to take the value of
4930 the new cond_expr (INDEX_COND_EXPR). */
4932 /* Duplicate the condition from vec_stmt. */
4933 tree ccompare = unshare_expr (gimple_assign_rhs1 (vec_stmt));
4935 /* Create a conditional, where the condition is taken from vec_stmt
4936 (CCOMPARE), then is the induction index (INDEX_BEFORE_INCR) and
4937 else is the phi (NEW_PHI_TREE). */
4938 tree index_cond_expr = build3 (VEC_COND_EXPR, cr_index_vector_type,
4939 ccompare, indx_before_incr,
4940 new_phi_tree);
4941 induction_index = make_ssa_name (cr_index_vector_type);
4942 gimple *index_condition = gimple_build_assign (induction_index,
4943 index_cond_expr);
4944 gsi_insert_before (&incr_gsi, index_condition, GSI_SAME_STMT);
4945 stmt_vec_info index_vec_info = new_stmt_vec_info (index_condition,
4946 loop_vinfo);
4947 STMT_VINFO_VECTYPE (index_vec_info) = cr_index_vector_type;
4948 set_vinfo_for_stmt (index_condition, index_vec_info);
4950 /* Update the phi with the vec cond. */
4951 add_phi_arg (as_a <gphi *> (new_phi), induction_index,
4952 loop_latch_edge (loop), UNKNOWN_LOCATION);
4955 /* 2. Create epilog code.
4956 The reduction epilog code operates across the elements of the vector
4957 of partial results computed by the vectorized loop.
4958 The reduction epilog code consists of:
4960 step 1: compute the scalar result in a vector (v_out2)
4961 step 2: extract the scalar result (s_out3) from the vector (v_out2)
4962 step 3: adjust the scalar result (s_out3) if needed.
4964 Step 1 can be accomplished using one the following three schemes:
4965 (scheme 1) using reduc_fn, if available.
4966 (scheme 2) using whole-vector shifts, if available.
4967 (scheme 3) using a scalar loop. In this case steps 1+2 above are
4968 combined.
4970 The overall epilog code looks like this:
4972 s_out0 = phi <s_loop> # original EXIT_PHI
4973 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
4974 v_out2 = reduce <v_out1> # step 1
4975 s_out3 = extract_field <v_out2, 0> # step 2
4976 s_out4 = adjust_result <s_out3> # step 3
4978 (step 3 is optional, and steps 1 and 2 may be combined).
4979 Lastly, the uses of s_out0 are replaced by s_out4. */
4982 /* 2.1 Create new loop-exit-phis to preserve loop-closed form:
4983 v_out1 = phi <VECT_DEF>
4984 Store them in NEW_PHIS. */
4986 exit_bb = single_exit (loop)->dest;
4987 prev_phi_info = NULL;
4988 new_phis.create (vect_defs.length ());
4989 FOR_EACH_VEC_ELT (vect_defs, i, def)
4991 for (j = 0; j < ncopies; j++)
4993 tree new_def = copy_ssa_name (def);
4994 phi = create_phi_node (new_def, exit_bb);
4995 set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, loop_vinfo));
4996 if (j == 0)
4997 new_phis.quick_push (phi);
4998 else
5000 def = vect_get_vec_def_for_stmt_copy (dt, def);
5001 STMT_VINFO_RELATED_STMT (prev_phi_info) = phi;
5004 SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, def);
5005 prev_phi_info = vinfo_for_stmt (phi);
5009 /* The epilogue is created for the outer-loop, i.e., for the loop being
5010 vectorized. Create exit phis for the outer loop. */
5011 if (double_reduc)
5013 loop = outer_loop;
5014 exit_bb = single_exit (loop)->dest;
5015 inner_phis.create (vect_defs.length ());
5016 FOR_EACH_VEC_ELT (new_phis, i, phi)
5018 tree new_result = copy_ssa_name (PHI_RESULT (phi));
5019 gphi *outer_phi = create_phi_node (new_result, exit_bb);
5020 SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx,
5021 PHI_RESULT (phi));
5022 set_vinfo_for_stmt (outer_phi, new_stmt_vec_info (outer_phi,
5023 loop_vinfo));
5024 inner_phis.quick_push (phi);
5025 new_phis[i] = outer_phi;
5026 prev_phi_info = vinfo_for_stmt (outer_phi);
5027 while (STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi)))
5029 phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi));
5030 new_result = copy_ssa_name (PHI_RESULT (phi));
5031 outer_phi = create_phi_node (new_result, exit_bb);
5032 SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx,
5033 PHI_RESULT (phi));
5034 set_vinfo_for_stmt (outer_phi, new_stmt_vec_info (outer_phi,
5035 loop_vinfo));
5036 STMT_VINFO_RELATED_STMT (prev_phi_info) = outer_phi;
5037 prev_phi_info = vinfo_for_stmt (outer_phi);
5042 exit_gsi = gsi_after_labels (exit_bb);
5044 /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
5045 (i.e. when reduc_fn is not available) and in the final adjustment
5046 code (if needed). Also get the original scalar reduction variable as
5047 defined in the loop. In case STMT is a "pattern-stmt" (i.e. - it
5048 represents a reduction pattern), the tree-code and scalar-def are
5049 taken from the original stmt that the pattern-stmt (STMT) replaces.
5050 Otherwise (it is a regular reduction) - the tree-code and scalar-def
5051 are taken from STMT. */
5053 orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
5054 if (!orig_stmt)
5056 /* Regular reduction */
5057 orig_stmt = stmt;
5059 else
5061 /* Reduction pattern */
5062 stmt_vec_info stmt_vinfo = vinfo_for_stmt (orig_stmt);
5063 gcc_assert (STMT_VINFO_IN_PATTERN_P (stmt_vinfo));
5064 gcc_assert (STMT_VINFO_RELATED_STMT (stmt_vinfo) == stmt);
5067 code = gimple_assign_rhs_code (orig_stmt);
5068 /* For MINUS_EXPR the initial vector is [init_val,0,...,0], therefore,
5069 partial results are added and not subtracted. */
5070 if (code == MINUS_EXPR)
5071 code = PLUS_EXPR;
5073 scalar_dest = gimple_assign_lhs (orig_stmt);
5074 scalar_type = TREE_TYPE (scalar_dest);
5075 scalar_results.create (group_size);
5076 new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
5077 bitsize = TYPE_SIZE (scalar_type);
5079 /* In case this is a reduction in an inner-loop while vectorizing an outer
5080 loop - we don't need to extract a single scalar result at the end of the
5081 inner-loop (unless it is double reduction, i.e., the use of reduction is
5082 outside the outer-loop). The final vector of partial results will be used
5083 in the vectorized outer-loop, or reduced to a scalar result at the end of
5084 the outer-loop. */
5085 if (nested_in_vect_loop && !double_reduc)
5086 goto vect_finalize_reduction;
5088 /* SLP reduction without reduction chain, e.g.,
5089 # a1 = phi <a2, a0>
5090 # b1 = phi <b2, b0>
5091 a2 = operation (a1)
5092 b2 = operation (b1) */
5093 slp_reduc = (slp_node && !GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)));
5095 /* True if we should implement SLP_REDUC using native reduction operations
5096 instead of scalar operations. */
5097 direct_slp_reduc = (reduc_fn != IFN_LAST
5098 && slp_reduc
5099 && !TYPE_VECTOR_SUBPARTS (vectype).is_constant ());
5101 /* In case of reduction chain, e.g.,
5102 # a1 = phi <a3, a0>
5103 a2 = operation (a1)
5104 a3 = operation (a2),
5106 we may end up with more than one vector result. Here we reduce them to
5107 one vector. */
5108 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) || direct_slp_reduc)
5110 tree first_vect = PHI_RESULT (new_phis[0]);
5111 gassign *new_vec_stmt = NULL;
5112 vec_dest = vect_create_destination_var (scalar_dest, vectype);
5113 for (k = 1; k < new_phis.length (); k++)
5115 gimple *next_phi = new_phis[k];
5116 tree second_vect = PHI_RESULT (next_phi);
5117 tree tem = make_ssa_name (vec_dest, new_vec_stmt);
5118 new_vec_stmt = gimple_build_assign (tem, code,
5119 first_vect, second_vect);
5120 gsi_insert_before (&exit_gsi, new_vec_stmt, GSI_SAME_STMT);
5121 first_vect = tem;
5124 new_phi_result = first_vect;
5125 if (new_vec_stmt)
5127 new_phis.truncate (0);
5128 new_phis.safe_push (new_vec_stmt);
5131 /* Likewise if we couldn't use a single defuse cycle. */
5132 else if (ncopies > 1)
5134 gcc_assert (new_phis.length () == 1);
5135 tree first_vect = PHI_RESULT (new_phis[0]);
5136 gassign *new_vec_stmt = NULL;
5137 vec_dest = vect_create_destination_var (scalar_dest, vectype);
5138 gimple *next_phi = new_phis[0];
5139 for (int k = 1; k < ncopies; ++k)
5141 next_phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (next_phi));
5142 tree second_vect = PHI_RESULT (next_phi);
5143 tree tem = make_ssa_name (vec_dest, new_vec_stmt);
5144 new_vec_stmt = gimple_build_assign (tem, code,
5145 first_vect, second_vect);
5146 gsi_insert_before (&exit_gsi, new_vec_stmt, GSI_SAME_STMT);
5147 first_vect = tem;
5149 new_phi_result = first_vect;
5150 new_phis.truncate (0);
5151 new_phis.safe_push (new_vec_stmt);
5153 else
5154 new_phi_result = PHI_RESULT (new_phis[0]);
5156 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION
5157 && reduc_fn != IFN_LAST)
5159 /* For condition reductions, we have a vector (NEW_PHI_RESULT) containing
5160 various data values where the condition matched and another vector
5161 (INDUCTION_INDEX) containing all the indexes of those matches. We
5162 need to extract the last matching index (which will be the index with
5163 highest value) and use this to index into the data vector.
5164 For the case where there were no matches, the data vector will contain
5165 all default values and the index vector will be all zeros. */
5167 /* Get various versions of the type of the vector of indexes. */
5168 tree index_vec_type = TREE_TYPE (induction_index);
5169 gcc_checking_assert (TYPE_UNSIGNED (index_vec_type));
5170 tree index_scalar_type = TREE_TYPE (index_vec_type);
5171 tree index_vec_cmp_type = build_same_sized_truth_vector_type
5172 (index_vec_type);
5174 /* Get an unsigned integer version of the type of the data vector. */
5175 int scalar_precision
5176 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
5177 tree scalar_type_unsigned = make_unsigned_type (scalar_precision);
5178 tree vectype_unsigned = build_vector_type
5179 (scalar_type_unsigned, TYPE_VECTOR_SUBPARTS (vectype));
5181 /* First we need to create a vector (ZERO_VEC) of zeros and another
5182 vector (MAX_INDEX_VEC) filled with the last matching index, which we
5183 can create using a MAX reduction and then expanding.
5184 In the case where the loop never made any matches, the max index will
5185 be zero. */
5187 /* Vector of {0, 0, 0,...}. */
5188 tree zero_vec = make_ssa_name (vectype);
5189 tree zero_vec_rhs = build_zero_cst (vectype);
5190 gimple *zero_vec_stmt = gimple_build_assign (zero_vec, zero_vec_rhs);
5191 gsi_insert_before (&exit_gsi, zero_vec_stmt, GSI_SAME_STMT);
5193 /* Find maximum value from the vector of found indexes. */
5194 tree max_index = make_ssa_name (index_scalar_type);
5195 gcall *max_index_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
5196 1, induction_index);
5197 gimple_call_set_lhs (max_index_stmt, max_index);
5198 gsi_insert_before (&exit_gsi, max_index_stmt, GSI_SAME_STMT);
5200 /* Vector of {max_index, max_index, max_index,...}. */
5201 tree max_index_vec = make_ssa_name (index_vec_type);
5202 tree max_index_vec_rhs = build_vector_from_val (index_vec_type,
5203 max_index);
5204 gimple *max_index_vec_stmt = gimple_build_assign (max_index_vec,
5205 max_index_vec_rhs);
5206 gsi_insert_before (&exit_gsi, max_index_vec_stmt, GSI_SAME_STMT);
5208 /* Next we compare the new vector (MAX_INDEX_VEC) full of max indexes
5209 with the vector (INDUCTION_INDEX) of found indexes, choosing values
5210 from the data vector (NEW_PHI_RESULT) for matches, 0 (ZERO_VEC)
5211 otherwise. Only one value should match, resulting in a vector
5212 (VEC_COND) with one data value and the rest zeros.
5213 In the case where the loop never made any matches, every index will
5214 match, resulting in a vector with all data values (which will all be
5215 the default value). */
5217 /* Compare the max index vector to the vector of found indexes to find
5218 the position of the max value. */
5219 tree vec_compare = make_ssa_name (index_vec_cmp_type);
5220 gimple *vec_compare_stmt = gimple_build_assign (vec_compare, EQ_EXPR,
5221 induction_index,
5222 max_index_vec);
5223 gsi_insert_before (&exit_gsi, vec_compare_stmt, GSI_SAME_STMT);
5225 /* Use the compare to choose either values from the data vector or
5226 zero. */
5227 tree vec_cond = make_ssa_name (vectype);
5228 gimple *vec_cond_stmt = gimple_build_assign (vec_cond, VEC_COND_EXPR,
5229 vec_compare, new_phi_result,
5230 zero_vec);
5231 gsi_insert_before (&exit_gsi, vec_cond_stmt, GSI_SAME_STMT);
5233 /* Finally we need to extract the data value from the vector (VEC_COND)
5234 into a scalar (MATCHED_DATA_REDUC). Logically we want to do an OR
5235 reduction, but because this doesn't exist, we can use a MAX reduction
5236 instead. The data value might be signed or a float so we need to cast
5237 it first.
5238 In the case where the loop never made any matches, the data values are
5239 all identical, and so will reduce down correctly. */
5241 /* Make the matched data values unsigned. */
5242 tree vec_cond_cast = make_ssa_name (vectype_unsigned);
5243 tree vec_cond_cast_rhs = build1 (VIEW_CONVERT_EXPR, vectype_unsigned,
5244 vec_cond);
5245 gimple *vec_cond_cast_stmt = gimple_build_assign (vec_cond_cast,
5246 VIEW_CONVERT_EXPR,
5247 vec_cond_cast_rhs);
5248 gsi_insert_before (&exit_gsi, vec_cond_cast_stmt, GSI_SAME_STMT);
5250 /* Reduce down to a scalar value. */
5251 tree data_reduc = make_ssa_name (scalar_type_unsigned);
5252 gcall *data_reduc_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
5253 1, vec_cond_cast);
5254 gimple_call_set_lhs (data_reduc_stmt, data_reduc);
5255 gsi_insert_before (&exit_gsi, data_reduc_stmt, GSI_SAME_STMT);
5257 /* Convert the reduced value back to the result type and set as the
5258 result. */
5259 gimple_seq stmts = NULL;
5260 new_temp = gimple_build (&stmts, VIEW_CONVERT_EXPR, scalar_type,
5261 data_reduc);
5262 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5263 scalar_results.safe_push (new_temp);
5265 else if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION
5266 && reduc_fn == IFN_LAST)
5268 /* Condition reduction without supported IFN_REDUC_MAX. Generate
5269 idx = 0;
5270 idx_val = induction_index[0];
5271 val = data_reduc[0];
5272 for (idx = 0, val = init, i = 0; i < nelts; ++i)
5273 if (induction_index[i] > idx_val)
5274 val = data_reduc[i], idx_val = induction_index[i];
5275 return val; */
5277 tree data_eltype = TREE_TYPE (TREE_TYPE (new_phi_result));
5278 tree idx_eltype = TREE_TYPE (TREE_TYPE (induction_index));
5279 unsigned HOST_WIDE_INT el_size = tree_to_uhwi (TYPE_SIZE (idx_eltype));
5280 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (TREE_TYPE (induction_index));
5281 /* Enforced by vectorizable_reduction, which ensures we have target
5282 support before allowing a conditional reduction on variable-length
5283 vectors. */
5284 unsigned HOST_WIDE_INT v_size = el_size * nunits.to_constant ();
5285 tree idx_val = NULL_TREE, val = NULL_TREE;
5286 for (unsigned HOST_WIDE_INT off = 0; off < v_size; off += el_size)
5288 tree old_idx_val = idx_val;
5289 tree old_val = val;
5290 idx_val = make_ssa_name (idx_eltype);
5291 epilog_stmt = gimple_build_assign (idx_val, BIT_FIELD_REF,
5292 build3 (BIT_FIELD_REF, idx_eltype,
5293 induction_index,
5294 bitsize_int (el_size),
5295 bitsize_int (off)));
5296 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5297 val = make_ssa_name (data_eltype);
5298 epilog_stmt = gimple_build_assign (val, BIT_FIELD_REF,
5299 build3 (BIT_FIELD_REF,
5300 data_eltype,
5301 new_phi_result,
5302 bitsize_int (el_size),
5303 bitsize_int (off)));
5304 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5305 if (off != 0)
5307 tree new_idx_val = idx_val;
5308 tree new_val = val;
5309 if (off != v_size - el_size)
5311 new_idx_val = make_ssa_name (idx_eltype);
5312 epilog_stmt = gimple_build_assign (new_idx_val,
5313 MAX_EXPR, idx_val,
5314 old_idx_val);
5315 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5317 new_val = make_ssa_name (data_eltype);
5318 epilog_stmt = gimple_build_assign (new_val,
5319 COND_EXPR,
5320 build2 (GT_EXPR,
5321 boolean_type_node,
5322 idx_val,
5323 old_idx_val),
5324 val, old_val);
5325 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5326 idx_val = new_idx_val;
5327 val = new_val;
5330 /* Convert the reduced value back to the result type and set as the
5331 result. */
5332 gimple_seq stmts = NULL;
5333 val = gimple_convert (&stmts, scalar_type, val);
5334 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5335 scalar_results.safe_push (val);
5338 /* 2.3 Create the reduction code, using one of the three schemes described
5339 above. In SLP we simply need to extract all the elements from the
5340 vector (without reducing them), so we use scalar shifts. */
5341 else if (reduc_fn != IFN_LAST && !slp_reduc)
5343 tree tmp;
5344 tree vec_elem_type;
5346 /* Case 1: Create:
5347 v_out2 = reduc_expr <v_out1> */
5349 if (dump_enabled_p ())
5350 dump_printf_loc (MSG_NOTE, vect_location,
5351 "Reduce using direct vector reduction.\n");
5353 vec_elem_type = TREE_TYPE (TREE_TYPE (new_phi_result));
5354 if (!useless_type_conversion_p (scalar_type, vec_elem_type))
5356 tree tmp_dest
5357 = vect_create_destination_var (scalar_dest, vec_elem_type);
5358 epilog_stmt = gimple_build_call_internal (reduc_fn, 1,
5359 new_phi_result);
5360 gimple_set_lhs (epilog_stmt, tmp_dest);
5361 new_temp = make_ssa_name (tmp_dest, epilog_stmt);
5362 gimple_set_lhs (epilog_stmt, new_temp);
5363 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5365 epilog_stmt = gimple_build_assign (new_scalar_dest, NOP_EXPR,
5366 new_temp);
5368 else
5370 epilog_stmt = gimple_build_call_internal (reduc_fn, 1,
5371 new_phi_result);
5372 gimple_set_lhs (epilog_stmt, new_scalar_dest);
5375 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5376 gimple_set_lhs (epilog_stmt, new_temp);
5377 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5379 if ((STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5380 == INTEGER_INDUC_COND_REDUCTION)
5381 && !operand_equal_p (initial_def, induc_val, 0))
5383 /* Earlier we set the initial value to be a vector of induc_val
5384 values. Check the result and if it is induc_val then replace
5385 it with the original initial value, unless induc_val is
5386 the same as initial_def already. */
5387 tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp,
5388 induc_val);
5390 tmp = make_ssa_name (new_scalar_dest);
5391 epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
5392 initial_def, new_temp);
5393 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5394 new_temp = tmp;
5397 scalar_results.safe_push (new_temp);
5399 else if (direct_slp_reduc)
5401 /* Here we create one vector for each of the GROUP_SIZE results,
5402 with the elements for other SLP statements replaced with the
5403 neutral value. We can then do a normal reduction on each vector. */
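 /* Illustrative example (not generated code): with GROUP_SIZE == 2 and an
    accumulator vector { a0, b0, a1, b1 }, the masked index vector below is
    { 0, 1, 0, 1 }, so the first iteration reduces { a0, n, a1, n } and the
    second reduces { n, b0, n, b1 }, where n is the neutral value
    (e.g. 0 for PLUS).  */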
5405 /* Enforced by vectorizable_reduction. */
5406 gcc_assert (new_phis.length () == 1);
5407 gcc_assert (pow2p_hwi (group_size));
5409 slp_tree orig_phis_slp_node = slp_node_instance->reduc_phis;
5410 vec<gimple *> orig_phis = SLP_TREE_SCALAR_STMTS (orig_phis_slp_node);
5411 gimple_seq seq = NULL;
5413 /* Build a vector {0, 1, 2, ...}, with the same number of elements
5414 and the same element size as VECTYPE. */
5415 tree index = build_index_vector (vectype, 0, 1);
5416 tree index_type = TREE_TYPE (index);
5417 tree index_elt_type = TREE_TYPE (index_type);
5418 tree mask_type = build_same_sized_truth_vector_type (index_type);
5420 /* Create a vector that, for each element, identifies which of
5421 the GROUP_SIZE results should use it. */
5422 tree index_mask = build_int_cst (index_elt_type, group_size - 1);
5423 index = gimple_build (&seq, BIT_AND_EXPR, index_type, index,
5424 build_vector_from_val (index_type, index_mask));
5426 /* Get a neutral vector value. This is simply a splat of the neutral
5427 scalar value if we have one, otherwise the initial scalar value
5428 is itself a neutral value. */
5429 tree vector_identity = NULL_TREE;
5430 if (neutral_op)
5431 vector_identity = gimple_build_vector_from_val (&seq, vectype,
5432 neutral_op);
5433 for (unsigned int i = 0; i < group_size; ++i)
5435 /* If there's no universal neutral value, we can use the
5436 initial scalar value from the original PHI. This is used
5437 for MIN and MAX reduction, for example. */
5438 if (!neutral_op)
5440 tree scalar_value
5441 = PHI_ARG_DEF_FROM_EDGE (orig_phis[i],
5442 loop_preheader_edge (loop));
5443 vector_identity = gimple_build_vector_from_val (&seq, vectype,
5444 scalar_value);
5447 /* Calculate the equivalent of:
5449 sel[j] = (index[j] == i);
5451 which selects the elements of NEW_PHI_RESULT that should
5452 be included in the result. */
5453 tree compare_val = build_int_cst (index_elt_type, i);
5454 compare_val = build_vector_from_val (index_type, compare_val);
5455 tree sel = gimple_build (&seq, EQ_EXPR, mask_type,
5456 index, compare_val);
5458 /* Calculate the equivalent of:
5460 vec = sel ? new_phi_result : vector_identity;
5462 VEC is now suitable for a full vector reduction. */
5463 tree vec = gimple_build (&seq, VEC_COND_EXPR, vectype,
5464 sel, new_phi_result, vector_identity);
5466 /* Do the reduction and convert it to the appropriate type. */
5467 gcall *call = gimple_build_call_internal (reduc_fn, 1, vec);
5468 tree scalar = make_ssa_name (TREE_TYPE (vectype));
5469 gimple_call_set_lhs (call, scalar);
5470 gimple_seq_add_stmt (&seq, call);
5471 scalar = gimple_convert (&seq, scalar_type, scalar);
5472 scalar_results.safe_push (scalar);
5474 gsi_insert_seq_before (&exit_gsi, seq, GSI_SAME_STMT);
5476 else
5478 bool reduce_with_shift;
5479 tree vec_temp;
5481 /* COND reductions all do the final reduction with MAX_EXPR
5482 or MIN_EXPR. */
5483 if (code == COND_EXPR)
5485 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5486 == INTEGER_INDUC_COND_REDUCTION)
5487 code = induc_code;
5488 else
5489 code = MAX_EXPR;
5492 /* See if the target wants to do the final (shift) reduction
5493 in a vector mode of smaller size and first reduce upper/lower
5494 halves against each other. */
5495 enum machine_mode mode1 = mode;
5496 tree vectype1 = vectype;
5497 unsigned sz = tree_to_uhwi (TYPE_SIZE_UNIT (vectype));
5498 unsigned sz1 = sz;
5499 if (!slp_reduc
5500 && (mode1 = targetm.vectorize.split_reduction (mode)) != mode)
5501 sz1 = GET_MODE_SIZE (mode1).to_constant ();
5503 vectype1 = get_vectype_for_scalar_type_and_size (scalar_type, sz1);
5504 reduce_with_shift = have_whole_vector_shift (mode1);
5505 if (!VECTOR_MODE_P (mode1))
5506 reduce_with_shift = false;
5507 else
5509 optab optab = optab_for_tree_code (code, vectype1, optab_default);
5510 if (optab_handler (optab, mode1) == CODE_FOR_nothing)
5511 reduce_with_shift = false;
5514 /* First reduce the vector to the desired vector size on which we
5515 should do the shift reduction, by combining upper and lower halves. */
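 /* Illustrative example: if the accumulator is V4DI but the target's
    preferred mode for the shift reduction is V2DI, the loop below extracts
    the low and high V2DI halves of NEW_TEMP, combines them with CODE and
    continues the reduction on the resulting V2DI value.  */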
5516 new_temp = new_phi_result;
5517 while (sz > sz1)
5519 gcc_assert (!slp_reduc);
5520 sz /= 2;
5521 vectype1 = get_vectype_for_scalar_type_and_size (scalar_type, sz);
5523 /* The target has to make sure we support lowpart/highpart
5524 extraction, either via direct vector extract or through
5525 integer mode punning. */
5526 tree dst1, dst2;
5527 if (convert_optab_handler (vec_extract_optab,
5528 TYPE_MODE (TREE_TYPE (new_temp)),
5529 TYPE_MODE (vectype1))
5530 != CODE_FOR_nothing)
5532 /* Extract sub-vectors directly once vec_extract becomes
5533 a conversion optab. */
5534 dst1 = make_ssa_name (vectype1);
5535 epilog_stmt
5536 = gimple_build_assign (dst1, BIT_FIELD_REF,
5537 build3 (BIT_FIELD_REF, vectype1,
5538 new_temp, TYPE_SIZE (vectype1),
5539 bitsize_int (0)));
5540 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5541 dst2 = make_ssa_name (vectype1);
5542 epilog_stmt
5543 = gimple_build_assign (dst2, BIT_FIELD_REF,
5544 build3 (BIT_FIELD_REF, vectype1,
5545 new_temp, TYPE_SIZE (vectype1),
5546 bitsize_int (sz * BITS_PER_UNIT)));
5547 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5549 else
5551 /* Extract via punning to appropriately sized integer mode
5552 vector. */
5553 tree eltype = build_nonstandard_integer_type (sz * BITS_PER_UNIT,
5554 1);
5555 tree etype = build_vector_type (eltype, 2);
5556 gcc_assert (convert_optab_handler (vec_extract_optab,
5557 TYPE_MODE (etype),
5558 TYPE_MODE (eltype))
5559 != CODE_FOR_nothing);
5560 tree tem = make_ssa_name (etype);
5561 epilog_stmt = gimple_build_assign (tem, VIEW_CONVERT_EXPR,
5562 build1 (VIEW_CONVERT_EXPR,
5563 etype, new_temp));
5564 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5565 new_temp = tem;
5566 tem = make_ssa_name (eltype);
5567 epilog_stmt
5568 = gimple_build_assign (tem, BIT_FIELD_REF,
5569 build3 (BIT_FIELD_REF, eltype,
5570 new_temp, TYPE_SIZE (eltype),
5571 bitsize_int (0)));
5572 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5573 dst1 = make_ssa_name (vectype1);
5574 epilog_stmt = gimple_build_assign (dst1, VIEW_CONVERT_EXPR,
5575 build1 (VIEW_CONVERT_EXPR,
5576 vectype1, tem));
5577 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5578 tem = make_ssa_name (eltype);
5579 epilog_stmt
5580 = gimple_build_assign (tem, BIT_FIELD_REF,
5581 build3 (BIT_FIELD_REF, eltype,
5582 new_temp, TYPE_SIZE (eltype),
5583 bitsize_int (sz * BITS_PER_UNIT)));
5584 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5585 dst2 = make_ssa_name (vectype1);
5586 epilog_stmt = gimple_build_assign (dst2, VIEW_CONVERT_EXPR,
5587 build1 (VIEW_CONVERT_EXPR,
5588 vectype1, tem));
5589 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5592 new_temp = make_ssa_name (vectype1);
5593 epilog_stmt = gimple_build_assign (new_temp, code, dst1, dst2);
5594 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5597 if (reduce_with_shift && !slp_reduc)
5599 int element_bitsize = tree_to_uhwi (bitsize);
5600 /* Enforced by vectorizable_reduction, which disallows SLP reductions
5601 for variable-length vectors and also requires direct target support
5602 for loop reductions. */
5603 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
5604 int nelements = vec_size_in_bits / element_bitsize;
5605 vec_perm_builder sel;
5606 vec_perm_indices indices;
5608 int elt_offset;
5610 tree zero_vec = build_zero_cst (vectype1);
5611 /* Case 2: Create:
5612 for (offset = nelements/2; offset >= 1; offset/=2)
5614 Create: va' = vec_shift <va, offset>
5615 Create: va = vop <va, va'>
5616 } */
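 /* Illustrative example, with nelements == 4 and CODE == PLUS_EXPR:
      va' = vec_shift <va, 2>   (i.e. { va[2], va[3], 0, 0 });  va = va + va'
      va' = vec_shift <va, 1>;                                  va = va + va'
    after which element 0 of va holds the sum of all four original elements
    and is extracted below with a BIT_FIELD_REF at offset 0.  */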
5618 tree rhs;
5620 if (dump_enabled_p ())
5621 dump_printf_loc (MSG_NOTE, vect_location,
5622 "Reduce using vector shifts\n");
5624 mode1 = TYPE_MODE (vectype1);
5625 vec_dest = vect_create_destination_var (scalar_dest, vectype1);
5626 for (elt_offset = nelements / 2;
5627 elt_offset >= 1;
5628 elt_offset /= 2)
5630 calc_vec_perm_mask_for_shift (elt_offset, nelements, &sel);
5631 indices.new_vector (sel, 2, nelements);
5632 tree mask = vect_gen_perm_mask_any (vectype1, indices);
5633 epilog_stmt = gimple_build_assign (vec_dest, VEC_PERM_EXPR,
5634 new_temp, zero_vec, mask);
5635 new_name = make_ssa_name (vec_dest, epilog_stmt);
5636 gimple_assign_set_lhs (epilog_stmt, new_name);
5637 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5639 epilog_stmt = gimple_build_assign (vec_dest, code, new_name,
5640 new_temp);
5641 new_temp = make_ssa_name (vec_dest, epilog_stmt);
5642 gimple_assign_set_lhs (epilog_stmt, new_temp);
5643 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5646 /* 2.4 Extract the final scalar result. Create:
5647 s_out3 = extract_field <v_out2, bitpos> */
5649 if (dump_enabled_p ())
5650 dump_printf_loc (MSG_NOTE, vect_location,
5651 "extract scalar result\n");
5653 rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp,
5654 bitsize, bitsize_zero_node);
5655 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
5656 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5657 gimple_assign_set_lhs (epilog_stmt, new_temp);
5658 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5659 scalar_results.safe_push (new_temp);
5661 else
5663 /* Case 3: Create:
5664 s = extract_field <v_out2, 0>
5665 for (offset = element_size;
5666 offset < vector_size;
5667 offset += element_size)
5669 Create: s' = extract_field <v_out2, offset>
5670 Create: s = op <s, s'> // For non SLP cases
5671 } */
5673 if (dump_enabled_p ())
5674 dump_printf_loc (MSG_NOTE, vect_location,
5675 "Reduce using scalar code.\n");
5677 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
5678 int element_bitsize = tree_to_uhwi (bitsize);
5679 FOR_EACH_VEC_ELT (new_phis, i, new_phi)
5681 int bit_offset;
5682 if (gimple_code (new_phi) == GIMPLE_PHI)
5683 vec_temp = PHI_RESULT (new_phi);
5684 else
5685 vec_temp = gimple_assign_lhs (new_phi);
5686 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, bitsize,
5687 bitsize_zero_node);
5688 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
5689 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5690 gimple_assign_set_lhs (epilog_stmt, new_temp);
5691 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5693 /* In SLP we don't need to apply the reduction operation, so we just
5694 collect s' values in SCALAR_RESULTS. */
5695 if (slp_reduc)
5696 scalar_results.safe_push (new_temp);
5698 for (bit_offset = element_bitsize;
5699 bit_offset < vec_size_in_bits;
5700 bit_offset += element_bitsize)
5702 tree bitpos = bitsize_int (bit_offset);
5703 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp,
5704 bitsize, bitpos);
5706 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
5707 new_name = make_ssa_name (new_scalar_dest, epilog_stmt);
5708 gimple_assign_set_lhs (epilog_stmt, new_name);
5709 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5711 if (slp_reduc)
5713 /* In SLP we don't need to apply the reduction operation, so
5714 we just collect s' values in SCALAR_RESULTS. */
5715 new_temp = new_name;
5716 scalar_results.safe_push (new_name);
5718 else
5720 epilog_stmt = gimple_build_assign (new_scalar_dest, code,
5721 new_name, new_temp);
5722 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5723 gimple_assign_set_lhs (epilog_stmt, new_temp);
5724 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5729 /* The only case in which we need to reduce scalar results in SLP is
5730 unrolling. If the size of SCALAR_RESULTS is greater than
5731 GROUP_SIZE, we reduce them by combining elements modulo
5732 GROUP_SIZE. */
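 /* Illustrative example: with GROUP_SIZE == 2 and four scalar results
    { s0, s1, s2, s3 } from an unrolled SLP instance, the loop below combines
    s0 with s2 and s1 with s3, leaving the two final results in
    SCALAR_RESULTS[0] and SCALAR_RESULTS[1].  */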
5733 if (slp_reduc)
5735 tree res, first_res, new_res;
5736 gimple *new_stmt;
5738 /* Reduce multiple scalar results in case of SLP unrolling. */
5739 for (j = group_size; scalar_results.iterate (j, &res);
5740 j++)
5742 first_res = scalar_results[j % group_size];
5743 new_stmt = gimple_build_assign (new_scalar_dest, code,
5744 first_res, res);
5745 new_res = make_ssa_name (new_scalar_dest, new_stmt);
5746 gimple_assign_set_lhs (new_stmt, new_res);
5747 gsi_insert_before (&exit_gsi, new_stmt, GSI_SAME_STMT);
5748 scalar_results[j % group_size] = new_res;
5751 else
5752 /* Not SLP - we have one scalar to keep in SCALAR_RESULTS. */
5753 scalar_results.safe_push (new_temp);
5756 if ((STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5757 == INTEGER_INDUC_COND_REDUCTION)
5758 && !operand_equal_p (initial_def, induc_val, 0))
5760 /* Earlier we set the initial value to be a vector of induc_val
5761 values. Check the result and if it is induc_val then replace
5762 it with the original initial value, unless induc_val is
5763 the same as initial_def already. */
5764 tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp,
5765 induc_val);
5767 tree tmp = make_ssa_name (new_scalar_dest);
5768 epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
5769 initial_def, new_temp);
5770 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5771 scalar_results[0] = tmp;
5775 vect_finalize_reduction:
5777 if (double_reduc)
5778 loop = loop->inner;
5780 /* 2.5 Adjust the final result by the initial value of the reduction
5781 variable. (When such adjustment is not needed, then
5782 'adjustment_def' is zero). For example, if code is PLUS we create:
5783 new_temp = loop_exit_def + adjustment_def */
5785 if (adjustment_def)
5787 gcc_assert (!slp_reduc);
5788 if (nested_in_vect_loop)
5790 new_phi = new_phis[0];
5791 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) == VECTOR_TYPE);
5792 expr = build2 (code, vectype, PHI_RESULT (new_phi), adjustment_def);
5793 new_dest = vect_create_destination_var (scalar_dest, vectype);
5795 else
5797 new_temp = scalar_results[0];
5798 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
5799 expr = build2 (code, scalar_type, new_temp, adjustment_def);
5800 new_dest = vect_create_destination_var (scalar_dest, scalar_type);
5803 epilog_stmt = gimple_build_assign (new_dest, expr);
5804 new_temp = make_ssa_name (new_dest, epilog_stmt);
5805 gimple_assign_set_lhs (epilog_stmt, new_temp);
5806 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5807 if (nested_in_vect_loop)
5809 set_vinfo_for_stmt (epilog_stmt,
5810 new_stmt_vec_info (epilog_stmt, loop_vinfo));
5811 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (epilog_stmt)) =
5812 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (new_phi));
5814 if (!double_reduc)
5815 scalar_results.quick_push (new_temp);
5816 else
5817 scalar_results[0] = new_temp;
5819 else
5820 scalar_results[0] = new_temp;
5822 new_phis[0] = epilog_stmt;
5825 /* 2.6 Handle the loop-exit phis. Replace the uses of scalar loop-exit
5826 phis with new adjusted scalar results, i.e., replace use <s_out0>
5827 with use <s_out4>.
5829 Transform:
5830 loop_exit:
5831 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
5832 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
5833 v_out2 = reduce <v_out1>
5834 s_out3 = extract_field <v_out2, 0>
5835 s_out4 = adjust_result <s_out3>
5836 use <s_out0>
5837 use <s_out0>
5839 into:
5841 loop_exit:
5842 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
5843 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
5844 v_out2 = reduce <v_out1>
5845 s_out3 = extract_field <v_out2, 0>
5846 s_out4 = adjust_result <s_out3>
5847 use <s_out4>
5848 use <s_out4> */
5851 /* For an SLP reduction chain we reduce the vector results into one vector
5852 if necessary, hence we set GROUP_SIZE to 1 here. SCALAR_DEST is the LHS
5853 of the last stmt in the reduction chain, since we are looking for the
5854 loop exit phi node. */
5855 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
5857 gimple *dest_stmt = SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1];
5858 /* Handle reduction patterns. */
5859 if (STMT_VINFO_RELATED_STMT (vinfo_for_stmt (dest_stmt)))
5860 dest_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (dest_stmt));
5862 scalar_dest = gimple_assign_lhs (dest_stmt);
5863 group_size = 1;
5866 /* In SLP we may have several statements in NEW_PHIS and REDUCTION_PHIS (in
5867 case GROUP_SIZE is greater than the vectorization factor). Therefore, we
5868 need to match SCALAR_RESULTS with corresponding statements. The first
5869 (GROUP_SIZE / number of new vector stmts) scalar results correspond to
5870 the first vector stmt, etc.
5871 (RATIO is equal to (GROUP_SIZE / number of new vector stmts)). */
5872 if (group_size > new_phis.length ())
5874 ratio = group_size / new_phis.length ();
5875 gcc_assert (!(group_size % new_phis.length ()));
5877 else
5878 ratio = 1;
5880 for (k = 0; k < group_size; k++)
5882 if (k % ratio == 0)
5884 epilog_stmt = new_phis[k / ratio];
5885 reduction_phi = reduction_phis[k / ratio];
5886 if (double_reduc)
5887 inner_phi = inner_phis[k / ratio];
5890 if (slp_reduc)
5892 gimple *current_stmt = SLP_TREE_SCALAR_STMTS (slp_node)[k];
5894 orig_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (current_stmt));
5895 /* SLP statements can't participate in patterns. */
5896 gcc_assert (!orig_stmt);
5897 scalar_dest = gimple_assign_lhs (current_stmt);
5900 phis.create (3);
5901 /* Find the loop-closed-use at the loop exit of the original scalar
5902 result. (The reduction result is expected to have two immediate uses -
5903 one at the latch block, and one at the loop exit). */
5904 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
5905 if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p)))
5906 && !is_gimple_debug (USE_STMT (use_p)))
5907 phis.safe_push (USE_STMT (use_p));
5909 /* While we expect to have found an exit_phi because of loop-closed-ssa
5910 form, we can end up without one if the scalar cycle is dead. */
5912 FOR_EACH_VEC_ELT (phis, i, exit_phi)
5914 if (outer_loop)
5916 stmt_vec_info exit_phi_vinfo = vinfo_for_stmt (exit_phi);
5917 gphi *vect_phi;
5919 /* FORNOW. We do not currently support the case in which an inner-loop
5920 reduction is used only outside the outer-loop (rather than in the
5921 outer-loop), unless it is a double reduction. */
5922 gcc_assert ((STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
5923 && !STMT_VINFO_LIVE_P (exit_phi_vinfo))
5924 || double_reduc);
5926 if (double_reduc)
5927 STMT_VINFO_VEC_STMT (exit_phi_vinfo) = inner_phi;
5928 else
5929 STMT_VINFO_VEC_STMT (exit_phi_vinfo) = epilog_stmt;
5930 if (!double_reduc
5931 || STMT_VINFO_DEF_TYPE (exit_phi_vinfo)
5932 != vect_double_reduction_def)
5933 continue;
5935 /* Handle double reduction:
5937 stmt1: s1 = phi <s0, s2> - double reduction phi (outer loop)
5938 stmt2: s3 = phi <s1, s4> - (regular) reduc phi (inner loop)
5939 stmt3: s4 = use (s3) - (regular) reduc stmt (inner loop)
5940 stmt4: s2 = phi <s4> - double reduction stmt (outer loop)
5942 At that point the regular reduction (stmt2 and stmt3) is
5943 already vectorized, as well as the exit phi node, stmt4.
5944 Here we vectorize the phi node of double reduction, stmt1, and
5945 update all relevant statements. */
5947 /* Go through all the uses of s2 to find double reduction phi
5948 node, i.e., stmt1 above. */
5949 orig_name = PHI_RESULT (exit_phi);
5950 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
5952 stmt_vec_info use_stmt_vinfo;
5953 stmt_vec_info new_phi_vinfo;
5954 tree vect_phi_init, preheader_arg, vect_phi_res;
5955 basic_block bb = gimple_bb (use_stmt);
5956 gimple *use;
5958 /* Check that USE_STMT is really a double reduction phi
5959 node. */
5960 if (gimple_code (use_stmt) != GIMPLE_PHI
5961 || gimple_phi_num_args (use_stmt) != 2
5962 || bb->loop_father != outer_loop)
5963 continue;
5964 use_stmt_vinfo = vinfo_for_stmt (use_stmt);
5965 if (!use_stmt_vinfo
5966 || STMT_VINFO_DEF_TYPE (use_stmt_vinfo)
5967 != vect_double_reduction_def)
5968 continue;
5970 /* Create vector phi node for double reduction:
5971 vs1 = phi <vs0, vs2>
5972 vs1 was created previously in this function by a call to
5973 vect_get_vec_def_for_operand and is stored in
5974 vec_initial_def;
5975 vs2 is defined by INNER_PHI, the vectorized EXIT_PHI;
5976 vs0 is created here. */
5978 /* Create vector phi node. */
5979 vect_phi = create_phi_node (vec_initial_def, bb);
5980 new_phi_vinfo = new_stmt_vec_info (vect_phi,
5981 loop_vec_info_for_loop (outer_loop));
5982 set_vinfo_for_stmt (vect_phi, new_phi_vinfo);
5984 /* Create vs0 - initial def of the double reduction phi. */
5985 preheader_arg = PHI_ARG_DEF_FROM_EDGE (use_stmt,
5986 loop_preheader_edge (outer_loop));
5987 vect_phi_init = get_initial_def_for_reduction
5988 (stmt, preheader_arg, NULL);
5990 /* Update phi node arguments with vs0 and vs2. */
5991 add_phi_arg (vect_phi, vect_phi_init,
5992 loop_preheader_edge (outer_loop),
5993 UNKNOWN_LOCATION);
5994 add_phi_arg (vect_phi, PHI_RESULT (inner_phi),
5995 loop_latch_edge (outer_loop), UNKNOWN_LOCATION);
5996 if (dump_enabled_p ())
5998 dump_printf_loc (MSG_NOTE, vect_location,
5999 "created double reduction phi node: ");
6000 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, vect_phi, 0);
6003 vect_phi_res = PHI_RESULT (vect_phi);
6005 /* Replace the use, i.e., set the correct vs1 in the regular
6006 reduction phi node. FORNOW, NCOPIES is always 1, so the
6007 loop is redundant. */
6008 use = reduction_phi;
6009 for (j = 0; j < ncopies; j++)
6011 edge pr_edge = loop_preheader_edge (loop);
6012 SET_PHI_ARG_DEF (use, pr_edge->dest_idx, vect_phi_res);
6013 use = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (use));
6019 phis.release ();
6020 if (nested_in_vect_loop)
6022 if (double_reduc)
6023 loop = outer_loop;
6024 else
6025 continue;
6028 phis.create (3);
6029 /* Find the loop-closed-use at the loop exit of the original scalar
6030 result. (The reduction result is expected to have two immediate uses,
6031 one at the latch block, and one at the loop exit). For double
6032 reductions we are looking for exit phis of the outer loop. */
6033 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
6035 if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
6037 if (!is_gimple_debug (USE_STMT (use_p)))
6038 phis.safe_push (USE_STMT (use_p));
6040 else
6042 if (double_reduc && gimple_code (USE_STMT (use_p)) == GIMPLE_PHI)
6044 tree phi_res = PHI_RESULT (USE_STMT (use_p));
6046 FOR_EACH_IMM_USE_FAST (phi_use_p, phi_imm_iter, phi_res)
6048 if (!flow_bb_inside_loop_p (loop,
6049 gimple_bb (USE_STMT (phi_use_p)))
6050 && !is_gimple_debug (USE_STMT (phi_use_p)))
6051 phis.safe_push (USE_STMT (phi_use_p));
6057 FOR_EACH_VEC_ELT (phis, i, exit_phi)
6059 /* Replace the uses: */
6060 orig_name = PHI_RESULT (exit_phi);
6061 scalar_result = scalar_results[k];
6062 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
6063 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
6064 SET_USE (use_p, scalar_result);
6067 phis.release ();
6071 /* Return a vector of type VECTYPE that is equal to the vector select
6072 operation "MASK ? VEC : IDENTITY". Insert the select statements
6073 before GSI. */
6075 static tree
6076 merge_with_identity (gimple_stmt_iterator *gsi, tree mask, tree vectype,
6077 tree vec, tree identity)
6079 tree cond = make_temp_ssa_name (vectype, NULL, "cond");
6080 gimple *new_stmt = gimple_build_assign (cond, VEC_COND_EXPR,
6081 mask, vec, identity);
6082 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
6083 return cond;
6086 /* Successively apply CODE to each element of VECTOR_RHS, in left-to-right
6087 order, starting with LHS. Insert the extraction statements before GSI and
6088 associate the new scalar SSA names with variable SCALAR_DEST.
6089 Return the SSA name for the result. */
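 /* Illustrative example: expanding a four-element VECTOR_RHS with
    CODE == PLUS_EXPR emits roughly
      s0 = BIT_FIELD_REF <v, bitsize, 0>;        lhs1 = lhs + s0;
      s1 = BIT_FIELD_REF <v, bitsize, bitsize>;  lhs2 = lhs1 + s1;
      ...
    and returns the last lhs, preserving the left-to-right association
    required for in-order reductions.  */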
6091 static tree
6092 vect_expand_fold_left (gimple_stmt_iterator *gsi, tree scalar_dest,
6093 tree_code code, tree lhs, tree vector_rhs)
6095 tree vectype = TREE_TYPE (vector_rhs);
6096 tree scalar_type = TREE_TYPE (vectype);
6097 tree bitsize = TYPE_SIZE (scalar_type);
6098 unsigned HOST_WIDE_INT vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
6099 unsigned HOST_WIDE_INT element_bitsize = tree_to_uhwi (bitsize);
6101 for (unsigned HOST_WIDE_INT bit_offset = 0;
6102 bit_offset < vec_size_in_bits;
6103 bit_offset += element_bitsize)
6105 tree bitpos = bitsize_int (bit_offset);
6106 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vector_rhs,
6107 bitsize, bitpos);
6109 gassign *stmt = gimple_build_assign (scalar_dest, rhs);
6110 rhs = make_ssa_name (scalar_dest, stmt);
6111 gimple_assign_set_lhs (stmt, rhs);
6112 gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
6114 stmt = gimple_build_assign (scalar_dest, code, lhs, rhs);
6115 tree new_name = make_ssa_name (scalar_dest, stmt);
6116 gimple_assign_set_lhs (stmt, new_name);
6117 gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
6118 lhs = new_name;
6120 return lhs;
6123 /* Perform an in-order reduction (FOLD_LEFT_REDUCTION). STMT is the
6124 statement that sets the live-out value. REDUC_DEF_STMT is the phi
6125 statement. CODE is the operation performed by STMT and OPS are
6126 its scalar operands. REDUC_INDEX is the index of the operand in
6127 OPS that is set by REDUC_DEF_STMT. REDUC_FN is the function that
6128 implements in-order reduction, or IFN_LAST if we should open-code it.
6129 VECTYPE_IN is the type of the vector input. MASKS specifies the masks
6130 that should be used to control the operation in a fully-masked loop. */
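 /* Illustrative example: an in-order float addition of a vector
    { a0, a1, a2, a3 } into the running value RES computes
    (((RES + a0) + a1) + a2) + a3, either through a single call to REDUC_FN
    when the target provides one or by expanding element by element with
    vect_expand_fold_left, so the result matches the scalar evaluation
    order exactly.  */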
6132 static bool
6133 vectorize_fold_left_reduction (gimple *stmt, gimple_stmt_iterator *gsi,
6134 gimple **vec_stmt, slp_tree slp_node,
6135 gimple *reduc_def_stmt,
6136 tree_code code, internal_fn reduc_fn,
6137 tree ops[3], tree vectype_in,
6138 int reduc_index, vec_loop_masks *masks)
6140 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
6141 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
6142 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6143 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
6144 gimple *new_stmt = NULL;
6146 int ncopies;
6147 if (slp_node)
6148 ncopies = 1;
6149 else
6150 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
6152 gcc_assert (!nested_in_vect_loop_p (loop, stmt));
6153 gcc_assert (ncopies == 1);
6154 gcc_assert (TREE_CODE_LENGTH (code) == binary_op);
6155 gcc_assert (reduc_index == (code == MINUS_EXPR ? 0 : 1));
6156 gcc_assert (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
6157 == FOLD_LEFT_REDUCTION);
6159 if (slp_node)
6160 gcc_assert (known_eq (TYPE_VECTOR_SUBPARTS (vectype_out),
6161 TYPE_VECTOR_SUBPARTS (vectype_in)));
6163 tree op0 = ops[1 - reduc_index];
6165 int group_size = 1;
6166 gimple *scalar_dest_def;
6167 auto_vec<tree> vec_oprnds0;
6168 if (slp_node)
6170 vect_get_vec_defs (op0, NULL_TREE, stmt, &vec_oprnds0, NULL, slp_node);
6171 group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
6172 scalar_dest_def = SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1];
6174 else
6176 tree loop_vec_def0 = vect_get_vec_def_for_operand (op0, stmt);
6177 vec_oprnds0.create (1);
6178 vec_oprnds0.quick_push (loop_vec_def0);
6179 scalar_dest_def = stmt;
6182 tree scalar_dest = gimple_assign_lhs (scalar_dest_def);
6183 tree scalar_type = TREE_TYPE (scalar_dest);
6184 tree reduc_var = gimple_phi_result (reduc_def_stmt);
6186 int vec_num = vec_oprnds0.length ();
6187 gcc_assert (vec_num == 1 || slp_node);
6188 tree vec_elem_type = TREE_TYPE (vectype_out);
6189 gcc_checking_assert (useless_type_conversion_p (scalar_type, vec_elem_type));
6191 tree vector_identity = NULL_TREE;
6192 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
6193 vector_identity = build_zero_cst (vectype_out);
6195 tree scalar_dest_var = vect_create_destination_var (scalar_dest, NULL);
6196 int i;
6197 tree def0;
6198 FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
6200 tree mask = NULL_TREE;
6201 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
6202 mask = vect_get_loop_mask (gsi, masks, vec_num, vectype_in, i);
6204 /* Handle MINUS by adding the negative. */
6205 if (reduc_fn != IFN_LAST && code == MINUS_EXPR)
6207 tree negated = make_ssa_name (vectype_out);
6208 new_stmt = gimple_build_assign (negated, NEGATE_EXPR, def0);
6209 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
6210 def0 = negated;
6213 if (mask)
6214 def0 = merge_with_identity (gsi, mask, vectype_out, def0,
6215 vector_identity);
6217 /* On the first iteration the input is simply the scalar phi
6218 result, and for subsequent iterations it is the output of
6219 the preceding operation. */
6220 if (reduc_fn != IFN_LAST)
6222 new_stmt = gimple_build_call_internal (reduc_fn, 2, reduc_var, def0);
6223 /* For chained SLP reductions the output of the previous reduction
6224 operation serves as the input of the next. For the final statement
6225 the output cannot be a temporary - we reuse the original
6226 scalar destination of the last statement. */
6227 if (i != vec_num - 1)
6229 gimple_set_lhs (new_stmt, scalar_dest_var);
6230 reduc_var = make_ssa_name (scalar_dest_var, new_stmt);
6231 gimple_set_lhs (new_stmt, reduc_var);
6234 else
6236 reduc_var = vect_expand_fold_left (gsi, scalar_dest_var, code,
6237 reduc_var, def0);
6238 new_stmt = SSA_NAME_DEF_STMT (reduc_var);
6239 /* Remove the statement, so that we can use the same code paths
6240 as for statements that we've just created. */
6241 gimple_stmt_iterator tmp_gsi = gsi_for_stmt (new_stmt);
6242 gsi_remove (&tmp_gsi, false);
6245 if (i == vec_num - 1)
6247 gimple_set_lhs (new_stmt, scalar_dest);
6248 vect_finish_replace_stmt (scalar_dest_def, new_stmt);
6250 else
6251 vect_finish_stmt_generation (scalar_dest_def, new_stmt, gsi);
6253 if (slp_node)
6254 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
6257 if (!slp_node)
6258 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
6260 return true;
6263 /* Function is_nonwrapping_integer_induction.
6265 Check whether STMT (which is part of loop LOOP) is an integer
6266 induction that is guaranteed not to overflow. */
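 /* Illustrative example: a 16-bit unsigned induction with base 0 and step 4
    in a loop that executes at most 1000 iterations reaches at most 4000,
    which needs only 12 bits, so it fits in the 16-bit type and cannot
    wrap.  */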
6268 static bool
6269 is_nonwrapping_integer_induction (gimple *stmt, struct loop *loop)
6271 stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
6272 tree base = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo);
6273 tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo);
6274 tree lhs_type = TREE_TYPE (gimple_phi_result (stmt));
6275 widest_int ni, max_loop_value, lhs_max;
6276 bool overflow = false;
6278 /* Make sure the loop is integer based. */
6279 if (TREE_CODE (base) != INTEGER_CST
6280 || TREE_CODE (step) != INTEGER_CST)
6281 return false;
6283 /* Check that the maximum value reached by the induction will not wrap. */
6285 if (TYPE_OVERFLOW_UNDEFINED (lhs_type))
6286 return true;
6288 if (! max_stmt_executions (loop, &ni))
6289 return false;
6291 max_loop_value = wi::mul (wi::to_widest (step), ni, TYPE_SIGN (lhs_type),
6292 &overflow);
6293 if (overflow)
6294 return false;
6296 max_loop_value = wi::add (wi::to_widest (base), max_loop_value,
6297 TYPE_SIGN (lhs_type), &overflow);
6298 if (overflow)
6299 return false;
6301 return (wi::min_precision (max_loop_value, TYPE_SIGN (lhs_type))
6302 <= TYPE_PRECISION (lhs_type));
6305 /* Function vectorizable_reduction.
6307 Check if STMT performs a reduction operation that can be vectorized.
6308 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
6309 stmt to replace it, put it in VEC_STMT, and insert it at GSI.
6310 Return FALSE if not a vectorizable STMT, TRUE otherwise.
6312 This function also handles reduction idioms (patterns) that have been
6313 recognized in advance during vect_pattern_recog. In this case, STMT may be
6314 of this form:
6315 X = pattern_expr (arg0, arg1, ..., X)
6316 and its STMT_VINFO_RELATED_STMT points to the last stmt in the original
6317 sequence that had been detected and replaced by the pattern-stmt (STMT).
6319 This function also handles reduction of condition expressions, for example:
6320 for (int i = 0; i < N; i++)
6321 if (a[i] < value)
6322 last = a[i];
6323 This is handled by vectorizing the loop and creating an additional vector
6324 containing the loop indexes for which "a[i] < value" was true. In the
6325 function epilogue this is reduced to a single max value and then used to
6326 index into the vector of results.
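 As an illustrative sketch, the vectorized loop conceptually maintains a
 second vector that records, per lane, the last loop index at which
 "a[i] < value" held; the epilogue reduces that index vector with a MAX
 and uses the position of the maximum to select the corresponding element
 of the data vector (see the induction-index epilogue code earlier in
 this file).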
6328 In some cases of reduction patterns, the type of the reduction variable X is
6329 different than the type of the other arguments of STMT.
6330 In such cases, the vectype that is used when transforming STMT into a vector
6331 stmt is different than the vectype that is used to determine the
6332 vectorization factor, because it consists of a different number of elements
6333 than the actual number of elements that are being operated upon in parallel.
6335 For example, consider an accumulation of shorts into an int accumulator.
6336 On some targets it's possible to vectorize this pattern operating on 8
6337 shorts at a time (hence, the vectype for purposes of determining the
6338 vectorization factor should be V8HI); on the other hand, the vectype that
6339 is used to create the vector form is actually V4SI (the type of the result).
6341 Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
6342 indicates what is the actual level of parallelism (V8HI in the example), so
6343 that the right vectorization factor would be derived. This vectype
6344 corresponds to the type of arguments to the reduction stmt, and should *NOT*
6345 be used to create the vectorized stmt. The right vectype for the vectorized
6346 stmt is obtained from the type of the result X:
6347 get_vectype_for_scalar_type (TREE_TYPE (X))
6349 This means that, contrary to "regular" reductions (or "regular" stmts in
6350 general), the following equation:
6351 STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (TREE_TYPE (X))
6352 does *NOT* necessarily hold for reduction patterns. */
6354 bool
6355 vectorizable_reduction (gimple *stmt, gimple_stmt_iterator *gsi,
6356 gimple **vec_stmt, slp_tree slp_node,
6357 slp_instance slp_node_instance)
6359 tree vec_dest;
6360 tree scalar_dest;
6361 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
6362 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
6363 tree vectype_in = NULL_TREE;
6364 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
6365 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6366 enum tree_code code, orig_code;
6367 internal_fn reduc_fn;
6368 machine_mode vec_mode;
6369 int op_type;
6370 optab optab;
6371 tree new_temp = NULL_TREE;
6372 gimple *def_stmt;
6373 enum vect_def_type dt, cond_reduc_dt = vect_unknown_def_type;
6374 gimple *cond_reduc_def_stmt = NULL;
6375 enum tree_code cond_reduc_op_code = ERROR_MARK;
6376 tree scalar_type;
6377 bool is_simple_use;
6378 gimple *orig_stmt;
6379 stmt_vec_info orig_stmt_info = NULL;
6380 int i;
6381 int ncopies;
6382 int epilog_copies;
6383 stmt_vec_info prev_stmt_info, prev_phi_info;
6384 bool single_defuse_cycle = false;
6385 gimple *new_stmt = NULL;
6386 int j;
6387 tree ops[3];
6388 enum vect_def_type dts[3];
6389 bool nested_cycle = false, found_nested_cycle_def = false;
6390 bool double_reduc = false;
6391 basic_block def_bb;
6392 struct loop * def_stmt_loop, *outer_loop = NULL;
6393 tree def_arg;
6394 gimple *def_arg_stmt;
6395 auto_vec<tree> vec_oprnds0;
6396 auto_vec<tree> vec_oprnds1;
6397 auto_vec<tree> vec_oprnds2;
6398 auto_vec<tree> vect_defs;
6399 auto_vec<gimple *> phis;
6400 int vec_num;
6401 tree def0, tem;
6402 bool first_p = true;
6403 tree cr_index_scalar_type = NULL_TREE, cr_index_vector_type = NULL_TREE;
6404 tree cond_reduc_val = NULL_TREE;
6406 /* Make sure it was already recognized as a reduction computation. */
6407 if (STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmt)) != vect_reduction_def
6408 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmt)) != vect_nested_cycle)
6409 return false;
6411 if (nested_in_vect_loop_p (loop, stmt))
6413 outer_loop = loop;
6414 loop = loop->inner;
6415 nested_cycle = true;
6418 /* In the case of a reduction chain we switch to the first stmt in the
6419 chain, but we don't update STMT_INFO, since only the last stmt is marked
6420 as a reduction and has reduction properties. */
6421 if (GROUP_FIRST_ELEMENT (stmt_info)
6422 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
6424 stmt = GROUP_FIRST_ELEMENT (stmt_info);
6425 first_p = false;
6428 if (gimple_code (stmt) == GIMPLE_PHI)
6430 /* Analysis is fully done on the reduction stmt invocation. */
6431 if (! vec_stmt)
6433 if (slp_node)
6434 slp_node_instance->reduc_phis = slp_node;
6436 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
6437 return true;
6440 if (STMT_VINFO_REDUC_TYPE (stmt_info) == FOLD_LEFT_REDUCTION)
6441 /* Leave the scalar phi in place. Note that checking
6442 STMT_VINFO_VEC_REDUCTION_TYPE (as below) only works
6443 for reductions involving a single statement. */
6444 return true;
6446 gimple *reduc_stmt = STMT_VINFO_REDUC_DEF (stmt_info);
6447 if (STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (reduc_stmt)))
6448 reduc_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (reduc_stmt));
6450 if (STMT_VINFO_VEC_REDUCTION_TYPE (vinfo_for_stmt (reduc_stmt))
6451 == EXTRACT_LAST_REDUCTION)
6452 /* Leave the scalar phi in place. */
6453 return true;
6455 gcc_assert (is_gimple_assign (reduc_stmt));
6456 for (unsigned k = 1; k < gimple_num_ops (reduc_stmt); ++k)
6458 tree op = gimple_op (reduc_stmt, k);
6459 if (op == gimple_phi_result (stmt))
6460 continue;
6461 if (k == 1
6462 && gimple_assign_rhs_code (reduc_stmt) == COND_EXPR)
6463 continue;
6464 if (!vectype_in
6465 || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in)))
6466 < GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (op)))))
6467 vectype_in = get_vectype_for_scalar_type (TREE_TYPE (op));
6468 break;
6470 gcc_assert (vectype_in);
6472 if (slp_node)
6473 ncopies = 1;
6474 else
6475 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
6477 use_operand_p use_p;
6478 gimple *use_stmt;
6479 if (ncopies > 1
6480 && (STMT_VINFO_RELEVANT (vinfo_for_stmt (reduc_stmt))
6481 <= vect_used_only_live)
6482 && single_imm_use (gimple_phi_result (stmt), &use_p, &use_stmt)
6483 && (use_stmt == reduc_stmt
6484 || (STMT_VINFO_RELATED_STMT (vinfo_for_stmt (use_stmt))
6485 == reduc_stmt)))
6486 single_defuse_cycle = true;
6488 /* Create the destination vector */
6489 scalar_dest = gimple_assign_lhs (reduc_stmt);
6490 vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
6492 if (slp_node)
6493 /* The size vect_schedule_slp_instance computes is off for us. */
6494 vec_num = vect_get_num_vectors
6495 (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
6496 * SLP_TREE_SCALAR_STMTS (slp_node).length (),
6497 vectype_in);
6498 else
6499 vec_num = 1;
6501 /* Generate the reduction PHIs upfront. */
6502 prev_phi_info = NULL;
6503 for (j = 0; j < ncopies; j++)
6505 if (j == 0 || !single_defuse_cycle)
6507 for (i = 0; i < vec_num; i++)
6509 /* Create the reduction-phi that defines the reduction
6510 operand. */
6511 gimple *new_phi = create_phi_node (vec_dest, loop->header);
6512 set_vinfo_for_stmt (new_phi,
6513 new_stmt_vec_info (new_phi, loop_vinfo));
6515 if (slp_node)
6516 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_phi);
6517 else
6519 if (j == 0)
6520 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_phi;
6521 else
6522 STMT_VINFO_RELATED_STMT (prev_phi_info) = new_phi;
6523 prev_phi_info = vinfo_for_stmt (new_phi);
6529 return true;
6532 /* 1. Is vectorizable reduction? */
6533 /* Not supportable if the reduction variable is used in the loop, unless
6534 it's a reduction chain. */
6535 if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer
6536 && !GROUP_FIRST_ELEMENT (stmt_info))
6537 return false;
6539 /* Reductions that are not used even in an enclosing outer-loop
6540 are expected to be "live" (used out of the loop). */
6541 if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope
6542 && !STMT_VINFO_LIVE_P (stmt_info))
6543 return false;
6545 /* 2. Has this been recognized as a reduction pattern?
6547 Check if STMT represents a pattern that has been recognized
6548 in earlier analysis stages. For stmts that represent a pattern,
6549 the STMT_VINFO_RELATED_STMT field records the last stmt in
6550 the original sequence that constitutes the pattern. */
6552 orig_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt));
6553 if (orig_stmt)
6555 orig_stmt_info = vinfo_for_stmt (orig_stmt);
6556 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
6557 gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
6560 /* 3. Check the operands of the operation. The first operands are defined
6561 inside the loop body. The last operand is the reduction variable,
6562 which is defined by the loop-header-phi. */
6564 gcc_assert (is_gimple_assign (stmt));
6566 /* Flatten RHS. */
6567 switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
6569 case GIMPLE_BINARY_RHS:
6570 code = gimple_assign_rhs_code (stmt);
6571 op_type = TREE_CODE_LENGTH (code);
6572 gcc_assert (op_type == binary_op);
6573 ops[0] = gimple_assign_rhs1 (stmt);
6574 ops[1] = gimple_assign_rhs2 (stmt);
6575 break;
6577 case GIMPLE_TERNARY_RHS:
6578 code = gimple_assign_rhs_code (stmt);
6579 op_type = TREE_CODE_LENGTH (code);
6580 gcc_assert (op_type == ternary_op);
6581 ops[0] = gimple_assign_rhs1 (stmt);
6582 ops[1] = gimple_assign_rhs2 (stmt);
6583 ops[2] = gimple_assign_rhs3 (stmt);
6584 break;
6586 case GIMPLE_UNARY_RHS:
6587 return false;
6589 default:
6590 gcc_unreachable ();
6593 if (code == COND_EXPR && slp_node)
6594 return false;
6596 scalar_dest = gimple_assign_lhs (stmt);
6597 scalar_type = TREE_TYPE (scalar_dest);
6598 if (!POINTER_TYPE_P (scalar_type) && !INTEGRAL_TYPE_P (scalar_type)
6599 && !SCALAR_FLOAT_TYPE_P (scalar_type))
6600 return false;
6602 /* Do not try to vectorize bit-precision reductions. */
6603 if (!type_has_mode_precision_p (scalar_type))
6604 return false;
6606 /* All uses but the last are expected to be defined in the loop.
6607 The last use is the reduction variable. In case of nested cycle this
6608 assumption is not true: we use reduc_index to record the index of the
6609 reduction variable. */
6610 gimple *reduc_def_stmt = NULL;
6611 int reduc_index = -1;
6612 for (i = 0; i < op_type; i++)
6614 /* The condition of COND_EXPR is checked in vectorizable_condition(). */
6615 if (i == 0 && code == COND_EXPR)
6616 continue;
6618 is_simple_use = vect_is_simple_use (ops[i], loop_vinfo,
6619 &def_stmt, &dts[i], &tem);
6620 dt = dts[i];
6621 gcc_assert (is_simple_use);
6622 if (dt == vect_reduction_def)
6624 reduc_def_stmt = def_stmt;
6625 reduc_index = i;
6626 continue;
6628 else if (tem)
6630 /* To properly compute ncopies we are interested in the widest
6631 input type in case we're looking at a widening accumulation. */
6632 if (!vectype_in
6633 || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in)))
6634 < GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (tem)))))
6635 vectype_in = tem;
6638 if (dt != vect_internal_def
6639 && dt != vect_external_def
6640 && dt != vect_constant_def
6641 && dt != vect_induction_def
6642 && !(dt == vect_nested_cycle && nested_cycle))
6643 return false;
6645 if (dt == vect_nested_cycle)
6647 found_nested_cycle_def = true;
6648 reduc_def_stmt = def_stmt;
6649 reduc_index = i;
6652 if (i == 1 && code == COND_EXPR)
6654 /* Record how value of COND_EXPR is defined. */
6655 if (dt == vect_constant_def)
6657 cond_reduc_dt = dt;
6658 cond_reduc_val = ops[i];
6660 if (dt == vect_induction_def
6661 && def_stmt != NULL
6662 && is_nonwrapping_integer_induction (def_stmt, loop))
6664 cond_reduc_dt = dt;
6665 cond_reduc_def_stmt = def_stmt;
6670 if (!vectype_in)
6671 vectype_in = vectype_out;
6673 /* When vectorizing a reduction chain w/o SLP the reduction PHI is not
6674 directly used in stmt. */
6675 if (reduc_index == -1)
6677 if (STMT_VINFO_REDUC_TYPE (stmt_info) == FOLD_LEFT_REDUCTION)
6679 if (dump_enabled_p ())
6680 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6681 "in-order reduction chain without SLP.\n");
6682 return false;
6685 if (orig_stmt)
6686 reduc_def_stmt = STMT_VINFO_REDUC_DEF (orig_stmt_info);
6687 else
6688 reduc_def_stmt = STMT_VINFO_REDUC_DEF (stmt_info);
6691 if (! reduc_def_stmt || gimple_code (reduc_def_stmt) != GIMPLE_PHI)
6692 return false;
6694 if (!(reduc_index == -1
6695 || dts[reduc_index] == vect_reduction_def
6696 || dts[reduc_index] == vect_nested_cycle
6697 || ((dts[reduc_index] == vect_internal_def
6698 || dts[reduc_index] == vect_external_def
6699 || dts[reduc_index] == vect_constant_def
6700 || dts[reduc_index] == vect_induction_def)
6701 && nested_cycle && found_nested_cycle_def)))
6703 /* For pattern recognized stmts, orig_stmt might be a reduction,
6704 but some helper statements for the pattern might not, or
6705 might be COND_EXPRs with reduction uses in the condition. */
6706 gcc_assert (orig_stmt);
6707 return false;
6710 stmt_vec_info reduc_def_info = vinfo_for_stmt (reduc_def_stmt);
6711 enum vect_reduction_type v_reduc_type
6712 = STMT_VINFO_REDUC_TYPE (reduc_def_info);
6713 gimple *tmp = STMT_VINFO_REDUC_DEF (reduc_def_info);
6715 STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) = v_reduc_type;
6716 /* If we have a condition reduction, see if we can simplify it further. */
6717 if (v_reduc_type == COND_REDUCTION)
6719 /* Loop peeling modifies the initial value of the reduction PHI, which
6720 makes the reduction stmt to be transformed differ from the
6721 original stmt that was analyzed. We need to record the reduction code
6722 for a CONST_COND_REDUCTION type reduction at the analysis stage so that
6723 it can be used directly at the transform stage. */
6724 if (STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info) == MAX_EXPR
6725 || STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info) == MIN_EXPR)
6727 /* Also set the reduction type to CONST_COND_REDUCTION. */
6728 gcc_assert (cond_reduc_dt == vect_constant_def);
6729 STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) = CONST_COND_REDUCTION;
6731 else if (direct_internal_fn_supported_p (IFN_FOLD_EXTRACT_LAST,
6732 vectype_in, OPTIMIZE_FOR_SPEED))
6734 if (dump_enabled_p ())
6735 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6736 "optimizing condition reduction with"
6737 " FOLD_EXTRACT_LAST.\n");
6738 STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) = EXTRACT_LAST_REDUCTION;
6740 else if (cond_reduc_dt == vect_induction_def)
6742 stmt_vec_info cond_stmt_vinfo = vinfo_for_stmt (cond_reduc_def_stmt);
6743 tree base
6744 = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (cond_stmt_vinfo);
6745 tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (cond_stmt_vinfo);
6747 gcc_assert (TREE_CODE (base) == INTEGER_CST
6748 && TREE_CODE (step) == INTEGER_CST);
6749 cond_reduc_val = NULL_TREE;
6750 /* Find a suitable value: below base for MAX_EXPR, above base for
6751 MIN_EXPR; punt for now if base is the minimum value of the type for
6752 MAX_EXPR or the maximum value of the type for MIN_EXPR. */
6753 if (tree_int_cst_sgn (step) == -1)
6755 cond_reduc_op_code = MIN_EXPR;
6756 if (tree_int_cst_sgn (base) == -1)
6757 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
6758 else if (tree_int_cst_lt (base,
6759 TYPE_MAX_VALUE (TREE_TYPE (base))))
6760 cond_reduc_val
6761 = int_const_binop (PLUS_EXPR, base, integer_one_node);
6763 else
6765 cond_reduc_op_code = MAX_EXPR;
6766 if (tree_int_cst_sgn (base) == 1)
6767 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
6768 else if (tree_int_cst_lt (TYPE_MIN_VALUE (TREE_TYPE (base)),
6769 base))
6770 cond_reduc_val
6771 = int_const_binop (MINUS_EXPR, base, integer_one_node);
6773 if (cond_reduc_val)
6775 if (dump_enabled_p ())
6776 dump_printf_loc (MSG_NOTE, vect_location,
6777 "condition expression based on "
6778 "integer induction.\n");
6779 STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
6780 = INTEGER_INDUC_COND_REDUCTION;
6783 else if (cond_reduc_dt == vect_constant_def)
6785 enum vect_def_type cond_initial_dt;
6786 gimple *def_stmt = SSA_NAME_DEF_STMT (ops[reduc_index]);
6787 tree cond_initial_val
6788 = PHI_ARG_DEF_FROM_EDGE (def_stmt, loop_preheader_edge (loop));
6790 gcc_assert (cond_reduc_val != NULL_TREE);
6791 vect_is_simple_use (cond_initial_val, loop_vinfo,
6792 &def_stmt, &cond_initial_dt);
6793 if (cond_initial_dt == vect_constant_def
6794 && types_compatible_p (TREE_TYPE (cond_initial_val),
6795 TREE_TYPE (cond_reduc_val)))
6797 tree e = fold_binary (LE_EXPR, boolean_type_node,
6798 cond_initial_val, cond_reduc_val);
6799 if (e && (integer_onep (e) || integer_zerop (e)))
6801 if (dump_enabled_p ())
6802 dump_printf_loc (MSG_NOTE, vect_location,
6803 "condition expression based on "
6804 "compile time constant.\n");
6805 /* Record reduction code at analysis stage. */
6806 STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info)
6807 = integer_onep (e) ? MAX_EXPR : MIN_EXPR;
6808 STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
6809 = CONST_COND_REDUCTION;
6815 if (orig_stmt)
6816 gcc_assert (tmp == orig_stmt
6817 || GROUP_FIRST_ELEMENT (vinfo_for_stmt (tmp)) == orig_stmt);
6818 else
6819 /* We changed STMT to be the first stmt in reduction chain, hence we
6820 check that in this case the first element in the chain is STMT. */
6821 gcc_assert (stmt == tmp
6822 || GROUP_FIRST_ELEMENT (vinfo_for_stmt (tmp)) == stmt);
6824 if (STMT_VINFO_LIVE_P (vinfo_for_stmt (reduc_def_stmt)))
6825 return false;
6827 if (slp_node)
6828 ncopies = 1;
6829 else
6830 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
6832 gcc_assert (ncopies >= 1);
6834 vec_mode = TYPE_MODE (vectype_in);
6835 poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
6837 if (code == COND_EXPR)
6839 /* Only call during the analysis stage, otherwise we'll lose
6840 STMT_VINFO_TYPE. */
6841 if (!vec_stmt && !vectorizable_condition (stmt, gsi, NULL,
6842 ops[reduc_index], 0, NULL))
6844 if (dump_enabled_p ())
6845 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6846 "unsupported condition in reduction\n");
6847 return false;
6850 else
6852 /* 4. Supportable by target? */
6854 if (code == LSHIFT_EXPR || code == RSHIFT_EXPR
6855 || code == LROTATE_EXPR || code == RROTATE_EXPR)
6857 /* Shifts and rotates are only supported by vectorizable_shifts,
6858 not vectorizable_reduction. */
6859 if (dump_enabled_p ())
6860 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6861 "unsupported shift or rotation.\n");
6862 return false;
6865 /* 4.1. check support for the operation in the loop */
6866 optab = optab_for_tree_code (code, vectype_in, optab_default);
6867 if (!optab)
6869 if (dump_enabled_p ())
6870 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6871 "no optab.\n");
6873 return false;
6876 if (optab_handler (optab, vec_mode) == CODE_FOR_nothing)
6878 if (dump_enabled_p ())
6879 dump_printf (MSG_NOTE, "op not supported by target.\n");
6881 if (maybe_ne (GET_MODE_SIZE (vec_mode), UNITS_PER_WORD)
6882 || !vect_worthwhile_without_simd_p (loop_vinfo, code))
6883 return false;
6885 if (dump_enabled_p ())
6886 dump_printf (MSG_NOTE, "proceeding using word mode.\n");
6889 /* Worthwhile without SIMD support? */
6890 if (!VECTOR_MODE_P (TYPE_MODE (vectype_in))
6891 && !vect_worthwhile_without_simd_p (loop_vinfo, code))
6893 if (dump_enabled_p ())
6894 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6895 "not worthwhile without SIMD support.\n");
6897 return false;
6901 /* 4.2. Check support for the epilog operation.
6903 If STMT represents a reduction pattern, then the type of the
6904 reduction variable may be different than the type of the rest
6905 of the arguments. For example, consider the case of accumulation
6906 of shorts into an int accumulator; The original code:
6907 S1: int_a = (int) short_a;
6908 orig_stmt-> S2: int_acc = plus <int_a ,int_acc>;
6910 was replaced with:
6911 STMT: int_acc = widen_sum <short_a, int_acc>
6913 This means that:
6914 1. The tree-code that is used to create the vector operation in the
6915 epilog code (that reduces the partial results) is not the
6916 tree-code of STMT, but is rather the tree-code of the original
6917 stmt from the pattern that STMT is replacing. I.e, in the example
6918 above we want to use 'widen_sum' in the loop, but 'plus' in the
6919 epilog.
6920 2. The type (mode) we use to check available target support
6921 for the vector operation to be created in the *epilog*, is
6922 determined by the type of the reduction variable (in the example
6923 above we'd check this: optab_handler (plus_optab, vect_int_mode])).
6924 However the type (mode) we use to check available target support
6925 for the vector operation to be created *inside the loop*, is
6926 determined by the type of the other arguments to STMT (in the
6927 example we'd check this: optab_handler (widen_sum_optab,
6928 vect_short_mode)).
6930 This is contrary to "regular" reductions, in which the types of all
6931 the arguments are the same as the type of the reduction variable.
6932 For "regular" reductions we can therefore use the same vector type
6933 (and also the same tree-code) when generating the epilog code and
6934 when generating the code inside the loop. */
6936 vect_reduction_type reduction_type
6937 = STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info);
6938 if (orig_stmt
6939 && (reduction_type == TREE_CODE_REDUCTION
6940 || reduction_type == FOLD_LEFT_REDUCTION))
6942 /* This is a reduction pattern: get the vectype from the type of the
6943 reduction variable, and get the tree-code from orig_stmt. */
6944 orig_code = gimple_assign_rhs_code (orig_stmt);
6945 gcc_assert (vectype_out);
6946 vec_mode = TYPE_MODE (vectype_out);
6948 else
6950 /* Regular reduction: the same vectype and tree-code that are used for
6951 the vector code inside the loop can also be used for the epilog code. */
6952 orig_code = code;
6954 if (code == MINUS_EXPR)
6955 orig_code = PLUS_EXPR;
6957 /* For simple condition reductions, replace with the actual expression
6958 we want to base our reduction around. */
6959 if (reduction_type == CONST_COND_REDUCTION)
6961 orig_code = STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info);
6962 gcc_assert (orig_code == MAX_EXPR || orig_code == MIN_EXPR);
6964 else if (reduction_type == INTEGER_INDUC_COND_REDUCTION)
6965 orig_code = cond_reduc_op_code;
6968 if (nested_cycle)
6970 def_bb = gimple_bb (reduc_def_stmt);
6971 def_stmt_loop = def_bb->loop_father;
6972 def_arg = PHI_ARG_DEF_FROM_EDGE (reduc_def_stmt,
6973 loop_preheader_edge (def_stmt_loop));
6974 if (TREE_CODE (def_arg) == SSA_NAME
6975 && (def_arg_stmt = SSA_NAME_DEF_STMT (def_arg))
6976 && gimple_code (def_arg_stmt) == GIMPLE_PHI
6977 && flow_bb_inside_loop_p (outer_loop, gimple_bb (def_arg_stmt))
6978 && vinfo_for_stmt (def_arg_stmt)
6979 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_arg_stmt))
6980 == vect_double_reduction_def)
6981 double_reduc = true;
6984 reduc_fn = IFN_LAST;
6986 if (reduction_type == TREE_CODE_REDUCTION
6987 || reduction_type == FOLD_LEFT_REDUCTION
6988 || reduction_type == INTEGER_INDUC_COND_REDUCTION
6989 || reduction_type == CONST_COND_REDUCTION)
6991 if (reduction_type == FOLD_LEFT_REDUCTION
6992 ? fold_left_reduction_fn (orig_code, &reduc_fn)
6993 : reduction_fn_for_scalar_code (orig_code, &reduc_fn))
6995 if (reduc_fn != IFN_LAST
6996 && !direct_internal_fn_supported_p (reduc_fn, vectype_out,
6997 OPTIMIZE_FOR_SPEED))
6999 if (dump_enabled_p ())
7000 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7001 "reduc op not supported by target.\n");
7003 reduc_fn = IFN_LAST;
7006 else
7008 if (!nested_cycle || double_reduc)
7010 if (dump_enabled_p ())
7011 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7012 "no reduc code for scalar code.\n");
7014 return false;
7018 else if (reduction_type == COND_REDUCTION)
7020 int scalar_precision
7021 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
7022 cr_index_scalar_type = make_unsigned_type (scalar_precision);
7023 cr_index_vector_type = build_vector_type (cr_index_scalar_type,
7024 nunits_out);
7026 if (direct_internal_fn_supported_p (IFN_REDUC_MAX, cr_index_vector_type,
7027 OPTIMIZE_FOR_SPEED))
7028 reduc_fn = IFN_REDUC_MAX;
7031 if (reduction_type != EXTRACT_LAST_REDUCTION
7032 && reduc_fn == IFN_LAST
7033 && !nunits_out.is_constant ())
7035 if (dump_enabled_p ())
7036 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7037 "missing target support for reduction on"
7038 " variable-length vectors.\n");
7039 return false;
7042 if ((double_reduc || reduction_type != TREE_CODE_REDUCTION)
7043 && ncopies > 1)
7045 if (dump_enabled_p ())
7046 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7047 "multiple types in double reduction or condition "
7048 "reduction.\n");
7049 return false;
7052 /* For SLP reductions, see if there is a neutral value we can use. */
7053 tree neutral_op = NULL_TREE;
7054 if (slp_node)
7055 neutral_op
7056 = neutral_op_for_slp_reduction (slp_node_instance->reduc_phis, code,
7057 GROUP_FIRST_ELEMENT (stmt_info) != NULL);
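/* For illustration: the neutral value is one that leaves the reduction
   unchanged, e.g. 0 for PLUS_EXPR, MINUS_EXPR, BIT_IOR_EXPR and
   BIT_XOR_EXPR, 1 for MULT_EXPR and all-ones for BIT_AND_EXPR; for
   MIN_EXPR/MAX_EXPR only a chained reduction has a usable neutral value
   (its single initial value).  */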
7059 if (double_reduc && reduction_type == FOLD_LEFT_REDUCTION)
7061 /* We can't support in-order reductions of code such as this:
7063 for (int i = 0; i < n1; ++i)
7064 for (int j = 0; j < n2; ++j)
7065 l += a[j];
7067 since GCC effectively transforms the loop when vectorizing:
7069 for (int i = 0; i < n1 / VF; ++i)
7070 for (int j = 0; j < n2; ++j)
7071 for (int k = 0; k < VF; ++k)
7072 l += a[j];
7074 which is a reassociation of the original operation. */
7075 if (dump_enabled_p ())
7076 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7077 "in-order double reduction not supported.\n");
7079 return false;
7082 if (reduction_type == FOLD_LEFT_REDUCTION
7083 && slp_node
7084 && !GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
7086 /* We cannot use in-order reductions in this case because there is
7087 an implicit reassociation of the operations involved. */
7088 if (dump_enabled_p ())
7089 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7090 "in-order unchained SLP reductions not supported.\n");
7091 return false;
7094 /* For double reductions, and for SLP reductions with a neutral value,
7095 we construct a variable-length initial vector by loading a vector
7096 full of the neutral value and then shift-and-inserting the start
7097 values into the low-numbered elements. */
7098 if ((double_reduc || neutral_op)
7099 && !nunits_out.is_constant ()
7100 && !direct_internal_fn_supported_p (IFN_VEC_SHL_INSERT,
7101 vectype_out, OPTIMIZE_FOR_SPEED))
7103 if (dump_enabled_p ())
7104 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7105 "reduction on variable-length vectors requires"
7106 " target support for a vector-shift-and-insert"
7107 " operation.\n");
7108 return false;
7111 /* Check extra constraints for variable-length unchained SLP reductions. */
7112 if (STMT_SLP_TYPE (stmt_info)
7113 && !GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt))
7114 && !nunits_out.is_constant ())
7116 /* We checked above that we could build the initial vector when
7117 there's a neutral element value. Check here for the case in
7118 which each SLP statement has its own initial value and in which
7119 that value needs to be repeated for every instance of the
7120 statement within the initial vector. */
7121 unsigned int group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
7122 scalar_mode elt_mode = SCALAR_TYPE_MODE (TREE_TYPE (vectype_out));
7123 if (!neutral_op
7124 && !can_duplicate_and_interleave_p (group_size, elt_mode))
7126 if (dump_enabled_p ())
7127 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7128 "unsupported form of SLP reduction for"
7129 " variable-length vectors: cannot build"
7130 " initial vector.\n");
7131 return false;
7133 /* The epilogue code relies on the number of elements being a multiple
7134 of the group size. The duplicate-and-interleave approach to setting
7135 up the initial vector does too. */
7136 if (!multiple_p (nunits_out, group_size))
7138 if (dump_enabled_p ())
7139 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7140 "unsupported form of SLP reduction for"
7141 " variable-length vectors: the vector size"
7142 " is not a multiple of the number of results.\n");
7143 return false;
7147 /* In case of widening multiplication by a constant, we update the type
7148 of the constant to be the type of the other operand. We check that the
7149 constant fits the type in the pattern recognition pass. */
7150 if (code == DOT_PROD_EXPR
7151 && !types_compatible_p (TREE_TYPE (ops[0]), TREE_TYPE (ops[1])))
7153 if (TREE_CODE (ops[0]) == INTEGER_CST)
7154 ops[0] = fold_convert (TREE_TYPE (ops[1]), ops[0]);
7155 else if (TREE_CODE (ops[1]) == INTEGER_CST)
7156 ops[1] = fold_convert (TREE_TYPE (ops[0]), ops[1]);
7157 else
7159 if (dump_enabled_p ())
7160 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7161 "invalid types in dot-prod\n");
7163 return false;
7167 if (reduction_type == COND_REDUCTION)
7169 widest_int ni;
7171 if (! max_loop_iterations (loop, &ni))
7173 if (dump_enabled_p ())
7174 dump_printf_loc (MSG_NOTE, vect_location,
7175 "loop count not known, cannot create cond "
7176 "reduction.\n");
7177 return false;
7179 /* Convert backedges to iterations. */
7180 ni += 1;
7182 /* The additional index will be the same type as the condition. Check
7183 that the loop iteration count fits into this type less one (the zero
7184 slot is reserved for the case in which there are no matches). */
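/* Worked example (illustrative): if the scalar type is a 16-bit integer,
   cr_index_scalar_type is a 16-bit unsigned type and max_index is 65535.
   Index 0 is reserved for "no match", so loops that may run 65535 or more
   iterations are rejected below.  */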
7185 tree max_index = TYPE_MAX_VALUE (cr_index_scalar_type);
7186 if (wi::geu_p (ni, wi::to_widest (max_index)))
7188 if (dump_enabled_p ())
7189 dump_printf_loc (MSG_NOTE, vect_location,
7190 "loop size is greater than data size.\n");
7191 return false;
7195 /* In case the vectorization factor (VF) is bigger than the number
7196 of elements that we can fit in a vectype (nunits), we have to generate
7197 more than one vector stmt, i.e., we need to "unroll" the
7198 vector stmt by a factor VF/nunits. For more details see documentation
7199 in vectorizable_operation. */
7201 /* If the reduction is used in an outer loop we need to generate
7202 VF intermediate results, like so (e.g. for ncopies=2):
7203 r0 = phi (init, r0)
7204 r1 = phi (init, r1)
7205 r0 = x0 + r0;
7206 r1 = x1 + r1;
7207 (i.e. we generate VF results in 2 registers).
7208 In this case we have a separate def-use cycle for each copy, and therefore
7209 for each copy we get the vector def for the reduction variable from the
7210 respective phi node created for this copy.
7212 Otherwise (the reduction is unused in the loop nest), we can combine
7213 together intermediate results, like so (e.g. for ncopies=2):
7214 r = phi (init, r)
7215 r = x0 + r;
7216 r = x1 + r;
7217 (i.e. we generate VF/2 results in a single register).
7218 In this case for each copy we get the vector def for the reduction variable
7219 from the vectorized reduction operation generated in the previous iteration.
7221 This only works when we see both the reduction PHI and its only consumer
7222 in vectorizable_reduction and there are no intermediate stmts
7223 participating. */
7224 use_operand_p use_p;
7225 gimple *use_stmt;
7226 if (ncopies > 1
7227 && (STMT_VINFO_RELEVANT (stmt_info) <= vect_used_only_live)
7228 && single_imm_use (gimple_phi_result (reduc_def_stmt), &use_p, &use_stmt)
7229 && (use_stmt == stmt
7230 || STMT_VINFO_RELATED_STMT (vinfo_for_stmt (use_stmt)) == stmt))
7232 single_defuse_cycle = true;
7233 epilog_copies = 1;
7235 else
7236 epilog_copies = ncopies;
7238 /* If the reduction stmt is one of the patterns that have lane
7239 reduction embedded, we cannot handle the case of ! single_defuse_cycle. */
7240 if ((ncopies > 1
7241 && ! single_defuse_cycle)
7242 && (code == DOT_PROD_EXPR
7243 || code == WIDEN_SUM_EXPR
7244 || code == SAD_EXPR))
7246 if (dump_enabled_p ())
7247 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7248 "multi def-use cycle not possible for lane-reducing "
7249 "reduction operation\n");
7250 return false;
7253 if (slp_node)
7254 vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
7255 else
7256 vec_num = 1;
7258 internal_fn cond_fn = get_conditional_internal_fn (code);
7259 vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
7261 if (!vec_stmt) /* transformation not required. */
7263 if (first_p)
7264 vect_model_reduction_cost (stmt_info, reduc_fn, ncopies);
7265 if (loop_vinfo && LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo))
7267 if (reduction_type != FOLD_LEFT_REDUCTION
7268 && (cond_fn == IFN_LAST
7269 || !direct_internal_fn_supported_p (cond_fn, vectype_in,
7270 OPTIMIZE_FOR_SPEED)))
7272 if (dump_enabled_p ())
7273 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7274 "can't use a fully-masked loop because no"
7275 " conditional operation is available.\n");
7276 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
7278 else if (reduc_index == -1)
7280 if (dump_enabled_p ())
7281 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7282 "can't use a fully-masked loop for chained"
7283 " reductions.\n");
7284 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
7286 else
7287 vect_record_loop_mask (loop_vinfo, masks, ncopies * vec_num,
7288 vectype_in);
7290 if (dump_enabled_p ()
7291 && reduction_type == FOLD_LEFT_REDUCTION)
7292 dump_printf_loc (MSG_NOTE, vect_location,
7293 "using an in-order (fold-left) reduction.\n");
7294 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
7295 return true;
7298 /* Transform. */
7300 if (dump_enabled_p ())
7301 dump_printf_loc (MSG_NOTE, vect_location, "transform reduction.\n");
7303 /* FORNOW: Multiple types are not supported for condition. */
7304 if (code == COND_EXPR)
7305 gcc_assert (ncopies == 1);
7307 bool masked_loop_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
7309 if (reduction_type == FOLD_LEFT_REDUCTION)
7310 return vectorize_fold_left_reduction
7311 (stmt, gsi, vec_stmt, slp_node, reduc_def_stmt, code,
7312 reduc_fn, ops, vectype_in, reduc_index, masks);
7314 if (reduction_type == EXTRACT_LAST_REDUCTION)
7316 gcc_assert (!slp_node);
7317 return vectorizable_condition (stmt, gsi, vec_stmt,
7318 NULL, reduc_index, NULL);
7321 /* Create the destination vector */
7322 vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
7324 prev_stmt_info = NULL;
7325 prev_phi_info = NULL;
7326 if (!slp_node)
7328 vec_oprnds0.create (1);
7329 vec_oprnds1.create (1);
7330 if (op_type == ternary_op)
7331 vec_oprnds2.create (1);
7334 phis.create (vec_num);
7335 vect_defs.create (vec_num);
7336 if (!slp_node)
7337 vect_defs.quick_push (NULL_TREE);
7339 if (slp_node)
7340 phis.splice (SLP_TREE_VEC_STMTS (slp_node_instance->reduc_phis));
7341 else
7342 phis.quick_push (STMT_VINFO_VEC_STMT (vinfo_for_stmt (reduc_def_stmt)));
7344 for (j = 0; j < ncopies; j++)
7346 if (code == COND_EXPR)
7348 gcc_assert (!slp_node);
7349 vectorizable_condition (stmt, gsi, vec_stmt,
7350 PHI_RESULT (phis[0]),
7351 reduc_index, NULL);
7352 /* Multiple types are not supported for condition. */
7353 break;
7356 /* Handle uses. */
7357 if (j == 0)
7359 if (slp_node)
7361 /* Get vec defs for all the operands except the reduction index,
7362 ensuring the ordering of the ops in the vector is kept. */
7363 auto_vec<tree, 3> slp_ops;
7364 auto_vec<vec<tree>, 3> vec_defs;
7366 slp_ops.quick_push (ops[0]);
7367 slp_ops.quick_push (ops[1]);
7368 if (op_type == ternary_op)
7369 slp_ops.quick_push (ops[2]);
7371 vect_get_slp_defs (slp_ops, slp_node, &vec_defs);
7373 vec_oprnds0.safe_splice (vec_defs[0]);
7374 vec_defs[0].release ();
7375 vec_oprnds1.safe_splice (vec_defs[1]);
7376 vec_defs[1].release ();
7377 if (op_type == ternary_op)
7379 vec_oprnds2.safe_splice (vec_defs[2]);
7380 vec_defs[2].release ();
7383 else
7385 vec_oprnds0.quick_push
7386 (vect_get_vec_def_for_operand (ops[0], stmt));
7387 vec_oprnds1.quick_push
7388 (vect_get_vec_def_for_operand (ops[1], stmt));
7389 if (op_type == ternary_op)
7390 vec_oprnds2.quick_push
7391 (vect_get_vec_def_for_operand (ops[2], stmt));
7394 else
7396 if (!slp_node)
7398 gcc_assert (reduc_index != -1 || ! single_defuse_cycle);
7400 if (single_defuse_cycle && reduc_index == 0)
7401 vec_oprnds0[0] = gimple_get_lhs (new_stmt);
7402 else
7403 vec_oprnds0[0]
7404 = vect_get_vec_def_for_stmt_copy (dts[0], vec_oprnds0[0]);
7405 if (single_defuse_cycle && reduc_index == 1)
7406 vec_oprnds1[0] = gimple_get_lhs (new_stmt);
7407 else
7408 vec_oprnds1[0]
7409 = vect_get_vec_def_for_stmt_copy (dts[1], vec_oprnds1[0]);
7410 if (op_type == ternary_op)
7412 if (single_defuse_cycle && reduc_index == 2)
7413 vec_oprnds2[0] = gimple_get_lhs (new_stmt);
7414 else
7415 vec_oprnds2[0]
7416 = vect_get_vec_def_for_stmt_copy (dts[2], vec_oprnds2[0]);
7421 FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
7423 tree vop[3] = { def0, vec_oprnds1[i], NULL_TREE };
7424 if (masked_loop_p)
7426 /* Make sure that the reduction accumulator is vop[0]. */
7427 if (reduc_index == 1)
7429 gcc_assert (commutative_tree_code (code));
7430 std::swap (vop[0], vop[1]);
7432 tree mask = vect_get_loop_mask (gsi, masks, vec_num * ncopies,
7433 vectype_in, i * ncopies + j);
7434 gcall *call = gimple_build_call_internal (cond_fn, 3, mask,
7435 vop[0], vop[1]);
7436 new_temp = make_ssa_name (vec_dest, call);
7437 gimple_call_set_lhs (call, new_temp);
7438 gimple_call_set_nothrow (call, true);
7439 new_stmt = call;
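/* For example (assuming code == PLUS_EXPR), cond_fn is IFN_COND_ADD and the
   call above computes, per lane, MASK ? ACC + X : ACC, so lanes that are
   inactive in this iteration leave the accumulator unchanged.  */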
7441 else
7443 if (op_type == ternary_op)
7444 vop[2] = vec_oprnds2[i];
7446 new_temp = make_ssa_name (vec_dest, new_stmt);
7447 new_stmt = gimple_build_assign (new_temp, code,
7448 vop[0], vop[1], vop[2]);
7450 vect_finish_stmt_generation (stmt, new_stmt, gsi);
7452 if (slp_node)
7454 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
7455 vect_defs.quick_push (new_temp);
7457 else
7458 vect_defs[0] = new_temp;
7461 if (slp_node)
7462 continue;
7464 if (j == 0)
7465 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
7466 else
7467 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
7469 prev_stmt_info = vinfo_for_stmt (new_stmt);
7472 /* Finalize the reduction-phi (set its arguments) and create the
7473 epilog reduction code. */
7474 if ((!single_defuse_cycle || code == COND_EXPR) && !slp_node)
7475 vect_defs[0] = gimple_get_lhs (*vec_stmt);
7477 vect_create_epilog_for_reduction (vect_defs, stmt, reduc_def_stmt,
7478 epilog_copies, reduc_fn, phis,
7479 double_reduc, slp_node, slp_node_instance,
7480 cond_reduc_val, cond_reduc_op_code,
7481 neutral_op);
7483 return true;
7486 /* Function vect_min_worthwhile_factor.
7488 For a loop where we could vectorize the operation indicated by CODE,
7489 return the minimum vectorization factor that makes it worthwhile
7490 to use generic vectors. */
7491 static unsigned int
7492 vect_min_worthwhile_factor (enum tree_code code)
7494 switch (code)
7496 case PLUS_EXPR:
7497 case MINUS_EXPR:
7498 case NEGATE_EXPR:
7499 return 4;
7501 case BIT_AND_EXPR:
7502 case BIT_IOR_EXPR:
7503 case BIT_XOR_EXPR:
7504 case BIT_NOT_EXPR:
7505 return 2;
7507 default:
7508 return INT_MAX;
7512 /* Return true if VINFO indicates we are doing loop vectorization and if
7513 it is worth decomposing CODE operations into scalar operations for
7514 that loop's vectorization factor. */
7516 bool
7517 vect_worthwhile_without_simd_p (vec_info *vinfo, tree_code code)
7519 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
7520 unsigned HOST_WIDE_INT value;
7521 return (loop_vinfo
7522 && LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&value)
7523 && value >= vect_min_worthwhile_factor (code));
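/* Worked example (illustrative): with a constant vectorization factor of 2,
   the function returns true for BIT_AND_EXPR (minimum worthwhile factor 2)
   but false for PLUS_EXPR (minimum worthwhile factor 4); codes not listed
   in vect_min_worthwhile_factor fall back to INT_MAX and are never
   considered worthwhile.  */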
7526 /* Function vectorizable_induction
7528 Check if PHI performs an induction computation that can be vectorized.
7529 If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
7530 phi to replace it, put it in VEC_STMT, and add it to the same basic block.
7531 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
7533 bool
7534 vectorizable_induction (gimple *phi,
7535 gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
7536 gimple **vec_stmt, slp_tree slp_node)
7538 stmt_vec_info stmt_info = vinfo_for_stmt (phi);
7539 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
7540 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7541 unsigned ncopies;
7542 bool nested_in_vect_loop = false;
7543 struct loop *iv_loop;
7544 tree vec_def;
7545 edge pe = loop_preheader_edge (loop);
7546 basic_block new_bb;
7547 tree new_vec, vec_init, vec_step, t;
7548 tree new_name;
7549 gimple *new_stmt;
7550 gphi *induction_phi;
7551 tree induc_def, vec_dest;
7552 tree init_expr, step_expr;
7553 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
7554 unsigned i;
7555 tree expr;
7556 gimple_seq stmts;
7557 imm_use_iterator imm_iter;
7558 use_operand_p use_p;
7559 gimple *exit_phi;
7560 edge latch_e;
7561 tree loop_arg;
7562 gimple_stmt_iterator si;
7563 basic_block bb = gimple_bb (phi);
7565 if (gimple_code (phi) != GIMPLE_PHI)
7566 return false;
7568 if (!STMT_VINFO_RELEVANT_P (stmt_info))
7569 return false;
7571 /* Make sure it was recognized as induction computation. */
7572 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
7573 return false;
7575 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
7576 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
7578 if (slp_node)
7579 ncopies = 1;
7580 else
7581 ncopies = vect_get_num_copies (loop_vinfo, vectype);
7582 gcc_assert (ncopies >= 1);
7584 /* FORNOW. These restrictions should be relaxed. */
7585 if (nested_in_vect_loop_p (loop, phi))
7587 imm_use_iterator imm_iter;
7588 use_operand_p use_p;
7589 gimple *exit_phi;
7590 edge latch_e;
7591 tree loop_arg;
7593 if (ncopies > 1)
7595 if (dump_enabled_p ())
7596 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7597 "multiple types in nested loop.\n");
7598 return false;
7601 /* FORNOW: outer loop induction with SLP not supported. */
7602 if (STMT_SLP_TYPE (stmt_info))
7603 return false;
7605 exit_phi = NULL;
7606 latch_e = loop_latch_edge (loop->inner);
7607 loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
7608 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
7610 gimple *use_stmt = USE_STMT (use_p);
7611 if (is_gimple_debug (use_stmt))
7612 continue;
7614 if (!flow_bb_inside_loop_p (loop->inner, gimple_bb (use_stmt)))
7616 exit_phi = use_stmt;
7617 break;
7620 if (exit_phi)
7622 stmt_vec_info exit_phi_vinfo = vinfo_for_stmt (exit_phi);
7623 if (!(STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
7624 && !STMT_VINFO_LIVE_P (exit_phi_vinfo)))
7626 if (dump_enabled_p ())
7627 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7628 "inner-loop induction only used outside "
7629 "of the outer vectorized loop.\n");
7630 return false;
7634 nested_in_vect_loop = true;
7635 iv_loop = loop->inner;
7637 else
7638 iv_loop = loop;
7639 gcc_assert (iv_loop == (gimple_bb (phi))->loop_father);
7641 if (slp_node && !nunits.is_constant ())
7643 /* The current SLP code creates the initial value element-by-element. */
7644 if (dump_enabled_p ())
7645 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7646 "SLP induction not supported for variable-length"
7647 " vectors.\n");
7648 return false;
7651 if (!vec_stmt) /* transformation not required. */
7653 STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
7654 if (dump_enabled_p ())
7655 dump_printf_loc (MSG_NOTE, vect_location,
7656 "=== vectorizable_induction ===\n");
7657 vect_model_induction_cost (stmt_info, ncopies);
7658 return true;
7661 /* Transform. */
7663 /* Compute a vector variable, initialized with the first VF values of
7664 the induction variable. E.g., for an iv with IV_PHI='X' and
7665 evolution S, for a vector of 4 units, we want to compute:
7666 [X, X + S, X + 2*S, X + 3*S]. */
7668 if (dump_enabled_p ())
7669 dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n");
7671 latch_e = loop_latch_edge (iv_loop);
7672 loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
7674 step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info);
7675 gcc_assert (step_expr != NULL_TREE);
7677 pe = loop_preheader_edge (iv_loop);
7678 init_expr = PHI_ARG_DEF_FROM_EDGE (phi,
7679 loop_preheader_edge (iv_loop));
7681 stmts = NULL;
7682 if (!nested_in_vect_loop)
7684 /* Convert the initial value to the desired type. */
7685 tree new_type = TREE_TYPE (vectype);
7686 init_expr = gimple_convert (&stmts, new_type, init_expr);
7688 /* If we are using the loop mask to "peel" for alignment then we need
7689 to adjust the start value here. */
7690 tree skip_niters = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
7691 if (skip_niters != NULL_TREE)
7693 if (FLOAT_TYPE_P (vectype))
7694 skip_niters = gimple_build (&stmts, FLOAT_EXPR, new_type,
7695 skip_niters);
7696 else
7697 skip_niters = gimple_convert (&stmts, new_type, skip_niters);
7698 tree skip_step = gimple_build (&stmts, MULT_EXPR, new_type,
7699 skip_niters, step_expr);
7700 init_expr = gimple_build (&stmts, MINUS_EXPR, new_type,
7701 init_expr, skip_step);
7705 /* Convert the step to the desired type. */
7706 step_expr = gimple_convert (&stmts, TREE_TYPE (vectype), step_expr);
7708 if (stmts)
7710 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
7711 gcc_assert (!new_bb);
7714 /* Find the first insertion point in the BB. */
7715 si = gsi_after_labels (bb);
7717 /* For SLP induction we have to generate several IVs as for example
7718 with group size 3 we need [i, i, i, i + S] [i + S, i + S, i + 2*S, i + 2*S]
7719 [i + 2*S, i + 3*S, i + 3*S, i + 3*S]. The step is the same uniform
7720 [VF*S, VF*S, VF*S, VF*S] for all. */
7721 if (slp_node)
7723 /* Enforced above. */
7724 unsigned int const_nunits = nunits.to_constant ();
7726 /* Generate [VF*S, VF*S, ... ]. */
7727 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7729 expr = build_int_cst (integer_type_node, vf);
7730 expr = fold_convert (TREE_TYPE (step_expr), expr);
7732 else
7733 expr = build_int_cst (TREE_TYPE (step_expr), vf);
7734 new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
7735 expr, step_expr);
7736 if (! CONSTANT_CLASS_P (new_name))
7737 new_name = vect_init_vector (phi, new_name,
7738 TREE_TYPE (step_expr), NULL);
7739 new_vec = build_vector_from_val (vectype, new_name);
7740 vec_step = vect_init_vector (phi, new_vec, vectype, NULL);
7742 /* Now generate the IVs. */
7743 unsigned group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
7744 unsigned nvects = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
7745 unsigned elts = const_nunits * nvects;
7746 unsigned nivs = least_common_multiple (group_size,
7747 const_nunits) / const_nunits;
7748 gcc_assert (elts % group_size == 0);
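/* Worked example (illustrative): group_size == 3 and const_nunits == 4 give
   nivs = least_common_multiple (3, 4) / 4 == 3, matching the three initial
   vectors shown in the comment above; the assertion requires elts
   (== const_nunits * nvects) to be a multiple of the group size.  */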
7749 tree elt = init_expr;
7750 unsigned ivn;
7751 for (ivn = 0; ivn < nivs; ++ivn)
7753 tree_vector_builder elts (vectype, const_nunits, 1);
7754 stmts = NULL;
7755 for (unsigned eltn = 0; eltn < const_nunits; ++eltn)
7757 if (ivn*const_nunits + eltn >= group_size
7758 && (ivn * const_nunits + eltn) % group_size == 0)
7759 elt = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (elt),
7760 elt, step_expr);
7761 elts.quick_push (elt);
7763 vec_init = gimple_build_vector (&stmts, &elts);
7764 if (stmts)
7766 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
7767 gcc_assert (!new_bb);
7770 /* Create the induction-phi that defines the induction-operand. */
7771 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
7772 induction_phi = create_phi_node (vec_dest, iv_loop->header);
7773 set_vinfo_for_stmt (induction_phi,
7774 new_stmt_vec_info (induction_phi, loop_vinfo));
7775 induc_def = PHI_RESULT (induction_phi);
7777 /* Create the iv update inside the loop */
7778 vec_def = make_ssa_name (vec_dest);
7779 new_stmt = gimple_build_assign (vec_def, PLUS_EXPR, induc_def, vec_step);
7780 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
7781 set_vinfo_for_stmt (new_stmt, new_stmt_vec_info (new_stmt, loop_vinfo));
7783 /* Set the arguments of the phi node: */
7784 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
7785 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
7786 UNKNOWN_LOCATION);
7788 SLP_TREE_VEC_STMTS (slp_node).quick_push (induction_phi);
7791 /* Re-use IVs when we can. */
7792 if (ivn < nvects)
7794 unsigned vfp
7795 = least_common_multiple (group_size, const_nunits) / group_size;
7796 /* Generate [VF'*S, VF'*S, ... ]. */
7797 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7799 expr = build_int_cst (integer_type_node, vfp);
7800 expr = fold_convert (TREE_TYPE (step_expr), expr);
7802 else
7803 expr = build_int_cst (TREE_TYPE (step_expr), vfp);
7804 new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
7805 expr, step_expr);
7806 if (! CONSTANT_CLASS_P (new_name))
7807 new_name = vect_init_vector (phi, new_name,
7808 TREE_TYPE (step_expr), NULL);
7809 new_vec = build_vector_from_val (vectype, new_name);
7810 vec_step = vect_init_vector (phi, new_vec, vectype, NULL);
7811 for (; ivn < nvects; ++ivn)
7813 gimple *iv = SLP_TREE_VEC_STMTS (slp_node)[ivn - nivs];
7814 tree def;
7815 if (gimple_code (iv) == GIMPLE_PHI)
7816 def = gimple_phi_result (iv);
7817 else
7818 def = gimple_assign_lhs (iv);
7819 new_stmt = gimple_build_assign (make_ssa_name (vectype),
7820 PLUS_EXPR,
7821 def, vec_step);
7822 if (gimple_code (iv) == GIMPLE_PHI)
7823 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
7824 else
7826 gimple_stmt_iterator tgsi = gsi_for_stmt (iv);
7827 gsi_insert_after (&tgsi, new_stmt, GSI_CONTINUE_LINKING);
7829 set_vinfo_for_stmt (new_stmt,
7830 new_stmt_vec_info (new_stmt, loop_vinfo));
7831 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
7835 return true;
7838 /* Create the vector that holds the initial_value of the induction. */
7839 if (nested_in_vect_loop)
7841 /* iv_loop is nested in the loop to be vectorized. init_expr had already
7842 been created during vectorization of previous stmts. We obtain it
7843 from the STMT_VINFO_VEC_STMT of the defining stmt. */
7844 vec_init = vect_get_vec_def_for_operand (init_expr, phi);
7845 /* If the initial value is not of proper type, convert it. */
7846 if (!useless_type_conversion_p (vectype, TREE_TYPE (vec_init)))
7848 new_stmt
7849 = gimple_build_assign (vect_get_new_ssa_name (vectype,
7850 vect_simple_var,
7851 "vec_iv_"),
7852 VIEW_CONVERT_EXPR,
7853 build1 (VIEW_CONVERT_EXPR, vectype,
7854 vec_init));
7855 vec_init = gimple_assign_lhs (new_stmt);
7856 new_bb = gsi_insert_on_edge_immediate (loop_preheader_edge (iv_loop),
7857 new_stmt);
7858 gcc_assert (!new_bb);
7859 set_vinfo_for_stmt (new_stmt,
7860 new_stmt_vec_info (new_stmt, loop_vinfo));
7863 else
7865 /* iv_loop is the loop to be vectorized. Create:
7866 vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr) */
7867 stmts = NULL;
7868 new_name = gimple_convert (&stmts, TREE_TYPE (vectype), init_expr);
7870 unsigned HOST_WIDE_INT const_nunits;
7871 if (nunits.is_constant (&const_nunits))
7873 tree_vector_builder elts (vectype, const_nunits, 1);
7874 elts.quick_push (new_name);
7875 for (i = 1; i < const_nunits; i++)
7877 /* Create: new_name_i = new_name + step_expr */
7878 new_name = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (new_name),
7879 new_name, step_expr);
7880 elts.quick_push (new_name);
7882 /* Create a vector from [new_name_0, new_name_1, ...,
7883 new_name_nunits-1] */
7884 vec_init = gimple_build_vector (&stmts, &elts);
7886 else if (INTEGRAL_TYPE_P (TREE_TYPE (step_expr)))
7887 /* Build the initial value directly from a VEC_SERIES_EXPR. */
7888 vec_init = gimple_build (&stmts, VEC_SERIES_EXPR, vectype,
7889 new_name, step_expr);
7890 else
7892 /* Build:
7893 [base, base, base, ...]
7894 + (vectype) [0, 1, 2, ...] * [step, step, step, ...]. */
7895 gcc_assert (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)));
7896 gcc_assert (flag_associative_math);
7897 tree index = build_index_vector (vectype, 0, 1);
7898 tree base_vec = gimple_build_vector_from_val (&stmts, vectype,
7899 new_name);
7900 tree step_vec = gimple_build_vector_from_val (&stmts, vectype,
7901 step_expr);
7902 vec_init = gimple_build (&stmts, FLOAT_EXPR, vectype, index);
7903 vec_init = gimple_build (&stmts, MULT_EXPR, vectype,
7904 vec_init, step_vec);
7905 vec_init = gimple_build (&stmts, PLUS_EXPR, vectype,
7906 vec_init, base_vec);
7909 if (stmts)
7911 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
7912 gcc_assert (!new_bb);
7917 /* Create the vector that holds the step of the induction. */
7918 if (nested_in_vect_loop)
7919 /* iv_loop is nested in the loop to be vectorized. Generate:
7920 vec_step = [S, S, S, S] */
7921 new_name = step_expr;
7922 else
7924 /* iv_loop is the loop to be vectorized. Generate:
7925 vec_step = [VF*S, VF*S, VF*S, VF*S] */
7926 gimple_seq seq = NULL;
7927 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7929 expr = build_int_cst (integer_type_node, vf);
7930 expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
7932 else
7933 expr = build_int_cst (TREE_TYPE (step_expr), vf);
7934 new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
7935 expr, step_expr);
7936 if (seq)
7938 new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
7939 gcc_assert (!new_bb);
7943 t = unshare_expr (new_name);
7944 gcc_assert (CONSTANT_CLASS_P (new_name)
7945 || TREE_CODE (new_name) == SSA_NAME);
7946 new_vec = build_vector_from_val (vectype, t);
7947 vec_step = vect_init_vector (phi, new_vec, vectype, NULL);
7950 /* Create the following def-use cycle:
7951 loop prolog:
7952 vec_init = ...
7953 vec_step = ...
7954 loop:
7955 vec_iv = PHI <vec_init, vec_loop>
7957 STMT
7959 vec_loop = vec_iv + vec_step; */
7961 /* Create the induction-phi that defines the induction-operand. */
7962 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
7963 induction_phi = create_phi_node (vec_dest, iv_loop->header);
7964 set_vinfo_for_stmt (induction_phi,
7965 new_stmt_vec_info (induction_phi, loop_vinfo));
7966 induc_def = PHI_RESULT (induction_phi);
7968 /* Create the iv update inside the loop */
7969 vec_def = make_ssa_name (vec_dest);
7970 new_stmt = gimple_build_assign (vec_def, PLUS_EXPR, induc_def, vec_step);
7971 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
7972 set_vinfo_for_stmt (new_stmt, new_stmt_vec_info (new_stmt, loop_vinfo));
7974 /* Set the arguments of the phi node: */
7975 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
7976 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
7977 UNKNOWN_LOCATION);
7979 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = induction_phi;
7981 /* In case the vectorization factor (VF) is bigger than the number
7982 of elements that we can fit in a vectype (nunits), we have to generate
7983 more than one vector stmt, i.e., we need to "unroll" the
7984 vector stmt by a factor VF/nunits. For more details see documentation
7985 in vectorizable_operation. */
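/* Illustration (example values): with nunits == 4 and ncopies == 2 the
   induction PHI starts at [X, X+S, X+2*S, X+3*S]; the step vector built
   below is [4*S, 4*S, 4*S, 4*S], and the loop that follows adds it once to
   produce the second copy [X+4*S, X+5*S, X+6*S, X+7*S].  */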
7987 if (ncopies > 1)
7989 gimple_seq seq = NULL;
7990 stmt_vec_info prev_stmt_vinfo;
7991 /* FORNOW. This restriction should be relaxed. */
7992 gcc_assert (!nested_in_vect_loop);
7994 /* Create the vector that holds the step of the induction. */
7995 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7997 expr = build_int_cst (integer_type_node, nunits);
7998 expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
8000 else
8001 expr = build_int_cst (TREE_TYPE (step_expr), nunits);
8002 new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
8003 expr, step_expr);
8004 if (seq)
8006 new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
8007 gcc_assert (!new_bb);
8010 t = unshare_expr (new_name);
8011 gcc_assert (CONSTANT_CLASS_P (new_name)
8012 || TREE_CODE (new_name) == SSA_NAME);
8013 new_vec = build_vector_from_val (vectype, t);
8014 vec_step = vect_init_vector (phi, new_vec, vectype, NULL);
8016 vec_def = induc_def;
8017 prev_stmt_vinfo = vinfo_for_stmt (induction_phi);
8018 for (i = 1; i < ncopies; i++)
8020 /* vec_i = vec_prev + vec_step */
8021 new_stmt = gimple_build_assign (vec_dest, PLUS_EXPR,
8022 vec_def, vec_step);
8023 vec_def = make_ssa_name (vec_dest, new_stmt);
8024 gimple_assign_set_lhs (new_stmt, vec_def);
8026 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
8027 set_vinfo_for_stmt (new_stmt,
8028 new_stmt_vec_info (new_stmt, loop_vinfo));
8029 STMT_VINFO_RELATED_STMT (prev_stmt_vinfo) = new_stmt;
8030 prev_stmt_vinfo = vinfo_for_stmt (new_stmt);
8034 if (nested_in_vect_loop)
8036 /* Find the loop-closed exit-phi of the induction, and record
8037 the final vector of induction results: */
8038 exit_phi = NULL;
8039 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
8041 gimple *use_stmt = USE_STMT (use_p);
8042 if (is_gimple_debug (use_stmt))
8043 continue;
8045 if (!flow_bb_inside_loop_p (iv_loop, gimple_bb (use_stmt)))
8047 exit_phi = use_stmt;
8048 break;
8051 if (exit_phi)
8053 stmt_vec_info stmt_vinfo = vinfo_for_stmt (exit_phi);
8054 /* FORNOW. We do not currently support the case in which an inner-loop
8055 induction is used only outside the outer loop (i.e. not in the outer loop). */
8056 gcc_assert (STMT_VINFO_RELEVANT_P (stmt_vinfo)
8057 && !STMT_VINFO_LIVE_P (stmt_vinfo));
8059 STMT_VINFO_VEC_STMT (stmt_vinfo) = new_stmt;
8060 if (dump_enabled_p ())
8062 dump_printf_loc (MSG_NOTE, vect_location,
8063 "vector of inductions after inner-loop:");
8064 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, new_stmt, 0);
8070 if (dump_enabled_p ())
8072 dump_printf_loc (MSG_NOTE, vect_location,
8073 "transform induction: created def-use cycle: ");
8074 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, induction_phi, 0);
8075 dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
8076 SSA_NAME_DEF_STMT (vec_def), 0);
8079 return true;
8082 /* Function vectorizable_live_operation.
8084 STMT computes a value that is used outside the loop. Check if
8085 it can be supported. */
8087 bool
8088 vectorizable_live_operation (gimple *stmt,
8089 gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
8090 slp_tree slp_node, int slp_index,
8091 gimple **vec_stmt)
8093 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
8094 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
8095 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8096 imm_use_iterator imm_iter;
8097 tree lhs, lhs_type, bitsize, vec_bitsize;
8098 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
8099 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
8100 int ncopies;
8101 gimple *use_stmt;
8102 auto_vec<tree> vec_oprnds;
8103 int vec_entry = 0;
8104 poly_uint64 vec_index = 0;
8106 gcc_assert (STMT_VINFO_LIVE_P (stmt_info));
8108 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def)
8109 return false;
8111 /* FORNOW. CHECKME. */
8112 if (nested_in_vect_loop_p (loop, stmt))
8113 return false;
8115 /* If STMT is not relevant and it is a simple assignment and its inputs are
8116 invariant then it can remain in place, unvectorized. The original last
8117 scalar value that it computes will be used. */
8118 if (!STMT_VINFO_RELEVANT_P (stmt_info))
8120 gcc_assert (is_simple_and_all_uses_invariant (stmt, loop_vinfo));
8121 if (dump_enabled_p ())
8122 dump_printf_loc (MSG_NOTE, vect_location,
8123 "statement is simple and uses invariant. Leaving in "
8124 "place.\n");
8125 return true;
8128 if (slp_node)
8129 ncopies = 1;
8130 else
8131 ncopies = vect_get_num_copies (loop_vinfo, vectype);
8133 if (slp_node)
8135 gcc_assert (slp_index >= 0);
8137 int num_scalar = SLP_TREE_SCALAR_STMTS (slp_node).length ();
8138 int num_vec = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
8140 /* Get the last occurrence of the scalar index from the concatenation of
8141 all the slp vectors. Calculate which slp vector it is and the index
8142 within. */
8143 poly_uint64 pos = (num_vec * nunits) - num_scalar + slp_index;
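/* Worked example (illustrative): with an SLP group of 3 scalars, 4-element
   vectors and 3 vector stmts, pos = 3*4 - 3 + slp_index; for slp_index == 1
   this is 10, giving vec_entry == 2 and vec_index == 2, i.e. lane 2 of the
   last vector holds the final value of that scalar.  */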
8145 /* Calculate which vector contains the result, and which lane of
8146 that vector we need. */
8147 if (!can_div_trunc_p (pos, nunits, &vec_entry, &vec_index))
8149 if (dump_enabled_p ())
8150 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8151 "Cannot determine which vector holds the"
8152 " final result.\n");
8153 return false;
8157 if (!vec_stmt)
8159 /* No transformation required. */
8160 if (LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo))
8162 if (!direct_internal_fn_supported_p (IFN_EXTRACT_LAST, vectype,
8163 OPTIMIZE_FOR_SPEED))
8165 if (dump_enabled_p ())
8166 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8167 "can't use a fully-masked loop because "
8168 "the target doesn't support extract last "
8169 "reduction.\n");
8170 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
8172 else if (slp_node)
8174 if (dump_enabled_p ())
8175 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8176 "can't use a fully-masked loop because an "
8177 "SLP statement is live after the loop.\n");
8178 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
8180 else if (ncopies > 1)
8182 if (dump_enabled_p ())
8183 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8184 "can't use a fully-masked loop because"
8185 " ncopies is greater than 1.\n");
8186 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
8188 else
8190 gcc_assert (ncopies == 1 && !slp_node);
8191 vect_record_loop_mask (loop_vinfo,
8192 &LOOP_VINFO_MASKS (loop_vinfo),
8193 1, vectype);
8196 return true;
8199 /* If stmt has a related stmt, then use that for getting the lhs. */
8200 if (is_pattern_stmt_p (stmt_info))
8201 stmt = STMT_VINFO_RELATED_STMT (stmt_info);
8203 lhs = (is_a <gphi *> (stmt)) ? gimple_phi_result (stmt)
8204 : gimple_get_lhs (stmt);
8205 lhs_type = TREE_TYPE (lhs);
8207 bitsize = (VECTOR_BOOLEAN_TYPE_P (vectype)
8208 ? bitsize_int (TYPE_PRECISION (TREE_TYPE (vectype)))
8209 : TYPE_SIZE (TREE_TYPE (vectype)));
8210 vec_bitsize = TYPE_SIZE (vectype);
8212 /* Get the vectorized lhs of STMT and the lane to use (counted in bits). */
8213 tree vec_lhs, bitstart;
8214 if (slp_node)
8216 gcc_assert (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo));
8218 /* Get the correct slp vectorized stmt. */
8219 gimple *vec_stmt = SLP_TREE_VEC_STMTS (slp_node)[vec_entry];
8220 if (gphi *phi = dyn_cast <gphi *> (vec_stmt))
8221 vec_lhs = gimple_phi_result (phi);
8222 else
8223 vec_lhs = gimple_get_lhs (vec_stmt);
8225 /* Get entry to use. */
8226 bitstart = bitsize_int (vec_index);
8227 bitstart = int_const_binop (MULT_EXPR, bitsize, bitstart);
8229 else
8231 enum vect_def_type dt = STMT_VINFO_DEF_TYPE (stmt_info);
8232 vec_lhs = vect_get_vec_def_for_operand_1 (stmt, dt);
8233 gcc_checking_assert (ncopies == 1
8234 || !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo));
8236 /* For multiple copies, get the last copy. */
8237 for (int i = 1; i < ncopies; ++i)
8238 vec_lhs = vect_get_vec_def_for_stmt_copy (vect_unknown_def_type,
8239 vec_lhs);
8241 /* Get the last lane in the vector. */
8242 bitstart = int_const_binop (MINUS_EXPR, vec_bitsize, bitsize);
8245 gimple_seq stmts = NULL;
8246 tree new_tree;
8247 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
8249 /* Emit:
8251 SCALAR_RES = EXTRACT_LAST <VEC_LHS, MASK>
8253 where VEC_LHS is the vectorized live-out result and MASK is
8254 the loop mask for the final iteration. */
8255 gcc_assert (ncopies == 1 && !slp_node);
8256 tree scalar_type = TREE_TYPE (STMT_VINFO_VECTYPE (stmt_info));
8257 tree scalar_res = make_ssa_name (scalar_type);
8258 tree mask = vect_get_loop_mask (gsi, &LOOP_VINFO_MASKS (loop_vinfo),
8259 1, vectype, 0);
8260 gcall *new_stmt = gimple_build_call_internal (IFN_EXTRACT_LAST,
8261 2, mask, vec_lhs);
8262 gimple_call_set_lhs (new_stmt, scalar_res);
8263 gimple_seq_add_stmt (&stmts, new_stmt);
8265 /* Convert the extracted vector element to the required scalar type. */
8266 new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
8268 else
8270 tree bftype = TREE_TYPE (vectype);
8271 if (VECTOR_BOOLEAN_TYPE_P (vectype))
8272 bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1);
8273 new_tree = build3 (BIT_FIELD_REF, bftype, vec_lhs, bitsize, bitstart);
8274 new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree),
8275 &stmts, true, NULL_TREE);
8278 if (stmts)
8279 gsi_insert_seq_on_edge_immediate (single_exit (loop), stmts);
8281 /* Replace uses of lhs with the newly computed result. If the use stmt is
8282 a single-argument PHI, just replace all uses of the PHI result; this is
8283 necessary because the LCSSA PHI defining lhs may come before the new stmt. */
8284 use_operand_p use_p;
8285 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
8286 if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt))
8287 && !is_gimple_debug (use_stmt))
8289 if (gimple_code (use_stmt) == GIMPLE_PHI
8290 && gimple_phi_num_args (use_stmt) == 1)
8292 replace_uses_by (gimple_phi_result (use_stmt), new_tree);
8294 else
8296 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
8297 SET_USE (use_p, new_tree);
8299 update_stmt (use_stmt);
8302 return true;
8305 /* Kill any debug uses outside LOOP of SSA names defined in STMT. */
8307 static void
8308 vect_loop_kill_debug_uses (struct loop *loop, gimple *stmt)
8310 ssa_op_iter op_iter;
8311 imm_use_iterator imm_iter;
8312 def_operand_p def_p;
8313 gimple *ustmt;
8315 FOR_EACH_PHI_OR_STMT_DEF (def_p, stmt, op_iter, SSA_OP_DEF)
8317 FOR_EACH_IMM_USE_STMT (ustmt, imm_iter, DEF_FROM_PTR (def_p))
8319 basic_block bb;
8321 if (!is_gimple_debug (ustmt))
8322 continue;
8324 bb = gimple_bb (ustmt);
8326 if (!flow_bb_inside_loop_p (loop, bb))
8328 if (gimple_debug_bind_p (ustmt))
8330 if (dump_enabled_p ())
8331 dump_printf_loc (MSG_NOTE, vect_location,
8332 "killing debug use\n");
8334 gimple_debug_bind_reset_value (ustmt);
8335 update_stmt (ustmt);
8337 else
8338 gcc_unreachable ();
8344 /* Given loop represented by LOOP_VINFO, return true if computation of
8345 LOOP_VINFO_NITERS (= LOOP_VINFO_NITERSM1 + 1) doesn't overflow, false
8346 otherwise. */
8348 static bool
8349 loop_niters_no_overflow (loop_vec_info loop_vinfo)
8351 /* Constant case. */
8352 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
8354 tree cst_niters = LOOP_VINFO_NITERS (loop_vinfo);
8355 tree cst_nitersm1 = LOOP_VINFO_NITERSM1 (loop_vinfo);
8357 gcc_assert (TREE_CODE (cst_niters) == INTEGER_CST);
8358 gcc_assert (TREE_CODE (cst_nitersm1) == INTEGER_CST);
8359 if (wi::to_widest (cst_nitersm1) < wi::to_widest (cst_niters))
8360 return true;
8363 widest_int max;
8364 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8365 /* Check the upper bound of loop niters. */
8366 if (get_max_loop_iterations (loop, &max))
8368 tree type = TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo));
8369 signop sgn = TYPE_SIGN (type);
8370 widest_int type_max = widest_int::from (wi::max_value (type), sgn);
8371 if (max < type_max)
8372 return true;
8374 return false;
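/* Illustration: overflow is only possible when NITERSM1 is the maximum value
   of its type, so that adding 1 wraps NITERS to zero; e.g. for an unsigned
   32-bit count, nitersm1 == 0xffffffff is rejected unless niter analysis
   proves a smaller upper bound for the loop.  */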
8377 /* Return a mask type with half the number of elements as TYPE. */
8379 tree
8380 vect_halve_mask_nunits (tree type)
8382 poly_uint64 nunits = exact_div (TYPE_VECTOR_SUBPARTS (type), 2);
8383 return build_truth_vector_type (nunits, current_vector_size);
8386 /* Return a mask type with twice as many elements as TYPE. */
8388 tree
8389 vect_double_mask_nunits (tree type)
8391 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (type) * 2;
8392 return build_truth_vector_type (nunits, current_vector_size);
8395 /* Record that a fully-masked version of LOOP_VINFO would need MASKS to
8396 contain a sequence of NVECTORS masks that each control a vector of type
8397 VECTYPE. */
8399 void
8400 vect_record_loop_mask (loop_vec_info loop_vinfo, vec_loop_masks *masks,
8401 unsigned int nvectors, tree vectype)
8403 gcc_assert (nvectors != 0);
8404 if (masks->length () < nvectors)
8405 masks->safe_grow_cleared (nvectors);
8406 rgroup_masks *rgm = &(*masks)[nvectors - 1];
8407 /* The number of scalars per iteration and the number of vectors are
8408 both compile-time constants. */
8409 unsigned int nscalars_per_iter
8410 = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
8411 LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
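/* Worked example (illustrative): if an rgroup needs nvectors == 2 masks for
   16-element vectors under a vectorization factor of 16, each scalar
   iteration contributes 2 * 16 / 16 == 2 scalars, so max_nscalars_per_iter
   becomes at least 2.  */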
8412 if (rgm->max_nscalars_per_iter < nscalars_per_iter)
8414 rgm->max_nscalars_per_iter = nscalars_per_iter;
8415 rgm->mask_type = build_same_sized_truth_vector_type (vectype);
8419 /* Given a complete set of masks MASKS, extract mask number INDEX
8420 for an rgroup that operates on NVECTORS vectors of type VECTYPE,
8421 where 0 <= INDEX < NVECTORS. Insert any set-up statements before GSI.
8423 See the comment above vec_loop_masks for more details about the mask
8424 arrangement. */
8426 tree
8427 vect_get_loop_mask (gimple_stmt_iterator *gsi, vec_loop_masks *masks,
8428 unsigned int nvectors, tree vectype, unsigned int index)
8430 rgroup_masks *rgm = &(*masks)[nvectors - 1];
8431 tree mask_type = rgm->mask_type;
8433 /* Populate the rgroup's mask array, if this is the first time we've
8434 used it. */
8435 if (rgm->masks.is_empty ())
8437 rgm->masks.safe_grow_cleared (nvectors);
8438 for (unsigned int i = 0; i < nvectors; ++i)
8440 tree mask = make_temp_ssa_name (mask_type, NULL, "loop_mask");
8441 /* Provide a dummy definition until the real one is available. */
8442 SSA_NAME_DEF_STMT (mask) = gimple_build_nop ();
8443 rgm->masks[i] = mask;
8447 tree mask = rgm->masks[index];
8448 if (maybe_ne (TYPE_VECTOR_SUBPARTS (mask_type),
8449 TYPE_VECTOR_SUBPARTS (vectype)))
8451 /* A loop mask for data type X can be reused for data type Y
8452 if X has N times more elements than Y and if Y's elements
8453 are N times bigger than X's. In this case each sequence
8454 of N elements in the loop mask will be all-zero or all-one.
8455 We can then view-convert the mask so that each sequence of
8456 N elements is replaced by a single element. */
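/* Illustration (example element counts): a mask created for a vector of 16
   1-byte elements can be reused for a vector of 4 4-byte elements (N == 4);
   every aligned group of 4 bits in the original mask is all-zero or all-one,
   so the VIEW_CONVERT_EXPR below reinterprets it as a 4-element mask.  */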
8457 gcc_assert (multiple_p (TYPE_VECTOR_SUBPARTS (mask_type),
8458 TYPE_VECTOR_SUBPARTS (vectype)));
8459 gimple_seq seq = NULL;
8460 mask_type = build_same_sized_truth_vector_type (vectype);
8461 mask = gimple_build (&seq, VIEW_CONVERT_EXPR, mask_type, mask);
8462 if (seq)
8463 gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
8465 return mask;
8468 /* Scale profiling counters by estimation for LOOP which is vectorized
8469 by factor VF. */
8471 static void
8472 scale_profile_for_vect_loop (struct loop *loop, unsigned vf)
8474 edge preheader = loop_preheader_edge (loop);
8475 /* Reduce loop iterations by the vectorization factor. */
8476 gcov_type new_est_niter = niter_for_unrolled_loop (loop, vf);
8477 profile_count freq_h = loop->header->count, freq_e = preheader->count ();
8479 if (freq_h.nonzero_p ())
8481 profile_probability p;
8483 /* Avoid dropping loop body profile counter to 0 because of zero count
8484 in loop's preheader. */
8485 if (!(freq_e == profile_count::zero ()))
8486 freq_e = freq_e.force_nonzero ();
8487 p = freq_e.apply_scale (new_est_niter + 1, 1).probability_in (freq_h);
8488 scale_loop_frequencies (loop, p);
8491 edge exit_e = single_exit (loop);
8492 exit_e->probability = profile_probability::always ()
8493 .apply_scale (1, new_est_niter + 1);
8495 edge exit_l = single_pred_edge (loop->latch);
8496 profile_probability prob = exit_l->probability;
8497 exit_l->probability = exit_e->probability.invert ();
8498 if (prob.initialized_p () && exit_l->probability.initialized_p ())
8499 scale_bbs_frequencies (&loop->latch, 1, exit_l->probability / prob);
8502 /* Function vect_transform_loop.
8504 The analysis phase has determined that the loop is vectorizable.
8505 Vectorize the loop - created vectorized stmts to replace the scalar
8506 stmts in the loop, and update the loop exit condition.
8507 Returns scalar epilogue loop if any. */
8509 struct loop *
8510 vect_transform_loop (loop_vec_info loop_vinfo)
8512 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8513 struct loop *epilogue = NULL;
8514 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
8515 int nbbs = loop->num_nodes;
8516 int i;
8517 tree niters_vector = NULL_TREE;
8518 tree step_vector = NULL_TREE;
8519 tree niters_vector_mult_vf = NULL_TREE;
8520 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
8521 unsigned int lowest_vf = constant_lower_bound (vf);
8522 bool grouped_store;
8523 bool slp_scheduled = false;
8524 gimple *stmt, *pattern_stmt;
8525 gimple_seq pattern_def_seq = NULL;
8526 gimple_stmt_iterator pattern_def_si = gsi_none ();
8527 bool transform_pattern_stmt = false;
8528 bool check_profitability = false;
8529 unsigned int th;
8531 if (dump_enabled_p ())
8532 dump_printf_loc (MSG_NOTE, vect_location, "=== vec_transform_loop ===\n");
8534 /* Use the more conservative vectorization threshold. If the number
8535 of iterations is constant, assume the cost check has been performed
8536 by our caller. If the threshold makes all loops profitable that
8537 run at least the (estimated) vectorization factor number of times,
8538 checking is pointless, too. */
8539 th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
8540 if (th >= vect_vf_for_cost (loop_vinfo)
8541 && !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
8543 if (dump_enabled_p ())
8544 dump_printf_loc (MSG_NOTE, vect_location,
8545 "Profitability threshold is %d loop iterations.\n",
8546 th);
8547 check_profitability = true;
8550 /* Make sure there exists a single-predecessor exit bb. Do this before
8551 versioning. */
8552 edge e = single_exit (loop);
8553 if (! single_pred_p (e->dest))
8555 split_loop_exit_edge (e);
8556 if (dump_enabled_p ())
8557 dump_printf (MSG_NOTE, "split exit edge\n");
8560 /* Version the loop first, if required, so the profitability check
8561 comes first. */
8563 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
8565 poly_uint64 versioning_threshold
8566 = LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo);
8567 if (check_profitability
8568 && ordered_p (poly_uint64 (th), versioning_threshold))
8570 versioning_threshold = ordered_max (poly_uint64 (th),
8571 versioning_threshold);
8572 check_profitability = false;
8574 vect_loop_versioning (loop_vinfo, th, check_profitability,
8575 versioning_threshold);
8576 check_profitability = false;
8579 /* Make sure there exists a single-predecessor exit bb also on the
8580 scalar loop copy. Do this after versioning but before peeling
8581 so the CFG structure is fine for both the scalar and the if-converted
8582 loop, and slpeel_duplicate_current_defs_from_edges sees matched
8583 loop-closed PHI nodes on the exit. */
8584 if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))
8586 e = single_exit (LOOP_VINFO_SCALAR_LOOP (loop_vinfo));
8587 if (! single_pred_p (e->dest))
8589 split_loop_exit_edge (e);
8590 if (dump_enabled_p ())
8591 dump_printf (MSG_NOTE, "split exit edge of scalar loop\n");
8595 tree niters = vect_build_loop_niters (loop_vinfo);
8596 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = niters;
8597 tree nitersm1 = unshare_expr (LOOP_VINFO_NITERSM1 (loop_vinfo));
8598 bool niters_no_overflow = loop_niters_no_overflow (loop_vinfo);
8599 epilogue = vect_do_peeling (loop_vinfo, niters, nitersm1, &niters_vector,
8600 &step_vector, &niters_vector_mult_vf, th,
8601 check_profitability, niters_no_overflow);
8603 if (niters_vector == NULL_TREE)
8605 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
8606 && !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
8607 && known_eq (lowest_vf, vf))
8609 niters_vector
8610 = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
8611 LOOP_VINFO_INT_NITERS (loop_vinfo) / lowest_vf);
8612 step_vector = build_one_cst (TREE_TYPE (niters));
8614 else
8615 vect_gen_vector_loop_niters (loop_vinfo, niters, &niters_vector,
8616 &step_vector, niters_no_overflow);
8619 /* 1) Make sure the loop header has exactly two entries
8620 2) Make sure we have a preheader basic block. */
8622 gcc_assert (EDGE_COUNT (loop->header->preds) == 2);
8624 split_edge (loop_preheader_edge (loop));
8626 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
8627 && vect_use_loop_mask_for_alignment_p (loop_vinfo))
8628 /* This will deal with any possible peeling. */
8629 vect_prepare_for_masked_peels (loop_vinfo);
8631 /* FORNOW: the vectorizer supports only loops whose body consists
8632 of one basic block (header + empty latch). When the vectorizer
8633 supports more involved loop forms, the order in which the BBs are
8634 traversed will need to be reconsidered. */
8636 for (i = 0; i < nbbs; i++)
8638 basic_block bb = bbs[i];
8639 stmt_vec_info stmt_info;
8641 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
8642 gsi_next (&si))
8644 gphi *phi = si.phi ();
8645 if (dump_enabled_p ())
8647 dump_printf_loc (MSG_NOTE, vect_location,
8648 "------>vectorizing phi: ");
8649 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
8651 stmt_info = vinfo_for_stmt (phi);
8652 if (!stmt_info)
8653 continue;
8655 if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
8656 vect_loop_kill_debug_uses (loop, phi);
8658 if (!STMT_VINFO_RELEVANT_P (stmt_info)
8659 && !STMT_VINFO_LIVE_P (stmt_info))
8660 continue;
8662 if (STMT_VINFO_VECTYPE (stmt_info)
8663 && (maybe_ne
8664 (TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info)), vf))
8665 && dump_enabled_p ())
8666 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
8668 if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
8669 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
8670 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
8671 && ! PURE_SLP_STMT (stmt_info))
8673 if (dump_enabled_p ())
8674 dump_printf_loc (MSG_NOTE, vect_location, "transform phi.\n");
8675 vect_transform_stmt (phi, NULL, NULL, NULL, NULL);
8679 pattern_stmt = NULL;
8680 for (gimple_stmt_iterator si = gsi_start_bb (bb);
8681 !gsi_end_p (si) || transform_pattern_stmt;)
8683 bool is_store;
8685 if (transform_pattern_stmt)
8686 stmt = pattern_stmt;
8687 else
8689 stmt = gsi_stmt (si);
8690 /* During vectorization remove existing clobber stmts. */
8691 if (gimple_clobber_p (stmt))
8693 unlink_stmt_vdef (stmt);
8694 gsi_remove (&si, true);
8695 release_defs (stmt);
8696 continue;
8700 if (dump_enabled_p ())
8702 dump_printf_loc (MSG_NOTE, vect_location,
8703 "------>vectorizing statement: ");
8704 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
8707 stmt_info = vinfo_for_stmt (stmt);
8709 /* vector stmts created in the outer-loop during vectorization of
8710 stmts in an inner-loop may not have a stmt_info, and do not
8711 need to be vectorized. */
8712 if (!stmt_info)
8714 gsi_next (&si);
8715 continue;
8718 if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
8719 vect_loop_kill_debug_uses (loop, stmt);
8721 if (!STMT_VINFO_RELEVANT_P (stmt_info)
8722 && !STMT_VINFO_LIVE_P (stmt_info))
8724 if (STMT_VINFO_IN_PATTERN_P (stmt_info)
8725 && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
8726 && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
8727 || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
8729 stmt = pattern_stmt;
8730 stmt_info = vinfo_for_stmt (stmt);
8732 else
8734 gsi_next (&si);
8735 continue;
8738 else if (STMT_VINFO_IN_PATTERN_P (stmt_info)
8739 && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
8740 && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
8741 || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
8742 transform_pattern_stmt = true;
8744 /* If pattern statement has def stmts, vectorize them too. */
8745 if (is_pattern_stmt_p (stmt_info))
8747 if (pattern_def_seq == NULL)
8749 pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
8750 pattern_def_si = gsi_start (pattern_def_seq);
8752 else if (!gsi_end_p (pattern_def_si))
8753 gsi_next (&pattern_def_si);
8754 if (pattern_def_seq != NULL)
8756 gimple *pattern_def_stmt = NULL;
8757 stmt_vec_info pattern_def_stmt_info = NULL;
8759 while (!gsi_end_p (pattern_def_si))
8761 pattern_def_stmt = gsi_stmt (pattern_def_si);
8762 pattern_def_stmt_info
8763 = vinfo_for_stmt (pattern_def_stmt);
8764 if (STMT_VINFO_RELEVANT_P (pattern_def_stmt_info)
8765 || STMT_VINFO_LIVE_P (pattern_def_stmt_info))
8766 break;
8767 gsi_next (&pattern_def_si);
8770 if (!gsi_end_p (pattern_def_si))
8772 if (dump_enabled_p ())
8774 dump_printf_loc (MSG_NOTE, vect_location,
8775 "==> vectorizing pattern def "
8776 "stmt: ");
8777 dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
8778 pattern_def_stmt, 0);
8781 stmt = pattern_def_stmt;
8782 stmt_info = pattern_def_stmt_info;
8784 else
8786 pattern_def_si = gsi_none ();
8787 transform_pattern_stmt = false;
8790 else
8791 transform_pattern_stmt = false;
8794 if (STMT_VINFO_VECTYPE (stmt_info))
8796 poly_uint64 nunits
8797 = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
8798 if (!STMT_SLP_TYPE (stmt_info)
8799 && maybe_ne (nunits, vf)
8800 && dump_enabled_p ())
8801 /* For SLP, VF is set according to the unrolling factor rather than
8802 the vector size, so this message is not valid for SLP. */
8803 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
8806 /* SLP. Schedule all the SLP instances when the first SLP stmt is
8807 reached. */
8808 if (STMT_SLP_TYPE (stmt_info))
8810 if (!slp_scheduled)
8812 slp_scheduled = true;
8814 if (dump_enabled_p ())
8815 dump_printf_loc (MSG_NOTE, vect_location,
8816 "=== scheduling SLP instances ===\n");
8818 vect_schedule_slp (loop_vinfo);
8821 /* Hybrid SLP stmts must be vectorized in addition to SLP. */
8822 if (!vinfo_for_stmt (stmt) || PURE_SLP_STMT (stmt_info))
8824 if (!transform_pattern_stmt && gsi_end_p (pattern_def_si))
8826 pattern_def_seq = NULL;
8827 gsi_next (&si);
8829 continue;
8833 /* -------- vectorize statement ------------ */
8834 if (dump_enabled_p ())
8835 dump_printf_loc (MSG_NOTE, vect_location, "transform statement.\n");
8837 grouped_store = false;
8838 is_store = vect_transform_stmt (stmt, &si, &grouped_store, NULL, NULL);
8839 if (is_store)
8841 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
8843 /* Interleaving. If IS_STORE is TRUE, the vectorization of the
8844 interleaving chain was completed - free all the stores in
8845 the chain. */
8846 gsi_next (&si);
8847 vect_remove_stores (GROUP_FIRST_ELEMENT (stmt_info));
8849 else
8851 /* Free the attached stmt_vec_info and remove the stmt. */
8852 gimple *store = gsi_stmt (si);
8853 free_stmt_vec_info (store);
8854 unlink_stmt_vdef (store);
8855 gsi_remove (&si, true);
8856 release_defs (store);
8859 /* Stores can only appear at the end of pattern statements. */
8860 gcc_assert (!transform_pattern_stmt);
8861 pattern_def_seq = NULL;
8863 else if (!transform_pattern_stmt && gsi_end_p (pattern_def_si))
8865 pattern_def_seq = NULL;
8866 gsi_next (&si);
8868 } /* stmts in BB */
8870 /* Stub out scalar statements that must not survive vectorization.
8871 Doing this here helps with grouped statements, or statements that
8872 are involved in patterns. */
8873 for (gimple_stmt_iterator gsi = gsi_start_bb (bb);
8874 !gsi_end_p (gsi); gsi_next (&gsi))
8876 gcall *call = dyn_cast <gcall *> (gsi_stmt (gsi));
8877 if (call && gimple_call_internal_p (call, IFN_MASK_LOAD))
8879 tree lhs = gimple_get_lhs (call);
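/* A MASK_LOAD call that still has a scalar lhs belongs to dead scalar
   code; replace it with a plain zero assignment so the IL stays valid
   until that code is cleaned up.  */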
8880 if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
8882 tree zero = build_zero_cst (TREE_TYPE (lhs));
8883 gimple *new_stmt = gimple_build_assign (lhs, zero);
8884 gsi_replace (&gsi, new_stmt, true);
8888 } /* BBs in loop */
8890 /* The vectorization factor is always > 1, so if we use an IV increment of 1,
8891 a zero NITERS becomes a nonzero NITERS_VECTOR. */
8892 if (integer_onep (step_vector))
8893 niters_no_overflow = true;
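/* Replace the loop's exit condition with one that makes the vector loop
   iterate NITERS_VECTOR times, stepping the IV by STEP_VECTOR.  */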
8894 vect_set_loop_condition (loop, loop_vinfo, niters_vector, step_vector,
8895 niters_vector_mult_vf, !niters_no_overflow);
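/* Each vector iteration now covers ASSUMED_VF scalar iterations, so
   scale the loop's profile accordingly.  */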
8897 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
8898 scale_profile_for_vect_loop (loop, assumed_vf);
8900 /* True if the final iteration might not handle a full vector's
8901 worth of scalar iterations. */
8902 bool final_iter_may_be_partial = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
8903 /* The minimum number of iterations performed by the epilogue. This
8904 is 1 when peeling for gaps because we always need a final scalar
8905 iteration. */
8906 int min_epilogue_iters = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0;
8907 /* +1 to convert latch counts to loop iteration counts,
8908 -min_epilogue_iters to remove iterations that cannot be performed
8909 by the vector code. */
8910 int bias_for_lowest = 1 - min_epilogue_iters;
8911 int bias_for_assumed = bias_for_lowest;
8912 int alignment_npeels = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
8913 if (alignment_npeels && LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
8915 /* When the amount of peeling is known at compile time, the first
8916 iteration will have exactly alignment_npeels active elements.
8917 In the worst case it will have at least one. */
8918 int min_first_active = (alignment_npeels > 0 ? alignment_npeels : 1);
8919 bias_for_lowest += lowest_vf - min_first_active;
8920 bias_for_assumed += assumed_vf - min_first_active;
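/* For example, with no peeling for gaps and no full masking,
   BIAS_FOR_LOWEST is 1, so an upper bound of N - 1 latch iterations
   (N scalar iterations) becomes floor (N / LOWEST_VF) - 1 latch
   iterations of the vector loop: N = 17, LOWEST_VF = 8 gives
   floor (17 / 8) - 1 = 1, i.e. two vector iterations with one scalar
   iteration left for the epilogue.  */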
8922 /* In these calculations the "- 1" converts loop iteration counts
8923 back to latch counts. */
8924 if (loop->any_upper_bound)
8925 loop->nb_iterations_upper_bound
8926 = (final_iter_may_be_partial
8927 ? wi::udiv_ceil (loop->nb_iterations_upper_bound + bias_for_lowest,
8928 lowest_vf) - 1
8929 : wi::udiv_floor (loop->nb_iterations_upper_bound + bias_for_lowest,
8930 lowest_vf) - 1);
8931 if (loop->any_likely_upper_bound)
8932 loop->nb_iterations_likely_upper_bound
8933 = (final_iter_may_be_partial
8934 ? wi::udiv_ceil (loop->nb_iterations_likely_upper_bound
8935 + bias_for_lowest, lowest_vf) - 1
8936 : wi::udiv_floor (loop->nb_iterations_likely_upper_bound
8937 + bias_for_lowest, lowest_vf) - 1);
8938 if (loop->any_estimate)
8939 loop->nb_iterations_estimate
8940 = (final_iter_may_be_partial
8941 ? wi::udiv_ceil (loop->nb_iterations_estimate + bias_for_assumed,
8942 assumed_vf) - 1
8943 : wi::udiv_floor (loop->nb_iterations_estimate + bias_for_assumed,
8944 assumed_vf) - 1);
8946 if (dump_enabled_p ())
8948 if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
8950 dump_printf_loc (MSG_NOTE, vect_location,
8951 "LOOP VECTORIZED\n");
8952 if (loop->inner)
8953 dump_printf_loc (MSG_NOTE, vect_location,
8954 "OUTER LOOP VECTORIZED\n");
8955 dump_printf (MSG_NOTE, "\n");
8957 else
8959 dump_printf_loc (MSG_NOTE, vect_location,
8960 "LOOP EPILOGUE VECTORIZED (VS=");
8961 dump_dec (MSG_NOTE, current_vector_size);
8962 dump_printf (MSG_NOTE, ")\n");
8966 /* Free SLP instances here because otherwise stmt reference counting
8967 won't work. */
8968 slp_instance instance;
8969 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
8970 vect_free_slp_instance (instance);
8971 LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
8972 /* Clear the safelen field since its value is no longer valid after
8973 vectorization: the vectorized loop can have loop-carried dependencies. */
8974 loop->safelen = 0;
8976 /* Don't vectorize the epilogue of an epilogue loop. */
8977 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
8978 epilogue = NULL;
8980 if (!PARAM_VALUE (PARAM_VECT_EPILOGUES_NOMASK))
8981 epilogue = NULL;
8983 if (epilogue)
8985 auto_vector_sizes vector_sizes;
8986 targetm.vectorize.autovectorize_vector_sizes (&vector_sizes);
8987 unsigned int next_size = 0;
8989 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
8990 && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) >= 0
8991 && known_eq (vf, lowest_vf))
8993 unsigned int eiters
8994 = (LOOP_VINFO_INT_NITERS (loop_vinfo)
8995 - LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo));
8996 eiters = eiters % lowest_vf;
8997 epilogue->nb_iterations_upper_bound = eiters - 1;
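/* Skip candidate vector sizes that do not evenly divide the current
   vector size or whose vectorization factor would exceed the number of
   remaining epilogue iterations.  */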
8999 unsigned int ratio;
9000 while (next_size < vector_sizes.length ()
9001 && !(constant_multiple_p (current_vector_size,
9002 vector_sizes[next_size], &ratio)
9003 && eiters >= lowest_vf / ratio))
9004 next_size += 1;
9006 else
9007 while (next_size < vector_sizes.length ()
9008 && maybe_lt (current_vector_size, vector_sizes[next_size]))
9009 next_size += 1;
9011 if (next_size == vector_sizes.length ())
9012 epilogue = NULL;
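/* If an epilogue candidate remains, copy the vectorization-related
   flags from the main loop and mark it as a candidate for another
   round of vectorization.  */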
9015 if (epilogue)
9017 epilogue->force_vectorize = loop->force_vectorize;
9018 epilogue->safelen = loop->safelen;
9019 epilogue->dont_vectorize = false;
9021 /* We may need to if-convert the epilogue to vectorize it. */
9022 if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))
9023 tree_if_conversion (epilogue);
9026 return epilogue;
9029 /* The code below performs a simple optimization: it reverts
9030 if-conversion for masked stores, i.e. if the mask of a store is all-zero,
9031 the store is not performed, and neither are the producers of the stored
9032 values where possible. For example,
9033 for (i=0; i<n; i++)
9034 if (c[i])
9036 p1[i] += 1;
9037 p2[i] = p3[i] +2;
9039 this transformation will produce the following semi-hammock:
9041 if (!mask__ifc__42.18_165 == { 0, 0, 0, 0, 0, 0, 0, 0 })
9043 vect__11.19_170 = MASK_LOAD (vectp_p1.20_168, 0B, mask__ifc__42.18_165);
9044 vect__12.22_172 = vect__11.19_170 + vect_cst__171;
9045 MASK_STORE (vectp_p1.23_175, 0B, mask__ifc__42.18_165, vect__12.22_172);
9046 vect__18.25_182 = MASK_LOAD (vectp_p3.26_180, 0B, mask__ifc__42.18_165);
9047 vect__19.28_184 = vect__18.25_182 + vect_cst__183;
9048 MASK_STORE (vectp_p2.29_187, 0B, mask__ifc__42.18_165, vect__19.28_184);
9052 void
9053 optimize_mask_stores (struct loop *loop)
9055 basic_block *bbs = get_loop_body (loop);
9056 unsigned nbbs = loop->num_nodes;
9057 unsigned i;
9058 basic_block bb;
9059 struct loop *bb_loop;
9060 gimple_stmt_iterator gsi;
9061 gimple *stmt;
9062 auto_vec<gimple *> worklist;
9064 vect_location = find_loop_location (loop);
9065 /* Pick up all masked stores in the loop, if any. */
9066 for (i = 0; i < nbbs; i++)
9068 bb = bbs[i];
9069 for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
9070 gsi_next (&gsi))
9072 stmt = gsi_stmt (gsi);
9073 if (gimple_call_internal_p (stmt, IFN_MASK_STORE))
9074 worklist.safe_push (stmt);
9078 free (bbs);
9079 if (worklist.is_empty ())
9080 return;
9082 /* Loop has masked stores. */
9083 while (!worklist.is_empty ())
9085 gimple *last, *last_store;
9086 edge e, efalse;
9087 tree mask;
9088 basic_block store_bb, join_bb;
9089 gimple_stmt_iterator gsi_to;
9090 tree vdef, new_vdef;
9091 gphi *phi;
9092 tree vectype;
9093 tree zero;
9095 last = worklist.pop ();
9096 mask = gimple_call_arg (last, 2);
9097 bb = gimple_bb (last);
9098 /* Create then_bb and the if-then structure in the CFG; then_bb belongs
9099 to the same loop as if_bb. That loop can differ from LOOP when a
9100 two-level loop nest is vectorized and the mask store belongs to the
9101 inner loop. */
9102 e = split_block (bb, last);
9103 bb_loop = bb->loop_father;
9104 gcc_assert (loop == bb_loop || flow_loop_nested_p (loop, bb_loop));
9105 join_bb = e->dest;
9106 store_bb = create_empty_bb (bb);
9107 add_bb_to_loop (store_bb, bb_loop);
9108 e->flags = EDGE_TRUE_VALUE;
9109 efalse = make_edge (bb, store_bb, EDGE_FALSE_VALUE);
9110 /* Put STORE_BB into the likely part. */
9111 efalse->probability = profile_probability::unlikely ();
9112 store_bb->count = efalse->count ();
9113 make_single_succ_edge (store_bb, join_bb, EDGE_FALLTHRU);
9114 if (dom_info_available_p (CDI_DOMINATORS))
9115 set_immediate_dominator (CDI_DOMINATORS, store_bb, bb);
9116 if (dump_enabled_p ())
9117 dump_printf_loc (MSG_NOTE, vect_location,
9118 "Create new block %d to sink mask stores.",
9119 store_bb->index);
9120 /* Create a vector comparison with a boolean result; if MASK is all-zero, STORE_BB is bypassed. */
9121 vectype = TREE_TYPE (mask);
9122 zero = build_zero_cst (vectype);
9123 stmt = gimple_build_cond (EQ_EXPR, mask, zero, NULL_TREE, NULL_TREE);
9124 gsi = gsi_last_bb (bb);
9125 gsi_insert_after (&gsi, stmt, GSI_SAME_STMT);
9126 /* Create a new PHI node for the vdef of the last masked store:
9127 .MEM_2 = VDEF <.MEM_1>
9128 will be converted to
9129 .MEM_3 = VDEF <.MEM_1>
9130 and a new PHI node will be created in the join bb:
9131 .MEM_2 = PHI <.MEM_1, .MEM_3>
9133 vdef = gimple_vdef (last);
9134 new_vdef = make_ssa_name (gimple_vop (cfun), last);
9135 gimple_set_vdef (last, new_vdef);
9136 phi = create_phi_node (vdef, join_bb);
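/* The edge from STORE_BB carries the vdef produced by the sunk stores.  */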
9137 add_phi_arg (phi, new_vdef, EDGE_SUCC (store_bb, 0), UNKNOWN_LOCATION);
9139 /* Put all masked stores with the same mask into STORE_BB if possible. */
9140 while (true)
9142 gimple_stmt_iterator gsi_from;
9143 gimple *stmt1 = NULL;
9145 /* Move masked store to STORE_BB. */
9146 last_store = last;
9147 gsi = gsi_for_stmt (last);
9148 gsi_from = gsi;
9149 /* Shift GSI to the previous stmt for further traversal. */
9150 gsi_prev (&gsi);
9151 gsi_to = gsi_start_bb (store_bb);
9152 gsi_move_before (&gsi_from, &gsi_to);
9153 /* Reset GSI_TO to the start of the now non-empty STORE_BB. */
9154 gsi_to = gsi_start_bb (store_bb);
9155 if (dump_enabled_p ())
9157 dump_printf_loc (MSG_NOTE, vect_location,
9158 "Move stmt to created bb\n");
9159 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, last, 0);
9161 /* Move all stored value producers if possible. */
9162 while (!gsi_end_p (gsi))
9164 tree lhs;
9165 imm_use_iterator imm_iter;
9166 use_operand_p use_p;
9167 bool res;
9169 /* Skip debug statements. */
9170 if (is_gimple_debug (gsi_stmt (gsi)))
9172 gsi_prev (&gsi);
9173 continue;
9175 stmt1 = gsi_stmt (gsi);
9176 /* Do not consider statements writing to memory or having a
9177 volatile operand. */
9178 if (gimple_vdef (stmt1)
9179 || gimple_has_volatile_ops (stmt1))
9180 break;
9181 gsi_from = gsi;
9182 gsi_prev (&gsi);
9183 lhs = gimple_get_lhs (stmt1);
9184 if (!lhs)
9185 break;
9187 /* LHS of vectorized stmt must be SSA_NAME. */
9188 if (TREE_CODE (lhs) != SSA_NAME)
9189 break;
9191 if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
9193 /* Remove dead scalar statement. */
9194 if (has_zero_uses (lhs))
9196 gsi_remove (&gsi_from, true);
9197 continue;
9201 /* Check that LHS does not have uses outside of STORE_BB. */
9202 res = true;
9203 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
9205 gimple *use_stmt;
9206 use_stmt = USE_STMT (use_p);
9207 if (is_gimple_debug (use_stmt))
9208 continue;
9209 if (gimple_bb (use_stmt) != store_bb)
9211 res = false;
9212 break;
9215 if (!res)
9216 break;
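/* STMT1 must see the same memory state as the store being sunk;
   a different VUSE would mean an intervening memory write.  */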
9218 if (gimple_vuse (stmt1)
9219 && gimple_vuse (stmt1) != gimple_vuse (last_store))
9220 break;
9222 /* Can move STMT1 to STORE_BB. */
9223 if (dump_enabled_p ())
9225 dump_printf_loc (MSG_NOTE, vect_location,
9226 "Move stmt to created bb\n");
9227 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt1, 0);
9229 gsi_move_before (&gsi_from, &gsi_to);
9230 /* Shift GSI_TO for further insertion. */
9231 gsi_prev (&gsi_to);
9233 /* Put other masked stores with the same mask into STORE_BB. */
9234 if (worklist.is_empty ()
9235 || gimple_call_arg (worklist.last (), 2) != mask
9236 || worklist.last () != stmt1)
9237 break;
9238 last = worklist.pop ();
9240 add_phi_arg (phi, gimple_vuse (last_store), e, UNKNOWN_LOCATION);