PR tree-optimization/81661
[official-gcc.git] / gcc / tree-vect-loop.c
blob c5301684028562656c951ff1e0d7623ffc96c4ad
1 /* Loop Vectorization
2 Copyright (C) 2003-2018 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com> and
4 Ira Rosen <irar@il.ibm.com>
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "rtl.h"
28 #include "tree.h"
29 #include "gimple.h"
30 #include "cfghooks.h"
31 #include "tree-pass.h"
32 #include "ssa.h"
33 #include "optabs-tree.h"
34 #include "diagnostic-core.h"
35 #include "fold-const.h"
36 #include "stor-layout.h"
37 #include "cfganal.h"
38 #include "gimplify.h"
39 #include "gimple-iterator.h"
40 #include "gimplify-me.h"
41 #include "tree-ssa-loop-ivopts.h"
42 #include "tree-ssa-loop-manip.h"
43 #include "tree-ssa-loop-niter.h"
44 #include "tree-ssa-loop.h"
45 #include "cfgloop.h"
46 #include "params.h"
47 #include "tree-scalar-evolution.h"
48 #include "tree-vectorizer.h"
49 #include "gimple-fold.h"
50 #include "cgraph.h"
51 #include "tree-cfg.h"
52 #include "tree-if-conv.h"
53 #include "internal-fn.h"
54 #include "tree-vector-builder.h"
55 #include "vec-perm-indices.h"
56 #include "tree-eh.h"
58 /* Loop Vectorization Pass.
60 This pass tries to vectorize loops.
62 For example, the vectorizer transforms the following simple loop:
64 short a[N]; short b[N]; short c[N]; int i;
66 for (i=0; i<N; i++){
67 a[i] = b[i] + c[i];
70 as if it were manually vectorized by rewriting the source code into:
72 typedef int __attribute__((mode(V8HI))) v8hi;
73 short a[N]; short b[N]; short c[N]; int i;
74 v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
75 v8hi va, vb, vc;
77 for (i=0; i<N/8; i++){
78 vb = pb[i];
79 vc = pc[i];
80 va = vb + vc;
81 pa[i] = va;
84 The main entry to this pass is vectorize_loops(), in which
85 the vectorizer applies a set of analyses on a given set of loops,
86 followed by the actual vectorization transformation for the loops that
87 had successfully passed the analysis phase.
88 Throughout this pass we make a distinction between two types of
89 data: scalars (which are represented by SSA_NAMES), and memory references
90 ("data-refs"). These two types of data require different handling both
91 during analysis and transformation. The types of data-refs that the
92 vectorizer currently supports are ARRAY_REFS whose base is an array DECL
93 (not a pointer), and INDIRECT_REFS through pointers; both array and pointer
94 accesses are required to have a simple (consecutive) access pattern.
96 Analysis phase:
97 ===============
98 The driver for the analysis phase is vect_analyze_loop().
99 It applies a set of analyses, some of which rely on the scalar evolution
100 analyzer (scev) developed by Sebastian Pop.
102 During the analysis phase the vectorizer records some information
103 per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
104 loop, as well as general information about the loop as a whole, which is
105 recorded in a "loop_vec_info" struct attached to each loop.
107 Transformation phase:
108 =====================
109 The loop transformation phase scans all the stmts in the loop, and
110 creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
111 the loop that needs to be vectorized. It inserts the vector code sequence
112 just before the scalar stmt S, and records a pointer to the vector code
113 in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
114 attached to S). This pointer will be used for the vectorization of following
115 stmts which use the def of stmt S. Stmt S is removed if it writes to memory;
116 otherwise, we rely on dead code elimination for removing it.
118 For example, say stmt S1 was vectorized into stmt VS1:
120 VS1: vb = px[i];
121 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
122 S2: a = b;
124 To vectorize stmt S2, the vectorizer first finds the stmt that defines
125 the operand 'b' (S1), and gets the relevant vector def 'vb' from the
126 vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)). The
127 resulting sequence would be:
129 VS1: vb = px[i];
130 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
131 VS2: va = vb;
132 S2: a = b; STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2
134 Operands that are not SSA_NAMEs are data-refs that appear in
135 load/store operations (like 'x[i]' in S1), and are handled differently.
137 Target modeling:
138 =================
139 Currently the only target specific information that is used is the
140 size of the vector (in bytes) - "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".
141 Targets that can support different sizes of vectors, for now will need
142 to specify one value for "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD". More
143 flexibility will be added in the future.
145 Since we only vectorize operations whose vector form can be
146 expressed using existing tree codes, to verify that an operation is
147 supported, the vectorizer checks the relevant optab at the relevant
148 machine_mode (e.g, optab_handler (add_optab, V8HImode)). If
149 the value found is CODE_FOR_nothing, then there's no target support, and
150 we can't vectorize the stmt.
152 For additional information on this project see:
153 http://gcc.gnu.org/projects/tree-ssa/vectorization.html
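 As a hypothetical illustration of that optab check (a sketch only; the
 actual checks are spread across the vectorizable_* routines), testing
 whether a V8HI addition has target support looks roughly like:

 if (optab_handler (add_optab, V8HImode) == CODE_FOR_nothing)
 return false; the stmt has no target support and is not vectorized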
156 static void vect_estimate_min_profitable_iters (loop_vec_info, int *, int *);
158 /* Function vect_determine_vectorization_factor
160 Determine the vectorization factor (VF). VF is the number of data elements
161 that are operated upon in parallel in a single iteration of the vectorized
162 loop. For example, when vectorizing a loop that operates on 4-byte elements,
163 on a target with a vector size (VS) of 16 bytes, the VF is set to 4, since 4
164 elements can fit in a single vector register.
166 We currently support vectorization of loops in which all types operated upon
167 are of the same size. Therefore this function currently sets VF according to
168 the size of the types operated upon, and fails if there are multiple sizes
169 in the loop.
171 VF is also the factor by which the loop iterations are strip-mined, e.g.:
172 original loop:
173 for (i=0; i<N; i++){
174 a[i] = b[i] + c[i];
177 vectorized loop:
178 for (i=0; i<N; i+=VF){
179 a[i:VF] = b[i:VF] + c[i:VF];
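 A worked example (illustrative, not from the original comment): with
 16-byte vectors, 4-byte ints give a vectype with 4 units and 2-byte
 shorts give one with 8 units; the vect_update_max_nunits calls below
 combine the unit counts of all stmts into their least common multiple,
 which for fixed power-of-two counts is simply the larger one, so

 VF = LCM (4, 8) = 8.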
183 static bool
184 vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
186 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
187 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
188 unsigned nbbs = loop->num_nodes;
189 poly_uint64 vectorization_factor = 1;
190 tree scalar_type = NULL_TREE;
191 gphi *phi;
192 tree vectype;
193 stmt_vec_info stmt_info;
194 unsigned i;
195 HOST_WIDE_INT dummy;
196 gimple *stmt, *pattern_stmt = NULL;
197 gimple_seq pattern_def_seq = NULL;
198 gimple_stmt_iterator pattern_def_si = gsi_none ();
199 bool analyze_pattern_stmt = false;
200 bool bool_result;
201 auto_vec<stmt_vec_info> mask_producers;
203 if (dump_enabled_p ())
204 dump_printf_loc (MSG_NOTE, vect_location,
205 "=== vect_determine_vectorization_factor ===\n");
207 for (i = 0; i < nbbs; i++)
209 basic_block bb = bbs[i];
211 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
212 gsi_next (&si))
214 phi = si.phi ();
215 stmt_info = vinfo_for_stmt (phi);
216 if (dump_enabled_p ())
218 dump_printf_loc (MSG_NOTE, vect_location, "==> examining phi: ");
219 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
222 gcc_assert (stmt_info);
224 if (STMT_VINFO_RELEVANT_P (stmt_info)
225 || STMT_VINFO_LIVE_P (stmt_info))
227 gcc_assert (!STMT_VINFO_VECTYPE (stmt_info));
228 scalar_type = TREE_TYPE (PHI_RESULT (phi));
230 if (dump_enabled_p ())
232 dump_printf_loc (MSG_NOTE, vect_location,
233 "get vectype for scalar type: ");
234 dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
235 dump_printf (MSG_NOTE, "\n");
238 vectype = get_vectype_for_scalar_type (scalar_type);
239 if (!vectype)
241 if (dump_enabled_p ())
243 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
244 "not vectorized: unsupported "
245 "data-type ");
246 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
247 scalar_type);
248 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
250 return false;
252 STMT_VINFO_VECTYPE (stmt_info) = vectype;
254 if (dump_enabled_p ())
256 dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
257 dump_generic_expr (MSG_NOTE, TDF_SLIM, vectype);
258 dump_printf (MSG_NOTE, "\n");
261 if (dump_enabled_p ())
263 dump_printf_loc (MSG_NOTE, vect_location, "nunits = ");
264 dump_dec (MSG_NOTE, TYPE_VECTOR_SUBPARTS (vectype));
265 dump_printf (MSG_NOTE, "\n");
268 vect_update_max_nunits (&vectorization_factor, vectype);
272 for (gimple_stmt_iterator si = gsi_start_bb (bb);
273 !gsi_end_p (si) || analyze_pattern_stmt;)
275 tree vf_vectype;
277 if (analyze_pattern_stmt)
278 stmt = pattern_stmt;
279 else
280 stmt = gsi_stmt (si);
282 stmt_info = vinfo_for_stmt (stmt);
284 if (dump_enabled_p ())
286 dump_printf_loc (MSG_NOTE, vect_location,
287 "==> examining statement: ");
288 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
291 gcc_assert (stmt_info);
293 /* Skip stmts which do not need to be vectorized. */
294 if ((!STMT_VINFO_RELEVANT_P (stmt_info)
295 && !STMT_VINFO_LIVE_P (stmt_info))
296 || gimple_clobber_p (stmt))
298 if (STMT_VINFO_IN_PATTERN_P (stmt_info)
299 && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
300 && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
301 || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
303 stmt = pattern_stmt;
304 stmt_info = vinfo_for_stmt (pattern_stmt);
305 if (dump_enabled_p ())
307 dump_printf_loc (MSG_NOTE, vect_location,
308 "==> examining pattern statement: ");
309 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
312 else
314 if (dump_enabled_p ())
315 dump_printf_loc (MSG_NOTE, vect_location, "skip.\n");
316 gsi_next (&si);
317 continue;
320 else if (STMT_VINFO_IN_PATTERN_P (stmt_info)
321 && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
322 && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
323 || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
324 analyze_pattern_stmt = true;
326 /* If a pattern statement has def stmts, analyze them too. */
327 if (is_pattern_stmt_p (stmt_info))
329 if (pattern_def_seq == NULL)
331 pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
332 pattern_def_si = gsi_start (pattern_def_seq);
334 else if (!gsi_end_p (pattern_def_si))
335 gsi_next (&pattern_def_si);
336 if (pattern_def_seq != NULL)
338 gimple *pattern_def_stmt = NULL;
339 stmt_vec_info pattern_def_stmt_info = NULL;
341 while (!gsi_end_p (pattern_def_si))
343 pattern_def_stmt = gsi_stmt (pattern_def_si);
344 pattern_def_stmt_info
345 = vinfo_for_stmt (pattern_def_stmt);
346 if (STMT_VINFO_RELEVANT_P (pattern_def_stmt_info)
347 || STMT_VINFO_LIVE_P (pattern_def_stmt_info))
348 break;
349 gsi_next (&pattern_def_si);
352 if (!gsi_end_p (pattern_def_si))
354 if (dump_enabled_p ())
356 dump_printf_loc (MSG_NOTE, vect_location,
357 "==> examining pattern def stmt: ");
358 dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
359 pattern_def_stmt, 0);
362 stmt = pattern_def_stmt;
363 stmt_info = pattern_def_stmt_info;
365 else
367 pattern_def_si = gsi_none ();
368 analyze_pattern_stmt = false;
371 else
372 analyze_pattern_stmt = false;
375 if (gimple_get_lhs (stmt) == NULL_TREE
376 /* MASK_STORE has no lhs, but is ok. */
377 && (!is_gimple_call (stmt)
378 || !gimple_call_internal_p (stmt)
379 || gimple_call_internal_fn (stmt) != IFN_MASK_STORE))
381 if (is_gimple_call (stmt))
383 /* Ignore calls with no lhs. These must be calls to
384 #pragma omp simd functions, and what vectorization factor
385 it really needs can't be determined until
386 vectorizable_simd_clone_call. */
387 if (!analyze_pattern_stmt && gsi_end_p (pattern_def_si))
389 pattern_def_seq = NULL;
390 gsi_next (&si);
392 continue;
394 if (dump_enabled_p ())
396 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
397 "not vectorized: irregular stmt.");
398 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt,
401 return false;
404 if (VECTOR_MODE_P (TYPE_MODE (gimple_expr_type (stmt))))
406 if (dump_enabled_p ())
408 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
409 "not vectorized: vector stmt in loop:");
410 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
412 return false;
415 bool_result = false;
417 if (STMT_VINFO_VECTYPE (stmt_info))
419 /* The only case when a vectype had already been set is for stmts
420 that contain a dataref, or for "pattern-stmts" (stmts
421 generated by the vectorizer to represent/replace a certain
422 idiom). */
423 gcc_assert (STMT_VINFO_DATA_REF (stmt_info)
424 || is_pattern_stmt_p (stmt_info)
425 || !gsi_end_p (pattern_def_si));
426 vectype = STMT_VINFO_VECTYPE (stmt_info);
428 else
430 gcc_assert (!STMT_VINFO_DATA_REF (stmt_info));
431 if (gimple_call_internal_p (stmt, IFN_MASK_STORE))
432 scalar_type = TREE_TYPE (gimple_call_arg (stmt, 3));
433 else
434 scalar_type = TREE_TYPE (gimple_get_lhs (stmt));
436 /* Bool ops don't participate in vectorization factor
437 computation. For comparisons, use the compared types to
438 compute a factor. */
439 if (VECT_SCALAR_BOOLEAN_TYPE_P (scalar_type)
440 && is_gimple_assign (stmt)
441 && gimple_assign_rhs_code (stmt) != COND_EXPR)
443 if (STMT_VINFO_RELEVANT_P (stmt_info)
444 || STMT_VINFO_LIVE_P (stmt_info))
445 mask_producers.safe_push (stmt_info);
446 bool_result = true;
448 if (TREE_CODE_CLASS (gimple_assign_rhs_code (stmt))
449 == tcc_comparison
450 && !VECT_SCALAR_BOOLEAN_TYPE_P
451 (TREE_TYPE (gimple_assign_rhs1 (stmt))))
452 scalar_type = TREE_TYPE (gimple_assign_rhs1 (stmt));
453 else
455 if (!analyze_pattern_stmt && gsi_end_p (pattern_def_si))
457 pattern_def_seq = NULL;
458 gsi_next (&si);
460 continue;
464 if (dump_enabled_p ())
466 dump_printf_loc (MSG_NOTE, vect_location,
467 "get vectype for scalar type: ");
468 dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
469 dump_printf (MSG_NOTE, "\n");
471 vectype = get_vectype_for_scalar_type (scalar_type);
472 if (!vectype)
474 if (dump_enabled_p ())
476 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
477 "not vectorized: unsupported "
478 "data-type ");
479 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
480 scalar_type);
481 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
483 return false;
486 if (!bool_result)
487 STMT_VINFO_VECTYPE (stmt_info) = vectype;
489 if (dump_enabled_p ())
491 dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
492 dump_generic_expr (MSG_NOTE, TDF_SLIM, vectype);
493 dump_printf (MSG_NOTE, "\n");
497 /* Don't try to compute VF out of scalar types if the stmt
498 produces a boolean vector. Use the result vectype instead. */
499 if (VECTOR_BOOLEAN_TYPE_P (vectype))
500 vf_vectype = vectype;
501 else
503 /* The vectorization factor is according to the smallest
504 scalar type (or the largest vector size, but we only
505 support one vector size per loop). */
506 if (!bool_result)
507 scalar_type = vect_get_smallest_scalar_type (stmt, &dummy,
508 &dummy);
509 if (dump_enabled_p ())
511 dump_printf_loc (MSG_NOTE, vect_location,
512 "get vectype for scalar type: ");
513 dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
514 dump_printf (MSG_NOTE, "\n");
516 vf_vectype = get_vectype_for_scalar_type (scalar_type);
518 if (!vf_vectype)
520 if (dump_enabled_p ())
522 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
523 "not vectorized: unsupported data-type ");
524 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
525 scalar_type);
526 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
528 return false;
531 if (maybe_ne (GET_MODE_SIZE (TYPE_MODE (vectype)),
532 GET_MODE_SIZE (TYPE_MODE (vf_vectype))))
534 if (dump_enabled_p ())
536 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
537 "not vectorized: different sized vector "
538 "types in statement, ");
539 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
540 vectype);
541 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
542 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
543 vf_vectype);
544 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
546 return false;
549 if (dump_enabled_p ())
551 dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
552 dump_generic_expr (MSG_NOTE, TDF_SLIM, vf_vectype);
553 dump_printf (MSG_NOTE, "\n");
556 if (dump_enabled_p ())
558 dump_printf_loc (MSG_NOTE, vect_location, "nunits = ");
559 dump_dec (MSG_NOTE, TYPE_VECTOR_SUBPARTS (vf_vectype));
560 dump_printf (MSG_NOTE, "\n");
563 vect_update_max_nunits (&vectorization_factor, vf_vectype);
565 if (!analyze_pattern_stmt && gsi_end_p (pattern_def_si))
567 pattern_def_seq = NULL;
568 gsi_next (&si);
573 /* TODO: Analyze cost. Decide if worth while to vectorize. */
574 if (dump_enabled_p ())
576 dump_printf_loc (MSG_NOTE, vect_location, "vectorization factor = ");
577 dump_dec (MSG_NOTE, vectorization_factor);
578 dump_printf (MSG_NOTE, "\n");
581 if (known_le (vectorization_factor, 1U))
583 if (dump_enabled_p ())
584 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
585 "not vectorized: unsupported data-type\n");
586 return false;
588 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
590 for (i = 0; i < mask_producers.length (); i++)
592 tree mask_type = NULL;
594 stmt = STMT_VINFO_STMT (mask_producers[i]);
596 if (is_gimple_assign (stmt)
597 && TREE_CODE_CLASS (gimple_assign_rhs_code (stmt)) == tcc_comparison
598 && !VECT_SCALAR_BOOLEAN_TYPE_P
599 (TREE_TYPE (gimple_assign_rhs1 (stmt))))
601 scalar_type = TREE_TYPE (gimple_assign_rhs1 (stmt));
602 mask_type = get_mask_type_for_scalar_type (scalar_type);
604 if (!mask_type)
606 if (dump_enabled_p ())
607 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
608 "not vectorized: unsupported mask\n");
609 return false;
612 else
614 tree rhs;
615 ssa_op_iter iter;
616 gimple *def_stmt;
617 enum vect_def_type dt;
619 FOR_EACH_SSA_TREE_OPERAND (rhs, stmt, iter, SSA_OP_USE)
621 if (!vect_is_simple_use (rhs, mask_producers[i]->vinfo,
622 &def_stmt, &dt, &vectype))
624 if (dump_enabled_p ())
626 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
627 "not vectorized: can't compute mask type "
628 "for statement, ");
629 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt,
632 return false;
635 /* No vectype probably means external definition.
636 Allow it in case there is another operand which
637 allows us to determine the mask type. */
638 if (!vectype)
639 continue;
641 if (!mask_type)
642 mask_type = vectype;
643 else if (maybe_ne (TYPE_VECTOR_SUBPARTS (mask_type),
644 TYPE_VECTOR_SUBPARTS (vectype)))
646 if (dump_enabled_p ())
648 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
649 "not vectorized: different sized masks "
650 "types in statement, ");
651 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
652 mask_type);
653 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
654 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
655 vectype);
656 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
658 return false;
660 else if (VECTOR_BOOLEAN_TYPE_P (mask_type)
661 != VECTOR_BOOLEAN_TYPE_P (vectype))
663 if (dump_enabled_p ())
665 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
666 "not vectorized: mixed mask and "
667 "nonmask vector types in statement, ");
668 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
669 mask_type);
670 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
671 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
672 vectype);
673 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
675 return false;
679 /* We may compare a boolean value loaded as a vector of integers.
680 Fix mask_type in such a case. */
681 if (mask_type
682 && !VECTOR_BOOLEAN_TYPE_P (mask_type)
683 && gimple_code (stmt) == GIMPLE_ASSIGN
684 && TREE_CODE_CLASS (gimple_assign_rhs_code (stmt)) == tcc_comparison)
685 mask_type = build_same_sized_truth_vector_type (mask_type);
688 /* No mask_type should mean loop invariant predicate.
689 This is probably a subject for optimization in
690 if-conversion. */
691 if (!mask_type)
693 if (dump_enabled_p ())
695 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
696 "not vectorized: can't compute mask type "
697 "for statement, ");
698 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt,
701 return false;
704 STMT_VINFO_VECTYPE (mask_producers[i]) = mask_type;
707 return true;
711 /* Function vect_is_simple_iv_evolution.
713 FORNOW: A simple evolution of an induction variable in the loop is
714 considered a polynomial evolution. */
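/* For example (an illustrative sketch, not part of the original comment):
 for the induction

 for (i = 0; i < n; i++)
 x = x + 4;

 scev describes x by the access function {x_0, +, 4}_loop; its evolution
 part in this loop is the INTEGER_CST 4 and its initial condition is x_0,
 which is exactly the simple (degree-1) shape accepted below and returned
 through *INIT and *STEP. */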
716 static bool
717 vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init,
718 tree * step)
720 tree init_expr;
721 tree step_expr;
722 tree evolution_part = evolution_part_in_loop_num (access_fn, loop_nb);
723 basic_block bb;
725 /* When there is no evolution in this loop, the evolution function
726 is not "simple". */
727 if (evolution_part == NULL_TREE)
728 return false;
730 /* When the evolution is a polynomial of degree >= 2
731 the evolution function is not "simple". */
732 if (tree_is_chrec (evolution_part))
733 return false;
735 step_expr = evolution_part;
736 init_expr = unshare_expr (initial_condition_in_loop_num (access_fn, loop_nb));
738 if (dump_enabled_p ())
740 dump_printf_loc (MSG_NOTE, vect_location, "step: ");
741 dump_generic_expr (MSG_NOTE, TDF_SLIM, step_expr);
742 dump_printf (MSG_NOTE, ", init: ");
743 dump_generic_expr (MSG_NOTE, TDF_SLIM, init_expr);
744 dump_printf (MSG_NOTE, "\n");
747 *init = init_expr;
748 *step = step_expr;
750 if (TREE_CODE (step_expr) != INTEGER_CST
751 && (TREE_CODE (step_expr) != SSA_NAME
752 || ((bb = gimple_bb (SSA_NAME_DEF_STMT (step_expr)))
753 && flow_bb_inside_loop_p (get_loop (cfun, loop_nb), bb))
754 || (!INTEGRAL_TYPE_P (TREE_TYPE (step_expr))
755 && (!SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr))
756 || !flag_associative_math)))
757 && (TREE_CODE (step_expr) != REAL_CST
758 || !flag_associative_math))
760 if (dump_enabled_p ())
761 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
762 "step unknown.\n");
763 return false;
766 return true;
769 /* Function vect_analyze_scalar_cycles_1.
771 Examine the cross iteration def-use cycles of scalar variables
772 in LOOP. LOOP_VINFO represents the loop that is now being
773 considered for vectorization (can be LOOP, or an outer-loop
774 enclosing LOOP). */
776 static void
777 vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, struct loop *loop)
779 basic_block bb = loop->header;
780 tree init, step;
781 auto_vec<gimple *, 64> worklist;
782 gphi_iterator gsi;
783 bool double_reduc;
785 if (dump_enabled_p ())
786 dump_printf_loc (MSG_NOTE, vect_location,
787 "=== vect_analyze_scalar_cycles ===\n");
789 /* First - identify all inductions. Reduction detection assumes that all the
790 inductions have been identified, therefore, this order must not be
791 changed. */
792 for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
794 gphi *phi = gsi.phi ();
795 tree access_fn = NULL;
796 tree def = PHI_RESULT (phi);
797 stmt_vec_info stmt_vinfo = vinfo_for_stmt (phi);
799 if (dump_enabled_p ())
801 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: ");
802 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
805 /* Skip virtual phi's. The data dependences that are associated with
806 virtual defs/uses (i.e., memory accesses) are analyzed elsewhere. */
807 if (virtual_operand_p (def))
808 continue;
810 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_unknown_def_type;
812 /* Analyze the evolution function. */
813 access_fn = analyze_scalar_evolution (loop, def);
814 if (access_fn)
816 STRIP_NOPS (access_fn);
817 if (dump_enabled_p ())
819 dump_printf_loc (MSG_NOTE, vect_location,
820 "Access function of PHI: ");
821 dump_generic_expr (MSG_NOTE, TDF_SLIM, access_fn);
822 dump_printf (MSG_NOTE, "\n");
824 STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
825 = initial_condition_in_loop_num (access_fn, loop->num);
826 STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo)
827 = evolution_part_in_loop_num (access_fn, loop->num);
830 if (!access_fn
831 || !vect_is_simple_iv_evolution (loop->num, access_fn, &init, &step)
832 || (LOOP_VINFO_LOOP (loop_vinfo) != loop
833 && TREE_CODE (step) != INTEGER_CST))
835 worklist.safe_push (phi);
836 continue;
839 gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
840 != NULL_TREE);
841 gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo) != NULL_TREE);
843 if (dump_enabled_p ())
844 dump_printf_loc (MSG_NOTE, vect_location, "Detected induction.\n");
845 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_induction_def;
849 /* Second - identify all reductions and nested cycles. */
850 while (worklist.length () > 0)
852 gimple *phi = worklist.pop ();
853 tree def = PHI_RESULT (phi);
854 stmt_vec_info stmt_vinfo = vinfo_for_stmt (phi);
855 gimple *reduc_stmt;
857 if (dump_enabled_p ())
859 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: ");
860 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
863 gcc_assert (!virtual_operand_p (def)
864 && STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_unknown_def_type);
866 reduc_stmt = vect_force_simple_reduction (loop_vinfo, phi,
867 &double_reduc, false);
868 if (reduc_stmt)
870 if (double_reduc)
872 if (dump_enabled_p ())
873 dump_printf_loc (MSG_NOTE, vect_location,
874 "Detected double reduction.\n");
876 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_double_reduction_def;
877 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
878 vect_double_reduction_def;
880 else
882 if (loop != LOOP_VINFO_LOOP (loop_vinfo))
884 if (dump_enabled_p ())
885 dump_printf_loc (MSG_NOTE, vect_location,
886 "Detected vectorizable nested cycle.\n");
888 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_nested_cycle;
889 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
890 vect_nested_cycle;
892 else
894 if (dump_enabled_p ())
895 dump_printf_loc (MSG_NOTE, vect_location,
896 "Detected reduction.\n");
898 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def;
899 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
900 vect_reduction_def;
901 /* Store the reduction cycles for possible vectorization in
902 loop-aware SLP if it was not detected as reduction
903 chain. */
904 if (! GROUP_FIRST_ELEMENT (vinfo_for_stmt (reduc_stmt)))
905 LOOP_VINFO_REDUCTIONS (loop_vinfo).safe_push (reduc_stmt);
909 else
910 if (dump_enabled_p ())
911 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
912 "Unknown def-use cycle pattern.\n");
917 /* Function vect_analyze_scalar_cycles.
919 Examine the cross iteration def-use cycles of scalar variables, by
920 analyzing the loop-header PHIs of scalar variables. Classify each
921 cycle as one of the following: invariant, induction, reduction, unknown.
922 We do that for the loop represented by LOOP_VINFO, and also for its
923 inner loop, if it exists.
924 Examples for scalar cycles:
926 Example1: reduction:
928 loop1:
929 for (i=0; i<N; i++)
930 sum += a[i];
932 Example2: induction:
934 loop2:
935 for (i=0; i<N; i++)
936 a[i] = i; */
938 static void
939 vect_analyze_scalar_cycles (loop_vec_info loop_vinfo)
941 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
943 vect_analyze_scalar_cycles_1 (loop_vinfo, loop);
945 /* When vectorizing an outer-loop, the inner-loop is executed sequentially.
946 Reductions in such inner-loop therefore have different properties than
947 the reductions in the nest that gets vectorized:
948 1. When vectorized, they are executed in the same order as in the original
949 scalar loop, so we can't change the order of computation when
950 vectorizing them.
951 2. FIXME: Inner-loop reductions can be used in the inner-loop, so the
952 current checks are too strict. */
954 if (loop->inner)
955 vect_analyze_scalar_cycles_1 (loop_vinfo, loop->inner);
958 /* Transfer group and reduction information from STMT to its pattern stmt. */
960 static void
961 vect_fixup_reduc_chain (gimple *stmt)
963 gimple *firstp = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt));
964 gimple *stmtp;
965 gcc_assert (!GROUP_FIRST_ELEMENT (vinfo_for_stmt (firstp))
966 && GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)));
967 GROUP_SIZE (vinfo_for_stmt (firstp)) = GROUP_SIZE (vinfo_for_stmt (stmt));
970 stmtp = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt));
971 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmtp)) = firstp;
972 stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmt));
973 if (stmt)
974 GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmtp))
975 = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt));
977 while (stmt);
978 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmtp)) = vect_reduction_def;
981 /* Fixup scalar cycles that now have their stmts detected as patterns. */
983 static void
984 vect_fixup_scalar_cycles_with_patterns (loop_vec_info loop_vinfo)
986 gimple *first;
987 unsigned i;
989 FOR_EACH_VEC_ELT (LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo), i, first)
990 if (STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (first)))
992 gimple *next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (first));
993 while (next)
995 if (! STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (next)))
996 break;
997 next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
999 /* If not all stmts in the chain are patterns, try to handle
1000 the chain without patterns. */
1001 if (! next)
1003 vect_fixup_reduc_chain (first);
1004 LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo)[i]
1005 = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (first));
1010 /* Function vect_get_loop_niters.
1012 Determine how many iterations the loop is executed and place it
1013 in NUMBER_OF_ITERATIONS. Place the number of latch iterations
1014 in NUMBER_OF_ITERATIONSM1. Place the condition under which the
1015 niter information holds in ASSUMPTIONS.
1017 Return the loop exit condition. */
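/* For instance (an illustrative sketch): for

 for (i = 0; i < n; i++)
 ...

 with n known to be positive, the latch runs n - 1 times, so
 *NUMBER_OF_ITERATIONSM1 is n - 1 and *NUMBER_OF_ITERATIONS, the number
 of header executions computed below as NITERSM1 + 1, is n. */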
1020 static gcond *
1021 vect_get_loop_niters (struct loop *loop, tree *assumptions,
1022 tree *number_of_iterations, tree *number_of_iterationsm1)
1024 edge exit = single_exit (loop);
1025 struct tree_niter_desc niter_desc;
1026 tree niter_assumptions, niter, may_be_zero;
1027 gcond *cond = get_loop_exit_condition (loop);
1029 *assumptions = boolean_true_node;
1030 *number_of_iterationsm1 = chrec_dont_know;
1031 *number_of_iterations = chrec_dont_know;
1032 if (dump_enabled_p ())
1033 dump_printf_loc (MSG_NOTE, vect_location,
1034 "=== get_loop_niters ===\n");
1036 if (!exit)
1037 return cond;
1039 niter = chrec_dont_know;
1040 may_be_zero = NULL_TREE;
1041 niter_assumptions = boolean_true_node;
1042 if (!number_of_iterations_exit_assumptions (loop, exit, &niter_desc, NULL)
1043 || chrec_contains_undetermined (niter_desc.niter))
1044 return cond;
1046 niter_assumptions = niter_desc.assumptions;
1047 may_be_zero = niter_desc.may_be_zero;
1048 niter = niter_desc.niter;
1050 if (may_be_zero && integer_zerop (may_be_zero))
1051 may_be_zero = NULL_TREE;
1053 if (may_be_zero)
1055 if (COMPARISON_CLASS_P (may_be_zero))
1057 /* Try to combine may_be_zero with assumptions, this can simplify
1058 computation of niter expression. */
1059 if (niter_assumptions && !integer_nonzerop (niter_assumptions))
1060 niter_assumptions = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
1061 niter_assumptions,
1062 fold_build1 (TRUTH_NOT_EXPR,
1063 boolean_type_node,
1064 may_be_zero));
1065 else
1066 niter = fold_build3 (COND_EXPR, TREE_TYPE (niter), may_be_zero,
1067 build_int_cst (TREE_TYPE (niter), 0),
1068 rewrite_to_non_trapping_overflow (niter));
1070 may_be_zero = NULL_TREE;
1072 else if (integer_nonzerop (may_be_zero))
1074 *number_of_iterationsm1 = build_int_cst (TREE_TYPE (niter), 0);
1075 *number_of_iterations = build_int_cst (TREE_TYPE (niter), 1);
1076 return cond;
1078 else
1079 return cond;
1082 *assumptions = niter_assumptions;
1083 *number_of_iterationsm1 = niter;
1085 /* We want the number of loop header executions which is the number
1086 of latch executions plus one.
1087 ??? For UINT_MAX latch executions this number overflows to zero
1088 for loops like do { n++; } while (n != 0); */
1089 if (niter && !chrec_contains_undetermined (niter))
1090 niter = fold_build2 (PLUS_EXPR, TREE_TYPE (niter), unshare_expr (niter),
1091 build_int_cst (TREE_TYPE (niter), 1));
1092 *number_of_iterations = niter;
1094 return cond;
1097 /* Function bb_in_loop_p
1099 Used as predicate for dfs order traversal of the loop bbs. */
1101 static bool
1102 bb_in_loop_p (const_basic_block bb, const void *data)
1104 const struct loop *const loop = (const struct loop *)data;
1105 if (flow_bb_inside_loop_p (loop, bb))
1106 return true;
1107 return false;
1111 /* Create and initialize a new loop_vec_info struct for LOOP_IN, as well as
1112 stmt_vec_info structs for all the stmts in LOOP_IN. */
1114 _loop_vec_info::_loop_vec_info (struct loop *loop_in)
1115 : vec_info (vec_info::loop, init_cost (loop_in)),
1116 loop (loop_in),
1117 bbs (XCNEWVEC (basic_block, loop->num_nodes)),
1118 num_itersm1 (NULL_TREE),
1119 num_iters (NULL_TREE),
1120 num_iters_unchanged (NULL_TREE),
1121 num_iters_assumptions (NULL_TREE),
1122 th (0),
1123 versioning_threshold (0),
1124 vectorization_factor (0),
1125 max_vectorization_factor (0),
1126 mask_skip_niters (NULL_TREE),
1127 mask_compare_type (NULL_TREE),
1128 unaligned_dr (NULL),
1129 peeling_for_alignment (0),
1130 ptr_mask (0),
1131 slp_unrolling_factor (1),
1132 single_scalar_iteration_cost (0),
1133 vectorizable (false),
1134 can_fully_mask_p (true),
1135 fully_masked_p (false),
1136 peeling_for_gaps (false),
1137 peeling_for_niter (false),
1138 operands_swapped (false),
1139 no_data_dependencies (false),
1140 has_mask_store (false),
1141 scalar_loop (NULL),
1142 orig_loop_info (NULL)
1144 /* Create/Update stmt_info for all stmts in the loop. */
1145 basic_block *body = get_loop_body (loop);
1146 for (unsigned int i = 0; i < loop->num_nodes; i++)
1148 basic_block bb = body[i];
1149 gimple_stmt_iterator si;
1151 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
1153 gimple *phi = gsi_stmt (si);
1154 gimple_set_uid (phi, 0);
1155 set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, this));
1158 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1160 gimple *stmt = gsi_stmt (si);
1161 gimple_set_uid (stmt, 0);
1162 set_vinfo_for_stmt (stmt, new_stmt_vec_info (stmt, this));
1165 free (body);
1167 /* CHECKME: We want to visit all BBs before their successors (except for
1168 latch blocks, for which this assertion wouldn't hold). In the simple
1169 case of the loop forms we allow, a dfs order of the BBs would be the same
1170 as reversed postorder traversal, so we are safe. */
1172 unsigned int nbbs = dfs_enumerate_from (loop->header, 0, bb_in_loop_p,
1173 bbs, loop->num_nodes, loop);
1174 gcc_assert (nbbs == loop->num_nodes);
1177 /* Free all levels of MASKS. */
1179 void
1180 release_vec_loop_masks (vec_loop_masks *masks)
1182 rgroup_masks *rgm;
1183 unsigned int i;
1184 FOR_EACH_VEC_ELT (*masks, i, rgm)
1185 rgm->masks.release ();
1186 masks->release ();
1189 /* Free all memory used by the _loop_vec_info, as well as all the
1190 stmt_vec_info structs of all the stmts in the loop. */
1192 _loop_vec_info::~_loop_vec_info ()
1194 int nbbs;
1195 gimple_stmt_iterator si;
1196 int j;
1198 nbbs = loop->num_nodes;
1199 for (j = 0; j < nbbs; j++)
1201 basic_block bb = bbs[j];
1202 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
1203 free_stmt_vec_info (gsi_stmt (si));
1205 for (si = gsi_start_bb (bb); !gsi_end_p (si); )
1207 gimple *stmt = gsi_stmt (si);
1209 /* We may have broken canonical form by moving a constant
1210 into RHS1 of a commutative op. Fix such occurrences. */
1211 if (operands_swapped && is_gimple_assign (stmt))
1213 enum tree_code code = gimple_assign_rhs_code (stmt);
1215 if ((code == PLUS_EXPR
1216 || code == POINTER_PLUS_EXPR
1217 || code == MULT_EXPR)
1218 && CONSTANT_CLASS_P (gimple_assign_rhs1 (stmt)))
1219 swap_ssa_operands (stmt,
1220 gimple_assign_rhs1_ptr (stmt),
1221 gimple_assign_rhs2_ptr (stmt));
1222 else if (code == COND_EXPR
1223 && CONSTANT_CLASS_P (gimple_assign_rhs2 (stmt)))
1225 tree cond_expr = gimple_assign_rhs1 (stmt);
1226 enum tree_code cond_code = TREE_CODE (cond_expr);
1228 if (TREE_CODE_CLASS (cond_code) == tcc_comparison)
1230 bool honor_nans = HONOR_NANS (TREE_OPERAND (cond_expr,
1231 0));
1232 cond_code = invert_tree_comparison (cond_code,
1233 honor_nans);
1234 if (cond_code != ERROR_MARK)
1236 TREE_SET_CODE (cond_expr, cond_code);
1237 swap_ssa_operands (stmt,
1238 gimple_assign_rhs2_ptr (stmt),
1239 gimple_assign_rhs3_ptr (stmt));
1245 /* Free stmt_vec_info. */
1246 free_stmt_vec_info (stmt);
1247 gsi_next (&si);
1251 free (bbs);
1253 release_vec_loop_masks (&masks);
1255 loop->aux = NULL;
1258 /* Return true if we can use CMP_TYPE as the comparison type to produce
1259 all masks required to mask LOOP_VINFO. */
1261 static bool
1262 can_produce_all_loop_masks_p (loop_vec_info loop_vinfo, tree cmp_type)
1264 rgroup_masks *rgm;
1265 unsigned int i;
1266 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), i, rgm)
1267 if (rgm->mask_type != NULL_TREE
1268 && !direct_internal_fn_supported_p (IFN_WHILE_ULT,
1269 cmp_type, rgm->mask_type,
1270 OPTIMIZE_FOR_SPEED))
1271 return false;
1272 return true;
1275 /* Calculate the maximum number of scalars per iteration for every
1276 rgroup in LOOP_VINFO. */
1278 static unsigned int
1279 vect_get_max_nscalars_per_iter (loop_vec_info loop_vinfo)
1281 unsigned int res = 1;
1282 unsigned int i;
1283 rgroup_masks *rgm;
1284 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), i, rgm)
1285 res = MAX (res, rgm->max_nscalars_per_iter);
1286 return res;
1289 /* Each statement in LOOP_VINFO can be masked where necessary. Check
1290 whether we can actually generate the masks required. Return true if so,
1291 storing the type of the scalar IV in LOOP_VINFO_MASK_COMPARE_TYPE. */
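/* A worked example (illustrative only): if the niter type is 32 bits but
 max_loop_iterations proves at most 999 latch iterations, max_ni is
 min (2^32, 999 + 1) = 1000; an rgroup with 2 scalars per iteration
 scales this to 2000, which needs min_ni_width = 11 bits, so any integer
 mode of at least 11 bits for which the target supports WHILE_ULT can
 serve as the comparison type. */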
1293 static bool
1294 vect_verify_full_masking (loop_vec_info loop_vinfo)
1296 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1297 unsigned int min_ni_width;
1299 /* Use a normal loop if there are no statements that need masking.
1300 This only happens in rare degenerate cases: it means that the loop
1301 has no loads, no stores, and no live-out values. */
1302 if (LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
1303 return false;
1305 /* Get the maximum number of iterations that is representable
1306 in the counter type. */
1307 tree ni_type = TREE_TYPE (LOOP_VINFO_NITERSM1 (loop_vinfo));
1308 widest_int max_ni = wi::to_widest (TYPE_MAX_VALUE (ni_type)) + 1;
1310 /* Get a more refined estimate for the number of iterations. */
1311 widest_int max_back_edges;
1312 if (max_loop_iterations (loop, &max_back_edges))
1313 max_ni = wi::smin (max_ni, max_back_edges + 1);
1315 /* Account for rgroup masks, in which each bit is replicated N times. */
1316 max_ni *= vect_get_max_nscalars_per_iter (loop_vinfo);
1318 /* Work out how many bits we need to represent the limit. */
1319 min_ni_width = wi::min_precision (max_ni, UNSIGNED);
1321 /* Find a scalar mode for which WHILE_ULT is supported. */
1322 opt_scalar_int_mode cmp_mode_iter;
1323 tree cmp_type = NULL_TREE;
1324 FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)
1326 unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ());
1327 if (cmp_bits >= min_ni_width
1328 && targetm.scalar_mode_supported_p (cmp_mode_iter.require ()))
1330 tree this_type = build_nonstandard_integer_type (cmp_bits, true);
1331 if (this_type
1332 && can_produce_all_loop_masks_p (loop_vinfo, this_type))
1334 /* Although we could stop as soon as we find a valid mode,
1335 it's often better to continue until we hit Pmode, since the
1336 operands to the WHILE are more likely to be reusable in
1337 address calculations. */
1338 cmp_type = this_type;
1339 if (cmp_bits >= GET_MODE_BITSIZE (Pmode))
1340 break;
1345 if (!cmp_type)
1346 return false;
1348 LOOP_VINFO_MASK_COMPARE_TYPE (loop_vinfo) = cmp_type;
1349 return true;
1352 /* Calculate the cost of one scalar iteration of the loop. */
1353 static void
1354 vect_compute_single_scalar_iteration_cost (loop_vec_info loop_vinfo)
1356 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1357 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1358 int nbbs = loop->num_nodes, factor, scalar_single_iter_cost = 0;
1359 int innerloop_iters, i;
1361 /* Count statements in scalar loop. Using this as scalar cost for a single
1362 iteration for now.
1364 TODO: Add outer loop support.
1366 TODO: Consider assigning different costs to different scalar
1367 statements. */
1369 /* FORNOW. */
1370 innerloop_iters = 1;
1371 if (loop->inner)
1372 innerloop_iters = 50; /* FIXME */
1374 for (i = 0; i < nbbs; i++)
1376 gimple_stmt_iterator si;
1377 basic_block bb = bbs[i];
1379 if (bb->loop_father == loop->inner)
1380 factor = innerloop_iters;
1381 else
1382 factor = 1;
1384 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1386 gimple *stmt = gsi_stmt (si);
1387 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1389 if (!is_gimple_assign (stmt) && !is_gimple_call (stmt))
1390 continue;
1392 /* Skip stmts that are not vectorized inside the loop. */
1393 if (stmt_info
1394 && !STMT_VINFO_RELEVANT_P (stmt_info)
1395 && (!STMT_VINFO_LIVE_P (stmt_info)
1396 || !VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1397 && !STMT_VINFO_IN_PATTERN_P (stmt_info))
1398 continue;
1400 vect_cost_for_stmt kind;
1401 if (STMT_VINFO_DATA_REF (stmt_info))
1403 if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
1404 kind = scalar_load;
1405 else
1406 kind = scalar_store;
1408 else
1409 kind = scalar_stmt;
1411 scalar_single_iter_cost
1412 += record_stmt_cost (&LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1413 factor, kind, stmt_info, 0, vect_prologue);
1416 LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo)
1417 = scalar_single_iter_cost;
1421 /* Function vect_analyze_loop_form_1.
1423 Verify that certain CFG restrictions hold, including:
1424 - the loop has a pre-header
1425 - the loop has a single entry and exit
1426 - the loop exit condition is simple enough
1427 - the number of iterations can be analyzed, i.e., a countable loop. The
1428 niter could be analyzed under some assumptions. */
1430 bool
1431 vect_analyze_loop_form_1 (struct loop *loop, gcond **loop_cond,
1432 tree *assumptions, tree *number_of_iterationsm1,
1433 tree *number_of_iterations, gcond **inner_loop_cond)
1435 if (dump_enabled_p ())
1436 dump_printf_loc (MSG_NOTE, vect_location,
1437 "=== vect_analyze_loop_form ===\n");
1439 /* Different restrictions apply when we are considering an inner-most loop,
1440 vs. an outer (nested) loop.
1441 (FORNOW. May want to relax some of these restrictions in the future). */
1443 if (!loop->inner)
1445 /* Inner-most loop. We currently require that the number of BBs is
1446 exactly 2 (the header and latch). Vectorizable inner-most loops
1447 look like this:
1449 (pre-header)
1451 header <--------+
1452 | | |
1453 | +--> latch --+
1455 (exit-bb) */
1457 if (loop->num_nodes != 2)
1459 if (dump_enabled_p ())
1460 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1461 "not vectorized: control flow in loop.\n");
1462 return false;
1465 if (empty_block_p (loop->header))
1467 if (dump_enabled_p ())
1468 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1469 "not vectorized: empty loop.\n");
1470 return false;
1473 else
1475 struct loop *innerloop = loop->inner;
1476 edge entryedge;
1478 /* Nested loop. We currently require that the loop is doubly-nested,
1479 contains a single inner loop, and the number of BBs is exactly 5.
1480 Vectorizable outer-loops look like this:
1482 (pre-header)
1484 header <---+
1486 inner-loop |
1488 tail ------+
1490 (exit-bb)
1492 The inner-loop has the properties expected of inner-most loops
1493 as described above. */
1495 if ((loop->inner)->inner || (loop->inner)->next)
1497 if (dump_enabled_p ())
1498 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1499 "not vectorized: multiple nested loops.\n");
1500 return false;
1503 if (loop->num_nodes != 5)
1505 if (dump_enabled_p ())
1506 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1507 "not vectorized: control flow in loop.\n");
1508 return false;
1511 entryedge = loop_preheader_edge (innerloop);
1512 if (entryedge->src != loop->header
1513 || !single_exit (innerloop)
1514 || single_exit (innerloop)->dest != EDGE_PRED (loop->latch, 0)->src)
1516 if (dump_enabled_p ())
1517 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1518 "not vectorized: unsupported outerloop form.\n");
1519 return false;
1522 /* Analyze the inner-loop. */
1523 tree inner_niterm1, inner_niter, inner_assumptions;
1524 if (! vect_analyze_loop_form_1 (loop->inner, inner_loop_cond,
1525 &inner_assumptions, &inner_niterm1,
1526 &inner_niter, NULL)
1527 /* Don't support analyzing niter under assumptions for inner
1528 loop. */
1529 || !integer_onep (inner_assumptions))
1531 if (dump_enabled_p ())
1532 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1533 "not vectorized: Bad inner loop.\n");
1534 return false;
1537 if (!expr_invariant_in_loop_p (loop, inner_niter))
1539 if (dump_enabled_p ())
1540 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1541 "not vectorized: inner-loop count not"
1542 " invariant.\n");
1543 return false;
1546 if (dump_enabled_p ())
1547 dump_printf_loc (MSG_NOTE, vect_location,
1548 "Considering outer-loop vectorization.\n");
1551 if (!single_exit (loop)
1552 || EDGE_COUNT (loop->header->preds) != 2)
1554 if (dump_enabled_p ())
1556 if (!single_exit (loop))
1557 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1558 "not vectorized: multiple exits.\n");
1559 else if (EDGE_COUNT (loop->header->preds) != 2)
1560 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1561 "not vectorized: too many incoming edges.\n");
1563 return false;
1566 /* We assume that the loop exit condition is at the end of the loop, i.e.,
1567 that the loop is represented as a do-while (with a proper if-guard
1568 before the loop if needed), where the loop header contains all the
1569 executable statements, and the latch is empty. */
1570 if (!empty_block_p (loop->latch)
1571 || !gimple_seq_empty_p (phi_nodes (loop->latch)))
1573 if (dump_enabled_p ())
1574 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1575 "not vectorized: latch block not empty.\n");
1576 return false;
1579 /* Make sure the exit is not abnormal. */
1580 edge e = single_exit (loop);
1581 if (e->flags & EDGE_ABNORMAL)
1583 if (dump_enabled_p ())
1584 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1585 "not vectorized: abnormal loop exit edge.\n");
1586 return false;
1589 *loop_cond = vect_get_loop_niters (loop, assumptions, number_of_iterations,
1590 number_of_iterationsm1);
1591 if (!*loop_cond)
1593 if (dump_enabled_p ())
1594 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1595 "not vectorized: complicated exit condition.\n");
1596 return false;
1599 if (integer_zerop (*assumptions)
1600 || !*number_of_iterations
1601 || chrec_contains_undetermined (*number_of_iterations))
1603 if (dump_enabled_p ())
1604 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1605 "not vectorized: number of iterations cannot be "
1606 "computed.\n");
1607 return false;
1610 if (integer_zerop (*number_of_iterations))
1612 if (dump_enabled_p ())
1613 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1614 "not vectorized: number of iterations = 0.\n");
1615 return false;
1618 return true;
1621 /* Analyze LOOP form and return a loop_vec_info if it is of suitable form. */
1623 loop_vec_info
1624 vect_analyze_loop_form (struct loop *loop)
1626 tree assumptions, number_of_iterations, number_of_iterationsm1;
1627 gcond *loop_cond, *inner_loop_cond = NULL;
1629 if (! vect_analyze_loop_form_1 (loop, &loop_cond,
1630 &assumptions, &number_of_iterationsm1,
1631 &number_of_iterations, &inner_loop_cond))
1632 return NULL;
1634 loop_vec_info loop_vinfo = new _loop_vec_info (loop);
1635 LOOP_VINFO_NITERSM1 (loop_vinfo) = number_of_iterationsm1;
1636 LOOP_VINFO_NITERS (loop_vinfo) = number_of_iterations;
1637 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = number_of_iterations;
1638 if (!integer_onep (assumptions))
1640 /* We consider vectorizing this loop by versioning it under
1641 some assumptions. In order to do this, we need to clear
1642 existing information computed by scev and niter analyzer. */
1643 scev_reset_htab ();
1644 free_numbers_of_iterations_estimates (loop);
1645 /* Also set flag for this loop so that following scev and niter
1646 analysis are done under the assumptions. */
1647 loop_constraint_set (loop, LOOP_C_FINITE);
1648 /* Also record the assumptions for versioning. */
1649 LOOP_VINFO_NITERS_ASSUMPTIONS (loop_vinfo) = assumptions;
1652 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1654 if (dump_enabled_p ())
1656 dump_printf_loc (MSG_NOTE, vect_location,
1657 "Symbolic number of iterations is ");
1658 dump_generic_expr (MSG_NOTE, TDF_DETAILS, number_of_iterations);
1659 dump_printf (MSG_NOTE, "\n");
1663 STMT_VINFO_TYPE (vinfo_for_stmt (loop_cond)) = loop_exit_ctrl_vec_info_type;
1664 if (inner_loop_cond)
1665 STMT_VINFO_TYPE (vinfo_for_stmt (inner_loop_cond))
1666 = loop_exit_ctrl_vec_info_type;
1668 gcc_assert (!loop->aux);
1669 loop->aux = loop_vinfo;
1670 return loop_vinfo;
1675 /* Scan the loop stmts and, depending on whether there are any (non-)SLP
1676 statements, update the vectorization factor. */
1678 static void
1679 vect_update_vf_for_slp (loop_vec_info loop_vinfo)
1681 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1682 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1683 int nbbs = loop->num_nodes;
1684 poly_uint64 vectorization_factor;
1685 int i;
1687 if (dump_enabled_p ())
1688 dump_printf_loc (MSG_NOTE, vect_location,
1689 "=== vect_update_vf_for_slp ===\n");
1691 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1692 gcc_assert (known_ne (vectorization_factor, 0U));
1694 /* If all the stmts in the loop can be SLPed, we perform only SLP, and
1695 vectorization factor of the loop is the unrolling factor required by
1696 the SLP instances. If that unrolling factor is 1, we say that we
1697 perform pure SLP on the loop; cross-iteration parallelism is not
1698 exploited. */
1699 bool only_slp_in_loop = true;
1700 for (i = 0; i < nbbs; i++)
1702 basic_block bb = bbs[i];
1703 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1704 gsi_next (&si))
1706 gimple *stmt = gsi_stmt (si);
1707 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1708 if (STMT_VINFO_IN_PATTERN_P (stmt_info)
1709 && STMT_VINFO_RELATED_STMT (stmt_info))
1711 stmt = STMT_VINFO_RELATED_STMT (stmt_info);
1712 stmt_info = vinfo_for_stmt (stmt);
1714 if ((STMT_VINFO_RELEVANT_P (stmt_info)
1715 || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1716 && !PURE_SLP_STMT (stmt_info))
1717 /* STMT needs both SLP and loop-based vectorization. */
1718 only_slp_in_loop = false;
1722 if (only_slp_in_loop)
1724 dump_printf_loc (MSG_NOTE, vect_location,
1725 "Loop contains only SLP stmts\n");
1726 vectorization_factor = LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo);
1728 else
1730 dump_printf_loc (MSG_NOTE, vect_location,
1731 "Loop contains SLP and non-SLP stmts\n");
1732 /* Both the vectorization factor and unroll factor have the form
1733 current_vector_size * X for some rational X, so they must have
1734 a common multiple. */
1735 vectorization_factor
1736 = force_common_multiple (vectorization_factor,
1737 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo));
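 /* Illustrative example (not from the original source): a loop-based
 vectorization factor of 4 combined with an SLP unrolling factor of 6
 yields their least common multiple, 12. */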
1740 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
1741 if (dump_enabled_p ())
1743 dump_printf_loc (MSG_NOTE, vect_location,
1744 "Updating vectorization factor to ");
1745 dump_dec (MSG_NOTE, vectorization_factor);
1746 dump_printf (MSG_NOTE, ".\n");
1750 /* Return true if STMT_INFO describes a double reduction phi and if
1751 the other phi in the reduction is also relevant for vectorization.
1752 This rejects cases such as:
1754 outer1:
1755 x_1 = PHI <x_3(outer2), ...>;
1758 inner:
1759 x_2 = ...;
1762 outer2:
1763 x_3 = PHI <x_2(inner)>;
1765 if nothing in x_2 or elsewhere makes x_1 relevant. */
1767 static bool
1768 vect_active_double_reduction_p (stmt_vec_info stmt_info)
1770 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
1771 return false;
1773 gimple *other_phi = STMT_VINFO_REDUC_DEF (stmt_info);
1774 return STMT_VINFO_RELEVANT_P (vinfo_for_stmt (other_phi));
1777 /* Function vect_analyze_loop_operations.
1779 Scan the loop stmts and make sure they are all vectorizable. */
1781 static bool
1782 vect_analyze_loop_operations (loop_vec_info loop_vinfo)
1784 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1785 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1786 int nbbs = loop->num_nodes;
1787 int i;
1788 stmt_vec_info stmt_info;
1789 bool need_to_vectorize = false;
1790 bool ok;
1792 if (dump_enabled_p ())
1793 dump_printf_loc (MSG_NOTE, vect_location,
1794 "=== vect_analyze_loop_operations ===\n");
1796 for (i = 0; i < nbbs; i++)
1798 basic_block bb = bbs[i];
1800 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
1801 gsi_next (&si))
1803 gphi *phi = si.phi ();
1804 ok = true;
1806 stmt_info = vinfo_for_stmt (phi);
1807 if (dump_enabled_p ())
1809 dump_printf_loc (MSG_NOTE, vect_location, "examining phi: ");
1810 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
1812 if (virtual_operand_p (gimple_phi_result (phi)))
1813 continue;
1815 /* Inner-loop loop-closed exit phi in outer-loop vectorization
1816 (i.e., a phi in the tail of the outer-loop). */
1817 if (! is_loop_header_bb_p (bb))
1819 /* FORNOW: we currently don't support the case that these phis
1820 are not used in the outerloop (unless it is double reduction,
1821 i.e., this phi is vect_reduction_def), because this case
1822 requires us to actually do something here. */
1823 if (STMT_VINFO_LIVE_P (stmt_info)
1824 && !vect_active_double_reduction_p (stmt_info))
1826 if (dump_enabled_p ())
1827 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1828 "Unsupported loop-closed phi in "
1829 "outer-loop.\n");
1830 return false;
1833 /* If PHI is used in the outer loop, we check that its operand
1834 is defined in the inner loop. */
1835 if (STMT_VINFO_RELEVANT_P (stmt_info))
1837 tree phi_op;
1838 gimple *op_def_stmt;
1840 if (gimple_phi_num_args (phi) != 1)
1841 return false;
1843 phi_op = PHI_ARG_DEF (phi, 0);
1844 if (TREE_CODE (phi_op) != SSA_NAME)
1845 return false;
1847 op_def_stmt = SSA_NAME_DEF_STMT (phi_op);
1848 if (gimple_nop_p (op_def_stmt)
1849 || !flow_bb_inside_loop_p (loop, gimple_bb (op_def_stmt))
1850 || !vinfo_for_stmt (op_def_stmt))
1851 return false;
1853 if (STMT_VINFO_RELEVANT (vinfo_for_stmt (op_def_stmt))
1854 != vect_used_in_outer
1855 && STMT_VINFO_RELEVANT (vinfo_for_stmt (op_def_stmt))
1856 != vect_used_in_outer_by_reduction)
1857 return false;
1860 continue;
1863 gcc_assert (stmt_info);
1865 if ((STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope
1866 || STMT_VINFO_LIVE_P (stmt_info))
1867 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
1869 /* A scalar-dependence cycle that we don't support. */
1870 if (dump_enabled_p ())
1871 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1872 "not vectorized: scalar dependence cycle.\n");
1873 return false;
1876 if (STMT_VINFO_RELEVANT_P (stmt_info))
1878 need_to_vectorize = true;
1879 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
1880 && ! PURE_SLP_STMT (stmt_info))
1881 ok = vectorizable_induction (phi, NULL, NULL, NULL);
1882 else if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
1883 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
1884 && ! PURE_SLP_STMT (stmt_info))
1885 ok = vectorizable_reduction (phi, NULL, NULL, NULL, NULL);
1888 /* SLP PHIs are tested by vect_slp_analyze_node_operations. */
1889 if (ok
1890 && STMT_VINFO_LIVE_P (stmt_info)
1891 && !PURE_SLP_STMT (stmt_info))
1892 ok = vectorizable_live_operation (phi, NULL, NULL, -1, NULL);
1894 if (!ok)
1896 if (dump_enabled_p ())
1898 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1899 "not vectorized: relevant phi not "
1900 "supported: ");
1901 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, phi, 0);
1903 return false;
1907 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1908 gsi_next (&si))
1910 gimple *stmt = gsi_stmt (si);
1911 if (!gimple_clobber_p (stmt)
1912 && !vect_analyze_stmt (stmt, &need_to_vectorize, NULL, NULL))
1913 return false;
1915 } /* bbs */
1917 /* All operations in the loop are either irrelevant (deal with loop
1918 control, or dead), or only used outside the loop and can be moved
1919 out of the loop (e.g. invariants, inductions). The loop can be
1920 optimized away by scalar optimizations. We're better off not
1921 touching this loop. */
1922 if (!need_to_vectorize)
1924 if (dump_enabled_p ())
1925 dump_printf_loc (MSG_NOTE, vect_location,
1926 "All the computation can be taken out of the loop.\n");
1927 if (dump_enabled_p ())
1928 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1929 "not vectorized: redundant loop. no profit to "
1930 "vectorize.\n");
1931 return false;
1934 return true;
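/* Illustrative example only (array names are hypothetical, not taken
   from the surrounding code): the loop-closed-phi checks above matter
   when vectorizing an outer loop such as

     int a[N][M], b[N];
     for (int i = 0; i < N; i++)
       {
         int s = 0;
         for (int j = 0; j < M; j++)
           s += a[i][j];
         b[i] = s;
       }

   The value of S leaves the inner loop through a loop-closed phi that
   sits in the outer-loop body (a non-header block).  Such a phi is
   accepted only if it has a single argument defined in the inner loop
   and is used in the outer loop as vect_used_in_outer or
   vect_used_in_outer_by_reduction; otherwise the analysis gives up.  */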
1937 /* Analyze the cost of the loop described by LOOP_VINFO. Decide if it
1938 is worthwhile to vectorize. Return 1 if definitely yes, 0 if
1939 definitely no, or -1 if it's worth retrying. */
1941 static int
1942 vect_analyze_loop_costing (loop_vec_info loop_vinfo)
1944 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1945 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
1947 /* Only fully-masked loops can have iteration counts less than the
1948 vectorization factor. */
1949 if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
1951 HOST_WIDE_INT max_niter;
1953 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1954 max_niter = LOOP_VINFO_INT_NITERS (loop_vinfo);
1955 else
1956 max_niter = max_stmt_executions_int (loop);
1958 if (max_niter != -1
1959 && (unsigned HOST_WIDE_INT) max_niter < assumed_vf)
1961 if (dump_enabled_p ())
1962 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1963 "not vectorized: iteration count smaller than "
1964 "vectorization factor.\n");
1965 return 0;
1969 int min_profitable_iters, min_profitable_estimate;
1970 vect_estimate_min_profitable_iters (loop_vinfo, &min_profitable_iters,
1971 &min_profitable_estimate);
1973 if (min_profitable_iters < 0)
1975 if (dump_enabled_p ())
1976 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1977 "not vectorized: vectorization not profitable.\n");
1978 if (dump_enabled_p ())
1979 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1980 "not vectorized: vector version will never be "
1981 "profitable.\n");
1982 return -1;
1985 int min_scalar_loop_bound = (PARAM_VALUE (PARAM_MIN_VECT_LOOP_BOUND)
1986 * assumed_vf);
1988 /* Use the cost model only if it is more conservative than the
1989 user-specified threshold. */
1990 unsigned int th = (unsigned) MAX (min_scalar_loop_bound,
1991 min_profitable_iters);
1993 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = th;
1995 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1996 && LOOP_VINFO_INT_NITERS (loop_vinfo) < th)
1998 if (dump_enabled_p ())
1999 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2000 "not vectorized: vectorization not profitable.\n");
2001 if (dump_enabled_p ())
2002 dump_printf_loc (MSG_NOTE, vect_location,
2003 "not vectorized: iteration count smaller than user "
2004 "specified loop bound parameter or minimum profitable "
2005 "iterations (whichever is more conservative).\n");
2006 return 0;
2009 HOST_WIDE_INT estimated_niter = estimated_stmt_executions_int (loop);
2010 if (estimated_niter == -1)
2011 estimated_niter = likely_max_stmt_executions_int (loop);
2012 if (estimated_niter != -1
2013 && ((unsigned HOST_WIDE_INT) estimated_niter
2014 < MAX (th, (unsigned) min_profitable_estimate)))
2016 if (dump_enabled_p ())
2017 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2018 "not vectorized: estimated iteration count too "
2019 "small.\n");
2020 if (dump_enabled_p ())
2021 dump_printf_loc (MSG_NOTE, vect_location,
2022 "not vectorized: estimated iteration count smaller "
2023 "than specified loop bound parameter or minimum "
2024 "profitable iterations (whichever is more "
2025 "conservative).\n");
2026 return -1;
2029 return 1;
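/* Worked example with hypothetical numbers (not derived from any
   particular target): assume an assumed_vf of 4, a min-vect-loop-bound
   parameter of 0 and a computed min_profitable_iters of 7.  Then

     min_scalar_loop_bound = 0 * 4 = 0
     th = MAX (0, 7) = 7

   A loop whose iteration count is known to be below 7 is rejected
   outright (return 0), while a loop whose *estimated* iteration count
   is below MAX (th, min_profitable_estimate) is rejected with -1 so
   that the caller may retry, e.g. with a different vector size.  */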
2033 /* Function vect_analyze_loop_2.
2035 Apply a set of analyses on LOOP, and create a loop_vec_info struct
2036 for it. The different analyses will record information in the
2037 loop_vec_info struct. */
2038 static bool
2039 vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal)
2041 bool ok;
2042 int res;
2043 unsigned int max_vf = MAX_VECTORIZATION_FACTOR;
2044 poly_uint64 min_vf = 2;
2045 unsigned int n_stmts = 0;
2047 /* The first group of checks is independent of the vector size. */
2048 fatal = true;
2050 /* Find all data references in the loop (which correspond to vdefs/vuses)
2051 and analyze their evolution in the loop. */
2053 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
2055 loop_p loop = LOOP_VINFO_LOOP (loop_vinfo);
2056 if (!find_loop_nest (loop, &LOOP_VINFO_LOOP_NEST (loop_vinfo)))
2058 if (dump_enabled_p ())
2059 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2060 "not vectorized: loop nest containing two "
2061 "or more consecutive inner loops cannot be "
2062 "vectorized\n");
2063 return false;
2066 for (unsigned i = 0; i < loop->num_nodes; i++)
2067 for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
2068 !gsi_end_p (gsi); gsi_next (&gsi))
2070 gimple *stmt = gsi_stmt (gsi);
2071 if (is_gimple_debug (stmt))
2072 continue;
2073 ++n_stmts;
2074 if (!find_data_references_in_stmt (loop, stmt,
2075 &LOOP_VINFO_DATAREFS (loop_vinfo)))
2077 if (is_gimple_call (stmt) && loop->safelen)
2079 tree fndecl = gimple_call_fndecl (stmt), op;
2080 if (fndecl != NULL_TREE)
2082 cgraph_node *node = cgraph_node::get (fndecl);
2083 if (node != NULL && node->simd_clones != NULL)
2085 unsigned int j, n = gimple_call_num_args (stmt);
2086 for (j = 0; j < n; j++)
2088 op = gimple_call_arg (stmt, j);
2089 if (DECL_P (op)
2090 || (REFERENCE_CLASS_P (op)
2091 && get_base_address (op)))
2092 break;
2094 op = gimple_call_lhs (stmt);
2095 /* Ignore #pragma omp declare simd functions
2096 if they don't have data references in the
2097 call stmt itself. */
2098 if (j == n
2099 && !(op
2100 && (DECL_P (op)
2101 || (REFERENCE_CLASS_P (op)
2102 && get_base_address (op)))))
2103 continue;
2107 if (dump_enabled_p ())
2108 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2109 "not vectorized: loop contains function "
2110 "calls or data references that cannot "
2111 "be analyzed\n");
2112 return false;
2116 /* Analyze the data references and also adjust the minimal
2117 vectorization factor according to the loads and stores. */
2119 ok = vect_analyze_data_refs (loop_vinfo, &min_vf);
2120 if (!ok)
2122 if (dump_enabled_p ())
2123 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2124 "bad data references.\n");
2125 return false;
2128 /* Classify all cross-iteration scalar data-flow cycles.
2129 Cross-iteration cycles caused by virtual phis are analyzed separately. */
2130 vect_analyze_scalar_cycles (loop_vinfo);
2132 vect_pattern_recog (loop_vinfo);
2134 vect_fixup_scalar_cycles_with_patterns (loop_vinfo);
2136 /* Analyze the access patterns of the data-refs in the loop (consecutive,
2137 complex, etc.). FORNOW: Only handle consecutive access pattern. */
2139 ok = vect_analyze_data_ref_accesses (loop_vinfo);
2140 if (!ok)
2142 if (dump_enabled_p ())
2143 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2144 "bad data access.\n");
2145 return false;
2148 /* Data-flow analysis to detect stmts that do not need to be vectorized. */
2150 ok = vect_mark_stmts_to_be_vectorized (loop_vinfo);
2151 if (!ok)
2153 if (dump_enabled_p ())
2154 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2155 "unexpected pattern.\n");
2156 return false;
2159 /* The rest of the analysis below depends on the vector size in some way,
so from here on a failure is no longer fatal. */
2160 fatal = false;
2162 /* Analyze data dependences between the data-refs in the loop
2163 and adjust the maximum vectorization factor according to
2164 the dependences.
2165 FORNOW: fail at the first data dependence that we encounter. */
2167 ok = vect_analyze_data_ref_dependences (loop_vinfo, &max_vf);
2168 if (!ok
2169 || (max_vf != MAX_VECTORIZATION_FACTOR
2170 && maybe_lt (max_vf, min_vf)))
2172 if (dump_enabled_p ())
2173 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2174 "bad data dependence.\n");
2175 return false;
2177 LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo) = max_vf;
2179 ok = vect_determine_vectorization_factor (loop_vinfo);
2180 if (!ok)
2182 if (dump_enabled_p ())
2183 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2184 "can't determine vectorization factor.\n");
2185 return false;
2187 if (max_vf != MAX_VECTORIZATION_FACTOR
2188 && maybe_lt (max_vf, LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
2190 if (dump_enabled_p ())
2191 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2192 "bad data dependence.\n");
2193 return false;
2196 /* Compute the scalar iteration cost. */
2197 vect_compute_single_scalar_iteration_cost (loop_vinfo);
2199 poly_uint64 saved_vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2200 unsigned th;
2202 /* Check the SLP opportunities in the loop, analyze and build SLP trees. */
2203 ok = vect_analyze_slp (loop_vinfo, n_stmts);
2204 if (!ok)
2205 return false;
2207 /* If there are any SLP instances mark them as pure_slp. */
2208 bool slp = vect_make_slp_decision (loop_vinfo);
2209 if (slp)
2211 /* Find stmts that need to be both vectorized and SLPed. */
2212 vect_detect_hybrid_slp (loop_vinfo);
2214 /* Update the vectorization factor based on the SLP decision. */
2215 vect_update_vf_for_slp (loop_vinfo);
2218 bool saved_can_fully_mask_p = LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo);
2220 /* We don't expect to have to roll back to anything other than an empty
2221 set of rgroups. */
2222 gcc_assert (LOOP_VINFO_MASKS (loop_vinfo).is_empty ());
2224 /* This is the point where we can re-start analysis with SLP forced off. */
2225 start_over:
2227 /* Now the vectorization factor is final. */
2228 poly_uint64 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2229 gcc_assert (known_ne (vectorization_factor, 0U));
2231 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && dump_enabled_p ())
2233 dump_printf_loc (MSG_NOTE, vect_location,
2234 "vectorization_factor = ");
2235 dump_dec (MSG_NOTE, vectorization_factor);
2236 dump_printf (MSG_NOTE, ", niters = " HOST_WIDE_INT_PRINT_DEC "\n",
2237 LOOP_VINFO_INT_NITERS (loop_vinfo));
2240 HOST_WIDE_INT max_niter
2241 = likely_max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
2243 /* Analyze the alignment of the data-refs in the loop.
2244 Fail if a data reference is found that cannot be vectorized. */
2246 ok = vect_analyze_data_refs_alignment (loop_vinfo);
2247 if (!ok)
2249 if (dump_enabled_p ())
2250 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2251 "bad data alignment.\n");
2252 return false;
2255 /* Prune the list of ddrs to be tested at run-time by versioning for alias.
2256 It is important to call pruning after vect_analyze_data_ref_accesses,
2257 since we use grouping information gathered by interleaving analysis. */
2258 ok = vect_prune_runtime_alias_test_list (loop_vinfo);
2259 if (!ok)
2260 return false;
2262 /* Do not invoke vect_enhance_data_refs_alignment for epilogue
2263 vectorization. */
2264 if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2266 /* This pass will decide on using loop versioning and/or loop peeling in
2267 order to enhance the alignment of data references in the loop. */
2268 ok = vect_enhance_data_refs_alignment (loop_vinfo);
2269 if (!ok)
2271 if (dump_enabled_p ())
2272 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2273 "bad data alignment.\n");
2274 return false;
2278 if (slp)
2280 /* Analyze operations in the SLP instances. Note this may
2281 remove unsupported SLP instances which makes the above
2282 SLP kind detection invalid. */
2283 unsigned old_size = LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length ();
2284 vect_slp_analyze_operations (loop_vinfo);
2285 if (LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length () != old_size)
2286 goto again;
2289 /* Scan all the remaining operations in the loop that are not subject
2290 to SLP and make sure they are vectorizable. */
2291 ok = vect_analyze_loop_operations (loop_vinfo);
2292 if (!ok)
2294 if (dump_enabled_p ())
2295 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2296 "bad operation or unsupported loop bound.\n");
2297 return false;
2300 /* Decide whether to use a fully-masked loop for this vectorization
2301 factor. */
2302 LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
2303 = (LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo)
2304 && vect_verify_full_masking (loop_vinfo));
2305 if (dump_enabled_p ())
2307 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
2308 dump_printf_loc (MSG_NOTE, vect_location,
2309 "using a fully-masked loop.\n");
2310 else
2311 dump_printf_loc (MSG_NOTE, vect_location,
2312 "not using a fully-masked loop.\n");
2315 /* If an epilog loop is required because of data accesses with gaps,
2316 one additional iteration needs to be peeled. Check if there are
2317 enough iterations for vectorization. */
2318 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2319 && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2320 && !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
2322 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2323 tree scalar_niters = LOOP_VINFO_NITERSM1 (loop_vinfo);
2325 if (known_lt (wi::to_widest (scalar_niters), vf))
2327 if (dump_enabled_p ())
2328 dump_printf_loc (MSG_NOTE, vect_location,
2329 "loop has no enough iterations to support"
2330 " peeling for gaps.\n");
2331 return false;
2335 /* Check that the cost of the loop makes vectorizing worthwhile. */
2336 res = vect_analyze_loop_costing (loop_vinfo);
2337 if (res < 0)
2338 goto again;
2339 if (!res)
2341 if (dump_enabled_p ())
2342 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2343 "Loop costings not worthwhile.\n");
2344 return false;
2347 /* Decide whether we need to create an epilogue loop to handle
2348 remaining scalar iterations. */
2349 th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
2351 unsigned HOST_WIDE_INT const_vf;
2352 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
2353 /* The main loop handles all iterations. */
2354 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
2355 else if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2356 && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) > 0)
2358 if (!multiple_p (LOOP_VINFO_INT_NITERS (loop_vinfo)
2359 - LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo),
2360 LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
2361 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
2363 else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
2364 || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&const_vf)
2365 || ((tree_ctz (LOOP_VINFO_NITERS (loop_vinfo))
2366 < (unsigned) exact_log2 (const_vf))
2367 /* In case of versioning, check if the maximum number of
2368 iterations is greater than th. If they are identical,
2369 the epilogue is unnecessary. */
2370 && (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
2371 || ((unsigned HOST_WIDE_INT) max_niter
2372 > (th / const_vf) * const_vf))))
2373 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
2375 /* If an epilogue loop is required make sure we can create one. */
2376 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2377 || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo))
2379 if (dump_enabled_p ())
2380 dump_printf_loc (MSG_NOTE, vect_location, "epilog loop required\n");
2381 if (!vect_can_advance_ivs_p (loop_vinfo)
2382 || !slpeel_can_duplicate_loop_p (LOOP_VINFO_LOOP (loop_vinfo),
2383 single_exit (LOOP_VINFO_LOOP
2384 (loop_vinfo))))
2386 if (dump_enabled_p ())
2387 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2388 "not vectorized: can't create required "
2389 "epilog loop\n");
2390 goto again;
2394 /* During peeling, we need to check whether the number of loop iterations
2395 is enough for both the peeled prolog loop and the vector loop. This
2396 check can be merged with the threshold check of loop versioning, so
2397 increase the threshold for this case if necessary. */
2398 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
2400 poly_uint64 niters_th = 0;
2402 if (!vect_use_loop_mask_for_alignment_p (loop_vinfo))
2404 /* Niters for peeled prolog loop. */
2405 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
2407 struct data_reference *dr = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
2408 tree vectype
2409 = STMT_VINFO_VECTYPE (vinfo_for_stmt (DR_STMT (dr)));
2410 niters_th += TYPE_VECTOR_SUBPARTS (vectype) - 1;
2412 else
2413 niters_th += LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
2416 /* Niters for at least one iteration of vectorized loop. */
2417 if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
2418 niters_th += LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2419 /* One additional iteration because of peeling for gap. */
2420 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
2421 niters_th += 1;
2422 LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = niters_th;
2425 gcc_assert (known_eq (vectorization_factor,
2426 LOOP_VINFO_VECT_FACTOR (loop_vinfo)));
2428 /* Ok to vectorize! */
2429 return true;
2431 again:
2432 /* Try again with SLP forced off, but if we didn't do any SLP there is
2433 no point in re-trying. */
2434 if (!slp)
2435 return false;
2438 /* If there are reduction chains, re-trying will fail anyway. */
2438 if (! LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).is_empty ())
2439 return false;
2441 /* Likewise if the grouped loads or stores in the SLP cannot be handled
2442 via interleaving or lane instructions. */
2443 slp_instance instance;
2444 slp_tree node;
2445 unsigned i, j;
2446 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
2448 stmt_vec_info vinfo;
2449 vinfo = vinfo_for_stmt
2450 (SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (instance))[0]);
2451 if (! STMT_VINFO_GROUPED_ACCESS (vinfo))
2452 continue;
2453 vinfo = vinfo_for_stmt (STMT_VINFO_GROUP_FIRST_ELEMENT (vinfo));
2454 unsigned int size = STMT_VINFO_GROUP_SIZE (vinfo);
2455 tree vectype = STMT_VINFO_VECTYPE (vinfo);
2456 if (! vect_store_lanes_supported (vectype, size, false)
2457 && ! vect_grouped_store_supported (vectype, size))
2458 return false;
2459 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), j, node)
2461 vinfo = vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (node)[0]);
2462 vinfo = vinfo_for_stmt (STMT_VINFO_GROUP_FIRST_ELEMENT (vinfo));
2463 bool single_element_p = !STMT_VINFO_GROUP_NEXT_ELEMENT (vinfo);
2464 size = STMT_VINFO_GROUP_SIZE (vinfo);
2465 vectype = STMT_VINFO_VECTYPE (vinfo);
2466 if (! vect_load_lanes_supported (vectype, size, false)
2467 && ! vect_grouped_load_supported (vectype, single_element_p,
2468 size))
2469 return false;
2473 if (dump_enabled_p ())
2474 dump_printf_loc (MSG_NOTE, vect_location,
2475 "re-trying with SLP disabled\n");
2477 /* Roll back state appropriately. No SLP this time. */
2478 slp = false;
2479 /* Restore the vectorization factor as it was without SLP. */
2480 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = saved_vectorization_factor;
2481 /* Free the SLP instances. */
2482 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), j, instance)
2483 vect_free_slp_instance (instance);
2484 LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
2485 /* Reset SLP type to loop_vect on all stmts. */
2486 for (i = 0; i < LOOP_VINFO_LOOP (loop_vinfo)->num_nodes; ++i)
2488 basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
2489 for (gimple_stmt_iterator si = gsi_start_phis (bb);
2490 !gsi_end_p (si); gsi_next (&si))
2492 stmt_vec_info stmt_info = vinfo_for_stmt (gsi_stmt (si));
2493 STMT_SLP_TYPE (stmt_info) = loop_vect;
2495 for (gimple_stmt_iterator si = gsi_start_bb (bb);
2496 !gsi_end_p (si); gsi_next (&si))
2498 stmt_vec_info stmt_info = vinfo_for_stmt (gsi_stmt (si));
2499 STMT_SLP_TYPE (stmt_info) = loop_vect;
2500 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
2502 stmt_info = vinfo_for_stmt (STMT_VINFO_RELATED_STMT (stmt_info));
2503 STMT_SLP_TYPE (stmt_info) = loop_vect;
2504 for (gimple_stmt_iterator pi
2505 = gsi_start (STMT_VINFO_PATTERN_DEF_SEQ (stmt_info));
2506 !gsi_end_p (pi); gsi_next (&pi))
2508 gimple *pstmt = gsi_stmt (pi);
2509 STMT_SLP_TYPE (vinfo_for_stmt (pstmt)) = loop_vect;
2514 /* Free optimized alias test DDRS. */
2515 LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).truncate (0);
2516 LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).release ();
2517 LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).release ();
2518 /* Reset target cost data. */
2519 destroy_cost_data (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo));
2520 LOOP_VINFO_TARGET_COST_DATA (loop_vinfo)
2521 = init_cost (LOOP_VINFO_LOOP (loop_vinfo));
2522 /* Reset accumulated rgroup information. */
2523 release_vec_loop_masks (&LOOP_VINFO_MASKS (loop_vinfo));
2524 /* Reset assorted flags. */
2525 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
2526 LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = false;
2527 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = 0;
2528 LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = 0;
2529 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = saved_can_fully_mask_p;
2531 goto start_over;
2534 /* Function vect_analyze_loop.
2536 Apply a set of analyses on LOOP, and create a loop_vec_info struct
2537 for it. The different analyses will record information in the
2538 loop_vec_info struct. If ORIG_LOOP_VINFO is not NULL, the epilogue
2539 must be vectorized. */
2540 loop_vec_info
2541 vect_analyze_loop (struct loop *loop, loop_vec_info orig_loop_vinfo)
2543 loop_vec_info loop_vinfo;
2544 auto_vector_sizes vector_sizes;
2546 /* Autodetect first vector size we try. */
2547 current_vector_size = 0;
2548 targetm.vectorize.autovectorize_vector_sizes (&vector_sizes);
2549 unsigned int next_size = 0;
2551 if (dump_enabled_p ())
2552 dump_printf_loc (MSG_NOTE, vect_location,
2553 "===== analyze_loop_nest =====\n");
2555 if (loop_outer (loop)
2556 && loop_vec_info_for_loop (loop_outer (loop))
2557 && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop))))
2559 if (dump_enabled_p ())
2560 dump_printf_loc (MSG_NOTE, vect_location,
2561 "outer-loop already vectorized.\n");
2562 return NULL;
2565 poly_uint64 autodetected_vector_size = 0;
2566 while (1)
2568 /* Check the CFG characteristics of the loop (nesting, entry/exit). */
2569 loop_vinfo = vect_analyze_loop_form (loop);
2570 if (!loop_vinfo)
2572 if (dump_enabled_p ())
2573 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2574 "bad loop form.\n");
2575 return NULL;
2578 bool fatal = false;
2580 if (orig_loop_vinfo)
2581 LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = orig_loop_vinfo;
2583 if (vect_analyze_loop_2 (loop_vinfo, fatal))
2585 LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1;
2587 return loop_vinfo;
2590 delete loop_vinfo;
2592 if (next_size == 0)
2593 autodetected_vector_size = current_vector_size;
2595 if (next_size < vector_sizes.length ()
2596 && known_eq (vector_sizes[next_size], autodetected_vector_size))
2597 next_size += 1;
2599 if (fatal
2600 || next_size == vector_sizes.length ()
2601 || known_eq (current_vector_size, 0U))
2602 return NULL;
2604 /* Try the next biggest vector size. */
2605 current_vector_size = vector_sizes[next_size++];
2606 if (dump_enabled_p ())
2608 dump_printf_loc (MSG_NOTE, vect_location,
2609 "***** Re-trying analysis with "
2610 "vector size ");
2611 dump_dec (MSG_NOTE, current_vector_size);
2612 dump_printf (MSG_NOTE, "\n");
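/* Hypothetical illustration of the retry loop above: if the target
   advertises the vector sizes { 16, 8 } bytes and autodetection picks
   16, then a failed but non-fatal analysis at size 16 is followed by
   exactly one more attempt with current_vector_size = 8 before
   returning NULL.  A fatal failure stops the retries immediately.  */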
2617 /* Return true if there is an in-order reduction function for CODE, storing
2618 it in *REDUC_FN if so. */
2620 static bool
2621 fold_left_reduction_fn (tree_code code, internal_fn *reduc_fn)
2623 switch (code)
2625 case PLUS_EXPR:
2626 *reduc_fn = IFN_FOLD_LEFT_PLUS;
2627 return true;
2629 default:
2630 return false;
2634 /* Function reduction_fn_for_scalar_code
2636 Input:
2637 CODE - tree_code of a reduction operation.
2639 Output:
2640 REDUC_FN - the corresponding internal function to be used to reduce the
2641 vector of partial results into a single scalar result, or IFN_LAST
2642 if the operation is a supported reduction operation, but does not have
2643 such an internal function.
2645 Return FALSE if CODE currently cannot be vectorized as a reduction. */
2647 static bool
2648 reduction_fn_for_scalar_code (enum tree_code code, internal_fn *reduc_fn)
2650 switch (code)
2652 case MAX_EXPR:
2653 *reduc_fn = IFN_REDUC_MAX;
2654 return true;
2656 case MIN_EXPR:
2657 *reduc_fn = IFN_REDUC_MIN;
2658 return true;
2660 case PLUS_EXPR:
2661 *reduc_fn = IFN_REDUC_PLUS;
2662 return true;
2664 case BIT_AND_EXPR:
2665 *reduc_fn = IFN_REDUC_AND;
2666 return true;
2668 case BIT_IOR_EXPR:
2669 *reduc_fn = IFN_REDUC_IOR;
2670 return true;
2672 case BIT_XOR_EXPR:
2673 *reduc_fn = IFN_REDUC_XOR;
2674 return true;
2676 case MULT_EXPR:
2677 case MINUS_EXPR:
2678 *reduc_fn = IFN_LAST;
2679 return true;
2681 default:
2682 return false;
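/* Hypothetical example of the IFN_LAST case above: a product
   reduction such as

     int prod = 1;
     for (int i = 0; i < n; i++)
       prod *= a[i];

   returns true here, but with *REDUC_FN set to IFN_LAST: the loop can
   still be vectorized, only the final reduction of the vector of
   partial products has to be emitted as a generic sequence of vector
   operations in the epilogue instead of a single internal function.  */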
2686 /* If there is a neutral value X such that SLP reduction NODE would not
2687 be affected by the introduction of additional X elements, return that X,
2688 otherwise return null. CODE is the code of the reduction. REDUC_CHAIN
2689 is true if the SLP statements perform a single reduction, false if each
2690 statement performs an independent reduction. */
2692 static tree
2693 neutral_op_for_slp_reduction (slp_tree slp_node, tree_code code,
2694 bool reduc_chain)
2696 vec<gimple *> stmts = SLP_TREE_SCALAR_STMTS (slp_node);
2697 gimple *stmt = stmts[0];
2698 stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
2699 tree vector_type = STMT_VINFO_VECTYPE (stmt_vinfo);
2700 tree scalar_type = TREE_TYPE (vector_type);
2701 struct loop *loop = gimple_bb (stmt)->loop_father;
2702 gcc_assert (loop);
2704 switch (code)
2706 case WIDEN_SUM_EXPR:
2707 case DOT_PROD_EXPR:
2708 case SAD_EXPR:
2709 case PLUS_EXPR:
2710 case MINUS_EXPR:
2711 case BIT_IOR_EXPR:
2712 case BIT_XOR_EXPR:
2713 return build_zero_cst (scalar_type);
2715 case MULT_EXPR:
2716 return build_one_cst (scalar_type);
2718 case BIT_AND_EXPR:
2719 return build_all_ones_cst (scalar_type);
2721 case MAX_EXPR:
2722 case MIN_EXPR:
2723 /* For MIN/MAX the initial values are neutral. A reduction chain
2724 has only a single initial value, so that value is neutral for
2725 all statements. */
2726 if (reduc_chain)
2727 return PHI_ARG_DEF_FROM_EDGE (stmt, loop_preheader_edge (loop));
2728 return NULL_TREE;
2730 default:
2731 return NULL_TREE;
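/* Small illustrative example (names are hypothetical): for an SLP
   reduction that accumulates two sums per iteration,

     for (int i = 0; i < n; i++)
       {
         sum0 += a[2 * i];
         sum1 += a[2 * i + 1];
       }

   the neutral value for PLUS_EXPR is 0, so extra zero elements can be
   mixed into the vectors of partial sums without changing the result;
   for MULT_EXPR the neutral value is 1 and for BIT_AND_EXPR all-ones.  */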
2735 /* Error reporting helper for vect_is_simple_reduction below. GIMPLE statement
2736 STMT is printed with a message MSG. */
2738 static void
2739 report_vect_op (dump_flags_t msg_type, gimple *stmt, const char *msg)
2741 dump_printf_loc (msg_type, vect_location, "%s", msg);
2742 dump_gimple_stmt (msg_type, TDF_SLIM, stmt, 0);
2746 /* Detect SLP reduction of the form:
2748 #a1 = phi <a5, a0>
2749 a2 = operation (a1)
2750 a3 = operation (a2)
2751 a4 = operation (a3)
2752 a5 = operation (a4)
2754 #a = phi <a5>
2756 PHI is the reduction phi node (#a1 = phi <a5, a0> above)
2757 FIRST_STMT is the first reduction stmt in the chain
2758 (a2 = operation (a1)).
2760 Return TRUE if a reduction chain was detected. */
2762 static bool
2763 vect_is_slp_reduction (loop_vec_info loop_info, gimple *phi,
2764 gimple *first_stmt)
2766 struct loop *loop = (gimple_bb (phi))->loop_father;
2767 struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
2768 enum tree_code code;
2769 gimple *current_stmt = NULL, *loop_use_stmt = NULL, *first, *next_stmt;
2770 stmt_vec_info use_stmt_info, current_stmt_info;
2771 tree lhs;
2772 imm_use_iterator imm_iter;
2773 use_operand_p use_p;
2774 int nloop_uses, size = 0, n_out_of_loop_uses;
2775 bool found = false;
2777 if (loop != vect_loop)
2778 return false;
2780 lhs = PHI_RESULT (phi);
2781 code = gimple_assign_rhs_code (first_stmt);
2782 while (1)
2784 nloop_uses = 0;
2785 n_out_of_loop_uses = 0;
2786 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
2788 gimple *use_stmt = USE_STMT (use_p);
2789 if (is_gimple_debug (use_stmt))
2790 continue;
2792 /* Check if we got back to the reduction phi. */
2793 if (use_stmt == phi)
2795 loop_use_stmt = use_stmt;
2796 found = true;
2797 break;
2800 if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2802 loop_use_stmt = use_stmt;
2803 nloop_uses++;
2805 else
2806 n_out_of_loop_uses++;
2808 /* There can be either a single use in the loop or two uses in
2809 phi nodes. */
2810 if (nloop_uses > 1 || (n_out_of_loop_uses && nloop_uses))
2811 return false;
2814 if (found)
2815 break;
2817 /* We reached a statement with no loop uses. */
2818 if (nloop_uses == 0)
2819 return false;
2821 /* This is a loop exit phi, and we haven't reached the reduction phi. */
2822 if (gimple_code (loop_use_stmt) == GIMPLE_PHI)
2823 return false;
2825 if (!is_gimple_assign (loop_use_stmt)
2826 || code != gimple_assign_rhs_code (loop_use_stmt)
2827 || !flow_bb_inside_loop_p (loop, gimple_bb (loop_use_stmt)))
2828 return false;
2830 /* Insert USE_STMT into reduction chain. */
2831 use_stmt_info = vinfo_for_stmt (loop_use_stmt);
2832 if (current_stmt)
2834 current_stmt_info = vinfo_for_stmt (current_stmt);
2835 GROUP_NEXT_ELEMENT (current_stmt_info) = loop_use_stmt;
2836 GROUP_FIRST_ELEMENT (use_stmt_info)
2837 = GROUP_FIRST_ELEMENT (current_stmt_info);
2839 else
2840 GROUP_FIRST_ELEMENT (use_stmt_info) = loop_use_stmt;
2842 lhs = gimple_assign_lhs (loop_use_stmt);
2843 current_stmt = loop_use_stmt;
2844 size++;
2847 if (!found || loop_use_stmt != phi || size < 2)
2848 return false;
2850 /* Swap the operands, if needed, to make the reduction operand be the second
2851 operand. */
2852 lhs = PHI_RESULT (phi);
2853 next_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (current_stmt));
2854 while (next_stmt)
2856 if (gimple_assign_rhs2 (next_stmt) == lhs)
2858 tree op = gimple_assign_rhs1 (next_stmt);
2859 gimple *def_stmt = NULL;
2861 if (TREE_CODE (op) == SSA_NAME)
2862 def_stmt = SSA_NAME_DEF_STMT (op);
2864 /* Check that the other def is either defined in the loop
2865 ("vect_internal_def"), or it's an induction (defined by a
2866 loop-header phi-node). */
2867 if (def_stmt
2868 && gimple_bb (def_stmt)
2869 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2870 && (is_gimple_assign (def_stmt)
2871 || is_gimple_call (def_stmt)
2872 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2873 == vect_induction_def
2874 || (gimple_code (def_stmt) == GIMPLE_PHI
2875 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2876 == vect_internal_def
2877 && !is_loop_header_bb_p (gimple_bb (def_stmt)))))
2879 lhs = gimple_assign_lhs (next_stmt);
2880 next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
2881 continue;
2884 return false;
2886 else
2888 tree op = gimple_assign_rhs2 (next_stmt);
2889 gimple *def_stmt = NULL;
2891 if (TREE_CODE (op) == SSA_NAME)
2892 def_stmt = SSA_NAME_DEF_STMT (op);
2894 /* Check that the other def is either defined in the loop
2895 ("vect_internal_def"), or it's an induction (defined by a
2896 loop-header phi-node). */
2897 if (def_stmt
2898 && gimple_bb (def_stmt)
2899 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2900 && (is_gimple_assign (def_stmt)
2901 || is_gimple_call (def_stmt)
2902 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2903 == vect_induction_def
2904 || (gimple_code (def_stmt) == GIMPLE_PHI
2905 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2906 == vect_internal_def
2907 && !is_loop_header_bb_p (gimple_bb (def_stmt)))))
2909 if (dump_enabled_p ())
2911 dump_printf_loc (MSG_NOTE, vect_location, "swapping oprnds: ");
2912 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, next_stmt, 0);
2915 swap_ssa_operands (next_stmt,
2916 gimple_assign_rhs1_ptr (next_stmt),
2917 gimple_assign_rhs2_ptr (next_stmt));
2918 update_stmt (next_stmt);
2920 if (CONSTANT_CLASS_P (gimple_assign_rhs1 (next_stmt)))
2921 LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true;
2923 else
2924 return false;
2927 lhs = gimple_assign_lhs (next_stmt);
2928 next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
2931 /* Save the chain for further analysis in SLP detection. */
2932 first = GROUP_FIRST_ELEMENT (vinfo_for_stmt (current_stmt));
2933 LOOP_VINFO_REDUCTION_CHAINS (loop_info).safe_push (first);
2934 GROUP_SIZE (vinfo_for_stmt (first)) = size;
2936 return true;
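/* Illustrative sketch of the kind of source the chain detection above
   is aimed at (names are hypothetical):

     for (int i = 0; i < n; i++)
       {
         s += a[4 * i];
         s += a[4 * i + 1];
         s += a[4 * i + 2];
         s += a[4 * i + 3];
       }

   Each statement feeds the next through S, the last one feeds the
   reduction phi, and all statements use the same PLUS_EXPR code, so
   they are linked into one group of size 4 and recorded as a
   reduction chain.  */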
2939 /* Return true if we need an in-order reduction for operation CODE
2940 on type TYPE. NEED_WRAPPING_INTEGRAL_OVERFLOW is true if integer
2941 overflow must wrap. */
2943 static bool
2944 needs_fold_left_reduction_p (tree type, tree_code code,
2945 bool need_wrapping_integral_overflow)
2947 /* CHECKME: check for !flag_finite_math_only too? */
2948 if (SCALAR_FLOAT_TYPE_P (type))
2949 switch (code)
2951 case MIN_EXPR:
2952 case MAX_EXPR:
2953 return false;
2955 default:
2956 return !flag_associative_math;
2959 if (INTEGRAL_TYPE_P (type))
2961 if (!operation_no_trapping_overflow (type, code))
2962 return true;
2963 if (need_wrapping_integral_overflow
2964 && !TYPE_OVERFLOW_WRAPS (type)
2965 && operation_can_overflow (code))
2966 return true;
2967 return false;
2970 if (SAT_FIXED_POINT_TYPE_P (type))
2971 return true;
2973 return false;
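/* Illustrative examples only: a floating-point accumulation such as

     double s = 0.0;
     for (int i = 0; i < n; i++)
       s += a[i];

   is classified here as needing an in-order (fold-left) reduction
   unless -fassociative-math is in effect, because reassociating the
   additions can change the rounded result.  Similarly, a signed
   integer sum compiled with -ftrapv is classified as in-order, since
   a reordered partial sum could trap on overflow where the original
   evaluation order does not.  */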
2976 /* Return true if the reduction PHI in LOOP with latch arg LOOP_ARG and
2977 reduction operation CODE has a handled computation expression. */
2979 bool
2980 check_reduction_path (location_t loc, loop_p loop, gphi *phi, tree loop_arg,
2981 enum tree_code code)
2983 auto_vec<std::pair<ssa_op_iter, use_operand_p> > path;
2984 auto_bitmap visited;
2985 tree lookfor = PHI_RESULT (phi);
2986 ssa_op_iter curri;
2987 use_operand_p curr = op_iter_init_phiuse (&curri, phi, SSA_OP_USE);
2988 while (USE_FROM_PTR (curr) != loop_arg)
2989 curr = op_iter_next_use (&curri);
2990 curri.i = curri.numops;
2993 path.safe_push (std::make_pair (curri, curr));
2994 tree use = USE_FROM_PTR (curr);
2995 if (use == lookfor)
2996 break;
2997 gimple *def = SSA_NAME_DEF_STMT (use);
2998 if (gimple_nop_p (def)
2999 || ! flow_bb_inside_loop_p (loop, gimple_bb (def)))
3001 pop:
3004 std::pair<ssa_op_iter, use_operand_p> x = path.pop ();
3005 curri = x.first;
3006 curr = x.second;
3008 curr = op_iter_next_use (&curri);
3009 /* Skip already visited or non-SSA operands (from iterating
3010 over PHI args). */
3011 while (curr != NULL_USE_OPERAND_P
3012 && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
3013 || ! bitmap_set_bit (visited,
3014 SSA_NAME_VERSION
3015 (USE_FROM_PTR (curr)))));
3017 while (curr == NULL_USE_OPERAND_P && ! path.is_empty ());
3018 if (curr == NULL_USE_OPERAND_P)
3019 break;
3021 else
3023 if (gimple_code (def) == GIMPLE_PHI)
3024 curr = op_iter_init_phiuse (&curri, as_a <gphi *>(def), SSA_OP_USE);
3025 else
3026 curr = op_iter_init_use (&curri, def, SSA_OP_USE);
3027 while (curr != NULL_USE_OPERAND_P
3028 && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
3029 || ! bitmap_set_bit (visited,
3030 SSA_NAME_VERSION
3031 (USE_FROM_PTR (curr)))))
3032 curr = op_iter_next_use (&curri);
3033 if (curr == NULL_USE_OPERAND_P)
3034 goto pop;
3037 while (1);
3038 if (dump_file && (dump_flags & TDF_DETAILS))
3040 dump_printf_loc (MSG_NOTE, loc, "reduction path: ");
3041 unsigned i;
3042 std::pair<ssa_op_iter, use_operand_p> *x;
3043 FOR_EACH_VEC_ELT (path, i, x)
3045 dump_generic_expr (MSG_NOTE, TDF_SLIM, USE_FROM_PTR (x->second));
3046 dump_printf (MSG_NOTE, " ");
3048 dump_printf (MSG_NOTE, "\n");
3051 /* Check whether the reduction path detected is valid. */
3052 bool fail = path.length () == 0;
3053 bool neg = false;
3054 for (unsigned i = 1; i < path.length (); ++i)
3056 gimple *use_stmt = USE_STMT (path[i].second);
3057 tree op = USE_FROM_PTR (path[i].second);
3058 if (! has_single_use (op)
3059 || ! is_gimple_assign (use_stmt))
3061 fail = true;
3062 break;
3064 if (gimple_assign_rhs_code (use_stmt) != code)
3066 if (code == PLUS_EXPR
3067 && gimple_assign_rhs_code (use_stmt) == MINUS_EXPR)
3069 /* Track whether we negate the reduction value each iteration. */
3070 if (gimple_assign_rhs2 (use_stmt) == op)
3071 neg = ! neg;
3073 else
3075 fail = true;
3076 break;
3080 return ! fail && ! neg;
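/* Hypothetical illustration of the path walk above:

     x_1 = PHI <x_0(preheader), x_4(latch)>
     x_2 = x_1 + a[i];
     x_3 = x_2 + b[i];
     x_4 = x_3 + c[i];

   Starting from the latch argument x_4, the walk follows defining
   statements back to the phi result x_1 and checks that every
   statement on the path uses the reduction code and that each
   intermediate value has a single use.  A MINUS_EXPR whose second
   operand is the reduction value is allowed for a PLUS_EXPR reduction
   but counts as a negation; the cycle is accepted only if the value
   is not negated overall, so "res = a[i] - res" on its own is
   rejected.  */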
3084 /* Function vect_is_simple_reduction
3086 (1) Detect a cross-iteration def-use cycle that represents a simple
3087 reduction computation. We look for the following pattern:
3089 loop_header:
3090 a1 = phi < a0, a2 >
3091 a3 = ...
3092 a2 = operation (a3, a1)
3094 or
3096 a3 = ...
3097 loop_header:
3098 a1 = phi < a0, a2 >
3099 a2 = operation (a3, a1)
3101 such that:
3102 1. operation is commutative and associative and it is safe to
3103 change the order of the computation
3104 2. no uses for a2 in the loop (a2 is used out of the loop)
3105 3. no uses of a1 in the loop besides the reduction operation
3106 4. no uses of a1 outside the loop.
3108 Conditions 1,4 are tested here.
3109 Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.
3111 (2) Detect a cross-iteration def-use cycle in nested loops, i.e.,
3112 nested cycles.
3114 (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double
3115 reductions:
3117 a1 = phi < a0, a2 >
3118 inner loop (def of a3)
3119 a2 = phi < a3 >
3121 (4) Detect condition expressions, i.e.:
3122 for (int i = 0; i < N; i++)
3123 if (a[i] < val)
3124 ret_val = a[i];
3128 static gimple *
3129 vect_is_simple_reduction (loop_vec_info loop_info, gimple *phi,
3130 bool *double_reduc,
3131 bool need_wrapping_integral_overflow,
3132 enum vect_reduction_type *v_reduc_type)
3134 struct loop *loop = (gimple_bb (phi))->loop_father;
3135 struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
3136 gimple *def_stmt, *def1 = NULL, *def2 = NULL, *phi_use_stmt = NULL;
3137 enum tree_code orig_code, code;
3138 tree op1, op2, op3 = NULL_TREE, op4 = NULL_TREE;
3139 tree type;
3140 int nloop_uses;
3141 tree name;
3142 imm_use_iterator imm_iter;
3143 use_operand_p use_p;
3144 bool phi_def;
3146 *double_reduc = false;
3147 *v_reduc_type = TREE_CODE_REDUCTION;
3149 tree phi_name = PHI_RESULT (phi);
3150 /* ??? If there are no uses of the PHI result the inner loop reduction
3151 won't be detected as possibly double-reduction by vectorizable_reduction
3152 because that tries to walk the PHI arg from the preheader edge which
3153 can be constant. See PR60382. */
3154 if (has_zero_uses (phi_name))
3155 return NULL;
3156 nloop_uses = 0;
3157 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, phi_name)
3159 gimple *use_stmt = USE_STMT (use_p);
3160 if (is_gimple_debug (use_stmt))
3161 continue;
3163 if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
3165 if (dump_enabled_p ())
3166 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3167 "intermediate value used outside loop.\n");
3169 return NULL;
3172 nloop_uses++;
3173 if (nloop_uses > 1)
3175 if (dump_enabled_p ())
3176 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3177 "reduction value used in loop.\n");
3178 return NULL;
3181 phi_use_stmt = use_stmt;
3184 edge latch_e = loop_latch_edge (loop);
3185 tree loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
3186 if (TREE_CODE (loop_arg) != SSA_NAME)
3188 if (dump_enabled_p ())
3190 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3191 "reduction: not ssa_name: ");
3192 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, loop_arg);
3193 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3195 return NULL;
3198 def_stmt = SSA_NAME_DEF_STMT (loop_arg);
3199 if (is_gimple_assign (def_stmt))
3201 name = gimple_assign_lhs (def_stmt);
3202 phi_def = false;
3204 else if (gimple_code (def_stmt) == GIMPLE_PHI)
3206 name = PHI_RESULT (def_stmt);
3207 phi_def = true;
3209 else
3211 if (dump_enabled_p ())
3213 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3214 "reduction: unhandled reduction operation: ");
3215 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, def_stmt, 0);
3217 return NULL;
3220 if (! flow_bb_inside_loop_p (loop, gimple_bb (def_stmt)))
3221 return NULL;
3223 nloop_uses = 0;
3224 auto_vec<gphi *, 3> lcphis;
3225 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name)
3227 gimple *use_stmt = USE_STMT (use_p);
3228 if (is_gimple_debug (use_stmt))
3229 continue;
3230 if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
3231 nloop_uses++;
3232 else
3233 /* We can have more than one loop-closed PHI. */
3234 lcphis.safe_push (as_a <gphi *> (use_stmt));
3235 if (nloop_uses > 1)
3237 if (dump_enabled_p ())
3238 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3239 "reduction used in loop.\n");
3240 return NULL;
3244 /* If DEF_STMT is a phi node itself, we expect it to have a single argument
3245 defined in the inner loop. */
3246 if (phi_def)
3248 op1 = PHI_ARG_DEF (def_stmt, 0);
3250 if (gimple_phi_num_args (def_stmt) != 1
3251 || TREE_CODE (op1) != SSA_NAME)
3253 if (dump_enabled_p ())
3254 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3255 "unsupported phi node definition.\n");
3257 return NULL;
3260 def1 = SSA_NAME_DEF_STMT (op1);
3261 if (gimple_bb (def1)
3262 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
3263 && loop->inner
3264 && flow_bb_inside_loop_p (loop->inner, gimple_bb (def1))
3265 && is_gimple_assign (def1)
3266 && flow_bb_inside_loop_p (loop->inner, gimple_bb (phi_use_stmt)))
3268 if (dump_enabled_p ())
3269 report_vect_op (MSG_NOTE, def_stmt,
3270 "detected double reduction: ");
3272 *double_reduc = true;
3273 return def_stmt;
3276 return NULL;
3279 /* If we are vectorizing an inner reduction, we execute it in the
3280 original order only when we are not dealing with a double
3281 reduction. */
3282 bool check_reduction = true;
3283 if (flow_loop_nested_p (vect_loop, loop))
3285 gphi *lcphi;
3286 unsigned i;
3287 check_reduction = false;
3288 FOR_EACH_VEC_ELT (lcphis, i, lcphi)
3289 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, gimple_phi_result (lcphi))
3291 gimple *use_stmt = USE_STMT (use_p);
3292 if (is_gimple_debug (use_stmt))
3293 continue;
3294 if (! flow_bb_inside_loop_p (vect_loop, gimple_bb (use_stmt)))
3295 check_reduction = true;
3299 bool nested_in_vect_loop = flow_loop_nested_p (vect_loop, loop);
3300 code = orig_code = gimple_assign_rhs_code (def_stmt);
3302 /* We can handle "res -= x[i]", which is non-associative, by
3303 simply rewriting it into "res += -x[i]". Avoid changing the
3304 gimple instruction for the first simple tests and only do this
3305 if we're allowed to change code at all. */
3306 if (code == MINUS_EXPR && gimple_assign_rhs2 (def_stmt) != phi_name)
3307 code = PLUS_EXPR;
3309 if (code == COND_EXPR)
3311 if (! nested_in_vect_loop)
3312 *v_reduc_type = COND_REDUCTION;
3314 op3 = gimple_assign_rhs1 (def_stmt);
3315 if (COMPARISON_CLASS_P (op3))
3317 op4 = TREE_OPERAND (op3, 1);
3318 op3 = TREE_OPERAND (op3, 0);
3320 if (op3 == phi_name || op4 == phi_name)
3322 if (dump_enabled_p ())
3323 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3324 "reduction: condition depends on previous"
3325 " iteration: ");
3326 return NULL;
3329 op1 = gimple_assign_rhs2 (def_stmt);
3330 op2 = gimple_assign_rhs3 (def_stmt);
3332 else if (!commutative_tree_code (code) || !associative_tree_code (code))
3334 if (dump_enabled_p ())
3335 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3336 "reduction: not commutative/associative: ");
3337 return NULL;
3339 else if (get_gimple_rhs_class (code) == GIMPLE_BINARY_RHS)
3341 op1 = gimple_assign_rhs1 (def_stmt);
3342 op2 = gimple_assign_rhs2 (def_stmt);
3344 else
3346 if (dump_enabled_p ())
3347 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3348 "reduction: not handled operation: ");
3349 return NULL;
3352 if (TREE_CODE (op1) != SSA_NAME && TREE_CODE (op2) != SSA_NAME)
3354 if (dump_enabled_p ())
3355 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3356 "reduction: both uses not ssa_names: ");
3358 return NULL;
3361 type = TREE_TYPE (gimple_assign_lhs (def_stmt));
3362 if ((TREE_CODE (op1) == SSA_NAME
3363 && !types_compatible_p (type, TREE_TYPE (op1)))
3364 || (TREE_CODE (op2) == SSA_NAME
3365 && !types_compatible_p (type, TREE_TYPE (op2)))
3366 || (op3 && TREE_CODE (op3) == SSA_NAME
3367 && !types_compatible_p (type, TREE_TYPE (op3)))
3368 || (op4 && TREE_CODE (op4) == SSA_NAME
3369 && !types_compatible_p (type, TREE_TYPE (op4))))
3371 if (dump_enabled_p ())
3373 dump_printf_loc (MSG_NOTE, vect_location,
3374 "reduction: multiple types: operation type: ");
3375 dump_generic_expr (MSG_NOTE, TDF_SLIM, type);
3376 dump_printf (MSG_NOTE, ", operands types: ");
3377 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3378 TREE_TYPE (op1));
3379 dump_printf (MSG_NOTE, ",");
3380 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3381 TREE_TYPE (op2));
3382 if (op3)
3384 dump_printf (MSG_NOTE, ",");
3385 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3386 TREE_TYPE (op3));
3389 if (op4)
3391 dump_printf (MSG_NOTE, ",");
3392 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3393 TREE_TYPE (op4));
3395 dump_printf (MSG_NOTE, "\n");
3398 return NULL;
3401 /* Check whether it's ok to change the order of the computation.
3402 Generally, when vectorizing a reduction we change the order of the
3403 computation. This may change the behavior of the program in some
3404 cases, so we need to check that this is ok. One exception is when
3405 vectorizing an outer-loop: the inner-loop is executed sequentially,
3406 and therefore vectorizing reductions in the inner-loop during
3407 outer-loop vectorization is safe. */
3408 if (check_reduction
3409 && *v_reduc_type == TREE_CODE_REDUCTION
3410 && needs_fold_left_reduction_p (type, code,
3411 need_wrapping_integral_overflow))
3412 *v_reduc_type = FOLD_LEFT_REDUCTION;
3414 /* Reduction is safe. We're dealing with one of the following:
3415 1) integer arithmetic and no trapv
3416 2) floating point arithmetic, and special flags permit this optimization
3417 3) nested cycle (i.e., outer loop vectorization). */
3418 if (TREE_CODE (op1) == SSA_NAME)
3419 def1 = SSA_NAME_DEF_STMT (op1);
3421 if (TREE_CODE (op2) == SSA_NAME)
3422 def2 = SSA_NAME_DEF_STMT (op2);
3424 if (code != COND_EXPR
3425 && ((!def1 || gimple_nop_p (def1)) && (!def2 || gimple_nop_p (def2))))
3427 if (dump_enabled_p ())
3428 report_vect_op (MSG_NOTE, def_stmt, "reduction: no defs for operands: ");
3429 return NULL;
3432 /* Check that one def is the reduction def, defined by PHI,
3433 the other def is either defined in the loop ("vect_internal_def"),
3434 or it's an induction (defined by a loop-header phi-node). */
3436 if (def2 && def2 == phi
3437 && (code == COND_EXPR
3438 || !def1 || gimple_nop_p (def1)
3439 || !flow_bb_inside_loop_p (loop, gimple_bb (def1))
3440 || (def1 && flow_bb_inside_loop_p (loop, gimple_bb (def1))
3441 && (is_gimple_assign (def1)
3442 || is_gimple_call (def1)
3443 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1))
3444 == vect_induction_def
3445 || (gimple_code (def1) == GIMPLE_PHI
3446 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1))
3447 == vect_internal_def
3448 && !is_loop_header_bb_p (gimple_bb (def1)))))))
3450 if (dump_enabled_p ())
3451 report_vect_op (MSG_NOTE, def_stmt, "detected reduction: ");
3452 return def_stmt;
3455 if (def1 && def1 == phi
3456 && (code == COND_EXPR
3457 || !def2 || gimple_nop_p (def2)
3458 || !flow_bb_inside_loop_p (loop, gimple_bb (def2))
3459 || (def2 && flow_bb_inside_loop_p (loop, gimple_bb (def2))
3460 && (is_gimple_assign (def2)
3461 || is_gimple_call (def2)
3462 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2))
3463 == vect_induction_def
3464 || (gimple_code (def2) == GIMPLE_PHI
3465 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2))
3466 == vect_internal_def
3467 && !is_loop_header_bb_p (gimple_bb (def2)))))))
3469 if (! nested_in_vect_loop && orig_code != MINUS_EXPR)
3471 /* Check if we can swap operands (just for simplicity - so that
3472 the rest of the code can assume that the reduction variable
3473 is always the last (second) argument). */
3474 if (code == COND_EXPR)
3476 /* Swap cond_expr by inverting the condition. */
3477 tree cond_expr = gimple_assign_rhs1 (def_stmt);
3478 enum tree_code invert_code = ERROR_MARK;
3479 enum tree_code cond_code = TREE_CODE (cond_expr);
3481 if (TREE_CODE_CLASS (cond_code) == tcc_comparison)
3483 bool honor_nans = HONOR_NANS (TREE_OPERAND (cond_expr, 0));
3484 invert_code = invert_tree_comparison (cond_code, honor_nans);
3486 if (invert_code != ERROR_MARK)
3488 TREE_SET_CODE (cond_expr, invert_code);
3489 swap_ssa_operands (def_stmt,
3490 gimple_assign_rhs2_ptr (def_stmt),
3491 gimple_assign_rhs3_ptr (def_stmt));
3493 else
3495 if (dump_enabled_p ())
3496 report_vect_op (MSG_NOTE, def_stmt,
3497 "detected reduction: cannot swap operands "
3498 "for cond_expr");
3499 return NULL;
3502 else
3503 swap_ssa_operands (def_stmt, gimple_assign_rhs1_ptr (def_stmt),
3504 gimple_assign_rhs2_ptr (def_stmt));
3506 if (dump_enabled_p ())
3507 report_vect_op (MSG_NOTE, def_stmt,
3508 "detected reduction: need to swap operands: ");
3510 if (CONSTANT_CLASS_P (gimple_assign_rhs1 (def_stmt)))
3511 LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true;
3513 else
3515 if (dump_enabled_p ())
3516 report_vect_op (MSG_NOTE, def_stmt, "detected reduction: ");
3519 return def_stmt;
3522 /* Try to find SLP reduction chain. */
3523 if (! nested_in_vect_loop
3524 && code != COND_EXPR
3525 && orig_code != MINUS_EXPR
3526 && vect_is_slp_reduction (loop_info, phi, def_stmt))
3528 if (dump_enabled_p ())
3529 report_vect_op (MSG_NOTE, def_stmt,
3530 "reduction: detected reduction chain: ");
3532 return def_stmt;
3535 /* Dissolve any group half-built by vect_is_slp_reduction. */
3536 gimple *first = GROUP_FIRST_ELEMENT (vinfo_for_stmt (def_stmt));
3537 while (first)
3539 gimple *next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (first));
3540 GROUP_FIRST_ELEMENT (vinfo_for_stmt (first)) = NULL;
3541 GROUP_NEXT_ELEMENT (vinfo_for_stmt (first)) = NULL;
3542 first = next;
3545 /* Look for the expression computing loop_arg from the loop PHI result. */
3546 if (check_reduction_path (vect_location, loop, as_a <gphi *> (phi), loop_arg,
3547 code))
3548 return def_stmt;
3550 if (dump_enabled_p ())
3552 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3553 "reduction: unknown pattern: ");
3556 return NULL;
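/* Hypothetical example of a cycle rejected above: if the running
   value is also consumed by another statement inside the loop,

     for (int i = 0; i < n; i++)
       {
         s = s + a[i];
         b[i] = s;
       }

   the definition reaching the latch has two uses inside the loop (the
   header phi and the store), so the use-count checks above fail and
   the phi is not treated as a simple reduction.  */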
3559 /* Wrapper around vect_is_simple_reduction, which will modify code
3560 in-place if it enables detection of more reductions. Arguments are
3561 as for vect_is_simple_reduction. */
3563 gimple *
3564 vect_force_simple_reduction (loop_vec_info loop_info, gimple *phi,
3565 bool *double_reduc,
3566 bool need_wrapping_integral_overflow)
3568 enum vect_reduction_type v_reduc_type;
3569 gimple *def = vect_is_simple_reduction (loop_info, phi, double_reduc,
3570 need_wrapping_integral_overflow,
3571 &v_reduc_type);
3572 if (def)
3574 stmt_vec_info reduc_def_info = vinfo_for_stmt (phi);
3575 STMT_VINFO_REDUC_TYPE (reduc_def_info) = v_reduc_type;
3576 STMT_VINFO_REDUC_DEF (reduc_def_info) = def;
3577 reduc_def_info = vinfo_for_stmt (def);
3578 STMT_VINFO_REDUC_TYPE (reduc_def_info) = v_reduc_type;
3579 STMT_VINFO_REDUC_DEF (reduc_def_info) = phi;
3581 return def;
3584 /* Calculate cost of peeling the loop PEEL_ITERS_PROLOGUE times. */
3585 int
3586 vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue,
3587 int *peel_iters_epilogue,
3588 stmt_vector_for_cost *scalar_cost_vec,
3589 stmt_vector_for_cost *prologue_cost_vec,
3590 stmt_vector_for_cost *epilogue_cost_vec)
3592 int retval = 0;
3593 int assumed_vf = vect_vf_for_cost (loop_vinfo);
3595 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
3597 *peel_iters_epilogue = assumed_vf / 2;
3598 if (dump_enabled_p ())
3599 dump_printf_loc (MSG_NOTE, vect_location,
3600 "cost model: epilogue peel iters set to vf/2 "
3601 "because loop iterations are unknown .\n");
3603 /* If peeled iterations are known but the number of scalar loop
3604 iterations is unknown, count a taken branch per peeled loop. */
3605 retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
3606 NULL, 0, vect_prologue);
3607 retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
3608 NULL, 0, vect_epilogue);
3610 else
3612 int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
3613 peel_iters_prologue = niters < peel_iters_prologue ?
3614 niters : peel_iters_prologue;
3615 *peel_iters_epilogue = (niters - peel_iters_prologue) % assumed_vf;
3616 /* If we need to peel for gaps but no epilogue peeling is otherwise
3617 required, we have to peel VF iterations. */
3618 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) && !*peel_iters_epilogue)
3619 *peel_iters_epilogue = assumed_vf;
3622 stmt_info_for_cost *si;
3623 int j;
3624 if (peel_iters_prologue)
3625 FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
3627 stmt_vec_info stmt_info
3628 = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
3629 retval += record_stmt_cost (prologue_cost_vec,
3630 si->count * peel_iters_prologue,
3631 si->kind, stmt_info, si->misalign,
3632 vect_prologue);
3634 if (*peel_iters_epilogue)
3635 FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
3637 stmt_vec_info stmt_info
3638 = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
3639 retval += record_stmt_cost (epilogue_cost_vec,
3640 si->count * *peel_iters_epilogue,
3641 si->kind, stmt_info, si->misalign,
3642 vect_epilogue);
3645 return retval;
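/* Worked example with hypothetical numbers: for a known iteration
   count of 100, an assumed vectorization factor of 4 and 3 prologue
   iterations peeled for alignment,

     peel_iters_prologue  = MIN (100, 3) = 3
     *peel_iters_epilogue = (100 - 3) % 4 = 1

   so the scalar per-iteration costs are charged three times to the
   prologue and once to the epilogue.  If peeling for gaps is required
   and the remainder is 0, a full VF (here 4) iterations are charged
   to the epilogue instead.  */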
3648 /* Function vect_estimate_min_profitable_iters
3650 Return the number of iterations required for the vector version of the
3651 loop to be profitable relative to the cost of the scalar version of the
3652 loop.
3654 *RET_MIN_PROFITABLE_NITERS is a cost model profitability threshold
3655 of iterations for vectorization. A value of -1 means loop
3656 vectorization is not profitable. This returned value may be used
3657 for a dynamic profitability check.
3659 *RET_MIN_PROFITABLE_ESTIMATE is a profitability threshold to be used
3660 for static check against estimated number of iterations. */
3662 static void
3663 vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
3664 int *ret_min_profitable_niters,
3665 int *ret_min_profitable_estimate)
3667 int min_profitable_iters;
3668 int min_profitable_estimate;
3669 int peel_iters_prologue;
3670 int peel_iters_epilogue;
3671 unsigned vec_inside_cost = 0;
3672 int vec_outside_cost = 0;
3673 unsigned vec_prologue_cost = 0;
3674 unsigned vec_epilogue_cost = 0;
3675 int scalar_single_iter_cost = 0;
3676 int scalar_outside_cost = 0;
3677 int assumed_vf = vect_vf_for_cost (loop_vinfo);
3678 int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
3679 void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3681 /* Cost model disabled. */
3682 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
3684 dump_printf_loc (MSG_NOTE, vect_location, "cost model disabled.\n");
3685 *ret_min_profitable_niters = 0;
3686 *ret_min_profitable_estimate = 0;
3687 return;
3690 /* Requires loop versioning tests to handle misalignment. */
3691 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
3693 /* FIXME: Make cost depend on complexity of individual check. */
3694 unsigned len = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ();
3695 (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
3696 vect_prologue);
3697 dump_printf (MSG_NOTE,
3698 "cost model: Adding cost of checks for loop "
3699 "versioning to treat misalignment.\n");
3702 /* Requires loop versioning with alias checks. */
3703 if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
3705 /* FIXME: Make cost depend on complexity of individual check. */
3706 unsigned len = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).length ();
3707 (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
3708 vect_prologue);
3709 len = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).length ();
3710 if (len)
3711 /* Count LEN - 1 ANDs and LEN comparisons. */
3712 (void) add_stmt_cost (target_cost_data, len * 2 - 1, scalar_stmt,
3713 NULL, 0, vect_prologue);
3714 len = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).length ();
3715 if (len)
3717 /* Count LEN - 1 ANDs and LEN comparisons. */
3718 unsigned int nstmts = len * 2 - 1;
3719 /* +1 for each bias that needs adding. */
3720 for (unsigned int i = 0; i < len; ++i)
3721 if (!LOOP_VINFO_LOWER_BOUNDS (loop_vinfo)[i].unsigned_p)
3722 nstmts += 1;
3723 (void) add_stmt_cost (target_cost_data, nstmts, scalar_stmt,
3724 NULL, 0, vect_prologue);
3726 dump_printf (MSG_NOTE,
3727 "cost model: Adding cost of checks for loop "
3728 "versioning aliasing.\n");
3731 /* Requires loop versioning with niter checks. */
3732 if (LOOP_REQUIRES_VERSIONING_FOR_NITERS (loop_vinfo))
3734 /* FIXME: Make cost depend on complexity of individual check. */
3735 (void) add_stmt_cost (target_cost_data, 1, vector_stmt, NULL, 0,
3736 vect_prologue);
3737 dump_printf (MSG_NOTE,
3738 "cost model: Adding cost of checks for loop "
3739 "versioning niters.\n");
3742 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
3743 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken, NULL, 0,
3744 vect_prologue);
3746 /* Count statements in scalar loop. Using this as scalar cost for a single
3747 iteration for now.
3749 TODO: Add outer loop support.
3751 TODO: Consider assigning different costs to different scalar
3752 statements. */
3754 scalar_single_iter_cost
3755 = LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo);
3757 /* Add additional cost for the peeled instructions in prologue and epilogue
3758 loop. (For fully-masked loops there will be no peeling.)
3760 FORNOW: If we don't know the value of peel_iters for prologue or
3761 epilogue at compile time, we assume it's vf/2 (the worst would be vf-1).
3763 TODO: Build an expression that represents peel_iters for prologue and
3764 epilogue to be used in a run-time test. */
3766 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
3768 peel_iters_prologue = 0;
3769 peel_iters_epilogue = 0;
3771 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
3773 /* We need to peel exactly one iteration. */
3774 peel_iters_epilogue += 1;
3775 stmt_info_for_cost *si;
3776 int j;
3777 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
3778 j, si)
3780 struct _stmt_vec_info *stmt_info
3781 = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
3782 (void) add_stmt_cost (target_cost_data, si->count,
3783 si->kind, stmt_info, si->misalign,
3784 vect_epilogue);
3788 else if (npeel < 0)
3790 peel_iters_prologue = assumed_vf / 2;
3791 dump_printf (MSG_NOTE, "cost model: "
3792 "prologue peel iters set to vf/2.\n");
3794 /* If peeling for alignment is unknown, the loop bound of the main loop becomes
3795 unknown. */
3796 peel_iters_epilogue = assumed_vf / 2;
3797 dump_printf (MSG_NOTE, "cost model: "
3798 "epilogue peel iters set to vf/2 because "
3799 "peeling for alignment is unknown.\n");
3801 /* If peeled iterations are unknown, count a taken branch and a not taken
3802 branch per peeled loop. Even if scalar loop iterations are known,
3803 vector iterations are not known since peeled prologue iterations are
3804 not known. Hence guards remain the same. */
3805 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
3806 NULL, 0, vect_prologue);
3807 (void) add_stmt_cost (target_cost_data, 1, cond_branch_not_taken,
3808 NULL, 0, vect_prologue);
3809 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
3810 NULL, 0, vect_epilogue);
3811 (void) add_stmt_cost (target_cost_data, 1, cond_branch_not_taken,
3812 NULL, 0, vect_epilogue);
3813 stmt_info_for_cost *si;
3814 int j;
3815 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
3817 struct _stmt_vec_info *stmt_info
3818 = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
3819 (void) add_stmt_cost (target_cost_data,
3820 si->count * peel_iters_prologue,
3821 si->kind, stmt_info, si->misalign,
3822 vect_prologue);
3823 (void) add_stmt_cost (target_cost_data,
3824 si->count * peel_iters_epilogue,
3825 si->kind, stmt_info, si->misalign,
3826 vect_epilogue);
3829 else
3831 stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
3832 stmt_info_for_cost *si;
3833 int j;
3834 void *data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3836 prologue_cost_vec.create (2);
3837 epilogue_cost_vec.create (2);
3838 peel_iters_prologue = npeel;
3840 (void) vect_get_known_peeling_cost (loop_vinfo, peel_iters_prologue,
3841 &peel_iters_epilogue,
3842 &LOOP_VINFO_SCALAR_ITERATION_COST
3843 (loop_vinfo),
3844 &prologue_cost_vec,
3845 &epilogue_cost_vec);
3847 FOR_EACH_VEC_ELT (prologue_cost_vec, j, si)
3849 struct _stmt_vec_info *stmt_info
3850 = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
3851 (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
3852 si->misalign, vect_prologue);
3855 FOR_EACH_VEC_ELT (epilogue_cost_vec, j, si)
3857 struct _stmt_vec_info *stmt_info
3858 = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
3859 (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
3860 si->misalign, vect_epilogue);
3863 prologue_cost_vec.release ();
3864 epilogue_cost_vec.release ();
3867 /* FORNOW: The scalar outside cost is incremented in one of the
3868 following ways:
3870 1. The vectorizer checks for alignment and aliasing and generates
3871 a condition that allows dynamic vectorization. A cost model
3872 check is ANDED with the versioning condition. Hence scalar code
3873 path now has the added cost of the versioning check.
3875 if (cost > th & versioning_check)
3876 jmp to vector code
3878 Hence run-time scalar is incremented by not-taken branch cost.
3880 2. The vectorizer then checks if a prologue is required. If the
3881 cost model check was not done before during versioning, it has to
3882 be done before the prologue check.
3884 if (cost <= th)
3885 prologue = scalar_iters
3886 if (prologue == 0)
3887 jmp to vector code
3888 else
3889 execute prologue
3890 if (prologue == num_iters)
3891 go to exit
3893 Hence the run-time scalar cost is incremented by a taken branch,
3894 plus a not-taken branch, plus a taken branch cost.
3896 3. The vectorizer then checks if an epilogue is required. If the
3897 cost model check was not done before during prologue check, it
3898 has to be done with the epilogue check.
3900 if (prologue == 0)
3901 jmp to vector code
3902 else
3903 execute prologue
3904 if (prologue == num_iters)
3905 go to exit
3906 vector code:
3907 if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
3908 jmp to epilogue
3910 Hence the run-time scalar cost should be incremented by 2 taken
3911 branches.
3913 TODO: The back end may reorder the BBs differently and reverse
3914 conditions/branch directions. Change the estimates below to
3915 something more reasonable. */
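/* Worked example (editor's addition, hypothetical branch costs): if a taken
   branch costs 3 and a not-taken branch costs 1, the rules below give
   scalar_outside_cost == 1 when versioning is used, 2*3 + 1 == 7 when the
   cost model check is emitted with the prologue guard, and 2*3 == 6 when it
   is emitted with the epilogue guard.  */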
3917 /* If the number of iterations is known and we do not do versioning, we can
3918 decide whether to vectorize at compile time. Hence the scalar version
3919 does not carry cost model guard costs. */
3920 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
3921 || LOOP_REQUIRES_VERSIONING (loop_vinfo))
3923 /* Cost model check occurs at versioning. */
3924 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
3925 scalar_outside_cost += vect_get_stmt_cost (cond_branch_not_taken);
3926 else
3928 /* Cost model check occurs at prologue generation. */
3929 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
3930 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken)
3931 + vect_get_stmt_cost (cond_branch_not_taken);
3932 /* Cost model check occurs at epilogue generation. */
3933 else
3934 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken);
3938 /* Complete the target-specific cost calculations. */
3939 finish_cost (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo), &vec_prologue_cost,
3940 &vec_inside_cost, &vec_epilogue_cost);
3942 vec_outside_cost = (int)(vec_prologue_cost + vec_epilogue_cost);
3944 if (dump_enabled_p ())
3946 dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
3947 dump_printf (MSG_NOTE, " Vector inside of loop cost: %d\n",
3948 vec_inside_cost);
3949 dump_printf (MSG_NOTE, " Vector prologue cost: %d\n",
3950 vec_prologue_cost);
3951 dump_printf (MSG_NOTE, " Vector epilogue cost: %d\n",
3952 vec_epilogue_cost);
3953 dump_printf (MSG_NOTE, " Scalar iteration cost: %d\n",
3954 scalar_single_iter_cost);
3955 dump_printf (MSG_NOTE, " Scalar outside cost: %d\n",
3956 scalar_outside_cost);
3957 dump_printf (MSG_NOTE, " Vector outside cost: %d\n",
3958 vec_outside_cost);
3959 dump_printf (MSG_NOTE, " prologue iterations: %d\n",
3960 peel_iters_prologue);
3961 dump_printf (MSG_NOTE, " epilogue iterations: %d\n",
3962 peel_iters_epilogue);
3965 /* Calculate number of iterations required to make the vector version
3966 profitable, relative to the loop bodies only. The following condition
3967 must hold true:
3968 SIC * niters + SOC > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC
3969 where
3970 SIC = scalar iteration cost, VIC = vector iteration cost,
3971 VOC = vector outside cost, VF = vectorization factor,
3972 PL_ITERS = prologue iterations, EP_ITERS= epilogue iterations
3973 SOC = scalar outside cost for run time cost model check. */
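/* Worked example (editor's addition, purely illustrative costs, not taken
   from any target): with SIC == 2, VIC == 4, VF == 4, VOC == 12, SOC == 4
   and no peel iterations, the condition 2*niters + 4 > 4*(niters/4) + 12
   simplifies to niters > 8.  The code below computes
   ((12 - 4) * 4) / (2*4 - 4) == 8 and the boundary check bumps this to 9,
   i.e. at least 9 iterations are needed before vectorizing pays off.  */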
3975 if ((scalar_single_iter_cost * assumed_vf) > (int) vec_inside_cost)
3977 min_profitable_iters = ((vec_outside_cost - scalar_outside_cost)
3978 * assumed_vf
3979 - vec_inside_cost * peel_iters_prologue
3980 - vec_inside_cost * peel_iters_epilogue);
3981 if (min_profitable_iters <= 0)
3982 min_profitable_iters = 0;
3983 else
3985 min_profitable_iters /= ((scalar_single_iter_cost * assumed_vf)
3986 - vec_inside_cost);
3988 if ((scalar_single_iter_cost * assumed_vf * min_profitable_iters)
3989 <= (((int) vec_inside_cost * min_profitable_iters)
3990 + (((int) vec_outside_cost - scalar_outside_cost)
3991 * assumed_vf)))
3992 min_profitable_iters++;
3995 /* vector version will never be profitable. */
3996 else
3998 if (LOOP_VINFO_LOOP (loop_vinfo)->force_vectorize)
3999 warning_at (vect_location, OPT_Wopenmp_simd, "vectorization "
4000 "did not happen for a simd loop");
4002 if (dump_enabled_p ())
4003 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4004 "cost model: the vector iteration cost = %d "
4005 "divided by the scalar iteration cost = %d "
4006 "is greater or equal to the vectorization factor = %d"
4007 ".\n",
4008 vec_inside_cost, scalar_single_iter_cost, assumed_vf);
4009 *ret_min_profitable_niters = -1;
4010 *ret_min_profitable_estimate = -1;
4011 return;
4014 dump_printf (MSG_NOTE,
4015 " Calculated minimum iters for profitability: %d\n",
4016 min_profitable_iters);
4018 if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
4019 && min_profitable_iters < (assumed_vf + peel_iters_prologue))
4020 /* We want the vectorized loop to execute at least once. */
4021 min_profitable_iters = assumed_vf + peel_iters_prologue;
4023 if (dump_enabled_p ())
4024 dump_printf_loc (MSG_NOTE, vect_location,
4025 " Runtime profitability threshold = %d\n",
4026 min_profitable_iters);
4028 *ret_min_profitable_niters = min_profitable_iters;
4030 /* Calculate number of iterations required to make the vector version
4031 profitable, relative to the loop bodies only.
4033 The non-vectorized variant costs SIC * niters and it must win over the vector
4034 variant on the expected loop trip count. The following condition must hold true:
4035 SIC * niters > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC + SOC */
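/* Continuing the illustrative numbers above (SIC == 2, VIC == 4, VF == 4,
   VOC == 12, SOC == 4, no peeling): because SOC is now added rather than
   subtracted, the static estimate is ((12 + 4) * 4) / (2*4 - 4) == 16,
   which is then combined with the runtime threshold via MAX below.  */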
4037 if (vec_outside_cost <= 0)
4038 min_profitable_estimate = 0;
4039 else
4041 min_profitable_estimate = ((vec_outside_cost + scalar_outside_cost)
4042 * assumed_vf
4043 - vec_inside_cost * peel_iters_prologue
4044 - vec_inside_cost * peel_iters_epilogue)
4045 / ((scalar_single_iter_cost * assumed_vf)
4046 - vec_inside_cost);
4048 min_profitable_estimate = MAX (min_profitable_estimate, min_profitable_iters);
4049 if (dump_enabled_p ())
4050 dump_printf_loc (MSG_NOTE, vect_location,
4051 " Static estimate profitability threshold = %d\n",
4052 min_profitable_estimate);
4054 *ret_min_profitable_estimate = min_profitable_estimate;
4057 /* Writes into SEL a mask for a vec_perm, equivalent to a vec_shr by OFFSET
4058 vector elements (not bits) for a vector with NELT elements. */
4059 static void
4060 calc_vec_perm_mask_for_shift (unsigned int offset, unsigned int nelt,
4061 vec_perm_builder *sel)
4063 /* The encoding is a single stepped pattern. Any wrap-around is handled
4064 by vec_perm_indices. */
4065 sel->new_vector (nelt, 1, 3);
4066 for (unsigned int i = 0; i < 3; i++)
4067 sel->quick_push (i + offset);
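/* Illustrative example (editor's addition): for OFFSET == 2 and NELT == 8
   the three pushed elements {2, 3, 4} encode the stepped series
   {2, 3, ..., 9}; indices >= NELT select from the second input vector once
   vec_perm_indices expands the pattern.  have_whole_vector_shift below
   tries the offsets NELT/2, NELT/4, ..., 1 in turn.  */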
4070 /* Checks whether the target supports whole-vector shifts for vectors of mode
4071 MODE. This is the case if _either_ the platform handles vec_shr_optab, _or_
4072 it supports vec_perm_const with masks for all necessary shift amounts. */
4073 static bool
4074 have_whole_vector_shift (machine_mode mode)
4076 if (optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
4077 return true;
4079 /* Variable-length vectors should be handled via the optab. */
4080 unsigned int nelt;
4081 if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
4082 return false;
4084 vec_perm_builder sel;
4085 vec_perm_indices indices;
4086 for (unsigned int i = nelt / 2; i >= 1; i /= 2)
4088 calc_vec_perm_mask_for_shift (i, nelt, &sel);
4089 indices.new_vector (sel, 2, nelt);
4090 if (!can_vec_perm_const_p (mode, indices, false))
4091 return false;
4093 return true;
4096 /* TODO: Close dependency between vect_model_*_cost and vectorizable_*
4097 functions. Design better to avoid maintenance issues. */
4099 /* Function vect_model_reduction_cost.
4101 Models cost for a reduction operation, including the vector ops
4102 generated within the strip-mine loop, the initial definition before
4103 the loop, and the epilogue code that must be generated. */
4105 static void
4106 vect_model_reduction_cost (stmt_vec_info stmt_info, internal_fn reduc_fn,
4107 int ncopies)
4109 int prologue_cost = 0, epilogue_cost = 0, inside_cost;
4110 enum tree_code code;
4111 optab optab;
4112 tree vectype;
4113 gimple *orig_stmt;
4114 machine_mode mode;
4115 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4116 struct loop *loop = NULL;
4117 void *target_cost_data;
4119 if (loop_vinfo)
4121 loop = LOOP_VINFO_LOOP (loop_vinfo);
4122 target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
4124 else
4125 target_cost_data = BB_VINFO_TARGET_COST_DATA (STMT_VINFO_BB_VINFO (stmt_info));
4127 /* Condition reductions generate two reductions in the loop. */
4128 vect_reduction_type reduction_type
4129 = STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info);
4130 if (reduction_type == COND_REDUCTION)
4131 ncopies *= 2;
4133 vectype = STMT_VINFO_VECTYPE (stmt_info);
4134 mode = TYPE_MODE (vectype);
4135 orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
4137 if (!orig_stmt)
4138 orig_stmt = STMT_VINFO_STMT (stmt_info);
4140 code = gimple_assign_rhs_code (orig_stmt);
4142 if (reduction_type == EXTRACT_LAST_REDUCTION
4143 || reduction_type == FOLD_LEFT_REDUCTION)
4145 /* No extra instructions needed in the prologue. */
4146 prologue_cost = 0;
4148 if (reduction_type == EXTRACT_LAST_REDUCTION || reduc_fn != IFN_LAST)
4149 /* Count one reduction-like operation per vector. */
4150 inside_cost = add_stmt_cost (target_cost_data, ncopies, vec_to_scalar,
4151 stmt_info, 0, vect_body);
4152 else
4154 /* Use NELEMENTS extracts and NELEMENTS scalar ops. */
4155 unsigned int nelements = ncopies * vect_nunits_for_cost (vectype);
4156 inside_cost = add_stmt_cost (target_cost_data, nelements,
4157 vec_to_scalar, stmt_info, 0,
4158 vect_body);
4159 inside_cost += add_stmt_cost (target_cost_data, nelements,
4160 scalar_stmt, stmt_info, 0,
4161 vect_body);
4164 else
4166 /* Add in cost for initial definition.
4167 For cond reduction we have four vectors: initial index, step,
4168 initial result of the data reduction, initial value of the index
4169 reduction. */
4170 int prologue_stmts = reduction_type == COND_REDUCTION ? 4 : 1;
4171 prologue_cost += add_stmt_cost (target_cost_data, prologue_stmts,
4172 scalar_to_vec, stmt_info, 0,
4173 vect_prologue);
4175 /* Cost of reduction op inside loop. */
4176 inside_cost = add_stmt_cost (target_cost_data, ncopies, vector_stmt,
4177 stmt_info, 0, vect_body);
4180 /* Determine cost of epilogue code.
4182 We have a reduction operator that will reduce the vector in one statement.
4183 Also requires scalar extract. */
4185 if (!loop || !nested_in_vect_loop_p (loop, orig_stmt))
4187 if (reduc_fn != IFN_LAST)
4189 if (reduction_type == COND_REDUCTION)
4191 /* An EQ stmt and a COND_EXPR stmt. */
4192 epilogue_cost += add_stmt_cost (target_cost_data, 2,
4193 vector_stmt, stmt_info, 0,
4194 vect_epilogue);
4195 /* Reduction of the max index and a reduction of the found
4196 values. */
4197 epilogue_cost += add_stmt_cost (target_cost_data, 2,
4198 vec_to_scalar, stmt_info, 0,
4199 vect_epilogue);
4200 /* A broadcast of the max value. */
4201 epilogue_cost += add_stmt_cost (target_cost_data, 1,
4202 scalar_to_vec, stmt_info, 0,
4203 vect_epilogue);
4205 else
4207 epilogue_cost += add_stmt_cost (target_cost_data, 1, vector_stmt,
4208 stmt_info, 0, vect_epilogue);
4209 epilogue_cost += add_stmt_cost (target_cost_data, 1,
4210 vec_to_scalar, stmt_info, 0,
4211 vect_epilogue);
4214 else if (reduction_type == COND_REDUCTION)
4216 unsigned estimated_nunits = vect_nunits_for_cost (vectype);
4217 /* Extraction of scalar elements. */
4218 epilogue_cost += add_stmt_cost (target_cost_data,
4219 2 * estimated_nunits,
4220 vec_to_scalar, stmt_info, 0,
4221 vect_epilogue);
4222 /* Scalar max reductions via COND_EXPR / MAX_EXPR. */
4223 epilogue_cost += add_stmt_cost (target_cost_data,
4224 2 * estimated_nunits - 3,
4225 scalar_stmt, stmt_info, 0,
4226 vect_epilogue);
4228 else if (reduction_type == EXTRACT_LAST_REDUCTION
4229 || reduction_type == FOLD_LEFT_REDUCTION)
4230 /* No extra instructions needed in the epilogue. */
4232 else
4234 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
4235 tree bitsize =
4236 TYPE_SIZE (TREE_TYPE (gimple_assign_lhs (orig_stmt)));
4237 int element_bitsize = tree_to_uhwi (bitsize);
4238 int nelements = vec_size_in_bits / element_bitsize;
4240 if (code == COND_EXPR)
4241 code = MAX_EXPR;
4243 optab = optab_for_tree_code (code, vectype, optab_default);
4245 /* We have a whole vector shift available. */
4246 if (optab != unknown_optab
4247 && VECTOR_MODE_P (mode)
4248 && optab_handler (optab, mode) != CODE_FOR_nothing
4249 && have_whole_vector_shift (mode))
4251 /* Final reduction via vector shifts and the reduction operator.
4252 Also requires scalar extract. */
4253 epilogue_cost += add_stmt_cost (target_cost_data,
4254 exact_log2 (nelements) * 2,
4255 vector_stmt, stmt_info, 0,
4256 vect_epilogue);
4257 epilogue_cost += add_stmt_cost (target_cost_data, 1,
4258 vec_to_scalar, stmt_info, 0,
4259 vect_epilogue);
4261 else
4262 /* Use extracts and reduction op for final reduction. For N
4263 elements, we have N extracts and N-1 reduction ops. */
4264 epilogue_cost += add_stmt_cost (target_cost_data,
4265 nelements + nelements - 1,
4266 vector_stmt, stmt_info, 0,
4267 vect_epilogue);
4271 if (dump_enabled_p ())
4272 dump_printf (MSG_NOTE,
4273 "vect_model_reduction_cost: inside_cost = %d, "
4274 "prologue_cost = %d, epilogue_cost = %d .\n", inside_cost,
4275 prologue_cost, epilogue_cost);
4279 /* Function vect_model_induction_cost.
4281 Models cost for induction operations. */
4283 static void
4284 vect_model_induction_cost (stmt_vec_info stmt_info, int ncopies)
4286 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4287 void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
4288 unsigned inside_cost, prologue_cost;
4290 if (PURE_SLP_STMT (stmt_info))
4291 return;
4293 /* loop cost for vec_loop. */
4294 inside_cost = add_stmt_cost (target_cost_data, ncopies, vector_stmt,
4295 stmt_info, 0, vect_body);
4297 /* prologue cost for vec_init and vec_step. */
4298 prologue_cost = add_stmt_cost (target_cost_data, 2, scalar_to_vec,
4299 stmt_info, 0, vect_prologue);
4301 if (dump_enabled_p ())
4302 dump_printf_loc (MSG_NOTE, vect_location,
4303 "vect_model_induction_cost: inside_cost = %d, "
4304 "prologue_cost = %d .\n", inside_cost, prologue_cost);
4309 /* Function get_initial_def_for_reduction
4311 Input:
4312 STMT - a stmt that performs a reduction operation in the loop.
4313 INIT_VAL - the initial value of the reduction variable
4315 Output:
4316 ADJUSTMENT_DEF - a tree that holds a value to be added to the final result
4317 of the reduction (used for adjusting the epilog - see below).
4318 Return a vector variable, initialized according to the operation that STMT
4319 performs. This vector will be used as the initial value of the
4320 vector of partial results.
4322 Option1 (adjust in epilog): Initialize the vector as follows:
4323 add/bit or/xor: [0,0,...,0,0]
4324 mult/bit and: [1,1,...,1,1]
4325 min/max/cond_expr: [init_val,init_val,..,init_val,init_val]
4326 and when necessary (e.g. add/mult case) let the caller know
4327 that it needs to adjust the result by init_val.
4329 Option2: Initialize the vector as follows:
4330 add/bit or/xor: [init_val,0,0,...,0]
4331 mult/bit and: [init_val,1,1,...,1]
4332 min/max/cond_expr: [init_val,init_val,...,init_val]
4333 and no adjustments are needed.
4335 For example, for the following code:
4337 s = init_val;
4338 for (i=0;i<n;i++)
4339 s = s + a[i];
4341 STMT is 's = s + a[i]', and the reduction variable is 's'.
4342 For a vector of 4 units, we want to return either [0,0,0,init_val],
4343 or [0,0,0,0] and let the caller know that it needs to adjust
4344 the result at the end by 'init_val'.
4346 FORNOW, we use Option1 (the 'adjust in epilog' scheme) when ADJUSTMENT_DEF
4347 is not NULL, because its initialization vector is simpler (the same element
4348 in all entries), and Option2 otherwise.
4350 A cost model should help decide between these two schemes. */
4352 tree
4353 get_initial_def_for_reduction (gimple *stmt, tree init_val,
4354 tree *adjustment_def)
4356 stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
4357 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
4358 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4359 tree scalar_type = TREE_TYPE (init_val);
4360 tree vectype = get_vectype_for_scalar_type (scalar_type);
4361 enum tree_code code = gimple_assign_rhs_code (stmt);
4362 tree def_for_init;
4363 tree init_def;
4364 bool nested_in_vect_loop = false;
4365 REAL_VALUE_TYPE real_init_val = dconst0;
4366 int int_init_val = 0;
4367 gimple *def_stmt = NULL;
4368 gimple_seq stmts = NULL;
4370 gcc_assert (vectype);
4372 gcc_assert (POINTER_TYPE_P (scalar_type) || INTEGRAL_TYPE_P (scalar_type)
4373 || SCALAR_FLOAT_TYPE_P (scalar_type));
4375 if (nested_in_vect_loop_p (loop, stmt))
4376 nested_in_vect_loop = true;
4377 else
4378 gcc_assert (loop == (gimple_bb (stmt))->loop_father);
4380 /* In case of double reduction we only create a vector variable to be put
4381 in the reduction phi node. The actual statement creation is done in
4382 vect_create_epilog_for_reduction. */
4383 if (adjustment_def && nested_in_vect_loop
4384 && TREE_CODE (init_val) == SSA_NAME
4385 && (def_stmt = SSA_NAME_DEF_STMT (init_val))
4386 && gimple_code (def_stmt) == GIMPLE_PHI
4387 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
4388 && vinfo_for_stmt (def_stmt)
4389 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
4390 == vect_double_reduction_def)
4392 *adjustment_def = NULL;
4393 return vect_create_destination_var (init_val, vectype);
4396 vect_reduction_type reduction_type
4397 = STMT_VINFO_VEC_REDUCTION_TYPE (stmt_vinfo);
4399 /* In case of a nested reduction do not use an adjustment def as
4400 that case is not handled correctly by the epilogue generation
4401 if ncopies is not one. */
4402 if (adjustment_def && nested_in_vect_loop)
4404 *adjustment_def = NULL;
4405 return vect_get_vec_def_for_operand (init_val, stmt);
4408 switch (code)
4410 case WIDEN_SUM_EXPR:
4411 case DOT_PROD_EXPR:
4412 case SAD_EXPR:
4413 case PLUS_EXPR:
4414 case MINUS_EXPR:
4415 case BIT_IOR_EXPR:
4416 case BIT_XOR_EXPR:
4417 case MULT_EXPR:
4418 case BIT_AND_EXPR:
4420 /* ADJUSTMENT_DEF is NULL when called from
4421 vect_create_epilog_for_reduction to vectorize double reduction. */
4422 if (adjustment_def)
4423 *adjustment_def = init_val;
4425 if (code == MULT_EXPR)
4427 real_init_val = dconst1;
4428 int_init_val = 1;
4431 if (code == BIT_AND_EXPR)
4432 int_init_val = -1;
4434 if (SCALAR_FLOAT_TYPE_P (scalar_type))
4435 def_for_init = build_real (scalar_type, real_init_val);
4436 else
4437 def_for_init = build_int_cst (scalar_type, int_init_val);
4439 if (adjustment_def)
4440 /* Option1: the first element is '0' or '1' as well. */
4441 init_def = gimple_build_vector_from_val (&stmts, vectype,
4442 def_for_init);
4443 else if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant ())
4445 /* Option2 (variable length): the first element is INIT_VAL. */
4446 init_def = build_vector_from_val (vectype, def_for_init);
4447 gcall *call = gimple_build_call_internal (IFN_VEC_SHL_INSERT,
4448 2, init_def, init_val);
4449 init_def = make_ssa_name (vectype);
4450 gimple_call_set_lhs (call, init_def);
4451 gimple_seq_add_stmt (&stmts, call);
4453 else
4455 /* Option2: the first element is INIT_VAL. */
4456 tree_vector_builder elts (vectype, 1, 2);
4457 elts.quick_push (init_val);
4458 elts.quick_push (def_for_init);
4459 init_def = gimple_build_vector (&stmts, &elts);
4462 break;
4464 case MIN_EXPR:
4465 case MAX_EXPR:
4466 case COND_EXPR:
4468 if (adjustment_def)
4470 *adjustment_def = NULL_TREE;
4471 if (reduction_type != COND_REDUCTION
4472 && reduction_type != EXTRACT_LAST_REDUCTION)
4474 init_def = vect_get_vec_def_for_operand (init_val, stmt);
4475 break;
4478 init_val = gimple_convert (&stmts, TREE_TYPE (vectype), init_val);
4479 init_def = gimple_build_vector_from_val (&stmts, vectype, init_val);
4481 break;
4483 default:
4484 gcc_unreachable ();
4487 if (stmts)
4488 gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), stmts);
4489 return init_def;
4492 /* Get at the initial defs for the reduction PHIs in SLP_NODE.
4493 NUMBER_OF_VECTORS is the number of vector defs to create.
4494 If NEUTRAL_OP is nonnull, introducing extra elements of that
4495 value will not change the result. */
4497 static void
4498 get_initial_defs_for_reduction (slp_tree slp_node,
4499 vec<tree> *vec_oprnds,
4500 unsigned int number_of_vectors,
4501 bool reduc_chain, tree neutral_op)
4503 vec<gimple *> stmts = SLP_TREE_SCALAR_STMTS (slp_node);
4504 gimple *stmt = stmts[0];
4505 stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
4506 unsigned HOST_WIDE_INT nunits;
4507 unsigned j, number_of_places_left_in_vector;
4508 tree vector_type;
4509 tree vop;
4510 int group_size = stmts.length ();
4511 unsigned int vec_num, i;
4512 unsigned number_of_copies = 1;
4513 vec<tree> voprnds;
4514 voprnds.create (number_of_vectors);
4515 struct loop *loop;
4516 auto_vec<tree, 16> permute_results;
4518 vector_type = STMT_VINFO_VECTYPE (stmt_vinfo);
4520 gcc_assert (STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_reduction_def);
4522 loop = (gimple_bb (stmt))->loop_father;
4523 gcc_assert (loop);
4524 edge pe = loop_preheader_edge (loop);
4526 gcc_assert (!reduc_chain || neutral_op);
4528 /* NUMBER_OF_COPIES is the number of times we need to use the same values in
4529 created vectors. It is greater than 1 if unrolling is performed.
4531 For example, we have two scalar operands, s1 and s2 (e.g., group of
4532 strided accesses of size two), while NUNITS is four (i.e., four scalars
4533 of this type can be packed in a vector). The output vector will contain
4534 two copies of each scalar operand: {s1, s2, s1, s2}. (NUMBER_OF_COPIES
4535 will be 2).
4537 If GROUP_SIZE > NUNITS, the scalars will be split into several vectors
4538 containing the operands.
4540 For example, NUNITS is four as before, and the group size is 8
4541 (s1, s2, ..., s8). We will create two vectors {s1, s2, s3, s4} and
4542 {s5, s6, s7, s8}. */
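/* Worked example of the formula used below (editor's addition): with
   GROUP_SIZE == 2, NUNITS == 4 and one vector to create,
   NUMBER_OF_COPIES == 4 * 1 / 2 == 2, giving {s1, s2, s1, s2}; with
   GROUP_SIZE == 8, NUNITS == 4 and two vectors to create,
   NUMBER_OF_COPIES == 4 * 2 / 8 == 1, giving {s1, s2, s3, s4} and
   {s5, s6, s7, s8}.  */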
4544 if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits))
4545 nunits = group_size;
4547 number_of_copies = nunits * number_of_vectors / group_size;
4549 number_of_places_left_in_vector = nunits;
4550 bool constant_p = true;
4551 tree_vector_builder elts (vector_type, nunits, 1);
4552 elts.quick_grow (nunits);
4553 for (j = 0; j < number_of_copies; j++)
4555 for (i = group_size - 1; stmts.iterate (i, &stmt); i--)
4557 tree op;
4558 /* Get the def before the loop. In reduction chain we have only
4559 one initial value. */
4560 if ((j != (number_of_copies - 1)
4561 || (reduc_chain && i != 0))
4562 && neutral_op)
4563 op = neutral_op;
4564 else
4565 op = PHI_ARG_DEF_FROM_EDGE (stmt, pe);
4567 /* Create 'vect_ = {op0,op1,...,opn}'. */
4568 number_of_places_left_in_vector--;
4569 elts[number_of_places_left_in_vector] = op;
4570 if (!CONSTANT_CLASS_P (op))
4571 constant_p = false;
4573 if (number_of_places_left_in_vector == 0)
4575 gimple_seq ctor_seq = NULL;
4576 tree init;
4577 if (constant_p && !neutral_op
4578 ? multiple_p (TYPE_VECTOR_SUBPARTS (vector_type), nunits)
4579 : known_eq (TYPE_VECTOR_SUBPARTS (vector_type), nunits))
4580 /* Build the vector directly from ELTS. */
4581 init = gimple_build_vector (&ctor_seq, &elts);
4582 else if (neutral_op)
4584 /* Build a vector of the neutral value and shift the
4585 other elements into place. */
4586 init = gimple_build_vector_from_val (&ctor_seq, vector_type,
4587 neutral_op);
4588 int k = nunits;
4589 while (k > 0 && elts[k - 1] == neutral_op)
4590 k -= 1;
4591 while (k > 0)
4593 k -= 1;
4594 gcall *call = gimple_build_call_internal
4595 (IFN_VEC_SHL_INSERT, 2, init, elts[k]);
4596 init = make_ssa_name (vector_type);
4597 gimple_call_set_lhs (call, init);
4598 gimple_seq_add_stmt (&ctor_seq, call);
4601 else
4603 /* First time round, duplicate ELTS to fill the
4604 required number of vectors, then cherry pick the
4605 appropriate result for each iteration. */
4606 if (vec_oprnds->is_empty ())
4607 duplicate_and_interleave (&ctor_seq, vector_type, elts,
4608 number_of_vectors,
4609 permute_results);
4610 init = permute_results[number_of_vectors - j - 1];
4612 if (ctor_seq != NULL)
4613 gsi_insert_seq_on_edge_immediate (pe, ctor_seq);
4614 voprnds.quick_push (init);
4616 number_of_places_left_in_vector = nunits;
4617 elts.new_vector (vector_type, nunits, 1);
4618 elts.quick_grow (nunits);
4619 constant_p = true;
4624 /* Since the vectors are created in the reverse order, we should invert
4625 them. */
4626 vec_num = voprnds.length ();
4627 for (j = vec_num; j != 0; j--)
4629 vop = voprnds[j - 1];
4630 vec_oprnds->quick_push (vop);
4633 voprnds.release ();
4635 /* In case that VF is greater than the unrolling factor needed for the SLP
4636 group of stmts, NUMBER_OF_VECTORS to be created is greater than
4637 NUMBER_OF_SCALARS/NUNITS or NUNITS/NUMBER_OF_SCALARS, and hence we have
4638 to replicate the vectors. */
4639 tree neutral_vec = NULL;
4640 while (number_of_vectors > vec_oprnds->length ())
4642 if (neutral_op)
4644 if (!neutral_vec)
4646 gimple_seq ctor_seq = NULL;
4647 neutral_vec = gimple_build_vector_from_val
4648 (&ctor_seq, vector_type, neutral_op);
4649 if (ctor_seq != NULL)
4650 gsi_insert_seq_on_edge_immediate (pe, ctor_seq);
4652 vec_oprnds->quick_push (neutral_vec);
4654 else
4656 for (i = 0; vec_oprnds->iterate (i, &vop) && i < vec_num; i++)
4657 vec_oprnds->quick_push (vop);
4663 /* Function vect_create_epilog_for_reduction
4665 Create code at the loop-epilog to finalize the result of a reduction
4666 computation.
4668 VECT_DEFS is list of vector of partial results, i.e., the lhs's of vector
4669 reduction statements.
4670 STMT is the scalar reduction stmt that is being vectorized.
4671 NCOPIES is > 1 in case the vectorization factor (VF) is bigger than the
4672 number of elements that we can fit in a vectype (nunits). In this case
4673 we have to generate more than one vector stmt - i.e - we need to "unroll"
4674 the vector stmt by a factor VF/nunits. For more details see documentation
4675 in vectorizable_operation.
4676 REDUC_FN is the internal function for the epilog reduction.
4677 REDUCTION_PHIS is a list of the phi-nodes that carry the reduction
4678 computation.
4679 REDUC_INDEX is the index of the operand in the right hand side of the
4680 statement that is defined by REDUCTION_PHI.
4681 DOUBLE_REDUC is TRUE if double reduction phi nodes should be handled.
4682 SLP_NODE is an SLP node containing a group of reduction statements. The
4683 first one in this group is STMT.
4684 INDUC_VAL is, for INTEGER_INDUC_COND_REDUCTION, the value to use for the case
4685 when the COND_EXPR is never true in the loop. For MAX_EXPR, it needs to
4686 be smaller than any value of the IV in the loop, for MIN_EXPR larger than
4687 any value of the IV in the loop.
4688 INDUC_CODE is the code for epilog reduction if INTEGER_INDUC_COND_REDUCTION.
4689 NEUTRAL_OP is the value given by neutral_op_for_slp_reduction; it is
4690 null if this is not an SLP reduction
4692 This function:
4693 1. Creates the reduction def-use cycles: sets the arguments for
4694 REDUCTION_PHIS:
4695 The loop-entry argument is the vectorized initial-value of the reduction.
4696 The loop-latch argument is taken from VECT_DEFS - the vector of partial
4697 sums.
4698 2. "Reduces" each vector of partial results VECT_DEFS into a single result,
4699 by calling the function specified by REDUC_FN if available, or by
4700 other means (whole-vector shifts or a scalar loop).
4701 The function also creates a new phi node at the loop exit to preserve
4702 loop-closed form, as illustrated below.
4704 The flow at the entry to this function:
4706 loop:
4707 vec_def = phi <null, null> # REDUCTION_PHI
4708 VECT_DEF = vector_stmt # vectorized form of STMT
4709 s_loop = scalar_stmt # (scalar) STMT
4710 loop_exit:
4711 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
4712 use <s_out0>
4713 use <s_out0>
4715 The above is transformed by this function into:
4717 loop:
4718 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
4719 VECT_DEF = vector_stmt # vectorized form of STMT
4720 s_loop = scalar_stmt # (scalar) STMT
4721 loop_exit:
4722 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
4723 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
4724 v_out2 = reduce <v_out1>
4725 s_out3 = extract_field <v_out2, 0>
4726 s_out4 = adjust_result <s_out3>
4727 use <s_out4>
4728 use <s_out4>
4731 static void
4732 vect_create_epilog_for_reduction (vec<tree> vect_defs, gimple *stmt,
4733 gimple *reduc_def_stmt,
4734 int ncopies, internal_fn reduc_fn,
4735 vec<gimple *> reduction_phis,
4736 bool double_reduc,
4737 slp_tree slp_node,
4738 slp_instance slp_node_instance,
4739 tree induc_val, enum tree_code induc_code,
4740 tree neutral_op)
4742 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4743 stmt_vec_info prev_phi_info;
4744 tree vectype;
4745 machine_mode mode;
4746 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4747 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *outer_loop = NULL;
4748 basic_block exit_bb;
4749 tree scalar_dest;
4750 tree scalar_type;
4751 gimple *new_phi = NULL, *phi;
4752 gimple_stmt_iterator exit_gsi;
4753 tree vec_dest;
4754 tree new_temp = NULL_TREE, new_dest, new_name, new_scalar_dest;
4755 gimple *epilog_stmt = NULL;
4756 enum tree_code code = gimple_assign_rhs_code (stmt);
4757 gimple *exit_phi;
4758 tree bitsize;
4759 tree adjustment_def = NULL;
4760 tree vec_initial_def = NULL;
4761 tree expr, def, initial_def = NULL;
4762 tree orig_name, scalar_result;
4763 imm_use_iterator imm_iter, phi_imm_iter;
4764 use_operand_p use_p, phi_use_p;
4765 gimple *use_stmt, *orig_stmt, *reduction_phi = NULL;
4766 bool nested_in_vect_loop = false;
4767 auto_vec<gimple *> new_phis;
4768 auto_vec<gimple *> inner_phis;
4769 enum vect_def_type dt = vect_unknown_def_type;
4770 int j, i;
4771 auto_vec<tree> scalar_results;
4772 unsigned int group_size = 1, k, ratio;
4773 auto_vec<tree> vec_initial_defs;
4774 auto_vec<gimple *> phis;
4775 bool slp_reduc = false;
4776 bool direct_slp_reduc;
4777 tree new_phi_result;
4778 gimple *inner_phi = NULL;
4779 tree induction_index = NULL_TREE;
4781 if (slp_node)
4782 group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
4784 if (nested_in_vect_loop_p (loop, stmt))
4786 outer_loop = loop;
4787 loop = loop->inner;
4788 nested_in_vect_loop = true;
4789 gcc_assert (!slp_node);
4792 vectype = STMT_VINFO_VECTYPE (stmt_info);
4793 gcc_assert (vectype);
4794 mode = TYPE_MODE (vectype);
4796 /* 1. Create the reduction def-use cycle:
4797 Set the arguments of REDUCTION_PHIS, i.e., transform
4799 loop:
4800 vec_def = phi <null, null> # REDUCTION_PHI
4801 VECT_DEF = vector_stmt # vectorized form of STMT
4804 into:
4806 loop:
4807 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
4808 VECT_DEF = vector_stmt # vectorized form of STMT
4811 (in case of SLP, do it for all the phis). */
4813 /* Get the loop-entry arguments. */
4814 enum vect_def_type initial_def_dt = vect_unknown_def_type;
4815 if (slp_node)
4817 unsigned vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
4818 vec_initial_defs.reserve (vec_num);
4819 get_initial_defs_for_reduction (slp_node_instance->reduc_phis,
4820 &vec_initial_defs, vec_num,
4821 GROUP_FIRST_ELEMENT (stmt_info),
4822 neutral_op);
4824 else
4826 /* Get at the scalar def before the loop, that defines the initial value
4827 of the reduction variable. */
4828 gimple *def_stmt;
4829 initial_def = PHI_ARG_DEF_FROM_EDGE (reduc_def_stmt,
4830 loop_preheader_edge (loop));
4831 /* Optimize: if initial_def is for REDUC_MAX smaller than the base
4832 and we can't use zero for induc_val, use initial_def. Similarly
4833 for REDUC_MIN and initial_def larger than the base. */
4834 if (TREE_CODE (initial_def) == INTEGER_CST
4835 && (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
4836 == INTEGER_INDUC_COND_REDUCTION)
4837 && !integer_zerop (induc_val)
4838 && ((induc_code == MAX_EXPR
4839 && tree_int_cst_lt (initial_def, induc_val))
4840 || (induc_code == MIN_EXPR
4841 && tree_int_cst_lt (induc_val, initial_def))))
4842 induc_val = initial_def;
4843 vect_is_simple_use (initial_def, loop_vinfo, &def_stmt, &initial_def_dt);
4844 vec_initial_def = get_initial_def_for_reduction (stmt, initial_def,
4845 &adjustment_def);
4846 vec_initial_defs.create (1);
4847 vec_initial_defs.quick_push (vec_initial_def);
4850 /* Set phi nodes arguments. */
4851 FOR_EACH_VEC_ELT (reduction_phis, i, phi)
4853 tree vec_init_def = vec_initial_defs[i];
4854 tree def = vect_defs[i];
4855 for (j = 0; j < ncopies; j++)
4857 if (j != 0)
4859 phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi));
4860 if (nested_in_vect_loop)
4861 vec_init_def
4862 = vect_get_vec_def_for_stmt_copy (initial_def_dt,
4863 vec_init_def);
4866 /* Set the loop-entry arg of the reduction-phi. */
4868 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
4869 == INTEGER_INDUC_COND_REDUCTION)
4871 /* Initialise the reduction phi to zero. This prevents non-zero initial
4872 values from interfering with the reduction op. */
4873 gcc_assert (ncopies == 1);
4874 gcc_assert (i == 0);
4876 tree vec_init_def_type = TREE_TYPE (vec_init_def);
4877 tree induc_val_vec
4878 = build_vector_from_val (vec_init_def_type, induc_val);
4880 add_phi_arg (as_a <gphi *> (phi), induc_val_vec,
4881 loop_preheader_edge (loop), UNKNOWN_LOCATION);
4883 else
4884 add_phi_arg (as_a <gphi *> (phi), vec_init_def,
4885 loop_preheader_edge (loop), UNKNOWN_LOCATION);
4887 /* Set the loop-latch arg for the reduction-phi. */
4888 if (j > 0)
4889 def = vect_get_vec_def_for_stmt_copy (vect_unknown_def_type, def);
4891 add_phi_arg (as_a <gphi *> (phi), def, loop_latch_edge (loop),
4892 UNKNOWN_LOCATION);
4894 if (dump_enabled_p ())
4896 dump_printf_loc (MSG_NOTE, vect_location,
4897 "transform reduction: created def-use cycle: ");
4898 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
4899 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, SSA_NAME_DEF_STMT (def), 0);
4904 /* For cond reductions we want to create a new vector (INDEX_COND_EXPR)
4905 which is updated with the current index of the loop for every match of
4906 the original loop's cond_expr (VEC_STMT). This results in a vector
4907 containing the last time the condition passed for that vector lane.
4908 The first match will be a 1 to allow 0 to be used for non-matching
4909 indexes. If there are no matches at all then the vector will be all
4910 zeroes. */
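/* Illustrative example (editor's addition): with a 4-lane vector the index
   IV takes the values {1,2,3,4}, {5,6,7,8}, {9,10,11,12}, ... in successive
   vector iterations.  If lane 2 last matched in the first vector iteration
   and lane 0 last matched in the third, the final INDUCTION_INDEX is
   {9, 0, 3, 0}; a loop with no matches leaves it all zeros.  */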
4911 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION)
4913 tree indx_before_incr, indx_after_incr;
4914 poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype);
4916 gimple *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info);
4917 gcc_assert (gimple_assign_rhs_code (vec_stmt) == VEC_COND_EXPR);
4919 int scalar_precision
4920 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (TREE_TYPE (vectype)));
4921 tree cr_index_scalar_type = make_unsigned_type (scalar_precision);
4922 tree cr_index_vector_type = build_vector_type
4923 (cr_index_scalar_type, TYPE_VECTOR_SUBPARTS (vectype));
4925 /* First we create a simple vector induction variable which starts
4926 with the values {1,2,3,...} (SERIES_VECT) and increments by the
4927 vector size (STEP). */
4929 /* Create a {1,2,3,...} vector. */
4930 tree series_vect = build_index_vector (cr_index_vector_type, 1, 1);
4932 /* Create a vector of the step value. */
4933 tree step = build_int_cst (cr_index_scalar_type, nunits_out);
4934 tree vec_step = build_vector_from_val (cr_index_vector_type, step);
4936 /* Create an induction variable. */
4937 gimple_stmt_iterator incr_gsi;
4938 bool insert_after;
4939 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
4940 create_iv (series_vect, vec_step, NULL_TREE, loop, &incr_gsi,
4941 insert_after, &indx_before_incr, &indx_after_incr);
4943 /* Next create a new phi node vector (NEW_PHI_TREE) which starts
4944 filled with zeros (VEC_ZERO). */
4946 /* Create a vector of 0s. */
4947 tree zero = build_zero_cst (cr_index_scalar_type);
4948 tree vec_zero = build_vector_from_val (cr_index_vector_type, zero);
4950 /* Create a vector phi node. */
4951 tree new_phi_tree = make_ssa_name (cr_index_vector_type);
4952 new_phi = create_phi_node (new_phi_tree, loop->header);
4953 set_vinfo_for_stmt (new_phi,
4954 new_stmt_vec_info (new_phi, loop_vinfo));
4955 add_phi_arg (as_a <gphi *> (new_phi), vec_zero,
4956 loop_preheader_edge (loop), UNKNOWN_LOCATION);
4958 /* Now take the condition from the loops original cond_expr
4959 (VEC_STMT) and produce a new cond_expr (INDEX_COND_EXPR) which for
4960 every match uses values from the induction variable
4961 (INDEX_BEFORE_INCR) otherwise uses values from the phi node
4962 (NEW_PHI_TREE).
4963 Finally, we update the phi (NEW_PHI_TREE) to take the value of
4964 the new cond_expr (INDEX_COND_EXPR). */
4966 /* Duplicate the condition from vec_stmt. */
4967 tree ccompare = unshare_expr (gimple_assign_rhs1 (vec_stmt));
4969 /* Create a conditional, where the condition is taken from vec_stmt
4970 (CCOMPARE), then is the induction index (INDEX_BEFORE_INCR) and
4971 else is the phi (NEW_PHI_TREE). */
4972 tree index_cond_expr = build3 (VEC_COND_EXPR, cr_index_vector_type,
4973 ccompare, indx_before_incr,
4974 new_phi_tree);
4975 induction_index = make_ssa_name (cr_index_vector_type);
4976 gimple *index_condition = gimple_build_assign (induction_index,
4977 index_cond_expr);
4978 gsi_insert_before (&incr_gsi, index_condition, GSI_SAME_STMT);
4979 stmt_vec_info index_vec_info = new_stmt_vec_info (index_condition,
4980 loop_vinfo);
4981 STMT_VINFO_VECTYPE (index_vec_info) = cr_index_vector_type;
4982 set_vinfo_for_stmt (index_condition, index_vec_info);
4984 /* Update the phi with the vec cond. */
4985 add_phi_arg (as_a <gphi *> (new_phi), induction_index,
4986 loop_latch_edge (loop), UNKNOWN_LOCATION);
4989 /* 2. Create epilog code.
4990 The reduction epilog code operates across the elements of the vector
4991 of partial results computed by the vectorized loop.
4992 The reduction epilog code consists of:
4994 step 1: compute the scalar result in a vector (v_out2)
4995 step 2: extract the scalar result (s_out3) from the vector (v_out2)
4996 step 3: adjust the scalar result (s_out3) if needed.
4998 Step 1 can be accomplished using one of the following three schemes:
4999 (scheme 1) using reduc_fn, if available.
5000 (scheme 2) using whole-vector shifts, if available.
5001 (scheme 3) using a scalar loop. In this case steps 1+2 above are
5002 combined.
5004 The overall epilog code looks like this:
5006 s_out0 = phi <s_loop> # original EXIT_PHI
5007 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
5008 v_out2 = reduce <v_out1> # step 1
5009 s_out3 = extract_field <v_out2, 0> # step 2
5010 s_out4 = adjust_result <s_out3> # step 3
5012 (step 3 is optional, and steps 1 and 2 may be combined).
5013 Lastly, the uses of s_out0 are replaced by s_out4. */
5016 /* 2.1 Create new loop-exit-phis to preserve loop-closed form:
5017 v_out1 = phi <VECT_DEF>
5018 Store them in NEW_PHIS. */
5020 exit_bb = single_exit (loop)->dest;
5021 prev_phi_info = NULL;
5022 new_phis.create (vect_defs.length ());
5023 FOR_EACH_VEC_ELT (vect_defs, i, def)
5025 for (j = 0; j < ncopies; j++)
5027 tree new_def = copy_ssa_name (def);
5028 phi = create_phi_node (new_def, exit_bb);
5029 set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, loop_vinfo));
5030 if (j == 0)
5031 new_phis.quick_push (phi);
5032 else
5034 def = vect_get_vec_def_for_stmt_copy (dt, def);
5035 STMT_VINFO_RELATED_STMT (prev_phi_info) = phi;
5038 SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, def);
5039 prev_phi_info = vinfo_for_stmt (phi);
5043 /* The epilogue is created for the outer-loop, i.e., for the loop being
5044 vectorized. Create exit phis for the outer loop. */
5045 if (double_reduc)
5047 loop = outer_loop;
5048 exit_bb = single_exit (loop)->dest;
5049 inner_phis.create (vect_defs.length ());
5050 FOR_EACH_VEC_ELT (new_phis, i, phi)
5052 tree new_result = copy_ssa_name (PHI_RESULT (phi));
5053 gphi *outer_phi = create_phi_node (new_result, exit_bb);
5054 SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx,
5055 PHI_RESULT (phi));
5056 set_vinfo_for_stmt (outer_phi, new_stmt_vec_info (outer_phi,
5057 loop_vinfo));
5058 inner_phis.quick_push (phi);
5059 new_phis[i] = outer_phi;
5060 prev_phi_info = vinfo_for_stmt (outer_phi);
5061 while (STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi)))
5063 phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi));
5064 new_result = copy_ssa_name (PHI_RESULT (phi));
5065 outer_phi = create_phi_node (new_result, exit_bb);
5066 SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx,
5067 PHI_RESULT (phi));
5068 set_vinfo_for_stmt (outer_phi, new_stmt_vec_info (outer_phi,
5069 loop_vinfo));
5070 STMT_VINFO_RELATED_STMT (prev_phi_info) = outer_phi;
5071 prev_phi_info = vinfo_for_stmt (outer_phi);
5076 exit_gsi = gsi_after_labels (exit_bb);
5078 /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
5079 (i.e. when reduc_fn is not available) and in the final adjustment
5080 code (if needed). Also get the original scalar reduction variable as
5081 defined in the loop. In case STMT is a "pattern-stmt" (i.e. - it
5082 represents a reduction pattern), the tree-code and scalar-def are
5083 taken from the original stmt that the pattern-stmt (STMT) replaces.
5084 Otherwise (it is a regular reduction) - the tree-code and scalar-def
5085 are taken from STMT. */
5087 orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
5088 if (!orig_stmt)
5090 /* Regular reduction */
5091 orig_stmt = stmt;
5093 else
5095 /* Reduction pattern */
5096 stmt_vec_info stmt_vinfo = vinfo_for_stmt (orig_stmt);
5097 gcc_assert (STMT_VINFO_IN_PATTERN_P (stmt_vinfo));
5098 gcc_assert (STMT_VINFO_RELATED_STMT (stmt_vinfo) == stmt);
5101 code = gimple_assign_rhs_code (orig_stmt);
5102 /* For MINUS_EXPR the initial vector is [init_val,0,...,0], therefore,
5103 partial results are added and not subtracted. */
5104 if (code == MINUS_EXPR)
5105 code = PLUS_EXPR;
5107 scalar_dest = gimple_assign_lhs (orig_stmt);
5108 scalar_type = TREE_TYPE (scalar_dest);
5109 scalar_results.create (group_size);
5110 new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
5111 bitsize = TYPE_SIZE (scalar_type);
5113 /* In case this is a reduction in an inner-loop while vectorizing an outer
5114 loop - we don't need to extract a single scalar result at the end of the
5115 inner-loop (unless it is double reduction, i.e., the use of reduction is
5116 outside the outer-loop). The final vector of partial results will be used
5117 in the vectorized outer-loop, or reduced to a scalar result at the end of
5118 the outer-loop. */
5119 if (nested_in_vect_loop && !double_reduc)
5120 goto vect_finalize_reduction;
5122 /* SLP reduction without reduction chain, e.g.,
5123 # a1 = phi <a2, a0>
5124 # b1 = phi <b2, b0>
5125 a2 = operation (a1)
5126 b2 = operation (b1) */
5127 slp_reduc = (slp_node && !GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)));
5129 /* True if we should implement SLP_REDUC using native reduction operations
5130 instead of scalar operations. */
5131 direct_slp_reduc = (reduc_fn != IFN_LAST
5132 && slp_reduc
5133 && !TYPE_VECTOR_SUBPARTS (vectype).is_constant ());
5135 /* In case of reduction chain, e.g.,
5136 # a1 = phi <a3, a0>
5137 a2 = operation (a1)
5138 a3 = operation (a2),
5140 we may end up with more than one vector result. Here we reduce them to
5141 one vector. */
5142 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) || direct_slp_reduc)
5144 tree first_vect = PHI_RESULT (new_phis[0]);
5145 gassign *new_vec_stmt = NULL;
5146 vec_dest = vect_create_destination_var (scalar_dest, vectype);
5147 for (k = 1; k < new_phis.length (); k++)
5149 gimple *next_phi = new_phis[k];
5150 tree second_vect = PHI_RESULT (next_phi);
5151 tree tem = make_ssa_name (vec_dest, new_vec_stmt);
5152 new_vec_stmt = gimple_build_assign (tem, code,
5153 first_vect, second_vect);
5154 gsi_insert_before (&exit_gsi, new_vec_stmt, GSI_SAME_STMT);
5155 first_vect = tem;
5158 new_phi_result = first_vect;
5159 if (new_vec_stmt)
5161 new_phis.truncate (0);
5162 new_phis.safe_push (new_vec_stmt);
5165 /* Likewise if we couldn't use a single def-use cycle. */
5166 else if (ncopies > 1)
5168 gcc_assert (new_phis.length () == 1);
5169 tree first_vect = PHI_RESULT (new_phis[0]);
5170 gassign *new_vec_stmt = NULL;
5171 vec_dest = vect_create_destination_var (scalar_dest, vectype);
5172 gimple *next_phi = new_phis[0];
5173 for (int k = 1; k < ncopies; ++k)
5175 next_phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (next_phi));
5176 tree second_vect = PHI_RESULT (next_phi);
5177 tree tem = make_ssa_name (vec_dest, new_vec_stmt);
5178 new_vec_stmt = gimple_build_assign (tem, code,
5179 first_vect, second_vect);
5180 gsi_insert_before (&exit_gsi, new_vec_stmt, GSI_SAME_STMT);
5181 first_vect = tem;
5183 new_phi_result = first_vect;
5184 new_phis.truncate (0);
5185 new_phis.safe_push (new_vec_stmt);
5187 else
5188 new_phi_result = PHI_RESULT (new_phis[0]);
5190 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION
5191 && reduc_fn != IFN_LAST)
5193 /* For condition reductions, we have a vector (NEW_PHI_RESULT) containing
5194 various data values where the condition matched and another vector
5195 (INDUCTION_INDEX) containing all the indexes of those matches. We
5196 need to extract the last matching index (which will be the index with
5197 highest value) and use this to index into the data vector.
5198 For the case where there were no matches, the data vector will contain
5199 all default values and the index vector will be all zeros. */
5201 /* Get various versions of the type of the vector of indexes. */
5202 tree index_vec_type = TREE_TYPE (induction_index);
5203 gcc_checking_assert (TYPE_UNSIGNED (index_vec_type));
5204 tree index_scalar_type = TREE_TYPE (index_vec_type);
5205 tree index_vec_cmp_type = build_same_sized_truth_vector_type
5206 (index_vec_type);
5208 /* Get an unsigned integer version of the type of the data vector. */
5209 int scalar_precision
5210 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
5211 tree scalar_type_unsigned = make_unsigned_type (scalar_precision);
5212 tree vectype_unsigned = build_vector_type
5213 (scalar_type_unsigned, TYPE_VECTOR_SUBPARTS (vectype));
5215 /* First we need to create a vector (ZERO_VEC) of zeros and another
5216 vector (MAX_INDEX_VEC) filled with the last matching index, which we
5217 can create using a MAX reduction and then expanding.
5218 In the case where the loop never made any matches, the max index will
5219 be zero. */
5221 /* Vector of {0, 0, 0,...}. */
5222 tree zero_vec = make_ssa_name (vectype);
5223 tree zero_vec_rhs = build_zero_cst (vectype);
5224 gimple *zero_vec_stmt = gimple_build_assign (zero_vec, zero_vec_rhs);
5225 gsi_insert_before (&exit_gsi, zero_vec_stmt, GSI_SAME_STMT);
5227 /* Find maximum value from the vector of found indexes. */
5228 tree max_index = make_ssa_name (index_scalar_type);
5229 gcall *max_index_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
5230 1, induction_index);
5231 gimple_call_set_lhs (max_index_stmt, max_index);
5232 gsi_insert_before (&exit_gsi, max_index_stmt, GSI_SAME_STMT);
5234 /* Vector of {max_index, max_index, max_index,...}. */
5235 tree max_index_vec = make_ssa_name (index_vec_type);
5236 tree max_index_vec_rhs = build_vector_from_val (index_vec_type,
5237 max_index);
5238 gimple *max_index_vec_stmt = gimple_build_assign (max_index_vec,
5239 max_index_vec_rhs);
5240 gsi_insert_before (&exit_gsi, max_index_vec_stmt, GSI_SAME_STMT);
5242 /* Next we compare the new vector (MAX_INDEX_VEC) full of max indexes
5243 with the vector (INDUCTION_INDEX) of found indexes, choosing values
5244 from the data vector (NEW_PHI_RESULT) for matches, 0 (ZERO_VEC)
5245 otherwise. Only one value should match, resulting in a vector
5246 (VEC_COND) with one data value and the rest zeros.
5247 In the case where the loop never made any matches, every index will
5248 match, resulting in a vector with all data values (which will all be
5249 the default value). */
5251 /* Compare the max index vector to the vector of found indexes to find
5252 the position of the max value. */
5253 tree vec_compare = make_ssa_name (index_vec_cmp_type);
5254 gimple *vec_compare_stmt = gimple_build_assign (vec_compare, EQ_EXPR,
5255 induction_index,
5256 max_index_vec);
5257 gsi_insert_before (&exit_gsi, vec_compare_stmt, GSI_SAME_STMT);
5259 /* Use the compare to choose either values from the data vector or
5260 zero. */
5261 tree vec_cond = make_ssa_name (vectype);
5262 gimple *vec_cond_stmt = gimple_build_assign (vec_cond, VEC_COND_EXPR,
5263 vec_compare, new_phi_result,
5264 zero_vec);
5265 gsi_insert_before (&exit_gsi, vec_cond_stmt, GSI_SAME_STMT);
5267 /* Finally we need to extract the data value from the vector (VEC_COND)
5268 into a scalar (MATCHED_DATA_REDUC). Logically we want to do an OR
5269 reduction, but because this doesn't exist, we can use a MAX reduction
5270 instead. The data value might be signed or a float so we need to cast
5271 it first.
5272 In the case where the loop never made any matches, the data values are
5273 all identical, and so will reduce down correctly. */
5275 /* Make the matched data values unsigned. */
5276 tree vec_cond_cast = make_ssa_name (vectype_unsigned);
5277 tree vec_cond_cast_rhs = build1 (VIEW_CONVERT_EXPR, vectype_unsigned,
5278 vec_cond);
5279 gimple *vec_cond_cast_stmt = gimple_build_assign (vec_cond_cast,
5280 VIEW_CONVERT_EXPR,
5281 vec_cond_cast_rhs);
5282 gsi_insert_before (&exit_gsi, vec_cond_cast_stmt, GSI_SAME_STMT);
5284 /* Reduce down to a scalar value. */
5285 tree data_reduc = make_ssa_name (scalar_type_unsigned);
5286 gcall *data_reduc_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
5287 1, vec_cond_cast);
5288 gimple_call_set_lhs (data_reduc_stmt, data_reduc);
5289 gsi_insert_before (&exit_gsi, data_reduc_stmt, GSI_SAME_STMT);
5291 /* Convert the reduced value back to the result type and set as the
5292 result. */
5293 gimple_seq stmts = NULL;
5294 new_temp = gimple_build (&stmts, VIEW_CONVERT_EXPR, scalar_type,
5295 data_reduc);
5296 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5297 scalar_results.safe_push (new_temp);
5299 else if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION
5300 && reduc_fn == IFN_LAST)
5302 /* Condition reduction without supported IFN_REDUC_MAX. Generate
5303 idx = 0;
5304 idx_val = induction_index[0];
5305 val = data_reduc[0];
5306 for (idx = 0, val = init, i = 0; i < nelts; ++i)
5307 if (induction_index[i] > idx_val)
5308 val = data_reduc[i], idx_val = induction_index[i];
5309 return val; */
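/* Worked example (editor's addition, continuing the illustration used for
   the induction index above): for INDUCTION_INDEX == {9, 0, 3, 0} and
   NEW_PHI_RESULT == {a, b, c, d}, the generated sequence keeps the data
   value whose index is largest, so the scalar result is 'a' (index 9).
   With no matches every index is zero and all data values are the default,
   so the default value is returned.  */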
5311 tree data_eltype = TREE_TYPE (TREE_TYPE (new_phi_result));
5312 tree idx_eltype = TREE_TYPE (TREE_TYPE (induction_index));
5313 unsigned HOST_WIDE_INT el_size = tree_to_uhwi (TYPE_SIZE (idx_eltype));
5314 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (TREE_TYPE (induction_index));
5315 /* Enforced by vectorizable_reduction, which ensures we have target
5316 support before allowing a conditional reduction on variable-length
5317 vectors. */
5318 unsigned HOST_WIDE_INT v_size = el_size * nunits.to_constant ();
5319 tree idx_val = NULL_TREE, val = NULL_TREE;
5320 for (unsigned HOST_WIDE_INT off = 0; off < v_size; off += el_size)
5322 tree old_idx_val = idx_val;
5323 tree old_val = val;
5324 idx_val = make_ssa_name (idx_eltype);
5325 epilog_stmt = gimple_build_assign (idx_val, BIT_FIELD_REF,
5326 build3 (BIT_FIELD_REF, idx_eltype,
5327 induction_index,
5328 bitsize_int (el_size),
5329 bitsize_int (off)));
5330 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5331 val = make_ssa_name (data_eltype);
5332 epilog_stmt = gimple_build_assign (val, BIT_FIELD_REF,
5333 build3 (BIT_FIELD_REF,
5334 data_eltype,
5335 new_phi_result,
5336 bitsize_int (el_size),
5337 bitsize_int (off)));
5338 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5339 if (off != 0)
5341 tree new_idx_val = idx_val;
5342 tree new_val = val;
5343 if (off != v_size - el_size)
5345 new_idx_val = make_ssa_name (idx_eltype);
5346 epilog_stmt = gimple_build_assign (new_idx_val,
5347 MAX_EXPR, idx_val,
5348 old_idx_val);
5349 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5351 new_val = make_ssa_name (data_eltype);
5352 epilog_stmt = gimple_build_assign (new_val,
5353 COND_EXPR,
5354 build2 (GT_EXPR,
5355 boolean_type_node,
5356 idx_val,
5357 old_idx_val),
5358 val, old_val);
5359 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5360 idx_val = new_idx_val;
5361 val = new_val;
5364 /* Convert the reduced value back to the result type and set as the
5365 result. */
5366 gimple_seq stmts = NULL;
5367 val = gimple_convert (&stmts, scalar_type, val);
5368 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5369 scalar_results.safe_push (val);
5372 /* 2.3 Create the reduction code, using one of the three schemes described
5373 above. In SLP we simply need to extract all the elements from the
5374 vector (without reducing them), so we use scalar shifts. */
5375 else if (reduc_fn != IFN_LAST && !slp_reduc)
5377 tree tmp;
5378 tree vec_elem_type;
5380 /* Case 1: Create:
5381 v_out2 = reduc_expr <v_out1> */
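/* Sketch of the generated epilogue for this case (illustrative names,
   assuming an addition reduction whose target provides IFN_REDUC_PLUS):

     s_out = .REDUC_PLUS (v_out1);

   i.e. the whole vector accumulator is reduced to the scalar result by a
   single internal-function call instead of a shift/extract sequence.  */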
5383 if (dump_enabled_p ())
5384 dump_printf_loc (MSG_NOTE, vect_location,
5385 "Reduce using direct vector reduction.\n");
5387 vec_elem_type = TREE_TYPE (TREE_TYPE (new_phi_result));
5388 if (!useless_type_conversion_p (scalar_type, vec_elem_type))
5390 tree tmp_dest
5391 = vect_create_destination_var (scalar_dest, vec_elem_type);
5392 epilog_stmt = gimple_build_call_internal (reduc_fn, 1,
5393 new_phi_result);
5394 gimple_set_lhs (epilog_stmt, tmp_dest);
5395 new_temp = make_ssa_name (tmp_dest, epilog_stmt);
5396 gimple_set_lhs (epilog_stmt, new_temp);
5397 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5399 epilog_stmt = gimple_build_assign (new_scalar_dest, NOP_EXPR,
5400 new_temp);
5402 else
5404 epilog_stmt = gimple_build_call_internal (reduc_fn, 1,
5405 new_phi_result);
5406 gimple_set_lhs (epilog_stmt, new_scalar_dest);
5409 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5410 gimple_set_lhs (epilog_stmt, new_temp);
5411 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5413 if ((STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5414 == INTEGER_INDUC_COND_REDUCTION)
5415 && !operand_equal_p (initial_def, induc_val, 0))
5417 /* Earlier we set the initial value to be a vector of induc_val
5418 values. Check the result and if it is induc_val then replace it
5419 with the original initial value, unless induc_val is already the
5420 same as initial_def. */
5421 tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp,
5422 induc_val);
5424 tmp = make_ssa_name (new_scalar_dest);
5425 epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
5426 initial_def, new_temp);
5427 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5428 new_temp = tmp;
5431 scalar_results.safe_push (new_temp);
5433 else if (direct_slp_reduc)
5435 /* Here we create one vector for each of the GROUP_SIZE results,
5436 with the elements for other SLP statements replaced with the
5437 neutral value. We can then do a normal reduction on each vector. */
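/* Illustrative sketch (not from the original source): with GROUP_SIZE == 2
   and a neutral value of 0, the code below effectively computes

     index = { 0, 1, 2, 3, ... } & 1;      /* which result owns each element */
     sel0  = (index == 0);
     vec0  = sel0 ? new_phi_result : { 0, ... };
     scalar_results[0] = reduce (vec0);
     sel1  = (index == 1);
     vec1  = sel1 ? new_phi_result : { 0, ... };
     scalar_results[1] = reduce (vec1);  */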
5439 /* Enforced by vectorizable_reduction. */
5440 gcc_assert (new_phis.length () == 1);
5441 gcc_assert (pow2p_hwi (group_size));
5443 slp_tree orig_phis_slp_node = slp_node_instance->reduc_phis;
5444 vec<gimple *> orig_phis = SLP_TREE_SCALAR_STMTS (orig_phis_slp_node);
5445 gimple_seq seq = NULL;
5447 /* Build a vector {0, 1, 2, ...}, with the same number of elements
5448 and the same element size as VECTYPE. */
5449 tree index = build_index_vector (vectype, 0, 1);
5450 tree index_type = TREE_TYPE (index);
5451 tree index_elt_type = TREE_TYPE (index_type);
5452 tree mask_type = build_same_sized_truth_vector_type (index_type);
5454 /* Create a vector that, for each element, identifies which of
5455 the GROUP_SIZE results should use it. */
5456 tree index_mask = build_int_cst (index_elt_type, group_size - 1);
5457 index = gimple_build (&seq, BIT_AND_EXPR, index_type, index,
5458 build_vector_from_val (index_type, index_mask));
5460 /* Get a neutral vector value. This is simply a splat of the neutral
5461 scalar value if we have one, otherwise the initial scalar value
5462 is itself a neutral value. */
5463 tree vector_identity = NULL_TREE;
5464 if (neutral_op)
5465 vector_identity = gimple_build_vector_from_val (&seq, vectype,
5466 neutral_op);
5467 for (unsigned int i = 0; i < group_size; ++i)
5469 /* If there's no universal neutral value, we can use the
5470 initial scalar value from the original PHI. This is used
5471 for MIN and MAX reduction, for example. */
5472 if (!neutral_op)
5474 tree scalar_value
5475 = PHI_ARG_DEF_FROM_EDGE (orig_phis[i],
5476 loop_preheader_edge (loop));
5477 vector_identity = gimple_build_vector_from_val (&seq, vectype,
5478 scalar_value);
5481 /* Calculate the equivalent of:
5483 sel[j] = (index[j] == i);
5485 which selects the elements of NEW_PHI_RESULT that should
5486 be included in the result. */
5487 tree compare_val = build_int_cst (index_elt_type, i);
5488 compare_val = build_vector_from_val (index_type, compare_val);
5489 tree sel = gimple_build (&seq, EQ_EXPR, mask_type,
5490 index, compare_val);
5492 /* Calculate the equivalent of:
5494 vec = sel ? new_phi_result : vector_identity;
5496 VEC is now suitable for a full vector reduction. */
5497 tree vec = gimple_build (&seq, VEC_COND_EXPR, vectype,
5498 sel, new_phi_result, vector_identity);
5500 /* Do the reduction and convert it to the appropriate type. */
5501 gcall *call = gimple_build_call_internal (reduc_fn, 1, vec);
5502 tree scalar = make_ssa_name (TREE_TYPE (vectype));
5503 gimple_call_set_lhs (call, scalar);
5504 gimple_seq_add_stmt (&seq, call);
5505 scalar = gimple_convert (&seq, scalar_type, scalar);
5506 scalar_results.safe_push (scalar);
5508 gsi_insert_seq_before (&exit_gsi, seq, GSI_SAME_STMT);
5510 else
5512 bool reduce_with_shift;
5513 tree vec_temp;
5515 /* COND reductions all do the final reduction with MAX_EXPR
5516 or MIN_EXPR. */
5517 if (code == COND_EXPR)
5519 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5520 == INTEGER_INDUC_COND_REDUCTION)
5521 code = induc_code;
5522 else
5523 code = MAX_EXPR;
5526 /* See if the target wants to do the final (shift) reduction
5527 in a vector mode of smaller size and first reduce upper/lower
5528 halves against each other. */
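/* Illustrative example (assumed target behaviour, not from this file):
   if the accumulator is a 256-bit vector but the target prefers to
   reduce in 128 bits, the code below first extracts the low and high
   128-bit halves, combines them with CODE, and only then performs the
   shift-based (or scalar) reduction on the narrower vector.  */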
5529 enum machine_mode mode1 = mode;
5530 tree vectype1 = vectype;
5531 unsigned sz = tree_to_uhwi (TYPE_SIZE_UNIT (vectype));
5532 unsigned sz1 = sz;
5533 if (!slp_reduc
5534 && (mode1 = targetm.vectorize.split_reduction (mode)) != mode)
5535 sz1 = GET_MODE_SIZE (mode1).to_constant ();
5537 vectype1 = get_vectype_for_scalar_type_and_size (scalar_type, sz1);
5538 reduce_with_shift = have_whole_vector_shift (mode1);
5539 if (!VECTOR_MODE_P (mode1))
5540 reduce_with_shift = false;
5541 else
5543 optab optab = optab_for_tree_code (code, vectype1, optab_default);
5544 if (optab_handler (optab, mode1) == CODE_FOR_nothing)
5545 reduce_with_shift = false;
5548 /* First reduce the vector to the vector size we should do the shift
5549 reduction on, by combining the upper and lower halves. */
5550 new_temp = new_phi_result;
5551 while (sz > sz1)
5553 gcc_assert (!slp_reduc);
5554 sz /= 2;
5555 vectype1 = get_vectype_for_scalar_type_and_size (scalar_type, sz);
5557 /* The target has to make sure we support lowpart/highpart
5558 extraction, either via direct vector extract or through
5559 punning to an appropriately sized integer mode vector. */
5560 tree dst1, dst2;
5561 if (convert_optab_handler (vec_extract_optab,
5562 TYPE_MODE (TREE_TYPE (new_temp)),
5563 TYPE_MODE (vectype1))
5564 != CODE_FOR_nothing)
5566 /* Extract sub-vectors directly once vec_extract becomes
5567 a conversion optab. */
5568 dst1 = make_ssa_name (vectype1);
5569 epilog_stmt
5570 = gimple_build_assign (dst1, BIT_FIELD_REF,
5571 build3 (BIT_FIELD_REF, vectype1,
5572 new_temp, TYPE_SIZE (vectype1),
5573 bitsize_int (0)));
5574 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5575 dst2 = make_ssa_name (vectype1);
5576 epilog_stmt
5577 = gimple_build_assign (dst2, BIT_FIELD_REF,
5578 build3 (BIT_FIELD_REF, vectype1,
5579 new_temp, TYPE_SIZE (vectype1),
5580 bitsize_int (sz * BITS_PER_UNIT)));
5581 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5583 else
5585 /* Extract via punning to appropriately sized integer mode
5586 vector. */
5587 tree eltype = build_nonstandard_integer_type (sz * BITS_PER_UNIT,
5588 1);
5589 tree etype = build_vector_type (eltype, 2);
5590 gcc_assert (convert_optab_handler (vec_extract_optab,
5591 TYPE_MODE (etype),
5592 TYPE_MODE (eltype))
5593 != CODE_FOR_nothing);
5594 tree tem = make_ssa_name (etype);
5595 epilog_stmt = gimple_build_assign (tem, VIEW_CONVERT_EXPR,
5596 build1 (VIEW_CONVERT_EXPR,
5597 etype, new_temp));
5598 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5599 new_temp = tem;
5600 tem = make_ssa_name (eltype);
5601 epilog_stmt
5602 = gimple_build_assign (tem, BIT_FIELD_REF,
5603 build3 (BIT_FIELD_REF, eltype,
5604 new_temp, TYPE_SIZE (eltype),
5605 bitsize_int (0)));
5606 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5607 dst1 = make_ssa_name (vectype1);
5608 epilog_stmt = gimple_build_assign (dst1, VIEW_CONVERT_EXPR,
5609 build1 (VIEW_CONVERT_EXPR,
5610 vectype1, tem));
5611 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5612 tem = make_ssa_name (eltype);
5613 epilog_stmt
5614 = gimple_build_assign (tem, BIT_FIELD_REF,
5615 build3 (BIT_FIELD_REF, eltype,
5616 new_temp, TYPE_SIZE (eltype),
5617 bitsize_int (sz * BITS_PER_UNIT)));
5618 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5619 dst2 = make_ssa_name (vectype1);
5620 epilog_stmt = gimple_build_assign (dst2, VIEW_CONVERT_EXPR,
5621 build1 (VIEW_CONVERT_EXPR,
5622 vectype1, tem));
5623 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5626 new_temp = make_ssa_name (vectype1);
5627 epilog_stmt = gimple_build_assign (new_temp, code, dst1, dst2);
5628 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5631 if (reduce_with_shift && !slp_reduc)
5633 int element_bitsize = tree_to_uhwi (bitsize);
5634 /* Enforced by vectorizable_reduction, which disallows SLP reductions
5635 for variable-length vectors and also requires direct target support
5636 for loop reductions. */
5637 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
5638 int nelements = vec_size_in_bits / element_bitsize;
5639 vec_perm_builder sel;
5640 vec_perm_indices indices;
5642 int elt_offset;
5644 tree zero_vec = build_zero_cst (vectype1);
5645 /* Case 2: Create:
5646 for (offset = nelements/2; offset >= 1; offset/=2)
5648 Create: va' = vec_shift <va, offset>
5649 Create: va = vop <va, va'>
5650 } */
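/* Worked example (a sketch): for a four-element vector accumulator va
   and a plus reduction, this loop generates

     va' = vec_shift <va, 2>;  va = va + va';
     va' = vec_shift <va, 1>;  va = va + va';

   after which element 0 of va holds the sum of all four elements and is
   extracted below as the scalar result.  */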
5652 tree rhs;
5654 if (dump_enabled_p ())
5655 dump_printf_loc (MSG_NOTE, vect_location,
5656 "Reduce using vector shifts\n");
5658 mode1 = TYPE_MODE (vectype1);
5659 vec_dest = vect_create_destination_var (scalar_dest, vectype1);
5660 for (elt_offset = nelements / 2;
5661 elt_offset >= 1;
5662 elt_offset /= 2)
5664 calc_vec_perm_mask_for_shift (elt_offset, nelements, &sel);
5665 indices.new_vector (sel, 2, nelements);
5666 tree mask = vect_gen_perm_mask_any (vectype1, indices);
5667 epilog_stmt = gimple_build_assign (vec_dest, VEC_PERM_EXPR,
5668 new_temp, zero_vec, mask);
5669 new_name = make_ssa_name (vec_dest, epilog_stmt);
5670 gimple_assign_set_lhs (epilog_stmt, new_name);
5671 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5673 epilog_stmt = gimple_build_assign (vec_dest, code, new_name,
5674 new_temp);
5675 new_temp = make_ssa_name (vec_dest, epilog_stmt);
5676 gimple_assign_set_lhs (epilog_stmt, new_temp);
5677 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5680 /* 2.4 Extract the final scalar result. Create:
5681 s_out3 = extract_field <v_out2, bitpos> */
5683 if (dump_enabled_p ())
5684 dump_printf_loc (MSG_NOTE, vect_location,
5685 "extract scalar result\n");
5687 rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp,
5688 bitsize, bitsize_zero_node);
5689 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
5690 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5691 gimple_assign_set_lhs (epilog_stmt, new_temp);
5692 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5693 scalar_results.safe_push (new_temp);
5695 else
5697 /* Case 3: Create:
5698 s = extract_field <v_out2, 0>
5699 for (offset = element_size;
5700 offset < vector_size;
5701 offset += element_size;)
5703 Create: s' = extract_field <v_out2, offset>
5704 Create: s = op <s, s'> // For non SLP cases
5705 } */
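/* Worked example (a sketch): for a four-element vector v_out2 and a plus
   reduction this expands to

     s  = extract_field <v_out2, 0>;
     s' = extract_field <v_out2, element_size>;      s = s + s';
     s' = extract_field <v_out2, 2 * element_size>;  s = s + s';
     s' = extract_field <v_out2, 3 * element_size>;  s = s + s';

   whereas for SLP the extracted s' values are simply collected in
   SCALAR_RESULTS without being combined.  */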
5707 if (dump_enabled_p ())
5708 dump_printf_loc (MSG_NOTE, vect_location,
5709 "Reduce using scalar code.\n");
5711 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
5712 int element_bitsize = tree_to_uhwi (bitsize);
5713 FOR_EACH_VEC_ELT (new_phis, i, new_phi)
5715 int bit_offset;
5716 if (gimple_code (new_phi) == GIMPLE_PHI)
5717 vec_temp = PHI_RESULT (new_phi);
5718 else
5719 vec_temp = gimple_assign_lhs (new_phi);
5720 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, bitsize,
5721 bitsize_zero_node);
5722 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
5723 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5724 gimple_assign_set_lhs (epilog_stmt, new_temp);
5725 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5727 /* In SLP we don't need to apply reduction operation, so we just
5728 collect s' values in SCALAR_RESULTS. */
5729 if (slp_reduc)
5730 scalar_results.safe_push (new_temp);
5732 for (bit_offset = element_bitsize;
5733 bit_offset < vec_size_in_bits;
5734 bit_offset += element_bitsize)
5736 tree bitpos = bitsize_int (bit_offset);
5737 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp,
5738 bitsize, bitpos);
5740 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
5741 new_name = make_ssa_name (new_scalar_dest, epilog_stmt);
5742 gimple_assign_set_lhs (epilog_stmt, new_name);
5743 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5745 if (slp_reduc)
5747 /* In SLP we don't need to apply reduction operation, so
5748 we just collect s' values in SCALAR_RESULTS. */
5749 new_temp = new_name;
5750 scalar_results.safe_push (new_name);
5752 else
5754 epilog_stmt = gimple_build_assign (new_scalar_dest, code,
5755 new_name, new_temp);
5756 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5757 gimple_assign_set_lhs (epilog_stmt, new_temp);
5758 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5763 /* The only case where we need to reduce scalar results in SLP is
5764 unrolling. If the size of SCALAR_RESULTS is greater than
5765 GROUP_SIZE, we reduce them by combining elements modulo
5766 GROUP_SIZE. */
5767 if (slp_reduc)
5769 tree res, first_res, new_res;
5770 gimple *new_stmt;
5772 /* Reduce multiple scalar results in case of SLP unrolling. */
5773 for (j = group_size; scalar_results.iterate (j, &res);
5774 j++)
5776 first_res = scalar_results[j % group_size];
5777 new_stmt = gimple_build_assign (new_scalar_dest, code,
5778 first_res, res);
5779 new_res = make_ssa_name (new_scalar_dest, new_stmt);
5780 gimple_assign_set_lhs (new_stmt, new_res);
5781 gsi_insert_before (&exit_gsi, new_stmt, GSI_SAME_STMT);
5782 scalar_results[j % group_size] = new_res;
5785 else
5786 /* Not SLP - we have one scalar to keep in SCALAR_RESULTS. */
5787 scalar_results.safe_push (new_temp);
5790 if ((STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5791 == INTEGER_INDUC_COND_REDUCTION)
5792 && !operand_equal_p (initial_def, induc_val, 0))
5794 /* Earlier we set the initial value to be a vector of induc_val
5795 values. Check the result and if it is induc_val then replace it
5796 with the original initial value, unless induc_val is already the
5797 same as initial_def. */
5798 tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp,
5799 induc_val);
5801 tree tmp = make_ssa_name (new_scalar_dest);
5802 epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
5803 initial_def, new_temp);
5804 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5805 scalar_results[0] = tmp;
5809 vect_finalize_reduction:
5811 if (double_reduc)
5812 loop = loop->inner;
5814 /* 2.5 Adjust the final result by the initial value of the reduction
5815 variable. (When such adjustment is not needed, then
5816 'adjustment_def' is zero). For example, if code is PLUS we create:
5817 new_temp = loop_exit_def + adjustment_def */
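/* Illustrative example (a sketch, not from the original source): for

     s = 10;
     for (i = 0; i < n; i++)
       s += a[i];

   the vector accumulator may start as {0, ..., 0} with ADJUSTMENT_DEF
   holding 10, in which case the code below emits the equivalent of
   new_temp = loop_exit_def + 10 after the vector reduction.  */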
5819 if (adjustment_def)
5821 gcc_assert (!slp_reduc);
5822 if (nested_in_vect_loop)
5824 new_phi = new_phis[0];
5825 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) == VECTOR_TYPE);
5826 expr = build2 (code, vectype, PHI_RESULT (new_phi), adjustment_def);
5827 new_dest = vect_create_destination_var (scalar_dest, vectype);
5829 else
5831 new_temp = scalar_results[0];
5832 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
5833 expr = build2 (code, scalar_type, new_temp, adjustment_def);
5834 new_dest = vect_create_destination_var (scalar_dest, scalar_type);
5837 epilog_stmt = gimple_build_assign (new_dest, expr);
5838 new_temp = make_ssa_name (new_dest, epilog_stmt);
5839 gimple_assign_set_lhs (epilog_stmt, new_temp);
5840 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5841 if (nested_in_vect_loop)
5843 set_vinfo_for_stmt (epilog_stmt,
5844 new_stmt_vec_info (epilog_stmt, loop_vinfo));
5845 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (epilog_stmt)) =
5846 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (new_phi));
5848 if (!double_reduc)
5849 scalar_results.quick_push (new_temp);
5850 else
5851 scalar_results[0] = new_temp;
5853 else
5854 scalar_results[0] = new_temp;
5856 new_phis[0] = epilog_stmt;
5859 /* 2.6 Handle the loop-exit phis. Replace the uses of scalar loop-exit
5860 phis with new adjusted scalar results, i.e., replace use <s_out0>
5861 with use <s_out4>.
5863 Transform:
5864 loop_exit:
5865 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
5866 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
5867 v_out2 = reduce <v_out1>
5868 s_out3 = extract_field <v_out2, 0>
5869 s_out4 = adjust_result <s_out3>
5870 use <s_out0>
5871 use <s_out0>
5873 into:
5875 loop_exit:
5876 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
5877 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
5878 v_out2 = reduce <v_out1>
5879 s_out3 = extract_field <v_out2, 0>
5880 s_out4 = adjust_result <s_out3>
5881 use <s_out4>
5882 use <s_out4> */
5885 /* In an SLP reduction chain we reduce the vector results into one vector
5886 if necessary, hence we set GROUP_SIZE to 1 here. SCALAR_DEST is the LHS of
5887 the last stmt in the reduction chain, since we are looking for the loop
5888 exit phi node. */
5889 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
5891 gimple *dest_stmt = SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1];
5892 /* Handle reduction patterns. */
5893 if (STMT_VINFO_RELATED_STMT (vinfo_for_stmt (dest_stmt)))
5894 dest_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (dest_stmt));
5896 scalar_dest = gimple_assign_lhs (dest_stmt);
5897 group_size = 1;
5900 /* In SLP we may have several statements in NEW_PHIS and REDUCTION_PHIS (in
5901 case GROUP_SIZE is greater than the vectorization factor). Therefore, we
5902 need to match SCALAR_RESULTS with corresponding statements. The first
5903 (GROUP_SIZE / number of new vector stmts) scalar results correspond to
5904 the first vector stmt, etc.
5905 (RATIO is equal to (GROUP_SIZE / number of new vector stmts)). */
5906 if (group_size > new_phis.length ())
5908 ratio = group_size / new_phis.length ();
5909 gcc_assert (!(group_size % new_phis.length ()));
5911 else
5912 ratio = 1;
5914 for (k = 0; k < group_size; k++)
5916 if (k % ratio == 0)
5918 epilog_stmt = new_phis[k / ratio];
5919 reduction_phi = reduction_phis[k / ratio];
5920 if (double_reduc)
5921 inner_phi = inner_phis[k / ratio];
5924 if (slp_reduc)
5926 gimple *current_stmt = SLP_TREE_SCALAR_STMTS (slp_node)[k];
5928 orig_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (current_stmt));
5929 /* SLP statements can't participate in patterns. */
5930 gcc_assert (!orig_stmt);
5931 scalar_dest = gimple_assign_lhs (current_stmt);
5934 phis.create (3);
5935 /* Find the loop-closed-use at the loop exit of the original scalar
5936 result. (The reduction result is expected to have two immediate uses -
5937 one at the latch block, and one at the loop exit). */
5938 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
5939 if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p)))
5940 && !is_gimple_debug (USE_STMT (use_p)))
5941 phis.safe_push (USE_STMT (use_p));
5943 /* While we expect to have found an exit_phi because of loop-closed-ssa
5944 form we can end up without one if the scalar cycle is dead. */
5946 FOR_EACH_VEC_ELT (phis, i, exit_phi)
5948 if (outer_loop)
5950 stmt_vec_info exit_phi_vinfo = vinfo_for_stmt (exit_phi);
5951 gphi *vect_phi;
5953 /* FORNOW. We do not currently support the case in which an inner-loop
5954 reduction is not used in the outer-loop (but only outside the
5955 outer-loop), unless it is a double reduction. */
5956 gcc_assert ((STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
5957 && !STMT_VINFO_LIVE_P (exit_phi_vinfo))
5958 || double_reduc);
5960 if (double_reduc)
5961 STMT_VINFO_VEC_STMT (exit_phi_vinfo) = inner_phi;
5962 else
5963 STMT_VINFO_VEC_STMT (exit_phi_vinfo) = epilog_stmt;
5964 if (!double_reduc
5965 || STMT_VINFO_DEF_TYPE (exit_phi_vinfo)
5966 != vect_double_reduction_def)
5967 continue;
5969 /* Handle double reduction:
5971 stmt1: s1 = phi <s0, s2> - double reduction phi (outer loop)
5972 stmt2: s3 = phi <s1, s4> - (regular) reduc phi (inner loop)
5973 stmt3: s4 = use (s3) - (regular) reduc stmt (inner loop)
5974 stmt4: s2 = phi <s4> - double reduction stmt (outer loop)
5976 At that point the regular reduction (stmt2 and stmt3) is
5977 already vectorized, as well as the exit phi node, stmt4.
5978 Here we vectorize the phi node of double reduction, stmt1, and
5979 update all relevant statements. */
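/* Illustrative source-level example of a double reduction (a sketch,
   not part of the original comment):

     int s = 0;
     for (i = 0; i < n; i++)        /* outer loop: stmt1/stmt4 above */
       for (j = 0; j < m; j++)      /* inner loop: stmt2/stmt3 above */
         s += a[i][j];

   Here s is reduced across both loops, so the outer-loop phi of s is the
   double reduction phi vectorized at this point.  */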
5981 /* Go through all the uses of s2 to find double reduction phi
5982 node, i.e., stmt1 above. */
5983 orig_name = PHI_RESULT (exit_phi);
5984 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
5986 stmt_vec_info use_stmt_vinfo;
5987 stmt_vec_info new_phi_vinfo;
5988 tree vect_phi_init, preheader_arg, vect_phi_res;
5989 basic_block bb = gimple_bb (use_stmt);
5990 gimple *use;
5992 /* Check that USE_STMT is really a double reduction phi
5993 node. */
5994 if (gimple_code (use_stmt) != GIMPLE_PHI
5995 || gimple_phi_num_args (use_stmt) != 2
5996 || bb->loop_father != outer_loop)
5997 continue;
5998 use_stmt_vinfo = vinfo_for_stmt (use_stmt);
5999 if (!use_stmt_vinfo
6000 || STMT_VINFO_DEF_TYPE (use_stmt_vinfo)
6001 != vect_double_reduction_def)
6002 continue;
6004 /* Create vector phi node for double reduction:
6005 vs1 = phi <vs0, vs2>
6006 vs1 was created previously in this function by a call to
6007 vect_get_vec_def_for_operand and is stored in
6008 vec_initial_def;
6009 vs2 is defined by INNER_PHI, the vectorized EXIT_PHI;
6010 vs0 is created here. */
6012 /* Create vector phi node. */
6013 vect_phi = create_phi_node (vec_initial_def, bb);
6014 new_phi_vinfo = new_stmt_vec_info (vect_phi,
6015 loop_vec_info_for_loop (outer_loop));
6016 set_vinfo_for_stmt (vect_phi, new_phi_vinfo);
6018 /* Create vs0 - initial def of the double reduction phi. */
6019 preheader_arg = PHI_ARG_DEF_FROM_EDGE (use_stmt,
6020 loop_preheader_edge (outer_loop));
6021 vect_phi_init = get_initial_def_for_reduction
6022 (stmt, preheader_arg, NULL);
6024 /* Update phi node arguments with vs0 and vs2. */
6025 add_phi_arg (vect_phi, vect_phi_init,
6026 loop_preheader_edge (outer_loop),
6027 UNKNOWN_LOCATION);
6028 add_phi_arg (vect_phi, PHI_RESULT (inner_phi),
6029 loop_latch_edge (outer_loop), UNKNOWN_LOCATION);
6030 if (dump_enabled_p ())
6032 dump_printf_loc (MSG_NOTE, vect_location,
6033 "created double reduction phi node: ");
6034 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, vect_phi, 0);
6037 vect_phi_res = PHI_RESULT (vect_phi);
6039 /* Replace the use, i.e., set the correct vs1 in the regular
6040 reduction phi node. FORNOW, NCOPIES is always 1, so the
6041 loop is redundant. */
6042 use = reduction_phi;
6043 for (j = 0; j < ncopies; j++)
6045 edge pr_edge = loop_preheader_edge (loop);
6046 SET_PHI_ARG_DEF (use, pr_edge->dest_idx, vect_phi_res);
6047 use = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (use));
6053 phis.release ();
6054 if (nested_in_vect_loop)
6056 if (double_reduc)
6057 loop = outer_loop;
6058 else
6059 continue;
6062 phis.create (3);
6063 /* Find the loop-closed-use at the loop exit of the original scalar
6064 result. (The reduction result is expected to have two immediate uses,
6065 one at the latch block, and one at the loop exit). For double
6066 reductions we are looking for exit phis of the outer loop. */
6067 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
6069 if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
6071 if (!is_gimple_debug (USE_STMT (use_p)))
6072 phis.safe_push (USE_STMT (use_p));
6074 else
6076 if (double_reduc && gimple_code (USE_STMT (use_p)) == GIMPLE_PHI)
6078 tree phi_res = PHI_RESULT (USE_STMT (use_p));
6080 FOR_EACH_IMM_USE_FAST (phi_use_p, phi_imm_iter, phi_res)
6082 if (!flow_bb_inside_loop_p (loop,
6083 gimple_bb (USE_STMT (phi_use_p)))
6084 && !is_gimple_debug (USE_STMT (phi_use_p)))
6085 phis.safe_push (USE_STMT (phi_use_p));
6091 FOR_EACH_VEC_ELT (phis, i, exit_phi)
6093 /* Replace the uses: */
6094 orig_name = PHI_RESULT (exit_phi);
6095 scalar_result = scalar_results[k];
6096 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
6097 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
6098 SET_USE (use_p, scalar_result);
6101 phis.release ();
6105 /* Return a vector of type VECTYPE that is equal to the vector select
6106 operation "MASK ? VEC : IDENTITY". Insert the select statements
6107 before GSI. */
6109 static tree
6110 merge_with_identity (gimple_stmt_iterator *gsi, tree mask, tree vectype,
6111 tree vec, tree identity)
6113 tree cond = make_temp_ssa_name (vectype, NULL, "cond");
6114 gimple *new_stmt = gimple_build_assign (cond, VEC_COND_EXPR,
6115 mask, vec, identity);
6116 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
6117 return cond;
6120 /* Successively apply CODE to each element of VECTOR_RHS, in left-to-right
6121 order, starting with LHS. Insert the extraction statements before GSI and
6122 associate the new scalar SSA names with variable SCALAR_DEST.
6123 Return the SSA name for the result. */
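/* Illustrative expansion (a sketch, not part of the original comment):
   for a four-element VECTOR_RHS this produces

     s0 = BIT_FIELD_REF <vector_rhs, element 0>;  lhs = code (lhs, s0);
     s1 = BIT_FIELD_REF <vector_rhs, element 1>;  lhs = code (lhs, s1);
     s2 = BIT_FIELD_REF <vector_rhs, element 2>;  lhs = code (lhs, s2);
     s3 = BIT_FIELD_REF <vector_rhs, element 3>;  lhs = code (lhs, s3);

   and returns the final LHS, preserving the left-to-right evaluation
   order of the scalar loop.  */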
6125 static tree
6126 vect_expand_fold_left (gimple_stmt_iterator *gsi, tree scalar_dest,
6127 tree_code code, tree lhs, tree vector_rhs)
6129 tree vectype = TREE_TYPE (vector_rhs);
6130 tree scalar_type = TREE_TYPE (vectype);
6131 tree bitsize = TYPE_SIZE (scalar_type);
6132 unsigned HOST_WIDE_INT vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
6133 unsigned HOST_WIDE_INT element_bitsize = tree_to_uhwi (bitsize);
6135 for (unsigned HOST_WIDE_INT bit_offset = 0;
6136 bit_offset < vec_size_in_bits;
6137 bit_offset += element_bitsize)
6139 tree bitpos = bitsize_int (bit_offset);
6140 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vector_rhs,
6141 bitsize, bitpos);
6143 gassign *stmt = gimple_build_assign (scalar_dest, rhs);
6144 rhs = make_ssa_name (scalar_dest, stmt);
6145 gimple_assign_set_lhs (stmt, rhs);
6146 gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
6148 stmt = gimple_build_assign (scalar_dest, code, lhs, rhs);
6149 tree new_name = make_ssa_name (scalar_dest, stmt);
6150 gimple_assign_set_lhs (stmt, new_name);
6151 gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
6152 lhs = new_name;
6154 return lhs;
6157 /* Perform an in-order reduction (FOLD_LEFT_REDUCTION). STMT is the
6158 statement that sets the live-out value. REDUC_DEF_STMT is the phi
6159 statement. CODE is the operation performed by STMT and OPS are
6160 its scalar operands. REDUC_INDEX is the index of the operand in
6161 OPS that is set by REDUC_DEF_STMT. REDUC_FN is the function that
6162 implements in-order reduction, or IFN_LAST if we should open-code it.
6163 VECTYPE_IN is the type of the vector input. MASKS specifies the masks
6164 that should be used to control the operation in a fully-masked loop. */
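/* Illustrative example (a sketch, not from the original source): an
   in-order reduction is what gets used for code such as

     double s = 0.0;
     for (int i = 0; i < n; i++)
       s += a[i];

   when reassociation of the additions is not allowed (e.g. without
   -ffast-math), so the elements must be folded into the accumulator in
   the original left-to-right order.  */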
6166 static bool
6167 vectorize_fold_left_reduction (gimple *stmt, gimple_stmt_iterator *gsi,
6168 gimple **vec_stmt, slp_tree slp_node,
6169 gimple *reduc_def_stmt,
6170 tree_code code, internal_fn reduc_fn,
6171 tree ops[3], tree vectype_in,
6172 int reduc_index, vec_loop_masks *masks)
6174 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
6175 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
6176 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6177 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
6178 gimple *new_stmt = NULL;
6180 int ncopies;
6181 if (slp_node)
6182 ncopies = 1;
6183 else
6184 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
6186 gcc_assert (!nested_in_vect_loop_p (loop, stmt));
6187 gcc_assert (ncopies == 1);
6188 gcc_assert (TREE_CODE_LENGTH (code) == binary_op);
6189 gcc_assert (reduc_index == (code == MINUS_EXPR ? 0 : 1));
6190 gcc_assert (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
6191 == FOLD_LEFT_REDUCTION);
6193 if (slp_node)
6194 gcc_assert (known_eq (TYPE_VECTOR_SUBPARTS (vectype_out),
6195 TYPE_VECTOR_SUBPARTS (vectype_in)));
6197 tree op0 = ops[1 - reduc_index];
6199 int group_size = 1;
6200 gimple *scalar_dest_def;
6201 auto_vec<tree> vec_oprnds0;
6202 if (slp_node)
6204 vect_get_vec_defs (op0, NULL_TREE, stmt, &vec_oprnds0, NULL, slp_node);
6205 group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
6206 scalar_dest_def = SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1];
6208 else
6210 tree loop_vec_def0 = vect_get_vec_def_for_operand (op0, stmt);
6211 vec_oprnds0.create (1);
6212 vec_oprnds0.quick_push (loop_vec_def0);
6213 scalar_dest_def = stmt;
6216 tree scalar_dest = gimple_assign_lhs (scalar_dest_def);
6217 tree scalar_type = TREE_TYPE (scalar_dest);
6218 tree reduc_var = gimple_phi_result (reduc_def_stmt);
6220 int vec_num = vec_oprnds0.length ();
6221 gcc_assert (vec_num == 1 || slp_node);
6222 tree vec_elem_type = TREE_TYPE (vectype_out);
6223 gcc_checking_assert (useless_type_conversion_p (scalar_type, vec_elem_type));
6225 tree vector_identity = NULL_TREE;
6226 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
6227 vector_identity = build_zero_cst (vectype_out);
6229 tree scalar_dest_var = vect_create_destination_var (scalar_dest, NULL);
6230 int i;
6231 tree def0;
6232 FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
6234 tree mask = NULL_TREE;
6235 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
6236 mask = vect_get_loop_mask (gsi, masks, vec_num, vectype_in, i);
6238 /* Handle MINUS by adding the negative. */
6239 if (reduc_fn != IFN_LAST && code == MINUS_EXPR)
6241 tree negated = make_ssa_name (vectype_out);
6242 new_stmt = gimple_build_assign (negated, NEGATE_EXPR, def0);
6243 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
6244 def0 = negated;
6247 if (mask)
6248 def0 = merge_with_identity (gsi, mask, vectype_out, def0,
6249 vector_identity);
6251 /* On the first iteration the input is simply the scalar phi
6252 result, and for subsequent iterations it is the output of
6253 the preceding operation. */
6254 if (reduc_fn != IFN_LAST)
6256 new_stmt = gimple_build_call_internal (reduc_fn, 2, reduc_var, def0);
6257 /* For chained SLP reductions the output of the previous reduction
6258 operation serves as the input of the next. For the final statement
6259 the output cannot be a temporary - we reuse the original
6260 scalar destination of the last statement. */
6261 if (i != vec_num - 1)
6263 gimple_set_lhs (new_stmt, scalar_dest_var);
6264 reduc_var = make_ssa_name (scalar_dest_var, new_stmt);
6265 gimple_set_lhs (new_stmt, reduc_var);
6268 else
6270 reduc_var = vect_expand_fold_left (gsi, scalar_dest_var, code,
6271 reduc_var, def0);
6272 new_stmt = SSA_NAME_DEF_STMT (reduc_var);
6273 /* Remove the statement, so that we can use the same code paths
6274 as for statements that we've just created. */
6275 gimple_stmt_iterator tmp_gsi = gsi_for_stmt (new_stmt);
6276 gsi_remove (&tmp_gsi, false);
6279 if (i == vec_num - 1)
6281 gimple_set_lhs (new_stmt, scalar_dest);
6282 vect_finish_replace_stmt (scalar_dest_def, new_stmt);
6284 else
6285 vect_finish_stmt_generation (scalar_dest_def, new_stmt, gsi);
6287 if (slp_node)
6288 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
6291 if (!slp_node)
6292 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
6294 return true;
6297 /* Function is_nonwrapping_integer_induction.
6299 Check that STMT (which is part of loop LOOP) is an incrementing
6300 induction that does not cause overflow. */
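/* Illustrative restatement of the check below (a sketch, not part of the
   original comment):

     max_loop_value = base + step * max_stmt_executions (loop)

   and the induction is considered non-wrapping iff

     min_precision (max_loop_value, sign) <= TYPE_PRECISION (lhs_type)

   (or trivially, if overflow is undefined for the type).  */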
6302 static bool
6303 is_nonwrapping_integer_induction (gimple *stmt, struct loop *loop)
6305 stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
6306 tree base = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo);
6307 tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo);
6308 tree lhs_type = TREE_TYPE (gimple_phi_result (stmt));
6309 widest_int ni, max_loop_value, lhs_max;
6310 bool overflow = false;
6312 /* Make sure the loop is integer based. */
6313 if (TREE_CODE (base) != INTEGER_CST
6314 || TREE_CODE (step) != INTEGER_CST)
6315 return false;
6317 /* Check that the max size of the loop will not wrap. */
6319 if (TYPE_OVERFLOW_UNDEFINED (lhs_type))
6320 return true;
6322 if (! max_stmt_executions (loop, &ni))
6323 return false;
6325 max_loop_value = wi::mul (wi::to_widest (step), ni, TYPE_SIGN (lhs_type),
6326 &overflow);
6327 if (overflow)
6328 return false;
6330 max_loop_value = wi::add (wi::to_widest (base), max_loop_value,
6331 TYPE_SIGN (lhs_type), &overflow);
6332 if (overflow)
6333 return false;
6335 return (wi::min_precision (max_loop_value, TYPE_SIGN (lhs_type))
6336 <= TYPE_PRECISION (lhs_type));
6339 /* Function vectorizable_reduction.
6341 Check if STMT performs a reduction operation that can be vectorized.
6342 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
6343 stmt to replace it, put it in VEC_STMT, and insert it at GSI.
6344 Return FALSE if not a vectorizable STMT, TRUE otherwise.
6346 This function also handles reduction idioms (patterns) that have been
6347 recognized in advance during vect_pattern_recog. In this case, STMT may be
6348 of this form:
6349 X = pattern_expr (arg0, arg1, ..., X)
6350 and its STMT_VINFO_RELATED_STMT points to the last stmt in the original
6351 sequence that had been detected and replaced by the pattern-stmt (STMT).
6353 This function also handles reduction of condition expressions, for example:
6354 for (int i = 0; i < N; i++)
6355 if (a[i] < value)
6356 last = a[i];
6357 This is handled by vectorising the loop and creating an additional vector
6358 containing the loop indexes for which "a[i] < value" was true. In the
6359 function epilogue this is reduced to a single max value and then used to
6360 index into the vector of results.
6362 In some cases of reduction patterns, the type of the reduction variable X is
6363 different than the type of the other arguments of STMT.
6364 In such cases, the vectype that is used when transforming STMT into a vector
6365 stmt is different than the vectype that is used to determine the
6366 vectorization factor, because it consists of a different number of elements
6367 than the actual number of elements that are being operated upon in parallel.
6369 For example, consider an accumulation of shorts into an int accumulator.
6370 On some targets it's possible to vectorize this pattern operating on 8
6371 shorts at a time (hence, the vectype for purposes of determining the
6372 vectorization factor should be V8HI); on the other hand, the vectype that
6373 is used to create the vector form is actually V4SI (the type of the result).
6375 Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
6376 indicates what is the actual level of parallelism (V8HI in the example), so
6377 that the right vectorization factor would be derived. This vectype
6378 corresponds to the type of arguments to the reduction stmt, and should *NOT*
6379 be used to create the vectorized stmt. The right vectype for the vectorized
6380 stmt is obtained from the type of the result X:
6381 get_vectype_for_scalar_type (TREE_TYPE (X))
6383 This means that, contrary to "regular" reductions (or "regular" stmts in
6384 general), the following equation:
6385 STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (TREE_TYPE (X))
6386 does *NOT* necessarily hold for reduction patterns. */
6388 bool
6389 vectorizable_reduction (gimple *stmt, gimple_stmt_iterator *gsi,
6390 gimple **vec_stmt, slp_tree slp_node,
6391 slp_instance slp_node_instance)
6393 tree vec_dest;
6394 tree scalar_dest;
6395 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
6396 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
6397 tree vectype_in = NULL_TREE;
6398 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
6399 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6400 enum tree_code code, orig_code;
6401 internal_fn reduc_fn;
6402 machine_mode vec_mode;
6403 int op_type;
6404 optab optab;
6405 tree new_temp = NULL_TREE;
6406 gimple *def_stmt;
6407 enum vect_def_type dt, cond_reduc_dt = vect_unknown_def_type;
6408 gimple *cond_reduc_def_stmt = NULL;
6409 enum tree_code cond_reduc_op_code = ERROR_MARK;
6410 tree scalar_type;
6411 bool is_simple_use;
6412 gimple *orig_stmt;
6413 stmt_vec_info orig_stmt_info = NULL;
6414 int i;
6415 int ncopies;
6416 int epilog_copies;
6417 stmt_vec_info prev_stmt_info, prev_phi_info;
6418 bool single_defuse_cycle = false;
6419 gimple *new_stmt = NULL;
6420 int j;
6421 tree ops[3];
6422 enum vect_def_type dts[3];
6423 bool nested_cycle = false, found_nested_cycle_def = false;
6424 bool double_reduc = false;
6425 basic_block def_bb;
6426 struct loop * def_stmt_loop, *outer_loop = NULL;
6427 tree def_arg;
6428 gimple *def_arg_stmt;
6429 auto_vec<tree> vec_oprnds0;
6430 auto_vec<tree> vec_oprnds1;
6431 auto_vec<tree> vec_oprnds2;
6432 auto_vec<tree> vect_defs;
6433 auto_vec<gimple *> phis;
6434 int vec_num;
6435 tree def0, tem;
6436 bool first_p = true;
6437 tree cr_index_scalar_type = NULL_TREE, cr_index_vector_type = NULL_TREE;
6438 tree cond_reduc_val = NULL_TREE;
6440 /* Make sure it was already recognized as a reduction computation. */
6441 if (STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmt)) != vect_reduction_def
6442 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmt)) != vect_nested_cycle)
6443 return false;
6445 if (nested_in_vect_loop_p (loop, stmt))
6447 outer_loop = loop;
6448 loop = loop->inner;
6449 nested_cycle = true;
6452 /* In case of reduction chain we switch to the first stmt in the chain, but
6453 we don't update STMT_INFO, since only the last stmt is marked as reduction
6454 and has reduction properties. */
6455 if (GROUP_FIRST_ELEMENT (stmt_info)
6456 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
6458 stmt = GROUP_FIRST_ELEMENT (stmt_info);
6459 first_p = false;
6462 if (gimple_code (stmt) == GIMPLE_PHI)
6464 /* Analysis is fully done on the reduction stmt invocation. */
6465 if (! vec_stmt)
6467 if (slp_node)
6468 slp_node_instance->reduc_phis = slp_node;
6470 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
6471 return true;
6474 if (STMT_VINFO_REDUC_TYPE (stmt_info) == FOLD_LEFT_REDUCTION)
6475 /* Leave the scalar phi in place. Note that checking
6476 STMT_VINFO_VEC_REDUCTION_TYPE (as below) only works
6477 for reductions involving a single statement. */
6478 return true;
6480 gimple *reduc_stmt = STMT_VINFO_REDUC_DEF (stmt_info);
6481 if (STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (reduc_stmt)))
6482 reduc_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (reduc_stmt));
6484 if (STMT_VINFO_VEC_REDUCTION_TYPE (vinfo_for_stmt (reduc_stmt))
6485 == EXTRACT_LAST_REDUCTION)
6486 /* Leave the scalar phi in place. */
6487 return true;
6489 gcc_assert (is_gimple_assign (reduc_stmt));
6490 for (unsigned k = 1; k < gimple_num_ops (reduc_stmt); ++k)
6492 tree op = gimple_op (reduc_stmt, k);
6493 if (op == gimple_phi_result (stmt))
6494 continue;
6495 if (k == 1
6496 && gimple_assign_rhs_code (reduc_stmt) == COND_EXPR)
6497 continue;
6498 if (!vectype_in
6499 || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in)))
6500 < GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (op)))))
6501 vectype_in = get_vectype_for_scalar_type (TREE_TYPE (op));
6502 break;
6504 gcc_assert (vectype_in);
6506 if (slp_node)
6507 ncopies = 1;
6508 else
6509 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
6511 use_operand_p use_p;
6512 gimple *use_stmt;
6513 if (ncopies > 1
6514 && (STMT_VINFO_RELEVANT (vinfo_for_stmt (reduc_stmt))
6515 <= vect_used_only_live)
6516 && single_imm_use (gimple_phi_result (stmt), &use_p, &use_stmt)
6517 && (use_stmt == reduc_stmt
6518 || (STMT_VINFO_RELATED_STMT (vinfo_for_stmt (use_stmt))
6519 == reduc_stmt)))
6520 single_defuse_cycle = true;
6522 /* Create the destination vector */
6523 scalar_dest = gimple_assign_lhs (reduc_stmt);
6524 vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
6526 if (slp_node)
6527 /* The size vect_schedule_slp_instance computes is off for us. */
6528 vec_num = vect_get_num_vectors
6529 (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
6530 * SLP_TREE_SCALAR_STMTS (slp_node).length (),
6531 vectype_in);
6532 else
6533 vec_num = 1;
6535 /* Generate the reduction PHIs upfront. */
6536 prev_phi_info = NULL;
6537 for (j = 0; j < ncopies; j++)
6539 if (j == 0 || !single_defuse_cycle)
6541 for (i = 0; i < vec_num; i++)
6543 /* Create the reduction-phi that defines the reduction
6544 operand. */
6545 gimple *new_phi = create_phi_node (vec_dest, loop->header);
6546 set_vinfo_for_stmt (new_phi,
6547 new_stmt_vec_info (new_phi, loop_vinfo));
6549 if (slp_node)
6550 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_phi);
6551 else
6553 if (j == 0)
6554 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_phi;
6555 else
6556 STMT_VINFO_RELATED_STMT (prev_phi_info) = new_phi;
6557 prev_phi_info = vinfo_for_stmt (new_phi);
6563 return true;
6566 /* 1. Is vectorizable reduction? */
6567 /* Not supportable if the reduction variable is used in the loop, unless
6568 it's a reduction chain. */
6569 if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer
6570 && !GROUP_FIRST_ELEMENT (stmt_info))
6571 return false;
6573 /* Reductions that are not used even in an enclosing outer-loop,
6574 are expected to be "live" (used out of the loop). */
6575 if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope
6576 && !STMT_VINFO_LIVE_P (stmt_info))
6577 return false;
6579 /* 2. Has this been recognized as a reduction pattern?
6581 Check if STMT represents a pattern that has been recognized
6582 in earlier analysis stages. For stmts that represent a pattern,
6583 the STMT_VINFO_RELATED_STMT field records the last stmt in
6584 the original sequence that constitutes the pattern. */
6586 orig_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt));
6587 if (orig_stmt)
6589 orig_stmt_info = vinfo_for_stmt (orig_stmt);
6590 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
6591 gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
6594 /* 3. Check the operands of the operation. The first operands are defined
6595 inside the loop body. The last operand is the reduction variable,
6596 which is defined by the loop-header-phi. */
6598 gcc_assert (is_gimple_assign (stmt));
6600 /* Flatten RHS. */
6601 switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
6603 case GIMPLE_BINARY_RHS:
6604 code = gimple_assign_rhs_code (stmt);
6605 op_type = TREE_CODE_LENGTH (code);
6606 gcc_assert (op_type == binary_op);
6607 ops[0] = gimple_assign_rhs1 (stmt);
6608 ops[1] = gimple_assign_rhs2 (stmt);
6609 break;
6611 case GIMPLE_TERNARY_RHS:
6612 code = gimple_assign_rhs_code (stmt);
6613 op_type = TREE_CODE_LENGTH (code);
6614 gcc_assert (op_type == ternary_op);
6615 ops[0] = gimple_assign_rhs1 (stmt);
6616 ops[1] = gimple_assign_rhs2 (stmt);
6617 ops[2] = gimple_assign_rhs3 (stmt);
6618 break;
6620 case GIMPLE_UNARY_RHS:
6621 return false;
6623 default:
6624 gcc_unreachable ();
6627 if (code == COND_EXPR && slp_node)
6628 return false;
6630 scalar_dest = gimple_assign_lhs (stmt);
6631 scalar_type = TREE_TYPE (scalar_dest);
6632 if (!POINTER_TYPE_P (scalar_type) && !INTEGRAL_TYPE_P (scalar_type)
6633 && !SCALAR_FLOAT_TYPE_P (scalar_type))
6634 return false;
6636 /* Do not try to vectorize bit-precision reductions. */
6637 if (!type_has_mode_precision_p (scalar_type))
6638 return false;
6640 /* All uses but the last are expected to be defined in the loop.
6641 The last use is the reduction variable. In case of nested cycle this
6642 assumption is not true: we use reduc_index to record the index of the
6643 reduction variable. */
6644 gimple *reduc_def_stmt = NULL;
6645 int reduc_index = -1;
6646 for (i = 0; i < op_type; i++)
6648 /* The condition of COND_EXPR is checked in vectorizable_condition(). */
6649 if (i == 0 && code == COND_EXPR)
6650 continue;
6652 is_simple_use = vect_is_simple_use (ops[i], loop_vinfo,
6653 &def_stmt, &dts[i], &tem);
6654 dt = dts[i];
6655 gcc_assert (is_simple_use);
6656 if (dt == vect_reduction_def)
6658 reduc_def_stmt = def_stmt;
6659 reduc_index = i;
6660 continue;
6662 else if (tem)
6664 /* To properly compute ncopies we are interested in the widest
6665 input type in case we're looking at a widening accumulation. */
6666 if (!vectype_in
6667 || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in)))
6668 < GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (tem)))))
6669 vectype_in = tem;
6672 if (dt != vect_internal_def
6673 && dt != vect_external_def
6674 && dt != vect_constant_def
6675 && dt != vect_induction_def
6676 && !(dt == vect_nested_cycle && nested_cycle))
6677 return false;
6679 if (dt == vect_nested_cycle)
6681 found_nested_cycle_def = true;
6682 reduc_def_stmt = def_stmt;
6683 reduc_index = i;
6686 if (i == 1 && code == COND_EXPR)
6688 /* Record how value of COND_EXPR is defined. */
6689 if (dt == vect_constant_def)
6691 cond_reduc_dt = dt;
6692 cond_reduc_val = ops[i];
6694 if (dt == vect_induction_def
6695 && def_stmt != NULL
6696 && is_nonwrapping_integer_induction (def_stmt, loop))
6698 cond_reduc_dt = dt;
6699 cond_reduc_def_stmt = def_stmt;
6704 if (!vectype_in)
6705 vectype_in = vectype_out;
6707 /* When vectorizing a reduction chain w/o SLP the reduction PHI is not
6708 directly used in stmt. */
6709 if (reduc_index == -1)
6711 if (STMT_VINFO_REDUC_TYPE (stmt_info) == FOLD_LEFT_REDUCTION)
6713 if (dump_enabled_p ())
6714 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6715 "in-order reduction chain without SLP.\n");
6716 return false;
6719 if (orig_stmt)
6720 reduc_def_stmt = STMT_VINFO_REDUC_DEF (orig_stmt_info);
6721 else
6722 reduc_def_stmt = STMT_VINFO_REDUC_DEF (stmt_info);
6725 if (! reduc_def_stmt || gimple_code (reduc_def_stmt) != GIMPLE_PHI)
6726 return false;
6728 if (!(reduc_index == -1
6729 || dts[reduc_index] == vect_reduction_def
6730 || dts[reduc_index] == vect_nested_cycle
6731 || ((dts[reduc_index] == vect_internal_def
6732 || dts[reduc_index] == vect_external_def
6733 || dts[reduc_index] == vect_constant_def
6734 || dts[reduc_index] == vect_induction_def)
6735 && nested_cycle && found_nested_cycle_def)))
6737 /* For pattern recognized stmts, orig_stmt might be a reduction,
6738 but some helper statements for the pattern might not, or
6739 might be COND_EXPRs with reduction uses in the condition. */
6740 gcc_assert (orig_stmt);
6741 return false;
6744 stmt_vec_info reduc_def_info = vinfo_for_stmt (reduc_def_stmt);
6745 enum vect_reduction_type v_reduc_type
6746 = STMT_VINFO_REDUC_TYPE (reduc_def_info);
6747 gimple *tmp = STMT_VINFO_REDUC_DEF (reduc_def_info);
6749 STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) = v_reduc_type;
6750 /* If we have a condition reduction, see if we can simplify it further. */
6751 if (v_reduc_type == COND_REDUCTION)
6753 /* Loop peeling modifies initial value of reduction PHI, which
6754 makes the reduction stmt to be transformed different to the
6755 original stmt analyzed. We need to record reduction code for
6756 CONST_COND_REDUCTION type reduction at analyzing stage, thus
6757 it can be used directly at transform stage. */
6758 if (STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info) == MAX_EXPR
6759 || STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info) == MIN_EXPR)
6761 /* Also set the reduction type to CONST_COND_REDUCTION. */
6762 gcc_assert (cond_reduc_dt == vect_constant_def);
6763 STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) = CONST_COND_REDUCTION;
6765 else if (direct_internal_fn_supported_p (IFN_FOLD_EXTRACT_LAST,
6766 vectype_in, OPTIMIZE_FOR_SPEED))
6768 if (dump_enabled_p ())
6769 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6770 "optimizing condition reduction with"
6771 " FOLD_EXTRACT_LAST.\n");
6772 STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) = EXTRACT_LAST_REDUCTION;
6774 else if (cond_reduc_dt == vect_induction_def)
6776 stmt_vec_info cond_stmt_vinfo = vinfo_for_stmt (cond_reduc_def_stmt);
6777 tree base
6778 = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (cond_stmt_vinfo);
6779 tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (cond_stmt_vinfo);
6781 gcc_assert (TREE_CODE (base) == INTEGER_CST
6782 && TREE_CODE (step) == INTEGER_CST);
6783 cond_reduc_val = NULL_TREE;
6784 /* Find a suitable value, for MAX_EXPR below base, for MIN_EXPR
6785 above base; punt if base is the minimum value of the type for
6786 MAX_EXPR or maximum value of the type for MIN_EXPR for now. */
6787 if (tree_int_cst_sgn (step) == -1)
6789 cond_reduc_op_code = MIN_EXPR;
6790 if (tree_int_cst_sgn (base) == -1)
6791 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
6792 else if (tree_int_cst_lt (base,
6793 TYPE_MAX_VALUE (TREE_TYPE (base))))
6794 cond_reduc_val
6795 = int_const_binop (PLUS_EXPR, base, integer_one_node);
6797 else
6799 cond_reduc_op_code = MAX_EXPR;
6800 if (tree_int_cst_sgn (base) == 1)
6801 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
6802 else if (tree_int_cst_lt (TYPE_MIN_VALUE (TREE_TYPE (base)),
6803 base))
6804 cond_reduc_val
6805 = int_const_binop (MINUS_EXPR, base, integer_one_node);
6807 if (cond_reduc_val)
6809 if (dump_enabled_p ())
6810 dump_printf_loc (MSG_NOTE, vect_location,
6811 "condition expression based on "
6812 "integer induction.\n");
6813 STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
6814 = INTEGER_INDUC_COND_REDUCTION;
6817 else if (cond_reduc_dt == vect_constant_def)
6819 enum vect_def_type cond_initial_dt;
6820 gimple *def_stmt = SSA_NAME_DEF_STMT (ops[reduc_index]);
6821 tree cond_initial_val
6822 = PHI_ARG_DEF_FROM_EDGE (def_stmt, loop_preheader_edge (loop));
6824 gcc_assert (cond_reduc_val != NULL_TREE);
6825 vect_is_simple_use (cond_initial_val, loop_vinfo,
6826 &def_stmt, &cond_initial_dt);
6827 if (cond_initial_dt == vect_constant_def
6828 && types_compatible_p (TREE_TYPE (cond_initial_val),
6829 TREE_TYPE (cond_reduc_val)))
6831 tree e = fold_binary (LE_EXPR, boolean_type_node,
6832 cond_initial_val, cond_reduc_val);
6833 if (e && (integer_onep (e) || integer_zerop (e)))
6835 if (dump_enabled_p ())
6836 dump_printf_loc (MSG_NOTE, vect_location,
6837 "condition expression based on "
6838 "compile time constant.\n");
6839 /* Record reduction code at analysis stage. */
6840 STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info)
6841 = integer_onep (e) ? MAX_EXPR : MIN_EXPR;
6842 STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
6843 = CONST_COND_REDUCTION;
6849 if (orig_stmt)
6850 gcc_assert (tmp == orig_stmt
6851 || GROUP_FIRST_ELEMENT (vinfo_for_stmt (tmp)) == orig_stmt);
6852 else
6853 /* We changed STMT to be the first stmt in reduction chain, hence we
6854 check that in this case the first element in the chain is STMT. */
6855 gcc_assert (stmt == tmp
6856 || GROUP_FIRST_ELEMENT (vinfo_for_stmt (tmp)) == stmt);
6858 if (STMT_VINFO_LIVE_P (vinfo_for_stmt (reduc_def_stmt)))
6859 return false;
6861 if (slp_node)
6862 ncopies = 1;
6863 else
6864 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
6866 gcc_assert (ncopies >= 1);
6868 vec_mode = TYPE_MODE (vectype_in);
6869 poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
6871 if (code == COND_EXPR)
6873 /* Only call during the analysis stage, otherwise we'll lose
6874 STMT_VINFO_TYPE. */
6875 if (!vec_stmt && !vectorizable_condition (stmt, gsi, NULL,
6876 ops[reduc_index], 0, NULL))
6878 if (dump_enabled_p ())
6879 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6880 "unsupported condition in reduction\n");
6881 return false;
6884 else
6886 /* 4. Supportable by target? */
6888 if (code == LSHIFT_EXPR || code == RSHIFT_EXPR
6889 || code == LROTATE_EXPR || code == RROTATE_EXPR)
6891 /* Shifts and rotates are only supported by vectorizable_shifts,
6892 not vectorizable_reduction. */
6893 if (dump_enabled_p ())
6894 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6895 "unsupported shift or rotation.\n");
6896 return false;
6899 /* 4.1. check support for the operation in the loop */
6900 optab = optab_for_tree_code (code, vectype_in, optab_default);
6901 if (!optab)
6903 if (dump_enabled_p ())
6904 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6905 "no optab.\n");
6907 return false;
6910 if (optab_handler (optab, vec_mode) == CODE_FOR_nothing)
6912 if (dump_enabled_p ())
6913 dump_printf (MSG_NOTE, "op not supported by target.\n");
6915 if (maybe_ne (GET_MODE_SIZE (vec_mode), UNITS_PER_WORD)
6916 || !vect_worthwhile_without_simd_p (loop_vinfo, code))
6917 return false;
6919 if (dump_enabled_p ())
6920 dump_printf (MSG_NOTE, "proceeding using word mode.\n");
6923 /* Worthwhile without SIMD support? */
6924 if (!VECTOR_MODE_P (TYPE_MODE (vectype_in))
6925 && !vect_worthwhile_without_simd_p (loop_vinfo, code))
6927 if (dump_enabled_p ())
6928 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6929 "not worthwhile without SIMD support.\n");
6931 return false;
6935 /* 4.2. Check support for the epilog operation.
6937 If STMT represents a reduction pattern, then the type of the
6938 reduction variable may be different than the type of the rest
6939 of the arguments. For example, consider the case of accumulation
6940 of shorts into an int accumulator; the original code:
6941 S1: int_a = (int) short_a;
6942 orig_stmt-> S2: int_acc = plus <int_a ,int_acc>;
6944 was replaced with:
6945 STMT: int_acc = widen_sum <short_a, int_acc>
6947 This means that:
6948 1. The tree-code that is used to create the vector operation in the
6949 epilog code (that reduces the partial results) is not the
6950 tree-code of STMT, but is rather the tree-code of the original
6951 stmt from the pattern that STMT is replacing. I.e, in the example
6952 above we want to use 'widen_sum' in the loop, but 'plus' in the
6953 epilog.
6954 2. The type (mode) we use to check available target support
6955 for the vector operation to be created in the *epilog*, is
6956 determined by the type of the reduction variable (in the example
6957 above we'd check this: optab_handler (plus_optab, vect_int_mode)).
6958 However the type (mode) we use to check available target support
6959 for the vector operation to be created *inside the loop*, is
6960 determined by the type of the other arguments to STMT (in the
6961 example we'd check this: optab_handler (widen_sum_optab,
6962 vect_short_mode)).
6964 This is contrary to "regular" reductions, in which the types of all
6965 the arguments are the same as the type of the reduction variable.
6966 For "regular" reductions we can therefore use the same vector type
6967 (and also the same tree-code) when generating the epilog code and
6968 when generating the code inside the loop. */
6970 vect_reduction_type reduction_type
6971 = STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info);
6972 if (orig_stmt
6973 && (reduction_type == TREE_CODE_REDUCTION
6974 || reduction_type == FOLD_LEFT_REDUCTION))
6976 /* This is a reduction pattern: get the vectype from the type of the
6977 reduction variable, and get the tree-code from orig_stmt. */
6978 orig_code = gimple_assign_rhs_code (orig_stmt);
6979 gcc_assert (vectype_out);
6980 vec_mode = TYPE_MODE (vectype_out);
6982 else
6984 /* Regular reduction: use the same vectype and tree-code as used for
6985 the vector code inside the loop can be used for the epilog code. */
6986 orig_code = code;
6988 if (code == MINUS_EXPR)
6989 orig_code = PLUS_EXPR;
6991 /* For simple condition reductions, replace with the actual expression
6992 we want to base our reduction around. */
6993 if (reduction_type == CONST_COND_REDUCTION)
6995 orig_code = STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info);
6996 gcc_assert (orig_code == MAX_EXPR || orig_code == MIN_EXPR);
6998 else if (reduction_type == INTEGER_INDUC_COND_REDUCTION)
6999 orig_code = cond_reduc_op_code;
7002 if (nested_cycle)
7004 def_bb = gimple_bb (reduc_def_stmt);
7005 def_stmt_loop = def_bb->loop_father;
7006 def_arg = PHI_ARG_DEF_FROM_EDGE (reduc_def_stmt,
7007 loop_preheader_edge (def_stmt_loop));
7008 if (TREE_CODE (def_arg) == SSA_NAME
7009 && (def_arg_stmt = SSA_NAME_DEF_STMT (def_arg))
7010 && gimple_code (def_arg_stmt) == GIMPLE_PHI
7011 && flow_bb_inside_loop_p (outer_loop, gimple_bb (def_arg_stmt))
7012 && vinfo_for_stmt (def_arg_stmt)
7013 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_arg_stmt))
7014 == vect_double_reduction_def)
7015 double_reduc = true;
7018 reduc_fn = IFN_LAST;
7020 if (reduction_type == TREE_CODE_REDUCTION
7021 || reduction_type == FOLD_LEFT_REDUCTION
7022 || reduction_type == INTEGER_INDUC_COND_REDUCTION
7023 || reduction_type == CONST_COND_REDUCTION)
7025 if (reduction_type == FOLD_LEFT_REDUCTION
7026 ? fold_left_reduction_fn (orig_code, &reduc_fn)
7027 : reduction_fn_for_scalar_code (orig_code, &reduc_fn))
7029 if (reduc_fn != IFN_LAST
7030 && !direct_internal_fn_supported_p (reduc_fn, vectype_out,
7031 OPTIMIZE_FOR_SPEED))
7033 if (dump_enabled_p ())
7034 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7035 "reduc op not supported by target.\n");
7037 reduc_fn = IFN_LAST;
7040 else
7042 if (!nested_cycle || double_reduc)
7044 if (dump_enabled_p ())
7045 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7046 "no reduc code for scalar code.\n");
7048 return false;
7052 else if (reduction_type == COND_REDUCTION)
7054 int scalar_precision
7055 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
7056 cr_index_scalar_type = make_unsigned_type (scalar_precision);
7057 cr_index_vector_type = build_vector_type (cr_index_scalar_type,
7058 nunits_out);
7060 if (direct_internal_fn_supported_p (IFN_REDUC_MAX, cr_index_vector_type,
7061 OPTIMIZE_FOR_SPEED))
7062 reduc_fn = IFN_REDUC_MAX;
7065 if (reduction_type != EXTRACT_LAST_REDUCTION
7066 && reduc_fn == IFN_LAST
7067 && !nunits_out.is_constant ())
7069 if (dump_enabled_p ())
7070 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7071 "missing target support for reduction on"
7072 " variable-length vectors.\n");
7073 return false;
7076 if ((double_reduc || reduction_type != TREE_CODE_REDUCTION)
7077 && ncopies > 1)
7079 if (dump_enabled_p ())
7080 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7081 "multiple types in double reduction or condition "
7082 "reduction.\n");
7083 return false;
7086 /* For SLP reductions, see if there is a neutral value we can use. */
7087 tree neutral_op = NULL_TREE;
7088 if (slp_node)
7089 neutral_op
7090 = neutral_op_for_slp_reduction (slp_node_instance->reduc_phis, code,
7091 GROUP_FIRST_ELEMENT (stmt_info) != NULL);
7093 if (double_reduc && reduction_type == FOLD_LEFT_REDUCTION)
7095 /* We can't support in-order reductions of code such as this:
7097 for (int i = 0; i < n1; ++i)
7098 for (int j = 0; j < n2; ++j)
7099 l += a[j];
7101 since GCC effectively transforms the loop when vectorizing:
7103 for (int i = 0; i < n1 / VF; ++i)
7104 for (int j = 0; j < n2; ++j)
7105 for (int k = 0; k < VF; ++k)
7106 l += a[j];
7108 which is a reassociation of the original operation. */
7109 if (dump_enabled_p ())
7110 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7111 "in-order double reduction not supported.\n");
7113 return false;
7116 if (reduction_type == FOLD_LEFT_REDUCTION
7117 && slp_node
7118 && !GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
7120 /* We cannot use in-order reductions in this case because there is
7121 an implicit reassociation of the operations involved. */
7122 if (dump_enabled_p ())
7123 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7124 "in-order unchained SLP reductions not supported.\n");
7125 return false;
7128 /* For double reductions, and for SLP reductions with a neutral value,
7129 we construct a variable-length initial vector by loading a vector
7130 full of the neutral value and then shift-and-inserting the start
7131 values into the low-numbered elements. */
7132 if ((double_reduc || neutral_op)
7133 && !nunits_out.is_constant ()
7134 && !direct_internal_fn_supported_p (IFN_VEC_SHL_INSERT,
7135 vectype_out, OPTIMIZE_FOR_SPEED))
7137 if (dump_enabled_p ())
7138 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7139 "reduction on variable-length vectors requires"
7140 " target support for a vector-shift-and-insert"
7141 " operation.\n");
7142 return false;
7145 /* Check extra constraints for variable-length unchained SLP reductions. */
7146 if (STMT_SLP_TYPE (stmt_info)
7147 && !GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt))
7148 && !nunits_out.is_constant ())
7150 /* We checked above that we could build the initial vector when
7151 there's a neutral element value. Check here for the case in
7152 which each SLP statement has its own initial value and in which
7153 that value needs to be repeated for every instance of the
7154 statement within the initial vector. */
7155 unsigned int group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
7156 scalar_mode elt_mode = SCALAR_TYPE_MODE (TREE_TYPE (vectype_out));
7157 if (!neutral_op
7158 && !can_duplicate_and_interleave_p (group_size, elt_mode))
7160 if (dump_enabled_p ())
7161 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7162 "unsupported form of SLP reduction for"
7163 " variable-length vectors: cannot build"
7164 " initial vector.\n");
7165 return false;
7167 /* The epilogue code relies on the number of elements being a multiple
7168 of the group size. The duplicate-and-interleave approach to setting
7169 up the initial vector does too. */
7170 if (!multiple_p (nunits_out, group_size))
7172 if (dump_enabled_p ())
7173 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7174 "unsupported form of SLP reduction for"
7175 " variable-length vectors: the vector size"
7176 " is not a multiple of the number of results.\n");
7177 return false;
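/* Worked instance (added note, not part of the original source): with a
   variable-length vector of nunits_out = 2 + 2x elements, an SLP group of
   size 2 divides every possible vector length, so multiple_p succeeds;
   multiple_p must hold for all runtime vector lengths, and 2 + 2x is not
   always a multiple of 3, so a group of size 3 is rejected here.  */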
7181 /* In case of widening multiplication by a constant, we update the type
7182 of the constant to be the type of the other operand. We check that the
7183 constant fits the type in the pattern recognition pass. */
7184 if (code == DOT_PROD_EXPR
7185 && !types_compatible_p (TREE_TYPE (ops[0]), TREE_TYPE (ops[1])))
7187 if (TREE_CODE (ops[0]) == INTEGER_CST)
7188 ops[0] = fold_convert (TREE_TYPE (ops[1]), ops[0]);
7189 else if (TREE_CODE (ops[1]) == INTEGER_CST)
7190 ops[1] = fold_convert (TREE_TYPE (ops[0]), ops[1]);
7191 else
7193 if (dump_enabled_p ())
7194 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7195 "invalid types in dot-prod\n");
7197 return false;
7201 if (reduction_type == COND_REDUCTION)
7203 widest_int ni;
7205 if (! max_loop_iterations (loop, &ni))
7207 if (dump_enabled_p ())
7208 dump_printf_loc (MSG_NOTE, vect_location,
7209 "loop count not known, cannot create cond "
7210 "reduction.\n");
7211 return false;
7213 /* Convert backedges to iterations. */
7214 ni += 1;
7216 /* The additional index will be the same type as the condition. Check
7217 that the loop count fits into this type less one (because we'll use up the
7218 zero slot for when there are no matches). */
7219 tree max_index = TYPE_MAX_VALUE (cr_index_scalar_type);
7220 if (wi::geu_p (ni, wi::to_widest (max_index)))
7222 if (dump_enabled_p ())
7223 dump_printf_loc (MSG_NOTE, vect_location,
7224 "loop size is greater than data size.\n");
7225 return false;
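/* Worked instance (added note, not part of the original source): if
   cr_index_scalar_type ends up as a 16-bit unsigned type, max_index is
   65535.  Since index 0 is reserved for "no match", the check above
   rejects loops whose iteration count ni (backedges + 1) reaches 65535;
   loops with at most 65534 iterations pass.  */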
7229 /* In case the vectorization factor (VF) is bigger than the number
7230 of elements that we can fit in a vectype (nunits), we have to generate
7231 more than one vector stmt - i.e. - we need to "unroll" the
7232 vector stmt by a factor VF/nunits. For more details see documentation
7233 in vectorizable_operation. */
7235 /* If the reduction is used in an outer loop we need to generate
7236 VF intermediate results, like so (e.g. for ncopies=2):
7237 r0 = phi (init, r0)
7238 r1 = phi (init, r1)
7239 r0 = x0 + r0;
7240 r1 = x1 + r1;
7241 (i.e. we generate VF results in 2 registers).
7242 In this case we have a separate def-use cycle for each copy, and therefore
7243 for each copy we get the vector def for the reduction variable from the
7244 respective phi node created for this copy.
7246 Otherwise (the reduction is unused in the loop nest), we can combine
7247 together intermediate results, like so (e.g. for ncopies=2):
7248 r = phi (init, r)
7249 r = x0 + r;
7250 r = x1 + r;
7251 (i.e. we generate VF/2 results in a single register).
7252 In this case for each copy we get the vector def for the reduction variable
7253 from the vectorized reduction operation generated in the previous iteration.
7255 This only works when we see both the reduction PHI and its only consumer
7256 in vectorizable_reduction and there are no intermediate stmts
7257 participating. */
7258 use_operand_p use_p;
7259 gimple *use_stmt;
7260 if (ncopies > 1
7261 && (STMT_VINFO_RELEVANT (stmt_info) <= vect_used_only_live)
7262 && single_imm_use (gimple_phi_result (reduc_def_stmt), &use_p, &use_stmt)
7263 && (use_stmt == stmt
7264 || STMT_VINFO_RELATED_STMT (vinfo_for_stmt (use_stmt)) == stmt))
7266 single_defuse_cycle = true;
7267 epilog_copies = 1;
7269 else
7270 epilog_copies = ncopies;
7272 /* If the reduction stmt is one of the patterns that have lane
7273 reduction embedded we cannot handle the case of ! single_defuse_cycle. */
7274 if ((ncopies > 1
7275 && ! single_defuse_cycle)
7276 && (code == DOT_PROD_EXPR
7277 || code == WIDEN_SUM_EXPR
7278 || code == SAD_EXPR))
7280 if (dump_enabled_p ())
7281 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7282 "multi def-use cycle not possible for lane-reducing "
7283 "reduction operation\n");
7284 return false;
7287 if (slp_node)
7288 vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
7289 else
7290 vec_num = 1;
7292 internal_fn cond_fn = get_conditional_internal_fn (code);
7293 vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
7295 if (!vec_stmt) /* transformation not required. */
7297 if (first_p)
7298 vect_model_reduction_cost (stmt_info, reduc_fn, ncopies);
7299 if (loop_vinfo && LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo))
7301 if (reduction_type != FOLD_LEFT_REDUCTION
7302 && (cond_fn == IFN_LAST
7303 || !direct_internal_fn_supported_p (cond_fn, vectype_in,
7304 OPTIMIZE_FOR_SPEED)))
7306 if (dump_enabled_p ())
7307 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7308 "can't use a fully-masked loop because no"
7309 " conditional operation is available.\n");
7310 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
7312 else if (reduc_index == -1)
7314 if (dump_enabled_p ())
7315 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7316 "can't use a fully-masked loop for chained"
7317 " reductions.\n");
7318 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
7320 else
7321 vect_record_loop_mask (loop_vinfo, masks, ncopies * vec_num,
7322 vectype_in);
7324 if (dump_enabled_p ()
7325 && reduction_type == FOLD_LEFT_REDUCTION)
7326 dump_printf_loc (MSG_NOTE, vect_location,
7327 "using an in-order (fold-left) reduction.\n");
7328 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
7329 return true;
7332 /* Transform. */
7334 if (dump_enabled_p ())
7335 dump_printf_loc (MSG_NOTE, vect_location, "transform reduction.\n");
7337 /* FORNOW: Multiple types are not supported for condition. */
7338 if (code == COND_EXPR)
7339 gcc_assert (ncopies == 1);
7341 bool masked_loop_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
7343 if (reduction_type == FOLD_LEFT_REDUCTION)
7344 return vectorize_fold_left_reduction
7345 (stmt, gsi, vec_stmt, slp_node, reduc_def_stmt, code,
7346 reduc_fn, ops, vectype_in, reduc_index, masks);
7348 if (reduction_type == EXTRACT_LAST_REDUCTION)
7350 gcc_assert (!slp_node);
7351 return vectorizable_condition (stmt, gsi, vec_stmt,
7352 NULL, reduc_index, NULL);
7355 /* Create the destination vector */
7356 vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
7358 prev_stmt_info = NULL;
7359 prev_phi_info = NULL;
7360 if (!slp_node)
7362 vec_oprnds0.create (1);
7363 vec_oprnds1.create (1);
7364 if (op_type == ternary_op)
7365 vec_oprnds2.create (1);
7368 phis.create (vec_num);
7369 vect_defs.create (vec_num);
7370 if (!slp_node)
7371 vect_defs.quick_push (NULL_TREE);
7373 if (slp_node)
7374 phis.splice (SLP_TREE_VEC_STMTS (slp_node_instance->reduc_phis));
7375 else
7376 phis.quick_push (STMT_VINFO_VEC_STMT (vinfo_for_stmt (reduc_def_stmt)));
7378 for (j = 0; j < ncopies; j++)
7380 if (code == COND_EXPR)
7382 gcc_assert (!slp_node);
7383 vectorizable_condition (stmt, gsi, vec_stmt,
7384 PHI_RESULT (phis[0]),
7385 reduc_index, NULL);
7386 /* Multiple types are not supported for condition. */
7387 break;
7390 /* Handle uses. */
7391 if (j == 0)
7393 if (slp_node)
7395 /* Get vec defs for all the operands except the reduction index,
7396 ensuring the ordering of the ops in the vector is kept. */
7397 auto_vec<tree, 3> slp_ops;
7398 auto_vec<vec<tree>, 3> vec_defs;
7400 slp_ops.quick_push (ops[0]);
7401 slp_ops.quick_push (ops[1]);
7402 if (op_type == ternary_op)
7403 slp_ops.quick_push (ops[2]);
7405 vect_get_slp_defs (slp_ops, slp_node, &vec_defs);
7407 vec_oprnds0.safe_splice (vec_defs[0]);
7408 vec_defs[0].release ();
7409 vec_oprnds1.safe_splice (vec_defs[1]);
7410 vec_defs[1].release ();
7411 if (op_type == ternary_op)
7413 vec_oprnds2.safe_splice (vec_defs[2]);
7414 vec_defs[2].release ();
7417 else
7419 vec_oprnds0.quick_push
7420 (vect_get_vec_def_for_operand (ops[0], stmt));
7421 vec_oprnds1.quick_push
7422 (vect_get_vec_def_for_operand (ops[1], stmt));
7423 if (op_type == ternary_op)
7424 vec_oprnds2.quick_push
7425 (vect_get_vec_def_for_operand (ops[2], stmt));
7428 else
7430 if (!slp_node)
7432 gcc_assert (reduc_index != -1 || ! single_defuse_cycle);
7434 if (single_defuse_cycle && reduc_index == 0)
7435 vec_oprnds0[0] = gimple_get_lhs (new_stmt);
7436 else
7437 vec_oprnds0[0]
7438 = vect_get_vec_def_for_stmt_copy (dts[0], vec_oprnds0[0]);
7439 if (single_defuse_cycle && reduc_index == 1)
7440 vec_oprnds1[0] = gimple_get_lhs (new_stmt);
7441 else
7442 vec_oprnds1[0]
7443 = vect_get_vec_def_for_stmt_copy (dts[1], vec_oprnds1[0]);
7444 if (op_type == ternary_op)
7446 if (single_defuse_cycle && reduc_index == 2)
7447 vec_oprnds2[0] = gimple_get_lhs (new_stmt);
7448 else
7449 vec_oprnds2[0]
7450 = vect_get_vec_def_for_stmt_copy (dts[2], vec_oprnds2[0]);
7455 FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
7457 tree vop[3] = { def0, vec_oprnds1[i], NULL_TREE };
7458 if (masked_loop_p)
7460 /* Make sure that the reduction accumulator is vop[0]. */
7461 if (reduc_index == 1)
7463 gcc_assert (commutative_tree_code (code));
7464 std::swap (vop[0], vop[1]);
7466 tree mask = vect_get_loop_mask (gsi, masks, vec_num * ncopies,
7467 vectype_in, i * ncopies + j);
7468 gcall *call = gimple_build_call_internal (cond_fn, 3, mask,
7469 vop[0], vop[1]);
7470 new_temp = make_ssa_name (vec_dest, call);
7471 gimple_call_set_lhs (call, new_temp);
7472 gimple_call_set_nothrow (call, true);
7473 new_stmt = call;
7475 else
7477 if (op_type == ternary_op)
7478 vop[2] = vec_oprnds2[i];
7480 new_temp = make_ssa_name (vec_dest, new_stmt);
7481 new_stmt = gimple_build_assign (new_temp, code,
7482 vop[0], vop[1], vop[2]);
7484 vect_finish_stmt_generation (stmt, new_stmt, gsi);
7486 if (slp_node)
7488 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
7489 vect_defs.quick_push (new_temp);
7491 else
7492 vect_defs[0] = new_temp;
7495 if (slp_node)
7496 continue;
7498 if (j == 0)
7499 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
7500 else
7501 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
7503 prev_stmt_info = vinfo_for_stmt (new_stmt);
7506 /* Finalize the reduction-phi (set its arguments) and create the
7507 epilog reduction code. */
7508 if ((!single_defuse_cycle || code == COND_EXPR) && !slp_node)
7509 vect_defs[0] = gimple_get_lhs (*vec_stmt);
7511 vect_create_epilog_for_reduction (vect_defs, stmt, reduc_def_stmt,
7512 epilog_copies, reduc_fn, phis,
7513 double_reduc, slp_node, slp_node_instance,
7514 cond_reduc_val, cond_reduc_op_code,
7515 neutral_op);
7517 return true;
7520 /* Function vect_min_worthwhile_factor.
7522 For a loop where we could vectorize the operation indicated by CODE,
7523 return the minimum vectorization factor that makes it worthwhile
7524 to use generic vectors. */
7525 static unsigned int
7526 vect_min_worthwhile_factor (enum tree_code code)
7528 switch (code)
7530 case PLUS_EXPR:
7531 case MINUS_EXPR:
7532 case NEGATE_EXPR:
7533 return 4;
7535 case BIT_AND_EXPR:
7536 case BIT_IOR_EXPR:
7537 case BIT_XOR_EXPR:
7538 case BIT_NOT_EXPR:
7539 return 2;
7541 default:
7542 return INT_MAX;
7546 /* Return true if VINFO indicates we are doing loop vectorization and if
7547 it is worth decomposing CODE operations into scalar operations for
7548 that loop's vectorization factor. */
7550 bool
7551 vect_worthwhile_without_simd_p (vec_info *vinfo, tree_code code)
7553 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
7554 unsigned HOST_WIDE_INT value;
7555 return (loop_vinfo
7556 && LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&value)
7557 && value >= vect_min_worthwhile_factor (code));
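/* Worked instance (added note, not part of the original source): for a
   loop with a constant vectorization factor of 4, both PLUS_EXPR
   (threshold 4) and BIT_AND_EXPR (threshold 2) are considered worthwhile
   to emulate with scalar code; with a factor of 2 only the bitwise codes
   qualify, and a variable-length factor never qualifies because
   is_constant fails.  */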
7560 /* Function vectorizable_induction
7562 Check if PHI performs an induction computation that can be vectorized.
7563 If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
7564 phi to replace it, put it in VEC_STMT, and add it to the same basic block.
7565 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
7567 bool
7568 vectorizable_induction (gimple *phi,
7569 gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
7570 gimple **vec_stmt, slp_tree slp_node)
7572 stmt_vec_info stmt_info = vinfo_for_stmt (phi);
7573 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
7574 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7575 unsigned ncopies;
7576 bool nested_in_vect_loop = false;
7577 struct loop *iv_loop;
7578 tree vec_def;
7579 edge pe = loop_preheader_edge (loop);
7580 basic_block new_bb;
7581 tree new_vec, vec_init, vec_step, t;
7582 tree new_name;
7583 gimple *new_stmt;
7584 gphi *induction_phi;
7585 tree induc_def, vec_dest;
7586 tree init_expr, step_expr;
7587 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
7588 unsigned i;
7589 tree expr;
7590 gimple_seq stmts;
7591 imm_use_iterator imm_iter;
7592 use_operand_p use_p;
7593 gimple *exit_phi;
7594 edge latch_e;
7595 tree loop_arg;
7596 gimple_stmt_iterator si;
7597 basic_block bb = gimple_bb (phi);
7599 if (gimple_code (phi) != GIMPLE_PHI)
7600 return false;
7602 if (!STMT_VINFO_RELEVANT_P (stmt_info))
7603 return false;
7605 /* Make sure it was recognized as induction computation. */
7606 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
7607 return false;
7609 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
7610 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
7612 if (slp_node)
7613 ncopies = 1;
7614 else
7615 ncopies = vect_get_num_copies (loop_vinfo, vectype);
7616 gcc_assert (ncopies >= 1);
7618 /* FORNOW. These restrictions should be relaxed. */
7619 if (nested_in_vect_loop_p (loop, phi))
7621 imm_use_iterator imm_iter;
7622 use_operand_p use_p;
7623 gimple *exit_phi;
7624 edge latch_e;
7625 tree loop_arg;
7627 if (ncopies > 1)
7629 if (dump_enabled_p ())
7630 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7631 "multiple types in nested loop.\n");
7632 return false;
7635 /* FORNOW: outer loop induction with SLP not supported. */
7636 if (STMT_SLP_TYPE (stmt_info))
7637 return false;
7639 exit_phi = NULL;
7640 latch_e = loop_latch_edge (loop->inner);
7641 loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
7642 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
7644 gimple *use_stmt = USE_STMT (use_p);
7645 if (is_gimple_debug (use_stmt))
7646 continue;
7648 if (!flow_bb_inside_loop_p (loop->inner, gimple_bb (use_stmt)))
7650 exit_phi = use_stmt;
7651 break;
7654 if (exit_phi)
7656 stmt_vec_info exit_phi_vinfo = vinfo_for_stmt (exit_phi);
7657 if (!(STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
7658 && !STMT_VINFO_LIVE_P (exit_phi_vinfo)))
7660 if (dump_enabled_p ())
7661 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7662 "inner-loop induction only used outside "
7663 "of the outer vectorized loop.\n");
7664 return false;
7668 nested_in_vect_loop = true;
7669 iv_loop = loop->inner;
7671 else
7672 iv_loop = loop;
7673 gcc_assert (iv_loop == (gimple_bb (phi))->loop_father);
7675 if (slp_node && !nunits.is_constant ())
7677 /* The current SLP code creates the initial value element-by-element. */
7678 if (dump_enabled_p ())
7679 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7680 "SLP induction not supported for variable-length"
7681 " vectors.\n");
7682 return false;
7685 if (!vec_stmt) /* transformation not required. */
7687 STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
7688 if (dump_enabled_p ())
7689 dump_printf_loc (MSG_NOTE, vect_location,
7690 "=== vectorizable_induction ===\n");
7691 vect_model_induction_cost (stmt_info, ncopies);
7692 return true;
7695 /* Transform. */
7697 /* Compute a vector variable, initialized with the first VF values of
7698 the induction variable. E.g., for an iv with IV_PHI='X' and
7699 evolution S, for a vector of 4 units, we want to compute:
7700 [X, X + S, X + 2*S, X + 3*S]. */
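/* Worked instance (added note, not part of the original source): with
   X = 10, S = 3 and a 4-element vector, the prolog computes
   vec_init = [10, 13, 16, 19] and vec_step = [VF*S, ...] = [12, 12, 12, 12];
   each vector iteration then adds vec_step so that all four lanes advance
   by one full vector's worth of scalar iterations.  */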
7702 if (dump_enabled_p ())
7703 dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n");
7705 latch_e = loop_latch_edge (iv_loop);
7706 loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
7708 step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info);
7709 gcc_assert (step_expr != NULL_TREE);
7711 pe = loop_preheader_edge (iv_loop);
7712 init_expr = PHI_ARG_DEF_FROM_EDGE (phi,
7713 loop_preheader_edge (iv_loop));
7715 stmts = NULL;
7716 if (!nested_in_vect_loop)
7718 /* Convert the initial value to the desired type. */
7719 tree new_type = TREE_TYPE (vectype);
7720 init_expr = gimple_convert (&stmts, new_type, init_expr);
7722 /* If we are using the loop mask to "peel" for alignment then we need
7723 to adjust the start value here. */
7724 tree skip_niters = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
7725 if (skip_niters != NULL_TREE)
7727 if (FLOAT_TYPE_P (vectype))
7728 skip_niters = gimple_build (&stmts, FLOAT_EXPR, new_type,
7729 skip_niters);
7730 else
7731 skip_niters = gimple_convert (&stmts, new_type, skip_niters);
7732 tree skip_step = gimple_build (&stmts, MULT_EXPR, new_type,
7733 skip_niters, step_expr);
7734 init_expr = gimple_build (&stmts, MINUS_EXPR, new_type,
7735 init_expr, skip_step);
7739 /* Convert the step to the desired type. */
7740 step_expr = gimple_convert (&stmts, TREE_TYPE (vectype), step_expr);
7742 if (stmts)
7744 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
7745 gcc_assert (!new_bb);
7748 /* Find the first insertion point in the BB. */
7749 si = gsi_after_labels (bb);
7751 /* For SLP induction we have to generate several IVs as for example
7752 with group size 3 we need [i, i, i, i + S] [i + S, i + S, i + 2*S, i + 2*S]
7753 [i + 2*S, i + 3*S, i + 3*S, i + 3*S]. The step is the same uniform
7754 [VF*S, VF*S, VF*S, VF*S] for all. */
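/* Worked instance (added note, not part of the original source): for group
   size 2 and 4-element vectors needing 2 vector stmts,
   nivs = lcm (2, 4) / 4 = 1, so a single IV [i, i, i + S, i + S] is built
   and the second vector stmt is derived from it further below by adding
   [2*S, 2*S, 2*S, 2*S], i.e. a per-vector step of lcm (2, 4) / 2 = 2 group
   instances, giving [i + 2*S, i + 2*S, i + 3*S, i + 3*S].  */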
7755 if (slp_node)
7757 /* Enforced above. */
7758 unsigned int const_nunits = nunits.to_constant ();
7760 /* Generate [VF*S, VF*S, ... ]. */
7761 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7763 expr = build_int_cst (integer_type_node, vf);
7764 expr = fold_convert (TREE_TYPE (step_expr), expr);
7766 else
7767 expr = build_int_cst (TREE_TYPE (step_expr), vf);
7768 new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
7769 expr, step_expr);
7770 if (! CONSTANT_CLASS_P (new_name))
7771 new_name = vect_init_vector (phi, new_name,
7772 TREE_TYPE (step_expr), NULL);
7773 new_vec = build_vector_from_val (vectype, new_name);
7774 vec_step = vect_init_vector (phi, new_vec, vectype, NULL);
7776 /* Now generate the IVs. */
7777 unsigned group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
7778 unsigned nvects = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
7779 unsigned elts = const_nunits * nvects;
7780 unsigned nivs = least_common_multiple (group_size,
7781 const_nunits) / const_nunits;
7782 gcc_assert (elts % group_size == 0);
7783 tree elt = init_expr;
7784 unsigned ivn;
7785 for (ivn = 0; ivn < nivs; ++ivn)
7787 tree_vector_builder elts (vectype, const_nunits, 1);
7788 stmts = NULL;
7789 for (unsigned eltn = 0; eltn < const_nunits; ++eltn)
7791 if (ivn*const_nunits + eltn >= group_size
7792 && (ivn * const_nunits + eltn) % group_size == 0)
7793 elt = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (elt),
7794 elt, step_expr);
7795 elts.quick_push (elt);
7797 vec_init = gimple_build_vector (&stmts, &elts);
7798 if (stmts)
7800 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
7801 gcc_assert (!new_bb);
7804 /* Create the induction-phi that defines the induction-operand. */
7805 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
7806 induction_phi = create_phi_node (vec_dest, iv_loop->header);
7807 set_vinfo_for_stmt (induction_phi,
7808 new_stmt_vec_info (induction_phi, loop_vinfo));
7809 induc_def = PHI_RESULT (induction_phi);
7811 /* Create the iv update inside the loop */
7812 vec_def = make_ssa_name (vec_dest);
7813 new_stmt = gimple_build_assign (vec_def, PLUS_EXPR, induc_def, vec_step);
7814 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
7815 set_vinfo_for_stmt (new_stmt, new_stmt_vec_info (new_stmt, loop_vinfo));
7817 /* Set the arguments of the phi node: */
7818 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
7819 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
7820 UNKNOWN_LOCATION);
7822 SLP_TREE_VEC_STMTS (slp_node).quick_push (induction_phi);
7825 /* Re-use IVs when we can. */
7826 if (ivn < nvects)
7828 unsigned vfp
7829 = least_common_multiple (group_size, const_nunits) / group_size;
7830 /* Generate [VF'*S, VF'*S, ... ]. */
7831 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7833 expr = build_int_cst (integer_type_node, vfp);
7834 expr = fold_convert (TREE_TYPE (step_expr), expr);
7836 else
7837 expr = build_int_cst (TREE_TYPE (step_expr), vfp);
7838 new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
7839 expr, step_expr);
7840 if (! CONSTANT_CLASS_P (new_name))
7841 new_name = vect_init_vector (phi, new_name,
7842 TREE_TYPE (step_expr), NULL);
7843 new_vec = build_vector_from_val (vectype, new_name);
7844 vec_step = vect_init_vector (phi, new_vec, vectype, NULL);
7845 for (; ivn < nvects; ++ivn)
7847 gimple *iv = SLP_TREE_VEC_STMTS (slp_node)[ivn - nivs];
7848 tree def;
7849 if (gimple_code (iv) == GIMPLE_PHI)
7850 def = gimple_phi_result (iv);
7851 else
7852 def = gimple_assign_lhs (iv);
7853 new_stmt = gimple_build_assign (make_ssa_name (vectype),
7854 PLUS_EXPR,
7855 def, vec_step);
7856 if (gimple_code (iv) == GIMPLE_PHI)
7857 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
7858 else
7860 gimple_stmt_iterator tgsi = gsi_for_stmt (iv);
7861 gsi_insert_after (&tgsi, new_stmt, GSI_CONTINUE_LINKING);
7863 set_vinfo_for_stmt (new_stmt,
7864 new_stmt_vec_info (new_stmt, loop_vinfo));
7865 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
7869 return true;
7872 /* Create the vector that holds the initial_value of the induction. */
7873 if (nested_in_vect_loop)
7875 /* iv_loop is nested in the loop to be vectorized. init_expr had already
7876 been created during vectorization of previous stmts. We obtain it
7877 from the STMT_VINFO_VEC_STMT of the defining stmt. */
7878 vec_init = vect_get_vec_def_for_operand (init_expr, phi);
7879 /* If the initial value is not of proper type, convert it. */
7880 if (!useless_type_conversion_p (vectype, TREE_TYPE (vec_init)))
7882 new_stmt
7883 = gimple_build_assign (vect_get_new_ssa_name (vectype,
7884 vect_simple_var,
7885 "vec_iv_"),
7886 VIEW_CONVERT_EXPR,
7887 build1 (VIEW_CONVERT_EXPR, vectype,
7888 vec_init));
7889 vec_init = gimple_assign_lhs (new_stmt);
7890 new_bb = gsi_insert_on_edge_immediate (loop_preheader_edge (iv_loop),
7891 new_stmt);
7892 gcc_assert (!new_bb);
7893 set_vinfo_for_stmt (new_stmt,
7894 new_stmt_vec_info (new_stmt, loop_vinfo));
7897 else
7899 /* iv_loop is the loop to be vectorized. Create:
7900 vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr) */
7901 stmts = NULL;
7902 new_name = gimple_convert (&stmts, TREE_TYPE (vectype), init_expr);
7904 unsigned HOST_WIDE_INT const_nunits;
7905 if (nunits.is_constant (&const_nunits))
7907 tree_vector_builder elts (vectype, const_nunits, 1);
7908 elts.quick_push (new_name);
7909 for (i = 1; i < const_nunits; i++)
7911 /* Create: new_name_i = new_name + step_expr */
7912 new_name = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (new_name),
7913 new_name, step_expr);
7914 elts.quick_push (new_name);
7916 /* Create a vector from [new_name_0, new_name_1, ...,
7917 new_name_nunits-1] */
7918 vec_init = gimple_build_vector (&stmts, &elts);
7920 else if (INTEGRAL_TYPE_P (TREE_TYPE (step_expr)))
7921 /* Build the initial value directly from a VEC_SERIES_EXPR. */
7922 vec_init = gimple_build (&stmts, VEC_SERIES_EXPR, vectype,
7923 new_name, step_expr);
7924 else
7926 /* Build:
7927 [base, base, base, ...]
7928 + (vectype) [0, 1, 2, ...] * [step, step, step, ...]. */
7929 gcc_assert (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)));
7930 gcc_assert (flag_associative_math);
7931 tree index = build_index_vector (vectype, 0, 1);
7932 tree base_vec = gimple_build_vector_from_val (&stmts, vectype,
7933 new_name);
7934 tree step_vec = gimple_build_vector_from_val (&stmts, vectype,
7935 step_expr);
7936 vec_init = gimple_build (&stmts, FLOAT_EXPR, vectype, index);
7937 vec_init = gimple_build (&stmts, MULT_EXPR, vectype,
7938 vec_init, step_vec);
7939 vec_init = gimple_build (&stmts, PLUS_EXPR, vectype,
7940 vec_init, base_vec);
7943 if (stmts)
7945 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
7946 gcc_assert (!new_bb);
7951 /* Create the vector that holds the step of the induction. */
7952 if (nested_in_vect_loop)
7953 /* iv_loop is nested in the loop to be vectorized. Generate:
7954 vec_step = [S, S, S, S] */
7955 new_name = step_expr;
7956 else
7958 /* iv_loop is the loop to be vectorized. Generate:
7959 vec_step = [VF*S, VF*S, VF*S, VF*S] */
7960 gimple_seq seq = NULL;
7961 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7963 expr = build_int_cst (integer_type_node, vf);
7964 expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
7966 else
7967 expr = build_int_cst (TREE_TYPE (step_expr), vf);
7968 new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
7969 expr, step_expr);
7970 if (seq)
7972 new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
7973 gcc_assert (!new_bb);
7977 t = unshare_expr (new_name);
7978 gcc_assert (CONSTANT_CLASS_P (new_name)
7979 || TREE_CODE (new_name) == SSA_NAME);
7980 new_vec = build_vector_from_val (vectype, t);
7981 vec_step = vect_init_vector (phi, new_vec, vectype, NULL);
7984 /* Create the following def-use cycle:
7985 loop prolog:
7986 vec_init = ...
7987 vec_step = ...
7988 loop:
7989 vec_iv = PHI <vec_init, vec_loop>
7991 STMT
7993 vec_loop = vec_iv + vec_step; */
7995 /* Create the induction-phi that defines the induction-operand. */
7996 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
7997 induction_phi = create_phi_node (vec_dest, iv_loop->header);
7998 set_vinfo_for_stmt (induction_phi,
7999 new_stmt_vec_info (induction_phi, loop_vinfo));
8000 induc_def = PHI_RESULT (induction_phi);
8002 /* Create the iv update inside the loop */
8003 vec_def = make_ssa_name (vec_dest);
8004 new_stmt = gimple_build_assign (vec_def, PLUS_EXPR, induc_def, vec_step);
8005 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
8006 set_vinfo_for_stmt (new_stmt, new_stmt_vec_info (new_stmt, loop_vinfo));
8008 /* Set the arguments of the phi node: */
8009 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
8010 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
8011 UNKNOWN_LOCATION);
8013 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = induction_phi;
8015 /* In case the vectorization factor (VF) is bigger than the number
8016 of elements that we can fit in a vectype (nunits), we have to generate
8017 more than one vector stmt - i.e. - we need to "unroll" the
8018 vector stmt by a factor VF/nunits. For more details see documentation
8019 in vectorizable_operation. */
8021 if (ncopies > 1)
8023 gimple_seq seq = NULL;
8024 stmt_vec_info prev_stmt_vinfo;
8025 /* FORNOW. This restriction should be relaxed. */
8026 gcc_assert (!nested_in_vect_loop);
8028 /* Create the vector that holds the step of the induction. */
8029 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
8031 expr = build_int_cst (integer_type_node, nunits);
8032 expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
8034 else
8035 expr = build_int_cst (TREE_TYPE (step_expr), nunits);
8036 new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
8037 expr, step_expr);
8038 if (seq)
8040 new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
8041 gcc_assert (!new_bb);
8044 t = unshare_expr (new_name);
8045 gcc_assert (CONSTANT_CLASS_P (new_name)
8046 || TREE_CODE (new_name) == SSA_NAME);
8047 new_vec = build_vector_from_val (vectype, t);
8048 vec_step = vect_init_vector (phi, new_vec, vectype, NULL);
8050 vec_def = induc_def;
8051 prev_stmt_vinfo = vinfo_for_stmt (induction_phi);
8052 for (i = 1; i < ncopies; i++)
8054 /* vec_i = vec_prev + vec_step */
8055 new_stmt = gimple_build_assign (vec_dest, PLUS_EXPR,
8056 vec_def, vec_step);
8057 vec_def = make_ssa_name (vec_dest, new_stmt);
8058 gimple_assign_set_lhs (new_stmt, vec_def);
8060 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
8061 set_vinfo_for_stmt (new_stmt,
8062 new_stmt_vec_info (new_stmt, loop_vinfo));
8063 STMT_VINFO_RELATED_STMT (prev_stmt_vinfo) = new_stmt;
8064 prev_stmt_vinfo = vinfo_for_stmt (new_stmt);
8068 if (nested_in_vect_loop)
8070 /* Find the loop-closed exit-phi of the induction, and record
8071 the final vector of induction results: */
8072 exit_phi = NULL;
8073 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
8075 gimple *use_stmt = USE_STMT (use_p);
8076 if (is_gimple_debug (use_stmt))
8077 continue;
8079 if (!flow_bb_inside_loop_p (iv_loop, gimple_bb (use_stmt)))
8081 exit_phi = use_stmt;
8082 break;
8085 if (exit_phi)
8087 stmt_vec_info stmt_vinfo = vinfo_for_stmt (exit_phi);
8088 /* FORNOW. Currently not supporting the case that an inner-loop induction
8089 is not used in the outer-loop (i.e. only outside the outer-loop). */
8090 gcc_assert (STMT_VINFO_RELEVANT_P (stmt_vinfo)
8091 && !STMT_VINFO_LIVE_P (stmt_vinfo));
8093 STMT_VINFO_VEC_STMT (stmt_vinfo) = new_stmt;
8094 if (dump_enabled_p ())
8096 dump_printf_loc (MSG_NOTE, vect_location,
8097 "vector of inductions after inner-loop:");
8098 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, new_stmt, 0);
8104 if (dump_enabled_p ())
8106 dump_printf_loc (MSG_NOTE, vect_location,
8107 "transform induction: created def-use cycle: ");
8108 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, induction_phi, 0);
8109 dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
8110 SSA_NAME_DEF_STMT (vec_def), 0);
8113 return true;
8116 /* Function vectorizable_live_operation.
8118 STMT computes a value that is used outside the loop. Check if
8119 it can be supported. */
8121 bool
8122 vectorizable_live_operation (gimple *stmt,
8123 gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
8124 slp_tree slp_node, int slp_index,
8125 gimple **vec_stmt)
8127 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
8128 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
8129 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8130 imm_use_iterator imm_iter;
8131 tree lhs, lhs_type, bitsize, vec_bitsize;
8132 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
8133 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
8134 int ncopies;
8135 gimple *use_stmt;
8136 auto_vec<tree> vec_oprnds;
8137 int vec_entry = 0;
8138 poly_uint64 vec_index = 0;
8140 gcc_assert (STMT_VINFO_LIVE_P (stmt_info));
8142 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def)
8143 return false;
8145 /* FORNOW. CHECKME. */
8146 if (nested_in_vect_loop_p (loop, stmt))
8147 return false;
8149 /* If STMT is not relevant and it is a simple assignment and its inputs are
8150 invariant then it can remain in place, unvectorized. The original last
8151 scalar value that it computes will be used. */
8152 if (!STMT_VINFO_RELEVANT_P (stmt_info))
8154 gcc_assert (is_simple_and_all_uses_invariant (stmt, loop_vinfo));
8155 if (dump_enabled_p ())
8156 dump_printf_loc (MSG_NOTE, vect_location,
8157 "statement is simple and uses invariant. Leaving in "
8158 "place.\n");
8159 return true;
8162 if (slp_node)
8163 ncopies = 1;
8164 else
8165 ncopies = vect_get_num_copies (loop_vinfo, vectype);
8167 if (slp_node)
8169 gcc_assert (slp_index >= 0);
8171 int num_scalar = SLP_TREE_SCALAR_STMTS (slp_node).length ();
8172 int num_vec = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
8174 /* Get the last occurrence of the scalar index from the concatenation of
8175 all the slp vectors. Calculate which slp vector it is and the index
8176 within. */
8177 poly_uint64 pos = (num_vec * nunits) - num_scalar + slp_index;
8179 /* Calculate which vector contains the result, and which lane of
8180 that vector we need. */
8181 if (!can_div_trunc_p (pos, nunits, &vec_entry, &vec_index))
8183 if (dump_enabled_p ())
8184 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8185 "Cannot determine which vector holds the"
8186 " final result.\n");
8187 return false;
8191 if (!vec_stmt)
8193 /* No transformation required. */
8194 if (LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo))
8196 if (!direct_internal_fn_supported_p (IFN_EXTRACT_LAST, vectype,
8197 OPTIMIZE_FOR_SPEED))
8199 if (dump_enabled_p ())
8200 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8201 "can't use a fully-masked loop because "
8202 "the target doesn't support extract last "
8203 "reduction.\n");
8204 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
8206 else if (slp_node)
8208 if (dump_enabled_p ())
8209 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8210 "can't use a fully-masked loop because an "
8211 "SLP statement is live after the loop.\n");
8212 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
8214 else if (ncopies > 1)
8216 if (dump_enabled_p ())
8217 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8218 "can't use a fully-masked loop because"
8219 " ncopies is greater than 1.\n");
8220 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
8222 else
8224 gcc_assert (ncopies == 1 && !slp_node);
8225 vect_record_loop_mask (loop_vinfo,
8226 &LOOP_VINFO_MASKS (loop_vinfo),
8227 1, vectype);
8230 return true;
8233 /* If stmt has a related stmt, then use that for getting the lhs. */
8234 if (is_pattern_stmt_p (stmt_info))
8235 stmt = STMT_VINFO_RELATED_STMT (stmt_info);
8237 lhs = (is_a <gphi *> (stmt)) ? gimple_phi_result (stmt)
8238 : gimple_get_lhs (stmt);
8239 lhs_type = TREE_TYPE (lhs);
8241 bitsize = (VECTOR_BOOLEAN_TYPE_P (vectype)
8242 ? bitsize_int (TYPE_PRECISION (TREE_TYPE (vectype)))
8243 : TYPE_SIZE (TREE_TYPE (vectype)));
8244 vec_bitsize = TYPE_SIZE (vectype);
8246 /* Get the vectorized lhs of STMT and the lane to use (counted in bits). */
8247 tree vec_lhs, bitstart;
8248 if (slp_node)
8250 gcc_assert (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo));
8252 /* Get the correct slp vectorized stmt. */
8253 gimple *vec_stmt = SLP_TREE_VEC_STMTS (slp_node)[vec_entry];
8254 if (gphi *phi = dyn_cast <gphi *> (vec_stmt))
8255 vec_lhs = gimple_phi_result (phi);
8256 else
8257 vec_lhs = gimple_get_lhs (vec_stmt);
8259 /* Get entry to use. */
8260 bitstart = bitsize_int (vec_index);
8261 bitstart = int_const_binop (MULT_EXPR, bitsize, bitstart);
8263 else
8265 enum vect_def_type dt = STMT_VINFO_DEF_TYPE (stmt_info);
8266 vec_lhs = vect_get_vec_def_for_operand_1 (stmt, dt);
8267 gcc_checking_assert (ncopies == 1
8268 || !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo));
8270 /* For multiple copies, get the last copy. */
8271 for (int i = 1; i < ncopies; ++i)
8272 vec_lhs = vect_get_vec_def_for_stmt_copy (vect_unknown_def_type,
8273 vec_lhs);
8275 /* Get the last lane in the vector. */
8276 bitstart = int_const_binop (MINUS_EXPR, vec_bitsize, bitsize);
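/* Worked instance (added note, not part of the original source): for a
   vector of four 32-bit elements, vec_bitsize is 128 and bitsize is 32,
   so bitstart becomes 96 and the BIT_FIELD_REF built below extracts bits
   [96, 128), i.e. the last lane.  */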
8279 gimple_seq stmts = NULL;
8280 tree new_tree;
8281 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
8283 /* Emit:
8285 SCALAR_RES = EXTRACT_LAST <VEC_LHS, MASK>
8287 where VEC_LHS is the vectorized live-out result and MASK is
8288 the loop mask for the final iteration. */
8289 gcc_assert (ncopies == 1 && !slp_node);
8290 tree scalar_type = TREE_TYPE (STMT_VINFO_VECTYPE (stmt_info));
8291 tree scalar_res = make_ssa_name (scalar_type);
8292 tree mask = vect_get_loop_mask (gsi, &LOOP_VINFO_MASKS (loop_vinfo),
8293 1, vectype, 0);
8294 gcall *new_stmt = gimple_build_call_internal (IFN_EXTRACT_LAST,
8295 2, mask, vec_lhs);
8296 gimple_call_set_lhs (new_stmt, scalar_res);
8297 gimple_seq_add_stmt (&stmts, new_stmt);
8299 /* Convert the extracted vector element to the required scalar type. */
8300 new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
8302 else
8304 tree bftype = TREE_TYPE (vectype);
8305 if (VECTOR_BOOLEAN_TYPE_P (vectype))
8306 bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1);
8307 new_tree = build3 (BIT_FIELD_REF, bftype, vec_lhs, bitsize, bitstart);
8308 new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree),
8309 &stmts, true, NULL_TREE);
8312 if (stmts)
8313 gsi_insert_seq_on_edge_immediate (single_exit (loop), stmts);
8315 /* Replace use of lhs with newly computed result. If the use stmt is a
8316 single arg PHI, just replace all uses of PHI result. It's necessary
8317 because the lcssa PHI defining lhs may come before the newly inserted stmt. */
8318 use_operand_p use_p;
8319 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
8320 if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt))
8321 && !is_gimple_debug (use_stmt))
8323 if (gimple_code (use_stmt) == GIMPLE_PHI
8324 && gimple_phi_num_args (use_stmt) == 1)
8326 replace_uses_by (gimple_phi_result (use_stmt), new_tree);
8328 else
8330 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
8331 SET_USE (use_p, new_tree);
8333 update_stmt (use_stmt);
8336 return true;
8339 /* Kill any debug uses outside LOOP of SSA names defined in STMT. */
8341 static void
8342 vect_loop_kill_debug_uses (struct loop *loop, gimple *stmt)
8344 ssa_op_iter op_iter;
8345 imm_use_iterator imm_iter;
8346 def_operand_p def_p;
8347 gimple *ustmt;
8349 FOR_EACH_PHI_OR_STMT_DEF (def_p, stmt, op_iter, SSA_OP_DEF)
8351 FOR_EACH_IMM_USE_STMT (ustmt, imm_iter, DEF_FROM_PTR (def_p))
8353 basic_block bb;
8355 if (!is_gimple_debug (ustmt))
8356 continue;
8358 bb = gimple_bb (ustmt);
8360 if (!flow_bb_inside_loop_p (loop, bb))
8362 if (gimple_debug_bind_p (ustmt))
8364 if (dump_enabled_p ())
8365 dump_printf_loc (MSG_NOTE, vect_location,
8366 "killing debug use\n");
8368 gimple_debug_bind_reset_value (ustmt);
8369 update_stmt (ustmt);
8371 else
8372 gcc_unreachable ();
8378 /* Given loop represented by LOOP_VINFO, return true if computation of
8379 LOOP_VINFO_NITERS (= LOOP_VINFO_NITERSM1 + 1) doesn't overflow, false
8380 otherwise. */
8382 static bool
8383 loop_niters_no_overflow (loop_vec_info loop_vinfo)
8385 /* Constant case. */
8386 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
8388 tree cst_niters = LOOP_VINFO_NITERS (loop_vinfo);
8389 tree cst_nitersm1 = LOOP_VINFO_NITERSM1 (loop_vinfo);
8391 gcc_assert (TREE_CODE (cst_niters) == INTEGER_CST);
8392 gcc_assert (TREE_CODE (cst_nitersm1) == INTEGER_CST);
8393 if (wi::to_widest (cst_nitersm1) < wi::to_widest (cst_niters))
8394 return true;
8397 widest_int max;
8398 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8399 /* Check the upper bound of loop niters. */
8400 if (get_max_loop_iterations (loop, &max))
8402 tree type = TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo));
8403 signop sgn = TYPE_SIGN (type);
8404 widest_int type_max = widest_int::from (wi::max_value (type), sgn);
8405 if (max < type_max)
8406 return true;
8408 return false;
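/* Worked instance (added note, not part of the original source): for a
   32-bit unsigned niters type, a constant latch count (NITERSM1) of
   0xffffffff makes NITERS wrap to 0, so the constant check above fails
   (0 is not greater than 0xffffffff); unless the general bound check can
   prove otherwise, the function then reports a possible overflow.  Any
   smaller constant latch count passes the first check.  */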
8411 /* Return a mask type with half the number of elements as TYPE. */
8413 tree
8414 vect_halve_mask_nunits (tree type)
8416 poly_uint64 nunits = exact_div (TYPE_VECTOR_SUBPARTS (type), 2);
8417 return build_truth_vector_type (nunits, current_vector_size);
8420 /* Return a mask type with twice as many elements as TYPE. */
8422 tree
8423 vect_double_mask_nunits (tree type)
8425 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (type) * 2;
8426 return build_truth_vector_type (nunits, current_vector_size);
8429 /* Record that a fully-masked version of LOOP_VINFO would need MASKS to
8430 contain a sequence of NVECTORS masks that each control a vector of type
8431 VECTYPE. */
8433 void
8434 vect_record_loop_mask (loop_vec_info loop_vinfo, vec_loop_masks *masks,
8435 unsigned int nvectors, tree vectype)
8437 gcc_assert (nvectors != 0);
8438 if (masks->length () < nvectors)
8439 masks->safe_grow_cleared (nvectors);
8440 rgroup_masks *rgm = &(*masks)[nvectors - 1];
8441 /* The number of scalars per iteration and the number of vectors are
8442 both compile-time constants. */
8443 unsigned int nscalars_per_iter
8444 = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
8445 LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
8446 if (rgm->max_nscalars_per_iter < nscalars_per_iter)
8448 rgm->max_nscalars_per_iter = nscalars_per_iter;
8449 rgm->mask_type = build_same_sized_truth_vector_type (vectype);
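/* Worked instance (added note, not part of the original source): with a
   vectorization factor of 16, recording 2 masks for a vector type of 8
   elements uses rgroup (*masks)[1] and gives
   nscalars_per_iter = 2 * 8 / 16 = 1; recording 4 such masks uses
   (*masks)[3] and gives 4 * 8 / 16 = 2.  Each rgroup remembers the largest
   nscalars_per_iter it has been asked for.  */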
8453 /* Given a complete set of masks MASKS, extract mask number INDEX
8454 for an rgroup that operates on NVECTORS vectors of type VECTYPE,
8455 where 0 <= INDEX < NVECTORS. Insert any set-up statements before GSI.
8457 See the comment above vec_loop_masks for more details about the mask
8458 arrangement. */
8460 tree
8461 vect_get_loop_mask (gimple_stmt_iterator *gsi, vec_loop_masks *masks,
8462 unsigned int nvectors, tree vectype, unsigned int index)
8464 rgroup_masks *rgm = &(*masks)[nvectors - 1];
8465 tree mask_type = rgm->mask_type;
8467 /* Populate the rgroup's mask array, if this is the first time we've
8468 used it. */
8469 if (rgm->masks.is_empty ())
8471 rgm->masks.safe_grow_cleared (nvectors);
8472 for (unsigned int i = 0; i < nvectors; ++i)
8474 tree mask = make_temp_ssa_name (mask_type, NULL, "loop_mask");
8475 /* Provide a dummy definition until the real one is available. */
8476 SSA_NAME_DEF_STMT (mask) = gimple_build_nop ();
8477 rgm->masks[i] = mask;
8481 tree mask = rgm->masks[index];
8482 if (maybe_ne (TYPE_VECTOR_SUBPARTS (mask_type),
8483 TYPE_VECTOR_SUBPARTS (vectype)))
8485 /* A loop mask for data type X can be reused for data type Y
8486 if X has N times more elements than Y and if Y's elements
8487 are N times bigger than X's. In this case each sequence
8488 of N elements in the loop mask will be all-zero or all-one.
8489 We can then view-convert the mask so that each sequence of
8490 N elements is replaced by a single element. */
8491 gcc_assert (multiple_p (TYPE_VECTOR_SUBPARTS (mask_type),
8492 TYPE_VECTOR_SUBPARTS (vectype)));
8493 gimple_seq seq = NULL;
8494 mask_type = build_same_sized_truth_vector_type (vectype);
8495 mask = gimple_build (&seq, VIEW_CONVERT_EXPR, mask_type, mask);
8496 if (seq)
8497 gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
8499 return mask;
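/* Worked instance (added note, not part of the original source): a mask
   recorded for vectors of 8 x 16-bit elements can be reused for vectors of
   4 x 32-bit elements: every adjacent pair of mask elements is known to be
   all-zero or all-one, so the VIEW_CONVERT_EXPR above turns the 8-element
   mask into an equivalent 4-element one.  */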
8502 /* Scale profiling counters by estimation for LOOP which is vectorized
8503 by factor VF. */
8505 static void
8506 scale_profile_for_vect_loop (struct loop *loop, unsigned vf)
8508 edge preheader = loop_preheader_edge (loop);
8509 /* Reduce loop iterations by the vectorization factor. */
8510 gcov_type new_est_niter = niter_for_unrolled_loop (loop, vf);
8511 profile_count freq_h = loop->header->count, freq_e = preheader->count ();
8513 if (freq_h.nonzero_p ())
8515 profile_probability p;
8517 /* Avoid dropping loop body profile counter to 0 because of zero count
8518 in loop's preheader. */
8519 if (!(freq_e == profile_count::zero ()))
8520 freq_e = freq_e.force_nonzero ();
8521 p = freq_e.apply_scale (new_est_niter + 1, 1).probability_in (freq_h);
8522 scale_loop_frequencies (loop, p);
8525 edge exit_e = single_exit (loop);
8526 exit_e->probability = profile_probability::always ()
8527 .apply_scale (1, new_est_niter + 1);
8529 edge exit_l = single_pred_edge (loop->latch);
8530 profile_probability prob = exit_l->probability;
8531 exit_l->probability = exit_e->probability.invert ();
8532 if (prob.initialized_p () && exit_l->probability.initialized_p ())
8533 scale_bbs_frequencies (&loop->latch, 1, exit_l->probability / prob);
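/* Worked instance (added note, not part of the original source): for a
   loop estimated to iterate about 99 times and vectorized with VF = 4,
   niter_for_unrolled_loop gives roughly 24-25 vector iterations, so the
   exit edge probability becomes about 1 / (new_est_niter + 1), e.g. 1/26,
   and the latch edge gets the inverted probability.  */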
8536 /* Function vect_transform_loop.
8538 The analysis phase has determined that the loop is vectorizable.
8539 Vectorize the loop - create vectorized stmts to replace the scalar
8540 stmts in the loop, and update the loop exit condition.
8541 Returns scalar epilogue loop if any. */
8543 struct loop *
8544 vect_transform_loop (loop_vec_info loop_vinfo)
8546 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8547 struct loop *epilogue = NULL;
8548 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
8549 int nbbs = loop->num_nodes;
8550 int i;
8551 tree niters_vector = NULL_TREE;
8552 tree step_vector = NULL_TREE;
8553 tree niters_vector_mult_vf = NULL_TREE;
8554 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
8555 unsigned int lowest_vf = constant_lower_bound (vf);
8556 bool grouped_store;
8557 bool slp_scheduled = false;
8558 gimple *stmt, *pattern_stmt;
8559 gimple_seq pattern_def_seq = NULL;
8560 gimple_stmt_iterator pattern_def_si = gsi_none ();
8561 bool transform_pattern_stmt = false;
8562 bool check_profitability = false;
8563 unsigned int th;
8565 if (dump_enabled_p ())
8566 dump_printf_loc (MSG_NOTE, vect_location, "=== vec_transform_loop ===\n");
8568 /* Use the more conservative vectorization threshold. If the number
8569 of iterations is constant assume the cost check has been performed
8570 by our caller. If the threshold makes all loops profitable that
8571 run at least the (estimated) vectorization factor number of times
8572 checking is pointless, too. */
8573 th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
8574 if (th >= vect_vf_for_cost (loop_vinfo)
8575 && !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
8577 if (dump_enabled_p ())
8578 dump_printf_loc (MSG_NOTE, vect_location,
8579 "Profitability threshold is %d loop iterations.\n",
8580 th);
8581 check_profitability = true;
8584 /* Make sure there exists a single-predecessor exit bb. Do this before
8585 versioning. */
8586 edge e = single_exit (loop);
8587 if (! single_pred_p (e->dest))
8589 split_loop_exit_edge (e);
8590 if (dump_enabled_p ())
8591 dump_printf (MSG_NOTE, "split exit edge\n");
8594 /* Version the loop first, if required, so the profitability check
8595 comes first. */
8597 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
8599 poly_uint64 versioning_threshold
8600 = LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo);
8601 if (check_profitability
8602 && ordered_p (poly_uint64 (th), versioning_threshold))
8604 versioning_threshold = ordered_max (poly_uint64 (th),
8605 versioning_threshold);
8606 check_profitability = false;
8608 vect_loop_versioning (loop_vinfo, th, check_profitability,
8609 versioning_threshold);
8610 check_profitability = false;
8613 /* Make sure there exists a single-predecessor exit bb also on the
8614 scalar loop copy. Do this after versioning but before peeling
8615 so CFG structure is fine for both scalar and if-converted loop
8616 to make slpeel_duplicate_current_defs_from_edges face matched
8617 loop closed PHI nodes on the exit. */
8618 if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))
8620 e = single_exit (LOOP_VINFO_SCALAR_LOOP (loop_vinfo));
8621 if (! single_pred_p (e->dest))
8623 split_loop_exit_edge (e);
8624 if (dump_enabled_p ())
8625 dump_printf (MSG_NOTE, "split exit edge of scalar loop\n");
8629 tree niters = vect_build_loop_niters (loop_vinfo);
8630 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = niters;
8631 tree nitersm1 = unshare_expr (LOOP_VINFO_NITERSM1 (loop_vinfo));
8632 bool niters_no_overflow = loop_niters_no_overflow (loop_vinfo);
8633 epilogue = vect_do_peeling (loop_vinfo, niters, nitersm1, &niters_vector,
8634 &step_vector, &niters_vector_mult_vf, th,
8635 check_profitability, niters_no_overflow);
8637 if (niters_vector == NULL_TREE)
8639 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
8640 && !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
8641 && known_eq (lowest_vf, vf))
8643 niters_vector
8644 = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
8645 LOOP_VINFO_INT_NITERS (loop_vinfo) / lowest_vf);
8646 step_vector = build_one_cst (TREE_TYPE (niters));
8648 else
8649 vect_gen_vector_loop_niters (loop_vinfo, niters, &niters_vector,
8650 &step_vector, niters_no_overflow);
8653 /* 1) Make sure the loop header has exactly two entries
8654 2) Make sure we have a preheader basic block. */
8656 gcc_assert (EDGE_COUNT (loop->header->preds) == 2);
8658 split_edge (loop_preheader_edge (loop));
8660 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
8661 && vect_use_loop_mask_for_alignment_p (loop_vinfo))
8662 /* This will deal with any possible peeling. */
8663 vect_prepare_for_masked_peels (loop_vinfo);
8665 /* FORNOW: the vectorizer supports only loops whose body consists
8666 of one basic block (header + empty latch). When the vectorizer
8667 supports more involved loop forms, the order in which the BBs are
8668 traversed needs to be reconsidered. */
8670 for (i = 0; i < nbbs; i++)
8672 basic_block bb = bbs[i];
8673 stmt_vec_info stmt_info;
8675 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
8676 gsi_next (&si))
8678 gphi *phi = si.phi ();
8679 if (dump_enabled_p ())
8681 dump_printf_loc (MSG_NOTE, vect_location,
8682 "------>vectorizing phi: ");
8683 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
8685 stmt_info = vinfo_for_stmt (phi);
8686 if (!stmt_info)
8687 continue;
8689 if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
8690 vect_loop_kill_debug_uses (loop, phi);
8692 if (!STMT_VINFO_RELEVANT_P (stmt_info)
8693 && !STMT_VINFO_LIVE_P (stmt_info))
8694 continue;
8696 if (STMT_VINFO_VECTYPE (stmt_info)
8697 && (maybe_ne
8698 (TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info)), vf))
8699 && dump_enabled_p ())
8700 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
8702 if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
8703 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
8704 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
8705 && ! PURE_SLP_STMT (stmt_info))
8707 if (dump_enabled_p ())
8708 dump_printf_loc (MSG_NOTE, vect_location, "transform phi.\n");
8709 vect_transform_stmt (phi, NULL, NULL, NULL, NULL);
8713 pattern_stmt = NULL;
8714 for (gimple_stmt_iterator si = gsi_start_bb (bb);
8715 !gsi_end_p (si) || transform_pattern_stmt;)
8717 bool is_store;
8719 if (transform_pattern_stmt)
8720 stmt = pattern_stmt;
8721 else
8723 stmt = gsi_stmt (si);
8724 /* During vectorization remove existing clobber stmts. */
8725 if (gimple_clobber_p (stmt))
8727 unlink_stmt_vdef (stmt);
8728 gsi_remove (&si, true);
8729 release_defs (stmt);
8730 continue;
8734 if (dump_enabled_p ())
8736 dump_printf_loc (MSG_NOTE, vect_location,
8737 "------>vectorizing statement: ");
8738 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
8741 stmt_info = vinfo_for_stmt (stmt);
8743 /* vector stmts created in the outer-loop during vectorization of
8744 stmts in an inner-loop may not have a stmt_info, and do not
8745 need to be vectorized. */
8746 if (!stmt_info)
8748 gsi_next (&si);
8749 continue;
8752 if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
8753 vect_loop_kill_debug_uses (loop, stmt);
8755 if (!STMT_VINFO_RELEVANT_P (stmt_info)
8756 && !STMT_VINFO_LIVE_P (stmt_info))
8758 if (STMT_VINFO_IN_PATTERN_P (stmt_info)
8759 && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
8760 && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
8761 || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
8763 stmt = pattern_stmt;
8764 stmt_info = vinfo_for_stmt (stmt);
8766 else
8768 gsi_next (&si);
8769 continue;
8772 else if (STMT_VINFO_IN_PATTERN_P (stmt_info)
8773 && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
8774 && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
8775 || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
8776 transform_pattern_stmt = true;
8778 /* If pattern statement has def stmts, vectorize them too. */
8779 if (is_pattern_stmt_p (stmt_info))
8781 if (pattern_def_seq == NULL)
8783 pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
8784 pattern_def_si = gsi_start (pattern_def_seq);
8786 else if (!gsi_end_p (pattern_def_si))
8787 gsi_next (&pattern_def_si);
8788 if (pattern_def_seq != NULL)
8790 gimple *pattern_def_stmt = NULL;
8791 stmt_vec_info pattern_def_stmt_info = NULL;
8793 while (!gsi_end_p (pattern_def_si))
8795 pattern_def_stmt = gsi_stmt (pattern_def_si);
8796 pattern_def_stmt_info
8797 = vinfo_for_stmt (pattern_def_stmt);
8798 if (STMT_VINFO_RELEVANT_P (pattern_def_stmt_info)
8799 || STMT_VINFO_LIVE_P (pattern_def_stmt_info))
8800 break;
8801 gsi_next (&pattern_def_si);
8804 if (!gsi_end_p (pattern_def_si))
8806 if (dump_enabled_p ())
8808 dump_printf_loc (MSG_NOTE, vect_location,
8809 "==> vectorizing pattern def "
8810 "stmt: ");
8811 dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
8812 pattern_def_stmt, 0);
8815 stmt = pattern_def_stmt;
8816 stmt_info = pattern_def_stmt_info;
8818 else
8820 pattern_def_si = gsi_none ();
8821 transform_pattern_stmt = false;
8824 else
8825 transform_pattern_stmt = false;
8828 if (STMT_VINFO_VECTYPE (stmt_info))
8830 poly_uint64 nunits
8831 = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
8832 if (!STMT_SLP_TYPE (stmt_info)
8833 && maybe_ne (nunits, vf)
8834 && dump_enabled_p ())
8835 /* For SLP, VF is set according to the unrolling factor, not to
8836 the vector size, hence this print is not valid for SLP. */
8837 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
8840 /* SLP. Schedule all the SLP instances when the first SLP stmt is
8841 reached. */
8842 if (STMT_SLP_TYPE (stmt_info))
8844 if (!slp_scheduled)
8846 slp_scheduled = true;
8848 if (dump_enabled_p ())
8849 dump_printf_loc (MSG_NOTE, vect_location,
8850 "=== scheduling SLP instances ===\n");
8852 vect_schedule_slp (loop_vinfo);
8855 /* Hybrid SLP stmts must be vectorized in addition to SLP. */
8856 if (!vinfo_for_stmt (stmt) || PURE_SLP_STMT (stmt_info))
8858 if (!transform_pattern_stmt && gsi_end_p (pattern_def_si))
8860 pattern_def_seq = NULL;
8861 gsi_next (&si);
8863 continue;
8867 /* -------- vectorize statement ------------ */
8868 if (dump_enabled_p ())
8869 dump_printf_loc (MSG_NOTE, vect_location, "transform statement.\n");
8871 grouped_store = false;
8872 is_store = vect_transform_stmt (stmt, &si, &grouped_store, NULL, NULL);
8873 if (is_store)
8875 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
8877 /* Interleaving. If IS_STORE is TRUE, the vectorization of the
8878 interleaving chain was completed; free all the stores in
8879 the chain. */
8880 gsi_next (&si);
8881 vect_remove_stores (GROUP_FIRST_ELEMENT (stmt_info));
8883 else
8885 /* Free the attached stmt_vec_info and remove the stmt. */
8886 gimple *store = gsi_stmt (si);
8887 free_stmt_vec_info (store);
8888 unlink_stmt_vdef (store);
8889 gsi_remove (&si, true);
8890 release_defs (store);
8893 /* Stores can only appear at the end of pattern statements. */
8894 gcc_assert (!transform_pattern_stmt);
8895 pattern_def_seq = NULL;
8897 else if (!transform_pattern_stmt && gsi_end_p (pattern_def_si))
8899 pattern_def_seq = NULL;
8900 gsi_next (&si);
8902 } /* stmts in BB */
8904 /* Stub out scalar statements that must not survive vectorization.
8905 Doing this here helps with grouped statements, or statements that
8906 are involved in patterns. */
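/* For example (illustrative GIMPLE), a scalar masked load created by
   if-conversion such as
     _ifc_1 = .MASK_LOAD (&a[i_4], 32B, _mask_7);
   whose result is not a vector is replaced below by
     _ifc_1 = 0;
   so that no scalar IFN_MASK_LOAD call survives the transformation.  */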
8907 for (gimple_stmt_iterator gsi = gsi_start_bb (bb);
8908 !gsi_end_p (gsi); gsi_next (&gsi))
8910 gcall *call = dyn_cast <gcall *> (gsi_stmt (gsi));
8911 if (call && gimple_call_internal_p (call, IFN_MASK_LOAD))
8913 tree lhs = gimple_get_lhs (call);
8914 if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
8916 tree zero = build_zero_cst (TREE_TYPE (lhs));
8917 gimple *new_stmt = gimple_build_assign (lhs, zero);
8918 gsi_replace (&gsi, new_stmt, true);
8922 } /* BBs in loop */
8924 /* The vectorization factor is always > 1, so if we use an IV increment of 1,
8925 a zero NITERS becomes a nonzero NITERS_VECTOR. */
8926 if (integer_onep (step_vector))
8927 niters_no_overflow = true;
8928 vect_set_loop_condition (loop, loop_vinfo, niters_vector, step_vector,
8929 niters_vector_mult_vf, !niters_no_overflow);
8931 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
8932 scale_profile_for_vect_loop (loop, assumed_vf);
8934 /* True if the final iteration might not handle a full vector's
8935 worth of scalar iterations. */
8936 bool final_iter_may_be_partial = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
8937 /* The minimum number of iterations performed by the epilogue. This
8938 is 1 when peeling for gaps because we always need a final scalar
8939 iteration. */
8940 int min_epilogue_iters = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0;
8941 /* +1 to convert latch counts to loop iteration counts,
8942 -min_epilogue_iters to remove iterations that cannot be performed
8943 by the vector code. */
8944 int bias_for_lowest = 1 - min_epilogue_iters;
8945 int bias_for_assumed = bias_for_lowest;
8946 int alignment_npeels = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
8947 if (alignment_npeels && LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
8949 /* When the amount of peeling is known at compile time, the first
8950 iteration will have exactly alignment_npeels active elements.
8951 In the worst case it will have at least one. */
8952 int min_first_active = (alignment_npeels > 0 ? alignment_npeels : 1);
8953 bias_for_lowest += lowest_vf - min_first_active;
8954 bias_for_assumed += assumed_vf - min_first_active;
8956 /* In these calculations the "- 1" converts loop iteration counts
8957 back to latch counts. */
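/* Worked example with illustrative numbers: assume lowest_vf == 4, no
   peeling for gaps and no full masking, so bias_for_lowest == 1.  A
   latch upper bound of 10 means at most 11 scalar iterations; the new
   bound is then udiv_floor (10 + 1, 4) - 1 == 1, i.e. at most two
   vector iterations covering 8 scalar iterations, with the remaining
   (at most 3) iterations left to the epilogue.  */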
8958 if (loop->any_upper_bound)
8959 loop->nb_iterations_upper_bound
8960 = (final_iter_may_be_partial
8961 ? wi::udiv_ceil (loop->nb_iterations_upper_bound + bias_for_lowest,
8962 lowest_vf) - 1
8963 : wi::udiv_floor (loop->nb_iterations_upper_bound + bias_for_lowest,
8964 lowest_vf) - 1);
8965 if (loop->any_likely_upper_bound)
8966 loop->nb_iterations_likely_upper_bound
8967 = (final_iter_may_be_partial
8968 ? wi::udiv_ceil (loop->nb_iterations_likely_upper_bound
8969 + bias_for_lowest, lowest_vf) - 1
8970 : wi::udiv_floor (loop->nb_iterations_likely_upper_bound
8971 + bias_for_lowest, lowest_vf) - 1);
8972 if (loop->any_estimate)
8973 loop->nb_iterations_estimate
8974 = (final_iter_may_be_partial
8975 ? wi::udiv_ceil (loop->nb_iterations_estimate + bias_for_assumed,
8976 assumed_vf) - 1
8977 : wi::udiv_floor (loop->nb_iterations_estimate + bias_for_assumed,
8978 assumed_vf) - 1);
8980 if (dump_enabled_p ())
8982 if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
8984 dump_printf_loc (MSG_NOTE, vect_location,
8985 "LOOP VECTORIZED\n");
8986 if (loop->inner)
8987 dump_printf_loc (MSG_NOTE, vect_location,
8988 "OUTER LOOP VECTORIZED\n");
8989 dump_printf (MSG_NOTE, "\n");
8991 else
8993 dump_printf_loc (MSG_NOTE, vect_location,
8994 "LOOP EPILOGUE VECTORIZED (VS=");
8995 dump_dec (MSG_NOTE, current_vector_size);
8996 dump_printf (MSG_NOTE, ")\n");
9000 /* Free SLP instances here because otherwise stmt reference counting
9001 won't work. */
9002 slp_instance instance;
9003 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
9004 vect_free_slp_instance (instance);
9005 LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
9006 /* Clear the safelen field since its value is invalid after vectorization:
9007 the vectorized loop can have loop-carried dependences. */
9008 loop->safelen = 0;
9010 /* Don't vectorize the epilogue of an epilogue loop. */
9011 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
9012 epilogue = NULL;
9014 if (!PARAM_VALUE (PARAM_VECT_EPILOGUES_NOMASK))
9015 epilogue = NULL;
9017 if (epilogue)
9019 auto_vector_sizes vector_sizes;
9020 targetm.vectorize.autovectorize_vector_sizes (&vector_sizes);
9021 unsigned int next_size = 0;
9023 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
9024 && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) >= 0
9025 && known_eq (vf, lowest_vf))
9027 unsigned int eiters
9028 = (LOOP_VINFO_INT_NITERS (loop_vinfo)
9029 - LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo));
9030 eiters = eiters % lowest_vf;
9031 epilogue->nb_iterations_upper_bound = eiters - 1;
9033 unsigned int ratio;
9034 while (next_size < vector_sizes.length ()
9035 && !(constant_multiple_p (current_vector_size,
9036 vector_sizes[next_size], &ratio)
9037 && eiters >= lowest_vf / ratio))
9038 next_size += 1;
9040 else
9041 while (next_size < vector_sizes.length ()
9042 && maybe_lt (current_vector_size, vector_sizes[next_size]))
9043 next_size += 1;
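/* Illustrative, target-dependent numbers: with NITERS == 103, 3
   iterations peeled for alignment and lowest_vf == 8, the epilogue
   runs (103 - 3) % 8 == 4 iterations at most, so its latch bound is
   set to 3.  If current_vector_size is 32 bytes and the candidate
   list is { 32, 16 } bytes, the 32-byte size is skipped (it would
   need 8 iterations) while the 16-byte size (ratio == 2, needing
   lowest_vf / 2 == 4 iterations) is the first candidate kept for the
   epilogue.  */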
9045 if (next_size == vector_sizes.length ())
9046 epilogue = NULL;
9049 if (epilogue)
9051 epilogue->force_vectorize = loop->force_vectorize;
9052 epilogue->safelen = loop->safelen;
9053 epilogue->dont_vectorize = false;
9055 /* We may need to if-convert the epilogue to vectorize it. */
9056 if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))
9057 tree_if_conversion (epilogue);
9060 return epilogue;
9063 /* The code below performs a simple optimization: it partially reverts
9064 if-conversion for masked stores, i.e. if the mask of a store is all zeros,
9065 skip the store and, where possible, the producers of the stored values too.
9066 For example,
9067 for (i=0; i<n; i++)
9068 if (c[i])
9070 p1[i] += 1;
9071 p2[i] = p3[i] +2;
9073 this transformation will produce the following semi-hammock:
9075 if (!mask__ifc__42.18_165 == { 0, 0, 0, 0, 0, 0, 0, 0 })
9077 vect__11.19_170 = MASK_LOAD (vectp_p1.20_168, 0B, mask__ifc__42.18_165);
9078 vect__12.22_172 = vect__11.19_170 + vect_cst__171;
9079 MASK_STORE (vectp_p1.23_175, 0B, mask__ifc__42.18_165, vect__12.22_172);
9080 vect__18.25_182 = MASK_LOAD (vectp_p3.26_180, 0B, mask__ifc__42.18_165);
9081 vect__19.28_184 = vect__18.25_182 + vect_cst__183;
9082 MASK_STORE (vectp_p2.29_187, 0B, mask__ifc__42.18_165, vect__19.28_184);
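   A sketch of the control flow created for each group of stores that
   share a mask (names refer to the local variables of the function
   below):

     BB:        ends with  if (mask == { 0, ... })
                  true edge (likely)    -> JOIN_BB, the stores are skipped
                  false edge (unlikely) -> STORE_BB
     STORE_BB:  the MASK_STOREs and, where possible, the producers of
                the stored values
     JOIN_BB:   a virtual-operand PHI merging the two paths.  */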
9086 void
9087 optimize_mask_stores (struct loop *loop)
9089 basic_block *bbs = get_loop_body (loop);
9090 unsigned nbbs = loop->num_nodes;
9091 unsigned i;
9092 basic_block bb;
9093 struct loop *bb_loop;
9094 gimple_stmt_iterator gsi;
9095 gimple *stmt;
9096 auto_vec<gimple *> worklist;
9098 vect_location = find_loop_location (loop);
9099 /* Collect all masked stores in the loop, if any. */
9100 for (i = 0; i < nbbs; i++)
9102 bb = bbs[i];
9103 for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
9104 gsi_next (&gsi))
9106 stmt = gsi_stmt (gsi);
9107 if (gimple_call_internal_p (stmt, IFN_MASK_STORE))
9108 worklist.safe_push (stmt);
9112 free (bbs);
9113 if (worklist.is_empty ())
9114 return;
9116 /* Loop has masked stores. */
9117 while (!worklist.is_empty ())
9119 gimple *last, *last_store;
9120 edge e, efalse;
9121 tree mask;
9122 basic_block store_bb, join_bb;
9123 gimple_stmt_iterator gsi_to;
9124 tree vdef, new_vdef;
9125 gphi *phi;
9126 tree vectype;
9127 tree zero;
9129 last = worklist.pop ();
9130 mask = gimple_call_arg (last, 2);
9131 bb = gimple_bb (last);
9132 /* Create STORE_BB and the if-then structure in the CFG; STORE_BB
9133 belongs to the same loop as BB. That loop can differ from LOOP when a
9134 two-level loop nest is vectorized and the masked store belongs to the
9135 inner loop. */
9136 e = split_block (bb, last);
9137 bb_loop = bb->loop_father;
9138 gcc_assert (loop == bb_loop || flow_loop_nested_p (loop, bb_loop));
9139 join_bb = e->dest;
9140 store_bb = create_empty_bb (bb);
9141 add_bb_to_loop (store_bb, bb_loop);
9142 e->flags = EDGE_TRUE_VALUE;
9143 efalse = make_edge (bb, store_bb, EDGE_FALSE_VALUE);
9144 /* Make the edge into STORE_BB unlikely, so the path that skips the stores is the predicted one. */
9145 efalse->probability = profile_probability::unlikely ();
9146 store_bb->count = efalse->count ();
9147 make_single_succ_edge (store_bb, join_bb, EDGE_FALLTHRU);
9148 if (dom_info_available_p (CDI_DOMINATORS))
9149 set_immediate_dominator (CDI_DOMINATORS, store_bb, bb);
9150 if (dump_enabled_p ())
9151 dump_printf_loc (MSG_NOTE, vect_location,
9152 "Create new block %d to sink mask stores.",
9153 store_bb->index);
9154 /* Create vector comparison with boolean result. */
9155 vectype = TREE_TYPE (mask);
9156 zero = build_zero_cst (vectype);
9157 stmt = gimple_build_cond (EQ_EXPR, mask, zero, NULL_TREE, NULL_TREE);
9158 gsi = gsi_last_bb (bb);
9159 gsi_insert_after (&gsi, stmt, GSI_SAME_STMT);
9160 /* Create a new PHI node for the vdef of the last masked store:
9161 .MEM_2 = VDEF <.MEM_1>
9162 is converted to
9163 .MEM_3 = VDEF <.MEM_1>
9164 and a new PHI node is created in JOIN_BB:
9165 .MEM_2 = PHI <.MEM_1, .MEM_3>
The PHI argument for the edge that bypasses STORE_BB is added at the
bottom of the enclosing loop, once all stores sharing this mask have
been sunk. */
9167 vdef = gimple_vdef (last);
9168 new_vdef = make_ssa_name (gimple_vop (cfun), last);
9169 gimple_set_vdef (last, new_vdef);
9170 phi = create_phi_node (vdef, join_bb);
9171 add_phi_arg (phi, new_vdef, EDGE_SUCC (store_bb, 0), UNKNOWN_LOCATION);
9173 /* Put all masked stores with the same mask to STORE_BB if possible. */
9174 while (true)
9176 gimple_stmt_iterator gsi_from;
9177 gimple *stmt1 = NULL;
9179 /* Move masked store to STORE_BB. */
9180 last_store = last;
9181 gsi = gsi_for_stmt (last);
9182 gsi_from = gsi;
9183 /* Shift GSI to the previous stmt for further traversal. */
9184 gsi_prev (&gsi);
9185 gsi_to = gsi_start_bb (store_bb);
9186 gsi_move_before (&gsi_from, &gsi_to);
9187 /* Reset GSI_TO to the start of the now non-empty STORE_BB. */
9188 gsi_to = gsi_start_bb (store_bb);
9189 if (dump_enabled_p ())
9191 dump_printf_loc (MSG_NOTE, vect_location,
9192 "Move stmt to created bb\n");
9193 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, last, 0);
9195 /* Move all stored value producers if possible. */
9196 while (!gsi_end_p (gsi))
9198 tree lhs;
9199 imm_use_iterator imm_iter;
9200 use_operand_p use_p;
9201 bool res;
9203 /* Skip debug statements. */
9204 if (is_gimple_debug (gsi_stmt (gsi)))
9206 gsi_prev (&gsi);
9207 continue;
9209 stmt1 = gsi_stmt (gsi);
9210 /* Do not consider statements that write to memory or have
9211 a volatile operand. */
9212 if (gimple_vdef (stmt1)
9213 || gimple_has_volatile_ops (stmt1))
9214 break;
9215 gsi_from = gsi;
9216 gsi_prev (&gsi);
9217 lhs = gimple_get_lhs (stmt1);
9218 if (!lhs)
9219 break;
9221 /* LHS of vectorized stmt must be SSA_NAME. */
9222 if (TREE_CODE (lhs) != SSA_NAME)
9223 break;
9225 if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
9227 /* Remove dead scalar statement. */
9228 if (has_zero_uses (lhs))
9230 gsi_remove (&gsi_from, true);
9231 continue;
9235 /* Check that LHS does not have uses outside of STORE_BB. */
9236 res = true;
9237 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
9239 gimple *use_stmt;
9240 use_stmt = USE_STMT (use_p);
9241 if (is_gimple_debug (use_stmt))
9242 continue;
9243 if (gimple_bb (use_stmt) != store_bb)
9245 res = false;
9246 break;
9249 if (!res)
9250 break;
9252 if (gimple_vuse (stmt1)
9253 && gimple_vuse (stmt1) != gimple_vuse (last_store))
9254 break;
9256 /* Can move STMT1 to STORE_BB. */
9257 if (dump_enabled_p ())
9259 dump_printf_loc (MSG_NOTE, vect_location,
9260 "Move stmt to created bb\n");
9261 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt1, 0);
9263 gsi_move_before (&gsi_from, &gsi_to);
9264 /* Shift GSI_TO for further insertion. */
9265 gsi_prev (&gsi_to);
9267 /* If the next store in the worklist uses the same mask and is the
statement we just stopped at, sink it into STORE_BB as well. */
9268 if (worklist.is_empty ()
9269 || gimple_call_arg (worklist.last (), 2) != mask
9270 || worklist.last () != stmt1)
9271 break;
9272 last = worklist.pop ();
9274 add_phi_arg (phi, gimple_vuse (last_store), e, UNKNOWN_LOCATION);