contrib/gcc-8.0/gcc/tree-vect-loop.c
1 /* Loop Vectorization
2 Copyright (C) 2003-2018 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com> and
4 Ira Rosen <irar@il.ibm.com>
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "rtl.h"
28 #include "tree.h"
29 #include "gimple.h"
30 #include "cfghooks.h"
31 #include "tree-pass.h"
32 #include "ssa.h"
33 #include "optabs-tree.h"
34 #include "diagnostic-core.h"
35 #include "fold-const.h"
36 #include "stor-layout.h"
37 #include "cfganal.h"
38 #include "gimplify.h"
39 #include "gimple-iterator.h"
40 #include "gimplify-me.h"
41 #include "tree-ssa-loop-ivopts.h"
42 #include "tree-ssa-loop-manip.h"
43 #include "tree-ssa-loop-niter.h"
44 #include "tree-ssa-loop.h"
45 #include "cfgloop.h"
46 #include "params.h"
47 #include "tree-scalar-evolution.h"
48 #include "tree-vectorizer.h"
49 #include "gimple-fold.h"
50 #include "cgraph.h"
51 #include "tree-cfg.h"
52 #include "tree-if-conv.h"
53 #include "internal-fn.h"
54 #include "tree-vector-builder.h"
55 #include "vec-perm-indices.h"
56 #include "tree-eh.h"
58 /* Loop Vectorization Pass.
60 This pass tries to vectorize loops.
62 For example, the vectorizer transforms the following simple loop:
64 short a[N]; short b[N]; short c[N]; int i;
66 for (i=0; i<N; i++){
67 a[i] = b[i] + c[i];
70 as if it had been manually vectorized by rewriting the source code into:
72 typedef int __attribute__((mode(V8HI))) v8hi;
73 short a[N]; short b[N]; short c[N]; int i;
74 v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
75 v8hi va, vb, vc;
77 for (i=0; i<N/8; i++){
78 vb = pb[i];
79 vc = pc[i];
80 va = vb + vc;
81 pa[i] = va;
84 The main entry to this pass is vectorize_loops(), in which
85 the vectorizer applies a set of analyses on a given set of loops,
86 followed by the actual vectorization transformation for the loops that
87 had successfully passed the analysis phase.
88 Throughout this pass we make a distinction between two types of
89 data: scalars (which are represented by SSA_NAMES), and memory references
90 ("data-refs"). These two types of data require different handling both
91 during analysis and transformation. The types of data-refs that the
92 vectorizer currently supports are ARRAY_REFS whose base is an array DECL
93 (not a pointer), and INDIRECT_REFS through pointers; both array and pointer
94 accesses are required to have a simple (consecutive) access pattern.
96 Analysis phase:
97 ===============
98 The driver for the analysis phase is vect_analyze_loop().
99 It applies a set of analyses, some of which rely on the scalar evolution
100 analyzer (scev) developed by Sebastian Pop.
102 During the analysis phase the vectorizer records some information
103 per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
104 loop, as well as general information about the loop as a whole, which is
105 recorded in a "loop_vec_info" struct attached to each loop.
107 Transformation phase:
108 =====================
109 The loop transformation phase scans all the stmts in the loop, and
110 creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
111 the loop that needs to be vectorized. It inserts the vector code sequence
112 just before the scalar stmt S, and records a pointer to the vector code
113 in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
114 attached to S). This pointer will be used for the vectorization of following
115 stmts which use the def of stmt S. Stmt S is removed if it writes to memory;
116 otherwise, we rely on dead code elimination for removing it.
118 For example, say stmt S1 was vectorized into stmt VS1:
120 VS1: vb = px[i];
121 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
122 S2: a = b;
124 To vectorize stmt S2, the vectorizer first finds the stmt that defines
125 the operand 'b' (S1), and gets the relevant vector def 'vb' from the
126 vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)). The
127 resulting sequence would be:
129 VS1: vb = px[i];
130 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
131 VS2: va = vb;
132 S2: a = b; STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2
134 Operands that are not SSA_NAMEs, are data-refs that appear in
135 load/store operations (like 'x[i]' in S1), and are handled differently.
137 Target modeling:
138 =================
139 Currently the only target specific information that is used is the
140 size of the vector (in bytes) - "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".
141 Targets that can support different vector sizes will, for now, need
142 to specify one value for "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD". More
143 flexibility will be added in the future.
145 Since we only vectorize operations whose vector form can be
146 expressed using existing tree codes, to verify that an operation is
147 supported, the vectorizer checks the relevant optab at the relevant
148 machine_mode (e.g, optab_handler (add_optab, V8HImode)). If
149 the value found is CODE_FOR_nothing, then there's no target support, and
150 we can't vectorize the stmt.
152 For additional information on this project see:
153 http://gcc.gnu.org/projects/tree-ssa/vectorization.html  */
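/* Editor's note: as a concrete, compilable illustration of the transformation
   described above (a minimal hand-written sketch using GCC's generic vector
   extensions, not code used by this pass; the names below are hypothetical):

       #define N 1024
       typedef short v8hi __attribute__ ((vector_size (16)));  // 8 shorts

       short a[N], b[N], c[N];

       void
       add_vectorized_by_hand (void)
       {
         v8hi *pa = (v8hi *) a, *pb = (v8hi *) b, *pc = (v8hi *) c;
         for (int i = 0; i < N / 8; i++)
           pa[i] = pb[i] + pc[i];              // one vector add per 8 shorts
       }

   When the iteration count is not a multiple of the VF, the vectorizer also
   emits a scalar epilogue loop for the remaining elements.  */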
156 static void vect_estimate_min_profitable_iters (loop_vec_info, int *, int *);
158 /* Function vect_determine_vectorization_factor
160 Determine the vectorization factor (VF). VF is the number of data elements
161 that are operated upon in parallel in a single iteration of the vectorized
162 loop. For example, when vectorizing a loop that operates on 4-byte elements,
163 on a target with a vector size (VS) of 16 bytes, the VF is set to 4, since 4
164 elements can fit in a single vector register.
166 We currently support vectorization of loops in which all types operated upon
167 are of the same size. Therefore this function currently sets VF according to
168 the size of the types operated upon, and fails if there are multiple sizes
169 in the loop.
171 VF is also the factor by which the loop iterations are strip-mined, e.g.:
172 original loop:
173 for (i=0; i<N; i++){
174 a[i] = b[i] + c[i];
177 vectorized loop:
178 for (i=0; i<N; i+=VF){
179 a[i:VF] = b[i:VF] + c[i:VF];
        }  */
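/* Editor's note (a hedged worked example of the computation below): with a
   16-byte vector size, VF = vector_size / sizeof (element), e.g.

       16 / sizeof (short)  = 8
       16 / sizeof (int)    = 4
       16 / sizeof (double) = 2

   Per the restriction stated above, a loop that mixes element sizes (say,
   ints and doubles) yields conflicting factors and is not handled here.  */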
183 static bool
184 vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
186 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
187 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
188 unsigned nbbs = loop->num_nodes;
189 poly_uint64 vectorization_factor = 1;
190 tree scalar_type = NULL_TREE;
191 gphi *phi;
192 tree vectype;
193 stmt_vec_info stmt_info;
194 unsigned i;
195 HOST_WIDE_INT dummy;
196 gimple *stmt, *pattern_stmt = NULL;
197 gimple_seq pattern_def_seq = NULL;
198 gimple_stmt_iterator pattern_def_si = gsi_none ();
199 bool analyze_pattern_stmt = false;
200 bool bool_result;
201 auto_vec<stmt_vec_info> mask_producers;
203 if (dump_enabled_p ())
204 dump_printf_loc (MSG_NOTE, vect_location,
205 "=== vect_determine_vectorization_factor ===\n");
207 for (i = 0; i < nbbs; i++)
209 basic_block bb = bbs[i];
211 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
212 gsi_next (&si))
214 phi = si.phi ();
215 stmt_info = vinfo_for_stmt (phi);
216 if (dump_enabled_p ())
218 dump_printf_loc (MSG_NOTE, vect_location, "==> examining phi: ");
219 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
222 gcc_assert (stmt_info);
224 if (STMT_VINFO_RELEVANT_P (stmt_info)
225 || STMT_VINFO_LIVE_P (stmt_info))
227 gcc_assert (!STMT_VINFO_VECTYPE (stmt_info));
228 scalar_type = TREE_TYPE (PHI_RESULT (phi));
230 if (dump_enabled_p ())
232 dump_printf_loc (MSG_NOTE, vect_location,
233 "get vectype for scalar type: ");
234 dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
235 dump_printf (MSG_NOTE, "\n");
238 vectype = get_vectype_for_scalar_type (scalar_type);
239 if (!vectype)
241 if (dump_enabled_p ())
243 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
244 "not vectorized: unsupported "
245 "data-type ");
246 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
247 scalar_type);
248 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
250 return false;
252 STMT_VINFO_VECTYPE (stmt_info) = vectype;
254 if (dump_enabled_p ())
256 dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
257 dump_generic_expr (MSG_NOTE, TDF_SLIM, vectype);
258 dump_printf (MSG_NOTE, "\n");
261 if (dump_enabled_p ())
263 dump_printf_loc (MSG_NOTE, vect_location, "nunits = ");
264 dump_dec (MSG_NOTE, TYPE_VECTOR_SUBPARTS (vectype));
265 dump_printf (MSG_NOTE, "\n");
268 vect_update_max_nunits (&vectorization_factor, vectype);
272 for (gimple_stmt_iterator si = gsi_start_bb (bb);
273 !gsi_end_p (si) || analyze_pattern_stmt;)
275 tree vf_vectype;
277 if (analyze_pattern_stmt)
278 stmt = pattern_stmt;
279 else
280 stmt = gsi_stmt (si);
282 stmt_info = vinfo_for_stmt (stmt);
284 if (dump_enabled_p ())
286 dump_printf_loc (MSG_NOTE, vect_location,
287 "==> examining statement: ");
288 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
291 gcc_assert (stmt_info);
293 /* Skip stmts which do not need to be vectorized. */
294 if ((!STMT_VINFO_RELEVANT_P (stmt_info)
295 && !STMT_VINFO_LIVE_P (stmt_info))
296 || gimple_clobber_p (stmt))
298 if (STMT_VINFO_IN_PATTERN_P (stmt_info)
299 && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
300 && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
301 || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
303 stmt = pattern_stmt;
304 stmt_info = vinfo_for_stmt (pattern_stmt);
305 if (dump_enabled_p ())
307 dump_printf_loc (MSG_NOTE, vect_location,
308 "==> examining pattern statement: ");
309 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
312 else
314 if (dump_enabled_p ())
315 dump_printf_loc (MSG_NOTE, vect_location, "skip.\n");
316 gsi_next (&si);
317 continue;
320 else if (STMT_VINFO_IN_PATTERN_P (stmt_info)
321 && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
322 && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
323 || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
324 analyze_pattern_stmt = true;
326 /* If a pattern statement has def stmts, analyze them too. */
327 if (is_pattern_stmt_p (stmt_info))
329 if (pattern_def_seq == NULL)
331 pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
332 pattern_def_si = gsi_start (pattern_def_seq);
334 else if (!gsi_end_p (pattern_def_si))
335 gsi_next (&pattern_def_si);
336 if (pattern_def_seq != NULL)
338 gimple *pattern_def_stmt = NULL;
339 stmt_vec_info pattern_def_stmt_info = NULL;
341 while (!gsi_end_p (pattern_def_si))
343 pattern_def_stmt = gsi_stmt (pattern_def_si);
344 pattern_def_stmt_info
345 = vinfo_for_stmt (pattern_def_stmt);
346 if (STMT_VINFO_RELEVANT_P (pattern_def_stmt_info)
347 || STMT_VINFO_LIVE_P (pattern_def_stmt_info))
348 break;
349 gsi_next (&pattern_def_si);
352 if (!gsi_end_p (pattern_def_si))
354 if (dump_enabled_p ())
356 dump_printf_loc (MSG_NOTE, vect_location,
357 "==> examining pattern def stmt: ");
358 dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
359 pattern_def_stmt, 0);
362 stmt = pattern_def_stmt;
363 stmt_info = pattern_def_stmt_info;
365 else
367 pattern_def_si = gsi_none ();
368 analyze_pattern_stmt = false;
371 else
372 analyze_pattern_stmt = false;
375 if (gimple_get_lhs (stmt) == NULL_TREE
376 /* MASK_STORE has no lhs, but is ok. */
377 && (!is_gimple_call (stmt)
378 || !gimple_call_internal_p (stmt)
379 || gimple_call_internal_fn (stmt) != IFN_MASK_STORE))
381 if (is_gimple_call (stmt))
383 /* Ignore calls with no lhs. These must be calls to
384 #pragma omp simd functions, and what vectorization factor
385 it really needs can't be determined until
386 vectorizable_simd_clone_call. */
387 if (!analyze_pattern_stmt && gsi_end_p (pattern_def_si))
389 pattern_def_seq = NULL;
390 gsi_next (&si);
392 continue;
394 if (dump_enabled_p ())
396 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
397 "not vectorized: irregular stmt.");
398 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt,
401 return false;
404 if (VECTOR_MODE_P (TYPE_MODE (gimple_expr_type (stmt))))
406 if (dump_enabled_p ())
408 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
409 "not vectorized: vector stmt in loop:");
410 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
412 return false;
415 bool_result = false;
417 if (STMT_VINFO_VECTYPE (stmt_info))
419 /* The only case in which a vectype has already been set is for stmts
420 that contain a dataref, or for "pattern-stmts" (stmts
421 generated by the vectorizer to represent/replace a certain
422 idiom). */
423 gcc_assert (STMT_VINFO_DATA_REF (stmt_info)
424 || is_pattern_stmt_p (stmt_info)
425 || !gsi_end_p (pattern_def_si));
426 vectype = STMT_VINFO_VECTYPE (stmt_info);
428 else
430 gcc_assert (!STMT_VINFO_DATA_REF (stmt_info));
431 if (gimple_call_internal_p (stmt, IFN_MASK_STORE))
432 scalar_type = TREE_TYPE (gimple_call_arg (stmt, 3));
433 else
434 scalar_type = TREE_TYPE (gimple_get_lhs (stmt));
436 /* Bool ops don't participate in the vectorization factor
437 computation. For comparisons, use the compared types to
438 compute a factor. */
439 if (VECT_SCALAR_BOOLEAN_TYPE_P (scalar_type)
440 && is_gimple_assign (stmt)
441 && gimple_assign_rhs_code (stmt) != COND_EXPR)
443 if (STMT_VINFO_RELEVANT_P (stmt_info)
444 || STMT_VINFO_LIVE_P (stmt_info))
445 mask_producers.safe_push (stmt_info);
446 bool_result = true;
448 if (TREE_CODE_CLASS (gimple_assign_rhs_code (stmt))
449 == tcc_comparison
450 && !VECT_SCALAR_BOOLEAN_TYPE_P
451 (TREE_TYPE (gimple_assign_rhs1 (stmt))))
452 scalar_type = TREE_TYPE (gimple_assign_rhs1 (stmt));
453 else
455 if (!analyze_pattern_stmt && gsi_end_p (pattern_def_si))
457 pattern_def_seq = NULL;
458 gsi_next (&si);
460 continue;
464 if (dump_enabled_p ())
466 dump_printf_loc (MSG_NOTE, vect_location,
467 "get vectype for scalar type: ");
468 dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
469 dump_printf (MSG_NOTE, "\n");
471 vectype = get_vectype_for_scalar_type (scalar_type);
472 if (!vectype)
474 if (dump_enabled_p ())
476 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
477 "not vectorized: unsupported "
478 "data-type ");
479 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
480 scalar_type);
481 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
483 return false;
486 if (!bool_result)
487 STMT_VINFO_VECTYPE (stmt_info) = vectype;
489 if (dump_enabled_p ())
491 dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
492 dump_generic_expr (MSG_NOTE, TDF_SLIM, vectype);
493 dump_printf (MSG_NOTE, "\n");
497 /* Don't try to compute the VF from scalar types if the stmt
498 produces a boolean vector. Use the result vectype instead. */
499 if (VECTOR_BOOLEAN_TYPE_P (vectype))
500 vf_vectype = vectype;
501 else
503 /* The vectorization factor is according to the smallest
504 scalar type (or the largest vector size, but we only
505 support one vector size per loop). */
506 if (!bool_result)
507 scalar_type = vect_get_smallest_scalar_type (stmt, &dummy,
508 &dummy);
509 if (dump_enabled_p ())
511 dump_printf_loc (MSG_NOTE, vect_location,
512 "get vectype for scalar type: ");
513 dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
514 dump_printf (MSG_NOTE, "\n");
516 vf_vectype = get_vectype_for_scalar_type (scalar_type);
518 if (!vf_vectype)
520 if (dump_enabled_p ())
522 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
523 "not vectorized: unsupported data-type ");
524 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
525 scalar_type);
526 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
528 return false;
531 if (maybe_ne (GET_MODE_SIZE (TYPE_MODE (vectype)),
532 GET_MODE_SIZE (TYPE_MODE (vf_vectype))))
534 if (dump_enabled_p ())
536 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
537 "not vectorized: different sized vector "
538 "types in statement, ");
539 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
540 vectype);
541 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
542 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
543 vf_vectype);
544 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
546 return false;
549 if (dump_enabled_p ())
551 dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
552 dump_generic_expr (MSG_NOTE, TDF_SLIM, vf_vectype);
553 dump_printf (MSG_NOTE, "\n");
556 if (dump_enabled_p ())
558 dump_printf_loc (MSG_NOTE, vect_location, "nunits = ");
559 dump_dec (MSG_NOTE, TYPE_VECTOR_SUBPARTS (vf_vectype));
560 dump_printf (MSG_NOTE, "\n");
563 vect_update_max_nunits (&vectorization_factor, vf_vectype);
565 if (!analyze_pattern_stmt && gsi_end_p (pattern_def_si))
567 pattern_def_seq = NULL;
568 gsi_next (&si);
573 /* TODO: Analyze cost. Decide if worth while to vectorize. */
574 if (dump_enabled_p ())
576 dump_printf_loc (MSG_NOTE, vect_location, "vectorization factor = ");
577 dump_dec (MSG_NOTE, vectorization_factor);
578 dump_printf (MSG_NOTE, "\n");
581 if (known_le (vectorization_factor, 1U))
583 if (dump_enabled_p ())
584 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
585 "not vectorized: unsupported data-type\n");
586 return false;
588 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
590 for (i = 0; i < mask_producers.length (); i++)
592 tree mask_type = NULL;
594 stmt = STMT_VINFO_STMT (mask_producers[i]);
596 if (is_gimple_assign (stmt)
597 && TREE_CODE_CLASS (gimple_assign_rhs_code (stmt)) == tcc_comparison
598 && !VECT_SCALAR_BOOLEAN_TYPE_P
599 (TREE_TYPE (gimple_assign_rhs1 (stmt))))
601 scalar_type = TREE_TYPE (gimple_assign_rhs1 (stmt));
602 mask_type = get_mask_type_for_scalar_type (scalar_type);
604 if (!mask_type)
606 if (dump_enabled_p ())
607 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
608 "not vectorized: unsupported mask\n");
609 return false;
612 else
614 tree rhs;
615 ssa_op_iter iter;
616 gimple *def_stmt;
617 enum vect_def_type dt;
619 FOR_EACH_SSA_TREE_OPERAND (rhs, stmt, iter, SSA_OP_USE)
621 if (!vect_is_simple_use (rhs, mask_producers[i]->vinfo,
622 &def_stmt, &dt, &vectype))
624 if (dump_enabled_p ())
626 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
627 "not vectorized: can't compute mask type "
628 "for statement, ");
629 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt,
632 return false;
635 /* No vectype probably means an external definition.
636 Allow it in case there is another operand from which
637 the mask type can be determined. */
638 if (!vectype)
639 continue;
641 if (!mask_type)
642 mask_type = vectype;
643 else if (maybe_ne (TYPE_VECTOR_SUBPARTS (mask_type),
644 TYPE_VECTOR_SUBPARTS (vectype)))
646 if (dump_enabled_p ())
648 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
649 "not vectorized: different sized masks "
650 "types in statement, ");
651 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
652 mask_type);
653 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
654 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
655 vectype);
656 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
658 return false;
660 else if (VECTOR_BOOLEAN_TYPE_P (mask_type)
661 != VECTOR_BOOLEAN_TYPE_P (vectype))
663 if (dump_enabled_p ())
665 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
666 "not vectorized: mixed mask and "
667 "nonmask vector types in statement, ");
668 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
669 mask_type);
670 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
671 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
672 vectype);
673 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
675 return false;
679 /* We may compare a boolean value loaded as a vector of integers.
680 Fix mask_type in such a case. */
681 if (mask_type
682 && !VECTOR_BOOLEAN_TYPE_P (mask_type)
683 && gimple_code (stmt) == GIMPLE_ASSIGN
684 && TREE_CODE_CLASS (gimple_assign_rhs_code (stmt)) == tcc_comparison)
685 mask_type = build_same_sized_truth_vector_type (mask_type);
688 /* No mask_type should mean loop invariant predicate.
689 This is probably a subject for optimization in
690 if-conversion. */
691 if (!mask_type)
693 if (dump_enabled_p ())
695 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
696 "not vectorized: can't compute mask type "
697 "for statement, ");
698 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt,
701 return false;
704 STMT_VINFO_VECTYPE (mask_producers[i]) = mask_type;
707 return true;
711 /* Function vect_is_simple_iv_evolution.
713 FORNOW: A simple evolution of an induction variable in the loop is
714 considered a polynomial evolution. */
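/* Editor's illustration (a hedged sketch, not taken from the sources): for

       for (i = 0; i < n; i++)
         p = p + 4;

   the scalar evolution of 'p' in the loop is the affine chrec {p_0, +, 4},
   so *INIT is p_0, *STEP is 4, and the evolution is "simple".  A chrec of
   degree >= 2, e.g. {{0, +, 1}, +, 1}, is rejected below.  */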
716 static bool
717 vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init,
718 tree * step)
720 tree init_expr;
721 tree step_expr;
722 tree evolution_part = evolution_part_in_loop_num (access_fn, loop_nb);
723 basic_block bb;
725 /* When there is no evolution in this loop, the evolution function
726 is not "simple". */
727 if (evolution_part == NULL_TREE)
728 return false;
730 /* When the evolution is a polynomial of degree >= 2
731 the evolution function is not "simple". */
732 if (tree_is_chrec (evolution_part))
733 return false;
735 step_expr = evolution_part;
736 init_expr = unshare_expr (initial_condition_in_loop_num (access_fn, loop_nb));
738 if (dump_enabled_p ())
740 dump_printf_loc (MSG_NOTE, vect_location, "step: ");
741 dump_generic_expr (MSG_NOTE, TDF_SLIM, step_expr);
742 dump_printf (MSG_NOTE, ", init: ");
743 dump_generic_expr (MSG_NOTE, TDF_SLIM, init_expr);
744 dump_printf (MSG_NOTE, "\n");
747 *init = init_expr;
748 *step = step_expr;
750 if (TREE_CODE (step_expr) != INTEGER_CST
751 && (TREE_CODE (step_expr) != SSA_NAME
752 || ((bb = gimple_bb (SSA_NAME_DEF_STMT (step_expr)))
753 && flow_bb_inside_loop_p (get_loop (cfun, loop_nb), bb))
754 || (!INTEGRAL_TYPE_P (TREE_TYPE (step_expr))
755 && (!SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr))
756 || !flag_associative_math)))
757 && (TREE_CODE (step_expr) != REAL_CST
758 || !flag_associative_math))
760 if (dump_enabled_p ())
761 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
762 "step unknown.\n");
763 return false;
766 return true;
769 /* Function vect_analyze_scalar_cycles_1.
771 Examine the cross iteration def-use cycles of scalar variables
772 in LOOP. LOOP_VINFO represents the loop that is now being
773 considered for vectorization (can be LOOP, or an outer-loop
774 enclosing LOOP). */
776 static void
777 vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, struct loop *loop)
779 basic_block bb = loop->header;
780 tree init, step;
781 auto_vec<gimple *, 64> worklist;
782 gphi_iterator gsi;
783 bool double_reduc;
785 if (dump_enabled_p ())
786 dump_printf_loc (MSG_NOTE, vect_location,
787 "=== vect_analyze_scalar_cycles ===\n");
789 /* First - identify all inductions. Reduction detection assumes that all the
790 inductions have been identified, therefore, this order must not be
791 changed. */
792 for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
794 gphi *phi = gsi.phi ();
795 tree access_fn = NULL;
796 tree def = PHI_RESULT (phi);
797 stmt_vec_info stmt_vinfo = vinfo_for_stmt (phi);
799 if (dump_enabled_p ())
801 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: ");
802 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
805 /* Skip virtual phi's. The data dependences that are associated with
806 virtual defs/uses (i.e., memory accesses) are analyzed elsewhere. */
807 if (virtual_operand_p (def))
808 continue;
810 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_unknown_def_type;
812 /* Analyze the evolution function. */
813 access_fn = analyze_scalar_evolution (loop, def);
814 if (access_fn)
816 STRIP_NOPS (access_fn);
817 if (dump_enabled_p ())
819 dump_printf_loc (MSG_NOTE, vect_location,
820 "Access function of PHI: ");
821 dump_generic_expr (MSG_NOTE, TDF_SLIM, access_fn);
822 dump_printf (MSG_NOTE, "\n");
824 STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
825 = initial_condition_in_loop_num (access_fn, loop->num);
826 STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo)
827 = evolution_part_in_loop_num (access_fn, loop->num);
830 if (!access_fn
831 || !vect_is_simple_iv_evolution (loop->num, access_fn, &init, &step)
832 || (LOOP_VINFO_LOOP (loop_vinfo) != loop
833 && TREE_CODE (step) != INTEGER_CST))
835 worklist.safe_push (phi);
836 continue;
839 gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
840 != NULL_TREE);
841 gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo) != NULL_TREE);
843 if (dump_enabled_p ())
844 dump_printf_loc (MSG_NOTE, vect_location, "Detected induction.\n");
845 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_induction_def;
849 /* Second - identify all reductions and nested cycles. */
850 while (worklist.length () > 0)
852 gimple *phi = worklist.pop ();
853 tree def = PHI_RESULT (phi);
854 stmt_vec_info stmt_vinfo = vinfo_for_stmt (phi);
855 gimple *reduc_stmt;
857 if (dump_enabled_p ())
859 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: ");
860 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
863 gcc_assert (!virtual_operand_p (def)
864 && STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_unknown_def_type);
866 reduc_stmt = vect_force_simple_reduction (loop_vinfo, phi,
867 &double_reduc, false);
868 if (reduc_stmt)
870 if (double_reduc)
872 if (dump_enabled_p ())
873 dump_printf_loc (MSG_NOTE, vect_location,
874 "Detected double reduction.\n");
876 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_double_reduction_def;
877 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
878 vect_double_reduction_def;
880 else
882 if (loop != LOOP_VINFO_LOOP (loop_vinfo))
884 if (dump_enabled_p ())
885 dump_printf_loc (MSG_NOTE, vect_location,
886 "Detected vectorizable nested cycle.\n");
888 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_nested_cycle;
889 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
890 vect_nested_cycle;
892 else
894 if (dump_enabled_p ())
895 dump_printf_loc (MSG_NOTE, vect_location,
896 "Detected reduction.\n");
898 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def;
899 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
900 vect_reduction_def;
901 /* Store the reduction cycles for possible vectorization in
902 loop-aware SLP if it was not detected as reduction
903 chain. */
904 if (! GROUP_FIRST_ELEMENT (vinfo_for_stmt (reduc_stmt)))
905 LOOP_VINFO_REDUCTIONS (loop_vinfo).safe_push (reduc_stmt);
909 else
910 if (dump_enabled_p ())
911 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
912 "Unknown def-use cycle pattern.\n");
917 /* Function vect_analyze_scalar_cycles.
919 Examine the cross iteration def-use cycles of scalar variables, by
920 analyzing the loop-header PHIs of scalar variables. Classify each
921 cycle as one of the following: invariant, induction, reduction, unknown.
922 We do that for the loop represented by LOOP_VINFO, and also for its
923 inner-loop, if it exists.
924 Examples for scalar cycles:
926 Example1: reduction:
928 loop1:
929 for (i=0; i<N; i++)
930 sum += a[i];
932 Example2: induction:
934 loop2:
935 for (i=0; i<N; i++)
936 a[i] = i; */
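/* Editor's addition, in the spirit of the examples above (hedged): a cycle
   that is neither a recognized induction nor a supported reduction, e.g.

       loop3:
       for (i=0; i<N; i++)
         x = x / a[i];

   is expected to be classified as "unknown" by the analysis below, which in
   turn blocks vectorization of statements that participate in it.  */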
938 static void
939 vect_analyze_scalar_cycles (loop_vec_info loop_vinfo)
941 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
943 vect_analyze_scalar_cycles_1 (loop_vinfo, loop);
945 /* When vectorizing an outer-loop, the inner-loop is executed sequentially.
946 Reductions in such inner-loop therefore have different properties than
947 the reductions in the nest that gets vectorized:
948 1. When vectorized, they are executed in the same order as in the original
949 scalar loop, so we can't change the order of computation when
950 vectorizing them.
951 2. FIXME: Inner-loop reductions can be used in the inner-loop, so the
952 current checks are too strict. */
954 if (loop->inner)
955 vect_analyze_scalar_cycles_1 (loop_vinfo, loop->inner);
958 /* Transfer group and reduction information from STMT to its pattern stmt. */
960 static void
961 vect_fixup_reduc_chain (gimple *stmt)
963 gimple *firstp = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt));
964 gimple *stmtp;
965 gcc_assert (!GROUP_FIRST_ELEMENT (vinfo_for_stmt (firstp))
966 && GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)));
967 GROUP_SIZE (vinfo_for_stmt (firstp)) = GROUP_SIZE (vinfo_for_stmt (stmt));
970 stmtp = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt));
971 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmtp)) = firstp;
972 stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmt));
973 if (stmt)
974 GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmtp))
975 = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt));
977 while (stmt);
978 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmtp)) = vect_reduction_def;
981 /* Fixup scalar cycles that now have their stmts detected as patterns. */
983 static void
984 vect_fixup_scalar_cycles_with_patterns (loop_vec_info loop_vinfo)
986 gimple *first;
987 unsigned i;
989 FOR_EACH_VEC_ELT (LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo), i, first)
990 if (STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (first)))
992 gimple *next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (first));
993 while (next)
995 if (! STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (next)))
996 break;
997 next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
999 /* If not all stmts in the chain are patterns, try to handle
1000 the chain without patterns. */
1001 if (! next)
1003 vect_fixup_reduc_chain (first);
1004 LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo)[i]
1005 = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (first));
1010 /* Function vect_get_loop_niters.
1012 Determine how many iterations the loop is executed and place it
1013 in NUMBER_OF_ITERATIONS. Place the number of latch iterations
1014 in NUMBER_OF_ITERATIONSM1. Place the condition under which the
1015 niter information holds in ASSUMPTIONS.
1017 Return the loop exit condition. */
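/* Editor's note (a worked example of the values computed here): for a loop

       for (i = 0; i < n; i++)   // with n known to be positive
         ...;

   the latch runs n - 1 times, so NUMBER_OF_ITERATIONSM1 is n - 1 and
   NUMBER_OF_ITERATIONS (the number of header executions) is n.  ASSUMPTIONS
   receives the condition, if any, under which this niter analysis holds.  */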
1020 static gcond *
1021 vect_get_loop_niters (struct loop *loop, tree *assumptions,
1022 tree *number_of_iterations, tree *number_of_iterationsm1)
1024 edge exit = single_exit (loop);
1025 struct tree_niter_desc niter_desc;
1026 tree niter_assumptions, niter, may_be_zero;
1027 gcond *cond = get_loop_exit_condition (loop);
1029 *assumptions = boolean_true_node;
1030 *number_of_iterationsm1 = chrec_dont_know;
1031 *number_of_iterations = chrec_dont_know;
1032 if (dump_enabled_p ())
1033 dump_printf_loc (MSG_NOTE, vect_location,
1034 "=== get_loop_niters ===\n");
1036 if (!exit)
1037 return cond;
1039 niter = chrec_dont_know;
1040 may_be_zero = NULL_TREE;
1041 niter_assumptions = boolean_true_node;
1042 if (!number_of_iterations_exit_assumptions (loop, exit, &niter_desc, NULL)
1043 || chrec_contains_undetermined (niter_desc.niter))
1044 return cond;
1046 niter_assumptions = niter_desc.assumptions;
1047 may_be_zero = niter_desc.may_be_zero;
1048 niter = niter_desc.niter;
1050 if (may_be_zero && integer_zerop (may_be_zero))
1051 may_be_zero = NULL_TREE;
1053 if (may_be_zero)
1055 if (COMPARISON_CLASS_P (may_be_zero))
1057 /* Try to combine may_be_zero with assumptions, this can simplify
1058 computation of niter expression. */
1059 if (niter_assumptions && !integer_nonzerop (niter_assumptions))
1060 niter_assumptions = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
1061 niter_assumptions,
1062 fold_build1 (TRUTH_NOT_EXPR,
1063 boolean_type_node,
1064 may_be_zero));
1065 else
1066 niter = fold_build3 (COND_EXPR, TREE_TYPE (niter), may_be_zero,
1067 build_int_cst (TREE_TYPE (niter), 0),
1068 rewrite_to_non_trapping_overflow (niter));
1070 may_be_zero = NULL_TREE;
1072 else if (integer_nonzerop (may_be_zero))
1074 *number_of_iterationsm1 = build_int_cst (TREE_TYPE (niter), 0);
1075 *number_of_iterations = build_int_cst (TREE_TYPE (niter), 1);
1076 return cond;
1078 else
1079 return cond;
1082 *assumptions = niter_assumptions;
1083 *number_of_iterationsm1 = niter;
1085 /* We want the number of loop header executions which is the number
1086 of latch executions plus one.
1087 ??? For UINT_MAX latch executions this number overflows to zero
1088 for loops like do { n++; } while (n != 0); */
1089 if (niter && !chrec_contains_undetermined (niter))
1090 niter = fold_build2 (PLUS_EXPR, TREE_TYPE (niter), unshare_expr (niter),
1091 build_int_cst (TREE_TYPE (niter), 1));
1092 *number_of_iterations = niter;
1094 return cond;
1097 /* Function bb_in_loop_p
1099 Used as predicate for dfs order traversal of the loop bbs. */
1101 static bool
1102 bb_in_loop_p (const_basic_block bb, const void *data)
1104 const struct loop *const loop = (const struct loop *)data;
1105 if (flow_bb_inside_loop_p (loop, bb))
1106 return true;
1107 return false;
1111 /* Create and initialize a new loop_vec_info struct for LOOP_IN, as well as
1112 stmt_vec_info structs for all the stmts in LOOP_IN. */
1114 _loop_vec_info::_loop_vec_info (struct loop *loop_in)
1115 : vec_info (vec_info::loop, init_cost (loop_in)),
1116 loop (loop_in),
1117 bbs (XCNEWVEC (basic_block, loop->num_nodes)),
1118 num_itersm1 (NULL_TREE),
1119 num_iters (NULL_TREE),
1120 num_iters_unchanged (NULL_TREE),
1121 num_iters_assumptions (NULL_TREE),
1122 th (0),
1123 versioning_threshold (0),
1124 vectorization_factor (0),
1125 max_vectorization_factor (0),
1126 mask_skip_niters (NULL_TREE),
1127 mask_compare_type (NULL_TREE),
1128 unaligned_dr (NULL),
1129 peeling_for_alignment (0),
1130 ptr_mask (0),
1131 ivexpr_map (NULL),
1132 slp_unrolling_factor (1),
1133 single_scalar_iteration_cost (0),
1134 vectorizable (false),
1135 can_fully_mask_p (true),
1136 fully_masked_p (false),
1137 peeling_for_gaps (false),
1138 peeling_for_niter (false),
1139 operands_swapped (false),
1140 no_data_dependencies (false),
1141 has_mask_store (false),
1142 scalar_loop (NULL),
1143 orig_loop_info (NULL)
1145 /* Create/Update stmt_info for all stmts in the loop. */
1146 basic_block *body = get_loop_body (loop);
1147 for (unsigned int i = 0; i < loop->num_nodes; i++)
1149 basic_block bb = body[i];
1150 gimple_stmt_iterator si;
1152 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
1154 gimple *phi = gsi_stmt (si);
1155 gimple_set_uid (phi, 0);
1156 set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, this));
1159 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1161 gimple *stmt = gsi_stmt (si);
1162 gimple_set_uid (stmt, 0);
1163 set_vinfo_for_stmt (stmt, new_stmt_vec_info (stmt, this));
1166 free (body);
1168 /* CHECKME: We want to visit all BBs before their successors (except for
1169 latch blocks, for which this assertion wouldn't hold). In the simple
1170 case of the loop forms we allow, a dfs order of the BBs would be the same
1171 as reversed postorder traversal, so we are safe. */
1173 unsigned int nbbs = dfs_enumerate_from (loop->header, 0, bb_in_loop_p,
1174 bbs, loop->num_nodes, loop);
1175 gcc_assert (nbbs == loop->num_nodes);
1178 /* Free all levels of MASKS. */
1180 void
1181 release_vec_loop_masks (vec_loop_masks *masks)
1183 rgroup_masks *rgm;
1184 unsigned int i;
1185 FOR_EACH_VEC_ELT (*masks, i, rgm)
1186 rgm->masks.release ();
1187 masks->release ();
1190 /* Free all memory used by the _loop_vec_info, as well as all the
1191 stmt_vec_info structs of all the stmts in the loop. */
1193 _loop_vec_info::~_loop_vec_info ()
1195 int nbbs;
1196 gimple_stmt_iterator si;
1197 int j;
1199 nbbs = loop->num_nodes;
1200 for (j = 0; j < nbbs; j++)
1202 basic_block bb = bbs[j];
1203 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
1204 free_stmt_vec_info (gsi_stmt (si));
1206 for (si = gsi_start_bb (bb); !gsi_end_p (si); )
1208 gimple *stmt = gsi_stmt (si);
1210 /* We may have broken canonical form by moving a constant
1211 into RHS1 of a commutative op. Fix such occurrences. */
1212 if (operands_swapped && is_gimple_assign (stmt))
1214 enum tree_code code = gimple_assign_rhs_code (stmt);
1216 if ((code == PLUS_EXPR
1217 || code == POINTER_PLUS_EXPR
1218 || code == MULT_EXPR)
1219 && CONSTANT_CLASS_P (gimple_assign_rhs1 (stmt)))
1220 swap_ssa_operands (stmt,
1221 gimple_assign_rhs1_ptr (stmt),
1222 gimple_assign_rhs2_ptr (stmt));
1223 else if (code == COND_EXPR
1224 && CONSTANT_CLASS_P (gimple_assign_rhs2 (stmt)))
1226 tree cond_expr = gimple_assign_rhs1 (stmt);
1227 enum tree_code cond_code = TREE_CODE (cond_expr);
1229 if (TREE_CODE_CLASS (cond_code) == tcc_comparison)
1231 bool honor_nans = HONOR_NANS (TREE_OPERAND (cond_expr,
1232 0));
1233 cond_code = invert_tree_comparison (cond_code,
1234 honor_nans);
1235 if (cond_code != ERROR_MARK)
1237 TREE_SET_CODE (cond_expr, cond_code);
1238 swap_ssa_operands (stmt,
1239 gimple_assign_rhs2_ptr (stmt),
1240 gimple_assign_rhs3_ptr (stmt));
1246 /* Free stmt_vec_info. */
1247 free_stmt_vec_info (stmt);
1248 gsi_next (&si);
1252 free (bbs);
1254 release_vec_loop_masks (&masks);
1255 delete ivexpr_map;
1257 loop->aux = NULL;
1260 /* Return an invariant or register for EXPR and emit necessary
1261 computations in the LOOP_VINFO loop preheader. */
1263 tree
1264 cse_and_gimplify_to_preheader (loop_vec_info loop_vinfo, tree expr)
1266 if (is_gimple_reg (expr)
1267 || is_gimple_min_invariant (expr))
1268 return expr;
1270 if (! loop_vinfo->ivexpr_map)
1271 loop_vinfo->ivexpr_map = new hash_map<tree_operand_hash, tree>;
1272 tree &cached = loop_vinfo->ivexpr_map->get_or_insert (expr);
1273 if (! cached)
1275 gimple_seq stmts = NULL;
1276 cached = force_gimple_operand (unshare_expr (expr),
1277 &stmts, true, NULL_TREE);
1278 if (stmts)
1280 edge e = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
1281 gsi_insert_seq_on_edge_immediate (e, stmts);
1284 return cached;
1287 /* Return true if we can use CMP_TYPE as the comparison type to produce
1288 all masks required to mask LOOP_VINFO. */
1290 static bool
1291 can_produce_all_loop_masks_p (loop_vec_info loop_vinfo, tree cmp_type)
1293 rgroup_masks *rgm;
1294 unsigned int i;
1295 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), i, rgm)
1296 if (rgm->mask_type != NULL_TREE
1297 && !direct_internal_fn_supported_p (IFN_WHILE_ULT,
1298 cmp_type, rgm->mask_type,
1299 OPTIMIZE_FOR_SPEED))
1300 return false;
1301 return true;
1304 /* Calculate the maximum number of scalars per iteration for every
1305 rgroup in LOOP_VINFO. */
1307 static unsigned int
1308 vect_get_max_nscalars_per_iter (loop_vec_info loop_vinfo)
1310 unsigned int res = 1;
1311 unsigned int i;
1312 rgroup_masks *rgm;
1313 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), i, rgm)
1314 res = MAX (res, rgm->max_nscalars_per_iter);
1315 return res;
1318 /* Each statement in LOOP_VINFO can be masked where necessary. Check
1319 whether we can actually generate the masks required. Return true if so,
1320 storing the type of the scalar IV in LOOP_VINFO_MASK_COMPARE_TYPE. */
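/* Editor's sketch (conceptual only, assuming a target that provides
   IFN_WHILE_ULT, such as SVE): a fully-masked loop has the shape

       for (i = 0; i < n; i += VF)
         {
           mask = WHILE_ULT (i, n);   // lane j is active iff i + j < n
           ... loads, computation and stores predicated on 'mask' ...
         }

   The comparison type chosen below is the scalar type of 'i' and 'n' in that
   WHILE_ULT; it must be wide enough for the maximum iteration count.  */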
1322 static bool
1323 vect_verify_full_masking (loop_vec_info loop_vinfo)
1325 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1326 unsigned int min_ni_width;
1328 /* Use a normal loop if there are no statements that need masking.
1329 This only happens in rare degenerate cases: it means that the loop
1330 has no loads, no stores, and no live-out values. */
1331 if (LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
1332 return false;
1334 /* Get the maximum number of iterations that is representable
1335 in the counter type. */
1336 tree ni_type = TREE_TYPE (LOOP_VINFO_NITERSM1 (loop_vinfo));
1337 widest_int max_ni = wi::to_widest (TYPE_MAX_VALUE (ni_type)) + 1;
1339 /* Get a more refined estimate for the number of iterations. */
1340 widest_int max_back_edges;
1341 if (max_loop_iterations (loop, &max_back_edges))
1342 max_ni = wi::smin (max_ni, max_back_edges + 1);
1344 /* Account for rgroup masks, in which each bit is replicated N times. */
1345 max_ni *= vect_get_max_nscalars_per_iter (loop_vinfo);
1347 /* Work out how many bits we need to represent the limit. */
1348 min_ni_width = wi::min_precision (max_ni, UNSIGNED);
1350 /* Find a scalar mode for which WHILE_ULT is supported. */
1351 opt_scalar_int_mode cmp_mode_iter;
1352 tree cmp_type = NULL_TREE;
1353 FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)
1355 unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ());
1356 if (cmp_bits >= min_ni_width
1357 && targetm.scalar_mode_supported_p (cmp_mode_iter.require ()))
1359 tree this_type = build_nonstandard_integer_type (cmp_bits, true);
1360 if (this_type
1361 && can_produce_all_loop_masks_p (loop_vinfo, this_type))
1363 /* Although we could stop as soon as we find a valid mode,
1364 it's often better to continue until we hit Pmode, since the
1365 operands to the WHILE are more likely to be reusable in
1366 address calculations. */
1367 cmp_type = this_type;
1368 if (cmp_bits >= GET_MODE_BITSIZE (Pmode))
1369 break;
1374 if (!cmp_type)
1375 return false;
1377 LOOP_VINFO_MASK_COMPARE_TYPE (loop_vinfo) = cmp_type;
1378 return true;
1381 /* Calculate the cost of one scalar iteration of the loop. */
1382 static void
1383 vect_compute_single_scalar_iteration_cost (loop_vec_info loop_vinfo)
1385 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1386 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1387 int nbbs = loop->num_nodes, factor;
1388 int innerloop_iters, i;
1390 /* Gather costs for statements in the scalar loop. */
1392 /* FORNOW. */
1393 innerloop_iters = 1;
1394 if (loop->inner)
1395 innerloop_iters = 50; /* FIXME */
1397 for (i = 0; i < nbbs; i++)
1399 gimple_stmt_iterator si;
1400 basic_block bb = bbs[i];
1402 if (bb->loop_father == loop->inner)
1403 factor = innerloop_iters;
1404 else
1405 factor = 1;
1407 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1409 gimple *stmt = gsi_stmt (si);
1410 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1412 if (!is_gimple_assign (stmt) && !is_gimple_call (stmt))
1413 continue;
1415 /* Skip stmts that are not vectorized inside the loop. */
1416 if (stmt_info
1417 && !STMT_VINFO_RELEVANT_P (stmt_info)
1418 && (!STMT_VINFO_LIVE_P (stmt_info)
1419 || !VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1420 && !STMT_VINFO_IN_PATTERN_P (stmt_info))
1421 continue;
1423 vect_cost_for_stmt kind;
1424 if (STMT_VINFO_DATA_REF (stmt_info))
1426 if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
1427 kind = scalar_load;
1428 else
1429 kind = scalar_store;
1431 else
1432 kind = scalar_stmt;
1434 record_stmt_cost (&LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1435 factor, kind, stmt_info, 0, vect_prologue);
1439 /* Now accumulate cost. */
1440 void *target_cost_data = init_cost (loop);
1441 stmt_info_for_cost *si;
1442 int j;
1443 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1444 j, si)
1446 struct _stmt_vec_info *stmt_info
1447 = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
1448 (void) add_stmt_cost (target_cost_data, si->count,
1449 si->kind, stmt_info, si->misalign,
1450 vect_body);
1452 unsigned dummy, body_cost = 0;
1453 finish_cost (target_cost_data, &dummy, &body_cost, &dummy);
1454 destroy_cost_data (target_cost_data);
1455 LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo) = body_cost;
1459 /* Function vect_analyze_loop_form_1.
1461 Verify that certain CFG restrictions hold, including:
1462 - the loop has a pre-header
1463 - the loop has a single entry and exit
1464 - the loop exit condition is simple enough
1465 - the number of iterations can be analyzed, i.e., a countable loop. The
1466 niter could be analyzed under some assumptions. */
1468 bool
1469 vect_analyze_loop_form_1 (struct loop *loop, gcond **loop_cond,
1470 tree *assumptions, tree *number_of_iterationsm1,
1471 tree *number_of_iterations, gcond **inner_loop_cond)
1473 if (dump_enabled_p ())
1474 dump_printf_loc (MSG_NOTE, vect_location,
1475 "=== vect_analyze_loop_form ===\n");
1477 /* Different restrictions apply when we are considering an inner-most loop,
1478 vs. an outer (nested) loop.
1479 (FORNOW. May want to relax some of these restrictions in the future). */
1481 if (!loop->inner)
1483 /* Inner-most loop. We currently require that the number of BBs is
1484 exactly 2 (the header and latch). Vectorizable inner-most loops
1485 look like this:
1487 (pre-header)
1489 header <--------+
1490 | | |
1491 | +--> latch --+
1493 (exit-bb) */
1495 if (loop->num_nodes != 2)
1497 if (dump_enabled_p ())
1498 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1499 "not vectorized: control flow in loop.\n");
1500 return false;
1503 if (empty_block_p (loop->header))
1505 if (dump_enabled_p ())
1506 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1507 "not vectorized: empty loop.\n");
1508 return false;
1511 else
1513 struct loop *innerloop = loop->inner;
1514 edge entryedge;
1516 /* Nested loop. We currently require that the loop is doubly-nested,
1517 contains a single inner loop, and the number of BBs is exactly 5.
1518 Vectorizable outer-loops look like this:
1520 (pre-header)
1522 header <---+
1524 inner-loop |
1526 tail ------+
1528 (exit-bb)
1530 The inner-loop has the properties expected of inner-most loops
1531 as described above. */
1533 if ((loop->inner)->inner || (loop->inner)->next)
1535 if (dump_enabled_p ())
1536 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1537 "not vectorized: multiple nested loops.\n");
1538 return false;
1541 if (loop->num_nodes != 5)
1543 if (dump_enabled_p ())
1544 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1545 "not vectorized: control flow in loop.\n");
1546 return false;
1549 entryedge = loop_preheader_edge (innerloop);
1550 if (entryedge->src != loop->header
1551 || !single_exit (innerloop)
1552 || single_exit (innerloop)->dest != EDGE_PRED (loop->latch, 0)->src)
1554 if (dump_enabled_p ())
1555 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1556 "not vectorized: unsupported outerloop form.\n");
1557 return false;
1560 /* Analyze the inner-loop. */
1561 tree inner_niterm1, inner_niter, inner_assumptions;
1562 if (! vect_analyze_loop_form_1 (loop->inner, inner_loop_cond,
1563 &inner_assumptions, &inner_niterm1,
1564 &inner_niter, NULL)
1565 /* Don't support analyzing niter under assumptions for inner
1566 loop. */
1567 || !integer_onep (inner_assumptions))
1569 if (dump_enabled_p ())
1570 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1571 "not vectorized: Bad inner loop.\n");
1572 return false;
1575 if (!expr_invariant_in_loop_p (loop, inner_niter))
1577 if (dump_enabled_p ())
1578 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1579 "not vectorized: inner-loop count not"
1580 " invariant.\n");
1581 return false;
1584 if (dump_enabled_p ())
1585 dump_printf_loc (MSG_NOTE, vect_location,
1586 "Considering outer-loop vectorization.\n");
1589 if (!single_exit (loop)
1590 || EDGE_COUNT (loop->header->preds) != 2)
1592 if (dump_enabled_p ())
1594 if (!single_exit (loop))
1595 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1596 "not vectorized: multiple exits.\n");
1597 else if (EDGE_COUNT (loop->header->preds) != 2)
1598 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1599 "not vectorized: too many incoming edges.\n");
1601 return false;
1604 /* We assume that the loop exit condition is at the end of the loop, i.e.,
1605 that the loop is represented as a do-while (with a proper if-guard
1606 before the loop if needed), where the loop header contains all the
1607 executable statements, and the latch is empty. */
1608 if (!empty_block_p (loop->latch)
1609 || !gimple_seq_empty_p (phi_nodes (loop->latch)))
1611 if (dump_enabled_p ())
1612 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1613 "not vectorized: latch block not empty.\n");
1614 return false;
1617 /* Make sure the exit is not abnormal. */
1618 edge e = single_exit (loop);
1619 if (e->flags & EDGE_ABNORMAL)
1621 if (dump_enabled_p ())
1622 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1623 "not vectorized: abnormal loop exit edge.\n");
1624 return false;
1627 *loop_cond = vect_get_loop_niters (loop, assumptions, number_of_iterations,
1628 number_of_iterationsm1);
1629 if (!*loop_cond)
1631 if (dump_enabled_p ())
1632 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1633 "not vectorized: complicated exit condition.\n");
1634 return false;
1637 if (integer_zerop (*assumptions)
1638 || !*number_of_iterations
1639 || chrec_contains_undetermined (*number_of_iterations))
1641 if (dump_enabled_p ())
1642 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1643 "not vectorized: number of iterations cannot be "
1644 "computed.\n");
1645 return false;
1648 if (integer_zerop (*number_of_iterations))
1650 if (dump_enabled_p ())
1651 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1652 "not vectorized: number of iterations = 0.\n");
1653 return false;
1656 return true;
1659 /* Analyze LOOP form and return a loop_vec_info if it is of suitable form. */
1661 loop_vec_info
1662 vect_analyze_loop_form (struct loop *loop)
1664 tree assumptions, number_of_iterations, number_of_iterationsm1;
1665 gcond *loop_cond, *inner_loop_cond = NULL;
1667 if (! vect_analyze_loop_form_1 (loop, &loop_cond,
1668 &assumptions, &number_of_iterationsm1,
1669 &number_of_iterations, &inner_loop_cond))
1670 return NULL;
1672 loop_vec_info loop_vinfo = new _loop_vec_info (loop);
1673 LOOP_VINFO_NITERSM1 (loop_vinfo) = number_of_iterationsm1;
1674 LOOP_VINFO_NITERS (loop_vinfo) = number_of_iterations;
1675 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = number_of_iterations;
1676 if (!integer_onep (assumptions))
1678 /* We consider vectorizing this loop by versioning it under
1679 some assumptions. In order to do this, we need to clear
1680 existing information computed by scev and niter analyzer. */
1681 scev_reset_htab ();
1682 free_numbers_of_iterations_estimates (loop);
1683 /* Also set flag for this loop so that following scev and niter
1684 analysis are done under the assumptions. */
1685 loop_constraint_set (loop, LOOP_C_FINITE);
1686 /* Also record the assumptions for versioning. */
1687 LOOP_VINFO_NITERS_ASSUMPTIONS (loop_vinfo) = assumptions;
1690 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1692 if (dump_enabled_p ())
1694 dump_printf_loc (MSG_NOTE, vect_location,
1695 "Symbolic number of iterations is ");
1696 dump_generic_expr (MSG_NOTE, TDF_DETAILS, number_of_iterations);
1697 dump_printf (MSG_NOTE, "\n");
1701 STMT_VINFO_TYPE (vinfo_for_stmt (loop_cond)) = loop_exit_ctrl_vec_info_type;
1702 if (inner_loop_cond)
1703 STMT_VINFO_TYPE (vinfo_for_stmt (inner_loop_cond))
1704 = loop_exit_ctrl_vec_info_type;
1706 gcc_assert (!loop->aux);
1707 loop->aux = loop_vinfo;
1708 return loop_vinfo;
1713 /* Scan the loop stmts and, depending on whether there are any (non-)SLP
1714 statements, update the vectorization factor. */
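/* Editor's note (a small worked example of the update below, hedged): if the
   loop-based analysis chose VF = 4 and the SLP instances require an
   unrolling factor of 8, the mixed SLP/non-SLP case sets
   VF = force_common_multiple (4, 8) = 8; if the loop contains only SLP
   stmts, VF becomes simply the SLP unrolling factor.  */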
1716 static void
1717 vect_update_vf_for_slp (loop_vec_info loop_vinfo)
1719 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1720 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1721 int nbbs = loop->num_nodes;
1722 poly_uint64 vectorization_factor;
1723 int i;
1725 if (dump_enabled_p ())
1726 dump_printf_loc (MSG_NOTE, vect_location,
1727 "=== vect_update_vf_for_slp ===\n");
1729 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1730 gcc_assert (known_ne (vectorization_factor, 0U));
1732 /* If all the stmts in the loop can be SLPed, we perform only SLP, and
1733 vectorization factor of the loop is the unrolling factor required by
1734 the SLP instances. If that unrolling factor is 1, we say that we
1735 perform pure SLP on the loop; cross-iteration parallelism is not
1736 exploited. */
1737 bool only_slp_in_loop = true;
1738 for (i = 0; i < nbbs; i++)
1740 basic_block bb = bbs[i];
1741 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1742 gsi_next (&si))
1744 gimple *stmt = gsi_stmt (si);
1745 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1746 if (STMT_VINFO_IN_PATTERN_P (stmt_info)
1747 && STMT_VINFO_RELATED_STMT (stmt_info))
1749 stmt = STMT_VINFO_RELATED_STMT (stmt_info);
1750 stmt_info = vinfo_for_stmt (stmt);
1752 if ((STMT_VINFO_RELEVANT_P (stmt_info)
1753 || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1754 && !PURE_SLP_STMT (stmt_info))
1755 /* STMT needs both SLP and loop-based vectorization. */
1756 only_slp_in_loop = false;
1760 if (only_slp_in_loop)
1762 dump_printf_loc (MSG_NOTE, vect_location,
1763 "Loop contains only SLP stmts\n");
1764 vectorization_factor = LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo);
1766 else
1768 dump_printf_loc (MSG_NOTE, vect_location,
1769 "Loop contains SLP and non-SLP stmts\n");
1770 /* Both the vectorization factor and unroll factor have the form
1771 current_vector_size * X for some rational X, so they must have
1772 a common multiple. */
1773 vectorization_factor
1774 = force_common_multiple (vectorization_factor,
1775 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo));
1778 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
1779 if (dump_enabled_p ())
1781 dump_printf_loc (MSG_NOTE, vect_location,
1782 "Updating vectorization factor to ");
1783 dump_dec (MSG_NOTE, vectorization_factor);
1784 dump_printf (MSG_NOTE, ".\n");
1788 /* Return true if STMT_INFO describes a double reduction phi and if
1789 the other phi in the reduction is also relevant for vectorization.
1790 This rejects cases such as:
1792 outer1:
1793 x_1 = PHI <x_3(outer2), ...>;
1796 inner:
1797 x_2 = ...;
1800 outer2:
1801 x_3 = PHI <x_2(inner)>;
1803 if nothing in x_2 or elsewhere makes x_1 relevant. */
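/* Editor's illustration (hedged): the usual source-level form of a double
   reduction, where the inner-loop accumulation feeds an outer-loop
   reduction PHI as in the outer1/inner/outer2 sketch above:

       int sum = 0;
       for (i = 0; i < n; i++)        // outer loop
         for (j = 0; j < m; j++)      // inner loop
           sum += a[i][j];

   Here x_1 and x_3 correspond to the outer-loop PHIs of 'sum' and x_2 to the
   value computed by the inner loop.  */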
1805 static bool
1806 vect_active_double_reduction_p (stmt_vec_info stmt_info)
1808 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
1809 return false;
1811 gimple *other_phi = STMT_VINFO_REDUC_DEF (stmt_info);
1812 return STMT_VINFO_RELEVANT_P (vinfo_for_stmt (other_phi));
1815 /* Function vect_analyze_loop_operations.
1817 Scan the loop stmts and make sure they are all vectorizable. */
1819 static bool
1820 vect_analyze_loop_operations (loop_vec_info loop_vinfo)
1822 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1823 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1824 int nbbs = loop->num_nodes;
1825 int i;
1826 stmt_vec_info stmt_info;
1827 bool need_to_vectorize = false;
1828 bool ok;
1830 if (dump_enabled_p ())
1831 dump_printf_loc (MSG_NOTE, vect_location,
1832 "=== vect_analyze_loop_operations ===\n");
1834 for (i = 0; i < nbbs; i++)
1836 basic_block bb = bbs[i];
1838 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
1839 gsi_next (&si))
1841 gphi *phi = si.phi ();
1842 ok = true;
1844 stmt_info = vinfo_for_stmt (phi);
1845 if (dump_enabled_p ())
1847 dump_printf_loc (MSG_NOTE, vect_location, "examining phi: ");
1848 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
1850 if (virtual_operand_p (gimple_phi_result (phi)))
1851 continue;
1853 /* Inner-loop loop-closed exit phi in outer-loop vectorization
1854 (i.e., a phi in the tail of the outer-loop). */
1855 if (! is_loop_header_bb_p (bb))
1857 /* FORNOW: we currently don't support the case that these phis
1858 are not used in the outer loop (unless it is a double reduction,
1859 i.e., this phi is vect_reduction_def), because this case
1860 requires actually doing something here. */
1861 if (STMT_VINFO_LIVE_P (stmt_info)
1862 && !vect_active_double_reduction_p (stmt_info))
1864 if (dump_enabled_p ())
1865 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1866 "Unsupported loop-closed phi in "
1867 "outer-loop.\n");
1868 return false;
1871 /* If PHI is used in the outer loop, we check that its operand
1872 is defined in the inner loop. */
1873 if (STMT_VINFO_RELEVANT_P (stmt_info))
1875 tree phi_op;
1876 gimple *op_def_stmt;
1878 if (gimple_phi_num_args (phi) != 1)
1879 return false;
1881 phi_op = PHI_ARG_DEF (phi, 0);
1882 if (TREE_CODE (phi_op) != SSA_NAME)
1883 return false;
1885 op_def_stmt = SSA_NAME_DEF_STMT (phi_op);
1886 if (gimple_nop_p (op_def_stmt)
1887 || !flow_bb_inside_loop_p (loop, gimple_bb (op_def_stmt))
1888 || !vinfo_for_stmt (op_def_stmt))
1889 return false;
1891 if (STMT_VINFO_RELEVANT (vinfo_for_stmt (op_def_stmt))
1892 != vect_used_in_outer
1893 && STMT_VINFO_RELEVANT (vinfo_for_stmt (op_def_stmt))
1894 != vect_used_in_outer_by_reduction)
1895 return false;
1898 continue;
1901 gcc_assert (stmt_info);
1903 if ((STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope
1904 || STMT_VINFO_LIVE_P (stmt_info))
1905 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
1907 /* A scalar-dependence cycle that we don't support. */
1908 if (dump_enabled_p ())
1909 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1910 "not vectorized: scalar dependence cycle.\n");
1911 return false;
1914 if (STMT_VINFO_RELEVANT_P (stmt_info))
1916 need_to_vectorize = true;
1917 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
1918 && ! PURE_SLP_STMT (stmt_info))
1919 ok = vectorizable_induction (phi, NULL, NULL, NULL);
1920 else if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
1921 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
1922 && ! PURE_SLP_STMT (stmt_info))
1923 ok = vectorizable_reduction (phi, NULL, NULL, NULL, NULL);
1926 /* SLP PHIs are tested by vect_slp_analyze_node_operations. */
1927 if (ok
1928 && STMT_VINFO_LIVE_P (stmt_info)
1929 && !PURE_SLP_STMT (stmt_info))
1930 ok = vectorizable_live_operation (phi, NULL, NULL, -1, NULL);
1932 if (!ok)
1934 if (dump_enabled_p ())
1936 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1937 "not vectorized: relevant phi not "
1938 "supported: ");
1939 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, phi, 0);
1941 return false;
1945 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1946 gsi_next (&si))
1948 gimple *stmt = gsi_stmt (si);
1949 if (!gimple_clobber_p (stmt)
1950 && !vect_analyze_stmt (stmt, &need_to_vectorize, NULL, NULL))
1951 return false;
1953 } /* bbs */
1955 /* All operations in the loop are either irrelevant (they deal with
1956 loop control, or are dead), or are only used outside the loop and
1957 can be moved out of it (e.g. invariants, inductions). The loop can
1958 be optimized away by scalar optimizations. We're better off not
1959 touching this loop. */
1960 if (!need_to_vectorize)
1962 if (dump_enabled_p ())
1963 dump_printf_loc (MSG_NOTE, vect_location,
1964 "All the computation can be taken out of the loop.\n");
1965 if (dump_enabled_p ())
1966 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1967 "not vectorized: redundant loop. no profit to "
1968 "vectorize.\n");
1969 return false;
1972 return true;
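/* Illustrative sketch (hypothetical source, for outer-loop vectorization):
   the "loop-closed exit phi" case handled above corresponds to code like

       for (i = 0; i < n; i++)        // outer loop being vectorized
         {
           int s = 0;
           for (j = 0; j < m; j++)    // inner loop
             s += a[i][j];
           b[i] = s;                  // use of the inner-loop result
         }

   where the value of S leaving the inner loop is carried by a phi in the
   outer-loop body; the analysis above roughly requires that phi's operand
   to be defined in the inner loop and used in the outer loop.  */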
1975 /* Analyze the cost of the loop described by LOOP_VINFO. Decide if it
1976 is worthwhile to vectorize. Return 1 if definitely yes, 0 if
1977 definitely no, or -1 if it's worth retrying. */
1979 static int
1980 vect_analyze_loop_costing (loop_vec_info loop_vinfo)
1982 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1983 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
1985 /* Only fully-masked loops can have iteration counts less than the
1986 vectorization factor. */
1987 if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
1989 HOST_WIDE_INT max_niter;
1991 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1992 max_niter = LOOP_VINFO_INT_NITERS (loop_vinfo);
1993 else
1994 max_niter = max_stmt_executions_int (loop);
1996 if (max_niter != -1
1997 && (unsigned HOST_WIDE_INT) max_niter < assumed_vf)
1999 if (dump_enabled_p ())
2000 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2001 "not vectorized: iteration count smaller than "
2002 "vectorization factor.\n");
2003 return 0;
2007 int min_profitable_iters, min_profitable_estimate;
2008 vect_estimate_min_profitable_iters (loop_vinfo, &min_profitable_iters,
2009 &min_profitable_estimate);
2011 if (min_profitable_iters < 0)
2013 if (dump_enabled_p ())
2014 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2015 "not vectorized: vectorization not profitable.\n");
2016 if (dump_enabled_p ())
2017 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2018 "not vectorized: vector version will never be "
2019 "profitable.\n");
2020 return -1;
2023 int min_scalar_loop_bound = (PARAM_VALUE (PARAM_MIN_VECT_LOOP_BOUND)
2024 * assumed_vf);
2026 /* Use the cost model only if it is more conservative than user specified
2027 threshold. */
2028 unsigned int th = (unsigned) MAX (min_scalar_loop_bound,
2029 min_profitable_iters);
2031 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = th;
2033 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2034 && LOOP_VINFO_INT_NITERS (loop_vinfo) < th)
2036 if (dump_enabled_p ())
2037 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2038 "not vectorized: vectorization not profitable.\n");
2039 if (dump_enabled_p ())
2040 dump_printf_loc (MSG_NOTE, vect_location,
2041 "not vectorized: iteration count smaller than user "
2042 "specified loop bound parameter or minimum profitable "
2043 "iterations (whichever is more conservative).\n");
2044 return 0;
2047 HOST_WIDE_INT estimated_niter = estimated_stmt_executions_int (loop);
2048 if (estimated_niter == -1)
2049 estimated_niter = likely_max_stmt_executions_int (loop);
2050 if (estimated_niter != -1
2051 && ((unsigned HOST_WIDE_INT) estimated_niter
2052 < MAX (th, (unsigned) min_profitable_estimate)))
2054 if (dump_enabled_p ())
2055 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2056 "not vectorized: estimated iteration count too "
2057 "small.\n");
2058 if (dump_enabled_p ())
2059 dump_printf_loc (MSG_NOTE, vect_location,
2060 "not vectorized: estimated iteration count smaller "
2061 "than specified loop bound parameter or minimum "
2062 "profitable iterations (whichever is more "
2063 "conservative).\n");
2064 return -1;
2067 return 1;
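/* Worked example (illustrative numbers only): with an assumed
   vectorization factor of 4, --param min-vect-loop-bound=2 and a cost
   model answer of min_profitable_iters = 10, the threshold above becomes

       th = MAX (2 * 4, 10) = 10

   so a loop known to run fewer than 10 iterations is rejected here, and
   the stored LOOP_VINFO_COST_MODEL_THRESHOLD of 10 is what later runtime
   profitability checks compare against.  */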
2071 /* Function vect_analyze_loop_2.
2073 Apply a set of analyses on LOOP, and create a loop_vec_info struct
2074 for it. The different analyses will record information in the
2075 loop_vec_info struct. */
2076 static bool
2077 vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal)
2079 bool ok;
2080 int res;
2081 unsigned int max_vf = MAX_VECTORIZATION_FACTOR;
2082 poly_uint64 min_vf = 2;
2083 unsigned int n_stmts = 0;
2085 /* The first group of checks is independent of the vector size. */
2086 fatal = true;
2088 /* Find all data references in the loop (which correspond to vdefs/vuses)
2089 and analyze their evolution in the loop. */
2091 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
2093 loop_p loop = LOOP_VINFO_LOOP (loop_vinfo);
2094 if (!find_loop_nest (loop, &LOOP_VINFO_LOOP_NEST (loop_vinfo)))
2096 if (dump_enabled_p ())
2097 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2098 "not vectorized: loop nest containing two "
2099 "or more consecutive inner loops cannot be "
2100 "vectorized\n");
2101 return false;
2104 for (unsigned i = 0; i < loop->num_nodes; i++)
2105 for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
2106 !gsi_end_p (gsi); gsi_next (&gsi))
2108 gimple *stmt = gsi_stmt (gsi);
2109 if (is_gimple_debug (stmt))
2110 continue;
2111 ++n_stmts;
2112 if (!find_data_references_in_stmt (loop, stmt,
2113 &LOOP_VINFO_DATAREFS (loop_vinfo)))
2115 if (is_gimple_call (stmt) && loop->safelen)
2117 tree fndecl = gimple_call_fndecl (stmt), op;
2118 if (fndecl != NULL_TREE)
2120 cgraph_node *node = cgraph_node::get (fndecl);
2121 if (node != NULL && node->simd_clones != NULL)
2123 unsigned int j, n = gimple_call_num_args (stmt);
2124 for (j = 0; j < n; j++)
2126 op = gimple_call_arg (stmt, j);
2127 if (DECL_P (op)
2128 || (REFERENCE_CLASS_P (op)
2129 && get_base_address (op)))
2130 break;
2132 op = gimple_call_lhs (stmt);
2133 /* Ignore #pragma omp declare simd functions
2134 if they don't have data references in the
2135 call stmt itself. */
2136 if (j == n
2137 && !(op
2138 && (DECL_P (op)
2139 || (REFERENCE_CLASS_P (op)
2140 && get_base_address (op)))))
2141 continue;
2145 if (dump_enabled_p ())
2146 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2147 "not vectorized: loop contains function "
2148 "calls or data references that cannot "
2149 "be analyzed\n");
2150 return false;
2154 /* Analyze the data references and also adjust the minimal
2155 vectorization factor according to the loads and stores. */
2157 ok = vect_analyze_data_refs (loop_vinfo, &min_vf);
2158 if (!ok)
2160 if (dump_enabled_p ())
2161 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2162 "bad data references.\n");
2163 return false;
2166 /* Classify all cross-iteration scalar data-flow cycles.
2167 Cross-iteration cycles caused by virtual phis are analyzed separately. */
2168 vect_analyze_scalar_cycles (loop_vinfo);
2170 vect_pattern_recog (loop_vinfo);
2172 vect_fixup_scalar_cycles_with_patterns (loop_vinfo);
2174 /* Analyze the access patterns of the data-refs in the loop (consecutive,
2175 complex, etc.). FORNOW: Only handle consecutive access pattern. */
2177 ok = vect_analyze_data_ref_accesses (loop_vinfo);
2178 if (!ok)
2180 if (dump_enabled_p ())
2181 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2182 "bad data access.\n");
2183 return false;
2186 /* Data-flow analysis to detect stmts that do not need to be vectorized. */
2188 ok = vect_mark_stmts_to_be_vectorized (loop_vinfo);
2189 if (!ok)
2191 if (dump_enabled_p ())
2192 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2193 "unexpected pattern.\n");
2194 return false;
2197 /* The rest of the analysis below depends on the vector size in some way; from here on a failure is not necessarily fatal. */
2198 fatal = false;
2200 /* Analyze data dependences between the data-refs in the loop
2201 and adjust the maximum vectorization factor according to
2202 the dependences.
2203 FORNOW: fail at the first data dependence that we encounter. */
2205 ok = vect_analyze_data_ref_dependences (loop_vinfo, &max_vf);
2206 if (!ok
2207 || (max_vf != MAX_VECTORIZATION_FACTOR
2208 && maybe_lt (max_vf, min_vf)))
2210 if (dump_enabled_p ())
2211 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2212 "bad data dependence.\n");
2213 return false;
2215 LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo) = max_vf;
2217 ok = vect_determine_vectorization_factor (loop_vinfo);
2218 if (!ok)
2220 if (dump_enabled_p ())
2221 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2222 "can't determine vectorization factor.\n");
2223 return false;
2225 if (max_vf != MAX_VECTORIZATION_FACTOR
2226 && maybe_lt (max_vf, LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
2228 if (dump_enabled_p ())
2229 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2230 "bad data dependence.\n");
2231 return false;
2234 /* Compute the scalar iteration cost. */
2235 vect_compute_single_scalar_iteration_cost (loop_vinfo);
2237 poly_uint64 saved_vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2238 unsigned th;
2240 /* Check the SLP opportunities in the loop, analyze and build SLP trees. */
2241 ok = vect_analyze_slp (loop_vinfo, n_stmts);
2242 if (!ok)
2243 return false;
2245 /* If there are any SLP instances mark them as pure_slp. */
2246 bool slp = vect_make_slp_decision (loop_vinfo);
2247 if (slp)
2249 /* Find stmts that need to be both vectorized and SLPed. */
2250 vect_detect_hybrid_slp (loop_vinfo);
2252 /* Update the vectorization factor based on the SLP decision. */
2253 vect_update_vf_for_slp (loop_vinfo);
2256 bool saved_can_fully_mask_p = LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo);
2258 /* We don't expect to have to roll back to anything other than an empty
2259 set of rgroups. */
2260 gcc_assert (LOOP_VINFO_MASKS (loop_vinfo).is_empty ());
2262 /* This is the point where we can re-start analysis with SLP forced off. */
2263 start_over:
2265 /* Now the vectorization factor is final. */
2266 poly_uint64 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2267 gcc_assert (known_ne (vectorization_factor, 0U));
2269 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && dump_enabled_p ())
2271 dump_printf_loc (MSG_NOTE, vect_location,
2272 "vectorization_factor = ");
2273 dump_dec (MSG_NOTE, vectorization_factor);
2274 dump_printf (MSG_NOTE, ", niters = " HOST_WIDE_INT_PRINT_DEC "\n",
2275 LOOP_VINFO_INT_NITERS (loop_vinfo));
2278 HOST_WIDE_INT max_niter
2279 = likely_max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
2281 /* Analyze the alignment of the data-refs in the loop.
2282 Fail if a data reference is found that cannot be vectorized. */
2284 ok = vect_analyze_data_refs_alignment (loop_vinfo);
2285 if (!ok)
2287 if (dump_enabled_p ())
2288 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2289 "bad data alignment.\n");
2290 return false;
2293 /* Prune the list of ddrs to be tested at run-time by versioning for alias.
2294 It is important to call pruning after vect_analyze_data_ref_accesses,
2295 since we use grouping information gathered by interleaving analysis. */
2296 ok = vect_prune_runtime_alias_test_list (loop_vinfo);
2297 if (!ok)
2298 return false;
2300 /* Do not invoke vect_enhance_data_refs_alignment for epilogue
2301 vectorization. */
2302 if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2304 /* This pass will decide on using loop versioning and/or loop peeling in
2305 order to enhance the alignment of data references in the loop. */
2306 ok = vect_enhance_data_refs_alignment (loop_vinfo);
2307 if (!ok)
2309 if (dump_enabled_p ())
2310 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2311 "bad data alignment.\n");
2312 return false;
2316 if (slp)
2318 /* Analyze operations in the SLP instances. Note this may
2319 remove unsupported SLP instances which makes the above
2320 SLP kind detection invalid. */
2321 unsigned old_size = LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length ();
2322 vect_slp_analyze_operations (loop_vinfo);
2323 if (LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length () != old_size)
2324 goto again;
2327 /* Scan all the remaining operations in the loop that are not subject
2328 to SLP and make sure they are vectorizable. */
2329 ok = vect_analyze_loop_operations (loop_vinfo);
2330 if (!ok)
2332 if (dump_enabled_p ())
2333 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2334 "bad operation or unsupported loop bound.\n");
2335 return false;
2338 /* Decide whether to use a fully-masked loop for this vectorization
2339 factor. */
2340 LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
2341 = (LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo)
2342 && vect_verify_full_masking (loop_vinfo));
2343 if (dump_enabled_p ())
2345 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
2346 dump_printf_loc (MSG_NOTE, vect_location,
2347 "using a fully-masked loop.\n");
2348 else
2349 dump_printf_loc (MSG_NOTE, vect_location,
2350 "not using a fully-masked loop.\n");
2353 /* If epilog loop is required because of data accesses with gaps,
2354 one additional iteration needs to be peeled. Check if there is
2355 enough iterations for vectorization. */
2356 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2357 && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2358 && !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
2360 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2361 tree scalar_niters = LOOP_VINFO_NITERSM1 (loop_vinfo);
2363 if (known_lt (wi::to_widest (scalar_niters), vf))
2365 if (dump_enabled_p ())
2366 dump_printf_loc (MSG_NOTE, vect_location,
2367 "loop does not have enough iterations to support"
2368 " peeling for gaps.\n");
2369 return false;
2373 /* Check the costings of the loop make vectorizing worthwhile. */
2374 res = vect_analyze_loop_costing (loop_vinfo);
2375 if (res < 0)
2376 goto again;
2377 if (!res)
2379 if (dump_enabled_p ())
2380 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2381 "Loop costings not worthwhile.\n");
2382 return false;
2385 /* Decide whether we need to create an epilogue loop to handle
2386 remaining scalar iterations. */
2387 th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
2389 unsigned HOST_WIDE_INT const_vf;
2390 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
2391 /* The main loop handles all iterations. */
2392 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
2393 else if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2394 && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) >= 0)
2396 /* Work out the (constant) number of iterations that need to be
2397 peeled for reasons other than niters. */
2398 unsigned int peel_niter = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
2399 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
2400 peel_niter += 1;
2401 if (!multiple_p (LOOP_VINFO_INT_NITERS (loop_vinfo) - peel_niter,
2402 LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
2403 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
2405 else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
2406 /* ??? When peeling for gaps but not alignment, we could
2407 try to check whether the (variable) niters is known to be
2408 VF * N + 1. That's something of a niche case though. */
2409 || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2410 || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&const_vf)
2411 || ((tree_ctz (LOOP_VINFO_NITERS (loop_vinfo))
2412 < (unsigned) exact_log2 (const_vf))
2413 /* In case of versioning, check if the maximum number of
2414 iterations is greater than th. If they are identical,
2415 the epilogue is unnecessary. */
2416 && (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
2417 || ((unsigned HOST_WIDE_INT) max_niter
2418 > (th / const_vf) * const_vf))))
2419 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
2421 /* If an epilogue loop is required make sure we can create one. */
2422 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2423 || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo))
2425 if (dump_enabled_p ())
2426 dump_printf_loc (MSG_NOTE, vect_location, "epilog loop required\n");
2427 if (!vect_can_advance_ivs_p (loop_vinfo)
2428 || !slpeel_can_duplicate_loop_p (LOOP_VINFO_LOOP (loop_vinfo),
2429 single_exit (LOOP_VINFO_LOOP
2430 (loop_vinfo))))
2432 if (dump_enabled_p ())
2433 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2434 "not vectorized: can't create required "
2435 "epilog loop\n");
2436 goto again;
2440 /* During peeling, we need to check whether the number of loop iterations
2441 is enough for both the peeled prolog loop and the vector loop. This
2442 check can be merged with the threshold check of loop versioning, so
2443 increase the threshold for this case if necessary. */
2444 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
2446 poly_uint64 niters_th = 0;
2448 if (!vect_use_loop_mask_for_alignment_p (loop_vinfo))
2450 /* Niters for peeled prolog loop. */
2451 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
2453 struct data_reference *dr = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
2454 tree vectype
2455 = STMT_VINFO_VECTYPE (vinfo_for_stmt (DR_STMT (dr)));
2456 niters_th += TYPE_VECTOR_SUBPARTS (vectype) - 1;
2458 else
2459 niters_th += LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
2462 /* Niters for at least one iteration of vectorized loop. */
2463 if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
2464 niters_th += LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2465 /* One additional iteration because of peeling for gap. */
2466 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
2467 niters_th += 1;
2468 LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = niters_th;
2471 gcc_assert (known_eq (vectorization_factor,
2472 LOOP_VINFO_VECT_FACTOR (loop_vinfo)));
2474 /* Ok to vectorize! */
2475 return true;
2477 again:
2478 /* Try again with SLP forced off but if we didn't do any SLP there is
2479 no point in re-trying. */
2480 if (!slp)
2481 return false;
2483 /* If there are reduction chains re-trying will fail anyway. */
2484 if (! LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).is_empty ())
2485 return false;
2487 /* Likewise if the grouped loads or stores in the SLP cannot be handled
2488 via interleaving or lane instructions. */
2489 slp_instance instance;
2490 slp_tree node;
2491 unsigned i, j;
2492 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
2494 stmt_vec_info vinfo;
2495 vinfo = vinfo_for_stmt
2496 (SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (instance))[0]);
2497 if (! STMT_VINFO_GROUPED_ACCESS (vinfo))
2498 continue;
2499 vinfo = vinfo_for_stmt (STMT_VINFO_GROUP_FIRST_ELEMENT (vinfo));
2500 unsigned int size = STMT_VINFO_GROUP_SIZE (vinfo);
2501 tree vectype = STMT_VINFO_VECTYPE (vinfo);
2502 if (! vect_store_lanes_supported (vectype, size, false)
2503 && ! known_eq (TYPE_VECTOR_SUBPARTS (vectype), 1U)
2504 && ! vect_grouped_store_supported (vectype, size))
2505 return false;
2506 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), j, node)
2508 vinfo = vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (node)[0]);
2509 vinfo = vinfo_for_stmt (STMT_VINFO_GROUP_FIRST_ELEMENT (vinfo));
2510 bool single_element_p = !STMT_VINFO_GROUP_NEXT_ELEMENT (vinfo);
2511 size = STMT_VINFO_GROUP_SIZE (vinfo);
2512 vectype = STMT_VINFO_VECTYPE (vinfo);
2513 if (! vect_load_lanes_supported (vectype, size, false)
2514 && ! vect_grouped_load_supported (vectype, single_element_p,
2515 size))
2516 return false;
2520 if (dump_enabled_p ())
2521 dump_printf_loc (MSG_NOTE, vect_location,
2522 "re-trying with SLP disabled\n");
2524 /* Roll back state appropriately. No SLP this time. */
2525 slp = false;
2527 /* Restore the vectorization factor as it was without SLP. */
2527 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = saved_vectorization_factor;
2528 /* Free the SLP instances. */
2529 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), j, instance)
2530 vect_free_slp_instance (instance);
2531 LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
2532 /* Reset SLP type to loop_vect on all stmts. */
2533 for (i = 0; i < LOOP_VINFO_LOOP (loop_vinfo)->num_nodes; ++i)
2535 basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
2536 for (gimple_stmt_iterator si = gsi_start_phis (bb);
2537 !gsi_end_p (si); gsi_next (&si))
2539 stmt_vec_info stmt_info = vinfo_for_stmt (gsi_stmt (si));
2540 STMT_SLP_TYPE (stmt_info) = loop_vect;
2542 for (gimple_stmt_iterator si = gsi_start_bb (bb);
2543 !gsi_end_p (si); gsi_next (&si))
2545 stmt_vec_info stmt_info = vinfo_for_stmt (gsi_stmt (si));
2546 STMT_SLP_TYPE (stmt_info) = loop_vect;
2547 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
2549 stmt_info = vinfo_for_stmt (STMT_VINFO_RELATED_STMT (stmt_info));
2550 STMT_SLP_TYPE (stmt_info) = loop_vect;
2551 for (gimple_stmt_iterator pi
2552 = gsi_start (STMT_VINFO_PATTERN_DEF_SEQ (stmt_info));
2553 !gsi_end_p (pi); gsi_next (&pi))
2555 gimple *pstmt = gsi_stmt (pi);
2556 STMT_SLP_TYPE (vinfo_for_stmt (pstmt)) = loop_vect;
2561 /* Free optimized alias test DDRS. */
2562 LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).truncate (0);
2563 LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).release ();
2564 LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).release ();
2565 /* Reset target cost data. */
2566 destroy_cost_data (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo));
2567 LOOP_VINFO_TARGET_COST_DATA (loop_vinfo)
2568 = init_cost (LOOP_VINFO_LOOP (loop_vinfo));
2569 /* Reset accumulated rgroup information. */
2570 release_vec_loop_masks (&LOOP_VINFO_MASKS (loop_vinfo));
2571 /* Reset assorted flags. */
2572 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
2573 LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = false;
2574 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = 0;
2575 LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = 0;
2576 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = saved_can_fully_mask_p;
2578 goto start_over;
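/* Illustrative note (not from the original sources): the PEELING_FOR_NITER
   decision above reduces to simple arithmetic when the trip count is known.
   For example, with a vectorization factor of 8 and no alignment or gap
   peeling:

       for (i = 0; i < 1000; i++) ...   // 1000 % 8 == 0, no epilogue needed
       for (i = 0; i < 1003; i++) ...   // 3 leftover iterations need an
                                        // epilogue loop

   A fully-masked loop instead handles the tail inside the main vector
   loop, so no epilogue is created in that case.  */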
2581 /* Function vect_analyze_loop.
2583 Apply a set of analyses on LOOP, and create a loop_vec_info struct
2584 for it. The different analyses will record information in the
2585 loop_vec_info struct. If ORIG_LOOP_VINFO is not NULL epilogue must
2586 be vectorized. */
2587 loop_vec_info
2588 vect_analyze_loop (struct loop *loop, loop_vec_info orig_loop_vinfo)
2590 loop_vec_info loop_vinfo;
2591 auto_vector_sizes vector_sizes;
2593 /* Autodetect first vector size we try. */
2594 current_vector_size = 0;
2595 targetm.vectorize.autovectorize_vector_sizes (&vector_sizes);
2596 unsigned int next_size = 0;
2598 if (dump_enabled_p ())
2599 dump_printf_loc (MSG_NOTE, vect_location,
2600 "===== analyze_loop_nest =====\n");
2602 if (loop_outer (loop)
2603 && loop_vec_info_for_loop (loop_outer (loop))
2604 && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop))))
2606 if (dump_enabled_p ())
2607 dump_printf_loc (MSG_NOTE, vect_location,
2608 "outer-loop already vectorized.\n");
2609 return NULL;
2612 poly_uint64 autodetected_vector_size = 0;
2613 while (1)
2615 /* Check the CFG characteristics of the loop (nesting, entry/exit). */
2616 loop_vinfo = vect_analyze_loop_form (loop);
2617 if (!loop_vinfo)
2619 if (dump_enabled_p ())
2620 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2621 "bad loop form.\n");
2622 return NULL;
2625 bool fatal = false;
2627 if (orig_loop_vinfo)
2628 LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = orig_loop_vinfo;
2630 if (vect_analyze_loop_2 (loop_vinfo, fatal))
2632 LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1;
2634 return loop_vinfo;
2637 delete loop_vinfo;
2639 if (next_size == 0)
2640 autodetected_vector_size = current_vector_size;
2642 if (next_size < vector_sizes.length ()
2643 && known_eq (vector_sizes[next_size], autodetected_vector_size))
2644 next_size += 1;
2646 if (fatal
2647 || next_size == vector_sizes.length ()
2648 || known_eq (current_vector_size, 0U))
2649 return NULL;
2651 /* Try the next biggest vector size. */
2652 current_vector_size = vector_sizes[next_size++];
2653 if (dump_enabled_p ())
2655 dump_printf_loc (MSG_NOTE, vect_location,
2656 "***** Re-trying analysis with "
2657 "vector size ");
2658 dump_dec (MSG_NOTE, current_vector_size);
2659 dump_printf (MSG_NOTE, "\n");
2664 /* Return true if there is an in-order reduction function for CODE, storing
2665 it in *REDUC_FN if so. */
2667 static bool
2668 fold_left_reduction_fn (tree_code code, internal_fn *reduc_fn)
2670 switch (code)
2672 case PLUS_EXPR:
2673 *reduc_fn = IFN_FOLD_LEFT_PLUS;
2674 return true;
2676 default:
2677 return false;
2681 /* Function reduction_fn_for_scalar_code
2683 Input:
2684 CODE - tree_code of a reduction operation.
2686 Output:
2687 REDUC_FN - the corresponding internal function to be used to reduce the
2688 vector of partial results into a single scalar result, or IFN_LAST
2689 if the operation is a supported reduction operation, but does not have
2690 such an internal function.
2692 Return FALSE if CODE currently cannot be vectorized as a reduction. */
2694 static bool
2695 reduction_fn_for_scalar_code (enum tree_code code, internal_fn *reduc_fn)
2697 switch (code)
2699 case MAX_EXPR:
2700 *reduc_fn = IFN_REDUC_MAX;
2701 return true;
2703 case MIN_EXPR:
2704 *reduc_fn = IFN_REDUC_MIN;
2705 return true;
2707 case PLUS_EXPR:
2708 *reduc_fn = IFN_REDUC_PLUS;
2709 return true;
2711 case BIT_AND_EXPR:
2712 *reduc_fn = IFN_REDUC_AND;
2713 return true;
2715 case BIT_IOR_EXPR:
2716 *reduc_fn = IFN_REDUC_IOR;
2717 return true;
2719 case BIT_XOR_EXPR:
2720 *reduc_fn = IFN_REDUC_XOR;
2721 return true;
2723 case MULT_EXPR:
2724 case MINUS_EXPR:
2725 *reduc_fn = IFN_LAST;
2726 return true;
2728 default:
2729 return false;
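/* Illustrative sketch (hypothetical variables): the mapping above means a
   scalar max-reduction such as

       int m = INT_MIN;
       for (int i = 0; i < n; i++)
         if (a[i] > m)
           m = a[i];

   (recognized, after if-conversion, as a MAX_EXPR reduction) can be
   reduced at the end of the vector loop with IFN_REDUC_MAX, while a
   MULT_EXPR reduction is still vectorizable but, getting IFN_LAST here,
   falls back to an open-coded epilogue that extracts and combines the
   vector lanes.  */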
2733 /* If there is a neutral value X such that SLP reduction NODE would not
2734 be affected by the introduction of additional X elements, return that X,
2735 otherwise return null. CODE is the code of the reduction. REDUC_CHAIN
2736 is true if the SLP statements perform a single reduction, false if each
2737 statement performs an independent reduction. */
2739 static tree
2740 neutral_op_for_slp_reduction (slp_tree slp_node, tree_code code,
2741 bool reduc_chain)
2743 vec<gimple *> stmts = SLP_TREE_SCALAR_STMTS (slp_node);
2744 gimple *stmt = stmts[0];
2745 stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
2746 tree vector_type = STMT_VINFO_VECTYPE (stmt_vinfo);
2747 tree scalar_type = TREE_TYPE (vector_type);
2748 struct loop *loop = gimple_bb (stmt)->loop_father;
2749 gcc_assert (loop);
2751 switch (code)
2753 case WIDEN_SUM_EXPR:
2754 case DOT_PROD_EXPR:
2755 case SAD_EXPR:
2756 case PLUS_EXPR:
2757 case MINUS_EXPR:
2758 case BIT_IOR_EXPR:
2759 case BIT_XOR_EXPR:
2760 return build_zero_cst (scalar_type);
2762 case MULT_EXPR:
2763 return build_one_cst (scalar_type);
2765 case BIT_AND_EXPR:
2766 return build_all_ones_cst (scalar_type);
2768 case MAX_EXPR:
2769 case MIN_EXPR:
2770 /* For MIN/MAX the initial values are neutral. A reduction chain
2771 has only a single initial value, so that value is neutral for
2772 all statements. */
2773 if (reduc_chain)
2774 return PHI_ARG_DEF_FROM_EDGE (stmt, loop_preheader_edge (loop));
2775 return NULL_TREE;
2777 default:
2778 return NULL_TREE;
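/* Illustrative sketch (hypothetical values): the neutral value is one that
   can occupy extra vector lanes without changing the reduction result.
   E.g. for a sum reduction chain whose scalar initial value is S0, a
   4-lane initial vector can be built as

       { S0, 0, 0, 0 }          // 0 is the PLUS_EXPR neutral value

   (1 for MULT_EXPR, all-ones for BIT_AND_EXPR), whereas MIN/MAX have no
   universal neutral value and can only reuse the single initial value of
   a reduction chain.  */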
2782 /* Error reporting helper for vect_is_simple_reduction below. GIMPLE statement
2783 STMT is printed with a message MSG. */
2785 static void
2786 report_vect_op (dump_flags_t msg_type, gimple *stmt, const char *msg)
2788 dump_printf_loc (msg_type, vect_location, "%s", msg);
2789 dump_gimple_stmt (msg_type, TDF_SLIM, stmt, 0);
2793 /* Detect SLP reduction of the form:
2795 #a1 = phi <a5, a0>
2796 a2 = operation (a1)
2797 a3 = operation (a2)
2798 a4 = operation (a3)
2799 a5 = operation (a4)
2801 #a = phi <a5>
2803 PHI is the reduction phi node (#a1 = phi <a5, a0> above)
2804 FIRST_STMT is the first reduction stmt in the chain
2805 (a2 = operation (a1)).
2807 Return TRUE if a reduction chain was detected. */
2809 static bool
2810 vect_is_slp_reduction (loop_vec_info loop_info, gimple *phi,
2811 gimple *first_stmt)
2813 struct loop *loop = (gimple_bb (phi))->loop_father;
2814 struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
2815 enum tree_code code;
2816 gimple *loop_use_stmt = NULL;
2817 stmt_vec_info use_stmt_info;
2818 tree lhs;
2819 imm_use_iterator imm_iter;
2820 use_operand_p use_p;
2821 int nloop_uses, size = 0, n_out_of_loop_uses;
2822 bool found = false;
2824 if (loop != vect_loop)
2825 return false;
2827 auto_vec<stmt_vec_info, 8> reduc_chain;
2828 lhs = PHI_RESULT (phi);
2829 code = gimple_assign_rhs_code (first_stmt);
2830 while (1)
2832 nloop_uses = 0;
2833 n_out_of_loop_uses = 0;
2834 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
2836 gimple *use_stmt = USE_STMT (use_p);
2837 if (is_gimple_debug (use_stmt))
2838 continue;
2840 /* Check if we got back to the reduction phi. */
2841 if (use_stmt == phi)
2843 loop_use_stmt = use_stmt;
2844 found = true;
2845 break;
2848 if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2850 loop_use_stmt = use_stmt;
2851 nloop_uses++;
2853 else
2854 n_out_of_loop_uses++;
2856 /* There can be either a single use in the loop or two uses in
2857 phi nodes. */
2858 if (nloop_uses > 1 || (n_out_of_loop_uses && nloop_uses))
2859 return false;
2862 if (found)
2863 break;
2865 /* We reached a statement with no loop uses. */
2866 if (nloop_uses == 0)
2867 return false;
2869 /* This is a loop exit phi, and we haven't reached the reduction phi. */
2870 if (gimple_code (loop_use_stmt) == GIMPLE_PHI)
2871 return false;
2873 if (!is_gimple_assign (loop_use_stmt)
2874 || code != gimple_assign_rhs_code (loop_use_stmt)
2875 || !flow_bb_inside_loop_p (loop, gimple_bb (loop_use_stmt)))
2876 return false;
2878 /* Insert USE_STMT into reduction chain. */
2879 use_stmt_info = vinfo_for_stmt (loop_use_stmt);
2880 reduc_chain.safe_push (use_stmt_info);
2882 lhs = gimple_assign_lhs (loop_use_stmt);
2883 size++;
2886 if (!found || loop_use_stmt != phi || size < 2)
2887 return false;
2889 /* Swap the operands, if needed, to make the reduction operand be the second
2890 operand. */
2891 lhs = PHI_RESULT (phi);
2892 for (unsigned i = 0; i < reduc_chain.length (); ++i)
2894 gassign *next_stmt = as_a <gassign *> (reduc_chain[i]->stmt);
2895 if (gimple_assign_rhs2 (next_stmt) == lhs)
2897 tree op = gimple_assign_rhs1 (next_stmt);
2898 gimple *def_stmt = NULL;
2900 if (TREE_CODE (op) == SSA_NAME)
2901 def_stmt = SSA_NAME_DEF_STMT (op);
2903 /* Check that the other def is either defined in the loop
2904 ("vect_internal_def"), or it's an induction (defined by a
2905 loop-header phi-node). */
2906 if (def_stmt
2907 && gimple_bb (def_stmt)
2908 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2909 && (is_gimple_assign (def_stmt)
2910 || is_gimple_call (def_stmt)
2911 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2912 == vect_induction_def
2913 || (gimple_code (def_stmt) == GIMPLE_PHI
2914 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2915 == vect_internal_def
2916 && !is_loop_header_bb_p (gimple_bb (def_stmt)))))
2918 lhs = gimple_assign_lhs (next_stmt);
2919 continue;
2922 return false;
2924 else
2926 tree op = gimple_assign_rhs2 (next_stmt);
2927 gimple *def_stmt = NULL;
2929 if (TREE_CODE (op) == SSA_NAME)
2930 def_stmt = SSA_NAME_DEF_STMT (op);
2932 /* Check that the other def is either defined in the loop
2933 ("vect_internal_def"), or it's an induction (defined by a
2934 loop-header phi-node). */
2935 if (def_stmt
2936 && gimple_bb (def_stmt)
2937 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2938 && (is_gimple_assign (def_stmt)
2939 || is_gimple_call (def_stmt)
2940 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2941 == vect_induction_def
2942 || (gimple_code (def_stmt) == GIMPLE_PHI
2943 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2944 == vect_internal_def
2945 && !is_loop_header_bb_p (gimple_bb (def_stmt)))))
2947 if (dump_enabled_p ())
2949 dump_printf_loc (MSG_NOTE, vect_location, "swapping oprnds: ");
2950 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, next_stmt, 0);
2953 swap_ssa_operands (next_stmt,
2954 gimple_assign_rhs1_ptr (next_stmt),
2955 gimple_assign_rhs2_ptr (next_stmt));
2956 update_stmt (next_stmt);
2958 if (CONSTANT_CLASS_P (gimple_assign_rhs1 (next_stmt)))
2959 LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true;
2961 else
2962 return false;
2965 lhs = gimple_assign_lhs (next_stmt);
2968 /* Build up the actual chain. */
2969 for (unsigned i = 0; i < reduc_chain.length () - 1; ++i)
2971 GROUP_FIRST_ELEMENT (reduc_chain[i]) = reduc_chain[0]->stmt;
2972 GROUP_NEXT_ELEMENT (reduc_chain[i]) = reduc_chain[i+1]->stmt;
2974 GROUP_FIRST_ELEMENT (reduc_chain.last ()) = reduc_chain[0]->stmt;
2975 GROUP_NEXT_ELEMENT (reduc_chain.last ()) = NULL;
2977 /* Save the chain for further analysis in SLP detection. */
2978 LOOP_VINFO_REDUCTION_CHAINS (loop_info).safe_push (reduc_chain[0]->stmt);
2979 GROUP_SIZE (reduc_chain[0]) = size;
2981 return true;
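/* Illustrative sketch (hypothetical source): a reduction chain as detected
   above typically comes from several updates of one accumulator within a
   single iteration, e.g.

       for (int i = 0; i < n; i++)
         s = s + a[2*i] + a[2*i+1];

   which gimplifies to something like a2 = a1 + a[2*i]; a3 = a2 + a[2*i+1];
   feeding the loop phi, i.e. a chain of two statements that can then be
   SLP-vectorized as a group.  */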
2984 /* Return true if we need an in-order reduction for operation CODE
2985 on type TYPE. NEED_WRAPPING_INTEGRAL_OVERFLOW is true if integer
2986 overflow must wrap. */
2988 static bool
2989 needs_fold_left_reduction_p (tree type, tree_code code,
2990 bool need_wrapping_integral_overflow)
2992 /* CHECKME: check for !flag_finite_math_only too? */
2993 if (SCALAR_FLOAT_TYPE_P (type))
2994 switch (code)
2996 case MIN_EXPR:
2997 case MAX_EXPR:
2998 return false;
3000 default:
3001 return !flag_associative_math;
3004 if (INTEGRAL_TYPE_P (type))
3006 if (!operation_no_trapping_overflow (type, code))
3007 return true;
3008 if (need_wrapping_integral_overflow
3009 && !TYPE_OVERFLOW_WRAPS (type)
3010 && operation_can_overflow (code))
3011 return true;
3012 return false;
3015 if (SAT_FIXED_POINT_TYPE_P (type))
3016 return true;
3018 return false;
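/* Illustrative sketch (hypothetical code): without -ffast-math /
   -fassociative-math, a float accumulation such as

       float s = 0.0f;
       for (int i = 0; i < n; i++)
         s += a[i];

   must preserve the original left-to-right association, so the check above
   requests an in-order (FOLD_LEFT) reduction rather than a reassociating
   tree reduction; signed integer sums compiled with -ftrapv are treated
   the same way because partial sums could trap.  */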
3021 /* Return true if the reduction PHI in LOOP with latch arg LOOP_ARG and
3022 reduction operation CODE has a handled computation expression. */
3024 bool
3025 check_reduction_path (location_t loc, loop_p loop, gphi *phi, tree loop_arg,
3026 enum tree_code code)
3028 auto_vec<std::pair<ssa_op_iter, use_operand_p> > path;
3029 auto_bitmap visited;
3030 tree lookfor = PHI_RESULT (phi);
3031 ssa_op_iter curri;
3032 use_operand_p curr = op_iter_init_phiuse (&curri, phi, SSA_OP_USE);
3033 while (USE_FROM_PTR (curr) != loop_arg)
3034 curr = op_iter_next_use (&curri);
3035 curri.i = curri.numops;
3038 path.safe_push (std::make_pair (curri, curr));
3039 tree use = USE_FROM_PTR (curr);
3040 if (use == lookfor)
3041 break;
3042 gimple *def = SSA_NAME_DEF_STMT (use);
3043 if (gimple_nop_p (def)
3044 || ! flow_bb_inside_loop_p (loop, gimple_bb (def)))
3046 pop:
3049 std::pair<ssa_op_iter, use_operand_p> x = path.pop ();
3050 curri = x.first;
3051 curr = x.second;
3053 curr = op_iter_next_use (&curri);
3054 /* Skip already visited or non-SSA operands (from iterating
3055 over PHI args). */
3056 while (curr != NULL_USE_OPERAND_P
3057 && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
3058 || ! bitmap_set_bit (visited,
3059 SSA_NAME_VERSION
3060 (USE_FROM_PTR (curr)))));
3062 while (curr == NULL_USE_OPERAND_P && ! path.is_empty ());
3063 if (curr == NULL_USE_OPERAND_P)
3064 break;
3066 else
3068 if (gimple_code (def) == GIMPLE_PHI)
3069 curr = op_iter_init_phiuse (&curri, as_a <gphi *>(def), SSA_OP_USE);
3070 else
3071 curr = op_iter_init_use (&curri, def, SSA_OP_USE);
3072 while (curr != NULL_USE_OPERAND_P
3073 && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
3074 || ! bitmap_set_bit (visited,
3075 SSA_NAME_VERSION
3076 (USE_FROM_PTR (curr)))))
3077 curr = op_iter_next_use (&curri);
3078 if (curr == NULL_USE_OPERAND_P)
3079 goto pop;
3082 while (1);
3083 if (dump_file && (dump_flags & TDF_DETAILS))
3085 dump_printf_loc (MSG_NOTE, loc, "reduction path: ");
3086 unsigned i;
3087 std::pair<ssa_op_iter, use_operand_p> *x;
3088 FOR_EACH_VEC_ELT (path, i, x)
3090 dump_generic_expr (MSG_NOTE, TDF_SLIM, USE_FROM_PTR (x->second));
3091 dump_printf (MSG_NOTE, " ");
3093 dump_printf (MSG_NOTE, "\n");
3096 /* Check whether the reduction path detected is valid. */
3097 bool fail = path.length () == 0;
3098 bool neg = false;
3099 for (unsigned i = 1; i < path.length (); ++i)
3101 gimple *use_stmt = USE_STMT (path[i].second);
3102 tree op = USE_FROM_PTR (path[i].second);
3103 if (! has_single_use (op)
3104 || ! is_gimple_assign (use_stmt))
3106 fail = true;
3107 break;
3109 if (gimple_assign_rhs_code (use_stmt) != code)
3111 if (code == PLUS_EXPR
3112 && gimple_assign_rhs_code (use_stmt) == MINUS_EXPR)
3114 /* Track whether we negate the reduction value each iteration. */
3115 if (gimple_assign_rhs2 (use_stmt) == op)
3116 neg = ! neg;
3118 else
3120 fail = true;
3121 break;
3125 return ! fail && ! neg;
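/* Illustrative sketch (hypothetical names): the path walk above accepts a
   reduction that is split across several single-use statements, e.g.

       for (int i = 0; i < n; i++)
         {
           int t = x + a[i];
           x = t + b[i];
         }

   where following the uses from the latch value back to the loop phi gives
   the path x -> t -> x with every statement on it using PLUS_EXPR.  An
   update of the form "x = a[i] - x" is rejected because the running value
   would be negated on every iteration.  */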
3129 /* Function vect_is_simple_reduction
3131 (1) Detect a cross-iteration def-use cycle that represents a simple
3132 reduction computation. We look for the following pattern:
3134 loop_header:
3135 a1 = phi < a0, a2 >
3136 a3 = ...
3137 a2 = operation (a3, a1)
3141 a3 = ...
3142 loop_header:
3143 a1 = phi < a0, a2 >
3144 a2 = operation (a3, a1)
3146 such that:
3147 1. operation is commutative and associative and it is safe to
3148 change the order of the computation
3149 2. no uses for a2 in the loop (a2 is used out of the loop)
3150 3. no uses of a1 in the loop besides the reduction operation
3151 4. no uses of a1 outside the loop.
3153 Conditions 1,4 are tested here.
3154 Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.
3156 (2) Detect a cross-iteration def-use cycle in nested loops, i.e.,
3157 nested cycles.
3159 (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double
3160 reductions:
3162 a1 = phi < a0, a2 >
3163 inner loop (def of a3)
3164 a2 = phi < a3 >
3166 (4) Detect condition expressions, ie:
3167 for (int i = 0; i < N; i++)
3168 if (a[i] < val)
3169 ret_val = a[i];
3173 static gimple *
3174 vect_is_simple_reduction (loop_vec_info loop_info, gimple *phi,
3175 bool *double_reduc,
3176 bool need_wrapping_integral_overflow,
3177 enum vect_reduction_type *v_reduc_type)
3179 struct loop *loop = (gimple_bb (phi))->loop_father;
3180 struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
3181 gimple *def_stmt, *def1 = NULL, *def2 = NULL, *phi_use_stmt = NULL;
3182 enum tree_code orig_code, code;
3183 tree op1, op2, op3 = NULL_TREE, op4 = NULL_TREE;
3184 tree type;
3185 int nloop_uses;
3186 tree name;
3187 imm_use_iterator imm_iter;
3188 use_operand_p use_p;
3189 bool phi_def;
3191 *double_reduc = false;
3192 *v_reduc_type = TREE_CODE_REDUCTION;
3194 tree phi_name = PHI_RESULT (phi);
3195 /* ??? If there are no uses of the PHI result the inner loop reduction
3196 won't be detected as possibly double-reduction by vectorizable_reduction
3197 because that tries to walk the PHI arg from the preheader edge which
3198 can be constant. See PR60382. */
3199 if (has_zero_uses (phi_name))
3200 return NULL;
3201 nloop_uses = 0;
3202 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, phi_name)
3204 gimple *use_stmt = USE_STMT (use_p);
3205 if (is_gimple_debug (use_stmt))
3206 continue;
3208 if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
3210 if (dump_enabled_p ())
3211 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3212 "intermediate value used outside loop.\n");
3214 return NULL;
3217 nloop_uses++;
3218 if (nloop_uses > 1)
3220 if (dump_enabled_p ())
3221 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3222 "reduction value used in loop.\n");
3223 return NULL;
3226 phi_use_stmt = use_stmt;
3229 edge latch_e = loop_latch_edge (loop);
3230 tree loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
3231 if (TREE_CODE (loop_arg) != SSA_NAME)
3233 if (dump_enabled_p ())
3235 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3236 "reduction: not ssa_name: ");
3237 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, loop_arg);
3238 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3240 return NULL;
3243 def_stmt = SSA_NAME_DEF_STMT (loop_arg);
3244 if (is_gimple_assign (def_stmt))
3246 name = gimple_assign_lhs (def_stmt);
3247 phi_def = false;
3249 else if (gimple_code (def_stmt) == GIMPLE_PHI)
3251 name = PHI_RESULT (def_stmt);
3252 phi_def = true;
3254 else
3256 if (dump_enabled_p ())
3258 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3259 "reduction: unhandled reduction operation: ");
3260 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, def_stmt, 0);
3262 return NULL;
3265 if (! flow_bb_inside_loop_p (loop, gimple_bb (def_stmt)))
3266 return NULL;
3268 nloop_uses = 0;
3269 auto_vec<gphi *, 3> lcphis;
3270 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name)
3272 gimple *use_stmt = USE_STMT (use_p);
3273 if (is_gimple_debug (use_stmt))
3274 continue;
3275 if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
3276 nloop_uses++;
3277 else
3278 /* We can have more than one loop-closed PHI. */
3279 lcphis.safe_push (as_a <gphi *> (use_stmt));
3280 if (nloop_uses > 1)
3282 if (dump_enabled_p ())
3283 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3284 "reduction used in loop.\n");
3285 return NULL;
3289 /* If DEF_STMT is a phi node itself, we expect it to have a single argument
3290 defined in the inner loop. */
3291 if (phi_def)
3293 op1 = PHI_ARG_DEF (def_stmt, 0);
3295 if (gimple_phi_num_args (def_stmt) != 1
3296 || TREE_CODE (op1) != SSA_NAME)
3298 if (dump_enabled_p ())
3299 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3300 "unsupported phi node definition.\n");
3302 return NULL;
3305 def1 = SSA_NAME_DEF_STMT (op1);
3306 if (gimple_bb (def1)
3307 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
3308 && loop->inner
3309 && flow_bb_inside_loop_p (loop->inner, gimple_bb (def1))
3310 && is_gimple_assign (def1)
3311 && flow_bb_inside_loop_p (loop->inner, gimple_bb (phi_use_stmt)))
3313 if (dump_enabled_p ())
3314 report_vect_op (MSG_NOTE, def_stmt,
3315 "detected double reduction: ");
3317 *double_reduc = true;
3318 return def_stmt;
3321 return NULL;
3324 /* If we are vectorizing an inner reduction, we execute it in the
3325 original order only if we are not dealing with a double
3326 reduction. */
3327 bool check_reduction = true;
3328 if (flow_loop_nested_p (vect_loop, loop))
3330 gphi *lcphi;
3331 unsigned i;
3332 check_reduction = false;
3333 FOR_EACH_VEC_ELT (lcphis, i, lcphi)
3334 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, gimple_phi_result (lcphi))
3336 gimple *use_stmt = USE_STMT (use_p);
3337 if (is_gimple_debug (use_stmt))
3338 continue;
3339 if (! flow_bb_inside_loop_p (vect_loop, gimple_bb (use_stmt)))
3340 check_reduction = true;
3344 bool nested_in_vect_loop = flow_loop_nested_p (vect_loop, loop);
3345 code = orig_code = gimple_assign_rhs_code (def_stmt);
3347 /* We can handle "res -= x[i]", which is non-associative by
3348 simply rewriting this into "res += -x[i]". Avoid changing
3349 gimple instruction for the first simple tests and only do this
3350 if we're allowed to change code at all. */
3351 if (code == MINUS_EXPR && gimple_assign_rhs2 (def_stmt) != phi_name)
3352 code = PLUS_EXPR;
3354 if (code == COND_EXPR)
3356 if (! nested_in_vect_loop)
3357 *v_reduc_type = COND_REDUCTION;
3359 op3 = gimple_assign_rhs1 (def_stmt);
3360 if (COMPARISON_CLASS_P (op3))
3362 op4 = TREE_OPERAND (op3, 1);
3363 op3 = TREE_OPERAND (op3, 0);
3365 if (op3 == phi_name || op4 == phi_name)
3367 if (dump_enabled_p ())
3368 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3369 "reduction: condition depends on previous"
3370 " iteration: ");
3371 return NULL;
3374 op1 = gimple_assign_rhs2 (def_stmt);
3375 op2 = gimple_assign_rhs3 (def_stmt);
3377 else if (!commutative_tree_code (code) || !associative_tree_code (code))
3379 if (dump_enabled_p ())
3380 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3381 "reduction: not commutative/associative: ");
3382 return NULL;
3384 else if (get_gimple_rhs_class (code) == GIMPLE_BINARY_RHS)
3386 op1 = gimple_assign_rhs1 (def_stmt);
3387 op2 = gimple_assign_rhs2 (def_stmt);
3389 else
3391 if (dump_enabled_p ())
3392 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3393 "reduction: not handled operation: ");
3394 return NULL;
3397 if (TREE_CODE (op1) != SSA_NAME && TREE_CODE (op2) != SSA_NAME)
3399 if (dump_enabled_p ())
3400 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3401 "reduction: both uses not ssa_names: ");
3403 return NULL;
3406 type = TREE_TYPE (gimple_assign_lhs (def_stmt));
3407 if ((TREE_CODE (op1) == SSA_NAME
3408 && !types_compatible_p (type, TREE_TYPE (op1)))
3409 || (TREE_CODE (op2) == SSA_NAME
3410 && !types_compatible_p (type, TREE_TYPE (op2)))
3411 || (op3 && TREE_CODE (op3) == SSA_NAME
3412 && !types_compatible_p (type, TREE_TYPE (op3)))
3413 || (op4 && TREE_CODE (op4) == SSA_NAME
3414 && !types_compatible_p (type, TREE_TYPE (op4))))
3416 if (dump_enabled_p ())
3418 dump_printf_loc (MSG_NOTE, vect_location,
3419 "reduction: multiple types: operation type: ");
3420 dump_generic_expr (MSG_NOTE, TDF_SLIM, type);
3421 dump_printf (MSG_NOTE, ", operands types: ");
3422 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3423 TREE_TYPE (op1));
3424 dump_printf (MSG_NOTE, ",");
3425 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3426 TREE_TYPE (op2));
3427 if (op3)
3429 dump_printf (MSG_NOTE, ",");
3430 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3431 TREE_TYPE (op3));
3434 if (op4)
3436 dump_printf (MSG_NOTE, ",");
3437 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3438 TREE_TYPE (op4));
3440 dump_printf (MSG_NOTE, "\n");
3443 return NULL;
3446 /* Check whether it's ok to change the order of the computation.
3447 Generally, when vectorizing a reduction we change the order of the
3448 computation. This may change the behavior of the program in some
3449 cases, so we need to check that this is ok. One exception is when
3450 vectorizing an outer-loop: the inner-loop is executed sequentially,
3451 and therefore vectorizing reductions in the inner-loop during
3452 outer-loop vectorization is safe. */
3453 if (check_reduction
3454 && *v_reduc_type == TREE_CODE_REDUCTION
3455 && needs_fold_left_reduction_p (type, code,
3456 need_wrapping_integral_overflow))
3457 *v_reduc_type = FOLD_LEFT_REDUCTION;
3459 /* Reduction is safe. We're dealing with one of the following:
3460 1) integer arithmetic and no trapv
3461 2) floating point arithmetic, and special flags permit this optimization
3462 3) nested cycle (i.e., outer loop vectorization). */
3463 if (TREE_CODE (op1) == SSA_NAME)
3464 def1 = SSA_NAME_DEF_STMT (op1);
3466 if (TREE_CODE (op2) == SSA_NAME)
3467 def2 = SSA_NAME_DEF_STMT (op2);
3469 if (code != COND_EXPR
3470 && ((!def1 || gimple_nop_p (def1)) && (!def2 || gimple_nop_p (def2))))
3472 if (dump_enabled_p ())
3473 report_vect_op (MSG_NOTE, def_stmt, "reduction: no defs for operands: ");
3474 return NULL;
3477 /* Check that one def is the reduction def, defined by PHI,
3478 the other def is either defined in the loop ("vect_internal_def"),
3479 or it's an induction (defined by a loop-header phi-node). */
3481 if (def2 && def2 == phi
3482 && (code == COND_EXPR
3483 || !def1 || gimple_nop_p (def1)
3484 || !flow_bb_inside_loop_p (loop, gimple_bb (def1))
3485 || (def1 && flow_bb_inside_loop_p (loop, gimple_bb (def1))
3486 && (is_gimple_assign (def1)
3487 || is_gimple_call (def1)
3488 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1))
3489 == vect_induction_def
3490 || (gimple_code (def1) == GIMPLE_PHI
3491 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1))
3492 == vect_internal_def
3493 && !is_loop_header_bb_p (gimple_bb (def1)))))))
3495 if (dump_enabled_p ())
3496 report_vect_op (MSG_NOTE, def_stmt, "detected reduction: ");
3497 return def_stmt;
3500 if (def1 && def1 == phi
3501 && (code == COND_EXPR
3502 || !def2 || gimple_nop_p (def2)
3503 || !flow_bb_inside_loop_p (loop, gimple_bb (def2))
3504 || (def2 && flow_bb_inside_loop_p (loop, gimple_bb (def2))
3505 && (is_gimple_assign (def2)
3506 || is_gimple_call (def2)
3507 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2))
3508 == vect_induction_def
3509 || (gimple_code (def2) == GIMPLE_PHI
3510 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2))
3511 == vect_internal_def
3512 && !is_loop_header_bb_p (gimple_bb (def2)))))))
3514 if (! nested_in_vect_loop && orig_code != MINUS_EXPR)
3516 /* Check if we can swap operands (just for simplicity - so that
3517 the rest of the code can assume that the reduction variable
3518 is always the last (second) argument). */
3519 if (code == COND_EXPR)
3521 /* Swap cond_expr by inverting the condition. */
3522 tree cond_expr = gimple_assign_rhs1 (def_stmt);
3523 enum tree_code invert_code = ERROR_MARK;
3524 enum tree_code cond_code = TREE_CODE (cond_expr);
3526 if (TREE_CODE_CLASS (cond_code) == tcc_comparison)
3528 bool honor_nans = HONOR_NANS (TREE_OPERAND (cond_expr, 0));
3529 invert_code = invert_tree_comparison (cond_code, honor_nans);
3531 if (invert_code != ERROR_MARK)
3533 TREE_SET_CODE (cond_expr, invert_code);
3534 swap_ssa_operands (def_stmt,
3535 gimple_assign_rhs2_ptr (def_stmt),
3536 gimple_assign_rhs3_ptr (def_stmt));
3538 else
3540 if (dump_enabled_p ())
3541 report_vect_op (MSG_NOTE, def_stmt,
3542 "detected reduction: cannot swap operands "
3543 "for cond_expr");
3544 return NULL;
3547 else
3548 swap_ssa_operands (def_stmt, gimple_assign_rhs1_ptr (def_stmt),
3549 gimple_assign_rhs2_ptr (def_stmt));
3551 if (dump_enabled_p ())
3552 report_vect_op (MSG_NOTE, def_stmt,
3553 "detected reduction: need to swap operands: ");
3555 if (CONSTANT_CLASS_P (gimple_assign_rhs1 (def_stmt)))
3556 LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true;
3558 else
3560 if (dump_enabled_p ())
3561 report_vect_op (MSG_NOTE, def_stmt, "detected reduction: ");
3564 return def_stmt;
3567 /* Try to find SLP reduction chain. */
3568 if (! nested_in_vect_loop
3569 && code != COND_EXPR
3570 && orig_code != MINUS_EXPR
3571 && vect_is_slp_reduction (loop_info, phi, def_stmt))
3573 if (dump_enabled_p ())
3574 report_vect_op (MSG_NOTE, def_stmt,
3575 "reduction: detected reduction chain: ");
3577 return def_stmt;
3580 /* Dissolve a group possibly half-built by vect_is_slp_reduction. */
3581 gimple *first = GROUP_FIRST_ELEMENT (vinfo_for_stmt (def_stmt));
3582 while (first)
3584 gimple *next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (first));
3585 GROUP_FIRST_ELEMENT (vinfo_for_stmt (first)) = NULL;
3586 GROUP_NEXT_ELEMENT (vinfo_for_stmt (first)) = NULL;
3587 first = next;
3590 /* Look for the expression computing loop_arg from loop PHI result. */
3591 if (check_reduction_path (vect_location, loop, as_a <gphi *> (phi), loop_arg,
3592 code))
3593 return def_stmt;
3595 if (dump_enabled_p ())
3597 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3598 "reduction: unknown pattern: ");
3601 return NULL;
3604 /* Wrapper around vect_is_simple_reduction, which will modify code
3605 in-place if it enables detection of more reductions. Arguments
3606 as there. */
3608 gimple *
3609 vect_force_simple_reduction (loop_vec_info loop_info, gimple *phi,
3610 bool *double_reduc,
3611 bool need_wrapping_integral_overflow)
3613 enum vect_reduction_type v_reduc_type;
3614 gimple *def = vect_is_simple_reduction (loop_info, phi, double_reduc,
3615 need_wrapping_integral_overflow,
3616 &v_reduc_type);
3617 if (def)
3619 stmt_vec_info reduc_def_info = vinfo_for_stmt (phi);
3620 STMT_VINFO_REDUC_TYPE (reduc_def_info) = v_reduc_type;
3621 STMT_VINFO_REDUC_DEF (reduc_def_info) = def;
3622 reduc_def_info = vinfo_for_stmt (def);
3623 STMT_VINFO_REDUC_TYPE (reduc_def_info) = v_reduc_type;
3624 STMT_VINFO_REDUC_DEF (reduc_def_info) = phi;
3626 return def;
3629 /* Calculate cost of peeling the loop PEEL_ITERS_PROLOGUE times. */
3631 vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue,
3632 int *peel_iters_epilogue,
3633 stmt_vector_for_cost *scalar_cost_vec,
3634 stmt_vector_for_cost *prologue_cost_vec,
3635 stmt_vector_for_cost *epilogue_cost_vec)
3637 int retval = 0;
3638 int assumed_vf = vect_vf_for_cost (loop_vinfo);
3640 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
3642 *peel_iters_epilogue = assumed_vf / 2;
3643 if (dump_enabled_p ())
3644 dump_printf_loc (MSG_NOTE, vect_location,
3645 "cost model: epilogue peel iters set to vf/2 "
3646 "because loop iterations are unknown.\n");
3648 /* If peeled iterations are known but the number of scalar loop
3649 iterations is unknown, count a taken branch per peeled loop. */
3650 retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
3651 NULL, 0, vect_prologue);
3652 retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
3653 NULL, 0, vect_epilogue);
3655 else
3657 int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
3658 peel_iters_prologue = niters < peel_iters_prologue ?
3659 niters : peel_iters_prologue;
3660 *peel_iters_epilogue = (niters - peel_iters_prologue) % assumed_vf;
3661 /* If we need to peel for gaps, but no peeling is required, we have to
3662 peel VF iterations. */
3663 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) && !*peel_iters_epilogue)
3664 *peel_iters_epilogue = assumed_vf;
3667 stmt_info_for_cost *si;
3668 int j;
3669 if (peel_iters_prologue)
3670 FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
3672 stmt_vec_info stmt_info
3673 = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
3674 retval += record_stmt_cost (prologue_cost_vec,
3675 si->count * peel_iters_prologue,
3676 si->kind, stmt_info, si->misalign,
3677 vect_prologue);
3679 if (*peel_iters_epilogue)
3680 FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
3682 stmt_vec_info stmt_info
3683 = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
3684 retval += record_stmt_cost (epilogue_cost_vec,
3685 si->count * *peel_iters_epilogue,
3686 si->kind, stmt_info, si->misalign,
3687 vect_epilogue);
3690 return retval;
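/* Worked example (illustrative numbers): for a known trip count of 100, an
   assumed vectorization factor of 8 and 3 prologue iterations peeled for
   alignment, the code above computes

       peel_iters_epilogue = (100 - 3) % 8 = 1

   and charges 3 resp. 1 copies of the scalar iteration cost to the
   prologue resp. epilogue cost vectors; with an unknown trip count the
   epilogue peel count is assumed to be vf/2 = 4 and a taken branch is
   charged for each of the two guards.  */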
3693 /* Function vect_estimate_min_profitable_iters
3695 Return the number of iterations required for the vector version of the
3696 loop to be profitable relative to the cost of the scalar version of the
3697 loop.
3699 *RET_MIN_PROFITABLE_NITERS is a cost model profitability threshold
3700 of iterations for vectorization. -1 value means loop vectorization
3701 is not profitable. This returned value may be used for dynamic
3702 profitability check.
3704 *RET_MIN_PROFITABLE_ESTIMATE is a profitability threshold to be used
3705 for static check against estimated number of iterations. */
3707 static void
3708 vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
3709 int *ret_min_profitable_niters,
3710 int *ret_min_profitable_estimate)
3712 int min_profitable_iters;
3713 int min_profitable_estimate;
3714 int peel_iters_prologue;
3715 int peel_iters_epilogue;
3716 unsigned vec_inside_cost = 0;
3717 int vec_outside_cost = 0;
3718 unsigned vec_prologue_cost = 0;
3719 unsigned vec_epilogue_cost = 0;
3720 int scalar_single_iter_cost = 0;
3721 int scalar_outside_cost = 0;
3722 int assumed_vf = vect_vf_for_cost (loop_vinfo);
3723 int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
3724 void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3726 /* Cost model disabled. */
3727 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
3729 dump_printf_loc (MSG_NOTE, vect_location, "cost model disabled.\n");
3730 *ret_min_profitable_niters = 0;
3731 *ret_min_profitable_estimate = 0;
3732 return;
3735 /* Requires loop versioning tests to handle misalignment. */
3736 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
3738 /* FIXME: Make cost depend on complexity of individual check. */
3739 unsigned len = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ();
3740 (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
3741 vect_prologue);
3742 dump_printf (MSG_NOTE,
3743 "cost model: Adding cost of checks for loop "
3744 "versioning to treat misalignment.\n");
3747 /* Requires loop versioning with alias checks. */
3748 if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
3750 /* FIXME: Make cost depend on complexity of individual check. */
3751 unsigned len = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).length ();
3752 (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
3753 vect_prologue);
3754 len = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).length ();
3755 if (len)
3756 /* Count LEN - 1 ANDs and LEN comparisons. */
3757 (void) add_stmt_cost (target_cost_data, len * 2 - 1, scalar_stmt,
3758 NULL, 0, vect_prologue);
3759 len = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).length ();
3760 if (len)
3762 /* Count LEN - 1 ANDs and LEN comparisons. */
3763 unsigned int nstmts = len * 2 - 1;
3764 /* +1 for each bias that needs adding. */
3765 for (unsigned int i = 0; i < len; ++i)
3766 if (!LOOP_VINFO_LOWER_BOUNDS (loop_vinfo)[i].unsigned_p)
3767 nstmts += 1;
3768 (void) add_stmt_cost (target_cost_data, nstmts, scalar_stmt,
3769 NULL, 0, vect_prologue);
3771 dump_printf (MSG_NOTE,
3772 "cost model: Adding cost of checks for loop "
3773 "versioning aliasing.\n");
3776 /* Requires loop versioning with niter checks. */
3777 if (LOOP_REQUIRES_VERSIONING_FOR_NITERS (loop_vinfo))
3779 /* FIXME: Make cost depend on complexity of individual check. */
3780 (void) add_stmt_cost (target_cost_data, 1, vector_stmt, NULL, 0,
3781 vect_prologue);
3782 dump_printf (MSG_NOTE,
3783 "cost model: Adding cost of checks for loop "
3784 "versioning niters.\n");
3787 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
3788 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken, NULL, 0,
3789 vect_prologue);
3791 /* Count statements in scalar loop. Using this as scalar cost for a single
3792 iteration for now.
3794 TODO: Add outer loop support.
3796 TODO: Consider assigning different costs to different scalar
3797 statements. */
3799 scalar_single_iter_cost
3800 = LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo);
3802 /* Add additional cost for the peeled instructions in prologue and epilogue
3803 loop. (For fully-masked loops there will be no peeling.)
3805 FORNOW: If we don't know the value of peel_iters for prologue or epilogue
3806 at compile-time - we assume it's vf/2 (the worst would be vf-1).
3808 TODO: Build an expression that represents peel_iters for prologue and
3809 epilogue to be used in a run-time test. */
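/* Three cases are handled below: a fully-masked loop peels nothing (though
   peeling for gaps still charges one scalar epilogue iteration); an unknown
   peel count is approximated by vf/2 iterations for both prologue and
   epilogue plus the guard branches; a known peel count uses the exact
   costs computed by vect_get_known_peeling_cost.  */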
3811 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
3813 peel_iters_prologue = 0;
3814 peel_iters_epilogue = 0;
3816 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
3818 /* We need to peel exactly one iteration. */
3819 peel_iters_epilogue += 1;
3820 stmt_info_for_cost *si;
3821 int j;
3822 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
3823 j, si)
3825 struct _stmt_vec_info *stmt_info
3826 = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
3827 (void) add_stmt_cost (target_cost_data, si->count,
3828 si->kind, stmt_info, si->misalign,
3829 vect_epilogue);
3833 else if (npeel < 0)
3835 peel_iters_prologue = assumed_vf / 2;
3836 dump_printf (MSG_NOTE, "cost model: "
3837 "prologue peel iters set to vf/2.\n");
3839 /* If peeling for alignment is unknown, the loop bound of the main loop
3840 becomes unknown. */
3841 peel_iters_epilogue = assumed_vf / 2;
3842 dump_printf (MSG_NOTE, "cost model: "
3843 "epilogue peel iters set to vf/2 because "
3844 "peeling for alignment is unknown.\n");
3846 /* If peeled iterations are unknown, count a taken branch and a not taken
3847 branch per peeled loop. Even if scalar loop iterations are known,
3848 vector iterations are not known since peeled prologue iterations are
3849 not known. Hence guards remain the same. */
3850 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
3851 NULL, 0, vect_prologue);
3852 (void) add_stmt_cost (target_cost_data, 1, cond_branch_not_taken,
3853 NULL, 0, vect_prologue);
3854 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
3855 NULL, 0, vect_epilogue);
3856 (void) add_stmt_cost (target_cost_data, 1, cond_branch_not_taken,
3857 NULL, 0, vect_epilogue);
3858 stmt_info_for_cost *si;
3859 int j;
3860 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
3862 struct _stmt_vec_info *stmt_info
3863 = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
3864 (void) add_stmt_cost (target_cost_data,
3865 si->count * peel_iters_prologue,
3866 si->kind, stmt_info, si->misalign,
3867 vect_prologue);
3868 (void) add_stmt_cost (target_cost_data,
3869 si->count * peel_iters_epilogue,
3870 si->kind, stmt_info, si->misalign,
3871 vect_epilogue);
3874 else
3876 stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
3877 stmt_info_for_cost *si;
3878 int j;
3879 void *data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3881 prologue_cost_vec.create (2);
3882 epilogue_cost_vec.create (2);
3883 peel_iters_prologue = npeel;
3885 (void) vect_get_known_peeling_cost (loop_vinfo, peel_iters_prologue,
3886 &peel_iters_epilogue,
3887 &LOOP_VINFO_SCALAR_ITERATION_COST
3888 (loop_vinfo),
3889 &prologue_cost_vec,
3890 &epilogue_cost_vec);
3892 FOR_EACH_VEC_ELT (prologue_cost_vec, j, si)
3894 struct _stmt_vec_info *stmt_info
3895 = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
3896 (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
3897 si->misalign, vect_prologue);
3900 FOR_EACH_VEC_ELT (epilogue_cost_vec, j, si)
3902 struct _stmt_vec_info *stmt_info
3903 = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
3904 (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
3905 si->misalign, vect_epilogue);
3908 prologue_cost_vec.release ();
3909 epilogue_cost_vec.release ();
3912 /* FORNOW: The scalar outside cost is incremented in one of the
3913 following ways:
3915 1. The vectorizer checks for alignment and aliasing and generates
3916 a condition that allows dynamic vectorization. A cost model
3917 check is ANDED with the versioning condition. Hence scalar code
3918 path now has the added cost of the versioning check.
3920 if (cost > th & versioning_check)
3921 jmp to vector code
3923 Hence the run-time scalar cost is incremented by a not-taken branch cost.
3925 2. The vectorizer then checks if a prologue is required. If the
3926 cost model check was not done before during versioning, it has to
3927 be done before the prologue check.
3929 if (cost <= th)
3930 prologue = scalar_iters
3931 if (prologue == 0)
3932 jmp to vector code
3933 else
3934 execute prologue
3935 if (prologue == num_iters)
3936 go to exit
3938 Hence the run-time scalar cost is incremented by a taken branch,
3939 plus a not-taken branch, plus a taken branch cost.
3941 3. The vectorizer then checks if an epilogue is required. If the
3942 cost model check was not done before during prologue check, it
3943 has to be done with the epilogue check.
3945 if (prologue == 0)
3946 jmp to vector code
3947 else
3948 execute prologue
3949 if (prologue == num_iters)
3950 go to exit
3951 vector code:
3952 if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
3953 jmp to epilogue
3955 Hence the run-time scalar cost should be incremented by 2 taken
3956 branches.
3958 TODO: The back end may reorder the BBs differently and reverse
3959 conditions/branch directions. Change the estimates below to
3960 something more reasonable. */
3962 /* If the number of iterations is known and we do not do versioning, we can
3963 decide whether to vectorize at compile time. Hence the scalar version
3964 does not carry cost model guard costs. */
3965 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
3966 || LOOP_REQUIRES_VERSIONING (loop_vinfo))
3968 /* Cost model check occurs at versioning. */
3969 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
3970 scalar_outside_cost += vect_get_stmt_cost (cond_branch_not_taken);
3971 else
3973 /* Cost model check occurs at prologue generation. */
3974 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
3975 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken)
3976 + vect_get_stmt_cost (cond_branch_not_taken);
3977 /* Cost model check occurs at epilogue generation. */
3978 else
3979 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken);
3983 /* Complete the target-specific cost calculations. */
3984 finish_cost (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo), &vec_prologue_cost,
3985 &vec_inside_cost, &vec_epilogue_cost);
3987 vec_outside_cost = (int)(vec_prologue_cost + vec_epilogue_cost);
3989 if (dump_enabled_p ())
3991 dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
3992 dump_printf (MSG_NOTE, " Vector inside of loop cost: %d\n",
3993 vec_inside_cost);
3994 dump_printf (MSG_NOTE, " Vector prologue cost: %d\n",
3995 vec_prologue_cost);
3996 dump_printf (MSG_NOTE, " Vector epilogue cost: %d\n",
3997 vec_epilogue_cost);
3998 dump_printf (MSG_NOTE, " Scalar iteration cost: %d\n",
3999 scalar_single_iter_cost);
4000 dump_printf (MSG_NOTE, " Scalar outside cost: %d\n",
4001 scalar_outside_cost);
4002 dump_printf (MSG_NOTE, " Vector outside cost: %d\n",
4003 vec_outside_cost);
4004 dump_printf (MSG_NOTE, " prologue iterations: %d\n",
4005 peel_iters_prologue);
4006 dump_printf (MSG_NOTE, " epilogue iterations: %d\n",
4007 peel_iters_epilogue);
4010 /* Calculate number of iterations required to make the vector version
4011 profitable, relative to the loop bodies only. The following condition
4012 must hold true:
4013 SIC * niters + SOC > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC
4014 where
4015 SIC = scalar iteration cost, VIC = vector iteration cost,
4016 VOC = vector outside cost, VF = vectorization factor,
4017 PL_ITERS = prologue iterations, EP_ITERS= epilogue iterations
4018 SOC = scalar outside cost for run time cost model check. */
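/* Rearranging the condition above (after multiplying through by VF) gives

     niters > ((VOC - SOC) * VF - VIC * (PL_ITERS + EP_ITERS))
              / (SIC * VF - VIC)

   which is what the code below computes, rounding the division up so that
   the inequality holds strictly.  The guard first checks that the scalar
   cost of a vector iteration's worth of work really exceeds the vector
   body cost, otherwise the loop can never be profitable.  */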
4020 if ((scalar_single_iter_cost * assumed_vf) > (int) vec_inside_cost)
4022 min_profitable_iters = ((vec_outside_cost - scalar_outside_cost)
4023 * assumed_vf
4024 - vec_inside_cost * peel_iters_prologue
4025 - vec_inside_cost * peel_iters_epilogue);
4026 if (min_profitable_iters <= 0)
4027 min_profitable_iters = 0;
4028 else
4030 min_profitable_iters /= ((scalar_single_iter_cost * assumed_vf)
4031 - vec_inside_cost);
4033 if ((scalar_single_iter_cost * assumed_vf * min_profitable_iters)
4034 <= (((int) vec_inside_cost * min_profitable_iters)
4035 + (((int) vec_outside_cost - scalar_outside_cost)
4036 * assumed_vf)))
4037 min_profitable_iters++;
4040 /* The vector version will never be profitable. */
4041 else
4043 if (LOOP_VINFO_LOOP (loop_vinfo)->force_vectorize)
4044 warning_at (vect_location, OPT_Wopenmp_simd, "vectorization "
4045 "did not happen for a simd loop");
4047 if (dump_enabled_p ())
4048 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4049 "cost model: the vector iteration cost = %d "
4050 "divided by the scalar iteration cost = %d "
4051 "is greater or equal to the vectorization factor = %d"
4052 ".\n",
4053 vec_inside_cost, scalar_single_iter_cost, assumed_vf);
4054 *ret_min_profitable_niters = -1;
4055 *ret_min_profitable_estimate = -1;
4056 return;
4059 dump_printf (MSG_NOTE,
4060 " Calculated minimum iters for profitability: %d\n",
4061 min_profitable_iters);
4063 if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
4064 && min_profitable_iters < (assumed_vf + peel_iters_prologue))
4065 /* We want the vectorized loop to execute at least once. */
4066 min_profitable_iters = assumed_vf + peel_iters_prologue;
4068 if (dump_enabled_p ())
4069 dump_printf_loc (MSG_NOTE, vect_location,
4070 " Runtime profitability threshold = %d\n",
4071 min_profitable_iters);
4073 *ret_min_profitable_niters = min_profitable_iters;
4075 /* Calculate number of iterations required to make the vector version
4076 profitable, relative to the loop bodies only.
4078 The non-vectorized variant costs SIC * niters and it must win over the
4079 vector variant on the expected loop trip count. The following condition must hold true:
4080 SIC * niters > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC + SOC */
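/* Solving for niters as before gives

     niters > ((VOC + SOC) * VF - VIC * (PL_ITERS + EP_ITERS))
              / (SIC * VF - VIC)

   which the division below computes; note that SOC now appears on the
   vector side of the inequality, as in the condition above.  */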
4082 if (vec_outside_cost <= 0)
4083 min_profitable_estimate = 0;
4084 else
4086 min_profitable_estimate = ((vec_outside_cost + scalar_outside_cost)
4087 * assumed_vf
4088 - vec_inside_cost * peel_iters_prologue
4089 - vec_inside_cost * peel_iters_epilogue)
4090 / ((scalar_single_iter_cost * assumed_vf)
4091 - vec_inside_cost);
4093 min_profitable_estimate = MAX (min_profitable_estimate, min_profitable_iters);
4094 if (dump_enabled_p ())
4095 dump_printf_loc (MSG_NOTE, vect_location,
4096 " Static estimate profitability threshold = %d\n",
4097 min_profitable_estimate);
4099 *ret_min_profitable_estimate = min_profitable_estimate;
4102 /* Writes into SEL a mask for a vec_perm, equivalent to a vec_shr by OFFSET
4103 vector elements (not bits) for a vector with NELT elements. */
4104 static void
4105 calc_vec_perm_mask_for_shift (unsigned int offset, unsigned int nelt,
4106 vec_perm_builder *sel)
4108 /* The encoding is a single stepped pattern. Any wrap-around is handled
4109 by vec_perm_indices. */
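  /* For example, with NELT == 8 and OFFSET == 2 the three encoded elements
     are {2, 3, 4}, which vec_perm_indices extends linearly to
     {2, 3, 4, 5, 6, 7, 8, 9}, i.e. a whole-vector shift by two elements.  */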
4110 sel->new_vector (nelt, 1, 3);
4111 for (unsigned int i = 0; i < 3; i++)
4112 sel->quick_push (i + offset);
4115 /* Checks whether the target supports whole-vector shifts for vectors of mode
4116 MODE. This is the case if _either_ the platform handles vec_shr_optab, _or_
4117 it supports vec_perm_const with masks for all necessary shift amounts. */
4118 static bool
4119 have_whole_vector_shift (machine_mode mode)
4121 if (optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
4122 return true;
4124 /* Variable-length vectors should be handled via the optab. */
4125 unsigned int nelt;
4126 if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
4127 return false;
4129 vec_perm_builder sel;
4130 vec_perm_indices indices;
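  /* Only the power-of-two shift amounts nelt/2, nelt/4, ..., 1 are used by
     the reduction epilogue, so those are the masks checked here.  */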
4131 for (unsigned int i = nelt / 2; i >= 1; i /= 2)
4133 calc_vec_perm_mask_for_shift (i, nelt, &sel);
4134 indices.new_vector (sel, 2, nelt);
4135 if (!can_vec_perm_const_p (mode, indices, false))
4136 return false;
4138 return true;
4141 /* TODO: Close dependency between vect_model_*_cost and vectorizable_*
4142 functions. Design better to avoid maintenance issues. */
4144 /* Function vect_model_reduction_cost.
4146 Models cost for a reduction operation, including the vector ops
4147 generated within the strip-mine loop, the initial definition before
4148 the loop, and the epilogue code that must be generated. */
4150 static void
4151 vect_model_reduction_cost (stmt_vec_info stmt_info, internal_fn reduc_fn,
4152 int ncopies)
4154 int prologue_cost = 0, epilogue_cost = 0, inside_cost;
4155 enum tree_code code;
4156 optab optab;
4157 tree vectype;
4158 gimple *orig_stmt;
4159 machine_mode mode;
4160 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4161 struct loop *loop = NULL;
4162 void *target_cost_data;
4164 if (loop_vinfo)
4166 loop = LOOP_VINFO_LOOP (loop_vinfo);
4167 target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
4169 else
4170 target_cost_data = BB_VINFO_TARGET_COST_DATA (STMT_VINFO_BB_VINFO (stmt_info));
4172 /* Condition reductions generate two reductions in the loop. */
4173 vect_reduction_type reduction_type
4174 = STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info);
4175 if (reduction_type == COND_REDUCTION)
4176 ncopies *= 2;
4178 vectype = STMT_VINFO_VECTYPE (stmt_info);
4179 mode = TYPE_MODE (vectype);
4180 orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
4182 if (!orig_stmt)
4183 orig_stmt = STMT_VINFO_STMT (stmt_info);
4185 code = gimple_assign_rhs_code (orig_stmt);
4187 if (reduction_type == EXTRACT_LAST_REDUCTION
4188 || reduction_type == FOLD_LEFT_REDUCTION)
4190 /* No extra instructions needed in the prologue. */
4191 prologue_cost = 0;
4193 if (reduction_type == EXTRACT_LAST_REDUCTION || reduc_fn != IFN_LAST)
4194 /* Count one reduction-like operation per vector. */
4195 inside_cost = add_stmt_cost (target_cost_data, ncopies, vec_to_scalar,
4196 stmt_info, 0, vect_body);
4197 else
4199 /* Use NELEMENTS extracts and NELEMENTS scalar ops. */
4200 unsigned int nelements = ncopies * vect_nunits_for_cost (vectype);
4201 inside_cost = add_stmt_cost (target_cost_data, nelements,
4202 vec_to_scalar, stmt_info, 0,
4203 vect_body);
4204 inside_cost += add_stmt_cost (target_cost_data, nelements,
4205 scalar_stmt, stmt_info, 0,
4206 vect_body);
4209 else
4211 /* Add in cost for initial definition.
4212 For cond reduction we have four vectors: initial index, step,
4213 initial result of the data reduction, initial value of the index
4214 reduction. */
4215 int prologue_stmts = reduction_type == COND_REDUCTION ? 4 : 1;
4216 prologue_cost += add_stmt_cost (target_cost_data, prologue_stmts,
4217 scalar_to_vec, stmt_info, 0,
4218 vect_prologue);
4220 /* Cost of reduction op inside loop. */
4221 inside_cost = add_stmt_cost (target_cost_data, ncopies, vector_stmt,
4222 stmt_info, 0, vect_body);
4225 /* Determine cost of epilogue code.
4227 We have a reduction operator that will reduce the vector in one statement.
4228 Also requires scalar extract. */
4230 if (!loop || !nested_in_vect_loop_p (loop, orig_stmt))
4232 if (reduc_fn != IFN_LAST)
4234 if (reduction_type == COND_REDUCTION)
4236 /* An EQ stmt and a COND_EXPR stmt. */
4237 epilogue_cost += add_stmt_cost (target_cost_data, 2,
4238 vector_stmt, stmt_info, 0,
4239 vect_epilogue);
4240 /* Reduction of the max index and a reduction of the found
4241 values. */
4242 epilogue_cost += add_stmt_cost (target_cost_data, 2,
4243 vec_to_scalar, stmt_info, 0,
4244 vect_epilogue);
4245 /* A broadcast of the max value. */
4246 epilogue_cost += add_stmt_cost (target_cost_data, 1,
4247 scalar_to_vec, stmt_info, 0,
4248 vect_epilogue);
4250 else
4252 epilogue_cost += add_stmt_cost (target_cost_data, 1, vector_stmt,
4253 stmt_info, 0, vect_epilogue);
4254 epilogue_cost += add_stmt_cost (target_cost_data, 1,
4255 vec_to_scalar, stmt_info, 0,
4256 vect_epilogue);
4259 else if (reduction_type == COND_REDUCTION)
4261 unsigned estimated_nunits = vect_nunits_for_cost (vectype);
4262 /* Extraction of scalar elements. */
4263 epilogue_cost += add_stmt_cost (target_cost_data,
4264 2 * estimated_nunits,
4265 vec_to_scalar, stmt_info, 0,
4266 vect_epilogue);
4267 /* Scalar max reductions via COND_EXPR / MAX_EXPR. */
4268 epilogue_cost += add_stmt_cost (target_cost_data,
4269 2 * estimated_nunits - 3,
4270 scalar_stmt, stmt_info, 0,
4271 vect_epilogue);
4273 else if (reduction_type == EXTRACT_LAST_REDUCTION
4274 || reduction_type == FOLD_LEFT_REDUCTION)
4275 /* No extra instructions needed in the epilogue. */
4277 else
4279 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
4280 tree bitsize =
4281 TYPE_SIZE (TREE_TYPE (gimple_assign_lhs (orig_stmt)));
4282 int element_bitsize = tree_to_uhwi (bitsize);
4283 int nelements = vec_size_in_bits / element_bitsize;
4285 if (code == COND_EXPR)
4286 code = MAX_EXPR;
4288 optab = optab_for_tree_code (code, vectype, optab_default);
4290 /* We have a whole vector shift available. */
4291 if (optab != unknown_optab
4292 && VECTOR_MODE_P (mode)
4293 && optab_handler (optab, mode) != CODE_FOR_nothing
4294 && have_whole_vector_shift (mode))
4296 /* Final reduction via vector shifts and the reduction operator.
4297 Also requires scalar extract. */
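          /* That is, log2 (NELEMENTS) shifts and log2 (NELEMENTS) reduction
             ops in the epilogue, followed by the single extract below.  */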
4298 epilogue_cost += add_stmt_cost (target_cost_data,
4299 exact_log2 (nelements) * 2,
4300 vector_stmt, stmt_info, 0,
4301 vect_epilogue);
4302 epilogue_cost += add_stmt_cost (target_cost_data, 1,
4303 vec_to_scalar, stmt_info, 0,
4304 vect_epilogue);
4306 else
4307 /* Use extracts and reduction op for final reduction. For N
4308 elements, we have N extracts and N-1 reduction ops. */
4309 epilogue_cost += add_stmt_cost (target_cost_data,
4310 nelements + nelements - 1,
4311 vector_stmt, stmt_info, 0,
4312 vect_epilogue);
4316 if (dump_enabled_p ())
4317 dump_printf (MSG_NOTE,
4318 "vect_model_reduction_cost: inside_cost = %d, "
4319 "prologue_cost = %d, epilogue_cost = %d .\n", inside_cost,
4320 prologue_cost, epilogue_cost);
4324 /* Function vect_model_induction_cost.
4326 Models cost for induction operations. */
4328 static void
4329 vect_model_induction_cost (stmt_vec_info stmt_info, int ncopies)
4331 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4332 void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
4333 unsigned inside_cost, prologue_cost;
4335 if (PURE_SLP_STMT (stmt_info))
4336 return;
4338 /* loop cost for vec_loop. */
4339 inside_cost = add_stmt_cost (target_cost_data, ncopies, vector_stmt,
4340 stmt_info, 0, vect_body);
4342 /* prologue cost for vec_init and vec_step. */
4343 prologue_cost = add_stmt_cost (target_cost_data, 2, scalar_to_vec,
4344 stmt_info, 0, vect_prologue);
4346 if (dump_enabled_p ())
4347 dump_printf_loc (MSG_NOTE, vect_location,
4348 "vect_model_induction_cost: inside_cost = %d, "
4349 "prologue_cost = %d .\n", inside_cost, prologue_cost);
4354 /* Function get_initial_def_for_reduction
4356 Input:
4357 STMT - a stmt that performs a reduction operation in the loop.
4358 INIT_VAL - the initial value of the reduction variable
4360 Output:
4361 ADJUSTMENT_DEF - a tree that holds a value to be added to the final result
4362 of the reduction (used for adjusting the epilog - see below).
4363 Return a vector variable, initialized according to the operation that STMT
4364 performs. This vector will be used as the initial value of the
4365 vector of partial results.
4367 Option1 (adjust in epilog): Initialize the vector as follows:
4368 add/bit or/xor: [0,0,...,0,0]
4369 mult/bit and: [1,1,...,1,1]
4370 min/max/cond_expr: [init_val,init_val,..,init_val,init_val]
4371 and when necessary (e.g. add/mult case) let the caller know
4372 that it needs to adjust the result by init_val.
4374 Option2: Initialize the vector as follows:
4375 add/bit or/xor: [init_val,0,0,...,0]
4376 mult/bit and: [init_val,1,1,...,1]
4377 min/max/cond_expr: [init_val,init_val,...,init_val]
4378 and no adjustments are needed.
4380 For example, for the following code:
4382 s = init_val;
4383 for (i=0;i<n;i++)
4384 s = s + a[i];
4386 STMT is 's = s + a[i]', and the reduction variable is 's'.
4387 For a vector of 4 units, we want to return either [0,0,0,init_val],
4388 or [0,0,0,0] and let the caller know that it needs to adjust
4389 the result at the end by 'init_val'.
4391 FORNOW, we use the 'adjust in epilog' scheme (Option1) when ADJUSTMENT_DEF
4392 is not NULL, because its initialization vector is simpler (the same
4393 element in all entries), and Option2 otherwise.
4395 A cost model should help decide between these two schemes. */
4397 tree
4398 get_initial_def_for_reduction (gimple *stmt, tree init_val,
4399 tree *adjustment_def)
4401 stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
4402 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
4403 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4404 tree scalar_type = TREE_TYPE (init_val);
4405 tree vectype = get_vectype_for_scalar_type (scalar_type);
4406 enum tree_code code = gimple_assign_rhs_code (stmt);
4407 tree def_for_init;
4408 tree init_def;
4409 bool nested_in_vect_loop = false;
4410 REAL_VALUE_TYPE real_init_val = dconst0;
4411 int int_init_val = 0;
4412 gimple *def_stmt = NULL;
4413 gimple_seq stmts = NULL;
4415 gcc_assert (vectype);
4417 gcc_assert (POINTER_TYPE_P (scalar_type) || INTEGRAL_TYPE_P (scalar_type)
4418 || SCALAR_FLOAT_TYPE_P (scalar_type));
4420 if (nested_in_vect_loop_p (loop, stmt))
4421 nested_in_vect_loop = true;
4422 else
4423 gcc_assert (loop == (gimple_bb (stmt))->loop_father);
4425 /* In case of double reduction we only create a vector variable to be put
4426 in the reduction phi node. The actual statement creation is done in
4427 vect_create_epilog_for_reduction. */
4428 if (adjustment_def && nested_in_vect_loop
4429 && TREE_CODE (init_val) == SSA_NAME
4430 && (def_stmt = SSA_NAME_DEF_STMT (init_val))
4431 && gimple_code (def_stmt) == GIMPLE_PHI
4432 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
4433 && vinfo_for_stmt (def_stmt)
4434 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
4435 == vect_double_reduction_def)
4437 *adjustment_def = NULL;
4438 return vect_create_destination_var (init_val, vectype);
4441 vect_reduction_type reduction_type
4442 = STMT_VINFO_VEC_REDUCTION_TYPE (stmt_vinfo);
4444 /* In case of a nested reduction do not use an adjustment def, as
4445 that case is not handled correctly by the epilogue generation
4446 when ncopies is not one. */
4447 if (adjustment_def && nested_in_vect_loop)
4449 *adjustment_def = NULL;
4450 return vect_get_vec_def_for_operand (init_val, stmt);
4453 switch (code)
4455 case WIDEN_SUM_EXPR:
4456 case DOT_PROD_EXPR:
4457 case SAD_EXPR:
4458 case PLUS_EXPR:
4459 case MINUS_EXPR:
4460 case BIT_IOR_EXPR:
4461 case BIT_XOR_EXPR:
4462 case MULT_EXPR:
4463 case BIT_AND_EXPR:
4465 /* ADJUSTMENT_DEF is NULL when called from
4466 vect_create_epilog_for_reduction to vectorize double reduction. */
4467 if (adjustment_def)
4468 *adjustment_def = init_val;
4470 if (code == MULT_EXPR)
4472 real_init_val = dconst1;
4473 int_init_val = 1;
4476 if (code == BIT_AND_EXPR)
4477 int_init_val = -1;
4479 if (SCALAR_FLOAT_TYPE_P (scalar_type))
4480 def_for_init = build_real (scalar_type, real_init_val);
4481 else
4482 def_for_init = build_int_cst (scalar_type, int_init_val);
4484 if (adjustment_def)
4485 /* Option1: the first element is '0' or '1' as well. */
4486 init_def = gimple_build_vector_from_val (&stmts, vectype,
4487 def_for_init);
4488 else if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant ())
4490 /* Option2 (variable length): the first element is INIT_VAL. */
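          /* Build {DEF_FOR_INIT, DEF_FOR_INIT, ...} and then use
             .VEC_SHL_INSERT to shift it away from element 0 and write
             INIT_VAL into element 0.  */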
4491 init_def = build_vector_from_val (vectype, def_for_init);
4492 gcall *call = gimple_build_call_internal (IFN_VEC_SHL_INSERT,
4493 2, init_def, init_val);
4494 init_def = make_ssa_name (vectype);
4495 gimple_call_set_lhs (call, init_def);
4496 gimple_seq_add_stmt (&stmts, call);
4498 else
4500 /* Option2: the first element is INIT_VAL. */
4501 tree_vector_builder elts (vectype, 1, 2);
4502 elts.quick_push (init_val);
4503 elts.quick_push (def_for_init);
4504 init_def = gimple_build_vector (&stmts, &elts);
4507 break;
4509 case MIN_EXPR:
4510 case MAX_EXPR:
4511 case COND_EXPR:
4513 if (adjustment_def)
4515 *adjustment_def = NULL_TREE;
4516 if (reduction_type != COND_REDUCTION
4517 && reduction_type != EXTRACT_LAST_REDUCTION)
4519 init_def = vect_get_vec_def_for_operand (init_val, stmt);
4520 break;
4523 init_val = gimple_convert (&stmts, TREE_TYPE (vectype), init_val);
4524 init_def = gimple_build_vector_from_val (&stmts, vectype, init_val);
4526 break;
4528 default:
4529 gcc_unreachable ();
4532 if (stmts)
4533 gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), stmts);
4534 return init_def;
4537 /* Get at the initial defs for the reduction PHIs in SLP_NODE.
4538 NUMBER_OF_VECTORS is the number of vector defs to create.
4539 If NEUTRAL_OP is nonnull, introducing extra elements of that
4540 value will not change the result. */
4542 static void
4543 get_initial_defs_for_reduction (slp_tree slp_node,
4544 vec<tree> *vec_oprnds,
4545 unsigned int number_of_vectors,
4546 bool reduc_chain, tree neutral_op)
4548 vec<gimple *> stmts = SLP_TREE_SCALAR_STMTS (slp_node);
4549 gimple *stmt = stmts[0];
4550 stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
4551 unsigned HOST_WIDE_INT nunits;
4552 unsigned j, number_of_places_left_in_vector;
4553 tree vector_type;
4554 tree vop;
4555 int group_size = stmts.length ();
4556 unsigned int vec_num, i;
4557 unsigned number_of_copies = 1;
4558 vec<tree> voprnds;
4559 voprnds.create (number_of_vectors);
4560 struct loop *loop;
4561 auto_vec<tree, 16> permute_results;
4563 vector_type = STMT_VINFO_VECTYPE (stmt_vinfo);
4565 gcc_assert (STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_reduction_def);
4567 loop = (gimple_bb (stmt))->loop_father;
4568 gcc_assert (loop);
4569 edge pe = loop_preheader_edge (loop);
4571 gcc_assert (!reduc_chain || neutral_op);
4573 /* NUMBER_OF_COPIES is the number of times we need to use the same values in
4574 created vectors. It is greater than 1 if unrolling is performed.
4576 For example, we have two scalar operands, s1 and s2 (e.g., group of
4577 strided accesses of size two), while NUNITS is four (i.e., four scalars
4578 of this type can be packed in a vector). The output vector will contain
4579 two copies of each scalar operand: {s1, s2, s1, s2}. (NUMBER_OF_COPIES
4580 will be 2).
4582 If GROUP_SIZE > NUNITS, the scalars will be split into several vectors
4583 containing the operands.
4585 For example, NUNITS is four as before, and the group size is 8
4586 (s1, s2, ..., s8). We will create two vectors {s1, s2, s3, s4} and
4587 {s5, s6, s7, s8}. */
4589 if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits))
4590 nunits = group_size;
4592 number_of_copies = nunits * number_of_vectors / group_size;
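  /* For instance, if NUNITS == 4, GROUP_SIZE == 2 and a single vector is
     to be created (as in the first example above), NUMBER_OF_COPIES
     == 4 * 1 / 2 == 2, matching {s1, s2, s1, s2}.  */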
4594 number_of_places_left_in_vector = nunits;
4595 bool constant_p = true;
4596 tree_vector_builder elts (vector_type, nunits, 1);
4597 elts.quick_grow (nunits);
4598 for (j = 0; j < number_of_copies; j++)
4600 for (i = group_size - 1; stmts.iterate (i, &stmt); i--)
4602 tree op;
4603 /* Get the def before the loop. In reduction chain we have only
4604 one initial value. */
4605 if ((j != (number_of_copies - 1)
4606 || (reduc_chain && i != 0))
4607 && neutral_op)
4608 op = neutral_op;
4609 else
4610 op = PHI_ARG_DEF_FROM_EDGE (stmt, pe);
4612 /* Create 'vect_ = {op0,op1,...,opn}'. */
4613 number_of_places_left_in_vector--;
4614 elts[number_of_places_left_in_vector] = op;
4615 if (!CONSTANT_CLASS_P (op))
4616 constant_p = false;
4618 if (number_of_places_left_in_vector == 0)
4620 gimple_seq ctor_seq = NULL;
4621 tree init;
4622 if (constant_p && !neutral_op
4623 ? multiple_p (TYPE_VECTOR_SUBPARTS (vector_type), nunits)
4624 : known_eq (TYPE_VECTOR_SUBPARTS (vector_type), nunits))
4625 /* Build the vector directly from ELTS. */
4626 init = gimple_build_vector (&ctor_seq, &elts);
4627 else if (neutral_op)
4629 /* Build a vector of the neutral value and shift the
4630 other elements into place. */
4631 init = gimple_build_vector_from_val (&ctor_seq, vector_type,
4632 neutral_op);
4633 int k = nunits;
4634 while (k > 0 && elts[k - 1] == neutral_op)
4635 k -= 1;
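                  /* Each .VEC_SHL_INSERT shifts the vector away from lane 0
                     and writes the scalar into lane 0, so inserting the
                     remaining elements from the highest index down to 0
                     leaves ELTS[0] in lane 0.  */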
4636 while (k > 0)
4638 k -= 1;
4639 gcall *call = gimple_build_call_internal
4640 (IFN_VEC_SHL_INSERT, 2, init, elts[k]);
4641 init = make_ssa_name (vector_type);
4642 gimple_call_set_lhs (call, init);
4643 gimple_seq_add_stmt (&ctor_seq, call);
4646 else
4648 /* First time round, duplicate ELTS to fill the
4649 required number of vectors, then cherry pick the
4650 appropriate result for each iteration. */
4651 if (vec_oprnds->is_empty ())
4652 duplicate_and_interleave (&ctor_seq, vector_type, elts,
4653 number_of_vectors,
4654 permute_results);
4655 init = permute_results[number_of_vectors - j - 1];
4657 if (ctor_seq != NULL)
4658 gsi_insert_seq_on_edge_immediate (pe, ctor_seq);
4659 voprnds.quick_push (init);
4661 number_of_places_left_in_vector = nunits;
4662 elts.new_vector (vector_type, nunits, 1);
4663 elts.quick_grow (nunits);
4664 constant_p = true;
4669 /* Since the vectors are created in reverse order, we should reverse
4670 their order here. */
4671 vec_num = voprnds.length ();
4672 for (j = vec_num; j != 0; j--)
4674 vop = voprnds[j - 1];
4675 vec_oprnds->quick_push (vop);
4678 voprnds.release ();
4680 /* If VF is greater than the unrolling factor needed for the SLP
4681 group of stmts, NUMBER_OF_VECTORS to be created is greater than
4682 NUMBER_OF_SCALARS/NUNITS or NUNITS/NUMBER_OF_SCALARS, and hence we have
4683 to replicate the vectors. */
4684 tree neutral_vec = NULL;
4685 while (number_of_vectors > vec_oprnds->length ())
4687 if (neutral_op)
4689 if (!neutral_vec)
4691 gimple_seq ctor_seq = NULL;
4692 neutral_vec = gimple_build_vector_from_val
4693 (&ctor_seq, vector_type, neutral_op);
4694 if (ctor_seq != NULL)
4695 gsi_insert_seq_on_edge_immediate (pe, ctor_seq);
4697 vec_oprnds->quick_push (neutral_vec);
4699 else
4701 for (i = 0; vec_oprnds->iterate (i, &vop) && i < vec_num; i++)
4702 vec_oprnds->quick_push (vop);
4708 /* Function vect_create_epilog_for_reduction
4710 Create code at the loop-epilog to finalize the result of a reduction
4711 computation.
4713 VECT_DEFS is list of vector of partial results, i.e., the lhs's of vector
4714 reduction statements.
4715 STMT is the scalar reduction stmt that is being vectorized.
4716 NCOPIES is > 1 in case the vectorization factor (VF) is bigger than the
4717 number of elements that we can fit in a vectype (nunits). In this case
4718 we have to generate more than one vector stmt - i.e - we need to "unroll"
4719 the vector stmt by a factor VF/nunits. For more details see documentation
4720 in vectorizable_operation.
4721 REDUC_FN is the internal function for the epilog reduction.
4722 REDUCTION_PHIS is a list of the phi-nodes that carry the reduction
4723 computation.
4724 REDUC_INDEX is the index of the operand in the right hand side of the
4725 statement that is defined by REDUCTION_PHI.
4726 DOUBLE_REDUC is TRUE if double reduction phi nodes should be handled.
4727 SLP_NODE is an SLP node containing a group of reduction statements. The
4728 first one in this group is STMT.
4729 INDUC_VAL is for INTEGER_INDUC_COND_REDUCTION the value to use for the case
4730 when the COND_EXPR is never true in the loop. For MAX_EXPR, it needs to
4731 be smaller than any value of the IV in the loop, for MIN_EXPR larger than
4732 any value of the IV in the loop.
4733 INDUC_CODE is the code for epilog reduction if INTEGER_INDUC_COND_REDUCTION.
4734 NEUTRAL_OP is the value given by neutral_op_for_slp_reduction; it is
4735 null if this is not an SLP reduction
4737 This function:
4738 1. Creates the reduction def-use cycles: sets the arguments for
4739 REDUCTION_PHIS:
4740 The loop-entry argument is the vectorized initial-value of the reduction.
4741 The loop-latch argument is taken from VECT_DEFS - the vector of partial
4742 sums.
4743 2. "Reduces" each vector of partial results VECT_DEFS into a single result,
4744 by calling the function specified by REDUC_FN if available, or by
4745 other means (whole-vector shifts or a scalar loop).
4746 The function also creates a new phi node at the loop exit to preserve
4747 loop-closed form, as illustrated below.
4749 The flow at the entry to this function:
4751 loop:
4752 vec_def = phi <null, null> # REDUCTION_PHI
4753 VECT_DEF = vector_stmt # vectorized form of STMT
4754 s_loop = scalar_stmt # (scalar) STMT
4755 loop_exit:
4756 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
4757 use <s_out0>
4758 use <s_out0>
4760 The above is transformed by this function into:
4762 loop:
4763 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
4764 VECT_DEF = vector_stmt # vectorized form of STMT
4765 s_loop = scalar_stmt # (scalar) STMT
4766 loop_exit:
4767 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
4768 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
4769 v_out2 = reduce <v_out1>
4770 s_out3 = extract_field <v_out2, 0>
4771 s_out4 = adjust_result <s_out3>
4772 use <s_out4>
4773 use <s_out4>
4776 static void
4777 vect_create_epilog_for_reduction (vec<tree> vect_defs, gimple *stmt,
4778 gimple *reduc_def_stmt,
4779 int ncopies, internal_fn reduc_fn,
4780 vec<gimple *> reduction_phis,
4781 bool double_reduc,
4782 slp_tree slp_node,
4783 slp_instance slp_node_instance,
4784 tree induc_val, enum tree_code induc_code,
4785 tree neutral_op)
4787 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4788 stmt_vec_info prev_phi_info;
4789 tree vectype;
4790 machine_mode mode;
4791 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4792 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *outer_loop = NULL;
4793 basic_block exit_bb;
4794 tree scalar_dest;
4795 tree scalar_type;
4796 gimple *new_phi = NULL, *phi;
4797 gimple_stmt_iterator exit_gsi;
4798 tree vec_dest;
4799 tree new_temp = NULL_TREE, new_dest, new_name, new_scalar_dest;
4800 gimple *epilog_stmt = NULL;
4801 enum tree_code code = gimple_assign_rhs_code (stmt);
4802 gimple *exit_phi;
4803 tree bitsize;
4804 tree adjustment_def = NULL;
4805 tree vec_initial_def = NULL;
4806 tree expr, def, initial_def = NULL;
4807 tree orig_name, scalar_result;
4808 imm_use_iterator imm_iter, phi_imm_iter;
4809 use_operand_p use_p, phi_use_p;
4810 gimple *use_stmt, *orig_stmt, *reduction_phi = NULL;
4811 bool nested_in_vect_loop = false;
4812 auto_vec<gimple *> new_phis;
4813 auto_vec<gimple *> inner_phis;
4814 enum vect_def_type dt = vect_unknown_def_type;
4815 int j, i;
4816 auto_vec<tree> scalar_results;
4817 unsigned int group_size = 1, k, ratio;
4818 auto_vec<tree> vec_initial_defs;
4819 auto_vec<gimple *> phis;
4820 bool slp_reduc = false;
4821 bool direct_slp_reduc;
4822 tree new_phi_result;
4823 gimple *inner_phi = NULL;
4824 tree induction_index = NULL_TREE;
4826 if (slp_node)
4827 group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
4829 if (nested_in_vect_loop_p (loop, stmt))
4831 outer_loop = loop;
4832 loop = loop->inner;
4833 nested_in_vect_loop = true;
4834 gcc_assert (!slp_node);
4837 vectype = STMT_VINFO_VECTYPE (stmt_info);
4838 gcc_assert (vectype);
4839 mode = TYPE_MODE (vectype);
4841 /* 1. Create the reduction def-use cycle:
4842 Set the arguments of REDUCTION_PHIS, i.e., transform
4844 loop:
4845 vec_def = phi <null, null> # REDUCTION_PHI
4846 VECT_DEF = vector_stmt # vectorized form of STMT
4849 into:
4851 loop:
4852 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
4853 VECT_DEF = vector_stmt # vectorized form of STMT
4856 (in case of SLP, do it for all the phis). */
4858 /* Get the loop-entry arguments. */
4859 enum vect_def_type initial_def_dt = vect_unknown_def_type;
4860 if (slp_node)
4862 unsigned vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
4863 vec_initial_defs.reserve (vec_num);
4864 get_initial_defs_for_reduction (slp_node_instance->reduc_phis,
4865 &vec_initial_defs, vec_num,
4866 GROUP_FIRST_ELEMENT (stmt_info),
4867 neutral_op);
4869 else
4871 /* Get at the scalar def before the loop, that defines the initial value
4872 of the reduction variable. */
4873 gimple *def_stmt;
4874 initial_def = PHI_ARG_DEF_FROM_EDGE (reduc_def_stmt,
4875 loop_preheader_edge (loop));
4876 /* Optimize: if initial_def is for REDUC_MAX smaller than the base
4877 and we can't use zero for induc_val, use initial_def. Similarly
4878 for REDUC_MIN and initial_def larger than the base. */
4879 if (TREE_CODE (initial_def) == INTEGER_CST
4880 && (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
4881 == INTEGER_INDUC_COND_REDUCTION)
4882 && !integer_zerop (induc_val)
4883 && ((induc_code == MAX_EXPR
4884 && tree_int_cst_lt (initial_def, induc_val))
4885 || (induc_code == MIN_EXPR
4886 && tree_int_cst_lt (induc_val, initial_def))))
4887 induc_val = initial_def;
4888 vect_is_simple_use (initial_def, loop_vinfo, &def_stmt, &initial_def_dt);
4889 vec_initial_def = get_initial_def_for_reduction (stmt, initial_def,
4890 &adjustment_def);
4891 vec_initial_defs.create (1);
4892 vec_initial_defs.quick_push (vec_initial_def);
4895 /* Set phi nodes arguments. */
4896 FOR_EACH_VEC_ELT (reduction_phis, i, phi)
4898 tree vec_init_def = vec_initial_defs[i];
4899 tree def = vect_defs[i];
4900 for (j = 0; j < ncopies; j++)
4902 if (j != 0)
4904 phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi));
4905 if (nested_in_vect_loop)
4906 vec_init_def
4907 = vect_get_vec_def_for_stmt_copy (initial_def_dt,
4908 vec_init_def);
4911 /* Set the loop-entry arg of the reduction-phi. */
4913 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
4914 == INTEGER_INDUC_COND_REDUCTION)
4916 /* Initialise the reduction phi to zero. This prevents non-zero
4917 initial values from interfering with the reduction op. */
4918 gcc_assert (ncopies == 1);
4919 gcc_assert (i == 0);
4921 tree vec_init_def_type = TREE_TYPE (vec_init_def);
4922 tree induc_val_vec
4923 = build_vector_from_val (vec_init_def_type, induc_val);
4925 add_phi_arg (as_a <gphi *> (phi), induc_val_vec,
4926 loop_preheader_edge (loop), UNKNOWN_LOCATION);
4928 else
4929 add_phi_arg (as_a <gphi *> (phi), vec_init_def,
4930 loop_preheader_edge (loop), UNKNOWN_LOCATION);
4932 /* Set the loop-latch arg for the reduction-phi. */
4933 if (j > 0)
4934 def = vect_get_vec_def_for_stmt_copy (vect_unknown_def_type, def);
4936 add_phi_arg (as_a <gphi *> (phi), def, loop_latch_edge (loop),
4937 UNKNOWN_LOCATION);
4939 if (dump_enabled_p ())
4941 dump_printf_loc (MSG_NOTE, vect_location,
4942 "transform reduction: created def-use cycle: ");
4943 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
4944 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, SSA_NAME_DEF_STMT (def), 0);
4949 /* For cond reductions we want to create a new vector (INDEX_COND_EXPR)
4950 which is updated with the current index of the loop for every match of
4951 the original loop's cond_expr (VEC_STMT). This results in a vector
4952 containing the last time the condition passed for that vector lane.
4953 The first match will be a 1 to allow 0 to be used for non-matching
4954 indexes. If there are no matches at all then the vector will be all
4955 zeroes. */
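  /* For example, with four lanes the index IV starts at {1, 2, 3, 4} and
     steps by 4 each vector iteration, so each lane ends up holding the
     1-based scalar iteration index of its most recent match, or 0 if that
     lane never matched.  */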
4956 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION)
4958 tree indx_before_incr, indx_after_incr;
4959 poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype);
4961 gimple *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info);
4962 gcc_assert (gimple_assign_rhs_code (vec_stmt) == VEC_COND_EXPR);
4964 int scalar_precision
4965 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (TREE_TYPE (vectype)));
4966 tree cr_index_scalar_type = make_unsigned_type (scalar_precision);
4967 tree cr_index_vector_type = build_vector_type
4968 (cr_index_scalar_type, TYPE_VECTOR_SUBPARTS (vectype));
4970 /* First we create a simple vector induction variable which starts
4971 with the values {1,2,3,...} (SERIES_VECT) and increments by the
4972 vector size (STEP). */
4974 /* Create a {1,2,3,...} vector. */
4975 tree series_vect = build_index_vector (cr_index_vector_type, 1, 1);
4977 /* Create a vector of the step value. */
4978 tree step = build_int_cst (cr_index_scalar_type, nunits_out);
4979 tree vec_step = build_vector_from_val (cr_index_vector_type, step);
4981 /* Create an induction variable. */
4982 gimple_stmt_iterator incr_gsi;
4983 bool insert_after;
4984 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
4985 create_iv (series_vect, vec_step, NULL_TREE, loop, &incr_gsi,
4986 insert_after, &indx_before_incr, &indx_after_incr);
4988 /* Next create a new phi node vector (NEW_PHI_TREE) which starts
4989 filled with zeros (VEC_ZERO). */
4991 /* Create a vector of 0s. */
4992 tree zero = build_zero_cst (cr_index_scalar_type);
4993 tree vec_zero = build_vector_from_val (cr_index_vector_type, zero);
4995 /* Create a vector phi node. */
4996 tree new_phi_tree = make_ssa_name (cr_index_vector_type);
4997 new_phi = create_phi_node (new_phi_tree, loop->header);
4998 set_vinfo_for_stmt (new_phi,
4999 new_stmt_vec_info (new_phi, loop_vinfo));
5000 add_phi_arg (as_a <gphi *> (new_phi), vec_zero,
5001 loop_preheader_edge (loop), UNKNOWN_LOCATION);
5003 /* Now take the condition from the loops original cond_expr
5004 (VEC_STMT) and produce a new cond_expr (INDEX_COND_EXPR) which for
5005 every match uses values from the induction variable
5006 (INDEX_BEFORE_INCR) otherwise uses values from the phi node
5007 (NEW_PHI_TREE).
5008 Finally, we update the phi (NEW_PHI_TREE) to take the value of
5009 the new cond_expr (INDEX_COND_EXPR). */
5011 /* Duplicate the condition from vec_stmt. */
5012 tree ccompare = unshare_expr (gimple_assign_rhs1 (vec_stmt));
5014 /* Create a conditional, where the condition is taken from vec_stmt
5015 (CCOMPARE), then is the induction index (INDEX_BEFORE_INCR) and
5016 else is the phi (NEW_PHI_TREE). */
5017 tree index_cond_expr = build3 (VEC_COND_EXPR, cr_index_vector_type,
5018 ccompare, indx_before_incr,
5019 new_phi_tree);
5020 induction_index = make_ssa_name (cr_index_vector_type);
5021 gimple *index_condition = gimple_build_assign (induction_index,
5022 index_cond_expr);
5023 gsi_insert_before (&incr_gsi, index_condition, GSI_SAME_STMT);
5024 stmt_vec_info index_vec_info = new_stmt_vec_info (index_condition,
5025 loop_vinfo);
5026 STMT_VINFO_VECTYPE (index_vec_info) = cr_index_vector_type;
5027 set_vinfo_for_stmt (index_condition, index_vec_info);
5029 /* Update the phi with the vec cond. */
5030 add_phi_arg (as_a <gphi *> (new_phi), induction_index,
5031 loop_latch_edge (loop), UNKNOWN_LOCATION);
5034 /* 2. Create epilog code.
5035 The reduction epilog code operates across the elements of the vector
5036 of partial results computed by the vectorized loop.
5037 The reduction epilog code consists of:
5039 step 1: compute the scalar result in a vector (v_out2)
5040 step 2: extract the scalar result (s_out3) from the vector (v_out2)
5041 step 3: adjust the scalar result (s_out3) if needed.
5043 Step 1 can be accomplished using one of the following three schemes:
5044 (scheme 1) using reduc_fn, if available.
5045 (scheme 2) using whole-vector shifts, if available.
5046 (scheme 3) using a scalar loop. In this case steps 1+2 above are
5047 combined.
5049 The overall epilog code looks like this:
5051 s_out0 = phi <s_loop> # original EXIT_PHI
5052 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
5053 v_out2 = reduce <v_out1> # step 1
5054 s_out3 = extract_field <v_out2, 0> # step 2
5055 s_out4 = adjust_result <s_out3> # step 3
5057 (step 3 is optional, and steps 1 and 2 may be combined).
5058 Lastly, the uses of s_out0 are replaced by s_out4. */
5061 /* 2.1 Create new loop-exit-phis to preserve loop-closed form:
5062 v_out1 = phi <VECT_DEF>
5063 Store them in NEW_PHIS. */
5065 exit_bb = single_exit (loop)->dest;
5066 prev_phi_info = NULL;
5067 new_phis.create (vect_defs.length ());
5068 FOR_EACH_VEC_ELT (vect_defs, i, def)
5070 for (j = 0; j < ncopies; j++)
5072 tree new_def = copy_ssa_name (def);
5073 phi = create_phi_node (new_def, exit_bb);
5074 set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, loop_vinfo));
5075 if (j == 0)
5076 new_phis.quick_push (phi);
5077 else
5079 def = vect_get_vec_def_for_stmt_copy (dt, def);
5080 STMT_VINFO_RELATED_STMT (prev_phi_info) = phi;
5083 SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, def);
5084 prev_phi_info = vinfo_for_stmt (phi);
5088 /* The epilogue is created for the outer-loop, i.e., for the loop being
5089 vectorized. Create exit phis for the outer loop. */
5090 if (double_reduc)
5092 loop = outer_loop;
5093 exit_bb = single_exit (loop)->dest;
5094 inner_phis.create (vect_defs.length ());
5095 FOR_EACH_VEC_ELT (new_phis, i, phi)
5097 tree new_result = copy_ssa_name (PHI_RESULT (phi));
5098 gphi *outer_phi = create_phi_node (new_result, exit_bb);
5099 SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx,
5100 PHI_RESULT (phi));
5101 set_vinfo_for_stmt (outer_phi, new_stmt_vec_info (outer_phi,
5102 loop_vinfo));
5103 inner_phis.quick_push (phi);
5104 new_phis[i] = outer_phi;
5105 prev_phi_info = vinfo_for_stmt (outer_phi);
5106 while (STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi)))
5108 phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi));
5109 new_result = copy_ssa_name (PHI_RESULT (phi));
5110 outer_phi = create_phi_node (new_result, exit_bb);
5111 SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx,
5112 PHI_RESULT (phi));
5113 set_vinfo_for_stmt (outer_phi, new_stmt_vec_info (outer_phi,
5114 loop_vinfo));
5115 STMT_VINFO_RELATED_STMT (prev_phi_info) = outer_phi;
5116 prev_phi_info = vinfo_for_stmt (outer_phi);
5121 exit_gsi = gsi_after_labels (exit_bb);
5123 /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
5124 (i.e. when reduc_fn is not available) and in the final adjustment
5125 code (if needed). Also get the original scalar reduction variable as
5126 defined in the loop. In case STMT is a "pattern-stmt" (i.e. - it
5127 represents a reduction pattern), the tree-code and scalar-def are
5128 taken from the original stmt that the pattern-stmt (STMT) replaces.
5129 Otherwise (it is a regular reduction) - the tree-code and scalar-def
5130 are taken from STMT. */
5132 orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
5133 if (!orig_stmt)
5135 /* Regular reduction */
5136 orig_stmt = stmt;
5138 else
5140 /* Reduction pattern */
5141 stmt_vec_info stmt_vinfo = vinfo_for_stmt (orig_stmt);
5142 gcc_assert (STMT_VINFO_IN_PATTERN_P (stmt_vinfo));
5143 gcc_assert (STMT_VINFO_RELATED_STMT (stmt_vinfo) == stmt);
5146 code = gimple_assign_rhs_code (orig_stmt);
5147 /* For MINUS_EXPR the initial vector is [init_val,0,...,0], therefore,
5148 partial results are added and not subtracted. */
5149 if (code == MINUS_EXPR)
5150 code = PLUS_EXPR;
5152 scalar_dest = gimple_assign_lhs (orig_stmt);
5153 scalar_type = TREE_TYPE (scalar_dest);
5154 scalar_results.create (group_size);
5155 new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
5156 bitsize = TYPE_SIZE (scalar_type);
5158 /* In case this is a reduction in an inner-loop while vectorizing an outer
5159 loop - we don't need to extract a single scalar result at the end of the
5160 inner-loop (unless it is double reduction, i.e., the use of reduction is
5161 outside the outer-loop). The final vector of partial results will be used
5162 in the vectorized outer-loop, or reduced to a scalar result at the end of
5163 the outer-loop. */
5164 if (nested_in_vect_loop && !double_reduc)
5165 goto vect_finalize_reduction;
5167 /* SLP reduction without reduction chain, e.g.,
5168 # a1 = phi <a2, a0>
5169 # b1 = phi <b2, b0>
5170 a2 = operation (a1)
5171 b2 = operation (b1) */
5172 slp_reduc = (slp_node && !GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)));
5174 /* True if we should implement SLP_REDUC using native reduction operations
5175 instead of scalar operations. */
5176 direct_slp_reduc = (reduc_fn != IFN_LAST
5177 && slp_reduc
5178 && !TYPE_VECTOR_SUBPARTS (vectype).is_constant ());
5180 /* In case of reduction chain, e.g.,
5181 # a1 = phi <a3, a0>
5182 a2 = operation (a1)
5183 a3 = operation (a2),
5185 we may end up with more than one vector result. Here we reduce them to
5186 one vector. */
5187 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) || direct_slp_reduc)
5189 tree first_vect = PHI_RESULT (new_phis[0]);
5190 gassign *new_vec_stmt = NULL;
5191 vec_dest = vect_create_destination_var (scalar_dest, vectype);
5192 for (k = 1; k < new_phis.length (); k++)
5194 gimple *next_phi = new_phis[k];
5195 tree second_vect = PHI_RESULT (next_phi);
5196 tree tem = make_ssa_name (vec_dest, new_vec_stmt);
5197 new_vec_stmt = gimple_build_assign (tem, code,
5198 first_vect, second_vect);
5199 gsi_insert_before (&exit_gsi, new_vec_stmt, GSI_SAME_STMT);
5200 first_vect = tem;
5203 new_phi_result = first_vect;
5204 if (new_vec_stmt)
5206 new_phis.truncate (0);
5207 new_phis.safe_push (new_vec_stmt);
5210 /* Likewise if we couldn't use a single defuse cycle. */
5211 else if (ncopies > 1)
5213 gcc_assert (new_phis.length () == 1);
5214 tree first_vect = PHI_RESULT (new_phis[0]);
5215 gassign *new_vec_stmt = NULL;
5216 vec_dest = vect_create_destination_var (scalar_dest, vectype);
5217 gimple *next_phi = new_phis[0];
5218 for (int k = 1; k < ncopies; ++k)
5220 next_phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (next_phi));
5221 tree second_vect = PHI_RESULT (next_phi);
5222 tree tem = make_ssa_name (vec_dest, new_vec_stmt);
5223 new_vec_stmt = gimple_build_assign (tem, code,
5224 first_vect, second_vect);
5225 gsi_insert_before (&exit_gsi, new_vec_stmt, GSI_SAME_STMT);
5226 first_vect = tem;
5228 new_phi_result = first_vect;
5229 new_phis.truncate (0);
5230 new_phis.safe_push (new_vec_stmt);
5232 else
5233 new_phi_result = PHI_RESULT (new_phis[0]);
5235 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION
5236 && reduc_fn != IFN_LAST)
5238 /* For condition reductions, we have a vector (NEW_PHI_RESULT) containing
5239 various data values where the condition matched and another vector
5240 (INDUCTION_INDEX) containing all the indexes of those matches. We
5241 need to extract the last matching index (which will be the index with
5242 highest value) and use this to index into the data vector.
5243 For the case where there were no matches, the data vector will contain
5244 all default values and the index vector will be all zeros. */
5246 /* Get various versions of the type of the vector of indexes. */
5247 tree index_vec_type = TREE_TYPE (induction_index);
5248 gcc_checking_assert (TYPE_UNSIGNED (index_vec_type));
5249 tree index_scalar_type = TREE_TYPE (index_vec_type);
5250 tree index_vec_cmp_type = build_same_sized_truth_vector_type
5251 (index_vec_type);
5253 /* Get an unsigned integer version of the type of the data vector. */
5254 int scalar_precision
5255 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
5256 tree scalar_type_unsigned = make_unsigned_type (scalar_precision);
5257 tree vectype_unsigned = build_vector_type
5258 (scalar_type_unsigned, TYPE_VECTOR_SUBPARTS (vectype));
5260 /* First we need to create a vector (ZERO_VEC) of zeros and another
5261 vector (MAX_INDEX_VEC) filled with the last matching index, which we
5262 can create using a MAX reduction and then expanding.
5263 In the case where the loop never made any matches, the max index will
5264 be zero. */
5266 /* Vector of {0, 0, 0,...}. */
5267 tree zero_vec = make_ssa_name (vectype);
5268 tree zero_vec_rhs = build_zero_cst (vectype);
5269 gimple *zero_vec_stmt = gimple_build_assign (zero_vec, zero_vec_rhs);
5270 gsi_insert_before (&exit_gsi, zero_vec_stmt, GSI_SAME_STMT);
5272 /* Find maximum value from the vector of found indexes. */
5273 tree max_index = make_ssa_name (index_scalar_type);
5274 gcall *max_index_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
5275 1, induction_index);
5276 gimple_call_set_lhs (max_index_stmt, max_index);
5277 gsi_insert_before (&exit_gsi, max_index_stmt, GSI_SAME_STMT);
5279 /* Vector of {max_index, max_index, max_index,...}. */
5280 tree max_index_vec = make_ssa_name (index_vec_type);
5281 tree max_index_vec_rhs = build_vector_from_val (index_vec_type,
5282 max_index);
5283 gimple *max_index_vec_stmt = gimple_build_assign (max_index_vec,
5284 max_index_vec_rhs);
5285 gsi_insert_before (&exit_gsi, max_index_vec_stmt, GSI_SAME_STMT);
5287 /* Next we compare the new vector (MAX_INDEX_VEC) full of max indexes
5288 with the vector (INDUCTION_INDEX) of found indexes, choosing values
5289 from the data vector (NEW_PHI_RESULT) for matches, 0 (ZERO_VEC)
5290 otherwise. Only one value should match, resulting in a vector
5291 (VEC_COND) with one data value and the rest zeros.
5292 In the case where the loop never made any matches, every index will
5293 match, resulting in a vector with all data values (which will all be
5294 the default value). */
5296 /* Compare the max index vector to the vector of found indexes to find
5297 the position of the max value. */
5298 tree vec_compare = make_ssa_name (index_vec_cmp_type);
5299 gimple *vec_compare_stmt = gimple_build_assign (vec_compare, EQ_EXPR,
5300 induction_index,
5301 max_index_vec);
5302 gsi_insert_before (&exit_gsi, vec_compare_stmt, GSI_SAME_STMT);
5304 /* Use the compare to choose either values from the data vector or
5305 zero. */
5306 tree vec_cond = make_ssa_name (vectype);
5307 gimple *vec_cond_stmt = gimple_build_assign (vec_cond, VEC_COND_EXPR,
5308 vec_compare, new_phi_result,
5309 zero_vec);
5310 gsi_insert_before (&exit_gsi, vec_cond_stmt, GSI_SAME_STMT);
5312 /* Finally we need to extract the data value from the vector (VEC_COND)
5313 into a scalar (MATCHED_DATA_REDUC). Logically we want to do an OR
5314 reduction, but because this doesn't exist, we can use a MAX reduction
5315 instead. The data value might be signed or a float so we need to cast
5316 it first.
5317 In the case where the loop never made any matches, the data values are
5318 all identical, and so will reduce down correctly. */
5320 /* Make the matched data values unsigned. */
5321 tree vec_cond_cast = make_ssa_name (vectype_unsigned);
5322 tree vec_cond_cast_rhs = build1 (VIEW_CONVERT_EXPR, vectype_unsigned,
5323 vec_cond);
5324 gimple *vec_cond_cast_stmt = gimple_build_assign (vec_cond_cast,
5325 VIEW_CONVERT_EXPR,
5326 vec_cond_cast_rhs);
5327 gsi_insert_before (&exit_gsi, vec_cond_cast_stmt, GSI_SAME_STMT);
5329 /* Reduce down to a scalar value. */
5330 tree data_reduc = make_ssa_name (scalar_type_unsigned);
5331 gcall *data_reduc_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
5332 1, vec_cond_cast);
5333 gimple_call_set_lhs (data_reduc_stmt, data_reduc);
5334 gsi_insert_before (&exit_gsi, data_reduc_stmt, GSI_SAME_STMT);
5336 /* Convert the reduced value back to the result type and set as the
5337 result. */
5338 gimple_seq stmts = NULL;
5339 new_temp = gimple_build (&stmts, VIEW_CONVERT_EXPR, scalar_type,
5340 data_reduc);
5341 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5342 scalar_results.safe_push (new_temp);
5344 else if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION
5345 && reduc_fn == IFN_LAST)
5347 /* Condition reduction without supported IFN_REDUC_MAX. Generate
5348 idx = 0;
5349 idx_val = induction_index[0];
5350 val = data_reduc[0];
5351 for (idx = 0, val = init, i = 0; i < nelts; ++i)
5352 if (induction_index[i] > idx_val)
5353 val = data_reduc[i], idx_val = induction_index[i];
5354 return val; */
5356 tree data_eltype = TREE_TYPE (TREE_TYPE (new_phi_result));
5357 tree idx_eltype = TREE_TYPE (TREE_TYPE (induction_index));
5358 unsigned HOST_WIDE_INT el_size = tree_to_uhwi (TYPE_SIZE (idx_eltype));
5359 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (TREE_TYPE (induction_index));
5360 /* Enforced by vectorizable_reduction, which ensures we have target
5361 support before allowing a conditional reduction on variable-length
5362 vectors. */
5363 unsigned HOST_WIDE_INT v_size = el_size * nunits.to_constant ();
5364 tree idx_val = NULL_TREE, val = NULL_TREE;
5365 for (unsigned HOST_WIDE_INT off = 0; off < v_size; off += el_size)
5367 tree old_idx_val = idx_val;
5368 tree old_val = val;
5369 idx_val = make_ssa_name (idx_eltype);
5370 epilog_stmt = gimple_build_assign (idx_val, BIT_FIELD_REF,
5371 build3 (BIT_FIELD_REF, idx_eltype,
5372 induction_index,
5373 bitsize_int (el_size),
5374 bitsize_int (off)));
5375 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5376 val = make_ssa_name (data_eltype);
5377 epilog_stmt = gimple_build_assign (val, BIT_FIELD_REF,
5378 build3 (BIT_FIELD_REF,
5379 data_eltype,
5380 new_phi_result,
5381 bitsize_int (el_size),
5382 bitsize_int (off)));
5383 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5384 if (off != 0)
5386 tree new_idx_val = idx_val;
5387 tree new_val = val;
5388 if (off != v_size - el_size)
5390 new_idx_val = make_ssa_name (idx_eltype);
5391 epilog_stmt = gimple_build_assign (new_idx_val,
5392 MAX_EXPR, idx_val,
5393 old_idx_val);
5394 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5396 new_val = make_ssa_name (data_eltype);
5397 epilog_stmt = gimple_build_assign (new_val,
5398 COND_EXPR,
5399 build2 (GT_EXPR,
5400 boolean_type_node,
5401 idx_val,
5402 old_idx_val),
5403 val, old_val);
5404 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5405 idx_val = new_idx_val;
5406 val = new_val;
5409 /* Convert the reduced value back to the result type and set as the
5410 result. */
5411 gimple_seq stmts = NULL;
5412 val = gimple_convert (&stmts, scalar_type, val);
5413 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5414 scalar_results.safe_push (val);
5417 /* 2.3 Create the reduction code, using one of the three schemes described
5418 above. In SLP we simply need to extract all the elements from the
5419 vector (without reducing them), so we use scalar shifts. */
5420 else if (reduc_fn != IFN_LAST && !slp_reduc)
5422 tree tmp;
5423 tree vec_elem_type;
5425 /* Case 1: Create:
5426 v_out2 = reduc_expr <v_out1> */
5428 if (dump_enabled_p ())
5429 dump_printf_loc (MSG_NOTE, vect_location,
5430 "Reduce using direct vector reduction.\n");
5432 vec_elem_type = TREE_TYPE (TREE_TYPE (new_phi_result));
5433 if (!useless_type_conversion_p (scalar_type, vec_elem_type))
5435 tree tmp_dest
5436 = vect_create_destination_var (scalar_dest, vec_elem_type);
5437 epilog_stmt = gimple_build_call_internal (reduc_fn, 1,
5438 new_phi_result);
5439 gimple_set_lhs (epilog_stmt, tmp_dest);
5440 new_temp = make_ssa_name (tmp_dest, epilog_stmt);
5441 gimple_set_lhs (epilog_stmt, new_temp);
5442 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5444 epilog_stmt = gimple_build_assign (new_scalar_dest, NOP_EXPR,
5445 new_temp);
5447 else
5449 epilog_stmt = gimple_build_call_internal (reduc_fn, 1,
5450 new_phi_result);
5451 gimple_set_lhs (epilog_stmt, new_scalar_dest);
5454 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5455 gimple_set_lhs (epilog_stmt, new_temp);
5456 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5458 if ((STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5459 == INTEGER_INDUC_COND_REDUCTION)
5460 && !operand_equal_p (initial_def, induc_val, 0))
5462 /* Earlier we set the initial value to be a vector of induc_val
5463 values. Check the result and if it is induc_val then replace
5464 with the original initial value, unless induc_val is
5465 the same as initial_def already. */
5466 tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp,
5467 induc_val);
5469 tmp = make_ssa_name (new_scalar_dest);
5470 epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
5471 initial_def, new_temp);
5472 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5473 new_temp = tmp;
5476 scalar_results.safe_push (new_temp);
5478 else if (direct_slp_reduc)
5480 /* Here we create one vector for each of the GROUP_SIZE results,
5481 with the elements for other SLP statements replaced with the
5482 neutral value. We can then do a normal reduction on each vector. */
5484 /* Enforced by vectorizable_reduction. */
5485 gcc_assert (new_phis.length () == 1);
5486 gcc_assert (pow2p_hwi (group_size));
5488 slp_tree orig_phis_slp_node = slp_node_instance->reduc_phis;
5489 vec<gimple *> orig_phis = SLP_TREE_SCALAR_STMTS (orig_phis_slp_node);
5490 gimple_seq seq = NULL;
5492 /* Build a vector {0, 1, 2, ...}, with the same number of elements
5493 and the same element size as VECTYPE. */
5494 tree index = build_index_vector (vectype, 0, 1);
5495 tree index_type = TREE_TYPE (index);
5496 tree index_elt_type = TREE_TYPE (index_type);
5497 tree mask_type = build_same_sized_truth_vector_type (index_type);
5499 /* Create a vector that, for each element, identifies which of
5500 the GROUP_SIZE results should use it. */
5501 tree index_mask = build_int_cst (index_elt_type, group_size - 1);
5502 index = gimple_build (&seq, BIT_AND_EXPR, index_type, index,
5503 build_vector_from_val (index_type, index_mask));
5505 /* Get a neutral vector value. This is simply a splat of the neutral
5506 scalar value if we have one, otherwise the initial scalar value
5507 is itself a neutral value. */
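/* For example, the neutral value is 0 for PLUS and BIT_IOR, 1 for MULT and
   all-ones for BIT_AND; MIN and MAX have no single neutral value, so the
   initial scalar value is used instead.  */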
5508 tree vector_identity = NULL_TREE;
5509 if (neutral_op)
5510 vector_identity = gimple_build_vector_from_val (&seq, vectype,
5511 neutral_op);
5512 for (unsigned int i = 0; i < group_size; ++i)
5514 /* If there's no universal neutral value, we can use the
5515 initial scalar value from the original PHI. This is used
5516 for MIN and MAX reduction, for example. */
5517 if (!neutral_op)
5519 tree scalar_value
5520 = PHI_ARG_DEF_FROM_EDGE (orig_phis[i],
5521 loop_preheader_edge (loop));
5522 vector_identity = gimple_build_vector_from_val (&seq, vectype,
5523 scalar_value);
5526 /* Calculate the equivalent of:
5528 sel[j] = (index[j] == i);
5530 which selects the elements of NEW_PHI_RESULT that should
5531 be included in the result. */
5532 tree compare_val = build_int_cst (index_elt_type, i);
5533 compare_val = build_vector_from_val (index_type, compare_val);
5534 tree sel = gimple_build (&seq, EQ_EXPR, mask_type,
5535 index, compare_val);
5537 /* Calculate the equivalent of:
5539 vec = seq ? new_phi_result : vector_identity;
5541 VEC is now suitable for a full vector reduction. */
5542 tree vec = gimple_build (&seq, VEC_COND_EXPR, vectype,
5543 sel, new_phi_result, vector_identity);
5545 /* Do the reduction and convert it to the appropriate type. */
5546 gcall *call = gimple_build_call_internal (reduc_fn, 1, vec);
5547 tree scalar = make_ssa_name (TREE_TYPE (vectype));
5548 gimple_call_set_lhs (call, scalar);
5549 gimple_seq_add_stmt (&seq, call);
5550 scalar = gimple_convert (&seq, scalar_type, scalar);
5551 scalar_results.safe_push (scalar);
5553 gsi_insert_seq_before (&exit_gsi, seq, GSI_SAME_STMT);
5555 else
5557 bool reduce_with_shift;
5558 tree vec_temp;
5560 /* COND reductions all do the final reduction with MAX_EXPR
5561 or MIN_EXPR. */
5562 if (code == COND_EXPR)
5564 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5565 == INTEGER_INDUC_COND_REDUCTION)
5566 code = induc_code;
5567 else if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5568 == CONST_COND_REDUCTION)
5569 code = STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info);
5570 else
5571 code = MAX_EXPR;
5574 /* See if the target wants to do the final (shift) reduction
5575 in a vector mode of smaller size and first reduce upper/lower
5576 halves against each other. */
5577 enum machine_mode mode1 = mode;
5578 tree vectype1 = vectype;
5579 unsigned sz = tree_to_uhwi (TYPE_SIZE_UNIT (vectype));
5580 unsigned sz1 = sz;
5581 if (!slp_reduc
5582 && (mode1 = targetm.vectorize.split_reduction (mode)) != mode)
5583 sz1 = GET_MODE_SIZE (mode1).to_constant ();
5585 vectype1 = get_vectype_for_scalar_type_and_size (scalar_type, sz1);
5586 reduce_with_shift = have_whole_vector_shift (mode1);
5587 if (!VECTOR_MODE_P (mode1))
5588 reduce_with_shift = false;
5589 else
5591 optab optab = optab_for_tree_code (code, vectype1, optab_default);
5592 if (optab_handler (optab, mode1) == CODE_FOR_nothing)
5593 reduce_with_shift = false;
5596 /* First reduce the vector to the desired vector size we should
5597 do shift reduction on by combining upper and lower halves. */
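/* For example, if the accumulator is a 32-byte vector but the target prefers
   to finish the reduction on 16-byte vectors, one iteration of the loop below
   extracts the low and high 16-byte halves and combines them with CODE,
   halving SZ.  */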
5598 new_temp = new_phi_result;
5599 while (sz > sz1)
5601 gcc_assert (!slp_reduc);
5602 sz /= 2;
5603 vectype1 = get_vectype_for_scalar_type_and_size (scalar_type, sz);
5605 /* The target has to make sure we support lowpart/highpart
5606 extraction, either via direct vector extract or through
5607 an integer mode punning. */
5608 tree dst1, dst2;
5609 if (convert_optab_handler (vec_extract_optab,
5610 TYPE_MODE (TREE_TYPE (new_temp)),
5611 TYPE_MODE (vectype1))
5612 != CODE_FOR_nothing)
5614 /* Extract sub-vectors directly once vec_extract becomes
5615 a conversion optab. */
5616 dst1 = make_ssa_name (vectype1);
5617 epilog_stmt
5618 = gimple_build_assign (dst1, BIT_FIELD_REF,
5619 build3 (BIT_FIELD_REF, vectype1,
5620 new_temp, TYPE_SIZE (vectype1),
5621 bitsize_int (0)));
5622 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5623 dst2 = make_ssa_name (vectype1);
5624 epilog_stmt
5625 = gimple_build_assign (dst2, BIT_FIELD_REF,
5626 build3 (BIT_FIELD_REF, vectype1,
5627 new_temp, TYPE_SIZE (vectype1),
5628 bitsize_int (sz * BITS_PER_UNIT)));
5629 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5631 else
5633 /* Extract via punning to appropriately sized integer mode
5634 vector. */
5635 tree eltype = build_nonstandard_integer_type (sz * BITS_PER_UNIT, 1);
5637 tree etype = build_vector_type (eltype, 2);
5638 gcc_assert (convert_optab_handler (vec_extract_optab,
5639 TYPE_MODE (etype),
5640 TYPE_MODE (eltype))
5641 != CODE_FOR_nothing);
5642 tree tem = make_ssa_name (etype);
5643 epilog_stmt = gimple_build_assign (tem, VIEW_CONVERT_EXPR,
5644 build1 (VIEW_CONVERT_EXPR,
5645 etype, new_temp));
5646 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5647 new_temp = tem;
5648 tem = make_ssa_name (eltype);
5649 epilog_stmt
5650 = gimple_build_assign (tem, BIT_FIELD_REF,
5651 build3 (BIT_FIELD_REF, eltype,
5652 new_temp, TYPE_SIZE (eltype),
5653 bitsize_int (0)));
5654 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5655 dst1 = make_ssa_name (vectype1);
5656 epilog_stmt = gimple_build_assign (dst1, VIEW_CONVERT_EXPR,
5657 build1 (VIEW_CONVERT_EXPR,
5658 vectype1, tem));
5659 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5660 tem = make_ssa_name (eltype);
5661 epilog_stmt
5662 = gimple_build_assign (tem, BIT_FIELD_REF,
5663 build3 (BIT_FIELD_REF, eltype,
5664 new_temp, TYPE_SIZE (eltype),
5665 bitsize_int (sz * BITS_PER_UNIT)));
5666 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5667 dst2 = make_ssa_name (vectype1);
5668 epilog_stmt = gimple_build_assign (dst2, VIEW_CONVERT_EXPR,
5669 build1 (VIEW_CONVERT_EXPR,
5670 vectype1, tem));
5671 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5674 new_temp = make_ssa_name (vectype1);
5675 epilog_stmt = gimple_build_assign (new_temp, code, dst1, dst2);
5676 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5679 if (reduce_with_shift && !slp_reduc)
5681 int element_bitsize = tree_to_uhwi (bitsize);
5682 /* Enforced by vectorizable_reduction, which disallows SLP reductions
5683 for variable-length vectors and also requires direct target support
5684 for loop reductions. */
5685 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
5686 int nelements = vec_size_in_bits / element_bitsize;
5687 vec_perm_builder sel;
5688 vec_perm_indices indices;
5690 int elt_offset;
5692 tree zero_vec = build_zero_cst (vectype1);
5693 /* Case 2: Create:
5694 for (offset = nelements/2; offset >= 1; offset/=2)
5696 Create: va' = vec_shift <va, offset>
5697 Create: va = vop <va, va'>
5698 } */
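/* For example, reducing a four-element vector {a0, a1, a2, a3} with PLUS:
   shifting by two elements and adding gives {a0+a2, a1+a3, ...}, shifting by
   one element and adding gives {a0+a1+a2+a3, ...}; the result is then taken
   from element 0 in step 2.4 below.  */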
5700 tree rhs;
5702 if (dump_enabled_p ())
5703 dump_printf_loc (MSG_NOTE, vect_location,
5704 "Reduce using vector shifts\n");
5706 mode1 = TYPE_MODE (vectype1);
5707 vec_dest = vect_create_destination_var (scalar_dest, vectype1);
5708 for (elt_offset = nelements / 2;
5709 elt_offset >= 1;
5710 elt_offset /= 2)
5712 calc_vec_perm_mask_for_shift (elt_offset, nelements, &sel);
5713 indices.new_vector (sel, 2, nelements);
5714 tree mask = vect_gen_perm_mask_any (vectype1, indices);
5715 epilog_stmt = gimple_build_assign (vec_dest, VEC_PERM_EXPR,
5716 new_temp, zero_vec, mask);
5717 new_name = make_ssa_name (vec_dest, epilog_stmt);
5718 gimple_assign_set_lhs (epilog_stmt, new_name);
5719 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5721 epilog_stmt = gimple_build_assign (vec_dest, code, new_name,
5722 new_temp);
5723 new_temp = make_ssa_name (vec_dest, epilog_stmt);
5724 gimple_assign_set_lhs (epilog_stmt, new_temp);
5725 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5728 /* 2.4 Extract the final scalar result. Create:
5729 s_out3 = extract_field <v_out2, bitpos> */
5731 if (dump_enabled_p ())
5732 dump_printf_loc (MSG_NOTE, vect_location,
5733 "extract scalar result\n");
5735 rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp,
5736 bitsize, bitsize_zero_node);
5737 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
5738 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5739 gimple_assign_set_lhs (epilog_stmt, new_temp);
5740 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5741 scalar_results.safe_push (new_temp);
5743 else
5745 /* Case 3: Create:
5746 s = extract_field <v_out2, 0>
5747 for (offset = element_size;
5748 offset < vector_size;
5749 offset += element_size;)
5751 Create: s' = extract_field <v_out2, offset>
5752 Create: s = op <s, s'> // For non SLP cases
5753 } */
5755 if (dump_enabled_p ())
5756 dump_printf_loc (MSG_NOTE, vect_location,
5757 "Reduce using scalar code.\n");
5759 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
5760 int element_bitsize = tree_to_uhwi (bitsize);
5761 FOR_EACH_VEC_ELT (new_phis, i, new_phi)
5763 int bit_offset;
5764 if (gimple_code (new_phi) == GIMPLE_PHI)
5765 vec_temp = PHI_RESULT (new_phi);
5766 else
5767 vec_temp = gimple_assign_lhs (new_phi);
5768 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, bitsize,
5769 bitsize_zero_node);
5770 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
5771 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5772 gimple_assign_set_lhs (epilog_stmt, new_temp);
5773 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5775 /* In SLP we don't need to apply reduction operation, so we just
5776 collect s' values in SCALAR_RESULTS. */
5777 if (slp_reduc)
5778 scalar_results.safe_push (new_temp);
5780 for (bit_offset = element_bitsize;
5781 bit_offset < vec_size_in_bits;
5782 bit_offset += element_bitsize)
5784 tree bitpos = bitsize_int (bit_offset);
5785 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp,
5786 bitsize, bitpos);
5788 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
5789 new_name = make_ssa_name (new_scalar_dest, epilog_stmt);
5790 gimple_assign_set_lhs (epilog_stmt, new_name);
5791 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5793 if (slp_reduc)
5795 /* In SLP we don't need to apply reduction operation, so
5796 we just collect s' values in SCALAR_RESULTS. */
5797 new_temp = new_name;
5798 scalar_results.safe_push (new_name);
5800 else
5802 epilog_stmt = gimple_build_assign (new_scalar_dest, code,
5803 new_name, new_temp);
5804 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5805 gimple_assign_set_lhs (epilog_stmt, new_temp);
5806 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5811 /* The only case where we need to reduce scalar results in SLP, is
5812 unrolling. If the size of SCALAR_RESULTS is greater than
5813 GROUP_SIZE, we reduce them combining elements modulo
5814 GROUP_SIZE. */
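/* For example, with GROUP_SIZE 2 and four scalar results s0..s3, the loop
   below combines s0 with s2 and s1 with s3, leaving one result per group
   member in slots 0 and 1.  */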
5815 if (slp_reduc)
5817 tree res, first_res, new_res;
5818 gimple *new_stmt;
5820 /* Reduce multiple scalar results in case of SLP unrolling. */
5821 for (j = group_size; scalar_results.iterate (j, &res);
5822 j++)
5824 first_res = scalar_results[j % group_size];
5825 new_stmt = gimple_build_assign (new_scalar_dest, code,
5826 first_res, res);
5827 new_res = make_ssa_name (new_scalar_dest, new_stmt);
5828 gimple_assign_set_lhs (new_stmt, new_res);
5829 gsi_insert_before (&exit_gsi, new_stmt, GSI_SAME_STMT);
5830 scalar_results[j % group_size] = new_res;
5833 else
5834 /* Not SLP - we have one scalar to keep in SCALAR_RESULTS. */
5835 scalar_results.safe_push (new_temp);
5838 if ((STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5839 == INTEGER_INDUC_COND_REDUCTION)
5840 && !operand_equal_p (initial_def, induc_val, 0))
5842 /* Earlier we set the initial value to be a vector of induc_val
5843 values. Check the result and if it is induc_val then replace
5844 with the original initial value, unless induc_val is
5845 the same as initial_def already. */
5846 tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp,
5847 induc_val);
5849 tree tmp = make_ssa_name (new_scalar_dest);
5850 epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
5851 initial_def, new_temp);
5852 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5853 scalar_results[0] = tmp;
5857 vect_finalize_reduction:
5859 if (double_reduc)
5860 loop = loop->inner;
5862 /* 2.5 Adjust the final result by the initial value of the reduction
5863 variable. (When such adjustment is not needed, then
5864 'adjustment_def' is zero). For example, if code is PLUS we create:
5865 new_temp = loop_exit_def + adjustment_def */
5867 if (adjustment_def)
5869 gcc_assert (!slp_reduc);
5870 if (nested_in_vect_loop)
5872 new_phi = new_phis[0];
5873 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) == VECTOR_TYPE);
5874 expr = build2 (code, vectype, PHI_RESULT (new_phi), adjustment_def);
5875 new_dest = vect_create_destination_var (scalar_dest, vectype);
5877 else
5879 new_temp = scalar_results[0];
5880 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
5881 expr = build2 (code, scalar_type, new_temp, adjustment_def);
5882 new_dest = vect_create_destination_var (scalar_dest, scalar_type);
5885 epilog_stmt = gimple_build_assign (new_dest, expr);
5886 new_temp = make_ssa_name (new_dest, epilog_stmt);
5887 gimple_assign_set_lhs (epilog_stmt, new_temp);
5888 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5889 if (nested_in_vect_loop)
5891 set_vinfo_for_stmt (epilog_stmt,
5892 new_stmt_vec_info (epilog_stmt, loop_vinfo));
5893 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (epilog_stmt)) =
5894 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (new_phi));
5896 if (!double_reduc)
5897 scalar_results.quick_push (new_temp);
5898 else
5899 scalar_results[0] = new_temp;
5901 else
5902 scalar_results[0] = new_temp;
5904 new_phis[0] = epilog_stmt;
5907 /* 2.6 Handle the loop-exit phis. Replace the uses of scalar loop-exit
5908 phis with new adjusted scalar results, i.e., replace use <s_out0>
5909 with use <s_out4>.
5911 Transform:
5912 loop_exit:
5913 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
5914 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
5915 v_out2 = reduce <v_out1>
5916 s_out3 = extract_field <v_out2, 0>
5917 s_out4 = adjust_result <s_out3>
5918 use <s_out0>
5919 use <s_out0>
5921 into:
5923 loop_exit:
5924 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
5925 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
5926 v_out2 = reduce <v_out1>
5927 s_out3 = extract_field <v_out2, 0>
5928 s_out4 = adjust_result <s_out3>
5929 use <s_out4>
5930 use <s_out4> */
5933 /* In SLP reduction chain we reduce vector results into one vector if
5934 necessary, hence we set GROUP_SIZE to 1 here. SCALAR_DEST is the LHS of
5935 the last stmt in the reduction chain, since we are looking for the loop
5936 exit phi node. */
5937 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
5939 gimple *dest_stmt = SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1];
5940 /* Handle reduction patterns. */
5941 if (STMT_VINFO_RELATED_STMT (vinfo_for_stmt (dest_stmt)))
5942 dest_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (dest_stmt));
5944 scalar_dest = gimple_assign_lhs (dest_stmt);
5945 group_size = 1;
5948 /* In SLP we may have several statements in NEW_PHIS and REDUCTION_PHIS (in
5949 case GROUP_SIZE is greater than the vectorization factor). Therefore, we
5950 need to match SCALAR_RESULTS with corresponding statements. The first
5951 (GROUP_SIZE / number of new vector stmts) scalar results correspond to
5952 the first vector stmt, etc.
5953 (RATIO is equal to (GROUP_SIZE / number of new vector stmts)). */
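/* For example, with GROUP_SIZE 4 and two new vector stmts, RATIO is 2:
   scalar results 0 and 1 belong to the first vector stmt and scalar results
   2 and 3 to the second.  */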
5954 if (group_size > new_phis.length ())
5956 ratio = group_size / new_phis.length ();
5957 gcc_assert (!(group_size % new_phis.length ()));
5959 else
5960 ratio = 1;
5962 for (k = 0; k < group_size; k++)
5964 if (k % ratio == 0)
5966 epilog_stmt = new_phis[k / ratio];
5967 reduction_phi = reduction_phis[k / ratio];
5968 if (double_reduc)
5969 inner_phi = inner_phis[k / ratio];
5972 if (slp_reduc)
5974 gimple *current_stmt = SLP_TREE_SCALAR_STMTS (slp_node)[k];
5976 orig_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (current_stmt));
5977 /* SLP statements can't participate in patterns. */
5978 gcc_assert (!orig_stmt);
5979 scalar_dest = gimple_assign_lhs (current_stmt);
5982 phis.create (3);
5983 /* Find the loop-closed-use at the loop exit of the original scalar
5984 result. (The reduction result is expected to have two immediate uses -
5985 one at the latch block, and one at the loop exit). */
5986 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
5987 if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p)))
5988 && !is_gimple_debug (USE_STMT (use_p)))
5989 phis.safe_push (USE_STMT (use_p));
5991 /* While we expect to have found an exit_phi because of loop-closed-ssa
5992 form, we can end up without one if the scalar cycle is dead. */
5994 FOR_EACH_VEC_ELT (phis, i, exit_phi)
5996 if (outer_loop)
5998 stmt_vec_info exit_phi_vinfo = vinfo_for_stmt (exit_phi);
5999 gphi *vect_phi;
6001 /* FORNOW. Currently not supporting the case that an inner-loop
6002 reduction is not used in the outer-loop (but only outside the
6003 outer-loop), unless it is double reduction. */
6004 gcc_assert ((STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
6005 && !STMT_VINFO_LIVE_P (exit_phi_vinfo))
6006 || double_reduc);
6008 if (double_reduc)
6009 STMT_VINFO_VEC_STMT (exit_phi_vinfo) = inner_phi;
6010 else
6011 STMT_VINFO_VEC_STMT (exit_phi_vinfo) = epilog_stmt;
6012 if (!double_reduc
6013 || STMT_VINFO_DEF_TYPE (exit_phi_vinfo)
6014 != vect_double_reduction_def)
6015 continue;
6017 /* Handle double reduction:
6019 stmt1: s1 = phi <s0, s2> - double reduction phi (outer loop)
6020 stmt2: s3 = phi <s1, s4> - (regular) reduc phi (inner loop)
6021 stmt3: s4 = use (s3) - (regular) reduc stmt (inner loop)
6022 stmt4: s2 = phi <s4> - double reduction stmt (outer loop)
6024 At that point the regular reduction (stmt2 and stmt3) is
6025 already vectorized, as well as the exit phi node, stmt4.
6026 Here we vectorize the phi node of double reduction, stmt1, and
6027 update all relevant statements. */
6029 /* Go through all the uses of s2 to find double reduction phi
6030 node, i.e., stmt1 above. */
6031 orig_name = PHI_RESULT (exit_phi);
6032 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
6034 stmt_vec_info use_stmt_vinfo;
6035 stmt_vec_info new_phi_vinfo;
6036 tree vect_phi_init, preheader_arg, vect_phi_res;
6037 basic_block bb = gimple_bb (use_stmt);
6038 gimple *use;
6040 /* Check that USE_STMT is really a double reduction phi
6041 node. */
6042 if (gimple_code (use_stmt) != GIMPLE_PHI
6043 || gimple_phi_num_args (use_stmt) != 2
6044 || bb->loop_father != outer_loop)
6045 continue;
6046 use_stmt_vinfo = vinfo_for_stmt (use_stmt);
6047 if (!use_stmt_vinfo
6048 || STMT_VINFO_DEF_TYPE (use_stmt_vinfo)
6049 != vect_double_reduction_def)
6050 continue;
6052 /* Create vector phi node for double reduction:
6053 vs1 = phi <vs0, vs2>
6054 vs1 was created previously in this function by a call to
6055 vect_get_vec_def_for_operand and is stored in
6056 vec_initial_def;
6057 vs2 is defined by INNER_PHI, the vectorized EXIT_PHI;
6058 vs0 is created here. */
6060 /* Create vector phi node. */
6061 vect_phi = create_phi_node (vec_initial_def, bb);
6062 new_phi_vinfo = new_stmt_vec_info (vect_phi,
6063 loop_vec_info_for_loop (outer_loop));
6064 set_vinfo_for_stmt (vect_phi, new_phi_vinfo);
6066 /* Create vs0 - initial def of the double reduction phi. */
6067 preheader_arg = PHI_ARG_DEF_FROM_EDGE (use_stmt,
6068 loop_preheader_edge (outer_loop));
6069 vect_phi_init = get_initial_def_for_reduction
6070 (stmt, preheader_arg, NULL);
6072 /* Update phi node arguments with vs0 and vs2. */
6073 add_phi_arg (vect_phi, vect_phi_init,
6074 loop_preheader_edge (outer_loop),
6075 UNKNOWN_LOCATION);
6076 add_phi_arg (vect_phi, PHI_RESULT (inner_phi),
6077 loop_latch_edge (outer_loop), UNKNOWN_LOCATION);
6078 if (dump_enabled_p ())
6080 dump_printf_loc (MSG_NOTE, vect_location,
6081 "created double reduction phi node: ");
6082 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, vect_phi, 0);
6085 vect_phi_res = PHI_RESULT (vect_phi);
6087 /* Replace the use, i.e., set the correct vs1 in the regular
6088 reduction phi node. FORNOW, NCOPIES is always 1, so the
6089 loop is redundant. */
6090 use = reduction_phi;
6091 for (j = 0; j < ncopies; j++)
6093 edge pr_edge = loop_preheader_edge (loop);
6094 SET_PHI_ARG_DEF (use, pr_edge->dest_idx, vect_phi_res);
6095 use = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (use));
6101 phis.release ();
6102 if (nested_in_vect_loop)
6104 if (double_reduc)
6105 loop = outer_loop;
6106 else
6107 continue;
6110 phis.create (3);
6111 /* Find the loop-closed-use at the loop exit of the original scalar
6112 result. (The reduction result is expected to have two immediate uses,
6113 one at the latch block, and one at the loop exit). For double
6114 reductions we are looking for exit phis of the outer loop. */
6115 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
6117 if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
6119 if (!is_gimple_debug (USE_STMT (use_p)))
6120 phis.safe_push (USE_STMT (use_p));
6122 else
6124 if (double_reduc && gimple_code (USE_STMT (use_p)) == GIMPLE_PHI)
6126 tree phi_res = PHI_RESULT (USE_STMT (use_p));
6128 FOR_EACH_IMM_USE_FAST (phi_use_p, phi_imm_iter, phi_res)
6130 if (!flow_bb_inside_loop_p (loop,
6131 gimple_bb (USE_STMT (phi_use_p)))
6132 && !is_gimple_debug (USE_STMT (phi_use_p)))
6133 phis.safe_push (USE_STMT (phi_use_p));
6139 FOR_EACH_VEC_ELT (phis, i, exit_phi)
6141 /* Replace the uses: */
6142 orig_name = PHI_RESULT (exit_phi);
6143 scalar_result = scalar_results[k];
6144 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
6145 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
6146 SET_USE (use_p, scalar_result);
6149 phis.release ();
6153 /* Return a vector of type VECTYPE that is equal to the vector select
6154 operation "MASK ? VEC : IDENTITY". Insert the select statements
6155 before GSI. */
6157 static tree
6158 merge_with_identity (gimple_stmt_iterator *gsi, tree mask, tree vectype,
6159 tree vec, tree identity)
6161 tree cond = make_temp_ssa_name (vectype, NULL, "cond");
6162 gimple *new_stmt = gimple_build_assign (cond, VEC_COND_EXPR,
6163 mask, vec, identity);
6164 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
6165 return cond;
6168 /* Successively apply CODE to each element of VECTOR_RHS, in left-to-right
6169 order, starting with LHS. Insert the extraction statements before GSI and
6170 associate the new scalar SSA names with variable SCALAR_DEST.
6171 Return the SSA name for the result. */
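/* For example, with a four-element VECTOR_RHS {r0, r1, r2, r3} and PLUS_EXPR
   this produces (((LHS + r0) + r1) + r2) + r3, preserving the left-to-right
   association that an in-order (fold-left) reduction requires.  */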
6173 static tree
6174 vect_expand_fold_left (gimple_stmt_iterator *gsi, tree scalar_dest,
6175 tree_code code, tree lhs, tree vector_rhs)
6177 tree vectype = TREE_TYPE (vector_rhs);
6178 tree scalar_type = TREE_TYPE (vectype);
6179 tree bitsize = TYPE_SIZE (scalar_type);
6180 unsigned HOST_WIDE_INT vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
6181 unsigned HOST_WIDE_INT element_bitsize = tree_to_uhwi (bitsize);
6183 for (unsigned HOST_WIDE_INT bit_offset = 0;
6184 bit_offset < vec_size_in_bits;
6185 bit_offset += element_bitsize)
6187 tree bitpos = bitsize_int (bit_offset);
6188 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vector_rhs,
6189 bitsize, bitpos);
6191 gassign *stmt = gimple_build_assign (scalar_dest, rhs);
6192 rhs = make_ssa_name (scalar_dest, stmt);
6193 gimple_assign_set_lhs (stmt, rhs);
6194 gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
6196 stmt = gimple_build_assign (scalar_dest, code, lhs, rhs);
6197 tree new_name = make_ssa_name (scalar_dest, stmt);
6198 gimple_assign_set_lhs (stmt, new_name);
6199 gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
6200 lhs = new_name;
6202 return lhs;
6205 /* Perform an in-order reduction (FOLD_LEFT_REDUCTION). STMT is the
6206 statement that sets the live-out value. REDUC_DEF_STMT is the phi
6207 statement. CODE is the operation performed by STMT and OPS are
6208 its scalar operands. REDUC_INDEX is the index of the operand in
6209 OPS that is set by REDUC_DEF_STMT. REDUC_FN is the function that
6210 implements in-order reduction, or IFN_LAST if we should open-code it.
6211 VECTYPE_IN is the type of the vector input. MASKS specifies the masks
6212 that should be used to control the operation in a fully-masked loop. */
6214 static bool
6215 vectorize_fold_left_reduction (gimple *stmt, gimple_stmt_iterator *gsi,
6216 gimple **vec_stmt, slp_tree slp_node,
6217 gimple *reduc_def_stmt,
6218 tree_code code, internal_fn reduc_fn,
6219 tree ops[3], tree vectype_in,
6220 int reduc_index, vec_loop_masks *masks)
6222 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
6223 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
6224 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6225 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
6226 gimple *new_stmt = NULL;
6228 int ncopies;
6229 if (slp_node)
6230 ncopies = 1;
6231 else
6232 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
6234 gcc_assert (!nested_in_vect_loop_p (loop, stmt));
6235 gcc_assert (ncopies == 1);
6236 gcc_assert (TREE_CODE_LENGTH (code) == binary_op);
6237 gcc_assert (reduc_index == (code == MINUS_EXPR ? 0 : 1));
6238 gcc_assert (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
6239 == FOLD_LEFT_REDUCTION);
6241 if (slp_node)
6242 gcc_assert (known_eq (TYPE_VECTOR_SUBPARTS (vectype_out),
6243 TYPE_VECTOR_SUBPARTS (vectype_in)));
6245 tree op0 = ops[1 - reduc_index];
6247 int group_size = 1;
6248 gimple *scalar_dest_def;
6249 auto_vec<tree> vec_oprnds0;
6250 if (slp_node)
6252 vect_get_vec_defs (op0, NULL_TREE, stmt, &vec_oprnds0, NULL, slp_node);
6253 group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
6254 scalar_dest_def = SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1];
6256 else
6258 tree loop_vec_def0 = vect_get_vec_def_for_operand (op0, stmt);
6259 vec_oprnds0.create (1);
6260 vec_oprnds0.quick_push (loop_vec_def0);
6261 scalar_dest_def = stmt;
6264 tree scalar_dest = gimple_assign_lhs (scalar_dest_def);
6265 tree scalar_type = TREE_TYPE (scalar_dest);
6266 tree reduc_var = gimple_phi_result (reduc_def_stmt);
6268 int vec_num = vec_oprnds0.length ();
6269 gcc_assert (vec_num == 1 || slp_node);
6270 tree vec_elem_type = TREE_TYPE (vectype_out);
6271 gcc_checking_assert (useless_type_conversion_p (scalar_type, vec_elem_type));
6273 tree vector_identity = NULL_TREE;
6274 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
6275 vector_identity = build_zero_cst (vectype_out);
6277 tree scalar_dest_var = vect_create_destination_var (scalar_dest, NULL);
6278 int i;
6279 tree def0;
6280 FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
6282 tree mask = NULL_TREE;
6283 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
6284 mask = vect_get_loop_mask (gsi, masks, vec_num, vectype_in, i);
6286 /* Handle MINUS by adding the negative. */
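/* I.e. acc = acc - x becomes acc = acc + (-x), so the in-order PLUS
   reduction function can also be used for subtraction.  */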
6287 if (reduc_fn != IFN_LAST && code == MINUS_EXPR)
6289 tree negated = make_ssa_name (vectype_out);
6290 new_stmt = gimple_build_assign (negated, NEGATE_EXPR, def0);
6291 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
6292 def0 = negated;
6295 if (mask)
6296 def0 = merge_with_identity (gsi, mask, vectype_out, def0,
6297 vector_identity);
6299 /* On the first iteration the input is simply the scalar phi
6300 result, and for subsequent iterations it is the output of
6301 the preceding operation. */
6302 if (reduc_fn != IFN_LAST)
6304 new_stmt = gimple_build_call_internal (reduc_fn, 2, reduc_var, def0);
6305 /* For chained SLP reductions the output of the previous reduction
6306 operation serves as the input of the next. For the final statement
6307 the output cannot be a temporary - we reuse the original
6308 scalar destination of the last statement. */
6309 if (i != vec_num - 1)
6311 gimple_set_lhs (new_stmt, scalar_dest_var);
6312 reduc_var = make_ssa_name (scalar_dest_var, new_stmt);
6313 gimple_set_lhs (new_stmt, reduc_var);
6316 else
6318 reduc_var = vect_expand_fold_left (gsi, scalar_dest_var, code,
6319 reduc_var, def0);
6320 new_stmt = SSA_NAME_DEF_STMT (reduc_var);
6321 /* Remove the statement, so that we can use the same code paths
6322 as for statements that we've just created. */
6323 gimple_stmt_iterator tmp_gsi = gsi_for_stmt (new_stmt);
6324 gsi_remove (&tmp_gsi, true);
6327 if (i == vec_num - 1)
6329 gimple_set_lhs (new_stmt, scalar_dest);
6330 vect_finish_replace_stmt (scalar_dest_def, new_stmt);
6332 else
6333 vect_finish_stmt_generation (scalar_dest_def, new_stmt, gsi);
6335 if (slp_node)
6336 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
6339 if (!slp_node)
6340 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
6342 return true;
6345 /* Function is_nonwrapping_integer_induction.
6347 Check that STMT (which is part of loop LOOP) is an integer induction
6348 whose value cannot wrap (overflow) during the loop. */
6350 static bool
6351 is_nonwrapping_integer_induction (gimple *stmt, struct loop *loop)
6353 stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
6354 tree base = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo);
6355 tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo);
6356 tree lhs_type = TREE_TYPE (gimple_phi_result (stmt));
6357 widest_int ni, max_loop_value, lhs_max;
6358 bool overflow = false;
6360 /* Make sure the loop is integer based. */
6361 if (TREE_CODE (base) != INTEGER_CST
6362 || TREE_CODE (step) != INTEGER_CST)
6363 return false;
6365 /* Check that the max size of the loop will not wrap. */
6367 if (TYPE_OVERFLOW_UNDEFINED (lhs_type))
6368 return true;
6370 if (! max_stmt_executions (loop, &ni))
6371 return false;
6373 max_loop_value = wi::mul (wi::to_widest (step), ni, TYPE_SIGN (lhs_type),
6374 &overflow);
6375 if (overflow)
6376 return false;
6378 max_loop_value = wi::add (wi::to_widest (base), max_loop_value,
6379 TYPE_SIGN (lhs_type), &overflow);
6380 if (overflow)
6381 return false;
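/* For example, an unsigned char induction with base 250 and step 1 that can
   execute 10 times reaches 260, which needs 9 bits; that exceeds the 8-bit
   precision of the type, so the induction may wrap and we return false.  */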
6383 return (wi::min_precision (max_loop_value, TYPE_SIGN (lhs_type))
6384 <= TYPE_PRECISION (lhs_type));
6387 /* Function vectorizable_reduction.
6389 Check if STMT performs a reduction operation that can be vectorized.
6390 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
6391 stmt to replace it, put it in VEC_STMT, and insert it at GSI.
6392 Return FALSE if not a vectorizable STMT, TRUE otherwise.
6394 This function also handles reduction idioms (patterns) that have been
6395 recognized in advance during vect_pattern_recog. In this case, STMT may be
6396 of this form:
6397 X = pattern_expr (arg0, arg1, ..., X)
6398 and its STMT_VINFO_RELATED_STMT points to the last stmt in the original
6399 sequence that had been detected and replaced by the pattern-stmt (STMT).
6401 This function also handles reduction of condition expressions, for example:
6402 for (int i = 0; i < N; i++)
6403 if (a[i] < value)
6404 last = a[i];
6405 This is handled by vectorising the loop and creating an additional vector
6406 containing the loop indexes for which "a[i] < value" was true. In the
6407 function epilogue this is reduced to a single max value and then used to
6408 index into the vector of results.
6410 In some cases of reduction patterns, the type of the reduction variable X is
6411 different than the type of the other arguments of STMT.
6412 In such cases, the vectype that is used when transforming STMT into a vector
6413 stmt is different than the vectype that is used to determine the
6414 vectorization factor, because it consists of a different number of elements
6415 than the actual number of elements that are being operated upon in parallel.
6417 For example, consider an accumulation of shorts into an int accumulator.
6418 On some targets it's possible to vectorize this pattern operating on 8
6419 shorts at a time (hence, the vectype for purposes of determining the
6420 vectorization factor should be V8HI); on the other hand, the vectype that
6421 is used to create the vector form is actually V4SI (the type of the result).
6423 Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
6424 indicates what is the actual level of parallelism (V8HI in the example), so
6425 that the right vectorization factor would be derived. This vectype
6426 corresponds to the type of arguments to the reduction stmt, and should *NOT*
6427 be used to create the vectorized stmt. The right vectype for the vectorized
6428 stmt is obtained from the type of the result X:
6429 get_vectype_for_scalar_type (TREE_TYPE (X))
6431 This means that, contrary to "regular" reductions (or "regular" stmts in
6432 general), the following equation:
6433 STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (TREE_TYPE (X))
6434 does *NOT* necessarily hold for reduction patterns. */
6436 bool
6437 vectorizable_reduction (gimple *stmt, gimple_stmt_iterator *gsi,
6438 gimple **vec_stmt, slp_tree slp_node,
6439 slp_instance slp_node_instance)
6441 tree vec_dest;
6442 tree scalar_dest;
6443 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
6444 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
6445 tree vectype_in = NULL_TREE;
6446 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
6447 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6448 enum tree_code code, orig_code;
6449 internal_fn reduc_fn;
6450 machine_mode vec_mode;
6451 int op_type;
6452 optab optab;
6453 tree new_temp = NULL_TREE;
6454 gimple *def_stmt;
6455 enum vect_def_type dt, cond_reduc_dt = vect_unknown_def_type;
6456 gimple *cond_reduc_def_stmt = NULL;
6457 enum tree_code cond_reduc_op_code = ERROR_MARK;
6458 tree scalar_type;
6459 bool is_simple_use;
6460 gimple *orig_stmt;
6461 stmt_vec_info orig_stmt_info = NULL;
6462 int i;
6463 int ncopies;
6464 int epilog_copies;
6465 stmt_vec_info prev_stmt_info, prev_phi_info;
6466 bool single_defuse_cycle = false;
6467 gimple *new_stmt = NULL;
6468 int j;
6469 tree ops[3];
6470 enum vect_def_type dts[3];
6471 bool nested_cycle = false, found_nested_cycle_def = false;
6472 bool double_reduc = false;
6473 basic_block def_bb;
6474 struct loop * def_stmt_loop, *outer_loop = NULL;
6475 tree def_arg;
6476 gimple *def_arg_stmt;
6477 auto_vec<tree> vec_oprnds0;
6478 auto_vec<tree> vec_oprnds1;
6479 auto_vec<tree> vec_oprnds2;
6480 auto_vec<tree> vect_defs;
6481 auto_vec<gimple *> phis;
6482 int vec_num;
6483 tree def0, tem;
6484 bool first_p = true;
6485 tree cr_index_scalar_type = NULL_TREE, cr_index_vector_type = NULL_TREE;
6486 tree cond_reduc_val = NULL_TREE;
6488 /* Make sure it was already recognized as a reduction computation. */
6489 if (STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmt)) != vect_reduction_def
6490 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmt)) != vect_nested_cycle)
6491 return false;
6493 if (nested_in_vect_loop_p (loop, stmt))
6495 outer_loop = loop;
6496 loop = loop->inner;
6497 nested_cycle = true;
6500 /* In case of reduction chain we switch to the first stmt in the chain, but
6501 we don't update STMT_INFO, since only the last stmt is marked as reduction
6502 and has reduction properties. */
6503 if (GROUP_FIRST_ELEMENT (stmt_info)
6504 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
6506 stmt = GROUP_FIRST_ELEMENT (stmt_info);
6507 first_p = false;
6510 if (gimple_code (stmt) == GIMPLE_PHI)
6512 /* Analysis is fully done on the reduction stmt invocation. */
6513 if (! vec_stmt)
6515 if (slp_node)
6516 slp_node_instance->reduc_phis = slp_node;
6518 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
6519 return true;
6522 if (STMT_VINFO_REDUC_TYPE (stmt_info) == FOLD_LEFT_REDUCTION)
6523 /* Leave the scalar phi in place. Note that checking
6524 STMT_VINFO_VEC_REDUCTION_TYPE (as below) only works
6525 for reductions involving a single statement. */
6526 return true;
6528 gimple *reduc_stmt = STMT_VINFO_REDUC_DEF (stmt_info);
6529 if (STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (reduc_stmt)))
6530 reduc_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (reduc_stmt));
6532 if (STMT_VINFO_VEC_REDUCTION_TYPE (vinfo_for_stmt (reduc_stmt))
6533 == EXTRACT_LAST_REDUCTION)
6534 /* Leave the scalar phi in place. */
6535 return true;
6537 gcc_assert (is_gimple_assign (reduc_stmt));
6538 for (unsigned k = 1; k < gimple_num_ops (reduc_stmt); ++k)
6540 tree op = gimple_op (reduc_stmt, k);
6541 if (op == gimple_phi_result (stmt))
6542 continue;
6543 if (k == 1
6544 && gimple_assign_rhs_code (reduc_stmt) == COND_EXPR)
6545 continue;
6546 if (!vectype_in
6547 || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in)))
6548 < GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (op)))))
6549 vectype_in = get_vectype_for_scalar_type (TREE_TYPE (op));
6550 break;
6552 gcc_assert (vectype_in);
6554 if (slp_node)
6555 ncopies = 1;
6556 else
6557 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
6559 use_operand_p use_p;
6560 gimple *use_stmt;
6561 if (ncopies > 1
6562 && (STMT_VINFO_RELEVANT (vinfo_for_stmt (reduc_stmt))
6563 <= vect_used_only_live)
6564 && single_imm_use (gimple_phi_result (stmt), &use_p, &use_stmt)
6565 && (use_stmt == reduc_stmt
6566 || (STMT_VINFO_RELATED_STMT (vinfo_for_stmt (use_stmt))
6567 == reduc_stmt)))
6568 single_defuse_cycle = true;
6570 /* Create the destination vector */
6571 scalar_dest = gimple_assign_lhs (reduc_stmt);
6572 vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
6574 if (slp_node)
6575 /* The size vect_schedule_slp_instance computes is off for us. */
6576 vec_num = vect_get_num_vectors
6577 (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
6578 * SLP_TREE_SCALAR_STMTS (slp_node).length (),
6579 vectype_in);
6580 else
6581 vec_num = 1;
6583 /* Generate the reduction PHIs upfront. */
6584 prev_phi_info = NULL;
6585 for (j = 0; j < ncopies; j++)
6587 if (j == 0 || !single_defuse_cycle)
6589 for (i = 0; i < vec_num; i++)
6591 /* Create the reduction-phi that defines the reduction
6592 operand. */
6593 gimple *new_phi = create_phi_node (vec_dest, loop->header);
6594 set_vinfo_for_stmt (new_phi,
6595 new_stmt_vec_info (new_phi, loop_vinfo));
6597 if (slp_node)
6598 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_phi);
6599 else
6601 if (j == 0)
6602 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_phi;
6603 else
6604 STMT_VINFO_RELATED_STMT (prev_phi_info) = new_phi;
6605 prev_phi_info = vinfo_for_stmt (new_phi);
6611 return true;
6614 /* 1. Is vectorizable reduction? */
6615 /* Not supportable if the reduction variable is used in the loop, unless
6616 it's a reduction chain. */
6617 if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer
6618 && !GROUP_FIRST_ELEMENT (stmt_info))
6619 return false;
6621 /* Reductions that are not used even in an enclosing outer-loop,
6622 are expected to be "live" (used out of the loop). */
6623 if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope
6624 && !STMT_VINFO_LIVE_P (stmt_info))
6625 return false;
6627 /* 2. Has this been recognized as a reduction pattern?
6629 Check if STMT represents a pattern that has been recognized
6630 in earlier analysis stages. For stmts that represent a pattern,
6631 the STMT_VINFO_RELATED_STMT field records the last stmt in
6632 the original sequence that constitutes the pattern. */
6634 orig_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt));
6635 if (orig_stmt)
6637 orig_stmt_info = vinfo_for_stmt (orig_stmt);
6638 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
6639 gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
6642 /* 3. Check the operands of the operation. The first operands are defined
6643 inside the loop body. The last operand is the reduction variable,
6644 which is defined by the loop-header-phi. */
6646 gcc_assert (is_gimple_assign (stmt));
6648 /* Flatten RHS. */
6649 switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
6651 case GIMPLE_BINARY_RHS:
6652 code = gimple_assign_rhs_code (stmt);
6653 op_type = TREE_CODE_LENGTH (code);
6654 gcc_assert (op_type == binary_op);
6655 ops[0] = gimple_assign_rhs1 (stmt);
6656 ops[1] = gimple_assign_rhs2 (stmt);
6657 break;
6659 case GIMPLE_TERNARY_RHS:
6660 code = gimple_assign_rhs_code (stmt);
6661 op_type = TREE_CODE_LENGTH (code);
6662 gcc_assert (op_type == ternary_op);
6663 ops[0] = gimple_assign_rhs1 (stmt);
6664 ops[1] = gimple_assign_rhs2 (stmt);
6665 ops[2] = gimple_assign_rhs3 (stmt);
6666 break;
6668 case GIMPLE_UNARY_RHS:
6669 return false;
6671 default:
6672 gcc_unreachable ();
6675 if (code == COND_EXPR && slp_node)
6676 return false;
6678 scalar_dest = gimple_assign_lhs (stmt);
6679 scalar_type = TREE_TYPE (scalar_dest);
6680 if (!POINTER_TYPE_P (scalar_type) && !INTEGRAL_TYPE_P (scalar_type)
6681 && !SCALAR_FLOAT_TYPE_P (scalar_type))
6682 return false;
6684 /* Do not try to vectorize bit-precision reductions. */
6685 if (!type_has_mode_precision_p (scalar_type))
6686 return false;
6688 /* All uses but the last are expected to be defined in the loop.
6689 The last use is the reduction variable. In case of nested cycle this
6690 assumption is not true: we use reduc_index to record the index of the
6691 reduction variable. */
6692 gimple *reduc_def_stmt = NULL;
6693 int reduc_index = -1;
6694 for (i = 0; i < op_type; i++)
6696 /* The condition of COND_EXPR is checked in vectorizable_condition(). */
6697 if (i == 0 && code == COND_EXPR)
6698 continue;
6700 is_simple_use = vect_is_simple_use (ops[i], loop_vinfo,
6701 &def_stmt, &dts[i], &tem);
6702 dt = dts[i];
6703 gcc_assert (is_simple_use);
6704 if (dt == vect_reduction_def)
6706 reduc_def_stmt = def_stmt;
6707 reduc_index = i;
6708 continue;
6710 else if (tem)
6712 /* To properly compute ncopies we are interested in the widest
6713 input type in case we're looking at a widening accumulation. */
6714 if (!vectype_in
6715 || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in)))
6716 < GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (tem)))))
6717 vectype_in = tem;
6720 if (dt != vect_internal_def
6721 && dt != vect_external_def
6722 && dt != vect_constant_def
6723 && dt != vect_induction_def
6724 && !(dt == vect_nested_cycle && nested_cycle))
6725 return false;
6727 if (dt == vect_nested_cycle)
6729 found_nested_cycle_def = true;
6730 reduc_def_stmt = def_stmt;
6731 reduc_index = i;
6734 if (i == 1 && code == COND_EXPR)
6736 /* Record how value of COND_EXPR is defined. */
6737 if (dt == vect_constant_def)
6739 cond_reduc_dt = dt;
6740 cond_reduc_val = ops[i];
6742 if (dt == vect_induction_def
6743 && def_stmt != NULL
6744 && is_nonwrapping_integer_induction (def_stmt, loop))
6746 cond_reduc_dt = dt;
6747 cond_reduc_def_stmt = def_stmt;
6752 if (!vectype_in)
6753 vectype_in = vectype_out;
6755 /* When vectorizing a reduction chain w/o SLP the reduction PHI is not
6756 directly used in stmt. */
6757 if (reduc_index == -1)
6759 if (STMT_VINFO_REDUC_TYPE (stmt_info) == FOLD_LEFT_REDUCTION)
6761 if (dump_enabled_p ())
6762 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6763 "in-order reduction chain without SLP.\n");
6764 return false;
6767 if (orig_stmt)
6768 reduc_def_stmt = STMT_VINFO_REDUC_DEF (orig_stmt_info);
6769 else
6770 reduc_def_stmt = STMT_VINFO_REDUC_DEF (stmt_info);
6773 if (! reduc_def_stmt || gimple_code (reduc_def_stmt) != GIMPLE_PHI)
6774 return false;
6776 if (!(reduc_index == -1
6777 || dts[reduc_index] == vect_reduction_def
6778 || dts[reduc_index] == vect_nested_cycle
6779 || ((dts[reduc_index] == vect_internal_def
6780 || dts[reduc_index] == vect_external_def
6781 || dts[reduc_index] == vect_constant_def
6782 || dts[reduc_index] == vect_induction_def)
6783 && nested_cycle && found_nested_cycle_def)))
6785 /* For pattern recognized stmts, orig_stmt might be a reduction,
6786 but some helper statements for the pattern might not, or
6787 might be COND_EXPRs with reduction uses in the condition. */
6788 gcc_assert (orig_stmt);
6789 return false;
6792 stmt_vec_info reduc_def_info = vinfo_for_stmt (reduc_def_stmt);
6793 enum vect_reduction_type v_reduc_type
6794 = STMT_VINFO_REDUC_TYPE (reduc_def_info);
6795 gimple *tmp = STMT_VINFO_REDUC_DEF (reduc_def_info);
6797 STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) = v_reduc_type;
6798 /* If we have a condition reduction, see if we can simplify it further. */
6799 if (v_reduc_type == COND_REDUCTION)
6801 /* TODO: We can't yet handle reduction chains, since we need to treat
6802 each COND_EXPR in the chain specially, not just the last one.
6803 E.g. for:
6805 x_1 = PHI <x_3, ...>
6806 x_2 = a_2 ? ... : x_1;
6807 x_3 = a_3 ? ... : x_2;
6809 we're interested in the last element in x_3 for which a_2 || a_3
6810 is true, whereas the current reduction chain handling would
6811 vectorize x_2 as a normal VEC_COND_EXPR and only treat x_3
6812 as a reduction operation. */
6813 if (reduc_index == -1)
6815 if (dump_enabled_p ())
6816 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6817 "conditional reduction chains not supported\n");
6818 return false;
6821 /* vect_is_simple_reduction ensured that operand 2 is the
6822 loop-carried operand. */
6823 gcc_assert (reduc_index == 2);
6825 /* Loop peeling modifies initial value of reduction PHI, which
6826 makes the reduction stmt that is transformed differ from the
6827 original stmt that was analyzed. We need to record the reduction code
6828 for a CONST_COND_REDUCTION type reduction at the analysis stage, so that
6829 it can be used directly at the transform stage. */
6830 if (STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info) == MAX_EXPR
6831 || STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info) == MIN_EXPR)
6833 /* Also set the reduction type to CONST_COND_REDUCTION. */
6834 gcc_assert (cond_reduc_dt == vect_constant_def);
6835 STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) = CONST_COND_REDUCTION;
6837 else if (direct_internal_fn_supported_p (IFN_FOLD_EXTRACT_LAST,
6838 vectype_in, OPTIMIZE_FOR_SPEED))
6840 if (dump_enabled_p ())
6841 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6842 "optimizing condition reduction with"
6843 " FOLD_EXTRACT_LAST.\n");
6844 STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) = EXTRACT_LAST_REDUCTION;
6846 else if (cond_reduc_dt == vect_induction_def)
6848 stmt_vec_info cond_stmt_vinfo = vinfo_for_stmt (cond_reduc_def_stmt);
6849 tree base
6850 = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (cond_stmt_vinfo);
6851 tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (cond_stmt_vinfo);
6853 gcc_assert (TREE_CODE (base) == INTEGER_CST
6854 && TREE_CODE (step) == INTEGER_CST);
6855 cond_reduc_val = NULL_TREE;
6856 /* Find a suitable value, for MAX_EXPR below base, for MIN_EXPR
6857 above base; punt if base is the minimum value of the type for
6858 MAX_EXPR or maximum value of the type for MIN_EXPR for now. */
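/* For example, for a decreasing induction starting at 10 (with 10 below the
   maximum of its type) we pick MIN_EXPR and the value 11, which no value of
   the induction can reach.  */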
6859 if (tree_int_cst_sgn (step) == -1)
6861 cond_reduc_op_code = MIN_EXPR;
6862 if (tree_int_cst_sgn (base) == -1)
6863 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
6864 else if (tree_int_cst_lt (base,
6865 TYPE_MAX_VALUE (TREE_TYPE (base))))
6866 cond_reduc_val
6867 = int_const_binop (PLUS_EXPR, base, integer_one_node);
6869 else
6871 cond_reduc_op_code = MAX_EXPR;
6872 if (tree_int_cst_sgn (base) == 1)
6873 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
6874 else if (tree_int_cst_lt (TYPE_MIN_VALUE (TREE_TYPE (base)),
6875 base))
6876 cond_reduc_val
6877 = int_const_binop (MINUS_EXPR, base, integer_one_node);
6879 if (cond_reduc_val)
6881 if (dump_enabled_p ())
6882 dump_printf_loc (MSG_NOTE, vect_location,
6883 "condition expression based on "
6884 "integer induction.\n");
6885 STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
6886 = INTEGER_INDUC_COND_REDUCTION;
6889 else if (cond_reduc_dt == vect_constant_def)
6891 enum vect_def_type cond_initial_dt;
6892 gimple *def_stmt = SSA_NAME_DEF_STMT (ops[reduc_index]);
6893 tree cond_initial_val
6894 = PHI_ARG_DEF_FROM_EDGE (def_stmt, loop_preheader_edge (loop));
6896 gcc_assert (cond_reduc_val != NULL_TREE);
6897 vect_is_simple_use (cond_initial_val, loop_vinfo,
6898 &def_stmt, &cond_initial_dt);
6899 if (cond_initial_dt == vect_constant_def
6900 && types_compatible_p (TREE_TYPE (cond_initial_val),
6901 TREE_TYPE (cond_reduc_val)))
6903 tree e = fold_binary (LE_EXPR, boolean_type_node,
6904 cond_initial_val, cond_reduc_val);
6905 if (e && (integer_onep (e) || integer_zerop (e)))
6907 if (dump_enabled_p ())
6908 dump_printf_loc (MSG_NOTE, vect_location,
6909 "condition expression based on "
6910 "compile time constant.\n");
6911 /* Record reduction code at analysis stage. */
6912 STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info)
6913 = integer_onep (e) ? MAX_EXPR : MIN_EXPR;
6914 STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
6915 = CONST_COND_REDUCTION;
6921 if (orig_stmt)
6922 gcc_assert (tmp == orig_stmt
6923 || GROUP_FIRST_ELEMENT (vinfo_for_stmt (tmp)) == orig_stmt);
6924 else
6925 /* We changed STMT to be the first stmt in reduction chain, hence we
6926 check that in this case the first element in the chain is STMT. */
6927 gcc_assert (stmt == tmp
6928 || GROUP_FIRST_ELEMENT (vinfo_for_stmt (tmp)) == stmt);
6930 if (STMT_VINFO_LIVE_P (vinfo_for_stmt (reduc_def_stmt)))
6931 return false;
6933 if (slp_node)
6934 ncopies = 1;
6935 else
6936 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
6938 gcc_assert (ncopies >= 1);
6940 vec_mode = TYPE_MODE (vectype_in);
6941 poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
6943 if (code == COND_EXPR)
6945 /* Only call during the analysis stage, otherwise we'll lose
6946 STMT_VINFO_TYPE. */
6947 if (!vec_stmt && !vectorizable_condition (stmt, gsi, NULL,
6948 ops[reduc_index], 0, NULL))
6950 if (dump_enabled_p ())
6951 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6952 "unsupported condition in reduction\n");
6953 return false;
6956 else
6958 /* 4. Supportable by target? */
6960 if (code == LSHIFT_EXPR || code == RSHIFT_EXPR
6961 || code == LROTATE_EXPR || code == RROTATE_EXPR)
6963 /* Shifts and rotates are only supported by vectorizable_shifts,
6964 not vectorizable_reduction. */
6965 if (dump_enabled_p ())
6966 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6967 "unsupported shift or rotation.\n");
6968 return false;
6971 /* 4.1. check support for the operation in the loop */
6972 optab = optab_for_tree_code (code, vectype_in, optab_default);
6973 if (!optab)
6975 if (dump_enabled_p ())
6976 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6977 "no optab.\n");
6979 return false;
6982 if (optab_handler (optab, vec_mode) == CODE_FOR_nothing)
6984 if (dump_enabled_p ())
6985 dump_printf (MSG_NOTE, "op not supported by target.\n");
6987 if (maybe_ne (GET_MODE_SIZE (vec_mode), UNITS_PER_WORD)
6988 || !vect_worthwhile_without_simd_p (loop_vinfo, code))
6989 return false;
6991 if (dump_enabled_p ())
6992 dump_printf (MSG_NOTE, "proceeding using word mode.\n");
6995 /* Worthwhile without SIMD support? */
6996 if (!VECTOR_MODE_P (TYPE_MODE (vectype_in))
6997 && !vect_worthwhile_without_simd_p (loop_vinfo, code))
6999 if (dump_enabled_p ())
7000 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7001 "not worthwhile without SIMD support.\n");
7003 return false;
7007 /* 4.2. Check support for the epilog operation.
7009 If STMT represents a reduction pattern, then the type of the
7010 reduction variable may be different than the type of the rest
7011 of the arguments. For example, consider the case of accumulation
7012 of shorts into an int accumulator; The original code:
7013 S1: int_a = (int) short_a;
7014 orig_stmt-> S2: int_acc = plus <int_a ,int_acc>;
7016 was replaced with:
7017 STMT: int_acc = widen_sum <short_a, int_acc>
7019 This means that:
7020 1. The tree-code that is used to create the vector operation in the
7021 epilog code (that reduces the partial results) is not the
7022 tree-code of STMT, but is rather the tree-code of the original
7023 stmt from the pattern that STMT is replacing. I.e, in the example
7024 above we want to use 'widen_sum' in the loop, but 'plus' in the
7025 epilog.
7026 2. The type (mode) we use to check available target support
7027 for the vector operation to be created in the *epilog*, is
7028 determined by the type of the reduction variable (in the example
7029 above we'd check this: optab_handler (plus_optab, vect_int_mode])).
7030 However the type (mode) we use to check available target support
7031 for the vector operation to be created *inside the loop*, is
7032 determined by the type of the other arguments to STMT (in the
7033 example we'd check this: optab_handler (widen_sum_optab,
7034 vect_short_mode)).
7036 This is contrary to "regular" reductions, in which the types of all
7037 the arguments are the same as the type of the reduction variable.
7038 For "regular" reductions we can therefore use the same vector type
7039 (and also the same tree-code) when generating the epilog code and
7040 when generating the code inside the loop. */
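  /* As an illustrative sketch of the case above (names chosen only for
     this example, not taken from real sources), the original scalar loop
     might have been:

	 short a[N];
	 int sum = 0;
	 for (i = 0; i < N; i++)
	   sum += a[i];

     so the loop body uses widen_sum on vectors of shorts while the epilog
     reduces a single vector of ints with an ordinary plus.  */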
7042 vect_reduction_type reduction_type
7043 = STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info);
7044 if (orig_stmt
7045 && (reduction_type == TREE_CODE_REDUCTION
7046 || reduction_type == FOLD_LEFT_REDUCTION))
7048 /* This is a reduction pattern: get the vectype from the type of the
7049 reduction variable, and get the tree-code from orig_stmt. */
7050 orig_code = gimple_assign_rhs_code (orig_stmt);
7051 gcc_assert (vectype_out);
7052 vec_mode = TYPE_MODE (vectype_out);
7054 else
7056       /* Regular reduction: the same vectype and tree-code that are used for
7057 	 the vector code inside the loop can also be used for the epilog code. */
7058 orig_code = code;
7060 if (code == MINUS_EXPR)
7061 orig_code = PLUS_EXPR;
7063 /* For simple condition reductions, replace with the actual expression
7064 we want to base our reduction around. */
7065 if (reduction_type == CONST_COND_REDUCTION)
7067 orig_code = STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info);
7068 gcc_assert (orig_code == MAX_EXPR || orig_code == MIN_EXPR);
7070 else if (reduction_type == INTEGER_INDUC_COND_REDUCTION)
7071 orig_code = cond_reduc_op_code;
7074 if (nested_cycle)
7076 def_bb = gimple_bb (reduc_def_stmt);
7077 def_stmt_loop = def_bb->loop_father;
7078 def_arg = PHI_ARG_DEF_FROM_EDGE (reduc_def_stmt,
7079 loop_preheader_edge (def_stmt_loop));
7080 if (TREE_CODE (def_arg) == SSA_NAME
7081 && (def_arg_stmt = SSA_NAME_DEF_STMT (def_arg))
7082 && gimple_code (def_arg_stmt) == GIMPLE_PHI
7083 && flow_bb_inside_loop_p (outer_loop, gimple_bb (def_arg_stmt))
7084 && vinfo_for_stmt (def_arg_stmt)
7085 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_arg_stmt))
7086 == vect_double_reduction_def)
7087 double_reduc = true;
7090 reduc_fn = IFN_LAST;
7092 if (reduction_type == TREE_CODE_REDUCTION
7093 || reduction_type == FOLD_LEFT_REDUCTION
7094 || reduction_type == INTEGER_INDUC_COND_REDUCTION
7095 || reduction_type == CONST_COND_REDUCTION)
7097 if (reduction_type == FOLD_LEFT_REDUCTION
7098 ? fold_left_reduction_fn (orig_code, &reduc_fn)
7099 : reduction_fn_for_scalar_code (orig_code, &reduc_fn))
7101 if (reduc_fn != IFN_LAST
7102 && !direct_internal_fn_supported_p (reduc_fn, vectype_out,
7103 OPTIMIZE_FOR_SPEED))
7105 if (dump_enabled_p ())
7106 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7107 "reduc op not supported by target.\n");
7109 reduc_fn = IFN_LAST;
7112 else
7114 if (!nested_cycle || double_reduc)
7116 if (dump_enabled_p ())
7117 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7118 "no reduc code for scalar code.\n");
7120 return false;
7124 else if (reduction_type == COND_REDUCTION)
7126 int scalar_precision
7127 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
7128 cr_index_scalar_type = make_unsigned_type (scalar_precision);
7129 cr_index_vector_type = build_vector_type (cr_index_scalar_type,
7130 nunits_out);
7132 if (direct_internal_fn_supported_p (IFN_REDUC_MAX, cr_index_vector_type,
7133 OPTIMIZE_FOR_SPEED))
7134 reduc_fn = IFN_REDUC_MAX;
7137 if (reduction_type != EXTRACT_LAST_REDUCTION
7138 && reduc_fn == IFN_LAST
7139 && !nunits_out.is_constant ())
7141 if (dump_enabled_p ())
7142 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7143 "missing target support for reduction on"
7144 " variable-length vectors.\n");
7145 return false;
7148 if ((double_reduc || reduction_type != TREE_CODE_REDUCTION)
7149 && ncopies > 1)
7151 if (dump_enabled_p ())
7152 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7153 "multiple types in double reduction or condition "
7154 "reduction.\n");
7155 return false;
7158 /* For SLP reductions, see if there is a neutral value we can use. */
7159 tree neutral_op = NULL_TREE;
7160 if (slp_node)
7161 neutral_op
7162 = neutral_op_for_slp_reduction (slp_node_instance->reduc_phis, code,
7163 GROUP_FIRST_ELEMENT (stmt_info) != NULL);
7165 if (double_reduc && reduction_type == FOLD_LEFT_REDUCTION)
7167 /* We can't support in-order reductions of code such as this:
7169 for (int i = 0; i < n1; ++i)
7170 for (int j = 0; j < n2; ++j)
7171 l += a[j];
7173 since GCC effectively transforms the loop when vectorizing:
7175 for (int i = 0; i < n1 / VF; ++i)
7176 for (int j = 0; j < n2; ++j)
7177 for (int k = 0; k < VF; ++k)
7178 l += a[j];
7180 which is a reassociation of the original operation. */
7181 if (dump_enabled_p ())
7182 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7183 "in-order double reduction not supported.\n");
7185 return false;
7188 if (reduction_type == FOLD_LEFT_REDUCTION
7189 && slp_node
7190 && !GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
7192 /* We cannot use in-order reductions in this case because there is
7193 an implicit reassociation of the operations involved. */
7194 if (dump_enabled_p ())
7195 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7196 "in-order unchained SLP reductions not supported.\n");
7197 return false;
7200 /* For double reductions, and for SLP reductions with a neutral value,
7201 we construct a variable-length initial vector by loading a vector
7202 full of the neutral value and then shift-and-inserting the start
7203 values into the low-numbered elements. */
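  /* For instance (values purely for illustration): an add reduction with
     neutral value 0 and scalar start value s begins from {0, 0, ..., 0}
     and ends up with an initial vector equivalent to {s, 0, ..., 0}, which
     is why IFN_VEC_SHL_INSERT support is required below.  */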
7204 if ((double_reduc || neutral_op)
7205 && !nunits_out.is_constant ()
7206 && !direct_internal_fn_supported_p (IFN_VEC_SHL_INSERT,
7207 vectype_out, OPTIMIZE_FOR_SPEED))
7209 if (dump_enabled_p ())
7210 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7211 "reduction on variable-length vectors requires"
7212 " target support for a vector-shift-and-insert"
7213 " operation.\n");
7214 return false;
7217 /* Check extra constraints for variable-length unchained SLP reductions. */
7218 if (STMT_SLP_TYPE (stmt_info)
7219 && !GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt))
7220 && !nunits_out.is_constant ())
7222 /* We checked above that we could build the initial vector when
7223 there's a neutral element value. Check here for the case in
7224 which each SLP statement has its own initial value and in which
7225 that value needs to be repeated for every instance of the
7226 statement within the initial vector. */
7227 unsigned int group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
7228 scalar_mode elt_mode = SCALAR_TYPE_MODE (TREE_TYPE (vectype_out));
7229 if (!neutral_op
7230 && !can_duplicate_and_interleave_p (group_size, elt_mode))
7232 if (dump_enabled_p ())
7233 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7234 "unsupported form of SLP reduction for"
7235 " variable-length vectors: cannot build"
7236 " initial vector.\n");
7237 return false;
7239 /* The epilogue code relies on the number of elements being a multiple
7240 of the group size. The duplicate-and-interleave approach to setting
7241 	 up the initial vector does too.  */
7242 if (!multiple_p (nunits_out, group_size))
7244 if (dump_enabled_p ())
7245 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7246 "unsupported form of SLP reduction for"
7247 " variable-length vectors: the vector size"
7248 " is not a multiple of the number of results.\n");
7249 return false;
7253   /* In case of widening multiplication by a constant, we update the type
7254 of the constant to be the type of the other operand. We check that the
7255 constant fits the type in the pattern recognition pass. */
7256 if (code == DOT_PROD_EXPR
7257 && !types_compatible_p (TREE_TYPE (ops[0]), TREE_TYPE (ops[1])))
7259 if (TREE_CODE (ops[0]) == INTEGER_CST)
7260 ops[0] = fold_convert (TREE_TYPE (ops[1]), ops[0]);
7261 else if (TREE_CODE (ops[1]) == INTEGER_CST)
7262 ops[1] = fold_convert (TREE_TYPE (ops[0]), ops[1]);
7263 else
7265 if (dump_enabled_p ())
7266 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7267 "invalid types in dot-prod\n");
7269 return false;
7273 if (reduction_type == COND_REDUCTION)
7275 widest_int ni;
7277 if (! max_loop_iterations (loop, &ni))
7279 if (dump_enabled_p ())
7280 dump_printf_loc (MSG_NOTE, vect_location,
7281 "loop count not known, cannot create cond "
7282 "reduction.\n");
7283 return false;
7285 /* Convert backedges to iterations. */
7286 ni += 1;
7288       /* The additional index will be the same type as the condition.  Check
7289 	 that the loop iteration count fits into this type less one (because
7290 	 we'll use up the zero slot for when there are no matches).  */
7291 tree max_index = TYPE_MAX_VALUE (cr_index_scalar_type);
7292 if (wi::geu_p (ni, wi::to_widest (max_index)))
7294 if (dump_enabled_p ())
7295 dump_printf_loc (MSG_NOTE, vect_location,
7296 "loop size is greater than data size.\n");
7297 return false;
7301 /* In case the vectorization factor (VF) is bigger than the number
7302 of elements that we can fit in a vectype (nunits), we have to generate
7303 more than one vector stmt - i.e - we need to "unroll" the
7304 vector stmt by a factor VF/nunits. For more details see documentation
7305 in vectorizable_operation. */
7307 /* If the reduction is used in an outer loop we need to generate
7308 VF intermediate results, like so (e.g. for ncopies=2):
7309 r0 = phi (init, r0)
7310 r1 = phi (init, r1)
7311 r0 = x0 + r0;
7312 r1 = x1 + r1;
7313 (i.e. we generate VF results in 2 registers).
7314 In this case we have a separate def-use cycle for each copy, and therefore
7315 for each copy we get the vector def for the reduction variable from the
7316 respective phi node created for this copy.
7318 Otherwise (the reduction is unused in the loop nest), we can combine
7319 together intermediate results, like so (e.g. for ncopies=2):
7320 r = phi (init, r)
7321 r = x0 + r;
7322 r = x1 + r;
7323 (i.e. we generate VF/2 results in a single register).
7324 In this case for each copy we get the vector def for the reduction variable
7325 from the vectorized reduction operation generated in the previous iteration.
7327 This only works when we see both the reduction PHI and its only consumer
7328 in vectorizable_reduction and there are no intermediate stmts
7329 participating. */
7330 use_operand_p use_p;
7331 gimple *use_stmt;
7332 if (ncopies > 1
7333 && (STMT_VINFO_RELEVANT (stmt_info) <= vect_used_only_live)
7334 && single_imm_use (gimple_phi_result (reduc_def_stmt), &use_p, &use_stmt)
7335 && (use_stmt == stmt
7336 || STMT_VINFO_RELATED_STMT (vinfo_for_stmt (use_stmt)) == stmt))
7338 single_defuse_cycle = true;
7339 epilog_copies = 1;
7341 else
7342 epilog_copies = ncopies;
7344   /* If the reduction stmt is one of the patterns that have lane
7345      reduction embedded, we cannot handle the case of ! single_defuse_cycle.  */
7346 if ((ncopies > 1
7347 && ! single_defuse_cycle)
7348 && (code == DOT_PROD_EXPR
7349 || code == WIDEN_SUM_EXPR
7350 || code == SAD_EXPR))
7352 if (dump_enabled_p ())
7353 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7354 "multi def-use cycle not possible for lane-reducing "
7355 "reduction operation\n");
7356 return false;
7359 if (slp_node)
7360 vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
7361 else
7362 vec_num = 1;
7364 internal_fn cond_fn = get_conditional_internal_fn (code);
7365 vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
7367 if (!vec_stmt) /* transformation not required. */
7369 if (first_p)
7370 vect_model_reduction_cost (stmt_info, reduc_fn, ncopies);
7371 if (loop_vinfo && LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo))
7373 if (reduction_type != FOLD_LEFT_REDUCTION
7374 && (cond_fn == IFN_LAST
7375 || !direct_internal_fn_supported_p (cond_fn, vectype_in,
7376 OPTIMIZE_FOR_SPEED)))
7378 if (dump_enabled_p ())
7379 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7380 "can't use a fully-masked loop because no"
7381 " conditional operation is available.\n");
7382 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
7384 else if (reduc_index == -1)
7386 if (dump_enabled_p ())
7387 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7388 "can't use a fully-masked loop for chained"
7389 " reductions.\n");
7390 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
7392 else
7393 vect_record_loop_mask (loop_vinfo, masks, ncopies * vec_num,
7394 vectype_in);
7396 if (dump_enabled_p ()
7397 && reduction_type == FOLD_LEFT_REDUCTION)
7398 dump_printf_loc (MSG_NOTE, vect_location,
7399 "using an in-order (fold-left) reduction.\n");
7400 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
7401 return true;
7404 /* Transform. */
7406 if (dump_enabled_p ())
7407 dump_printf_loc (MSG_NOTE, vect_location, "transform reduction.\n");
7409 /* FORNOW: Multiple types are not supported for condition. */
7410 if (code == COND_EXPR)
7411 gcc_assert (ncopies == 1);
7413 bool masked_loop_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
7415 if (reduction_type == FOLD_LEFT_REDUCTION)
7416 return vectorize_fold_left_reduction
7417 (stmt, gsi, vec_stmt, slp_node, reduc_def_stmt, code,
7418 reduc_fn, ops, vectype_in, reduc_index, masks);
7420 if (reduction_type == EXTRACT_LAST_REDUCTION)
7422 gcc_assert (!slp_node);
7423 return vectorizable_condition (stmt, gsi, vec_stmt,
7424 NULL, reduc_index, NULL);
7427 /* Create the destination vector */
7428 vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
7430 prev_stmt_info = NULL;
7431 prev_phi_info = NULL;
7432 if (!slp_node)
7434 vec_oprnds0.create (1);
7435 vec_oprnds1.create (1);
7436 if (op_type == ternary_op)
7437 vec_oprnds2.create (1);
7440 phis.create (vec_num);
7441 vect_defs.create (vec_num);
7442 if (!slp_node)
7443 vect_defs.quick_push (NULL_TREE);
7445 if (slp_node)
7446 phis.splice (SLP_TREE_VEC_STMTS (slp_node_instance->reduc_phis));
7447 else
7448 phis.quick_push (STMT_VINFO_VEC_STMT (vinfo_for_stmt (reduc_def_stmt)));
7450 for (j = 0; j < ncopies; j++)
7452 if (code == COND_EXPR)
7454 gcc_assert (!slp_node);
7455 vectorizable_condition (stmt, gsi, vec_stmt,
7456 PHI_RESULT (phis[0]),
7457 reduc_index, NULL);
7458 /* Multiple types are not supported for condition. */
7459 break;
7462 /* Handle uses. */
7463 if (j == 0)
7465 if (slp_node)
7467 /* Get vec defs for all the operands except the reduction index,
7468 ensuring the ordering of the ops in the vector is kept. */
7469 auto_vec<tree, 3> slp_ops;
7470 auto_vec<vec<tree>, 3> vec_defs;
7472 slp_ops.quick_push (ops[0]);
7473 slp_ops.quick_push (ops[1]);
7474 if (op_type == ternary_op)
7475 slp_ops.quick_push (ops[2]);
7477 vect_get_slp_defs (slp_ops, slp_node, &vec_defs);
7479 vec_oprnds0.safe_splice (vec_defs[0]);
7480 vec_defs[0].release ();
7481 vec_oprnds1.safe_splice (vec_defs[1]);
7482 vec_defs[1].release ();
7483 if (op_type == ternary_op)
7485 vec_oprnds2.safe_splice (vec_defs[2]);
7486 vec_defs[2].release ();
7489 else
7491 vec_oprnds0.quick_push
7492 (vect_get_vec_def_for_operand (ops[0], stmt));
7493 vec_oprnds1.quick_push
7494 (vect_get_vec_def_for_operand (ops[1], stmt));
7495 if (op_type == ternary_op)
7496 vec_oprnds2.quick_push
7497 (vect_get_vec_def_for_operand (ops[2], stmt));
7500 else
7502 if (!slp_node)
7504 gcc_assert (reduc_index != -1 || ! single_defuse_cycle);
7506 if (single_defuse_cycle && reduc_index == 0)
7507 vec_oprnds0[0] = gimple_get_lhs (new_stmt);
7508 else
7509 vec_oprnds0[0]
7510 = vect_get_vec_def_for_stmt_copy (dts[0], vec_oprnds0[0]);
7511 if (single_defuse_cycle && reduc_index == 1)
7512 vec_oprnds1[0] = gimple_get_lhs (new_stmt);
7513 else
7514 vec_oprnds1[0]
7515 = vect_get_vec_def_for_stmt_copy (dts[1], vec_oprnds1[0]);
7516 if (op_type == ternary_op)
7518 if (single_defuse_cycle && reduc_index == 2)
7519 vec_oprnds2[0] = gimple_get_lhs (new_stmt);
7520 else
7521 vec_oprnds2[0]
7522 = vect_get_vec_def_for_stmt_copy (dts[2], vec_oprnds2[0]);
7527 FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
7529 tree vop[3] = { def0, vec_oprnds1[i], NULL_TREE };
7530 if (masked_loop_p)
7532 /* Make sure that the reduction accumulator is vop[0]. */
7533 if (reduc_index == 1)
7535 gcc_assert (commutative_tree_code (code));
7536 std::swap (vop[0], vop[1]);
7538 tree mask = vect_get_loop_mask (gsi, masks, vec_num * ncopies,
7539 vectype_in, i * ncopies + j);
7540 gcall *call = gimple_build_call_internal (cond_fn, 3, mask,
7541 vop[0], vop[1]);
7542 new_temp = make_ssa_name (vec_dest, call);
7543 gimple_call_set_lhs (call, new_temp);
7544 gimple_call_set_nothrow (call, true);
7545 new_stmt = call;
7547 else
7549 if (op_type == ternary_op)
7550 vop[2] = vec_oprnds2[i];
7552 new_temp = make_ssa_name (vec_dest, new_stmt);
7553 new_stmt = gimple_build_assign (new_temp, code,
7554 vop[0], vop[1], vop[2]);
7556 vect_finish_stmt_generation (stmt, new_stmt, gsi);
7558 if (slp_node)
7560 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
7561 vect_defs.quick_push (new_temp);
7563 else
7564 vect_defs[0] = new_temp;
7567 if (slp_node)
7568 continue;
7570 if (j == 0)
7571 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
7572 else
7573 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
7575 prev_stmt_info = vinfo_for_stmt (new_stmt);
7578 /* Finalize the reduction-phi (set its arguments) and create the
7579 epilog reduction code. */
7580 if ((!single_defuse_cycle || code == COND_EXPR) && !slp_node)
7581 vect_defs[0] = gimple_get_lhs (*vec_stmt);
7583 vect_create_epilog_for_reduction (vect_defs, stmt, reduc_def_stmt,
7584 epilog_copies, reduc_fn, phis,
7585 double_reduc, slp_node, slp_node_instance,
7586 cond_reduc_val, cond_reduc_op_code,
7587 neutral_op);
7589 return true;
7592 /* Function vect_min_worthwhile_factor.
7594 For a loop where we could vectorize the operation indicated by CODE,
7595 return the minimum vectorization factor that makes it worthwhile
7596 to use generic vectors. */
7597 static unsigned int
7598 vect_min_worthwhile_factor (enum tree_code code)
7600 switch (code)
7602 case PLUS_EXPR:
7603 case MINUS_EXPR:
7604 case NEGATE_EXPR:
7605 return 4;
7607 case BIT_AND_EXPR:
7608 case BIT_IOR_EXPR:
7609 case BIT_XOR_EXPR:
7610 case BIT_NOT_EXPR:
7611 return 2;
7613 default:
7614 return INT_MAX;
7618 /* Return true if VINFO indicates we are doing loop vectorization and if
7619 it is worth decomposing CODE operations into scalar operations for
7620 that loop's vectorization factor. */
7622 bool
7623 vect_worthwhile_without_simd_p (vec_info *vinfo, tree_code code)
7625 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
7626 unsigned HOST_WIDE_INT value;
7627 return (loop_vinfo
7628 && LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&value)
7629 && value >= vect_min_worthwhile_factor (code));
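/* For example, going by the factors above, decomposing a PLUS_EXPR into
   scalar operations is only considered worthwhile when the loop's
   vectorization factor is a compile-time constant of at least 4, whereas
   bitwise operations such as BIT_AND_EXPR need a factor of only 2.  */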
7632 /* Function vectorizable_induction
7634 Check if PHI performs an induction computation that can be vectorized.
7635 If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
7636 phi to replace it, put it in VEC_STMT, and add it to the same basic block.
7637 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
7639 bool
7640 vectorizable_induction (gimple *phi,
7641 gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
7642 gimple **vec_stmt, slp_tree slp_node)
7644 stmt_vec_info stmt_info = vinfo_for_stmt (phi);
7645 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
7646 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7647 unsigned ncopies;
7648 bool nested_in_vect_loop = false;
7649 struct loop *iv_loop;
7650 tree vec_def;
7651 edge pe = loop_preheader_edge (loop);
7652 basic_block new_bb;
7653 tree new_vec, vec_init, vec_step, t;
7654 tree new_name;
7655 gimple *new_stmt;
7656 gphi *induction_phi;
7657 tree induc_def, vec_dest;
7658 tree init_expr, step_expr;
7659 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
7660 unsigned i;
7661 tree expr;
7662 gimple_seq stmts;
7663 imm_use_iterator imm_iter;
7664 use_operand_p use_p;
7665 gimple *exit_phi;
7666 edge latch_e;
7667 tree loop_arg;
7668 gimple_stmt_iterator si;
7669 basic_block bb = gimple_bb (phi);
7671 if (gimple_code (phi) != GIMPLE_PHI)
7672 return false;
7674 if (!STMT_VINFO_RELEVANT_P (stmt_info))
7675 return false;
7677 /* Make sure it was recognized as induction computation. */
7678 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
7679 return false;
7681 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
7682 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
7684 if (slp_node)
7685 ncopies = 1;
7686 else
7687 ncopies = vect_get_num_copies (loop_vinfo, vectype);
7688 gcc_assert (ncopies >= 1);
7690 /* FORNOW. These restrictions should be relaxed. */
7691 if (nested_in_vect_loop_p (loop, phi))
7693 imm_use_iterator imm_iter;
7694 use_operand_p use_p;
7695 gimple *exit_phi;
7696 edge latch_e;
7697 tree loop_arg;
7699 if (ncopies > 1)
7701 if (dump_enabled_p ())
7702 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7703 "multiple types in nested loop.\n");
7704 return false;
7707 /* FORNOW: outer loop induction with SLP not supported. */
7708 if (STMT_SLP_TYPE (stmt_info))
7709 return false;
7711 exit_phi = NULL;
7712 latch_e = loop_latch_edge (loop->inner);
7713 loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
7714 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
7716 gimple *use_stmt = USE_STMT (use_p);
7717 if (is_gimple_debug (use_stmt))
7718 continue;
7720 if (!flow_bb_inside_loop_p (loop->inner, gimple_bb (use_stmt)))
7722 exit_phi = use_stmt;
7723 break;
7726 if (exit_phi)
7728 stmt_vec_info exit_phi_vinfo = vinfo_for_stmt (exit_phi);
7729 if (!(STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
7730 && !STMT_VINFO_LIVE_P (exit_phi_vinfo)))
7732 if (dump_enabled_p ())
7733 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7734 "inner-loop induction only used outside "
7735 "of the outer vectorized loop.\n");
7736 return false;
7740 nested_in_vect_loop = true;
7741 iv_loop = loop->inner;
7743 else
7744 iv_loop = loop;
7745 gcc_assert (iv_loop == (gimple_bb (phi))->loop_father);
7747 if (slp_node && !nunits.is_constant ())
7749 /* The current SLP code creates the initial value element-by-element. */
7750 if (dump_enabled_p ())
7751 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7752 "SLP induction not supported for variable-length"
7753 " vectors.\n");
7754 return false;
7757 if (!vec_stmt) /* transformation not required. */
7759 STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
7760 if (dump_enabled_p ())
7761 dump_printf_loc (MSG_NOTE, vect_location,
7762 "=== vectorizable_induction ===\n");
7763 vect_model_induction_cost (stmt_info, ncopies);
7764 return true;
7767 /* Transform. */
7769 /* Compute a vector variable, initialized with the first VF values of
7770 the induction variable. E.g., for an iv with IV_PHI='X' and
7771 evolution S, for a vector of 4 units, we want to compute:
7772 [X, X + S, X + 2*S, X + 3*S]. */
7774 if (dump_enabled_p ())
7775 dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n");
7777 latch_e = loop_latch_edge (iv_loop);
7778 loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
7780 step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info);
7781 gcc_assert (step_expr != NULL_TREE);
7783 pe = loop_preheader_edge (iv_loop);
7784 init_expr = PHI_ARG_DEF_FROM_EDGE (phi,
7785 loop_preheader_edge (iv_loop));
7787 stmts = NULL;
7788 if (!nested_in_vect_loop)
7790 /* Convert the initial value to the desired type. */
7791 tree new_type = TREE_TYPE (vectype);
7792 init_expr = gimple_convert (&stmts, new_type, init_expr);
7794 /* If we are using the loop mask to "peel" for alignment then we need
7795 to adjust the start value here. */
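      /* Roughly speaking (illustration only): if the first SKIP lanes of
	 the first vector iteration are masked out, the IV vector must
	 start at X - SKIP*S so that lane SKIP still sees the original
	 initial value X, lane SKIP+1 sees X + S, and so on.  */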
7796 tree skip_niters = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
7797 if (skip_niters != NULL_TREE)
7799 if (FLOAT_TYPE_P (vectype))
7800 skip_niters = gimple_build (&stmts, FLOAT_EXPR, new_type,
7801 skip_niters);
7802 else
7803 skip_niters = gimple_convert (&stmts, new_type, skip_niters);
7804 tree skip_step = gimple_build (&stmts, MULT_EXPR, new_type,
7805 skip_niters, step_expr);
7806 init_expr = gimple_build (&stmts, MINUS_EXPR, new_type,
7807 init_expr, skip_step);
7811 /* Convert the step to the desired type. */
7812 step_expr = gimple_convert (&stmts, TREE_TYPE (vectype), step_expr);
7814 if (stmts)
7816 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
7817 gcc_assert (!new_bb);
7820 /* Find the first insertion point in the BB. */
7821 si = gsi_after_labels (bb);
7823 /* For SLP induction we have to generate several IVs as for example
7824 with group size 3 we need [i, i, i, i + S] [i + S, i + S, i + 2*S, i + 2*S]
7825 [i + 2*S, i + 3*S, i + 3*S, i + 3*S]. The step is the same uniform
7826 [VF*S, VF*S, VF*S, VF*S] for all. */
7827 if (slp_node)
7829 /* Enforced above. */
7830 unsigned int const_nunits = nunits.to_constant ();
7832 /* Generate [VF*S, VF*S, ... ]. */
7833 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7835 expr = build_int_cst (integer_type_node, vf);
7836 expr = fold_convert (TREE_TYPE (step_expr), expr);
7838 else
7839 expr = build_int_cst (TREE_TYPE (step_expr), vf);
7840 new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
7841 expr, step_expr);
7842 if (! CONSTANT_CLASS_P (new_name))
7843 new_name = vect_init_vector (phi, new_name,
7844 TREE_TYPE (step_expr), NULL);
7845 new_vec = build_vector_from_val (vectype, new_name);
7846 vec_step = vect_init_vector (phi, new_vec, vectype, NULL);
7848 /* Now generate the IVs. */
7849 unsigned group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
7850 unsigned nvects = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
7851 unsigned elts = const_nunits * nvects;
7852 unsigned nivs = least_common_multiple (group_size,
7853 const_nunits) / const_nunits;
7854 gcc_assert (elts % group_size == 0);
7855 tree elt = init_expr;
7856 unsigned ivn;
7857 for (ivn = 0; ivn < nivs; ++ivn)
7859 tree_vector_builder elts (vectype, const_nunits, 1);
7860 stmts = NULL;
7861 for (unsigned eltn = 0; eltn < const_nunits; ++eltn)
7863 if (ivn*const_nunits + eltn >= group_size
7864 && (ivn * const_nunits + eltn) % group_size == 0)
7865 elt = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (elt),
7866 elt, step_expr);
7867 elts.quick_push (elt);
7869 vec_init = gimple_build_vector (&stmts, &elts);
7870 if (stmts)
7872 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
7873 gcc_assert (!new_bb);
7876 /* Create the induction-phi that defines the induction-operand. */
7877 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
7878 induction_phi = create_phi_node (vec_dest, iv_loop->header);
7879 set_vinfo_for_stmt (induction_phi,
7880 new_stmt_vec_info (induction_phi, loop_vinfo));
7881 induc_def = PHI_RESULT (induction_phi);
7883 /* Create the iv update inside the loop */
7884 vec_def = make_ssa_name (vec_dest);
7885 new_stmt = gimple_build_assign (vec_def, PLUS_EXPR, induc_def, vec_step);
7886 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
7887 set_vinfo_for_stmt (new_stmt, new_stmt_vec_info (new_stmt, loop_vinfo));
7889 /* Set the arguments of the phi node: */
7890 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
7891 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
7892 UNKNOWN_LOCATION);
7894 SLP_TREE_VEC_STMTS (slp_node).quick_push (induction_phi);
7897 /* Re-use IVs when we can. */
7898 if (ivn < nvects)
7900 unsigned vfp
7901 = least_common_multiple (group_size, const_nunits) / group_size;
7902 /* Generate [VF'*S, VF'*S, ... ]. */
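	  /* E.g. for the group-size-3, 4-unit example above, VF' is
	     lcm (3, 4) / 3 = 4, so each reused IV is just the IV generated
	     NIVS vectors earlier plus [4*S, 4*S, 4*S, 4*S].  */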
7903 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7905 expr = build_int_cst (integer_type_node, vfp);
7906 expr = fold_convert (TREE_TYPE (step_expr), expr);
7908 else
7909 expr = build_int_cst (TREE_TYPE (step_expr), vfp);
7910 new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
7911 expr, step_expr);
7912 if (! CONSTANT_CLASS_P (new_name))
7913 new_name = vect_init_vector (phi, new_name,
7914 TREE_TYPE (step_expr), NULL);
7915 new_vec = build_vector_from_val (vectype, new_name);
7916 vec_step = vect_init_vector (phi, new_vec, vectype, NULL);
7917 for (; ivn < nvects; ++ivn)
7919 gimple *iv = SLP_TREE_VEC_STMTS (slp_node)[ivn - nivs];
7920 tree def;
7921 if (gimple_code (iv) == GIMPLE_PHI)
7922 def = gimple_phi_result (iv);
7923 else
7924 def = gimple_assign_lhs (iv);
7925 new_stmt = gimple_build_assign (make_ssa_name (vectype),
7926 PLUS_EXPR,
7927 def, vec_step);
7928 if (gimple_code (iv) == GIMPLE_PHI)
7929 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
7930 else
7932 gimple_stmt_iterator tgsi = gsi_for_stmt (iv);
7933 gsi_insert_after (&tgsi, new_stmt, GSI_CONTINUE_LINKING);
7935 set_vinfo_for_stmt (new_stmt,
7936 new_stmt_vec_info (new_stmt, loop_vinfo));
7937 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
7941 return true;
7944 /* Create the vector that holds the initial_value of the induction. */
7945 if (nested_in_vect_loop)
7947 /* iv_loop is nested in the loop to be vectorized. init_expr had already
7948 been created during vectorization of previous stmts. We obtain it
7949 from the STMT_VINFO_VEC_STMT of the defining stmt. */
7950 vec_init = vect_get_vec_def_for_operand (init_expr, phi);
7951 /* If the initial value is not of proper type, convert it. */
7952 if (!useless_type_conversion_p (vectype, TREE_TYPE (vec_init)))
7954 new_stmt
7955 = gimple_build_assign (vect_get_new_ssa_name (vectype,
7956 vect_simple_var,
7957 "vec_iv_"),
7958 VIEW_CONVERT_EXPR,
7959 build1 (VIEW_CONVERT_EXPR, vectype,
7960 vec_init));
7961 vec_init = gimple_assign_lhs (new_stmt);
7962 new_bb = gsi_insert_on_edge_immediate (loop_preheader_edge (iv_loop),
7963 new_stmt);
7964 gcc_assert (!new_bb);
7965 set_vinfo_for_stmt (new_stmt,
7966 new_stmt_vec_info (new_stmt, loop_vinfo));
7969 else
7971 /* iv_loop is the loop to be vectorized. Create:
7972 vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr) */
7973 stmts = NULL;
7974 new_name = gimple_convert (&stmts, TREE_TYPE (vectype), init_expr);
7976 unsigned HOST_WIDE_INT const_nunits;
7977 if (nunits.is_constant (&const_nunits))
7979 tree_vector_builder elts (vectype, const_nunits, 1);
7980 elts.quick_push (new_name);
7981 for (i = 1; i < const_nunits; i++)
7983 /* Create: new_name_i = new_name + step_expr */
7984 new_name = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (new_name),
7985 new_name, step_expr);
7986 elts.quick_push (new_name);
7988 /* Create a vector from [new_name_0, new_name_1, ...,
7989 new_name_nunits-1] */
7990 vec_init = gimple_build_vector (&stmts, &elts);
7992 else if (INTEGRAL_TYPE_P (TREE_TYPE (step_expr)))
7993 /* Build the initial value directly from a VEC_SERIES_EXPR. */
7994 vec_init = gimple_build (&stmts, VEC_SERIES_EXPR, vectype,
7995 new_name, step_expr);
7996 else
7998 /* Build:
7999 [base, base, base, ...]
8000 + (vectype) [0, 1, 2, ...] * [step, step, step, ...]. */
8001 gcc_assert (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)));
8002 gcc_assert (flag_associative_math);
8003 tree index = build_index_vector (vectype, 0, 1);
8004 tree base_vec = gimple_build_vector_from_val (&stmts, vectype,
8005 new_name);
8006 tree step_vec = gimple_build_vector_from_val (&stmts, vectype,
8007 step_expr);
8008 vec_init = gimple_build (&stmts, FLOAT_EXPR, vectype, index);
8009 vec_init = gimple_build (&stmts, MULT_EXPR, vectype,
8010 vec_init, step_vec);
8011 vec_init = gimple_build (&stmts, PLUS_EXPR, vectype,
8012 vec_init, base_vec);
8015 if (stmts)
8017 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
8018 gcc_assert (!new_bb);
8023 /* Create the vector that holds the step of the induction. */
8024 if (nested_in_vect_loop)
8025 /* iv_loop is nested in the loop to be vectorized. Generate:
8026 vec_step = [S, S, S, S] */
8027 new_name = step_expr;
8028 else
8030 /* iv_loop is the loop to be vectorized. Generate:
8031 vec_step = [VF*S, VF*S, VF*S, VF*S] */
8032 gimple_seq seq = NULL;
8033 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
8035 expr = build_int_cst (integer_type_node, vf);
8036 expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
8038 else
8039 expr = build_int_cst (TREE_TYPE (step_expr), vf);
8040 new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
8041 expr, step_expr);
8042 if (seq)
8044 new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
8045 gcc_assert (!new_bb);
8049 t = unshare_expr (new_name);
8050 gcc_assert (CONSTANT_CLASS_P (new_name)
8051 || TREE_CODE (new_name) == SSA_NAME);
8052 new_vec = build_vector_from_val (vectype, t);
8053 vec_step = vect_init_vector (phi, new_vec, vectype, NULL);
8056 /* Create the following def-use cycle:
8057 loop prolog:
8058 vec_init = ...
8059 vec_step = ...
8060 loop:
8061 vec_iv = PHI <vec_init, vec_loop>
8063 STMT
8065 vec_loop = vec_iv + vec_step; */
8067 /* Create the induction-phi that defines the induction-operand. */
8068 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
8069 induction_phi = create_phi_node (vec_dest, iv_loop->header);
8070 set_vinfo_for_stmt (induction_phi,
8071 new_stmt_vec_info (induction_phi, loop_vinfo));
8072 induc_def = PHI_RESULT (induction_phi);
8074 /* Create the iv update inside the loop */
8075 vec_def = make_ssa_name (vec_dest);
8076 new_stmt = gimple_build_assign (vec_def, PLUS_EXPR, induc_def, vec_step);
8077 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
8078 set_vinfo_for_stmt (new_stmt, new_stmt_vec_info (new_stmt, loop_vinfo));
8080 /* Set the arguments of the phi node: */
8081 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
8082 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
8083 UNKNOWN_LOCATION);
8085 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = induction_phi;
8087   /* In case the vectorization factor (VF) is bigger than the number
8088 of elements that we can fit in a vectype (nunits), we have to generate
8089 more than one vector stmt - i.e - we need to "unroll" the
8090 vector stmt by a factor VF/nunits. For more details see documentation
8091 in vectorizable_operation. */
8093 if (ncopies > 1)
8095 gimple_seq seq = NULL;
8096 stmt_vec_info prev_stmt_vinfo;
8097 /* FORNOW. This restriction should be relaxed. */
8098 gcc_assert (!nested_in_vect_loop);
8100 /* Create the vector that holds the step of the induction. */
8101 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
8103 expr = build_int_cst (integer_type_node, nunits);
8104 expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
8106 else
8107 expr = build_int_cst (TREE_TYPE (step_expr), nunits);
8108 new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
8109 expr, step_expr);
8110 if (seq)
8112 new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
8113 gcc_assert (!new_bb);
8116 t = unshare_expr (new_name);
8117 gcc_assert (CONSTANT_CLASS_P (new_name)
8118 || TREE_CODE (new_name) == SSA_NAME);
8119 new_vec = build_vector_from_val (vectype, t);
8120 vec_step = vect_init_vector (phi, new_vec, vectype, NULL);
8122 vec_def = induc_def;
8123 prev_stmt_vinfo = vinfo_for_stmt (induction_phi);
8124 for (i = 1; i < ncopies; i++)
8126 /* vec_i = vec_prev + vec_step */
8127 new_stmt = gimple_build_assign (vec_dest, PLUS_EXPR,
8128 vec_def, vec_step);
8129 vec_def = make_ssa_name (vec_dest, new_stmt);
8130 gimple_assign_set_lhs (new_stmt, vec_def);
8132 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
8133 set_vinfo_for_stmt (new_stmt,
8134 new_stmt_vec_info (new_stmt, loop_vinfo));
8135 STMT_VINFO_RELATED_STMT (prev_stmt_vinfo) = new_stmt;
8136 prev_stmt_vinfo = vinfo_for_stmt (new_stmt);
8140 if (nested_in_vect_loop)
8142 /* Find the loop-closed exit-phi of the induction, and record
8143 the final vector of induction results: */
8144 exit_phi = NULL;
8145 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
8147 gimple *use_stmt = USE_STMT (use_p);
8148 if (is_gimple_debug (use_stmt))
8149 continue;
8151 if (!flow_bb_inside_loop_p (iv_loop, gimple_bb (use_stmt)))
8153 exit_phi = use_stmt;
8154 break;
8157 if (exit_phi)
8159 stmt_vec_info stmt_vinfo = vinfo_for_stmt (exit_phi);
8160 /* FORNOW. Currently not supporting the case that an inner-loop induction
8161 is not used in the outer-loop (i.e. only outside the outer-loop). */
8162 gcc_assert (STMT_VINFO_RELEVANT_P (stmt_vinfo)
8163 && !STMT_VINFO_LIVE_P (stmt_vinfo));
8165 STMT_VINFO_VEC_STMT (stmt_vinfo) = new_stmt;
8166 if (dump_enabled_p ())
8168 dump_printf_loc (MSG_NOTE, vect_location,
8169 "vector of inductions after inner-loop:");
8170 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, new_stmt, 0);
8176 if (dump_enabled_p ())
8178 dump_printf_loc (MSG_NOTE, vect_location,
8179 "transform induction: created def-use cycle: ");
8180 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, induction_phi, 0);
8181 dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
8182 SSA_NAME_DEF_STMT (vec_def), 0);
8185 return true;
8188 /* Function vectorizable_live_operation.
8190 STMT computes a value that is used outside the loop. Check if
8191 it can be supported. */
8193 bool
8194 vectorizable_live_operation (gimple *stmt,
8195 gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
8196 slp_tree slp_node, int slp_index,
8197 gimple **vec_stmt)
8199 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
8200 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
8201 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8202 imm_use_iterator imm_iter;
8203 tree lhs, lhs_type, bitsize, vec_bitsize;
8204 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
8205 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
8206 int ncopies;
8207 gimple *use_stmt;
8208 auto_vec<tree> vec_oprnds;
8209 int vec_entry = 0;
8210 poly_uint64 vec_index = 0;
8212 gcc_assert (STMT_VINFO_LIVE_P (stmt_info));
8214 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def)
8215 return false;
8217 /* FORNOW. CHECKME. */
8218 if (nested_in_vect_loop_p (loop, stmt))
8219 return false;
8221 /* If STMT is not relevant and it is a simple assignment and its inputs are
8222 invariant then it can remain in place, unvectorized. The original last
8223 scalar value that it computes will be used. */
8224 if (!STMT_VINFO_RELEVANT_P (stmt_info))
8226 gcc_assert (is_simple_and_all_uses_invariant (stmt, loop_vinfo));
8227 if (dump_enabled_p ())
8228 dump_printf_loc (MSG_NOTE, vect_location,
8229 "statement is simple and uses invariant. Leaving in "
8230 "place.\n");
8231 return true;
8234 if (slp_node)
8235 ncopies = 1;
8236 else
8237 ncopies = vect_get_num_copies (loop_vinfo, vectype);
8239 if (slp_node)
8241 gcc_assert (slp_index >= 0);
8243 int num_scalar = SLP_TREE_SCALAR_STMTS (slp_node).length ();
8244 int num_vec = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
8246 /* Get the last occurrence of the scalar index from the concatenation of
8247 all the slp vectors. Calculate which slp vector it is and the index
8248 within. */
8249 poly_uint64 pos = (num_vec * nunits) - num_scalar + slp_index;
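      /* A worked example with made-up numbers: num_vec = 3, nunits = 4 and
	 num_scalar = 6 give 12 lanes holding two copies of the 6 scalars;
	 for slp_index = 2 the last occurrence sits at pos = 12 - 6 + 2 = 8,
	 i.e. vector 2, lane 0.  */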
8251 /* Calculate which vector contains the result, and which lane of
8252 that vector we need. */
8253 if (!can_div_trunc_p (pos, nunits, &vec_entry, &vec_index))
8255 if (dump_enabled_p ())
8256 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8257 "Cannot determine which vector holds the"
8258 " final result.\n");
8259 return false;
8263 if (!vec_stmt)
8265 /* No transformation required. */
8266 if (LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo))
8268 if (!direct_internal_fn_supported_p (IFN_EXTRACT_LAST, vectype,
8269 OPTIMIZE_FOR_SPEED))
8271 if (dump_enabled_p ())
8272 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8273 "can't use a fully-masked loop because "
8274 "the target doesn't support extract last "
8275 "reduction.\n");
8276 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
8278 else if (slp_node)
8280 if (dump_enabled_p ())
8281 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8282 "can't use a fully-masked loop because an "
8283 "SLP statement is live after the loop.\n");
8284 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
8286 else if (ncopies > 1)
8288 if (dump_enabled_p ())
8289 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8290 "can't use a fully-masked loop because"
8291 " ncopies is greater than 1.\n");
8292 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
8294 else
8296 gcc_assert (ncopies == 1 && !slp_node);
8297 vect_record_loop_mask (loop_vinfo,
8298 &LOOP_VINFO_MASKS (loop_vinfo),
8299 1, vectype);
8302 return true;
8305 /* If stmt has a related stmt, then use that for getting the lhs. */
8306 if (is_pattern_stmt_p (stmt_info))
8307 stmt = STMT_VINFO_RELATED_STMT (stmt_info);
8309 lhs = (is_a <gphi *> (stmt)) ? gimple_phi_result (stmt)
8310 : gimple_get_lhs (stmt);
8311 lhs_type = TREE_TYPE (lhs);
8313 bitsize = (VECTOR_BOOLEAN_TYPE_P (vectype)
8314 ? bitsize_int (TYPE_PRECISION (TREE_TYPE (vectype)))
8315 : TYPE_SIZE (TREE_TYPE (vectype)));
8316 vec_bitsize = TYPE_SIZE (vectype);
8318 /* Get the vectorized lhs of STMT and the lane to use (counted in bits). */
8319 tree vec_lhs, bitstart;
8320 if (slp_node)
8322 gcc_assert (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo));
8324 /* Get the correct slp vectorized stmt. */
8325 gimple *vec_stmt = SLP_TREE_VEC_STMTS (slp_node)[vec_entry];
8326 if (gphi *phi = dyn_cast <gphi *> (vec_stmt))
8327 vec_lhs = gimple_phi_result (phi);
8328 else
8329 vec_lhs = gimple_get_lhs (vec_stmt);
8331 /* Get entry to use. */
8332 bitstart = bitsize_int (vec_index);
8333 bitstart = int_const_binop (MULT_EXPR, bitsize, bitstart);
8335 else
8337 enum vect_def_type dt = STMT_VINFO_DEF_TYPE (stmt_info);
8338 vec_lhs = vect_get_vec_def_for_operand_1 (stmt, dt);
8339 gcc_checking_assert (ncopies == 1
8340 || !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo));
8342 /* For multiple copies, get the last copy. */
8343 for (int i = 1; i < ncopies; ++i)
8344 vec_lhs = vect_get_vec_def_for_stmt_copy (vect_unknown_def_type,
8345 vec_lhs);
8347 /* Get the last lane in the vector. */
8348 bitstart = int_const_binop (MINUS_EXPR, vec_bitsize, bitsize);
8351 gimple_seq stmts = NULL;
8352 tree new_tree;
8353 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
8355 /* Emit:
8357 SCALAR_RES = EXTRACT_LAST <VEC_LHS, MASK>
8359 where VEC_LHS is the vectorized live-out result and MASK is
8360 the loop mask for the final iteration. */
8361 gcc_assert (ncopies == 1 && !slp_node);
8362 tree scalar_type = TREE_TYPE (STMT_VINFO_VECTYPE (stmt_info));
8363 tree scalar_res = make_ssa_name (scalar_type);
8364 tree mask = vect_get_loop_mask (gsi, &LOOP_VINFO_MASKS (loop_vinfo),
8365 1, vectype, 0);
8366 gcall *new_stmt = gimple_build_call_internal (IFN_EXTRACT_LAST,
8367 2, mask, vec_lhs);
8368 gimple_call_set_lhs (new_stmt, scalar_res);
8369 gimple_seq_add_stmt (&stmts, new_stmt);
8371 /* Convert the extracted vector element to the required scalar type. */
8372 new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
8374 else
8376 tree bftype = TREE_TYPE (vectype);
8377 if (VECTOR_BOOLEAN_TYPE_P (vectype))
8378 bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1);
8379 new_tree = build3 (BIT_FIELD_REF, bftype, vec_lhs, bitsize, bitstart);
8380 new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree),
8381 &stmts, true, NULL_TREE);
8384 if (stmts)
8385 gsi_insert_seq_on_edge_immediate (single_exit (loop), stmts);
8387 /* Replace use of lhs with newly computed result. If the use stmt is a
8388 single arg PHI, just replace all uses of PHI result. It's necessary
8389 because lcssa PHI defining lhs may be before newly inserted stmt. */
8390 use_operand_p use_p;
8391 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
8392 if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt))
8393 && !is_gimple_debug (use_stmt))
8395 if (gimple_code (use_stmt) == GIMPLE_PHI
8396 && gimple_phi_num_args (use_stmt) == 1)
8398 replace_uses_by (gimple_phi_result (use_stmt), new_tree);
8400 else
8402 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
8403 SET_USE (use_p, new_tree);
8405 update_stmt (use_stmt);
8408 return true;
8411 /* Kill any debug uses outside LOOP of SSA names defined in STMT. */
8413 static void
8414 vect_loop_kill_debug_uses (struct loop *loop, gimple *stmt)
8416 ssa_op_iter op_iter;
8417 imm_use_iterator imm_iter;
8418 def_operand_p def_p;
8419 gimple *ustmt;
8421 FOR_EACH_PHI_OR_STMT_DEF (def_p, stmt, op_iter, SSA_OP_DEF)
8423 FOR_EACH_IMM_USE_STMT (ustmt, imm_iter, DEF_FROM_PTR (def_p))
8425 basic_block bb;
8427 if (!is_gimple_debug (ustmt))
8428 continue;
8430 bb = gimple_bb (ustmt);
8432 if (!flow_bb_inside_loop_p (loop, bb))
8434 if (gimple_debug_bind_p (ustmt))
8436 if (dump_enabled_p ())
8437 dump_printf_loc (MSG_NOTE, vect_location,
8438 "killing debug use\n");
8440 gimple_debug_bind_reset_value (ustmt);
8441 update_stmt (ustmt);
8443 else
8444 gcc_unreachable ();
8450 /* Given loop represented by LOOP_VINFO, return true if computation of
8451 LOOP_VINFO_NITERS (= LOOP_VINFO_NITERSM1 + 1) doesn't overflow, false
8452 otherwise. */
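/* E.g. with a 32-bit niters type, a loop whose body runs exactly
   UINT_MAX + 1 times has NITERSM1 = UINT_MAX while NITERS wraps around
   to 0; that is the kind of overflow being checked for here.  */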
8454 static bool
8455 loop_niters_no_overflow (loop_vec_info loop_vinfo)
8457 /* Constant case. */
8458 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
8460 tree cst_niters = LOOP_VINFO_NITERS (loop_vinfo);
8461 tree cst_nitersm1 = LOOP_VINFO_NITERSM1 (loop_vinfo);
8463 gcc_assert (TREE_CODE (cst_niters) == INTEGER_CST);
8464 gcc_assert (TREE_CODE (cst_nitersm1) == INTEGER_CST);
8465 if (wi::to_widest (cst_nitersm1) < wi::to_widest (cst_niters))
8466 return true;
8469 widest_int max;
8470 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8471 /* Check the upper bound of loop niters. */
8472 if (get_max_loop_iterations (loop, &max))
8474 tree type = TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo));
8475 signop sgn = TYPE_SIGN (type);
8476 widest_int type_max = widest_int::from (wi::max_value (type), sgn);
8477 if (max < type_max)
8478 return true;
8480 return false;
8483 /* Return a mask type with half the number of elements as TYPE. */
8485 tree
8486 vect_halve_mask_nunits (tree type)
8488 poly_uint64 nunits = exact_div (TYPE_VECTOR_SUBPARTS (type), 2);
8489 return build_truth_vector_type (nunits, current_vector_size);
8492 /* Return a mask type with twice as many elements as TYPE. */
8494 tree
8495 vect_double_mask_nunits (tree type)
8497 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (type) * 2;
8498 return build_truth_vector_type (nunits, current_vector_size);
8501 /* Record that a fully-masked version of LOOP_VINFO would need MASKS to
8502 contain a sequence of NVECTORS masks that each control a vector of type
8503 VECTYPE. */
8505 void
8506 vect_record_loop_mask (loop_vec_info loop_vinfo, vec_loop_masks *masks,
8507 unsigned int nvectors, tree vectype)
8509 gcc_assert (nvectors != 0);
8510 if (masks->length () < nvectors)
8511 masks->safe_grow_cleared (nvectors);
8512 rgroup_masks *rgm = &(*masks)[nvectors - 1];
8513 /* The number of scalars per iteration and the number of vectors are
8514 both compile-time constants. */
8515 unsigned int nscalars_per_iter
8516 = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
8517 LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
8518 if (rgm->max_nscalars_per_iter < nscalars_per_iter)
8520 rgm->max_nscalars_per_iter = nscalars_per_iter;
8521 rgm->mask_type = build_same_sized_truth_vector_type (vectype);
8525 /* Given a complete set of masks MASKS, extract mask number INDEX
8526 for an rgroup that operates on NVECTORS vectors of type VECTYPE,
8527 where 0 <= INDEX < NVECTORS. Insert any set-up statements before GSI.
8529 See the comment above vec_loop_masks for more details about the mask
8530 arrangement. */
8532 tree
8533 vect_get_loop_mask (gimple_stmt_iterator *gsi, vec_loop_masks *masks,
8534 unsigned int nvectors, tree vectype, unsigned int index)
8536 rgroup_masks *rgm = &(*masks)[nvectors - 1];
8537 tree mask_type = rgm->mask_type;
8539 /* Populate the rgroup's mask array, if this is the first time we've
8540 used it. */
8541 if (rgm->masks.is_empty ())
8543 rgm->masks.safe_grow_cleared (nvectors);
8544 for (unsigned int i = 0; i < nvectors; ++i)
8546 tree mask = make_temp_ssa_name (mask_type, NULL, "loop_mask");
8547 /* Provide a dummy definition until the real one is available. */
8548 SSA_NAME_DEF_STMT (mask) = gimple_build_nop ();
8549 rgm->masks[i] = mask;
8553 tree mask = rgm->masks[index];
8554 if (maybe_ne (TYPE_VECTOR_SUBPARTS (mask_type),
8555 TYPE_VECTOR_SUBPARTS (vectype)))
8557 /* A loop mask for data type X can be reused for data type Y
8558 if X has N times more elements than Y and if Y's elements
8559 are N times bigger than X's. In this case each sequence
8560 of N elements in the loop mask will be all-zero or all-one.
8561 We can then view-convert the mask so that each sequence of
8562 N elements is replaced by a single element. */
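      /* For instance (numbers for illustration only): a 16-element mask
	 built for 16 chars can control 8 shorts, because each adjacent
	 pair of mask elements is then known to be equal and collapses to
	 a single element of the 8-element mask.  */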
8563 gcc_assert (multiple_p (TYPE_VECTOR_SUBPARTS (mask_type),
8564 TYPE_VECTOR_SUBPARTS (vectype)));
8565 gimple_seq seq = NULL;
8566 mask_type = build_same_sized_truth_vector_type (vectype);
8567 mask = gimple_build (&seq, VIEW_CONVERT_EXPR, mask_type, mask);
8568 if (seq)
8569 gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
8571 return mask;
8574 /* Scale profiling counters by estimation for LOOP which is vectorized
8575 by factor VF. */
8577 static void
8578 scale_profile_for_vect_loop (struct loop *loop, unsigned vf)
8580 edge preheader = loop_preheader_edge (loop);
8581 /* Reduce loop iterations by the vectorization factor. */
8582 gcov_type new_est_niter = niter_for_unrolled_loop (loop, vf);
8583 profile_count freq_h = loop->header->count, freq_e = preheader->count ();
8585 if (freq_h.nonzero_p ())
8587 profile_probability p;
8589 /* Avoid dropping loop body profile counter to 0 because of zero count
8590 in loop's preheader. */
8591 if (!(freq_e == profile_count::zero ()))
8592 freq_e = freq_e.force_nonzero ();
8593 p = freq_e.apply_scale (new_est_niter + 1, 1).probability_in (freq_h);
8594 scale_loop_frequencies (loop, p);
8597 edge exit_e = single_exit (loop);
8598 exit_e->probability = profile_probability::always ()
8599 .apply_scale (1, new_est_niter + 1);
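  /* E.g. if the vectorized loop is expected to run new_est_niter = 3
     times, the exit edge is given a probability of roughly 1 in 4
     (an illustrative reading of the scale above).  */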
8601 edge exit_l = single_pred_edge (loop->latch);
8602 profile_probability prob = exit_l->probability;
8603 exit_l->probability = exit_e->probability.invert ();
8604 if (prob.initialized_p () && exit_l->probability.initialized_p ())
8605 scale_bbs_frequencies (&loop->latch, 1, exit_l->probability / prob);
8608 /* Function vect_transform_loop.
8610 The analysis phase has determined that the loop is vectorizable.
8611    Vectorize the loop - create vectorized stmts to replace the scalar
8612    stmts in the loop, and update the loop exit condition.
8613    Returns the scalar epilogue loop, if any.  */
8615 struct loop *
8616 vect_transform_loop (loop_vec_info loop_vinfo)
8618 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8619 struct loop *epilogue = NULL;
8620 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
8621 int nbbs = loop->num_nodes;
8622 int i;
8623 tree niters_vector = NULL_TREE;
8624 tree step_vector = NULL_TREE;
8625 tree niters_vector_mult_vf = NULL_TREE;
8626 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
8627 unsigned int lowest_vf = constant_lower_bound (vf);
8628 bool grouped_store;
8629 bool slp_scheduled = false;
8630 gimple *stmt, *pattern_stmt;
8631 gimple_seq pattern_def_seq = NULL;
8632 gimple_stmt_iterator pattern_def_si = gsi_none ();
8633 bool transform_pattern_stmt = false;
8634 bool check_profitability = false;
8635 unsigned int th;
8637 if (dump_enabled_p ())
8638 dump_printf_loc (MSG_NOTE, vect_location, "=== vec_transform_loop ===\n");
8640 /* Use the more conservative vectorization threshold. If the number
8641      of iterations is constant, assume the cost check has been performed
8642      by our caller.  If the threshold makes all loops profitable that
8643      run at least the (estimated) vectorization factor number of times,
8644      checking is pointless, too.  */
8645 th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
8646 if (th >= vect_vf_for_cost (loop_vinfo)
8647 && !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
8649 if (dump_enabled_p ())
8650 dump_printf_loc (MSG_NOTE, vect_location,
8651 "Profitability threshold is %d loop iterations.\n",
8652 th);
8653 check_profitability = true;
8656 /* Make sure there exists a single-predecessor exit bb. Do this before
8657 versioning. */
8658 edge e = single_exit (loop);
8659 if (! single_pred_p (e->dest))
8661 split_loop_exit_edge (e);
8662 if (dump_enabled_p ())
8663 dump_printf (MSG_NOTE, "split exit edge\n");
8666 /* Version the loop first, if required, so the profitability check
8667 comes first. */
8669 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
8671 poly_uint64 versioning_threshold
8672 = LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo);
8673 if (check_profitability
8674 && ordered_p (poly_uint64 (th), versioning_threshold))
8676 versioning_threshold = ordered_max (poly_uint64 (th),
8677 versioning_threshold);
8678 check_profitability = false;
8680 vect_loop_versioning (loop_vinfo, th, check_profitability,
8681 versioning_threshold);
8682 check_profitability = false;
8685 /* Make sure there exists a single-predecessor exit bb also on the
8686 scalar loop copy. Do this after versioning but before peeling
8687 so CFG structure is fine for both scalar and if-converted loop
8688 to make slpeel_duplicate_current_defs_from_edges face matched
8689 loop closed PHI nodes on the exit. */
8690 if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))
8692 e = single_exit (LOOP_VINFO_SCALAR_LOOP (loop_vinfo));
8693 if (! single_pred_p (e->dest))
8695 split_loop_exit_edge (e);
8696 if (dump_enabled_p ())
8697 dump_printf (MSG_NOTE, "split exit edge of scalar loop\n");
8701 tree niters = vect_build_loop_niters (loop_vinfo);
8702 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = niters;
8703 tree nitersm1 = unshare_expr (LOOP_VINFO_NITERSM1 (loop_vinfo));
8704 bool niters_no_overflow = loop_niters_no_overflow (loop_vinfo);
8705 epilogue = vect_do_peeling (loop_vinfo, niters, nitersm1, &niters_vector,
8706 &step_vector, &niters_vector_mult_vf, th,
8707 check_profitability, niters_no_overflow);
8709 if (niters_vector == NULL_TREE)
8711 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
8712 && !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
8713 && known_eq (lowest_vf, vf))
8715 niters_vector
8716 = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
8717 LOOP_VINFO_INT_NITERS (loop_vinfo) / lowest_vf);
8718 step_vector = build_one_cst (TREE_TYPE (niters));
8720 else
8721 vect_gen_vector_loop_niters (loop_vinfo, niters, &niters_vector,
8722 &step_vector, niters_no_overflow);
8725 /* 1) Make sure the loop header has exactly two entries
8726 2) Make sure we have a preheader basic block. */
8728 gcc_assert (EDGE_COUNT (loop->header->preds) == 2);
8730 split_edge (loop_preheader_edge (loop));
8732 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
8733 && vect_use_loop_mask_for_alignment_p (loop_vinfo))
8734 /* This will deal with any possible peeling. */
8735 vect_prepare_for_masked_peels (loop_vinfo);
8737   /* FORNOW: the vectorizer supports only loops whose body consists
8738      of one basic block (header + empty latch).  When the vectorizer
8739      supports more involved loop forms, the order in which the BBs are
8740      traversed will need to be reconsidered.  */
8742 for (i = 0; i < nbbs; i++)
8744 basic_block bb = bbs[i];
8745 stmt_vec_info stmt_info;
8747 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
8748 gsi_next (&si))
8750 gphi *phi = si.phi ();
8751 if (dump_enabled_p ())
8753 dump_printf_loc (MSG_NOTE, vect_location,
8754 "------>vectorizing phi: ");
8755 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
8757 stmt_info = vinfo_for_stmt (phi);
8758 if (!stmt_info)
8759 continue;
8761 if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
8762 vect_loop_kill_debug_uses (loop, phi);
8764 if (!STMT_VINFO_RELEVANT_P (stmt_info)
8765 && !STMT_VINFO_LIVE_P (stmt_info))
8766 continue;
8768 if (STMT_VINFO_VECTYPE (stmt_info)
8769 && (maybe_ne
8770 (TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info)), vf))
8771 && dump_enabled_p ())
8772 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
8774 if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
8775 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
8776 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
8777 && ! PURE_SLP_STMT (stmt_info))
8779 if (dump_enabled_p ())
8780 dump_printf_loc (MSG_NOTE, vect_location, "transform phi.\n");
8781 vect_transform_stmt (phi, NULL, NULL, NULL, NULL);
8785 pattern_stmt = NULL;
8786 for (gimple_stmt_iterator si = gsi_start_bb (bb);
8787 !gsi_end_p (si) || transform_pattern_stmt;)
8789 bool is_store;
8791 if (transform_pattern_stmt)
8792 stmt = pattern_stmt;
8793 else
8795 stmt = gsi_stmt (si);
8796 /* During vectorization remove existing clobber stmts. */
8797 if (gimple_clobber_p (stmt))
8799 unlink_stmt_vdef (stmt);
8800 gsi_remove (&si, true);
8801 release_defs (stmt);
8802 continue;
8806 if (dump_enabled_p ())
8808 dump_printf_loc (MSG_NOTE, vect_location,
8809 "------>vectorizing statement: ");
8810 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
8813 stmt_info = vinfo_for_stmt (stmt);
8815 /* vector stmts created in the outer-loop during vectorization of
8816 stmts in an inner-loop may not have a stmt_info, and do not
8817 need to be vectorized. */
8818 if (!stmt_info)
8820 gsi_next (&si);
8821 continue;
8824 if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
8825 vect_loop_kill_debug_uses (loop, stmt);
8827 if (!STMT_VINFO_RELEVANT_P (stmt_info)
8828 && !STMT_VINFO_LIVE_P (stmt_info))
8830 if (STMT_VINFO_IN_PATTERN_P (stmt_info)
8831 && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
8832 && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
8833 || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
8835 stmt = pattern_stmt;
8836 stmt_info = vinfo_for_stmt (stmt);
8838 else
8840 gsi_next (&si);
8841 continue;
8844 else if (STMT_VINFO_IN_PATTERN_P (stmt_info)
8845 && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
8846 && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
8847 || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
8848 transform_pattern_stmt = true;
8850 /* If the pattern statement has def stmts, vectorize them too. */
8851 if (is_pattern_stmt_p (stmt_info))
8853 if (pattern_def_seq == NULL)
8855 pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
8856 pattern_def_si = gsi_start (pattern_def_seq);
8858 else if (!gsi_end_p (pattern_def_si))
8859 gsi_next (&pattern_def_si);
8860 if (pattern_def_seq != NULL)
8862 gimple *pattern_def_stmt = NULL;
8863 stmt_vec_info pattern_def_stmt_info = NULL;
8865 while (!gsi_end_p (pattern_def_si))
8867 pattern_def_stmt = gsi_stmt (pattern_def_si);
8868 pattern_def_stmt_info
8869 = vinfo_for_stmt (pattern_def_stmt);
8870 if (STMT_VINFO_RELEVANT_P (pattern_def_stmt_info)
8871 || STMT_VINFO_LIVE_P (pattern_def_stmt_info))
8872 break;
8873 gsi_next (&pattern_def_si);
8876 if (!gsi_end_p (pattern_def_si))
8878 if (dump_enabled_p ())
8880 dump_printf_loc (MSG_NOTE, vect_location,
8881 "==> vectorizing pattern def "
8882 "stmt: ");
8883 dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
8884 pattern_def_stmt, 0);
8887 stmt = pattern_def_stmt;
8888 stmt_info = pattern_def_stmt_info;
8890 else
8892 pattern_def_si = gsi_none ();
8893 transform_pattern_stmt = false;
8896 else
8897 transform_pattern_stmt = false;
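/* For example, a statement recognized by the pattern matcher (such as a
   widening multiply) may carry a PATTERN_DEF_SEQ with auxiliary statements,
   e.g. a conversion of one operand, that have to be vectorized before the
   main pattern statement; the code above walks that sequence one relevant
   statement at a time before finally transforming the pattern statement
   itself.  */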
8900 if (STMT_VINFO_VECTYPE (stmt_info))
8902 poly_uint64 nunits
8903 = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
8904 if (!STMT_SLP_TYPE (stmt_info)
8905 && maybe_ne (nunits, vf)
8906 && dump_enabled_p ())
8907 /* For SLP, VF is set according to the unrolling factor rather than
8908 to the vector size, hence for SLP this diagnostic does not apply. */
8909 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
8912 /* SLP. Schedule all the SLP instances when the first SLP stmt is
8913 reached. */
8914 if (STMT_SLP_TYPE (stmt_info))
8916 if (!slp_scheduled)
8918 slp_scheduled = true;
8920 if (dump_enabled_p ())
8921 dump_printf_loc (MSG_NOTE, vect_location,
8922 "=== scheduling SLP instances ===\n");
8924 vect_schedule_slp (loop_vinfo);
8927 /* Hybrid SLP stmts must be vectorized in addition to SLP. */
8928 if (!vinfo_for_stmt (stmt) || PURE_SLP_STMT (stmt_info))
8930 if (!transform_pattern_stmt && gsi_end_p (pattern_def_si))
8932 pattern_def_seq = NULL;
8933 gsi_next (&si);
8935 continue;
8939 /* -------- vectorize statement ------------ */
8940 if (dump_enabled_p ())
8941 dump_printf_loc (MSG_NOTE, vect_location, "transform statement.\n");
8943 grouped_store = false;
8944 is_store = vect_transform_stmt (stmt, &si, &grouped_store, NULL, NULL);
8945 if (is_store)
8947 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
8949 /* Interleaving. If IS_STORE is TRUE, the vectorization of the
8950 interleaving chain has been completed; free all the stores in
8951 the chain. */
8952 gsi_next (&si);
8953 vect_remove_stores (GROUP_FIRST_ELEMENT (stmt_info));
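/* E.g. for an interleaved pair a[2*i] = x; a[2*i+1] = y; the vector code for
   the whole group is emitted when the last scalar store of the chain is
   reached, so at that point every scalar store in the chain can be removed
   in one go (names are illustrative only).  */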
8955 else
8957 /* Free the attached stmt_vec_info and remove the stmt. */
8958 gimple *store = gsi_stmt (si);
8959 free_stmt_vec_info (store);
8960 unlink_stmt_vdef (store);
8961 gsi_remove (&si, true);
8962 release_defs (store);
8965 /* Stores can only appear at the end of pattern statements. */
8966 gcc_assert (!transform_pattern_stmt);
8967 pattern_def_seq = NULL;
8969 else if (!transform_pattern_stmt && gsi_end_p (pattern_def_si))
8971 pattern_def_seq = NULL;
8972 gsi_next (&si);
8974 } /* stmts in BB */
8976 /* Stub out scalar statements that must not survive vectorization.
8977 Doing this here helps with grouped statements, or statements that
8978 are involved in patterns. */
8979 for (gimple_stmt_iterator gsi = gsi_start_bb (bb);
8980 !gsi_end_p (gsi); gsi_next (&gsi))
8982 gcall *call = dyn_cast <gcall *> (gsi_stmt (gsi));
8983 if (call && gimple_call_internal_p (call, IFN_MASK_LOAD))
8985 tree lhs = gimple_get_lhs (call);
8986 if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
8988 tree zero = build_zero_cst (TREE_TYPE (lhs));
8989 gimple *new_stmt = gimple_build_assign (lhs, zero);
8990 gsi_replace (&gsi, new_stmt, true);
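/* Illustration (SSA names hypothetical): a scalar statement such as
     x_5 = MASK_LOAD (ptr_3, 0B, mask_7);
   whose result is not a vector is replaced by
     x_5 = 0;
   so that no scalar masked load survives vectorization; the dead assignment
   is cleaned up by later passes.  */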
8994 } /* BBs in loop */
8996 /* The vectorization factor is always > 1, so if we use an IV increment of 1,
8997 a zero NITERS becomes a nonzero NITERS_VECTOR. */
8998 if (integer_onep (step_vector))
8999 niters_no_overflow = true;
9000 vect_set_loop_condition (loop, loop_vinfo, niters_vector, step_vector,
9001 niters_vector_mult_vf, !niters_no_overflow);
9003 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
9004 scale_profile_for_vect_loop (loop, assumed_vf);
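/* E.g. a loop body that the profile expected to run roughly 1000 times per
   invocation is now expected to run about 1000 / ASSUMED_VF times, since
   each vector iteration covers ASSUMED_VF scalar iterations (numbers are
   illustrative only).  */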
9006 /* True if the final iteration might not handle a full vector's
9007 worth of scalar iterations. */
9008 bool final_iter_may_be_partial = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
9009 /* The minimum number of iterations performed by the epilogue. This
9010 is 1 when peeling for gaps because we always need a final scalar
9011 iteration. */
9012 int min_epilogue_iters = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0;
9013 /* +1 to convert latch counts to loop iteration counts,
9014 -min_epilogue_iters to remove iterations that cannot be performed
9015 by the vector code. */
9016 int bias_for_lowest = 1 - min_epilogue_iters;
9017 int bias_for_assumed = bias_for_lowest;
9018 int alignment_npeels = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
9019 if (alignment_npeels && LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
9021 /* When the amount of peeling is known at compile time, the first
9022 iteration will have exactly alignment_npeels active elements.
9023 In the worst case it will have at least one. */
9024 int min_first_active = (alignment_npeels > 0 ? alignment_npeels : 1);
9025 bias_for_lowest += lowest_vf - min_first_active;
9026 bias_for_assumed += assumed_vf - min_first_active;
9028 /* In these calculations the "- 1" converts loop iteration counts
9029 back to latch counts. */
9030 if (loop->any_upper_bound)
9031 loop->nb_iterations_upper_bound
9032 = (final_iter_may_be_partial
9033 ? wi::udiv_ceil (loop->nb_iterations_upper_bound + bias_for_lowest,
9034 lowest_vf) - 1
9035 : wi::udiv_floor (loop->nb_iterations_upper_bound + bias_for_lowest,
9036 lowest_vf) - 1);
9037 if (loop->any_likely_upper_bound)
9038 loop->nb_iterations_likely_upper_bound
9039 = (final_iter_may_be_partial
9040 ? wi::udiv_ceil (loop->nb_iterations_likely_upper_bound
9041 + bias_for_lowest, lowest_vf) - 1
9042 : wi::udiv_floor (loop->nb_iterations_likely_upper_bound
9043 + bias_for_lowest, lowest_vf) - 1);
9044 if (loop->any_estimate)
9045 loop->nb_iterations_estimate
9046 = (final_iter_may_be_partial
9047 ? wi::udiv_ceil (loop->nb_iterations_estimate + bias_for_assumed,
9048 assumed_vf) - 1
9049 : wi::udiv_floor (loop->nb_iterations_estimate + bias_for_assumed,
9050 assumed_vf) - 1);
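/* Worked example (numbers hypothetical): with a recorded latch bound of 99
   (at most 100 scalar iterations), VF = 8, no peeling for gaps and no
   alignment peeling, BIAS_FOR_LOWEST is 1 and the vector loop's new latch
   bound is floor ((99 + 1) / 8) - 1 = 11, i.e. at most 12 vector iterations
   with up to 4 scalar iterations left to the epilogue.  A fully-masked loop
   uses the ceiling division instead, ceil ((99 + 1) / 8) - 1 = 12, because
   its final, partial vector iteration runs inside the same loop.  */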
9052 if (dump_enabled_p ())
9054 if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
9056 dump_printf_loc (MSG_NOTE, vect_location,
9057 "LOOP VECTORIZED\n");
9058 if (loop->inner)
9059 dump_printf_loc (MSG_NOTE, vect_location,
9060 "OUTER LOOP VECTORIZED\n");
9061 dump_printf (MSG_NOTE, "\n");
9063 else
9065 dump_printf_loc (MSG_NOTE, vect_location,
9066 "LOOP EPILOGUE VECTORIZED (VS=");
9067 dump_dec (MSG_NOTE, current_vector_size);
9068 dump_printf (MSG_NOTE, ")\n");
9072 /* Free SLP instances here because otherwise stmt reference counting
9073 won't work. */
9074 slp_instance instance;
9075 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
9076 vect_free_slp_instance (instance);
9077 LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
9078 /* Clear the safelen field, since its value is no longer valid after
9079 vectorization: the vectorized loop can have loop-carried dependencies. */
9080 loop->safelen = 0;
9082 /* Don't vectorize the epilogue of an epilogue loop. */
9083 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
9084 epilogue = NULL;
9086 if (!PARAM_VALUE (PARAM_VECT_EPILOGUES_NOMASK))
9087 epilogue = NULL;
9089 if (epilogue)
9091 auto_vector_sizes vector_sizes;
9092 targetm.vectorize.autovectorize_vector_sizes (&vector_sizes);
9093 unsigned int next_size = 0;
9095 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
9096 && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) >= 0
9097 && known_eq (vf, lowest_vf))
9099 unsigned int eiters
9100 = (LOOP_VINFO_INT_NITERS (loop_vinfo)
9101 - LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo));
9102 eiters = eiters % lowest_vf;
9103 epilogue->nb_iterations_upper_bound = eiters - 1;
9105 unsigned int ratio;
9106 while (next_size < vector_sizes.length ()
9107 && !(constant_multiple_p (current_vector_size,
9108 vector_sizes[next_size], &ratio)
9109 && eiters >= lowest_vf / ratio))
9110 next_size += 1;
9112 else
9113 while (next_size < vector_sizes.length ()
9114 && maybe_lt (current_vector_size, vector_sizes[next_size]))
9115 next_size += 1;
9117 if (next_size == vector_sizes.length ())
9118 epilogue = NULL;
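/* Illustration (target sizes hypothetical): with vector sizes {64, 32, 16}
   bytes, CURRENT_VECTOR_SIZE = 64 and VF = 16, NITERS = 1003 and no
   alignment peeling give EITERS = 1003 % 16 = 11.  Re-using 64-byte vectors
   would need at least 16 leftover iterations, so that size is skipped;
   32-byte vectors (ratio 2) need only 16 / 2 = 8, so the epilogue is kept
   for another vectorization attempt at that size.  Had EITERS been 3, no
   candidate size would qualify and EPILOGUE would be cleared.  */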
9121 if (epilogue)
9123 epilogue->force_vectorize = loop->force_vectorize;
9124 epilogue->safelen = loop->safelen;
9125 epilogue->dont_vectorize = false;
9127 /* We may need to if-convert the epilogue in order to vectorize it. */
9128 if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))
9129 tree_if_conversion (epilogue);
9132 return epilogue;
9135 /* The code below performs a simple optimization: it reverts
9136 if-conversion for masked stores, i.e. if the mask of a store is zero,
9137 the store is not performed, nor are the stored-value producers where possible.
9138 For example,
9139 for (i=0; i<n; i++)
9140 if (c[i])
9142 p1[i] += 1;
9143 p2[i] = p3[i] + 2;
9145 this transformation will produce the following semi-hammock:
9147 if (!mask__ifc__42.18_165 == { 0, 0, 0, 0, 0, 0, 0, 0 })
9149 vect__11.19_170 = MASK_LOAD (vectp_p1.20_168, 0B, mask__ifc__42.18_165);
9150 vect__12.22_172 = vect__11.19_170 + vect_cst__171;
9151 MASK_STORE (vectp_p1.23_175, 0B, mask__ifc__42.18_165, vect__12.22_172);
9152 vect__18.25_182 = MASK_LOAD (vectp_p3.26_180, 0B, mask__ifc__42.18_165);
9153 vect__19.28_184 = vect__18.25_182 + vect_cst__183;
9154 MASK_STORE (vectp_p2.29_187, 0B, mask__ifc__42.18_165, vect__19.28_184);
9158 void
9159 optimize_mask_stores (struct loop *loop)
9161 basic_block *bbs = get_loop_body (loop);
9162 unsigned nbbs = loop->num_nodes;
9163 unsigned i;
9164 basic_block bb;
9165 struct loop *bb_loop;
9166 gimple_stmt_iterator gsi;
9167 gimple *stmt;
9168 auto_vec<gimple *> worklist;
9170 vect_location = find_loop_location (loop);
9171 /* Pick up all masked stores in the loop, if any. */
9172 for (i = 0; i < nbbs; i++)
9174 bb = bbs[i];
9175 for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
9176 gsi_next (&gsi))
9178 stmt = gsi_stmt (gsi);
9179 if (gimple_call_internal_p (stmt, IFN_MASK_STORE))
9180 worklist.safe_push (stmt);
9184 free (bbs);
9185 if (worklist.is_empty ())
9186 return;
9188 /* Loop has masked stores. */
9189 while (!worklist.is_empty ())
9191 gimple *last, *last_store;
9192 edge e, efalse;
9193 tree mask;
9194 basic_block store_bb, join_bb;
9195 gimple_stmt_iterator gsi_to;
9196 tree vdef, new_vdef;
9197 gphi *phi;
9198 tree vectype;
9199 tree zero;
9201 last = worklist.pop ();
9202 mask = gimple_call_arg (last, 2);
9203 bb = gimple_bb (last);
9204 /* Create then_bb and an if-then structure in the CFG; then_bb belongs to
9205 the same loop as if_bb. That loop can differ from LOOP when a two-level
9206 loop nest is vectorized and the mask_store belongs to the inner
9207 one. */
9208 e = split_block (bb, last);
9209 bb_loop = bb->loop_father;
9210 gcc_assert (loop == bb_loop || flow_loop_nested_p (loop, bb_loop));
9211 join_bb = e->dest;
9212 store_bb = create_empty_bb (bb);
9213 add_bb_to_loop (store_bb, bb_loop);
9214 e->flags = EDGE_TRUE_VALUE;
9215 efalse = make_edge (bb, store_bb, EDGE_FALSE_VALUE);
9216 /* Make the edge into STORE_BB the cold (unlikely) one. */
9217 efalse->probability = profile_probability::unlikely ();
9218 store_bb->count = efalse->count ();
9219 make_single_succ_edge (store_bb, join_bb, EDGE_FALLTHRU);
9220 if (dom_info_available_p (CDI_DOMINATORS))
9221 set_immediate_dominator (CDI_DOMINATORS, store_bb, bb);
9222 if (dump_enabled_p ())
9223 dump_printf_loc (MSG_NOTE, vect_location,
9224 "Create new block %d to sink mask stores.",
9225 store_bb->index);
9226 /* Create vector comparison with boolean result. */
9227 vectype = TREE_TYPE (mask);
9228 zero = build_zero_cst (vectype);
9229 stmt = gimple_build_cond (EQ_EXPR, mask, zero, NULL_TREE, NULL_TREE);
9230 gsi = gsi_last_bb (bb);
9231 gsi_insert_after (&gsi, stmt, GSI_SAME_STMT);
9232 /* Create a new PHI node for the vdef of the last masked store:
9233 .MEM_2 = VDEF <.MEM_1>
9234 will be converted to
9235 .MEM_3 = VDEF <.MEM_1>
9236 and a new PHI node will be created in the join bb
9237 .MEM_2 = PHI <.MEM_1, .MEM_3>
9239 vdef = gimple_vdef (last);
9240 new_vdef = make_ssa_name (gimple_vop (cfun), last);
9241 gimple_set_vdef (last, new_vdef);
9242 phi = create_phi_node (vdef, join_bb);
9243 add_phi_arg (phi, new_vdef, EDGE_SUCC (store_bb, 0), UNKNOWN_LOCATION);
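/* After the masked stores are sunk below, the region has the following shape
   (SSA names hypothetical):

     bb:
       if (mask_8 == { 0, ... })      <- EDGE_TRUE_VALUE, skip the stores
         goto join_bb;
     store_bb:                        <- reached via EDGE_FALSE_VALUE
       MASK_STORE (ptr_1, 0B, mask_8, vect_4);   .MEM_3 = VDEF <.MEM_1>
     join_bb:
       .MEM_2 = PHI <.MEM_1 (bb), .MEM_3 (store_bb)>
       ...                            <- code that followed the store  */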
9245 /* Sink all masked stores with the same mask into STORE_BB where possible. */
9246 while (true)
9248 gimple_stmt_iterator gsi_from;
9249 gimple *stmt1 = NULL;
9251 /* Move masked store to STORE_BB. */
9252 last_store = last;
9253 gsi = gsi_for_stmt (last);
9254 gsi_from = gsi;
9255 /* Shift GSI to the previous stmt for further traversal. */
9256 gsi_prev (&gsi);
9257 gsi_to = gsi_start_bb (store_bb);
9258 gsi_move_before (&gsi_from, &gsi_to);
9259 /* Set GSI_TO to the start of the now non-empty block. */
9260 gsi_to = gsi_start_bb (store_bb);
9261 if (dump_enabled_p ())
9263 dump_printf_loc (MSG_NOTE, vect_location,
9264 "Move stmt to created bb\n");
9265 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, last, 0);
9267 /* Move all stored value producers if possible. */
9268 while (!gsi_end_p (gsi))
9270 tree lhs;
9271 imm_use_iterator imm_iter;
9272 use_operand_p use_p;
9273 bool res;
9275 /* Skip debug statements. */
9276 if (is_gimple_debug (gsi_stmt (gsi)))
9278 gsi_prev (&gsi);
9279 continue;
9281 stmt1 = gsi_stmt (gsi);
9282 /* Do not consider statements writing to memory or having a
9283 volatile operand. */
9284 if (gimple_vdef (stmt1)
9285 || gimple_has_volatile_ops (stmt1))
9286 break;
9287 gsi_from = gsi;
9288 gsi_prev (&gsi);
9289 lhs = gimple_get_lhs (stmt1);
9290 if (!lhs)
9291 break;
9293 /* The LHS of a vectorized stmt must be an SSA_NAME. */
9294 if (TREE_CODE (lhs) != SSA_NAME)
9295 break;
9297 if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
9299 /* Remove dead scalar statement. */
9300 if (has_zero_uses (lhs))
9302 gsi_remove (&gsi_from, true);
9303 continue;
9307 /* Check that LHS does not have uses outside of STORE_BB. */
9308 res = true;
9309 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
9311 gimple *use_stmt;
9312 use_stmt = USE_STMT (use_p);
9313 if (is_gimple_debug (use_stmt))
9314 continue;
9315 if (gimple_bb (use_stmt) != store_bb)
9317 res = false;
9318 break;
9321 if (!res)
9322 break;
9324 if (gimple_vuse (stmt1)
9325 && gimple_vuse (stmt1) != gimple_vuse (last_store))
9326 break;
9328 /* Can move STMT1 to STORE_BB. */
9329 if (dump_enabled_p ())
9331 dump_printf_loc (MSG_NOTE, vect_location,
9332 "Move stmt to created bb\n");
9333 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt1, 0);
9335 gsi_move_before (&gsi_from, &gsi_to);
9336 /* Shift GSI_TO for further insertion. */
9337 gsi_prev (&gsi_to);
9339 /* Move any other masked stores with the same mask into STORE_BB. */
9340 if (worklist.is_empty ()
9341 || gimple_call_arg (worklist.last (), 2) != mask
9342 || worklist.last () != stmt1)
9343 break;
9344 last = worklist.pop ();
9346 add_phi_arg (phi, gimple_vuse (last_store), e, UNKNOWN_LOCATION);