gcc/tree-vect-loop.c
1 /* Loop Vectorization
2 Copyright (C) 2003-2018 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com> and
4 Ira Rosen <irar@il.ibm.com>
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "rtl.h"
28 #include "tree.h"
29 #include "gimple.h"
30 #include "cfghooks.h"
31 #include "tree-pass.h"
32 #include "ssa.h"
33 #include "optabs-tree.h"
34 #include "diagnostic-core.h"
35 #include "fold-const.h"
36 #include "stor-layout.h"
37 #include "cfganal.h"
38 #include "gimplify.h"
39 #include "gimple-iterator.h"
40 #include "gimplify-me.h"
41 #include "tree-ssa-loop-ivopts.h"
42 #include "tree-ssa-loop-manip.h"
43 #include "tree-ssa-loop-niter.h"
44 #include "tree-ssa-loop.h"
45 #include "cfgloop.h"
46 #include "params.h"
47 #include "tree-scalar-evolution.h"
48 #include "tree-vectorizer.h"
49 #include "gimple-fold.h"
50 #include "cgraph.h"
51 #include "tree-cfg.h"
52 #include "tree-if-conv.h"
53 #include "internal-fn.h"
54 #include "tree-vector-builder.h"
55 #include "vec-perm-indices.h"
56 #include "tree-eh.h"
58 /* Loop Vectorization Pass.
60 This pass tries to vectorize loops.
62 For example, the vectorizer transforms the following simple loop:
64 short a[N]; short b[N]; short c[N]; int i;
66 for (i=0; i<N; i++){
67 a[i] = b[i] + c[i];
70 as if it had been manually vectorized by rewriting the source code into:
72 typedef int __attribute__((mode(V8HI))) v8hi;
73 short a[N]; short b[N]; short c[N]; int i;
74 v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
75 v8hi va, vb, vc;
77 for (i=0; i<N/8; i++){
78 vb = pb[i];
79 vc = pc[i];
80 va = vb + vc;
81 pa[i] = va;
84 The main entry to this pass is vectorize_loops(), in which
85 the vectorizer applies a set of analyses on a given set of loops,
86 followed by the actual vectorization transformation for the loops that
87 had successfully passed the analysis phase.
88 Throughout this pass we make a distinction between two types of
89 data: scalars (which are represented by SSA_NAMES), and memory references
90 ("data-refs"). These two types of data require different handling both
91 during analysis and transformation. The types of data-refs that the
92 vectorizer currently supports are ARRAY_REFS whose base is an array DECL
93 (not a pointer), and INDIRECT_REFS through pointers; both array and pointer
94 accesses are required to have a simple (consecutive) access pattern.
96 Analysis phase:
97 ===============
98 The driver for the analysis phase is vect_analyze_loop().
99 It applies a set of analyses, some of which rely on the scalar evolution
100 analyzer (scev) developed by Sebastian Pop.
102 During the analysis phase the vectorizer records some information
103 per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
104 loop, as well as general information about the loop as a whole, which is
105 recorded in a "loop_vec_info" struct attached to each loop.
107 Transformation phase:
108 =====================
109 The loop transformation phase scans all the stmts in the loop, and
110 creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
111 the loop that needs to be vectorized. It inserts the vector code sequence
112 just before the scalar stmt S, and records a pointer to the vector code
113 in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
114 attached to S). This pointer will be used for the vectorization of following
115 stmts which use the def of stmt S. Stmt S is removed if it writes to memory;
116 otherwise, we rely on dead code elimination for removing it.
118 For example, say stmt S1 was vectorized into stmt VS1:
120 VS1: vb = px[i];
121 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
122 S2: a = b;
124 To vectorize stmt S2, the vectorizer first finds the stmt that defines
125 the operand 'b' (S1), and gets the relevant vector def 'vb' from the
126 vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)). The
127 resulting sequence would be:
129 VS1: vb = px[i];
130 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
131 VS2: va = vb;
132 S2: a = b; STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2
134 Operands that are not SSA_NAMEs are data-refs that appear in
135 load/store operations (like 'x[i]' in S1), and are handled differently.
137 Target modeling:
138 =================
139 Currently the only target specific information that is used is the
140 size of the vector (in bytes) - "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".
141 Targets that can support different vector sizes will, for now, need
142 to specify one value for "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD". More
143 flexibility will be added in the future.
145 Since we only vectorize operations whose vector form can be
146 expressed using existing tree codes, to verify that an operation is
147 supported, the vectorizer checks the relevant optab at the relevant
148 machine_mode (e.g., optab_handler (add_optab, V8HImode)). If
149 the value found is CODE_FOR_nothing, then there's no target support, and
150 we can't vectorize the stmt.
152 For additional information on this project see:
153 http://gcc.gnu.org/projects/tree-ssa/vectorization.html
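
/* Hedged sketch, not part of the original file: a minimal illustration
   of the optab query described above.  The helper name
   example_target_supports_vector_add is hypothetical; optab_handler,
   add_optab and CODE_FOR_nothing are the interfaces the comment refers
   to.  */

static bool
example_target_supports_vector_add (machine_mode vector_mode)
{
  /* CODE_FOR_nothing means the target has no instruction pattern for an
     addition in this vector mode (e.g. V8HImode), so such a stmt cannot
     be vectorized.  */
  return optab_handler (add_optab, vector_mode) != CODE_FOR_nothing;
}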
156 static void vect_estimate_min_profitable_iters (loop_vec_info, int *, int *);
158 /* Function vect_determine_vectorization_factor
160 Determine the vectorization factor (VF). VF is the number of data elements
161 that are operated upon in parallel in a single iteration of the vectorized
162 loop. For example, when vectorizing a loop that operates on 4-byte elements,
163 on a target with a vector size (VS) of 16 bytes, the VF is set to 4, since 4
164 elements can fit in a single vector register.
166 We currently support vectorization of loops in which all types operated upon
167 are of the same size. Therefore this function currently sets VF according to
168 the size of the types operated upon, and fails if there are multiple sizes
169 in the loop.
171 VF is also the factor by which the loop iterations are strip-mined, e.g.:
172 original loop:
173 for (i=0; i<N; i++){
174 a[i] = b[i] + c[i];
177 vectorized loop:
178 for (i=0; i<N; i+=VF){
179 a[i:VF] = b[i:VF] + c[i:VF];
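
/* Hedged worked example, not from the original sources: for the
   4-byte-element, 16-byte-vector case described above this hypothetical
   helper returns 16 / 4 = 4, and for 2-byte shorts in the same vector it
   returns 8.  */

static unsigned int
example_vectorization_factor (unsigned int vector_size_in_bytes,
			      unsigned int scalar_size_in_bytes)
{
  /* VF is simply how many scalar elements fit in one vector register.  */
  return vector_size_in_bytes / scalar_size_in_bytes;
}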
183 static bool
184 vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
186 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
187 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
188 unsigned nbbs = loop->num_nodes;
189 poly_uint64 vectorization_factor = 1;
190 tree scalar_type = NULL_TREE;
191 gphi *phi;
192 tree vectype;
193 stmt_vec_info stmt_info;
194 unsigned i;
195 HOST_WIDE_INT dummy;
196 gimple *stmt, *pattern_stmt = NULL;
197 gimple_seq pattern_def_seq = NULL;
198 gimple_stmt_iterator pattern_def_si = gsi_none ();
199 bool analyze_pattern_stmt = false;
200 bool bool_result;
201 auto_vec<stmt_vec_info> mask_producers;
203 if (dump_enabled_p ())
204 dump_printf_loc (MSG_NOTE, vect_location,
205 "=== vect_determine_vectorization_factor ===\n");
207 for (i = 0; i < nbbs; i++)
209 basic_block bb = bbs[i];
211 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
212 gsi_next (&si))
214 phi = si.phi ();
215 stmt_info = vinfo_for_stmt (phi);
216 if (dump_enabled_p ())
218 dump_printf_loc (MSG_NOTE, vect_location, "==> examining phi: ");
219 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
222 gcc_assert (stmt_info);
224 if (STMT_VINFO_RELEVANT_P (stmt_info)
225 || STMT_VINFO_LIVE_P (stmt_info))
227 gcc_assert (!STMT_VINFO_VECTYPE (stmt_info));
228 scalar_type = TREE_TYPE (PHI_RESULT (phi));
230 if (dump_enabled_p ())
232 dump_printf_loc (MSG_NOTE, vect_location,
233 "get vectype for scalar type: ");
234 dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
235 dump_printf (MSG_NOTE, "\n");
238 vectype = get_vectype_for_scalar_type (scalar_type);
239 if (!vectype)
241 if (dump_enabled_p ())
243 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
244 "not vectorized: unsupported "
245 "data-type ");
246 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
247 scalar_type);
248 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
250 return false;
252 STMT_VINFO_VECTYPE (stmt_info) = vectype;
254 if (dump_enabled_p ())
256 dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
257 dump_generic_expr (MSG_NOTE, TDF_SLIM, vectype);
258 dump_printf (MSG_NOTE, "\n");
261 if (dump_enabled_p ())
263 dump_printf_loc (MSG_NOTE, vect_location, "nunits = ");
264 dump_dec (MSG_NOTE, TYPE_VECTOR_SUBPARTS (vectype));
265 dump_printf (MSG_NOTE, "\n");
268 vect_update_max_nunits (&vectorization_factor, vectype);
272 for (gimple_stmt_iterator si = gsi_start_bb (bb);
273 !gsi_end_p (si) || analyze_pattern_stmt;)
275 tree vf_vectype;
277 if (analyze_pattern_stmt)
278 stmt = pattern_stmt;
279 else
280 stmt = gsi_stmt (si);
282 stmt_info = vinfo_for_stmt (stmt);
284 if (dump_enabled_p ())
286 dump_printf_loc (MSG_NOTE, vect_location,
287 "==> examining statement: ");
288 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
291 gcc_assert (stmt_info);
293 /* Skip stmts which do not need to be vectorized. */
294 if ((!STMT_VINFO_RELEVANT_P (stmt_info)
295 && !STMT_VINFO_LIVE_P (stmt_info))
296 || gimple_clobber_p (stmt))
298 if (STMT_VINFO_IN_PATTERN_P (stmt_info)
299 && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
300 && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
301 || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
303 stmt = pattern_stmt;
304 stmt_info = vinfo_for_stmt (pattern_stmt);
305 if (dump_enabled_p ())
307 dump_printf_loc (MSG_NOTE, vect_location,
308 "==> examining pattern statement: ");
309 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
312 else
314 if (dump_enabled_p ())
315 dump_printf_loc (MSG_NOTE, vect_location, "skip.\n");
316 gsi_next (&si);
317 continue;
320 else if (STMT_VINFO_IN_PATTERN_P (stmt_info)
321 && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
322 && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
323 || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
324 analyze_pattern_stmt = true;
326 /* If a pattern statement has def stmts, analyze them too. */
327 if (is_pattern_stmt_p (stmt_info))
329 if (pattern_def_seq == NULL)
331 pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
332 pattern_def_si = gsi_start (pattern_def_seq);
334 else if (!gsi_end_p (pattern_def_si))
335 gsi_next (&pattern_def_si);
336 if (pattern_def_seq != NULL)
338 gimple *pattern_def_stmt = NULL;
339 stmt_vec_info pattern_def_stmt_info = NULL;
341 while (!gsi_end_p (pattern_def_si))
343 pattern_def_stmt = gsi_stmt (pattern_def_si);
344 pattern_def_stmt_info
345 = vinfo_for_stmt (pattern_def_stmt);
346 if (STMT_VINFO_RELEVANT_P (pattern_def_stmt_info)
347 || STMT_VINFO_LIVE_P (pattern_def_stmt_info))
348 break;
349 gsi_next (&pattern_def_si);
352 if (!gsi_end_p (pattern_def_si))
354 if (dump_enabled_p ())
356 dump_printf_loc (MSG_NOTE, vect_location,
357 "==> examining pattern def stmt: ");
358 dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
359 pattern_def_stmt, 0);
362 stmt = pattern_def_stmt;
363 stmt_info = pattern_def_stmt_info;
365 else
367 pattern_def_si = gsi_none ();
368 analyze_pattern_stmt = false;
371 else
372 analyze_pattern_stmt = false;
375 if (gimple_get_lhs (stmt) == NULL_TREE
376 /* MASK_STORE has no lhs, but is ok. */
377 && (!is_gimple_call (stmt)
378 || !gimple_call_internal_p (stmt)
379 || gimple_call_internal_fn (stmt) != IFN_MASK_STORE))
381 if (is_gimple_call (stmt))
383 /* Ignore calls with no lhs. These must be calls to
384 #pragma omp simd functions, and what vectorization factor
385 it really needs can't be determined until
386 vectorizable_simd_clone_call. */
387 if (!analyze_pattern_stmt && gsi_end_p (pattern_def_si))
389 pattern_def_seq = NULL;
390 gsi_next (&si);
392 continue;
394 if (dump_enabled_p ())
396 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
397 "not vectorized: irregular stmt.");
398 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt,
401 return false;
404 if (VECTOR_MODE_P (TYPE_MODE (gimple_expr_type (stmt))))
406 if (dump_enabled_p ())
408 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
409 "not vectorized: vector stmt in loop:");
410 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
412 return false;
415 bool_result = false;
417 if (STMT_VINFO_VECTYPE (stmt_info))
419 /* The only case in which a vectype has already been set is for stmts
420 that contain a dataref, or for "pattern-stmts" (stmts
421 generated by the vectorizer to represent/replace a certain
422 idiom). */
423 gcc_assert (STMT_VINFO_DATA_REF (stmt_info)
424 || is_pattern_stmt_p (stmt_info)
425 || !gsi_end_p (pattern_def_si));
426 vectype = STMT_VINFO_VECTYPE (stmt_info);
428 else
430 gcc_assert (!STMT_VINFO_DATA_REF (stmt_info));
431 if (gimple_call_internal_p (stmt, IFN_MASK_STORE))
432 scalar_type = TREE_TYPE (gimple_call_arg (stmt, 3));
433 else
434 scalar_type = TREE_TYPE (gimple_get_lhs (stmt));
436 /* Bool ops don't participate in the vectorization factor
437 computation. For comparisons, use the compared types to
438 compute a factor. */
439 if (VECT_SCALAR_BOOLEAN_TYPE_P (scalar_type)
440 && is_gimple_assign (stmt)
441 && gimple_assign_rhs_code (stmt) != COND_EXPR)
443 if (STMT_VINFO_RELEVANT_P (stmt_info)
444 || STMT_VINFO_LIVE_P (stmt_info))
445 mask_producers.safe_push (stmt_info);
446 bool_result = true;
448 if (TREE_CODE_CLASS (gimple_assign_rhs_code (stmt))
449 == tcc_comparison
450 && !VECT_SCALAR_BOOLEAN_TYPE_P
451 (TREE_TYPE (gimple_assign_rhs1 (stmt))))
452 scalar_type = TREE_TYPE (gimple_assign_rhs1 (stmt));
453 else
455 if (!analyze_pattern_stmt && gsi_end_p (pattern_def_si))
457 pattern_def_seq = NULL;
458 gsi_next (&si);
460 continue;
464 if (dump_enabled_p ())
466 dump_printf_loc (MSG_NOTE, vect_location,
467 "get vectype for scalar type: ");
468 dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
469 dump_printf (MSG_NOTE, "\n");
471 vectype = get_vectype_for_scalar_type (scalar_type);
472 if (!vectype)
474 if (dump_enabled_p ())
476 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
477 "not vectorized: unsupported "
478 "data-type ");
479 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
480 scalar_type);
481 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
483 return false;
486 if (!bool_result)
487 STMT_VINFO_VECTYPE (stmt_info) = vectype;
489 if (dump_enabled_p ())
491 dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
492 dump_generic_expr (MSG_NOTE, TDF_SLIM, vectype);
493 dump_printf (MSG_NOTE, "\n");
497 /* Don't try to compute the VF from scalar types if the stmt
498 produces a boolean vector. Use the result vectype instead. */
499 if (VECTOR_BOOLEAN_TYPE_P (vectype))
500 vf_vectype = vectype;
501 else
503 /* The vectorization factor is determined by the smallest
504 scalar type (or the largest vector size, but we only
505 support one vector size per loop). */
506 if (!bool_result)
507 scalar_type = vect_get_smallest_scalar_type (stmt, &dummy,
508 &dummy);
509 if (dump_enabled_p ())
511 dump_printf_loc (MSG_NOTE, vect_location,
512 "get vectype for scalar type: ");
513 dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
514 dump_printf (MSG_NOTE, "\n");
516 vf_vectype = get_vectype_for_scalar_type (scalar_type);
518 if (!vf_vectype)
520 if (dump_enabled_p ())
522 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
523 "not vectorized: unsupported data-type ");
524 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
525 scalar_type);
526 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
528 return false;
531 if (maybe_ne (GET_MODE_SIZE (TYPE_MODE (vectype)),
532 GET_MODE_SIZE (TYPE_MODE (vf_vectype))))
534 if (dump_enabled_p ())
536 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
537 "not vectorized: different sized vector "
538 "types in statement, ");
539 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
540 vectype);
541 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
542 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
543 vf_vectype);
544 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
546 return false;
549 if (dump_enabled_p ())
551 dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
552 dump_generic_expr (MSG_NOTE, TDF_SLIM, vf_vectype);
553 dump_printf (MSG_NOTE, "\n");
556 if (dump_enabled_p ())
558 dump_printf_loc (MSG_NOTE, vect_location, "nunits = ");
559 dump_dec (MSG_NOTE, TYPE_VECTOR_SUBPARTS (vf_vectype));
560 dump_printf (MSG_NOTE, "\n");
563 vect_update_max_nunits (&vectorization_factor, vf_vectype);
565 if (!analyze_pattern_stmt && gsi_end_p (pattern_def_si))
567 pattern_def_seq = NULL;
568 gsi_next (&si);
573 /* TODO: Analyze cost. Decide if worth while to vectorize. */
574 if (dump_enabled_p ())
576 dump_printf_loc (MSG_NOTE, vect_location, "vectorization factor = ");
577 dump_dec (MSG_NOTE, vectorization_factor);
578 dump_printf (MSG_NOTE, "\n");
581 if (known_le (vectorization_factor, 1U))
583 if (dump_enabled_p ())
584 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
585 "not vectorized: unsupported data-type\n");
586 return false;
588 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
590 for (i = 0; i < mask_producers.length (); i++)
592 tree mask_type = NULL;
594 stmt = STMT_VINFO_STMT (mask_producers[i]);
596 if (is_gimple_assign (stmt)
597 && TREE_CODE_CLASS (gimple_assign_rhs_code (stmt)) == tcc_comparison
598 && !VECT_SCALAR_BOOLEAN_TYPE_P
599 (TREE_TYPE (gimple_assign_rhs1 (stmt))))
601 scalar_type = TREE_TYPE (gimple_assign_rhs1 (stmt));
602 mask_type = get_mask_type_for_scalar_type (scalar_type);
604 if (!mask_type)
606 if (dump_enabled_p ())
607 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
608 "not vectorized: unsupported mask\n");
609 return false;
612 else
614 tree rhs;
615 ssa_op_iter iter;
616 gimple *def_stmt;
617 enum vect_def_type dt;
619 FOR_EACH_SSA_TREE_OPERAND (rhs, stmt, iter, SSA_OP_USE)
621 if (!vect_is_simple_use (rhs, mask_producers[i]->vinfo,
622 &def_stmt, &dt, &vectype))
624 if (dump_enabled_p ())
626 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
627 "not vectorized: can't compute mask type "
628 "for statement, ");
629 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt,
632 return false;
635 /* No vectype probably means an external definition.
636 Allow it in case there is another operand from which
637 the mask type can be determined. */
638 if (!vectype)
639 continue;
641 if (!mask_type)
642 mask_type = vectype;
643 else if (maybe_ne (TYPE_VECTOR_SUBPARTS (mask_type),
644 TYPE_VECTOR_SUBPARTS (vectype)))
646 if (dump_enabled_p ())
648 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
649 "not vectorized: different sized masks "
650 "types in statement, ");
651 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
652 mask_type);
653 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
654 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
655 vectype);
656 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
658 return false;
660 else if (VECTOR_BOOLEAN_TYPE_P (mask_type)
661 != VECTOR_BOOLEAN_TYPE_P (vectype))
663 if (dump_enabled_p ())
665 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
666 "not vectorized: mixed mask and "
667 "nonmask vector types in statement, ");
668 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
669 mask_type);
670 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
671 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
672 vectype);
673 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
675 return false;
679 /* We may compare boolean value loaded as vector of integers.
680 Fix mask_type in such case. */
681 if (mask_type
682 && !VECTOR_BOOLEAN_TYPE_P (mask_type)
683 && gimple_code (stmt) == GIMPLE_ASSIGN
684 && TREE_CODE_CLASS (gimple_assign_rhs_code (stmt)) == tcc_comparison)
685 mask_type = build_same_sized_truth_vector_type (mask_type);
688 /* No mask_type should mean loop invariant predicate.
689 This is probably a subject for optimization in
690 if-conversion. */
691 if (!mask_type)
693 if (dump_enabled_p ())
695 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
696 "not vectorized: can't compute mask type "
697 "for statement, ");
698 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt,
701 return false;
704 STMT_VINFO_VECTYPE (mask_producers[i]) = mask_type;
707 return true;
711 /* Function vect_is_simple_iv_evolution.
713 FORNOW: A simple evolution of an induction variable in the loop is
714 considered a polynomial evolution. */
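
/* Hedged illustration (an added example, not original text): for an
   induction such as "for (i = 0; i < n; i++)" the access function of I
   is the chrec {0, +, 1}_LOOP_NB, so the function below succeeds with
   *INIT = 0 and *STEP = 1.  A degree-2 chrec such as
   {0, +, {0, +, 1}_1}_1 has a chrec as its evolution part and is
   therefore rejected as not "simple".  */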
716 static bool
717 vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init,
718 tree * step)
720 tree init_expr;
721 tree step_expr;
722 tree evolution_part = evolution_part_in_loop_num (access_fn, loop_nb);
723 basic_block bb;
725 /* When there is no evolution in this loop, the evolution function
726 is not "simple". */
727 if (evolution_part == NULL_TREE)
728 return false;
730 /* When the evolution is a polynomial of degree >= 2
731 the evolution function is not "simple". */
732 if (tree_is_chrec (evolution_part))
733 return false;
735 step_expr = evolution_part;
736 init_expr = unshare_expr (initial_condition_in_loop_num (access_fn, loop_nb));
738 if (dump_enabled_p ())
740 dump_printf_loc (MSG_NOTE, vect_location, "step: ");
741 dump_generic_expr (MSG_NOTE, TDF_SLIM, step_expr);
742 dump_printf (MSG_NOTE, ", init: ");
743 dump_generic_expr (MSG_NOTE, TDF_SLIM, init_expr);
744 dump_printf (MSG_NOTE, "\n");
747 *init = init_expr;
748 *step = step_expr;
750 if (TREE_CODE (step_expr) != INTEGER_CST
751 && (TREE_CODE (step_expr) != SSA_NAME
752 || ((bb = gimple_bb (SSA_NAME_DEF_STMT (step_expr)))
753 && flow_bb_inside_loop_p (get_loop (cfun, loop_nb), bb))
754 || (!INTEGRAL_TYPE_P (TREE_TYPE (step_expr))
755 && (!SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr))
756 || !flag_associative_math)))
757 && (TREE_CODE (step_expr) != REAL_CST
758 || !flag_associative_math))
760 if (dump_enabled_p ())
761 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
762 "step unknown.\n");
763 return false;
766 return true;
769 /* Function vect_analyze_scalar_cycles_1.
771 Examine the cross iteration def-use cycles of scalar variables
772 in LOOP. LOOP_VINFO represents the loop that is now being
773 considered for vectorization (can be LOOP, or an outer-loop
774 enclosing LOOP). */
776 static void
777 vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, struct loop *loop)
779 basic_block bb = loop->header;
780 tree init, step;
781 auto_vec<gimple *, 64> worklist;
782 gphi_iterator gsi;
783 bool double_reduc;
785 if (dump_enabled_p ())
786 dump_printf_loc (MSG_NOTE, vect_location,
787 "=== vect_analyze_scalar_cycles ===\n");
789 /* First - identify all inductions. Reduction detection assumes that all the
790 inductions have been identified; therefore, this order must not be
791 changed. */
792 for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
794 gphi *phi = gsi.phi ();
795 tree access_fn = NULL;
796 tree def = PHI_RESULT (phi);
797 stmt_vec_info stmt_vinfo = vinfo_for_stmt (phi);
799 if (dump_enabled_p ())
801 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: ");
802 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
805 /* Skip virtual phi's. The data dependences that are associated with
806 virtual defs/uses (i.e., memory accesses) are analyzed elsewhere. */
807 if (virtual_operand_p (def))
808 continue;
810 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_unknown_def_type;
812 /* Analyze the evolution function. */
813 access_fn = analyze_scalar_evolution (loop, def);
814 if (access_fn)
816 STRIP_NOPS (access_fn);
817 if (dump_enabled_p ())
819 dump_printf_loc (MSG_NOTE, vect_location,
820 "Access function of PHI: ");
821 dump_generic_expr (MSG_NOTE, TDF_SLIM, access_fn);
822 dump_printf (MSG_NOTE, "\n");
824 STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
825 = initial_condition_in_loop_num (access_fn, loop->num);
826 STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo)
827 = evolution_part_in_loop_num (access_fn, loop->num);
830 if (!access_fn
831 || !vect_is_simple_iv_evolution (loop->num, access_fn, &init, &step)
832 || (LOOP_VINFO_LOOP (loop_vinfo) != loop
833 && TREE_CODE (step) != INTEGER_CST))
835 worklist.safe_push (phi);
836 continue;
839 gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
840 != NULL_TREE);
841 gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo) != NULL_TREE);
843 if (dump_enabled_p ())
844 dump_printf_loc (MSG_NOTE, vect_location, "Detected induction.\n");
845 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_induction_def;
849 /* Second - identify all reductions and nested cycles. */
850 while (worklist.length () > 0)
852 gimple *phi = worklist.pop ();
853 tree def = PHI_RESULT (phi);
854 stmt_vec_info stmt_vinfo = vinfo_for_stmt (phi);
855 gimple *reduc_stmt;
857 if (dump_enabled_p ())
859 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: ");
860 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
863 gcc_assert (!virtual_operand_p (def)
864 && STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_unknown_def_type);
866 reduc_stmt = vect_force_simple_reduction (loop_vinfo, phi,
867 &double_reduc, false);
868 if (reduc_stmt)
870 if (double_reduc)
872 if (dump_enabled_p ())
873 dump_printf_loc (MSG_NOTE, vect_location,
874 "Detected double reduction.\n");
876 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_double_reduction_def;
877 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
878 vect_double_reduction_def;
880 else
882 if (loop != LOOP_VINFO_LOOP (loop_vinfo))
884 if (dump_enabled_p ())
885 dump_printf_loc (MSG_NOTE, vect_location,
886 "Detected vectorizable nested cycle.\n");
888 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_nested_cycle;
889 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
890 vect_nested_cycle;
892 else
894 if (dump_enabled_p ())
895 dump_printf_loc (MSG_NOTE, vect_location,
896 "Detected reduction.\n");
898 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def;
899 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
900 vect_reduction_def;
901 /* Store the reduction cycles for possible vectorization in
902 loop-aware SLP if it was not detected as reduction
903 chain. */
904 if (! GROUP_FIRST_ELEMENT (vinfo_for_stmt (reduc_stmt)))
905 LOOP_VINFO_REDUCTIONS (loop_vinfo).safe_push (reduc_stmt);
909 else
910 if (dump_enabled_p ())
911 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
912 "Unknown def-use cycle pattern.\n");
917 /* Function vect_analyze_scalar_cycles.
919 Examine the cross iteration def-use cycles of scalar variables, by
920 analyzing the loop-header PHIs of scalar variables. Classify each
921 cycle as one of the following: invariant, induction, reduction, unknown.
922 We do that for the loop represented by LOOP_VINFO, and also for its
923 inner-loop, if it exists.
924 Examples for scalar cycles:
926 Example1: reduction:
928 loop1:
929 for (i=0; i<N; i++)
930 sum += a[i];
932 Example2: induction:
934 loop2:
935 for (i=0; i<N; i++)
936 a[i] = i; */
938 static void
939 vect_analyze_scalar_cycles (loop_vec_info loop_vinfo)
941 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
943 vect_analyze_scalar_cycles_1 (loop_vinfo, loop);
945 /* When vectorizing an outer-loop, the inner-loop is executed sequentially.
946 Reductions in such an inner-loop therefore have different properties than
947 the reductions in the nest that gets vectorized:
948 1. When vectorized, they are executed in the same order as in the original
949 scalar loop, so we can't change the order of computation when
950 vectorizing them.
951 2. FIXME: Inner-loop reductions can be used in the inner-loop, so the
952 current checks are too strict. */
954 if (loop->inner)
955 vect_analyze_scalar_cycles_1 (loop_vinfo, loop->inner);
958 /* Transfer group and reduction information from STMT to its pattern stmt. */
960 static void
961 vect_fixup_reduc_chain (gimple *stmt)
963 gimple *firstp = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt));
964 gimple *stmtp;
965 gcc_assert (!GROUP_FIRST_ELEMENT (vinfo_for_stmt (firstp))
966 && GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)));
967 GROUP_SIZE (vinfo_for_stmt (firstp)) = GROUP_SIZE (vinfo_for_stmt (stmt));
970 stmtp = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt));
971 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmtp)) = firstp;
972 stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmt));
973 if (stmt)
974 GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmtp))
975 = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt));
977 while (stmt);
978 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmtp)) = vect_reduction_def;
981 /* Fixup scalar cycles that now have their stmts detected as patterns. */
983 static void
984 vect_fixup_scalar_cycles_with_patterns (loop_vec_info loop_vinfo)
986 gimple *first;
987 unsigned i;
989 FOR_EACH_VEC_ELT (LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo), i, first)
990 if (STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (first)))
992 gimple *next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (first));
993 while (next)
995 if (! STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (next)))
996 break;
997 next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
999 /* If not all stmts in the chain are patterns, try to handle
1000 the chain without patterns. */
1001 if (! next)
1003 vect_fixup_reduc_chain (first);
1004 LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo)[i]
1005 = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (first));
1010 /* Function vect_get_loop_niters.
1012 Determine how many iterations the loop is executed and place it
1013 in NUMBER_OF_ITERATIONS. Place the number of latch iterations
1014 in NUMBER_OF_ITERATIONSM1. Place the condition under which the
1015 niter information holds in ASSUMPTIONS.
1017 Return the loop exit condition. */
1020 static gcond *
1021 vect_get_loop_niters (struct loop *loop, tree *assumptions,
1022 tree *number_of_iterations, tree *number_of_iterationsm1)
1024 edge exit = single_exit (loop);
1025 struct tree_niter_desc niter_desc;
1026 tree niter_assumptions, niter, may_be_zero;
1027 gcond *cond = get_loop_exit_condition (loop);
1029 *assumptions = boolean_true_node;
1030 *number_of_iterationsm1 = chrec_dont_know;
1031 *number_of_iterations = chrec_dont_know;
1032 if (dump_enabled_p ())
1033 dump_printf_loc (MSG_NOTE, vect_location,
1034 "=== get_loop_niters ===\n");
1036 if (!exit)
1037 return cond;
1039 niter = chrec_dont_know;
1040 may_be_zero = NULL_TREE;
1041 niter_assumptions = boolean_true_node;
1042 if (!number_of_iterations_exit_assumptions (loop, exit, &niter_desc, NULL)
1043 || chrec_contains_undetermined (niter_desc.niter))
1044 return cond;
1046 niter_assumptions = niter_desc.assumptions;
1047 may_be_zero = niter_desc.may_be_zero;
1048 niter = niter_desc.niter;
1050 if (may_be_zero && integer_zerop (may_be_zero))
1051 may_be_zero = NULL_TREE;
1053 if (may_be_zero)
1055 if (COMPARISON_CLASS_P (may_be_zero))
1057 /* Try to combine may_be_zero with assumptions; this can simplify
1058 the computation of the niter expression. */
1059 if (niter_assumptions && !integer_nonzerop (niter_assumptions))
1060 niter_assumptions = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
1061 niter_assumptions,
1062 fold_build1 (TRUTH_NOT_EXPR,
1063 boolean_type_node,
1064 may_be_zero));
1065 else
1066 niter = fold_build3 (COND_EXPR, TREE_TYPE (niter), may_be_zero,
1067 build_int_cst (TREE_TYPE (niter), 0),
1068 rewrite_to_non_trapping_overflow (niter));
1070 may_be_zero = NULL_TREE;
1072 else if (integer_nonzerop (may_be_zero))
1074 *number_of_iterationsm1 = build_int_cst (TREE_TYPE (niter), 0);
1075 *number_of_iterations = build_int_cst (TREE_TYPE (niter), 1);
1076 return cond;
1078 else
1079 return cond;
1082 *assumptions = niter_assumptions;
1083 *number_of_iterationsm1 = niter;
1085 /* We want the number of loop header executions which is the number
1086 of latch executions plus one.
1087 ??? For UINT_MAX latch executions this number overflows to zero
1088 for loops like do { n++; } while (n != 0); */
1089 if (niter && !chrec_contains_undetermined (niter))
1090 niter = fold_build2 (PLUS_EXPR, TREE_TYPE (niter), unshare_expr (niter),
1091 build_int_cst (TREE_TYPE (niter), 1));
1092 *number_of_iterations = niter;
1094 return cond;
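
/* Hedged example (illustrative only): for "for (i = 0; i < n; i++)"
   with n > 0 the latch executes n - 1 times, so *NUMBER_OF_ITERATIONSM1
   is n - 1 and *NUMBER_OF_ITERATIONS, the number of loop header
   executions, is n.  */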
1097 /* Function bb_in_loop_p
1099 Used as predicate for dfs order traversal of the loop bbs. */
1101 static bool
1102 bb_in_loop_p (const_basic_block bb, const void *data)
1104 const struct loop *const loop = (const struct loop *)data;
1105 if (flow_bb_inside_loop_p (loop, bb))
1106 return true;
1107 return false;
1111 /* Create and initialize a new loop_vec_info struct for LOOP_IN, as well as
1112 stmt_vec_info structs for all the stmts in LOOP_IN. */
1114 _loop_vec_info::_loop_vec_info (struct loop *loop_in)
1115 : vec_info (vec_info::loop, init_cost (loop_in)),
1116 loop (loop_in),
1117 bbs (XCNEWVEC (basic_block, loop->num_nodes)),
1118 num_itersm1 (NULL_TREE),
1119 num_iters (NULL_TREE),
1120 num_iters_unchanged (NULL_TREE),
1121 num_iters_assumptions (NULL_TREE),
1122 th (0),
1123 versioning_threshold (0),
1124 vectorization_factor (0),
1125 max_vectorization_factor (0),
1126 mask_skip_niters (NULL_TREE),
1127 mask_compare_type (NULL_TREE),
1128 unaligned_dr (NULL),
1129 peeling_for_alignment (0),
1130 ptr_mask (0),
1131 ivexpr_map (NULL),
1132 slp_unrolling_factor (1),
1133 single_scalar_iteration_cost (0),
1134 vectorizable (false),
1135 can_fully_mask_p (true),
1136 fully_masked_p (false),
1137 peeling_for_gaps (false),
1138 peeling_for_niter (false),
1139 operands_swapped (false),
1140 no_data_dependencies (false),
1141 has_mask_store (false),
1142 scalar_loop (NULL),
1143 orig_loop_info (NULL)
1145 /* Create/Update stmt_info for all stmts in the loop. */
1146 basic_block *body = get_loop_body (loop);
1147 for (unsigned int i = 0; i < loop->num_nodes; i++)
1149 basic_block bb = body[i];
1150 gimple_stmt_iterator si;
1152 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
1154 gimple *phi = gsi_stmt (si);
1155 gimple_set_uid (phi, 0);
1156 set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, this));
1159 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1161 gimple *stmt = gsi_stmt (si);
1162 gimple_set_uid (stmt, 0);
1163 set_vinfo_for_stmt (stmt, new_stmt_vec_info (stmt, this));
1166 free (body);
1168 /* CHECKME: We want to visit all BBs before their successors (except for
1169 latch blocks, for which this assertion wouldn't hold). In the simple
1170 case of the loop forms we allow, a dfs order of the BBs would be the same
1171 as reversed postorder traversal, so we are safe. */
1173 unsigned int nbbs = dfs_enumerate_from (loop->header, 0, bb_in_loop_p,
1174 bbs, loop->num_nodes, loop);
1175 gcc_assert (nbbs == loop->num_nodes);
1178 /* Free all levels of MASKS. */
1180 void
1181 release_vec_loop_masks (vec_loop_masks *masks)
1183 rgroup_masks *rgm;
1184 unsigned int i;
1185 FOR_EACH_VEC_ELT (*masks, i, rgm)
1186 rgm->masks.release ();
1187 masks->release ();
1190 /* Free all memory used by the _loop_vec_info, as well as all the
1191 stmt_vec_info structs of all the stmts in the loop. */
1193 _loop_vec_info::~_loop_vec_info ()
1195 int nbbs;
1196 gimple_stmt_iterator si;
1197 int j;
1199 nbbs = loop->num_nodes;
1200 for (j = 0; j < nbbs; j++)
1202 basic_block bb = bbs[j];
1203 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
1204 free_stmt_vec_info (gsi_stmt (si));
1206 for (si = gsi_start_bb (bb); !gsi_end_p (si); )
1208 gimple *stmt = gsi_stmt (si);
1210 /* We may have broken canonical form by moving a constant
1211 into RHS1 of a commutative op. Fix such occurrences. */
1212 if (operands_swapped && is_gimple_assign (stmt))
1214 enum tree_code code = gimple_assign_rhs_code (stmt);
1216 if ((code == PLUS_EXPR
1217 || code == POINTER_PLUS_EXPR
1218 || code == MULT_EXPR)
1219 && CONSTANT_CLASS_P (gimple_assign_rhs1 (stmt)))
1220 swap_ssa_operands (stmt,
1221 gimple_assign_rhs1_ptr (stmt),
1222 gimple_assign_rhs2_ptr (stmt));
1223 else if (code == COND_EXPR
1224 && CONSTANT_CLASS_P (gimple_assign_rhs2 (stmt)))
1226 tree cond_expr = gimple_assign_rhs1 (stmt);
1227 enum tree_code cond_code = TREE_CODE (cond_expr);
1229 if (TREE_CODE_CLASS (cond_code) == tcc_comparison)
1231 bool honor_nans = HONOR_NANS (TREE_OPERAND (cond_expr,
1232 0));
1233 cond_code = invert_tree_comparison (cond_code,
1234 honor_nans);
1235 if (cond_code != ERROR_MARK)
1237 TREE_SET_CODE (cond_expr, cond_code);
1238 swap_ssa_operands (stmt,
1239 gimple_assign_rhs2_ptr (stmt),
1240 gimple_assign_rhs3_ptr (stmt));
1246 /* Free stmt_vec_info. */
1247 free_stmt_vec_info (stmt);
1248 gsi_next (&si);
1252 free (bbs);
1254 release_vec_loop_masks (&masks);
1255 delete ivexpr_map;
1257 loop->aux = NULL;
1260 /* Return an invariant or register for EXPR and emit necessary
1261 computations in the LOOP_VINFO loop preheader. */
1263 tree
1264 cse_and_gimplify_to_preheader (loop_vec_info loop_vinfo, tree expr)
1266 if (is_gimple_reg (expr)
1267 || is_gimple_min_invariant (expr))
1268 return expr;
1270 if (! loop_vinfo->ivexpr_map)
1271 loop_vinfo->ivexpr_map = new hash_map<tree_operand_hash, tree>;
1272 tree &cached = loop_vinfo->ivexpr_map->get_or_insert (expr);
1273 if (! cached)
1275 gimple_seq stmts = NULL;
1276 cached = force_gimple_operand (unshare_expr (expr),
1277 &stmts, true, NULL_TREE);
1278 if (stmts)
1280 edge e = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
1281 gsi_insert_seq_on_edge_immediate (e, stmts);
1284 return cached;
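
/* Hedged usage sketch (hypothetical caller, not from this file):

     tree niters_reg
       = cse_and_gimplify_to_preheader (loop_vinfo, niters_expr);

   afterwards NITERS_REG is either an invariant or an SSA name whose
   defining statements have been inserted on the preheader edge, and a
   repeated call with a structurally equal EXPR reuses the cached
   result.  */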
1287 /* Return true if we can use CMP_TYPE as the comparison type to produce
1288 all masks required to mask LOOP_VINFO. */
1290 static bool
1291 can_produce_all_loop_masks_p (loop_vec_info loop_vinfo, tree cmp_type)
1293 rgroup_masks *rgm;
1294 unsigned int i;
1295 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), i, rgm)
1296 if (rgm->mask_type != NULL_TREE
1297 && !direct_internal_fn_supported_p (IFN_WHILE_ULT,
1298 cmp_type, rgm->mask_type,
1299 OPTIMIZE_FOR_SPEED))
1300 return false;
1301 return true;
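
/* Hedged note on the internal function tested above (a summary assumed
   by this sketch rather than defined here): .WHILE_ULT (START, END)
   produces a mask whose element I is (START + I < END), so a mask of
   the rgroup's mask_type can be generated from a scalar counter of
   CMP_TYPE whenever the target supports that (CMP_TYPE, mask_type)
   pairing.  */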
1304 /* Calculate the maximum number of scalars per iteration for every
1305 rgroup in LOOP_VINFO. */
1307 static unsigned int
1308 vect_get_max_nscalars_per_iter (loop_vec_info loop_vinfo)
1310 unsigned int res = 1;
1311 unsigned int i;
1312 rgroup_masks *rgm;
1313 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), i, rgm)
1314 res = MAX (res, rgm->max_nscalars_per_iter);
1315 return res;
1318 /* Each statement in LOOP_VINFO can be masked where necessary. Check
1319 whether we can actually generate the masks required. Return true if so,
1320 storing the type of the scalar IV in LOOP_VINFO_MASK_COMPARE_TYPE. */
1322 static bool
1323 vect_verify_full_masking (loop_vec_info loop_vinfo)
1325 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1326 unsigned int min_ni_width;
1328 /* Use a normal loop if there are no statements that need masking.
1329 This only happens in rare degenerate cases: it means that the loop
1330 has no loads, no stores, and no live-out values. */
1331 if (LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
1332 return false;
1334 /* Get the maximum number of iterations that is representable
1335 in the counter type. */
1336 tree ni_type = TREE_TYPE (LOOP_VINFO_NITERSM1 (loop_vinfo));
1337 widest_int max_ni = wi::to_widest (TYPE_MAX_VALUE (ni_type)) + 1;
1339 /* Get a more refined estimate for the number of iterations. */
1340 widest_int max_back_edges;
1341 if (max_loop_iterations (loop, &max_back_edges))
1342 max_ni = wi::smin (max_ni, max_back_edges + 1);
1344 /* Account for rgroup masks, in which each bit is replicated N times. */
1345 max_ni *= vect_get_max_nscalars_per_iter (loop_vinfo);
1347 /* Work out how many bits we need to represent the limit. */
1348 min_ni_width = wi::min_precision (max_ni, UNSIGNED);
1350 /* Find a scalar mode for which WHILE_ULT is supported. */
1351 opt_scalar_int_mode cmp_mode_iter;
1352 tree cmp_type = NULL_TREE;
1353 FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)
1355 unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ());
1356 if (cmp_bits >= min_ni_width
1357 && targetm.scalar_mode_supported_p (cmp_mode_iter.require ()))
1359 tree this_type = build_nonstandard_integer_type (cmp_bits, true);
1360 if (this_type
1361 && can_produce_all_loop_masks_p (loop_vinfo, this_type))
1363 /* Although we could stop as soon as we find a valid mode,
1364 it's often better to continue until we hit Pmode, since the
1365 operands to the WHILE are more likely to be reusable in
1366 address calculations. */
1367 cmp_type = this_type;
1368 if (cmp_bits >= GET_MODE_BITSIZE (Pmode))
1369 break;
1374 if (!cmp_type)
1375 return false;
1377 LOOP_VINFO_MASK_COMPARE_TYPE (loop_vinfo) = cmp_type;
1378 return true;
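
/* Hedged worked example for the width computation above (the numbers
   are illustrative, not from the sources): if the loop is known to run
   at most 1000 iterations and the widest rgroup has 2 scalars per
   iteration, max_ni is 2000, min_ni_width is 11 bits, and the first
   supported integer mode of at least 11 bits that can produce every
   required mask becomes LOOP_VINFO_MASK_COMPARE_TYPE.  */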
1381 /* Calculate the cost of one scalar iteration of the loop. */
1382 static void
1383 vect_compute_single_scalar_iteration_cost (loop_vec_info loop_vinfo)
1385 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1386 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1387 int nbbs = loop->num_nodes, factor;
1388 int innerloop_iters, i;
1390 /* Gather costs for statements in the scalar loop. */
1392 /* FORNOW. */
1393 innerloop_iters = 1;
1394 if (loop->inner)
1395 innerloop_iters = 50; /* FIXME */
1397 for (i = 0; i < nbbs; i++)
1399 gimple_stmt_iterator si;
1400 basic_block bb = bbs[i];
1402 if (bb->loop_father == loop->inner)
1403 factor = innerloop_iters;
1404 else
1405 factor = 1;
1407 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1409 gimple *stmt = gsi_stmt (si);
1410 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1412 if (!is_gimple_assign (stmt) && !is_gimple_call (stmt))
1413 continue;
1415 /* Skip stmts that are not vectorized inside the loop. */
1416 if (stmt_info
1417 && !STMT_VINFO_RELEVANT_P (stmt_info)
1418 && (!STMT_VINFO_LIVE_P (stmt_info)
1419 || !VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1420 && !STMT_VINFO_IN_PATTERN_P (stmt_info))
1421 continue;
1423 vect_cost_for_stmt kind;
1424 if (STMT_VINFO_DATA_REF (stmt_info))
1426 if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
1427 kind = scalar_load;
1428 else
1429 kind = scalar_store;
1431 else
1432 kind = scalar_stmt;
1434 record_stmt_cost (&LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1435 factor, kind, stmt_info, 0, vect_prologue);
1439 /* Now accumulate cost. */
1440 void *target_cost_data = init_cost (loop);
1441 stmt_info_for_cost *si;
1442 int j;
1443 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1444 j, si)
1446 struct _stmt_vec_info *stmt_info
1447 = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
1448 (void) add_stmt_cost (target_cost_data, si->count,
1449 si->kind, stmt_info, si->misalign,
1450 vect_body);
1452 unsigned dummy, body_cost = 0;
1453 finish_cost (target_cost_data, &dummy, &body_cost, &dummy);
1454 destroy_cost_data (target_cost_data);
1455 LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo) = body_cost;
1459 /* Function vect_analyze_loop_form_1.
1461 Verify that certain CFG restrictions hold, including:
1462 - the loop has a pre-header
1463 - the loop has a single entry and exit
1464 - the loop exit condition is simple enough
1465 - the number of iterations can be analyzed, i.e., a countable loop. The
1466 niter could be analyzed under some assumptions. */
1468 bool
1469 vect_analyze_loop_form_1 (struct loop *loop, gcond **loop_cond,
1470 tree *assumptions, tree *number_of_iterationsm1,
1471 tree *number_of_iterations, gcond **inner_loop_cond)
1473 if (dump_enabled_p ())
1474 dump_printf_loc (MSG_NOTE, vect_location,
1475 "=== vect_analyze_loop_form ===\n");
1477 /* Different restrictions apply when we are considering an inner-most loop,
1478 vs. an outer (nested) loop.
1479 (FORNOW. May want to relax some of these restrictions in the future). */
1481 if (!loop->inner)
1483 /* Inner-most loop. We currently require that the number of BBs is
1484 exactly 2 (the header and latch). Vectorizable inner-most loops
1485 look like this:
1487 (pre-header)
1489 header <--------+
1490 | | |
1491 | +--> latch --+
1493 (exit-bb) */
1495 if (loop->num_nodes != 2)
1497 if (dump_enabled_p ())
1498 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1499 "not vectorized: control flow in loop.\n");
1500 return false;
1503 if (empty_block_p (loop->header))
1505 if (dump_enabled_p ())
1506 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1507 "not vectorized: empty loop.\n");
1508 return false;
1511 else
1513 struct loop *innerloop = loop->inner;
1514 edge entryedge;
1516 /* Nested loop. We currently require that the loop is doubly-nested,
1517 contains a single inner loop, and the number of BBs is exactly 5.
1518 Vectorizable outer-loops look like this:
1520 (pre-header)
1522 header <---+
1524 inner-loop |
1526 tail ------+
1528 (exit-bb)
1530 The inner-loop has the properties expected of inner-most loops
1531 as described above. */
1533 if ((loop->inner)->inner || (loop->inner)->next)
1535 if (dump_enabled_p ())
1536 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1537 "not vectorized: multiple nested loops.\n");
1538 return false;
1541 if (loop->num_nodes != 5)
1543 if (dump_enabled_p ())
1544 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1545 "not vectorized: control flow in loop.\n");
1546 return false;
1549 entryedge = loop_preheader_edge (innerloop);
1550 if (entryedge->src != loop->header
1551 || !single_exit (innerloop)
1552 || single_exit (innerloop)->dest != EDGE_PRED (loop->latch, 0)->src)
1554 if (dump_enabled_p ())
1555 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1556 "not vectorized: unsupported outerloop form.\n");
1557 return false;
1560 /* Analyze the inner-loop. */
1561 tree inner_niterm1, inner_niter, inner_assumptions;
1562 if (! vect_analyze_loop_form_1 (loop->inner, inner_loop_cond,
1563 &inner_assumptions, &inner_niterm1,
1564 &inner_niter, NULL)
1565 /* Don't support analyzing niter under assumptions for inner
1566 loop. */
1567 || !integer_onep (inner_assumptions))
1569 if (dump_enabled_p ())
1570 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1571 "not vectorized: Bad inner loop.\n");
1572 return false;
1575 if (!expr_invariant_in_loop_p (loop, inner_niter))
1577 if (dump_enabled_p ())
1578 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1579 "not vectorized: inner-loop count not"
1580 " invariant.\n");
1581 return false;
1584 if (dump_enabled_p ())
1585 dump_printf_loc (MSG_NOTE, vect_location,
1586 "Considering outer-loop vectorization.\n");
1589 if (!single_exit (loop)
1590 || EDGE_COUNT (loop->header->preds) != 2)
1592 if (dump_enabled_p ())
1594 if (!single_exit (loop))
1595 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1596 "not vectorized: multiple exits.\n");
1597 else if (EDGE_COUNT (loop->header->preds) != 2)
1598 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1599 "not vectorized: too many incoming edges.\n");
1601 return false;
1604 /* We assume that the loop exit condition is at the end of the loop, i.e.,
1605 that the loop is represented as a do-while (with a proper if-guard
1606 before the loop if needed), where the loop header contains all the
1607 executable statements, and the latch is empty. */
1608 if (!empty_block_p (loop->latch)
1609 || !gimple_seq_empty_p (phi_nodes (loop->latch)))
1611 if (dump_enabled_p ())
1612 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1613 "not vectorized: latch block not empty.\n");
1614 return false;
1617 /* Make sure the exit is not abnormal. */
1618 edge e = single_exit (loop);
1619 if (e->flags & EDGE_ABNORMAL)
1621 if (dump_enabled_p ())
1622 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1623 "not vectorized: abnormal loop exit edge.\n");
1624 return false;
1627 *loop_cond = vect_get_loop_niters (loop, assumptions, number_of_iterations,
1628 number_of_iterationsm1);
1629 if (!*loop_cond)
1631 if (dump_enabled_p ())
1632 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1633 "not vectorized: complicated exit condition.\n");
1634 return false;
1637 if (integer_zerop (*assumptions)
1638 || !*number_of_iterations
1639 || chrec_contains_undetermined (*number_of_iterations))
1641 if (dump_enabled_p ())
1642 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1643 "not vectorized: number of iterations cannot be "
1644 "computed.\n");
1645 return false;
1648 if (integer_zerop (*number_of_iterations))
1650 if (dump_enabled_p ())
1651 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1652 "not vectorized: number of iterations = 0.\n");
1653 return false;
1656 return true;
1659 /* Analyze LOOP form and return a loop_vec_info if it is of suitable form. */
1661 loop_vec_info
1662 vect_analyze_loop_form (struct loop *loop)
1664 tree assumptions, number_of_iterations, number_of_iterationsm1;
1665 gcond *loop_cond, *inner_loop_cond = NULL;
1667 if (! vect_analyze_loop_form_1 (loop, &loop_cond,
1668 &assumptions, &number_of_iterationsm1,
1669 &number_of_iterations, &inner_loop_cond))
1670 return NULL;
1672 loop_vec_info loop_vinfo = new _loop_vec_info (loop);
1673 LOOP_VINFO_NITERSM1 (loop_vinfo) = number_of_iterationsm1;
1674 LOOP_VINFO_NITERS (loop_vinfo) = number_of_iterations;
1675 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = number_of_iterations;
1676 if (!integer_onep (assumptions))
1678 /* We consider to vectorize this loop by versioning it under
1679 some assumptions. In order to do this, we need to clear
1680 existing information computed by scev and niter analyzer. */
1681 scev_reset_htab ();
1682 free_numbers_of_iterations_estimates (loop);
1683 /* Also set flag for this loop so that following scev and niter
1684 analysis are done under the assumptions. */
1685 loop_constraint_set (loop, LOOP_C_FINITE);
1686 /* Also record the assumptions for versioning. */
1687 LOOP_VINFO_NITERS_ASSUMPTIONS (loop_vinfo) = assumptions;
1690 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1692 if (dump_enabled_p ())
1694 dump_printf_loc (MSG_NOTE, vect_location,
1695 "Symbolic number of iterations is ");
1696 dump_generic_expr (MSG_NOTE, TDF_DETAILS, number_of_iterations);
1697 dump_printf (MSG_NOTE, "\n");
1701 STMT_VINFO_TYPE (vinfo_for_stmt (loop_cond)) = loop_exit_ctrl_vec_info_type;
1702 if (inner_loop_cond)
1703 STMT_VINFO_TYPE (vinfo_for_stmt (inner_loop_cond))
1704 = loop_exit_ctrl_vec_info_type;
1706 gcc_assert (!loop->aux);
1707 loop->aux = loop_vinfo;
1708 return loop_vinfo;
1713 /* Scan the loop stmts and, depending on whether there are any (non-)SLP
1714 statements, update the vectorization factor.
1716 static void
1717 vect_update_vf_for_slp (loop_vec_info loop_vinfo)
1719 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1720 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1721 int nbbs = loop->num_nodes;
1722 poly_uint64 vectorization_factor;
1723 int i;
1725 if (dump_enabled_p ())
1726 dump_printf_loc (MSG_NOTE, vect_location,
1727 "=== vect_update_vf_for_slp ===\n");
1729 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1730 gcc_assert (known_ne (vectorization_factor, 0U));
1732 /* If all the stmts in the loop can be SLPed, we perform only SLP, and
1733 the vectorization factor of the loop is the unrolling factor required by
1734 the SLP instances. If that unrolling factor is 1, we say that we
1735 perform pure SLP on the loop; cross-iteration parallelism is not
1736 exploited. */
1737 bool only_slp_in_loop = true;
1738 for (i = 0; i < nbbs; i++)
1740 basic_block bb = bbs[i];
1741 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1742 gsi_next (&si))
1744 gimple *stmt = gsi_stmt (si);
1745 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1746 if (STMT_VINFO_IN_PATTERN_P (stmt_info)
1747 && STMT_VINFO_RELATED_STMT (stmt_info))
1749 stmt = STMT_VINFO_RELATED_STMT (stmt_info);
1750 stmt_info = vinfo_for_stmt (stmt);
1752 if ((STMT_VINFO_RELEVANT_P (stmt_info)
1753 || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1754 && !PURE_SLP_STMT (stmt_info))
1755 /* STMT needs both SLP and loop-based vectorization. */
1756 only_slp_in_loop = false;
1760 if (only_slp_in_loop)
1762 dump_printf_loc (MSG_NOTE, vect_location,
1763 "Loop contains only SLP stmts\n");
1764 vectorization_factor = LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo);
1766 else
1768 dump_printf_loc (MSG_NOTE, vect_location,
1769 "Loop contains SLP and non-SLP stmts\n");
1770 /* Both the vectorization factor and unroll factor have the form
1771 current_vector_size * X for some rational X, so they must have
1772 a common multiple. */
1773 vectorization_factor
1774 = force_common_multiple (vectorization_factor,
1775 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo));
1778 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
1779 if (dump_enabled_p ())
1781 dump_printf_loc (MSG_NOTE, vect_location,
1782 "Updating vectorization factor to ");
1783 dump_dec (MSG_NOTE, vectorization_factor);
1784 dump_printf (MSG_NOTE, ".\n");
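
/* Hedged example for the update above (illustrative values): if the
   non-SLP statements require a vectorization factor of 4 and the SLP
   instances require an unrolling factor of 8, force_common_multiple
   yields 8, so the loop is vectorized with VF 8.  */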
1788 /* Return true if STMT_INFO describes a double reduction phi and if
1789 the other phi in the reduction is also relevant for vectorization.
1790 This rejects cases such as:
1792 outer1:
1793 x_1 = PHI <x_3(outer2), ...>;
1796 inner:
1797 x_2 = ...;
1800 outer2:
1801 x_3 = PHI <x_2(inner)>;
1803 if nothing in x_2 or elsewhere makes x_1 relevant. */
1805 static bool
1806 vect_active_double_reduction_p (stmt_vec_info stmt_info)
1808 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
1809 return false;
1811 gimple *other_phi = STMT_VINFO_REDUC_DEF (stmt_info);
1812 return STMT_VINFO_RELEVANT_P (vinfo_for_stmt (other_phi));
1815 /* Function vect_analyze_loop_operations.
1817 Scan the loop stmts and make sure they are all vectorizable. */
1819 static bool
1820 vect_analyze_loop_operations (loop_vec_info loop_vinfo)
1822 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1823 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1824 int nbbs = loop->num_nodes;
1825 int i;
1826 stmt_vec_info stmt_info;
1827 bool need_to_vectorize = false;
1828 bool ok;
1830 if (dump_enabled_p ())
1831 dump_printf_loc (MSG_NOTE, vect_location,
1832 "=== vect_analyze_loop_operations ===\n");
1834 for (i = 0; i < nbbs; i++)
1836 basic_block bb = bbs[i];
1838 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
1839 gsi_next (&si))
1841 gphi *phi = si.phi ();
1842 ok = true;
1844 stmt_info = vinfo_for_stmt (phi);
1845 if (dump_enabled_p ())
1847 dump_printf_loc (MSG_NOTE, vect_location, "examining phi: ");
1848 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
1850 if (virtual_operand_p (gimple_phi_result (phi)))
1851 continue;
1853 /* Inner-loop loop-closed exit phi in outer-loop vectorization
1854 (i.e., a phi in the tail of the outer-loop). */
1855 if (! is_loop_header_bb_p (bb))
1857 /* FORNOW: we currently don't support the case in which these phis
1858 are not used in the outer loop (unless it is a double reduction,
1859 i.e., this phi is vect_reduction_def), because this case
1860 requires us to actually do something here. */
1861 if (STMT_VINFO_LIVE_P (stmt_info)
1862 && !vect_active_double_reduction_p (stmt_info))
1864 if (dump_enabled_p ())
1865 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1866 "Unsupported loop-closed phi in "
1867 "outer-loop.\n");
1868 return false;
1871 /* If PHI is used in the outer loop, we check that its operand
1872 is defined in the inner loop. */
1873 if (STMT_VINFO_RELEVANT_P (stmt_info))
1875 tree phi_op;
1876 gimple *op_def_stmt;
1878 if (gimple_phi_num_args (phi) != 1)
1879 return false;
1881 phi_op = PHI_ARG_DEF (phi, 0);
1882 if (TREE_CODE (phi_op) != SSA_NAME)
1883 return false;
1885 op_def_stmt = SSA_NAME_DEF_STMT (phi_op);
1886 if (gimple_nop_p (op_def_stmt)
1887 || !flow_bb_inside_loop_p (loop, gimple_bb (op_def_stmt))
1888 || !vinfo_for_stmt (op_def_stmt))
1889 return false;
1891 if (STMT_VINFO_RELEVANT (vinfo_for_stmt (op_def_stmt))
1892 != vect_used_in_outer
1893 && STMT_VINFO_RELEVANT (vinfo_for_stmt (op_def_stmt))
1894 != vect_used_in_outer_by_reduction)
1895 return false;
1898 continue;
1901 gcc_assert (stmt_info);
1903 if ((STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope
1904 || STMT_VINFO_LIVE_P (stmt_info))
1905 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
1907 /* A scalar-dependence cycle that we don't support. */
1908 if (dump_enabled_p ())
1909 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1910 "not vectorized: scalar dependence cycle.\n");
1911 return false;
1914 if (STMT_VINFO_RELEVANT_P (stmt_info))
1916 need_to_vectorize = true;
1917 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
1918 && ! PURE_SLP_STMT (stmt_info))
1919 ok = vectorizable_induction (phi, NULL, NULL, NULL);
1920 else if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
1921 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
1922 && ! PURE_SLP_STMT (stmt_info))
1923 ok = vectorizable_reduction (phi, NULL, NULL, NULL, NULL);
1926 /* SLP PHIs are tested by vect_slp_analyze_node_operations. */
1927 if (ok
1928 && STMT_VINFO_LIVE_P (stmt_info)
1929 && !PURE_SLP_STMT (stmt_info))
1930 ok = vectorizable_live_operation (phi, NULL, NULL, -1, NULL);
1932 if (!ok)
1934 if (dump_enabled_p ())
1936 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1937 "not vectorized: relevant phi not "
1938 "supported: ");
1939 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, phi, 0);
1941 return false;
1945 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1946 gsi_next (&si))
1948 gimple *stmt = gsi_stmt (si);
1949 if (!gimple_clobber_p (stmt)
1950 && !vect_analyze_stmt (stmt, &need_to_vectorize, NULL, NULL))
1951 return false;
1953 } /* bbs */
1955 /* All operations in the loop are either irrelevant (they deal with
1956 loop control, or are dead), or only used outside the loop and can be
1957 moved out of the loop (e.g. invariants, inductions). The loop can be
1958 optimized away by scalar optimizations. We're better off not
1959 touching this loop. */
1960 if (!need_to_vectorize)
1962 if (dump_enabled_p ())
1963 dump_printf_loc (MSG_NOTE, vect_location,
1964 "All the computation can be taken out of the loop.\n");
1965 if (dump_enabled_p ())
1966 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1967 "not vectorized: redundant loop. no profit to "
1968 "vectorize.\n");
1969 return false;
1972 return true;
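/* Illustrative sketch, not part of this pass: a loop of the shape that is
   typically rejected by the need_to_vectorize check above, because the
   only thing it computes is its own induction variable, which scalar
   optimizations can express in closed form.  The name is made up.  */

static int
example_redundant_loop (int n)
{
  int i;
  for (i = 0; i < n; i++)
    ;                           /* Only loop control; nothing relevant
                                   remains to be vectorized.  */
  return i;
}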
1975 /* Analyze the cost of the loop described by LOOP_VINFO. Decide if it
1976 is worthwhile to vectorize. Return 1 if definitely yes, 0 if
1977 definitely no, or -1 if it's worth retrying. */
1979 static int
1980 vect_analyze_loop_costing (loop_vec_info loop_vinfo)
1982 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1983 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
1985 /* Only fully-masked loops can have iteration counts less than the
1986 vectorization factor. */
1987 if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
1989 HOST_WIDE_INT max_niter;
1991 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1992 max_niter = LOOP_VINFO_INT_NITERS (loop_vinfo);
1993 else
1994 max_niter = max_stmt_executions_int (loop);
1996 if (max_niter != -1
1997 && (unsigned HOST_WIDE_INT) max_niter < assumed_vf)
1999 if (dump_enabled_p ())
2000 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2001 "not vectorized: iteration count smaller than "
2002 "vectorization factor.\n");
2003 return 0;
2007 int min_profitable_iters, min_profitable_estimate;
2008 vect_estimate_min_profitable_iters (loop_vinfo, &min_profitable_iters,
2009 &min_profitable_estimate);
2011 if (min_profitable_iters < 0)
2013 if (dump_enabled_p ())
2014 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2015 "not vectorized: vectorization not profitable.\n");
2016 if (dump_enabled_p ())
2017 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2018 "not vectorized: vector version will never be "
2019 "profitable.\n");
2020 return -1;
2023 int min_scalar_loop_bound = (PARAM_VALUE (PARAM_MIN_VECT_LOOP_BOUND)
2024 * assumed_vf);
2026 /* Use the cost model only if it is more conservative than the
2027 user-specified threshold. */
2028 unsigned int th = (unsigned) MAX (min_scalar_loop_bound,
2029 min_profitable_iters);
2031 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = th;
2033 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2034 && LOOP_VINFO_INT_NITERS (loop_vinfo) < th)
2036 if (dump_enabled_p ())
2037 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2038 "not vectorized: vectorization not profitable.\n");
2039 if (dump_enabled_p ())
2040 dump_printf_loc (MSG_NOTE, vect_location,
2041 "not vectorized: iteration count smaller than user "
2042 "specified loop bound parameter or minimum profitable "
2043 "iterations (whichever is more conservative).\n");
2044 return 0;
2047 HOST_WIDE_INT estimated_niter = estimated_stmt_executions_int (loop);
2048 if (estimated_niter == -1)
2049 estimated_niter = likely_max_stmt_executions_int (loop);
2050 if (estimated_niter != -1
2051 && ((unsigned HOST_WIDE_INT) estimated_niter
2052 < MAX (th, (unsigned) min_profitable_estimate)))
2054 if (dump_enabled_p ())
2055 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2056 "not vectorized: estimated iteration count too "
2057 "small.\n");
2058 if (dump_enabled_p ())
2059 dump_printf_loc (MSG_NOTE, vect_location,
2060 "not vectorized: estimated iteration count smaller "
2061 "than specified loop bound parameter or minimum "
2062 "profitable iterations (whichever is more "
2063 "conservative).\n");
2064 return -1;
2067 return 1;
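/* Illustrative sketch, not part of this pass: the threshold computation
   above written out in plain C with made-up example numbers.
   min_vect_loop_bound stands in for PARAM_VALUE (PARAM_MIN_VECT_LOOP_BOUND)
   and is not a real parameter name.  */

static unsigned int
example_cost_model_threshold (int min_vect_loop_bound, int assumed_vf,
                              int min_profitable_iters)
{
  /* The parameter is expressed in vector iterations, so scale it by the
     vectorization factor to get scalar iterations.  */
  int min_scalar_loop_bound = min_vect_loop_bound * assumed_vf;

  /* Use whichever requirement is more conservative (larger).  */
  unsigned int th = (unsigned) (min_scalar_loop_bound > min_profitable_iters
                                ? min_scalar_loop_bound
                                : min_profitable_iters);

  /* E.g. min_vect_loop_bound = 0, assumed_vf = 4 and
     min_profitable_iters = 11 give th = 11: a loop known to run fewer
     than 11 scalar iterations fails the static profitability check.  */
  return th;
}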
2071 /* Function vect_analyze_loop_2.
2073 Apply a set of analyses on LOOP, and create a loop_vec_info struct
2074 for it. The different analyses will record information in the
2075 loop_vec_info struct. */
2076 static bool
2077 vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal)
2079 bool ok;
2080 int res;
2081 unsigned int max_vf = MAX_VECTORIZATION_FACTOR;
2082 poly_uint64 min_vf = 2;
2083 unsigned int n_stmts = 0;
2085 /* The first group of checks is independent of the vector size. */
2086 fatal = true;
2088 /* Find all data references in the loop (which correspond to vdefs/vuses)
2089 and analyze their evolution in the loop. */
2091 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
2093 loop_p loop = LOOP_VINFO_LOOP (loop_vinfo);
2094 if (!find_loop_nest (loop, &LOOP_VINFO_LOOP_NEST (loop_vinfo)))
2096 if (dump_enabled_p ())
2097 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2098 "not vectorized: loop nest containing two "
2099 "or more consecutive inner loops cannot be "
2100 "vectorized\n");
2101 return false;
2104 for (unsigned i = 0; i < loop->num_nodes; i++)
2105 for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
2106 !gsi_end_p (gsi); gsi_next (&gsi))
2108 gimple *stmt = gsi_stmt (gsi);
2109 if (is_gimple_debug (stmt))
2110 continue;
2111 ++n_stmts;
2112 if (!find_data_references_in_stmt (loop, stmt,
2113 &LOOP_VINFO_DATAREFS (loop_vinfo)))
2115 if (is_gimple_call (stmt) && loop->safelen)
2117 tree fndecl = gimple_call_fndecl (stmt), op;
2118 if (fndecl != NULL_TREE)
2120 cgraph_node *node = cgraph_node::get (fndecl);
2121 if (node != NULL && node->simd_clones != NULL)
2123 unsigned int j, n = gimple_call_num_args (stmt);
2124 for (j = 0; j < n; j++)
2126 op = gimple_call_arg (stmt, j);
2127 if (DECL_P (op)
2128 || (REFERENCE_CLASS_P (op)
2129 && get_base_address (op)))
2130 break;
2132 op = gimple_call_lhs (stmt);
2133 /* Ignore #pragma omp declare simd functions
2134 if they don't have data references in the
2135 call stmt itself. */
2136 if (j == n
2137 && !(op
2138 && (DECL_P (op)
2139 || (REFERENCE_CLASS_P (op)
2140 && get_base_address (op)))))
2141 continue;
2145 if (dump_enabled_p ())
2146 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2147 "not vectorized: loop contains function "
2148 "calls or data references that cannot "
2149 "be analyzed\n");
2150 return false;
2154 /* Analyze the data references and also adjust the minimal
2155 vectorization factor according to the loads and stores. */
2157 ok = vect_analyze_data_refs (loop_vinfo, &min_vf);
2158 if (!ok)
2160 if (dump_enabled_p ())
2161 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2162 "bad data references.\n");
2163 return false;
2166 /* Classify all cross-iteration scalar data-flow cycles.
2167 Cross-iteration cycles caused by virtual phis are analyzed separately. */
2168 vect_analyze_scalar_cycles (loop_vinfo);
2170 vect_pattern_recog (loop_vinfo);
2172 vect_fixup_scalar_cycles_with_patterns (loop_vinfo);
2174 /* Analyze the access patterns of the data-refs in the loop (consecutive,
2175 complex, etc.). FORNOW: Only handle consecutive access pattern. */
2177 ok = vect_analyze_data_ref_accesses (loop_vinfo);
2178 if (!ok)
2180 if (dump_enabled_p ())
2181 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2182 "bad data access.\n");
2183 return false;
2186 /* Data-flow analysis to detect stmts that do not need to be vectorized. */
2188 ok = vect_mark_stmts_to_be_vectorized (loop_vinfo);
2189 if (!ok)
2191 if (dump_enabled_p ())
2192 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2193 "unexpected pattern.\n");
2194 return false;
2197 /* The rest of the analysis below depends on the vector size in some way, so failures from here on are no longer fatal. */
2198 fatal = false;
2200 /* Analyze data dependences between the data-refs in the loop
2201 and adjust the maximum vectorization factor according to
2202 the dependences.
2203 FORNOW: fail at the first data dependence that we encounter. */
2205 ok = vect_analyze_data_ref_dependences (loop_vinfo, &max_vf);
2206 if (!ok
2207 || (max_vf != MAX_VECTORIZATION_FACTOR
2208 && maybe_lt (max_vf, min_vf)))
2210 if (dump_enabled_p ())
2211 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2212 "bad data dependence.\n");
2213 return false;
2215 LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo) = max_vf;
2217 ok = vect_determine_vectorization_factor (loop_vinfo);
2218 if (!ok)
2220 if (dump_enabled_p ())
2221 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2222 "can't determine vectorization factor.\n");
2223 return false;
2225 if (max_vf != MAX_VECTORIZATION_FACTOR
2226 && maybe_lt (max_vf, LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
2228 if (dump_enabled_p ())
2229 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2230 "bad data dependence.\n");
2231 return false;
2234 /* Compute the scalar iteration cost. */
2235 vect_compute_single_scalar_iteration_cost (loop_vinfo);
2237 poly_uint64 saved_vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2238 unsigned th;
2240 /* Check the SLP opportunities in the loop, analyze and build SLP trees. */
2241 ok = vect_analyze_slp (loop_vinfo, n_stmts);
2242 if (!ok)
2243 return false;
2245 /* If there are any SLP instances mark them as pure_slp. */
2246 bool slp = vect_make_slp_decision (loop_vinfo);
2247 if (slp)
2249 /* Find stmts that need to be both vectorized and SLPed. */
2250 vect_detect_hybrid_slp (loop_vinfo);
2252 /* Update the vectorization factor based on the SLP decision. */
2253 vect_update_vf_for_slp (loop_vinfo);
2256 bool saved_can_fully_mask_p = LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo);
2258 /* We don't expect to have to roll back to anything other than an empty
2259 set of rgroups. */
2260 gcc_assert (LOOP_VINFO_MASKS (loop_vinfo).is_empty ());
2262 /* This is the point where we can re-start analysis with SLP forced off. */
2263 start_over:
2265 /* Now the vectorization factor is final. */
2266 poly_uint64 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2267 gcc_assert (known_ne (vectorization_factor, 0U));
2269 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && dump_enabled_p ())
2271 dump_printf_loc (MSG_NOTE, vect_location,
2272 "vectorization_factor = ");
2273 dump_dec (MSG_NOTE, vectorization_factor);
2274 dump_printf (MSG_NOTE, ", niters = " HOST_WIDE_INT_PRINT_DEC "\n",
2275 LOOP_VINFO_INT_NITERS (loop_vinfo));
2278 HOST_WIDE_INT max_niter
2279 = likely_max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
2281 /* Analyze the alignment of the data-refs in the loop.
2282 Fail if a data reference is found that cannot be vectorized. */
2284 ok = vect_analyze_data_refs_alignment (loop_vinfo);
2285 if (!ok)
2287 if (dump_enabled_p ())
2288 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2289 "bad data alignment.\n");
2290 return false;
2293 /* Prune the list of ddrs to be tested at run-time by versioning for alias.
2294 It is important to call pruning after vect_analyze_data_ref_accesses,
2295 since we use grouping information gathered by interleaving analysis. */
2296 ok = vect_prune_runtime_alias_test_list (loop_vinfo);
2297 if (!ok)
2298 return false;
2300 /* Do not invoke vect_enhance_data_refs_alignment for epilogue
2301 vectorization. */
2302 if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2304 /* This pass will decide on using loop versioning and/or loop peeling in
2305 order to enhance the alignment of data references in the loop. */
2306 ok = vect_enhance_data_refs_alignment (loop_vinfo);
2307 if (!ok)
2309 if (dump_enabled_p ())
2310 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2311 "bad data alignment.\n");
2312 return false;
2316 if (slp)
2318 /* Analyze operations in the SLP instances. Note this may
2319 remove unsupported SLP instances which makes the above
2320 SLP kind detection invalid. */
2321 unsigned old_size = LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length ();
2322 vect_slp_analyze_operations (loop_vinfo);
2323 if (LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length () != old_size)
2324 goto again;
2327 /* Scan all the remaining operations in the loop that are not subject
2328 to SLP and make sure they are vectorizable. */
2329 ok = vect_analyze_loop_operations (loop_vinfo);
2330 if (!ok)
2332 if (dump_enabled_p ())
2333 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2334 "bad operation or unsupported loop bound.\n");
2335 return false;
2338 /* Decide whether to use a fully-masked loop for this vectorization
2339 factor. */
2340 LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
2341 = (LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo)
2342 && vect_verify_full_masking (loop_vinfo));
2343 if (dump_enabled_p ())
2345 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
2346 dump_printf_loc (MSG_NOTE, vect_location,
2347 "using a fully-masked loop.\n");
2348 else
2349 dump_printf_loc (MSG_NOTE, vect_location,
2350 "not using a fully-masked loop.\n");
2353 /* If an epilogue loop is required because of data accesses with gaps,
2354 one additional iteration needs to be peeled. Check if there are
2355 enough iterations for vectorization. */
2356 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2357 && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2358 && !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
2360 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2361 tree scalar_niters = LOOP_VINFO_NITERSM1 (loop_vinfo);
2363 if (known_lt (wi::to_widest (scalar_niters), vf))
2365 if (dump_enabled_p ())
2366 dump_printf_loc (MSG_NOTE, vect_location,
2367 "loop has no enough iterations to support"
2368 " peeling for gaps.\n");
2369 return false;
2373 /* Check that the cost of the loop makes vectorizing worthwhile. */
2374 res = vect_analyze_loop_costing (loop_vinfo);
2375 if (res < 0)
2376 goto again;
2377 if (!res)
2379 if (dump_enabled_p ())
2380 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2381 "Loop costings not worthwhile.\n");
2382 return false;
2385 /* Decide whether we need to create an epilogue loop to handle
2386 remaining scalar iterations. */
2387 th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
2389 unsigned HOST_WIDE_INT const_vf;
2390 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
2391 /* The main loop handles all iterations. */
2392 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
2393 else if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2394 && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) > 0)
2396 if (!multiple_p (LOOP_VINFO_INT_NITERS (loop_vinfo)
2397 - LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo),
2398 LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
2399 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
2401 else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
2402 || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&const_vf)
2403 || ((tree_ctz (LOOP_VINFO_NITERS (loop_vinfo))
2404 < (unsigned) exact_log2 (const_vf))
2405 /* In case of versioning, check if the maximum number of
2406 iterations is greater than th. If they are identical,
2407 the epilogue is unnecessary. */
2408 && (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
2409 || ((unsigned HOST_WIDE_INT) max_niter
2410 > (th / const_vf) * const_vf))))
2411 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
2413 /* If an epilogue loop is required make sure we can create one. */
2414 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2415 || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo))
2417 if (dump_enabled_p ())
2418 dump_printf_loc (MSG_NOTE, vect_location, "epilog loop required\n");
2419 if (!vect_can_advance_ivs_p (loop_vinfo)
2420 || !slpeel_can_duplicate_loop_p (LOOP_VINFO_LOOP (loop_vinfo),
2421 single_exit (LOOP_VINFO_LOOP
2422 (loop_vinfo))))
2424 if (dump_enabled_p ())
2425 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2426 "not vectorized: can't create required "
2427 "epilog loop\n");
2428 goto again;
2432 /* During peeling, we need to check if the number of loop iterations is
2433 enough for both the peeled prologue loop and the vector loop. This
2434 check can be merged with the threshold check of loop versioning, so
2435 increase the threshold for this case if necessary. */
2436 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
2438 poly_uint64 niters_th = 0;
2440 if (!vect_use_loop_mask_for_alignment_p (loop_vinfo))
2442 /* Niters for peeled prolog loop. */
2443 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
2445 struct data_reference *dr = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
2446 tree vectype
2447 = STMT_VINFO_VECTYPE (vinfo_for_stmt (DR_STMT (dr)));
2448 niters_th += TYPE_VECTOR_SUBPARTS (vectype) - 1;
2450 else
2451 niters_th += LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
2454 /* Niters for at least one iteration of vectorized loop. */
2455 if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
2456 niters_th += LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2457 /* One additional iteration because of peeling for gap. */
2458 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
2459 niters_th += 1;
2460 LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = niters_th;
2463 gcc_assert (known_eq (vectorization_factor,
2464 LOOP_VINFO_VECT_FACTOR (loop_vinfo)));
2466 /* Ok to vectorize! */
2467 return true;
2469 again:
2470 /* Try again with SLP forced off but if we didn't do any SLP there is
2471 no point in re-trying. */
2472 if (!slp)
2473 return false;
2475 /* If there are reduction chains re-trying will fail anyway. */
2476 if (! LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).is_empty ())
2477 return false;
2479 /* Likewise if the grouped loads or stores in the SLP cannot be handled
2480 via interleaving or lane instructions. */
2481 slp_instance instance;
2482 slp_tree node;
2483 unsigned i, j;
2484 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
2486 stmt_vec_info vinfo;
2487 vinfo = vinfo_for_stmt
2488 (SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (instance))[0]);
2489 if (! STMT_VINFO_GROUPED_ACCESS (vinfo))
2490 continue;
2491 vinfo = vinfo_for_stmt (STMT_VINFO_GROUP_FIRST_ELEMENT (vinfo));
2492 unsigned int size = STMT_VINFO_GROUP_SIZE (vinfo);
2493 tree vectype = STMT_VINFO_VECTYPE (vinfo);
2494 if (! vect_store_lanes_supported (vectype, size, false)
2495 && ! vect_grouped_store_supported (vectype, size))
2496 return false;
2497 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), j, node)
2499 vinfo = vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (node)[0]);
2500 vinfo = vinfo_for_stmt (STMT_VINFO_GROUP_FIRST_ELEMENT (vinfo));
2501 bool single_element_p = !STMT_VINFO_GROUP_NEXT_ELEMENT (vinfo);
2502 size = STMT_VINFO_GROUP_SIZE (vinfo);
2503 vectype = STMT_VINFO_VECTYPE (vinfo);
2504 if (! vect_load_lanes_supported (vectype, size, false)
2505 && ! vect_grouped_load_supported (vectype, single_element_p,
2506 size))
2507 return false;
2511 if (dump_enabled_p ())
2512 dump_printf_loc (MSG_NOTE, vect_location,
2513 "re-trying with SLP disabled\n");
2515 /* Roll back state appropriately. No SLP this time. */
2516 slp = false;
2517 /* Restore the vectorization factor as it was without SLP. */
2518 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = saved_vectorization_factor;
2519 /* Free the SLP instances. */
2520 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), j, instance)
2521 vect_free_slp_instance (instance);
2522 LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
2523 /* Reset SLP type to loop_vect on all stmts. */
2524 for (i = 0; i < LOOP_VINFO_LOOP (loop_vinfo)->num_nodes; ++i)
2526 basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
2527 for (gimple_stmt_iterator si = gsi_start_phis (bb);
2528 !gsi_end_p (si); gsi_next (&si))
2530 stmt_vec_info stmt_info = vinfo_for_stmt (gsi_stmt (si));
2531 STMT_SLP_TYPE (stmt_info) = loop_vect;
2533 for (gimple_stmt_iterator si = gsi_start_bb (bb);
2534 !gsi_end_p (si); gsi_next (&si))
2536 stmt_vec_info stmt_info = vinfo_for_stmt (gsi_stmt (si));
2537 STMT_SLP_TYPE (stmt_info) = loop_vect;
2538 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
2540 stmt_info = vinfo_for_stmt (STMT_VINFO_RELATED_STMT (stmt_info));
2541 STMT_SLP_TYPE (stmt_info) = loop_vect;
2542 for (gimple_stmt_iterator pi
2543 = gsi_start (STMT_VINFO_PATTERN_DEF_SEQ (stmt_info));
2544 !gsi_end_p (pi); gsi_next (&pi))
2546 gimple *pstmt = gsi_stmt (pi);
2547 STMT_SLP_TYPE (vinfo_for_stmt (pstmt)) = loop_vect;
2552 /* Free optimized alias test DDRS. */
2553 LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).truncate (0);
2554 LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).release ();
2555 LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).release ();
2556 /* Reset target cost data. */
2557 destroy_cost_data (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo));
2558 LOOP_VINFO_TARGET_COST_DATA (loop_vinfo)
2559 = init_cost (LOOP_VINFO_LOOP (loop_vinfo));
2560 /* Reset accumulated rgroup information. */
2561 release_vec_loop_masks (&LOOP_VINFO_MASKS (loop_vinfo));
2562 /* Reset assorted flags. */
2563 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
2564 LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = false;
2565 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = 0;
2566 LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = 0;
2567 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = saved_can_fully_mask_p;
2569 goto start_over;
2572 /* Function vect_analyze_loop.
2574 Apply a set of analyses on LOOP, and create a loop_vec_info struct
2575 for it. The different analyses will record information in the
2576 loop_vec_info struct. If ORIG_LOOP_VINFO is not NULL, then LOOP is
2577 an epilogue loop that must be vectorized. */
2578 loop_vec_info
2579 vect_analyze_loop (struct loop *loop, loop_vec_info orig_loop_vinfo)
2581 loop_vec_info loop_vinfo;
2582 auto_vector_sizes vector_sizes;
2584 /* Autodetect first vector size we try. */
2585 current_vector_size = 0;
2586 targetm.vectorize.autovectorize_vector_sizes (&vector_sizes);
2587 unsigned int next_size = 0;
2589 if (dump_enabled_p ())
2590 dump_printf_loc (MSG_NOTE, vect_location,
2591 "===== analyze_loop_nest =====\n");
2593 if (loop_outer (loop)
2594 && loop_vec_info_for_loop (loop_outer (loop))
2595 && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop))))
2597 if (dump_enabled_p ())
2598 dump_printf_loc (MSG_NOTE, vect_location,
2599 "outer-loop already vectorized.\n");
2600 return NULL;
2603 poly_uint64 autodetected_vector_size = 0;
2604 while (1)
2606 /* Check the CFG characteristics of the loop (nesting, entry/exit). */
2607 loop_vinfo = vect_analyze_loop_form (loop);
2608 if (!loop_vinfo)
2610 if (dump_enabled_p ())
2611 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2612 "bad loop form.\n");
2613 return NULL;
2616 bool fatal = false;
2618 if (orig_loop_vinfo)
2619 LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = orig_loop_vinfo;
2621 if (vect_analyze_loop_2 (loop_vinfo, fatal))
2623 LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1;
2625 return loop_vinfo;
2628 delete loop_vinfo;
2630 if (next_size == 0)
2631 autodetected_vector_size = current_vector_size;
2633 if (next_size < vector_sizes.length ()
2634 && known_eq (vector_sizes[next_size], autodetected_vector_size))
2635 next_size += 1;
2637 if (fatal
2638 || next_size == vector_sizes.length ()
2639 || known_eq (current_vector_size, 0U))
2640 return NULL;
2642 /* Try the next biggest vector size. */
2643 current_vector_size = vector_sizes[next_size++];
2644 if (dump_enabled_p ())
2646 dump_printf_loc (MSG_NOTE, vect_location,
2647 "***** Re-trying analysis with "
2648 "vector size ");
2649 dump_dec (MSG_NOTE, current_vector_size);
2650 dump_printf (MSG_NOTE, "\n");
2655 /* Return true if there is an in-order reduction function for CODE, storing
2656 it in *REDUC_FN if so. */
2658 static bool
2659 fold_left_reduction_fn (tree_code code, internal_fn *reduc_fn)
2661 switch (code)
2663 case PLUS_EXPR:
2664 *reduc_fn = IFN_FOLD_LEFT_PLUS;
2665 return true;
2667 default:
2668 return false;
2672 /* Function reduction_fn_for_scalar_code
2674 Input:
2675 CODE - tree_code of a reduction operation.
2677 Output:
2678 REDUC_FN - the corresponding internal function to be used to reduce the
2679 vector of partial results into a single scalar result, or IFN_LAST
2680 if the operation is a supported reduction operation, but does not have
2681 such an internal function.
2683 Return FALSE if CODE currently cannot be vectorized as a reduction. */
2685 static bool
2686 reduction_fn_for_scalar_code (enum tree_code code, internal_fn *reduc_fn)
2688 switch (code)
2690 case MAX_EXPR:
2691 *reduc_fn = IFN_REDUC_MAX;
2692 return true;
2694 case MIN_EXPR:
2695 *reduc_fn = IFN_REDUC_MIN;
2696 return true;
2698 case PLUS_EXPR:
2699 *reduc_fn = IFN_REDUC_PLUS;
2700 return true;
2702 case BIT_AND_EXPR:
2703 *reduc_fn = IFN_REDUC_AND;
2704 return true;
2706 case BIT_IOR_EXPR:
2707 *reduc_fn = IFN_REDUC_IOR;
2708 return true;
2710 case BIT_XOR_EXPR:
2711 *reduc_fn = IFN_REDUC_XOR;
2712 return true;
2714 case MULT_EXPR:
2715 case MINUS_EXPR:
2716 *reduc_fn = IFN_LAST;
2717 return true;
2719 default:
2720 return false;
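/* Illustrative sketch, not part of this pass: what the reduction epilogue
   selected through *REDUC_FN conceptually does, written out for a
   four-lane IFN_REDUC_MAX in plain C.  The function name and the fixed
   lane count are made up for illustration.  */

static int
example_reduc_max_epilogue (const int lanes[4])
{
  /* During the vector loop each lane keeps its own partial maximum;
     the reduction function then folds the lanes into one scalar.  */
  int m = lanes[0];
  for (int lane = 1; lane < 4; lane++)
    if (lanes[lane] > m)
      m = lanes[lane];
  return m;
}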
2724 /* If there is a neutral value X such that SLP reduction NODE would not
2725 be affected by the introduction of additional X elements, return that X,
2726 otherwise return null. CODE is the code of the reduction. REDUC_CHAIN
2727 is true if the SLP statements perform a single reduction, false if each
2728 statement performs an independent reduction. */
2730 static tree
2731 neutral_op_for_slp_reduction (slp_tree slp_node, tree_code code,
2732 bool reduc_chain)
2734 vec<gimple *> stmts = SLP_TREE_SCALAR_STMTS (slp_node);
2735 gimple *stmt = stmts[0];
2736 stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
2737 tree vector_type = STMT_VINFO_VECTYPE (stmt_vinfo);
2738 tree scalar_type = TREE_TYPE (vector_type);
2739 struct loop *loop = gimple_bb (stmt)->loop_father;
2740 gcc_assert (loop);
2742 switch (code)
2744 case WIDEN_SUM_EXPR:
2745 case DOT_PROD_EXPR:
2746 case SAD_EXPR:
2747 case PLUS_EXPR:
2748 case MINUS_EXPR:
2749 case BIT_IOR_EXPR:
2750 case BIT_XOR_EXPR:
2751 return build_zero_cst (scalar_type);
2753 case MULT_EXPR:
2754 return build_one_cst (scalar_type);
2756 case BIT_AND_EXPR:
2757 return build_all_ones_cst (scalar_type);
2759 case MAX_EXPR:
2760 case MIN_EXPR:
2761 /* For MIN/MAX the initial values are neutral. A reduction chain
2762 has only a single initial value, so that value is neutral for
2763 all statements. */
2764 if (reduc_chain)
2765 return PHI_ARG_DEF_FROM_EDGE (stmt, loop_preheader_edge (loop));
2766 return NULL_TREE;
2768 default:
2769 return NULL_TREE;
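/* Illustrative sketch, not part of this pass: why the constants chosen
   above are "neutral".  Padding a reduction with extra neutral elements,
   e.g. to fill the last vector, leaves the result unchanged.  All names
   below are made up for illustration.  */

static int
example_neutral_padding (const int *a, int n, int padded_n)
{
  int sum = 0;
  int prod = 1;
  int mask = -1;                /* All-ones, for a bitwise AND.  */
  for (int i = 0; i < padded_n; i++)
    {
      /* Elements at and beyond N stand for the padding lanes.  */
      sum  += i < n ? a[i] : 0;     /* 0 is neutral for PLUS_EXPR.  */
      prod *= i < n ? a[i] : 1;     /* 1 is neutral for MULT_EXPR.  */
      mask &= i < n ? a[i] : -1;    /* ~0 is neutral for BIT_AND_EXPR.  */
    }
  /* Each of sum, prod and mask equals the reduction over a[0..n-1] alone.  */
  return sum ^ prod ^ mask;
}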
2773 /* Error reporting helper for vect_is_simple_reduction below. GIMPLE statement
2774 STMT is printed with a message MSG. */
2776 static void
2777 report_vect_op (dump_flags_t msg_type, gimple *stmt, const char *msg)
2779 dump_printf_loc (msg_type, vect_location, "%s", msg);
2780 dump_gimple_stmt (msg_type, TDF_SLIM, stmt, 0);
2784 /* Detect SLP reduction of the form:
2786 #a1 = phi <a5, a0>
2787 a2 = operation (a1)
2788 a3 = operation (a2)
2789 a4 = operation (a3)
2790 a5 = operation (a4)
2792 #a = phi <a5>
2794 PHI is the reduction phi node (#a1 = phi <a5, a0> above)
2795 FIRST_STMT is the first reduction stmt in the chain
2796 (a2 = operation (a1)).
2798 Return TRUE if a reduction chain was detected. */
2800 static bool
2801 vect_is_slp_reduction (loop_vec_info loop_info, gimple *phi,
2802 gimple *first_stmt)
2804 struct loop *loop = (gimple_bb (phi))->loop_father;
2805 struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
2806 enum tree_code code;
2807 gimple *current_stmt = NULL, *loop_use_stmt = NULL, *first, *next_stmt;
2808 stmt_vec_info use_stmt_info, current_stmt_info;
2809 tree lhs;
2810 imm_use_iterator imm_iter;
2811 use_operand_p use_p;
2812 int nloop_uses, size = 0, n_out_of_loop_uses;
2813 bool found = false;
2815 if (loop != vect_loop)
2816 return false;
2818 lhs = PHI_RESULT (phi);
2819 code = gimple_assign_rhs_code (first_stmt);
2820 while (1)
2822 nloop_uses = 0;
2823 n_out_of_loop_uses = 0;
2824 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
2826 gimple *use_stmt = USE_STMT (use_p);
2827 if (is_gimple_debug (use_stmt))
2828 continue;
2830 /* Check if we got back to the reduction phi. */
2831 if (use_stmt == phi)
2833 loop_use_stmt = use_stmt;
2834 found = true;
2835 break;
2838 if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2840 loop_use_stmt = use_stmt;
2841 nloop_uses++;
2843 else
2844 n_out_of_loop_uses++;
2846 /* There can be either a single use in the loop or two uses in
2847 phi nodes. */
2848 if (nloop_uses > 1 || (n_out_of_loop_uses && nloop_uses))
2849 return false;
2852 if (found)
2853 break;
2855 /* We reached a statement with no loop uses. */
2856 if (nloop_uses == 0)
2857 return false;
2859 /* This is a loop exit phi, and we haven't reached the reduction phi. */
2860 if (gimple_code (loop_use_stmt) == GIMPLE_PHI)
2861 return false;
2863 if (!is_gimple_assign (loop_use_stmt)
2864 || code != gimple_assign_rhs_code (loop_use_stmt)
2865 || !flow_bb_inside_loop_p (loop, gimple_bb (loop_use_stmt)))
2866 return false;
2868 /* Insert USE_STMT into reduction chain. */
2869 use_stmt_info = vinfo_for_stmt (loop_use_stmt);
2870 if (current_stmt)
2872 current_stmt_info = vinfo_for_stmt (current_stmt);
2873 GROUP_NEXT_ELEMENT (current_stmt_info) = loop_use_stmt;
2874 GROUP_FIRST_ELEMENT (use_stmt_info)
2875 = GROUP_FIRST_ELEMENT (current_stmt_info);
2877 else
2878 GROUP_FIRST_ELEMENT (use_stmt_info) = loop_use_stmt;
2880 lhs = gimple_assign_lhs (loop_use_stmt);
2881 current_stmt = loop_use_stmt;
2882 size++;
2885 if (!found || loop_use_stmt != phi || size < 2)
2886 return false;
2888 /* Swap the operands, if needed, to make the reduction operand be the second
2889 operand. */
2890 lhs = PHI_RESULT (phi);
2891 next_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (current_stmt));
2892 while (next_stmt)
2894 if (gimple_assign_rhs2 (next_stmt) == lhs)
2896 tree op = gimple_assign_rhs1 (next_stmt);
2897 gimple *def_stmt = NULL;
2899 if (TREE_CODE (op) == SSA_NAME)
2900 def_stmt = SSA_NAME_DEF_STMT (op);
2902 /* Check that the other def is either defined in the loop
2903 ("vect_internal_def"), or it's an induction (defined by a
2904 loop-header phi-node). */
2905 if (def_stmt
2906 && gimple_bb (def_stmt)
2907 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2908 && (is_gimple_assign (def_stmt)
2909 || is_gimple_call (def_stmt)
2910 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2911 == vect_induction_def
2912 || (gimple_code (def_stmt) == GIMPLE_PHI
2913 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2914 == vect_internal_def
2915 && !is_loop_header_bb_p (gimple_bb (def_stmt)))))
2917 lhs = gimple_assign_lhs (next_stmt);
2918 next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
2919 continue;
2922 return false;
2924 else
2926 tree op = gimple_assign_rhs2 (next_stmt);
2927 gimple *def_stmt = NULL;
2929 if (TREE_CODE (op) == SSA_NAME)
2930 def_stmt = SSA_NAME_DEF_STMT (op);
2932 /* Check that the other def is either defined in the loop
2933 ("vect_internal_def"), or it's an induction (defined by a
2934 loop-header phi-node). */
2935 if (def_stmt
2936 && gimple_bb (def_stmt)
2937 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2938 && (is_gimple_assign (def_stmt)
2939 || is_gimple_call (def_stmt)
2940 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2941 == vect_induction_def
2942 || (gimple_code (def_stmt) == GIMPLE_PHI
2943 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2944 == vect_internal_def
2945 && !is_loop_header_bb_p (gimple_bb (def_stmt)))))
2947 if (dump_enabled_p ())
2949 dump_printf_loc (MSG_NOTE, vect_location, "swapping oprnds: ");
2950 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, next_stmt, 0);
2953 swap_ssa_operands (next_stmt,
2954 gimple_assign_rhs1_ptr (next_stmt),
2955 gimple_assign_rhs2_ptr (next_stmt));
2956 update_stmt (next_stmt);
2958 if (CONSTANT_CLASS_P (gimple_assign_rhs1 (next_stmt)))
2959 LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true;
2961 else
2962 return false;
2965 lhs = gimple_assign_lhs (next_stmt);
2966 next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
2969 /* Save the chain for further analysis in SLP detection. */
2970 first = GROUP_FIRST_ELEMENT (vinfo_for_stmt (current_stmt));
2971 LOOP_VINFO_REDUCTION_CHAINS (loop_info).safe_push (first);
2972 GROUP_SIZE (vinfo_for_stmt (first)) = size;
2974 return true;
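/* Illustrative sketch, not part of this pass: a source loop whose body
   becomes the chain of same-code statements matched above, each statement
   feeding the next and the last value flowing back through the reduction
   phi.  The names are made up for illustration.  */

static int
example_reduction_chain (const int *a, const int *b,
                         const int *c, const int *d, int n)
{
  int s = 0;                    /* #a1 = phi <a5, a0>  */
  for (int i = 0; i < n; i++)
    {
      s = s + a[i];             /* a2 = operation (a1)  */
      s = s + b[i];             /* a3 = operation (a2)  */
      s = s + c[i];             /* a4 = operation (a3)  */
      s = s + d[i];             /* a5 = operation (a4)  */
    }
  return s;
}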
2977 /* Return true if we need an in-order reduction for operation CODE
2978 on type TYPE. NEED_WRAPPING_INTEGRAL_OVERFLOW is true if integer
2979 overflow must wrap. */
2981 static bool
2982 needs_fold_left_reduction_p (tree type, tree_code code,
2983 bool need_wrapping_integral_overflow)
2985 /* CHECKME: check for !flag_finite_math_only too? */
2986 if (SCALAR_FLOAT_TYPE_P (type))
2987 switch (code)
2989 case MIN_EXPR:
2990 case MAX_EXPR:
2991 return false;
2993 default:
2994 return !flag_associative_math;
2997 if (INTEGRAL_TYPE_P (type))
2999 if (!operation_no_trapping_overflow (type, code))
3000 return true;
3001 if (need_wrapping_integral_overflow
3002 && !TYPE_OVERFLOW_WRAPS (type)
3003 && operation_can_overflow (code))
3004 return true;
3005 return false;
3008 if (SAT_FIXED_POINT_TYPE_P (type))
3009 return true;
3011 return false;
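/* Illustrative sketch, not part of this pass: why a floating-point sum may
   need the in-order (fold-left) handling selected above.  With the values
   below, the left-to-right sum and a reassociated partial-sum order give
   different results, so reassociation is only acceptable under
   -fassociative-math.  The values and the name are made up.  */

static double
example_inorder_vs_reassociated (void)
{
  double a[4] = { 1.0e16, 1.0, -1.0e16, 1.0 };

  /* In order: ((1e16 + 1) + -1e16) + 1 == 1.0, because 1e16 + 1 rounds
     back to 1e16 in double precision.  */
  double inorder = ((a[0] + a[1]) + a[2]) + a[3];

  /* Reassociated into two partial sums, as a two-lane vector reduction
     would do: (1e16 + -1e16) + (1 + 1) == 2.0.  */
  double reassoc = (a[0] + a[2]) + (a[1] + a[3]);

  return reassoc - inorder;     /* 1.0, not 0.0.  */
}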
3014 /* Return true if the reduction PHI in LOOP with latch arg LOOP_ARG and
3015 reduction operation CODE has a handled computation expression. */
3017 bool
3018 check_reduction_path (location_t loc, loop_p loop, gphi *phi, tree loop_arg,
3019 enum tree_code code)
3021 auto_vec<std::pair<ssa_op_iter, use_operand_p> > path;
3022 auto_bitmap visited;
3023 tree lookfor = PHI_RESULT (phi);
3024 ssa_op_iter curri;
3025 use_operand_p curr = op_iter_init_phiuse (&curri, phi, SSA_OP_USE);
3026 while (USE_FROM_PTR (curr) != loop_arg)
3027 curr = op_iter_next_use (&curri);
3028 curri.i = curri.numops;
3031 path.safe_push (std::make_pair (curri, curr));
3032 tree use = USE_FROM_PTR (curr);
3033 if (use == lookfor)
3034 break;
3035 gimple *def = SSA_NAME_DEF_STMT (use);
3036 if (gimple_nop_p (def)
3037 || ! flow_bb_inside_loop_p (loop, gimple_bb (def)))
3039 pop:
3042 std::pair<ssa_op_iter, use_operand_p> x = path.pop ();
3043 curri = x.first;
3044 curr = x.second;
3046 curr = op_iter_next_use (&curri);
3047 /* Skip already visited or non-SSA operands (from iterating
3048 over PHI args). */
3049 while (curr != NULL_USE_OPERAND_P
3050 && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
3051 || ! bitmap_set_bit (visited,
3052 SSA_NAME_VERSION
3053 (USE_FROM_PTR (curr)))));
3055 while (curr == NULL_USE_OPERAND_P && ! path.is_empty ());
3056 if (curr == NULL_USE_OPERAND_P)
3057 break;
3059 else
3061 if (gimple_code (def) == GIMPLE_PHI)
3062 curr = op_iter_init_phiuse (&curri, as_a <gphi *>(def), SSA_OP_USE);
3063 else
3064 curr = op_iter_init_use (&curri, def, SSA_OP_USE);
3065 while (curr != NULL_USE_OPERAND_P
3066 && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
3067 || ! bitmap_set_bit (visited,
3068 SSA_NAME_VERSION
3069 (USE_FROM_PTR (curr)))))
3070 curr = op_iter_next_use (&curri);
3071 if (curr == NULL_USE_OPERAND_P)
3072 goto pop;
3075 while (1);
3076 if (dump_file && (dump_flags & TDF_DETAILS))
3078 dump_printf_loc (MSG_NOTE, loc, "reduction path: ");
3079 unsigned i;
3080 std::pair<ssa_op_iter, use_operand_p> *x;
3081 FOR_EACH_VEC_ELT (path, i, x)
3083 dump_generic_expr (MSG_NOTE, TDF_SLIM, USE_FROM_PTR (x->second));
3084 dump_printf (MSG_NOTE, " ");
3086 dump_printf (MSG_NOTE, "\n");
3089 /* Check whether the reduction path detected is valid. */
3090 bool fail = path.length () == 0;
3091 bool neg = false;
3092 for (unsigned i = 1; i < path.length (); ++i)
3094 gimple *use_stmt = USE_STMT (path[i].second);
3095 tree op = USE_FROM_PTR (path[i].second);
3096 if (! has_single_use (op)
3097 || ! is_gimple_assign (use_stmt))
3099 fail = true;
3100 break;
3102 if (gimple_assign_rhs_code (use_stmt) != code)
3104 if (code == PLUS_EXPR
3105 && gimple_assign_rhs_code (use_stmt) == MINUS_EXPR)
3107 /* Track whether we negate the reduction value each iteration. */
3108 if (gimple_assign_rhs2 (use_stmt) == op)
3109 neg = ! neg;
3111 else
3113 fail = true;
3114 break;
3118 return ! fail && ! neg;
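/* Illustrative sketch, not part of this pass: the "neg" tracking above for
   a PLUS_EXPR reduction.  MINUS_EXPR statements are allowed on the path as
   long as the reduction value itself is not negated on its way around the
   loop.  The names are made up for illustration.  */

static int
example_negation_tracking (const int *a, int n, int *rejected)
{
  int ok = 0;
  int flipped = 0;
  for (int i = 0; i < n; i++)
    {
      /* The reduction value is the first operand of the subtraction, so
         it is not negated; this is equivalent to ok + (-a[i]).  */
      ok = ok - a[i];

      /* Here the reduction value is the second operand and so is negated
         every iteration; the path check reports this as "neg" and the
         cycle is not treated as a reduction.  */
      flipped = a[i] - flipped;
    }
  *rejected = flipped;
  return ok;
}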
3122 /* Function vect_is_simple_reduction
3124 (1) Detect a cross-iteration def-use cycle that represents a simple
3125 reduction computation. We look for the following pattern:
3127 loop_header:
3128 a1 = phi < a0, a2 >
3129 a3 = ...
3130 a2 = operation (a3, a1)
3134 a3 = ...
3135 loop_header:
3136 a1 = phi < a0, a2 >
3137 a2 = operation (a3, a1)
3139 such that:
3140 1. operation is commutative and associative and it is safe to
3141 change the order of the computation
3142 2. no uses for a2 in the loop (a2 is used out of the loop)
3143 3. no uses of a1 in the loop besides the reduction operation
3144 4. no uses of a1 outside the loop.
3146 Conditions 1,4 are tested here.
3147 Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.
3149 (2) Detect a cross-iteration def-use cycle in nested loops, i.e.,
3150 nested cycles.
3152 (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double
3153 reductions:
3155 a1 = phi < a0, a2 >
3156 inner loop (def of a3)
3157 a2 = phi < a3 >
3159 (4) Detect condition expressions, i.e.:
3160 for (int i = 0; i < N; i++)
3161 if (a[i] < val)
3162 ret_val = a[i];
3166 static gimple *
3167 vect_is_simple_reduction (loop_vec_info loop_info, gimple *phi,
3168 bool *double_reduc,
3169 bool need_wrapping_integral_overflow,
3170 enum vect_reduction_type *v_reduc_type)
3172 struct loop *loop = (gimple_bb (phi))->loop_father;
3173 struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
3174 gimple *def_stmt, *def1 = NULL, *def2 = NULL, *phi_use_stmt = NULL;
3175 enum tree_code orig_code, code;
3176 tree op1, op2, op3 = NULL_TREE, op4 = NULL_TREE;
3177 tree type;
3178 int nloop_uses;
3179 tree name;
3180 imm_use_iterator imm_iter;
3181 use_operand_p use_p;
3182 bool phi_def;
3184 *double_reduc = false;
3185 *v_reduc_type = TREE_CODE_REDUCTION;
3187 tree phi_name = PHI_RESULT (phi);
3188 /* ??? If there are no uses of the PHI result the inner loop reduction
3189 won't be detected as possibly double-reduction by vectorizable_reduction
3190 because that tries to walk the PHI arg from the preheader edge which
3191 can be constant. See PR60382. */
3192 if (has_zero_uses (phi_name))
3193 return NULL;
3194 nloop_uses = 0;
3195 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, phi_name)
3197 gimple *use_stmt = USE_STMT (use_p);
3198 if (is_gimple_debug (use_stmt))
3199 continue;
3201 if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
3203 if (dump_enabled_p ())
3204 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3205 "intermediate value used outside loop.\n");
3207 return NULL;
3210 nloop_uses++;
3211 if (nloop_uses > 1)
3213 if (dump_enabled_p ())
3214 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3215 "reduction value used in loop.\n");
3216 return NULL;
3219 phi_use_stmt = use_stmt;
3222 edge latch_e = loop_latch_edge (loop);
3223 tree loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
3224 if (TREE_CODE (loop_arg) != SSA_NAME)
3226 if (dump_enabled_p ())
3228 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3229 "reduction: not ssa_name: ");
3230 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, loop_arg);
3231 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3233 return NULL;
3236 def_stmt = SSA_NAME_DEF_STMT (loop_arg);
3237 if (is_gimple_assign (def_stmt))
3239 name = gimple_assign_lhs (def_stmt);
3240 phi_def = false;
3242 else if (gimple_code (def_stmt) == GIMPLE_PHI)
3244 name = PHI_RESULT (def_stmt);
3245 phi_def = true;
3247 else
3249 if (dump_enabled_p ())
3251 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3252 "reduction: unhandled reduction operation: ");
3253 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, def_stmt, 0);
3255 return NULL;
3258 if (! flow_bb_inside_loop_p (loop, gimple_bb (def_stmt)))
3259 return NULL;
3261 nloop_uses = 0;
3262 auto_vec<gphi *, 3> lcphis;
3263 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name)
3265 gimple *use_stmt = USE_STMT (use_p);
3266 if (is_gimple_debug (use_stmt))
3267 continue;
3268 if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
3269 nloop_uses++;
3270 else
3271 /* We can have more than one loop-closed PHI. */
3272 lcphis.safe_push (as_a <gphi *> (use_stmt));
3273 if (nloop_uses > 1)
3275 if (dump_enabled_p ())
3276 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3277 "reduction used in loop.\n");
3278 return NULL;
3282 /* If DEF_STMT is a phi node itself, we expect it to have a single argument
3283 defined in the inner loop. */
3284 if (phi_def)
3286 op1 = PHI_ARG_DEF (def_stmt, 0);
3288 if (gimple_phi_num_args (def_stmt) != 1
3289 || TREE_CODE (op1) != SSA_NAME)
3291 if (dump_enabled_p ())
3292 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3293 "unsupported phi node definition.\n");
3295 return NULL;
3298 def1 = SSA_NAME_DEF_STMT (op1);
3299 if (gimple_bb (def1)
3300 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
3301 && loop->inner
3302 && flow_bb_inside_loop_p (loop->inner, gimple_bb (def1))
3303 && is_gimple_assign (def1)
3304 && flow_bb_inside_loop_p (loop->inner, gimple_bb (phi_use_stmt)))
3306 if (dump_enabled_p ())
3307 report_vect_op (MSG_NOTE, def_stmt,
3308 "detected double reduction: ");
3310 *double_reduc = true;
3311 return def_stmt;
3314 return NULL;
3317 /* If we are vectorizing an inner reduction, we execute it in the
3318 original order only when we are not dealing with a double
3319 reduction. */
3320 bool check_reduction = true;
3321 if (flow_loop_nested_p (vect_loop, loop))
3323 gphi *lcphi;
3324 unsigned i;
3325 check_reduction = false;
3326 FOR_EACH_VEC_ELT (lcphis, i, lcphi)
3327 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, gimple_phi_result (lcphi))
3329 gimple *use_stmt = USE_STMT (use_p);
3330 if (is_gimple_debug (use_stmt))
3331 continue;
3332 if (! flow_bb_inside_loop_p (vect_loop, gimple_bb (use_stmt)))
3333 check_reduction = true;
3337 bool nested_in_vect_loop = flow_loop_nested_p (vect_loop, loop);
3338 code = orig_code = gimple_assign_rhs_code (def_stmt);
3340 /* We can handle "res -= x[i]", which is non-associative, by
3341 simply rewriting it into "res += -x[i]". Avoid changing the
3342 gimple instruction during the first simple tests and only do this
3343 if we're allowed to change code at all. */
3344 if (code == MINUS_EXPR && gimple_assign_rhs2 (def_stmt) != phi_name)
3345 code = PLUS_EXPR;
3347 if (code == COND_EXPR)
3349 if (! nested_in_vect_loop)
3350 *v_reduc_type = COND_REDUCTION;
3352 op3 = gimple_assign_rhs1 (def_stmt);
3353 if (COMPARISON_CLASS_P (op3))
3355 op4 = TREE_OPERAND (op3, 1);
3356 op3 = TREE_OPERAND (op3, 0);
3358 if (op3 == phi_name || op4 == phi_name)
3360 if (dump_enabled_p ())
3361 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3362 "reduction: condition depends on previous"
3363 " iteration: ");
3364 return NULL;
3367 op1 = gimple_assign_rhs2 (def_stmt);
3368 op2 = gimple_assign_rhs3 (def_stmt);
3370 else if (!commutative_tree_code (code) || !associative_tree_code (code))
3372 if (dump_enabled_p ())
3373 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3374 "reduction: not commutative/associative: ");
3375 return NULL;
3377 else if (get_gimple_rhs_class (code) == GIMPLE_BINARY_RHS)
3379 op1 = gimple_assign_rhs1 (def_stmt);
3380 op2 = gimple_assign_rhs2 (def_stmt);
3382 else
3384 if (dump_enabled_p ())
3385 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3386 "reduction: not handled operation: ");
3387 return NULL;
3390 if (TREE_CODE (op1) != SSA_NAME && TREE_CODE (op2) != SSA_NAME)
3392 if (dump_enabled_p ())
3393 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3394 "reduction: both uses not ssa_names: ");
3396 return NULL;
3399 type = TREE_TYPE (gimple_assign_lhs (def_stmt));
3400 if ((TREE_CODE (op1) == SSA_NAME
3401 && !types_compatible_p (type,TREE_TYPE (op1)))
3402 || (TREE_CODE (op2) == SSA_NAME
3403 && !types_compatible_p (type, TREE_TYPE (op2)))
3404 || (op3 && TREE_CODE (op3) == SSA_NAME
3405 && !types_compatible_p (type, TREE_TYPE (op3)))
3406 || (op4 && TREE_CODE (op4) == SSA_NAME
3407 && !types_compatible_p (type, TREE_TYPE (op4))))
3409 if (dump_enabled_p ())
3411 dump_printf_loc (MSG_NOTE, vect_location,
3412 "reduction: multiple types: operation type: ");
3413 dump_generic_expr (MSG_NOTE, TDF_SLIM, type);
3414 dump_printf (MSG_NOTE, ", operands types: ");
3415 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3416 TREE_TYPE (op1));
3417 dump_printf (MSG_NOTE, ",");
3418 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3419 TREE_TYPE (op2));
3420 if (op3)
3422 dump_printf (MSG_NOTE, ",");
3423 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3424 TREE_TYPE (op3));
3427 if (op4)
3429 dump_printf (MSG_NOTE, ",");
3430 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3431 TREE_TYPE (op4));
3433 dump_printf (MSG_NOTE, "\n");
3436 return NULL;
3439 /* Check whether it's ok to change the order of the computation.
3440 Generally, when vectorizing a reduction we change the order of the
3441 computation. This may change the behavior of the program in some
3442 cases, so we need to check that this is ok. One exception is when
3443 vectorizing an outer-loop: the inner-loop is executed sequentially,
3444 and therefore vectorizing reductions in the inner-loop during
3445 outer-loop vectorization is safe. */
3446 if (check_reduction
3447 && *v_reduc_type == TREE_CODE_REDUCTION
3448 && needs_fold_left_reduction_p (type, code,
3449 need_wrapping_integral_overflow))
3450 *v_reduc_type = FOLD_LEFT_REDUCTION;
3452 /* Reduction is safe. We're dealing with one of the following:
3453 1) integer arithmetic and no trapv
3454 2) floating point arithmetic, and special flags permit this optimization
3455 3) nested cycle (i.e., outer loop vectorization). */
3456 if (TREE_CODE (op1) == SSA_NAME)
3457 def1 = SSA_NAME_DEF_STMT (op1);
3459 if (TREE_CODE (op2) == SSA_NAME)
3460 def2 = SSA_NAME_DEF_STMT (op2);
3462 if (code != COND_EXPR
3463 && ((!def1 || gimple_nop_p (def1)) && (!def2 || gimple_nop_p (def2))))
3465 if (dump_enabled_p ())
3466 report_vect_op (MSG_NOTE, def_stmt, "reduction: no defs for operands: ");
3467 return NULL;
3470 /* Check that one def is the reduction def, defined by PHI,
3471 the other def is either defined in the loop ("vect_internal_def"),
3472 or it's an induction (defined by a loop-header phi-node). */
3474 if (def2 && def2 == phi
3475 && (code == COND_EXPR
3476 || !def1 || gimple_nop_p (def1)
3477 || !flow_bb_inside_loop_p (loop, gimple_bb (def1))
3478 || (def1 && flow_bb_inside_loop_p (loop, gimple_bb (def1))
3479 && (is_gimple_assign (def1)
3480 || is_gimple_call (def1)
3481 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1))
3482 == vect_induction_def
3483 || (gimple_code (def1) == GIMPLE_PHI
3484 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1))
3485 == vect_internal_def
3486 && !is_loop_header_bb_p (gimple_bb (def1)))))))
3488 if (dump_enabled_p ())
3489 report_vect_op (MSG_NOTE, def_stmt, "detected reduction: ");
3490 return def_stmt;
3493 if (def1 && def1 == phi
3494 && (code == COND_EXPR
3495 || !def2 || gimple_nop_p (def2)
3496 || !flow_bb_inside_loop_p (loop, gimple_bb (def2))
3497 || (def2 && flow_bb_inside_loop_p (loop, gimple_bb (def2))
3498 && (is_gimple_assign (def2)
3499 || is_gimple_call (def2)
3500 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2))
3501 == vect_induction_def
3502 || (gimple_code (def2) == GIMPLE_PHI
3503 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2))
3504 == vect_internal_def
3505 && !is_loop_header_bb_p (gimple_bb (def2)))))))
3507 if (! nested_in_vect_loop && orig_code != MINUS_EXPR)
3509 /* Check if we can swap operands (just for simplicity - so that
3510 the rest of the code can assume that the reduction variable
3511 is always the last (second) argument). */
3512 if (code == COND_EXPR)
3514 /* Swap cond_expr by inverting the condition. */
3515 tree cond_expr = gimple_assign_rhs1 (def_stmt);
3516 enum tree_code invert_code = ERROR_MARK;
3517 enum tree_code cond_code = TREE_CODE (cond_expr);
3519 if (TREE_CODE_CLASS (cond_code) == tcc_comparison)
3521 bool honor_nans = HONOR_NANS (TREE_OPERAND (cond_expr, 0));
3522 invert_code = invert_tree_comparison (cond_code, honor_nans);
3524 if (invert_code != ERROR_MARK)
3526 TREE_SET_CODE (cond_expr, invert_code);
3527 swap_ssa_operands (def_stmt,
3528 gimple_assign_rhs2_ptr (def_stmt),
3529 gimple_assign_rhs3_ptr (def_stmt));
3531 else
3533 if (dump_enabled_p ())
3534 report_vect_op (MSG_NOTE, def_stmt,
3535 "detected reduction: cannot swap operands "
3536 "for cond_expr");
3537 return NULL;
3540 else
3541 swap_ssa_operands (def_stmt, gimple_assign_rhs1_ptr (def_stmt),
3542 gimple_assign_rhs2_ptr (def_stmt));
3544 if (dump_enabled_p ())
3545 report_vect_op (MSG_NOTE, def_stmt,
3546 "detected reduction: need to swap operands: ");
3548 if (CONSTANT_CLASS_P (gimple_assign_rhs1 (def_stmt)))
3549 LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true;
3551 else
3553 if (dump_enabled_p ())
3554 report_vect_op (MSG_NOTE, def_stmt, "detected reduction: ");
3557 return def_stmt;
3560 /* Try to find SLP reduction chain. */
3561 if (! nested_in_vect_loop
3562 && code != COND_EXPR
3563 && orig_code != MINUS_EXPR
3564 && vect_is_slp_reduction (loop_info, phi, def_stmt))
3566 if (dump_enabled_p ())
3567 report_vect_op (MSG_NOTE, def_stmt,
3568 "reduction: detected reduction chain: ");
3570 return def_stmt;
3573 /* Dissolve any group half-built by vect_is_slp_reduction. */
3574 gimple *first = GROUP_FIRST_ELEMENT (vinfo_for_stmt (def_stmt));
3575 while (first)
3577 gimple *next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (first));
3578 GROUP_FIRST_ELEMENT (vinfo_for_stmt (first)) = NULL;
3579 GROUP_NEXT_ELEMENT (vinfo_for_stmt (first)) = NULL;
3580 first = next;
3583 /* Look for the expression computing loop_arg from loop PHI result. */
3584 if (check_reduction_path (vect_location, loop, as_a <gphi *> (phi), loop_arg,
3585 code))
3586 return def_stmt;
3588 if (dump_enabled_p ())
3590 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3591 "reduction: unknown pattern: ");
3594 return NULL;
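/* Illustrative sketch, not part of this pass: source-level loops matching
   patterns (1) and (4) of the comment before vect_is_simple_reduction.
   The names are made up for illustration.  */

static int
example_simple_reduction (const int *a, int n)
{
  int sum = 0;                  /* a1 = phi <a0, a2>  */
  for (int i = 0; i < n; i++)
    sum = a[i] + sum;           /* a2 = operation (a3, a1)  */
  return sum;
}

static int
example_cond_reduction (const int *a, int n, int val)
{
  int ret_val = -1;
  for (int i = 0; i < n; i++)
    if (a[i] < val)
      ret_val = a[i];           /* Condition expression, pattern (4).  */
  return ret_val;
}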
3597 /* Wrapper around vect_is_simple_reduction, which will modify code
3598 in-place if it enables detection of more reductions. The arguments
3599 are the same as for vect_is_simple_reduction. */
3601 gimple *
3602 vect_force_simple_reduction (loop_vec_info loop_info, gimple *phi,
3603 bool *double_reduc,
3604 bool need_wrapping_integral_overflow)
3606 enum vect_reduction_type v_reduc_type;
3607 gimple *def = vect_is_simple_reduction (loop_info, phi, double_reduc,
3608 need_wrapping_integral_overflow,
3609 &v_reduc_type);
3610 if (def)
3612 stmt_vec_info reduc_def_info = vinfo_for_stmt (phi);
3613 STMT_VINFO_REDUC_TYPE (reduc_def_info) = v_reduc_type;
3614 STMT_VINFO_REDUC_DEF (reduc_def_info) = def;
3615 reduc_def_info = vinfo_for_stmt (def);
3616 STMT_VINFO_REDUC_TYPE (reduc_def_info) = v_reduc_type;
3617 STMT_VINFO_REDUC_DEF (reduc_def_info) = phi;
3619 return def;
3622 /* Calculate the cost of peeling the loop PEEL_ITERS_PROLOGUE times. */
3623 int
3624 vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue,
3625 int *peel_iters_epilogue,
3626 stmt_vector_for_cost *scalar_cost_vec,
3627 stmt_vector_for_cost *prologue_cost_vec,
3628 stmt_vector_for_cost *epilogue_cost_vec)
3630 int retval = 0;
3631 int assumed_vf = vect_vf_for_cost (loop_vinfo);
3633 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
3635 *peel_iters_epilogue = assumed_vf / 2;
3636 if (dump_enabled_p ())
3637 dump_printf_loc (MSG_NOTE, vect_location,
3638 "cost model: epilogue peel iters set to vf/2 "
3639 "because loop iterations are unknown .\n");
3641 /* If peeled iterations are known but the number of scalar loop
3642 iterations is unknown, count a taken branch per peeled loop. */
3643 retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
3644 NULL, 0, vect_prologue);
3645 retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
3646 NULL, 0, vect_epilogue);
3648 else
3650 int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
3651 peel_iters_prologue = niters < peel_iters_prologue ?
3652 niters : peel_iters_prologue;
3653 *peel_iters_epilogue = (niters - peel_iters_prologue) % assumed_vf;
3654 /* If we need to peel for gaps, but no epilogue peeling would otherwise
3655 be required, we have to peel VF iterations. */
3656 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) && !*peel_iters_epilogue)
3657 *peel_iters_epilogue = assumed_vf;
3660 stmt_info_for_cost *si;
3661 int j;
3662 if (peel_iters_prologue)
3663 FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
3665 stmt_vec_info stmt_info
3666 = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
3667 retval += record_stmt_cost (prologue_cost_vec,
3668 si->count * peel_iters_prologue,
3669 si->kind, stmt_info, si->misalign,
3670 vect_prologue);
3672 if (*peel_iters_epilogue)
3673 FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
3675 stmt_vec_info stmt_info
3676 = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
3677 retval += record_stmt_cost (epilogue_cost_vec,
3678 si->count * *peel_iters_epilogue,
3679 si->kind, stmt_info, si->misalign,
3680 vect_epilogue);
3683 return retval;
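/* Illustrative sketch, not part of this pass: the peel-count arithmetic
   above for the compile-time-known iteration count case, with made-up
   example numbers.  */

static int
example_known_peeling (int niters, int peel_iters_prologue, int assumed_vf,
                       int peeling_for_gaps)
{
  /* Never peel more prologue iterations than the loop runs at all.  */
  if (peel_iters_prologue > niters)
    peel_iters_prologue = niters;

  /* Whatever is left after the prologue and the full vector iterations
     runs in the scalar epilogue.  E.g. niters = 103, prologue = 3,
     vf = 8 gives epilogue = (103 - 3) % 8 = 4.  */
  int peel_iters_epilogue = (niters - peel_iters_prologue) % assumed_vf;

  /* Peeling for gaps needs at least a full vector's worth of scalar
     iterations in the epilogue.  */
  if (peeling_for_gaps && peel_iters_epilogue == 0)
    peel_iters_epilogue = assumed_vf;

  return peel_iters_epilogue;
}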
3686 /* Function vect_estimate_min_profitable_iters
3688 Return the number of iterations required for the vector version of the
3689 loop to be profitable relative to the cost of the scalar version of the
3690 loop.
3692 *RET_MIN_PROFITABLE_NITERS is a cost model profitability threshold
3693 of iterations for vectorization. A value of -1 means that loop
3694 vectorization is not profitable. This returned value may be used for
3695 a dynamic profitability check.
3697 *RET_MIN_PROFITABLE_ESTIMATE is a profitability threshold to be used
3698 for a static check against the estimated number of iterations. */
3700 static void
3701 vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
3702 int *ret_min_profitable_niters,
3703 int *ret_min_profitable_estimate)
3705 int min_profitable_iters;
3706 int min_profitable_estimate;
3707 int peel_iters_prologue;
3708 int peel_iters_epilogue;
3709 unsigned vec_inside_cost = 0;
3710 int vec_outside_cost = 0;
3711 unsigned vec_prologue_cost = 0;
3712 unsigned vec_epilogue_cost = 0;
3713 int scalar_single_iter_cost = 0;
3714 int scalar_outside_cost = 0;
3715 int assumed_vf = vect_vf_for_cost (loop_vinfo);
3716 int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
3717 void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3719 /* Cost model disabled. */
3720 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
3722 dump_printf_loc (MSG_NOTE, vect_location, "cost model disabled.\n");
3723 *ret_min_profitable_niters = 0;
3724 *ret_min_profitable_estimate = 0;
3725 return;
3728 /* Requires loop versioning tests to handle misalignment. */
3729 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
3731 /* FIXME: Make cost depend on complexity of individual check. */
3732 unsigned len = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ();
3733 (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
3734 vect_prologue);
3735 dump_printf (MSG_NOTE,
3736 "cost model: Adding cost of checks for loop "
3737 "versioning to treat misalignment.\n");
3740 /* Requires loop versioning with alias checks. */
3741 if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
3743 /* FIXME: Make cost depend on complexity of individual check. */
3744 unsigned len = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).length ();
3745 (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
3746 vect_prologue);
3747 len = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).length ();
3748 if (len)
3749 /* Count LEN - 1 ANDs and LEN comparisons. */
3750 (void) add_stmt_cost (target_cost_data, len * 2 - 1, scalar_stmt,
3751 NULL, 0, vect_prologue);
3752 len = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).length ();
3753 if (len)
3755 /* Count LEN - 1 ANDs and LEN comparisons. */
3756 unsigned int nstmts = len * 2 - 1;
3757 /* +1 for each bias that needs adding. */
3758 for (unsigned int i = 0; i < len; ++i)
3759 if (!LOOP_VINFO_LOWER_BOUNDS (loop_vinfo)[i].unsigned_p)
3760 nstmts += 1;
3761 (void) add_stmt_cost (target_cost_data, nstmts, scalar_stmt,
3762 NULL, 0, vect_prologue);
3764 dump_printf (MSG_NOTE,
3765 "cost model: Adding cost of checks for loop "
3766 "versioning aliasing.\n");
3769 /* Requires loop versioning with niter checks. */
3770 if (LOOP_REQUIRES_VERSIONING_FOR_NITERS (loop_vinfo))
3772 /* FIXME: Make cost depend on complexity of individual check. */
3773 (void) add_stmt_cost (target_cost_data, 1, vector_stmt, NULL, 0,
3774 vect_prologue);
3775 dump_printf (MSG_NOTE,
3776 "cost model: Adding cost of checks for loop "
3777 "versioning niters.\n");
3780 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
3781 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken, NULL, 0,
3782 vect_prologue);
3784 /* Count statements in scalar loop. Using this as scalar cost for a single
3785 iteration for now.
3787 TODO: Add outer loop support.
3789 TODO: Consider assigning different costs to different scalar
3790 statements. */
3792 scalar_single_iter_cost
3793 = LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo);
3795 /* Add additional cost for the peeled instructions in prologue and epilogue
3796 loop. (For fully-masked loops there will be no peeling.)
3798 FORNOW: If we don't know the value of peel_iters for prologue or epilogue
3799 at compile-time - we assume it's vf/2 (the worst would be vf-1).
3801 TODO: Build an expression that represents peel_iters for prologue and
3802 epilogue to be used in a run-time test. */
3804 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
3806 peel_iters_prologue = 0;
3807 peel_iters_epilogue = 0;
3809 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
3811 /* We need to peel exactly one iteration. */
3812 peel_iters_epilogue += 1;
3813 stmt_info_for_cost *si;
3814 int j;
3815 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
3816 j, si)
3818 struct _stmt_vec_info *stmt_info
3819 = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
3820 (void) add_stmt_cost (target_cost_data, si->count,
3821 si->kind, stmt_info, si->misalign,
3822 vect_epilogue);
3826 else if (npeel < 0)
3828 peel_iters_prologue = assumed_vf / 2;
3829 dump_printf (MSG_NOTE, "cost model: "
3830 "prologue peel iters set to vf/2.\n");
3832 /* If peeling for alignment is unknown, the loop bound of the main
3833 loop becomes unknown. */
3834 peel_iters_epilogue = assumed_vf / 2;
3835 dump_printf (MSG_NOTE, "cost model: "
3836 "epilogue peel iters set to vf/2 because "
3837 "peeling for alignment is unknown.\n");
3839 /* If peeled iterations are unknown, count a taken branch and a not taken
3840 branch per peeled loop. Even if scalar loop iterations are known,
3841 vector iterations are not known since peeled prologue iterations are
3842 not known. Hence guards remain the same. */
3843 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
3844 NULL, 0, vect_prologue);
3845 (void) add_stmt_cost (target_cost_data, 1, cond_branch_not_taken,
3846 NULL, 0, vect_prologue);
3847 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
3848 NULL, 0, vect_epilogue);
3849 (void) add_stmt_cost (target_cost_data, 1, cond_branch_not_taken,
3850 NULL, 0, vect_epilogue);
3851 stmt_info_for_cost *si;
3852 int j;
3853 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
3855 struct _stmt_vec_info *stmt_info
3856 = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
3857 (void) add_stmt_cost (target_cost_data,
3858 si->count * peel_iters_prologue,
3859 si->kind, stmt_info, si->misalign,
3860 vect_prologue);
3861 (void) add_stmt_cost (target_cost_data,
3862 si->count * peel_iters_epilogue,
3863 si->kind, stmt_info, si->misalign,
3864 vect_epilogue);
3867 else
3869 stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
3870 stmt_info_for_cost *si;
3871 int j;
3872 void *data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3874 prologue_cost_vec.create (2);
3875 epilogue_cost_vec.create (2);
3876 peel_iters_prologue = npeel;
3878 (void) vect_get_known_peeling_cost (loop_vinfo, peel_iters_prologue,
3879 &peel_iters_epilogue,
3880 &LOOP_VINFO_SCALAR_ITERATION_COST
3881 (loop_vinfo),
3882 &prologue_cost_vec,
3883 &epilogue_cost_vec);
3885 FOR_EACH_VEC_ELT (prologue_cost_vec, j, si)
3887 struct _stmt_vec_info *stmt_info
3888 = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
3889 (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
3890 si->misalign, vect_prologue);
3893 FOR_EACH_VEC_ELT (epilogue_cost_vec, j, si)
3895 struct _stmt_vec_info *stmt_info
3896 = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
3897 (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
3898 si->misalign, vect_epilogue);
3901 prologue_cost_vec.release ();
3902 epilogue_cost_vec.release ();
3905 /* FORNOW: The scalar outside cost is incremented in one of the
3906 following ways:
3908 1. The vectorizer checks for alignment and aliasing and generates
3909 a condition that allows dynamic vectorization. A cost model
3910 check is ANDED with the versioning condition. Hence scalar code
3911 path now has the added cost of the versioning check.
3913 if (cost > th & versioning_check)
3914 jmp to vector code
3916 Hence the run-time scalar cost is incremented by a not-taken branch cost.
3918 2. The vectorizer then checks if a prologue is required. If the
3919 cost model check was not done before during versioning, it has to
3920 be done before the prologue check.
3922 if (cost <= th)
3923 prologue = scalar_iters
3924 if (prologue == 0)
3925 jmp to vector code
3926 else
3927 execute prologue
3928 if (prologue == num_iters)
3929 go to exit
3931 Hence the run-time scalar cost is incremented by a taken branch,
3932 plus a not-taken branch, plus a taken branch cost.
3934 3. The vectorizer then checks if an epilogue is required. If the
3935 cost model check was not done before during prologue check, it
3936 has to be done with the epilogue check.
3938 if (prologue == 0)
3939 jmp to vector code
3940 else
3941 execute prologue
3942 if (prologue == num_iters)
3943 go to exit
3944 vector code:
3945 if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
3946 jmp to epilogue
3948 Hence the run-time scalar cost should be incremented by 2 taken
3949 branches.
3951 TODO: The back end may reorder the BBs differently and reverse
3952 conditions/branch directions. Change the estimates below to
3953 something more reasonable. */
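/* Rough illustration with made-up branch costs (cond_branch_taken = 3,
   cond_branch_not_taken = 1): the code below adds 1 to the scalar outside
   cost when versioning is done, 2 * 3 + 1 = 7 when the cost model check is
   emitted with the prologue check, and 2 * 3 = 6 when it is emitted with
   the epilogue check.  */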
3955 /* If the number of iterations is known and we do not do versioning, we can
3956 decide whether to vectorize at compile time. Hence the scalar version
3957 does not carry cost model guard costs. */
3958 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
3959 || LOOP_REQUIRES_VERSIONING (loop_vinfo))
3961 /* Cost model check occurs at versioning. */
3962 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
3963 scalar_outside_cost += vect_get_stmt_cost (cond_branch_not_taken);
3964 else
3966 /* Cost model check occurs at prologue generation. */
3967 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
3968 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken)
3969 + vect_get_stmt_cost (cond_branch_not_taken);
3970 /* Cost model check occurs at epilogue generation. */
3971 else
3972 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken);
3976 /* Complete the target-specific cost calculations. */
3977 finish_cost (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo), &vec_prologue_cost,
3978 &vec_inside_cost, &vec_epilogue_cost);
3980 vec_outside_cost = (int)(vec_prologue_cost + vec_epilogue_cost);
3982 if (dump_enabled_p ())
3984 dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
3985 dump_printf (MSG_NOTE, " Vector inside of loop cost: %d\n",
3986 vec_inside_cost);
3987 dump_printf (MSG_NOTE, " Vector prologue cost: %d\n",
3988 vec_prologue_cost);
3989 dump_printf (MSG_NOTE, " Vector epilogue cost: %d\n",
3990 vec_epilogue_cost);
3991 dump_printf (MSG_NOTE, " Scalar iteration cost: %d\n",
3992 scalar_single_iter_cost);
3993 dump_printf (MSG_NOTE, " Scalar outside cost: %d\n",
3994 scalar_outside_cost);
3995 dump_printf (MSG_NOTE, " Vector outside cost: %d\n",
3996 vec_outside_cost);
3997 dump_printf (MSG_NOTE, " prologue iterations: %d\n",
3998 peel_iters_prologue);
3999 dump_printf (MSG_NOTE, " epilogue iterations: %d\n",
4000 peel_iters_epilogue);
4003 /* Calculate number of iterations required to make the vector version
4004 profitable, relative to the loop bodies only. The following condition
4005 must hold true:
4006 SIC * niters + SOC > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC
4007 where
4008 SIC = scalar iteration cost, VIC = vector iteration cost,
4009 VOC = vector outside cost, VF = vectorization factor,
4010 PL_ITERS = prologue iterations, EP_ITERS= epilogue iterations
4011 SOC = scalar outside cost for run time cost model check. */
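/* Worked example with purely illustrative numbers (not from any particular
   target cost model): SIC = 4, VIC = 6, VOC = 20, SOC = 6, VF = 4 and no
   peeling give ((20 - 6) * 4) / (4 * 4 - 6) = 56 / 10 = 5 below; the
   follow-up check 4 * 4 * 5 = 80 <= 6 * 5 + (20 - 6) * 4 = 86 holds, so the
   threshold is bumped to 6 iterations.  */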
4013 if ((scalar_single_iter_cost * assumed_vf) > (int) vec_inside_cost)
4015 min_profitable_iters = ((vec_outside_cost - scalar_outside_cost)
4016 * assumed_vf
4017 - vec_inside_cost * peel_iters_prologue
4018 - vec_inside_cost * peel_iters_epilogue);
4019 if (min_profitable_iters <= 0)
4020 min_profitable_iters = 0;
4021 else
4023 min_profitable_iters /= ((scalar_single_iter_cost * assumed_vf)
4024 - vec_inside_cost);
4026 if ((scalar_single_iter_cost * assumed_vf * min_profitable_iters)
4027 <= (((int) vec_inside_cost * min_profitable_iters)
4028 + (((int) vec_outside_cost - scalar_outside_cost)
4029 * assumed_vf)))
4030 min_profitable_iters++;
4033 /* vector version will never be profitable. */
4034 else
4036 if (LOOP_VINFO_LOOP (loop_vinfo)->force_vectorize)
4037 warning_at (vect_location, OPT_Wopenmp_simd, "vectorization "
4038 "did not happen for a simd loop");
4040 if (dump_enabled_p ())
4041 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4042 "cost model: the vector iteration cost = %d "
4043 "divided by the scalar iteration cost = %d "
4044 "is greater or equal to the vectorization factor = %d"
4045 ".\n",
4046 vec_inside_cost, scalar_single_iter_cost, assumed_vf);
4047 *ret_min_profitable_niters = -1;
4048 *ret_min_profitable_estimate = -1;
4049 return;
4052 dump_printf (MSG_NOTE,
4053 " Calculated minimum iters for profitability: %d\n",
4054 min_profitable_iters);
4056 if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
4057 && min_profitable_iters < (assumed_vf + peel_iters_prologue))
4058 /* We want the vectorized loop to execute at least once. */
4059 min_profitable_iters = assumed_vf + peel_iters_prologue;
4061 if (dump_enabled_p ())
4062 dump_printf_loc (MSG_NOTE, vect_location,
4063 " Runtime profitability threshold = %d\n",
4064 min_profitable_iters);
4066 *ret_min_profitable_niters = min_profitable_iters;
4068 /* Calculate number of iterations required to make the vector version
4069 profitable, relative to the loop bodies only.
4071 Non-vectorized variant is SIC * niters and it must win over vector
4072 variant on the expected loop trip count. The following condition must hold true:
4073 SIC * niters > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC + SOC */
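/* Continuing the illustrative numbers used above (SIC = 4, VIC = 6,
   VOC = 20, SOC = 6, VF = 4, no peeling), the division below gives
   ((20 + 6) * 4) / (4 * 4 - 6) = 104 / 10 = 10, which is then clamped to be
   at least the runtime threshold computed earlier.  */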
4075 if (vec_outside_cost <= 0)
4076 min_profitable_estimate = 0;
4077 else
4079 min_profitable_estimate = ((vec_outside_cost + scalar_outside_cost)
4080 * assumed_vf
4081 - vec_inside_cost * peel_iters_prologue
4082 - vec_inside_cost * peel_iters_epilogue)
4083 / ((scalar_single_iter_cost * assumed_vf)
4084 - vec_inside_cost);
4086 min_profitable_estimate = MAX (min_profitable_estimate, min_profitable_iters);
4087 if (dump_enabled_p ())
4088 dump_printf_loc (MSG_NOTE, vect_location,
4089 " Static estimate profitability threshold = %d\n",
4090 min_profitable_estimate);
4092 *ret_min_profitable_estimate = min_profitable_estimate;
4095 /* Writes into SEL a mask for a vec_perm, equivalent to a vec_shr by OFFSET
4096 vector elements (not bits) for a vector with NELT elements. */
4097 static void
4098 calc_vec_perm_mask_for_shift (unsigned int offset, unsigned int nelt,
4099 vec_perm_builder *sel)
4101 /* The encoding is a single stepped pattern. Any wrap-around is handled
4102 by vec_perm_indices. */
4103 sel->new_vector (nelt, 1, 3);
4104 for (unsigned int i = 0; i < 3; i++)
4105 sel->quick_push (i + offset);
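/* Illustrative example: with OFFSET == 2 the three encoded elements are
   {2, 3, 4}; vec_perm_indices extends this single stepped pattern so that
   result element I is taken from input element I + 2, with indices of NELT
   or more selecting from the second vec_perm operand, giving the
   whole-vector shift by two elements that the caller asked for.  */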
4108 /* Checks whether the target supports whole-vector shifts for vectors of mode
4109 MODE. This is the case if _either_ the platform handles vec_shr_optab, _or_
4110 it supports vec_perm_const with masks for all necessary shift amounts. */
4111 static bool
4112 have_whole_vector_shift (machine_mode mode)
4114 if (optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
4115 return true;
4117 /* Variable-length vectors should be handled via the optab. */
4118 unsigned int nelt;
4119 if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
4120 return false;
4122 vec_perm_builder sel;
4123 vec_perm_indices indices;
4124 for (unsigned int i = nelt / 2; i >= 1; i /= 2)
4126 calc_vec_perm_mask_for_shift (i, nelt, &sel);
4127 indices.new_vector (sel, 2, nelt);
4128 if (!can_vec_perm_const_p (mode, indices, false))
4129 return false;
4131 return true;
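/* Note: the loop above only probes the power-of-two shift amounts
   NELT/2, NELT/4, ..., 1; for example, with NELT == 8 it asks whether
   permutes equivalent to shifts by 4, 2 and 1 elements are supported,
   which are the amounts that a reduction done by repeated halving uses.  */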
4134 /* TODO: Close dependency between vect_model_*_cost and vectorizable_*
4135 functions. Design better to avoid maintenance issues. */
4137 /* Function vect_model_reduction_cost.
4139 Models cost for a reduction operation, including the vector ops
4140 generated within the strip-mine loop, the initial definition before
4141 the loop, and the epilogue code that must be generated. */
4143 static void
4144 vect_model_reduction_cost (stmt_vec_info stmt_info, internal_fn reduc_fn,
4145 int ncopies)
4147 int prologue_cost = 0, epilogue_cost = 0, inside_cost;
4148 enum tree_code code;
4149 optab optab;
4150 tree vectype;
4151 gimple *orig_stmt;
4152 machine_mode mode;
4153 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4154 struct loop *loop = NULL;
4155 void *target_cost_data;
4157 if (loop_vinfo)
4159 loop = LOOP_VINFO_LOOP (loop_vinfo);
4160 target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
4162 else
4163 target_cost_data = BB_VINFO_TARGET_COST_DATA (STMT_VINFO_BB_VINFO (stmt_info));
4165 /* Condition reductions generate two reductions in the loop. */
4166 vect_reduction_type reduction_type
4167 = STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info);
4168 if (reduction_type == COND_REDUCTION)
4169 ncopies *= 2;
4171 vectype = STMT_VINFO_VECTYPE (stmt_info);
4172 mode = TYPE_MODE (vectype);
4173 orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
4175 if (!orig_stmt)
4176 orig_stmt = STMT_VINFO_STMT (stmt_info);
4178 code = gimple_assign_rhs_code (orig_stmt);
4180 if (reduction_type == EXTRACT_LAST_REDUCTION
4181 || reduction_type == FOLD_LEFT_REDUCTION)
4183 /* No extra instructions needed in the prologue. */
4184 prologue_cost = 0;
4186 if (reduction_type == EXTRACT_LAST_REDUCTION || reduc_fn != IFN_LAST)
4187 /* Count one reduction-like operation per vector. */
4188 inside_cost = add_stmt_cost (target_cost_data, ncopies, vec_to_scalar,
4189 stmt_info, 0, vect_body);
4190 else
4192 /* Use NELEMENTS extracts and NELEMENTS scalar ops. */
4193 unsigned int nelements = ncopies * vect_nunits_for_cost (vectype);
4194 inside_cost = add_stmt_cost (target_cost_data, nelements,
4195 vec_to_scalar, stmt_info, 0,
4196 vect_body);
4197 inside_cost += add_stmt_cost (target_cost_data, nelements,
4198 scalar_stmt, stmt_info, 0,
4199 vect_body);
4202 else
4204 /* Add in cost for initial definition.
4205 For cond reduction we have four vectors: initial index, step,
4206 initial result of the data reduction, initial value of the index
4207 reduction. */
4208 int prologue_stmts = reduction_type == COND_REDUCTION ? 4 : 1;
4209 prologue_cost += add_stmt_cost (target_cost_data, prologue_stmts,
4210 scalar_to_vec, stmt_info, 0,
4211 vect_prologue);
4213 /* Cost of reduction op inside loop. */
4214 inside_cost = add_stmt_cost (target_cost_data, ncopies, vector_stmt,
4215 stmt_info, 0, vect_body);
4218 /* Determine cost of epilogue code.
4220 We have a reduction operator that will reduce the vector in one statement.
4221 Also requires scalar extract. */
4223 if (!loop || !nested_in_vect_loop_p (loop, orig_stmt))
4225 if (reduc_fn != IFN_LAST)
4227 if (reduction_type == COND_REDUCTION)
4229 /* An EQ stmt and a COND_EXPR stmt. */
4230 epilogue_cost += add_stmt_cost (target_cost_data, 2,
4231 vector_stmt, stmt_info, 0,
4232 vect_epilogue);
4233 /* Reduction of the max index and a reduction of the found
4234 values. */
4235 epilogue_cost += add_stmt_cost (target_cost_data, 2,
4236 vec_to_scalar, stmt_info, 0,
4237 vect_epilogue);
4238 /* A broadcast of the max value. */
4239 epilogue_cost += add_stmt_cost (target_cost_data, 1,
4240 scalar_to_vec, stmt_info, 0,
4241 vect_epilogue);
4243 else
4245 epilogue_cost += add_stmt_cost (target_cost_data, 1, vector_stmt,
4246 stmt_info, 0, vect_epilogue);
4247 epilogue_cost += add_stmt_cost (target_cost_data, 1,
4248 vec_to_scalar, stmt_info, 0,
4249 vect_epilogue);
4252 else if (reduction_type == COND_REDUCTION)
4254 unsigned estimated_nunits = vect_nunits_for_cost (vectype);
4255 /* Extraction of scalar elements. */
4256 epilogue_cost += add_stmt_cost (target_cost_data,
4257 2 * estimated_nunits,
4258 vec_to_scalar, stmt_info, 0,
4259 vect_epilogue);
4260 /* Scalar max reductions via COND_EXPR / MAX_EXPR. */
4261 epilogue_cost += add_stmt_cost (target_cost_data,
4262 2 * estimated_nunits - 3,
4263 scalar_stmt, stmt_info, 0,
4264 vect_epilogue);
4266 else if (reduction_type == EXTRACT_LAST_REDUCTION
4267 || reduction_type == FOLD_LEFT_REDUCTION)
4268 /* No extra instructions are needed in the epilogue. */
4270 else
4272 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
4273 tree bitsize =
4274 TYPE_SIZE (TREE_TYPE (gimple_assign_lhs (orig_stmt)));
4275 int element_bitsize = tree_to_uhwi (bitsize);
4276 int nelements = vec_size_in_bits / element_bitsize;
4278 if (code == COND_EXPR)
4279 code = MAX_EXPR;
4281 optab = optab_for_tree_code (code, vectype, optab_default);
4283 /* We have a whole vector shift available. */
4284 if (optab != unknown_optab
4285 && VECTOR_MODE_P (mode)
4286 && optab_handler (optab, mode) != CODE_FOR_nothing
4287 && have_whole_vector_shift (mode))
4289 /* Final reduction via vector shifts and the reduction operator.
4290 Also requires scalar extract. */
4291 epilogue_cost += add_stmt_cost (target_cost_data,
4292 exact_log2 (nelements) * 2,
4293 vector_stmt, stmt_info, 0,
4294 vect_epilogue);
4295 epilogue_cost += add_stmt_cost (target_cost_data, 1,
4296 vec_to_scalar, stmt_info, 0,
4297 vect_epilogue);
4299 else
4300 /* Use extracts and reduction op for final reduction. For N
4301 elements, we have N extracts and N-1 reduction ops. */
4302 epilogue_cost += add_stmt_cost (target_cost_data,
4303 nelements + nelements - 1,
4304 vector_stmt, stmt_info, 0,
4305 vect_epilogue);
4309 if (dump_enabled_p ())
4310 dump_printf (MSG_NOTE,
4311 "vect_model_reduction_cost: inside_cost = %d, "
4312 "prologue_cost = %d, epilogue_cost = %d .\n", inside_cost,
4313 prologue_cost, epilogue_cost);
4317 /* Function vect_model_induction_cost.
4319 Models cost for induction operations. */
4321 static void
4322 vect_model_induction_cost (stmt_vec_info stmt_info, int ncopies)
4324 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4325 void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
4326 unsigned inside_cost, prologue_cost;
4328 if (PURE_SLP_STMT (stmt_info))
4329 return;
4331 /* loop cost for vec_loop. */
4332 inside_cost = add_stmt_cost (target_cost_data, ncopies, vector_stmt,
4333 stmt_info, 0, vect_body);
4335 /* prologue cost for vec_init and vec_step. */
4336 prologue_cost = add_stmt_cost (target_cost_data, 2, scalar_to_vec,
4337 stmt_info, 0, vect_prologue);
4339 if (dump_enabled_p ())
4340 dump_printf_loc (MSG_NOTE, vect_location,
4341 "vect_model_induction_cost: inside_cost = %d, "
4342 "prologue_cost = %d .\n", inside_cost, prologue_cost);
4347 /* Function get_initial_def_for_reduction
4349 Input:
4350 STMT - a stmt that performs a reduction operation in the loop.
4351 INIT_VAL - the initial value of the reduction variable
4353 Output:
4354 ADJUSTMENT_DEF - a tree that holds a value to be added to the final result
4355 of the reduction (used for adjusting the epilog - see below).
4356 Return a vector variable, initialized according to the operation that STMT
4357 performs. This vector will be used as the initial value of the
4358 vector of partial results.
4360 Option1 (adjust in epilog): Initialize the vector as follows:
4361 add/bit or/xor: [0,0,...,0,0]
4362 mult/bit and: [1,1,...,1,1]
4363 min/max/cond_expr: [init_val,init_val,..,init_val,init_val]
4364 and when necessary (e.g. add/mult case) let the caller know
4365 that it needs to adjust the result by init_val.
4367 Option2: Initialize the vector as follows:
4368 add/bit or/xor: [init_val,0,0,...,0]
4369 mult/bit and: [init_val,1,1,...,1]
4370 min/max/cond_expr: [init_val,init_val,...,init_val]
4371 and no adjustments are needed.
4373 For example, for the following code:
4375 s = init_val;
4376 for (i=0;i<n;i++)
4377 s = s + a[i];
4379 STMT is 's = s + a[i]', and the reduction variable is 's'.
4380 For a vector of 4 units, we want to return either [0,0,0,init_val],
4381 or [0,0,0,0] and let the caller know that it needs to adjust
4382 the result at the end by 'init_val'.
4384 FORNOW, we use Option1 (the 'adjust in epilog' scheme) when ADJUSTMENT_DEF
4385 is not NULL, because this way the initialization vector is simpler (the
4386 same element in all entries), and Option2 otherwise.
4388 A cost model should help decide between these two schemes. */
4390 tree
4391 get_initial_def_for_reduction (gimple *stmt, tree init_val,
4392 tree *adjustment_def)
4394 stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
4395 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
4396 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4397 tree scalar_type = TREE_TYPE (init_val);
4398 tree vectype = get_vectype_for_scalar_type (scalar_type);
4399 enum tree_code code = gimple_assign_rhs_code (stmt);
4400 tree def_for_init;
4401 tree init_def;
4402 bool nested_in_vect_loop = false;
4403 REAL_VALUE_TYPE real_init_val = dconst0;
4404 int int_init_val = 0;
4405 gimple *def_stmt = NULL;
4406 gimple_seq stmts = NULL;
4408 gcc_assert (vectype);
4410 gcc_assert (POINTER_TYPE_P (scalar_type) || INTEGRAL_TYPE_P (scalar_type)
4411 || SCALAR_FLOAT_TYPE_P (scalar_type));
4413 if (nested_in_vect_loop_p (loop, stmt))
4414 nested_in_vect_loop = true;
4415 else
4416 gcc_assert (loop == (gimple_bb (stmt))->loop_father);
4418 /* In case of double reduction we only create a vector variable to be put
4419 in the reduction phi node. The actual statement creation is done in
4420 vect_create_epilog_for_reduction. */
4421 if (adjustment_def && nested_in_vect_loop
4422 && TREE_CODE (init_val) == SSA_NAME
4423 && (def_stmt = SSA_NAME_DEF_STMT (init_val))
4424 && gimple_code (def_stmt) == GIMPLE_PHI
4425 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
4426 && vinfo_for_stmt (def_stmt)
4427 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
4428 == vect_double_reduction_def)
4430 *adjustment_def = NULL;
4431 return vect_create_destination_var (init_val, vectype);
4434 vect_reduction_type reduction_type
4435 = STMT_VINFO_VEC_REDUCTION_TYPE (stmt_vinfo);
4437 /* In case of a nested reduction do not use an adjustment def as
4438 that case is not supported by the epilogue generation correctly
4439 if ncopies is not one. */
4440 if (adjustment_def && nested_in_vect_loop)
4442 *adjustment_def = NULL;
4443 return vect_get_vec_def_for_operand (init_val, stmt);
4446 switch (code)
4448 case WIDEN_SUM_EXPR:
4449 case DOT_PROD_EXPR:
4450 case SAD_EXPR:
4451 case PLUS_EXPR:
4452 case MINUS_EXPR:
4453 case BIT_IOR_EXPR:
4454 case BIT_XOR_EXPR:
4455 case MULT_EXPR:
4456 case BIT_AND_EXPR:
4458 /* ADJUSTMENT_DEF is NULL when called from
4459 vect_create_epilog_for_reduction to vectorize double reduction. */
4460 if (adjustment_def)
4461 *adjustment_def = init_val;
4463 if (code == MULT_EXPR)
4465 real_init_val = dconst1;
4466 int_init_val = 1;
4469 if (code == BIT_AND_EXPR)
4470 int_init_val = -1;
4472 if (SCALAR_FLOAT_TYPE_P (scalar_type))
4473 def_for_init = build_real (scalar_type, real_init_val);
4474 else
4475 def_for_init = build_int_cst (scalar_type, int_init_val);
4477 if (adjustment_def)
4478 /* Option1: the first element is '0' or '1' as well. */
4479 init_def = gimple_build_vector_from_val (&stmts, vectype,
4480 def_for_init);
4481 else if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant ())
4483 /* Option2 (variable length): the first element is INIT_VAL. */
4484 init_def = build_vector_from_val (vectype, def_for_init);
4485 gcall *call = gimple_build_call_internal (IFN_VEC_SHL_INSERT,
4486 2, init_def, init_val);
4487 init_def = make_ssa_name (vectype);
4488 gimple_call_set_lhs (call, init_def);
4489 gimple_seq_add_stmt (&stmts, call);
4491 else
4493 /* Option2: the first element is INIT_VAL. */
4494 tree_vector_builder elts (vectype, 1, 2);
4495 elts.quick_push (init_val);
4496 elts.quick_push (def_for_init);
4497 init_def = gimple_build_vector (&stmts, &elts);
4500 break;
4502 case MIN_EXPR:
4503 case MAX_EXPR:
4504 case COND_EXPR:
4506 if (adjustment_def)
4508 *adjustment_def = NULL_TREE;
4509 if (reduction_type != COND_REDUCTION
4510 && reduction_type != EXTRACT_LAST_REDUCTION)
4512 init_def = vect_get_vec_def_for_operand (init_val, stmt);
4513 break;
4516 init_val = gimple_convert (&stmts, TREE_TYPE (vectype), init_val);
4517 init_def = gimple_build_vector_from_val (&stmts, vectype, init_val);
4519 break;
4521 default:
4522 gcc_unreachable ();
4525 if (stmts)
4526 gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), stmts);
4527 return init_def;
4530 /* Get at the initial defs for the reduction PHIs in SLP_NODE.
4531 NUMBER_OF_VECTORS is the number of vector defs to create.
4532 If NEUTRAL_OP is nonnull, introducing extra elements of that
4533 value will not change the result. */
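/* NEUTRAL_OP is the identity of the reduction operation: for example 0 for
   PLUS, IOR and XOR, 1 for MULT, all-ones for AND, and (presumably, since
   MIN and MAX have no finite identity) the initial value itself for MIN/MAX
   reduction chains.  */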
4535 static void
4536 get_initial_defs_for_reduction (slp_tree slp_node,
4537 vec<tree> *vec_oprnds,
4538 unsigned int number_of_vectors,
4539 bool reduc_chain, tree neutral_op)
4541 vec<gimple *> stmts = SLP_TREE_SCALAR_STMTS (slp_node);
4542 gimple *stmt = stmts[0];
4543 stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
4544 unsigned HOST_WIDE_INT nunits;
4545 unsigned j, number_of_places_left_in_vector;
4546 tree vector_type;
4547 tree vop;
4548 int group_size = stmts.length ();
4549 unsigned int vec_num, i;
4550 unsigned number_of_copies = 1;
4551 vec<tree> voprnds;
4552 voprnds.create (number_of_vectors);
4553 struct loop *loop;
4554 auto_vec<tree, 16> permute_results;
4556 vector_type = STMT_VINFO_VECTYPE (stmt_vinfo);
4558 gcc_assert (STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_reduction_def);
4560 loop = (gimple_bb (stmt))->loop_father;
4561 gcc_assert (loop);
4562 edge pe = loop_preheader_edge (loop);
4564 gcc_assert (!reduc_chain || neutral_op);
4566 /* NUMBER_OF_COPIES is the number of times we need to use the same values in
4567 created vectors. It is greater than 1 if unrolling is performed.
4569 For example, we have two scalar operands, s1 and s2 (e.g., group of
4570 strided accesses of size two), while NUNITS is four (i.e., four scalars
4571 of this type can be packed in a vector). The output vector will contain
4572 two copies of each scalar operand: {s1, s2, s1, s2}. (NUMBER_OF_COPIES
4573 will be 2).
4575 If GROUP_SIZE > NUNITS, the scalars will be split into several vectors
4576 containing the operands.
4578 For example, NUNITS is four as before, and the group size is 8
4579 (s1, s2, ..., s8). We will create two vectors {s1, s2, s3, s4} and
4580 {s5, s6, s7, s8}. */
4582 if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits))
4583 nunits = group_size;
4585 number_of_copies = nunits * number_of_vectors / group_size;
4587 number_of_places_left_in_vector = nunits;
4588 bool constant_p = true;
4589 tree_vector_builder elts (vector_type, nunits, 1);
4590 elts.quick_grow (nunits);
4591 for (j = 0; j < number_of_copies; j++)
4593 for (i = group_size - 1; stmts.iterate (i, &stmt); i--)
4595 tree op;
4596 /* Get the def before the loop. In a reduction chain we have only
4597 one initial value. */
4598 if ((j != (number_of_copies - 1)
4599 || (reduc_chain && i != 0))
4600 && neutral_op)
4601 op = neutral_op;
4602 else
4603 op = PHI_ARG_DEF_FROM_EDGE (stmt, pe);
4605 /* Create 'vect_ = {op0,op1,...,opn}'. */
4606 number_of_places_left_in_vector--;
4607 elts[number_of_places_left_in_vector] = op;
4608 if (!CONSTANT_CLASS_P (op))
4609 constant_p = false;
4611 if (number_of_places_left_in_vector == 0)
4613 gimple_seq ctor_seq = NULL;
4614 tree init;
4615 if (constant_p && !neutral_op
4616 ? multiple_p (TYPE_VECTOR_SUBPARTS (vector_type), nunits)
4617 : known_eq (TYPE_VECTOR_SUBPARTS (vector_type), nunits))
4618 /* Build the vector directly from ELTS. */
4619 init = gimple_build_vector (&ctor_seq, &elts);
4620 else if (neutral_op)
4622 /* Build a vector of the neutral value and shift the
4623 other elements into place. */
4624 init = gimple_build_vector_from_val (&ctor_seq, vector_type,
4625 neutral_op);
4626 int k = nunits;
4627 while (k > 0 && elts[k - 1] == neutral_op)
4628 k -= 1;
4629 while (k > 0)
4631 k -= 1;
4632 gcall *call = gimple_build_call_internal
4633 (IFN_VEC_SHL_INSERT, 2, init, elts[k]);
4634 init = make_ssa_name (vector_type);
4635 gimple_call_set_lhs (call, init);
4636 gimple_seq_add_stmt (&ctor_seq, call);
4639 else
4641 /* First time round, duplicate ELTS to fill the
4642 required number of vectors, then cherry pick the
4643 appropriate result for each iteration. */
4644 if (vec_oprnds->is_empty ())
4645 duplicate_and_interleave (&ctor_seq, vector_type, elts,
4646 number_of_vectors,
4647 permute_results);
4648 init = permute_results[number_of_vectors - j - 1];
4650 if (ctor_seq != NULL)
4651 gsi_insert_seq_on_edge_immediate (pe, ctor_seq);
4652 voprnds.quick_push (init);
4654 number_of_places_left_in_vector = nunits;
4655 elts.new_vector (vector_type, nunits, 1);
4656 elts.quick_grow (nunits);
4657 constant_p = true;
4662 /* Since the vectors are created in the reverse order, we should invert
4663 them. */
4664 vec_num = voprnds.length ();
4665 for (j = vec_num; j != 0; j--)
4667 vop = voprnds[j - 1];
4668 vec_oprnds->quick_push (vop);
4671 voprnds.release ();
4673 /* In case that VF is greater than the unrolling factor needed for the SLP
4674 group of stmts, NUMBER_OF_VECTORS to be created is greater than
4675 NUMBER_OF_SCALARS/NUNITS or NUNITS/NUMBER_OF_SCALARS, and hence we have
4676 to replicate the vectors. */
4677 tree neutral_vec = NULL;
4678 while (number_of_vectors > vec_oprnds->length ())
4680 if (neutral_op)
4682 if (!neutral_vec)
4684 gimple_seq ctor_seq = NULL;
4685 neutral_vec = gimple_build_vector_from_val
4686 (&ctor_seq, vector_type, neutral_op);
4687 if (ctor_seq != NULL)
4688 gsi_insert_seq_on_edge_immediate (pe, ctor_seq);
4690 vec_oprnds->quick_push (neutral_vec);
4692 else
4694 for (i = 0; vec_oprnds->iterate (i, &vop) && i < vec_num; i++)
4695 vec_oprnds->quick_push (vop);
4701 /* Function vect_create_epilog_for_reduction
4703 Create code at the loop-epilog to finalize the result of a reduction
4704 computation.
4706 VECT_DEFS is list of vector of partial results, i.e., the lhs's of vector
4707 reduction statements.
4708 STMT is the scalar reduction stmt that is being vectorized.
4709 NCOPIES is > 1 in case the vectorization factor (VF) is bigger than the
4710 number of elements that we can fit in a vectype (nunits). In this case
4711 we have to generate more than one vector stmt - i.e - we need to "unroll"
4712 the vector stmt by a factor VF/nunits. For more details see documentation
4713 in vectorizable_operation.
4714 REDUC_FN is the internal function for the epilog reduction.
4715 REDUCTION_PHIS is a list of the phi-nodes that carry the reduction
4716 computation.
4717 REDUC_INDEX is the index of the operand in the right hand side of the
4718 statement that is defined by REDUCTION_PHI.
4719 DOUBLE_REDUC is TRUE if double reduction phi nodes should be handled.
4720 SLP_NODE is an SLP node containing a group of reduction statements. The
4721 first one in this group is STMT.
4722 INDUC_VAL is for INTEGER_INDUC_COND_REDUCTION the value to use for the case
4723 when the COND_EXPR is never true in the loop. For MAX_EXPR, it needs to
4724 be smaller than any value of the IV in the loop, for MIN_EXPR larger than
4725 any value of the IV in the loop.
4726 INDUC_CODE is the code for epilog reduction if INTEGER_INDUC_COND_REDUCTION.
4727 NEUTRAL_OP is the value given by neutral_op_for_slp_reduction; it is
4728 null if this is not an SLP reduction
4730 This function:
4731 1. Creates the reduction def-use cycles: sets the arguments for
4732 REDUCTION_PHIS:
4733 The loop-entry argument is the vectorized initial-value of the reduction.
4734 The loop-latch argument is taken from VECT_DEFS - the vector of partial
4735 sums.
4736 2. "Reduces" each vector of partial results VECT_DEFS into a single result,
4737 by calling the function specified by REDUC_FN if available, or by
4738 other means (whole-vector shifts or a scalar loop).
4739 The function also creates a new phi node at the loop exit to preserve
4740 loop-closed form, as illustrated below.
4742 The flow at the entry to this function:
4744 loop:
4745 vec_def = phi <null, null> # REDUCTION_PHI
4746 VECT_DEF = vector_stmt # vectorized form of STMT
4747 s_loop = scalar_stmt # (scalar) STMT
4748 loop_exit:
4749 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
4750 use <s_out0>
4751 use <s_out0>
4753 The above is transformed by this function into:
4755 loop:
4756 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
4757 VECT_DEF = vector_stmt # vectorized form of STMT
4758 s_loop = scalar_stmt # (scalar) STMT
4759 loop_exit:
4760 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
4761 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
4762 v_out2 = reduce <v_out1>
4763 s_out3 = extract_field <v_out2, 0>
4764 s_out4 = adjust_result <s_out3>
4765 use <s_out4>
4766 use <s_out4>
4769 static void
4770 vect_create_epilog_for_reduction (vec<tree> vect_defs, gimple *stmt,
4771 gimple *reduc_def_stmt,
4772 int ncopies, internal_fn reduc_fn,
4773 vec<gimple *> reduction_phis,
4774 bool double_reduc,
4775 slp_tree slp_node,
4776 slp_instance slp_node_instance,
4777 tree induc_val, enum tree_code induc_code,
4778 tree neutral_op)
4780 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4781 stmt_vec_info prev_phi_info;
4782 tree vectype;
4783 machine_mode mode;
4784 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4785 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *outer_loop = NULL;
4786 basic_block exit_bb;
4787 tree scalar_dest;
4788 tree scalar_type;
4789 gimple *new_phi = NULL, *phi;
4790 gimple_stmt_iterator exit_gsi;
4791 tree vec_dest;
4792 tree new_temp = NULL_TREE, new_dest, new_name, new_scalar_dest;
4793 gimple *epilog_stmt = NULL;
4794 enum tree_code code = gimple_assign_rhs_code (stmt);
4795 gimple *exit_phi;
4796 tree bitsize;
4797 tree adjustment_def = NULL;
4798 tree vec_initial_def = NULL;
4799 tree expr, def, initial_def = NULL;
4800 tree orig_name, scalar_result;
4801 imm_use_iterator imm_iter, phi_imm_iter;
4802 use_operand_p use_p, phi_use_p;
4803 gimple *use_stmt, *orig_stmt, *reduction_phi = NULL;
4804 bool nested_in_vect_loop = false;
4805 auto_vec<gimple *> new_phis;
4806 auto_vec<gimple *> inner_phis;
4807 enum vect_def_type dt = vect_unknown_def_type;
4808 int j, i;
4809 auto_vec<tree> scalar_results;
4810 unsigned int group_size = 1, k, ratio;
4811 auto_vec<tree> vec_initial_defs;
4812 auto_vec<gimple *> phis;
4813 bool slp_reduc = false;
4814 bool direct_slp_reduc;
4815 tree new_phi_result;
4816 gimple *inner_phi = NULL;
4817 tree induction_index = NULL_TREE;
4819 if (slp_node)
4820 group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
4822 if (nested_in_vect_loop_p (loop, stmt))
4824 outer_loop = loop;
4825 loop = loop->inner;
4826 nested_in_vect_loop = true;
4827 gcc_assert (!slp_node);
4830 vectype = STMT_VINFO_VECTYPE (stmt_info);
4831 gcc_assert (vectype);
4832 mode = TYPE_MODE (vectype);
4834 /* 1. Create the reduction def-use cycle:
4835 Set the arguments of REDUCTION_PHIS, i.e., transform
4837 loop:
4838 vec_def = phi <null, null> # REDUCTION_PHI
4839 VECT_DEF = vector_stmt # vectorized form of STMT
4842 into:
4844 loop:
4845 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
4846 VECT_DEF = vector_stmt # vectorized form of STMT
4849 (in case of SLP, do it for all the phis). */
4851 /* Get the loop-entry arguments. */
4852 enum vect_def_type initial_def_dt = vect_unknown_def_type;
4853 if (slp_node)
4855 unsigned vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
4856 vec_initial_defs.reserve (vec_num);
4857 get_initial_defs_for_reduction (slp_node_instance->reduc_phis,
4858 &vec_initial_defs, vec_num,
4859 GROUP_FIRST_ELEMENT (stmt_info),
4860 neutral_op);
4862 else
4864 /* Get at the scalar def before the loop, that defines the initial value
4865 of the reduction variable. */
4866 gimple *def_stmt;
4867 initial_def = PHI_ARG_DEF_FROM_EDGE (reduc_def_stmt,
4868 loop_preheader_edge (loop));
4869 /* Optimize: for REDUC_MAX, if initial_def is smaller than the base
4870 and we can't use zero for induc_val, use initial_def. Similarly,
4871 for REDUC_MIN, if initial_def is larger than the base. */
4872 if (TREE_CODE (initial_def) == INTEGER_CST
4873 && (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
4874 == INTEGER_INDUC_COND_REDUCTION)
4875 && !integer_zerop (induc_val)
4876 && ((induc_code == MAX_EXPR
4877 && tree_int_cst_lt (initial_def, induc_val))
4878 || (induc_code == MIN_EXPR
4879 && tree_int_cst_lt (induc_val, initial_def))))
4880 induc_val = initial_def;
4881 vect_is_simple_use (initial_def, loop_vinfo, &def_stmt, &initial_def_dt);
4882 vec_initial_def = get_initial_def_for_reduction (stmt, initial_def,
4883 &adjustment_def);
4884 vec_initial_defs.create (1);
4885 vec_initial_defs.quick_push (vec_initial_def);
4888 /* Set phi nodes arguments. */
4889 FOR_EACH_VEC_ELT (reduction_phis, i, phi)
4891 tree vec_init_def = vec_initial_defs[i];
4892 tree def = vect_defs[i];
4893 for (j = 0; j < ncopies; j++)
4895 if (j != 0)
4897 phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi));
4898 if (nested_in_vect_loop)
4899 vec_init_def
4900 = vect_get_vec_def_for_stmt_copy (initial_def_dt,
4901 vec_init_def);
4904 /* Set the loop-entry arg of the reduction-phi. */
4906 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
4907 == INTEGER_INDUC_COND_REDUCTION)
4909 /* Initialise the reduction phi to zero. This prevents non-zero
4910 initial values from interfering with the reduction op. */
4911 gcc_assert (ncopies == 1);
4912 gcc_assert (i == 0);
4914 tree vec_init_def_type = TREE_TYPE (vec_init_def);
4915 tree induc_val_vec
4916 = build_vector_from_val (vec_init_def_type, induc_val);
4918 add_phi_arg (as_a <gphi *> (phi), induc_val_vec,
4919 loop_preheader_edge (loop), UNKNOWN_LOCATION);
4921 else
4922 add_phi_arg (as_a <gphi *> (phi), vec_init_def,
4923 loop_preheader_edge (loop), UNKNOWN_LOCATION);
4925 /* Set the loop-latch arg for the reduction-phi. */
4926 if (j > 0)
4927 def = vect_get_vec_def_for_stmt_copy (vect_unknown_def_type, def);
4929 add_phi_arg (as_a <gphi *> (phi), def, loop_latch_edge (loop),
4930 UNKNOWN_LOCATION);
4932 if (dump_enabled_p ())
4934 dump_printf_loc (MSG_NOTE, vect_location,
4935 "transform reduction: created def-use cycle: ");
4936 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
4937 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, SSA_NAME_DEF_STMT (def), 0);
4942 /* For cond reductions we want to create a new vector (INDEX_COND_EXPR)
4943 which is updated with the current index of the loop for every match of
4944 the original loop's cond_expr (VEC_STMT). This results in a vector
4945 containing the last time the condition passed for that vector lane.
4946 The first match will be a 1 to allow 0 to be used for non-matching
4947 indexes. If there are no matches at all then the vector will be all
4948 zeroes. */
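/* Illustration (4 lanes, two vector iterations): the IV below holds
   {1, 2, 3, 4} in the first iteration and {5, 6, 7, 8} in the second.
   A lane whose condition last held in the second iteration keeps its value
   from {5, 6, 7, 8}, one that matched only in the first keeps its value
   from {1, 2, 3, 4}, and a lane that never matched stays 0.  */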
4949 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION)
4951 tree indx_before_incr, indx_after_incr;
4952 poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype);
4954 gimple *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info);
4955 gcc_assert (gimple_assign_rhs_code (vec_stmt) == VEC_COND_EXPR);
4957 int scalar_precision
4958 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (TREE_TYPE (vectype)));
4959 tree cr_index_scalar_type = make_unsigned_type (scalar_precision);
4960 tree cr_index_vector_type = build_vector_type
4961 (cr_index_scalar_type, TYPE_VECTOR_SUBPARTS (vectype));
4963 /* First we create a simple vector induction variable which starts
4964 with the values {1,2,3,...} (SERIES_VECT) and increments by the
4965 vector size (STEP). */
4967 /* Create a {1,2,3,...} vector. */
4968 tree series_vect = build_index_vector (cr_index_vector_type, 1, 1);
4970 /* Create a vector of the step value. */
4971 tree step = build_int_cst (cr_index_scalar_type, nunits_out);
4972 tree vec_step = build_vector_from_val (cr_index_vector_type, step);
4974 /* Create an induction variable. */
4975 gimple_stmt_iterator incr_gsi;
4976 bool insert_after;
4977 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
4978 create_iv (series_vect, vec_step, NULL_TREE, loop, &incr_gsi,
4979 insert_after, &indx_before_incr, &indx_after_incr);
4981 /* Next create a new phi node vector (NEW_PHI_TREE) which starts
4982 filled with zeros (VEC_ZERO). */
4984 /* Create a vector of 0s. */
4985 tree zero = build_zero_cst (cr_index_scalar_type);
4986 tree vec_zero = build_vector_from_val (cr_index_vector_type, zero);
4988 /* Create a vector phi node. */
4989 tree new_phi_tree = make_ssa_name (cr_index_vector_type);
4990 new_phi = create_phi_node (new_phi_tree, loop->header);
4991 set_vinfo_for_stmt (new_phi,
4992 new_stmt_vec_info (new_phi, loop_vinfo));
4993 add_phi_arg (as_a <gphi *> (new_phi), vec_zero,
4994 loop_preheader_edge (loop), UNKNOWN_LOCATION);
4996 /* Now take the condition from the loops original cond_expr
4997 (VEC_STMT) and produce a new cond_expr (INDEX_COND_EXPR) which for
4998 every match uses values from the induction variable
4999 (INDEX_BEFORE_INCR) otherwise uses values from the phi node
5000 (NEW_PHI_TREE).
5001 Finally, we update the phi (NEW_PHI_TREE) to take the value of
5002 the new cond_expr (INDEX_COND_EXPR). */
5004 /* Duplicate the condition from vec_stmt. */
5005 tree ccompare = unshare_expr (gimple_assign_rhs1 (vec_stmt));
5007 /* Create a conditional, where the condition is taken from vec_stmt
5008 (CCOMPARE), then is the induction index (INDEX_BEFORE_INCR) and
5009 else is the phi (NEW_PHI_TREE). */
5010 tree index_cond_expr = build3 (VEC_COND_EXPR, cr_index_vector_type,
5011 ccompare, indx_before_incr,
5012 new_phi_tree);
5013 induction_index = make_ssa_name (cr_index_vector_type);
5014 gimple *index_condition = gimple_build_assign (induction_index,
5015 index_cond_expr);
5016 gsi_insert_before (&incr_gsi, index_condition, GSI_SAME_STMT);
5017 stmt_vec_info index_vec_info = new_stmt_vec_info (index_condition,
5018 loop_vinfo);
5019 STMT_VINFO_VECTYPE (index_vec_info) = cr_index_vector_type;
5020 set_vinfo_for_stmt (index_condition, index_vec_info);
5022 /* Update the phi with the vec cond. */
5023 add_phi_arg (as_a <gphi *> (new_phi), induction_index,
5024 loop_latch_edge (loop), UNKNOWN_LOCATION);
5027 /* 2. Create epilog code.
5028 The reduction epilog code operates across the elements of the vector
5029 of partial results computed by the vectorized loop.
5030 The reduction epilog code consists of:
5032 step 1: compute the scalar result in a vector (v_out2)
5033 step 2: extract the scalar result (s_out3) from the vector (v_out2)
5034 step 3: adjust the scalar result (s_out3) if needed.
5036 Step 1 can be accomplished using one the following three schemes:
5037 (scheme 1) using reduc_fn, if available.
5038 (scheme 2) using whole-vector shifts, if available.
5039 (scheme 3) using a scalar loop. In this case steps 1+2 above are
5040 combined.
5042 The overall epilog code looks like this:
5044 s_out0 = phi <s_loop> # original EXIT_PHI
5045 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
5046 v_out2 = reduce <v_out1> # step 1
5047 s_out3 = extract_field <v_out2, 0> # step 2
5048 s_out4 = adjust_result <s_out3> # step 3
5050 (step 3 is optional, and steps 1 and 2 may be combined).
5051 Lastly, the uses of s_out0 are replaced by s_out4. */
5054 /* 2.1 Create new loop-exit-phis to preserve loop-closed form:
5055 v_out1 = phi <VECT_DEF>
5056 Store them in NEW_PHIS. */
5058 exit_bb = single_exit (loop)->dest;
5059 prev_phi_info = NULL;
5060 new_phis.create (vect_defs.length ());
5061 FOR_EACH_VEC_ELT (vect_defs, i, def)
5063 for (j = 0; j < ncopies; j++)
5065 tree new_def = copy_ssa_name (def);
5066 phi = create_phi_node (new_def, exit_bb);
5067 set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, loop_vinfo));
5068 if (j == 0)
5069 new_phis.quick_push (phi);
5070 else
5072 def = vect_get_vec_def_for_stmt_copy (dt, def);
5073 STMT_VINFO_RELATED_STMT (prev_phi_info) = phi;
5076 SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, def);
5077 prev_phi_info = vinfo_for_stmt (phi);
5081 /* The epilogue is created for the outer-loop, i.e., for the loop being
5082 vectorized. Create exit phis for the outer loop. */
5083 if (double_reduc)
5085 loop = outer_loop;
5086 exit_bb = single_exit (loop)->dest;
5087 inner_phis.create (vect_defs.length ());
5088 FOR_EACH_VEC_ELT (new_phis, i, phi)
5090 tree new_result = copy_ssa_name (PHI_RESULT (phi));
5091 gphi *outer_phi = create_phi_node (new_result, exit_bb);
5092 SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx,
5093 PHI_RESULT (phi));
5094 set_vinfo_for_stmt (outer_phi, new_stmt_vec_info (outer_phi,
5095 loop_vinfo));
5096 inner_phis.quick_push (phi);
5097 new_phis[i] = outer_phi;
5098 prev_phi_info = vinfo_for_stmt (outer_phi);
5099 while (STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi)))
5101 phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi));
5102 new_result = copy_ssa_name (PHI_RESULT (phi));
5103 outer_phi = create_phi_node (new_result, exit_bb);
5104 SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx,
5105 PHI_RESULT (phi));
5106 set_vinfo_for_stmt (outer_phi, new_stmt_vec_info (outer_phi,
5107 loop_vinfo));
5108 STMT_VINFO_RELATED_STMT (prev_phi_info) = outer_phi;
5109 prev_phi_info = vinfo_for_stmt (outer_phi);
5114 exit_gsi = gsi_after_labels (exit_bb);
5116 /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
5117 (i.e. when reduc_fn is not available) and in the final adjustment
5118 code (if needed). Also get the original scalar reduction variable as
5119 defined in the loop. In case STMT is a "pattern-stmt" (i.e. - it
5120 represents a reduction pattern), the tree-code and scalar-def are
5121 taken from the original stmt that the pattern-stmt (STMT) replaces.
5122 Otherwise (it is a regular reduction) - the tree-code and scalar-def
5123 are taken from STMT. */
5125 orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
5126 if (!orig_stmt)
5128 /* Regular reduction */
5129 orig_stmt = stmt;
5131 else
5133 /* Reduction pattern */
5134 stmt_vec_info stmt_vinfo = vinfo_for_stmt (orig_stmt);
5135 gcc_assert (STMT_VINFO_IN_PATTERN_P (stmt_vinfo));
5136 gcc_assert (STMT_VINFO_RELATED_STMT (stmt_vinfo) == stmt);
5139 code = gimple_assign_rhs_code (orig_stmt);
5140 /* For MINUS_EXPR the initial vector is [init_val,0,...,0], therefore,
5141 partial results are added and not subtracted. */
5142 if (code == MINUS_EXPR)
5143 code = PLUS_EXPR;
5145 scalar_dest = gimple_assign_lhs (orig_stmt);
5146 scalar_type = TREE_TYPE (scalar_dest);
5147 scalar_results.create (group_size);
5148 new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
5149 bitsize = TYPE_SIZE (scalar_type);
5151 /* In case this is a reduction in an inner-loop while vectorizing an outer
5152 loop - we don't need to extract a single scalar result at the end of the
5153 inner-loop (unless it is double reduction, i.e., the use of reduction is
5154 outside the outer-loop). The final vector of partial results will be used
5155 in the vectorized outer-loop, or reduced to a scalar result at the end of
5156 the outer-loop. */
5157 if (nested_in_vect_loop && !double_reduc)
5158 goto vect_finalize_reduction;
5160 /* SLP reduction without reduction chain, e.g.,
5161 # a1 = phi <a2, a0>
5162 # b1 = phi <b2, b0>
5163 a2 = operation (a1)
5164 b2 = operation (b1) */
5165 slp_reduc = (slp_node && !GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)));
5167 /* True if we should implement SLP_REDUC using native reduction operations
5168 instead of scalar operations. */
5169 direct_slp_reduc = (reduc_fn != IFN_LAST
5170 && slp_reduc
5171 && !TYPE_VECTOR_SUBPARTS (vectype).is_constant ());
5173 /* In case of reduction chain, e.g.,
5174 # a1 = phi <a3, a0>
5175 a2 = operation (a1)
5176 a3 = operation (a2),
5178 we may end up with more than one vector result. Here we reduce them to
5179 one vector. */
5180 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) || direct_slp_reduc)
5182 tree first_vect = PHI_RESULT (new_phis[0]);
5183 gassign *new_vec_stmt = NULL;
5184 vec_dest = vect_create_destination_var (scalar_dest, vectype);
5185 for (k = 1; k < new_phis.length (); k++)
5187 gimple *next_phi = new_phis[k];
5188 tree second_vect = PHI_RESULT (next_phi);
5189 tree tem = make_ssa_name (vec_dest, new_vec_stmt);
5190 new_vec_stmt = gimple_build_assign (tem, code,
5191 first_vect, second_vect);
5192 gsi_insert_before (&exit_gsi, new_vec_stmt, GSI_SAME_STMT);
5193 first_vect = tem;
5196 new_phi_result = first_vect;
5197 if (new_vec_stmt)
5199 new_phis.truncate (0);
5200 new_phis.safe_push (new_vec_stmt);
5203 /* Likewise if we couldn't use a single def-use cycle. */
5204 else if (ncopies > 1)
5206 gcc_assert (new_phis.length () == 1);
5207 tree first_vect = PHI_RESULT (new_phis[0]);
5208 gassign *new_vec_stmt = NULL;
5209 vec_dest = vect_create_destination_var (scalar_dest, vectype);
5210 gimple *next_phi = new_phis[0];
5211 for (int k = 1; k < ncopies; ++k)
5213 next_phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (next_phi));
5214 tree second_vect = PHI_RESULT (next_phi);
5215 tree tem = make_ssa_name (vec_dest, new_vec_stmt);
5216 new_vec_stmt = gimple_build_assign (tem, code,
5217 first_vect, second_vect);
5218 gsi_insert_before (&exit_gsi, new_vec_stmt, GSI_SAME_STMT);
5219 first_vect = tem;
5221 new_phi_result = first_vect;
5222 new_phis.truncate (0);
5223 new_phis.safe_push (new_vec_stmt);
5225 else
5226 new_phi_result = PHI_RESULT (new_phis[0]);
5228 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION
5229 && reduc_fn != IFN_LAST)
5231 /* For condition reductions, we have a vector (NEW_PHI_RESULT) containing
5232 various data values where the condition matched and another vector
5233 (INDUCTION_INDEX) containing all the indexes of those matches. We
5234 need to extract the last matching index (which will be the index with
5235 highest value) and use this to index into the data vector.
5236 For the case where there were no matches, the data vector will contain
5237 all default values and the index vector will be all zeros. */
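/* Illustration (4 lanes): with INDUCTION_INDEX = {0, 7, 3, 0}, the
   REDUC_MAX below yields 7, the EQ_EXPR/VEC_COND_EXPR pair keeps only the
   data value from lane 1 (all other lanes become zero), and a second
   REDUC_MAX over the vector viewed as unsigned extracts that surviving
   value, standing in for the OR-style extraction described further down.  */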
5239 /* Get various versions of the type of the vector of indexes. */
5240 tree index_vec_type = TREE_TYPE (induction_index);
5241 gcc_checking_assert (TYPE_UNSIGNED (index_vec_type));
5242 tree index_scalar_type = TREE_TYPE (index_vec_type);
5243 tree index_vec_cmp_type = build_same_sized_truth_vector_type
5244 (index_vec_type);
5246 /* Get an unsigned integer version of the type of the data vector. */
5247 int scalar_precision
5248 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
5249 tree scalar_type_unsigned = make_unsigned_type (scalar_precision);
5250 tree vectype_unsigned = build_vector_type
5251 (scalar_type_unsigned, TYPE_VECTOR_SUBPARTS (vectype));
5253 /* First we need to create a vector (ZERO_VEC) of zeros and another
5254 vector (MAX_INDEX_VEC) filled with the last matching index, which we
5255 can create using a MAX reduction and then expanding.
5256 In the case where the loop never made any matches, the max index will
5257 be zero. */
5259 /* Vector of {0, 0, 0,...}. */
5260 tree zero_vec = make_ssa_name (vectype);
5261 tree zero_vec_rhs = build_zero_cst (vectype);
5262 gimple *zero_vec_stmt = gimple_build_assign (zero_vec, zero_vec_rhs);
5263 gsi_insert_before (&exit_gsi, zero_vec_stmt, GSI_SAME_STMT);
5265 /* Find maximum value from the vector of found indexes. */
5266 tree max_index = make_ssa_name (index_scalar_type);
5267 gcall *max_index_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
5268 1, induction_index);
5269 gimple_call_set_lhs (max_index_stmt, max_index);
5270 gsi_insert_before (&exit_gsi, max_index_stmt, GSI_SAME_STMT);
5272 /* Vector of {max_index, max_index, max_index,...}. */
5273 tree max_index_vec = make_ssa_name (index_vec_type);
5274 tree max_index_vec_rhs = build_vector_from_val (index_vec_type,
5275 max_index);
5276 gimple *max_index_vec_stmt = gimple_build_assign (max_index_vec,
5277 max_index_vec_rhs);
5278 gsi_insert_before (&exit_gsi, max_index_vec_stmt, GSI_SAME_STMT);
5280 /* Next we compare the new vector (MAX_INDEX_VEC) full of max indexes
5281 with the vector (INDUCTION_INDEX) of found indexes, choosing values
5282 from the data vector (NEW_PHI_RESULT) for matches, 0 (ZERO_VEC)
5283 otherwise. Only one value should match, resulting in a vector
5284 (VEC_COND) with one data value and the rest zeros.
5285 In the case where the loop never made any matches, every index will
5286 match, resulting in a vector with all data values (which will all be
5287 the default value). */
5289 /* Compare the max index vector to the vector of found indexes to find
5290 the position of the max value. */
5291 tree vec_compare = make_ssa_name (index_vec_cmp_type);
5292 gimple *vec_compare_stmt = gimple_build_assign (vec_compare, EQ_EXPR,
5293 induction_index,
5294 max_index_vec);
5295 gsi_insert_before (&exit_gsi, vec_compare_stmt, GSI_SAME_STMT);
5297 /* Use the compare to choose either values from the data vector or
5298 zero. */
5299 tree vec_cond = make_ssa_name (vectype);
5300 gimple *vec_cond_stmt = gimple_build_assign (vec_cond, VEC_COND_EXPR,
5301 vec_compare, new_phi_result,
5302 zero_vec);
5303 gsi_insert_before (&exit_gsi, vec_cond_stmt, GSI_SAME_STMT);
5305 /* Finally we need to extract the data value from the vector (VEC_COND)
5306 into a scalar (MATCHED_DATA_REDUC). Logically we want to do an OR
5307 reduction, but because this doesn't exist, we can use a MAX reduction
5308 instead. The data value might be signed or a float so we need to cast
5309 it first.
5310 In the case where the loop never made any matches, the data values are
5311 all identical, and so will reduce down correctly. */
5313 /* Make the matched data values unsigned. */
5314 tree vec_cond_cast = make_ssa_name (vectype_unsigned);
5315 tree vec_cond_cast_rhs = build1 (VIEW_CONVERT_EXPR, vectype_unsigned,
5316 vec_cond);
5317 gimple *vec_cond_cast_stmt = gimple_build_assign (vec_cond_cast,
5318 VIEW_CONVERT_EXPR,
5319 vec_cond_cast_rhs);
5320 gsi_insert_before (&exit_gsi, vec_cond_cast_stmt, GSI_SAME_STMT);
5322 /* Reduce down to a scalar value. */
5323 tree data_reduc = make_ssa_name (scalar_type_unsigned);
5324 gcall *data_reduc_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
5325 1, vec_cond_cast);
5326 gimple_call_set_lhs (data_reduc_stmt, data_reduc);
5327 gsi_insert_before (&exit_gsi, data_reduc_stmt, GSI_SAME_STMT);
5329 /* Convert the reduced value back to the result type and set as the
5330 result. */
5331 gimple_seq stmts = NULL;
5332 new_temp = gimple_build (&stmts, VIEW_CONVERT_EXPR, scalar_type,
5333 data_reduc);
5334 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5335 scalar_results.safe_push (new_temp);
5337 else if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION
5338 && reduc_fn == IFN_LAST)
5340 /* Condition reduction without supported IFN_REDUC_MAX. Generate
5341 idx = 0;
5342 idx_val = induction_index[0];
5343 val = data_reduc[0];
5344 for (idx = 0, val = init, i = 0; i < nelts; ++i)
5345 if (induction_index[i] > idx_val)
5346 val = data_reduc[i], idx_val = induction_index[i];
5347 return val; */
5349 tree data_eltype = TREE_TYPE (TREE_TYPE (new_phi_result));
5350 tree idx_eltype = TREE_TYPE (TREE_TYPE (induction_index));
5351 unsigned HOST_WIDE_INT el_size = tree_to_uhwi (TYPE_SIZE (idx_eltype));
5352 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (TREE_TYPE (induction_index));
5353 /* Enforced by vectorizable_reduction, which ensures we have target
5354 support before allowing a conditional reduction on variable-length
5355 vectors. */
5356 unsigned HOST_WIDE_INT v_size = el_size * nunits.to_constant ();
5357 tree idx_val = NULL_TREE, val = NULL_TREE;
5358 for (unsigned HOST_WIDE_INT off = 0; off < v_size; off += el_size)
5360 tree old_idx_val = idx_val;
5361 tree old_val = val;
5362 idx_val = make_ssa_name (idx_eltype);
5363 epilog_stmt = gimple_build_assign (idx_val, BIT_FIELD_REF,
5364 build3 (BIT_FIELD_REF, idx_eltype,
5365 induction_index,
5366 bitsize_int (el_size),
5367 bitsize_int (off)));
5368 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5369 val = make_ssa_name (data_eltype);
5370 epilog_stmt = gimple_build_assign (val, BIT_FIELD_REF,
5371 build3 (BIT_FIELD_REF,
5372 data_eltype,
5373 new_phi_result,
5374 bitsize_int (el_size),
5375 bitsize_int (off)));
5376 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5377 if (off != 0)
5379 tree new_idx_val = idx_val;
5380 tree new_val = val;
5381 if (off != v_size - el_size)
5383 new_idx_val = make_ssa_name (idx_eltype);
5384 epilog_stmt = gimple_build_assign (new_idx_val,
5385 MAX_EXPR, idx_val,
5386 old_idx_val);
5387 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5389 new_val = make_ssa_name (data_eltype);
5390 epilog_stmt = gimple_build_assign (new_val,
5391 COND_EXPR,
5392 build2 (GT_EXPR,
5393 boolean_type_node,
5394 idx_val,
5395 old_idx_val),
5396 val, old_val);
5397 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5398 idx_val = new_idx_val;
5399 val = new_val;
5402 /* Convert the reduced value back to the result type and set as the
5403 result. */
5404 gimple_seq stmts = NULL;
5405 val = gimple_convert (&stmts, scalar_type, val);
5406 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5407 scalar_results.safe_push (val);
5410 /* 2.3 Create the reduction code, using one of the three schemes described
5411 above. In SLP we simply need to extract all the elements from the
5412 vector (without reducing them), so we use scalar shifts. */
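  /* Illustrative recap of the schemes handled below:
       Case 1: the target provides a direct vector reduction (REDUC_FN).
       Case 2: reduce by repeatedly shifting the vector and combining the
	       halves (log2 (nelements) steps).
       Case 3: extract every element and reduce (or, for SLP, simply
	       collect) the scalars.  */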
5413 else if (reduc_fn != IFN_LAST && !slp_reduc)
5415 tree tmp;
5416 tree vec_elem_type;
5418 /* Case 1: Create:
5419 v_out2 = reduc_expr <v_out1> */
5421 if (dump_enabled_p ())
5422 dump_printf_loc (MSG_NOTE, vect_location,
5423 "Reduce using direct vector reduction.\n");
5425 vec_elem_type = TREE_TYPE (TREE_TYPE (new_phi_result));
5426 if (!useless_type_conversion_p (scalar_type, vec_elem_type))
5428 tree tmp_dest
5429 = vect_create_destination_var (scalar_dest, vec_elem_type);
5430 epilog_stmt = gimple_build_call_internal (reduc_fn, 1,
5431 new_phi_result);
5432 gimple_set_lhs (epilog_stmt, tmp_dest);
5433 new_temp = make_ssa_name (tmp_dest, epilog_stmt);
5434 gimple_set_lhs (epilog_stmt, new_temp);
5435 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5437 epilog_stmt = gimple_build_assign (new_scalar_dest, NOP_EXPR,
5438 new_temp);
5440 else
5442 epilog_stmt = gimple_build_call_internal (reduc_fn, 1,
5443 new_phi_result);
5444 gimple_set_lhs (epilog_stmt, new_scalar_dest);
5447 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5448 gimple_set_lhs (epilog_stmt, new_temp);
5449 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5451 if ((STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5452 == INTEGER_INDUC_COND_REDUCTION)
5453 && !operand_equal_p (initial_def, induc_val, 0))
5455 /* Earlier we set the initial value to be a vector of induc_val
5456 values.  Check the result and, if it is induc_val, replace it
5457 with the original initial value, unless induc_val is
5458 the same as initial_def already. */
5459 tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp,
5460 induc_val);
5462 tmp = make_ssa_name (new_scalar_dest);
5463 epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
5464 initial_def, new_temp);
5465 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5466 new_temp = tmp;
5469 scalar_results.safe_push (new_temp);
5471 else if (direct_slp_reduc)
5473 /* Here we create one vector for each of the GROUP_SIZE results,
5474 with the elements for other SLP statements replaced with the
5475 neutral value. We can then do a normal reduction on each vector. */
5477 /* Enforced by vectorizable_reduction. */
5478 gcc_assert (new_phis.length () == 1);
5479 gcc_assert (pow2p_hwi (group_size));
5481 slp_tree orig_phis_slp_node = slp_node_instance->reduc_phis;
5482 vec<gimple *> orig_phis = SLP_TREE_SCALAR_STMTS (orig_phis_slp_node);
5483 gimple_seq seq = NULL;
5485 /* Build a vector {0, 1, 2, ...}, with the same number of elements
5486 and the same element size as VECTYPE. */
5487 tree index = build_index_vector (vectype, 0, 1);
5488 tree index_type = TREE_TYPE (index);
5489 tree index_elt_type = TREE_TYPE (index_type);
5490 tree mask_type = build_same_sized_truth_vector_type (index_type);
5492 /* Create a vector that, for each element, identifies which of
5493 the GROUP_SIZE results should use it. */
5494 tree index_mask = build_int_cst (index_elt_type, group_size - 1);
5495 index = gimple_build (&seq, BIT_AND_EXPR, index_type, index,
5496 build_vector_from_val (index_type, index_mask));
5498 /* Get a neutral vector value. This is simply a splat of the neutral
5499 scalar value if we have one, otherwise the initial scalar value
5500 is itself a neutral value. */
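	 /* For example (illustrative): 0 is the neutral value for PLUS and
	    1 for MULT, whereas MIN and MAX have no universal neutral value,
	    so each lane falls back to the initial value of its original
	    scalar PHI below.  */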
5501 tree vector_identity = NULL_TREE;
5502 if (neutral_op)
5503 vector_identity = gimple_build_vector_from_val (&seq, vectype,
5504 neutral_op);
5505 for (unsigned int i = 0; i < group_size; ++i)
5507 /* If there's no universal neutral value, we can use the
5508 initial scalar value from the original PHI. This is used
5509 for MIN and MAX reductions, for example. */
5510 if (!neutral_op)
5512 tree scalar_value
5513 = PHI_ARG_DEF_FROM_EDGE (orig_phis[i],
5514 loop_preheader_edge (loop));
5515 vector_identity = gimple_build_vector_from_val (&seq, vectype,
5516 scalar_value);
5519 /* Calculate the equivalent of:
5521 sel[j] = (index[j] == i);
5523 which selects the elements of NEW_PHI_RESULT that should
5524 be included in the result. */
5525 tree compare_val = build_int_cst (index_elt_type, i);
5526 compare_val = build_vector_from_val (index_type, compare_val);
5527 tree sel = gimple_build (&seq, EQ_EXPR, mask_type,
5528 index, compare_val);
5530 /* Calculate the equivalent of:
5532 vec = seq ? new_phi_result : vector_identity;
5534 VEC is now suitable for a full vector reduction. */
5535 tree vec = gimple_build (&seq, VEC_COND_EXPR, vectype,
5536 sel, new_phi_result, vector_identity);
5538 /* Do the reduction and convert it to the appropriate type. */
5539 gcall *call = gimple_build_call_internal (reduc_fn, 1, vec);
5540 tree scalar = make_ssa_name (TREE_TYPE (vectype));
5541 gimple_call_set_lhs (call, scalar);
5542 gimple_seq_add_stmt (&seq, call);
5543 scalar = gimple_convert (&seq, scalar_type, scalar);
5544 scalar_results.safe_push (scalar);
5546 gsi_insert_seq_before (&exit_gsi, seq, GSI_SAME_STMT);
5548 else
5550 bool reduce_with_shift;
5551 tree vec_temp;
5553 /* COND reductions all do the final reduction with MAX_EXPR
5554 or MIN_EXPR. */
5555 if (code == COND_EXPR)
5557 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5558 == INTEGER_INDUC_COND_REDUCTION)
5559 code = induc_code;
5560 else
5561 code = MAX_EXPR;
5564 /* See if the target wants to do the final (shift) reduction
5565 in a vector mode of smaller size and first reduce upper/lower
5566 halves against each other. */
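      /* For example (illustrative), a target might prefer to reduce a
	 256-bit accumulator by first combining its two 128-bit halves and
	 then doing the shift reduction in the narrower mode.  */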
5567 enum machine_mode mode1 = mode;
5568 tree vectype1 = vectype;
5569 unsigned sz = tree_to_uhwi (TYPE_SIZE_UNIT (vectype));
5570 unsigned sz1 = sz;
5571 if (!slp_reduc
5572 && (mode1 = targetm.vectorize.split_reduction (mode)) != mode)
5573 sz1 = GET_MODE_SIZE (mode1).to_constant ();
5575 vectype1 = get_vectype_for_scalar_type_and_size (scalar_type, sz1);
5576 reduce_with_shift = have_whole_vector_shift (mode1);
5577 if (!VECTOR_MODE_P (mode1))
5578 reduce_with_shift = false;
5579 else
5581 optab optab = optab_for_tree_code (code, vectype1, optab_default);
5582 if (optab_handler (optab, mode1) == CODE_FOR_nothing)
5583 reduce_with_shift = false;
5586 /* First reduce the vector to the desired vector size we should
5587 do shift reduction on by combining upper and lower halves. */
5588 new_temp = new_phi_result;
5589 while (sz > sz1)
5591 gcc_assert (!slp_reduc);
5592 sz /= 2;
5593 vectype1 = get_vectype_for_scalar_type_and_size (scalar_type, sz);
5595 /* The target has to make sure we support lowpart/highpart
5596 extraction, either via direct vector extract or through
5597 integer mode punning. */
5598 tree dst1, dst2;
5599 if (convert_optab_handler (vec_extract_optab,
5600 TYPE_MODE (TREE_TYPE (new_temp)),
5601 TYPE_MODE (vectype1))
5602 != CODE_FOR_nothing)
5604 /* Extract sub-vectors directly once vec_extract becomes
5605 a conversion optab. */
5606 dst1 = make_ssa_name (vectype1);
5607 epilog_stmt
5608 = gimple_build_assign (dst1, BIT_FIELD_REF,
5609 build3 (BIT_FIELD_REF, vectype1,
5610 new_temp, TYPE_SIZE (vectype1),
5611 bitsize_int (0)));
5612 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5613 dst2 = make_ssa_name (vectype1);
5614 epilog_stmt
5615 = gimple_build_assign (dst2, BIT_FIELD_REF,
5616 build3 (BIT_FIELD_REF, vectype1,
5617 new_temp, TYPE_SIZE (vectype1),
5618 bitsize_int (sz * BITS_PER_UNIT)));
5619 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5621 else
5623 /* Extract via punning to appropriately sized integer mode
5624 vector. */
5625 tree eltype = build_nonstandard_integer_type (sz * BITS_PER_UNIT, 1);
5627 tree etype = build_vector_type (eltype, 2);
5628 gcc_assert (convert_optab_handler (vec_extract_optab,
5629 TYPE_MODE (etype),
5630 TYPE_MODE (eltype))
5631 != CODE_FOR_nothing);
5632 tree tem = make_ssa_name (etype);
5633 epilog_stmt = gimple_build_assign (tem, VIEW_CONVERT_EXPR,
5634 build1 (VIEW_CONVERT_EXPR,
5635 etype, new_temp));
5636 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5637 new_temp = tem;
5638 tem = make_ssa_name (eltype);
5639 epilog_stmt
5640 = gimple_build_assign (tem, BIT_FIELD_REF,
5641 build3 (BIT_FIELD_REF, eltype,
5642 new_temp, TYPE_SIZE (eltype),
5643 bitsize_int (0)));
5644 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5645 dst1 = make_ssa_name (vectype1);
5646 epilog_stmt = gimple_build_assign (dst1, VIEW_CONVERT_EXPR,
5647 build1 (VIEW_CONVERT_EXPR,
5648 vectype1, tem));
5649 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5650 tem = make_ssa_name (eltype);
5651 epilog_stmt
5652 = gimple_build_assign (tem, BIT_FIELD_REF,
5653 build3 (BIT_FIELD_REF, eltype,
5654 new_temp, TYPE_SIZE (eltype),
5655 bitsize_int (sz * BITS_PER_UNIT)));
5656 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5657 dst2 = make_ssa_name (vectype1);
5658 epilog_stmt = gimple_build_assign (dst2, VIEW_CONVERT_EXPR,
5659 build1 (VIEW_CONVERT_EXPR,
5660 vectype1, tem));
5661 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5664 new_temp = make_ssa_name (vectype1);
5665 epilog_stmt = gimple_build_assign (new_temp, code, dst1, dst2);
5666 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5669 if (reduce_with_shift && !slp_reduc)
5671 int element_bitsize = tree_to_uhwi (bitsize);
5672 /* Enforced by vectorizable_reduction, which disallows SLP reductions
5673 for variable-length vectors and also requires direct target support
5674 for loop reductions. */
5675 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
5676 int nelements = vec_size_in_bits / element_bitsize;
5677 vec_perm_builder sel;
5678 vec_perm_indices indices;
5680 int elt_offset;
5682 tree zero_vec = build_zero_cst (vectype1);
5683 /* Case 2: Create:
5684 for (offset = nelements/2; offset >= 1; offset/=2)
5686 Create: va' = vec_shift <va, offset>
5687 Create: va = vop <va, va'>
5688 } */
5690 tree rhs;
5692 if (dump_enabled_p ())
5693 dump_printf_loc (MSG_NOTE, vect_location,
5694 "Reduce using vector shifts\n");
5696 mode1 = TYPE_MODE (vectype1);
5697 vec_dest = vect_create_destination_var (scalar_dest, vectype1);
5698 for (elt_offset = nelements / 2;
5699 elt_offset >= 1;
5700 elt_offset /= 2)
5702 calc_vec_perm_mask_for_shift (elt_offset, nelements, &sel);
5703 indices.new_vector (sel, 2, nelements);
5704 tree mask = vect_gen_perm_mask_any (vectype1, indices);
5705 epilog_stmt = gimple_build_assign (vec_dest, VEC_PERM_EXPR,
5706 new_temp, zero_vec, mask);
5707 new_name = make_ssa_name (vec_dest, epilog_stmt);
5708 gimple_assign_set_lhs (epilog_stmt, new_name);
5709 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5711 epilog_stmt = gimple_build_assign (vec_dest, code, new_name,
5712 new_temp);
5713 new_temp = make_ssa_name (vec_dest, epilog_stmt);
5714 gimple_assign_set_lhs (epilog_stmt, new_temp);
5715 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5718 /* 2.4 Extract the final scalar result. Create:
5719 s_out3 = extract_field <v_out2, bitpos> */
5721 if (dump_enabled_p ())
5722 dump_printf_loc (MSG_NOTE, vect_location,
5723 "extract scalar result\n");
5725 rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp,
5726 bitsize, bitsize_zero_node);
5727 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
5728 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5729 gimple_assign_set_lhs (epilog_stmt, new_temp);
5730 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5731 scalar_results.safe_push (new_temp);
5733 else
5735 /* Case 3: Create:
5736 s = extract_field <v_out2, 0>
5737 for (offset = element_size;
5738 offset < vector_size;
5739 offset += element_size;)
5741 Create: s' = extract_field <v_out2, offset>
5742 Create: s = op <s, s'> // For non SLP cases
5743 } */
5745 if (dump_enabled_p ())
5746 dump_printf_loc (MSG_NOTE, vect_location,
5747 "Reduce using scalar code.\n");
5749 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
5750 int element_bitsize = tree_to_uhwi (bitsize);
5751 FOR_EACH_VEC_ELT (new_phis, i, new_phi)
5753 int bit_offset;
5754 if (gimple_code (new_phi) == GIMPLE_PHI)
5755 vec_temp = PHI_RESULT (new_phi);
5756 else
5757 vec_temp = gimple_assign_lhs (new_phi);
5758 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, bitsize,
5759 bitsize_zero_node);
5760 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
5761 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5762 gimple_assign_set_lhs (epilog_stmt, new_temp);
5763 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5765 /* In SLP we don't need to apply the reduction operation, so we just
5766 collect the s' values in SCALAR_RESULTS. */
5767 if (slp_reduc)
5768 scalar_results.safe_push (new_temp);
5770 for (bit_offset = element_bitsize;
5771 bit_offset < vec_size_in_bits;
5772 bit_offset += element_bitsize)
5774 tree bitpos = bitsize_int (bit_offset);
5775 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp,
5776 bitsize, bitpos);
5778 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
5779 new_name = make_ssa_name (new_scalar_dest, epilog_stmt);
5780 gimple_assign_set_lhs (epilog_stmt, new_name);
5781 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5783 if (slp_reduc)
5785 /* In SLP we don't need to apply the reduction operation, so
5786 we just collect the s' values in SCALAR_RESULTS. */
5787 new_temp = new_name;
5788 scalar_results.safe_push (new_name);
5790 else
5792 epilog_stmt = gimple_build_assign (new_scalar_dest, code,
5793 new_name, new_temp);
5794 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5795 gimple_assign_set_lhs (epilog_stmt, new_temp);
5796 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5801 /* The only case in which we need to reduce scalar results in SLP is
5802 unrolling.  If the size of SCALAR_RESULTS is greater than
5803 GROUP_SIZE, we reduce them by combining elements modulo
5804 GROUP_SIZE. */
5805 if (slp_reduc)
5807 tree res, first_res, new_res;
5808 gimple *new_stmt;
5810 /* Reduce multiple scalar results in case of SLP unrolling. */
5811 for (j = group_size; scalar_results.iterate (j, &res);
5812 j++)
5814 first_res = scalar_results[j % group_size];
5815 new_stmt = gimple_build_assign (new_scalar_dest, code,
5816 first_res, res);
5817 new_res = make_ssa_name (new_scalar_dest, new_stmt);
5818 gimple_assign_set_lhs (new_stmt, new_res);
5819 gsi_insert_before (&exit_gsi, new_stmt, GSI_SAME_STMT);
5820 scalar_results[j % group_size] = new_res;
5823 else
5824 /* Not SLP - we have one scalar to keep in SCALAR_RESULTS. */
5825 scalar_results.safe_push (new_temp);
5828 if ((STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5829 == INTEGER_INDUC_COND_REDUCTION)
5830 && !operand_equal_p (initial_def, induc_val, 0))
5832 /* Earlier we set the initial value to be a vector of induc_val
5833 values.  Check the result and, if it is induc_val, replace it
5834 with the original initial value, unless induc_val is
5835 the same as initial_def already. */
5836 tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp,
5837 induc_val);
5839 tree tmp = make_ssa_name (new_scalar_dest);
5840 epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
5841 initial_def, new_temp);
5842 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5843 scalar_results[0] = tmp;
5847 vect_finalize_reduction:
5849 if (double_reduc)
5850 loop = loop->inner;
5852 /* 2.5 Adjust the final result by the initial value of the reduction
5853 variable. (When such adjustment is not needed, then
5854 'adjustment_def' is zero). For example, if code is PLUS we create:
5855 new_temp = loop_exit_def + adjustment_def */
5857 if (adjustment_def)
5859 gcc_assert (!slp_reduc);
5860 if (nested_in_vect_loop)
5862 new_phi = new_phis[0];
5863 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) == VECTOR_TYPE);
5864 expr = build2 (code, vectype, PHI_RESULT (new_phi), adjustment_def);
5865 new_dest = vect_create_destination_var (scalar_dest, vectype);
5867 else
5869 new_temp = scalar_results[0];
5870 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
5871 expr = build2 (code, scalar_type, new_temp, adjustment_def);
5872 new_dest = vect_create_destination_var (scalar_dest, scalar_type);
5875 epilog_stmt = gimple_build_assign (new_dest, expr);
5876 new_temp = make_ssa_name (new_dest, epilog_stmt);
5877 gimple_assign_set_lhs (epilog_stmt, new_temp);
5878 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5879 if (nested_in_vect_loop)
5881 set_vinfo_for_stmt (epilog_stmt,
5882 new_stmt_vec_info (epilog_stmt, loop_vinfo));
5883 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (epilog_stmt)) =
5884 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (new_phi));
5886 if (!double_reduc)
5887 scalar_results.quick_push (new_temp);
5888 else
5889 scalar_results[0] = new_temp;
5891 else
5892 scalar_results[0] = new_temp;
5894 new_phis[0] = epilog_stmt;
5897 /* 2.6 Handle the loop-exit phis. Replace the uses of scalar loop-exit
5898 phis with new adjusted scalar results, i.e., replace use <s_out0>
5899 with use <s_out4>.
5901 Transform:
5902 loop_exit:
5903 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
5904 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
5905 v_out2 = reduce <v_out1>
5906 s_out3 = extract_field <v_out2, 0>
5907 s_out4 = adjust_result <s_out3>
5908 use <s_out0>
5909 use <s_out0>
5911 into:
5913 loop_exit:
5914 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
5915 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
5916 v_out2 = reduce <v_out1>
5917 s_out3 = extract_field <v_out2, 0>
5918 s_out4 = adjust_result <s_out3>
5919 use <s_out4>
5920 use <s_out4> */
5923 /* In an SLP reduction chain we reduce the vector results into one vector
5924 if necessary, hence we set GROUP_SIZE to 1 here.  SCALAR_DEST is the LHS
5925 of the last stmt in the reduction chain, since we are looking for the
5926 loop exit phi node. */
5927 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
5929 gimple *dest_stmt = SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1];
5930 /* Handle reduction patterns. */
5931 if (STMT_VINFO_RELATED_STMT (vinfo_for_stmt (dest_stmt)))
5932 dest_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (dest_stmt));
5934 scalar_dest = gimple_assign_lhs (dest_stmt);
5935 group_size = 1;
5938 /* In SLP we may have several statements in NEW_PHIS and REDUCTION_PHIS (in
5939 case GROUP_SIZE is greater than the vectorization factor). Therefore, we
5940 need to match SCALAR_RESULTS with corresponding statements. The first
5941 (GROUP_SIZE / number of new vector stmts) scalar results correspond to
5942 the first vector stmt, etc.
5943 (RATIO is equal to (GROUP_SIZE / number of new vector stmts)). */
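  /* For example (illustrative): with GROUP_SIZE == 4 and two new vector
     stmts, RATIO == 2, so scalar results 0 and 1 correspond to the first
     vector stmt and results 2 and 3 to the second.  */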
5944 if (group_size > new_phis.length ())
5946 ratio = group_size / new_phis.length ();
5947 gcc_assert (!(group_size % new_phis.length ()));
5949 else
5950 ratio = 1;
5952 for (k = 0; k < group_size; k++)
5954 if (k % ratio == 0)
5956 epilog_stmt = new_phis[k / ratio];
5957 reduction_phi = reduction_phis[k / ratio];
5958 if (double_reduc)
5959 inner_phi = inner_phis[k / ratio];
5962 if (slp_reduc)
5964 gimple *current_stmt = SLP_TREE_SCALAR_STMTS (slp_node)[k];
5966 orig_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (current_stmt));
5967 /* SLP statements can't participate in patterns. */
5968 gcc_assert (!orig_stmt);
5969 scalar_dest = gimple_assign_lhs (current_stmt);
5972 phis.create (3);
5973 /* Find the loop-closed-use at the loop exit of the original scalar
5974 result. (The reduction result is expected to have two immediate uses -
5975 one at the latch block, and one at the loop exit). */
5976 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
5977 if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p)))
5978 && !is_gimple_debug (USE_STMT (use_p)))
5979 phis.safe_push (USE_STMT (use_p));
5981 /* While we expect to have found an exit_phi because of loop-closed-ssa
5982 form, we can end up without one if the scalar cycle is dead. */
5984 FOR_EACH_VEC_ELT (phis, i, exit_phi)
5986 if (outer_loop)
5988 stmt_vec_info exit_phi_vinfo = vinfo_for_stmt (exit_phi);
5989 gphi *vect_phi;
5991 /* FORNOW. Currently not supporting the case that an inner-loop
5992 reduction is not used in the outer-loop (but only outside the
5993 outer-loop), unless it is a double reduction. */
5994 gcc_assert ((STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
5995 && !STMT_VINFO_LIVE_P (exit_phi_vinfo))
5996 || double_reduc);
5998 if (double_reduc)
5999 STMT_VINFO_VEC_STMT (exit_phi_vinfo) = inner_phi;
6000 else
6001 STMT_VINFO_VEC_STMT (exit_phi_vinfo) = epilog_stmt;
6002 if (!double_reduc
6003 || STMT_VINFO_DEF_TYPE (exit_phi_vinfo)
6004 != vect_double_reduction_def)
6005 continue;
6007 /* Handle double reduction:
6009 stmt1: s1 = phi <s0, s2> - double reduction phi (outer loop)
6010 stmt2: s3 = phi <s1, s4> - (regular) reduc phi (inner loop)
6011 stmt3: s4 = use (s3) - (regular) reduc stmt (inner loop)
6012 stmt4: s2 = phi <s4> - double reduction stmt (outer loop)
6014 At that point the regular reduction (stmt2 and stmt3) is
6015 already vectorized, as well as the exit phi node, stmt4.
6016 Here we vectorize the phi node of double reduction, stmt1, and
6017 update all relevant statements. */
6019 /* Go through all the uses of s2 to find double reduction phi
6020 node, i.e., stmt1 above. */
6021 orig_name = PHI_RESULT (exit_phi);
6022 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
6024 stmt_vec_info use_stmt_vinfo;
6025 stmt_vec_info new_phi_vinfo;
6026 tree vect_phi_init, preheader_arg, vect_phi_res;
6027 basic_block bb = gimple_bb (use_stmt);
6028 gimple *use;
6030 /* Check that USE_STMT is really a double reduction phi
6031 node. */
6032 if (gimple_code (use_stmt) != GIMPLE_PHI
6033 || gimple_phi_num_args (use_stmt) != 2
6034 || bb->loop_father != outer_loop)
6035 continue;
6036 use_stmt_vinfo = vinfo_for_stmt (use_stmt);
6037 if (!use_stmt_vinfo
6038 || STMT_VINFO_DEF_TYPE (use_stmt_vinfo)
6039 != vect_double_reduction_def)
6040 continue;
6042 /* Create vector phi node for double reduction:
6043 vs1 = phi <vs0, vs2>
6044 vs1 was created previously in this function by a call to
6045 vect_get_vec_def_for_operand and is stored in
6046 vec_initial_def;
6047 vs2 is defined by INNER_PHI, the vectorized EXIT_PHI;
6048 vs0 is created here. */
6050 /* Create vector phi node. */
6051 vect_phi = create_phi_node (vec_initial_def, bb);
6052 new_phi_vinfo = new_stmt_vec_info (vect_phi,
6053 loop_vec_info_for_loop (outer_loop));
6054 set_vinfo_for_stmt (vect_phi, new_phi_vinfo);
6056 /* Create vs0 - initial def of the double reduction phi. */
6057 preheader_arg = PHI_ARG_DEF_FROM_EDGE (use_stmt,
6058 loop_preheader_edge (outer_loop));
6059 vect_phi_init = get_initial_def_for_reduction
6060 (stmt, preheader_arg, NULL);
6062 /* Update phi node arguments with vs0 and vs2. */
6063 add_phi_arg (vect_phi, vect_phi_init,
6064 loop_preheader_edge (outer_loop),
6065 UNKNOWN_LOCATION);
6066 add_phi_arg (vect_phi, PHI_RESULT (inner_phi),
6067 loop_latch_edge (outer_loop), UNKNOWN_LOCATION);
6068 if (dump_enabled_p ())
6070 dump_printf_loc (MSG_NOTE, vect_location,
6071 "created double reduction phi node: ");
6072 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, vect_phi, 0);
6075 vect_phi_res = PHI_RESULT (vect_phi);
6077 /* Replace the use, i.e., set the correct vs1 in the regular
6078 reduction phi node. FORNOW, NCOPIES is always 1, so the
6079 loop is redundant. */
6080 use = reduction_phi;
6081 for (j = 0; j < ncopies; j++)
6083 edge pr_edge = loop_preheader_edge (loop);
6084 SET_PHI_ARG_DEF (use, pr_edge->dest_idx, vect_phi_res);
6085 use = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (use));
6091 phis.release ();
6092 if (nested_in_vect_loop)
6094 if (double_reduc)
6095 loop = outer_loop;
6096 else
6097 continue;
6100 phis.create (3);
6101 /* Find the loop-closed-use at the loop exit of the original scalar
6102 result. (The reduction result is expected to have two immediate uses,
6103 one at the latch block, and one at the loop exit). For double
6104 reductions we are looking for exit phis of the outer loop. */
6105 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
6107 if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
6109 if (!is_gimple_debug (USE_STMT (use_p)))
6110 phis.safe_push (USE_STMT (use_p));
6112 else
6114 if (double_reduc && gimple_code (USE_STMT (use_p)) == GIMPLE_PHI)
6116 tree phi_res = PHI_RESULT (USE_STMT (use_p));
6118 FOR_EACH_IMM_USE_FAST (phi_use_p, phi_imm_iter, phi_res)
6120 if (!flow_bb_inside_loop_p (loop,
6121 gimple_bb (USE_STMT (phi_use_p)))
6122 && !is_gimple_debug (USE_STMT (phi_use_p)))
6123 phis.safe_push (USE_STMT (phi_use_p));
6129 FOR_EACH_VEC_ELT (phis, i, exit_phi)
6131 /* Replace the uses: */
6132 orig_name = PHI_RESULT (exit_phi);
6133 scalar_result = scalar_results[k];
6134 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
6135 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
6136 SET_USE (use_p, scalar_result);
6139 phis.release ();
6143 /* Return a vector of type VECTYPE that is equal to the vector select
6144 operation "MASK ? VEC : IDENTITY". Insert the select statements
6145 before GSI. */
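/* Note: this is used, for example, when vectorizing an in-order (fold-left)
   reduction in a fully-masked loop, where inactive lanes must be replaced by
   the identity value (e.g. zero for PLUS) so that they do not affect the
   in-order result.  */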
6147 static tree
6148 merge_with_identity (gimple_stmt_iterator *gsi, tree mask, tree vectype,
6149 tree vec, tree identity)
6151 tree cond = make_temp_ssa_name (vectype, NULL, "cond");
6152 gimple *new_stmt = gimple_build_assign (cond, VEC_COND_EXPR,
6153 mask, vec, identity);
6154 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
6155 return cond;
6158 /* Successively apply CODE to each element of VECTOR_RHS, in left-to-right
6159 order, starting with LHS. Insert the extraction statements before GSI and
6160 associate the new scalar SSA names with variable SCALAR_DEST.
6161 Return the SSA name for the result. */
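/* For example (an illustrative sketch, not the exact generated gimple),
   with a four-element VECTOR_RHS the expansion is:

     s0 = LHS CODE VECTOR_RHS[0];
     s1 = s0  CODE VECTOR_RHS[1];
     s2 = s1  CODE VECTOR_RHS[2];
     s3 = s2  CODE VECTOR_RHS[3];

   and s3 is returned, preserving the original left-to-right evaluation
   order.  */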
6163 static tree
6164 vect_expand_fold_left (gimple_stmt_iterator *gsi, tree scalar_dest,
6165 tree_code code, tree lhs, tree vector_rhs)
6167 tree vectype = TREE_TYPE (vector_rhs);
6168 tree scalar_type = TREE_TYPE (vectype);
6169 tree bitsize = TYPE_SIZE (scalar_type);
6170 unsigned HOST_WIDE_INT vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
6171 unsigned HOST_WIDE_INT element_bitsize = tree_to_uhwi (bitsize);
6173 for (unsigned HOST_WIDE_INT bit_offset = 0;
6174 bit_offset < vec_size_in_bits;
6175 bit_offset += element_bitsize)
6177 tree bitpos = bitsize_int (bit_offset);
6178 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vector_rhs,
6179 bitsize, bitpos);
6181 gassign *stmt = gimple_build_assign (scalar_dest, rhs);
6182 rhs = make_ssa_name (scalar_dest, stmt);
6183 gimple_assign_set_lhs (stmt, rhs);
6184 gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
6186 stmt = gimple_build_assign (scalar_dest, code, lhs, rhs);
6187 tree new_name = make_ssa_name (scalar_dest, stmt);
6188 gimple_assign_set_lhs (stmt, new_name);
6189 gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
6190 lhs = new_name;
6192 return lhs;
6195 /* Perform an in-order reduction (FOLD_LEFT_REDUCTION). STMT is the
6196 statement that sets the live-out value. REDUC_DEF_STMT is the phi
6197 statement. CODE is the operation performed by STMT and OPS are
6198 its scalar operands. REDUC_INDEX is the index of the operand in
6199 OPS that is set by REDUC_DEF_STMT. REDUC_FN is the function that
6200 implements in-order reduction, or IFN_LAST if we should open-code it.
6201 VECTYPE_IN is the type of the vector input. MASKS specifies the masks
6202 that should be used to control the operation in a fully-masked loop. */
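/* Illustrative sketch (assuming a fully-masked loop and target support for
   the in-order reduction, e.g. IFN_FOLD_LEFT_PLUS for PLUS_EXPR):

     masked    = MASK ? VEC_DEF : { 0, ... };        <- merge_with_identity
     reduc_var = FOLD_LEFT_PLUS (reduc_var, masked);

   If the target has no such IFN (REDUC_FN == IFN_LAST), the same computation
   is open-coded element by element via vect_expand_fold_left.  */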
6204 static bool
6205 vectorize_fold_left_reduction (gimple *stmt, gimple_stmt_iterator *gsi,
6206 gimple **vec_stmt, slp_tree slp_node,
6207 gimple *reduc_def_stmt,
6208 tree_code code, internal_fn reduc_fn,
6209 tree ops[3], tree vectype_in,
6210 int reduc_index, vec_loop_masks *masks)
6212 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
6213 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
6214 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6215 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
6216 gimple *new_stmt = NULL;
6218 int ncopies;
6219 if (slp_node)
6220 ncopies = 1;
6221 else
6222 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
6224 gcc_assert (!nested_in_vect_loop_p (loop, stmt));
6225 gcc_assert (ncopies == 1);
6226 gcc_assert (TREE_CODE_LENGTH (code) == binary_op);
6227 gcc_assert (reduc_index == (code == MINUS_EXPR ? 0 : 1));
6228 gcc_assert (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
6229 == FOLD_LEFT_REDUCTION);
6231 if (slp_node)
6232 gcc_assert (known_eq (TYPE_VECTOR_SUBPARTS (vectype_out),
6233 TYPE_VECTOR_SUBPARTS (vectype_in)));
6235 tree op0 = ops[1 - reduc_index];
6237 int group_size = 1;
6238 gimple *scalar_dest_def;
6239 auto_vec<tree> vec_oprnds0;
6240 if (slp_node)
6242 vect_get_vec_defs (op0, NULL_TREE, stmt, &vec_oprnds0, NULL, slp_node);
6243 group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
6244 scalar_dest_def = SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1];
6246 else
6248 tree loop_vec_def0 = vect_get_vec_def_for_operand (op0, stmt);
6249 vec_oprnds0.create (1);
6250 vec_oprnds0.quick_push (loop_vec_def0);
6251 scalar_dest_def = stmt;
6254 tree scalar_dest = gimple_assign_lhs (scalar_dest_def);
6255 tree scalar_type = TREE_TYPE (scalar_dest);
6256 tree reduc_var = gimple_phi_result (reduc_def_stmt);
6258 int vec_num = vec_oprnds0.length ();
6259 gcc_assert (vec_num == 1 || slp_node);
6260 tree vec_elem_type = TREE_TYPE (vectype_out);
6261 gcc_checking_assert (useless_type_conversion_p (scalar_type, vec_elem_type));
6263 tree vector_identity = NULL_TREE;
6264 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
6265 vector_identity = build_zero_cst (vectype_out);
6267 tree scalar_dest_var = vect_create_destination_var (scalar_dest, NULL);
6268 int i;
6269 tree def0;
6270 FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
6272 tree mask = NULL_TREE;
6273 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
6274 mask = vect_get_loop_mask (gsi, masks, vec_num, vectype_in, i);
6276 /* Handle MINUS by adding the negative. */
6277 if (reduc_fn != IFN_LAST && code == MINUS_EXPR)
6279 tree negated = make_ssa_name (vectype_out);
6280 new_stmt = gimple_build_assign (negated, NEGATE_EXPR, def0);
6281 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
6282 def0 = negated;
6285 if (mask)
6286 def0 = merge_with_identity (gsi, mask, vectype_out, def0,
6287 vector_identity);
6289 /* On the first iteration the input is simply the scalar phi
6290 result, and for subsequent iterations it is the output of
6291 the preceding operation. */
6292 if (reduc_fn != IFN_LAST)
6294 new_stmt = gimple_build_call_internal (reduc_fn, 2, reduc_var, def0);
6295 /* For chained SLP reductions the output of the previous reduction
6296 operation serves as the input of the next. For the final statement
6297 the output cannot be a temporary - we reuse the original
6298 scalar destination of the last statement. */
6299 if (i != vec_num - 1)
6301 gimple_set_lhs (new_stmt, scalar_dest_var);
6302 reduc_var = make_ssa_name (scalar_dest_var, new_stmt);
6303 gimple_set_lhs (new_stmt, reduc_var);
6306 else
6308 reduc_var = vect_expand_fold_left (gsi, scalar_dest_var, code,
6309 reduc_var, def0);
6310 new_stmt = SSA_NAME_DEF_STMT (reduc_var);
6311 /* Remove the statement, so that we can use the same code paths
6312 as for statements that we've just created. */
6313 gimple_stmt_iterator tmp_gsi = gsi_for_stmt (new_stmt);
6314 gsi_remove (&tmp_gsi, false);
6317 if (i == vec_num - 1)
6319 gimple_set_lhs (new_stmt, scalar_dest);
6320 vect_finish_replace_stmt (scalar_dest_def, new_stmt);
6322 else
6323 vect_finish_stmt_generation (scalar_dest_def, new_stmt, gsi);
6325 if (slp_node)
6326 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
6329 if (!slp_node)
6330 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
6332 return true;
6335 /* Function is_nonwrapping_integer_induction.
6337 Check that STMT (which is part of loop LOOP) is an integer induction
6338 that increments and cannot cause an overflow. */
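/* Concretely (a sketch of the check performed below): with constant BASE and
   STEP, and NI an upper bound on the number of iterations, the requirement
   is that

     BASE + STEP * NI

   computed in infinite precision still fits in the precision of the PHI
   result type (types with undefined overflow trivially qualify).  */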
6340 static bool
6341 is_nonwrapping_integer_induction (gimple *stmt, struct loop *loop)
6343 stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
6344 tree base = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo);
6345 tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo);
6346 tree lhs_type = TREE_TYPE (gimple_phi_result (stmt));
6347 widest_int ni, max_loop_value, lhs_max;
6348 bool overflow = false;
6350 /* Make sure the loop is integer based. */
6351 if (TREE_CODE (base) != INTEGER_CST
6352 || TREE_CODE (step) != INTEGER_CST)
6353 return false;
6355 /* Check that the max size of the loop will not wrap. */
6357 if (TYPE_OVERFLOW_UNDEFINED (lhs_type))
6358 return true;
6360 if (! max_stmt_executions (loop, &ni))
6361 return false;
6363 max_loop_value = wi::mul (wi::to_widest (step), ni, TYPE_SIGN (lhs_type),
6364 &overflow);
6365 if (overflow)
6366 return false;
6368 max_loop_value = wi::add (wi::to_widest (base), max_loop_value,
6369 TYPE_SIGN (lhs_type), &overflow);
6370 if (overflow)
6371 return false;
6373 return (wi::min_precision (max_loop_value, TYPE_SIGN (lhs_type))
6374 <= TYPE_PRECISION (lhs_type));
6377 /* Function vectorizable_reduction.
6379 Check if STMT performs a reduction operation that can be vectorized.
6380 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
6381 stmt to replace it, put it in VEC_STMT, and insert it at GSI.
6382 Return FALSE if not a vectorizable STMT, TRUE otherwise.
6384 This function also handles reduction idioms (patterns) that have been
6385 recognized in advance during vect_pattern_recog. In this case, STMT may be
6386 of this form:
6387 X = pattern_expr (arg0, arg1, ..., X)
6388 and its STMT_VINFO_RELATED_STMT points to the last stmt in the original
6389 sequence that had been detected and replaced by the pattern-stmt (STMT).
6391 This function also handles reduction of condition expressions, for example:
6392 for (int i = 0; i < N; i++)
6393 if (a[i] < value)
6394 last = a[i];
6395 This is handled by vectorising the loop and creating an additional vector
6396 containing the loop indexes for which "a[i] < value" was true. In the
6397 function epilogue this is reduced to a single max value and then used to
6398 index into the vector of results.
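   Roughly, the epilogue then computes (an illustrative sketch):
     max_idx = REDUC_MAX (induction_index);
     mask    = (induction_index == { max_idx, ... });
     data    = mask ? data_vector : { 0, ... };
     last    = REDUC_MAX ((unsigned) data);
   where the final unsigned MAX acts as an OR-style extraction of the single
   selected lane.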
6400 In some cases of reduction patterns, the type of the reduction variable X is
6401 different than the type of the other arguments of STMT.
6402 In such cases, the vectype that is used when transforming STMT into a vector
6403 stmt is different than the vectype that is used to determine the
6404 vectorization factor, because it consists of a different number of elements
6405 than the actual number of elements that are being operated upon in parallel.
6407 For example, consider an accumulation of shorts into an int accumulator.
6408 On some targets it's possible to vectorize this pattern operating on 8
6409 shorts at a time (hence, the vectype for purposes of determining the
6410 vectorization factor should be V8HI); on the other hand, the vectype that
6411 is used to create the vector form is actually V4SI (the type of the result).
6413 Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
6414 indicates what is the actual level of parallelism (V8HI in the example), so
6415 that the right vectorization factor would be derived. This vectype
6416 corresponds to the type of arguments to the reduction stmt, and should *NOT*
6417 be used to create the vectorized stmt. The right vectype for the vectorized
6418 stmt is obtained from the type of the result X:
6419 get_vectype_for_scalar_type (TREE_TYPE (X))
6421 This means that, contrary to "regular" reductions (or "regular" stmts in
6422 general), the following equation:
6423 STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (TREE_TYPE (X))
6424 does *NOT* necessarily hold for reduction patterns. */
6426 bool
6427 vectorizable_reduction (gimple *stmt, gimple_stmt_iterator *gsi,
6428 gimple **vec_stmt, slp_tree slp_node,
6429 slp_instance slp_node_instance)
6431 tree vec_dest;
6432 tree scalar_dest;
6433 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
6434 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
6435 tree vectype_in = NULL_TREE;
6436 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
6437 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6438 enum tree_code code, orig_code;
6439 internal_fn reduc_fn;
6440 machine_mode vec_mode;
6441 int op_type;
6442 optab optab;
6443 tree new_temp = NULL_TREE;
6444 gimple *def_stmt;
6445 enum vect_def_type dt, cond_reduc_dt = vect_unknown_def_type;
6446 gimple *cond_reduc_def_stmt = NULL;
6447 enum tree_code cond_reduc_op_code = ERROR_MARK;
6448 tree scalar_type;
6449 bool is_simple_use;
6450 gimple *orig_stmt;
6451 stmt_vec_info orig_stmt_info = NULL;
6452 int i;
6453 int ncopies;
6454 int epilog_copies;
6455 stmt_vec_info prev_stmt_info, prev_phi_info;
6456 bool single_defuse_cycle = false;
6457 gimple *new_stmt = NULL;
6458 int j;
6459 tree ops[3];
6460 enum vect_def_type dts[3];
6461 bool nested_cycle = false, found_nested_cycle_def = false;
6462 bool double_reduc = false;
6463 basic_block def_bb;
6464 struct loop * def_stmt_loop, *outer_loop = NULL;
6465 tree def_arg;
6466 gimple *def_arg_stmt;
6467 auto_vec<tree> vec_oprnds0;
6468 auto_vec<tree> vec_oprnds1;
6469 auto_vec<tree> vec_oprnds2;
6470 auto_vec<tree> vect_defs;
6471 auto_vec<gimple *> phis;
6472 int vec_num;
6473 tree def0, tem;
6474 bool first_p = true;
6475 tree cr_index_scalar_type = NULL_TREE, cr_index_vector_type = NULL_TREE;
6476 tree cond_reduc_val = NULL_TREE;
6478 /* Make sure it was already recognized as a reduction computation. */
6479 if (STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmt)) != vect_reduction_def
6480 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmt)) != vect_nested_cycle)
6481 return false;
6483 if (nested_in_vect_loop_p (loop, stmt))
6485 outer_loop = loop;
6486 loop = loop->inner;
6487 nested_cycle = true;
6490 /* In case of reduction chain we switch to the first stmt in the chain, but
6491 we don't update STMT_INFO, since only the last stmt is marked as reduction
6492 and has reduction properties. */
6493 if (GROUP_FIRST_ELEMENT (stmt_info)
6494 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
6496 stmt = GROUP_FIRST_ELEMENT (stmt_info);
6497 first_p = false;
6500 if (gimple_code (stmt) == GIMPLE_PHI)
6502 /* Analysis is fully done on the reduction stmt invocation. */
6503 if (! vec_stmt)
6505 if (slp_node)
6506 slp_node_instance->reduc_phis = slp_node;
6508 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
6509 return true;
6512 if (STMT_VINFO_REDUC_TYPE (stmt_info) == FOLD_LEFT_REDUCTION)
6513 /* Leave the scalar phi in place. Note that checking
6514 STMT_VINFO_VEC_REDUCTION_TYPE (as below) only works
6515 for reductions involving a single statement. */
6516 return true;
6518 gimple *reduc_stmt = STMT_VINFO_REDUC_DEF (stmt_info);
6519 if (STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (reduc_stmt)))
6520 reduc_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (reduc_stmt));
6522 if (STMT_VINFO_VEC_REDUCTION_TYPE (vinfo_for_stmt (reduc_stmt))
6523 == EXTRACT_LAST_REDUCTION)
6524 /* Leave the scalar phi in place. */
6525 return true;
6527 gcc_assert (is_gimple_assign (reduc_stmt));
6528 for (unsigned k = 1; k < gimple_num_ops (reduc_stmt); ++k)
6530 tree op = gimple_op (reduc_stmt, k);
6531 if (op == gimple_phi_result (stmt))
6532 continue;
6533 if (k == 1
6534 && gimple_assign_rhs_code (reduc_stmt) == COND_EXPR)
6535 continue;
6536 if (!vectype_in
6537 || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in)))
6538 < GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (op)))))
6539 vectype_in = get_vectype_for_scalar_type (TREE_TYPE (op));
6540 break;
6542 gcc_assert (vectype_in);
6544 if (slp_node)
6545 ncopies = 1;
6546 else
6547 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
6549 use_operand_p use_p;
6550 gimple *use_stmt;
6551 if (ncopies > 1
6552 && (STMT_VINFO_RELEVANT (vinfo_for_stmt (reduc_stmt))
6553 <= vect_used_only_live)
6554 && single_imm_use (gimple_phi_result (stmt), &use_p, &use_stmt)
6555 && (use_stmt == reduc_stmt
6556 || (STMT_VINFO_RELATED_STMT (vinfo_for_stmt (use_stmt))
6557 == reduc_stmt)))
6558 single_defuse_cycle = true;
6560 /* Create the destination vector */
6561 scalar_dest = gimple_assign_lhs (reduc_stmt);
6562 vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
6564 if (slp_node)
6565 /* The size vect_schedule_slp_instance computes is off for us. */
6566 vec_num = vect_get_num_vectors
6567 (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
6568 * SLP_TREE_SCALAR_STMTS (slp_node).length (),
6569 vectype_in);
6570 else
6571 vec_num = 1;
6573 /* Generate the reduction PHIs upfront. */
6574 prev_phi_info = NULL;
6575 for (j = 0; j < ncopies; j++)
6577 if (j == 0 || !single_defuse_cycle)
6579 for (i = 0; i < vec_num; i++)
6581 /* Create the reduction-phi that defines the reduction
6582 operand. */
6583 gimple *new_phi = create_phi_node (vec_dest, loop->header);
6584 set_vinfo_for_stmt (new_phi,
6585 new_stmt_vec_info (new_phi, loop_vinfo));
6587 if (slp_node)
6588 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_phi);
6589 else
6591 if (j == 0)
6592 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_phi;
6593 else
6594 STMT_VINFO_RELATED_STMT (prev_phi_info) = new_phi;
6595 prev_phi_info = vinfo_for_stmt (new_phi);
6601 return true;
6604 /* 1. Is vectorizable reduction? */
6605 /* Not supportable if the reduction variable is used in the loop, unless
6606 it's a reduction chain. */
6607 if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer
6608 && !GROUP_FIRST_ELEMENT (stmt_info))
6609 return false;
6611 /* Reductions that are not used even in an enclosing outer-loop
6612 are expected to be "live" (used out of the loop). */
6613 if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope
6614 && !STMT_VINFO_LIVE_P (stmt_info))
6615 return false;
6617 /* 2. Has this been recognized as a reduction pattern?
6619 Check if STMT represents a pattern that has been recognized
6620 in earlier analysis stages. For stmts that represent a pattern,
6621 the STMT_VINFO_RELATED_STMT field records the last stmt in
6622 the original sequence that constitutes the pattern. */
6624 orig_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt));
6625 if (orig_stmt)
6627 orig_stmt_info = vinfo_for_stmt (orig_stmt);
6628 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
6629 gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
6632 /* 3. Check the operands of the operation. The first operands are defined
6633 inside the loop body. The last operand is the reduction variable,
6634 which is defined by the loop-header-phi. */
6636 gcc_assert (is_gimple_assign (stmt));
6638 /* Flatten RHS. */
6639 switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
6641 case GIMPLE_BINARY_RHS:
6642 code = gimple_assign_rhs_code (stmt);
6643 op_type = TREE_CODE_LENGTH (code);
6644 gcc_assert (op_type == binary_op);
6645 ops[0] = gimple_assign_rhs1 (stmt);
6646 ops[1] = gimple_assign_rhs2 (stmt);
6647 break;
6649 case GIMPLE_TERNARY_RHS:
6650 code = gimple_assign_rhs_code (stmt);
6651 op_type = TREE_CODE_LENGTH (code);
6652 gcc_assert (op_type == ternary_op);
6653 ops[0] = gimple_assign_rhs1 (stmt);
6654 ops[1] = gimple_assign_rhs2 (stmt);
6655 ops[2] = gimple_assign_rhs3 (stmt);
6656 break;
6658 case GIMPLE_UNARY_RHS:
6659 return false;
6661 default:
6662 gcc_unreachable ();
6665 if (code == COND_EXPR && slp_node)
6666 return false;
6668 scalar_dest = gimple_assign_lhs (stmt);
6669 scalar_type = TREE_TYPE (scalar_dest);
6670 if (!POINTER_TYPE_P (scalar_type) && !INTEGRAL_TYPE_P (scalar_type)
6671 && !SCALAR_FLOAT_TYPE_P (scalar_type))
6672 return false;
6674 /* Do not try to vectorize bit-precision reductions. */
6675 if (!type_has_mode_precision_p (scalar_type))
6676 return false;
6678 /* All uses but the last are expected to be defined in the loop.
6679 The last use is the reduction variable. In case of nested cycle this
6680 assumption is not true: we use reduc_index to record the index of the
6681 reduction variable. */
6682 gimple *reduc_def_stmt = NULL;
6683 int reduc_index = -1;
6684 for (i = 0; i < op_type; i++)
6686 /* The condition of COND_EXPR is checked in vectorizable_condition(). */
6687 if (i == 0 && code == COND_EXPR)
6688 continue;
6690 is_simple_use = vect_is_simple_use (ops[i], loop_vinfo,
6691 &def_stmt, &dts[i], &tem);
6692 dt = dts[i];
6693 gcc_assert (is_simple_use);
6694 if (dt == vect_reduction_def)
6696 reduc_def_stmt = def_stmt;
6697 reduc_index = i;
6698 continue;
6700 else if (tem)
6702 /* To properly compute ncopies we are interested in the widest
6703 input type in case we're looking at a widening accumulation. */
6704 if (!vectype_in
6705 || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in)))
6706 < GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (tem)))))
6707 vectype_in = tem;
6710 if (dt != vect_internal_def
6711 && dt != vect_external_def
6712 && dt != vect_constant_def
6713 && dt != vect_induction_def
6714 && !(dt == vect_nested_cycle && nested_cycle))
6715 return false;
6717 if (dt == vect_nested_cycle)
6719 found_nested_cycle_def = true;
6720 reduc_def_stmt = def_stmt;
6721 reduc_index = i;
6724 if (i == 1 && code == COND_EXPR)
6726 /* Record how value of COND_EXPR is defined. */
6727 if (dt == vect_constant_def)
6729 cond_reduc_dt = dt;
6730 cond_reduc_val = ops[i];
6732 if (dt == vect_induction_def
6733 && def_stmt != NULL
6734 && is_nonwrapping_integer_induction (def_stmt, loop))
6736 cond_reduc_dt = dt;
6737 cond_reduc_def_stmt = def_stmt;
6742 if (!vectype_in)
6743 vectype_in = vectype_out;
6745 /* When vectorizing a reduction chain w/o SLP the reduction PHI is not
6746 directly used in STMT. */
6747 if (reduc_index == -1)
6749 if (STMT_VINFO_REDUC_TYPE (stmt_info) == FOLD_LEFT_REDUCTION)
6751 if (dump_enabled_p ())
6752 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6753 "in-order reduction chain without SLP.\n");
6754 return false;
6757 if (orig_stmt)
6758 reduc_def_stmt = STMT_VINFO_REDUC_DEF (orig_stmt_info);
6759 else
6760 reduc_def_stmt = STMT_VINFO_REDUC_DEF (stmt_info);
6763 if (! reduc_def_stmt || gimple_code (reduc_def_stmt) != GIMPLE_PHI)
6764 return false;
6766 if (!(reduc_index == -1
6767 || dts[reduc_index] == vect_reduction_def
6768 || dts[reduc_index] == vect_nested_cycle
6769 || ((dts[reduc_index] == vect_internal_def
6770 || dts[reduc_index] == vect_external_def
6771 || dts[reduc_index] == vect_constant_def
6772 || dts[reduc_index] == vect_induction_def)
6773 && nested_cycle && found_nested_cycle_def)))
6775 /* For pattern recognized stmts, orig_stmt might be a reduction,
6776 but some helper statements for the pattern might not, or
6777 might be COND_EXPRs with reduction uses in the condition. */
6778 gcc_assert (orig_stmt);
6779 return false;
6782 stmt_vec_info reduc_def_info = vinfo_for_stmt (reduc_def_stmt);
6783 enum vect_reduction_type v_reduc_type
6784 = STMT_VINFO_REDUC_TYPE (reduc_def_info);
6785 gimple *tmp = STMT_VINFO_REDUC_DEF (reduc_def_info);
6787 STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) = v_reduc_type;
6788 /* If we have a condition reduction, see if we can simplify it further. */
6789 if (v_reduc_type == COND_REDUCTION)
6791 /* Loop peeling modifies the initial value of the reduction PHI, which
6792 makes the reduction stmt that is transformed differ from the
6793 original stmt that was analyzed.  We need to record the reduction
6794 code for a CONST_COND_REDUCTION at the analysis stage, so that
6795 it can be used directly at the transform stage. */
6796 if (STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info) == MAX_EXPR
6797 || STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info) == MIN_EXPR)
6799 /* Also set the reduction type to CONST_COND_REDUCTION. */
6800 gcc_assert (cond_reduc_dt == vect_constant_def);
6801 STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) = CONST_COND_REDUCTION;
6803 else if (direct_internal_fn_supported_p (IFN_FOLD_EXTRACT_LAST,
6804 vectype_in, OPTIMIZE_FOR_SPEED))
6806 if (dump_enabled_p ())
6807 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6808 "optimizing condition reduction with"
6809 " FOLD_EXTRACT_LAST.\n");
6810 STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) = EXTRACT_LAST_REDUCTION;
6812 else if (cond_reduc_dt == vect_induction_def)
6814 stmt_vec_info cond_stmt_vinfo = vinfo_for_stmt (cond_reduc_def_stmt);
6815 tree base
6816 = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (cond_stmt_vinfo);
6817 tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (cond_stmt_vinfo);
6819 gcc_assert (TREE_CODE (base) == INTEGER_CST
6820 && TREE_CODE (step) == INTEGER_CST);
6821 cond_reduc_val = NULL_TREE;
6822 /* Find a suitable value: below BASE for MAX_EXPR, above BASE for
6823 MIN_EXPR.  For now, punt if BASE is the minimum value of the type
6824 for MAX_EXPR or the maximum value of the type for MIN_EXPR. */
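	  /* For example (illustrative): with a decreasing induction
	     (negative STEP) we use MIN_EXPR, so the chosen value, which also
	     serves as the reduction's initial value, must be strictly
	     greater than BASE: 0 if BASE is negative, otherwise BASE + 1
	     when that is representable.  */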
6825 if (tree_int_cst_sgn (step) == -1)
6827 cond_reduc_op_code = MIN_EXPR;
6828 if (tree_int_cst_sgn (base) == -1)
6829 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
6830 else if (tree_int_cst_lt (base,
6831 TYPE_MAX_VALUE (TREE_TYPE (base))))
6832 cond_reduc_val
6833 = int_const_binop (PLUS_EXPR, base, integer_one_node);
6835 else
6837 cond_reduc_op_code = MAX_EXPR;
6838 if (tree_int_cst_sgn (base) == 1)
6839 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
6840 else if (tree_int_cst_lt (TYPE_MIN_VALUE (TREE_TYPE (base)),
6841 base))
6842 cond_reduc_val
6843 = int_const_binop (MINUS_EXPR, base, integer_one_node);
6845 if (cond_reduc_val)
6847 if (dump_enabled_p ())
6848 dump_printf_loc (MSG_NOTE, vect_location,
6849 "condition expression based on "
6850 "integer induction.\n");
6851 STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
6852 = INTEGER_INDUC_COND_REDUCTION;
6855 else if (cond_reduc_dt == vect_constant_def)
6857 enum vect_def_type cond_initial_dt;
6858 gimple *def_stmt = SSA_NAME_DEF_STMT (ops[reduc_index]);
6859 tree cond_initial_val
6860 = PHI_ARG_DEF_FROM_EDGE (def_stmt, loop_preheader_edge (loop));
6862 gcc_assert (cond_reduc_val != NULL_TREE);
6863 vect_is_simple_use (cond_initial_val, loop_vinfo,
6864 &def_stmt, &cond_initial_dt);
6865 if (cond_initial_dt == vect_constant_def
6866 && types_compatible_p (TREE_TYPE (cond_initial_val),
6867 TREE_TYPE (cond_reduc_val)))
6869 tree e = fold_binary (LE_EXPR, boolean_type_node,
6870 cond_initial_val, cond_reduc_val);
6871 if (e && (integer_onep (e) || integer_zerop (e)))
6873 if (dump_enabled_p ())
6874 dump_printf_loc (MSG_NOTE, vect_location,
6875 "condition expression based on "
6876 "compile time constant.\n");
6877 /* Record reduction code at analysis stage. */
6878 STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info)
6879 = integer_onep (e) ? MAX_EXPR : MIN_EXPR;
6880 STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
6881 = CONST_COND_REDUCTION;
6887 if (orig_stmt)
6888 gcc_assert (tmp == orig_stmt
6889 || GROUP_FIRST_ELEMENT (vinfo_for_stmt (tmp)) == orig_stmt);
6890 else
6891 /* We changed STMT to be the first stmt in the reduction chain, hence we
6892 check that in this case the first element in the chain is STMT. */
6893 gcc_assert (stmt == tmp
6894 || GROUP_FIRST_ELEMENT (vinfo_for_stmt (tmp)) == stmt);
6896 if (STMT_VINFO_LIVE_P (vinfo_for_stmt (reduc_def_stmt)))
6897 return false;
6899 if (slp_node)
6900 ncopies = 1;
6901 else
6902 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
6904 gcc_assert (ncopies >= 1);
6906 vec_mode = TYPE_MODE (vectype_in);
6907 poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
6909 if (code == COND_EXPR)
6911 /* Only call during the analysis stage, otherwise we'll lose
6912 STMT_VINFO_TYPE. */
6913 if (!vec_stmt && !vectorizable_condition (stmt, gsi, NULL,
6914 ops[reduc_index], 0, NULL))
6916 if (dump_enabled_p ())
6917 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6918 "unsupported condition in reduction\n");
6919 return false;
6922 else
6924 /* 4. Supportable by target? */
6926 if (code == LSHIFT_EXPR || code == RSHIFT_EXPR
6927 || code == LROTATE_EXPR || code == RROTATE_EXPR)
6929 /* Shifts and rotates are only supported by vectorizable_shifts,
6930 not vectorizable_reduction. */
6931 if (dump_enabled_p ())
6932 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6933 "unsupported shift or rotation.\n");
6934 return false;
6937 /* 4.1. check support for the operation in the loop */
6938 optab = optab_for_tree_code (code, vectype_in, optab_default);
6939 if (!optab)
6941 if (dump_enabled_p ())
6942 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6943 "no optab.\n");
6945 return false;
6948 if (optab_handler (optab, vec_mode) == CODE_FOR_nothing)
6950 if (dump_enabled_p ())
6951 dump_printf (MSG_NOTE, "op not supported by target.\n");
6953 if (maybe_ne (GET_MODE_SIZE (vec_mode), UNITS_PER_WORD)
6954 || !vect_worthwhile_without_simd_p (loop_vinfo, code))
6955 return false;
6957 if (dump_enabled_p ())
6958 dump_printf (MSG_NOTE, "proceeding using word mode.\n");
6961 /* Worthwhile without SIMD support? */
6962 if (!VECTOR_MODE_P (TYPE_MODE (vectype_in))
6963 && !vect_worthwhile_without_simd_p (loop_vinfo, code))
6965 if (dump_enabled_p ())
6966 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6967 "not worthwhile without SIMD support.\n");
6969 return false;
6973 /* 4.2. Check support for the epilog operation.
6975 If STMT represents a reduction pattern, then the type of the
6976 reduction variable may be different than the type of the rest
6977 of the arguments. For example, consider the case of accumulation
6978 of shorts into an int accumulator; The original code:
6979 S1: int_a = (int) short_a;
6980 orig_stmt-> S2: int_acc = plus <int_a ,int_acc>;
6982 was replaced with:
6983 STMT: int_acc = widen_sum <short_a, int_acc>
6985 This means that:
6986 1. The tree-code that is used to create the vector operation in the
6987 epilog code (that reduces the partial results) is not the
6988 tree-code of STMT, but is rather the tree-code of the original
6989 stmt from the pattern that STMT is replacing.  I.e., in the example
6990 above we want to use 'widen_sum' in the loop, but 'plus' in the
6991 epilog.
6992 2. The type (mode) we use to check available target support
6993 for the vector operation to be created in the *epilog*, is
6994 determined by the type of the reduction variable (in the example
6995 above we'd check this: optab_handler (plus_optab, vect_int_mode)).
6996 However the type (mode) we use to check available target support
6997 for the vector operation to be created *inside the loop*, is
6998 determined by the type of the other arguments to STMT (in the
6999 example we'd check this: optab_handler (widen_sum_optab,
7000 vect_short_mode)).
7002 This is contrary to "regular" reductions, in which the types of all
7003 the arguments are the same as the type of the reduction variable.
7004 For "regular" reductions we can therefore use the same vector type
7005 (and also the same tree-code) when generating the epilog code and
7006 when generating the code inside the loop. */
7008 vect_reduction_type reduction_type
7009 = STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info);
7010 if (orig_stmt
7011 && (reduction_type == TREE_CODE_REDUCTION
7012 || reduction_type == FOLD_LEFT_REDUCTION))
7014 /* This is a reduction pattern: get the vectype from the type of the
7015 reduction variable, and get the tree-code from orig_stmt. */
7016 orig_code = gimple_assign_rhs_code (orig_stmt);
7017 gcc_assert (vectype_out);
7018 vec_mode = TYPE_MODE (vectype_out);
7020 else
7022 /* Regular reduction: the same vectype and tree-code that are used for
7023 the vector code inside the loop can also be used for the epilog code. */
7024 orig_code = code;
7026 if (code == MINUS_EXPR)
7027 orig_code = PLUS_EXPR;
7029 /* For simple condition reductions, replace with the actual expression
7030 we want to base our reduction around. */
7031 if (reduction_type == CONST_COND_REDUCTION)
7033 orig_code = STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info);
7034 gcc_assert (orig_code == MAX_EXPR || orig_code == MIN_EXPR);
7036 else if (reduction_type == INTEGER_INDUC_COND_REDUCTION)
7037 orig_code = cond_reduc_op_code;
7040 if (nested_cycle)
7042 def_bb = gimple_bb (reduc_def_stmt);
7043 def_stmt_loop = def_bb->loop_father;
7044 def_arg = PHI_ARG_DEF_FROM_EDGE (reduc_def_stmt,
7045 loop_preheader_edge (def_stmt_loop));
7046 if (TREE_CODE (def_arg) == SSA_NAME
7047 && (def_arg_stmt = SSA_NAME_DEF_STMT (def_arg))
7048 && gimple_code (def_arg_stmt) == GIMPLE_PHI
7049 && flow_bb_inside_loop_p (outer_loop, gimple_bb (def_arg_stmt))
7050 && vinfo_for_stmt (def_arg_stmt)
7051 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_arg_stmt))
7052 == vect_double_reduction_def)
7053 double_reduc = true;
7056 reduc_fn = IFN_LAST;
7058 if (reduction_type == TREE_CODE_REDUCTION
7059 || reduction_type == FOLD_LEFT_REDUCTION
7060 || reduction_type == INTEGER_INDUC_COND_REDUCTION
7061 || reduction_type == CONST_COND_REDUCTION)
7063 if (reduction_type == FOLD_LEFT_REDUCTION
7064 ? fold_left_reduction_fn (orig_code, &reduc_fn)
7065 : reduction_fn_for_scalar_code (orig_code, &reduc_fn))
7067 if (reduc_fn != IFN_LAST
7068 && !direct_internal_fn_supported_p (reduc_fn, vectype_out,
7069 OPTIMIZE_FOR_SPEED))
7071 if (dump_enabled_p ())
7072 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7073 "reduc op not supported by target.\n");
7075 reduc_fn = IFN_LAST;
7078 else
7080 if (!nested_cycle || double_reduc)
7082 if (dump_enabled_p ())
7083 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7084 "no reduc code for scalar code.\n");
7086 return false;
7090 else if (reduction_type == COND_REDUCTION)
7092 int scalar_precision
7093 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
7094 cr_index_scalar_type = make_unsigned_type (scalar_precision);
7095 cr_index_vector_type = build_vector_type (cr_index_scalar_type,
7096 nunits_out);
7098 if (direct_internal_fn_supported_p (IFN_REDUC_MAX, cr_index_vector_type,
7099 OPTIMIZE_FOR_SPEED))
7100 reduc_fn = IFN_REDUC_MAX;
7103 if (reduction_type != EXTRACT_LAST_REDUCTION
7104 && reduc_fn == IFN_LAST
7105 && !nunits_out.is_constant ())
7107 if (dump_enabled_p ())
7108 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7109 "missing target support for reduction on"
7110 " variable-length vectors.\n");
7111 return false;
7114 if ((double_reduc || reduction_type != TREE_CODE_REDUCTION)
7115 && ncopies > 1)
7117 if (dump_enabled_p ())
7118 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7119 "multiple types in double reduction or condition "
7120 "reduction.\n");
7121 return false;
7124 /* For SLP reductions, see if there is a neutral value we can use. */
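   /* For example, the neutral value is 0 for a PLUS or MINUS reduction,
      1 for a MULT reduction and all-ones for a BIT_AND reduction; for
      MIN and MAX reductions the initial value itself can act as the
      neutral value.  */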
7125 tree neutral_op = NULL_TREE;
7126 if (slp_node)
7127 neutral_op
7128 = neutral_op_for_slp_reduction (slp_node_instance->reduc_phis, code,
7129 GROUP_FIRST_ELEMENT (stmt_info) != NULL);
7131 if (double_reduc && reduction_type == FOLD_LEFT_REDUCTION)
7133 /* We can't support in-order reductions of code such as this:
7135 for (int i = 0; i < n1; ++i)
7136 for (int j = 0; j < n2; ++j)
7137 l += a[j];
7139 since GCC effectively transforms the loop when vectorizing:
7141 for (int i = 0; i < n1 / VF; ++i)
7142 for (int j = 0; j < n2; ++j)
7143 for (int k = 0; k < VF; ++k)
7144 l += a[j];
7146 which is a reassociation of the original operation. */
7147 if (dump_enabled_p ())
7148 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7149 "in-order double reduction not supported.\n");
7151 return false;
7154 if (reduction_type == FOLD_LEFT_REDUCTION
7155 && slp_node
7156 && !GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
7158 /* We cannot use in-order reductions in this case because there is
7159 an implicit reassociation of the operations involved. */
7160 if (dump_enabled_p ())
7161 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7162 "in-order unchained SLP reductions not supported.\n");
7163 return false;
7166 /* For double reductions, and for SLP reductions with a neutral value,
7167 we construct a variable-length initial vector by loading a vector
7168 full of the neutral value and then shift-and-inserting the start
7169 values into the low-numbered elements. */
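   /* For example (illustrative values), for a variable-length PLUS
      reduction with start value INIT and neutral value 0 the initial
      vector is conceptually { INIT, 0, 0, ... }: a vector of zeros is
      loaded and INIT is shift-and-inserted into element 0 using the
      IFN_VEC_SHL_INSERT operation checked just below.  */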
7170 if ((double_reduc || neutral_op)
7171 && !nunits_out.is_constant ()
7172 && !direct_internal_fn_supported_p (IFN_VEC_SHL_INSERT,
7173 vectype_out, OPTIMIZE_FOR_SPEED))
7175 if (dump_enabled_p ())
7176 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7177 "reduction on variable-length vectors requires"
7178 " target support for a vector-shift-and-insert"
7179 " operation.\n");
7180 return false;
7183 /* Check extra constraints for variable-length unchained SLP reductions. */
7184 if (STMT_SLP_TYPE (stmt_info)
7185 && !GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt))
7186 && !nunits_out.is_constant ())
7188 /* We checked above that we could build the initial vector when
7189 there's a neutral element value. Check here for the case in
7190 which each SLP statement has its own initial value and in which
7191 that value needs to be repeated for every instance of the
7192 statement within the initial vector. */
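      /* For example (illustrative values): with a group of two reduction
	 statements whose initial values are A and B, the initial vector
	 has to look like { A, B, A, B, ... } across the whole
	 variable-length vector; can_duplicate_and_interleave_p below
	 tests whether such a repeating pattern can be built for the
	 group size and element mode.  */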
7193 unsigned int group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
7194 scalar_mode elt_mode = SCALAR_TYPE_MODE (TREE_TYPE (vectype_out));
7195 if (!neutral_op
7196 && !can_duplicate_and_interleave_p (group_size, elt_mode))
7198 if (dump_enabled_p ())
7199 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7200 "unsupported form of SLP reduction for"
7201 " variable-length vectors: cannot build"
7202 " initial vector.\n");
7203 return false;
7205 /* The epilogue code relies on the number of elements being a multiple
7206 of the group size. The duplicate-and-interleave approach to setting
7207 up the initial vector does too. */
7208 if (!multiple_p (nunits_out, group_size))
7210 if (dump_enabled_p ())
7211 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7212 "unsupported form of SLP reduction for"
7213 " variable-length vectors: the vector size"
7214 " is not a multiple of the number of results.\n");
7215 return false;
7219 /* In case of widening multiplication by a constant, we update the type
7220 of the constant to be the type of the other operand. We check that the
7221 constant fits the type in the pattern recognition pass. */
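   /* For instance (illustrative source, names are arbitrary), in a
      dot-product pattern built from something like
	acc += b[i] * 7;
      where B is an array of shorts, the constant 7 may still carry a
      wider type at this point and is simply folded to the type of the
      other multiplication operand.  */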
7222 if (code == DOT_PROD_EXPR
7223 && !types_compatible_p (TREE_TYPE (ops[0]), TREE_TYPE (ops[1])))
7225 if (TREE_CODE (ops[0]) == INTEGER_CST)
7226 ops[0] = fold_convert (TREE_TYPE (ops[1]), ops[0]);
7227 else if (TREE_CODE (ops[1]) == INTEGER_CST)
7228 ops[1] = fold_convert (TREE_TYPE (ops[0]), ops[1]);
7229 else
7231 if (dump_enabled_p ())
7232 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7233 "invalid types in dot-prod\n");
7235 return false;
7239 if (reduction_type == COND_REDUCTION)
7241 widest_int ni;
7243 if (! max_loop_iterations (loop, &ni))
7245 if (dump_enabled_p ())
7246 dump_printf_loc (MSG_NOTE, vect_location,
7247 "loop count not known, cannot create cond "
7248 "reduction.\n");
7249 return false;
7251 /* Convert backedges to iterations. */
7252 ni += 1;
7254 /* The additional index will be the same type as the condition. Check
7255 that the loop iteration count fits into this type less one (we use up
7256 the zero slot for when there are no matches). */
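   /* For example, if SCALAR_TYPE is an 8-bit type then
      CR_INDEX_SCALAR_TYPE is an 8-bit unsigned type; index 0 is reserved
      for "no match", so the iteration count must be strictly smaller
      than the maximum value 255, i.e. at most 254 iterations.  */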
7257 tree max_index = TYPE_MAX_VALUE (cr_index_scalar_type);
7258 if (wi::geu_p (ni, wi::to_widest (max_index)))
7260 if (dump_enabled_p ())
7261 dump_printf_loc (MSG_NOTE, vect_location,
7262 "loop size is greater than data size.\n");
7263 return false;
7267 /* In case the vectorization factor (VF) is bigger than the number
7268 of elements that we can fit in a vectype (nunits), we have to generate
7269 more than one vector stmt - i.e., we need to "unroll" the
7270 vector stmt by a factor VF/nunits. For more details see documentation
7271 in vectorizable_operation. */
7273 /* If the reduction is used in an outer loop we need to generate
7274 VF intermediate results, like so (e.g. for ncopies=2):
7275 r0 = phi (init, r0)
7276 r1 = phi (init, r1)
7277 r0 = x0 + r0;
7278 r1 = x1 + r1;
7279 (i.e. we generate VF results in 2 registers).
7280 In this case we have a separate def-use cycle for each copy, and therefore
7281 for each copy we get the vector def for the reduction variable from the
7282 respective phi node created for this copy.
7284 Otherwise (the reduction is unused in the loop nest), we can combine
7285 together intermediate results, like so (e.g. for ncopies=2):
7286 r = phi (init, r)
7287 r = x0 + r;
7288 r = x1 + r;
7289 (i.e. we generate VF/2 results in a single register).
7290 In this case for each copy we get the vector def for the reduction variable
7291 from the vectorized reduction operation generated in the previous iteration.
7293 This only works when we see both the reduction PHI and its only consumer
7294 in vectorizable_reduction and there are no intermediate stmts
7295 participating. */
7296 use_operand_p use_p;
7297 gimple *use_stmt;
7298 if (ncopies > 1
7299 && (STMT_VINFO_RELEVANT (stmt_info) <= vect_used_only_live)
7300 && single_imm_use (gimple_phi_result (reduc_def_stmt), &use_p, &use_stmt)
7301 && (use_stmt == stmt
7302 || STMT_VINFO_RELATED_STMT (vinfo_for_stmt (use_stmt)) == stmt))
7304 single_defuse_cycle = true;
7305 epilog_copies = 1;
7307 else
7308 epilog_copies = ncopies;
7310 /* If the reduction stmt is one of the patterns that have lane
7311 reduction embedded, we cannot handle the case of ! single_defuse_cycle. */
7312 if ((ncopies > 1
7313 && ! single_defuse_cycle)
7314 && (code == DOT_PROD_EXPR
7315 || code == WIDEN_SUM_EXPR
7316 || code == SAD_EXPR))
7318 if (dump_enabled_p ())
7319 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7320 "multi def-use cycle not possible for lane-reducing "
7321 "reduction operation\n");
7322 return false;
7325 if (slp_node)
7326 vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
7327 else
7328 vec_num = 1;
7330 internal_fn cond_fn = get_conditional_internal_fn (code);
7331 vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
7333 if (!vec_stmt) /* transformation not required. */
7335 if (first_p)
7336 vect_model_reduction_cost (stmt_info, reduc_fn, ncopies);
7337 if (loop_vinfo && LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo))
7339 if (reduction_type != FOLD_LEFT_REDUCTION
7340 && (cond_fn == IFN_LAST
7341 || !direct_internal_fn_supported_p (cond_fn, vectype_in,
7342 OPTIMIZE_FOR_SPEED)))
7344 if (dump_enabled_p ())
7345 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7346 "can't use a fully-masked loop because no"
7347 " conditional operation is available.\n");
7348 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
7350 else if (reduc_index == -1)
7352 if (dump_enabled_p ())
7353 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7354 "can't use a fully-masked loop for chained"
7355 " reductions.\n");
7356 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
7358 else
7359 vect_record_loop_mask (loop_vinfo, masks, ncopies * vec_num,
7360 vectype_in);
7362 if (dump_enabled_p ()
7363 && reduction_type == FOLD_LEFT_REDUCTION)
7364 dump_printf_loc (MSG_NOTE, vect_location,
7365 "using an in-order (fold-left) reduction.\n");
7366 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
7367 return true;
7370 /* Transform. */
7372 if (dump_enabled_p ())
7373 dump_printf_loc (MSG_NOTE, vect_location, "transform reduction.\n");
7375 /* FORNOW: Multiple types are not supported for condition. */
7376 if (code == COND_EXPR)
7377 gcc_assert (ncopies == 1);
7379 bool masked_loop_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
7381 if (reduction_type == FOLD_LEFT_REDUCTION)
7382 return vectorize_fold_left_reduction
7383 (stmt, gsi, vec_stmt, slp_node, reduc_def_stmt, code,
7384 reduc_fn, ops, vectype_in, reduc_index, masks);
7386 if (reduction_type == EXTRACT_LAST_REDUCTION)
7388 gcc_assert (!slp_node);
7389 return vectorizable_condition (stmt, gsi, vec_stmt,
7390 NULL, reduc_index, NULL);
7393 /* Create the destination vector */
7394 vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
7396 prev_stmt_info = NULL;
7397 prev_phi_info = NULL;
7398 if (!slp_node)
7400 vec_oprnds0.create (1);
7401 vec_oprnds1.create (1);
7402 if (op_type == ternary_op)
7403 vec_oprnds2.create (1);
7406 phis.create (vec_num);
7407 vect_defs.create (vec_num);
7408 if (!slp_node)
7409 vect_defs.quick_push (NULL_TREE);
7411 if (slp_node)
7412 phis.splice (SLP_TREE_VEC_STMTS (slp_node_instance->reduc_phis));
7413 else
7414 phis.quick_push (STMT_VINFO_VEC_STMT (vinfo_for_stmt (reduc_def_stmt)));
7416 for (j = 0; j < ncopies; j++)
7418 if (code == COND_EXPR)
7420 gcc_assert (!slp_node);
7421 vectorizable_condition (stmt, gsi, vec_stmt,
7422 PHI_RESULT (phis[0]),
7423 reduc_index, NULL);
7424 /* Multiple types are not supported for condition. */
7425 break;
7428 /* Handle uses. */
7429 if (j == 0)
7431 if (slp_node)
7433 /* Get vec defs for all the operands except the reduction index,
7434 ensuring the ordering of the ops in the vector is kept. */
7435 auto_vec<tree, 3> slp_ops;
7436 auto_vec<vec<tree>, 3> vec_defs;
7438 slp_ops.quick_push (ops[0]);
7439 slp_ops.quick_push (ops[1]);
7440 if (op_type == ternary_op)
7441 slp_ops.quick_push (ops[2]);
7443 vect_get_slp_defs (slp_ops, slp_node, &vec_defs);
7445 vec_oprnds0.safe_splice (vec_defs[0]);
7446 vec_defs[0].release ();
7447 vec_oprnds1.safe_splice (vec_defs[1]);
7448 vec_defs[1].release ();
7449 if (op_type == ternary_op)
7451 vec_oprnds2.safe_splice (vec_defs[2]);
7452 vec_defs[2].release ();
7455 else
7457 vec_oprnds0.quick_push
7458 (vect_get_vec_def_for_operand (ops[0], stmt));
7459 vec_oprnds1.quick_push
7460 (vect_get_vec_def_for_operand (ops[1], stmt));
7461 if (op_type == ternary_op)
7462 vec_oprnds2.quick_push
7463 (vect_get_vec_def_for_operand (ops[2], stmt));
7466 else
7468 if (!slp_node)
7470 gcc_assert (reduc_index != -1 || ! single_defuse_cycle);
7472 if (single_defuse_cycle && reduc_index == 0)
7473 vec_oprnds0[0] = gimple_get_lhs (new_stmt);
7474 else
7475 vec_oprnds0[0]
7476 = vect_get_vec_def_for_stmt_copy (dts[0], vec_oprnds0[0]);
7477 if (single_defuse_cycle && reduc_index == 1)
7478 vec_oprnds1[0] = gimple_get_lhs (new_stmt);
7479 else
7480 vec_oprnds1[0]
7481 = vect_get_vec_def_for_stmt_copy (dts[1], vec_oprnds1[0]);
7482 if (op_type == ternary_op)
7484 if (single_defuse_cycle && reduc_index == 2)
7485 vec_oprnds2[0] = gimple_get_lhs (new_stmt);
7486 else
7487 vec_oprnds2[0]
7488 = vect_get_vec_def_for_stmt_copy (dts[2], vec_oprnds2[0]);
7493 FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
7495 tree vop[3] = { def0, vec_oprnds1[i], NULL_TREE };
7496 if (masked_loop_p)
7498 /* Make sure that the reduction accumulator is vop[0]. */
7499 if (reduc_index == 1)
7501 gcc_assert (commutative_tree_code (code));
7502 std::swap (vop[0], vop[1]);
7504 tree mask = vect_get_loop_mask (gsi, masks, vec_num * ncopies,
7505 vectype_in, i * ncopies + j);
7506 gcall *call = gimple_build_call_internal (cond_fn, 3, mask,
7507 vop[0], vop[1]);
7508 new_temp = make_ssa_name (vec_dest, call);
7509 gimple_call_set_lhs (call, new_temp);
7510 gimple_call_set_nothrow (call, true);
7511 new_stmt = call;
7513 else
7515 if (op_type == ternary_op)
7516 vop[2] = vec_oprnds2[i];
7518 new_temp = make_ssa_name (vec_dest, new_stmt);
7519 new_stmt = gimple_build_assign (new_temp, code,
7520 vop[0], vop[1], vop[2]);
7522 vect_finish_stmt_generation (stmt, new_stmt, gsi);
7524 if (slp_node)
7526 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
7527 vect_defs.quick_push (new_temp);
7529 else
7530 vect_defs[0] = new_temp;
7533 if (slp_node)
7534 continue;
7536 if (j == 0)
7537 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
7538 else
7539 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
7541 prev_stmt_info = vinfo_for_stmt (new_stmt);
7544 /* Finalize the reduction-phi (set its arguments) and create the
7545 epilog reduction code. */
7546 if ((!single_defuse_cycle || code == COND_EXPR) && !slp_node)
7547 vect_defs[0] = gimple_get_lhs (*vec_stmt);
7549 vect_create_epilog_for_reduction (vect_defs, stmt, reduc_def_stmt,
7550 epilog_copies, reduc_fn, phis,
7551 double_reduc, slp_node, slp_node_instance,
7552 cond_reduc_val, cond_reduc_op_code,
7553 neutral_op);
7555 return true;
7558 /* Function vect_min_worthwhile_factor.
7560 For a loop where we could vectorize the operation indicated by CODE,
7561 return the minimum vectorization factor that makes it worthwhile
7562 to use generic vectors. */
7563 static unsigned int
7564 vect_min_worthwhile_factor (enum tree_code code)
7566 switch (code)
7568 case PLUS_EXPR:
7569 case MINUS_EXPR:
7570 case NEGATE_EXPR:
7571 return 4;
7573 case BIT_AND_EXPR:
7574 case BIT_IOR_EXPR:
7575 case BIT_XOR_EXPR:
7576 case BIT_NOT_EXPR:
7577 return 2;
7579 default:
7580 return INT_MAX;
7584 /* Return true if VINFO indicates we are doing loop vectorization and if
7585 it is worth decomposing CODE operations into scalar operations for
7586 that loop's vectorization factor. */
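/* For example, decomposing a PLUS_EXPR into word-mode pieces is only
   considered worthwhile (see vect_min_worthwhile_factor) when the loop's
   vectorization factor is a compile-time constant of at least 4, so a
   loop with a vectorization factor of 2 gets a "false" answer here.  */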
7588 bool
7589 vect_worthwhile_without_simd_p (vec_info *vinfo, tree_code code)
7591 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
7592 unsigned HOST_WIDE_INT value;
7593 return (loop_vinfo
7594 && LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&value)
7595 && value >= vect_min_worthwhile_factor (code));
7598 /* Function vectorizable_induction
7600 Check if PHI performs an induction computation that can be vectorized.
7601 If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
7602 phi to replace it, put it in VEC_STMT, and add it to the same basic block.
7603 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
7605 bool
7606 vectorizable_induction (gimple *phi,
7607 gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
7608 gimple **vec_stmt, slp_tree slp_node)
7610 stmt_vec_info stmt_info = vinfo_for_stmt (phi);
7611 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
7612 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7613 unsigned ncopies;
7614 bool nested_in_vect_loop = false;
7615 struct loop *iv_loop;
7616 tree vec_def;
7617 edge pe = loop_preheader_edge (loop);
7618 basic_block new_bb;
7619 tree new_vec, vec_init, vec_step, t;
7620 tree new_name;
7621 gimple *new_stmt;
7622 gphi *induction_phi;
7623 tree induc_def, vec_dest;
7624 tree init_expr, step_expr;
7625 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
7626 unsigned i;
7627 tree expr;
7628 gimple_seq stmts;
7629 imm_use_iterator imm_iter;
7630 use_operand_p use_p;
7631 gimple *exit_phi;
7632 edge latch_e;
7633 tree loop_arg;
7634 gimple_stmt_iterator si;
7635 basic_block bb = gimple_bb (phi);
7637 if (gimple_code (phi) != GIMPLE_PHI)
7638 return false;
7640 if (!STMT_VINFO_RELEVANT_P (stmt_info))
7641 return false;
7643 /* Make sure it was recognized as induction computation. */
7644 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
7645 return false;
7647 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
7648 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
7650 if (slp_node)
7651 ncopies = 1;
7652 else
7653 ncopies = vect_get_num_copies (loop_vinfo, vectype);
7654 gcc_assert (ncopies >= 1);
7656 /* FORNOW. These restrictions should be relaxed. */
7657 if (nested_in_vect_loop_p (loop, phi))
7659 imm_use_iterator imm_iter;
7660 use_operand_p use_p;
7661 gimple *exit_phi;
7662 edge latch_e;
7663 tree loop_arg;
7665 if (ncopies > 1)
7667 if (dump_enabled_p ())
7668 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7669 "multiple types in nested loop.\n");
7670 return false;
7673 /* FORNOW: outer loop induction with SLP not supported. */
7674 if (STMT_SLP_TYPE (stmt_info))
7675 return false;
7677 exit_phi = NULL;
7678 latch_e = loop_latch_edge (loop->inner);
7679 loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
7680 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
7682 gimple *use_stmt = USE_STMT (use_p);
7683 if (is_gimple_debug (use_stmt))
7684 continue;
7686 if (!flow_bb_inside_loop_p (loop->inner, gimple_bb (use_stmt)))
7688 exit_phi = use_stmt;
7689 break;
7692 if (exit_phi)
7694 stmt_vec_info exit_phi_vinfo = vinfo_for_stmt (exit_phi);
7695 if (!(STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
7696 && !STMT_VINFO_LIVE_P (exit_phi_vinfo)))
7698 if (dump_enabled_p ())
7699 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7700 "inner-loop induction only used outside "
7701 "of the outer vectorized loop.\n");
7702 return false;
7706 nested_in_vect_loop = true;
7707 iv_loop = loop->inner;
7709 else
7710 iv_loop = loop;
7711 gcc_assert (iv_loop == (gimple_bb (phi))->loop_father);
7713 if (slp_node && !nunits.is_constant ())
7715 /* The current SLP code creates the initial value element-by-element. */
7716 if (dump_enabled_p ())
7717 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7718 "SLP induction not supported for variable-length"
7719 " vectors.\n");
7720 return false;
7723 if (!vec_stmt) /* transformation not required. */
7725 STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
7726 if (dump_enabled_p ())
7727 dump_printf_loc (MSG_NOTE, vect_location,
7728 "=== vectorizable_induction ===\n");
7729 vect_model_induction_cost (stmt_info, ncopies);
7730 return true;
7733 /* Transform. */
7735 /* Compute a vector variable, initialized with the first VF values of
7736 the induction variable. E.g., for an iv with IV_PHI='X' and
7737 evolution S, for a vector of 4 units, we want to compute:
7738 [X, X + S, X + 2*S, X + 3*S]. */
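   /* For example (illustrative values), for X = 0, S = 3 and 4-element
      vectors we build vec_init = { 0, 3, 6, 9 } and vec_step =
      { 12, 12, 12, 12 } (i.e. VF * S), so that one vector iteration
      advances all four lanes of the IV at once.  */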
7740 if (dump_enabled_p ())
7741 dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n");
7743 latch_e = loop_latch_edge (iv_loop);
7744 loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
7746 step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info);
7747 gcc_assert (step_expr != NULL_TREE);
7749 pe = loop_preheader_edge (iv_loop);
7750 init_expr = PHI_ARG_DEF_FROM_EDGE (phi,
7751 loop_preheader_edge (iv_loop));
7753 stmts = NULL;
7754 if (!nested_in_vect_loop)
7756 /* Convert the initial value to the desired type. */
7757 tree new_type = TREE_TYPE (vectype);
7758 init_expr = gimple_convert (&stmts, new_type, init_expr);
7760 /* If we are using the loop mask to "peel" for alignment then we need
7761 to adjust the start value here. */
7762 tree skip_niters = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
7763 if (skip_niters != NULL_TREE)
7765 if (FLOAT_TYPE_P (vectype))
7766 skip_niters = gimple_build (&stmts, FLOAT_EXPR, new_type,
7767 skip_niters);
7768 else
7769 skip_niters = gimple_convert (&stmts, new_type, skip_niters);
7770 tree skip_step = gimple_build (&stmts, MULT_EXPR, new_type,
7771 skip_niters, step_expr);
7772 init_expr = gimple_build (&stmts, MINUS_EXPR, new_type,
7773 init_expr, skip_step);
7777 /* Convert the step to the desired type. */
7778 step_expr = gimple_convert (&stmts, TREE_TYPE (vectype), step_expr);
7780 if (stmts)
7782 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
7783 gcc_assert (!new_bb);
7786 /* Find the first insertion point in the BB. */
7787 si = gsi_after_labels (bb);
7789 /* For SLP induction we have to generate several IVs as for example
7790 with group size 3 we need [i, i, i, i + S] [i + S, i + S, i + 2*S, i + 2*S]
7791 [i + 2*S, i + 3*S, i + 3*S, i + 3*S]. The step is the same uniform
7792 [VF*S, VF*S, VF*S, VF*S] for all. */
7793 if (slp_node)
7795 /* Enforced above. */
7796 unsigned int const_nunits = nunits.to_constant ();
7798 /* Generate [VF*S, VF*S, ... ]. */
7799 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7801 expr = build_int_cst (integer_type_node, vf);
7802 expr = fold_convert (TREE_TYPE (step_expr), expr);
7804 else
7805 expr = build_int_cst (TREE_TYPE (step_expr), vf);
7806 new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
7807 expr, step_expr);
7808 if (! CONSTANT_CLASS_P (new_name))
7809 new_name = vect_init_vector (phi, new_name,
7810 TREE_TYPE (step_expr), NULL);
7811 new_vec = build_vector_from_val (vectype, new_name);
7812 vec_step = vect_init_vector (phi, new_vec, vectype, NULL);
7814 /* Now generate the IVs. */
7815 unsigned group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
7816 unsigned nvects = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
7817 unsigned elts = const_nunits * nvects;
7818 unsigned nivs = least_common_multiple (group_size,
7819 const_nunits) / const_nunits;
7820 gcc_assert (elts % group_size == 0);
7821 tree elt = init_expr;
7822 unsigned ivn;
7823 for (ivn = 0; ivn < nivs; ++ivn)
7825 tree_vector_builder elts (vectype, const_nunits, 1);
7826 stmts = NULL;
7827 for (unsigned eltn = 0; eltn < const_nunits; ++eltn)
7829 if (ivn*const_nunits + eltn >= group_size
7830 && (ivn * const_nunits + eltn) % group_size == 0)
7831 elt = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (elt),
7832 elt, step_expr);
7833 elts.quick_push (elt);
7835 vec_init = gimple_build_vector (&stmts, &elts);
7836 if (stmts)
7838 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
7839 gcc_assert (!new_bb);
7842 /* Create the induction-phi that defines the induction-operand. */
7843 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
7844 induction_phi = create_phi_node (vec_dest, iv_loop->header);
7845 set_vinfo_for_stmt (induction_phi,
7846 new_stmt_vec_info (induction_phi, loop_vinfo));
7847 induc_def = PHI_RESULT (induction_phi);
7849 /* Create the iv update inside the loop */
7850 vec_def = make_ssa_name (vec_dest);
7851 new_stmt = gimple_build_assign (vec_def, PLUS_EXPR, induc_def, vec_step);
7852 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
7853 set_vinfo_for_stmt (new_stmt, new_stmt_vec_info (new_stmt, loop_vinfo));
7855 /* Set the arguments of the phi node: */
7856 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
7857 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
7858 UNKNOWN_LOCATION);
7860 SLP_TREE_VEC_STMTS (slp_node).quick_push (induction_phi);
7863 /* Re-use IVs when we can. */
7864 if (ivn < nvects)
7866 unsigned vfp
7867 = least_common_multiple (group_size, const_nunits) / group_size;
7868 /* Generate [VF'*S, VF'*S, ... ]. */
7869 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7871 expr = build_int_cst (integer_type_node, vfp);
7872 expr = fold_convert (TREE_TYPE (step_expr), expr);
7874 else
7875 expr = build_int_cst (TREE_TYPE (step_expr), vfp);
7876 new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
7877 expr, step_expr);
7878 if (! CONSTANT_CLASS_P (new_name))
7879 new_name = vect_init_vector (phi, new_name,
7880 TREE_TYPE (step_expr), NULL);
7881 new_vec = build_vector_from_val (vectype, new_name);
7882 vec_step = vect_init_vector (phi, new_vec, vectype, NULL);
7883 for (; ivn < nvects; ++ivn)
7885 gimple *iv = SLP_TREE_VEC_STMTS (slp_node)[ivn - nivs];
7886 tree def;
7887 if (gimple_code (iv) == GIMPLE_PHI)
7888 def = gimple_phi_result (iv);
7889 else
7890 def = gimple_assign_lhs (iv);
7891 new_stmt = gimple_build_assign (make_ssa_name (vectype),
7892 PLUS_EXPR,
7893 def, vec_step);
7894 if (gimple_code (iv) == GIMPLE_PHI)
7895 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
7896 else
7898 gimple_stmt_iterator tgsi = gsi_for_stmt (iv);
7899 gsi_insert_after (&tgsi, new_stmt, GSI_CONTINUE_LINKING);
7901 set_vinfo_for_stmt (new_stmt,
7902 new_stmt_vec_info (new_stmt, loop_vinfo));
7903 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
7907 return true;
7910 /* Create the vector that holds the initial_value of the induction. */
7911 if (nested_in_vect_loop)
7913 /* iv_loop is nested in the loop to be vectorized. init_expr had already
7914 been created during vectorization of previous stmts. We obtain it
7915 from the STMT_VINFO_VEC_STMT of the defining stmt. */
7916 vec_init = vect_get_vec_def_for_operand (init_expr, phi);
7917 /* If the initial value is not of proper type, convert it. */
7918 if (!useless_type_conversion_p (vectype, TREE_TYPE (vec_init)))
7920 new_stmt
7921 = gimple_build_assign (vect_get_new_ssa_name (vectype,
7922 vect_simple_var,
7923 "vec_iv_"),
7924 VIEW_CONVERT_EXPR,
7925 build1 (VIEW_CONVERT_EXPR, vectype,
7926 vec_init));
7927 vec_init = gimple_assign_lhs (new_stmt);
7928 new_bb = gsi_insert_on_edge_immediate (loop_preheader_edge (iv_loop),
7929 new_stmt);
7930 gcc_assert (!new_bb);
7931 set_vinfo_for_stmt (new_stmt,
7932 new_stmt_vec_info (new_stmt, loop_vinfo));
7935 else
7937 /* iv_loop is the loop to be vectorized. Create:
7938 vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr) */
7939 stmts = NULL;
7940 new_name = gimple_convert (&stmts, TREE_TYPE (vectype), init_expr);
7942 unsigned HOST_WIDE_INT const_nunits;
7943 if (nunits.is_constant (&const_nunits))
7945 tree_vector_builder elts (vectype, const_nunits, 1);
7946 elts.quick_push (new_name);
7947 for (i = 1; i < const_nunits; i++)
7949 /* Create: new_name_i = new_name + step_expr */
7950 new_name = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (new_name),
7951 new_name, step_expr);
7952 elts.quick_push (new_name);
7954 /* Create a vector from [new_name_0, new_name_1, ...,
7955 new_name_nunits-1] */
7956 vec_init = gimple_build_vector (&stmts, &elts);
7958 else if (INTEGRAL_TYPE_P (TREE_TYPE (step_expr)))
7959 /* Build the initial value directly from a VEC_SERIES_EXPR. */
7960 vec_init = gimple_build (&stmts, VEC_SERIES_EXPR, vectype,
7961 new_name, step_expr);
7962 else
7964 /* Build:
7965 [base, base, base, ...]
7966 + (vectype) [0, 1, 2, ...] * [step, step, step, ...]. */
7967 gcc_assert (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)));
7968 gcc_assert (flag_associative_math);
7969 tree index = build_index_vector (vectype, 0, 1);
7970 tree base_vec = gimple_build_vector_from_val (&stmts, vectype,
7971 new_name);
7972 tree step_vec = gimple_build_vector_from_val (&stmts, vectype,
7973 step_expr);
7974 vec_init = gimple_build (&stmts, FLOAT_EXPR, vectype, index);
7975 vec_init = gimple_build (&stmts, MULT_EXPR, vectype,
7976 vec_init, step_vec);
7977 vec_init = gimple_build (&stmts, PLUS_EXPR, vectype,
7978 vec_init, base_vec);
7981 if (stmts)
7983 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
7984 gcc_assert (!new_bb);
7989 /* Create the vector that holds the step of the induction. */
7990 if (nested_in_vect_loop)
7991 /* iv_loop is nested in the loop to be vectorized. Generate:
7992 vec_step = [S, S, S, S] */
7993 new_name = step_expr;
7994 else
7996 /* iv_loop is the loop to be vectorized. Generate:
7997 vec_step = [VF*S, VF*S, VF*S, VF*S] */
7998 gimple_seq seq = NULL;
7999 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
8001 expr = build_int_cst (integer_type_node, vf);
8002 expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
8004 else
8005 expr = build_int_cst (TREE_TYPE (step_expr), vf);
8006 new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
8007 expr, step_expr);
8008 if (seq)
8010 new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
8011 gcc_assert (!new_bb);
8015 t = unshare_expr (new_name);
8016 gcc_assert (CONSTANT_CLASS_P (new_name)
8017 || TREE_CODE (new_name) == SSA_NAME);
8018 new_vec = build_vector_from_val (vectype, t);
8019 vec_step = vect_init_vector (phi, new_vec, vectype, NULL);
8022 /* Create the following def-use cycle:
8023 loop prolog:
8024 vec_init = ...
8025 vec_step = ...
8026 loop:
8027 vec_iv = PHI <vec_init, vec_loop>
8029 STMT
8031 vec_loop = vec_iv + vec_step; */
8033 /* Create the induction-phi that defines the induction-operand. */
8034 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
8035 induction_phi = create_phi_node (vec_dest, iv_loop->header);
8036 set_vinfo_for_stmt (induction_phi,
8037 new_stmt_vec_info (induction_phi, loop_vinfo));
8038 induc_def = PHI_RESULT (induction_phi);
8040 /* Create the iv update inside the loop */
8041 vec_def = make_ssa_name (vec_dest);
8042 new_stmt = gimple_build_assign (vec_def, PLUS_EXPR, induc_def, vec_step);
8043 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
8044 set_vinfo_for_stmt (new_stmt, new_stmt_vec_info (new_stmt, loop_vinfo));
8046 /* Set the arguments of the phi node: */
8047 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
8048 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
8049 UNKNOWN_LOCATION);
8051 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = induction_phi;
8053 /* In case the vectorization factor (VF) is bigger than the number
8054 of elements that we can fit in a vectype (nunits), we have to generate
8055 more than one vector stmt - i.e., we need to "unroll" the
8056 vector stmt by a factor VF/nunits. For more details see documentation
8057 in vectorizable_operation. */
8059 if (ncopies > 1)
8061 gimple_seq seq = NULL;
8062 stmt_vec_info prev_stmt_vinfo;
8063 /* FORNOW. This restriction should be relaxed. */
8064 gcc_assert (!nested_in_vect_loop);
8066 /* Create the vector that holds the step of the induction. */
8067 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
8069 expr = build_int_cst (integer_type_node, nunits);
8070 expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
8072 else
8073 expr = build_int_cst (TREE_TYPE (step_expr), nunits);
8074 new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
8075 expr, step_expr);
8076 if (seq)
8078 new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
8079 gcc_assert (!new_bb);
8082 t = unshare_expr (new_name);
8083 gcc_assert (CONSTANT_CLASS_P (new_name)
8084 || TREE_CODE (new_name) == SSA_NAME);
8085 new_vec = build_vector_from_val (vectype, t);
8086 vec_step = vect_init_vector (phi, new_vec, vectype, NULL);
8088 vec_def = induc_def;
8089 prev_stmt_vinfo = vinfo_for_stmt (induction_phi);
8090 for (i = 1; i < ncopies; i++)
8092 /* vec_i = vec_prev + vec_step */
8093 new_stmt = gimple_build_assign (vec_dest, PLUS_EXPR,
8094 vec_def, vec_step);
8095 vec_def = make_ssa_name (vec_dest, new_stmt);
8096 gimple_assign_set_lhs (new_stmt, vec_def);
8098 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
8099 set_vinfo_for_stmt (new_stmt,
8100 new_stmt_vec_info (new_stmt, loop_vinfo));
8101 STMT_VINFO_RELATED_STMT (prev_stmt_vinfo) = new_stmt;
8102 prev_stmt_vinfo = vinfo_for_stmt (new_stmt);
8106 if (nested_in_vect_loop)
8108 /* Find the loop-closed exit-phi of the induction, and record
8109 the final vector of induction results: */
8110 exit_phi = NULL;
8111 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
8113 gimple *use_stmt = USE_STMT (use_p);
8114 if (is_gimple_debug (use_stmt))
8115 continue;
8117 if (!flow_bb_inside_loop_p (iv_loop, gimple_bb (use_stmt)))
8119 exit_phi = use_stmt;
8120 break;
8123 if (exit_phi)
8125 stmt_vec_info stmt_vinfo = vinfo_for_stmt (exit_phi);
8126 /* FORNOW. Currently not supporting the case that an inner-loop induction
8127 is not used in the outer-loop (i.e. only outside the outer-loop). */
8128 gcc_assert (STMT_VINFO_RELEVANT_P (stmt_vinfo)
8129 && !STMT_VINFO_LIVE_P (stmt_vinfo));
8131 STMT_VINFO_VEC_STMT (stmt_vinfo) = new_stmt;
8132 if (dump_enabled_p ())
8134 dump_printf_loc (MSG_NOTE, vect_location,
8135 "vector of inductions after inner-loop:");
8136 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, new_stmt, 0);
8142 if (dump_enabled_p ())
8144 dump_printf_loc (MSG_NOTE, vect_location,
8145 "transform induction: created def-use cycle: ");
8146 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, induction_phi, 0);
8147 dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
8148 SSA_NAME_DEF_STMT (vec_def), 0);
8151 return true;
8154 /* Function vectorizable_live_operation.
8156 STMT computes a value that is used outside the loop. Check if
8157 it can be supported. */
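/* A typical example (illustrative source, names are arbitrary):

     for (i = 0; i < n; i++)
       last = a[i];
     ... use of LAST after the loop ...

   The final value of LAST computed by the loop is live outside it and
   has to be extracted from the last vector (and lane) produced.  */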
8159 bool
8160 vectorizable_live_operation (gimple *stmt,
8161 gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
8162 slp_tree slp_node, int slp_index,
8163 gimple **vec_stmt)
8165 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
8166 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
8167 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8168 imm_use_iterator imm_iter;
8169 tree lhs, lhs_type, bitsize, vec_bitsize;
8170 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
8171 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
8172 int ncopies;
8173 gimple *use_stmt;
8174 auto_vec<tree> vec_oprnds;
8175 int vec_entry = 0;
8176 poly_uint64 vec_index = 0;
8178 gcc_assert (STMT_VINFO_LIVE_P (stmt_info));
8180 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def)
8181 return false;
8183 /* FORNOW. CHECKME. */
8184 if (nested_in_vect_loop_p (loop, stmt))
8185 return false;
8187 /* If STMT is not relevant and it is a simple assignment and its inputs are
8188 invariant then it can remain in place, unvectorized. The original last
8189 scalar value that it computes will be used. */
8190 if (!STMT_VINFO_RELEVANT_P (stmt_info))
8192 gcc_assert (is_simple_and_all_uses_invariant (stmt, loop_vinfo));
8193 if (dump_enabled_p ())
8194 dump_printf_loc (MSG_NOTE, vect_location,
8195 "statement is simple and uses invariant. Leaving in "
8196 "place.\n");
8197 return true;
8200 if (slp_node)
8201 ncopies = 1;
8202 else
8203 ncopies = vect_get_num_copies (loop_vinfo, vectype);
8205 if (slp_node)
8207 gcc_assert (slp_index >= 0);
8209 int num_scalar = SLP_TREE_SCALAR_STMTS (slp_node).length ();
8210 int num_vec = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
8212 /* Get the last occurrence of the scalar index from the concatenation of
8213 all the slp vectors. Calculate which slp vector it is and the index
8214 within. */
8215 poly_uint64 pos = (num_vec * nunits) - num_scalar + slp_index;
8217 /* Calculate which vector contains the result, and which lane of
8218 that vector we need. */
8219 if (!can_div_trunc_p (pos, nunits, &vec_entry, &vec_index))
8221 if (dump_enabled_p ())
8222 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8223 "Cannot determine which vector holds the"
8224 " final result.\n");
8225 return false;
8229 if (!vec_stmt)
8231 /* No transformation required. */
8232 if (LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo))
8234 if (!direct_internal_fn_supported_p (IFN_EXTRACT_LAST, vectype,
8235 OPTIMIZE_FOR_SPEED))
8237 if (dump_enabled_p ())
8238 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8239 "can't use a fully-masked loop because "
8240 "the target doesn't support extract last "
8241 "reduction.\n");
8242 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
8244 else if (slp_node)
8246 if (dump_enabled_p ())
8247 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8248 "can't use a fully-masked loop because an "
8249 "SLP statement is live after the loop.\n");
8250 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
8252 else if (ncopies > 1)
8254 if (dump_enabled_p ())
8255 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8256 "can't use a fully-masked loop because"
8257 " ncopies is greater than 1.\n");
8258 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
8260 else
8262 gcc_assert (ncopies == 1 && !slp_node);
8263 vect_record_loop_mask (loop_vinfo,
8264 &LOOP_VINFO_MASKS (loop_vinfo),
8265 1, vectype);
8268 return true;
8271 /* If stmt has a related stmt, then use that for getting the lhs. */
8272 if (is_pattern_stmt_p (stmt_info))
8273 stmt = STMT_VINFO_RELATED_STMT (stmt_info);
8275 lhs = (is_a <gphi *> (stmt)) ? gimple_phi_result (stmt)
8276 : gimple_get_lhs (stmt);
8277 lhs_type = TREE_TYPE (lhs);
8279 bitsize = (VECTOR_BOOLEAN_TYPE_P (vectype)
8280 ? bitsize_int (TYPE_PRECISION (TREE_TYPE (vectype)))
8281 : TYPE_SIZE (TREE_TYPE (vectype)));
8282 vec_bitsize = TYPE_SIZE (vectype);
8284 /* Get the vectorized lhs of STMT and the lane to use (counted in bits). */
8285 tree vec_lhs, bitstart;
8286 if (slp_node)
8288 gcc_assert (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo));
8290 /* Get the correct slp vectorized stmt. */
8291 gimple *vec_stmt = SLP_TREE_VEC_STMTS (slp_node)[vec_entry];
8292 if (gphi *phi = dyn_cast <gphi *> (vec_stmt))
8293 vec_lhs = gimple_phi_result (phi);
8294 else
8295 vec_lhs = gimple_get_lhs (vec_stmt);
8297 /* Get entry to use. */
8298 bitstart = bitsize_int (vec_index);
8299 bitstart = int_const_binop (MULT_EXPR, bitsize, bitstart);
8301 else
8303 enum vect_def_type dt = STMT_VINFO_DEF_TYPE (stmt_info);
8304 vec_lhs = vect_get_vec_def_for_operand_1 (stmt, dt);
8305 gcc_checking_assert (ncopies == 1
8306 || !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo));
8308 /* For multiple copies, get the last copy. */
8309 for (int i = 1; i < ncopies; ++i)
8310 vec_lhs = vect_get_vec_def_for_stmt_copy (vect_unknown_def_type,
8311 vec_lhs);
8313 /* Get the last lane in the vector. */
8314 bitstart = int_const_binop (MINUS_EXPR, vec_bitsize, bitsize);
8317 gimple_seq stmts = NULL;
8318 tree new_tree;
8319 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
8321 /* Emit:
8323 SCALAR_RES = EXTRACT_LAST <VEC_LHS, MASK>
8325 where VEC_LHS is the vectorized live-out result and MASK is
8326 the loop mask for the final iteration. */
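      /* For example, if the mask for the final iteration is
	 { 1, 1, 1, 0, ... } then the last active lane is lane 2 and
	 EXTRACT_LAST yields element 2 of VEC_LHS.  */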
8327 gcc_assert (ncopies == 1 && !slp_node);
8328 tree scalar_type = TREE_TYPE (STMT_VINFO_VECTYPE (stmt_info));
8329 tree scalar_res = make_ssa_name (scalar_type);
8330 tree mask = vect_get_loop_mask (gsi, &LOOP_VINFO_MASKS (loop_vinfo),
8331 1, vectype, 0);
8332 gcall *new_stmt = gimple_build_call_internal (IFN_EXTRACT_LAST,
8333 2, mask, vec_lhs);
8334 gimple_call_set_lhs (new_stmt, scalar_res);
8335 gimple_seq_add_stmt (&stmts, new_stmt);
8337 /* Convert the extracted vector element to the required scalar type. */
8338 new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
8340 else
8342 tree bftype = TREE_TYPE (vectype);
8343 if (VECTOR_BOOLEAN_TYPE_P (vectype))
8344 bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1);
8345 new_tree = build3 (BIT_FIELD_REF, bftype, vec_lhs, bitsize, bitstart);
8346 new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree),
8347 &stmts, true, NULL_TREE);
8350 if (stmts)
8351 gsi_insert_seq_on_edge_immediate (single_exit (loop), stmts);
8353 /* Replace uses of lhs with the newly computed result. If the use stmt
8354 is a single-argument PHI, just replace all uses of the PHI result. This is
8355 necessary because the lcssa PHI defining lhs may come before the newly inserted stmt. */
8356 use_operand_p use_p;
8357 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
8358 if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt))
8359 && !is_gimple_debug (use_stmt))
8361 if (gimple_code (use_stmt) == GIMPLE_PHI
8362 && gimple_phi_num_args (use_stmt) == 1)
8364 replace_uses_by (gimple_phi_result (use_stmt), new_tree);
8366 else
8368 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
8369 SET_USE (use_p, new_tree);
8371 update_stmt (use_stmt);
8374 return true;
8377 /* Kill any debug uses outside LOOP of SSA names defined in STMT. */
8379 static void
8380 vect_loop_kill_debug_uses (struct loop *loop, gimple *stmt)
8382 ssa_op_iter op_iter;
8383 imm_use_iterator imm_iter;
8384 def_operand_p def_p;
8385 gimple *ustmt;
8387 FOR_EACH_PHI_OR_STMT_DEF (def_p, stmt, op_iter, SSA_OP_DEF)
8389 FOR_EACH_IMM_USE_STMT (ustmt, imm_iter, DEF_FROM_PTR (def_p))
8391 basic_block bb;
8393 if (!is_gimple_debug (ustmt))
8394 continue;
8396 bb = gimple_bb (ustmt);
8398 if (!flow_bb_inside_loop_p (loop, bb))
8400 if (gimple_debug_bind_p (ustmt))
8402 if (dump_enabled_p ())
8403 dump_printf_loc (MSG_NOTE, vect_location,
8404 "killing debug use\n");
8406 gimple_debug_bind_reset_value (ustmt);
8407 update_stmt (ustmt);
8409 else
8410 gcc_unreachable ();
8416 /* Given loop represented by LOOP_VINFO, return true if computation of
8417 LOOP_VINFO_NITERS (= LOOP_VINFO_NITERSM1 + 1) doesn't overflow, false
8418 otherwise. */
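/* For example, if the type of LOOP_VINFO_NITERS is an 8-bit unsigned type
   and the loop may run 256 iterations (LOOP_VINFO_NITERSM1 == 255), then
   NITERSM1 + 1 wraps around to 0, so we have to return false.  */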
8420 static bool
8421 loop_niters_no_overflow (loop_vec_info loop_vinfo)
8423 /* Constant case. */
8424 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
8426 tree cst_niters = LOOP_VINFO_NITERS (loop_vinfo);
8427 tree cst_nitersm1 = LOOP_VINFO_NITERSM1 (loop_vinfo);
8429 gcc_assert (TREE_CODE (cst_niters) == INTEGER_CST);
8430 gcc_assert (TREE_CODE (cst_nitersm1) == INTEGER_CST);
8431 if (wi::to_widest (cst_nitersm1) < wi::to_widest (cst_niters))
8432 return true;
8435 widest_int max;
8436 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8437 /* Check the upper bound of loop niters. */
8438 if (get_max_loop_iterations (loop, &max))
8440 tree type = TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo));
8441 signop sgn = TYPE_SIGN (type);
8442 widest_int type_max = widest_int::from (wi::max_value (type), sgn);
8443 if (max < type_max)
8444 return true;
8446 return false;
8449 /* Return a mask type with half the number of elements as TYPE. */
8451 tree
8452 vect_halve_mask_nunits (tree type)
8454 poly_uint64 nunits = exact_div (TYPE_VECTOR_SUBPARTS (type), 2);
8455 return build_truth_vector_type (nunits, current_vector_size);
8458 /* Return a mask type with twice as many elements as TYPE. */
8460 tree
8461 vect_double_mask_nunits (tree type)
8463 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (type) * 2;
8464 return build_truth_vector_type (nunits, current_vector_size);
8467 /* Record that a fully-masked version of LOOP_VINFO would need MASKS to
8468 contain a sequence of NVECTORS masks that each control a vector of type
8469 VECTYPE. */
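/* For example (illustrative values), with a vectorization factor of 8 and
   NVECTORS == 2 vectors of 8 halfword elements each, the rgroup at index
   NVECTORS - 1 controls 2 * 8 / 8 == 2 scalars per iteration and its
   masks have the same number of elements as VECTYPE.  */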
8471 void
8472 vect_record_loop_mask (loop_vec_info loop_vinfo, vec_loop_masks *masks,
8473 unsigned int nvectors, tree vectype)
8475 gcc_assert (nvectors != 0);
8476 if (masks->length () < nvectors)
8477 masks->safe_grow_cleared (nvectors);
8478 rgroup_masks *rgm = &(*masks)[nvectors - 1];
8479 /* The number of scalars per iteration and the number of vectors are
8480 both compile-time constants. */
8481 unsigned int nscalars_per_iter
8482 = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
8483 LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
8484 if (rgm->max_nscalars_per_iter < nscalars_per_iter)
8486 rgm->max_nscalars_per_iter = nscalars_per_iter;
8487 rgm->mask_type = build_same_sized_truth_vector_type (vectype);
8491 /* Given a complete set of masks MASKS, extract mask number INDEX
8492 for an rgroup that operates on NVECTORS vectors of type VECTYPE,
8493 where 0 <= INDEX < NVECTORS. Insert any set-up statements before GSI.
8495 See the comment above vec_loop_masks for more details about the mask
8496 arrangement. */
8498 tree
8499 vect_get_loop_mask (gimple_stmt_iterator *gsi, vec_loop_masks *masks,
8500 unsigned int nvectors, tree vectype, unsigned int index)
8502 rgroup_masks *rgm = &(*masks)[nvectors - 1];
8503 tree mask_type = rgm->mask_type;
8505 /* Populate the rgroup's mask array, if this is the first time we've
8506 used it. */
8507 if (rgm->masks.is_empty ())
8509 rgm->masks.safe_grow_cleared (nvectors);
8510 for (unsigned int i = 0; i < nvectors; ++i)
8512 tree mask = make_temp_ssa_name (mask_type, NULL, "loop_mask");
8513 /* Provide a dummy definition until the real one is available. */
8514 SSA_NAME_DEF_STMT (mask) = gimple_build_nop ();
8515 rgm->masks[i] = mask;
8519 tree mask = rgm->masks[index];
8520 if (maybe_ne (TYPE_VECTOR_SUBPARTS (mask_type),
8521 TYPE_VECTOR_SUBPARTS (vectype)))
8523 /* A loop mask for data type X can be reused for data type Y
8524 if X has N times more elements than Y and if Y's elements
8525 are N times bigger than X's. In this case each sequence
8526 of N elements in the loop mask will be all-zero or all-one.
8527 We can then view-convert the mask so that each sequence of
8528 N elements is replaced by a single element. */
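      /* For example, a mask recorded for an 8 x 16-bit element vector can
	 be reused for a 4 x 32-bit element vector: each pair of mask
	 elements is known to be all-zero or all-one, so view-converting
	 the mask to the 4-element mask type gives the mask we need.  */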
8529 gcc_assert (multiple_p (TYPE_VECTOR_SUBPARTS (mask_type),
8530 TYPE_VECTOR_SUBPARTS (vectype)));
8531 gimple_seq seq = NULL;
8532 mask_type = build_same_sized_truth_vector_type (vectype);
8533 mask = gimple_build (&seq, VIEW_CONVERT_EXPR, mask_type, mask);
8534 if (seq)
8535 gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
8537 return mask;
8540 /* Scale profiling counters by estimation for LOOP which is vectorized
8541 by factor VF. */
8543 static void
8544 scale_profile_for_vect_loop (struct loop *loop, unsigned vf)
8546 edge preheader = loop_preheader_edge (loop);
8547 /* Reduce loop iterations by the vectorization factor. */
8548 gcov_type new_est_niter = niter_for_unrolled_loop (loop, vf);
8549 profile_count freq_h = loop->header->count, freq_e = preheader->count ();
8551 if (freq_h.nonzero_p ())
8553 profile_probability p;
8555 /* Avoid dropping loop body profile counter to 0 because of zero count
8556 in loop's preheader. */
8557 if (!(freq_e == profile_count::zero ()))
8558 freq_e = freq_e.force_nonzero ();
8559 p = freq_e.apply_scale (new_est_niter + 1, 1).probability_in (freq_h);
8560 scale_loop_frequencies (loop, p);
8563 edge exit_e = single_exit (loop);
8564 exit_e->probability = profile_probability::always ()
8565 .apply_scale (1, new_est_niter + 1);
8567 edge exit_l = single_pred_edge (loop->latch);
8568 profile_probability prob = exit_l->probability;
8569 exit_l->probability = exit_e->probability.invert ();
8570 if (prob.initialized_p () && exit_l->probability.initialized_p ())
8571 scale_bbs_frequencies (&loop->latch, 1, exit_l->probability / prob);
8574 /* Function vect_transform_loop.
8576 The analysis phase has determined that the loop is vectorizable.
8577 Vectorize the loop - create vectorized stmts to replace the scalar
8578 stmts in the loop, and update the loop exit condition.
8579 Returns the scalar epilogue loop, if any. */
8581 struct loop *
8582 vect_transform_loop (loop_vec_info loop_vinfo)
8584 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8585 struct loop *epilogue = NULL;
8586 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
8587 int nbbs = loop->num_nodes;
8588 int i;
8589 tree niters_vector = NULL_TREE;
8590 tree step_vector = NULL_TREE;
8591 tree niters_vector_mult_vf = NULL_TREE;
8592 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
8593 unsigned int lowest_vf = constant_lower_bound (vf);
8594 bool grouped_store;
8595 bool slp_scheduled = false;
8596 gimple *stmt, *pattern_stmt;
8597 gimple_seq pattern_def_seq = NULL;
8598 gimple_stmt_iterator pattern_def_si = gsi_none ();
8599 bool transform_pattern_stmt = false;
8600 bool check_profitability = false;
8601 unsigned int th;
8603 if (dump_enabled_p ())
8604 dump_printf_loc (MSG_NOTE, vect_location, "=== vec_transform_loop ===\n");
8606 /* Use the more conservative vectorization threshold. If the number
8607 of iterations is constant, assume the cost check has been performed
8608 by our caller. If the threshold makes all loops that run at least
8609 the (estimated) vectorization factor number of times profitable,
8610 checking is pointless, too. */
8611 th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
8612 if (th >= vect_vf_for_cost (loop_vinfo)
8613 && !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
8615 if (dump_enabled_p ())
8616 dump_printf_loc (MSG_NOTE, vect_location,
8617 "Profitability threshold is %d loop iterations.\n",
8618 th);
8619 check_profitability = true;
8622 /* Make sure there exists a single-predecessor exit bb. Do this before
8623 versioning. */
8624 edge e = single_exit (loop);
8625 if (! single_pred_p (e->dest))
8627 split_loop_exit_edge (e);
8628 if (dump_enabled_p ())
8629 dump_printf (MSG_NOTE, "split exit edge\n");
8632 /* Version the loop first, if required, so the profitability check
8633 comes first. */
8635 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
8637 poly_uint64 versioning_threshold
8638 = LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo);
8639 if (check_profitability
8640 && ordered_p (poly_uint64 (th), versioning_threshold))
8642 versioning_threshold = ordered_max (poly_uint64 (th),
8643 versioning_threshold);
8644 check_profitability = false;
8646 vect_loop_versioning (loop_vinfo, th, check_profitability,
8647 versioning_threshold);
8648 check_profitability = false;
8651 /* Make sure there exists a single-predecessor exit bb also on the
8652 scalar loop copy. Do this after versioning but before peeling,
8653 so the CFG structure is fine for both the scalar and the if-converted
8654 loop, and so slpeel_duplicate_current_defs_from_edges sees matched
8655 loop-closed PHI nodes on the exit. */
8656 if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))
8658 e = single_exit (LOOP_VINFO_SCALAR_LOOP (loop_vinfo));
8659 if (! single_pred_p (e->dest))
8661 split_loop_exit_edge (e);
8662 if (dump_enabled_p ())
8663 dump_printf (MSG_NOTE, "split exit edge of scalar loop\n");
8667 tree niters = vect_build_loop_niters (loop_vinfo);
8668 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = niters;
8669 tree nitersm1 = unshare_expr (LOOP_VINFO_NITERSM1 (loop_vinfo));
8670 bool niters_no_overflow = loop_niters_no_overflow (loop_vinfo);
8671 epilogue = vect_do_peeling (loop_vinfo, niters, nitersm1, &niters_vector,
8672 &step_vector, &niters_vector_mult_vf, th,
8673 check_profitability, niters_no_overflow);
8675 if (niters_vector == NULL_TREE)
8677 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
8678 && !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
8679 && known_eq (lowest_vf, vf))
8681 niters_vector
8682 = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
8683 LOOP_VINFO_INT_NITERS (loop_vinfo) / lowest_vf);
8684 step_vector = build_one_cst (TREE_TYPE (niters));
8686 else
8687 vect_gen_vector_loop_niters (loop_vinfo, niters, &niters_vector,
8688 &step_vector, niters_no_overflow);
8691 /* 1) Make sure the loop header has exactly two entries
8692 2) Make sure we have a preheader basic block. */
8694 gcc_assert (EDGE_COUNT (loop->header->preds) == 2);
8696 split_edge (loop_preheader_edge (loop));
8698 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
8699 && vect_use_loop_mask_for_alignment_p (loop_vinfo))
8700 /* This will deal with any possible peeling. */
8701 vect_prepare_for_masked_peels (loop_vinfo);
8703 /* FORNOW: the vectorizer supports only loops whose body consists
8704 of one basic block (header + empty latch). When the vectorizer
8705 supports more involved loop forms, the order in which the BBs are
8706 traversed will need to be reconsidered. */
8708 for (i = 0; i < nbbs; i++)
8710 basic_block bb = bbs[i];
8711 stmt_vec_info stmt_info;
8713 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
8714 gsi_next (&si))
8716 gphi *phi = si.phi ();
8717 if (dump_enabled_p ())
8719 dump_printf_loc (MSG_NOTE, vect_location,
8720 "------>vectorizing phi: ");
8721 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
8723 stmt_info = vinfo_for_stmt (phi);
8724 if (!stmt_info)
8725 continue;
8727 if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
8728 vect_loop_kill_debug_uses (loop, phi);
8730 if (!STMT_VINFO_RELEVANT_P (stmt_info)
8731 && !STMT_VINFO_LIVE_P (stmt_info))
8732 continue;
8734 if (STMT_VINFO_VECTYPE (stmt_info)
8735 && (maybe_ne
8736 (TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info)), vf))
8737 && dump_enabled_p ())
8738 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
8740 if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
8741 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
8742 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
8743 && ! PURE_SLP_STMT (stmt_info))
8745 if (dump_enabled_p ())
8746 dump_printf_loc (MSG_NOTE, vect_location, "transform phi.\n");
8747 vect_transform_stmt (phi, NULL, NULL, NULL, NULL);
8751 pattern_stmt = NULL;
8752 for (gimple_stmt_iterator si = gsi_start_bb (bb);
8753 !gsi_end_p (si) || transform_pattern_stmt;)
8755 bool is_store;
8757 if (transform_pattern_stmt)
8758 stmt = pattern_stmt;
8759 else
8761 stmt = gsi_stmt (si);
8762 /* During vectorization remove existing clobber stmts. */
8763 if (gimple_clobber_p (stmt))
8765 unlink_stmt_vdef (stmt);
8766 gsi_remove (&si, true);
8767 release_defs (stmt);
8768 continue;
8772 if (dump_enabled_p ())
8774 dump_printf_loc (MSG_NOTE, vect_location,
8775 "------>vectorizing statement: ");
8776 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
8779 stmt_info = vinfo_for_stmt (stmt);
8781 /* vector stmts created in the outer-loop during vectorization of
8782 stmts in an inner-loop may not have a stmt_info, and do not
8783 need to be vectorized. */
8784 if (!stmt_info)
8786 gsi_next (&si);
8787 continue;
8790 if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
8791 vect_loop_kill_debug_uses (loop, stmt);
8793 if (!STMT_VINFO_RELEVANT_P (stmt_info)
8794 && !STMT_VINFO_LIVE_P (stmt_info))
8796 if (STMT_VINFO_IN_PATTERN_P (stmt_info)
8797 && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
8798 && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
8799 || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
8801 stmt = pattern_stmt;
8802 stmt_info = vinfo_for_stmt (stmt);
8804 else
8806 gsi_next (&si);
8807 continue;
8810 else if (STMT_VINFO_IN_PATTERN_P (stmt_info)
8811 && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
8812 && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
8813 || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
8814 transform_pattern_stmt = true;
8816 /* If pattern statement has def stmts, vectorize them too. */
8817 if (is_pattern_stmt_p (stmt_info))
8819 if (pattern_def_seq == NULL)
8821 pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
8822 pattern_def_si = gsi_start (pattern_def_seq);
8824 else if (!gsi_end_p (pattern_def_si))
8825 gsi_next (&pattern_def_si);
8826 if (pattern_def_seq != NULL)
8828 gimple *pattern_def_stmt = NULL;
8829 stmt_vec_info pattern_def_stmt_info = NULL;
8831 while (!gsi_end_p (pattern_def_si))
8833 pattern_def_stmt = gsi_stmt (pattern_def_si);
8834 pattern_def_stmt_info
8835 = vinfo_for_stmt (pattern_def_stmt);
8836 if (STMT_VINFO_RELEVANT_P (pattern_def_stmt_info)
8837 || STMT_VINFO_LIVE_P (pattern_def_stmt_info))
8838 break;
8839 gsi_next (&pattern_def_si);
8842 if (!gsi_end_p (pattern_def_si))
8844 if (dump_enabled_p ())
8846 dump_printf_loc (MSG_NOTE, vect_location,
8847 "==> vectorizing pattern def "
8848 "stmt: ");
8849 dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
8850 pattern_def_stmt, 0);
8853 stmt = pattern_def_stmt;
8854 stmt_info = pattern_def_stmt_info;
8856 else
8858 pattern_def_si = gsi_none ();
8859 transform_pattern_stmt = false;
8862 else
8863 transform_pattern_stmt = false;
8866 if (STMT_VINFO_VECTYPE (stmt_info))
8868 poly_uint64 nunits
8869 = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
8870 if (!STMT_SLP_TYPE (stmt_info)
8871 && maybe_ne (nunits, vf)
8872 && dump_enabled_p ())
8873 /* For SLP, VF is set according to the unrolling factor, and not
8874 to the vector size, hence for SLP this message is not valid. */
8875 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
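/* Illustration (values invented for exposition): with 32-bit elements,
   V4SI vectors (nunits == 4) and vf == 8, maybe_ne (nunits, vf) holds,
   the "multiple-types" note is printed, and the non-SLP code-generation
   routines are expected to emit vf / nunits == 2 vector copies per
   scalar stmt.  */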
8878 /* SLP. Schedule all the SLP instances when the first SLP stmt is
8879 reached. */
8880 if (STMT_SLP_TYPE (stmt_info))
8882 if (!slp_scheduled)
8884 slp_scheduled = true;
8886 if (dump_enabled_p ())
8887 dump_printf_loc (MSG_NOTE, vect_location,
8888 "=== scheduling SLP instances ===\n");
8890 vect_schedule_slp (loop_vinfo);
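/* vect_schedule_slp emits vector code for all SLP instances of the loop
   in one pass, so it only needs to run once, triggered by the first SLP
   stmt encountered; later pure-SLP stmts are skipped just below, while
   hybrid SLP stmts additionally go through the scalar-stmt path.  */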
8893 /* Hybrid SLP stmts must be vectorized in addition to SLP. */
8894 if (!vinfo_for_stmt (stmt) || PURE_SLP_STMT (stmt_info))
8896 if (!transform_pattern_stmt && gsi_end_p (pattern_def_si))
8898 pattern_def_seq = NULL;
8899 gsi_next (&si);
8901 continue;
8905 /* -------- vectorize statement ------------ */
8906 if (dump_enabled_p ())
8907 dump_printf_loc (MSG_NOTE, vect_location, "transform statement.\n");
8909 grouped_store = false;
8910 is_store = vect_transform_stmt (stmt, &si, &grouped_store, NULL, NULL);
8911 if (is_store)
8913 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
8915 /* Interleaving. If IS_STORE is TRUE, the vectorization of the
8916 interleaving chain has been completed: free all the stores in
8917 the chain. */
8918 gsi_next (&si);
8919 vect_remove_stores (GROUP_FIRST_ELEMENT (stmt_info));
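/* For a grouped store, vect_transform_stmt generates the vector stores
   only once the last scalar store of the interleaving group is reached;
   at that point the remaining scalar stores of the group (reached via
   GROUP_FIRST_ELEMENT) are dead and are removed here.  */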
8921 else
8923 /* Free the attached stmt_vec_info and remove the stmt. */
8924 gimple *store = gsi_stmt (si);
8925 free_stmt_vec_info (store);
8926 unlink_stmt_vdef (store);
8927 gsi_remove (&si, true);
8928 release_defs (store);
8931 /* Stores can only appear at the end of pattern statements. */
8932 gcc_assert (!transform_pattern_stmt);
8933 pattern_def_seq = NULL;
8935 else if (!transform_pattern_stmt && gsi_end_p (pattern_def_si))
8937 pattern_def_seq = NULL;
8938 gsi_next (&si);
8940 } /* stmts in BB */
8942 /* Stub out scalar statements that must not survive vectorization.
8943 Doing this here helps with grouped statements, or statements that
8944 are involved in patterns. */
8945 for (gimple_stmt_iterator gsi = gsi_start_bb (bb);
8946 !gsi_end_p (gsi); gsi_next (&gsi))
8948 gcall *call = dyn_cast <gcall *> (gsi_stmt (gsi));
8949 if (call && gimple_call_internal_p (call, IFN_MASK_LOAD))
8951 tree lhs = gimple_get_lhs (call);
8952 if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
8954 tree zero = build_zero_cst (TREE_TYPE (lhs));
8955 gimple *new_stmt = gimple_build_assign (lhs, zero);
8956 gsi_replace (&gsi, new_stmt, true);
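/* A scalar IFN_MASK_LOAD call cannot be expanded to RTL, and any such
   call whose result was not turned into a vector statement is dead after
   vectorization; replacing the call with "lhs = 0" here presumably lets
   later cleanup passes delete it without the expander ever seeing the
   internal call.  */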
8960 } /* BBs in loop */
8962 /* The vectorization factor is always > 1, so if we use an IV increment of 1,
8963 a zero NITERS becomes a nonzero NITERS_VECTOR. */
8964 if (integer_onep (step_vector))
8965 niters_no_overflow = true;
8966 vect_set_loop_condition (loop, loop_vinfo, niters_vector, step_vector,
8967 niters_vector_mult_vf, !niters_no_overflow);
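/* One reading of the comment above: with a step of 1 the IV counts
   vector iterations, and since vf > 1 even a NITERS that has wrapped to
   zero in its type (2^32 scalar iterations for a 32-bit count, say)
   corresponds to a nonzero, representable NITERS_VECTOR, so the exit
   condition cannot overflow.  */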
8969 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
8970 scale_profile_for_vect_loop (loop, assumed_vf);
8972 /* True if the final iteration might not handle a full vector's
8973 worth of scalar iterations. */
8974 bool final_iter_may_be_partial = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
8975 /* The minimum number of iterations performed by the epilogue. This
8976 is 1 when peeling for gaps because we always need a final scalar
8977 iteration. */
8978 int min_epilogue_iters = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0;
8979 /* +1 to convert latch counts to loop iteration counts,
8980 -min_epilogue_iters to remove iterations that cannot be performed
8981 by the vector code. */
8982 int bias_for_lowest = 1 - min_epilogue_iters;
8983 int bias_for_assumed = bias_for_lowest;
8984 int alignment_npeels = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
8985 if (alignment_npeels && LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
8987 /* When the amount of peeling is known at compile time, the first
8988 iteration will have exactly alignment_npeels active elements.
8989 In the worst case it will have at least one. */
8990 int min_first_active = (alignment_npeels > 0 ? alignment_npeels : 1);
8991 bias_for_lowest += lowest_vf - min_first_active;
8992 bias_for_assumed += assumed_vf - min_first_active;
8994 /* In these calculations the "- 1" converts loop iteration counts
8995 back to latch counts. */
8996 if (loop->any_upper_bound)
8997 loop->nb_iterations_upper_bound
8998 = (final_iter_may_be_partial
8999 ? wi::udiv_ceil (loop->nb_iterations_upper_bound + bias_for_lowest,
9000 lowest_vf) - 1
9001 : wi::udiv_floor (loop->nb_iterations_upper_bound + bias_for_lowest,
9002 lowest_vf) - 1);
9003 if (loop->any_likely_upper_bound)
9004 loop->nb_iterations_likely_upper_bound
9005 = (final_iter_may_be_partial
9006 ? wi::udiv_ceil (loop->nb_iterations_likely_upper_bound
9007 + bias_for_lowest, lowest_vf) - 1
9008 : wi::udiv_floor (loop->nb_iterations_likely_upper_bound
9009 + bias_for_lowest, lowest_vf) - 1);
9010 if (loop->any_estimate)
9011 loop->nb_iterations_estimate
9012 = (final_iter_may_be_partial
9013 ? wi::udiv_ceil (loop->nb_iterations_estimate + bias_for_assumed,
9014 assumed_vf) - 1
9015 : wi::udiv_floor (loop->nb_iterations_estimate + bias_for_assumed,
9016 assumed_vf) - 1);
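/* Worked example (numbers invented): vf == lowest_vf == 4, no peeling
   for gaps and no full masking, so min_epilogue_iters == 0 and
   bias_for_lowest == 1.  A known latch bound of 13 (14 scalar
   iterations) then becomes floor ((13 + 1) / 4) - 1 == 2, i.e. at most
   3 vector iterations, with the remaining 2 scalar iterations left to
   the epilogue.  */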
9018 if (dump_enabled_p ())
9020 if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
9022 dump_printf_loc (MSG_NOTE, vect_location,
9023 "LOOP VECTORIZED\n");
9024 if (loop->inner)
9025 dump_printf_loc (MSG_NOTE, vect_location,
9026 "OUTER LOOP VECTORIZED\n");
9027 dump_printf (MSG_NOTE, "\n");
9029 else
9031 dump_printf_loc (MSG_NOTE, vect_location,
9032 "LOOP EPILOGUE VECTORIZED (VS=");
9033 dump_dec (MSG_NOTE, current_vector_size);
9034 dump_printf (MSG_NOTE, ")\n");
9038 /* Free SLP instances here because otherwise stmt reference counting
9039 won't work. */
9040 slp_instance instance;
9041 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
9042 vect_free_slp_instance (instance);
9043 LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
9044 /* Clear the safelen field since its value is invalid after vectorization:
9045 the vectorized loop can have loop-carried dependencies. */
9046 loop->safelen = 0;
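/* E.g. if the source loop carried "#pragma omp simd safelen(8)" and was
   vectorized with vf == 4, two vector iterations already span 8 scalar
   iterations, so the old safelen value would overstate what the
   transformed loop allows; resetting it to 0 is the conservative
   choice.  */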
9048 /* Don't vectorize an epilogue of an epilogue loop. */
9049 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
9050 epilogue = NULL;
9052 if (!PARAM_VALUE (PARAM_VECT_EPILOGUES_NOMASK))
9053 epilogue = NULL;
9055 if (epilogue)
9057 auto_vector_sizes vector_sizes;
9058 targetm.vectorize.autovectorize_vector_sizes (&vector_sizes);
9059 unsigned int next_size = 0;
9061 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
9062 && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) >= 0
9063 && known_eq (vf, lowest_vf))
9065 unsigned int eiters
9066 = (LOOP_VINFO_INT_NITERS (loop_vinfo)
9067 - LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo));
9068 eiters = eiters % lowest_vf;
9069 epilogue->nb_iterations_upper_bound = eiters - 1;
9071 unsigned int ratio;
9072 while (next_size < vector_sizes.length ()
9073 && !(constant_multiple_p (current_vector_size,
9074 vector_sizes[next_size], &ratio)
9075 && eiters >= lowest_vf / ratio))
9076 next_size += 1;
9078 else
9079 while (next_size < vector_sizes.length ()
9080 && maybe_lt (current_vector_size, vector_sizes[next_size]))
9081 next_size += 1;
9083 if (next_size == vector_sizes.length ())
9084 epilogue = NULL;
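/* Sketch of the size selection above (numbers invented for exposition):
   with current_vector_size == 32 bytes, lowest_vf == 8 and a known
   eiters == 5, a candidate list of { 64, 32, 16 } bytes would be
   scanned as follows: 64 is skipped because 32 bytes is not a multiple
   of it, 32 is skipped because 5 < 8, and 16 is chosen because its
   ratio of 2 gives an epilogue vf of 8 / 2 == 4 <= 5.  If the scan
   exhausts the list, epilogue vectorization is abandoned.  */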
9087 if (epilogue)
9089 epilogue->force_vectorize = loop->force_vectorize;
9090 epilogue->safelen = loop->safelen;
9091 epilogue->dont_vectorize = false;
9093 /* We may need to if-convert the epilogue to vectorize it. */
9094 if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))
9095 tree_if_conversion (epilogue);
9098 return epilogue;
9101 /* The code below performs a simple optimization: revert if-conversion
9102 for masked stores, i.e. if the mask of a store is all-zero, skip the
9103 store and, if possible, the producers of its stored value as well.
9104 For example,
9105 for (i=0; i<n; i++)
9106 if (c[i])
9107 {
9108 p1[i] += 1;
9109 p2[i] = p3[i] + 2;
9110 }
9111 this transformation will produce the following semi-hammock:
9113 if (!mask__ifc__42.18_165 == { 0, 0, 0, 0, 0, 0, 0, 0 })
9114 {
9115 vect__11.19_170 = MASK_LOAD (vectp_p1.20_168, 0B, mask__ifc__42.18_165);
9116 vect__12.22_172 = vect__11.19_170 + vect_cst__171;
9117 MASK_STORE (vectp_p1.23_175, 0B, mask__ifc__42.18_165, vect__12.22_172);
9118 vect__18.25_182 = MASK_LOAD (vectp_p3.26_180, 0B, mask__ifc__42.18_165);
9119 vect__19.28_184 = vect__18.25_182 + vect_cst__183;
9120 MASK_STORE (vectp_p2.29_187, 0B, mask__ifc__42.18_165, vect__19.28_184);
9121 }
9122 */
9124 void
9125 optimize_mask_stores (struct loop *loop)
9127 basic_block *bbs = get_loop_body (loop);
9128 unsigned nbbs = loop->num_nodes;
9129 unsigned i;
9130 basic_block bb;
9131 struct loop *bb_loop;
9132 gimple_stmt_iterator gsi;
9133 gimple *stmt;
9134 auto_vec<gimple *> worklist;
9136 vect_location = find_loop_location (loop);
9137 /* Collect all masked stores in the loop, if any. */
9138 for (i = 0; i < nbbs; i++)
9140 bb = bbs[i];
9141 for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
9142 gsi_next (&gsi))
9144 stmt = gsi_stmt (gsi);
9145 if (gimple_call_internal_p (stmt, IFN_MASK_STORE))
9146 worklist.safe_push (stmt);
9150 free (bbs);
9151 if (worklist.is_empty ())
9152 return;
9154 /* Loop has masked stores. */
9155 while (!worklist.is_empty ())
9157 gimple *last, *last_store;
9158 edge e, efalse;
9159 tree mask;
9160 basic_block store_bb, join_bb;
9161 gimple_stmt_iterator gsi_to;
9162 tree vdef, new_vdef;
9163 gphi *phi;
9164 tree vectype;
9165 tree zero;
9167 last = worklist.pop ();
9168 mask = gimple_call_arg (last, 2);
9169 bb = gimple_bb (last);
9170 /* Create then_bb and an if-then structure in the CFG; then_bb belongs
9171 to the same loop as if_bb. That loop can differ from LOOP when a
9172 two-level loop nest is vectorized and the mask store belongs to the
9173 inner loop. */
9174 e = split_block (bb, last);
9175 bb_loop = bb->loop_father;
9176 gcc_assert (loop == bb_loop || flow_loop_nested_p (loop, bb_loop));
9177 join_bb = e->dest;
9178 store_bb = create_empty_bb (bb);
9179 add_bb_to_loop (store_bb, bb_loop);
9180 e->flags = EDGE_TRUE_VALUE;
9181 efalse = make_edge (bb, store_bb, EDGE_FALSE_VALUE);
9182 /* Give the edge into STORE_BB a static probability and derive STORE_BB's count from it. */
9183 efalse->probability = profile_probability::unlikely ();
9184 store_bb->count = efalse->count ();
9185 make_single_succ_edge (store_bb, join_bb, EDGE_FALLTHRU);
9186 if (dom_info_available_p (CDI_DOMINATORS))
9187 set_immediate_dominator (CDI_DOMINATORS, store_bb, bb);
9188 if (dump_enabled_p ())
9189 dump_printf_loc (MSG_NOTE, vect_location,
9190 "Create new block %d to sink mask stores.",
9191 store_bb->index);
9192 /* Create vector comparison with boolean result. */
9193 vectype = TREE_TYPE (mask);
9194 zero = build_zero_cst (vectype);
9195 stmt = gimple_build_cond (EQ_EXPR, mask, zero, NULL_TREE, NULL_TREE);
9196 gsi = gsi_last_bb (bb);
9197 gsi_insert_after (&gsi, stmt, GSI_SAME_STMT);
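/* The guard is now in place: BB ends with "if (mask == { 0, ... })",
   its true edge goes straight to JOIN_BB and its false edge to the new
   STORE_BB, which falls through to JOIN_BB.  The masked stores, and
   where possible the statements producing their stored values, are sunk
   into STORE_BB below.  */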
9198 /* Create new PHI node for vdef of the last masked store:
9199 .MEM_2 = VDEF <.MEM_1>
9200 will be converted to
9201 .MEM.3 = VDEF <.MEM_1>
9202 and new PHI node will be created in join bb
9203 .MEM_2 = PHI <.MEM_1, .MEM_3>
9205 vdef = gimple_vdef (last);
9206 new_vdef = make_ssa_name (gimple_vop (cfun), last);
9207 gimple_set_vdef (last, new_vdef);
9208 phi = create_phi_node (vdef, join_bb);
9209 add_phi_arg (phi, new_vdef, EDGE_SUCC (store_bb, 0), UNKNOWN_LOCATION);
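/* Only the argument coming from STORE_BB is added to the new PHI here;
   the argument for edge E, which bypasses STORE_BB when the mask is
   all-zero, is added after the sinking loop below, using the VUSE of the
   last store processed (see the final add_phi_arg of this function).  */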
9211 /* Sink all masked stores with the same mask into STORE_BB if possible. */
9212 while (true)
9214 gimple_stmt_iterator gsi_from;
9215 gimple *stmt1 = NULL;
9217 /* Move masked store to STORE_BB. */
9218 last_store = last;
9219 gsi = gsi_for_stmt (last);
9220 gsi_from = gsi;
9221 /* Shift GSI to the previous stmt for further traversal. */
9222 gsi_prev (&gsi);
9223 gsi_to = gsi_start_bb (store_bb);
9224 gsi_move_before (&gsi_from, &gsi_to);
9225 /* Set GSI_TO to the start of the now non-empty STORE_BB. */
9226 gsi_to = gsi_start_bb (store_bb);
9227 if (dump_enabled_p ())
9229 dump_printf_loc (MSG_NOTE, vect_location,
9230 "Move stmt to created bb\n");
9231 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, last, 0);
9233 /* Move all stored value producers if possible. */
9234 while (!gsi_end_p (gsi))
9236 tree lhs;
9237 imm_use_iterator imm_iter;
9238 use_operand_p use_p;
9239 bool res;
9241 /* Skip debug statements. */
9242 if (is_gimple_debug (gsi_stmt (gsi)))
9244 gsi_prev (&gsi);
9245 continue;
9247 stmt1 = gsi_stmt (gsi);
9248 /* Do not consider statements writing to memory or having a
9249 volatile operand. */
9250 if (gimple_vdef (stmt1)
9251 || gimple_has_volatile_ops (stmt1))
9252 break;
9253 gsi_from = gsi;
9254 gsi_prev (&gsi);
9255 lhs = gimple_get_lhs (stmt1);
9256 if (!lhs)
9257 break;
9259 /* LHS of vectorized stmt must be SSA_NAME. */
9260 if (TREE_CODE (lhs) != SSA_NAME)
9261 break;
9263 if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
9265 /* Remove dead scalar statement. */
9266 if (has_zero_uses (lhs))
9268 gsi_remove (&gsi_from, true);
9269 continue;
9273 /* Check that LHS does not have uses outside of STORE_BB. */
9274 res = true;
9275 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
9277 gimple *use_stmt;
9278 use_stmt = USE_STMT (use_p);
9279 if (is_gimple_debug (use_stmt))
9280 continue;
9281 if (gimple_bb (use_stmt) != store_bb)
9283 res = false;
9284 break;
9287 if (!res)
9288 break;
9290 if (gimple_vuse (stmt1)
9291 && gimple_vuse (stmt1) != gimple_vuse (last_store))
9292 break;
9294 /* Can move STMT1 to STORE_BB. */
9295 if (dump_enabled_p ())
9297 dump_printf_loc (MSG_NOTE, vect_location,
9298 "Move stmt to created bb\n");
9299 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt1, 0);
9301 gsi_move_before (&gsi_from, &gsi_to);
9302 /* Shift GSI_TO for further insertion. */
9303 gsi_prev (&gsi_to);
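/* Summary of the checks above: a producer is sunk only if it has no
   vdef and no volatile operands, its lhs is an SSA_NAME of vector type
   (dead scalar defs are simply deleted instead), all of its non-debug
   uses are already in STORE_BB, and its VUSE, if any, matches that of
   the masked store being sunk.  */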
9305 /* Sink other masked stores with the same mask into STORE_BB. */
9306 if (worklist.is_empty ()
9307 || gimple_call_arg (worklist.last (), 2) != mask
9308 || worklist.last () != stmt1)
9309 break;
9310 last = worklist.pop ();
9312 add_phi_arg (phi, gimple_vuse (last_store), e, UNKNOWN_LOCATION);