gcc/tree-vect-loop.c
1 /* Loop Vectorization
2 Copyright (C) 2003-2018 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com> and
4 Ira Rosen <irar@il.ibm.com>
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "rtl.h"
28 #include "tree.h"
29 #include "gimple.h"
30 #include "cfghooks.h"
31 #include "tree-pass.h"
32 #include "ssa.h"
33 #include "optabs-tree.h"
34 #include "diagnostic-core.h"
35 #include "fold-const.h"
36 #include "stor-layout.h"
37 #include "cfganal.h"
38 #include "gimplify.h"
39 #include "gimple-iterator.h"
40 #include "gimplify-me.h"
41 #include "tree-ssa-loop-ivopts.h"
42 #include "tree-ssa-loop-manip.h"
43 #include "tree-ssa-loop-niter.h"
44 #include "tree-ssa-loop.h"
45 #include "cfgloop.h"
46 #include "params.h"
47 #include "tree-scalar-evolution.h"
48 #include "tree-vectorizer.h"
49 #include "gimple-fold.h"
50 #include "cgraph.h"
51 #include "tree-cfg.h"
52 #include "tree-if-conv.h"
53 #include "internal-fn.h"
54 #include "tree-vector-builder.h"
55 #include "vec-perm-indices.h"
56 #include "tree-eh.h"
58 /* Loop Vectorization Pass.
60 This pass tries to vectorize loops.
62 For example, the vectorizer transforms the following simple loop:
64 short a[N]; short b[N]; short c[N]; int i;
66 for (i=0; i<N; i++){
67 a[i] = b[i] + c[i];
70 as if it were manually vectorized by rewriting the source code into:
72 typedef int __attribute__((mode(V8HI))) v8hi;
73 short a[N]; short b[N]; short c[N]; int i;
74 v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
75 v8hi va, vb, vc;
77 for (i=0; i<N/8; i++){
78 vb = pb[i];
79 vc = pc[i];
80 va = vb + vc;
81 pa[i] = va;
84 The main entry to this pass is vectorize_loops(), in which
85 the vectorizer applies a set of analyses on a given set of loops,
86 followed by the actual vectorization transformation for the loops that
87 had successfully passed the analysis phase.
88 Throughout this pass we make a distinction between two types of
89 data: scalars (which are represented by SSA_NAMES), and memory references
90 ("data-refs"). These two types of data require different handling both
91 during analysis and transformation. The types of data-refs that the
92 vectorizer currently supports are ARRAY_REFS whose base is an array DECL
93 (not a pointer), and INDIRECT_REFS through pointers; both array and pointer
94 accesses are required to have a simple (consecutive) access pattern.
96 Analysis phase:
97 ===============
98 The driver for the analysis phase is vect_analyze_loop().
99 It applies a set of analyses, some of which rely on the scalar evolution
100 analyzer (scev) developed by Sebastian Pop.
102 During the analysis phase the vectorizer records some information
103 per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
104 loop, as well as general information about the loop as a whole, which is
105 recorded in a "loop_vec_info" struct attached to each loop.
107 Transformation phase:
108 =====================
109 The loop transformation phase scans all the stmts in the loop, and
110 creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
111 the loop that needs to be vectorized. It inserts the vector code sequence
112 just before the scalar stmt S, and records a pointer to the vector code
113 in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
114 attached to S). This pointer will be used for the vectorization of following
115 stmts which use the def of stmt S. Stmt S is removed if it writes to memory;
116 otherwise, we rely on dead code elimination for removing it.
118 For example, say stmt S1 was vectorized into stmt VS1:
120 VS1: vb = px[i];
121 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
122 S2: a = b;
124 To vectorize stmt S2, the vectorizer first finds the stmt that defines
125 the operand 'b' (S1), and gets the relevant vector def 'vb' from the
126 vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)). The
127 resulting sequence would be:
129 VS1: vb = px[i];
130 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
131 VS2: va = vb;
132 S2: a = b; STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2
134 Operands that are not SSA_NAMEs are data-refs that appear in
135 load/store operations (like 'x[i]' in S1), and are handled differently.
137 Target modeling:
138 =================
139 Currently the only target-specific information that is used is the
140 size of the vector (in bytes) - "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".
141 Targets that can support different vector sizes will, for now, need
142 to specify one value for "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD". More
143 flexibility will be added in the future.
145 Since we only vectorize operations whose vector form can be
146 expressed using existing tree codes, to verify that an operation is
147 supported, the vectorizer checks the relevant optab at the relevant
148 machine_mode (e.g., optab_handler (add_optab, V8HImode)). If
149 the value found is CODE_FOR_nothing, then there's no target support, and
150 we can't vectorize the stmt.
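
     For instance, checking whether the target supports adding two
     V8HImode vectors reduces (roughly) to a query of this shape:

       if (optab_handler (add_optab, V8HImode) == CODE_FOR_nothing)
         /* No target support - the stmt can't be vectorized.  */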
152 For additional information on this project see:
153 http://gcc.gnu.org/projects/tree-ssa/vectorization.html
156 static void vect_estimate_min_profitable_iters (loop_vec_info, int *, int *);
158 /* Function vect_determine_vectorization_factor
160 Determine the vectorization factor (VF). VF is the number of data elements
161 that are operated upon in parallel in a single iteration of the vectorized
162 loop. For example, when vectorizing a loop that operates on 4-byte elements,
163 on a target with a vector size (VS) of 16 bytes, the VF is set to 4, since 4
164 elements can fit in a single vector register.
166 We currently support vectorization of loops in which all types operated upon
167 are of the same size. Therefore this function currently sets VF according to
168 the size of the types operated upon, and fails if there are multiple sizes
169 in the loop.
171 VF is also the factor by which the loop iterations are strip-mined, e.g.:
172 original loop:
173 for (i=0; i<N; i++){
174 a[i] = b[i] + c[i];
177 vectorized loop:
178 for (i=0; i<N; i+=VF){
179 a[i:VF] = b[i:VF] + c[i:VF];
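
     For instance, assuming a 16-byte vector size, the short-typed loop
     from the comment above gets VF = 8 (eight 2-byte elements per vector),
     so the vectorized loop runs roughly N/8 iterations, with any leftover
     iterations typically handled by a scalar epilogue.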
183 static bool
184 vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
186 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
187 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
188 unsigned nbbs = loop->num_nodes;
189 poly_uint64 vectorization_factor = 1;
190 tree scalar_type = NULL_TREE;
191 gphi *phi;
192 tree vectype;
193 stmt_vec_info stmt_info;
194 unsigned i;
195 HOST_WIDE_INT dummy;
196 gimple *stmt, *pattern_stmt = NULL;
197 gimple_seq pattern_def_seq = NULL;
198 gimple_stmt_iterator pattern_def_si = gsi_none ();
199 bool analyze_pattern_stmt = false;
200 bool bool_result;
201 auto_vec<stmt_vec_info> mask_producers;
203 if (dump_enabled_p ())
204 dump_printf_loc (MSG_NOTE, vect_location,
205 "=== vect_determine_vectorization_factor ===\n");
207 for (i = 0; i < nbbs; i++)
209 basic_block bb = bbs[i];
211 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
212 gsi_next (&si))
214 phi = si.phi ();
215 stmt_info = vinfo_for_stmt (phi);
216 if (dump_enabled_p ())
218 dump_printf_loc (MSG_NOTE, vect_location, "==> examining phi: ");
219 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
222 gcc_assert (stmt_info);
224 if (STMT_VINFO_RELEVANT_P (stmt_info)
225 || STMT_VINFO_LIVE_P (stmt_info))
227 gcc_assert (!STMT_VINFO_VECTYPE (stmt_info));
228 scalar_type = TREE_TYPE (PHI_RESULT (phi));
230 if (dump_enabled_p ())
232 dump_printf_loc (MSG_NOTE, vect_location,
233 "get vectype for scalar type: ");
234 dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
235 dump_printf (MSG_NOTE, "\n");
238 vectype = get_vectype_for_scalar_type (scalar_type);
239 if (!vectype)
241 if (dump_enabled_p ())
243 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
244 "not vectorized: unsupported "
245 "data-type ");
246 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
247 scalar_type);
248 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
250 return false;
252 STMT_VINFO_VECTYPE (stmt_info) = vectype;
254 if (dump_enabled_p ())
256 dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
257 dump_generic_expr (MSG_NOTE, TDF_SLIM, vectype);
258 dump_printf (MSG_NOTE, "\n");
261 if (dump_enabled_p ())
263 dump_printf_loc (MSG_NOTE, vect_location, "nunits = ");
264 dump_dec (MSG_NOTE, TYPE_VECTOR_SUBPARTS (vectype));
265 dump_printf (MSG_NOTE, "\n");
268 vect_update_max_nunits (&vectorization_factor, vectype);
272 for (gimple_stmt_iterator si = gsi_start_bb (bb);
273 !gsi_end_p (si) || analyze_pattern_stmt;)
275 tree vf_vectype;
277 if (analyze_pattern_stmt)
278 stmt = pattern_stmt;
279 else
280 stmt = gsi_stmt (si);
282 stmt_info = vinfo_for_stmt (stmt);
284 if (dump_enabled_p ())
286 dump_printf_loc (MSG_NOTE, vect_location,
287 "==> examining statement: ");
288 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
291 gcc_assert (stmt_info);
293 /* Skip stmts which do not need to be vectorized. */
294 if ((!STMT_VINFO_RELEVANT_P (stmt_info)
295 && !STMT_VINFO_LIVE_P (stmt_info))
296 || gimple_clobber_p (stmt))
298 if (STMT_VINFO_IN_PATTERN_P (stmt_info)
299 && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
300 && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
301 || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
303 stmt = pattern_stmt;
304 stmt_info = vinfo_for_stmt (pattern_stmt);
305 if (dump_enabled_p ())
307 dump_printf_loc (MSG_NOTE, vect_location,
308 "==> examining pattern statement: ");
309 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
312 else
314 if (dump_enabled_p ())
315 dump_printf_loc (MSG_NOTE, vect_location, "skip.\n");
316 gsi_next (&si);
317 continue;
320 else if (STMT_VINFO_IN_PATTERN_P (stmt_info)
321 && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
322 && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
323 || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
324 analyze_pattern_stmt = true;
326 /* If a pattern statement has def stmts, analyze them too. */
327 if (is_pattern_stmt_p (stmt_info))
329 if (pattern_def_seq == NULL)
331 pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
332 pattern_def_si = gsi_start (pattern_def_seq);
334 else if (!gsi_end_p (pattern_def_si))
335 gsi_next (&pattern_def_si);
336 if (pattern_def_seq != NULL)
338 gimple *pattern_def_stmt = NULL;
339 stmt_vec_info pattern_def_stmt_info = NULL;
341 while (!gsi_end_p (pattern_def_si))
343 pattern_def_stmt = gsi_stmt (pattern_def_si);
344 pattern_def_stmt_info
345 = vinfo_for_stmt (pattern_def_stmt);
346 if (STMT_VINFO_RELEVANT_P (pattern_def_stmt_info)
347 || STMT_VINFO_LIVE_P (pattern_def_stmt_info))
348 break;
349 gsi_next (&pattern_def_si);
352 if (!gsi_end_p (pattern_def_si))
354 if (dump_enabled_p ())
356 dump_printf_loc (MSG_NOTE, vect_location,
357 "==> examining pattern def stmt: ");
358 dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
359 pattern_def_stmt, 0);
362 stmt = pattern_def_stmt;
363 stmt_info = pattern_def_stmt_info;
365 else
367 pattern_def_si = gsi_none ();
368 analyze_pattern_stmt = false;
371 else
372 analyze_pattern_stmt = false;
375 if (gimple_get_lhs (stmt) == NULL_TREE
376 /* MASK_STORE has no lhs, but is ok. */
377 && (!is_gimple_call (stmt)
378 || !gimple_call_internal_p (stmt)
379 || gimple_call_internal_fn (stmt) != IFN_MASK_STORE))
381 if (is_gimple_call (stmt))
383 /* Ignore calls with no lhs. These must be calls to
384 #pragma omp simd functions, and what vectorization factor
385 it really needs can't be determined until
386 vectorizable_simd_clone_call. */
387 if (!analyze_pattern_stmt && gsi_end_p (pattern_def_si))
389 pattern_def_seq = NULL;
390 gsi_next (&si);
392 continue;
394 if (dump_enabled_p ())
396 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
397 "not vectorized: irregular stmt.");
398 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt,
401 return false;
404 if (VECTOR_MODE_P (TYPE_MODE (gimple_expr_type (stmt))))
406 if (dump_enabled_p ())
408 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
409 "not vectorized: vector stmt in loop:");
410 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
412 return false;
415 bool_result = false;
417 if (STMT_VINFO_VECTYPE (stmt_info))
419 /* The only case in which a vectype has already been set is for stmts
420 that contain a dataref, or for "pattern-stmts" (stmts
421 generated by the vectorizer to represent/replace a certain
422 idiom). */
423 gcc_assert (STMT_VINFO_DATA_REF (stmt_info)
424 || is_pattern_stmt_p (stmt_info)
425 || !gsi_end_p (pattern_def_si));
426 vectype = STMT_VINFO_VECTYPE (stmt_info);
428 else
430 gcc_assert (!STMT_VINFO_DATA_REF (stmt_info));
431 if (gimple_call_internal_p (stmt, IFN_MASK_STORE))
432 scalar_type = TREE_TYPE (gimple_call_arg (stmt, 3));
433 else
434 scalar_type = TREE_TYPE (gimple_get_lhs (stmt));
436 /* Bool ops don't participate in the vectorization factor
437 computation. For comparisons, use the compared types to
438 compute a factor. */
439 if (VECT_SCALAR_BOOLEAN_TYPE_P (scalar_type)
440 && is_gimple_assign (stmt)
441 && gimple_assign_rhs_code (stmt) != COND_EXPR)
443 if (STMT_VINFO_RELEVANT_P (stmt_info)
444 || STMT_VINFO_LIVE_P (stmt_info))
445 mask_producers.safe_push (stmt_info);
446 bool_result = true;
448 if (TREE_CODE_CLASS (gimple_assign_rhs_code (stmt))
449 == tcc_comparison
450 && !VECT_SCALAR_BOOLEAN_TYPE_P
451 (TREE_TYPE (gimple_assign_rhs1 (stmt))))
452 scalar_type = TREE_TYPE (gimple_assign_rhs1 (stmt));
453 else
455 if (!analyze_pattern_stmt && gsi_end_p (pattern_def_si))
457 pattern_def_seq = NULL;
458 gsi_next (&si);
460 continue;
464 if (dump_enabled_p ())
466 dump_printf_loc (MSG_NOTE, vect_location,
467 "get vectype for scalar type: ");
468 dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
469 dump_printf (MSG_NOTE, "\n");
471 vectype = get_vectype_for_scalar_type (scalar_type);
472 if (!vectype)
474 if (dump_enabled_p ())
476 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
477 "not vectorized: unsupported "
478 "data-type ");
479 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
480 scalar_type);
481 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
483 return false;
486 if (!bool_result)
487 STMT_VINFO_VECTYPE (stmt_info) = vectype;
489 if (dump_enabled_p ())
491 dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
492 dump_generic_expr (MSG_NOTE, TDF_SLIM, vectype);
493 dump_printf (MSG_NOTE, "\n");
497 /* Don't try to compute the VF from scalar types if the stmt
498 produces a boolean vector. Use the result vectype instead. */
499 if (VECTOR_BOOLEAN_TYPE_P (vectype))
500 vf_vectype = vectype;
501 else
503 /* The vectorization factor is determined by the smallest
504 scalar type (or the largest vector size, but we only
505 support one vector size per loop). */
506 if (!bool_result)
507 scalar_type = vect_get_smallest_scalar_type (stmt, &dummy,
508 &dummy);
509 if (dump_enabled_p ())
511 dump_printf_loc (MSG_NOTE, vect_location,
512 "get vectype for scalar type: ");
513 dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
514 dump_printf (MSG_NOTE, "\n");
516 vf_vectype = get_vectype_for_scalar_type (scalar_type);
518 if (!vf_vectype)
520 if (dump_enabled_p ())
522 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
523 "not vectorized: unsupported data-type ");
524 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
525 scalar_type);
526 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
528 return false;
531 if (maybe_ne (GET_MODE_SIZE (TYPE_MODE (vectype)),
532 GET_MODE_SIZE (TYPE_MODE (vf_vectype))))
534 if (dump_enabled_p ())
536 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
537 "not vectorized: different sized vector "
538 "types in statement, ");
539 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
540 vectype);
541 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
542 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
543 vf_vectype);
544 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
546 return false;
549 if (dump_enabled_p ())
551 dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
552 dump_generic_expr (MSG_NOTE, TDF_SLIM, vf_vectype);
553 dump_printf (MSG_NOTE, "\n");
556 if (dump_enabled_p ())
558 dump_printf_loc (MSG_NOTE, vect_location, "nunits = ");
559 dump_dec (MSG_NOTE, TYPE_VECTOR_SUBPARTS (vf_vectype));
560 dump_printf (MSG_NOTE, "\n");
563 vect_update_max_nunits (&vectorization_factor, vf_vectype);
565 if (!analyze_pattern_stmt && gsi_end_p (pattern_def_si))
567 pattern_def_seq = NULL;
568 gsi_next (&si);
573 /* TODO: Analyze cost. Decide if it is worthwhile to vectorize. */
574 if (dump_enabled_p ())
576 dump_printf_loc (MSG_NOTE, vect_location, "vectorization factor = ");
577 dump_dec (MSG_NOTE, vectorization_factor);
578 dump_printf (MSG_NOTE, "\n");
581 if (known_le (vectorization_factor, 1U))
583 if (dump_enabled_p ())
584 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
585 "not vectorized: unsupported data-type\n");
586 return false;
588 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
590 for (i = 0; i < mask_producers.length (); i++)
592 tree mask_type = NULL;
594 stmt = STMT_VINFO_STMT (mask_producers[i]);
596 if (is_gimple_assign (stmt)
597 && TREE_CODE_CLASS (gimple_assign_rhs_code (stmt)) == tcc_comparison
598 && !VECT_SCALAR_BOOLEAN_TYPE_P
599 (TREE_TYPE (gimple_assign_rhs1 (stmt))))
601 scalar_type = TREE_TYPE (gimple_assign_rhs1 (stmt));
602 mask_type = get_mask_type_for_scalar_type (scalar_type);
604 if (!mask_type)
606 if (dump_enabled_p ())
607 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
608 "not vectorized: unsupported mask\n");
609 return false;
612 else
614 tree rhs;
615 ssa_op_iter iter;
616 gimple *def_stmt;
617 enum vect_def_type dt;
619 FOR_EACH_SSA_TREE_OPERAND (rhs, stmt, iter, SSA_OP_USE)
621 if (!vect_is_simple_use (rhs, mask_producers[i]->vinfo,
622 &def_stmt, &dt, &vectype))
624 if (dump_enabled_p ())
626 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
627 "not vectorized: can't compute mask type "
628 "for statement, ");
629 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt,
632 return false;
635 /* No vectype probably means an external definition.
636 Allow it in case there is another operand which
637 allows us to determine the mask type. */
638 if (!vectype)
639 continue;
641 if (!mask_type)
642 mask_type = vectype;
643 else if (maybe_ne (TYPE_VECTOR_SUBPARTS (mask_type),
644 TYPE_VECTOR_SUBPARTS (vectype)))
646 if (dump_enabled_p ())
648 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
649 "not vectorized: different sized masks "
650 "types in statement, ");
651 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
652 mask_type);
653 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
654 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
655 vectype);
656 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
658 return false;
660 else if (VECTOR_BOOLEAN_TYPE_P (mask_type)
661 != VECTOR_BOOLEAN_TYPE_P (vectype))
663 if (dump_enabled_p ())
665 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
666 "not vectorized: mixed mask and "
667 "nonmask vector types in statement, ");
668 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
669 mask_type);
670 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
671 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
672 vectype);
673 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
675 return false;
679 /* We may compare a boolean value loaded as a vector of integers.
680 Fix mask_type in such a case. */
681 if (mask_type
682 && !VECTOR_BOOLEAN_TYPE_P (mask_type)
683 && gimple_code (stmt) == GIMPLE_ASSIGN
684 && TREE_CODE_CLASS (gimple_assign_rhs_code (stmt)) == tcc_comparison)
685 mask_type = build_same_sized_truth_vector_type (mask_type);
688 /* No mask_type should mean a loop-invariant predicate.
689 This is probably a subject for optimization in
690 if-conversion. */
691 if (!mask_type)
693 if (dump_enabled_p ())
695 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
696 "not vectorized: can't compute mask type "
697 "for statement, ");
698 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt,
701 return false;
704 STMT_VINFO_VECTYPE (mask_producers[i]) = mask_type;
707 return true;
711 /* Function vect_is_simple_iv_evolution.
713 FORNOW: A simple evolution of an induction variable in the loop is
714 considered a polynomial evolution. */
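/* For instance, for an induction variable such as

     for (i = 0; i < n; i++)
       x = x + 4;

   the access function is roughly the chrec {x_0, +, 4}_loop, so *INIT
   becomes x_0 and *STEP becomes 4.  */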
716 static bool
717 vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init,
718 tree * step)
720 tree init_expr;
721 tree step_expr;
722 tree evolution_part = evolution_part_in_loop_num (access_fn, loop_nb);
723 basic_block bb;
725 /* When there is no evolution in this loop, the evolution function
726 is not "simple". */
727 if (evolution_part == NULL_TREE)
728 return false;
730 /* When the evolution is a polynomial of degree >= 2
731 the evolution function is not "simple". */
732 if (tree_is_chrec (evolution_part))
733 return false;
735 step_expr = evolution_part;
736 init_expr = unshare_expr (initial_condition_in_loop_num (access_fn, loop_nb));
738 if (dump_enabled_p ())
740 dump_printf_loc (MSG_NOTE, vect_location, "step: ");
741 dump_generic_expr (MSG_NOTE, TDF_SLIM, step_expr);
742 dump_printf (MSG_NOTE, ", init: ");
743 dump_generic_expr (MSG_NOTE, TDF_SLIM, init_expr);
744 dump_printf (MSG_NOTE, "\n");
747 *init = init_expr;
748 *step = step_expr;
750 if (TREE_CODE (step_expr) != INTEGER_CST
751 && (TREE_CODE (step_expr) != SSA_NAME
752 || ((bb = gimple_bb (SSA_NAME_DEF_STMT (step_expr)))
753 && flow_bb_inside_loop_p (get_loop (cfun, loop_nb), bb))
754 || (!INTEGRAL_TYPE_P (TREE_TYPE (step_expr))
755 && (!SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr))
756 || !flag_associative_math)))
757 && (TREE_CODE (step_expr) != REAL_CST
758 || !flag_associative_math))
760 if (dump_enabled_p ())
761 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
762 "step unknown.\n");
763 return false;
766 return true;
769 /* Function vect_analyze_scalar_cycles_1.
771 Examine the cross iteration def-use cycles of scalar variables
772 in LOOP. LOOP_VINFO represents the loop that is now being
773 considered for vectorization (can be LOOP, or an outer-loop
774 enclosing LOOP). */
776 static void
777 vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, struct loop *loop)
779 basic_block bb = loop->header;
780 tree init, step;
781 auto_vec<gimple *, 64> worklist;
782 gphi_iterator gsi;
783 bool double_reduc;
785 if (dump_enabled_p ())
786 dump_printf_loc (MSG_NOTE, vect_location,
787 "=== vect_analyze_scalar_cycles ===\n");
789 /* First - identify all inductions. Reduction detection assumes that all the
790 inductions have been identified; therefore, this order must not be
791 changed. */
792 for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
794 gphi *phi = gsi.phi ();
795 tree access_fn = NULL;
796 tree def = PHI_RESULT (phi);
797 stmt_vec_info stmt_vinfo = vinfo_for_stmt (phi);
799 if (dump_enabled_p ())
801 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: ");
802 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
805 /* Skip virtual phi's. The data dependences that are associated with
806 virtual defs/uses (i.e., memory accesses) are analyzed elsewhere. */
807 if (virtual_operand_p (def))
808 continue;
810 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_unknown_def_type;
812 /* Analyze the evolution function. */
813 access_fn = analyze_scalar_evolution (loop, def);
814 if (access_fn)
816 STRIP_NOPS (access_fn);
817 if (dump_enabled_p ())
819 dump_printf_loc (MSG_NOTE, vect_location,
820 "Access function of PHI: ");
821 dump_generic_expr (MSG_NOTE, TDF_SLIM, access_fn);
822 dump_printf (MSG_NOTE, "\n");
824 STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
825 = initial_condition_in_loop_num (access_fn, loop->num);
826 STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo)
827 = evolution_part_in_loop_num (access_fn, loop->num);
830 if (!access_fn
831 || !vect_is_simple_iv_evolution (loop->num, access_fn, &init, &step)
832 || (LOOP_VINFO_LOOP (loop_vinfo) != loop
833 && TREE_CODE (step) != INTEGER_CST))
835 worklist.safe_push (phi);
836 continue;
839 gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
840 != NULL_TREE);
841 gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo) != NULL_TREE);
843 if (dump_enabled_p ())
844 dump_printf_loc (MSG_NOTE, vect_location, "Detected induction.\n");
845 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_induction_def;
849 /* Second - identify all reductions and nested cycles. */
850 while (worklist.length () > 0)
852 gimple *phi = worklist.pop ();
853 tree def = PHI_RESULT (phi);
854 stmt_vec_info stmt_vinfo = vinfo_for_stmt (phi);
855 gimple *reduc_stmt;
857 if (dump_enabled_p ())
859 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: ");
860 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
863 gcc_assert (!virtual_operand_p (def)
864 && STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_unknown_def_type);
866 reduc_stmt = vect_force_simple_reduction (loop_vinfo, phi,
867 &double_reduc, false);
868 if (reduc_stmt)
870 if (double_reduc)
872 if (dump_enabled_p ())
873 dump_printf_loc (MSG_NOTE, vect_location,
874 "Detected double reduction.\n");
876 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_double_reduction_def;
877 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
878 vect_double_reduction_def;
880 else
882 if (loop != LOOP_VINFO_LOOP (loop_vinfo))
884 if (dump_enabled_p ())
885 dump_printf_loc (MSG_NOTE, vect_location,
886 "Detected vectorizable nested cycle.\n");
888 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_nested_cycle;
889 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
890 vect_nested_cycle;
892 else
894 if (dump_enabled_p ())
895 dump_printf_loc (MSG_NOTE, vect_location,
896 "Detected reduction.\n");
898 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def;
899 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
900 vect_reduction_def;
901 /* Store the reduction cycles for possible vectorization in
902 loop-aware SLP if it was not detected as reduction
903 chain. */
904 if (! GROUP_FIRST_ELEMENT (vinfo_for_stmt (reduc_stmt)))
905 LOOP_VINFO_REDUCTIONS (loop_vinfo).safe_push (reduc_stmt);
909 else
910 if (dump_enabled_p ())
911 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
912 "Unknown def-use cycle pattern.\n");
917 /* Function vect_analyze_scalar_cycles.
919 Examine the cross iteration def-use cycles of scalar variables, by
920 analyzing the loop-header PHIs of scalar variables. Classify each
921 cycle as one of the following: invariant, induction, reduction, unknown.
922 We do that for the loop represented by LOOP_VINFO, and also for its
923 inner-loop, if it exists.
924 Examples for scalar cycles:
926 Example1: reduction:
928 loop1:
929 for (i=0; i<N; i++)
930 sum += a[i];
932 Example2: induction:
934 loop2:
935 for (i=0; i<N; i++)
936 a[i] = i; */
938 static void
939 vect_analyze_scalar_cycles (loop_vec_info loop_vinfo)
941 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
943 vect_analyze_scalar_cycles_1 (loop_vinfo, loop);
945 /* When vectorizing an outer-loop, the inner-loop is executed sequentially.
946 Reductions in such inner-loop therefore have different properties than
947 the reductions in the nest that gets vectorized:
948 1. When vectorized, they are executed in the same order as in the original
949 scalar loop, so we can't change the order of computation when
950 vectorizing them.
951 2. FIXME: Inner-loop reductions can be used in the inner-loop, so the
952 current checks are too strict. */
954 if (loop->inner)
955 vect_analyze_scalar_cycles_1 (loop_vinfo, loop->inner);
958 /* Transfer group and reduction information from STMT to its pattern stmt. */
960 static void
961 vect_fixup_reduc_chain (gimple *stmt)
963 gimple *firstp = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt));
964 gimple *stmtp;
965 gcc_assert (!GROUP_FIRST_ELEMENT (vinfo_for_stmt (firstp))
966 && GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)));
967 GROUP_SIZE (vinfo_for_stmt (firstp)) = GROUP_SIZE (vinfo_for_stmt (stmt));
970 stmtp = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt));
971 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmtp)) = firstp;
972 stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmt));
973 if (stmt)
974 GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmtp))
975 = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt));
977 while (stmt);
978 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmtp)) = vect_reduction_def;
981 /* Fixup scalar cycles that now have their stmts detected as patterns. */
983 static void
984 vect_fixup_scalar_cycles_with_patterns (loop_vec_info loop_vinfo)
986 gimple *first;
987 unsigned i;
989 FOR_EACH_VEC_ELT (LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo), i, first)
990 if (STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (first)))
992 gimple *next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (first));
993 while (next)
995 if (! STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (next)))
996 break;
997 next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
999 /* If not all stmts in the chain are patterns, try to handle
1000 the chain without patterns. */
1001 if (! next)
1003 vect_fixup_reduc_chain (first);
1004 LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo)[i]
1005 = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (first));
1010 /* Function vect_get_loop_niters.
1012 Determine how many iterations the loop is executed and place it
1013 in NUMBER_OF_ITERATIONS. Place the number of latch iterations
1014 in NUMBER_OF_ITERATIONSM1. Place the condition under which the
1015 niter information holds in ASSUMPTIONS.
1017 Return the loop exit condition. */
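/* As a rough illustration: for a loop such as "for (i = 0; i < n; i++)"
   with n known to be positive, the latch runs n - 1 times, so
   NUMBER_OF_ITERATIONSM1 is n - 1 and NUMBER_OF_ITERATIONS (the number
   of header executions) is n.  */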
1020 static gcond *
1021 vect_get_loop_niters (struct loop *loop, tree *assumptions,
1022 tree *number_of_iterations, tree *number_of_iterationsm1)
1024 edge exit = single_exit (loop);
1025 struct tree_niter_desc niter_desc;
1026 tree niter_assumptions, niter, may_be_zero;
1027 gcond *cond = get_loop_exit_condition (loop);
1029 *assumptions = boolean_true_node;
1030 *number_of_iterationsm1 = chrec_dont_know;
1031 *number_of_iterations = chrec_dont_know;
1032 if (dump_enabled_p ())
1033 dump_printf_loc (MSG_NOTE, vect_location,
1034 "=== get_loop_niters ===\n");
1036 if (!exit)
1037 return cond;
1039 niter = chrec_dont_know;
1040 may_be_zero = NULL_TREE;
1041 niter_assumptions = boolean_true_node;
1042 if (!number_of_iterations_exit_assumptions (loop, exit, &niter_desc, NULL)
1043 || chrec_contains_undetermined (niter_desc.niter))
1044 return cond;
1046 niter_assumptions = niter_desc.assumptions;
1047 may_be_zero = niter_desc.may_be_zero;
1048 niter = niter_desc.niter;
1050 if (may_be_zero && integer_zerop (may_be_zero))
1051 may_be_zero = NULL_TREE;
1053 if (may_be_zero)
1055 if (COMPARISON_CLASS_P (may_be_zero))
1057 /* Try to combine may_be_zero with assumptions; this can simplify
1058 the computation of the niter expression. */
1059 if (niter_assumptions && !integer_nonzerop (niter_assumptions))
1060 niter_assumptions = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
1061 niter_assumptions,
1062 fold_build1 (TRUTH_NOT_EXPR,
1063 boolean_type_node,
1064 may_be_zero));
1065 else
1066 niter = fold_build3 (COND_EXPR, TREE_TYPE (niter), may_be_zero,
1067 build_int_cst (TREE_TYPE (niter), 0),
1068 rewrite_to_non_trapping_overflow (niter));
1070 may_be_zero = NULL_TREE;
1072 else if (integer_nonzerop (may_be_zero))
1074 *number_of_iterationsm1 = build_int_cst (TREE_TYPE (niter), 0);
1075 *number_of_iterations = build_int_cst (TREE_TYPE (niter), 1);
1076 return cond;
1078 else
1079 return cond;
1082 *assumptions = niter_assumptions;
1083 *number_of_iterationsm1 = niter;
1085 /* We want the number of loop header executions which is the number
1086 of latch executions plus one.
1087 ??? For UINT_MAX latch executions this number overflows to zero
1088 for loops like do { n++; } while (n != 0); */
1089 if (niter && !chrec_contains_undetermined (niter))
1090 niter = fold_build2 (PLUS_EXPR, TREE_TYPE (niter), unshare_expr (niter),
1091 build_int_cst (TREE_TYPE (niter), 1));
1092 *number_of_iterations = niter;
1094 return cond;
1097 /* Function bb_in_loop_p
1099 Used as predicate for dfs order traversal of the loop bbs. */
1101 static bool
1102 bb_in_loop_p (const_basic_block bb, const void *data)
1104 const struct loop *const loop = (const struct loop *)data;
1105 if (flow_bb_inside_loop_p (loop, bb))
1106 return true;
1107 return false;
1111 /* Create and initialize a new loop_vec_info struct for LOOP_IN, as well as
1112 stmt_vec_info structs for all the stmts in LOOP_IN. */
1114 _loop_vec_info::_loop_vec_info (struct loop *loop_in)
1115 : vec_info (vec_info::loop, init_cost (loop_in)),
1116 loop (loop_in),
1117 bbs (XCNEWVEC (basic_block, loop->num_nodes)),
1118 num_itersm1 (NULL_TREE),
1119 num_iters (NULL_TREE),
1120 num_iters_unchanged (NULL_TREE),
1121 num_iters_assumptions (NULL_TREE),
1122 th (0),
1123 versioning_threshold (0),
1124 vectorization_factor (0),
1125 max_vectorization_factor (0),
1126 mask_skip_niters (NULL_TREE),
1127 mask_compare_type (NULL_TREE),
1128 unaligned_dr (NULL),
1129 peeling_for_alignment (0),
1130 ptr_mask (0),
1131 ivexpr_map (NULL),
1132 slp_unrolling_factor (1),
1133 single_scalar_iteration_cost (0),
1134 vectorizable (false),
1135 can_fully_mask_p (true),
1136 fully_masked_p (false),
1137 peeling_for_gaps (false),
1138 peeling_for_niter (false),
1139 operands_swapped (false),
1140 no_data_dependencies (false),
1141 has_mask_store (false),
1142 scalar_loop (NULL),
1143 orig_loop_info (NULL)
1145 /* Create/Update stmt_info for all stmts in the loop. */
1146 basic_block *body = get_loop_body (loop);
1147 for (unsigned int i = 0; i < loop->num_nodes; i++)
1149 basic_block bb = body[i];
1150 gimple_stmt_iterator si;
1152 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
1154 gimple *phi = gsi_stmt (si);
1155 gimple_set_uid (phi, 0);
1156 set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, this));
1159 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1161 gimple *stmt = gsi_stmt (si);
1162 gimple_set_uid (stmt, 0);
1163 set_vinfo_for_stmt (stmt, new_stmt_vec_info (stmt, this));
1166 free (body);
1168 /* CHECKME: We want to visit all BBs before their successors (except for
1169 latch blocks, for which this assertion wouldn't hold). In the simple
1170 case of the loop forms we allow, a dfs order of the BBs would be the same
1171 as a reversed postorder traversal, so we are safe. */
1173 unsigned int nbbs = dfs_enumerate_from (loop->header, 0, bb_in_loop_p,
1174 bbs, loop->num_nodes, loop);
1175 gcc_assert (nbbs == loop->num_nodes);
1178 /* Free all levels of MASKS. */
1180 void
1181 release_vec_loop_masks (vec_loop_masks *masks)
1183 rgroup_masks *rgm;
1184 unsigned int i;
1185 FOR_EACH_VEC_ELT (*masks, i, rgm)
1186 rgm->masks.release ();
1187 masks->release ();
1190 /* Free all memory used by the _loop_vec_info, as well as all the
1191 stmt_vec_info structs of all the stmts in the loop. */
1193 _loop_vec_info::~_loop_vec_info ()
1195 int nbbs;
1196 gimple_stmt_iterator si;
1197 int j;
1199 nbbs = loop->num_nodes;
1200 for (j = 0; j < nbbs; j++)
1202 basic_block bb = bbs[j];
1203 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
1204 free_stmt_vec_info (gsi_stmt (si));
1206 for (si = gsi_start_bb (bb); !gsi_end_p (si); )
1208 gimple *stmt = gsi_stmt (si);
1210 /* We may have broken canonical form by moving a constant
1211 into RHS1 of a commutative op. Fix such occurrences. */
1212 if (operands_swapped && is_gimple_assign (stmt))
1214 enum tree_code code = gimple_assign_rhs_code (stmt);
1216 if ((code == PLUS_EXPR
1217 || code == POINTER_PLUS_EXPR
1218 || code == MULT_EXPR)
1219 && CONSTANT_CLASS_P (gimple_assign_rhs1 (stmt)))
1220 swap_ssa_operands (stmt,
1221 gimple_assign_rhs1_ptr (stmt),
1222 gimple_assign_rhs2_ptr (stmt));
1223 else if (code == COND_EXPR
1224 && CONSTANT_CLASS_P (gimple_assign_rhs2 (stmt)))
1226 tree cond_expr = gimple_assign_rhs1 (stmt);
1227 enum tree_code cond_code = TREE_CODE (cond_expr);
1229 if (TREE_CODE_CLASS (cond_code) == tcc_comparison)
1231 bool honor_nans = HONOR_NANS (TREE_OPERAND (cond_expr,
1232 0));
1233 cond_code = invert_tree_comparison (cond_code,
1234 honor_nans);
1235 if (cond_code != ERROR_MARK)
1237 TREE_SET_CODE (cond_expr, cond_code);
1238 swap_ssa_operands (stmt,
1239 gimple_assign_rhs2_ptr (stmt),
1240 gimple_assign_rhs3_ptr (stmt));
1246 /* Free stmt_vec_info. */
1247 free_stmt_vec_info (stmt);
1248 gsi_next (&si);
1252 free (bbs);
1254 release_vec_loop_masks (&masks);
1255 delete ivexpr_map;
1257 loop->aux = NULL;
1260 /* Return an invariant or register for EXPR and emit necessary
1261 computations in the LOOP_VINFO loop preheader. */
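/* For instance, if several accesses need a loop-invariant expression
   such as n_5 * 4 (a made-up example), the first call gimplifies it on
   the preheader edge and caches the resulting SSA name; later calls for
   the same expression return the cached name.  */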
1263 tree
1264 cse_and_gimplify_to_preheader (loop_vec_info loop_vinfo, tree expr)
1266 if (is_gimple_reg (expr)
1267 || is_gimple_min_invariant (expr))
1268 return expr;
1270 if (! loop_vinfo->ivexpr_map)
1271 loop_vinfo->ivexpr_map = new hash_map<tree_operand_hash, tree>;
1272 tree &cached = loop_vinfo->ivexpr_map->get_or_insert (expr);
1273 if (! cached)
1275 gimple_seq stmts = NULL;
1276 cached = force_gimple_operand (unshare_expr (expr),
1277 &stmts, true, NULL_TREE);
1278 if (stmts)
1280 edge e = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
1281 gsi_insert_seq_on_edge_immediate (e, stmts);
1284 return cached;
1287 /* Return true if we can use CMP_TYPE as the comparison type to produce
1288 all masks required to mask LOOP_VINFO. */
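/* Roughly, WHILE_ULT (I, N) yields a mask whose element K is set iff
   I + K < N, so the question is whether the target can produce such a
   mask of RGM->mask_type from a pair of CMP_TYPE scalars.  */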
1290 static bool
1291 can_produce_all_loop_masks_p (loop_vec_info loop_vinfo, tree cmp_type)
1293 rgroup_masks *rgm;
1294 unsigned int i;
1295 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), i, rgm)
1296 if (rgm->mask_type != NULL_TREE
1297 && !direct_internal_fn_supported_p (IFN_WHILE_ULT,
1298 cmp_type, rgm->mask_type,
1299 OPTIMIZE_FOR_SPEED))
1300 return false;
1301 return true;
1304 /* Calculate the maximum number of scalars per iteration for every
1305 rgroup in LOOP_VINFO. */
1307 static unsigned int
1308 vect_get_max_nscalars_per_iter (loop_vec_info loop_vinfo)
1310 unsigned int res = 1;
1311 unsigned int i;
1312 rgroup_masks *rgm;
1313 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), i, rgm)
1314 res = MAX (res, rgm->max_nscalars_per_iter);
1315 return res;
1318 /* Each statement in LOOP_VINFO can be masked where necessary. Check
1319 whether we can actually generate the masks required. Return true if so,
1320 storing the type of the scalar IV in LOOP_VINFO_MASK_COMPARE_TYPE. */
1322 static bool
1323 vect_verify_full_masking (loop_vec_info loop_vinfo)
1325 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1326 unsigned int min_ni_width;
1328 /* Use a normal loop if there are no statements that need masking.
1329 This only happens in rare degenerate cases: it means that the loop
1330 has no loads, no stores, and no live-out values. */
1331 if (LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
1332 return false;
1334 /* Get the maximum number of iterations that is representable
1335 in the counter type. */
1336 tree ni_type = TREE_TYPE (LOOP_VINFO_NITERSM1 (loop_vinfo));
1337 widest_int max_ni = wi::to_widest (TYPE_MAX_VALUE (ni_type)) + 1;
1339 /* Get a more refined estimate for the number of iterations. */
1340 widest_int max_back_edges;
1341 if (max_loop_iterations (loop, &max_back_edges))
1342 max_ni = wi::smin (max_ni, max_back_edges + 1);
1344 /* Account for rgroup masks, in which each bit is replicated N times. */
1345 max_ni *= vect_get_max_nscalars_per_iter (loop_vinfo);
1347 /* Work out how many bits we need to represent the limit. */
1348 min_ni_width = wi::min_precision (max_ni, UNSIGNED);
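  /* For instance, if the loop is known to run at most 1000 iterations
     and each rgroup mask bit is replicated at most twice, max_ni is
     2000 and min_ni_width is 11 (since 2^11 = 2048 >= 2000).  */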
1350 /* Find a scalar mode for which WHILE_ULT is supported. */
1351 opt_scalar_int_mode cmp_mode_iter;
1352 tree cmp_type = NULL_TREE;
1353 FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)
1355 unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ());
1356 if (cmp_bits >= min_ni_width
1357 && targetm.scalar_mode_supported_p (cmp_mode_iter.require ()))
1359 tree this_type = build_nonstandard_integer_type (cmp_bits, true);
1360 if (this_type
1361 && can_produce_all_loop_masks_p (loop_vinfo, this_type))
1363 /* Although we could stop as soon as we find a valid mode,
1364 it's often better to continue until we hit Pmode, since the
1365 operands to the WHILE are more likely to be reusable in
1366 address calculations. */
1367 cmp_type = this_type;
1368 if (cmp_bits >= GET_MODE_BITSIZE (Pmode))
1369 break;
1374 if (!cmp_type)
1375 return false;
1377 LOOP_VINFO_MASK_COMPARE_TYPE (loop_vinfo) = cmp_type;
1378 return true;
1381 /* Calculate the cost of one scalar iteration of the loop. */
1382 static void
1383 vect_compute_single_scalar_iteration_cost (loop_vec_info loop_vinfo)
1385 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1386 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1387 int nbbs = loop->num_nodes, factor;
1388 int innerloop_iters, i;
1390 /* Gather costs for statements in the scalar loop. */
1392 /* FORNOW. */
1393 innerloop_iters = 1;
1394 if (loop->inner)
1395 innerloop_iters = 50; /* FIXME */
1397 for (i = 0; i < nbbs; i++)
1399 gimple_stmt_iterator si;
1400 basic_block bb = bbs[i];
1402 if (bb->loop_father == loop->inner)
1403 factor = innerloop_iters;
1404 else
1405 factor = 1;
1407 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1409 gimple *stmt = gsi_stmt (si);
1410 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1412 if (!is_gimple_assign (stmt) && !is_gimple_call (stmt))
1413 continue;
1415 /* Skip stmts that are not vectorized inside the loop. */
1416 if (stmt_info
1417 && !STMT_VINFO_RELEVANT_P (stmt_info)
1418 && (!STMT_VINFO_LIVE_P (stmt_info)
1419 || !VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1420 && !STMT_VINFO_IN_PATTERN_P (stmt_info))
1421 continue;
1423 vect_cost_for_stmt kind;
1424 if (STMT_VINFO_DATA_REF (stmt_info))
1426 if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
1427 kind = scalar_load;
1428 else
1429 kind = scalar_store;
1431 else
1432 kind = scalar_stmt;
1434 record_stmt_cost (&LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1435 factor, kind, stmt_info, 0, vect_prologue);
1439 /* Now accumulate cost. */
1440 void *target_cost_data = init_cost (loop);
1441 stmt_info_for_cost *si;
1442 int j;
1443 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1444 j, si)
1446 struct _stmt_vec_info *stmt_info
1447 = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
1448 (void) add_stmt_cost (target_cost_data, si->count,
1449 si->kind, stmt_info, si->misalign,
1450 vect_body);
1452 unsigned dummy, body_cost = 0;
1453 finish_cost (target_cost_data, &dummy, &body_cost, &dummy);
1454 destroy_cost_data (target_cost_data);
1455 LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo) = body_cost;
1459 /* Function vect_analyze_loop_form_1.
1461 Verify that certain CFG restrictions hold, including:
1462 - the loop has a pre-header
1463 - the loop has a single entry and exit
1464 - the loop exit condition is simple enough
1465 - the number of iterations can be analyzed, i.e., a countable loop. The
1466 niter could be analyzed under some assumptions. */
1468 bool
1469 vect_analyze_loop_form_1 (struct loop *loop, gcond **loop_cond,
1470 tree *assumptions, tree *number_of_iterationsm1,
1471 tree *number_of_iterations, gcond **inner_loop_cond)
1473 if (dump_enabled_p ())
1474 dump_printf_loc (MSG_NOTE, vect_location,
1475 "=== vect_analyze_loop_form ===\n");
1477 /* Different restrictions apply when we are considering an inner-most loop,
1478 vs. an outer (nested) loop.
1479 (FORNOW. May want to relax some of these restrictions in the future). */
1481 if (!loop->inner)
1483 /* Inner-most loop. We currently require that the number of BBs is
1484 exactly 2 (the header and latch). Vectorizable inner-most loops
1485 look like this:
1487 (pre-header)
1489 header <--------+
1490 | | |
1491 | +--> latch --+
1493 (exit-bb) */
1495 if (loop->num_nodes != 2)
1497 if (dump_enabled_p ())
1498 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1499 "not vectorized: control flow in loop.\n");
1500 return false;
1503 if (empty_block_p (loop->header))
1505 if (dump_enabled_p ())
1506 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1507 "not vectorized: empty loop.\n");
1508 return false;
1511 else
1513 struct loop *innerloop = loop->inner;
1514 edge entryedge;
1516 /* Nested loop. We currently require that the loop is doubly-nested,
1517 contains a single inner loop, and the number of BBs is exactly 5.
1518 Vectorizable outer-loops look like this:
1520 (pre-header)
1522 header <---+
1524 inner-loop |
1526 tail ------+
1528 (exit-bb)
1530 The inner-loop has the properties expected of inner-most loops
1531 as described above. */
1533 if ((loop->inner)->inner || (loop->inner)->next)
1535 if (dump_enabled_p ())
1536 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1537 "not vectorized: multiple nested loops.\n");
1538 return false;
1541 if (loop->num_nodes != 5)
1543 if (dump_enabled_p ())
1544 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1545 "not vectorized: control flow in loop.\n");
1546 return false;
1549 entryedge = loop_preheader_edge (innerloop);
1550 if (entryedge->src != loop->header
1551 || !single_exit (innerloop)
1552 || single_exit (innerloop)->dest != EDGE_PRED (loop->latch, 0)->src)
1554 if (dump_enabled_p ())
1555 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1556 "not vectorized: unsupported outerloop form.\n");
1557 return false;
1560 /* Analyze the inner-loop. */
1561 tree inner_niterm1, inner_niter, inner_assumptions;
1562 if (! vect_analyze_loop_form_1 (loop->inner, inner_loop_cond,
1563 &inner_assumptions, &inner_niterm1,
1564 &inner_niter, NULL)
1565 /* Don't support analyzing niter under assumptions for inner
1566 loop. */
1567 || !integer_onep (inner_assumptions))
1569 if (dump_enabled_p ())
1570 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1571 "not vectorized: Bad inner loop.\n");
1572 return false;
1575 if (!expr_invariant_in_loop_p (loop, inner_niter))
1577 if (dump_enabled_p ())
1578 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1579 "not vectorized: inner-loop count not"
1580 " invariant.\n");
1581 return false;
1584 if (dump_enabled_p ())
1585 dump_printf_loc (MSG_NOTE, vect_location,
1586 "Considering outer-loop vectorization.\n");
1589 if (!single_exit (loop)
1590 || EDGE_COUNT (loop->header->preds) != 2)
1592 if (dump_enabled_p ())
1594 if (!single_exit (loop))
1595 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1596 "not vectorized: multiple exits.\n");
1597 else if (EDGE_COUNT (loop->header->preds) != 2)
1598 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1599 "not vectorized: too many incoming edges.\n");
1601 return false;
1604 /* We assume that the loop exit condition is at the end of the loop, i.e.,
1605 that the loop is represented as a do-while (with a proper if-guard
1606 before the loop if needed), where the loop header contains all the
1607 executable statements, and the latch is empty. */
1608 if (!empty_block_p (loop->latch)
1609 || !gimple_seq_empty_p (phi_nodes (loop->latch)))
1611 if (dump_enabled_p ())
1612 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1613 "not vectorized: latch block not empty.\n");
1614 return false;
1617 /* Make sure the exit is not abnormal. */
1618 edge e = single_exit (loop);
1619 if (e->flags & EDGE_ABNORMAL)
1621 if (dump_enabled_p ())
1622 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1623 "not vectorized: abnormal loop exit edge.\n");
1624 return false;
1627 *loop_cond = vect_get_loop_niters (loop, assumptions, number_of_iterations,
1628 number_of_iterationsm1);
1629 if (!*loop_cond)
1631 if (dump_enabled_p ())
1632 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1633 "not vectorized: complicated exit condition.\n");
1634 return false;
1637 if (integer_zerop (*assumptions)
1638 || !*number_of_iterations
1639 || chrec_contains_undetermined (*number_of_iterations))
1641 if (dump_enabled_p ())
1642 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1643 "not vectorized: number of iterations cannot be "
1644 "computed.\n");
1645 return false;
1648 if (integer_zerop (*number_of_iterations))
1650 if (dump_enabled_p ())
1651 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1652 "not vectorized: number of iterations = 0.\n");
1653 return false;
1656 return true;
1659 /* Analyze LOOP form and return a loop_vec_info if it is of suitable form. */
1661 loop_vec_info
1662 vect_analyze_loop_form (struct loop *loop)
1664 tree assumptions, number_of_iterations, number_of_iterationsm1;
1665 gcond *loop_cond, *inner_loop_cond = NULL;
1667 if (! vect_analyze_loop_form_1 (loop, &loop_cond,
1668 &assumptions, &number_of_iterationsm1,
1669 &number_of_iterations, &inner_loop_cond))
1670 return NULL;
1672 loop_vec_info loop_vinfo = new _loop_vec_info (loop);
1673 LOOP_VINFO_NITERSM1 (loop_vinfo) = number_of_iterationsm1;
1674 LOOP_VINFO_NITERS (loop_vinfo) = number_of_iterations;
1675 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = number_of_iterations;
1676 if (!integer_onep (assumptions))
1678 /* We consider vectorizing this loop by versioning it under
1679 some assumptions. In order to do this, we need to clear
1680 existing information computed by scev and niter analyzer. */
1681 scev_reset_htab ();
1682 free_numbers_of_iterations_estimates (loop);
1683 /* Also set a flag for this loop so that the following scev and niter
1684 analyses are done under the assumptions. */
1685 loop_constraint_set (loop, LOOP_C_FINITE);
1686 /* Also record the assumptions for versioning. */
1687 LOOP_VINFO_NITERS_ASSUMPTIONS (loop_vinfo) = assumptions;
1690 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1692 if (dump_enabled_p ())
1694 dump_printf_loc (MSG_NOTE, vect_location,
1695 "Symbolic number of iterations is ");
1696 dump_generic_expr (MSG_NOTE, TDF_DETAILS, number_of_iterations);
1697 dump_printf (MSG_NOTE, "\n");
1701 STMT_VINFO_TYPE (vinfo_for_stmt (loop_cond)) = loop_exit_ctrl_vec_info_type;
1702 if (inner_loop_cond)
1703 STMT_VINFO_TYPE (vinfo_for_stmt (inner_loop_cond))
1704 = loop_exit_ctrl_vec_info_type;
1706 gcc_assert (!loop->aux);
1707 loop->aux = loop_vinfo;
1708 return loop_vinfo;
1713 /* Scan the loop stmts and, depending on whether there are any (non-)SLP
1714 statements, update the vectorization factor. */
1716 static void
1717 vect_update_vf_for_slp (loop_vec_info loop_vinfo)
1719 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1720 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1721 int nbbs = loop->num_nodes;
1722 poly_uint64 vectorization_factor;
1723 int i;
1725 if (dump_enabled_p ())
1726 dump_printf_loc (MSG_NOTE, vect_location,
1727 "=== vect_update_vf_for_slp ===\n");
1729 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1730 gcc_assert (known_ne (vectorization_factor, 0U));
1732 /* If all the stmts in the loop can be SLPed, we perform only SLP, and
1733 the vectorization factor of the loop is the unrolling factor required by
1734 the SLP instances. If that unrolling factor is 1, we say that we
1735 perform pure SLP on the loop - cross-iteration parallelism is not
1736 exploited. */
1737 bool only_slp_in_loop = true;
1738 for (i = 0; i < nbbs; i++)
1740 basic_block bb = bbs[i];
1741 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1742 gsi_next (&si))
1744 gimple *stmt = gsi_stmt (si);
1745 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1746 if (STMT_VINFO_IN_PATTERN_P (stmt_info)
1747 && STMT_VINFO_RELATED_STMT (stmt_info))
1749 stmt = STMT_VINFO_RELATED_STMT (stmt_info);
1750 stmt_info = vinfo_for_stmt (stmt);
1752 if ((STMT_VINFO_RELEVANT_P (stmt_info)
1753 || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1754 && !PURE_SLP_STMT (stmt_info))
1755 /* STMT needs both SLP and loop-based vectorization. */
1756 only_slp_in_loop = false;
1760 if (only_slp_in_loop)
1762 dump_printf_loc (MSG_NOTE, vect_location,
1763 "Loop contains only SLP stmts\n");
1764 vectorization_factor = LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo);
1766 else
1768 dump_printf_loc (MSG_NOTE, vect_location,
1769 "Loop contains SLP and non-SLP stmts\n");
1770 /* Both the vectorization factor and unroll factor have the form
1771 current_vector_size * X for some rational X, so they must have
1772 a common multiple. */
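	 /* For instance, a vectorization factor of 4 and an SLP unrolling
	    factor of 6 would be combined into 12, a common multiple of
	    both.  */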
1773 vectorization_factor
1774 = force_common_multiple (vectorization_factor,
1775 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo));
1778 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
1779 if (dump_enabled_p ())
1781 dump_printf_loc (MSG_NOTE, vect_location,
1782 "Updating vectorization factor to ");
1783 dump_dec (MSG_NOTE, vectorization_factor);
1784 dump_printf (MSG_NOTE, ".\n");
1788 /* Return true if STMT_INFO describes a double reduction phi and if
1789 the other phi in the reduction is also relevant for vectorization.
1790 This rejects cases such as:
1792 outer1:
1793 x_1 = PHI <x_3(outer2), ...>;
1796 inner:
1797 x_2 = ...;
1800 outer2:
1801 x_3 = PHI <x_2(inner)>;
1803 if nothing in x_2 or elsewhere makes x_1 relevant. */
1805 static bool
1806 vect_active_double_reduction_p (stmt_vec_info stmt_info)
1808 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
1809 return false;
1811 gimple *other_phi = STMT_VINFO_REDUC_DEF (stmt_info);
1812 return STMT_VINFO_RELEVANT_P (vinfo_for_stmt (other_phi));
1815 /* Function vect_analyze_loop_operations.
1817 Scan the loop stmts and make sure they are all vectorizable. */
1819 static bool
1820 vect_analyze_loop_operations (loop_vec_info loop_vinfo)
1822 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1823 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1824 int nbbs = loop->num_nodes;
1825 int i;
1826 stmt_vec_info stmt_info;
1827 bool need_to_vectorize = false;
1828 bool ok;
1830 if (dump_enabled_p ())
1831 dump_printf_loc (MSG_NOTE, vect_location,
1832 "=== vect_analyze_loop_operations ===\n");
1834 for (i = 0; i < nbbs; i++)
1836 basic_block bb = bbs[i];
1838 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
1839 gsi_next (&si))
1841 gphi *phi = si.phi ();
1842 ok = true;
1844 stmt_info = vinfo_for_stmt (phi);
1845 if (dump_enabled_p ())
1847 dump_printf_loc (MSG_NOTE, vect_location, "examining phi: ");
1848 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
1850 if (virtual_operand_p (gimple_phi_result (phi)))
1851 continue;
1853 /* Inner-loop loop-closed exit phi in outer-loop vectorization
1854 (i.e., a phi in the tail of the outer-loop). */
1855 if (! is_loop_header_bb_p (bb))
1857 /* FORNOW: we currently don't support the case that these phis
1858 are not used in the outerloop (unless it is double reduction,
1859 i.e., this phi is vect_reduction_def), because this case
1860 requires us to actually do something here. */
1861 if (STMT_VINFO_LIVE_P (stmt_info)
1862 && !vect_active_double_reduction_p (stmt_info))
1864 if (dump_enabled_p ())
1865 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1866 "Unsupported loop-closed phi in "
1867 "outer-loop.\n");
1868 return false;
1871 /* If PHI is used in the outer loop, we check that its operand
1872 is defined in the inner loop. */
1873 if (STMT_VINFO_RELEVANT_P (stmt_info))
1875 tree phi_op;
1876 gimple *op_def_stmt;
1878 if (gimple_phi_num_args (phi) != 1)
1879 return false;
1881 phi_op = PHI_ARG_DEF (phi, 0);
1882 if (TREE_CODE (phi_op) != SSA_NAME)
1883 return false;
1885 op_def_stmt = SSA_NAME_DEF_STMT (phi_op);
1886 if (gimple_nop_p (op_def_stmt)
1887 || !flow_bb_inside_loop_p (loop, gimple_bb (op_def_stmt))
1888 || !vinfo_for_stmt (op_def_stmt))
1889 return false;
1891 if (STMT_VINFO_RELEVANT (vinfo_for_stmt (op_def_stmt))
1892 != vect_used_in_outer
1893 && STMT_VINFO_RELEVANT (vinfo_for_stmt (op_def_stmt))
1894 != vect_used_in_outer_by_reduction)
1895 return false;
1898 continue;
1901 gcc_assert (stmt_info);
1903 if ((STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope
1904 || STMT_VINFO_LIVE_P (stmt_info))
1905 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
1907 /* A scalar-dependence cycle that we don't support. */
1908 if (dump_enabled_p ())
1909 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1910 "not vectorized: scalar dependence cycle.\n");
1911 return false;
1914 if (STMT_VINFO_RELEVANT_P (stmt_info))
1916 need_to_vectorize = true;
1917 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
1918 && ! PURE_SLP_STMT (stmt_info))
1919 ok = vectorizable_induction (phi, NULL, NULL, NULL);
1920 else if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
1921 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
1922 && ! PURE_SLP_STMT (stmt_info))
1923 ok = vectorizable_reduction (phi, NULL, NULL, NULL, NULL);
1926 /* SLP PHIs are tested by vect_slp_analyze_node_operations. */
1927 if (ok
1928 && STMT_VINFO_LIVE_P (stmt_info)
1929 && !PURE_SLP_STMT (stmt_info))
1930 ok = vectorizable_live_operation (phi, NULL, NULL, -1, NULL);
1932 if (!ok)
1934 if (dump_enabled_p ())
1936 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1937 "not vectorized: relevant phi not "
1938 "supported: ");
1939 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, phi, 0);
1941 return false;
1945 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1946 gsi_next (&si))
1948 gimple *stmt = gsi_stmt (si);
1949 if (!gimple_clobber_p (stmt)
1950 && !vect_analyze_stmt (stmt, &need_to_vectorize, NULL, NULL))
1951 return false;
1953 } /* bbs */
1955 /* All operations in the loop are either irrelevant (they deal with
1956 loop control, or are dead), or are used only outside the loop and
1957 can be moved out of it (e.g. invariants, inductions). The loop can
1958 be optimized away by scalar optimizations. We're better off not
1959 touching this loop. */
1960 if (!need_to_vectorize)
1962 if (dump_enabled_p ())
1963 dump_printf_loc (MSG_NOTE, vect_location,
1964 "All the computation can be taken out of the loop.\n");
1965 if (dump_enabled_p ())
1966 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1967 "not vectorized: redundant loop. no profit to "
1968 "vectorize.\n");
1969 return false;
1972 return true;
1975 /* Analyze the cost of the loop described by LOOP_VINFO. Decide if it
1976 is worthwhile to vectorize. Return 1 if definitely yes, 0 if
1977 definitely no, or -1 if it's worth retrying. */
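   /* Hedged usage sketch, mirroring the caller in vect_analyze_loop_2
      further below:

	int res = vect_analyze_loop_costing (loop_vinfo);
	if (res < 0)
	  goto again;      (retry, e.g. with SLP disabled)
	if (!res)
	  return false;    (definitely not profitable)
	... res == 1: profitable, continue the analysis ...  */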
1979 static int
1980 vect_analyze_loop_costing (loop_vec_info loop_vinfo)
1982 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1983 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
1985 /* Only fully-masked loops can have iteration counts less than the
1986 vectorization factor. */
1987 if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
1989 HOST_WIDE_INT max_niter;
1991 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1992 max_niter = LOOP_VINFO_INT_NITERS (loop_vinfo);
1993 else
1994 max_niter = max_stmt_executions_int (loop);
1996 if (max_niter != -1
1997 && (unsigned HOST_WIDE_INT) max_niter < assumed_vf)
1999 if (dump_enabled_p ())
2000 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2001 "not vectorized: iteration count smaller than "
2002 "vectorization factor.\n");
2003 return 0;
2007 int min_profitable_iters, min_profitable_estimate;
2008 vect_estimate_min_profitable_iters (loop_vinfo, &min_profitable_iters,
2009 &min_profitable_estimate);
2011 if (min_profitable_iters < 0)
2013 if (dump_enabled_p ())
2014 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2015 "not vectorized: vectorization not profitable.\n");
2016 if (dump_enabled_p ())
2017 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2018 "not vectorized: vector version will never be "
2019 "profitable.\n");
2020 return -1;
2023 int min_scalar_loop_bound = (PARAM_VALUE (PARAM_MIN_VECT_LOOP_BOUND)
2024 * assumed_vf);
2026 /* Use the cost model only if it is more conservative than the
2027 user-specified threshold. */
2028 unsigned int th = (unsigned) MAX (min_scalar_loop_bound,
2029 min_profitable_iters);
2031 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = th;
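   /* Worked example with illustrative numbers: if
      PARAM_MIN_VECT_LOOP_BOUND is 0, assumed_vf is 4 and
      min_profitable_iters is 7, then th = MAX (0 * 4, 7) = 7, and loops
      known to run fewer than 7 iterations are rejected just below.  */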
2033 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2034 && LOOP_VINFO_INT_NITERS (loop_vinfo) < th)
2036 if (dump_enabled_p ())
2037 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2038 "not vectorized: vectorization not profitable.\n");
2039 if (dump_enabled_p ())
2040 dump_printf_loc (MSG_NOTE, vect_location,
2041 "not vectorized: iteration count smaller than user "
2042 "specified loop bound parameter or minimum profitable "
2043 "iterations (whichever is more conservative).\n");
2044 return 0;
2047 HOST_WIDE_INT estimated_niter = estimated_stmt_executions_int (loop);
2048 if (estimated_niter == -1)
2049 estimated_niter = likely_max_stmt_executions_int (loop);
2050 if (estimated_niter != -1
2051 && ((unsigned HOST_WIDE_INT) estimated_niter
2052 < MAX (th, (unsigned) min_profitable_estimate)))
2054 if (dump_enabled_p ())
2055 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2056 "not vectorized: estimated iteration count too "
2057 "small.\n");
2058 if (dump_enabled_p ())
2059 dump_printf_loc (MSG_NOTE, vect_location,
2060 "not vectorized: estimated iteration count smaller "
2061 "than specified loop bound parameter or minimum "
2062 "profitable iterations (whichever is more "
2063 "conservative).\n");
2064 return -1;
2067 return 1;
2071 /* Function vect_analyze_loop_2.
2073 Apply a set of analyses on LOOP, and create a loop_vec_info struct
2074 for it. The different analyses will record information in the
2075 loop_vec_info struct. */
2076 static bool
2077 vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal)
2079 bool ok;
2080 int res;
2081 unsigned int max_vf = MAX_VECTORIZATION_FACTOR;
2082 poly_uint64 min_vf = 2;
2083 unsigned int n_stmts = 0;
2085 /* The first group of checks is independent of the vector size. */
2086 fatal = true;
2088 /* Find all data references in the loop (which correspond to vdefs/vuses)
2089 and analyze their evolution in the loop. */
2091 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
2093 loop_p loop = LOOP_VINFO_LOOP (loop_vinfo);
2094 if (!find_loop_nest (loop, &LOOP_VINFO_LOOP_NEST (loop_vinfo)))
2096 if (dump_enabled_p ())
2097 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2098 "not vectorized: loop nest containing two "
2099 "or more consecutive inner loops cannot be "
2100 "vectorized\n");
2101 return false;
2104 for (unsigned i = 0; i < loop->num_nodes; i++)
2105 for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
2106 !gsi_end_p (gsi); gsi_next (&gsi))
2108 gimple *stmt = gsi_stmt (gsi);
2109 if (is_gimple_debug (stmt))
2110 continue;
2111 ++n_stmts;
2112 if (!find_data_references_in_stmt (loop, stmt,
2113 &LOOP_VINFO_DATAREFS (loop_vinfo)))
2115 if (is_gimple_call (stmt) && loop->safelen)
2117 tree fndecl = gimple_call_fndecl (stmt), op;
2118 if (fndecl != NULL_TREE)
2120 cgraph_node *node = cgraph_node::get (fndecl);
2121 if (node != NULL && node->simd_clones != NULL)
2123 unsigned int j, n = gimple_call_num_args (stmt);
2124 for (j = 0; j < n; j++)
2126 op = gimple_call_arg (stmt, j);
2127 if (DECL_P (op)
2128 || (REFERENCE_CLASS_P (op)
2129 && get_base_address (op)))
2130 break;
2132 op = gimple_call_lhs (stmt);
2133 /* Ignore #pragma omp declare simd functions
2134 if they don't have data references in the
2135 call stmt itself. */
2136 if (j == n
2137 && !(op
2138 && (DECL_P (op)
2139 || (REFERENCE_CLASS_P (op)
2140 && get_base_address (op)))))
2141 continue;
2145 if (dump_enabled_p ())
2146 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2147 "not vectorized: loop contains function "
2148 "calls or data references that cannot "
2149 "be analyzed\n");
2150 return false;
2154 /* Analyze the data references and also adjust the minimal
2155 vectorization factor according to the loads and stores. */
2157 ok = vect_analyze_data_refs (loop_vinfo, &min_vf);
2158 if (!ok)
2160 if (dump_enabled_p ())
2161 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2162 "bad data references.\n");
2163 return false;
2166 /* Classify all cross-iteration scalar data-flow cycles.
2167 Cross-iteration cycles caused by virtual phis are analyzed separately. */
2168 vect_analyze_scalar_cycles (loop_vinfo);
2170 vect_pattern_recog (loop_vinfo);
2172 vect_fixup_scalar_cycles_with_patterns (loop_vinfo);
2174 /* Analyze the access patterns of the data-refs in the loop (consecutive,
2175 complex, etc.). FORNOW: Only handle consecutive access pattern. */
2177 ok = vect_analyze_data_ref_accesses (loop_vinfo);
2178 if (!ok)
2180 if (dump_enabled_p ())
2181 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2182 "bad data access.\n");
2183 return false;
2186 /* Data-flow analysis to detect stmts that do not need to be vectorized. */
2188 ok = vect_mark_stmts_to_be_vectorized (loop_vinfo);
2189 if (!ok)
2191 if (dump_enabled_p ())
2192 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2193 "unexpected pattern.\n");
2194 return false;
2197 /* The rest of the analysis below depends on the vector size in some way, so from here on a failure is not fatal. */
2198 fatal = false;
2200 /* Analyze data dependences between the data-refs in the loop
2201 and adjust the maximum vectorization factor according to
2202 the dependences.
2203 FORNOW: fail at the first data dependence that we encounter. */
2205 ok = vect_analyze_data_ref_dependences (loop_vinfo, &max_vf);
2206 if (!ok
2207 || (max_vf != MAX_VECTORIZATION_FACTOR
2208 && maybe_lt (max_vf, min_vf)))
2210 if (dump_enabled_p ())
2211 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2212 "bad data dependence.\n");
2213 return false;
2215 LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo) = max_vf;
2217 ok = vect_determine_vectorization_factor (loop_vinfo);
2218 if (!ok)
2220 if (dump_enabled_p ())
2221 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2222 "can't determine vectorization factor.\n");
2223 return false;
2225 if (max_vf != MAX_VECTORIZATION_FACTOR
2226 && maybe_lt (max_vf, LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
2228 if (dump_enabled_p ())
2229 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2230 "bad data dependence.\n");
2231 return false;
2234 /* Compute the scalar iteration cost. */
2235 vect_compute_single_scalar_iteration_cost (loop_vinfo);
2237 poly_uint64 saved_vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2238 unsigned th;
2240 /* Check the SLP opportunities in the loop, analyze and build SLP trees. */
2241 ok = vect_analyze_slp (loop_vinfo, n_stmts);
2242 if (!ok)
2243 return false;
2245 /* If there are any SLP instances mark them as pure_slp. */
2246 bool slp = vect_make_slp_decision (loop_vinfo);
2247 if (slp)
2249 /* Find stmts that need to be both vectorized and SLPed. */
2250 vect_detect_hybrid_slp (loop_vinfo);
2252 /* Update the vectorization factor based on the SLP decision. */
2253 vect_update_vf_for_slp (loop_vinfo);
2256 bool saved_can_fully_mask_p = LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo);
2258 /* We don't expect to have to roll back to anything other than an empty
2259 set of rgroups. */
2260 gcc_assert (LOOP_VINFO_MASKS (loop_vinfo).is_empty ());
2262 /* This is the point where we can re-start analysis with SLP forced off. */
2263 start_over:
2265 /* Now the vectorization factor is final. */
2266 poly_uint64 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2267 gcc_assert (known_ne (vectorization_factor, 0U));
2269 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && dump_enabled_p ())
2271 dump_printf_loc (MSG_NOTE, vect_location,
2272 "vectorization_factor = ");
2273 dump_dec (MSG_NOTE, vectorization_factor);
2274 dump_printf (MSG_NOTE, ", niters = " HOST_WIDE_INT_PRINT_DEC "\n",
2275 LOOP_VINFO_INT_NITERS (loop_vinfo));
2278 HOST_WIDE_INT max_niter
2279 = likely_max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
2281 /* Analyze the alignment of the data-refs in the loop.
2282 Fail if a data reference is found that cannot be vectorized. */
2284 ok = vect_analyze_data_refs_alignment (loop_vinfo);
2285 if (!ok)
2287 if (dump_enabled_p ())
2288 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2289 "bad data alignment.\n");
2290 return false;
2293 /* Prune the list of ddrs to be tested at run-time by versioning for alias.
2294 It is important to call pruning after vect_analyze_data_ref_accesses,
2295 since we use grouping information gathered by interleaving analysis. */
2296 ok = vect_prune_runtime_alias_test_list (loop_vinfo);
2297 if (!ok)
2298 return false;
2300 /* Do not invoke vect_enhance_data_refs_alignment for epilogue
2301 vectorization. */
2302 if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2304 /* This pass will decide on using loop versioning and/or loop peeling in
2305 order to enhance the alignment of data references in the loop. */
2306 ok = vect_enhance_data_refs_alignment (loop_vinfo);
2307 if (!ok)
2309 if (dump_enabled_p ())
2310 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2311 "bad data alignment.\n");
2312 return false;
2316 if (slp)
2318 /* Analyze operations in the SLP instances. Note this may
2319 remove unsupported SLP instances which makes the above
2320 SLP kind detection invalid. */
2321 unsigned old_size = LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length ();
2322 vect_slp_analyze_operations (loop_vinfo);
2323 if (LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length () != old_size)
2324 goto again;
2327 /* Scan all the remaining operations in the loop that are not subject
2328 to SLP and make sure they are vectorizable. */
2329 ok = vect_analyze_loop_operations (loop_vinfo);
2330 if (!ok)
2332 if (dump_enabled_p ())
2333 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2334 "bad operation or unsupported loop bound.\n");
2335 return false;
2338 /* Decide whether to use a fully-masked loop for this vectorization
2339 factor. */
2340 LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
2341 = (LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo)
2342 && vect_verify_full_masking (loop_vinfo));
2343 if (dump_enabled_p ())
2345 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
2346 dump_printf_loc (MSG_NOTE, vect_location,
2347 "using a fully-masked loop.\n");
2348 else
2349 dump_printf_loc (MSG_NOTE, vect_location,
2350 "not using a fully-masked loop.\n");
2353 /* If an epilog loop is required because of data accesses with gaps,
2354 one additional iteration needs to be peeled. Check if there are
2355 enough iterations for vectorization. */
2356 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2357 && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2358 && !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
2360 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2361 tree scalar_niters = LOOP_VINFO_NITERSM1 (loop_vinfo);
2363 if (known_lt (wi::to_widest (scalar_niters), vf))
2365 if (dump_enabled_p ())
2366 dump_printf_loc (MSG_NOTE, vect_location,
2367 "loop has no enough iterations to support"
2368 " peeling for gaps.\n");
2369 return false;
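	  /* For example (illustrative numbers): the check above rejects
	     VF == 4 with a known iteration count of 4, since
	     NITERSM1 == 3 < 4; at least VF + 1 scalar iterations are
	     needed so that one iteration remains to be peeled for the
	     gap.  */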
2373 /* Check that the loop costing makes vectorizing worthwhile. */
2374 res = vect_analyze_loop_costing (loop_vinfo);
2375 if (res < 0)
2376 goto again;
2377 if (!res)
2379 if (dump_enabled_p ())
2380 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2381 "Loop costings not worthwhile.\n");
2382 return false;
2385 /* Decide whether we need to create an epilogue loop to handle
2386 remaining scalar iterations. */
2387 th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
2389 unsigned HOST_WIDE_INT const_vf;
2390 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
2391 /* The main loop handles all iterations. */
2392 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
2393 else if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2394 && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) > 0)
2396 if (!multiple_p (LOOP_VINFO_INT_NITERS (loop_vinfo)
2397 - LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo),
2398 LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
2399 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
2401 else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
2402 || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&const_vf)
2403 || ((tree_ctz (LOOP_VINFO_NITERS (loop_vinfo))
2404 < (unsigned) exact_log2 (const_vf))
2405 /* In case of versioning, check if the maximum number of
2406 iterations is greater than th. If they are identical,
2407 the epilogue is unnecessary. */
2408 && (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
2409 || ((unsigned HOST_WIDE_INT) max_niter
2410 > (th / const_vf) * const_vf))))
2411 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
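  /* Worked example (illustrative numbers): with a constant VF of 8 and
     no peeling for alignment, a loop whose NITERS expression is known to
     be a multiple of 16 has tree_ctz (NITERS) >= 4 >= log2 (8), so no
     epilogue is needed from this test; if NITERS were only known to be a
     multiple of 4, PEELING_FOR_NITER would be set instead, subject to
     the versioning check in the same condition.  */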
2413 /* If an epilogue loop is required make sure we can create one. */
2414 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2415 || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo))
2417 if (dump_enabled_p ())
2418 dump_printf_loc (MSG_NOTE, vect_location, "epilog loop required\n");
2419 if (!vect_can_advance_ivs_p (loop_vinfo)
2420 || !slpeel_can_duplicate_loop_p (LOOP_VINFO_LOOP (loop_vinfo),
2421 single_exit (LOOP_VINFO_LOOP
2422 (loop_vinfo))))
2424 if (dump_enabled_p ())
2425 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2426 "not vectorized: can't create required "
2427 "epilog loop\n");
2428 goto again;
2432 /* During peeling, we need to check that the number of loop iterations
2433 is enough for both the peeled prolog loop and the vector loop. This
2434 check can be merged with the threshold check of loop versioning, so
2435 increase the threshold for this case if necessary. */
2436 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
2438 poly_uint64 niters_th = 0;
2440 if (!vect_use_loop_mask_for_alignment_p (loop_vinfo))
2442 /* Niters for peeled prolog loop. */
2443 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
2445 struct data_reference *dr = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
2446 tree vectype
2447 = STMT_VINFO_VECTYPE (vinfo_for_stmt (DR_STMT (dr)));
2448 niters_th += TYPE_VECTOR_SUBPARTS (vectype) - 1;
2450 else
2451 niters_th += LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
2454 /* Niters for at least one iteration of vectorized loop. */
2455 if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
2456 niters_th += LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2457 /* One additional iteration because of peeling for gap. */
2458 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
2459 niters_th += 1;
2460 LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = niters_th;
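      /* Worked example (illustrative numbers): with VF == 4, unknown
	 alignment peeling (PEELING_FOR_ALIGNMENT < 0), a vector type of
	 4 elements and no full masking, niters_th = (4 - 1) + 4 = 7,
	 plus 1 more if peeling for gaps is needed, giving a versioning
	 threshold of 7 or 8 iterations.  */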
2463 gcc_assert (known_eq (vectorization_factor,
2464 LOOP_VINFO_VECT_FACTOR (loop_vinfo)));
2466 /* Ok to vectorize! */
2467 return true;
2469 again:
2470 /* Try again with SLP forced off but if we didn't do any SLP there is
2471 no point in re-trying. */
2472 if (!slp)
2473 return false;
2475 /* If there are reduction chains re-trying will fail anyway. */
2476 if (! LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).is_empty ())
2477 return false;
2479 /* Likewise if the grouped loads or stores in the SLP cannot be handled
2480 via interleaving or lane instructions. */
2481 slp_instance instance;
2482 slp_tree node;
2483 unsigned i, j;
2484 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
2486 stmt_vec_info vinfo;
2487 vinfo = vinfo_for_stmt
2488 (SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (instance))[0]);
2489 if (! STMT_VINFO_GROUPED_ACCESS (vinfo))
2490 continue;
2491 vinfo = vinfo_for_stmt (STMT_VINFO_GROUP_FIRST_ELEMENT (vinfo));
2492 unsigned int size = STMT_VINFO_GROUP_SIZE (vinfo);
2493 tree vectype = STMT_VINFO_VECTYPE (vinfo);
2494 if (! vect_store_lanes_supported (vectype, size, false)
2495 && ! vect_grouped_store_supported (vectype, size))
2496 return false;
2497 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), j, node)
2499 vinfo = vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (node)[0]);
2500 vinfo = vinfo_for_stmt (STMT_VINFO_GROUP_FIRST_ELEMENT (vinfo));
2501 bool single_element_p = !STMT_VINFO_GROUP_NEXT_ELEMENT (vinfo);
2502 size = STMT_VINFO_GROUP_SIZE (vinfo);
2503 vectype = STMT_VINFO_VECTYPE (vinfo);
2504 if (! vect_load_lanes_supported (vectype, size, false)
2505 && ! vect_grouped_load_supported (vectype, single_element_p,
2506 size))
2507 return false;
2511 if (dump_enabled_p ())
2512 dump_printf_loc (MSG_NOTE, vect_location,
2513 "re-trying with SLP disabled\n");
2515 /* Roll back state appropriately. No SLP this time. */
2516 slp = false;
2517 /* Restore the vectorization factor to what it was without SLP. */
2518 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = saved_vectorization_factor;
2519 /* Free the SLP instances. */
2520 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), j, instance)
2521 vect_free_slp_instance (instance);
2522 LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
2523 /* Reset SLP type to loop_vect on all stmts. */
2524 for (i = 0; i < LOOP_VINFO_LOOP (loop_vinfo)->num_nodes; ++i)
2526 basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
2527 for (gimple_stmt_iterator si = gsi_start_phis (bb);
2528 !gsi_end_p (si); gsi_next (&si))
2530 stmt_vec_info stmt_info = vinfo_for_stmt (gsi_stmt (si));
2531 STMT_SLP_TYPE (stmt_info) = loop_vect;
2533 for (gimple_stmt_iterator si = gsi_start_bb (bb);
2534 !gsi_end_p (si); gsi_next (&si))
2536 stmt_vec_info stmt_info = vinfo_for_stmt (gsi_stmt (si));
2537 STMT_SLP_TYPE (stmt_info) = loop_vect;
2538 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
2540 stmt_info = vinfo_for_stmt (STMT_VINFO_RELATED_STMT (stmt_info));
2541 STMT_SLP_TYPE (stmt_info) = loop_vect;
2542 for (gimple_stmt_iterator pi
2543 = gsi_start (STMT_VINFO_PATTERN_DEF_SEQ (stmt_info));
2544 !gsi_end_p (pi); gsi_next (&pi))
2546 gimple *pstmt = gsi_stmt (pi);
2547 STMT_SLP_TYPE (vinfo_for_stmt (pstmt)) = loop_vect;
2552 /* Free optimized alias test DDRS. */
2553 LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).truncate (0);
2554 LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).release ();
2555 LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).release ();
2556 /* Reset target cost data. */
2557 destroy_cost_data (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo));
2558 LOOP_VINFO_TARGET_COST_DATA (loop_vinfo)
2559 = init_cost (LOOP_VINFO_LOOP (loop_vinfo));
2560 /* Reset accumulated rgroup information. */
2561 release_vec_loop_masks (&LOOP_VINFO_MASKS (loop_vinfo));
2562 /* Reset assorted flags. */
2563 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
2564 LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = false;
2565 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = 0;
2566 LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = 0;
2567 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = saved_can_fully_mask_p;
2569 goto start_over;
2572 /* Function vect_analyze_loop.
2574 Apply a set of analyses on LOOP, and create a loop_vec_info struct
2575 for it. The different analyses will record information in the
2576 loop_vec_info struct. If ORIG_LOOP_VINFO is not NULL, LOOP is the
2577 epilogue of that loop and must be vectorized. */
2578 loop_vec_info
2579 vect_analyze_loop (struct loop *loop, loop_vec_info orig_loop_vinfo)
2581 loop_vec_info loop_vinfo;
2582 auto_vector_sizes vector_sizes;
2584 /* Autodetect first vector size we try. */
2585 current_vector_size = 0;
2586 targetm.vectorize.autovectorize_vector_sizes (&vector_sizes);
2587 unsigned int next_size = 0;
2589 if (dump_enabled_p ())
2590 dump_printf_loc (MSG_NOTE, vect_location,
2591 "===== analyze_loop_nest =====\n");
2593 if (loop_outer (loop)
2594 && loop_vec_info_for_loop (loop_outer (loop))
2595 && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop))))
2597 if (dump_enabled_p ())
2598 dump_printf_loc (MSG_NOTE, vect_location,
2599 "outer-loop already vectorized.\n");
2600 return NULL;
2603 poly_uint64 autodetected_vector_size = 0;
2604 while (1)
2606 /* Check the CFG characteristics of the loop (nesting, entry/exit). */
2607 loop_vinfo = vect_analyze_loop_form (loop);
2608 if (!loop_vinfo)
2610 if (dump_enabled_p ())
2611 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2612 "bad loop form.\n");
2613 return NULL;
2616 bool fatal = false;
2618 if (orig_loop_vinfo)
2619 LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = orig_loop_vinfo;
2621 if (vect_analyze_loop_2 (loop_vinfo, fatal))
2623 LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1;
2625 return loop_vinfo;
2628 delete loop_vinfo;
2630 if (next_size == 0)
2631 autodetected_vector_size = current_vector_size;
2633 if (next_size < vector_sizes.length ()
2634 && known_eq (vector_sizes[next_size], autodetected_vector_size))
2635 next_size += 1;
2637 if (fatal
2638 || next_size == vector_sizes.length ()
2639 || known_eq (current_vector_size, 0U))
2640 return NULL;
2642 /* Try the next biggest vector size. */
2643 current_vector_size = vector_sizes[next_size++];
2644 if (dump_enabled_p ())
2646 dump_printf_loc (MSG_NOTE, vect_location,
2647 "***** Re-trying analysis with "
2648 "vector size ");
2649 dump_dec (MSG_NOTE, current_vector_size);
2650 dump_printf (MSG_NOTE, "\n");
2655 /* Return true if there is an in-order reduction function for CODE, storing
2656 it in *REDUC_FN if so. */
2658 static bool
2659 fold_left_reduction_fn (tree_code code, internal_fn *reduc_fn)
2661 switch (code)
2663 case PLUS_EXPR:
2664 *reduc_fn = IFN_FOLD_LEFT_PLUS;
2665 return true;
2667 default:
2668 return false;
2672 /* Function reduction_fn_for_scalar_code
2674 Input:
2675 CODE - tree_code of a reduction operation.
2677 Output:
2678 REDUC_FN - the corresponding internal function to be used to reduce the
2679 vector of partial results into a single scalar result, or IFN_LAST
2680 if the operation is a supported reduction operation, but does not have
2681 such an internal function.
2683 Return FALSE if CODE currently cannot be vectorized as reduction. */
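   /* Hedged usage sketch (an editorial gloss, names follow the code
      below):

	internal_fn reduc_fn;
	if (reduction_fn_for_scalar_code (MAX_EXPR, &reduc_fn))
	  ... reduc_fn == IFN_REDUC_MAX ...
	if (reduction_fn_for_scalar_code (MULT_EXPR, &reduc_fn))
	  ... reduc_fn == IFN_LAST: supported, but the epilogue must
	      reduce the vector of partial results without a direct
	      internal function ...  */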
2685 static bool
2686 reduction_fn_for_scalar_code (enum tree_code code, internal_fn *reduc_fn)
2688 switch (code)
2690 case MAX_EXPR:
2691 *reduc_fn = IFN_REDUC_MAX;
2692 return true;
2694 case MIN_EXPR:
2695 *reduc_fn = IFN_REDUC_MIN;
2696 return true;
2698 case PLUS_EXPR:
2699 *reduc_fn = IFN_REDUC_PLUS;
2700 return true;
2702 case BIT_AND_EXPR:
2703 *reduc_fn = IFN_REDUC_AND;
2704 return true;
2706 case BIT_IOR_EXPR:
2707 *reduc_fn = IFN_REDUC_IOR;
2708 return true;
2710 case BIT_XOR_EXPR:
2711 *reduc_fn = IFN_REDUC_XOR;
2712 return true;
2714 case MULT_EXPR:
2715 case MINUS_EXPR:
2716 *reduc_fn = IFN_LAST;
2717 return true;
2719 default:
2720 return false;
2724 /* If there is a neutral value X such that SLP reduction NODE would not
2725 be affected by the introduction of additional X elements, return that X,
2726 otherwise return null. CODE is the code of the reduction. REDUC_CHAIN
2727 is true if the SLP statements perform a single reduction, false if each
2728 statement performs an independent reduction. */
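   /* Editorial example: for PLUS_EXPR the neutral value is 0 and for
      MULT_EXPR it is 1, so an SLP group {a, b, c} padded to vector
      width with the neutral value, e.g. {a, b, c, 0} under addition,
      still reduces to the same result.  */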
2730 static tree
2731 neutral_op_for_slp_reduction (slp_tree slp_node, tree_code code,
2732 bool reduc_chain)
2734 vec<gimple *> stmts = SLP_TREE_SCALAR_STMTS (slp_node);
2735 gimple *stmt = stmts[0];
2736 stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
2737 tree vector_type = STMT_VINFO_VECTYPE (stmt_vinfo);
2738 tree scalar_type = TREE_TYPE (vector_type);
2739 struct loop *loop = gimple_bb (stmt)->loop_father;
2740 gcc_assert (loop);
2742 switch (code)
2744 case WIDEN_SUM_EXPR:
2745 case DOT_PROD_EXPR:
2746 case SAD_EXPR:
2747 case PLUS_EXPR:
2748 case MINUS_EXPR:
2749 case BIT_IOR_EXPR:
2750 case BIT_XOR_EXPR:
2751 return build_zero_cst (scalar_type);
2753 case MULT_EXPR:
2754 return build_one_cst (scalar_type);
2756 case BIT_AND_EXPR:
2757 return build_all_ones_cst (scalar_type);
2759 case MAX_EXPR:
2760 case MIN_EXPR:
2761 /* For MIN/MAX the initial values are neutral. A reduction chain
2762 has only a single initial value, so that value is neutral for
2763 all statements. */
2764 if (reduc_chain)
2765 return PHI_ARG_DEF_FROM_EDGE (stmt, loop_preheader_edge (loop));
2766 return NULL_TREE;
2768 default:
2769 return NULL_TREE;
2773 /* Error reporting helper for vect_is_simple_reduction below. GIMPLE statement
2774 STMT is printed with a message MSG. */
2776 static void
2777 report_vect_op (dump_flags_t msg_type, gimple *stmt, const char *msg)
2779 dump_printf_loc (msg_type, vect_location, "%s", msg);
2780 dump_gimple_stmt (msg_type, TDF_SLIM, stmt, 0);
2784 /* Detect SLP reduction of the form:
2786 #a1 = phi <a5, a0>
2787 a2 = operation (a1)
2788 a3 = operation (a2)
2789 a4 = operation (a3)
2790 a5 = operation (a4)
2792 #a = phi <a5>
2794 PHI is the reduction phi node (#a1 = phi <a5, a0> above)
2795 FIRST_STMT is the first reduction stmt in the chain
2796 (a2 = operation (a1)).
2798 Return TRUE if a reduction chain was detected. */
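   /* Editorial source-level sketch of such a chain (a manually unrolled
      reduction, illustrative only):

	int s = 0;
	for (int i = 0; i < n; i += 4)
	  {
	    s += a[i];		(a2 = operation (a1))
	    s += a[i + 1];	(a3 = operation (a2))
	    s += a[i + 2];	(a4 = operation (a3))
	    s += a[i + 3];	(a5 = operation (a4))
	  }
   */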
2800 static bool
2801 vect_is_slp_reduction (loop_vec_info loop_info, gimple *phi,
2802 gimple *first_stmt)
2804 struct loop *loop = (gimple_bb (phi))->loop_father;
2805 struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
2806 enum tree_code code;
2807 gimple *current_stmt = NULL, *loop_use_stmt = NULL, *first, *next_stmt;
2808 stmt_vec_info use_stmt_info, current_stmt_info;
2809 tree lhs;
2810 imm_use_iterator imm_iter;
2811 use_operand_p use_p;
2812 int nloop_uses, size = 0, n_out_of_loop_uses;
2813 bool found = false;
2815 if (loop != vect_loop)
2816 return false;
2818 lhs = PHI_RESULT (phi);
2819 code = gimple_assign_rhs_code (first_stmt);
2820 while (1)
2822 nloop_uses = 0;
2823 n_out_of_loop_uses = 0;
2824 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
2826 gimple *use_stmt = USE_STMT (use_p);
2827 if (is_gimple_debug (use_stmt))
2828 continue;
2830 /* Check if we got back to the reduction phi. */
2831 if (use_stmt == phi)
2833 loop_use_stmt = use_stmt;
2834 found = true;
2835 break;
2838 if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2840 loop_use_stmt = use_stmt;
2841 nloop_uses++;
2843 else
2844 n_out_of_loop_uses++;
2846 /* There can be either a single use in the loop or two uses in
2847 phi nodes. */
2848 if (nloop_uses > 1 || (n_out_of_loop_uses && nloop_uses))
2849 return false;
2852 if (found)
2853 break;
2855 /* We reached a statement with no loop uses. */
2856 if (nloop_uses == 0)
2857 return false;
2859 /* This is a loop exit phi, and we haven't reached the reduction phi. */
2860 if (gimple_code (loop_use_stmt) == GIMPLE_PHI)
2861 return false;
2863 if (!is_gimple_assign (loop_use_stmt)
2864 || code != gimple_assign_rhs_code (loop_use_stmt)
2865 || !flow_bb_inside_loop_p (loop, gimple_bb (loop_use_stmt)))
2866 return false;
2868 /* Insert USE_STMT into reduction chain. */
2869 use_stmt_info = vinfo_for_stmt (loop_use_stmt);
2870 if (current_stmt)
2872 current_stmt_info = vinfo_for_stmt (current_stmt);
2873 GROUP_NEXT_ELEMENT (current_stmt_info) = loop_use_stmt;
2874 GROUP_FIRST_ELEMENT (use_stmt_info)
2875 = GROUP_FIRST_ELEMENT (current_stmt_info);
2877 else
2878 GROUP_FIRST_ELEMENT (use_stmt_info) = loop_use_stmt;
2880 lhs = gimple_assign_lhs (loop_use_stmt);
2881 current_stmt = loop_use_stmt;
2882 size++;
2885 if (!found || loop_use_stmt != phi || size < 2)
2886 return false;
2888 /* Swap the operands, if needed, so that the reduction operand is the
2889 second operand. */
2890 lhs = PHI_RESULT (phi);
2891 next_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (current_stmt));
2892 while (next_stmt)
2894 if (gimple_assign_rhs2 (next_stmt) == lhs)
2896 tree op = gimple_assign_rhs1 (next_stmt);
2897 gimple *def_stmt = NULL;
2899 if (TREE_CODE (op) == SSA_NAME)
2900 def_stmt = SSA_NAME_DEF_STMT (op);
2902 /* Check that the other def is either defined in the loop
2903 ("vect_internal_def"), or it's an induction (defined by a
2904 loop-header phi-node). */
2905 if (def_stmt
2906 && gimple_bb (def_stmt)
2907 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2908 && (is_gimple_assign (def_stmt)
2909 || is_gimple_call (def_stmt)
2910 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2911 == vect_induction_def
2912 || (gimple_code (def_stmt) == GIMPLE_PHI
2913 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2914 == vect_internal_def
2915 && !is_loop_header_bb_p (gimple_bb (def_stmt)))))
2917 lhs = gimple_assign_lhs (next_stmt);
2918 next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
2919 continue;
2922 return false;
2924 else
2926 tree op = gimple_assign_rhs2 (next_stmt);
2927 gimple *def_stmt = NULL;
2929 if (TREE_CODE (op) == SSA_NAME)
2930 def_stmt = SSA_NAME_DEF_STMT (op);
2932 /* Check that the other def is either defined in the loop
2933 ("vect_internal_def"), or it's an induction (defined by a
2934 loop-header phi-node). */
2935 if (def_stmt
2936 && gimple_bb (def_stmt)
2937 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2938 && (is_gimple_assign (def_stmt)
2939 || is_gimple_call (def_stmt)
2940 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2941 == vect_induction_def
2942 || (gimple_code (def_stmt) == GIMPLE_PHI
2943 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2944 == vect_internal_def
2945 && !is_loop_header_bb_p (gimple_bb (def_stmt)))))
2947 if (dump_enabled_p ())
2949 dump_printf_loc (MSG_NOTE, vect_location, "swapping oprnds: ");
2950 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, next_stmt, 0);
2953 swap_ssa_operands (next_stmt,
2954 gimple_assign_rhs1_ptr (next_stmt),
2955 gimple_assign_rhs2_ptr (next_stmt));
2956 update_stmt (next_stmt);
2958 if (CONSTANT_CLASS_P (gimple_assign_rhs1 (next_stmt)))
2959 LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true;
2961 else
2962 return false;
2965 lhs = gimple_assign_lhs (next_stmt);
2966 next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
2969 /* Save the chain for further analysis in SLP detection. */
2970 first = GROUP_FIRST_ELEMENT (vinfo_for_stmt (current_stmt));
2971 LOOP_VINFO_REDUCTION_CHAINS (loop_info).safe_push (first);
2972 GROUP_SIZE (vinfo_for_stmt (first)) = size;
2974 return true;
2977 /* Return true if we need an in-order reduction for operation CODE
2978 on type TYPE. NEED_WRAPPING_INTEGRAL_OVERFLOW is true if integer
2979 overflow must wrap. */
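   /* Editorial example: without -fassociative-math the floating-point
      sum below must be accumulated in the original (left-to-right)
      order, so it needs a fold-left reduction; with MIN/MAX, or with
      -fassociative-math, an out-of-order tree reduction is fine:

	double s = 0.0;
	for (int i = 0; i < n; i++)
	  s += a[i];
   */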
2981 static bool
2982 needs_fold_left_reduction_p (tree type, tree_code code,
2983 bool need_wrapping_integral_overflow)
2985 /* CHECKME: check for !flag_finite_math_only too? */
2986 if (SCALAR_FLOAT_TYPE_P (type))
2987 switch (code)
2989 case MIN_EXPR:
2990 case MAX_EXPR:
2991 return false;
2993 default:
2994 return !flag_associative_math;
2997 if (INTEGRAL_TYPE_P (type))
2999 if (!operation_no_trapping_overflow (type, code))
3000 return true;
3001 if (need_wrapping_integral_overflow
3002 && !TYPE_OVERFLOW_WRAPS (type)
3003 && operation_can_overflow (code))
3004 return true;
3005 return false;
3008 if (SAT_FIXED_POINT_TYPE_P (type))
3009 return true;
3011 return false;
3014 /* Return true if the reduction PHI in LOOP with latch arg LOOP_ARG and
3015 reduction operation CODE has a handled computation expression. */
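   /* Editorial example: for CODE == PLUS_EXPR the path

	s_1 = PHI <s_0 (preheader), s_3 (latch)>;
	s_2 = s_1 + a[i];
	s_3 = s_2 + b[i];

      is accepted, whereas a statement of the form "s_2 = a[i] - s_1"
      is rejected below because the running value would be negated on
      every iteration.  */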
3017 bool
3018 check_reduction_path (location_t loc, loop_p loop, gphi *phi, tree loop_arg,
3019 enum tree_code code)
3021 auto_vec<std::pair<ssa_op_iter, use_operand_p> > path;
3022 auto_bitmap visited;
3023 tree lookfor = PHI_RESULT (phi);
3024 ssa_op_iter curri;
3025 use_operand_p curr = op_iter_init_phiuse (&curri, phi, SSA_OP_USE);
3026 while (USE_FROM_PTR (curr) != loop_arg)
3027 curr = op_iter_next_use (&curri);
3028 curri.i = curri.numops;
3031 path.safe_push (std::make_pair (curri, curr));
3032 tree use = USE_FROM_PTR (curr);
3033 if (use == lookfor)
3034 break;
3035 gimple *def = SSA_NAME_DEF_STMT (use);
3036 if (gimple_nop_p (def)
3037 || ! flow_bb_inside_loop_p (loop, gimple_bb (def)))
3039 pop:
3042 std::pair<ssa_op_iter, use_operand_p> x = path.pop ();
3043 curri = x.first;
3044 curr = x.second;
3046 curr = op_iter_next_use (&curri);
3047 /* Skip already visited or non-SSA operands (from iterating
3048 over PHI args). */
3049 while (curr != NULL_USE_OPERAND_P
3050 && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
3051 || ! bitmap_set_bit (visited,
3052 SSA_NAME_VERSION
3053 (USE_FROM_PTR (curr)))));
3055 while (curr == NULL_USE_OPERAND_P && ! path.is_empty ());
3056 if (curr == NULL_USE_OPERAND_P)
3057 break;
3059 else
3061 if (gimple_code (def) == GIMPLE_PHI)
3062 curr = op_iter_init_phiuse (&curri, as_a <gphi *>(def), SSA_OP_USE);
3063 else
3064 curr = op_iter_init_use (&curri, def, SSA_OP_USE);
3065 while (curr != NULL_USE_OPERAND_P
3066 && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
3067 || ! bitmap_set_bit (visited,
3068 SSA_NAME_VERSION
3069 (USE_FROM_PTR (curr)))))
3070 curr = op_iter_next_use (&curri);
3071 if (curr == NULL_USE_OPERAND_P)
3072 goto pop;
3075 while (1);
3076 if (dump_file && (dump_flags & TDF_DETAILS))
3078 dump_printf_loc (MSG_NOTE, loc, "reduction path: ");
3079 unsigned i;
3080 std::pair<ssa_op_iter, use_operand_p> *x;
3081 FOR_EACH_VEC_ELT (path, i, x)
3083 dump_generic_expr (MSG_NOTE, TDF_SLIM, USE_FROM_PTR (x->second));
3084 dump_printf (MSG_NOTE, " ");
3086 dump_printf (MSG_NOTE, "\n");
3089 /* Check whether the reduction path detected is valid. */
3090 bool fail = path.length () == 0;
3091 bool neg = false;
3092 for (unsigned i = 1; i < path.length (); ++i)
3094 gimple *use_stmt = USE_STMT (path[i].second);
3095 tree op = USE_FROM_PTR (path[i].second);
3096 if (! has_single_use (op)
3097 || ! is_gimple_assign (use_stmt))
3099 fail = true;
3100 break;
3102 if (gimple_assign_rhs_code (use_stmt) != code)
3104 if (code == PLUS_EXPR
3105 && gimple_assign_rhs_code (use_stmt) == MINUS_EXPR)
3107 /* Track whether we negate the reduction value each iteration. */
3108 if (gimple_assign_rhs2 (use_stmt) == op)
3109 neg = ! neg;
3111 else
3113 fail = true;
3114 break;
3118 return ! fail && ! neg;
3122 /* Function vect_is_simple_reduction
3124 (1) Detect a cross-iteration def-use cycle that represents a simple
3125 reduction computation. We look for the following pattern:
3127 loop_header:
3128 a1 = phi < a0, a2 >
3129 a3 = ...
3130 a2 = operation (a3, a1)
3134 a3 = ...
3135 loop_header:
3136 a1 = phi < a0, a2 >
3137 a2 = operation (a3, a1)
3139 such that:
3140 1. operation is commutative and associative and it is safe to
3141 change the order of the computation
3142 2. no uses for a2 in the loop (a2 is used out of the loop)
3143 3. no uses of a1 in the loop besides the reduction operation
3144 4. no uses of a1 outside the loop.
3146 Conditions 1,4 are tested here.
3147 Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.
3149 (2) Detect a cross-iteration def-use cycle in nested loops, i.e.,
3150 nested cycles.
3152 (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double
3153 reductions:
3155 a1 = phi < a0, a2 >
3156 inner loop (def of a3)
3157 a2 = phi < a3 >
3159 (4) Detect condition expressions, i.e.:
3160 for (int i = 0; i < N; i++)
3161 if (a[i] < val)
3162 ret_val = a[i];
3166 static gimple *
3167 vect_is_simple_reduction (loop_vec_info loop_info, gimple *phi,
3168 bool *double_reduc,
3169 bool need_wrapping_integral_overflow,
3170 enum vect_reduction_type *v_reduc_type)
3172 struct loop *loop = (gimple_bb (phi))->loop_father;
3173 struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
3174 gimple *def_stmt, *def1 = NULL, *def2 = NULL, *phi_use_stmt = NULL;
3175 enum tree_code orig_code, code;
3176 tree op1, op2, op3 = NULL_TREE, op4 = NULL_TREE;
3177 tree type;
3178 int nloop_uses;
3179 tree name;
3180 imm_use_iterator imm_iter;
3181 use_operand_p use_p;
3182 bool phi_def;
3184 *double_reduc = false;
3185 *v_reduc_type = TREE_CODE_REDUCTION;
3187 tree phi_name = PHI_RESULT (phi);
3188 /* ??? If there are no uses of the PHI result the inner loop reduction
3189 won't be detected as possibly double-reduction by vectorizable_reduction
3190 because that tries to walk the PHI arg from the preheader edge which
3191 can be constant. See PR60382. */
3192 if (has_zero_uses (phi_name))
3193 return NULL;
3194 nloop_uses = 0;
3195 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, phi_name)
3197 gimple *use_stmt = USE_STMT (use_p);
3198 if (is_gimple_debug (use_stmt))
3199 continue;
3201 if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
3203 if (dump_enabled_p ())
3204 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3205 "intermediate value used outside loop.\n");
3207 return NULL;
3210 nloop_uses++;
3211 if (nloop_uses > 1)
3213 if (dump_enabled_p ())
3214 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3215 "reduction value used in loop.\n");
3216 return NULL;
3219 phi_use_stmt = use_stmt;
3222 edge latch_e = loop_latch_edge (loop);
3223 tree loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
3224 if (TREE_CODE (loop_arg) != SSA_NAME)
3226 if (dump_enabled_p ())
3228 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3229 "reduction: not ssa_name: ");
3230 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, loop_arg);
3231 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3233 return NULL;
3236 def_stmt = SSA_NAME_DEF_STMT (loop_arg);
3237 if (is_gimple_assign (def_stmt))
3239 name = gimple_assign_lhs (def_stmt);
3240 phi_def = false;
3242 else if (gimple_code (def_stmt) == GIMPLE_PHI)
3244 name = PHI_RESULT (def_stmt);
3245 phi_def = true;
3247 else
3249 if (dump_enabled_p ())
3251 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3252 "reduction: unhandled reduction operation: ");
3253 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, def_stmt, 0);
3255 return NULL;
3258 if (! flow_bb_inside_loop_p (loop, gimple_bb (def_stmt)))
3259 return NULL;
3261 nloop_uses = 0;
3262 auto_vec<gphi *, 3> lcphis;
3263 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name)
3265 gimple *use_stmt = USE_STMT (use_p);
3266 if (is_gimple_debug (use_stmt))
3267 continue;
3268 if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
3269 nloop_uses++;
3270 else
3271 /* We can have more than one loop-closed PHI. */
3272 lcphis.safe_push (as_a <gphi *> (use_stmt));
3273 if (nloop_uses > 1)
3275 if (dump_enabled_p ())
3276 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3277 "reduction used in loop.\n");
3278 return NULL;
3282 /* If DEF_STMT is a phi node itself, we expect it to have a single argument
3283 defined in the inner loop. */
3284 if (phi_def)
3286 op1 = PHI_ARG_DEF (def_stmt, 0);
3288 if (gimple_phi_num_args (def_stmt) != 1
3289 || TREE_CODE (op1) != SSA_NAME)
3291 if (dump_enabled_p ())
3292 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3293 "unsupported phi node definition.\n");
3295 return NULL;
3298 def1 = SSA_NAME_DEF_STMT (op1);
3299 if (gimple_bb (def1)
3300 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
3301 && loop->inner
3302 && flow_bb_inside_loop_p (loop->inner, gimple_bb (def1))
3303 && is_gimple_assign (def1)
3304 && flow_bb_inside_loop_p (loop->inner, gimple_bb (phi_use_stmt)))
3306 if (dump_enabled_p ())
3307 report_vect_op (MSG_NOTE, def_stmt,
3308 "detected double reduction: ");
3310 *double_reduc = true;
3311 return def_stmt;
3314 return NULL;
3317 /* If we are vectorizing an inner reduction, it is executed in the
3318 original order only when we are not dealing with a double
3319 reduction. */
3320 bool check_reduction = true;
3321 if (flow_loop_nested_p (vect_loop, loop))
3323 gphi *lcphi;
3324 unsigned i;
3325 check_reduction = false;
3326 FOR_EACH_VEC_ELT (lcphis, i, lcphi)
3327 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, gimple_phi_result (lcphi))
3329 gimple *use_stmt = USE_STMT (use_p);
3330 if (is_gimple_debug (use_stmt))
3331 continue;
3332 if (! flow_bb_inside_loop_p (vect_loop, gimple_bb (use_stmt)))
3333 check_reduction = true;
3337 bool nested_in_vect_loop = flow_loop_nested_p (vect_loop, loop);
3338 code = orig_code = gimple_assign_rhs_code (def_stmt);
3340 /* We can handle "res -= x[i]", which is non-associative, by
3341 simply rewriting it into "res += -x[i]". Avoid changing the
3342 gimple instruction for the first simple tests and only do this
3343 if we're allowed to change the code at all. */
3344 if (code == MINUS_EXPR && gimple_assign_rhs2 (def_stmt) != phi_name)
3345 code = PLUS_EXPR;
3347 if (code == COND_EXPR)
3349 if (! nested_in_vect_loop)
3350 *v_reduc_type = COND_REDUCTION;
3352 op3 = gimple_assign_rhs1 (def_stmt);
3353 if (COMPARISON_CLASS_P (op3))
3355 op4 = TREE_OPERAND (op3, 1);
3356 op3 = TREE_OPERAND (op3, 0);
3358 if (op3 == phi_name || op4 == phi_name)
3360 if (dump_enabled_p ())
3361 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3362 "reduction: condition depends on previous"
3363 " iteration: ");
3364 return NULL;
3367 op1 = gimple_assign_rhs2 (def_stmt);
3368 op2 = gimple_assign_rhs3 (def_stmt);
3370 else if (!commutative_tree_code (code) || !associative_tree_code (code))
3372 if (dump_enabled_p ())
3373 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3374 "reduction: not commutative/associative: ");
3375 return NULL;
3377 else if (get_gimple_rhs_class (code) == GIMPLE_BINARY_RHS)
3379 op1 = gimple_assign_rhs1 (def_stmt);
3380 op2 = gimple_assign_rhs2 (def_stmt);
3382 else
3384 if (dump_enabled_p ())
3385 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3386 "reduction: not handled operation: ");
3387 return NULL;
3390 if (TREE_CODE (op1) != SSA_NAME && TREE_CODE (op2) != SSA_NAME)
3392 if (dump_enabled_p ())
3393 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3394 "reduction: both uses not ssa_names: ");
3396 return NULL;
3399 type = TREE_TYPE (gimple_assign_lhs (def_stmt));
3400 if ((TREE_CODE (op1) == SSA_NAME
3401 && !types_compatible_p (type,TREE_TYPE (op1)))
3402 || (TREE_CODE (op2) == SSA_NAME
3403 && !types_compatible_p (type, TREE_TYPE (op2)))
3404 || (op3 && TREE_CODE (op3) == SSA_NAME
3405 && !types_compatible_p (type, TREE_TYPE (op3)))
3406 || (op4 && TREE_CODE (op4) == SSA_NAME
3407 && !types_compatible_p (type, TREE_TYPE (op4))))
3409 if (dump_enabled_p ())
3411 dump_printf_loc (MSG_NOTE, vect_location,
3412 "reduction: multiple types: operation type: ");
3413 dump_generic_expr (MSG_NOTE, TDF_SLIM, type);
3414 dump_printf (MSG_NOTE, ", operands types: ");
3415 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3416 TREE_TYPE (op1));
3417 dump_printf (MSG_NOTE, ",");
3418 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3419 TREE_TYPE (op2));
3420 if (op3)
3422 dump_printf (MSG_NOTE, ",");
3423 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3424 TREE_TYPE (op3));
3427 if (op4)
3429 dump_printf (MSG_NOTE, ",");
3430 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3431 TREE_TYPE (op4));
3433 dump_printf (MSG_NOTE, "\n");
3436 return NULL;
3439 /* Check whether it's ok to change the order of the computation.
3440 Generally, when vectorizing a reduction we change the order of the
3441 computation. This may change the behavior of the program in some
3442 cases, so we need to check that this is ok. One exception is when
3443 vectorizing an outer-loop: the inner-loop is executed sequentially,
3444 and therefore vectorizing reductions in the inner-loop during
3445 outer-loop vectorization is safe. */
3446 if (check_reduction
3447 && *v_reduc_type == TREE_CODE_REDUCTION
3448 && needs_fold_left_reduction_p (type, code,
3449 need_wrapping_integral_overflow))
3450 *v_reduc_type = FOLD_LEFT_REDUCTION;
3452 /* Reduction is safe. We're dealing with one of the following:
3453 1) integer arithmetic and no trapv
3454 2) floating point arithmetic, and special flags permit this optimization
3455 3) nested cycle (i.e., outer loop vectorization). */
3456 if (TREE_CODE (op1) == SSA_NAME)
3457 def1 = SSA_NAME_DEF_STMT (op1);
3459 if (TREE_CODE (op2) == SSA_NAME)
3460 def2 = SSA_NAME_DEF_STMT (op2);
3462 if (code != COND_EXPR
3463 && ((!def1 || gimple_nop_p (def1)) && (!def2 || gimple_nop_p (def2))))
3465 if (dump_enabled_p ())
3466 report_vect_op (MSG_NOTE, def_stmt, "reduction: no defs for operands: ");
3467 return NULL;
3470 /* Check that one def is the reduction def, defined by PHI,
3471 the other def is either defined in the loop ("vect_internal_def"),
3472 or it's an induction (defined by a loop-header phi-node). */
3474 if (def2 && def2 == phi
3475 && (code == COND_EXPR
3476 || !def1 || gimple_nop_p (def1)
3477 || !flow_bb_inside_loop_p (loop, gimple_bb (def1))
3478 || (def1 && flow_bb_inside_loop_p (loop, gimple_bb (def1))
3479 && (is_gimple_assign (def1)
3480 || is_gimple_call (def1)
3481 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1))
3482 == vect_induction_def
3483 || (gimple_code (def1) == GIMPLE_PHI
3484 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1))
3485 == vect_internal_def
3486 && !is_loop_header_bb_p (gimple_bb (def1)))))))
3488 if (dump_enabled_p ())
3489 report_vect_op (MSG_NOTE, def_stmt, "detected reduction: ");
3490 return def_stmt;
3493 if (def1 && def1 == phi
3494 && (code == COND_EXPR
3495 || !def2 || gimple_nop_p (def2)
3496 || !flow_bb_inside_loop_p (loop, gimple_bb (def2))
3497 || (def2 && flow_bb_inside_loop_p (loop, gimple_bb (def2))
3498 && (is_gimple_assign (def2)
3499 || is_gimple_call (def2)
3500 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2))
3501 == vect_induction_def
3502 || (gimple_code (def2) == GIMPLE_PHI
3503 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2))
3504 == vect_internal_def
3505 && !is_loop_header_bb_p (gimple_bb (def2)))))))
3507 if (! nested_in_vect_loop && orig_code != MINUS_EXPR)
3509 /* Check if we can swap operands (just for simplicity - so that
3510 the rest of the code can assume that the reduction variable
3511 is always the last (second) argument). */
3512 if (code == COND_EXPR)
3514 /* Swap cond_expr by inverting the condition. */
3515 tree cond_expr = gimple_assign_rhs1 (def_stmt);
3516 enum tree_code invert_code = ERROR_MARK;
3517 enum tree_code cond_code = TREE_CODE (cond_expr);
3519 if (TREE_CODE_CLASS (cond_code) == tcc_comparison)
3521 bool honor_nans = HONOR_NANS (TREE_OPERAND (cond_expr, 0));
3522 invert_code = invert_tree_comparison (cond_code, honor_nans);
3524 if (invert_code != ERROR_MARK)
3526 TREE_SET_CODE (cond_expr, invert_code);
3527 swap_ssa_operands (def_stmt,
3528 gimple_assign_rhs2_ptr (def_stmt),
3529 gimple_assign_rhs3_ptr (def_stmt));
3531 else
3533 if (dump_enabled_p ())
3534 report_vect_op (MSG_NOTE, def_stmt,
3535 "detected reduction: cannot swap operands "
3536 "for cond_expr");
3537 return NULL;
3540 else
3541 swap_ssa_operands (def_stmt, gimple_assign_rhs1_ptr (def_stmt),
3542 gimple_assign_rhs2_ptr (def_stmt));
3544 if (dump_enabled_p ())
3545 report_vect_op (MSG_NOTE, def_stmt,
3546 "detected reduction: need to swap operands: ");
3548 if (CONSTANT_CLASS_P (gimple_assign_rhs1 (def_stmt)))
3549 LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true;
3551 else
3553 if (dump_enabled_p ())
3554 report_vect_op (MSG_NOTE, def_stmt, "detected reduction: ");
3557 return def_stmt;
3560 /* Try to find SLP reduction chain. */
3561 if (! nested_in_vect_loop
3562 && code != COND_EXPR
3563 && orig_code != MINUS_EXPR
3564 && vect_is_slp_reduction (loop_info, phi, def_stmt))
3566 if (dump_enabled_p ())
3567 report_vect_op (MSG_NOTE, def_stmt,
3568 "reduction: detected reduction chain: ");
3570 return def_stmt;
3573 /* Dissolve any group possibly half-built by vect_is_slp_reduction. */
3574 gimple *first = GROUP_FIRST_ELEMENT (vinfo_for_stmt (def_stmt));
3575 while (first)
3577 gimple *next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (first));
3578 GROUP_FIRST_ELEMENT (vinfo_for_stmt (first)) = NULL;
3579 GROUP_NEXT_ELEMENT (vinfo_for_stmt (first)) = NULL;
3580 first = next;
3583 /* Look for the expression computing loop_arg from loop PHI result. */
3584 if (check_reduction_path (vect_location, loop, as_a <gphi *> (phi), loop_arg,
3585 code))
3586 return def_stmt;
3588 if (dump_enabled_p ())
3590 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3591 "reduction: unknown pattern: ");
3594 return NULL;
3597 /* Wrapper around vect_is_simple_reduction, which will modify code
3598 in-place if it enables detection of more reductions. Arguments
3599 as there. */
3601 gimple *
3602 vect_force_simple_reduction (loop_vec_info loop_info, gimple *phi,
3603 bool *double_reduc,
3604 bool need_wrapping_integral_overflow)
3606 enum vect_reduction_type v_reduc_type;
3607 gimple *def = vect_is_simple_reduction (loop_info, phi, double_reduc,
3608 need_wrapping_integral_overflow,
3609 &v_reduc_type);
3610 if (def)
3612 stmt_vec_info reduc_def_info = vinfo_for_stmt (phi);
3613 STMT_VINFO_REDUC_TYPE (reduc_def_info) = v_reduc_type;
3614 STMT_VINFO_REDUC_DEF (reduc_def_info) = def;
3615 reduc_def_info = vinfo_for_stmt (def);
3616 STMT_VINFO_REDUC_TYPE (reduc_def_info) = v_reduc_type;
3617 STMT_VINFO_REDUC_DEF (reduc_def_info) = phi;
3619 return def;
3622 /* Calculate cost of peeling the loop PEEL_ITERS_PROLOGUE times. */
3623 int
3624 vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue,
3625 int *peel_iters_epilogue,
3626 stmt_vector_for_cost *scalar_cost_vec,
3627 stmt_vector_for_cost *prologue_cost_vec,
3628 stmt_vector_for_cost *epilogue_cost_vec)
3630 int retval = 0;
3631 int assumed_vf = vect_vf_for_cost (loop_vinfo);
3633 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
3635 *peel_iters_epilogue = assumed_vf / 2;
3636 if (dump_enabled_p ())
3637 dump_printf_loc (MSG_NOTE, vect_location,
3638 "cost model: epilogue peel iters set to vf/2 "
3639 "because loop iterations are unknown .\n");
3641 /* If peeled iterations are known but the number of scalar loop
3642 iterations is unknown, count a taken branch per peeled loop. */
3643 retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
3644 NULL, 0, vect_prologue);
3645 retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
3646 NULL, 0, vect_epilogue);
3648 else
3650 int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
3651 peel_iters_prologue = niters < peel_iters_prologue ?
3652 niters : peel_iters_prologue;
3653 *peel_iters_epilogue = (niters - peel_iters_prologue) % assumed_vf;
3654 /* If we need to peel for gaps but no epilogue peeling would otherwise
3655 be required, we have to peel VF iterations. */
3656 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) && !*peel_iters_epilogue)
3657 *peel_iters_epilogue = assumed_vf;
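      /* Worked example (illustrative numbers): with niters == 100,
	 assumed_vf == 4 and peel_iters_prologue == 3, the epilogue peels
	 (100 - 3) % 4 == 1 iteration; had the remainder been 0 and
	 peeling for gaps been required, a full 4 iterations would be
	 peeled instead.  */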
3660 stmt_info_for_cost *si;
3661 int j;
3662 if (peel_iters_prologue)
3663 FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
3665 stmt_vec_info stmt_info
3666 = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
3667 retval += record_stmt_cost (prologue_cost_vec,
3668 si->count * peel_iters_prologue,
3669 si->kind, stmt_info, si->misalign,
3670 vect_prologue);
3672 if (*peel_iters_epilogue)
3673 FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
3675 stmt_vec_info stmt_info
3676 = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
3677 retval += record_stmt_cost (epilogue_cost_vec,
3678 si->count * *peel_iters_epilogue,
3679 si->kind, stmt_info, si->misalign,
3680 vect_epilogue);
3683 return retval;
3686 /* Function vect_estimate_min_profitable_iters
3688 Return the number of iterations required for the vector version of the
3689 loop to be profitable relative to the cost of the scalar version of the
3690 loop.
3692 *RET_MIN_PROFITABLE_NITERS is a cost model profitability threshold
3693 of iterations for vectorization. -1 value means loop vectorization
3694 is not profitable. This returned value may be used for dynamic
3695 profitability check.
3697 *RET_MIN_PROFITABLE_ESTIMATE is a profitability threshold to be used
3698 for static check against estimated number of iterations. */
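   /* Hedged note on how the two outputs are consumed (see
      vect_analyze_loop_costing above): *RET_MIN_PROFITABLE_NITERS feeds
      the runtime cost-model threshold LOOP_VINFO_COST_MODEL_THRESHOLD,
      while *RET_MIN_PROFITABLE_ESTIMATE is compared against the
      compile-time estimate of the iteration count.  */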
3700 static void
3701 vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
3702 int *ret_min_profitable_niters,
3703 int *ret_min_profitable_estimate)
3705 int min_profitable_iters;
3706 int min_profitable_estimate;
3707 int peel_iters_prologue;
3708 int peel_iters_epilogue;
3709 unsigned vec_inside_cost = 0;
3710 int vec_outside_cost = 0;
3711 unsigned vec_prologue_cost = 0;
3712 unsigned vec_epilogue_cost = 0;
3713 int scalar_single_iter_cost = 0;
3714 int scalar_outside_cost = 0;
3715 int assumed_vf = vect_vf_for_cost (loop_vinfo);
3716 int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
3717 void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3719 /* Cost model disabled. */
3720 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
3722 dump_printf_loc (MSG_NOTE, vect_location, "cost model disabled.\n");
3723 *ret_min_profitable_niters = 0;
3724 *ret_min_profitable_estimate = 0;
3725 return;
3728 /* Requires loop versioning tests to handle misalignment. */
3729 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
3731 /* FIXME: Make cost depend on complexity of individual check. */
3732 unsigned len = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ();
3733 (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
3734 vect_prologue);
3735 dump_printf (MSG_NOTE,
3736 "cost model: Adding cost of checks for loop "
3737 "versioning to treat misalignment.\n");
3740 /* Requires loop versioning with alias checks. */
3741 if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
3743 /* FIXME: Make cost depend on complexity of individual check. */
3744 unsigned len = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).length ();
3745 (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
3746 vect_prologue);
3747 len = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).length ();
3748 if (len)
3749 /* Count LEN - 1 ANDs and LEN comparisons. */
3750 (void) add_stmt_cost (target_cost_data, len * 2 - 1, scalar_stmt,
3751 NULL, 0, vect_prologue);
3752 len = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).length ();
3753 if (len)
3755 /* Count LEN - 1 ANDs and LEN comparisons. */
3756 unsigned int nstmts = len * 2 - 1;
3757 /* +1 for each bias that needs adding. */
3758 for (unsigned int i = 0; i < len; ++i)
3759 if (!LOOP_VINFO_LOWER_BOUNDS (loop_vinfo)[i].unsigned_p)
3760 nstmts += 1;
3761 (void) add_stmt_cost (target_cost_data, nstmts, scalar_stmt,
3762 NULL, 0, vect_prologue);
3764 dump_printf (MSG_NOTE,
3765 "cost model: Adding cost of checks for loop "
3766 "versioning aliasing.\n");
3769 /* Requires loop versioning with niter checks. */
3770 if (LOOP_REQUIRES_VERSIONING_FOR_NITERS (loop_vinfo))
3772 /* FIXME: Make cost depend on complexity of individual check. */
3773 (void) add_stmt_cost (target_cost_data, 1, vector_stmt, NULL, 0,
3774 vect_prologue);
3775 dump_printf (MSG_NOTE,
3776 "cost model: Adding cost of checks for loop "
3777 "versioning niters.\n");
3780 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
3781 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken, NULL, 0,
3782 vect_prologue);
3784 /* Count statements in scalar loop. Using this as scalar cost for a single
3785 iteration for now.
3787 TODO: Add outer loop support.
3789 TODO: Consider assigning different costs to different scalar
3790 statements. */
3792 scalar_single_iter_cost
3793 = LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo);
3795 /* Add additional cost for the peeled instructions in prologue and epilogue
3796 loop. (For fully-masked loops there will be no peeling.)
3798 FORNOW: If we don't know the value of peel_iters for prologue or epilogue
3799 at compile-time - we assume it's vf/2 (the worst would be vf-1).
3801 TODO: Build an expression that represents peel_iters for prologue and
3802 epilogue to be used in a run-time test. */
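/* As a rough illustration of the FORNOW assumption above: with an assumed
   VF of 8 and unknown misalignment (npeel < 0), both peel_iters_prologue
   and peel_iters_epilogue are charged below as 8 / 2 == 4 iterations of
   the scalar loop body.  */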
3804 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
3806 peel_iters_prologue = 0;
3807 peel_iters_epilogue = 0;
3809 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
3811 /* We need to peel exactly one iteration. */
3812 peel_iters_epilogue += 1;
3813 stmt_info_for_cost *si;
3814 int j;
3815 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
3816 j, si)
3818 struct _stmt_vec_info *stmt_info
3819 = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
3820 (void) add_stmt_cost (target_cost_data, si->count,
3821 si->kind, stmt_info, si->misalign,
3822 vect_epilogue);
3826 else if (npeel < 0)
3828 peel_iters_prologue = assumed_vf / 2;
3829 dump_printf (MSG_NOTE, "cost model: "
3830 "prologue peel iters set to vf/2.\n");
3832 /* If peeling for alignment is unknown, the loop bound of the main loop
3833 becomes unknown. */
3834 peel_iters_epilogue = assumed_vf / 2;
3835 dump_printf (MSG_NOTE, "cost model: "
3836 "epilogue peel iters set to vf/2 because "
3837 "peeling for alignment is unknown.\n");
3839 /* If peeled iterations are unknown, count a taken branch and a not taken
3840 branch per peeled loop. Even if scalar loop iterations are known,
3841 vector iterations are not known since peeled prologue iterations are
3842 not known. Hence guards remain the same. */
3843 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
3844 NULL, 0, vect_prologue);
3845 (void) add_stmt_cost (target_cost_data, 1, cond_branch_not_taken,
3846 NULL, 0, vect_prologue);
3847 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
3848 NULL, 0, vect_epilogue);
3849 (void) add_stmt_cost (target_cost_data, 1, cond_branch_not_taken,
3850 NULL, 0, vect_epilogue);
3851 stmt_info_for_cost *si;
3852 int j;
3853 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
3855 struct _stmt_vec_info *stmt_info
3856 = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
3857 (void) add_stmt_cost (target_cost_data,
3858 si->count * peel_iters_prologue,
3859 si->kind, stmt_info, si->misalign,
3860 vect_prologue);
3861 (void) add_stmt_cost (target_cost_data,
3862 si->count * peel_iters_epilogue,
3863 si->kind, stmt_info, si->misalign,
3864 vect_epilogue);
3867 else
3869 stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
3870 stmt_info_for_cost *si;
3871 int j;
3872 void *data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3874 prologue_cost_vec.create (2);
3875 epilogue_cost_vec.create (2);
3876 peel_iters_prologue = npeel;
3878 (void) vect_get_known_peeling_cost (loop_vinfo, peel_iters_prologue,
3879 &peel_iters_epilogue,
3880 &LOOP_VINFO_SCALAR_ITERATION_COST
3881 (loop_vinfo),
3882 &prologue_cost_vec,
3883 &epilogue_cost_vec);
3885 FOR_EACH_VEC_ELT (prologue_cost_vec, j, si)
3887 struct _stmt_vec_info *stmt_info
3888 = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
3889 (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
3890 si->misalign, vect_prologue);
3893 FOR_EACH_VEC_ELT (epilogue_cost_vec, j, si)
3895 struct _stmt_vec_info *stmt_info
3896 = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
3897 (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
3898 si->misalign, vect_epilogue);
3901 prologue_cost_vec.release ();
3902 epilogue_cost_vec.release ();
3905 /* FORNOW: The scalar outside cost is incremented in one of the
3906 following ways:
3908 1. The vectorizer checks for alignment and aliasing and generates
3909 a condition that allows dynamic vectorization. A cost model
3910 check is ANDED with the versioning condition. Hence scalar code
3911 path now has the added cost of the versioning check.
3913 if (cost > th & versioning_check)
3914 jmp to vector code
3916 Hence the run-time scalar cost is incremented by a not-taken branch cost.
3918 2. The vectorizer then checks if a prologue is required. If the
3919 cost model check was not done before during versioning, it has to
3920 be done before the prologue check.
3922 if (cost <= th)
3923 prologue = scalar_iters
3924 if (prologue == 0)
3925 jmp to vector code
3926 else
3927 execute prologue
3928 if (prologue == num_iters)
3929 go to exit
3931 Hence the run-time scalar cost is incremented by a taken branch,
3932 plus a not-taken branch, plus a taken branch cost.
3934 3. The vectorizer then checks if an epilogue is required. If the
3935 cost model check was not done before during prologue check, it
3936 has to be done with the epilogue check.
3938 if (prologue == 0)
3939 jmp to vector code
3940 else
3941 execute prologue
3942 if (prologue == num_iters)
3943 go to exit
3944 vector code:
3945 if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
3946 jmp to epilogue
3948 Hence the run-time scalar cost should be incremented by 2 taken
3949 branches.
3951 TODO: The back end may reorder the BBs differently and reverse
3952 conditions/branch directions. Change the estimates below to
3953 something more reasonable. */
3955 /* If the number of iterations is known and we do not do versioning, we can
3956 decide whether to vectorize at compile time. Hence the scalar version
3957 does not carry cost model guard costs. */
3958 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
3959 || LOOP_REQUIRES_VERSIONING (loop_vinfo))
3961 /* Cost model check occurs at versioning. */
3962 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
3963 scalar_outside_cost += vect_get_stmt_cost (cond_branch_not_taken);
3964 else
3966 /* Cost model check occurs at prologue generation. */
3967 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
3968 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken)
3969 + vect_get_stmt_cost (cond_branch_not_taken);
3970 /* Cost model check occurs at epilogue generation. */
3971 else
3972 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken);
3976 /* Complete the target-specific cost calculations. */
3977 finish_cost (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo), &vec_prologue_cost,
3978 &vec_inside_cost, &vec_epilogue_cost);
3980 vec_outside_cost = (int)(vec_prologue_cost + vec_epilogue_cost);
3982 if (dump_enabled_p ())
3984 dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
3985 dump_printf (MSG_NOTE, " Vector inside of loop cost: %d\n",
3986 vec_inside_cost);
3987 dump_printf (MSG_NOTE, " Vector prologue cost: %d\n",
3988 vec_prologue_cost);
3989 dump_printf (MSG_NOTE, " Vector epilogue cost: %d\n",
3990 vec_epilogue_cost);
3991 dump_printf (MSG_NOTE, " Scalar iteration cost: %d\n",
3992 scalar_single_iter_cost);
3993 dump_printf (MSG_NOTE, " Scalar outside cost: %d\n",
3994 scalar_outside_cost);
3995 dump_printf (MSG_NOTE, " Vector outside cost: %d\n",
3996 vec_outside_cost);
3997 dump_printf (MSG_NOTE, " prologue iterations: %d\n",
3998 peel_iters_prologue);
3999 dump_printf (MSG_NOTE, " epilogue iterations: %d\n",
4000 peel_iters_epilogue);
4003 /* Calculate number of iterations required to make the vector version
4004 profitable, relative to the loop bodies only. The following condition
4005 must hold true:
4006 SIC * niters + SOC > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC
4007 where
4008 SIC = scalar iteration cost, VIC = vector iteration cost,
4009 VOC = vector outside cost, VF = vectorization factor,
4010 PL_ITERS = prologue iterations, EP_ITERS= epilogue iterations
4011 SOC = scalar outside cost for run time cost model check. */
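/* A made-up worked example of the condition above: with SIC = 4, VIC = 6,
   VF = 4, VOC = 40 and SOC = PL_ITERS = EP_ITERS = 0, the scalar loop
   costs 4 * niters while the vector loop costs 6 * (niters / 4) + 40,
   so the vector version only starts to win once niters exceeds 16.  */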
4013 if ((scalar_single_iter_cost * assumed_vf) > (int) vec_inside_cost)
4015 min_profitable_iters = ((vec_outside_cost - scalar_outside_cost)
4016 * assumed_vf
4017 - vec_inside_cost * peel_iters_prologue
4018 - vec_inside_cost * peel_iters_epilogue);
4019 if (min_profitable_iters <= 0)
4020 min_profitable_iters = 0;
4021 else
4023 min_profitable_iters /= ((scalar_single_iter_cost * assumed_vf)
4024 - vec_inside_cost);
4026 if ((scalar_single_iter_cost * assumed_vf * min_profitable_iters)
4027 <= (((int) vec_inside_cost * min_profitable_iters)
4028 + (((int) vec_outside_cost - scalar_outside_cost)
4029 * assumed_vf)))
4030 min_profitable_iters++;
4033 /* vector version will never be profitable. */
4034 else
4036 if (LOOP_VINFO_LOOP (loop_vinfo)->force_vectorize)
4037 warning_at (vect_location, OPT_Wopenmp_simd, "vectorization "
4038 "did not happen for a simd loop");
4040 if (dump_enabled_p ())
4041 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4042 "cost model: the vector iteration cost = %d "
4043 "divided by the scalar iteration cost = %d "
4044 "is greater or equal to the vectorization factor = %d"
4045 ".\n",
4046 vec_inside_cost, scalar_single_iter_cost, assumed_vf);
4047 *ret_min_profitable_niters = -1;
4048 *ret_min_profitable_estimate = -1;
4049 return;
4052 dump_printf (MSG_NOTE,
4053 " Calculated minimum iters for profitability: %d\n",
4054 min_profitable_iters);
4056 if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
4057 && min_profitable_iters < (assumed_vf + peel_iters_prologue))
4058 /* We want the vectorized loop to execute at least once. */
4059 min_profitable_iters = assumed_vf + peel_iters_prologue;
4061 if (dump_enabled_p ())
4062 dump_printf_loc (MSG_NOTE, vect_location,
4063 " Runtime profitability threshold = %d\n",
4064 min_profitable_iters);
4066 *ret_min_profitable_niters = min_profitable_iters;
4068 /* Calculate number of iterations required to make the vector version
4069 profitable, relative to the loop bodies only.
4071 Non-vectorized variant is SIC * niters and it must win over vector
4072 variant on the expected loop trip count. The following condition must hold true:
4073 SIC * niters > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC + SOC */
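/* Continuing the made-up example above with SOC = 10: the static estimate
   becomes ((40 + 10) * 4) / (4 * 4 - 6) == 20 iterations, which is then
   clamped below to at least the runtime threshold.  */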
4075 if (vec_outside_cost <= 0)
4076 min_profitable_estimate = 0;
4077 else
4079 min_profitable_estimate = ((vec_outside_cost + scalar_outside_cost)
4080 * assumed_vf
4081 - vec_inside_cost * peel_iters_prologue
4082 - vec_inside_cost * peel_iters_epilogue)
4083 / ((scalar_single_iter_cost * assumed_vf)
4084 - vec_inside_cost);
4086 min_profitable_estimate = MAX (min_profitable_estimate, min_profitable_iters);
4087 if (dump_enabled_p ())
4088 dump_printf_loc (MSG_NOTE, vect_location,
4089 " Static estimate profitability threshold = %d\n",
4090 min_profitable_estimate);
4092 *ret_min_profitable_estimate = min_profitable_estimate;
4095 /* Writes into SEL a mask for a vec_perm, equivalent to a vec_shr by OFFSET
4096 vector elements (not bits) for a vector with NELT elements. */
4097 static void
4098 calc_vec_perm_mask_for_shift (unsigned int offset, unsigned int nelt,
4099 vec_perm_builder *sel)
4101 /* The encoding is a single stepped pattern. Any wrap-around is handled
4102 by vec_perm_indices. */
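/* For example (a sketch): with OFFSET == 2 the three elements pushed below
   are {2, 3, 4}; for an 8-element vector, vec_perm_indices extends that
   stepped pattern to {2, 3, 4, 5, 6, 7, 8, 9}, i.e. a whole-vector shift
   by two elements.  */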
4103 sel->new_vector (nelt, 1, 3);
4104 for (unsigned int i = 0; i < 3; i++)
4105 sel->quick_push (i + offset);
4108 /* Checks whether the target supports whole-vector shifts for vectors of mode
4109 MODE. This is the case if _either_ the platform handles vec_shr_optab, _or_
4110 it supports vec_perm_const with masks for all necessary shift amounts. */
4111 static bool
4112 have_whole_vector_shift (machine_mode mode)
4114 if (optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
4115 return true;
4117 /* Variable-length vectors should be handled via the optab. */
4118 unsigned int nelt;
4119 if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
4120 return false;
4122 vec_perm_builder sel;
4123 vec_perm_indices indices;
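/* For instance, for an 8-element vector the loop below checks that shifts
   by 4, 2 and 1 elements are supported as constant permutes, i.e. the
   halving shift amounts that a whole-vector-shift reduction would use.  */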
4124 for (unsigned int i = nelt / 2; i >= 1; i /= 2)
4126 calc_vec_perm_mask_for_shift (i, nelt, &sel);
4127 indices.new_vector (sel, 2, nelt);
4128 if (!can_vec_perm_const_p (mode, indices, false))
4129 return false;
4131 return true;
4134 /* TODO: There is a close dependency between the vect_model_*_cost and
4135 vectorizable_* functions. Design better to avoid maintenance issues. */
4137 /* Function vect_model_reduction_cost.
4139 Models cost for a reduction operation, including the vector ops
4140 generated within the strip-mine loop, the initial definition before
4141 the loop, and the epilogue code that must be generated. */
4143 static void
4144 vect_model_reduction_cost (stmt_vec_info stmt_info, internal_fn reduc_fn,
4145 int ncopies)
4147 int prologue_cost = 0, epilogue_cost = 0, inside_cost;
4148 enum tree_code code;
4149 optab optab;
4150 tree vectype;
4151 gimple *orig_stmt;
4152 machine_mode mode;
4153 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4154 struct loop *loop = NULL;
4155 void *target_cost_data;
4157 if (loop_vinfo)
4159 loop = LOOP_VINFO_LOOP (loop_vinfo);
4160 target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
4162 else
4163 target_cost_data = BB_VINFO_TARGET_COST_DATA (STMT_VINFO_BB_VINFO (stmt_info));
4165 /* Condition reductions generate two reductions in the loop. */
4166 vect_reduction_type reduction_type
4167 = STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info);
4168 if (reduction_type == COND_REDUCTION)
4169 ncopies *= 2;
4171 vectype = STMT_VINFO_VECTYPE (stmt_info);
4172 mode = TYPE_MODE (vectype);
4173 orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
4175 if (!orig_stmt)
4176 orig_stmt = STMT_VINFO_STMT (stmt_info);
4178 code = gimple_assign_rhs_code (orig_stmt);
4180 if (reduction_type == EXTRACT_LAST_REDUCTION
4181 || reduction_type == FOLD_LEFT_REDUCTION)
4183 /* No extra instructions needed in the prologue. */
4184 prologue_cost = 0;
4186 if (reduction_type == EXTRACT_LAST_REDUCTION || reduc_fn != IFN_LAST)
4187 /* Count one reduction-like operation per vector. */
4188 inside_cost = add_stmt_cost (target_cost_data, ncopies, vec_to_scalar,
4189 stmt_info, 0, vect_body);
4190 else
4192 /* Use NELEMENTS extracts and NELEMENTS scalar ops. */
4193 unsigned int nelements = ncopies * vect_nunits_for_cost (vectype);
4194 inside_cost = add_stmt_cost (target_cost_data, nelements,
4195 vec_to_scalar, stmt_info, 0,
4196 vect_body);
4197 inside_cost += add_stmt_cost (target_cost_data, nelements,
4198 scalar_stmt, stmt_info, 0,
4199 vect_body);
4202 else
4204 /* Add in cost for initial definition.
4205 For cond reduction we have four vectors: initial index, step,
4206 initial result of the data reduction, initial value of the index
4207 reduction. */
4208 int prologue_stmts = reduction_type == COND_REDUCTION ? 4 : 1;
4209 prologue_cost += add_stmt_cost (target_cost_data, prologue_stmts,
4210 scalar_to_vec, stmt_info, 0,
4211 vect_prologue);
4213 /* Cost of reduction op inside loop. */
4214 inside_cost = add_stmt_cost (target_cost_data, ncopies, vector_stmt,
4215 stmt_info, 0, vect_body);
4218 /* Determine cost of epilogue code.
4220 We have a reduction operator that will reduce the vector in one statement.
4221 Also requires scalar extract. */
4223 if (!loop || !nested_in_vect_loop_p (loop, orig_stmt))
4225 if (reduc_fn != IFN_LAST)
4227 if (reduction_type == COND_REDUCTION)
4229 /* An EQ stmt and a COND_EXPR stmt. */
4230 epilogue_cost += add_stmt_cost (target_cost_data, 2,
4231 vector_stmt, stmt_info, 0,
4232 vect_epilogue);
4233 /* Reduction of the max index and a reduction of the found
4234 values. */
4235 epilogue_cost += add_stmt_cost (target_cost_data, 2,
4236 vec_to_scalar, stmt_info, 0,
4237 vect_epilogue);
4238 /* A broadcast of the max value. */
4239 epilogue_cost += add_stmt_cost (target_cost_data, 1,
4240 scalar_to_vec, stmt_info, 0,
4241 vect_epilogue);
4243 else
4245 epilogue_cost += add_stmt_cost (target_cost_data, 1, vector_stmt,
4246 stmt_info, 0, vect_epilogue);
4247 epilogue_cost += add_stmt_cost (target_cost_data, 1,
4248 vec_to_scalar, stmt_info, 0,
4249 vect_epilogue);
4252 else if (reduction_type == COND_REDUCTION)
4254 unsigned estimated_nunits = vect_nunits_for_cost (vectype);
4255 /* Extraction of scalar elements. */
4256 epilogue_cost += add_stmt_cost (target_cost_data,
4257 2 * estimated_nunits,
4258 vec_to_scalar, stmt_info, 0,
4259 vect_epilogue);
4260 /* Scalar max reductions via COND_EXPR / MAX_EXPR. */
4261 epilogue_cost += add_stmt_cost (target_cost_data,
4262 2 * estimated_nunits - 3,
4263 scalar_stmt, stmt_info, 0,
4264 vect_epilogue);
4266 else if (reduction_type == EXTRACT_LAST_REDUCTION
4267 || reduction_type == FOLD_LEFT_REDUCTION)
4268 /* No extra instructions needed in the epilogue. */
4270 else
4272 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
4273 tree bitsize =
4274 TYPE_SIZE (TREE_TYPE (gimple_assign_lhs (orig_stmt)));
4275 int element_bitsize = tree_to_uhwi (bitsize);
4276 int nelements = vec_size_in_bits / element_bitsize;
4278 if (code == COND_EXPR)
4279 code = MAX_EXPR;
4281 optab = optab_for_tree_code (code, vectype, optab_default);
4283 /* We have a whole vector shift available. */
4284 if (optab != unknown_optab
4285 && VECTOR_MODE_P (mode)
4286 && optab_handler (optab, mode) != CODE_FOR_nothing
4287 && have_whole_vector_shift (mode))
4289 /* Final reduction via vector shifts and the reduction operator.
4290 Also requires scalar extract. */
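/* As a rough example: for 8 elements this charges exact_log2 (8) * 2 == 6
   vector statements (three shifts plus three reduction ops) and one
   vec_to_scalar extract.  */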
4291 epilogue_cost += add_stmt_cost (target_cost_data,
4292 exact_log2 (nelements) * 2,
4293 vector_stmt, stmt_info, 0,
4294 vect_epilogue);
4295 epilogue_cost += add_stmt_cost (target_cost_data, 1,
4296 vec_to_scalar, stmt_info, 0,
4297 vect_epilogue);
4299 else
4300 /* Use extracts and reduction op for final reduction. For N
4301 elements, we have N extracts and N-1 reduction ops. */
4302 epilogue_cost += add_stmt_cost (target_cost_data,
4303 nelements + nelements - 1,
4304 vector_stmt, stmt_info, 0,
4305 vect_epilogue);
4309 if (dump_enabled_p ())
4310 dump_printf (MSG_NOTE,
4311 "vect_model_reduction_cost: inside_cost = %d, "
4312 "prologue_cost = %d, epilogue_cost = %d .\n", inside_cost,
4313 prologue_cost, epilogue_cost);
4317 /* Function vect_model_induction_cost.
4319 Models cost for induction operations. */
4321 static void
4322 vect_model_induction_cost (stmt_vec_info stmt_info, int ncopies)
4324 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4325 void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
4326 unsigned inside_cost, prologue_cost;
4328 if (PURE_SLP_STMT (stmt_info))
4329 return;
4331 /* loop cost for vec_loop. */
4332 inside_cost = add_stmt_cost (target_cost_data, ncopies, vector_stmt,
4333 stmt_info, 0, vect_body);
4335 /* prologue cost for vec_init and vec_step. */
4336 prologue_cost = add_stmt_cost (target_cost_data, 2, scalar_to_vec,
4337 stmt_info, 0, vect_prologue);
4339 if (dump_enabled_p ())
4340 dump_printf_loc (MSG_NOTE, vect_location,
4341 "vect_model_induction_cost: inside_cost = %d, "
4342 "prologue_cost = %d .\n", inside_cost, prologue_cost);
4347 /* Function get_initial_def_for_reduction
4349 Input:
4350 STMT - a stmt that performs a reduction operation in the loop.
4351 INIT_VAL - the initial value of the reduction variable
4353 Output:
4354 ADJUSTMENT_DEF - a tree that holds a value to be added to the final result
4355 of the reduction (used for adjusting the epilog - see below).
4356 Return a vector variable, initialized according to the operation that STMT
4357 performs. This vector will be used as the initial value of the
4358 vector of partial results.
4360 Option1 (adjust in epilog): Initialize the vector as follows:
4361 add/bit or/xor: [0,0,...,0,0]
4362 mult/bit and: [1,1,...,1,1]
4363 min/max/cond_expr: [init_val,init_val,..,init_val,init_val]
4364 and when necessary (e.g. add/mult case) let the caller know
4365 that it needs to adjust the result by init_val.
4367 Option2: Initialize the vector as follows:
4368 add/bit or/xor: [init_val,0,0,...,0]
4369 mult/bit and: [init_val,1,1,...,1]
4370 min/max/cond_expr: [init_val,init_val,...,init_val]
4371 and no adjustments are needed.
4373 For example, for the following code:
4375 s = init_val;
4376 for (i=0;i<n;i++)
4377 s = s + a[i];
4379 STMT is 's = s + a[i]', and the reduction variable is 's'.
4380 For a vector of 4 units, we want to return either [0,0,0,init_val],
4381 or [0,0,0,0] and let the caller know that it needs to adjust
4382 the result at the end by 'init_val'.
4384 FORNOW, we use the 'adjust in epilog' scheme (Option1) when ADJUSTMENT_DEF
4385 is not NULL, because this way the initialization vector is simpler (same
4386 element in all entries), and Option2 otherwise.
4388 A cost model should help decide between these two schemes. */
4390 tree
4391 get_initial_def_for_reduction (gimple *stmt, tree init_val,
4392 tree *adjustment_def)
4394 stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
4395 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
4396 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4397 tree scalar_type = TREE_TYPE (init_val);
4398 tree vectype = get_vectype_for_scalar_type (scalar_type);
4399 enum tree_code code = gimple_assign_rhs_code (stmt);
4400 tree def_for_init;
4401 tree init_def;
4402 bool nested_in_vect_loop = false;
4403 REAL_VALUE_TYPE real_init_val = dconst0;
4404 int int_init_val = 0;
4405 gimple *def_stmt = NULL;
4406 gimple_seq stmts = NULL;
4408 gcc_assert (vectype);
4410 gcc_assert (POINTER_TYPE_P (scalar_type) || INTEGRAL_TYPE_P (scalar_type)
4411 || SCALAR_FLOAT_TYPE_P (scalar_type));
4413 if (nested_in_vect_loop_p (loop, stmt))
4414 nested_in_vect_loop = true;
4415 else
4416 gcc_assert (loop == (gimple_bb (stmt))->loop_father);
4418 /* In case of double reduction we only create a vector variable to be put
4419 in the reduction phi node. The actual statement creation is done in
4420 vect_create_epilog_for_reduction. */
4421 if (adjustment_def && nested_in_vect_loop
4422 && TREE_CODE (init_val) == SSA_NAME
4423 && (def_stmt = SSA_NAME_DEF_STMT (init_val))
4424 && gimple_code (def_stmt) == GIMPLE_PHI
4425 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
4426 && vinfo_for_stmt (def_stmt)
4427 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
4428 == vect_double_reduction_def)
4430 *adjustment_def = NULL;
4431 return vect_create_destination_var (init_val, vectype);
4434 vect_reduction_type reduction_type
4435 = STMT_VINFO_VEC_REDUCTION_TYPE (stmt_vinfo);
4437 /* In case of a nested reduction do not use an adjustment def, as
4438 that case is not handled correctly by the epilogue generation
4439 if ncopies is not one. */
4440 if (adjustment_def && nested_in_vect_loop)
4442 *adjustment_def = NULL;
4443 return vect_get_vec_def_for_operand (init_val, stmt);
4446 switch (code)
4448 case WIDEN_SUM_EXPR:
4449 case DOT_PROD_EXPR:
4450 case SAD_EXPR:
4451 case PLUS_EXPR:
4452 case MINUS_EXPR:
4453 case BIT_IOR_EXPR:
4454 case BIT_XOR_EXPR:
4455 case MULT_EXPR:
4456 case BIT_AND_EXPR:
4458 /* ADJUSTMENT_DEF is NULL when called from
4459 vect_create_epilog_for_reduction to vectorize double reduction. */
4460 if (adjustment_def)
4461 *adjustment_def = init_val;
4463 if (code == MULT_EXPR)
4465 real_init_val = dconst1;
4466 int_init_val = 1;
4469 if (code == BIT_AND_EXPR)
4470 int_init_val = -1;
4472 if (SCALAR_FLOAT_TYPE_P (scalar_type))
4473 def_for_init = build_real (scalar_type, real_init_val);
4474 else
4475 def_for_init = build_int_cst (scalar_type, int_init_val);
4477 if (adjustment_def)
4478 /* Option1: the first element is '0' or '1' as well. */
4479 init_def = gimple_build_vector_from_val (&stmts, vectype,
4480 def_for_init);
4481 else if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant ())
4483 /* Option2 (variable length): the first element is INIT_VAL. */
4484 init_def = build_vector_from_val (vectype, def_for_init);
4485 gcall *call = gimple_build_call_internal (IFN_VEC_SHL_INSERT,
4486 2, init_def, init_val);
4487 init_def = make_ssa_name (vectype);
4488 gimple_call_set_lhs (call, init_def);
4489 gimple_seq_add_stmt (&stmts, call);
4491 else
4493 /* Option2: the first element is INIT_VAL. */
4494 tree_vector_builder elts (vectype, 1, 2);
4495 elts.quick_push (init_val);
4496 elts.quick_push (def_for_init);
4497 init_def = gimple_build_vector (&stmts, &elts);
4500 break;
4502 case MIN_EXPR:
4503 case MAX_EXPR:
4504 case COND_EXPR:
4506 if (adjustment_def)
4508 *adjustment_def = NULL_TREE;
4509 if (reduction_type != COND_REDUCTION
4510 && reduction_type != EXTRACT_LAST_REDUCTION)
4512 init_def = vect_get_vec_def_for_operand (init_val, stmt);
4513 break;
4516 init_val = gimple_convert (&stmts, TREE_TYPE (vectype), init_val);
4517 init_def = gimple_build_vector_from_val (&stmts, vectype, init_val);
4519 break;
4521 default:
4522 gcc_unreachable ();
4525 if (stmts)
4526 gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), stmts);
4527 return init_def;
4530 /* Get at the initial defs for the reduction PHIs in SLP_NODE.
4531 NUMBER_OF_VECTORS is the number of vector defs to create.
4532 If NEUTRAL_OP is nonnull, introducing extra elements of that
4533 value will not change the result. */
4535 static void
4536 get_initial_defs_for_reduction (slp_tree slp_node,
4537 vec<tree> *vec_oprnds,
4538 unsigned int number_of_vectors,
4539 bool reduc_chain, tree neutral_op)
4541 vec<gimple *> stmts = SLP_TREE_SCALAR_STMTS (slp_node);
4542 gimple *stmt = stmts[0];
4543 stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
4544 unsigned HOST_WIDE_INT nunits;
4545 unsigned j, number_of_places_left_in_vector;
4546 tree vector_type;
4547 tree vop;
4548 int group_size = stmts.length ();
4549 unsigned int vec_num, i;
4550 unsigned number_of_copies = 1;
4551 vec<tree> voprnds;
4552 voprnds.create (number_of_vectors);
4553 struct loop *loop;
4554 auto_vec<tree, 16> permute_results;
4556 vector_type = STMT_VINFO_VECTYPE (stmt_vinfo);
4558 gcc_assert (STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_reduction_def);
4560 loop = (gimple_bb (stmt))->loop_father;
4561 gcc_assert (loop);
4562 edge pe = loop_preheader_edge (loop);
4564 gcc_assert (!reduc_chain || neutral_op);
4566 /* NUMBER_OF_COPIES is the number of times we need to use the same values in
4567 created vectors. It is greater than 1 if unrolling is performed.
4569 For example, we have two scalar operands, s1 and s2 (e.g., group of
4570 strided accesses of size two), while NUNITS is four (i.e., four scalars
4571 of this type can be packed in a vector). The output vector will contain
4572 two copies of each scalar operand: {s1, s2, s1, s2}. (NUMBER_OF_COPIES
4573 will be 2).
4575 If GROUP_SIZE > NUNITS, the scalars will be split into several vectors
4576 containing the operands.
4578 For example, NUNITS is four as before, and the group size is 8
4579 (s1, s2, ..., s8). We will create two vectors {s1, s2, s3, s4} and
4580 {s5, s6, s7, s8}. */
4582 if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits))
4583 nunits = group_size;
4585 number_of_copies = nunits * number_of_vectors / group_size;
4587 number_of_places_left_in_vector = nunits;
4588 bool constant_p = true;
4589 tree_vector_builder elts (vector_type, nunits, 1);
4590 elts.quick_grow (nunits);
4591 for (j = 0; j < number_of_copies; j++)
4593 for (i = group_size - 1; stmts.iterate (i, &stmt); i--)
4595 tree op;
4596 /* Get the def before the loop. In a reduction chain we have only
4597 one initial value. */
4598 if ((j != (number_of_copies - 1)
4599 || (reduc_chain && i != 0))
4600 && neutral_op)
4601 op = neutral_op;
4602 else
4603 op = PHI_ARG_DEF_FROM_EDGE (stmt, pe);
4605 /* Create 'vect_ = {op0,op1,...,opn}'. */
4606 number_of_places_left_in_vector--;
4607 elts[number_of_places_left_in_vector] = op;
4608 if (!CONSTANT_CLASS_P (op))
4609 constant_p = false;
4611 if (number_of_places_left_in_vector == 0)
4613 gimple_seq ctor_seq = NULL;
4614 tree init;
4615 if (constant_p && !neutral_op
4616 ? multiple_p (TYPE_VECTOR_SUBPARTS (vector_type), nunits)
4617 : known_eq (TYPE_VECTOR_SUBPARTS (vector_type), nunits))
4618 /* Build the vector directly from ELTS. */
4619 init = gimple_build_vector (&ctor_seq, &elts);
4620 else if (neutral_op)
4622 /* Build a vector of the neutral value and shift the
4623 other elements into place. */
4624 init = gimple_build_vector_from_val (&ctor_seq, vector_type,
4625 neutral_op);
4626 int k = nunits;
4627 while (k > 0 && elts[k - 1] == neutral_op)
4628 k -= 1;
4629 while (k > 0)
4631 k -= 1;
4632 gcall *call = gimple_build_call_internal
4633 (IFN_VEC_SHL_INSERT, 2, init, elts[k]);
4634 init = make_ssa_name (vector_type);
4635 gimple_call_set_lhs (call, init);
4636 gimple_seq_add_stmt (&ctor_seq, call);
4639 else
4641 /* First time round, duplicate ELTS to fill the
4642 required number of vectors, then cherry pick the
4643 appropriate result for each iteration. */
4644 if (vec_oprnds->is_empty ())
4645 duplicate_and_interleave (&ctor_seq, vector_type, elts,
4646 number_of_vectors,
4647 permute_results);
4648 init = permute_results[number_of_vectors - j - 1];
4650 if (ctor_seq != NULL)
4651 gsi_insert_seq_on_edge_immediate (pe, ctor_seq);
4652 voprnds.quick_push (init);
4654 number_of_places_left_in_vector = nunits;
4655 elts.new_vector (vector_type, nunits, 1);
4656 elts.quick_grow (nunits);
4657 constant_p = true;
4662 /* Since the vectors are created in the reverse order, we should invert
4663 them. */
4664 vec_num = voprnds.length ();
4665 for (j = vec_num; j != 0; j--)
4667 vop = voprnds[j - 1];
4668 vec_oprnds->quick_push (vop);
4671 voprnds.release ();
4673 /* In case that VF is greater than the unrolling factor needed for the SLP
4674 group of stmts, NUMBER_OF_VECTORS to be created is greater than
4675 NUMBER_OF_SCALARS/NUNITS or NUNITS/NUMBER_OF_SCALARS, and hence we have
4676 to replicate the vectors. */
4677 tree neutral_vec = NULL;
4678 while (number_of_vectors > vec_oprnds->length ())
4680 if (neutral_op)
4682 if (!neutral_vec)
4684 gimple_seq ctor_seq = NULL;
4685 neutral_vec = gimple_build_vector_from_val
4686 (&ctor_seq, vector_type, neutral_op);
4687 if (ctor_seq != NULL)
4688 gsi_insert_seq_on_edge_immediate (pe, ctor_seq);
4690 vec_oprnds->quick_push (neutral_vec);
4692 else
4694 for (i = 0; vec_oprnds->iterate (i, &vop) && i < vec_num; i++)
4695 vec_oprnds->quick_push (vop);
4701 /* Function vect_create_epilog_for_reduction
4703 Create code at the loop-epilog to finalize the result of a reduction
4704 computation.
4706 VECT_DEFS is list of vector of partial results, i.e., the lhs's of vector
4707 reduction statements.
4708 STMT is the scalar reduction stmt that is being vectorized.
4709 NCOPIES is > 1 in case the vectorization factor (VF) is bigger than the
4710 number of elements that we can fit in a vectype (nunits). In this case
4711 we have to generate more than one vector stmt - i.e - we need to "unroll"
4712 the vector stmt by a factor VF/nunits. For more details see documentation
4713 in vectorizable_operation.
4714 REDUC_FN is the internal function for the epilog reduction.
4715 REDUCTION_PHIS is a list of the phi-nodes that carry the reduction
4716 computation.
4717 REDUC_INDEX is the index of the operand in the right hand side of the
4718 statement that is defined by REDUCTION_PHI.
4719 DOUBLE_REDUC is TRUE if double reduction phi nodes should be handled.
4720 SLP_NODE is an SLP node containing a group of reduction statements. The
4721 first one in this group is STMT.
4722 INDUC_VAL is for INTEGER_INDUC_COND_REDUCTION the value to use for the case
4723 when the COND_EXPR is never true in the loop. For MAX_EXPR, it needs to
4724 be smaller than any value of the IV in the loop, for MIN_EXPR larger than
4725 any value of the IV in the loop.
4726 INDUC_CODE is the code for epilog reduction if INTEGER_INDUC_COND_REDUCTION.
4727 NEUTRAL_OP is the value given by neutral_op_for_slp_reduction; it is
4728 null if this is not an SLP reduction
4730 This function:
4731 1. Creates the reduction def-use cycles: sets the arguments for
4732 REDUCTION_PHIS:
4733 The loop-entry argument is the vectorized initial-value of the reduction.
4734 The loop-latch argument is taken from VECT_DEFS - the vector of partial
4735 sums.
4736 2. "Reduces" each vector of partial results VECT_DEFS into a single result,
4737 by calling the function specified by REDUC_FN if available, or by
4738 other means (whole-vector shifts or a scalar loop).
4739 The function also creates a new phi node at the loop exit to preserve
4740 loop-closed form, as illustrated below.
4742 The flow at the entry to this function:
4744 loop:
4745 vec_def = phi <null, null> # REDUCTION_PHI
4746 VECT_DEF = vector_stmt # vectorized form of STMT
4747 s_loop = scalar_stmt # (scalar) STMT
4748 loop_exit:
4749 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
4750 use <s_out0>
4751 use <s_out0>
4753 The above is transformed by this function into:
4755 loop:
4756 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
4757 VECT_DEF = vector_stmt # vectorized form of STMT
4758 s_loop = scalar_stmt # (scalar) STMT
4759 loop_exit:
4760 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
4761 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
4762 v_out2 = reduce <v_out1>
4763 s_out3 = extract_field <v_out2, 0>
4764 s_out4 = adjust_result <s_out3>
4765 use <s_out4>
4766 use <s_out4>
4769 static void
4770 vect_create_epilog_for_reduction (vec<tree> vect_defs, gimple *stmt,
4771 gimple *reduc_def_stmt,
4772 int ncopies, internal_fn reduc_fn,
4773 vec<gimple *> reduction_phis,
4774 bool double_reduc,
4775 slp_tree slp_node,
4776 slp_instance slp_node_instance,
4777 tree induc_val, enum tree_code induc_code,
4778 tree neutral_op)
4780 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4781 stmt_vec_info prev_phi_info;
4782 tree vectype;
4783 machine_mode mode;
4784 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4785 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *outer_loop = NULL;
4786 basic_block exit_bb;
4787 tree scalar_dest;
4788 tree scalar_type;
4789 gimple *new_phi = NULL, *phi;
4790 gimple_stmt_iterator exit_gsi;
4791 tree vec_dest;
4792 tree new_temp = NULL_TREE, new_dest, new_name, new_scalar_dest;
4793 gimple *epilog_stmt = NULL;
4794 enum tree_code code = gimple_assign_rhs_code (stmt);
4795 gimple *exit_phi;
4796 tree bitsize;
4797 tree adjustment_def = NULL;
4798 tree vec_initial_def = NULL;
4799 tree expr, def, initial_def = NULL;
4800 tree orig_name, scalar_result;
4801 imm_use_iterator imm_iter, phi_imm_iter;
4802 use_operand_p use_p, phi_use_p;
4803 gimple *use_stmt, *orig_stmt, *reduction_phi = NULL;
4804 bool nested_in_vect_loop = false;
4805 auto_vec<gimple *> new_phis;
4806 auto_vec<gimple *> inner_phis;
4807 enum vect_def_type dt = vect_unknown_def_type;
4808 int j, i;
4809 auto_vec<tree> scalar_results;
4810 unsigned int group_size = 1, k, ratio;
4811 auto_vec<tree> vec_initial_defs;
4812 auto_vec<gimple *> phis;
4813 bool slp_reduc = false;
4814 bool direct_slp_reduc;
4815 tree new_phi_result;
4816 gimple *inner_phi = NULL;
4817 tree induction_index = NULL_TREE;
4819 if (slp_node)
4820 group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
4822 if (nested_in_vect_loop_p (loop, stmt))
4824 outer_loop = loop;
4825 loop = loop->inner;
4826 nested_in_vect_loop = true;
4827 gcc_assert (!slp_node);
4830 vectype = STMT_VINFO_VECTYPE (stmt_info);
4831 gcc_assert (vectype);
4832 mode = TYPE_MODE (vectype);
4834 /* 1. Create the reduction def-use cycle:
4835 Set the arguments of REDUCTION_PHIS, i.e., transform
4837 loop:
4838 vec_def = phi <null, null> # REDUCTION_PHI
4839 VECT_DEF = vector_stmt # vectorized form of STMT
4842 into:
4844 loop:
4845 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
4846 VECT_DEF = vector_stmt # vectorized form of STMT
4849 (in case of SLP, do it for all the phis). */
4851 /* Get the loop-entry arguments. */
4852 enum vect_def_type initial_def_dt = vect_unknown_def_type;
4853 if (slp_node)
4855 unsigned vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
4856 vec_initial_defs.reserve (vec_num);
4857 get_initial_defs_for_reduction (slp_node_instance->reduc_phis,
4858 &vec_initial_defs, vec_num,
4859 GROUP_FIRST_ELEMENT (stmt_info),
4860 neutral_op);
4862 else
4864 /* Get at the scalar def before the loop, that defines the initial value
4865 of the reduction variable. */
4866 gimple *def_stmt;
4867 initial_def = PHI_ARG_DEF_FROM_EDGE (reduc_def_stmt,
4868 loop_preheader_edge (loop));
4869 /* Optimize: if initial_def is for REDUC_MAX smaller than the base
4870 and we can't use zero for induc_val, use initial_def. Similarly
4871 for REDUC_MIN and initial_def larger than the base. */
4872 if (TREE_CODE (initial_def) == INTEGER_CST
4873 && (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
4874 == INTEGER_INDUC_COND_REDUCTION)
4875 && !integer_zerop (induc_val)
4876 && ((induc_code == MAX_EXPR
4877 && tree_int_cst_lt (initial_def, induc_val))
4878 || (induc_code == MIN_EXPR
4879 && tree_int_cst_lt (induc_val, initial_def))))
4880 induc_val = initial_def;
4881 vect_is_simple_use (initial_def, loop_vinfo, &def_stmt, &initial_def_dt);
4882 vec_initial_def = get_initial_def_for_reduction (stmt, initial_def,
4883 &adjustment_def);
4884 vec_initial_defs.create (1);
4885 vec_initial_defs.quick_push (vec_initial_def);
4888 /* Set phi nodes arguments. */
4889 FOR_EACH_VEC_ELT (reduction_phis, i, phi)
4891 tree vec_init_def = vec_initial_defs[i];
4892 tree def = vect_defs[i];
4893 for (j = 0; j < ncopies; j++)
4895 if (j != 0)
4897 phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi));
4898 if (nested_in_vect_loop)
4899 vec_init_def
4900 = vect_get_vec_def_for_stmt_copy (initial_def_dt,
4901 vec_init_def);
4904 /* Set the loop-entry arg of the reduction-phi. */
4906 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
4907 == INTEGER_INDUC_COND_REDUCTION)
4909 /* Initialise the reduction phi to zero. This prevents non-zero
4910 initial values from interfering with the reduction op. */
4911 gcc_assert (ncopies == 1);
4912 gcc_assert (i == 0);
4914 tree vec_init_def_type = TREE_TYPE (vec_init_def);
4915 tree induc_val_vec
4916 = build_vector_from_val (vec_init_def_type, induc_val);
4918 add_phi_arg (as_a <gphi *> (phi), induc_val_vec,
4919 loop_preheader_edge (loop), UNKNOWN_LOCATION);
4921 else
4922 add_phi_arg (as_a <gphi *> (phi), vec_init_def,
4923 loop_preheader_edge (loop), UNKNOWN_LOCATION);
4925 /* Set the loop-latch arg for the reduction-phi. */
4926 if (j > 0)
4927 def = vect_get_vec_def_for_stmt_copy (vect_unknown_def_type, def);
4929 add_phi_arg (as_a <gphi *> (phi), def, loop_latch_edge (loop),
4930 UNKNOWN_LOCATION);
4932 if (dump_enabled_p ())
4934 dump_printf_loc (MSG_NOTE, vect_location,
4935 "transform reduction: created def-use cycle: ");
4936 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
4937 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, SSA_NAME_DEF_STMT (def), 0);
4942 /* For cond reductions we want to create a new vector (INDEX_COND_EXPR)
4943 which is updated with the current index of the loop for every match of
4944 the original loop's cond_expr (VEC_STMT). This results in a vector
4945 containing the last time the condition passed for that vector lane.
4946 The first match will be a 1 to allow 0 to be used for non-matching
4947 indexes. If there are no matches at all then the vector will be all
4948 zeroes. */
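/* A small made-up example: with 4 lanes and two vector iterations the
   induction index takes the values {1, 2, 3, 4} and then {5, 6, 7, 8}.
   If lane 1 matches only in the first iteration and lane 3 matches in
   both, the final index vector is {0, 2, 0, 8}; lanes that never match
   stay 0.  */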
4949 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION)
4951 tree indx_before_incr, indx_after_incr;
4952 poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype);
4954 gimple *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info);
4955 gcc_assert (gimple_assign_rhs_code (vec_stmt) == VEC_COND_EXPR);
4957 int scalar_precision
4958 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (TREE_TYPE (vectype)));
4959 tree cr_index_scalar_type = make_unsigned_type (scalar_precision);
4960 tree cr_index_vector_type = build_vector_type
4961 (cr_index_scalar_type, TYPE_VECTOR_SUBPARTS (vectype));
4963 /* First we create a simple vector induction variable which starts
4964 with the values {1,2,3,...} (SERIES_VECT) and increments by the
4965 vector size (STEP). */
4967 /* Create a {1,2,3,...} vector. */
4968 tree series_vect = build_index_vector (cr_index_vector_type, 1, 1);
4970 /* Create a vector of the step value. */
4971 tree step = build_int_cst (cr_index_scalar_type, nunits_out);
4972 tree vec_step = build_vector_from_val (cr_index_vector_type, step);
4974 /* Create an induction variable. */
4975 gimple_stmt_iterator incr_gsi;
4976 bool insert_after;
4977 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
4978 create_iv (series_vect, vec_step, NULL_TREE, loop, &incr_gsi,
4979 insert_after, &indx_before_incr, &indx_after_incr);
4981 /* Next create a new phi node vector (NEW_PHI_TREE) which starts
4982 filled with zeros (VEC_ZERO). */
4984 /* Create a vector of 0s. */
4985 tree zero = build_zero_cst (cr_index_scalar_type);
4986 tree vec_zero = build_vector_from_val (cr_index_vector_type, zero);
4988 /* Create a vector phi node. */
4989 tree new_phi_tree = make_ssa_name (cr_index_vector_type);
4990 new_phi = create_phi_node (new_phi_tree, loop->header);
4991 set_vinfo_for_stmt (new_phi,
4992 new_stmt_vec_info (new_phi, loop_vinfo));
4993 add_phi_arg (as_a <gphi *> (new_phi), vec_zero,
4994 loop_preheader_edge (loop), UNKNOWN_LOCATION);
4996 /* Now take the condition from the loop's original cond_expr
4997 (VEC_STMT) and produce a new cond_expr (INDEX_COND_EXPR) which for
4998 every match uses values from the induction variable
4999 (INDEX_BEFORE_INCR) otherwise uses values from the phi node
5000 (NEW_PHI_TREE).
5001 Finally, we update the phi (NEW_PHI_TREE) to take the value of
5002 the new cond_expr (INDEX_COND_EXPR). */
5004 /* Duplicate the condition from vec_stmt. */
5005 tree ccompare = unshare_expr (gimple_assign_rhs1 (vec_stmt));
5007 /* Create a conditional, where the condition is taken from vec_stmt
5008 (CCOMPARE), then is the induction index (INDEX_BEFORE_INCR) and
5009 else is the phi (NEW_PHI_TREE). */
5010 tree index_cond_expr = build3 (VEC_COND_EXPR, cr_index_vector_type,
5011 ccompare, indx_before_incr,
5012 new_phi_tree);
5013 induction_index = make_ssa_name (cr_index_vector_type);
5014 gimple *index_condition = gimple_build_assign (induction_index,
5015 index_cond_expr);
5016 gsi_insert_before (&incr_gsi, index_condition, GSI_SAME_STMT);
5017 stmt_vec_info index_vec_info = new_stmt_vec_info (index_condition,
5018 loop_vinfo);
5019 STMT_VINFO_VECTYPE (index_vec_info) = cr_index_vector_type;
5020 set_vinfo_for_stmt (index_condition, index_vec_info);
5022 /* Update the phi with the vec cond. */
5023 add_phi_arg (as_a <gphi *> (new_phi), induction_index,
5024 loop_latch_edge (loop), UNKNOWN_LOCATION);
5027 /* 2. Create epilog code.
5028 The reduction epilog code operates across the elements of the vector
5029 of partial results computed by the vectorized loop.
5030 The reduction epilog code consists of:
5032 step 1: compute the scalar result in a vector (v_out2)
5033 step 2: extract the scalar result (s_out3) from the vector (v_out2)
5034 step 3: adjust the scalar result (s_out3) if needed.
5036 Step 1 can be accomplished using one the following three schemes:
5037 (scheme 1) using reduc_fn, if available.
5038 (scheme 2) using whole-vector shifts, if available.
5039 (scheme 3) using a scalar loop. In this case steps 1+2 above are
5040 combined.
5042 The overall epilog code looks like this:
5044 s_out0 = phi <s_loop> # original EXIT_PHI
5045 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
5046 v_out2 = reduce <v_out1> # step 1
5047 s_out3 = extract_field <v_out2, 0> # step 2
5048 s_out4 = adjust_result <s_out3> # step 3
5050 (step 3 is optional, and steps 1 and 2 may be combined).
5051 Lastly, the uses of s_out0 are replaced by s_out4. */
5054 /* 2.1 Create new loop-exit-phis to preserve loop-closed form:
5055 v_out1 = phi <VECT_DEF>
5056 Store them in NEW_PHIS. */
5058 exit_bb = single_exit (loop)->dest;
5059 prev_phi_info = NULL;
5060 new_phis.create (vect_defs.length ());
5061 FOR_EACH_VEC_ELT (vect_defs, i, def)
5063 for (j = 0; j < ncopies; j++)
5065 tree new_def = copy_ssa_name (def);
5066 phi = create_phi_node (new_def, exit_bb);
5067 set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, loop_vinfo));
5068 if (j == 0)
5069 new_phis.quick_push (phi);
5070 else
5072 def = vect_get_vec_def_for_stmt_copy (dt, def);
5073 STMT_VINFO_RELATED_STMT (prev_phi_info) = phi;
5076 SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, def);
5077 prev_phi_info = vinfo_for_stmt (phi);
5081 /* The epilogue is created for the outer-loop, i.e., for the loop being
5082 vectorized. Create exit phis for the outer loop. */
5083 if (double_reduc)
5085 loop = outer_loop;
5086 exit_bb = single_exit (loop)->dest;
5087 inner_phis.create (vect_defs.length ());
5088 FOR_EACH_VEC_ELT (new_phis, i, phi)
5090 tree new_result = copy_ssa_name (PHI_RESULT (phi));
5091 gphi *outer_phi = create_phi_node (new_result, exit_bb);
5092 SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx,
5093 PHI_RESULT (phi));
5094 set_vinfo_for_stmt (outer_phi, new_stmt_vec_info (outer_phi,
5095 loop_vinfo));
5096 inner_phis.quick_push (phi);
5097 new_phis[i] = outer_phi;
5098 prev_phi_info = vinfo_for_stmt (outer_phi);
5099 while (STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi)))
5101 phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi));
5102 new_result = copy_ssa_name (PHI_RESULT (phi));
5103 outer_phi = create_phi_node (new_result, exit_bb);
5104 SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx,
5105 PHI_RESULT (phi));
5106 set_vinfo_for_stmt (outer_phi, new_stmt_vec_info (outer_phi,
5107 loop_vinfo));
5108 STMT_VINFO_RELATED_STMT (prev_phi_info) = outer_phi;
5109 prev_phi_info = vinfo_for_stmt (outer_phi);
5114 exit_gsi = gsi_after_labels (exit_bb);
5116 /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
5117 (i.e. when reduc_fn is not available) and in the final adjustment
5118 code (if needed). Also get the original scalar reduction variable as
5119 defined in the loop. In case STMT is a "pattern-stmt" (i.e. - it
5120 represents a reduction pattern), the tree-code and scalar-def are
5121 taken from the original stmt that the pattern-stmt (STMT) replaces.
5122 Otherwise (it is a regular reduction) - the tree-code and scalar-def
5123 are taken from STMT. */
5125 orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
5126 if (!orig_stmt)
5128 /* Regular reduction */
5129 orig_stmt = stmt;
5131 else
5133 /* Reduction pattern */
5134 stmt_vec_info stmt_vinfo = vinfo_for_stmt (orig_stmt);
5135 gcc_assert (STMT_VINFO_IN_PATTERN_P (stmt_vinfo));
5136 gcc_assert (STMT_VINFO_RELATED_STMT (stmt_vinfo) == stmt);
5139 code = gimple_assign_rhs_code (orig_stmt);
5140 /* For MINUS_EXPR the initial vector is [init_val,0,...,0], therefore,
5141 partial results are added and not subtracted. */
5142 if (code == MINUS_EXPR)
5143 code = PLUS_EXPR;
5145 scalar_dest = gimple_assign_lhs (orig_stmt);
5146 scalar_type = TREE_TYPE (scalar_dest);
5147 scalar_results.create (group_size);
5148 new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
5149 bitsize = TYPE_SIZE (scalar_type);
5151 /* In case this is a reduction in an inner-loop while vectorizing an outer
5152 loop - we don't need to extract a single scalar result at the end of the
5153 inner-loop (unless it is double reduction, i.e., the use of reduction is
5154 outside the outer-loop). The final vector of partial results will be used
5155 in the vectorized outer-loop, or reduced to a scalar result at the end of
5156 the outer-loop. */
5157 if (nested_in_vect_loop && !double_reduc)
5158 goto vect_finalize_reduction;
5160 /* SLP reduction without reduction chain, e.g.,
5161 # a1 = phi <a2, a0>
5162 # b1 = phi <b2, b0>
5163 a2 = operation (a1)
5164 b2 = operation (b1) */
5165 slp_reduc = (slp_node && !GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)));
5167 /* True if we should implement SLP_REDUC using native reduction operations
5168 instead of scalar operations. */
5169 direct_slp_reduc = (reduc_fn != IFN_LAST
5170 && slp_reduc
5171 && !TYPE_VECTOR_SUBPARTS (vectype).is_constant ());
5173 /* In case of reduction chain, e.g.,
5174 # a1 = phi <a3, a0>
5175 a2 = operation (a1)
5176 a3 = operation (a2),
5178 we may end up with more than one vector result. Here we reduce them to
5179 one vector. */
5180 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) || direct_slp_reduc)
5182 tree first_vect = PHI_RESULT (new_phis[0]);
5183 gassign *new_vec_stmt = NULL;
5184 vec_dest = vect_create_destination_var (scalar_dest, vectype);
5185 for (k = 1; k < new_phis.length (); k++)
5187 gimple *next_phi = new_phis[k];
5188 tree second_vect = PHI_RESULT (next_phi);
5189 tree tem = make_ssa_name (vec_dest, new_vec_stmt);
5190 new_vec_stmt = gimple_build_assign (tem, code,
5191 first_vect, second_vect);
5192 gsi_insert_before (&exit_gsi, new_vec_stmt, GSI_SAME_STMT);
5193 first_vect = tem;
5196 new_phi_result = first_vect;
5197 if (new_vec_stmt)
5199 new_phis.truncate (0);
5200 new_phis.safe_push (new_vec_stmt);
5203 /* Likewise if we couldn't use a single def-use cycle. */
5204 else if (ncopies > 1)
5206 gcc_assert (new_phis.length () == 1);
5207 tree first_vect = PHI_RESULT (new_phis[0]);
5208 gassign *new_vec_stmt = NULL;
5209 vec_dest = vect_create_destination_var (scalar_dest, vectype);
5210 gimple *next_phi = new_phis[0];
5211 for (int k = 1; k < ncopies; ++k)
5213 next_phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (next_phi));
5214 tree second_vect = PHI_RESULT (next_phi);
5215 tree tem = make_ssa_name (vec_dest, new_vec_stmt);
5216 new_vec_stmt = gimple_build_assign (tem, code,
5217 first_vect, second_vect);
5218 gsi_insert_before (&exit_gsi, new_vec_stmt, GSI_SAME_STMT);
5219 first_vect = tem;
5221 new_phi_result = first_vect;
5222 new_phis.truncate (0);
5223 new_phis.safe_push (new_vec_stmt);
5225 else
5226 new_phi_result = PHI_RESULT (new_phis[0]);
5228 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION
5229 && reduc_fn != IFN_LAST)
5231 /* For condition reductions, we have a vector (NEW_PHI_RESULT) containing
5232 various data values where the condition matched and another vector
5233 (INDUCTION_INDEX) containing all the indexes of those matches. We
5234 need to extract the last matching index (which will be the index with
5235 highest value) and use this to index into the data vector.
5236 For the case where there were no matches, the data vector will contain
5237 all default values and the index vector will be all zeros. */
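/* Continuing the made-up example above: with index vector {0, 2, 0, 8}
   the REDUC_MAX below yields 8, the comparison selects only lane 3 of
   the data vector, and the final reduction then extracts that lane's
   value as the scalar result.  */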
5239 /* Get various versions of the type of the vector of indexes. */
5240 tree index_vec_type = TREE_TYPE (induction_index);
5241 gcc_checking_assert (TYPE_UNSIGNED (index_vec_type));
5242 tree index_scalar_type = TREE_TYPE (index_vec_type);
5243 tree index_vec_cmp_type = build_same_sized_truth_vector_type
5244 (index_vec_type);
5246 /* Get an unsigned integer version of the type of the data vector. */
5247 int scalar_precision
5248 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
5249 tree scalar_type_unsigned = make_unsigned_type (scalar_precision);
5250 tree vectype_unsigned = build_vector_type
5251 (scalar_type_unsigned, TYPE_VECTOR_SUBPARTS (vectype));
5253 /* First we need to create a vector (ZERO_VEC) of zeros and another
5254 vector (MAX_INDEX_VEC) filled with the last matching index, which we
5255 can create using a MAX reduction and then expanding.
5256 In the case where the loop never made any matches, the max index will
5257 be zero. */
5259 /* Vector of {0, 0, 0,...}. */
5260 tree zero_vec = make_ssa_name (vectype);
5261 tree zero_vec_rhs = build_zero_cst (vectype);
5262 gimple *zero_vec_stmt = gimple_build_assign (zero_vec, zero_vec_rhs);
5263 gsi_insert_before (&exit_gsi, zero_vec_stmt, GSI_SAME_STMT);
5265 /* Find maximum value from the vector of found indexes. */
5266 tree max_index = make_ssa_name (index_scalar_type);
5267 gcall *max_index_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
5268 1, induction_index);
5269 gimple_call_set_lhs (max_index_stmt, max_index);
5270 gsi_insert_before (&exit_gsi, max_index_stmt, GSI_SAME_STMT);
5272 /* Vector of {max_index, max_index, max_index,...}. */
5273 tree max_index_vec = make_ssa_name (index_vec_type);
5274 tree max_index_vec_rhs = build_vector_from_val (index_vec_type,
5275 max_index);
5276 gimple *max_index_vec_stmt = gimple_build_assign (max_index_vec,
5277 max_index_vec_rhs);
5278 gsi_insert_before (&exit_gsi, max_index_vec_stmt, GSI_SAME_STMT);
5280 /* Next we compare the new vector (MAX_INDEX_VEC) full of max indexes
5281 with the vector (INDUCTION_INDEX) of found indexes, choosing values
5282 from the data vector (NEW_PHI_RESULT) for matches, 0 (ZERO_VEC)
5283 otherwise. Only one value should match, resulting in a vector
5284 (VEC_COND) with one data value and the rest zeros.
5285 In the case where the loop never made any matches, every index will
5286 match, resulting in a vector with all data values (which will all be
5287 the default value). */
5289 /* Compare the max index vector to the vector of found indexes to find
5290 the position of the max value. */
5291 tree vec_compare = make_ssa_name (index_vec_cmp_type);
5292 gimple *vec_compare_stmt = gimple_build_assign (vec_compare, EQ_EXPR,
5293 induction_index,
5294 max_index_vec);
5295 gsi_insert_before (&exit_gsi, vec_compare_stmt, GSI_SAME_STMT);
5297 /* Use the compare to choose either values from the data vector or
5298 zero. */
5299 tree vec_cond = make_ssa_name (vectype);
5300 gimple *vec_cond_stmt = gimple_build_assign (vec_cond, VEC_COND_EXPR,
5301 vec_compare, new_phi_result,
5302 zero_vec);
5303 gsi_insert_before (&exit_gsi, vec_cond_stmt, GSI_SAME_STMT);
5305 /* Finally we need to extract the data value from the vector (VEC_COND)
5306 into a scalar (MATCHED_DATA_REDUC). Logically we want to do an OR
5307 reduction, but because this doesn't exist, we can use a MAX reduction
5308 instead. The data value might be signed or a float so we need to cast
5309 it first.
5310 In the case where the loop never made any matches, the data values are
5311 all identical, and so will reduce down correctly. */
5313 /* Make the matched data values unsigned. */
5314 tree vec_cond_cast = make_ssa_name (vectype_unsigned);
5315 tree vec_cond_cast_rhs = build1 (VIEW_CONVERT_EXPR, vectype_unsigned,
5316 vec_cond);
5317 gimple *vec_cond_cast_stmt = gimple_build_assign (vec_cond_cast,
5318 VIEW_CONVERT_EXPR,
5319 vec_cond_cast_rhs);
5320 gsi_insert_before (&exit_gsi, vec_cond_cast_stmt, GSI_SAME_STMT);
5322 /* Reduce down to a scalar value. */
5323 tree data_reduc = make_ssa_name (scalar_type_unsigned);
5324 gcall *data_reduc_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
5325 1, vec_cond_cast);
5326 gimple_call_set_lhs (data_reduc_stmt, data_reduc);
5327 gsi_insert_before (&exit_gsi, data_reduc_stmt, GSI_SAME_STMT);
5329 /* Convert the reduced value back to the result type and set as the
5330 result. */
5331 gimple_seq stmts = NULL;
5332 new_temp = gimple_build (&stmts, VIEW_CONVERT_EXPR, scalar_type,
5333 data_reduc);
5334 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5335 scalar_results.safe_push (new_temp);
5337 else if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION
5338 && reduc_fn == IFN_LAST)
5340 /* Condition reduction without supported IFN_REDUC_MAX. Generate
5341 idx = 0;
5342 idx_val = induction_index[0];
5343 val = data_reduc[0];
5344 for (idx = 0, val = init, i = 0; i < nelts; ++i)
5345 if (induction_index[i] > idx_val)
5346 val = data_reduc[i], idx_val = induction_index[i];
5347 return val; */
5349 tree data_eltype = TREE_TYPE (TREE_TYPE (new_phi_result));
5350 tree idx_eltype = TREE_TYPE (TREE_TYPE (induction_index));
5351 unsigned HOST_WIDE_INT el_size = tree_to_uhwi (TYPE_SIZE (idx_eltype));
5352 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (TREE_TYPE (induction_index));
5353 /* Enforced by vectorizable_reduction, which ensures we have target
5354 support before allowing a conditional reduction on variable-length
5355 vectors. */
5356 unsigned HOST_WIDE_INT v_size = el_size * nunits.to_constant ();
5357 tree idx_val = NULL_TREE, val = NULL_TREE;
5358 for (unsigned HOST_WIDE_INT off = 0; off < v_size; off += el_size)
5360 tree old_idx_val = idx_val;
5361 tree old_val = val;
5362 idx_val = make_ssa_name (idx_eltype);
5363 epilog_stmt = gimple_build_assign (idx_val, BIT_FIELD_REF,
5364 build3 (BIT_FIELD_REF, idx_eltype,
5365 induction_index,
5366 bitsize_int (el_size),
5367 bitsize_int (off)));
5368 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5369 val = make_ssa_name (data_eltype);
5370 epilog_stmt = gimple_build_assign (val, BIT_FIELD_REF,
5371 build3 (BIT_FIELD_REF,
5372 data_eltype,
5373 new_phi_result,
5374 bitsize_int (el_size),
5375 bitsize_int (off)));
5376 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5377 if (off != 0)
5379 tree new_idx_val = idx_val;
5380 tree new_val = val;
5381 if (off != v_size - el_size)
5383 new_idx_val = make_ssa_name (idx_eltype);
5384 epilog_stmt = gimple_build_assign (new_idx_val,
5385 MAX_EXPR, idx_val,
5386 old_idx_val);
5387 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5389 new_val = make_ssa_name (data_eltype);
5390 epilog_stmt = gimple_build_assign (new_val,
5391 COND_EXPR,
5392 build2 (GT_EXPR,
5393 boolean_type_node,
5394 idx_val,
5395 old_idx_val),
5396 val, old_val);
5397 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5398 idx_val = new_idx_val;
5399 val = new_val;
5402 /* Convert the reduced value back to the result type and set as the
5403 result. */
5404 gimple_seq stmts = NULL;
5405 val = gimple_convert (&stmts, scalar_type, val);
5406 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5407 scalar_results.safe_push (val);
5410 /* 2.3 Create the reduction code, using one of the three schemes described
5411 above. In SLP we simply need to extract all the elements from the
5412 vector (without reducing them), so we use scalar shifts. */
5413 else if (reduc_fn != IFN_LAST && !slp_reduc)
5415 tree tmp;
5416 tree vec_elem_type;
5418 /* Case 1: Create:
5419 v_out2 = reduc_expr <v_out1> */
5421 if (dump_enabled_p ())
5422 dump_printf_loc (MSG_NOTE, vect_location,
5423 "Reduce using direct vector reduction.\n");
5425 vec_elem_type = TREE_TYPE (TREE_TYPE (new_phi_result));
5426 if (!useless_type_conversion_p (scalar_type, vec_elem_type))
5428 tree tmp_dest
5429 = vect_create_destination_var (scalar_dest, vec_elem_type);
5430 epilog_stmt = gimple_build_call_internal (reduc_fn, 1,
5431 new_phi_result);
5432 gimple_set_lhs (epilog_stmt, tmp_dest);
5433 new_temp = make_ssa_name (tmp_dest, epilog_stmt);
5434 gimple_set_lhs (epilog_stmt, new_temp);
5435 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5437 epilog_stmt = gimple_build_assign (new_scalar_dest, NOP_EXPR,
5438 new_temp);
5440 else
5442 epilog_stmt = gimple_build_call_internal (reduc_fn, 1,
5443 new_phi_result);
5444 gimple_set_lhs (epilog_stmt, new_scalar_dest);
5447 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5448 gimple_set_lhs (epilog_stmt, new_temp);
5449 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5451 if ((STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5452 == INTEGER_INDUC_COND_REDUCTION)
5453 && !operand_equal_p (initial_def, induc_val, 0))
5455 /* Earlier we set the initial value to be a vector of induc_val
5456 values. Check the result and if it is induc_val then replace
5457 with the original initial value, unless induc_val is
5458 the same as initial_def already. */
5459 tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp,
5460 induc_val);
5462 tmp = make_ssa_name (new_scalar_dest);
5463 epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
5464 initial_def, new_temp);
5465 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5466 new_temp = tmp;
5469 scalar_results.safe_push (new_temp);
5471 else if (direct_slp_reduc)
5473 /* Here we create one vector for each of the GROUP_SIZE results,
5474 with the elements for other SLP statements replaced with the
5475 neutral value. We can then do a normal reduction on each vector. */
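/* For example (a sketch, not the generated code), with GROUP_SIZE == 2,
   a PLUS reduction (neutral value 0) and NEW_PHI_RESULT = {a0, b0, a1, b1}:
     index = {0, 1, 2, 3} & 1 = {0, 1, 0, 1}
     i == 0: sel = {1, 0, 1, 0}, vec = {a0, 0, a1, 0}, result0 = REDUC (vec)
     i == 1: sel = {0, 1, 0, 1}, vec = {0, b0, 0, b1}, result1 = REDUC (vec)  */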
5477 /* Enforced by vectorizable_reduction. */
5478 gcc_assert (new_phis.length () == 1);
5479 gcc_assert (pow2p_hwi (group_size));
5481 slp_tree orig_phis_slp_node = slp_node_instance->reduc_phis;
5482 vec<gimple *> orig_phis = SLP_TREE_SCALAR_STMTS (orig_phis_slp_node);
5483 gimple_seq seq = NULL;
5485 /* Build a vector {0, 1, 2, ...}, with the same number of elements
5486 and the same element size as VECTYPE. */
5487 tree index = build_index_vector (vectype, 0, 1);
5488 tree index_type = TREE_TYPE (index);
5489 tree index_elt_type = TREE_TYPE (index_type);
5490 tree mask_type = build_same_sized_truth_vector_type (index_type);
5492 /* Create a vector that, for each element, identifies which of
5493 the GROUP_SIZE results should use it. */
5494 tree index_mask = build_int_cst (index_elt_type, group_size - 1);
5495 index = gimple_build (&seq, BIT_AND_EXPR, index_type, index,
5496 build_vector_from_val (index_type, index_mask));
5498 /* Get a neutral vector value. This is simply a splat of the neutral
5499 scalar value if we have one, otherwise the initial scalar value
5500 is itself a neutral value. */
5501 tree vector_identity = NULL_TREE;
5502 if (neutral_op)
5503 vector_identity = gimple_build_vector_from_val (&seq, vectype,
5504 neutral_op);
5505 for (unsigned int i = 0; i < group_size; ++i)
5507 /* If there's no universal neutral value, we can use the
5508 initial scalar value from the original PHI. This is used
5509 for MIN and MAX reduction, for example. */
5510 if (!neutral_op)
5512 tree scalar_value
5513 = PHI_ARG_DEF_FROM_EDGE (orig_phis[i],
5514 loop_preheader_edge (loop));
5515 vector_identity = gimple_build_vector_from_val (&seq, vectype,
5516 scalar_value);
5519 /* Calculate the equivalent of:
5521 sel[j] = (index[j] == i);
5523 which selects the elements of NEW_PHI_RESULT that should
5524 be included in the result. */
5525 tree compare_val = build_int_cst (index_elt_type, i);
5526 compare_val = build_vector_from_val (index_type, compare_val);
5527 tree sel = gimple_build (&seq, EQ_EXPR, mask_type,
5528 index, compare_val);
5530 /* Calculate the equivalent of:
5532 vec = sel ? new_phi_result : vector_identity;
5534 VEC is now suitable for a full vector reduction. */
5535 tree vec = gimple_build (&seq, VEC_COND_EXPR, vectype,
5536 sel, new_phi_result, vector_identity);
5538 /* Do the reduction and convert it to the appropriate type. */
5539 gcall *call = gimple_build_call_internal (reduc_fn, 1, vec);
5540 tree scalar = make_ssa_name (TREE_TYPE (vectype));
5541 gimple_call_set_lhs (call, scalar);
5542 gimple_seq_add_stmt (&seq, call);
5543 scalar = gimple_convert (&seq, scalar_type, scalar);
5544 scalar_results.safe_push (scalar);
5546 gsi_insert_seq_before (&exit_gsi, seq, GSI_SAME_STMT);
5548 else
5550 bool reduce_with_shift;
5551 tree vec_temp;
5553 /* COND reductions all do the final reduction with MAX_EXPR
5554 or MIN_EXPR. */
5555 if (code == COND_EXPR)
5557 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5558 == INTEGER_INDUC_COND_REDUCTION)
5559 code = induc_code;
5560 else
5561 code = MAX_EXPR;
5564 /* See if the target wants to do the final (shift) reduction
5565 in a vector mode of smaller size and first reduce upper/lower
5566 halves against each other. */
5567 enum machine_mode mode1 = mode;
5568 tree vectype1 = vectype;
5569 unsigned sz = tree_to_uhwi (TYPE_SIZE_UNIT (vectype));
5570 unsigned sz1 = sz;
5571 if (!slp_reduc
5572 && (mode1 = targetm.vectorize.split_reduction (mode)) != mode)
5573 sz1 = GET_MODE_SIZE (mode1).to_constant ();
5575 vectype1 = get_vectype_for_scalar_type_and_size (scalar_type, sz1);
5576 reduce_with_shift = have_whole_vector_shift (mode1);
5577 if (!VECTOR_MODE_P (mode1))
5578 reduce_with_shift = false;
5579 else
5581 optab optab = optab_for_tree_code (code, vectype1, optab_default);
5582 if (optab_handler (optab, mode1) == CODE_FOR_nothing)
5583 reduce_with_shift = false;
5586 /* First reduce the vector to the desired vector size we should
5587 do shift reduction on by combining upper and lower halves. */
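/* E.g. (sketch) reducing a 32-byte vector down to a 16-byte VECTYPE1:
     dst1     = lowpart  <vectype1> (new_temp);
     dst2     = highpart <vectype1> (new_temp);
     new_temp = dst1 CODE dst2;
   the loop below repeats this until the vector is SZ1 bytes wide.  */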
5588 new_temp = new_phi_result;
5589 while (sz > sz1)
5591 gcc_assert (!slp_reduc);
5592 sz /= 2;
5593 vectype1 = get_vectype_for_scalar_type_and_size (scalar_type, sz);
5595 /* The target has to make sure we support lowpart/highpart
5596 extraction, either via direct vector extract or through
5597 an integer mode punning. */
5598 tree dst1, dst2;
5599 if (convert_optab_handler (vec_extract_optab,
5600 TYPE_MODE (TREE_TYPE (new_temp)),
5601 TYPE_MODE (vectype1))
5602 != CODE_FOR_nothing)
5604 /* Extract sub-vectors directly once vec_extract becomes
5605 a conversion optab. */
5606 dst1 = make_ssa_name (vectype1);
5607 epilog_stmt
5608 = gimple_build_assign (dst1, BIT_FIELD_REF,
5609 build3 (BIT_FIELD_REF, vectype1,
5610 new_temp, TYPE_SIZE (vectype1),
5611 bitsize_int (0)));
5612 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5613 dst2 = make_ssa_name (vectype1);
5614 epilog_stmt
5615 = gimple_build_assign (dst2, BIT_FIELD_REF,
5616 build3 (BIT_FIELD_REF, vectype1,
5617 new_temp, TYPE_SIZE (vectype1),
5618 bitsize_int (sz * BITS_PER_UNIT)));
5619 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5621 else
5623 /* Extract via punning to an appropriately sized integer mode
5624 vector. */
5625 tree eltype = build_nonstandard_integer_type (sz * BITS_PER_UNIT,
5626 1);
5627 tree etype = build_vector_type (eltype, 2);
5628 gcc_assert (convert_optab_handler (vec_extract_optab,
5629 TYPE_MODE (etype),
5630 TYPE_MODE (eltype))
5631 != CODE_FOR_nothing);
5632 tree tem = make_ssa_name (etype);
5633 epilog_stmt = gimple_build_assign (tem, VIEW_CONVERT_EXPR,
5634 build1 (VIEW_CONVERT_EXPR,
5635 etype, new_temp));
5636 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5637 new_temp = tem;
5638 tem = make_ssa_name (eltype);
5639 epilog_stmt
5640 = gimple_build_assign (tem, BIT_FIELD_REF,
5641 build3 (BIT_FIELD_REF, eltype,
5642 new_temp, TYPE_SIZE (eltype),
5643 bitsize_int (0)));
5644 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5645 dst1 = make_ssa_name (vectype1);
5646 epilog_stmt = gimple_build_assign (dst1, VIEW_CONVERT_EXPR,
5647 build1 (VIEW_CONVERT_EXPR,
5648 vectype1, tem));
5649 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5650 tem = make_ssa_name (eltype);
5651 epilog_stmt
5652 = gimple_build_assign (tem, BIT_FIELD_REF,
5653 build3 (BIT_FIELD_REF, eltype,
5654 new_temp, TYPE_SIZE (eltype),
5655 bitsize_int (sz * BITS_PER_UNIT)));
5656 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5657 dst2 = make_ssa_name (vectype1);
5658 epilog_stmt = gimple_build_assign (dst2, VIEW_CONVERT_EXPR,
5659 build1 (VIEW_CONVERT_EXPR,
5660 vectype1, tem));
5661 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5664 new_temp = make_ssa_name (vectype1);
5665 epilog_stmt = gimple_build_assign (new_temp, code, dst1, dst2);
5666 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5669 if (reduce_with_shift && !slp_reduc)
5671 int element_bitsize = tree_to_uhwi (bitsize);
5672 /* Enforced by vectorizable_reduction, which disallows SLP reductions
5673 for variable-length vectors and also requires direct target support
5674 for loop reductions. */
5675 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
5676 int nelements = vec_size_in_bits / element_bitsize;
5677 vec_perm_builder sel;
5678 vec_perm_indices indices;
5680 int elt_offset;
5682 tree zero_vec = build_zero_cst (vectype1);
5683 /* Case 2: Create:
5684 for (offset = nelements/2; offset >= 1; offset/=2)
5686 Create: va' = vec_shift <va, offset>
5687 Create: va = vop <va, va'>
5688 } */
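/* For example (sketch), with an 8-element vector the loop below runs
   with ELT_OFFSET = 4, 2 and 1:
     va' = vec_shift <va, 4>; va = vop <va, va'>;
     va' = vec_shift <va, 2>; va = vop <va, va'>;
     va' = vec_shift <va, 1>; va = vop <va, va'>;
   after which element 0 of VA holds the reduced value (extracted in 2.4).  */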
5690 tree rhs;
5692 if (dump_enabled_p ())
5693 dump_printf_loc (MSG_NOTE, vect_location,
5694 "Reduce using vector shifts\n");
5696 mode1 = TYPE_MODE (vectype1);
5697 vec_dest = vect_create_destination_var (scalar_dest, vectype1);
5698 for (elt_offset = nelements / 2;
5699 elt_offset >= 1;
5700 elt_offset /= 2)
5702 calc_vec_perm_mask_for_shift (elt_offset, nelements, &sel);
5703 indices.new_vector (sel, 2, nelements);
5704 tree mask = vect_gen_perm_mask_any (vectype1, indices);
5705 epilog_stmt = gimple_build_assign (vec_dest, VEC_PERM_EXPR,
5706 new_temp, zero_vec, mask);
5707 new_name = make_ssa_name (vec_dest, epilog_stmt);
5708 gimple_assign_set_lhs (epilog_stmt, new_name);
5709 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5711 epilog_stmt = gimple_build_assign (vec_dest, code, new_name,
5712 new_temp);
5713 new_temp = make_ssa_name (vec_dest, epilog_stmt);
5714 gimple_assign_set_lhs (epilog_stmt, new_temp);
5715 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5718 /* 2.4 Extract the final scalar result. Create:
5719 s_out3 = extract_field <v_out2, bitpos> */
5721 if (dump_enabled_p ())
5722 dump_printf_loc (MSG_NOTE, vect_location,
5723 "extract scalar result\n");
5725 rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp,
5726 bitsize, bitsize_zero_node);
5727 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
5728 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5729 gimple_assign_set_lhs (epilog_stmt, new_temp);
5730 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5731 scalar_results.safe_push (new_temp);
5733 else
5735 /* Case 3: Create:
5736 s = extract_field <v_out2, 0>
5737 for (offset = element_size;
5738 offset < vector_size;
5739 offset += element_size;)
5741 Create: s' = extract_field <v_out2, offset>
5742 Create: s = op <s, s'> // For non SLP cases
5743 } */
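/* E.g. (sketch) for a vector of four 32-bit ints and a PLUS reduction
   (non-SLP) this emits:
     s  = extract_field <v_out2, 0>
     s' = extract_field <v_out2, 32>; s = s + s'
     s' = extract_field <v_out2, 64>; s = s + s'
     s' = extract_field <v_out2, 96>; s = s + s'  */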
5745 if (dump_enabled_p ())
5746 dump_printf_loc (MSG_NOTE, vect_location,
5747 "Reduce using scalar code.\n");
5749 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
5750 int element_bitsize = tree_to_uhwi (bitsize);
5751 FOR_EACH_VEC_ELT (new_phis, i, new_phi)
5753 int bit_offset;
5754 if (gimple_code (new_phi) == GIMPLE_PHI)
5755 vec_temp = PHI_RESULT (new_phi);
5756 else
5757 vec_temp = gimple_assign_lhs (new_phi);
5758 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, bitsize,
5759 bitsize_zero_node);
5760 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
5761 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5762 gimple_assign_set_lhs (epilog_stmt, new_temp);
5763 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5765 /* In SLP we don't need to apply the reduction operation, so we just
5766 collect s' values in SCALAR_RESULTS. */
5767 if (slp_reduc)
5768 scalar_results.safe_push (new_temp);
5770 for (bit_offset = element_bitsize;
5771 bit_offset < vec_size_in_bits;
5772 bit_offset += element_bitsize)
5774 tree bitpos = bitsize_int (bit_offset);
5775 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp,
5776 bitsize, bitpos);
5778 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
5779 new_name = make_ssa_name (new_scalar_dest, epilog_stmt);
5780 gimple_assign_set_lhs (epilog_stmt, new_name);
5781 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5783 if (slp_reduc)
5785 /* In SLP we don't need to apply the reduction operation, so
5786 we just collect s' values in SCALAR_RESULTS. */
5787 new_temp = new_name;
5788 scalar_results.safe_push (new_name);
5790 else
5792 epilog_stmt = gimple_build_assign (new_scalar_dest, code,
5793 new_name, new_temp);
5794 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5795 gimple_assign_set_lhs (epilog_stmt, new_temp);
5796 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5801 /* The only case where we need to reduce scalar results in SLP is
5802 unrolling. If the size of SCALAR_RESULTS is greater than
5803 GROUP_SIZE, we reduce them combining elements modulo
5804 GROUP_SIZE. */
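/* E.g. (sketch) with GROUP_SIZE == 2 and four entries in SCALAR_RESULTS
   the loop below performs:
     scalar_results[0] = scalar_results[0] CODE scalar_results[2];
     scalar_results[1] = scalar_results[1] CODE scalar_results[3];  */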
5805 if (slp_reduc)
5807 tree res, first_res, new_res;
5808 gimple *new_stmt;
5810 /* Reduce multiple scalar results in case of SLP unrolling. */
5811 for (j = group_size; scalar_results.iterate (j, &res);
5812 j++)
5814 first_res = scalar_results[j % group_size];
5815 new_stmt = gimple_build_assign (new_scalar_dest, code,
5816 first_res, res);
5817 new_res = make_ssa_name (new_scalar_dest, new_stmt);
5818 gimple_assign_set_lhs (new_stmt, new_res);
5819 gsi_insert_before (&exit_gsi, new_stmt, GSI_SAME_STMT);
5820 scalar_results[j % group_size] = new_res;
5823 else
5824 /* Not SLP - we have one scalar to keep in SCALAR_RESULTS. */
5825 scalar_results.safe_push (new_temp);
5828 if ((STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5829 == INTEGER_INDUC_COND_REDUCTION)
5830 && !operand_equal_p (initial_def, induc_val, 0))
5832 /* Earlier we set the initial value to be a vector of induc_val
5833 values. Check the result and if it is induc_val then replace
5834 with the original initial value, unless induc_val is
5835 the same as initial_def already. */
5836 tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp,
5837 induc_val);
5839 tree tmp = make_ssa_name (new_scalar_dest);
5840 epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
5841 initial_def, new_temp);
5842 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5843 scalar_results[0] = tmp;
5847 vect_finalize_reduction:
5849 if (double_reduc)
5850 loop = loop->inner;
5852 /* 2.5 Adjust the final result by the initial value of the reduction
5853 variable. (When such adjustment is not needed, then
5854 'adjustment_def' is zero). For example, if code is PLUS we create:
5855 new_temp = loop_exit_def + adjustment_def */
5857 if (adjustment_def)
5859 gcc_assert (!slp_reduc);
5860 if (nested_in_vect_loop)
5862 new_phi = new_phis[0];
5863 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) == VECTOR_TYPE);
5864 expr = build2 (code, vectype, PHI_RESULT (new_phi), adjustment_def);
5865 new_dest = vect_create_destination_var (scalar_dest, vectype);
5867 else
5869 new_temp = scalar_results[0];
5870 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
5871 expr = build2 (code, scalar_type, new_temp, adjustment_def);
5872 new_dest = vect_create_destination_var (scalar_dest, scalar_type);
5875 epilog_stmt = gimple_build_assign (new_dest, expr);
5876 new_temp = make_ssa_name (new_dest, epilog_stmt);
5877 gimple_assign_set_lhs (epilog_stmt, new_temp);
5878 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5879 if (nested_in_vect_loop)
5881 set_vinfo_for_stmt (epilog_stmt,
5882 new_stmt_vec_info (epilog_stmt, loop_vinfo));
5883 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (epilog_stmt)) =
5884 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (new_phi));
5886 if (!double_reduc)
5887 scalar_results.quick_push (new_temp);
5888 else
5889 scalar_results[0] = new_temp;
5891 else
5892 scalar_results[0] = new_temp;
5894 new_phis[0] = epilog_stmt;
5897 /* 2.6 Handle the loop-exit phis. Replace the uses of scalar loop-exit
5898 phis with new adjusted scalar results, i.e., replace use <s_out0>
5899 with use <s_out4>.
5901 Transform:
5902 loop_exit:
5903 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
5904 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
5905 v_out2 = reduce <v_out1>
5906 s_out3 = extract_field <v_out2, 0>
5907 s_out4 = adjust_result <s_out3>
5908 use <s_out0>
5909 use <s_out0>
5911 into:
5913 loop_exit:
5914 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
5915 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
5916 v_out2 = reduce <v_out1>
5917 s_out3 = extract_field <v_out2, 0>
5918 s_out4 = adjust_result <s_out3>
5919 use <s_out4>
5920 use <s_out4> */
5923 /* In an SLP reduction chain we reduce vector results into one vector if
5924 necessary, hence we set GROUP_SIZE to 1 here. SCALAR_DEST is the LHS of
5925 the last stmt in the reduction chain, since we are looking for the loop
5926 exit phi node. */
5927 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
5929 gimple *dest_stmt = SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1];
5930 /* Handle reduction patterns. */
5931 if (STMT_VINFO_RELATED_STMT (vinfo_for_stmt (dest_stmt)))
5932 dest_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (dest_stmt));
5934 scalar_dest = gimple_assign_lhs (dest_stmt);
5935 group_size = 1;
5938 /* In SLP we may have several statements in NEW_PHIS and REDUCTION_PHIS (in
5939 case GROUP_SIZE is greater than the vectorization factor). Therefore, we
5940 need to match SCALAR_RESULTS with corresponding statements. The first
5941 (GROUP_SIZE / number of new vector stmts) scalar results correspond to
5942 the first vector stmt, etc.
5943 (RATIO is equal to (GROUP_SIZE / number of new vector stmts)). */
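/* E.g. (sketch) GROUP_SIZE == 4 with two vector stmts gives RATIO == 2:
   scalar results 0 and 1 are matched with new_phis[0]/reduction_phis[0],
   results 2 and 3 with new_phis[1]/reduction_phis[1].  */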
5944 if (group_size > new_phis.length ())
5946 ratio = group_size / new_phis.length ();
5947 gcc_assert (!(group_size % new_phis.length ()));
5949 else
5950 ratio = 1;
5952 for (k = 0; k < group_size; k++)
5954 if (k % ratio == 0)
5956 epilog_stmt = new_phis[k / ratio];
5957 reduction_phi = reduction_phis[k / ratio];
5958 if (double_reduc)
5959 inner_phi = inner_phis[k / ratio];
5962 if (slp_reduc)
5964 gimple *current_stmt = SLP_TREE_SCALAR_STMTS (slp_node)[k];
5966 orig_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (current_stmt));
5967 /* SLP statements can't participate in patterns. */
5968 gcc_assert (!orig_stmt);
5969 scalar_dest = gimple_assign_lhs (current_stmt);
5972 phis.create (3);
5973 /* Find the loop-closed-use at the loop exit of the original scalar
5974 result. (The reduction result is expected to have two immediate uses -
5975 one at the latch block, and one at the loop exit). */
5976 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
5977 if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p)))
5978 && !is_gimple_debug (USE_STMT (use_p)))
5979 phis.safe_push (USE_STMT (use_p));
5981 /* While we expect to have found an exit_phi because of loop-closed-ssa
5982 form we can end up without one if the scalar cycle is dead. */
5984 FOR_EACH_VEC_ELT (phis, i, exit_phi)
5986 if (outer_loop)
5988 stmt_vec_info exit_phi_vinfo = vinfo_for_stmt (exit_phi);
5989 gphi *vect_phi;
5991 /* FORNOW. Currently not supporting the case that an inner-loop
5992 reduction is not used in the outer-loop (but only outside the
5993 outer-loop), unless it is a double reduction. */
5994 gcc_assert ((STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
5995 && !STMT_VINFO_LIVE_P (exit_phi_vinfo))
5996 || double_reduc);
5998 if (double_reduc)
5999 STMT_VINFO_VEC_STMT (exit_phi_vinfo) = inner_phi;
6000 else
6001 STMT_VINFO_VEC_STMT (exit_phi_vinfo) = epilog_stmt;
6002 if (!double_reduc
6003 || STMT_VINFO_DEF_TYPE (exit_phi_vinfo)
6004 != vect_double_reduction_def)
6005 continue;
6007 /* Handle double reduction:
6009 stmt1: s1 = phi <s0, s2> - double reduction phi (outer loop)
6010 stmt2: s3 = phi <s1, s4> - (regular) reduc phi (inner loop)
6011 stmt3: s4 = use (s3) - (regular) reduc stmt (inner loop)
6012 stmt4: s2 = phi <s4> - double reduction stmt (outer loop)
6014 At that point the regular reduction (stmt2 and stmt3) is
6015 already vectorized, as well as the exit phi node, stmt4.
6016 Here we vectorize the phi node of double reduction, stmt1, and
6017 update all relevant statements. */
6019 /* Go through all the uses of s2 to find double reduction phi
6020 node, i.e., stmt1 above. */
6021 orig_name = PHI_RESULT (exit_phi);
6022 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
6024 stmt_vec_info use_stmt_vinfo;
6025 stmt_vec_info new_phi_vinfo;
6026 tree vect_phi_init, preheader_arg, vect_phi_res;
6027 basic_block bb = gimple_bb (use_stmt);
6028 gimple *use;
6030 /* Check that USE_STMT is really a double reduction phi
6031 node. */
6032 if (gimple_code (use_stmt) != GIMPLE_PHI
6033 || gimple_phi_num_args (use_stmt) != 2
6034 || bb->loop_father != outer_loop)
6035 continue;
6036 use_stmt_vinfo = vinfo_for_stmt (use_stmt);
6037 if (!use_stmt_vinfo
6038 || STMT_VINFO_DEF_TYPE (use_stmt_vinfo)
6039 != vect_double_reduction_def)
6040 continue;
6042 /* Create vector phi node for double reduction:
6043 vs1 = phi <vs0, vs2>
6044 vs1 was created previously in this function by a call to
6045 vect_get_vec_def_for_operand and is stored in
6046 vec_initial_def;
6047 vs2 is defined by INNER_PHI, the vectorized EXIT_PHI;
6048 vs0 is created here. */
6050 /* Create vector phi node. */
6051 vect_phi = create_phi_node (vec_initial_def, bb);
6052 new_phi_vinfo = new_stmt_vec_info (vect_phi,
6053 loop_vec_info_for_loop (outer_loop));
6054 set_vinfo_for_stmt (vect_phi, new_phi_vinfo);
6056 /* Create vs0 - initial def of the double reduction phi. */
6057 preheader_arg = PHI_ARG_DEF_FROM_EDGE (use_stmt,
6058 loop_preheader_edge (outer_loop));
6059 vect_phi_init = get_initial_def_for_reduction
6060 (stmt, preheader_arg, NULL);
6062 /* Update phi node arguments with vs0 and vs2. */
6063 add_phi_arg (vect_phi, vect_phi_init,
6064 loop_preheader_edge (outer_loop),
6065 UNKNOWN_LOCATION);
6066 add_phi_arg (vect_phi, PHI_RESULT (inner_phi),
6067 loop_latch_edge (outer_loop), UNKNOWN_LOCATION);
6068 if (dump_enabled_p ())
6070 dump_printf_loc (MSG_NOTE, vect_location,
6071 "created double reduction phi node: ");
6072 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, vect_phi, 0);
6075 vect_phi_res = PHI_RESULT (vect_phi);
6077 /* Replace the use, i.e., set the correct vs1 in the regular
6078 reduction phi node. FORNOW, NCOPIES is always 1, so the
6079 loop is redundant. */
6080 use = reduction_phi;
6081 for (j = 0; j < ncopies; j++)
6083 edge pr_edge = loop_preheader_edge (loop);
6084 SET_PHI_ARG_DEF (use, pr_edge->dest_idx, vect_phi_res);
6085 use = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (use));
6091 phis.release ();
6092 if (nested_in_vect_loop)
6094 if (double_reduc)
6095 loop = outer_loop;
6096 else
6097 continue;
6100 phis.create (3);
6101 /* Find the loop-closed-use at the loop exit of the original scalar
6102 result. (The reduction result is expected to have two immediate uses,
6103 one at the latch block, and one at the loop exit). For double
6104 reductions we are looking for exit phis of the outer loop. */
6105 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
6107 if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
6109 if (!is_gimple_debug (USE_STMT (use_p)))
6110 phis.safe_push (USE_STMT (use_p));
6112 else
6114 if (double_reduc && gimple_code (USE_STMT (use_p)) == GIMPLE_PHI)
6116 tree phi_res = PHI_RESULT (USE_STMT (use_p));
6118 FOR_EACH_IMM_USE_FAST (phi_use_p, phi_imm_iter, phi_res)
6120 if (!flow_bb_inside_loop_p (loop,
6121 gimple_bb (USE_STMT (phi_use_p)))
6122 && !is_gimple_debug (USE_STMT (phi_use_p)))
6123 phis.safe_push (USE_STMT (phi_use_p));
6129 FOR_EACH_VEC_ELT (phis, i, exit_phi)
6131 /* Replace the uses: */
6132 orig_name = PHI_RESULT (exit_phi);
6133 scalar_result = scalar_results[k];
6134 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
6135 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
6136 SET_USE (use_p, scalar_result);
6139 phis.release ();
6143 /* Return a vector of type VECTYPE that is equal to the vector select
6144 operation "MASK ? VEC : IDENTITY". Insert the select statements
6145 before GSI. */
6147 static tree
6148 merge_with_identity (gimple_stmt_iterator *gsi, tree mask, tree vectype,
6149 tree vec, tree identity)
6151 tree cond = make_temp_ssa_name (vectype, NULL, "cond");
6152 gimple *new_stmt = gimple_build_assign (cond, VEC_COND_EXPR,
6153 mask, vec, identity);
6154 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
6155 return cond;
6158 /* Successively apply CODE to each element of VECTOR_RHS, in left-to-right
6159 order, starting with LHS. Insert the extraction statements before GSI and
6160 associate the new scalar SSA names with variable SCALAR_DEST.
6161 Return the SSA name for the result. */
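/* E.g. (sketch) for a 4-element VECTOR_RHS the result is
     (((LHS CODE rhs[0]) CODE rhs[1]) CODE rhs[2]) CODE rhs[3].  */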
6163 static tree
6164 vect_expand_fold_left (gimple_stmt_iterator *gsi, tree scalar_dest,
6165 tree_code code, tree lhs, tree vector_rhs)
6167 tree vectype = TREE_TYPE (vector_rhs);
6168 tree scalar_type = TREE_TYPE (vectype);
6169 tree bitsize = TYPE_SIZE (scalar_type);
6170 unsigned HOST_WIDE_INT vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
6171 unsigned HOST_WIDE_INT element_bitsize = tree_to_uhwi (bitsize);
6173 for (unsigned HOST_WIDE_INT bit_offset = 0;
6174 bit_offset < vec_size_in_bits;
6175 bit_offset += element_bitsize)
6177 tree bitpos = bitsize_int (bit_offset);
6178 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vector_rhs,
6179 bitsize, bitpos);
6181 gassign *stmt = gimple_build_assign (scalar_dest, rhs);
6182 rhs = make_ssa_name (scalar_dest, stmt);
6183 gimple_assign_set_lhs (stmt, rhs);
6184 gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
6186 stmt = gimple_build_assign (scalar_dest, code, lhs, rhs);
6187 tree new_name = make_ssa_name (scalar_dest, stmt);
6188 gimple_assign_set_lhs (stmt, new_name);
6189 gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
6190 lhs = new_name;
6192 return lhs;
6195 /* Perform an in-order reduction (FOLD_LEFT_REDUCTION). STMT is the
6196 statement that sets the live-out value. REDUC_DEF_STMT is the phi
6197 statement. CODE is the operation performed by STMT and OPS are
6198 its scalar operands. REDUC_INDEX is the index of the operand in
6199 OPS that is set by REDUC_DEF_STMT. REDUC_FN is the function that
6200 implements in-order reduction, or IFN_LAST if we should open-code it.
6201 VECTYPE_IN is the type of the vector input. MASKS specifies the masks
6202 that should be used to control the operation in a fully-masked loop. */
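/* E.g. (sketch) for an in-order "res += a[i]" with two vector operands
   (the SLP case) and a supported REDUC_FN this builds the chain
     r1  = .REDUC_FN (res_phi, vec0);
     res = .REDUC_FN (r1, vec1);
   preserving the scalar evaluation order; with REDUC_FN == IFN_LAST each
   vector is instead expanded element by element via vect_expand_fold_left.  */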
6204 static bool
6205 vectorize_fold_left_reduction (gimple *stmt, gimple_stmt_iterator *gsi,
6206 gimple **vec_stmt, slp_tree slp_node,
6207 gimple *reduc_def_stmt,
6208 tree_code code, internal_fn reduc_fn,
6209 tree ops[3], tree vectype_in,
6210 int reduc_index, vec_loop_masks *masks)
6212 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
6213 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
6214 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6215 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
6216 gimple *new_stmt = NULL;
6218 int ncopies;
6219 if (slp_node)
6220 ncopies = 1;
6221 else
6222 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
6224 gcc_assert (!nested_in_vect_loop_p (loop, stmt));
6225 gcc_assert (ncopies == 1);
6226 gcc_assert (TREE_CODE_LENGTH (code) == binary_op);
6227 gcc_assert (reduc_index == (code == MINUS_EXPR ? 0 : 1));
6228 gcc_assert (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
6229 == FOLD_LEFT_REDUCTION);
6231 if (slp_node)
6232 gcc_assert (known_eq (TYPE_VECTOR_SUBPARTS (vectype_out),
6233 TYPE_VECTOR_SUBPARTS (vectype_in)));
6235 tree op0 = ops[1 - reduc_index];
6237 int group_size = 1;
6238 gimple *scalar_dest_def;
6239 auto_vec<tree> vec_oprnds0;
6240 if (slp_node)
6242 vect_get_vec_defs (op0, NULL_TREE, stmt, &vec_oprnds0, NULL, slp_node);
6243 group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
6244 scalar_dest_def = SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1];
6246 else
6248 tree loop_vec_def0 = vect_get_vec_def_for_operand (op0, stmt);
6249 vec_oprnds0.create (1);
6250 vec_oprnds0.quick_push (loop_vec_def0);
6251 scalar_dest_def = stmt;
6254 tree scalar_dest = gimple_assign_lhs (scalar_dest_def);
6255 tree scalar_type = TREE_TYPE (scalar_dest);
6256 tree reduc_var = gimple_phi_result (reduc_def_stmt);
6258 int vec_num = vec_oprnds0.length ();
6259 gcc_assert (vec_num == 1 || slp_node);
6260 tree vec_elem_type = TREE_TYPE (vectype_out);
6261 gcc_checking_assert (useless_type_conversion_p (scalar_type, vec_elem_type));
6263 tree vector_identity = NULL_TREE;
6264 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
6265 vector_identity = build_zero_cst (vectype_out);
6267 tree scalar_dest_var = vect_create_destination_var (scalar_dest, NULL);
6268 int i;
6269 tree def0;
6270 FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
6272 tree mask = NULL_TREE;
6273 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
6274 mask = vect_get_loop_mask (gsi, masks, vec_num, vectype_in, i);
6276 /* Handle MINUS by adding the negative. */
6277 if (reduc_fn != IFN_LAST && code == MINUS_EXPR)
6279 tree negated = make_ssa_name (vectype_out);
6280 new_stmt = gimple_build_assign (negated, NEGATE_EXPR, def0);
6281 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
6282 def0 = negated;
6285 if (mask)
6286 def0 = merge_with_identity (gsi, mask, vectype_out, def0,
6287 vector_identity);
6289 /* On the first iteration the input is simply the scalar phi
6290 result, and for subsequent iterations it is the output of
6291 the preceding operation. */
6292 if (reduc_fn != IFN_LAST)
6294 new_stmt = gimple_build_call_internal (reduc_fn, 2, reduc_var, def0);
6295 /* For chained SLP reductions the output of the previous reduction
6296 operation serves as the input of the next. For the final statement
6297 the output cannot be a temporary - we reuse the original
6298 scalar destination of the last statement. */
6299 if (i != vec_num - 1)
6301 gimple_set_lhs (new_stmt, scalar_dest_var);
6302 reduc_var = make_ssa_name (scalar_dest_var, new_stmt);
6303 gimple_set_lhs (new_stmt, reduc_var);
6306 else
6308 reduc_var = vect_expand_fold_left (gsi, scalar_dest_var, code,
6309 reduc_var, def0);
6310 new_stmt = SSA_NAME_DEF_STMT (reduc_var);
6311 /* Remove the statement, so that we can use the same code paths
6312 as for statements that we've just created. */
6313 gimple_stmt_iterator tmp_gsi = gsi_for_stmt (new_stmt);
6314 gsi_remove (&tmp_gsi, false);
6317 if (i == vec_num - 1)
6319 gimple_set_lhs (new_stmt, scalar_dest);
6320 vect_finish_replace_stmt (scalar_dest_def, new_stmt);
6322 else
6323 vect_finish_stmt_generation (scalar_dest_def, new_stmt, gsi);
6325 if (slp_node)
6326 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
6329 if (!slp_node)
6330 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
6332 return true;
6335 /* Function is_nonwrapping_integer_induction.
6337 Check if STMT (which is part of loop LOOP) is an integer induction
6338 that increments without causing overflow. */
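/* I.e. (sketch) the check below requires that
     BASE + STEP * max_stmt_executions (LOOP)
   still fits in the precision of the phi result type (computed in
   widest_int so any intermediate overflow is detected), or that
   overflow is undefined for that type anyway.  */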
6340 static bool
6341 is_nonwrapping_integer_induction (gimple *stmt, struct loop *loop)
6343 stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
6344 tree base = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo);
6345 tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo);
6346 tree lhs_type = TREE_TYPE (gimple_phi_result (stmt));
6347 widest_int ni, max_loop_value, lhs_max;
6348 bool overflow = false;
6350 /* Make sure the loop is integer based. */
6351 if (TREE_CODE (base) != INTEGER_CST
6352 || TREE_CODE (step) != INTEGER_CST)
6353 return false;
6355 /* Check that the max size of the loop will not wrap. */
6357 if (TYPE_OVERFLOW_UNDEFINED (lhs_type))
6358 return true;
6360 if (! max_stmt_executions (loop, &ni))
6361 return false;
6363 max_loop_value = wi::mul (wi::to_widest (step), ni, TYPE_SIGN (lhs_type),
6364 &overflow);
6365 if (overflow)
6366 return false;
6368 max_loop_value = wi::add (wi::to_widest (base), max_loop_value,
6369 TYPE_SIGN (lhs_type), &overflow);
6370 if (overflow)
6371 return false;
6373 return (wi::min_precision (max_loop_value, TYPE_SIGN (lhs_type))
6374 <= TYPE_PRECISION (lhs_type));
6377 /* Function vectorizable_reduction.
6379 Check if STMT performs a reduction operation that can be vectorized.
6380 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
6381 stmt to replace it, put it in VEC_STMT, and insert it at GSI.
6382 Return FALSE if not a vectorizable STMT, TRUE otherwise.
6384 This function also handles reduction idioms (patterns) that have been
6385 recognized in advance during vect_pattern_recog. In this case, STMT may be
6386 of this form:
6387 X = pattern_expr (arg0, arg1, ..., X)
6388 and its STMT_VINFO_RELATED_STMT points to the last stmt in the original
6389 sequence that had been detected and replaced by the pattern-stmt (STMT).
6391 This function also handles reduction of condition expressions, for example:
6392 for (int i = 0; i < N; i++)
6393 if (a[i] < value)
6394 last = a[i];
6395 This is handled by vectorising the loop and creating an additional vector
6396 containing the loop indexes for which "a[i] < value" was true. In the
6397 function epilogue this is reduced to a single max value and then used to
6398 index into the vector of results.
6400 In some cases of reduction patterns, the type of the reduction variable X is
6401 different than the type of the other arguments of STMT.
6402 In such cases, the vectype that is used when transforming STMT into a vector
6403 stmt is different than the vectype that is used to determine the
6404 vectorization factor, because it consists of a different number of elements
6405 than the actual number of elements that are being operated upon in parallel.
6407 For example, consider an accumulation of shorts into an int accumulator.
6408 On some targets it's possible to vectorize this pattern operating on 8
6409 shorts at a time (hence, the vectype for purposes of determining the
6410 vectorization factor should be V8HI); on the other hand, the vectype that
6411 is used to create the vector form is actually V4SI (the type of the result).
6413 Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
6414 indicates what is the actual level of parallelism (V8HI in the example), so
6415 that the right vectorization factor would be derived. This vectype
6416 corresponds to the type of arguments to the reduction stmt, and should *NOT*
6417 be used to create the vectorized stmt. The right vectype for the vectorized
6418 stmt is obtained from the type of the result X:
6419 get_vectype_for_scalar_type (TREE_TYPE (X))
6421 This means that, contrary to "regular" reductions (or "regular" stmts in
6422 general), the following equation:
6423 STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (TREE_TYPE (X))
6424 does *NOT* necessarily hold for reduction patterns. */
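/* E.g. (sketch) for the accumulation of shorts into an int described
   above:
     short a[N]; int sum; ... sum += a[i];
   STMT_VINFO_VECTYPE is V8HI (it matches the shorts and determines the
   VF), whereas the vectorized statement itself is created with V4SI,
   i.e. get_vectype_for_scalar_type (TREE_TYPE (sum)).  */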
6426 bool
6427 vectorizable_reduction (gimple *stmt, gimple_stmt_iterator *gsi,
6428 gimple **vec_stmt, slp_tree slp_node,
6429 slp_instance slp_node_instance)
6431 tree vec_dest;
6432 tree scalar_dest;
6433 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
6434 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
6435 tree vectype_in = NULL_TREE;
6436 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
6437 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6438 enum tree_code code, orig_code;
6439 internal_fn reduc_fn;
6440 machine_mode vec_mode;
6441 int op_type;
6442 optab optab;
6443 tree new_temp = NULL_TREE;
6444 gimple *def_stmt;
6445 enum vect_def_type dt, cond_reduc_dt = vect_unknown_def_type;
6446 gimple *cond_reduc_def_stmt = NULL;
6447 enum tree_code cond_reduc_op_code = ERROR_MARK;
6448 tree scalar_type;
6449 bool is_simple_use;
6450 gimple *orig_stmt;
6451 stmt_vec_info orig_stmt_info = NULL;
6452 int i;
6453 int ncopies;
6454 int epilog_copies;
6455 stmt_vec_info prev_stmt_info, prev_phi_info;
6456 bool single_defuse_cycle = false;
6457 gimple *new_stmt = NULL;
6458 int j;
6459 tree ops[3];
6460 enum vect_def_type dts[3];
6461 bool nested_cycle = false, found_nested_cycle_def = false;
6462 bool double_reduc = false;
6463 basic_block def_bb;
6464 struct loop * def_stmt_loop, *outer_loop = NULL;
6465 tree def_arg;
6466 gimple *def_arg_stmt;
6467 auto_vec<tree> vec_oprnds0;
6468 auto_vec<tree> vec_oprnds1;
6469 auto_vec<tree> vec_oprnds2;
6470 auto_vec<tree> vect_defs;
6471 auto_vec<gimple *> phis;
6472 int vec_num;
6473 tree def0, tem;
6474 bool first_p = true;
6475 tree cr_index_scalar_type = NULL_TREE, cr_index_vector_type = NULL_TREE;
6476 tree cond_reduc_val = NULL_TREE;
6478 /* Make sure it was already recognized as a reduction computation. */
6479 if (STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmt)) != vect_reduction_def
6480 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmt)) != vect_nested_cycle)
6481 return false;
6483 if (nested_in_vect_loop_p (loop, stmt))
6485 outer_loop = loop;
6486 loop = loop->inner;
6487 nested_cycle = true;
6490 /* In case of reduction chain we switch to the first stmt in the chain, but
6491 we don't update STMT_INFO, since only the last stmt is marked as reduction
6492 and has reduction properties. */
6493 if (GROUP_FIRST_ELEMENT (stmt_info)
6494 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
6496 stmt = GROUP_FIRST_ELEMENT (stmt_info);
6497 first_p = false;
6500 if (gimple_code (stmt) == GIMPLE_PHI)
6502 /* Analysis is fully done on the reduction stmt invocation. */
6503 if (! vec_stmt)
6505 if (slp_node)
6506 slp_node_instance->reduc_phis = slp_node;
6508 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
6509 return true;
6512 if (STMT_VINFO_REDUC_TYPE (stmt_info) == FOLD_LEFT_REDUCTION)
6513 /* Leave the scalar phi in place. Note that checking
6514 STMT_VINFO_VEC_REDUCTION_TYPE (as below) only works
6515 for reductions involving a single statement. */
6516 return true;
6518 gimple *reduc_stmt = STMT_VINFO_REDUC_DEF (stmt_info);
6519 if (STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (reduc_stmt)))
6520 reduc_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (reduc_stmt));
6522 if (STMT_VINFO_VEC_REDUCTION_TYPE (vinfo_for_stmt (reduc_stmt))
6523 == EXTRACT_LAST_REDUCTION)
6524 /* Leave the scalar phi in place. */
6525 return true;
6527 gcc_assert (is_gimple_assign (reduc_stmt));
6528 for (unsigned k = 1; k < gimple_num_ops (reduc_stmt); ++k)
6530 tree op = gimple_op (reduc_stmt, k);
6531 if (op == gimple_phi_result (stmt))
6532 continue;
6533 if (k == 1
6534 && gimple_assign_rhs_code (reduc_stmt) == COND_EXPR)
6535 continue;
6536 if (!vectype_in
6537 || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in)))
6538 < GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (op)))))
6539 vectype_in = get_vectype_for_scalar_type (TREE_TYPE (op));
6540 break;
6542 gcc_assert (vectype_in);
6544 if (slp_node)
6545 ncopies = 1;
6546 else
6547 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
6549 use_operand_p use_p;
6550 gimple *use_stmt;
6551 if (ncopies > 1
6552 && (STMT_VINFO_RELEVANT (vinfo_for_stmt (reduc_stmt))
6553 <= vect_used_only_live)
6554 && single_imm_use (gimple_phi_result (stmt), &use_p, &use_stmt)
6555 && (use_stmt == reduc_stmt
6556 || (STMT_VINFO_RELATED_STMT (vinfo_for_stmt (use_stmt))
6557 == reduc_stmt)))
6558 single_defuse_cycle = true;
6560 /* Create the destination vector */
6561 scalar_dest = gimple_assign_lhs (reduc_stmt);
6562 vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
6564 if (slp_node)
6565 /* The size vect_schedule_slp_instance computes is off for us. */
6566 vec_num = vect_get_num_vectors
6567 (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
6568 * SLP_TREE_SCALAR_STMTS (slp_node).length (),
6569 vectype_in);
6570 else
6571 vec_num = 1;
6573 /* Generate the reduction PHIs upfront. */
6574 prev_phi_info = NULL;
6575 for (j = 0; j < ncopies; j++)
6577 if (j == 0 || !single_defuse_cycle)
6579 for (i = 0; i < vec_num; i++)
6581 /* Create the reduction-phi that defines the reduction
6582 operand. */
6583 gimple *new_phi = create_phi_node (vec_dest, loop->header);
6584 set_vinfo_for_stmt (new_phi,
6585 new_stmt_vec_info (new_phi, loop_vinfo));
6587 if (slp_node)
6588 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_phi);
6589 else
6591 if (j == 0)
6592 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_phi;
6593 else
6594 STMT_VINFO_RELATED_STMT (prev_phi_info) = new_phi;
6595 prev_phi_info = vinfo_for_stmt (new_phi);
6601 return true;
6604 /* 1. Is vectorizable reduction? */
6605 /* Not supportable if the reduction variable is used in the loop, unless
6606 it's a reduction chain. */
6607 if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer
6608 && !GROUP_FIRST_ELEMENT (stmt_info))
6609 return false;
6611 /* Reductions that are not used even in an enclosing outer-loop
6612 are expected to be "live" (used out of the loop). */
6613 if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope
6614 && !STMT_VINFO_LIVE_P (stmt_info))
6615 return false;
6617 /* 2. Has this been recognized as a reduction pattern?
6619 Check if STMT represents a pattern that has been recognized
6620 in earlier analysis stages. For stmts that represent a pattern,
6621 the STMT_VINFO_RELATED_STMT field records the last stmt in
6622 the original sequence that constitutes the pattern. */
6624 orig_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt));
6625 if (orig_stmt)
6627 orig_stmt_info = vinfo_for_stmt (orig_stmt);
6628 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
6629 gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
6632 /* 3. Check the operands of the operation. The first operands are defined
6633 inside the loop body. The last operand is the reduction variable,
6634 which is defined by the loop-header-phi. */
6636 gcc_assert (is_gimple_assign (stmt));
6638 /* Flatten RHS. */
6639 switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
6641 case GIMPLE_BINARY_RHS:
6642 code = gimple_assign_rhs_code (stmt);
6643 op_type = TREE_CODE_LENGTH (code);
6644 gcc_assert (op_type == binary_op);
6645 ops[0] = gimple_assign_rhs1 (stmt);
6646 ops[1] = gimple_assign_rhs2 (stmt);
6647 break;
6649 case GIMPLE_TERNARY_RHS:
6650 code = gimple_assign_rhs_code (stmt);
6651 op_type = TREE_CODE_LENGTH (code);
6652 gcc_assert (op_type == ternary_op);
6653 ops[0] = gimple_assign_rhs1 (stmt);
6654 ops[1] = gimple_assign_rhs2 (stmt);
6655 ops[2] = gimple_assign_rhs3 (stmt);
6656 break;
6658 case GIMPLE_UNARY_RHS:
6659 return false;
6661 default:
6662 gcc_unreachable ();
6665 if (code == COND_EXPR && slp_node)
6666 return false;
6668 scalar_dest = gimple_assign_lhs (stmt);
6669 scalar_type = TREE_TYPE (scalar_dest);
6670 if (!POINTER_TYPE_P (scalar_type) && !INTEGRAL_TYPE_P (scalar_type)
6671 && !SCALAR_FLOAT_TYPE_P (scalar_type))
6672 return false;
6674 /* Do not try to vectorize bit-precision reductions. */
6675 if (!type_has_mode_precision_p (scalar_type))
6676 return false;
6678 /* All uses but the last are expected to be defined in the loop.
6679 The last use is the reduction variable. In case of nested cycle this
6680 assumption is not true: we use reduc_index to record the index of the
6681 reduction variable. */
6682 gimple *reduc_def_stmt = NULL;
6683 int reduc_index = -1;
6684 for (i = 0; i < op_type; i++)
6686 /* The condition of COND_EXPR is checked in vectorizable_condition(). */
6687 if (i == 0 && code == COND_EXPR)
6688 continue;
6690 is_simple_use = vect_is_simple_use (ops[i], loop_vinfo,
6691 &def_stmt, &dts[i], &tem);
6692 dt = dts[i];
6693 gcc_assert (is_simple_use);
6694 if (dt == vect_reduction_def)
6696 reduc_def_stmt = def_stmt;
6697 reduc_index = i;
6698 continue;
6700 else if (tem)
6702 /* To properly compute ncopies we are interested in the widest
6703 input type in case we're looking at a widening accumulation. */
6704 if (!vectype_in
6705 || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in)))
6706 < GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (tem)))))
6707 vectype_in = tem;
6710 if (dt != vect_internal_def
6711 && dt != vect_external_def
6712 && dt != vect_constant_def
6713 && dt != vect_induction_def
6714 && !(dt == vect_nested_cycle && nested_cycle))
6715 return false;
6717 if (dt == vect_nested_cycle)
6719 found_nested_cycle_def = true;
6720 reduc_def_stmt = def_stmt;
6721 reduc_index = i;
6724 if (i == 1 && code == COND_EXPR)
6726 /* Record how value of COND_EXPR is defined. */
6727 if (dt == vect_constant_def)
6729 cond_reduc_dt = dt;
6730 cond_reduc_val = ops[i];
6732 if (dt == vect_induction_def
6733 && def_stmt != NULL
6734 && is_nonwrapping_integer_induction (def_stmt, loop))
6736 cond_reduc_dt = dt;
6737 cond_reduc_def_stmt = def_stmt;
6742 if (!vectype_in)
6743 vectype_in = vectype_out;
6745 /* When vectorizing a reduction chain w/o SLP the reduction PHI is not
6746 directly used in stmt. */
6747 if (reduc_index == -1)
6749 if (STMT_VINFO_REDUC_TYPE (stmt_info) == FOLD_LEFT_REDUCTION)
6751 if (dump_enabled_p ())
6752 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6753 "in-order reduction chain without SLP.\n");
6754 return false;
6757 if (orig_stmt)
6758 reduc_def_stmt = STMT_VINFO_REDUC_DEF (orig_stmt_info);
6759 else
6760 reduc_def_stmt = STMT_VINFO_REDUC_DEF (stmt_info);
6763 if (! reduc_def_stmt || gimple_code (reduc_def_stmt) != GIMPLE_PHI)
6764 return false;
6766 if (!(reduc_index == -1
6767 || dts[reduc_index] == vect_reduction_def
6768 || dts[reduc_index] == vect_nested_cycle
6769 || ((dts[reduc_index] == vect_internal_def
6770 || dts[reduc_index] == vect_external_def
6771 || dts[reduc_index] == vect_constant_def
6772 || dts[reduc_index] == vect_induction_def)
6773 && nested_cycle && found_nested_cycle_def)))
6775 /* For pattern recognized stmts, orig_stmt might be a reduction,
6776 but some helper statements for the pattern might not, or
6777 might be COND_EXPRs with reduction uses in the condition. */
6778 gcc_assert (orig_stmt);
6779 return false;
6782 stmt_vec_info reduc_def_info = vinfo_for_stmt (reduc_def_stmt);
6783 enum vect_reduction_type v_reduc_type
6784 = STMT_VINFO_REDUC_TYPE (reduc_def_info);
6785 gimple *tmp = STMT_VINFO_REDUC_DEF (reduc_def_info);
6787 STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) = v_reduc_type;
6788 /* If we have a condition reduction, see if we can simplify it further. */
6789 if (v_reduc_type == COND_REDUCTION)
6791 /* TODO: We can't yet handle reduction chains, since we need to treat
6792 each COND_EXPR in the chain specially, not just the last one.
6793 E.g. for:
6795 x_1 = PHI <x_3, ...>
6796 x_2 = a_2 ? ... : x_1;
6797 x_3 = a_3 ? ... : x_2;
6799 we're interested in the last element in x_3 for which a_2 || a_3
6800 is true, whereas the current reduction chain handling would
6801 vectorize x_2 as a normal VEC_COND_EXPR and only treat x_3
6802 as a reduction operation. */
6803 if (reduc_index == -1)
6805 if (dump_enabled_p ())
6806 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6807 "conditional reduction chains not supported\n");
6808 return false;
6811 /* vect_is_simple_reduction ensured that operand 2 is the
6812 loop-carried operand. */
6813 gcc_assert (reduc_index == 2);
6815 /* Loop peeling modifies the initial value of the reduction PHI, which
6816 makes the reduction stmt to be transformed differ from the
6817 original stmt analyzed. We need to record the reduction code for
6818 CONST_COND_REDUCTION type reductions at the analysis stage, so that
6819 it can be used directly at the transform stage. */
6820 if (STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info) == MAX_EXPR
6821 || STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info) == MIN_EXPR)
6823 /* Also set the reduction type to CONST_COND_REDUCTION. */
6824 gcc_assert (cond_reduc_dt == vect_constant_def);
6825 STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) = CONST_COND_REDUCTION;
6827 else if (direct_internal_fn_supported_p (IFN_FOLD_EXTRACT_LAST,
6828 vectype_in, OPTIMIZE_FOR_SPEED))
6830 if (dump_enabled_p ())
6831 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6832 "optimizing condition reduction with"
6833 " FOLD_EXTRACT_LAST.\n");
6834 STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) = EXTRACT_LAST_REDUCTION;
6836 else if (cond_reduc_dt == vect_induction_def)
6838 stmt_vec_info cond_stmt_vinfo = vinfo_for_stmt (cond_reduc_def_stmt);
6839 tree base
6840 = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (cond_stmt_vinfo);
6841 tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (cond_stmt_vinfo);
6843 gcc_assert (TREE_CODE (base) == INTEGER_CST
6844 && TREE_CODE (step) == INTEGER_CST);
6845 cond_reduc_val = NULL_TREE;
6846 /* Find a suitable value: for MAX_EXPR one below BASE, for MIN_EXPR
6847 one above BASE; for now punt if BASE is the minimum value of the
6848 type for MAX_EXPR or the maximum value of the type for MIN_EXPR. */
6849 if (tree_int_cst_sgn (step) == -1)
6851 cond_reduc_op_code = MIN_EXPR;
6852 if (tree_int_cst_sgn (base) == -1)
6853 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
6854 else if (tree_int_cst_lt (base,
6855 TYPE_MAX_VALUE (TREE_TYPE (base))))
6856 cond_reduc_val
6857 = int_const_binop (PLUS_EXPR, base, integer_one_node);
6859 else
6861 cond_reduc_op_code = MAX_EXPR;
6862 if (tree_int_cst_sgn (base) == 1)
6863 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
6864 else if (tree_int_cst_lt (TYPE_MIN_VALUE (TREE_TYPE (base)),
6865 base))
6866 cond_reduc_val
6867 = int_const_binop (MINUS_EXPR, base, integer_one_node);
6869 if (cond_reduc_val)
6871 if (dump_enabled_p ())
6872 dump_printf_loc (MSG_NOTE, vect_location,
6873 "condition expression based on "
6874 "integer induction.\n");
6875 STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
6876 = INTEGER_INDUC_COND_REDUCTION;
6879 else if (cond_reduc_dt == vect_constant_def)
6881 enum vect_def_type cond_initial_dt;
6882 gimple *def_stmt = SSA_NAME_DEF_STMT (ops[reduc_index]);
6883 tree cond_initial_val
6884 = PHI_ARG_DEF_FROM_EDGE (def_stmt, loop_preheader_edge (loop));
6886 gcc_assert (cond_reduc_val != NULL_TREE);
6887 vect_is_simple_use (cond_initial_val, loop_vinfo,
6888 &def_stmt, &cond_initial_dt);
6889 if (cond_initial_dt == vect_constant_def
6890 && types_compatible_p (TREE_TYPE (cond_initial_val),
6891 TREE_TYPE (cond_reduc_val)))
6893 tree e = fold_binary (LE_EXPR, boolean_type_node,
6894 cond_initial_val, cond_reduc_val);
6895 if (e && (integer_onep (e) || integer_zerop (e)))
6897 if (dump_enabled_p ())
6898 dump_printf_loc (MSG_NOTE, vect_location,
6899 "condition expression based on "
6900 "compile time constant.\n");
6901 /* Record reduction code at analysis stage. */
6902 STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info)
6903 = integer_onep (e) ? MAX_EXPR : MIN_EXPR;
6904 STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
6905 = CONST_COND_REDUCTION;
6911 if (orig_stmt)
6912 gcc_assert (tmp == orig_stmt
6913 || GROUP_FIRST_ELEMENT (vinfo_for_stmt (tmp)) == orig_stmt);
6914 else
6915 /* We changed STMT to be the first stmt in reduction chain, hence we
6916 check that in this case the first element in the chain is STMT. */
6917 gcc_assert (stmt == tmp
6918 || GROUP_FIRST_ELEMENT (vinfo_for_stmt (tmp)) == stmt);
6920 if (STMT_VINFO_LIVE_P (vinfo_for_stmt (reduc_def_stmt)))
6921 return false;
6923 if (slp_node)
6924 ncopies = 1;
6925 else
6926 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
6928 gcc_assert (ncopies >= 1);
6930 vec_mode = TYPE_MODE (vectype_in);
6931 poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
6933 if (code == COND_EXPR)
6935 /* Only call during the analysis stage, otherwise we'll lose
6936 STMT_VINFO_TYPE. */
6937 if (!vec_stmt && !vectorizable_condition (stmt, gsi, NULL,
6938 ops[reduc_index], 0, NULL))
6940 if (dump_enabled_p ())
6941 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6942 "unsupported condition in reduction\n");
6943 return false;
6946 else
6948 /* 4. Supportable by target? */
6950 if (code == LSHIFT_EXPR || code == RSHIFT_EXPR
6951 || code == LROTATE_EXPR || code == RROTATE_EXPR)
6953 /* Shifts and rotates are only supported by vectorizable_shift,
6954 not vectorizable_reduction. */
6955 if (dump_enabled_p ())
6956 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6957 "unsupported shift or rotation.\n");
6958 return false;
6961 /* 4.1. check support for the operation in the loop */
6962 optab = optab_for_tree_code (code, vectype_in, optab_default);
6963 if (!optab)
6965 if (dump_enabled_p ())
6966 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6967 "no optab.\n");
6969 return false;
6972 if (optab_handler (optab, vec_mode) == CODE_FOR_nothing)
6974 if (dump_enabled_p ())
6975 dump_printf (MSG_NOTE, "op not supported by target.\n");
6977 if (maybe_ne (GET_MODE_SIZE (vec_mode), UNITS_PER_WORD)
6978 || !vect_worthwhile_without_simd_p (loop_vinfo, code))
6979 return false;
6981 if (dump_enabled_p ())
6982 dump_printf (MSG_NOTE, "proceeding using word mode.\n");
6985 /* Worthwhile without SIMD support? */
6986 if (!VECTOR_MODE_P (TYPE_MODE (vectype_in))
6987 && !vect_worthwhile_without_simd_p (loop_vinfo, code))
6989 if (dump_enabled_p ())
6990 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6991 "not worthwhile without SIMD support.\n");
6993 return false;
6997 /* 4.2. Check support for the epilog operation.
6999 If STMT represents a reduction pattern, then the type of the
7000 reduction variable may be different than the type of the rest
7001 of the arguments. For example, consider the case of accumulation
7002 of shorts into an int accumulator; The original code:
7003 S1: int_a = (int) short_a;
7004 orig_stmt-> S2: int_acc = plus <int_a ,int_acc>;
7006 was replaced with:
7007 STMT: int_acc = widen_sum <short_a, int_acc>
7009 This means that:
7010 1. The tree-code that is used to create the vector operation in the
7011 epilog code (that reduces the partial results) is not the
7012 tree-code of STMT, but is rather the tree-code of the original
7013 stmt from the pattern that STMT is replacing. I.e, in the example
7014 above we want to use 'widen_sum' in the loop, but 'plus' in the
7015 epilog.
7016 2. The type (mode) we use to check available target support
7017 for the vector operation to be created in the *epilog*, is
7018 determined by the type of the reduction variable (in the example
7019 above we'd check this: optab_handler (plus_optab, vect_int_mode])).
7020 However the type (mode) we use to check available target support
7021 for the vector operation to be created *inside the loop*, is
7022 determined by the type of the other arguments to STMT (in the
7023 example we'd check this: optab_handler (widen_sum_optab,
7024 vect_short_mode)).
7026 This is contrary to "regular" reductions, in which the types of all
7027 the arguments are the same as the type of the reduction variable.
7028 For "regular" reductions we can therefore use the same vector type
7029 (and also the same tree-code) when generating the epilog code and
7030 when generating the code inside the loop. */
7032 vect_reduction_type reduction_type
7033 = STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info);
7034 if (orig_stmt
7035 && (reduction_type == TREE_CODE_REDUCTION
7036 || reduction_type == FOLD_LEFT_REDUCTION))
7038 /* This is a reduction pattern: get the vectype from the type of the
7039 reduction variable, and get the tree-code from orig_stmt. */
7040 orig_code = gimple_assign_rhs_code (orig_stmt);
7041 gcc_assert (vectype_out);
7042 vec_mode = TYPE_MODE (vectype_out);
7044 else
7046 /* Regular reduction: the same vectype and tree-code that are used for
7047 the vector code inside the loop can also be used for the epilog code. */
7048 orig_code = code;
7050 if (code == MINUS_EXPR)
7051 orig_code = PLUS_EXPR;
7053 /* For simple condition reductions, replace with the actual expression
7054 we want to base our reduction around. */
7055 if (reduction_type == CONST_COND_REDUCTION)
7057 orig_code = STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info);
7058 gcc_assert (orig_code == MAX_EXPR || orig_code == MIN_EXPR);
7060 else if (reduction_type == INTEGER_INDUC_COND_REDUCTION)
7061 orig_code = cond_reduc_op_code;
7064 if (nested_cycle)
7066 def_bb = gimple_bb (reduc_def_stmt);
7067 def_stmt_loop = def_bb->loop_father;
7068 def_arg = PHI_ARG_DEF_FROM_EDGE (reduc_def_stmt,
7069 loop_preheader_edge (def_stmt_loop));
7070 if (TREE_CODE (def_arg) == SSA_NAME
7071 && (def_arg_stmt = SSA_NAME_DEF_STMT (def_arg))
7072 && gimple_code (def_arg_stmt) == GIMPLE_PHI
7073 && flow_bb_inside_loop_p (outer_loop, gimple_bb (def_arg_stmt))
7074 && vinfo_for_stmt (def_arg_stmt)
7075 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_arg_stmt))
7076 == vect_double_reduction_def)
7077 double_reduc = true;
7080 reduc_fn = IFN_LAST;
7082 if (reduction_type == TREE_CODE_REDUCTION
7083 || reduction_type == FOLD_LEFT_REDUCTION
7084 || reduction_type == INTEGER_INDUC_COND_REDUCTION
7085 || reduction_type == CONST_COND_REDUCTION)
7087 if (reduction_type == FOLD_LEFT_REDUCTION
7088 ? fold_left_reduction_fn (orig_code, &reduc_fn)
7089 : reduction_fn_for_scalar_code (orig_code, &reduc_fn))
7091 if (reduc_fn != IFN_LAST
7092 && !direct_internal_fn_supported_p (reduc_fn, vectype_out,
7093 OPTIMIZE_FOR_SPEED))
7095 if (dump_enabled_p ())
7096 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7097 "reduc op not supported by target.\n");
7099 reduc_fn = IFN_LAST;
7102 else
7104 if (!nested_cycle || double_reduc)
7106 if (dump_enabled_p ())
7107 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7108 "no reduc code for scalar code.\n");
7110 return false;
7114 else if (reduction_type == COND_REDUCTION)
7116 int scalar_precision
7117 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
7118 cr_index_scalar_type = make_unsigned_type (scalar_precision);
7119 cr_index_vector_type = build_vector_type (cr_index_scalar_type,
7120 nunits_out);
7122 if (direct_internal_fn_supported_p (IFN_REDUC_MAX, cr_index_vector_type,
7123 OPTIMIZE_FOR_SPEED))
7124 reduc_fn = IFN_REDUC_MAX;
7127 if (reduction_type != EXTRACT_LAST_REDUCTION
7128 && reduc_fn == IFN_LAST
7129 && !nunits_out.is_constant ())
7131 if (dump_enabled_p ())
7132 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7133 "missing target support for reduction on"
7134 " variable-length vectors.\n");
7135 return false;
7138 if ((double_reduc || reduction_type != TREE_CODE_REDUCTION)
7139 && ncopies > 1)
7141 if (dump_enabled_p ())
7142 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7143 "multiple types in double reduction or condition "
7144 "reduction.\n");
7145 return false;
7148 /* For SLP reductions, see if there is a neutral value we can use. */
7149 tree neutral_op = NULL_TREE;
7150 if (slp_node)
7151 neutral_op
7152 = neutral_op_for_slp_reduction (slp_node_instance->reduc_phis, code,
7153 GROUP_FIRST_ELEMENT (stmt_info) != NULL);
7155 if (double_reduc && reduction_type == FOLD_LEFT_REDUCTION)
7157 /* We can't support in-order reductions of code such as this:
7159 for (int i = 0; i < n1; ++i)
7160 for (int j = 0; j < n2; ++j)
7161 l += a[j];
7163 since GCC effectively transforms the loop when vectorizing:
7165 for (int i = 0; i < n1 / VF; ++i)
7166 for (int j = 0; j < n2; ++j)
7167 for (int k = 0; k < VF; ++k)
7168 l += a[j];
7170 which is a reassociation of the original operation. */
7171 if (dump_enabled_p ())
7172 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7173 "in-order double reduction not supported.\n");
7175 return false;
7178 if (reduction_type == FOLD_LEFT_REDUCTION
7179 && slp_node
7180 && !GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
7182 /* We cannot use in-order reductions in this case because there is
7183 an implicit reassociation of the operations involved. */
7184 if (dump_enabled_p ())
7185 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7186 "in-order unchained SLP reductions not supported.\n");
7187 return false;
7190 /* For double reductions, and for SLP reductions with a neutral value,
7191 we construct a variable-length initial vector by loading a vector
7192 full of the neutral value and then shift-and-inserting the start
7193 values into the low-numbered elements. */
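/* As an illustrative sketch only: for a double reduction with start
   value INIT and neutral value 0 on an 8-element vector, the initial
   vector would be built as

     [0, 0, 0, 0, 0, 0, 0, 0]  --shift-and-insert INIT-->
     [INIT, 0, 0, 0, 0, 0, 0, 0]

   which is why IFN_VEC_SHL_INSERT support is required below for
   variable-length vectors.  */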
7194 if ((double_reduc || neutral_op)
7195 && !nunits_out.is_constant ()
7196 && !direct_internal_fn_supported_p (IFN_VEC_SHL_INSERT,
7197 vectype_out, OPTIMIZE_FOR_SPEED))
7199 if (dump_enabled_p ())
7200 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7201 "reduction on variable-length vectors requires"
7202 " target support for a vector-shift-and-insert"
7203 " operation.\n");
7204 return false;
7207 /* Check extra constraints for variable-length unchained SLP reductions. */
7208 if (STMT_SLP_TYPE (stmt_info)
7209 && !GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt))
7210 && !nunits_out.is_constant ())
7212 /* We checked above that we could build the initial vector when
7213 there's a neutral element value. Check here for the case in
7214 which each SLP statement has its own initial value and in which
7215 that value needs to be repeated for every instance of the
7216 statement within the initial vector. */
7217 unsigned int group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
7218 scalar_mode elt_mode = SCALAR_TYPE_MODE (TREE_TYPE (vectype_out));
7219 if (!neutral_op
7220 && !can_duplicate_and_interleave_p (group_size, elt_mode))
7222 if (dump_enabled_p ())
7223 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7224 "unsupported form of SLP reduction for"
7225 " variable-length vectors: cannot build"
7226 " initial vector.\n");
7227 return false;
7229 /* The epilogue code relies on the number of elements being a multiple
7230 of the group size. The duplicate-and-interleave approach to setting
7231 up the initial vector does too. */
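/* Illustrative example: a variable-length vector with 4 + 4*x
   elements cannot hold a whole number of 3-element groups for every
   runtime value of x, so a group size of 3 would be rejected here.  */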
7232 if (!multiple_p (nunits_out, group_size))
7234 if (dump_enabled_p ())
7235 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7236 "unsupported form of SLP reduction for"
7237 " variable-length vectors: the vector size"
7238 " is not a multiple of the number of results.\n");
7239 return false;
7243 /* In case of widening multiplication by a constant, we update the type
7244 of the constant to be the type of the other operand. We check that the
7245 constant fits the type in the pattern recognition pass. */
7246 if (code == DOT_PROD_EXPR
7247 && !types_compatible_p (TREE_TYPE (ops[0]), TREE_TYPE (ops[1])))
7249 if (TREE_CODE (ops[0]) == INTEGER_CST)
7250 ops[0] = fold_convert (TREE_TYPE (ops[1]), ops[0]);
7251 else if (TREE_CODE (ops[1]) == INTEGER_CST)
7252 ops[1] = fold_convert (TREE_TYPE (ops[0]), ops[1]);
7253 else
7255 if (dump_enabled_p ())
7256 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7257 "invalid types in dot-prod\n");
7259 return false;
7263 if (reduction_type == COND_REDUCTION)
7265 widest_int ni;
7267 if (! max_loop_iterations (loop, &ni))
7269 if (dump_enabled_p ())
7270 dump_printf_loc (MSG_NOTE, vect_location,
7271 "loop count not known, cannot create cond "
7272 "reduction.\n");
7273 return false;
7275 /* Convert backedges to iterations. */
7276 ni += 1;
7278 /* The additional index will be the same type as the condition. Check
7279 that the loop can fit into this less one (because we'll use up the
7280 zero slot for when there are no matches). */
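/* A small worked example: with an unsigned char index type the
   induction values 1, 2, ..., 255 are available (0 is reserved for
   "no match"), so the check below rejects loops that may run for 255
   or more iterations.  */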
7281 tree max_index = TYPE_MAX_VALUE (cr_index_scalar_type);
7282 if (wi::geu_p (ni, wi::to_widest (max_index)))
7284 if (dump_enabled_p ())
7285 dump_printf_loc (MSG_NOTE, vect_location,
7286 "loop size is greater than data size.\n");
7287 return false;
7291 /* In case the vectorization factor (VF) is bigger than the number
7292 of elements that we can fit in a vectype (nunits), we have to generate
7293 more than one vector stmt - i.e - we need to "unroll" the
7294 vector stmt by a factor VF/nunits. For more details see documentation
7295 in vectorizable_operation. */
7297 /* If the reduction is used in an outer loop we need to generate
7298 VF intermediate results, like so (e.g. for ncopies=2):
7299 r0 = phi (init, r0)
7300 r1 = phi (init, r1)
7301 r0 = x0 + r0;
7302 r1 = x1 + r1;
7303 (i.e. we generate VF results in 2 registers).
7304 In this case we have a separate def-use cycle for each copy, and therefore
7305 for each copy we get the vector def for the reduction variable from the
7306 respective phi node created for this copy.
7308 Otherwise (the reduction is unused in the loop nest), we can combine
7309 together intermediate results, like so (e.g. for ncopies=2):
7310 r = phi (init, r)
7311 r = x0 + r;
7312 r = x1 + r;
7313 (i.e. we generate VF/2 results in a single register).
7314 In this case for each copy we get the vector def for the reduction variable
7315 from the vectorized reduction operation generated in the previous iteration.
7317 This only works when we see both the reduction PHI and its only consumer
7318 in vectorizable_reduction and there are no intermediate stmts
7319 participating. */
7320 use_operand_p use_p;
7321 gimple *use_stmt;
7322 if (ncopies > 1
7323 && (STMT_VINFO_RELEVANT (stmt_info) <= vect_used_only_live)
7324 && single_imm_use (gimple_phi_result (reduc_def_stmt), &use_p, &use_stmt)
7325 && (use_stmt == stmt
7326 || STMT_VINFO_RELATED_STMT (vinfo_for_stmt (use_stmt)) == stmt))
7328 single_defuse_cycle = true;
7329 epilog_copies = 1;
7331 else
7332 epilog_copies = ncopies;
7334 /* If the reduction stmt is one of the patterns that have lane
7335 reduction embedded we cannot handle the case of ! single_defuse_cycle. */
7336 if ((ncopies > 1
7337 && ! single_defuse_cycle)
7338 && (code == DOT_PROD_EXPR
7339 || code == WIDEN_SUM_EXPR
7340 || code == SAD_EXPR))
7342 if (dump_enabled_p ())
7343 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7344 "multi def-use cycle not possible for lane-reducing "
7345 "reduction operation\n");
7346 return false;
7349 if (slp_node)
7350 vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
7351 else
7352 vec_num = 1;
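/* For a fully-masked loop we need a conditional form of CODE (for a
   PLUS_EXPR reduction this would be the IFN_COND_ADD internal
   function, as a sketch) so that inactive lanes leave the accumulator
   operand unchanged; both the availability check and the masked code
   generation below use COND_FN for this.  */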
7354 internal_fn cond_fn = get_conditional_internal_fn (code);
7355 vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
7357 if (!vec_stmt) /* transformation not required. */
7359 if (first_p)
7360 vect_model_reduction_cost (stmt_info, reduc_fn, ncopies);
7361 if (loop_vinfo && LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo))
7363 if (reduction_type != FOLD_LEFT_REDUCTION
7364 && (cond_fn == IFN_LAST
7365 || !direct_internal_fn_supported_p (cond_fn, vectype_in,
7366 OPTIMIZE_FOR_SPEED)))
7368 if (dump_enabled_p ())
7369 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7370 "can't use a fully-masked loop because no"
7371 " conditional operation is available.\n");
7372 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
7374 else if (reduc_index == -1)
7376 if (dump_enabled_p ())
7377 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7378 "can't use a fully-masked loop for chained"
7379 " reductions.\n");
7380 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
7382 else
7383 vect_record_loop_mask (loop_vinfo, masks, ncopies * vec_num,
7384 vectype_in);
7386 if (dump_enabled_p ()
7387 && reduction_type == FOLD_LEFT_REDUCTION)
7388 dump_printf_loc (MSG_NOTE, vect_location,
7389 "using an in-order (fold-left) reduction.\n");
7390 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
7391 return true;
7394 /* Transform. */
7396 if (dump_enabled_p ())
7397 dump_printf_loc (MSG_NOTE, vect_location, "transform reduction.\n");
7399 /* FORNOW: Multiple types are not supported for condition. */
7400 if (code == COND_EXPR)
7401 gcc_assert (ncopies == 1);
7403 bool masked_loop_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
7405 if (reduction_type == FOLD_LEFT_REDUCTION)
7406 return vectorize_fold_left_reduction
7407 (stmt, gsi, vec_stmt, slp_node, reduc_def_stmt, code,
7408 reduc_fn, ops, vectype_in, reduc_index, masks);
7410 if (reduction_type == EXTRACT_LAST_REDUCTION)
7412 gcc_assert (!slp_node);
7413 return vectorizable_condition (stmt, gsi, vec_stmt,
7414 NULL, reduc_index, NULL);
7417 /* Create the destination vector */
7418 vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
7420 prev_stmt_info = NULL;
7421 prev_phi_info = NULL;
7422 if (!slp_node)
7424 vec_oprnds0.create (1);
7425 vec_oprnds1.create (1);
7426 if (op_type == ternary_op)
7427 vec_oprnds2.create (1);
7430 phis.create (vec_num);
7431 vect_defs.create (vec_num);
7432 if (!slp_node)
7433 vect_defs.quick_push (NULL_TREE);
7435 if (slp_node)
7436 phis.splice (SLP_TREE_VEC_STMTS (slp_node_instance->reduc_phis));
7437 else
7438 phis.quick_push (STMT_VINFO_VEC_STMT (vinfo_for_stmt (reduc_def_stmt)));
7440 for (j = 0; j < ncopies; j++)
7442 if (code == COND_EXPR)
7444 gcc_assert (!slp_node);
7445 vectorizable_condition (stmt, gsi, vec_stmt,
7446 PHI_RESULT (phis[0]),
7447 reduc_index, NULL);
7448 /* Multiple types are not supported for condition. */
7449 break;
7452 /* Handle uses. */
7453 if (j == 0)
7455 if (slp_node)
7457 /* Get vec defs for all the operands except the reduction index,
7458 ensuring the ordering of the ops in the vector is kept. */
7459 auto_vec<tree, 3> slp_ops;
7460 auto_vec<vec<tree>, 3> vec_defs;
7462 slp_ops.quick_push (ops[0]);
7463 slp_ops.quick_push (ops[1]);
7464 if (op_type == ternary_op)
7465 slp_ops.quick_push (ops[2]);
7467 vect_get_slp_defs (slp_ops, slp_node, &vec_defs);
7469 vec_oprnds0.safe_splice (vec_defs[0]);
7470 vec_defs[0].release ();
7471 vec_oprnds1.safe_splice (vec_defs[1]);
7472 vec_defs[1].release ();
7473 if (op_type == ternary_op)
7475 vec_oprnds2.safe_splice (vec_defs[2]);
7476 vec_defs[2].release ();
7479 else
7481 vec_oprnds0.quick_push
7482 (vect_get_vec_def_for_operand (ops[0], stmt));
7483 vec_oprnds1.quick_push
7484 (vect_get_vec_def_for_operand (ops[1], stmt));
7485 if (op_type == ternary_op)
7486 vec_oprnds2.quick_push
7487 (vect_get_vec_def_for_operand (ops[2], stmt));
7490 else
7492 if (!slp_node)
7494 gcc_assert (reduc_index != -1 || ! single_defuse_cycle);
7496 if (single_defuse_cycle && reduc_index == 0)
7497 vec_oprnds0[0] = gimple_get_lhs (new_stmt);
7498 else
7499 vec_oprnds0[0]
7500 = vect_get_vec_def_for_stmt_copy (dts[0], vec_oprnds0[0]);
7501 if (single_defuse_cycle && reduc_index == 1)
7502 vec_oprnds1[0] = gimple_get_lhs (new_stmt);
7503 else
7504 vec_oprnds1[0]
7505 = vect_get_vec_def_for_stmt_copy (dts[1], vec_oprnds1[0]);
7506 if (op_type == ternary_op)
7508 if (single_defuse_cycle && reduc_index == 2)
7509 vec_oprnds2[0] = gimple_get_lhs (new_stmt);
7510 else
7511 vec_oprnds2[0]
7512 = vect_get_vec_def_for_stmt_copy (dts[2], vec_oprnds2[0]);
7517 FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
7519 tree vop[3] = { def0, vec_oprnds1[i], NULL_TREE };
7520 if (masked_loop_p)
7522 /* Make sure that the reduction accumulator is vop[0]. */
7523 if (reduc_index == 1)
7525 gcc_assert (commutative_tree_code (code));
7526 std::swap (vop[0], vop[1]);
7528 tree mask = vect_get_loop_mask (gsi, masks, vec_num * ncopies,
7529 vectype_in, i * ncopies + j);
7530 gcall *call = gimple_build_call_internal (cond_fn, 3, mask,
7531 vop[0], vop[1]);
7532 new_temp = make_ssa_name (vec_dest, call);
7533 gimple_call_set_lhs (call, new_temp);
7534 gimple_call_set_nothrow (call, true);
7535 new_stmt = call;
7537 else
7539 if (op_type == ternary_op)
7540 vop[2] = vec_oprnds2[i];
7542 new_temp = make_ssa_name (vec_dest, new_stmt);
7543 new_stmt = gimple_build_assign (new_temp, code,
7544 vop[0], vop[1], vop[2]);
7546 vect_finish_stmt_generation (stmt, new_stmt, gsi);
7548 if (slp_node)
7550 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
7551 vect_defs.quick_push (new_temp);
7553 else
7554 vect_defs[0] = new_temp;
7557 if (slp_node)
7558 continue;
7560 if (j == 0)
7561 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
7562 else
7563 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
7565 prev_stmt_info = vinfo_for_stmt (new_stmt);
7568 /* Finalize the reduction-phi (set its arguments) and create the
7569 epilog reduction code. */
7570 if ((!single_defuse_cycle || code == COND_EXPR) && !slp_node)
7571 vect_defs[0] = gimple_get_lhs (*vec_stmt);
7573 vect_create_epilog_for_reduction (vect_defs, stmt, reduc_def_stmt,
7574 epilog_copies, reduc_fn, phis,
7575 double_reduc, slp_node, slp_node_instance,
7576 cond_reduc_val, cond_reduc_op_code,
7577 neutral_op);
7579 return true;
7582 /* Function vect_min_worthwhile_factor.
7584 For a loop where we could vectorize the operation indicated by CODE,
7585 return the minimum vectorization factor that makes it worthwhile
7586 to use generic vectors. */
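/* For example, with the factors below a PLUS_EXPR is only considered
   worth decomposing into scalar operations when the vectorization
   factor is at least 4, whereas bitwise operations already pay off at
   a factor of 2.  */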
7587 static unsigned int
7588 vect_min_worthwhile_factor (enum tree_code code)
7590 switch (code)
7592 case PLUS_EXPR:
7593 case MINUS_EXPR:
7594 case NEGATE_EXPR:
7595 return 4;
7597 case BIT_AND_EXPR:
7598 case BIT_IOR_EXPR:
7599 case BIT_XOR_EXPR:
7600 case BIT_NOT_EXPR:
7601 return 2;
7603 default:
7604 return INT_MAX;
7608 /* Return true if VINFO indicates we are doing loop vectorization and if
7609 it is worth decomposing CODE operations into scalar operations for
7610 that loop's vectorization factor. */
7612 bool
7613 vect_worthwhile_without_simd_p (vec_info *vinfo, tree_code code)
7615 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
7616 unsigned HOST_WIDE_INT value;
7617 return (loop_vinfo
7618 && LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&value)
7619 && value >= vect_min_worthwhile_factor (code));
7622 /* Function vectorizable_induction
7624 Check if PHI performs an induction computation that can be vectorized.
7625 If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
7626 phi to replace it, put it in VEC_STMT, and add it to the same basic block.
7627 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
7629 bool
7630 vectorizable_induction (gimple *phi,
7631 gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
7632 gimple **vec_stmt, slp_tree slp_node)
7634 stmt_vec_info stmt_info = vinfo_for_stmt (phi);
7635 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
7636 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7637 unsigned ncopies;
7638 bool nested_in_vect_loop = false;
7639 struct loop *iv_loop;
7640 tree vec_def;
7641 edge pe = loop_preheader_edge (loop);
7642 basic_block new_bb;
7643 tree new_vec, vec_init, vec_step, t;
7644 tree new_name;
7645 gimple *new_stmt;
7646 gphi *induction_phi;
7647 tree induc_def, vec_dest;
7648 tree init_expr, step_expr;
7649 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
7650 unsigned i;
7651 tree expr;
7652 gimple_seq stmts;
7653 imm_use_iterator imm_iter;
7654 use_operand_p use_p;
7655 gimple *exit_phi;
7656 edge latch_e;
7657 tree loop_arg;
7658 gimple_stmt_iterator si;
7659 basic_block bb = gimple_bb (phi);
7661 if (gimple_code (phi) != GIMPLE_PHI)
7662 return false;
7664 if (!STMT_VINFO_RELEVANT_P (stmt_info))
7665 return false;
7667 /* Make sure it was recognized as induction computation. */
7668 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
7669 return false;
7671 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
7672 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
7674 if (slp_node)
7675 ncopies = 1;
7676 else
7677 ncopies = vect_get_num_copies (loop_vinfo, vectype);
7678 gcc_assert (ncopies >= 1);
7680 /* FORNOW. These restrictions should be relaxed. */
7681 if (nested_in_vect_loop_p (loop, phi))
7683 imm_use_iterator imm_iter;
7684 use_operand_p use_p;
7685 gimple *exit_phi;
7686 edge latch_e;
7687 tree loop_arg;
7689 if (ncopies > 1)
7691 if (dump_enabled_p ())
7692 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7693 "multiple types in nested loop.\n");
7694 return false;
7697 /* FORNOW: outer loop induction with SLP not supported. */
7698 if (STMT_SLP_TYPE (stmt_info))
7699 return false;
7701 exit_phi = NULL;
7702 latch_e = loop_latch_edge (loop->inner);
7703 loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
7704 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
7706 gimple *use_stmt = USE_STMT (use_p);
7707 if (is_gimple_debug (use_stmt))
7708 continue;
7710 if (!flow_bb_inside_loop_p (loop->inner, gimple_bb (use_stmt)))
7712 exit_phi = use_stmt;
7713 break;
7716 if (exit_phi)
7718 stmt_vec_info exit_phi_vinfo = vinfo_for_stmt (exit_phi);
7719 if (!(STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
7720 && !STMT_VINFO_LIVE_P (exit_phi_vinfo)))
7722 if (dump_enabled_p ())
7723 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7724 "inner-loop induction only used outside "
7725 "of the outer vectorized loop.\n");
7726 return false;
7730 nested_in_vect_loop = true;
7731 iv_loop = loop->inner;
7733 else
7734 iv_loop = loop;
7735 gcc_assert (iv_loop == (gimple_bb (phi))->loop_father);
7737 if (slp_node && !nunits.is_constant ())
7739 /* The current SLP code creates the initial value element-by-element. */
7740 if (dump_enabled_p ())
7741 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7742 "SLP induction not supported for variable-length"
7743 " vectors.\n");
7744 return false;
7747 if (!vec_stmt) /* transformation not required. */
7749 STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
7750 if (dump_enabled_p ())
7751 dump_printf_loc (MSG_NOTE, vect_location,
7752 "=== vectorizable_induction ===\n");
7753 vect_model_induction_cost (stmt_info, ncopies);
7754 return true;
7757 /* Transform. */
7759 /* Compute a vector variable, initialized with the first VF values of
7760 the induction variable. E.g., for an iv with IV_PHI='X' and
7761 evolution S, for a vector of 4 units, we want to compute:
7762 [X, X + S, X + 2*S, X + 3*S]. */
7764 if (dump_enabled_p ())
7765 dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n");
7767 latch_e = loop_latch_edge (iv_loop);
7768 loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
7770 step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info);
7771 gcc_assert (step_expr != NULL_TREE);
7773 pe = loop_preheader_edge (iv_loop);
7774 init_expr = PHI_ARG_DEF_FROM_EDGE (phi,
7775 loop_preheader_edge (iv_loop));
7777 stmts = NULL;
7778 if (!nested_in_vect_loop)
7780 /* Convert the initial value to the desired type. */
7781 tree new_type = TREE_TYPE (vectype);
7782 init_expr = gimple_convert (&stmts, new_type, init_expr);
7784 /* If we are using the loop mask to "peel" for alignment then we need
7785 to adjust the start value here. */
7786 tree skip_niters = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
7787 if (skip_niters != NULL_TREE)
7789 if (FLOAT_TYPE_P (vectype))
7790 skip_niters = gimple_build (&stmts, FLOAT_EXPR, new_type,
7791 skip_niters);
7792 else
7793 skip_niters = gimple_convert (&stmts, new_type, skip_niters);
7794 tree skip_step = gimple_build (&stmts, MULT_EXPR, new_type,
7795 skip_niters, step_expr);
7796 init_expr = gimple_build (&stmts, MINUS_EXPR, new_type,
7797 init_expr, skip_step);
7801 /* Convert the step to the desired type. */
7802 step_expr = gimple_convert (&stmts, TREE_TYPE (vectype), step_expr);
7804 if (stmts)
7806 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
7807 gcc_assert (!new_bb);
7810 /* Find the first insertion point in the BB. */
7811 si = gsi_after_labels (bb);
7813 /* For SLP induction we have to generate several IVs as for example
7814 with group size 3 we need [i, i, i, i + S] [i + S, i + S, i + 2*S, i + 2*S]
7815 [i + 2*S, i + 3*S, i + 3*S, i + 3*S]. The step is the same uniform
7816 [VF*S, VF*S, VF*S, VF*S] for all. */
7817 if (slp_node)
7819 /* Enforced above. */
7820 unsigned int const_nunits = nunits.to_constant ();
7822 /* Generate [VF*S, VF*S, ... ]. */
7823 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7825 expr = build_int_cst (integer_type_node, vf);
7826 expr = fold_convert (TREE_TYPE (step_expr), expr);
7828 else
7829 expr = build_int_cst (TREE_TYPE (step_expr), vf);
7830 new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
7831 expr, step_expr);
7832 if (! CONSTANT_CLASS_P (new_name))
7833 new_name = vect_init_vector (phi, new_name,
7834 TREE_TYPE (step_expr), NULL);
7835 new_vec = build_vector_from_val (vectype, new_name);
7836 vec_step = vect_init_vector (phi, new_vec, vectype, NULL);
7838 /* Now generate the IVs. */
7839 unsigned group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
7840 unsigned nvects = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
7841 unsigned elts = const_nunits * nvects;
7842 unsigned nivs = least_common_multiple (group_size,
7843 const_nunits) / const_nunits;
7844 gcc_assert (elts % group_size == 0);
7845 tree elt = init_expr;
7846 unsigned ivn;
7847 for (ivn = 0; ivn < nivs; ++ivn)
7849 tree_vector_builder elts (vectype, const_nunits, 1);
7850 stmts = NULL;
7851 for (unsigned eltn = 0; eltn < const_nunits; ++eltn)
7853 if (ivn*const_nunits + eltn >= group_size
7854 && (ivn * const_nunits + eltn) % group_size == 0)
7855 elt = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (elt),
7856 elt, step_expr);
7857 elts.quick_push (elt);
7859 vec_init = gimple_build_vector (&stmts, &elts);
7860 if (stmts)
7862 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
7863 gcc_assert (!new_bb);
7866 /* Create the induction-phi that defines the induction-operand. */
7867 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
7868 induction_phi = create_phi_node (vec_dest, iv_loop->header);
7869 set_vinfo_for_stmt (induction_phi,
7870 new_stmt_vec_info (induction_phi, loop_vinfo));
7871 induc_def = PHI_RESULT (induction_phi);
7873 /* Create the iv update inside the loop */
7874 vec_def = make_ssa_name (vec_dest);
7875 new_stmt = gimple_build_assign (vec_def, PLUS_EXPR, induc_def, vec_step);
7876 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
7877 set_vinfo_for_stmt (new_stmt, new_stmt_vec_info (new_stmt, loop_vinfo));
7879 /* Set the arguments of the phi node: */
7880 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
7881 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
7882 UNKNOWN_LOCATION);
7884 SLP_TREE_VEC_STMTS (slp_node).quick_push (induction_phi);
7887 /* Re-use IVs when we can. */
7888 if (ivn < nvects)
7890 unsigned vfp
7891 = least_common_multiple (group_size, const_nunits) / group_size;
7892 /* Generate [VF'*S, VF'*S, ... ]. */
7893 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7895 expr = build_int_cst (integer_type_node, vfp);
7896 expr = fold_convert (TREE_TYPE (step_expr), expr);
7898 else
7899 expr = build_int_cst (TREE_TYPE (step_expr), vfp);
7900 new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
7901 expr, step_expr);
7902 if (! CONSTANT_CLASS_P (new_name))
7903 new_name = vect_init_vector (phi, new_name,
7904 TREE_TYPE (step_expr), NULL);
7905 new_vec = build_vector_from_val (vectype, new_name);
7906 vec_step = vect_init_vector (phi, new_vec, vectype, NULL);
7907 for (; ivn < nvects; ++ivn)
7909 gimple *iv = SLP_TREE_VEC_STMTS (slp_node)[ivn - nivs];
7910 tree def;
7911 if (gimple_code (iv) == GIMPLE_PHI)
7912 def = gimple_phi_result (iv);
7913 else
7914 def = gimple_assign_lhs (iv);
7915 new_stmt = gimple_build_assign (make_ssa_name (vectype),
7916 PLUS_EXPR,
7917 def, vec_step);
7918 if (gimple_code (iv) == GIMPLE_PHI)
7919 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
7920 else
7922 gimple_stmt_iterator tgsi = gsi_for_stmt (iv);
7923 gsi_insert_after (&tgsi, new_stmt, GSI_CONTINUE_LINKING);
7925 set_vinfo_for_stmt (new_stmt,
7926 new_stmt_vec_info (new_stmt, loop_vinfo));
7927 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
7931 return true;
7934 /* Create the vector that holds the initial_value of the induction. */
7935 if (nested_in_vect_loop)
7937 /* iv_loop is nested in the loop to be vectorized. init_expr has already
7938 been created during vectorization of previous stmts. We obtain it
7939 from the STMT_VINFO_VEC_STMT of the defining stmt. */
7940 vec_init = vect_get_vec_def_for_operand (init_expr, phi);
7941 /* If the initial value is not of proper type, convert it. */
7942 if (!useless_type_conversion_p (vectype, TREE_TYPE (vec_init)))
7944 new_stmt
7945 = gimple_build_assign (vect_get_new_ssa_name (vectype,
7946 vect_simple_var,
7947 "vec_iv_"),
7948 VIEW_CONVERT_EXPR,
7949 build1 (VIEW_CONVERT_EXPR, vectype,
7950 vec_init));
7951 vec_init = gimple_assign_lhs (new_stmt);
7952 new_bb = gsi_insert_on_edge_immediate (loop_preheader_edge (iv_loop),
7953 new_stmt);
7954 gcc_assert (!new_bb);
7955 set_vinfo_for_stmt (new_stmt,
7956 new_stmt_vec_info (new_stmt, loop_vinfo));
7959 else
7961 /* iv_loop is the loop to be vectorized. Create:
7962 vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr) */
7963 stmts = NULL;
7964 new_name = gimple_convert (&stmts, TREE_TYPE (vectype), init_expr);
7966 unsigned HOST_WIDE_INT const_nunits;
7967 if (nunits.is_constant (&const_nunits))
7969 tree_vector_builder elts (vectype, const_nunits, 1);
7970 elts.quick_push (new_name);
7971 for (i = 1; i < const_nunits; i++)
7973 /* Create: new_name_i = new_name + step_expr */
7974 new_name = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (new_name),
7975 new_name, step_expr);
7976 elts.quick_push (new_name);
7978 /* Create a vector from [new_name_0, new_name_1, ...,
7979 new_name_nunits-1] */
7980 vec_init = gimple_build_vector (&stmts, &elts);
7982 else if (INTEGRAL_TYPE_P (TREE_TYPE (step_expr)))
7983 /* Build the initial value directly from a VEC_SERIES_EXPR. */
7984 vec_init = gimple_build (&stmts, VEC_SERIES_EXPR, vectype,
7985 new_name, step_expr);
7986 else
7988 /* Build:
7989 [base, base, base, ...]
7990 + (vectype) [0, 1, 2, ...] * [step, step, step, ...]. */
7991 gcc_assert (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)));
7992 gcc_assert (flag_associative_math);
7993 tree index = build_index_vector (vectype, 0, 1);
7994 tree base_vec = gimple_build_vector_from_val (&stmts, vectype,
7995 new_name);
7996 tree step_vec = gimple_build_vector_from_val (&stmts, vectype,
7997 step_expr);
7998 vec_init = gimple_build (&stmts, FLOAT_EXPR, vectype, index);
7999 vec_init = gimple_build (&stmts, MULT_EXPR, vectype,
8000 vec_init, step_vec);
8001 vec_init = gimple_build (&stmts, PLUS_EXPR, vectype,
8002 vec_init, base_vec);
8005 if (stmts)
8007 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
8008 gcc_assert (!new_bb);
8013 /* Create the vector that holds the step of the induction. */
8014 if (nested_in_vect_loop)
8015 /* iv_loop is nested in the loop to be vectorized. Generate:
8016 vec_step = [S, S, S, S] */
8017 new_name = step_expr;
8018 else
8020 /* iv_loop is the loop to be vectorized. Generate:
8021 vec_step = [VF*S, VF*S, VF*S, VF*S] */
8022 gimple_seq seq = NULL;
8023 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
8025 expr = build_int_cst (integer_type_node, vf);
8026 expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
8028 else
8029 expr = build_int_cst (TREE_TYPE (step_expr), vf);
8030 new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
8031 expr, step_expr);
8032 if (seq)
8034 new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
8035 gcc_assert (!new_bb);
8039 t = unshare_expr (new_name);
8040 gcc_assert (CONSTANT_CLASS_P (new_name)
8041 || TREE_CODE (new_name) == SSA_NAME);
8042 new_vec = build_vector_from_val (vectype, t);
8043 vec_step = vect_init_vector (phi, new_vec, vectype, NULL);
8046 /* Create the following def-use cycle:
8047 loop prolog:
8048 vec_init = ...
8049 vec_step = ...
8050 loop:
8051 vec_iv = PHI <vec_init, vec_loop>
8053 STMT
8055 vec_loop = vec_iv + vec_step; */
8057 /* Create the induction-phi that defines the induction-operand. */
8058 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
8059 induction_phi = create_phi_node (vec_dest, iv_loop->header);
8060 set_vinfo_for_stmt (induction_phi,
8061 new_stmt_vec_info (induction_phi, loop_vinfo));
8062 induc_def = PHI_RESULT (induction_phi);
8064 /* Create the iv update inside the loop */
8065 vec_def = make_ssa_name (vec_dest);
8066 new_stmt = gimple_build_assign (vec_def, PLUS_EXPR, induc_def, vec_step);
8067 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
8068 set_vinfo_for_stmt (new_stmt, new_stmt_vec_info (new_stmt, loop_vinfo));
8070 /* Set the arguments of the phi node: */
8071 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
8072 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
8073 UNKNOWN_LOCATION);
8075 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = induction_phi;
8077 /* In case the vectorization factor (VF) is bigger than the number
8078 of elements that we can fit in a vectype (nunits), we have to generate
8079 more than one vector stmt - i.e - we need to "unroll" the
8080 vector stmt by a factor VF/nunits. For more details see documentation
8081 in vectorizable_operation. */
8083 if (ncopies > 1)
8085 gimple_seq seq = NULL;
8086 stmt_vec_info prev_stmt_vinfo;
8087 /* FORNOW. This restriction should be relaxed. */
8088 gcc_assert (!nested_in_vect_loop);
8090 /* Create the vector that holds the step of the induction. */
8091 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
8093 expr = build_int_cst (integer_type_node, nunits);
8094 expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
8096 else
8097 expr = build_int_cst (TREE_TYPE (step_expr), nunits);
8098 new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
8099 expr, step_expr);
8100 if (seq)
8102 new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
8103 gcc_assert (!new_bb);
8106 t = unshare_expr (new_name);
8107 gcc_assert (CONSTANT_CLASS_P (new_name)
8108 || TREE_CODE (new_name) == SSA_NAME);
8109 new_vec = build_vector_from_val (vectype, t);
8110 vec_step = vect_init_vector (phi, new_vec, vectype, NULL);
8112 vec_def = induc_def;
8113 prev_stmt_vinfo = vinfo_for_stmt (induction_phi);
8114 for (i = 1; i < ncopies; i++)
8116 /* vec_i = vec_prev + vec_step */
8117 new_stmt = gimple_build_assign (vec_dest, PLUS_EXPR,
8118 vec_def, vec_step);
8119 vec_def = make_ssa_name (vec_dest, new_stmt);
8120 gimple_assign_set_lhs (new_stmt, vec_def);
8122 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
8123 set_vinfo_for_stmt (new_stmt,
8124 new_stmt_vec_info (new_stmt, loop_vinfo));
8125 STMT_VINFO_RELATED_STMT (prev_stmt_vinfo) = new_stmt;
8126 prev_stmt_vinfo = vinfo_for_stmt (new_stmt);
8130 if (nested_in_vect_loop)
8132 /* Find the loop-closed exit-phi of the induction, and record
8133 the final vector of induction results: */
8134 exit_phi = NULL;
8135 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
8137 gimple *use_stmt = USE_STMT (use_p);
8138 if (is_gimple_debug (use_stmt))
8139 continue;
8141 if (!flow_bb_inside_loop_p (iv_loop, gimple_bb (use_stmt)))
8143 exit_phi = use_stmt;
8144 break;
8147 if (exit_phi)
8149 stmt_vec_info stmt_vinfo = vinfo_for_stmt (exit_phi);
8150 /* FORNOW. We do not currently support the case in which an inner-loop
8151 induction is not used in the outer-loop (i.e. is used only outside it). */
8152 gcc_assert (STMT_VINFO_RELEVANT_P (stmt_vinfo)
8153 && !STMT_VINFO_LIVE_P (stmt_vinfo));
8155 STMT_VINFO_VEC_STMT (stmt_vinfo) = new_stmt;
8156 if (dump_enabled_p ())
8158 dump_printf_loc (MSG_NOTE, vect_location,
8159 "vector of inductions after inner-loop:");
8160 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, new_stmt, 0);
8166 if (dump_enabled_p ())
8168 dump_printf_loc (MSG_NOTE, vect_location,
8169 "transform induction: created def-use cycle: ");
8170 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, induction_phi, 0);
8171 dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
8172 SSA_NAME_DEF_STMT (vec_def), 0);
8175 return true;
8178 /* Function vectorizable_live_operation.
8180 STMT computes a value that is used outside the loop. Check if
8181 it can be supported. */
8183 bool
8184 vectorizable_live_operation (gimple *stmt,
8185 gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
8186 slp_tree slp_node, int slp_index,
8187 gimple **vec_stmt)
8189 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
8190 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
8191 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8192 imm_use_iterator imm_iter;
8193 tree lhs, lhs_type, bitsize, vec_bitsize;
8194 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
8195 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
8196 int ncopies;
8197 gimple *use_stmt;
8198 auto_vec<tree> vec_oprnds;
8199 int vec_entry = 0;
8200 poly_uint64 vec_index = 0;
8202 gcc_assert (STMT_VINFO_LIVE_P (stmt_info));
8204 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def)
8205 return false;
8207 /* FORNOW. CHECKME. */
8208 if (nested_in_vect_loop_p (loop, stmt))
8209 return false;
8211 /* If STMT is not relevant and it is a simple assignment and its inputs are
8212 invariant then it can remain in place, unvectorized. The original last
8213 scalar value that it computes will be used. */
8214 if (!STMT_VINFO_RELEVANT_P (stmt_info))
8216 gcc_assert (is_simple_and_all_uses_invariant (stmt, loop_vinfo));
8217 if (dump_enabled_p ())
8218 dump_printf_loc (MSG_NOTE, vect_location,
8219 "statement is simple and uses invariant. Leaving in "
8220 "place.\n");
8221 return true;
8224 if (slp_node)
8225 ncopies = 1;
8226 else
8227 ncopies = vect_get_num_copies (loop_vinfo, vectype);
8229 if (slp_node)
8231 gcc_assert (slp_index >= 0);
8233 int num_scalar = SLP_TREE_SCALAR_STMTS (slp_node).length ();
8234 int num_vec = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
8236 /* Get the last occurrence of the scalar index from the concatenation of
8237 all the slp vectors. Calculate which slp vector it is and the index
8238 within. */
8239 poly_uint64 pos = (num_vec * nunits) - num_scalar + slp_index;
8241 /* Calculate which vector contains the result, and which lane of
8242 that vector we need. */
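/* Worked example (hypothetical numbers): with 4 vectors of 2 lanes,
   2 scalar stmts and slp_index 1, pos = 4*2 - 2 + 1 = 7, which the
   division below resolves to vec_entry 3, vec_index 1, i.e. the last
   lane of the last vector.  */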
8243 if (!can_div_trunc_p (pos, nunits, &vec_entry, &vec_index))
8245 if (dump_enabled_p ())
8246 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8247 "Cannot determine which vector holds the"
8248 " final result.\n");
8249 return false;
8253 if (!vec_stmt)
8255 /* No transformation required. */
8256 if (LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo))
8258 if (!direct_internal_fn_supported_p (IFN_EXTRACT_LAST, vectype,
8259 OPTIMIZE_FOR_SPEED))
8261 if (dump_enabled_p ())
8262 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8263 "can't use a fully-masked loop because "
8264 "the target doesn't support extract last "
8265 "reduction.\n");
8266 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
8268 else if (slp_node)
8270 if (dump_enabled_p ())
8271 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8272 "can't use a fully-masked loop because an "
8273 "SLP statement is live after the loop.\n");
8274 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
8276 else if (ncopies > 1)
8278 if (dump_enabled_p ())
8279 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8280 "can't use a fully-masked loop because"
8281 " ncopies is greater than 1.\n");
8282 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
8284 else
8286 gcc_assert (ncopies == 1 && !slp_node);
8287 vect_record_loop_mask (loop_vinfo,
8288 &LOOP_VINFO_MASKS (loop_vinfo),
8289 1, vectype);
8292 return true;
8295 /* If stmt has a related stmt, then use that for getting the lhs. */
8296 if (is_pattern_stmt_p (stmt_info))
8297 stmt = STMT_VINFO_RELATED_STMT (stmt_info);
8299 lhs = (is_a <gphi *> (stmt)) ? gimple_phi_result (stmt)
8300 : gimple_get_lhs (stmt);
8301 lhs_type = TREE_TYPE (lhs);
8303 bitsize = (VECTOR_BOOLEAN_TYPE_P (vectype)
8304 ? bitsize_int (TYPE_PRECISION (TREE_TYPE (vectype)))
8305 : TYPE_SIZE (TREE_TYPE (vectype)));
8306 vec_bitsize = TYPE_SIZE (vectype);
8308 /* Get the vectorized lhs of STMT and the lane to use (counted in bits). */
8309 tree vec_lhs, bitstart;
8310 if (slp_node)
8312 gcc_assert (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo));
8314 /* Get the correct slp vectorized stmt. */
8315 gimple *vec_stmt = SLP_TREE_VEC_STMTS (slp_node)[vec_entry];
8316 if (gphi *phi = dyn_cast <gphi *> (vec_stmt))
8317 vec_lhs = gimple_phi_result (phi);
8318 else
8319 vec_lhs = gimple_get_lhs (vec_stmt);
8321 /* Get entry to use. */
8322 bitstart = bitsize_int (vec_index);
8323 bitstart = int_const_binop (MULT_EXPR, bitsize, bitstart);
8325 else
8327 enum vect_def_type dt = STMT_VINFO_DEF_TYPE (stmt_info);
8328 vec_lhs = vect_get_vec_def_for_operand_1 (stmt, dt);
8329 gcc_checking_assert (ncopies == 1
8330 || !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo));
8332 /* For multiple copies, get the last copy. */
8333 for (int i = 1; i < ncopies; ++i)
8334 vec_lhs = vect_get_vec_def_for_stmt_copy (vect_unknown_def_type,
8335 vec_lhs);
8337 /* Get the last lane in the vector. */
8338 bitstart = int_const_binop (MINUS_EXPR, vec_bitsize, bitsize);
8341 gimple_seq stmts = NULL;
8342 tree new_tree;
8343 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
8345 /* Emit:
8347 SCALAR_RES = EXTRACT_LAST <VEC_LHS, MASK>
8349 where VEC_LHS is the vectorized live-out result and MASK is
8350 the loop mask for the final iteration. */
8351 gcc_assert (ncopies == 1 && !slp_node);
8352 tree scalar_type = TREE_TYPE (STMT_VINFO_VECTYPE (stmt_info));
8353 tree scalar_res = make_ssa_name (scalar_type);
8354 tree mask = vect_get_loop_mask (gsi, &LOOP_VINFO_MASKS (loop_vinfo),
8355 1, vectype, 0);
8356 gcall *new_stmt = gimple_build_call_internal (IFN_EXTRACT_LAST,
8357 2, mask, vec_lhs);
8358 gimple_call_set_lhs (new_stmt, scalar_res);
8359 gimple_seq_add_stmt (&stmts, new_stmt);
8361 /* Convert the extracted vector element to the required scalar type. */
8362 new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
8364 else
8366 tree bftype = TREE_TYPE (vectype);
8367 if (VECTOR_BOOLEAN_TYPE_P (vectype))
8368 bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1);
8369 new_tree = build3 (BIT_FIELD_REF, bftype, vec_lhs, bitsize, bitstart);
8370 new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree),
8371 &stmts, true, NULL_TREE);
8374 if (stmts)
8375 gsi_insert_seq_on_edge_immediate (single_exit (loop), stmts);
8377 /* Replace use of lhs with newly computed result. If the use stmt is a
8378 single arg PHI, just replace all uses of the PHI result. This is necessary
8379 because the lcssa PHI defining lhs may come before the newly inserted stmt. */
8380 use_operand_p use_p;
8381 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
8382 if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt))
8383 && !is_gimple_debug (use_stmt))
8385 if (gimple_code (use_stmt) == GIMPLE_PHI
8386 && gimple_phi_num_args (use_stmt) == 1)
8388 replace_uses_by (gimple_phi_result (use_stmt), new_tree);
8390 else
8392 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
8393 SET_USE (use_p, new_tree);
8395 update_stmt (use_stmt);
8398 return true;
8401 /* Kill any debug uses outside LOOP of SSA names defined in STMT. */
8403 static void
8404 vect_loop_kill_debug_uses (struct loop *loop, gimple *stmt)
8406 ssa_op_iter op_iter;
8407 imm_use_iterator imm_iter;
8408 def_operand_p def_p;
8409 gimple *ustmt;
8411 FOR_EACH_PHI_OR_STMT_DEF (def_p, stmt, op_iter, SSA_OP_DEF)
8413 FOR_EACH_IMM_USE_STMT (ustmt, imm_iter, DEF_FROM_PTR (def_p))
8415 basic_block bb;
8417 if (!is_gimple_debug (ustmt))
8418 continue;
8420 bb = gimple_bb (ustmt);
8422 if (!flow_bb_inside_loop_p (loop, bb))
8424 if (gimple_debug_bind_p (ustmt))
8426 if (dump_enabled_p ())
8427 dump_printf_loc (MSG_NOTE, vect_location,
8428 "killing debug use\n");
8430 gimple_debug_bind_reset_value (ustmt);
8431 update_stmt (ustmt);
8433 else
8434 gcc_unreachable ();
8440 /* Given loop represented by LOOP_VINFO, return true if computation of
8441 LOOP_VINFO_NITERS (= LOOP_VINFO_NITERSM1 + 1) doesn't overflow, false
8442 otherwise. */
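/* For instance, with a 32-bit IV a loop whose latch executes
   0xffffffff times has NITERSM1 == 0xffffffff, so NITERS would wrap
   around to 0; that is the situation the checks below guard
   against.  */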
8444 static bool
8445 loop_niters_no_overflow (loop_vec_info loop_vinfo)
8447 /* Constant case. */
8448 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
8450 tree cst_niters = LOOP_VINFO_NITERS (loop_vinfo);
8451 tree cst_nitersm1 = LOOP_VINFO_NITERSM1 (loop_vinfo);
8453 gcc_assert (TREE_CODE (cst_niters) == INTEGER_CST);
8454 gcc_assert (TREE_CODE (cst_nitersm1) == INTEGER_CST);
8455 if (wi::to_widest (cst_nitersm1) < wi::to_widest (cst_niters))
8456 return true;
8459 widest_int max;
8460 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8461 /* Check the upper bound of loop niters. */
8462 if (get_max_loop_iterations (loop, &max))
8464 tree type = TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo));
8465 signop sgn = TYPE_SIGN (type);
8466 widest_int type_max = widest_int::from (wi::max_value (type), sgn);
8467 if (max < type_max)
8468 return true;
8470 return false;
8473 /* Return a mask type with half the number of elements as TYPE. */
8475 tree
8476 vect_halve_mask_nunits (tree type)
8478 poly_uint64 nunits = exact_div (TYPE_VECTOR_SUBPARTS (type), 2);
8479 return build_truth_vector_type (nunits, current_vector_size);
8482 /* Return a mask type with twice as many elements as TYPE. */
8484 tree
8485 vect_double_mask_nunits (tree type)
8487 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (type) * 2;
8488 return build_truth_vector_type (nunits, current_vector_size);
8491 /* Record that a fully-masked version of LOOP_VINFO would need MASKS to
8492 contain a sequence of NVECTORS masks that each control a vector of type
8493 VECTYPE. */
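/* As a sketch: with a vectorization factor of 16, an rgroup of two
   vectors of 16 shorts handles 2 * 16 / 16 = 2 scalars per iteration,
   so the entry for NVECTORS == 2 records a max_nscalars_per_iter of
   at least 2 and a mask type with the same number of elements as the
   short vector.  */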
8495 void
8496 vect_record_loop_mask (loop_vec_info loop_vinfo, vec_loop_masks *masks,
8497 unsigned int nvectors, tree vectype)
8499 gcc_assert (nvectors != 0);
8500 if (masks->length () < nvectors)
8501 masks->safe_grow_cleared (nvectors);
8502 rgroup_masks *rgm = &(*masks)[nvectors - 1];
8503 /* The number of scalars per iteration and the number of vectors are
8504 both compile-time constants. */
8505 unsigned int nscalars_per_iter
8506 = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
8507 LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
8508 if (rgm->max_nscalars_per_iter < nscalars_per_iter)
8510 rgm->max_nscalars_per_iter = nscalars_per_iter;
8511 rgm->mask_type = build_same_sized_truth_vector_type (vectype);
8515 /* Given a complete set of masks MASKS, extract mask number INDEX
8516 for an rgroup that operates on NVECTORS vectors of type VECTYPE,
8517 where 0 <= INDEX < NVECTORS. Insert any set-up statements before GSI.
8519 See the comment above vec_loop_masks for more details about the mask
8520 arrangement. */
8522 tree
8523 vect_get_loop_mask (gimple_stmt_iterator *gsi, vec_loop_masks *masks,
8524 unsigned int nvectors, tree vectype, unsigned int index)
8526 rgroup_masks *rgm = &(*masks)[nvectors - 1];
8527 tree mask_type = rgm->mask_type;
8529 /* Populate the rgroup's mask array, if this is the first time we've
8530 used it. */
8531 if (rgm->masks.is_empty ())
8533 rgm->masks.safe_grow_cleared (nvectors);
8534 for (unsigned int i = 0; i < nvectors; ++i)
8536 tree mask = make_temp_ssa_name (mask_type, NULL, "loop_mask");
8537 /* Provide a dummy definition until the real one is available. */
8538 SSA_NAME_DEF_STMT (mask) = gimple_build_nop ();
8539 rgm->masks[i] = mask;
8543 tree mask = rgm->masks[index];
8544 if (maybe_ne (TYPE_VECTOR_SUBPARTS (mask_type),
8545 TYPE_VECTOR_SUBPARTS (vectype)))
8547 /* A loop mask for data type X can be reused for data type Y
8548 if X has N times more elements than Y and if Y's elements
8549 are N times bigger than X's. In this case each sequence
8550 of N elements in the loop mask will be all-zero or all-one.
8551 We can then view-convert the mask so that each sequence of
8552 N elements is replaced by a single element. */
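/* Sketch of the reuse case: a mask computed for a vector of 8 shorts
   can serve a vector of 4 ints; each adjacent pair of elements in the
   short mask is known to be all-zero or all-one, so the
   VIEW_CONVERT_EXPR below folds every pair into a single int-sized
   mask element.  */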
8553 gcc_assert (multiple_p (TYPE_VECTOR_SUBPARTS (mask_type),
8554 TYPE_VECTOR_SUBPARTS (vectype)));
8555 gimple_seq seq = NULL;
8556 mask_type = build_same_sized_truth_vector_type (vectype);
8557 mask = gimple_build (&seq, VIEW_CONVERT_EXPR, mask_type, mask);
8558 if (seq)
8559 gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
8561 return mask;
8564 /* Scale profiling counters by estimation for LOOP which is vectorized
8565 by factor VF. */
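/* For example, if the loop was estimated to iterate about 100 times
   and is vectorized by VF == 4, the estimated iteration count of the
   vector loop becomes roughly 25, and the exit edge probability is
   rescaled below to 1 / (new_est_niter + 1).  */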
8567 static void
8568 scale_profile_for_vect_loop (struct loop *loop, unsigned vf)
8570 edge preheader = loop_preheader_edge (loop);
8571 /* Reduce loop iterations by the vectorization factor. */
8572 gcov_type new_est_niter = niter_for_unrolled_loop (loop, vf);
8573 profile_count freq_h = loop->header->count, freq_e = preheader->count ();
8575 if (freq_h.nonzero_p ())
8577 profile_probability p;
8579 /* Avoid dropping loop body profile counter to 0 because of zero count
8580 in loop's preheader. */
8581 if (!(freq_e == profile_count::zero ()))
8582 freq_e = freq_e.force_nonzero ();
8583 p = freq_e.apply_scale (new_est_niter + 1, 1).probability_in (freq_h);
8584 scale_loop_frequencies (loop, p);
8587 edge exit_e = single_exit (loop);
8588 exit_e->probability = profile_probability::always ()
8589 .apply_scale (1, new_est_niter + 1);
8591 edge exit_l = single_pred_edge (loop->latch);
8592 profile_probability prob = exit_l->probability;
8593 exit_l->probability = exit_e->probability.invert ();
8594 if (prob.initialized_p () && exit_l->probability.initialized_p ())
8595 scale_bbs_frequencies (&loop->latch, 1, exit_l->probability / prob);
8598 /* Function vect_transform_loop.
8600 The analysis phase has determined that the loop is vectorizable.
8601 Vectorize the loop - create vectorized stmts to replace the scalar
8602 stmts in the loop, and update the loop exit condition.
8603 Returns scalar epilogue loop if any. */
8605 struct loop *
8606 vect_transform_loop (loop_vec_info loop_vinfo)
8608 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8609 struct loop *epilogue = NULL;
8610 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
8611 int nbbs = loop->num_nodes;
8612 int i;
8613 tree niters_vector = NULL_TREE;
8614 tree step_vector = NULL_TREE;
8615 tree niters_vector_mult_vf = NULL_TREE;
8616 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
8617 unsigned int lowest_vf = constant_lower_bound (vf);
8618 bool grouped_store;
8619 bool slp_scheduled = false;
8620 gimple *stmt, *pattern_stmt;
8621 gimple_seq pattern_def_seq = NULL;
8622 gimple_stmt_iterator pattern_def_si = gsi_none ();
8623 bool transform_pattern_stmt = false;
8624 bool check_profitability = false;
8625 unsigned int th;
8627 if (dump_enabled_p ())
8628 dump_printf_loc (MSG_NOTE, vect_location, "=== vec_transform_loop ===\n");
8630 /* Use the more conservative vectorization threshold. If the number
8631 of iterations is constant assume the cost check has been performed
8632 by our caller. If the threshold makes all loops profitable that
8633 run at least the (estimated) vectorization factor number of times
8634 checking is pointless, too. */
8635 th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
8636 if (th >= vect_vf_for_cost (loop_vinfo)
8637 && !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
8639 if (dump_enabled_p ())
8640 dump_printf_loc (MSG_NOTE, vect_location,
8641 "Profitability threshold is %d loop iterations.\n",
8642 th);
8643 check_profitability = true;
8646 /* Make sure there exists a single-predecessor exit bb. Do this before
8647 versioning. */
8648 edge e = single_exit (loop);
8649 if (! single_pred_p (e->dest))
8651 split_loop_exit_edge (e);
8652 if (dump_enabled_p ())
8653 dump_printf (MSG_NOTE, "split exit edge\n");
8656 /* Version the loop first, if required, so the profitability check
8657 comes first. */
8659 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
8661 poly_uint64 versioning_threshold
8662 = LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo);
8663 if (check_profitability
8664 && ordered_p (poly_uint64 (th), versioning_threshold))
8666 versioning_threshold = ordered_max (poly_uint64 (th),
8667 versioning_threshold);
8668 check_profitability = false;
8670 vect_loop_versioning (loop_vinfo, th, check_profitability,
8671 versioning_threshold);
8672 check_profitability = false;
8675 /* Make sure there exists a single-predecessor exit bb also on the
8676 scalar loop copy. Do this after versioning but before peeling
8677 so CFG structure is fine for both scalar and if-converted loop
8678 to make slpeel_duplicate_current_defs_from_edges face matched
8679 loop closed PHI nodes on the exit. */
8680 if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))
8682 e = single_exit (LOOP_VINFO_SCALAR_LOOP (loop_vinfo));
8683 if (! single_pred_p (e->dest))
8685 split_loop_exit_edge (e);
8686 if (dump_enabled_p ())
8687 dump_printf (MSG_NOTE, "split exit edge of scalar loop\n");
8691 tree niters = vect_build_loop_niters (loop_vinfo);
8692 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = niters;
8693 tree nitersm1 = unshare_expr (LOOP_VINFO_NITERSM1 (loop_vinfo));
8694 bool niters_no_overflow = loop_niters_no_overflow (loop_vinfo);
8695 epilogue = vect_do_peeling (loop_vinfo, niters, nitersm1, &niters_vector,
8696 &step_vector, &niters_vector_mult_vf, th,
8697 check_profitability, niters_no_overflow);
8699 if (niters_vector == NULL_TREE)
8701 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
8702 && !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
8703 && known_eq (lowest_vf, vf))
8705 niters_vector
8706 = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
8707 LOOP_VINFO_INT_NITERS (loop_vinfo) / lowest_vf);
8708 step_vector = build_one_cst (TREE_TYPE (niters));
8710 else
8711 vect_gen_vector_loop_niters (loop_vinfo, niters, &niters_vector,
8712 &step_vector, niters_no_overflow);
8715 /* 1) Make sure the loop header has exactly two entries
8716 2) Make sure we have a preheader basic block. */
8718 gcc_assert (EDGE_COUNT (loop->header->preds) == 2);
8720 split_edge (loop_preheader_edge (loop));
8722 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
8723 && vect_use_loop_mask_for_alignment_p (loop_vinfo))
8724 /* This will deal with any possible peeling. */
8725 vect_prepare_for_masked_peels (loop_vinfo);
8727 /* FORNOW: the vectorizer supports only loops whose body consists
8728 of one basic block (header + empty latch). When the vectorizer
8729 supports more involved loop forms, the order in which the BBs are
8730 traversed needs to be reconsidered. */
8732 for (i = 0; i < nbbs; i++)
8734 basic_block bb = bbs[i];
8735 stmt_vec_info stmt_info;
8737 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
8738 gsi_next (&si))
8740 gphi *phi = si.phi ();
8741 if (dump_enabled_p ())
8743 dump_printf_loc (MSG_NOTE, vect_location,
8744 "------>vectorizing phi: ");
8745 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
8747 stmt_info = vinfo_for_stmt (phi);
8748 if (!stmt_info)
8749 continue;
8751 if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
8752 vect_loop_kill_debug_uses (loop, phi);
8754 if (!STMT_VINFO_RELEVANT_P (stmt_info)
8755 && !STMT_VINFO_LIVE_P (stmt_info))
8756 continue;
8758 if (STMT_VINFO_VECTYPE (stmt_info)
8759 && (maybe_ne
8760 (TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info)), vf))
8761 && dump_enabled_p ())
8762 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
8764 if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
8765 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
8766 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
8767 && ! PURE_SLP_STMT (stmt_info))
8769 if (dump_enabled_p ())
8770 dump_printf_loc (MSG_NOTE, vect_location, "transform phi.\n");
8771 vect_transform_stmt (phi, NULL, NULL, NULL, NULL);
8775 pattern_stmt = NULL;
8776 for (gimple_stmt_iterator si = gsi_start_bb (bb);
8777 !gsi_end_p (si) || transform_pattern_stmt;)
8779 bool is_store;
8781 if (transform_pattern_stmt)
8782 stmt = pattern_stmt;
8783 else
8785 stmt = gsi_stmt (si);
8786 /* During vectorization remove existing clobber stmts. */
8787 if (gimple_clobber_p (stmt))
8789 unlink_stmt_vdef (stmt);
8790 gsi_remove (&si, true);
8791 release_defs (stmt);
8792 continue;
8796 if (dump_enabled_p ())
8798 dump_printf_loc (MSG_NOTE, vect_location,
8799 "------>vectorizing statement: ");
8800 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
8803 stmt_info = vinfo_for_stmt (stmt);
8805 /* vector stmts created in the outer-loop during vectorization of
8806 stmts in an inner-loop may not have a stmt_info, and do not
8807 need to be vectorized. */
8808 if (!stmt_info)
8810 gsi_next (&si);
8811 continue;
8814 if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
8815 vect_loop_kill_debug_uses (loop, stmt);
8817 if (!STMT_VINFO_RELEVANT_P (stmt_info)
8818 && !STMT_VINFO_LIVE_P (stmt_info))
8820 if (STMT_VINFO_IN_PATTERN_P (stmt_info)
8821 && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
8822 && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
8823 || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
8825 stmt = pattern_stmt;
8826 stmt_info = vinfo_for_stmt (stmt);
8828 else
8830 gsi_next (&si);
8831 continue;
8834 else if (STMT_VINFO_IN_PATTERN_P (stmt_info)
8835 && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
8836 && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
8837 || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
8838 transform_pattern_stmt = true;
8840 /* If pattern statement has def stmts, vectorize them too. */
8841 if (is_pattern_stmt_p (stmt_info))
8843 if (pattern_def_seq == NULL)
8845 pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
8846 pattern_def_si = gsi_start (pattern_def_seq);
8848 else if (!gsi_end_p (pattern_def_si))
8849 gsi_next (&pattern_def_si);
8850 if (pattern_def_seq != NULL)
8852 gimple *pattern_def_stmt = NULL;
8853 stmt_vec_info pattern_def_stmt_info = NULL;
8855 while (!gsi_end_p (pattern_def_si))
8857 pattern_def_stmt = gsi_stmt (pattern_def_si);
8858 pattern_def_stmt_info
8859 = vinfo_for_stmt (pattern_def_stmt);
8860 if (STMT_VINFO_RELEVANT_P (pattern_def_stmt_info)
8861 || STMT_VINFO_LIVE_P (pattern_def_stmt_info))
8862 break;
8863 gsi_next (&pattern_def_si);
8866 if (!gsi_end_p (pattern_def_si))
8868 if (dump_enabled_p ())
8870 dump_printf_loc (MSG_NOTE, vect_location,
8871 "==> vectorizing pattern def "
8872 "stmt: ");
8873 dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
8874 pattern_def_stmt, 0);
8877 stmt = pattern_def_stmt;
8878 stmt_info = pattern_def_stmt_info;
8880 else
8882 pattern_def_si = gsi_none ();
8883 transform_pattern_stmt = false;
8886 else
8887 transform_pattern_stmt = false;
8890 if (STMT_VINFO_VECTYPE (stmt_info))
8892 poly_uint64 nunits
8893 = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
8894 if (!STMT_SLP_TYPE (stmt_info)
8895 && maybe_ne (nunits, vf)
8896 && dump_enabled_p ())
8897 /* For SLP, VF is set according to the unrolling factor, not the
8898 vector size, hence this message is not meaningful for SLP. */
8899 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
8902 /* SLP. Schedule all the SLP instances when the first SLP stmt is
8903 reached. */
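/* (vect_schedule_slp emits vector code for all SLP instances in the
   loop at once; pure-SLP statements encountered afterwards are skipped
   below, while hybrid SLP statements fall through and are additionally
   vectorized as ordinary loop statements.)  */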
8904 if (STMT_SLP_TYPE (stmt_info))
8906 if (!slp_scheduled)
8908 slp_scheduled = true;
8910 if (dump_enabled_p ())
8911 dump_printf_loc (MSG_NOTE, vect_location,
8912 "=== scheduling SLP instances ===\n");
8914 vect_schedule_slp (loop_vinfo);
8917 /* Hybrid SLP stmts must be vectorized in addition to SLP. */
8918 if (!vinfo_for_stmt (stmt) || PURE_SLP_STMT (stmt_info))
8920 if (!transform_pattern_stmt && gsi_end_p (pattern_def_si))
8922 pattern_def_seq = NULL;
8923 gsi_next (&si);
8925 continue;
8929 /* -------- vectorize statement ------------ */
8930 if (dump_enabled_p ())
8931 dump_printf_loc (MSG_NOTE, vect_location, "transform statement.\n");
8933 grouped_store = false;
8934 is_store = vect_transform_stmt (stmt, &si, &grouped_store, NULL, NULL);
8935 if (is_store)
8937 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
8939 /* Interleaving. If IS_STORE is TRUE, the vectorization of the
8940 interleaving chain was completed - free all the stores in
8941 the chain. */
8942 gsi_next (&si);
8943 vect_remove_stores (GROUP_FIRST_ELEMENT (stmt_info));
8945 else
8947 /* Free the attached stmt_vec_info and remove the stmt. */
8948 gimple *store = gsi_stmt (si);
8949 free_stmt_vec_info (store);
8950 unlink_stmt_vdef (store);
8951 gsi_remove (&si, true);
8952 release_defs (store);
8955 /* Stores can only appear at the end of pattern statements. */
8956 gcc_assert (!transform_pattern_stmt);
8957 pattern_def_seq = NULL;
8959 else if (!transform_pattern_stmt && gsi_end_p (pattern_def_si))
8961 pattern_def_seq = NULL;
8962 gsi_next (&si);
8964 } /* stmts in BB */
8966 /* Stub out scalar statements that must not survive vectorization.
8967 Doing this here helps with grouped statements, or statements that
8968 are involved in patterns. */
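/* For example, a scalar .MASK_LOAD call left over from a pattern or
   group, say
       x_1 = .MASK_LOAD (ptr_2, align, mask_3);
   where x_1 is not of vector type, is replaced below by
       x_1 = 0;
   (x_1, ptr_2 and mask_3 are purely illustrative names).  */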
8969 for (gimple_stmt_iterator gsi = gsi_start_bb (bb);
8970 !gsi_end_p (gsi); gsi_next (&gsi))
8972 gcall *call = dyn_cast <gcall *> (gsi_stmt (gsi));
8973 if (call && gimple_call_internal_p (call, IFN_MASK_LOAD))
8975 tree lhs = gimple_get_lhs (call);
8976 if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
8978 tree zero = build_zero_cst (TREE_TYPE (lhs));
8979 gimple *new_stmt = gimple_build_assign (lhs, zero);
8980 gsi_replace (&gsi, new_stmt, true);
8984 } /* BBs in loop */
8986 /* The vectorization factor is always > 1, so if we use an IV increment of 1,
8987 a zero NITERS becomes a nonzero NITERS_VECTOR. */
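/* E.g. with a 32-bit IV, a NITERS of zero (standing for 2^32 scalar
   iterations) and a VF of 4 give a NITERS_VECTOR of 2^30; NITERS_VECTOR
   is therefore known to be nonzero and the exit test need not guard
   against a zero trip count.  */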
8988 if (integer_onep (step_vector))
8989 niters_no_overflow = true;
8990 vect_set_loop_condition (loop, loop_vinfo, niters_vector, step_vector,
8991 niters_vector_mult_vf, !niters_no_overflow);
8993 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
8994 scale_profile_for_vect_loop (loop, assumed_vf);
8996 /* True if the final iteration might not handle a full vector's
8997 worth of scalar iterations. */
8998 bool final_iter_may_be_partial = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
8999 /* The minimum number of iterations performed by the epilogue. This
9000 is 1 when peeling for gaps because we always need a final scalar
9001 iteration. */
9002 int min_epilogue_iters = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0;
9003 /* +1 to convert latch counts to loop iteration counts,
9004 -min_epilogue_iters to remove iterations that cannot be performed
9005 by the vector code. */
9006 int bias_for_lowest = 1 - min_epilogue_iters;
9007 int bias_for_assumed = bias_for_lowest;
9008 int alignment_npeels = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
9009 if (alignment_npeels && LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
9011 /* When the amount of peeling is known at compile time, the first
9012 iteration will have exactly alignment_npeels active elements.
9013 In the worst case it will have at least one. */
9014 int min_first_active = (alignment_npeels > 0 ? alignment_npeels : 1);
9015 bias_for_lowest += lowest_vf - min_first_active;
9016 bias_for_assumed += assumed_vf - min_first_active;
9018 /* In these calculations the "- 1" converts loop iteration counts
9019 back to latch counts. */
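/* As a worked example, assuming no peeling for gaps and no full
   masking: bias_for_lowest is 1, so with lowest_vf == 4 a scalar
   latch count of 10 (11 scalar iterations) becomes
       udiv_floor (10 + 1, 4) - 1 = 1,
   i.e. the vector loop executes its latch at most once (two vector
   iterations covering 8 scalar iterations; the remainder is left to
   the epilogue).  */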
9020 if (loop->any_upper_bound)
9021 loop->nb_iterations_upper_bound
9022 = (final_iter_may_be_partial
9023 ? wi::udiv_ceil (loop->nb_iterations_upper_bound + bias_for_lowest,
9024 lowest_vf) - 1
9025 : wi::udiv_floor (loop->nb_iterations_upper_bound + bias_for_lowest,
9026 lowest_vf) - 1);
9027 if (loop->any_likely_upper_bound)
9028 loop->nb_iterations_likely_upper_bound
9029 = (final_iter_may_be_partial
9030 ? wi::udiv_ceil (loop->nb_iterations_likely_upper_bound
9031 + bias_for_lowest, lowest_vf) - 1
9032 : wi::udiv_floor (loop->nb_iterations_likely_upper_bound
9033 + bias_for_lowest, lowest_vf) - 1);
9034 if (loop->any_estimate)
9035 loop->nb_iterations_estimate
9036 = (final_iter_may_be_partial
9037 ? wi::udiv_ceil (loop->nb_iterations_estimate + bias_for_assumed,
9038 assumed_vf) - 1
9039 : wi::udiv_floor (loop->nb_iterations_estimate + bias_for_assumed,
9040 assumed_vf) - 1);
9042 if (dump_enabled_p ())
9044 if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
9046 dump_printf_loc (MSG_NOTE, vect_location,
9047 "LOOP VECTORIZED\n");
9048 if (loop->inner)
9049 dump_printf_loc (MSG_NOTE, vect_location,
9050 "OUTER LOOP VECTORIZED\n");
9051 dump_printf (MSG_NOTE, "\n");
9053 else
9055 dump_printf_loc (MSG_NOTE, vect_location,
9056 "LOOP EPILOGUE VECTORIZED (VS=");
9057 dump_dec (MSG_NOTE, current_vector_size);
9058 dump_printf (MSG_NOTE, ")\n");
9062 /* Free SLP instances here because otherwise stmt reference counting
9063 won't work. */
9064 slp_instance instance;
9065 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
9066 vect_free_slp_instance (instance);
9067 LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
9068 /* Clear the safelen field since its value is invalid after vectorization:
9069 the vectorized loop can have loop-carried dependencies. */
9070 loop->safelen = 0;
9072 /* Don't vectorize the epilogue of an epilogue loop. */
9073 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
9074 epilogue = NULL;
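/* Likewise when epilogue vectorization has been disabled with
   --param vect-epilogues-nomask=0.  */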
9076 if (!PARAM_VALUE (PARAM_VECT_EPILOGUES_NOMASK))
9077 epilogue = NULL;
9079 if (epilogue)
9081 auto_vector_sizes vector_sizes;
9082 targetm.vectorize.autovectorize_vector_sizes (&vector_sizes);
9083 unsigned int next_size = 0;
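/* Pick the vector size to try for the epilogue: if the number of
   epilogue iterations is known, record its upper bound and look for
   the first (smaller) vector size that can still cover that many
   iterations; otherwise simply look for the first size no larger
   than the current vector size.  */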
9085 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
9086 && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) >= 0
9087 && known_eq (vf, lowest_vf))
9089 unsigned int eiters
9090 = (LOOP_VINFO_INT_NITERS (loop_vinfo)
9091 - LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo));
9092 eiters = eiters % lowest_vf;
9093 epilogue->nb_iterations_upper_bound = eiters - 1;
9095 unsigned int ratio;
9096 while (next_size < vector_sizes.length ()
9097 && !(constant_multiple_p (current_vector_size,
9098 vector_sizes[next_size], &ratio)
9099 && eiters >= lowest_vf / ratio))
9100 next_size += 1;
9102 else
9103 while (next_size < vector_sizes.length ()
9104 && maybe_lt (current_vector_size, vector_sizes[next_size]))
9105 next_size += 1;
9107 if (next_size == vector_sizes.length ())
9108 epilogue = NULL;
9111 if (epilogue)
9113 epilogue->force_vectorize = loop->force_vectorize;
9114 epilogue->safelen = loop->safelen;
9115 epilogue->dont_vectorize = false;
9117 /* We may need to if-convert the epilogue in order to vectorize it. */
9118 if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))
9119 tree_if_conversion (epilogue);
9122 return epilogue;
9125 /* The code below performs a simple optimization - it reverts
9126 if-conversion for masked stores: if the mask of a store is zero, the
9127 store is skipped, and so are the producers of the stored values where possible.
9128 For example,
9129 for (i=0; i<n; i++)
9130 if (c[i])
9132 p1[i] += 1;
9133 p2[i] = p3[i] + 2;
9135 this transformation will produce the following semi-hammock:
9137 if (!mask__ifc__42.18_165 == { 0, 0, 0, 0, 0, 0, 0, 0 })
9139 vect__11.19_170 = MASK_LOAD (vectp_p1.20_168, 0B, mask__ifc__42.18_165);
9140 vect__12.22_172 = vect__11.19_170 + vect_cst__171;
9141 MASK_STORE (vectp_p1.23_175, 0B, mask__ifc__42.18_165, vect__12.22_172);
9142 vect__18.25_182 = MASK_LOAD (vectp_p3.26_180, 0B, mask__ifc__42.18_165);
9143 vect__19.28_184 = vect__18.25_182 + vect_cst__183;
9144 MASK_STORE (vectp_p2.29_187, 0B, mask__ifc__42.18_165, vect__19.28_184);
9148 void
9149 optimize_mask_stores (struct loop *loop)
9151 basic_block *bbs = get_loop_body (loop);
9152 unsigned nbbs = loop->num_nodes;
9153 unsigned i;
9154 basic_block bb;
9155 struct loop *bb_loop;
9156 gimple_stmt_iterator gsi;
9157 gimple *stmt;
9158 auto_vec<gimple *> worklist;
9160 vect_location = find_loop_location (loop);
9161 /* Collect all masked stores in the loop, if any. */
9162 for (i = 0; i < nbbs; i++)
9164 bb = bbs[i];
9165 for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
9166 gsi_next (&gsi))
9168 stmt = gsi_stmt (gsi);
9169 if (gimple_call_internal_p (stmt, IFN_MASK_STORE))
9170 worklist.safe_push (stmt);
9174 free (bbs);
9175 if (worklist.is_empty ())
9176 return;
9178 /* Loop has masked stores. */
9179 while (!worklist.is_empty ())
9181 gimple *last, *last_store;
9182 edge e, efalse;
9183 tree mask;
9184 basic_block store_bb, join_bb;
9185 gimple_stmt_iterator gsi_to;
9186 tree vdef, new_vdef;
9187 gphi *phi;
9188 tree vectype;
9189 tree zero;
9191 last = worklist.pop ();
9192 mask = gimple_call_arg (last, 2);
9193 bb = gimple_bb (last);
9194 /* Create then_bb and an if-then structure in the CFG; then_bb belongs
9195 to the same loop as if_bb. This loop can be different from LOOP when a
9196 two-level loop nest is vectorized and the mask_store belongs to the
9197 inner loop. */
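/* Roughly, the CFG built below has the shape:

         bb  (ends with:  if (mask == {0, ...}))
         |  \
         |   store_bb   <-- the masked stores are sunk here
         |  /
       join_bb

   where the direct edge from bb to join_bb is taken when the mask is
   all-zero, bypassing the stores.  */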
9198 e = split_block (bb, last);
9199 bb_loop = bb->loop_father;
9200 gcc_assert (loop == bb_loop || flow_loop_nested_p (loop, bb_loop));
9201 join_bb = e->dest;
9202 store_bb = create_empty_bb (bb);
9203 add_bb_to_loop (store_bb, bb_loop);
9204 e->flags = EDGE_TRUE_VALUE;
9205 efalse = make_edge (bb, store_bb, EDGE_FALSE_VALUE);
9206 /* Mark the edge into STORE_BB as unlikely. */
9207 efalse->probability = profile_probability::unlikely ();
9208 store_bb->count = efalse->count ();
9209 make_single_succ_edge (store_bb, join_bb, EDGE_FALLTHRU);
9210 if (dom_info_available_p (CDI_DOMINATORS))
9211 set_immediate_dominator (CDI_DOMINATORS, store_bb, bb);
9212 if (dump_enabled_p ())
9213 dump_printf_loc (MSG_NOTE, vect_location,
9214 "Create new block %d to sink mask stores.",
9215 store_bb->index);
9216 /* Create vector comparison with boolean result. */
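/* (MASK is compared against an all-zero vector of its own type; the
   resulting condition terminates BB and guards the branch around
   STORE_BB.)  */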
9217 vectype = TREE_TYPE (mask);
9218 zero = build_zero_cst (vectype);
9219 stmt = gimple_build_cond (EQ_EXPR, mask, zero, NULL_TREE, NULL_TREE);
9220 gsi = gsi_last_bb (bb);
9221 gsi_insert_after (&gsi, stmt, GSI_SAME_STMT);
9222 /* Create a new PHI node for the vdef of the last masked store:
9223 .MEM_2 = VDEF <.MEM_1>
9224 will be converted to
9225 .MEM_3 = VDEF <.MEM_1>
9226 and a new PHI node will be created in the join bb:
9227 .MEM_2 = PHI <.MEM_1, .MEM_3>
9229 vdef = gimple_vdef (last);
9230 new_vdef = make_ssa_name (gimple_vop (cfun), last);
9231 gimple_set_vdef (last, new_vdef);
9232 phi = create_phi_node (vdef, join_bb);
9233 add_phi_arg (phi, new_vdef, EDGE_SUCC (store_bb, 0), UNKNOWN_LOCATION);
9235 /* Put all masked stores with the same mask to STORE_BB if possible. */
9236 while (true)
9238 gimple_stmt_iterator gsi_from;
9239 gimple *stmt1 = NULL;
9241 /* Move masked store to STORE_BB. */
9242 last_store = last;
9243 gsi = gsi_for_stmt (last);
9244 gsi_from = gsi;
9245 /* Shift GSI to the previous stmt for further traversal. */
9246 gsi_prev (&gsi);
9247 gsi_to = gsi_start_bb (store_bb);
9248 gsi_move_before (&gsi_from, &gsi_to);
9250 /* Set GSI_TO to the start of the now non-empty block. */
9250 gsi_to = gsi_start_bb (store_bb);
9251 if (dump_enabled_p ())
9253 dump_printf_loc (MSG_NOTE, vect_location,
9254 "Move stmt to created bb\n");
9255 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, last, 0);
9257 /* Move all stored value producers if possible. */
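/* Walking backwards from the store: a producer is moved only if it
   does not write memory, has no volatile operands, defines an SSA
   name (dead non-vector definitions are simply deleted), has no uses
   outside STORE_BB, and shares the store's VUSE (if it has one).  */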
9258 while (!gsi_end_p (gsi))
9260 tree lhs;
9261 imm_use_iterator imm_iter;
9262 use_operand_p use_p;
9263 bool res;
9265 /* Skip debug statements. */
9266 if (is_gimple_debug (gsi_stmt (gsi)))
9268 gsi_prev (&gsi);
9269 continue;
9271 stmt1 = gsi_stmt (gsi);
9272 /* Do not consider statements writing to memory or having a
9273 volatile operand. */
9274 if (gimple_vdef (stmt1)
9275 || gimple_has_volatile_ops (stmt1))
9276 break;
9277 gsi_from = gsi;
9278 gsi_prev (&gsi);
9279 lhs = gimple_get_lhs (stmt1);
9280 if (!lhs)
9281 break;
9283 /* LHS of vectorized stmt must be SSA_NAME. */
9284 if (TREE_CODE (lhs) != SSA_NAME)
9285 break;
9287 if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
9289 /* Remove dead scalar statement. */
9290 if (has_zero_uses (lhs))
9292 gsi_remove (&gsi_from, true);
9293 continue;
9297 /* Check that LHS does not have uses outside of STORE_BB. */
9298 res = true;
9299 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
9301 gimple *use_stmt;
9302 use_stmt = USE_STMT (use_p);
9303 if (is_gimple_debug (use_stmt))
9304 continue;
9305 if (gimple_bb (use_stmt) != store_bb)
9307 res = false;
9308 break;
9311 if (!res)
9312 break;
9314 if (gimple_vuse (stmt1)
9315 && gimple_vuse (stmt1) != gimple_vuse (last_store))
9316 break;
9318 /* Can move STMT1 to STORE_BB. */
9319 if (dump_enabled_p ())
9321 dump_printf_loc (MSG_NOTE, vect_location,
9322 "Move stmt to created bb\n");
9323 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt1, 0);
9325 gsi_move_before (&gsi_from, &gsi_to);
9326 /* Shift GSI_TO for further insertion. */
9327 gsi_prev (&gsi_to);
9329 /* Put other masked stores with the same mask to STORE_BB. */
9330 if (worklist.is_empty ()
9331 || gimple_call_arg (worklist.last (), 2) != mask
9332 || worklist.last () != stmt1)
9333 break;
9334 last = worklist.pop ();
9336 add_phi_arg (phi, gimple_vuse (last_store), e, UNKNOWN_LOCATION);