[official-gcc.git] / gcc / tree-vect-loop.c
1 /* Loop Vectorization
2 Copyright (C) 2003-2018 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com> and
4 Ira Rosen <irar@il.ibm.com>
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "rtl.h"
28 #include "tree.h"
29 #include "gimple.h"
30 #include "cfghooks.h"
31 #include "tree-pass.h"
32 #include "ssa.h"
33 #include "optabs-tree.h"
34 #include "diagnostic-core.h"
35 #include "fold-const.h"
36 #include "stor-layout.h"
37 #include "cfganal.h"
38 #include "gimplify.h"
39 #include "gimple-iterator.h"
40 #include "gimplify-me.h"
41 #include "tree-ssa-loop-ivopts.h"
42 #include "tree-ssa-loop-manip.h"
43 #include "tree-ssa-loop-niter.h"
44 #include "tree-ssa-loop.h"
45 #include "cfgloop.h"
46 #include "params.h"
47 #include "tree-scalar-evolution.h"
48 #include "tree-vectorizer.h"
49 #include "gimple-fold.h"
50 #include "cgraph.h"
51 #include "tree-cfg.h"
52 #include "tree-if-conv.h"
53 #include "internal-fn.h"
54 #include "tree-vector-builder.h"
55 #include "vec-perm-indices.h"
56 #include "tree-eh.h"
58 /* Loop Vectorization Pass.
60 This pass tries to vectorize loops.
62 For example, the vectorizer transforms the following simple loop:
64 short a[N]; short b[N]; short c[N]; int i;
66 for (i=0; i<N; i++){
67 a[i] = b[i] + c[i];
70 as if it was manually vectorized by rewriting the source code into:
72 typedef int __attribute__((mode(V8HI))) v8hi;
73 short a[N]; short b[N]; short c[N]; int i;
74 v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
75 v8hi va, vb, vc;
77 for (i=0; i<N/8; i++){
78 vb = pb[i];
79 vc = pc[i];
80 va = vb + vc;
81 pa[i] = va;
84 The main entry to this pass is vectorize_loops(), in which
85 the vectorizer applies a set of analyses on a given set of loops,
86 followed by the actual vectorization transformation for the loops that
87 had successfully passed the analysis phase.
88 Throughout this pass we make a distinction between two types of
89 data: scalars (which are represented by SSA_NAMES), and memory references
90 ("data-refs"). These two types of data require different handling both
91 during analysis and transformation. The types of data-refs that the
92 vectorizer currently supports are ARRAY_REFs whose base is an array DECL
93 (not a pointer), and INDIRECT_REFS through pointers; both array and pointer
94 accesses are required to have a simple (consecutive) access pattern.
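   For example (an illustration of the restriction above, not an exhaustive
   list): the access a[i] in

     for (i=0; i<N; i++)
       sum += a[i];

   has the required consecutive pattern, whereas an access whose stride is
   not 1, such as a[2*i], does not.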
96 Analysis phase:
97 ===============
98 The driver for the analysis phase is vect_analyze_loop().
99 It applies a set of analyses, some of which rely on the scalar evolution
100 analyzer (scev) developed by Sebastian Pop.
102 During the analysis phase the vectorizer records some information
103 per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
104 loop, as well as general information about the loop as a whole, which is
105 recorded in a "loop_vec_info" struct attached to each loop.
107 Transformation phase:
108 =====================
109 The loop transformation phase scans all the stmts in the loop, and
110 creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
111 the loop that needs to be vectorized. It inserts the vector code sequence
112 just before the scalar stmt S, and records a pointer to the vector code
113 in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
114 attached to S). This pointer will be used for the vectorization of following
115 stmts which use the def of stmt S. Stmt S is removed if it writes to memory;
116 otherwise, we rely on dead code elimination for removing it.
118 For example, say stmt S1 was vectorized into stmt VS1:
120 VS1: vb = px[i];
121 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
122 S2: a = b;
124 To vectorize stmt S2, the vectorizer first finds the stmt that defines
125 the operand 'b' (S1), and gets the relevant vector def 'vb' from the
126 vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)). The
127 resulting sequence would be:
129 VS1: vb = px[i];
130 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
131 VS2: va = vb;
132 S2: a = b; STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2
134 Operands that are not SSA_NAMEs are data-refs that appear in
135 load/store operations (like 'x[i]' in S1), and are handled differently.
137 Target modeling:
138 =================
139 Currently the only target specific information that is used is the
140 size of the vector (in bytes) - "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".
141 Targets that can support different vector sizes will, for now, need
142 to specify one value for "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD". More
143 flexibility will be added in the future.
145 Since we only vectorize operations whose vector form can be
146 expressed using existing tree codes, to verify that an operation is
147 supported, the vectorizer checks the relevant optab at the relevant
148 machine_mode (e.g, optab_handler (add_optab, V8HImode)). If
149 the value found is CODE_FOR_nothing, then there's no target support, and
150 we can't vectorize the stmt.
152 For additional information on this project see:
153 http://gcc.gnu.org/projects/tree-ssa/vectorization.html
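   For illustration (a hedged sketch, not part of the pass itself): the
   target-support check described above reduces to an optab query such as

     if (optab_handler (add_optab, V8HImode) == CODE_FOR_nothing)
       return false;

   i.e. if the lookup yields CODE_FOR_nothing there is no target support
   and the statement cannot be vectorized; add_optab and V8HImode stand in
   for the operation and vector mode of the statement being analyzed.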
156 static void vect_estimate_min_profitable_iters (loop_vec_info, int *, int *);
158 /* Function vect_determine_vectorization_factor
160 Determine the vectorization factor (VF). VF is the number of data elements
161 that are operated upon in parallel in a single iteration of the vectorized
162 loop. For example, when vectorizing a loop that operates on 4-byte elements,
163 on a target with a vector size (VS) of 16 bytes, the VF is set to 4, since 4
164 elements can fit in a single vector register.
166 We currently support vectorization of loops in which all types operated upon
167 are of the same size. Therefore this function currently sets VF according to
168 the size of the types operated upon, and fails if there are multiple sizes
169 in the loop.
171 VF is also the factor by which the loop iterations are strip-mined, e.g.:
172 original loop:
173 for (i=0; i<N; i++){
174 a[i] = b[i] + c[i];
177 vectorized loop:
178 for (i=0; i<N; i+=VF){
179 a[i:VF] = b[i:VF] + c[i:VF];
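   As a minimal sketch of the arithmetic described above (illustrative
   only; the real code below works on poly_uint64 values and vector types):

     unsigned vf = vector_size_in_bytes / element_size_in_bytes;

   e.g. 16-byte vectors operating on 4-byte elements give VF = 4, and the
   strip-mined loop then advances its induction variable by VF each
   iteration.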
183 static bool
184 vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
186 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
187 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
188 unsigned nbbs = loop->num_nodes;
189 poly_uint64 vectorization_factor = 1;
190 tree scalar_type = NULL_TREE;
191 gphi *phi;
192 tree vectype;
193 stmt_vec_info stmt_info;
194 unsigned i;
195 HOST_WIDE_INT dummy;
196 gimple *stmt, *pattern_stmt = NULL;
197 gimple_seq pattern_def_seq = NULL;
198 gimple_stmt_iterator pattern_def_si = gsi_none ();
199 bool analyze_pattern_stmt = false;
200 bool bool_result;
201 auto_vec<stmt_vec_info> mask_producers;
203 if (dump_enabled_p ())
204 dump_printf_loc (MSG_NOTE, vect_location,
205 "=== vect_determine_vectorization_factor ===\n");
207 for (i = 0; i < nbbs; i++)
209 basic_block bb = bbs[i];
211 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
212 gsi_next (&si))
214 phi = si.phi ();
215 stmt_info = vinfo_for_stmt (phi);
216 if (dump_enabled_p ())
218 dump_printf_loc (MSG_NOTE, vect_location, "==> examining phi: ");
219 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
222 gcc_assert (stmt_info);
224 if (STMT_VINFO_RELEVANT_P (stmt_info)
225 || STMT_VINFO_LIVE_P (stmt_info))
227 gcc_assert (!STMT_VINFO_VECTYPE (stmt_info));
228 scalar_type = TREE_TYPE (PHI_RESULT (phi));
230 if (dump_enabled_p ())
232 dump_printf_loc (MSG_NOTE, vect_location,
233 "get vectype for scalar type: ");
234 dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
235 dump_printf (MSG_NOTE, "\n");
238 vectype = get_vectype_for_scalar_type (scalar_type);
239 if (!vectype)
241 if (dump_enabled_p ())
243 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
244 "not vectorized: unsupported "
245 "data-type ");
246 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
247 scalar_type);
248 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
250 return false;
252 STMT_VINFO_VECTYPE (stmt_info) = vectype;
254 if (dump_enabled_p ())
256 dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
257 dump_generic_expr (MSG_NOTE, TDF_SLIM, vectype);
258 dump_printf (MSG_NOTE, "\n");
261 if (dump_enabled_p ())
263 dump_printf_loc (MSG_NOTE, vect_location, "nunits = ");
264 dump_dec (MSG_NOTE, TYPE_VECTOR_SUBPARTS (vectype));
265 dump_printf (MSG_NOTE, "\n");
268 vect_update_max_nunits (&vectorization_factor, vectype);
272 for (gimple_stmt_iterator si = gsi_start_bb (bb);
273 !gsi_end_p (si) || analyze_pattern_stmt;)
275 tree vf_vectype;
277 if (analyze_pattern_stmt)
278 stmt = pattern_stmt;
279 else
280 stmt = gsi_stmt (si);
282 stmt_info = vinfo_for_stmt (stmt);
284 if (dump_enabled_p ())
286 dump_printf_loc (MSG_NOTE, vect_location,
287 "==> examining statement: ");
288 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
291 gcc_assert (stmt_info);
293 /* Skip stmts which do not need to be vectorized. */
294 if ((!STMT_VINFO_RELEVANT_P (stmt_info)
295 && !STMT_VINFO_LIVE_P (stmt_info))
296 || gimple_clobber_p (stmt))
298 if (STMT_VINFO_IN_PATTERN_P (stmt_info)
299 && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
300 && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
301 || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
303 stmt = pattern_stmt;
304 stmt_info = vinfo_for_stmt (pattern_stmt);
305 if (dump_enabled_p ())
307 dump_printf_loc (MSG_NOTE, vect_location,
308 "==> examining pattern statement: ");
309 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
312 else
314 if (dump_enabled_p ())
315 dump_printf_loc (MSG_NOTE, vect_location, "skip.\n");
316 gsi_next (&si);
317 continue;
320 else if (STMT_VINFO_IN_PATTERN_P (stmt_info)
321 && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
322 && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
323 || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
324 analyze_pattern_stmt = true;
326 /* If a pattern statement has def stmts, analyze them too. */
327 if (is_pattern_stmt_p (stmt_info))
329 if (pattern_def_seq == NULL)
331 pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
332 pattern_def_si = gsi_start (pattern_def_seq);
334 else if (!gsi_end_p (pattern_def_si))
335 gsi_next (&pattern_def_si);
336 if (pattern_def_seq != NULL)
338 gimple *pattern_def_stmt = NULL;
339 stmt_vec_info pattern_def_stmt_info = NULL;
341 while (!gsi_end_p (pattern_def_si))
343 pattern_def_stmt = gsi_stmt (pattern_def_si);
344 pattern_def_stmt_info
345 = vinfo_for_stmt (pattern_def_stmt);
346 if (STMT_VINFO_RELEVANT_P (pattern_def_stmt_info)
347 || STMT_VINFO_LIVE_P (pattern_def_stmt_info))
348 break;
349 gsi_next (&pattern_def_si);
352 if (!gsi_end_p (pattern_def_si))
354 if (dump_enabled_p ())
356 dump_printf_loc (MSG_NOTE, vect_location,
357 "==> examining pattern def stmt: ");
358 dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
359 pattern_def_stmt, 0);
362 stmt = pattern_def_stmt;
363 stmt_info = pattern_def_stmt_info;
365 else
367 pattern_def_si = gsi_none ();
368 analyze_pattern_stmt = false;
371 else
372 analyze_pattern_stmt = false;
375 if (gimple_get_lhs (stmt) == NULL_TREE
376 /* MASK_STORE has no lhs, but is ok. */
377 && (!is_gimple_call (stmt)
378 || !gimple_call_internal_p (stmt)
379 || gimple_call_internal_fn (stmt) != IFN_MASK_STORE))
381 if (is_gimple_call (stmt))
383 /* Ignore calls with no lhs. These must be calls to
384 #pragma omp simd functions, and what vectorization factor
385 it really needs can't be determined until
386 vectorizable_simd_clone_call. */
387 if (!analyze_pattern_stmt && gsi_end_p (pattern_def_si))
389 pattern_def_seq = NULL;
390 gsi_next (&si);
392 continue;
394 if (dump_enabled_p ())
396 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
397 "not vectorized: irregular stmt.");
398 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt,
401 return false;
404 if (VECTOR_MODE_P (TYPE_MODE (gimple_expr_type (stmt))))
406 if (dump_enabled_p ())
408 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
409 "not vectorized: vector stmt in loop:");
410 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
412 return false;
415 bool_result = false;
417 if (STMT_VINFO_VECTYPE (stmt_info))
419 /* The only case in which a vectype has already been set is for stmts
420 that contain a dataref, or for "pattern-stmts" (stmts
421 generated by the vectorizer to represent/replace a certain
422 idiom). */
423 gcc_assert (STMT_VINFO_DATA_REF (stmt_info)
424 || is_pattern_stmt_p (stmt_info)
425 || !gsi_end_p (pattern_def_si));
426 vectype = STMT_VINFO_VECTYPE (stmt_info);
428 else
430 gcc_assert (!STMT_VINFO_DATA_REF (stmt_info));
431 if (gimple_call_internal_p (stmt, IFN_MASK_STORE))
432 scalar_type = TREE_TYPE (gimple_call_arg (stmt, 3));
433 else
434 scalar_type = TREE_TYPE (gimple_get_lhs (stmt));
436 /* Bool ops don't participate in vectorization factor
437 computation. For comparisons, use the compared types to
438 compute a factor. */
439 if (VECT_SCALAR_BOOLEAN_TYPE_P (scalar_type)
440 && is_gimple_assign (stmt)
441 && gimple_assign_rhs_code (stmt) != COND_EXPR)
443 if (STMT_VINFO_RELEVANT_P (stmt_info)
444 || STMT_VINFO_LIVE_P (stmt_info))
445 mask_producers.safe_push (stmt_info);
446 bool_result = true;
448 if (TREE_CODE_CLASS (gimple_assign_rhs_code (stmt))
449 == tcc_comparison
450 && !VECT_SCALAR_BOOLEAN_TYPE_P
451 (TREE_TYPE (gimple_assign_rhs1 (stmt))))
452 scalar_type = TREE_TYPE (gimple_assign_rhs1 (stmt));
453 else
455 if (!analyze_pattern_stmt && gsi_end_p (pattern_def_si))
457 pattern_def_seq = NULL;
458 gsi_next (&si);
460 continue;
464 if (dump_enabled_p ())
466 dump_printf_loc (MSG_NOTE, vect_location,
467 "get vectype for scalar type: ");
468 dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
469 dump_printf (MSG_NOTE, "\n");
471 vectype = get_vectype_for_scalar_type (scalar_type);
472 if (!vectype)
474 if (dump_enabled_p ())
476 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
477 "not vectorized: unsupported "
478 "data-type ");
479 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
480 scalar_type);
481 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
483 return false;
486 if (!bool_result)
487 STMT_VINFO_VECTYPE (stmt_info) = vectype;
489 if (dump_enabled_p ())
491 dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
492 dump_generic_expr (MSG_NOTE, TDF_SLIM, vectype);
493 dump_printf (MSG_NOTE, "\n");
497 /* Don't try to compute the VF from scalar types if the stmt
498 produces a boolean vector. Use the result vectype instead. */
499 if (VECTOR_BOOLEAN_TYPE_P (vectype))
500 vf_vectype = vectype;
501 else
503 /* The vectorization factor is determined by the smallest
504 scalar type (or the largest vector size, but we only
505 support one vector size per loop). */
506 if (!bool_result)
507 scalar_type = vect_get_smallest_scalar_type (stmt, &dummy,
508 &dummy);
509 if (dump_enabled_p ())
511 dump_printf_loc (MSG_NOTE, vect_location,
512 "get vectype for scalar type: ");
513 dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
514 dump_printf (MSG_NOTE, "\n");
516 vf_vectype = get_vectype_for_scalar_type (scalar_type);
518 if (!vf_vectype)
520 if (dump_enabled_p ())
522 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
523 "not vectorized: unsupported data-type ");
524 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
525 scalar_type);
526 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
528 return false;
531 if (maybe_ne (GET_MODE_SIZE (TYPE_MODE (vectype)),
532 GET_MODE_SIZE (TYPE_MODE (vf_vectype))))
534 if (dump_enabled_p ())
536 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
537 "not vectorized: different sized vector "
538 "types in statement, ");
539 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
540 vectype);
541 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
542 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
543 vf_vectype);
544 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
546 return false;
549 if (dump_enabled_p ())
551 dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
552 dump_generic_expr (MSG_NOTE, TDF_SLIM, vf_vectype);
553 dump_printf (MSG_NOTE, "\n");
556 if (dump_enabled_p ())
558 dump_printf_loc (MSG_NOTE, vect_location, "nunits = ");
559 dump_dec (MSG_NOTE, TYPE_VECTOR_SUBPARTS (vf_vectype));
560 dump_printf (MSG_NOTE, "\n");
563 vect_update_max_nunits (&vectorization_factor, vf_vectype);
565 if (!analyze_pattern_stmt && gsi_end_p (pattern_def_si))
567 pattern_def_seq = NULL;
568 gsi_next (&si);
573 /* TODO: Analyze cost. Decide if worth while to vectorize. */
574 if (dump_enabled_p ())
576 dump_printf_loc (MSG_NOTE, vect_location, "vectorization factor = ");
577 dump_dec (MSG_NOTE, vectorization_factor);
578 dump_printf (MSG_NOTE, "\n");
581 if (known_le (vectorization_factor, 1U))
583 if (dump_enabled_p ())
584 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
585 "not vectorized: unsupported data-type\n");
586 return false;
588 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
590 for (i = 0; i < mask_producers.length (); i++)
592 tree mask_type = NULL;
594 stmt = STMT_VINFO_STMT (mask_producers[i]);
596 if (is_gimple_assign (stmt)
597 && TREE_CODE_CLASS (gimple_assign_rhs_code (stmt)) == tcc_comparison
598 && !VECT_SCALAR_BOOLEAN_TYPE_P
599 (TREE_TYPE (gimple_assign_rhs1 (stmt))))
601 scalar_type = TREE_TYPE (gimple_assign_rhs1 (stmt));
602 mask_type = get_mask_type_for_scalar_type (scalar_type);
604 if (!mask_type)
606 if (dump_enabled_p ())
607 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
608 "not vectorized: unsupported mask\n");
609 return false;
612 else
614 tree rhs;
615 ssa_op_iter iter;
616 gimple *def_stmt;
617 enum vect_def_type dt;
619 FOR_EACH_SSA_TREE_OPERAND (rhs, stmt, iter, SSA_OP_USE)
621 if (!vect_is_simple_use (rhs, mask_producers[i]->vinfo,
622 &def_stmt, &dt, &vectype))
624 if (dump_enabled_p ())
626 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
627 "not vectorized: can't compute mask type "
628 "for statement, ");
629 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt,
632 return false;
635 /* No vectype probably means external definition.
636 Allow it in case there is another operand which
637 allows us to determine the mask type. */
638 if (!vectype)
639 continue;
641 if (!mask_type)
642 mask_type = vectype;
643 else if (maybe_ne (TYPE_VECTOR_SUBPARTS (mask_type),
644 TYPE_VECTOR_SUBPARTS (vectype)))
646 if (dump_enabled_p ())
648 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
649 "not vectorized: different sized masks "
650 "types in statement, ");
651 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
652 mask_type);
653 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
654 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
655 vectype);
656 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
658 return false;
660 else if (VECTOR_BOOLEAN_TYPE_P (mask_type)
661 != VECTOR_BOOLEAN_TYPE_P (vectype))
663 if (dump_enabled_p ())
665 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
666 "not vectorized: mixed mask and "
667 "nonmask vector types in statement, ");
668 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
669 mask_type);
670 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
671 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
672 vectype);
673 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
675 return false;
679 /* We may compare a boolean value loaded as a vector of integers.
680 Fix mask_type in such a case. */
681 if (mask_type
682 && !VECTOR_BOOLEAN_TYPE_P (mask_type)
683 && gimple_code (stmt) == GIMPLE_ASSIGN
684 && TREE_CODE_CLASS (gimple_assign_rhs_code (stmt)) == tcc_comparison)
685 mask_type = build_same_sized_truth_vector_type (mask_type);
688 /* Having no mask_type should mean a loop-invariant predicate.
689 This is probably a subject for optimization in
690 if-conversion. */
691 if (!mask_type)
693 if (dump_enabled_p ())
695 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
696 "not vectorized: can't compute mask type "
697 "for statement, ");
698 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt,
701 return false;
704 STMT_VINFO_VECTYPE (mask_producers[i]) = mask_type;
707 return true;
711 /* Function vect_is_simple_iv_evolution.
713 FORNOW: A simple evolution of an induction variable in the loop is
714 considered a polynomial evolution. */
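/* For example (an illustrative sketch; the chrec notation is the one used
   by the scalar evolution analyzer):

     for (i = init; i < n; i += step)
       ...

   gives the access function {init, +, step}_loop, a degree-1 polynomial;
   *INIT and *STEP are then set to init and step.  An evolution whose step
   itself varies in the loop (degree >= 2) is rejected as not "simple".  */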
716 static bool
717 vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init,
718 tree * step)
720 tree init_expr;
721 tree step_expr;
722 tree evolution_part = evolution_part_in_loop_num (access_fn, loop_nb);
723 basic_block bb;
725 /* When there is no evolution in this loop, the evolution function
726 is not "simple". */
727 if (evolution_part == NULL_TREE)
728 return false;
730 /* When the evolution is a polynomial of degree >= 2
731 the evolution function is not "simple". */
732 if (tree_is_chrec (evolution_part))
733 return false;
735 step_expr = evolution_part;
736 init_expr = unshare_expr (initial_condition_in_loop_num (access_fn, loop_nb));
738 if (dump_enabled_p ())
740 dump_printf_loc (MSG_NOTE, vect_location, "step: ");
741 dump_generic_expr (MSG_NOTE, TDF_SLIM, step_expr);
742 dump_printf (MSG_NOTE, ", init: ");
743 dump_generic_expr (MSG_NOTE, TDF_SLIM, init_expr);
744 dump_printf (MSG_NOTE, "\n");
747 *init = init_expr;
748 *step = step_expr;
750 if (TREE_CODE (step_expr) != INTEGER_CST
751 && (TREE_CODE (step_expr) != SSA_NAME
752 || ((bb = gimple_bb (SSA_NAME_DEF_STMT (step_expr)))
753 && flow_bb_inside_loop_p (get_loop (cfun, loop_nb), bb))
754 || (!INTEGRAL_TYPE_P (TREE_TYPE (step_expr))
755 && (!SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr))
756 || !flag_associative_math)))
757 && (TREE_CODE (step_expr) != REAL_CST
758 || !flag_associative_math))
760 if (dump_enabled_p ())
761 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
762 "step unknown.\n");
763 return false;
766 return true;
769 /* Function vect_analyze_scalar_cycles_1.
771 Examine the cross iteration def-use cycles of scalar variables
772 in LOOP. LOOP_VINFO represents the loop that is now being
773 considered for vectorization (can be LOOP, or an outer-loop
774 enclosing LOOP). */
776 static void
777 vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, struct loop *loop)
779 basic_block bb = loop->header;
780 tree init, step;
781 auto_vec<gimple *, 64> worklist;
782 gphi_iterator gsi;
783 bool double_reduc;
785 if (dump_enabled_p ())
786 dump_printf_loc (MSG_NOTE, vect_location,
787 "=== vect_analyze_scalar_cycles ===\n");
789 /* First - identify all inductions. Reduction detection assumes that all the
790 inductions have been identified, therefore, this order must not be
791 changed. */
792 for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
794 gphi *phi = gsi.phi ();
795 tree access_fn = NULL;
796 tree def = PHI_RESULT (phi);
797 stmt_vec_info stmt_vinfo = vinfo_for_stmt (phi);
799 if (dump_enabled_p ())
801 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: ");
802 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
805 /* Skip virtual phi's. The data dependences that are associated with
806 virtual defs/uses (i.e., memory accesses) are analyzed elsewhere. */
807 if (virtual_operand_p (def))
808 continue;
810 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_unknown_def_type;
812 /* Analyze the evolution function. */
813 access_fn = analyze_scalar_evolution (loop, def);
814 if (access_fn)
816 STRIP_NOPS (access_fn);
817 if (dump_enabled_p ())
819 dump_printf_loc (MSG_NOTE, vect_location,
820 "Access function of PHI: ");
821 dump_generic_expr (MSG_NOTE, TDF_SLIM, access_fn);
822 dump_printf (MSG_NOTE, "\n");
824 STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
825 = initial_condition_in_loop_num (access_fn, loop->num);
826 STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo)
827 = evolution_part_in_loop_num (access_fn, loop->num);
830 if (!access_fn
831 || !vect_is_simple_iv_evolution (loop->num, access_fn, &init, &step)
832 || (LOOP_VINFO_LOOP (loop_vinfo) != loop
833 && TREE_CODE (step) != INTEGER_CST))
835 worklist.safe_push (phi);
836 continue;
839 gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
840 != NULL_TREE);
841 gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo) != NULL_TREE);
843 if (dump_enabled_p ())
844 dump_printf_loc (MSG_NOTE, vect_location, "Detected induction.\n");
845 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_induction_def;
849 /* Second - identify all reductions and nested cycles. */
850 while (worklist.length () > 0)
852 gimple *phi = worklist.pop ();
853 tree def = PHI_RESULT (phi);
854 stmt_vec_info stmt_vinfo = vinfo_for_stmt (phi);
855 gimple *reduc_stmt;
857 if (dump_enabled_p ())
859 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: ");
860 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
863 gcc_assert (!virtual_operand_p (def)
864 && STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_unknown_def_type);
866 reduc_stmt = vect_force_simple_reduction (loop_vinfo, phi,
867 &double_reduc, false);
868 if (reduc_stmt)
870 if (double_reduc)
872 if (dump_enabled_p ())
873 dump_printf_loc (MSG_NOTE, vect_location,
874 "Detected double reduction.\n");
876 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_double_reduction_def;
877 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
878 vect_double_reduction_def;
880 else
882 if (loop != LOOP_VINFO_LOOP (loop_vinfo))
884 if (dump_enabled_p ())
885 dump_printf_loc (MSG_NOTE, vect_location,
886 "Detected vectorizable nested cycle.\n");
888 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_nested_cycle;
889 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
890 vect_nested_cycle;
892 else
894 if (dump_enabled_p ())
895 dump_printf_loc (MSG_NOTE, vect_location,
896 "Detected reduction.\n");
898 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def;
899 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
900 vect_reduction_def;
901 /* Store the reduction cycles for possible vectorization in
902 loop-aware SLP if it was not detected as a reduction
903 chain. */
904 if (! GROUP_FIRST_ELEMENT (vinfo_for_stmt (reduc_stmt)))
905 LOOP_VINFO_REDUCTIONS (loop_vinfo).safe_push (reduc_stmt);
909 else
910 if (dump_enabled_p ())
911 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
912 "Unknown def-use cycle pattern.\n");
917 /* Function vect_analyze_scalar_cycles.
919 Examine the cross iteration def-use cycles of scalar variables, by
920 analyzing the loop-header PHIs of scalar variables. Classify each
921 cycle as one of the following: invariant, induction, reduction, unknown.
922 We do that for the loop represented by LOOP_VINFO, and also for its
923 inner-loop, if one exists.
924 Examples of scalar cycles:
926 Example1: reduction:
928 loop1:
929 for (i=0; i<N; i++)
930 sum += a[i];
932 Example2: induction:
934 loop2:
935 for (i=0; i<N; i++)
936 a[i] = i; */
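/* Example3: nested cycle (illustrative; only relevant when considering
   an outer loop for vectorization):

     loop3:
     for (i=0; i<N; i++)
       {
         x = 0;
         for (j=0; j<M; j++)
           x = x + b[i][j];
         a[i] = x;
       }

   Seen from the outer loop, the cross-iteration cycle of x in the inner
   loop is classified as a nested cycle.  */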
938 static void
939 vect_analyze_scalar_cycles (loop_vec_info loop_vinfo)
941 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
943 vect_analyze_scalar_cycles_1 (loop_vinfo, loop);
945 /* When vectorizing an outer-loop, the inner-loop is executed sequentially.
946 Reductions in such an inner-loop therefore have different properties than
947 the reductions in the nest that gets vectorized:
948 1. When vectorized, they are executed in the same order as in the original
949 scalar loop, so we can't change the order of computation when
950 vectorizing them.
951 2. FIXME: Inner-loop reductions can be used in the inner-loop, so the
952 current checks are too strict. */
954 if (loop->inner)
955 vect_analyze_scalar_cycles_1 (loop_vinfo, loop->inner);
958 /* Transfer group and reduction information from STMT to its pattern stmt. */
960 static void
961 vect_fixup_reduc_chain (gimple *stmt)
963 gimple *firstp = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt));
964 gimple *stmtp;
965 gcc_assert (!GROUP_FIRST_ELEMENT (vinfo_for_stmt (firstp))
966 && GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)));
967 GROUP_SIZE (vinfo_for_stmt (firstp)) = GROUP_SIZE (vinfo_for_stmt (stmt));
970 stmtp = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt));
971 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmtp)) = firstp;
972 stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmt));
973 if (stmt)
974 GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmtp))
975 = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt));
977 while (stmt);
978 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmtp)) = vect_reduction_def;
981 /* Fixup scalar cycles that now have their stmts detected as patterns. */
983 static void
984 vect_fixup_scalar_cycles_with_patterns (loop_vec_info loop_vinfo)
986 gimple *first;
987 unsigned i;
989 FOR_EACH_VEC_ELT (LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo), i, first)
990 if (STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (first)))
992 gimple *next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (first));
993 while (next)
995 if (! STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (next)))
996 break;
997 next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
999 /* If not all stmts in the chain are patterns, try to handle
1000 the chain without patterns. */
1001 if (! next)
1003 vect_fixup_reduc_chain (first);
1004 LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo)[i]
1005 = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (first));
1010 /* Function vect_get_loop_niters.
1012 Determine how many iterations the loop executes and place the result
1013 in NUMBER_OF_ITERATIONS. Place the number of latch iterations
1014 in NUMBER_OF_ITERATIONSM1. Place the condition under which the
1015 niter information holds in ASSUMPTIONS.
1017 Return the loop exit condition. */
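/* For example (illustrative): for a loop such as

     for (i = 0; i < n; i++)
       ...

   with n known to be positive, NUMBER_OF_ITERATIONSM1 is n - 1 (the
   number of latch executions) and NUMBER_OF_ITERATIONS is n (the number
   of header executions, i.e. latch executions plus one).  */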
1020 static gcond *
1021 vect_get_loop_niters (struct loop *loop, tree *assumptions,
1022 tree *number_of_iterations, tree *number_of_iterationsm1)
1024 edge exit = single_exit (loop);
1025 struct tree_niter_desc niter_desc;
1026 tree niter_assumptions, niter, may_be_zero;
1027 gcond *cond = get_loop_exit_condition (loop);
1029 *assumptions = boolean_true_node;
1030 *number_of_iterationsm1 = chrec_dont_know;
1031 *number_of_iterations = chrec_dont_know;
1032 if (dump_enabled_p ())
1033 dump_printf_loc (MSG_NOTE, vect_location,
1034 "=== get_loop_niters ===\n");
1036 if (!exit)
1037 return cond;
1039 niter = chrec_dont_know;
1040 may_be_zero = NULL_TREE;
1041 niter_assumptions = boolean_true_node;
1042 if (!number_of_iterations_exit_assumptions (loop, exit, &niter_desc, NULL)
1043 || chrec_contains_undetermined (niter_desc.niter))
1044 return cond;
1046 niter_assumptions = niter_desc.assumptions;
1047 may_be_zero = niter_desc.may_be_zero;
1048 niter = niter_desc.niter;
1050 if (may_be_zero && integer_zerop (may_be_zero))
1051 may_be_zero = NULL_TREE;
1053 if (may_be_zero)
1055 if (COMPARISON_CLASS_P (may_be_zero))
1057 /* Try to combine may_be_zero with assumptions; this can simplify
1058 the computation of the niter expression. */
1059 if (niter_assumptions && !integer_nonzerop (niter_assumptions))
1060 niter_assumptions = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
1061 niter_assumptions,
1062 fold_build1 (TRUTH_NOT_EXPR,
1063 boolean_type_node,
1064 may_be_zero));
1065 else
1066 niter = fold_build3 (COND_EXPR, TREE_TYPE (niter), may_be_zero,
1067 build_int_cst (TREE_TYPE (niter), 0),
1068 rewrite_to_non_trapping_overflow (niter));
1070 may_be_zero = NULL_TREE;
1072 else if (integer_nonzerop (may_be_zero))
1074 *number_of_iterationsm1 = build_int_cst (TREE_TYPE (niter), 0);
1075 *number_of_iterations = build_int_cst (TREE_TYPE (niter), 1);
1076 return cond;
1078 else
1079 return cond;
1082 *assumptions = niter_assumptions;
1083 *number_of_iterationsm1 = niter;
1085 /* We want the number of loop header executions which is the number
1086 of latch executions plus one.
1087 ??? For UINT_MAX latch executions this number overflows to zero
1088 for loops like do { n++; } while (n != 0); */
1089 if (niter && !chrec_contains_undetermined (niter))
1090 niter = fold_build2 (PLUS_EXPR, TREE_TYPE (niter), unshare_expr (niter),
1091 build_int_cst (TREE_TYPE (niter), 1));
1092 *number_of_iterations = niter;
1094 return cond;
1097 /* Function bb_in_loop_p
1099 Used as predicate for dfs order traversal of the loop bbs. */
1101 static bool
1102 bb_in_loop_p (const_basic_block bb, const void *data)
1104 const struct loop *const loop = (const struct loop *)data;
1105 if (flow_bb_inside_loop_p (loop, bb))
1106 return true;
1107 return false;
1111 /* Create and initialize a new loop_vec_info struct for LOOP_IN, as well as
1112 stmt_vec_info structs for all the stmts in LOOP_IN. */
1114 _loop_vec_info::_loop_vec_info (struct loop *loop_in)
1115 : vec_info (vec_info::loop, init_cost (loop_in)),
1116 loop (loop_in),
1117 bbs (XCNEWVEC (basic_block, loop->num_nodes)),
1118 num_itersm1 (NULL_TREE),
1119 num_iters (NULL_TREE),
1120 num_iters_unchanged (NULL_TREE),
1121 num_iters_assumptions (NULL_TREE),
1122 th (0),
1123 versioning_threshold (0),
1124 vectorization_factor (0),
1125 max_vectorization_factor (0),
1126 mask_skip_niters (NULL_TREE),
1127 mask_compare_type (NULL_TREE),
1128 unaligned_dr (NULL),
1129 peeling_for_alignment (0),
1130 ptr_mask (0),
1131 ivexpr_map (NULL),
1132 slp_unrolling_factor (1),
1133 single_scalar_iteration_cost (0),
1134 vectorizable (false),
1135 can_fully_mask_p (true),
1136 fully_masked_p (false),
1137 peeling_for_gaps (false),
1138 peeling_for_niter (false),
1139 operands_swapped (false),
1140 no_data_dependencies (false),
1141 has_mask_store (false),
1142 scalar_loop (NULL),
1143 orig_loop_info (NULL)
1145 /* Create/Update stmt_info for all stmts in the loop. */
1146 basic_block *body = get_loop_body (loop);
1147 for (unsigned int i = 0; i < loop->num_nodes; i++)
1149 basic_block bb = body[i];
1150 gimple_stmt_iterator si;
1152 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
1154 gimple *phi = gsi_stmt (si);
1155 gimple_set_uid (phi, 0);
1156 set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, this));
1159 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1161 gimple *stmt = gsi_stmt (si);
1162 gimple_set_uid (stmt, 0);
1163 set_vinfo_for_stmt (stmt, new_stmt_vec_info (stmt, this));
1166 free (body);
1168 /* CHECKME: We want to visit all BBs before their successors (except for
1169 latch blocks, for which this assertion wouldn't hold). In the simple
1170 case of the loop forms we allow, a dfs order of the BBs would be the same
1171 as reversed postorder traversal, so we are safe. */
1173 unsigned int nbbs = dfs_enumerate_from (loop->header, 0, bb_in_loop_p,
1174 bbs, loop->num_nodes, loop);
1175 gcc_assert (nbbs == loop->num_nodes);
1178 /* Free all levels of MASKS. */
1180 void
1181 release_vec_loop_masks (vec_loop_masks *masks)
1183 rgroup_masks *rgm;
1184 unsigned int i;
1185 FOR_EACH_VEC_ELT (*masks, i, rgm)
1186 rgm->masks.release ();
1187 masks->release ();
1190 /* Free all memory used by the _loop_vec_info, as well as all the
1191 stmt_vec_info structs of all the stmts in the loop. */
1193 _loop_vec_info::~_loop_vec_info ()
1195 int nbbs;
1196 gimple_stmt_iterator si;
1197 int j;
1199 nbbs = loop->num_nodes;
1200 for (j = 0; j < nbbs; j++)
1202 basic_block bb = bbs[j];
1203 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
1204 free_stmt_vec_info (gsi_stmt (si));
1206 for (si = gsi_start_bb (bb); !gsi_end_p (si); )
1208 gimple *stmt = gsi_stmt (si);
1210 /* We may have broken canonical form by moving a constant
1211 into RHS1 of a commutative op. Fix such occurrences. */
1212 if (operands_swapped && is_gimple_assign (stmt))
1214 enum tree_code code = gimple_assign_rhs_code (stmt);
1216 if ((code == PLUS_EXPR
1217 || code == POINTER_PLUS_EXPR
1218 || code == MULT_EXPR)
1219 && CONSTANT_CLASS_P (gimple_assign_rhs1 (stmt)))
1220 swap_ssa_operands (stmt,
1221 gimple_assign_rhs1_ptr (stmt),
1222 gimple_assign_rhs2_ptr (stmt));
1223 else if (code == COND_EXPR
1224 && CONSTANT_CLASS_P (gimple_assign_rhs2 (stmt)))
1226 tree cond_expr = gimple_assign_rhs1 (stmt);
1227 enum tree_code cond_code = TREE_CODE (cond_expr);
1229 if (TREE_CODE_CLASS (cond_code) == tcc_comparison)
1231 bool honor_nans = HONOR_NANS (TREE_OPERAND (cond_expr,
1232 0));
1233 cond_code = invert_tree_comparison (cond_code,
1234 honor_nans);
1235 if (cond_code != ERROR_MARK)
1237 TREE_SET_CODE (cond_expr, cond_code);
1238 swap_ssa_operands (stmt,
1239 gimple_assign_rhs2_ptr (stmt),
1240 gimple_assign_rhs3_ptr (stmt));
1246 /* Free stmt_vec_info. */
1247 free_stmt_vec_info (stmt);
1248 gsi_next (&si);
1252 free (bbs);
1254 release_vec_loop_masks (&masks);
1255 delete ivexpr_map;
1257 loop->aux = NULL;
1260 /* Return an invariant or register for EXPR and emit necessary
1261 computations in the LOOP_VINFO loop preheader. */
1263 tree
1264 cse_and_gimplify_to_preheader (loop_vec_info loop_vinfo, tree expr)
1266 if (is_gimple_reg (expr)
1267 || is_gimple_min_invariant (expr))
1268 return expr;
1270 if (! loop_vinfo->ivexpr_map)
1271 loop_vinfo->ivexpr_map = new hash_map<tree_operand_hash, tree>;
1272 tree &cached = loop_vinfo->ivexpr_map->get_or_insert (expr);
1273 if (! cached)
1275 gimple_seq stmts = NULL;
1276 cached = force_gimple_operand (unshare_expr (expr),
1277 &stmts, true, NULL_TREE);
1278 if (stmts)
1280 edge e = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
1281 gsi_insert_seq_on_edge_immediate (e, stmts);
1284 return cached;
1287 /* Return true if we can use CMP_TYPE as the comparison type to produce
1288 all masks required to mask LOOP_VINFO. */
1290 static bool
1291 can_produce_all_loop_masks_p (loop_vec_info loop_vinfo, tree cmp_type)
1293 rgroup_masks *rgm;
1294 unsigned int i;
1295 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), i, rgm)
1296 if (rgm->mask_type != NULL_TREE
1297 && !direct_internal_fn_supported_p (IFN_WHILE_ULT,
1298 cmp_type, rgm->mask_type,
1299 OPTIMIZE_FOR_SPEED))
1300 return false;
1301 return true;
1304 /* Calculate the maximum number of scalars per iteration for every
1305 rgroup in LOOP_VINFO. */
1307 static unsigned int
1308 vect_get_max_nscalars_per_iter (loop_vec_info loop_vinfo)
1310 unsigned int res = 1;
1311 unsigned int i;
1312 rgroup_masks *rgm;
1313 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), i, rgm)
1314 res = MAX (res, rgm->max_nscalars_per_iter);
1315 return res;
1318 /* Each statement in LOOP_VINFO can be masked where necessary. Check
1319 whether we can actually generate the masks required. Return true if so,
1320 storing the type of the scalar IV in LOOP_VINFO_MASK_COMPARE_TYPE. */
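/* Conceptually (an illustrative sketch, not the generated code), a
   fully-masked loop executes as

     for (i = 0; i < n; i += VF)
       {
         mask = WHILE_ULT (i, n);   (lane j is active iff i + j < n)
         ... loads, stores and live-outs predicated on "mask" ...
       }

   so the comparison type chosen here must be wide enough to represent the
   largest value the scalar IV can reach, scaled by the maximum number of
   scalars per iteration for replicated rgroup masks.  */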
1322 static bool
1323 vect_verify_full_masking (loop_vec_info loop_vinfo)
1325 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1326 unsigned int min_ni_width;
1328 /* Use a normal loop if there are no statements that need masking.
1329 This only happens in rare degenerate cases: it means that the loop
1330 has no loads, no stores, and no live-out values. */
1331 if (LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
1332 return false;
1334 /* Get the maximum number of iterations that is representable
1335 in the counter type. */
1336 tree ni_type = TREE_TYPE (LOOP_VINFO_NITERSM1 (loop_vinfo));
1337 widest_int max_ni = wi::to_widest (TYPE_MAX_VALUE (ni_type)) + 1;
1339 /* Get a more refined estimate for the number of iterations. */
1340 widest_int max_back_edges;
1341 if (max_loop_iterations (loop, &max_back_edges))
1342 max_ni = wi::smin (max_ni, max_back_edges + 1);
1344 /* Account for rgroup masks, in which each bit is replicated N times. */
1345 max_ni *= vect_get_max_nscalars_per_iter (loop_vinfo);
1347 /* Work out how many bits we need to represent the limit. */
1348 min_ni_width = wi::min_precision (max_ni, UNSIGNED);
1350 /* Find a scalar mode for which WHILE_ULT is supported. */
1351 opt_scalar_int_mode cmp_mode_iter;
1352 tree cmp_type = NULL_TREE;
1353 FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)
1355 unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ());
1356 if (cmp_bits >= min_ni_width
1357 && targetm.scalar_mode_supported_p (cmp_mode_iter.require ()))
1359 tree this_type = build_nonstandard_integer_type (cmp_bits, true);
1360 if (this_type
1361 && can_produce_all_loop_masks_p (loop_vinfo, this_type))
1363 /* Although we could stop as soon as we find a valid mode,
1364 it's often better to continue until we hit Pmode, since the
1365 operands to the WHILE are more likely to be reusable in
1366 address calculations. */
1367 cmp_type = this_type;
1368 if (cmp_bits >= GET_MODE_BITSIZE (Pmode))
1369 break;
1374 if (!cmp_type)
1375 return false;
1377 LOOP_VINFO_MASK_COMPARE_TYPE (loop_vinfo) = cmp_type;
1378 return true;
1381 /* Calculate the cost of one scalar iteration of the loop. */
1382 static void
1383 vect_compute_single_scalar_iteration_cost (loop_vec_info loop_vinfo)
1385 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1386 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1387 int nbbs = loop->num_nodes, factor;
1388 int innerloop_iters, i;
1390 /* Gather costs for statements in the scalar loop. */
1392 /* FORNOW. */
1393 innerloop_iters = 1;
1394 if (loop->inner)
1395 innerloop_iters = 50; /* FIXME */
1397 for (i = 0; i < nbbs; i++)
1399 gimple_stmt_iterator si;
1400 basic_block bb = bbs[i];
1402 if (bb->loop_father == loop->inner)
1403 factor = innerloop_iters;
1404 else
1405 factor = 1;
1407 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1409 gimple *stmt = gsi_stmt (si);
1410 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1412 if (!is_gimple_assign (stmt) && !is_gimple_call (stmt))
1413 continue;
1415 /* Skip stmts that are not vectorized inside the loop. */
1416 if (stmt_info
1417 && !STMT_VINFO_RELEVANT_P (stmt_info)
1418 && (!STMT_VINFO_LIVE_P (stmt_info)
1419 || !VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1420 && !STMT_VINFO_IN_PATTERN_P (stmt_info))
1421 continue;
1423 vect_cost_for_stmt kind;
1424 if (STMT_VINFO_DATA_REF (stmt_info))
1426 if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
1427 kind = scalar_load;
1428 else
1429 kind = scalar_store;
1431 else
1432 kind = scalar_stmt;
1434 record_stmt_cost (&LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1435 factor, kind, stmt_info, 0, vect_prologue);
1439 /* Now accumulate cost. */
1440 void *target_cost_data = init_cost (loop);
1441 stmt_info_for_cost *si;
1442 int j;
1443 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1444 j, si)
1446 struct _stmt_vec_info *stmt_info
1447 = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
1448 (void) add_stmt_cost (target_cost_data, si->count,
1449 si->kind, stmt_info, si->misalign,
1450 vect_body);
1452 unsigned dummy, body_cost = 0;
1453 finish_cost (target_cost_data, &dummy, &body_cost, &dummy);
1454 destroy_cost_data (target_cost_data);
1455 LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo) = body_cost;
1459 /* Function vect_analyze_loop_form_1.
1461 Verify that certain CFG restrictions hold, including:
1462 - the loop has a pre-header
1463 - the loop has a single entry and exit
1464 - the loop exit condition is simple enough
1465 - the number of iterations can be analyzed, i.e., a countable loop. The
1466 niter could be analyzed under some assumptions. */
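/* For instance (illustrative), these restrictions admit a simple counted
   loop such as

     for (i = 0; i < n; i++)
       a[i] = b[i];

   which has a pre-header, a single exit and a computable iteration count,
   but reject loops with additional control flow in the body or with
   multiple exits.  */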
1468 bool
1469 vect_analyze_loop_form_1 (struct loop *loop, gcond **loop_cond,
1470 tree *assumptions, tree *number_of_iterationsm1,
1471 tree *number_of_iterations, gcond **inner_loop_cond)
1473 if (dump_enabled_p ())
1474 dump_printf_loc (MSG_NOTE, vect_location,
1475 "=== vect_analyze_loop_form ===\n");
1477 /* Different restrictions apply when we are considering an inner-most loop,
1478 vs. an outer (nested) loop.
1479 (FORNOW. May want to relax some of these restrictions in the future). */
1481 if (!loop->inner)
1483 /* Inner-most loop. We currently require that the number of BBs is
1484 exactly 2 (the header and latch). Vectorizable inner-most loops
1485 look like this:
1487 (pre-header)
1489 header <--------+
1490 | | |
1491 | +--> latch --+
1493 (exit-bb) */
1495 if (loop->num_nodes != 2)
1497 if (dump_enabled_p ())
1498 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1499 "not vectorized: control flow in loop.\n");
1500 return false;
1503 if (empty_block_p (loop->header))
1505 if (dump_enabled_p ())
1506 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1507 "not vectorized: empty loop.\n");
1508 return false;
1511 else
1513 struct loop *innerloop = loop->inner;
1514 edge entryedge;
1516 /* Nested loop. We currently require that the loop is doubly-nested,
1517 contains a single inner loop, and the number of BBs is exactly 5.
1518 Vectorizable outer-loops look like this:
1520 (pre-header)
1522 header <---+
1524 inner-loop |
1526 tail ------+
1528 (exit-bb)
1530 The inner-loop has the properties expected of inner-most loops
1531 as described above. */
1533 if ((loop->inner)->inner || (loop->inner)->next)
1535 if (dump_enabled_p ())
1536 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1537 "not vectorized: multiple nested loops.\n");
1538 return false;
1541 if (loop->num_nodes != 5)
1543 if (dump_enabled_p ())
1544 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1545 "not vectorized: control flow in loop.\n");
1546 return false;
1549 entryedge = loop_preheader_edge (innerloop);
1550 if (entryedge->src != loop->header
1551 || !single_exit (innerloop)
1552 || single_exit (innerloop)->dest != EDGE_PRED (loop->latch, 0)->src)
1554 if (dump_enabled_p ())
1555 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1556 "not vectorized: unsupported outerloop form.\n");
1557 return false;
1560 /* Analyze the inner-loop. */
1561 tree inner_niterm1, inner_niter, inner_assumptions;
1562 if (! vect_analyze_loop_form_1 (loop->inner, inner_loop_cond,
1563 &inner_assumptions, &inner_niterm1,
1564 &inner_niter, NULL)
1565 /* Don't support analyzing niter under assumptions for inner
1566 loop. */
1567 || !integer_onep (inner_assumptions))
1569 if (dump_enabled_p ())
1570 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1571 "not vectorized: Bad inner loop.\n");
1572 return false;
1575 if (!expr_invariant_in_loop_p (loop, inner_niter))
1577 if (dump_enabled_p ())
1578 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1579 "not vectorized: inner-loop count not"
1580 " invariant.\n");
1581 return false;
1584 if (dump_enabled_p ())
1585 dump_printf_loc (MSG_NOTE, vect_location,
1586 "Considering outer-loop vectorization.\n");
1589 if (!single_exit (loop)
1590 || EDGE_COUNT (loop->header->preds) != 2)
1592 if (dump_enabled_p ())
1594 if (!single_exit (loop))
1595 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1596 "not vectorized: multiple exits.\n");
1597 else if (EDGE_COUNT (loop->header->preds) != 2)
1598 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1599 "not vectorized: too many incoming edges.\n");
1601 return false;
1604 /* We assume that the loop exit condition is at the end of the loop, i.e.,
1605 that the loop is represented as a do-while (with a proper if-guard
1606 before the loop if needed), where the loop header contains all the
1607 executable statements, and the latch is empty. */
1608 if (!empty_block_p (loop->latch)
1609 || !gimple_seq_empty_p (phi_nodes (loop->latch)))
1611 if (dump_enabled_p ())
1612 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1613 "not vectorized: latch block not empty.\n");
1614 return false;
1617 /* Make sure the exit is not abnormal. */
1618 edge e = single_exit (loop);
1619 if (e->flags & EDGE_ABNORMAL)
1621 if (dump_enabled_p ())
1622 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1623 "not vectorized: abnormal loop exit edge.\n");
1624 return false;
1627 *loop_cond = vect_get_loop_niters (loop, assumptions, number_of_iterations,
1628 number_of_iterationsm1);
1629 if (!*loop_cond)
1631 if (dump_enabled_p ())
1632 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1633 "not vectorized: complicated exit condition.\n");
1634 return false;
1637 if (integer_zerop (*assumptions)
1638 || !*number_of_iterations
1639 || chrec_contains_undetermined (*number_of_iterations))
1641 if (dump_enabled_p ())
1642 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1643 "not vectorized: number of iterations cannot be "
1644 "computed.\n");
1645 return false;
1648 if (integer_zerop (*number_of_iterations))
1650 if (dump_enabled_p ())
1651 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1652 "not vectorized: number of iterations = 0.\n");
1653 return false;
1656 return true;
1659 /* Analyze LOOP form and return a loop_vec_info if it is of suitable form. */
1661 loop_vec_info
1662 vect_analyze_loop_form (struct loop *loop)
1664 tree assumptions, number_of_iterations, number_of_iterationsm1;
1665 gcond *loop_cond, *inner_loop_cond = NULL;
1667 if (! vect_analyze_loop_form_1 (loop, &loop_cond,
1668 &assumptions, &number_of_iterationsm1,
1669 &number_of_iterations, &inner_loop_cond))
1670 return NULL;
1672 loop_vec_info loop_vinfo = new _loop_vec_info (loop);
1673 LOOP_VINFO_NITERSM1 (loop_vinfo) = number_of_iterationsm1;
1674 LOOP_VINFO_NITERS (loop_vinfo) = number_of_iterations;
1675 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = number_of_iterations;
1676 if (!integer_onep (assumptions))
1678 /* We consider vectorizing this loop by versioning it under
1679 some assumptions. In order to do this, we need to clear
1680 existing information computed by scev and niter analyzer. */
1681 scev_reset_htab ();
1682 free_numbers_of_iterations_estimates (loop);
1683 /* Also set a flag for this loop so that the following scev and niter
1684 analyses are done under the assumptions.
1685 loop_constraint_set (loop, LOOP_C_FINITE);
1686 /* Also record the assumptions for versioning. */
1687 LOOP_VINFO_NITERS_ASSUMPTIONS (loop_vinfo) = assumptions;
1690 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1692 if (dump_enabled_p ())
1694 dump_printf_loc (MSG_NOTE, vect_location,
1695 "Symbolic number of iterations is ");
1696 dump_generic_expr (MSG_NOTE, TDF_DETAILS, number_of_iterations);
1697 dump_printf (MSG_NOTE, "\n");
1701 STMT_VINFO_TYPE (vinfo_for_stmt (loop_cond)) = loop_exit_ctrl_vec_info_type;
1702 if (inner_loop_cond)
1703 STMT_VINFO_TYPE (vinfo_for_stmt (inner_loop_cond))
1704 = loop_exit_ctrl_vec_info_type;
1706 gcc_assert (!loop->aux);
1707 loop->aux = loop_vinfo;
1708 return loop_vinfo;
1713 /* Scan the loop stmts and, depending on whether there are any non-SLP
1714 statements, update the vectorization factor. */
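/* For example (illustrative): if loop-based analysis chose a
   vectorization factor of 4 and the SLP instances require an unrolling
   factor of 2, the updated factor is their least common multiple, 4.
   If instead every relevant statement is covered by SLP, the SLP
   unrolling factor is used directly.  */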
1716 static void
1717 vect_update_vf_for_slp (loop_vec_info loop_vinfo)
1719 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1720 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1721 int nbbs = loop->num_nodes;
1722 poly_uint64 vectorization_factor;
1723 int i;
1725 if (dump_enabled_p ())
1726 dump_printf_loc (MSG_NOTE, vect_location,
1727 "=== vect_update_vf_for_slp ===\n");
1729 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1730 gcc_assert (known_ne (vectorization_factor, 0U));
1732 /* If all the stmts in the loop can be SLPed, we perform only SLP, and
1733 the vectorization factor of the loop is the unrolling factor required by
1734 the SLP instances. If that unrolling factor is 1, we say that we
1735 perform pure SLP on the loop; cross-iteration parallelism is not
1736 exploited. */
1737 bool only_slp_in_loop = true;
1738 for (i = 0; i < nbbs; i++)
1740 basic_block bb = bbs[i];
1741 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1742 gsi_next (&si))
1744 gimple *stmt = gsi_stmt (si);
1745 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1746 if (STMT_VINFO_IN_PATTERN_P (stmt_info)
1747 && STMT_VINFO_RELATED_STMT (stmt_info))
1749 stmt = STMT_VINFO_RELATED_STMT (stmt_info);
1750 stmt_info = vinfo_for_stmt (stmt);
1752 if ((STMT_VINFO_RELEVANT_P (stmt_info)
1753 || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1754 && !PURE_SLP_STMT (stmt_info))
1755 /* STMT needs both SLP and loop-based vectorization. */
1756 only_slp_in_loop = false;
1760 if (only_slp_in_loop)
1762 dump_printf_loc (MSG_NOTE, vect_location,
1763 "Loop contains only SLP stmts\n");
1764 vectorization_factor = LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo);
1766 else
1768 dump_printf_loc (MSG_NOTE, vect_location,
1769 "Loop contains SLP and non-SLP stmts\n");
1770 /* Both the vectorization factor and unroll factor have the form
1771 current_vector_size * X for some rational X, so they must have
1772 a common multiple. */
1773 vectorization_factor
1774 = force_common_multiple (vectorization_factor,
1775 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo));
1778 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
1779 if (dump_enabled_p ())
1781 dump_printf_loc (MSG_NOTE, vect_location,
1782 "Updating vectorization factor to ");
1783 dump_dec (MSG_NOTE, vectorization_factor);
1784 dump_printf (MSG_NOTE, ".\n");
1788 /* Return true if STMT_INFO describes a double reduction phi and if
1789 the other phi in the reduction is also relevant for vectorization.
1790 This rejects cases such as:
1792 outer1:
1793 x_1 = PHI <x_3(outer2), ...>;
1796 inner:
1797 x_2 = ...;
1800 outer2:
1801 x_3 = PHI <x_2(inner)>;
1803 if nothing in x_2 or elsewhere makes x_1 relevant. */
1805 static bool
1806 vect_active_double_reduction_p (stmt_vec_info stmt_info)
1808 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
1809 return false;
1811 gimple *other_phi = STMT_VINFO_REDUC_DEF (stmt_info);
1812 return STMT_VINFO_RELEVANT_P (vinfo_for_stmt (other_phi));
1815 /* Function vect_analyze_loop_operations.
1817 Scan the loop stmts and make sure they are all vectorizable. */
1819 static bool
1820 vect_analyze_loop_operations (loop_vec_info loop_vinfo)
1822 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1823 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1824 int nbbs = loop->num_nodes;
1825 int i;
1826 stmt_vec_info stmt_info;
1827 bool need_to_vectorize = false;
1828 bool ok;
1830 if (dump_enabled_p ())
1831 dump_printf_loc (MSG_NOTE, vect_location,
1832 "=== vect_analyze_loop_operations ===\n");
1834 for (i = 0; i < nbbs; i++)
1836 basic_block bb = bbs[i];
1838 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
1839 gsi_next (&si))
1841 gphi *phi = si.phi ();
1842 ok = true;
1844 stmt_info = vinfo_for_stmt (phi);
1845 if (dump_enabled_p ())
1847 dump_printf_loc (MSG_NOTE, vect_location, "examining phi: ");
1848 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
1850 if (virtual_operand_p (gimple_phi_result (phi)))
1851 continue;
1853 /* Inner-loop loop-closed exit phi in outer-loop vectorization
1854 (i.e., a phi in the tail of the outer-loop). */
1855 if (! is_loop_header_bb_p (bb))
1857 /* FORNOW: we currently don't support the case that these phis
1858 are not used in the outerloop (unless it is double reduction,
1859 i.e., this phi is vect_reduction_def), because this case
1860 requires us to actually do something here.
1861 if (STMT_VINFO_LIVE_P (stmt_info)
1862 && !vect_active_double_reduction_p (stmt_info))
1864 if (dump_enabled_p ())
1865 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1866 "Unsupported loop-closed phi in "
1867 "outer-loop.\n");
1868 return false;
1871 /* If PHI is used in the outer loop, we check that its operand
1872 is defined in the inner loop. */
1873 if (STMT_VINFO_RELEVANT_P (stmt_info))
1875 tree phi_op;
1876 gimple *op_def_stmt;
1878 if (gimple_phi_num_args (phi) != 1)
1879 return false;
1881 phi_op = PHI_ARG_DEF (phi, 0);
1882 if (TREE_CODE (phi_op) != SSA_NAME)
1883 return false;
1885 op_def_stmt = SSA_NAME_DEF_STMT (phi_op);
1886 if (gimple_nop_p (op_def_stmt)
1887 || !flow_bb_inside_loop_p (loop, gimple_bb (op_def_stmt))
1888 || !vinfo_for_stmt (op_def_stmt))
1889 return false;
1891 if (STMT_VINFO_RELEVANT (vinfo_for_stmt (op_def_stmt))
1892 != vect_used_in_outer
1893 && STMT_VINFO_RELEVANT (vinfo_for_stmt (op_def_stmt))
1894 != vect_used_in_outer_by_reduction)
1895 return false;
1898 continue;
1901 gcc_assert (stmt_info);
1903 if ((STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope
1904 || STMT_VINFO_LIVE_P (stmt_info))
1905 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
1907 /* A scalar-dependence cycle that we don't support. */
1908 if (dump_enabled_p ())
1909 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1910 "not vectorized: scalar dependence cycle.\n");
1911 return false;
1914 if (STMT_VINFO_RELEVANT_P (stmt_info))
1916 need_to_vectorize = true;
1917 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
1918 && ! PURE_SLP_STMT (stmt_info))
1919 ok = vectorizable_induction (phi, NULL, NULL, NULL);
1920 else if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
1921 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
1922 && ! PURE_SLP_STMT (stmt_info))
1923 ok = vectorizable_reduction (phi, NULL, NULL, NULL, NULL);
1926 /* SLP PHIs are tested by vect_slp_analyze_node_operations. */
1927 if (ok
1928 && STMT_VINFO_LIVE_P (stmt_info)
1929 && !PURE_SLP_STMT (stmt_info))
1930 ok = vectorizable_live_operation (phi, NULL, NULL, -1, NULL);
1932 if (!ok)
1934 if (dump_enabled_p ())
1936 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1937 "not vectorized: relevant phi not "
1938 "supported: ");
1939 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, phi, 0);
1941 return false;
1945 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1946 gsi_next (&si))
1948 gimple *stmt = gsi_stmt (si);
1949 if (!gimple_clobber_p (stmt)
1950 && !vect_analyze_stmt (stmt, &need_to_vectorize, NULL, NULL))
1951 return false;
1953 } /* bbs */
1955 /* All operations in the loop are either irrelevant (they deal with loop
1956 control, or are dead), or only used outside the loop and can be moved
1957 out of the loop (e.g. invariants, inductions). The loop can be
1958 optimized away by scalar optimizations. We're better off not
1959 touching this loop. */
1960 if (!need_to_vectorize)
1962 if (dump_enabled_p ())
1963 dump_printf_loc (MSG_NOTE, vect_location,
1964 "All the computation can be taken out of the loop.\n");
1965 if (dump_enabled_p ())
1966 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1967 "not vectorized: redundant loop. no profit to "
1968 "vectorize.\n");
1969 return false;
1972 return true;
1975 /* Analyze the cost of the loop described by LOOP_VINFO. Decide if it
1976 is worthwhile to vectorize. Return 1 if definitely yes, 0 if
1977 definitely no, or -1 if it's worth retrying. */
1979 static int
1980 vect_analyze_loop_costing (loop_vec_info loop_vinfo)
1982 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1983 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
1985 /* Only fully-masked loops can have iteration counts less than the
1986 vectorization factor. */
1987 if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
1989 HOST_WIDE_INT max_niter;
1991 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1992 max_niter = LOOP_VINFO_INT_NITERS (loop_vinfo);
1993 else
1994 max_niter = max_stmt_executions_int (loop);
1996 if (max_niter != -1
1997 && (unsigned HOST_WIDE_INT) max_niter < assumed_vf)
1999 if (dump_enabled_p ())
2000 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2001 "not vectorized: iteration count smaller than "
2002 "vectorization factor.\n");
2003 return 0;
2007 int min_profitable_iters, min_profitable_estimate;
2008 vect_estimate_min_profitable_iters (loop_vinfo, &min_profitable_iters,
2009 &min_profitable_estimate);
2011 if (min_profitable_iters < 0)
2013 if (dump_enabled_p ())
2014 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2015 "not vectorized: vectorization not profitable.\n");
2016 if (dump_enabled_p ())
2017 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2018 "not vectorized: vector version will never be "
2019 "profitable.\n");
2020 return -1;
2023 int min_scalar_loop_bound = (PARAM_VALUE (PARAM_MIN_VECT_LOOP_BOUND)
2024 * assumed_vf);
2026 /* Use the cost model only if it is more conservative than the
2027 user-specified threshold. */
2028 unsigned int th = (unsigned) MAX (min_scalar_loop_bound,
2029 min_profitable_iters);
2031 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = th;
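/* Worked example (illustrative numbers only): with
   --param min-vect-loop-bound=2 and an assumed vectorization factor of 4,
   min_scalar_loop_bound is 2 * 4 = 8; if the cost model computed
   min_profitable_iters == 5, the threshold used below is
   MAX (8, 5) == 8 iterations.  */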
2033 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2034 && LOOP_VINFO_INT_NITERS (loop_vinfo) < th)
2036 if (dump_enabled_p ())
2037 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2038 "not vectorized: vectorization not profitable.\n");
2039 if (dump_enabled_p ())
2040 dump_printf_loc (MSG_NOTE, vect_location,
2041 "not vectorized: iteration count smaller than user "
2042 "specified loop bound parameter or minimum profitable "
2043 "iterations (whichever is more conservative).\n");
2044 return 0;
2047 HOST_WIDE_INT estimated_niter = estimated_stmt_executions_int (loop);
2048 if (estimated_niter == -1)
2049 estimated_niter = likely_max_stmt_executions_int (loop);
2050 if (estimated_niter != -1
2051 && ((unsigned HOST_WIDE_INT) estimated_niter
2052 < MAX (th, (unsigned) min_profitable_estimate)))
2054 if (dump_enabled_p ())
2055 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2056 "not vectorized: estimated iteration count too "
2057 "small.\n");
2058 if (dump_enabled_p ())
2059 dump_printf_loc (MSG_NOTE, vect_location,
2060 "not vectorized: estimated iteration count smaller "
2061 "than specified loop bound parameter or minimum "
2062 "profitable iterations (whichever is more "
2063 "conservative).\n");
2064 return -1;
2067 return 1;
2071 /* Function vect_analyze_loop_2.
2073 Apply a set of analyses on LOOP, and create a loop_vec_info struct
2074 for it. The different analyses will record information in the
2075 loop_vec_info struct. */
2076 static bool
2077 vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal)
2079 bool ok;
2080 int res;
2081 unsigned int max_vf = MAX_VECTORIZATION_FACTOR;
2082 poly_uint64 min_vf = 2;
2083 unsigned int n_stmts = 0;
2085 /* The first group of checks is independent of the vector size. */
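/* Failures in this first group are treated as fatal: if FATAL is still
   set when we return, the caller (vect_analyze_loop) gives up instead of
   retrying the analysis with a different vector size.  */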
2086 fatal = true;
2088 /* Find all data references in the loop (which correspond to vdefs/vuses)
2089 and analyze their evolution in the loop. */
2091 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
2093 loop_p loop = LOOP_VINFO_LOOP (loop_vinfo);
2094 if (!find_loop_nest (loop, &LOOP_VINFO_LOOP_NEST (loop_vinfo)))
2096 if (dump_enabled_p ())
2097 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2098 "not vectorized: loop nest containing two "
2099 "or more consecutive inner loops cannot be "
2100 "vectorized\n");
2101 return false;
2104 for (unsigned i = 0; i < loop->num_nodes; i++)
2105 for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
2106 !gsi_end_p (gsi); gsi_next (&gsi))
2108 gimple *stmt = gsi_stmt (gsi);
2109 if (is_gimple_debug (stmt))
2110 continue;
2111 ++n_stmts;
2112 if (!find_data_references_in_stmt (loop, stmt,
2113 &LOOP_VINFO_DATAREFS (loop_vinfo)))
2115 if (is_gimple_call (stmt) && loop->safelen)
2117 tree fndecl = gimple_call_fndecl (stmt), op;
2118 if (fndecl != NULL_TREE)
2120 cgraph_node *node = cgraph_node::get (fndecl);
2121 if (node != NULL && node->simd_clones != NULL)
2123 unsigned int j, n = gimple_call_num_args (stmt);
2124 for (j = 0; j < n; j++)
2126 op = gimple_call_arg (stmt, j);
2127 if (DECL_P (op)
2128 || (REFERENCE_CLASS_P (op)
2129 && get_base_address (op)))
2130 break;
2132 op = gimple_call_lhs (stmt);
2133 /* Ignore #pragma omp declare simd functions
2134 if they don't have data references in the
2135 call stmt itself. */
2136 if (j == n
2137 && !(op
2138 && (DECL_P (op)
2139 || (REFERENCE_CLASS_P (op)
2140 && get_base_address (op)))))
2141 continue;
2145 if (dump_enabled_p ())
2146 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2147 "not vectorized: loop contains function "
2148 "calls or data references that cannot "
2149 "be analyzed\n");
2150 return false;
2154 /* Analyze the data references and also adjust the minimal
2155 vectorization factor according to the loads and stores. */
2157 ok = vect_analyze_data_refs (loop_vinfo, &min_vf);
2158 if (!ok)
2160 if (dump_enabled_p ())
2161 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2162 "bad data references.\n");
2163 return false;
2166 /* Classify all cross-iteration scalar data-flow cycles.
2167 Cross-iteration cycles caused by virtual phis are analyzed separately. */
2168 vect_analyze_scalar_cycles (loop_vinfo);
2170 vect_pattern_recog (loop_vinfo);
2172 vect_fixup_scalar_cycles_with_patterns (loop_vinfo);
2174 /* Analyze the access patterns of the data-refs in the loop (consecutive,
2175 complex, etc.). FORNOW: Only handle consecutive access pattern. */
2177 ok = vect_analyze_data_ref_accesses (loop_vinfo);
2178 if (!ok)
2180 if (dump_enabled_p ())
2181 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2182 "bad data access.\n");
2183 return false;
2186 /* Data-flow analysis to detect stmts that do not need to be vectorized. */
2188 ok = vect_mark_stmts_to_be_vectorized (loop_vinfo);
2189 if (!ok)
2191 if (dump_enabled_p ())
2192 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2193 "unexpected pattern.\n");
2194 return false;
2197 /* The rest of the analysis below depends on the vector size in some way,
so from here on failures are no longer treated as fatal. */
2198 fatal = false;
2200 /* Analyze data dependences between the data-refs in the loop
2201 and adjust the maximum vectorization factor according to
2202 the dependences.
2203 FORNOW: fail at the first data dependence that we encounter. */
2205 ok = vect_analyze_data_ref_dependences (loop_vinfo, &max_vf);
2206 if (!ok
2207 || (max_vf != MAX_VECTORIZATION_FACTOR
2208 && maybe_lt (max_vf, min_vf)))
2210 if (dump_enabled_p ())
2211 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2212 "bad data dependence.\n");
2213 return false;
2215 LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo) = max_vf;
2217 ok = vect_determine_vectorization_factor (loop_vinfo);
2218 if (!ok)
2220 if (dump_enabled_p ())
2221 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2222 "can't determine vectorization factor.\n");
2223 return false;
2225 if (max_vf != MAX_VECTORIZATION_FACTOR
2226 && maybe_lt (max_vf, LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
2228 if (dump_enabled_p ())
2229 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2230 "bad data dependence.\n");
2231 return false;
2234 /* Compute the scalar iteration cost. */
2235 vect_compute_single_scalar_iteration_cost (loop_vinfo);
2237 poly_uint64 saved_vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2238 unsigned th;
2240 /* Check the SLP opportunities in the loop, analyze and build SLP trees. */
2241 ok = vect_analyze_slp (loop_vinfo, n_stmts);
2242 if (!ok)
2243 return false;
2245 /* If there are any SLP instances mark them as pure_slp. */
2246 bool slp = vect_make_slp_decision (loop_vinfo);
2247 if (slp)
2249 /* Find stmts that need to be both vectorized and SLPed. */
2250 vect_detect_hybrid_slp (loop_vinfo);
2252 /* Update the vectorization factor based on the SLP decision. */
2253 vect_update_vf_for_slp (loop_vinfo);
2256 bool saved_can_fully_mask_p = LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo);
2258 /* We don't expect to have to roll back to anything other than an empty
2259 set of rgroups. */
2260 gcc_assert (LOOP_VINFO_MASKS (loop_vinfo).is_empty ());
2262 /* This is the point where we can re-start analysis with SLP forced off. */
2263 start_over:
2265 /* Now the vectorization factor is final. */
2266 poly_uint64 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2267 gcc_assert (known_ne (vectorization_factor, 0U));
2269 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && dump_enabled_p ())
2271 dump_printf_loc (MSG_NOTE, vect_location,
2272 "vectorization_factor = ");
2273 dump_dec (MSG_NOTE, vectorization_factor);
2274 dump_printf (MSG_NOTE, ", niters = " HOST_WIDE_INT_PRINT_DEC "\n",
2275 LOOP_VINFO_INT_NITERS (loop_vinfo));
2278 HOST_WIDE_INT max_niter
2279 = likely_max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
2281 /* Analyze the alignment of the data-refs in the loop.
2282 Fail if a data reference is found that cannot be vectorized. */
2284 ok = vect_analyze_data_refs_alignment (loop_vinfo);
2285 if (!ok)
2287 if (dump_enabled_p ())
2288 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2289 "bad data alignment.\n");
2290 return false;
2293 /* Prune the list of ddrs to be tested at run-time by versioning for alias.
2294 It is important to call pruning after vect_analyze_data_ref_accesses,
2295 since we use grouping information gathered by interleaving analysis. */
2296 ok = vect_prune_runtime_alias_test_list (loop_vinfo);
2297 if (!ok)
2298 return false;
2300 /* Do not invoke vect_enhance_data_refs_alignment for epilogue
2301 vectorization. */
2302 if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2304 /* This pass will decide on using loop versioning and/or loop peeling in
2305 order to enhance the alignment of data references in the loop. */
2306 ok = vect_enhance_data_refs_alignment (loop_vinfo);
2307 if (!ok)
2309 if (dump_enabled_p ())
2310 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2311 "bad data alignment.\n");
2312 return false;
2316 if (slp)
2318 /* Analyze operations in the SLP instances. Note this may
2319 remove unsupported SLP instances which makes the above
2320 SLP kind detection invalid. */
2321 unsigned old_size = LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length ();
2322 vect_slp_analyze_operations (loop_vinfo);
2323 if (LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length () != old_size)
2324 goto again;
2327 /* Scan all the remaining operations in the loop that are not subject
2328 to SLP and make sure they are vectorizable. */
2329 ok = vect_analyze_loop_operations (loop_vinfo);
2330 if (!ok)
2332 if (dump_enabled_p ())
2333 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2334 "bad operation or unsupported loop bound.\n");
2335 return false;
2338 /* Decide whether to use a fully-masked loop for this vectorization
2339 factor. */
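/* In a fully-masked loop every vector statement is predicated on a loop
   mask, so the vector loop can handle iteration counts that are smaller
   than (or not a multiple of) the vectorization factor without a scalar
   epilogue.  */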
2340 LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
2341 = (LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo)
2342 && vect_verify_full_masking (loop_vinfo));
2343 if (dump_enabled_p ())
2345 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
2346 dump_printf_loc (MSG_NOTE, vect_location,
2347 "using a fully-masked loop.\n");
2348 else
2349 dump_printf_loc (MSG_NOTE, vect_location,
2350 "not using a fully-masked loop.\n");
2353 /* If an epilogue loop is required because of data accesses with gaps,
2354 one additional iteration needs to be peeled. Check if there are
2355 enough iterations for vectorization. */
2356 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2357 && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2358 && !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
2360 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2361 tree scalar_niters = LOOP_VINFO_NITERSM1 (loop_vinfo);
2363 if (known_lt (wi::to_widest (scalar_niters), vf))
2365 if (dump_enabled_p ())
2366 dump_printf_loc (MSG_NOTE, vect_location,
2367 "loop has no enough iterations to support"
2368 " peeling for gaps.\n");
2369 return false;
2373 /* Check the costings of the loop make vectorizing worthwhile. */
2374 res = vect_analyze_loop_costing (loop_vinfo);
2375 if (res < 0)
2376 goto again;
2377 if (!res)
2379 if (dump_enabled_p ())
2380 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2381 "Loop costings not worthwhile.\n");
2382 return false;
2385 /* Decide whether we need to create an epilogue loop to handle
2386 remaining scalar iterations. */
2387 th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
2389 unsigned HOST_WIDE_INT const_vf;
2390 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
2391 /* The main loop handles all iterations. */
2392 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
2393 else if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2394 && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) > 0)
2396 if (!multiple_p (LOOP_VINFO_INT_NITERS (loop_vinfo)
2397 - LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo),
2398 LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
2399 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
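/* In the test below, tree_ctz gives the number of low-order bits of the
   iteration count that are known to be zero, so comparing it with
   exact_log2 of the constant VF checks whether the count is known to be
   a multiple of the (power-of-two) vectorization factor; if it is not,
   an epilogue may be needed for the leftover iterations.  */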
2401 else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
2402 || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&const_vf)
2403 || ((tree_ctz (LOOP_VINFO_NITERS (loop_vinfo))
2404 < (unsigned) exact_log2 (const_vf))
2405 /* In case of versioning, check if the maximum number of
2406 iterations is greater than th. If they are identical,
2407 the epilogue is unnecessary. */
2408 && (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
2409 || ((unsigned HOST_WIDE_INT) max_niter
2410 > (th / const_vf) * const_vf))))
2411 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
2413 /* If an epilogue loop is required make sure we can create one. */
2414 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2415 || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo))
2417 if (dump_enabled_p ())
2418 dump_printf_loc (MSG_NOTE, vect_location, "epilog loop required\n");
2419 if (!vect_can_advance_ivs_p (loop_vinfo)
2420 || !slpeel_can_duplicate_loop_p (LOOP_VINFO_LOOP (loop_vinfo),
2421 single_exit (LOOP_VINFO_LOOP
2422 (loop_vinfo))))
2424 if (dump_enabled_p ())
2425 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2426 "not vectorized: can't create required "
2427 "epilog loop\n");
2428 goto again;
2432 /* During peeling, we need to check whether the number of loop iterations
2433 is enough for both the peeled prologue loop and the vector loop. This
2434 check can be merged with the threshold check of loop versioning, so
2435 increase the threshold for this case if necessary. */
2436 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
2438 poly_uint64 niters_th = 0;
2440 if (!vect_use_loop_mask_for_alignment_p (loop_vinfo))
2442 /* Niters for peeled prolog loop. */
2443 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
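/* A negative value means the number of prologue iterations is not
   known at compile time; assume the worst case of one full vector's
   worth of iterations minus one.  */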
2445 struct data_reference *dr = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
2446 tree vectype
2447 = STMT_VINFO_VECTYPE (vinfo_for_stmt (DR_STMT (dr)));
2448 niters_th += TYPE_VECTOR_SUBPARTS (vectype) - 1;
2450 else
2451 niters_th += LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
2454 /* Niters for at least one iteration of vectorized loop. */
2455 if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
2456 niters_th += LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2457 /* One additional iteration because of peeling for gap. */
2458 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
2459 niters_th += 1;
2460 LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = niters_th;
2463 gcc_assert (known_eq (vectorization_factor,
2464 LOOP_VINFO_VECT_FACTOR (loop_vinfo)));
2466 /* Ok to vectorize! */
2467 return true;
2469 again:
2470 /* Try again with SLP forced off, but if we didn't do any SLP there is
2471 no point in re-trying. */
2472 if (!slp)
2473 return false;
2475 /* If there are reduction chains re-trying will fail anyway. */
2476 if (! LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).is_empty ())
2477 return false;
2479 /* Likewise if the grouped loads or stores in the SLP cannot be handled
2480 via interleaving or lane instructions. */
2481 slp_instance instance;
2482 slp_tree node;
2483 unsigned i, j;
2484 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
2486 stmt_vec_info vinfo;
2487 vinfo = vinfo_for_stmt
2488 (SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (instance))[0]);
2489 if (! STMT_VINFO_GROUPED_ACCESS (vinfo))
2490 continue;
2491 vinfo = vinfo_for_stmt (STMT_VINFO_GROUP_FIRST_ELEMENT (vinfo));
2492 unsigned int size = STMT_VINFO_GROUP_SIZE (vinfo);
2493 tree vectype = STMT_VINFO_VECTYPE (vinfo);
2494 if (! vect_store_lanes_supported (vectype, size, false)
2495 && ! known_eq (TYPE_VECTOR_SUBPARTS (vectype), 1U)
2496 && ! vect_grouped_store_supported (vectype, size))
2497 return false;
2498 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), j, node)
2500 vinfo = vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (node)[0]);
2501 vinfo = vinfo_for_stmt (STMT_VINFO_GROUP_FIRST_ELEMENT (vinfo));
2502 bool single_element_p = !STMT_VINFO_GROUP_NEXT_ELEMENT (vinfo);
2503 size = STMT_VINFO_GROUP_SIZE (vinfo);
2504 vectype = STMT_VINFO_VECTYPE (vinfo);
2505 if (! vect_load_lanes_supported (vectype, size, false)
2506 && ! vect_grouped_load_supported (vectype, single_element_p,
2507 size))
2508 return false;
2512 if (dump_enabled_p ())
2513 dump_printf_loc (MSG_NOTE, vect_location,
2514 "re-trying with SLP disabled\n");
2516 /* Roll back state appropriately. No SLP this time. */
2517 slp = false;
2518 /* Restore the vectorization factor as it was without SLP. */
2519 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = saved_vectorization_factor;
2520 /* Free the SLP instances. */
2521 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), j, instance)
2522 vect_free_slp_instance (instance);
2523 LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
2524 /* Reset SLP type to loop_vect on all stmts. */
2525 for (i = 0; i < LOOP_VINFO_LOOP (loop_vinfo)->num_nodes; ++i)
2527 basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
2528 for (gimple_stmt_iterator si = gsi_start_phis (bb);
2529 !gsi_end_p (si); gsi_next (&si))
2531 stmt_vec_info stmt_info = vinfo_for_stmt (gsi_stmt (si));
2532 STMT_SLP_TYPE (stmt_info) = loop_vect;
2534 for (gimple_stmt_iterator si = gsi_start_bb (bb);
2535 !gsi_end_p (si); gsi_next (&si))
2537 stmt_vec_info stmt_info = vinfo_for_stmt (gsi_stmt (si));
2538 STMT_SLP_TYPE (stmt_info) = loop_vect;
2539 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
2541 stmt_info = vinfo_for_stmt (STMT_VINFO_RELATED_STMT (stmt_info));
2542 STMT_SLP_TYPE (stmt_info) = loop_vect;
2543 for (gimple_stmt_iterator pi
2544 = gsi_start (STMT_VINFO_PATTERN_DEF_SEQ (stmt_info));
2545 !gsi_end_p (pi); gsi_next (&pi))
2547 gimple *pstmt = gsi_stmt (pi);
2548 STMT_SLP_TYPE (vinfo_for_stmt (pstmt)) = loop_vect;
2553 /* Free optimized alias test DDRS. */
2554 LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).truncate (0);
2555 LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).release ();
2556 LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).release ();
2557 /* Reset target cost data. */
2558 destroy_cost_data (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo));
2559 LOOP_VINFO_TARGET_COST_DATA (loop_vinfo)
2560 = init_cost (LOOP_VINFO_LOOP (loop_vinfo));
2561 /* Reset accumulated rgroup information. */
2562 release_vec_loop_masks (&LOOP_VINFO_MASKS (loop_vinfo));
2563 /* Reset assorted flags. */
2564 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
2565 LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = false;
2566 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = 0;
2567 LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = 0;
2568 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = saved_can_fully_mask_p;
2570 goto start_over;
2573 /* Function vect_analyze_loop.
2575 Apply a set of analyses on LOOP, and create a loop_vec_info struct
2576 for it. The different analyses will record information in the
2577 loop_vec_info struct. If ORIG_LOOP_VINFO is not NULL, LOOP is the
2578 epilogue of the loop described by ORIG_LOOP_VINFO and must be vectorized. */
2579 loop_vec_info
2580 vect_analyze_loop (struct loop *loop, loop_vec_info orig_loop_vinfo)
2582 loop_vec_info loop_vinfo;
2583 auto_vector_sizes vector_sizes;
2585 /* Autodetect first vector size we try. */
2586 current_vector_size = 0;
2587 targetm.vectorize.autovectorize_vector_sizes (&vector_sizes);
2588 unsigned int next_size = 0;
2590 if (dump_enabled_p ())
2591 dump_printf_loc (MSG_NOTE, vect_location,
2592 "===== analyze_loop_nest =====\n");
2594 if (loop_outer (loop)
2595 && loop_vec_info_for_loop (loop_outer (loop))
2596 && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop))))
2598 if (dump_enabled_p ())
2599 dump_printf_loc (MSG_NOTE, vect_location,
2600 "outer-loop already vectorized.\n");
2601 return NULL;
2604 poly_uint64 autodetected_vector_size = 0;
2605 while (1)
2607 /* Check the CFG characteristics of the loop (nesting, entry/exit). */
2608 loop_vinfo = vect_analyze_loop_form (loop);
2609 if (!loop_vinfo)
2611 if (dump_enabled_p ())
2612 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2613 "bad loop form.\n");
2614 return NULL;
2617 bool fatal = false;
2619 if (orig_loop_vinfo)
2620 LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = orig_loop_vinfo;
2622 if (vect_analyze_loop_2 (loop_vinfo, fatal))
2624 LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1;
2626 return loop_vinfo;
2629 delete loop_vinfo;
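/* Remember the vector size autodetected on the first attempt and skip
   it when walking the target's list of candidate sizes below, so that
   each size is analyzed at most once.  */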
2631 if (next_size == 0)
2632 autodetected_vector_size = current_vector_size;
2634 if (next_size < vector_sizes.length ()
2635 && known_eq (vector_sizes[next_size], autodetected_vector_size))
2636 next_size += 1;
2638 if (fatal
2639 || next_size == vector_sizes.length ()
2640 || known_eq (current_vector_size, 0U))
2641 return NULL;
2643 /* Try the next biggest vector size. */
2644 current_vector_size = vector_sizes[next_size++];
2645 if (dump_enabled_p ())
2647 dump_printf_loc (MSG_NOTE, vect_location,
2648 "***** Re-trying analysis with "
2649 "vector size ");
2650 dump_dec (MSG_NOTE, current_vector_size);
2651 dump_printf (MSG_NOTE, "\n");
2656 /* Return true if there is an in-order reduction function for CODE, storing
2657 it in *REDUC_FN if so. */
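/* An in-order (fold-left) reduction accumulates the elements strictly in
   their original order, e.g. ((((init + x0) + x1) + x2) + ...), which is
   what IFN_FOLD_LEFT_PLUS provides; this matters for floating-point
   additions, where reassociation may change the rounded result.  */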
2659 static bool
2660 fold_left_reduction_fn (tree_code code, internal_fn *reduc_fn)
2662 switch (code)
2664 case PLUS_EXPR:
2665 *reduc_fn = IFN_FOLD_LEFT_PLUS;
2666 return true;
2668 default:
2669 return false;
2673 /* Function reduction_fn_for_scalar_code
2675 Input:
2676 CODE - tree_code of a reduction operation.
2678 Output:
2679 REDUC_FN - the corresponding internal function to be used to reduce the
2680 vector of partial results into a single scalar result, or IFN_LAST
2681 if the operation is a supported reduction operation, but does not have
2682 such an internal function.
2684 Return FALSE if CODE currently cannot be vectorized as a reduction. */
2686 static bool
2687 reduction_fn_for_scalar_code (enum tree_code code, internal_fn *reduc_fn)
2689 switch (code)
2691 case MAX_EXPR:
2692 *reduc_fn = IFN_REDUC_MAX;
2693 return true;
2695 case MIN_EXPR:
2696 *reduc_fn = IFN_REDUC_MIN;
2697 return true;
2699 case PLUS_EXPR:
2700 *reduc_fn = IFN_REDUC_PLUS;
2701 return true;
2703 case BIT_AND_EXPR:
2704 *reduc_fn = IFN_REDUC_AND;
2705 return true;
2707 case BIT_IOR_EXPR:
2708 *reduc_fn = IFN_REDUC_IOR;
2709 return true;
2711 case BIT_XOR_EXPR:
2712 *reduc_fn = IFN_REDUC_XOR;
2713 return true;
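/* MULT_EXPR and MINUS_EXPR are supported reductions, but there is no
   single internal function that reduces the vector of partial results;
   IFN_LAST tells the caller that the final reduction has to be generated
   some other way (presumably open-coded in the reduction epilogue).  */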
2715 case MULT_EXPR:
2716 case MINUS_EXPR:
2717 *reduc_fn = IFN_LAST;
2718 return true;
2720 default:
2721 return false;
2725 /* If there is a neutral value X such that SLP reduction NODE would not
2726 be affected by the introduction of additional X elements, return that X,
2727 otherwise return null. CODE is the code of the reduction. REDUC_CHAIN
2728 is true if the SLP statements perform a single reduction, false if each
2729 statement performs an independent reduction. */
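/* For example, padding a PLUS_EXPR reduction group with zeros, or a
   MULT_EXPR group with ones, leaves the final reduction result
   unchanged.  */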
2731 static tree
2732 neutral_op_for_slp_reduction (slp_tree slp_node, tree_code code,
2733 bool reduc_chain)
2735 vec<gimple *> stmts = SLP_TREE_SCALAR_STMTS (slp_node);
2736 gimple *stmt = stmts[0];
2737 stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
2738 tree vector_type = STMT_VINFO_VECTYPE (stmt_vinfo);
2739 tree scalar_type = TREE_TYPE (vector_type);
2740 struct loop *loop = gimple_bb (stmt)->loop_father;
2741 gcc_assert (loop);
2743 switch (code)
2745 case WIDEN_SUM_EXPR:
2746 case DOT_PROD_EXPR:
2747 case SAD_EXPR:
2748 case PLUS_EXPR:
2749 case MINUS_EXPR:
2750 case BIT_IOR_EXPR:
2751 case BIT_XOR_EXPR:
2752 return build_zero_cst (scalar_type);
2754 case MULT_EXPR:
2755 return build_one_cst (scalar_type);
2757 case BIT_AND_EXPR:
2758 return build_all_ones_cst (scalar_type);
2760 case MAX_EXPR:
2761 case MIN_EXPR:
2762 /* For MIN/MAX the initial values are neutral. A reduction chain
2763 has only a single initial value, so that value is neutral for
2764 all statements. */
2765 if (reduc_chain)
2766 return PHI_ARG_DEF_FROM_EDGE (stmt, loop_preheader_edge (loop));
2767 return NULL_TREE;
2769 default:
2770 return NULL_TREE;
2774 /* Error reporting helper for vect_is_simple_reduction below. GIMPLE statement
2775 STMT is printed with a message MSG. */
2777 static void
2778 report_vect_op (dump_flags_t msg_type, gimple *stmt, const char *msg)
2780 dump_printf_loc (msg_type, vect_location, "%s", msg);
2781 dump_gimple_stmt (msg_type, TDF_SLIM, stmt, 0);
2785 /* Detect SLP reduction of the form:
2787 #a1 = phi <a5, a0>
2788 a2 = operation (a1)
2789 a3 = operation (a2)
2790 a4 = operation (a3)
2791 a5 = operation (a4)
2793 #a = phi <a5>
2795 PHI is the reduction phi node (#a1 = phi <a5, a0> above)
2796 FIRST_STMT is the first reduction stmt in the chain
2797 (a2 = operation (a1)).
2799 Return TRUE if a reduction chain was detected. */
2801 static bool
2802 vect_is_slp_reduction (loop_vec_info loop_info, gimple *phi,
2803 gimple *first_stmt)
2805 struct loop *loop = (gimple_bb (phi))->loop_father;
2806 struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
2807 enum tree_code code;
2808 gimple *current_stmt = NULL, *loop_use_stmt = NULL, *first, *next_stmt;
2809 stmt_vec_info use_stmt_info, current_stmt_info;
2810 tree lhs;
2811 imm_use_iterator imm_iter;
2812 use_operand_p use_p;
2813 int nloop_uses, size = 0, n_out_of_loop_uses;
2814 bool found = false;
2816 if (loop != vect_loop)
2817 return false;
2819 lhs = PHI_RESULT (phi);
2820 code = gimple_assign_rhs_code (first_stmt);
2821 while (1)
2823 nloop_uses = 0;
2824 n_out_of_loop_uses = 0;
2825 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
2827 gimple *use_stmt = USE_STMT (use_p);
2828 if (is_gimple_debug (use_stmt))
2829 continue;
2831 /* Check if we got back to the reduction phi. */
2832 if (use_stmt == phi)
2834 loop_use_stmt = use_stmt;
2835 found = true;
2836 break;
2839 if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2841 loop_use_stmt = use_stmt;
2842 nloop_uses++;
2844 else
2845 n_out_of_loop_uses++;
2847 /* There can be either a single use in the loop or two uses in
2848 phi nodes. */
2849 if (nloop_uses > 1 || (n_out_of_loop_uses && nloop_uses))
2850 return false;
2853 if (found)
2854 break;
2856 /* We reached a statement with no loop uses. */
2857 if (nloop_uses == 0)
2858 return false;
2860 /* This is a loop exit phi, and we haven't reached the reduction phi. */
2861 if (gimple_code (loop_use_stmt) == GIMPLE_PHI)
2862 return false;
2864 if (!is_gimple_assign (loop_use_stmt)
2865 || code != gimple_assign_rhs_code (loop_use_stmt)
2866 || !flow_bb_inside_loop_p (loop, gimple_bb (loop_use_stmt)))
2867 return false;
2869 /* Insert USE_STMT into reduction chain. */
2870 use_stmt_info = vinfo_for_stmt (loop_use_stmt);
2871 if (current_stmt)
2873 current_stmt_info = vinfo_for_stmt (current_stmt);
2874 GROUP_NEXT_ELEMENT (current_stmt_info) = loop_use_stmt;
2875 GROUP_FIRST_ELEMENT (use_stmt_info)
2876 = GROUP_FIRST_ELEMENT (current_stmt_info);
2878 else
2879 GROUP_FIRST_ELEMENT (use_stmt_info) = loop_use_stmt;
2881 lhs = gimple_assign_lhs (loop_use_stmt);
2882 current_stmt = loop_use_stmt;
2883 size++;
2886 if (!found || loop_use_stmt != phi || size < 2)
2887 return false;
2889 /* Swap the operands, if needed, to make the reduction operand be the second
2890 operand. */
2891 lhs = PHI_RESULT (phi);
2892 next_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (current_stmt));
2893 while (next_stmt)
2895 if (gimple_assign_rhs2 (next_stmt) == lhs)
2897 tree op = gimple_assign_rhs1 (next_stmt);
2898 gimple *def_stmt = NULL;
2900 if (TREE_CODE (op) == SSA_NAME)
2901 def_stmt = SSA_NAME_DEF_STMT (op);
2903 /* Check that the other def is either defined in the loop
2904 ("vect_internal_def"), or it's an induction (defined by a
2905 loop-header phi-node). */
2906 if (def_stmt
2907 && gimple_bb (def_stmt)
2908 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2909 && (is_gimple_assign (def_stmt)
2910 || is_gimple_call (def_stmt)
2911 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2912 == vect_induction_def
2913 || (gimple_code (def_stmt) == GIMPLE_PHI
2914 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2915 == vect_internal_def
2916 && !is_loop_header_bb_p (gimple_bb (def_stmt)))))
2918 lhs = gimple_assign_lhs (next_stmt);
2919 next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
2920 continue;
2923 return false;
2925 else
2927 tree op = gimple_assign_rhs2 (next_stmt);
2928 gimple *def_stmt = NULL;
2930 if (TREE_CODE (op) == SSA_NAME)
2931 def_stmt = SSA_NAME_DEF_STMT (op);
2933 /* Check that the other def is either defined in the loop
2934 ("vect_internal_def"), or it's an induction (defined by a
2935 loop-header phi-node). */
2936 if (def_stmt
2937 && gimple_bb (def_stmt)
2938 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2939 && (is_gimple_assign (def_stmt)
2940 || is_gimple_call (def_stmt)
2941 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2942 == vect_induction_def
2943 || (gimple_code (def_stmt) == GIMPLE_PHI
2944 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2945 == vect_internal_def
2946 && !is_loop_header_bb_p (gimple_bb (def_stmt)))))
2948 if (dump_enabled_p ())
2950 dump_printf_loc (MSG_NOTE, vect_location, "swapping oprnds: ");
2951 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, next_stmt, 0);
2954 swap_ssa_operands (next_stmt,
2955 gimple_assign_rhs1_ptr (next_stmt),
2956 gimple_assign_rhs2_ptr (next_stmt));
2957 update_stmt (next_stmt);
2959 if (CONSTANT_CLASS_P (gimple_assign_rhs1 (next_stmt)))
2960 LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true;
2962 else
2963 return false;
2966 lhs = gimple_assign_lhs (next_stmt);
2967 next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
2970 /* Save the chain for further analysis in SLP detection. */
2971 first = GROUP_FIRST_ELEMENT (vinfo_for_stmt (current_stmt));
2972 LOOP_VINFO_REDUCTION_CHAINS (loop_info).safe_push (first);
2973 GROUP_SIZE (vinfo_for_stmt (first)) = size;
2975 return true;
2978 /* Return true if we need an in-order reduction for operation CODE
2979 on type TYPE. NEED_WRAPPING_INTEGRAL_OVERFLOW is true if integer
2980 overflow must wrap. */
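/* For example, a float PLUS_EXPR reduction must be carried out in the
   original order unless -fassociative-math is in effect, because
   reassociating the additions can change how the result is rounded.  */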
2982 static bool
2983 needs_fold_left_reduction_p (tree type, tree_code code,
2984 bool need_wrapping_integral_overflow)
2986 /* CHECKME: check for !flag_finite_math_only too? */
2987 if (SCALAR_FLOAT_TYPE_P (type))
2988 switch (code)
2990 case MIN_EXPR:
2991 case MAX_EXPR:
2992 return false;
2994 default:
2995 return !flag_associative_math;
2998 if (INTEGRAL_TYPE_P (type))
3000 if (!operation_no_trapping_overflow (type, code))
3001 return true;
3002 if (need_wrapping_integral_overflow
3003 && !TYPE_OVERFLOW_WRAPS (type)
3004 && operation_can_overflow (code))
3005 return true;
3006 return false;
3009 if (SAT_FIXED_POINT_TYPE_P (type))
3010 return true;
3012 return false;
3015 /* Return true if the reduction PHI in LOOP with latch arg LOOP_ARG and
3016 reduction operation CODE has a handled computation expression. */
3018 bool
3019 check_reduction_path (location_t loc, loop_p loop, gphi *phi, tree loop_arg,
3020 enum tree_code code)
3022 auto_vec<std::pair<ssa_op_iter, use_operand_p> > path;
3023 auto_bitmap visited;
3024 tree lookfor = PHI_RESULT (phi);
3025 ssa_op_iter curri;
3026 use_operand_p curr = op_iter_init_phiuse (&curri, phi, SSA_OP_USE);
3027 while (USE_FROM_PTR (curr) != loop_arg)
3028 curr = op_iter_next_use (&curri);
3029 curri.i = curri.numops;
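/* Mark the PHI's operand iterator as exhausted so that, if the walk
   backtracks all the way to this initial frame, it terminates instead of
   also exploring the PHI's other arguments; only the path starting at
   LOOP_ARG is of interest.  */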
3032 path.safe_push (std::make_pair (curri, curr));
3033 tree use = USE_FROM_PTR (curr);
3034 if (use == lookfor)
3035 break;
3036 gimple *def = SSA_NAME_DEF_STMT (use);
3037 if (gimple_nop_p (def)
3038 || ! flow_bb_inside_loop_p (loop, gimple_bb (def)))
3040 pop:
3043 std::pair<ssa_op_iter, use_operand_p> x = path.pop ();
3044 curri = x.first;
3045 curr = x.second;
3047 curr = op_iter_next_use (&curri);
3048 /* Skip already visited or non-SSA operands (from iterating
3049 over PHI args). */
3050 while (curr != NULL_USE_OPERAND_P
3051 && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
3052 || ! bitmap_set_bit (visited,
3053 SSA_NAME_VERSION
3054 (USE_FROM_PTR (curr)))));
3056 while (curr == NULL_USE_OPERAND_P && ! path.is_empty ());
3057 if (curr == NULL_USE_OPERAND_P)
3058 break;
3060 else
3062 if (gimple_code (def) == GIMPLE_PHI)
3063 curr = op_iter_init_phiuse (&curri, as_a <gphi *>(def), SSA_OP_USE);
3064 else
3065 curr = op_iter_init_use (&curri, def, SSA_OP_USE);
3066 while (curr != NULL_USE_OPERAND_P
3067 && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
3068 || ! bitmap_set_bit (visited,
3069 SSA_NAME_VERSION
3070 (USE_FROM_PTR (curr)))))
3071 curr = op_iter_next_use (&curri);
3072 if (curr == NULL_USE_OPERAND_P)
3073 goto pop;
3076 while (1);
3077 if (dump_file && (dump_flags & TDF_DETAILS))
3079 dump_printf_loc (MSG_NOTE, loc, "reduction path: ");
3080 unsigned i;
3081 std::pair<ssa_op_iter, use_operand_p> *x;
3082 FOR_EACH_VEC_ELT (path, i, x)
3084 dump_generic_expr (MSG_NOTE, TDF_SLIM, USE_FROM_PTR (x->second));
3085 dump_printf (MSG_NOTE, " ");
3087 dump_printf (MSG_NOTE, "\n");
3090 /* Check whether the reduction path detected is valid. */
3091 bool fail = path.length () == 0;
3092 bool neg = false;
3093 for (unsigned i = 1; i < path.length (); ++i)
3095 gimple *use_stmt = USE_STMT (path[i].second);
3096 tree op = USE_FROM_PTR (path[i].second);
3097 if (! has_single_use (op)
3098 || ! is_gimple_assign (use_stmt))
3100 fail = true;
3101 break;
3103 if (gimple_assign_rhs_code (use_stmt) != code)
3105 if (code == PLUS_EXPR
3106 && gimple_assign_rhs_code (use_stmt) == MINUS_EXPR)
3108 /* Track whether we negate the reduction value each iteration. */
3109 if (gimple_assign_rhs2 (use_stmt) == op)
3110 neg = ! neg;
3112 else
3114 fail = true;
3115 break;
3119 return ! fail && ! neg;
3123 /* Function vect_is_simple_reduction
3125 (1) Detect a cross-iteration def-use cycle that represents a simple
3126 reduction computation. We look for the following pattern:
3128 loop_header:
3129 a1 = phi < a0, a2 >
3130 a3 = ...
3131 a2 = operation (a3, a1)
3135 a3 = ...
3136 loop_header:
3137 a1 = phi < a0, a2 >
3138 a2 = operation (a3, a1)
3140 such that:
3141 1. operation is commutative and associative and it is safe to
3142 change the order of the computation
3143 2. no uses for a2 in the loop (a2 is used out of the loop)
3144 3. no uses of a1 in the loop besides the reduction operation
3145 4. no uses of a1 outside the loop.
3147 Conditions 1,4 are tested here.
3148 Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.
3150 (2) Detect a cross-iteration def-use cycle in nested loops, i.e.,
3151 nested cycles.
3153 (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double
3154 reductions:
3156 a1 = phi < a0, a2 >
3157 inner loop (def of a3)
3158 a2 = phi < a3 >
3160 (4) Detect condition expressions, ie:
3161 for (int i = 0; i < N; i++)
3162 if (a[i] < val)
3163 ret_val = a[i];
3167 static gimple *
3168 vect_is_simple_reduction (loop_vec_info loop_info, gimple *phi,
3169 bool *double_reduc,
3170 bool need_wrapping_integral_overflow,
3171 enum vect_reduction_type *v_reduc_type)
3173 struct loop *loop = (gimple_bb (phi))->loop_father;
3174 struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
3175 gimple *def_stmt, *def1 = NULL, *def2 = NULL, *phi_use_stmt = NULL;
3176 enum tree_code orig_code, code;
3177 tree op1, op2, op3 = NULL_TREE, op4 = NULL_TREE;
3178 tree type;
3179 int nloop_uses;
3180 tree name;
3181 imm_use_iterator imm_iter;
3182 use_operand_p use_p;
3183 bool phi_def;
3185 *double_reduc = false;
3186 *v_reduc_type = TREE_CODE_REDUCTION;
3188 tree phi_name = PHI_RESULT (phi);
3189 /* ??? If there are no uses of the PHI result the inner loop reduction
3190 won't be detected as possibly double-reduction by vectorizable_reduction
3191 because that tries to walk the PHI arg from the preheader edge which
3192 can be constant. See PR60382. */
3193 if (has_zero_uses (phi_name))
3194 return NULL;
3195 nloop_uses = 0;
3196 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, phi_name)
3198 gimple *use_stmt = USE_STMT (use_p);
3199 if (is_gimple_debug (use_stmt))
3200 continue;
3202 if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
3204 if (dump_enabled_p ())
3205 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3206 "intermediate value used outside loop.\n");
3208 return NULL;
3211 nloop_uses++;
3212 if (nloop_uses > 1)
3214 if (dump_enabled_p ())
3215 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3216 "reduction value used in loop.\n");
3217 return NULL;
3220 phi_use_stmt = use_stmt;
3223 edge latch_e = loop_latch_edge (loop);
3224 tree loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
3225 if (TREE_CODE (loop_arg) != SSA_NAME)
3227 if (dump_enabled_p ())
3229 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3230 "reduction: not ssa_name: ");
3231 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, loop_arg);
3232 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3234 return NULL;
3237 def_stmt = SSA_NAME_DEF_STMT (loop_arg);
3238 if (is_gimple_assign (def_stmt))
3240 name = gimple_assign_lhs (def_stmt);
3241 phi_def = false;
3243 else if (gimple_code (def_stmt) == GIMPLE_PHI)
3245 name = PHI_RESULT (def_stmt);
3246 phi_def = true;
3248 else
3250 if (dump_enabled_p ())
3252 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3253 "reduction: unhandled reduction operation: ");
3254 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, def_stmt, 0);
3256 return NULL;
3259 if (! flow_bb_inside_loop_p (loop, gimple_bb (def_stmt)))
3260 return NULL;
3262 nloop_uses = 0;
3263 auto_vec<gphi *, 3> lcphis;
3264 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name)
3266 gimple *use_stmt = USE_STMT (use_p);
3267 if (is_gimple_debug (use_stmt))
3268 continue;
3269 if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
3270 nloop_uses++;
3271 else
3272 /* We can have more than one loop-closed PHI. */
3273 lcphis.safe_push (as_a <gphi *> (use_stmt));
3274 if (nloop_uses > 1)
3276 if (dump_enabled_p ())
3277 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3278 "reduction used in loop.\n");
3279 return NULL;
3283 /* If DEF_STMT is a phi node itself, we expect it to have a single argument
3284 defined in the inner loop. */
3285 if (phi_def)
3287 op1 = PHI_ARG_DEF (def_stmt, 0);
3289 if (gimple_phi_num_args (def_stmt) != 1
3290 || TREE_CODE (op1) != SSA_NAME)
3292 if (dump_enabled_p ())
3293 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3294 "unsupported phi node definition.\n");
3296 return NULL;
3299 def1 = SSA_NAME_DEF_STMT (op1);
3300 if (gimple_bb (def1)
3301 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
3302 && loop->inner
3303 && flow_bb_inside_loop_p (loop->inner, gimple_bb (def1))
3304 && is_gimple_assign (def1)
3305 && flow_bb_inside_loop_p (loop->inner, gimple_bb (phi_use_stmt)))
3307 if (dump_enabled_p ())
3308 report_vect_op (MSG_NOTE, def_stmt,
3309 "detected double reduction: ");
3311 *double_reduc = true;
3312 return def_stmt;
3315 return NULL;
3318 /* If we are vectorizing an inner reduction, we execute it in the
3319 original order only if we are not dealing with a double
3320 reduction. */
3321 bool check_reduction = true;
3322 if (flow_loop_nested_p (vect_loop, loop))
3324 gphi *lcphi;
3325 unsigned i;
3326 check_reduction = false;
3327 FOR_EACH_VEC_ELT (lcphis, i, lcphi)
3328 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, gimple_phi_result (lcphi))
3330 gimple *use_stmt = USE_STMT (use_p);
3331 if (is_gimple_debug (use_stmt))
3332 continue;
3333 if (! flow_bb_inside_loop_p (vect_loop, gimple_bb (use_stmt)))
3334 check_reduction = true;
3338 bool nested_in_vect_loop = flow_loop_nested_p (vect_loop, loop);
3339 code = orig_code = gimple_assign_rhs_code (def_stmt);
3341 /* We can handle "res -= x[i]", which is non-associative, by
3342 simply rewriting it into "res += -x[i]". Avoid changing the
3343 gimple instruction for the first simple tests and only do this
3344 if we're allowed to change code at all. */
3345 if (code == MINUS_EXPR && gimple_assign_rhs2 (def_stmt) != phi_name)
3346 code = PLUS_EXPR;
3348 if (code == COND_EXPR)
3350 if (! nested_in_vect_loop)
3351 *v_reduc_type = COND_REDUCTION;
3353 op3 = gimple_assign_rhs1 (def_stmt);
3354 if (COMPARISON_CLASS_P (op3))
3356 op4 = TREE_OPERAND (op3, 1);
3357 op3 = TREE_OPERAND (op3, 0);
3359 if (op3 == phi_name || op4 == phi_name)
3361 if (dump_enabled_p ())
3362 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3363 "reduction: condition depends on previous"
3364 " iteration: ");
3365 return NULL;
3368 op1 = gimple_assign_rhs2 (def_stmt);
3369 op2 = gimple_assign_rhs3 (def_stmt);
3371 else if (!commutative_tree_code (code) || !associative_tree_code (code))
3373 if (dump_enabled_p ())
3374 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3375 "reduction: not commutative/associative: ");
3376 return NULL;
3378 else if (get_gimple_rhs_class (code) == GIMPLE_BINARY_RHS)
3380 op1 = gimple_assign_rhs1 (def_stmt);
3381 op2 = gimple_assign_rhs2 (def_stmt);
3383 else
3385 if (dump_enabled_p ())
3386 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3387 "reduction: not handled operation: ");
3388 return NULL;
3391 if (TREE_CODE (op1) != SSA_NAME && TREE_CODE (op2) != SSA_NAME)
3393 if (dump_enabled_p ())
3394 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3395 "reduction: both uses not ssa_names: ");
3397 return NULL;
3400 type = TREE_TYPE (gimple_assign_lhs (def_stmt));
3401 if ((TREE_CODE (op1) == SSA_NAME
3402 && !types_compatible_p (type,TREE_TYPE (op1)))
3403 || (TREE_CODE (op2) == SSA_NAME
3404 && !types_compatible_p (type, TREE_TYPE (op2)))
3405 || (op3 && TREE_CODE (op3) == SSA_NAME
3406 && !types_compatible_p (type, TREE_TYPE (op3)))
3407 || (op4 && TREE_CODE (op4) == SSA_NAME
3408 && !types_compatible_p (type, TREE_TYPE (op4))))
3410 if (dump_enabled_p ())
3412 dump_printf_loc (MSG_NOTE, vect_location,
3413 "reduction: multiple types: operation type: ");
3414 dump_generic_expr (MSG_NOTE, TDF_SLIM, type);
3415 dump_printf (MSG_NOTE, ", operands types: ");
3416 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3417 TREE_TYPE (op1));
3418 dump_printf (MSG_NOTE, ",");
3419 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3420 TREE_TYPE (op2));
3421 if (op3)
3423 dump_printf (MSG_NOTE, ",");
3424 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3425 TREE_TYPE (op3));
3428 if (op4)
3430 dump_printf (MSG_NOTE, ",");
3431 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3432 TREE_TYPE (op4));
3434 dump_printf (MSG_NOTE, "\n");
3437 return NULL;
3440 /* Check whether it's ok to change the order of the computation.
3441 Generally, when vectorizing a reduction we change the order of the
3442 computation. This may change the behavior of the program in some
3443 cases, so we need to check that this is ok. One exception is when
3444 vectorizing an outer-loop: the inner-loop is executed sequentially,
3445 and therefore vectorizing reductions in the inner-loop during
3446 outer-loop vectorization is safe. */
3447 if (check_reduction
3448 && *v_reduc_type == TREE_CODE_REDUCTION
3449 && needs_fold_left_reduction_p (type, code,
3450 need_wrapping_integral_overflow))
3451 *v_reduc_type = FOLD_LEFT_REDUCTION;
3453 /* Reduction is safe. We're dealing with one of the following:
3454 1) integer arithmetic and no trapv
3455 2) floating point arithmetic, and special flags permit this optimization
3456 3) nested cycle (i.e., outer loop vectorization). */
3457 if (TREE_CODE (op1) == SSA_NAME)
3458 def1 = SSA_NAME_DEF_STMT (op1);
3460 if (TREE_CODE (op2) == SSA_NAME)
3461 def2 = SSA_NAME_DEF_STMT (op2);
3463 if (code != COND_EXPR
3464 && ((!def1 || gimple_nop_p (def1)) && (!def2 || gimple_nop_p (def2))))
3466 if (dump_enabled_p ())
3467 report_vect_op (MSG_NOTE, def_stmt, "reduction: no defs for operands: ");
3468 return NULL;
3471 /* Check that one def is the reduction def, defined by PHI,
3472 the other def is either defined in the loop ("vect_internal_def"),
3473 or it's an induction (defined by a loop-header phi-node). */
3475 if (def2 && def2 == phi
3476 && (code == COND_EXPR
3477 || !def1 || gimple_nop_p (def1)
3478 || !flow_bb_inside_loop_p (loop, gimple_bb (def1))
3479 || (def1 && flow_bb_inside_loop_p (loop, gimple_bb (def1))
3480 && (is_gimple_assign (def1)
3481 || is_gimple_call (def1)
3482 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1))
3483 == vect_induction_def
3484 || (gimple_code (def1) == GIMPLE_PHI
3485 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1))
3486 == vect_internal_def
3487 && !is_loop_header_bb_p (gimple_bb (def1)))))))
3489 if (dump_enabled_p ())
3490 report_vect_op (MSG_NOTE, def_stmt, "detected reduction: ");
3491 return def_stmt;
3494 if (def1 && def1 == phi
3495 && (code == COND_EXPR
3496 || !def2 || gimple_nop_p (def2)
3497 || !flow_bb_inside_loop_p (loop, gimple_bb (def2))
3498 || (def2 && flow_bb_inside_loop_p (loop, gimple_bb (def2))
3499 && (is_gimple_assign (def2)
3500 || is_gimple_call (def2)
3501 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2))
3502 == vect_induction_def
3503 || (gimple_code (def2) == GIMPLE_PHI
3504 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2))
3505 == vect_internal_def
3506 && !is_loop_header_bb_p (gimple_bb (def2)))))))
3508 if (! nested_in_vect_loop && orig_code != MINUS_EXPR)
3510 /* Check if we can swap operands (just for simplicity - so that
3511 the rest of the code can assume that the reduction variable
3512 is always the last (second) argument). */
3513 if (code == COND_EXPR)
3515 /* Swap cond_expr by inverting the condition. */
3516 tree cond_expr = gimple_assign_rhs1 (def_stmt);
3517 enum tree_code invert_code = ERROR_MARK;
3518 enum tree_code cond_code = TREE_CODE (cond_expr);
3520 if (TREE_CODE_CLASS (cond_code) == tcc_comparison)
3522 bool honor_nans = HONOR_NANS (TREE_OPERAND (cond_expr, 0));
3523 invert_code = invert_tree_comparison (cond_code, honor_nans);
3525 if (invert_code != ERROR_MARK)
3527 TREE_SET_CODE (cond_expr, invert_code);
3528 swap_ssa_operands (def_stmt,
3529 gimple_assign_rhs2_ptr (def_stmt),
3530 gimple_assign_rhs3_ptr (def_stmt));
3532 else
3534 if (dump_enabled_p ())
3535 report_vect_op (MSG_NOTE, def_stmt,
3536 "detected reduction: cannot swap operands "
3537 "for cond_expr");
3538 return NULL;
3541 else
3542 swap_ssa_operands (def_stmt, gimple_assign_rhs1_ptr (def_stmt),
3543 gimple_assign_rhs2_ptr (def_stmt));
3545 if (dump_enabled_p ())
3546 report_vect_op (MSG_NOTE, def_stmt,
3547 "detected reduction: need to swap operands: ");
3549 if (CONSTANT_CLASS_P (gimple_assign_rhs1 (def_stmt)))
3550 LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true;
3552 else
3554 if (dump_enabled_p ())
3555 report_vect_op (MSG_NOTE, def_stmt, "detected reduction: ");
3558 return def_stmt;
3561 /* Try to find SLP reduction chain. */
3562 if (! nested_in_vect_loop
3563 && code != COND_EXPR
3564 && orig_code != MINUS_EXPR
3565 && vect_is_slp_reduction (loop_info, phi, def_stmt))
3567 if (dump_enabled_p ())
3568 report_vect_op (MSG_NOTE, def_stmt,
3569 "reduction: detected reduction chain: ");
3571 return def_stmt;
3574 /* Dissolve any group possibly left half-built by vect_is_slp_reduction. */
3575 gimple *first = GROUP_FIRST_ELEMENT (vinfo_for_stmt (def_stmt));
3576 while (first)
3578 gimple *next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (first));
3579 GROUP_FIRST_ELEMENT (vinfo_for_stmt (first)) = NULL;
3580 GROUP_NEXT_ELEMENT (vinfo_for_stmt (first)) = NULL;
3581 first = next;
3584 /* Look for the expression computing loop_arg from loop PHI result. */
3585 if (check_reduction_path (vect_location, loop, as_a <gphi *> (phi), loop_arg,
3586 code))
3587 return def_stmt;
3589 if (dump_enabled_p ())
3591 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3592 "reduction: unknown pattern: ");
3595 return NULL;
3598 /* Wrapper around vect_is_simple_reduction, which will modify code
3599 in-place if it enables detection of more reductions. Arguments
3600 as there. */
3602 gimple *
3603 vect_force_simple_reduction (loop_vec_info loop_info, gimple *phi,
3604 bool *double_reduc,
3605 bool need_wrapping_integral_overflow)
3607 enum vect_reduction_type v_reduc_type;
3608 gimple *def = vect_is_simple_reduction (loop_info, phi, double_reduc,
3609 need_wrapping_integral_overflow,
3610 &v_reduc_type);
3611 if (def)
3613 stmt_vec_info reduc_def_info = vinfo_for_stmt (phi);
3614 STMT_VINFO_REDUC_TYPE (reduc_def_info) = v_reduc_type;
3615 STMT_VINFO_REDUC_DEF (reduc_def_info) = def;
3616 reduc_def_info = vinfo_for_stmt (def);
3617 STMT_VINFO_REDUC_TYPE (reduc_def_info) = v_reduc_type;
3618 STMT_VINFO_REDUC_DEF (reduc_def_info) = phi;
3620 return def;
3623 /* Calculate cost of peeling the loop PEEL_ITERS_PROLOGUE times. */
3625 vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue,
3626 int *peel_iters_epilogue,
3627 stmt_vector_for_cost *scalar_cost_vec,
3628 stmt_vector_for_cost *prologue_cost_vec,
3629 stmt_vector_for_cost *epilogue_cost_vec)
3631 int retval = 0;
3632 int assumed_vf = vect_vf_for_cost (loop_vinfo);
3634 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
3636 *peel_iters_epilogue = assumed_vf / 2;
3637 if (dump_enabled_p ())
3638 dump_printf_loc (MSG_NOTE, vect_location,
3639 "cost model: epilogue peel iters set to vf/2 "
3640 "because loop iterations are unknown .\n");
3642 /* If peeled iterations are known but the number of scalar loop
3643 iterations is unknown, count a taken branch per peeled loop. */
3644 retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
3645 NULL, 0, vect_prologue);
3646 retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
3647 NULL, 0, vect_epilogue);
3649 else
3651 int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
3652 peel_iters_prologue = niters < peel_iters_prologue ?
3653 niters : peel_iters_prologue;
3654 *peel_iters_epilogue = (niters - peel_iters_prologue) % assumed_vf;
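/* For example (illustrative numbers only), with niters == 102,
   peel_iters_prologue == 3 and an assumed VF of 8, the epilogue peels
   (102 - 3) % 8 == 3 iterations.  */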
3655 /* If we need to peel for gaps but no epilogue peeling would otherwise
3656 be required, we have to peel VF iterations. */
3657 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) && !*peel_iters_epilogue)
3658 *peel_iters_epilogue = assumed_vf;
3661 stmt_info_for_cost *si;
3662 int j;
3663 if (peel_iters_prologue)
3664 FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
3666 stmt_vec_info stmt_info
3667 = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
3668 retval += record_stmt_cost (prologue_cost_vec,
3669 si->count * peel_iters_prologue,
3670 si->kind, stmt_info, si->misalign,
3671 vect_prologue);
3673 if (*peel_iters_epilogue)
3674 FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
3676 stmt_vec_info stmt_info
3677 = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
3678 retval += record_stmt_cost (epilogue_cost_vec,
3679 si->count * *peel_iters_epilogue,
3680 si->kind, stmt_info, si->misalign,
3681 vect_epilogue);
3684 return retval;
3687 /* Function vect_estimate_min_profitable_iters
3689 Return the number of iterations required for the vector version of the
3690 loop to be profitable relative to the cost of the scalar version of the
3691 loop.
3693 *RET_MIN_PROFITABLE_NITERS is a cost model profitability threshold
3694 of iterations for vectorization. A value of -1 means loop
3695 vectorization is not profitable. This returned value may be used
3696 for a dynamic profitability check.
3698 *RET_MIN_PROFITABLE_ESTIMATE is a profitability threshold to be used
3699 for static check against estimated number of iterations. */
3701 static void
3702 vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
3703 int *ret_min_profitable_niters,
3704 int *ret_min_profitable_estimate)
3706 int min_profitable_iters;
3707 int min_profitable_estimate;
3708 int peel_iters_prologue;
3709 int peel_iters_epilogue;
3710 unsigned vec_inside_cost = 0;
3711 int vec_outside_cost = 0;
3712 unsigned vec_prologue_cost = 0;
3713 unsigned vec_epilogue_cost = 0;
3714 int scalar_single_iter_cost = 0;
3715 int scalar_outside_cost = 0;
3716 int assumed_vf = vect_vf_for_cost (loop_vinfo);
3717 int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
3718 void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3720 /* Cost model disabled. */
3721 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
3723 dump_printf_loc (MSG_NOTE, vect_location, "cost model disabled.\n");
3724 *ret_min_profitable_niters = 0;
3725 *ret_min_profitable_estimate = 0;
3726 return;
3729 /* Requires loop versioning tests to handle misalignment. */
3730 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
3732 /* FIXME: Make cost depend on complexity of individual check. */
3733 unsigned len = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ();
3734 (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
3735 vect_prologue);
3736 dump_printf (MSG_NOTE,
3737 "cost model: Adding cost of checks for loop "
3738 "versioning to treat misalignment.\n");
3741 /* Requires loop versioning with alias checks. */
3742 if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
3744 /* FIXME: Make cost depend on complexity of individual check. */
3745 unsigned len = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).length ();
3746 (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
3747 vect_prologue);
3748 len = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).length ();
3749 if (len)
3750 /* Count LEN - 1 ANDs and LEN comparisons. */
3751 (void) add_stmt_cost (target_cost_data, len * 2 - 1, scalar_stmt,
3752 NULL, 0, vect_prologue);
3753 len = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).length ();
3754 if (len)
3756 /* Count LEN - 1 ANDs and LEN comparisons. */
3757 unsigned int nstmts = len * 2 - 1;
3758 /* +1 for each bias that needs adding. */
3759 for (unsigned int i = 0; i < len; ++i)
3760 if (!LOOP_VINFO_LOWER_BOUNDS (loop_vinfo)[i].unsigned_p)
3761 nstmts += 1;
3762 (void) add_stmt_cost (target_cost_data, nstmts, scalar_stmt,
3763 NULL, 0, vect_prologue);
3765 dump_printf (MSG_NOTE,
3766 "cost model: Adding cost of checks for loop "
3767 "versioning aliasing.\n");
3770 /* Requires loop versioning with niter checks. */
3771 if (LOOP_REQUIRES_VERSIONING_FOR_NITERS (loop_vinfo))
3773 /* FIXME: Make cost depend on complexity of individual check. */
3774 (void) add_stmt_cost (target_cost_data, 1, vector_stmt, NULL, 0,
3775 vect_prologue);
3776 dump_printf (MSG_NOTE,
3777 "cost model: Adding cost of checks for loop "
3778 "versioning niters.\n");
3781 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
3782 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken, NULL, 0,
3783 vect_prologue);
3785 /* Count statements in scalar loop. Using this as scalar cost for a single
3786 iteration for now.
3788 TODO: Add outer loop support.
3790 TODO: Consider assigning different costs to different scalar
3791 statements. */
3793 scalar_single_iter_cost
3794 = LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo);
3796 /* Add additional cost for the peeled instructions in prologue and epilogue
3797 loop. (For fully-masked loops there will be no peeling.)
3799 FORNOW: If we don't know the value of peel_iters for prologue or epilogue
3800 at compile-time - we assume it's vf/2 (the worst would be vf-1).
3802 TODO: Build an expression that represents peel_iters for prologue and
3803 epilogue to be used in a run-time test. */
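/* Illustrative example with made-up numbers: for an assumed VF of 8 and
   unknown misalignment (npeel < 0), both peel_iters_prologue and
   peel_iters_epilogue are costed below as 8 / 2 = 4 iterations, plus one
   taken and one not-taken branch for each of the two peel-count guards.  */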
3805 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
3807 peel_iters_prologue = 0;
3808 peel_iters_epilogue = 0;
3810 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
3812 /* We need to peel exactly one iteration. */
3813 peel_iters_epilogue += 1;
3814 stmt_info_for_cost *si;
3815 int j;
3816 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
3817 j, si)
3819 struct _stmt_vec_info *stmt_info
3820 = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
3821 (void) add_stmt_cost (target_cost_data, si->count,
3822 si->kind, stmt_info, si->misalign,
3823 vect_epilogue);
3827 else if (npeel < 0)
3829 peel_iters_prologue = assumed_vf / 2;
3830 dump_printf (MSG_NOTE, "cost model: "
3831 "prologue peel iters set to vf/2.\n");
3833 /* If peeling for alignment is unknown, the loop bound of the main loop
3834 becomes unknown. */
3835 peel_iters_epilogue = assumed_vf / 2;
3836 dump_printf (MSG_NOTE, "cost model: "
3837 "epilogue peel iters set to vf/2 because "
3838 "peeling for alignment is unknown.\n");
3840 /* If peeled iterations are unknown, count a taken branch and a not taken
3841 branch per peeled loop. Even if scalar loop iterations are known,
3842 vector iterations are not known since peeled prologue iterations are
3843 not known. Hence guards remain the same. */
3844 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
3845 NULL, 0, vect_prologue);
3846 (void) add_stmt_cost (target_cost_data, 1, cond_branch_not_taken,
3847 NULL, 0, vect_prologue);
3848 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
3849 NULL, 0, vect_epilogue);
3850 (void) add_stmt_cost (target_cost_data, 1, cond_branch_not_taken,
3851 NULL, 0, vect_epilogue);
3852 stmt_info_for_cost *si;
3853 int j;
3854 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
3856 struct _stmt_vec_info *stmt_info
3857 = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
3858 (void) add_stmt_cost (target_cost_data,
3859 si->count * peel_iters_prologue,
3860 si->kind, stmt_info, si->misalign,
3861 vect_prologue);
3862 (void) add_stmt_cost (target_cost_data,
3863 si->count * peel_iters_epilogue,
3864 si->kind, stmt_info, si->misalign,
3865 vect_epilogue);
3868 else
3870 stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
3871 stmt_info_for_cost *si;
3872 int j;
3873 void *data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3875 prologue_cost_vec.create (2);
3876 epilogue_cost_vec.create (2);
3877 peel_iters_prologue = npeel;
3879 (void) vect_get_known_peeling_cost (loop_vinfo, peel_iters_prologue,
3880 &peel_iters_epilogue,
3881 &LOOP_VINFO_SCALAR_ITERATION_COST
3882 (loop_vinfo),
3883 &prologue_cost_vec,
3884 &epilogue_cost_vec);
3886 FOR_EACH_VEC_ELT (prologue_cost_vec, j, si)
3888 struct _stmt_vec_info *stmt_info
3889 = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
3890 (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
3891 si->misalign, vect_prologue);
3894 FOR_EACH_VEC_ELT (epilogue_cost_vec, j, si)
3896 struct _stmt_vec_info *stmt_info
3897 = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
3898 (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
3899 si->misalign, vect_epilogue);
3902 prologue_cost_vec.release ();
3903 epilogue_cost_vec.release ();
3906 /* FORNOW: The scalar outside cost is incremented in one of the
3907 following ways:
3909 1. The vectorizer checks for alignment and aliasing and generates
3910 a condition that allows dynamic vectorization. A cost model
3911 check is ANDED with the versioning condition. Hence scalar code
3912 path now has the added cost of the versioning check.
3914 if (cost > th & versioning_check)
3915 jmp to vector code
3917 Hence run-time scalar is incremented by not-taken branch cost.
3919 2. The vectorizer then checks if a prologue is required. If the
3920 cost model check was not done before during versioning, it has to
3921 be done before the prologue check.
3923 if (cost <= th)
3924 prologue = scalar_iters
3925 if (prologue == 0)
3926 jmp to vector code
3927 else
3928 execute prologue
3929 if (prologue == num_iters)
3930 go to exit
3932 Hence the run-time scalar cost is incremented by a taken branch,
3933 plus a not-taken branch, plus a taken branch cost.
3935 3. The vectorizer then checks if an epilogue is required. If the
3936 cost model check was not done before during prologue check, it
3937 has to be done with the epilogue check.
3939 if (prologue == 0)
3940 jmp to vector code
3941 else
3942 execute prologue
3943 if (prologue == num_iters)
3944 go to exit
3945 vector code:
3946 if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
3947 jmp to epilogue
3949 Hence the run-time scalar cost should be incremented by 2 taken
3950 branches.
3952 TODO: The back end may reorder the BBs differently and reverse
3953 conditions/branch directions. Change the estimates below to
3954 something more reasonable. */
3956 /* If the number of iterations is known and we do not do versioning, we can
3957 decide whether to vectorize at compile time. Hence the scalar version
3958 does not carry cost model guard costs. */
3959 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
3960 || LOOP_REQUIRES_VERSIONING (loop_vinfo))
3962 /* Cost model check occurs at versioning. */
3963 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
3964 scalar_outside_cost += vect_get_stmt_cost (cond_branch_not_taken);
3965 else
3967 /* Cost model check occurs at prologue generation. */
3968 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
3969 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken)
3970 + vect_get_stmt_cost (cond_branch_not_taken);
3971 /* Cost model check occurs at epilogue generation. */
3972 else
3973 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken);
3977 /* Complete the target-specific cost calculations. */
3978 finish_cost (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo), &vec_prologue_cost,
3979 &vec_inside_cost, &vec_epilogue_cost);
3981 vec_outside_cost = (int)(vec_prologue_cost + vec_epilogue_cost);
3983 if (dump_enabled_p ())
3985 dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
3986 dump_printf (MSG_NOTE, " Vector inside of loop cost: %d\n",
3987 vec_inside_cost);
3988 dump_printf (MSG_NOTE, " Vector prologue cost: %d\n",
3989 vec_prologue_cost);
3990 dump_printf (MSG_NOTE, " Vector epilogue cost: %d\n",
3991 vec_epilogue_cost);
3992 dump_printf (MSG_NOTE, " Scalar iteration cost: %d\n",
3993 scalar_single_iter_cost);
3994 dump_printf (MSG_NOTE, " Scalar outside cost: %d\n",
3995 scalar_outside_cost);
3996 dump_printf (MSG_NOTE, " Vector outside cost: %d\n",
3997 vec_outside_cost);
3998 dump_printf (MSG_NOTE, " prologue iterations: %d\n",
3999 peel_iters_prologue);
4000 dump_printf (MSG_NOTE, " epilogue iterations: %d\n",
4001 peel_iters_epilogue);
4004 /* Calculate number of iterations required to make the vector version
4005 profitable, relative to the loop bodies only. The following condition
4006 must hold true:
4007 SIC * niters + SOC > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC
4008 where
4009 SIC = scalar iteration cost, VIC = vector iteration cost,
4010 VOC = vector outside cost, VF = vectorization factor,
4011 PL_ITERS = prologue iterations, EP_ITERS = epilogue iterations,
4012 SOC = scalar outside cost for run time cost model check. */
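/* Worked example with made-up costs (illustrative only): take SIC = 4,
   VIC = 6, VOC = 20, SOC = 2, VF = 4 and no peeling.  The code below
   computes
     ((VOC - SOC) * VF) / (SIC * VF - VIC) = (18 * 4) / (16 - 6) = 7
   using integer division, then bumps the result to 8 because at 7
   iterations 4 * 4 * 7 = 112 <= 6 * 7 + 18 * 4 = 114 still holds; the
   loop therefore needs at least 8 scalar iterations before the vector
   version wins.  */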
4014 if ((scalar_single_iter_cost * assumed_vf) > (int) vec_inside_cost)
4016 min_profitable_iters = ((vec_outside_cost - scalar_outside_cost)
4017 * assumed_vf
4018 - vec_inside_cost * peel_iters_prologue
4019 - vec_inside_cost * peel_iters_epilogue);
4020 if (min_profitable_iters <= 0)
4021 min_profitable_iters = 0;
4022 else
4024 min_profitable_iters /= ((scalar_single_iter_cost * assumed_vf)
4025 - vec_inside_cost);
4027 if ((scalar_single_iter_cost * assumed_vf * min_profitable_iters)
4028 <= (((int) vec_inside_cost * min_profitable_iters)
4029 + (((int) vec_outside_cost - scalar_outside_cost)
4030 * assumed_vf)))
4031 min_profitable_iters++;
4034 /* vector version will never be profitable. */
4035 else
4037 if (LOOP_VINFO_LOOP (loop_vinfo)->force_vectorize)
4038 warning_at (vect_location, OPT_Wopenmp_simd, "vectorization "
4039 "did not happen for a simd loop");
4041 if (dump_enabled_p ())
4042 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4043 "cost model: the vector iteration cost = %d "
4044 "divided by the scalar iteration cost = %d "
4045 "is greater or equal to the vectorization factor = %d"
4046 ".\n",
4047 vec_inside_cost, scalar_single_iter_cost, assumed_vf);
4048 *ret_min_profitable_niters = -1;
4049 *ret_min_profitable_estimate = -1;
4050 return;
4053 dump_printf (MSG_NOTE,
4054 " Calculated minimum iters for profitability: %d\n",
4055 min_profitable_iters);
4057 if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
4058 && min_profitable_iters < (assumed_vf + peel_iters_prologue))
4059 /* We want the vectorized loop to execute at least once. */
4060 min_profitable_iters = assumed_vf + peel_iters_prologue;
4062 if (dump_enabled_p ())
4063 dump_printf_loc (MSG_NOTE, vect_location,
4064 " Runtime profitability threshold = %d\n",
4065 min_profitable_iters);
4067 *ret_min_profitable_niters = min_profitable_iters;
4069 /* Calculate number of iterations required to make the vector version
4070 profitable, relative to the loop bodies only.
4072 Non-vectorized variant is SIC * niters and it must win over vector
4073 variant on the expected loop trip count. The following condition must hold true:
4074 SIC * niters > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC + SOC */
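/* Continuing the made-up numbers from the example above: the static
   estimate adds SOC instead of subtracting it, giving
   ((20 + 2) * 4) / (4 * 4 - 6) = 88 / 10 = 8 with integer division,
   which the MAX below then raises to at least the runtime threshold.  */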
4076 if (vec_outside_cost <= 0)
4077 min_profitable_estimate = 0;
4078 else
4080 min_profitable_estimate = ((vec_outside_cost + scalar_outside_cost)
4081 * assumed_vf
4082 - vec_inside_cost * peel_iters_prologue
4083 - vec_inside_cost * peel_iters_epilogue)
4084 / ((scalar_single_iter_cost * assumed_vf)
4085 - vec_inside_cost);
4087 min_profitable_estimate = MAX (min_profitable_estimate, min_profitable_iters);
4088 if (dump_enabled_p ())
4089 dump_printf_loc (MSG_NOTE, vect_location,
4090 " Static estimate profitability threshold = %d\n",
4091 min_profitable_estimate);
4093 *ret_min_profitable_estimate = min_profitable_estimate;
4096 /* Writes into SEL a mask for a vec_perm, equivalent to a vec_shr by OFFSET
4097 vector elements (not bits) for a vector with NELT elements. */
4098 static void
4099 calc_vec_perm_mask_for_shift (unsigned int offset, unsigned int nelt,
4100 vec_perm_builder *sel)
4102 /* The encoding is a single stepped pattern. Any wrap-around is handled
4103 by vec_perm_indices. */
4104 sel->new_vector (nelt, 1, 3);
4105 for (unsigned int i = 0; i < 3; i++)
4106 sel->quick_push (i + offset);
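/* Illustrative example: for OFFSET = 2 and NELT = 8 the three pushes above
   encode the series {2, 3, 4}, which vec_perm_indices extends with step 1
   to the full mask {2, 3, 4, 5, 6, 7, 8, 9}; indices 8 and 9 select from
   the second vector input, so the permutation acts as a whole-vector shift
   by two elements.  */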
4109 /* Checks whether the target supports whole-vector shifts for vectors of mode
4110 MODE. This is the case if _either_ the platform handles vec_shr_optab, _or_
4111 it supports vec_perm_const with masks for all necessary shift amounts. */
4112 static bool
4113 have_whole_vector_shift (machine_mode mode)
4115 if (optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
4116 return true;
4118 /* Variable-length vectors should be handled via the optab. */
4119 unsigned int nelt;
4120 if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
4121 return false;
4123 vec_perm_builder sel;
4124 vec_perm_indices indices;
4125 for (unsigned int i = nelt / 2; i >= 1; i /= 2)
4127 calc_vec_perm_mask_for_shift (i, nelt, &sel);
4128 indices.new_vector (sel, 2, nelt);
4129 if (!can_vec_perm_const_p (mode, indices, false))
4130 return false;
4132 return true;
4135 /* TODO: Close dependency between vect_model_*_cost and vectorizable_*
4136 functions. Design better to avoid maintenance issues. */
4138 /* Function vect_model_reduction_cost.
4140 Models cost for a reduction operation, including the vector ops
4141 generated within the strip-mine loop, the initial definition before
4142 the loop, and the epilogue code that must be generated. */
4144 static void
4145 vect_model_reduction_cost (stmt_vec_info stmt_info, internal_fn reduc_fn,
4146 int ncopies)
4148 int prologue_cost = 0, epilogue_cost = 0, inside_cost;
4149 enum tree_code code;
4150 optab optab;
4151 tree vectype;
4152 gimple *orig_stmt;
4153 machine_mode mode;
4154 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4155 struct loop *loop = NULL;
4156 void *target_cost_data;
4158 if (loop_vinfo)
4160 loop = LOOP_VINFO_LOOP (loop_vinfo);
4161 target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
4163 else
4164 target_cost_data = BB_VINFO_TARGET_COST_DATA (STMT_VINFO_BB_VINFO (stmt_info));
4166 /* Condition reductions generate two reductions in the loop. */
4167 vect_reduction_type reduction_type
4168 = STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info);
4169 if (reduction_type == COND_REDUCTION)
4170 ncopies *= 2;
4172 vectype = STMT_VINFO_VECTYPE (stmt_info);
4173 mode = TYPE_MODE (vectype);
4174 orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
4176 if (!orig_stmt)
4177 orig_stmt = STMT_VINFO_STMT (stmt_info);
4179 code = gimple_assign_rhs_code (orig_stmt);
4181 if (reduction_type == EXTRACT_LAST_REDUCTION
4182 || reduction_type == FOLD_LEFT_REDUCTION)
4184 /* No extra instructions needed in the prologue. */
4185 prologue_cost = 0;
4187 if (reduction_type == EXTRACT_LAST_REDUCTION || reduc_fn != IFN_LAST)
4188 /* Count one reduction-like operation per vector. */
4189 inside_cost = add_stmt_cost (target_cost_data, ncopies, vec_to_scalar,
4190 stmt_info, 0, vect_body);
4191 else
4193 /* Use NELEMENTS extracts and NELEMENTS scalar ops. */
4194 unsigned int nelements = ncopies * vect_nunits_for_cost (vectype);
4195 inside_cost = add_stmt_cost (target_cost_data, nelements,
4196 vec_to_scalar, stmt_info, 0,
4197 vect_body);
4198 inside_cost += add_stmt_cost (target_cost_data, nelements,
4199 scalar_stmt, stmt_info, 0,
4200 vect_body);
4203 else
4205 /* Add in cost for initial definition.
4206 For cond reduction we have four vectors: initial index, step,
4207 initial result of the data reduction, initial value of the index
4208 reduction. */
4209 int prologue_stmts = reduction_type == COND_REDUCTION ? 4 : 1;
4210 prologue_cost += add_stmt_cost (target_cost_data, prologue_stmts,
4211 scalar_to_vec, stmt_info, 0,
4212 vect_prologue);
4214 /* Cost of reduction op inside loop. */
4215 inside_cost = add_stmt_cost (target_cost_data, ncopies, vector_stmt,
4216 stmt_info, 0, vect_body);
4219 /* Determine cost of epilogue code.
4221 We have a reduction operator that will reduce the vector in one statement.
4222 Also requires scalar extract. */
4224 if (!loop || !nested_in_vect_loop_p (loop, orig_stmt))
4226 if (reduc_fn != IFN_LAST)
4228 if (reduction_type == COND_REDUCTION)
4230 /* An EQ stmt and a COND_EXPR stmt. */
4231 epilogue_cost += add_stmt_cost (target_cost_data, 2,
4232 vector_stmt, stmt_info, 0,
4233 vect_epilogue);
4234 /* Reduction of the max index and a reduction of the found
4235 values. */
4236 epilogue_cost += add_stmt_cost (target_cost_data, 2,
4237 vec_to_scalar, stmt_info, 0,
4238 vect_epilogue);
4239 /* A broadcast of the max value. */
4240 epilogue_cost += add_stmt_cost (target_cost_data, 1,
4241 scalar_to_vec, stmt_info, 0,
4242 vect_epilogue);
4244 else
4246 epilogue_cost += add_stmt_cost (target_cost_data, 1, vector_stmt,
4247 stmt_info, 0, vect_epilogue);
4248 epilogue_cost += add_stmt_cost (target_cost_data, 1,
4249 vec_to_scalar, stmt_info, 0,
4250 vect_epilogue);
4253 else if (reduction_type == COND_REDUCTION)
4255 unsigned estimated_nunits = vect_nunits_for_cost (vectype);
4256 /* Extraction of scalar elements. */
4257 epilogue_cost += add_stmt_cost (target_cost_data,
4258 2 * estimated_nunits,
4259 vec_to_scalar, stmt_info, 0,
4260 vect_epilogue);
4261 /* Scalar max reductions via COND_EXPR / MAX_EXPR. */
4262 epilogue_cost += add_stmt_cost (target_cost_data,
4263 2 * estimated_nunits - 3,
4264 scalar_stmt, stmt_info, 0,
4265 vect_epilogue);
4267 else if (reduction_type == EXTRACT_LAST_REDUCTION
4268 || reduction_type == FOLD_LEFT_REDUCTION)
4269 /* No extra instructions needed in the epilogue. */
4271 else
4273 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
4274 tree bitsize =
4275 TYPE_SIZE (TREE_TYPE (gimple_assign_lhs (orig_stmt)));
4276 int element_bitsize = tree_to_uhwi (bitsize);
4277 int nelements = vec_size_in_bits / element_bitsize;
4279 if (code == COND_EXPR)
4280 code = MAX_EXPR;
4282 optab = optab_for_tree_code (code, vectype, optab_default);
4284 /* We have a whole vector shift available. */
4285 if (optab != unknown_optab
4286 && VECTOR_MODE_P (mode)
4287 && optab_handler (optab, mode) != CODE_FOR_nothing
4288 && have_whole_vector_shift (mode))
4290 /* Final reduction via vector shifts and the reduction operator.
4291 Also requires scalar extract. */
4292 epilogue_cost += add_stmt_cost (target_cost_data,
4293 exact_log2 (nelements) * 2,
4294 vector_stmt, stmt_info, 0,
4295 vect_epilogue);
4296 epilogue_cost += add_stmt_cost (target_cost_data, 1,
4297 vec_to_scalar, stmt_info, 0,
4298 vect_epilogue);
4300 else
4301 /* Use extracts and reduction op for final reduction. For N
4302 elements, we have N extracts and N-1 reduction ops. */
4303 epilogue_cost += add_stmt_cost (target_cost_data,
4304 nelements + nelements - 1,
4305 vector_stmt, stmt_info, 0,
4306 vect_epilogue);
4310 if (dump_enabled_p ())
4311 dump_printf (MSG_NOTE,
4312 "vect_model_reduction_cost: inside_cost = %d, "
4313 "prologue_cost = %d, epilogue_cost = %d .\n", inside_cost,
4314 prologue_cost, epilogue_cost);
4318 /* Function vect_model_induction_cost.
4320 Models cost for induction operations. */
4322 static void
4323 vect_model_induction_cost (stmt_vec_info stmt_info, int ncopies)
4325 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4326 void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
4327 unsigned inside_cost, prologue_cost;
4329 if (PURE_SLP_STMT (stmt_info))
4330 return;
4332 /* loop cost for vec_loop. */
4333 inside_cost = add_stmt_cost (target_cost_data, ncopies, vector_stmt,
4334 stmt_info, 0, vect_body);
4336 /* prologue cost for vec_init and vec_step. */
4337 prologue_cost = add_stmt_cost (target_cost_data, 2, scalar_to_vec,
4338 stmt_info, 0, vect_prologue);
4340 if (dump_enabled_p ())
4341 dump_printf_loc (MSG_NOTE, vect_location,
4342 "vect_model_induction_cost: inside_cost = %d, "
4343 "prologue_cost = %d .\n", inside_cost, prologue_cost);
4348 /* Function get_initial_def_for_reduction
4350 Input:
4351 STMT - a stmt that performs a reduction operation in the loop.
4352 INIT_VAL - the initial value of the reduction variable
4354 Output:
4355 ADJUSTMENT_DEF - a tree that holds a value to be added to the final result
4356 of the reduction (used for adjusting the epilog - see below).
4357 Return a vector variable, initialized according to the operation that STMT
4358 performs. This vector will be used as the initial value of the
4359 vector of partial results.
4361 Option1 (adjust in epilog): Initialize the vector as follows:
4362 add/bit or/xor: [0,0,...,0,0]
4363 mult/bit and: [1,1,...,1,1]
4364 min/max/cond_expr: [init_val,init_val,..,init_val,init_val]
4365 and when necessary (e.g. add/mult case) let the caller know
4366 that it needs to adjust the result by init_val.
4368 Option2: Initialize the vector as follows:
4369 add/bit or/xor: [init_val,0,0,...,0]
4370 mult/bit and: [init_val,1,1,...,1]
4371 min/max/cond_expr: [init_val,init_val,...,init_val]
4372 and no adjustments are needed.
4374 For example, for the following code:
4376 s = init_val;
4377 for (i=0;i<n;i++)
4378 s = s + a[i];
4380 STMT is 's = s + a[i]', and the reduction variable is 's'.
4381 For a vector of 4 units, we want to return either [0,0,0,init_val],
4382 or [0,0,0,0] and let the caller know that it needs to adjust
4383 the result at the end by 'init_val'.
4385 FORNOW, we use the 'adjust in epilog' scheme (Option1) when ADJUSTMENT_DEF
4386 is not NULL, because this way the initialization vector is simpler (same
4387 element in all entries); otherwise we use Option2.
4389 A cost model should help decide between these two schemes. */
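/* Concrete example (illustrative): for a product reduction s *= a[i] with
   init_val = 5 and four lanes, Option1 returns {1, 1, 1, 1} and sets
   *ADJUSTMENT_DEF to 5 so that the caller multiplies the final result by 5,
   whereas Option2 would return {5, 1, 1, 1} with no adjustment needed.  */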
4391 tree
4392 get_initial_def_for_reduction (gimple *stmt, tree init_val,
4393 tree *adjustment_def)
4395 stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
4396 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
4397 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4398 tree scalar_type = TREE_TYPE (init_val);
4399 tree vectype = get_vectype_for_scalar_type (scalar_type);
4400 enum tree_code code = gimple_assign_rhs_code (stmt);
4401 tree def_for_init;
4402 tree init_def;
4403 bool nested_in_vect_loop = false;
4404 REAL_VALUE_TYPE real_init_val = dconst0;
4405 int int_init_val = 0;
4406 gimple *def_stmt = NULL;
4407 gimple_seq stmts = NULL;
4409 gcc_assert (vectype);
4411 gcc_assert (POINTER_TYPE_P (scalar_type) || INTEGRAL_TYPE_P (scalar_type)
4412 || SCALAR_FLOAT_TYPE_P (scalar_type));
4414 if (nested_in_vect_loop_p (loop, stmt))
4415 nested_in_vect_loop = true;
4416 else
4417 gcc_assert (loop == (gimple_bb (stmt))->loop_father);
4419 /* In case of double reduction we only create a vector variable to be put
4420 in the reduction phi node. The actual statement creation is done in
4421 vect_create_epilog_for_reduction. */
4422 if (adjustment_def && nested_in_vect_loop
4423 && TREE_CODE (init_val) == SSA_NAME
4424 && (def_stmt = SSA_NAME_DEF_STMT (init_val))
4425 && gimple_code (def_stmt) == GIMPLE_PHI
4426 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
4427 && vinfo_for_stmt (def_stmt)
4428 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
4429 == vect_double_reduction_def)
4431 *adjustment_def = NULL;
4432 return vect_create_destination_var (init_val, vectype);
4435 vect_reduction_type reduction_type
4436 = STMT_VINFO_VEC_REDUCTION_TYPE (stmt_vinfo);
4438 /* In case of a nested reduction do not use an adjustment def, as
4439 that case is not handled correctly by the epilogue generation
4440 when ncopies is not one. */
4441 if (adjustment_def && nested_in_vect_loop)
4443 *adjustment_def = NULL;
4444 return vect_get_vec_def_for_operand (init_val, stmt);
4447 switch (code)
4449 case WIDEN_SUM_EXPR:
4450 case DOT_PROD_EXPR:
4451 case SAD_EXPR:
4452 case PLUS_EXPR:
4453 case MINUS_EXPR:
4454 case BIT_IOR_EXPR:
4455 case BIT_XOR_EXPR:
4456 case MULT_EXPR:
4457 case BIT_AND_EXPR:
4459 /* ADJUSTMENT_DEF is NULL when called from
4460 vect_create_epilog_for_reduction to vectorize double reduction. */
4461 if (adjustment_def)
4462 *adjustment_def = init_val;
4464 if (code == MULT_EXPR)
4466 real_init_val = dconst1;
4467 int_init_val = 1;
4470 if (code == BIT_AND_EXPR)
4471 int_init_val = -1;
4473 if (SCALAR_FLOAT_TYPE_P (scalar_type))
4474 def_for_init = build_real (scalar_type, real_init_val);
4475 else
4476 def_for_init = build_int_cst (scalar_type, int_init_val);
4478 if (adjustment_def)
4479 /* Option1: the first element is '0' or '1' as well. */
4480 init_def = gimple_build_vector_from_val (&stmts, vectype,
4481 def_for_init);
4482 else if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant ())
4484 /* Option2 (variable length): the first element is INIT_VAL. */
4485 init_def = build_vector_from_val (vectype, def_for_init);
4486 gcall *call = gimple_build_call_internal (IFN_VEC_SHL_INSERT,
4487 2, init_def, init_val);
4488 init_def = make_ssa_name (vectype);
4489 gimple_call_set_lhs (call, init_def);
4490 gimple_seq_add_stmt (&stmts, call);
4492 else
4494 /* Option2: the first element is INIT_VAL. */
4495 tree_vector_builder elts (vectype, 1, 2);
4496 elts.quick_push (init_val);
4497 elts.quick_push (def_for_init);
4498 init_def = gimple_build_vector (&stmts, &elts);
4501 break;
4503 case MIN_EXPR:
4504 case MAX_EXPR:
4505 case COND_EXPR:
4507 if (adjustment_def)
4509 *adjustment_def = NULL_TREE;
4510 if (reduction_type != COND_REDUCTION
4511 && reduction_type != EXTRACT_LAST_REDUCTION)
4513 init_def = vect_get_vec_def_for_operand (init_val, stmt);
4514 break;
4517 init_val = gimple_convert (&stmts, TREE_TYPE (vectype), init_val);
4518 init_def = gimple_build_vector_from_val (&stmts, vectype, init_val);
4520 break;
4522 default:
4523 gcc_unreachable ();
4526 if (stmts)
4527 gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), stmts);
4528 return init_def;
4531 /* Get at the initial defs for the reduction PHIs in SLP_NODE.
4532 NUMBER_OF_VECTORS is the number of vector defs to create.
4533 If NEUTRAL_OP is nonnull, introducing extra elements of that
4534 value will not change the result. */
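/* For instance, NEUTRAL_OP would be 0 for a PLUS_EXPR reduction and 1 for
   a MULT_EXPR reduction (illustrative values; the caller obtains the actual
   value from neutral_op_for_slp_reduction).  */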
4536 static void
4537 get_initial_defs_for_reduction (slp_tree slp_node,
4538 vec<tree> *vec_oprnds,
4539 unsigned int number_of_vectors,
4540 bool reduc_chain, tree neutral_op)
4542 vec<gimple *> stmts = SLP_TREE_SCALAR_STMTS (slp_node);
4543 gimple *stmt = stmts[0];
4544 stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
4545 unsigned HOST_WIDE_INT nunits;
4546 unsigned j, number_of_places_left_in_vector;
4547 tree vector_type;
4548 tree vop;
4549 int group_size = stmts.length ();
4550 unsigned int vec_num, i;
4551 unsigned number_of_copies = 1;
4552 vec<tree> voprnds;
4553 voprnds.create (number_of_vectors);
4554 struct loop *loop;
4555 auto_vec<tree, 16> permute_results;
4557 vector_type = STMT_VINFO_VECTYPE (stmt_vinfo);
4559 gcc_assert (STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_reduction_def);
4561 loop = (gimple_bb (stmt))->loop_father;
4562 gcc_assert (loop);
4563 edge pe = loop_preheader_edge (loop);
4565 gcc_assert (!reduc_chain || neutral_op);
4567 /* NUMBER_OF_COPIES is the number of times we need to use the same values in
4568 created vectors. It is greater than 1 if unrolling is performed.
4570 For example, we have two scalar operands, s1 and s2 (e.g., group of
4571 strided accesses of size two), while NUNITS is four (i.e., four scalars
4572 of this type can be packed in a vector). The output vector will contain
4573 two copies of each scalar operand: {s1, s2, s1, s2}. (NUMBER_OF_COPIES
4574 will be 2).
4576 If GROUP_SIZE > NUNITS, the scalars will be split into several vectors
4577 containing the operands.
4579 For example, NUNITS is four as before, and the group size is 8
4580 (s1, s2, ..., s8). We will create two vectors {s1, s2, s3, s4} and
4581 {s5, s6, s7, s8}. */
4583 if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits))
4584 nunits = group_size;
4586 number_of_copies = nunits * number_of_vectors / group_size;
4588 number_of_places_left_in_vector = nunits;
4589 bool constant_p = true;
4590 tree_vector_builder elts (vector_type, nunits, 1);
4591 elts.quick_grow (nunits);
4592 for (j = 0; j < number_of_copies; j++)
4594 for (i = group_size - 1; stmts.iterate (i, &stmt); i--)
4596 tree op;
4597 /* Get the def before the loop. In reduction chain we have only
4598 one initial value. */
4599 if ((j != (number_of_copies - 1)
4600 || (reduc_chain && i != 0))
4601 && neutral_op)
4602 op = neutral_op;
4603 else
4604 op = PHI_ARG_DEF_FROM_EDGE (stmt, pe);
4606 /* Create 'vect_ = {op0,op1,...,opn}'. */
4607 number_of_places_left_in_vector--;
4608 elts[number_of_places_left_in_vector] = op;
4609 if (!CONSTANT_CLASS_P (op))
4610 constant_p = false;
4612 if (number_of_places_left_in_vector == 0)
4614 gimple_seq ctor_seq = NULL;
4615 tree init;
4616 if (constant_p && !neutral_op
4617 ? multiple_p (TYPE_VECTOR_SUBPARTS (vector_type), nunits)
4618 : known_eq (TYPE_VECTOR_SUBPARTS (vector_type), nunits))
4619 /* Build the vector directly from ELTS. */
4620 init = gimple_build_vector (&ctor_seq, &elts);
4621 else if (neutral_op)
4623 /* Build a vector of the neutral value and shift the
4624 other elements into place. */
4625 init = gimple_build_vector_from_val (&ctor_seq, vector_type,
4626 neutral_op);
4627 int k = nunits;
4628 while (k > 0 && elts[k - 1] == neutral_op)
4629 k -= 1;
4630 while (k > 0)
4632 k -= 1;
4633 gcall *call = gimple_build_call_internal
4634 (IFN_VEC_SHL_INSERT, 2, init, elts[k]);
4635 init = make_ssa_name (vector_type);
4636 gimple_call_set_lhs (call, init);
4637 gimple_seq_add_stmt (&ctor_seq, call);
4640 else
4642 /* First time round, duplicate ELTS to fill the
4643 required number of vectors, then cherry pick the
4644 appropriate result for each iteration. */
4645 if (vec_oprnds->is_empty ())
4646 duplicate_and_interleave (&ctor_seq, vector_type, elts,
4647 number_of_vectors,
4648 permute_results);
4649 init = permute_results[number_of_vectors - j - 1];
4651 if (ctor_seq != NULL)
4652 gsi_insert_seq_on_edge_immediate (pe, ctor_seq);
4653 voprnds.quick_push (init);
4655 number_of_places_left_in_vector = nunits;
4656 elts.new_vector (vector_type, nunits, 1);
4657 elts.quick_grow (nunits);
4658 constant_p = true;
4663 /* Since the vectors are created in the reverse order, we should invert
4664 them. */
4665 vec_num = voprnds.length ();
4666 for (j = vec_num; j != 0; j--)
4668 vop = voprnds[j - 1];
4669 vec_oprnds->quick_push (vop);
4672 voprnds.release ();
4674 /* In case that VF is greater than the unrolling factor needed for the SLP
4675 group of stmts, NUMBER_OF_VECTORS to be created is greater than
4676 NUMBER_OF_SCALARS/NUNITS or NUNITS/NUMBER_OF_SCALARS, and hence we have
4677 to replicate the vectors. */
4678 tree neutral_vec = NULL;
4679 while (number_of_vectors > vec_oprnds->length ())
4681 if (neutral_op)
4683 if (!neutral_vec)
4685 gimple_seq ctor_seq = NULL;
4686 neutral_vec = gimple_build_vector_from_val
4687 (&ctor_seq, vector_type, neutral_op);
4688 if (ctor_seq != NULL)
4689 gsi_insert_seq_on_edge_immediate (pe, ctor_seq);
4691 vec_oprnds->quick_push (neutral_vec);
4693 else
4695 for (i = 0; vec_oprnds->iterate (i, &vop) && i < vec_num; i++)
4696 vec_oprnds->quick_push (vop);
4702 /* Function vect_create_epilog_for_reduction
4704 Create code at the loop-epilog to finalize the result of a reduction
4705 computation.
4707 VECT_DEFS is list of vector of partial results, i.e., the lhs's of vector
4708 reduction statements.
4709 STMT is the scalar reduction stmt that is being vectorized.
4710 NCOPIES is > 1 in case the vectorization factor (VF) is bigger than the
4711 number of elements that we can fit in a vectype (nunits). In this case
4712 we have to generate more than one vector stmt - i.e - we need to "unroll"
4713 the vector stmt by a factor VF/nunits. For more details see documentation
4714 in vectorizable_operation.
4715 REDUC_FN is the internal function for the epilog reduction.
4716 REDUCTION_PHIS is a list of the phi-nodes that carry the reduction
4717 computation.
4718 REDUC_INDEX is the index of the operand in the right hand side of the
4719 statement that is defined by REDUCTION_PHI.
4720 DOUBLE_REDUC is TRUE if double reduction phi nodes should be handled.
4721 SLP_NODE is an SLP node containing a group of reduction statements. The
4722 first one in this group is STMT.
4723 INDUC_VAL is for INTEGER_INDUC_COND_REDUCTION the value to use for the case
4724 when the COND_EXPR is never true in the loop. For MAX_EXPR, it needs to
4725 be smaller than any value of the IV in the loop, for MIN_EXPR larger than
4726 any value of the IV in the loop.
4727 INDUC_CODE is the code for epilog reduction if INTEGER_INDUC_COND_REDUCTION.
4728 NEUTRAL_OP is the value given by neutral_op_for_slp_reduction; it is
4729 null if this is not an SLP reduction
4731 This function:
4732 1. Creates the reduction def-use cycles: sets the arguments for
4733 REDUCTION_PHIS:
4734 The loop-entry argument is the vectorized initial-value of the reduction.
4735 The loop-latch argument is taken from VECT_DEFS - the vector of partial
4736 sums.
4737 2. "Reduces" each vector of partial results VECT_DEFS into a single result,
4738 by calling the function specified by REDUC_FN if available, or by
4739 other means (whole-vector shifts or a scalar loop).
4740 The function also creates a new phi node at the loop exit to preserve
4741 loop-closed form, as illustrated below.
4743 The flow at the entry to this function:
4745 loop:
4746 vec_def = phi <null, null> # REDUCTION_PHI
4747 VECT_DEF = vector_stmt # vectorized form of STMT
4748 s_loop = scalar_stmt # (scalar) STMT
4749 loop_exit:
4750 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
4751 use <s_out0>
4752 use <s_out0>
4754 The above is transformed by this function into:
4756 loop:
4757 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
4758 VECT_DEF = vector_stmt # vectorized form of STMT
4759 s_loop = scalar_stmt # (scalar) STMT
4760 loop_exit:
4761 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
4762 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
4763 v_out2 = reduce <v_out1>
4764 s_out3 = extract_field <v_out2, 0>
4765 s_out4 = adjust_result <s_out3>
4766 use <s_out4>
4767 use <s_out4>
4770 static void
4771 vect_create_epilog_for_reduction (vec<tree> vect_defs, gimple *stmt,
4772 gimple *reduc_def_stmt,
4773 int ncopies, internal_fn reduc_fn,
4774 vec<gimple *> reduction_phis,
4775 bool double_reduc,
4776 slp_tree slp_node,
4777 slp_instance slp_node_instance,
4778 tree induc_val, enum tree_code induc_code,
4779 tree neutral_op)
4781 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4782 stmt_vec_info prev_phi_info;
4783 tree vectype;
4784 machine_mode mode;
4785 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4786 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *outer_loop = NULL;
4787 basic_block exit_bb;
4788 tree scalar_dest;
4789 tree scalar_type;
4790 gimple *new_phi = NULL, *phi;
4791 gimple_stmt_iterator exit_gsi;
4792 tree vec_dest;
4793 tree new_temp = NULL_TREE, new_dest, new_name, new_scalar_dest;
4794 gimple *epilog_stmt = NULL;
4795 enum tree_code code = gimple_assign_rhs_code (stmt);
4796 gimple *exit_phi;
4797 tree bitsize;
4798 tree adjustment_def = NULL;
4799 tree vec_initial_def = NULL;
4800 tree expr, def, initial_def = NULL;
4801 tree orig_name, scalar_result;
4802 imm_use_iterator imm_iter, phi_imm_iter;
4803 use_operand_p use_p, phi_use_p;
4804 gimple *use_stmt, *orig_stmt, *reduction_phi = NULL;
4805 bool nested_in_vect_loop = false;
4806 auto_vec<gimple *> new_phis;
4807 auto_vec<gimple *> inner_phis;
4808 enum vect_def_type dt = vect_unknown_def_type;
4809 int j, i;
4810 auto_vec<tree> scalar_results;
4811 unsigned int group_size = 1, k, ratio;
4812 auto_vec<tree> vec_initial_defs;
4813 auto_vec<gimple *> phis;
4814 bool slp_reduc = false;
4815 bool direct_slp_reduc;
4816 tree new_phi_result;
4817 gimple *inner_phi = NULL;
4818 tree induction_index = NULL_TREE;
4820 if (slp_node)
4821 group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
4823 if (nested_in_vect_loop_p (loop, stmt))
4825 outer_loop = loop;
4826 loop = loop->inner;
4827 nested_in_vect_loop = true;
4828 gcc_assert (!slp_node);
4831 vectype = STMT_VINFO_VECTYPE (stmt_info);
4832 gcc_assert (vectype);
4833 mode = TYPE_MODE (vectype);
4835 /* 1. Create the reduction def-use cycle:
4836 Set the arguments of REDUCTION_PHIS, i.e., transform
4838 loop:
4839 vec_def = phi <null, null> # REDUCTION_PHI
4840 VECT_DEF = vector_stmt # vectorized form of STMT
4843 into:
4845 loop:
4846 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
4847 VECT_DEF = vector_stmt # vectorized form of STMT
4850 (in case of SLP, do it for all the phis). */
4852 /* Get the loop-entry arguments. */
4853 enum vect_def_type initial_def_dt = vect_unknown_def_type;
4854 if (slp_node)
4856 unsigned vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
4857 vec_initial_defs.reserve (vec_num);
4858 get_initial_defs_for_reduction (slp_node_instance->reduc_phis,
4859 &vec_initial_defs, vec_num,
4860 GROUP_FIRST_ELEMENT (stmt_info),
4861 neutral_op);
4863 else
4865 /* Get at the scalar def before the loop, that defines the initial value
4866 of the reduction variable. */
4867 gimple *def_stmt;
4868 initial_def = PHI_ARG_DEF_FROM_EDGE (reduc_def_stmt,
4869 loop_preheader_edge (loop));
4870 /* Optimize: if initial_def is for REDUC_MAX smaller than the base
4871 and we can't use zero for induc_val, use initial_def. Similarly
4872 for REDUC_MIN and initial_def larger than the base. */
4873 if (TREE_CODE (initial_def) == INTEGER_CST
4874 && (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
4875 == INTEGER_INDUC_COND_REDUCTION)
4876 && !integer_zerop (induc_val)
4877 && ((induc_code == MAX_EXPR
4878 && tree_int_cst_lt (initial_def, induc_val))
4879 || (induc_code == MIN_EXPR
4880 && tree_int_cst_lt (induc_val, initial_def))))
4881 induc_val = initial_def;
4882 vect_is_simple_use (initial_def, loop_vinfo, &def_stmt, &initial_def_dt);
4883 vec_initial_def = get_initial_def_for_reduction (stmt, initial_def,
4884 &adjustment_def);
4885 vec_initial_defs.create (1);
4886 vec_initial_defs.quick_push (vec_initial_def);
4889 /* Set phi nodes arguments. */
4890 FOR_EACH_VEC_ELT (reduction_phis, i, phi)
4892 tree vec_init_def = vec_initial_defs[i];
4893 tree def = vect_defs[i];
4894 for (j = 0; j < ncopies; j++)
4896 if (j != 0)
4898 phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi));
4899 if (nested_in_vect_loop)
4900 vec_init_def
4901 = vect_get_vec_def_for_stmt_copy (initial_def_dt,
4902 vec_init_def);
4905 /* Set the loop-entry arg of the reduction-phi. */
4907 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
4908 == INTEGER_INDUC_COND_REDUCTION)
4910 /* Initialise the reduction phi to zero. This prevents non-zero
4911 initial values from interfering with the reduction op. */
4912 gcc_assert (ncopies == 1);
4913 gcc_assert (i == 0);
4915 tree vec_init_def_type = TREE_TYPE (vec_init_def);
4916 tree induc_val_vec
4917 = build_vector_from_val (vec_init_def_type, induc_val);
4919 add_phi_arg (as_a <gphi *> (phi), induc_val_vec,
4920 loop_preheader_edge (loop), UNKNOWN_LOCATION);
4922 else
4923 add_phi_arg (as_a <gphi *> (phi), vec_init_def,
4924 loop_preheader_edge (loop), UNKNOWN_LOCATION);
4926 /* Set the loop-latch arg for the reduction-phi. */
4927 if (j > 0)
4928 def = vect_get_vec_def_for_stmt_copy (vect_unknown_def_type, def);
4930 add_phi_arg (as_a <gphi *> (phi), def, loop_latch_edge (loop),
4931 UNKNOWN_LOCATION);
4933 if (dump_enabled_p ())
4935 dump_printf_loc (MSG_NOTE, vect_location,
4936 "transform reduction: created def-use cycle: ");
4937 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
4938 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, SSA_NAME_DEF_STMT (def), 0);
4943 /* For cond reductions we want to create a new vector (INDEX_COND_EXPR)
4944 which is updated with the current index of the loop for every match of
4945 the original loop's cond_expr (VEC_STMT). This results in a vector
4946 containing the last time the condition passed for that vector lane.
4947 The first match will be a 1 to allow 0 to be used for non-matching
4948 indexes. If there are no matches at all then the vector will be all
4949 zeroes. */
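/* Illustrative example (made-up match pattern): with four lanes and two
   vector iterations, if lane 1 matches in the first iteration (index 2) and
   lane 3 matches in the second (index 8), the final index vector is
   {0, 2, 0, 8}; the epilogue then takes the maximum index, 8, to locate the
   last match.  */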
4950 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION)
4952 tree indx_before_incr, indx_after_incr;
4953 poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype);
4955 gimple *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info);
4956 gcc_assert (gimple_assign_rhs_code (vec_stmt) == VEC_COND_EXPR);
4958 int scalar_precision
4959 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (TREE_TYPE (vectype)));
4960 tree cr_index_scalar_type = make_unsigned_type (scalar_precision);
4961 tree cr_index_vector_type = build_vector_type
4962 (cr_index_scalar_type, TYPE_VECTOR_SUBPARTS (vectype));
4964 /* First we create a simple vector induction variable which starts
4965 with the values {1,2,3,...} (SERIES_VECT) and increments by the
4966 vector size (STEP). */
4968 /* Create a {1,2,3,...} vector. */
4969 tree series_vect = build_index_vector (cr_index_vector_type, 1, 1);
4971 /* Create a vector of the step value. */
4972 tree step = build_int_cst (cr_index_scalar_type, nunits_out);
4973 tree vec_step = build_vector_from_val (cr_index_vector_type, step);
4975 /* Create an induction variable. */
4976 gimple_stmt_iterator incr_gsi;
4977 bool insert_after;
4978 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
4979 create_iv (series_vect, vec_step, NULL_TREE, loop, &incr_gsi,
4980 insert_after, &indx_before_incr, &indx_after_incr);
4982 /* Next create a new phi node vector (NEW_PHI_TREE) which starts
4983 filled with zeros (VEC_ZERO). */
4985 /* Create a vector of 0s. */
4986 tree zero = build_zero_cst (cr_index_scalar_type);
4987 tree vec_zero = build_vector_from_val (cr_index_vector_type, zero);
4989 /* Create a vector phi node. */
4990 tree new_phi_tree = make_ssa_name (cr_index_vector_type);
4991 new_phi = create_phi_node (new_phi_tree, loop->header);
4992 set_vinfo_for_stmt (new_phi,
4993 new_stmt_vec_info (new_phi, loop_vinfo));
4994 add_phi_arg (as_a <gphi *> (new_phi), vec_zero,
4995 loop_preheader_edge (loop), UNKNOWN_LOCATION);
4997 /* Now take the condition from the loop's original cond_expr
4998 (VEC_STMT) and produce a new cond_expr (INDEX_COND_EXPR) which for
4999 every match uses values from the induction variable
5000 (INDEX_BEFORE_INCR) otherwise uses values from the phi node
5001 (NEW_PHI_TREE).
5002 Finally, we update the phi (NEW_PHI_TREE) to take the value of
5003 the new cond_expr (INDEX_COND_EXPR). */
5005 /* Duplicate the condition from vec_stmt. */
5006 tree ccompare = unshare_expr (gimple_assign_rhs1 (vec_stmt));
5008 /* Create a conditional, where the condition is taken from vec_stmt
5009 (CCOMPARE), then is the induction index (INDEX_BEFORE_INCR) and
5010 else is the phi (NEW_PHI_TREE). */
5011 tree index_cond_expr = build3 (VEC_COND_EXPR, cr_index_vector_type,
5012 ccompare, indx_before_incr,
5013 new_phi_tree);
5014 induction_index = make_ssa_name (cr_index_vector_type);
5015 gimple *index_condition = gimple_build_assign (induction_index,
5016 index_cond_expr);
5017 gsi_insert_before (&incr_gsi, index_condition, GSI_SAME_STMT);
5018 stmt_vec_info index_vec_info = new_stmt_vec_info (index_condition,
5019 loop_vinfo);
5020 STMT_VINFO_VECTYPE (index_vec_info) = cr_index_vector_type;
5021 set_vinfo_for_stmt (index_condition, index_vec_info);
5023 /* Update the phi with the vec cond. */
5024 add_phi_arg (as_a <gphi *> (new_phi), induction_index,
5025 loop_latch_edge (loop), UNKNOWN_LOCATION);
5028 /* 2. Create epilog code.
5029 The reduction epilog code operates across the elements of the vector
5030 of partial results computed by the vectorized loop.
5031 The reduction epilog code consists of:
5033 step 1: compute the scalar result in a vector (v_out2)
5034 step 2: extract the scalar result (s_out3) from the vector (v_out2)
5035 step 3: adjust the scalar result (s_out3) if needed.
5037 Step 1 can be accomplished using one the following three schemes:
5038 (scheme 1) using reduc_fn, if available.
5039 (scheme 2) using whole-vector shifts, if available.
5040 (scheme 3) using a scalar loop. In this case steps 1+2 above are
5041 combined.
5043 The overall epilog code looks like this:
5045 s_out0 = phi <s_loop> # original EXIT_PHI
5046 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
5047 v_out2 = reduce <v_out1> # step 1
5048 s_out3 = extract_field <v_out2, 0> # step 2
5049 s_out4 = adjust_result <s_out3> # step 3
5051 (step 3 is optional, and steps 1 and 2 may be combined).
5052 Lastly, the uses of s_out0 are replaced by s_out4. */
5055 /* 2.1 Create new loop-exit-phis to preserve loop-closed form:
5056 v_out1 = phi <VECT_DEF>
5057 Store them in NEW_PHIS. */
5059 exit_bb = single_exit (loop)->dest;
5060 prev_phi_info = NULL;
5061 new_phis.create (vect_defs.length ());
5062 FOR_EACH_VEC_ELT (vect_defs, i, def)
5064 for (j = 0; j < ncopies; j++)
5066 tree new_def = copy_ssa_name (def);
5067 phi = create_phi_node (new_def, exit_bb);
5068 set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, loop_vinfo));
5069 if (j == 0)
5070 new_phis.quick_push (phi);
5071 else
5073 def = vect_get_vec_def_for_stmt_copy (dt, def);
5074 STMT_VINFO_RELATED_STMT (prev_phi_info) = phi;
5077 SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, def);
5078 prev_phi_info = vinfo_for_stmt (phi);
5082 /* The epilogue is created for the outer-loop, i.e., for the loop being
5083 vectorized. Create exit phis for the outer loop. */
5084 if (double_reduc)
5086 loop = outer_loop;
5087 exit_bb = single_exit (loop)->dest;
5088 inner_phis.create (vect_defs.length ());
5089 FOR_EACH_VEC_ELT (new_phis, i, phi)
5091 tree new_result = copy_ssa_name (PHI_RESULT (phi));
5092 gphi *outer_phi = create_phi_node (new_result, exit_bb);
5093 SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx,
5094 PHI_RESULT (phi));
5095 set_vinfo_for_stmt (outer_phi, new_stmt_vec_info (outer_phi,
5096 loop_vinfo));
5097 inner_phis.quick_push (phi);
5098 new_phis[i] = outer_phi;
5099 prev_phi_info = vinfo_for_stmt (outer_phi);
5100 while (STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi)))
5102 phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi));
5103 new_result = copy_ssa_name (PHI_RESULT (phi));
5104 outer_phi = create_phi_node (new_result, exit_bb);
5105 SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx,
5106 PHI_RESULT (phi));
5107 set_vinfo_for_stmt (outer_phi, new_stmt_vec_info (outer_phi,
5108 loop_vinfo));
5109 STMT_VINFO_RELATED_STMT (prev_phi_info) = outer_phi;
5110 prev_phi_info = vinfo_for_stmt (outer_phi);
5115 exit_gsi = gsi_after_labels (exit_bb);
5117 /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
5118 (i.e. when reduc_fn is not available) and in the final adjustment
5119 code (if needed). Also get the original scalar reduction variable as
5120 defined in the loop. In case STMT is a "pattern-stmt" (i.e. - it
5121 represents a reduction pattern), the tree-code and scalar-def are
5122 taken from the original stmt that the pattern-stmt (STMT) replaces.
5123 Otherwise (it is a regular reduction) - the tree-code and scalar-def
5124 are taken from STMT. */
5126 orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
5127 if (!orig_stmt)
5129 /* Regular reduction */
5130 orig_stmt = stmt;
5132 else
5134 /* Reduction pattern */
5135 stmt_vec_info stmt_vinfo = vinfo_for_stmt (orig_stmt);
5136 gcc_assert (STMT_VINFO_IN_PATTERN_P (stmt_vinfo));
5137 gcc_assert (STMT_VINFO_RELATED_STMT (stmt_vinfo) == stmt);
5140 code = gimple_assign_rhs_code (orig_stmt);
5141 /* For MINUS_EXPR the initial vector is [init_val,0,...,0], therefore,
5142 partial results are added and not subtracted. */
5143 if (code == MINUS_EXPR)
5144 code = PLUS_EXPR;
5146 scalar_dest = gimple_assign_lhs (orig_stmt);
5147 scalar_type = TREE_TYPE (scalar_dest);
5148 scalar_results.create (group_size);
5149 new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
5150 bitsize = TYPE_SIZE (scalar_type);
5152 /* In case this is a reduction in an inner-loop while vectorizing an outer
5153 loop - we don't need to extract a single scalar result at the end of the
5154 inner-loop (unless it is double reduction, i.e., the use of reduction is
5155 outside the outer-loop). The final vector of partial results will be used
5156 in the vectorized outer-loop, or reduced to a scalar result at the end of
5157 the outer-loop. */
5158 if (nested_in_vect_loop && !double_reduc)
5159 goto vect_finalize_reduction;
5161 /* SLP reduction without reduction chain, e.g.,
5162 # a1 = phi <a2, a0>
5163 # b1 = phi <b2, b0>
5164 a2 = operation (a1)
5165 b2 = operation (b1) */
5166 slp_reduc = (slp_node && !GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)));
5168 /* True if we should implement SLP_REDUC using native reduction operations
5169 instead of scalar operations. */
5170 direct_slp_reduc = (reduc_fn != IFN_LAST
5171 && slp_reduc
5172 && !TYPE_VECTOR_SUBPARTS (vectype).is_constant ());
5174 /* In case of reduction chain, e.g.,
5175 # a1 = phi <a3, a0>
5176 a2 = operation (a1)
5177 a3 = operation (a2),
5179 we may end up with more than one vector result. Here we reduce them to
5180 one vector. */
5181 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) || direct_slp_reduc)
5183 tree first_vect = PHI_RESULT (new_phis[0]);
5184 gassign *new_vec_stmt = NULL;
5185 vec_dest = vect_create_destination_var (scalar_dest, vectype);
5186 for (k = 1; k < new_phis.length (); k++)
5188 gimple *next_phi = new_phis[k];
5189 tree second_vect = PHI_RESULT (next_phi);
5190 tree tem = make_ssa_name (vec_dest, new_vec_stmt);
5191 new_vec_stmt = gimple_build_assign (tem, code,
5192 first_vect, second_vect);
5193 gsi_insert_before (&exit_gsi, new_vec_stmt, GSI_SAME_STMT);
5194 first_vect = tem;
5197 new_phi_result = first_vect;
5198 if (new_vec_stmt)
5200 new_phis.truncate (0);
5201 new_phis.safe_push (new_vec_stmt);
5204 /* Likewise if we couldn't use a single def-use cycle. */
5205 else if (ncopies > 1)
5207 gcc_assert (new_phis.length () == 1);
5208 tree first_vect = PHI_RESULT (new_phis[0]);
5209 gassign *new_vec_stmt = NULL;
5210 vec_dest = vect_create_destination_var (scalar_dest, vectype);
5211 gimple *next_phi = new_phis[0];
5212 for (int k = 1; k < ncopies; ++k)
5214 next_phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (next_phi));
5215 tree second_vect = PHI_RESULT (next_phi);
5216 tree tem = make_ssa_name (vec_dest, new_vec_stmt);
5217 new_vec_stmt = gimple_build_assign (tem, code,
5218 first_vect, second_vect);
5219 gsi_insert_before (&exit_gsi, new_vec_stmt, GSI_SAME_STMT);
5220 first_vect = tem;
5222 new_phi_result = first_vect;
5223 new_phis.truncate (0);
5224 new_phis.safe_push (new_vec_stmt);
5226 else
5227 new_phi_result = PHI_RESULT (new_phis[0]);
5229 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION
5230 && reduc_fn != IFN_LAST)
5232 /* For condition reductions, we have a vector (NEW_PHI_RESULT) containing
5233 various data values where the condition matched and another vector
5234 (INDUCTION_INDEX) containing all the indexes of those matches. We
5235 need to extract the last matching index (which will be the index with
5236 highest value) and use this to index into the data vector.
5237 For the case where there were no matches, the data vector will contain
5238 all default values and the index vector will be all zeros. */
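/* Carrying on the made-up example above: with index vector {0, 2, 0, 8}
   and data vector {d0, d1, d2, d3}, the statements below compute
   max_index = 8, compare it against the index vector to get a mask that is
   true only in lane 3, select {0, 0, 0, d3} from the data vector, and the
   final unsigned MAX reduction therefore yields d3.  */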
5240 /* Get various versions of the type of the vector of indexes. */
5241 tree index_vec_type = TREE_TYPE (induction_index);
5242 gcc_checking_assert (TYPE_UNSIGNED (index_vec_type));
5243 tree index_scalar_type = TREE_TYPE (index_vec_type);
5244 tree index_vec_cmp_type = build_same_sized_truth_vector_type
5245 (index_vec_type);
5247 /* Get an unsigned integer version of the type of the data vector. */
5248 int scalar_precision
5249 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
5250 tree scalar_type_unsigned = make_unsigned_type (scalar_precision);
5251 tree vectype_unsigned = build_vector_type
5252 (scalar_type_unsigned, TYPE_VECTOR_SUBPARTS (vectype));
5254 /* First we need to create a vector (ZERO_VEC) of zeros and another
5255 vector (MAX_INDEX_VEC) filled with the last matching index, which we
5256 can create using a MAX reduction and then expanding.
5257 In the case where the loop never made any matches, the max index will
5258 be zero. */
5260 /* Vector of {0, 0, 0,...}. */
5261 tree zero_vec = make_ssa_name (vectype);
5262 tree zero_vec_rhs = build_zero_cst (vectype);
5263 gimple *zero_vec_stmt = gimple_build_assign (zero_vec, zero_vec_rhs);
5264 gsi_insert_before (&exit_gsi, zero_vec_stmt, GSI_SAME_STMT);
5266 /* Find maximum value from the vector of found indexes. */
5267 tree max_index = make_ssa_name (index_scalar_type);
5268 gcall *max_index_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
5269 1, induction_index);
5270 gimple_call_set_lhs (max_index_stmt, max_index);
5271 gsi_insert_before (&exit_gsi, max_index_stmt, GSI_SAME_STMT);
5273 /* Vector of {max_index, max_index, max_index,...}. */
5274 tree max_index_vec = make_ssa_name (index_vec_type);
5275 tree max_index_vec_rhs = build_vector_from_val (index_vec_type,
5276 max_index);
5277 gimple *max_index_vec_stmt = gimple_build_assign (max_index_vec,
5278 max_index_vec_rhs);
5279 gsi_insert_before (&exit_gsi, max_index_vec_stmt, GSI_SAME_STMT);
5281 /* Next we compare the new vector (MAX_INDEX_VEC) full of max indexes
5282 with the vector (INDUCTION_INDEX) of found indexes, choosing values
5283 from the data vector (NEW_PHI_RESULT) for matches, 0 (ZERO_VEC)
5284 otherwise. Only one value should match, resulting in a vector
5285 (VEC_COND) with one data value and the rest zeros.
5286 In the case where the loop never made any matches, every index will
5287 match, resulting in a vector with all data values (which will all be
5288 the default value). */
5290 /* Compare the max index vector to the vector of found indexes to find
5291 the position of the max value. */
5292 tree vec_compare = make_ssa_name (index_vec_cmp_type);
5293 gimple *vec_compare_stmt = gimple_build_assign (vec_compare, EQ_EXPR,
5294 induction_index,
5295 max_index_vec);
5296 gsi_insert_before (&exit_gsi, vec_compare_stmt, GSI_SAME_STMT);
5298 /* Use the compare to choose either values from the data vector or
5299 zero. */
5300 tree vec_cond = make_ssa_name (vectype);
5301 gimple *vec_cond_stmt = gimple_build_assign (vec_cond, VEC_COND_EXPR,
5302 vec_compare, new_phi_result,
5303 zero_vec);
5304 gsi_insert_before (&exit_gsi, vec_cond_stmt, GSI_SAME_STMT);
5306 /* Finally we need to extract the data value from the vector (VEC_COND)
5307 into a scalar (MATCHED_DATA_REDUC). Logically we want to do an OR
5308 reduction, but because this doesn't exist, we can use a MAX reduction
5309 instead. The data value might be signed or a float so we need to cast
5310 it first.
5311 In the case where the loop never made any matches, the data values are
5312 all identical, and so will reduce down correctly. */
5314 /* Make the matched data values unsigned. */
5315 tree vec_cond_cast = make_ssa_name (vectype_unsigned);
5316 tree vec_cond_cast_rhs = build1 (VIEW_CONVERT_EXPR, vectype_unsigned,
5317 vec_cond);
5318 gimple *vec_cond_cast_stmt = gimple_build_assign (vec_cond_cast,
5319 VIEW_CONVERT_EXPR,
5320 vec_cond_cast_rhs);
5321 gsi_insert_before (&exit_gsi, vec_cond_cast_stmt, GSI_SAME_STMT);
5323 /* Reduce down to a scalar value. */
5324 tree data_reduc = make_ssa_name (scalar_type_unsigned);
5325 gcall *data_reduc_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
5326 1, vec_cond_cast);
5327 gimple_call_set_lhs (data_reduc_stmt, data_reduc);
5328 gsi_insert_before (&exit_gsi, data_reduc_stmt, GSI_SAME_STMT);
5330 /* Convert the reduced value back to the result type and set as the
5331 result. */
5332 gimple_seq stmts = NULL;
5333 new_temp = gimple_build (&stmts, VIEW_CONVERT_EXPR, scalar_type,
5334 data_reduc);
5335 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5336 scalar_results.safe_push (new_temp);
5338 else if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION
5339 && reduc_fn == IFN_LAST)
5341 /* Condition reduction without supported IFN_REDUC_MAX. Generate
5342 idx = 0;
5343 idx_val = induction_index[0];
5344 val = data_reduc[0];
5345 for (idx = 0, val = init, i = 0; i < nelts; ++i)
5346 if (induction_index[i] > idx_val)
5347 val = data_reduc[i], idx_val = induction_index[i];
5348 return val; */
5350 tree data_eltype = TREE_TYPE (TREE_TYPE (new_phi_result));
5351 tree idx_eltype = TREE_TYPE (TREE_TYPE (induction_index));
5352 unsigned HOST_WIDE_INT el_size = tree_to_uhwi (TYPE_SIZE (idx_eltype));
5353 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (TREE_TYPE (induction_index));
5354 /* Enforced by vectorizable_reduction, which ensures we have target
5355 support before allowing a conditional reduction on variable-length
5356 vectors. */
5357 unsigned HOST_WIDE_INT v_size = el_size * nunits.to_constant ();
5358 tree idx_val = NULL_TREE, val = NULL_TREE;
5359 for (unsigned HOST_WIDE_INT off = 0; off < v_size; off += el_size)
5361 tree old_idx_val = idx_val;
5362 tree old_val = val;
5363 idx_val = make_ssa_name (idx_eltype);
5364 epilog_stmt = gimple_build_assign (idx_val, BIT_FIELD_REF,
5365 build3 (BIT_FIELD_REF, idx_eltype,
5366 induction_index,
5367 bitsize_int (el_size),
5368 bitsize_int (off)));
5369 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5370 val = make_ssa_name (data_eltype);
5371 epilog_stmt = gimple_build_assign (val, BIT_FIELD_REF,
5372 build3 (BIT_FIELD_REF,
5373 data_eltype,
5374 new_phi_result,
5375 bitsize_int (el_size),
5376 bitsize_int (off)));
5377 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5378 if (off != 0)
5380 tree new_idx_val = idx_val;
5381 tree new_val = val;
5382 if (off != v_size - el_size)
5384 new_idx_val = make_ssa_name (idx_eltype);
5385 epilog_stmt = gimple_build_assign (new_idx_val,
5386 MAX_EXPR, idx_val,
5387 old_idx_val);
5388 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5390 new_val = make_ssa_name (data_eltype);
5391 epilog_stmt = gimple_build_assign (new_val,
5392 COND_EXPR,
5393 build2 (GT_EXPR,
5394 boolean_type_node,
5395 idx_val,
5396 old_idx_val),
5397 val, old_val);
5398 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5399 idx_val = new_idx_val;
5400 val = new_val;
5403 /* Convert the reduced value back to the result type and set as the
5404 result. */
5405 gimple_seq stmts = NULL;
5406 val = gimple_convert (&stmts, scalar_type, val);
5407 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5408 scalar_results.safe_push (val);
5411 /* 2.3 Create the reduction code, using one of the three schemes described
5412 above. In SLP we simply need to extract all the elements from the
5413 vector (without reducing them), so we use scalar shifts. */
5414 else if (reduc_fn != IFN_LAST && !slp_reduc)
5416 tree tmp;
5417 tree vec_elem_type;
5419 /* Case 1: Create:
5420 v_out2 = reduc_expr <v_out1> */
5422 if (dump_enabled_p ())
5423 dump_printf_loc (MSG_NOTE, vect_location,
5424 "Reduce using direct vector reduction.\n");
5426 vec_elem_type = TREE_TYPE (TREE_TYPE (new_phi_result));
5427 if (!useless_type_conversion_p (scalar_type, vec_elem_type))
5429 tree tmp_dest
5430 = vect_create_destination_var (scalar_dest, vec_elem_type);
5431 epilog_stmt = gimple_build_call_internal (reduc_fn, 1,
5432 new_phi_result);
5433 gimple_set_lhs (epilog_stmt, tmp_dest);
5434 new_temp = make_ssa_name (tmp_dest, epilog_stmt);
5435 gimple_set_lhs (epilog_stmt, new_temp);
5436 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5438 epilog_stmt = gimple_build_assign (new_scalar_dest, NOP_EXPR,
5439 new_temp);
5441 else
5443 epilog_stmt = gimple_build_call_internal (reduc_fn, 1,
5444 new_phi_result);
5445 gimple_set_lhs (epilog_stmt, new_scalar_dest);
5448 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5449 gimple_set_lhs (epilog_stmt, new_temp);
5450 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5452 if ((STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5453 == INTEGER_INDUC_COND_REDUCTION)
5454 && !operand_equal_p (initial_def, induc_val, 0))
5456 /* Earlier we set the initial value to be a vector of induc_val
5457 values. Check the result and if it is induc_val then replace
5458 with the original initial value, unless induc_val is
5459 the same as initial_def already. */
5460 tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp,
5461 induc_val);
5463 tmp = make_ssa_name (new_scalar_dest);
5464 epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
5465 initial_def, new_temp);
5466 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5467 new_temp = tmp;
5470 scalar_results.safe_push (new_temp);
5472 else if (direct_slp_reduc)
5474 /* Here we create one vector for each of the GROUP_SIZE results,
5475 with the elements for other SLP statements replaced with the
5476 neutral value. We can then do a normal reduction on each vector. */
5478 /* Enforced by vectorizable_reduction. */
5479 gcc_assert (new_phis.length () == 1);
5480 gcc_assert (pow2p_hwi (group_size));
5482 slp_tree orig_phis_slp_node = slp_node_instance->reduc_phis;
5483 vec<gimple *> orig_phis = SLP_TREE_SCALAR_STMTS (orig_phis_slp_node);
5484 gimple_seq seq = NULL;
5486 /* Build a vector {0, 1, 2, ...}, with the same number of elements
5487 and the same element size as VECTYPE. */
5488 tree index = build_index_vector (vectype, 0, 1);
5489 tree index_type = TREE_TYPE (index);
5490 tree index_elt_type = TREE_TYPE (index_type);
5491 tree mask_type = build_same_sized_truth_vector_type (index_type);
5493 /* Create a vector that, for each element, identifies which of
5494 the GROUP_SIZE results should use it. */
5495 tree index_mask = build_int_cst (index_elt_type, group_size - 1);
5496 index = gimple_build (&seq, BIT_AND_EXPR, index_type, index,
5497 build_vector_from_val (index_type, index_mask));
5499 /* Get a neutral vector value. This is simply a splat of the neutral
5500 scalar value if we have one, otherwise the initial scalar value
5501 is itself a neutral value. */
5502 tree vector_identity = NULL_TREE;
5503 if (neutral_op)
5504 vector_identity = gimple_build_vector_from_val (&seq, vectype,
5505 neutral_op);
5506 for (unsigned int i = 0; i < group_size; ++i)
5508 /* If there's no universal neutral value, we can use the
5509 initial scalar value from the original PHI. This is used
5510 for MIN and MAX reduction, for example. */
5511 if (!neutral_op)
5513 tree scalar_value
5514 = PHI_ARG_DEF_FROM_EDGE (orig_phis[i],
5515 loop_preheader_edge (loop));
5516 vector_identity = gimple_build_vector_from_val (&seq, vectype,
5517 scalar_value);
5520 /* Calculate the equivalent of:
5522 sel[j] = (index[j] == i);
5524 which selects the elements of NEW_PHI_RESULT that should
5525 be included in the result. */
5526 tree compare_val = build_int_cst (index_elt_type, i);
5527 compare_val = build_vector_from_val (index_type, compare_val);
5528 tree sel = gimple_build (&seq, EQ_EXPR, mask_type,
5529 index, compare_val);
5531 /* Calculate the equivalent of:
5533 vec = sel ? new_phi_result : vector_identity;
5535 VEC is now suitable for a full vector reduction. */
5536 tree vec = gimple_build (&seq, VEC_COND_EXPR, vectype,
5537 sel, new_phi_result, vector_identity);
5539 /* Do the reduction and convert it to the appropriate type. */
5540 gcall *call = gimple_build_call_internal (reduc_fn, 1, vec);
5541 tree scalar = make_ssa_name (TREE_TYPE (vectype));
5542 gimple_call_set_lhs (call, scalar);
5543 gimple_seq_add_stmt (&seq, call);
5544 scalar = gimple_convert (&seq, scalar_type, scalar);
5545 scalar_results.safe_push (scalar);
5547 gsi_insert_seq_before (&exit_gsi, seq, GSI_SAME_STMT);
5549 else
5551 bool reduce_with_shift;
5552 tree vec_temp;
5554 /* COND reductions all do the final reduction with MAX_EXPR
5555 or MIN_EXPR. */
5556 if (code == COND_EXPR)
5558 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5559 == INTEGER_INDUC_COND_REDUCTION)
5560 code = induc_code;
5561 else
5562 code = MAX_EXPR;
5565 /* See if the target wants to do the final (shift) reduction
5566 in a vector mode of smaller size and first reduce upper/lower
5567 halves against each other. */
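      /* Illustrative sketch only (the 256-bit/128-bit split is an
         assumption, not a requirement): if the target splits a V4DI
         accumulator ACC into V2DI halves, the loop below builds
           lo  = lowpart  <V2DI> (acc)    -- BIT_FIELD_REF at bit 0
           hi  = highpart <V2DI> (acc)    -- BIT_FIELD_REF at bit 128
           acc = lo CODE hi
         so that the shift-based or scalar reduction that follows only has
         to handle the narrower vector.  */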
5568 enum machine_mode mode1 = mode;
5569 tree vectype1 = vectype;
5570 unsigned sz = tree_to_uhwi (TYPE_SIZE_UNIT (vectype));
5571 unsigned sz1 = sz;
5572 if (!slp_reduc
5573 && (mode1 = targetm.vectorize.split_reduction (mode)) != mode)
5574 sz1 = GET_MODE_SIZE (mode1).to_constant ();
5576 vectype1 = get_vectype_for_scalar_type_and_size (scalar_type, sz1);
5577 reduce_with_shift = have_whole_vector_shift (mode1);
5578 if (!VECTOR_MODE_P (mode1))
5579 reduce_with_shift = false;
5580 else
5582 optab optab = optab_for_tree_code (code, vectype1, optab_default);
5583 if (optab_handler (optab, mode1) == CODE_FOR_nothing)
5584 reduce_with_shift = false;
5587 /* First reduce the vector to the size on which we should do the
5588 shift reduction, by combining upper and lower halves. */
5589 new_temp = new_phi_result;
5590 while (sz > sz1)
5592 gcc_assert (!slp_reduc);
5593 sz /= 2;
5594 vectype1 = get_vectype_for_scalar_type_and_size (scalar_type, sz);
5596 /* The target has to make sure we support lowpart/highpart
5597 extraction, either via direct vector extract or through
5598 integer mode punning. */
5599 tree dst1, dst2;
5600 if (convert_optab_handler (vec_extract_optab,
5601 TYPE_MODE (TREE_TYPE (new_temp)),
5602 TYPE_MODE (vectype1))
5603 != CODE_FOR_nothing)
5605 /* Extract sub-vectors directly once vec_extract becomes
5606 a conversion optab. */
5607 dst1 = make_ssa_name (vectype1);
5608 epilog_stmt
5609 = gimple_build_assign (dst1, BIT_FIELD_REF,
5610 build3 (BIT_FIELD_REF, vectype1,
5611 new_temp, TYPE_SIZE (vectype1),
5612 bitsize_int (0)));
5613 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5614 dst2 = make_ssa_name (vectype1);
5615 epilog_stmt
5616 = gimple_build_assign (dst2, BIT_FIELD_REF,
5617 build3 (BIT_FIELD_REF, vectype1,
5618 new_temp, TYPE_SIZE (vectype1),
5619 bitsize_int (sz * BITS_PER_UNIT)));
5620 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5622 else
5624 /* Extract via punning to appropriately sized integer mode
5625 vector. */
5626 tree eltype = build_nonstandard_integer_type (sz * BITS_PER_UNIT,
5627 1);
5628 tree etype = build_vector_type (eltype, 2);
5629 gcc_assert (convert_optab_handler (vec_extract_optab,
5630 TYPE_MODE (etype),
5631 TYPE_MODE (eltype))
5632 != CODE_FOR_nothing);
5633 tree tem = make_ssa_name (etype);
5634 epilog_stmt = gimple_build_assign (tem, VIEW_CONVERT_EXPR,
5635 build1 (VIEW_CONVERT_EXPR,
5636 etype, new_temp));
5637 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5638 new_temp = tem;
5639 tem = make_ssa_name (eltype);
5640 epilog_stmt
5641 = gimple_build_assign (tem, BIT_FIELD_REF,
5642 build3 (BIT_FIELD_REF, eltype,
5643 new_temp, TYPE_SIZE (eltype),
5644 bitsize_int (0)));
5645 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5646 dst1 = make_ssa_name (vectype1);
5647 epilog_stmt = gimple_build_assign (dst1, VIEW_CONVERT_EXPR,
5648 build1 (VIEW_CONVERT_EXPR,
5649 vectype1, tem));
5650 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5651 tem = make_ssa_name (eltype);
5652 epilog_stmt
5653 = gimple_build_assign (tem, BIT_FIELD_REF,
5654 build3 (BIT_FIELD_REF, eltype,
5655 new_temp, TYPE_SIZE (eltype),
5656 bitsize_int (sz * BITS_PER_UNIT)));
5657 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5658 dst2 = make_ssa_name (vectype1);
5659 epilog_stmt = gimple_build_assign (dst2, VIEW_CONVERT_EXPR,
5660 build1 (VIEW_CONVERT_EXPR,
5661 vectype1, tem));
5662 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5665 new_temp = make_ssa_name (vectype1);
5666 epilog_stmt = gimple_build_assign (new_temp, code, dst1, dst2);
5667 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5670 if (reduce_with_shift && !slp_reduc)
5672 int element_bitsize = tree_to_uhwi (bitsize);
5673 /* Enforced by vectorizable_reduction, which disallows SLP reductions
5674 for variable-length vectors and also requires direct target support
5675 for loop reductions. */
5676 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
5677 int nelements = vec_size_in_bits / element_bitsize;
5678 vec_perm_builder sel;
5679 vec_perm_indices indices;
5681 int elt_offset;
5683 tree zero_vec = build_zero_cst (vectype1);
5684 /* Case 2: Create:
5685 for (offset = nelements/2; offset >= 1; offset/=2)
5687 Create: va' = vec_shift <va, offset>
5688 Create: va = vop <va, va'>
5689 } */
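              /* Worked illustration (assumes four elements and PLUS_EXPR;
                 "x" marks don't-care lanes):
                   va  = { a0, a1, a2, a3 }
                   va' = vec_shift <va, 2> = { a2, a3, 0, 0 }
                   va  = va + va'          = { a0+a2, a1+a3, x, x }
                   va' = vec_shift <va, 1> = { a1+a3, x, x, 0 }
                   va  = va + va'          = { a0+a1+a2+a3, x, x, x }
                 so element 0 holds the reduction result, which step 2.4
                 below extracts.  */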
5691 tree rhs;
5693 if (dump_enabled_p ())
5694 dump_printf_loc (MSG_NOTE, vect_location,
5695 "Reduce using vector shifts\n");
5697 mode1 = TYPE_MODE (vectype1);
5698 vec_dest = vect_create_destination_var (scalar_dest, vectype1);
5699 for (elt_offset = nelements / 2;
5700 elt_offset >= 1;
5701 elt_offset /= 2)
5703 calc_vec_perm_mask_for_shift (elt_offset, nelements, &sel);
5704 indices.new_vector (sel, 2, nelements);
5705 tree mask = vect_gen_perm_mask_any (vectype1, indices);
5706 epilog_stmt = gimple_build_assign (vec_dest, VEC_PERM_EXPR,
5707 new_temp, zero_vec, mask);
5708 new_name = make_ssa_name (vec_dest, epilog_stmt);
5709 gimple_assign_set_lhs (epilog_stmt, new_name);
5710 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5712 epilog_stmt = gimple_build_assign (vec_dest, code, new_name,
5713 new_temp);
5714 new_temp = make_ssa_name (vec_dest, epilog_stmt);
5715 gimple_assign_set_lhs (epilog_stmt, new_temp);
5716 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5719 /* 2.4 Extract the final scalar result. Create:
5720 s_out3 = extract_field <v_out2, bitpos> */
5722 if (dump_enabled_p ())
5723 dump_printf_loc (MSG_NOTE, vect_location,
5724 "extract scalar result\n");
5726 rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp,
5727 bitsize, bitsize_zero_node);
5728 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
5729 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5730 gimple_assign_set_lhs (epilog_stmt, new_temp);
5731 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5732 scalar_results.safe_push (new_temp);
5734 else
5736 /* Case 3: Create:
5737 s = extract_field <v_out2, 0>
5738 for (offset = element_size;
5739 offset < vector_size;
5740 offset += element_size;)
5742 Create: s' = extract_field <v_out2, offset>
5743 Create: s = op <s, s'> // For non SLP cases
5744 } */
5746 if (dump_enabled_p ())
5747 dump_printf_loc (MSG_NOTE, vect_location,
5748 "Reduce using scalar code.\n");
5750 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
5751 int element_bitsize = tree_to_uhwi (bitsize);
5752 FOR_EACH_VEC_ELT (new_phis, i, new_phi)
5754 int bit_offset;
5755 if (gimple_code (new_phi) == GIMPLE_PHI)
5756 vec_temp = PHI_RESULT (new_phi);
5757 else
5758 vec_temp = gimple_assign_lhs (new_phi);
5759 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, bitsize,
5760 bitsize_zero_node);
5761 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
5762 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5763 gimple_assign_set_lhs (epilog_stmt, new_temp);
5764 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5766 /* In SLP we don't need to apply the reduction operation, so we just
5767 collect s' values in SCALAR_RESULTS. */
5768 if (slp_reduc)
5769 scalar_results.safe_push (new_temp);
5771 for (bit_offset = element_bitsize;
5772 bit_offset < vec_size_in_bits;
5773 bit_offset += element_bitsize)
5775 tree bitpos = bitsize_int (bit_offset);
5776 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp,
5777 bitsize, bitpos);
5779 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
5780 new_name = make_ssa_name (new_scalar_dest, epilog_stmt);
5781 gimple_assign_set_lhs (epilog_stmt, new_name);
5782 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5784 if (slp_reduc)
5786 /* In SLP we don't need to apply the reduction operation, so
5787 we just collect s' values in SCALAR_RESULTS. */
5788 new_temp = new_name;
5789 scalar_results.safe_push (new_name);
5791 else
5793 epilog_stmt = gimple_build_assign (new_scalar_dest, code,
5794 new_name, new_temp);
5795 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5796 gimple_assign_set_lhs (epilog_stmt, new_temp);
5797 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5802 /* The only case where we need to reduce scalar results in SLP is
5803 unrolling. If the size of SCALAR_RESULTS is greater than
5804 GROUP_SIZE, we reduce them by combining elements modulo
5805 GROUP_SIZE. */
5806 if (slp_reduc)
5808 tree res, first_res, new_res;
5809 gimple *new_stmt;
5811 /* Reduce multiple scalar results in case of SLP unrolling. */
5812 for (j = group_size; scalar_results.iterate (j, &res);
5813 j++)
5815 first_res = scalar_results[j % group_size];
5816 new_stmt = gimple_build_assign (new_scalar_dest, code,
5817 first_res, res);
5818 new_res = make_ssa_name (new_scalar_dest, new_stmt);
5819 gimple_assign_set_lhs (new_stmt, new_res);
5820 gsi_insert_before (&exit_gsi, new_stmt, GSI_SAME_STMT);
5821 scalar_results[j % group_size] = new_res;
5824 else
5825 /* Not SLP - we have one scalar to keep in SCALAR_RESULTS. */
5826 scalar_results.safe_push (new_temp);
5829 if ((STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5830 == INTEGER_INDUC_COND_REDUCTION)
5831 && !operand_equal_p (initial_def, induc_val, 0))
5833 /* Earlier we set the initial value to be a vector of induc_val
5834 values. Check the result and if it is induc_val then replace
5835 with the original initial value, unless induc_val is
5836 the same as initial_def already. */
5837 tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp,
5838 induc_val);
5840 tree tmp = make_ssa_name (new_scalar_dest);
5841 epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
5842 initial_def, new_temp);
5843 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5844 scalar_results[0] = tmp;
5848 vect_finalize_reduction:
5850 if (double_reduc)
5851 loop = loop->inner;
5853 /* 2.5 Adjust the final result by the initial value of the reduction
5854 variable. (When such adjustment is not needed, then
5855 'adjustment_def' is zero). For example, if code is PLUS we create:
5856 new_temp = loop_exit_def + adjustment_def */
5858 if (adjustment_def)
5860 gcc_assert (!slp_reduc);
5861 if (nested_in_vect_loop)
5863 new_phi = new_phis[0];
5864 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) == VECTOR_TYPE);
5865 expr = build2 (code, vectype, PHI_RESULT (new_phi), adjustment_def);
5866 new_dest = vect_create_destination_var (scalar_dest, vectype);
5868 else
5870 new_temp = scalar_results[0];
5871 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
5872 expr = build2 (code, scalar_type, new_temp, adjustment_def);
5873 new_dest = vect_create_destination_var (scalar_dest, scalar_type);
5876 epilog_stmt = gimple_build_assign (new_dest, expr);
5877 new_temp = make_ssa_name (new_dest, epilog_stmt);
5878 gimple_assign_set_lhs (epilog_stmt, new_temp);
5879 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5880 if (nested_in_vect_loop)
5882 set_vinfo_for_stmt (epilog_stmt,
5883 new_stmt_vec_info (epilog_stmt, loop_vinfo));
5884 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (epilog_stmt)) =
5885 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (new_phi));
5887 if (!double_reduc)
5888 scalar_results.quick_push (new_temp);
5889 else
5890 scalar_results[0] = new_temp;
5892 else
5893 scalar_results[0] = new_temp;
5895 new_phis[0] = epilog_stmt;
5898 /* 2.6 Handle the loop-exit phis. Replace the uses of scalar loop-exit
5899 phis with new adjusted scalar results, i.e., replace use <s_out0>
5900 with use <s_out4>.
5902 Transform:
5903 loop_exit:
5904 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
5905 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
5906 v_out2 = reduce <v_out1>
5907 s_out3 = extract_field <v_out2, 0>
5908 s_out4 = adjust_result <s_out3>
5909 use <s_out0>
5910 use <s_out0>
5912 into:
5914 loop_exit:
5915 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
5916 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
5917 v_out2 = reduce <v_out1>
5918 s_out3 = extract_field <v_out2, 0>
5919 s_out4 = adjust_result <s_out3>
5920 use <s_out4>
5921 use <s_out4> */
5924 /* In an SLP reduction chain we reduce vector results into one vector if
5925 necessary; hence we set GROUP_SIZE to 1 here. SCALAR_DEST is the LHS of
5926 the last stmt in the reduction chain, since we are looking for the loop
5927 exit phi node. */
5928 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
5930 gimple *dest_stmt = SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1];
5931 /* Handle reduction patterns. */
5932 if (STMT_VINFO_RELATED_STMT (vinfo_for_stmt (dest_stmt)))
5933 dest_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (dest_stmt));
5935 scalar_dest = gimple_assign_lhs (dest_stmt);
5936 group_size = 1;
5939 /* In SLP we may have several statements in NEW_PHIS and REDUCTION_PHIS (in
5940 case GROUP_SIZE is greater than the vectorization factor). Therefore, we
5941 need to match SCALAR_RESULTS with corresponding statements. The first
5942 (GROUP_SIZE / number of new vector stmts) scalar results correspond to
5943 the first vector stmt, etc.
5944 (RATIO is equal to (GROUP_SIZE / number of new vector stmts)). */
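  /* For instance (illustration only): with GROUP_SIZE == 4 and two vector
     statements in NEW_PHIS, RATIO is 2; scalar results 0 and 1 are matched
     with the first vector statement and results 2 and 3 with the second.  */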
5945 if (group_size > new_phis.length ())
5947 ratio = group_size / new_phis.length ();
5948 gcc_assert (!(group_size % new_phis.length ()));
5950 else
5951 ratio = 1;
5953 for (k = 0; k < group_size; k++)
5955 if (k % ratio == 0)
5957 epilog_stmt = new_phis[k / ratio];
5958 reduction_phi = reduction_phis[k / ratio];
5959 if (double_reduc)
5960 inner_phi = inner_phis[k / ratio];
5963 if (slp_reduc)
5965 gimple *current_stmt = SLP_TREE_SCALAR_STMTS (slp_node)[k];
5967 orig_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (current_stmt));
5968 /* SLP statements can't participate in patterns. */
5969 gcc_assert (!orig_stmt);
5970 scalar_dest = gimple_assign_lhs (current_stmt);
5973 phis.create (3);
5974 /* Find the loop-closed-use at the loop exit of the original scalar
5975 result. (The reduction result is expected to have two immediate uses -
5976 one at the latch block, and one at the loop exit). */
5977 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
5978 if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p)))
5979 && !is_gimple_debug (USE_STMT (use_p)))
5980 phis.safe_push (USE_STMT (use_p));
5982 /* While we expect to have found an exit_phi because of loop-closed-ssa
5983 form, we can end up without one if the scalar cycle is dead. */
5985 FOR_EACH_VEC_ELT (phis, i, exit_phi)
5987 if (outer_loop)
5989 stmt_vec_info exit_phi_vinfo = vinfo_for_stmt (exit_phi);
5990 gphi *vect_phi;
5992 /* FORNOW. Currently not supporting the case that an inner-loop
5993 reduction is not used in the outer-loop (but only outside the
5994 outer-loop), unless it is a double reduction. */
5995 gcc_assert ((STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
5996 && !STMT_VINFO_LIVE_P (exit_phi_vinfo))
5997 || double_reduc);
5999 if (double_reduc)
6000 STMT_VINFO_VEC_STMT (exit_phi_vinfo) = inner_phi;
6001 else
6002 STMT_VINFO_VEC_STMT (exit_phi_vinfo) = epilog_stmt;
6003 if (!double_reduc
6004 || STMT_VINFO_DEF_TYPE (exit_phi_vinfo)
6005 != vect_double_reduction_def)
6006 continue;
6008 /* Handle double reduction:
6010 stmt1: s1 = phi <s0, s2> - double reduction phi (outer loop)
6011 stmt2: s3 = phi <s1, s4> - (regular) reduc phi (inner loop)
6012 stmt3: s4 = use (s3) - (regular) reduc stmt (inner loop)
6013 stmt4: s2 = phi <s4> - double reduction stmt (outer loop)
6015 At that point the regular reduction (stmt2 and stmt3) is
6016 already vectorized, as well as the exit phi node, stmt4.
6017 Here we vectorize the phi node of double reduction, stmt1, and
6018 update all relevant statements. */
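            /* At the source level this corresponds (roughly; the exact
               gimple depends on earlier transforms) to outer-loop
               vectorization of a nest such as
                 for (i = 0; i < n1; i++)
                   for (j = 0; j < n2; j++)
                     l += a[j];
               where s1/s2 are the accumulator phis of the outer loop and
               s3/s4 those of the inner loop.  */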
6020 /* Go through all the uses of s2 to find double reduction phi
6021 node, i.e., stmt1 above. */
6022 orig_name = PHI_RESULT (exit_phi);
6023 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
6025 stmt_vec_info use_stmt_vinfo;
6026 stmt_vec_info new_phi_vinfo;
6027 tree vect_phi_init, preheader_arg, vect_phi_res;
6028 basic_block bb = gimple_bb (use_stmt);
6029 gimple *use;
6031 /* Check that USE_STMT is really a double reduction phi
6032 node. */
6033 if (gimple_code (use_stmt) != GIMPLE_PHI
6034 || gimple_phi_num_args (use_stmt) != 2
6035 || bb->loop_father != outer_loop)
6036 continue;
6037 use_stmt_vinfo = vinfo_for_stmt (use_stmt);
6038 if (!use_stmt_vinfo
6039 || STMT_VINFO_DEF_TYPE (use_stmt_vinfo)
6040 != vect_double_reduction_def)
6041 continue;
6043 /* Create vector phi node for double reduction:
6044 vs1 = phi <vs0, vs2>
6045 vs1 was created previously in this function by a call to
6046 vect_get_vec_def_for_operand and is stored in
6047 vec_initial_def;
6048 vs2 is defined by INNER_PHI, the vectorized EXIT_PHI;
6049 vs0 is created here. */
6051 /* Create vector phi node. */
6052 vect_phi = create_phi_node (vec_initial_def, bb);
6053 new_phi_vinfo = new_stmt_vec_info (vect_phi,
6054 loop_vec_info_for_loop (outer_loop));
6055 set_vinfo_for_stmt (vect_phi, new_phi_vinfo);
6057 /* Create vs0 - initial def of the double reduction phi. */
6058 preheader_arg = PHI_ARG_DEF_FROM_EDGE (use_stmt,
6059 loop_preheader_edge (outer_loop));
6060 vect_phi_init = get_initial_def_for_reduction
6061 (stmt, preheader_arg, NULL);
6063 /* Update phi node arguments with vs0 and vs2. */
6064 add_phi_arg (vect_phi, vect_phi_init,
6065 loop_preheader_edge (outer_loop),
6066 UNKNOWN_LOCATION);
6067 add_phi_arg (vect_phi, PHI_RESULT (inner_phi),
6068 loop_latch_edge (outer_loop), UNKNOWN_LOCATION);
6069 if (dump_enabled_p ())
6071 dump_printf_loc (MSG_NOTE, vect_location,
6072 "created double reduction phi node: ");
6073 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, vect_phi, 0);
6076 vect_phi_res = PHI_RESULT (vect_phi);
6078 /* Replace the use, i.e., set the correct vs1 in the regular
6079 reduction phi node. FORNOW, NCOPIES is always 1, so the
6080 loop is redundant. */
6081 use = reduction_phi;
6082 for (j = 0; j < ncopies; j++)
6084 edge pr_edge = loop_preheader_edge (loop);
6085 SET_PHI_ARG_DEF (use, pr_edge->dest_idx, vect_phi_res);
6086 use = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (use));
6092 phis.release ();
6093 if (nested_in_vect_loop)
6095 if (double_reduc)
6096 loop = outer_loop;
6097 else
6098 continue;
6101 phis.create (3);
6102 /* Find the loop-closed-use at the loop exit of the original scalar
6103 result. (The reduction result is expected to have two immediate uses,
6104 one at the latch block, and one at the loop exit). For double
6105 reductions we are looking for exit phis of the outer loop. */
6106 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
6108 if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
6110 if (!is_gimple_debug (USE_STMT (use_p)))
6111 phis.safe_push (USE_STMT (use_p));
6113 else
6115 if (double_reduc && gimple_code (USE_STMT (use_p)) == GIMPLE_PHI)
6117 tree phi_res = PHI_RESULT (USE_STMT (use_p));
6119 FOR_EACH_IMM_USE_FAST (phi_use_p, phi_imm_iter, phi_res)
6121 if (!flow_bb_inside_loop_p (loop,
6122 gimple_bb (USE_STMT (phi_use_p)))
6123 && !is_gimple_debug (USE_STMT (phi_use_p)))
6124 phis.safe_push (USE_STMT (phi_use_p));
6130 FOR_EACH_VEC_ELT (phis, i, exit_phi)
6132 /* Replace the uses: */
6133 orig_name = PHI_RESULT (exit_phi);
6134 scalar_result = scalar_results[k];
6135 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
6136 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
6137 SET_USE (use_p, scalar_result);
6140 phis.release ();
6144 /* Return a vector of type VECTYPE that is equal to the vector select
6145 operation "MASK ? VEC : IDENTITY". Insert the select statements
6146 before GSI. */
6148 static tree
6149 merge_with_identity (gimple_stmt_iterator *gsi, tree mask, tree vectype,
6150 tree vec, tree identity)
6152 tree cond = make_temp_ssa_name (vectype, NULL, "cond");
6153 gimple *new_stmt = gimple_build_assign (cond, VEC_COND_EXPR,
6154 mask, vec, identity);
6155 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
6156 return cond;
6159 /* Successively apply CODE to each element of VECTOR_RHS, in left-to-right
6160 order, starting with LHS. Insert the extraction statements before GSI and
6161 associate the new scalar SSA names with variable SCALAR_DEST.
6162 Return the SSA name for the result. */
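/* For example (illustrative only): with a four-element VECTOR_RHS and
   CODE == PLUS_EXPR the emitted chain is
     s0 = LHS + VECTOR_RHS[0];
     s1 = s0  + VECTOR_RHS[1];
     s2 = s1  + VECTOR_RHS[2];
     s3 = s2  + VECTOR_RHS[3];
   and s3 is returned, preserving the strict left-to-right evaluation order
   that an in-order (fold-left) reduction requires.  */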
6164 static tree
6165 vect_expand_fold_left (gimple_stmt_iterator *gsi, tree scalar_dest,
6166 tree_code code, tree lhs, tree vector_rhs)
6168 tree vectype = TREE_TYPE (vector_rhs);
6169 tree scalar_type = TREE_TYPE (vectype);
6170 tree bitsize = TYPE_SIZE (scalar_type);
6171 unsigned HOST_WIDE_INT vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
6172 unsigned HOST_WIDE_INT element_bitsize = tree_to_uhwi (bitsize);
6174 for (unsigned HOST_WIDE_INT bit_offset = 0;
6175 bit_offset < vec_size_in_bits;
6176 bit_offset += element_bitsize)
6178 tree bitpos = bitsize_int (bit_offset);
6179 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vector_rhs,
6180 bitsize, bitpos);
6182 gassign *stmt = gimple_build_assign (scalar_dest, rhs);
6183 rhs = make_ssa_name (scalar_dest, stmt);
6184 gimple_assign_set_lhs (stmt, rhs);
6185 gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
6187 stmt = gimple_build_assign (scalar_dest, code, lhs, rhs);
6188 tree new_name = make_ssa_name (scalar_dest, stmt);
6189 gimple_assign_set_lhs (stmt, new_name);
6190 gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
6191 lhs = new_name;
6193 return lhs;
6196 /* Perform an in-order reduction (FOLD_LEFT_REDUCTION). STMT is the
6197 statement that sets the live-out value. REDUC_DEF_STMT is the phi
6198 statement. CODE is the operation performed by STMT and OPS are
6199 its scalar operands. REDUC_INDEX is the index of the operand in
6200 OPS that is set by REDUC_DEF_STMT. REDUC_FN is the function that
6201 implements in-order reduction, or IFN_LAST if we should open-code it.
6202 VECTYPE_IN is the type of the vector input. MASKS specifies the masks
6203 that should be used to control the operation in a fully-masked loop. */
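/* Illustrative motivation (hedged; the decision to use FOLD_LEFT_REDUCTION
   is made during analysis, not here): a loop such as
     double res = init;
     for (int i = 0; i < n; i++)
       res += a[i];
   compiled without permission to reassociate must keep the additions in
   source order, so instead of a tree-shaped reduction we either use a
   target in-order reduction function (REDUC_FN) or expand the chain
   element by element via vect_expand_fold_left.  */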
6205 static bool
6206 vectorize_fold_left_reduction (gimple *stmt, gimple_stmt_iterator *gsi,
6207 gimple **vec_stmt, slp_tree slp_node,
6208 gimple *reduc_def_stmt,
6209 tree_code code, internal_fn reduc_fn,
6210 tree ops[3], tree vectype_in,
6211 int reduc_index, vec_loop_masks *masks)
6213 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
6214 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
6215 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6216 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
6217 gimple *new_stmt = NULL;
6219 int ncopies;
6220 if (slp_node)
6221 ncopies = 1;
6222 else
6223 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
6225 gcc_assert (!nested_in_vect_loop_p (loop, stmt));
6226 gcc_assert (ncopies == 1);
6227 gcc_assert (TREE_CODE_LENGTH (code) == binary_op);
6228 gcc_assert (reduc_index == (code == MINUS_EXPR ? 0 : 1));
6229 gcc_assert (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
6230 == FOLD_LEFT_REDUCTION);
6232 if (slp_node)
6233 gcc_assert (known_eq (TYPE_VECTOR_SUBPARTS (vectype_out),
6234 TYPE_VECTOR_SUBPARTS (vectype_in)));
6236 tree op0 = ops[1 - reduc_index];
6238 int group_size = 1;
6239 gimple *scalar_dest_def;
6240 auto_vec<tree> vec_oprnds0;
6241 if (slp_node)
6243 vect_get_vec_defs (op0, NULL_TREE, stmt, &vec_oprnds0, NULL, slp_node);
6244 group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
6245 scalar_dest_def = SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1];
6247 else
6249 tree loop_vec_def0 = vect_get_vec_def_for_operand (op0, stmt);
6250 vec_oprnds0.create (1);
6251 vec_oprnds0.quick_push (loop_vec_def0);
6252 scalar_dest_def = stmt;
6255 tree scalar_dest = gimple_assign_lhs (scalar_dest_def);
6256 tree scalar_type = TREE_TYPE (scalar_dest);
6257 tree reduc_var = gimple_phi_result (reduc_def_stmt);
6259 int vec_num = vec_oprnds0.length ();
6260 gcc_assert (vec_num == 1 || slp_node);
6261 tree vec_elem_type = TREE_TYPE (vectype_out);
6262 gcc_checking_assert (useless_type_conversion_p (scalar_type, vec_elem_type));
6264 tree vector_identity = NULL_TREE;
6265 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
6266 vector_identity = build_zero_cst (vectype_out);
6268 tree scalar_dest_var = vect_create_destination_var (scalar_dest, NULL);
6269 int i;
6270 tree def0;
6271 FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
6273 tree mask = NULL_TREE;
6274 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
6275 mask = vect_get_loop_mask (gsi, masks, vec_num, vectype_in, i);
6277 /* Handle MINUS by adding the negative. */
6278 if (reduc_fn != IFN_LAST && code == MINUS_EXPR)
6280 tree negated = make_ssa_name (vectype_out);
6281 new_stmt = gimple_build_assign (negated, NEGATE_EXPR, def0);
6282 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
6283 def0 = negated;
6286 if (mask)
6287 def0 = merge_with_identity (gsi, mask, vectype_out, def0,
6288 vector_identity);
6290 /* On the first iteration the input is simply the scalar phi
6291 result, and for subsequent iterations it is the output of
6292 the preceding operation. */
6293 if (reduc_fn != IFN_LAST)
6295 new_stmt = gimple_build_call_internal (reduc_fn, 2, reduc_var, def0);
6296 /* For chained SLP reductions the output of the previous reduction
6297 operation serves as the input of the next. For the final statement
6298 the output cannot be a temporary - we reuse the original
6299 scalar destination of the last statement. */
6300 if (i != vec_num - 1)
6302 gimple_set_lhs (new_stmt, scalar_dest_var);
6303 reduc_var = make_ssa_name (scalar_dest_var, new_stmt);
6304 gimple_set_lhs (new_stmt, reduc_var);
6307 else
6309 reduc_var = vect_expand_fold_left (gsi, scalar_dest_var, code,
6310 reduc_var, def0);
6311 new_stmt = SSA_NAME_DEF_STMT (reduc_var);
6312 /* Remove the statement, so that we can use the same code paths
6313 as for statements that we've just created. */
6314 gimple_stmt_iterator tmp_gsi = gsi_for_stmt (new_stmt);
6315 gsi_remove (&tmp_gsi, false);
6318 if (i == vec_num - 1)
6320 gimple_set_lhs (new_stmt, scalar_dest);
6321 vect_finish_replace_stmt (scalar_dest_def, new_stmt);
6323 else
6324 vect_finish_stmt_generation (scalar_dest_def, new_stmt, gsi);
6326 if (slp_node)
6327 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
6330 if (!slp_node)
6331 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
6333 return true;
6336 /* Function is_nonwrapping_integer_induction.
6338 Check if STMT (which is part of loop LOOP) is an integer induction
6339 whose value cannot wrap (cause overflow). */
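/* For example (illustration only): with BASE 0, STEP 4 and at most NI
   iterations, the largest value the induction reaches is 0 + 4 * NI;
   the induction is accepted only if that value fits in the precision of
   the phi result type, or if the type has undefined overflow so that
   wrapping cannot validly occur.  */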
6341 static bool
6342 is_nonwrapping_integer_induction (gimple *stmt, struct loop *loop)
6344 stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
6345 tree base = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo);
6346 tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo);
6347 tree lhs_type = TREE_TYPE (gimple_phi_result (stmt));
6348 widest_int ni, max_loop_value, lhs_max;
6349 bool overflow = false;
6351 /* Make sure the loop is integer based. */
6352 if (TREE_CODE (base) != INTEGER_CST
6353 || TREE_CODE (step) != INTEGER_CST)
6354 return false;
6356 /* Check that the max size of the loop will not wrap. */
6358 if (TYPE_OVERFLOW_UNDEFINED (lhs_type))
6359 return true;
6361 if (! max_stmt_executions (loop, &ni))
6362 return false;
6364 max_loop_value = wi::mul (wi::to_widest (step), ni, TYPE_SIGN (lhs_type),
6365 &overflow);
6366 if (overflow)
6367 return false;
6369 max_loop_value = wi::add (wi::to_widest (base), max_loop_value,
6370 TYPE_SIGN (lhs_type), &overflow);
6371 if (overflow)
6372 return false;
6374 return (wi::min_precision (max_loop_value, TYPE_SIGN (lhs_type))
6375 <= TYPE_PRECISION (lhs_type));
6378 /* Function vectorizable_reduction.
6380 Check if STMT performs a reduction operation that can be vectorized.
6381 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
6382 stmt to replace it, put it in VEC_STMT, and insert it at GSI.
6383 Return FALSE if not a vectorizable STMT, TRUE otherwise.
6385 This function also handles reduction idioms (patterns) that have been
6386 recognized in advance during vect_pattern_recog. In this case, STMT may be
6387 of this form:
6388 X = pattern_expr (arg0, arg1, ..., X)
6389 and its STMT_VINFO_RELATED_STMT points to the last stmt in the original
6390 sequence that had been detected and replaced by the pattern-stmt (STMT).
6392 This function also handles reduction of condition expressions, for example:
6393 for (int i = 0; i < N; i++)
6394 if (a[i] < value)
6395 last = a[i];
6396 This is handled by vectorizing the loop and creating an additional vector
6397 containing the loop indexes for which "a[i] < value" was true. In the
6398 function epilogue this is reduced to a single max value and then used to
6399 index into the vector of results.
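   For instance (an illustrative case, not taken from the sources): with
   a[] = { 3, 7, 2, 9 } and value == 5, the condition holds in iterations
   0 and 2; the epilogue takes the maximum of the recorded loop indexes
   (iteration 2) and uses it to pick a[2] == 2 out of the vector of
   results, which is the value the scalar loop would leave in "last".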
6401 In some cases of reduction patterns, the type of the reduction variable X is
6402 different from the type of the other arguments of STMT.
6403 In such cases, the vectype that is used when transforming STMT into a vector
6404 stmt is different than the vectype that is used to determine the
6405 vectorization factor, because it consists of a different number of elements
6406 than the actual number of elements that are being operated upon in parallel.
6408 For example, consider an accumulation of shorts into an int accumulator.
6409 On some targets it's possible to vectorize this pattern operating on 8
6410 shorts at a time (hence, the vectype for purposes of determining the
6411 vectorization factor should be V8HI); on the other hand, the vectype that
6412 is used to create the vector form is actually V4SI (the type of the result).
6414 Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
6415 indicates the actual level of parallelism (V8HI in the example), so
6416 that the right vectorization factor is derived. This vectype
6417 corresponds to the type of arguments to the reduction stmt, and should *NOT*
6418 be used to create the vectorized stmt. The right vectype for the vectorized
6419 stmt is obtained from the type of the result X:
6420 get_vectype_for_scalar_type (TREE_TYPE (X))
6422 This means that, contrary to "regular" reductions (or "regular" stmts in
6423 general), the following equation:
6424 STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (TREE_TYPE (X))
6425 does *NOT* necessarily hold for reduction patterns. */
6427 bool
6428 vectorizable_reduction (gimple *stmt, gimple_stmt_iterator *gsi,
6429 gimple **vec_stmt, slp_tree slp_node,
6430 slp_instance slp_node_instance)
6432 tree vec_dest;
6433 tree scalar_dest;
6434 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
6435 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
6436 tree vectype_in = NULL_TREE;
6437 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
6438 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6439 enum tree_code code, orig_code;
6440 internal_fn reduc_fn;
6441 machine_mode vec_mode;
6442 int op_type;
6443 optab optab;
6444 tree new_temp = NULL_TREE;
6445 gimple *def_stmt;
6446 enum vect_def_type dt, cond_reduc_dt = vect_unknown_def_type;
6447 gimple *cond_reduc_def_stmt = NULL;
6448 enum tree_code cond_reduc_op_code = ERROR_MARK;
6449 tree scalar_type;
6450 bool is_simple_use;
6451 gimple *orig_stmt;
6452 stmt_vec_info orig_stmt_info = NULL;
6453 int i;
6454 int ncopies;
6455 int epilog_copies;
6456 stmt_vec_info prev_stmt_info, prev_phi_info;
6457 bool single_defuse_cycle = false;
6458 gimple *new_stmt = NULL;
6459 int j;
6460 tree ops[3];
6461 enum vect_def_type dts[3];
6462 bool nested_cycle = false, found_nested_cycle_def = false;
6463 bool double_reduc = false;
6464 basic_block def_bb;
6465 struct loop * def_stmt_loop, *outer_loop = NULL;
6466 tree def_arg;
6467 gimple *def_arg_stmt;
6468 auto_vec<tree> vec_oprnds0;
6469 auto_vec<tree> vec_oprnds1;
6470 auto_vec<tree> vec_oprnds2;
6471 auto_vec<tree> vect_defs;
6472 auto_vec<gimple *> phis;
6473 int vec_num;
6474 tree def0, tem;
6475 bool first_p = true;
6476 tree cr_index_scalar_type = NULL_TREE, cr_index_vector_type = NULL_TREE;
6477 tree cond_reduc_val = NULL_TREE;
6479 /* Make sure it was already recognized as a reduction computation. */
6480 if (STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmt)) != vect_reduction_def
6481 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmt)) != vect_nested_cycle)
6482 return false;
6484 if (nested_in_vect_loop_p (loop, stmt))
6486 outer_loop = loop;
6487 loop = loop->inner;
6488 nested_cycle = true;
6491 /* In case of reduction chain we switch to the first stmt in the chain, but
6492 we don't update STMT_INFO, since only the last stmt is marked as reduction
6493 and has reduction properties. */
6494 if (GROUP_FIRST_ELEMENT (stmt_info)
6495 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
6497 stmt = GROUP_FIRST_ELEMENT (stmt_info);
6498 first_p = false;
6501 if (gimple_code (stmt) == GIMPLE_PHI)
6503 /* Analysis is fully done on the reduction stmt invocation. */
6504 if (! vec_stmt)
6506 if (slp_node)
6507 slp_node_instance->reduc_phis = slp_node;
6509 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
6510 return true;
6513 if (STMT_VINFO_REDUC_TYPE (stmt_info) == FOLD_LEFT_REDUCTION)
6514 /* Leave the scalar phi in place. Note that checking
6515 STMT_VINFO_VEC_REDUCTION_TYPE (as below) only works
6516 for reductions involving a single statement. */
6517 return true;
6519 gimple *reduc_stmt = STMT_VINFO_REDUC_DEF (stmt_info);
6520 if (STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (reduc_stmt)))
6521 reduc_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (reduc_stmt));
6523 if (STMT_VINFO_VEC_REDUCTION_TYPE (vinfo_for_stmt (reduc_stmt))
6524 == EXTRACT_LAST_REDUCTION)
6525 /* Leave the scalar phi in place. */
6526 return true;
6528 gcc_assert (is_gimple_assign (reduc_stmt));
6529 for (unsigned k = 1; k < gimple_num_ops (reduc_stmt); ++k)
6531 tree op = gimple_op (reduc_stmt, k);
6532 if (op == gimple_phi_result (stmt))
6533 continue;
6534 if (k == 1
6535 && gimple_assign_rhs_code (reduc_stmt) == COND_EXPR)
6536 continue;
6537 if (!vectype_in
6538 || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in)))
6539 < GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (op)))))
6540 vectype_in = get_vectype_for_scalar_type (TREE_TYPE (op));
6541 break;
6543 gcc_assert (vectype_in);
6545 if (slp_node)
6546 ncopies = 1;
6547 else
6548 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
6550 use_operand_p use_p;
6551 gimple *use_stmt;
6552 if (ncopies > 1
6553 && (STMT_VINFO_RELEVANT (vinfo_for_stmt (reduc_stmt))
6554 <= vect_used_only_live)
6555 && single_imm_use (gimple_phi_result (stmt), &use_p, &use_stmt)
6556 && (use_stmt == reduc_stmt
6557 || (STMT_VINFO_RELATED_STMT (vinfo_for_stmt (use_stmt))
6558 == reduc_stmt)))
6559 single_defuse_cycle = true;
6561 /* Create the destination vector */
6562 scalar_dest = gimple_assign_lhs (reduc_stmt);
6563 vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
6565 if (slp_node)
6566 /* The size vect_schedule_slp_instance computes is off for us. */
6567 vec_num = vect_get_num_vectors
6568 (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
6569 * SLP_TREE_SCALAR_STMTS (slp_node).length (),
6570 vectype_in);
6571 else
6572 vec_num = 1;
6574 /* Generate the reduction PHIs upfront. */
6575 prev_phi_info = NULL;
6576 for (j = 0; j < ncopies; j++)
6578 if (j == 0 || !single_defuse_cycle)
6580 for (i = 0; i < vec_num; i++)
6582 /* Create the reduction-phi that defines the reduction
6583 operand. */
6584 gimple *new_phi = create_phi_node (vec_dest, loop->header);
6585 set_vinfo_for_stmt (new_phi,
6586 new_stmt_vec_info (new_phi, loop_vinfo));
6588 if (slp_node)
6589 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_phi);
6590 else
6592 if (j == 0)
6593 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_phi;
6594 else
6595 STMT_VINFO_RELATED_STMT (prev_phi_info) = new_phi;
6596 prev_phi_info = vinfo_for_stmt (new_phi);
6602 return true;
6605 /* 1. Is vectorizable reduction? */
6606 /* Not supportable if the reduction variable is used in the loop, unless
6607 it's a reduction chain. */
6608 if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer
6609 && !GROUP_FIRST_ELEMENT (stmt_info))
6610 return false;
6612 /* Reductions that are not used even in an enclosing outer-loop
6613 are expected to be "live" (used out of the loop). */
6614 if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope
6615 && !STMT_VINFO_LIVE_P (stmt_info))
6616 return false;
6618 /* 2. Has this been recognized as a reduction pattern?
6620 Check if STMT represents a pattern that has been recognized
6621 in earlier analysis stages. For stmts that represent a pattern,
6622 the STMT_VINFO_RELATED_STMT field records the last stmt in
6623 the original sequence that constitutes the pattern. */
6625 orig_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt));
6626 if (orig_stmt)
6628 orig_stmt_info = vinfo_for_stmt (orig_stmt);
6629 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
6630 gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
6633 /* 3. Check the operands of the operation. The first operands are defined
6634 inside the loop body. The last operand is the reduction variable,
6635 which is defined by the loop-header-phi. */
6637 gcc_assert (is_gimple_assign (stmt));
6639 /* Flatten RHS. */
6640 switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
6642 case GIMPLE_BINARY_RHS:
6643 code = gimple_assign_rhs_code (stmt);
6644 op_type = TREE_CODE_LENGTH (code);
6645 gcc_assert (op_type == binary_op);
6646 ops[0] = gimple_assign_rhs1 (stmt);
6647 ops[1] = gimple_assign_rhs2 (stmt);
6648 break;
6650 case GIMPLE_TERNARY_RHS:
6651 code = gimple_assign_rhs_code (stmt);
6652 op_type = TREE_CODE_LENGTH (code);
6653 gcc_assert (op_type == ternary_op);
6654 ops[0] = gimple_assign_rhs1 (stmt);
6655 ops[1] = gimple_assign_rhs2 (stmt);
6656 ops[2] = gimple_assign_rhs3 (stmt);
6657 break;
6659 case GIMPLE_UNARY_RHS:
6660 return false;
6662 default:
6663 gcc_unreachable ();
6666 if (code == COND_EXPR && slp_node)
6667 return false;
6669 scalar_dest = gimple_assign_lhs (stmt);
6670 scalar_type = TREE_TYPE (scalar_dest);
6671 if (!POINTER_TYPE_P (scalar_type) && !INTEGRAL_TYPE_P (scalar_type)
6672 && !SCALAR_FLOAT_TYPE_P (scalar_type))
6673 return false;
6675 /* Do not try to vectorize bit-precision reductions. */
6676 if (!type_has_mode_precision_p (scalar_type))
6677 return false;
6679 /* All uses but the last are expected to be defined in the loop.
6680 The last use is the reduction variable. In case of nested cycle this
6681 assumption is not true: we use reduc_index to record the index of the
6682 reduction variable. */
6683 gimple *reduc_def_stmt = NULL;
6684 int reduc_index = -1;
6685 for (i = 0; i < op_type; i++)
6687 /* The condition of COND_EXPR is checked in vectorizable_condition(). */
6688 if (i == 0 && code == COND_EXPR)
6689 continue;
6691 is_simple_use = vect_is_simple_use (ops[i], loop_vinfo,
6692 &def_stmt, &dts[i], &tem);
6693 dt = dts[i];
6694 gcc_assert (is_simple_use);
6695 if (dt == vect_reduction_def)
6697 reduc_def_stmt = def_stmt;
6698 reduc_index = i;
6699 continue;
6701 else if (tem)
6703 /* To properly compute ncopies we are interested in the widest
6704 input type in case we're looking at a widening accumulation. */
6705 if (!vectype_in
6706 || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in)))
6707 < GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (tem)))))
6708 vectype_in = tem;
6711 if (dt != vect_internal_def
6712 && dt != vect_external_def
6713 && dt != vect_constant_def
6714 && dt != vect_induction_def
6715 && !(dt == vect_nested_cycle && nested_cycle))
6716 return false;
6718 if (dt == vect_nested_cycle)
6720 found_nested_cycle_def = true;
6721 reduc_def_stmt = def_stmt;
6722 reduc_index = i;
6725 if (i == 1 && code == COND_EXPR)
6727 /* Record how value of COND_EXPR is defined. */
6728 if (dt == vect_constant_def)
6730 cond_reduc_dt = dt;
6731 cond_reduc_val = ops[i];
6733 if (dt == vect_induction_def
6734 && def_stmt != NULL
6735 && is_nonwrapping_integer_induction (def_stmt, loop))
6737 cond_reduc_dt = dt;
6738 cond_reduc_def_stmt = def_stmt;
6743 if (!vectype_in)
6744 vectype_in = vectype_out;
6746 /* When vectorizing a reduction chain w/o SLP the reduction PHI is not
6747 directly used in stmt. */
6748 if (reduc_index == -1)
6750 if (STMT_VINFO_REDUC_TYPE (stmt_info) == FOLD_LEFT_REDUCTION)
6752 if (dump_enabled_p ())
6753 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6754 "in-order reduction chain without SLP.\n");
6755 return false;
6758 if (orig_stmt)
6759 reduc_def_stmt = STMT_VINFO_REDUC_DEF (orig_stmt_info);
6760 else
6761 reduc_def_stmt = STMT_VINFO_REDUC_DEF (stmt_info);
6764 if (! reduc_def_stmt || gimple_code (reduc_def_stmt) != GIMPLE_PHI)
6765 return false;
6767 if (!(reduc_index == -1
6768 || dts[reduc_index] == vect_reduction_def
6769 || dts[reduc_index] == vect_nested_cycle
6770 || ((dts[reduc_index] == vect_internal_def
6771 || dts[reduc_index] == vect_external_def
6772 || dts[reduc_index] == vect_constant_def
6773 || dts[reduc_index] == vect_induction_def)
6774 && nested_cycle && found_nested_cycle_def)))
6776 /* For pattern recognized stmts, orig_stmt might be a reduction,
6777 but some helper statements for the pattern might not, or
6778 might be COND_EXPRs with reduction uses in the condition. */
6779 gcc_assert (orig_stmt);
6780 return false;
6783 stmt_vec_info reduc_def_info = vinfo_for_stmt (reduc_def_stmt);
6784 enum vect_reduction_type v_reduc_type
6785 = STMT_VINFO_REDUC_TYPE (reduc_def_info);
6786 gimple *tmp = STMT_VINFO_REDUC_DEF (reduc_def_info);
6788 STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) = v_reduc_type;
6789 /* If we have a condition reduction, see if we can simplify it further. */
6790 if (v_reduc_type == COND_REDUCTION)
6792 /* TODO: We can't yet handle reduction chains, since we need to treat
6793 each COND_EXPR in the chain specially, not just the last one.
6794 E.g. for:
6796 x_1 = PHI <x_3, ...>
6797 x_2 = a_2 ? ... : x_1;
6798 x_3 = a_3 ? ... : x_2;
6800 we're interested in the last element in x_3 for which a_2 || a_3
6801 is true, whereas the current reduction chain handling would
6802 vectorize x_2 as a normal VEC_COND_EXPR and only treat x_3
6803 as a reduction operation. */
6804 if (reduc_index == -1)
6806 if (dump_enabled_p ())
6807 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6808 "conditional reduction chains not supported\n");
6809 return false;
6812 /* vect_is_simple_reduction ensured that operand 2 is the
6813 loop-carried operand. */
6814 gcc_assert (reduc_index == 2);
6816 /* Loop peeling modifies the initial value of the reduction PHI, which
6817 makes the reduction stmt to be transformed different from the
6818 stmt that was originally analyzed. We therefore need to record the
6819 reduction code for a CONST_COND_REDUCTION at the analysis stage, so
6820 that it can be used directly at the transform stage. */
6821 if (STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info) == MAX_EXPR
6822 || STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info) == MIN_EXPR)
6824 /* Also set the reduction type to CONST_COND_REDUCTION. */
6825 gcc_assert (cond_reduc_dt == vect_constant_def);
6826 STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) = CONST_COND_REDUCTION;
6828 else if (direct_internal_fn_supported_p (IFN_FOLD_EXTRACT_LAST,
6829 vectype_in, OPTIMIZE_FOR_SPEED))
6831 if (dump_enabled_p ())
6832 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6833 "optimizing condition reduction with"
6834 " FOLD_EXTRACT_LAST.\n");
6835 STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) = EXTRACT_LAST_REDUCTION;
6837 else if (cond_reduc_dt == vect_induction_def)
6839 stmt_vec_info cond_stmt_vinfo = vinfo_for_stmt (cond_reduc_def_stmt);
6840 tree base
6841 = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (cond_stmt_vinfo);
6842 tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (cond_stmt_vinfo);
6844 gcc_assert (TREE_CODE (base) == INTEGER_CST
6845 && TREE_CODE (step) == INTEGER_CST);
6846 cond_reduc_val = NULL_TREE;
6847 /* Find a suitable value: below BASE for MAX_EXPR and above BASE for
6848 MIN_EXPR; for now punt if BASE is the minimum value of the type for
6849 MAX_EXPR or the maximum value of the type for MIN_EXPR. */
6850 if (tree_int_cst_sgn (step) == -1)
6852 cond_reduc_op_code = MIN_EXPR;
6853 if (tree_int_cst_sgn (base) == -1)
6854 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
6855 else if (tree_int_cst_lt (base,
6856 TYPE_MAX_VALUE (TREE_TYPE (base))))
6857 cond_reduc_val
6858 = int_const_binop (PLUS_EXPR, base, integer_one_node);
6860 else
6862 cond_reduc_op_code = MAX_EXPR;
6863 if (tree_int_cst_sgn (base) == 1)
6864 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
6865 else if (tree_int_cst_lt (TYPE_MIN_VALUE (TREE_TYPE (base)),
6866 base))
6867 cond_reduc_val
6868 = int_const_binop (MINUS_EXPR, base, integer_one_node);
6870 if (cond_reduc_val)
6872 if (dump_enabled_p ())
6873 dump_printf_loc (MSG_NOTE, vect_location,
6874 "condition expression based on "
6875 "integer induction.\n");
6876 STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
6877 = INTEGER_INDUC_COND_REDUCTION;
6880 else if (cond_reduc_dt == vect_constant_def)
6882 enum vect_def_type cond_initial_dt;
6883 gimple *def_stmt = SSA_NAME_DEF_STMT (ops[reduc_index]);
6884 tree cond_initial_val
6885 = PHI_ARG_DEF_FROM_EDGE (def_stmt, loop_preheader_edge (loop));
6887 gcc_assert (cond_reduc_val != NULL_TREE);
6888 vect_is_simple_use (cond_initial_val, loop_vinfo,
6889 &def_stmt, &cond_initial_dt);
6890 if (cond_initial_dt == vect_constant_def
6891 && types_compatible_p (TREE_TYPE (cond_initial_val),
6892 TREE_TYPE (cond_reduc_val)))
6894 tree e = fold_binary (LE_EXPR, boolean_type_node,
6895 cond_initial_val, cond_reduc_val);
6896 if (e && (integer_onep (e) || integer_zerop (e)))
6898 if (dump_enabled_p ())
6899 dump_printf_loc (MSG_NOTE, vect_location,
6900 "condition expression based on "
6901 "compile time constant.\n");
6902 /* Record reduction code at analysis stage. */
6903 STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info)
6904 = integer_onep (e) ? MAX_EXPR : MIN_EXPR;
6905 STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
6906 = CONST_COND_REDUCTION;
6912 if (orig_stmt)
6913 gcc_assert (tmp == orig_stmt
6914 || GROUP_FIRST_ELEMENT (vinfo_for_stmt (tmp)) == orig_stmt);
6915 else
6916 /* We changed STMT to be the first stmt in reduction chain, hence we
6917 check that in this case the first element in the chain is STMT. */
6918 gcc_assert (stmt == tmp
6919 || GROUP_FIRST_ELEMENT (vinfo_for_stmt (tmp)) == stmt);
6921 if (STMT_VINFO_LIVE_P (vinfo_for_stmt (reduc_def_stmt)))
6922 return false;
6924 if (slp_node)
6925 ncopies = 1;
6926 else
6927 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
6929 gcc_assert (ncopies >= 1);
6931 vec_mode = TYPE_MODE (vectype_in);
6932 poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
6934 if (code == COND_EXPR)
6936 /* Only call during the analysis stage, otherwise we'll lose
6937 STMT_VINFO_TYPE. */
6938 if (!vec_stmt && !vectorizable_condition (stmt, gsi, NULL,
6939 ops[reduc_index], 0, NULL))
6941 if (dump_enabled_p ())
6942 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6943 "unsupported condition in reduction\n");
6944 return false;
6947 else
6949 /* 4. Supportable by target? */
6951 if (code == LSHIFT_EXPR || code == RSHIFT_EXPR
6952 || code == LROTATE_EXPR || code == RROTATE_EXPR)
6954 /* Shifts and rotates are only supported by vectorizable_shift,
6955 not vectorizable_reduction. */
6956 if (dump_enabled_p ())
6957 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6958 "unsupported shift or rotation.\n");
6959 return false;
6962 /* 4.1. check support for the operation in the loop */
6963 optab = optab_for_tree_code (code, vectype_in, optab_default);
6964 if (!optab)
6966 if (dump_enabled_p ())
6967 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6968 "no optab.\n");
6970 return false;
6973 if (optab_handler (optab, vec_mode) == CODE_FOR_nothing)
6975 if (dump_enabled_p ())
6976 dump_printf (MSG_NOTE, "op not supported by target.\n");
6978 if (maybe_ne (GET_MODE_SIZE (vec_mode), UNITS_PER_WORD)
6979 || !vect_worthwhile_without_simd_p (loop_vinfo, code))
6980 return false;
6982 if (dump_enabled_p ())
6983 dump_printf (MSG_NOTE, "proceeding using word mode.\n");
6986 /* Worthwhile without SIMD support? */
6987 if (!VECTOR_MODE_P (TYPE_MODE (vectype_in))
6988 && !vect_worthwhile_without_simd_p (loop_vinfo, code))
6990 if (dump_enabled_p ())
6991 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6992 "not worthwhile without SIMD support.\n");
6994 return false;
6998 /* 4.2. Check support for the epilog operation.
7000 If STMT represents a reduction pattern, then the type of the
7001 reduction variable may be different from the type of the rest
7002 of the arguments. For example, consider the case of accumulation
7003 of shorts into an int accumulator. The original code:
7004 S1: int_a = (int) short_a;
7005 orig_stmt-> S2: int_acc = plus <int_a ,int_acc>;
7007 was replaced with:
7008 STMT: int_acc = widen_sum <short_a, int_acc>
7010 This means that:
7011 1. The tree-code that is used to create the vector operation in the
7012 epilog code (that reduces the partial results) is not the
7013 tree-code of STMT, but is rather the tree-code of the original
7014 stmt from the pattern that STMT is replacing. I.e, in the example
7015 above we want to use 'widen_sum' in the loop, but 'plus' in the
7016 epilog.
7017 2. The type (mode) we use to check available target support
7018 for the vector operation to be created in the *epilog*, is
7019 determined by the type of the reduction variable (in the example
7020 above we'd check this: optab_handler (plus_optab, vect_int_mode])).
7021 However the type (mode) we use to check available target support
7022 for the vector operation to be created *inside the loop*, is
7023 determined by the type of the other arguments to STMT (in the
7024 example we'd check this: optab_handler (widen_sum_optab,
7025 vect_short_mode)).
7027 This is contrary to "regular" reductions, in which the types of all
7028 the arguments are the same as the type of the reduction variable.
7029 For "regular" reductions we can therefore use the same vector type
7030 (and also the same tree-code) when generating the epilog code and
7031 when generating the code inside the loop. */
7033 vect_reduction_type reduction_type
7034 = STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info);
7035 if (orig_stmt
7036 && (reduction_type == TREE_CODE_REDUCTION
7037 || reduction_type == FOLD_LEFT_REDUCTION))
7039 /* This is a reduction pattern: get the vectype from the type of the
7040 reduction variable, and get the tree-code from orig_stmt. */
7041 orig_code = gimple_assign_rhs_code (orig_stmt);
7042 gcc_assert (vectype_out);
7043 vec_mode = TYPE_MODE (vectype_out);
7045 else
7047 /* Regular reduction: the same vectype and tree-code as used for
7048 the vector code inside the loop can be used for the epilog code. */
7049 orig_code = code;
7051 if (code == MINUS_EXPR)
7052 orig_code = PLUS_EXPR;
7054 /* For simple condition reductions, replace with the actual expression
7055 we want to base our reduction around. */
7056 if (reduction_type == CONST_COND_REDUCTION)
7058 orig_code = STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info);
7059 gcc_assert (orig_code == MAX_EXPR || orig_code == MIN_EXPR);
7061 else if (reduction_type == INTEGER_INDUC_COND_REDUCTION)
7062 orig_code = cond_reduc_op_code;
7065 if (nested_cycle)
7067 def_bb = gimple_bb (reduc_def_stmt);
7068 def_stmt_loop = def_bb->loop_father;
7069 def_arg = PHI_ARG_DEF_FROM_EDGE (reduc_def_stmt,
7070 loop_preheader_edge (def_stmt_loop));
7071 if (TREE_CODE (def_arg) == SSA_NAME
7072 && (def_arg_stmt = SSA_NAME_DEF_STMT (def_arg))
7073 && gimple_code (def_arg_stmt) == GIMPLE_PHI
7074 && flow_bb_inside_loop_p (outer_loop, gimple_bb (def_arg_stmt))
7075 && vinfo_for_stmt (def_arg_stmt)
7076 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_arg_stmt))
7077 == vect_double_reduction_def)
7078 double_reduc = true;
7081 reduc_fn = IFN_LAST;
7083 if (reduction_type == TREE_CODE_REDUCTION
7084 || reduction_type == FOLD_LEFT_REDUCTION
7085 || reduction_type == INTEGER_INDUC_COND_REDUCTION
7086 || reduction_type == CONST_COND_REDUCTION)
7088 if (reduction_type == FOLD_LEFT_REDUCTION
7089 ? fold_left_reduction_fn (orig_code, &reduc_fn)
7090 : reduction_fn_for_scalar_code (orig_code, &reduc_fn))
7092 if (reduc_fn != IFN_LAST
7093 && !direct_internal_fn_supported_p (reduc_fn, vectype_out,
7094 OPTIMIZE_FOR_SPEED))
7096 if (dump_enabled_p ())
7097 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7098 "reduc op not supported by target.\n");
7100 reduc_fn = IFN_LAST;
7103 else
7105 if (!nested_cycle || double_reduc)
7107 if (dump_enabled_p ())
7108 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7109 "no reduc code for scalar code.\n");
7111 return false;
7115 else if (reduction_type == COND_REDUCTION)
7117 int scalar_precision
7118 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
7119 cr_index_scalar_type = make_unsigned_type (scalar_precision);
7120 cr_index_vector_type = build_vector_type (cr_index_scalar_type,
7121 nunits_out);
7123 if (direct_internal_fn_supported_p (IFN_REDUC_MAX, cr_index_vector_type,
7124 OPTIMIZE_FOR_SPEED))
7125 reduc_fn = IFN_REDUC_MAX;
7128 if (reduction_type != EXTRACT_LAST_REDUCTION
7129 && reduc_fn == IFN_LAST
7130 && !nunits_out.is_constant ())
7132 if (dump_enabled_p ())
7133 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7134 "missing target support for reduction on"
7135 " variable-length vectors.\n");
7136 return false;
7139 if ((double_reduc || reduction_type != TREE_CODE_REDUCTION)
7140 && ncopies > 1)
7142 if (dump_enabled_p ())
7143 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7144 "multiple types in double reduction or condition "
7145 "reduction.\n");
7146 return false;
7149 /* For SLP reductions, see if there is a neutral value we can use. */
7150 tree neutral_op = NULL_TREE;
7151 if (slp_node)
7152 neutral_op
7153 = neutral_op_for_slp_reduction (slp_node_instance->reduc_phis, code,
7154 GROUP_FIRST_ELEMENT (stmt_info) != NULL);
7156 if (double_reduc && reduction_type == FOLD_LEFT_REDUCTION)
7158 /* We can't support in-order reductions of code such as this:
7160 for (int i = 0; i < n1; ++i)
7161 for (int j = 0; j < n2; ++j)
7162 l += a[j];
7164 since GCC effectively transforms the loop when vectorizing:
7166 for (int i = 0; i < n1 / VF; ++i)
7167 for (int j = 0; j < n2; ++j)
7168 for (int k = 0; k < VF; ++k)
7169 l += a[j];
7171 which is a reassociation of the original operation. */
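/* Hedged note (example values assumed): the reassociation shown above
   changes the evaluation order, which matters for in-order FP reductions.
   For instance, with float addition evaluated left to right,

     float l = 0.0f;
     l += 1e20f;  l += 1.0f;  l += -1e20f;   // yields 0.0f

   whereas grouping 1e20f + (-1e20f) first and adding 1.0f last yields 1.0f,
   so the two orders are not interchangeable under strict FP semantics.  */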
7172 if (dump_enabled_p ())
7173 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7174 "in-order double reduction not supported.\n");
7176 return false;
7179 if (reduction_type == FOLD_LEFT_REDUCTION
7180 && slp_node
7181 && !GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
7183 /* We cannot use in-order reductions in this case because there is
7184 an implicit reassociation of the operations involved. */
7185 if (dump_enabled_p ())
7186 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7187 "in-order unchained SLP reductions not supported.\n");
7188 return false;
7191 /* For double reductions, and for SLP reductions with a neutral value,
7192 we construct a variable-length initial vector by loading a vector
7193 full of the neutral value and then shift-and-inserting the start
7194 values into the low-numbered elements. */
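/* Hedged sketch (values invented): with a neutral value of 0 and a single
   start value S, the variable-length initial vector described above is
   built roughly as

     init = { 0, 0, ..., 0 }              // splat of the neutral value
     init = VEC_SHL_INSERT (init, S)      // conceptually { S, 0, ..., 0 }

   so only the low-numbered lane(s) carry the incoming scalar value(s).  */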
7195 if ((double_reduc || neutral_op)
7196 && !nunits_out.is_constant ()
7197 && !direct_internal_fn_supported_p (IFN_VEC_SHL_INSERT,
7198 vectype_out, OPTIMIZE_FOR_SPEED))
7200 if (dump_enabled_p ())
7201 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7202 "reduction on variable-length vectors requires"
7203 " target support for a vector-shift-and-insert"
7204 " operation.\n");
7205 return false;
7208 /* Check extra constraints for variable-length unchained SLP reductions. */
7209 if (STMT_SLP_TYPE (stmt_info)
7210 && !GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt))
7211 && !nunits_out.is_constant ())
7213 /* We checked above that we could build the initial vector when
7214 there's a neutral element value. Check here for the case in
7215 which each SLP statement has its own initial value and in which
7216 that value needs to be repeated for every instance of the
7217 statement within the initial vector. */
7218 unsigned int group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
7219 scalar_mode elt_mode = SCALAR_TYPE_MODE (TREE_TYPE (vectype_out));
7220 if (!neutral_op
7221 && !can_duplicate_and_interleave_p (group_size, elt_mode))
7223 if (dump_enabled_p ())
7224 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7225 "unsupported form of SLP reduction for"
7226 " variable-length vectors: cannot build"
7227 " initial vector.\n");
7228 return false;
7230 /* The epilogue code relies on the number of elements being a multiple
7231 of the group size. The duplicate-and-interleave approach to setting
7232 up the initial vector does too. */
7233 if (!multiple_p (nunits_out, group_size))
7235 if (dump_enabled_p ())
7236 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7237 "unsupported form of SLP reduction for"
7238 " variable-length vectors: the vector size"
7239 " is not a multiple of the number of results.\n");
7240 return false;
7244 /* In case of widening multiplication by a constant, we update the type
7245 of the constant to be the type of the other operand. We check that the
7246 constant fits the type in the pattern recognition pass. */
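/* Hedged example (source loop invented): a dot-product pattern with a
   constant operand, e.g.

     short a[N];
     int acc = 0;
     for (int i = 0; i < N; i++)
       acc += a[i] * 3;      // 3 starts out with a wider (int) type

   reaches this point with mismatched operand types, and the INTEGER_CST is
   folded to the type of the other operand here; whether the constant fits
   that narrower type was already checked by pattern recognition.  */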
7247 if (code == DOT_PROD_EXPR
7248 && !types_compatible_p (TREE_TYPE (ops[0]), TREE_TYPE (ops[1])))
7250 if (TREE_CODE (ops[0]) == INTEGER_CST)
7251 ops[0] = fold_convert (TREE_TYPE (ops[1]), ops[0]);
7252 else if (TREE_CODE (ops[1]) == INTEGER_CST)
7253 ops[1] = fold_convert (TREE_TYPE (ops[0]), ops[1]);
7254 else
7256 if (dump_enabled_p ())
7257 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7258 "invalid types in dot-prod\n");
7260 return false;
7264 if (reduction_type == COND_REDUCTION)
7266 widest_int ni;
7268 if (! max_loop_iterations (loop, &ni))
7270 if (dump_enabled_p ())
7271 dump_printf_loc (MSG_NOTE, vect_location,
7272 "loop count not known, cannot create cond "
7273 "reduction.\n");
7274 return false;
7276 /* Convert backedges to iterations. */
7277 ni += 1;
7279 /* The additional index will have the same type as the condition. Check
7280 that the loop iteration count fits into this type less one (because we
7281 use up the zero slot for when there are no matches). */
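/* Hedged numeric example (assumed width): if cr_index_scalar_type is an
   unsigned 8-bit type, max_index is 255; a loop of up to 254 iterations is
   acceptable (index 0 is reserved for "no match"), while ni >= 255 would
   make the induction index wrap, and the check below rejects the condition
   reduction.  */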
7282 tree max_index = TYPE_MAX_VALUE (cr_index_scalar_type);
7283 if (wi::geu_p (ni, wi::to_widest (max_index)))
7285 if (dump_enabled_p ())
7286 dump_printf_loc (MSG_NOTE, vect_location,
7287 "loop size is greater than data size.\n");
7288 return false;
7292 /* In case the vectorization factor (VF) is bigger than the number
7293 of elements that we can fit in a vectype (nunits), we have to generate
7294 more than one vector stmt, i.e. we need to "unroll" the
7295 vector stmt by a factor VF/nunits. For more details see documentation
7296 in vectorizable_operation. */
7298 /* If the reduction is used in an outer loop we need to generate
7299 VF intermediate results, like so (e.g. for ncopies=2):
7300 r0 = phi (init, r0)
7301 r1 = phi (init, r1)
7302 r0 = x0 + r0;
7303 r1 = x1 + r1;
7304 (i.e. we generate VF results in 2 registers).
7305 In this case we have a separate def-use cycle for each copy, and therefore
7306 for each copy we get the vector def for the reduction variable from the
7307 respective phi node created for this copy.
7309 Otherwise (the reduction is unused in the loop nest), we can combine
7310 together intermediate results, like so (e.g. for ncopies=2):
7311 r = phi (init, r)
7312 r = x0 + r;
7313 r = x1 + r;
7314 (i.e. we generate VF/2 results in a single register).
7315 In this case for each copy we get the vector def for the reduction variable
7316 from the vectorized reduction operation generated in the previous iteration.
7318 This only works when we see both the reduction PHI and its only consumer
7319 in vectorizable_reduction and there are no intermediate stmts
7320 participating. */
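/* Hedged restatement (no new facts): the test below therefore requires that
   the only in-loop use of the reduction PHI result is STMT itself (or the
   pattern stmt that replaced it).  E.g. for

     for (i = 0; i < n; i++)
       sum = sum + a[i];     // the PHI result "sum" feeds only this add

   the PHI and the add form the single def-use cycle described above.  */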
7321 use_operand_p use_p;
7322 gimple *use_stmt;
7323 if (ncopies > 1
7324 && (STMT_VINFO_RELEVANT (stmt_info) <= vect_used_only_live)
7325 && single_imm_use (gimple_phi_result (reduc_def_stmt), &use_p, &use_stmt)
7326 && (use_stmt == stmt
7327 || STMT_VINFO_RELATED_STMT (vinfo_for_stmt (use_stmt)) == stmt))
7329 single_defuse_cycle = true;
7330 epilog_copies = 1;
7332 else
7333 epilog_copies = ncopies;
7335 /* If the reduction stmt is one of the patterns that have a lane
7336 reduction embedded, we cannot handle the case of ! single_defuse_cycle. */
7337 if ((ncopies > 1
7338 && ! single_defuse_cycle)
7339 && (code == DOT_PROD_EXPR
7340 || code == WIDEN_SUM_EXPR
7341 || code == SAD_EXPR))
7343 if (dump_enabled_p ())
7344 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7345 "multi def-use cycle not possible for lane-reducing "
7346 "reduction operation\n");
7347 return false;
7350 if (slp_node)
7351 vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
7352 else
7353 vec_num = 1;
7355 internal_fn cond_fn = get_conditional_internal_fn (code);
7356 vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
7358 if (!vec_stmt) /* transformation not required. */
7360 if (first_p)
7361 vect_model_reduction_cost (stmt_info, reduc_fn, ncopies);
7362 if (loop_vinfo && LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo))
7364 if (reduction_type != FOLD_LEFT_REDUCTION
7365 && (cond_fn == IFN_LAST
7366 || !direct_internal_fn_supported_p (cond_fn, vectype_in,
7367 OPTIMIZE_FOR_SPEED)))
7369 if (dump_enabled_p ())
7370 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7371 "can't use a fully-masked loop because no"
7372 " conditional operation is available.\n");
7373 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
7375 else if (reduc_index == -1)
7377 if (dump_enabled_p ())
7378 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7379 "can't use a fully-masked loop for chained"
7380 " reductions.\n");
7381 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
7383 else
7384 vect_record_loop_mask (loop_vinfo, masks, ncopies * vec_num,
7385 vectype_in);
7387 if (dump_enabled_p ()
7388 && reduction_type == FOLD_LEFT_REDUCTION)
7389 dump_printf_loc (MSG_NOTE, vect_location,
7390 "using an in-order (fold-left) reduction.\n");
7391 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
7392 return true;
7395 /* Transform. */
7397 if (dump_enabled_p ())
7398 dump_printf_loc (MSG_NOTE, vect_location, "transform reduction.\n");
7400 /* FORNOW: Multiple types are not supported for condition. */
7401 if (code == COND_EXPR)
7402 gcc_assert (ncopies == 1);
7404 bool masked_loop_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
7406 if (reduction_type == FOLD_LEFT_REDUCTION)
7407 return vectorize_fold_left_reduction
7408 (stmt, gsi, vec_stmt, slp_node, reduc_def_stmt, code,
7409 reduc_fn, ops, vectype_in, reduc_index, masks);
7411 if (reduction_type == EXTRACT_LAST_REDUCTION)
7413 gcc_assert (!slp_node);
7414 return vectorizable_condition (stmt, gsi, vec_stmt,
7415 NULL, reduc_index, NULL);
7418 /* Create the destination vector */
7419 vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
7421 prev_stmt_info = NULL;
7422 prev_phi_info = NULL;
7423 if (!slp_node)
7425 vec_oprnds0.create (1);
7426 vec_oprnds1.create (1);
7427 if (op_type == ternary_op)
7428 vec_oprnds2.create (1);
7431 phis.create (vec_num);
7432 vect_defs.create (vec_num);
7433 if (!slp_node)
7434 vect_defs.quick_push (NULL_TREE);
7436 if (slp_node)
7437 phis.splice (SLP_TREE_VEC_STMTS (slp_node_instance->reduc_phis));
7438 else
7439 phis.quick_push (STMT_VINFO_VEC_STMT (vinfo_for_stmt (reduc_def_stmt)));
7441 for (j = 0; j < ncopies; j++)
7443 if (code == COND_EXPR)
7445 gcc_assert (!slp_node);
7446 vectorizable_condition (stmt, gsi, vec_stmt,
7447 PHI_RESULT (phis[0]),
7448 reduc_index, NULL);
7449 /* Multiple types are not supported for condition. */
7450 break;
7453 /* Handle uses. */
7454 if (j == 0)
7456 if (slp_node)
7458 /* Get vec defs for all the operands except the reduction index,
7459 ensuring the ordering of the ops in the vector is kept. */
7460 auto_vec<tree, 3> slp_ops;
7461 auto_vec<vec<tree>, 3> vec_defs;
7463 slp_ops.quick_push (ops[0]);
7464 slp_ops.quick_push (ops[1]);
7465 if (op_type == ternary_op)
7466 slp_ops.quick_push (ops[2]);
7468 vect_get_slp_defs (slp_ops, slp_node, &vec_defs);
7470 vec_oprnds0.safe_splice (vec_defs[0]);
7471 vec_defs[0].release ();
7472 vec_oprnds1.safe_splice (vec_defs[1]);
7473 vec_defs[1].release ();
7474 if (op_type == ternary_op)
7476 vec_oprnds2.safe_splice (vec_defs[2]);
7477 vec_defs[2].release ();
7480 else
7482 vec_oprnds0.quick_push
7483 (vect_get_vec_def_for_operand (ops[0], stmt));
7484 vec_oprnds1.quick_push
7485 (vect_get_vec_def_for_operand (ops[1], stmt));
7486 if (op_type == ternary_op)
7487 vec_oprnds2.quick_push
7488 (vect_get_vec_def_for_operand (ops[2], stmt));
7491 else
7493 if (!slp_node)
7495 gcc_assert (reduc_index != -1 || ! single_defuse_cycle);
7497 if (single_defuse_cycle && reduc_index == 0)
7498 vec_oprnds0[0] = gimple_get_lhs (new_stmt);
7499 else
7500 vec_oprnds0[0]
7501 = vect_get_vec_def_for_stmt_copy (dts[0], vec_oprnds0[0]);
7502 if (single_defuse_cycle && reduc_index == 1)
7503 vec_oprnds1[0] = gimple_get_lhs (new_stmt);
7504 else
7505 vec_oprnds1[0]
7506 = vect_get_vec_def_for_stmt_copy (dts[1], vec_oprnds1[0]);
7507 if (op_type == ternary_op)
7509 if (single_defuse_cycle && reduc_index == 2)
7510 vec_oprnds2[0] = gimple_get_lhs (new_stmt);
7511 else
7512 vec_oprnds2[0]
7513 = vect_get_vec_def_for_stmt_copy (dts[2], vec_oprnds2[0]);
7518 FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
7520 tree vop[3] = { def0, vec_oprnds1[i], NULL_TREE };
7521 if (masked_loop_p)
7523 /* Make sure that the reduction accumulator is vop[0]. */
7524 if (reduc_index == 1)
7526 gcc_assert (commutative_tree_code (code));
7527 std::swap (vop[0], vop[1]);
7529 tree mask = vect_get_loop_mask (gsi, masks, vec_num * ncopies,
7530 vectype_in, i * ncopies + j);
7531 gcall *call = gimple_build_call_internal (cond_fn, 3, mask,
7532 vop[0], vop[1]);
7533 new_temp = make_ssa_name (vec_dest, call);
7534 gimple_call_set_lhs (call, new_temp);
7535 gimple_call_set_nothrow (call, true);
7536 new_stmt = call;
7538 else
7540 if (op_type == ternary_op)
7541 vop[2] = vec_oprnds2[i];
7543 new_temp = make_ssa_name (vec_dest, new_stmt);
7544 new_stmt = gimple_build_assign (new_temp, code,
7545 vop[0], vop[1], vop[2]);
7547 vect_finish_stmt_generation (stmt, new_stmt, gsi);
7549 if (slp_node)
7551 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
7552 vect_defs.quick_push (new_temp);
7554 else
7555 vect_defs[0] = new_temp;
7558 if (slp_node)
7559 continue;
7561 if (j == 0)
7562 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
7563 else
7564 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
7566 prev_stmt_info = vinfo_for_stmt (new_stmt);
7569 /* Finalize the reduction-phi (set its arguments) and create the
7570 epilog reduction code. */
7571 if ((!single_defuse_cycle || code == COND_EXPR) && !slp_node)
7572 vect_defs[0] = gimple_get_lhs (*vec_stmt);
7574 vect_create_epilog_for_reduction (vect_defs, stmt, reduc_def_stmt,
7575 epilog_copies, reduc_fn, phis,
7576 double_reduc, slp_node, slp_node_instance,
7577 cond_reduc_val, cond_reduc_op_code,
7578 neutral_op);
7580 return true;
7583 /* Function vect_min_worthwhile_factor.
7585 For a loop where we could vectorize the operation indicated by CODE,
7586 return the minimum vectorization factor that makes it worthwhile
7587 to use generic vectors. */
7588 static unsigned int
7589 vect_min_worthwhile_factor (enum tree_code code)
7591 switch (code)
7593 case PLUS_EXPR:
7594 case MINUS_EXPR:
7595 case NEGATE_EXPR:
7596 return 4;
7598 case BIT_AND_EXPR:
7599 case BIT_IOR_EXPR:
7600 case BIT_XOR_EXPR:
7601 case BIT_NOT_EXPR:
7602 return 2;
7604 default:
7605 return INT_MAX;
7609 /* Return true if VINFO indicates we are doing loop vectorization and if
7610 it is worth decomposing CODE operations into scalar operations for
7611 that loop's vectorization factor. */
7613 bool
7614 vect_worthwhile_without_simd_p (vec_info *vinfo, tree_code code)
7616 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
7617 unsigned HOST_WIDE_INT value;
7618 return (loop_vinfo
7619 && LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&value)
7620 && value >= vect_min_worthwhile_factor (code));
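/* Hedged usage sketch (factor values assumed): with a constant vectorization
   factor of 4, PLUS_EXPR is considered worthwhile without SIMD support
   (minimum factor 4), BIT_AND_EXPR would already be worthwhile at factor 2,
   and any tree code not listed above returns INT_MAX and is never
   worthwhile.  A caller might test e.g.

     if (vect_worthwhile_without_simd_p (loop_vinfo, PLUS_EXPR))
       ...   // emulate the vector add with word-mode arithmetic
*/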
7623 /* Function vectorizable_induction
7625 Check if PHI performs an induction computation that can be vectorized.
7626 If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
7627 phi to replace it, put it in VEC_STMT, and add it to the same basic block.
7628 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
7630 bool
7631 vectorizable_induction (gimple *phi,
7632 gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
7633 gimple **vec_stmt, slp_tree slp_node)
7635 stmt_vec_info stmt_info = vinfo_for_stmt (phi);
7636 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
7637 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7638 unsigned ncopies;
7639 bool nested_in_vect_loop = false;
7640 struct loop *iv_loop;
7641 tree vec_def;
7642 edge pe = loop_preheader_edge (loop);
7643 basic_block new_bb;
7644 tree new_vec, vec_init, vec_step, t;
7645 tree new_name;
7646 gimple *new_stmt;
7647 gphi *induction_phi;
7648 tree induc_def, vec_dest;
7649 tree init_expr, step_expr;
7650 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
7651 unsigned i;
7652 tree expr;
7653 gimple_seq stmts;
7654 imm_use_iterator imm_iter;
7655 use_operand_p use_p;
7656 gimple *exit_phi;
7657 edge latch_e;
7658 tree loop_arg;
7659 gimple_stmt_iterator si;
7660 basic_block bb = gimple_bb (phi);
7662 if (gimple_code (phi) != GIMPLE_PHI)
7663 return false;
7665 if (!STMT_VINFO_RELEVANT_P (stmt_info))
7666 return false;
7668 /* Make sure it was recognized as induction computation. */
7669 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
7670 return false;
7672 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
7673 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
7675 if (slp_node)
7676 ncopies = 1;
7677 else
7678 ncopies = vect_get_num_copies (loop_vinfo, vectype);
7679 gcc_assert (ncopies >= 1);
7681 /* FORNOW. These restrictions should be relaxed. */
7682 if (nested_in_vect_loop_p (loop, phi))
7684 imm_use_iterator imm_iter;
7685 use_operand_p use_p;
7686 gimple *exit_phi;
7687 edge latch_e;
7688 tree loop_arg;
7690 if (ncopies > 1)
7692 if (dump_enabled_p ())
7693 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7694 "multiple types in nested loop.\n");
7695 return false;
7698 /* FORNOW: outer loop induction with SLP not supported. */
7699 if (STMT_SLP_TYPE (stmt_info))
7700 return false;
7702 exit_phi = NULL;
7703 latch_e = loop_latch_edge (loop->inner);
7704 loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
7705 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
7707 gimple *use_stmt = USE_STMT (use_p);
7708 if (is_gimple_debug (use_stmt))
7709 continue;
7711 if (!flow_bb_inside_loop_p (loop->inner, gimple_bb (use_stmt)))
7713 exit_phi = use_stmt;
7714 break;
7717 if (exit_phi)
7719 stmt_vec_info exit_phi_vinfo = vinfo_for_stmt (exit_phi);
7720 if (!(STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
7721 && !STMT_VINFO_LIVE_P (exit_phi_vinfo)))
7723 if (dump_enabled_p ())
7724 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7725 "inner-loop induction only used outside "
7726 "of the outer vectorized loop.\n");
7727 return false;
7731 nested_in_vect_loop = true;
7732 iv_loop = loop->inner;
7734 else
7735 iv_loop = loop;
7736 gcc_assert (iv_loop == (gimple_bb (phi))->loop_father);
7738 if (slp_node && !nunits.is_constant ())
7740 /* The current SLP code creates the initial value element-by-element. */
7741 if (dump_enabled_p ())
7742 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7743 "SLP induction not supported for variable-length"
7744 " vectors.\n");
7745 return false;
7748 if (!vec_stmt) /* transformation not required. */
7750 STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
7751 if (dump_enabled_p ())
7752 dump_printf_loc (MSG_NOTE, vect_location,
7753 "=== vectorizable_induction ===\n");
7754 vect_model_induction_cost (stmt_info, ncopies);
7755 return true;
7758 /* Transform. */
7760 /* Compute a vector variable, initialized with the first VF values of
7761 the induction variable. E.g., for an iv with IV_PHI='X' and
7762 evolution S, for a vector of 4 units, we want to compute:
7763 [X, X + S, X + 2*S, X + 3*S]. */
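/* Hedged worked example (X, S and the unit count are assumed): with X = 0,
   S = 1 and 4 units, the initial vector is [0, 1, 2, 3] and the step added
   on every vector iteration is the splat [VF*S, ...] = [4, 4, 4, 4]:

     scalar i :  0  1  2  3   ->   4  5  6  7   -> ...
     vec_iv   : [0, 1, 2, 3]  ->  [4, 5, 6, 7]  -> ...
*/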
7765 if (dump_enabled_p ())
7766 dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n");
7768 latch_e = loop_latch_edge (iv_loop);
7769 loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
7771 step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info);
7772 gcc_assert (step_expr != NULL_TREE);
7774 pe = loop_preheader_edge (iv_loop);
7775 init_expr = PHI_ARG_DEF_FROM_EDGE (phi,
7776 loop_preheader_edge (iv_loop));
7778 stmts = NULL;
7779 if (!nested_in_vect_loop)
7781 /* Convert the initial value to the desired type. */
7782 tree new_type = TREE_TYPE (vectype);
7783 init_expr = gimple_convert (&stmts, new_type, init_expr);
7785 /* If we are using the loop mask to "peel" for alignment then we need
7786 to adjust the start value here. */
7787 tree skip_niters = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
7788 if (skip_niters != NULL_TREE)
7790 if (FLOAT_TYPE_P (vectype))
7791 skip_niters = gimple_build (&stmts, FLOAT_EXPR, new_type,
7792 skip_niters);
7793 else
7794 skip_niters = gimple_convert (&stmts, new_type, skip_niters);
7795 tree skip_step = gimple_build (&stmts, MULT_EXPR, new_type,
7796 skip_niters, step_expr);
7797 init_expr = gimple_build (&stmts, MINUS_EXPR, new_type,
7798 init_expr, skip_step);
7802 /* Convert the step to the desired type. */
7803 step_expr = gimple_convert (&stmts, TREE_TYPE (vectype), step_expr);
7805 if (stmts)
7807 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
7808 gcc_assert (!new_bb);
7811 /* Find the first insertion point in the BB. */
7812 si = gsi_after_labels (bb);
7814 /* For SLP induction we have to generate several IVs. For example,
7815 with group size 3 we need [i, i, i, i + S] [i + S, i + S, i + 2*S, i + 2*S]
7816 [i + 2*S, i + 3*S, i + 3*S, i + 3*S]. The step is the same uniform
7817 [VF*S, VF*S, VF*S, VF*S] for all of them. */
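/* Hedged worked instance of the comment above (i = 0, S = 1 assumed): the
   three IVs start as [0, 0, 0, 1], [1, 1, 2, 2], [2, 3, 3, 3], i.e. the
   concatenation 0,0,0,1,1,1,2,2,2,3,3,3 covers four group iterations, and
   each IV is then advanced by the uniform step [4, 4, 4, 4] (VF * S with
   VF = 4).  */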
7818 if (slp_node)
7820 /* Enforced above. */
7821 unsigned int const_nunits = nunits.to_constant ();
7823 /* Generate [VF*S, VF*S, ... ]. */
7824 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7826 expr = build_int_cst (integer_type_node, vf);
7827 expr = fold_convert (TREE_TYPE (step_expr), expr);
7829 else
7830 expr = build_int_cst (TREE_TYPE (step_expr), vf);
7831 new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
7832 expr, step_expr);
7833 if (! CONSTANT_CLASS_P (new_name))
7834 new_name = vect_init_vector (phi, new_name,
7835 TREE_TYPE (step_expr), NULL);
7836 new_vec = build_vector_from_val (vectype, new_name);
7837 vec_step = vect_init_vector (phi, new_vec, vectype, NULL);
7839 /* Now generate the IVs. */
7840 unsigned group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
7841 unsigned nvects = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
7842 unsigned elts = const_nunits * nvects;
7843 unsigned nivs = least_common_multiple (group_size,
7844 const_nunits) / const_nunits;
7845 gcc_assert (elts % group_size == 0);
7846 tree elt = init_expr;
7847 unsigned ivn;
7848 for (ivn = 0; ivn < nivs; ++ivn)
7850 tree_vector_builder elts (vectype, const_nunits, 1);
7851 stmts = NULL;
7852 for (unsigned eltn = 0; eltn < const_nunits; ++eltn)
7854 if (ivn*const_nunits + eltn >= group_size
7855 && (ivn * const_nunits + eltn) % group_size == 0)
7856 elt = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (elt),
7857 elt, step_expr);
7858 elts.quick_push (elt);
7860 vec_init = gimple_build_vector (&stmts, &elts);
7861 if (stmts)
7863 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
7864 gcc_assert (!new_bb);
7867 /* Create the induction-phi that defines the induction-operand. */
7868 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
7869 induction_phi = create_phi_node (vec_dest, iv_loop->header);
7870 set_vinfo_for_stmt (induction_phi,
7871 new_stmt_vec_info (induction_phi, loop_vinfo));
7872 induc_def = PHI_RESULT (induction_phi);
7874 /* Create the iv update inside the loop */
7875 vec_def = make_ssa_name (vec_dest);
7876 new_stmt = gimple_build_assign (vec_def, PLUS_EXPR, induc_def, vec_step);
7877 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
7878 set_vinfo_for_stmt (new_stmt, new_stmt_vec_info (new_stmt, loop_vinfo));
7880 /* Set the arguments of the phi node: */
7881 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
7882 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
7883 UNKNOWN_LOCATION);
7885 SLP_TREE_VEC_STMTS (slp_node).quick_push (induction_phi);
7888 /* Re-use IVs when we can. */
7889 if (ivn < nvects)
7891 unsigned vfp
7892 = least_common_multiple (group_size, const_nunits) / group_size;
7893 /* Generate [VF'*S, VF'*S, ... ]. */
7894 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7896 expr = build_int_cst (integer_type_node, vfp);
7897 expr = fold_convert (TREE_TYPE (step_expr), expr);
7899 else
7900 expr = build_int_cst (TREE_TYPE (step_expr), vfp);
7901 new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
7902 expr, step_expr);
7903 if (! CONSTANT_CLASS_P (new_name))
7904 new_name = vect_init_vector (phi, new_name,
7905 TREE_TYPE (step_expr), NULL);
7906 new_vec = build_vector_from_val (vectype, new_name);
7907 vec_step = vect_init_vector (phi, new_vec, vectype, NULL);
7908 for (; ivn < nvects; ++ivn)
7910 gimple *iv = SLP_TREE_VEC_STMTS (slp_node)[ivn - nivs];
7911 tree def;
7912 if (gimple_code (iv) == GIMPLE_PHI)
7913 def = gimple_phi_result (iv);
7914 else
7915 def = gimple_assign_lhs (iv);
7916 new_stmt = gimple_build_assign (make_ssa_name (vectype),
7917 PLUS_EXPR,
7918 def, vec_step);
7919 if (gimple_code (iv) == GIMPLE_PHI)
7920 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
7921 else
7923 gimple_stmt_iterator tgsi = gsi_for_stmt (iv);
7924 gsi_insert_after (&tgsi, new_stmt, GSI_CONTINUE_LINKING);
7926 set_vinfo_for_stmt (new_stmt,
7927 new_stmt_vec_info (new_stmt, loop_vinfo));
7928 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
7932 return true;
7935 /* Create the vector that holds the initial_value of the induction. */
7936 if (nested_in_vect_loop)
7938 /* iv_loop is nested in the loop to be vectorized. init_expr had already
7939 been created during vectorization of previous stmts. We obtain it
7940 from the STMT_VINFO_VEC_STMT of the defining stmt. */
7941 vec_init = vect_get_vec_def_for_operand (init_expr, phi);
7942 /* If the initial value is not of proper type, convert it. */
7943 if (!useless_type_conversion_p (vectype, TREE_TYPE (vec_init)))
7945 new_stmt
7946 = gimple_build_assign (vect_get_new_ssa_name (vectype,
7947 vect_simple_var,
7948 "vec_iv_"),
7949 VIEW_CONVERT_EXPR,
7950 build1 (VIEW_CONVERT_EXPR, vectype,
7951 vec_init));
7952 vec_init = gimple_assign_lhs (new_stmt);
7953 new_bb = gsi_insert_on_edge_immediate (loop_preheader_edge (iv_loop),
7954 new_stmt);
7955 gcc_assert (!new_bb);
7956 set_vinfo_for_stmt (new_stmt,
7957 new_stmt_vec_info (new_stmt, loop_vinfo));
7960 else
7962 /* iv_loop is the loop to be vectorized. Create:
7963 vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr) */
7964 stmts = NULL;
7965 new_name = gimple_convert (&stmts, TREE_TYPE (vectype), init_expr);
7967 unsigned HOST_WIDE_INT const_nunits;
7968 if (nunits.is_constant (&const_nunits))
7970 tree_vector_builder elts (vectype, const_nunits, 1);
7971 elts.quick_push (new_name);
7972 for (i = 1; i < const_nunits; i++)
7974 /* Create: new_name_i = new_name + step_expr */
7975 new_name = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (new_name),
7976 new_name, step_expr);
7977 elts.quick_push (new_name);
7979 /* Create a vector from [new_name_0, new_name_1, ...,
7980 new_name_nunits-1] */
7981 vec_init = gimple_build_vector (&stmts, &elts);
7983 else if (INTEGRAL_TYPE_P (TREE_TYPE (step_expr)))
7984 /* Build the initial value directly from a VEC_SERIES_EXPR. */
7985 vec_init = gimple_build (&stmts, VEC_SERIES_EXPR, vectype,
7986 new_name, step_expr);
7987 else
7989 /* Build:
7990 [base, base, base, ...]
7991 + (vectype) [0, 1, 2, ...] * [step, step, step, ...]. */
7992 gcc_assert (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)));
7993 gcc_assert (flag_associative_math);
7994 tree index = build_index_vector (vectype, 0, 1);
7995 tree base_vec = gimple_build_vector_from_val (&stmts, vectype,
7996 new_name);
7997 tree step_vec = gimple_build_vector_from_val (&stmts, vectype,
7998 step_expr);
7999 vec_init = gimple_build (&stmts, FLOAT_EXPR, vectype, index);
8000 vec_init = gimple_build (&stmts, MULT_EXPR, vectype,
8001 vec_init, step_vec);
8002 vec_init = gimple_build (&stmts, PLUS_EXPR, vectype,
8003 vec_init, base_vec);
8006 if (stmts)
8008 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
8009 gcc_assert (!new_bb);
8014 /* Create the vector that holds the step of the induction. */
8015 if (nested_in_vect_loop)
8016 /* iv_loop is nested in the loop to be vectorized. Generate:
8017 vec_step = [S, S, S, S] */
8018 new_name = step_expr;
8019 else
8021 /* iv_loop is the loop to be vectorized. Generate:
8022 vec_step = [VF*S, VF*S, VF*S, VF*S] */
8023 gimple_seq seq = NULL;
8024 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
8026 expr = build_int_cst (integer_type_node, vf);
8027 expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
8029 else
8030 expr = build_int_cst (TREE_TYPE (step_expr), vf);
8031 new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
8032 expr, step_expr);
8033 if (seq)
8035 new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
8036 gcc_assert (!new_bb);
8040 t = unshare_expr (new_name);
8041 gcc_assert (CONSTANT_CLASS_P (new_name)
8042 || TREE_CODE (new_name) == SSA_NAME);
8043 new_vec = build_vector_from_val (vectype, t);
8044 vec_step = vect_init_vector (phi, new_vec, vectype, NULL);
8047 /* Create the following def-use cycle:
8048 loop prolog:
8049 vec_init = ...
8050 vec_step = ...
8051 loop:
8052 vec_iv = PHI <vec_init, vec_loop>
8054 STMT
8056 vec_loop = vec_iv + vec_step; */
8058 /* Create the induction-phi that defines the induction-operand. */
8059 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
8060 induction_phi = create_phi_node (vec_dest, iv_loop->header);
8061 set_vinfo_for_stmt (induction_phi,
8062 new_stmt_vec_info (induction_phi, loop_vinfo));
8063 induc_def = PHI_RESULT (induction_phi);
8065 /* Create the iv update inside the loop */
8066 vec_def = make_ssa_name (vec_dest);
8067 new_stmt = gimple_build_assign (vec_def, PLUS_EXPR, induc_def, vec_step);
8068 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
8069 set_vinfo_for_stmt (new_stmt, new_stmt_vec_info (new_stmt, loop_vinfo));
8071 /* Set the arguments of the phi node: */
8072 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
8073 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
8074 UNKNOWN_LOCATION);
8076 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = induction_phi;
8078 /* In case the vectorization factor (VF) is bigger than the number
8079 of elements that we can fit in a vectype (nunits), we have to generate
8080 more than one vector stmt, i.e. we need to "unroll" the
8081 vector stmt by a factor VF/nunits. For more details see documentation
8082 in vectorizable_operation. */
8084 if (ncopies > 1)
8086 gimple_seq seq = NULL;
8087 stmt_vec_info prev_stmt_vinfo;
8088 /* FORNOW. This restriction should be relaxed. */
8089 gcc_assert (!nested_in_vect_loop);
8091 /* Create the vector that holds the step of the induction. */
8092 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
8094 expr = build_int_cst (integer_type_node, nunits);
8095 expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
8097 else
8098 expr = build_int_cst (TREE_TYPE (step_expr), nunits);
8099 new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
8100 expr, step_expr);
8101 if (seq)
8103 new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
8104 gcc_assert (!new_bb);
8107 t = unshare_expr (new_name);
8108 gcc_assert (CONSTANT_CLASS_P (new_name)
8109 || TREE_CODE (new_name) == SSA_NAME);
8110 new_vec = build_vector_from_val (vectype, t);
8111 vec_step = vect_init_vector (phi, new_vec, vectype, NULL);
8113 vec_def = induc_def;
8114 prev_stmt_vinfo = vinfo_for_stmt (induction_phi);
8115 for (i = 1; i < ncopies; i++)
8117 /* vec_i = vec_prev + vec_step */
8118 new_stmt = gimple_build_assign (vec_dest, PLUS_EXPR,
8119 vec_def, vec_step);
8120 vec_def = make_ssa_name (vec_dest, new_stmt);
8121 gimple_assign_set_lhs (new_stmt, vec_def);
8123 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
8124 set_vinfo_for_stmt (new_stmt,
8125 new_stmt_vec_info (new_stmt, loop_vinfo));
8126 STMT_VINFO_RELATED_STMT (prev_stmt_vinfo) = new_stmt;
8127 prev_stmt_vinfo = vinfo_for_stmt (new_stmt);
8131 if (nested_in_vect_loop)
8133 /* Find the loop-closed exit-phi of the induction, and record
8134 the final vector of induction results: */
8135 exit_phi = NULL;
8136 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
8138 gimple *use_stmt = USE_STMT (use_p);
8139 if (is_gimple_debug (use_stmt))
8140 continue;
8142 if (!flow_bb_inside_loop_p (iv_loop, gimple_bb (use_stmt)))
8144 exit_phi = use_stmt;
8145 break;
8148 if (exit_phi)
8150 stmt_vec_info stmt_vinfo = vinfo_for_stmt (exit_phi);
8151 /* FORNOW. Currently not supporting the case that an inner-loop induction
8152 is not used in the outer-loop (i.e. only outside the outer-loop). */
8153 gcc_assert (STMT_VINFO_RELEVANT_P (stmt_vinfo)
8154 && !STMT_VINFO_LIVE_P (stmt_vinfo));
8156 STMT_VINFO_VEC_STMT (stmt_vinfo) = new_stmt;
8157 if (dump_enabled_p ())
8159 dump_printf_loc (MSG_NOTE, vect_location,
8160 "vector of inductions after inner-loop:");
8161 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, new_stmt, 0);
8167 if (dump_enabled_p ())
8169 dump_printf_loc (MSG_NOTE, vect_location,
8170 "transform induction: created def-use cycle: ");
8171 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, induction_phi, 0);
8172 dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
8173 SSA_NAME_DEF_STMT (vec_def), 0);
8176 return true;
8179 /* Function vectorizable_live_operation.
8181 STMT computes a value that is used outside the loop. Check if
8182 it can be supported. */
8184 bool
8185 vectorizable_live_operation (gimple *stmt,
8186 gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
8187 slp_tree slp_node, int slp_index,
8188 gimple **vec_stmt)
8190 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
8191 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
8192 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8193 imm_use_iterator imm_iter;
8194 tree lhs, lhs_type, bitsize, vec_bitsize;
8195 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
8196 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
8197 int ncopies;
8198 gimple *use_stmt;
8199 auto_vec<tree> vec_oprnds;
8200 int vec_entry = 0;
8201 poly_uint64 vec_index = 0;
8203 gcc_assert (STMT_VINFO_LIVE_P (stmt_info));
8205 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def)
8206 return false;
8208 /* FORNOW. CHECKME. */
8209 if (nested_in_vect_loop_p (loop, stmt))
8210 return false;
8212 /* If STMT is not relevant and it is a simple assignment and its inputs are
8213 invariant then it can remain in place, unvectorized. The original last
8214 scalar value that it computes will be used. */
8215 if (!STMT_VINFO_RELEVANT_P (stmt_info))
8217 gcc_assert (is_simple_and_all_uses_invariant (stmt, loop_vinfo));
8218 if (dump_enabled_p ())
8219 dump_printf_loc (MSG_NOTE, vect_location,
8220 "statement is simple and uses invariant. Leaving in "
8221 "place.\n");
8222 return true;
8225 if (slp_node)
8226 ncopies = 1;
8227 else
8228 ncopies = vect_get_num_copies (loop_vinfo, vectype);
8230 if (slp_node)
8232 gcc_assert (slp_index >= 0);
8234 int num_scalar = SLP_TREE_SCALAR_STMTS (slp_node).length ();
8235 int num_vec = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
8237 /* Get the last occurrence of the scalar index from the concatenation of
8238 all the slp vectors. Calculate which slp vector it is and the index
8239 within. */
8240 poly_uint64 pos = (num_vec * nunits) - num_scalar + slp_index;
8242 /* Calculate which vector contains the result, and which lane of
8243 that vector we need. */
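/* Hedged numeric example (sizes assumed): with num_vec = 2, nunits = 4 and
   num_scalar = 2, the concatenated lanes hold s0,s1,s0,s1,s0,s1,s0,s1; for
   slp_index = 0 this gives pos = 2*4 - 2 + 0 = 6, hence vec_entry = 1 and
   vec_index = 2, i.e. the last occurrence of s0 is lane 2 of the second
   vector.  */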
8244 if (!can_div_trunc_p (pos, nunits, &vec_entry, &vec_index))
8246 if (dump_enabled_p ())
8247 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8248 "Cannot determine which vector holds the"
8249 " final result.\n");
8250 return false;
8254 if (!vec_stmt)
8256 /* No transformation required. */
8257 if (LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo))
8259 if (!direct_internal_fn_supported_p (IFN_EXTRACT_LAST, vectype,
8260 OPTIMIZE_FOR_SPEED))
8262 if (dump_enabled_p ())
8263 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8264 "can't use a fully-masked loop because "
8265 "the target doesn't support extract last "
8266 "reduction.\n");
8267 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
8269 else if (slp_node)
8271 if (dump_enabled_p ())
8272 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8273 "can't use a fully-masked loop because an "
8274 "SLP statement is live after the loop.\n");
8275 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
8277 else if (ncopies > 1)
8279 if (dump_enabled_p ())
8280 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8281 "can't use a fully-masked loop because"
8282 " ncopies is greater than 1.\n");
8283 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
8285 else
8287 gcc_assert (ncopies == 1 && !slp_node);
8288 vect_record_loop_mask (loop_vinfo,
8289 &LOOP_VINFO_MASKS (loop_vinfo),
8290 1, vectype);
8293 return true;
8296 /* If stmt has a related stmt, then use that for getting the lhs. */
8297 if (is_pattern_stmt_p (stmt_info))
8298 stmt = STMT_VINFO_RELATED_STMT (stmt_info);
8300 lhs = (is_a <gphi *> (stmt)) ? gimple_phi_result (stmt)
8301 : gimple_get_lhs (stmt);
8302 lhs_type = TREE_TYPE (lhs);
8304 bitsize = (VECTOR_BOOLEAN_TYPE_P (vectype)
8305 ? bitsize_int (TYPE_PRECISION (TREE_TYPE (vectype)))
8306 : TYPE_SIZE (TREE_TYPE (vectype)));
8307 vec_bitsize = TYPE_SIZE (vectype);
8309 /* Get the vectorized lhs of STMT and the lane to use (counted in bits). */
8310 tree vec_lhs, bitstart;
8311 if (slp_node)
8313 gcc_assert (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo));
8315 /* Get the correct slp vectorized stmt. */
8316 gimple *vec_stmt = SLP_TREE_VEC_STMTS (slp_node)[vec_entry];
8317 if (gphi *phi = dyn_cast <gphi *> (vec_stmt))
8318 vec_lhs = gimple_phi_result (phi);
8319 else
8320 vec_lhs = gimple_get_lhs (vec_stmt);
8322 /* Get entry to use. */
8323 bitstart = bitsize_int (vec_index);
8324 bitstart = int_const_binop (MULT_EXPR, bitsize, bitstart);
8326 else
8328 enum vect_def_type dt = STMT_VINFO_DEF_TYPE (stmt_info);
8329 vec_lhs = vect_get_vec_def_for_operand_1 (stmt, dt);
8330 gcc_checking_assert (ncopies == 1
8331 || !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo));
8333 /* For multiple copies, get the last copy. */
8334 for (int i = 1; i < ncopies; ++i)
8335 vec_lhs = vect_get_vec_def_for_stmt_copy (vect_unknown_def_type,
8336 vec_lhs);
8338 /* Get the last lane in the vector. */
8339 bitstart = int_const_binop (MINUS_EXPR, vec_bitsize, bitsize);
8342 gimple_seq stmts = NULL;
8343 tree new_tree;
8344 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
8346 /* Emit:
8348 SCALAR_RES = EXTRACT_LAST <VEC_LHS, MASK>
8350 where VEC_LHS is the vectorized live-out result and MASK is
8351 the loop mask for the final iteration. */
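/* Hedged sketch (lane count assumed): for 4-element vectors where only the
   first three lanes are active in the final iteration, MASK = {1,1,1,0} and

     SCALAR_RES = EXTRACT_LAST <VEC_LHS, {1,1,1,0}>

   yields lane 2 of VEC_LHS, the value produced by the last scalar
   iteration.  */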
8352 gcc_assert (ncopies == 1 && !slp_node);
8353 tree scalar_type = TREE_TYPE (STMT_VINFO_VECTYPE (stmt_info));
8354 tree scalar_res = make_ssa_name (scalar_type);
8355 tree mask = vect_get_loop_mask (gsi, &LOOP_VINFO_MASKS (loop_vinfo),
8356 1, vectype, 0);
8357 gcall *new_stmt = gimple_build_call_internal (IFN_EXTRACT_LAST,
8358 2, mask, vec_lhs);
8359 gimple_call_set_lhs (new_stmt, scalar_res);
8360 gimple_seq_add_stmt (&stmts, new_stmt);
8362 /* Convert the extracted vector element to the required scalar type. */
8363 new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
8365 else
8367 tree bftype = TREE_TYPE (vectype);
8368 if (VECTOR_BOOLEAN_TYPE_P (vectype))
8369 bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1);
8370 new_tree = build3 (BIT_FIELD_REF, bftype, vec_lhs, bitsize, bitstart);
8371 new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree),
8372 &stmts, true, NULL_TREE);
8375 if (stmts)
8376 gsi_insert_seq_on_edge_immediate (single_exit (loop), stmts);
8378 /* Replace uses of lhs with the newly computed result. If the use stmt is
8379 a single-argument PHI, just replace all uses of the PHI result. This is
8380 necessary because the lcssa PHI defining lhs may come before the newly inserted stmt. */
8381 use_operand_p use_p;
8382 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
8383 if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt))
8384 && !is_gimple_debug (use_stmt))
8386 if (gimple_code (use_stmt) == GIMPLE_PHI
8387 && gimple_phi_num_args (use_stmt) == 1)
8389 replace_uses_by (gimple_phi_result (use_stmt), new_tree);
8391 else
8393 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
8394 SET_USE (use_p, new_tree);
8396 update_stmt (use_stmt);
8399 return true;
8402 /* Kill any debug uses outside LOOP of SSA names defined in STMT. */
8404 static void
8405 vect_loop_kill_debug_uses (struct loop *loop, gimple *stmt)
8407 ssa_op_iter op_iter;
8408 imm_use_iterator imm_iter;
8409 def_operand_p def_p;
8410 gimple *ustmt;
8412 FOR_EACH_PHI_OR_STMT_DEF (def_p, stmt, op_iter, SSA_OP_DEF)
8414 FOR_EACH_IMM_USE_STMT (ustmt, imm_iter, DEF_FROM_PTR (def_p))
8416 basic_block bb;
8418 if (!is_gimple_debug (ustmt))
8419 continue;
8421 bb = gimple_bb (ustmt);
8423 if (!flow_bb_inside_loop_p (loop, bb))
8425 if (gimple_debug_bind_p (ustmt))
8427 if (dump_enabled_p ())
8428 dump_printf_loc (MSG_NOTE, vect_location,
8429 "killing debug use\n");
8431 gimple_debug_bind_reset_value (ustmt);
8432 update_stmt (ustmt);
8434 else
8435 gcc_unreachable ();
8441 /* Given loop represented by LOOP_VINFO, return true if computation of
8442 LOOP_VINFO_NITERS (= LOOP_VINFO_NITERSM1 + 1) doesn't overflow, false
8443 otherwise. */
8445 static bool
8446 loop_niters_no_overflow (loop_vec_info loop_vinfo)
8448 /* Constant case. */
8449 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
8451 tree cst_niters = LOOP_VINFO_NITERS (loop_vinfo);
8452 tree cst_nitersm1 = LOOP_VINFO_NITERSM1 (loop_vinfo);
8454 gcc_assert (TREE_CODE (cst_niters) == INTEGER_CST);
8455 gcc_assert (TREE_CODE (cst_nitersm1) == INTEGER_CST);
8456 if (wi::to_widest (cst_nitersm1) < wi::to_widest (cst_niters))
8457 return true;
8460 widest_int max;
8461 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8462 /* Check the upper bound of loop niters. */
8463 if (get_max_loop_iterations (loop, &max))
8465 tree type = TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo));
8466 signop sgn = TYPE_SIGN (type);
8467 widest_int type_max = widest_int::from (wi::max_value (type), sgn);
8468 if (max < type_max)
8469 return true;
8471 return false;
8474 /* Return a mask type with half the number of elements as TYPE. */
8476 tree
8477 vect_halve_mask_nunits (tree type)
8479 poly_uint64 nunits = exact_div (TYPE_VECTOR_SUBPARTS (type), 2);
8480 return build_truth_vector_type (nunits, current_vector_size);
8483 /* Return a mask type with twice as many elements as TYPE. */
8485 tree
8486 vect_double_mask_nunits (tree type)
8488 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (type) * 2;
8489 return build_truth_vector_type (nunits, current_vector_size);
8492 /* Record that a fully-masked version of LOOP_VINFO would need MASKS to
8493 contain a sequence of NVECTORS masks that each control a vector of type
8494 VECTYPE. */
8496 void
8497 vect_record_loop_mask (loop_vec_info loop_vinfo, vec_loop_masks *masks,
8498 unsigned int nvectors, tree vectype)
8500 gcc_assert (nvectors != 0);
8501 if (masks->length () < nvectors)
8502 masks->safe_grow_cleared (nvectors);
8503 rgroup_masks *rgm = &(*masks)[nvectors - 1];
8504 /* The number of scalars per iteration and the number of vectors are
8505 both compile-time constants. */
8506 unsigned int nscalars_per_iter
8507 = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
8508 LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
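/* Hedged worked example (shapes assumed): with a vectorization factor of 8,
   recording NVECTORS = 2 masks for a VECTYPE of 8 elements gives
   nscalars_per_iter = 2 * 8 / 8 = 2, i.e. the rgroup at index 1 controls
   two scalars per scalar iteration and its mask type has the same number of
   elements as VECTYPE.  */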
8509 if (rgm->max_nscalars_per_iter < nscalars_per_iter)
8511 rgm->max_nscalars_per_iter = nscalars_per_iter;
8512 rgm->mask_type = build_same_sized_truth_vector_type (vectype);
8516 /* Given a complete set of masks MASKS, extract mask number INDEX
8517 for an rgroup that operates on NVECTORS vectors of type VECTYPE,
8518 where 0 <= INDEX < NVECTORS. Insert any set-up statements before GSI.
8520 See the comment above vec_loop_masks for more details about the mask
8521 arrangement. */
8523 tree
8524 vect_get_loop_mask (gimple_stmt_iterator *gsi, vec_loop_masks *masks,
8525 unsigned int nvectors, tree vectype, unsigned int index)
8527 rgroup_masks *rgm = &(*masks)[nvectors - 1];
8528 tree mask_type = rgm->mask_type;
8530 /* Populate the rgroup's mask array, if this is the first time we've
8531 used it. */
8532 if (rgm->masks.is_empty ())
8534 rgm->masks.safe_grow_cleared (nvectors);
8535 for (unsigned int i = 0; i < nvectors; ++i)
8537 tree mask = make_temp_ssa_name (mask_type, NULL, "loop_mask");
8538 /* Provide a dummy definition until the real one is available. */
8539 SSA_NAME_DEF_STMT (mask) = gimple_build_nop ();
8540 rgm->masks[i] = mask;
8544 tree mask = rgm->masks[index];
8545 if (maybe_ne (TYPE_VECTOR_SUBPARTS (mask_type),
8546 TYPE_VECTOR_SUBPARTS (vectype)))
8548 /* A loop mask for data type X can be reused for data type Y
8549 if X has N times more elements than Y and if Y's elements
8550 are N times bigger than X's. In this case each sequence
8551 of N elements in the loop mask will be all-zero or all-one.
8552 We can then view-convert the mask so that each sequence of
8553 N elements is replaced by a single element. */
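/* Hedged example (element counts assumed): a mask built for 8 x HImode data
   can be reused for 4 x SImode data, since each adjacent pair of mask
   elements is guaranteed to be identical; the VIEW_CONVERT below then turns
   the 8-element boolean vector into the 4-element boolean vector matching
   VECTYPE.  */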
8554 gcc_assert (multiple_p (TYPE_VECTOR_SUBPARTS (mask_type),
8555 TYPE_VECTOR_SUBPARTS (vectype)));
8556 gimple_seq seq = NULL;
8557 mask_type = build_same_sized_truth_vector_type (vectype);
8558 mask = gimple_build (&seq, VIEW_CONVERT_EXPR, mask_type, mask);
8559 if (seq)
8560 gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
8562 return mask;
8565 /* Scale profiling counters by estimation for LOOP which is vectorized
8566 by factor VF. */
8568 static void
8569 scale_profile_for_vect_loop (struct loop *loop, unsigned vf)
8571 edge preheader = loop_preheader_edge (loop);
8572 /* Reduce loop iterations by the vectorization factor. */
8573 gcov_type new_est_niter = niter_for_unrolled_loop (loop, vf);
8574 profile_count freq_h = loop->header->count, freq_e = preheader->count ();
8576 if (freq_h.nonzero_p ())
8578 profile_probability p;
8580 /* Avoid dropping loop body profile counter to 0 because of zero count
8581 in loop's preheader. */
8582 if (!(freq_e == profile_count::zero ()))
8583 freq_e = freq_e.force_nonzero ();
8584 p = freq_e.apply_scale (new_est_niter + 1, 1).probability_in (freq_h);
8585 scale_loop_frequencies (loop, p);
8588 edge exit_e = single_exit (loop);
8589 exit_e->probability = profile_probability::always ()
8590 .apply_scale (1, new_est_niter + 1);
8592 edge exit_l = single_pred_edge (loop->latch);
8593 profile_probability prob = exit_l->probability;
8594 exit_l->probability = exit_e->probability.invert ();
8595 if (prob.initialized_p () && exit_l->probability.initialized_p ())
8596 scale_bbs_frequencies (&loop->latch, 1, exit_l->probability / prob);
8599 /* Function vect_transform_loop.
8601 The analysis phase has determined that the loop is vectorizable.
8602 Vectorize the loop: create vectorized stmts to replace the scalar
8603 stmts in the loop, and update the loop exit condition.
8604 Returns the scalar epilogue loop, if any. */
8606 struct loop *
8607 vect_transform_loop (loop_vec_info loop_vinfo)
8609 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8610 struct loop *epilogue = NULL;
8611 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
8612 int nbbs = loop->num_nodes;
8613 int i;
8614 tree niters_vector = NULL_TREE;
8615 tree step_vector = NULL_TREE;
8616 tree niters_vector_mult_vf = NULL_TREE;
8617 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
8618 unsigned int lowest_vf = constant_lower_bound (vf);
8619 bool grouped_store;
8620 bool slp_scheduled = false;
8621 gimple *stmt, *pattern_stmt;
8622 gimple_seq pattern_def_seq = NULL;
8623 gimple_stmt_iterator pattern_def_si = gsi_none ();
8624 bool transform_pattern_stmt = false;
8625 bool check_profitability = false;
8626 unsigned int th;
8628 if (dump_enabled_p ())
8629 dump_printf_loc (MSG_NOTE, vect_location, "=== vec_transform_loop ===\n");
8631 /* Use the more conservative vectorization threshold. If the number
8632 of iterations is constant, assume the cost check has been performed
8633 by our caller. If the threshold makes all loops profitable that
8634 run at least the (estimated) vectorization factor number of times,
8635 checking is pointless, too. */
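/* Hedged numeric reading of the code below (numbers assumed): with
   vect_vf_for_cost == 4 and an unknown iteration count, a threshold of,
   say, 10 leads to a runtime profitability check, whereas a threshold
   smaller than 4 is already implied by entering the vector loop and no
   check is requested here.  */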
8636 th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
8637 if (th >= vect_vf_for_cost (loop_vinfo)
8638 && !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
8640 if (dump_enabled_p ())
8641 dump_printf_loc (MSG_NOTE, vect_location,
8642 "Profitability threshold is %d loop iterations.\n",
8643 th);
8644 check_profitability = true;
8647 /* Make sure there exists a single-predecessor exit bb. Do this before
8648 versioning. */
8649 edge e = single_exit (loop);
8650 if (! single_pred_p (e->dest))
8652 split_loop_exit_edge (e);
8653 if (dump_enabled_p ())
8654 dump_printf (MSG_NOTE, "split exit edge\n");
8657 /* Version the loop first, if required, so the profitability check
8658 comes first. */
8660 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
8662 poly_uint64 versioning_threshold
8663 = LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo);
8664 if (check_profitability
8665 && ordered_p (poly_uint64 (th), versioning_threshold))
8667 versioning_threshold = ordered_max (poly_uint64 (th),
8668 versioning_threshold);
8669 check_profitability = false;
8671 vect_loop_versioning (loop_vinfo, th, check_profitability,
8672 versioning_threshold);
8673 check_profitability = false;
8676 /* Make sure there exists a single-predecessor exit bb also on the
8677 scalar loop copy. Do this after versioning but before peeling
8678 so the CFG structure is fine for both the scalar and the if-converted
8679 loop, and slpeel_duplicate_current_defs_from_edges sees matched
8680 loop-closed PHI nodes on the exit. */
8681 if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))
8683 e = single_exit (LOOP_VINFO_SCALAR_LOOP (loop_vinfo));
8684 if (! single_pred_p (e->dest))
8686 split_loop_exit_edge (e);
8687 if (dump_enabled_p ())
8688 dump_printf (MSG_NOTE, "split exit edge of scalar loop\n");
8692 tree niters = vect_build_loop_niters (loop_vinfo);
8693 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = niters;
8694 tree nitersm1 = unshare_expr (LOOP_VINFO_NITERSM1 (loop_vinfo));
8695 bool niters_no_overflow = loop_niters_no_overflow (loop_vinfo);
8696 epilogue = vect_do_peeling (loop_vinfo, niters, nitersm1, &niters_vector,
8697 &step_vector, &niters_vector_mult_vf, th,
8698 check_profitability, niters_no_overflow);
8700 if (niters_vector == NULL_TREE)
8702 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
8703 && !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
8704 && known_eq (lowest_vf, vf))
8706 niters_vector
8707 = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
8708 LOOP_VINFO_INT_NITERS (loop_vinfo) / lowest_vf);
8709 step_vector = build_one_cst (TREE_TYPE (niters));
8711 else
8712 vect_gen_vector_loop_niters (loop_vinfo, niters, &niters_vector,
8713 &step_vector, niters_no_overflow);
8716 /* 1) Make sure the loop header has exactly two entries
8717 2) Make sure we have a preheader basic block. */
8719 gcc_assert (EDGE_COUNT (loop->header->preds) == 2);
8721 split_edge (loop_preheader_edge (loop));
8723 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
8724 && vect_use_loop_mask_for_alignment_p (loop_vinfo))
8725 /* This will deal with any possible peeling. */
8726 vect_prepare_for_masked_peels (loop_vinfo);
8728 /* FORNOW: the vectorizer supports only loops whose body consists
8729 of one basic block (header + empty latch). When the vectorizer
8730 supports more involved loop forms, the order by which the BBs are
8731 traversed will need to be reconsidered. */
8733 for (i = 0; i < nbbs; i++)
8735 basic_block bb = bbs[i];
8736 stmt_vec_info stmt_info;
8738 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
8739 gsi_next (&si))
8741 gphi *phi = si.phi ();
8742 if (dump_enabled_p ())
8744 dump_printf_loc (MSG_NOTE, vect_location,
8745 "------>vectorizing phi: ");
8746 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
8748 stmt_info = vinfo_for_stmt (phi);
8749 if (!stmt_info)
8750 continue;
8752 if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
8753 vect_loop_kill_debug_uses (loop, phi);
8755 if (!STMT_VINFO_RELEVANT_P (stmt_info)
8756 && !STMT_VINFO_LIVE_P (stmt_info))
8757 continue;
8759 if (STMT_VINFO_VECTYPE (stmt_info)
8760 && (maybe_ne
8761 (TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info)), vf))
8762 && dump_enabled_p ())
8763 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
8765 if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
8766 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
8767 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
8768 && ! PURE_SLP_STMT (stmt_info))
8770 if (dump_enabled_p ())
8771 dump_printf_loc (MSG_NOTE, vect_location, "transform phi.\n");
8772 vect_transform_stmt (phi, NULL, NULL, NULL, NULL);
8776 pattern_stmt = NULL;
8777 for (gimple_stmt_iterator si = gsi_start_bb (bb);
8778 !gsi_end_p (si) || transform_pattern_stmt;)
8780 bool is_store;
8782 if (transform_pattern_stmt)
8783 stmt = pattern_stmt;
8784 else
8786 stmt = gsi_stmt (si);
8787 /* During vectorization remove existing clobber stmts. */
8788 if (gimple_clobber_p (stmt))
8790 unlink_stmt_vdef (stmt);
8791 gsi_remove (&si, true);
8792 release_defs (stmt);
8793 continue;
8797 if (dump_enabled_p ())
8799 dump_printf_loc (MSG_NOTE, vect_location,
8800 "------>vectorizing statement: ");
8801 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
8804 stmt_info = vinfo_for_stmt (stmt);
8806 /* vector stmts created in the outer-loop during vectorization of
8807 stmts in an inner-loop may not have a stmt_info, and do not
8808 need to be vectorized. */
8809 if (!stmt_info)
8811 gsi_next (&si);
8812 continue;
8815 if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
8816 vect_loop_kill_debug_uses (loop, stmt);
8818 if (!STMT_VINFO_RELEVANT_P (stmt_info)
8819 && !STMT_VINFO_LIVE_P (stmt_info))
8821 if (STMT_VINFO_IN_PATTERN_P (stmt_info)
8822 && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
8823 && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
8824 || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
8826 stmt = pattern_stmt;
8827 stmt_info = vinfo_for_stmt (stmt);
8829 else
8831 gsi_next (&si);
8832 continue;
8835 else if (STMT_VINFO_IN_PATTERN_P (stmt_info)
8836 && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
8837 && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
8838 || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
8839 transform_pattern_stmt = true;
8841 /* If pattern statement has def stmts, vectorize them too. */
8842 if (is_pattern_stmt_p (stmt_info))
8844 if (pattern_def_seq == NULL)
8846 pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
8847 pattern_def_si = gsi_start (pattern_def_seq);
8849 else if (!gsi_end_p (pattern_def_si))
8850 gsi_next (&pattern_def_si);
8851 if (pattern_def_seq != NULL)
8853 gimple *pattern_def_stmt = NULL;
8854 stmt_vec_info pattern_def_stmt_info = NULL;
8856 while (!gsi_end_p (pattern_def_si))
8858 pattern_def_stmt = gsi_stmt (pattern_def_si);
8859 pattern_def_stmt_info
8860 = vinfo_for_stmt (pattern_def_stmt);
8861 if (STMT_VINFO_RELEVANT_P (pattern_def_stmt_info)
8862 || STMT_VINFO_LIVE_P (pattern_def_stmt_info))
8863 break;
8864 gsi_next (&pattern_def_si);
8867 if (!gsi_end_p (pattern_def_si))
8869 if (dump_enabled_p ())
8871 dump_printf_loc (MSG_NOTE, vect_location,
8872 "==> vectorizing pattern def "
8873 "stmt: ");
8874 dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
8875 pattern_def_stmt, 0);
8878 stmt = pattern_def_stmt;
8879 stmt_info = pattern_def_stmt_info;
8881 else
8883 pattern_def_si = gsi_none ();
8884 transform_pattern_stmt = false;
8887 else
8888 transform_pattern_stmt = false;
8891 if (STMT_VINFO_VECTYPE (stmt_info))
8893 poly_uint64 nunits
8894 = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
8895 if (!STMT_SLP_TYPE (stmt_info)
8896 && maybe_ne (nunits, vf)
8897 && dump_enabled_p ())
8898 /* For SLP, VF is set according to the unrolling factor, not to the
8899 vector size, hence this diagnostic is not valid for SLP.  */
8900 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
8903 /* SLP. Schedule all the SLP instances when the first SLP stmt is
8904 reached. */
8905 if (STMT_SLP_TYPE (stmt_info))
8907 if (!slp_scheduled)
8909 slp_scheduled = true;
8911 if (dump_enabled_p ())
8912 dump_printf_loc (MSG_NOTE, vect_location,
8913 "=== scheduling SLP instances ===\n");
8915 vect_schedule_slp (loop_vinfo);
8918 /* Hybrid SLP stmts must be vectorized in addition to SLP. */
8919 if (!vinfo_for_stmt (stmt) || PURE_SLP_STMT (stmt_info))
8921 if (!transform_pattern_stmt && gsi_end_p (pattern_def_si))
8923 pattern_def_seq = NULL;
8924 gsi_next (&si);
8926 continue;
8930 /* -------- vectorize statement ------------ */
8931 if (dump_enabled_p ())
8932 dump_printf_loc (MSG_NOTE, vect_location, "transform statement.\n");
8934 grouped_store = false;
8935 is_store = vect_transform_stmt (stmt, &si, &grouped_store, NULL, NULL);
8936 if (is_store)
8938 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
8940 /* Interleaving.  If IS_STORE is TRUE, the vectorization of the
8941 interleaving chain has been completed; free all the scalar stores
8942 in the chain.  */
8943 gsi_next (&si);
8944 vect_remove_stores (GROUP_FIRST_ELEMENT (stmt_info));
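/* Illustrative example (hypothetical): for an interleaved group such as
     a[2*i] = x_1;  a[2*i+1] = y_2;
   the whole chain is typically vectorized once its last scalar store is
   reached, so at this point every scalar store of the group can be
   removed.  */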
8946 else
8948 /* Free the attached stmt_vec_info and remove the stmt. */
8949 gimple *store = gsi_stmt (si);
8950 free_stmt_vec_info (store);
8951 unlink_stmt_vdef (store);
8952 gsi_remove (&si, true);
8953 release_defs (store);
8956 /* Stores can only appear at the end of pattern statements. */
8957 gcc_assert (!transform_pattern_stmt);
8958 pattern_def_seq = NULL;
8960 else if (!transform_pattern_stmt && gsi_end_p (pattern_def_si))
8962 pattern_def_seq = NULL;
8963 gsi_next (&si);
8965 } /* stmts in BB */
8967 /* Stub out scalar statements that must not survive vectorization.
8968 Doing this here helps with grouped statements, or statements that
8969 are involved in patterns. */
8970 for (gimple_stmt_iterator gsi = gsi_start_bb (bb);
8971 !gsi_end_p (gsi); gsi_next (&gsi))
8973 gcall *call = dyn_cast <gcall *> (gsi_stmt (gsi));
8974 if (call && gimple_call_internal_p (call, IFN_MASK_LOAD))
8976 tree lhs = gimple_get_lhs (call);
8977 if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
8979 tree zero = build_zero_cst (TREE_TYPE (lhs));
8980 gimple *new_stmt = gimple_build_assign (lhs, zero);
8981 gsi_replace (&gsi, new_stmt, true);
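/* Illustrative example (hypothetical SSA names): a left-over scalar
     _5 = MASK_LOAD (p_3, 0B, mask_7);
   whose result was never vectorized is replaced here by
     _5 = 0;
   so that no scalar IFN_MASK_LOAD call survives vectorization.  */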
8985 } /* BBs in loop */
8987 /* The vectorization factor is always > 1, so if we use an IV increment of 1,
8988 a zero NITERS becomes a nonzero NITERS_VECTOR.  */
8989 if (integer_onep (step_vector))
8990 niters_no_overflow = true;
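/* One illustrative reading (editorial, hypothetical numbers): if the
   scalar NITERS was computed in a 32-bit type and wrapped to 0 (i.e.
   2^32 iterations), then with VF == 4 and a step of 1 the vector loop
   runs NITERS_VECTOR == 2^30 times, which is nonzero, so its IV cannot
   wrap and niters_no_overflow can be set.  */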
8991 vect_set_loop_condition (loop, loop_vinfo, niters_vector, step_vector,
8992 niters_vector_mult_vf, !niters_no_overflow);
8994 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
8995 scale_profile_for_vect_loop (loop, assumed_vf);
8997 /* True if the final iteration might not handle a full vector's
8998 worth of scalar iterations. */
8999 bool final_iter_may_be_partial = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
9000 /* The minimum number of iterations performed by the epilogue. This
9001 is 1 when peeling for gaps because we always need a final scalar
9002 iteration. */
9003 int min_epilogue_iters = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0;
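/* Illustrative example (hypothetical): for a grouped load with a gap,
   e.g. one that uses only a[3*i] and a[3*i+1] out of each group of
   three, the vector code may touch memory beyond what the last scalar
   iteration needs, so that iteration is always left to the epilogue.  */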
9004 /* +1 to convert latch counts to loop iteration counts,
9005 -min_epilogue_iters to remove iterations that cannot be performed
9006 by the vector code. */
9007 int bias_for_lowest = 1 - min_epilogue_iters;
9008 int bias_for_assumed = bias_for_lowest;
9009 int alignment_npeels = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
9010 if (alignment_npeels && LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
9012 /* When the amount of peeling is known at compile time, the first
9013 iteration will have exactly alignment_npeels active elements.
9014 In the worst case it will have at least one. */
9015 int min_first_active = (alignment_npeels > 0 ? alignment_npeels : 1);
9016 bias_for_lowest += lowest_vf - min_first_active;
9017 bias_for_assumed += assumed_vf - min_first_active;
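/* Worked example (hypothetical numbers): with lowest_vf == assumed_vf == 4,
   no peeling for gaps (min_epilogue_iters == 0) and a known alignment
   peeling of 2, min_first_active == 2 and bias_for_lowest == 1 + 4 - 2 == 3.
   A fully-masked loop with a scalar latch bound of 9 (10 iterations) then
   covers 2 elements in its first vector iteration and 4 in each of the next
   two, i.e. 3 vector iterations, matching ceil ((9 + 3) / 4) - 1 == 2
   below.  */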
9019 /* In these calculations the "- 1" converts loop iteration counts
9020 back to latch counts. */
9021 if (loop->any_upper_bound)
9022 loop->nb_iterations_upper_bound
9023 = (final_iter_may_be_partial
9024 ? wi::udiv_ceil (loop->nb_iterations_upper_bound + bias_for_lowest,
9025 lowest_vf) - 1
9026 : wi::udiv_floor (loop->nb_iterations_upper_bound + bias_for_lowest,
9027 lowest_vf) - 1);
9028 if (loop->any_likely_upper_bound)
9029 loop->nb_iterations_likely_upper_bound
9030 = (final_iter_may_be_partial
9031 ? wi::udiv_ceil (loop->nb_iterations_likely_upper_bound
9032 + bias_for_lowest, lowest_vf) - 1
9033 : wi::udiv_floor (loop->nb_iterations_likely_upper_bound
9034 + bias_for_lowest, lowest_vf) - 1);
9035 if (loop->any_estimate)
9036 loop->nb_iterations_estimate
9037 = (final_iter_may_be_partial
9038 ? wi::udiv_ceil (loop->nb_iterations_estimate + bias_for_assumed,
9039 assumed_vf) - 1
9040 : wi::udiv_floor (loop->nb_iterations_estimate + bias_for_assumed,
9041 assumed_vf) - 1);
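/* Worked example (hypothetical numbers): without peeling for alignment or
   gaps, bias_for_lowest == 1.  A not-fully-masked loop with lowest_vf == 4
   and a scalar latch bound of 10 (11 iterations) gets a vector latch bound
   of floor ((10 + 1) / 4) - 1 == 1, i.e. 2 full vector iterations, with the
   remaining 3 scalar iterations left to the epilogue.  */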
9043 if (dump_enabled_p ())
9045 if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
9047 dump_printf_loc (MSG_NOTE, vect_location,
9048 "LOOP VECTORIZED\n");
9049 if (loop->inner)
9050 dump_printf_loc (MSG_NOTE, vect_location,
9051 "OUTER LOOP VECTORIZED\n");
9052 dump_printf (MSG_NOTE, "\n");
9054 else
9056 dump_printf_loc (MSG_NOTE, vect_location,
9057 "LOOP EPILOGUE VECTORIZED (VS=");
9058 dump_dec (MSG_NOTE, current_vector_size);
9059 dump_printf (MSG_NOTE, ")\n");
9063 /* Free SLP instances here because otherwise stmt reference counting
9064 won't work. */
9065 slp_instance instance;
9066 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
9067 vect_free_slp_instance (instance);
9068 LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
9069 /* Clear up the safelen field, since its value is invalid after
9070 vectorization: the vectorized loop can have loop-carried dependencies.  */
9071 loop->safelen = 0;
9073 /* Don't vectorize the epilogue of an epilogue loop.  */
9074 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
9075 epilogue = NULL;
9077 if (!PARAM_VALUE (PARAM_VECT_EPILOGUES_NOMASK))
9078 epilogue = NULL;
9080 if (epilogue)
9082 auto_vector_sizes vector_sizes;
9083 targetm.vectorize.autovectorize_vector_sizes (&vector_sizes);
9084 unsigned int next_size = 0;
9086 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
9087 && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) >= 0
9088 && known_eq (vf, lowest_vf))
9090 unsigned int eiters
9091 = (LOOP_VINFO_INT_NITERS (loop_vinfo)
9092 - LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo));
9093 eiters = eiters % lowest_vf;
9094 epilogue->nb_iterations_upper_bound = eiters - 1;
9096 unsigned int ratio;
9097 while (next_size < vector_sizes.length ()
9098 && !(constant_multiple_p (current_vector_size,
9099 vector_sizes[next_size], &ratio)
9100 && eiters >= lowest_vf / ratio))
9101 next_size += 1;
9103 else
9104 while (next_size < vector_sizes.length ()
9105 && maybe_lt (current_vector_size, vector_sizes[next_size]))
9106 next_size += 1;
9108 if (next_size == vector_sizes.length ())
9109 epilogue = NULL;
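/* Worked example (hypothetical, assuming the target lists 64-, 32- and
   16-byte vector sizes in that order): with known NITERS == 103, 3
   iterations peeled for alignment and lowest_vf == 16, the epilogue runs
   eiters == (103 - 3) % 16 == 4 scalar iterations.  64- and 32-byte
   vectors would need at least 16 resp. 8 of them, but 16-byte vectors
   (ratio == 4, lowest_vf / ratio == 4) fit, so the epilogue is
   re-vectorized with 16-byte vectors; if no listed size fits, the
   epilogue is dropped as above.  */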
9112 if (epilogue)
9114 epilogue->force_vectorize = loop->force_vectorize;
9115 epilogue->safelen = loop->safelen;
9116 epilogue->dont_vectorize = false;
9118 /* We may need to if-convert the epilogue to vectorize it.  */
9119 if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))
9120 tree_if_conversion (epilogue);
9123 return epilogue;
9126 /* The code below performs a simple optimization: it reverts if-conversion
9127 for masked stores, i.e. if the mask of a store is zero, the store is not
9128 executed, and neither are the producers of the stored values, if possible.
9129 For example,
9130 for (i=0; i<n; i++)
9131 if (c[i])
9132 {
9133 p1[i] += 1;
9134 p2[i] = p3[i] + 2;
9135 }
9136 this transformation will produce the following semi-hammock:
9138 if (!mask__ifc__42.18_165 == { 0, 0, 0, 0, 0, 0, 0, 0 })
9139 {
9140 vect__11.19_170 = MASK_LOAD (vectp_p1.20_168, 0B, mask__ifc__42.18_165);
9141 vect__12.22_172 = vect__11.19_170 + vect_cst__171;
9142 MASK_STORE (vectp_p1.23_175, 0B, mask__ifc__42.18_165, vect__12.22_172);
9143 vect__18.25_182 = MASK_LOAD (vectp_p3.26_180, 0B, mask__ifc__42.18_165);
9144 vect__19.28_184 = vect__18.25_182 + vect_cst__183;
9145 MASK_STORE (vectp_p2.29_187, 0B, mask__ifc__42.18_165, vect__19.28_184);
9146 }
9147 */
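/* For comparison (editorial illustration, same hypothetical SSA names as
   above): before this optimization the if-converted, vectorized body is
   executed unconditionally, e.g.
     vect__11.19_170 = MASK_LOAD (vectp_p1.20_168, 0B, mask__ifc__42.18_165);
     vect__12.22_172 = vect__11.19_170 + vect_cst__171;
     MASK_STORE (vectp_p1.23_175, 0B, mask__ifc__42.18_165, vect__12.22_172);
     ...
   optimize_mask_stores sinks the masked stores, and where possible the
   producers of the stored values, behind the mask test shown above, so
   nothing is executed when the mask is all zeros.  */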
9149 void
9150 optimize_mask_stores (struct loop *loop)
9152 basic_block *bbs = get_loop_body (loop);
9153 unsigned nbbs = loop->num_nodes;
9154 unsigned i;
9155 basic_block bb;
9156 struct loop *bb_loop;
9157 gimple_stmt_iterator gsi;
9158 gimple *stmt;
9159 auto_vec<gimple *> worklist;
9161 vect_location = find_loop_location (loop);
9162 /* Pick up all masked stores in the loop, if any.  */
9163 for (i = 0; i < nbbs; i++)
9165 bb = bbs[i];
9166 for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
9167 gsi_next (&gsi))
9169 stmt = gsi_stmt (gsi);
9170 if (gimple_call_internal_p (stmt, IFN_MASK_STORE))
9171 worklist.safe_push (stmt);
9175 free (bbs);
9176 if (worklist.is_empty ())
9177 return;
9179 /* Loop has masked stores. */
9180 while (!worklist.is_empty ())
9182 gimple *last, *last_store;
9183 edge e, efalse;
9184 tree mask;
9185 basic_block store_bb, join_bb;
9186 gimple_stmt_iterator gsi_to;
9187 tree vdef, new_vdef;
9188 gphi *phi;
9189 tree vectype;
9190 tree zero;
9192 last = worklist.pop ();
9193 mask = gimple_call_arg (last, 2);
9194 bb = gimple_bb (last);
9195 /* Create then_bb and the if-then structure in the CFG; then_bb belongs to
9196 the same loop as if_bb.  It can be different from LOOP when a two-level
9197 loop nest is vectorized and the mask_store belongs to the inner
9198 one.  */
9199 e = split_block (bb, last);
9200 bb_loop = bb->loop_father;
9201 gcc_assert (loop == bb_loop || flow_loop_nested_p (loop, bb_loop));
9202 join_bb = e->dest;
9203 store_bb = create_empty_bb (bb);
9204 add_bb_to_loop (store_bb, bb_loop);
9205 e->flags = EDGE_TRUE_VALUE;
9206 efalse = make_edge (bb, store_bb, EDGE_FALSE_VALUE);
9207 /* STORE_BB is entered on the false edge, i.e. when the mask is not all zeros.  */
9208 efalse->probability = profile_probability::unlikely ();
9209 store_bb->count = efalse->count ();
9210 make_single_succ_edge (store_bb, join_bb, EDGE_FALLTHRU);
9211 if (dom_info_available_p (CDI_DOMINATORS))
9212 set_immediate_dominator (CDI_DOMINATORS, store_bb, bb);
9213 if (dump_enabled_p ())
9214 dump_printf_loc (MSG_NOTE, vect_location,
9215 "Create new block %d to sink mask stores.",
9216 store_bb->index);
9217 /* Create a vector comparison with a boolean result.  */
9218 vectype = TREE_TYPE (mask);
9219 zero = build_zero_cst (vectype);
9220 stmt = gimple_build_cond (EQ_EXPR, mask, zero, NULL_TREE, NULL_TREE);
9221 gsi = gsi_last_bb (bb);
9222 gsi_insert_after (&gsi, stmt, GSI_SAME_STMT);
9223 /* Create new PHI node for the vdef of the last masked store:
9224 .MEM_2 = VDEF <.MEM_1>
9225 will be converted to
9226 .MEM_3 = VDEF <.MEM_1>
9227 and a new PHI node will be created in the join bb
9228 .MEM_2 = PHI <.MEM_1, .MEM_3>
9229 */
9230 vdef = gimple_vdef (last);
9231 new_vdef = make_ssa_name (gimple_vop (cfun), last);
9232 gimple_set_vdef (last, new_vdef);
9233 phi = create_phi_node (vdef, join_bb);
9234 add_phi_arg (phi, new_vdef, EDGE_SUCC (store_bb, 0), UNKNOWN_LOCATION);
9236 /* Put all masked stores with the same mask into STORE_BB if possible.  */
9237 while (true)
9239 gimple_stmt_iterator gsi_from;
9240 gimple *stmt1 = NULL;
9242 /* Move masked store to STORE_BB. */
9243 last_store = last;
9244 gsi = gsi_for_stmt (last);
9245 gsi_from = gsi;
9246 /* Shift GSI to the previous stmt for further traversal. */
9247 gsi_prev (&gsi);
9248 gsi_to = gsi_start_bb (store_bb);
9249 gsi_move_before (&gsi_from, &gsi_to);
9250 /* Set GSI_TO to the start of the now non-empty STORE_BB.  */
9251 gsi_to = gsi_start_bb (store_bb);
9252 if (dump_enabled_p ())
9254 dump_printf_loc (MSG_NOTE, vect_location,
9255 "Move stmt to created bb\n");
9256 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, last, 0);
9258 /* Move all stored value producers if possible. */
9259 while (!gsi_end_p (gsi))
9261 tree lhs;
9262 imm_use_iterator imm_iter;
9263 use_operand_p use_p;
9264 bool res;
9266 /* Skip debug statements. */
9267 if (is_gimple_debug (gsi_stmt (gsi)))
9269 gsi_prev (&gsi);
9270 continue;
9272 stmt1 = gsi_stmt (gsi);
9273 /* Do not consider statements that write to memory or have a
9274 volatile operand.  */
9275 if (gimple_vdef (stmt1)
9276 || gimple_has_volatile_ops (stmt1))
9277 break;
9278 gsi_from = gsi;
9279 gsi_prev (&gsi);
9280 lhs = gimple_get_lhs (stmt1);
9281 if (!lhs)
9282 break;
9284 /* LHS of vectorized stmt must be SSA_NAME. */
9285 if (TREE_CODE (lhs) != SSA_NAME)
9286 break;
9288 if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
9290 /* Remove dead scalar statement. */
9291 if (has_zero_uses (lhs))
9293 gsi_remove (&gsi_from, true);
9294 continue;
9298 /* Check that LHS does not have uses outside of STORE_BB. */
9299 res = true;
9300 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
9302 gimple *use_stmt;
9303 use_stmt = USE_STMT (use_p);
9304 if (is_gimple_debug (use_stmt))
9305 continue;
9306 if (gimple_bb (use_stmt) != store_bb)
9308 res = false;
9309 break;
9312 if (!res)
9313 break;
9315 if (gimple_vuse (stmt1)
9316 && gimple_vuse (stmt1) != gimple_vuse (last_store))
9317 break;
9319 /* Can move STMT1 to STORE_BB. */
9320 if (dump_enabled_p ())
9322 dump_printf_loc (MSG_NOTE, vect_location,
9323 "Move stmt to created bb\n");
9324 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt1, 0);
9326 gsi_move_before (&gsi_from, &gsi_to);
9327 /* Shift GSI_TO for further insertion. */
9328 gsi_prev (&gsi_to);
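/* E.g. (using the hypothetical SSA names from the comment before this
   function): once MASK_STORE (vectp_p1.23_175, ...) has been moved into
   STORE_BB, the feeding statements vect__12.22_172 = vect__11.19_170 +
   vect_cst__171 and the MASK_LOAD defining vect__11.19_170 can be moved
   as well: they write no memory, their remaining non-debug uses are in
   STORE_BB, and any VUSE they have matches that of the moved store.  */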
9330 /* Put other masked stores with the same mask into STORE_BB.  */
9331 if (worklist.is_empty ()
9332 || gimple_call_arg (worklist.last (), 2) != mask
9333 || worklist.last () != stmt1)
9334 break;
9335 last = worklist.pop ();
9337 add_phi_arg (phi, gimple_vuse (last_store), e, UNKNOWN_LOCATION);