gcc/tree-vect-loop.c
1 /* Loop Vectorization
2 Copyright (C) 2003-2020 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com> and
4 Ira Rosen <irar@il.ibm.com>
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "rtl.h"
28 #include "tree.h"
29 #include "gimple.h"
30 #include "cfghooks.h"
31 #include "tree-pass.h"
32 #include "ssa.h"
33 #include "optabs-tree.h"
34 #include "diagnostic-core.h"
35 #include "fold-const.h"
36 #include "stor-layout.h"
37 #include "cfganal.h"
38 #include "gimplify.h"
39 #include "gimple-iterator.h"
40 #include "gimplify-me.h"
41 #include "tree-ssa-loop-ivopts.h"
42 #include "tree-ssa-loop-manip.h"
43 #include "tree-ssa-loop-niter.h"
44 #include "tree-ssa-loop.h"
45 #include "cfgloop.h"
46 #include "tree-scalar-evolution.h"
47 #include "tree-vectorizer.h"
48 #include "gimple-fold.h"
49 #include "cgraph.h"
50 #include "tree-cfg.h"
51 #include "tree-if-conv.h"
52 #include "internal-fn.h"
53 #include "tree-vector-builder.h"
54 #include "vec-perm-indices.h"
55 #include "tree-eh.h"
57 /* Loop Vectorization Pass.
59 This pass tries to vectorize loops.
61 For example, the vectorizer transforms the following simple loop:
63 short a[N]; short b[N]; short c[N]; int i;
65 for (i=0; i<N; i++){
66 a[i] = b[i] + c[i];
69 as if it was manually vectorized by rewriting the source code into:
71 typedef int __attribute__((mode(V8HI))) v8hi;
72 short a[N]; short b[N]; short c[N]; int i;
73 v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
74 v8hi va, vb, vc;
76 for (i=0; i<N/8; i++){
77 vb = pb[i];
78 vc = pc[i];
79 va = vb + vc;
80 pa[i] = va;
83 The main entry to this pass is vectorize_loops(), in which
84 the vectorizer applies a set of analyses on a given set of loops,
85 followed by the actual vectorization transformation for the loops that
86 had successfully passed the analysis phase.
87 Throughout this pass we make a distinction between two types of
88 data: scalars (which are represented by SSA_NAMES), and memory references
89 ("data-refs"). These two types of data require different handling both
90 during analysis and transformation. The types of data-refs that the
91 vectorizer currently supports are ARRAY_REFS whose base is an array DECL
92 (not a pointer), and INDIRECT_REFS through pointers; both array and pointer
93 accesses are required to have a simple (consecutive) access pattern.
95 Analysis phase:
96 ===============
97 The driver for the analysis phase is vect_analyze_loop().
98 It applies a set of analyses, some of which rely on the scalar evolution
99 analyzer (scev) developed by Sebastian Pop.
101 During the analysis phase the vectorizer records some information
102 per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
103 loop, as well as general information about the loop as a whole, which is
104 recorded in a "loop_vec_info" struct attached to each loop.
106 Transformation phase:
107 =====================
108 The loop transformation phase scans all the stmts in the loop, and
109 creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
110 the loop that needs to be vectorized. It inserts the vector code sequence
111 just before the scalar stmt S, and records a pointer to the vector code
112 in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
113 attached to S). This pointer will be used for the vectorization of following
114 stmts which use the def of stmt S. Stmt S is removed if it writes to memory;
115 otherwise, we rely on dead code elimination for removing it.
117 For example, say stmt S1 was vectorized into stmt VS1:
119 VS1: vb = px[i];
120 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
121 S2: a = b;
123 To vectorize stmt S2, the vectorizer first finds the stmt that defines
124 the operand 'b' (S1), and gets the relevant vector def 'vb' from the
125 vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)). The
126 resulting sequence would be:
128 VS1: vb = px[i];
129 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
130 VS2: va = vb;
131 S2: a = b; STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2
133 Operands that are not SSA_NAMEs are data-refs that appear in
134 load/store operations (like 'x[i]' in S1), and are handled differently.
136 Target modeling:
137 =================
138 Currently the only target specific information that is used is the
139 size of the vector (in bytes) - "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".
140 Targets that can support different vector sizes will, for now, need to
141 specify a single value for "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD". More
142 flexibility will be added in the future.
144 Since we only vectorize operations whose vector form can be
145 expressed using existing tree codes, to verify that an operation is
146 supported, the vectorizer checks the relevant optab at the relevant
147 machine_mode (e.g., optab_handler (add_optab, V8HImode)). If
148 the value found is CODE_FOR_nothing, then there's no target support, and
149 we can't vectorize the stmt.
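     For illustration only, a simplified sketch of such a check (not the
     exact code the vectorizer uses):

       if (optab_handler (add_optab, V8HImode) == CODE_FOR_nothing)
         return false;

     i.e. the target has no instruction for adding two V8HI vectors, so a
     statement needing that operation cannot be vectorized with this
     vector type.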
151 For additional information on this project see:
152 http://gcc.gnu.org/projects/tree-ssa/vectorization.html
155 static void vect_estimate_min_profitable_iters (loop_vec_info, int *, int *);
156 static stmt_vec_info vect_is_simple_reduction (loop_vec_info, stmt_vec_info,
157 bool *, bool *);
159 /* Subroutine of vect_determine_vf_for_stmt that handles only one
160 statement. VECTYPE_MAYBE_SET_P is true if STMT_VINFO_VECTYPE
161 may already be set for general statements (not just data refs). */
163 static opt_result
164 vect_determine_vf_for_stmt_1 (vec_info *vinfo, stmt_vec_info stmt_info,
165 bool vectype_maybe_set_p,
166 poly_uint64 *vf)
168 gimple *stmt = stmt_info->stmt;
170 if ((!STMT_VINFO_RELEVANT_P (stmt_info)
171 && !STMT_VINFO_LIVE_P (stmt_info))
172 || gimple_clobber_p (stmt))
174 if (dump_enabled_p ())
175 dump_printf_loc (MSG_NOTE, vect_location, "skip.\n");
176 return opt_result::success ();
179 tree stmt_vectype, nunits_vectype;
180 opt_result res = vect_get_vector_types_for_stmt (vinfo, stmt_info,
181 &stmt_vectype,
182 &nunits_vectype);
183 if (!res)
184 return res;
186 if (stmt_vectype)
188 if (STMT_VINFO_VECTYPE (stmt_info))
189 /* The only case when a vectype had been already set is for stmts
190 that contain a data ref, or for "pattern-stmts" (stmts generated
191 by the vectorizer to represent/replace a certain idiom). */
192 gcc_assert ((STMT_VINFO_DATA_REF (stmt_info)
193 || vectype_maybe_set_p)
194 && STMT_VINFO_VECTYPE (stmt_info) == stmt_vectype);
195 else
196 STMT_VINFO_VECTYPE (stmt_info) = stmt_vectype;
199 if (nunits_vectype)
200 vect_update_max_nunits (vf, nunits_vectype);
202 return opt_result::success ();
205 /* Subroutine of vect_determine_vectorization_factor. Set the vector
206 types of STMT_INFO and all attached pattern statements and update
207 the vectorization factor VF accordingly. Return true on success
208 or false if something prevented vectorization. */
210 static opt_result
211 vect_determine_vf_for_stmt (vec_info *vinfo,
212 stmt_vec_info stmt_info, poly_uint64 *vf)
214 if (dump_enabled_p ())
215 dump_printf_loc (MSG_NOTE, vect_location, "==> examining statement: %G",
216 stmt_info->stmt);
217 opt_result res = vect_determine_vf_for_stmt_1 (vinfo, stmt_info, false, vf);
218 if (!res)
219 return res;
221 if (STMT_VINFO_IN_PATTERN_P (stmt_info)
222 && STMT_VINFO_RELATED_STMT (stmt_info))
224 gimple *pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
225 stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
227 /* If a pattern statement has def stmts, analyze them too. */
228 for (gimple_stmt_iterator si = gsi_start (pattern_def_seq);
229 !gsi_end_p (si); gsi_next (&si))
231 stmt_vec_info def_stmt_info = vinfo->lookup_stmt (gsi_stmt (si));
232 if (dump_enabled_p ())
233 dump_printf_loc (MSG_NOTE, vect_location,
234 "==> examining pattern def stmt: %G",
235 def_stmt_info->stmt);
236 res = vect_determine_vf_for_stmt_1 (vinfo, def_stmt_info, true, vf);
237 if (!res)
238 return res;
241 if (dump_enabled_p ())
242 dump_printf_loc (MSG_NOTE, vect_location,
243 "==> examining pattern statement: %G",
244 stmt_info->stmt);
245 res = vect_determine_vf_for_stmt_1 (vinfo, stmt_info, true, vf);
246 if (!res)
247 return res;
250 return opt_result::success ();
253 /* Function vect_determine_vectorization_factor
255 Determine the vectorization factor (VF). VF is the number of data elements
256 that are operated upon in parallel in a single iteration of the vectorized
257 loop. For example, when vectorizing a loop that operates on 4-byte elements,
258 on a target with a vector size (VS) of 16 bytes, the VF is set to 4, since 4
259 elements can fit in a single vector register.
261 We currently support vectorization of loops in which all types operated upon
262 are of the same size. Therefore this function currently sets VF according to
263 the size of the types operated upon, and fails if there are multiple sizes
264 in the loop.
266 VF is also the factor by which the loop iterations are strip-mined, e.g.:
267 original loop:
268 for (i=0; i<N; i++){
269 a[i] = b[i] + c[i];
272 vectorized loop:
273 for (i=0; i<N; i+=VF){
274 a[i:VF] = b[i:VF] + c[i:VF];
278 static opt_result
279 vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
281 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
282 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
283 unsigned nbbs = loop->num_nodes;
284 poly_uint64 vectorization_factor = 1;
285 tree scalar_type = NULL_TREE;
286 gphi *phi;
287 tree vectype;
288 stmt_vec_info stmt_info;
289 unsigned i;
291 DUMP_VECT_SCOPE ("vect_determine_vectorization_factor");
293 for (i = 0; i < nbbs; i++)
295 basic_block bb = bbs[i];
297 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
298 gsi_next (&si))
300 phi = si.phi ();
301 stmt_info = loop_vinfo->lookup_stmt (phi);
302 if (dump_enabled_p ())
303 dump_printf_loc (MSG_NOTE, vect_location, "==> examining phi: %G",
304 phi);
306 gcc_assert (stmt_info);
308 if (STMT_VINFO_RELEVANT_P (stmt_info)
309 || STMT_VINFO_LIVE_P (stmt_info))
311 gcc_assert (!STMT_VINFO_VECTYPE (stmt_info));
312 scalar_type = TREE_TYPE (PHI_RESULT (phi));
314 if (dump_enabled_p ())
315 dump_printf_loc (MSG_NOTE, vect_location,
316 "get vectype for scalar type: %T\n",
317 scalar_type);
319 vectype = get_vectype_for_scalar_type (loop_vinfo, scalar_type);
320 if (!vectype)
321 return opt_result::failure_at (phi,
322 "not vectorized: unsupported "
323 "data-type %T\n",
324 scalar_type);
325 STMT_VINFO_VECTYPE (stmt_info) = vectype;
327 if (dump_enabled_p ())
328 dump_printf_loc (MSG_NOTE, vect_location, "vectype: %T\n",
329 vectype);
331 if (dump_enabled_p ())
333 dump_printf_loc (MSG_NOTE, vect_location, "nunits = ");
334 dump_dec (MSG_NOTE, TYPE_VECTOR_SUBPARTS (vectype));
335 dump_printf (MSG_NOTE, "\n");
338 vect_update_max_nunits (&vectorization_factor, vectype);
342 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
343 gsi_next (&si))
345 if (is_gimple_debug (gsi_stmt (si)))
346 continue;
347 stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
348 opt_result res
349 = vect_determine_vf_for_stmt (loop_vinfo,
350 stmt_info, &vectorization_factor);
351 if (!res)
352 return res;
356 /* TODO: Analyze cost. Decide if worth while to vectorize. */
357 if (dump_enabled_p ())
359 dump_printf_loc (MSG_NOTE, vect_location, "vectorization factor = ");
360 dump_dec (MSG_NOTE, vectorization_factor);
361 dump_printf (MSG_NOTE, "\n");
364 if (known_le (vectorization_factor, 1U))
365 return opt_result::failure_at (vect_location,
366 "not vectorized: unsupported data-type\n");
367 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
368 return opt_result::success ();
372 /* Function vect_is_simple_iv_evolution.
374 FORNOW: A simple evolution of an induction variable in the loop is
375 considered a polynomial evolution. */
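/* For example (illustrative only): for the induction variable i in

     for (i = 0; i < n; i++)
       ...

   the scalar evolution of i is the chrec {0, +, 1}_1 (assuming the loop
   has number 1), so *INIT is 0 and *STEP is 1.  */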
377 static bool
378 vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init,
379 tree * step)
381 tree init_expr;
382 tree step_expr;
383 tree evolution_part = evolution_part_in_loop_num (access_fn, loop_nb);
384 basic_block bb;
386 /* When there is no evolution in this loop, the evolution function
387 is not "simple". */
388 if (evolution_part == NULL_TREE)
389 return false;
391 /* When the evolution is a polynomial of degree >= 2
392 the evolution function is not "simple". */
393 if (tree_is_chrec (evolution_part))
394 return false;
396 step_expr = evolution_part;
397 init_expr = unshare_expr (initial_condition_in_loop_num (access_fn, loop_nb));
399 if (dump_enabled_p ())
400 dump_printf_loc (MSG_NOTE, vect_location, "step: %T, init: %T\n",
401 step_expr, init_expr);
403 *init = init_expr;
404 *step = step_expr;
406 if (TREE_CODE (step_expr) != INTEGER_CST
407 && (TREE_CODE (step_expr) != SSA_NAME
408 || ((bb = gimple_bb (SSA_NAME_DEF_STMT (step_expr)))
409 && flow_bb_inside_loop_p (get_loop (cfun, loop_nb), bb))
410 || (!INTEGRAL_TYPE_P (TREE_TYPE (step_expr))
411 && (!SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr))
412 || !flag_associative_math)))
413 && (TREE_CODE (step_expr) != REAL_CST
414 || !flag_associative_math))
416 if (dump_enabled_p ())
417 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
418 "step unknown.\n");
419 return false;
422 return true;
425 /* Return true if PHI, described by STMT_INFO, is the inner PHI in
426 what we are assuming is a double reduction. For example, given
427 a structure like this:
429 outer1:
430 x_1 = PHI <x_4(outer2), ...>;
433 inner:
434 x_2 = PHI <x_1(outer1), ...>;
436 x_3 = ...;
439 outer2:
440 x_4 = PHI <x_3(inner)>;
443 outer loop analysis would treat x_1 as a double reduction phi and
444 this function would then return true for x_2. */
446 static bool
447 vect_inner_phi_in_double_reduction_p (loop_vec_info loop_vinfo, gphi *phi)
449 use_operand_p use_p;
450 ssa_op_iter op_iter;
451 FOR_EACH_PHI_ARG (use_p, phi, op_iter, SSA_OP_USE)
452 if (stmt_vec_info def_info = loop_vinfo->lookup_def (USE_FROM_PTR (use_p)))
453 if (STMT_VINFO_DEF_TYPE (def_info) == vect_double_reduction_def)
454 return true;
455 return false;
458 /* Function vect_analyze_scalar_cycles_1.
460 Examine the cross iteration def-use cycles of scalar variables
461 in LOOP. LOOP_VINFO represents the loop that is now being
462 considered for vectorization (can be LOOP, or an outer-loop
463 enclosing LOOP). */
465 static void
466 vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, class loop *loop)
468 basic_block bb = loop->header;
469 tree init, step;
470 auto_vec<stmt_vec_info, 64> worklist;
471 gphi_iterator gsi;
472 bool double_reduc, reduc_chain;
474 DUMP_VECT_SCOPE ("vect_analyze_scalar_cycles");
476 /* First - identify all inductions. Reduction detection assumes that all the
477 inductions have been identified; therefore, this order must not be
478 changed. */
479 for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
481 gphi *phi = gsi.phi ();
482 tree access_fn = NULL;
483 tree def = PHI_RESULT (phi);
484 stmt_vec_info stmt_vinfo = loop_vinfo->lookup_stmt (phi);
486 if (dump_enabled_p ())
487 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G", phi);
489 /* Skip virtual phi's. The data dependences that are associated with
490 virtual defs/uses (i.e., memory accesses) are analyzed elsewhere. */
491 if (virtual_operand_p (def))
492 continue;
494 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_unknown_def_type;
496 /* Analyze the evolution function. */
497 access_fn = analyze_scalar_evolution (loop, def);
498 if (access_fn)
500 STRIP_NOPS (access_fn);
501 if (dump_enabled_p ())
502 dump_printf_loc (MSG_NOTE, vect_location,
503 "Access function of PHI: %T\n", access_fn);
504 STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
505 = initial_condition_in_loop_num (access_fn, loop->num);
506 STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo)
507 = evolution_part_in_loop_num (access_fn, loop->num);
510 if (!access_fn
511 || vect_inner_phi_in_double_reduction_p (loop_vinfo, phi)
512 || !vect_is_simple_iv_evolution (loop->num, access_fn, &init, &step)
513 || (LOOP_VINFO_LOOP (loop_vinfo) != loop
514 && TREE_CODE (step) != INTEGER_CST))
516 worklist.safe_push (stmt_vinfo);
517 continue;
520 gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
521 != NULL_TREE);
522 gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo) != NULL_TREE);
524 if (dump_enabled_p ())
525 dump_printf_loc (MSG_NOTE, vect_location, "Detected induction.\n");
526 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_induction_def;
530 /* Second - identify all reductions and nested cycles. */
531 while (worklist.length () > 0)
533 stmt_vec_info stmt_vinfo = worklist.pop ();
534 gphi *phi = as_a <gphi *> (stmt_vinfo->stmt);
535 tree def = PHI_RESULT (phi);
537 if (dump_enabled_p ())
538 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G", phi);
540 gcc_assert (!virtual_operand_p (def)
541 && STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_unknown_def_type);
543 stmt_vec_info reduc_stmt_info
544 = vect_is_simple_reduction (loop_vinfo, stmt_vinfo, &double_reduc,
545 &reduc_chain);
546 if (reduc_stmt_info)
548 STMT_VINFO_REDUC_DEF (stmt_vinfo) = reduc_stmt_info;
549 STMT_VINFO_REDUC_DEF (reduc_stmt_info) = stmt_vinfo;
550 if (double_reduc)
552 if (dump_enabled_p ())
553 dump_printf_loc (MSG_NOTE, vect_location,
554 "Detected double reduction.\n");
556 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_double_reduction_def;
557 STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_double_reduction_def;
559 else
561 if (loop != LOOP_VINFO_LOOP (loop_vinfo))
563 if (dump_enabled_p ())
564 dump_printf_loc (MSG_NOTE, vect_location,
565 "Detected vectorizable nested cycle.\n");
567 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_nested_cycle;
569 else
571 if (dump_enabled_p ())
572 dump_printf_loc (MSG_NOTE, vect_location,
573 "Detected reduction.\n");
575 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def;
576 STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_reduction_def;
577 /* Store the reduction cycles for possible vectorization in
578 loop-aware SLP if it was not detected as a reduction
579 chain. */
580 if (! reduc_chain)
581 LOOP_VINFO_REDUCTIONS (loop_vinfo).safe_push
582 (reduc_stmt_info);
586 else
587 if (dump_enabled_p ())
588 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
589 "Unknown def-use cycle pattern.\n");
594 /* Function vect_analyze_scalar_cycles.
596 Examine the cross iteration def-use cycles of scalar variables, by
597 analyzing the loop-header PHIs of scalar variables. Classify each
598 cycle as one of the following: invariant, induction, reduction, unknown.
599 We do that for the loop represented by LOOP_VINFO, and also for its
600 inner loop, if it exists.
601 Examples for scalar cycles:
603 Example1: reduction:
605 loop1:
606 for (i=0; i<N; i++)
607 sum += a[i];
609 Example2: induction:
611 loop2:
612 for (i=0; i<N; i++)
613 a[i] = i; */
615 static void
616 vect_analyze_scalar_cycles (loop_vec_info loop_vinfo)
618 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
620 vect_analyze_scalar_cycles_1 (loop_vinfo, loop);
622 /* When vectorizing an outer-loop, the inner-loop is executed sequentially.
623 Reductions in such inner-loop therefore have different properties than
624 the reductions in the nest that gets vectorized:
625 1. When vectorized, they are executed in the same order as in the original
626 scalar loop, so we can't change the order of computation when
627 vectorizing them.
628 2. FIXME: Inner-loop reductions can be used in the inner-loop, so the
629 current checks are too strict. */
631 if (loop->inner)
632 vect_analyze_scalar_cycles_1 (loop_vinfo, loop->inner);
635 /* Transfer group and reduction information from STMT_INFO to its
636 pattern stmt. */
638 static void
639 vect_fixup_reduc_chain (stmt_vec_info stmt_info)
641 stmt_vec_info firstp = STMT_VINFO_RELATED_STMT (stmt_info);
642 stmt_vec_info stmtp;
643 gcc_assert (!REDUC_GROUP_FIRST_ELEMENT (firstp)
644 && REDUC_GROUP_FIRST_ELEMENT (stmt_info));
645 REDUC_GROUP_SIZE (firstp) = REDUC_GROUP_SIZE (stmt_info);
648 stmtp = STMT_VINFO_RELATED_STMT (stmt_info);
649 gcc_checking_assert (STMT_VINFO_DEF_TYPE (stmtp)
650 == STMT_VINFO_DEF_TYPE (stmt_info));
651 REDUC_GROUP_FIRST_ELEMENT (stmtp) = firstp;
652 stmt_info = REDUC_GROUP_NEXT_ELEMENT (stmt_info);
653 if (stmt_info)
654 REDUC_GROUP_NEXT_ELEMENT (stmtp)
655 = STMT_VINFO_RELATED_STMT (stmt_info);
657 while (stmt_info);
660 /* Fixup scalar cycles that now have their stmts detected as patterns. */
662 static void
663 vect_fixup_scalar_cycles_with_patterns (loop_vec_info loop_vinfo)
665 stmt_vec_info first;
666 unsigned i;
668 FOR_EACH_VEC_ELT (LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo), i, first)
670 stmt_vec_info next = REDUC_GROUP_NEXT_ELEMENT (first);
671 while (next)
673 if ((STMT_VINFO_IN_PATTERN_P (next)
674 != STMT_VINFO_IN_PATTERN_P (first))
675 || STMT_VINFO_REDUC_IDX (vect_stmt_to_vectorize (next)) == -1)
676 break;
677 next = REDUC_GROUP_NEXT_ELEMENT (next);
679 /* If all reduction chain members are well-formed patterns, adjust
680 the group to group the pattern stmts instead. */
681 if (! next
682 && STMT_VINFO_REDUC_IDX (vect_stmt_to_vectorize (first)) != -1)
684 if (STMT_VINFO_IN_PATTERN_P (first))
686 vect_fixup_reduc_chain (first);
687 LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo)[i]
688 = STMT_VINFO_RELATED_STMT (first);
691 /* If not all stmts in the chain are patterns or if we failed
692 to update STMT_VINFO_REDUC_IDX dissolve the chain and handle
693 it as regular reduction instead. */
694 else
696 stmt_vec_info vinfo = first;
697 stmt_vec_info last = NULL;
698 while (vinfo)
700 next = REDUC_GROUP_NEXT_ELEMENT (vinfo);
701 REDUC_GROUP_FIRST_ELEMENT (vinfo) = NULL;
702 REDUC_GROUP_NEXT_ELEMENT (vinfo) = NULL;
703 last = vinfo;
704 vinfo = next;
706 STMT_VINFO_DEF_TYPE (vect_stmt_to_vectorize (first))
707 = vect_internal_def;
708 loop_vinfo->reductions.safe_push (vect_stmt_to_vectorize (last));
709 LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).unordered_remove (i);
710 --i;
715 /* Function vect_get_loop_niters.
717 Determine how many iterations the loop is executed and place it
718 in NUMBER_OF_ITERATIONS. Place the number of latch iterations
719 in NUMBER_OF_ITERATIONSM1. Place the condition under which the
720 niter information holds in ASSUMPTIONS.
722 Return the loop exit condition. */
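/* Illustrative example, assuming the do-while form the vectorizer
   requires: if the scalar loop body executes N times (N > 0), the latch
   is executed N - 1 times, so NUMBER_OF_ITERATIONSM1 is N - 1 and
   NUMBER_OF_ITERATIONS is N.  */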
725 static gcond *
726 vect_get_loop_niters (class loop *loop, tree *assumptions,
727 tree *number_of_iterations, tree *number_of_iterationsm1)
729 edge exit = single_exit (loop);
730 class tree_niter_desc niter_desc;
731 tree niter_assumptions, niter, may_be_zero;
732 gcond *cond = get_loop_exit_condition (loop);
734 *assumptions = boolean_true_node;
735 *number_of_iterationsm1 = chrec_dont_know;
736 *number_of_iterations = chrec_dont_know;
737 DUMP_VECT_SCOPE ("get_loop_niters");
739 if (!exit)
740 return cond;
742 may_be_zero = NULL_TREE;
743 if (!number_of_iterations_exit_assumptions (loop, exit, &niter_desc, NULL)
744 || chrec_contains_undetermined (niter_desc.niter))
745 return cond;
747 niter_assumptions = niter_desc.assumptions;
748 may_be_zero = niter_desc.may_be_zero;
749 niter = niter_desc.niter;
751 if (may_be_zero && integer_zerop (may_be_zero))
752 may_be_zero = NULL_TREE;
754 if (may_be_zero)
756 if (COMPARISON_CLASS_P (may_be_zero))
758 /* Try to combine may_be_zero with assumptions; this can simplify the
759 computation of niter expression. */
760 if (niter_assumptions && !integer_nonzerop (niter_assumptions))
761 niter_assumptions = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
762 niter_assumptions,
763 fold_build1 (TRUTH_NOT_EXPR,
764 boolean_type_node,
765 may_be_zero));
766 else
767 niter = fold_build3 (COND_EXPR, TREE_TYPE (niter), may_be_zero,
768 build_int_cst (TREE_TYPE (niter), 0),
769 rewrite_to_non_trapping_overflow (niter));
771 may_be_zero = NULL_TREE;
773 else if (integer_nonzerop (may_be_zero))
775 *number_of_iterationsm1 = build_int_cst (TREE_TYPE (niter), 0);
776 *number_of_iterations = build_int_cst (TREE_TYPE (niter), 1);
777 return cond;
779 else
780 return cond;
783 *assumptions = niter_assumptions;
784 *number_of_iterationsm1 = niter;
786 /* We want the number of loop header executions, which is the number
787 of latch executions plus one.
788 ??? For UINT_MAX latch executions this number overflows to zero
789 for loops like do { n++; } while (n != 0); */
790 if (niter && !chrec_contains_undetermined (niter))
791 niter = fold_build2 (PLUS_EXPR, TREE_TYPE (niter), unshare_expr (niter),
792 build_int_cst (TREE_TYPE (niter), 1));
793 *number_of_iterations = niter;
795 return cond;
798 /* Function bb_in_loop_p
800 Used as predicate for dfs order traversal of the loop bbs. */
802 static bool
803 bb_in_loop_p (const_basic_block bb, const void *data)
805 const class loop *const loop = (const class loop *)data;
806 if (flow_bb_inside_loop_p (loop, bb))
807 return true;
808 return false;
812 /* Create and initialize a new loop_vec_info struct for LOOP_IN, as well as
813 stmt_vec_info structs for all the stmts in LOOP_IN. */
815 _loop_vec_info::_loop_vec_info (class loop *loop_in, vec_info_shared *shared)
816 : vec_info (vec_info::loop, init_cost (loop_in), shared),
817 loop (loop_in),
818 bbs (XCNEWVEC (basic_block, loop->num_nodes)),
819 num_itersm1 (NULL_TREE),
820 num_iters (NULL_TREE),
821 num_iters_unchanged (NULL_TREE),
822 num_iters_assumptions (NULL_TREE),
823 th (0),
824 versioning_threshold (0),
825 vectorization_factor (0),
826 max_vectorization_factor (0),
827 mask_skip_niters (NULL_TREE),
828 rgroup_compare_type (NULL_TREE),
829 simd_if_cond (NULL_TREE),
830 unaligned_dr (NULL),
831 peeling_for_alignment (0),
832 ptr_mask (0),
833 ivexpr_map (NULL),
834 scan_map (NULL),
835 slp_unrolling_factor (1),
836 single_scalar_iteration_cost (0),
837 vec_outside_cost (0),
838 vec_inside_cost (0),
839 vectorizable (false),
840 can_use_partial_vectors_p (param_vect_partial_vector_usage != 0),
841 using_partial_vectors_p (false),
842 epil_using_partial_vectors_p (false),
843 peeling_for_gaps (false),
844 peeling_for_niter (false),
845 no_data_dependencies (false),
846 has_mask_store (false),
847 scalar_loop_scaling (profile_probability::uninitialized ()),
848 scalar_loop (NULL),
849 orig_loop_info (NULL)
851 /* CHECKME: We want to visit all BBs before their successors (except for
852 latch blocks, for which this assertion wouldn't hold). In the simple
853 case of the loop forms we allow, a dfs order of the BBs would be the same
854 as reversed postorder traversal, so we are safe. */
856 unsigned int nbbs = dfs_enumerate_from (loop->header, 0, bb_in_loop_p,
857 bbs, loop->num_nodes, loop);
858 gcc_assert (nbbs == loop->num_nodes);
860 for (unsigned int i = 0; i < nbbs; i++)
862 basic_block bb = bbs[i];
863 gimple_stmt_iterator si;
865 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
867 gimple *phi = gsi_stmt (si);
868 gimple_set_uid (phi, 0);
869 add_stmt (phi);
872 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
874 gimple *stmt = gsi_stmt (si);
875 gimple_set_uid (stmt, 0);
876 if (is_gimple_debug (stmt))
877 continue;
878 add_stmt (stmt);
879 /* If the .GOMP_SIMD_LANE call for the current loop has 3 arguments, the
880 third argument is the #pragma omp simd if (x) condition: when it is 0,
881 the loop shouldn't be vectorized; when it is a non-zero constant, it
882 should be vectorized normally; otherwise the loop is versioned, with the
883 vectorized copy executed only if the condition is non-zero at runtime. */
884 if (loop_in->simduid
885 && is_gimple_call (stmt)
886 && gimple_call_internal_p (stmt)
887 && gimple_call_internal_fn (stmt) == IFN_GOMP_SIMD_LANE
888 && gimple_call_num_args (stmt) >= 3
889 && TREE_CODE (gimple_call_arg (stmt, 0)) == SSA_NAME
890 && (loop_in->simduid
891 == SSA_NAME_VAR (gimple_call_arg (stmt, 0))))
893 tree arg = gimple_call_arg (stmt, 2);
894 if (integer_zerop (arg) || TREE_CODE (arg) == SSA_NAME)
895 simd_if_cond = arg;
896 else
897 gcc_assert (integer_nonzerop (arg));
902 epilogue_vinfos.create (6);
905 /* Free all levels of rgroup CONTROLS. */
907 void
908 release_vec_loop_controls (vec<rgroup_controls> *controls)
910 rgroup_controls *rgc;
911 unsigned int i;
912 FOR_EACH_VEC_ELT (*controls, i, rgc)
913 rgc->controls.release ();
914 controls->release ();
917 /* Free all memory used by the _loop_vec_info, as well as all the
918 stmt_vec_info structs of all the stmts in the loop. */
920 _loop_vec_info::~_loop_vec_info ()
922 free (bbs);
924 release_vec_loop_controls (&masks);
925 release_vec_loop_controls (&lens);
926 delete ivexpr_map;
927 delete scan_map;
928 epilogue_vinfos.release ();
930 loop->aux = NULL;
933 /* Return an invariant or register for EXPR and emit necessary
934 computations in the LOOP_VINFO loop preheader. */
936 tree
937 cse_and_gimplify_to_preheader (loop_vec_info loop_vinfo, tree expr)
939 if (is_gimple_reg (expr)
940 || is_gimple_min_invariant (expr))
941 return expr;
943 if (! loop_vinfo->ivexpr_map)
944 loop_vinfo->ivexpr_map = new hash_map<tree_operand_hash, tree>;
945 tree &cached = loop_vinfo->ivexpr_map->get_or_insert (expr);
946 if (! cached)
948 gimple_seq stmts = NULL;
949 cached = force_gimple_operand (unshare_expr (expr),
950 &stmts, true, NULL_TREE);
951 if (stmts)
953 edge e = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
954 gsi_insert_seq_on_edge_immediate (e, stmts);
957 return cached;
960 /* Return true if we can use CMP_TYPE as the comparison type to produce
961 all masks required to mask LOOP_VINFO. */
963 static bool
964 can_produce_all_loop_masks_p (loop_vec_info loop_vinfo, tree cmp_type)
966 rgroup_controls *rgm;
967 unsigned int i;
968 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), i, rgm)
969 if (rgm->type != NULL_TREE
970 && !direct_internal_fn_supported_p (IFN_WHILE_ULT,
971 cmp_type, rgm->type,
972 OPTIMIZE_FOR_SPEED))
973 return false;
974 return true;
977 /* Calculate the maximum number of scalars per iteration for every
978 rgroup in LOOP_VINFO. */
980 static unsigned int
981 vect_get_max_nscalars_per_iter (loop_vec_info loop_vinfo)
983 unsigned int res = 1;
984 unsigned int i;
985 rgroup_controls *rgm;
986 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), i, rgm)
987 res = MAX (res, rgm->max_nscalars_per_iter);
988 return res;
991 /* Calculate the minimum precision necessary to represent:
993 MAX_NITERS * FACTOR
995 as an unsigned integer, where MAX_NITERS is the maximum number of
996 loop header iterations for the original scalar form of LOOP_VINFO. */
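/* Illustrative numbers only: with MAX_NITERS = 1000 and FACTOR = 4 the
   product is 4000, which needs 12 bits as an unsigned integer, since
   2^11 = 2048 < 4000 <= 4095 = 2^12 - 1.  */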
998 static unsigned
999 vect_min_prec_for_max_niters (loop_vec_info loop_vinfo, unsigned int factor)
1001 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1003 /* Get the maximum number of iterations that is representable
1004 in the counter type. */
1005 tree ni_type = TREE_TYPE (LOOP_VINFO_NITERSM1 (loop_vinfo));
1006 widest_int max_ni = wi::to_widest (TYPE_MAX_VALUE (ni_type)) + 1;
1008 /* Get a more refined estimate for the number of iterations. */
1009 widest_int max_back_edges;
1010 if (max_loop_iterations (loop, &max_back_edges))
1011 max_ni = wi::smin (max_ni, max_back_edges + 1);
1013 /* Work out how many bits we need to represent the limit. */
1014 return wi::min_precision (max_ni * factor, UNSIGNED);
1017 /* True if the loop needs peeling or partial vectors when vectorized. */
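/* Illustrative numbers only: a loop with a known iteration count of 10
   and a vectorization factor of 4 (and no peeling for alignment or
   gaps) leaves 2 scalar iterations over, so it needs either an epilogue
   (peeling) or partially-populated vectors.  */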
1019 static bool
1020 vect_need_peeling_or_partial_vectors_p (loop_vec_info loop_vinfo)
1022 unsigned HOST_WIDE_INT const_vf;
1023 HOST_WIDE_INT max_niter
1024 = likely_max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
1026 unsigned th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
1027 if (!th && LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo))
1028 th = LOOP_VINFO_COST_MODEL_THRESHOLD (LOOP_VINFO_ORIG_LOOP_INFO
1029 (loop_vinfo));
1031 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1032 && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) >= 0)
1034 /* Work out the (constant) number of iterations that need to be
1035 peeled for reasons other than niters. */
1036 unsigned int peel_niter = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
1037 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
1038 peel_niter += 1;
1039 if (!multiple_p (LOOP_VINFO_INT_NITERS (loop_vinfo) - peel_niter,
1040 LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
1041 return true;
1043 else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
1044 /* ??? When peeling for gaps but not alignment, we could
1045 try to check whether the (variable) niters is known to be
1046 VF * N + 1. That's something of a niche case though. */
1047 || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
1048 || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&const_vf)
1049 || ((tree_ctz (LOOP_VINFO_NITERS (loop_vinfo))
1050 < (unsigned) exact_log2 (const_vf))
1051 /* In case of versioning, check if the maximum number of
1052 iterations is greater than th. If they are identical,
1053 the epilogue is unnecessary. */
1054 && (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
1055 || ((unsigned HOST_WIDE_INT) max_niter
1056 > (th / const_vf) * const_vf))))
1057 return true;
1059 return false;
1062 /* Each statement in LOOP_VINFO can be masked where necessary. Check
1063 whether we can actually generate the masks required. Return true if so,
1064 storing the type of the scalar IV in LOOP_VINFO_RGROUP_COMPARE_TYPE. */
1066 static bool
1067 vect_verify_full_masking (loop_vec_info loop_vinfo)
1069 unsigned int min_ni_width;
1070 unsigned int max_nscalars_per_iter
1071 = vect_get_max_nscalars_per_iter (loop_vinfo);
1073 /* Use a normal loop if there are no statements that need masking.
1074 This only happens in rare degenerate cases: it means that the loop
1075 has no loads, no stores, and no live-out values. */
1076 if (LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
1077 return false;
1079 /* Work out how many bits we need to represent the limit. */
1080 min_ni_width
1081 = vect_min_prec_for_max_niters (loop_vinfo, max_nscalars_per_iter);
1083 /* Find a scalar mode for which WHILE_ULT is supported. */
1084 opt_scalar_int_mode cmp_mode_iter;
1085 tree cmp_type = NULL_TREE;
1086 tree iv_type = NULL_TREE;
1087 widest_int iv_limit = vect_iv_limit_for_partial_vectors (loop_vinfo);
1088 unsigned int iv_precision = UINT_MAX;
1090 if (iv_limit != -1)
1091 iv_precision = wi::min_precision (iv_limit * max_nscalars_per_iter,
1092 UNSIGNED);
1094 FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)
1096 unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ());
1097 if (cmp_bits >= min_ni_width
1098 && targetm.scalar_mode_supported_p (cmp_mode_iter.require ()))
1100 tree this_type = build_nonstandard_integer_type (cmp_bits, true);
1101 if (this_type
1102 && can_produce_all_loop_masks_p (loop_vinfo, this_type))
1104 /* Although we could stop as soon as we find a valid mode,
1105 there are at least two reasons why that's not always the
1106 best choice:
1108 - An IV that's Pmode or wider is more likely to be reusable
1109 in address calculations than an IV that's narrower than
1110 Pmode.
1112 - Doing the comparison in IV_PRECISION or wider allows
1113 a natural 0-based IV, whereas using a narrower comparison
1114 type requires mitigations against wrap-around.
1116 Conversely, if the IV limit is variable, doing the comparison
1117 in a wider type than the original type can introduce
1118 unnecessary extensions, so picking the widest valid mode
1119 is not always a good choice either.
1121 Here we prefer the first IV type that's Pmode or wider,
1122 and the first comparison type that's IV_PRECISION or wider.
1123 (The comparison type must be no wider than the IV type,
1124 to avoid extensions in the vector loop.)
1126 ??? We might want to try continuing beyond Pmode for ILP32
1127 targets if CMP_BITS < IV_PRECISION. */
1128 iv_type = this_type;
1129 if (!cmp_type || iv_precision > TYPE_PRECISION (cmp_type))
1130 cmp_type = this_type;
1131 if (cmp_bits >= GET_MODE_BITSIZE (Pmode))
1132 break;
1137 if (!cmp_type)
1138 return false;
1140 LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo) = cmp_type;
1141 LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo) = iv_type;
1142 return true;
1145 /* Check whether we can use vector access with length based on precision
1146 comparison. So far, to keep it simple, we only allow the case that the
1147 precision of the target supported length is larger than the precision
1148 required by loop niters. */
1150 static bool
1151 vect_verify_loop_lens (loop_vec_info loop_vinfo)
1153 if (LOOP_VINFO_LENS (loop_vinfo).is_empty ())
1154 return false;
1156 unsigned int max_nitems_per_iter = 1;
1157 unsigned int i;
1158 rgroup_controls *rgl;
1159 /* Find the maximum number of items per iteration for every rgroup. */
1160 FOR_EACH_VEC_ELT (LOOP_VINFO_LENS (loop_vinfo), i, rgl)
1162 unsigned nitems_per_iter = rgl->max_nscalars_per_iter * rgl->factor;
1163 max_nitems_per_iter = MAX (max_nitems_per_iter, nitems_per_iter);
1166 /* Work out how many bits we need to represent the length limit. */
1167 unsigned int min_ni_prec
1168 = vect_min_prec_for_max_niters (loop_vinfo, max_nitems_per_iter);
1170 /* Now use the maximum of below precisions for one suitable IV type:
1171 - the IV's natural precision
1172 - the precision needed to hold: the maximum number of scalar
1173 iterations multiplied by the scale factor (min_ni_prec above)
1174 - the Pmode precision
1176 If min_ni_prec is less than the precision of the current niters,
1177 we prefer to still use the niters type. Prefer to use Pmode and a
1178 wider IV to avoid narrow conversions. */
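  /* Illustrative numbers only: with a 32-bit niters type (ni_prec = 32),
     min_ni_prec = 20 so far and a 64-bit Pmode, the two MAX computations
     below give min_ni_prec = 64, and the mode walk then picks a 64-bit
     unsigned IV type (assuming the target supports such a mode).  */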
1180 unsigned int ni_prec
1181 = TYPE_PRECISION (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)));
1182 min_ni_prec = MAX (min_ni_prec, ni_prec);
1183 min_ni_prec = MAX (min_ni_prec, GET_MODE_BITSIZE (Pmode));
1185 tree iv_type = NULL_TREE;
1186 opt_scalar_int_mode tmode_iter;
1187 FOR_EACH_MODE_IN_CLASS (tmode_iter, MODE_INT)
1189 scalar_mode tmode = tmode_iter.require ();
1190 unsigned int tbits = GET_MODE_BITSIZE (tmode);
1192 /* ??? Do we really want to construct one IV whose precision exceeds
1193 BITS_PER_WORD? */
1194 if (tbits > BITS_PER_WORD)
1195 break;
1197 /* Find the first available standard integral type. */
1198 if (tbits >= min_ni_prec && targetm.scalar_mode_supported_p (tmode))
1200 iv_type = build_nonstandard_integer_type (tbits, true);
1201 break;
1205 if (!iv_type)
1207 if (dump_enabled_p ())
1208 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1209 "can't vectorize with length-based partial vectors"
1210 " because there is no suitable iv type.\n");
1211 return false;
1214 LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo) = iv_type;
1215 LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo) = iv_type;
1217 return true;
1220 /* Calculate the cost of one scalar iteration of the loop. */
1221 static void
1222 vect_compute_single_scalar_iteration_cost (loop_vec_info loop_vinfo)
1224 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1225 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1226 int nbbs = loop->num_nodes, factor;
1227 int innerloop_iters, i;
1229 DUMP_VECT_SCOPE ("vect_compute_single_scalar_iteration_cost");
1231 /* Gather costs for statements in the scalar loop. */
1233 /* FORNOW. */
1234 innerloop_iters = 1;
1235 if (loop->inner)
1236 innerloop_iters = 50; /* FIXME */
1238 for (i = 0; i < nbbs; i++)
1240 gimple_stmt_iterator si;
1241 basic_block bb = bbs[i];
1243 if (bb->loop_father == loop->inner)
1244 factor = innerloop_iters;
1245 else
1246 factor = 1;
1248 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1250 gimple *stmt = gsi_stmt (si);
1251 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (stmt);
1253 if (!is_gimple_assign (stmt) && !is_gimple_call (stmt))
1254 continue;
1256 /* Skip stmts that are not vectorized inside the loop. */
1257 stmt_vec_info vstmt_info = vect_stmt_to_vectorize (stmt_info);
1258 if (!STMT_VINFO_RELEVANT_P (vstmt_info)
1259 && (!STMT_VINFO_LIVE_P (vstmt_info)
1260 || !VECTORIZABLE_CYCLE_DEF
1261 (STMT_VINFO_DEF_TYPE (vstmt_info))))
1262 continue;
1264 vect_cost_for_stmt kind;
1265 if (STMT_VINFO_DATA_REF (stmt_info))
1267 if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
1268 kind = scalar_load;
1269 else
1270 kind = scalar_store;
1272 else if (vect_nop_conversion_p (stmt_info))
1273 continue;
1274 else
1275 kind = scalar_stmt;
1277 record_stmt_cost (&LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1278 factor, kind, stmt_info, 0, vect_prologue);
1282 /* Now accumulate cost. */
1283 void *target_cost_data = init_cost (loop);
1284 stmt_info_for_cost *si;
1285 int j;
1286 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1287 j, si)
1288 (void) add_stmt_cost (loop_vinfo, target_cost_data, si->count,
1289 si->kind, si->stmt_info, si->vectype,
1290 si->misalign, vect_body);
1291 unsigned dummy, body_cost = 0;
1292 finish_cost (target_cost_data, &dummy, &body_cost, &dummy);
1293 destroy_cost_data (target_cost_data);
1294 LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo) = body_cost;
1298 /* Function vect_analyze_loop_form_1.
1300 Verify that certain CFG restrictions hold, including:
1301 - the loop has a pre-header
1302 - the loop has a single entry and exit
1303 - the loop exit condition is simple enough
1304 - the number of iterations can be analyzed, i.e., a countable loop. The
1305 niter could be analyzed under some assumptions. */
1307 opt_result
1308 vect_analyze_loop_form_1 (class loop *loop, gcond **loop_cond,
1309 tree *assumptions, tree *number_of_iterationsm1,
1310 tree *number_of_iterations, gcond **inner_loop_cond)
1312 DUMP_VECT_SCOPE ("vect_analyze_loop_form");
1314 /* Different restrictions apply when we are considering an inner-most loop,
1315 vs. an outer (nested) loop.
1316 (FORNOW. May want to relax some of these restrictions in the future). */
1318 if (!loop->inner)
1320 /* Inner-most loop. We currently require that the number of BBs is
1321 exactly 2 (the header and latch). Vectorizable inner-most loops
1322 look like this:
1324 (pre-header)
1326 header <--------+
1327 | | |
1328 | +--> latch --+
1330 (exit-bb) */
1332 if (loop->num_nodes != 2)
1333 return opt_result::failure_at (vect_location,
1334 "not vectorized:"
1335 " control flow in loop.\n");
1337 if (empty_block_p (loop->header))
1338 return opt_result::failure_at (vect_location,
1339 "not vectorized: empty loop.\n");
1341 else
1343 class loop *innerloop = loop->inner;
1344 edge entryedge;
1346 /* Nested loop. We currently require that the loop is doubly-nested,
1347 contains a single inner loop, and the number of BBs is exactly 5.
1348 Vectorizable outer-loops look like this:
1350 (pre-header)
1352 header <---+
1354 inner-loop |
1356 tail ------+
1358 (exit-bb)
1360 The inner-loop has the properties expected of inner-most loops
1361 as described above. */
1363 if ((loop->inner)->inner || (loop->inner)->next)
1364 return opt_result::failure_at (vect_location,
1365 "not vectorized:"
1366 " multiple nested loops.\n");
1368 if (loop->num_nodes != 5)
1369 return opt_result::failure_at (vect_location,
1370 "not vectorized:"
1371 " control flow in loop.\n");
1373 entryedge = loop_preheader_edge (innerloop);
1374 if (entryedge->src != loop->header
1375 || !single_exit (innerloop)
1376 || single_exit (innerloop)->dest != EDGE_PRED (loop->latch, 0)->src)
1377 return opt_result::failure_at (vect_location,
1378 "not vectorized:"
1379 " unsupported outerloop form.\n");
1381 /* Analyze the inner-loop. */
1382 tree inner_niterm1, inner_niter, inner_assumptions;
1383 opt_result res
1384 = vect_analyze_loop_form_1 (loop->inner, inner_loop_cond,
1385 &inner_assumptions, &inner_niterm1,
1386 &inner_niter, NULL);
1387 if (!res)
1389 if (dump_enabled_p ())
1390 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1391 "not vectorized: Bad inner loop.\n");
1392 return res;
1395 /* Don't support analyzing niter under assumptions for inner
1396 loop. */
1397 if (!integer_onep (inner_assumptions))
1398 return opt_result::failure_at (vect_location,
1399 "not vectorized: Bad inner loop.\n");
1401 if (!expr_invariant_in_loop_p (loop, inner_niter))
1402 return opt_result::failure_at (vect_location,
1403 "not vectorized: inner-loop count not"
1404 " invariant.\n");
1406 if (dump_enabled_p ())
1407 dump_printf_loc (MSG_NOTE, vect_location,
1408 "Considering outer-loop vectorization.\n");
1411 if (!single_exit (loop))
1412 return opt_result::failure_at (vect_location,
1413 "not vectorized: multiple exits.\n");
1414 if (EDGE_COUNT (loop->header->preds) != 2)
1415 return opt_result::failure_at (vect_location,
1416 "not vectorized:"
1417 " too many incoming edges.\n");
1419 /* We assume that the loop exit condition is at the end of the loop, i.e.,
1420 that the loop is represented as a do-while (with a proper if-guard
1421 before the loop if needed), where the loop header contains all the
1422 executable statements, and the latch is empty. */
1423 if (!empty_block_p (loop->latch)
1424 || !gimple_seq_empty_p (phi_nodes (loop->latch)))
1425 return opt_result::failure_at (vect_location,
1426 "not vectorized: latch block not empty.\n");
1428 /* Make sure the exit is not abnormal. */
1429 edge e = single_exit (loop);
1430 if (e->flags & EDGE_ABNORMAL)
1431 return opt_result::failure_at (vect_location,
1432 "not vectorized:"
1433 " abnormal loop exit edge.\n");
1435 *loop_cond = vect_get_loop_niters (loop, assumptions, number_of_iterations,
1436 number_of_iterationsm1);
1437 if (!*loop_cond)
1438 return opt_result::failure_at
1439 (vect_location,
1440 "not vectorized: complicated exit condition.\n");
1442 if (integer_zerop (*assumptions)
1443 || !*number_of_iterations
1444 || chrec_contains_undetermined (*number_of_iterations))
1445 return opt_result::failure_at
1446 (*loop_cond,
1447 "not vectorized: number of iterations cannot be computed.\n");
1449 if (integer_zerop (*number_of_iterations))
1450 return opt_result::failure_at
1451 (*loop_cond,
1452 "not vectorized: number of iterations = 0.\n");
1454 return opt_result::success ();
1457 /* Analyze LOOP form and return a loop_vec_info if it is of suitable form. */
1459 opt_loop_vec_info
1460 vect_analyze_loop_form (class loop *loop, vec_info_shared *shared)
1462 tree assumptions, number_of_iterations, number_of_iterationsm1;
1463 gcond *loop_cond, *inner_loop_cond = NULL;
1465 opt_result res
1466 = vect_analyze_loop_form_1 (loop, &loop_cond,
1467 &assumptions, &number_of_iterationsm1,
1468 &number_of_iterations, &inner_loop_cond);
1469 if (!res)
1470 return opt_loop_vec_info::propagate_failure (res);
1472 loop_vec_info loop_vinfo = new _loop_vec_info (loop, shared);
1473 LOOP_VINFO_NITERSM1 (loop_vinfo) = number_of_iterationsm1;
1474 LOOP_VINFO_NITERS (loop_vinfo) = number_of_iterations;
1475 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = number_of_iterations;
1476 if (!integer_onep (assumptions))
1478 /* We consider vectorizing this loop by versioning it under
1479 some assumptions. In order to do this, we need to clear
1480 existing information computed by scev and niter analyzer. */
1481 scev_reset_htab ();
1482 free_numbers_of_iterations_estimates (loop);
1483 /* Also set flag for this loop so that following scev and niter
1484 analysis are done under the assumptions. */
1485 loop_constraint_set (loop, LOOP_C_FINITE);
1486 /* Also record the assumptions for versioning. */
1487 LOOP_VINFO_NITERS_ASSUMPTIONS (loop_vinfo) = assumptions;
1490 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1492 if (dump_enabled_p ())
1494 dump_printf_loc (MSG_NOTE, vect_location,
1495 "Symbolic number of iterations is ");
1496 dump_generic_expr (MSG_NOTE, TDF_DETAILS, number_of_iterations);
1497 dump_printf (MSG_NOTE, "\n");
1501 stmt_vec_info loop_cond_info = loop_vinfo->lookup_stmt (loop_cond);
1502 STMT_VINFO_TYPE (loop_cond_info) = loop_exit_ctrl_vec_info_type;
1503 if (inner_loop_cond)
1505 stmt_vec_info inner_loop_cond_info
1506 = loop_vinfo->lookup_stmt (inner_loop_cond);
1507 STMT_VINFO_TYPE (inner_loop_cond_info) = loop_exit_ctrl_vec_info_type;
1510 gcc_assert (!loop->aux);
1511 loop->aux = loop_vinfo;
1512 return opt_loop_vec_info::success (loop_vinfo);
1517 /* Scan the loop stmts and, depending on whether there are any (non-)SLP
1518 statements, update the vectorization factor. */
1520 static void
1521 vect_update_vf_for_slp (loop_vec_info loop_vinfo)
1523 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1524 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1525 int nbbs = loop->num_nodes;
1526 poly_uint64 vectorization_factor;
1527 int i;
1529 DUMP_VECT_SCOPE ("vect_update_vf_for_slp");
1531 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1532 gcc_assert (known_ne (vectorization_factor, 0U));
1534 /* If all the stmts in the loop can be SLPed, we perform only SLP, and the
1535 vectorization factor of the loop is the unrolling factor required by
1536 the SLP instances. If that unrolling factor is 1, we say that we
1537 perform pure SLP on the loop; cross-iteration parallelism is not
1538 exploited. */
1539 bool only_slp_in_loop = true;
1540 for (i = 0; i < nbbs; i++)
1542 basic_block bb = bbs[i];
1543 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
1544 gsi_next (&si))
1546 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (si.phi ());
1547 if (!stmt_info)
1548 continue;
1549 if ((STMT_VINFO_RELEVANT_P (stmt_info)
1550 || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1551 && !PURE_SLP_STMT (stmt_info))
1552 /* STMT needs both SLP and loop-based vectorization. */
1553 only_slp_in_loop = false;
1555 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1556 gsi_next (&si))
1558 if (is_gimple_debug (gsi_stmt (si)))
1559 continue;
1560 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
1561 stmt_info = vect_stmt_to_vectorize (stmt_info);
1562 if ((STMT_VINFO_RELEVANT_P (stmt_info)
1563 || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1564 && !PURE_SLP_STMT (stmt_info))
1565 /* STMT needs both SLP and loop-based vectorization. */
1566 only_slp_in_loop = false;
1570 if (only_slp_in_loop)
1572 if (dump_enabled_p ())
1573 dump_printf_loc (MSG_NOTE, vect_location,
1574 "Loop contains only SLP stmts\n");
1575 vectorization_factor = LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo);
1577 else
1579 if (dump_enabled_p ())
1580 dump_printf_loc (MSG_NOTE, vect_location,
1581 "Loop contains SLP and non-SLP stmts\n");
1582 /* Both the vectorization factor and unroll factor have the form
1583 GET_MODE_SIZE (loop_vinfo->vector_mode) * X for some rational X,
1584 so they must have a common multiple. */
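  /* Illustrative numbers only: a current vectorization factor of 16 and
     an SLP unrolling factor of 8 are merged to their least common
     multiple, 16.  */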
1585 vectorization_factor
1586 = force_common_multiple (vectorization_factor,
1587 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo));
1590 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
1591 if (dump_enabled_p ())
1593 dump_printf_loc (MSG_NOTE, vect_location,
1594 "Updating vectorization factor to ");
1595 dump_dec (MSG_NOTE, vectorization_factor);
1596 dump_printf (MSG_NOTE, ".\n");
1600 /* Return true if STMT_INFO describes a double reduction phi and if
1601 the other phi in the reduction is also relevant for vectorization.
1602 This rejects cases such as:
1604 outer1:
1605 x_1 = PHI <x_3(outer2), ...>;
1608 inner:
1609 x_2 = ...;
1612 outer2:
1613 x_3 = PHI <x_2(inner)>;
1615 if nothing in x_2 or elsewhere makes x_1 relevant. */
1617 static bool
1618 vect_active_double_reduction_p (stmt_vec_info stmt_info)
1620 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
1621 return false;
1623 return STMT_VINFO_RELEVANT_P (STMT_VINFO_REDUC_DEF (stmt_info));
1626 /* Function vect_analyze_loop_operations.
1628 Scan the loop stmts and make sure they are all vectorizable. */
1630 static opt_result
1631 vect_analyze_loop_operations (loop_vec_info loop_vinfo)
1633 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1634 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1635 int nbbs = loop->num_nodes;
1636 int i;
1637 stmt_vec_info stmt_info;
1638 bool need_to_vectorize = false;
1639 bool ok;
1641 DUMP_VECT_SCOPE ("vect_analyze_loop_operations");
1643 auto_vec<stmt_info_for_cost> cost_vec;
1645 for (i = 0; i < nbbs; i++)
1647 basic_block bb = bbs[i];
1649 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
1650 gsi_next (&si))
1652 gphi *phi = si.phi ();
1653 ok = true;
1655 stmt_info = loop_vinfo->lookup_stmt (phi);
1656 if (dump_enabled_p ())
1657 dump_printf_loc (MSG_NOTE, vect_location, "examining phi: %G", phi);
1658 if (virtual_operand_p (gimple_phi_result (phi)))
1659 continue;
1661 /* Inner-loop loop-closed exit phi in outer-loop vectorization
1662 (i.e., a phi in the tail of the outer-loop). */
1663 if (! is_loop_header_bb_p (bb))
1665 /* FORNOW: we currently don't support the case that these phis
1666 are not used in the outer loop (unless it is a double reduction,
1667 i.e., this phi is vect_reduction_def), because this case
1668 requires actually doing something here. */
1669 if (STMT_VINFO_LIVE_P (stmt_info)
1670 && !vect_active_double_reduction_p (stmt_info))
1671 return opt_result::failure_at (phi,
1672 "Unsupported loop-closed phi"
1673 " in outer-loop.\n");
1675 /* If PHI is used in the outer loop, we check that its operand
1676 is defined in the inner loop. */
1677 if (STMT_VINFO_RELEVANT_P (stmt_info))
1679 tree phi_op;
1681 if (gimple_phi_num_args (phi) != 1)
1682 return opt_result::failure_at (phi, "unsupported phi");
1684 phi_op = PHI_ARG_DEF (phi, 0);
1685 stmt_vec_info op_def_info = loop_vinfo->lookup_def (phi_op);
1686 if (!op_def_info)
1687 return opt_result::failure_at (phi, "unsupported phi\n");
1689 if (STMT_VINFO_RELEVANT (op_def_info) != vect_used_in_outer
1690 && (STMT_VINFO_RELEVANT (op_def_info)
1691 != vect_used_in_outer_by_reduction))
1692 return opt_result::failure_at (phi, "unsupported phi\n");
1694 if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def
1695 || (STMT_VINFO_DEF_TYPE (stmt_info)
1696 == vect_double_reduction_def))
1697 && !vectorizable_lc_phi (loop_vinfo,
1698 stmt_info, NULL, NULL))
1699 return opt_result::failure_at (phi, "unsupported phi\n");
1702 continue;
1705 gcc_assert (stmt_info);
1707 if ((STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope
1708 || STMT_VINFO_LIVE_P (stmt_info))
1709 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
1710 /* A scalar-dependence cycle that we don't support. */
1711 return opt_result::failure_at (phi,
1712 "not vectorized:"
1713 " scalar dependence cycle.\n");
1715 if (STMT_VINFO_RELEVANT_P (stmt_info))
1717 need_to_vectorize = true;
1718 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
1719 && ! PURE_SLP_STMT (stmt_info))
1720 ok = vectorizable_induction (loop_vinfo,
1721 stmt_info, NULL, NULL,
1722 &cost_vec);
1723 else if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
1724 || (STMT_VINFO_DEF_TYPE (stmt_info)
1725 == vect_double_reduction_def)
1726 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
1727 && ! PURE_SLP_STMT (stmt_info))
1728 ok = vectorizable_reduction (loop_vinfo,
1729 stmt_info, NULL, NULL, &cost_vec);
1732 /* SLP PHIs are tested by vect_slp_analyze_node_operations. */
1733 if (ok
1734 && STMT_VINFO_LIVE_P (stmt_info)
1735 && !PURE_SLP_STMT (stmt_info))
1736 ok = vectorizable_live_operation (loop_vinfo,
1737 stmt_info, NULL, NULL, NULL,
1738 -1, false, &cost_vec);
1740 if (!ok)
1741 return opt_result::failure_at (phi,
1742 "not vectorized: relevant phi not "
1743 "supported: %G",
1744 static_cast <gimple *> (phi));
1747 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1748 gsi_next (&si))
1750 gimple *stmt = gsi_stmt (si);
1751 if (!gimple_clobber_p (stmt)
1752 && !is_gimple_debug (stmt))
1754 opt_result res
1755 = vect_analyze_stmt (loop_vinfo,
1756 loop_vinfo->lookup_stmt (stmt),
1757 &need_to_vectorize,
1758 NULL, NULL, &cost_vec);
1759 if (!res)
1760 return res;
1763 } /* bbs */
1765 add_stmt_costs (loop_vinfo, loop_vinfo->target_cost_data, &cost_vec);
1767 /* All operations in the loop are either irrelevant (deal with loop
1768 control, or dead), or only used outside the loop and can be moved
1769 out of the loop (e.g. invariants, inductions). The loop can be
1770 optimized away by scalar optimizations. We're better off not
1771 touching this loop. */
1772 if (!need_to_vectorize)
1774 if (dump_enabled_p ())
1775 dump_printf_loc (MSG_NOTE, vect_location,
1776 "All the computation can be taken out of the loop.\n");
1777 return opt_result::failure_at
1778 (vect_location,
1779 "not vectorized: redundant loop. no profit to vectorize.\n");
1782 return opt_result::success ();
1785 /* Return true if we know that the iteration count is smaller than the
1786 vectorization factor. Return false if it isn't, or if we can't be sure
1787 either way. */
1789 static bool
1790 vect_known_niters_smaller_than_vf (loop_vec_info loop_vinfo)
1792 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
1794 HOST_WIDE_INT max_niter;
1795 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1796 max_niter = LOOP_VINFO_INT_NITERS (loop_vinfo);
1797 else
1798 max_niter = max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
1800 if (max_niter != -1 && (unsigned HOST_WIDE_INT) max_niter < assumed_vf)
1801 return true;
1803 return false;
1806 /* Analyze the cost of the loop described by LOOP_VINFO. Decide if it
1807 is worthwhile to vectorize. Return 1 if definitely yes, 0 if
1808 definitely no, or -1 if it's worth retrying. */
1810 static int
1811 vect_analyze_loop_costing (loop_vec_info loop_vinfo)
1813 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1814 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
1816 /* Only loops that can handle partially-populated vectors can have iteration
1817 counts less than the vectorization factor. */
1818 if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
1820 if (vect_known_niters_smaller_than_vf (loop_vinfo))
1822 if (dump_enabled_p ())
1823 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1824 "not vectorized: iteration count smaller than "
1825 "vectorization factor.\n");
1826 return 0;
1830 /* If using the "very cheap" model, reject cases in which we'd keep
1831 a copy of the scalar code (even if we might be able to vectorize it). */
1832 if (flag_vect_cost_model == VECT_COST_MODEL_VERY_CHEAP
1833 && (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
1834 || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
1835 || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)))
1837 if (dump_enabled_p ())
1838 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1839 "some scalar iterations would need to be peeled\n");
1840 return 0;
1843 int min_profitable_iters, min_profitable_estimate;
1844 vect_estimate_min_profitable_iters (loop_vinfo, &min_profitable_iters,
1845 &min_profitable_estimate);
1847 if (min_profitable_iters < 0)
1849 if (dump_enabled_p ())
1850 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1851 "not vectorized: vectorization not profitable.\n");
1852 if (dump_enabled_p ())
1853 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1854 "not vectorized: vector version will never be "
1855 "profitable.\n");
1856 return -1;
1859 int min_scalar_loop_bound = (param_min_vect_loop_bound
1860 * assumed_vf);
1862 /* Use the cost model only if it is more conservative than user specified
1863 threshold. */
1864 unsigned int th = (unsigned) MAX (min_scalar_loop_bound,
1865 min_profitable_iters);
1867 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = th;
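  /* As a purely illustrative example: with assumed_vf == 4,
     param_min_vect_loop_bound == 2 and min_profitable_iters == 12,
     min_scalar_loop_bound is 2 * 4 == 8 and th becomes MAX (8, 12) == 12,
     so a loop known to run fewer than 12 iterations is rejected below.  */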
1869 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1870 && LOOP_VINFO_INT_NITERS (loop_vinfo) < th)
1872 if (dump_enabled_p ())
1873 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1874 "not vectorized: vectorization not profitable.\n");
1875 if (dump_enabled_p ())
1876 dump_printf_loc (MSG_NOTE, vect_location,
1877 "not vectorized: iteration count smaller than user "
1878 "specified loop bound parameter or minimum profitable "
1879 "iterations (whichever is more conservative).\n");
1880 return 0;
1883 /* The static profitability threshold min_profitable_estimate includes
1884 the cost of having to check at runtime whether the scalar loop
1885 should be used instead. If it turns out that we don't need or want
1886 such a check, the threshold we should use for the static estimate
1887 is simply the point at which the vector loop becomes more profitable
1888 than the scalar loop. */
1889 if (min_profitable_estimate > min_profitable_iters
1890 && !LOOP_REQUIRES_VERSIONING (loop_vinfo)
1891 && !LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
1892 && !LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
1893 && !vect_apply_runtime_profitability_check_p (loop_vinfo))
1895 if (dump_enabled_p ())
1896 dump_printf_loc (MSG_NOTE, vect_location, "no need for a runtime"
1897 " choice between the scalar and vector loops\n");
1898 min_profitable_estimate = min_profitable_iters;
1901 /* If the vector loop needs multiple iterations to be beneficial then
1902 things are probably too close to call, and the conservative thing
1903 would be to stick with the scalar code. */
1904 if (flag_vect_cost_model == VECT_COST_MODEL_VERY_CHEAP
1905 && min_profitable_estimate > (int) vect_vf_for_cost (loop_vinfo))
1907 if (dump_enabled_p ())
1908 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1909 "one iteration of the vector loop would be"
1910 " more expensive than the equivalent number of"
1911 " iterations of the scalar loop\n");
1912 return 0;
1915 HOST_WIDE_INT estimated_niter;
1917 /* If we are vectorizing an epilogue then we know the maximum number of
1918 scalar iterations it will cover is at least one lower than the
1919 vectorization factor of the main loop. */
1920 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
1921 estimated_niter
1922 = vect_vf_for_cost (LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo)) - 1;
1923 else
1925 estimated_niter = estimated_stmt_executions_int (loop);
1926 if (estimated_niter == -1)
1927 estimated_niter = likely_max_stmt_executions_int (loop);
1929 if (estimated_niter != -1
1930 && ((unsigned HOST_WIDE_INT) estimated_niter
1931 < MAX (th, (unsigned) min_profitable_estimate)))
1933 if (dump_enabled_p ())
1934 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1935 "not vectorized: estimated iteration count too "
1936 "small.\n");
1937 if (dump_enabled_p ())
1938 dump_printf_loc (MSG_NOTE, vect_location,
1939 "not vectorized: estimated iteration count smaller "
1940 "than specified loop bound parameter or minimum "
1941 "profitable iterations (whichever is more "
1942 "conservative).\n");
1943 return -1;
1946 return 1;
1949 static opt_result
1950 vect_get_datarefs_in_loop (loop_p loop, basic_block *bbs,
1951 vec<data_reference_p> *datarefs,
1952 unsigned int *n_stmts)
1954 *n_stmts = 0;
1955 for (unsigned i = 0; i < loop->num_nodes; i++)
1956 for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
1957 !gsi_end_p (gsi); gsi_next (&gsi))
1959 gimple *stmt = gsi_stmt (gsi);
1960 if (is_gimple_debug (stmt))
1961 continue;
1962 ++(*n_stmts);
1963 opt_result res = vect_find_stmt_data_reference (loop, stmt, datarefs,
1964 NULL, 0);
1965 if (!res)
1967 if (is_gimple_call (stmt) && loop->safelen)
1969 tree fndecl = gimple_call_fndecl (stmt), op;
1970 if (fndecl != NULL_TREE)
1972 cgraph_node *node = cgraph_node::get (fndecl);
1973 if (node != NULL && node->simd_clones != NULL)
1975 unsigned int j, n = gimple_call_num_args (stmt);
1976 for (j = 0; j < n; j++)
1978 op = gimple_call_arg (stmt, j);
1979 if (DECL_P (op)
1980 || (REFERENCE_CLASS_P (op)
1981 && get_base_address (op)))
1982 break;
1984 op = gimple_call_lhs (stmt);
1985 /* Ignore #pragma omp declare simd functions
1986 if they don't have data references in the
1987 call stmt itself. */
1988 if (j == n
1989 && !(op
1990 && (DECL_P (op)
1991 || (REFERENCE_CLASS_P (op)
1992 && get_base_address (op)))))
1993 continue;
1997 return res;
1999 /* If dependence analysis will give up due to the limit on the
2000 number of datarefs, stop here and fail fatally. */
2001 if (datarefs->length ()
2002 > (unsigned)param_loop_max_datarefs_for_datadeps)
2003 return opt_result::failure_at (stmt, "exceeded param "
2004 "loop-max-datarefs-for-datadeps\n");
2006 return opt_result::success ();
2009 /* Look for SLP-only access groups and turn each individual access into its own
2010 group. */
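/* For example (illustrative): a four-element interleaved load group that was
   only usable with SLP but ends up not being SLP-vectorized is split into
   four single-element groups, each becoming its own DR_GROUP_FIRST_ELEMENT
   with DR_GROUP_SIZE 1 and, for non-strided accesses, DR_GROUP_GAP 3.  */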
2011 static void
2012 vect_dissolve_slp_only_groups (loop_vec_info loop_vinfo)
2014 unsigned int i;
2015 struct data_reference *dr;
2017 DUMP_VECT_SCOPE ("vect_dissolve_slp_only_groups");
2019 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
2020 FOR_EACH_VEC_ELT (datarefs, i, dr)
2022 gcc_assert (DR_REF (dr));
2023 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (DR_STMT (dr));
2025 /* Check if the load is a part of an interleaving chain. */
2026 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
2028 stmt_vec_info first_element = DR_GROUP_FIRST_ELEMENT (stmt_info);
2029 unsigned int group_size = DR_GROUP_SIZE (first_element);
2031 /* Check whether this is an SLP-only group. */
2032 if (!STMT_SLP_TYPE (stmt_info)
2033 && STMT_VINFO_SLP_VECT_ONLY (first_element))
2035 /* Dissolve the group. */
2036 STMT_VINFO_SLP_VECT_ONLY (first_element) = false;
2038 stmt_vec_info vinfo = first_element;
2039 while (vinfo)
2041 stmt_vec_info next = DR_GROUP_NEXT_ELEMENT (vinfo);
2042 DR_GROUP_FIRST_ELEMENT (vinfo) = vinfo;
2043 DR_GROUP_NEXT_ELEMENT (vinfo) = NULL;
2044 DR_GROUP_SIZE (vinfo) = 1;
2045 if (STMT_VINFO_STRIDED_P (first_element))
2046 DR_GROUP_GAP (vinfo) = 0;
2047 else
2048 DR_GROUP_GAP (vinfo) = group_size - 1;
2049 vinfo = next;
2056 /* Determine if operating on full vectors for LOOP_VINFO might leave
2057 some scalar iterations still to do. If so, decide how we should
2058 handle those scalar iterations. The possibilities are:
2060 (1) Make LOOP_VINFO operate on partial vectors instead of full vectors.
2061 In this case:
2063 LOOP_VINFO_USING_PARTIAL_VECTORS_P == true
2064 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P == false
2065 LOOP_VINFO_PEELING_FOR_NITER == false
2067 (2) Make LOOP_VINFO operate on full vectors and use an epilogue loop
2068 to handle the remaining scalar iterations. In this case:
2070 LOOP_VINFO_USING_PARTIAL_VECTORS_P == false
2071 LOOP_VINFO_PEELING_FOR_NITER == true
2073 There are two choices:
2075 (2a) Consider vectorizing the epilogue loop at the same VF as the
2076 main loop, but using partial vectors instead of full vectors.
2077 In this case:
2079 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P == true
2081 (2b) Consider vectorizing the epilogue loop at lower VFs only.
2082 In this case:
2084 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P == false
2086 When FOR_EPILOGUE_P is true, make this determination based on the
2087 assumption that LOOP_VINFO is an epilogue loop, otherwise make it
2088 based on the assumption that LOOP_VINFO is the main loop. The caller
2089 has made sure that the number of iterations is set appropriately for
2090 this value of FOR_EPILOGUE_P. */
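/* A purely illustrative example: with VF == 4 and 10 scalar iterations,
   choice (1) runs three vector iterations, the last one partially populated
   via masks or lengths, whereas choice (2) runs two full vector iterations
   and leaves 2 scalar iterations for the epilogue loop
   (LOOP_VINFO_PEELING_FOR_NITER).  */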
2092 opt_result
2093 vect_determine_partial_vectors_and_peeling (loop_vec_info loop_vinfo,
2094 bool for_epilogue_p)
2096 /* Determine whether there would be any scalar iterations left over. */
2097 bool need_peeling_or_partial_vectors_p
2098 = vect_need_peeling_or_partial_vectors_p (loop_vinfo);
2100 /* Decide whether to vectorize the loop with partial vectors. */
2101 LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = false;
2102 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P (loop_vinfo) = false;
2103 if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2104 && need_peeling_or_partial_vectors_p)
2106 /* For partial-vector-usage=1, try to push the handling of partial
2107 vectors to the epilogue, with the main loop continuing to operate
2108 on full vectors.
2110 ??? We could then end up failing to use partial vectors if we
2111 decide to peel iterations into a prologue, and if the main loop
2112 then ends up processing fewer than VF iterations. */
2113 if (param_vect_partial_vector_usage == 1
2114 && !LOOP_VINFO_EPILOGUE_P (loop_vinfo)
2115 && !vect_known_niters_smaller_than_vf (loop_vinfo))
2116 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P (loop_vinfo) = true;
2117 else
2118 LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = true;
2121 if (dump_enabled_p ())
2123 if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
2124 dump_printf_loc (MSG_NOTE, vect_location,
2125 "operating on partial vectors%s.\n",
2126 for_epilogue_p ? " for epilogue loop" : "");
2127 else
2128 dump_printf_loc (MSG_NOTE, vect_location,
2129 "operating only on full vectors%s.\n",
2130 for_epilogue_p ? " for epilogue loop" : "");
2133 if (for_epilogue_p)
2135 loop_vec_info orig_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
2136 gcc_assert (orig_loop_vinfo);
2137 if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
2138 gcc_assert (known_lt (LOOP_VINFO_VECT_FACTOR (loop_vinfo),
2139 LOOP_VINFO_VECT_FACTOR (orig_loop_vinfo)));
2142 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2143 && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
2145 /* Check that the loop processes at least one full vector. */
2146 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2147 tree scalar_niters = LOOP_VINFO_NITERS (loop_vinfo);
2148 if (known_lt (wi::to_widest (scalar_niters), vf))
2149 return opt_result::failure_at (vect_location,
2150 "loop does not have enough iterations"
2151 " to support vectorization.\n");
2153 /* If we need to peel an extra epilogue iteration to handle data
2154 accesses with gaps, check that there are enough scalar iterations
2155 available.
2157 The check above is redundant with this one when peeling for gaps,
2158 but the distinction is useful for diagnostics. */
2159 tree scalar_nitersm1 = LOOP_VINFO_NITERSM1 (loop_vinfo);
2160 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2161 && known_lt (wi::to_widest (scalar_nitersm1), vf))
2162 return opt_result::failure_at (vect_location,
2163 "loop does not have enough iterations"
2164 " to support peeling for gaps.\n");
2167 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
2168 = (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
2169 && need_peeling_or_partial_vectors_p);
2171 return opt_result::success ();
2174 /* Function vect_analyze_loop_2.
2176 Apply a set of analyses on LOOP, and create a loop_vec_info struct
2177 for it. The different analyses will record information in the
2178 loop_vec_info struct. */
2179 static opt_result
2180 vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal, unsigned *n_stmts)
2182 opt_result ok = opt_result::success ();
2183 int res;
2184 unsigned int max_vf = MAX_VECTORIZATION_FACTOR;
2185 poly_uint64 min_vf = 2;
2186 loop_vec_info orig_loop_vinfo = NULL;
2188 /* If we are dealing with an epilogue then orig_loop_vinfo points to the
2189 loop_vec_info of the first vectorized loop. */
2190 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2191 orig_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
2192 else
2193 orig_loop_vinfo = loop_vinfo;
2194 gcc_assert (orig_loop_vinfo);
2196 /* The first group of checks is independent of the vector size. */
2197 fatal = true;
2199 if (LOOP_VINFO_SIMD_IF_COND (loop_vinfo)
2200 && integer_zerop (LOOP_VINFO_SIMD_IF_COND (loop_vinfo)))
2201 return opt_result::failure_at (vect_location,
2202 "not vectorized: simd if(0)\n");
2204 /* Find all data references in the loop (which correspond to vdefs/vuses)
2205 and analyze their evolution in the loop. */
2207 loop_p loop = LOOP_VINFO_LOOP (loop_vinfo);
2209 /* Gather the data references and count stmts in the loop. */
2210 if (!LOOP_VINFO_DATAREFS (loop_vinfo).exists ())
2212 opt_result res
2213 = vect_get_datarefs_in_loop (loop, LOOP_VINFO_BBS (loop_vinfo),
2214 &LOOP_VINFO_DATAREFS (loop_vinfo),
2215 n_stmts);
2216 if (!res)
2218 if (dump_enabled_p ())
2219 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2220 "not vectorized: loop contains function "
2221 "calls or data references that cannot "
2222 "be analyzed\n");
2223 return res;
2225 loop_vinfo->shared->save_datarefs ();
2227 else
2228 loop_vinfo->shared->check_datarefs ();
2230 /* Analyze the data references and also adjust the minimal
2231 vectorization factor according to the loads and stores. */
2233 ok = vect_analyze_data_refs (loop_vinfo, &min_vf, &fatal);
2234 if (!ok)
2236 if (dump_enabled_p ())
2237 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2238 "bad data references.\n");
2239 return ok;
2242 /* Classify all cross-iteration scalar data-flow cycles.
2243 Cross-iteration cycles caused by virtual phis are analyzed separately. */
2244 vect_analyze_scalar_cycles (loop_vinfo);
2246 vect_pattern_recog (loop_vinfo);
2248 vect_fixup_scalar_cycles_with_patterns (loop_vinfo);
2250 /* Analyze the access patterns of the data-refs in the loop (consecutive,
2251 complex, etc.). FORNOW: Only handle consecutive access pattern. */
2253 ok = vect_analyze_data_ref_accesses (loop_vinfo, NULL);
2254 if (!ok)
2256 if (dump_enabled_p ())
2257 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2258 "bad data access.\n");
2259 return ok;
2262 /* Data-flow analysis to detect stmts that do not need to be vectorized. */
2264 ok = vect_mark_stmts_to_be_vectorized (loop_vinfo, &fatal);
2265 if (!ok)
2267 if (dump_enabled_p ())
2268 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2269 "unexpected pattern.\n");
2270 return ok;
2273 /* The rest of the analysis below depends on the vector size in some way, so failures are no longer fatal. */
2274 fatal = false;
2276 /* Analyze data dependences between the data-refs in the loop
2277 and adjust the maximum vectorization factor according to
2278 the dependences.
2279 FORNOW: fail at the first data dependence that we encounter. */
2281 ok = vect_analyze_data_ref_dependences (loop_vinfo, &max_vf);
2282 if (!ok)
2284 if (dump_enabled_p ())
2285 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2286 "bad data dependence.\n");
2287 return ok;
2289 if (max_vf != MAX_VECTORIZATION_FACTOR
2290 && maybe_lt (max_vf, min_vf))
2291 return opt_result::failure_at (vect_location, "bad data dependence.\n");
2292 LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo) = max_vf;
2294 ok = vect_determine_vectorization_factor (loop_vinfo);
2295 if (!ok)
2297 if (dump_enabled_p ())
2298 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2299 "can't determine vectorization factor.\n");
2300 return ok;
2302 if (max_vf != MAX_VECTORIZATION_FACTOR
2303 && maybe_lt (max_vf, LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
2304 return opt_result::failure_at (vect_location, "bad data dependence.\n");
2306 /* Compute the scalar iteration cost. */
2307 vect_compute_single_scalar_iteration_cost (loop_vinfo);
2309 poly_uint64 saved_vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2311 /* Check the SLP opportunities in the loop, analyze and build SLP trees. */
2312 ok = vect_analyze_slp (loop_vinfo, *n_stmts);
2313 if (!ok)
2314 return ok;
2316 /* If there are any SLP instances mark them as pure_slp. */
2317 bool slp = vect_make_slp_decision (loop_vinfo);
2318 if (slp)
2320 /* Find stmts that need to be both vectorized and SLPed. */
2321 vect_detect_hybrid_slp (loop_vinfo);
2323 /* Update the vectorization factor based on the SLP decision. */
2324 vect_update_vf_for_slp (loop_vinfo);
2326 /* Optimize the SLP graph with the vectorization factor fixed. */
2327 vect_optimize_slp (loop_vinfo);
2329 /* Gather the loads reachable from the SLP graph entries. */
2330 vect_gather_slp_loads (loop_vinfo);
2333 bool saved_can_use_partial_vectors_p
2334 = LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo);
2336 /* We don't expect to have to roll back to anything other than an empty
2337 set of rgroups. */
2338 gcc_assert (LOOP_VINFO_MASKS (loop_vinfo).is_empty ());
2340 /* This is the point where we can re-start analysis with SLP forced off. */
2341 start_over:
2343 /* Now the vectorization factor is final. */
2344 poly_uint64 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2345 gcc_assert (known_ne (vectorization_factor, 0U));
2347 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && dump_enabled_p ())
2349 dump_printf_loc (MSG_NOTE, vect_location,
2350 "vectorization_factor = ");
2351 dump_dec (MSG_NOTE, vectorization_factor);
2352 dump_printf (MSG_NOTE, ", niters = %wd\n",
2353 LOOP_VINFO_INT_NITERS (loop_vinfo));
2356 /* Analyze the alignment of the data-refs in the loop.
2357 Fail if a data reference is found that cannot be vectorized. */
2359 ok = vect_analyze_data_refs_alignment (loop_vinfo);
2360 if (!ok)
2362 if (dump_enabled_p ())
2363 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2364 "bad data alignment.\n");
2365 return ok;
2368 /* Prune the list of ddrs to be tested at run-time by versioning for alias.
2369 It is important to call pruning after vect_analyze_data_ref_accesses,
2370 since we use grouping information gathered by interleaving analysis. */
2371 ok = vect_prune_runtime_alias_test_list (loop_vinfo);
2372 if (!ok)
2373 return ok;
2375 /* Do not invoke vect_enhance_data_refs_alignment for epilogue
2376 vectorization, since we do not want to add extra peeling or
2377 add versioning for alignment. */
2378 if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2379 /* This pass will decide on using loop versioning and/or loop peeling in
2380 order to enhance the alignment of data references in the loop. */
2381 ok = vect_enhance_data_refs_alignment (loop_vinfo);
2382 if (!ok)
2383 return ok;
2385 if (slp)
2387 /* Analyze operations in the SLP instances. Note this may
2388 remove unsupported SLP instances which makes the above
2389 SLP kind detection invalid. */
2390 unsigned old_size = LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length ();
2391 vect_slp_analyze_operations (loop_vinfo);
2392 if (LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length () != old_size)
2394 ok = opt_result::failure_at (vect_location,
2395 "unsupported SLP instances\n");
2396 goto again;
2399 /* Check whether any load in ALL SLP instances is possibly permuted. */
2400 slp_tree load_node, slp_root;
2401 unsigned i, x;
2402 slp_instance instance;
2403 bool can_use_lanes = true;
2404 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), x, instance)
2406 slp_root = SLP_INSTANCE_TREE (instance);
2407 int group_size = SLP_TREE_LANES (slp_root);
2408 tree vectype = SLP_TREE_VECTYPE (slp_root);
2409 bool loads_permuted = false;
2410 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load_node)
2412 if (!SLP_TREE_LOAD_PERMUTATION (load_node).exists ())
2413 continue;
2414 unsigned j;
2415 stmt_vec_info load_info;
2416 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (load_node), j, load_info)
2417 if (SLP_TREE_LOAD_PERMUTATION (load_node)[j] != j)
2419 loads_permuted = true;
2420 break;
2424 /* If the loads and stores can be handled with load/store-lane
2425 instructions, record it and move on to the next instance. */
2426 if (loads_permuted
2427 && SLP_INSTANCE_KIND (instance) == slp_inst_kind_store
2428 && vect_store_lanes_supported (vectype, group_size, false))
2430 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load_node)
2432 stmt_vec_info stmt_vinfo = DR_GROUP_FIRST_ELEMENT
2433 (SLP_TREE_SCALAR_STMTS (load_node)[0]);
2434 /* Use SLP for strided accesses (or if we can't
2435 load-lanes). */
2436 if (STMT_VINFO_STRIDED_P (stmt_vinfo)
2437 || ! vect_load_lanes_supported
2438 (STMT_VINFO_VECTYPE (stmt_vinfo),
2439 DR_GROUP_SIZE (stmt_vinfo), false))
2440 break;
2443 can_use_lanes
2444 = can_use_lanes && i == SLP_INSTANCE_LOADS (instance).length ();
2446 if (can_use_lanes && dump_enabled_p ())
2447 dump_printf_loc (MSG_NOTE, vect_location,
2448 "SLP instance %p can use load/store-lanes\n",
2449 instance);
2451 else
2453 can_use_lanes = false;
2454 break;
2458 /* If all SLP instances can use load/store-lanes abort SLP and try again
2459 with SLP disabled. */
2460 if (can_use_lanes)
2462 ok = opt_result::failure_at (vect_location,
2463 "Built SLP cancelled: can use "
2464 "load/store-lanes\n");
2465 if (dump_enabled_p ())
2466 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2467 "Built SLP cancelled: all SLP instances support "
2468 "load/store-lanes\n");
2469 goto again;
2473 /* Dissolve SLP-only groups. */
2474 vect_dissolve_slp_only_groups (loop_vinfo);
2476 /* Scan all the remaining operations in the loop that are not subject
2477 to SLP and make sure they are vectorizable. */
2478 ok = vect_analyze_loop_operations (loop_vinfo);
2479 if (!ok)
2481 if (dump_enabled_p ())
2482 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2483 "bad operation or unsupported loop bound.\n");
2484 return ok;
2487 /* For now we don't expect to mix both the masking and the length approaches
2488 for one loop, so disable the use of partial vectors if both are recorded. */
2489 if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2490 && !LOOP_VINFO_MASKS (loop_vinfo).is_empty ()
2491 && !LOOP_VINFO_LENS (loop_vinfo).is_empty ())
2493 if (dump_enabled_p ())
2494 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2495 "can't vectorize a loop with partial vectors"
2496 " because we don't expect to mix different"
2497 " approaches with partial vectors for the"
2498 " same loop.\n");
2499 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
2502 /* If we still have the option of using partial vectors,
2503 check whether we can generate the necessary loop controls. */
2504 if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2505 && !vect_verify_full_masking (loop_vinfo)
2506 && !vect_verify_loop_lens (loop_vinfo))
2507 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
2509 /* If we're vectorizing an epilogue loop, the vectorized loop either needs
2510 to be able to handle fewer than VF scalars, or needs to have a lower VF
2511 than the main loop. */
2512 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo)
2513 && !LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2514 && maybe_ge (LOOP_VINFO_VECT_FACTOR (loop_vinfo),
2515 LOOP_VINFO_VECT_FACTOR (orig_loop_vinfo)))
2516 return opt_result::failure_at (vect_location,
2517 "Vectorization factor too high for"
2518 " epilogue loop.\n");
2520 /* Decide whether this loop_vinfo should use partial vectors or peeling,
2521 assuming that the loop will be used as a main loop. We will redo
2522 this analysis later if we instead decide to use the loop as an
2523 epilogue loop. */
2524 ok = vect_determine_partial_vectors_and_peeling (loop_vinfo, false);
2525 if (!ok)
2526 return ok;
2528 /* Check the costings of the loop make vectorizing worthwhile. */
2529 res = vect_analyze_loop_costing (loop_vinfo);
2530 if (res < 0)
2532 ok = opt_result::failure_at (vect_location,
2533 "Loop costings may not be worthwhile.\n");
2534 goto again;
2536 if (!res)
2537 return opt_result::failure_at (vect_location,
2538 "Loop costings not worthwhile.\n");
2540 /* If an epilogue loop is required make sure we can create one. */
2541 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2542 || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo))
2544 if (dump_enabled_p ())
2545 dump_printf_loc (MSG_NOTE, vect_location, "epilog loop required\n");
2546 if (!vect_can_advance_ivs_p (loop_vinfo)
2547 || !slpeel_can_duplicate_loop_p (LOOP_VINFO_LOOP (loop_vinfo),
2548 single_exit (LOOP_VINFO_LOOP
2549 (loop_vinfo))))
2551 ok = opt_result::failure_at (vect_location,
2552 "not vectorized: can't create required "
2553 "epilog loop\n");
2554 goto again;
2558 /* During peeling, we need to check whether the number of loop iterations
2559 is enough for both the peeled prolog loop and the vector loop. This
2560 check can be merged with the threshold check of loop versioning, so
2561 increase the threshold for this case if necessary.
2563 If we are analyzing an epilogue we still want to check what its
2564 versioning threshold would be. If we decide to vectorize the epilogues we
2565 will want to use the lowest versioning threshold of all epilogues and main
2566 loop. This will enable us to enter a vectorized epilogue even when
2567 versioning the loop. We can't simply check whether the epilogue requires
2568 versioning though since we may have skipped some versioning checks when
2569 analyzing the epilogue. For instance, checks for alias versioning will be
2570 skipped when dealing with epilogues as we assume we already checked them
2571 for the main loop. So instead we always check the 'orig_loop_vinfo'. */
2572 if (LOOP_REQUIRES_VERSIONING (orig_loop_vinfo))
2574 poly_uint64 niters_th = 0;
2575 unsigned int th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
2577 if (!vect_use_loop_mask_for_alignment_p (loop_vinfo))
2579 /* Niters for peeled prolog loop. */
2580 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
2582 dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
2583 tree vectype = STMT_VINFO_VECTYPE (dr_info->stmt);
2584 niters_th += TYPE_VECTOR_SUBPARTS (vectype) - 1;
2586 else
2587 niters_th += LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
2590 /* Niters for at least one iteration of vectorized loop. */
2591 if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
2592 niters_th += LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2593 /* One additional iteration because of peeling for gap. */
2594 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
2595 niters_th += 1;
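      /* Illustrative numbers only: peeling 2 prologue iterations for
	 alignment, a vectorization factor of 4 and peeling for gaps give
	 niters_th == 2 + 4 + 1 == 7 before the cost-model adjustment
	 below.  */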
2597 /* Use the same condition as vect_transform_loop to decide when to use
2598 the cost to determine a versioning threshold. */
2599 if (vect_apply_runtime_profitability_check_p (loop_vinfo)
2600 && ordered_p (th, niters_th))
2601 niters_th = ordered_max (poly_uint64 (th), niters_th);
2603 LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = niters_th;
2606 gcc_assert (known_eq (vectorization_factor,
2607 LOOP_VINFO_VECT_FACTOR (loop_vinfo)));
2609 /* Ok to vectorize! */
2610 return opt_result::success ();
2612 again:
2613 /* Ensure that "ok" is false (with an opt_problem if dumping is enabled). */
2614 gcc_assert (!ok);
2616 /* Try again with SLP forced off but if we didn't do any SLP there is
2617 no point in re-trying. */
2618 if (!slp)
2619 return ok;
2621 /* If there are reduction chains re-trying will fail anyway. */
2622 if (! LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).is_empty ())
2623 return ok;
2625 /* Likewise if the grouped loads or stores in the SLP cannot be handled
2626 via interleaving or lane instructions. */
2627 slp_instance instance;
2628 slp_tree node;
2629 unsigned i, j;
2630 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
2632 stmt_vec_info vinfo;
2633 vinfo = SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (instance))[0];
2634 if (! STMT_VINFO_GROUPED_ACCESS (vinfo))
2635 continue;
2636 vinfo = DR_GROUP_FIRST_ELEMENT (vinfo);
2637 unsigned int size = DR_GROUP_SIZE (vinfo);
2638 tree vectype = STMT_VINFO_VECTYPE (vinfo);
2639 if (! vect_store_lanes_supported (vectype, size, false)
2640 && ! known_eq (TYPE_VECTOR_SUBPARTS (vectype), 1U)
2641 && ! vect_grouped_store_supported (vectype, size))
2642 return opt_result::failure_at (vinfo->stmt,
2643 "unsupported grouped store\n");
2644 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), j, node)
2646 vinfo = SLP_TREE_SCALAR_STMTS (node)[0];
2647 vinfo = DR_GROUP_FIRST_ELEMENT (vinfo);
2648 bool single_element_p = !DR_GROUP_NEXT_ELEMENT (vinfo);
2649 size = DR_GROUP_SIZE (vinfo);
2650 vectype = STMT_VINFO_VECTYPE (vinfo);
2651 if (! vect_load_lanes_supported (vectype, size, false)
2652 && ! vect_grouped_load_supported (vectype, single_element_p,
2653 size))
2654 return opt_result::failure_at (vinfo->stmt,
2655 "unsupported grouped load\n");
2659 if (dump_enabled_p ())
2660 dump_printf_loc (MSG_NOTE, vect_location,
2661 "re-trying with SLP disabled\n");
2663 /* Roll back state appropriately. No SLP this time. */
2664 slp = false;
2665 /* Restore the vectorization factor to what it was without SLP. */
2666 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = saved_vectorization_factor;
2667 /* Free the SLP instances. */
2668 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), j, instance)
2669 vect_free_slp_instance (instance);
2670 LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
2671 /* Reset SLP type to loop_vect on all stmts. */
2672 for (i = 0; i < LOOP_VINFO_LOOP (loop_vinfo)->num_nodes; ++i)
2674 basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
2675 for (gimple_stmt_iterator si = gsi_start_phis (bb);
2676 !gsi_end_p (si); gsi_next (&si))
2678 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
2679 STMT_SLP_TYPE (stmt_info) = loop_vect;
2680 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
2681 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
2683 /* vectorizable_reduction adjusts reduction stmt def-types,
2684 restore them to that of the PHI. */
2685 STMT_VINFO_DEF_TYPE (STMT_VINFO_REDUC_DEF (stmt_info))
2686 = STMT_VINFO_DEF_TYPE (stmt_info);
2687 STMT_VINFO_DEF_TYPE (vect_stmt_to_vectorize
2688 (STMT_VINFO_REDUC_DEF (stmt_info)))
2689 = STMT_VINFO_DEF_TYPE (stmt_info);
2692 for (gimple_stmt_iterator si = gsi_start_bb (bb);
2693 !gsi_end_p (si); gsi_next (&si))
2695 if (is_gimple_debug (gsi_stmt (si)))
2696 continue;
2697 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
2698 STMT_SLP_TYPE (stmt_info) = loop_vect;
2699 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
2701 gimple *pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
2702 stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
2703 STMT_SLP_TYPE (stmt_info) = loop_vect;
2704 for (gimple_stmt_iterator pi = gsi_start (pattern_def_seq);
2705 !gsi_end_p (pi); gsi_next (&pi))
2706 STMT_SLP_TYPE (loop_vinfo->lookup_stmt (gsi_stmt (pi)))
2707 = loop_vect;
2711 /* Free optimized alias test DDRS. */
2712 LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).truncate (0);
2713 LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).release ();
2714 LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).release ();
2715 /* Reset target cost data. */
2716 destroy_cost_data (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo));
2717 LOOP_VINFO_TARGET_COST_DATA (loop_vinfo)
2718 = init_cost (LOOP_VINFO_LOOP (loop_vinfo));
2719 /* Reset accumulated rgroup information. */
2720 release_vec_loop_controls (&LOOP_VINFO_MASKS (loop_vinfo));
2721 release_vec_loop_controls (&LOOP_VINFO_LENS (loop_vinfo));
2722 /* Reset assorted flags. */
2723 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
2724 LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = false;
2725 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = 0;
2726 LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = 0;
2727 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2728 = saved_can_use_partial_vectors_p;
2730 goto start_over;
2733 /* Return true if vectorizing a loop using NEW_LOOP_VINFO appears
2734 to be better than vectorizing it using OLD_LOOP_VINFO. Assume that
2735 OLD_LOOP_VINFO is better unless something specifically indicates
2736 otherwise.
2738 Note that this deliberately isn't a partial order. */
2740 static bool
2741 vect_better_loop_vinfo_p (loop_vec_info new_loop_vinfo,
2742 loop_vec_info old_loop_vinfo)
2744 struct loop *loop = LOOP_VINFO_LOOP (new_loop_vinfo);
2745 gcc_assert (LOOP_VINFO_LOOP (old_loop_vinfo) == loop);
2747 poly_int64 new_vf = LOOP_VINFO_VECT_FACTOR (new_loop_vinfo);
2748 poly_int64 old_vf = LOOP_VINFO_VECT_FACTOR (old_loop_vinfo);
2750 /* Always prefer a VF of loop->simdlen over any other VF. */
2751 if (loop->simdlen)
2753 bool new_simdlen_p = known_eq (new_vf, loop->simdlen);
2754 bool old_simdlen_p = known_eq (old_vf, loop->simdlen);
2755 if (new_simdlen_p != old_simdlen_p)
2756 return new_simdlen_p;
2759 /* Limit the VFs to what is likely to be the maximum number of iterations,
2760 to handle cases in which at least one loop_vinfo is fully-masked. */
2761 HOST_WIDE_INT estimated_max_niter = likely_max_stmt_executions_int (loop);
2762 if (estimated_max_niter != -1)
2764 if (known_le (estimated_max_niter, new_vf))
2765 new_vf = estimated_max_niter;
2766 if (known_le (estimated_max_niter, old_vf))
2767 old_vf = estimated_max_niter;
2770 /* Check whether the (fractional) cost per scalar iteration is lower
2771 or higher: new_inside_cost / new_vf vs. old_inside_cost / old_vf. */
2772 poly_widest_int rel_new = (new_loop_vinfo->vec_inside_cost
2773 * poly_widest_int (old_vf));
2774 poly_widest_int rel_old = (old_loop_vinfo->vec_inside_cost
2775 * poly_widest_int (new_vf));
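  /* For example (illustrative costs): new_inside_cost == 20 at new_vf == 8
     versus old_inside_cost == 12 at old_vf == 4 compares 20 * 4 == 80
     against 12 * 8 == 96, i.e. 2.5 versus 3 per scalar iteration, so the
     new loop_vinfo wins the comparison below.  */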
2776 if (maybe_lt (rel_old, rel_new))
2778 /* When old_loop_vinfo uses a variable vectorization factor,
2779 we know that it has a lower cost for at least one runtime VF.
2780 However, we don't know how likely that VF is.
2782 One option would be to compare the costs for the estimated VFs.
2783 The problem is that that can put too much pressure on the cost
2784 model. E.g. if the estimated VF is also the lowest possible VF,
2785 and if old_loop_vinfo is 1 unit worse than new_loop_vinfo
2786 for the estimated VF, we'd then choose new_loop_vinfo even
2787 though (a) new_loop_vinfo might not actually be better than
2788 old_loop_vinfo for that VF and (b) it would be significantly
2789 worse at larger VFs.
2791 Here we go for a hacky compromise: pick new_loop_vinfo if it is
2792 no more expensive than old_loop_vinfo even after doubling the
2793 estimated old_loop_vinfo VF. For all but trivial loops, this
2794 ensures that we only pick new_loop_vinfo if it is significantly
2795 better than old_loop_vinfo at the estimated VF. */
2796 if (rel_new.is_constant ())
2797 return false;
2799 HOST_WIDE_INT new_estimated_vf = estimated_poly_value (new_vf);
2800 HOST_WIDE_INT old_estimated_vf = estimated_poly_value (old_vf);
2801 widest_int estimated_rel_new = (new_loop_vinfo->vec_inside_cost
2802 * widest_int (old_estimated_vf));
2803 widest_int estimated_rel_old = (old_loop_vinfo->vec_inside_cost
2804 * widest_int (new_estimated_vf));
2805 return estimated_rel_new * 2 <= estimated_rel_old;
2807 if (known_lt (rel_new, rel_old))
2808 return true;
2810 /* If there's nothing to choose between the loop bodies, see whether
2811 there's a difference in the prologue and epilogue costs. */
2812 if (new_loop_vinfo->vec_outside_cost != old_loop_vinfo->vec_outside_cost)
2813 return new_loop_vinfo->vec_outside_cost < old_loop_vinfo->vec_outside_cost;
2815 return false;
2818 /* Decide whether to replace OLD_LOOP_VINFO with NEW_LOOP_VINFO. Return
2819 true if we should. */
2821 static bool
2822 vect_joust_loop_vinfos (loop_vec_info new_loop_vinfo,
2823 loop_vec_info old_loop_vinfo)
2825 if (!vect_better_loop_vinfo_p (new_loop_vinfo, old_loop_vinfo))
2826 return false;
2828 if (dump_enabled_p ())
2829 dump_printf_loc (MSG_NOTE, vect_location,
2830 "***** Preferring vector mode %s to vector mode %s\n",
2831 GET_MODE_NAME (new_loop_vinfo->vector_mode),
2832 GET_MODE_NAME (old_loop_vinfo->vector_mode));
2833 return true;
2836 /* Function vect_analyze_loop.
2838 Apply a set of analyses on LOOP, and create a loop_vec_info struct
2839 for it. The different analyses will record information in the
2840 loop_vec_info struct. */
2841 opt_loop_vec_info
2842 vect_analyze_loop (class loop *loop, vec_info_shared *shared)
2844 auto_vector_modes vector_modes;
2846 /* Autodetect first vector size we try. */
2847 unsigned int autovec_flags
2848 = targetm.vectorize.autovectorize_vector_modes (&vector_modes,
2849 loop->simdlen != 0);
2850 unsigned int mode_i = 0;
2852 DUMP_VECT_SCOPE ("analyze_loop_nest");
2854 if (loop_outer (loop)
2855 && loop_vec_info_for_loop (loop_outer (loop))
2856 && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop))))
2857 return opt_loop_vec_info::failure_at (vect_location,
2858 "outer-loop already vectorized.\n");
2860 if (!find_loop_nest (loop, &shared->loop_nest))
2861 return opt_loop_vec_info::failure_at
2862 (vect_location,
2863 "not vectorized: loop nest containing two or more consecutive inner"
2864 " loops cannot be vectorized\n");
2866 unsigned n_stmts = 0;
2867 machine_mode autodetected_vector_mode = VOIDmode;
2868 opt_loop_vec_info first_loop_vinfo = opt_loop_vec_info::success (NULL);
2869 machine_mode next_vector_mode = VOIDmode;
2870 poly_uint64 lowest_th = 0;
2871 unsigned vectorized_loops = 0;
2872 bool pick_lowest_cost_p = ((autovec_flags & VECT_COMPARE_COSTS)
2873 && !unlimited_cost_model (loop));
2875 bool vect_epilogues = false;
2876 opt_result res = opt_result::success ();
2877 unsigned HOST_WIDE_INT simdlen = loop->simdlen;
2878 while (1)
2880 /* Check the CFG characteristics of the loop (nesting, entry/exit). */
2881 opt_loop_vec_info loop_vinfo = vect_analyze_loop_form (loop, shared);
2882 if (!loop_vinfo)
2884 if (dump_enabled_p ())
2885 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2886 "bad loop form.\n");
2887 gcc_checking_assert (first_loop_vinfo == NULL);
2888 return loop_vinfo;
2890 loop_vinfo->vector_mode = next_vector_mode;
2892 bool fatal = false;
2894 /* When pick_lowest_cost_p is true, we should in principle iterate
2895 over all the loop_vec_infos that LOOP_VINFO could replace and
2896 try to vectorize LOOP_VINFO under the same conditions.
2897 E.g. when trying to replace an epilogue loop, we should vectorize
2898 LOOP_VINFO as an epilogue loop with the same VF limit. When trying
2899 to replace the main loop, we should vectorize LOOP_VINFO as a main
2900 loop too.
2902 However, autovectorize_vector_modes is usually sorted as follows:
2904 - Modes that naturally produce lower VFs usually follow modes that
2905 naturally produce higher VFs.
2907 - When modes naturally produce the same VF, maskable modes
2908 usually follow unmaskable ones, so that the maskable mode
2909 can be used to vectorize the epilogue of the unmaskable mode.
2911 This order is preferred because it leads to the maximum
2912 epilogue vectorization opportunities. Targets should only use
2913 a different order if they want to make wide modes available while
2914 disparaging them relative to earlier, smaller modes. The assumption
2915 in that case is that the wider modes are more expensive in some
2916 way that isn't reflected directly in the costs.
2918 There should therefore be few interesting cases in which
2919 LOOP_VINFO fails when treated as an epilogue loop, succeeds when
2920 treated as a standalone loop, and ends up being genuinely cheaper
2921 than FIRST_LOOP_VINFO. */
2922 if (vect_epilogues)
2923 LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = first_loop_vinfo;
2925 res = vect_analyze_loop_2 (loop_vinfo, fatal, &n_stmts);
2926 if (mode_i == 0)
2927 autodetected_vector_mode = loop_vinfo->vector_mode;
2928 if (dump_enabled_p ())
2930 if (res)
2931 dump_printf_loc (MSG_NOTE, vect_location,
2932 "***** Analysis succeeded with vector mode %s\n",
2933 GET_MODE_NAME (loop_vinfo->vector_mode));
2934 else
2935 dump_printf_loc (MSG_NOTE, vect_location,
2936 "***** Analysis failed with vector mode %s\n",
2937 GET_MODE_NAME (loop_vinfo->vector_mode));
2940 loop->aux = NULL;
2942 if (!fatal)
2943 while (mode_i < vector_modes.length ()
2944 && vect_chooses_same_modes_p (loop_vinfo, vector_modes[mode_i]))
2946 if (dump_enabled_p ())
2947 dump_printf_loc (MSG_NOTE, vect_location,
2948 "***** The result for vector mode %s would"
2949 " be the same\n",
2950 GET_MODE_NAME (vector_modes[mode_i]));
2951 mode_i += 1;
2954 if (res)
2956 LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1;
2957 vectorized_loops++;
2959 /* Once we hit the desired simdlen for the first time,
2960 discard any previous attempts. */
2961 if (simdlen
2962 && known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), simdlen))
2964 delete first_loop_vinfo;
2965 first_loop_vinfo = opt_loop_vec_info::success (NULL);
2966 LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = NULL;
2967 simdlen = 0;
2969 else if (pick_lowest_cost_p && first_loop_vinfo)
2971 /* Keep trying to roll back vectorization attempts while the
2972 loop_vec_infos they produced were worse than this one. */
2973 vec<loop_vec_info> &vinfos = first_loop_vinfo->epilogue_vinfos;
2974 while (!vinfos.is_empty ()
2975 && vect_joust_loop_vinfos (loop_vinfo, vinfos.last ()))
2977 gcc_assert (vect_epilogues);
2978 delete vinfos.pop ();
2980 if (vinfos.is_empty ()
2981 && vect_joust_loop_vinfos (loop_vinfo, first_loop_vinfo))
2983 delete first_loop_vinfo;
2984 first_loop_vinfo = opt_loop_vec_info::success (NULL);
2985 LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = NULL;
2989 if (first_loop_vinfo == NULL)
2991 first_loop_vinfo = loop_vinfo;
2992 lowest_th = LOOP_VINFO_VERSIONING_THRESHOLD (first_loop_vinfo);
2994 else if (vect_epilogues
2995 /* For now only allow one epilogue loop. */
2996 && first_loop_vinfo->epilogue_vinfos.is_empty ())
2998 first_loop_vinfo->epilogue_vinfos.safe_push (loop_vinfo);
2999 poly_uint64 th = LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo);
3000 gcc_assert (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
3001 || maybe_ne (lowest_th, 0U));
3002 /* Keep track of the known smallest versioning
3003 threshold. */
3004 if (ordered_p (lowest_th, th))
3005 lowest_th = ordered_min (lowest_th, th);
3007 else
3009 delete loop_vinfo;
3010 loop_vinfo = opt_loop_vec_info::success (NULL);
3013 /* Only vectorize epilogues if PARAM_VECT_EPILOGUES_NOMASK is
3014 enabled, SIMDUID is not set, it is the innermost loop and we have
3015 either already found the loop's SIMDLEN or there was no SIMDLEN to
3016 begin with.
3017 TODO: Enable epilogue vectorization for loops with SIMDUID set. */
3018 vect_epilogues = (!simdlen
3019 && loop->inner == NULL
3020 && param_vect_epilogues_nomask
3021 && LOOP_VINFO_PEELING_FOR_NITER (first_loop_vinfo)
3022 && !loop->simduid
3023 /* For now only allow one epilogue loop, but allow
3024 pick_lowest_cost_p to replace it. */
3025 && (first_loop_vinfo->epilogue_vinfos.is_empty ()
3026 || pick_lowest_cost_p));
3028 /* Commit to first_loop_vinfo if we have no reason to try
3029 alternatives. */
3030 if (!simdlen && !vect_epilogues && !pick_lowest_cost_p)
3031 break;
3033 else
3035 delete loop_vinfo;
3036 loop_vinfo = opt_loop_vec_info::success (NULL);
3037 if (fatal)
3039 gcc_checking_assert (first_loop_vinfo == NULL);
3040 break;
3044 /* Handle the case in which the original loop can use partial
3045 vectorization, but we only want to adopt it for the epilogue.
3046 The retry should use the same vector mode as the original. */
3047 if (vect_epilogues
3048 && loop_vinfo
3049 && LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P (loop_vinfo))
3051 gcc_assert (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
3052 && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo));
3053 if (dump_enabled_p ())
3054 dump_printf_loc (MSG_NOTE, vect_location,
3055 "***** Re-trying analysis with same vector mode"
3056 " %s for epilogue with partial vectors.\n",
3057 GET_MODE_NAME (loop_vinfo->vector_mode));
3058 continue;
3061 if (mode_i < vector_modes.length ()
3062 && VECTOR_MODE_P (autodetected_vector_mode)
3063 && (related_vector_mode (vector_modes[mode_i],
3064 GET_MODE_INNER (autodetected_vector_mode))
3065 == autodetected_vector_mode)
3066 && (related_vector_mode (autodetected_vector_mode,
3067 GET_MODE_INNER (vector_modes[mode_i]))
3068 == vector_modes[mode_i]))
3070 if (dump_enabled_p ())
3071 dump_printf_loc (MSG_NOTE, vect_location,
3072 "***** Skipping vector mode %s, which would"
3073 " repeat the analysis for %s\n",
3074 GET_MODE_NAME (vector_modes[mode_i]),
3075 GET_MODE_NAME (autodetected_vector_mode));
3076 mode_i += 1;
3079 if (mode_i == vector_modes.length ()
3080 || autodetected_vector_mode == VOIDmode)
3081 break;
3083 /* Try the next biggest vector size. */
3084 next_vector_mode = vector_modes[mode_i++];
3085 if (dump_enabled_p ())
3086 dump_printf_loc (MSG_NOTE, vect_location,
3087 "***** Re-trying analysis with vector mode %s\n",
3088 GET_MODE_NAME (next_vector_mode));
3091 if (first_loop_vinfo)
3093 loop->aux = (loop_vec_info) first_loop_vinfo;
3094 if (dump_enabled_p ())
3095 dump_printf_loc (MSG_NOTE, vect_location,
3096 "***** Choosing vector mode %s\n",
3097 GET_MODE_NAME (first_loop_vinfo->vector_mode));
3098 LOOP_VINFO_VERSIONING_THRESHOLD (first_loop_vinfo) = lowest_th;
3099 return first_loop_vinfo;
3102 return opt_loop_vec_info::propagate_failure (res);
3105 /* Return true if there is an in-order reduction function for CODE, storing
3106 it in *REDUC_FN if so. */
3108 static bool
3109 fold_left_reduction_fn (tree_code code, internal_fn *reduc_fn)
3111 switch (code)
3113 case PLUS_EXPR:
3114 *reduc_fn = IFN_FOLD_LEFT_PLUS;
3115 return true;
3117 default:
3118 return false;
3122 /* Function reduction_fn_for_scalar_code
3124 Input:
3125 CODE - tree_code of the reduction operation.
3127 Output:
3128 REDUC_FN - the corresponding internal function to be used to reduce the
3129 vector of partial results into a single scalar result, or IFN_LAST
3130 if the operation is a supported reduction operation, but does not have
3131 such an internal function.
3133 Return FALSE if CODE currently cannot be vectorized as reduction. */
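/* Illustrative usage (hypothetical caller): given

     internal_fn reduc_fn;
     bool ok = reduction_fn_for_scalar_code (MAX_EXPR, &reduc_fn);

   ok is true and reduc_fn is IFN_REDUC_MAX; for MULT_EXPR the call also
   returns true but leaves reduc_fn == IFN_LAST, and every other code makes
   it return false, matching the switch below.  */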
3135 static bool
3136 reduction_fn_for_scalar_code (enum tree_code code, internal_fn *reduc_fn)
3138 switch (code)
3140 case MAX_EXPR:
3141 *reduc_fn = IFN_REDUC_MAX;
3142 return true;
3144 case MIN_EXPR:
3145 *reduc_fn = IFN_REDUC_MIN;
3146 return true;
3148 case PLUS_EXPR:
3149 *reduc_fn = IFN_REDUC_PLUS;
3150 return true;
3152 case BIT_AND_EXPR:
3153 *reduc_fn = IFN_REDUC_AND;
3154 return true;
3156 case BIT_IOR_EXPR:
3157 *reduc_fn = IFN_REDUC_IOR;
3158 return true;
3160 case BIT_XOR_EXPR:
3161 *reduc_fn = IFN_REDUC_XOR;
3162 return true;
3164 case MULT_EXPR:
3165 case MINUS_EXPR:
3166 *reduc_fn = IFN_LAST;
3167 return true;
3169 default:
3170 return false;
3174 /* If there is a neutral value X such that SLP reduction NODE would not
3175 be affected by the introduction of additional X elements, return that X,
3176 otherwise return null. CODE is the code of the reduction and VECTOR_TYPE
3177 is the vector type that would hold element X. REDUC_CHAIN is true if
3178 the SLP statements perform a single reduction, false if each statement
3179 performs an independent reduction. */
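/* For instance (illustrative): padding a PLUS_EXPR or BIT_XOR_EXPR SLP
   reduction with extra zero elements leaves the result unchanged, so the
   neutral value is build_zero_cst; for MULT_EXPR it is build_one_cst and
   for BIT_AND_EXPR build_all_ones_cst, matching the switch below.  */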
3181 static tree
3182 neutral_op_for_slp_reduction (slp_tree slp_node, tree vector_type,
3183 tree_code code, bool reduc_chain)
3185 vec<stmt_vec_info> stmts = SLP_TREE_SCALAR_STMTS (slp_node);
3186 stmt_vec_info stmt_vinfo = stmts[0];
3187 tree scalar_type = TREE_TYPE (vector_type);
3188 class loop *loop = gimple_bb (stmt_vinfo->stmt)->loop_father;
3189 gcc_assert (loop);
3191 switch (code)
3193 case WIDEN_SUM_EXPR:
3194 case DOT_PROD_EXPR:
3195 case SAD_EXPR:
3196 case PLUS_EXPR:
3197 case MINUS_EXPR:
3198 case BIT_IOR_EXPR:
3199 case BIT_XOR_EXPR:
3200 return build_zero_cst (scalar_type);
3202 case MULT_EXPR:
3203 return build_one_cst (scalar_type);
3205 case BIT_AND_EXPR:
3206 return build_all_ones_cst (scalar_type);
3208 case MAX_EXPR:
3209 case MIN_EXPR:
3210 /* For MIN/MAX the initial values are neutral. A reduction chain
3211 has only a single initial value, so that value is neutral for
3212 all statements. */
3213 if (reduc_chain)
3214 return PHI_ARG_DEF_FROM_EDGE (stmt_vinfo->stmt,
3215 loop_preheader_edge (loop));
3216 return NULL_TREE;
3218 default:
3219 return NULL_TREE;
3223 /* Error reporting helper for vect_is_simple_reduction below. GIMPLE statement
3224 STMT is printed with a message MSG. */
3226 static void
3227 report_vect_op (dump_flags_t msg_type, gimple *stmt, const char *msg)
3229 dump_printf_loc (msg_type, vect_location, "%s%G", msg, stmt);
3232 /* Return true if we need an in-order reduction for operation CODE
3233 on type TYPE. */
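/* A hedged illustration: a float accumulation like

     double s = 0;
     for (int i = 0; i < n; i++)
       s += a[i];

   needs an in-order (fold-left) reduction unless -fassociative-math is in
   effect, because reassociating the additions can change the rounded
   result; see fold_left_reduction_fn above for the IFN_FOLD_LEFT_PLUS
   mapping.  */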
3236 bool
3237 needs_fold_left_reduction_p (tree type, tree_code code)
3239 /* CHECKME: check for !flag_finite_math_only too? */
3240 if (SCALAR_FLOAT_TYPE_P (type))
3241 switch (code)
3243 case MIN_EXPR:
3244 case MAX_EXPR:
3245 return false;
3247 default:
3248 return !flag_associative_math;
3251 if (INTEGRAL_TYPE_P (type))
3253 if (!operation_no_trapping_overflow (type, code))
3254 return true;
3255 return false;
3258 if (SAT_FIXED_POINT_TYPE_P (type))
3259 return true;
3261 return false;
3264 /* Return true if the reduction PHI in LOOP with latch arg LOOP_ARG
3265 has a handled computation expression. Store the main reduction
3266 operation in *CODE. */
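/* Rough illustration (the names are made up): for

     x_1 = PHI <x_0(preheader), x_3(latch)>
     x_2 = x_1 + a_5;
     x_3 = x_2 + b_6;

   the walk from the latch argument x_3 back to the PHI result x_1 records
   the path x_3, x_2, x_1; both statements use PLUS_EXPR, so *CODE is set
   to PLUS_EXPR and the function returns true.  */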
3268 static bool
3269 check_reduction_path (dump_user_location_t loc, loop_p loop, gphi *phi,
3270 tree loop_arg, enum tree_code *code,
3271 vec<std::pair<ssa_op_iter, use_operand_p> > &path)
3273 auto_bitmap visited;
3274 tree lookfor = PHI_RESULT (phi);
3275 ssa_op_iter curri;
3276 use_operand_p curr = op_iter_init_phiuse (&curri, phi, SSA_OP_USE);
3277 while (USE_FROM_PTR (curr) != loop_arg)
3278 curr = op_iter_next_use (&curri);
3279 curri.i = curri.numops;
3282 path.safe_push (std::make_pair (curri, curr));
3283 tree use = USE_FROM_PTR (curr);
3284 if (use == lookfor)
3285 break;
3286 gimple *def = SSA_NAME_DEF_STMT (use);
3287 if (gimple_nop_p (def)
3288 || ! flow_bb_inside_loop_p (loop, gimple_bb (def)))
3290 pop:
3293 std::pair<ssa_op_iter, use_operand_p> x = path.pop ();
3294 curri = x.first;
3295 curr = x.second;
3297 curr = op_iter_next_use (&curri);
3298 /* Skip already visited or non-SSA operands (from iterating
3299 over PHI args). */
3300 while (curr != NULL_USE_OPERAND_P
3301 && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
3302 || ! bitmap_set_bit (visited,
3303 SSA_NAME_VERSION
3304 (USE_FROM_PTR (curr)))));
3306 while (curr == NULL_USE_OPERAND_P && ! path.is_empty ());
3307 if (curr == NULL_USE_OPERAND_P)
3308 break;
3310 else
3312 if (gimple_code (def) == GIMPLE_PHI)
3313 curr = op_iter_init_phiuse (&curri, as_a <gphi *>(def), SSA_OP_USE);
3314 else
3315 curr = op_iter_init_use (&curri, def, SSA_OP_USE);
3316 while (curr != NULL_USE_OPERAND_P
3317 && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
3318 || ! bitmap_set_bit (visited,
3319 SSA_NAME_VERSION
3320 (USE_FROM_PTR (curr)))))
3321 curr = op_iter_next_use (&curri);
3322 if (curr == NULL_USE_OPERAND_P)
3323 goto pop;
3326 while (1);
3327 if (dump_file && (dump_flags & TDF_DETAILS))
3329 dump_printf_loc (MSG_NOTE, loc, "reduction path: ");
3330 unsigned i;
3331 std::pair<ssa_op_iter, use_operand_p> *x;
3332 FOR_EACH_VEC_ELT (path, i, x)
3333 dump_printf (MSG_NOTE, "%T ", USE_FROM_PTR (x->second));
3334 dump_printf (MSG_NOTE, "\n");
3337 /* Check whether the reduction path detected is valid. */
3338 bool fail = path.length () == 0;
3339 bool neg = false;
3340 int sign = -1;
3341 *code = ERROR_MARK;
3342 for (unsigned i = 1; i < path.length (); ++i)
3344 gimple *use_stmt = USE_STMT (path[i].second);
3345 tree op = USE_FROM_PTR (path[i].second);
3346 if (! is_gimple_assign (use_stmt)
3347 /* The following makes sure we can compute the operand index
3348 easily, and it mostly disallows chaining via COND_EXPR condition
3349 operands. */
3350 || (gimple_assign_rhs1_ptr (use_stmt) != path[i].second->use
3351 && (gimple_num_ops (use_stmt) <= 2
3352 || gimple_assign_rhs2_ptr (use_stmt) != path[i].second->use)
3353 && (gimple_num_ops (use_stmt) <= 3
3354 || gimple_assign_rhs3_ptr (use_stmt) != path[i].second->use)))
3356 fail = true;
3357 break;
3359 /* Check that there's only a single stmt the op is used on. For the
3360 non-value-changing tail and the last stmt, allow out-of-loop uses.
3361 ??? We could relax this and handle arbitrary live stmts by
3362 forcing a scalar epilogue for example. */
3363 imm_use_iterator imm_iter;
3364 gimple *op_use_stmt;
3365 unsigned cnt = 0;
3366 FOR_EACH_IMM_USE_STMT (op_use_stmt, imm_iter, op)
3367 if (!is_gimple_debug (op_use_stmt)
3368 && (*code != ERROR_MARK
3369 || flow_bb_inside_loop_p (loop, gimple_bb (op_use_stmt))))
3371 /* We want to allow x + x but not x < 1 ? x : 2. */
3372 if (is_gimple_assign (op_use_stmt)
3373 && gimple_assign_rhs_code (op_use_stmt) == COND_EXPR)
3375 use_operand_p use_p;
3376 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
3377 cnt++;
3379 else
3380 cnt++;
3382 if (cnt != 1)
3384 fail = true;
3385 break;
3387 tree_code use_code = gimple_assign_rhs_code (use_stmt);
3388 if (use_code == MINUS_EXPR)
3390 use_code = PLUS_EXPR;
3391 /* Track whether we negate the reduction value each iteration. */
3392 if (gimple_assign_rhs2 (use_stmt) == op)
3393 neg = ! neg;
3395 if (CONVERT_EXPR_CODE_P (use_code)
3396 && tree_nop_conversion_p (TREE_TYPE (gimple_assign_lhs (use_stmt)),
3397 TREE_TYPE (gimple_assign_rhs1 (use_stmt))))
3399 else if (*code == ERROR_MARK)
3401 *code = use_code;
3402 sign = TYPE_SIGN (TREE_TYPE (gimple_assign_lhs (use_stmt)));
3404 else if (use_code != *code)
3406 fail = true;
3407 break;
3409 else if ((use_code == MIN_EXPR
3410 || use_code == MAX_EXPR)
3411 && sign != TYPE_SIGN (TREE_TYPE (gimple_assign_lhs (use_stmt))))
3413 fail = true;
3414 break;
3417 return ! fail && ! neg && *code != ERROR_MARK;
3420 bool
3421 check_reduction_path (dump_user_location_t loc, loop_p loop, gphi *phi,
3422 tree loop_arg, enum tree_code code)
3424 auto_vec<std::pair<ssa_op_iter, use_operand_p> > path;
3425 enum tree_code code_;
3426 return (check_reduction_path (loc, loop, phi, loop_arg, &code_, path)
3427 && code_ == code);
3432 /* Function vect_is_simple_reduction
3434 (1) Detect a cross-iteration def-use cycle that represents a simple
3435 reduction computation. We look for the following pattern:
3437 loop_header:
3438 a1 = phi < a0, a2 >
3439 a3 = ...
3440 a2 = operation (a3, a1)
3444 a3 = ...
3445 loop_header:
3446 a1 = phi < a0, a2 >
3447 a2 = operation (a3, a1)
3449 such that:
3450 1. operation is commutative and associative and it is safe to
3451 change the order of the computation
3452 2. no uses for a2 in the loop (a2 is used out of the loop)
3453 3. no uses of a1 in the loop besides the reduction operation
3454 4. no uses of a1 outside the loop.
3456 Conditions 1,4 are tested here.
3457 Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.
3459 (2) Detect a cross-iteration def-use cycle in nested loops, i.e.,
3460 nested cycles.
3462 (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double
3463 reductions:
3465 a1 = phi < a0, a2 >
3466 inner loop (def of a3)
3467 a2 = phi < a3 >
3469    (4) Detect condition expressions, i.e.:
3470 for (int i = 0; i < N; i++)
3471 if (a[i] < val)
3472 ret_val = a[i];
3476 static stmt_vec_info
3477 vect_is_simple_reduction (loop_vec_info loop_info, stmt_vec_info phi_info,
3478 bool *double_reduc, bool *reduc_chain_p)
3480 gphi *phi = as_a <gphi *> (phi_info->stmt);
3481 gimple *phi_use_stmt = NULL;
3482 imm_use_iterator imm_iter;
3483 use_operand_p use_p;
3485 *double_reduc = false;
3486 *reduc_chain_p = false;
3487 STMT_VINFO_REDUC_TYPE (phi_info) = TREE_CODE_REDUCTION;
3489 tree phi_name = PHI_RESULT (phi);
3490 /* ??? If there are no uses of the PHI result the inner loop reduction
3491 won't be detected as possibly double-reduction by vectorizable_reduction
3492 because that tries to walk the PHI arg from the preheader edge which
3493 can be constant. See PR60382. */
3494 if (has_zero_uses (phi_name))
3495 return NULL;
3496 class loop *loop = (gimple_bb (phi))->loop_father;
3497 unsigned nphi_def_loop_uses = 0;
3498 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, phi_name)
3500 gimple *use_stmt = USE_STMT (use_p);
3501 if (is_gimple_debug (use_stmt))
3502 continue;
3504 if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
3506 if (dump_enabled_p ())
3507 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3508 "intermediate value used outside loop.\n");
3510 return NULL;
3513 nphi_def_loop_uses++;
3514 phi_use_stmt = use_stmt;
3517 tree latch_def = PHI_ARG_DEF_FROM_EDGE (phi, loop_latch_edge (loop));
3518 if (TREE_CODE (latch_def) != SSA_NAME)
3520 if (dump_enabled_p ())
3521 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3522 "reduction: not ssa_name: %T\n", latch_def);
3523 return NULL;
3526 stmt_vec_info def_stmt_info = loop_info->lookup_def (latch_def);
3527 if (!def_stmt_info
3528 || !flow_bb_inside_loop_p (loop, gimple_bb (def_stmt_info->stmt)))
3529 return NULL;
3531 bool nested_in_vect_loop
3532 = flow_loop_nested_p (LOOP_VINFO_LOOP (loop_info), loop);
3533 unsigned nlatch_def_loop_uses = 0;
3534 auto_vec<gphi *, 3> lcphis;
3535 bool inner_loop_of_double_reduc = false;
3536 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, latch_def)
3538 gimple *use_stmt = USE_STMT (use_p);
3539 if (is_gimple_debug (use_stmt))
3540 continue;
3541 if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
3542 nlatch_def_loop_uses++;
3543 else
3545 /* We can have more than one loop-closed PHI. */
3546 lcphis.safe_push (as_a <gphi *> (use_stmt));
3547 if (nested_in_vect_loop
3548 && (STMT_VINFO_DEF_TYPE (loop_info->lookup_stmt (use_stmt))
3549 == vect_double_reduction_def))
3550 inner_loop_of_double_reduc = true;
3554   /* If we are vectorizing an inner reduction, we execute it in the
3555      original order only when it is not part of a double
3556      reduction.  */
3557 if (nested_in_vect_loop && !inner_loop_of_double_reduc)
3559 if (dump_enabled_p ())
3560 report_vect_op (MSG_NOTE, def_stmt_info->stmt,
3561 "detected nested cycle: ");
3562 return def_stmt_info;
3565   /* If this isn't a nested cycle, or if the nested cycle reduction value
3566      is used outside of the inner loop, we cannot handle uses of the reduction
3567      value.  */
3568 if (nlatch_def_loop_uses > 1 || nphi_def_loop_uses > 1)
3570 if (dump_enabled_p ())
3571 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3572 "reduction used in loop.\n");
3573 return NULL;
3576 /* If DEF_STMT is a phi node itself, we expect it to have a single argument
3577 defined in the inner loop. */
3578 if (gphi *def_stmt = dyn_cast <gphi *> (def_stmt_info->stmt))
3580 tree op1 = PHI_ARG_DEF (def_stmt, 0);
3581 if (gimple_phi_num_args (def_stmt) != 1
3582 || TREE_CODE (op1) != SSA_NAME)
3584 if (dump_enabled_p ())
3585 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3586 "unsupported phi node definition.\n");
3588 return NULL;
3591 gimple *def1 = SSA_NAME_DEF_STMT (op1);
3592 if (gimple_bb (def1)
3593 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
3594 && loop->inner
3595 && flow_bb_inside_loop_p (loop->inner, gimple_bb (def1))
3596 && is_gimple_assign (def1)
3597 && is_a <gphi *> (phi_use_stmt)
3598 && flow_bb_inside_loop_p (loop->inner, gimple_bb (phi_use_stmt)))
3600 if (dump_enabled_p ())
3601 report_vect_op (MSG_NOTE, def_stmt,
3602 "detected double reduction: ");
3604 *double_reduc = true;
3605 return def_stmt_info;
3608 return NULL;
3611   /* Look for the expression computing latch_def from the loop PHI result.  */
3612 auto_vec<std::pair<ssa_op_iter, use_operand_p> > path;
3613 enum tree_code code;
3614 if (check_reduction_path (vect_location, loop, phi, latch_def, &code,
3615 path))
3617 STMT_VINFO_REDUC_CODE (phi_info) = code;
3618 if (code == COND_EXPR && !nested_in_vect_loop)
3619 STMT_VINFO_REDUC_TYPE (phi_info) = COND_REDUCTION;
3621 /* Fill in STMT_VINFO_REDUC_IDX and gather stmts for an SLP
3622 reduction chain for which the additional restriction is that
3623 all operations in the chain are the same. */
3624 auto_vec<stmt_vec_info, 8> reduc_chain;
3625 unsigned i;
3626 bool is_slp_reduc = !nested_in_vect_loop && code != COND_EXPR;
3627 for (i = path.length () - 1; i >= 1; --i)
3629 gimple *stmt = USE_STMT (path[i].second);
3630 stmt_vec_info stmt_info = loop_info->lookup_stmt (stmt);
3631 STMT_VINFO_REDUC_IDX (stmt_info)
3632 = path[i].second->use - gimple_assign_rhs1_ptr (stmt);
3633 enum tree_code stmt_code = gimple_assign_rhs_code (stmt);
3634 bool leading_conversion = (CONVERT_EXPR_CODE_P (stmt_code)
3635 && (i == 1 || i == path.length () - 1));
3636 if ((stmt_code != code && !leading_conversion)
3637 /* We can only handle the final value in epilogue
3638 generation for reduction chains. */
3639 || (i != 1 && !has_single_use (gimple_assign_lhs (stmt))))
3640 is_slp_reduc = false;
3641 	  /* For reduction chains we support trailing/leading
3642 	     conversions.  We do not store those in the actual chain.  */
3643 if (leading_conversion)
3644 continue;
3645 reduc_chain.safe_push (stmt_info);
3647 if (is_slp_reduc && reduc_chain.length () > 1)
3649 for (unsigned i = 0; i < reduc_chain.length () - 1; ++i)
3651 REDUC_GROUP_FIRST_ELEMENT (reduc_chain[i]) = reduc_chain[0];
3652 REDUC_GROUP_NEXT_ELEMENT (reduc_chain[i]) = reduc_chain[i+1];
3654 REDUC_GROUP_FIRST_ELEMENT (reduc_chain.last ()) = reduc_chain[0];
3655 REDUC_GROUP_NEXT_ELEMENT (reduc_chain.last ()) = NULL;
3657 /* Save the chain for further analysis in SLP detection. */
3658 LOOP_VINFO_REDUCTION_CHAINS (loop_info).safe_push (reduc_chain[0]);
3659 REDUC_GROUP_SIZE (reduc_chain[0]) = reduc_chain.length ();
3661 *reduc_chain_p = true;
3662 if (dump_enabled_p ())
3663 dump_printf_loc (MSG_NOTE, vect_location,
3664 "reduction: detected reduction chain\n");
3666 else if (dump_enabled_p ())
3667 dump_printf_loc (MSG_NOTE, vect_location,
3668 "reduction: detected reduction\n");
3670 return def_stmt_info;
3673 if (dump_enabled_p ())
3674 dump_printf_loc (MSG_NOTE, vect_location,
3675 "reduction: unknown pattern\n");
3677 return NULL;
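
/* For illustration (a hypothetical source loop): a body such as

     for (i = 0; i < n; i++)
       {
	 s += a[i];
	 s += b[i];
       }

   gives a path of two PLUS_EXPR statements and is therefore recorded above
   as a reduction chain in LOOP_VINFO_REDUCTION_CHAINS, while the single
   statement form s += a[i] is reported as a plain "detected reduction".  */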
3680 /* Estimate the number of peeled epilogue iterations for LOOP_VINFO.
3681 PEEL_ITERS_PROLOGUE is the number of peeled prologue iterations,
3682 or -1 if not known. */
3684 static int
3685 vect_get_peel_iters_epilogue (loop_vec_info loop_vinfo, int peel_iters_prologue)
3687 int assumed_vf = vect_vf_for_cost (loop_vinfo);
3688 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) || peel_iters_prologue == -1)
3690 if (dump_enabled_p ())
3691 dump_printf_loc (MSG_NOTE, vect_location,
3692 "cost model: epilogue peel iters set to vf/2 "
3693 "because loop iterations are unknown .\n");
3694 return assumed_vf / 2;
3696 else
3698 int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
3699 peel_iters_prologue = MIN (niters, peel_iters_prologue);
3700 int peel_iters_epilogue = (niters - peel_iters_prologue) % assumed_vf;
3701       /* If we need to peel for gaps but the computed epilogue peeling is
3702 	 zero, we still have to peel VF iterations.  */
3703 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) && !peel_iters_epilogue)
3704 peel_iters_epilogue = assumed_vf;
3705 return peel_iters_epilogue;
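
/* Worked example with hypothetical numbers: for a known NITERS of 100, an
   assumed VF of 8 and 3 peeled prologue iterations the epilogue gets
   (100 - 3) % 8 = 1 iteration; had that remainder been 0 while peeling for
   gaps was required, a full VF (8) iterations would be charged instead.  */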
3709 /* Calculate cost of peeling the loop PEEL_ITERS_PROLOGUE times. */
3710 int
3711 vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue,
3712 int *peel_iters_epilogue,
3713 stmt_vector_for_cost *scalar_cost_vec,
3714 stmt_vector_for_cost *prologue_cost_vec,
3715 stmt_vector_for_cost *epilogue_cost_vec)
3717 int retval = 0;
3719 *peel_iters_epilogue
3720 = vect_get_peel_iters_epilogue (loop_vinfo, peel_iters_prologue);
3722 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
3724       /* If peeled iterations are known but the number of scalar loop
3725 	 iterations is unknown, count a taken branch per peeled loop.  */
3726 if (peel_iters_prologue > 0)
3727 retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
3728 NULL, NULL_TREE, 0, vect_prologue);
3729 if (*peel_iters_epilogue > 0)
3730 retval += record_stmt_cost (epilogue_cost_vec, 1, cond_branch_taken,
3731 NULL, NULL_TREE, 0, vect_epilogue);
3734 stmt_info_for_cost *si;
3735 int j;
3736 if (peel_iters_prologue)
3737 FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
3738 retval += record_stmt_cost (prologue_cost_vec,
3739 si->count * peel_iters_prologue,
3740 si->kind, si->stmt_info, si->misalign,
3741 vect_prologue);
3742 if (*peel_iters_epilogue)
3743 FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
3744 retval += record_stmt_cost (epilogue_cost_vec,
3745 si->count * *peel_iters_epilogue,
3746 si->kind, si->stmt_info, si->misalign,
3747 vect_epilogue);
3749 return retval;
3752 /* Function vect_estimate_min_profitable_iters
3754 Return the number of iterations required for the vector version of the
3755 loop to be profitable relative to the cost of the scalar version of the
3756 loop.
3758    *RET_MIN_PROFITABLE_NITERS is a cost model profitability threshold
3759    of iterations for vectorization.  A value of -1 means loop vectorization
3760    is not profitable.  This returned value may be used for a dynamic
3761    profitability check.
3763 *RET_MIN_PROFITABLE_ESTIMATE is a profitability threshold to be used
3764 for static check against estimated number of iterations. */
3766 static void
3767 vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
3768 int *ret_min_profitable_niters,
3769 int *ret_min_profitable_estimate)
3771 int min_profitable_iters;
3772 int min_profitable_estimate;
3773 int peel_iters_prologue;
3774 int peel_iters_epilogue;
3775 unsigned vec_inside_cost = 0;
3776 int vec_outside_cost = 0;
3777 unsigned vec_prologue_cost = 0;
3778 unsigned vec_epilogue_cost = 0;
3779 int scalar_single_iter_cost = 0;
3780 int scalar_outside_cost = 0;
3781 int assumed_vf = vect_vf_for_cost (loop_vinfo);
3782 int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
3783 void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3785 /* Cost model disabled. */
3786 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
3788 if (dump_enabled_p ())
3789 dump_printf_loc (MSG_NOTE, vect_location, "cost model disabled.\n");
3790 *ret_min_profitable_niters = 0;
3791 *ret_min_profitable_estimate = 0;
3792 return;
3795 /* Requires loop versioning tests to handle misalignment. */
3796 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
3798 /* FIXME: Make cost depend on complexity of individual check. */
3799 unsigned len = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ();
3800 (void) add_stmt_cost (loop_vinfo, target_cost_data, len, vector_stmt,
3801 NULL, NULL_TREE, 0, vect_prologue);
3802 if (dump_enabled_p ())
3803 dump_printf (MSG_NOTE,
3804 "cost model: Adding cost of checks for loop "
3805 "versioning to treat misalignment.\n");
3808 /* Requires loop versioning with alias checks. */
3809 if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
3811 /* FIXME: Make cost depend on complexity of individual check. */
3812 unsigned len = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).length ();
3813 (void) add_stmt_cost (loop_vinfo, target_cost_data, len, vector_stmt,
3814 NULL, NULL_TREE, 0, vect_prologue);
3815 len = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).length ();
3816 if (len)
3817 /* Count LEN - 1 ANDs and LEN comparisons. */
3818 (void) add_stmt_cost (loop_vinfo, target_cost_data, len * 2 - 1,
3819 scalar_stmt, NULL, NULL_TREE, 0, vect_prologue);
3820 len = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).length ();
3821 if (len)
3823 /* Count LEN - 1 ANDs and LEN comparisons. */
3824 unsigned int nstmts = len * 2 - 1;
3825 /* +1 for each bias that needs adding. */
3826 for (unsigned int i = 0; i < len; ++i)
3827 if (!LOOP_VINFO_LOWER_BOUNDS (loop_vinfo)[i].unsigned_p)
3828 nstmts += 1;
3829 (void) add_stmt_cost (loop_vinfo, target_cost_data, nstmts,
3830 scalar_stmt, NULL, NULL_TREE, 0, vect_prologue);
3832 if (dump_enabled_p ())
3833 dump_printf (MSG_NOTE,
3834 "cost model: Adding cost of checks for loop "
3835 "versioning aliasing.\n");
3838 /* Requires loop versioning with niter checks. */
3839 if (LOOP_REQUIRES_VERSIONING_FOR_NITERS (loop_vinfo))
3841 /* FIXME: Make cost depend on complexity of individual check. */
3842 (void) add_stmt_cost (loop_vinfo, target_cost_data, 1, vector_stmt,
3843 NULL, NULL_TREE, 0, vect_prologue);
3844 if (dump_enabled_p ())
3845 dump_printf (MSG_NOTE,
3846 "cost model: Adding cost of checks for loop "
3847 "versioning niters.\n");
3850 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
3851 (void) add_stmt_cost (loop_vinfo, target_cost_data, 1, cond_branch_taken,
3852 NULL, NULL_TREE, 0, vect_prologue);
3854   /* Count statements in the scalar loop, using this as the scalar cost for a
3855      single iteration for now.
3857 TODO: Add outer loop support.
3859 TODO: Consider assigning different costs to different scalar
3860 statements. */
3862 scalar_single_iter_cost
3863 = LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo);
3865 /* Add additional cost for the peeled instructions in prologue and epilogue
3866 loop. (For fully-masked loops there will be no peeling.)
3868      FORNOW: If we don't know the value of peel_iters for prologue or epilogue
3869      at compile time, we assume it's vf/2 (the worst would be vf-1).
3871 TODO: Build an expression that represents peel_iters for prologue and
3872 epilogue to be used in a run-time test. */
3874 bool prologue_need_br_taken_cost = false;
3875 bool prologue_need_br_not_taken_cost = false;
3877 /* Calculate peel_iters_prologue. */
3878 if (vect_use_loop_mask_for_alignment_p (loop_vinfo))
3879 peel_iters_prologue = 0;
3880 else if (npeel < 0)
3882 peel_iters_prologue = assumed_vf / 2;
3883 if (dump_enabled_p ())
3884 dump_printf (MSG_NOTE, "cost model: "
3885 "prologue peel iters set to vf/2.\n");
3887 /* If peeled iterations are unknown, count a taken branch and a not taken
3888 branch per peeled loop. Even if scalar loop iterations are known,
3889 vector iterations are not known since peeled prologue iterations are
3890 not known. Hence guards remain the same. */
3891 prologue_need_br_taken_cost = true;
3892 prologue_need_br_not_taken_cost = true;
3894 else
3896 peel_iters_prologue = npeel;
3897 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && peel_iters_prologue > 0)
3898 	/* If peeled iterations are known but the number of scalar loop
3899 	   iterations is unknown, count a taken branch per peeled loop.  */
3900 prologue_need_br_taken_cost = true;
3903 bool epilogue_need_br_taken_cost = false;
3904 bool epilogue_need_br_not_taken_cost = false;
3906 /* Calculate peel_iters_epilogue. */
3907 if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
3908 /* We need to peel exactly one iteration for gaps. */
3909 peel_iters_epilogue = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0;
3910 else if (npeel < 0)
3912       /* If peeling for alignment is unknown, the loop bound of the main
3913 	 loop becomes unknown.  */
3914 peel_iters_epilogue = assumed_vf / 2;
3915 if (dump_enabled_p ())
3916 dump_printf (MSG_NOTE, "cost model: "
3917 "epilogue peel iters set to vf/2 because "
3918 "peeling for alignment is unknown.\n");
3920 /* See the same reason above in peel_iters_prologue calculation. */
3921 epilogue_need_br_taken_cost = true;
3922 epilogue_need_br_not_taken_cost = true;
3924 else
3926 peel_iters_epilogue = vect_get_peel_iters_epilogue (loop_vinfo, npeel);
3927 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && peel_iters_epilogue > 0)
3928 	/* If peeled iterations are known but the number of scalar loop
3929 	   iterations is unknown, count a taken branch per peeled loop.  */
3930 epilogue_need_br_taken_cost = true;
3933 stmt_info_for_cost *si;
3934 int j;
3935 /* Add costs associated with peel_iters_prologue. */
3936 if (peel_iters_prologue)
3937 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
3939 (void) add_stmt_cost (loop_vinfo, target_cost_data,
3940 si->count * peel_iters_prologue, si->kind,
3941 si->stmt_info, si->vectype, si->misalign,
3942 vect_prologue);
3945 /* Add costs associated with peel_iters_epilogue. */
3946 if (peel_iters_epilogue)
3947 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
3949 (void) add_stmt_cost (loop_vinfo, target_cost_data,
3950 si->count * peel_iters_epilogue, si->kind,
3951 si->stmt_info, si->vectype, si->misalign,
3952 vect_epilogue);
3955 /* Add possible cond_branch_taken/cond_branch_not_taken cost. */
3957 if (prologue_need_br_taken_cost)
3958 (void) add_stmt_cost (loop_vinfo, target_cost_data, 1, cond_branch_taken,
3959 NULL, NULL_TREE, 0, vect_prologue);
3961 if (prologue_need_br_not_taken_cost)
3962 (void) add_stmt_cost (loop_vinfo, target_cost_data, 1,
3963 cond_branch_not_taken, NULL, NULL_TREE, 0,
3964 vect_prologue);
3966 if (epilogue_need_br_taken_cost)
3967 (void) add_stmt_cost (loop_vinfo, target_cost_data, 1, cond_branch_taken,
3968 NULL, NULL_TREE, 0, vect_epilogue);
3970 if (epilogue_need_br_not_taken_cost)
3971 (void) add_stmt_cost (loop_vinfo, target_cost_data, 1,
3972 cond_branch_not_taken, NULL, NULL_TREE, 0,
3973 vect_epilogue);
3975 /* Take care of special costs for rgroup controls of partial vectors. */
3976 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
3978 /* Calculate how many masks we need to generate. */
3979 unsigned int num_masks = 0;
3980 rgroup_controls *rgm;
3981 unsigned int num_vectors_m1;
3982 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), num_vectors_m1, rgm)
3983 if (rgm->type)
3984 num_masks += num_vectors_m1 + 1;
3985 gcc_assert (num_masks > 0);
3987 /* In the worst case, we need to generate each mask in the prologue
3988 and in the loop body. One of the loop body mask instructions
3989 replaces the comparison in the scalar loop, and since we don't
3990 count the scalar comparison against the scalar body, we shouldn't
3991 count that vector instruction against the vector body either.
3993 Sometimes we can use unpacks instead of generating prologue
3994 masks and sometimes the prologue mask will fold to a constant,
3995 so the actual prologue cost might be smaller. However, it's
3996 simpler and safer to use the worst-case cost; if this ends up
3997 being the tie-breaker between vectorizing or not, then it's
3998 probably better not to vectorize. */
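
      /* For illustration (hypothetical rgroup counts): with two rgroups
	 needing 1 and 2 mask vectors respectively, num_masks is 3, so the
	 worst-case costing below charges 3 mask-generating vector_stmts to
	 the prologue and 2 to the body.  */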
3999 (void) add_stmt_cost (loop_vinfo, target_cost_data, num_masks,
4000 vector_stmt, NULL, NULL_TREE, 0, vect_prologue);
4001 (void) add_stmt_cost (loop_vinfo, target_cost_data, num_masks - 1,
4002 vector_stmt, NULL, NULL_TREE, 0, vect_body);
4004 else if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
4006 /* Referring to the functions vect_set_loop_condition_partial_vectors
4007 and vect_set_loop_controls_directly, we need to generate each
4008 length in the prologue and in the loop body if required. Although
4009 there are some possible optimizations, we consider the worst case
4010 here. */
4012 bool niters_known_p = LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo);
4013 bool need_iterate_p
4014 = (!LOOP_VINFO_EPILOGUE_P (loop_vinfo)
4015 && !vect_known_niters_smaller_than_vf (loop_vinfo));
4017       /* Calculate how many statements need to be added.  */
4018 unsigned int prologue_stmts = 0;
4019 unsigned int body_stmts = 0;
4021 rgroup_controls *rgc;
4022 unsigned int num_vectors_m1;
4023 FOR_EACH_VEC_ELT (LOOP_VINFO_LENS (loop_vinfo), num_vectors_m1, rgc)
4024 if (rgc->type)
4026 /* May need one SHIFT for nitems_total computation. */
4027 unsigned nitems = rgc->max_nscalars_per_iter * rgc->factor;
4028 if (nitems != 1 && !niters_known_p)
4029 prologue_stmts += 1;
4031 /* May need one MAX and one MINUS for wrap around. */
4032 if (vect_rgroup_iv_might_wrap_p (loop_vinfo, rgc))
4033 prologue_stmts += 2;
4035 	    /* Need one MAX and one MINUS for each batch limit except for
4036 	       the first one.  */
4037 prologue_stmts += num_vectors_m1 * 2;
4039 unsigned int num_vectors = num_vectors_m1 + 1;
4041 /* Need to set up lengths in prologue, only one MIN required
4042 for each since start index is zero. */
4043 prologue_stmts += num_vectors;
4045 /* Each may need two MINs and one MINUS to update lengths in body
4046 for next iteration. */
4047 if (need_iterate_p)
4048 body_stmts += 3 * num_vectors;
4051 (void) add_stmt_cost (loop_vinfo, target_cost_data, prologue_stmts,
4052 scalar_stmt, NULL, NULL_TREE, 0, vect_prologue);
4053 (void) add_stmt_cost (loop_vinfo, target_cost_data, body_stmts,
4054 scalar_stmt, NULL, NULL_TREE, 0, vect_body);
4057 /* FORNOW: The scalar outside cost is incremented in one of the
4058 following ways:
4060 1. The vectorizer checks for alignment and aliasing and generates
4061 a condition that allows dynamic vectorization. A cost model
4062 check is ANDED with the versioning condition. Hence scalar code
4063 path now has the added cost of the versioning check.
4065 if (cost > th & versioning_check)
4066 jmp to vector code
4068      Hence the run-time scalar cost is incremented by a not-taken branch cost.
4070 2. The vectorizer then checks if a prologue is required. If the
4071 cost model check was not done before during versioning, it has to
4072 be done before the prologue check.
4074 if (cost <= th)
4075 prologue = scalar_iters
4076 if (prologue == 0)
4077 jmp to vector code
4078 else
4079 execute prologue
4080 if (prologue == num_iters)
4081 go to exit
4083 Hence the run-time scalar cost is incremented by a taken branch,
4084 plus a not-taken branch, plus a taken branch cost.
4086 3. The vectorizer then checks if an epilogue is required. If the
4087 cost model check was not done before during prologue check, it
4088 has to be done with the epilogue check.
4090 if (prologue == 0)
4091 jmp to vector code
4092 else
4093 execute prologue
4094 if (prologue == num_iters)
4095 go to exit
4096 vector code:
4097 if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
4098 jmp to epilogue
4100 Hence the run-time scalar cost should be incremented by 2 taken
4101 branches.
4103      TODO: The back end may reorder the BBs differently and reverse
4104 conditions/branch directions. Change the estimates below to
4105 something more reasonable. */
4107   /* If the number of iterations is known and we do not do versioning, we can
4108      decide whether to vectorize at compile time.  Hence the scalar version
4109      does not carry cost model guard costs.  */
4110 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
4111 || LOOP_REQUIRES_VERSIONING (loop_vinfo))
4113 /* Cost model check occurs at versioning. */
4114 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
4115 scalar_outside_cost += vect_get_stmt_cost (cond_branch_not_taken);
4116 else
4118 /* Cost model check occurs at prologue generation. */
4119 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
4120 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken)
4121 + vect_get_stmt_cost (cond_branch_not_taken);
4122 /* Cost model check occurs at epilogue generation. */
4123 else
4124 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken);
4128 /* Complete the target-specific cost calculations. */
4129 finish_cost (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo), &vec_prologue_cost,
4130 &vec_inside_cost, &vec_epilogue_cost);
4132 vec_outside_cost = (int)(vec_prologue_cost + vec_epilogue_cost);
4134 /* Stash the costs so that we can compare two loop_vec_infos. */
4135 loop_vinfo->vec_inside_cost = vec_inside_cost;
4136 loop_vinfo->vec_outside_cost = vec_outside_cost;
4138 if (dump_enabled_p ())
4140 dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
4141 dump_printf (MSG_NOTE, " Vector inside of loop cost: %d\n",
4142 vec_inside_cost);
4143 dump_printf (MSG_NOTE, " Vector prologue cost: %d\n",
4144 vec_prologue_cost);
4145 dump_printf (MSG_NOTE, " Vector epilogue cost: %d\n",
4146 vec_epilogue_cost);
4147 dump_printf (MSG_NOTE, " Scalar iteration cost: %d\n",
4148 scalar_single_iter_cost);
4149 dump_printf (MSG_NOTE, " Scalar outside cost: %d\n",
4150 scalar_outside_cost);
4151 dump_printf (MSG_NOTE, " Vector outside cost: %d\n",
4152 vec_outside_cost);
4153 dump_printf (MSG_NOTE, " prologue iterations: %d\n",
4154 peel_iters_prologue);
4155 dump_printf (MSG_NOTE, " epilogue iterations: %d\n",
4156 peel_iters_epilogue);
4159 /* Calculate number of iterations required to make the vector version
4160 profitable, relative to the loop bodies only. The following condition
4161 must hold true:
4162 SIC * niters + SOC > VIC * ((niters - NPEEL) / VF) + VOC
4163 where
4164 SIC = scalar iteration cost, VIC = vector iteration cost,
4165 VOC = vector outside cost, VF = vectorization factor,
4166 NPEEL = prologue iterations + epilogue iterations,
4167 SOC = scalar outside cost for run time cost model check. */
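
  /* Worked example with hypothetical costs: for SIC = 4, VIC = 10, VF = 4,
     NPEEL = 2, VOC = 30 and SOC = 0 the condition reads
     4 * niters > 10 * ((niters - 2) / 4) + 30 and starts to hold in the
     mid-teens; the code below derives the corresponding runtime threshold
     and in addition requires the vectorized loop to execute at least
     once.  */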
4169 int saving_per_viter = (scalar_single_iter_cost * assumed_vf
4170 - vec_inside_cost);
4171 if (saving_per_viter <= 0)
4173 if (LOOP_VINFO_LOOP (loop_vinfo)->force_vectorize)
4174 warning_at (vect_location.get_location_t (), OPT_Wopenmp_simd,
4175 "vectorization did not happen for a simd loop");
4177 if (dump_enabled_p ())
4178 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4179 "cost model: the vector iteration cost = %d "
4180 "divided by the scalar iteration cost = %d "
4181 "is greater or equal to the vectorization factor = %d"
4182 ".\n",
4183 vec_inside_cost, scalar_single_iter_cost, assumed_vf);
4184 *ret_min_profitable_niters = -1;
4185 *ret_min_profitable_estimate = -1;
4186 return;
4189 /* ??? The "if" arm is written to handle all cases; see below for what
4190 we would do for !LOOP_VINFO_USING_PARTIAL_VECTORS_P. */
4191 if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
4193 /* Rewriting the condition above in terms of the number of
4194 vector iterations (vniters) rather than the number of
4195 scalar iterations (niters) gives:
4197 SIC * (vniters * VF + NPEEL) + SOC > VIC * vniters + VOC
4199 <==> vniters * (SIC * VF - VIC) > VOC - SIC * NPEEL - SOC
4201 For integer N, X and Y when X > 0:
4203 N * X > Y <==> N >= (Y /[floor] X) + 1. */
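
      /* For instance, with X = 6 and Y = 20 we get 20 /[floor] 6 + 1 = 4,
	 and N = 4 is indeed the smallest integer with N * 6 > 20.  */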
4204 int outside_overhead = (vec_outside_cost
4205 - scalar_single_iter_cost * peel_iters_prologue
4206 - scalar_single_iter_cost * peel_iters_epilogue
4207 - scalar_outside_cost);
4208 /* We're only interested in cases that require at least one
4209 vector iteration. */
4210 int min_vec_niters = 1;
4211 if (outside_overhead > 0)
4212 min_vec_niters = outside_overhead / saving_per_viter + 1;
4214 if (dump_enabled_p ())
4215 dump_printf (MSG_NOTE, " Minimum number of vector iterations: %d\n",
4216 min_vec_niters);
4218 if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
4220 /* Now that we know the minimum number of vector iterations,
4221 find the minimum niters for which the scalar cost is larger:
4223 SIC * niters > VIC * vniters + VOC - SOC
4225 We know that the minimum niters is no more than
4226 vniters * VF + NPEEL, but it might be (and often is) less
4227 than that if a partial vector iteration is cheaper than the
4228 equivalent scalar code. */
4229 int threshold = (vec_inside_cost * min_vec_niters
4230 + vec_outside_cost
4231 - scalar_outside_cost);
4232 if (threshold <= 0)
4233 min_profitable_iters = 1;
4234 else
4235 min_profitable_iters = threshold / scalar_single_iter_cost + 1;
4237 else
4238 /* Convert the number of vector iterations into a number of
4239 scalar iterations. */
4240 min_profitable_iters = (min_vec_niters * assumed_vf
4241 + peel_iters_prologue
4242 + peel_iters_epilogue);
4244 else
4246 min_profitable_iters = ((vec_outside_cost - scalar_outside_cost)
4247 * assumed_vf
4248 - vec_inside_cost * peel_iters_prologue
4249 - vec_inside_cost * peel_iters_epilogue);
4250 if (min_profitable_iters <= 0)
4251 min_profitable_iters = 0;
4252 else
4254 min_profitable_iters /= saving_per_viter;
4256 if ((scalar_single_iter_cost * assumed_vf * min_profitable_iters)
4257 <= (((int) vec_inside_cost * min_profitable_iters)
4258 + (((int) vec_outside_cost - scalar_outside_cost)
4259 * assumed_vf)))
4260 min_profitable_iters++;
4264 if (dump_enabled_p ())
4265 dump_printf (MSG_NOTE,
4266 " Calculated minimum iters for profitability: %d\n",
4267 min_profitable_iters);
4269 if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
4270 && min_profitable_iters < (assumed_vf + peel_iters_prologue))
4271 /* We want the vectorized loop to execute at least once. */
4272 min_profitable_iters = assumed_vf + peel_iters_prologue;
4273 else if (min_profitable_iters < peel_iters_prologue)
4274 /* For LOOP_VINFO_USING_PARTIAL_VECTORS_P, we need to ensure the
4275 vectorized loop executes at least once. */
4276 min_profitable_iters = peel_iters_prologue;
4278 if (dump_enabled_p ())
4279 dump_printf_loc (MSG_NOTE, vect_location,
4280 " Runtime profitability threshold = %d\n",
4281 min_profitable_iters);
4283 *ret_min_profitable_niters = min_profitable_iters;
4285 /* Calculate number of iterations required to make the vector version
4286 profitable, relative to the loop bodies only.
4288      The non-vectorized variant costs SIC * niters and it must win over the
4289      vector variant on the expected loop trip count.  The following condition must hold true:
4290 SIC * niters > VIC * ((niters - NPEEL) / VF) + VOC + SOC */
4292 if (vec_outside_cost <= 0)
4293 min_profitable_estimate = 0;
4294 /* ??? This "else if" arm is written to handle all cases; see below for
4295 what we would do for !LOOP_VINFO_USING_PARTIAL_VECTORS_P. */
4296 else if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
4298 /* This is a repeat of the code above, but with + SOC rather
4299 than - SOC. */
4300 int outside_overhead = (vec_outside_cost
4301 - scalar_single_iter_cost * peel_iters_prologue
4302 - scalar_single_iter_cost * peel_iters_epilogue
4303 + scalar_outside_cost);
4304 int min_vec_niters = 1;
4305 if (outside_overhead > 0)
4306 min_vec_niters = outside_overhead / saving_per_viter + 1;
4308 if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
4310 int threshold = (vec_inside_cost * min_vec_niters
4311 + vec_outside_cost
4312 + scalar_outside_cost);
4313 min_profitable_estimate = threshold / scalar_single_iter_cost + 1;
4315 else
4316 min_profitable_estimate = (min_vec_niters * assumed_vf
4317 + peel_iters_prologue
4318 + peel_iters_epilogue);
4320 else
4322 min_profitable_estimate = ((vec_outside_cost + scalar_outside_cost)
4323 * assumed_vf
4324 - vec_inside_cost * peel_iters_prologue
4325 - vec_inside_cost * peel_iters_epilogue)
4326 / ((scalar_single_iter_cost * assumed_vf)
4327 - vec_inside_cost);
4329 min_profitable_estimate = MAX (min_profitable_estimate, min_profitable_iters);
4330 if (dump_enabled_p ())
4331 dump_printf_loc (MSG_NOTE, vect_location,
4332 " Static estimate profitability threshold = %d\n",
4333 min_profitable_estimate);
4335 *ret_min_profitable_estimate = min_profitable_estimate;
4338 /* Writes into SEL a mask for a vec_perm, equivalent to a vec_shr by OFFSET
4339 vector elements (not bits) for a vector with NELT elements. */
4340 static void
4341 calc_vec_perm_mask_for_shift (unsigned int offset, unsigned int nelt,
4342 vec_perm_builder *sel)
4344 /* The encoding is a single stepped pattern. Any wrap-around is handled
4345 by vec_perm_indices. */
4346 sel->new_vector (nelt, 1, 3);
4347 for (unsigned int i = 0; i < 3; i++)
4348 sel->quick_push (i + offset);
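
/* For illustration: with OFFSET = 2 and NELT = 8 the stepped encoding
   {2, 3, 4, ...} expands to the permutation {2, 3, 4, 5, 6, 7, 8, 9}, i.e.
   the low NELT - OFFSET lanes are taken from the shifted first input and the
   remaining lanes from the second input of the two-input vec_perm.  */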
4351 /* Checks whether the target supports whole-vector shifts for vectors of mode
4352 MODE. This is the case if _either_ the platform handles vec_shr_optab, _or_
4353 it supports vec_perm_const with masks for all necessary shift amounts. */
4354 static bool
4355 have_whole_vector_shift (machine_mode mode)
4357 if (optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
4358 return true;
4360 /* Variable-length vectors should be handled via the optab. */
4361 unsigned int nelt;
4362 if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
4363 return false;
4365 vec_perm_builder sel;
4366 vec_perm_indices indices;
4367 for (unsigned int i = nelt / 2; i >= 1; i /= 2)
4369 calc_vec_perm_mask_for_shift (i, nelt, &sel);
4370 indices.new_vector (sel, 2, nelt);
4371 if (!can_vec_perm_const_p (mode, indices, false))
4372 return false;
4374 return true;
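
/* Note: the shift amounts probed above (NELT/2, NELT/4, ..., 1) appear to
   correspond to the log2(NELT) halving steps used when a reduction epilogue
   is implemented with whole-vector shifts.  */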
4377 /* TODO: There is a close dependency between the vect_model_*_cost and
4378    vectorizable_* functions.  Design better to avoid maintenance issues.  */
4380 /* Function vect_model_reduction_cost.
4382 Models cost for a reduction operation, including the vector ops
4383 generated within the strip-mine loop, the initial definition before
4384 the loop, and the epilogue code that must be generated. */
4386 static void
4387 vect_model_reduction_cost (loop_vec_info loop_vinfo,
4388 stmt_vec_info stmt_info, internal_fn reduc_fn,
4389 vect_reduction_type reduction_type,
4390 int ncopies, stmt_vector_for_cost *cost_vec)
4392 int prologue_cost = 0, epilogue_cost = 0, inside_cost;
4393 enum tree_code code;
4394 optab optab;
4395 tree vectype;
4396 machine_mode mode;
4397 class loop *loop = NULL;
4399 if (loop_vinfo)
4400 loop = LOOP_VINFO_LOOP (loop_vinfo);
4402 /* Condition reductions generate two reductions in the loop. */
4403 if (reduction_type == COND_REDUCTION)
4404 ncopies *= 2;
4406 vectype = STMT_VINFO_VECTYPE (stmt_info);
4407 mode = TYPE_MODE (vectype);
4408 stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
4410 code = gimple_assign_rhs_code (orig_stmt_info->stmt);
4412 if (reduction_type == EXTRACT_LAST_REDUCTION)
4413 /* No extra instructions are needed in the prologue. The loop body
4414 operations are costed in vectorizable_condition. */
4415 inside_cost = 0;
4416 else if (reduction_type == FOLD_LEFT_REDUCTION)
4418 /* No extra instructions needed in the prologue. */
4419 prologue_cost = 0;
4421 if (reduc_fn != IFN_LAST)
4422 /* Count one reduction-like operation per vector. */
4423 inside_cost = record_stmt_cost (cost_vec, ncopies, vec_to_scalar,
4424 stmt_info, 0, vect_body);
4425 else
4427 /* Use NELEMENTS extracts and NELEMENTS scalar ops. */
4428 unsigned int nelements = ncopies * vect_nunits_for_cost (vectype);
4429 inside_cost = record_stmt_cost (cost_vec, nelements,
4430 vec_to_scalar, stmt_info, 0,
4431 vect_body);
4432 inside_cost += record_stmt_cost (cost_vec, nelements,
4433 scalar_stmt, stmt_info, 0,
4434 vect_body);
4437 else
4439 /* Add in cost for initial definition.
4440 For cond reduction we have four vectors: initial index, step,
4441 initial result of the data reduction, initial value of the index
4442 reduction. */
4443 int prologue_stmts = reduction_type == COND_REDUCTION ? 4 : 1;
4444 prologue_cost += record_stmt_cost (cost_vec, prologue_stmts,
4445 scalar_to_vec, stmt_info, 0,
4446 vect_prologue);
4448 /* Cost of reduction op inside loop. */
4449 inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
4450 stmt_info, 0, vect_body);
4453 /* Determine cost of epilogue code.
4455 We have a reduction operator that will reduce the vector in one statement.
4456 Also requires scalar extract. */
4458 if (!loop || !nested_in_vect_loop_p (loop, orig_stmt_info))
4460 if (reduc_fn != IFN_LAST)
4462 if (reduction_type == COND_REDUCTION)
4464 	      /* An EQ stmt and a COND_EXPR stmt.  */
4465 epilogue_cost += record_stmt_cost (cost_vec, 2,
4466 vector_stmt, stmt_info, 0,
4467 vect_epilogue);
4468 /* Reduction of the max index and a reduction of the found
4469 values. */
4470 epilogue_cost += record_stmt_cost (cost_vec, 2,
4471 vec_to_scalar, stmt_info, 0,
4472 vect_epilogue);
4473 /* A broadcast of the max value. */
4474 epilogue_cost += record_stmt_cost (cost_vec, 1,
4475 scalar_to_vec, stmt_info, 0,
4476 vect_epilogue);
4478 else
4480 epilogue_cost += record_stmt_cost (cost_vec, 1, vector_stmt,
4481 stmt_info, 0, vect_epilogue);
4482 epilogue_cost += record_stmt_cost (cost_vec, 1,
4483 vec_to_scalar, stmt_info, 0,
4484 vect_epilogue);
4487 else if (reduction_type == COND_REDUCTION)
4489 unsigned estimated_nunits = vect_nunits_for_cost (vectype);
4490 /* Extraction of scalar elements. */
4491 epilogue_cost += record_stmt_cost (cost_vec,
4492 2 * estimated_nunits,
4493 vec_to_scalar, stmt_info, 0,
4494 vect_epilogue);
4495 /* Scalar max reductions via COND_EXPR / MAX_EXPR. */
4496 epilogue_cost += record_stmt_cost (cost_vec,
4497 2 * estimated_nunits - 3,
4498 scalar_stmt, stmt_info, 0,
4499 vect_epilogue);
4501 else if (reduction_type == EXTRACT_LAST_REDUCTION
4502 || reduction_type == FOLD_LEFT_REDUCTION)
4503 	/* No extra instructions are needed in the epilogue.  */
4505 else
4507 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
4508 tree bitsize =
4509 TYPE_SIZE (TREE_TYPE (gimple_assign_lhs (orig_stmt_info->stmt)));
4510 int element_bitsize = tree_to_uhwi (bitsize);
4511 int nelements = vec_size_in_bits / element_bitsize;
4513 if (code == COND_EXPR)
4514 code = MAX_EXPR;
4516 optab = optab_for_tree_code (code, vectype, optab_default);
4518 /* We have a whole vector shift available. */
4519 if (optab != unknown_optab
4520 && VECTOR_MODE_P (mode)
4521 && optab_handler (optab, mode) != CODE_FOR_nothing
4522 && have_whole_vector_shift (mode))
4524 /* Final reduction via vector shifts and the reduction operator.
4525 Also requires scalar extract. */
4526 epilogue_cost += record_stmt_cost (cost_vec,
4527 exact_log2 (nelements) * 2,
4528 vector_stmt, stmt_info, 0,
4529 vect_epilogue);
4530 epilogue_cost += record_stmt_cost (cost_vec, 1,
4531 vec_to_scalar, stmt_info, 0,
4532 vect_epilogue);
4534 else
4535 /* Use extracts and reduction op for final reduction. For N
4536 elements, we have N extracts and N-1 reduction ops. */
4537 epilogue_cost += record_stmt_cost (cost_vec,
4538 nelements + nelements - 1,
4539 vector_stmt, stmt_info, 0,
4540 vect_epilogue);
4544 if (dump_enabled_p ())
4545 dump_printf (MSG_NOTE,
4546 "vect_model_reduction_cost: inside_cost = %d, "
4547 "prologue_cost = %d, epilogue_cost = %d .\n", inside_cost,
4548 prologue_cost, epilogue_cost);
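
/* For illustration: for a plain sum reduction with an available reduc_fn,
   NCOPIES = 1 and no nesting, the function above records one scalar_to_vec
   in the prologue, one vector_stmt in the body, and one vector_stmt plus one
   vec_to_scalar in the epilogue.  */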
4553 /* Function get_initial_def_for_reduction
4555 Input:
4556 STMT_VINFO - a stmt that performs a reduction operation in the loop.
4557 INIT_VAL - the initial value of the reduction variable
4559 Output:
4560 ADJUSTMENT_DEF - a tree that holds a value to be added to the final result
4561 of the reduction (used for adjusting the epilog - see below).
4562 Return a vector variable, initialized according to the operation that
4563 STMT_VINFO performs. This vector will be used as the initial value
4564 of the vector of partial results.
4566 Option1 (adjust in epilog): Initialize the vector as follows:
4567 add/bit or/xor: [0,0,...,0,0]
4568 mult/bit and: [1,1,...,1,1]
4569 min/max/cond_expr: [init_val,init_val,..,init_val,init_val]
4570 and when necessary (e.g. add/mult case) let the caller know
4571 that it needs to adjust the result by init_val.
4573 Option2: Initialize the vector as follows:
4574 add/bit or/xor: [init_val,0,0,...,0]
4575 mult/bit and: [init_val,1,1,...,1]
4576 min/max/cond_expr: [init_val,init_val,...,init_val]
4577 and no adjustments are needed.
4579 For example, for the following code:
4581 s = init_val;
4582 for (i=0;i<n;i++)
4583 s = s + a[i];
4585 STMT_VINFO is 's = s + a[i]', and the reduction variable is 's'.
4586 For a vector of 4 units, we want to return either [0,0,0,init_val],
4587 or [0,0,0,0] and let the caller know that it needs to adjust
4588 the result at the end by 'init_val'.
4590    FORNOW, we use the 'adjust in epilog' scheme (Option1) when ADJUSTMENT_DEF
4591    is not NULL, because this way the initialization vector is simpler (the
4592    same element in all entries), and Option2 otherwise.
4594 A cost model should help decide between these two schemes. */
4596 static tree
4597 get_initial_def_for_reduction (loop_vec_info loop_vinfo,
4598 stmt_vec_info stmt_vinfo,
4599 enum tree_code code, tree init_val,
4600 tree *adjustment_def)
4602 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4603 tree scalar_type = TREE_TYPE (init_val);
4604 tree vectype = get_vectype_for_scalar_type (loop_vinfo, scalar_type);
4605 tree def_for_init;
4606 tree init_def;
4607 REAL_VALUE_TYPE real_init_val = dconst0;
4608 int int_init_val = 0;
4609 gimple_seq stmts = NULL;
4611 gcc_assert (vectype);
4613 gcc_assert (POINTER_TYPE_P (scalar_type) || INTEGRAL_TYPE_P (scalar_type)
4614 || SCALAR_FLOAT_TYPE_P (scalar_type));
4616 gcc_assert (nested_in_vect_loop_p (loop, stmt_vinfo)
4617 || loop == (gimple_bb (stmt_vinfo->stmt))->loop_father);
4619 /* ADJUSTMENT_DEF is NULL when called from
4620 vect_create_epilog_for_reduction to vectorize double reduction. */
4621 if (adjustment_def)
4622 *adjustment_def = NULL;
4624 switch (code)
4626 case WIDEN_SUM_EXPR:
4627 case DOT_PROD_EXPR:
4628 case SAD_EXPR:
4629 case PLUS_EXPR:
4630 case MINUS_EXPR:
4631 case BIT_IOR_EXPR:
4632 case BIT_XOR_EXPR:
4633 case MULT_EXPR:
4634 case BIT_AND_EXPR:
4636 if (code == MULT_EXPR)
4638 real_init_val = dconst1;
4639 int_init_val = 1;
4642 if (code == BIT_AND_EXPR)
4643 int_init_val = -1;
4645 if (SCALAR_FLOAT_TYPE_P (scalar_type))
4646 def_for_init = build_real (scalar_type, real_init_val);
4647 else
4648 def_for_init = build_int_cst (scalar_type, int_init_val);
4650 if (adjustment_def || operand_equal_p (def_for_init, init_val, 0))
4652 /* Option1: the first element is '0' or '1' as well. */
4653 if (!operand_equal_p (def_for_init, init_val, 0))
4654 *adjustment_def = init_val;
4655 init_def = gimple_build_vector_from_val (&stmts, vectype,
4656 def_for_init);
4658 else if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant ())
4660 /* Option2 (variable length): the first element is INIT_VAL. */
4661 init_def = gimple_build_vector_from_val (&stmts, vectype,
4662 def_for_init);
4663 init_def = gimple_build (&stmts, CFN_VEC_SHL_INSERT,
4664 vectype, init_def, init_val);
4666 else
4668 /* Option2: the first element is INIT_VAL. */
4669 tree_vector_builder elts (vectype, 1, 2);
4670 elts.quick_push (init_val);
4671 elts.quick_push (def_for_init);
4672 init_def = gimple_build_vector (&stmts, &elts);
4675 break;
4677 case MIN_EXPR:
4678 case MAX_EXPR:
4679 case COND_EXPR:
4681 init_val = gimple_convert (&stmts, TREE_TYPE (vectype), init_val);
4682 init_def = gimple_build_vector_from_val (&stmts, vectype, init_val);
4684 break;
4686 default:
4687 gcc_unreachable ();
4690 if (stmts)
4691 gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), stmts);
4692 return init_def;
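
/* For illustration: for s += a[i] with init_val = 5 and a 4-lane vector,
   Option1 returns {0, 0, 0, 0} with *ADJUSTMENT_DEF set to 5, while Option2
   returns {5, 0, 0, 0}.  */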
4695 /* Get at the initial defs for the reduction PHIs in SLP_NODE.
4696 NUMBER_OF_VECTORS is the number of vector defs to create.
4697 If NEUTRAL_OP is nonnull, introducing extra elements of that
4698 value will not change the result. */
4700 static void
4701 get_initial_defs_for_reduction (vec_info *vinfo,
4702 slp_tree slp_node,
4703 vec<tree> *vec_oprnds,
4704 unsigned int number_of_vectors,
4705 bool reduc_chain, tree neutral_op)
4707 vec<stmt_vec_info> stmts = SLP_TREE_SCALAR_STMTS (slp_node);
4708 stmt_vec_info stmt_vinfo = stmts[0];
4709 unsigned HOST_WIDE_INT nunits;
4710 unsigned j, number_of_places_left_in_vector;
4711 tree vector_type;
4712 unsigned int group_size = stmts.length ();
4713 unsigned int i;
4714 class loop *loop;
4716 vector_type = STMT_VINFO_VECTYPE (stmt_vinfo);
4718 gcc_assert (STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_reduction_def);
4720 loop = (gimple_bb (stmt_vinfo->stmt))->loop_father;
4721 gcc_assert (loop);
4722 edge pe = loop_preheader_edge (loop);
4724 gcc_assert (!reduc_chain || neutral_op);
4726 /* NUMBER_OF_COPIES is the number of times we need to use the same values in
4727 created vectors. It is greater than 1 if unrolling is performed.
4729 For example, we have two scalar operands, s1 and s2 (e.g., group of
4730 strided accesses of size two), while NUNITS is four (i.e., four scalars
4731 of this type can be packed in a vector). The output vector will contain
4732 two copies of each scalar operand: {s1, s2, s1, s2}. (NUMBER_OF_COPIES
4733 will be 2).
4735 If REDUC_GROUP_SIZE > NUNITS, the scalars will be split into several
4736 vectors containing the operands.
4738 For example, NUNITS is four as before, and the group size is 8
4739 (s1, s2, ..., s8). We will create two vectors {s1, s2, s3, s4} and
4740 {s5, s6, s7, s8}. */
4742 if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits))
4743 nunits = group_size;
4745 number_of_places_left_in_vector = nunits;
4746 bool constant_p = true;
4747 tree_vector_builder elts (vector_type, nunits, 1);
4748 elts.quick_grow (nunits);
4749 gimple_seq ctor_seq = NULL;
4750 for (j = 0; j < nunits * number_of_vectors; ++j)
4752 tree op;
4753 i = j % group_size;
4754 stmt_vinfo = stmts[i];
4756       /* Get the def before the loop.  In a reduction chain we have only
4757 	 one initial value.  Otherwise we have as many as there are PHIs in the group.  */
4758 if (reduc_chain)
4759 op = j != 0 ? neutral_op : PHI_ARG_DEF_FROM_EDGE (stmt_vinfo->stmt, pe);
4760 else if (((vec_oprnds->length () + 1) * nunits
4761 - number_of_places_left_in_vector >= group_size)
4762 && neutral_op)
4763 op = neutral_op;
4764 else
4765 op = PHI_ARG_DEF_FROM_EDGE (stmt_vinfo->stmt, pe);
4767 /* Create 'vect_ = {op0,op1,...,opn}'. */
4768 number_of_places_left_in_vector--;
4769 elts[nunits - number_of_places_left_in_vector - 1] = op;
4770 if (!CONSTANT_CLASS_P (op))
4771 constant_p = false;
4773 if (number_of_places_left_in_vector == 0)
4775 tree init;
4776 if (constant_p && !neutral_op
4777 ? multiple_p (TYPE_VECTOR_SUBPARTS (vector_type), nunits)
4778 : known_eq (TYPE_VECTOR_SUBPARTS (vector_type), nunits))
4779 /* Build the vector directly from ELTS. */
4780 init = gimple_build_vector (&ctor_seq, &elts);
4781 else if (neutral_op)
4783 /* Build a vector of the neutral value and shift the
4784 other elements into place. */
4785 init = gimple_build_vector_from_val (&ctor_seq, vector_type,
4786 neutral_op);
4787 int k = nunits;
4788 while (k > 0 && elts[k - 1] == neutral_op)
4789 k -= 1;
4790 while (k > 0)
4792 k -= 1;
4793 init = gimple_build (&ctor_seq, CFN_VEC_SHL_INSERT,
4794 vector_type, init, elts[k]);
4797 else
4799 /* First time round, duplicate ELTS to fill the
4800 required number of vectors. */
4801 duplicate_and_interleave (vinfo, &ctor_seq, vector_type, elts,
4802 number_of_vectors, *vec_oprnds);
4803 break;
4805 vec_oprnds->quick_push (init);
4807 number_of_places_left_in_vector = nunits;
4808 elts.new_vector (vector_type, nunits, 1);
4809 elts.quick_grow (nunits);
4810 constant_p = true;
4813 if (ctor_seq != NULL)
4814 gsi_insert_seq_on_edge_immediate (pe, ctor_seq);
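
/* For illustration (hypothetical SLP group, not a reduction chain): for two
   scalars s1 and s2 with a 4-lane vector type and neutral element 0, the
   first initial vector built above is {s1_init, s2_init, 0, 0} and any
   further vectors are all-neutral.  */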
4817 /* For a statement STMT_INFO taking part in a reduction operation return
4818 the stmt_vec_info the meta information is stored on. */
4820 stmt_vec_info
4821 info_for_reduction (vec_info *vinfo, stmt_vec_info stmt_info)
4823 stmt_info = vect_orig_stmt (stmt_info);
4824 gcc_assert (STMT_VINFO_REDUC_DEF (stmt_info));
4825 if (!is_a <gphi *> (stmt_info->stmt)
4826 || !VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
4827 stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
4828 gphi *phi = as_a <gphi *> (stmt_info->stmt);
4829 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
4831 if (gimple_phi_num_args (phi) == 1)
4832 stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
4834 else if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
4836 edge pe = loop_preheader_edge (gimple_bb (phi)->loop_father);
4837 stmt_vec_info info
4838 = vinfo->lookup_def (PHI_ARG_DEF_FROM_EDGE (phi, pe));
4839 if (info && STMT_VINFO_DEF_TYPE (info) == vect_double_reduction_def)
4840 stmt_info = info;
4842 return stmt_info;
4845 /* Function vect_create_epilog_for_reduction
4847 Create code at the loop-epilog to finalize the result of a reduction
4848 computation.
4850 STMT_INFO is the scalar reduction stmt that is being vectorized.
4851 SLP_NODE is an SLP node containing a group of reduction statements. The
4852 first one in this group is STMT_INFO.
4853    SLP_NODE_INSTANCE is the SLP node instance containing SLP_NODE.
4854    REDUC_INDEX says which rhs operand of the STMT_INFO is the reduction phi
4855    (counting from 0).
4857 This function:
4858 1. Completes the reduction def-use cycles.
4859 2. "Reduces" each vector of partial results VECT_DEFS into a single result,
4860 by calling the function specified by REDUC_FN if available, or by
4861 other means (whole-vector shifts or a scalar loop).
4862 The function also creates a new phi node at the loop exit to preserve
4863 loop-closed form, as illustrated below.
4865 The flow at the entry to this function:
4867 loop:
4868 vec_def = phi <vec_init, null> # REDUCTION_PHI
4869 VECT_DEF = vector_stmt # vectorized form of STMT_INFO
4870 s_loop = scalar_stmt # (scalar) STMT_INFO
4871 loop_exit:
4872 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
4873 use <s_out0>
4874 use <s_out0>
4876 The above is transformed by this function into:
4878 loop:
4879 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
4880 VECT_DEF = vector_stmt # vectorized form of STMT_INFO
4881 s_loop = scalar_stmt # (scalar) STMT_INFO
4882 loop_exit:
4883 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
4884 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
4885 v_out2 = reduce <v_out1>
4886 s_out3 = extract_field <v_out2, 0>
4887 s_out4 = adjust_result <s_out3>
4888 use <s_out4>
4889 use <s_out4>
4892 static void
4893 vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
4894 stmt_vec_info stmt_info,
4895 slp_tree slp_node,
4896 slp_instance slp_node_instance)
4898 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
4899 gcc_assert (reduc_info->is_reduc_info);
4900 /* For double reductions we need to get at the inner loop reduction
4901 stmt which has the meta info attached. Our stmt_info is that of the
4902 loop-closed PHI of the inner loop which we remember as
4903 def for the reduction PHI generation. */
4904 bool double_reduc = false;
4905 stmt_vec_info rdef_info = stmt_info;
4906 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
4908 gcc_assert (!slp_node);
4909 double_reduc = true;
4910 stmt_info = loop_vinfo->lookup_def (gimple_phi_arg_def
4911 (stmt_info->stmt, 0));
4912 stmt_info = vect_stmt_to_vectorize (stmt_info);
4914 gphi *reduc_def_stmt
4915 = as_a <gphi *> (STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info))->stmt);
4916 enum tree_code code = STMT_VINFO_REDUC_CODE (reduc_info);
4917 internal_fn reduc_fn = STMT_VINFO_REDUC_FN (reduc_info);
4918 tree vectype;
4919 machine_mode mode;
4920 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *outer_loop = NULL;
4921 basic_block exit_bb;
4922 tree scalar_dest;
4923 tree scalar_type;
4924 gimple *new_phi = NULL, *phi;
4925 gimple_stmt_iterator exit_gsi;
4926 tree new_temp = NULL_TREE, new_name, new_scalar_dest;
4927 gimple *epilog_stmt = NULL;
4928 gimple *exit_phi;
4929 tree bitsize;
4930 tree def;
4931 tree orig_name, scalar_result;
4932 imm_use_iterator imm_iter, phi_imm_iter;
4933 use_operand_p use_p, phi_use_p;
4934 gimple *use_stmt;
4935 bool nested_in_vect_loop = false;
4936 auto_vec<gimple *> new_phis;
4937 int j, i;
4938 auto_vec<tree> scalar_results;
4939 unsigned int group_size = 1, k;
4940 auto_vec<gimple *> phis;
4941 bool slp_reduc = false;
4942 bool direct_slp_reduc;
4943 tree new_phi_result;
4944 tree induction_index = NULL_TREE;
4946 if (slp_node)
4947 group_size = SLP_TREE_LANES (slp_node);
4949 if (nested_in_vect_loop_p (loop, stmt_info))
4951 outer_loop = loop;
4952 loop = loop->inner;
4953 nested_in_vect_loop = true;
4954 gcc_assert (!slp_node);
4956 gcc_assert (!nested_in_vect_loop || double_reduc);
4958 vectype = STMT_VINFO_REDUC_VECTYPE (reduc_info);
4959 gcc_assert (vectype);
4960 mode = TYPE_MODE (vectype);
4962 tree initial_def = NULL;
4963 tree induc_val = NULL_TREE;
4964 tree adjustment_def = NULL;
4965 if (slp_node)
4967 else
4969 /* Get at the scalar def before the loop, that defines the initial value
4970 of the reduction variable. */
4971 initial_def = PHI_ARG_DEF_FROM_EDGE (reduc_def_stmt,
4972 loop_preheader_edge (loop));
4973 /* Optimize: for induction condition reduction, if we can't use zero
4974 for induc_val, use initial_def. */
4975 if (STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
4976 induc_val = STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info);
4977 else if (double_reduc)
4979 else if (nested_in_vect_loop)
4981 else
4982 adjustment_def = STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info);
4985 unsigned vec_num;
4986 int ncopies;
4987 if (slp_node)
4989 vec_num = SLP_TREE_VEC_STMTS (slp_node_instance->reduc_phis).length ();
4990 ncopies = 1;
4992 else
4994 stmt_vec_info reduc_info = loop_vinfo->lookup_stmt (reduc_def_stmt);
4995 vec_num = 1;
4996 ncopies = STMT_VINFO_VEC_STMTS (reduc_info).length ();
4999 /* For cond reductions we want to create a new vector (INDEX_COND_EXPR)
5000 which is updated with the current index of the loop for every match of
5001 the original loop's cond_expr (VEC_STMT). This results in a vector
5002 containing the last time the condition passed for that vector lane.
5003 The first match will be a 1 to allow 0 to be used for non-matching
5004 indexes. If there are no matches at all then the vector will be all
5005 zeroes.
5007 PR92772: This algorithm is broken for architectures that support
5008 masked vectors, but do not provide fold_extract_last. */
5009 if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION)
5011 auto_vec<std::pair<tree, bool>, 2> ccompares;
5012 stmt_vec_info cond_info = STMT_VINFO_REDUC_DEF (reduc_info);
5013 cond_info = vect_stmt_to_vectorize (cond_info);
5014 while (cond_info != reduc_info)
5016 if (gimple_assign_rhs_code (cond_info->stmt) == COND_EXPR)
5018 gimple *vec_stmt = STMT_VINFO_VEC_STMTS (cond_info)[0];
5019 gcc_assert (gimple_assign_rhs_code (vec_stmt) == VEC_COND_EXPR);
5020 ccompares.safe_push
5021 (std::make_pair (unshare_expr (gimple_assign_rhs1 (vec_stmt)),
5022 STMT_VINFO_REDUC_IDX (cond_info) == 2));
5024 cond_info
5025 = loop_vinfo->lookup_def (gimple_op (cond_info->stmt,
5026 1 + STMT_VINFO_REDUC_IDX
5027 (cond_info)));
5028 cond_info = vect_stmt_to_vectorize (cond_info);
5030 gcc_assert (ccompares.length () != 0);
5032 tree indx_before_incr, indx_after_incr;
5033 poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype);
5034 int scalar_precision
5035 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (TREE_TYPE (vectype)));
5036 tree cr_index_scalar_type = make_unsigned_type (scalar_precision);
5037 tree cr_index_vector_type = get_related_vectype_for_scalar_type
5038 (TYPE_MODE (vectype), cr_index_scalar_type,
5039 TYPE_VECTOR_SUBPARTS (vectype));
5041 /* First we create a simple vector induction variable which starts
5042 with the values {1,2,3,...} (SERIES_VECT) and increments by the
5043 vector size (STEP). */
5045 /* Create a {1,2,3,...} vector. */
5046 tree series_vect = build_index_vector (cr_index_vector_type, 1, 1);
5048 /* Create a vector of the step value. */
5049 tree step = build_int_cst (cr_index_scalar_type, nunits_out);
5050 tree vec_step = build_vector_from_val (cr_index_vector_type, step);
5052 /* Create an induction variable. */
5053 gimple_stmt_iterator incr_gsi;
5054 bool insert_after;
5055 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
5056 create_iv (series_vect, vec_step, NULL_TREE, loop, &incr_gsi,
5057 insert_after, &indx_before_incr, &indx_after_incr);
5059 /* Next create a new phi node vector (NEW_PHI_TREE) which starts
5060 filled with zeros (VEC_ZERO). */
5062 /* Create a vector of 0s. */
5063 tree zero = build_zero_cst (cr_index_scalar_type);
5064 tree vec_zero = build_vector_from_val (cr_index_vector_type, zero);
5066 /* Create a vector phi node. */
5067 tree new_phi_tree = make_ssa_name (cr_index_vector_type);
5068 new_phi = create_phi_node (new_phi_tree, loop->header);
5069 add_phi_arg (as_a <gphi *> (new_phi), vec_zero,
5070 loop_preheader_edge (loop), UNKNOWN_LOCATION);
5072       /* Now take the condition from the loop's original cond_exprs
5073 	 and produce a new cond_expr (INDEX_COND_EXPR) which for
5074 every match uses values from the induction variable
5075 (INDEX_BEFORE_INCR) otherwise uses values from the phi node
5076 (NEW_PHI_TREE).
5077 Finally, we update the phi (NEW_PHI_TREE) to take the value of
5078 the new cond_expr (INDEX_COND_EXPR). */
5079 gimple_seq stmts = NULL;
5080 for (int i = ccompares.length () - 1; i != -1; --i)
5082 tree ccompare = ccompares[i].first;
5083 if (ccompares[i].second)
5084 new_phi_tree = gimple_build (&stmts, VEC_COND_EXPR,
5085 cr_index_vector_type,
5086 ccompare,
5087 indx_before_incr, new_phi_tree);
5088 else
5089 new_phi_tree = gimple_build (&stmts, VEC_COND_EXPR,
5090 cr_index_vector_type,
5091 ccompare,
5092 new_phi_tree, indx_before_incr);
5094 gsi_insert_seq_before (&incr_gsi, stmts, GSI_SAME_STMT);
5096 /* Update the phi with the vec cond. */
5097 induction_index = new_phi_tree;
5098 add_phi_arg (as_a <gphi *> (new_phi), induction_index,
5099 loop_latch_edge (loop), UNKNOWN_LOCATION);
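      /* For illustration only (lane count and values are made up): with a
         4-lane vector the IV is {1,2,3,4} in the first iteration and
         {5,6,7,8} in the second.  If the condition matched in lanes 1 and 3
         of the first iteration and in lane 0 of the second, INDUCTION_INDEX
         ends up as {5,2,0,4}: each lane holds the 1-based position of the
         last scalar element that matched in that lane, or 0 if that lane
         never matched.  */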
5102 /* 2. Create epilog code.
5103 The reduction epilog code operates across the elements of the vector
5104 of partial results computed by the vectorized loop.
5105 The reduction epilog code consists of:
5107 step 1: compute the scalar result in a vector (v_out2)
5108 step 2: extract the scalar result (s_out3) from the vector (v_out2)
5109 step 3: adjust the scalar result (s_out3) if needed.
5111 Step 1 can be accomplished using one of the following three schemes:
5112 (scheme 1) using reduc_fn, if available.
5113 (scheme 2) using whole-vector shifts, if available.
5114 (scheme 3) using a scalar loop. In this case steps 1+2 above are
5115 combined.
5117 The overall epilog code looks like this:
5119 s_out0 = phi <s_loop> # original EXIT_PHI
5120 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
5121 v_out2 = reduce <v_out1> # step 1
5122 s_out3 = extract_field <v_out2, 0> # step 2
5123 s_out4 = adjust_result <s_out3> # step 3
5125 (step 3 is optional, and steps 1 and 2 may be combined).
5126 Lastly, the uses of s_out0 are replaced by s_out4. */
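   /* A minimal illustration (with made-up values): for a PLUS reduction
      whose initial value 5 is folded back in via step 3, a partial-result
      vector v_out1 = {3, 1, 4, 1} gives

        v_out2 = reduce <v_out1>            # step 1: {3,1,4,1} -> 9
        s_out3 = extract_field <v_out2, 0>  # step 2: 9
        s_out4 = adjust_result <s_out3>     # step 3: 9 + 5 = 14

      and every use of s_out0 then sees 14.  */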
5129 /* 2.1 Create new loop-exit-phis to preserve loop-closed form:
5130 v_out1 = phi <VECT_DEF>
5131 Store them in NEW_PHIS. */
5132 if (double_reduc)
5133 loop = outer_loop;
5134 exit_bb = single_exit (loop)->dest;
5135 new_phis.create (slp_node ? vec_num : ncopies);
5136 for (unsigned i = 0; i < vec_num; i++)
5138 if (slp_node)
5139 def = vect_get_slp_vect_def (slp_node, i);
5140 else
5141 def = gimple_get_lhs (STMT_VINFO_VEC_STMTS (rdef_info)[0]);
5142 for (j = 0; j < ncopies; j++)
5144 tree new_def = copy_ssa_name (def);
5145 phi = create_phi_node (new_def, exit_bb);
5146 if (j == 0)
5147 new_phis.quick_push (phi);
5148 else
5150 def = gimple_get_lhs (STMT_VINFO_VEC_STMTS (rdef_info)[j]);
5151 new_phis.quick_push (phi);
5154 SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, def);
5158 exit_gsi = gsi_after_labels (exit_bb);
5160 /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
5161 (i.e. when reduc_fn is not available) and in the final adjustment
5162 code (if needed). Also get the original scalar reduction variable as
5163 defined in the loop. In case STMT is a "pattern-stmt" (i.e. - it
5164 represents a reduction pattern), the tree-code and scalar-def are
5165 taken from the original stmt that the pattern-stmt (STMT) replaces.
5166 Otherwise (it is a regular reduction) - the tree-code and scalar-def
5167 are taken from STMT. */
5169 stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
5170 if (orig_stmt_info != stmt_info)
5172 /* Reduction pattern */
5173 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
5174 gcc_assert (STMT_VINFO_RELATED_STMT (orig_stmt_info) == stmt_info);
5177 scalar_dest = gimple_assign_lhs (orig_stmt_info->stmt);
5178 scalar_type = TREE_TYPE (scalar_dest);
5179 scalar_results.create (group_size);
5180 new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
5181 bitsize = TYPE_SIZE (scalar_type);
5183 /* SLP reduction without reduction chain, e.g.,
5184 # a1 = phi <a2, a0>
5185 # b1 = phi <b2, b0>
5186 a2 = operation (a1)
5187 b2 = operation (b1) */
5188 slp_reduc = (slp_node && !REDUC_GROUP_FIRST_ELEMENT (stmt_info));
5190 /* True if we should implement SLP_REDUC using native reduction operations
5191 instead of scalar operations. */
5192 direct_slp_reduc = (reduc_fn != IFN_LAST
5193 && slp_reduc
5194 && !TYPE_VECTOR_SUBPARTS (vectype).is_constant ());
5196 /* In case of reduction chain, e.g.,
5197 # a1 = phi <a3, a0>
5198 a2 = operation (a1)
5199 a3 = operation (a2),
5201 we may end up with more than one vector result. Here we reduce them to
5202 one vector. */
5203 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info) || direct_slp_reduc)
5205 gimple_seq stmts = NULL;
5206 tree first_vect = PHI_RESULT (new_phis[0]);
5207 first_vect = gimple_convert (&stmts, vectype, first_vect);
5208 for (k = 1; k < new_phis.length (); k++)
5210 gimple *next_phi = new_phis[k];
5211 tree second_vect = PHI_RESULT (next_phi);
5212 second_vect = gimple_convert (&stmts, vectype, second_vect);
5213 first_vect = gimple_build (&stmts, code, vectype,
5214 first_vect, second_vect);
5216 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5218 new_phi_result = first_vect;
5219 new_phis.truncate (0);
5220 new_phis.safe_push (SSA_NAME_DEF_STMT (first_vect));
5222 /* Likewise if we couldn't use a single defuse cycle. */
5223 else if (ncopies > 1)
5225 gimple_seq stmts = NULL;
5226 tree first_vect = PHI_RESULT (new_phis[0]);
5227 first_vect = gimple_convert (&stmts, vectype, first_vect);
5228 for (int k = 1; k < ncopies; ++k)
5230 tree second_vect = PHI_RESULT (new_phis[k]);
5231 second_vect = gimple_convert (&stmts, vectype, second_vect);
5232 first_vect = gimple_build (&stmts, code, vectype,
5233 first_vect, second_vect);
5235 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5236 new_phi_result = first_vect;
5237 new_phis.truncate (0);
5238 new_phis.safe_push (SSA_NAME_DEF_STMT (first_vect));
5240 else
5241 new_phi_result = PHI_RESULT (new_phis[0]);
5243 if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION
5244 && reduc_fn != IFN_LAST)
5246 /* For condition reductions, we have a vector (NEW_PHI_RESULT) containing
5247 various data values where the condition matched and another vector
5248 (INDUCTION_INDEX) containing all the indexes of those matches. We
5249 need to extract the last matching index (which will be the index with
5250 highest value) and use this to index into the data vector.
5251 For the case where there were no matches, the data vector will contain
5252 all default values and the index vector will be all zeros. */
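      /* Worked example (hypothetical 4-lane values): with
           NEW_PHI_RESULT  = {d0, d1, d2, d3}
           INDUCTION_INDEX = {0,  2,  0,  4}
         the IFN_REDUC_MAX below gives 4, the comparison of {4,4,4,4}
         against INDUCTION_INDEX selects only lane 3, the VEC_COND_EXPR
         therefore yields {0, 0, 0, d3}, and the final IFN_REDUC_MAX on the
         unsigned-punned values extracts d3, the data of the last match.  */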
5254 /* Get various versions of the type of the vector of indexes. */
5255 tree index_vec_type = TREE_TYPE (induction_index);
5256 gcc_checking_assert (TYPE_UNSIGNED (index_vec_type));
5257 tree index_scalar_type = TREE_TYPE (index_vec_type);
5258 tree index_vec_cmp_type = truth_type_for (index_vec_type);
5260 /* Get an unsigned integer version of the type of the data vector. */
5261 int scalar_precision
5262 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
5263 tree scalar_type_unsigned = make_unsigned_type (scalar_precision);
5264 tree vectype_unsigned = build_vector_type
5265 (scalar_type_unsigned, TYPE_VECTOR_SUBPARTS (vectype));
5267 /* First we need to create a vector (ZERO_VEC) of zeros and another
5268 vector (MAX_INDEX_VEC) filled with the last matching index, which we
5269 can create using a MAX reduction and then expanding.
5270 In the case where the loop never made any matches, the max index will
5271 be zero. */
5273 /* Vector of {0, 0, 0,...}. */
5274 tree zero_vec = build_zero_cst (vectype);
5276 gimple_seq stmts = NULL;
5277 new_phi_result = gimple_convert (&stmts, vectype, new_phi_result);
5278 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5280 /* Find maximum value from the vector of found indexes. */
5281 tree max_index = make_ssa_name (index_scalar_type);
5282 gcall *max_index_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
5283 1, induction_index);
5284 gimple_call_set_lhs (max_index_stmt, max_index);
5285 gsi_insert_before (&exit_gsi, max_index_stmt, GSI_SAME_STMT);
5287 /* Vector of {max_index, max_index, max_index,...}. */
5288 tree max_index_vec = make_ssa_name (index_vec_type);
5289 tree max_index_vec_rhs = build_vector_from_val (index_vec_type,
5290 max_index);
5291 gimple *max_index_vec_stmt = gimple_build_assign (max_index_vec,
5292 max_index_vec_rhs);
5293 gsi_insert_before (&exit_gsi, max_index_vec_stmt, GSI_SAME_STMT);
5295 /* Next we compare the new vector (MAX_INDEX_VEC) full of max indexes
5296 with the vector (INDUCTION_INDEX) of found indexes, choosing values
5297 from the data vector (NEW_PHI_RESULT) for matches, 0 (ZERO_VEC)
5298 otherwise. Only one value should match, resulting in a vector
5299 (VEC_COND) with one data value and the rest zeros.
5300 In the case where the loop never made any matches, every index will
5301 match, resulting in a vector with all data values (which will all be
5302 the default value). */
5304 /* Compare the max index vector to the vector of found indexes to find
5305 the position of the max value. */
5306 tree vec_compare = make_ssa_name (index_vec_cmp_type);
5307 gimple *vec_compare_stmt = gimple_build_assign (vec_compare, EQ_EXPR,
5308 induction_index,
5309 max_index_vec);
5310 gsi_insert_before (&exit_gsi, vec_compare_stmt, GSI_SAME_STMT);
5312 /* Use the compare to choose either values from the data vector or
5313 zero. */
5314 tree vec_cond = make_ssa_name (vectype);
5315 gimple *vec_cond_stmt = gimple_build_assign (vec_cond, VEC_COND_EXPR,
5316 vec_compare, new_phi_result,
5317 zero_vec);
5318 gsi_insert_before (&exit_gsi, vec_cond_stmt, GSI_SAME_STMT);
5320 /* Finally we need to extract the data value from the vector (VEC_COND)
5321 into a scalar (MATCHED_DATA_REDUC). Logically we want to do an OR
5322 reduction, but because this doesn't exist, we can use a MAX reduction
5323 instead. The data value might be signed or a float so we need to cast
5324 it first.
5325 In the case where the loop never made any matches, the data values are
5326 all identical, and so will reduce down correctly. */
5328 /* Make the matched data values unsigned. */
5329 tree vec_cond_cast = make_ssa_name (vectype_unsigned);
5330 tree vec_cond_cast_rhs = build1 (VIEW_CONVERT_EXPR, vectype_unsigned,
5331 vec_cond);
5332 gimple *vec_cond_cast_stmt = gimple_build_assign (vec_cond_cast,
5333 VIEW_CONVERT_EXPR,
5334 vec_cond_cast_rhs);
5335 gsi_insert_before (&exit_gsi, vec_cond_cast_stmt, GSI_SAME_STMT);
5337 /* Reduce down to a scalar value. */
5338 tree data_reduc = make_ssa_name (scalar_type_unsigned);
5339 gcall *data_reduc_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
5340 1, vec_cond_cast);
5341 gimple_call_set_lhs (data_reduc_stmt, data_reduc);
5342 gsi_insert_before (&exit_gsi, data_reduc_stmt, GSI_SAME_STMT);
5344 /* Convert the reduced value back to the result type and set as the
5345 result. */
5346 stmts = NULL;
5347 new_temp = gimple_build (&stmts, VIEW_CONVERT_EXPR, scalar_type,
5348 data_reduc);
5349 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5350 scalar_results.safe_push (new_temp);
5352 else if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION
5353 && reduc_fn == IFN_LAST)
5355 /* Condition reduction without supported IFN_REDUC_MAX. Generate
5356 idx = 0;
5357 idx_val = induction_index[0];
5358 val = data_reduc[0];
5359 for (idx = 0, val = init, i = 0; i < nelts; ++i)
5360 if (induction_index[i] > idx_val)
5361 val = data_reduc[i], idx_val = induction_index[i];
5362 return val; */
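	  /* Since the number of elements is a compile-time constant here,
	     the loop above is emitted fully unrolled, with BIT_FIELD_REFs
	     for the element extractions; roughly, for a hypothetical
	     4-element vector:

	       idx0 = index[0];               val0 = data[0];
	       idx1 = MAX (index[1], idx0);   val1 = index[1] > idx0 ? data[1] : val0;
	       idx2 = MAX (index[2], idx1);   val2 = index[2] > idx1 ? data[2] : val1;
	       (no MAX for the last element)  val3 = index[3] > idx2 ? data[3] : val2;  */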
5364 tree data_eltype = TREE_TYPE (TREE_TYPE (new_phi_result));
5365 tree idx_eltype = TREE_TYPE (TREE_TYPE (induction_index));
5366 unsigned HOST_WIDE_INT el_size = tree_to_uhwi (TYPE_SIZE (idx_eltype));
5367 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (TREE_TYPE (induction_index));
5368 /* Enforced by vectorizable_reduction, which ensures we have target
5369 support before allowing a conditional reduction on variable-length
5370 vectors. */
5371 unsigned HOST_WIDE_INT v_size = el_size * nunits.to_constant ();
5372 tree idx_val = NULL_TREE, val = NULL_TREE;
5373 for (unsigned HOST_WIDE_INT off = 0; off < v_size; off += el_size)
5375 tree old_idx_val = idx_val;
5376 tree old_val = val;
5377 idx_val = make_ssa_name (idx_eltype);
5378 epilog_stmt = gimple_build_assign (idx_val, BIT_FIELD_REF,
5379 build3 (BIT_FIELD_REF, idx_eltype,
5380 induction_index,
5381 bitsize_int (el_size),
5382 bitsize_int (off)));
5383 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5384 val = make_ssa_name (data_eltype);
5385 epilog_stmt = gimple_build_assign (val, BIT_FIELD_REF,
5386 build3 (BIT_FIELD_REF,
5387 data_eltype,
5388 new_phi_result,
5389 bitsize_int (el_size),
5390 bitsize_int (off)));
5391 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5392 if (off != 0)
5394 tree new_idx_val = idx_val;
5395 if (off != v_size - el_size)
5397 new_idx_val = make_ssa_name (idx_eltype);
5398 epilog_stmt = gimple_build_assign (new_idx_val,
5399 MAX_EXPR, idx_val,
5400 old_idx_val);
5401 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5403 tree new_val = make_ssa_name (data_eltype);
5404 epilog_stmt = gimple_build_assign (new_val,
5405 COND_EXPR,
5406 build2 (GT_EXPR,
5407 boolean_type_node,
5408 idx_val,
5409 old_idx_val),
5410 val, old_val);
5411 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5412 idx_val = new_idx_val;
5413 val = new_val;
5416 /* Convert the reduced value back to the result type and set as the
5417 result. */
5418 gimple_seq stmts = NULL;
5419 val = gimple_convert (&stmts, scalar_type, val);
5420 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5421 scalar_results.safe_push (val);
5424 /* 2.3 Create the reduction code, using one of the three schemes described
5425 above. In SLP we simply need to extract all the elements from the
5426 vector (without reducing them), so we use scalar shifts. */
5427 else if (reduc_fn != IFN_LAST && !slp_reduc)
5429 tree tmp;
5430 tree vec_elem_type;
5432 /* Case 1: Create:
5433 v_out2 = reduc_expr <v_out1> */
5435 if (dump_enabled_p ())
5436 dump_printf_loc (MSG_NOTE, vect_location,
5437 "Reduce using direct vector reduction.\n");
5439 gimple_seq stmts = NULL;
5440 new_phi_result = gimple_convert (&stmts, vectype, new_phi_result);
5441 vec_elem_type = TREE_TYPE (TREE_TYPE (new_phi_result));
5442 new_temp = gimple_build (&stmts, as_combined_fn (reduc_fn),
5443 vec_elem_type, new_phi_result);
5444 new_temp = gimple_convert (&stmts, scalar_type, new_temp);
5445 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5447 if ((STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
5448 && induc_val)
5450 /* Earlier we set the initial value to be a vector of induc_val
5451 values. Check the result and if it is induc_val then replace
5452 it with the original initial value, unless induc_val is
5453 the same as initial_def already. */
5454 tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp,
5455 induc_val);
5457 tmp = make_ssa_name (new_scalar_dest);
5458 epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
5459 initial_def, new_temp);
5460 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5461 new_temp = tmp;
5464 scalar_results.safe_push (new_temp);
5466 else if (direct_slp_reduc)
5468 /* Here we create one vector for each of the REDUC_GROUP_SIZE results,
5469 with the elements for other SLP statements replaced with the
5470 neutral value. We can then do a normal reduction on each vector. */
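      /* For illustration, suppose the (variable-length) vector happens to
	 have 4 lanes at runtime and REDUC_GROUP_SIZE is 2.  The masked
	 index vector built below is then {0, 1, 0, 1}, so for SLP
	 statement i == 0 the select keeps lanes 0 and 2 of NEW_PHI_RESULT
	 and replaces lanes 1 and 3 with the neutral value:

	   vec    = {a0, neutral, a2, neutral}
	   scalar = reduce <vec>

	 and analogously for i == 1 with lanes 1 and 3.  */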
5472 /* Enforced by vectorizable_reduction. */
5473 gcc_assert (new_phis.length () == 1);
5474 gcc_assert (pow2p_hwi (group_size));
5476 slp_tree orig_phis_slp_node = slp_node_instance->reduc_phis;
5477 vec<stmt_vec_info> orig_phis
5478 = SLP_TREE_SCALAR_STMTS (orig_phis_slp_node);
5479 gimple_seq seq = NULL;
5481 /* Build a vector {0, 1, 2, ...}, with the same number of elements
5482 and the same element size as VECTYPE. */
5483 tree index = build_index_vector (vectype, 0, 1);
5484 tree index_type = TREE_TYPE (index);
5485 tree index_elt_type = TREE_TYPE (index_type);
5486 tree mask_type = truth_type_for (index_type);
5488 /* Create a vector that, for each element, identifies which of
5489 the REDUC_GROUP_SIZE results should use it. */
5490 tree index_mask = build_int_cst (index_elt_type, group_size - 1);
5491 index = gimple_build (&seq, BIT_AND_EXPR, index_type, index,
5492 build_vector_from_val (index_type, index_mask));
5494 /* Get a neutral vector value. This is simply a splat of the neutral
5495 scalar value if we have one, otherwise the initial scalar value
5496 is itself a neutral value. */
5497 tree vector_identity = NULL_TREE;
5498 tree neutral_op = NULL_TREE;
5499 if (slp_node)
5501 stmt_vec_info first = REDUC_GROUP_FIRST_ELEMENT (stmt_info);
5502 neutral_op
5503 = neutral_op_for_slp_reduction (slp_node_instance->reduc_phis,
5504 vectype, code, first != NULL);
5506 if (neutral_op)
5507 vector_identity = gimple_build_vector_from_val (&seq, vectype,
5508 neutral_op);
5509 for (unsigned int i = 0; i < group_size; ++i)
5511 /* If there's no universal neutral value, we can use the
5512 initial scalar value from the original PHI. This is used
5513 for MIN and MAX reduction, for example. */
5514 if (!neutral_op)
5516 tree scalar_value
5517 = PHI_ARG_DEF_FROM_EDGE (orig_phis[i]->stmt,
5518 loop_preheader_edge (loop));
5519 scalar_value = gimple_convert (&seq, TREE_TYPE (vectype),
5520 scalar_value);
5521 vector_identity = gimple_build_vector_from_val (&seq, vectype,
5522 scalar_value);
5525 /* Calculate the equivalent of:
5527 sel[j] = (index[j] == i);
5529 which selects the elements of NEW_PHI_RESULT that should
5530 be included in the result. */
5531 tree compare_val = build_int_cst (index_elt_type, i);
5532 compare_val = build_vector_from_val (index_type, compare_val);
5533 tree sel = gimple_build (&seq, EQ_EXPR, mask_type,
5534 index, compare_val);
5536 /* Calculate the equivalent of:
5538 vec = sel ? new_phi_result : vector_identity;
5540 VEC is now suitable for a full vector reduction. */
5541 tree vec = gimple_build (&seq, VEC_COND_EXPR, vectype,
5542 sel, new_phi_result, vector_identity);
5544 /* Do the reduction and convert it to the appropriate type. */
5545 tree scalar = gimple_build (&seq, as_combined_fn (reduc_fn),
5546 TREE_TYPE (vectype), vec);
5547 scalar = gimple_convert (&seq, scalar_type, scalar);
5548 scalar_results.safe_push (scalar);
5550 gsi_insert_seq_before (&exit_gsi, seq, GSI_SAME_STMT);
5552 else
5554 bool reduce_with_shift;
5555 tree vec_temp;
5557 gcc_assert (slp_reduc || new_phis.length () == 1);
5559 /* See if the target wants to do the final (shift) reduction
5560 in a vector mode of smaller size and first reduce upper/lower
5561 halves against each other. */
5562 enum machine_mode mode1 = mode;
5563 tree stype = TREE_TYPE (vectype);
5564 unsigned nunits = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
5565 unsigned nunits1 = nunits;
5566 if ((mode1 = targetm.vectorize.split_reduction (mode)) != mode
5567 && new_phis.length () == 1)
5569 nunits1 = GET_MODE_NUNITS (mode1).to_constant ();
5570 /* For SLP reductions we have to make sure lanes match up, but
5571 since we're doing individual element final reduction, reducing
5572 the vector width here is even more important.
5573 ??? We can also separate lanes with permutes; for the common
5574 case of power-of-two group-size, odd/even extracts would work. */
5575 if (slp_reduc && nunits != nunits1)
5577 nunits1 = least_common_multiple (nunits1, group_size);
5578 gcc_assert (exact_log2 (nunits1) != -1 && nunits1 <= nunits);
5581 if (!slp_reduc
5582 && (mode1 = targetm.vectorize.split_reduction (mode)) != mode)
5583 nunits1 = GET_MODE_NUNITS (mode1).to_constant ();
5585 tree vectype1 = get_related_vectype_for_scalar_type (TYPE_MODE (vectype),
5586 stype, nunits1);
5587 reduce_with_shift = have_whole_vector_shift (mode1);
5588 if (!VECTOR_MODE_P (mode1))
5589 reduce_with_shift = false;
5590 else
5592 optab optab = optab_for_tree_code (code, vectype1, optab_default);
5593 if (optab_handler (optab, mode1) == CODE_FOR_nothing)
5594 reduce_with_shift = false;
5597 /* First reduce the vector to the desired vector size on which we
5598 should do the shift reduction, by combining upper and lower halves. */
5599 new_temp = new_phi_result;
5600 while (nunits > nunits1)
5602 nunits /= 2;
5603 vectype1 = get_related_vectype_for_scalar_type (TYPE_MODE (vectype),
5604 stype, nunits);
5605 unsigned int bitsize = tree_to_uhwi (TYPE_SIZE (vectype1));
5607 /* The target has to make sure we support lowpart/highpart
5608 extraction, either via direct vector extract or through
5609 integer mode punning. */
5610 tree dst1, dst2;
5611 if (convert_optab_handler (vec_extract_optab,
5612 TYPE_MODE (TREE_TYPE (new_temp)),
5613 TYPE_MODE (vectype1))
5614 != CODE_FOR_nothing)
5616 /* Extract sub-vectors directly once vec_extract becomes
5617 a conversion optab. */
5618 dst1 = make_ssa_name (vectype1);
5619 epilog_stmt
5620 = gimple_build_assign (dst1, BIT_FIELD_REF,
5621 build3 (BIT_FIELD_REF, vectype1,
5622 new_temp, TYPE_SIZE (vectype1),
5623 bitsize_int (0)));
5624 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5625 dst2 = make_ssa_name (vectype1);
5626 epilog_stmt
5627 = gimple_build_assign (dst2, BIT_FIELD_REF,
5628 build3 (BIT_FIELD_REF, vectype1,
5629 new_temp, TYPE_SIZE (vectype1),
5630 bitsize_int (bitsize)));
5631 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5633 else
5635 /* Extract via punning to an appropriately sized integer mode
5636 vector. */
5637 tree eltype = build_nonstandard_integer_type (bitsize, 1);
5638 tree etype = build_vector_type (eltype, 2);
5639 gcc_assert (convert_optab_handler (vec_extract_optab,
5640 TYPE_MODE (etype),
5641 TYPE_MODE (eltype))
5642 != CODE_FOR_nothing);
5643 tree tem = make_ssa_name (etype);
5644 epilog_stmt = gimple_build_assign (tem, VIEW_CONVERT_EXPR,
5645 build1 (VIEW_CONVERT_EXPR,
5646 etype, new_temp));
5647 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5648 new_temp = tem;
5649 tem = make_ssa_name (eltype);
5650 epilog_stmt
5651 = gimple_build_assign (tem, BIT_FIELD_REF,
5652 build3 (BIT_FIELD_REF, eltype,
5653 new_temp, TYPE_SIZE (eltype),
5654 bitsize_int (0)));
5655 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5656 dst1 = make_ssa_name (vectype1);
5657 epilog_stmt = gimple_build_assign (dst1, VIEW_CONVERT_EXPR,
5658 build1 (VIEW_CONVERT_EXPR,
5659 vectype1, tem));
5660 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5661 tem = make_ssa_name (eltype);
5662 epilog_stmt
5663 = gimple_build_assign (tem, BIT_FIELD_REF,
5664 build3 (BIT_FIELD_REF, eltype,
5665 new_temp, TYPE_SIZE (eltype),
5666 bitsize_int (bitsize)));
5667 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5668 dst2 = make_ssa_name (vectype1);
5669 epilog_stmt = gimple_build_assign (dst2, VIEW_CONVERT_EXPR,
5670 build1 (VIEW_CONVERT_EXPR,
5671 vectype1, tem));
5672 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5675 new_temp = make_ssa_name (vectype1);
5676 epilog_stmt = gimple_build_assign (new_temp, code, dst1, dst2);
5677 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5678 new_phis[0] = epilog_stmt;
5681 if (reduce_with_shift && !slp_reduc)
5683 int element_bitsize = tree_to_uhwi (bitsize);
5684 /* Enforced by vectorizable_reduction, which disallows SLP reductions
5685 for variable-length vectors and also requires direct target support
5686 for loop reductions. */
5687 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
5688 int nelements = vec_size_in_bits / element_bitsize;
5689 vec_perm_builder sel;
5690 vec_perm_indices indices;
5692 int elt_offset;
5694 tree zero_vec = build_zero_cst (vectype1);
5695 /* Case 2: Create:
5696 for (offset = nelements/2; offset >= 1; offset/=2)
5698 Create: va' = vec_shift <va, offset>
5699 Create: va = vop <va, va'>
5700 } */
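	      /* For instance, for a hypothetical 4-element vector and PLUS:

		   va  = {a0, a1, a2, a3}
		   va' = vec_shift <va, 2>  = {a2, a3, 0, 0}
		   va  = va + va'           = {a0+a2, a1+a3, .., ..}
		   va' = vec_shift <va, 1>  = {a1+a3, .., .., 0}
		   va  = va + va'           = {a0+a1+a2+a3, .., .., ..}

		 after which step 2.4 below extracts element 0.  */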
5702 tree rhs;
5704 if (dump_enabled_p ())
5705 dump_printf_loc (MSG_NOTE, vect_location,
5706 "Reduce using vector shifts\n");
5708 gimple_seq stmts = NULL;
5709 new_temp = gimple_convert (&stmts, vectype1, new_temp);
5710 for (elt_offset = nelements / 2;
5711 elt_offset >= 1;
5712 elt_offset /= 2)
5714 calc_vec_perm_mask_for_shift (elt_offset, nelements, &sel);
5715 indices.new_vector (sel, 2, nelements);
5716 tree mask = vect_gen_perm_mask_any (vectype1, indices);
5717 new_name = gimple_build (&stmts, VEC_PERM_EXPR, vectype1,
5718 new_temp, zero_vec, mask);
5719 new_temp = gimple_build (&stmts, code,
5720 vectype1, new_name, new_temp);
5722 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5724 /* 2.4 Extract the final scalar result. Create:
5725 s_out3 = extract_field <v_out2, bitpos> */
5727 if (dump_enabled_p ())
5728 dump_printf_loc (MSG_NOTE, vect_location,
5729 "extract scalar result\n");
5731 rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp,
5732 bitsize, bitsize_zero_node);
5733 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
5734 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5735 gimple_assign_set_lhs (epilog_stmt, new_temp);
5736 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5737 scalar_results.safe_push (new_temp);
5739 else
5741 /* Case 3: Create:
5742 s = extract_field <v_out2, 0>
5743 for (offset = element_size;
5744 offset < vector_size;
5745 offset += element_size;)
5747 Create: s' = extract_field <v_out2, offset>
5748 Create: s = op <s, s'> // For non SLP cases
5749 } */
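	      /* For instance, for a single 4-element vector and a PLUS
		 reduction in the non-SLP case this emits, roughly:

		   s  = v_out2[0];
		   s += v_out2[1];
		   s += v_out2[2];
		   s += v_out2[3];

		 where each element access is a BIT_FIELD_REF.  */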
5751 if (dump_enabled_p ())
5752 dump_printf_loc (MSG_NOTE, vect_location,
5753 "Reduce using scalar code.\n");
5755 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
5756 int element_bitsize = tree_to_uhwi (bitsize);
5757 tree compute_type = TREE_TYPE (vectype);
5758 gimple_seq stmts = NULL;
5759 FOR_EACH_VEC_ELT (new_phis, i, new_phi)
5761 int bit_offset;
5762 if (gimple_code (new_phi) == GIMPLE_PHI)
5763 vec_temp = PHI_RESULT (new_phi);
5764 else
5765 vec_temp = gimple_assign_lhs (new_phi);
5766 new_temp = gimple_build (&stmts, BIT_FIELD_REF, compute_type,
5767 vec_temp, bitsize, bitsize_zero_node);
5769 /* In SLP we don't need to apply the reduction operation, so we just
5770 collect s' values in SCALAR_RESULTS. */
5771 if (slp_reduc)
5772 scalar_results.safe_push (new_temp);
5774 for (bit_offset = element_bitsize;
5775 bit_offset < vec_size_in_bits;
5776 bit_offset += element_bitsize)
5778 tree bitpos = bitsize_int (bit_offset);
5779 new_name = gimple_build (&stmts, BIT_FIELD_REF,
5780 compute_type, vec_temp,
5781 bitsize, bitpos);
5782 if (slp_reduc)
5784 /* In SLP we don't need to apply the reduction operation, so
5785 we just collect s' values in SCALAR_RESULTS. */
5786 new_temp = new_name;
5787 scalar_results.safe_push (new_name);
5789 else
5790 new_temp = gimple_build (&stmts, code, compute_type,
5791 new_name, new_temp);
5795 /* The only case where we need to reduce scalar results in SLP is
5796 unrolling. If the size of SCALAR_RESULTS is greater than
5797 REDUC_GROUP_SIZE, we reduce them by combining elements modulo
5798 REDUC_GROUP_SIZE. */
5799 if (slp_reduc)
5801 tree res, first_res, new_res;
5803 /* Reduce multiple scalar results in case of SLP unrolling. */
5804 for (j = group_size; scalar_results.iterate (j, &res);
5805 j++)
5807 first_res = scalar_results[j % group_size];
5808 new_res = gimple_build (&stmts, code, compute_type,
5809 first_res, res);
5810 scalar_results[j % group_size] = new_res;
5812 for (k = 0; k < group_size; k++)
5813 scalar_results[k] = gimple_convert (&stmts, scalar_type,
5814 scalar_results[k]);
5816 else
5818 /* Not SLP - we have one scalar to keep in SCALAR_RESULTS. */
5819 new_temp = gimple_convert (&stmts, scalar_type, new_temp);
5820 scalar_results.safe_push (new_temp);
5823 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5826 if ((STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
5827 && induc_val)
5829 /* Earlier we set the initial value to be a vector of induc_val
5830 values. Check the result and if it is induc_val then replace
5831 it with the original initial value, unless induc_val is
5832 the same as initial_def already. */
5833 tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp,
5834 induc_val);
5836 tree tmp = make_ssa_name (new_scalar_dest);
5837 epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
5838 initial_def, new_temp);
5839 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5840 scalar_results[0] = tmp;
5844 /* 2.5 Adjust the final result by the initial value of the reduction
5845 variable. (When such adjustment is not needed, then
5846 'adjustment_def' is zero). For example, if code is PLUS we create:
5847 new_temp = loop_exit_def + adjustment_def */
5849 if (adjustment_def)
5851 gcc_assert (!slp_reduc);
5852 gimple_seq stmts = NULL;
5853 if (nested_in_vect_loop)
5855 new_phi = new_phis[0];
5856 gcc_assert (VECTOR_TYPE_P (TREE_TYPE (adjustment_def)));
5857 adjustment_def = gimple_convert (&stmts, vectype, adjustment_def);
5858 new_temp = gimple_build (&stmts, code, vectype,
5859 PHI_RESULT (new_phi), adjustment_def);
5861 else
5863 new_temp = scalar_results[0];
5864 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
5865 adjustment_def = gimple_convert (&stmts, scalar_type, adjustment_def);
5866 new_temp = gimple_build (&stmts, code, scalar_type,
5867 new_temp, adjustment_def);
5870 epilog_stmt = gimple_seq_last_stmt (stmts);
5871 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5872 if (nested_in_vect_loop)
5874 if (!double_reduc)
5875 scalar_results.quick_push (new_temp);
5876 else
5877 scalar_results[0] = new_temp;
5879 else
5880 scalar_results[0] = new_temp;
5882 new_phis[0] = epilog_stmt;
5885 if (double_reduc)
5886 loop = loop->inner;
5888 /* 2.6 Handle the loop-exit phis. Replace the uses of scalar loop-exit
5889 phis with new adjusted scalar results, i.e., replace use <s_out0>
5890 with use <s_out4>.
5892 Transform:
5893 loop_exit:
5894 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
5895 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
5896 v_out2 = reduce <v_out1>
5897 s_out3 = extract_field <v_out2, 0>
5898 s_out4 = adjust_result <s_out3>
5899 use <s_out0>
5900 use <s_out0>
5902 into:
5904 loop_exit:
5905 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
5906 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
5907 v_out2 = reduce <v_out1>
5908 s_out3 = extract_field <v_out2, 0>
5909 s_out4 = adjust_result <s_out3>
5910 use <s_out4>
5911 use <s_out4> */
5914 /* In an SLP reduction chain we reduce vector results into one vector if
5915 necessary, hence we set REDUC_GROUP_SIZE to 1 here. SCALAR_DEST is the
5916 LHS of the last stmt in the reduction chain, since we are looking for
5917 the loop exit phi node. */
5918 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
5920 stmt_vec_info dest_stmt_info
5921 = vect_orig_stmt (SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1]);
5922 scalar_dest = gimple_assign_lhs (dest_stmt_info->stmt);
5923 group_size = 1;
5926 /* In SLP we may have several statements in NEW_PHIS and REDUCTION_PHIS (in
5927 case that REDUC_GROUP_SIZE is greater than vectorization factor).
5928 Therefore, we need to match SCALAR_RESULTS with corresponding statements.
5929 The first (REDUC_GROUP_SIZE / number of new vector stmts) scalar results
5930 correspond to the first vector stmt, etc.
5931 (RATIO is equal to (REDUC_GROUP_SIZE / number of new vector stmts)). */
5932 if (group_size > new_phis.length ())
5933 gcc_assert (!(group_size % new_phis.length ()));
5935 for (k = 0; k < group_size; k++)
5937 if (slp_reduc)
5939 stmt_vec_info scalar_stmt_info = SLP_TREE_SCALAR_STMTS (slp_node)[k];
5941 orig_stmt_info = STMT_VINFO_RELATED_STMT (scalar_stmt_info);
5942 /* SLP statements can't participate in patterns. */
5943 gcc_assert (!orig_stmt_info);
5944 scalar_dest = gimple_assign_lhs (scalar_stmt_info->stmt);
5947 if (nested_in_vect_loop)
5949 if (double_reduc)
5950 loop = outer_loop;
5951 else
5952 gcc_unreachable ();
5955 phis.create (3);
5956 /* Find the loop-closed-use at the loop exit of the original scalar
5957 result. (The reduction result is expected to have two immediate uses,
5958 one at the latch block, and one at the loop exit). For double
5959 reductions we are looking for exit phis of the outer loop. */
5960 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
5962 if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
5964 if (!is_gimple_debug (USE_STMT (use_p)))
5965 phis.safe_push (USE_STMT (use_p));
5967 else
5969 if (double_reduc && gimple_code (USE_STMT (use_p)) == GIMPLE_PHI)
5971 tree phi_res = PHI_RESULT (USE_STMT (use_p));
5973 FOR_EACH_IMM_USE_FAST (phi_use_p, phi_imm_iter, phi_res)
5975 if (!flow_bb_inside_loop_p (loop,
5976 gimple_bb (USE_STMT (phi_use_p)))
5977 && !is_gimple_debug (USE_STMT (phi_use_p)))
5978 phis.safe_push (USE_STMT (phi_use_p));
5984 FOR_EACH_VEC_ELT (phis, i, exit_phi)
5986 /* Replace the uses: */
5987 orig_name = PHI_RESULT (exit_phi);
5988 scalar_result = scalar_results[k];
5989 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
5991 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
5992 SET_USE (use_p, scalar_result);
5993 update_stmt (use_stmt);
5997 phis.release ();
6001 /* Return a vector of type VECTYPE that is equal to the vector select
6002 operation "MASK ? VEC : IDENTITY". Insert the select statements
6003 before GSI. */
6005 static tree
6006 merge_with_identity (gimple_stmt_iterator *gsi, tree mask, tree vectype,
6007 tree vec, tree identity)
6009 tree cond = make_temp_ssa_name (vectype, NULL, "cond");
6010 gimple *new_stmt = gimple_build_assign (cond, VEC_COND_EXPR,
6011 mask, vec, identity);
6012 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
6013 return cond;
6016 /* Successively apply CODE to each element of VECTOR_RHS, in left-to-right
6017 order, starting with LHS. Insert the extraction statements before GSI and
6018 associate the new scalar SSA names with variable SCALAR_DEST.
6019 Return the SSA name for the result. */
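/* For instance, for a 4-element VECTOR_RHS this expands roughly to

     lhs = CODE (CODE (CODE (CODE (lhs, v[0]), v[1]), v[2]), v[3])

   built one element at a time, preserving the strict left-to-right
   evaluation order that in-order (fold-left) reductions require.  */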
6021 static tree
6022 vect_expand_fold_left (gimple_stmt_iterator *gsi, tree scalar_dest,
6023 tree_code code, tree lhs, tree vector_rhs)
6025 tree vectype = TREE_TYPE (vector_rhs);
6026 tree scalar_type = TREE_TYPE (vectype);
6027 tree bitsize = TYPE_SIZE (scalar_type);
6028 unsigned HOST_WIDE_INT vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
6029 unsigned HOST_WIDE_INT element_bitsize = tree_to_uhwi (bitsize);
6031 for (unsigned HOST_WIDE_INT bit_offset = 0;
6032 bit_offset < vec_size_in_bits;
6033 bit_offset += element_bitsize)
6035 tree bitpos = bitsize_int (bit_offset);
6036 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vector_rhs,
6037 bitsize, bitpos);
6039 gassign *stmt = gimple_build_assign (scalar_dest, rhs);
6040 rhs = make_ssa_name (scalar_dest, stmt);
6041 gimple_assign_set_lhs (stmt, rhs);
6042 gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
6044 stmt = gimple_build_assign (scalar_dest, code, lhs, rhs);
6045 tree new_name = make_ssa_name (scalar_dest, stmt);
6046 gimple_assign_set_lhs (stmt, new_name);
6047 gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
6048 lhs = new_name;
6050 return lhs;
6053 /* Get a masked internal function equivalent to REDUC_FN. VECTYPE_IN is the
6054 type of the vector input. */
6056 static internal_fn
6057 get_masked_reduction_fn (internal_fn reduc_fn, tree vectype_in)
6059 internal_fn mask_reduc_fn;
6061 switch (reduc_fn)
6063 case IFN_FOLD_LEFT_PLUS:
6064 mask_reduc_fn = IFN_MASK_FOLD_LEFT_PLUS;
6065 break;
6067 default:
6068 return IFN_LAST;
6071 if (direct_internal_fn_supported_p (mask_reduc_fn, vectype_in,
6072 OPTIMIZE_FOR_SPEED))
6073 return mask_reduc_fn;
6074 return IFN_LAST;
6077 /* Perform an in-order reduction (FOLD_LEFT_REDUCTION). STMT_INFO is the
6078 statement that sets the live-out value. REDUC_DEF_STMT is the phi
6079 statement. CODE is the operation performed by STMT_INFO and OPS are
6080 its scalar operands. REDUC_INDEX is the index of the operand in
6081 OPS that is set by REDUC_DEF_STMT. REDUC_FN is the function that
6082 implements in-order reduction, or IFN_LAST if we should open-code it.
6083 VECTYPE_IN is the type of the vector input. MASKS specifies the masks
6084 that should be used to control the operation in a fully-masked loop. */
6086 static bool
6087 vectorize_fold_left_reduction (loop_vec_info loop_vinfo,
6088 stmt_vec_info stmt_info,
6089 gimple_stmt_iterator *gsi,
6090 gimple **vec_stmt, slp_tree slp_node,
6091 gimple *reduc_def_stmt,
6092 tree_code code, internal_fn reduc_fn,
6093 tree ops[3], tree vectype_in,
6094 int reduc_index, vec_loop_masks *masks)
6096 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6097 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
6098 internal_fn mask_reduc_fn = get_masked_reduction_fn (reduc_fn, vectype_in);
6100 int ncopies;
6101 if (slp_node)
6102 ncopies = 1;
6103 else
6104 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
6106 gcc_assert (!nested_in_vect_loop_p (loop, stmt_info));
6107 gcc_assert (ncopies == 1);
6108 gcc_assert (TREE_CODE_LENGTH (code) == binary_op);
6110 if (slp_node)
6111 gcc_assert (known_eq (TYPE_VECTOR_SUBPARTS (vectype_out),
6112 TYPE_VECTOR_SUBPARTS (vectype_in)));
6114 tree op0 = ops[1 - reduc_index];
6116 int group_size = 1;
6117 stmt_vec_info scalar_dest_def_info;
6118 auto_vec<tree> vec_oprnds0;
6119 if (slp_node)
6121 auto_vec<vec<tree> > vec_defs (2);
6122 vect_get_slp_defs (loop_vinfo, slp_node, &vec_defs);
6123 vec_oprnds0.safe_splice (vec_defs[1 - reduc_index]);
6124 vec_defs[0].release ();
6125 vec_defs[1].release ();
6126 group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
6127 scalar_dest_def_info = SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1];
6129 else
6131 vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
6132 op0, &vec_oprnds0);
6133 scalar_dest_def_info = stmt_info;
6136 tree scalar_dest = gimple_assign_lhs (scalar_dest_def_info->stmt);
6137 tree scalar_type = TREE_TYPE (scalar_dest);
6138 tree reduc_var = gimple_phi_result (reduc_def_stmt);
6140 int vec_num = vec_oprnds0.length ();
6141 gcc_assert (vec_num == 1 || slp_node);
6142 tree vec_elem_type = TREE_TYPE (vectype_out);
6143 gcc_checking_assert (useless_type_conversion_p (scalar_type, vec_elem_type));
6145 tree vector_identity = NULL_TREE;
6146 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
6147 vector_identity = build_zero_cst (vectype_out);
6149 tree scalar_dest_var = vect_create_destination_var (scalar_dest, NULL);
6150 int i;
6151 tree def0;
6152 FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
6154 gimple *new_stmt;
6155 tree mask = NULL_TREE;
6156 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
6157 mask = vect_get_loop_mask (gsi, masks, vec_num, vectype_in, i);
6159 /* Handle MINUS by adding the negative. */
6160 if (reduc_fn != IFN_LAST && code == MINUS_EXPR)
6162 tree negated = make_ssa_name (vectype_out);
6163 new_stmt = gimple_build_assign (negated, NEGATE_EXPR, def0);
6164 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
6165 def0 = negated;
6168 if (mask && mask_reduc_fn == IFN_LAST)
6169 def0 = merge_with_identity (gsi, mask, vectype_out, def0,
6170 vector_identity);
6172 /* On the first iteration the input is simply the scalar phi
6173 result, and for subsequent iterations it is the output of
6174 the preceding operation. */
6175 if (reduc_fn != IFN_LAST || (mask && mask_reduc_fn != IFN_LAST))
6177 if (mask && mask_reduc_fn != IFN_LAST)
6178 new_stmt = gimple_build_call_internal (mask_reduc_fn, 3, reduc_var,
6179 def0, mask);
6180 else
6181 new_stmt = gimple_build_call_internal (reduc_fn, 2, reduc_var,
6182 def0);
6183 /* For chained SLP reductions the output of the previous reduction
6184 operation serves as the input of the next. For the final statement
6185 the output cannot be a temporary - we reuse the original
6186 scalar destination of the last statement. */
6187 if (i != vec_num - 1)
6189 gimple_set_lhs (new_stmt, scalar_dest_var);
6190 reduc_var = make_ssa_name (scalar_dest_var, new_stmt);
6191 gimple_set_lhs (new_stmt, reduc_var);
6194 else
6196 reduc_var = vect_expand_fold_left (gsi, scalar_dest_var, code,
6197 reduc_var, def0);
6198 new_stmt = SSA_NAME_DEF_STMT (reduc_var);
6199 /* Remove the statement, so that we can use the same code paths
6200 as for statements that we've just created. */
6201 gimple_stmt_iterator tmp_gsi = gsi_for_stmt (new_stmt);
6202 gsi_remove (&tmp_gsi, true);
6205 if (i == vec_num - 1)
6207 gimple_set_lhs (new_stmt, scalar_dest);
6208 vect_finish_replace_stmt (loop_vinfo,
6209 scalar_dest_def_info,
6210 new_stmt);
6212 else
6213 vect_finish_stmt_generation (loop_vinfo,
6214 scalar_dest_def_info,
6215 new_stmt, gsi);
6217 if (slp_node)
6218 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
6219 else
6221 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
6222 *vec_stmt = new_stmt;
6226 return true;
6229 /* Function is_nonwrapping_integer_induction.
6231 Check if STMT_VINFO (which is part of loop LOOP) both increments and
6232 does not cause overflow. */
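/* For example, with a (wrapping) 8-bit unsigned induction variable whose
   base is 250 and whose step is 1, a loop that may execute 10 times could
   reach 260, which needs more than 8 bits, so the check below would fail;
   with base 0 and the same trip count it would succeed.  */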
6234 static bool
6235 is_nonwrapping_integer_induction (stmt_vec_info stmt_vinfo, class loop *loop)
6237 gphi *phi = as_a <gphi *> (stmt_vinfo->stmt);
6238 tree base = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo);
6239 tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo);
6240 tree lhs_type = TREE_TYPE (gimple_phi_result (phi));
6241 widest_int ni, max_loop_value, lhs_max;
6242 wi::overflow_type overflow = wi::OVF_NONE;
6244 /* Make sure the loop is integer based. */
6245 if (TREE_CODE (base) != INTEGER_CST
6246 || TREE_CODE (step) != INTEGER_CST)
6247 return false;
6249 /* Check that the max size of the loop will not wrap. */
6251 if (TYPE_OVERFLOW_UNDEFINED (lhs_type))
6252 return true;
6254 if (! max_stmt_executions (loop, &ni))
6255 return false;
6257 max_loop_value = wi::mul (wi::to_widest (step), ni, TYPE_SIGN (lhs_type),
6258 &overflow);
6259 if (overflow)
6260 return false;
6262 max_loop_value = wi::add (wi::to_widest (base), max_loop_value,
6263 TYPE_SIGN (lhs_type), &overflow);
6264 if (overflow)
6265 return false;
6267 return (wi::min_precision (max_loop_value, TYPE_SIGN (lhs_type))
6268 <= TYPE_PRECISION (lhs_type));
6271 /* Check if masking can be supported by inserting a conditional expression.
6272 CODE is the code for the operation. COND_FN is the conditional internal
6273 function, if it exists. VECTYPE_IN is the type of the vector input. */
6274 static bool
6275 use_mask_by_cond_expr_p (enum tree_code code, internal_fn cond_fn,
6276 tree vectype_in)
6278 if (cond_fn != IFN_LAST
6279 && direct_internal_fn_supported_p (cond_fn, vectype_in,
6280 OPTIMIZE_FOR_SPEED))
6281 return false;
6283 switch (code)
6285 case DOT_PROD_EXPR:
6286 case SAD_EXPR:
6287 return true;
6289 default:
6290 return false;
6294 /* Insert a conditional expression to enable masked vectorization. CODE is the
6295 code for the operation. VOP is the array of operands. MASK is the loop
6296 mask. GSI is a statement iterator used to place the new conditional
6297 expression. */
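/* The selects built below leave the accumulator unchanged in masked-off
   lanes: for DOT_PROD_EXPR the second operand becomes (MASK ? VOP[1] : 0),
   so the per-lane product is zero, and for SAD_EXPR it becomes
   (MASK ? VOP[1] : VOP[0]), so the per-lane absolute difference is zero.  */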
6298 static void
6299 build_vect_cond_expr (enum tree_code code, tree vop[3], tree mask,
6300 gimple_stmt_iterator *gsi)
6302 switch (code)
6304 case DOT_PROD_EXPR:
6306 tree vectype = TREE_TYPE (vop[1]);
6307 tree zero = build_zero_cst (vectype);
6308 tree masked_op1 = make_temp_ssa_name (vectype, NULL, "masked_op1");
6309 gassign *select = gimple_build_assign (masked_op1, VEC_COND_EXPR,
6310 mask, vop[1], zero);
6311 gsi_insert_before (gsi, select, GSI_SAME_STMT);
6312 vop[1] = masked_op1;
6313 break;
6316 case SAD_EXPR:
6318 tree vectype = TREE_TYPE (vop[1]);
6319 tree masked_op1 = make_temp_ssa_name (vectype, NULL, "masked_op1");
6320 gassign *select = gimple_build_assign (masked_op1, VEC_COND_EXPR,
6321 mask, vop[1], vop[0]);
6322 gsi_insert_before (gsi, select, GSI_SAME_STMT);
6323 vop[1] = masked_op1;
6324 break;
6327 default:
6328 gcc_unreachable ();
6332 /* Function vectorizable_reduction.
6334 Check if STMT_INFO performs a reduction operation that can be vectorized.
6335 If VEC_STMT is also passed, vectorize STMT_INFO: create a vectorized
6336 stmt to replace it, put it in VEC_STMT, and insert it at GSI.
6337 Return true if STMT_INFO is vectorizable in this way.
6339 This function also handles reduction idioms (patterns) that have been
6340 recognized in advance during vect_pattern_recog. In this case, STMT_INFO
6341 may be of this form:
6342 X = pattern_expr (arg0, arg1, ..., X)
6343 and its STMT_VINFO_RELATED_STMT points to the last stmt in the original
6344 sequence that had been detected and replaced by the pattern-stmt
6345 (STMT_INFO).
6347 This function also handles reduction of condition expressions, for example:
6348 for (int i = 0; i < N; i++)
6349 if (a[i] < value)
6350 last = a[i];
6351 This is handled by vectorising the loop and creating an additional vector
6352 containing the loop indexes for which "a[i] < value" was true. In the
6353 function epilogue this is reduced to a single max value and then used to
6354 index into the vector of results.
6356 In some cases of reduction patterns, the type of the reduction variable X is
6357 different than the type of the other arguments of STMT_INFO.
6358 In such cases, the vectype that is used when transforming STMT_INFO into
6359 a vector stmt is different than the vectype that is used to determine the
6360 vectorization factor, because it consists of a different number of elements
6361 than the actual number of elements that are being operated upon in parallel.
6363 For example, consider an accumulation of shorts into an int accumulator.
6364 On some targets it's possible to vectorize this pattern operating on 8
6365 shorts at a time (hence, the vectype for purposes of determining the
6366 vectorization factor should be V8HI); on the other hand, the vectype that
6367 is used to create the vector form is actually V4SI (the type of the result).
6369 Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
6370 indicates what is the actual level of parallelism (V8HI in the example), so
6371 that the right vectorization factor would be derived. This vectype
6372 corresponds to the type of arguments to the reduction stmt, and should *NOT*
6373 be used to create the vectorized stmt. The right vectype for the vectorized
6374 stmt is obtained from the type of the result X:
6375 get_vectype_for_scalar_type (vinfo, TREE_TYPE (X))
6377 This means that, contrary to "regular" reductions (or "regular" stmts in
6378 general), the following equation:
6379 STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (vinfo, TREE_TYPE (X))
6380 does *NOT* necessarily hold for reduction patterns. */
6382 bool
6383 vectorizable_reduction (loop_vec_info loop_vinfo,
6384 stmt_vec_info stmt_info, slp_tree slp_node,
6385 slp_instance slp_node_instance,
6386 stmt_vector_for_cost *cost_vec)
6388 tree scalar_dest;
6389 tree vectype_in = NULL_TREE;
6390 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6391 enum vect_def_type cond_reduc_dt = vect_unknown_def_type;
6392 stmt_vec_info cond_stmt_vinfo = NULL;
6393 tree scalar_type;
6394 int i;
6395 int ncopies;
6396 bool single_defuse_cycle = false;
6397 bool nested_cycle = false;
6398 bool double_reduc = false;
6399 int vec_num;
6400 tree tem;
6401 tree cr_index_scalar_type = NULL_TREE, cr_index_vector_type = NULL_TREE;
6402 tree cond_reduc_val = NULL_TREE;
6404 /* Make sure it was already recognized as a reduction computation. */
6405 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def
6406 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def
6407 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_nested_cycle)
6408 return false;
6410 /* The stmt we store reduction analysis meta on. */
6411 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
6412 reduc_info->is_reduc_info = true;
6414 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
6416 if (is_a <gphi *> (stmt_info->stmt))
6418 if (slp_node)
6420 /* We eventually need to set a vector type on invariant
6421 arguments. */
6422 unsigned j;
6423 slp_tree child;
6424 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), j, child)
6425 if (!vect_maybe_update_slp_op_vectype
6426 (child, SLP_TREE_VECTYPE (slp_node)))
6428 if (dump_enabled_p ())
6429 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6430 "incompatible vector types for "
6431 "invariants\n");
6432 return false;
6435 /* Analysis for double-reduction is done on the outer
6436 loop PHI, nested cycles have no further restrictions. */
6437 STMT_VINFO_TYPE (stmt_info) = cycle_phi_info_type;
6439 else
6440 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
6441 return true;
6444 stmt_vec_info orig_stmt_of_analysis = stmt_info;
6445 stmt_vec_info phi_info = stmt_info;
6446 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
6447 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
6449 if (!is_a <gphi *> (stmt_info->stmt))
6451 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
6452 return true;
6454 if (slp_node)
6456 slp_node_instance->reduc_phis = slp_node;
6457 /* ??? We're leaving slp_node to point to the PHIs; we only
6458 need it to get at the number of vector stmts, which wasn't
6459 yet initialized for the instance root. */
6461 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def)
6462 stmt_info = vect_stmt_to_vectorize (STMT_VINFO_REDUC_DEF (stmt_info));
6463 else /* STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def */
6465 use_operand_p use_p;
6466 gimple *use_stmt;
6467 bool res = single_imm_use (gimple_phi_result (stmt_info->stmt),
6468 &use_p, &use_stmt);
6469 gcc_assert (res);
6470 phi_info = loop_vinfo->lookup_stmt (use_stmt);
6471 stmt_info = vect_stmt_to_vectorize (STMT_VINFO_REDUC_DEF (phi_info));
6475 /* PHIs should not participate in patterns. */
6476 gcc_assert (!STMT_VINFO_RELATED_STMT (phi_info));
6477 gphi *reduc_def_phi = as_a <gphi *> (phi_info->stmt);
6479 /* Verify following REDUC_IDX from the latch def leads us back to the PHI
6480 and compute the reduction chain length. Discover the real
6481 reduction operation stmt on the way (stmt_info and slp_for_stmt_info). */
6482 tree reduc_def
6483 = PHI_ARG_DEF_FROM_EDGE (reduc_def_phi,
6484 loop_latch_edge
6485 (gimple_bb (reduc_def_phi)->loop_father));
6486 unsigned reduc_chain_length = 0;
6487 bool only_slp_reduc_chain = true;
6488 stmt_info = NULL;
6489 slp_tree slp_for_stmt_info = slp_node ? slp_node_instance->root : NULL;
6490 while (reduc_def != PHI_RESULT (reduc_def_phi))
6492 stmt_vec_info def = loop_vinfo->lookup_def (reduc_def);
6493 stmt_vec_info vdef = vect_stmt_to_vectorize (def);
6494 if (STMT_VINFO_REDUC_IDX (vdef) == -1)
6496 if (dump_enabled_p ())
6497 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6498 "reduction chain broken by patterns.\n");
6499 return false;
6501 if (!REDUC_GROUP_FIRST_ELEMENT (vdef))
6502 only_slp_reduc_chain = false;
6503 /* ??? For epilogue generation live members of the chain need
6504 to point back to the PHI via their original stmt for
6505 info_for_reduction to work. */
6506 if (STMT_VINFO_LIVE_P (vdef))
6507 STMT_VINFO_REDUC_DEF (def) = phi_info;
6508 gassign *assign = dyn_cast <gassign *> (vdef->stmt);
6509 if (!assign)
6511 if (dump_enabled_p ())
6512 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6513 "reduction chain includes calls.\n");
6514 return false;
6516 if (CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (assign)))
6518 if (!tree_nop_conversion_p (TREE_TYPE (gimple_assign_lhs (assign)),
6519 TREE_TYPE (gimple_assign_rhs1 (assign))))
6521 if (dump_enabled_p ())
6522 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6523 "conversion in the reduction chain.\n");
6524 return false;
6527 else if (!stmt_info)
6528 /* First non-conversion stmt. */
6529 stmt_info = vdef;
6530 reduc_def = gimple_op (vdef->stmt, 1 + STMT_VINFO_REDUC_IDX (vdef));
6531 reduc_chain_length++;
6532 if (!stmt_info && slp_node)
6533 slp_for_stmt_info = SLP_TREE_CHILDREN (slp_for_stmt_info)[0];
6535 /* PHIs should not participate in patterns. */
6536 gcc_assert (!STMT_VINFO_RELATED_STMT (phi_info));
6538 if (nested_in_vect_loop_p (loop, stmt_info))
6540 loop = loop->inner;
6541 nested_cycle = true;
6544 /* STMT_VINFO_REDUC_DEF doesn't point to the first but the last
6545 element. */
6546 if (slp_node && REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6548 gcc_assert (!REDUC_GROUP_NEXT_ELEMENT (stmt_info));
6549 stmt_info = REDUC_GROUP_FIRST_ELEMENT (stmt_info);
6551 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6552 gcc_assert (slp_node
6553 && REDUC_GROUP_FIRST_ELEMENT (stmt_info) == stmt_info);
6555 /* 1. Is vectorizable reduction? */
6556 /* Not supportable if the reduction variable is used in the loop, unless
6557 it's a reduction chain. */
6558 if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer
6559 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6560 return false;
6562 /* Reductions that are not used even in an enclosing outer-loop
6563 are expected to be "live" (used out of the loop). */
6564 if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope
6565 && !STMT_VINFO_LIVE_P (stmt_info))
6566 return false;
6568 /* 2. Has this been recognized as a reduction pattern?
6570 Check if STMT represents a pattern that has been recognized
6571 in earlier analysis stages. For stmts that represent a pattern,
6572 the STMT_VINFO_RELATED_STMT field records the last stmt in
6573 the original sequence that constitutes the pattern. */
6575 stmt_vec_info orig_stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
6576 if (orig_stmt_info)
6578 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
6579 gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
6582 /* 3. Check the operands of the operation. The first operands are defined
6583 inside the loop body. The last operand is the reduction variable,
6584 which is defined by the loop-header-phi. */
6586 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
6587 STMT_VINFO_REDUC_VECTYPE (reduc_info) = vectype_out;
6588 gassign *stmt = as_a <gassign *> (stmt_info->stmt);
6589 enum tree_code code = gimple_assign_rhs_code (stmt);
6590 bool lane_reduc_code_p
6591 = (code == DOT_PROD_EXPR || code == WIDEN_SUM_EXPR || code == SAD_EXPR);
6592 int op_type = TREE_CODE_LENGTH (code);
6594 scalar_dest = gimple_assign_lhs (stmt);
6595 scalar_type = TREE_TYPE (scalar_dest);
6596 if (!POINTER_TYPE_P (scalar_type) && !INTEGRAL_TYPE_P (scalar_type)
6597 && !SCALAR_FLOAT_TYPE_P (scalar_type))
6598 return false;
6600 /* Do not try to vectorize bit-precision reductions. */
6601 if (!type_has_mode_precision_p (scalar_type))
6602 return false;
6604 /* For lane-reducing ops we're reducing the number of reduction PHIs,
6605 which means the only use of that PHI may be in the lane-reducing operation. */
6606 if (lane_reduc_code_p
6607 && reduc_chain_length != 1
6608 && !only_slp_reduc_chain)
6610 if (dump_enabled_p ())
6611 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6612 "lane-reducing reduction with extra stmts.\n");
6613 return false;
6616 /* All uses but the last are expected to be defined in the loop.
6617 The last use is the reduction variable. In case of nested cycle this
6618 assumption is not true: we use reduc_index to record the index of the
6619 reduction variable. */
6620 slp_tree *slp_op = XALLOCAVEC (slp_tree, op_type);
6621 /* We need to skip an extra operand for COND_EXPRs with embedded
6622 comparison. */
6623 unsigned opno_adjust = 0;
6624 if (code == COND_EXPR
6625 && COMPARISON_CLASS_P (gimple_assign_rhs1 (stmt)))
6626 opno_adjust = 1;
6627 for (i = 0; i < op_type; i++)
6629 /* The condition of COND_EXPR is checked in vectorizable_condition(). */
6630 if (i == 0 && code == COND_EXPR)
6631 continue;
6633 stmt_vec_info def_stmt_info;
6634 enum vect_def_type dt;
6635 tree op;
6636 if (!vect_is_simple_use (loop_vinfo, stmt_info, slp_for_stmt_info,
6637 i + opno_adjust, &op, &slp_op[i], &dt, &tem,
6638 &def_stmt_info))
6640 if (dump_enabled_p ())
6641 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6642 "use not simple.\n");
6643 return false;
6645 if (i == STMT_VINFO_REDUC_IDX (stmt_info))
6646 continue;
6648 /* There should be only one cycle def in the stmt, the one
6649 leading to reduc_def. */
6650 if (VECTORIZABLE_CYCLE_DEF (dt))
6651 return false;
6653 /* To properly compute ncopies we are interested in the widest
6654 non-reduction input type in case we're looking at a widening
6655 accumulation that we later handle in vect_transform_reduction. */
6656 if (lane_reduc_code_p
6657 && tem
6658 && (!vectype_in
6659 || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in)))
6660 < GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (tem))))))
6661 vectype_in = tem;
6663 if (code == COND_EXPR)
6665 /* Record how the non-reduction-def value of COND_EXPR is defined. */
6666 if (dt == vect_constant_def)
6668 cond_reduc_dt = dt;
6669 cond_reduc_val = op;
6671 if (dt == vect_induction_def
6672 && def_stmt_info
6673 && is_nonwrapping_integer_induction (def_stmt_info, loop))
6675 cond_reduc_dt = dt;
6676 cond_stmt_vinfo = def_stmt_info;
6680 if (!vectype_in)
6681 vectype_in = STMT_VINFO_VECTYPE (phi_info);
6682 STMT_VINFO_REDUC_VECTYPE_IN (reduc_info) = vectype_in;
6684 enum vect_reduction_type v_reduc_type = STMT_VINFO_REDUC_TYPE (phi_info);
6685 STMT_VINFO_REDUC_TYPE (reduc_info) = v_reduc_type;
6686 /* If we have a condition reduction, see if we can simplify it further. */
6687 if (v_reduc_type == COND_REDUCTION)
6689 if (slp_node)
6690 return false;
6692 /* When the condition itself uses the reduction value, fail. */
6693 if (STMT_VINFO_REDUC_IDX (stmt_info) == 0)
6695 if (dump_enabled_p ())
6696 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6697 "condition depends on previous iteration\n");
6698 return false;
6701 if (reduc_chain_length == 1
6702 && direct_internal_fn_supported_p (IFN_FOLD_EXTRACT_LAST,
6703 vectype_in, OPTIMIZE_FOR_SPEED))
6705 if (dump_enabled_p ())
6706 dump_printf_loc (MSG_NOTE, vect_location,
6707 "optimizing condition reduction with"
6708 " FOLD_EXTRACT_LAST.\n");
6709 STMT_VINFO_REDUC_TYPE (reduc_info) = EXTRACT_LAST_REDUCTION;
6711 else if (cond_reduc_dt == vect_induction_def)
6713 tree base
6714 = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (cond_stmt_vinfo);
6715 tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (cond_stmt_vinfo);
6717 gcc_assert (TREE_CODE (base) == INTEGER_CST
6718 && TREE_CODE (step) == INTEGER_CST);
6719 cond_reduc_val = NULL_TREE;
6720 enum tree_code cond_reduc_op_code = ERROR_MARK;
6721 tree res = PHI_RESULT (STMT_VINFO_STMT (cond_stmt_vinfo));
6722 if (!types_compatible_p (TREE_TYPE (res), TREE_TYPE (base)))
6724 /* Find a suitable value, for MAX_EXPR below base, for MIN_EXPR
6725 above base; punt if base is the minimum value of the type for
6726 MAX_EXPR or maximum value of the type for MIN_EXPR for now. */
6727 else if (tree_int_cst_sgn (step) == -1)
6729 cond_reduc_op_code = MIN_EXPR;
6730 if (tree_int_cst_sgn (base) == -1)
6731 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
6732 else if (tree_int_cst_lt (base,
6733 TYPE_MAX_VALUE (TREE_TYPE (base))))
6734 cond_reduc_val
6735 = int_const_binop (PLUS_EXPR, base, integer_one_node);
6737 else
6739 cond_reduc_op_code = MAX_EXPR;
6740 if (tree_int_cst_sgn (base) == 1)
6741 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
6742 else if (tree_int_cst_lt (TYPE_MIN_VALUE (TREE_TYPE (base)),
6743 base))
6744 cond_reduc_val
6745 = int_const_binop (MINUS_EXPR, base, integer_one_node);
6747 if (cond_reduc_val)
6749 if (dump_enabled_p ())
6750 dump_printf_loc (MSG_NOTE, vect_location,
6751 "condition expression based on "
6752 "integer induction.\n");
6753 STMT_VINFO_REDUC_CODE (reduc_info) = cond_reduc_op_code;
6754 STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info)
6755 = cond_reduc_val;
6756 STMT_VINFO_REDUC_TYPE (reduc_info) = INTEGER_INDUC_COND_REDUCTION;
6759 else if (cond_reduc_dt == vect_constant_def)
6761 enum vect_def_type cond_initial_dt;
6762 tree cond_initial_val
6763 = PHI_ARG_DEF_FROM_EDGE (reduc_def_phi, loop_preheader_edge (loop));
6765 gcc_assert (cond_reduc_val != NULL_TREE);
6766 vect_is_simple_use (cond_initial_val, loop_vinfo, &cond_initial_dt);
6767 if (cond_initial_dt == vect_constant_def
6768 && types_compatible_p (TREE_TYPE (cond_initial_val),
6769 TREE_TYPE (cond_reduc_val)))
6771 tree e = fold_binary (LE_EXPR, boolean_type_node,
6772 cond_initial_val, cond_reduc_val);
6773 if (e && (integer_onep (e) || integer_zerop (e)))
6775 if (dump_enabled_p ())
6776 dump_printf_loc (MSG_NOTE, vect_location,
6777 "condition expression based on "
6778 "compile time constant.\n");
6779 /* Record reduction code at analysis stage. */
6780 STMT_VINFO_REDUC_CODE (reduc_info)
6781 = integer_onep (e) ? MAX_EXPR : MIN_EXPR;
6782 STMT_VINFO_REDUC_TYPE (reduc_info) = CONST_COND_REDUCTION;
6788 if (STMT_VINFO_LIVE_P (phi_info))
6789 return false;
6791 if (slp_node)
6792 ncopies = 1;
6793 else
6794 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
6796 gcc_assert (ncopies >= 1);
6798 poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
6800 if (nested_cycle)
6802 gcc_assert (STMT_VINFO_DEF_TYPE (reduc_info)
6803 == vect_double_reduction_def);
6804 double_reduc = true;
6807 /* 4.2. Check support for the epilog operation.
6809 If STMT represents a reduction pattern, then the type of the
6810 reduction variable may be different than the type of the rest
6811 of the arguments. For example, consider the case of accumulation
6812 of shorts into an int accumulator; The original code:
6813 S1: int_a = (int) short_a;
6814 orig_stmt-> S2: int_acc = plus <int_a ,int_acc>;
6816 was replaced with:
6817 STMT: int_acc = widen_sum <short_a, int_acc>
6819 This means that:
6820 1. The tree-code that is used to create the vector operation in the
6821 epilog code (that reduces the partial results) is not the
6822 tree-code of STMT, but is rather the tree-code of the original
6823 stmt from the pattern that STMT is replacing. I.e., in the example
6824 above we want to use 'widen_sum' in the loop, but 'plus' in the
6825 epilog.
6826 2. The type (mode) we use to check available target support
6827 for the vector operation to be created in the *epilog*, is
6828 determined by the type of the reduction variable (in the example
6829 above we'd check this: optab_handler (plus_optab, vect_int_mode)).
6830 However the type (mode) we use to check available target support
6831 for the vector operation to be created *inside the loop*, is
6832 determined by the type of the other arguments to STMT (in the
6833 example we'd check this: optab_handler (widen_sum_optab,
6834 vect_short_mode)).
6836 This is contrary to "regular" reductions, in which the types of all
6837 the arguments are the same as the type of the reduction variable.
6838 For "regular" reductions we can therefore use the same vector type
6839 (and also the same tree-code) when generating the epilog code and
6840 when generating the code inside the loop. */
6842 enum tree_code orig_code = STMT_VINFO_REDUC_CODE (phi_info);
6843 STMT_VINFO_REDUC_CODE (reduc_info) = orig_code;
6845 vect_reduction_type reduction_type = STMT_VINFO_REDUC_TYPE (reduc_info);
6846 if (reduction_type == TREE_CODE_REDUCTION)
6848 /* Check whether it's ok to change the order of the computation.
6849 Generally, when vectorizing a reduction we change the order of the
6850 computation. This may change the behavior of the program in some
6851 cases, so we need to check that this is ok. One exception is when
6852 vectorizing an outer-loop: the inner-loop is executed sequentially,
6853 and therefore vectorizing reductions in the inner-loop during
6854 outer-loop vectorization is safe. */
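/* Illustrative example: a float accumulation

     for (i = 0; i < N; i++)
       s += a[i];

   compiled without -fassociative-math must keep the original left-to-right
   order of the additions; per-lane partial sums would reassociate them and
   could change the rounded result, so such reductions are handled as
   FOLD_LEFT_REDUCTION below.  */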
6855 if (needs_fold_left_reduction_p (scalar_type, orig_code))
6857 /* When vectorizing a reduction chain w/o SLP the reduction PHI
6858 is not directly used in stmt. */
6859 if (!only_slp_reduc_chain
6860 && reduc_chain_length != 1)
6862 if (dump_enabled_p ())
6863 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6864 "in-order reduction chain without SLP.\n");
6865 return false;
6867 STMT_VINFO_REDUC_TYPE (reduc_info)
6868 = reduction_type = FOLD_LEFT_REDUCTION;
6870 else if (!commutative_tree_code (orig_code)
6871 || !associative_tree_code (orig_code))
6873 if (dump_enabled_p ())
6874 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6875 "reduction: not commutative/associative");
6876 return false;
6880 if ((double_reduc || reduction_type != TREE_CODE_REDUCTION)
6881 && ncopies > 1)
6883 if (dump_enabled_p ())
6884 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6885 "multiple types in double reduction or condition "
6886 "reduction or fold-left reduction.\n");
6887 return false;
6890 internal_fn reduc_fn = IFN_LAST;
6891 if (reduction_type == TREE_CODE_REDUCTION
6892 || reduction_type == FOLD_LEFT_REDUCTION
6893 || reduction_type == INTEGER_INDUC_COND_REDUCTION
6894 || reduction_type == CONST_COND_REDUCTION)
6896 if (reduction_type == FOLD_LEFT_REDUCTION
6897 ? fold_left_reduction_fn (orig_code, &reduc_fn)
6898 : reduction_fn_for_scalar_code (orig_code, &reduc_fn))
6900 if (reduc_fn != IFN_LAST
6901 && !direct_internal_fn_supported_p (reduc_fn, vectype_out,
6902 OPTIMIZE_FOR_SPEED))
6904 if (dump_enabled_p ())
6905 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6906 "reduc op not supported by target.\n");
6908 reduc_fn = IFN_LAST;
6911 else
6913 if (!nested_cycle || double_reduc)
6915 if (dump_enabled_p ())
6916 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6917 "no reduc code for scalar code.\n");
6919 return false;
6923 else if (reduction_type == COND_REDUCTION)
6925 int scalar_precision
6926 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
6927 cr_index_scalar_type = make_unsigned_type (scalar_precision);
6928 cr_index_vector_type = build_vector_type (cr_index_scalar_type,
6929 nunits_out);
6931 if (direct_internal_fn_supported_p (IFN_REDUC_MAX, cr_index_vector_type,
6932 OPTIMIZE_FOR_SPEED))
6933 reduc_fn = IFN_REDUC_MAX;
6935 STMT_VINFO_REDUC_FN (reduc_info) = reduc_fn;
6937 if (reduction_type != EXTRACT_LAST_REDUCTION
6938 && (!nested_cycle || double_reduc)
6939 && reduc_fn == IFN_LAST
6940 && !nunits_out.is_constant ())
6942 if (dump_enabled_p ())
6943 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6944 "missing target support for reduction on"
6945 " variable-length vectors.\n");
6946 return false;
6949 /* For SLP reductions, see if there is a neutral value we can use. */
6950 tree neutral_op = NULL_TREE;
6951 if (slp_node)
6952 neutral_op = neutral_op_for_slp_reduction
6953 (slp_node_instance->reduc_phis, vectype_out, orig_code,
6954 REDUC_GROUP_FIRST_ELEMENT (stmt_info) != NULL);
6956 if (double_reduc && reduction_type == FOLD_LEFT_REDUCTION)
6958 /* We can't support in-order reductions of code such as this:
6960 for (int i = 0; i < n1; ++i)
6961 for (int j = 0; j < n2; ++j)
6962 l += a[j];
6964 since GCC effectively transforms the loop when vectorizing:
6966 for (int i = 0; i < n1 / VF; ++i)
6967 for (int j = 0; j < n2; ++j)
6968 for (int k = 0; k < VF; ++k)
6969 l += a[j];
6971 which is a reassociation of the original operation. */
6972 if (dump_enabled_p ())
6973 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6974 "in-order double reduction not supported.\n");
6976 return false;
6979 if (reduction_type == FOLD_LEFT_REDUCTION
6980 && slp_node
6981 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6983 /* We cannot use in-order reductions in this case because there is
6984 an implicit reassociation of the operations involved. */
6985 if (dump_enabled_p ())
6986 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6987 "in-order unchained SLP reductions not supported.\n");
6988 return false;
6991 /* For double reductions, and for SLP reductions with a neutral value,
6992 we construct a variable-length initial vector by loading a vector
6993 full of the neutral value and then shift-and-inserting the start
6994 values into the low-numbered elements. */
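/* Illustrative sketch for a sum with start value S and neutral value 0
   on a variable-length vector:

     tmp = { 0, 0, ..., 0 }              (splat of the neutral value)
     init = IFN_VEC_SHL_INSERT (tmp, S)  -> { S, 0, ..., 0 }

   so only the low-numbered lane(s) carry the start value(s).  */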
6995 if ((double_reduc || neutral_op)
6996 && !nunits_out.is_constant ()
6997 && !direct_internal_fn_supported_p (IFN_VEC_SHL_INSERT,
6998 vectype_out, OPTIMIZE_FOR_SPEED))
7000 if (dump_enabled_p ())
7001 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7002 "reduction on variable-length vectors requires"
7003 " target support for a vector-shift-and-insert"
7004 " operation.\n");
7005 return false;
7008 /* Check extra constraints for variable-length unchained SLP reductions. */
7009 if (STMT_SLP_TYPE (stmt_info)
7010 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info)
7011 && !nunits_out.is_constant ())
7013 /* We checked above that we could build the initial vector when
7014 there's a neutral element value. Check here for the case in
7015 which each SLP statement has its own initial value and in which
7016 that value needs to be repeated for every instance of the
7017 statement within the initial vector. */
7018 unsigned int group_size = SLP_TREE_LANES (slp_node);
7019 if (!neutral_op
7020 && !can_duplicate_and_interleave_p (loop_vinfo, group_size,
7021 TREE_TYPE (vectype_out)))
7023 if (dump_enabled_p ())
7024 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7025 "unsupported form of SLP reduction for"
7026 " variable-length vectors: cannot build"
7027 " initial vector.\n");
7028 return false;
7030 /* The epilogue code relies on the number of elements being a multiple
7031 of the group size. The duplicate-and-interleave approach to setting
7032 up the initial vector does too. */
7033 if (!multiple_p (nunits_out, group_size))
7035 if (dump_enabled_p ())
7036 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7037 "unsupported form of SLP reduction for"
7038 " variable-length vectors: the vector size"
7039 " is not a multiple of the number of results.\n");
7040 return false;
7044 if (reduction_type == COND_REDUCTION)
7046 widest_int ni;
7048 if (! max_loop_iterations (loop, &ni))
7050 if (dump_enabled_p ())
7051 dump_printf_loc (MSG_NOTE, vect_location,
7052 "loop count not known, cannot create cond "
7053 "reduction.\n");
7054 return false;
7056 /* Convert backedges to iterations. */
7057 ni += 1;
7059 /* The additional index will have the same type as the condition. Check
7060 that the loop count fits into this type less one (because we'll use up
7061 the zero slot for when there are no matches). */
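/* Illustrative example: for an 8-bit unsigned index type max_index is 255;
   since index 0 is reserved for "no match", the check below only accepts
   loops running at most 254 iterations.  */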
7062 tree max_index = TYPE_MAX_VALUE (cr_index_scalar_type);
7063 if (wi::geu_p (ni, wi::to_widest (max_index)))
7065 if (dump_enabled_p ())
7066 dump_printf_loc (MSG_NOTE, vect_location,
7067 "loop size is greater than data size.\n");
7068 return false;
7072 /* In case the vectorization factor (VF) is bigger than the number
7073 of elements that we can fit in a vectype (nunits), we have to generate
7074 more than one vector stmt - i.e - we need to "unroll" the
7075 vector stmt by a factor VF/nunits. For more details see documentation
7076 in vectorizable_operation. */
7078 /* If the reduction is used in an outer loop we need to generate
7079 VF intermediate results, like so (e.g. for ncopies=2):
7080 r0 = phi (init, r0)
7081 r1 = phi (init, r1)
7082 r0 = x0 + r0;
7083 r1 = x1 + r1;
7084 (i.e. we generate VF results in 2 registers).
7085 In this case we have a separate def-use cycle for each copy, and therefore
7086 for each copy we get the vector def for the reduction variable from the
7087 respective phi node created for this copy.
7089 Otherwise (the reduction is unused in the loop nest), we can combine
7090 together intermediate results, like so (e.g. for ncopies=2):
7091 r = phi (init, r)
7092 r = x0 + r;
7093 r = x1 + r;
7094 (i.e. we generate VF/2 results in a single register).
7095 In this case for each copy we get the vector def for the reduction variable
7096 from the vectorized reduction operation generated in the previous iteration.
7098 This only works when we see both the reduction PHI and its only consumer
7099 in vectorizable_reduction and there are no intermediate stmts
7100 participating. */
7101 if (ncopies > 1
7102 && (STMT_VINFO_RELEVANT (stmt_info) <= vect_used_only_live)
7103 && reduc_chain_length == 1)
7104 single_defuse_cycle = true;
7106 if (single_defuse_cycle || lane_reduc_code_p)
7108 gcc_assert (code != COND_EXPR);
7110 /* 4. Supportable by target? */
7111 bool ok = true;
7113 /* 4.1. check support for the operation in the loop */
7114 optab optab = optab_for_tree_code (code, vectype_in, optab_vector);
7115 if (!optab)
7117 if (dump_enabled_p ())
7118 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7119 "no optab.\n");
7120 ok = false;
7123 machine_mode vec_mode = TYPE_MODE (vectype_in);
7124 if (ok && optab_handler (optab, vec_mode) == CODE_FOR_nothing)
7126 if (dump_enabled_p ())
7127 dump_printf (MSG_NOTE, "op not supported by target.\n");
7128 if (maybe_ne (GET_MODE_SIZE (vec_mode), UNITS_PER_WORD)
7129 || !vect_worthwhile_without_simd_p (loop_vinfo, code))
7130 ok = false;
7131 else
7132 if (dump_enabled_p ())
7133 dump_printf (MSG_NOTE, "proceeding using word mode.\n");
7136 /* Worthwhile without SIMD support? */
7137 if (ok
7138 && !VECTOR_MODE_P (TYPE_MODE (vectype_in))
7139 && !vect_worthwhile_without_simd_p (loop_vinfo, code))
7141 if (dump_enabled_p ())
7142 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7143 "not worthwhile without SIMD support.\n");
7144 ok = false;
7147 /* lane-reducing operations have to go through vect_transform_reduction.
7148 For the other cases try without the single cycle optimization. */
7149 if (!ok)
7151 if (lane_reduc_code_p)
7152 return false;
7153 else
7154 single_defuse_cycle = false;
7157 STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info) = single_defuse_cycle;
7159 /* If the reduction stmt is one of the patterns that have lane
7160 reduction embedded we cannot handle the case of ! single_defuse_cycle. */
7161 if ((ncopies > 1 && ! single_defuse_cycle)
7162 && lane_reduc_code_p)
7164 if (dump_enabled_p ())
7165 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7166 "multi def-use cycle not possible for lane-reducing "
7167 "reduction operation\n");
7168 return false;
7171 if (slp_node
7172 && !(!single_defuse_cycle
7173 && code != DOT_PROD_EXPR
7174 && code != WIDEN_SUM_EXPR
7175 && code != SAD_EXPR
7176 && reduction_type != FOLD_LEFT_REDUCTION))
7177 for (i = 0; i < op_type; i++)
7178 if (!vect_maybe_update_slp_op_vectype (slp_op[i], vectype_in))
7180 if (dump_enabled_p ())
7181 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7182 "incompatible vector types for invariants\n");
7183 return false;
7186 if (slp_node)
7187 vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
7188 else
7189 vec_num = 1;
7191 vect_model_reduction_cost (loop_vinfo, stmt_info, reduc_fn,
7192 reduction_type, ncopies, cost_vec);
7193 if (dump_enabled_p ()
7194 && reduction_type == FOLD_LEFT_REDUCTION)
7195 dump_printf_loc (MSG_NOTE, vect_location,
7196 "using an in-order (fold-left) reduction.\n");
7197 STMT_VINFO_TYPE (orig_stmt_of_analysis) = cycle_phi_info_type;
7198 /* All reductions other than single-defuse-cycle optimized, lane-reducing
7199 and fold-left ones go through their own vectorizable_* routines. */
7200 if (!single_defuse_cycle
7201 && code != DOT_PROD_EXPR
7202 && code != WIDEN_SUM_EXPR
7203 && code != SAD_EXPR
7204 && reduction_type != FOLD_LEFT_REDUCTION)
7206 stmt_vec_info tem
7207 = vect_stmt_to_vectorize (STMT_VINFO_REDUC_DEF (phi_info));
7208 if (slp_node && REDUC_GROUP_FIRST_ELEMENT (tem))
7210 gcc_assert (!REDUC_GROUP_NEXT_ELEMENT (tem));
7211 tem = REDUC_GROUP_FIRST_ELEMENT (tem);
7213 STMT_VINFO_DEF_TYPE (vect_orig_stmt (tem)) = vect_internal_def;
7214 STMT_VINFO_DEF_TYPE (tem) = vect_internal_def;
7216 else if (loop_vinfo && LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
7218 vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
7219 internal_fn cond_fn = get_conditional_internal_fn (code);
7221 if (reduction_type != FOLD_LEFT_REDUCTION
7222 && !use_mask_by_cond_expr_p (code, cond_fn, vectype_in)
7223 && (cond_fn == IFN_LAST
7224 || !direct_internal_fn_supported_p (cond_fn, vectype_in,
7225 OPTIMIZE_FOR_SPEED)))
7227 if (dump_enabled_p ())
7228 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7229 "can't operate on partial vectors because"
7230 " no conditional operation is available.\n");
7231 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
7233 else if (reduction_type == FOLD_LEFT_REDUCTION
7234 && reduc_fn == IFN_LAST
7235 && !expand_vec_cond_expr_p (vectype_in,
7236 truth_type_for (vectype_in),
7237 SSA_NAME))
7239 if (dump_enabled_p ())
7240 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7241 "can't operate on partial vectors because"
7242 " no conditional operation is available.\n");
7243 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
7245 else
7246 vect_record_loop_mask (loop_vinfo, masks, ncopies * vec_num,
7247 vectype_in, NULL);
7249 return true;
7252 /* Transform the definition stmt STMT_INFO of a reduction PHI backedge
7253 value. */
7255 bool
7256 vect_transform_reduction (loop_vec_info loop_vinfo,
7257 stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
7258 gimple **vec_stmt, slp_tree slp_node)
7260 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
7261 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7262 int i;
7263 int ncopies;
7264 int vec_num;
7266 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
7267 gcc_assert (reduc_info->is_reduc_info);
7269 if (nested_in_vect_loop_p (loop, stmt_info))
7271 loop = loop->inner;
7272 gcc_assert (STMT_VINFO_DEF_TYPE (reduc_info) == vect_double_reduction_def);
7275 gassign *stmt = as_a <gassign *> (stmt_info->stmt);
7276 enum tree_code code = gimple_assign_rhs_code (stmt);
7277 int op_type = TREE_CODE_LENGTH (code);
7279 /* Flatten RHS. */
7280 tree ops[3];
7281 switch (get_gimple_rhs_class (code))
7283 case GIMPLE_TERNARY_RHS:
7284 ops[2] = gimple_assign_rhs3 (stmt);
7285 /* Fall thru. */
7286 case GIMPLE_BINARY_RHS:
7287 ops[0] = gimple_assign_rhs1 (stmt);
7288 ops[1] = gimple_assign_rhs2 (stmt);
7289 break;
7290 default:
7291 gcc_unreachable ();
7294 /* All uses but the last are expected to be defined in the loop.
7295 The last use is the reduction variable. In case of nested cycle this
7296 assumption is not true: we use reduc_index to record the index of the
7297 reduction variable. */
7298 stmt_vec_info phi_info = STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info));
7299 gphi *reduc_def_phi = as_a <gphi *> (phi_info->stmt);
7300 int reduc_index = STMT_VINFO_REDUC_IDX (stmt_info);
7301 tree vectype_in = STMT_VINFO_REDUC_VECTYPE_IN (reduc_info);
7303 if (slp_node)
7305 ncopies = 1;
7306 vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
7308 else
7310 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
7311 vec_num = 1;
7314 internal_fn cond_fn = get_conditional_internal_fn (code);
7315 vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
7316 bool mask_by_cond_expr = use_mask_by_cond_expr_p (code, cond_fn, vectype_in);
7318 /* Transform. */
7319 tree new_temp = NULL_TREE;
7320 auto_vec<tree> vec_oprnds0;
7321 auto_vec<tree> vec_oprnds1;
7322 auto_vec<tree> vec_oprnds2;
7323 tree def0;
7325 if (dump_enabled_p ())
7326 dump_printf_loc (MSG_NOTE, vect_location, "transform reduction.\n");
7328 /* FORNOW: Multiple types are not supported for condition. */
7329 if (code == COND_EXPR)
7330 gcc_assert (ncopies == 1);
7332 bool masked_loop_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
7334 vect_reduction_type reduction_type = STMT_VINFO_REDUC_TYPE (reduc_info);
7335 if (reduction_type == FOLD_LEFT_REDUCTION)
7337 internal_fn reduc_fn = STMT_VINFO_REDUC_FN (reduc_info);
7338 return vectorize_fold_left_reduction
7339 (loop_vinfo, stmt_info, gsi, vec_stmt, slp_node, reduc_def_phi, code,
7340 reduc_fn, ops, vectype_in, reduc_index, masks);
7343 bool single_defuse_cycle = STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info);
7344 gcc_assert (single_defuse_cycle
7345 || code == DOT_PROD_EXPR
7346 || code == WIDEN_SUM_EXPR
7347 || code == SAD_EXPR);
7349 /* Create the destination vector */
7350 tree scalar_dest = gimple_assign_lhs (stmt);
7351 tree vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
7353 vect_get_vec_defs (loop_vinfo, stmt_info, slp_node, ncopies,
7354 single_defuse_cycle && reduc_index == 0
7355 ? NULL_TREE : ops[0], &vec_oprnds0,
7356 single_defuse_cycle && reduc_index == 1
7357 ? NULL_TREE : ops[1], &vec_oprnds1,
7358 op_type == ternary_op
7359 && !(single_defuse_cycle && reduc_index == 2)
7360 ? ops[2] : NULL_TREE, &vec_oprnds2);
7361 if (single_defuse_cycle)
7363 gcc_assert (!slp_node);
7364 vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
7365 ops[reduc_index],
7366 reduc_index == 0 ? &vec_oprnds0
7367 : (reduc_index == 1 ? &vec_oprnds1
7368 : &vec_oprnds2));
7371 FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
7373 gimple *new_stmt;
7374 tree vop[3] = { def0, vec_oprnds1[i], NULL_TREE };
7375 if (masked_loop_p && !mask_by_cond_expr)
7377 /* Make sure that the reduction accumulator is vop[0]. */
7378 if (reduc_index == 1)
7380 gcc_assert (commutative_tree_code (code));
7381 std::swap (vop[0], vop[1]);
7383 tree mask = vect_get_loop_mask (gsi, masks, vec_num * ncopies,
7384 vectype_in, i);
7385 gcall *call = gimple_build_call_internal (cond_fn, 4, mask,
7386 vop[0], vop[1], vop[0]);
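/* Illustrative form of the call built here for a masked sum:
     new_temp = .COND_ADD (mask, acc, x, acc);
   lanes where MASK is false simply keep the accumulator value.  */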
7387 new_temp = make_ssa_name (vec_dest, call);
7388 gimple_call_set_lhs (call, new_temp);
7389 gimple_call_set_nothrow (call, true);
7390 vect_finish_stmt_generation (loop_vinfo, stmt_info, call, gsi);
7391 new_stmt = call;
7393 else
7395 if (op_type == ternary_op)
7396 vop[2] = vec_oprnds2[i];
7398 if (masked_loop_p && mask_by_cond_expr)
7400 tree mask = vect_get_loop_mask (gsi, masks, vec_num * ncopies,
7401 vectype_in, i);
7402 build_vect_cond_expr (code, vop, mask, gsi);
7405 new_stmt = gimple_build_assign (vec_dest, code,
7406 vop[0], vop[1], vop[2]);
7407 new_temp = make_ssa_name (vec_dest, new_stmt);
7408 gimple_assign_set_lhs (new_stmt, new_temp);
7409 vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
7412 if (slp_node)
7413 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
7414 else if (single_defuse_cycle
7415 && i < ncopies - 1)
7417 if (reduc_index == 0)
7418 vec_oprnds0.safe_push (gimple_get_lhs (new_stmt));
7419 else if (reduc_index == 1)
7420 vec_oprnds1.safe_push (gimple_get_lhs (new_stmt));
7421 else if (reduc_index == 2)
7422 vec_oprnds2.safe_push (gimple_get_lhs (new_stmt));
7424 else
7425 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
7428 if (!slp_node)
7429 *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
7431 return true;
7434 /* Transform phase of a cycle PHI. */
7436 bool
7437 vect_transform_cycle_phi (loop_vec_info loop_vinfo,
7438 stmt_vec_info stmt_info, gimple **vec_stmt,
7439 slp_tree slp_node, slp_instance slp_node_instance)
7441 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
7442 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7443 int i;
7444 int ncopies;
7445 int j;
7446 bool nested_cycle = false;
7447 int vec_num;
7449 if (nested_in_vect_loop_p (loop, stmt_info))
7451 loop = loop->inner;
7452 nested_cycle = true;
7455 stmt_vec_info reduc_stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
7456 reduc_stmt_info = vect_stmt_to_vectorize (reduc_stmt_info);
7457 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
7458 gcc_assert (reduc_info->is_reduc_info);
7460 if (STMT_VINFO_REDUC_TYPE (reduc_info) == EXTRACT_LAST_REDUCTION
7461 || STMT_VINFO_REDUC_TYPE (reduc_info) == FOLD_LEFT_REDUCTION)
7462 /* Leave the scalar phi in place. */
7463 return true;
7465 tree vectype_in = STMT_VINFO_REDUC_VECTYPE_IN (reduc_info);
7466 /* For a nested cycle we do not fill the above. */
7467 if (!vectype_in)
7468 vectype_in = STMT_VINFO_VECTYPE (stmt_info);
7469 gcc_assert (vectype_in);
7471 if (slp_node)
7473 /* The size vect_schedule_slp_instance computes is off for us. */
7474 vec_num = vect_get_num_vectors (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
7475 * SLP_TREE_LANES (slp_node), vectype_in);
7476 ncopies = 1;
7478 else
7480 vec_num = 1;
7481 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
7484 /* Check whether we should use a single PHI node and accumulate
7485 vectors to one before the backedge. */
7486 if (STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info))
7487 ncopies = 1;
7489 /* Create the destination vector */
7490 gphi *phi = as_a <gphi *> (stmt_info->stmt);
7491 tree vec_dest = vect_create_destination_var (gimple_phi_result (phi),
7492 vectype_out);
7494 /* Get the loop-entry arguments. */
7495 tree vec_initial_def;
7496 auto_vec<tree> vec_initial_defs;
7497 if (slp_node)
7499 vec_initial_defs.reserve (vec_num);
7500 if (nested_cycle)
7502 unsigned phi_idx = loop_preheader_edge (loop)->dest_idx;
7503 vect_get_slp_defs (SLP_TREE_CHILDREN (slp_node)[phi_idx],
7504 &vec_initial_defs);
7506 else
7508 gcc_assert (slp_node == slp_node_instance->reduc_phis);
7509 stmt_vec_info first = REDUC_GROUP_FIRST_ELEMENT (reduc_stmt_info);
7510 tree neutral_op
7511 = neutral_op_for_slp_reduction (slp_node, vectype_out,
7512 STMT_VINFO_REDUC_CODE (reduc_info),
7513 first != NULL);
7514 get_initial_defs_for_reduction (loop_vinfo, slp_node_instance->reduc_phis,
7515 &vec_initial_defs, vec_num,
7516 first != NULL, neutral_op);
7519 else
7521 /* Get at the scalar def before the loop, that defines the initial
7522 value of the reduction variable. */
7523 tree initial_def = PHI_ARG_DEF_FROM_EDGE (phi,
7524 loop_preheader_edge (loop));
7525 /* Optimize: if initial_def is for REDUC_MAX smaller than the base
7526 and we can't use zero for induc_val, use initial_def. Similarly
7527 for REDUC_MIN and initial_def larger than the base. */
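/* Illustrative example: for a MAX_EXPR INTEGER_INDUC_COND_REDUCTION with
   induc_val 1 and a constant start value of -5 (< 1), the initial vector
   can simply be filled with -5 and the epilogue needs no induc_val
   fix-up, so the field is cleared below.  */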
7528 if (STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
7530 tree induc_val = STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info);
7531 if (TREE_CODE (initial_def) == INTEGER_CST
7532 && !integer_zerop (induc_val)
7533 && ((STMT_VINFO_REDUC_CODE (reduc_info) == MAX_EXPR
7534 && tree_int_cst_lt (initial_def, induc_val))
7535 || (STMT_VINFO_REDUC_CODE (reduc_info) == MIN_EXPR
7536 && tree_int_cst_lt (induc_val, initial_def))))
7538 induc_val = initial_def;
7539 /* Communicate we used the initial_def to epilogue
7540 generation. */
7541 STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info) = NULL_TREE;
7543 vec_initial_def = build_vector_from_val (vectype_out, induc_val);
7544 vec_initial_defs.create (ncopies);
7545 for (i = 0; i < ncopies; ++i)
7546 vec_initial_defs.quick_push (vec_initial_def);
7548 else if (nested_cycle)
7550 /* Do not use an adjustment def as that case is not supported
7551 correctly if ncopies is not one. */
7552 vect_get_vec_defs_for_operand (loop_vinfo, reduc_stmt_info,
7553 ncopies, initial_def,
7554 &vec_initial_defs);
7556 else
7558 tree adjustment_def = NULL_TREE;
7559 tree *adjustment_defp = &adjustment_def;
7560 enum tree_code code = STMT_VINFO_REDUC_CODE (reduc_info);
7561 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
7562 adjustment_defp = NULL;
7563 vec_initial_def
7564 = get_initial_def_for_reduction (loop_vinfo, reduc_stmt_info, code,
7565 initial_def, adjustment_defp);
7566 STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info) = adjustment_def;
7567 vec_initial_defs.create (ncopies);
7568 for (i = 0; i < ncopies; ++i)
7569 vec_initial_defs.quick_push (vec_initial_def);
7573 /* Generate the reduction PHIs upfront. */
7574 for (i = 0; i < vec_num; i++)
7576 tree vec_init_def = vec_initial_defs[i];
7577 for (j = 0; j < ncopies; j++)
7579 /* Create the reduction-phi that defines the reduction
7580 operand. */
7581 gphi *new_phi = create_phi_node (vec_dest, loop->header);
7583 /* Set the loop-entry arg of the reduction-phi. */
7584 if (j != 0 && nested_cycle)
7585 vec_init_def = vec_initial_defs[j];
7586 add_phi_arg (new_phi, vec_init_def, loop_preheader_edge (loop),
7587 UNKNOWN_LOCATION);
7589 /* The loop-latch arg is set in epilogue processing. */
7591 if (slp_node)
7592 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_phi);
7593 else
7595 if (j == 0)
7596 *vec_stmt = new_phi;
7597 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_phi);
7602 return true;
7605 /* Vectorizes LC PHIs. */
7607 bool
7608 vectorizable_lc_phi (loop_vec_info loop_vinfo,
7609 stmt_vec_info stmt_info, gimple **vec_stmt,
7610 slp_tree slp_node)
7612 if (!loop_vinfo
7613 || !is_a <gphi *> (stmt_info->stmt)
7614 || gimple_phi_num_args (stmt_info->stmt) != 1)
7615 return false;
7617 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_internal_def
7618 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
7619 return false;
7621 if (!vec_stmt) /* transformation not required. */
7623 /* Deal with copies from externs or constants that are disguised as
7624 loop-closed PHI nodes (PR97886). */
7625 if (slp_node
7626 && !vect_maybe_update_slp_op_vectype (SLP_TREE_CHILDREN (slp_node)[0],
7627 SLP_TREE_VECTYPE (slp_node)))
7629 if (dump_enabled_p ())
7630 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7631 "incompatible vector types for invariants\n");
7632 return false;
7634 STMT_VINFO_TYPE (stmt_info) = lc_phi_info_type;
7635 return true;
7638 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
7639 tree scalar_dest = gimple_phi_result (stmt_info->stmt);
7640 basic_block bb = gimple_bb (stmt_info->stmt);
7641 edge e = single_pred_edge (bb);
7642 tree vec_dest = vect_create_destination_var (scalar_dest, vectype);
7643 auto_vec<tree> vec_oprnds;
7644 vect_get_vec_defs (loop_vinfo, stmt_info, slp_node,
7645 !slp_node ? vect_get_num_copies (loop_vinfo, vectype) : 1,
7646 gimple_phi_arg_def (stmt_info->stmt, 0), &vec_oprnds);
7647 for (unsigned i = 0; i < vec_oprnds.length (); i++)
7649 /* Create the vectorized LC PHI node. */
7650 gphi *new_phi = create_phi_node (vec_dest, bb);
7651 add_phi_arg (new_phi, vec_oprnds[i], e, UNKNOWN_LOCATION);
7652 if (slp_node)
7653 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_phi);
7654 else
7655 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_phi);
7657 if (!slp_node)
7658 *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
7660 return true;
7663 /* Vectorizes PHIs. */
7665 bool
7666 vectorizable_phi (vec_info *,
7667 stmt_vec_info stmt_info, gimple **vec_stmt,
7668 slp_tree slp_node, stmt_vector_for_cost *cost_vec)
7670 if (!is_a <gphi *> (stmt_info->stmt) || !slp_node)
7671 return false;
7673 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_internal_def)
7674 return false;
7676 tree vectype = SLP_TREE_VECTYPE (slp_node);
7678 if (!vec_stmt) /* transformation not required. */
7680 slp_tree child;
7681 unsigned i;
7682 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), i, child)
7683 if (!child)
7685 if (dump_enabled_p ())
7686 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7687 "PHI node with unvectorized backedge def\n");
7688 return false;
7690 else if (!vect_maybe_update_slp_op_vectype (child, vectype))
7692 if (dump_enabled_p ())
7693 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7694 "incompatible vector types for invariants\n");
7695 return false;
7697 record_stmt_cost (cost_vec, SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node),
7698 vector_stmt, stmt_info, vectype, 0, vect_body);
7699 STMT_VINFO_TYPE (stmt_info) = phi_info_type;
7700 return true;
7703 tree scalar_dest = gimple_phi_result (stmt_info->stmt);
7704 basic_block bb = gimple_bb (stmt_info->stmt);
7705 tree vec_dest = vect_create_destination_var (scalar_dest, vectype);
7706 auto_vec<gphi *> new_phis;
7707 for (unsigned i = 0; i < gimple_phi_num_args (stmt_info->stmt); ++i)
7709 slp_tree child = SLP_TREE_CHILDREN (slp_node)[i];
7711 /* Skip not yet vectorized defs. */
7712 if (SLP_TREE_DEF_TYPE (child) == vect_internal_def
7713 && SLP_TREE_VEC_STMTS (child).is_empty ())
7714 continue;
7716 auto_vec<tree> vec_oprnds;
7717 vect_get_slp_defs (SLP_TREE_CHILDREN (slp_node)[i], &vec_oprnds);
7718 if (!new_phis.exists ())
7720 new_phis.create (vec_oprnds.length ());
7721 for (unsigned j = 0; j < vec_oprnds.length (); j++)
7723 /* Create the vectorized LC PHI node. */
7724 new_phis.quick_push (create_phi_node (vec_dest, bb));
7725 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_phis[j]);
7728 edge e = gimple_phi_arg_edge (as_a <gphi *> (stmt_info->stmt), i);
7729 for (unsigned j = 0; j < vec_oprnds.length (); j++)
7730 add_phi_arg (new_phis[j], vec_oprnds[j], e, UNKNOWN_LOCATION);
7732 /* We should have at least one already vectorized child. */
7733 gcc_assert (new_phis.exists ());
7735 return true;
7739 /* Function vect_min_worthwhile_factor.
7741 For a loop where we could vectorize the operation indicated by CODE,
7742 return the minimum vectorization factor that makes it worthwhile
7743 to use generic vectors. */
7744 static unsigned int
7745 vect_min_worthwhile_factor (enum tree_code code)
7747 switch (code)
7749 case PLUS_EXPR:
7750 case MINUS_EXPR:
7751 case NEGATE_EXPR:
7752 return 4;
7754 case BIT_AND_EXPR:
7755 case BIT_IOR_EXPR:
7756 case BIT_XOR_EXPR:
7757 case BIT_NOT_EXPR:
7758 return 2;
7760 default:
7761 return INT_MAX;
7765 /* Return true if VINFO indicates we are doing loop vectorization and if
7766 it is worth decomposing CODE operations into scalar operations for
7767 that loop's vectorization factor. */
7769 bool
7770 vect_worthwhile_without_simd_p (vec_info *vinfo, tree_code code)
7772 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
7773 unsigned HOST_WIDE_INT value;
7774 return (loop_vinfo
7775 && LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&value)
7776 && value >= vect_min_worthwhile_factor (code));
7779 /* Function vectorizable_induction
7781 Check if STMT_INFO performs an induction computation that can be vectorized.
7782 If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
7783 phi to replace it, put it in VEC_STMT, and add it to the same basic block.
7784 Return true if STMT_INFO is vectorizable in this way. */
7786 bool
7787 vectorizable_induction (loop_vec_info loop_vinfo,
7788 stmt_vec_info stmt_info,
7789 gimple **vec_stmt, slp_tree slp_node,
7790 stmt_vector_for_cost *cost_vec)
7792 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7793 unsigned ncopies;
7794 bool nested_in_vect_loop = false;
7795 class loop *iv_loop;
7796 tree vec_def;
7797 edge pe = loop_preheader_edge (loop);
7798 basic_block new_bb;
7799 tree new_vec, vec_init, vec_step, t;
7800 tree new_name;
7801 gimple *new_stmt;
7802 gphi *induction_phi;
7803 tree induc_def, vec_dest;
7804 tree init_expr, step_expr;
7805 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
7806 unsigned i;
7807 tree expr;
7808 gimple_stmt_iterator si;
7810 gphi *phi = dyn_cast <gphi *> (stmt_info->stmt);
7811 if (!phi)
7812 return false;
7814 if (!STMT_VINFO_RELEVANT_P (stmt_info))
7815 return false;
7817 /* Make sure it was recognized as induction computation. */
7818 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
7819 return false;
7821 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
7822 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
7824 if (slp_node)
7825 ncopies = 1;
7826 else
7827 ncopies = vect_get_num_copies (loop_vinfo, vectype);
7828 gcc_assert (ncopies >= 1);
7830 /* FORNOW. These restrictions should be relaxed. */
7831 if (nested_in_vect_loop_p (loop, stmt_info))
7833 imm_use_iterator imm_iter;
7834 use_operand_p use_p;
7835 gimple *exit_phi;
7836 edge latch_e;
7837 tree loop_arg;
7839 if (ncopies > 1)
7841 if (dump_enabled_p ())
7842 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7843 "multiple types in nested loop.\n");
7844 return false;
7847 exit_phi = NULL;
7848 latch_e = loop_latch_edge (loop->inner);
7849 loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
7850 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
7852 gimple *use_stmt = USE_STMT (use_p);
7853 if (is_gimple_debug (use_stmt))
7854 continue;
7856 if (!flow_bb_inside_loop_p (loop->inner, gimple_bb (use_stmt)))
7858 exit_phi = use_stmt;
7859 break;
7862 if (exit_phi)
7864 stmt_vec_info exit_phi_vinfo = loop_vinfo->lookup_stmt (exit_phi);
7865 if (!(STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
7866 && !STMT_VINFO_LIVE_P (exit_phi_vinfo)))
7868 if (dump_enabled_p ())
7869 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7870 "inner-loop induction only used outside "
7871 "of the outer vectorized loop.\n");
7872 return false;
7876 nested_in_vect_loop = true;
7877 iv_loop = loop->inner;
7879 else
7880 iv_loop = loop;
7881 gcc_assert (iv_loop == (gimple_bb (phi))->loop_father);
7883 if (slp_node && !nunits.is_constant ())
7885 /* The current SLP code creates the step value element-by-element. */
7886 if (dump_enabled_p ())
7887 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7888 "SLP induction not supported for variable-length"
7889 " vectors.\n");
7890 return false;
7893 if (!vec_stmt) /* transformation not required. */
7895 unsigned inside_cost = 0, prologue_cost = 0;
7896 if (slp_node)
7898 /* We eventually need to set a vector type on invariant
7899 arguments. */
7900 unsigned j;
7901 slp_tree child;
7902 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), j, child)
7903 if (!vect_maybe_update_slp_op_vectype
7904 (child, SLP_TREE_VECTYPE (slp_node)))
7906 if (dump_enabled_p ())
7907 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7908 "incompatible vector types for "
7909 "invariants\n");
7910 return false;
7912 /* loop cost for vec_loop. */
7913 inside_cost
7914 = record_stmt_cost (cost_vec,
7915 SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node),
7916 vector_stmt, stmt_info, 0, vect_body);
7917 /* prologue cost for vec_init (if not nested) and step. */
7918 prologue_cost = record_stmt_cost (cost_vec, 1 + !nested_in_vect_loop,
7919 scalar_to_vec,
7920 stmt_info, 0, vect_prologue);
7922 else /* if (!slp_node) */
7924 /* loop cost for vec_loop. */
7925 inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
7926 stmt_info, 0, vect_body);
7927 /* prologue cost for vec_init and vec_step. */
7928 prologue_cost = record_stmt_cost (cost_vec, 2, scalar_to_vec,
7929 stmt_info, 0, vect_prologue);
7931 if (dump_enabled_p ())
7932 dump_printf_loc (MSG_NOTE, vect_location,
7933 "vect_model_induction_cost: inside_cost = %d, "
7934 "prologue_cost = %d .\n", inside_cost,
7935 prologue_cost);
7937 STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
7938 DUMP_VECT_SCOPE ("vectorizable_induction");
7939 return true;
7942 /* Transform. */
7944 /* Compute a vector variable, initialized with the first VF values of
7945 the induction variable. E.g., for an iv with IV_PHI='X' and
7946 evolution S, for a vector of 4 units, we want to compute:
7947 [X, X + S, X + 2*S, X + 3*S]. */
7949 if (dump_enabled_p ())
7950 dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n");
7952 step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info);
7953 gcc_assert (step_expr != NULL_TREE);
7954 tree step_vectype = get_same_sized_vectype (TREE_TYPE (step_expr), vectype);
7956 pe = loop_preheader_edge (iv_loop);
7957 /* Find the first insertion point in the BB. */
7958 basic_block bb = gimple_bb (phi);
7959 si = gsi_after_labels (bb);
7961 /* For SLP induction we have to generate several IVs as for example
7962 with group size 3 we need
7963 [i0, i1, i2, i0 + S0] [i1 + S1, i2 + S2, i0 + 2*S0, i1 + 2*S1]
7964 [i2 + 2*S2, i0 + 3*S0, i1 + 3*S1, i2 + 3*S2]. */
7965 if (slp_node)
7967 /* Enforced above. */
7968 unsigned int const_nunits = nunits.to_constant ();
7970 /* The initial values are vectorized, but any lanes > group_size
7971 need adjustment. */
7972 slp_tree init_node
7973 = SLP_TREE_CHILDREN (slp_node)[pe->dest_idx];
7975 /* Gather steps. Since we do not vectorize inductions as
7976 cycles we have to reconstruct the step from SCEV data. */
7977 unsigned group_size = SLP_TREE_LANES (slp_node);
7978 tree *steps = XALLOCAVEC (tree, group_size);
7979 tree *inits = XALLOCAVEC (tree, group_size);
7980 stmt_vec_info phi_info;
7981 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (slp_node), i, phi_info)
7983 steps[i] = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (phi_info);
7984 if (!init_node)
7985 inits[i] = gimple_phi_arg_def (as_a<gphi *> (phi_info->stmt),
7986 pe->dest_idx);
7989 /* Now generate the IVs. */
7990 unsigned nvects = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
7991 gcc_assert ((const_nunits * nvects) % group_size == 0);
7992 unsigned nivs;
7993 if (nested_in_vect_loop)
7994 nivs = nvects;
7995 else
7997 /* Compute the number of distinct IVs we need. First reduce
7998 group_size if it is a multiple of const_nunits so we get
7999 one IV for a group_size of 4 but const_nunits 2. */
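/* Worked example (illustrative): const_nunits 4, group_size 6:
   group_sizep stays 6 and nivs = lcm (6, 4) / 4 = 3, i.e. three vector
   IVs cover two full groups of scalar IVs.  */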
8000 unsigned group_sizep = group_size;
8001 if (group_sizep % const_nunits == 0)
8002 group_sizep = group_sizep / const_nunits;
8003 nivs = least_common_multiple (group_sizep,
8004 const_nunits) / const_nunits;
8006 tree stept = TREE_TYPE (step_vectype);
8007 tree lupdate_mul = NULL_TREE;
8008 if (!nested_in_vect_loop)
8010 /* The number of iterations covered in one vector iteration. */
8011 unsigned lup_mul = (nvects * const_nunits) / group_size;
8012 lupdate_mul
8013 = build_vector_from_val (step_vectype,
8014 SCALAR_FLOAT_TYPE_P (stept)
8015 ? build_real_from_wide (stept, lup_mul,
8016 UNSIGNED)
8017 : build_int_cstu (stept, lup_mul));
8019 tree peel_mul = NULL_TREE;
8020 gimple_seq init_stmts = NULL;
8021 if (LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo))
8023 if (SCALAR_FLOAT_TYPE_P (stept))
8024 peel_mul = gimple_build (&init_stmts, FLOAT_EXPR, stept,
8025 LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo));
8026 else
8027 peel_mul = gimple_convert (&init_stmts, stept,
8028 LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo));
8029 peel_mul = gimple_build_vector_from_val (&init_stmts,
8030 step_vectype, peel_mul);
8032 unsigned ivn;
8033 auto_vec<tree> vec_steps;
8034 for (ivn = 0; ivn < nivs; ++ivn)
8036 tree_vector_builder step_elts (step_vectype, const_nunits, 1);
8037 tree_vector_builder init_elts (vectype, const_nunits, 1);
8038 tree_vector_builder mul_elts (step_vectype, const_nunits, 1);
8039 for (unsigned eltn = 0; eltn < const_nunits; ++eltn)
8041 /* The scalar steps of the IVs. */
8042 tree elt = steps[(ivn*const_nunits + eltn) % group_size];
8043 elt = gimple_convert (&init_stmts, TREE_TYPE (step_vectype), elt);
8044 step_elts.quick_push (elt);
8045 if (!init_node)
8047 /* The scalar inits of the IVs if not vectorized. */
8048 elt = inits[(ivn*const_nunits + eltn) % group_size];
8049 if (!useless_type_conversion_p (TREE_TYPE (vectype),
8050 TREE_TYPE (elt)))
8051 elt = gimple_build (&init_stmts, VIEW_CONVERT_EXPR,
8052 TREE_TYPE (vectype), elt);
8053 init_elts.quick_push (elt);
8055 /* The number of steps to add to the initial values. */
8056 unsigned mul_elt = (ivn*const_nunits + eltn) / group_size;
8057 mul_elts.quick_push (SCALAR_FLOAT_TYPE_P (stept)
8058 ? build_real_from_wide (stept,
8059 mul_elt, UNSIGNED)
8060 : build_int_cstu (stept, mul_elt));
8062 vec_step = gimple_build_vector (&init_stmts, &step_elts);
8063 vec_steps.safe_push (vec_step);
8064 tree step_mul = gimple_build_vector (&init_stmts, &mul_elts);
8065 if (peel_mul)
8066 step_mul = gimple_build (&init_stmts, PLUS_EXPR, step_vectype,
8067 step_mul, peel_mul);
8068 if (!init_node)
8069 vec_init = gimple_build_vector (&init_stmts, &init_elts);
8071 /* Create the induction-phi that defines the induction-operand. */
8072 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var,
8073 "vec_iv_");
8074 induction_phi = create_phi_node (vec_dest, iv_loop->header);
8075 induc_def = PHI_RESULT (induction_phi);
8077 /* Create the iv update inside the loop */
8078 tree up = vec_step;
8079 if (lupdate_mul)
8080 up = gimple_build (&init_stmts, MULT_EXPR, step_vectype,
8081 vec_step, lupdate_mul);
8082 gimple_seq stmts = NULL;
8083 vec_def = gimple_convert (&stmts, step_vectype, induc_def);
8084 vec_def = gimple_build (&stmts,
8085 PLUS_EXPR, step_vectype, vec_def, up);
8086 vec_def = gimple_convert (&stmts, vectype, vec_def);
8087 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
8088 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
8089 UNKNOWN_LOCATION);
8091 if (init_node)
8092 vec_init = vect_get_slp_vect_def (init_node, ivn);
8093 if (!nested_in_vect_loop
8094 && !integer_zerop (step_mul))
8096 vec_def = gimple_convert (&init_stmts, step_vectype, vec_init);
8097 up = gimple_build (&init_stmts, MULT_EXPR, step_vectype,
8098 vec_step, step_mul);
8099 vec_def = gimple_build (&init_stmts, PLUS_EXPR, step_vectype,
8100 vec_def, up);
8101 vec_init = gimple_convert (&init_stmts, vectype, vec_def);
8104 /* Set the arguments of the phi node: */
8105 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
8107 SLP_TREE_VEC_STMTS (slp_node).quick_push (induction_phi);
8109 if (!nested_in_vect_loop)
8111 /* Fill up to the number of vectors we need for the whole group. */
8112 nivs = least_common_multiple (group_size,
8113 const_nunits) / const_nunits;
8114 for (; ivn < nivs; ++ivn)
8116 SLP_TREE_VEC_STMTS (slp_node)
8117 .quick_push (SLP_TREE_VEC_STMTS (slp_node)[0]);
8118 vec_steps.safe_push (vec_steps[0]);
8122 /* Re-use IVs when we can. We are generating further vector
8123 stmts by adding VF' * stride to the IVs generated above. */
8124 if (ivn < nvects)
8126 unsigned vfp
8127 = least_common_multiple (group_size, const_nunits) / group_size;
8128 tree lupdate_mul
8129 = build_vector_from_val (step_vectype,
8130 SCALAR_FLOAT_TYPE_P (stept)
8131 ? build_real_from_wide (stept,
8132 vfp, UNSIGNED)
8133 : build_int_cstu (stept, vfp));
8134 for (; ivn < nvects; ++ivn)
8136 gimple *iv = SLP_TREE_VEC_STMTS (slp_node)[ivn - nivs];
8137 tree def = gimple_get_lhs (iv);
8138 if (ivn < 2*nivs)
8139 vec_steps[ivn - nivs]
8140 = gimple_build (&init_stmts, MULT_EXPR, step_vectype,
8141 vec_steps[ivn - nivs], lupdate_mul);
8142 gimple_seq stmts = NULL;
8143 def = gimple_convert (&stmts, step_vectype, def);
8144 def = gimple_build (&stmts, PLUS_EXPR, step_vectype,
8145 def, vec_steps[ivn % nivs]);
8146 def = gimple_convert (&stmts, vectype, def);
8147 if (gimple_code (iv) == GIMPLE_PHI)
8148 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
8149 else
8151 gimple_stmt_iterator tgsi = gsi_for_stmt (iv);
8152 gsi_insert_seq_after (&tgsi, stmts, GSI_CONTINUE_LINKING);
8154 SLP_TREE_VEC_STMTS (slp_node)
8155 .quick_push (SSA_NAME_DEF_STMT (def));
8159 new_bb = gsi_insert_seq_on_edge_immediate (pe, init_stmts);
8160 gcc_assert (!new_bb);
8162 return true;
8165 init_expr = PHI_ARG_DEF_FROM_EDGE (phi,
8166 loop_preheader_edge (iv_loop));
8168 gimple_seq stmts = NULL;
8169 if (!nested_in_vect_loop)
8171 /* Convert the initial value to the IV update type. */
8172 tree new_type = TREE_TYPE (step_expr);
8173 init_expr = gimple_convert (&stmts, new_type, init_expr);
8175 /* If we are using the loop mask to "peel" for alignment then we need
8176 to adjust the start value here. */
8177 tree skip_niters = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
8178 if (skip_niters != NULL_TREE)
8180 if (FLOAT_TYPE_P (vectype))
8181 skip_niters = gimple_build (&stmts, FLOAT_EXPR, new_type,
8182 skip_niters);
8183 else
8184 skip_niters = gimple_convert (&stmts, new_type, skip_niters);
8185 tree skip_step = gimple_build (&stmts, MULT_EXPR, new_type,
8186 skip_niters, step_expr);
8187 init_expr = gimple_build (&stmts, MINUS_EXPR, new_type,
8188 init_expr, skip_step);
8192 if (stmts)
8194 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
8195 gcc_assert (!new_bb);
8198 /* Create the vector that holds the initial_value of the induction. */
8199 if (nested_in_vect_loop)
8201 /* iv_loop is nested in the loop to be vectorized. init_expr had already
8202 been created during vectorization of previous stmts. We obtain it
8203 from the STMT_VINFO_VEC_STMT of the defining stmt. */
8204 auto_vec<tree> vec_inits;
8205 vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
8206 init_expr, &vec_inits);
8207 vec_init = vec_inits[0];
8208 /* If the initial value is not of proper type, convert it. */
8209 if (!useless_type_conversion_p (vectype, TREE_TYPE (vec_init)))
8211 new_stmt
8212 = gimple_build_assign (vect_get_new_ssa_name (vectype,
8213 vect_simple_var,
8214 "vec_iv_"),
8215 VIEW_CONVERT_EXPR,
8216 build1 (VIEW_CONVERT_EXPR, vectype,
8217 vec_init));
8218 vec_init = gimple_assign_lhs (new_stmt);
8219 new_bb = gsi_insert_on_edge_immediate (loop_preheader_edge (iv_loop),
8220 new_stmt);
8221 gcc_assert (!new_bb);
8224 else
8226 /* iv_loop is the loop to be vectorized. Create:
8227 vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr) */
8228 stmts = NULL;
8229 new_name = gimple_convert (&stmts, TREE_TYPE (step_expr), init_expr);
8231 unsigned HOST_WIDE_INT const_nunits;
8232 if (nunits.is_constant (&const_nunits))
8234 tree_vector_builder elts (step_vectype, const_nunits, 1);
8235 elts.quick_push (new_name);
8236 for (i = 1; i < const_nunits; i++)
8238 /* Create: new_name_i = new_name + step_expr */
8239 new_name = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (new_name),
8240 new_name, step_expr);
8241 elts.quick_push (new_name);
8243 /* Create a vector from [new_name_0, new_name_1, ...,
8244 new_name_nunits-1] */
8245 vec_init = gimple_build_vector (&stmts, &elts);
8247 else if (INTEGRAL_TYPE_P (TREE_TYPE (step_expr)))
8248 /* Build the initial value directly from a VEC_SERIES_EXPR. */
8249 vec_init = gimple_build (&stmts, VEC_SERIES_EXPR, step_vectype,
8250 new_name, step_expr);
8251 else
8253 /* Build:
8254 [base, base, base, ...]
8255 + (vectype) [0, 1, 2, ...] * [step, step, step, ...]. */
8256 gcc_assert (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)));
8257 gcc_assert (flag_associative_math);
8258 tree index = build_index_vector (step_vectype, 0, 1);
8259 tree base_vec = gimple_build_vector_from_val (&stmts, step_vectype,
8260 new_name);
8261 tree step_vec = gimple_build_vector_from_val (&stmts, step_vectype,
8262 step_expr);
8263 vec_init = gimple_build (&stmts, FLOAT_EXPR, step_vectype, index);
8264 vec_init = gimple_build (&stmts, MULT_EXPR, step_vectype,
8265 vec_init, step_vec);
8266 vec_init = gimple_build (&stmts, PLUS_EXPR, step_vectype,
8267 vec_init, base_vec);
8269 vec_init = gimple_convert (&stmts, vectype, vec_init);
8271 if (stmts)
8273 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
8274 gcc_assert (!new_bb);
8279 /* Create the vector that holds the step of the induction. */
8280 if (nested_in_vect_loop)
8281 /* iv_loop is nested in the loop to be vectorized. Generate:
8282 vec_step = [S, S, S, S] */
8283 new_name = step_expr;
8284 else
8286 /* iv_loop is the loop to be vectorized. Generate:
8287 vec_step = [VF*S, VF*S, VF*S, VF*S] */
8288 gimple_seq seq = NULL;
8289 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
8291 expr = build_int_cst (integer_type_node, vf);
8292 expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
8294 else
8295 expr = build_int_cst (TREE_TYPE (step_expr), vf);
8296 new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
8297 expr, step_expr);
8298 if (seq)
8300 new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
8301 gcc_assert (!new_bb);
8305 t = unshare_expr (new_name);
8306 gcc_assert (CONSTANT_CLASS_P (new_name)
8307 || TREE_CODE (new_name) == SSA_NAME);
8308 new_vec = build_vector_from_val (step_vectype, t);
8309 vec_step = vect_init_vector (loop_vinfo, stmt_info,
8310 new_vec, step_vectype, NULL);
8313 /* Create the following def-use cycle:
8314 loop prolog:
8315 vec_init = ...
8316 vec_step = ...
8317 loop:
8318 vec_iv = PHI <vec_init, vec_loop>
8320 STMT
8322 vec_loop = vec_iv + vec_step; */
8324 /* Create the induction-phi that defines the induction-operand. */
8325 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
8326 induction_phi = create_phi_node (vec_dest, iv_loop->header);
8327 induc_def = PHI_RESULT (induction_phi);
8329 /* Create the iv update inside the loop */
8330 stmts = NULL;
8331 vec_def = gimple_convert (&stmts, step_vectype, induc_def);
8332 vec_def = gimple_build (&stmts, PLUS_EXPR, step_vectype, vec_def, vec_step);
8333 vec_def = gimple_convert (&stmts, vectype, vec_def);
8334 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
8335 new_stmt = SSA_NAME_DEF_STMT (vec_def);
8337 /* Set the arguments of the phi node: */
8338 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
8339 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
8340 UNKNOWN_LOCATION);
8342 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (induction_phi);
8343 *vec_stmt = induction_phi;
8345 /* In case that vectorization factor (VF) is bigger than the number
8346 of elements that we can fit in a vectype (nunits), we have to generate
8347 more than one vector stmt - i.e - we need to "unroll" the
8348 vector stmt by a factor VF/nunits. For more details see documentation
8349 in vectorizable_operation. */
8351 if (ncopies > 1)
8353 gimple_seq seq = NULL;
8354 /* FORNOW. This restriction should be relaxed. */
8355 gcc_assert (!nested_in_vect_loop);
8357 /* Create the vector that holds the step of the induction. */
8358 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
8360 expr = build_int_cst (integer_type_node, nunits);
8361 expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
8363 else
8364 expr = build_int_cst (TREE_TYPE (step_expr), nunits);
8365 new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
8366 expr, step_expr);
8367 if (seq)
8369 new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
8370 gcc_assert (!new_bb);
8373 t = unshare_expr (new_name);
8374 gcc_assert (CONSTANT_CLASS_P (new_name)
8375 || TREE_CODE (new_name) == SSA_NAME);
8376 new_vec = build_vector_from_val (step_vectype, t);
8377 vec_step = vect_init_vector (loop_vinfo, stmt_info,
8378 new_vec, step_vectype, NULL);
8380 vec_def = induc_def;
8381 for (i = 1; i < ncopies; i++)
8383 /* vec_i = vec_prev + vec_step */
8384 gimple_seq stmts = NULL;
8385 vec_def = gimple_convert (&stmts, step_vectype, vec_def);
8386 vec_def = gimple_build (&stmts,
8387 PLUS_EXPR, step_vectype, vec_def, vec_step);
8388 vec_def = gimple_convert (&stmts, vectype, vec_def);
8390 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
8391 new_stmt = SSA_NAME_DEF_STMT (vec_def);
8392 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
8396 if (dump_enabled_p ())
8397 dump_printf_loc (MSG_NOTE, vect_location,
8398 "transform induction: created def-use cycle: %G%G",
8399 induction_phi, SSA_NAME_DEF_STMT (vec_def));
8401 return true;
8404 /* Function vectorizable_live_operation.
8406 STMT_INFO computes a value that is used outside the loop. Check if
8407 it can be supported. */
8409 bool
8410 vectorizable_live_operation (vec_info *vinfo,
8411 stmt_vec_info stmt_info,
8412 gimple_stmt_iterator *gsi,
8413 slp_tree slp_node, slp_instance slp_node_instance,
8414 int slp_index, bool vec_stmt_p,
8415 stmt_vector_for_cost *cost_vec)
8417 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
8418 imm_use_iterator imm_iter;
8419 tree lhs, lhs_type, bitsize, vec_bitsize;
8420 tree vectype = (slp_node
8421 ? SLP_TREE_VECTYPE (slp_node)
8422 : STMT_VINFO_VECTYPE (stmt_info));
8423 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
8424 int ncopies;
8425 gimple *use_stmt;
8426 auto_vec<tree> vec_oprnds;
8427 int vec_entry = 0;
8428 poly_uint64 vec_index = 0;
8430 gcc_assert (STMT_VINFO_LIVE_P (stmt_info));
8432 /* If a stmt of a reduction is live, vectorize it via
8433 vect_create_epilog_for_reduction. vectorizable_reduction assessed
8434 validity so just trigger the transform here. */
8435 if (STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info)))
8437 if (!vec_stmt_p)
8438 return true;
8439 if (slp_node)
8441 /* For reduction chains the meta-info is attached to
8442 the group leader. */
8443 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
8444 stmt_info = REDUC_GROUP_FIRST_ELEMENT (stmt_info);
8445 /* For SLP reductions we vectorize the epilogue for
8446 all involved stmts together. */
8447 else if (slp_index != 0)
8448 return true;
8449 else
8450 /* For SLP reductions the meta-info is attached to
8451 the representative. */
8452 stmt_info = SLP_TREE_REPRESENTATIVE (slp_node);
8454 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
8455 gcc_assert (reduc_info->is_reduc_info);
8456 if (STMT_VINFO_REDUC_TYPE (reduc_info) == FOLD_LEFT_REDUCTION
8457 || STMT_VINFO_REDUC_TYPE (reduc_info) == EXTRACT_LAST_REDUCTION)
8458 return true;
8459 vect_create_epilog_for_reduction (loop_vinfo, stmt_info, slp_node,
8460 slp_node_instance);
8461 return true;
8464 /* If STMT is not relevant and it is a simple assignment and its inputs are
8465 invariant then it can remain in place, unvectorized. The original last
8466 scalar value that it computes will be used. */
8467 if (!STMT_VINFO_RELEVANT_P (stmt_info))
8469 gcc_assert (is_simple_and_all_uses_invariant (stmt_info, loop_vinfo));
8470 if (dump_enabled_p ())
8471 dump_printf_loc (MSG_NOTE, vect_location,
8472 "statement is simple and uses invariant. Leaving in "
8473 "place.\n");
8474 return true;
8477 if (slp_node)
8478 ncopies = 1;
8479 else
8480 ncopies = vect_get_num_copies (loop_vinfo, vectype);
8482 if (slp_node)
8484 gcc_assert (slp_index >= 0);
8486 /* Get the last occurrence of the scalar index from the concatenation of
8487 all the slp vectors. Calculate which slp vector it is and the index
8488 within. */
8489 int num_scalar = SLP_TREE_LANES (slp_node);
8490 int num_vec = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
8491 poly_uint64 pos = (num_vec * nunits) - num_scalar + slp_index;
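  /* Illustrative example: with SLP_TREE_LANES == 2, nunits == 4 and two
     vector stmts, the concatenation has 8 lanes and the last copy of
     lane 1 sits at position 2*4 - 2 + 1 == 7, giving vec_entry == 1 and
     vec_index == 3 from the division below.  */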
8493 /* Calculate which vector contains the result, and which lane of
8494 that vector we need. */
8495 if (!can_div_trunc_p (pos, nunits, &vec_entry, &vec_index))
8497 if (dump_enabled_p ())
8498 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8499 "Cannot determine which vector holds the"
8500 " final result.\n");
8501 return false;
8505 if (!vec_stmt_p)
8507 /* No transformation required. */
8508 if (loop_vinfo && LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
8510 if (!direct_internal_fn_supported_p (IFN_EXTRACT_LAST, vectype,
8511 OPTIMIZE_FOR_SPEED))
8513 if (dump_enabled_p ())
8514 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8515 "can't operate on partial vectors "
8516 "because the target doesn't support extract "
8517 "last reduction.\n");
8518 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
8520 else if (slp_node)
8522 if (dump_enabled_p ())
8523 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8524 "can't operate on partial vectors "
8525 "because an SLP statement is live after "
8526 "the loop.\n");
8527 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
8529 else if (ncopies > 1)
8531 if (dump_enabled_p ())
8532 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8533 "can't operate on partial vectors "
8534 "because ncopies is greater than 1.\n");
8535 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
8537 else
8539 gcc_assert (ncopies == 1 && !slp_node);
8540 vect_record_loop_mask (loop_vinfo,
8541 &LOOP_VINFO_MASKS (loop_vinfo),
8542 1, vectype, NULL);
8545 /* ??? Enable for loop costing as well. */
8546 if (!loop_vinfo)
8547 record_stmt_cost (cost_vec, 1, vec_to_scalar, stmt_info, NULL_TREE,
8548 0, vect_epilogue);
8549 return true;
8552 /* Use the lhs of the original scalar statement. */
8553 gimple *stmt = vect_orig_stmt (stmt_info)->stmt;
8554 if (dump_enabled_p ())
8555 dump_printf_loc (MSG_NOTE, vect_location, "extracting lane for live "
8556 "stmt %G", stmt);
8558 lhs = gimple_get_lhs (stmt);
8559 lhs_type = TREE_TYPE (lhs);
8561 bitsize = vector_element_bits_tree (vectype);
8562 vec_bitsize = TYPE_SIZE (vectype);
8564 /* Get the vectorized lhs of STMT and the lane to use (counted in bits). */
8565 tree vec_lhs, bitstart;
8566 gimple *vec_stmt;
8567 if (slp_node)
8569 gcc_assert (!loop_vinfo || !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo));
8571 /* Get the correct slp vectorized stmt. */
8572 vec_stmt = SLP_TREE_VEC_STMTS (slp_node)[vec_entry];
8573 vec_lhs = gimple_get_lhs (vec_stmt);
8575 /* Get entry to use. */
8576 bitstart = bitsize_int (vec_index);
8577 bitstart = int_const_binop (MULT_EXPR, bitsize, bitstart);
8579 else
8581 /* For multiple copies, get the last copy. */
8582 vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info).last ();
8583 vec_lhs = gimple_get_lhs (vec_stmt);
8585 /* Get the last lane in the vector. */
8586 bitstart = int_const_binop (MINUS_EXPR, vec_bitsize, bitsize);
8589 if (loop_vinfo)
8591 /* To ensure that VEC_LHS for the lane-extraction stmts satisfies the
8592 loop-closed PHI requirement, insert one PHI node for it. It looks like:
8593 loop;
8595 # lhs' = PHI <lhs>
8597 loop;
8599 # vec_lhs' = PHI <vec_lhs>
8600 new_tree = lane_extract <vec_lhs', ...>;
8601 lhs' = new_tree; */
8603 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8604 basic_block exit_bb = single_exit (loop)->dest;
8605 gcc_assert (single_pred_p (exit_bb));
8607 tree vec_lhs_phi = copy_ssa_name (vec_lhs);
8608 gimple *phi = create_phi_node (vec_lhs_phi, exit_bb);
8609 SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, vec_lhs);
8611 gimple_seq stmts = NULL;
8612 tree new_tree;
8613 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
8615 /* Emit:
8617 SCALAR_RES = EXTRACT_LAST <VEC_LHS, MASK>
8619 where VEC_LHS is the vectorized live-out result and MASK is
8620 the loop mask for the final iteration. */
8621 gcc_assert (ncopies == 1 && !slp_node);
8622 tree scalar_type = TREE_TYPE (STMT_VINFO_VECTYPE (stmt_info));
8623 tree mask = vect_get_loop_mask (gsi, &LOOP_VINFO_MASKS (loop_vinfo),
8624 1, vectype, 0);
8625 tree scalar_res = gimple_build (&stmts, CFN_EXTRACT_LAST, scalar_type,
8626 mask, vec_lhs_phi);
8628 /* Convert the extracted vector element to the scalar type. */
8629 new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
8631 else
8633 tree bftype = TREE_TYPE (vectype);
8634 if (VECTOR_BOOLEAN_TYPE_P (vectype))
8635 bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1);
8636 new_tree = build3 (BIT_FIELD_REF, bftype,
8637 vec_lhs_phi, bitsize, bitstart);
8638 new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree),
8639 &stmts, true, NULL_TREE);
8642 if (stmts)
8644 gimple_stmt_iterator exit_gsi = gsi_after_labels (exit_bb);
8645 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
8647 /* Remove existing phi from lhs and create one copy from new_tree. */
8648 tree lhs_phi = NULL_TREE;
8649 gimple_stmt_iterator gsi;
8650 for (gsi = gsi_start_phis (exit_bb);
8651 !gsi_end_p (gsi); gsi_next (&gsi))
8653 gimple *phi = gsi_stmt (gsi);
8654 if (gimple_phi_arg_def (phi, 0) == lhs)
8656 remove_phi_node (&gsi, false);
8657 lhs_phi = gimple_phi_result (phi);
8658 gimple *copy = gimple_build_assign (lhs_phi, new_tree);
8659 gsi_insert_before (&exit_gsi, copy, GSI_SAME_STMT);
8660 break;
8665 /* Replace uses of lhs with the newly computed result. If the use stmt is
8666 a single-arg PHI, just replace all uses of the PHI result. This is needed
8667 because the lcssa PHI defining lhs may come before the newly inserted stmt. */
8668 use_operand_p use_p;
8669 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
8670 if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt))
8671 && !is_gimple_debug (use_stmt))
8673 if (gimple_code (use_stmt) == GIMPLE_PHI
8674 && gimple_phi_num_args (use_stmt) == 1)
8676 replace_uses_by (gimple_phi_result (use_stmt), new_tree);
8678 else
8680 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
8681 SET_USE (use_p, new_tree);
8683 update_stmt (use_stmt);
8686 else
8688 /* For basic-block vectorization simply insert the lane-extraction. */
8689 tree bftype = TREE_TYPE (vectype);
8690 if (VECTOR_BOOLEAN_TYPE_P (vectype))
8691 bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1);
8692 tree new_tree = build3 (BIT_FIELD_REF, bftype,
8693 vec_lhs, bitsize, bitstart);
8694 gimple_seq stmts = NULL;
8695 new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree),
8696 &stmts, true, NULL_TREE);
8697 if (TREE_CODE (new_tree) == SSA_NAME
8698 && SSA_NAME_OCCURS_IN_ABNORMAL_PHI (lhs))
8699 SSA_NAME_OCCURS_IN_ABNORMAL_PHI (new_tree) = 1;
8700 if (is_a <gphi *> (vec_stmt))
8702 gimple_stmt_iterator si = gsi_after_labels (gimple_bb (vec_stmt));
8703 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
8705 else
8707 gimple_stmt_iterator si = gsi_for_stmt (vec_stmt);
8708 gsi_insert_seq_after (&si, stmts, GSI_SAME_STMT);
8711 /* Replace uses of lhs with the newly computed result. If the use stmt is
8712 a single-arg PHI, just replace all uses of the PHI result. This is needed
8713 because the lcssa PHI defining lhs may come before the newly inserted stmt. */
8714 use_operand_p use_p;
8715 stmt_vec_info use_stmt_info;
8716 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
8717 if (!is_gimple_debug (use_stmt)
8718 && (!(use_stmt_info = vinfo->lookup_stmt (use_stmt))
8719 || !PURE_SLP_STMT (vect_stmt_to_vectorize (use_stmt_info))))
8721 /* ??? This can happen when the live lane ends up being
8722 used in a vector construction code-generated by an
8723 external SLP node (and code-generation for that already
8724 happened). See gcc.dg/vect/bb-slp-47.c.
8725 Doing this is what would happen if that vector CTOR
8726 were not code-generated yet so it is not too bad.
8727 ??? In fact we'd likely want to avoid this situation
8728 in the first place. */
8729 if (TREE_CODE (new_tree) == SSA_NAME
8730 && !SSA_NAME_IS_DEFAULT_DEF (new_tree)
8731 && gimple_code (use_stmt) != GIMPLE_PHI
8732 && !vect_stmt_dominates_stmt_p (SSA_NAME_DEF_STMT (new_tree),
8733 use_stmt))
8735 enum tree_code code = gimple_assign_rhs_code (use_stmt);
8736 gcc_assert (code == CONSTRUCTOR
8737 || code == VIEW_CONVERT_EXPR
8738 || CONVERT_EXPR_CODE_P (code));
8739 if (dump_enabled_p ())
8740 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8741 "Using original scalar computation for "
8742 "live lane because use preceeds vector "
8743 "def\n");
8744 continue;
8746 /* ??? It can also happen that we end up pulling a def into
8747 a loop where replacing out-of-loop uses would require
8748 a new LC SSA PHI node. Retain the original scalar in
8749 those cases as well. PR98064. */
8750 if (TREE_CODE (new_tree) == SSA_NAME
8751 && !SSA_NAME_IS_DEFAULT_DEF (new_tree)
8752 && (gimple_bb (use_stmt)->loop_father
8753 != gimple_bb (vec_stmt)->loop_father)
8754 && !flow_loop_nested_p (gimple_bb (vec_stmt)->loop_father,
8755 gimple_bb (use_stmt)->loop_father))
8757 if (dump_enabled_p ())
8758 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8759 "Using original scalar computation for "
8760 "live lane because there is an out-of-loop "
8761 "definition for it\n");
8762 continue;
8764 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
8765 SET_USE (use_p, new_tree);
8766 update_stmt (use_stmt);
8770 return true;
8773 /* Kill any debug uses outside LOOP of SSA names defined in STMT_INFO. */
8775 static void
8776 vect_loop_kill_debug_uses (class loop *loop, stmt_vec_info stmt_info)
8778 ssa_op_iter op_iter;
8779 imm_use_iterator imm_iter;
8780 def_operand_p def_p;
8781 gimple *ustmt;
8783 FOR_EACH_PHI_OR_STMT_DEF (def_p, stmt_info->stmt, op_iter, SSA_OP_DEF)
8785 FOR_EACH_IMM_USE_STMT (ustmt, imm_iter, DEF_FROM_PTR (def_p))
8787 basic_block bb;
8789 if (!is_gimple_debug (ustmt))
8790 continue;
8792 bb = gimple_bb (ustmt);
8794 if (!flow_bb_inside_loop_p (loop, bb))
8796 if (gimple_debug_bind_p (ustmt))
8798 if (dump_enabled_p ())
8799 dump_printf_loc (MSG_NOTE, vect_location,
8800 "killing debug use\n");
8802 gimple_debug_bind_reset_value (ustmt);
8803 update_stmt (ustmt);
8805 else
8806 gcc_unreachable ();
8812 /* Given loop represented by LOOP_VINFO, return true if computation of
8813 LOOP_VINFO_NITERS (= LOOP_VINFO_NITERSM1 + 1) doesn't overflow, false
8814 otherwise. */
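/* Illustrative example: if NITERS has a 32-bit unsigned type and the
   loop iterates 1 << 32 times, NITERSM1 is 0xffffffff but NITERSM1 + 1
   wraps to zero, so the function returns false; any smaller bound on
   the iteration count makes it return true.  */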
8816 static bool
8817 loop_niters_no_overflow (loop_vec_info loop_vinfo)
8819 /* Constant case. */
8820 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
8822 tree cst_niters = LOOP_VINFO_NITERS (loop_vinfo);
8823 tree cst_nitersm1 = LOOP_VINFO_NITERSM1 (loop_vinfo);
8825 gcc_assert (TREE_CODE (cst_niters) == INTEGER_CST);
8826 gcc_assert (TREE_CODE (cst_nitersm1) == INTEGER_CST);
8827 if (wi::to_widest (cst_nitersm1) < wi::to_widest (cst_niters))
8828 return true;
8831 widest_int max;
8832 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8833 /* Check the upper bound of loop niters. */
8834 if (get_max_loop_iterations (loop, &max))
8836 tree type = TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo));
8837 signop sgn = TYPE_SIGN (type);
8838 widest_int type_max = widest_int::from (wi::max_value (type), sgn);
8839 if (max < type_max)
8840 return true;
8842 return false;
8845 /* Return a mask type with half the number of elements as OLD_TYPE,
8846 given that it should have mode NEW_MODE. */
8848 tree
8849 vect_halve_mask_nunits (tree old_type, machine_mode new_mode)
8851 poly_uint64 nunits = exact_div (TYPE_VECTOR_SUBPARTS (old_type), 2);
8852 return build_truth_vector_type_for_mode (nunits, new_mode);
8855 /* Return a mask type with twice as many elements as OLD_TYPE,
8856 given that it should have mode NEW_MODE. */
8858 tree
8859 vect_double_mask_nunits (tree old_type, machine_mode new_mode)
8861 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (old_type) * 2;
8862 return build_truth_vector_type_for_mode (nunits, new_mode);
8865 /* Record that a fully-masked version of LOOP_VINFO would need MASKS to
8866 contain a sequence of NVECTORS masks that each control a vector of type
8867 VECTYPE. If SCALAR_MASK is nonnull, the fully-masked loop would AND
8868 these vector masks with the vector version of SCALAR_MASK. */
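/* Illustrative example: with a vectorization factor of 4, a statement
   that needs two V4SI masks per iteration is recorded in the rgroup at
   index 1 with 2*4/4 == 2 scalars per iteration, whereas a statement
   needing a single V4SI mask uses the rgroup at index 0 with one scalar
   per iteration.  */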
8870 void
8871 vect_record_loop_mask (loop_vec_info loop_vinfo, vec_loop_masks *masks,
8872 unsigned int nvectors, tree vectype, tree scalar_mask)
8874 gcc_assert (nvectors != 0);
8875 if (masks->length () < nvectors)
8876 masks->safe_grow_cleared (nvectors, true);
8877 rgroup_controls *rgm = &(*masks)[nvectors - 1];
8878 /* The number of scalars per iteration and the number of vectors are
8879 both compile-time constants. */
8880 unsigned int nscalars_per_iter
8881 = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
8882 LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
8884 if (scalar_mask)
8886 scalar_cond_masked_key cond (scalar_mask, nvectors);
8887 loop_vinfo->scalar_cond_masked_set.add (cond);
8890 if (rgm->max_nscalars_per_iter < nscalars_per_iter)
8892 rgm->max_nscalars_per_iter = nscalars_per_iter;
8893 rgm->type = truth_type_for (vectype);
8894 rgm->factor = 1;
8898 /* Given a complete set of masks MASKS, extract mask number INDEX
8899 for an rgroup that operates on NVECTORS vectors of type VECTYPE,
8900 where 0 <= INDEX < NVECTORS. Insert any set-up statements before GSI.
8902 See the comment above vec_loop_masks for more details about the mask
8903 arrangement. */
8905 tree
8906 vect_get_loop_mask (gimple_stmt_iterator *gsi, vec_loop_masks *masks,
8907 unsigned int nvectors, tree vectype, unsigned int index)
8909 rgroup_controls *rgm = &(*masks)[nvectors - 1];
8910 tree mask_type = rgm->type;
8912 /* Populate the rgroup's mask array, if this is the first time we've
8913 used it. */
8914 if (rgm->controls.is_empty ())
8916 rgm->controls.safe_grow_cleared (nvectors, true);
8917 for (unsigned int i = 0; i < nvectors; ++i)
8919 tree mask = make_temp_ssa_name (mask_type, NULL, "loop_mask");
8920 /* Provide a dummy definition until the real one is available. */
8921 SSA_NAME_DEF_STMT (mask) = gimple_build_nop ();
8922 rgm->controls[i] = mask;
8926 tree mask = rgm->controls[index];
8927 if (maybe_ne (TYPE_VECTOR_SUBPARTS (mask_type),
8928 TYPE_VECTOR_SUBPARTS (vectype)))
8930 /* A loop mask for data type X can be reused for data type Y
8931 if X has N times more elements than Y and if Y's elements
8932 are N times bigger than X's. In this case each sequence
8933 of N elements in the loop mask will be all-zero or all-one.
8934 We can then view-convert the mask so that each sequence of
8935 N elements is replaced by a single element. */
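      /* Illustrative example: a mask recorded for eight HI elements can
	 control four SI elements; each pair of mask elements is known to
	 be all-zero or all-one, so the VIEW_CONVERT_EXPR below folds
	 every pair into a single element of the SI mask type.  */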
8936 gcc_assert (multiple_p (TYPE_VECTOR_SUBPARTS (mask_type),
8937 TYPE_VECTOR_SUBPARTS (vectype)));
8938 gimple_seq seq = NULL;
8939 mask_type = truth_type_for (vectype);
8940 mask = gimple_build (&seq, VIEW_CONVERT_EXPR, mask_type, mask);
8941 if (seq)
8942 gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
8944 return mask;
8947 /* Record that LOOP_VINFO would need LENS to contain a sequence of NVECTORS
8948 lengths for controlling an operation on VECTYPE. The operation splits
8949 each element of VECTYPE into FACTOR separate subelements, measuring the
8950 length as a number of these subelements. */
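/* For instance (illustrative): if V4SI accesses are handled by falling
   back to a V16QI length, FACTOR is 4 and the recorded length counts
   QI subelements (bytes) rather than SI elements.  */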
8952 void
8953 vect_record_loop_len (loop_vec_info loop_vinfo, vec_loop_lens *lens,
8954 unsigned int nvectors, tree vectype, unsigned int factor)
8956 gcc_assert (nvectors != 0);
8957 if (lens->length () < nvectors)
8958 lens->safe_grow_cleared (nvectors, true);
8959 rgroup_controls *rgl = &(*lens)[nvectors - 1];
8961 /* The number of scalars per iteration, the number of bytes occupied by a
8962 scalar and the number of vectors are all compile-time constants. */
8963 unsigned int nscalars_per_iter
8964 = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
8965 LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
8967 if (rgl->max_nscalars_per_iter < nscalars_per_iter)
8969 /* For now, we only support cases in which all loads and stores fall back
8970 to VnQI or none do. */
8971 gcc_assert (!rgl->max_nscalars_per_iter
8972 || (rgl->factor == 1 && factor == 1)
8973 || (rgl->max_nscalars_per_iter * rgl->factor
8974 == nscalars_per_iter * factor));
8975 rgl->max_nscalars_per_iter = nscalars_per_iter;
8976 rgl->type = vectype;
8977 rgl->factor = factor;
8981 /* Given a complete set of length LENS, extract length number INDEX for an
8982 rgroup that operates on NVECTORS vectors, where 0 <= INDEX < NVECTORS. */
8984 tree
8985 vect_get_loop_len (loop_vec_info loop_vinfo, vec_loop_lens *lens,
8986 unsigned int nvectors, unsigned int index)
8988 rgroup_controls *rgl = &(*lens)[nvectors - 1];
8990 /* Populate the rgroup's len array, if this is the first time we've
8991 used it. */
8992 if (rgl->controls.is_empty ())
8994 rgl->controls.safe_grow_cleared (nvectors, true);
8995 for (unsigned int i = 0; i < nvectors; ++i)
8997 tree len_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
8998 gcc_assert (len_type != NULL_TREE);
8999 tree len = make_temp_ssa_name (len_type, NULL, "loop_len");
9001 /* Provide a dummy definition until the real one is available. */
9002 SSA_NAME_DEF_STMT (len) = gimple_build_nop ();
9003 rgl->controls[i] = len;
9007 return rgl->controls[index];
9010 /* Scale the profile counters of LOOP, which is vectorized by factor VF,
9011 based on the estimated iteration count of the vectorized loop. */
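/* Illustrative example: if the scalar loop was estimated to iterate
   about 100 times and VF == 4, the vector loop is expected to iterate
   roughly 25 times, so the body counts are scaled down accordingly and
   the exit edge gets a probability of roughly 1/(25 + 1).  */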
9013 static void
9014 scale_profile_for_vect_loop (class loop *loop, unsigned vf)
9016 edge preheader = loop_preheader_edge (loop);
9017 /* Reduce loop iterations by the vectorization factor. */
9018 gcov_type new_est_niter = niter_for_unrolled_loop (loop, vf);
9019 profile_count freq_h = loop->header->count, freq_e = preheader->count ();
9021 if (freq_h.nonzero_p ())
9023 profile_probability p;
9025 /* Avoid dropping loop body profile counter to 0 because of zero count
9026 in loop's preheader. */
9027 if (!(freq_e == profile_count::zero ()))
9028 freq_e = freq_e.force_nonzero ();
9029 p = freq_e.apply_scale (new_est_niter + 1, 1).probability_in (freq_h);
9030 scale_loop_frequencies (loop, p);
9033 edge exit_e = single_exit (loop);
9034 exit_e->probability = profile_probability::always ()
9035 .apply_scale (1, new_est_niter + 1);
9037 edge exit_l = single_pred_edge (loop->latch);
9038 profile_probability prob = exit_l->probability;
9039 exit_l->probability = exit_e->probability.invert ();
9040 if (prob.initialized_p () && exit_l->probability.initialized_p ())
9041 scale_bbs_frequencies (&loop->latch, 1, exit_l->probability / prob);
9044 /* For a vectorized stmt DEF_STMT_INFO adjust all vectorized PHI
9045 latch edge values originally defined by it. */
9047 static void
9048 maybe_set_vectorized_backedge_value (loop_vec_info loop_vinfo,
9049 stmt_vec_info def_stmt_info)
9051 tree def = gimple_get_lhs (vect_orig_stmt (def_stmt_info)->stmt);
9052 if (!def || TREE_CODE (def) != SSA_NAME)
9053 return;
9054 stmt_vec_info phi_info;
9055 imm_use_iterator iter;
9056 use_operand_p use_p;
9057 FOR_EACH_IMM_USE_FAST (use_p, iter, def)
9058 if (gphi *phi = dyn_cast <gphi *> (USE_STMT (use_p)))
9059 if (gimple_bb (phi)->loop_father->header == gimple_bb (phi)
9060 && (phi_info = loop_vinfo->lookup_stmt (phi))
9061 && VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (phi_info))
9062 && STMT_VINFO_REDUC_TYPE (phi_info) != FOLD_LEFT_REDUCTION
9063 && STMT_VINFO_REDUC_TYPE (phi_info) != EXTRACT_LAST_REDUCTION)
9065 loop_p loop = gimple_bb (phi)->loop_father;
9066 edge e = loop_latch_edge (loop);
9067 if (PHI_ARG_DEF_FROM_EDGE (phi, e) == def)
9069 vec<gimple *> &phi_defs = STMT_VINFO_VEC_STMTS (phi_info);
9070 vec<gimple *> &latch_defs = STMT_VINFO_VEC_STMTS (def_stmt_info);
9071 gcc_assert (phi_defs.length () == latch_defs.length ());
9072 for (unsigned i = 0; i < phi_defs.length (); ++i)
9073 add_phi_arg (as_a <gphi *> (phi_defs[i]),
9074 gimple_get_lhs (latch_defs[i]), e,
9075 gimple_phi_arg_location (phi, e->dest_idx));
9080 /* Vectorize STMT_INFO if relevant, inserting any new instructions before GSI.
9081 When vectorizing STMT_INFO as a store, set *SEEN_STORE to its
9082 stmt_vec_info. */
9084 static bool
9085 vect_transform_loop_stmt (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
9086 gimple_stmt_iterator *gsi, stmt_vec_info *seen_store)
9088 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
9089 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
9091 if (dump_enabled_p ())
9092 dump_printf_loc (MSG_NOTE, vect_location,
9093 "------>vectorizing statement: %G", stmt_info->stmt);
9095 if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
9096 vect_loop_kill_debug_uses (loop, stmt_info);
9098 if (!STMT_VINFO_RELEVANT_P (stmt_info)
9099 && !STMT_VINFO_LIVE_P (stmt_info))
9100 return false;
9102 if (STMT_VINFO_VECTYPE (stmt_info))
9104 poly_uint64 nunits
9105 = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
9106 if (!STMT_SLP_TYPE (stmt_info)
9107 && maybe_ne (nunits, vf)
9108 && dump_enabled_p ())
9109 /* For SLP, VF is set according to the unrolling factor and not
9110 to the vector size, hence for SLP this print is not valid. */
9111 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
9114 /* Pure SLP statements have already been vectorized. We still need
9115 to apply loop vectorization to hybrid SLP statements. */
9116 if (PURE_SLP_STMT (stmt_info))
9117 return false;
9119 if (dump_enabled_p ())
9120 dump_printf_loc (MSG_NOTE, vect_location, "transform statement.\n");
9122 if (vect_transform_stmt (loop_vinfo, stmt_info, gsi, NULL, NULL))
9123 *seen_store = stmt_info;
9125 return true;
9128 /* Helper function to pass to simplify_replace_tree to enable replacing
9129 trees found in the hash_map with their corresponding values. */
9131 static tree
9132 find_in_mapping (tree t, void *context)
9134 hash_map<tree,tree>* mapping = (hash_map<tree, tree>*) context;
9136 tree *value = mapping->get (t);
9137 return value ? *value : t;
9140 /* Update EPILOGUE's loop_vec_info. EPILOGUE was constructed as a copy of the
9141 original loop that has now been vectorized.
9143 The inits of the data_references need to be advanced with the number of
9144 iterations of the main loop. This has been computed in vect_do_peeling and
9145 is stored in parameter ADVANCE. We first restore the data_references
9146 initial offset with the values recorded in ORIG_DRS_INIT.
9148 Since the loop_vec_info of this EPILOGUE was constructed for the original
9149 loop, its stmt_vec_infos all point to the original statements. These need
9150 to be updated to point to their corresponding copies as well as the SSA_NAMES
9151 in their PATTERN_DEF_SEQs and RELATED_STMTs.
9153 The data_reference's connections also need to be updated. Their
9154 corresponding dr_vec_info need to be reconnected to the EPILOGUE's
9155 stmt_vec_infos, their statements need to point to their corresponding copy,
9156 if they are gather loads or scatter stores then their reference needs to be
9157 updated to point to its corresponding copy and finally we set
9158 'base_misaligned' to false as we have already peeled for alignment in the
9159 prologue of the main loop. */
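/* Illustrative example: if the prologue plus the vectorized main loop
   consumed N scalar iterations, a data reference for a[i] in the
   epilogue must start at a[N]; ADVANCE carries that count and
   vect_update_inits_of_drs applies it below.  */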
9161 static void
9162 update_epilogue_loop_vinfo (class loop *epilogue, tree advance)
9164 loop_vec_info epilogue_vinfo = loop_vec_info_for_loop (epilogue);
9165 auto_vec<gimple *> stmt_worklist;
9166 hash_map<tree,tree> mapping;
9167 gimple *orig_stmt, *new_stmt;
9168 gimple_stmt_iterator epilogue_gsi;
9169 gphi_iterator epilogue_phi_gsi;
9170 stmt_vec_info stmt_vinfo = NULL, related_vinfo;
9171 basic_block *epilogue_bbs = get_loop_body (epilogue);
9172 unsigned i;
9174 free (LOOP_VINFO_BBS (epilogue_vinfo));
9175 LOOP_VINFO_BBS (epilogue_vinfo) = epilogue_bbs;
9177 /* Advance data_reference's with the number of iterations of the previous
9178 loop and its prologue. */
9179 vect_update_inits_of_drs (epilogue_vinfo, advance, PLUS_EXPR);
9182 /* The EPILOGUE loop is a copy of the original loop so they share the same
9183 gimple UIDs. In this loop we update the loop_vec_info of the EPILOGUE to
9184 point to the copied statements. We also create a mapping of all LHS' in
9185 the original loop and all the LHS' in the EPILOGUE and create worklists to
9186 update the STMT_VINFO_PATTERN_DEF_SEQs and STMT_VINFO_RELATED_STMTs. */
9187 for (unsigned i = 0; i < epilogue->num_nodes; ++i)
9189 for (epilogue_phi_gsi = gsi_start_phis (epilogue_bbs[i]);
9190 !gsi_end_p (epilogue_phi_gsi); gsi_next (&epilogue_phi_gsi))
9192 new_stmt = epilogue_phi_gsi.phi ();
9194 gcc_assert (gimple_uid (new_stmt) > 0);
9195 stmt_vinfo
9196 = epilogue_vinfo->stmt_vec_infos[gimple_uid (new_stmt) - 1];
9198 orig_stmt = STMT_VINFO_STMT (stmt_vinfo);
9199 STMT_VINFO_STMT (stmt_vinfo) = new_stmt;
9201 mapping.put (gimple_phi_result (orig_stmt),
9202 gimple_phi_result (new_stmt));
9203 /* PHI nodes cannot have patterns or related statements. */
9204 gcc_assert (STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo) == NULL
9205 && STMT_VINFO_RELATED_STMT (stmt_vinfo) == NULL);
9208 for (epilogue_gsi = gsi_start_bb (epilogue_bbs[i]);
9209 !gsi_end_p (epilogue_gsi); gsi_next (&epilogue_gsi))
9211 new_stmt = gsi_stmt (epilogue_gsi);
9212 if (is_gimple_debug (new_stmt))
9213 continue;
9215 gcc_assert (gimple_uid (new_stmt) > 0);
9216 stmt_vinfo
9217 = epilogue_vinfo->stmt_vec_infos[gimple_uid (new_stmt) - 1];
9219 orig_stmt = STMT_VINFO_STMT (stmt_vinfo);
9220 STMT_VINFO_STMT (stmt_vinfo) = new_stmt;
9222 if (tree old_lhs = gimple_get_lhs (orig_stmt))
9223 mapping.put (old_lhs, gimple_get_lhs (new_stmt));
9225 if (STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo))
9227 gimple_seq seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo);
9228 for (gimple_stmt_iterator gsi = gsi_start (seq);
9229 !gsi_end_p (gsi); gsi_next (&gsi))
9230 stmt_worklist.safe_push (gsi_stmt (gsi));
9233 related_vinfo = STMT_VINFO_RELATED_STMT (stmt_vinfo);
9234 if (related_vinfo != NULL && related_vinfo != stmt_vinfo)
9236 gimple *stmt = STMT_VINFO_STMT (related_vinfo);
9237 stmt_worklist.safe_push (stmt);
9238 /* Set BB such that the assert in
9239 'get_initial_def_for_reduction' is able to determine that
9240 the BB of the related stmt is inside this loop. */
9241 gimple_set_bb (stmt,
9242 gimple_bb (new_stmt));
9243 related_vinfo = STMT_VINFO_RELATED_STMT (related_vinfo);
9244 gcc_assert (related_vinfo == NULL
9245 || related_vinfo == stmt_vinfo);
9250 /* The PATTERN_DEF_SEQs and RELATED_STMTs in the epilogue were constructed
9251 using the original main loop and thus need to be updated to refer to the
9252 cloned variables used in the epilogue. */
9253 for (unsigned i = 0; i < stmt_worklist.length (); ++i)
9255 gimple *stmt = stmt_worklist[i];
9256 tree *new_op;
9258 for (unsigned j = 1; j < gimple_num_ops (stmt); ++j)
9260 tree op = gimple_op (stmt, j);
9261 if ((new_op = mapping.get(op)))
9262 gimple_set_op (stmt, j, *new_op);
9263 else
9265 /* PR92429: The last argument of simplify_replace_tree disables
9266 folding when replacing arguments. This is required as
9267 otherwise you might end up with different statements than the
9268 ones analyzed in vect_loop_analyze, leading to different
9269 vectorization. */
9270 op = simplify_replace_tree (op, NULL_TREE, NULL_TREE,
9271 &find_in_mapping, &mapping, false);
9272 gimple_set_op (stmt, j, op);
9277 struct data_reference *dr;
9278 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (epilogue_vinfo);
9279 FOR_EACH_VEC_ELT (datarefs, i, dr)
9281 orig_stmt = DR_STMT (dr);
9282 gcc_assert (gimple_uid (orig_stmt) > 0);
9283 stmt_vinfo = epilogue_vinfo->stmt_vec_infos[gimple_uid (orig_stmt) - 1];
9284 /* Data references for gather loads and scatter stores do not use the
9285 updated offset we set using ADVANCE. Instead we have to make sure the
9286 reference in each data reference points to the corresponding copy of
9287 the original in the epilogue. */
9288 if (STMT_VINFO_MEMORY_ACCESS_TYPE (vect_stmt_to_vectorize (stmt_vinfo))
9289 == VMAT_GATHER_SCATTER)
9291 DR_REF (dr)
9292 = simplify_replace_tree (DR_REF (dr), NULL_TREE, NULL_TREE,
9293 &find_in_mapping, &mapping);
9294 DR_BASE_ADDRESS (dr)
9295 = simplify_replace_tree (DR_BASE_ADDRESS (dr), NULL_TREE, NULL_TREE,
9296 &find_in_mapping, &mapping);
9298 DR_STMT (dr) = STMT_VINFO_STMT (stmt_vinfo);
9299 stmt_vinfo->dr_aux.stmt = stmt_vinfo;
9300 /* The vector size of the epilogue is smaller than that of the main loop
9301 so the alignment is either the same or lower. This means the dr will
9302 by definition be aligned. */
9303 STMT_VINFO_DR_INFO (stmt_vinfo)->base_misaligned = false;
9306 epilogue_vinfo->shared->datarefs_copy.release ();
9307 epilogue_vinfo->shared->save_datarefs ();
9310 /* Function vect_transform_loop.
9312 The analysis phase has determined that the loop is vectorizable.
9313 Vectorize the loop - create vectorized stmts to replace the scalar
9314 stmts in the loop, and update the loop exit condition.
9315 Returns scalar epilogue loop if any. */
9317 class loop *
9318 vect_transform_loop (loop_vec_info loop_vinfo, gimple *loop_vectorized_call)
9320 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
9321 class loop *epilogue = NULL;
9322 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
9323 int nbbs = loop->num_nodes;
9324 int i;
9325 tree niters_vector = NULL_TREE;
9326 tree step_vector = NULL_TREE;
9327 tree niters_vector_mult_vf = NULL_TREE;
9328 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
9329 unsigned int lowest_vf = constant_lower_bound (vf);
9330 gimple *stmt;
9331 bool check_profitability = false;
9332 unsigned int th;
9334 DUMP_VECT_SCOPE ("vec_transform_loop");
9336 loop_vinfo->shared->check_datarefs ();
9338 /* Use the more conservative vectorization threshold. If the number
9339 of iterations is constant assume the cost check has been performed
9340 by our caller. If the threshold makes all loops profitable that
9341 run at least the (estimated) vectorization factor number of times
9342 checking is pointless, too. */
9343 th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
9344 if (vect_apply_runtime_profitability_check_p (loop_vinfo))
9346 if (dump_enabled_p ())
9347 dump_printf_loc (MSG_NOTE, vect_location,
9348 "Profitability threshold is %d loop iterations.\n",
9349 th);
9350 check_profitability = true;
9353 /* Make sure there exists a single-predecessor exit bb. Do this before
9354 versioning. */
9355 edge e = single_exit (loop);
9356 if (! single_pred_p (e->dest))
9358 split_loop_exit_edge (e, true);
9359 if (dump_enabled_p ())
9360 dump_printf (MSG_NOTE, "split exit edge\n");
9363 /* Version the loop first, if required, so the profitability check
9364 comes first. */
9366 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
9368 class loop *sloop
9369 = vect_loop_versioning (loop_vinfo, loop_vectorized_call);
9370 sloop->force_vectorize = false;
9371 check_profitability = false;
9374 /* Make sure there exists a single-predecessor exit bb also on the
9375 scalar loop copy. Do this after versioning but before peeling
9376 so CFG structure is fine for both scalar and if-converted loop
9377 to make slpeel_duplicate_current_defs_from_edges face matched
9378 loop closed PHI nodes on the exit. */
9379 if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))
9381 e = single_exit (LOOP_VINFO_SCALAR_LOOP (loop_vinfo));
9382 if (! single_pred_p (e->dest))
9384 split_loop_exit_edge (e, true);
9385 if (dump_enabled_p ())
9386 dump_printf (MSG_NOTE, "split exit edge of scalar loop\n");
9390 tree niters = vect_build_loop_niters (loop_vinfo);
9391 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = niters;
9392 tree nitersm1 = unshare_expr (LOOP_VINFO_NITERSM1 (loop_vinfo));
9393 bool niters_no_overflow = loop_niters_no_overflow (loop_vinfo);
9394 tree advance;
9395 drs_init_vec orig_drs_init;
9397 epilogue = vect_do_peeling (loop_vinfo, niters, nitersm1, &niters_vector,
9398 &step_vector, &niters_vector_mult_vf, th,
9399 check_profitability, niters_no_overflow,
9400 &advance);
9402 if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo)
9403 && LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo).initialized_p ())
9404 scale_loop_frequencies (LOOP_VINFO_SCALAR_LOOP (loop_vinfo),
9405 LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo));
9407 if (niters_vector == NULL_TREE)
9409 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
9410 && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
9411 && known_eq (lowest_vf, vf))
9413 niters_vector
9414 = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
9415 LOOP_VINFO_INT_NITERS (loop_vinfo) / lowest_vf);
9416 step_vector = build_one_cst (TREE_TYPE (niters));
9418 else if (vect_use_loop_mask_for_alignment_p (loop_vinfo))
9419 vect_gen_vector_loop_niters (loop_vinfo, niters, &niters_vector,
9420 &step_vector, niters_no_overflow);
9421 else
9422 /* vect_do_peeling subtracted the number of peeled prologue
9423 iterations from LOOP_VINFO_NITERS. */
9424 vect_gen_vector_loop_niters (loop_vinfo, LOOP_VINFO_NITERS (loop_vinfo),
9425 &niters_vector, &step_vector,
9426 niters_no_overflow);
9429 /* 1) Make sure the loop header has exactly two entries
9430 2) Make sure we have a preheader basic block. */
9432 gcc_assert (EDGE_COUNT (loop->header->preds) == 2);
9434 split_edge (loop_preheader_edge (loop));
9436 if (vect_use_loop_mask_for_alignment_p (loop_vinfo))
9437 /* This will deal with any possible peeling. */
9438 vect_prepare_for_masked_peels (loop_vinfo);
9440 /* Schedule the SLP instances first, then handle loop vectorization
9441 below. */
9442 if (!loop_vinfo->slp_instances.is_empty ())
9444 DUMP_VECT_SCOPE ("scheduling SLP instances");
9445 vect_schedule_slp (loop_vinfo, LOOP_VINFO_SLP_INSTANCES (loop_vinfo));
9448 /* FORNOW: the vectorizer supports only loops whose body consists
9449 of one basic block (header + empty latch). When the vectorizer
9450 supports more involved loop forms, the order in which the BBs are
9451 traversed needs to be reconsidered. */
9453 for (i = 0; i < nbbs; i++)
9455 basic_block bb = bbs[i];
9456 stmt_vec_info stmt_info;
9458 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
9459 gsi_next (&si))
9461 gphi *phi = si.phi ();
9462 if (dump_enabled_p ())
9463 dump_printf_loc (MSG_NOTE, vect_location,
9464 "------>vectorizing phi: %G", phi);
9465 stmt_info = loop_vinfo->lookup_stmt (phi);
9466 if (!stmt_info)
9467 continue;
9469 if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
9470 vect_loop_kill_debug_uses (loop, stmt_info);
9472 if (!STMT_VINFO_RELEVANT_P (stmt_info)
9473 && !STMT_VINFO_LIVE_P (stmt_info))
9474 continue;
9476 if (STMT_VINFO_VECTYPE (stmt_info)
9477 && (maybe_ne
9478 (TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info)), vf))
9479 && dump_enabled_p ())
9480 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
9482 if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
9483 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
9484 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def
9485 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle
9486 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def)
9487 && ! PURE_SLP_STMT (stmt_info))
9489 if (dump_enabled_p ())
9490 dump_printf_loc (MSG_NOTE, vect_location, "transform phi.\n");
9491 vect_transform_stmt (loop_vinfo, stmt_info, NULL, NULL, NULL);
9495 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
9496 gsi_next (&si))
9498 gphi *phi = si.phi ();
9499 stmt_info = loop_vinfo->lookup_stmt (phi);
9500 if (!stmt_info)
9501 continue;
9503 if (!STMT_VINFO_RELEVANT_P (stmt_info)
9504 && !STMT_VINFO_LIVE_P (stmt_info))
9505 continue;
9507 if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
9508 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
9509 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def
9510 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle
9511 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def)
9512 && ! PURE_SLP_STMT (stmt_info))
9513 maybe_set_vectorized_backedge_value (loop_vinfo, stmt_info);
9516 for (gimple_stmt_iterator si = gsi_start_bb (bb);
9517 !gsi_end_p (si);)
9519 stmt = gsi_stmt (si);
9520 /* During vectorization remove existing clobber stmts. */
9521 if (gimple_clobber_p (stmt))
9523 unlink_stmt_vdef (stmt);
9524 gsi_remove (&si, true);
9525 release_defs (stmt);
9527 else
9529 /* Ignore vector stmts created in the outer loop. */
9530 stmt_info = loop_vinfo->lookup_stmt (stmt);
9532 /* vector stmts created in the outer-loop during vectorization of
9533 stmts in an inner-loop may not have a stmt_info, and do not
9534 need to be vectorized. */
9535 stmt_vec_info seen_store = NULL;
9536 if (stmt_info)
9538 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
9540 gimple *def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
9541 for (gimple_stmt_iterator subsi = gsi_start (def_seq);
9542 !gsi_end_p (subsi); gsi_next (&subsi))
9544 stmt_vec_info pat_stmt_info
9545 = loop_vinfo->lookup_stmt (gsi_stmt (subsi));
9546 vect_transform_loop_stmt (loop_vinfo, pat_stmt_info,
9547 &si, &seen_store);
9549 stmt_vec_info pat_stmt_info
9550 = STMT_VINFO_RELATED_STMT (stmt_info);
9551 if (vect_transform_loop_stmt (loop_vinfo, pat_stmt_info,
9552 &si, &seen_store))
9553 maybe_set_vectorized_backedge_value (loop_vinfo,
9554 pat_stmt_info);
9556 else
9558 if (vect_transform_loop_stmt (loop_vinfo, stmt_info, &si,
9559 &seen_store))
9560 maybe_set_vectorized_backedge_value (loop_vinfo,
9561 stmt_info);
9564 gsi_next (&si);
9565 if (seen_store)
9567 if (STMT_VINFO_GROUPED_ACCESS (seen_store))
9568 /* Interleaving. The vectorization of the
9569 interleaving chain was completed - free all
9570 the stores in the chain. */
9571 vect_remove_stores (loop_vinfo,
9572 DR_GROUP_FIRST_ELEMENT (seen_store));
9573 else
9574 /* Free the attached stmt_vec_info and remove the stmt. */
9575 loop_vinfo->remove_stmt (stmt_info);
9580 /* Stub out scalar statements that must not survive vectorization.
9581 Doing this here helps with grouped statements, or statements that
9582 are involved in patterns. */
9583 for (gimple_stmt_iterator gsi = gsi_start_bb (bb);
9584 !gsi_end_p (gsi); gsi_next (&gsi))
9586 gcall *call = dyn_cast <gcall *> (gsi_stmt (gsi));
9587 if (call && gimple_call_internal_p (call, IFN_MASK_LOAD))
9589 tree lhs = gimple_get_lhs (call);
9590 if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
9592 tree zero = build_zero_cst (TREE_TYPE (lhs));
9593 gimple *new_stmt = gimple_build_assign (lhs, zero);
9594 gsi_replace (&gsi, new_stmt, true);
9598 } /* BBs in loop */
9600 /* The vectorization factor is always > 1, so if we use an IV increment of 1,
9601 a zero NITERS becomes a nonzero NITERS_VECTOR. */
9602 if (integer_onep (step_vector))
9603 niters_no_overflow = true;
9604 vect_set_loop_condition (loop, loop_vinfo, niters_vector, step_vector,
9605 niters_vector_mult_vf, !niters_no_overflow);
9607 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
9608 scale_profile_for_vect_loop (loop, assumed_vf);
9610 /* True if the final iteration might not handle a full vector's
9611 worth of scalar iterations. */
9612 bool final_iter_may_be_partial
9613 = LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo);
9614 /* The minimum number of iterations performed by the epilogue. This
9615 is 1 when peeling for gaps because we always need a final scalar
9616 iteration. */
9617 int min_epilogue_iters = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0;
9618 /* +1 to convert latch counts to loop iteration counts,
9619 -min_epilogue_iters to remove iterations that cannot be performed
9620 by the vector code. */
9621 int bias_for_lowest = 1 - min_epilogue_iters;
9622 int bias_for_assumed = bias_for_lowest;
9623 int alignment_npeels = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
9624 if (alignment_npeels && LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
9626 /* When the amount of peeling is known at compile time, the first
9627 iteration will have exactly alignment_npeels active elements.
9628 In the worst case it will have at least one. */
9629 int min_first_active = (alignment_npeels > 0 ? alignment_npeels : 1);
9630 bias_for_lowest += lowest_vf - min_first_active;
9631 bias_for_assumed += assumed_vf - min_first_active;
9633 /* In these calculations the "- 1" converts loop iteration counts
9634 back to latch counts. */
9635 if (loop->any_upper_bound)
9636 loop->nb_iterations_upper_bound
9637 = (final_iter_may_be_partial
9638 ? wi::udiv_ceil (loop->nb_iterations_upper_bound + bias_for_lowest,
9639 lowest_vf) - 1
9640 : wi::udiv_floor (loop->nb_iterations_upper_bound + bias_for_lowest,
9641 lowest_vf) - 1);
9642 if (loop->any_likely_upper_bound)
9643 loop->nb_iterations_likely_upper_bound
9644 = (final_iter_may_be_partial
9645 ? wi::udiv_ceil (loop->nb_iterations_likely_upper_bound
9646 + bias_for_lowest, lowest_vf) - 1
9647 : wi::udiv_floor (loop->nb_iterations_likely_upper_bound
9648 + bias_for_lowest, lowest_vf) - 1);
9649 if (loop->any_estimate)
9650 loop->nb_iterations_estimate
9651 = (final_iter_may_be_partial
9652 ? wi::udiv_ceil (loop->nb_iterations_estimate + bias_for_assumed,
9653 assumed_vf) - 1
9654 : wi::udiv_floor (loop->nb_iterations_estimate + bias_for_assumed,
9655 assumed_vf) - 1);
9657 if (dump_enabled_p ())
9659 if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
9661 dump_printf_loc (MSG_NOTE, vect_location,
9662 "LOOP VECTORIZED\n");
9663 if (loop->inner)
9664 dump_printf_loc (MSG_NOTE, vect_location,
9665 "OUTER LOOP VECTORIZED\n");
9666 dump_printf (MSG_NOTE, "\n");
9668 else
9669 dump_printf_loc (MSG_NOTE, vect_location,
9670 "LOOP EPILOGUE VECTORIZED (MODE=%s)\n",
9671 GET_MODE_NAME (loop_vinfo->vector_mode));
9674 /* Loops vectorized with a variable factor won't benefit from
9675 unrolling/peeling. */
9676 if (!vf.is_constant ())
9678 loop->unroll = 1;
9679 if (dump_enabled_p ())
9680 dump_printf_loc (MSG_NOTE, vect_location, "Disabling unrolling due to"
9681 " variable-length vectorization factor\n");
9683 /* Free SLP instances here because otherwise stmt reference counting
9684 won't work. */
9685 slp_instance instance;
9686 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
9687 vect_free_slp_instance (instance);
9688 LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
9689 /* Clear the safelen field since its value is invalid after vectorization:
9690 the vectorized loop can have loop-carried dependencies. */
9691 loop->safelen = 0;
9693 if (epilogue)
9695 update_epilogue_loop_vinfo (epilogue, advance);
9697 epilogue->simduid = loop->simduid;
9698 epilogue->force_vectorize = loop->force_vectorize;
9699 epilogue->dont_vectorize = false;
9702 return epilogue;
9705 /* The code below is trying to perform a simple optimization - revert
9706 if-conversion for masked stores: if the mask of a store is zero, do not
9707 perform the store and, where possible, skip the stored-value producers too.
9708 For example,
9709 for (i=0; i<n; i++)
9710 if (c[i])
9712 p1[i] += 1;
9713 p2[i] = p3[i] +2;
9715 this transformation will produce the following semi-hammock:
9717 if (!mask__ifc__42.18_165 == { 0, 0, 0, 0, 0, 0, 0, 0 })
9719 vect__11.19_170 = MASK_LOAD (vectp_p1.20_168, 0B, mask__ifc__42.18_165);
9720 vect__12.22_172 = vect__11.19_170 + vect_cst__171;
9721 MASK_STORE (vectp_p1.23_175, 0B, mask__ifc__42.18_165, vect__12.22_172);
9722 vect__18.25_182 = MASK_LOAD (vectp_p3.26_180, 0B, mask__ifc__42.18_165);
9723 vect__19.28_184 = vect__18.25_182 + vect_cst__183;
9724 MASK_STORE (vectp_p2.29_187, 0B, mask__ifc__42.18_165, vect__19.28_184);
9728 void
9729 optimize_mask_stores (class loop *loop)
9731 basic_block *bbs = get_loop_body (loop);
9732 unsigned nbbs = loop->num_nodes;
9733 unsigned i;
9734 basic_block bb;
9735 class loop *bb_loop;
9736 gimple_stmt_iterator gsi;
9737 gimple *stmt;
9738 auto_vec<gimple *> worklist;
9739 auto_purge_vect_location sentinel;
9741 vect_location = find_loop_location (loop);
9742 /* Pick up all masked stores in loop if any. */
9743 for (i = 0; i < nbbs; i++)
9745 bb = bbs[i];
9746 for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
9747 gsi_next (&gsi))
9749 stmt = gsi_stmt (gsi);
9750 if (gimple_call_internal_p (stmt, IFN_MASK_STORE))
9751 worklist.safe_push (stmt);
9755 free (bbs);
9756 if (worklist.is_empty ())
9757 return;
9759 /* Loop has masked stores. */
9760 while (!worklist.is_empty ())
9762 gimple *last, *last_store;
9763 edge e, efalse;
9764 tree mask;
9765 basic_block store_bb, join_bb;
9766 gimple_stmt_iterator gsi_to;
9767 tree vdef, new_vdef;
9768 gphi *phi;
9769 tree vectype;
9770 tree zero;
9772 last = worklist.pop ();
9773 mask = gimple_call_arg (last, 2);
9774 bb = gimple_bb (last);
9775 /* Create then_bb and the if-then structure in the CFG; then_bb belongs
9776 to the same loop as if_bb. It can be different from LOOP when a
9777 two-level loop nest is vectorized and the mask_store belongs to the
9778 inner one. */
9779 e = split_block (bb, last);
9780 bb_loop = bb->loop_father;
9781 gcc_assert (loop == bb_loop || flow_loop_nested_p (loop, bb_loop));
9782 join_bb = e->dest;
9783 store_bb = create_empty_bb (bb);
9784 add_bb_to_loop (store_bb, bb_loop);
9785 e->flags = EDGE_TRUE_VALUE;
9786 efalse = make_edge (bb, store_bb, EDGE_FALSE_VALUE);
9788 /* Put STORE_BB on the unlikely path. */
9788 efalse->probability = profile_probability::unlikely ();
9789 store_bb->count = efalse->count ();
9790 make_single_succ_edge (store_bb, join_bb, EDGE_FALLTHRU);
9791 if (dom_info_available_p (CDI_DOMINATORS))
9792 set_immediate_dominator (CDI_DOMINATORS, store_bb, bb);
9793 if (dump_enabled_p ())
9794 dump_printf_loc (MSG_NOTE, vect_location,
9795 "Create new block %d to sink mask stores.",
9796 store_bb->index);
9797 /* Create vector comparison with boolean result. */
9798 vectype = TREE_TYPE (mask);
9799 zero = build_zero_cst (vectype);
9800 stmt = gimple_build_cond (EQ_EXPR, mask, zero, NULL_TREE, NULL_TREE);
9801 gsi = gsi_last_bb (bb);
9802 gsi_insert_after (&gsi, stmt, GSI_SAME_STMT);
9803 /* Create new PHI node for vdef of the last masked store:
9804 .MEM_2 = VDEF <.MEM_1>
9805 will be converted to
9806 .MEM.3 = VDEF <.MEM_1>
9807 and new PHI node will be created in join bb
9808 .MEM_2 = PHI <.MEM_1, .MEM_3>
9810 vdef = gimple_vdef (last);
9811 new_vdef = make_ssa_name (gimple_vop (cfun), last);
9812 gimple_set_vdef (last, new_vdef);
9813 phi = create_phi_node (vdef, join_bb);
9814 add_phi_arg (phi, new_vdef, EDGE_SUCC (store_bb, 0), UNKNOWN_LOCATION);
9816 /* Put all masked stores with the same mask to STORE_BB if possible. */
9817 while (true)
9819 gimple_stmt_iterator gsi_from;
9820 gimple *stmt1 = NULL;
9822 /* Move masked store to STORE_BB. */
9823 last_store = last;
9824 gsi = gsi_for_stmt (last);
9825 gsi_from = gsi;
9826 /* Shift GSI to the previous stmt for further traversal. */
9827 gsi_prev (&gsi);
9828 gsi_to = gsi_start_bb (store_bb);
9829 gsi_move_before (&gsi_from, &gsi_to);
9830 /* Setup GSI_TO to the non-empty block start. */
9831 gsi_to = gsi_start_bb (store_bb);
9832 if (dump_enabled_p ())
9833 dump_printf_loc (MSG_NOTE, vect_location,
9834 "Move stmt to created bb\n%G", last);
9835 /* Move all stored value producers if possible. */
9836 while (!gsi_end_p (gsi))
9838 tree lhs;
9839 imm_use_iterator imm_iter;
9840 use_operand_p use_p;
9841 bool res;
9843 /* Skip debug statements. */
9844 if (is_gimple_debug (gsi_stmt (gsi)))
9846 gsi_prev (&gsi);
9847 continue;
9849 stmt1 = gsi_stmt (gsi);
9850 /* Do not consider statements writing to memory or having
9851 volatile operand. */
9852 if (gimple_vdef (stmt1)
9853 || gimple_has_volatile_ops (stmt1))
9854 break;
9855 gsi_from = gsi;
9856 gsi_prev (&gsi);
9857 lhs = gimple_get_lhs (stmt1);
9858 if (!lhs)
9859 break;
9861 /* LHS of vectorized stmt must be SSA_NAME. */
9862 if (TREE_CODE (lhs) != SSA_NAME)
9863 break;
9865 if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
9867 /* Remove dead scalar statement. */
9868 if (has_zero_uses (lhs))
9870 gsi_remove (&gsi_from, true);
9871 continue;
9875 /* Check that LHS does not have uses outside of STORE_BB. */
9876 res = true;
9877 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
9879 gimple *use_stmt;
9880 use_stmt = USE_STMT (use_p);
9881 if (is_gimple_debug (use_stmt))
9882 continue;
9883 if (gimple_bb (use_stmt) != store_bb)
9885 res = false;
9886 break;
9889 if (!res)
9890 break;
9892 if (gimple_vuse (stmt1)
9893 && gimple_vuse (stmt1) != gimple_vuse (last_store))
9894 break;
9896 /* Can move STMT1 to STORE_BB. */
9897 if (dump_enabled_p ())
9898 dump_printf_loc (MSG_NOTE, vect_location,
9899 "Move stmt to created bb\n%G", stmt1);
9900 gsi_move_before (&gsi_from, &gsi_to);
9901 /* Shift GSI_TO for further insertion. */
9902 gsi_prev (&gsi_to);
9904 /* Put other masked stores with the same mask to STORE_BB. */
9905 if (worklist.is_empty ()
9906 || gimple_call_arg (worklist.last (), 2) != mask
9907 || worklist.last () != stmt1)
9908 break;
9909 last = worklist.pop ();
9911 add_phi_arg (phi, gimple_vuse (last_store), e, UNKNOWN_LOCATION);
9915 /* Decide whether it is possible to use a zero-based induction variable
9916 when vectorizing LOOP_VINFO with partial vectors. If it is, return
9917 the value that the induction variable must be able to hold in order
9918 to ensure that the rgroups eventually have no active vector elements.
9919 Return -1 otherwise. */
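/* Illustrative example: for a loop with at most 1002 latch iterations,
   VF == 4 and neither peeling nor mask skipping, the IV must be able to
   reach (1002 & -4) + 4 == 1004 so that the rgroups eventually have no
   active elements.  */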
9921 widest_int
9922 vect_iv_limit_for_partial_vectors (loop_vec_info loop_vinfo)
9924 tree niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
9925 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
9926 unsigned HOST_WIDE_INT max_vf = vect_max_vf (loop_vinfo);
9928 /* Calculate the value that the induction variable must be able
9929 to hit in order to ensure that we end the loop with an all-false mask.
9930 This involves adding the maximum number of inactive trailing scalar
9931 iterations. */
9932 widest_int iv_limit = -1;
9933 if (max_loop_iterations (loop, &iv_limit))
9935 if (niters_skip)
9937 /* Add the maximum number of skipped iterations to the
9938 maximum iteration count. */
9939 if (TREE_CODE (niters_skip) == INTEGER_CST)
9940 iv_limit += wi::to_widest (niters_skip);
9941 else
9942 iv_limit += max_vf - 1;
9944 else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo))
9945 /* Make a conservatively-correct assumption. */
9946 iv_limit += max_vf - 1;
9948 /* IV_LIMIT is the maximum number of latch iterations, which is also
9949 the maximum in-range IV value. Round this value down to the previous
9950 vector alignment boundary and then add an extra full iteration. */
9951 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
9952 iv_limit = (iv_limit & -(int) known_alignment (vf)) + max_vf;
9954 return iv_limit;
9957 /* For the given rgroup_controls RGC, check whether an induction variable
9958 would ever hit a value that produces a set of all-false masks or zero
9959 lengths before wrapping around. Return true if it's possible to wrap
9960 around before hitting the desirable value, otherwise return false. */
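/* Illustrative example: with an IV limit of 1004 and an rgroup whose
   nitems is 4, the IV counts up to 1004 * 4 == 4016, which needs 12
   bits; a compare type with fewer than 12 bits of precision might wrap
   before reaching an all-false state, so the function returns true.  */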
9962 bool
9963 vect_rgroup_iv_might_wrap_p (loop_vec_info loop_vinfo, rgroup_controls *rgc)
9965 widest_int iv_limit = vect_iv_limit_for_partial_vectors (loop_vinfo);
9967 if (iv_limit == -1)
9968 return true;
9970 tree compare_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
9971 unsigned int compare_precision = TYPE_PRECISION (compare_type);
9972 unsigned nitems = rgc->max_nscalars_per_iter * rgc->factor;
9974 if (wi::min_precision (iv_limit * nitems, UNSIGNED) > compare_precision)
9975 return true;
9977 return false;