Using UNSPEC for vector compare to mask register.
[official-gcc.git] / gcc / tree-vect-loop.c
blobdba230f63204911b11fb70095ed50edc9f90db8e
1 /* Loop Vectorization
2 Copyright (C) 2003-2020 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com> and
4 Ira Rosen <irar@il.ibm.com>
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "rtl.h"
28 #include "tree.h"
29 #include "gimple.h"
30 #include "cfghooks.h"
31 #include "tree-pass.h"
32 #include "ssa.h"
33 #include "optabs-tree.h"
34 #include "diagnostic-core.h"
35 #include "fold-const.h"
36 #include "stor-layout.h"
37 #include "cfganal.h"
38 #include "gimplify.h"
39 #include "gimple-iterator.h"
40 #include "gimplify-me.h"
41 #include "tree-ssa-loop-ivopts.h"
42 #include "tree-ssa-loop-manip.h"
43 #include "tree-ssa-loop-niter.h"
44 #include "tree-ssa-loop.h"
45 #include "cfgloop.h"
46 #include "tree-scalar-evolution.h"
47 #include "tree-vectorizer.h"
48 #include "gimple-fold.h"
49 #include "cgraph.h"
50 #include "tree-cfg.h"
51 #include "tree-if-conv.h"
52 #include "internal-fn.h"
53 #include "tree-vector-builder.h"
54 #include "vec-perm-indices.h"
55 #include "tree-eh.h"
57 /* Loop Vectorization Pass.
59 This pass tries to vectorize loops.
61 For example, the vectorizer transforms the following simple loop:
63 short a[N]; short b[N]; short c[N]; int i;
65 for (i=0; i<N; i++){
66 a[i] = b[i] + c[i];
69 as if it was manually vectorized by rewriting the source code into:
71 typedef int __attribute__((mode(V8HI))) v8hi;
72 short a[N]; short b[N]; short c[N]; int i;
73 v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
74 v8hi va, vb, vc;
76 for (i=0; i<N/8; i++){
77 vb = pb[i];
78 vc = pc[i];
79 va = vb + vc;
80 pa[i] = va;
83 The main entry to this pass is vectorize_loops(), in which
84 the vectorizer applies a set of analyses on a given set of loops,
85 followed by the actual vectorization transformation for the loops that
86 had successfully passed the analysis phase.
87 Throughout this pass we make a distinction between two types of
88 data: scalars (which are represented by SSA_NAMES), and memory references
89 ("data-refs"). These two types of data require different handling both
90 during analysis and transformation. The types of data-refs that the
91 vectorizer currently supports are ARRAY_REFS which base is an array DECL
92 (not a pointer), and INDIRECT_REFS through pointers; both array and pointer
93 accesses are required to have a simple (consecutive) access pattern.
95 Analysis phase:
96 ===============
97 The driver for the analysis phase is vect_analyze_loop().
98 It applies a set of analyses, some of which rely on the scalar evolution
99 analyzer (scev) developed by Sebastian Pop.
101 During the analysis phase the vectorizer records some information
102 per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
103 loop, as well as general information about the loop as a whole, which is
104 recorded in a "loop_vec_info" struct attached to each loop.
106 Transformation phase:
107 =====================
108 The loop transformation phase scans all the stmts in the loop, and
109 creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
110 the loop that needs to be vectorized. It inserts the vector code sequence
111 just before the scalar stmt S, and records a pointer to the vector code
112 in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
113 attached to S). This pointer will be used for the vectorization of following
114 stmts which use the def of stmt S. Stmt S is removed if it writes to memory;
115 otherwise, we rely on dead code elimination for removing it.
117 For example, say stmt S1 was vectorized into stmt VS1:
119 VS1: vb = px[i];
120 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
121 S2: a = b;
123 To vectorize stmt S2, the vectorizer first finds the stmt that defines
124 the operand 'b' (S1), and gets the relevant vector def 'vb' from the
125 vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)). The
126 resulting sequence would be:
128 VS1: vb = px[i];
129 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
130 VS2: va = vb;
131 S2: a = b; STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2
133 Operands that are not SSA_NAMEs, are data-refs that appear in
134 load/store operations (like 'x[i]' in S1), and are handled differently.
136 Target modeling:
137 =================
138 Currently the only target specific information that is used is the
139 size of the vector (in bytes) - "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".
140 Targets that can support different sizes of vectors, for now will need
141 to specify one value for "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD". More
142 flexibility will be added in the future.
144 Since we only vectorize operations which vector form can be
145 expressed using existing tree codes, to verify that an operation is
146 supported, the vectorizer checks the relevant optab at the relevant
147 machine_mode (e.g, optab_handler (add_optab, V8HImode)). If
148 the value found is CODE_FOR_nothing, then there's no target support, and
149 we can't vectorize the stmt.
151 For additional information on this project see:
152 http://gcc.gnu.org/projects/tree-ssa/vectorization.html
155 static void vect_estimate_min_profitable_iters (loop_vec_info, int *, int *);
156 static stmt_vec_info vect_is_simple_reduction (loop_vec_info, stmt_vec_info,
157 bool *, bool *);
159 /* Subroutine of vect_determine_vf_for_stmt that handles only one
160 statement. VECTYPE_MAYBE_SET_P is true if STMT_VINFO_VECTYPE
161 may already be set for general statements (not just data refs). */
163 static opt_result
164 vect_determine_vf_for_stmt_1 (vec_info *vinfo, stmt_vec_info stmt_info,
165 bool vectype_maybe_set_p,
166 poly_uint64 *vf)
168 gimple *stmt = stmt_info->stmt;
170 if ((!STMT_VINFO_RELEVANT_P (stmt_info)
171 && !STMT_VINFO_LIVE_P (stmt_info))
172 || gimple_clobber_p (stmt))
174 if (dump_enabled_p ())
175 dump_printf_loc (MSG_NOTE, vect_location, "skip.\n");
176 return opt_result::success ();
179 tree stmt_vectype, nunits_vectype;
180 opt_result res = vect_get_vector_types_for_stmt (vinfo, stmt_info,
181 &stmt_vectype,
182 &nunits_vectype);
183 if (!res)
184 return res;
186 if (stmt_vectype)
188 if (STMT_VINFO_VECTYPE (stmt_info))
189 /* The only case when a vectype had been already set is for stmts
190 that contain a data ref, or for "pattern-stmts" (stmts generated
191 by the vectorizer to represent/replace a certain idiom). */
192 gcc_assert ((STMT_VINFO_DATA_REF (stmt_info)
193 || vectype_maybe_set_p)
194 && STMT_VINFO_VECTYPE (stmt_info) == stmt_vectype);
195 else
196 STMT_VINFO_VECTYPE (stmt_info) = stmt_vectype;
199 if (nunits_vectype)
200 vect_update_max_nunits (vf, nunits_vectype);
202 return opt_result::success ();
205 /* Subroutine of vect_determine_vectorization_factor. Set the vector
206 types of STMT_INFO and all attached pattern statements and update
207 the vectorization factor VF accordingly. Return true on success
208 or false if something prevented vectorization. */
210 static opt_result
211 vect_determine_vf_for_stmt (vec_info *vinfo,
212 stmt_vec_info stmt_info, poly_uint64 *vf)
214 if (dump_enabled_p ())
215 dump_printf_loc (MSG_NOTE, vect_location, "==> examining statement: %G",
216 stmt_info->stmt);
217 opt_result res = vect_determine_vf_for_stmt_1 (vinfo, stmt_info, false, vf);
218 if (!res)
219 return res;
221 if (STMT_VINFO_IN_PATTERN_P (stmt_info)
222 && STMT_VINFO_RELATED_STMT (stmt_info))
224 gimple *pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
225 stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
227 /* If a pattern statement has def stmts, analyze them too. */
228 for (gimple_stmt_iterator si = gsi_start (pattern_def_seq);
229 !gsi_end_p (si); gsi_next (&si))
231 stmt_vec_info def_stmt_info = vinfo->lookup_stmt (gsi_stmt (si));
232 if (dump_enabled_p ())
233 dump_printf_loc (MSG_NOTE, vect_location,
234 "==> examining pattern def stmt: %G",
235 def_stmt_info->stmt);
236 res = vect_determine_vf_for_stmt_1 (vinfo, def_stmt_info, true, vf);
237 if (!res)
238 return res;
241 if (dump_enabled_p ())
242 dump_printf_loc (MSG_NOTE, vect_location,
243 "==> examining pattern statement: %G",
244 stmt_info->stmt);
245 res = vect_determine_vf_for_stmt_1 (vinfo, stmt_info, true, vf);
246 if (!res)
247 return res;
250 return opt_result::success ();
253 /* Function vect_determine_vectorization_factor
255 Determine the vectorization factor (VF). VF is the number of data elements
256 that are operated upon in parallel in a single iteration of the vectorized
257 loop. For example, when vectorizing a loop that operates on 4byte elements,
258 on a target with vector size (VS) 16byte, the VF is set to 4, since 4
259 elements can fit in a single vector register.
261 We currently support vectorization of loops in which all types operated upon
262 are of the same size. Therefore this function currently sets VF according to
263 the size of the types operated upon, and fails if there are multiple sizes
264 in the loop.
266 VF is also the factor by which the loop iterations are strip-mined, e.g.:
267 original loop:
268 for (i=0; i<N; i++){
269 a[i] = b[i] + c[i];
272 vectorized loop:
273 for (i=0; i<N; i+=VF){
274 a[i:VF] = b[i:VF] + c[i:VF];
278 static opt_result
279 vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
281 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
282 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
283 unsigned nbbs = loop->num_nodes;
284 poly_uint64 vectorization_factor = 1;
285 tree scalar_type = NULL_TREE;
286 gphi *phi;
287 tree vectype;
288 stmt_vec_info stmt_info;
289 unsigned i;
291 DUMP_VECT_SCOPE ("vect_determine_vectorization_factor");
293 for (i = 0; i < nbbs; i++)
295 basic_block bb = bbs[i];
297 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
298 gsi_next (&si))
300 phi = si.phi ();
301 stmt_info = loop_vinfo->lookup_stmt (phi);
302 if (dump_enabled_p ())
303 dump_printf_loc (MSG_NOTE, vect_location, "==> examining phi: %G",
304 phi);
306 gcc_assert (stmt_info);
308 if (STMT_VINFO_RELEVANT_P (stmt_info)
309 || STMT_VINFO_LIVE_P (stmt_info))
311 gcc_assert (!STMT_VINFO_VECTYPE (stmt_info));
312 scalar_type = TREE_TYPE (PHI_RESULT (phi));
314 if (dump_enabled_p ())
315 dump_printf_loc (MSG_NOTE, vect_location,
316 "get vectype for scalar type: %T\n",
317 scalar_type);
319 vectype = get_vectype_for_scalar_type (loop_vinfo, scalar_type);
320 if (!vectype)
321 return opt_result::failure_at (phi,
322 "not vectorized: unsupported "
323 "data-type %T\n",
324 scalar_type);
325 STMT_VINFO_VECTYPE (stmt_info) = vectype;
327 if (dump_enabled_p ())
328 dump_printf_loc (MSG_NOTE, vect_location, "vectype: %T\n",
329 vectype);
331 if (dump_enabled_p ())
333 dump_printf_loc (MSG_NOTE, vect_location, "nunits = ");
334 dump_dec (MSG_NOTE, TYPE_VECTOR_SUBPARTS (vectype));
335 dump_printf (MSG_NOTE, "\n");
338 vect_update_max_nunits (&vectorization_factor, vectype);
342 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
343 gsi_next (&si))
345 if (is_gimple_debug (gsi_stmt (si)))
346 continue;
347 stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
348 opt_result res
349 = vect_determine_vf_for_stmt (loop_vinfo,
350 stmt_info, &vectorization_factor);
351 if (!res)
352 return res;
356 /* TODO: Analyze cost. Decide if worth while to vectorize. */
357 if (dump_enabled_p ())
359 dump_printf_loc (MSG_NOTE, vect_location, "vectorization factor = ");
360 dump_dec (MSG_NOTE, vectorization_factor);
361 dump_printf (MSG_NOTE, "\n");
364 if (known_le (vectorization_factor, 1U))
365 return opt_result::failure_at (vect_location,
366 "not vectorized: unsupported data-type\n");
367 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
368 return opt_result::success ();
372 /* Function vect_is_simple_iv_evolution.
374 FORNOW: A simple evolution of an induction variables in the loop is
375 considered a polynomial evolution. */
377 static bool
378 vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init,
379 tree * step)
381 tree init_expr;
382 tree step_expr;
383 tree evolution_part = evolution_part_in_loop_num (access_fn, loop_nb);
384 basic_block bb;
386 /* When there is no evolution in this loop, the evolution function
387 is not "simple". */
388 if (evolution_part == NULL_TREE)
389 return false;
391 /* When the evolution is a polynomial of degree >= 2
392 the evolution function is not "simple". */
393 if (tree_is_chrec (evolution_part))
394 return false;
396 step_expr = evolution_part;
397 init_expr = unshare_expr (initial_condition_in_loop_num (access_fn, loop_nb));
399 if (dump_enabled_p ())
400 dump_printf_loc (MSG_NOTE, vect_location, "step: %T, init: %T\n",
401 step_expr, init_expr);
403 *init = init_expr;
404 *step = step_expr;
406 if (TREE_CODE (step_expr) != INTEGER_CST
407 && (TREE_CODE (step_expr) != SSA_NAME
408 || ((bb = gimple_bb (SSA_NAME_DEF_STMT (step_expr)))
409 && flow_bb_inside_loop_p (get_loop (cfun, loop_nb), bb))
410 || (!INTEGRAL_TYPE_P (TREE_TYPE (step_expr))
411 && (!SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr))
412 || !flag_associative_math)))
413 && (TREE_CODE (step_expr) != REAL_CST
414 || !flag_associative_math))
416 if (dump_enabled_p ())
417 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
418 "step unknown.\n");
419 return false;
422 return true;
425 /* Return true if PHI, described by STMT_INFO, is the inner PHI in
426 what we are assuming is a double reduction. For example, given
427 a structure like this:
429 outer1:
430 x_1 = PHI <x_4(outer2), ...>;
433 inner:
434 x_2 = PHI <x_1(outer1), ...>;
436 x_3 = ...;
439 outer2:
440 x_4 = PHI <x_3(inner)>;
443 outer loop analysis would treat x_1 as a double reduction phi and
444 this function would then return true for x_2. */
446 static bool
447 vect_inner_phi_in_double_reduction_p (loop_vec_info loop_vinfo, gphi *phi)
449 use_operand_p use_p;
450 ssa_op_iter op_iter;
451 FOR_EACH_PHI_ARG (use_p, phi, op_iter, SSA_OP_USE)
452 if (stmt_vec_info def_info = loop_vinfo->lookup_def (USE_FROM_PTR (use_p)))
453 if (STMT_VINFO_DEF_TYPE (def_info) == vect_double_reduction_def)
454 return true;
455 return false;
458 /* Function vect_analyze_scalar_cycles_1.
460 Examine the cross iteration def-use cycles of scalar variables
461 in LOOP. LOOP_VINFO represents the loop that is now being
462 considered for vectorization (can be LOOP, or an outer-loop
463 enclosing LOOP). */
465 static void
466 vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, class loop *loop)
468 basic_block bb = loop->header;
469 tree init, step;
470 auto_vec<stmt_vec_info, 64> worklist;
471 gphi_iterator gsi;
472 bool double_reduc, reduc_chain;
474 DUMP_VECT_SCOPE ("vect_analyze_scalar_cycles");
476 /* First - identify all inductions. Reduction detection assumes that all the
477 inductions have been identified, therefore, this order must not be
478 changed. */
479 for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
481 gphi *phi = gsi.phi ();
482 tree access_fn = NULL;
483 tree def = PHI_RESULT (phi);
484 stmt_vec_info stmt_vinfo = loop_vinfo->lookup_stmt (phi);
486 if (dump_enabled_p ())
487 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G", phi);
489 /* Skip virtual phi's. The data dependences that are associated with
490 virtual defs/uses (i.e., memory accesses) are analyzed elsewhere. */
491 if (virtual_operand_p (def))
492 continue;
494 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_unknown_def_type;
496 /* Analyze the evolution function. */
497 access_fn = analyze_scalar_evolution (loop, def);
498 if (access_fn)
500 STRIP_NOPS (access_fn);
501 if (dump_enabled_p ())
502 dump_printf_loc (MSG_NOTE, vect_location,
503 "Access function of PHI: %T\n", access_fn);
504 STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
505 = initial_condition_in_loop_num (access_fn, loop->num);
506 STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo)
507 = evolution_part_in_loop_num (access_fn, loop->num);
510 if (!access_fn
511 || vect_inner_phi_in_double_reduction_p (loop_vinfo, phi)
512 || !vect_is_simple_iv_evolution (loop->num, access_fn, &init, &step)
513 || (LOOP_VINFO_LOOP (loop_vinfo) != loop
514 && TREE_CODE (step) != INTEGER_CST))
516 worklist.safe_push (stmt_vinfo);
517 continue;
520 gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
521 != NULL_TREE);
522 gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo) != NULL_TREE);
524 if (dump_enabled_p ())
525 dump_printf_loc (MSG_NOTE, vect_location, "Detected induction.\n");
526 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_induction_def;
530 /* Second - identify all reductions and nested cycles. */
531 while (worklist.length () > 0)
533 stmt_vec_info stmt_vinfo = worklist.pop ();
534 gphi *phi = as_a <gphi *> (stmt_vinfo->stmt);
535 tree def = PHI_RESULT (phi);
537 if (dump_enabled_p ())
538 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G", phi);
540 gcc_assert (!virtual_operand_p (def)
541 && STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_unknown_def_type);
543 stmt_vec_info reduc_stmt_info
544 = vect_is_simple_reduction (loop_vinfo, stmt_vinfo, &double_reduc,
545 &reduc_chain);
546 if (reduc_stmt_info)
548 STMT_VINFO_REDUC_DEF (stmt_vinfo) = reduc_stmt_info;
549 STMT_VINFO_REDUC_DEF (reduc_stmt_info) = stmt_vinfo;
550 if (double_reduc)
552 if (dump_enabled_p ())
553 dump_printf_loc (MSG_NOTE, vect_location,
554 "Detected double reduction.\n");
556 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_double_reduction_def;
557 STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_double_reduction_def;
559 else
561 if (loop != LOOP_VINFO_LOOP (loop_vinfo))
563 if (dump_enabled_p ())
564 dump_printf_loc (MSG_NOTE, vect_location,
565 "Detected vectorizable nested cycle.\n");
567 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_nested_cycle;
569 else
571 if (dump_enabled_p ())
572 dump_printf_loc (MSG_NOTE, vect_location,
573 "Detected reduction.\n");
575 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def;
576 STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_reduction_def;
577 /* Store the reduction cycles for possible vectorization in
578 loop-aware SLP if it was not detected as reduction
579 chain. */
580 if (! reduc_chain)
581 LOOP_VINFO_REDUCTIONS (loop_vinfo).safe_push
582 (reduc_stmt_info);
586 else
587 if (dump_enabled_p ())
588 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
589 "Unknown def-use cycle pattern.\n");
594 /* Function vect_analyze_scalar_cycles.
596 Examine the cross iteration def-use cycles of scalar variables, by
597 analyzing the loop-header PHIs of scalar variables. Classify each
598 cycle as one of the following: invariant, induction, reduction, unknown.
599 We do that for the loop represented by LOOP_VINFO, and also to its
600 inner-loop, if exists.
601 Examples for scalar cycles:
603 Example1: reduction:
605 loop1:
606 for (i=0; i<N; i++)
607 sum += a[i];
609 Example2: induction:
611 loop2:
612 for (i=0; i<N; i++)
613 a[i] = i; */
615 static void
616 vect_analyze_scalar_cycles (loop_vec_info loop_vinfo)
618 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
620 vect_analyze_scalar_cycles_1 (loop_vinfo, loop);
622 /* When vectorizing an outer-loop, the inner-loop is executed sequentially.
623 Reductions in such inner-loop therefore have different properties than
624 the reductions in the nest that gets vectorized:
625 1. When vectorized, they are executed in the same order as in the original
626 scalar loop, so we can't change the order of computation when
627 vectorizing them.
628 2. FIXME: Inner-loop reductions can be used in the inner-loop, so the
629 current checks are too strict. */
631 if (loop->inner)
632 vect_analyze_scalar_cycles_1 (loop_vinfo, loop->inner);
635 /* Transfer group and reduction information from STMT_INFO to its
636 pattern stmt. */
638 static void
639 vect_fixup_reduc_chain (stmt_vec_info stmt_info)
641 stmt_vec_info firstp = STMT_VINFO_RELATED_STMT (stmt_info);
642 stmt_vec_info stmtp;
643 gcc_assert (!REDUC_GROUP_FIRST_ELEMENT (firstp)
644 && REDUC_GROUP_FIRST_ELEMENT (stmt_info));
645 REDUC_GROUP_SIZE (firstp) = REDUC_GROUP_SIZE (stmt_info);
648 stmtp = STMT_VINFO_RELATED_STMT (stmt_info);
649 gcc_checking_assert (STMT_VINFO_DEF_TYPE (stmtp)
650 == STMT_VINFO_DEF_TYPE (stmt_info));
651 REDUC_GROUP_FIRST_ELEMENT (stmtp) = firstp;
652 stmt_info = REDUC_GROUP_NEXT_ELEMENT (stmt_info);
653 if (stmt_info)
654 REDUC_GROUP_NEXT_ELEMENT (stmtp)
655 = STMT_VINFO_RELATED_STMT (stmt_info);
657 while (stmt_info);
660 /* Fixup scalar cycles that now have their stmts detected as patterns. */
662 static void
663 vect_fixup_scalar_cycles_with_patterns (loop_vec_info loop_vinfo)
665 stmt_vec_info first;
666 unsigned i;
668 FOR_EACH_VEC_ELT (LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo), i, first)
669 if (STMT_VINFO_IN_PATTERN_P (first))
671 stmt_vec_info next = REDUC_GROUP_NEXT_ELEMENT (first);
672 while (next)
674 if (! STMT_VINFO_IN_PATTERN_P (next)
675 || STMT_VINFO_REDUC_IDX (STMT_VINFO_RELATED_STMT (next)) == -1)
676 break;
677 next = REDUC_GROUP_NEXT_ELEMENT (next);
679 /* If not all stmt in the chain are patterns or if we failed
680 to update STMT_VINFO_REDUC_IDX try to handle the chain
681 without patterns. */
682 if (! next
683 && STMT_VINFO_REDUC_IDX (STMT_VINFO_RELATED_STMT (first)) != -1)
685 vect_fixup_reduc_chain (first);
686 LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo)[i]
687 = STMT_VINFO_RELATED_STMT (first);
692 /* Function vect_get_loop_niters.
694 Determine how many iterations the loop is executed and place it
695 in NUMBER_OF_ITERATIONS. Place the number of latch iterations
696 in NUMBER_OF_ITERATIONSM1. Place the condition under which the
697 niter information holds in ASSUMPTIONS.
699 Return the loop exit condition. */
702 static gcond *
703 vect_get_loop_niters (class loop *loop, tree *assumptions,
704 tree *number_of_iterations, tree *number_of_iterationsm1)
706 edge exit = single_exit (loop);
707 class tree_niter_desc niter_desc;
708 tree niter_assumptions, niter, may_be_zero;
709 gcond *cond = get_loop_exit_condition (loop);
711 *assumptions = boolean_true_node;
712 *number_of_iterationsm1 = chrec_dont_know;
713 *number_of_iterations = chrec_dont_know;
714 DUMP_VECT_SCOPE ("get_loop_niters");
716 if (!exit)
717 return cond;
719 may_be_zero = NULL_TREE;
720 if (!number_of_iterations_exit_assumptions (loop, exit, &niter_desc, NULL)
721 || chrec_contains_undetermined (niter_desc.niter))
722 return cond;
724 niter_assumptions = niter_desc.assumptions;
725 may_be_zero = niter_desc.may_be_zero;
726 niter = niter_desc.niter;
728 if (may_be_zero && integer_zerop (may_be_zero))
729 may_be_zero = NULL_TREE;
731 if (may_be_zero)
733 if (COMPARISON_CLASS_P (may_be_zero))
735 /* Try to combine may_be_zero with assumptions, this can simplify
736 computation of niter expression. */
737 if (niter_assumptions && !integer_nonzerop (niter_assumptions))
738 niter_assumptions = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
739 niter_assumptions,
740 fold_build1 (TRUTH_NOT_EXPR,
741 boolean_type_node,
742 may_be_zero));
743 else
744 niter = fold_build3 (COND_EXPR, TREE_TYPE (niter), may_be_zero,
745 build_int_cst (TREE_TYPE (niter), 0),
746 rewrite_to_non_trapping_overflow (niter));
748 may_be_zero = NULL_TREE;
750 else if (integer_nonzerop (may_be_zero))
752 *number_of_iterationsm1 = build_int_cst (TREE_TYPE (niter), 0);
753 *number_of_iterations = build_int_cst (TREE_TYPE (niter), 1);
754 return cond;
756 else
757 return cond;
760 *assumptions = niter_assumptions;
761 *number_of_iterationsm1 = niter;
763 /* We want the number of loop header executions which is the number
764 of latch executions plus one.
765 ??? For UINT_MAX latch executions this number overflows to zero
766 for loops like do { n++; } while (n != 0); */
767 if (niter && !chrec_contains_undetermined (niter))
768 niter = fold_build2 (PLUS_EXPR, TREE_TYPE (niter), unshare_expr (niter),
769 build_int_cst (TREE_TYPE (niter), 1));
770 *number_of_iterations = niter;
772 return cond;
775 /* Function bb_in_loop_p
777 Used as predicate for dfs order traversal of the loop bbs. */
779 static bool
780 bb_in_loop_p (const_basic_block bb, const void *data)
782 const class loop *const loop = (const class loop *)data;
783 if (flow_bb_inside_loop_p (loop, bb))
784 return true;
785 return false;
789 /* Create and initialize a new loop_vec_info struct for LOOP_IN, as well as
790 stmt_vec_info structs for all the stmts in LOOP_IN. */
792 _loop_vec_info::_loop_vec_info (class loop *loop_in, vec_info_shared *shared)
793 : vec_info (vec_info::loop, init_cost (loop_in), shared),
794 loop (loop_in),
795 bbs (XCNEWVEC (basic_block, loop->num_nodes)),
796 num_itersm1 (NULL_TREE),
797 num_iters (NULL_TREE),
798 num_iters_unchanged (NULL_TREE),
799 num_iters_assumptions (NULL_TREE),
800 th (0),
801 versioning_threshold (0),
802 vectorization_factor (0),
803 max_vectorization_factor (0),
804 mask_skip_niters (NULL_TREE),
805 rgroup_compare_type (NULL_TREE),
806 simd_if_cond (NULL_TREE),
807 unaligned_dr (NULL),
808 peeling_for_alignment (0),
809 ptr_mask (0),
810 ivexpr_map (NULL),
811 scan_map (NULL),
812 slp_unrolling_factor (1),
813 single_scalar_iteration_cost (0),
814 vec_outside_cost (0),
815 vec_inside_cost (0),
816 vectorizable (false),
817 can_use_partial_vectors_p (true),
818 using_partial_vectors_p (false),
819 epil_using_partial_vectors_p (false),
820 peeling_for_gaps (false),
821 peeling_for_niter (false),
822 no_data_dependencies (false),
823 has_mask_store (false),
824 scalar_loop_scaling (profile_probability::uninitialized ()),
825 scalar_loop (NULL),
826 orig_loop_info (NULL)
828 /* CHECKME: We want to visit all BBs before their successors (except for
829 latch blocks, for which this assertion wouldn't hold). In the simple
830 case of the loop forms we allow, a dfs order of the BBs would the same
831 as reversed postorder traversal, so we are safe. */
833 unsigned int nbbs = dfs_enumerate_from (loop->header, 0, bb_in_loop_p,
834 bbs, loop->num_nodes, loop);
835 gcc_assert (nbbs == loop->num_nodes);
837 for (unsigned int i = 0; i < nbbs; i++)
839 basic_block bb = bbs[i];
840 gimple_stmt_iterator si;
842 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
844 gimple *phi = gsi_stmt (si);
845 gimple_set_uid (phi, 0);
846 add_stmt (phi);
849 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
851 gimple *stmt = gsi_stmt (si);
852 gimple_set_uid (stmt, 0);
853 if (is_gimple_debug (stmt))
854 continue;
855 add_stmt (stmt);
856 /* If .GOMP_SIMD_LANE call for the current loop has 3 arguments, the
857 third argument is the #pragma omp simd if (x) condition, when 0,
858 loop shouldn't be vectorized, when non-zero constant, it should
859 be vectorized normally, otherwise versioned with vectorized loop
860 done if the condition is non-zero at runtime. */
861 if (loop_in->simduid
862 && is_gimple_call (stmt)
863 && gimple_call_internal_p (stmt)
864 && gimple_call_internal_fn (stmt) == IFN_GOMP_SIMD_LANE
865 && gimple_call_num_args (stmt) >= 3
866 && TREE_CODE (gimple_call_arg (stmt, 0)) == SSA_NAME
867 && (loop_in->simduid
868 == SSA_NAME_VAR (gimple_call_arg (stmt, 0))))
870 tree arg = gimple_call_arg (stmt, 2);
871 if (integer_zerop (arg) || TREE_CODE (arg) == SSA_NAME)
872 simd_if_cond = arg;
873 else
874 gcc_assert (integer_nonzerop (arg));
879 epilogue_vinfos.create (6);
882 /* Free all levels of rgroup CONTROLS. */
884 void
885 release_vec_loop_controls (vec<rgroup_controls> *controls)
887 rgroup_controls *rgc;
888 unsigned int i;
889 FOR_EACH_VEC_ELT (*controls, i, rgc)
890 rgc->controls.release ();
891 controls->release ();
894 /* Free all memory used by the _loop_vec_info, as well as all the
895 stmt_vec_info structs of all the stmts in the loop. */
897 _loop_vec_info::~_loop_vec_info ()
899 free (bbs);
901 release_vec_loop_controls (&masks);
902 release_vec_loop_controls (&lens);
903 delete ivexpr_map;
904 delete scan_map;
905 epilogue_vinfos.release ();
907 loop->aux = NULL;
910 /* Return an invariant or register for EXPR and emit necessary
911 computations in the LOOP_VINFO loop preheader. */
913 tree
914 cse_and_gimplify_to_preheader (loop_vec_info loop_vinfo, tree expr)
916 if (is_gimple_reg (expr)
917 || is_gimple_min_invariant (expr))
918 return expr;
920 if (! loop_vinfo->ivexpr_map)
921 loop_vinfo->ivexpr_map = new hash_map<tree_operand_hash, tree>;
922 tree &cached = loop_vinfo->ivexpr_map->get_or_insert (expr);
923 if (! cached)
925 gimple_seq stmts = NULL;
926 cached = force_gimple_operand (unshare_expr (expr),
927 &stmts, true, NULL_TREE);
928 if (stmts)
930 edge e = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
931 gsi_insert_seq_on_edge_immediate (e, stmts);
934 return cached;
937 /* Return true if we can use CMP_TYPE as the comparison type to produce
938 all masks required to mask LOOP_VINFO. */
940 static bool
941 can_produce_all_loop_masks_p (loop_vec_info loop_vinfo, tree cmp_type)
943 rgroup_controls *rgm;
944 unsigned int i;
945 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), i, rgm)
946 if (rgm->type != NULL_TREE
947 && !direct_internal_fn_supported_p (IFN_WHILE_ULT,
948 cmp_type, rgm->type,
949 OPTIMIZE_FOR_SPEED))
950 return false;
951 return true;
954 /* Calculate the maximum number of scalars per iteration for every
955 rgroup in LOOP_VINFO. */
957 static unsigned int
958 vect_get_max_nscalars_per_iter (loop_vec_info loop_vinfo)
960 unsigned int res = 1;
961 unsigned int i;
962 rgroup_controls *rgm;
963 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), i, rgm)
964 res = MAX (res, rgm->max_nscalars_per_iter);
965 return res;
968 /* Calculate the minimum precision necessary to represent:
970 MAX_NITERS * FACTOR
972 as an unsigned integer, where MAX_NITERS is the maximum number of
973 loop header iterations for the original scalar form of LOOP_VINFO. */
975 static unsigned
976 vect_min_prec_for_max_niters (loop_vec_info loop_vinfo, unsigned int factor)
978 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
980 /* Get the maximum number of iterations that is representable
981 in the counter type. */
982 tree ni_type = TREE_TYPE (LOOP_VINFO_NITERSM1 (loop_vinfo));
983 widest_int max_ni = wi::to_widest (TYPE_MAX_VALUE (ni_type)) + 1;
985 /* Get a more refined estimate for the number of iterations. */
986 widest_int max_back_edges;
987 if (max_loop_iterations (loop, &max_back_edges))
988 max_ni = wi::smin (max_ni, max_back_edges + 1);
990 /* Work out how many bits we need to represent the limit. */
991 return wi::min_precision (max_ni * factor, UNSIGNED);
994 /* Each statement in LOOP_VINFO can be masked where necessary. Check
995 whether we can actually generate the masks required. Return true if so,
996 storing the type of the scalar IV in LOOP_VINFO_RGROUP_COMPARE_TYPE. */
998 static bool
999 vect_verify_full_masking (loop_vec_info loop_vinfo)
1001 unsigned int min_ni_width;
1002 unsigned int max_nscalars_per_iter
1003 = vect_get_max_nscalars_per_iter (loop_vinfo);
1005 /* Use a normal loop if there are no statements that need masking.
1006 This only happens in rare degenerate cases: it means that the loop
1007 has no loads, no stores, and no live-out values. */
1008 if (LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
1009 return false;
1011 /* Work out how many bits we need to represent the limit. */
1012 min_ni_width
1013 = vect_min_prec_for_max_niters (loop_vinfo, max_nscalars_per_iter);
1015 /* Find a scalar mode for which WHILE_ULT is supported. */
1016 opt_scalar_int_mode cmp_mode_iter;
1017 tree cmp_type = NULL_TREE;
1018 tree iv_type = NULL_TREE;
1019 widest_int iv_limit = vect_iv_limit_for_partial_vectors (loop_vinfo);
1020 unsigned int iv_precision = UINT_MAX;
1022 if (iv_limit != -1)
1023 iv_precision = wi::min_precision (iv_limit * max_nscalars_per_iter,
1024 UNSIGNED);
1026 FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)
1028 unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ());
1029 if (cmp_bits >= min_ni_width
1030 && targetm.scalar_mode_supported_p (cmp_mode_iter.require ()))
1032 tree this_type = build_nonstandard_integer_type (cmp_bits, true);
1033 if (this_type
1034 && can_produce_all_loop_masks_p (loop_vinfo, this_type))
1036 /* Although we could stop as soon as we find a valid mode,
1037 there are at least two reasons why that's not always the
1038 best choice:
1040 - An IV that's Pmode or wider is more likely to be reusable
1041 in address calculations than an IV that's narrower than
1042 Pmode.
1044 - Doing the comparison in IV_PRECISION or wider allows
1045 a natural 0-based IV, whereas using a narrower comparison
1046 type requires mitigations against wrap-around.
1048 Conversely, if the IV limit is variable, doing the comparison
1049 in a wider type than the original type can introduce
1050 unnecessary extensions, so picking the widest valid mode
1051 is not always a good choice either.
1053 Here we prefer the first IV type that's Pmode or wider,
1054 and the first comparison type that's IV_PRECISION or wider.
1055 (The comparison type must be no wider than the IV type,
1056 to avoid extensions in the vector loop.)
1058 ??? We might want to try continuing beyond Pmode for ILP32
1059 targets if CMP_BITS < IV_PRECISION. */
1060 iv_type = this_type;
1061 if (!cmp_type || iv_precision > TYPE_PRECISION (cmp_type))
1062 cmp_type = this_type;
1063 if (cmp_bits >= GET_MODE_BITSIZE (Pmode))
1064 break;
1069 if (!cmp_type)
1070 return false;
1072 LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo) = cmp_type;
1073 LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo) = iv_type;
1074 return true;
1077 /* Check whether we can use vector access with length based on precison
1078 comparison. So far, to keep it simple, we only allow the case that the
1079 precision of the target supported length is larger than the precision
1080 required by loop niters. */
1082 static bool
1083 vect_verify_loop_lens (loop_vec_info loop_vinfo)
1085 if (LOOP_VINFO_LENS (loop_vinfo).is_empty ())
1086 return false;
1088 unsigned int max_nitems_per_iter = 1;
1089 unsigned int i;
1090 rgroup_controls *rgl;
1091 /* Find the maximum number of items per iteration for every rgroup. */
1092 FOR_EACH_VEC_ELT (LOOP_VINFO_LENS (loop_vinfo), i, rgl)
1094 unsigned nitems_per_iter = rgl->max_nscalars_per_iter * rgl->factor;
1095 max_nitems_per_iter = MAX (max_nitems_per_iter, nitems_per_iter);
1098 /* Work out how many bits we need to represent the length limit. */
1099 unsigned int min_ni_prec
1100 = vect_min_prec_for_max_niters (loop_vinfo, max_nitems_per_iter);
1102 /* Now use the maximum of below precisions for one suitable IV type:
1103 - the IV's natural precision
1104 - the precision needed to hold: the maximum number of scalar
1105 iterations multiplied by the scale factor (min_ni_prec above)
1106 - the Pmode precision
1108 If min_ni_prec is less than the precision of the current niters,
1109 we perfer to still use the niters type. Prefer to use Pmode and
1110 wider IV to avoid narrow conversions. */
1112 unsigned int ni_prec
1113 = TYPE_PRECISION (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)));
1114 min_ni_prec = MAX (min_ni_prec, ni_prec);
1115 min_ni_prec = MAX (min_ni_prec, GET_MODE_BITSIZE (Pmode));
1117 tree iv_type = NULL_TREE;
1118 opt_scalar_int_mode tmode_iter;
1119 FOR_EACH_MODE_IN_CLASS (tmode_iter, MODE_INT)
1121 scalar_mode tmode = tmode_iter.require ();
1122 unsigned int tbits = GET_MODE_BITSIZE (tmode);
1124 /* ??? Do we really want to construct one IV whose precision exceeds
1125 BITS_PER_WORD? */
1126 if (tbits > BITS_PER_WORD)
1127 break;
1129 /* Find the first available standard integral type. */
1130 if (tbits >= min_ni_prec && targetm.scalar_mode_supported_p (tmode))
1132 iv_type = build_nonstandard_integer_type (tbits, true);
1133 break;
1137 if (!iv_type)
1139 if (dump_enabled_p ())
1140 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1141 "can't vectorize with length-based partial vectors"
1142 " because there is no suitable iv type.\n");
1143 return false;
1146 LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo) = iv_type;
1147 LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo) = iv_type;
1149 return true;
1152 /* Calculate the cost of one scalar iteration of the loop. */
1153 static void
1154 vect_compute_single_scalar_iteration_cost (loop_vec_info loop_vinfo)
1156 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1157 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1158 int nbbs = loop->num_nodes, factor;
1159 int innerloop_iters, i;
1161 DUMP_VECT_SCOPE ("vect_compute_single_scalar_iteration_cost");
1163 /* Gather costs for statements in the scalar loop. */
1165 /* FORNOW. */
1166 innerloop_iters = 1;
1167 if (loop->inner)
1168 innerloop_iters = 50; /* FIXME */
1170 for (i = 0; i < nbbs; i++)
1172 gimple_stmt_iterator si;
1173 basic_block bb = bbs[i];
1175 if (bb->loop_father == loop->inner)
1176 factor = innerloop_iters;
1177 else
1178 factor = 1;
1180 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1182 gimple *stmt = gsi_stmt (si);
1183 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (stmt);
1185 if (!is_gimple_assign (stmt) && !is_gimple_call (stmt))
1186 continue;
1188 /* Skip stmts that are not vectorized inside the loop. */
1189 stmt_vec_info vstmt_info = vect_stmt_to_vectorize (stmt_info);
1190 if (!STMT_VINFO_RELEVANT_P (vstmt_info)
1191 && (!STMT_VINFO_LIVE_P (vstmt_info)
1192 || !VECTORIZABLE_CYCLE_DEF
1193 (STMT_VINFO_DEF_TYPE (vstmt_info))))
1194 continue;
1196 vect_cost_for_stmt kind;
1197 if (STMT_VINFO_DATA_REF (stmt_info))
1199 if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
1200 kind = scalar_load;
1201 else
1202 kind = scalar_store;
1204 else if (vect_nop_conversion_p (stmt_info))
1205 continue;
1206 else
1207 kind = scalar_stmt;
1209 record_stmt_cost (&LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1210 factor, kind, stmt_info, 0, vect_prologue);
1214 /* Now accumulate cost. */
1215 void *target_cost_data = init_cost (loop);
1216 stmt_info_for_cost *si;
1217 int j;
1218 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1219 j, si)
1220 (void) add_stmt_cost (loop_vinfo, target_cost_data, si->count,
1221 si->kind, si->stmt_info, si->vectype,
1222 si->misalign, vect_body);
1223 unsigned dummy, body_cost = 0;
1224 finish_cost (target_cost_data, &dummy, &body_cost, &dummy);
1225 destroy_cost_data (target_cost_data);
1226 LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo) = body_cost;
1230 /* Function vect_analyze_loop_form_1.
1232 Verify that certain CFG restrictions hold, including:
1233 - the loop has a pre-header
1234 - the loop has a single entry and exit
1235 - the loop exit condition is simple enough
1236 - the number of iterations can be analyzed, i.e, a countable loop. The
1237 niter could be analyzed under some assumptions. */
1239 opt_result
1240 vect_analyze_loop_form_1 (class loop *loop, gcond **loop_cond,
1241 tree *assumptions, tree *number_of_iterationsm1,
1242 tree *number_of_iterations, gcond **inner_loop_cond)
1244 DUMP_VECT_SCOPE ("vect_analyze_loop_form");
1246 /* Different restrictions apply when we are considering an inner-most loop,
1247 vs. an outer (nested) loop.
1248 (FORNOW. May want to relax some of these restrictions in the future). */
1250 if (!loop->inner)
1252 /* Inner-most loop. We currently require that the number of BBs is
1253 exactly 2 (the header and latch). Vectorizable inner-most loops
1254 look like this:
1256 (pre-header)
1258 header <--------+
1259 | | |
1260 | +--> latch --+
1262 (exit-bb) */
1264 if (loop->num_nodes != 2)
1265 return opt_result::failure_at (vect_location,
1266 "not vectorized:"
1267 " control flow in loop.\n");
1269 if (empty_block_p (loop->header))
1270 return opt_result::failure_at (vect_location,
1271 "not vectorized: empty loop.\n");
1273 else
1275 class loop *innerloop = loop->inner;
1276 edge entryedge;
1278 /* Nested loop. We currently require that the loop is doubly-nested,
1279 contains a single inner loop, and the number of BBs is exactly 5.
1280 Vectorizable outer-loops look like this:
1282 (pre-header)
1284 header <---+
1286 inner-loop |
1288 tail ------+
1290 (exit-bb)
1292 The inner-loop has the properties expected of inner-most loops
1293 as described above. */
1295 if ((loop->inner)->inner || (loop->inner)->next)
1296 return opt_result::failure_at (vect_location,
1297 "not vectorized:"
1298 " multiple nested loops.\n");
1300 if (loop->num_nodes != 5)
1301 return opt_result::failure_at (vect_location,
1302 "not vectorized:"
1303 " control flow in loop.\n");
1305 entryedge = loop_preheader_edge (innerloop);
1306 if (entryedge->src != loop->header
1307 || !single_exit (innerloop)
1308 || single_exit (innerloop)->dest != EDGE_PRED (loop->latch, 0)->src)
1309 return opt_result::failure_at (vect_location,
1310 "not vectorized:"
1311 " unsupported outerloop form.\n");
1313 /* Analyze the inner-loop. */
1314 tree inner_niterm1, inner_niter, inner_assumptions;
1315 opt_result res
1316 = vect_analyze_loop_form_1 (loop->inner, inner_loop_cond,
1317 &inner_assumptions, &inner_niterm1,
1318 &inner_niter, NULL);
1319 if (!res)
1321 if (dump_enabled_p ())
1322 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1323 "not vectorized: Bad inner loop.\n");
1324 return res;
1327 /* Don't support analyzing niter under assumptions for inner
1328 loop. */
1329 if (!integer_onep (inner_assumptions))
1330 return opt_result::failure_at (vect_location,
1331 "not vectorized: Bad inner loop.\n");
1333 if (!expr_invariant_in_loop_p (loop, inner_niter))
1334 return opt_result::failure_at (vect_location,
1335 "not vectorized: inner-loop count not"
1336 " invariant.\n");
1338 if (dump_enabled_p ())
1339 dump_printf_loc (MSG_NOTE, vect_location,
1340 "Considering outer-loop vectorization.\n");
1343 if (!single_exit (loop))
1344 return opt_result::failure_at (vect_location,
1345 "not vectorized: multiple exits.\n");
1346 if (EDGE_COUNT (loop->header->preds) != 2)
1347 return opt_result::failure_at (vect_location,
1348 "not vectorized:"
1349 " too many incoming edges.\n");
1351 /* We assume that the loop exit condition is at the end of the loop. i.e,
1352 that the loop is represented as a do-while (with a proper if-guard
1353 before the loop if needed), where the loop header contains all the
1354 executable statements, and the latch is empty. */
1355 if (!empty_block_p (loop->latch)
1356 || !gimple_seq_empty_p (phi_nodes (loop->latch)))
1357 return opt_result::failure_at (vect_location,
1358 "not vectorized: latch block not empty.\n");
1360 /* Make sure the exit is not abnormal. */
1361 edge e = single_exit (loop);
1362 if (e->flags & EDGE_ABNORMAL)
1363 return opt_result::failure_at (vect_location,
1364 "not vectorized:"
1365 " abnormal loop exit edge.\n");
1367 *loop_cond = vect_get_loop_niters (loop, assumptions, number_of_iterations,
1368 number_of_iterationsm1);
1369 if (!*loop_cond)
1370 return opt_result::failure_at
1371 (vect_location,
1372 "not vectorized: complicated exit condition.\n");
1374 if (integer_zerop (*assumptions)
1375 || !*number_of_iterations
1376 || chrec_contains_undetermined (*number_of_iterations))
1377 return opt_result::failure_at
1378 (*loop_cond,
1379 "not vectorized: number of iterations cannot be computed.\n");
1381 if (integer_zerop (*number_of_iterations))
1382 return opt_result::failure_at
1383 (*loop_cond,
1384 "not vectorized: number of iterations = 0.\n");
1386 return opt_result::success ();
1389 /* Analyze LOOP form and return a loop_vec_info if it is of suitable form. */
1391 opt_loop_vec_info
1392 vect_analyze_loop_form (class loop *loop, vec_info_shared *shared)
1394 tree assumptions, number_of_iterations, number_of_iterationsm1;
1395 gcond *loop_cond, *inner_loop_cond = NULL;
1397 opt_result res
1398 = vect_analyze_loop_form_1 (loop, &loop_cond,
1399 &assumptions, &number_of_iterationsm1,
1400 &number_of_iterations, &inner_loop_cond);
1401 if (!res)
1402 return opt_loop_vec_info::propagate_failure (res);
1404 loop_vec_info loop_vinfo = new _loop_vec_info (loop, shared);
1405 LOOP_VINFO_NITERSM1 (loop_vinfo) = number_of_iterationsm1;
1406 LOOP_VINFO_NITERS (loop_vinfo) = number_of_iterations;
1407 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = number_of_iterations;
1408 if (!integer_onep (assumptions))
1410 /* We consider to vectorize this loop by versioning it under
1411 some assumptions. In order to do this, we need to clear
1412 existing information computed by scev and niter analyzer. */
1413 scev_reset_htab ();
1414 free_numbers_of_iterations_estimates (loop);
1415 /* Also set flag for this loop so that following scev and niter
1416 analysis are done under the assumptions. */
1417 loop_constraint_set (loop, LOOP_C_FINITE);
1418 /* Also record the assumptions for versioning. */
1419 LOOP_VINFO_NITERS_ASSUMPTIONS (loop_vinfo) = assumptions;
1422 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1424 if (dump_enabled_p ())
1426 dump_printf_loc (MSG_NOTE, vect_location,
1427 "Symbolic number of iterations is ");
1428 dump_generic_expr (MSG_NOTE, TDF_DETAILS, number_of_iterations);
1429 dump_printf (MSG_NOTE, "\n");
1433 stmt_vec_info loop_cond_info = loop_vinfo->lookup_stmt (loop_cond);
1434 STMT_VINFO_TYPE (loop_cond_info) = loop_exit_ctrl_vec_info_type;
1435 if (inner_loop_cond)
1437 stmt_vec_info inner_loop_cond_info
1438 = loop_vinfo->lookup_stmt (inner_loop_cond);
1439 STMT_VINFO_TYPE (inner_loop_cond_info) = loop_exit_ctrl_vec_info_type;
1442 gcc_assert (!loop->aux);
1443 loop->aux = loop_vinfo;
1444 return opt_loop_vec_info::success (loop_vinfo);
1449 /* Scan the loop stmts and dependent on whether there are any (non-)SLP
1450 statements update the vectorization factor. */
1452 static void
1453 vect_update_vf_for_slp (loop_vec_info loop_vinfo)
1455 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1456 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1457 int nbbs = loop->num_nodes;
1458 poly_uint64 vectorization_factor;
1459 int i;
1461 DUMP_VECT_SCOPE ("vect_update_vf_for_slp");
1463 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1464 gcc_assert (known_ne (vectorization_factor, 0U));
1466 /* If all the stmts in the loop can be SLPed, we perform only SLP, and
1467 vectorization factor of the loop is the unrolling factor required by
1468 the SLP instances. If that unrolling factor is 1, we say, that we
1469 perform pure SLP on loop - cross iteration parallelism is not
1470 exploited. */
1471 bool only_slp_in_loop = true;
1472 for (i = 0; i < nbbs; i++)
1474 basic_block bb = bbs[i];
1475 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
1476 gsi_next (&si))
1478 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (si.phi ());
1479 if (!stmt_info)
1480 continue;
1481 if ((STMT_VINFO_RELEVANT_P (stmt_info)
1482 || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1483 && !PURE_SLP_STMT (stmt_info))
1484 /* STMT needs both SLP and loop-based vectorization. */
1485 only_slp_in_loop = false;
1487 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1488 gsi_next (&si))
1490 if (is_gimple_debug (gsi_stmt (si)))
1491 continue;
1492 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
1493 stmt_info = vect_stmt_to_vectorize (stmt_info);
1494 if ((STMT_VINFO_RELEVANT_P (stmt_info)
1495 || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1496 && !PURE_SLP_STMT (stmt_info))
1497 /* STMT needs both SLP and loop-based vectorization. */
1498 only_slp_in_loop = false;
1502 if (only_slp_in_loop)
1504 if (dump_enabled_p ())
1505 dump_printf_loc (MSG_NOTE, vect_location,
1506 "Loop contains only SLP stmts\n");
1507 vectorization_factor = LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo);
1509 else
1511 if (dump_enabled_p ())
1512 dump_printf_loc (MSG_NOTE, vect_location,
1513 "Loop contains SLP and non-SLP stmts\n");
1514 /* Both the vectorization factor and unroll factor have the form
1515 GET_MODE_SIZE (loop_vinfo->vector_mode) * X for some rational X,
1516 so they must have a common multiple. */
1517 vectorization_factor
1518 = force_common_multiple (vectorization_factor,
1519 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo));
1522 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
1523 if (dump_enabled_p ())
1525 dump_printf_loc (MSG_NOTE, vect_location,
1526 "Updating vectorization factor to ");
1527 dump_dec (MSG_NOTE, vectorization_factor);
1528 dump_printf (MSG_NOTE, ".\n");
1532 /* Return true if STMT_INFO describes a double reduction phi and if
1533 the other phi in the reduction is also relevant for vectorization.
1534 This rejects cases such as:
1536 outer1:
1537 x_1 = PHI <x_3(outer2), ...>;
1540 inner:
1541 x_2 = ...;
1544 outer2:
1545 x_3 = PHI <x_2(inner)>;
1547 if nothing in x_2 or elsewhere makes x_1 relevant. */
1549 static bool
1550 vect_active_double_reduction_p (stmt_vec_info stmt_info)
1552 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
1553 return false;
1555 return STMT_VINFO_RELEVANT_P (STMT_VINFO_REDUC_DEF (stmt_info));
1558 /* Function vect_analyze_loop_operations.
1560 Scan the loop stmts and make sure they are all vectorizable. */
1562 static opt_result
1563 vect_analyze_loop_operations (loop_vec_info loop_vinfo)
1565 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1566 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1567 int nbbs = loop->num_nodes;
1568 int i;
1569 stmt_vec_info stmt_info;
1570 bool need_to_vectorize = false;
1571 bool ok;
1573 DUMP_VECT_SCOPE ("vect_analyze_loop_operations");
1575 auto_vec<stmt_info_for_cost> cost_vec;
1577 for (i = 0; i < nbbs; i++)
1579 basic_block bb = bbs[i];
1581 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
1582 gsi_next (&si))
1584 gphi *phi = si.phi ();
1585 ok = true;
1587 stmt_info = loop_vinfo->lookup_stmt (phi);
1588 if (dump_enabled_p ())
1589 dump_printf_loc (MSG_NOTE, vect_location, "examining phi: %G", phi);
1590 if (virtual_operand_p (gimple_phi_result (phi)))
1591 continue;
1593 /* Inner-loop loop-closed exit phi in outer-loop vectorization
1594 (i.e., a phi in the tail of the outer-loop). */
1595 if (! is_loop_header_bb_p (bb))
1597 /* FORNOW: we currently don't support the case that these phis
1598 are not used in the outerloop (unless it is double reduction,
1599 i.e., this phi is vect_reduction_def), cause this case
1600 requires to actually do something here. */
1601 if (STMT_VINFO_LIVE_P (stmt_info)
1602 && !vect_active_double_reduction_p (stmt_info))
1603 return opt_result::failure_at (phi,
1604 "Unsupported loop-closed phi"
1605 " in outer-loop.\n");
1607 /* If PHI is used in the outer loop, we check that its operand
1608 is defined in the inner loop. */
1609 if (STMT_VINFO_RELEVANT_P (stmt_info))
1611 tree phi_op;
1613 if (gimple_phi_num_args (phi) != 1)
1614 return opt_result::failure_at (phi, "unsupported phi");
1616 phi_op = PHI_ARG_DEF (phi, 0);
1617 stmt_vec_info op_def_info = loop_vinfo->lookup_def (phi_op);
1618 if (!op_def_info)
1619 return opt_result::failure_at (phi, "unsupported phi\n");
1621 if (STMT_VINFO_RELEVANT (op_def_info) != vect_used_in_outer
1622 && (STMT_VINFO_RELEVANT (op_def_info)
1623 != vect_used_in_outer_by_reduction))
1624 return opt_result::failure_at (phi, "unsupported phi\n");
1626 if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def
1627 || (STMT_VINFO_DEF_TYPE (stmt_info)
1628 == vect_double_reduction_def))
1629 && !vectorizable_lc_phi (loop_vinfo,
1630 stmt_info, NULL, NULL))
1631 return opt_result::failure_at (phi, "unsupported phi\n");
1634 continue;
1637 gcc_assert (stmt_info);
1639 if ((STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope
1640 || STMT_VINFO_LIVE_P (stmt_info))
1641 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
1642 /* A scalar-dependence cycle that we don't support. */
1643 return opt_result::failure_at (phi,
1644 "not vectorized:"
1645 " scalar dependence cycle.\n");
1647 if (STMT_VINFO_RELEVANT_P (stmt_info))
1649 need_to_vectorize = true;
1650 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
1651 && ! PURE_SLP_STMT (stmt_info))
1652 ok = vectorizable_induction (loop_vinfo,
1653 stmt_info, NULL, NULL,
1654 &cost_vec);
1655 else if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
1656 || (STMT_VINFO_DEF_TYPE (stmt_info)
1657 == vect_double_reduction_def)
1658 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
1659 && ! PURE_SLP_STMT (stmt_info))
1660 ok = vectorizable_reduction (loop_vinfo,
1661 stmt_info, NULL, NULL, &cost_vec);
1664 /* SLP PHIs are tested by vect_slp_analyze_node_operations. */
1665 if (ok
1666 && STMT_VINFO_LIVE_P (stmt_info)
1667 && !PURE_SLP_STMT (stmt_info))
1668 ok = vectorizable_live_operation (loop_vinfo,
1669 stmt_info, NULL, NULL, NULL,
1670 -1, false, &cost_vec);
1672 if (!ok)
1673 return opt_result::failure_at (phi,
1674 "not vectorized: relevant phi not "
1675 "supported: %G",
1676 static_cast <gimple *> (phi));
1679 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1680 gsi_next (&si))
1682 gimple *stmt = gsi_stmt (si);
1683 if (!gimple_clobber_p (stmt)
1684 && !is_gimple_debug (stmt))
1686 opt_result res
1687 = vect_analyze_stmt (loop_vinfo,
1688 loop_vinfo->lookup_stmt (stmt),
1689 &need_to_vectorize,
1690 NULL, NULL, &cost_vec);
1691 if (!res)
1692 return res;
1695 } /* bbs */
1697 add_stmt_costs (loop_vinfo, loop_vinfo->target_cost_data, &cost_vec);
1699 /* All operations in the loop are either irrelevant (deal with loop
1700 control, or dead), or only used outside the loop and can be moved
1701 out of the loop (e.g. invariants, inductions). The loop can be
1702 optimized away by scalar optimizations. We're better off not
1703 touching this loop. */
1704 if (!need_to_vectorize)
1706 if (dump_enabled_p ())
1707 dump_printf_loc (MSG_NOTE, vect_location,
1708 "All the computation can be taken out of the loop.\n");
1709 return opt_result::failure_at
1710 (vect_location,
1711 "not vectorized: redundant loop. no profit to vectorize.\n");
1714 return opt_result::success ();
1717 /* Return true if we know that the iteration count is smaller than the
1718 vectorization factor. Return false if it isn't, or if we can't be sure
1719 either way. */
1721 static bool
1722 vect_known_niters_smaller_than_vf (loop_vec_info loop_vinfo)
1724 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
1726 HOST_WIDE_INT max_niter;
1727 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1728 max_niter = LOOP_VINFO_INT_NITERS (loop_vinfo);
1729 else
1730 max_niter = max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
1732 if (max_niter != -1 && (unsigned HOST_WIDE_INT) max_niter < assumed_vf)
1733 return true;
1735 return false;
1738 /* Analyze the cost of the loop described by LOOP_VINFO. Decide if it
1739 is worthwhile to vectorize. Return 1 if definitely yes, 0 if
1740 definitely no, or -1 if it's worth retrying. */
1742 static int
1743 vect_analyze_loop_costing (loop_vec_info loop_vinfo)
1745 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1746 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
1748 /* Only loops that can handle partially-populated vectors can have iteration
1749 counts less than the vectorization factor. */
1750 if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
1752 if (vect_known_niters_smaller_than_vf (loop_vinfo))
1754 if (dump_enabled_p ())
1755 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1756 "not vectorized: iteration count smaller than "
1757 "vectorization factor.\n");
1758 return 0;
1762 int min_profitable_iters, min_profitable_estimate;
1763 vect_estimate_min_profitable_iters (loop_vinfo, &min_profitable_iters,
1764 &min_profitable_estimate);
1766 if (min_profitable_iters < 0)
1768 if (dump_enabled_p ())
1769 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1770 "not vectorized: vectorization not profitable.\n");
1771 if (dump_enabled_p ())
1772 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1773 "not vectorized: vector version will never be "
1774 "profitable.\n");
1775 return -1;
1778 int min_scalar_loop_bound = (param_min_vect_loop_bound
1779 * assumed_vf);
1781 /* Use the cost model only if it is more conservative than user specified
1782 threshold. */
1783 unsigned int th = (unsigned) MAX (min_scalar_loop_bound,
1784 min_profitable_iters);
1786 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = th;
1788 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1789 && LOOP_VINFO_INT_NITERS (loop_vinfo) < th)
1791 if (dump_enabled_p ())
1792 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1793 "not vectorized: vectorization not profitable.\n");
1794 if (dump_enabled_p ())
1795 dump_printf_loc (MSG_NOTE, vect_location,
1796 "not vectorized: iteration count smaller than user "
1797 "specified loop bound parameter or minimum profitable "
1798 "iterations (whichever is more conservative).\n");
1799 return 0;
1802 /* The static profitablity threshold min_profitable_estimate includes
1803 the cost of having to check at runtime whether the scalar loop
1804 should be used instead. If it turns out that we don't need or want
1805 such a check, the threshold we should use for the static estimate
1806 is simply the point at which the vector loop becomes more profitable
1807 than the scalar loop. */
1808 if (min_profitable_estimate > min_profitable_iters
1809 && !LOOP_REQUIRES_VERSIONING (loop_vinfo)
1810 && !LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
1811 && !LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
1812 && !vect_apply_runtime_profitability_check_p (loop_vinfo))
1814 if (dump_enabled_p ())
1815 dump_printf_loc (MSG_NOTE, vect_location, "no need for a runtime"
1816 " choice between the scalar and vector loops\n");
1817 min_profitable_estimate = min_profitable_iters;
1820 HOST_WIDE_INT estimated_niter;
1822 /* If we are vectorizing an epilogue then we know the maximum number of
1823 scalar iterations it will cover is at least one lower than the
1824 vectorization factor of the main loop. */
1825 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
1826 estimated_niter
1827 = vect_vf_for_cost (LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo)) - 1;
1828 else
1830 estimated_niter = estimated_stmt_executions_int (loop);
1831 if (estimated_niter == -1)
1832 estimated_niter = likely_max_stmt_executions_int (loop);
1834 if (estimated_niter != -1
1835 && ((unsigned HOST_WIDE_INT) estimated_niter
1836 < MAX (th, (unsigned) min_profitable_estimate)))
1838 if (dump_enabled_p ())
1839 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1840 "not vectorized: estimated iteration count too "
1841 "small.\n");
1842 if (dump_enabled_p ())
1843 dump_printf_loc (MSG_NOTE, vect_location,
1844 "not vectorized: estimated iteration count smaller "
1845 "than specified loop bound parameter or minimum "
1846 "profitable iterations (whichever is more "
1847 "conservative).\n");
1848 return -1;
1851 return 1;
1854 static opt_result
1855 vect_get_datarefs_in_loop (loop_p loop, basic_block *bbs,
1856 vec<data_reference_p> *datarefs,
1857 unsigned int *n_stmts)
1859 *n_stmts = 0;
1860 for (unsigned i = 0; i < loop->num_nodes; i++)
1861 for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
1862 !gsi_end_p (gsi); gsi_next (&gsi))
1864 gimple *stmt = gsi_stmt (gsi);
1865 if (is_gimple_debug (stmt))
1866 continue;
1867 ++(*n_stmts);
1868 opt_result res = vect_find_stmt_data_reference (loop, stmt, datarefs);
1869 if (!res)
1871 if (is_gimple_call (stmt) && loop->safelen)
1873 tree fndecl = gimple_call_fndecl (stmt), op;
1874 if (fndecl != NULL_TREE)
1876 cgraph_node *node = cgraph_node::get (fndecl);
1877 if (node != NULL && node->simd_clones != NULL)
1879 unsigned int j, n = gimple_call_num_args (stmt);
1880 for (j = 0; j < n; j++)
1882 op = gimple_call_arg (stmt, j);
1883 if (DECL_P (op)
1884 || (REFERENCE_CLASS_P (op)
1885 && get_base_address (op)))
1886 break;
1888 op = gimple_call_lhs (stmt);
1889 /* Ignore #pragma omp declare simd functions
1890 if they don't have data references in the
1891 call stmt itself. */
1892 if (j == n
1893 && !(op
1894 && (DECL_P (op)
1895 || (REFERENCE_CLASS_P (op)
1896 && get_base_address (op)))))
1897 continue;
1901 return res;
1903 /* If dependence analysis will give up due to the limit on the
1904 number of datarefs stop here and fail fatally. */
1905 if (datarefs->length ()
1906 > (unsigned)param_loop_max_datarefs_for_datadeps)
1907 return opt_result::failure_at (stmt, "exceeded param "
1908 "loop-max-datarefs-for-datadeps\n");
1910 return opt_result::success ();
1913 /* Look for SLP-only access groups and turn each individual access into its own
1914 group. */
1915 static void
1916 vect_dissolve_slp_only_groups (loop_vec_info loop_vinfo)
1918 unsigned int i;
1919 struct data_reference *dr;
1921 DUMP_VECT_SCOPE ("vect_dissolve_slp_only_groups");
1923 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1924 FOR_EACH_VEC_ELT (datarefs, i, dr)
1926 gcc_assert (DR_REF (dr));
1927 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (DR_STMT (dr));
1929 /* Check if the load is a part of an interleaving chain. */
1930 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1932 stmt_vec_info first_element = DR_GROUP_FIRST_ELEMENT (stmt_info);
1933 unsigned int group_size = DR_GROUP_SIZE (first_element);
1935 /* Check if SLP-only groups. */
1936 if (!STMT_SLP_TYPE (stmt_info)
1937 && STMT_VINFO_SLP_VECT_ONLY (first_element))
1939 /* Dissolve the group. */
1940 STMT_VINFO_SLP_VECT_ONLY (first_element) = false;
1942 stmt_vec_info vinfo = first_element;
1943 while (vinfo)
1945 stmt_vec_info next = DR_GROUP_NEXT_ELEMENT (vinfo);
1946 DR_GROUP_FIRST_ELEMENT (vinfo) = vinfo;
1947 DR_GROUP_NEXT_ELEMENT (vinfo) = NULL;
1948 DR_GROUP_SIZE (vinfo) = 1;
1949 if (STMT_VINFO_STRIDED_P (first_element))
1950 DR_GROUP_GAP (vinfo) = 0;
1951 else
1952 DR_GROUP_GAP (vinfo) = group_size - 1;
1953 vinfo = next;
1961 /* Decides whether we need to create an epilogue loop to handle
1962 remaining scalar iterations and sets PEELING_FOR_NITERS accordingly. */
1964 void
1965 determine_peel_for_niter (loop_vec_info loop_vinfo)
1967 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
1969 unsigned HOST_WIDE_INT const_vf;
1970 HOST_WIDE_INT max_niter
1971 = likely_max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
1973 unsigned th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
1974 if (!th && LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo))
1975 th = LOOP_VINFO_COST_MODEL_THRESHOLD (LOOP_VINFO_ORIG_LOOP_INFO
1976 (loop_vinfo));
1978 if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
1979 /* The main loop handles all iterations. */
1980 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
1981 else if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1982 && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) >= 0)
1984 /* Work out the (constant) number of iterations that need to be
1985 peeled for reasons other than niters. */
1986 unsigned int peel_niter = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
1987 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
1988 peel_niter += 1;
1989 if (!multiple_p (LOOP_VINFO_INT_NITERS (loop_vinfo) - peel_niter,
1990 LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
1991 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
1993 else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
1994 /* ??? When peeling for gaps but not alignment, we could
1995 try to check whether the (variable) niters is known to be
1996 VF * N + 1. That's something of a niche case though. */
1997 || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
1998 || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&const_vf)
1999 || ((tree_ctz (LOOP_VINFO_NITERS (loop_vinfo))
2000 < (unsigned) exact_log2 (const_vf))
2001 /* In case of versioning, check if the maximum number of
2002 iterations is greater than th. If they are identical,
2003 the epilogue is unnecessary. */
2004 && (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
2005 || ((unsigned HOST_WIDE_INT) max_niter
2006 > (th / const_vf) * const_vf))))
2007 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
2011 /* Function vect_analyze_loop_2.
2013 Apply a set of analyses on LOOP, and create a loop_vec_info struct
2014 for it. The different analyses will record information in the
2015 loop_vec_info struct. */
2016 static opt_result
2017 vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal, unsigned *n_stmts)
2019 opt_result ok = opt_result::success ();
2020 int res;
2021 unsigned int max_vf = MAX_VECTORIZATION_FACTOR;
2022 poly_uint64 min_vf = 2;
2023 loop_vec_info orig_loop_vinfo = NULL;
2025 /* If we are dealing with an epilogue then orig_loop_vinfo points to the
2026 loop_vec_info of the first vectorized loop. */
2027 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2028 orig_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
2029 else
2030 orig_loop_vinfo = loop_vinfo;
2031 gcc_assert (orig_loop_vinfo);
2033 /* The first group of checks is independent of the vector size. */
2034 fatal = true;
2036 if (LOOP_VINFO_SIMD_IF_COND (loop_vinfo)
2037 && integer_zerop (LOOP_VINFO_SIMD_IF_COND (loop_vinfo)))
2038 return opt_result::failure_at (vect_location,
2039 "not vectorized: simd if(0)\n");
2041 /* Find all data references in the loop (which correspond to vdefs/vuses)
2042 and analyze their evolution in the loop. */
2044 loop_p loop = LOOP_VINFO_LOOP (loop_vinfo);
2046 /* Gather the data references and count stmts in the loop. */
2047 if (!LOOP_VINFO_DATAREFS (loop_vinfo).exists ())
2049 opt_result res
2050 = vect_get_datarefs_in_loop (loop, LOOP_VINFO_BBS (loop_vinfo),
2051 &LOOP_VINFO_DATAREFS (loop_vinfo),
2052 n_stmts);
2053 if (!res)
2055 if (dump_enabled_p ())
2056 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2057 "not vectorized: loop contains function "
2058 "calls or data references that cannot "
2059 "be analyzed\n");
2060 return res;
2062 loop_vinfo->shared->save_datarefs ();
2064 else
2065 loop_vinfo->shared->check_datarefs ();
2067 /* Analyze the data references and also adjust the minimal
2068 vectorization factor according to the loads and stores. */
2070 ok = vect_analyze_data_refs (loop_vinfo, &min_vf, &fatal);
2071 if (!ok)
2073 if (dump_enabled_p ())
2074 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2075 "bad data references.\n");
2076 return ok;
2079 /* Classify all cross-iteration scalar data-flow cycles.
2080 Cross-iteration cycles caused by virtual phis are analyzed separately. */
2081 vect_analyze_scalar_cycles (loop_vinfo);
2083 vect_pattern_recog (loop_vinfo);
2085 vect_fixup_scalar_cycles_with_patterns (loop_vinfo);
2087 /* Analyze the access patterns of the data-refs in the loop (consecutive,
2088 complex, etc.). FORNOW: Only handle consecutive access pattern. */
2090 ok = vect_analyze_data_ref_accesses (loop_vinfo);
2091 if (!ok)
2093 if (dump_enabled_p ())
2094 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2095 "bad data access.\n");
2096 return ok;
2099 /* Data-flow analysis to detect stmts that do not need to be vectorized. */
2101 ok = vect_mark_stmts_to_be_vectorized (loop_vinfo, &fatal);
2102 if (!ok)
2104 if (dump_enabled_p ())
2105 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2106 "unexpected pattern.\n");
2107 return ok;
2110 /* While the rest of the analysis below depends on it in some way. */
2111 fatal = false;
2113 /* Analyze data dependences between the data-refs in the loop
2114 and adjust the maximum vectorization factor according to
2115 the dependences.
2116 FORNOW: fail at the first data dependence that we encounter. */
2118 ok = vect_analyze_data_ref_dependences (loop_vinfo, &max_vf);
2119 if (!ok)
2121 if (dump_enabled_p ())
2122 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2123 "bad data dependence.\n");
2124 return ok;
2126 if (max_vf != MAX_VECTORIZATION_FACTOR
2127 && maybe_lt (max_vf, min_vf))
2128 return opt_result::failure_at (vect_location, "bad data dependence.\n");
2129 LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo) = max_vf;
2131 ok = vect_determine_vectorization_factor (loop_vinfo);
2132 if (!ok)
2134 if (dump_enabled_p ())
2135 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2136 "can't determine vectorization factor.\n");
2137 return ok;
2139 if (max_vf != MAX_VECTORIZATION_FACTOR
2140 && maybe_lt (max_vf, LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
2141 return opt_result::failure_at (vect_location, "bad data dependence.\n");
2143 /* Compute the scalar iteration cost. */
2144 vect_compute_single_scalar_iteration_cost (loop_vinfo);
2146 poly_uint64 saved_vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2148 /* Check the SLP opportunities in the loop, analyze and build SLP trees. */
2149 ok = vect_analyze_slp (loop_vinfo, *n_stmts);
2150 if (!ok)
2151 return ok;
2153 /* If there are any SLP instances mark them as pure_slp. */
2154 bool slp = vect_make_slp_decision (loop_vinfo);
2155 if (slp)
2157 /* Find stmts that need to be both vectorized and SLPed. */
2158 vect_detect_hybrid_slp (loop_vinfo);
2160 /* Update the vectorization factor based on the SLP decision. */
2161 vect_update_vf_for_slp (loop_vinfo);
2163 /* Optimize the SLP graph with the vectorization factor fixed. */
2164 vect_optimize_slp (loop_vinfo);
2167 bool saved_can_use_partial_vectors_p
2168 = LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo);
2170 /* We don't expect to have to roll back to anything other than an empty
2171 set of rgroups. */
2172 gcc_assert (LOOP_VINFO_MASKS (loop_vinfo).is_empty ());
2174 /* This is the point where we can re-start analysis with SLP forced off. */
2175 start_over:
2177 /* Now the vectorization factor is final. */
2178 poly_uint64 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2179 gcc_assert (known_ne (vectorization_factor, 0U));
2181 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && dump_enabled_p ())
2183 dump_printf_loc (MSG_NOTE, vect_location,
2184 "vectorization_factor = ");
2185 dump_dec (MSG_NOTE, vectorization_factor);
2186 dump_printf (MSG_NOTE, ", niters = %wd\n",
2187 LOOP_VINFO_INT_NITERS (loop_vinfo));
2190 /* Analyze the alignment of the data-refs in the loop.
2191 Fail if a data reference is found that cannot be vectorized. */
2193 ok = vect_analyze_data_refs_alignment (loop_vinfo);
2194 if (!ok)
2196 if (dump_enabled_p ())
2197 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2198 "bad data alignment.\n");
2199 return ok;
2202 /* Prune the list of ddrs to be tested at run-time by versioning for alias.
2203 It is important to call pruning after vect_analyze_data_ref_accesses,
2204 since we use grouping information gathered by interleaving analysis. */
2205 ok = vect_prune_runtime_alias_test_list (loop_vinfo);
2206 if (!ok)
2207 return ok;
2209 /* Do not invoke vect_enhance_data_refs_alignment for epilogue
2210 vectorization, since we do not want to add extra peeling or
2211 add versioning for alignment. */
2212 if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2213 /* This pass will decide on using loop versioning and/or loop peeling in
2214 order to enhance the alignment of data references in the loop. */
2215 ok = vect_enhance_data_refs_alignment (loop_vinfo);
2216 if (!ok)
2217 return ok;
2219 if (slp)
2221 /* Analyze operations in the SLP instances. Note this may
2222 remove unsupported SLP instances which makes the above
2223 SLP kind detection invalid. */
2224 unsigned old_size = LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length ();
2225 vect_slp_analyze_operations (loop_vinfo);
2226 if (LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length () != old_size)
2228 ok = opt_result::failure_at (vect_location,
2229 "unsupported SLP instances\n");
2230 goto again;
2234 /* Dissolve SLP-only groups. */
2235 vect_dissolve_slp_only_groups (loop_vinfo);
2237 /* Scan all the remaining operations in the loop that are not subject
2238 to SLP and make sure they are vectorizable. */
2239 ok = vect_analyze_loop_operations (loop_vinfo);
2240 if (!ok)
2242 if (dump_enabled_p ())
2243 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2244 "bad operation or unsupported loop bound.\n");
2245 return ok;
2248 /* For now, we don't expect to mix both masking and length approaches for one
2249 loop, disable it if both are recorded. */
2250 if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2251 && !LOOP_VINFO_MASKS (loop_vinfo).is_empty ()
2252 && !LOOP_VINFO_LENS (loop_vinfo).is_empty ())
2254 if (dump_enabled_p ())
2255 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2256 "can't vectorize a loop with partial vectors"
2257 " because we don't expect to mix different"
2258 " approaches with partial vectors for the"
2259 " same loop.\n");
2260 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
2263 /* Decide whether to vectorize a loop with partial vectors for
2264 this vectorization factor. */
2265 if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
2267 if (param_vect_partial_vector_usage == 0)
2268 LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = false;
2269 else if (vect_verify_full_masking (loop_vinfo)
2270 || vect_verify_loop_lens (loop_vinfo))
2272 /* The epilogue and other known niters less than VF
2273 cases can still use vector access with length fully. */
2274 if (param_vect_partial_vector_usage == 1
2275 && !LOOP_VINFO_EPILOGUE_P (loop_vinfo)
2276 && !vect_known_niters_smaller_than_vf (loop_vinfo))
2278 LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = false;
2279 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P (loop_vinfo) = true;
2281 else
2282 LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = true;
2284 else
2285 LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = false;
2287 else
2288 LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = false;
2290 if (dump_enabled_p ())
2292 if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
2293 dump_printf_loc (MSG_NOTE, vect_location,
2294 "operating on partial vectors.\n");
2295 else
2296 dump_printf_loc (MSG_NOTE, vect_location,
2297 "operating only on full vectors.\n");
2300 /* If epilog loop is required because of data accesses with gaps,
2301 one additional iteration needs to be peeled. Check if there is
2302 enough iterations for vectorization. */
2303 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2304 && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2305 && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
2307 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2308 tree scalar_niters = LOOP_VINFO_NITERSM1 (loop_vinfo);
2310 if (known_lt (wi::to_widest (scalar_niters), vf))
2311 return opt_result::failure_at (vect_location,
2312 "loop has no enough iterations to"
2313 " support peeling for gaps.\n");
2316 /* If we're vectorizing an epilogue loop, the vectorized loop either needs
2317 to be able to handle fewer than VF scalars, or needs to have a lower VF
2318 than the main loop. */
2319 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo)
2320 && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
2321 && maybe_ge (LOOP_VINFO_VECT_FACTOR (loop_vinfo),
2322 LOOP_VINFO_VECT_FACTOR (orig_loop_vinfo)))
2323 return opt_result::failure_at (vect_location,
2324 "Vectorization factor too high for"
2325 " epilogue loop.\n");
2327 /* Check the costings of the loop make vectorizing worthwhile. */
2328 res = vect_analyze_loop_costing (loop_vinfo);
2329 if (res < 0)
2331 ok = opt_result::failure_at (vect_location,
2332 "Loop costings may not be worthwhile.\n");
2333 goto again;
2335 if (!res)
2336 return opt_result::failure_at (vect_location,
2337 "Loop costings not worthwhile.\n");
2339 determine_peel_for_niter (loop_vinfo);
2340 /* If an epilogue loop is required make sure we can create one. */
2341 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2342 || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo))
2344 if (dump_enabled_p ())
2345 dump_printf_loc (MSG_NOTE, vect_location, "epilog loop required\n");
2346 if (!vect_can_advance_ivs_p (loop_vinfo)
2347 || !slpeel_can_duplicate_loop_p (LOOP_VINFO_LOOP (loop_vinfo),
2348 single_exit (LOOP_VINFO_LOOP
2349 (loop_vinfo))))
2351 ok = opt_result::failure_at (vect_location,
2352 "not vectorized: can't create required "
2353 "epilog loop\n");
2354 goto again;
2358 /* During peeling, we need to check if number of loop iterations is
2359 enough for both peeled prolog loop and vector loop. This check
2360 can be merged along with threshold check of loop versioning, so
2361 increase threshold for this case if necessary.
2363 If we are analyzing an epilogue we still want to check what its
2364 versioning threshold would be. If we decide to vectorize the epilogues we
2365 will want to use the lowest versioning threshold of all epilogues and main
2366 loop. This will enable us to enter a vectorized epilogue even when
2367 versioning the loop. We can't simply check whether the epilogue requires
2368 versioning though since we may have skipped some versioning checks when
2369 analyzing the epilogue. For instance, checks for alias versioning will be
2370 skipped when dealing with epilogues as we assume we already checked them
2371 for the main loop. So instead we always check the 'orig_loop_vinfo'. */
2372 if (LOOP_REQUIRES_VERSIONING (orig_loop_vinfo))
2374 poly_uint64 niters_th = 0;
2375 unsigned int th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
2377 if (!vect_use_loop_mask_for_alignment_p (loop_vinfo))
2379 /* Niters for peeled prolog loop. */
2380 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
2382 dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
2383 tree vectype = STMT_VINFO_VECTYPE (dr_info->stmt);
2384 niters_th += TYPE_VECTOR_SUBPARTS (vectype) - 1;
2386 else
2387 niters_th += LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
2390 /* Niters for at least one iteration of vectorized loop. */
2391 if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
2392 niters_th += LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2393 /* One additional iteration because of peeling for gap. */
2394 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
2395 niters_th += 1;
2397 /* Use the same condition as vect_transform_loop to decide when to use
2398 the cost to determine a versioning threshold. */
2399 if (vect_apply_runtime_profitability_check_p (loop_vinfo)
2400 && ordered_p (th, niters_th))
2401 niters_th = ordered_max (poly_uint64 (th), niters_th);
2403 LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = niters_th;
2406 gcc_assert (known_eq (vectorization_factor,
2407 LOOP_VINFO_VECT_FACTOR (loop_vinfo)));
2409 /* Ok to vectorize! */
2410 return opt_result::success ();
2412 again:
2413 /* Ensure that "ok" is false (with an opt_problem if dumping is enabled). */
2414 gcc_assert (!ok);
2416 /* Try again with SLP forced off but if we didn't do any SLP there is
2417 no point in re-trying. */
2418 if (!slp)
2419 return ok;
2421 /* If there are reduction chains re-trying will fail anyway. */
2422 if (! LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).is_empty ())
2423 return ok;
2425 /* Likewise if the grouped loads or stores in the SLP cannot be handled
2426 via interleaving or lane instructions. */
2427 slp_instance instance;
2428 slp_tree node;
2429 unsigned i, j;
2430 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
2432 stmt_vec_info vinfo;
2433 vinfo = SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (instance))[0];
2434 if (! STMT_VINFO_GROUPED_ACCESS (vinfo))
2435 continue;
2436 vinfo = DR_GROUP_FIRST_ELEMENT (vinfo);
2437 unsigned int size = DR_GROUP_SIZE (vinfo);
2438 tree vectype = STMT_VINFO_VECTYPE (vinfo);
2439 if (! vect_store_lanes_supported (vectype, size, false)
2440 && ! known_eq (TYPE_VECTOR_SUBPARTS (vectype), 1U)
2441 && ! vect_grouped_store_supported (vectype, size))
2442 return opt_result::failure_at (vinfo->stmt,
2443 "unsupported grouped store\n");
2444 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), j, node)
2446 vinfo = SLP_TREE_SCALAR_STMTS (node)[0];
2447 vinfo = DR_GROUP_FIRST_ELEMENT (vinfo);
2448 bool single_element_p = !DR_GROUP_NEXT_ELEMENT (vinfo);
2449 size = DR_GROUP_SIZE (vinfo);
2450 vectype = STMT_VINFO_VECTYPE (vinfo);
2451 if (! vect_load_lanes_supported (vectype, size, false)
2452 && ! vect_grouped_load_supported (vectype, single_element_p,
2453 size))
2454 return opt_result::failure_at (vinfo->stmt,
2455 "unsupported grouped load\n");
2459 if (dump_enabled_p ())
2460 dump_printf_loc (MSG_NOTE, vect_location,
2461 "re-trying with SLP disabled\n");
2463 /* Roll back state appropriately. No SLP this time. */
2464 slp = false;
2465 /* Restore vectorization factor as it were without SLP. */
2466 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = saved_vectorization_factor;
2467 /* Free the SLP instances. */
2468 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), j, instance)
2469 vect_free_slp_instance (instance, false);
2470 LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
2471 /* Reset SLP type to loop_vect on all stmts. */
2472 for (i = 0; i < LOOP_VINFO_LOOP (loop_vinfo)->num_nodes; ++i)
2474 basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
2475 for (gimple_stmt_iterator si = gsi_start_phis (bb);
2476 !gsi_end_p (si); gsi_next (&si))
2478 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
2479 STMT_SLP_TYPE (stmt_info) = loop_vect;
2480 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
2481 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
2483 /* vectorizable_reduction adjusts reduction stmt def-types,
2484 restore them to that of the PHI. */
2485 STMT_VINFO_DEF_TYPE (STMT_VINFO_REDUC_DEF (stmt_info))
2486 = STMT_VINFO_DEF_TYPE (stmt_info);
2487 STMT_VINFO_DEF_TYPE (vect_stmt_to_vectorize
2488 (STMT_VINFO_REDUC_DEF (stmt_info)))
2489 = STMT_VINFO_DEF_TYPE (stmt_info);
2492 for (gimple_stmt_iterator si = gsi_start_bb (bb);
2493 !gsi_end_p (si); gsi_next (&si))
2495 if (is_gimple_debug (gsi_stmt (si)))
2496 continue;
2497 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
2498 STMT_SLP_TYPE (stmt_info) = loop_vect;
2499 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
2501 gimple *pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
2502 stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
2503 STMT_SLP_TYPE (stmt_info) = loop_vect;
2504 for (gimple_stmt_iterator pi = gsi_start (pattern_def_seq);
2505 !gsi_end_p (pi); gsi_next (&pi))
2506 STMT_SLP_TYPE (loop_vinfo->lookup_stmt (gsi_stmt (pi)))
2507 = loop_vect;
2511 /* Free optimized alias test DDRS. */
2512 LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).truncate (0);
2513 LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).release ();
2514 LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).release ();
2515 /* Reset target cost data. */
2516 destroy_cost_data (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo));
2517 LOOP_VINFO_TARGET_COST_DATA (loop_vinfo)
2518 = init_cost (LOOP_VINFO_LOOP (loop_vinfo));
2519 /* Reset accumulated rgroup information. */
2520 release_vec_loop_controls (&LOOP_VINFO_MASKS (loop_vinfo));
2521 release_vec_loop_controls (&LOOP_VINFO_LENS (loop_vinfo));
2522 /* Reset assorted flags. */
2523 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
2524 LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = false;
2525 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = 0;
2526 LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = 0;
2527 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2528 = saved_can_use_partial_vectors_p;
2530 goto start_over;
2533 /* Return true if vectorizing a loop using NEW_LOOP_VINFO appears
2534 to be better than vectorizing it using OLD_LOOP_VINFO. Assume that
2535 OLD_LOOP_VINFO is better unless something specifically indicates
2536 otherwise.
2538 Note that this deliberately isn't a partial order. */
2540 static bool
2541 vect_better_loop_vinfo_p (loop_vec_info new_loop_vinfo,
2542 loop_vec_info old_loop_vinfo)
2544 struct loop *loop = LOOP_VINFO_LOOP (new_loop_vinfo);
2545 gcc_assert (LOOP_VINFO_LOOP (old_loop_vinfo) == loop);
2547 poly_int64 new_vf = LOOP_VINFO_VECT_FACTOR (new_loop_vinfo);
2548 poly_int64 old_vf = LOOP_VINFO_VECT_FACTOR (old_loop_vinfo);
2550 /* Always prefer a VF of loop->simdlen over any other VF. */
2551 if (loop->simdlen)
2553 bool new_simdlen_p = known_eq (new_vf, loop->simdlen);
2554 bool old_simdlen_p = known_eq (old_vf, loop->simdlen);
2555 if (new_simdlen_p != old_simdlen_p)
2556 return new_simdlen_p;
2559 /* Limit the VFs to what is likely to be the maximum number of iterations,
2560 to handle cases in which at least one loop_vinfo is fully-masked. */
2561 HOST_WIDE_INT estimated_max_niter = likely_max_stmt_executions_int (loop);
2562 if (estimated_max_niter != -1)
2564 if (known_le (estimated_max_niter, new_vf))
2565 new_vf = estimated_max_niter;
2566 if (known_le (estimated_max_niter, old_vf))
2567 old_vf = estimated_max_niter;
2570 /* Check whether the (fractional) cost per scalar iteration is lower
2571 or higher: new_inside_cost / new_vf vs. old_inside_cost / old_vf. */
2572 poly_widest_int rel_new = (new_loop_vinfo->vec_inside_cost
2573 * poly_widest_int (old_vf));
2574 poly_widest_int rel_old = (old_loop_vinfo->vec_inside_cost
2575 * poly_widest_int (new_vf));
2576 if (maybe_lt (rel_old, rel_new))
2578 /* When old_loop_vinfo uses a variable vectorization factor,
2579 we know that it has a lower cost for at least one runtime VF.
2580 However, we don't know how likely that VF is.
2582 One option would be to compare the costs for the estimated VFs.
2583 The problem is that that can put too much pressure on the cost
2584 model. E.g. if the estimated VF is also the lowest possible VF,
2585 and if old_loop_vinfo is 1 unit worse than new_loop_vinfo
2586 for the estimated VF, we'd then choose new_loop_vinfo even
2587 though (a) new_loop_vinfo might not actually be better than
2588 old_loop_vinfo for that VF and (b) it would be significantly
2589 worse at larger VFs.
2591 Here we go for a hacky compromise: pick new_loop_vinfo if it is
2592 no more expensive than old_loop_vinfo even after doubling the
2593 estimated old_loop_vinfo VF. For all but trivial loops, this
2594 ensures that we only pick new_loop_vinfo if it is significantly
2595 better than old_loop_vinfo at the estimated VF. */
2596 if (rel_new.is_constant ())
2597 return false;
2599 HOST_WIDE_INT new_estimated_vf = estimated_poly_value (new_vf);
2600 HOST_WIDE_INT old_estimated_vf = estimated_poly_value (old_vf);
2601 widest_int estimated_rel_new = (new_loop_vinfo->vec_inside_cost
2602 * widest_int (old_estimated_vf));
2603 widest_int estimated_rel_old = (old_loop_vinfo->vec_inside_cost
2604 * widest_int (new_estimated_vf));
2605 return estimated_rel_new * 2 <= estimated_rel_old;
2607 if (known_lt (rel_new, rel_old))
2608 return true;
2610 /* If there's nothing to choose between the loop bodies, see whether
2611 there's a difference in the prologue and epilogue costs. */
2612 if (new_loop_vinfo->vec_outside_cost != old_loop_vinfo->vec_outside_cost)
2613 return new_loop_vinfo->vec_outside_cost < old_loop_vinfo->vec_outside_cost;
2615 return false;
2618 /* Decide whether to replace OLD_LOOP_VINFO with NEW_LOOP_VINFO. Return
2619 true if we should. */
2621 static bool
2622 vect_joust_loop_vinfos (loop_vec_info new_loop_vinfo,
2623 loop_vec_info old_loop_vinfo)
2625 if (!vect_better_loop_vinfo_p (new_loop_vinfo, old_loop_vinfo))
2626 return false;
2628 if (dump_enabled_p ())
2629 dump_printf_loc (MSG_NOTE, vect_location,
2630 "***** Preferring vector mode %s to vector mode %s\n",
2631 GET_MODE_NAME (new_loop_vinfo->vector_mode),
2632 GET_MODE_NAME (old_loop_vinfo->vector_mode));
2633 return true;
2636 /* Function vect_analyze_loop.
2638 Apply a set of analyses on LOOP, and create a loop_vec_info struct
2639 for it. The different analyses will record information in the
2640 loop_vec_info struct. */
2641 opt_loop_vec_info
2642 vect_analyze_loop (class loop *loop, vec_info_shared *shared)
2644 auto_vector_modes vector_modes;
2646 /* Autodetect first vector size we try. */
2647 unsigned int autovec_flags
2648 = targetm.vectorize.autovectorize_vector_modes (&vector_modes,
2649 loop->simdlen != 0);
2650 unsigned int mode_i = 0;
2652 DUMP_VECT_SCOPE ("analyze_loop_nest");
2654 if (loop_outer (loop)
2655 && loop_vec_info_for_loop (loop_outer (loop))
2656 && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop))))
2657 return opt_loop_vec_info::failure_at (vect_location,
2658 "outer-loop already vectorized.\n");
2660 if (!find_loop_nest (loop, &shared->loop_nest))
2661 return opt_loop_vec_info::failure_at
2662 (vect_location,
2663 "not vectorized: loop nest containing two or more consecutive inner"
2664 " loops cannot be vectorized\n");
2666 unsigned n_stmts = 0;
2667 machine_mode autodetected_vector_mode = VOIDmode;
2668 opt_loop_vec_info first_loop_vinfo = opt_loop_vec_info::success (NULL);
2669 machine_mode next_vector_mode = VOIDmode;
2670 poly_uint64 lowest_th = 0;
2671 unsigned vectorized_loops = 0;
2672 bool pick_lowest_cost_p = ((autovec_flags & VECT_COMPARE_COSTS)
2673 && !unlimited_cost_model (loop));
2675 bool vect_epilogues = false;
2676 opt_result res = opt_result::success ();
2677 unsigned HOST_WIDE_INT simdlen = loop->simdlen;
2678 while (1)
2680 /* Check the CFG characteristics of the loop (nesting, entry/exit). */
2681 opt_loop_vec_info loop_vinfo = vect_analyze_loop_form (loop, shared);
2682 if (!loop_vinfo)
2684 if (dump_enabled_p ())
2685 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2686 "bad loop form.\n");
2687 gcc_checking_assert (first_loop_vinfo == NULL);
2688 return loop_vinfo;
2690 loop_vinfo->vector_mode = next_vector_mode;
2692 bool fatal = false;
2694 /* When pick_lowest_cost_p is true, we should in principle iterate
2695 over all the loop_vec_infos that LOOP_VINFO could replace and
2696 try to vectorize LOOP_VINFO under the same conditions.
2697 E.g. when trying to replace an epilogue loop, we should vectorize
2698 LOOP_VINFO as an epilogue loop with the same VF limit. When trying
2699 to replace the main loop, we should vectorize LOOP_VINFO as a main
2700 loop too.
2702 However, autovectorize_vector_modes is usually sorted as follows:
2704 - Modes that naturally produce lower VFs usually follow modes that
2705 naturally produce higher VFs.
2707 - When modes naturally produce the same VF, maskable modes
2708 usually follow unmaskable ones, so that the maskable mode
2709 can be used to vectorize the epilogue of the unmaskable mode.
2711 This order is preferred because it leads to the maximum
2712 epilogue vectorization opportunities. Targets should only use
2713 a different order if they want to make wide modes available while
2714 disparaging them relative to earlier, smaller modes. The assumption
2715 in that case is that the wider modes are more expensive in some
2716 way that isn't reflected directly in the costs.
2718 There should therefore be few interesting cases in which
2719 LOOP_VINFO fails when treated as an epilogue loop, succeeds when
2720 treated as a standalone loop, and ends up being genuinely cheaper
2721 than FIRST_LOOP_VINFO. */
2722 if (vect_epilogues)
2723 LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = first_loop_vinfo;
2725 res = vect_analyze_loop_2 (loop_vinfo, fatal, &n_stmts);
2726 if (mode_i == 0)
2727 autodetected_vector_mode = loop_vinfo->vector_mode;
2728 if (dump_enabled_p ())
2730 if (res)
2731 dump_printf_loc (MSG_NOTE, vect_location,
2732 "***** Analysis succeeded with vector mode %s\n",
2733 GET_MODE_NAME (loop_vinfo->vector_mode));
2734 else
2735 dump_printf_loc (MSG_NOTE, vect_location,
2736 "***** Analysis failed with vector mode %s\n",
2737 GET_MODE_NAME (loop_vinfo->vector_mode));
2740 loop->aux = NULL;
2742 if (!fatal)
2743 while (mode_i < vector_modes.length ()
2744 && vect_chooses_same_modes_p (loop_vinfo, vector_modes[mode_i]))
2746 if (dump_enabled_p ())
2747 dump_printf_loc (MSG_NOTE, vect_location,
2748 "***** The result for vector mode %s would"
2749 " be the same\n",
2750 GET_MODE_NAME (vector_modes[mode_i]));
2751 mode_i += 1;
2754 if (res)
2756 LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1;
2757 vectorized_loops++;
2759 /* Once we hit the desired simdlen for the first time,
2760 discard any previous attempts. */
2761 if (simdlen
2762 && known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), simdlen))
2764 delete first_loop_vinfo;
2765 first_loop_vinfo = opt_loop_vec_info::success (NULL);
2766 LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = NULL;
2767 simdlen = 0;
2769 else if (pick_lowest_cost_p && first_loop_vinfo)
2771 /* Keep trying to roll back vectorization attempts while the
2772 loop_vec_infos they produced were worse than this one. */
2773 vec<loop_vec_info> &vinfos = first_loop_vinfo->epilogue_vinfos;
2774 while (!vinfos.is_empty ()
2775 && vect_joust_loop_vinfos (loop_vinfo, vinfos.last ()))
2777 gcc_assert (vect_epilogues);
2778 delete vinfos.pop ();
2780 if (vinfos.is_empty ()
2781 && vect_joust_loop_vinfos (loop_vinfo, first_loop_vinfo))
2783 delete first_loop_vinfo;
2784 first_loop_vinfo = opt_loop_vec_info::success (NULL);
2785 LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = NULL;
2789 if (first_loop_vinfo == NULL)
2791 first_loop_vinfo = loop_vinfo;
2792 lowest_th = LOOP_VINFO_VERSIONING_THRESHOLD (first_loop_vinfo);
2794 else if (vect_epilogues
2795 /* For now only allow one epilogue loop. */
2796 && first_loop_vinfo->epilogue_vinfos.is_empty ())
2798 first_loop_vinfo->epilogue_vinfos.safe_push (loop_vinfo);
2799 poly_uint64 th = LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo);
2800 gcc_assert (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
2801 || maybe_ne (lowest_th, 0U));
2802 /* Keep track of the known smallest versioning
2803 threshold. */
2804 if (ordered_p (lowest_th, th))
2805 lowest_th = ordered_min (lowest_th, th);
2807 else
2809 delete loop_vinfo;
2810 loop_vinfo = opt_loop_vec_info::success (NULL);
2813 /* Only vectorize epilogues if PARAM_VECT_EPILOGUES_NOMASK is
2814 enabled, SIMDUID is not set, it is the innermost loop and we have
2815 either already found the loop's SIMDLEN or there was no SIMDLEN to
2816 begin with.
2817 TODO: Enable epilogue vectorization for loops with SIMDUID set. */
2818 vect_epilogues = (!simdlen
2819 && loop->inner == NULL
2820 && param_vect_epilogues_nomask
2821 && LOOP_VINFO_PEELING_FOR_NITER (first_loop_vinfo)
2822 && !loop->simduid
2823 /* For now only allow one epilogue loop, but allow
2824 pick_lowest_cost_p to replace it. */
2825 && (first_loop_vinfo->epilogue_vinfos.is_empty ()
2826 || pick_lowest_cost_p));
2828 /* Commit to first_loop_vinfo if we have no reason to try
2829 alternatives. */
2830 if (!simdlen && !vect_epilogues && !pick_lowest_cost_p)
2831 break;
2833 else
2835 delete loop_vinfo;
2836 loop_vinfo = opt_loop_vec_info::success (NULL);
2837 if (fatal)
2839 gcc_checking_assert (first_loop_vinfo == NULL);
2840 break;
2844 /* Handle the case that the original loop can use partial
2845 vectorization, but want to only adopt it for the epilogue.
2846 The retry should be in the same mode as original. */
2847 if (vect_epilogues
2848 && loop_vinfo
2849 && LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P (loop_vinfo))
2851 gcc_assert (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2852 && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo));
2853 if (dump_enabled_p ())
2854 dump_printf_loc (MSG_NOTE, vect_location,
2855 "***** Re-trying analysis with same vector mode"
2856 " %s for epilogue with partial vectors.\n",
2857 GET_MODE_NAME (loop_vinfo->vector_mode));
2858 continue;
2861 if (mode_i < vector_modes.length ()
2862 && VECTOR_MODE_P (autodetected_vector_mode)
2863 && (related_vector_mode (vector_modes[mode_i],
2864 GET_MODE_INNER (autodetected_vector_mode))
2865 == autodetected_vector_mode)
2866 && (related_vector_mode (autodetected_vector_mode,
2867 GET_MODE_INNER (vector_modes[mode_i]))
2868 == vector_modes[mode_i]))
2870 if (dump_enabled_p ())
2871 dump_printf_loc (MSG_NOTE, vect_location,
2872 "***** Skipping vector mode %s, which would"
2873 " repeat the analysis for %s\n",
2874 GET_MODE_NAME (vector_modes[mode_i]),
2875 GET_MODE_NAME (autodetected_vector_mode));
2876 mode_i += 1;
2879 if (mode_i == vector_modes.length ()
2880 || autodetected_vector_mode == VOIDmode)
2881 break;
2883 /* Try the next biggest vector size. */
2884 next_vector_mode = vector_modes[mode_i++];
2885 if (dump_enabled_p ())
2886 dump_printf_loc (MSG_NOTE, vect_location,
2887 "***** Re-trying analysis with vector mode %s\n",
2888 GET_MODE_NAME (next_vector_mode));
2891 if (first_loop_vinfo)
2893 loop->aux = (loop_vec_info) first_loop_vinfo;
2894 if (dump_enabled_p ())
2895 dump_printf_loc (MSG_NOTE, vect_location,
2896 "***** Choosing vector mode %s\n",
2897 GET_MODE_NAME (first_loop_vinfo->vector_mode));
2898 LOOP_VINFO_VERSIONING_THRESHOLD (first_loop_vinfo) = lowest_th;
2899 return first_loop_vinfo;
2902 return opt_loop_vec_info::propagate_failure (res);
2905 /* Return true if there is an in-order reduction function for CODE, storing
2906 it in *REDUC_FN if so. */
2908 static bool
2909 fold_left_reduction_fn (tree_code code, internal_fn *reduc_fn)
2911 switch (code)
2913 case PLUS_EXPR:
2914 *reduc_fn = IFN_FOLD_LEFT_PLUS;
2915 return true;
2917 default:
2918 return false;
2922 /* Function reduction_fn_for_scalar_code
2924 Input:
2925 CODE - tree_code of a reduction operations.
2927 Output:
2928 REDUC_FN - the corresponding internal function to be used to reduce the
2929 vector of partial results into a single scalar result, or IFN_LAST
2930 if the operation is a supported reduction operation, but does not have
2931 such an internal function.
2933 Return FALSE if CODE currently cannot be vectorized as reduction. */
2935 static bool
2936 reduction_fn_for_scalar_code (enum tree_code code, internal_fn *reduc_fn)
2938 switch (code)
2940 case MAX_EXPR:
2941 *reduc_fn = IFN_REDUC_MAX;
2942 return true;
2944 case MIN_EXPR:
2945 *reduc_fn = IFN_REDUC_MIN;
2946 return true;
2948 case PLUS_EXPR:
2949 *reduc_fn = IFN_REDUC_PLUS;
2950 return true;
2952 case BIT_AND_EXPR:
2953 *reduc_fn = IFN_REDUC_AND;
2954 return true;
2956 case BIT_IOR_EXPR:
2957 *reduc_fn = IFN_REDUC_IOR;
2958 return true;
2960 case BIT_XOR_EXPR:
2961 *reduc_fn = IFN_REDUC_XOR;
2962 return true;
2964 case MULT_EXPR:
2965 case MINUS_EXPR:
2966 *reduc_fn = IFN_LAST;
2967 return true;
2969 default:
2970 return false;
2974 /* If there is a neutral value X such that SLP reduction NODE would not
2975 be affected by the introduction of additional X elements, return that X,
2976 otherwise return null. CODE is the code of the reduction and VECTOR_TYPE
2977 is the vector type that would hold element X. REDUC_CHAIN is true if
2978 the SLP statements perform a single reduction, false if each statement
2979 performs an independent reduction. */
2981 static tree
2982 neutral_op_for_slp_reduction (slp_tree slp_node, tree vector_type,
2983 tree_code code, bool reduc_chain)
2985 vec<stmt_vec_info> stmts = SLP_TREE_SCALAR_STMTS (slp_node);
2986 stmt_vec_info stmt_vinfo = stmts[0];
2987 tree scalar_type = TREE_TYPE (vector_type);
2988 class loop *loop = gimple_bb (stmt_vinfo->stmt)->loop_father;
2989 gcc_assert (loop);
2991 switch (code)
2993 case WIDEN_SUM_EXPR:
2994 case DOT_PROD_EXPR:
2995 case SAD_EXPR:
2996 case PLUS_EXPR:
2997 case MINUS_EXPR:
2998 case BIT_IOR_EXPR:
2999 case BIT_XOR_EXPR:
3000 return build_zero_cst (scalar_type);
3002 case MULT_EXPR:
3003 return build_one_cst (scalar_type);
3005 case BIT_AND_EXPR:
3006 return build_all_ones_cst (scalar_type);
3008 case MAX_EXPR:
3009 case MIN_EXPR:
3010 /* For MIN/MAX the initial values are neutral. A reduction chain
3011 has only a single initial value, so that value is neutral for
3012 all statements. */
3013 if (reduc_chain)
3014 return PHI_ARG_DEF_FROM_EDGE (stmt_vinfo->stmt,
3015 loop_preheader_edge (loop));
3016 return NULL_TREE;
3018 default:
3019 return NULL_TREE;
3023 /* Error reporting helper for vect_is_simple_reduction below. GIMPLE statement
3024 STMT is printed with a message MSG. */
3026 static void
3027 report_vect_op (dump_flags_t msg_type, gimple *stmt, const char *msg)
3029 dump_printf_loc (msg_type, vect_location, "%s%G", msg, stmt);
3032 /* Return true if we need an in-order reduction for operation CODE
3033 on type TYPE. NEED_WRAPPING_INTEGRAL_OVERFLOW is true if integer
3034 overflow must wrap. */
3036 bool
3037 needs_fold_left_reduction_p (tree type, tree_code code)
3039 /* CHECKME: check for !flag_finite_math_only too? */
3040 if (SCALAR_FLOAT_TYPE_P (type))
3041 switch (code)
3043 case MIN_EXPR:
3044 case MAX_EXPR:
3045 return false;
3047 default:
3048 return !flag_associative_math;
3051 if (INTEGRAL_TYPE_P (type))
3053 if (!operation_no_trapping_overflow (type, code))
3054 return true;
3055 return false;
3058 if (SAT_FIXED_POINT_TYPE_P (type))
3059 return true;
3061 return false;
3064 /* Return true if the reduction PHI in LOOP with latch arg LOOP_ARG and
3065 has a handled computation expression. Store the main reduction
3066 operation in *CODE. */
3068 static bool
3069 check_reduction_path (dump_user_location_t loc, loop_p loop, gphi *phi,
3070 tree loop_arg, enum tree_code *code,
3071 vec<std::pair<ssa_op_iter, use_operand_p> > &path)
3073 auto_bitmap visited;
3074 tree lookfor = PHI_RESULT (phi);
3075 ssa_op_iter curri;
3076 use_operand_p curr = op_iter_init_phiuse (&curri, phi, SSA_OP_USE);
3077 while (USE_FROM_PTR (curr) != loop_arg)
3078 curr = op_iter_next_use (&curri);
3079 curri.i = curri.numops;
3082 path.safe_push (std::make_pair (curri, curr));
3083 tree use = USE_FROM_PTR (curr);
3084 if (use == lookfor)
3085 break;
3086 gimple *def = SSA_NAME_DEF_STMT (use);
3087 if (gimple_nop_p (def)
3088 || ! flow_bb_inside_loop_p (loop, gimple_bb (def)))
3090 pop:
3093 std::pair<ssa_op_iter, use_operand_p> x = path.pop ();
3094 curri = x.first;
3095 curr = x.second;
3097 curr = op_iter_next_use (&curri);
3098 /* Skip already visited or non-SSA operands (from iterating
3099 over PHI args). */
3100 while (curr != NULL_USE_OPERAND_P
3101 && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
3102 || ! bitmap_set_bit (visited,
3103 SSA_NAME_VERSION
3104 (USE_FROM_PTR (curr)))));
3106 while (curr == NULL_USE_OPERAND_P && ! path.is_empty ());
3107 if (curr == NULL_USE_OPERAND_P)
3108 break;
3110 else
3112 if (gimple_code (def) == GIMPLE_PHI)
3113 curr = op_iter_init_phiuse (&curri, as_a <gphi *>(def), SSA_OP_USE);
3114 else
3115 curr = op_iter_init_use (&curri, def, SSA_OP_USE);
3116 while (curr != NULL_USE_OPERAND_P
3117 && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
3118 || ! bitmap_set_bit (visited,
3119 SSA_NAME_VERSION
3120 (USE_FROM_PTR (curr)))))
3121 curr = op_iter_next_use (&curri);
3122 if (curr == NULL_USE_OPERAND_P)
3123 goto pop;
3126 while (1);
3127 if (dump_file && (dump_flags & TDF_DETAILS))
3129 dump_printf_loc (MSG_NOTE, loc, "reduction path: ");
3130 unsigned i;
3131 std::pair<ssa_op_iter, use_operand_p> *x;
3132 FOR_EACH_VEC_ELT (path, i, x)
3133 dump_printf (MSG_NOTE, "%T ", USE_FROM_PTR (x->second));
3134 dump_printf (MSG_NOTE, "\n");
3137 /* Check whether the reduction path detected is valid. */
3138 bool fail = path.length () == 0;
3139 bool neg = false;
3140 int sign = -1;
3141 *code = ERROR_MARK;
3142 for (unsigned i = 1; i < path.length (); ++i)
3144 gimple *use_stmt = USE_STMT (path[i].second);
3145 tree op = USE_FROM_PTR (path[i].second);
3146 if (! is_gimple_assign (use_stmt)
3147 /* The following make sure we can compute the operand index
3148 easily plus it mostly disallows chaining via COND_EXPR condition
3149 operands. */
3150 || (gimple_assign_rhs1_ptr (use_stmt) != path[i].second->use
3151 && (gimple_num_ops (use_stmt) <= 2
3152 || gimple_assign_rhs2_ptr (use_stmt) != path[i].second->use)
3153 && (gimple_num_ops (use_stmt) <= 3
3154 || gimple_assign_rhs3_ptr (use_stmt) != path[i].second->use)))
3156 fail = true;
3157 break;
3159 /* Check there's only a single stmt the op is used on inside
3160 of the loop. */
3161 imm_use_iterator imm_iter;
3162 gimple *op_use_stmt;
3163 unsigned cnt = 0;
3164 FOR_EACH_IMM_USE_STMT (op_use_stmt, imm_iter, op)
3165 if (!is_gimple_debug (op_use_stmt)
3166 && flow_bb_inside_loop_p (loop, gimple_bb (op_use_stmt)))
3168 /* We want to allow x + x but not x < 1 ? x : 2. */
3169 if (is_gimple_assign (op_use_stmt)
3170 && gimple_assign_rhs_code (op_use_stmt) == COND_EXPR)
3172 use_operand_p use_p;
3173 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
3174 cnt++;
3176 else
3177 cnt++;
3179 if (cnt != 1)
3181 fail = true;
3182 break;
3184 tree_code use_code = gimple_assign_rhs_code (use_stmt);
3185 if (use_code == MINUS_EXPR)
3187 use_code = PLUS_EXPR;
3188 /* Track whether we negate the reduction value each iteration. */
3189 if (gimple_assign_rhs2 (use_stmt) == op)
3190 neg = ! neg;
3192 if (CONVERT_EXPR_CODE_P (use_code)
3193 && tree_nop_conversion_p (TREE_TYPE (gimple_assign_lhs (use_stmt)),
3194 TREE_TYPE (gimple_assign_rhs1 (use_stmt))))
3196 else if (*code == ERROR_MARK)
3198 *code = use_code;
3199 sign = TYPE_SIGN (TREE_TYPE (gimple_assign_lhs (use_stmt)));
3201 else if (use_code != *code)
3203 fail = true;
3204 break;
3206 else if ((use_code == MIN_EXPR
3207 || use_code == MAX_EXPR)
3208 && sign != TYPE_SIGN (TREE_TYPE (gimple_assign_lhs (use_stmt))))
3210 fail = true;
3211 break;
3214 return ! fail && ! neg && *code != ERROR_MARK;
3217 bool
3218 check_reduction_path (dump_user_location_t loc, loop_p loop, gphi *phi,
3219 tree loop_arg, enum tree_code code)
3221 auto_vec<std::pair<ssa_op_iter, use_operand_p> > path;
3222 enum tree_code code_;
3223 return (check_reduction_path (loc, loop, phi, loop_arg, &code_, path)
3224 && code_ == code);
3229 /* Function vect_is_simple_reduction
3231 (1) Detect a cross-iteration def-use cycle that represents a simple
3232 reduction computation. We look for the following pattern:
3234 loop_header:
3235 a1 = phi < a0, a2 >
3236 a3 = ...
3237 a2 = operation (a3, a1)
3241 a3 = ...
3242 loop_header:
3243 a1 = phi < a0, a2 >
3244 a2 = operation (a3, a1)
3246 such that:
3247 1. operation is commutative and associative and it is safe to
3248 change the order of the computation
3249 2. no uses for a2 in the loop (a2 is used out of the loop)
3250 3. no uses of a1 in the loop besides the reduction operation
3251 4. no uses of a1 outside the loop.
3253 Conditions 1,4 are tested here.
3254 Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.
3256 (2) Detect a cross-iteration def-use cycle in nested loops, i.e.,
3257 nested cycles.
3259 (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double
3260 reductions:
3262 a1 = phi < a0, a2 >
3263 inner loop (def of a3)
3264 a2 = phi < a3 >
3266 (4) Detect condition expressions, ie:
3267 for (int i = 0; i < N; i++)
3268 if (a[i] < val)
3269 ret_val = a[i];
3273 static stmt_vec_info
3274 vect_is_simple_reduction (loop_vec_info loop_info, stmt_vec_info phi_info,
3275 bool *double_reduc, bool *reduc_chain_p)
3277 gphi *phi = as_a <gphi *> (phi_info->stmt);
3278 gimple *phi_use_stmt = NULL;
3279 imm_use_iterator imm_iter;
3280 use_operand_p use_p;
3282 *double_reduc = false;
3283 *reduc_chain_p = false;
3284 STMT_VINFO_REDUC_TYPE (phi_info) = TREE_CODE_REDUCTION;
3286 tree phi_name = PHI_RESULT (phi);
3287 /* ??? If there are no uses of the PHI result the inner loop reduction
3288 won't be detected as possibly double-reduction by vectorizable_reduction
3289 because that tries to walk the PHI arg from the preheader edge which
3290 can be constant. See PR60382. */
3291 if (has_zero_uses (phi_name))
3292 return NULL;
3293 class loop *loop = (gimple_bb (phi))->loop_father;
3294 unsigned nphi_def_loop_uses = 0;
3295 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, phi_name)
3297 gimple *use_stmt = USE_STMT (use_p);
3298 if (is_gimple_debug (use_stmt))
3299 continue;
3301 if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
3303 if (dump_enabled_p ())
3304 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3305 "intermediate value used outside loop.\n");
3307 return NULL;
3310 nphi_def_loop_uses++;
3311 phi_use_stmt = use_stmt;
3314 tree latch_def = PHI_ARG_DEF_FROM_EDGE (phi, loop_latch_edge (loop));
3315 if (TREE_CODE (latch_def) != SSA_NAME)
3317 if (dump_enabled_p ())
3318 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3319 "reduction: not ssa_name: %T\n", latch_def);
3320 return NULL;
3323 stmt_vec_info def_stmt_info = loop_info->lookup_def (latch_def);
3324 if (!def_stmt_info
3325 || !flow_bb_inside_loop_p (loop, gimple_bb (def_stmt_info->stmt)))
3326 return NULL;
3328 bool nested_in_vect_loop
3329 = flow_loop_nested_p (LOOP_VINFO_LOOP (loop_info), loop);
3330 unsigned nlatch_def_loop_uses = 0;
3331 auto_vec<gphi *, 3> lcphis;
3332 bool inner_loop_of_double_reduc = false;
3333 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, latch_def)
3335 gimple *use_stmt = USE_STMT (use_p);
3336 if (is_gimple_debug (use_stmt))
3337 continue;
3338 if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
3339 nlatch_def_loop_uses++;
3340 else
3342 /* We can have more than one loop-closed PHI. */
3343 lcphis.safe_push (as_a <gphi *> (use_stmt));
3344 if (nested_in_vect_loop
3345 && (STMT_VINFO_DEF_TYPE (loop_info->lookup_stmt (use_stmt))
3346 == vect_double_reduction_def))
3347 inner_loop_of_double_reduc = true;
3351 /* If we are vectorizing an inner reduction we are executing that
3352 in the original order only in case we are not dealing with a
3353 double reduction. */
3354 if (nested_in_vect_loop && !inner_loop_of_double_reduc)
3356 if (dump_enabled_p ())
3357 report_vect_op (MSG_NOTE, def_stmt_info->stmt,
3358 "detected nested cycle: ");
3359 return def_stmt_info;
3362 /* If this isn't a nested cycle or if the nested cycle reduction value
3363 is used ouside of the inner loop we cannot handle uses of the reduction
3364 value. */
3365 if (nlatch_def_loop_uses > 1 || nphi_def_loop_uses > 1)
3367 if (dump_enabled_p ())
3368 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3369 "reduction used in loop.\n");
3370 return NULL;
3373 /* If DEF_STMT is a phi node itself, we expect it to have a single argument
3374 defined in the inner loop. */
3375 if (gphi *def_stmt = dyn_cast <gphi *> (def_stmt_info->stmt))
3377 tree op1 = PHI_ARG_DEF (def_stmt, 0);
3378 if (gimple_phi_num_args (def_stmt) != 1
3379 || TREE_CODE (op1) != SSA_NAME)
3381 if (dump_enabled_p ())
3382 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3383 "unsupported phi node definition.\n");
3385 return NULL;
3388 gimple *def1 = SSA_NAME_DEF_STMT (op1);
3389 if (gimple_bb (def1)
3390 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
3391 && loop->inner
3392 && flow_bb_inside_loop_p (loop->inner, gimple_bb (def1))
3393 && is_gimple_assign (def1)
3394 && is_a <gphi *> (phi_use_stmt)
3395 && flow_bb_inside_loop_p (loop->inner, gimple_bb (phi_use_stmt)))
3397 if (dump_enabled_p ())
3398 report_vect_op (MSG_NOTE, def_stmt,
3399 "detected double reduction: ");
3401 *double_reduc = true;
3402 return def_stmt_info;
3405 return NULL;
3408 /* Look for the expression computing latch_def from then loop PHI result. */
3409 auto_vec<std::pair<ssa_op_iter, use_operand_p> > path;
3410 enum tree_code code;
3411 if (check_reduction_path (vect_location, loop, phi, latch_def, &code,
3412 path))
3414 STMT_VINFO_REDUC_CODE (phi_info) = code;
3415 if (code == COND_EXPR && !nested_in_vect_loop)
3416 STMT_VINFO_REDUC_TYPE (phi_info) = COND_REDUCTION;
3418 /* Fill in STMT_VINFO_REDUC_IDX and gather stmts for an SLP
3419 reduction chain for which the additional restriction is that
3420 all operations in the chain are the same. */
3421 auto_vec<stmt_vec_info, 8> reduc_chain;
3422 unsigned i;
3423 bool is_slp_reduc = !nested_in_vect_loop && code != COND_EXPR;
3424 for (i = path.length () - 1; i >= 1; --i)
3426 gimple *stmt = USE_STMT (path[i].second);
3427 stmt_vec_info stmt_info = loop_info->lookup_stmt (stmt);
3428 STMT_VINFO_REDUC_IDX (stmt_info)
3429 = path[i].second->use - gimple_assign_rhs1_ptr (stmt);
3430 enum tree_code stmt_code = gimple_assign_rhs_code (stmt);
3431 bool leading_conversion = (CONVERT_EXPR_CODE_P (stmt_code)
3432 && (i == 1 || i == path.length () - 1));
3433 if ((stmt_code != code && !leading_conversion)
3434 /* We can only handle the final value in epilogue
3435 generation for reduction chains. */
3436 || (i != 1 && !has_single_use (gimple_assign_lhs (stmt))))
3437 is_slp_reduc = false;
3438 /* For reduction chains we support a trailing/leading
3439 conversions. We do not store those in the actual chain. */
3440 if (leading_conversion)
3441 continue;
3442 reduc_chain.safe_push (stmt_info);
3444 if (is_slp_reduc && reduc_chain.length () > 1)
3446 for (unsigned i = 0; i < reduc_chain.length () - 1; ++i)
3448 REDUC_GROUP_FIRST_ELEMENT (reduc_chain[i]) = reduc_chain[0];
3449 REDUC_GROUP_NEXT_ELEMENT (reduc_chain[i]) = reduc_chain[i+1];
3451 REDUC_GROUP_FIRST_ELEMENT (reduc_chain.last ()) = reduc_chain[0];
3452 REDUC_GROUP_NEXT_ELEMENT (reduc_chain.last ()) = NULL;
3454 /* Save the chain for further analysis in SLP detection. */
3455 LOOP_VINFO_REDUCTION_CHAINS (loop_info).safe_push (reduc_chain[0]);
3456 REDUC_GROUP_SIZE (reduc_chain[0]) = reduc_chain.length ();
3458 *reduc_chain_p = true;
3459 if (dump_enabled_p ())
3460 dump_printf_loc (MSG_NOTE, vect_location,
3461 "reduction: detected reduction chain\n");
3463 else if (dump_enabled_p ())
3464 dump_printf_loc (MSG_NOTE, vect_location,
3465 "reduction: detected reduction\n");
3467 return def_stmt_info;
3470 if (dump_enabled_p ())
3471 dump_printf_loc (MSG_NOTE, vect_location,
3472 "reduction: unknown pattern\n");
3474 return NULL;
3477 /* Estimate the number of peeled epilogue iterations for LOOP_VINFO.
3478 PEEL_ITERS_PROLOGUE is the number of peeled prologue iterations,
3479 or -1 if not known. */
3481 static int
3482 vect_get_peel_iters_epilogue (loop_vec_info loop_vinfo, int peel_iters_prologue)
3484 int assumed_vf = vect_vf_for_cost (loop_vinfo);
3485 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) || peel_iters_prologue == -1)
3487 if (dump_enabled_p ())
3488 dump_printf_loc (MSG_NOTE, vect_location,
3489 "cost model: epilogue peel iters set to vf/2 "
3490 "because loop iterations are unknown .\n");
3491 return assumed_vf / 2;
3493 else
3495 int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
3496 peel_iters_prologue = MIN (niters, peel_iters_prologue);
3497 int peel_iters_epilogue = (niters - peel_iters_prologue) % assumed_vf;
3498 /* If we need to peel for gaps, but no peeling is required, we have to
3499 peel VF iterations. */
3500 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) && !peel_iters_epilogue)
3501 peel_iters_epilogue = assumed_vf;
3502 return peel_iters_epilogue;
3506 /* Calculate cost of peeling the loop PEEL_ITERS_PROLOGUE times. */
3508 vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue,
3509 int *peel_iters_epilogue,
3510 stmt_vector_for_cost *scalar_cost_vec,
3511 stmt_vector_for_cost *prologue_cost_vec,
3512 stmt_vector_for_cost *epilogue_cost_vec)
3514 int retval = 0;
3516 *peel_iters_epilogue
3517 = vect_get_peel_iters_epilogue (loop_vinfo, peel_iters_prologue);
3519 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
3521 /* If peeled iterations are known but number of scalar loop
3522 iterations are unknown, count a taken branch per peeled loop. */
3523 if (peel_iters_prologue > 0)
3524 retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
3525 NULL, NULL_TREE, 0, vect_prologue);
3526 if (*peel_iters_epilogue > 0)
3527 retval += record_stmt_cost (epilogue_cost_vec, 1, cond_branch_taken,
3528 NULL, NULL_TREE, 0, vect_epilogue);
3531 stmt_info_for_cost *si;
3532 int j;
3533 if (peel_iters_prologue)
3534 FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
3535 retval += record_stmt_cost (prologue_cost_vec,
3536 si->count * peel_iters_prologue,
3537 si->kind, si->stmt_info, si->misalign,
3538 vect_prologue);
3539 if (*peel_iters_epilogue)
3540 FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
3541 retval += record_stmt_cost (epilogue_cost_vec,
3542 si->count * *peel_iters_epilogue,
3543 si->kind, si->stmt_info, si->misalign,
3544 vect_epilogue);
3546 return retval;
3549 /* Function vect_estimate_min_profitable_iters
3551 Return the number of iterations required for the vector version of the
3552 loop to be profitable relative to the cost of the scalar version of the
3553 loop.
3555 *RET_MIN_PROFITABLE_NITERS is a cost model profitability threshold
3556 of iterations for vectorization. -1 value means loop vectorization
3557 is not profitable. This returned value may be used for dynamic
3558 profitability check.
3560 *RET_MIN_PROFITABLE_ESTIMATE is a profitability threshold to be used
3561 for static check against estimated number of iterations. */
3563 static void
3564 vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
3565 int *ret_min_profitable_niters,
3566 int *ret_min_profitable_estimate)
3568 int min_profitable_iters;
3569 int min_profitable_estimate;
3570 int peel_iters_prologue;
3571 int peel_iters_epilogue;
3572 unsigned vec_inside_cost = 0;
3573 int vec_outside_cost = 0;
3574 unsigned vec_prologue_cost = 0;
3575 unsigned vec_epilogue_cost = 0;
3576 int scalar_single_iter_cost = 0;
3577 int scalar_outside_cost = 0;
3578 int assumed_vf = vect_vf_for_cost (loop_vinfo);
3579 int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
3580 void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3582 /* Cost model disabled. */
3583 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
3585 if (dump_enabled_p ())
3586 dump_printf_loc (MSG_NOTE, vect_location, "cost model disabled.\n");
3587 *ret_min_profitable_niters = 0;
3588 *ret_min_profitable_estimate = 0;
3589 return;
3592 /* Requires loop versioning tests to handle misalignment. */
3593 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
3595 /* FIXME: Make cost depend on complexity of individual check. */
3596 unsigned len = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ();
3597 (void) add_stmt_cost (loop_vinfo, target_cost_data, len, vector_stmt,
3598 NULL, NULL_TREE, 0, vect_prologue);
3599 if (dump_enabled_p ())
3600 dump_printf (MSG_NOTE,
3601 "cost model: Adding cost of checks for loop "
3602 "versioning to treat misalignment.\n");
3605 /* Requires loop versioning with alias checks. */
3606 if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
3608 /* FIXME: Make cost depend on complexity of individual check. */
3609 unsigned len = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).length ();
3610 (void) add_stmt_cost (loop_vinfo, target_cost_data, len, vector_stmt,
3611 NULL, NULL_TREE, 0, vect_prologue);
3612 len = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).length ();
3613 if (len)
3614 /* Count LEN - 1 ANDs and LEN comparisons. */
3615 (void) add_stmt_cost (loop_vinfo, target_cost_data, len * 2 - 1,
3616 scalar_stmt, NULL, NULL_TREE, 0, vect_prologue);
3617 len = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).length ();
3618 if (len)
3620 /* Count LEN - 1 ANDs and LEN comparisons. */
3621 unsigned int nstmts = len * 2 - 1;
3622 /* +1 for each bias that needs adding. */
3623 for (unsigned int i = 0; i < len; ++i)
3624 if (!LOOP_VINFO_LOWER_BOUNDS (loop_vinfo)[i].unsigned_p)
3625 nstmts += 1;
3626 (void) add_stmt_cost (loop_vinfo, target_cost_data, nstmts,
3627 scalar_stmt, NULL, NULL_TREE, 0, vect_prologue);
3629 if (dump_enabled_p ())
3630 dump_printf (MSG_NOTE,
3631 "cost model: Adding cost of checks for loop "
3632 "versioning aliasing.\n");
3635 /* Requires loop versioning with niter checks. */
3636 if (LOOP_REQUIRES_VERSIONING_FOR_NITERS (loop_vinfo))
3638 /* FIXME: Make cost depend on complexity of individual check. */
3639 (void) add_stmt_cost (loop_vinfo, target_cost_data, 1, vector_stmt,
3640 NULL, NULL_TREE, 0, vect_prologue);
3641 if (dump_enabled_p ())
3642 dump_printf (MSG_NOTE,
3643 "cost model: Adding cost of checks for loop "
3644 "versioning niters.\n");
3647 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
3648 (void) add_stmt_cost (loop_vinfo, target_cost_data, 1, cond_branch_taken,
3649 NULL, NULL_TREE, 0, vect_prologue);
3651 /* Count statements in scalar loop. Using this as scalar cost for a single
3652 iteration for now.
3654 TODO: Add outer loop support.
3656 TODO: Consider assigning different costs to different scalar
3657 statements. */
3659 scalar_single_iter_cost
3660 = LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo);
3662 /* Add additional cost for the peeled instructions in prologue and epilogue
3663 loop. (For fully-masked loops there will be no peeling.)
3665 FORNOW: If we don't know the value of peel_iters for prologue or epilogue
3666 at compile-time - we assume it's vf/2 (the worst would be vf-1).
3668 TODO: Build an expression that represents peel_iters for prologue and
3669 epilogue to be used in a run-time test. */
3671 bool prologue_need_br_taken_cost = false;
3672 bool prologue_need_br_not_taken_cost = false;
3674 /* Calculate peel_iters_prologue. */
3675 if (vect_use_loop_mask_for_alignment_p (loop_vinfo))
3676 peel_iters_prologue = 0;
3677 else if (npeel < 0)
3679 peel_iters_prologue = assumed_vf / 2;
3680 if (dump_enabled_p ())
3681 dump_printf (MSG_NOTE, "cost model: "
3682 "prologue peel iters set to vf/2.\n");
3684 /* If peeled iterations are unknown, count a taken branch and a not taken
3685 branch per peeled loop. Even if scalar loop iterations are known,
3686 vector iterations are not known since peeled prologue iterations are
3687 not known. Hence guards remain the same. */
3688 prologue_need_br_taken_cost = true;
3689 prologue_need_br_not_taken_cost = true;
3691 else
3693 peel_iters_prologue = npeel;
3694 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && peel_iters_prologue > 0)
3695 /* If peeled iterations are known but number of scalar loop
3696 iterations are unknown, count a taken branch per peeled loop. */
3697 prologue_need_br_taken_cost = true;
3700 bool epilogue_need_br_taken_cost = false;
3701 bool epilogue_need_br_not_taken_cost = false;
3703 /* Calculate peel_iters_epilogue. */
3704 if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
3705 /* We need to peel exactly one iteration for gaps. */
3706 peel_iters_epilogue = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0;
3707 else if (npeel < 0)
3709 /* If peeling for alignment is unknown, loop bound of main loop
3710 becomes unknown. */
3711 peel_iters_epilogue = assumed_vf / 2;
3712 if (dump_enabled_p ())
3713 dump_printf (MSG_NOTE, "cost model: "
3714 "epilogue peel iters set to vf/2 because "
3715 "peeling for alignment is unknown.\n");
3717 /* See the same reason above in peel_iters_prologue calculation. */
3718 epilogue_need_br_taken_cost = true;
3719 epilogue_need_br_not_taken_cost = true;
3721 else
3723 peel_iters_epilogue = vect_get_peel_iters_epilogue (loop_vinfo, npeel);
3724 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && peel_iters_epilogue > 0)
3725 /* If peeled iterations are known but number of scalar loop
3726 iterations are unknown, count a taken branch per peeled loop. */
3727 epilogue_need_br_taken_cost = true;
3730 stmt_info_for_cost *si;
3731 int j;
3732 /* Add costs associated with peel_iters_prologue. */
3733 if (peel_iters_prologue)
3734 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
3736 (void) add_stmt_cost (loop_vinfo, target_cost_data,
3737 si->count * peel_iters_prologue, si->kind,
3738 si->stmt_info, si->vectype, si->misalign,
3739 vect_prologue);
3742 /* Add costs associated with peel_iters_epilogue. */
3743 if (peel_iters_epilogue)
3744 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
3746 (void) add_stmt_cost (loop_vinfo, target_cost_data,
3747 si->count * peel_iters_epilogue, si->kind,
3748 si->stmt_info, si->vectype, si->misalign,
3749 vect_epilogue);
3752 /* Add possible cond_branch_taken/cond_branch_not_taken cost. */
3754 if (prologue_need_br_taken_cost)
3755 (void) add_stmt_cost (loop_vinfo, target_cost_data, 1, cond_branch_taken,
3756 NULL, NULL_TREE, 0, vect_prologue);
3758 if (prologue_need_br_not_taken_cost)
3759 (void) add_stmt_cost (loop_vinfo, target_cost_data, 1,
3760 cond_branch_not_taken, NULL, NULL_TREE, 0,
3761 vect_prologue);
3763 if (epilogue_need_br_taken_cost)
3764 (void) add_stmt_cost (loop_vinfo, target_cost_data, 1, cond_branch_taken,
3765 NULL, NULL_TREE, 0, vect_epilogue);
3767 if (epilogue_need_br_not_taken_cost)
3768 (void) add_stmt_cost (loop_vinfo, target_cost_data, 1,
3769 cond_branch_not_taken, NULL, NULL_TREE, 0,
3770 vect_epilogue);
3772 /* Take care of special costs for rgroup controls of partial vectors. */
3773 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
3775 /* Calculate how many masks we need to generate. */
3776 unsigned int num_masks = 0;
3777 rgroup_controls *rgm;
3778 unsigned int num_vectors_m1;
3779 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), num_vectors_m1, rgm)
3780 if (rgm->type)
3781 num_masks += num_vectors_m1 + 1;
3782 gcc_assert (num_masks > 0);
3784 /* In the worst case, we need to generate each mask in the prologue
3785 and in the loop body. One of the loop body mask instructions
3786 replaces the comparison in the scalar loop, and since we don't
3787 count the scalar comparison against the scalar body, we shouldn't
3788 count that vector instruction against the vector body either.
3790 Sometimes we can use unpacks instead of generating prologue
3791 masks and sometimes the prologue mask will fold to a constant,
3792 so the actual prologue cost might be smaller. However, it's
3793 simpler and safer to use the worst-case cost; if this ends up
3794 being the tie-breaker between vectorizing or not, then it's
3795 probably better not to vectorize. */
3796 (void) add_stmt_cost (loop_vinfo, target_cost_data, num_masks,
3797 vector_stmt, NULL, NULL_TREE, 0, vect_prologue);
3798 (void) add_stmt_cost (loop_vinfo, target_cost_data, num_masks - 1,
3799 vector_stmt, NULL, NULL_TREE, 0, vect_body);
3801 else if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
3803 /* Referring to the functions vect_set_loop_condition_partial_vectors
3804 and vect_set_loop_controls_directly, we need to generate each
3805 length in the prologue and in the loop body if required. Although
3806 there are some possible optimizations, we consider the worst case
3807 here. */
3809 bool niters_known_p = LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo);
3810 bool need_iterate_p
3811 = (!LOOP_VINFO_EPILOGUE_P (loop_vinfo)
3812 && !vect_known_niters_smaller_than_vf (loop_vinfo));
3814 /* Calculate how many statements to be added. */
3815 unsigned int prologue_stmts = 0;
3816 unsigned int body_stmts = 0;
3818 rgroup_controls *rgc;
3819 unsigned int num_vectors_m1;
3820 FOR_EACH_VEC_ELT (LOOP_VINFO_LENS (loop_vinfo), num_vectors_m1, rgc)
3821 if (rgc->type)
3823 /* May need one SHIFT for nitems_total computation. */
3824 unsigned nitems = rgc->max_nscalars_per_iter * rgc->factor;
3825 if (nitems != 1 && !niters_known_p)
3826 prologue_stmts += 1;
3828 /* May need one MAX and one MINUS for wrap around. */
3829 if (vect_rgroup_iv_might_wrap_p (loop_vinfo, rgc))
3830 prologue_stmts += 2;
3832 /* Need one MAX and one MINUS for each batch limit excepting for
3833 the 1st one. */
3834 prologue_stmts += num_vectors_m1 * 2;
3836 unsigned int num_vectors = num_vectors_m1 + 1;
3838 /* Need to set up lengths in prologue, only one MIN required
3839 for each since start index is zero. */
3840 prologue_stmts += num_vectors;
3842 /* Each may need two MINs and one MINUS to update lengths in body
3843 for next iteration. */
3844 if (need_iterate_p)
3845 body_stmts += 3 * num_vectors;
3848 (void) add_stmt_cost (loop_vinfo, target_cost_data, prologue_stmts,
3849 scalar_stmt, NULL, NULL_TREE, 0, vect_prologue);
3850 (void) add_stmt_cost (loop_vinfo, target_cost_data, body_stmts,
3851 scalar_stmt, NULL, NULL_TREE, 0, vect_body);
3854 /* FORNOW: The scalar outside cost is incremented in one of the
3855 following ways:
3857 1. The vectorizer checks for alignment and aliasing and generates
3858 a condition that allows dynamic vectorization. A cost model
3859 check is ANDED with the versioning condition. Hence scalar code
3860 path now has the added cost of the versioning check.
3862 if (cost > th & versioning_check)
3863 jmp to vector code
3865 Hence run-time scalar is incremented by not-taken branch cost.
3867 2. The vectorizer then checks if a prologue is required. If the
3868 cost model check was not done before during versioning, it has to
3869 be done before the prologue check.
3871 if (cost <= th)
3872 prologue = scalar_iters
3873 if (prologue == 0)
3874 jmp to vector code
3875 else
3876 execute prologue
3877 if (prologue == num_iters)
3878 go to exit
3880 Hence the run-time scalar cost is incremented by a taken branch,
3881 plus a not-taken branch, plus a taken branch cost.
3883 3. The vectorizer then checks if an epilogue is required. If the
3884 cost model check was not done before during prologue check, it
3885 has to be done with the epilogue check.
3887 if (prologue == 0)
3888 jmp to vector code
3889 else
3890 execute prologue
3891 if (prologue == num_iters)
3892 go to exit
3893 vector code:
3894 if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
3895 jmp to epilogue
3897 Hence the run-time scalar cost should be incremented by 2 taken
3898 branches.
3900 TODO: The back end may reorder the BBS's differently and reverse
3901 conditions/branch directions. Change the estimates below to
3902 something more reasonable. */
3904 /* If the number of iterations is known and we do not do versioning, we can
3905 decide whether to vectorize at compile time. Hence the scalar version
3906 do not carry cost model guard costs. */
3907 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
3908 || LOOP_REQUIRES_VERSIONING (loop_vinfo))
3910 /* Cost model check occurs at versioning. */
3911 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
3912 scalar_outside_cost += vect_get_stmt_cost (cond_branch_not_taken);
3913 else
3915 /* Cost model check occurs at prologue generation. */
3916 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
3917 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken)
3918 + vect_get_stmt_cost (cond_branch_not_taken);
3919 /* Cost model check occurs at epilogue generation. */
3920 else
3921 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken);
3925 /* Complete the target-specific cost calculations. */
3926 finish_cost (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo), &vec_prologue_cost,
3927 &vec_inside_cost, &vec_epilogue_cost);
3929 vec_outside_cost = (int)(vec_prologue_cost + vec_epilogue_cost);
3931 /* Stash the costs so that we can compare two loop_vec_infos. */
3932 loop_vinfo->vec_inside_cost = vec_inside_cost;
3933 loop_vinfo->vec_outside_cost = vec_outside_cost;
3935 if (dump_enabled_p ())
3937 dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
3938 dump_printf (MSG_NOTE, " Vector inside of loop cost: %d\n",
3939 vec_inside_cost);
3940 dump_printf (MSG_NOTE, " Vector prologue cost: %d\n",
3941 vec_prologue_cost);
3942 dump_printf (MSG_NOTE, " Vector epilogue cost: %d\n",
3943 vec_epilogue_cost);
3944 dump_printf (MSG_NOTE, " Scalar iteration cost: %d\n",
3945 scalar_single_iter_cost);
3946 dump_printf (MSG_NOTE, " Scalar outside cost: %d\n",
3947 scalar_outside_cost);
3948 dump_printf (MSG_NOTE, " Vector outside cost: %d\n",
3949 vec_outside_cost);
3950 dump_printf (MSG_NOTE, " prologue iterations: %d\n",
3951 peel_iters_prologue);
3952 dump_printf (MSG_NOTE, " epilogue iterations: %d\n",
3953 peel_iters_epilogue);
3956 /* Calculate number of iterations required to make the vector version
3957 profitable, relative to the loop bodies only. The following condition
3958 must hold true:
3959 SIC * niters + SOC > VIC * ((niters - NPEEL) / VF) + VOC
3960 where
3961 SIC = scalar iteration cost, VIC = vector iteration cost,
3962 VOC = vector outside cost, VF = vectorization factor,
3963 NPEEL = prologue iterations + epilogue iterations,
3964 SOC = scalar outside cost for run time cost model check. */
3966 int saving_per_viter = (scalar_single_iter_cost * assumed_vf
3967 - vec_inside_cost);
3968 if (saving_per_viter <= 0)
3970 if (LOOP_VINFO_LOOP (loop_vinfo)->force_vectorize)
3971 warning_at (vect_location.get_location_t (), OPT_Wopenmp_simd,
3972 "vectorization did not happen for a simd loop");
3974 if (dump_enabled_p ())
3975 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3976 "cost model: the vector iteration cost = %d "
3977 "divided by the scalar iteration cost = %d "
3978 "is greater or equal to the vectorization factor = %d"
3979 ".\n",
3980 vec_inside_cost, scalar_single_iter_cost, assumed_vf);
3981 *ret_min_profitable_niters = -1;
3982 *ret_min_profitable_estimate = -1;
3983 return;
3986 /* ??? The "if" arm is written to handle all cases; see below for what
3987 we would do for !LOOP_VINFO_USING_PARTIAL_VECTORS_P. */
3988 if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
3990 /* Rewriting the condition above in terms of the number of
3991 vector iterations (vniters) rather than the number of
3992 scalar iterations (niters) gives:
3994 SIC * (vniters * VF + NPEEL) + SOC > VIC * vniters + VOC
3996 <==> vniters * (SIC * VF - VIC) > VOC - SIC * NPEEL - SOC
3998 For integer N, X and Y when X > 0:
4000 N * X > Y <==> N >= (Y /[floor] X) + 1. */
4001 int outside_overhead = (vec_outside_cost
4002 - scalar_single_iter_cost * peel_iters_prologue
4003 - scalar_single_iter_cost * peel_iters_epilogue
4004 - scalar_outside_cost);
4005 /* We're only interested in cases that require at least one
4006 vector iteration. */
4007 int min_vec_niters = 1;
4008 if (outside_overhead > 0)
4009 min_vec_niters = outside_overhead / saving_per_viter + 1;
4011 if (dump_enabled_p ())
4012 dump_printf (MSG_NOTE, " Minimum number of vector iterations: %d\n",
4013 min_vec_niters);
4015 if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
4017 /* Now that we know the minimum number of vector iterations,
4018 find the minimum niters for which the scalar cost is larger:
4020 SIC * niters > VIC * vniters + VOC - SOC
4022 We know that the minimum niters is no more than
4023 vniters * VF + NPEEL, but it might be (and often is) less
4024 than that if a partial vector iteration is cheaper than the
4025 equivalent scalar code. */
4026 int threshold = (vec_inside_cost * min_vec_niters
4027 + vec_outside_cost
4028 - scalar_outside_cost);
4029 if (threshold <= 0)
4030 min_profitable_iters = 1;
4031 else
4032 min_profitable_iters = threshold / scalar_single_iter_cost + 1;
4034 else
4035 /* Convert the number of vector iterations into a number of
4036 scalar iterations. */
4037 min_profitable_iters = (min_vec_niters * assumed_vf
4038 + peel_iters_prologue
4039 + peel_iters_epilogue);
4041 else
4043 min_profitable_iters = ((vec_outside_cost - scalar_outside_cost)
4044 * assumed_vf
4045 - vec_inside_cost * peel_iters_prologue
4046 - vec_inside_cost * peel_iters_epilogue);
4047 if (min_profitable_iters <= 0)
4048 min_profitable_iters = 0;
4049 else
4051 min_profitable_iters /= saving_per_viter;
4053 if ((scalar_single_iter_cost * assumed_vf * min_profitable_iters)
4054 <= (((int) vec_inside_cost * min_profitable_iters)
4055 + (((int) vec_outside_cost - scalar_outside_cost)
4056 * assumed_vf)))
4057 min_profitable_iters++;
4061 if (dump_enabled_p ())
4062 dump_printf (MSG_NOTE,
4063 " Calculated minimum iters for profitability: %d\n",
4064 min_profitable_iters);
4066 if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
4067 && min_profitable_iters < (assumed_vf + peel_iters_prologue))
4068 /* We want the vectorized loop to execute at least once. */
4069 min_profitable_iters = assumed_vf + peel_iters_prologue;
4070 else if (min_profitable_iters < peel_iters_prologue)
4071 /* For LOOP_VINFO_USING_PARTIAL_VECTORS_P, we need to ensure the
4072 vectorized loop executes at least once. */
4073 min_profitable_iters = peel_iters_prologue;
4075 if (dump_enabled_p ())
4076 dump_printf_loc (MSG_NOTE, vect_location,
4077 " Runtime profitability threshold = %d\n",
4078 min_profitable_iters);
4080 *ret_min_profitable_niters = min_profitable_iters;
4082 /* Calculate number of iterations required to make the vector version
4083 profitable, relative to the loop bodies only.
4085 Non-vectorized variant is SIC * niters and it must win over vector
4086 variant on the expected loop trip count. The following condition must hold true:
4087 SIC * niters > VIC * ((niters - NPEEL) / VF) + VOC + SOC */
4089 if (vec_outside_cost <= 0)
4090 min_profitable_estimate = 0;
4091 else if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
4093 /* This is a repeat of the code above, but with + SOC rather
4094 than - SOC. */
4095 int outside_overhead = (vec_outside_cost
4096 - scalar_single_iter_cost * peel_iters_prologue
4097 - scalar_single_iter_cost * peel_iters_epilogue
4098 + scalar_outside_cost);
4099 int min_vec_niters = 1;
4100 if (outside_overhead > 0)
4101 min_vec_niters = outside_overhead / saving_per_viter + 1;
4103 if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
4105 int threshold = (vec_inside_cost * min_vec_niters
4106 + vec_outside_cost
4107 + scalar_outside_cost);
4108 min_profitable_estimate = threshold / scalar_single_iter_cost + 1;
4110 else
4111 min_profitable_estimate = (min_vec_niters * assumed_vf
4112 + peel_iters_prologue
4113 + peel_iters_epilogue);
4115 else
4117 min_profitable_estimate = ((vec_outside_cost + scalar_outside_cost)
4118 * assumed_vf
4119 - vec_inside_cost * peel_iters_prologue
4120 - vec_inside_cost * peel_iters_epilogue)
4121 / ((scalar_single_iter_cost * assumed_vf)
4122 - vec_inside_cost);
4124 min_profitable_estimate = MAX (min_profitable_estimate, min_profitable_iters);
4125 if (dump_enabled_p ())
4126 dump_printf_loc (MSG_NOTE, vect_location,
4127 " Static estimate profitability threshold = %d\n",
4128 min_profitable_estimate);
4130 *ret_min_profitable_estimate = min_profitable_estimate;
4133 /* Writes into SEL a mask for a vec_perm, equivalent to a vec_shr by OFFSET
4134 vector elements (not bits) for a vector with NELT elements. */
4135 static void
4136 calc_vec_perm_mask_for_shift (unsigned int offset, unsigned int nelt,
4137 vec_perm_builder *sel)
4139 /* The encoding is a single stepped pattern. Any wrap-around is handled
4140 by vec_perm_indices. */
4141 sel->new_vector (nelt, 1, 3);
4142 for (unsigned int i = 0; i < 3; i++)
4143 sel->quick_push (i + offset);
4146 /* Checks whether the target supports whole-vector shifts for vectors of mode
4147 MODE. This is the case if _either_ the platform handles vec_shr_optab, _or_
4148 it supports vec_perm_const with masks for all necessary shift amounts. */
4149 static bool
4150 have_whole_vector_shift (machine_mode mode)
4152 if (optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
4153 return true;
4155 /* Variable-length vectors should be handled via the optab. */
4156 unsigned int nelt;
4157 if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
4158 return false;
4160 vec_perm_builder sel;
4161 vec_perm_indices indices;
4162 for (unsigned int i = nelt / 2; i >= 1; i /= 2)
4164 calc_vec_perm_mask_for_shift (i, nelt, &sel);
4165 indices.new_vector (sel, 2, nelt);
4166 if (!can_vec_perm_const_p (mode, indices, false))
4167 return false;
4169 return true;
4172 /* TODO: Close dependency between vect_model_*_cost and vectorizable_*
4173 functions. Design better to avoid maintenance issues. */
4175 /* Function vect_model_reduction_cost.
4177 Models cost for a reduction operation, including the vector ops
4178 generated within the strip-mine loop, the initial definition before
4179 the loop, and the epilogue code that must be generated. */
4181 static void
4182 vect_model_reduction_cost (loop_vec_info loop_vinfo,
4183 stmt_vec_info stmt_info, internal_fn reduc_fn,
4184 vect_reduction_type reduction_type,
4185 int ncopies, stmt_vector_for_cost *cost_vec)
4187 int prologue_cost = 0, epilogue_cost = 0, inside_cost;
4188 enum tree_code code;
4189 optab optab;
4190 tree vectype;
4191 machine_mode mode;
4192 class loop *loop = NULL;
4194 if (loop_vinfo)
4195 loop = LOOP_VINFO_LOOP (loop_vinfo);
4197 /* Condition reductions generate two reductions in the loop. */
4198 if (reduction_type == COND_REDUCTION)
4199 ncopies *= 2;
4201 vectype = STMT_VINFO_VECTYPE (stmt_info);
4202 mode = TYPE_MODE (vectype);
4203 stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
4205 code = gimple_assign_rhs_code (orig_stmt_info->stmt);
4207 if (reduction_type == EXTRACT_LAST_REDUCTION)
4208 /* No extra instructions are needed in the prologue. The loop body
4209 operations are costed in vectorizable_condition. */
4210 inside_cost = 0;
4211 else if (reduction_type == FOLD_LEFT_REDUCTION)
4213 /* No extra instructions needed in the prologue. */
4214 prologue_cost = 0;
4216 if (reduc_fn != IFN_LAST)
4217 /* Count one reduction-like operation per vector. */
4218 inside_cost = record_stmt_cost (cost_vec, ncopies, vec_to_scalar,
4219 stmt_info, 0, vect_body);
4220 else
4222 /* Use NELEMENTS extracts and NELEMENTS scalar ops. */
4223 unsigned int nelements = ncopies * vect_nunits_for_cost (vectype);
4224 inside_cost = record_stmt_cost (cost_vec, nelements,
4225 vec_to_scalar, stmt_info, 0,
4226 vect_body);
4227 inside_cost += record_stmt_cost (cost_vec, nelements,
4228 scalar_stmt, stmt_info, 0,
4229 vect_body);
4232 else
4234 /* Add in cost for initial definition.
4235 For cond reduction we have four vectors: initial index, step,
4236 initial result of the data reduction, initial value of the index
4237 reduction. */
4238 int prologue_stmts = reduction_type == COND_REDUCTION ? 4 : 1;
4239 prologue_cost += record_stmt_cost (cost_vec, prologue_stmts,
4240 scalar_to_vec, stmt_info, 0,
4241 vect_prologue);
4243 /* Cost of reduction op inside loop. */
4244 inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
4245 stmt_info, 0, vect_body);
4248 /* Determine cost of epilogue code.
4250 We have a reduction operator that will reduce the vector in one statement.
4251 Also requires scalar extract. */
4253 if (!loop || !nested_in_vect_loop_p (loop, orig_stmt_info))
4255 if (reduc_fn != IFN_LAST)
4257 if (reduction_type == COND_REDUCTION)
4259 /* An EQ stmt and an COND_EXPR stmt. */
4260 epilogue_cost += record_stmt_cost (cost_vec, 2,
4261 vector_stmt, stmt_info, 0,
4262 vect_epilogue);
4263 /* Reduction of the max index and a reduction of the found
4264 values. */
4265 epilogue_cost += record_stmt_cost (cost_vec, 2,
4266 vec_to_scalar, stmt_info, 0,
4267 vect_epilogue);
4268 /* A broadcast of the max value. */
4269 epilogue_cost += record_stmt_cost (cost_vec, 1,
4270 scalar_to_vec, stmt_info, 0,
4271 vect_epilogue);
4273 else
4275 epilogue_cost += record_stmt_cost (cost_vec, 1, vector_stmt,
4276 stmt_info, 0, vect_epilogue);
4277 epilogue_cost += record_stmt_cost (cost_vec, 1,
4278 vec_to_scalar, stmt_info, 0,
4279 vect_epilogue);
4282 else if (reduction_type == COND_REDUCTION)
4284 unsigned estimated_nunits = vect_nunits_for_cost (vectype);
4285 /* Extraction of scalar elements. */
4286 epilogue_cost += record_stmt_cost (cost_vec,
4287 2 * estimated_nunits,
4288 vec_to_scalar, stmt_info, 0,
4289 vect_epilogue);
4290 /* Scalar max reductions via COND_EXPR / MAX_EXPR. */
4291 epilogue_cost += record_stmt_cost (cost_vec,
4292 2 * estimated_nunits - 3,
4293 scalar_stmt, stmt_info, 0,
4294 vect_epilogue);
4296 else if (reduction_type == EXTRACT_LAST_REDUCTION
4297 || reduction_type == FOLD_LEFT_REDUCTION)
4298 /* No extra instructions need in the epilogue. */
4300 else
4302 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
4303 tree bitsize =
4304 TYPE_SIZE (TREE_TYPE (gimple_assign_lhs (orig_stmt_info->stmt)));
4305 int element_bitsize = tree_to_uhwi (bitsize);
4306 int nelements = vec_size_in_bits / element_bitsize;
4308 if (code == COND_EXPR)
4309 code = MAX_EXPR;
4311 optab = optab_for_tree_code (code, vectype, optab_default);
4313 /* We have a whole vector shift available. */
4314 if (optab != unknown_optab
4315 && VECTOR_MODE_P (mode)
4316 && optab_handler (optab, mode) != CODE_FOR_nothing
4317 && have_whole_vector_shift (mode))
4319 /* Final reduction via vector shifts and the reduction operator.
4320 Also requires scalar extract. */
4321 epilogue_cost += record_stmt_cost (cost_vec,
4322 exact_log2 (nelements) * 2,
4323 vector_stmt, stmt_info, 0,
4324 vect_epilogue);
4325 epilogue_cost += record_stmt_cost (cost_vec, 1,
4326 vec_to_scalar, stmt_info, 0,
4327 vect_epilogue);
4329 else
4330 /* Use extracts and reduction op for final reduction. For N
4331 elements, we have N extracts and N-1 reduction ops. */
4332 epilogue_cost += record_stmt_cost (cost_vec,
4333 nelements + nelements - 1,
4334 vector_stmt, stmt_info, 0,
4335 vect_epilogue);
4339 if (dump_enabled_p ())
4340 dump_printf (MSG_NOTE,
4341 "vect_model_reduction_cost: inside_cost = %d, "
4342 "prologue_cost = %d, epilogue_cost = %d .\n", inside_cost,
4343 prologue_cost, epilogue_cost);
4347 /* Function vect_model_induction_cost.
4349 Models cost for induction operations. */
4351 static void
4352 vect_model_induction_cost (stmt_vec_info stmt_info, int ncopies,
4353 stmt_vector_for_cost *cost_vec)
4355 unsigned inside_cost, prologue_cost;
4357 if (PURE_SLP_STMT (stmt_info))
4358 return;
4360 /* loop cost for vec_loop. */
4361 inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
4362 stmt_info, 0, vect_body);
4364 /* prologue cost for vec_init and vec_step. */
4365 prologue_cost = record_stmt_cost (cost_vec, 2, scalar_to_vec,
4366 stmt_info, 0, vect_prologue);
4368 if (dump_enabled_p ())
4369 dump_printf_loc (MSG_NOTE, vect_location,
4370 "vect_model_induction_cost: inside_cost = %d, "
4371 "prologue_cost = %d .\n", inside_cost, prologue_cost);
4376 /* Function get_initial_def_for_reduction
4378 Input:
4379 STMT_VINFO - a stmt that performs a reduction operation in the loop.
4380 INIT_VAL - the initial value of the reduction variable
4382 Output:
4383 ADJUSTMENT_DEF - a tree that holds a value to be added to the final result
4384 of the reduction (used for adjusting the epilog - see below).
4385 Return a vector variable, initialized according to the operation that
4386 STMT_VINFO performs. This vector will be used as the initial value
4387 of the vector of partial results.
4389 Option1 (adjust in epilog): Initialize the vector as follows:
4390 add/bit or/xor: [0,0,...,0,0]
4391 mult/bit and: [1,1,...,1,1]
4392 min/max/cond_expr: [init_val,init_val,..,init_val,init_val]
4393 and when necessary (e.g. add/mult case) let the caller know
4394 that it needs to adjust the result by init_val.
4396 Option2: Initialize the vector as follows:
4397 add/bit or/xor: [init_val,0,0,...,0]
4398 mult/bit and: [init_val,1,1,...,1]
4399 min/max/cond_expr: [init_val,init_val,...,init_val]
4400 and no adjustments are needed.
4402 For example, for the following code:
4404 s = init_val;
4405 for (i=0;i<n;i++)
4406 s = s + a[i];
4408 STMT_VINFO is 's = s + a[i]', and the reduction variable is 's'.
4409 For a vector of 4 units, we want to return either [0,0,0,init_val],
4410 or [0,0,0,0] and let the caller know that it needs to adjust
4411 the result at the end by 'init_val'.
4413 FORNOW, we are using the 'adjust in epilog' scheme, because this way the
4414 initialization vector is simpler (same element in all entries), if
4415 ADJUSTMENT_DEF is not NULL, and Option2 otherwise.
4417 A cost model should help decide between these two schemes. */
4419 static tree
4420 get_initial_def_for_reduction (loop_vec_info loop_vinfo,
4421 stmt_vec_info stmt_vinfo,
4422 enum tree_code code, tree init_val,
4423 tree *adjustment_def)
4425 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4426 tree scalar_type = TREE_TYPE (init_val);
4427 tree vectype = get_vectype_for_scalar_type (loop_vinfo, scalar_type);
4428 tree def_for_init;
4429 tree init_def;
4430 REAL_VALUE_TYPE real_init_val = dconst0;
4431 int int_init_val = 0;
4432 gimple_seq stmts = NULL;
4434 gcc_assert (vectype);
4436 gcc_assert (POINTER_TYPE_P (scalar_type) || INTEGRAL_TYPE_P (scalar_type)
4437 || SCALAR_FLOAT_TYPE_P (scalar_type));
4439 gcc_assert (nested_in_vect_loop_p (loop, stmt_vinfo)
4440 || loop == (gimple_bb (stmt_vinfo->stmt))->loop_father);
4442 /* ADJUSTMENT_DEF is NULL when called from
4443 vect_create_epilog_for_reduction to vectorize double reduction. */
4444 if (adjustment_def)
4445 *adjustment_def = NULL;
4447 switch (code)
4449 case WIDEN_SUM_EXPR:
4450 case DOT_PROD_EXPR:
4451 case SAD_EXPR:
4452 case PLUS_EXPR:
4453 case MINUS_EXPR:
4454 case BIT_IOR_EXPR:
4455 case BIT_XOR_EXPR:
4456 case MULT_EXPR:
4457 case BIT_AND_EXPR:
4459 if (code == MULT_EXPR)
4461 real_init_val = dconst1;
4462 int_init_val = 1;
4465 if (code == BIT_AND_EXPR)
4466 int_init_val = -1;
4468 if (SCALAR_FLOAT_TYPE_P (scalar_type))
4469 def_for_init = build_real (scalar_type, real_init_val);
4470 else
4471 def_for_init = build_int_cst (scalar_type, int_init_val);
4473 if (adjustment_def || operand_equal_p (def_for_init, init_val, 0))
4475 /* Option1: the first element is '0' or '1' as well. */
4476 if (!operand_equal_p (def_for_init, init_val, 0))
4477 *adjustment_def = init_val;
4478 init_def = gimple_build_vector_from_val (&stmts, vectype,
4479 def_for_init);
4481 else if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant ())
4483 /* Option2 (variable length): the first element is INIT_VAL. */
4484 init_def = gimple_build_vector_from_val (&stmts, vectype,
4485 def_for_init);
4486 init_def = gimple_build (&stmts, CFN_VEC_SHL_INSERT,
4487 vectype, init_def, init_val);
4489 else
4491 /* Option2: the first element is INIT_VAL. */
4492 tree_vector_builder elts (vectype, 1, 2);
4493 elts.quick_push (init_val);
4494 elts.quick_push (def_for_init);
4495 init_def = gimple_build_vector (&stmts, &elts);
4498 break;
4500 case MIN_EXPR:
4501 case MAX_EXPR:
4502 case COND_EXPR:
4504 init_val = gimple_convert (&stmts, TREE_TYPE (vectype), init_val);
4505 init_def = gimple_build_vector_from_val (&stmts, vectype, init_val);
4507 break;
4509 default:
4510 gcc_unreachable ();
4513 if (stmts)
4514 gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), stmts);
4515 return init_def;
4518 /* Get at the initial defs for the reduction PHIs in SLP_NODE.
4519 NUMBER_OF_VECTORS is the number of vector defs to create.
4520 If NEUTRAL_OP is nonnull, introducing extra elements of that
4521 value will not change the result. */
4523 static void
4524 get_initial_defs_for_reduction (vec_info *vinfo,
4525 slp_tree slp_node,
4526 vec<tree> *vec_oprnds,
4527 unsigned int number_of_vectors,
4528 bool reduc_chain, tree neutral_op)
4530 vec<stmt_vec_info> stmts = SLP_TREE_SCALAR_STMTS (slp_node);
4531 stmt_vec_info stmt_vinfo = stmts[0];
4532 unsigned HOST_WIDE_INT nunits;
4533 unsigned j, number_of_places_left_in_vector;
4534 tree vector_type;
4535 unsigned int group_size = stmts.length ();
4536 unsigned int i;
4537 class loop *loop;
4539 vector_type = STMT_VINFO_VECTYPE (stmt_vinfo);
4541 gcc_assert (STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_reduction_def);
4543 loop = (gimple_bb (stmt_vinfo->stmt))->loop_father;
4544 gcc_assert (loop);
4545 edge pe = loop_preheader_edge (loop);
4547 gcc_assert (!reduc_chain || neutral_op);
4549 /* NUMBER_OF_COPIES is the number of times we need to use the same values in
4550 created vectors. It is greater than 1 if unrolling is performed.
4552 For example, we have two scalar operands, s1 and s2 (e.g., group of
4553 strided accesses of size two), while NUNITS is four (i.e., four scalars
4554 of this type can be packed in a vector). The output vector will contain
4555 two copies of each scalar operand: {s1, s2, s1, s2}. (NUMBER_OF_COPIES
4556 will be 2).
4558 If REDUC_GROUP_SIZE > NUNITS, the scalars will be split into several
4559 vectors containing the operands.
4561 For example, NUNITS is four as before, and the group size is 8
4562 (s1, s2, ..., s8). We will create two vectors {s1, s2, s3, s4} and
4563 {s5, s6, s7, s8}. */
4565 if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits))
4566 nunits = group_size;
4568 number_of_places_left_in_vector = nunits;
4569 bool constant_p = true;
4570 tree_vector_builder elts (vector_type, nunits, 1);
4571 elts.quick_grow (nunits);
4572 gimple_seq ctor_seq = NULL;
4573 for (j = 0; j < nunits * number_of_vectors; ++j)
4575 tree op;
4576 i = j % group_size;
4577 stmt_vinfo = stmts[i];
4579 /* Get the def before the loop. In reduction chain we have only
4580 one initial value. Else we have as many as PHIs in the group. */
4581 if (reduc_chain)
4582 op = j != 0 ? neutral_op : PHI_ARG_DEF_FROM_EDGE (stmt_vinfo->stmt, pe);
4583 else if (((vec_oprnds->length () + 1) * nunits
4584 - number_of_places_left_in_vector >= group_size)
4585 && neutral_op)
4586 op = neutral_op;
4587 else
4588 op = PHI_ARG_DEF_FROM_EDGE (stmt_vinfo->stmt, pe);
4590 /* Create 'vect_ = {op0,op1,...,opn}'. */
4591 number_of_places_left_in_vector--;
4592 elts[nunits - number_of_places_left_in_vector - 1] = op;
4593 if (!CONSTANT_CLASS_P (op))
4594 constant_p = false;
4596 if (number_of_places_left_in_vector == 0)
4598 tree init;
4599 if (constant_p && !neutral_op
4600 ? multiple_p (TYPE_VECTOR_SUBPARTS (vector_type), nunits)
4601 : known_eq (TYPE_VECTOR_SUBPARTS (vector_type), nunits))
4602 /* Build the vector directly from ELTS. */
4603 init = gimple_build_vector (&ctor_seq, &elts);
4604 else if (neutral_op)
4606 /* Build a vector of the neutral value and shift the
4607 other elements into place. */
4608 init = gimple_build_vector_from_val (&ctor_seq, vector_type,
4609 neutral_op);
4610 int k = nunits;
4611 while (k > 0 && elts[k - 1] == neutral_op)
4612 k -= 1;
4613 while (k > 0)
4615 k -= 1;
4616 init = gimple_build (&ctor_seq, CFN_VEC_SHL_INSERT,
4617 vector_type, init, elts[k]);
4620 else
4622 /* First time round, duplicate ELTS to fill the
4623 required number of vectors. */
4624 duplicate_and_interleave (vinfo, &ctor_seq, vector_type, elts,
4625 number_of_vectors, *vec_oprnds);
4626 break;
4628 vec_oprnds->quick_push (init);
4630 number_of_places_left_in_vector = nunits;
4631 elts.new_vector (vector_type, nunits, 1);
4632 elts.quick_grow (nunits);
4633 constant_p = true;
4636 if (ctor_seq != NULL)
4637 gsi_insert_seq_on_edge_immediate (pe, ctor_seq);
4640 /* For a statement STMT_INFO taking part in a reduction operation return
4641 the stmt_vec_info the meta information is stored on. */
4643 stmt_vec_info
4644 info_for_reduction (vec_info *vinfo, stmt_vec_info stmt_info)
4646 stmt_info = vect_orig_stmt (stmt_info);
4647 gcc_assert (STMT_VINFO_REDUC_DEF (stmt_info));
4648 if (!is_a <gphi *> (stmt_info->stmt))
4649 stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
4650 gphi *phi = as_a <gphi *> (stmt_info->stmt);
4651 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
4653 if (gimple_phi_num_args (phi) == 1)
4654 stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
4656 else if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
4658 edge pe = loop_preheader_edge (gimple_bb (phi)->loop_father);
4659 stmt_vec_info info
4660 = vinfo->lookup_def (PHI_ARG_DEF_FROM_EDGE (phi, pe));
4661 if (info && STMT_VINFO_DEF_TYPE (info) == vect_double_reduction_def)
4662 stmt_info = info;
4664 return stmt_info;
4667 /* Function vect_create_epilog_for_reduction
4669 Create code at the loop-epilog to finalize the result of a reduction
4670 computation.
4672 STMT_INFO is the scalar reduction stmt that is being vectorized.
4673 SLP_NODE is an SLP node containing a group of reduction statements. The
4674 first one in this group is STMT_INFO.
4675 SLP_NODE_INSTANCE is the SLP node instance containing SLP_NODE
4676 REDUC_INDEX says which rhs operand of the STMT_INFO is the reduction phi
4677 (counting from 0)
4679 This function:
4680 1. Completes the reduction def-use cycles.
4681 2. "Reduces" each vector of partial results VECT_DEFS into a single result,
4682 by calling the function specified by REDUC_FN if available, or by
4683 other means (whole-vector shifts or a scalar loop).
4684 The function also creates a new phi node at the loop exit to preserve
4685 loop-closed form, as illustrated below.
4687 The flow at the entry to this function:
4689 loop:
4690 vec_def = phi <vec_init, null> # REDUCTION_PHI
4691 VECT_DEF = vector_stmt # vectorized form of STMT_INFO
4692 s_loop = scalar_stmt # (scalar) STMT_INFO
4693 loop_exit:
4694 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
4695 use <s_out0>
4696 use <s_out0>
4698 The above is transformed by this function into:
4700 loop:
4701 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
4702 VECT_DEF = vector_stmt # vectorized form of STMT_INFO
4703 s_loop = scalar_stmt # (scalar) STMT_INFO
4704 loop_exit:
4705 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
4706 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
4707 v_out2 = reduce <v_out1>
4708 s_out3 = extract_field <v_out2, 0>
4709 s_out4 = adjust_result <s_out3>
4710 use <s_out4>
4711 use <s_out4>
4714 static void
4715 vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
4716 stmt_vec_info stmt_info,
4717 slp_tree slp_node,
4718 slp_instance slp_node_instance)
4720 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
4721 gcc_assert (reduc_info->is_reduc_info);
4722 /* For double reductions we need to get at the inner loop reduction
4723 stmt which has the meta info attached. Our stmt_info is that of the
4724 loop-closed PHI of the inner loop which we remember as
4725 def for the reduction PHI generation. */
4726 bool double_reduc = false;
4727 stmt_vec_info rdef_info = stmt_info;
4728 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
4730 gcc_assert (!slp_node);
4731 double_reduc = true;
4732 stmt_info = loop_vinfo->lookup_def (gimple_phi_arg_def
4733 (stmt_info->stmt, 0));
4734 stmt_info = vect_stmt_to_vectorize (stmt_info);
4736 gphi *reduc_def_stmt
4737 = as_a <gphi *> (STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info))->stmt);
4738 enum tree_code code = STMT_VINFO_REDUC_CODE (reduc_info);
4739 internal_fn reduc_fn = STMT_VINFO_REDUC_FN (reduc_info);
4740 tree vectype;
4741 machine_mode mode;
4742 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *outer_loop = NULL;
4743 basic_block exit_bb;
4744 tree scalar_dest;
4745 tree scalar_type;
4746 gimple *new_phi = NULL, *phi;
4747 gimple_stmt_iterator exit_gsi;
4748 tree new_temp = NULL_TREE, new_name, new_scalar_dest;
4749 gimple *epilog_stmt = NULL;
4750 gimple *exit_phi;
4751 tree bitsize;
4752 tree def;
4753 tree orig_name, scalar_result;
4754 imm_use_iterator imm_iter, phi_imm_iter;
4755 use_operand_p use_p, phi_use_p;
4756 gimple *use_stmt;
4757 bool nested_in_vect_loop = false;
4758 auto_vec<gimple *> new_phis;
4759 int j, i;
4760 auto_vec<tree> scalar_results;
4761 unsigned int group_size = 1, k;
4762 auto_vec<gimple *> phis;
4763 bool slp_reduc = false;
4764 bool direct_slp_reduc;
4765 tree new_phi_result;
4766 tree induction_index = NULL_TREE;
4768 if (slp_node)
4769 group_size = SLP_TREE_LANES (slp_node);
4771 if (nested_in_vect_loop_p (loop, stmt_info))
4773 outer_loop = loop;
4774 loop = loop->inner;
4775 nested_in_vect_loop = true;
4776 gcc_assert (!slp_node);
4778 gcc_assert (!nested_in_vect_loop || double_reduc);
4780 vectype = STMT_VINFO_REDUC_VECTYPE (reduc_info);
4781 gcc_assert (vectype);
4782 mode = TYPE_MODE (vectype);
4784 tree initial_def = NULL;
4785 tree induc_val = NULL_TREE;
4786 tree adjustment_def = NULL;
4787 if (slp_node)
4789 else
4791 /* Get at the scalar def before the loop, that defines the initial value
4792 of the reduction variable. */
4793 initial_def = PHI_ARG_DEF_FROM_EDGE (reduc_def_stmt,
4794 loop_preheader_edge (loop));
4795 /* Optimize: for induction condition reduction, if we can't use zero
4796 for induc_val, use initial_def. */
4797 if (STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
4798 induc_val = STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info);
4799 else if (double_reduc)
4801 else if (nested_in_vect_loop)
4803 else
4804 adjustment_def = STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info);
4807 unsigned vec_num;
4808 int ncopies;
4809 if (slp_node)
4811 vec_num = SLP_TREE_VEC_STMTS (slp_node_instance->reduc_phis).length ();
4812 ncopies = 1;
4814 else
4816 stmt_vec_info reduc_info = loop_vinfo->lookup_stmt (reduc_def_stmt);
4817 vec_num = 1;
4818 ncopies = STMT_VINFO_VEC_STMTS (reduc_info).length ();
4821 /* For cond reductions we want to create a new vector (INDEX_COND_EXPR)
4822 which is updated with the current index of the loop for every match of
4823 the original loop's cond_expr (VEC_STMT). This results in a vector
4824 containing the last time the condition passed for that vector lane.
4825 The first match will be a 1 to allow 0 to be used for non-matching
4826 indexes. If there are no matches at all then the vector will be all
4827 zeroes.
4829 PR92772: This algorithm is broken for architectures that support
4830 masked vectors, but do not provide fold_extract_last. */
4831 if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION)
4833 auto_vec<std::pair<tree, bool>, 2> ccompares;
4834 stmt_vec_info cond_info = STMT_VINFO_REDUC_DEF (reduc_info);
4835 cond_info = vect_stmt_to_vectorize (cond_info);
4836 while (cond_info != reduc_info)
4838 if (gimple_assign_rhs_code (cond_info->stmt) == COND_EXPR)
4840 gimple *vec_stmt = STMT_VINFO_VEC_STMTS (cond_info)[0];
4841 gcc_assert (gimple_assign_rhs_code (vec_stmt) == VEC_COND_EXPR);
4842 ccompares.safe_push
4843 (std::make_pair (unshare_expr (gimple_assign_rhs1 (vec_stmt)),
4844 STMT_VINFO_REDUC_IDX (cond_info) == 2));
4846 cond_info
4847 = loop_vinfo->lookup_def (gimple_op (cond_info->stmt,
4848 1 + STMT_VINFO_REDUC_IDX
4849 (cond_info)));
4850 cond_info = vect_stmt_to_vectorize (cond_info);
4852 gcc_assert (ccompares.length () != 0);
4854 tree indx_before_incr, indx_after_incr;
4855 poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype);
4856 int scalar_precision
4857 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (TREE_TYPE (vectype)));
4858 tree cr_index_scalar_type = make_unsigned_type (scalar_precision);
4859 tree cr_index_vector_type = get_related_vectype_for_scalar_type
4860 (TYPE_MODE (vectype), cr_index_scalar_type,
4861 TYPE_VECTOR_SUBPARTS (vectype));
4863 /* First we create a simple vector induction variable which starts
4864 with the values {1,2,3,...} (SERIES_VECT) and increments by the
4865 vector size (STEP). */
4867 /* Create a {1,2,3,...} vector. */
4868 tree series_vect = build_index_vector (cr_index_vector_type, 1, 1);
4870 /* Create a vector of the step value. */
4871 tree step = build_int_cst (cr_index_scalar_type, nunits_out);
4872 tree vec_step = build_vector_from_val (cr_index_vector_type, step);
4874 /* Create an induction variable. */
4875 gimple_stmt_iterator incr_gsi;
4876 bool insert_after;
4877 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
4878 create_iv (series_vect, vec_step, NULL_TREE, loop, &incr_gsi,
4879 insert_after, &indx_before_incr, &indx_after_incr);
4881 /* Next create a new phi node vector (NEW_PHI_TREE) which starts
4882 filled with zeros (VEC_ZERO). */
4884 /* Create a vector of 0s. */
4885 tree zero = build_zero_cst (cr_index_scalar_type);
4886 tree vec_zero = build_vector_from_val (cr_index_vector_type, zero);
4888 /* Create a vector phi node. */
4889 tree new_phi_tree = make_ssa_name (cr_index_vector_type);
4890 new_phi = create_phi_node (new_phi_tree, loop->header);
4891 add_phi_arg (as_a <gphi *> (new_phi), vec_zero,
4892 loop_preheader_edge (loop), UNKNOWN_LOCATION);
4894 /* Now take the condition from the loops original cond_exprs
4895 and produce a new cond_exprs (INDEX_COND_EXPR) which for
4896 every match uses values from the induction variable
4897 (INDEX_BEFORE_INCR) otherwise uses values from the phi node
4898 (NEW_PHI_TREE).
4899 Finally, we update the phi (NEW_PHI_TREE) to take the value of
4900 the new cond_expr (INDEX_COND_EXPR). */
4901 gimple_seq stmts = NULL;
4902 for (int i = ccompares.length () - 1; i != -1; --i)
4904 tree ccompare = ccompares[i].first;
4905 if (ccompares[i].second)
4906 new_phi_tree = gimple_build (&stmts, VEC_COND_EXPR,
4907 cr_index_vector_type,
4908 ccompare,
4909 indx_before_incr, new_phi_tree);
4910 else
4911 new_phi_tree = gimple_build (&stmts, VEC_COND_EXPR,
4912 cr_index_vector_type,
4913 ccompare,
4914 new_phi_tree, indx_before_incr);
4916 gsi_insert_seq_before (&incr_gsi, stmts, GSI_SAME_STMT);
4918 /* Update the phi with the vec cond. */
4919 induction_index = new_phi_tree;
4920 add_phi_arg (as_a <gphi *> (new_phi), induction_index,
4921 loop_latch_edge (loop), UNKNOWN_LOCATION);
4924 /* 2. Create epilog code.
4925 The reduction epilog code operates across the elements of the vector
4926 of partial results computed by the vectorized loop.
4927 The reduction epilog code consists of:
4929 step 1: compute the scalar result in a vector (v_out2)
4930 step 2: extract the scalar result (s_out3) from the vector (v_out2)
4931 step 3: adjust the scalar result (s_out3) if needed.
4933 Step 1 can be accomplished using one the following three schemes:
4934 (scheme 1) using reduc_fn, if available.
4935 (scheme 2) using whole-vector shifts, if available.
4936 (scheme 3) using a scalar loop. In this case steps 1+2 above are
4937 combined.
4939 The overall epilog code looks like this:
4941 s_out0 = phi <s_loop> # original EXIT_PHI
4942 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
4943 v_out2 = reduce <v_out1> # step 1
4944 s_out3 = extract_field <v_out2, 0> # step 2
4945 s_out4 = adjust_result <s_out3> # step 3
4947 (step 3 is optional, and steps 1 and 2 may be combined).
4948 Lastly, the uses of s_out0 are replaced by s_out4. */
4951 /* 2.1 Create new loop-exit-phis to preserve loop-closed form:
4952 v_out1 = phi <VECT_DEF>
4953 Store them in NEW_PHIS. */
4954 if (double_reduc)
4955 loop = outer_loop;
4956 exit_bb = single_exit (loop)->dest;
4957 new_phis.create (slp_node ? vec_num : ncopies);
4958 for (unsigned i = 0; i < vec_num; i++)
4960 if (slp_node)
4961 def = vect_get_slp_vect_def (slp_node, i);
4962 else
4963 def = gimple_get_lhs (STMT_VINFO_VEC_STMTS (rdef_info)[0]);
4964 for (j = 0; j < ncopies; j++)
4966 tree new_def = copy_ssa_name (def);
4967 phi = create_phi_node (new_def, exit_bb);
4968 if (j == 0)
4969 new_phis.quick_push (phi);
4970 else
4972 def = gimple_get_lhs (STMT_VINFO_VEC_STMTS (rdef_info)[j]);
4973 new_phis.quick_push (phi);
4976 SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, def);
4980 exit_gsi = gsi_after_labels (exit_bb);
4982 /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
4983 (i.e. when reduc_fn is not available) and in the final adjustment
4984 code (if needed). Also get the original scalar reduction variable as
4985 defined in the loop. In case STMT is a "pattern-stmt" (i.e. - it
4986 represents a reduction pattern), the tree-code and scalar-def are
4987 taken from the original stmt that the pattern-stmt (STMT) replaces.
4988 Otherwise (it is a regular reduction) - the tree-code and scalar-def
4989 are taken from STMT. */
4991 stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
4992 if (orig_stmt_info != stmt_info)
4994 /* Reduction pattern */
4995 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
4996 gcc_assert (STMT_VINFO_RELATED_STMT (orig_stmt_info) == stmt_info);
4999 scalar_dest = gimple_assign_lhs (orig_stmt_info->stmt);
5000 scalar_type = TREE_TYPE (scalar_dest);
5001 scalar_results.create (group_size);
5002 new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
5003 bitsize = TYPE_SIZE (scalar_type);
5005 /* SLP reduction without reduction chain, e.g.,
5006 # a1 = phi <a2, a0>
5007 # b1 = phi <b2, b0>
5008 a2 = operation (a1)
5009 b2 = operation (b1) */
5010 slp_reduc = (slp_node && !REDUC_GROUP_FIRST_ELEMENT (stmt_info));
5012 /* True if we should implement SLP_REDUC using native reduction operations
5013 instead of scalar operations. */
5014 direct_slp_reduc = (reduc_fn != IFN_LAST
5015 && slp_reduc
5016 && !TYPE_VECTOR_SUBPARTS (vectype).is_constant ());
5018 /* In case of reduction chain, e.g.,
5019 # a1 = phi <a3, a0>
5020 a2 = operation (a1)
5021 a3 = operation (a2),
5023 we may end up with more than one vector result. Here we reduce them to
5024 one vector. */
5025 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info) || direct_slp_reduc)
5027 gimple_seq stmts = NULL;
5028 tree first_vect = PHI_RESULT (new_phis[0]);
5029 first_vect = gimple_convert (&stmts, vectype, first_vect);
5030 for (k = 1; k < new_phis.length (); k++)
5032 gimple *next_phi = new_phis[k];
5033 tree second_vect = PHI_RESULT (next_phi);
5034 second_vect = gimple_convert (&stmts, vectype, second_vect);
5035 first_vect = gimple_build (&stmts, code, vectype,
5036 first_vect, second_vect);
5038 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5040 new_phi_result = first_vect;
5041 new_phis.truncate (0);
5042 new_phis.safe_push (SSA_NAME_DEF_STMT (first_vect));
5044 /* Likewise if we couldn't use a single defuse cycle. */
5045 else if (ncopies > 1)
5047 gimple_seq stmts = NULL;
5048 tree first_vect = PHI_RESULT (new_phis[0]);
5049 first_vect = gimple_convert (&stmts, vectype, first_vect);
5050 for (int k = 1; k < ncopies; ++k)
5052 tree second_vect = PHI_RESULT (new_phis[k]);
5053 second_vect = gimple_convert (&stmts, vectype, second_vect);
5054 first_vect = gimple_build (&stmts, code, vectype,
5055 first_vect, second_vect);
5057 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5058 new_phi_result = first_vect;
5059 new_phis.truncate (0);
5060 new_phis.safe_push (SSA_NAME_DEF_STMT (first_vect));
5062 else
5063 new_phi_result = PHI_RESULT (new_phis[0]);
5065 if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION
5066 && reduc_fn != IFN_LAST)
5068 /* For condition reductions, we have a vector (NEW_PHI_RESULT) containing
5069 various data values where the condition matched and another vector
5070 (INDUCTION_INDEX) containing all the indexes of those matches. We
5071 need to extract the last matching index (which will be the index with
5072 highest value) and use this to index into the data vector.
5073 For the case where there were no matches, the data vector will contain
5074 all default values and the index vector will be all zeros. */
5076 /* Get various versions of the type of the vector of indexes. */
5077 tree index_vec_type = TREE_TYPE (induction_index);
5078 gcc_checking_assert (TYPE_UNSIGNED (index_vec_type));
5079 tree index_scalar_type = TREE_TYPE (index_vec_type);
5080 tree index_vec_cmp_type = truth_type_for (index_vec_type);
5082 /* Get an unsigned integer version of the type of the data vector. */
5083 int scalar_precision
5084 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
5085 tree scalar_type_unsigned = make_unsigned_type (scalar_precision);
5086 tree vectype_unsigned = build_vector_type
5087 (scalar_type_unsigned, TYPE_VECTOR_SUBPARTS (vectype));
5089 /* First we need to create a vector (ZERO_VEC) of zeros and another
5090 vector (MAX_INDEX_VEC) filled with the last matching index, which we
5091 can create using a MAX reduction and then expanding.
5092 In the case where the loop never made any matches, the max index will
5093 be zero. */
5095 /* Vector of {0, 0, 0,...}. */
5096 tree zero_vec = build_zero_cst (vectype);
5098 gimple_seq stmts = NULL;
5099 new_phi_result = gimple_convert (&stmts, vectype, new_phi_result);
5100 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5102 /* Find maximum value from the vector of found indexes. */
5103 tree max_index = make_ssa_name (index_scalar_type);
5104 gcall *max_index_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
5105 1, induction_index);
5106 gimple_call_set_lhs (max_index_stmt, max_index);
5107 gsi_insert_before (&exit_gsi, max_index_stmt, GSI_SAME_STMT);
5109 /* Vector of {max_index, max_index, max_index,...}. */
5110 tree max_index_vec = make_ssa_name (index_vec_type);
5111 tree max_index_vec_rhs = build_vector_from_val (index_vec_type,
5112 max_index);
5113 gimple *max_index_vec_stmt = gimple_build_assign (max_index_vec,
5114 max_index_vec_rhs);
5115 gsi_insert_before (&exit_gsi, max_index_vec_stmt, GSI_SAME_STMT);
5117 /* Next we compare the new vector (MAX_INDEX_VEC) full of max indexes
5118 with the vector (INDUCTION_INDEX) of found indexes, choosing values
5119 from the data vector (NEW_PHI_RESULT) for matches, 0 (ZERO_VEC)
5120 otherwise. Only one value should match, resulting in a vector
5121 (VEC_COND) with one data value and the rest zeros.
5122 In the case where the loop never made any matches, every index will
5123 match, resulting in a vector with all data values (which will all be
5124 the default value). */
5126 /* Compare the max index vector to the vector of found indexes to find
5127 the position of the max value. */
5128 tree vec_compare = make_ssa_name (index_vec_cmp_type);
5129 gimple *vec_compare_stmt = gimple_build_assign (vec_compare, EQ_EXPR,
5130 induction_index,
5131 max_index_vec);
5132 gsi_insert_before (&exit_gsi, vec_compare_stmt, GSI_SAME_STMT);
5134 /* Use the compare to choose either values from the data vector or
5135 zero. */
5136 tree vec_cond = make_ssa_name (vectype);
5137 gimple *vec_cond_stmt = gimple_build_assign (vec_cond, VEC_COND_EXPR,
5138 vec_compare, new_phi_result,
5139 zero_vec);
5140 gsi_insert_before (&exit_gsi, vec_cond_stmt, GSI_SAME_STMT);
5142 /* Finally we need to extract the data value from the vector (VEC_COND)
5143 into a scalar (MATCHED_DATA_REDUC). Logically we want to do a OR
5144 reduction, but because this doesn't exist, we can use a MAX reduction
5145 instead. The data value might be signed or a float so we need to cast
5146 it first.
5147 In the case where the loop never made any matches, the data values are
5148 all identical, and so will reduce down correctly. */
5150 /* Make the matched data values unsigned. */
5151 tree vec_cond_cast = make_ssa_name (vectype_unsigned);
5152 tree vec_cond_cast_rhs = build1 (VIEW_CONVERT_EXPR, vectype_unsigned,
5153 vec_cond);
5154 gimple *vec_cond_cast_stmt = gimple_build_assign (vec_cond_cast,
5155 VIEW_CONVERT_EXPR,
5156 vec_cond_cast_rhs);
5157 gsi_insert_before (&exit_gsi, vec_cond_cast_stmt, GSI_SAME_STMT);
5159 /* Reduce down to a scalar value. */
5160 tree data_reduc = make_ssa_name (scalar_type_unsigned);
5161 gcall *data_reduc_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
5162 1, vec_cond_cast);
5163 gimple_call_set_lhs (data_reduc_stmt, data_reduc);
5164 gsi_insert_before (&exit_gsi, data_reduc_stmt, GSI_SAME_STMT);
5166 /* Convert the reduced value back to the result type and set as the
5167 result. */
5168 stmts = NULL;
5169 new_temp = gimple_build (&stmts, VIEW_CONVERT_EXPR, scalar_type,
5170 data_reduc);
5171 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5172 scalar_results.safe_push (new_temp);
5174 else if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION
5175 && reduc_fn == IFN_LAST)
5177 /* Condition reduction without supported IFN_REDUC_MAX. Generate
5178 idx = 0;
5179 idx_val = induction_index[0];
5180 val = data_reduc[0];
5181 for (idx = 0, val = init, i = 0; i < nelts; ++i)
5182 if (induction_index[i] > idx_val)
5183 val = data_reduc[i], idx_val = induction_index[i];
5184 return val; */
5186 tree data_eltype = TREE_TYPE (TREE_TYPE (new_phi_result));
5187 tree idx_eltype = TREE_TYPE (TREE_TYPE (induction_index));
5188 unsigned HOST_WIDE_INT el_size = tree_to_uhwi (TYPE_SIZE (idx_eltype));
5189 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (TREE_TYPE (induction_index));
5190 /* Enforced by vectorizable_reduction, which ensures we have target
5191 support before allowing a conditional reduction on variable-length
5192 vectors. */
5193 unsigned HOST_WIDE_INT v_size = el_size * nunits.to_constant ();
5194 tree idx_val = NULL_TREE, val = NULL_TREE;
5195 for (unsigned HOST_WIDE_INT off = 0; off < v_size; off += el_size)
5197 tree old_idx_val = idx_val;
5198 tree old_val = val;
5199 idx_val = make_ssa_name (idx_eltype);
5200 epilog_stmt = gimple_build_assign (idx_val, BIT_FIELD_REF,
5201 build3 (BIT_FIELD_REF, idx_eltype,
5202 induction_index,
5203 bitsize_int (el_size),
5204 bitsize_int (off)));
5205 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5206 val = make_ssa_name (data_eltype);
5207 epilog_stmt = gimple_build_assign (val, BIT_FIELD_REF,
5208 build3 (BIT_FIELD_REF,
5209 data_eltype,
5210 new_phi_result,
5211 bitsize_int (el_size),
5212 bitsize_int (off)));
5213 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5214 if (off != 0)
5216 tree new_idx_val = idx_val;
5217 if (off != v_size - el_size)
5219 new_idx_val = make_ssa_name (idx_eltype);
5220 epilog_stmt = gimple_build_assign (new_idx_val,
5221 MAX_EXPR, idx_val,
5222 old_idx_val);
5223 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5225 tree new_val = make_ssa_name (data_eltype);
5226 epilog_stmt = gimple_build_assign (new_val,
5227 COND_EXPR,
5228 build2 (GT_EXPR,
5229 boolean_type_node,
5230 idx_val,
5231 old_idx_val),
5232 val, old_val);
5233 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5234 idx_val = new_idx_val;
5235 val = new_val;
5238 /* Convert the reduced value back to the result type and set as the
5239 result. */
5240 gimple_seq stmts = NULL;
5241 val = gimple_convert (&stmts, scalar_type, val);
5242 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5243 scalar_results.safe_push (val);
5246 /* 2.3 Create the reduction code, using one of the three schemes described
5247 above. In SLP we simply need to extract all the elements from the
5248 vector (without reducing them), so we use scalar shifts. */
5249 else if (reduc_fn != IFN_LAST && !slp_reduc)
5251 tree tmp;
5252 tree vec_elem_type;
5254 /* Case 1: Create:
5255 v_out2 = reduc_expr <v_out1> */
5257 if (dump_enabled_p ())
5258 dump_printf_loc (MSG_NOTE, vect_location,
5259 "Reduce using direct vector reduction.\n");
5261 gimple_seq stmts = NULL;
5262 new_phi_result = gimple_convert (&stmts, vectype, new_phi_result);
5263 vec_elem_type = TREE_TYPE (TREE_TYPE (new_phi_result));
5264 new_temp = gimple_build (&stmts, as_combined_fn (reduc_fn),
5265 vec_elem_type, new_phi_result);
5266 new_temp = gimple_convert (&stmts, scalar_type, new_temp);
5267 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5269 if ((STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
5270 && induc_val)
5272 /* Earlier we set the initial value to be a vector if induc_val
5273 values. Check the result and if it is induc_val then replace
5274 with the original initial value, unless induc_val is
5275 the same as initial_def already. */
5276 tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp,
5277 induc_val);
5279 tmp = make_ssa_name (new_scalar_dest);
5280 epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
5281 initial_def, new_temp);
5282 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5283 new_temp = tmp;
5286 scalar_results.safe_push (new_temp);
5288 else if (direct_slp_reduc)
5290 /* Here we create one vector for each of the REDUC_GROUP_SIZE results,
5291 with the elements for other SLP statements replaced with the
5292 neutral value. We can then do a normal reduction on each vector. */
5294 /* Enforced by vectorizable_reduction. */
5295 gcc_assert (new_phis.length () == 1);
5296 gcc_assert (pow2p_hwi (group_size));
5298 slp_tree orig_phis_slp_node = slp_node_instance->reduc_phis;
5299 vec<stmt_vec_info> orig_phis
5300 = SLP_TREE_SCALAR_STMTS (orig_phis_slp_node);
5301 gimple_seq seq = NULL;
5303 /* Build a vector {0, 1, 2, ...}, with the same number of elements
5304 and the same element size as VECTYPE. */
5305 tree index = build_index_vector (vectype, 0, 1);
5306 tree index_type = TREE_TYPE (index);
5307 tree index_elt_type = TREE_TYPE (index_type);
5308 tree mask_type = truth_type_for (index_type);
5310 /* Create a vector that, for each element, identifies which of
5311 the REDUC_GROUP_SIZE results should use it. */
5312 tree index_mask = build_int_cst (index_elt_type, group_size - 1);
5313 index = gimple_build (&seq, BIT_AND_EXPR, index_type, index,
5314 build_vector_from_val (index_type, index_mask));
5316 /* Get a neutral vector value. This is simply a splat of the neutral
5317 scalar value if we have one, otherwise the initial scalar value
5318 is itself a neutral value. */
5319 tree vector_identity = NULL_TREE;
5320 tree neutral_op = NULL_TREE;
5321 if (slp_node)
5323 stmt_vec_info first = REDUC_GROUP_FIRST_ELEMENT (stmt_info);
5324 neutral_op
5325 = neutral_op_for_slp_reduction (slp_node_instance->reduc_phis,
5326 vectype, code, first != NULL);
5328 if (neutral_op)
5329 vector_identity = gimple_build_vector_from_val (&seq, vectype,
5330 neutral_op);
5331 for (unsigned int i = 0; i < group_size; ++i)
5333 /* If there's no univeral neutral value, we can use the
5334 initial scalar value from the original PHI. This is used
5335 for MIN and MAX reduction, for example. */
5336 if (!neutral_op)
5338 tree scalar_value
5339 = PHI_ARG_DEF_FROM_EDGE (orig_phis[i]->stmt,
5340 loop_preheader_edge (loop));
5341 scalar_value = gimple_convert (&seq, TREE_TYPE (vectype),
5342 scalar_value);
5343 vector_identity = gimple_build_vector_from_val (&seq, vectype,
5344 scalar_value);
5347 /* Calculate the equivalent of:
5349 sel[j] = (index[j] == i);
5351 which selects the elements of NEW_PHI_RESULT that should
5352 be included in the result. */
5353 tree compare_val = build_int_cst (index_elt_type, i);
5354 compare_val = build_vector_from_val (index_type, compare_val);
5355 tree sel = gimple_build (&seq, EQ_EXPR, mask_type,
5356 index, compare_val);
5358 /* Calculate the equivalent of:
5360 vec = seq ? new_phi_result : vector_identity;
5362 VEC is now suitable for a full vector reduction. */
5363 tree vec = gimple_build (&seq, VEC_COND_EXPR, vectype,
5364 sel, new_phi_result, vector_identity);
5366 /* Do the reduction and convert it to the appropriate type. */
5367 tree scalar = gimple_build (&seq, as_combined_fn (reduc_fn),
5368 TREE_TYPE (vectype), vec);
5369 scalar = gimple_convert (&seq, scalar_type, scalar);
5370 scalar_results.safe_push (scalar);
5372 gsi_insert_seq_before (&exit_gsi, seq, GSI_SAME_STMT);
5374 else
5376 bool reduce_with_shift;
5377 tree vec_temp;
5379 gcc_assert (slp_reduc || new_phis.length () == 1);
5381 /* See if the target wants to do the final (shift) reduction
5382 in a vector mode of smaller size and first reduce upper/lower
5383 halves against each other. */
5384 enum machine_mode mode1 = mode;
5385 tree stype = TREE_TYPE (vectype);
5386 unsigned nunits = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
5387 unsigned nunits1 = nunits;
5388 if ((mode1 = targetm.vectorize.split_reduction (mode)) != mode
5389 && new_phis.length () == 1)
5391 nunits1 = GET_MODE_NUNITS (mode1).to_constant ();
5392 /* For SLP reductions we have to make sure lanes match up, but
5393 since we're doing individual element final reduction reducing
5394 vector width here is even more important.
5395 ??? We can also separate lanes with permutes, for the common
5396 case of power-of-two group-size odd/even extracts would work. */
5397 if (slp_reduc && nunits != nunits1)
5399 nunits1 = least_common_multiple (nunits1, group_size);
5400 gcc_assert (exact_log2 (nunits1) != -1 && nunits1 <= nunits);
5403 if (!slp_reduc
5404 && (mode1 = targetm.vectorize.split_reduction (mode)) != mode)
5405 nunits1 = GET_MODE_NUNITS (mode1).to_constant ();
5407 tree vectype1 = get_related_vectype_for_scalar_type (TYPE_MODE (vectype),
5408 stype, nunits1);
5409 reduce_with_shift = have_whole_vector_shift (mode1);
5410 if (!VECTOR_MODE_P (mode1))
5411 reduce_with_shift = false;
5412 else
5414 optab optab = optab_for_tree_code (code, vectype1, optab_default);
5415 if (optab_handler (optab, mode1) == CODE_FOR_nothing)
5416 reduce_with_shift = false;
5419 /* First reduce the vector to the desired vector size we should
5420 do shift reduction on by combining upper and lower halves. */
5421 new_temp = new_phi_result;
5422 while (nunits > nunits1)
5424 nunits /= 2;
5425 vectype1 = get_related_vectype_for_scalar_type (TYPE_MODE (vectype),
5426 stype, nunits);
5427 unsigned int bitsize = tree_to_uhwi (TYPE_SIZE (vectype1));
5429 /* The target has to make sure we support lowpart/highpart
5430 extraction, either via direct vector extract or through
5431 an integer mode punning. */
5432 tree dst1, dst2;
5433 if (convert_optab_handler (vec_extract_optab,
5434 TYPE_MODE (TREE_TYPE (new_temp)),
5435 TYPE_MODE (vectype1))
5436 != CODE_FOR_nothing)
5438 /* Extract sub-vectors directly once vec_extract becomes
5439 a conversion optab. */
5440 dst1 = make_ssa_name (vectype1);
5441 epilog_stmt
5442 = gimple_build_assign (dst1, BIT_FIELD_REF,
5443 build3 (BIT_FIELD_REF, vectype1,
5444 new_temp, TYPE_SIZE (vectype1),
5445 bitsize_int (0)));
5446 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5447 dst2 = make_ssa_name (vectype1);
5448 epilog_stmt
5449 = gimple_build_assign (dst2, BIT_FIELD_REF,
5450 build3 (BIT_FIELD_REF, vectype1,
5451 new_temp, TYPE_SIZE (vectype1),
5452 bitsize_int (bitsize)));
5453 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5455 else
5457 /* Extract via punning to appropriately sized integer mode
5458 vector. */
5459 tree eltype = build_nonstandard_integer_type (bitsize, 1);
5460 tree etype = build_vector_type (eltype, 2);
5461 gcc_assert (convert_optab_handler (vec_extract_optab,
5462 TYPE_MODE (etype),
5463 TYPE_MODE (eltype))
5464 != CODE_FOR_nothing);
5465 tree tem = make_ssa_name (etype);
5466 epilog_stmt = gimple_build_assign (tem, VIEW_CONVERT_EXPR,
5467 build1 (VIEW_CONVERT_EXPR,
5468 etype, new_temp));
5469 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5470 new_temp = tem;
5471 tem = make_ssa_name (eltype);
5472 epilog_stmt
5473 = gimple_build_assign (tem, BIT_FIELD_REF,
5474 build3 (BIT_FIELD_REF, eltype,
5475 new_temp, TYPE_SIZE (eltype),
5476 bitsize_int (0)));
5477 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5478 dst1 = make_ssa_name (vectype1);
5479 epilog_stmt = gimple_build_assign (dst1, VIEW_CONVERT_EXPR,
5480 build1 (VIEW_CONVERT_EXPR,
5481 vectype1, tem));
5482 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5483 tem = make_ssa_name (eltype);
5484 epilog_stmt
5485 = gimple_build_assign (tem, BIT_FIELD_REF,
5486 build3 (BIT_FIELD_REF, eltype,
5487 new_temp, TYPE_SIZE (eltype),
5488 bitsize_int (bitsize)));
5489 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5490 dst2 = make_ssa_name (vectype1);
5491 epilog_stmt = gimple_build_assign (dst2, VIEW_CONVERT_EXPR,
5492 build1 (VIEW_CONVERT_EXPR,
5493 vectype1, tem));
5494 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5497 new_temp = make_ssa_name (vectype1);
5498 epilog_stmt = gimple_build_assign (new_temp, code, dst1, dst2);
5499 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5500 new_phis[0] = epilog_stmt;
5503 if (reduce_with_shift && !slp_reduc)
5505 int element_bitsize = tree_to_uhwi (bitsize);
5506 /* Enforced by vectorizable_reduction, which disallows SLP reductions
5507 for variable-length vectors and also requires direct target support
5508 for loop reductions. */
5509 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
5510 int nelements = vec_size_in_bits / element_bitsize;
5511 vec_perm_builder sel;
5512 vec_perm_indices indices;
5514 int elt_offset;
5516 tree zero_vec = build_zero_cst (vectype1);
5517 /* Case 2: Create:
5518 for (offset = nelements/2; offset >= 1; offset/=2)
5520 Create: va' = vec_shift <va, offset>
5521 Create: va = vop <va, va'>
5522 } */
5524 tree rhs;
5526 if (dump_enabled_p ())
5527 dump_printf_loc (MSG_NOTE, vect_location,
5528 "Reduce using vector shifts\n");
5530 gimple_seq stmts = NULL;
5531 new_temp = gimple_convert (&stmts, vectype1, new_temp);
5532 for (elt_offset = nelements / 2;
5533 elt_offset >= 1;
5534 elt_offset /= 2)
5536 calc_vec_perm_mask_for_shift (elt_offset, nelements, &sel);
5537 indices.new_vector (sel, 2, nelements);
5538 tree mask = vect_gen_perm_mask_any (vectype1, indices);
5539 new_name = gimple_build (&stmts, VEC_PERM_EXPR, vectype1,
5540 new_temp, zero_vec, mask);
5541 new_temp = gimple_build (&stmts, code,
5542 vectype1, new_name, new_temp);
5544 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5546 /* 2.4 Extract the final scalar result. Create:
5547 s_out3 = extract_field <v_out2, bitpos> */
5549 if (dump_enabled_p ())
5550 dump_printf_loc (MSG_NOTE, vect_location,
5551 "extract scalar result\n");
5553 rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp,
5554 bitsize, bitsize_zero_node);
5555 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
5556 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5557 gimple_assign_set_lhs (epilog_stmt, new_temp);
5558 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5559 scalar_results.safe_push (new_temp);
5561 else
5563 /* Case 3: Create:
5564 s = extract_field <v_out2, 0>
5565 for (offset = element_size;
5566 offset < vector_size;
5567 offset += element_size;)
5569 Create: s' = extract_field <v_out2, offset>
5570 Create: s = op <s, s'> // For non SLP cases
5571 } */
5573 if (dump_enabled_p ())
5574 dump_printf_loc (MSG_NOTE, vect_location,
5575 "Reduce using scalar code.\n");
5577 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
5578 int element_bitsize = tree_to_uhwi (bitsize);
5579 tree compute_type = TREE_TYPE (vectype);
5580 gimple_seq stmts = NULL;
5581 FOR_EACH_VEC_ELT (new_phis, i, new_phi)
5583 int bit_offset;
5584 if (gimple_code (new_phi) == GIMPLE_PHI)
5585 vec_temp = PHI_RESULT (new_phi);
5586 else
5587 vec_temp = gimple_assign_lhs (new_phi);
5588 new_temp = gimple_build (&stmts, BIT_FIELD_REF, compute_type,
5589 vec_temp, bitsize, bitsize_zero_node);
5591 /* In SLP we don't need to apply reduction operation, so we just
5592 collect s' values in SCALAR_RESULTS. */
5593 if (slp_reduc)
5594 scalar_results.safe_push (new_temp);
5596 for (bit_offset = element_bitsize;
5597 bit_offset < vec_size_in_bits;
5598 bit_offset += element_bitsize)
5600 tree bitpos = bitsize_int (bit_offset);
5601 new_name = gimple_build (&stmts, BIT_FIELD_REF,
5602 compute_type, vec_temp,
5603 bitsize, bitpos);
5604 if (slp_reduc)
5606 /* In SLP we don't need to apply reduction operation, so
5607 we just collect s' values in SCALAR_RESULTS. */
5608 new_temp = new_name;
5609 scalar_results.safe_push (new_name);
5611 else
5612 new_temp = gimple_build (&stmts, code, compute_type,
5613 new_name, new_temp);
5617 /* The only case where we need to reduce scalar results in SLP, is
5618 unrolling. If the size of SCALAR_RESULTS is greater than
5619 REDUC_GROUP_SIZE, we reduce them combining elements modulo
5620 REDUC_GROUP_SIZE. */
5621 if (slp_reduc)
5623 tree res, first_res, new_res;
5625 /* Reduce multiple scalar results in case of SLP unrolling. */
5626 for (j = group_size; scalar_results.iterate (j, &res);
5627 j++)
5629 first_res = scalar_results[j % group_size];
5630 new_res = gimple_build (&stmts, code, compute_type,
5631 first_res, res);
5632 scalar_results[j % group_size] = new_res;
5634 for (k = 0; k < group_size; k++)
5635 scalar_results[k] = gimple_convert (&stmts, scalar_type,
5636 scalar_results[k]);
5638 else
5640 /* Not SLP - we have one scalar to keep in SCALAR_RESULTS. */
5641 new_temp = gimple_convert (&stmts, scalar_type, new_temp);
5642 scalar_results.safe_push (new_temp);
5645 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5648 if ((STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
5649 && induc_val)
5651 /* Earlier we set the initial value to be a vector if induc_val
5652 values. Check the result and if it is induc_val then replace
5653 with the original initial value, unless induc_val is
5654 the same as initial_def already. */
5655 tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp,
5656 induc_val);
5658 tree tmp = make_ssa_name (new_scalar_dest);
5659 epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
5660 initial_def, new_temp);
5661 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5662 scalar_results[0] = tmp;
5666 /* 2.5 Adjust the final result by the initial value of the reduction
5667 variable. (When such adjustment is not needed, then
5668 'adjustment_def' is zero). For example, if code is PLUS we create:
5669 new_temp = loop_exit_def + adjustment_def */
5671 if (adjustment_def)
5673 gcc_assert (!slp_reduc);
5674 gimple_seq stmts = NULL;
5675 if (nested_in_vect_loop)
5677 new_phi = new_phis[0];
5678 gcc_assert (VECTOR_TYPE_P (TREE_TYPE (adjustment_def)));
5679 adjustment_def = gimple_convert (&stmts, vectype, adjustment_def);
5680 new_temp = gimple_build (&stmts, code, vectype,
5681 PHI_RESULT (new_phi), adjustment_def);
5683 else
5685 new_temp = scalar_results[0];
5686 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
5687 adjustment_def = gimple_convert (&stmts, scalar_type, adjustment_def);
5688 new_temp = gimple_build (&stmts, code, scalar_type,
5689 new_temp, adjustment_def);
5692 epilog_stmt = gimple_seq_last_stmt (stmts);
5693 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5694 if (nested_in_vect_loop)
5696 if (!double_reduc)
5697 scalar_results.quick_push (new_temp);
5698 else
5699 scalar_results[0] = new_temp;
5701 else
5702 scalar_results[0] = new_temp;
5704 new_phis[0] = epilog_stmt;
5707 if (double_reduc)
5708 loop = loop->inner;
5710 /* 2.6 Handle the loop-exit phis. Replace the uses of scalar loop-exit
5711 phis with new adjusted scalar results, i.e., replace use <s_out0>
5712 with use <s_out4>.
5714 Transform:
5715 loop_exit:
5716 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
5717 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
5718 v_out2 = reduce <v_out1>
5719 s_out3 = extract_field <v_out2, 0>
5720 s_out4 = adjust_result <s_out3>
5721 use <s_out0>
5722 use <s_out0>
5724 into:
5726 loop_exit:
5727 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
5728 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
5729 v_out2 = reduce <v_out1>
5730 s_out3 = extract_field <v_out2, 0>
5731 s_out4 = adjust_result <s_out3>
5732 use <s_out4>
5733 use <s_out4> */
5736 /* In SLP reduction chain we reduce vector results into one vector if
5737 necessary, hence we set here REDUC_GROUP_SIZE to 1. SCALAR_DEST is the
5738 LHS of the last stmt in the reduction chain, since we are looking for
5739 the loop exit phi node. */
5740 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
5742 stmt_vec_info dest_stmt_info
5743 = vect_orig_stmt (SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1]);
5744 scalar_dest = gimple_assign_lhs (dest_stmt_info->stmt);
5745 group_size = 1;
5748 /* In SLP we may have several statements in NEW_PHIS and REDUCTION_PHIS (in
5749 case that REDUC_GROUP_SIZE is greater than vectorization factor).
5750 Therefore, we need to match SCALAR_RESULTS with corresponding statements.
5751 The first (REDUC_GROUP_SIZE / number of new vector stmts) scalar results
5752 correspond to the first vector stmt, etc.
5753 (RATIO is equal to (REDUC_GROUP_SIZE / number of new vector stmts)). */
5754 if (group_size > new_phis.length ())
5755 gcc_assert (!(group_size % new_phis.length ()));
5757 for (k = 0; k < group_size; k++)
5759 if (slp_reduc)
5761 stmt_vec_info scalar_stmt_info = SLP_TREE_SCALAR_STMTS (slp_node)[k];
5763 orig_stmt_info = STMT_VINFO_RELATED_STMT (scalar_stmt_info);
5764 /* SLP statements can't participate in patterns. */
5765 gcc_assert (!orig_stmt_info);
5766 scalar_dest = gimple_assign_lhs (scalar_stmt_info->stmt);
5769 if (nested_in_vect_loop)
5771 if (double_reduc)
5772 loop = outer_loop;
5773 else
5774 gcc_unreachable ();
5777 phis.create (3);
5778 /* Find the loop-closed-use at the loop exit of the original scalar
5779 result. (The reduction result is expected to have two immediate uses,
5780 one at the latch block, and one at the loop exit). For double
5781 reductions we are looking for exit phis of the outer loop. */
5782 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
5784 if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
5786 if (!is_gimple_debug (USE_STMT (use_p)))
5787 phis.safe_push (USE_STMT (use_p));
5789 else
5791 if (double_reduc && gimple_code (USE_STMT (use_p)) == GIMPLE_PHI)
5793 tree phi_res = PHI_RESULT (USE_STMT (use_p));
5795 FOR_EACH_IMM_USE_FAST (phi_use_p, phi_imm_iter, phi_res)
5797 if (!flow_bb_inside_loop_p (loop,
5798 gimple_bb (USE_STMT (phi_use_p)))
5799 && !is_gimple_debug (USE_STMT (phi_use_p)))
5800 phis.safe_push (USE_STMT (phi_use_p));
5806 FOR_EACH_VEC_ELT (phis, i, exit_phi)
5808 /* Replace the uses: */
5809 orig_name = PHI_RESULT (exit_phi);
5810 scalar_result = scalar_results[k];
5811 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
5813 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
5814 SET_USE (use_p, scalar_result);
5815 update_stmt (use_stmt);
5819 phis.release ();
5823 /* Return a vector of type VECTYPE that is equal to the vector select
5824 operation "MASK ? VEC : IDENTITY". Insert the select statements
5825 before GSI. */
5827 static tree
5828 merge_with_identity (gimple_stmt_iterator *gsi, tree mask, tree vectype,
5829 tree vec, tree identity)
5831 tree cond = make_temp_ssa_name (vectype, NULL, "cond");
5832 gimple *new_stmt = gimple_build_assign (cond, VEC_COND_EXPR,
5833 mask, vec, identity);
5834 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5835 return cond;
5838 /* Successively apply CODE to each element of VECTOR_RHS, in left-to-right
5839 order, starting with LHS. Insert the extraction statements before GSI and
5840 associate the new scalar SSA names with variable SCALAR_DEST.
5841 Return the SSA name for the result. */
5843 static tree
5844 vect_expand_fold_left (gimple_stmt_iterator *gsi, tree scalar_dest,
5845 tree_code code, tree lhs, tree vector_rhs)
5847 tree vectype = TREE_TYPE (vector_rhs);
5848 tree scalar_type = TREE_TYPE (vectype);
5849 tree bitsize = TYPE_SIZE (scalar_type);
5850 unsigned HOST_WIDE_INT vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
5851 unsigned HOST_WIDE_INT element_bitsize = tree_to_uhwi (bitsize);
5853 for (unsigned HOST_WIDE_INT bit_offset = 0;
5854 bit_offset < vec_size_in_bits;
5855 bit_offset += element_bitsize)
5857 tree bitpos = bitsize_int (bit_offset);
5858 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vector_rhs,
5859 bitsize, bitpos);
5861 gassign *stmt = gimple_build_assign (scalar_dest, rhs);
5862 rhs = make_ssa_name (scalar_dest, stmt);
5863 gimple_assign_set_lhs (stmt, rhs);
5864 gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
5866 stmt = gimple_build_assign (scalar_dest, code, lhs, rhs);
5867 tree new_name = make_ssa_name (scalar_dest, stmt);
5868 gimple_assign_set_lhs (stmt, new_name);
5869 gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
5870 lhs = new_name;
5872 return lhs;
5875 /* Get a masked internal function equivalent to REDUC_FN. VECTYPE_IN is the
5876 type of the vector input. */
5878 static internal_fn
5879 get_masked_reduction_fn (internal_fn reduc_fn, tree vectype_in)
5881 internal_fn mask_reduc_fn;
5883 switch (reduc_fn)
5885 case IFN_FOLD_LEFT_PLUS:
5886 mask_reduc_fn = IFN_MASK_FOLD_LEFT_PLUS;
5887 break;
5889 default:
5890 return IFN_LAST;
5893 if (direct_internal_fn_supported_p (mask_reduc_fn, vectype_in,
5894 OPTIMIZE_FOR_SPEED))
5895 return mask_reduc_fn;
5896 return IFN_LAST;
5899 /* Perform an in-order reduction (FOLD_LEFT_REDUCTION). STMT_INFO is the
5900 statement that sets the live-out value. REDUC_DEF_STMT is the phi
5901 statement. CODE is the operation performed by STMT_INFO and OPS are
5902 its scalar operands. REDUC_INDEX is the index of the operand in
5903 OPS that is set by REDUC_DEF_STMT. REDUC_FN is the function that
5904 implements in-order reduction, or IFN_LAST if we should open-code it.
5905 VECTYPE_IN is the type of the vector input. MASKS specifies the masks
5906 that should be used to control the operation in a fully-masked loop. */
5908 static bool
5909 vectorize_fold_left_reduction (loop_vec_info loop_vinfo,
5910 stmt_vec_info stmt_info,
5911 gimple_stmt_iterator *gsi,
5912 gimple **vec_stmt, slp_tree slp_node,
5913 gimple *reduc_def_stmt,
5914 tree_code code, internal_fn reduc_fn,
5915 tree ops[3], tree vectype_in,
5916 int reduc_index, vec_loop_masks *masks)
5918 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5919 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
5920 internal_fn mask_reduc_fn = get_masked_reduction_fn (reduc_fn, vectype_in);
5922 int ncopies;
5923 if (slp_node)
5924 ncopies = 1;
5925 else
5926 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
5928 gcc_assert (!nested_in_vect_loop_p (loop, stmt_info));
5929 gcc_assert (ncopies == 1);
5930 gcc_assert (TREE_CODE_LENGTH (code) == binary_op);
5932 if (slp_node)
5933 gcc_assert (known_eq (TYPE_VECTOR_SUBPARTS (vectype_out),
5934 TYPE_VECTOR_SUBPARTS (vectype_in)));
5936 tree op0 = ops[1 - reduc_index];
5938 int group_size = 1;
5939 stmt_vec_info scalar_dest_def_info;
5940 auto_vec<tree> vec_oprnds0;
5941 if (slp_node)
5943 auto_vec<vec<tree> > vec_defs (2);
5944 vect_get_slp_defs (loop_vinfo, slp_node, &vec_defs);
5945 vec_oprnds0.safe_splice (vec_defs[1 - reduc_index]);
5946 vec_defs[0].release ();
5947 vec_defs[1].release ();
5948 group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
5949 scalar_dest_def_info = SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1];
5951 else
5953 vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
5954 op0, &vec_oprnds0);
5955 scalar_dest_def_info = stmt_info;
5958 tree scalar_dest = gimple_assign_lhs (scalar_dest_def_info->stmt);
5959 tree scalar_type = TREE_TYPE (scalar_dest);
5960 tree reduc_var = gimple_phi_result (reduc_def_stmt);
5962 int vec_num = vec_oprnds0.length ();
5963 gcc_assert (vec_num == 1 || slp_node);
5964 tree vec_elem_type = TREE_TYPE (vectype_out);
5965 gcc_checking_assert (useless_type_conversion_p (scalar_type, vec_elem_type));
5967 tree vector_identity = NULL_TREE;
5968 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
5969 vector_identity = build_zero_cst (vectype_out);
5971 tree scalar_dest_var = vect_create_destination_var (scalar_dest, NULL);
5972 int i;
5973 tree def0;
5974 FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
5976 gimple *new_stmt;
5977 tree mask = NULL_TREE;
5978 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
5979 mask = vect_get_loop_mask (gsi, masks, vec_num, vectype_in, i);
5981 /* Handle MINUS by adding the negative. */
5982 if (reduc_fn != IFN_LAST && code == MINUS_EXPR)
5984 tree negated = make_ssa_name (vectype_out);
5985 new_stmt = gimple_build_assign (negated, NEGATE_EXPR, def0);
5986 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5987 def0 = negated;
5990 if (mask && mask_reduc_fn == IFN_LAST)
5991 def0 = merge_with_identity (gsi, mask, vectype_out, def0,
5992 vector_identity);
5994 /* On the first iteration the input is simply the scalar phi
5995 result, and for subsequent iterations it is the output of
5996 the preceding operation. */
5997 if (reduc_fn != IFN_LAST || (mask && mask_reduc_fn != IFN_LAST))
5999 if (mask && mask_reduc_fn != IFN_LAST)
6000 new_stmt = gimple_build_call_internal (mask_reduc_fn, 3, reduc_var,
6001 def0, mask);
6002 else
6003 new_stmt = gimple_build_call_internal (reduc_fn, 2, reduc_var,
6004 def0);
6005 /* For chained SLP reductions the output of the previous reduction
6006 operation serves as the input of the next. For the final statement
6007 the output cannot be a temporary - we reuse the original
6008 scalar destination of the last statement. */
6009 if (i != vec_num - 1)
6011 gimple_set_lhs (new_stmt, scalar_dest_var);
6012 reduc_var = make_ssa_name (scalar_dest_var, new_stmt);
6013 gimple_set_lhs (new_stmt, reduc_var);
6016 else
6018 reduc_var = vect_expand_fold_left (gsi, scalar_dest_var, code,
6019 reduc_var, def0);
6020 new_stmt = SSA_NAME_DEF_STMT (reduc_var);
6021 /* Remove the statement, so that we can use the same code paths
6022 as for statements that we've just created. */
6023 gimple_stmt_iterator tmp_gsi = gsi_for_stmt (new_stmt);
6024 gsi_remove (&tmp_gsi, true);
6027 if (i == vec_num - 1)
6029 gimple_set_lhs (new_stmt, scalar_dest);
6030 vect_finish_replace_stmt (loop_vinfo,
6031 scalar_dest_def_info,
6032 new_stmt);
6034 else
6035 vect_finish_stmt_generation (loop_vinfo,
6036 scalar_dest_def_info,
6037 new_stmt, gsi);
6039 if (slp_node)
6040 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
6041 else
6043 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
6044 *vec_stmt = new_stmt;
6048 return true;
6051 /* Function is_nonwrapping_integer_induction.
6053 Check if STMT_VINO (which is part of loop LOOP) both increments and
6054 does not cause overflow. */
6056 static bool
6057 is_nonwrapping_integer_induction (stmt_vec_info stmt_vinfo, class loop *loop)
6059 gphi *phi = as_a <gphi *> (stmt_vinfo->stmt);
6060 tree base = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo);
6061 tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo);
6062 tree lhs_type = TREE_TYPE (gimple_phi_result (phi));
6063 widest_int ni, max_loop_value, lhs_max;
6064 wi::overflow_type overflow = wi::OVF_NONE;
6066 /* Make sure the loop is integer based. */
6067 if (TREE_CODE (base) != INTEGER_CST
6068 || TREE_CODE (step) != INTEGER_CST)
6069 return false;
6071 /* Check that the max size of the loop will not wrap. */
6073 if (TYPE_OVERFLOW_UNDEFINED (lhs_type))
6074 return true;
6076 if (! max_stmt_executions (loop, &ni))
6077 return false;
6079 max_loop_value = wi::mul (wi::to_widest (step), ni, TYPE_SIGN (lhs_type),
6080 &overflow);
6081 if (overflow)
6082 return false;
6084 max_loop_value = wi::add (wi::to_widest (base), max_loop_value,
6085 TYPE_SIGN (lhs_type), &overflow);
6086 if (overflow)
6087 return false;
6089 return (wi::min_precision (max_loop_value, TYPE_SIGN (lhs_type))
6090 <= TYPE_PRECISION (lhs_type));
6093 /* Check if masking can be supported by inserting a conditional expression.
6094 CODE is the code for the operation. COND_FN is the conditional internal
6095 function, if it exists. VECTYPE_IN is the type of the vector input. */
6096 static bool
6097 use_mask_by_cond_expr_p (enum tree_code code, internal_fn cond_fn,
6098 tree vectype_in)
6100 if (cond_fn != IFN_LAST
6101 && direct_internal_fn_supported_p (cond_fn, vectype_in,
6102 OPTIMIZE_FOR_SPEED))
6103 return false;
6105 switch (code)
6107 case DOT_PROD_EXPR:
6108 case SAD_EXPR:
6109 return true;
6111 default:
6112 return false;
6116 /* Insert a conditional expression to enable masked vectorization. CODE is the
6117 code for the operation. VOP is the array of operands. MASK is the loop
6118 mask. GSI is a statement iterator used to place the new conditional
6119 expression. */
6120 static void
6121 build_vect_cond_expr (enum tree_code code, tree vop[3], tree mask,
6122 gimple_stmt_iterator *gsi)
6124 switch (code)
6126 case DOT_PROD_EXPR:
6128 tree vectype = TREE_TYPE (vop[1]);
6129 tree zero = build_zero_cst (vectype);
6130 tree masked_op1 = make_temp_ssa_name (vectype, NULL, "masked_op1");
6131 gassign *select = gimple_build_assign (masked_op1, VEC_COND_EXPR,
6132 mask, vop[1], zero);
6133 gsi_insert_before (gsi, select, GSI_SAME_STMT);
6134 vop[1] = masked_op1;
6135 break;
6138 case SAD_EXPR:
6140 tree vectype = TREE_TYPE (vop[1]);
6141 tree masked_op1 = make_temp_ssa_name (vectype, NULL, "masked_op1");
6142 gassign *select = gimple_build_assign (masked_op1, VEC_COND_EXPR,
6143 mask, vop[1], vop[0]);
6144 gsi_insert_before (gsi, select, GSI_SAME_STMT);
6145 vop[1] = masked_op1;
6146 break;
6149 default:
6150 gcc_unreachable ();
6154 /* Function vectorizable_reduction.
6156 Check if STMT_INFO performs a reduction operation that can be vectorized.
6157 If VEC_STMT is also passed, vectorize STMT_INFO: create a vectorized
6158 stmt to replace it, put it in VEC_STMT, and insert it at GSI.
6159 Return true if STMT_INFO is vectorizable in this way.
6161 This function also handles reduction idioms (patterns) that have been
6162 recognized in advance during vect_pattern_recog. In this case, STMT_INFO
6163 may be of this form:
6164 X = pattern_expr (arg0, arg1, ..., X)
6165 and its STMT_VINFO_RELATED_STMT points to the last stmt in the original
6166 sequence that had been detected and replaced by the pattern-stmt
6167 (STMT_INFO).
6169 This function also handles reduction of condition expressions, for example:
6170 for (int i = 0; i < N; i++)
6171 if (a[i] < value)
6172 last = a[i];
6173 This is handled by vectorising the loop and creating an additional vector
6174 containing the loop indexes for which "a[i] < value" was true. In the
6175 function epilogue this is reduced to a single max value and then used to
6176 index into the vector of results.
6178 In some cases of reduction patterns, the type of the reduction variable X is
6179 different than the type of the other arguments of STMT_INFO.
6180 In such cases, the vectype that is used when transforming STMT_INFO into
6181 a vector stmt is different than the vectype that is used to determine the
6182 vectorization factor, because it consists of a different number of elements
6183 than the actual number of elements that are being operated upon in parallel.
6185 For example, consider an accumulation of shorts into an int accumulator.
6186 On some targets it's possible to vectorize this pattern operating on 8
6187 shorts at a time (hence, the vectype for purposes of determining the
6188 vectorization factor should be V8HI); on the other hand, the vectype that
6189 is used to create the vector form is actually V4SI (the type of the result).
6191 Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
6192 indicates what is the actual level of parallelism (V8HI in the example), so
6193 that the right vectorization factor would be derived. This vectype
6194 corresponds to the type of arguments to the reduction stmt, and should *NOT*
6195 be used to create the vectorized stmt. The right vectype for the vectorized
6196 stmt is obtained from the type of the result X:
6197 get_vectype_for_scalar_type (vinfo, TREE_TYPE (X))
6199 This means that, contrary to "regular" reductions (or "regular" stmts in
6200 general), the following equation:
6201 STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (vinfo, TREE_TYPE (X))
6202 does *NOT* necessarily hold for reduction patterns. */
6204 bool
6205 vectorizable_reduction (loop_vec_info loop_vinfo,
6206 stmt_vec_info stmt_info, slp_tree slp_node,
6207 slp_instance slp_node_instance,
6208 stmt_vector_for_cost *cost_vec)
6210 tree scalar_dest;
6211 tree vectype_in = NULL_TREE;
6212 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6213 enum vect_def_type cond_reduc_dt = vect_unknown_def_type;
6214 stmt_vec_info cond_stmt_vinfo = NULL;
6215 tree scalar_type;
6216 int i;
6217 int ncopies;
6218 bool single_defuse_cycle = false;
6219 bool nested_cycle = false;
6220 bool double_reduc = false;
6221 int vec_num;
6222 tree tem;
6223 tree cr_index_scalar_type = NULL_TREE, cr_index_vector_type = NULL_TREE;
6224 tree cond_reduc_val = NULL_TREE;
6226 /* Make sure it was already recognized as a reduction computation. */
6227 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def
6228 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def
6229 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_nested_cycle)
6230 return false;
6232 /* The stmt we store reduction analysis meta on. */
6233 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
6234 reduc_info->is_reduc_info = true;
6236 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
6238 if (is_a <gphi *> (stmt_info->stmt))
6239 /* Analysis for double-reduction is done on the outer
6240 loop PHI, nested cycles have no further restrictions. */
6241 STMT_VINFO_TYPE (stmt_info) = cycle_phi_info_type;
6242 else
6243 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
6244 return true;
6247 stmt_vec_info orig_stmt_of_analysis = stmt_info;
6248 stmt_vec_info phi_info = stmt_info;
6249 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
6250 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
6252 if (!is_a <gphi *> (stmt_info->stmt))
6254 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
6255 return true;
6257 if (slp_node)
6259 slp_node_instance->reduc_phis = slp_node;
6260 /* ??? We're leaving slp_node to point to the PHIs, we only
6261 need it to get at the number of vector stmts which wasn't
6262 yet initialized for the instance root. */
6264 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def)
6265 stmt_info = vect_stmt_to_vectorize (STMT_VINFO_REDUC_DEF (stmt_info));
6266 else /* STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def */
6268 use_operand_p use_p;
6269 gimple *use_stmt;
6270 bool res = single_imm_use (gimple_phi_result (stmt_info->stmt),
6271 &use_p, &use_stmt);
6272 gcc_assert (res);
6273 phi_info = loop_vinfo->lookup_stmt (use_stmt);
6274 stmt_info = vect_stmt_to_vectorize (STMT_VINFO_REDUC_DEF (phi_info));
6278 /* PHIs should not participate in patterns. */
6279 gcc_assert (!STMT_VINFO_RELATED_STMT (phi_info));
6280 gphi *reduc_def_phi = as_a <gphi *> (phi_info->stmt);
6282 /* Verify following REDUC_IDX from the latch def leads us back to the PHI
6283 and compute the reduction chain length. */
6284 tree reduc_def = PHI_ARG_DEF_FROM_EDGE (reduc_def_phi,
6285 loop_latch_edge (loop));
6286 unsigned reduc_chain_length = 0;
6287 bool only_slp_reduc_chain = true;
6288 stmt_info = NULL;
6289 while (reduc_def != PHI_RESULT (reduc_def_phi))
6291 stmt_vec_info def = loop_vinfo->lookup_def (reduc_def);
6292 stmt_vec_info vdef = vect_stmt_to_vectorize (def);
6293 if (STMT_VINFO_REDUC_IDX (vdef) == -1)
6295 if (dump_enabled_p ())
6296 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6297 "reduction chain broken by patterns.\n");
6298 return false;
6300 if (!REDUC_GROUP_FIRST_ELEMENT (vdef))
6301 only_slp_reduc_chain = false;
6302 /* ??? For epilogue generation live members of the chain need
6303 to point back to the PHI via their original stmt for
6304 info_for_reduction to work. */
6305 if (STMT_VINFO_LIVE_P (vdef))
6306 STMT_VINFO_REDUC_DEF (def) = phi_info;
6307 gassign *assign = dyn_cast <gassign *> (vdef->stmt);
6308 if (!assign)
6310 if (dump_enabled_p ())
6311 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6312 "reduction chain includes calls.\n");
6313 return false;
6315 if (CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (assign)))
6317 if (!tree_nop_conversion_p (TREE_TYPE (gimple_assign_lhs (assign)),
6318 TREE_TYPE (gimple_assign_rhs1 (assign))))
6320 if (dump_enabled_p ())
6321 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6322 "conversion in the reduction chain.\n");
6323 return false;
6326 else if (!stmt_info)
6327 /* First non-conversion stmt. */
6328 stmt_info = vdef;
6329 reduc_def = gimple_op (vdef->stmt, 1 + STMT_VINFO_REDUC_IDX (vdef));
6330 reduc_chain_length++;
6332 /* PHIs should not participate in patterns. */
6333 gcc_assert (!STMT_VINFO_RELATED_STMT (phi_info));
6335 if (nested_in_vect_loop_p (loop, stmt_info))
6337 loop = loop->inner;
6338 nested_cycle = true;
6341 /* STMT_VINFO_REDUC_DEF doesn't point to the first but the last
6342 element. */
6343 if (slp_node && REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6345 gcc_assert (!REDUC_GROUP_NEXT_ELEMENT (stmt_info));
6346 stmt_info = REDUC_GROUP_FIRST_ELEMENT (stmt_info);
6348 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6349 gcc_assert (slp_node
6350 && REDUC_GROUP_FIRST_ELEMENT (stmt_info) == stmt_info);
6352 /* 1. Is vectorizable reduction? */
6353 /* Not supportable if the reduction variable is used in the loop, unless
6354 it's a reduction chain. */
6355 if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer
6356 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6357 return false;
6359 /* Reductions that are not used even in an enclosing outer-loop,
6360 are expected to be "live" (used out of the loop). */
6361 if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope
6362 && !STMT_VINFO_LIVE_P (stmt_info))
6363 return false;
6365 /* 2. Has this been recognized as a reduction pattern?
6367 Check if STMT represents a pattern that has been recognized
6368 in earlier analysis stages. For stmts that represent a pattern,
6369 the STMT_VINFO_RELATED_STMT field records the last stmt in
6370 the original sequence that constitutes the pattern. */
6372 stmt_vec_info orig_stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
6373 if (orig_stmt_info)
6375 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
6376 gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
6379 /* 3. Check the operands of the operation. The first operands are defined
6380 inside the loop body. The last operand is the reduction variable,
6381 which is defined by the loop-header-phi. */
6383 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
6384 STMT_VINFO_REDUC_VECTYPE (reduc_info) = vectype_out;
6385 gassign *stmt = as_a <gassign *> (stmt_info->stmt);
6386 enum tree_code code = gimple_assign_rhs_code (stmt);
6387 bool lane_reduc_code_p
6388 = (code == DOT_PROD_EXPR || code == WIDEN_SUM_EXPR || code == SAD_EXPR);
6389 int op_type = TREE_CODE_LENGTH (code);
6391 scalar_dest = gimple_assign_lhs (stmt);
6392 scalar_type = TREE_TYPE (scalar_dest);
6393 if (!POINTER_TYPE_P (scalar_type) && !INTEGRAL_TYPE_P (scalar_type)
6394 && !SCALAR_FLOAT_TYPE_P (scalar_type))
6395 return false;
6397 /* Do not try to vectorize bit-precision reductions. */
6398 if (!type_has_mode_precision_p (scalar_type))
6399 return false;
6401 /* For lane-reducing ops we're reducing the number of reduction PHIs
6402 which means the only use of that may be in the lane-reducing operation. */
6403 if (lane_reduc_code_p
6404 && reduc_chain_length != 1
6405 && !only_slp_reduc_chain)
6407 if (dump_enabled_p ())
6408 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6409 "lane-reducing reduction with extra stmts.\n");
6410 return false;
6413 /* All uses but the last are expected to be defined in the loop.
6414 The last use is the reduction variable. In case of nested cycle this
6415 assumption is not true: we use reduc_index to record the index of the
6416 reduction variable. */
6417 /* ??? To get at invariant/constant uses on the SLP node we have to
6418 get to it here, slp_node is still the reduction PHI. */
6419 slp_tree slp_for_stmt_info = NULL;
6420 if (slp_node)
6422 slp_for_stmt_info = slp_node_instance->root;
6423 /* And then there's reduction chain with a conversion ... */
6424 if (SLP_TREE_REPRESENTATIVE (slp_for_stmt_info) != stmt_info)
6425 slp_for_stmt_info = SLP_TREE_CHILDREN (slp_for_stmt_info)[0];
6426 gcc_assert (SLP_TREE_REPRESENTATIVE (slp_for_stmt_info) == stmt_info);
6428 slp_tree *slp_op = XALLOCAVEC (slp_tree, op_type);
6429 /* We need to skip an extra operand for COND_EXPRs with embedded
6430 comparison. */
6431 unsigned opno_adjust = 0;
6432 if (code == COND_EXPR
6433 && COMPARISON_CLASS_P (gimple_assign_rhs1 (stmt)))
6434 opno_adjust = 1;
6435 for (i = 0; i < op_type; i++)
6437 /* The condition of COND_EXPR is checked in vectorizable_condition(). */
6438 if (i == 0 && code == COND_EXPR)
6439 continue;
6441 stmt_vec_info def_stmt_info;
6442 enum vect_def_type dt;
6443 tree op;
6444 if (!vect_is_simple_use (loop_vinfo, stmt_info, slp_for_stmt_info,
6445 i + opno_adjust, &op, &slp_op[i], &dt, &tem,
6446 &def_stmt_info))
6448 if (dump_enabled_p ())
6449 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6450 "use not simple.\n");
6451 return false;
6453 if (i == STMT_VINFO_REDUC_IDX (stmt_info))
6454 continue;
6456 /* There should be only one cycle def in the stmt, the one
6457 leading to reduc_def. */
6458 if (VECTORIZABLE_CYCLE_DEF (dt))
6459 return false;
6461 /* To properly compute ncopies we are interested in the widest
6462 non-reduction input type in case we're looking at a widening
6463 accumulation that we later handle in vect_transform_reduction. */
6464 if (lane_reduc_code_p
6465 && tem
6466 && (!vectype_in
6467 || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in)))
6468 < GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (tem))))))
6469 vectype_in = tem;
6471 if (code == COND_EXPR)
6473 /* Record how the non-reduction-def value of COND_EXPR is defined. */
6474 if (dt == vect_constant_def)
6476 cond_reduc_dt = dt;
6477 cond_reduc_val = op;
6479 if (dt == vect_induction_def
6480 && def_stmt_info
6481 && is_nonwrapping_integer_induction (def_stmt_info, loop))
6483 cond_reduc_dt = dt;
6484 cond_stmt_vinfo = def_stmt_info;
6488 if (!vectype_in)
6489 vectype_in = STMT_VINFO_VECTYPE (phi_info);
6490 STMT_VINFO_REDUC_VECTYPE_IN (reduc_info) = vectype_in;
6492 enum vect_reduction_type v_reduc_type = STMT_VINFO_REDUC_TYPE (phi_info);
6493 STMT_VINFO_REDUC_TYPE (reduc_info) = v_reduc_type;
6494 /* If we have a condition reduction, see if we can simplify it further. */
6495 if (v_reduc_type == COND_REDUCTION)
6497 if (slp_node)
6498 return false;
6500 /* When the condition uses the reduction value in the condition, fail. */
6501 if (STMT_VINFO_REDUC_IDX (stmt_info) == 0)
6503 if (dump_enabled_p ())
6504 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6505 "condition depends on previous iteration\n");
6506 return false;
6509 if (reduc_chain_length == 1
6510 && direct_internal_fn_supported_p (IFN_FOLD_EXTRACT_LAST,
6511 vectype_in, OPTIMIZE_FOR_SPEED))
6513 if (dump_enabled_p ())
6514 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6515 "optimizing condition reduction with"
6516 " FOLD_EXTRACT_LAST.\n");
6517 STMT_VINFO_REDUC_TYPE (reduc_info) = EXTRACT_LAST_REDUCTION;
6519 else if (cond_reduc_dt == vect_induction_def)
6521 tree base
6522 = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (cond_stmt_vinfo);
6523 tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (cond_stmt_vinfo);
6525 gcc_assert (TREE_CODE (base) == INTEGER_CST
6526 && TREE_CODE (step) == INTEGER_CST);
6527 cond_reduc_val = NULL_TREE;
6528 enum tree_code cond_reduc_op_code = ERROR_MARK;
6529 tree res = PHI_RESULT (STMT_VINFO_STMT (cond_stmt_vinfo));
6530 if (!types_compatible_p (TREE_TYPE (res), TREE_TYPE (base)))
6532 /* Find a suitable value, for MAX_EXPR below base, for MIN_EXPR
6533 above base; punt if base is the minimum value of the type for
6534 MAX_EXPR or maximum value of the type for MIN_EXPR for now. */
6535 else if (tree_int_cst_sgn (step) == -1)
6537 cond_reduc_op_code = MIN_EXPR;
6538 if (tree_int_cst_sgn (base) == -1)
6539 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
6540 else if (tree_int_cst_lt (base,
6541 TYPE_MAX_VALUE (TREE_TYPE (base))))
6542 cond_reduc_val
6543 = int_const_binop (PLUS_EXPR, base, integer_one_node);
6545 else
6547 cond_reduc_op_code = MAX_EXPR;
6548 if (tree_int_cst_sgn (base) == 1)
6549 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
6550 else if (tree_int_cst_lt (TYPE_MIN_VALUE (TREE_TYPE (base)),
6551 base))
6552 cond_reduc_val
6553 = int_const_binop (MINUS_EXPR, base, integer_one_node);
6555 if (cond_reduc_val)
6557 if (dump_enabled_p ())
6558 dump_printf_loc (MSG_NOTE, vect_location,
6559 "condition expression based on "
6560 "integer induction.\n");
6561 STMT_VINFO_REDUC_CODE (reduc_info) = cond_reduc_op_code;
6562 STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info)
6563 = cond_reduc_val;
6564 STMT_VINFO_REDUC_TYPE (reduc_info) = INTEGER_INDUC_COND_REDUCTION;
6567 else if (cond_reduc_dt == vect_constant_def)
6569 enum vect_def_type cond_initial_dt;
6570 tree cond_initial_val
6571 = PHI_ARG_DEF_FROM_EDGE (reduc_def_phi, loop_preheader_edge (loop));
6573 gcc_assert (cond_reduc_val != NULL_TREE);
6574 vect_is_simple_use (cond_initial_val, loop_vinfo, &cond_initial_dt);
6575 if (cond_initial_dt == vect_constant_def
6576 && types_compatible_p (TREE_TYPE (cond_initial_val),
6577 TREE_TYPE (cond_reduc_val)))
6579 tree e = fold_binary (LE_EXPR, boolean_type_node,
6580 cond_initial_val, cond_reduc_val);
6581 if (e && (integer_onep (e) || integer_zerop (e)))
6583 if (dump_enabled_p ())
6584 dump_printf_loc (MSG_NOTE, vect_location,
6585 "condition expression based on "
6586 "compile time constant.\n");
6587 /* Record reduction code at analysis stage. */
6588 STMT_VINFO_REDUC_CODE (reduc_info)
6589 = integer_onep (e) ? MAX_EXPR : MIN_EXPR;
6590 STMT_VINFO_REDUC_TYPE (reduc_info) = CONST_COND_REDUCTION;
6596 if (STMT_VINFO_LIVE_P (phi_info))
6597 return false;
6599 if (slp_node)
6600 ncopies = 1;
6601 else
6602 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
6604 gcc_assert (ncopies >= 1);
6606 poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
6608 if (nested_cycle)
6610 gcc_assert (STMT_VINFO_DEF_TYPE (reduc_info)
6611 == vect_double_reduction_def);
6612 double_reduc = true;
6615 /* 4.2. Check support for the epilog operation.
6617 If STMT represents a reduction pattern, then the type of the
6618 reduction variable may be different than the type of the rest
6619 of the arguments. For example, consider the case of accumulation
6620 of shorts into an int accumulator; The original code:
6621 S1: int_a = (int) short_a;
6622 orig_stmt-> S2: int_acc = plus <int_a ,int_acc>;
6624 was replaced with:
6625 STMT: int_acc = widen_sum <short_a, int_acc>
6627 This means that:
6628 1. The tree-code that is used to create the vector operation in the
6629 epilog code (that reduces the partial results) is not the
6630 tree-code of STMT, but is rather the tree-code of the original
6631 stmt from the pattern that STMT is replacing. I.e, in the example
6632 above we want to use 'widen_sum' in the loop, but 'plus' in the
6633 epilog.
6634 2. The type (mode) we use to check available target support
6635 for the vector operation to be created in the *epilog*, is
6636 determined by the type of the reduction variable (in the example
6637 above we'd check this: optab_handler (plus_optab, vect_int_mode])).
6638 However the type (mode) we use to check available target support
6639 for the vector operation to be created *inside the loop*, is
6640 determined by the type of the other arguments to STMT (in the
6641 example we'd check this: optab_handler (widen_sum_optab,
6642 vect_short_mode)).
6644 This is contrary to "regular" reductions, in which the types of all
6645 the arguments are the same as the type of the reduction variable.
6646 For "regular" reductions we can therefore use the same vector type
6647 (and also the same tree-code) when generating the epilog code and
6648 when generating the code inside the loop. */
6650 enum tree_code orig_code = STMT_VINFO_REDUC_CODE (phi_info);
6651 STMT_VINFO_REDUC_CODE (reduc_info) = orig_code;
6653 vect_reduction_type reduction_type = STMT_VINFO_REDUC_TYPE (reduc_info);
6654 if (reduction_type == TREE_CODE_REDUCTION)
6656 /* Check whether it's ok to change the order of the computation.
6657 Generally, when vectorizing a reduction we change the order of the
6658 computation. This may change the behavior of the program in some
6659 cases, so we need to check that this is ok. One exception is when
6660 vectorizing an outer-loop: the inner-loop is executed sequentially,
6661 and therefore vectorizing reductions in the inner-loop during
6662 outer-loop vectorization is safe. */
6663 if (needs_fold_left_reduction_p (scalar_type, orig_code))
6665 /* When vectorizing a reduction chain w/o SLP the reduction PHI
6666 is not directy used in stmt. */
6667 if (!only_slp_reduc_chain
6668 && reduc_chain_length != 1)
6670 if (dump_enabled_p ())
6671 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6672 "in-order reduction chain without SLP.\n");
6673 return false;
6675 STMT_VINFO_REDUC_TYPE (reduc_info)
6676 = reduction_type = FOLD_LEFT_REDUCTION;
6678 else if (!commutative_tree_code (orig_code)
6679 || !associative_tree_code (orig_code))
6681 if (dump_enabled_p ())
6682 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6683 "reduction: not commutative/associative");
6684 return false;
6688 if ((double_reduc || reduction_type != TREE_CODE_REDUCTION)
6689 && ncopies > 1)
6691 if (dump_enabled_p ())
6692 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6693 "multiple types in double reduction or condition "
6694 "reduction or fold-left reduction.\n");
6695 return false;
6698 internal_fn reduc_fn = IFN_LAST;
6699 if (reduction_type == TREE_CODE_REDUCTION
6700 || reduction_type == FOLD_LEFT_REDUCTION
6701 || reduction_type == INTEGER_INDUC_COND_REDUCTION
6702 || reduction_type == CONST_COND_REDUCTION)
6704 if (reduction_type == FOLD_LEFT_REDUCTION
6705 ? fold_left_reduction_fn (orig_code, &reduc_fn)
6706 : reduction_fn_for_scalar_code (orig_code, &reduc_fn))
6708 if (reduc_fn != IFN_LAST
6709 && !direct_internal_fn_supported_p (reduc_fn, vectype_out,
6710 OPTIMIZE_FOR_SPEED))
6712 if (dump_enabled_p ())
6713 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6714 "reduc op not supported by target.\n");
6716 reduc_fn = IFN_LAST;
6719 else
6721 if (!nested_cycle || double_reduc)
6723 if (dump_enabled_p ())
6724 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6725 "no reduc code for scalar code.\n");
6727 return false;
6731 else if (reduction_type == COND_REDUCTION)
6733 int scalar_precision
6734 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
6735 cr_index_scalar_type = make_unsigned_type (scalar_precision);
6736 cr_index_vector_type = build_vector_type (cr_index_scalar_type,
6737 nunits_out);
6739 if (direct_internal_fn_supported_p (IFN_REDUC_MAX, cr_index_vector_type,
6740 OPTIMIZE_FOR_SPEED))
6741 reduc_fn = IFN_REDUC_MAX;
6743 STMT_VINFO_REDUC_FN (reduc_info) = reduc_fn;
6745 if (reduction_type != EXTRACT_LAST_REDUCTION
6746 && (!nested_cycle || double_reduc)
6747 && reduc_fn == IFN_LAST
6748 && !nunits_out.is_constant ())
6750 if (dump_enabled_p ())
6751 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6752 "missing target support for reduction on"
6753 " variable-length vectors.\n");
6754 return false;
6757 /* For SLP reductions, see if there is a neutral value we can use. */
6758 tree neutral_op = NULL_TREE;
6759 if (slp_node)
6760 neutral_op = neutral_op_for_slp_reduction
6761 (slp_node_instance->reduc_phis, vectype_out, orig_code,
6762 REDUC_GROUP_FIRST_ELEMENT (stmt_info) != NULL);
6764 if (double_reduc && reduction_type == FOLD_LEFT_REDUCTION)
6766 /* We can't support in-order reductions of code such as this:
6768 for (int i = 0; i < n1; ++i)
6769 for (int j = 0; j < n2; ++j)
6770 l += a[j];
6772 since GCC effectively transforms the loop when vectorizing:
6774 for (int i = 0; i < n1 / VF; ++i)
6775 for (int j = 0; j < n2; ++j)
6776 for (int k = 0; k < VF; ++k)
6777 l += a[j];
6779 which is a reassociation of the original operation. */
6780 if (dump_enabled_p ())
6781 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6782 "in-order double reduction not supported.\n");
6784 return false;
6787 if (reduction_type == FOLD_LEFT_REDUCTION
6788 && slp_node
6789 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6791 /* We cannot use in-order reductions in this case because there is
6792 an implicit reassociation of the operations involved. */
6793 if (dump_enabled_p ())
6794 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6795 "in-order unchained SLP reductions not supported.\n");
6796 return false;
6799 /* For double reductions, and for SLP reductions with a neutral value,
6800 we construct a variable-length initial vector by loading a vector
6801 full of the neutral value and then shift-and-inserting the start
6802 values into the low-numbered elements. */
6803 if ((double_reduc || neutral_op)
6804 && !nunits_out.is_constant ()
6805 && !direct_internal_fn_supported_p (IFN_VEC_SHL_INSERT,
6806 vectype_out, OPTIMIZE_FOR_SPEED))
6808 if (dump_enabled_p ())
6809 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6810 "reduction on variable-length vectors requires"
6811 " target support for a vector-shift-and-insert"
6812 " operation.\n");
6813 return false;
6816 /* Check extra constraints for variable-length unchained SLP reductions. */
6817 if (STMT_SLP_TYPE (stmt_info)
6818 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info)
6819 && !nunits_out.is_constant ())
6821 /* We checked above that we could build the initial vector when
6822 there's a neutral element value. Check here for the case in
6823 which each SLP statement has its own initial value and in which
6824 that value needs to be repeated for every instance of the
6825 statement within the initial vector. */
6826 unsigned int group_size = SLP_TREE_LANES (slp_node);
6827 if (!neutral_op
6828 && !can_duplicate_and_interleave_p (loop_vinfo, group_size,
6829 TREE_TYPE (vectype_out)))
6831 if (dump_enabled_p ())
6832 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6833 "unsupported form of SLP reduction for"
6834 " variable-length vectors: cannot build"
6835 " initial vector.\n");
6836 return false;
6838 /* The epilogue code relies on the number of elements being a multiple
6839 of the group size. The duplicate-and-interleave approach to setting
6840 up the initial vector does too. */
6841 if (!multiple_p (nunits_out, group_size))
6843 if (dump_enabled_p ())
6844 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6845 "unsupported form of SLP reduction for"
6846 " variable-length vectors: the vector size"
6847 " is not a multiple of the number of results.\n");
6848 return false;
6852 if (reduction_type == COND_REDUCTION)
6854 widest_int ni;
6856 if (! max_loop_iterations (loop, &ni))
6858 if (dump_enabled_p ())
6859 dump_printf_loc (MSG_NOTE, vect_location,
6860 "loop count not known, cannot create cond "
6861 "reduction.\n");
6862 return false;
6864 /* Convert backedges to iterations. */
6865 ni += 1;
6867 /* The additional index will be the same type as the condition. Check
6868 that the loop can fit into this less one (because we'll use up the
6869 zero slot for when there are no matches). */
6870 tree max_index = TYPE_MAX_VALUE (cr_index_scalar_type);
6871 if (wi::geu_p (ni, wi::to_widest (max_index)))
6873 if (dump_enabled_p ())
6874 dump_printf_loc (MSG_NOTE, vect_location,
6875 "loop size is greater than data size.\n");
6876 return false;
6880 /* In case the vectorization factor (VF) is bigger than the number
6881 of elements that we can fit in a vectype (nunits), we have to generate
6882 more than one vector stmt - i.e - we need to "unroll" the
6883 vector stmt by a factor VF/nunits. For more details see documentation
6884 in vectorizable_operation. */
6886 /* If the reduction is used in an outer loop we need to generate
6887 VF intermediate results, like so (e.g. for ncopies=2):
6888 r0 = phi (init, r0)
6889 r1 = phi (init, r1)
6890 r0 = x0 + r0;
6891 r1 = x1 + r1;
6892 (i.e. we generate VF results in 2 registers).
6893 In this case we have a separate def-use cycle for each copy, and therefore
6894 for each copy we get the vector def for the reduction variable from the
6895 respective phi node created for this copy.
6897 Otherwise (the reduction is unused in the loop nest), we can combine
6898 together intermediate results, like so (e.g. for ncopies=2):
6899 r = phi (init, r)
6900 r = x0 + r;
6901 r = x1 + r;
6902 (i.e. we generate VF/2 results in a single register).
6903 In this case for each copy we get the vector def for the reduction variable
6904 from the vectorized reduction operation generated in the previous iteration.
6906 This only works when we see both the reduction PHI and its only consumer
6907 in vectorizable_reduction and there are no intermediate stmts
6908 participating. */
6909 if (ncopies > 1
6910 && (STMT_VINFO_RELEVANT (stmt_info) <= vect_used_only_live)
6911 && reduc_chain_length == 1)
6912 single_defuse_cycle = true;
6914 if (single_defuse_cycle || lane_reduc_code_p)
6916 gcc_assert (code != COND_EXPR);
6918 /* 4. Supportable by target? */
6919 bool ok = true;
6921 /* 4.1. check support for the operation in the loop */
6922 optab optab = optab_for_tree_code (code, vectype_in, optab_vector);
6923 if (!optab)
6925 if (dump_enabled_p ())
6926 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6927 "no optab.\n");
6928 ok = false;
6931 machine_mode vec_mode = TYPE_MODE (vectype_in);
6932 if (ok && optab_handler (optab, vec_mode) == CODE_FOR_nothing)
6934 if (dump_enabled_p ())
6935 dump_printf (MSG_NOTE, "op not supported by target.\n");
6936 if (maybe_ne (GET_MODE_SIZE (vec_mode), UNITS_PER_WORD)
6937 || !vect_worthwhile_without_simd_p (loop_vinfo, code))
6938 ok = false;
6939 else
6940 if (dump_enabled_p ())
6941 dump_printf (MSG_NOTE, "proceeding using word mode.\n");
6944 /* Worthwhile without SIMD support? */
6945 if (ok
6946 && !VECTOR_MODE_P (TYPE_MODE (vectype_in))
6947 && !vect_worthwhile_without_simd_p (loop_vinfo, code))
6949 if (dump_enabled_p ())
6950 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6951 "not worthwhile without SIMD support.\n");
6952 ok = false;
6955 /* lane-reducing operations have to go through vect_transform_reduction.
6956 For the other cases try without the single cycle optimization. */
6957 if (!ok)
6959 if (lane_reduc_code_p)
6960 return false;
6961 else
6962 single_defuse_cycle = false;
6965 STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info) = single_defuse_cycle;
6967 /* If the reduction stmt is one of the patterns that have lane
6968 reduction embedded we cannot handle the case of ! single_defuse_cycle. */
6969 if ((ncopies > 1 && ! single_defuse_cycle)
6970 && lane_reduc_code_p)
6972 if (dump_enabled_p ())
6973 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6974 "multi def-use cycle not possible for lane-reducing "
6975 "reduction operation\n");
6976 return false;
6979 if (slp_node
6980 && !(!single_defuse_cycle
6981 && code != DOT_PROD_EXPR
6982 && code != WIDEN_SUM_EXPR
6983 && code != SAD_EXPR
6984 && reduction_type != FOLD_LEFT_REDUCTION))
6985 for (i = 0; i < op_type; i++)
6986 if (!vect_maybe_update_slp_op_vectype (slp_op[i], vectype_in))
6988 if (dump_enabled_p ())
6989 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6990 "incompatible vector types for invariants\n");
6991 return false;
6994 if (slp_node)
6995 vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
6996 else
6997 vec_num = 1;
6999 vect_model_reduction_cost (loop_vinfo, stmt_info, reduc_fn,
7000 reduction_type, ncopies, cost_vec);
7001 if (dump_enabled_p ()
7002 && reduction_type == FOLD_LEFT_REDUCTION)
7003 dump_printf_loc (MSG_NOTE, vect_location,
7004 "using an in-order (fold-left) reduction.\n");
7005 STMT_VINFO_TYPE (orig_stmt_of_analysis) = cycle_phi_info_type;
7006 /* All but single defuse-cycle optimized, lane-reducing and fold-left
7007 reductions go through their own vectorizable_* routines. */
7008 if (!single_defuse_cycle
7009 && code != DOT_PROD_EXPR
7010 && code != WIDEN_SUM_EXPR
7011 && code != SAD_EXPR
7012 && reduction_type != FOLD_LEFT_REDUCTION)
7014 stmt_vec_info tem
7015 = vect_stmt_to_vectorize (STMT_VINFO_REDUC_DEF (phi_info));
7016 if (slp_node && REDUC_GROUP_FIRST_ELEMENT (tem))
7018 gcc_assert (!REDUC_GROUP_NEXT_ELEMENT (tem));
7019 tem = REDUC_GROUP_FIRST_ELEMENT (tem);
7021 STMT_VINFO_DEF_TYPE (vect_orig_stmt (tem)) = vect_internal_def;
7022 STMT_VINFO_DEF_TYPE (tem) = vect_internal_def;
7024 else if (loop_vinfo && LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
7026 vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
7027 internal_fn cond_fn = get_conditional_internal_fn (code);
7029 if (reduction_type != FOLD_LEFT_REDUCTION
7030 && !use_mask_by_cond_expr_p (code, cond_fn, vectype_in)
7031 && (cond_fn == IFN_LAST
7032 || !direct_internal_fn_supported_p (cond_fn, vectype_in,
7033 OPTIMIZE_FOR_SPEED)))
7035 if (dump_enabled_p ())
7036 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7037 "can't operate on partial vectors because"
7038 " no conditional operation is available.\n");
7039 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
7041 else if (reduction_type == FOLD_LEFT_REDUCTION
7042 && reduc_fn == IFN_LAST
7043 && !expand_vec_cond_expr_p (vectype_in,
7044 truth_type_for (vectype_in),
7045 SSA_NAME))
7047 if (dump_enabled_p ())
7048 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7049 "can't operate on partial vectors because"
7050 " no conditional operation is available.\n");
7051 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
7053 else
7054 vect_record_loop_mask (loop_vinfo, masks, ncopies * vec_num,
7055 vectype_in, NULL);
7057 return true;
7060 /* Transform the definition stmt STMT_INFO of a reduction PHI backedge
7061 value. */
7063 bool
7064 vect_transform_reduction (loop_vec_info loop_vinfo,
7065 stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
7066 gimple **vec_stmt, slp_tree slp_node)
7068 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
7069 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7070 int i;
7071 int ncopies;
7072 int vec_num;
7074 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
7075 gcc_assert (reduc_info->is_reduc_info);
7077 if (nested_in_vect_loop_p (loop, stmt_info))
7079 loop = loop->inner;
7080 gcc_assert (STMT_VINFO_DEF_TYPE (reduc_info) == vect_double_reduction_def);
7083 gassign *stmt = as_a <gassign *> (stmt_info->stmt);
7084 enum tree_code code = gimple_assign_rhs_code (stmt);
7085 int op_type = TREE_CODE_LENGTH (code);
7087 /* Flatten RHS. */
7088 tree ops[3];
7089 switch (get_gimple_rhs_class (code))
7091 case GIMPLE_TERNARY_RHS:
7092 ops[2] = gimple_assign_rhs3 (stmt);
7093 /* Fall thru. */
7094 case GIMPLE_BINARY_RHS:
7095 ops[0] = gimple_assign_rhs1 (stmt);
7096 ops[1] = gimple_assign_rhs2 (stmt);
7097 break;
7098 default:
7099 gcc_unreachable ();
7102 /* All uses but the last are expected to be defined in the loop.
7103 The last use is the reduction variable. In case of nested cycle this
7104 assumption is not true: we use reduc_index to record the index of the
7105 reduction variable. */
7106 stmt_vec_info phi_info = STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info));
7107 gphi *reduc_def_phi = as_a <gphi *> (phi_info->stmt);
7108 int reduc_index = STMT_VINFO_REDUC_IDX (stmt_info);
7109 tree vectype_in = STMT_VINFO_REDUC_VECTYPE_IN (reduc_info);
7111 if (slp_node)
7113 ncopies = 1;
7114 vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
7116 else
7118 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
7119 vec_num = 1;
7122 internal_fn cond_fn = get_conditional_internal_fn (code);
7123 vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
7124 bool mask_by_cond_expr = use_mask_by_cond_expr_p (code, cond_fn, vectype_in);
7126 /* Transform. */
7127 tree new_temp = NULL_TREE;
7128 auto_vec<tree> vec_oprnds0;
7129 auto_vec<tree> vec_oprnds1;
7130 auto_vec<tree> vec_oprnds2;
7131 tree def0;
7133 if (dump_enabled_p ())
7134 dump_printf_loc (MSG_NOTE, vect_location, "transform reduction.\n");
7136 /* FORNOW: Multiple types are not supported for condition. */
7137 if (code == COND_EXPR)
7138 gcc_assert (ncopies == 1);
7140 bool masked_loop_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
7142 vect_reduction_type reduction_type = STMT_VINFO_REDUC_TYPE (reduc_info);
7143 if (reduction_type == FOLD_LEFT_REDUCTION)
7145 internal_fn reduc_fn = STMT_VINFO_REDUC_FN (reduc_info);
7146 return vectorize_fold_left_reduction
7147 (loop_vinfo, stmt_info, gsi, vec_stmt, slp_node, reduc_def_phi, code,
7148 reduc_fn, ops, vectype_in, reduc_index, masks);
7151 bool single_defuse_cycle = STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info);
7152 gcc_assert (single_defuse_cycle
7153 || code == DOT_PROD_EXPR
7154 || code == WIDEN_SUM_EXPR
7155 || code == SAD_EXPR);
7157 /* Create the destination vector */
7158 tree scalar_dest = gimple_assign_lhs (stmt);
7159 tree vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
7161 vect_get_vec_defs (loop_vinfo, stmt_info, slp_node, ncopies,
7162 single_defuse_cycle && reduc_index == 0
7163 ? NULL_TREE : ops[0], &vec_oprnds0,
7164 single_defuse_cycle && reduc_index == 1
7165 ? NULL_TREE : ops[1], &vec_oprnds1,
7166 op_type == ternary_op
7167 && !(single_defuse_cycle && reduc_index == 2)
7168 ? ops[2] : NULL_TREE, &vec_oprnds2);
7169 if (single_defuse_cycle)
7171 gcc_assert (!slp_node);
7172 vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
7173 ops[reduc_index],
7174 reduc_index == 0 ? &vec_oprnds0
7175 : (reduc_index == 1 ? &vec_oprnds1
7176 : &vec_oprnds2));
7179 FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
7181 gimple *new_stmt;
7182 tree vop[3] = { def0, vec_oprnds1[i], NULL_TREE };
7183 if (masked_loop_p && !mask_by_cond_expr)
7185 /* Make sure that the reduction accumulator is vop[0]. */
7186 if (reduc_index == 1)
7188 gcc_assert (commutative_tree_code (code));
7189 std::swap (vop[0], vop[1]);
7191 tree mask = vect_get_loop_mask (gsi, masks, vec_num * ncopies,
7192 vectype_in, i);
7193 gcall *call = gimple_build_call_internal (cond_fn, 4, mask,
7194 vop[0], vop[1], vop[0]);
7195 new_temp = make_ssa_name (vec_dest, call);
7196 gimple_call_set_lhs (call, new_temp);
7197 gimple_call_set_nothrow (call, true);
7198 vect_finish_stmt_generation (loop_vinfo, stmt_info, call, gsi);
7199 new_stmt = call;
7201 else
7203 if (op_type == ternary_op)
7204 vop[2] = vec_oprnds2[i];
7206 if (masked_loop_p && mask_by_cond_expr)
7208 tree mask = vect_get_loop_mask (gsi, masks, vec_num * ncopies,
7209 vectype_in, i);
7210 build_vect_cond_expr (code, vop, mask, gsi);
7213 new_stmt = gimple_build_assign (vec_dest, code,
7214 vop[0], vop[1], vop[2]);
7215 new_temp = make_ssa_name (vec_dest, new_stmt);
7216 gimple_assign_set_lhs (new_stmt, new_temp);
7217 vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
7220 if (slp_node)
7221 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
7222 else if (single_defuse_cycle
7223 && i < ncopies - 1)
7225 if (reduc_index == 0)
7226 vec_oprnds0.safe_push (gimple_get_lhs (new_stmt));
7227 else if (reduc_index == 1)
7228 vec_oprnds1.safe_push (gimple_get_lhs (new_stmt));
7229 else if (reduc_index == 2)
7230 vec_oprnds2.safe_push (gimple_get_lhs (new_stmt));
7232 else
7233 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
7236 if (!slp_node)
7237 *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
7239 return true;
7242 /* Transform phase of a cycle PHI. */
7244 bool
7245 vect_transform_cycle_phi (loop_vec_info loop_vinfo,
7246 stmt_vec_info stmt_info, gimple **vec_stmt,
7247 slp_tree slp_node, slp_instance slp_node_instance)
7249 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
7250 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7251 int i;
7252 int ncopies;
7253 int j;
7254 bool nested_cycle = false;
7255 int vec_num;
7257 if (nested_in_vect_loop_p (loop, stmt_info))
7259 loop = loop->inner;
7260 nested_cycle = true;
7263 stmt_vec_info reduc_stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
7264 reduc_stmt_info = vect_stmt_to_vectorize (reduc_stmt_info);
7265 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
7266 gcc_assert (reduc_info->is_reduc_info);
7268 if (STMT_VINFO_REDUC_TYPE (reduc_info) == EXTRACT_LAST_REDUCTION
7269 || STMT_VINFO_REDUC_TYPE (reduc_info) == FOLD_LEFT_REDUCTION)
7270 /* Leave the scalar phi in place. */
7271 return true;
7273 tree vectype_in = STMT_VINFO_REDUC_VECTYPE_IN (reduc_info);
7274 /* For a nested cycle we do not fill the above. */
7275 if (!vectype_in)
7276 vectype_in = STMT_VINFO_VECTYPE (stmt_info);
7277 gcc_assert (vectype_in);
7279 if (slp_node)
7281 /* The size vect_schedule_slp_instance computes is off for us. */
7282 vec_num = vect_get_num_vectors (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
7283 * SLP_TREE_LANES (slp_node), vectype_in);
7284 ncopies = 1;
7286 else
7288 vec_num = 1;
7289 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
7292 /* Check whether we should use a single PHI node and accumulate
7293 vectors to one before the backedge. */
7294 if (STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info))
7295 ncopies = 1;
7297 /* Create the destination vector */
7298 gphi *phi = as_a <gphi *> (stmt_info->stmt);
7299 tree vec_dest = vect_create_destination_var (gimple_phi_result (phi),
7300 vectype_out);
7302 /* Get the loop-entry arguments. */
7303 tree vec_initial_def;
7304 auto_vec<tree> vec_initial_defs;
7305 if (slp_node)
7307 vec_initial_defs.reserve (vec_num);
7308 gcc_assert (slp_node == slp_node_instance->reduc_phis);
7309 stmt_vec_info first = REDUC_GROUP_FIRST_ELEMENT (reduc_stmt_info);
7310 tree neutral_op
7311 = neutral_op_for_slp_reduction (slp_node, vectype_out,
7312 STMT_VINFO_REDUC_CODE (reduc_info),
7313 first != NULL);
7314 get_initial_defs_for_reduction (loop_vinfo, slp_node_instance->reduc_phis,
7315 &vec_initial_defs, vec_num,
7316 first != NULL, neutral_op);
7318 else
7320 /* Get at the scalar def before the loop, that defines the initial
7321 value of the reduction variable. */
7322 tree initial_def = PHI_ARG_DEF_FROM_EDGE (phi,
7323 loop_preheader_edge (loop));
7324 /* Optimize: if initial_def is for REDUC_MAX smaller than the base
7325 and we can't use zero for induc_val, use initial_def. Similarly
7326 for REDUC_MIN and initial_def larger than the base. */
7327 if (STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
7329 tree induc_val = STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info);
7330 if (TREE_CODE (initial_def) == INTEGER_CST
7331 && !integer_zerop (induc_val)
7332 && ((STMT_VINFO_REDUC_CODE (reduc_info) == MAX_EXPR
7333 && tree_int_cst_lt (initial_def, induc_val))
7334 || (STMT_VINFO_REDUC_CODE (reduc_info) == MIN_EXPR
7335 && tree_int_cst_lt (induc_val, initial_def))))
7337 induc_val = initial_def;
7338 /* Communicate we used the initial_def to epilouge
7339 generation. */
7340 STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info) = NULL_TREE;
7342 vec_initial_def = build_vector_from_val (vectype_out, induc_val);
7343 vec_initial_defs.create (ncopies);
7344 for (i = 0; i < ncopies; ++i)
7345 vec_initial_defs.quick_push (vec_initial_def);
7347 else if (nested_cycle)
7349 /* Do not use an adjustment def as that case is not supported
7350 correctly if ncopies is not one. */
7351 vect_get_vec_defs_for_operand (loop_vinfo, reduc_stmt_info,
7352 ncopies, initial_def,
7353 &vec_initial_defs);
7355 else
7357 tree adjustment_def = NULL_TREE;
7358 tree *adjustment_defp = &adjustment_def;
7359 enum tree_code code = STMT_VINFO_REDUC_CODE (reduc_info);
7360 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
7361 adjustment_defp = NULL;
7362 vec_initial_def
7363 = get_initial_def_for_reduction (loop_vinfo, reduc_stmt_info, code,
7364 initial_def, adjustment_defp);
7365 STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info) = adjustment_def;
7366 vec_initial_defs.create (ncopies);
7367 for (i = 0; i < ncopies; ++i)
7368 vec_initial_defs.quick_push (vec_initial_def);
7372 /* Generate the reduction PHIs upfront. */
7373 for (i = 0; i < vec_num; i++)
7375 tree vec_init_def = vec_initial_defs[i];
7376 for (j = 0; j < ncopies; j++)
7378 /* Create the reduction-phi that defines the reduction
7379 operand. */
7380 gphi *new_phi = create_phi_node (vec_dest, loop->header);
7382 /* Set the loop-entry arg of the reduction-phi. */
7383 if (j != 0 && nested_cycle)
7384 vec_init_def = vec_initial_defs[j];
7385 add_phi_arg (new_phi, vec_init_def, loop_preheader_edge (loop),
7386 UNKNOWN_LOCATION);
7388 /* The loop-latch arg is set in epilogue processing. */
7390 if (slp_node)
7391 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_phi);
7392 else
7394 if (j == 0)
7395 *vec_stmt = new_phi;
7396 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_phi);
7401 return true;
7404 /* Vectorizes LC PHIs. */
7406 bool
7407 vectorizable_lc_phi (loop_vec_info loop_vinfo,
7408 stmt_vec_info stmt_info, gimple **vec_stmt,
7409 slp_tree slp_node)
7411 if (!loop_vinfo
7412 || !is_a <gphi *> (stmt_info->stmt)
7413 || gimple_phi_num_args (stmt_info->stmt) != 1)
7414 return false;
7416 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_internal_def
7417 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
7418 return false;
7420 if (!vec_stmt) /* transformation not required. */
7422 STMT_VINFO_TYPE (stmt_info) = lc_phi_info_type;
7423 return true;
7426 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
7427 tree scalar_dest = gimple_phi_result (stmt_info->stmt);
7428 basic_block bb = gimple_bb (stmt_info->stmt);
7429 edge e = single_pred_edge (bb);
7430 tree vec_dest = vect_create_destination_var (scalar_dest, vectype);
7431 auto_vec<tree> vec_oprnds;
7432 vect_get_vec_defs (loop_vinfo, stmt_info, slp_node,
7433 !slp_node ? vect_get_num_copies (loop_vinfo, vectype) : 1,
7434 gimple_phi_arg_def (stmt_info->stmt, 0), &vec_oprnds);
7435 for (unsigned i = 0; i < vec_oprnds.length (); i++)
7437 /* Create the vectorized LC PHI node. */
7438 gphi *new_phi = create_phi_node (vec_dest, bb);
7439 add_phi_arg (new_phi, vec_oprnds[i], e, UNKNOWN_LOCATION);
7440 if (slp_node)
7441 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_phi);
7442 else
7443 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_phi);
7445 if (!slp_node)
7446 *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
7448 return true;
7452 /* Function vect_min_worthwhile_factor.
7454 For a loop where we could vectorize the operation indicated by CODE,
7455 return the minimum vectorization factor that makes it worthwhile
7456 to use generic vectors. */
7457 static unsigned int
7458 vect_min_worthwhile_factor (enum tree_code code)
7460 switch (code)
7462 case PLUS_EXPR:
7463 case MINUS_EXPR:
7464 case NEGATE_EXPR:
7465 return 4;
7467 case BIT_AND_EXPR:
7468 case BIT_IOR_EXPR:
7469 case BIT_XOR_EXPR:
7470 case BIT_NOT_EXPR:
7471 return 2;
7473 default:
7474 return INT_MAX;
7478 /* Return true if VINFO indicates we are doing loop vectorization and if
7479 it is worth decomposing CODE operations into scalar operations for
7480 that loop's vectorization factor. */
7482 bool
7483 vect_worthwhile_without_simd_p (vec_info *vinfo, tree_code code)
7485 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
7486 unsigned HOST_WIDE_INT value;
7487 return (loop_vinfo
7488 && LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&value)
7489 && value >= vect_min_worthwhile_factor (code));
7492 /* Function vectorizable_induction
7494 Check if STMT_INFO performs an induction computation that can be vectorized.
7495 If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
7496 phi to replace it, put it in VEC_STMT, and add it to the same basic block.
7497 Return true if STMT_INFO is vectorizable in this way. */
7499 bool
7500 vectorizable_induction (loop_vec_info loop_vinfo,
7501 stmt_vec_info stmt_info,
7502 gimple **vec_stmt, slp_tree slp_node,
7503 stmt_vector_for_cost *cost_vec)
7505 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7506 unsigned ncopies;
7507 bool nested_in_vect_loop = false;
7508 class loop *iv_loop;
7509 tree vec_def;
7510 edge pe = loop_preheader_edge (loop);
7511 basic_block new_bb;
7512 tree new_vec, vec_init, vec_step, t;
7513 tree new_name;
7514 gimple *new_stmt;
7515 gphi *induction_phi;
7516 tree induc_def, vec_dest;
7517 tree init_expr, step_expr;
7518 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
7519 unsigned i;
7520 tree expr;
7521 gimple_seq stmts;
7522 gimple_stmt_iterator si;
7524 gphi *phi = dyn_cast <gphi *> (stmt_info->stmt);
7525 if (!phi)
7526 return false;
7528 if (!STMT_VINFO_RELEVANT_P (stmt_info))
7529 return false;
7531 /* Make sure it was recognized as induction computation. */
7532 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
7533 return false;
7535 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
7536 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
7538 if (slp_node)
7539 ncopies = 1;
7540 else
7541 ncopies = vect_get_num_copies (loop_vinfo, vectype);
7542 gcc_assert (ncopies >= 1);
7544 /* FORNOW. These restrictions should be relaxed. */
7545 if (nested_in_vect_loop_p (loop, stmt_info))
7547 imm_use_iterator imm_iter;
7548 use_operand_p use_p;
7549 gimple *exit_phi;
7550 edge latch_e;
7551 tree loop_arg;
7553 if (ncopies > 1)
7555 if (dump_enabled_p ())
7556 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7557 "multiple types in nested loop.\n");
7558 return false;
7561 /* FORNOW: outer loop induction with SLP not supported. */
7562 if (STMT_SLP_TYPE (stmt_info))
7563 return false;
7565 exit_phi = NULL;
7566 latch_e = loop_latch_edge (loop->inner);
7567 loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
7568 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
7570 gimple *use_stmt = USE_STMT (use_p);
7571 if (is_gimple_debug (use_stmt))
7572 continue;
7574 if (!flow_bb_inside_loop_p (loop->inner, gimple_bb (use_stmt)))
7576 exit_phi = use_stmt;
7577 break;
7580 if (exit_phi)
7582 stmt_vec_info exit_phi_vinfo = loop_vinfo->lookup_stmt (exit_phi);
7583 if (!(STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
7584 && !STMT_VINFO_LIVE_P (exit_phi_vinfo)))
7586 if (dump_enabled_p ())
7587 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7588 "inner-loop induction only used outside "
7589 "of the outer vectorized loop.\n");
7590 return false;
7594 nested_in_vect_loop = true;
7595 iv_loop = loop->inner;
7597 else
7598 iv_loop = loop;
7599 gcc_assert (iv_loop == (gimple_bb (phi))->loop_father);
7601 if (slp_node && !nunits.is_constant ())
7603 /* The current SLP code creates the initial value element-by-element. */
7604 if (dump_enabled_p ())
7605 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7606 "SLP induction not supported for variable-length"
7607 " vectors.\n");
7608 return false;
7611 if (!vec_stmt) /* transformation not required. */
7613 STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
7614 DUMP_VECT_SCOPE ("vectorizable_induction");
7615 vect_model_induction_cost (stmt_info, ncopies, cost_vec);
7616 return true;
7619 /* Transform. */
7621 /* Compute a vector variable, initialized with the first VF values of
7622 the induction variable. E.g., for an iv with IV_PHI='X' and
7623 evolution S, for a vector of 4 units, we want to compute:
7624 [X, X + S, X + 2*S, X + 3*S]. */
7626 if (dump_enabled_p ())
7627 dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n");
7629 step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info);
7630 gcc_assert (step_expr != NULL_TREE);
7631 tree step_vectype = get_same_sized_vectype (TREE_TYPE (step_expr), vectype);
7633 pe = loop_preheader_edge (iv_loop);
7634 init_expr = PHI_ARG_DEF_FROM_EDGE (phi,
7635 loop_preheader_edge (iv_loop));
7637 stmts = NULL;
7638 if (!nested_in_vect_loop)
7640 /* Convert the initial value to the IV update type. */
7641 tree new_type = TREE_TYPE (step_expr);
7642 init_expr = gimple_convert (&stmts, new_type, init_expr);
7644 /* If we are using the loop mask to "peel" for alignment then we need
7645 to adjust the start value here. */
7646 tree skip_niters = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
7647 if (skip_niters != NULL_TREE)
7649 if (FLOAT_TYPE_P (vectype))
7650 skip_niters = gimple_build (&stmts, FLOAT_EXPR, new_type,
7651 skip_niters);
7652 else
7653 skip_niters = gimple_convert (&stmts, new_type, skip_niters);
7654 tree skip_step = gimple_build (&stmts, MULT_EXPR, new_type,
7655 skip_niters, step_expr);
7656 init_expr = gimple_build (&stmts, MINUS_EXPR, new_type,
7657 init_expr, skip_step);
7661 if (stmts)
7663 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
7664 gcc_assert (!new_bb);
7667 /* Find the first insertion point in the BB. */
7668 basic_block bb = gimple_bb (phi);
7669 si = gsi_after_labels (bb);
7671 /* For SLP induction we have to generate several IVs as for example
7672 with group size 3 we need [i, i, i, i + S] [i + S, i + S, i + 2*S, i + 2*S]
7673 [i + 2*S, i + 3*S, i + 3*S, i + 3*S]. The step is the same uniform
7674 [VF*S, VF*S, VF*S, VF*S] for all. */
7675 if (slp_node)
7677 /* Enforced above. */
7678 unsigned int const_nunits = nunits.to_constant ();
7680 /* Generate [VF*S, VF*S, ... ]. */
7681 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7683 expr = build_int_cst (integer_type_node, vf);
7684 expr = fold_convert (TREE_TYPE (step_expr), expr);
7686 else
7687 expr = build_int_cst (TREE_TYPE (step_expr), vf);
7688 new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
7689 expr, step_expr);
7690 if (! CONSTANT_CLASS_P (new_name))
7691 new_name = vect_init_vector (loop_vinfo, stmt_info, new_name,
7692 TREE_TYPE (step_expr), NULL);
7693 new_vec = build_vector_from_val (step_vectype, new_name);
7694 vec_step = vect_init_vector (loop_vinfo, stmt_info,
7695 new_vec, step_vectype, NULL);
7697 /* Now generate the IVs. */
7698 unsigned group_size = SLP_TREE_LANES (slp_node);
7699 unsigned nvects = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
7700 unsigned elts = const_nunits * nvects;
7701 /* Compute the number of distinct IVs we need. First reduce
7702 group_size if it is a multiple of const_nunits so we get
7703 one IV for a group_size of 4 but const_nunits 2. */
7704 unsigned group_sizep = group_size;
7705 if (group_sizep % const_nunits == 0)
7706 group_sizep = group_sizep / const_nunits;
7707 unsigned nivs = least_common_multiple (group_sizep,
7708 const_nunits) / const_nunits;
7709 gcc_assert (elts % group_size == 0);
7710 tree elt = init_expr;
7711 unsigned ivn;
7712 for (ivn = 0; ivn < nivs; ++ivn)
7714 tree_vector_builder elts (step_vectype, const_nunits, 1);
7715 stmts = NULL;
7716 for (unsigned eltn = 0; eltn < const_nunits; ++eltn)
7718 if (ivn*const_nunits + eltn >= group_size
7719 && (ivn * const_nunits + eltn) % group_size == 0)
7720 elt = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (elt),
7721 elt, step_expr);
7722 elts.quick_push (elt);
7724 vec_init = gimple_build_vector (&stmts, &elts);
7725 vec_init = gimple_convert (&stmts, vectype, vec_init);
7726 if (stmts)
7728 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
7729 gcc_assert (!new_bb);
7732 /* Create the induction-phi that defines the induction-operand. */
7733 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
7734 induction_phi = create_phi_node (vec_dest, iv_loop->header);
7735 induc_def = PHI_RESULT (induction_phi);
7737 /* Create the iv update inside the loop */
7738 gimple_seq stmts = NULL;
7739 vec_def = gimple_convert (&stmts, step_vectype, induc_def);
7740 vec_def = gimple_build (&stmts,
7741 PLUS_EXPR, step_vectype, vec_def, vec_step);
7742 vec_def = gimple_convert (&stmts, vectype, vec_def);
7743 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
7745 /* Set the arguments of the phi node: */
7746 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
7747 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
7748 UNKNOWN_LOCATION);
7750 SLP_TREE_VEC_STMTS (slp_node).quick_push (induction_phi);
7752 /* Fill up to the number of vectors we need for the whole group. */
7753 nivs = least_common_multiple (group_size,
7754 const_nunits) / const_nunits;
7755 for (; ivn < nivs; ++ivn)
7756 SLP_TREE_VEC_STMTS (slp_node)
7757 .quick_push (SLP_TREE_VEC_STMTS (slp_node)[0]);
7759 /* Re-use IVs when we can. */
7760 if (ivn < nvects)
7762 unsigned vfp
7763 = least_common_multiple (group_size, const_nunits) / group_size;
7764 /* Generate [VF'*S, VF'*S, ... ]. */
7765 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7767 expr = build_int_cst (integer_type_node, vfp);
7768 expr = fold_convert (TREE_TYPE (step_expr), expr);
7770 else
7771 expr = build_int_cst (TREE_TYPE (step_expr), vfp);
7772 new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
7773 expr, step_expr);
7774 if (! CONSTANT_CLASS_P (new_name))
7775 new_name = vect_init_vector (loop_vinfo, stmt_info, new_name,
7776 TREE_TYPE (step_expr), NULL);
7777 new_vec = build_vector_from_val (step_vectype, new_name);
7778 vec_step = vect_init_vector (loop_vinfo, stmt_info, new_vec,
7779 step_vectype, NULL);
7780 for (; ivn < nvects; ++ivn)
7782 gimple *iv = SLP_TREE_VEC_STMTS (slp_node)[ivn - nivs];
7783 tree def;
7784 if (gimple_code (iv) == GIMPLE_PHI)
7785 def = gimple_phi_result (iv);
7786 else
7787 def = gimple_assign_lhs (iv);
7788 gimple_seq stmts = NULL;
7789 def = gimple_convert (&stmts, step_vectype, def);
7790 def = gimple_build (&stmts,
7791 PLUS_EXPR, step_vectype, def, vec_step);
7792 def = gimple_convert (&stmts, vectype, def);
7793 if (gimple_code (iv) == GIMPLE_PHI)
7794 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
7795 else
7797 gimple_stmt_iterator tgsi = gsi_for_stmt (iv);
7798 gsi_insert_seq_after (&tgsi, stmts, GSI_CONTINUE_LINKING);
7800 SLP_TREE_VEC_STMTS (slp_node)
7801 .quick_push (SSA_NAME_DEF_STMT (def));
7805 return true;
7808 /* Create the vector that holds the initial_value of the induction. */
7809 if (nested_in_vect_loop)
7811 /* iv_loop is nested in the loop to be vectorized. init_expr had already
7812 been created during vectorization of previous stmts. We obtain it
7813 from the STMT_VINFO_VEC_STMT of the defining stmt. */
7814 auto_vec<tree> vec_inits;
7815 vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
7816 init_expr, &vec_inits);
7817 vec_init = vec_inits[0];
7818 /* If the initial value is not of proper type, convert it. */
7819 if (!useless_type_conversion_p (vectype, TREE_TYPE (vec_init)))
7821 new_stmt
7822 = gimple_build_assign (vect_get_new_ssa_name (vectype,
7823 vect_simple_var,
7824 "vec_iv_"),
7825 VIEW_CONVERT_EXPR,
7826 build1 (VIEW_CONVERT_EXPR, vectype,
7827 vec_init));
7828 vec_init = gimple_assign_lhs (new_stmt);
7829 new_bb = gsi_insert_on_edge_immediate (loop_preheader_edge (iv_loop),
7830 new_stmt);
7831 gcc_assert (!new_bb);
7834 else
7836 /* iv_loop is the loop to be vectorized. Create:
7837 vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr) */
7838 stmts = NULL;
7839 new_name = gimple_convert (&stmts, TREE_TYPE (step_expr), init_expr);
7841 unsigned HOST_WIDE_INT const_nunits;
7842 if (nunits.is_constant (&const_nunits))
7844 tree_vector_builder elts (step_vectype, const_nunits, 1);
7845 elts.quick_push (new_name);
7846 for (i = 1; i < const_nunits; i++)
7848 /* Create: new_name_i = new_name + step_expr */
7849 new_name = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (new_name),
7850 new_name, step_expr);
7851 elts.quick_push (new_name);
7853 /* Create a vector from [new_name_0, new_name_1, ...,
7854 new_name_nunits-1] */
7855 vec_init = gimple_build_vector (&stmts, &elts);
7857 else if (INTEGRAL_TYPE_P (TREE_TYPE (step_expr)))
7858 /* Build the initial value directly from a VEC_SERIES_EXPR. */
7859 vec_init = gimple_build (&stmts, VEC_SERIES_EXPR, step_vectype,
7860 new_name, step_expr);
7861 else
7863 /* Build:
7864 [base, base, base, ...]
7865 + (vectype) [0, 1, 2, ...] * [step, step, step, ...]. */
7866 gcc_assert (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)));
7867 gcc_assert (flag_associative_math);
7868 tree index = build_index_vector (step_vectype, 0, 1);
7869 tree base_vec = gimple_build_vector_from_val (&stmts, step_vectype,
7870 new_name);
7871 tree step_vec = gimple_build_vector_from_val (&stmts, step_vectype,
7872 step_expr);
7873 vec_init = gimple_build (&stmts, FLOAT_EXPR, step_vectype, index);
7874 vec_init = gimple_build (&stmts, MULT_EXPR, step_vectype,
7875 vec_init, step_vec);
7876 vec_init = gimple_build (&stmts, PLUS_EXPR, step_vectype,
7877 vec_init, base_vec);
7879 vec_init = gimple_convert (&stmts, vectype, vec_init);
7881 if (stmts)
7883 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
7884 gcc_assert (!new_bb);
7889 /* Create the vector that holds the step of the induction. */
7890 if (nested_in_vect_loop)
7891 /* iv_loop is nested in the loop to be vectorized. Generate:
7892 vec_step = [S, S, S, S] */
7893 new_name = step_expr;
7894 else
7896 /* iv_loop is the loop to be vectorized. Generate:
7897 vec_step = [VF*S, VF*S, VF*S, VF*S] */
7898 gimple_seq seq = NULL;
7899 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7901 expr = build_int_cst (integer_type_node, vf);
7902 expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
7904 else
7905 expr = build_int_cst (TREE_TYPE (step_expr), vf);
7906 new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
7907 expr, step_expr);
7908 if (seq)
7910 new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
7911 gcc_assert (!new_bb);
7915 t = unshare_expr (new_name);
7916 gcc_assert (CONSTANT_CLASS_P (new_name)
7917 || TREE_CODE (new_name) == SSA_NAME);
7918 new_vec = build_vector_from_val (step_vectype, t);
7919 vec_step = vect_init_vector (loop_vinfo, stmt_info,
7920 new_vec, step_vectype, NULL);
7923 /* Create the following def-use cycle:
7924 loop prolog:
7925 vec_init = ...
7926 vec_step = ...
7927 loop:
7928 vec_iv = PHI <vec_init, vec_loop>
7930 STMT
7932 vec_loop = vec_iv + vec_step; */
7934 /* Create the induction-phi that defines the induction-operand. */
7935 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
7936 induction_phi = create_phi_node (vec_dest, iv_loop->header);
7937 induc_def = PHI_RESULT (induction_phi);
7939 /* Create the iv update inside the loop */
7940 stmts = NULL;
7941 vec_def = gimple_convert (&stmts, step_vectype, induc_def);
7942 vec_def = gimple_build (&stmts, PLUS_EXPR, step_vectype, vec_def, vec_step);
7943 vec_def = gimple_convert (&stmts, vectype, vec_def);
7944 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
7945 new_stmt = SSA_NAME_DEF_STMT (vec_def);
7947 /* Set the arguments of the phi node: */
7948 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
7949 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
7950 UNKNOWN_LOCATION);
7952 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (induction_phi);
7953 *vec_stmt = induction_phi;
7955 /* In case that vectorization factor (VF) is bigger than the number
7956 of elements that we can fit in a vectype (nunits), we have to generate
7957 more than one vector stmt - i.e - we need to "unroll" the
7958 vector stmt by a factor VF/nunits. For more details see documentation
7959 in vectorizable_operation. */
7961 if (ncopies > 1)
7963 gimple_seq seq = NULL;
7964 /* FORNOW. This restriction should be relaxed. */
7965 gcc_assert (!nested_in_vect_loop);
7967 /* Create the vector that holds the step of the induction. */
7968 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7970 expr = build_int_cst (integer_type_node, nunits);
7971 expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
7973 else
7974 expr = build_int_cst (TREE_TYPE (step_expr), nunits);
7975 new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
7976 expr, step_expr);
7977 if (seq)
7979 new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
7980 gcc_assert (!new_bb);
7983 t = unshare_expr (new_name);
7984 gcc_assert (CONSTANT_CLASS_P (new_name)
7985 || TREE_CODE (new_name) == SSA_NAME);
7986 new_vec = build_vector_from_val (step_vectype, t);
7987 vec_step = vect_init_vector (loop_vinfo, stmt_info,
7988 new_vec, step_vectype, NULL);
7990 vec_def = induc_def;
7991 for (i = 1; i < ncopies; i++)
7993 /* vec_i = vec_prev + vec_step */
7994 gimple_seq stmts = NULL;
7995 vec_def = gimple_convert (&stmts, step_vectype, vec_def);
7996 vec_def = gimple_build (&stmts,
7997 PLUS_EXPR, step_vectype, vec_def, vec_step);
7998 vec_def = gimple_convert (&stmts, vectype, vec_def);
8000 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
8001 new_stmt = SSA_NAME_DEF_STMT (vec_def);
8002 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
8006 if (dump_enabled_p ())
8007 dump_printf_loc (MSG_NOTE, vect_location,
8008 "transform induction: created def-use cycle: %G%G",
8009 induction_phi, SSA_NAME_DEF_STMT (vec_def));
8011 return true;
8014 /* Function vectorizable_live_operation.
8016 STMT_INFO computes a value that is used outside the loop. Check if
8017 it can be supported. */
8019 bool
8020 vectorizable_live_operation (loop_vec_info loop_vinfo,
8021 stmt_vec_info stmt_info,
8022 gimple_stmt_iterator *gsi,
8023 slp_tree slp_node, slp_instance slp_node_instance,
8024 int slp_index, bool vec_stmt_p,
8025 stmt_vector_for_cost *)
8027 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8028 imm_use_iterator imm_iter;
8029 tree lhs, lhs_type, bitsize, vec_bitsize;
8030 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
8031 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
8032 int ncopies;
8033 gimple *use_stmt;
8034 auto_vec<tree> vec_oprnds;
8035 int vec_entry = 0;
8036 poly_uint64 vec_index = 0;
8038 gcc_assert (STMT_VINFO_LIVE_P (stmt_info));
8040 /* If a stmt of a reduction is live, vectorize it via
8041 vect_create_epilog_for_reduction. vectorizable_reduction assessed
8042 validity so just trigger the transform here. */
8043 if (STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info)))
8045 if (!vec_stmt_p)
8046 return true;
8047 if (slp_node)
8049 /* For reduction chains the meta-info is attached to
8050 the group leader. */
8051 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
8052 stmt_info = REDUC_GROUP_FIRST_ELEMENT (stmt_info);
8053 /* For SLP reductions we vectorize the epilogue for
8054 all involved stmts together. */
8055 else if (slp_index != 0)
8056 return true;
8057 else
8058 /* For SLP reductions the meta-info is attached to
8059 the representative. */
8060 stmt_info = SLP_TREE_REPRESENTATIVE (slp_node);
8062 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
8063 gcc_assert (reduc_info->is_reduc_info);
8064 if (STMT_VINFO_REDUC_TYPE (reduc_info) == FOLD_LEFT_REDUCTION
8065 || STMT_VINFO_REDUC_TYPE (reduc_info) == EXTRACT_LAST_REDUCTION)
8066 return true;
8067 vect_create_epilog_for_reduction (loop_vinfo, stmt_info, slp_node,
8068 slp_node_instance);
8069 return true;
8072 /* FORNOW. CHECKME. */
8073 if (nested_in_vect_loop_p (loop, stmt_info))
8074 return false;
8076 /* If STMT is not relevant and it is a simple assignment and its inputs are
8077 invariant then it can remain in place, unvectorized. The original last
8078 scalar value that it computes will be used. */
8079 if (!STMT_VINFO_RELEVANT_P (stmt_info))
8081 gcc_assert (is_simple_and_all_uses_invariant (stmt_info, loop_vinfo));
8082 if (dump_enabled_p ())
8083 dump_printf_loc (MSG_NOTE, vect_location,
8084 "statement is simple and uses invariant. Leaving in "
8085 "place.\n");
8086 return true;
8089 if (slp_node)
8090 ncopies = 1;
8091 else
8092 ncopies = vect_get_num_copies (loop_vinfo, vectype);
8094 if (slp_node)
8096 gcc_assert (slp_index >= 0);
8098 int num_scalar = SLP_TREE_LANES (slp_node);
8099 int num_vec = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
8101 /* Get the last occurrence of the scalar index from the concatenation of
8102 all the slp vectors. Calculate which slp vector it is and the index
8103 within. */
8104 poly_uint64 pos = (num_vec * nunits) - num_scalar + slp_index;
8106 /* Calculate which vector contains the result, and which lane of
8107 that vector we need. */
8108 if (!can_div_trunc_p (pos, nunits, &vec_entry, &vec_index))
8110 if (dump_enabled_p ())
8111 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8112 "Cannot determine which vector holds the"
8113 " final result.\n");
8114 return false;
8118 if (!vec_stmt_p)
8120 /* No transformation required. */
8121 if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
8123 if (!direct_internal_fn_supported_p (IFN_EXTRACT_LAST, vectype,
8124 OPTIMIZE_FOR_SPEED))
8126 if (dump_enabled_p ())
8127 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8128 "can't operate on partial vectors "
8129 "because the target doesn't support extract "
8130 "last reduction.\n");
8131 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
8133 else if (slp_node)
8135 if (dump_enabled_p ())
8136 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8137 "can't operate on partial vectors "
8138 "because an SLP statement is live after "
8139 "the loop.\n");
8140 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
8142 else if (ncopies > 1)
8144 if (dump_enabled_p ())
8145 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8146 "can't operate on partial vectors "
8147 "because ncopies is greater than 1.\n");
8148 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
8150 else
8152 gcc_assert (ncopies == 1 && !slp_node);
8153 vect_record_loop_mask (loop_vinfo,
8154 &LOOP_VINFO_MASKS (loop_vinfo),
8155 1, vectype, NULL);
8158 return true;
8161 /* Use the lhs of the original scalar statement. */
8162 gimple *stmt = vect_orig_stmt (stmt_info)->stmt;
8164 lhs = (is_a <gphi *> (stmt)) ? gimple_phi_result (stmt)
8165 : gimple_get_lhs (stmt);
8166 lhs_type = TREE_TYPE (lhs);
8168 bitsize = vector_element_bits_tree (vectype);
8169 vec_bitsize = TYPE_SIZE (vectype);
8171 /* Get the vectorized lhs of STMT and the lane to use (counted in bits). */
8172 tree vec_lhs, bitstart;
8173 if (slp_node)
8175 gcc_assert (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo));
8177 /* Get the correct slp vectorized stmt. */
8178 gimple *vec_stmt = SLP_TREE_VEC_STMTS (slp_node)[vec_entry];
8179 if (gphi *phi = dyn_cast <gphi *> (vec_stmt))
8180 vec_lhs = gimple_phi_result (phi);
8181 else
8182 vec_lhs = gimple_get_lhs (vec_stmt);
8184 /* Get entry to use. */
8185 bitstart = bitsize_int (vec_index);
8186 bitstart = int_const_binop (MULT_EXPR, bitsize, bitstart);
8188 else
8190 /* For multiple copies, get the last copy. */
8191 vec_lhs = gimple_get_lhs (STMT_VINFO_VEC_STMTS (stmt_info).last ());
8193 /* Get the last lane in the vector. */
8194 bitstart = int_const_binop (MINUS_EXPR, vec_bitsize, bitsize);
8197 /* Ensure the VEC_LHS for lane extraction stmts satisfy loop-closed PHI
8198 requirement, insert one phi node for it. It looks like:
8199 loop;
8201 # lhs' = PHI <lhs>
8203 loop;
8205 # vec_lhs' = PHI <vec_lhs>
8206 new_tree = lane_extract <vec_lhs', ...>;
8207 lhs' = new_tree; */
8209 basic_block exit_bb = single_exit (loop)->dest;
8210 gcc_assert (single_pred_p (exit_bb));
8212 tree vec_lhs_phi = copy_ssa_name (vec_lhs);
8213 gimple *phi = create_phi_node (vec_lhs_phi, exit_bb);
8214 SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, vec_lhs);
8216 gimple_seq stmts = NULL;
8217 tree new_tree;
8218 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
8220 /* Emit:
8222 SCALAR_RES = EXTRACT_LAST <VEC_LHS, MASK>
8224 where VEC_LHS is the vectorized live-out result and MASK is
8225 the loop mask for the final iteration. */
8226 gcc_assert (ncopies == 1 && !slp_node);
8227 tree scalar_type = TREE_TYPE (STMT_VINFO_VECTYPE (stmt_info));
8228 tree mask = vect_get_loop_mask (gsi, &LOOP_VINFO_MASKS (loop_vinfo), 1,
8229 vectype, 0);
8230 tree scalar_res = gimple_build (&stmts, CFN_EXTRACT_LAST, scalar_type,
8231 mask, vec_lhs_phi);
8233 /* Convert the extracted vector element to the required scalar type. */
8234 new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
8236 else
8238 tree bftype = TREE_TYPE (vectype);
8239 if (VECTOR_BOOLEAN_TYPE_P (vectype))
8240 bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1);
8241 new_tree = build3 (BIT_FIELD_REF, bftype, vec_lhs_phi, bitsize, bitstart);
8242 new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree),
8243 &stmts, true, NULL_TREE);
8246 if (stmts)
8248 gimple_stmt_iterator exit_gsi = gsi_after_labels (exit_bb);
8249 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
8251 /* Remove existing phi from lhs and create one copy from new_tree. */
8252 tree lhs_phi = NULL_TREE;
8253 gimple_stmt_iterator gsi;
8254 for (gsi = gsi_start_phis (exit_bb); !gsi_end_p (gsi); gsi_next (&gsi))
8256 gimple *phi = gsi_stmt (gsi);
8257 if ((gimple_phi_arg_def (phi, 0) == lhs))
8259 remove_phi_node (&gsi, false);
8260 lhs_phi = gimple_phi_result (phi);
8261 gimple *copy = gimple_build_assign (lhs_phi, new_tree);
8262 gsi_insert_before (&exit_gsi, copy, GSI_SAME_STMT);
8263 break;
8268 /* Replace use of lhs with newly computed result. If the use stmt is a
8269 single arg PHI, just replace all uses of PHI result. It's necessary
8270 because lcssa PHI defining lhs may be before newly inserted stmt. */
8271 use_operand_p use_p;
8272 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
8273 if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt))
8274 && !is_gimple_debug (use_stmt))
8276 if (gimple_code (use_stmt) == GIMPLE_PHI
8277 && gimple_phi_num_args (use_stmt) == 1)
8279 replace_uses_by (gimple_phi_result (use_stmt), new_tree);
8281 else
8283 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
8284 SET_USE (use_p, new_tree);
8286 update_stmt (use_stmt);
8289 return true;
8292 /* Kill any debug uses outside LOOP of SSA names defined in STMT_INFO. */
8294 static void
8295 vect_loop_kill_debug_uses (class loop *loop, stmt_vec_info stmt_info)
8297 ssa_op_iter op_iter;
8298 imm_use_iterator imm_iter;
8299 def_operand_p def_p;
8300 gimple *ustmt;
8302 FOR_EACH_PHI_OR_STMT_DEF (def_p, stmt_info->stmt, op_iter, SSA_OP_DEF)
8304 FOR_EACH_IMM_USE_STMT (ustmt, imm_iter, DEF_FROM_PTR (def_p))
8306 basic_block bb;
8308 if (!is_gimple_debug (ustmt))
8309 continue;
8311 bb = gimple_bb (ustmt);
8313 if (!flow_bb_inside_loop_p (loop, bb))
8315 if (gimple_debug_bind_p (ustmt))
8317 if (dump_enabled_p ())
8318 dump_printf_loc (MSG_NOTE, vect_location,
8319 "killing debug use\n");
8321 gimple_debug_bind_reset_value (ustmt);
8322 update_stmt (ustmt);
8324 else
8325 gcc_unreachable ();
8331 /* Given loop represented by LOOP_VINFO, return true if computation of
8332 LOOP_VINFO_NITERS (= LOOP_VINFO_NITERSM1 + 1) doesn't overflow, false
8333 otherwise. */
8335 static bool
8336 loop_niters_no_overflow (loop_vec_info loop_vinfo)
8338 /* Constant case. */
8339 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
8341 tree cst_niters = LOOP_VINFO_NITERS (loop_vinfo);
8342 tree cst_nitersm1 = LOOP_VINFO_NITERSM1 (loop_vinfo);
8344 gcc_assert (TREE_CODE (cst_niters) == INTEGER_CST);
8345 gcc_assert (TREE_CODE (cst_nitersm1) == INTEGER_CST);
8346 if (wi::to_widest (cst_nitersm1) < wi::to_widest (cst_niters))
8347 return true;
8350 widest_int max;
8351 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8352 /* Check the upper bound of loop niters. */
8353 if (get_max_loop_iterations (loop, &max))
8355 tree type = TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo));
8356 signop sgn = TYPE_SIGN (type);
8357 widest_int type_max = widest_int::from (wi::max_value (type), sgn);
8358 if (max < type_max)
8359 return true;
8361 return false;
8364 /* Return a mask type with half the number of elements as OLD_TYPE,
8365 given that it should have mode NEW_MODE. */
8367 tree
8368 vect_halve_mask_nunits (tree old_type, machine_mode new_mode)
8370 poly_uint64 nunits = exact_div (TYPE_VECTOR_SUBPARTS (old_type), 2);
8371 return build_truth_vector_type_for_mode (nunits, new_mode);
8374 /* Return a mask type with twice as many elements as OLD_TYPE,
8375 given that it should have mode NEW_MODE. */
8377 tree
8378 vect_double_mask_nunits (tree old_type, machine_mode new_mode)
8380 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (old_type) * 2;
8381 return build_truth_vector_type_for_mode (nunits, new_mode);
8384 /* Record that a fully-masked version of LOOP_VINFO would need MASKS to
8385 contain a sequence of NVECTORS masks that each control a vector of type
8386 VECTYPE. If SCALAR_MASK is nonnull, the fully-masked loop would AND
8387 these vector masks with the vector version of SCALAR_MASK. */
8389 void
8390 vect_record_loop_mask (loop_vec_info loop_vinfo, vec_loop_masks *masks,
8391 unsigned int nvectors, tree vectype, tree scalar_mask)
8393 gcc_assert (nvectors != 0);
8394 if (masks->length () < nvectors)
8395 masks->safe_grow_cleared (nvectors);
8396 rgroup_controls *rgm = &(*masks)[nvectors - 1];
8397 /* The number of scalars per iteration and the number of vectors are
8398 both compile-time constants. */
8399 unsigned int nscalars_per_iter
8400 = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
8401 LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
8403 if (scalar_mask)
8405 scalar_cond_masked_key cond (scalar_mask, nvectors);
8406 loop_vinfo->scalar_cond_masked_set.add (cond);
8409 if (rgm->max_nscalars_per_iter < nscalars_per_iter)
8411 rgm->max_nscalars_per_iter = nscalars_per_iter;
8412 rgm->type = truth_type_for (vectype);
8413 rgm->factor = 1;
8417 /* Given a complete set of masks MASKS, extract mask number INDEX
8418 for an rgroup that operates on NVECTORS vectors of type VECTYPE,
8419 where 0 <= INDEX < NVECTORS. Insert any set-up statements before GSI.
8421 See the comment above vec_loop_masks for more details about the mask
8422 arrangement. */
8424 tree
8425 vect_get_loop_mask (gimple_stmt_iterator *gsi, vec_loop_masks *masks,
8426 unsigned int nvectors, tree vectype, unsigned int index)
8428 rgroup_controls *rgm = &(*masks)[nvectors - 1];
8429 tree mask_type = rgm->type;
8431 /* Populate the rgroup's mask array, if this is the first time we've
8432 used it. */
8433 if (rgm->controls.is_empty ())
8435 rgm->controls.safe_grow_cleared (nvectors);
8436 for (unsigned int i = 0; i < nvectors; ++i)
8438 tree mask = make_temp_ssa_name (mask_type, NULL, "loop_mask");
8439 /* Provide a dummy definition until the real one is available. */
8440 SSA_NAME_DEF_STMT (mask) = gimple_build_nop ();
8441 rgm->controls[i] = mask;
8445 tree mask = rgm->controls[index];
8446 if (maybe_ne (TYPE_VECTOR_SUBPARTS (mask_type),
8447 TYPE_VECTOR_SUBPARTS (vectype)))
8449 /* A loop mask for data type X can be reused for data type Y
8450 if X has N times more elements than Y and if Y's elements
8451 are N times bigger than X's. In this case each sequence
8452 of N elements in the loop mask will be all-zero or all-one.
8453 We can then view-convert the mask so that each sequence of
8454 N elements is replaced by a single element. */
8455 gcc_assert (multiple_p (TYPE_VECTOR_SUBPARTS (mask_type),
8456 TYPE_VECTOR_SUBPARTS (vectype)));
8457 gimple_seq seq = NULL;
8458 mask_type = truth_type_for (vectype);
8459 mask = gimple_build (&seq, VIEW_CONVERT_EXPR, mask_type, mask);
8460 if (seq)
8461 gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
8463 return mask;
8466 /* Record that LOOP_VINFO would need LENS to contain a sequence of NVECTORS
8467 lengths for controlling an operation on VECTYPE. The operation splits
8468 each element of VECTYPE into FACTOR separate subelements, measuring the
8469 length as a number of these subelements. */
8471 void
8472 vect_record_loop_len (loop_vec_info loop_vinfo, vec_loop_lens *lens,
8473 unsigned int nvectors, tree vectype, unsigned int factor)
8475 gcc_assert (nvectors != 0);
8476 if (lens->length () < nvectors)
8477 lens->safe_grow_cleared (nvectors);
8478 rgroup_controls *rgl = &(*lens)[nvectors - 1];
8480 /* The number of scalars per iteration, scalar occupied bytes and
8481 the number of vectors are both compile-time constants. */
8482 unsigned int nscalars_per_iter
8483 = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
8484 LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
8486 if (rgl->max_nscalars_per_iter < nscalars_per_iter)
8488 /* For now, we only support cases in which all loads and stores fall back
8489 to VnQI or none do. */
8490 gcc_assert (!rgl->max_nscalars_per_iter
8491 || (rgl->factor == 1 && factor == 1)
8492 || (rgl->max_nscalars_per_iter * rgl->factor
8493 == nscalars_per_iter * factor));
8494 rgl->max_nscalars_per_iter = nscalars_per_iter;
8495 rgl->type = vectype;
8496 rgl->factor = factor;
8500 /* Given a complete set of length LENS, extract length number INDEX for an
8501 rgroup that operates on NVECTORS vectors, where 0 <= INDEX < NVECTORS. */
8503 tree
8504 vect_get_loop_len (loop_vec_info loop_vinfo, vec_loop_lens *lens,
8505 unsigned int nvectors, unsigned int index)
8507 rgroup_controls *rgl = &(*lens)[nvectors - 1];
8509 /* Populate the rgroup's len array, if this is the first time we've
8510 used it. */
8511 if (rgl->controls.is_empty ())
8513 rgl->controls.safe_grow_cleared (nvectors);
8514 for (unsigned int i = 0; i < nvectors; ++i)
8516 tree len_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
8517 gcc_assert (len_type != NULL_TREE);
8518 tree len = make_temp_ssa_name (len_type, NULL, "loop_len");
8520 /* Provide a dummy definition until the real one is available. */
8521 SSA_NAME_DEF_STMT (len) = gimple_build_nop ();
8522 rgl->controls[i] = len;
8526 return rgl->controls[index];
8529 /* Scale profiling counters by estimation for LOOP which is vectorized
8530 by factor VF. */
8532 static void
8533 scale_profile_for_vect_loop (class loop *loop, unsigned vf)
8535 edge preheader = loop_preheader_edge (loop);
8536 /* Reduce loop iterations by the vectorization factor. */
8537 gcov_type new_est_niter = niter_for_unrolled_loop (loop, vf);
8538 profile_count freq_h = loop->header->count, freq_e = preheader->count ();
8540 if (freq_h.nonzero_p ())
8542 profile_probability p;
8544 /* Avoid dropping loop body profile counter to 0 because of zero count
8545 in loop's preheader. */
8546 if (!(freq_e == profile_count::zero ()))
8547 freq_e = freq_e.force_nonzero ();
8548 p = freq_e.apply_scale (new_est_niter + 1, 1).probability_in (freq_h);
8549 scale_loop_frequencies (loop, p);
8552 edge exit_e = single_exit (loop);
8553 exit_e->probability = profile_probability::always ()
8554 .apply_scale (1, new_est_niter + 1);
8556 edge exit_l = single_pred_edge (loop->latch);
8557 profile_probability prob = exit_l->probability;
8558 exit_l->probability = exit_e->probability.invert ();
8559 if (prob.initialized_p () && exit_l->probability.initialized_p ())
8560 scale_bbs_frequencies (&loop->latch, 1, exit_l->probability / prob);
8563 /* Vectorize STMT_INFO if relevant, inserting any new instructions before GSI.
8564 When vectorizing STMT_INFO as a store, set *SEEN_STORE to its
8565 stmt_vec_info. */
8567 static void
8568 vect_transform_loop_stmt (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
8569 gimple_stmt_iterator *gsi, stmt_vec_info *seen_store)
8571 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8572 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
8574 if (dump_enabled_p ())
8575 dump_printf_loc (MSG_NOTE, vect_location,
8576 "------>vectorizing statement: %G", stmt_info->stmt);
8578 if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
8579 vect_loop_kill_debug_uses (loop, stmt_info);
8581 if (!STMT_VINFO_RELEVANT_P (stmt_info)
8582 && !STMT_VINFO_LIVE_P (stmt_info))
8583 return;
8585 if (STMT_VINFO_VECTYPE (stmt_info))
8587 poly_uint64 nunits
8588 = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
8589 if (!STMT_SLP_TYPE (stmt_info)
8590 && maybe_ne (nunits, vf)
8591 && dump_enabled_p ())
8592 /* For SLP VF is set according to unrolling factor, and not
8593 to vector size, hence for SLP this print is not valid. */
8594 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
8597 /* Pure SLP statements have already been vectorized. We still need
8598 to apply loop vectorization to hybrid SLP statements. */
8599 if (PURE_SLP_STMT (stmt_info))
8600 return;
8602 if (dump_enabled_p ())
8603 dump_printf_loc (MSG_NOTE, vect_location, "transform statement.\n");
8605 if (vect_transform_stmt (loop_vinfo, stmt_info, gsi, NULL, NULL))
8606 *seen_store = stmt_info;
8609 /* Helper function to pass to simplify_replace_tree to enable replacing tree's
8610 in the hash_map with its corresponding values. */
8612 static tree
8613 find_in_mapping (tree t, void *context)
8615 hash_map<tree,tree>* mapping = (hash_map<tree, tree>*) context;
8617 tree *value = mapping->get (t);
8618 return value ? *value : t;
8621 /* Update EPILOGUE's loop_vec_info. EPILOGUE was constructed as a copy of the
8622 original loop that has now been vectorized.
8624 The inits of the data_references need to be advanced with the number of
8625 iterations of the main loop. This has been computed in vect_do_peeling and
8626 is stored in parameter ADVANCE. We first restore the data_references
8627 initial offset with the values recored in ORIG_DRS_INIT.
8629 Since the loop_vec_info of this EPILOGUE was constructed for the original
8630 loop, its stmt_vec_infos all point to the original statements. These need
8631 to be updated to point to their corresponding copies as well as the SSA_NAMES
8632 in their PATTERN_DEF_SEQs and RELATED_STMTs.
8634 The data_reference's connections also need to be updated. Their
8635 corresponding dr_vec_info need to be reconnected to the EPILOGUE's
8636 stmt_vec_infos, their statements need to point to their corresponding copy,
8637 if they are gather loads or scatter stores then their reference needs to be
8638 updated to point to its corresponding copy and finally we set
8639 'base_misaligned' to false as we have already peeled for alignment in the
8640 prologue of the main loop. */
8642 static void
8643 update_epilogue_loop_vinfo (class loop *epilogue, tree advance)
8645 loop_vec_info epilogue_vinfo = loop_vec_info_for_loop (epilogue);
8646 auto_vec<gimple *> stmt_worklist;
8647 hash_map<tree,tree> mapping;
8648 gimple *orig_stmt, *new_stmt;
8649 gimple_stmt_iterator epilogue_gsi;
8650 gphi_iterator epilogue_phi_gsi;
8651 stmt_vec_info stmt_vinfo = NULL, related_vinfo;
8652 basic_block *epilogue_bbs = get_loop_body (epilogue);
8653 unsigned i;
8655 LOOP_VINFO_BBS (epilogue_vinfo) = epilogue_bbs;
8657 /* Advance data_reference's with the number of iterations of the previous
8658 loop and its prologue. */
8659 vect_update_inits_of_drs (epilogue_vinfo, advance, PLUS_EXPR);
8662 /* The EPILOGUE loop is a copy of the original loop so they share the same
8663 gimple UIDs. In this loop we update the loop_vec_info of the EPILOGUE to
8664 point to the copied statements. We also create a mapping of all LHS' in
8665 the original loop and all the LHS' in the EPILOGUE and create worklists to
8666 update teh STMT_VINFO_PATTERN_DEF_SEQs and STMT_VINFO_RELATED_STMTs. */
8667 for (unsigned i = 0; i < epilogue->num_nodes; ++i)
8669 for (epilogue_phi_gsi = gsi_start_phis (epilogue_bbs[i]);
8670 !gsi_end_p (epilogue_phi_gsi); gsi_next (&epilogue_phi_gsi))
8672 new_stmt = epilogue_phi_gsi.phi ();
8674 gcc_assert (gimple_uid (new_stmt) > 0);
8675 stmt_vinfo
8676 = epilogue_vinfo->stmt_vec_infos[gimple_uid (new_stmt) - 1];
8678 orig_stmt = STMT_VINFO_STMT (stmt_vinfo);
8679 STMT_VINFO_STMT (stmt_vinfo) = new_stmt;
8681 mapping.put (gimple_phi_result (orig_stmt),
8682 gimple_phi_result (new_stmt));
8683 /* PHI nodes can not have patterns or related statements. */
8684 gcc_assert (STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo) == NULL
8685 && STMT_VINFO_RELATED_STMT (stmt_vinfo) == NULL);
8688 for (epilogue_gsi = gsi_start_bb (epilogue_bbs[i]);
8689 !gsi_end_p (epilogue_gsi); gsi_next (&epilogue_gsi))
8691 new_stmt = gsi_stmt (epilogue_gsi);
8692 if (is_gimple_debug (new_stmt))
8693 continue;
8695 gcc_assert (gimple_uid (new_stmt) > 0);
8696 stmt_vinfo
8697 = epilogue_vinfo->stmt_vec_infos[gimple_uid (new_stmt) - 1];
8699 orig_stmt = STMT_VINFO_STMT (stmt_vinfo);
8700 STMT_VINFO_STMT (stmt_vinfo) = new_stmt;
8702 if (tree old_lhs = gimple_get_lhs (orig_stmt))
8703 mapping.put (old_lhs, gimple_get_lhs (new_stmt));
8705 if (STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo))
8707 gimple_seq seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo);
8708 for (gimple_stmt_iterator gsi = gsi_start (seq);
8709 !gsi_end_p (gsi); gsi_next (&gsi))
8710 stmt_worklist.safe_push (gsi_stmt (gsi));
8713 related_vinfo = STMT_VINFO_RELATED_STMT (stmt_vinfo);
8714 if (related_vinfo != NULL && related_vinfo != stmt_vinfo)
8716 gimple *stmt = STMT_VINFO_STMT (related_vinfo);
8717 stmt_worklist.safe_push (stmt);
8718 /* Set BB such that the assert in
8719 'get_initial_def_for_reduction' is able to determine that
8720 the BB of the related stmt is inside this loop. */
8721 gimple_set_bb (stmt,
8722 gimple_bb (new_stmt));
8723 related_vinfo = STMT_VINFO_RELATED_STMT (related_vinfo);
8724 gcc_assert (related_vinfo == NULL
8725 || related_vinfo == stmt_vinfo);
8730 /* The PATTERN_DEF_SEQs and RELATED_STMTs in the epilogue were constructed
8731 using the original main loop and thus need to be updated to refer to the
8732 cloned variables used in the epilogue. */
8733 for (unsigned i = 0; i < stmt_worklist.length (); ++i)
8735 gimple *stmt = stmt_worklist[i];
8736 tree *new_op;
8738 for (unsigned j = 1; j < gimple_num_ops (stmt); ++j)
8740 tree op = gimple_op (stmt, j);
8741 if ((new_op = mapping.get(op)))
8742 gimple_set_op (stmt, j, *new_op);
8743 else
8745 /* PR92429: The last argument of simplify_replace_tree disables
8746 folding when replacing arguments. This is required as
8747 otherwise you might end up with different statements than the
8748 ones analyzed in vect_loop_analyze, leading to different
8749 vectorization. */
8750 op = simplify_replace_tree (op, NULL_TREE, NULL_TREE,
8751 &find_in_mapping, &mapping, false);
8752 gimple_set_op (stmt, j, op);
8757 struct data_reference *dr;
8758 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (epilogue_vinfo);
8759 FOR_EACH_VEC_ELT (datarefs, i, dr)
8761 orig_stmt = DR_STMT (dr);
8762 gcc_assert (gimple_uid (orig_stmt) > 0);
8763 stmt_vinfo = epilogue_vinfo->stmt_vec_infos[gimple_uid (orig_stmt) - 1];
8764 /* Data references for gather loads and scatter stores do not use the
8765 updated offset we set using ADVANCE. Instead we have to make sure the
8766 reference in the data references point to the corresponding copy of
8767 the original in the epilogue. */
8768 if (STMT_VINFO_MEMORY_ACCESS_TYPE (vect_stmt_to_vectorize (stmt_vinfo))
8769 == VMAT_GATHER_SCATTER)
8771 DR_REF (dr)
8772 = simplify_replace_tree (DR_REF (dr), NULL_TREE, NULL_TREE,
8773 &find_in_mapping, &mapping);
8774 DR_BASE_ADDRESS (dr)
8775 = simplify_replace_tree (DR_BASE_ADDRESS (dr), NULL_TREE, NULL_TREE,
8776 &find_in_mapping, &mapping);
8778 DR_STMT (dr) = STMT_VINFO_STMT (stmt_vinfo);
8779 stmt_vinfo->dr_aux.stmt = stmt_vinfo;
8780 /* The vector size of the epilogue is smaller than that of the main loop
8781 so the alignment is either the same or lower. This means the dr will
8782 thus by definition be aligned. */
8783 STMT_VINFO_DR_INFO (stmt_vinfo)->base_misaligned = false;
8786 epilogue_vinfo->shared->datarefs_copy.release ();
8787 epilogue_vinfo->shared->save_datarefs ();
8790 /* Function vect_transform_loop.
8792 The analysis phase has determined that the loop is vectorizable.
8793 Vectorize the loop - created vectorized stmts to replace the scalar
8794 stmts in the loop, and update the loop exit condition.
8795 Returns scalar epilogue loop if any. */
8797 class loop *
8798 vect_transform_loop (loop_vec_info loop_vinfo, gimple *loop_vectorized_call)
8800 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8801 class loop *epilogue = NULL;
8802 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
8803 int nbbs = loop->num_nodes;
8804 int i;
8805 tree niters_vector = NULL_TREE;
8806 tree step_vector = NULL_TREE;
8807 tree niters_vector_mult_vf = NULL_TREE;
8808 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
8809 unsigned int lowest_vf = constant_lower_bound (vf);
8810 gimple *stmt;
8811 bool check_profitability = false;
8812 unsigned int th;
8814 DUMP_VECT_SCOPE ("vec_transform_loop");
8816 loop_vinfo->shared->check_datarefs ();
8818 /* Use the more conservative vectorization threshold. If the number
8819 of iterations is constant assume the cost check has been performed
8820 by our caller. If the threshold makes all loops profitable that
8821 run at least the (estimated) vectorization factor number of times
8822 checking is pointless, too. */
8823 th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
8824 if (vect_apply_runtime_profitability_check_p (loop_vinfo))
8826 if (dump_enabled_p ())
8827 dump_printf_loc (MSG_NOTE, vect_location,
8828 "Profitability threshold is %d loop iterations.\n",
8829 th);
8830 check_profitability = true;
8833 /* Make sure there exists a single-predecessor exit bb. Do this before
8834 versioning. */
8835 edge e = single_exit (loop);
8836 if (! single_pred_p (e->dest))
8838 split_loop_exit_edge (e, true);
8839 if (dump_enabled_p ())
8840 dump_printf (MSG_NOTE, "split exit edge\n");
8843 /* Version the loop first, if required, so the profitability check
8844 comes first. */
8846 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
8848 class loop *sloop
8849 = vect_loop_versioning (loop_vinfo, loop_vectorized_call);
8850 sloop->force_vectorize = false;
8851 check_profitability = false;
8854 /* Make sure there exists a single-predecessor exit bb also on the
8855 scalar loop copy. Do this after versioning but before peeling
8856 so CFG structure is fine for both scalar and if-converted loop
8857 to make slpeel_duplicate_current_defs_from_edges face matched
8858 loop closed PHI nodes on the exit. */
8859 if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))
8861 e = single_exit (LOOP_VINFO_SCALAR_LOOP (loop_vinfo));
8862 if (! single_pred_p (e->dest))
8864 split_loop_exit_edge (e, true);
8865 if (dump_enabled_p ())
8866 dump_printf (MSG_NOTE, "split exit edge of scalar loop\n");
8870 tree niters = vect_build_loop_niters (loop_vinfo);
8871 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = niters;
8872 tree nitersm1 = unshare_expr (LOOP_VINFO_NITERSM1 (loop_vinfo));
8873 bool niters_no_overflow = loop_niters_no_overflow (loop_vinfo);
8874 tree advance;
8875 drs_init_vec orig_drs_init;
8877 epilogue = vect_do_peeling (loop_vinfo, niters, nitersm1, &niters_vector,
8878 &step_vector, &niters_vector_mult_vf, th,
8879 check_profitability, niters_no_overflow,
8880 &advance);
8882 if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo)
8883 && LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo).initialized_p ())
8884 scale_loop_frequencies (LOOP_VINFO_SCALAR_LOOP (loop_vinfo),
8885 LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo));
8887 if (niters_vector == NULL_TREE)
8889 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
8890 && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
8891 && known_eq (lowest_vf, vf))
8893 niters_vector
8894 = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
8895 LOOP_VINFO_INT_NITERS (loop_vinfo) / lowest_vf);
8896 step_vector = build_one_cst (TREE_TYPE (niters));
8898 else if (vect_use_loop_mask_for_alignment_p (loop_vinfo))
8899 vect_gen_vector_loop_niters (loop_vinfo, niters, &niters_vector,
8900 &step_vector, niters_no_overflow);
8901 else
8902 /* vect_do_peeling subtracted the number of peeled prologue
8903 iterations from LOOP_VINFO_NITERS. */
8904 vect_gen_vector_loop_niters (loop_vinfo, LOOP_VINFO_NITERS (loop_vinfo),
8905 &niters_vector, &step_vector,
8906 niters_no_overflow);
8909 /* 1) Make sure the loop header has exactly two entries
8910 2) Make sure we have a preheader basic block. */
8912 gcc_assert (EDGE_COUNT (loop->header->preds) == 2);
8914 split_edge (loop_preheader_edge (loop));
8916 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
8917 && vect_use_loop_mask_for_alignment_p (loop_vinfo))
8918 /* This will deal with any possible peeling. */
8919 vect_prepare_for_masked_peels (loop_vinfo);
8921 /* Schedule the SLP instances first, then handle loop vectorization
8922 below. */
8923 if (!loop_vinfo->slp_instances.is_empty ())
8925 DUMP_VECT_SCOPE ("scheduling SLP instances");
8926 vect_schedule_slp (loop_vinfo);
8929 /* FORNOW: the vectorizer supports only loops which body consist
8930 of one basic block (header + empty latch). When the vectorizer will
8931 support more involved loop forms, the order by which the BBs are
8932 traversed need to be reconsidered. */
8934 for (i = 0; i < nbbs; i++)
8936 basic_block bb = bbs[i];
8937 stmt_vec_info stmt_info;
8939 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
8940 gsi_next (&si))
8942 gphi *phi = si.phi ();
8943 if (dump_enabled_p ())
8944 dump_printf_loc (MSG_NOTE, vect_location,
8945 "------>vectorizing phi: %G", phi);
8946 stmt_info = loop_vinfo->lookup_stmt (phi);
8947 if (!stmt_info)
8948 continue;
8950 if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
8951 vect_loop_kill_debug_uses (loop, stmt_info);
8953 if (!STMT_VINFO_RELEVANT_P (stmt_info)
8954 && !STMT_VINFO_LIVE_P (stmt_info))
8955 continue;
8957 if (STMT_VINFO_VECTYPE (stmt_info)
8958 && (maybe_ne
8959 (TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info)), vf))
8960 && dump_enabled_p ())
8961 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
8963 if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
8964 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
8965 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def
8966 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle
8967 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def)
8968 && ! PURE_SLP_STMT (stmt_info))
8970 if (dump_enabled_p ())
8971 dump_printf_loc (MSG_NOTE, vect_location, "transform phi.\n");
8972 vect_transform_stmt (loop_vinfo, stmt_info, NULL, NULL, NULL);
8976 for (gimple_stmt_iterator si = gsi_start_bb (bb);
8977 !gsi_end_p (si);)
8979 stmt = gsi_stmt (si);
8980 /* During vectorization remove existing clobber stmts. */
8981 if (gimple_clobber_p (stmt))
8983 unlink_stmt_vdef (stmt);
8984 gsi_remove (&si, true);
8985 release_defs (stmt);
8987 else
8989 /* Ignore vector stmts created in the outer loop. */
8990 stmt_info = loop_vinfo->lookup_stmt (stmt);
8992 /* vector stmts created in the outer-loop during vectorization of
8993 stmts in an inner-loop may not have a stmt_info, and do not
8994 need to be vectorized. */
8995 stmt_vec_info seen_store = NULL;
8996 if (stmt_info)
8998 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
9000 gimple *def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
9001 for (gimple_stmt_iterator subsi = gsi_start (def_seq);
9002 !gsi_end_p (subsi); gsi_next (&subsi))
9004 stmt_vec_info pat_stmt_info
9005 = loop_vinfo->lookup_stmt (gsi_stmt (subsi));
9006 vect_transform_loop_stmt (loop_vinfo, pat_stmt_info,
9007 &si, &seen_store);
9009 stmt_vec_info pat_stmt_info
9010 = STMT_VINFO_RELATED_STMT (stmt_info);
9011 vect_transform_loop_stmt (loop_vinfo, pat_stmt_info, &si,
9012 &seen_store);
9014 vect_transform_loop_stmt (loop_vinfo, stmt_info, &si,
9015 &seen_store);
9017 gsi_next (&si);
9018 if (seen_store)
9020 if (STMT_VINFO_GROUPED_ACCESS (seen_store))
9021 /* Interleaving. If IS_STORE is TRUE, the
9022 vectorization of the interleaving chain was
9023 completed - free all the stores in the chain. */
9024 vect_remove_stores (loop_vinfo,
9025 DR_GROUP_FIRST_ELEMENT (seen_store));
9026 else
9027 /* Free the attached stmt_vec_info and remove the stmt. */
9028 loop_vinfo->remove_stmt (stmt_info);
9033 /* Stub out scalar statements that must not survive vectorization.
9034 Doing this here helps with grouped statements, or statements that
9035 are involved in patterns. */
9036 for (gimple_stmt_iterator gsi = gsi_start_bb (bb);
9037 !gsi_end_p (gsi); gsi_next (&gsi))
9039 gcall *call = dyn_cast <gcall *> (gsi_stmt (gsi));
9040 if (call && gimple_call_internal_p (call, IFN_MASK_LOAD))
9042 tree lhs = gimple_get_lhs (call);
9043 if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
9045 tree zero = build_zero_cst (TREE_TYPE (lhs));
9046 gimple *new_stmt = gimple_build_assign (lhs, zero);
9047 gsi_replace (&gsi, new_stmt, true);
9051 } /* BBs in loop */
9053 /* The vectorization factor is always > 1, so if we use an IV increment of 1.
9054 a zero NITERS becomes a nonzero NITERS_VECTOR. */
9055 if (integer_onep (step_vector))
9056 niters_no_overflow = true;
9057 vect_set_loop_condition (loop, loop_vinfo, niters_vector, step_vector,
9058 niters_vector_mult_vf, !niters_no_overflow);
9060 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
9061 scale_profile_for_vect_loop (loop, assumed_vf);
9063 /* True if the final iteration might not handle a full vector's
9064 worth of scalar iterations. */
9065 bool final_iter_may_be_partial
9066 = LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo);
9067 /* The minimum number of iterations performed by the epilogue. This
9068 is 1 when peeling for gaps because we always need a final scalar
9069 iteration. */
9070 int min_epilogue_iters = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0;
9071 /* +1 to convert latch counts to loop iteration counts,
9072 -min_epilogue_iters to remove iterations that cannot be performed
9073 by the vector code. */
9074 int bias_for_lowest = 1 - min_epilogue_iters;
9075 int bias_for_assumed = bias_for_lowest;
9076 int alignment_npeels = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
9077 if (alignment_npeels && LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
9079 /* When the amount of peeling is known at compile time, the first
9080 iteration will have exactly alignment_npeels active elements.
9081 In the worst case it will have at least one. */
9082 int min_first_active = (alignment_npeels > 0 ? alignment_npeels : 1);
9083 bias_for_lowest += lowest_vf - min_first_active;
9084 bias_for_assumed += assumed_vf - min_first_active;
9086 /* In these calculations the "- 1" converts loop iteration counts
9087 back to latch counts. */
9088 if (loop->any_upper_bound)
9089 loop->nb_iterations_upper_bound
9090 = (final_iter_may_be_partial
9091 ? wi::udiv_ceil (loop->nb_iterations_upper_bound + bias_for_lowest,
9092 lowest_vf) - 1
9093 : wi::udiv_floor (loop->nb_iterations_upper_bound + bias_for_lowest,
9094 lowest_vf) - 1);
9095 if (loop->any_likely_upper_bound)
9096 loop->nb_iterations_likely_upper_bound
9097 = (final_iter_may_be_partial
9098 ? wi::udiv_ceil (loop->nb_iterations_likely_upper_bound
9099 + bias_for_lowest, lowest_vf) - 1
9100 : wi::udiv_floor (loop->nb_iterations_likely_upper_bound
9101 + bias_for_lowest, lowest_vf) - 1);
9102 if (loop->any_estimate)
9103 loop->nb_iterations_estimate
9104 = (final_iter_may_be_partial
9105 ? wi::udiv_ceil (loop->nb_iterations_estimate + bias_for_assumed,
9106 assumed_vf) - 1
9107 : wi::udiv_floor (loop->nb_iterations_estimate + bias_for_assumed,
9108 assumed_vf) - 1);
9110 if (dump_enabled_p ())
9112 if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
9114 dump_printf_loc (MSG_NOTE, vect_location,
9115 "LOOP VECTORIZED\n");
9116 if (loop->inner)
9117 dump_printf_loc (MSG_NOTE, vect_location,
9118 "OUTER LOOP VECTORIZED\n");
9119 dump_printf (MSG_NOTE, "\n");
9121 else
9122 dump_printf_loc (MSG_NOTE, vect_location,
9123 "LOOP EPILOGUE VECTORIZED (MODE=%s)\n",
9124 GET_MODE_NAME (loop_vinfo->vector_mode));
9127 /* Loops vectorized with a variable factor won't benefit from
9128 unrolling/peeling. */
9129 if (!vf.is_constant ())
9131 loop->unroll = 1;
9132 if (dump_enabled_p ())
9133 dump_printf_loc (MSG_NOTE, vect_location, "Disabling unrolling due to"
9134 " variable-length vectorization factor\n");
9136 /* Free SLP instances here because otherwise stmt reference counting
9137 won't work. */
9138 slp_instance instance;
9139 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
9140 vect_free_slp_instance (instance, true);
9141 LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
9142 /* Clear-up safelen field since its value is invalid after vectorization
9143 since vectorized loop can have loop-carried dependencies. */
9144 loop->safelen = 0;
9146 if (epilogue)
9148 update_epilogue_loop_vinfo (epilogue, advance);
9150 epilogue->simduid = loop->simduid;
9151 epilogue->force_vectorize = loop->force_vectorize;
9152 epilogue->dont_vectorize = false;
9155 return epilogue;
9158 /* The code below is trying to perform simple optimization - revert
9159 if-conversion for masked stores, i.e. if the mask of a store is zero
9160 do not perform it and all stored value producers also if possible.
9161 For example,
9162 for (i=0; i<n; i++)
9163 if (c[i])
9165 p1[i] += 1;
9166 p2[i] = p3[i] +2;
9168 this transformation will produce the following semi-hammock:
9170 if (!mask__ifc__42.18_165 == { 0, 0, 0, 0, 0, 0, 0, 0 })
9172 vect__11.19_170 = MASK_LOAD (vectp_p1.20_168, 0B, mask__ifc__42.18_165);
9173 vect__12.22_172 = vect__11.19_170 + vect_cst__171;
9174 MASK_STORE (vectp_p1.23_175, 0B, mask__ifc__42.18_165, vect__12.22_172);
9175 vect__18.25_182 = MASK_LOAD (vectp_p3.26_180, 0B, mask__ifc__42.18_165);
9176 vect__19.28_184 = vect__18.25_182 + vect_cst__183;
9177 MASK_STORE (vectp_p2.29_187, 0B, mask__ifc__42.18_165, vect__19.28_184);
9181 void
9182 optimize_mask_stores (class loop *loop)
9184 basic_block *bbs = get_loop_body (loop);
9185 unsigned nbbs = loop->num_nodes;
9186 unsigned i;
9187 basic_block bb;
9188 class loop *bb_loop;
9189 gimple_stmt_iterator gsi;
9190 gimple *stmt;
9191 auto_vec<gimple *> worklist;
9192 auto_purge_vect_location sentinel;
9194 vect_location = find_loop_location (loop);
9195 /* Pick up all masked stores in loop if any. */
9196 for (i = 0; i < nbbs; i++)
9198 bb = bbs[i];
9199 for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
9200 gsi_next (&gsi))
9202 stmt = gsi_stmt (gsi);
9203 if (gimple_call_internal_p (stmt, IFN_MASK_STORE))
9204 worklist.safe_push (stmt);
9208 free (bbs);
9209 if (worklist.is_empty ())
9210 return;
9212 /* Loop has masked stores. */
9213 while (!worklist.is_empty ())
9215 gimple *last, *last_store;
9216 edge e, efalse;
9217 tree mask;
9218 basic_block store_bb, join_bb;
9219 gimple_stmt_iterator gsi_to;
9220 tree vdef, new_vdef;
9221 gphi *phi;
9222 tree vectype;
9223 tree zero;
9225 last = worklist.pop ();
9226 mask = gimple_call_arg (last, 2);
9227 bb = gimple_bb (last);
9228 /* Create then_bb and if-then structure in CFG, then_bb belongs to
9229 the same loop as if_bb. It could be different to LOOP when two
9230 level loop-nest is vectorized and mask_store belongs to the inner
9231 one. */
9232 e = split_block (bb, last);
9233 bb_loop = bb->loop_father;
9234 gcc_assert (loop == bb_loop || flow_loop_nested_p (loop, bb_loop));
9235 join_bb = e->dest;
9236 store_bb = create_empty_bb (bb);
9237 add_bb_to_loop (store_bb, bb_loop);
9238 e->flags = EDGE_TRUE_VALUE;
9239 efalse = make_edge (bb, store_bb, EDGE_FALSE_VALUE);
9240 /* Put STORE_BB to likely part. */
9241 efalse->probability = profile_probability::unlikely ();
9242 store_bb->count = efalse->count ();
9243 make_single_succ_edge (store_bb, join_bb, EDGE_FALLTHRU);
9244 if (dom_info_available_p (CDI_DOMINATORS))
9245 set_immediate_dominator (CDI_DOMINATORS, store_bb, bb);
9246 if (dump_enabled_p ())
9247 dump_printf_loc (MSG_NOTE, vect_location,
9248 "Create new block %d to sink mask stores.",
9249 store_bb->index);
9250 /* Create vector comparison with boolean result. */
9251 vectype = TREE_TYPE (mask);
9252 zero = build_zero_cst (vectype);
9253 stmt = gimple_build_cond (EQ_EXPR, mask, zero, NULL_TREE, NULL_TREE);
9254 gsi = gsi_last_bb (bb);
9255 gsi_insert_after (&gsi, stmt, GSI_SAME_STMT);
9256 /* Create new PHI node for vdef of the last masked store:
9257 .MEM_2 = VDEF <.MEM_1>
9258 will be converted to
9259 .MEM.3 = VDEF <.MEM_1>
9260 and new PHI node will be created in join bb
9261 .MEM_2 = PHI <.MEM_1, .MEM_3>
9263 vdef = gimple_vdef (last);
9264 new_vdef = make_ssa_name (gimple_vop (cfun), last);
9265 gimple_set_vdef (last, new_vdef);
9266 phi = create_phi_node (vdef, join_bb);
9267 add_phi_arg (phi, new_vdef, EDGE_SUCC (store_bb, 0), UNKNOWN_LOCATION);
9269 /* Put all masked stores with the same mask to STORE_BB if possible. */
9270 while (true)
9272 gimple_stmt_iterator gsi_from;
9273 gimple *stmt1 = NULL;
9275 /* Move masked store to STORE_BB. */
9276 last_store = last;
9277 gsi = gsi_for_stmt (last);
9278 gsi_from = gsi;
9279 /* Shift GSI to the previous stmt for further traversal. */
9280 gsi_prev (&gsi);
9281 gsi_to = gsi_start_bb (store_bb);
9282 gsi_move_before (&gsi_from, &gsi_to);
9283 /* Setup GSI_TO to the non-empty block start. */
9284 gsi_to = gsi_start_bb (store_bb);
9285 if (dump_enabled_p ())
9286 dump_printf_loc (MSG_NOTE, vect_location,
9287 "Move stmt to created bb\n%G", last);
9288 /* Move all stored value producers if possible. */
9289 while (!gsi_end_p (gsi))
9291 tree lhs;
9292 imm_use_iterator imm_iter;
9293 use_operand_p use_p;
9294 bool res;
9296 /* Skip debug statements. */
9297 if (is_gimple_debug (gsi_stmt (gsi)))
9299 gsi_prev (&gsi);
9300 continue;
9302 stmt1 = gsi_stmt (gsi);
9303 /* Do not consider statements writing to memory or having
9304 volatile operand. */
9305 if (gimple_vdef (stmt1)
9306 || gimple_has_volatile_ops (stmt1))
9307 break;
9308 gsi_from = gsi;
9309 gsi_prev (&gsi);
9310 lhs = gimple_get_lhs (stmt1);
9311 if (!lhs)
9312 break;
9314 /* LHS of vectorized stmt must be SSA_NAME. */
9315 if (TREE_CODE (lhs) != SSA_NAME)
9316 break;
9318 if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
9320 /* Remove dead scalar statement. */
9321 if (has_zero_uses (lhs))
9323 gsi_remove (&gsi_from, true);
9324 continue;
9328 /* Check that LHS does not have uses outside of STORE_BB. */
9329 res = true;
9330 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
9332 gimple *use_stmt;
9333 use_stmt = USE_STMT (use_p);
9334 if (is_gimple_debug (use_stmt))
9335 continue;
9336 if (gimple_bb (use_stmt) != store_bb)
9338 res = false;
9339 break;
9342 if (!res)
9343 break;
9345 if (gimple_vuse (stmt1)
9346 && gimple_vuse (stmt1) != gimple_vuse (last_store))
9347 break;
9349 /* Can move STMT1 to STORE_BB. */
9350 if (dump_enabled_p ())
9351 dump_printf_loc (MSG_NOTE, vect_location,
9352 "Move stmt to created bb\n%G", stmt1);
9353 gsi_move_before (&gsi_from, &gsi_to);
9354 /* Shift GSI_TO for further insertion. */
9355 gsi_prev (&gsi_to);
9357 /* Put other masked stores with the same mask to STORE_BB. */
9358 if (worklist.is_empty ()
9359 || gimple_call_arg (worklist.last (), 2) != mask
9360 || worklist.last () != stmt1)
9361 break;
9362 last = worklist.pop ();
9364 add_phi_arg (phi, gimple_vuse (last_store), e, UNKNOWN_LOCATION);
9368 /* Decide whether it is possible to use a zero-based induction variable
9369 when vectorizing LOOP_VINFO with partial vectors. If it is, return
9370 the value that the induction variable must be able to hold in order
9371 to ensure that the rgroups eventually have no active vector elements.
9372 Return -1 otherwise. */
9374 widest_int
9375 vect_iv_limit_for_partial_vectors (loop_vec_info loop_vinfo)
9377 tree niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
9378 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
9379 unsigned HOST_WIDE_INT max_vf = vect_max_vf (loop_vinfo);
9381 /* Calculate the value that the induction variable must be able
9382 to hit in order to ensure that we end the loop with an all-false mask.
9383 This involves adding the maximum number of inactive trailing scalar
9384 iterations. */
9385 widest_int iv_limit = -1;
9386 if (max_loop_iterations (loop, &iv_limit))
9388 if (niters_skip)
9390 /* Add the maximum number of skipped iterations to the
9391 maximum iteration count. */
9392 if (TREE_CODE (niters_skip) == INTEGER_CST)
9393 iv_limit += wi::to_widest (niters_skip);
9394 else
9395 iv_limit += max_vf - 1;
9397 else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo))
9398 /* Make a conservatively-correct assumption. */
9399 iv_limit += max_vf - 1;
9401 /* IV_LIMIT is the maximum number of latch iterations, which is also
9402 the maximum in-range IV value. Round this value down to the previous
9403 vector alignment boundary and then add an extra full iteration. */
9404 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
9405 iv_limit = (iv_limit & -(int) known_alignment (vf)) + max_vf;
9407 return iv_limit;
9410 /* For the given rgroup_controls RGC, check whether an induction variable
9411 would ever hit a value that produces a set of all-false masks or zero
9412 lengths before wrapping around. Return true if it's possible to wrap
9413 around before hitting the desirable value, otherwise return false. */
9415 bool
9416 vect_rgroup_iv_might_wrap_p (loop_vec_info loop_vinfo, rgroup_controls *rgc)
9418 widest_int iv_limit = vect_iv_limit_for_partial_vectors (loop_vinfo);
9420 if (iv_limit == -1)
9421 return true;
9423 tree compare_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
9424 unsigned int compare_precision = TYPE_PRECISION (compare_type);
9425 unsigned nitems = rgc->max_nscalars_per_iter * rgc->factor;
9427 if (wi::min_precision (iv_limit * nitems, UNSIGNED) > compare_precision)
9428 return true;
9430 return false;